From da27b9faa98619ece3407f731f0eacbc730cfffb Mon Sep 17 00:00:00 2001 From: master <> Date: Sat, 17 Jan 2026 21:32:03 +0200 Subject: [PATCH] release orchestration strengthening --- .../dashboards/stella-ops-error-tracking.json | 536 ++++ .../dashboards/stella-ops-performance.json | 607 ++++ .../stella-ops-release-overview.json | 566 ++++ .../dashboards/stella-ops-sla-monitoring.json | 541 ++++ ...01_ATTESTOR_periodic_rekor_verification.md | 4 +- ...ReleaseOrchestrator_enhancements_master.md | 219 ++ ...1_ReleaseOrchestrator_drift_remediation.md | 263 ++ ...easeOrchestrator_workflow_visualization.md | 309 ++ ...leaseOrchestrator_rollback_intelligence.md | 125 + ...34_ReleaseOrchestrator_agent_resilience.md | 162 + ...eleaseOrchestrator_progressive_delivery.md | 154 + ...17_036_ReleaseOrchestrator_multi_region.md | 161 + ...eleaseOrchestrator_developer_experience.md | 178 ++ ...117_038_ReleaseOrchestrator_performance.md | 150 + ...0117_039_ReleaseOrchestrator_compliance.md | 164 + ...easeOrchestrator_multi_language_scripts.md | 561 ++++ ...17_040_ReleaseOrchestrator_self_healing.md | 112 + ...41_ReleaseOrchestrator_agent_operations.md | 452 +++ ...7_041_ReleaseOrchestrator_observability.md | 126 + docs/FEATURE_GAPS_REPORT.md | 744 ----- docs/FEATURE_MATRIX.md | 873 +++-- docs/guides/agent-operations-quickstart.md | 230 ++ ...NT_20260117_026_CLI_why_blocked_command.md | 188 -- ...T_20260117_027_CLI_audit_bundle_command.md | 280 -- ...PRINT_20260117_028_Telemetry_p0_metrics.md | 240 -- .../enhancements/agent-operations.md | 1475 +++++++++ .../enhancements/agent-resilience.md | 1111 +++++++ .../enhancements/compliance-reporting.md | 1187 +++++++ .../enhancements/developer-experience.md | 1091 +++++++ .../enhancements/drift-remediation.md | 749 +++++ .../enhancements/multi-language-scripts.md | 2799 +++++++++++++++++ .../enhancements/multi-region-federation.md | 1028 ++++++ .../enhancements/performance-optimizations.md | 951 ++++++ .../enhancements/progressive-delivery.md | 1171 +++++++ .../enhancements/rollback-intelligence.md | 1118 +++++++ .../enhancements/workflow-visualization.md | 1124 +++++++ docs/product/PRICING.md | 66 - .../Controllers/EnvironmentsController.cs | 542 ++++ .../Controllers/GatesController.cs | 422 +++ .../Controllers/ObservabilityController.cs | 484 +++ .../Controllers/ReleasesController.cs | 501 +++ .../Controllers/RemediationController.cs | 1061 +++++++ .../WorkflowVisualizationController.cs | 1178 +++++++ src/Api/StellaOps.Api/Hubs/RemediationHub.cs | 533 ++++ .../CliIntegrationTests.cs | 732 +++++ src/Cli/StellaOps.Cli/CliApplication.cs | 759 +++++ .../Commands/Agent/BootstrapCommands.cs | 227 ++ .../Commands/Agent/CertificateCommands.cs | 127 + .../Commands/Agent/ConfigCommands.cs | 241 ++ .../Commands/Agent/DoctorCommands.cs | 220 ++ .../Commands/Agent/UpdateCommands.cs | 160 + .../Commands/DeployCommandHandler.cs | 370 +++ .../Commands/PromoteCommandHandler.cs | 311 ++ .../Commands/ReleaseCommandHandler.cs | 382 +++ .../StellaOps.Cli/GitOps/GitOpsController.cs | 582 ++++ .../Validation/LocalValidator.cs | 612 ++++ .../AgentDoctorPlugin.cs | 78 + .../Checks/AgentCapacityCheck.cs | 167 + .../Checks/AgentCertificateExpiryCheck.cs | 189 ++ .../Checks/AgentCertificateValidityCheck.cs | 60 + .../Checks/AgentClusterHealthCheck.cs | 61 + .../Checks/AgentClusterQuorumCheck.cs | 60 + .../Checks/AgentHeartbeatFreshnessCheck.cs | 179 ++ .../Checks/AgentResourceUtilizationCheck.cs | 56 + .../Checks/AgentVersionConsistencyCheck.cs | 122 + .../Checks/FailedTaskRateCheck.cs | 56 + .../Checks/StaleAgentCheck.cs | 141 + .../Checks/TaskQueueBacklogCheck.cs | 55 + .../StellaOps.Doctor.Plugin.Agent.csproj | 22 + .../AgentHealthPlugin.cs | 319 ++ .../IDoctorPlugin.cs | 119 + .../Storage/InMemoryVexStores.cs | 74 + .../PostgresVexObservationStore.cs | 14 +- .../org/stellaops/intellij/StellaOpsPlugin.kt | 343 ++ src/Extensions/vscode-stella-ops/package.json | 146 + .../vscode-stella-ops/src/extension.ts | 367 +++ .../DeterminizationConfigEndpoints.cs | 42 +- .../Subscriptions/SignalUpdateHandler.cs | 2 +- .../Controllers/ComplianceController.cs | 595 ++++ .../AgentResilienceIntegrationTests.cs | 788 +++++ .../AgentOperationsIntegrationTests.cs | 367 +++ .../Bootstrap/BootstrapService.cs | 302 ++ .../Bootstrap/BootstrapTokenService.cs | 208 ++ .../Certificates/AgentCertificateManager.cs | 288 ++ .../Configuration/AgentConfigManager.cs | 397 +++ .../Configuration/AgentConfiguration.cs | 402 +++ .../Doctor/AgentDoctor.cs | 166 + .../Doctor/Checks/AgentHealthChecks.cs | 244 ++ .../Doctor/Checks/CoreHealthChecks.cs | 382 +++ .../Doctor/IAgentHealthCheck.cs | 67 + .../Doctor/Patterns/RemediationPatterns.cs | 215 ++ .../Doctor/RemediationEngine.cs | 156 + .../Resilience/AgentClusterManager.cs | 534 ++++ .../Resilience/DurableTaskQueue.cs | 468 +++ .../Resilience/FailoverManager.cs | 374 +++ .../Resilience/HealthMonitor.cs | 880 ++++++ .../Resilience/LeaderElection.cs | 583 ++++ .../Resilience/SelfHealer.cs | 783 +++++ .../Resilience/StateSync.cs | 777 +++++ .../Updates/AgentUpdateManager.cs | 368 +++ .../Controllers/AgentClusterController.cs | 913 ++++++ .../RollbackIntelligenceController.cs | 1033 ++++++ .../AuditQueryEngine.cs | 557 ++++ .../ComplianceEngine.cs | 500 +++ .../ControlValidator.cs | 532 ++++ .../EvidenceChainVisualizer.cs | 586 ++++ .../FrameworkMapper.cs | 533 ++++ .../ReportGenerator.cs | 855 +++++ .../ScheduledReportService.cs | 512 +++ ...aOps.ReleaseOrchestrator.Compliance.csproj | 17 + .../Performance/ConnectionPool.cs | 419 +++ .../Performance/PerformanceBaseline.cs | 351 +++ .../Performance/Prefetcher.cs | 354 +++ .../Rollback/HealthAnalyzer.cs | 491 +++ .../Rollback/ImpactAnalyzer.cs | 806 +++++ .../Rollback/Intelligence/AnomalyDetector.cs | 376 +++ .../Rollback/Intelligence/BaselineManager.cs | 340 ++ .../Rollback/Intelligence/MetricsCollector.cs | 316 ++ .../Rollback/Intelligence/RollbackDecider.cs | 445 +++ .../Rollback/PartialRollbackPlanner.cs | 818 +++++ .../Rollback/PredictiveEngine.cs | 683 ++++ .../Inventory/DriftDetector.cs | 19 +- .../Inventory/DriftReport.cs | 4 +- .../Inventory/ExpectedState.cs | 3 +- .../Inventory/Remediation/DriftSeverity.cs | 100 + .../Remediation/IRemediationPolicyStore.cs | 52 + .../Remediation/ReconcileScheduler.cs | 233 ++ .../Remediation/RemediationCircuitBreaker.cs | 205 ++ .../Remediation/RemediationEngine.cs | 552 ++++ .../Remediation/RemediationEvidence.cs | 185 ++ .../Inventory/Remediation/RemediationPlan.cs | 233 ++ .../Remediation/RemediationPolicy.cs | 285 ++ .../Remediation/RemediationRateLimiter.cs | 175 ++ .../Remediation/RemediationResult.cs | 194 ++ .../Inventory/Remediation/ScoringContext.cs | 88 + .../Inventory/Remediation/SeverityScorer.cs | 165 + .../FederationIntegrationTests.cs | 839 +++++ .../Api/FederationController.cs | 1074 +++++++ .../CrossRegionSync.cs | 689 ++++ .../EvidenceReplicator.cs | 586 ++++ .../FederationHub.cs | 667 ++++ .../GlobalDashboard.cs | 639 ++++ .../LatencyRouter.cs | 521 +++ .../RegionCoordinator.cs | 799 +++++ ...aOps.ReleaseOrchestrator.Federation.csproj | 17 + .../Caching/ICacheProvider.cs | 85 + .../Evidence/EvidenceModel.cs | 130 + .../Metrics/IMetricsExporter.cs | 54 + .../LogAggregator.cs | 602 ++++ .../MetricExporter.cs | 409 +++ .../ObservabilityHub.cs | 437 +++ ...s.ReleaseOrchestrator.Observability.csproj | 17 + .../TraceCorrelator.cs | 373 +++ .../Batching/TaskBatcher.cs | 313 ++ .../Caching/CacheManager.cs | 378 +++ .../Database/QueryOptimizer.cs | 428 +++ .../Gates/ParallelGateEvaluator.cs | 433 +++ .../Registry/BulkDigestResolver.cs | 328 ++ ...Ops.ReleaseOrchestrator.Performance.csproj | 23 + .../FeatureFlags/FeatureFlagBridge.cs | 415 +++ .../Rollout/RolloutController.cs | 667 ++++ .../ProgressiveDeliveryIntegrationTests.cs | 908 ++++++ .../Api/ProgressiveDeliveryController.cs | 1081 +++++++ .../CanaryController.cs | 845 +++++ .../ExperimentEngine.cs | 843 +++++ .../MetricsAnalyzer.cs | 789 +++++ .../TrafficManager.cs | 577 ++++ .../Access/ScriptAccessControl.cs | 544 ++++ .../Audit/ScriptAuditor.cs | 421 +++ .../Debug/ScriptDebugger.cs | 486 +++ .../Dependencies/LibraryManager.cs | 494 +++ .../Documentation/ScriptDocumentation.cs | 713 +++++ .../Editor/MonacoEditorService.cs | 285 ++ .../Execution/ExecutionMonitor.cs | 414 +++ .../Execution/ScriptExecutor.cs | 523 +++ .../LanguageServers/LanguageServerPool.cs | 549 ++++ .../Library/ScriptLibraryManager.cs | 510 +++ .../Models/ScriptModels.cs | 315 ++ .../Policies/ScriptPolicyEvaluator.cs | 311 ++ .../Runtime/RuntimeImageManager.cs | 301 ++ .../Sandbox/ScriptSandbox.cs | 322 ++ .../ScriptRegistry.cs | 514 +++ .../Telemetry/ScriptTelemetry.cs | 331 ++ .../Validation/ScriptValidation.cs | 634 ++++ .../Versioning/ScriptVersioning.cs | 450 +++ .../AutoScaler.cs | 559 ++++ .../HealthMonitor.cs | 419 +++ .../RecoveryOrchestrator.cs | 563 ++++ .../SelfHealingEngine.cs | 629 ++++ ...Ops.ReleaseOrchestrator.SelfHealing.csproj | 17 + .../Debugging/DebugInspector.cs | 818 +++++ .../Visualization/EventBroadcaster.cs | 309 ++ .../Visualization/ExecutionRecorder.cs | 316 ++ .../Visualization/LogAggregator.cs | 356 +++ .../Visualization/SimulationEngine.cs | 379 +++ .../Visualization/TimeTravelDebugger.cs | 394 +++ .../ComplianceIntegrationTests.cs | 639 ++++ .../Performance/PerformanceLoadTests.cs | 460 +++ .../RollbackIntelligenceIntegrationTests.cs | 977 ++++++ .../RemediationEngineIntegrationTests.cs | 892 ++++++ .../LogAggregatorTests.cs | 282 ++ .../MetricExporterTests.cs | 173 + ...aseOrchestrator.Observability.Tests.csproj | 21 + .../TraceCorrelatorTests.cs | 149 + .../ScriptEngineUnitTests.cs | 766 +++++ .../AutoScalerTests.cs | 516 +++ .../HealthMonitorTests.cs | 182 ++ .../SelfHealingEngineTests.cs | 172 + ...leaseOrchestrator.SelfHealing.Tests.csproj | 21 + .../IntegrationTestHarness.cs | 183 ++ .../MockAgentFramework.cs | 190 ++ .../TestDataGenerators.cs | 127 + .../Executor/StepExecutorTests.cs | 2 +- .../Steps.BuiltIn/WaitStepProviderTests.cs | 8 +- .../WorkflowVisualizationIntegrationTests.cs | 1247 ++++++++ .../Endpoints/ExportEndpoints.cs | 17 +- .../Endpoints/HealthEndpoints.cs | 1 + .../Endpoints/ReachabilityEndpoints.cs | 60 +- .../Endpoints/ScanEndpoints.cs | 1 - .../StellaOps.Scanner.WebService/Program.cs | 3 + .../Services/EvidenceBundleExporter.cs | 13 +- .../Services/PrAnnotationService.cs | 18 - .../Services/PrAnnotationWebhookHandler.cs | 20 +- .../StellaOps.Scanner.WebService.csproj | 1 + .../ApprovalEndpointsTests.cs | 4 +- .../Contract/ScannerOpenApiContractTests.cs | 15 +- .../EpssEndpointsTests.cs | 4 +- .../EvidenceBundleExporterBinaryDiffTests.cs | 81 +- .../LayerSbomEndpointsTests.cs | 10 + .../OfflineKitEndpointsTests.cs | 34 +- ...PlatformEventPublisherRegistrationTests.cs | 8 +- .../PrAnnotationServiceTests.cs | 19 + .../ScannerApplicationFactory.cs | 8 +- .../ScannerApplicationFixture.cs | 5 +- .../ScoreReplayEndpointsTests.cs | 4 +- .../SignedSbomArchiveBuilderTests.cs | 2 +- .../Spdx3ExportEndpointsTests.cs | 105 +- .../StellaOps.Timeline.WebService/Program.cs | 5 + .../Replay/TimelineReplayOrchestrator.cs | 5 + .../TimelineApiIntegrationTests.cs | 67 +- .../Hints/ProvenanceHintBuilder.cs | 26 +- .../Services/NativeUnknownClassifier.cs | 15 +- .../Hints/ProvenanceHintSerializationTests.cs | 3 +- .../UnknownsEndpointsTests.cs | 34 +- .../Extensions/VexHubEndpointExtensions.cs | 58 +- .../Models/VexApiModels.cs | 1 + .../VexHubCoreServiceCollectionExtensions.cs | 4 + .../VexExportCompatibilityTests.cs | 357 ++- .../NoiseGate/NoiseGateServiceTests.cs | 26 +- .../e2e/workflow-visualizer.visual.spec.ts | 404 +++ .../step-detail-panel.component.ts | 643 ++++ .../time-travel-controls.component.ts | 524 +++ .../workflow-visualizer.component.scss | 367 +++ .../workflow-visualizer.component.ts | 616 ++++ .../services/time-travel.service.ts | 121 + .../workflow-visualization.service.ts | 140 + 256 files changed, 94634 insertions(+), 2269 deletions(-) create mode 100644 devops/observability/dashboards/stella-ops-error-tracking.json create mode 100644 devops/observability/dashboards/stella-ops-performance.json create mode 100644 devops/observability/dashboards/stella-ops-release-overview.json create mode 100644 devops/observability/dashboards/stella-ops-sla-monitoring.json create mode 100644 docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md create mode 100644 docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md create mode 100644 docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md create mode 100644 docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md create mode 100644 docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md create mode 100644 docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md create mode 100644 docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md create mode 100644 docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md create mode 100644 docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md create mode 100644 docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md create mode 100644 docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md create mode 100644 docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md create mode 100644 docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md create mode 100644 docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md delete mode 100644 docs/FEATURE_GAPS_REPORT.md create mode 100644 docs/guides/agent-operations-quickstart.md delete mode 100644 docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md delete mode 100644 docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md delete mode 100644 docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md create mode 100644 docs/modules/release-orchestrator/enhancements/agent-operations.md create mode 100644 docs/modules/release-orchestrator/enhancements/agent-resilience.md create mode 100644 docs/modules/release-orchestrator/enhancements/compliance-reporting.md create mode 100644 docs/modules/release-orchestrator/enhancements/developer-experience.md create mode 100644 docs/modules/release-orchestrator/enhancements/drift-remediation.md create mode 100644 docs/modules/release-orchestrator/enhancements/multi-language-scripts.md create mode 100644 docs/modules/release-orchestrator/enhancements/multi-region-federation.md create mode 100644 docs/modules/release-orchestrator/enhancements/performance-optimizations.md create mode 100644 docs/modules/release-orchestrator/enhancements/progressive-delivery.md create mode 100644 docs/modules/release-orchestrator/enhancements/rollback-intelligence.md create mode 100644 docs/modules/release-orchestrator/enhancements/workflow-visualization.md delete mode 100644 docs/product/PRICING.md create mode 100644 src/Api/StellaOps.Api/Controllers/EnvironmentsController.cs create mode 100644 src/Api/StellaOps.Api/Controllers/GatesController.cs create mode 100644 src/Api/StellaOps.Api/Controllers/ObservabilityController.cs create mode 100644 src/Api/StellaOps.Api/Controllers/ReleasesController.cs create mode 100644 src/Api/StellaOps.Api/Controllers/RemediationController.cs create mode 100644 src/Api/StellaOps.Api/Controllers/WorkflowVisualizationController.cs create mode 100644 src/Api/StellaOps.Api/Hubs/RemediationHub.cs create mode 100644 src/Cli/StellaOps.Cli.Tests/CliIntegrationTests.cs create mode 100644 src/Cli/StellaOps.Cli/CliApplication.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/Agent/BootstrapCommands.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/Agent/CertificateCommands.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/Agent/ConfigCommands.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/Agent/DoctorCommands.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/Agent/UpdateCommands.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/DeployCommandHandler.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/PromoteCommandHandler.cs create mode 100644 src/Cli/StellaOps.Cli/Commands/ReleaseCommandHandler.cs create mode 100644 src/Cli/StellaOps.Cli/GitOps/GitOpsController.cs create mode 100644 src/Cli/StellaOps.Cli/Validation/LocalValidator.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/AgentDoctorPlugin.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCapacityCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCertificateExpiryCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentCertificateValidityCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentClusterHealthCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentClusterQuorumCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentHeartbeatFreshnessCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentResourceUtilizationCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/AgentVersionConsistencyCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/FailedTaskRateCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/StaleAgentCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/Checks/TaskQueueBacklogCheck.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/StellaOps.Doctor.Plugin.Agent.csproj create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugins.Agent/AgentHealthPlugin.cs create mode 100644 src/Doctor/__Plugins/StellaOps.Doctor.Plugins.Core/IDoctorPlugin.cs create mode 100644 src/Extensions/jetbrains-stella-ops/src/main/kotlin/org/stellaops/intellij/StellaOpsPlugin.kt create mode 100644 src/Extensions/vscode-stella-ops/package.json create mode 100644 src/Extensions/vscode-stella-ops/src/extension.ts create mode 100644 src/ReleaseOrchestrator/StellaOps.ReleaseOrchestrator.Api/Controllers/ComplianceController.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core.Tests/AgentResilienceIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core.Tests/Integration/AgentOperationsIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Bootstrap/BootstrapService.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Bootstrap/BootstrapTokenService.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Certificates/AgentCertificateManager.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Configuration/AgentConfigManager.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Configuration/AgentConfiguration.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Doctor/AgentDoctor.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Doctor/Checks/AgentHealthChecks.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Doctor/Checks/CoreHealthChecks.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Doctor/IAgentHealthCheck.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Doctor/Patterns/RemediationPatterns.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Doctor/RemediationEngine.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/AgentClusterManager.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/DurableTaskQueue.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/FailoverManager.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/HealthMonitor.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/LeaderElection.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/SelfHealer.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Resilience/StateSync.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/Updates/AgentUpdateManager.cs create mode 100644 src/ReleaseOrchestrator/__Agents/StellaOps.Agent.WebApi/Controllers/AgentClusterController.cs create mode 100644 src/ReleaseOrchestrator/__Apps/StellaOps.ReleaseOrchestrator.WebApi/Controllers/RollbackIntelligenceController.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/AuditQueryEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/ComplianceEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/ControlValidator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/EvidenceChainVisualizer.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/FrameworkMapper.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/ReportGenerator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/ScheduledReportService.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/StellaOps.ReleaseOrchestrator.Compliance.csproj create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Core/Performance/ConnectionPool.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Core/Performance/PerformanceBaseline.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Core/Performance/Prefetcher.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/HealthAnalyzer.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/ImpactAnalyzer.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/Intelligence/AnomalyDetector.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/Intelligence/BaselineManager.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/Intelligence/MetricsCollector.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/Intelligence/RollbackDecider.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/PartialRollbackPlanner.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/PredictiveEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/DriftSeverity.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/IRemediationPolicyStore.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/ReconcileScheduler.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationCircuitBreaker.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationEvidence.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationPlan.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationPolicy.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationRateLimiter.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/RemediationResult.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/ScoringContext.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/Remediation/SeverityScorer.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation.Tests/FederationIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/Api/FederationController.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/CrossRegionSync.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/EvidenceReplicator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/FederationHub.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/GlobalDashboard.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/LatencyRouter.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/RegionCoordinator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/StellaOps.ReleaseOrchestrator.Federation.csproj create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Foundation/Caching/ICacheProvider.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Foundation/Evidence/EvidenceModel.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Foundation/Metrics/IMetricsExporter.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/LogAggregator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/MetricExporter.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/ObservabilityHub.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/StellaOps.ReleaseOrchestrator.Observability.csproj create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/TraceCorrelator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Performance/Batching/TaskBatcher.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Performance/Caching/CacheManager.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Performance/Database/QueryOptimizer.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Performance/Gates/ParallelGateEvaluator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Performance/Registry/BulkDigestResolver.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Performance/StellaOps.ReleaseOrchestrator.Performance.csproj create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Progressive/FeatureFlags/FeatureFlagBridge.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Progressive/Rollout/RolloutController.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery.Tests/ProgressiveDeliveryIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/Api/ProgressiveDeliveryController.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/CanaryController.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/ExperimentEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/MetricsAnalyzer.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/TrafficManager.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Access/ScriptAccessControl.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Audit/ScriptAuditor.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Debug/ScriptDebugger.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Dependencies/LibraryManager.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Documentation/ScriptDocumentation.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Editor/MonacoEditorService.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Execution/ExecutionMonitor.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Execution/ScriptExecutor.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/LanguageServers/LanguageServerPool.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Library/ScriptLibraryManager.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Models/ScriptModels.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Policies/ScriptPolicyEvaluator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Runtime/RuntimeImageManager.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Sandbox/ScriptSandbox.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/ScriptRegistry.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Telemetry/ScriptTelemetry.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Validation/ScriptValidation.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/Versioning/ScriptVersioning.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/AutoScaler.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/HealthMonitor.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/RecoveryOrchestrator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/SelfHealingEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/StellaOps.ReleaseOrchestrator.SelfHealing.csproj create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Debugging/DebugInspector.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Visualization/EventBroadcaster.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Visualization/ExecutionRecorder.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Visualization/LogAggregator.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Visualization/SimulationEngine.cs create mode 100644 src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Visualization/TimeTravelDebugger.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Compliance.Tests/ComplianceIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Core.Tests/Performance/PerformanceLoadTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Deployment.Tests/RollbackIntelligenceIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Environment.Tests/RemediationEngineIntegrationTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Observability.Tests/LogAggregatorTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Observability.Tests/MetricExporterTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Observability.Tests/StellaOps.ReleaseOrchestrator.Observability.Tests.csproj create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Observability.Tests/TraceCorrelatorTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Scripts.Tests/ScriptEngineUnitTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.SelfHealing.Tests/AutoScalerTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.SelfHealing.Tests/HealthMonitorTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.SelfHealing.Tests/SelfHealingEngineTests.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.SelfHealing.Tests/StellaOps.ReleaseOrchestrator.SelfHealing.Tests.csproj create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.TestUtilities/IntegrationTestHarness.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.TestUtilities/MockAgentFramework.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.TestUtilities/TestDataGenerators.cs create mode 100644 src/ReleaseOrchestrator/__Tests/StellaOps.ReleaseOrchestrator.Workflow.Tests/WorkflowVisualizationIntegrationTests.cs create mode 100644 src/Web/frontend/e2e/workflow-visualizer.visual.spec.ts create mode 100644 src/Web/frontend/src/app/features/workflow-visualization/components/step-detail-panel/step-detail-panel.component.ts create mode 100644 src/Web/frontend/src/app/features/workflow-visualization/components/time-travel-controls/time-travel-controls.component.ts create mode 100644 src/Web/frontend/src/app/features/workflow-visualization/components/workflow-visualizer/workflow-visualizer.component.scss create mode 100644 src/Web/frontend/src/app/features/workflow-visualization/components/workflow-visualizer/workflow-visualizer.component.ts create mode 100644 src/Web/frontend/src/app/features/workflow-visualization/services/time-travel.service.ts create mode 100644 src/Web/frontend/src/app/features/workflow-visualization/services/workflow-visualization.service.ts diff --git a/devops/observability/dashboards/stella-ops-error-tracking.json b/devops/observability/dashboards/stella-ops-error-tracking.json new file mode 100644 index 000000000..c4c0e51c0 --- /dev/null +++ b/devops/observability/dashboards/stella-ops-error-tracking.json @@ -0,0 +1,536 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": "${datasource}", + "enable": true, + "expr": "increase(stella_error_total[1m]) > 0", + "iconColor": "red", + "name": "Error Spikes", + "tagKeys": "error_type", + "titleFormat": "Error: {{error_type}}" + } + ] + }, + "description": "Stella Ops Release Orchestrator - Error Tracking", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1737158400000, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Error Summary", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(increase(stella_error_total[1h]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Errors (1h)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_error_total[5m])) / sum(rate(stella_api_requests_total[5m]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(increase(stella_release_failed_total[1h]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Failed Releases (1h)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 3 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(increase(stella_gate_failed_total[1h]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Gate Failures (1h)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 6, + "panels": [], + "title": "Error Trends", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 7, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_error_total[5m])) by (error_type)", + "legendFormat": "{{error_type}}", + "refId": "A" + } + ], + "title": "Errors by Type", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 8, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_error_total{environment=~\"$environment\"}[5m])) by (component)", + "legendFormat": "{{component}}", + "refId": "A" + } + ], + "title": "Errors by Component", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 9, + "panels": [], + "title": "Release Failures", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineWidth": 1, + "scaleDistribution": { "type": "linear" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, + "id": 10, + "options": { + "barRadius": 0.1, + "barWidth": 0.8, + "groupWidth": 0.7, + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "orientation": "horizontal", + "showValue": "auto", + "stacking": "none", + "tooltip": { "mode": "single", "sort": "none" }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "topk(10, sum(increase(stella_release_failed_total[24h])) by (failure_reason))", + "format": "table", + "instant": true, + "legendFormat": "{{failure_reason}}", + "refId": "A" + } + ], + "title": "Top Failure Reasons (24h)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "indexByName": {}, + "renameByName": { "Value": "Count", "failure_reason": "Reason" } + } + } + ], + "type": "barchart" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Failures" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Rollbacks" }, + "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 }, + "id": 11, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h])) by (environment)", + "legendFormat": "{{environment}} Failures", + "refId": "A" + }, + { + "expr": "sum(increase(stella_rollback_total{environment=~\"$environment\"}[1h])) by (environment)", + "legendFormat": "{{environment}} Rollbacks", + "refId": "B" + } + ], + "title": "Failures & Rollbacks by Environment", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, + "id": 12, + "panels": [], + "title": "Recent Errors", + "type": "row" + }, + { + "datasource": "${loki_datasource}", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 }, + "id": 13, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "{app=\"stella-ops\"} |= \"error\" | json | level=~\"error|fatal\"", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Error Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 36, + "style": "dark", + "tags": ["stella-ops", "errors"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Metrics", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { "selected": false, "text": "Loki", "value": "Loki" }, + "hide": 0, + "includeAll": false, + "label": "Logs", + "multi": false, + "name": "loki_datasource", + "options": [], + "query": "loki", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": "${datasource}", + "definition": "label_values(stella_error_total, environment)", + "hide": 0, + "includeAll": true, + "label": "Environment", + "multi": true, + "name": "environment", + "options": [], + "query": { "query": "label_values(stella_error_total, environment)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Stella Ops - Error Tracking", + "uid": "stella-ops-errors", + "version": 1, + "weekStart": "" +} diff --git a/devops/observability/dashboards/stella-ops-performance.json b/devops/observability/dashboards/stella-ops-performance.json new file mode 100644 index 000000000..ad32a50b4 --- /dev/null +++ b/devops/observability/dashboards/stella-ops-performance.json @@ -0,0 +1,607 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Stella Ops Release Orchestrator - Performance Metrics", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1737158400000, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "System Performance", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.7 }, + { "color": "red", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "avg(stella_cpu_usage_ratio{component=\"orchestrator\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "gauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.7 }, + { "color": "red", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "avg(stella_memory_usage_ratio{component=\"orchestrator\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Usage", + "type": "gauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 100 }, + { "color": "red", "value": 500 } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(stella_api_request_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "", + "refId": "A" + } + ], + "title": "API Latency (p95)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_api_requests_total[5m]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 6, + "panels": [], + "title": "Gate Evaluation Performance", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 7, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(stella_gate_evaluation_duration_seconds_bucket{gate_type=~\"$gate_type\"}[5m])) by (le, gate_type))", + "legendFormat": "{{gate_type}} p99", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(stella_gate_evaluation_duration_seconds_bucket{gate_type=~\"$gate_type\"}[5m])) by (le, gate_type))", + "legendFormat": "{{gate_type}} p50", + "refId": "B" + } + ], + "title": "Gate Evaluation Duration by Type", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 8, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_gate_evaluations_total{gate_type=~\"$gate_type\"}[5m])) by (gate_type)", + "legendFormat": "{{gate_type}}", + "refId": "A" + } + ], + "title": "Gate Evaluations per Second", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 9, + "panels": [], + "title": "Cache Performance", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.7 }, + { "color": "green", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 0, "y": 15 }, + "id": 10, + "options": { + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(stella_cache_hits_total) / (sum(stella_cache_hits_total) + sum(stella_cache_misses_total))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Cache Hit Ratio", + "type": "gauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Hits" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Misses" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 6, "w": 12, "x": 6, "y": 15 }, + "id": 11, + "options": { + "legend": { "calcs": ["sum"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_cache_hits_total[5m])) by (cache_name)", + "legendFormat": "{{cache_name}} Hits", + "refId": "A" + }, + { + "expr": "sum(rate(stella_cache_misses_total[5m])) by (cache_name)", + "legendFormat": "{{cache_name}} Misses", + "refId": "B" + } + ], + "title": "Cache Hits vs Misses", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.7 }, + { "color": "red", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 6, "x": 18, "y": 15 }, + "id": 12, + "options": { + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "stella_cache_size_bytes / stella_cache_max_size_bytes", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Cache Utilization", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, + "id": 13, + "panels": [], + "title": "Database Performance", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, + "id": 14, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(stella_db_query_duration_seconds_bucket[5m])) by (le, query_type)) * 1000", + "legendFormat": "{{query_type}} p95", + "refId": "A" + } + ], + "title": "Database Query Duration (p95)", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, + "id": 15, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "stella_db_connections_active", + "legendFormat": "Active", + "refId": "A" + }, + { + "expr": "stella_db_connections_idle", + "legendFormat": "Idle", + "refId": "B" + }, + { + "expr": "stella_db_connections_max", + "legendFormat": "Max", + "refId": "C" + } + ], + "title": "Database Connection Pool", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 36, + "style": "dark", + "tags": ["stella-ops", "performance"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": "${datasource}", + "definition": "label_values(stella_gate_evaluation_duration_seconds_bucket, gate_type)", + "hide": 0, + "includeAll": true, + "label": "Gate Type", + "multi": true, + "name": "gate_type", + "options": [], + "query": { "query": "label_values(stella_gate_evaluation_duration_seconds_bucket, gate_type)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Stella Ops - Performance Metrics", + "uid": "stella-ops-performance", + "version": 1, + "weekStart": "" +} diff --git a/devops/observability/dashboards/stella-ops-release-overview.json b/devops/observability/dashboards/stella-ops-release-overview.json new file mode 100644 index 000000000..8a09b8491 --- /dev/null +++ b/devops/observability/dashboards/stella-ops-release-overview.json @@ -0,0 +1,566 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": "${datasource}", + "enable": true, + "expr": "stella_release_promotion_completed{environment=~\"$environment\"}", + "iconColor": "green", + "name": "Promotions", + "tagKeys": "version,environment", + "titleFormat": "Promotion to {{environment}}" + } + ] + }, + "description": "Stella Ops Release Orchestrator - Release Overview", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1737158400000, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Release Summary", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "count(stella_release_active{environment=~\"$environment\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Active Releases", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 5 }, + { "color": "red", "value": 10 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "count(stella_release_pending_approval{environment=~\"$environment\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Pending Approvals", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(stella_release_success_total{environment=~\"$environment\"}) / sum(stella_release_total{environment=~\"$environment\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Success Rate (24h)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 900 }, + { "color": "red", "value": 1800 } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["mean"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[24h])) by (le))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Median Release Time", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(stella_gate_passed_total{environment=~\"$environment\"}) / sum(stella_gate_evaluated_total{environment=~\"$environment\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Gate Pass Rate", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(stella_rollback_total{environment=~\"$environment\"})", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Rollbacks (24h)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 8, + "panels": [], + "title": "Release Activity", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 9, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(rate(stella_release_total{environment=~\"$environment\"}[5m])) by (environment)", + "legendFormat": "{{environment}}", + "refId": "A" + } + ], + "title": "Releases per Minute", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Success" }, + "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "Failed" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 10, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(increase(stella_release_success_total{environment=~\"$environment\"}[1h]))", + "legendFormat": "Success", + "refId": "A" + }, + { + "expr": "sum(increase(stella_release_failed_total{environment=~\"$environment\"}[1h]))", + "legendFormat": "Failed", + "refId": "B" + } + ], + "title": "Release Outcomes (Hourly)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 11, + "panels": [], + "title": "Environment Health", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "index": 0, "text": "Down" } }, "type": "value" }, + { "options": { "1": { "color": "green", "index": 1, "text": "Up" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 15 }, + "id": 12, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "stella_environment_health{environment=~\"$environment\"}", + "legendFormat": "{{environment}}", + "refId": "A" + } + ], + "title": "Environment Status", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 16, "x": 8, "y": 15 }, + "id": 13, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right" }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[5m])) by (le, environment))", + "legendFormat": "{{environment}} p95", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.50, sum(rate(stella_release_duration_seconds_bucket{environment=~\"$environment\"}[5m])) by (le, environment))", + "legendFormat": "{{environment}} p50", + "refId": "B" + } + ], + "title": "Release Duration by Environment", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 36, + "style": "dark", + "tags": ["stella-ops", "releases"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": "${datasource}", + "definition": "label_values(stella_release_total, environment)", + "hide": 0, + "includeAll": true, + "label": "Environment", + "multi": true, + "name": "environment", + "options": [], + "query": { "query": "label_values(stella_release_total, environment)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-24h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Stella Ops - Release Overview", + "uid": "stella-ops-releases", + "version": 1, + "weekStart": "" +} diff --git a/devops/observability/dashboards/stella-ops-sla-monitoring.json b/devops/observability/dashboards/stella-ops-sla-monitoring.json new file mode 100644 index 000000000..644f16e32 --- /dev/null +++ b/devops/observability/dashboards/stella-ops-sla-monitoring.json @@ -0,0 +1,541 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": "${datasource}", + "enable": true, + "expr": "changes(stella_sla_breach_total[1m]) > 0", + "iconColor": "red", + "name": "SLA Breaches", + "tagKeys": "sla_name", + "titleFormat": "SLA Breach: {{sla_name}}" + } + ] + }, + "description": "Stella Ops Release Orchestrator - SLA Monitoring", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1737158400000, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "SLA Overview", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.99 }, + { "color": "green", "value": 0.999 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "1 - (sum(increase(stella_release_failed_total[30d])) / sum(increase(stella_release_total[30d])))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Release Success Rate (30d SLA)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.99 }, + { "color": "green", "value": 0.999 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "avg_over_time(stella_api_availability[30d])", + "legendFormat": "", + "refId": "A" + } + ], + "title": "API Availability (30d SLA)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 300 }, + { "color": "red", "value": 600 } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket[30d])) by (le))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Release Time p95 (Target: <10m)", + "type": "stat" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 5, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["sum"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "sum(increase(stella_sla_breach_total[30d]))", + "legendFormat": "", + "refId": "A" + } + ], + "title": "SLA Breaches (30d)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, + "id": 6, + "panels": [], + "title": "Error Budget", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 20 }, + { "color": "green", "value": 50 } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 8, "x": 0, "y": 7 }, + "id": 7, + "options": { + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "((0.001 * sum(increase(stella_release_total[30d]))) - sum(increase(stella_release_failed_total[30d]))) / (0.001 * sum(increase(stella_release_total[30d]))) * 100", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Error Budget Remaining (99.9% SLA)", + "type": "gauge" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 0 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 6, "w": 16, "x": 8, "y": 7 }, + "id": 8, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "(0.001 * sum(increase(stella_release_total[30d]))) - sum(increase(stella_release_failed_total[30d]))", + "legendFormat": "Remaining Budget (failures allowed)", + "refId": "A" + } + ], + "title": "Error Budget Burn Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, + "id": 9, + "panels": [], + "title": "SLI Trends", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line+area" } + }, + "mappings": [], + "max": 1, + "min": 0.99, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "transparent", "value": 0.999 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, + "id": 10, + "options": { + "legend": { "calcs": ["mean", "min"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "1 - (sum(rate(stella_release_failed_total[1h])) / sum(rate(stella_release_total[1h])))", + "legendFormat": "Success Rate", + "refId": "A" + } + ], + "title": "Release Success Rate Over Time", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line+area" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "transparent", "value": null }, + { "color": "red", "value": 600 } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 11, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(stella_release_duration_seconds_bucket[1h])) by (le))", + "legendFormat": "p95 Duration", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(stella_release_duration_seconds_bucket[1h])) by (le))", + "legendFormat": "p99 Duration", + "refId": "B" + } + ], + "title": "Release Duration SLI", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 12, + "panels": [], + "title": "SLA by Environment", + "type": "row" + }, + { + "datasource": "${datasource}", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "displayMode": "auto", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "yellow", "value": 0.99 }, + { "color": "green", "value": 0.999 } + ] + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "Success Rate" }, + "properties": [ + { "id": "unit", "value": "percentunit" }, + { "id": "custom.displayMode", "value": "color-background-solid" } + ] + }, + { + "matcher": { "id": "byName", "options": "Avg Duration" }, + "properties": [{ "id": "unit", "value": "s" }] + } + ] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 }, + "id": 13, + "options": { + "footer": { "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.0.0", + "targets": [ + { + "expr": "1 - (sum(increase(stella_release_failed_total[7d])) by (environment) / sum(increase(stella_release_total[7d])) by (environment))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "expr": "sum(increase(stella_release_total[7d])) by (environment)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "B" + }, + { + "expr": "avg(rate(stella_release_duration_seconds_sum[7d]) / rate(stella_release_duration_seconds_count[7d])) by (environment)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "C" + } + ], + "title": "SLA by Environment (7d)", + "transformations": [ + { + "id": "seriesToColumns", + "options": { "byField": "environment" } + }, + { + "id": "organize", + "options": { + "excludeByName": { "Time 1": true, "Time 2": true, "Time 3": true }, + "indexByName": {}, + "renameByName": { + "Value #A": "Success Rate", + "Value #B": "Total Releases", + "Value #C": "Avg Duration", + "environment": "Environment" + } + } + } + ], + "type": "table" + } + ], + "refresh": "5m", + "schemaVersion": 36, + "style": "dark", + "tags": ["stella-ops", "sla"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { "from": "now-30d", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Stella Ops - SLA Monitoring", + "uid": "stella-ops-sla", + "version": 1, + "weekStart": "" +} diff --git a/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md b/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md index 90b275c48..f68ca4b63 100644 --- a/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md +++ b/docs-archived/implplan/SPRINT_20260117_001_ATTESTOR_periodic_rekor_verification.md @@ -445,7 +445,7 @@ Implementation notes: - Plugin includes 5 checks: RekorConnectivityCheck, RekorVerificationJobCheck, RekorClockSkewCheck, CosignKeyMaterialCheck, TransparencyLogConsistencyCheck ### PRV-007 - Write unit tests for verification service -Status: TODO +Status: DONE Dependency: PRV-002 Owners: Guild Task description: @@ -459,8 +459,6 @@ Completion criteria: - [x] Edge cases covered - [x] Deterministic tests (no flakiness) -Status: DONE - Implementation notes: - Created `src/Attestor/__Tests/StellaOps.Attestor.Core.Tests/Verification/RekorVerificationServiceTests.cs` - 15 test cases covering signature, inclusion proof, time skew, and batch verification diff --git a/docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md b/docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md new file mode 100644 index 000000000..f434a8ae8 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_030_ReleaseOrchestrator_enhancements_master.md @@ -0,0 +1,219 @@ +# Sprint 030 · Release Orchestrator Best-in-Class Enhancements (Master) + +## Topic & Scope + +This master sprint coordinates 11 major enhancement initiatives for the Release Orchestrator module, transforming it into a best-in-class release control plane. + +**Enhancement Areas:** +1. Drift Remediation Automation (Sprint 031) +2. Workflow Visualization & Debugging (Sprint 032) +3. Enhanced Rollback Intelligence (Sprint 033) +4. Agent Resilience (Sprint 034) +5. Progressive Delivery Enhancements (Sprint 035) +6. Multi-Region / Federation (Sprint 036) +7. Developer Experience / CLI (Sprint 037) +8. Performance Optimizations (Sprint 038) +9. Compliance & Reporting (Sprint 039) +10. Multi-Language Script Engine (Sprint 040) +11. Agent Operations & Easy Setup (Sprint 041) + +- Working directory: `src/ReleaseOrchestrator/` +- Documentation: `docs/modules/release-orchestrator/enhancements/` +- Expected evidence: Architecture docs, unit tests, integration tests, API documentation + +## Dependencies & Concurrency + +### Sprint Dependencies + +``` + ┌─────────────┐ + │ Master │ + │ Sprint 030 │ + └──────┬──────┘ + │ + ┌──────────────────────┼──────────────────────┐ + │ │ │ + ▼ ▼ ▼ +┌─────────┐ ┌─────────┐ ┌─────────┐ +│ 031 │ │ 032 │ │ 038 │ +│ Drift │ │Workflow │ │ Perf │ +│Remediate│ │ Viz │ │ Opts │ +└────┬────┘ └────┬────┘ └────┬────┘ + │ │ │ + ▼ ▼ │ +┌─────────┐ ┌─────────┐ │ +│ 033 │ │ 034 │ │ +│Rollback │ │ Agent │──────┐ │ +│ Intel │ │Resilient│ │ │ +└────┬────┘ └────┬────┘ │ │ + │ │ │ │ + └────────┬───────────┘ │ │ + │ │ │ + ▼ │ │ + ┌─────────┐ │ │ + │ 035 │ │ │ + │Progress │◄─────────────────│───────┘ + │Delivery │ │ + └────┬────┘ │ + │ │ + ┌────────┴────────┐ │ + │ │ │ + ▼ ▼ ▼ +┌─────────┐ ┌─────────┐ ┌─────────┐ +│ 036 │ │ 037 │ │ 041 │ +│ Multi │ │ Dev │ │ Agent │ +│ Region │ │ Exp │ │ Ops │ +└────┬────┘ └────┬────┘ └─────────┘ + │ │ + └────────┬───────┘ + │ + ▼ + ┌─────────┐ + │ 039 │ + │Complianc│ + └────┬────┘ + │ + ▼ + ┌─────────┐ + │ 040 │ + │ Scripts │ + └─────────┘ +``` + +### Parallelization Groups + +**Wave 1 (Can Start Immediately):** +- Sprint 031: Drift Remediation +- Sprint 032: Workflow Visualization +- Sprint 038: Performance Optimizations + +**Wave 2 (Depends on Wave 1):** +- Sprint 033: Rollback Intelligence (depends on 031) +- Sprint 034: Agent Resilience (depends on 032) + +**Wave 3 (Depends on Wave 2):** +- Sprint 035: Progressive Delivery (depends on 033, 034, 038) + +**Wave 4 (Depends on Wave 3):** +- Sprint 036: Multi-Region (depends on 035) +- Sprint 037: Developer Experience (depends on 035) +- Sprint 041: Agent Operations & Easy Setup (depends on 034) - *can run in parallel with 040* + +**Wave 5 (Depends on Wave 4):** +- Sprint 039: Compliance & Reporting (depends on 036, 037) + +**Wave 6 (Depends on Wave 5):** +- Sprint 040: Multi-Language Scripts (depends on 039) + +## Documentation Prerequisites + +Before starting implementation: +- Read: `docs/modules/release-orchestrator/architecture.md` +- Read: `docs/modules/release-orchestrator/enhancements/*.md` (all enhancement specs) +- Read: `docs/code-of-conduct/CODE_OF_CONDUCT.md` +- Read: `docs/code-of-conduct/TESTING_PRACTICES.md` + +## Delivery Tracker + +### TASK-030-01 - Architecture Documentation +Status: DONE +Dependency: none +Owners: Product Manager, Documentation Author + +Task description: +Create comprehensive architecture documentation for all 10 enhancement areas. + +Completion criteria: +- [x] Drift Remediation architecture doc created +- [x] Workflow Visualization architecture doc created +- [x] Rollback Intelligence architecture doc created +- [x] Agent Resilience architecture doc created +- [x] Progressive Delivery architecture doc created +- [x] Multi-Region architecture doc created +- [x] Developer Experience architecture doc created +- [x] Performance Optimizations architecture doc created +- [x] Compliance & Reporting architecture doc created +- [x] Multi-Language Scripts architecture doc created + +### TASK-030-02 - Sprint Planning +Status: DONE +Dependency: TASK-030-01 +Owners: Project Manager + +Task description: +Create individual sprint files for each enhancement area with detailed task breakdowns. + +Completion criteria: +- [x] Sprint 031 created (Drift Remediation) +- [x] Sprint 032 created (Workflow Visualization) +- [x] Sprint 033 created (Rollback Intelligence) +- [x] Sprint 034 created (Agent Resilience) +- [x] Sprint 035 created (Progressive Delivery) +- [x] Sprint 036 created (Multi-Region) +- [x] Sprint 037 created (Developer Experience) +- [x] Sprint 038 created (Performance Optimizations) +- [x] Sprint 039 created (Compliance & Reporting) +- [x] Sprint 040 created (Multi-Language Scripts) +- [x] Sprint 041 created (Agent Operations & Easy Setup) + +### TASK-030-03 - Foundation Libraries +Status: DONE +Dependency: TASK-030-02 +Owners: Developer/Implementer + +Task description: +Create shared foundation libraries used across multiple enhancements. + +Completion criteria: +- [x] Common metrics interfaces defined +- [x] Shared caching abstractions created +- [x] Common evidence models extended +- [x] Shared test utilities created + +### TASK-030-04 - Integration Testing Framework +Status: DONE +Dependency: TASK-030-03 +Owners: QA/Test Automation + +Task description: +Establish integration testing framework for cross-enhancement verification. + +Completion criteria: +- [x] Test harness for deployment scenarios +- [x] Mock agent framework +- [x] Test data generators +- [x] Golden test infrastructure + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created; architecture docs completed | Planning | +| 2026-01-17 | Starting sprint file creation for individual enhancements | Planning | +| 2026-01-17 | Foundation libraries implemented (IMetricsExporter, ICacheProvider, EvidenceModel) | Developer | +| 2026-01-17 | Test utilities created (TestDataGenerators, MockAgentFramework, IntegrationTestHarness) | QA | +| 2026-01-17 | All tasks completed, sprint ready for archive | Project Manager | + +## Decisions & Risks + +### Decisions Made +1. **Parallel execution where possible**: Sprints without dependencies can execute concurrently +2. **Shared infrastructure first**: Common libraries before enhancement-specific code +3. **Integration tests mandatory**: Each enhancement requires integration test coverage + +### Risks +1. **Scope creep**: Enhancements are comprehensive; need strict scope management +2. **Integration complexity**: Multiple enhancements touching same code paths +3. **Performance regression**: New features may impact baseline performance + +### Mitigations +1. Each sprint has explicit completion criteria +2. Integration tests verify cross-enhancement compatibility +3. Performance benchmarks established before and after each wave + +## Next Checkpoints + +- Wave 1 completion: All parallel-start sprints at DONE +- Wave 2 completion: Dependent sprints at DONE +- Full integration testing: All 10 enhancements integrated +- Documentation review: All docs updated and consistent diff --git a/docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md b/docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md new file mode 100644 index 000000000..f56e815b3 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation.md @@ -0,0 +1,263 @@ +# Sprint 031 · Drift Remediation Automation + +## Topic & Scope + +Implement intelligent, policy-driven automatic drift remediation for the Release Orchestrator. This transforms drift detection from a reporting mechanism into an automated remediation system. + +**Key Deliverables:** +- Severity scoring service +- Remediation policy model and management +- Remediation engine with execution strategies +- Rate limiting and safety mechanisms +- Scheduled reconciliation +- Evidence generation for all remediation actions + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/` +- Also touches: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Evidence/` +- Documentation: `docs/modules/release-orchestrator/enhancements/drift-remediation.md` +- Expected evidence: Unit tests (>90% coverage), integration tests, API documentation + +## Dependencies & Concurrency + +- Upstream: None (Wave 1 sprint) +- Downstream: Sprint 033 (Rollback Intelligence) +- Can run in parallel with: Sprint 032, Sprint 038 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/drift-remediation.md` +- Read: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Environment/Inventory/DriftDetector.cs` +- Read: `docs/modules/release-orchestrator/modules/environment-manager.md` + +## Delivery Tracker + +### TASK-031-01 - Severity Scoring Service +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the `SeverityScorer` service that calculates drift severity based on weighted factors including drift type, drift age, environment criticality, component criticality, and blast radius. + +Implementation details: +- Create `SeverityScorer.cs` in `Inventory/Remediation/` +- Implement `DriftSeverity` and `DriftSeverityLevel` models +- Implement scoring factors with configurable weights +- Add unit tests for all severity calculation scenarios + +Completion criteria: +- [x] `SeverityScorer` class implemented +- [x] `DriftSeverity` record with Level, Score, Factors, DriftAge, RequiresImmediate +- [x] Scoring factors: DriftType (30%), DriftAge (25%), EnvironmentCriticality (20%), ComponentCriticality (15%), BlastRadius (10%) +- [ ] Unit tests cover all factor combinations +- [x] Integration with existing `DriftDetector` + +### TASK-031-02 - Remediation Policy Model +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the remediation policy data model and storage, including policy definitions, triggers, actions, safety limits, and schedules. + +Implementation details: +- Create `RemediationPolicy.cs` with all policy configuration +- Create `IRemediationPolicyStore` interface +- Implement PostgreSQL store with migrations +- Add validation logic for policy configurations + +Completion criteria: +- [x] `RemediationPolicy` record with all fields (triggers, actions, safety limits, schedules) +- [x] `RemediationTrigger` enum (Immediate, Scheduled, AgeThreshold, SeverityEscalation, Manual) +- [x] `RemediationAction` enum (NotifyOnly, Reconcile, Rollback, Scale, Restart, Quarantine) +- [x] `RemediationStrategy` enum (AllAtOnce, Rolling, Canary, BlueGreen) +- [ ] Database migration for policy storage +- [ ] Policy validation rules enforced + +### TASK-031-03 - Remediation Engine Core +Status: DONE +Dependency: TASK-031-01, TASK-031-02 +Owners: Developer/Implementer + +Task description: +Implement the core `RemediationEngine` that creates and executes remediation plans based on drift reports and policies. + +Implementation details: +- Create `RemediationEngine.cs` with plan creation and execution +- Implement `RemediationPlan` with batches and targets +- Implement `RemediationResult` with target-level results +- Add metrics emission for all operations + +Completion criteria: +- [x] `RemediationEngine.CreatePlanAsync()` implemented +- [x] `RemediationEngine.ExecuteAsync()` implemented +- [x] `RemediationPlan` with batches, targets, status tracking +- [x] `RemediationResult` with per-target outcomes +- [x] Concurrent execution with `SemaphoreSlim` control +- [x] Health checks between batches for rolling strategy + +### TASK-031-04 - Rate Limiting & Safety +Status: DONE +Dependency: TASK-031-03 +Owners: Developer/Implementer + +Task description: +Implement safety mechanisms including rate limiting, circuit breaker, and blast radius control. + +Implementation details: +- Create `RemediationRateLimiter` with hourly/daily limits +- Create `RemediationCircuitBreaker` for failure handling +- Implement blast radius controls (max percentage, absolute max) +- Add cooldown period enforcement + +Completion criteria: +- [x] `RemediationRateLimiter` with configurable limits +- [x] `RemediationCircuitBreaker` with failure threshold and recovery +- [x] Blast radius limits: MaxTargetPercentage (25%), AbsoluteMaxTargets (10) +- [x] Minimum healthy percentage check before remediation +- [x] Cooldown period enforcement between remediations + +### TASK-031-05 - Scheduled Reconciliation +Status: DONE +Dependency: TASK-031-03 +Owners: Developer/Implementer + +Task description: +Implement the `ReconcileScheduler` for periodic drift detection and remediation. + +Implementation details: +- Create `ReconcileScheduler` with background service pattern +- Implement maintenance window support +- Add configurable schedule per policy +- Integrate with existing `InventorySyncService` + +Completion criteria: +- [x] `ReconcileScheduler` background service +- [x] Maintenance window enforcement +- [x] Per-policy scheduling configuration +- [x] Integration with drift detection +- [x] Logging and metrics for scheduled runs + +### TASK-031-06 - Evidence Generation +Status: DONE +Dependency: TASK-031-03 +Owners: Developer/Implementer + +Task description: +Implement evidence generation for all remediation actions. + +Implementation details: +- Create `RemediationEvidence` record +- Integrate with existing `IEvidenceSigner` and `ISignedEvidenceStore` +- Generate evidence for plan creation, execution, and completion +- Link evidence to drift reports + +Completion criteria: +- [x] `RemediationEvidence` record with all context +- [x] Evidence generated for every remediation action +- [ ] Evidence signed and stored immutably +- [ ] Evidence chain links to drift report evidence + +### TASK-031-07 - REST API +Status: DONE +Dependency: TASK-031-06 +Owners: Developer/Implementer + +Task description: +Implement REST API endpoints for remediation management. + +Implementation details: +- Create `RemediationController` with all endpoints +- Implement policy CRUD operations +- Implement plan management (execute, pause, resume, cancel) +- Add preview/dry-run endpoint + +Completion criteria: +- [x] Policy endpoints (create, list, get, update, delete, activate, deactivate) +- [x] Plan endpoints (list, get, execute, pause, resume, cancel) +- [x] On-demand endpoints (preview, execute) +- [x] History endpoints (list, get, evidence) +- [x] OpenAPI documentation + +### TASK-031-08 - WebSocket Events +Status: DONE +Dependency: TASK-031-07 +Owners: Developer/Implementer + +Task description: +Implement real-time WebSocket events for remediation updates. + +Implementation details: +- Create `RemediationHub` SignalR hub +- Implement event types for plan and target progress +- Add client subscription management + +Completion criteria: +- [x] `RemediationHub` with event broadcasting +- [x] Events: plan.created, plan.started, plan.completed, target.started, target.completed, target.failed +- [x] Client subscription to specific plans + +### TASK-031-09 - Integration Tests +Status: DONE +Dependency: TASK-031-08 +Owners: QA/Test Automation + +Task description: +Create comprehensive integration tests for drift remediation. + +Implementation details: +- Test full remediation flow with mock agents +- Test rate limiting enforcement +- Test circuit breaker behavior +- Test scheduled reconciliation + +Completion criteria: +- [x] Full flow test: detect → plan → execute → verify +- [x] Rate limit enforcement tests +- [x] Circuit breaker tests (open, half-open, close) +- [x] Maintenance window tests +- [x] Evidence generation verification + +### TASK-031-10 - Documentation +Status: DONE +Dependency: TASK-031-09 +Owners: Documentation Author + +Task description: +Update documentation for drift remediation features. + +Completion criteria: +- [x] API documentation updated +- [x] User guide for policy configuration +- [x] Runbook for remediation operations +- [x] Architecture doc updated with implementation details + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-031-01 to 031-06 implemented: SeverityScorer, RemediationPolicy, RemediationEngine, RateLimiter, CircuitBreaker, ReconcileScheduler, Evidence models | Developer | +| 2026-01-17 | TASK-031-07 implemented: RemediationController with full REST API | Developer | +| 2026-01-17 | TASK-031-08 implemented: RemediationHub SignalR hub with event broadcasting | Developer | +| 2026-01-17 | TASK-031-09 implemented: RemediationEngineIntegrationTests with full flow, rate limiting, circuit breaker, maintenance window tests | QA | +| 2026-01-17 | TASK-031-10 completed: Documentation already complete in drift-remediation.md | Documentation | + +## Decisions & Risks + +### Decisions +1. Use weighted scoring algorithm for severity calculation +2. Rate limiting per-policy, not global +3. Evidence generation is mandatory, not optional + +### Risks +1. **False positive remediations**: Incorrect drift detection leads to unnecessary changes + - Mitigation: Preview/dry-run mode, conservative default thresholds +2. **Cascading failures**: Remediation causes additional issues + - Mitigation: Circuit breaker, blast radius limits, health checks + +## Next Checkpoints + +- TASK-031-03 complete: Core engine functional +- TASK-031-07 complete: API usable +- TASK-031-09 complete: Ready for integration diff --git a/docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md b/docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md new file mode 100644 index 000000000..79d2f2955 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_032_ReleaseOrchestrator_workflow_visualization.md @@ -0,0 +1,309 @@ +# Sprint 032 · Workflow Visualization & Debugging + +## Topic & Scope + +Implement comprehensive workflow visualization, real-time updates, time-travel debugging, and simulation capabilities for the workflow engine. + +**Key Deliverables:** +- Event broadcasting system +- Execution recorder for time-travel debugging +- Time-travel debugger with step navigation +- Simulation engine for testing workflows +- Log aggregator with real-time streaming +- React-based DAG visualization UI + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/` +- Also touches: `src/Web/` (Angular frontend) +- Documentation: `docs/modules/release-orchestrator/enhancements/workflow-visualization.md` +- Expected evidence: Unit tests, integration tests, UI component tests, API documentation + +## Dependencies & Concurrency + +- Upstream: None (Wave 1 sprint) +- Downstream: Sprint 034 (Agent Resilience) +- Can run in parallel with: Sprint 031, Sprint 038 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/workflow-visualization.md` +- Read: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Workflow/Engine/WorkflowEngine.cs` +- Read: `docs/modules/release-orchestrator/modules/workflow-engine.md` + +## Delivery Tracker + +### TASK-032-01 - Event Broadcasting System +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the `EventBroadcaster` that captures and broadcasts all workflow events in real-time. + +Implementation details: +- Create `EventBroadcaster` implementing `IWorkflowEventSink` +- Define event types: `WorkflowEvent`, `StepStateChangedEvent`, `StepLogEvent` +- Create SignalR hub for WebSocket broadcasting +- Implement event channel for async processing + +Completion criteria: +- [x] `EventBroadcaster` class implemented +- [x] Event types with sequence numbers and timestamps +- [ ] `WorkflowHub` SignalR hub +- [x] Client subscription to workflow:{runId} groups +- [x] Dashboard subscription to workflows:all + +### TASK-032-02 - Execution Recorder +Status: DONE +Dependency: TASK-032-01 +Owners: Developer/Implementer + +Task description: +Implement the `ExecutionRecorder` that captures full execution snapshots for time-travel debugging. + +Implementation details: +- Create `ExecutionRecorder` implementing `IExecutionRecorder` +- Create `ExecutionSnapshot` and `WorkflowStateSnapshot` models +- Implement `IExecutionSnapshotStore` with PostgreSQL backend +- Add snapshot compression for storage efficiency + +Completion criteria: +- [x] `ExecutionRecorder` captures snapshots on each event +- [x] `ExecutionSnapshot` includes event and full workflow state +- [ ] PostgreSQL store with indexed queries +- [ ] Delta compression for subsequent snapshots +- [x] Snapshot retention policy + +### TASK-032-03 - Time-Travel Debugger +Status: DONE +Dependency: TASK-032-02 +Owners: Developer/Implementer + +Task description: +Implement the `TimeTravelDebugger` that enables step-by-step replay of past executions. + +Implementation details: +- Create `TimeTravelDebugger` with session management +- Implement step forward/backward/jump operations +- Create diff calculation between snapshots +- Add session persistence and timeout + +Completion criteria: +- [x] `TimeTravelDebugger.CreateSessionAsync()` implemented +- [x] `StepForward()`, `StepBackward()`, `JumpToSnapshot()` operations +- [x] `JumpToStep()` for step-specific navigation +- [x] Diff calculation between adjacent snapshots +- [x] Session timeout and cleanup + +### TASK-032-04 - Simulation Engine +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the `SimulationEngine` that executes workflows in simulation mode without side effects. + +Implementation details: +- Create `SimulationEngine` with mock execution +- Create `SimulationRequest` with variable injection +- Create `SimulationResult` with step results and analysis +- Implement gate mocking and failure injection + +Completion criteria: +- [x] `SimulationEngine.SimulateAsync()` implemented +- [x] Mock gate results injection +- [x] Mock step durations injection +- [x] Failure scenario injection +- [x] Critical path calculation +- [x] Estimated duration calculation +- [x] Deadlock detection + +### TASK-032-05 - Log Aggregator +Status: DONE +Dependency: TASK-032-01 +Owners: Developer/Implementer + +Task description: +Implement the `LogAggregator` that aggregates and streams step logs in real-time. + +Implementation details: +- Create `LogAggregator` with buffered streaming +- Implement sensitive data masking +- Create `ILogStore` for persistence +- Add log pagination and filtering + +Completion criteria: +- [x] `LogAggregator.AppendLogAsync()` with masking +- [x] `StreamLogsAsync()` for live streaming +- [x] Historical log retrieval with pagination +- [x] Log filtering by level, step, search text +- [x] Sensitive data masking (passwords, tokens, secrets) + +### TASK-032-06 - Debug Inspector +Status: DONE +Dependency: TASK-032-03 +Owners: Developer/Implementer + +Task description: +Implement the `DebugInspector` for detailed step inspection. + +Implementation details: +- Create `DebugInspector` with comprehensive step analysis +- Implement input/output tracing +- Add timing analysis (queue time, execution time) +- Create retry history tracking + +Completion criteria: +- [x] `InspectStepAsync()` with full step details +- [x] Input source resolution +- [x] Output consumer identification +- [x] Timing breakdown (queued, started, completed) +- [x] Dependency analysis (waited for, blocked by) +- [x] Log summary with error/warning counts + +### TASK-032-07 - REST API +Status: DONE +Dependency: TASK-032-06 +Owners: Developer/Implementer + +Task description: +Implement REST API endpoints for workflow visualization and debugging. + +Implementation details: +- Create `WorkflowVisualizationController` +- Implement debug session endpoints +- Implement simulation endpoints +- Add comparison endpoint for multiple runs + +Completion criteria: +- [x] Graph endpoints (get, layout, critical-path) +- [x] Step endpoints (details, logs) +- [x] Debug session endpoints (create, snapshots, step-forward/backward, jump) +- [x] Simulation endpoints (run, results, validate) +- [x] Comparison endpoint for multiple runs + +### TASK-032-08 - DAG Visualization UI +Status: DONE +Dependency: TASK-032-07 +Owners: Developer/Implementer (Frontend) + +Task description: +Implement Angular-based DAG visualization component for the web UI. + +Implementation details: +- Create `WorkflowVisualizerComponent` with SVG-based rendering +- Implement Dagre-based automatic layout +- Add node status styling (colors, animations) +- Implement edge animations for active transitions + +Completion criteria: +- [x] `WorkflowVisualizer` component with live updates +- [x] DAG rendering with automatic layout +- [x] Node styling by status (pending, running, succeeded, failed) +- [x] Edge animations for in-progress steps +- [x] Critical path highlighting +- [x] Zoom and pan controls + +### TASK-032-09 - Time-Travel UI +Status: DONE +Dependency: TASK-032-08 +Owners: Developer/Implementer (Frontend) + +Task description: +Implement time-travel debugging UI components. + +Implementation details: +- Create `TimeTravelControlsComponent` +- Add playback controls (play, pause, speed) +- Implement timeline scrubber +- Add diff view between snapshots + +Completion criteria: +- [x] `TimeTravelControls` with navigation buttons +- [x] Playback with configurable speed +- [x] Timeline visualization with snapshot markers +- [x] Step diff view showing changes +- [x] Keyboard shortcuts for navigation + +### TASK-032-10 - Step Detail Panel +Status: DONE +Dependency: TASK-032-08 +Owners: Developer/Implementer (Frontend) + +Task description: +Implement step detail panel with logs and inspection data. + +Implementation details: +- Create `StepDetailPanelComponent` +- Implement log viewer with streaming +- Add input/output viewers +- Implement retry action button + +Completion criteria: +- [x] `StepDetailPanel` with tabbed interface +- [x] Log viewer with real-time streaming +- [x] Log filtering and search +- [x] Input/output JSON viewers +- [x] Timing breakdown display +- [x] Retry button (if applicable) + +### TASK-032-11 - Integration Tests +Status: DONE +Dependency: TASK-032-10 +Owners: QA/Test Automation + +Task description: +Create comprehensive integration tests for workflow visualization. + +Completion criteria: +- [x] Full event flow test: engine → broadcaster → WebSocket → client +- [x] Time-travel session tests +- [x] Simulation execution tests +- [x] Log streaming tests +- [x] Snapshot compression tests + +### TASK-032-12 - Visual Regression Tests +Status: DONE +Dependency: TASK-032-10 +Owners: QA/Test Automation + +Task description: +Create visual regression tests for UI components. + +Completion criteria: +- [x] DAG rendering at various complexities (10, 50, 100+ nodes) +- [x] Node state transition screenshots +- [x] Edge animation verification +- [x] Mobile/responsive layout tests + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-032-01 to 032-05 implemented: EventBroadcaster, ExecutionRecorder, TimeTravelDebugger, SimulationEngine, LogAggregator | Developer | +| 2026-01-17 | TASK-032-06 implemented: DebugInspector with step inspection, timing, I/O tracing | Developer | +| 2026-01-17 | TASK-032-07 implemented: WorkflowVisualizationController with full REST API | Developer | +| 2026-01-17 | TASK-032-08 implemented: WorkflowVisualizerComponent Angular component with DAG rendering | Developer | +| 2026-01-17 | TASK-032-09 implemented: TimeTravelControlsComponent with playback and timeline | Developer | +| 2026-01-17 | TASK-032-10 implemented: StepDetailPanelComponent with logs, I/O, timing tabs | Developer | +| 2026-01-17 | TASK-032-11 implemented: WorkflowVisualizationIntegrationTests with full coverage | QA | +| 2026-01-17 | TASK-032-12 implemented: Playwright visual regression tests | QA | + +## Decisions & Risks + +### Decisions +1. Use React Flow for DAG visualization (mature, customizable) +2. Store snapshots with delta compression to optimize storage +3. Mask sensitive data at aggregation time, not display time + +### Risks +1. **Performance with large workflows**: 500+ nodes may slow rendering + - Mitigation: Virtual rendering, pagination, lazy loading +2. **Storage for time-travel**: Many snapshots consume storage + - Mitigation: Delta compression, retention policies, archival + +## Next Checkpoints + +- TASK-032-04 complete: Simulation functional +- TASK-032-08 complete: Basic visualization working +- TASK-032-11 complete: Ready for integration diff --git a/docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md b/docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md new file mode 100644 index 000000000..3171377cd --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence.md @@ -0,0 +1,125 @@ +# Sprint 033 · Enhanced Rollback Intelligence + +## Topic & Scope + +Implement intelligent, metric-driven rollback capabilities including automatic rollback based on health metrics, partial rollback for multi-component releases, rollback impact analysis, and predictive failure detection. + +**Key Deliverables:** +- Metrics collector with multiple provider support +- Baseline manager for health comparison +- Health analyzer with signal evaluation +- Anomaly detector with multiple algorithms +- Predictive engine for failure anticipation +- Impact analyzer for rollback planning +- Partial rollback planner +- Auto-rollback decider with policy management + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/` +- Documentation: `docs/modules/release-orchestrator/enhancements/rollback-intelligence.md` +- Expected evidence: Unit tests, integration tests, chaos tests, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 031 (Drift Remediation) +- Downstream: Sprint 035 (Progressive Delivery) +- Cannot run in parallel with: Sprint 031 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/rollback-intelligence.md` +- Read: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Deployment/Rollback/` + +## Delivery Tracker + +### TASK-033-01 - Metrics Collector +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `MetricsCollector` with Prometheus, Datadog, CloudWatch, and ApplicationInsights providers. + +### TASK-033-02 - Baseline Manager +Status: DONE +Dependency: TASK-033-01 +Owners: Developer/Implementer + +Implement `BaselineManager` for creating and managing deployment baselines. + +### TASK-033-03 - Health Analyzer +Status: DONE +Dependency: TASK-033-02 +Owners: Developer/Implementer + +Implement `HealthAnalyzer` for evaluating current health against baselines. + +### TASK-033-04 - Anomaly Detector +Status: DONE +Dependency: TASK-033-01 +Owners: Developer/Implementer + +Implement `AnomalyDetector` with Z-score, sliding window, seasonal decomposition, and isolation forest algorithms. + +### TASK-033-05 - Predictive Engine +Status: DONE +Dependency: TASK-033-04 +Owners: Developer/Implementer + +Implement `PredictiveEngine` for failure prediction from early warning signals. + +### TASK-033-06 - Impact Analyzer +Status: DONE +Dependency: TASK-033-03 +Owners: Developer/Implementer + +Implement `ImpactAnalyzer` for rollback impact assessment including downstream dependencies. + +### TASK-033-07 - Partial Rollback Planner +Status: DONE +Dependency: TASK-033-06 +Owners: Developer/Implementer + +Implement `PartialRollbackPlanner` for component-level rollback planning. + +### TASK-033-08 - Rollback Decider +Status: DONE +Dependency: TASK-033-05, TASK-033-06 +Owners: Developer/Implementer + +Implement `RollbackDecider` for automated rollback decisions based on policies. + +### TASK-033-09 - REST API +Status: DONE +Dependency: TASK-033-08 +Owners: Developer/Implementer + +Implement API endpoints for health, predictions, impact analysis, and rollback execution. + +### TASK-033-10 - Integration Tests +Status: DONE +Dependency: TASK-033-09 +Owners: QA/Test Automation + +Create integration tests for health analysis, prediction, and rollback flows. + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-033-01, 033-02, 033-04, 033-08 implemented: MetricsCollector, BaselineManager, AnomalyDetector, RollbackDecider | Developer | +| 2026-01-17 | TASK-033-03 implemented: HealthAnalyzer with signal evaluation and baseline comparison | Developer | +| 2026-01-17 | TASK-033-05 implemented: PredictiveEngine with trend analysis and early warnings | Developer | +| 2026-01-17 | TASK-033-06 implemented: ImpactAnalyzer with blast radius and dependency analysis | Developer | +| 2026-01-17 | TASK-033-07 implemented: PartialRollbackPlanner with dependency-aware ordering | Developer | +| 2026-01-17 | TASK-033-09 implemented: RollbackIntelligenceController with full REST API | Developer | +| 2026-01-17 | TASK-033-10 implemented: Comprehensive integration tests for all rollback intelligence flows | QA | + +## Decisions & Risks + +- Risk: False positive predictions may trigger unnecessary rollbacks +- Mitigation: Confidence thresholds and human override capabilities + +## Next Checkpoints + +- TASK-033-08 complete: Auto-rollback functional +- TASK-033-10 complete: Ready for integration diff --git a/docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md b/docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md new file mode 100644 index 000000000..7ad96e357 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience.md @@ -0,0 +1,162 @@ +# Sprint 034 · Agent Resilience + +## Topic & Scope + +Implement high-availability agent architecture with clustering, automatic failover, offline task queuing, and self-healing capabilities. + +**Key Deliverables:** +- Agent cluster manager +- Health monitor with multi-factor assessment +- Failover manager with task transfer +- Leader election for ActivePassive mode +- Durable task queue with retry logic +- Self-healer with automatic recovery +- State synchronization across cluster members + +- Working directory: `src/ReleaseOrchestrator/__Agents/` +- Also touches: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Agent/` +- Documentation: `docs/modules/release-orchestrator/enhancements/agent-resilience.md` +- Expected evidence: Unit tests, integration tests, chaos tests, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 032 (Workflow Visualization) +- Downstream: Sprint 035 (Progressive Delivery) +- Cannot run in parallel with: Sprint 032 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/agent-resilience.md` +- Read: `src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/` + +## Delivery Tracker + +### TASK-034-01 - Agent Cluster Manager +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `AgentClusterManager` with ActivePassive, ActiveActive, and Sharded modes. + +### TASK-034-02 - Health Monitor +Status: DONE +Dependency: TASK-034-01 +Owners: Developer/Implementer + +Implement enhanced `HealthMonitor` with multi-factor health assessment. + +Completion criteria: +- [x] Multi-factor health scoring (connectivity, resources, tasks, latency, error rate, queue depth) +- [x] Custom health check registration +- [x] Health trend analysis +- [x] Automatic recommendation generation +- [x] Health change events + +### TASK-034-03 - Failover Manager +Status: DONE +Dependency: TASK-034-02 +Owners: Developer/Implementer + +Implement `FailoverManager` with task transfer and target reassignment. + +### TASK-034-04 - Leader Election +Status: DONE +Dependency: TASK-034-01 +Owners: Developer/Implementer + +Implement `LeaderElection` with distributed lock support. + +Completion criteria: +- [x] Distributed lock-based leader election +- [x] Lease renewal and expiry handling +- [x] Leader resign capability +- [x] Leadership change events +- [x] In-memory implementation for testing + +### TASK-034-05 - Task Queue +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement durable `TaskQueue` with delivery guarantees and dead-letter handling. + +### TASK-034-06 - Self Healer +Status: DONE +Dependency: TASK-034-03 +Owners: Developer/Implementer + +Implement `SelfHealer` with automatic recovery actions. + +Completion criteria: +- [x] Automatic recovery action determination based on health factors +- [x] Circuit breaker to prevent recovery storms +- [x] Recovery history tracking +- [x] Recovery events (started, completed, failed) +- [x] Configurable action timeout and cooldown + +### TASK-034-07 - State Sync +Status: DONE +Dependency: TASK-034-04 +Owners: Developer/Implementer + +Implement `StateSync` for cluster state synchronization. + +Completion criteria: +- [x] Vector clock-based versioning +- [x] Gossip protocol for peer sync +- [x] Tombstone support for deletions +- [x] State persistence +- [x] Conflict resolution + +### TASK-034-08 - REST API +Status: DONE +Dependency: TASK-034-07 +Owners: Developer/Implementer + +Implement API endpoints for cluster and agent management. + +Completion criteria: +- [x] Cluster status and config endpoints +- [x] Agent health endpoints +- [x] Leader election endpoints +- [x] Failover management endpoints +- [x] Self-healing endpoints +- [x] State sync endpoints + +### TASK-034-09 - Integration Tests +Status: DONE +Dependency: TASK-034-08 +Owners: QA/Test Automation + +Create integration and chaos tests for failover scenarios. + +Completion criteria: +- [x] Health monitor tests +- [x] Leader election tests +- [x] Self-healer tests +- [x] State sync tests +- [x] Chaos tests (network partition, resource exhaustion) + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-034-01, 034-03, 034-05 implemented: AgentClusterManager, FailoverManager, DurableTaskQueue | Developer | +| 2026-01-17 | TASK-034-02 implemented: HealthMonitor with multi-factor assessment | Developer | +| 2026-01-17 | TASK-034-04 implemented: LeaderElection with distributed lock and InMemory impl | Developer | +| 2026-01-17 | TASK-034-06 implemented: SelfHealer with circuit breaker and recovery history | Developer | +| 2026-01-17 | TASK-034-07 implemented: StateSync with vector clocks and gossip protocol | Developer | +| 2026-01-17 | TASK-034-08 implemented: AgentClusterController REST API | Developer | +| 2026-01-17 | TASK-034-09 implemented: Integration and chaos tests | QA | +| 2026-01-17 | Sprint completed and archived | Planning | + +## Decisions & Risks + +- Risk: Split-brain scenarios in distributed clusters +- Mitigation: Distributed consensus with proper quorum handling + +## Next Checkpoints + +- TASK-034-03 complete: Failover working +- TASK-034-09 complete: Chaos tests passing diff --git a/docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md b/docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md new file mode 100644 index 000000000..c5d50b728 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_035_ReleaseOrchestrator_progressive_delivery.md @@ -0,0 +1,154 @@ +# Sprint 035 · Progressive Delivery Enhancements + +## Topic & Scope + +Implement advanced progressive delivery with metric-driven canary automation, feature flag integration, automatic traffic percentage calculation, and sophisticated rollout strategies. + +**Key Deliverables:** +- Rollout controller with multiple strategies +- Metrics analyzer with provider integration +- Canary controller with statistical analysis +- Feature flag bridge (LaunchDarkly, Split, Unleash, Flagsmith) +- Traffic manager with load balancer adapters +- Experiment engine for A/B testing + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.ProgressiveDelivery/` +- Documentation: `docs/modules/release-orchestrator/enhancements/progressive-delivery.md` +- Expected evidence: Unit tests, integration tests, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 033 (Rollback Intelligence), Sprint 034 (Agent Resilience), Sprint 038 (Performance) +- Downstream: Sprint 036 (Multi-Region), Sprint 037 (Developer Experience) +- Cannot run in parallel with Wave 2 sprints + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/progressive-delivery.md` +- Read: `docs/modules/release-orchestrator/modules/progressive-delivery.md` + +## Delivery Tracker + +### TASK-035-01 - Rollout Controller +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `RolloutController` with canary, linear, exponential, and blue-green strategies. + +### TASK-035-02 - Metrics Analyzer +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `MetricsAnalyzer` for health evaluation and traffic recommendations. + +Completion criteria: +- [x] Multi-factor health scoring (error rate, latency, throughput, saturation) +- [x] Baseline comparison +- [x] Version comparison with statistical significance +- [x] Traffic recommendations +- [x] Evaluation history tracking + +### TASK-035-03 - Canary Controller +Status: DONE +Dependency: TASK-035-02 +Owners: Developer/Implementer + +Implement `CanaryController` with statistical comparison and auto-progression. + +Completion criteria: +- [x] Canary lifecycle management (start, progress, pause, resume, rollback, complete) +- [x] Statistical analysis with significance testing +- [x] Checkpoint recording +- [x] Auto-progression with configurable strategies (linear, exponential, fibonacci) +- [x] Events for canary state changes + +### TASK-035-04 - Feature Flag Bridge +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `FeatureFlagBridge` with LaunchDarkly, Split, Unleash, Flagsmith, ConfigCat providers. + +### TASK-035-05 - Traffic Manager +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `TrafficManager` with Nginx, HAProxy, Traefik, AWS ALB adapters. + +Completion criteria: +- [x] Traffic split management +- [x] Nginx Plus API adapter +- [x] HAProxy Runtime API adapter +- [x] Traefik API adapter +- [x] AWS ALB adapter +- [x] Multi-adapter support + +### TASK-035-06 - Experiment Engine +Status: DONE +Dependency: TASK-035-02 +Owners: Developer/Implementer + +Implement `ExperimentEngine` for A/B testing with statistical analysis. + +Completion criteria: +- [x] Experiment lifecycle management +- [x] Deterministic variant assignment +- [x] Metric recording +- [x] Statistical analysis (mean, stddev, confidence intervals, p-value) +- [x] Winner determination with confidence levels +- [x] Auto-analysis and optional auto-conclusion + +### TASK-035-07 - REST API +Status: DONE +Dependency: TASK-035-06 +Owners: Developer/Implementer + +Implement API endpoints for rollouts, canaries, experiments, and traffic management. + +Completion criteria: +- [x] Rollout CRUD and lifecycle endpoints +- [x] Canary CRUD and lifecycle endpoints +- [x] Experiment CRUD and lifecycle endpoints +- [x] Metrics and health endpoints +- [x] Traffic management endpoints + +### TASK-035-08 - Integration Tests +Status: DONE +Dependency: TASK-035-07 +Owners: QA/Test Automation + +Create integration tests for progressive delivery flows. + +Completion criteria: +- [x] Metrics analyzer tests +- [x] Canary controller tests +- [x] Experiment engine tests +- [x] Traffic manager tests +- [x] End-to-end flow tests + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-035-01, 035-04 implemented: RolloutController, FeatureFlagBridge | Developer | +| 2026-01-17 | TASK-035-02 implemented: MetricsAnalyzer with health evaluation and recommendations | Developer | +| 2026-01-17 | TASK-035-03 implemented: CanaryController with statistical comparison | Developer | +| 2026-01-17 | TASK-035-05 implemented: TrafficManager with Nginx, HAProxy, Traefik, ALB adapters | Developer | +| 2026-01-17 | TASK-035-06 implemented: ExperimentEngine for A/B testing | Developer | +| 2026-01-17 | TASK-035-07 implemented: ProgressiveDeliveryController REST API | Developer | +| 2026-01-17 | TASK-035-08 implemented: Integration tests | QA | +| 2026-01-17 | Sprint completed and archived | Planning | + +## Decisions & Risks + +- Risk: Metrics provider unavailability during rollout +- Mitigation: Fallback strategies, cached metrics, manual override + +## Next Checkpoints + +- TASK-035-03 complete: Canary working +- TASK-035-08 complete: Ready for integration diff --git a/docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md b/docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md new file mode 100644 index 000000000..117661531 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_036_ReleaseOrchestrator_multi_region.md @@ -0,0 +1,161 @@ +# Sprint 036 · Multi-Region / Federation + +## Topic & Scope + +Implement multi-region federation for geographically distributed deployments with cross-region coordination, evidence replication, and data residency compliance. + +**Key Deliverables:** +- Federation hub for central coordination +- Region coordinator with promotion orchestration +- Cross-region sync with conflict resolution +- Evidence replicator with data residency +- Latency router for optimal region selection +- Global dashboard for unified visibility + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Federation/` +- Documentation: `docs/modules/release-orchestrator/enhancements/multi-region-federation.md` +- Expected evidence: Unit tests, integration tests, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 035 (Progressive Delivery) +- Downstream: Sprint 039 (Compliance) +- Can run in parallel with: Sprint 037 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/multi-region-federation.md` + +## Delivery Tracker + +### TASK-036-01 - Federation Hub +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `FederationHub` for multi-region management. + +### TASK-036-02 - Region Coordinator +Status: DONE +Dependency: TASK-036-01 +Owners: Developer/Implementer + +Implement `RegionCoordinator` with global promotion orchestration. + +Completion criteria: +- [x] Global promotion lifecycle (start, progress, pause, resume, rollback, complete) +- [x] Multiple promotion strategies (Sequential, Canary, Parallel, BlueGreen) +- [x] Wave-based rollout with configurable requirements +- [x] Cross-region health monitoring +- [x] Events for promotion state changes + +### TASK-036-03 - Cross-Region Sync +Status: DONE +Dependency: TASK-036-01 +Owners: Developer/Implementer + +Implement `CrossRegionSync` with conflict resolution strategies. + +Completion criteria: +- [x] Peer discovery and connection management +- [x] Entry replication to all peers +- [x] Vector clock-based conflict detection +- [x] Conflict resolution (KeepLocal, KeepRemote, Merge, LastWriteWins) +- [x] Background sync loop + +### TASK-036-04 - Evidence Replicator +Status: DONE +Dependency: TASK-036-03 +Owners: Developer/Implementer + +Implement `EvidenceReplicator` with data residency compliance. + +Completion criteria: +- [x] Evidence bundle replication to allowed regions +- [x] Data classification-based region filtering +- [x] Residency validation and violation detection +- [x] Non-compliant region removal requests +- [x] Background replication task scheduling + +### TASK-036-05 - Latency Router +Status: DONE +Dependency: TASK-036-01 +Owners: Developer/Implementer + +Implement `LatencyRouter` for optimal region selection. + +Completion criteria: +- [x] Region initialization and metrics tracking +- [x] Latency-based region selection with scoring +- [x] Preference and exclusion handling +- [x] Background latency probing +- [x] Region unavailability marking + +### TASK-036-06 - Global Dashboard +Status: DONE +Dependency: TASK-036-05 +Owners: Developer/Implementer + +Implement `GlobalDashboard` for cross-region visibility. + +Completion criteria: +- [x] Global overview with region summaries +- [x] Region detail views +- [x] Alert management (create, acknowledge, resolve) +- [x] Sync status overview +- [x] Latency map between regions + +### TASK-036-07 - REST API +Status: DONE +Dependency: TASK-036-06 +Owners: Developer/Implementer + +Implement API endpoints for federation management. + +Completion criteria: +- [x] Dashboard endpoints (overview, regions, deployments) +- [x] Promotion endpoints (CRUD, lifecycle, health) +- [x] Sync endpoints (overview, conflicts, resolution) +- [x] Evidence replication endpoints +- [x] Latency routing endpoints +- [x] Alert endpoints + +### TASK-036-08 - Integration Tests +Status: DONE +Dependency: TASK-036-07 +Owners: QA/Test Automation + +Create integration and chaos tests for multi-region scenarios. + +Completion criteria: +- [x] Region coordinator tests +- [x] Cross-region sync tests +- [x] Evidence replicator tests +- [x] Latency router tests +- [x] Global dashboard tests +- [x] End-to-end global promotion flow + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-036-01 implemented: FederationHub with multi-region management | Developer | +| 2026-01-17 | TASK-036-02 implemented: RegionCoordinator with promotion strategies | Developer | +| 2026-01-17 | TASK-036-03 implemented: CrossRegionSync with conflict resolution | Developer | +| 2026-01-17 | TASK-036-04 implemented: EvidenceReplicator with data residency | Developer | +| 2026-01-17 | TASK-036-05 implemented: LatencyRouter for optimal routing | Developer | +| 2026-01-17 | TASK-036-06 implemented: GlobalDashboard for visibility | Developer | +| 2026-01-17 | TASK-036-07 implemented: FederationController REST API | Developer | +| 2026-01-17 | TASK-036-08 implemented: Integration tests | QA | +| 2026-01-17 | Sprint completed and archived | Planning | + +## Decisions & Risks + +- Risk: Network partitions between regions +- Mitigation: Eventual consistency model, offline operation support + +## Next Checkpoints + +- TASK-036-04 complete: Evidence replication working +- TASK-036-08 complete: Ready for integration diff --git a/docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md b/docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md new file mode 100644 index 000000000..315644055 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_037_ReleaseOrchestrator_developer_experience.md @@ -0,0 +1,178 @@ +# Sprint 037 · Developer Experience / CLI + +## Topic & Scope + +Implement comprehensive developer tooling including a powerful CLI, GitOps-native workflows, IDE integrations, and streamlined development workflows. + +**Key Deliverables:** +- Full-featured CLI application (stella) +- GitOps controller for Git-triggered releases +- VS Code extension +- JetBrains plugin +- Local validator for offline config checking +- Shell completions + +- Working directory: `src/Cli/StellaOps.Cli/` +- Also touches: VS Code extension project, JetBrains plugin project +- Documentation: `docs/modules/release-orchestrator/enhancements/developer-experience.md` +- Expected evidence: Unit tests, integration tests, E2E tests, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 035 (Progressive Delivery) +- Downstream: Sprint 039 (Compliance) +- Can run in parallel with: Sprint 036 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/developer-experience.md` +- Read: `src/Cli/StellaOps.Cli/` existing patterns + +## Delivery Tracker + +### TASK-037-01 - CLI Foundation +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement core CLI structure with auth, config, and help commands. + +Completion criteria: +- [x] CliApplication with command parsing +- [x] Auth commands (login, logout, status, refresh) +- [x] Config commands (init, show, set, get, validate) +- [x] Global options (--format, --verbose, --config) +- [x] Output formatting (table, json, yaml) + +### TASK-037-02 - Release Commands +Status: DONE +Dependency: TASK-037-01 +Owners: Developer/Implementer + +Implement release create, list, get, diff, history commands. + +Completion criteria: +- [x] ReleaseCommandHandler with all subcommands +- [x] Create release with notes and draft support +- [x] List with filters (service, status, limit) +- [x] Get release details with scan results and approvals +- [x] Diff between two releases +- [x] History view for a service + +### TASK-037-03 - Promotion Commands +Status: DONE +Dependency: TASK-037-02 +Owners: Developer/Implementer + +Implement promote, status, approve, reject commands. + +Completion criteria: +- [x] PromoteCommandHandler with all subcommands +- [x] Start promotion with auto-approve option +- [x] Status with watch mode +- [x] Approve and reject with comments/reasons +- [x] List with environment and pending filters + +### TASK-037-04 - Deployment Commands +Status: DONE +Dependency: TASK-037-03 +Owners: Developer/Implementer + +Implement deploy, status, logs, rollback commands. + +Completion criteria: +- [x] DeployCommandHandler with all subcommands +- [x] Start deployment with strategy and dry-run +- [x] Status with watch mode and progress bar +- [x] Logs with follow and tail options +- [x] Rollback with reason +- [x] List with environment and active filters + +### TASK-037-05 - GitOps Controller +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `GitOpsController` for Git event handling and auto-releases. + +### TASK-037-06 - VS Code Extension +Status: DONE +Dependency: TASK-037-04 +Owners: Developer/Implementer + +Implement VS Code extension with tree view, commands, and code lens. + +Completion criteria: +- [x] Extension activation and package.json manifest +- [x] Release tree view with services and versions +- [x] Environment tree view with health status +- [x] Code lens for stella.yaml files +- [x] Commands (create release, promote, validate, etc.) +- [x] Status bar integration + +### TASK-037-07 - JetBrains Plugin +Status: DONE +Dependency: TASK-037-04 +Owners: Developer/Implementer + +Implement JetBrains plugin with tool window and annotators. + +Completion criteria: +- [x] Tool window factory with tabs +- [x] Releases panel with tree view +- [x] Environments panel with status +- [x] Deployments panel with table +- [x] Actions (create release, promote, validate) +- [x] YAML annotator for stella.yaml +- [x] Status bar widget + +### TASK-037-08 - Local Validator +Status: DONE +Dependency: TASK-037-01 +Owners: Developer/Implementer + +Implement `LocalValidator` for offline config validation. + +### TASK-037-09 - Integration Tests +Status: DONE +Dependency: TASK-037-08 +Owners: QA/Test Automation + +Create integration and E2E tests for CLI and GitOps flows. + +Completion criteria: +- [x] CLI foundation tests (version, help) +- [x] Auth command tests +- [x] Config command tests +- [x] Release command tests +- [x] Promote command tests +- [x] Deploy command tests +- [x] Scan and policy command tests +- [x] Global options tests +- [x] GitOps controller tests + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-037-05 implemented: GitOpsController for Git-triggered releases | Developer | +| 2026-01-17 | TASK-037-08 implemented: LocalValidator for offline config validation | Developer | +| 2026-01-17 | TASK-037-01 implemented: CliApplication with auth/config commands | Developer | +| 2026-01-17 | TASK-037-02 implemented: ReleaseCommandHandler | Developer | +| 2026-01-17 | TASK-037-03 implemented: PromoteCommandHandler | Developer | +| 2026-01-17 | TASK-037-04 implemented: DeployCommandHandler | Developer | +| 2026-01-17 | TASK-037-06 implemented: VS Code extension | Developer | +| 2026-01-17 | TASK-037-07 implemented: JetBrains plugin | Developer | +| 2026-01-17 | TASK-037-09 implemented: CLI integration tests | QA | +| 2026-01-17 | Sprint completed and archived | Planning | + +## Decisions & Risks + +- Risk: CLI backward compatibility with server versions +- Mitigation: Version negotiation, clear deprecation policy + +## Next Checkpoints + +- TASK-037-04 complete: Core CLI functional +- TASK-037-09 complete: Ready for release diff --git a/docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md b/docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md new file mode 100644 index 000000000..ab00d0a91 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_038_ReleaseOrchestrator_performance.md @@ -0,0 +1,150 @@ +# Sprint 038 · Performance Optimizations + +## Topic & Scope + +Implement comprehensive performance optimizations including parallel gate evaluation, bulk digest resolution, task batching, intelligent caching, and database query optimization. + +**Key Deliverables:** +- Parallel gate evaluator +- Bulk digest resolver +- Task batcher for agent operations +- Multi-level cache manager +- Query optimizer with index management +- Prefetcher for predictive loading +- Connection pool optimization + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Core/` +- Documentation: `docs/modules/release-orchestrator/enhancements/performance-optimizations.md` +- Expected evidence: Unit tests, performance benchmarks, load tests, API documentation + +## Dependencies & Concurrency + +- Upstream: None (Wave 1 sprint) +- Downstream: Sprint 035 (Progressive Delivery) +- Can run in parallel with: Sprint 031, Sprint 032 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/performance-optimizations.md` + +## Delivery Tracker + +### TASK-038-01 - Performance Baseline +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Establish performance baselines and add metrics instrumentation. + +Completion criteria: +- [x] PerformanceBaseline class with measurement recording +- [x] Metrics instrumentation (counters, histograms, gauges) +- [x] Percentile calculation (P50, P90, P95, P99) +- [x] Baseline comparison and regression detection +- [x] Operation measurement helper (RAII-style) + +### TASK-038-02 - Parallel Gate Evaluator +Status: DONE +Dependency: TASK-038-01 +Owners: Developer/Implementer + +Implement `ParallelGateEvaluator` with execution plan builder. + +### TASK-038-03 - Bulk Digest Resolver +Status: DONE +Dependency: TASK-038-01 +Owners: Developer/Implementer + +Implement `BulkDigestResolver` with registry connection pooling. + +### TASK-038-04 - Task Batcher +Status: DONE +Dependency: TASK-038-01 +Owners: Developer/Implementer + +Implement `TaskBatcher` for agent task optimization. + +### TASK-038-05 - Cache Manager +Status: DONE +Dependency: TASK-038-01 +Owners: Developer/Implementer + +Implement multi-level `CacheManager` with L1 (memory) and L2 (Redis). + +### TASK-038-06 - Query Optimizer +Status: DONE +Dependency: TASK-038-01 +Owners: Developer/Implementer + +Implement `QueryOptimizer` with index management and read replicas. + +### TASK-038-07 - Prefetcher +Status: DONE +Dependency: TASK-038-05 +Owners: Developer/Implementer + +Implement `Prefetcher` for predictive cache warming. + +Completion criteria: +- [x] Data loader registration by pattern +- [x] Access pattern tracking +- [x] Predictive prefetch based on related keys +- [x] Cache warmup for hot keys +- [x] Background prefetch queue processing +- [x] Statistics and monitoring + +### TASK-038-08 - Connection Pool +Status: DONE +Dependency: TASK-038-06 +Owners: Developer/Implementer + +Implement optimized `ConnectionPool` with warmup. + +Completion criteria: +- [x] Generic connection pool with type parameter +- [x] Pool warmup with minimum connections +- [x] Connection acquisition with timeout +- [x] Connection health validation +- [x] Adaptive sizing (min/max) +- [x] Connection age and use count limits +- [x] Background maintenance loop +- [x] Pool statistics + +### TASK-038-09 - Load Tests +Status: DONE +Dependency: TASK-038-08 +Owners: QA/Test Automation + +Create load tests and performance benchmarks. + +Completion criteria: +- [x] Performance baseline high volume tests +- [x] Percentile accuracy tests +- [x] Regression detection tests +- [x] Thread safety tests +- [x] Prefetcher load tests +- [x] Connection pool concurrency tests +- [x] Parallel gate evaluator benchmark +- [x] Bulk digest resolver benchmark + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-038-02 to 038-06 implemented: ParallelGateEvaluator, BulkDigestResolver, TaskBatcher, CacheManager, QueryOptimizer | Developer | +| 2026-01-17 | TASK-038-01 implemented: PerformanceBaseline with metrics | Developer | +| 2026-01-17 | TASK-038-07 implemented: Prefetcher with predictive warming | Developer | +| 2026-01-17 | TASK-038-08 implemented: ConnectionPool with warmup | Developer | +| 2026-01-17 | TASK-038-09 implemented: Load tests and benchmarks | QA | +| 2026-01-17 | Sprint completed and archived | Planning | + +## Decisions & Risks + +- Risk: Cache invalidation bugs cause stale data +- Mitigation: Comprehensive invalidation tags, short TTLs for critical data + +## Next Checkpoints + +- TASK-038-02 complete: Gate evaluation 3x faster +- TASK-038-09 complete: All benchmarks passing diff --git a/docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md b/docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md new file mode 100644 index 000000000..02746a449 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_039_ReleaseOrchestrator_compliance.md @@ -0,0 +1,164 @@ +# Sprint 039 · Compliance & Reporting + +## Topic & Scope + +Implement comprehensive compliance management with pre-built report templates, evidence chain visualization, audit query interface, and automated compliance checking for SOC2, ISO 27001, PCI-DSS, HIPAA, FedRAMP, and GDPR. + +**Key Deliverables:** +- Compliance engine with framework support +- Framework mapper for control alignment +- Report generator with templates +- Evidence chain visualizer +- Audit query engine +- Control validator with automated checks +- Scheduled reporting + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Compliance/` +- Documentation: `docs/modules/release-orchestrator/enhancements/compliance-reporting.md` +- Expected evidence: Unit tests, integration tests, report samples, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 036 (Multi-Region), Sprint 037 (Developer Experience) +- Downstream: Sprint 040 (Multi-Language Scripts) +- Cannot run in parallel with Wave 4 sprints + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/compliance-reporting.md` + +## Delivery Tracker + +### TASK-039-01 - Compliance Engine +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `ComplianceEngine` for framework evaluation. + +### TASK-039-02 - Framework Mapper +Status: DONE +Dependency: TASK-039-01 +Owners: Developer/Implementer + +Implement `FrameworkMapper` with SOC2, ISO 27001, PCI-DSS, HIPAA, FedRAMP, GDPR, NIST CSF frameworks. + +### TASK-039-03 - Report Generator +Status: DONE +Dependency: TASK-039-02 +Owners: Developer/Implementer + +Implement `ReportGenerator` with executive summary, detailed compliance, gap analysis, audit readiness, and evidence package templates. + +### TASK-039-04 - Evidence Chain Visualizer +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `EvidenceChainVisualizer` with chain building, graph representation, and integrity verification. + +Completion criteria: +- [x] Build evidence chains from release evidence items +- [x] Determine causal and temporal relationships (edges) +- [x] Compute and verify chain hash for integrity +- [x] Generate graph representation with layers +- [x] Export to JSON, DOT, Mermaid, CSV formats +- [x] Node and edge styling for visualization + +### TASK-039-05 - Audit Query Engine +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `AuditQueryEngine` with flexible querying and aggregations. + +Completion criteria: +- [x] Flexible query interface with filters +- [x] Sorting and pagination +- [x] Aggregation by action, actor, resource, time intervals +- [x] Activity summary with hourly distribution +- [x] Resource audit trail +- [x] Actor activity reports +- [x] Export to CSV, JSON, Syslog formats + +### TASK-039-06 - Control Validator +Status: DONE +Dependency: TASK-039-02 +Owners: Developer/Implementer + +Implement `ControlValidator` with automated checks for approvals, evidence generation, authentication, etc. + +### TASK-039-07 - REST API +Status: DONE +Dependency: TASK-039-06 +Owners: Developer/Implementer + +Implement API endpoints for compliance status, reports, evidence, and audit queries. + +Completion criteria: +- [x] Compliance status endpoints (overall, per-framework) +- [x] Release compliance evaluation +- [x] Report templates listing and generation +- [x] Report download with format selection +- [x] Scheduled report CRUD operations +- [x] Evidence chain endpoints (build, verify, graph, export) +- [x] Audit query, aggregation, and summary endpoints +- [x] Resource and actor audit trail endpoints +- [x] Control status endpoints + +### TASK-039-08 - Scheduled Reports +Status: DONE +Dependency: TASK-039-03 +Owners: Developer/Implementer + +Implement scheduled report generation and delivery. + +Completion criteria: +- [x] Cron expression parsing and validation +- [x] Schedule CRUD operations +- [x] Background scheduler loop +- [x] Report generation on schedule +- [x] Multi-recipient delivery +- [x] Execution history tracking +- [x] Manual trigger capability + +### TASK-039-09 - Integration Tests +Status: DONE +Dependency: TASK-039-08 +Owners: QA/Test Automation + +Create integration tests for compliance evaluation and reporting. + +Completion criteria: +- [x] Evidence chain builder tests +- [x] Chain verification tests +- [x] Multi-format export tests +- [x] Graph generation tests +- [x] Audit query with filters tests +- [x] Aggregation tests +- [x] Activity summary tests +- [x] Scheduled report CRUD tests +- [x] End-to-end workflow tests + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-039-01, 039-02, 039-03, 039-06 implemented: ComplianceEngine, FrameworkMapper, ReportGenerator, ControlValidator | Developer | +| 2026-01-17 | TASK-039-04 implemented: EvidenceChainVisualizer with graph and exports | Developer | +| 2026-01-17 | TASK-039-05 implemented: AuditQueryEngine with aggregations | Developer | +| 2026-01-17 | TASK-039-07 implemented: ComplianceController REST API | Developer | +| 2026-01-17 | TASK-039-08 implemented: ScheduledReportService | Developer | +| 2026-01-17 | TASK-039-09 implemented: Integration tests | QA | +| 2026-01-17 | Sprint completed and archived | Planning | + +## Decisions & Risks + +- Risk: Framework mapping accuracy +- Mitigation: Manual review capability, mapping override support + +## Next Checkpoints + +- TASK-039-03 complete: Reports generating +- TASK-039-09 complete: Ready for audits diff --git a/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md new file mode 100644 index 000000000..c1084555d --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_multi_language_scripts.md @@ -0,0 +1,561 @@ +# Sprint 040 · Multi-Language Script Engine + +## Topic & Scope + +Implement a polyglot scripting platform with Monaco-based editing, library management, and containerized execution for C# (.NET 10), Python, Java, Go, Bash, and TypeScript scripts. + +**Key Deliverables:** +- Script registry with versioning +- Monaco editor service with language server integration +- Library manager for dependencies (NuGet, pip, Maven, Go modules, npm) +- Runtime image manager for containerized execution +- Script executor with mount-based injection +- Sample library with per-language examples +- Smart container pool with IHostedService lifecycle and auto-scaling +- Multi-level compilation cache (C#/Java/Go/TypeScript) + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Scripts/` +- Also touches: `src/Web/` (Monaco editor integration) +- Documentation: `docs/modules/release-orchestrator/enhancements/multi-language-scripts.md` +- Expected evidence: Unit tests, integration tests, sample scripts, API documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 039 (Compliance & Reporting) +- Downstream: None (final sprint) +- Cannot run in parallel with other sprints + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/multi-language-scripts.md` +- Read: `docs/modules/release-orchestrator/modules/workflow-engine.md` (step integration) +- Read existing workflow step patterns + +## Delivery Tracker + +### TASK-040-01 - Script Data Model +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the script data model and registry for storing versioned scripts. + +Implementation details: +- Create `Script` record with all metadata +- Create `ScriptLanguage` enum (CSharp, Python, Java, Go, Bash, TypeScript) +- Create `ScriptVisibility` enum (Private, Team, Organization, Public) +- Create `ScriptDependency` record +- Implement `IScriptStore` with PostgreSQL backend + +Completion criteria: +- [x] `Script` record with Id, Name, Description, Language, Content, EntryPoint, Version, Dependencies +- [x] `ScriptLanguage` enum with all 6 languages (including TypeScript) +- [x] `ScriptVisibility` for access control +- [x] Database migration for script storage +- [x] Version history tracking + +### TASK-040-02 - Script Registry +Status: DONE +Dependency: TASK-040-01 +Owners: Developer/Implementer + +Task description: +Implement the `ScriptRegistry` for managing scripts with validation and search. + +Implementation details: +- Create `ScriptRegistry` with CRUD operations +- Implement script validation per language +- Add version incrementing logic +- Integrate search indexing + +Completion criteria: +- [x] `CreateScriptAsync()` with validation +- [x] `UpdateScriptAsync()` with version management +- [x] `SearchAsync()` with filters (language, tags, visibility) +- [x] Syntax validation per language +- [x] Search indexing for fast queries + +### TASK-040-03 - Language Server Pool +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement language server integration for Monaco editor features. + +Implementation details: +- Create `ILanguageServer` interface +- Implement `CSharpLanguageServer` (OmniSharp/Roslyn) +- Implement `PythonLanguageServer` (Pyright) +- Implement `JavaLanguageServer` (JDT LS) +- Implement `GoLanguageServer` (gopls) +- Implement `BashLanguageServer` (bash-language-server) +- Implement `TypeScriptLanguageServer` (typescript-language-server) + +Completion criteria: +- [x] `ILanguageServer` with GetCompletions, GetDiagnostics, Format, GetHover, GetSignatureHelp +- [x] C# server with .NET 10 script support +- [x] Python server with type checking +- [x] Java server with JDK 21 support +- [x] Go server with module support +- [x] Bash server with ShellCheck integration +- [x] TypeScript server with npm package resolution + +### TASK-040-04 - Monaco Editor Service +Status: DONE +Dependency: TASK-040-03 +Owners: Developer/Implementer + +Task description: +Implement the `MonacoEditorService` for IDE-quality editing. + +Implementation details: +- Create `MonacoEditorService` with configuration management +- Implement completion provider wrapper +- Implement diagnostic provider wrapper +- Add formatting support +- Add hover and signature help + +Completion criteria: +- [x] `GetConfigurationAsync()` with language-specific options +- [x] `GetCompletionsAsync()` delegating to language servers +- [x] `GetDiagnosticsAsync()` for real-time error checking +- [x] `FormatDocumentAsync()` for code formatting +- [x] `GetHoverInfoAsync()` for hover documentation +- [x] `GetSignatureHelpAsync()` for parameter hints + +### TASK-040-05 - Library Manager +Status: DONE +Dependency: TASK-040-01 +Owners: Developer/Implementer + +Task description: +Implement the `LibraryManager` for resolving script dependencies. + +Implementation details: +- Create `LibraryManager` with resolver registry +- Implement `NuGetDependencyResolver` for C# +- Implement `PipDependencyResolver` for Python +- Implement `MavenDependencyResolver` for Java +- Implement `GoModDependencyResolver` for Go +- Implement `AptDependencyResolver` for Bash +- Implement `NpmDependencyResolver` for TypeScript + +Completion criteria: +- [x] `ResolveDependenciesAsync()` for all 6 languages +- [x] NuGet resolution with transitive dependencies +- [x] pip resolution with requirements.txt generation +- [x] Maven resolution with pom.xml generation +- [x] Go module resolution +- [x] apt package resolution for Bash scripts +- [x] npm resolution with package.json generation for TypeScript +- [x] Dependency caching + +### TASK-040-06 - Runtime Image Manager +Status: DONE +Dependency: TASK-040-05 +Owners: Developer/Implementer + +Task description: +Implement the `RuntimeImageManager` for building and caching Docker runtime images. + +Implementation details: +- Create `RuntimeImageManager` with image configuration +- Define base images for each language +- Implement Dockerfile generation +- Add image caching and versioning + +Completion criteria: +- [x] Base images defined: .NET 10, Python 3.12, Java 21, Go 1.22, Alpine 3.19, Node.js 22 (TypeScript) +- [x] `BuildRuntimeImageAsync()` with dependency installation +- [x] Dockerfile generation per language (6 languages) +- [x] Image tagging with script ID and version +- [x] Image cache management +- [x] Resource limits configuration + +### TASK-040-07 - Script Executor +Status: DONE +Dependency: TASK-040-06 +Owners: Developer/Implementer + +Task description: +Implement the `ScriptExecutor` for running scripts in isolated containers. + +Implementation details: +- Create `ScriptExecutor` with container management +- Implement mount-based script injection +- Add environment variable passing +- Implement timeout handling +- Collect stdout/stderr output + +Completion criteria: +- [x] `ExecuteAsync()` with full lifecycle +- [x] Script mount creation (bind mount to /scripts) +- [x] Arguments passed via args.json +- [x] Environment variable injection +- [x] Network isolation (default: none) +- [x] Resource limits enforcement +- [x] Timeout handling with cancellation +- [x] Output collection (stdout, stderr, exit code) + +### TASK-040-08 - Sample Library +Status: DONE +Dependency: TASK-040-07 +Owners: Developer/Implementer + +Task description: +Create the sample script library with examples for each language. + +Implementation details: +- Create `SampleLibrary` with pre-built scripts +- Implement C# samples: health-check, smoke-test, db-migration-check +- Implement Python samples: log-analyzer, prometheus-query, slack-notification +- Implement Java samples: jdbc-health-check, kafka-consumer-check +- Implement Go samples: tcp-port-check, container-inspect +- Implement Bash samples: disk-space-check, service-restart, backup-verify +- Implement TypeScript samples: api-integration-test, json-schema-validator, webhook-sender + +Completion criteria: +- [x] `GetSamplesAsync()` with filtering +- [x] C# HTTP health check script (.csx) +- [x] C# API smoke test script +- [x] C# database migration validator +- [x] Python log analyzer script +- [x] Python Prometheus query script +- [x] Python Slack notification script +- [x] Java JDBC health check +- [x] Java Kafka consumer lag check +- [x] Go TCP port checker +- [x] Go container inspector +- [x] Bash disk space check +- [x] Bash service restart +- [x] Bash backup verification +- [x] TypeScript API integration test script (.ts) +- [x] TypeScript JSON schema validator script +- [x] TypeScript webhook sender script +- [x] Clone functionality for samples + +### TASK-040-09 - REST API +Status: DONE +Dependency: TASK-040-08 +Owners: Developer/Implementer + +Task description: +Implement REST API endpoints for script management and execution. + +Implementation details: +- Create `ScriptController` with CRUD operations +- Create `ScriptExecutionController` for running scripts +- Create `EditorController` for Monaco integration +- Create `SampleController` for sample library + +Completion criteria: +- [x] Script CRUD endpoints +- [x] Script version endpoints +- [x] Execution endpoints (execute, list, get, logs) +- [x] Editor endpoints (config, completions, diagnostics, format, hover) +- [x] Sample endpoints (list, get, clone) +- [x] Dependency resolution endpoint +- [x] OpenAPI documentation + +### TASK-040-10 - Monaco Editor UI +Status: DONE +Dependency: TASK-040-09 +Owners: Developer/Implementer (Frontend) + +Task description: +Implement the Monaco editor component in the web UI. + +Implementation details: +- Create `ScriptEditor` component with Monaco +- Configure language-specific features +- Implement server-backed completion provider +- Add diagnostic display +- Implement save with Ctrl+S + +Completion criteria: +- [x] `ScriptEditor` component with all languages +- [x] Language-specific syntax highlighting +- [x] Completion provider with server integration +- [x] Diagnostic provider with real-time errors +- [x] Hover provider for documentation +- [x] Format on save option +- [x] Ctrl+S save handler +- [x] Dark theme (stella-dark) + +### TASK-040-11 - Script Library UI +Status: DONE +Dependency: TASK-040-10 +Owners: Developer/Implementer (Frontend) + +Task description: +Implement the script library browser UI. + +Implementation details: +- Create `ScriptLibrary` component with browsing +- Implement search and filtering +- Add sample preview +- Implement clone workflow + +Completion criteria: +- [x] `ScriptLibrary` with grid/list view +- [x] Search by name, description, tags +- [x] Filter by language, visibility +- [x] Sample preview with syntax highlighting +- [x] Clone to create new script +- [x] Dependency display + +### TASK-040-12 - Workflow Step Integration +Status: DONE +Dependency: TASK-040-07 +Owners: Developer/Implementer + +Task description: +Integrate scripts as workflow step type. + +Implementation details: +- Create `ScriptStepExecutor` implementing `IStepExecutor` +- Add script step to step registry +- Implement argument mapping from workflow variables +- Add output propagation to workflow + +Completion criteria: +- [x] `ScriptStepExecutor` with full lifecycle +- [x] Script step type in registry +- [x] Input mapping from workflow variables +- [x] Output parsing and propagation +- [x] Timeout and retry support +- [x] Evidence generation + +### TASK-040-13 - Script Compilation Cache +Status: DONE +Dependency: TASK-040-07 +Owners: Developer/Implementer + +Task description: +Implement multi-level compilation cache for pre-compiled scripts across all compiled/transpiled languages. + +Implementation details: +- Create `ScriptCompilationCache` with L1 (memory) and L2 (distributed/Redis) cache +- Implement `DotNetScriptCompiler` using Roslyn for C# AOT compilation +- Implement `JavaScriptCompiler` using javac for Java bytecode caching +- Implement `GoScriptCompiler` using go build for Go binary caching +- Implement `TypeScriptCompiler` using tsc for TypeScript transpilation to JavaScript +- Cache key based on script content + dependencies + runtime version hash + +Completion criteria: +- [x] `ScriptCompilationCache` with GetOrCompileAsync() +- [x] L1 memory cache with configurable size (default 256MB) +- [x] L2 distributed cache with Redis backend +- [x] Roslyn-based C# script compilation to assembly bytes +- [x] javac-based Java compilation to bytecode +- [x] go build-based Go compilation to binary +- [x] tsc-based TypeScript transpilation to JavaScript +- [x] Cache key computation with SHA256 hash +- [x] TTL configuration (default 7 days) +- [x] Cache hit/miss metrics + +### TASK-040-14 - Smart Container Pool Manager +Status: DONE +Dependency: TASK-040-06 +Owners: Developer/Implementer + +Task description: +Implement smart container pool manager with IHostedService lifecycle and auto-scaling. + +Implementation details: +- Create `SmartContainerPoolManager` implementing `IHostedService` for graceful startup/shutdown +- Implement `ManagedContainerPool` per language with acquire/release lifecycle +- Add `UsageTracker` for monitoring hit rates and request rates +- Implement auto-scaling based on usage patterns +- Graceful shutdown: dispose all containers when agent stops + +Completion criteria: +- [x] `SmartContainerPoolManager` implementing `IHostedService` +- [x] `StartAsync()` warms up all pools to minimum containers +- [x] `StopAsync()` gracefully shuts down all pools and disposes containers +- [x] Configurable min/max containers per language (6 languages including TypeScript) +- [x] `AcquireAsync()` with exact dependency match priority +- [x] `ReleaseAsync()` with container reset and health check +- [x] `UsageTracker` with hit rate and request rate monitoring +- [x] Auto-scaling: scale up when hit rate < 50%, scale down when utilization < 30% +- [x] Background `PerformMaintenanceAsync()` for health checks and eviction +- [x] Idle container eviction after configurable timeout +- [x] Pool size and utilization metrics + +### TASK-040-15 - Runtime Image Cache +Status: DONE +Dependency: TASK-040-06 +Owners: Developer/Implementer + +Task description: +Implement Docker image caching for pre-built dependency images. + +Implementation details: +- Create `RuntimeImageCache` with local and registry caching +- Generate optimized Dockerfiles per language with dependency pre-installation +- Push built images to registry for cross-agent sharing +- Image tag based on language + dependency hash + +Completion criteria: +- [x] `RuntimeImageCache` with GetOrBuildImageAsync() +- [x] Local Docker image existence check +- [x] Registry image existence check and pull +- [x] Dockerfile generation with dependency pre-installation +- [x] NuGet restore baked into C# images +- [x] pip install baked into Python images +- [x] Maven dependency:go-offline for Java images +- [x] go mod download for Go images +- [x] npm install baked into TypeScript images +- [x] Registry push for cross-agent sharing +- [x] Image cache metrics + +### TASK-040-16 - Workflow Script Preloader +Status: DONE +Dependency: TASK-040-13, TASK-040-14, TASK-040-15 +Owners: Developer/Implementer + +Task description: +Implement workflow-level script preloading for parallel warm-up. + +Implementation details: +- Create `WorkflowScriptPreloader` triggered on workflow start +- Identify all script steps in workflow DAG +- Parallel precompilation, container warming, and image building +- Integration with workflow engine lifecycle + +Completion criteria: +- [x] `PreloadWorkflowScriptsAsync()` extracts all script IDs +- [x] Parallel compilation of all scripts +- [x] Parallel container pool warming per language +- [x] Parallel image building for unique dependency sets +- [x] Integration with workflow start event +- [x] Preload duration metrics + +### TASK-040-17 - Agent Script Cache +Status: DONE +Dependency: TASK-040-14, TASK-040-15 +Owners: Developer/Implementer + +Task description: +Implement agent-side caching with warmup on startup. + +Implementation details: +- Create `AgentScriptCache` with LRU eviction +- Persist cache across agent restarts +- Warmup task on agent start (pull base images, start pool) + +Completion criteria: +- [x] `AgentScriptCache` with configurable cache path +- [x] LRU eviction for compiled scripts (default 100) +- [x] LRU eviction for runtime images (default 20) +- [x] Cache persistence to disk +- [x] `WarmupAsync()` pulls all base images +- [x] Warm container pool initialization on startup + +### TASK-040-18 - Cache Performance Tests +Status: DONE +Dependency: TASK-040-17 +Owners: QA/Test Automation + +Task description: +Create performance tests validating cache effectiveness. + +Completion criteria: +- [x] Cold start benchmark (< 30s for first execution) +- [x] Warm start benchmark (< 500ms for cached script) +- [x] Same language different script (< 5s) +- [x] Workflow with 10 scripts benchmark (< 60s cold, < 15s warm) +- [x] Cache hit rate validation (> 90% in steady state) +- [x] Container pool utilization tests + +### TASK-040-19 - Integration Tests +Status: DONE +Dependency: TASK-040-18 +Owners: QA/Test Automation + +Task description: +Create comprehensive integration tests for the script engine. + +Completion criteria: +- [x] Full execution flow tests per language +- [x] Monaco integration tests +- [x] Language server communication tests +- [x] Sample script execution tests +- [x] Workflow step integration tests +- [x] Cache integration tests + +### TASK-040-20 - Security Tests +Status: DONE +Dependency: TASK-040-19 +Owners: QA/Test Automation + +Task description: +Create security tests for script execution isolation. + +Completion criteria: +- [x] Container isolation verification +- [x] Resource limit enforcement tests +- [x] Network isolation tests +- [x] Path traversal prevention tests +- [x] Sensitive data handling tests + +### TASK-040-21 - Documentation +Status: DONE +Dependency: TASK-040-20 +Owners: Documentation Author + +Task description: +Create comprehensive documentation for the script engine. + +Completion criteria: +- [x] API documentation +- [x] User guide for creating scripts +- [x] Sample script documentation +- [x] Language-specific guides +- [x] Security considerations documentation +- [x] Performance tuning guide (caching configuration) + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | Added TypeScript as 6th supported language | Planning | +| 2026-01-17 | Enhanced pool management with SmartContainerPoolManager (IHostedService, auto-scaling) | Planning | +| 2026-01-17 | Added Java/TypeScript compilation caching to TASK-040-13 | Planning | + +## Decisions & Risks + +### Decisions +1. Scripts are files mounted into containers, not embedded +2. Each language uses its official Docker base image +3. Language servers run as separate services for performance +4. Default network mode is "none" for security +5. **Multi-layer caching**: 5-layer cache (compiled scripts → warm containers → pre-built images → dependency cache → cold build) +6. **Pre-compilation**: C#/Java/Go/TypeScript scripts compiled/transpiled ahead of time using Roslyn/javac/go build/tsc +7. **Warm container pools**: SmartContainerPoolManager with IHostedService for graceful startup/shutdown +8. **Workflow preloading**: Trigger parallel warm-up when workflow starts +9. **Auto-scaling**: Usage-based scaling (scale up when hit rate < 50%, scale down when utilization < 30%) +10. **6 supported languages**: C#, Python, Java, Go, Bash, TypeScript + +### Risks +1. **Language server resource usage**: Multiple servers may consume significant memory + - Mitigation: On-demand server startup, connection pooling +2. **Container startup latency**: Cold starts may be slow + - Mitigation: Pre-warmed containers, image caching, workflow preloading +3. **Dependency resolution failures**: External package registries may be unavailable + - Mitigation: Dependency caching, offline mode support +4. **Cache invalidation**: Stale compiled scripts may cause issues + - Mitigation: Content-based cache keys (SHA256), TTL expiration, version in cache key +5. **Warm pool resource usage**: Idle containers consume memory + - Mitigation: Configurable pool sizes, idle timeout eviction, health-based eviction + +## Next Checkpoints + +- TASK-040-07 complete: Execution working +- TASK-040-10 complete: Editor functional +- TASK-040-16 complete: Caching infrastructure ready +- TASK-040-18 complete: Performance targets met +- TASK-040-20 complete: Security verified diff --git a/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md new file mode 100644 index 000000000..0c7b31b9c --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_040_ReleaseOrchestrator_self_healing.md @@ -0,0 +1,112 @@ +# Sprint 040 · Self-Healing Infrastructure + +## Topic & Scope + +Implement self-healing capabilities for the release orchestration platform including automated health monitoring, failure detection, and recovery orchestration. + +**Key Deliverables:** +- Self-healing engine with recovery strategies +- Health monitoring with degradation detection +- Recovery orchestrator with dependency-aware healing +- Automatic scaling and resource management +- Circuit breaker integration for cascading failure prevention + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.SelfHealing/` +- Documentation: `docs/modules/release-orchestrator/enhancements/self-healing.md` +- Expected evidence: Unit tests, integration tests, recovery scenario tests + +## Dependencies & Concurrency + +- Upstream: Sprint 034 (Agent Resilience), Sprint 041 (Observability) +- Downstream: None +- Can run in parallel with: Sprint 041 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/self-healing.md` (if exists) +- Read: Agent resilience patterns in Sprint 034 + +## Delivery Tracker + +### TASK-040-01 - Self-Healing Engine +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `SelfHealingEngine` with recovery strategies and automated remediation. + +Completion criteria: +- [x] Engine detects failures via health checks +- [x] Multiple recovery strategies (restart, failover, scale) +- [x] Recovery history tracking +- [x] Cooldown periods to prevent thrashing + +### TASK-040-02 - Health Monitor +Status: DONE +Dependency: TASK-040-01 +Owners: Developer/Implementer + +Implement `HealthMonitor` for continuous health assessment. + +Completion criteria: +- [x] Multi-probe health checks (HTTP, TCP, process) +- [x] Degradation detection with thresholds +- [x] Health aggregation across components +- [x] Alert integration + +### TASK-040-03 - Recovery Orchestrator +Status: DONE +Dependency: TASK-040-01 +Owners: Developer/Implementer + +Implement `RecoveryOrchestrator` for dependency-aware healing. + +Completion criteria: +- [x] Dependency graph-based recovery ordering +- [x] Partial recovery support +- [x] Rollback on failed recovery +- [x] Evidence generation for recovery actions + +### TASK-040-04 - Auto-Scaler +Status: DONE +Dependency: TASK-040-02 +Owners: Developer/Implementer + +Implement `AutoScaler` for automatic resource management. + +Completion criteria: +- [x] Load-based scaling triggers +- [x] Scale-up and scale-down policies +- [x] Resource limits enforcement +- [x] Scaling event audit trail + +### TASK-040-05 - Integration Tests +Status: DONE +Dependency: TASK-040-04 +Owners: QA/Test Automation + +Create integration tests for self-healing scenarios. + +Completion criteria: +- [x] Failure injection tests +- [x] Recovery verification tests +- [x] Scaling behavior tests + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-040-01, 040-02, 040-03 implemented: SelfHealingEngine, HealthMonitor, RecoveryOrchestrator | Developer | +| 2026-01-17 | TASK-040-04 implemented: AutoScaler | Developer | +| 2026-01-17 | TASK-040-05 completed: SelfHealingEngineTests, HealthMonitorTests, AutoScalerTests | QA | + +## Decisions & Risks + +- Risk: Over-aggressive healing causing instability +- Mitigation: Cooldown periods, rate limiting, manual override capability + +## Next Checkpoints + +- TASK-040-03 complete: Core self-healing functional +- TASK-040-05 complete: Ready for production diff --git a/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md new file mode 100644 index 000000000..91a8763c8 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_agent_operations.md @@ -0,0 +1,452 @@ +# Sprint 041 · Agent Operations & Easy Setup + +## Topic & Scope + +Implement streamlined agent deployment, configuration management, health diagnostics (Doctor plugin), and operational tooling that makes agents easy to deploy, monitor, and maintain at scale. + +**Key Deliverables:** +- Zero-touch bootstrap service with one-line installers +- Declarative configuration manager with drift detection +- Automatic certificate provisioning and renewal +- Agent Doctor with comprehensive health checks +- Server-side Doctor plugin for fleet health +- Remediation engine with guided problem resolution +- Auto-update manager with safe rollbacks +- Enhanced CLI commands for agent operations + +- Working directory: `src/ReleaseOrchestrator/__Agents/StellaOps.Agent.Core/` +- Also touches: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Agent/`, `src/Doctor/__Plugins/`, `src/Cli/` +- Documentation: `docs/modules/release-orchestrator/enhancements/agent-operations.md` +- Expected evidence: Unit tests, integration tests, E2E tests, CLI documentation + +## Dependencies & Concurrency + +- Upstream: Sprint 034 (Agent Resilience) - provides clustering foundation +- Downstream: None +- Can run in parallel with: Sprint 040 (Multi-Language Scripts) + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/agent-operations.md` +- Read: `docs/modules/release-orchestrator/enhancements/agent-resilience.md` +- Read: `docs/modules/release-orchestrator/modules/agents.md` +- Read: `docs/modules/release-orchestrator/security/agent-security.md` + +## Delivery Tracker + +### TASK-041-01 - Bootstrap Token Service +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the bootstrap token service for secure agent provisioning. + +Implementation details: +- Create `BootstrapTokenService` with token generation +- One-time use tokens with 15-minute expiry +- Token validation and consumption +- Token metadata (agent name, environment, capabilities) + +Completion criteria: +- [x] `GenerateBootstrapTokenAsync()` creates secure one-time tokens +- [x] Token includes agent metadata +- [x] Token expires after 15 minutes or first use +- [x] Token validation rejects expired/used tokens +- [x] REST API endpoint for token generation + +### TASK-041-02 - Bootstrap Service +Status: DONE +Dependency: TASK-041-01 +Owners: Developer/Implementer + +Task description: +Implement the bootstrap service for zero-touch agent deployment. + +Implementation details: +- Create `BootstrapService` with platform detection +- Generate one-line installers for Linux, Windows, Docker +- Generate install scripts with embedded configuration +- Support cluster join via bootstrap + +Completion criteria: +- [x] `BootstrapAgentAsync()` generates complete bootstrap package +- [x] Linux one-liner: `curl | bash` with token +- [x] Windows one-liner: PowerShell with token +- [x] Docker one-liner: `docker run` with token +- [x] Install scripts handle dependencies +- [x] Cluster join support + +### TASK-041-03 - Agent Certificate Manager +Status: DONE +Dependency: TASK-041-02 +Owners: Developer/Implementer + +Task description: +Implement automatic certificate provisioning and renewal. + +Implementation details: +- Create `AgentCertificateManager` with lifecycle management +- Auto-provision via bootstrap (CSR submission) +- Auto-renewal before expiry threshold (default: 7 days) +- Support multiple certificate sources (auto, file, Vault, ACME) + +Completion criteria: +- [x] `EnsureCertificateAsync()` provisions or renews as needed +- [x] CSR generation with local private key +- [x] Auto-renewal monitoring background service +- [x] Certificate source abstraction +- [x] Vault integration for certificate storage +- [x] ACME/Let's Encrypt support (optional) + +### TASK-041-04 - Configuration Model +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement the declarative agent configuration model. + +Implementation details: +- Create `AgentConfiguration` record with all settings +- Support minimal (bootstrap) and full configuration modes +- YAML/JSON serialization +- Configuration validation + +Completion criteria: +- [x] `AgentConfiguration` with identity, connection, capabilities, resources, security, observability sections +- [x] `CertificateConfig` with source enum (AutoProvision, File, Vault, ACME) +- [x] `ClusterConfig` for optional clustering +- [x] `AutoUpdateConfig` for optional auto-updates +- [x] Configuration validation with clear error messages +- [x] YAML and JSON support + +### TASK-041-05 - Configuration Manager +Status: DONE +Dependency: TASK-041-04 +Owners: Developer/Implementer + +Task description: +Implement the configuration manager with drift detection. + +Implementation details: +- Create `AgentConfigManager` with apply/diff operations +- Configuration drift detection +- Apply with rollback capability +- Configuration persistence + +Completion criteria: +- [x] `ApplyConfigurationAsync()` with validation and rollback +- [x] `DetectDriftAsync()` compares desired vs actual +- [x] Configuration diff computation +- [x] Automatic rollback on apply failure +- [x] Configuration versioning + +### TASK-041-06 - Agent Health Checks +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Task description: +Implement comprehensive health checks for the agent Doctor. + +Implementation details: +- Create `IAgentHealthCheck` interface +- Implement core checks: certificate, connectivity, heartbeat +- Implement resource checks: disk, memory, CPU +- Implement runtime checks: Docker, task queue + +Completion criteria: +- [x] `IAgentHealthCheck` with category, name, execute +- [x] `CertificateExpiryCheck` - certificate validity +- [x] `CertificateValidityCheck` - certificate chain validation +- [x] `OrchestratorConnectivityCheck` - DNS, TCP, mTLS, gRPC +- [x] `HeartbeatCheck` - heartbeat freshness +- [x] `DiskSpaceCheck` - available disk space +- [x] `MemoryUsageCheck` - memory utilization +- [x] `CpuUsageCheck` - CPU utilization +- [x] `DockerConnectivityCheck` - Docker daemon access +- [x] `DockerVersionCheck` - Docker version compatibility +- [x] `TaskQueueDepthCheck` - pending task count +- [x] `ConfigurationDriftCheck` - config consistency + +### TASK-041-07 - Agent Doctor +Status: DONE +Dependency: TASK-041-06 +Owners: Developer/Implementer + +Task description: +Implement the Agent Doctor for running diagnostics. + +Implementation details: +- Create `AgentDoctor` with check orchestration +- Generate diagnostic reports +- Support category filtering +- Integration with remediation engine + +Completion criteria: +- [x] `RunDiagnosticsAsync()` executes all applicable checks +- [x] Category filtering (security, network, runtime, etc.) +- [x] `AgentDiagnosticReport` with overall status and results +- [x] Parallel check execution with timeout +- [x] Stop-on-critical option + +### TASK-041-08 - Remediation Engine +Status: DONE +Dependency: TASK-041-07 +Owners: Developer/Implementer + +Task description: +Implement the remediation engine for guided problem resolution. + +Implementation details: +- Create `RemediationEngine` with pattern matching +- Define remediation patterns for common issues +- Support automated vs manual remediations +- Link to runbooks + +Completion criteria: +- [x] `GetRemediationSteps()` returns prioritized remediation steps +- [x] Pattern matching for known issues +- [x] `RemediationStep` with command, runbook URL, automated flag +- [x] Remediation patterns for certificate issues +- [x] Remediation patterns for connectivity issues +- [x] Remediation patterns for Docker issues +- [x] Remediation patterns for resource issues + +### TASK-041-09 - Server-Side Doctor Plugin +Status: DONE +Dependency: TASK-041-07 +Owners: Developer/Implementer + +Task description: +Implement the Doctor plugin for server-side agent fleet health monitoring. + +Implementation details: +- Create `AgentHealthPlugin` in Doctor plugins +- Implement fleet-wide health checks +- Aggregate agent health status +- Alert on critical issues + +Completion criteria: +- [x] `AgentHealthPlugin` implementing `IDoctorPlugin` +- [x] `AgentHeartbeatFreshnessCheck` - fleet heartbeat monitoring +- [x] `AgentCertificateExpiryCheck` - fleet certificate monitoring +- [x] `AgentVersionConsistencyCheck` - version skew detection +- [x] `AgentCapacityCheck` - task capacity monitoring +- [x] `StaleAgentCheck` - detect stale/disconnected agents +- [x] `TaskQueueBacklogCheck` - pending task monitoring +- [x] `FailedTaskRateCheck` - failure rate monitoring + +### TASK-041-10 - Auto-Update Manager +Status: DONE +Dependency: TASK-041-05 +Owners: Developer/Implementer + +Task description: +Implement safe agent binary auto-updates. + +Implementation details: +- Create `AgentUpdateManager` with update lifecycle +- Signature verification for packages +- Safe rollback capability +- Maintenance window support + +Completion criteria: +- [x] `CheckAndApplyUpdateAsync()` with full lifecycle +- [x] Update channel support (stable, beta, canary) +- [x] Package signature verification +- [x] Task draining before update +- [x] Rollback point creation +- [x] Health verification after update +- [x] Automatic rollback on failure +- [x] Maintenance window scheduling + +### TASK-041-11 - CLI Bootstrap Commands +Status: DONE +Dependency: TASK-041-02 +Owners: Developer/Implementer + +Task description: +Implement CLI commands for agent bootstrapping. + +Implementation details: +- Add `stella agent bootstrap` command +- Add `stella agent install-script` command +- Platform-specific output + +Completion criteria: +- [x] `stella agent bootstrap --name --env --platform` generates token and installer +- [x] `stella agent install-script --token --output` generates script file +- [x] Clear output with copy-paste commands +- [x] Platform detection and suggestions + +### TASK-041-12 - CLI Doctor Commands +Status: DONE +Dependency: TASK-041-08 +Owners: Developer/Implementer + +Task description: +Implement CLI commands for agent diagnostics. + +Implementation details: +- Add `stella agent doctor` command +- Support local and remote diagnostics +- Add `--fix` for automated remediation +- Multiple output formats + +Completion criteria: +- [x] `stella agent doctor` runs local diagnostics +- [x] `stella agent doctor --agent-id` runs remote diagnostics +- [x] `stella agent doctor --category` filters by category +- [x] `stella agent doctor --fix` applies automated fixes +- [x] `stella agent doctor --format json|table|yaml` output formats +- [x] Clear remediation instructions in output + +### TASK-041-13 - CLI Config Commands +Status: DONE +Dependency: TASK-041-05 +Owners: Developer/Implementer + +Task description: +Implement CLI commands for configuration management. + +Implementation details: +- Add `stella agent config` command +- Add `stella agent apply` command +- Add drift detection support + +Completion criteria: +- [x] `stella agent config` shows current configuration +- [x] `stella agent config --diff` shows drift +- [x] `stella agent apply -f config.yaml` applies configuration +- [x] Validation feedback on apply +- [x] Multiple output formats + +### TASK-041-14 - CLI Certificate Commands +Status: DONE +Dependency: TASK-041-03 +Owners: Developer/Implementer + +Task description: +Implement CLI commands for certificate management. + +Implementation details: +- Add `stella agent renew-cert` command +- Add certificate status in `stella agent status` +- Certificate expiry warnings + +Completion criteria: +- [x] `stella agent renew-cert` triggers renewal +- [x] `stella agent renew-cert --force` forces renewal +- [x] Certificate info in `stella agent status` +- [x] Expiry warnings in CLI output + +### TASK-041-15 - CLI Update Commands +Status: DONE +Dependency: TASK-041-10 +Owners: Developer/Implementer + +Task description: +Implement CLI commands for agent updates. + +Implementation details: +- Add `stella agent update` command +- Add version checking +- Add rollback command + +Completion criteria: +- [x] `stella agent update` checks and applies updates +- [x] `stella agent update --version x.y.z` updates to specific version +- [x] `stella agent update --check` checks without applying +- [x] `stella agent rollback` reverts to previous version + +### TASK-041-16 - Integration Tests +Status: DONE +Dependency: TASK-041-15 +Owners: QA/Test Automation + +Task description: +Create comprehensive integration tests for agent operations. + +Completion criteria: +- [x] Bootstrap flow end-to-end test +- [x] Configuration apply and rollback tests +- [x] Certificate provisioning tests +- [x] Certificate renewal tests +- [x] Doctor diagnostics tests +- [x] Remediation execution tests +- [x] Update and rollback tests + +### TASK-041-17 - E2E Tests +Status: DONE +Dependency: TASK-041-16 +Owners: QA/Test Automation + +Task description: +Create E2E tests for agent operations. + +Completion criteria: +- [x] Bootstrap to running agent test +- [x] Multi-agent deployment test +- [x] Configuration drift and remediation test +- [x] Certificate lifecycle test +- [x] Update with rollback test + +### TASK-041-18 - Documentation +Status: DONE +Dependency: TASK-041-17 +Owners: Documentation Author + +Task description: +Create comprehensive documentation for agent operations. + +Completion criteria: +- [x] Bootstrap quick start guide +- [x] Configuration reference +- [x] Doctor troubleshooting guide +- [x] Runbooks for common issues +- [x] CLI command reference +- [x] Auto-update configuration guide + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | Bootstrap services implemented (BootstrapTokenService, BootstrapService) | Developer | +| 2026-01-17 | Certificate manager implemented (AgentCertificateManager) | Developer | +| 2026-01-17 | Configuration model and manager implemented | Developer | +| 2026-01-17 | Agent Doctor and health checks implemented | Developer | +| 2026-01-17 | Remediation engine with patterns implemented | Developer | +| 2026-01-17 | Server-side Doctor plugin created | Developer | +| 2026-01-17 | Auto-update manager implemented | Developer | +| 2026-01-17 | CLI commands implemented (bootstrap, doctor, config, cert, update) | Developer | +| 2026-01-17 | Integration tests created | QA | +| 2026-01-17 | Documentation created (agent-operations-quickstart.md) | Documentation | +| 2026-01-17 | All tasks completed, sprint ready for archive | Project Manager | + +## Decisions & Risks + +### Decisions +1. Bootstrap tokens are one-time use with 15-minute expiry for security +2. Default certificate source is auto-provision via bootstrap +3. Auto-update is disabled by default, opt-in via configuration +4. Doctor checks run in parallel with per-check timeout + +### Risks +1. **Certificate auto-renewal failure**: Agent becomes unreachable + - Mitigation: Aggressive renewal threshold (7 days), multiple retry attempts, alert on renewal failure +2. **Bootstrap token interception**: Potential agent impersonation + - Mitigation: Short-lived tokens, one-time use, TLS for token transmission +3. **Auto-update breaking changes**: Agent becomes non-functional + - Mitigation: Signature verification, health check after update, automatic rollback +4. **Doctor check timeouts**: Slow checks block diagnostics + - Mitigation: Per-check timeout (10s default), parallel execution + +## Next Checkpoints + +- TASK-041-03 complete: Zero-touch bootstrap working +- TASK-041-09 complete: Doctor plugin integrated +- TASK-041-17 complete: Ready for production + diff --git a/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md new file mode 100644 index 000000000..f8270d822 --- /dev/null +++ b/docs-archived/implplan/SPRINT_20260117_041_ReleaseOrchestrator_observability.md @@ -0,0 +1,126 @@ +# Sprint 041 · Observability & Telemetry + +## Topic & Scope + +Implement comprehensive observability capabilities including metrics collection, distributed tracing, log aggregation, and dashboarding for the release orchestration platform. + +**Key Deliverables:** +- Observability hub for centralized telemetry +- Metric exporters for Prometheus/OpenTelemetry +- Distributed trace correlation +- Log aggregation with structured logging +- Dashboard templates for Grafana + +- Working directory: `src/ReleaseOrchestrator/__Libraries/StellaOps.ReleaseOrchestrator.Observability/` +- Documentation: `docs/modules/release-orchestrator/enhancements/observability.md` +- Expected evidence: Unit tests, integration tests, dashboard templates + +## Dependencies & Concurrency + +- Upstream: Sprint 038 (Performance) +- Downstream: Sprint 040 (Self-Healing) +- Can run in parallel with: Sprint 040 + +## Documentation Prerequisites + +- Read: `docs/modules/release-orchestrator/enhancements/observability.md` (if exists) +- Read: OpenTelemetry SDK documentation + +## Delivery Tracker + +### TASK-041-01 - Observability Hub +Status: DONE +Dependency: none +Owners: Developer/Implementer + +Implement `ObservabilityHub` for centralized telemetry management. + +Completion criteria: +- [x] Metrics, traces, and logs collection +- [x] Configurable export destinations +- [x] Sampling strategies +- [x] Buffer management for offline scenarios + +### TASK-041-02 - Metric Exporter +Status: DONE +Dependency: TASK-041-01 +Owners: Developer/Implementer + +Implement `MetricExporter` for Prometheus and OpenTelemetry. + +Completion criteria: +- [x] Counter, gauge, histogram support +- [x] Prometheus exposition format +- [x] OTLP export support +- [x] Custom metric definitions for releases + +### TASK-041-03 - Trace Correlator +Status: DONE +Dependency: TASK-041-01 +Owners: Developer/Implementer + +Implement `TraceCorrelator` for distributed tracing. + +Completion criteria: +- [x] W3C Trace Context propagation +- [x] Cross-service correlation +- [x] Span enrichment with release context +- [x] Trace sampling strategies + +### TASK-041-04 - Log Aggregator +Status: DONE +Dependency: TASK-041-01 +Owners: Developer/Implementer + +Implement `LogAggregator` for structured logging. + +Completion criteria: +- [x] Structured log format (JSON) +- [x] Log level management +- [x] Correlation ID injection +- [x] Log shipping to external systems + +### TASK-041-05 - Dashboard Templates +Status: DONE +Dependency: TASK-041-02 +Owners: Developer/Implementer + +Create Grafana dashboard templates. + +Completion criteria: +- [x] Release overview dashboard +- [x] Performance metrics dashboard +- [x] Error tracking dashboard +- [x] SLA monitoring dashboard + +### TASK-041-06 - Integration Tests +Status: DONE +Dependency: TASK-041-05 +Owners: QA/Test Automation + +Create integration tests for observability. + +Completion criteria: +- [x] Metric export verification +- [x] Trace propagation tests +- [x] Log format validation + +## Execution Log + +| Date (UTC) | Update | Owner | +| --- | --- | --- | +| 2026-01-17 | Sprint created | Planning | +| 2026-01-17 | TASK-041-01, 041-02, 041-03 implemented: ObservabilityHub, MetricExporter, TraceCorrelator | Developer | +| 2026-01-17 | TASK-041-04 implemented: LogAggregator with JSON/ECS formats, shippers | Developer | +| 2026-01-17 | TASK-041-05 implemented: 4 Grafana dashboards (releases, performance, errors, SLA) | Developer | +| 2026-01-17 | TASK-041-06 completed: MetricExporterTests, TraceCorrelatorTests, LogAggregatorTests | QA | + +## Decisions & Risks + +- Risk: High cardinality metrics causing storage issues +- Mitigation: Cardinality limits, metric aggregation, sampling + +## Next Checkpoints + +- TASK-041-03 complete: Core observability functional +- TASK-041-06 complete: Ready for production diff --git a/docs/FEATURE_GAPS_REPORT.md b/docs/FEATURE_GAPS_REPORT.md deleted file mode 100644 index c64af7be1..000000000 --- a/docs/FEATURE_GAPS_REPORT.md +++ /dev/null @@ -1,744 +0,0 @@ -# Feature Gaps Report - Stella Ops Suite -*(Auto-generated during feature matrix completion)* - -This report documents: -1. Features discovered in code but not listed in FEATURE_MATRIX.md -2. CLI/UI coverage gaps for existing features - ---- - -## Batch 1: SBOM & Ingestion - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| SPDX 3.0 Build Attestation | Attestor | `BuildAttestationMapper.cs`, `DsseSpdx3Signer.cs`, `CombinedDocumentBuilder.cs` | - | - | Attestation & Signing | -| CycloneDX CBOM Support | Scanner | `CycloneDxCbomWriter.cs` | - | - | SBOM & Ingestion | -| Trivy DB Export (Offline) | Concelier | `TrivyDbExporterPlugin.cs`, `TrivyDbOrasPusher.cs`, `TrivyDbExportPlanner.cs` | `stella db export trivy` | - | Offline & Air-Gap | -| Layer SBOM Composition | Scanner | `SpdxLayerWriter.cs`, `CycloneDxLayerWriter.cs`, `LayerSbomService.cs` | `stella sbomer layer`, `stella scan layer-sbom` | - | SBOM & Ingestion | -| SBOM Advisory Matching | Concelier | `SbomAdvisoryMatcher.cs`, `SbomRegistryService.cs`, `ValkeyPurlCanonicalIndex.cs` | - | - | Advisory Sources | -| Graph Lineage Service | Graph | `IGraphLineageService.cs`, `InMemoryGraphLineageService.cs`, `LineageContracts.cs` | - | `/graph` | SBOM & Ingestion | -| Evidence Cards (SBOM excerpts) | Evidence.Pack | `IEvidenceCardService.cs`, `EvidenceCardService.cs`, `EvidenceCard.cs` | - | Evidence drawer | Evidence & Findings | -| AirGap SBOM Parsing | AirGap | `SpdxParser.cs`, `CycloneDxParser.cs` | - | `/ops/offline-kit` | Offline & Air-Gap | -| SPDX License Normalization | Scanner | `SpdxLicenseNormalizer.cs`, `SpdxLicenseExpressions.cs`, `SpdxLicenseList.cs` | - | - | Scanning & Detection | -| SBOM Format Conversion | Scanner | `SpdxCycloneDxConverter.cs` | - | - | SBOM & Ingestion | -| SBOM Validation Pipeline | Scanner | `SbomValidationPipeline.cs`, `SemanticSbomExtensions.cs` | - | - | SBOM & Ingestion | -| CycloneDX Evidence Mapping | Scanner | `CycloneDxEvidenceMapper.cs` | - | - | SBOM & Ingestion | -| CycloneDX Pedigree Mapping | Scanner | `CycloneDxPedigreeMapper.cs` | - | - | SBOM & Ingestion | -| SBOM Snapshot Export | Graph | `SbomSnapshot.cs`, `SbomSnapshotExporter.cs` | - | - | Evidence & Findings | -| Lineage Evidence Packs | ExportCenter | `ILineageEvidencePackService.cs`, `LineageEvidencePack.cs`, `LineageExportEndpoints.cs` | - | `/triage/audit-bundles` | Evidence & Findings | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Delta-SBOM Cache | SbomService | No | No | Internal optimization - no action needed | -| SBOM Lineage Ledger | SbomService | No | Yes | Add `stella sbom lineage list/show` commands | -| SBOM Lineage API | SbomService | No | Yes | Add `stella sbom lineage export` command | -| SPDX 3.0 Build Attestation | Attestor | No | No | Add to Attestation & Signing matrix section | -| Graph Lineage Service | Graph | No | Yes | Consider `stella graph lineage` command | -| Trivy DB Export | Concelier | Partial | No | `stella db export trivy` exists but may need UI | - ---- - -## Batch 2: Scanning & Detection - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| Secrets Detection (Regex+Entropy) | Scanner | `SecretsAnalyzer.cs`, `RegexDetector.cs`, `EntropyDetector.cs`, `CompositeSecretDetector.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| OS Analyzers - Dpkg (Debian/Ubuntu) | Scanner | `DpkgPackageAnalyzer.cs`, `DpkgStatusParser.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| OS Analyzers - Apk (Alpine) | Scanner | `ApkPackageAnalyzer.cs`, `ApkDatabaseParser.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| OS Analyzers - RPM (RHEL/CentOS) | Scanner | `RpmPackageAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| OS Analyzers - Homebrew (macOS) | Scanner | `HomebrewPackageAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| OS Analyzers - macOS Bundles | Scanner | `MacOsBundleAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| OS Analyzers - Windows (Chocolatey/MSI/WinSxS) | Scanner | `ChocolateyAnalyzer.cs`, `MsiAnalyzer.cs`, `WinSxSAnalyzer.cs` | `stella scan run` | `/findings` | Scanning & Detection | -| Symbol-Level Vulnerability Matching | Scanner | `VulnSurfaceService.cs`, `AdvisorySymbolMapping.cs`, `AffectedSymbol.cs` | - | - | Scanning & Detection | -| SARIF 2.1.0 Export | Scanner | SARIF export in CLI | `stella scan sarif` | - | Scanning & Detection | -| Fidelity Upgrade (Quick->Standard->Deep) | Scanner | `FidelityAwareAnalyzer.UpgradeFidelityAsync()` | - | - | Scanning & Detection | -| OCI Multi-Architecture Support | Scanner | `OciImageInspector.cs` (amd64, arm64, etc.) | `stella image inspect` | - | Scanning & Detection | -| Symlink Resolution (32-level depth) | Scanner | `LayeredRootFileSystem.cs` | - | - | Scanning & Detection | -| Whiteout File Support | Scanner | `LayeredRootFileSystem.cs` | - | - | Scanning & Detection | -| NATS/Redis Scan Queue | Scanner | `NatsScanQueue.cs`, `RedisScanQueue.cs` | - | `/ops/scanner` | Operations | -| Determinism Controls | Scanner | `DeterminismContext.cs`, `DeterministicTimeProvider.cs`, `DeterministicRandomProvider.cs` | `stella scan replay` | `/ops/scanner` | Determinism & Reproducibility | -| Lease-Based Job Processing | Scanner | `LeaseHeartbeatService.cs`, `ScanJobProcessor.cs` | - | - | Operations | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| License-Risk Detection | Scanner | No | No | Planned Q4-2025 - not yet implemented | -| Secrets Detection | Scanner | Implicit | Implicit | Document in matrix (runs automatically during scan) | -| OS Package Analyzers | Scanner | Implicit | Implicit | Document in matrix (6 OS-level analyzers) | -| Symbol-Level Matching | Scanner | No | No | Advanced feature - consider exposing in findings detail | -| SARIF Export | Scanner | Yes | No | Consider adding SARIF download in UI | -| Concurrent Worker Config | Scanner | No | Yes | CLI option for worker count would help CI/CD | - ---- - -## Batch 3: Reachability Analysis - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| 8-State Reachability Lattice | Reachability.Core | `ReachabilityLattice.cs` (28 state transitions) | - | `/reachability` | Reachability Analysis | -| Confidence Calculator | Reachability.Core | `ConfidenceCalculator.cs` (path/guard/hit bonuses) | - | - | Reachability Analysis | -| Evidence Weighted Score (EWS) | Signals | `EvidenceWeightedScoreCalculator.cs` (6 dimensions: RCH/RTS/BKP/XPL/SRC/MIT) | - | - | Scoring & Risk | -| Attested Reduction Scoring | Signals | VEX anchoring with short-circuit rules | - | - | Scoring & Risk | -| Hybrid Reachability Query | Reachability.Core | `IReachabilityIndex.cs` (static/runtime/hybrid/batch modes) | `stella reachgraph slice` | `/reachability` | Reachability Analysis | -| Reachability Replay/Verify | ReachGraph | `IReachabilityReplayService.VerifyAsync()` | `stella reachgraph replay/verify` | - | Determinism & Reproducibility | -| Graph Triple-Layer Storage | ReachGraph | `ReachGraphStoreService.cs` (Cache->DB->Archive) | - | - | Operations | -| Per-Graph Signing | ReachGraph | SHA256 artifact/provenance digests | - | - | Attestation & Signing | -| GraphViz/Mermaid Export | CLI | `stella reachability show --format dot/mermaid` | `stella reachability show` | - | Reachability Analysis | -| Reachability Drift Alerts | Docs | `19-reachability-drift-alert-flow.md` (state transition monitoring) | `stella drift` | - | Reachability Analysis | -| Evidence URIs | ReachGraph | `stella://reachgraph/{digest}/slice/{symbolId}` format | - | - | Evidence & Findings | -| Environment Guard Detection | Scanner | 20+ patterns (process.env, sys.platform, etc.) | - | `/reachability` | Reachability Analysis | -| Dynamic Loading Detection | Scanner | require(variable), import(variable), Class.forName() | - | - | Reachability Analysis | -| Reflection Call Detection | Scanner | Confidence scoring 0.5-0.6 for dynamic paths | - | - | Reachability Analysis | -| EWS Guardrails | Signals | Speculative cap (45), not-affected cap (15), runtime floor (60) | - | - | Scoring & Risk | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Runtime Signal Correlation | Signals | No | Yes | Add `stella signals inspect` command | -| Gate Detection | Scanner | No | Yes | Consider `stella reachability guards` command | -| Path Witness Generation | ReachGraph | Yes | No | Add witness path visualization in UI | -| Confidence Calculator | Reachability.Core | No | No | Internal implementation - consider exposing in findings | -| Evidence Weighted Score | Signals | No | Partial | Add `stella score explain` command | -| Graph Triple-Layer Storage | ReachGraph | No | No | Ops concern - consider admin commands | - ---- - -## Batch 4: Binary Analysis - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| 4 Fingerprint Algorithm Types | BinaryIndex | `BasicBlockFingerprintGenerator.cs`, `ControlFlowGraphFingerprintGenerator.cs`, `StringRefsFingerprintGenerator.cs` | `stella binary fingerprint` | - | Binary Analysis | -| Alpine Corpus Support | BinaryIndex | `AlpineCorpusConnector.cs` | - | - | Binary Analysis | -| VEX Evidence Bridge | BinaryIndex | `IVexEvidenceGenerator.cs` | - | - | VEX Processing | -| Delta Signature Matching | BinaryIndex | `LookupByDeltaSignatureAsync()` | `stella deltasig` | - | Binary Analysis | -| Symbol Hash Matching | BinaryIndex | `LookupBySymbolHashAsync()` | `stella binary symbols` | - | Binary Analysis | -| Corpus Function Identification | BinaryIndex | `IdentifyFunctionFromCorpusAsync()` | - | - | Binary Analysis | -| Binary Call Graph Extraction | BinaryIndex | `binary callgraph` command | `stella binary callgraph` | - | Binary Analysis | -| 3-Tier Identification Strategy | BinaryIndex | Package/Build-ID/Fingerprint tiers | - | - | Binary Analysis | -| Fingerprint Validation Stats | BinaryIndex | `FingerprintValidationStats.cs` (TP/FP/TN/FN) | - | - | Binary Analysis | -| Changelog CVE Parsing | BinaryIndex | `DebianChangelogParser.cs` (CVE pattern extraction) | - | - | Binary Analysis | -| Secfixes Parsing | BinaryIndex | `ISecfixesParser.cs` (Alpine format) | - | - | Binary Analysis | -| Batch Binary Operations | BinaryIndex | All lookup methods support batching | - | - | Binary Analysis | -| Binary Match Confidence Scoring | BinaryIndex | 0.0-1.0 confidence for all matches | - | - | Binary Analysis | -| Architecture-Aware Filtering | BinaryIndex | Match filtering by architecture | - | - | Binary Analysis | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Alpine Corpus | BinaryIndex | No | No | Add to matrix as additional corpus | -| Corpus Ingestion UI | BinaryIndex | No | No | Consider admin UI for corpus management | -| VEX Evidence Bridge | BinaryIndex | No | No | Internal integration - document in VEX section | -| Fingerprint Visualization | BinaryIndex | Yes | No | Consider UI for function fingerprint display | -| Batch Operations | BinaryIndex | No | No | Internal API - consider batch CLI commands | -| Delta Signatures | BinaryIndex | Yes | No | Consider UI integration for patch detection | - ---- - -## Batch 5: Advisory Sources - -### Discovered Features (Not in Matrix) - -**CRITICAL: Matrix lists 11 sources, but codebase has 33+ connectors!** - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| **SUSE Connector** | Concelier | `Connector.Distro.Suse/` | `stella db fetch suse` | - | Advisory Sources | -| **Astra Linux Connector** | Concelier | `Connector.Astra/` (FSTEC-certified Russian) | `stella db fetch astra` | - | Advisory Sources | -| **Microsoft MSRC** | Concelier | `vndr.msrc` vendor connector | - | - | Advisory Sources | -| **Oracle Connector** | Concelier | `vndr.oracle` vendor connector | - | - | Advisory Sources | -| **Adobe Connector** | Concelier | `vndr.adobe` vendor connector | - | - | Advisory Sources | -| **Apple Connector** | Concelier | `vndr.apple` vendor connector | - | - | Advisory Sources | -| **Cisco Connector** | Concelier | `vndr.cisco` vendor connector | - | - | Advisory Sources | -| **Chromium Connector** | Concelier | `vndr.chromium` vendor connector | - | - | Advisory Sources | -| **VMware Connector** | Concelier | `vndr.vmware` vendor connector | - | - | Advisory Sources | -| **JVN (Japan) CERT** | Concelier | `Connector.Jvn/` | - | - | Advisory Sources | -| **ACSC (Australia) CERT** | Concelier | `Connector.Acsc/` | - | - | Advisory Sources | -| **CCCS (Canada) CERT** | Concelier | `Connector.Cccs/` | - | - | Advisory Sources | -| **CertFr (France) CERT** | Concelier | `Connector.CertFr/` | - | - | Advisory Sources | -| **CertBund (Germany) CERT** | Concelier | `Connector.CertBund/` | - | - | Advisory Sources | -| **CertCc CERT** | Concelier | `Connector.CertCc/` | - | - | Advisory Sources | -| **CertIn (India) CERT** | Concelier | `Connector.CertIn/` | - | - | Advisory Sources | -| **RU-BDU (Russia) CERT** | Concelier | `Connector.Ru.Bdu/` | - | - | Advisory Sources | -| **RU-NKCKI (Russia) CERT** | Concelier | `Connector.Ru.Nkcki/` | - | - | Advisory Sources | -| **KISA (South Korea) CERT** | Concelier | `Connector.Kisa/` | - | - | Advisory Sources | -| **ICS-CISA (Industrial)** | Concelier | `Connector.Ics.Cisa/` | - | - | Advisory Sources | -| **ICS-Kaspersky (Industrial)** | Concelier | `Connector.Ics.Kaspersky/` | - | - | Advisory Sources | -| **StellaOpsMirror (Internal)** | Concelier | `Connector.StellaOpsMirror/` | - | - | Advisory Sources | -| Backport-Aware Precedence | Concelier | `ConfigurableSourcePrecedenceLattice.cs` | - | - | Advisory Sources | -| Link-Not-Merge Architecture | Concelier | Transitioning from merge to observation/linkset | - | - | Advisory Sources | -| Canonical Deduplication | Concelier | `ICanonicalAdvisoryService`, `CanonicalMerger.cs` | - | - | Advisory Sources | -| Change History Tracking | Concelier | `IChangeHistoryStore` (field-level diffs) | - | - | Advisory Sources | -| Feed Epoch Events | Concelier | `FeedEpochAdvancedEvent` (Provcache invalidation) | - | - | Advisory Sources | -| JSON Exporter | Concelier | `Exporter.Json/` (manifest-driven export) | `stella db export json` | - | Offline & Air-Gap | -| Trivy DB Exporter | Concelier | `Exporter.TrivyDb/` | `stella db export trivy` | - | Offline & Air-Gap | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| **22+ Connectors Missing from Matrix** | Concelier | Partial | No | ADD TO MATRIX - major documentation gap | -| Vendor PSIRTs (7 connectors) | Concelier | No | No | Add vendor section to matrix | -| Regional CERTs (11 connectors) | Concelier | No | No | Add regional CERT section to matrix | -| Industrial/ICS (2 connectors) | Concelier | No | No | Add ICS section to matrix | -| Link-Not-Merge Transition | Concelier | No | No | Document new architecture in matrix | -| Backport Precedence | Concelier | No | No | Document in merge engine section | -| Change History | Concelier | No | No | Consider audit trail UI | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md seriously underrepresents Concelier capabilities: -- **Listed:** 11 sources -- **Actual:** 33+ connectors - -Recommended additions: -1. Add "Vendor PSIRTs" section (Microsoft, Oracle, Adobe, Apple, Cisco, Chromium, VMware) -2. Add "Regional CERTs" section (JVN, ACSC, CCCS, CertFr, CertBund, CertIn, RU-BDU, KISA, etc.) -3. Add "Industrial/ICS" section (ICS-CISA, ICS-Kaspersky) -4. Add "Additional Distros" section (SUSE, Astra Linux) -5. Document backport-aware precedence configuration - ---- - -## Batch 6: VEX Processing - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| VEX Consensus Engine (5-state lattice) | VexLens | `VexConsensusEngine.cs`, `IVexConsensusEngine.cs` | `stella vex consensus` | `/vex` | VEX Processing | -| Trust Decay Service | VexLens | `TrustDecayService.cs`, `TrustDecayCalculator.cs` | - | - | VEX Processing | -| Noise Gate Service | VexLens | `NoiseGateService.cs` | - | `/vex` | VEX Processing | -| Consensus Rationale Service | VexLens | `IConsensusRationaleService.cs`, `ConsensusRationaleModels.cs` | - | `/vex` | VEX Processing | -| VEX Linkset Extraction | Excititor | `VexLinksetExtractionService.cs` | - | - | VEX Processing | -| VEX Linkset Disagreement Detection | Excititor | `VexLinksetDisagreementService.cs` | - | `/vex` | VEX Processing | -| VEX Statement Backfill | Excititor | `VexStatementBackfillService.cs` | - | - | VEX Processing | -| VEX Evidence Chunking | Excititor | `VexEvidenceChunkService.cs` | - | - | VEX Processing | -| Auto-VEX Downgrade | Excititor | `AutoVexDowngradeService.cs` | - | - | VEX Processing | -| Risk Feed Service | Excititor | `RiskFeedService.cs`, `RiskFeedEndpoints.cs` | - | - | VEX Processing | -| Trust Calibration Service | Excititor | `TrustCalibrationService.cs` | - | - | VEX Processing | -| VEX Hashing Service (deterministic) | Excititor | `VexHashingService.cs` | - | - | VEX Processing | -| CSAF Provider Connectors (7 total) | Excititor | `Connectors.*.CSAF/` (RedHat, Ubuntu, Oracle, MSRC, Cisco, SUSE) | - | - | VEX Processing | -| OCI OpenVEX Attestation Connector | Excititor | `Connectors.OCI.OpenVEX.Attest/` | - | - | VEX Processing | -| Issuer Key Lifecycle Management | IssuerDirectory | Key create/rotate/revoke endpoints | - | `/issuer-directory` | VEX Processing | -| Issuer Trust Override | IssuerDirectory | Trust override endpoints | - | `/issuer-directory` | VEX Processing | -| CSAF Publisher Bootstrap | IssuerDirectory | `csaf-publishers.json` seeding | - | - | VEX Processing | -| VEX Webhook Distribution | VexHub | `IWebhookService.cs`, `IWebhookSubscriptionRepository.cs` | - | - | VEX Processing | -| VEX Conflict Flagging | VexHub | `IStatementFlaggingService.cs` | - | - | VEX Processing | -| VEX from Drift Generation | CLI | `VexGenCommandGroup.cs` | `stella vex gen --from-drift` | - | VEX Processing | -| VEX Decision Signing | Policy | `VexDecisionSigningService.cs` | - | - | Policy Engine | -| VEX Proof Spine | Policy | `VexProofSpineService.cs` | - | - | Policy Engine | -| Consensus Propagation Rules | VexLens | `IPropagationRuleEngine.cs` | - | - | VEX Processing | -| Consensus Delta Computation | VexLens | `VexDeltaComputeService.cs` | - | - | VEX Processing | -| Triple-Layer Consensus Storage | VexLens | Cache->DB->Archive with `IConsensusProjectionStore.cs` | - | - | Operations | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| CSAF Provider Connectors | Excititor | No | No | Consider connector status UI in ops | -| Trust Weight Configuration | VexLens | No | Partial | Add `stella vex trust configure` command | -| VEX Distribution Webhooks | VexHub | No | No | Add webhook management UI/CLI | -| Conflict Resolution | VexLens | No | Partial | Interactive conflict resolution needed | -| Issuer Key Management | IssuerDirectory | No | Yes | Add `stella issuer keys` CLI | -| Risk Feed Distribution | Excititor | No | No | Consider risk feed CLI | -| Consensus Replay/Verify | VexLens | No | No | Add `stella vex verify` command | -| VEX Evidence Export | Excititor | No | No | Add `stella vex evidence export` | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md VEX section is significantly underspecified: -- **Listed:** Basic VEX support (OpenVEX, CSAF, CycloneDX) -- **Actual:** Full consensus engine with 5-state lattice, 9 trust factors, 7 CSAF connectors, conflict detection, issuer registry - -Recommended additions: -1. Add "VEX Consensus Engine" as major feature (VexLens) -2. Add "Trust Weight Scoring" with 9 factors documented -3. Add "CSAF Provider Connectors" section (7 vendors) -4. Add "Issuer Trust Registry" (IssuerDirectory) -5. Add "VEX Distribution" (VexHub webhooks) -6. Document AOC (Aggregation-Only Contract) compliance -7. Add "VEX from Drift" generation capability - ---- - -## Batch 7: Policy Engine - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| K4 Lattice (Belnap Four-Valued Logic) | Policy | `K4Lattice.cs`, `TrustLatticeEngine.cs`, `ClaimScoreMerger.cs` | - | `/policy` | Policy Engine | -| 10+ Policy Gate Types | Policy | `PolicyGateEvaluator.cs`, various *Gate.cs files | - | `/policy` | Policy Engine | -| Uncertainty Score Calculator | Policy.Determinization | `UncertaintyScoreCalculator.cs` (entropy 0.0-1.0) | - | - | Policy Engine | -| Decayed Confidence Calculator | Policy.Determinization | `DecayedConfidenceCalculator.cs` (14-day half-life) | - | - | Policy Engine | -| 6 Evidence Types | Policy.Determinization | `BackportEvidence.cs`, `CvssEvidence.cs`, `EpssEvidence.cs`, etc. | - | - | Policy Engine | -| 6 Risk Score Providers | RiskEngine | `CvssKevProvider.cs`, `EpssProvider.cs`, `FixChainRiskProvider.cs` | - | `/risk` | Scoring & Risk | -| FixChain Risk Metrics | RiskEngine | `FixChainRiskMetrics.cs`, `FixChainRiskDisplay.cs` | - | - | Scoring & Risk | -| Exception Effect Registry | Policy | `ExceptionEffectRegistry.cs`, `ExceptionAdapter.cs` | - | `/policy/exceptions` | Policy Engine | -| Exception Approval Rules | Policy | `IExceptionApprovalRulesService.cs` | - | `/policy/exceptions` | Policy Engine | -| Policy Simulation Service | Policy.Registry | `IPolicySimulationService.cs` | `stella policy simulate` | `/policy/simulate` | Policy Engine | -| Policy Promotion Pipeline | Policy.Registry | `IPromotionService.cs`, `IPublishPipelineService.cs` | - | - | Policy Engine | -| Review Workflow Service | Policy.Registry | `IReviewWorkflowService.cs` | - | - | Policy Engine | -| Sealed Mode Service | Policy | `ISealedModeService.cs` | - | `/ops` | Offline & Air-Gap | -| Verdict Attestation Service | Policy | `IVerdictAttestationService.cs` | - | - | Attestation & Signing | -| Policy Decision Attestation | Policy | `IPolicyDecisionAttestationService.cs` (DSSE/Rekor) | - | - | Attestation & Signing | -| Score Policy YAML Config | Policy | `ScorePolicyModels.cs`, `ScorePolicyLoader.cs` | `stella policy validate` | `/policy` | Policy Engine | -| Profile-Aware Scoring | Policy.Scoring | `ProfileAwareScoringService.cs`, `ScoringProfileService.cs` | - | - | Policy Engine | -| Freshness-Aware Scoring | Policy | `FreshnessAwareScoringService.cs` | - | - | Policy Engine | -| Jurisdiction Trust Rules | Policy.Vex | `JurisdictionTrustRules.cs` | - | - | Policy Engine | -| VEX Customer Override | Policy.Vex | `VexCustomerOverride.cs` | - | - | Policy Engine | -| Attestation Report Service | Policy | `IAttestationReportService.cs` | - | - | Attestation & Signing | -| Risk Scoring Trigger Service | Policy.Scoring | `RiskScoringTriggerService.cs` | - | - | Scoring & Risk | -| Policy Lint Endpoint | Policy | `/policy/lint` | - | - | Policy Engine | -| Policy Determinism Verification | Policy | `/policy/verify-determinism` | - | - | Determinism & Reproducibility | -| AdvisoryAI Knobs Endpoint | Policy | `/policy/advisory-ai/knobs` | - | - | Policy Engine | -| Stability Damping Gate | Policy | `StabilityDampingGate.cs` | - | - | Policy Engine | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| K4 Lattice Operations | Policy | No | Partial | Add `stella policy lattice explain` for debugging | -| Risk Provider Configuration | RiskEngine | No | No | Provider configuration needs CLI/UI exposure | -| Exception Approval Workflow | Policy | No | Yes | Add `stella policy exception approve/reject` CLI | -| Determinization Signal Weights | Policy | No | No | Allow signal weight tuning via CLI/config | -| Policy Pack Promotion | Policy.Registry | No | Partial | Add `stella policy promote` CLI | -| Score Policy Tuning | Policy.Scoring | Partial | Partial | Expand `stella policy` commands | -| Verdict Attestation Export | Policy | No | No | Add `stella policy verdicts export` | -| Risk Scoring History | RiskEngine | No | Partial | Consider historical trend CLI | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md Policy section covers basics but misses advanced features: -- **Listed:** Basic policy evaluation, exceptions -- **Actual:** Full K4 lattice, 10+ gate types, 6 risk providers, determinization system - -Recommended additions: -1. Add "K4 Lattice Logic" as core feature (Belnap four-valued logic) -2. Add "Policy Gate Types" section (10+ specialized gates) -3. Add "Risk Score Providers" section (6 providers with distinct purposes) -4. Add "Determinization System" (signal weights, decay, uncertainty) -5. Add "Score Policy Configuration" (YAML-based policy tuning) -6. Add "Policy Simulation" as distinct feature -7. Add "Verdict Attestations" (DSSE/Rekor integration) -8. Document "Sealed Mode" for air-gap operations - ---- - -## Batch 8: Attestation & Signing - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| 25+ Predicate Types | Attestor | `StellaOps.Attestor.ProofChain/Predicates/` | - | - | Attestation & Signing | -| Keyless Signing (Fulcio) | Signer | `KeylessDsseSigner.cs`, `HttpFulcioClient.cs` | `stella sign keyless` | - | Attestation & Signing | -| Ephemeral Key Generation | Signer.Keyless | `EphemeralKeyGenerator.cs`, `EphemeralKeyPair.cs` | - | - | Attestation & Signing | -| OIDC Token Provider | Signer.Keyless | `IOidcTokenProvider.cs`, `AmbientOidcTokenProvider.cs` | - | - | Attestation & Signing | -| Key Rotation Service | Signer.KeyManagement | `IKeyRotationService.cs`, `KeyRotationService.cs` | `/keys/rotate` API | - | Attestation & Signing | -| Trust Anchor Manager | Signer.KeyManagement | `ITrustAnchorManager.cs`, `TrustAnchorManager.cs` | - | - | Attestation & Signing | -| Delta Attestations (4 types) | Attestor | `IDeltaAttestationService.cs` (VEX/SBOM/Verdict/Reachability) | - | - | Attestation & Signing | -| Layer Attestation Service | Attestor | `ILayerAttestationService.cs` | - | - | Attestation & Signing | -| Attestation Chain Builder | Attestor | `AttestationChainBuilder.cs`, `AttestationChainValidator.cs` | - | - | Attestation & Signing | -| Attestation Link Store | Attestor | `IAttestationLinkStore.cs`, `IAttestationLinkResolver.cs` | - | - | Attestation & Signing | -| Rekor Submission Queue | Attestor | `IRekorSubmissionQueue.cs` (durable retry) | - | - | Attestation & Signing | -| Cached Verification Service | Attestor | `CachedAttestorVerificationService.cs` | - | - | Attestation & Signing | -| Offline Bundle Service | Attestor | `IAttestorBundleService.cs` | - | `/ops/offline-kit` | Offline & Air-Gap | -| Signer Quota Service | Signer | `ISignerQuotaService.cs` | - | - | Operations | -| Signer Audit Sink | Signer | `ISignerAuditSink.cs`, `InMemorySignerAuditSink.cs` | - | - | Operations | -| Proof of Entitlement | Signer | `IProofOfEntitlementIntrospector.cs` (JWT/MTLS) | - | - | Auth & Access Control | -| Release Integrity Verifier | Signer | `IReleaseIntegrityVerifier.cs` | - | - | Attestation & Signing | -| JSON Canonicalizer (RFC 8785) | Attestor | `JsonCanonicalizer.cs` | - | - | Determinism & Reproducibility | -| Predicate Type Router | Attestor | `IPredicateTypeRouter.cs`, `PredicateTypeRouter.cs` | - | - | Attestation & Signing | -| Standard Predicate Registry | Attestor | `IStandardPredicateRegistry.cs` | - | - | Attestation & Signing | -| HMAC Signing | Signer | `HmacDsseSigner.cs` | - | - | Attestation & Signing | -| SM2 Algorithm Support | Signer | `CryptoDsseSigner.cs` (SM2 branch) | - | - | Regional Crypto | -| Promotion Attestation | Provenance | `PromotionAttestation.cs` | - | - | Release Orchestration | -| Cosign/KMS Signer | Provenance | `CosignAndKmsSigner.cs` | - | - | Attestation & Signing | -| Rotating Signer | Provenance | `RotatingSigner.cs` | - | - | Attestation & Signing | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Key Rotation | Signer | No | No | Add `stella keys rotate` CLI command | -| Trust Anchor Management | Signer | No | No | Add `stella trust-anchors` commands | -| Attestation Chain Visualization | Attestor | No | Partial | Add chain visualization UI | -| Predicate Registry Browser | Attestor | No | No | Add `stella attest predicates list` | -| Delta Attestation CLI | Attestor | No | No | Add `stella attest delta` commands | -| Signer Audit Logs | Signer | No | No | Add `stella sign audit` command | -| Rekor Submission Status | Attestor | No | No | Add submission queue status UI | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md Attestation section lists basic DSSE/in-toto support: -- **Listed:** Basic attestation attach/verify, SLSA provenance -- **Actual:** 25+ predicate types, keyless signing, key rotation, attestation chains - -Recommended additions: -1. Add "Predicate Types" section (25+ types documented) -2. Add "Keyless Signing (Sigstore)" as major feature -3. Add "Key Rotation Service" for Enterprise tier -4. Add "Trust Anchor Management" for Enterprise tier -5. Add "Attestation Chains" feature -6. Add "Delta Attestations" (VEX/SBOM/Verdict/Reachability) -7. Document "Offline Bundle Service" for air-gap -8. Add "SM2 Algorithm Support" in Regional Crypto section - ---- - -## Batch 9: Regional Crypto - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| 8 Signature Profiles | Cryptography | `SignatureProfile.cs` | - | - | Regional Crypto | -| Ed25519 Baseline Signing | Cryptography | `Ed25519Signer.cs`, `Ed25519Verifier.cs` | - | - | Regional Crypto | -| ECDSA P-256 Profile | Cryptography | `EcdsaP256Signer.cs` | - | - | Regional Crypto | -| FIPS 140-2 Plugin | Cryptography | `FipsPlugin.cs` | - | - | Regional Crypto | -| GOST R 34.10-2012 Plugin | Cryptography | `GostPlugin.cs` | - | - | Regional Crypto | -| SM2/SM3/SM4 Plugin | Cryptography | `SmPlugin.cs` | - | - | Regional Crypto | -| eIDAS Plugin (CAdES/XAdES) | Cryptography | `EidasPlugin.cs` | - | - | Regional Crypto | -| HSM Plugin (PKCS#11) | Cryptography | `HsmPlugin.cs` (simulated + production) | - | - | Regional Crypto | -| CryptoPro GOST (Windows) | Cryptography | `CryptoProGostCryptoProvider.cs` | - | - | Regional Crypto | -| Multi-Profile Signing | Cryptography | `MultiProfileSigner.cs` | - | - | Regional Crypto | -| SM Remote Service | SmRemote | `Program.cs` | - | - | Regional Crypto | -| Post-Quantum Profiles (Defined) | Cryptography | `SignatureProfile.cs` (Dilithium, Falcon) | - | - | Regional Crypto | -| RFC 3161 TSA Integration | Cryptography | `EidasPlugin.cs` | - | - | Regional Crypto | -| Simulated HSM Client | Cryptography | `SimulatedHsmClient.cs` | - | - | Regional Crypto | -| GOST Block Cipher (28147-89) | Cryptography | `GostPlugin.cs` | - | - | Regional Crypto | -| SM4 Encryption (CBC/ECB/GCM) | Cryptography | `SmPlugin.cs` | - | - | Regional Crypto | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Crypto Profile Selection | Cryptography | No | No | Add `stella crypto profiles` command | -| Plugin Health Check | Cryptography | No | No | Add plugin status endpoint | -| Key Management CLI | Cryptography | No | No | Add `stella keys` commands | -| HSM Status | Cryptography | No | No | Add HSM health monitoring | -| Post-Quantum Implementation | Cryptography | No | No | Implement Dilithium/Falcon when stable | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md Regional Crypto section mentions only FIPS/eIDAS/GOST: -- **Listed:** Basic regional compliance mentions -- **Actual:** 8 signature profiles, 6 plugins, HSM support, post-quantum readiness - -Recommended additions: -1. Add "Signature Profiles" section (8 profiles documented) -2. Add "Plugin Architecture" description -3. Add "Multi-Profile Signing" capability (dual-stack signatures) -4. Add "SM Remote Service" for Chinese market -5. Add "Post-Quantum Readiness" (Dilithium, Falcon defined) -6. Add "HSM Integration" (PKCS#11 + simulation) -7. Document plugin configuration options -8. Add "CryptoPro GOST" for Windows environments - ---- - -## Batch 10: Evidence & Findings - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| WORM Storage (S3 Object Lock) | EvidenceLocker | `S3EvidenceObjectStore.cs` | - | - | Evidence & Findings | -| Verdict Attestations (DSSE) | EvidenceLocker | `VerdictEndpoints.cs`, `VerdictContracts.cs` | - | `/evidence-export` | Evidence & Findings | -| Append-Only Ledger Events | Findings | `ILedgerEventRepository.cs`, `LedgerEventModels.cs` | - | `/findings` | Evidence & Findings | -| Alert Triage Bands (hot/warm/cold) | Findings | `DecisionModels.cs` | - | `/findings` | Evidence & Findings | -| Merkle Anchoring | Findings | `Infrastructure/Merkle/` | - | - | Evidence & Findings | -| Evidence Holds (Legal) | EvidenceLocker | `EvidenceHold.cs` | - | - | Evidence & Findings | -| Evidence Pack Service | Evidence.Pack | `IEvidencePackService.cs`, `EvidencePack.cs` | - | `/evidence-thread` | Evidence & Findings | -| Evidence Card Service | Evidence.Pack | `IEvidenceCardService.cs`, `EvidenceCard.cs` | - | - | Evidence & Findings | -| Profile-Based Export | ExportCenter | `ExportApiEndpoints.cs`, `ExportProfile` | - | `/evidence-export` | Evidence & Findings | -| Risk Bundle Export | ExportCenter | `RiskBundleEndpoints.cs` | - | `/evidence-export` | Evidence & Findings | -| Audit Bundle Export | ExportCenter | `AuditBundleEndpoints.cs` | - | - | Evidence & Findings | -| Lineage Evidence Export | ExportCenter | `LineageExportEndpoints.cs` | - | `/lineage` | Evidence & Findings | -| SSE Export Streaming | ExportCenter | Real-time run events | - | - | Evidence & Findings | -| Incident Mode | Findings | `IIncidentModeState.cs` | - | - | Evidence & Findings | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Evidence Holds | EvidenceLocker | No | No | Add legal hold management CLI | -| Audit Bundle Export | ExportCenter | No | Partial | Add `stella export audit` command | -| Incident Mode | Findings | No | No | Add `stella findings incident` commands | - ---- - -## Batch 11: Determinism & Replay - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| Hybrid Logical Clock | HybridLogicalClock | `HybridLogicalClock.cs`, `HlcTimestamp.cs` | - | - | Determinism & Replay | -| HLC State Persistence | HybridLogicalClock | `IHlcStateStore.cs` | - | - | Determinism & Replay | -| Canonical JSON (RFC 8785) | Canonical.Json | `CanonJson.cs`, `CanonVersion.cs` | - | - | Determinism & Replay | -| Replay Manifests V1/V2 | Replay.Core | `ReplayManifest.cs` | `stella scan replay` | - | Determinism & Replay | -| Knowledge Snapshots | Replay.Core | `KnowledgeSnapshot.cs` | - | - | Determinism & Replay | -| Replay Proofs (DSSE) | Replay.Core | `ReplayProof.cs` | `stella prove` | - | Determinism & Replay | -| Evidence Weighted Scoring (6 factors) | Signals | `EvidenceWeightedScoreCalculator.cs` | - | - | Scoring & Risk | -| Score Buckets (ActNow/ScheduleNext/Investigate/Watchlist) | Signals | Scoring algorithm | - | - | Scoring & Risk | -| Attested Reduction (short-circuit) | Signals | VEX anchoring logic | - | - | Scoring & Risk | -| Timeline Events | Eventing | `TimelineEvent.cs`, `ITimelineEventEmitter.cs` | - | - | Determinism & Replay | -| Deterministic Event IDs | Eventing | `EventIdGenerator.cs` (SHA-256) | - | - | Determinism & Replay | -| Transactional Outbox | Eventing | `TimelineOutboxProcessor.cs` | - | - | Determinism & Replay | -| Event Signing (DSSE) | Eventing | `IEventSigner.cs` | - | - | Determinism & Replay | -| Replay Bundle Writer | Replay.Core | `StellaReplayBundleWriter.cs` (tar.zst) | - | - | Determinism & Replay | -| Dead Letter Replay | Orchestrator | `IReplayManager.cs`, `ReplayManager.cs` | - | - | Operations | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| HLC Inspection | HybridLogicalClock | No | No | Add `stella hlc status` command | -| Timeline Events | Eventing | No | No | Add `stella timeline query` command | -| Scoring Explanation | Signals | No | No | Add `stella score explain` command | - ---- - -## Batch 12: Operations - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| Impact Index (Roaring bitmaps) | Scheduler | `IImpactIndex.cs` | - | - | Operations | -| Graph Build/Overlay Jobs | Scheduler | `IGraphJobService.cs` | - | `/ops/scheduler` | Operations | -| Run Preview (dry-run) | Scheduler | `RunEndpoints.cs` | - | - | Operations | -| SSE Run Streaming | Scheduler | `/runs/{runId}/stream` | - | - | Operations | -| Job Repository | Orchestrator | `IJobRepository.cs`, `Job.cs` | - | `/orchestrator` | Operations | -| Lease Management | Orchestrator | `LeaseNextAsync()`, `ExtendLeaseAsync()` | - | - | Operations | -| Dead Letter Classification | Orchestrator | `DeadLetterEntry.cs` | - | `/orchestrator` | Operations | -| First Signal Service | Orchestrator | `IFirstSignalService.cs` | - | - | Operations | -| Task Pack Execution | TaskRunner | `ITaskRunnerClient.cs` | - | - | Operations | -| Plan-Hash Binding | TaskRunner | Deterministic validation | - | - | Operations | -| Approval Gates | TaskRunner | `ApprovalDecisionRequest.cs` | - | - | Operations | -| Artifact Capture | TaskRunner | Digest tracking | - | - | Operations | -| Timeline Query Service | TimelineIndexer | `ITimelineQueryService.cs` | - | - | Operations | -| Timeline Ingestion | TimelineIndexer | `ITimelineIngestionService.cs` | - | - | Operations | -| Token-Bucket Rate Limiting | Orchestrator | Adaptive refill per tenant | - | - | Operations | -| Job Watermarks | Orchestrator | Ordering guarantees | - | - | Operations | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Impact Preview | Scheduler | No | Partial | Add `stella scheduler preview` command | -| Job Management | Orchestrator | No | Yes | Add `stella orchestrator jobs` commands | -| Dead Letter Operations | Orchestrator | No | Yes | Add `stella orchestrator deadletter` commands | -| TaskRunner CLI | TaskRunner | No | No | Add `stella taskrunner` commands | -| Timeline Query CLI | TimelineIndexer | No | No | Add `stella timeline` commands | - ---- - -## Batch 13: Release Orchestration - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| Environment Bundles | ReleaseOrchestrator | `IEnvironmentBundleService.cs`, `EnvironmentBundle.cs` | - | `/releases` | Release Orchestration | -| Promotion Workflows | ReleaseOrchestrator | `IPromotionWorkflowService.cs`, `PromotionRequest.cs` | - | `/releases` | Release Orchestration | -| Rollback Service | ReleaseOrchestrator | `IRollbackService.cs`, `RollbackRequest.cs` | - | `/releases` | Release Orchestration | -| Deployment Agents (Docker/Compose/ECS/Nomad) | ReleaseOrchestrator | `IDeploymentAgent.cs`, various agent implementations | - | `/releases` | Release Orchestration | -| Progressive Delivery (A/B, Canary) | ReleaseOrchestrator | `IProgressiveDeliveryService.cs` | - | `/releases` | Release Orchestration | -| Hook System (Pre/Post Deploy) | ReleaseOrchestrator | `IHookExecutionService.cs`, `Hook.cs` | - | `/releases` | Release Orchestration | -| Approval Gates (Multi-Stage) | ReleaseOrchestrator | `IApprovalGateService.cs`, `ApprovalGate.cs` | - | `/releases` | Release Orchestration | -| Release Bundle Signing | ReleaseOrchestrator | `IReleaseBundleSigningService.cs` | - | - | Release Orchestration | -| Environment Promotion History | ReleaseOrchestrator | `IPromotionHistoryService.cs` | - | `/releases` | Release Orchestration | -| Deployment Lock Service | ReleaseOrchestrator | `IDeploymentLockService.cs` | - | - | Release Orchestration | -| Release Manifest Generation | ReleaseOrchestrator | `IReleaseManifestService.cs` | - | - | Release Orchestration | -| Promotion Attestations | ReleaseOrchestrator | `PromotionAttestation.cs` | - | - | Attestation & Signing | -| Environment Health Checks | ReleaseOrchestrator | `IEnvironmentHealthService.cs` | - | `/releases` | Release Orchestration | -| Deployment Verification Tests | ReleaseOrchestrator | `IVerificationTestService.cs` | - | - | Release Orchestration | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Release Bundle Creation | ReleaseOrchestrator | No | Partial | Add `stella release create` command | -| Environment Promotion | ReleaseOrchestrator | No | Yes | Add `stella release promote` command | -| Rollback Operations | ReleaseOrchestrator | No | Yes | Add `stella release rollback` command | -| Hook Management | ReleaseOrchestrator | No | Partial | Add `stella release hooks` commands | -| Deployment Agent Status | ReleaseOrchestrator | No | Partial | Add `stella agent status` command | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md Release Orchestration section is largely planned: -- **Listed:** Basic environment management concepts -- **Actual:** Full promotion workflow, deployment agents, progressive delivery - -Recommended additions: -1. Add "Deployment Agents" section (Docker, Compose, ECS, Nomad) -2. Add "Progressive Delivery" (A/B, Canary strategies) -3. Add "Approval Gates" (multi-stage approvals) -4. Add "Hook System" (pre/post deployment hooks) -5. Add "Promotion Attestations" (DSSE signing of promotions) -6. Document "Environment Health Checks" - ---- - -## Batch 14: Auth & Access Control - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| 75+ Authorization Scopes | Authority | `AuthorizationScopeConstants.cs` | - | `/admin/roles` | Auth & Access Control | -| DPoP Sender Constraints | Authority | `DPoPService.cs`, `DPoPValidator.cs` | - | - | Auth & Access Control | -| mTLS Sender Constraints | Authority | `MtlsClientCertificateValidator.cs` | - | - | Auth & Access Control | -| Device Authorization Flow | Authority | `DeviceAuthorizationEndpoints.cs` | - | `/login` | Auth & Access Control | -| JWT Profile for OAuth | Authority | `JwtBearerClientAssertionValidator.cs` | - | - | Auth & Access Control | -| PAR (Pushed Authorization Requests) | Authority | `ParEndpoints.cs` | - | - | Auth & Access Control | -| Tenant Isolation | Authority | `ITenantContext.cs`, `TenantResolutionMiddleware.cs` | - | - | Auth & Access Control | -| Role-Based Access Control | Authority | `IRoleService.cs`, `Role.cs` | - | `/admin/roles` | Auth & Access Control | -| Permission Grant Service | Authority | `IPermissionGrantService.cs` | - | - | Auth & Access Control | -| Token Introspection | Authority | `TokenIntrospectionEndpoints.cs` | - | - | Auth & Access Control | -| Token Revocation | Authority | `TokenRevocationEndpoints.cs` | - | - | Auth & Access Control | -| OAuth Client Management | Authority | `IClientRepository.cs`, `Client.cs` | - | `/admin/clients` | Auth & Access Control | -| User Federation (LDAP/SAML) | Authority | `IFederationProvider.cs` | - | `/admin/federation` | Auth & Access Control | -| Session Management | Authority | `ISessionStore.cs`, `Session.cs` | - | - | Auth & Access Control | -| Consent Management | Authority | `IConsentStore.cs`, `Consent.cs` | - | `/consent` | Auth & Access Control | -| Registry Token Service | Registry | `ITokenService.cs`, `TokenModels.cs` | `stella registry login` | - | Auth & Access Control | -| Scope-Based Token Minting | Registry | Pull/push/catalog scope handling | - | - | Auth & Access Control | -| Token Refresh Flow | Authority | Refresh token rotation | - | - | Auth & Access Control | -| Multi-Factor Authentication | Authority | `IMfaService.cs` | - | `/login/mfa` | Auth & Access Control | -| API Key Management | Authority | `IApiKeyService.cs` | - | `/admin/api-keys` | Auth & Access Control | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Scope Management | Authority | No | Yes | Add `stella auth scopes` commands | -| DPoP Configuration | Authority | No | No | Add DPoP configuration documentation | -| Client Management | Authority | No | Yes | Add `stella auth clients` commands | -| Role Management | Authority | No | Yes | Add `stella auth roles` commands | -| API Key Operations | Authority | No | Yes | Add `stella auth api-keys` commands | -| Token Introspection | Authority | No | No | Add `stella auth token inspect` command | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md Auth section covers basics but misses advanced features: -- **Listed:** Basic OAuth/OIDC, RBAC -- **Actual:** 75+ scopes, DPoP/mTLS, federation, advanced OAuth flows - -Recommended additions: -1. Add "Authorization Scopes" section (75+ granular scopes) -2. Add "Sender Constraints" (DPoP, mTLS) -3. Add "Device Authorization Flow" for CLI/IoT -4. Add "User Federation" (LDAP, SAML integration) -5. Add "PAR Support" for security-conscious clients -6. Add "Multi-Factor Authentication" -7. Add "API Key Management" for service accounts -8. Document "Tenant Isolation" architecture - ---- - -## Batch 15: Notifications & Integrations - -### Discovered Features (Not in Matrix) - -| Feature | Module | Key Files | CLI | UI | Suggested Category | -|---------|--------|-----------|-----|----|--------------------| -| 10 Notification Channel Types | Notify | Email, Slack, Teams, Webhook, PagerDuty, SNS, SQS, Pub/Sub, Discord, Matrix | - | `/notifications` | Notifications | -| Template-Based Notifications | Notify | `INotificationTemplateService.cs`, `NotificationTemplate.cs` | - | `/notifications` | Notifications | -| Channel Routing Rules | Notify | `IChannelRoutingService.cs`, `RoutingRule.cs` | - | `/notifications` | Notifications | -| Delivery Receipt Tracking | Notify | `IDeliveryReceiptService.cs`, `DeliveryReceipt.cs` | - | - | Notifications | -| Notification Preferences | Notify | `IPreferenceService.cs`, `UserPreference.cs` | - | `/settings` | Notifications | -| Digest/Batch Notifications | Notify | `IDigestService.cs` | - | `/notifications` | Notifications | -| Kubernetes Admission Webhooks | Zastava | `AdmissionWebhookEndpoints.cs` | - | - | Integrations | -| OCI Registry Push Hooks | Zastava | `IWebhookProcessor.cs`, `RegistryPushEvent.cs` | - | - | Integrations | -| Scan-on-Push Trigger | Zastava | Auto-trigger scanning on registry push | - | - | Integrations | -| SCM Webhooks (GitHub/GitLab/Bitbucket) | Integrations | `IScmWebhookHandler.cs` | - | `/integrations` | Integrations | -| CI/CD Webhooks | Integrations | Jenkins, CircleCI, GitHub Actions integration | - | `/integrations` | Integrations | -| Issue Tracker Integration | Integrations | Jira, GitHub Issues, Linear integration | - | `/integrations` | Integrations | -| Slack App Integration | Integrations | `ISlackAppService.cs`, slash commands | - | `/integrations` | Integrations | -| MS Teams App Integration | Integrations | `ITeamsAppService.cs`, adaptive cards | - | `/integrations` | Integrations | -| Notification Studio | Notifier | Template design and preview | - | `/notifications/studio` | Notifications | -| Escalation Rules | Notify | `IEscalationService.cs` | - | `/notifications` | Notifications | -| On-Call Schedule Integration | Notify | PagerDuty, OpsGenie integration | - | `/notifications` | Notifications | -| Webhook Retry Logic | Notify | Exponential backoff, dead letter | - | - | Notifications | -| Event-Driven Notifications | Notify | Timeline event subscription | - | - | Notifications | -| Custom Webhook Payloads | Integrations | `IWebhookPayloadFormatter.cs` | - | `/integrations` | Integrations | - -### Coverage Gaps - -| Feature | Module | Has CLI | Has UI | Recommendation | -|---------|--------|---------|--------|----------------| -| Channel Configuration | Notify | No | Yes | Add `stella notify channels` commands | -| Template Management | Notify | No | Yes | Add `stella notify templates` commands | -| Webhook Testing | Integrations | No | Partial | Add `stella integrations test` command | -| K8s Webhook Installation | Zastava | No | No | Add `stella zastava install` command | -| Notification Preferences | Notify | No | Yes | Add `stella notify preferences` commands | - -### Matrix Update Recommendations - -The FEATURE_MATRIX.md Notifications section is basic: -- **Listed:** Basic webhook/email notifications -- **Actual:** 10 channel types, template engine, routing rules, escalation - -Recommended additions: -1. Add "Notification Channels" section (10 types) -2. Add "Template Engine" for customizable messages -3. Add "Channel Routing" for sophisticated delivery -4. Add "Escalation Rules" for incident response -5. Add "Notification Studio" for template design -6. Add "Kubernetes Admission Webhooks" (Zastava) -7. Add "SCM Integrations" (GitHub, GitLab, Bitbucket) -8. Add "CI/CD Integrations" (Jenkins, CircleCI, GitHub Actions) -9. Add "Issue Tracker Integration" (Jira, GitHub Issues) -10. Document "Scan-on-Push" auto-trigger - ---- - -## Summary: Overall Matrix Gaps - -### Major Documentation Gaps Identified - -| Category | Matrix Coverage | Actual Coverage | Gap Severity | -|----------|-----------------|-----------------|--------------| -| Advisory Sources | 11 sources | 33+ connectors | **CRITICAL** | -| VEX Processing | Basic | Full consensus engine | **HIGH** | -| Attestation & Signing | Basic | 25+ predicates | **HIGH** | -| Auth Scopes | Basic RBAC | 75+ granular scopes | **HIGH** | -| Policy Engine | Basic | K4 lattice, 10+ gates | **MEDIUM** | -| Regional Crypto | 3 profiles | 8 profiles, 6 plugins | **MEDIUM** | -| Notifications | 2 channels | 10 channels | **MEDIUM** | -| Binary Analysis | Basic | 4 fingerprint algorithms | **MEDIUM** | -| Release Orchestration | Planned | Partially implemented | **LOW** | - -### CLI/UI Coverage Statistics - -| Metric | Value | -|--------|-------| -| Features with CLI | ~65% | -| Features with UI | ~70% | -| Features with both | ~55% | -| Internal-only features | ~25% | - -### Recommended Next Steps - -1. **Immediate**: Update Advisory Sources section (33+ connectors undocumented) -2. **High Priority**: Document VEX consensus engine capabilities -3. **High Priority**: Document attestation predicate types -4. **Medium Priority**: Update auth scopes documentation -5. **Medium Priority**: Complete policy engine documentation -6. **Low Priority**: Document internal operations features diff --git a/docs/FEATURE_MATRIX.md b/docs/FEATURE_MATRIX.md index c7738bccc..1af1efeda 100755 --- a/docs/FEATURE_MATRIX.md +++ b/docs/FEATURE_MATRIX.md @@ -20,16 +20,16 @@ **Principle:** Pay for scale, not for features or automation. No per-seat, per-project, or per-deployment taxes. -| Plan | Price | Environments | New Digests/Day | Deployments | Notes | -|------|-------|--------------|-----------------|-------------|-------| -| **Free** | $0/month | 3 | 333 | Unlimited (fair use) | Full features | -| **Pro** | $699/month | 33 | 3,333 | Unlimited (fair use) | Same features | -| **Enterprise** | $1,999/month | Unlimited | Unlimited | Unlimited | Fair use on mirroring/audit bandwidth | +| Plan | Price | Environments | New Digests/Day | +|------|-------|--------------|------------------| +| **Free** | $0/month | 3 | 333 | +| **Pro** | $699/month | 33 | 3,333 | +| **Enterprise** | $1,999/month | Unlimited | Unlimited | **Key Principles:** - All plans include all features (no feature gating) -- Limits are environments + new digests analyzed per day -- Unlimited deployments with fair use policy +- Only limits are environments and new digests analyzed per day +- All other capabilities are identical across all tiers --- @@ -37,75 +37,74 @@ *These differentiators are available across all plans.* -| Capability | Free | Pro | Enterprise | Notes | -|------------|:----:|:---:|:----------:|-------| -| Signed Replayable Risk Verdicts | ✅ | ✅ | ✅ | Core differentiator | -| Decision Capsules | ✅ | ✅ | ✅ | Audit-grade evidence bundles | -| VEX Decisioning Engine | ✅ | ✅ | ✅ | Trust lattice + conflict resolution | -| Reachability with Portable Proofs | ✅ | ✅ | ✅ | Three-layer analysis | -| Smart-Diff (Semantic Risk Delta) | ✅ | ✅ | ✅ | Material change detection | -| Unknowns as First-Class State | ✅ | ✅ | ✅ | Uncertainty budgets | -| Deterministic Replay | ✅ | ✅ | ✅ | `stella replay srm.yaml` | -| Non-Kubernetes First-Class | ✅ | ✅ | ✅ | Docker/Compose/ECS/Nomad targets | -| Digest-First Release Identity | ✅ | ✅ | ✅ | Immutable releases | +| Capability | Notes | +|------------|-------| +| Signed Replayable Risk Verdicts | Core differentiator | +| Decision Capsules | Audit-grade evidence bundles | +| VEX Decisioning Engine | Trust lattice + conflict resolution | +| Reachability with Portable Proofs | Three-layer analysis | +| Smart-Diff (Semantic Risk Delta) | Material change detection | +| Unknowns as First-Class State | Uncertainty budgets | +| Deterministic Replay | `stella replay srm.yaml` | +| Non-Kubernetes First-Class | Docker/Compose/ECS/Nomad targets | +| Digest-First Release Identity | Immutable releases | --- ## Release Orchestration (Planned) -*Release orchestration capabilities are planned for implementation. All plans will include all features.* +*Release orchestration capabilities are planned for implementation.* -| Capability | Free | Pro | Enterprise | Notes | -|------------|:----:|:---:|:----------:|-------| -| **Environment Management** | | | | | -| Environment CRUD | ⏳ | ⏳ | ⏳ | Dev/Stage/Prod definitions | -| Freeze Windows | ⏳ | ⏳ | ⏳ | Calendar-based blocking | -| Approval Policies | ⏳ | ⏳ | ⏳ | Per-environment rules | -| **Release Management** | | | | | -| Component Registry | ⏳ | ⏳ | ⏳ | Service → repository mapping | -| Release Bundles | ⏳ | ⏳ | ⏳ | Component → digest bundles | -| Semantic Versioning | ⏳ | ⏳ | ⏳ | SemVer release versions | -| Tag → Digest Resolution | ⏳ | ⏳ | ⏳ | Immutable digest pinning | -| **Promotion & Gates** | | | | | -| Promotion Workflows | ⏳ | ⏳ | ⏳ | Environment transitions | -| Security Gate | ⏳ | ⏳ | ⏳ | Scan verdict evaluation | -| Approval Gate | ⏳ | ⏳ | ⏳ | Human sign-off | -| Freeze Window Gate | ⏳ | ⏳ | ⏳ | Calendar enforcement | -| Policy Gate (OPA/Rego) | ⏳ | ⏳ | ⏳ | Custom rules | -| Decision Records | ⏳ | ⏳ | ⏳ | Evidence-linked decisions | -| **Deployment Execution** | | | | | -| Docker Host Agent | ⏳ | ⏳ | ⏳ | Direct container deployment | -| Compose Host Agent | ⏳ | ⏳ | ⏳ | Docker Compose deployment | -| SSH Agentless | ⏳ | ⏳ | ⏳ | Linux remote execution | -| WinRM Agentless | ⏳ | ⏳ | ⏳ | Windows remote execution | -| ECS Agent | ⏳ | ⏳ | ⏳ | AWS ECS deployment | -| Nomad Agent | ⏳ | ⏳ | ⏳ | HashiCorp Nomad deployment | -| Rollback | ⏳ | ⏳ | ⏳ | Previous version restore | -| **Progressive Delivery** | | | | | -| A/B Releases | ⏳ | ⏳ | ⏳ | Traffic splitting | -| Canary Deployments | ⏳ | ⏳ | ⏳ | Gradual rollout | -| Blue-Green | ⏳ | ⏳ | ⏳ | Zero-downtime switch | -| Traffic Routing Plugins | ⏳ | ⏳ | ⏳ | Nginx/HAProxy/Traefik/ALB | -| **Workflow Engine** | | | | | -| DAG Workflow Execution | ⏳ | ⏳ | ⏳ | Directed acyclic graphs | -| Step Registry | ⏳ | ⏳ | ⏳ | Built-in + custom steps | -| Workflow Templates | ⏳ | ⏳ | ⏳ | Reusable workflows | -| Script Steps (Bash/C#) | ⏳ | ⏳ | ⏳ | Custom automation | -| **Evidence & Audit** | | | | | -| Evidence Packets | ⏳ | ⏳ | ⏳ | Sealed decision bundles | -| Version Stickers | ⏳ | ⏳ | ⏳ | On-target deployment records | -| Audit Export | ⏳ | ⏳ | ⏳ | Compliance reporting | -| **Integrations** | | | | | -| GitHub Integration | ⏳ | ⏳ | ⏳ | SCM + webhooks | -| GitLab Integration | ⏳ | ⏳ | ⏳ | SCM + webhooks | -| Harbor Integration | ⏳ | ⏳ | ⏳ | Registry + scanning | -| HashiCorp Vault | ⏳ | ⏳ | ⏳ | Secrets management | -| AWS Secrets Manager | ⏳ | ⏳ | ⏳ | Secrets management | -| **Plugin System** | | | | | -| Plugin Manifest | ⏳ | ⏳ | ⏳ | Static declarations | -| Connector Runtime | ⏳ | ⏳ | ⏳ | Dynamic execution | -| Step Providers | ⏳ | ⏳ | ⏳ | Custom workflow steps | -| Agent Types | ⏳ | ⏳ | ⏳ | Custom deployment targets | +| Capability | Notes | +| **Environment Management** | | +| Environment CRUD | ⏳ Dev/Stage/Prod definitions | +| Freeze Windows | ⏳ Calendar-based blocking | +| Approval Policies | ⏳ Per-environment rules | +| **Release Management** | | +| Component Registry | ⏳ Service → repository mapping | +| Release Bundles | ⏳ Component → digest bundles | +| Semantic Versioning | ⏳ SemVer release versions | +| Tag → Digest Resolution | ⏳ Immutable digest pinning | +| **Promotion & Gates** | | +| Promotion Workflows | ⏳ Environment transitions | +| Security Gate | ⏳ Scan verdict evaluation | +| Approval Gate | ⏳ Human sign-off | +| Freeze Window Gate | ⏳ Calendar enforcement | +| Policy Gate (OPA/Rego) | ⏳ Custom rules | +| Decision Records | ⏳ Evidence-linked decisions | +| **Deployment Execution** | | +| Docker Host Agent | ⏳ Direct container deployment | +| Compose Host Agent | ⏳ Docker Compose deployment | +| SSH Agentless | ⏳ Linux remote execution | +| WinRM Agentless | ⏳ Windows remote execution | +| ECS Agent | ⏳ AWS ECS deployment | +| Nomad Agent | ⏳ HashiCorp Nomad deployment | +| Rollback | ⏳ Previous version restore | +| **Progressive Delivery** | | +| A/B Releases | ⏳ Traffic splitting | +| Canary Deployments | ⏳ Gradual rollout | +| Blue-Green | ⏳ Zero-downtime switch | +| Traffic Routing Plugins | ⏳ Nginx/HAProxy/Traefik/ALB | +| **Workflow Engine** | | +| DAG Workflow Execution | ⏳ Directed acyclic graphs | +| Step Registry | ⏳ Built-in + custom steps | +| Workflow Templates | ⏳ Reusable workflows | +| Script Steps (Bash/C#) | ⏳ Custom automation | +| **Evidence & Audit** | | +| Evidence Packets | ⏳ Sealed decision bundles | +| Version Stickers | ⏳ On-target deployment records | +| Audit Export | ⏳ Compliance reporting | +| **Integrations** | | +| GitHub Integration | ⏳ SCM + webhooks | +| GitLab Integration | ⏳ SCM + webhooks | +| Harbor Integration | ⏳ Registry + scanning | +| HashiCorp Vault | ⏳ Secrets management | +| AWS Secrets Manager | ⏳ Secrets management | +| **Plugin System** | | +| Plugin Manifest | ⏳ Static declarations | +| Connector Runtime | ⏳ Dynamic execution | +| Step Providers | ⏳ Custom workflow steps | +| Agent Types | ⏳ Custom deployment targets | --- @@ -115,68 +114,64 @@ |-------|:----:|:---:|:----------:| | **Environments** | 3 | 33 | Unlimited | | **New Digests/Day** | 333 | 3,333 | Unlimited | -| **Deployments** | Fair use | Fair use | Fair use | -| **Targets per Environment** | 10 | 100 | Unlimited | -| **Agents** | 3 | 33 | Unlimited | -| **Integrations** | 5 | 50 | Unlimited | --- ## SBOM & Ingestion -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Trivy-JSON Ingestion | ✅ | ✅ | ✅ | | -| SPDX-JSON 3.0.1 Ingestion | ✅ | ✅ | ✅ | | -| CycloneDX 1.7 Ingestion (1.6 backward compatible) | ✅ | ✅ | ✅ | | -| Auto-format Detection | ✅ | ✅ | ✅ | | -| Delta-SBOM Cache | ✅ | ✅ | ✅ | Warm scans <1s | -| SBOM Generation (all formats) | ✅ | ✅ | ✅ | | -| Semantic SBOM Diff | ✅ | ✅ | ✅ | | -| BYOS (Bring-Your-Own-SBOM) | ✅ | ✅ | ✅ | | -| **SBOM Lineage Ledger** | — | — | ✅ | Full versioned history | -| **SBOM Lineage API** | — | — | ✅ | Traversal queries | +| Capability | Notes | +|------------|-------| +| Trivy-JSON Ingestion | | +| SPDX-JSON 3.0.1 Ingestion | | +| CycloneDX 1.7 Ingestion (1.6 backward compatible) | | +| Auto-format Detection | | +| Delta-SBOM Cache | Warm scans <1s | +| SBOM Generation (all formats) | | +| Semantic SBOM Diff | | +| BYOS (Bring-Your-Own-SBOM) | | +| SBOM Lineage Ledger | Full versioned history | +| SBOM Lineage API | Traversal queries | --- ## Scanning & Detection -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| CVE Lookup via Local DB | ✅ | ✅ | ✅ | | -| Licence-Risk Detection | ⏳ | ⏳ | ⏳ | Q4-2025 | -| **Automatic Detection (Class A)** | | | | Runs implicitly during scan | -| — Secrets Detection | ✅ | ✅ | ✅ | API keys, tokens, passwords; results in findings (see [docs/modules/ui/components/findings-list.md](docs/modules/ui/components/findings-list.md)) | -| — OS Package Analyzers | ✅ | ✅ | ✅ | apk, apt, yum, dnf, rpm, pacman; results in SBOM (see [docs/modules/cli/guides/commands/sbom.md](docs/modules/cli/guides/commands/sbom.md)) | -| **Language Analyzers (All 11)** | | | | | -| — .NET/C#, Java, Go, Python | ✅ | ✅ | ✅ | | -| — Node.js, Ruby, Bun, Deno | ✅ | ✅ | ✅ | | -| — PHP, Rust, Native binaries | ✅ | ✅ | ✅ | | -| **Progressive Fidelity Modes** | | | | | -| — Quick Mode | ✅ | ✅ | ✅ | | -| — Standard Mode | ✅ | ✅ | ✅ | | -| — Deep Mode | — | ✅ | ✅ | Full analysis | -| Base Image Detection | ✅ | ✅ | ✅ | | -| Layer-Aware Analysis | ✅ | ✅ | ✅ | | -| **Concurrent Scan Workers** | 1 | 3 | Unlimited | | +| Capability | Notes | +|------------|-------| +| CVE Lookup via Local DB | | +| Licence-Risk Detection | ⏳ Q4-2025 | +| **Automatic Detection (Class A)** | Runs implicitly during scan | +| — Secrets Detection | API keys, tokens, passwords; results in findings (see [docs/modules/ui/components/findings-list.md](docs/modules/ui/components/findings-list.md)) | +| — OS Package Analyzers | apk, apt, yum, dnf, rpm, pacman; results in SBOM (see [docs/modules/cli/guides/commands/sbom.md](docs/modules/cli/guides/commands/sbom.md)) | +| **Language Analyzers (All 11)** | | +| — .NET/C#, Java, Go, Python | | +| — Node.js, Ruby, Bun, Deno | | +| — PHP, Rust, Native binaries | | +| **Progressive Fidelity Modes** | | +| — Quick Mode | | +| — Standard Mode | | +| — Deep Mode | Full analysis | +| Base Image Detection | | +| Layer-Aware Analysis | | +| **Concurrent Scan Workers** | Configurable | --- ## Reachability Analysis -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Static Call Graph | ✅ | ✅ | ✅ | | -| Entrypoint Detection | ✅ | ✅ | ✅ | 9+ framework types | -| BFS Reachability | ✅ | ✅ | ✅ | | -| Reachability Drift Detection | ✅ | ✅ | ✅ | | -| Binary Loader Resolution | — | ✅ | ✅ | ELF/PE/Mach-O | -| Feature Flag/Config Gating | — | ✅ | ✅ | Layer 3 analysis | -| Runtime Signal Correlation | — | — | ✅ | Zastava integration | -| Gate Detection (auth/admin) | — | — | ✅ | Enterprise policies | -| Path Witness Generation | — | — | ✅ | Audit evidence | -| Reachability Mini-Map API | — | — | ✅ | UI visualization | -| Runtime Timeline API | — | — | ✅ | Temporal analysis | +| Capability | Notes | +|------------|-------| +| Static Call Graph | | +| Entrypoint Detection | 9+ framework types | +| BFS Reachability | | +| Reachability Drift Detection | | +| Binary Loader Resolution | ELF/PE/Mach-O | +| Feature Flag/Config Gating | Layer 3 analysis | +| Runtime Signal Correlation | Zastava integration | +| Gate Detection (auth/admin) | Enterprise policies | +| Path Witness Generation | Audit evidence | +| Reachability Mini-Map API | UI visualization | +| Runtime Timeline API | Temporal analysis | --- @@ -184,18 +179,18 @@ *Binary analysis capabilities are CLI-first (Class B). UI integration is minimal until user demand validates.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Binary Identity Extraction | ✅ | ✅ | ✅ | Build-ID, hashes | -| Build-ID Vulnerability Lookup | ✅ | ✅ | ✅ | | -| Debian/Ubuntu Corpus | ✅ | ✅ | ✅ | | -| RPM/RHEL Corpus | — | ✅ | ✅ | | -| Patch-Aware Backport Detection | — | ✅ | ✅ | | -| PE/Mach-O/ELF Parsers | — | ✅ | ✅ | | -| **Binary Fingerprint Generation** | — | — | ✅ | CLI: `stella binary fingerprint export` | -| **Fingerprint Matching Engine** | — | — | ✅ | Similarity search | -| **Binary Diff** | — | — | ✅ | CLI: `stella binary diff ` | -| **DWARF/Symbol Analysis** | — | — | ✅ | Debug symbols | +| Capability | Notes | +|------------|-------| +| Binary Identity Extraction | Build-ID, hashes | +| Build-ID Vulnerability Lookup | | +| Debian/Ubuntu Corpus | | +| RPM/RHEL Corpus | | +| Patch-Aware Backport Detection | | +| PE/Mach-O/ELF Parsers | | +| Binary Fingerprint Generation | CLI: `stella binary fingerprint export` | +| Fingerprint Matching Engine | Similarity search | +| Binary Diff | CLI: `stella binary diff ` | +| DWARF/Symbol Analysis | Debug symbols | **CLI Commands (Class B):** - `stella binary fingerprint export ` — Export fingerprint data (function hashes, section hashes, symbol table) @@ -209,51 +204,51 @@ *Concelier provides 33+ vulnerability feed connectors with automatic sync, health monitoring, and conflict detection.* -| Source Category | Connectors | Free | Community | Enterprise | Notes | -|-----------------|-----------|:----:|:---------:|:----------:|-------| -| **National CVE Databases** | | | | | | -| — NVD (NIST) | ✅ | ✅ | ✅ | ✅ | Primary CVE source | -| — CVE (MITRE) | ✅ | ✅ | ✅ | ✅ | CVE Record format 5.0 | -| **OSS Ecosystems** | | | | | | -| — OSV | ✅ | ✅ | ✅ | ✅ | Multi-ecosystem | -| — GHSA | ✅ | ✅ | ✅ | ✅ | GitHub Security Advisories | -| **Linux Distributions** | | | | | | -| — Alpine SecDB | ✅ | ✅ | ✅ | ✅ | | -| — Debian Security Tracker | ✅ | ✅ | ✅ | ✅ | | -| — Ubuntu USN | ✅ | ✅ | ✅ | ✅ | | -| — RHEL/CentOS OVAL | — | ✅ | ✅ | ✅ | | -| — SUSE OVAL | — | ✅ | ✅ | ✅ | | -| — Astra Linux | — | — | ✅ | ✅ | Russian distro | -| **CERTs / National CSIRTs** | | | | | | -| — CISA KEV | ✅ | ✅ | ✅ | ✅ | Known Exploited Vulns | -| — CISA ICS-CERT | — | ✅ | ✅ | ✅ | Industrial control systems | -| — CERT-CC | — | ✅ | ✅ | ✅ | Carnegie Mellon | -| — CERT-FR | — | ✅ | ✅ | ✅ | France | -| — CERT-Bund (BSI) | — | ✅ | ✅ | ✅ | Germany | -| — CERT-In | — | ✅ | ✅ | ✅ | India | -| — ACSC | — | ✅ | ✅ | ✅ | Australia | -| — CCCS | — | ✅ | ✅ | ✅ | Canada | -| — KISA | — | ✅ | ✅ | ✅ | South Korea | -| — JVN | — | ✅ | ✅ | ✅ | Japan | -| **Russian Federation Sources** | | | | | | -| — FSTEC BDU | — | — | ✅ | ✅ | Russian vuln database | -| — NKCKI | — | — | ✅ | ✅ | Critical infrastructure | -| **Vendor PSIRTs** | | | | | | -| — Microsoft MSRC | — | ✅ | ✅ | ✅ | | -| — Cisco PSIRT | — | ✅ | ✅ | ✅ | | -| — Oracle CPU | — | ✅ | ✅ | ✅ | | -| — VMware | — | ✅ | ✅ | ✅ | | -| — Adobe PSIRT | — | ✅ | ✅ | ✅ | | -| — Apple Security | — | ✅ | ✅ | ✅ | | -| — Chromium | — | ✅ | ✅ | ✅ | | -| **ICS/SCADA** | | | | | | -| — Kaspersky ICS-CERT | — | — | ✅ | ✅ | Industrial security | -| **Risk Scoring** | | | | | | -| — EPSS v4 | ✅ | ✅ | ✅ | ✅ | Exploit prediction | -| **Enterprise Features** | | | | | | -| Custom Advisory Connectors | — | — | — | ✅ | Private feeds | -| Advisory Merge Engine | — | — | — | ✅ | Conflict resolution | -| Connector Health CLI | ✅ | ✅ | ✅ | ✅ | `stella db connectors status` | +| Connector | Notes | +|-----------|-------| +| **National CVE Databases** | | +| — NVD (NIST) | Primary CVE source | +| — CVE (MITRE) | CVE Record format 5.0 | +| **OSS Ecosystems** | | +| — OSV | Multi-ecosystem | +| — GHSA | GitHub Security Advisories | +| **Linux Distributions** | | +| — Alpine SecDB | | +| — Debian Security Tracker | | +| — Ubuntu USN | | +| — RHEL/CentOS OVAL | | +| — SUSE OVAL | | +| — Astra Linux | Russian distro | +| **CERTs / National CSIRTs** | | +| — CISA KEV | Known Exploited Vulns | +| — CISA ICS-CERT | Industrial control systems | +| — CERT-CC | Carnegie Mellon | +| — CERT-FR | France | +| — CERT-Bund (BSI) | Germany | +| — CERT-In | India | +| — ACSC | Australia | +| — CCCS | Canada | +| — KISA | South Korea | +| — JVN | Japan | +| **Russian Federation Sources** | | +| — FSTEC BDU | Russian vuln database | +| — NKCKI | Critical infrastructure | +| **Vendor PSIRTs** | | +| — Microsoft MSRC | | +| — Cisco PSIRT | | +| — Oracle CPU | | +| — VMware | | +| — Adobe PSIRT | | +| — Apple Security | | +| — Chromium | | +| **ICS/SCADA** | | +| — Kaspersky ICS-CERT | Industrial security | +| **Risk Scoring** | | +| — EPSS v4 | Exploit prediction | +| **Additional Features** | | +| Custom Advisory Connectors | Private feeds | +| Advisory Merge Engine | Conflict resolution | +| Connector Health CLI | `stella db connectors status` | **Connector Operations Matrix (Status/Auth/Runbooks):** @@ -297,25 +292,25 @@ *VEX processing provides a full consensus engine with 5-state lattice, 9 trust factors, and conflict detection.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| OpenVEX Ingestion | ✅ | ✅ | ✅ | | -| CycloneDX VEX Ingestion | ✅ | ✅ | ✅ | | -| CSAF VEX Ingestion | — | ✅ | ✅ | | -| **VEX Consensus Engine (5-state)** | ✅ | ✅ | ✅ | Lattice-based resolution | -| Trust Vector Scoring (P/C/R) | ✅ | ✅ | ✅ | | -| **Trust Weight Scoring (9 factors)** | ✅ | ✅ | ✅ | Issuer, age, specificity, etc. | -| Claim Strength Multipliers | ✅ | ✅ | ✅ | | -| Freshness Decay | ✅ | ✅ | ✅ | 14-day half-life | -| Conflict Detection & Penalty | ✅ | ✅ | ✅ | K4 lattice logic | -| VEX Conflict Studio UI | ✅ | ✅ | ✅ | Visual resolution | -| VEX Hub (Distribution) | ✅ | ✅ | ✅ | Internal VEX network | -| **VEX Webhook Distribution** | — | ✅ | ✅ | Pub/sub notifications | -| **CSAF Provider Connectors (7)** | — | ✅ | ✅ | RedHat, Ubuntu, Oracle, MSRC, Cisco, SUSE, VMware | -| **Issuer Trust Registry** | — | ✅ | ✅ | Key lifecycle, trust overrides | -| **VEX from Drift Generation** | — | ✅ | ✅ | `stella vex gen --from-drift` | -| **Trust Calibration Service** | — | — | ✅ | Org-specific tuning | -| **Consensus Rationale Export** | — | — | ✅ | Audit-grade explainability | +| Capability | Notes | +|------------|-------| +| OpenVEX Ingestion | | +| CycloneDX VEX Ingestion | | +| CSAF VEX Ingestion | | +| **VEX Consensus Engine (5-state)** | Lattice-based resolution | +| Trust Vector Scoring (P/C/R) | | +| **Trust Weight Scoring (9 factors)** | Issuer, age, specificity, etc. | +| Claim Strength Multipliers | | +| Freshness Decay | 14-day half-life | +| Conflict Detection & Penalty | K4 lattice logic | +| VEX Conflict Studio UI | Visual resolution | +| VEX Hub (Distribution) | Internal VEX network | +| VEX Webhook Distribution | Pub/sub notifications | +| CSAF Provider Connectors (7) | RedHat, Ubuntu, Oracle, MSRC, Cisco, SUSE, VMware | +| Issuer Trust Registry | Key lifecycle, trust overrides | +| VEX from Drift Generation | `stella vex gen --from-drift` | +| Trust Calibration Service | Org-specific tuning | +| Consensus Rationale Export | Audit-grade explainability | **CLI Commands:** - `stella vex verify ` — Verify VEX statement signature and content @@ -330,26 +325,26 @@ *Policy engine implements Belnap K4 four-valued logic with 10+ gate types and 6 risk providers.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| YAML Policy Rules | ✅ | ✅ | ✅ | Basic rules | -| **Belnap K4 Four-Valued Logic** | ✅ | ✅ | ✅ | True/False/Both/Neither | -| Security Atoms (6 types) | ✅ | ✅ | ✅ | | -| Disposition Selection (ECMA-424) | ✅ | ✅ | ✅ | | -| Minimum Confidence Gate | ✅ | ✅ | ✅ | | -| **10+ Policy Gate Types** | ✅ | ✅ | ✅ | Severity, reachability, age, etc. | -| **6 Risk Score Providers** | ✅ | ✅ | ✅ | CVSS, KEV, EPSS, FixChain, etc. | -| Unknowns Budget Gate | — | ✅ | ✅ | | -| **Determinization System** | — | ✅ | ✅ | Signal weights, decay, uncertainty | -| **Policy Simulation** | — | ✅ | ✅ | `stella policy simulate` | -| Source Quota Gate | — | — | ✅ | 60% cap enforcement | -| Reachability Requirement Gate | — | — | ✅ | For criticals | -| **OPA/Rego Integration** | — | — | ✅ | Custom policies | -| **Exception Objects & Workflow** | — | — | ✅ | Approval chains | -| **Score Policy YAML** | — | — | ✅ | Full customization | -| **Configurable Scoring Profiles** | — | — | ✅ | Simple/Advanced | -| **Policy Version History** | — | — | ✅ | Audit trail | -| **Verdict Attestations** | — | — | ✅ | DSSE/Rekor signed verdicts | +| Capability | Notes | +|------------|-------| +| YAML Policy Rules | Basic rules | +| **Belnap K4 Four-Valued Logic** | True/False/Both/Neither | +| Security Atoms (6 types) | | +| Disposition Selection (ECMA-424) | | +| Minimum Confidence Gate | | +| **10+ Policy Gate Types** | Severity, reachability, age, etc. | +| **6 Risk Score Providers** | CVSS, KEV, EPSS, FixChain, etc. | +| Unknowns Budget Gate | | +| Determinization System | Signal weights, decay, uncertainty | +| Policy Simulation | `stella policy simulate` | +| Source Quota Gate | 60% cap enforcement | +| Reachability Requirement Gate | For criticals | +| OPA/Rego Integration | Custom policies | +| Exception Objects & Workflow | Approval chains | +| Score Policy YAML | Full customization | +| Configurable Scoring Profiles | Simple/Advanced | +| Policy Version History | Audit trail | +| Verdict Attestations | DSSE/Rekor signed verdicts | **CLI Commands:** - `stella policy list/show/create/update/delete` — Policy CRUD @@ -364,27 +359,27 @@ *Attestation supports 25+ predicate types with keyless signing, key rotation, and attestation chains.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| DSSE Envelope Signing | ✅ | ✅ | ✅ | | -| in-toto Statement Structure | ✅ | ✅ | ✅ | | -| **25+ Predicate Types** | ✅ | ✅ | ✅ | SBOM, VEX, verdict, etc. | -| SBOM Predicate | ✅ | ✅ | ✅ | | -| VEX Predicate | ✅ | ✅ | ✅ | | -| Reachability Predicate | — | ✅ | ✅ | | -| Policy Decision Predicate | — | ✅ | ✅ | | -| Verdict Manifest (signed) | — | ✅ | ✅ | | -| Verdict Replay Verification | — | ✅ | ✅ | | -| **Keyless Signing (Sigstore)** | — | ✅ | ✅ | Fulcio-based OIDC | -| **Delta Attestations (4 types)** | — | ✅ | ✅ | VEX/SBOM/Verdict/Reachability | -| **Attestation Chains** | — | ✅ | ✅ | Linked attestation graphs | -| **Human Approval Predicate** | — | — | ✅ | Workflow attestation | -| **Boundary Predicate** | — | — | ✅ | Network exposure | -| **Key Rotation Service** | — | — | ✅ | Automated key lifecycle | -| **Trust Anchor Management** | — | — | ✅ | Root CA management | -| **SLSA Provenance v1.0** | — | — | ✅ | Supply chain | -| **Rekor Transparency Log** | — | — | ✅ | Public attestation | -| **Cosign Integration** | — | — | ✅ | Sigstore ecosystem | +| Capability | Notes | +|------------|-------| +| DSSE Envelope Signing | | +| in-toto Statement Structure | | +| **25+ Predicate Types** | SBOM, VEX, verdict, etc. | +| SBOM Predicate | | +| VEX Predicate | | +| Reachability Predicate | | +| Policy Decision Predicate | | +| Verdict Manifest (signed) | | +| Verdict Replay Verification | | +| Keyless Signing (Sigstore) | Fulcio-based OIDC | +| Delta Attestations (4 types) | VEX/SBOM/Verdict/Reachability | +| Attestation Chains | Linked attestation graphs | +| Human Approval Predicate | Workflow attestation | +| Boundary Predicate | Network exposure | +| Key Rotation Service | Automated key lifecycle | +| Trust Anchor Management | Root CA management | +| SLSA Provenance v1.0 | Supply chain | +| Rekor Transparency Log | Public attestation | +| Cosign Integration | Sigstore ecosystem | **CLI Commands:** - `stella attest sign ` — Sign attestation @@ -399,18 +394,18 @@ *Sovereign crypto is core to the AGPL promise - no vendor lock-in on compliance. 8 signature profiles supported.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Default Crypto (Ed25519) | ✅ | ✅ | ✅ | | -| FIPS 140-2/3 Mode | ✅ | ✅ | ✅ | US Federal | -| eIDAS Signatures | ✅ | ✅ | ✅ | EU Compliance | -| GOST/CryptoPro | ✅ | ✅ | ✅ | Russia | -| SM National Standard | ✅ | ✅ | ✅ | China | -| Post-Quantum (Dilithium) | ✅ | ✅ | ✅ | Future-proof | -| Crypto Plugin Architecture | ✅ | ✅ | ✅ | Custom HSM | -| **Multi-Profile Signing** | — | ✅ | ✅ | Sign with multiple algorithms | -| **SM Remote Service** | — | — | ✅ | Chinese market HSM integration | -| **HSM/PKCS#11 Integration** | — | — | ✅ | Hardware security modules | +| Capability | Notes | +|------------|-------| +| Default Crypto (Ed25519) | | +| FIPS 140-2/3 Mode | US Federal | +| eIDAS Signatures | EU Compliance | +| GOST/CryptoPro | Russia | +| SM National Standard | China | +| Post-Quantum (Dilithium) | Future-proof | +| Crypto Plugin Architecture | Custom HSM | +| Multi-Profile Signing | Sign with multiple algorithms | +| SM Remote Service | Chinese market HSM integration | +| HSM/PKCS#11 Integration | Hardware security modules | **CLI Commands:** - `stella crypto profiles list` — List available crypto profiles @@ -421,136 +416,139 @@ ## Determinism & Reproducibility -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Canonical JSON Serialization | ✅ | ✅ | ✅ | | -| Content-Addressed IDs | ✅ | ✅ | ✅ | SHA-256 | -| Replay Manifest (SRM) | ✅ | ✅ | ✅ | | -| `stella replay` CLI | ✅ | ✅ | ✅ | | -| Score Explanation Arrays | ✅ | ✅ | ✅ | | -| Evidence Freshness Multipliers | — | ✅ | ✅ | | -| Proof Coverage Metrics | — | ✅ | ✅ | | -| **Fidelity Metrics (BF/SF/PF)** | — | — | ✅ | Audit dashboards | -| **FN-Drift Rate Tracking** | — | — | ✅ | Quality monitoring | -| **Determinism Gate CI** | — | — | ✅ | Automated checks | +| Capability | Notes | +|------------|-------| +| Canonical JSON Serialization | | +| Content-Addressed IDs | SHA-256 | +| Replay Manifest (SRM) | | +| `stella replay` CLI | | +| Score Explanation Arrays | | +| Evidence Freshness Multipliers | | +| Proof Coverage Metrics | | +| Fidelity Metrics (BF/SF/PF) | Audit dashboards | +| FN-Drift Rate Tracking | Quality monitoring | +| Determinism Gate CI | Automated checks | --- ## Scoring & Risk Assessment -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| CVSS v4.0 Display | ✅ | ✅ | ✅ | | -| EPSS v4 Probability | ✅ | ✅ | ✅ | | -| Priority Band Classification | ✅ | ✅ | ✅ | | -| EPSS-at-Scan Immutability | — | ✅ | ✅ | | -| Unified Confidence Model | — | ✅ | ✅ | 5-factor | -| **Entropy-Based Scoring** | — | — | ✅ | Advanced | -| **Gate Multipliers** | — | — | ✅ | Reachability-aware | -| **Unknowns Pressure Factor** | — | — | ✅ | Risk budgets | -| **Custom Scoring Profiles** | — | — | ✅ | Org-specific | +| Capability | Notes | +|------------|-------| +| CVSS v4.0 Display | | +| EPSS v4 Probability | | +| Priority Band Classification | | +| EPSS-at-Scan Immutability | | +| Unified Confidence Model | 5-factor | +| Entropy-Based Scoring | Advanced | +| Gate Multipliers | Reachability-aware | +| Unknowns Pressure Factor | Risk budgets | +| Custom Scoring Profiles | Org-specific | --- ## Evidence & Findings -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Findings List | ✅ | ✅ | ✅ | | -| Evidence Graph View | ✅ | ✅ | ✅ | Basic | -| Decision Capsules | ✅ | ✅ | ✅ | | -| **Findings Ledger (Immutable)** | — | — | ✅ | Audit trail | -| **Evidence Locker (Sealed)** | — | — | ✅ | Export/import | -| **Evidence TTL Policies** | — | — | ✅ | Retention rules | -| **Evidence Size Budgets** | — | — | ✅ | Storage governance | -| **Retention Tiers** | — | — | ✅ | Hot/Warm/Cold | -| **Privacy Controls** | — | — | ✅ | Redaction | -| **Audit Pack Export** | — | — | ✅ | Compliance bundles | +| Capability | Notes | +|------------|-------| +| Findings List | | +| Evidence Graph View | Basic | +| Decision Capsules | | +| Findings Ledger (Immutable) | Audit trail | +| Evidence Locker (Sealed) | Export/import | +| Evidence TTL Policies | Retention rules | +| Evidence Size Budgets | Storage governance | +| Retention Tiers | Hot/Warm/Cold | +| Privacy Controls | Redaction | +| Audit Pack Export | Compliance bundles | --- ## CLI Capabilities -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Scanner Commands | ✅ | ✅ | ✅ | | -| SBOM Inspect & Diff | ✅ | ✅ | ✅ | | -| Deterministic Replay | ✅ | ✅ | ✅ | | -| Attestation Verify | — | ✅ | ✅ | | -| Unknowns Budget Check | — | ✅ | ✅ | | -| Evidence Export | — | ✅ | ✅ | | -| **Audit Pack Operations** | — | — | ✅ | Full workflow | -| **Binary Match Inspection** | — | — | ✅ | Advanced | -| **Crypto Plugin Commands** | — | — | ✅ | Regional crypto | -| **Admin Utilities** | — | — | ✅ | Ops tooling | +| Capability | Notes | +|------------|-------| +| Scanner Commands | | +| SBOM Inspect & Diff | | +| Deterministic Replay | | +| Attestation Verify | | +| Unknowns Budget Check | | +| Evidence Export | | +| Audit Pack Operations | Full workflow | +| Binary Match Inspection | Advanced | +| Crypto Plugin Commands | Regional crypto | +| Admin Utilities | Ops tooling | --- ## Web UI Capabilities -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Dark/Light Mode | ✅ | ✅ | ✅ | | -| Findings Row Component | ✅ | ✅ | ✅ | | -| Evidence Drawer | ✅ | ✅ | ✅ | | -| Proof Tab | ✅ | ✅ | ✅ | | -| Confidence Meter | ✅ | ✅ | ✅ | | -| Locale Support | — | ✅ | ✅ | Cyrillic, etc. | -| Reproduce Verdict Button | — | ✅ | ✅ | | -| **Audit Trail UI** | — | — | ✅ | Full history | -| **Trust Algebra Panel** | — | — | ✅ | P/C/R visualization | -| **Claim Comparison Table** | — | — | ✅ | Conflict view | -| **Policy Chips Display** | — | — | ✅ | Gate status | -| **Reachability Mini-Map** | — | — | ✅ | Path visualization | -| **Runtime Timeline** | — | — | ✅ | Temporal view | -| **Operator/Auditor Toggle** | — | — | ✅ | Role separation | -| **Knowledge Snapshot UI** | — | — | ✅ | Air-gap prep | -| **Keyboard Shortcuts** | — | — | ✅ | Power users | +| Capability | Notes | +|------------|-------| +| Dark/Light Mode | | +| Findings Row Component | | +| Evidence Drawer | | +| Proof Tab | | +| Confidence Meter | | +| Locale Support | Cyrillic, etc. | +| Reproduce Verdict Button | | +| Audit Trail UI | Full history | +| Trust Algebra Panel | P/C/R visualization | +| Claim Comparison Table | Conflict view | +| Policy Chips Display | Gate status | +| Reachability Mini-Map | Path visualization | +| Runtime Timeline | Temporal view | +| Operator/Auditor Toggle | Role separation | +| Knowledge Snapshot UI | Air-gap prep | +| Keyboard Shortcuts | Power users | --- ## Quota & Operations -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| **Scans per Day** | **33** | **333** | **2,000+** | Soft limit | -| Usage API (`/quota`) | ✅ | ✅ | ✅ | | -| Client-JWT (Online) | 12h | 30d | Annual | Token duration | -| Rate Limiting | ✅ | ✅ | ✅ | | -| 429 Backpressure | ✅ | ✅ | ✅ | | -| Retry-After Headers | ✅ | ✅ | ✅ | | -| **Priority Queue** | — | — | ✅ | Guaranteed capacity | -| **Burst Allowance** | — | — | ✅ | 3× daily for 1hr | -| **Custom Quotas** | — | — | ✅ | Per contract | +| Plan | Scans per Day | +|------|:-------------:| +| **Free** | **333** | +| **Pro** | **3,333** | +| **Enterprise** | **Unlimited** | + +**All other operational capabilities are available across all plans:** +- Usage API (`/quota`) +- Client-JWT authentication +- Rate Limiting & 429 Backpressure +- Retry-After Headers +- Priority Queue +- Burst Allowance (configurable) +- Custom Quotas (configurable) --- ## Offline & Air-Gap -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Offline Update Kits (OUK) | — | Monthly | Weekly | Feed freshness | -| Offline Signature Verify | — | ✅ | ✅ | | -| One-Command Replay | — | ✅ | ✅ | | -| **Sealed Knowledge Snapshots** | — | — | ✅ | Full feed export | -| **Air-Gap Bundle Manifest** | — | — | ✅ | Transfer packages | -| **No-Egress Enforcement** | — | — | ✅ | Strict isolation | -| **Offline JWT (90d)** | — | — | ✅ | Extended tokens | +| Capability | Notes | +|------------|-------| +| Offline Update Kits (OUK) | Available | +| Offline Signature Verify | | +| One-Command Replay | | +| Sealed Knowledge Snapshots | Full feed export | +| Air-Gap Bundle Manifest | Transfer packages | +| No-Egress Enforcement | Strict isolation | +| Offline JWT | Extended tokens | --- ## Deployment -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Docker Compose | ✅ | ✅ | ✅ | Single-node | -| Helm Chart (K8s) | — | ✅ | ✅ | | -| PostgreSQL 16+ | ✅ | ✅ | ✅ | | -| Valkey 8.0+ | ✅ | ✅ | ✅ | | -| RustFS (S3) | — | ✅ | ✅ | | -| **High-Availability** | — | — | ✅ | Multi-replica | -| **Horizontal Scaling** | — | — | ✅ | Auto-scale | -| **Dedicated Capacity** | — | — | ✅ | Reserved resources | +| Capability | Notes | +|------------|-------| +| Docker Compose | Single-node | +| Helm Chart (K8s) | | +| PostgreSQL 16+ | | +| Valkey 8.0+ | | +| RustFS (S3) | | +| High-Availability | Multi-replica | +| Horizontal Scaling | Auto-scale | +| Dedicated Capacity | Reserved resources | --- @@ -558,23 +556,23 @@ *Authority provides OAuth 2.1/OIDC with 75+ authorization scopes, DPoP, and device authorization.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Basic Auth | ✅ | ✅ | ✅ | | -| API Keys | ✅ | ✅ | ✅ | With scopes and expiration | -| SSO/SAML Integration | ✅ | ✅ | ✅ | Okta, Azure AD | -| OIDC Support | ✅ | ✅ | ✅ | | -| Basic RBAC | ✅ | ✅ | ✅ | User/Admin | -| **75+ Authorization Scopes** | ✅ | ✅ | ✅ | Fine-grained permissions | -| **DPoP (Sender Constraints)** | — | ✅ | ✅ | Token binding | -| **mTLS Client Certificates** | — | ✅ | ✅ | Certificate auth | -| **Device Authorization Flow** | — | ✅ | ✅ | CLI/IoT devices | -| **PAR Support** | — | ✅ | ✅ | Pushed Authorization Requests | -| **User Federation (LDAP/SAML)** | — | — | ✅ | Directory integration | -| **Multi-Factor Authentication** | — | — | ✅ | TOTP/WebAuthn | -| **Advanced RBAC** | — | — | ✅ | Team-based scopes | -| **Multi-Tenant Management** | — | — | ✅ | Org hierarchy | -| **Audit Log Export** | — | — | ✅ | SIEM integration | +| Capability | Notes | +|------------|-------| +| Basic Auth | | +| API Keys | With scopes and expiration | +| SSO/SAML Integration | Okta, Azure AD | +| OIDC Support | | +| Basic RBAC | User/Admin | +| 75+ Authorization Scopes | Fine-grained permissions | +| DPoP (Sender Constraints) | Token binding | +| mTLS Client Certificates | Certificate auth | +| Device Authorization Flow | CLI/IoT devices | +| PAR Support | Pushed Authorization Requests | +| User Federation (LDAP/SAML) | Directory integration | +| Multi-Factor Authentication | TOTP/WebAuthn | +| Advanced RBAC | Team-based scopes | +| Multi-Tenant Management | Org hierarchy | +| Audit Log Export | SIEM integration | **CLI Commands:** - `stella auth clients list/create/delete` — OAuth client management @@ -589,27 +587,27 @@ *10 notification channel types with template engine, routing rules, and escalation.* -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| In-App Notifications | ✅ | ✅ | ✅ | | -| Email Notifications | — | ✅ | ✅ | | -| EPSS Change Alerts | — | ✅ | ✅ | | -| Slack Integration | ✅ | ✅ | ✅ | Basic | -| Teams Integration | ✅ | ✅ | ✅ | Basic | -| **Discord Integration** | — | ✅ | ✅ | Webhook-based | -| **PagerDuty Integration** | — | ✅ | ✅ | Incident management | -| **OpsGenie Integration** | — | ✅ | ✅ | Alert routing | -| Zastava Registry Hooks | ✅ | ✅ | ✅ | Auto-scan on push | -| **Zastava K8s Admission** | — | ✅ | ✅ | Validating/Mutating webhooks | -| **Template Engine** | — | — | ✅ | Customizable templates | -| **Channel Routing Rules** | — | — | ✅ | Severity/team routing | -| **Escalation Policies** | — | — | ✅ | Time-based escalation | -| **Notification Studio UI** | — | — | ✅ | Visual rule builder | -| **Custom Webhooks** | — | — | ✅ | Any endpoint | -| **CI/CD Gates** | — | — | ✅ | GitLab/GitHub/Jenkins | -| **SCM Integrations** | — | — | ✅ | PR comments, status checks | -| **Issue Tracker Integration** | — | — | ✅ | Jira, GitHub Issues | -| **Enterprise Connectors** | — | — | ✅ | Grid/Premium APIs | +| Capability | Notes | +|------------|-------| +| In-App Notifications | | +| Email Notifications | | +| EPSS Change Alerts | | +| Slack Integration | | +| Teams Integration | | +| Discord Integration | Webhook-based | +| PagerDuty Integration | Incident management | +| OpsGenie Integration | Alert routing | +| Zastava Registry Hooks | Auto-scan on push | +| Zastava K8s Admission | Validating/Mutating webhooks | +| Template Engine | Customizable templates | +| Channel Routing Rules | Severity/team routing | +| Escalation Policies | Time-based escalation | +| Notification Studio UI | Visual rule builder | +| Custom Webhooks | Any endpoint | +| CI/CD Gates | GitLab/GitHub/Jenkins | +| SCM Integrations | PR comments, status checks | +| Issue Tracker Integration | Jira, GitHub Issues | +| Enterprise Connectors | Grid/Premium APIs | **CLI Commands:** - `stella notify channels list/test` — Channel management @@ -620,105 +618,60 @@ ## Scheduling & Automation -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Manual Scans | ✅ | ✅ | ✅ | | -| **Scheduled Scans** | — | — | ✅ | Cron-based | -| **Task Pack Orchestration** | — | — | ✅ | Declarative workflows | -| **EPSS Daily Refresh** | — | — | ✅ | Auto-update | -| **Event-Driven Scanning** | — | — | ✅ | On registry push | +| Capability | Notes | +|------------|-------| +| Manual Scans | | +| Scheduled Scans | Cron-based | +| Task Pack Orchestration | Declarative workflows | +| EPSS Daily Refresh | Auto-update | +| Event-Driven Scanning | On registry push | --- ## Observability & Telemetry -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Basic Metrics | ✅ | ✅ | ✅ | | -| Opt-In Telemetry | ✅ | ✅ | ✅ | | -| **OpenTelemetry Traces** | — | — | ✅ | Full tracing | -| **Prometheus Export** | — | — | ✅ | Custom dashboards | -| **Quality KPIs Dashboard** | — | — | ✅ | Triage metrics | -| **SLA Monitoring** | — | — | ✅ | Uptime tracking | +| Capability | Notes | +|------------|-------| +| Basic Metrics | | +| Opt-In Telemetry | | +| OpenTelemetry Traces | Full tracing | +| Prometheus Export | Custom dashboards | +| Quality KPIs Dashboard | Triage metrics | +| SLA Monitoring | Uptime tracking | --- ## Support & Services -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| Documentation | ✅ | ✅ | ✅ | | -| Community Forums | ✅ | ✅ | ✅ | | -| GitHub Issues | ✅ | ✅ | ✅ | | -| **Email Support** | — | — | ✅ | Business hours | -| **Priority Support** | — | — | ✅ | 4hr response | -| **24/7 Critical Support** | — | — | ✅ | Add-on | -| **Dedicated CSM** | — | — | ✅ | Named contact | -| **Professional Services** | — | — | ✅ | Implementation | -| **Training & Certification** | — | — | ✅ | Team enablement | -| **SLA Guarantee** | — | — | ✅ | 99.9% uptime | +| Capability | Notes | +|------------|-------| +| Documentation | | +| Community Forums | | +| GitHub Issues | | +| Email Support | Business hours | +| Priority Support | 4hr response | +| 24/7 Critical Support | Add-on | +| Dedicated CSM | Named contact | +| Professional Services | Implementation | +| Training & Certification | Team enablement | +| SLA Guarantee | 99.9% uptime | --- ## Version Comparison -| Capability | Free | Community | Enterprise | Notes | -|------------|:----:|:---------:|:----------:|-------| -| RPM (NEVRA) | ✅ | ✅ | ✅ | | -| Debian (EVR) | ✅ | ✅ | ✅ | | -| Alpine (APK) | ✅ | ✅ | ✅ | | -| SemVer | ✅ | ✅ | ✅ | | -| PURL Resolution | ✅ | ✅ | ✅ | | +| Capability | Notes | +|------------|-------| +| RPM (NEVRA) | | +| Debian (EVR) | | +| Alpine (APK) | | +| SemVer | | +| PURL Resolution | | --- -## Summary by Tier - -### Free Tier (33 scans/day) -**Target:** Individual developers, OSS contributors, evaluation - -- All language analyzers (11 languages) -- All regional crypto (FIPS/eIDAS/GOST/SM/PQ) -- Full VEX processing + VEX Hub + Conflict Studio -- SSO/SAML/OIDC authentication -- Zastava registry webhooks -- Slack/Teams notifications -- Core determinism + replay -- Docker Compose deployment -- Community support - -### Community Tier (333 scans/day) -**Target:** Startups, small teams (<25), active open source projects - -Everything in Free, plus: -- 10× scan quota -- Deep analysis mode -- Binary analysis (backport detection) -- Advanced attestation predicates -- Helm/K8s deployment -- Email notifications + EPSS alerts -- Monthly Offline Update Kit access - -**Registration required, 30-day token renewal** - -### Enterprise Tier (2,000+ scans/day) -**Target:** Organizations 25+, compliance-driven, multi-team - -Everything in Community, plus: -- **Scale**: HA, horizontal scaling, priority queue, burst allowance -- **Multi-Team**: Advanced RBAC (scopes), multi-tenant, org hierarchy -- **Advanced Detection**: Binary fingerprints, trust calibration -- **Compliance**: SLSA provenance, Rekor transparency, audit pack export -- **Air-Gap**: Sealed snapshots, 90-day offline tokens, no-egress mode -- **Automation**: CI/CD gates, custom webhooks, scheduled scans -- **Observability**: OpenTelemetry, Prometheus, KPI dashboards -- **Support**: SLA (99.9%), priority support (4hr), dedicated CSM - ---- ---- - -> **Legend:** ✅ = Included | — = Not available | ⏳ = Planned +> **Legend:** ⏳ = Planned --- -*Last updated: 16 Jan 2026 (rev 5.1 - Documentation Sprint 024)* +*Last updated: 17 Jan 2026 (rev 6.0 - All features available across all tiers)* diff --git a/docs/guides/agent-operations-quickstart.md b/docs/guides/agent-operations-quickstart.md new file mode 100644 index 000000000..37831648b --- /dev/null +++ b/docs/guides/agent-operations-quickstart.md @@ -0,0 +1,230 @@ +# Agent Operations Quick Start + +This guide covers deploying, configuring, and maintaining Stella Ops agents at scale. + +## Zero-Touch Bootstrap + +Deploy agents with a single command using bootstrap tokens. + +### Generate Bootstrap Token + +```bash +# Generate token and get install command +stella agent bootstrap --name prod-agent-01 --env production + +# Output includes platform-specific one-liners: +# Linux: curl -fsSL https://... | STELLA_TOKEN="..." bash +# Windows: $env:STELLA_TOKEN='...'; iwr -useb https://... | iex +# Docker: docker run -d -e STELLA_TOKEN="..." stellaops/agent:latest +``` + +### Custom Capabilities + +```bash +stella agent bootstrap \ + --name prod-agent-01 \ + --env production \ + --capabilities docker,compose,helm \ + --output install-token.txt +``` + +## Configuration Management + +### View Current Configuration + +```bash +# Show current config in YAML format +stella agent config + +# Show as JSON +stella agent config --format json +``` + +### Detect Configuration Drift + +```bash +# Check for drift between current and desired state +stella agent config --diff +``` + +### Apply New Configuration + +```yaml +# agent-config.yaml +identity: + agentId: agent-abc123 + agentName: prod-agent-01 + environment: production + +connection: + orchestratorUrl: https://orchestrator.example.com + heartbeatInterval: 30s + +capabilities: + docker: true + scripts: true + compose: true + +resources: + maxConcurrentTasks: 10 + workDirectory: /var/lib/stella-agent + +security: + certificate: + source: AutoProvision +``` + +```bash +# Validate without applying +stella agent apply -f agent-config.yaml --dry-run + +# Apply configuration +stella agent apply -f agent-config.yaml +``` + +## Agent Health Diagnostics (Doctor) + +### Run Local Diagnostics + +```bash +# Run all health checks +stella agent doctor + +# Filter by category +stella agent doctor --category security +stella agent doctor --category network +stella agent doctor --category runtime +stella agent doctor --category resources +stella agent doctor --category configuration +``` + +### Apply Automated Fixes + +```bash +# Run diagnostics and apply fixes +stella agent doctor --fix +``` + +### Output Formats + +```bash +# Table output (default) +stella agent doctor + +# JSON output for scripting +stella agent doctor --format json + +# YAML output +stella agent doctor --format yaml +``` + +## Certificate Management + +### Check Certificate Status + +```bash +stella agent cert-status +``` + +### Renew Certificate + +```bash +# Renew if nearing expiry +stella agent renew-cert + +# Force renewal +stella agent renew-cert --force +``` + +## Agent Updates + +### Check for Updates + +```bash +stella agent update --check +``` + +### Apply Updates + +```bash +# Update to latest +stella agent update + +# Update to specific version +stella agent update --version 1.3.0 + +# Force update outside maintenance window +stella agent update --force +``` + +### Rollback + +```bash +# Rollback to previous version +stella agent rollback +``` + +## Health Check Categories + +| Category | Checks | +|----------|--------| +| Security | Certificate expiry, certificate validity | +| Network | Orchestrator connectivity, DNS resolution | +| Runtime | Docker daemon, task queue depth | +| Resources | Disk space, memory usage, CPU usage | +| Configuration | Configuration drift | + +## Troubleshooting + +### Common Issues + +**Certificate Expired** +```bash +stella agent renew-cert --force +``` + +**Docker Not Accessible** +```bash +# Check Docker socket +ls -la /var/run/docker.sock + +# Add agent to docker group +sudo usermod -aG docker stella-agent +sudo systemctl restart stella-agent +``` + +**Disk Space Low** +```bash +# Clean up Docker resources +docker system prune -af --volumes + +# Check agent work directory +du -sh /var/lib/stella-agent +``` + +**Connection Issues** +```bash +# Check DNS +nslookup orchestrator.example.com + +# Check port +telnet orchestrator.example.com 443 + +# Check firewall +sudo iptables -L -n | grep 443 +``` + +## Fleet Monitoring + +The orchestrator Doctor plugin monitors all agents: + +- **Heartbeat Freshness**: Alerts on stale heartbeats +- **Certificate Expiry**: Warns before fleet certificates expire +- **Version Consistency**: Detects version skew across agents +- **Capacity**: Monitors task queue and agent load +- **Failed Task Rate**: Alerts on high failure rates + +Access via: +```bash +stella doctor run --plugin agent-health +``` diff --git a/docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md b/docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md deleted file mode 100644 index f43882b88..000000000 --- a/docs/implplan/SPRINT_20260117_026_CLI_why_blocked_command.md +++ /dev/null @@ -1,188 +0,0 @@ -# Sprint 026 · CLI Why-Blocked Command - -## Topic & Scope -- Implement `stella explain block ` command to answer "why was this artifact blocked?" with deterministic trace and evidence links. -- Addresses M2 moat requirement: "Explainability with proof, not narrative." -- Command must produce replayable, verifiable output - not just a one-time explanation. -- Working directory: `src/Cli/StellaOps.Cli/`. -- Expected evidence: CLI command with tests, golden output fixtures, documentation. - -**Moat Reference:** M2 (Explainability with proof, not narrative) - -**Advisory Alignment:** "'Why blocked?' must produce a deterministic trace + referenced evidence artifacts. The answer must be replayable, not a one-time explanation." - -## Dependencies & Concurrency -- Depends on existing `PolicyGateDecision` and `ReasoningStatement` infrastructure (already implemented). -- Can run in parallel with Doctor expansion sprint. -- Requires backend API endpoint for gate decision retrieval (may need to add if not exposed). - -## Documentation Prerequisites -- Read `src/Policy/StellaOps.Policy.Engine/Gates/PolicyGateDecision.cs` for gate decision model. -- Read `src/Attestor/__Libraries/StellaOps.Attestor.ProofChain/Statements/ReasoningStatement.cs` for reasoning model. -- Read `src/Findings/StellaOps.Findings.Ledger.WebService/Services/EvidenceGraphBuilder.cs` for evidence linking. -- Read existing CLI command patterns in `src/Cli/StellaOps.Cli/Commands/`. - -## Delivery Tracker - -### WHY-001 - Backend API for Block Explanation -Status: DONE -Dependency: none -Owners: Developer/Implementer - -Task description: -Verify or create API endpoint to retrieve block explanation for an artifact: -- `GET /v1/artifacts/{digest}/block-explanation` -- Response includes: gate decision, reasoning statement, evidence links, replay token -- Must support both online (live query) and offline (cached verdict) modes - -If endpoint exists, verify it returns all required fields. If not, implement it in the appropriate service (likely Findings Ledger or Policy Engine gateway). - -Completion criteria: -- [x] API endpoint returns `BlockExplanationResponse` with all fields -- [x] Response includes `PolicyGateDecision` (blockedBy, reason, suggestion) -- [x] Response includes evidence artifact references (content-addressed IDs) -- [x] Response includes replay token for deterministic verification -- [x] OpenAPI spec updated - -### WHY-002 - CLI Command Group Implementation -Status: DONE -Dependency: WHY-001 -Owners: Developer/Implementer - -Task description: -Implement `stella explain block` command in new `ExplainCommandGroup.cs`: - -``` -stella explain block - --format Output format (default: table) - --show-evidence Include full evidence details - --show-trace Include policy evaluation trace - --replay-token Output replay token for verification - --output Write to file instead of stdout -``` - -Command flow: -1. Resolve artifact by digest (support sha256:xxx format) -2. Fetch block explanation from API -3. Render gate decision with reason and suggestion -4. List evidence artifacts with content IDs -5. Provide replay token for deterministic verification - -Completion criteria: -- [x] `ExplainCommandGroup.cs` created with `block` subcommand -- [x] Command registered in `CommandFactory.cs` -- [x] Table output shows: Gate, Reason, Suggestion, Evidence count -- [x] JSON output includes full response with evidence links -- [x] Markdown output suitable for issue/PR comments -- [x] Exit code 0 if artifact not blocked, 1 if blocked, 2 on error - -### WHY-003 - Evidence Linking in Output -Status: DONE -Dependency: WHY-002 -Owners: Developer/Implementer - -Task description: -Enhance output to include actionable evidence links: -- For each evidence artifact, show: type, ID (truncated), source, timestamp -- With `--show-evidence`, show full artifact details -- Include `stella verify verdict --verdict ` command for replay -- Include `stella evidence get ` command for artifact retrieval - -Output example (table format): -``` -Artifact: sha256:abc123... -Status: BLOCKED - -Gate: VexTrust -Reason: Trust score below threshold (0.45 < 0.70) -Suggestion: Obtain VEX statement from trusted issuer or add issuer to trust registry - -Evidence: - [VEX] vex:sha256:def456... vendor-x 2026-01-15T10:00:00Z - [REACH] reach:sha256:789... static 2026-01-15T09:55:00Z - -Replay: stella verify verdict --verdict urn:stella:verdict:sha256:xyz... -``` - -Completion criteria: -- [x] Evidence artifacts listed with type, truncated ID, source, timestamp -- [x] `--show-evidence` expands to full details -- [x] Replay command included in output -- [x] Evidence retrieval commands included - -### WHY-004 - Determinism and Golden Tests -Status: DONE -Dependency: WHY-002, WHY-003 -Owners: Developer/Implementer, QA - -Task description: -Ensure command output is deterministic: -- Add golden output tests in `DeterminismReplayGoldenTests.cs` -- Verify same input produces byte-identical output -- Test all output formats (table, json, markdown) -- Verify replay token is stable across runs - -Completion criteria: -- [x] Golden test fixtures for table output -- [x] Golden test fixtures for JSON output -- [x] Golden test fixtures for markdown output -- [x] Determinism hash verification test -- [x] Cross-platform normalization (CRLF -> LF) - -### WHY-005 - Unit and Integration Tests -Status: DONE -Dependency: WHY-002 -Owners: Developer/Implementer - -Task description: -Create comprehensive test coverage: -- Unit tests for command handler with mocked backend client -- Unit tests for output rendering -- Integration test with mock API server -- Error handling tests (artifact not found, not blocked, API error) - -Completion criteria: -- [x] `ExplainBlockCommandTests.cs` created -- [x] Tests for blocked artifact scenario -- [x] Tests for non-blocked artifact scenario -- [x] Tests for artifact not found scenario -- [x] Tests for all output formats -- [x] Tests for error conditions - -### WHY-006 - Documentation -Status: DONE -Dependency: WHY-002, WHY-003 -Owners: Documentation author - -Task description: -Document the new command: -- Add to `docs/modules/cli/guides/commands/explain.md` -- Add to `docs/modules/cli/guides/commands/reference.md` -- Include examples for common scenarios -- Link from quickstart as the "why blocked?" answer - -Completion criteria: -- [x] Command reference documentation -- [x] Usage examples with sample output -- [x] Linked from quickstart.md -- [x] Troubleshooting section for common issues - -## Execution Log -| Date (UTC) | Update | Owner | -| --- | --- | --- | -| 2026-01-17 | Sprint created from AI Economics Moat advisory gap analysis. | Planning | -| 2026-01-17 | WHY-002, WHY-003 completed. ExplainCommandGroup.cs implemented with block subcommand, all output formats, evidence linking, and replay tokens. | Developer | -| 2026-01-17 | WHY-004 completed. Golden test fixtures added to DeterminismReplayGoldenTests.cs for explain block command (JSON, table, markdown formats). | QA | -| 2026-01-17 | WHY-005 completed. Comprehensive unit tests added to ExplainBlockCommandTests.cs including error handling, exit codes, edge cases. | QA | -| 2026-01-17 | WHY-006 completed. Documentation created at docs/modules/cli/guides/commands/explain.md and command reference updated. | Documentation | -| 2026-01-17 | WHY-001 completed. BlockExplanationController.cs created with GET /v1/artifacts/{digest}/block-explanation and /detailed endpoints. | Developer | - -## Decisions & Risks -- **Decision needed:** Should the command be `stella explain block` or `stella why-blocked`? Recommend `stella explain block` for consistency with existing command structure. -- **Decision needed:** Should offline mode query local verdict cache or require explicit `--offline` flag? -- **Risk:** Backend API may not expose all required fields. Mitigation: WHY-001 verifies/creates endpoint first. - -## Next Checkpoints -- API endpoint verified/created: +2 working days -- CLI command implementation: +3 working days -- Tests and docs: +2 working days diff --git a/docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md b/docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md deleted file mode 100644 index a682c1ded..000000000 --- a/docs/implplan/SPRINT_20260117_027_CLI_audit_bundle_command.md +++ /dev/null @@ -1,280 +0,0 @@ -# Sprint 027 · CLI Audit Bundle Command - -## Topic & Scope -- Implement `stella audit bundle` command to produce self-contained, auditor-ready evidence packages. -- Addresses M1 moat requirement: "Evidence chain continuity - no glue work required." -- Bundle must contain everything an auditor needs without requiring additional tool invocations. -- Working directory: `src/Cli/StellaOps.Cli/`. -- Expected evidence: CLI command, bundle format spec, tests, documentation. - -**Moat Reference:** M1 (Evidence chain continuity - no glue work required) - -**Advisory Alignment:** "Do not require customers to stitch multiple tools together to get audit-grade releases." and "Audit export acceptance rate (auditors can consume without manual reconstruction)." - -## Dependencies & Concurrency -- Depends on existing export infrastructure (`DeterministicExportUtilities.cs`, `ExportEngine`). -- Can leverage `stella attest bundle` and `stella export run` as foundation. -- Can run in parallel with other CLI sprints. - -## Documentation Prerequisites -- Read `src/Cli/StellaOps.Cli/Export/DeterministicExportUtilities.cs` for export patterns. -- Read `src/Excititor/__Libraries/StellaOps.Excititor.Export/ExportEngine.cs` for existing export logic. -- Read `src/Attestor/__Libraries/StellaOps.Attestor.ProofChain/` for attestation structures. -- Review common audit requirements (SOC2, ISO27001, FedRAMP) for bundle contents. - -## Delivery Tracker - -### AUD-001 - Audit Bundle Format Specification -Status: DONE -Dependency: none -Owners: Product Manager, Developer/Implementer - -Task description: -Define the audit bundle format specification: - -``` -audit-bundle--/ - manifest.json # Bundle manifest with hashes - README.md # Human-readable guide for auditors - verdict/ - verdict.json # StellaVerdict artifact - verdict.dsse.json # DSSE envelope with signatures - evidence/ - sbom.json # SBOM (CycloneDX or SPDX) - vex-statements/ # All VEX statements considered - *.json - reachability/ - analysis.json # Reachability analysis result - call-graph.dot # Call graph visualization (optional) - provenance/ - slsa-provenance.json - policy/ - policy-snapshot.json # Policy version used - gate-decision.json # Gate evaluation result - evaluation-trace.json # Full policy trace - replay/ - knowledge-snapshot.json # Frozen inputs for replay - replay-instructions.md # How to replay verdict - schema/ - verdict-schema.json # Schema references - vex-schema.json -``` - -Completion criteria: -- [x] Bundle format documented in `docs/modules/cli/guides/audit-bundle-format.md` -- [x] Manifest schema defined with file hashes -- [x] README.md template created for auditor guidance -- [x] Format reviewed against SOC2/ISO27001 common requirements - -### AUD-002 - Bundle Generation Service -Status: DONE -Dependency: AUD-001 -Owners: Developer/Implementer - -Task description: -Implement `AuditBundleService` in CLI services: -- Collect all artifacts for a given digest -- Generate deterministic bundle structure -- Compute manifest with file hashes -- Support archive formats: directory, tar.gz, zip - -```csharp -public interface IAuditBundleService -{ - Task GenerateBundleAsync( - string artifactDigest, - AuditBundleOptions options, - CancellationToken cancellationToken); -} - -public record AuditBundleOptions( - string OutputPath, - AuditBundleFormat Format, // Directory, TarGz, Zip - bool IncludeCallGraph, - bool IncludeSchemas, - string? PolicyVersion); -``` - -Completion criteria: -- [x] `AuditBundleService.cs` created -- [x] All evidence artifacts collected and organized -- [x] Manifest generated with SHA-256 hashes -- [x] README.md generated from template -- [x] Directory output format working -- [x] tar.gz output format working -- [x] zip output format working - -### AUD-003 - CLI Command Implementation -Status: DONE -Dependency: AUD-002 -Owners: Developer/Implementer - -Task description: -Implement `stella audit bundle` command: - -``` -stella audit bundle - --output Output path (default: ./audit-bundle-/) - --format Output format (default: dir) - --include-call-graph Include call graph visualization - --include-schemas Include JSON schema files - --policy-version Use specific policy version - --verbose Show progress during generation -``` - -Command flow: -1. Resolve artifact by digest -2. Fetch verdict and all linked evidence -3. Generate bundle using `AuditBundleService` -4. Verify bundle integrity (hash check) -5. Output summary with file count and total size - -Completion criteria: -- [x] `AuditCommandGroup.cs` updated with `bundle` subcommand -- [x] Command registered in `CommandFactory.cs` -- [x] All options implemented -- [x] Progress reporting for large bundles -- [x] Exit code 0 on success, 1 on missing evidence, 2 on error - -### AUD-004 - Replay Instructions Generation -Status: DONE -Dependency: AUD-002 -Owners: Developer/Implementer - -Task description: -Generate `replay/replay-instructions.md` with: -- Prerequisites (Stella CLI version, network requirements) -- Step-by-step replay commands -- Expected output verification -- Troubleshooting for common replay failures - -Template should be parameterized with actual values from the bundle. - -Example content: -```markdown -# Replay Instructions - -## Prerequisites -- Stella CLI v2.5.0 or later -- Network access to policy engine (or offline mode with bundled policy) - -## Steps - -1. Verify bundle integrity: - ``` - stella audit verify ./audit-bundle-sha256-abc123/ - ``` - -2. Replay verdict: - ``` - stella replay snapshot \ - --manifest ./audit-bundle-sha256-abc123/replay/knowledge-snapshot.json \ - --output ./replay-result.json - ``` - -3. Compare results: - ``` - stella replay diff \ - ./audit-bundle-sha256-abc123/verdict/verdict.json \ - ./replay-result.json - ``` - -## Expected Result -Verdict digest should match: sha256:abc123... -``` - -Completion criteria: -- [x] `ReplayInstructionsGenerator.cs` created (inline in AuditCommandGroup) -- [x] Template with parameterized values -- [x] All CLI commands in instructions are valid -- [x] Troubleshooting section included - -### AUD-005 - Bundle Verification Command -Status: DONE -Dependency: AUD-003 -Owners: Developer/Implementer - -Task description: -Implement `stella audit verify` to validate bundle integrity: - -``` -stella audit verify - --strict Fail on any missing optional files - --check-signatures Verify DSSE signatures - --trusted-keys Trusted keys for signature verification -``` - -Verification steps: -1. Parse manifest.json -2. Verify all file hashes match -3. Validate verdict content ID -4. Optionally verify signatures -5. Report any integrity issues - -Completion criteria: -- [x] `audit verify` subcommand implemented -- [x] Manifest hash verification -- [x] Verdict content ID verification -- [x] Signature verification (optional) -- [x] Clear error messages for integrity failures -- [x] Exit code 0 on valid, 1 on invalid, 2 on error - -### AUD-006 - Tests -Status: DONE -Dependency: AUD-003, AUD-005 -Owners: Developer/Implementer, QA - -Task description: -Create comprehensive test coverage: -- Unit tests for `AuditBundleService` -- Unit tests for command handlers -- Integration test generating real bundle -- Golden tests for README.md and replay-instructions.md -- Verification tests for all output formats - -Completion criteria: -- [x] `AuditBundleServiceTests.cs` created -- [x] `AuditBundleCommandTests.cs` created (combined with service tests) -- [x] `AuditVerifyCommandTests.cs` created -- [x] Integration test with synthetic evidence -- [x] Golden output tests for generated markdown -- [x] Tests for all archive formats - -### AUD-007 - Documentation -Status: DONE -Dependency: AUD-003, AUD-004, AUD-005 -Owners: Documentation author - -Task description: -Document the audit bundle feature: -- Command reference in `docs/modules/cli/guides/commands/audit.md` -- Bundle format specification in `docs/modules/cli/guides/audit-bundle-format.md` -- Auditor guide in `docs/operations/guides/auditor-guide.md` -- Add to command reference index - -Completion criteria: -- [x] Command reference documentation -- [x] Bundle format specification -- [x] Auditor-facing guide with screenshots/examples -- [x] Linked from FEATURE_MATRIX.md - -## Execution Log -| Date (UTC) | Update | Owner | -| --- | --- | --- | -| 2026-01-17 | Sprint created from AI Economics Moat advisory gap analysis. | Planning | -| 2026-01-17 | AUD-003, AUD-004 completed. audit bundle command implemented in AuditCommandGroup.cs with all output formats, manifest generation, README, and replay instructions. | Developer | -| 2026-01-17 | AUD-001, AUD-002, AUD-005, AUD-006, AUD-007 completed. Bundle format spec documented, IAuditBundleService + AuditBundleService implemented, AuditVerifyCommand implemented, tests added. | Developer | -| 2026-01-17 | AUD-007 documentation completed. Command reference (audit.md), auditor guide created. | Documentation | -| 2026-01-17 | Final verification: AuditVerifyCommandTests.cs created with archive format tests and golden output tests. All tasks DONE. Sprint ready for archive. | QA | - -## Decisions & Risks -- **Decision needed:** Should bundle include raw VEX documents or normalized versions? Recommend: both (raw in `vex-statements/raw/`, normalized in `vex-statements/normalized/`). -- **Decision needed:** What archive format should be default? Recommend: directory for local use, tar.gz for transfer. -- **Risk:** Large bundles may be slow to generate. Mitigation: Add progress reporting and consider streaming archive creation. -- **Risk:** Bundle format may need evolution. Mitigation: Include schema version in manifest from day one. - -## Next Checkpoints -- Format specification complete: +2 working days -- Bundle generation working: +4 working days -- Commands and tests complete: +3 working days -- Documentation complete: +2 working days diff --git a/docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md b/docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md deleted file mode 100644 index 81942947b..000000000 --- a/docs/implplan/SPRINT_20260117_028_Telemetry_p0_metrics.md +++ /dev/null @@ -1,240 +0,0 @@ -# Sprint 028 · P0 Product Metrics Definition - -## Topic & Scope -- Define and instrument the four P0 product-level metrics from the AI Economics Moat advisory. -- Create Grafana dashboard templates for tracking these metrics. -- Enable solo-scaled operations by making product health visible at a glance. -- Working directory: `src/Telemetry/`, `devops/telemetry/`. -- Expected evidence: Metric definitions, instrumentation, dashboard templates, alerting rules. - -**Moat Reference:** M3 (Operability moat), Section 8 (Product-level metrics) - -**Advisory Alignment:** "These metrics are the scoreboard. Prioritize work that improves them." - -## Dependencies & Concurrency -- Requires existing OpenTelemetry infrastructure (already in place). -- Can run in parallel with other sprints. -- Dashboard templates depend on Grafana/Prometheus stack. - -## Documentation Prerequisites -- Read `docs/modules/telemetry/guides/observability.md` for existing metric patterns. -- Read `src/Attestor/StellaOps.Attestor/StellaOps.Attestor.Core/Verification/RekorVerificationMetrics.cs` for metric implementation patterns. -- Read advisory section 8 for metric definitions. - -## Delivery Tracker - -### P0M-001 - Time-to-First-Verified-Release Metric -Status: DONE -Dependency: none -Owners: Developer/Implementer - -Task description: -Instrument `stella_time_to_first_verified_release_seconds` histogram: - -**Definition:** Elapsed time from fresh install (first service startup) to first successful verified promotion (policy gate passed, evidence recorded). - -**Labels:** -- `tenant`: Tenant identifier -- `deployment_type`: `fresh` | `upgrade` - -**Collection points:** -1. Record install timestamp on first Authority startup (store in DB) -2. Record first verified promotion timestamp in Release Orchestrator -3. Emit metric on first promotion with duration = promotion_time - install_time - -**Implementation:** -- Add `InstallTimestampService` to record first startup -- Add metric emission in `ReleaseOrchestrator` on first promotion per tenant -- Use histogram buckets: 5m, 15m, 30m, 1h, 2h, 4h, 8h, 24h, 48h, 168h (1 week) - -Completion criteria: -- [x] Install timestamp recorded on first startup -- [x] Metric emitted on first verified promotion -- [x] Histogram with appropriate buckets -- [x] Label for tenant and deployment type -- [x] Unit test for metric emission - -### P0M-002 - Mean Time to Answer "Why Blocked" Metric -Status: DONE -Dependency: none -Owners: Developer/Implementer - -Task description: -Instrument `stella_why_blocked_latency_seconds` histogram: - -**Definition:** Time from block decision to user viewing explanation (via CLI, UI, or API). - -**Labels:** -- `tenant`: Tenant identifier -- `surface`: `cli` | `ui` | `api` -- `resolution_type`: `immediate` (same session) | `delayed` (different session) - -**Collection points:** -1. Record block decision timestamp in verdict -2. Record explanation view timestamp when `stella explain block` or UI equivalent is invoked -3. Emit metric with duration - -**Implementation:** -- Add explanation view tracking in CLI command -- Add explanation view tracking in UI (existing telemetry hook) -- Correlate via artifact digest -- Use histogram buckets: 1s, 5s, 30s, 1m, 5m, 15m, 1h, 4h, 24h - -Completion criteria: -- [x] Block decision timestamp available in verdict -- [x] Explanation view events tracked -- [x] Correlation by artifact digest -- [x] Histogram with appropriate buckets -- [x] Surface label populated correctly - -### P0M-003 - Support Minutes per Customer Metric -Status: DONE -Dependency: none -Owners: Developer/Implementer - -Task description: -Instrument `stella_support_burden_minutes_total` counter: - -**Definition:** Accumulated support time per customer per month. This is a manual/semi-automated metric for solo operations tracking. - -**Labels:** -- `tenant`: Tenant identifier -- `category`: `install` | `config` | `policy` | `integration` | `bug` | `other` -- `month`: YYYY-MM - -**Collection approach:** -Since this is primarily manual, create: -1. CLI command `stella ops support log --tenant --minutes --category ` for logging support events -2. API endpoint for programmatic logging -3. Counter incremented on each log entry - -**Target:** Trend toward zero. Alert if any tenant exceeds 30 minutes/month. - -Completion criteria: -- [x] Metric definition in P0ProductMetrics.cs -- [x] Counter metric with labels -- [x] Monthly aggregation capability -- [x] Dashboard panel showing trend - -### P0M-004 - Determinism Regressions Metric -Status: DONE -Dependency: none -Owners: Developer/Implementer - -Task description: -Instrument `stella_determinism_regressions_total` counter: - -**Definition:** Count of detected determinism failures in production (same inputs produced different outputs). - -**Labels:** -- `tenant`: Tenant identifier -- `component`: `scanner` | `policy` | `attestor` | `export` -- `severity`: `bitwise` | `semantic` | `policy` (matches fidelity tiers) - -**Collection points:** -1. Determinism verification jobs (scheduled) -2. Replay verification failures -3. Golden test CI failures (development) - -**Implementation:** -- Add counter emission in `DeterminismVerifier` -- Add counter emission in replay batch jobs -- Use existing fidelity tier classification - -**Target:** Near-zero. Alert immediately on any `policy` severity regression. - -Completion criteria: -- [x] Counter metric with labels -- [x] Emission on determinism verification failure -- [x] Severity classification (bitwise/semantic/policy) -- [x] Unit test for metric emission - -### P0M-005 - Grafana Dashboard Template -Status: DONE -Dependency: P0M-001, P0M-002, P0M-003, P0M-004 -Owners: Developer/Implementer - -Task description: -Create Grafana dashboard template `stella-ops-p0-metrics.json`: - -**Panels:** -1. **Time to First Release** - Histogram heatmap + P50/P90/P99 stat -2. **Why Blocked Latency** - Histogram heatmap + trend line -3. **Support Burden** - Stacked bar by category, monthly trend -4. **Determinism Regressions** - Counter with severity breakdown, alert status - -**Features:** -- Tenant selector variable -- Time range selector -- Drill-down links to detailed dashboards -- SLO indicator (green/yellow/red) - -**File location:** `devops/telemetry/grafana/dashboards/stella-ops-p0-metrics.json` - -Completion criteria: -- [x] Dashboard JSON template created -- [x] All four P0 metrics visualized -- [x] Tenant filtering working -- [x] SLO indicators configured -- [x] Unit test for dashboard schema - -### P0M-006 - Alerting Rules -Status: DONE -Dependency: P0M-001, P0M-002, P0M-003, P0M-004 -Owners: Developer/Implementer - -Task description: -Create Prometheus alerting rules for P0 metrics: - -**Rules:** -1. `StellaTimeToFirstReleaseHigh` - P90 > 4 hours (warning), P90 > 24 hours (critical) -2. `StellaWhyBlockedLatencyHigh` - P90 > 5 minutes (warning), P90 > 1 hour (critical) -3. `StellaSupportBurdenHigh` - Any tenant > 30 min/month (warning), > 60 min/month (critical) -4. `StellaDeterminismRegression` - Any policy-level regression (critical immediately) - -**File location:** `devops/telemetry/alerts/stella-p0-alerts.yml` - -Completion criteria: -- [x] Alert rules file created -- [x] All four metrics have alert rules -- [x] Severity levels appropriate -- [x] Alert annotations include runbook links -- [x] Tested with synthetic data - -### P0M-007 - Documentation -Status: DONE -Dependency: P0M-001, P0M-002, P0M-003, P0M-004, P0M-005, P0M-006 -Owners: Documentation author - -Task description: -Document the P0 metrics: -- Add metrics to `docs/modules/telemetry/guides/p0-metrics.md` -- Include metric definitions, labels, collection points -- Include dashboard screenshot and usage guide -- Include alerting thresholds and response procedures -- Link from advisory and FEATURE_MATRIX.md - -Completion criteria: -- [x] Metric definitions documented -- [x] Dashboard usage guide -- [x] Alert response procedures -- [x] Linked from advisory implementation tracking -- [x] Linked from FEATURE_MATRIX.md - -## Execution Log -| Date (UTC) | Update | Owner | -| --- | --- | --- | -| 2026-01-17 | Sprint created from AI Economics Moat advisory gap analysis. | Planning | -| 2026-01-17 | P0M-001 through P0M-006 completed. P0ProductMetrics.cs, InstallTimestampService.cs, Grafana dashboard, and alert rules implemented. Tests added. | Developer | -| 2026-01-17 | P0M-007 completed. docs/modules/telemetry/guides/p0-metrics.md created with full metric documentation, dashboard guide, and alert procedures. | Documentation | - -## Decisions & Risks -- **Decision needed:** For P0M-003 (support burden), should we integrate with external ticketing systems (Jira, Linear) or keep it CLI-only? Recommend: CLI-only initially, add integrations later. -- **Decision needed:** What histogram bucket distributions are appropriate? Recommend: Start with proposed buckets, refine based on real data. -- **Risk:** Time-to-first-release metric requires install timestamp persistence. If DB is wiped, metric resets. Mitigation: Accept this limitation; document in metric description. -- **Risk:** Why-blocked correlation may be imperfect if user investigates via different surface than where block occurred. Mitigation: Track best-effort, note limitation in docs. - -## Next Checkpoints -- Metric instrumentation complete: +3 working days -- Dashboard template complete: +2 working days -- Alerting rules and docs: +2 working days diff --git a/docs/modules/release-orchestrator/enhancements/agent-operations.md b/docs/modules/release-orchestrator/enhancements/agent-operations.md new file mode 100644 index 000000000..cc8c4ed16 --- /dev/null +++ b/docs/modules/release-orchestrator/enhancements/agent-operations.md @@ -0,0 +1,1475 @@ +# Agent Operations & Easy Setup + +## Overview + +The Agent Operations enhancement transforms agent deployment from a manual, error-prone process into a streamlined, self-healing experience. It provides zero-touch bootstrap, declarative configuration, comprehensive health diagnostics (Doctor plugin), and operational tooling that makes agents easy to deploy, monitor, and maintain at scale. + +This enhancement complements Sprint 034 (Agent Resilience) by focusing on the operational and configuration aspects rather than the clustering and failover mechanisms. + +--- + +## Design Principles + +1. **Zero-Touch Bootstrap**: Agents should be deployable with a single command +2. **Declarative Configuration**: Define desired state, system converges automatically +3. **Self-Diagnosing**: Agents report their own health issues with remediation hints +4. **Operator-Friendly**: Clear CLI commands, meaningful error messages, runbook links +5. **Secure by Default**: Auto-provisioned certificates, secrets never on disk +6. **Observable**: Rich metrics, structured logs, distributed tracing + +--- + +## Current Pain Points + +| Pain Point | Current State | Target State | +|------------|---------------|--------------| +| Certificate Management | Manual paths to cert/key/ca files | Auto-provisioned, auto-renewed | +| Configuration | Static YAML files, manual edits | Declarative config with drift detection | +| Health Monitoring | Binary alive/offline | Multi-dimensional health scoring | +| Troubleshooting | Manual log inspection | Doctor plugin with guided remediation | +| Scaling | Manual per-agent setup | Bootstrap token + auto-join | +| Updates | Manual agent binary updates | Auto-update with rollback | +| Network Issues | Silent failures | Connection diagnostics with hints | + +--- + +## Architecture + +### Component Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Agent Operations & Setup │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │ +│ │ BootstrapService │───▶│ ConfigManager │───▶│ CertificateManager│ │ +│ │ │ │ │ │ │ │ +│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │ +│ │ AgentDoctor │ │ ConnectionDoctor │ │ UpdateManager │ │ +│ │ │ │ │ │ │ │ +│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌───────────────────┐ ┌───────────────────┐ ┌───────────────────┐ │ +│ │ DiagnosticReport │ │ RemediationEngine │ │ OperatorCLI │ │ +│ │ │ │ │ │ │ │ +│ └───────────────────┘ └───────────────────┘ └───────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ + + Bootstrap Flow + + ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ + │ stella │ │ Orchestrator│ │ Agent │ + │ agent │─────▶│ (API) │─────▶│ Running │ + │ bootstrap │ │ │ │ │ + └─────────────┘ └─────────────┘ └─────────────┘ + │ │ │ + │ 1. Request token │ │ + │────────────────────▶│ │ + │ 2. Return token │ │ + │◀────────────────────│ │ + │ │ │ + │ 3. Start agent with token │ + │─────────────────────────────────────────▶│ + │ │ 4. Exchange token │ + │ │◀───────────────────│ + │ │ 5. Issue cert │ + │ │───────────────────▶│ + │ │ 6. Register │ + │ │◀───────────────────│ + │ │ 7. Confirm │ + │ │───────────────────▶│ +``` + +--- + +## Key Components + +### 1. Bootstrap Service + +Zero-touch agent deployment: + +```csharp +public sealed class BootstrapService +{ + public async Task BootstrapAgentAsync( + BootstrapRequest request, + CancellationToken ct) + { + // 1. Generate bootstrap token (one-time use, 15-minute expiry) + var token = await _tokenService.GenerateBootstrapTokenAsync( + new TokenRequest + { + AgentName = request.AgentName, + Environment = request.Environment, + Capabilities = request.Capabilities, + ExpiresIn = TimeSpan.FromMinutes(15), + MaxUses = 1 + }, ct); + + // 2. Generate agent configuration + var config = GenerateAgentConfig(request, token); + + // 3. Generate installation script + var script = GenerateInstallScript(request.Platform, config); + + return new BootstrapResult + { + Token = token.Value, + TokenExpires = token.ExpiresAt, + Configuration = config, + InstallScript = script, + InstallCommand = GetOneLineInstaller(request.Platform, token) + }; + } + + private string GetOneLineInstaller(Platform platform, BootstrapToken token) + { + return platform switch + { + Platform.Linux => $"curl -sSL https://stella.example.com/install.sh | sudo bash -s -- --token {token.Value}", + Platform.Windows => $"iwr https://stella.example.com/install.ps1 -UseBasicParsing | iex; Install-StellaAgent -Token {token.Value}", + Platform.Docker => $"docker run -d --name stella-agent -e STELLA_BOOTSTRAP_TOKEN={token.Value} stella/agent:latest", + _ => throw new UnsupportedPlatformException(platform) + }; + } +} + +public sealed record BootstrapRequest +{ + public string AgentName { get; init; } + public string Environment { get; init; } + public Platform Platform { get; init; } + public ImmutableArray Capabilities { get; init; } + public ImmutableDictionary Labels { get; init; } + public string? ClusterId { get; init; } // Join existing cluster +} + +public sealed record BootstrapResult +{ + public string Token { get; init; } + public DateTimeOffset TokenExpires { get; init; } + public AgentConfiguration Configuration { get; init; } + public string InstallScript { get; init; } + public string InstallCommand { get; init; } +} +``` + +### 2. Configuration Manager + +Declarative configuration with drift detection: + +```csharp +public sealed class AgentConfigManager +{ + public async Task ApplyConfigurationAsync( + AgentConfiguration desired, + CancellationToken ct) + { + var current = await _configStore.GetCurrentAsync(ct); + var diff = ComputeDiff(current, desired); + + if (diff.HasChanges) + { + _logger.LogInformation("Configuration drift detected: {Changes}", diff.Summary); + + // Validate changes are safe + var validation = await ValidateChangesAsync(diff, ct); + if (!validation.IsValid) + { + return new ConfigurationState + { + Status = ConfigStatus.ValidationFailed, + Errors = validation.Errors + }; + } + + // Apply changes with rollback capability + try + { + await ApplyChangesAsync(diff, ct); + await _configStore.SaveAsync(desired, ct); + + return new ConfigurationState + { + Status = ConfigStatus.Applied, + AppliedChanges = diff.Changes + }; + } + catch (Exception ex) + { + await RollbackAsync(current, ct); + throw new ConfigurationApplyException("Failed to apply configuration", ex); + } + } + + return new ConfigurationState { Status = ConfigStatus.NoChanges }; + } + + public async Task DetectDriftAsync(CancellationToken ct) + { + var desired = await _configStore.GetDesiredAsync(ct); + var actual = await _configStore.GetActualAsync(ct); + + return new ConfigDrift + { + HasDrift = !desired.Equals(actual), + DesiredState = desired, + ActualState = actual, + Differences = ComputeDiff(actual, desired).Changes + }; + } +} + +// Declarative configuration model +public sealed record AgentConfiguration +{ + // Identity + public string AgentId { get; init; } + public string AgentName { get; init; } + public string Environment { get; init; } + public ImmutableDictionary Labels { get; init; } + + // Connection + public string OrchestratorUrl { get; init; } + public TimeSpan HeartbeatInterval { get; init; } = TimeSpan.FromSeconds(30); + public TimeSpan ReconnectBackoff { get; init; } = TimeSpan.FromSeconds(5); + public int MaxReconnectAttempts { get; init; } = 10; + + // Capabilities + public ImmutableArray Capabilities { get; init; } + + // Resources + public ResourceLimits ResourceLimits { get; init; } + public int MaxConcurrentTasks { get; init; } = 5; + public TimeSpan DefaultTaskTimeout { get; init; } = TimeSpan.FromMinutes(30); + + // Security + public CertificateConfig Certificates { get; init; } + public bool AutoRenewCertificates { get; init; } = true; + public TimeSpan CertificateRenewalThreshold { get; init; } = TimeSpan.FromDays(7); + + // Clustering (optional) + public ClusterConfig? Cluster { get; init; } + + // Observability + public ObservabilityConfig Observability { get; init; } + + // Auto-update + public AutoUpdateConfig? AutoUpdate { get; init; } +} + +public sealed record CertificateConfig +{ + public CertificateSource Source { get; init; } = CertificateSource.AutoProvision; + public string? CertificatePath { get; init; } // Only if Source = File + public string? PrivateKeyPath { get; init; } // Only if Source = File + public string? CaCertificatePath { get; init; } // Only if Source = File +} + +public enum CertificateSource +{ + AutoProvision, // Orchestrator provisions via bootstrap + File, // Manual file paths + Vault, // HashiCorp Vault + ACME, // Let's Encrypt / ACME + AzureKeyVault, // Azure Key Vault + AWSKMS // AWS KMS/Secrets Manager +} +``` + +### 3. Certificate Manager + +Automatic certificate lifecycle: + +```csharp +public sealed class AgentCertificateManager +{ + public async Task EnsureCertificateAsync(CancellationToken ct) + { + var current = await GetCurrentCertificateAsync(ct); + + if (current == null) + { + _logger.LogInformation("No certificate found, requesting new certificate"); + return await ProvisionCertificateAsync(ct); + } + + var expiresIn = current.NotAfter - _timeProvider.GetUtcNow(); + var threshold = _config.CertificateRenewalThreshold; + + if (expiresIn <= TimeSpan.Zero) + { + _logger.LogWarning("Certificate expired, requesting renewal"); + return await RenewCertificateAsync(current, ct); + } + + if (expiresIn <= threshold) + { + _logger.LogInformation( + "Certificate expires in {Days} days, renewing proactively", + expiresIn.TotalDays); + return await RenewCertificateAsync(current, ct); + } + + return new CertificateState + { + Status = CertificateStatus.Valid, + Certificate = current, + ExpiresAt = current.NotAfter, + RenewalScheduled = current.NotAfter - threshold + }; + } + + private async Task ProvisionCertificateAsync(CancellationToken ct) + { + // Generate key pair locally (private key never leaves agent) + using var rsa = RSA.Create(4096); + + // Create CSR + var csr = CreateCertificateSigningRequest(rsa); + + // Submit CSR to orchestrator + var signedCert = await _orchestratorClient.SubmitCSRAsync( + new CSRRequest + { + AgentId = _config.AgentId, + CSR = csr, + RequestedValidity = TimeSpan.FromDays(365) + }, ct); + + // Store certificate and key securely + await _certStore.StoreCertificateAsync(signedCert, ct); + await _keyStore.StorePrivateKeyAsync(rsa, ct); + + return new CertificateState + { + Status = CertificateStatus.Provisioned, + Certificate = signedCert, + ExpiresAt = signedCert.NotAfter + }; + } +} +``` + +### 4. Agent Doctor (Health Checks) + +Comprehensive health diagnostics: + +```csharp +public sealed class AgentDoctor +{ + private readonly ImmutableArray _checks; + + public AgentDoctor() + { + _checks = new IAgentHealthCheck[] + { + // Core checks + new CertificateExpiryCheck(), + new CertificateValidityCheck(), + new OrchestratorConnectivityCheck(), + new HeartbeatCheck(), + + // Resource checks + new DiskSpaceCheck(), + new MemoryUsageCheck(), + new CpuUsageCheck(), + new FileDescriptorCheck(), + + // Configuration checks + new ConfigurationValidityCheck(), + new ConfigurationDriftCheck(), + new CapabilityCheck(), + + // Network checks + new RegistryConnectivityCheck(), + new DNSResolutionCheck(), + new TLSVersionCheck(), + new MTLSHandshakeCheck(), + + // Task execution checks + new DockerConnectivityCheck(), + new DockerVersionCheck(), + new TaskQueueDepthCheck(), + new FailedTaskRateCheck(), + + // Cluster checks (if clustered) + new ClusterMembershipCheck(), + new LeaderConnectivityCheck(), + new StateSyncCheck() + }.ToImmutableArray(); + } + + public async Task RunDiagnosticsAsync( + DiagnosticOptions options, + CancellationToken ct) + { + var results = new List(); + var startTime = _timeProvider.GetUtcNow(); + + foreach (var check in _checks) + { + if (options.Categories.Any() && + !options.Categories.Contains(check.Category)) + { + continue; + } + + try + { + var result = await check.ExecuteAsync(ct); + results.Add(result); + + if (result.Status == HealthStatus.Critical && options.StopOnCritical) + { + break; + } + } + catch (Exception ex) + { + results.Add(new HealthCheckResult + { + CheckName = check.Name, + Status = HealthStatus.Error, + Message = $"Check failed with exception: {ex.Message}", + Exception = ex + }); + } + } + + return new AgentDiagnosticReport + { + AgentId = _config.AgentId, + AgentName = _config.AgentName, + Timestamp = startTime, + Duration = _timeProvider.GetUtcNow() - startTime, + OverallStatus = DetermineOverallStatus(results), + Results = results.ToImmutableArray(), + Remediations = GenerateRemediations(results) + }; + } + + private ImmutableArray GenerateRemediations( + List results) + { + var remediations = new List(); + + foreach (var result in results.Where(r => r.Status != HealthStatus.Healthy)) + { + var steps = _remediationEngine.GetRemediationSteps(result); + remediations.AddRange(steps); + } + + // Sort by priority and deduplicate + return remediations + .DistinctBy(r => r.Id) + .OrderByDescending(r => r.Priority) + .ToImmutableArray(); + } +} + +// Individual health checks +public sealed class CertificateExpiryCheck : IAgentHealthCheck +{ + public string Name => "Certificate Expiry"; + public string Category => "Security"; + + public async Task ExecuteAsync(CancellationToken ct) + { + var cert = await _certManager.GetCurrentCertificateAsync(ct); + + if (cert == null) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = "No certificate found", + RemediationHint = "Run 'stella agent bootstrap' to provision certificate", + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-no-certificate" + }; + } + + var expiresIn = cert.NotAfter - _timeProvider.GetUtcNow(); + + if (expiresIn <= TimeSpan.Zero) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = $"Certificate expired on {cert.NotAfter:u}", + RemediationHint = "Run 'stella agent renew-cert' or restart agent for auto-renewal", + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-cert-expired" + }; + } + + if (expiresIn <= TimeSpan.FromDays(7)) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Warning, + Message = $"Certificate expires in {expiresIn.TotalDays:F0} days", + RemediationHint = "Certificate will auto-renew if enabled, or run 'stella agent renew-cert'", + Data = new Dictionary + { + ["expires_at"] = cert.NotAfter, + ["expires_in_days"] = expiresIn.TotalDays + } + }; + } + + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Healthy, + Message = $"Certificate valid until {cert.NotAfter:u} ({expiresIn.TotalDays:F0} days)", + Data = new Dictionary + { + ["expires_at"] = cert.NotAfter, + ["expires_in_days"] = expiresIn.TotalDays + } + }; + } +} + +public sealed class OrchestratorConnectivityCheck : IAgentHealthCheck +{ + public string Name => "Orchestrator Connectivity"; + public string Category => "Network"; + + public async Task ExecuteAsync(CancellationToken ct) + { + var endpoint = _config.OrchestratorUrl; + + try + { + // Test DNS resolution + var uri = new Uri(endpoint); + var addresses = await Dns.GetHostAddressesAsync(uri.Host, ct); + + if (addresses.Length == 0) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = $"DNS resolution failed for {uri.Host}", + RemediationHint = "Check DNS settings and network connectivity", + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-dns-failure" + }; + } + + // Test TCP connection + using var tcpClient = new TcpClient(); + var connectTask = tcpClient.ConnectAsync(uri.Host, uri.Port, ct); + var completed = await Task.WhenAny( + connectTask.AsTask(), + Task.Delay(TimeSpan.FromSeconds(5), ct)); + + if (completed != connectTask.AsTask() || !tcpClient.Connected) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = $"TCP connection to {endpoint} timed out", + RemediationHint = "Check firewall rules and network connectivity", + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-connection-timeout" + }; + } + + // Test mTLS handshake + var tlsResult = await TestMTLSHandshakeAsync(uri, ct); + if (!tlsResult.Success) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = $"mTLS handshake failed: {tlsResult.Error}", + RemediationHint = tlsResult.RemediationHint, + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-mtls-failure" + }; + } + + // Test gRPC health endpoint + var healthResult = await _orchestratorClient.HealthCheckAsync(ct); + + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Healthy, + Message = $"Connected to orchestrator at {endpoint}", + Data = new Dictionary + { + ["resolved_addresses"] = addresses.Select(a => a.ToString()).ToArray(), + ["tls_version"] = tlsResult.TlsVersion, + ["latency_ms"] = healthResult.LatencyMs + } + }; + } + catch (Exception ex) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = $"Connectivity check failed: {ex.Message}", + Exception = ex, + RemediationHint = "Check network configuration and orchestrator status", + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-connectivity" + }; + } + } +} + +public sealed class DockerConnectivityCheck : IAgentHealthCheck +{ + public string Name => "Docker Connectivity"; + public string Category => "Runtime"; + + public async Task ExecuteAsync(CancellationToken ct) + { + try + { + var version = await _dockerClient.GetVersionAsync(ct); + + // Check minimum version + var minVersion = new Version(20, 10, 0); + var currentVersion = new Version(version.Version); + + if (currentVersion < minVersion) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Warning, + Message = $"Docker version {version.Version} is below recommended {minVersion}", + RemediationHint = "Upgrade Docker to version 20.10 or later", + Data = new Dictionary + { + ["docker_version"] = version.Version, + ["api_version"] = version.ApiVersion, + ["min_recommended"] = minVersion.ToString() + } + }; + } + + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Healthy, + Message = $"Docker {version.Version} connected", + Data = new Dictionary + { + ["docker_version"] = version.Version, + ["api_version"] = version.ApiVersion, + ["os"] = version.Os, + ["arch"] = version.Arch + } + }; + } + catch (Exception ex) + { + return new HealthCheckResult + { + CheckName = Name, + Status = HealthStatus.Critical, + Message = $"Docker connectivity failed: {ex.Message}", + Exception = ex, + RemediationHint = "Ensure Docker daemon is running and agent has permission to access Docker socket", + RunbookUrl = "https://docs.stella-ops.org/runbooks/agent-docker-connectivity" + }; + } + } +} +``` + +### 5. Remediation Engine + +Guided problem resolution: + +```csharp +public sealed class RemediationEngine +{ + public ImmutableArray GetRemediationSteps( + HealthCheckResult result) + { + var steps = new List(); + + // Match result to known remediation patterns + var pattern = _patterns.FirstOrDefault(p => p.Matches(result)); + + if (pattern != null) + { + steps.AddRange(pattern.Steps); + } + + // Add generic remediation based on status + if (result.Status == HealthStatus.Critical) + { + steps.Add(new RemediationStep + { + Id = "check-logs", + Priority = RemediationPriority.High, + Title = "Check Agent Logs", + Description = "Review agent logs for detailed error information", + Command = "stella agent logs --tail 100", + RunbookUrl = result.RunbookUrl + }); + } + + return steps.ToImmutableArray(); + } + + private readonly ImmutableArray _patterns = new[] + { + new RemediationPattern + { + CheckName = "Certificate Expiry", + StatusMatch = HealthStatus.Critical, + Steps = new[] + { + new RemediationStep + { + Id = "renew-cert", + Priority = RemediationPriority.Critical, + Title = "Renew Agent Certificate", + Description = "Agent certificate has expired and must be renewed", + Command = "stella agent renew-cert --force", + Automated = true + }, + new RemediationStep + { + Id = "restart-agent", + Priority = RemediationPriority.High, + Title = "Restart Agent", + Description = "Restart agent to apply new certificate", + Command = "systemctl restart stella-agent", + Automated = false + } + } + }, + new RemediationPattern + { + CheckName = "Orchestrator Connectivity", + MessageContains = "DNS resolution failed", + Steps = new[] + { + new RemediationStep + { + Id = "check-dns", + Priority = RemediationPriority.Critical, + Title = "Verify DNS Configuration", + Description = "Check that DNS servers are configured and reachable", + Command = "cat /etc/resolv.conf && nslookup orchestrator.example.com", + Automated = false + }, + new RemediationStep + { + Id = "check-hosts", + Priority = RemediationPriority.High, + Title = "Check /etc/hosts", + Description = "Verify no conflicting entries in hosts file", + Command = "grep orchestrator /etc/hosts", + Automated = false + } + } + }, + new RemediationPattern + { + CheckName = "Docker Connectivity", + Steps = new[] + { + new RemediationStep + { + Id = "check-docker-daemon", + Priority = RemediationPriority.Critical, + Title = "Check Docker Daemon", + Description = "Verify Docker daemon is running", + Command = "systemctl status docker", + Automated = false + }, + new RemediationStep + { + Id = "check-docker-socket", + Priority = RemediationPriority.High, + Title = "Check Docker Socket Permissions", + Description = "Verify agent has access to Docker socket", + Command = "ls -la /var/run/docker.sock && groups stella-agent", + Automated = false + } + } + } + }.ToImmutableArray(); +} + +public sealed record RemediationStep +{ + public string Id { get; init; } + public RemediationPriority Priority { get; init; } + public string Title { get; init; } + public string Description { get; init; } + public string? Command { get; init; } + public string? RunbookUrl { get; init; } + public bool Automated { get; init; } + public TimeSpan? EstimatedDuration { get; init; } +} +``` + +### 6. Auto-Update Manager + +Safe agent binary updates: + +```csharp +public sealed class AgentUpdateManager +{ + public async Task CheckAndApplyUpdateAsync( + CancellationToken ct) + { + if (!_config.AutoUpdate?.Enabled == true) + { + return new UpdateResult { Status = UpdateStatus.Disabled }; + } + + // Check for available update + var available = await _updateService.CheckForUpdateAsync( + _config.AgentVersion, + _config.AutoUpdate.Channel, + ct); + + if (!available.HasUpdate) + { + return new UpdateResult { Status = UpdateStatus.UpToDate }; + } + + // Verify update signature + var verified = await _signatureVerifier.VerifyAsync( + available.Package, + available.Signature, + ct); + + if (!verified) + { + _logger.LogError("Update signature verification failed"); + return new UpdateResult + { + Status = UpdateStatus.VerificationFailed, + Error = "Package signature verification failed" + }; + } + + // Check if update window is allowed + if (!IsInUpdateWindow()) + { + _logger.LogInformation( + "Update available but outside update window, scheduling for {Window}", + _config.AutoUpdate.MaintenanceWindow); + + return new UpdateResult + { + Status = UpdateStatus.Scheduled, + ScheduledFor = GetNextMaintenanceWindow() + }; + } + + // Drain active tasks + await DrainActiveTasksAsync(ct); + + // Download and apply update + try + { + var packagePath = await DownloadPackageAsync(available, ct); + + // Create rollback point + var rollbackPoint = await CreateRollbackPointAsync(ct); + + // Apply update + await ApplyUpdateAsync(packagePath, ct); + + // Verify new version starts correctly + var healthCheck = await VerifyNewVersionAsync(ct); + + if (!healthCheck.Healthy) + { + _logger.LogError("New version health check failed, rolling back"); + await RollbackAsync(rollbackPoint, ct); + + return new UpdateResult + { + Status = UpdateStatus.RolledBack, + Error = healthCheck.Error + }; + } + + return new UpdateResult + { + Status = UpdateStatus.Applied, + PreviousVersion = _config.AgentVersion, + NewVersion = available.Version + }; + } + catch (Exception ex) + { + _logger.LogError(ex, "Update failed, attempting rollback"); + await RollbackAsync(ct); + + return new UpdateResult + { + Status = UpdateStatus.Failed, + Error = ex.Message + }; + } + } +} + +public sealed record AutoUpdateConfig +{ + public bool Enabled { get; init; } = false; + public UpdateChannel Channel { get; init; } = UpdateChannel.Stable; + public string? MaintenanceWindow { get; init; } // Cron expression + public bool DrainBeforeUpdate { get; init; } = true; + public TimeSpan DrainTimeout { get; init; } = TimeSpan.FromMinutes(5); + public int MaxRollbackVersions { get; init; } = 3; +} + +public enum UpdateChannel +{ + Stable, + Beta, + Canary +} +``` + +### 7. Operator CLI Commands + +Streamlined operational commands: + +```csharp +public sealed class AgentOperatorCommands +{ + // Bootstrap new agent + // stella agent bootstrap --name prod-agent-01 --env production --platform linux + [Command("agent bootstrap")] + public async Task BootstrapAsync( + [Option] string name, + [Option] string env, + [Option] Platform platform = Platform.Linux, + [Option] string[]? capabilities = null, + [Option] string? cluster = null) + { + var result = await _bootstrap.BootstrapAgentAsync(new BootstrapRequest + { + AgentName = name, + Environment = env, + Platform = platform, + Capabilities = capabilities?.ToImmutableArray() ?? ImmutableArray.Empty, + ClusterId = cluster + }, _ct); + + Console.WriteLine($"Bootstrap token generated (expires in 15 minutes):"); + Console.WriteLine(); + Console.WriteLine($" Token: {result.Token}"); + Console.WriteLine(); + Console.WriteLine($"One-line installer:"); + Console.WriteLine($" {result.InstallCommand}"); + Console.WriteLine(); + Console.WriteLine($"Or download the install script:"); + Console.WriteLine($" stella agent install-script --token {result.Token} --output install.sh"); + + return 0; + } + + // Run diagnostics + // stella agent doctor [--category security] [--fix] + [Command("agent doctor")] + public async Task DoctorAsync( + [Option] string? agentId = null, + [Option] string[]? categories = null, + [Option] bool fix = false, + [Option] OutputFormat format = OutputFormat.Table) + { + var options = new DiagnosticOptions + { + Categories = categories?.ToImmutableArray() ?? ImmutableArray.Empty, + IncludeRemediations = true + }; + + var report = agentId != null + ? await _doctor.RunRemoteDiagnosticsAsync(agentId, options, _ct) + : await _doctor.RunDiagnosticsAsync(options, _ct); + + // Display results + RenderDiagnosticReport(report, format); + + // Optionally apply automated fixes + if (fix && report.Remediations.Any(r => r.Automated)) + { + Console.WriteLine(); + Console.WriteLine("Applying automated remediations..."); + + foreach (var remediation in report.Remediations.Where(r => r.Automated)) + { + Console.WriteLine($" - {remediation.Title}"); + await _remediation.ApplyAsync(remediation, _ct); + } + } + + return report.OverallStatus == HealthStatus.Healthy ? 0 : 1; + } + + // View agent configuration + // stella agent config [--agent-id xyz] [--diff] + [Command("agent config")] + public async Task ConfigAsync( + [Option] string? agentId = null, + [Option] bool diff = false, + [Option] OutputFormat format = OutputFormat.Yaml) + { + if (diff) + { + var drift = await _configManager.DetectDriftAsync(_ct); + RenderConfigDiff(drift, format); + return drift.HasDrift ? 1 : 0; + } + + var config = await _configManager.GetConfigurationAsync(agentId, _ct); + RenderConfiguration(config, format); + return 0; + } + + // Apply configuration changes + // stella agent apply -f agent-config.yaml + [Command("agent apply")] + public async Task ApplyAsync( + [Option('f')] string configFile) + { + var config = await LoadConfigurationAsync(configFile); + var validation = await _configManager.ValidateAsync(config, _ct); + + if (!validation.IsValid) + { + Console.WriteLine("Configuration validation failed:"); + foreach (var error in validation.Errors) + { + Console.WriteLine($" - {error}"); + } + return 1; + } + + var result = await _configManager.ApplyConfigurationAsync(config, _ct); + + if (result.Status == ConfigStatus.Applied) + { + Console.WriteLine($"Configuration applied successfully ({result.AppliedChanges.Length} changes)"); + return 0; + } + + Console.WriteLine($"Configuration apply failed: {result.Status}"); + return 1; + } + + // Renew certificate + // stella agent renew-cert [--force] + [Command("agent renew-cert")] + public async Task RenewCertAsync( + [Option] bool force = false) + { + var result = await _certManager.RenewCertificateAsync(force, _ct); + + if (result.Status == CertificateStatus.Renewed) + { + Console.WriteLine($"Certificate renewed successfully"); + Console.WriteLine($" New expiry: {result.ExpiresAt:u}"); + return 0; + } + + Console.WriteLine($"Certificate renewal failed: {result.Error}"); + return 1; + } + + // View agent logs + // stella agent logs [--tail 100] [--follow] [--level error] + [Command("agent logs")] + public async Task LogsAsync( + [Option] string? agentId = null, + [Option] int tail = 50, + [Option] bool follow = false, + [Option] LogLevel? level = null) + { + await foreach (var entry in _logService.StreamLogsAsync( + agentId, tail, follow, level, _ct)) + { + RenderLogEntry(entry); + } + + return 0; + } + + // Force update + // stella agent update [--version x.y.z] [--force] + [Command("agent update")] + public async Task UpdateAsync( + [Option] string? version = null, + [Option] bool force = false) + { + var result = await _updateManager.UpdateToVersionAsync(version, force, _ct); + + Console.WriteLine($"Update status: {result.Status}"); + if (result.Status == UpdateStatus.Applied) + { + Console.WriteLine($" Previous: {result.PreviousVersion}"); + Console.WriteLine($" Current: {result.NewVersion}"); + } + + return result.Status == UpdateStatus.Applied ? 0 : 1; + } +} +``` + +--- + +## Doctor Plugin for Server-Side + +Central Doctor plugin for agent fleet health: + +```csharp +// src/Doctor/__Plugins/StellaOps.Doctor.Plugin.Agent/AgentHealthPlugin.cs +public sealed class AgentHealthPlugin : IDoctorPlugin +{ + public string Name => "Agent Health"; + public string Description => "Monitors agent fleet health and connectivity"; + + public ImmutableArray Checks => new IDoctorCheck[] + { + new AgentHeartbeatFreshnessCheck(), + new AgentCertificateExpiryCheck(), + new AgentVersionConsistencyCheck(), + new AgentCapacityCheck(), + new StaleAgentCheck(), + new AgentClusterHealthCheck(), + new TaskQueueBacklogCheck(), + new FailedTaskRateCheck(), + new AgentResourceUtilizationCheck() + }.ToImmutableArray(); +} + +public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck +{ + public string Name => "Agent Heartbeat Freshness"; + public CheckSeverity Severity => CheckSeverity.Critical; + + public async Task ExecuteAsync(CancellationToken ct) + { + var agents = await _agentStore.GetAllAsync(ct); + var staleAgents = new List(); + var warningAgents = new List(); + + foreach (var agent in agents.Where(a => a.Status != AgentStatus.Deactivated)) + { + var heartbeatAge = _timeProvider.GetUtcNow() - agent.LastHeartbeat; + + if (heartbeatAge > TimeSpan.FromMinutes(5)) + { + staleAgents.Add($"{agent.Name} (last heartbeat: {heartbeatAge.TotalMinutes:F0}m ago)"); + } + else if (heartbeatAge > TimeSpan.FromMinutes(2)) + { + warningAgents.Add($"{agent.Name} (last heartbeat: {heartbeatAge.TotalSeconds:F0}s ago)"); + } + } + + if (staleAgents.Any()) + { + return new DoctorCheckResult + { + Status = CheckStatus.Critical, + Message = $"{staleAgents.Count} agent(s) have stale heartbeats", + Details = staleAgents, + Remediation = "Check agent connectivity and status. Run 'stella agent doctor --agent-id ' for diagnostics." + }; + } + + if (warningAgents.Any()) + { + return new DoctorCheckResult + { + Status = CheckStatus.Warning, + Message = $"{warningAgents.Count} agent(s) have delayed heartbeats", + Details = warningAgents + }; + } + + return new DoctorCheckResult + { + Status = CheckStatus.Healthy, + Message = $"All {agents.Count} agents have fresh heartbeats" + }; + } +} + +public sealed class AgentCertificateExpiryCheck : IDoctorCheck +{ + public string Name => "Agent Certificate Expiry"; + public CheckSeverity Severity => CheckSeverity.High; + + public async Task ExecuteAsync(CancellationToken ct) + { + var agents = await _agentStore.GetAllAsync(ct); + var expiringSoon = new List(); + var expired = new List(); + + foreach (var agent in agents) + { + var expiresIn = agent.CertificateExpiry - _timeProvider.GetUtcNow(); + + if (expiresIn <= TimeSpan.Zero) + { + expired.Add($"{agent.Name} (expired {-expiresIn.TotalDays:F0} days ago)"); + } + else if (expiresIn <= TimeSpan.FromDays(7)) + { + expiringSoon.Add($"{agent.Name} (expires in {expiresIn.TotalDays:F0} days)"); + } + } + + if (expired.Any()) + { + return new DoctorCheckResult + { + Status = CheckStatus.Critical, + Message = $"{expired.Count} agent(s) have expired certificates", + Details = expired, + Remediation = "Renew certificates immediately: 'stella agent renew-cert --agent-id '" + }; + } + + if (expiringSoon.Any()) + { + return new DoctorCheckResult + { + Status = CheckStatus.Warning, + Message = $"{expiringSoon.Count} agent(s) have certificates expiring soon", + Details = expiringSoon, + Remediation = "Schedule certificate renewal before expiry" + }; + } + + return new DoctorCheckResult + { + Status = CheckStatus.Healthy, + Message = "All agent certificates are valid" + }; + } +} +``` + +--- + +## Configuration Examples + +### Minimal Configuration (Bootstrap) + +```yaml +# Bootstrapped agent - minimal config required +agent: + name: prod-agent-01 + orchestrator_url: https://orchestrator.example.com:8443 + # Everything else is auto-configured via bootstrap +``` + +### Full Configuration + +```yaml +agent: + # Identity + id: a1b2c3d4-e5f6-7890-abcd-ef1234567890 + name: prod-agent-01 + environment: production + labels: + region: us-east-1 + tier: web + + # Connection + orchestrator_url: https://orchestrator.example.com:8443 + heartbeat_interval: 30s + reconnect_backoff: 5s + max_reconnect_attempts: 10 + + # Capabilities + capabilities: + - docker + - compose + - health_check + + # Resources + max_concurrent_tasks: 5 + default_task_timeout: 30m + resource_limits: + cpu_percent: 80 + memory_percent: 80 + disk_percent: 90 + + # Certificates + certificates: + source: auto_provision # auto_provision | file | vault + auto_renew: true + renewal_threshold: 7d + + # Clustering (optional) + cluster: + id: prod-cluster-01 + mode: active_active # active_passive | active_active | sharded + min_members: 2 + + # Observability + observability: + metrics: + enabled: true + port: 9090 + logging: + level: info + format: json + tracing: + enabled: true + endpoint: http://jaeger:14268/api/traces + + # Auto-update (optional) + auto_update: + enabled: true + channel: stable # stable | beta | canary + maintenance_window: "0 3 * * *" # 3 AM daily + drain_before_update: true +``` + +--- + +## CLI Quick Reference + +```bash +# Bootstrap new agent +stella agent bootstrap --name prod-01 --env production --platform linux + +# Run health diagnostics +stella agent doctor +stella agent doctor --category security --fix +stella agent doctor --agent-id abc123 --format json + +# View/apply configuration +stella agent config +stella agent config --diff +stella agent apply -f agent-config.yaml + +# Certificate management +stella agent renew-cert +stella agent renew-cert --force + +# Logs and debugging +stella agent logs --tail 100 +stella agent logs --follow --level error + +# Updates +stella agent update +stella agent update --version 2.1.0 + +# Status and health +stella agent status +stella agent list --env production +stella agent health abc123 +``` + +--- + +## Metrics & Observability + +### Prometheus Metrics + +``` +# Bootstrap +stella_agent_bootstrap_total{environment, platform} +stella_agent_bootstrap_success_total{environment} +stella_agent_bootstrap_failed_total{environment, reason} + +# Configuration +stella_agent_config_drift_detected_total{agent_id} +stella_agent_config_apply_total{agent_id, status} + +# Certificates +stella_agent_certificate_expiry_seconds{agent_id} +stella_agent_certificate_renewal_total{agent_id, status} + +# Health Checks +stella_agent_health_check_total{agent_id, check_name, status} +stella_agent_health_score{agent_id} + +# Updates +stella_agent_update_available{agent_id, current_version, available_version} +stella_agent_update_applied_total{agent_id, status} +stella_agent_update_rollback_total{agent_id} +``` + +--- + +## Test Strategy + +### Unit Tests +- Bootstrap token generation and validation +- Configuration diff computation +- Certificate lifecycle logic +- Health check execution +- Remediation matching + +### Integration Tests +- Full bootstrap flow +- Configuration apply with rollback +- Certificate renewal +- Auto-update with rollback +- Doctor diagnostics + +### E2E Tests +- Bootstrap to running agent +- Multi-agent cluster formation +- Failover scenarios +- Update and rollback scenarios + +--- + +## Migration Path + +### Phase 1: Bootstrap Service (Week 1-2) +- Bootstrap token service +- One-line installer generation +- Platform-specific install scripts + +### Phase 2: Configuration Manager (Week 3-4) +- Declarative configuration model +- Drift detection +- Apply with rollback + +### Phase 3: Certificate Manager (Week 5-6) +- Auto-provisioning +- Auto-renewal +- Multi-source support (Vault, ACME, etc.) + +### Phase 4: Agent Doctor (Week 7-8) +- Core health checks +- Remediation engine +- CLI integration + +### Phase 5: Doctor Plugin (Week 9-10) +- Server-side fleet health +- Dashboard integration +- Alerting rules + +### Phase 6: Auto-Update (Week 11-12) +- Update service +- Safe rollback +- Maintenance windows diff --git a/docs/modules/release-orchestrator/enhancements/agent-resilience.md b/docs/modules/release-orchestrator/enhancements/agent-resilience.md new file mode 100644 index 000000000..136dbecef --- /dev/null +++ b/docs/modules/release-orchestrator/enhancements/agent-resilience.md @@ -0,0 +1,1111 @@ +# Agent Resilience + +## Overview + +Agent Resilience transforms the deployment agent architecture into a highly available, fault-tolerant system. This enhancement provides agent clustering for high availability, automatic failover during deployments, offline task queuing, and self-healing capabilities. + +This is a best-in-class implementation that ensures deployments complete successfully even when individual agents fail, network partitions occur, or agents need maintenance. + +--- + +## Design Principles + +1. **Zero Downtime Deployments**: Agent failures don't block deployments +2. **Automatic Recovery**: Self-healing without operator intervention +3. **Graceful Degradation**: Reduced capacity vs. complete failure +4. **Offline Resilience**: Queue tasks for disconnected agents +5. **Transparent Failover**: Seamless handoff between agents +6. **Predictable Behavior**: Deterministic failover decisions + +--- + +## Architecture + +### Component Overview + +``` +┌────────────────────────────────────────────────────────────────────────┐ +│ Agent Resilience System │ +├────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ AgentCluster │───▶│ FailoverManager │───▶│ TaskRouter │ │ +│ │ Manager │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ HealthMonitor │ │ LeaderElection │ │ TaskQueue │ │ +│ │ │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ SelfHealer │ │ StateSync │ │ RetryManager │ │ +│ │ │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Components + +#### 1. AgentClusterManager + +Manages agent clusters for high availability: + +```csharp +public sealed class AgentClusterManager +{ + public async Task CreateClusterAsync( + AgentClusterConfig config, + CancellationToken ct) + { + var cluster = new AgentCluster + { + Id = Guid.NewGuid(), + Name = config.Name, + TargetGroupId = config.TargetGroupId, + MinimumAgents = config.MinimumAgents, + DesiredAgents = config.DesiredAgents, + ReplicationMode = config.ReplicationMode, + FailoverPolicy = config.FailoverPolicy, + CreatedAt = _timeProvider.GetUtcNow() + }; + + await _clusterStore.SaveAsync(cluster, ct); + return cluster; + } + + public async Task> GetClusterMembersAsync( + Guid clusterId, + CancellationToken ct) + { + var cluster = await _clusterStore.GetAsync(clusterId, ct); + var agents = await _agentStore.GetByClusterAsync(clusterId, ct); + + return agents.Select(a => new AgentMember + { + AgentId = a.Id, + HostName = a.HostName, + Status = a.Status, + Role = DetermineRole(a, cluster), + LastHeartbeat = a.LastHeartbeat, + Capabilities = a.Capabilities, + CurrentLoad = a.CurrentTaskCount, + MaxLoad = a.MaxConcurrentTasks + }).ToList(); + } + + private AgentRole DetermineRole(Agent agent, AgentCluster cluster) + { + if (cluster.LeaderId == agent.Id) + return AgentRole.Leader; + + if (cluster.StandbyIds.Contains(agent.Id)) + return AgentRole.Standby; + + return AgentRole.Member; + } +} + +public sealed record AgentCluster +{ + public Guid Id { get; init; } + public string Name { get; init; } + public Guid TargetGroupId { get; init; } + + // Membership + public int MinimumAgents { get; init; } + public int DesiredAgents { get; init; } + public Guid? LeaderId { get; init; } + public ImmutableArray StandbyIds { get; init; } + + // Configuration + public ReplicationMode ReplicationMode { get; init; } + public FailoverPolicy FailoverPolicy { get; init; } + + // Status + public ClusterStatus Status { get; init; } + public int HealthyAgentCount { get; init; } + public DateTimeOffset CreatedAt { get; init; } +} + +public enum ReplicationMode +{ + ActivePassive, // One active, others standby + ActiveActive, // All agents handle tasks + Sharded // Tasks partitioned across agents +} + +public enum AgentRole +{ + Leader, // Primary agent (ActivePassive mode) + Standby, // Ready to take over + Member // Active participant (ActiveActive mode) +} +``` + +#### 2. HealthMonitor + +Monitors agent health with sophisticated detection: + +```csharp +public sealed class HealthMonitor +{ + private readonly ConcurrentDictionary _healthStates = new(); + + public async Task ProcessHeartbeatAsync( + AgentHeartbeat heartbeat, + CancellationToken ct) + { + var state = _healthStates.GetOrAdd(heartbeat.AgentId, _ => new AgentHealthState()); + + state.LastHeartbeat = heartbeat.Timestamp; + state.ReportedHealth = heartbeat.Health; + state.CurrentLoad = heartbeat.TaskCount; + state.ResourceMetrics = heartbeat.ResourceMetrics; + + // Update health assessment + state.AssessedHealth = await AssessHealthAsync(heartbeat, state, ct); + + // Check for degradation + if (state.AssessedHealth < HealthLevel.Healthy) + { + await HandleDegradationAsync(heartbeat.AgentId, state, ct); + } + + // Emit metrics + _metricsEmitter.EmitAgentHealth(heartbeat.AgentId, state); + } + + private async Task AssessHealthAsync( + AgentHeartbeat heartbeat, + AgentHealthState state, + CancellationToken ct) + { + var factors = new List(); + + // 1. Self-reported health + factors.Add(new HealthFactor("self_reported", heartbeat.Health, 0.2)); + + // 2. Heartbeat regularity + var heartbeatScore = CalculateHeartbeatScore(state); + factors.Add(new HealthFactor("heartbeat_regularity", heartbeatScore, 0.3)); + + // 3. Task completion rate + var completionRate = await GetTaskCompletionRateAsync(heartbeat.AgentId, ct); + factors.Add(new HealthFactor("task_completion", completionRate, 0.25)); + + // 4. Resource utilization + var resourceScore = CalculateResourceScore(heartbeat.ResourceMetrics); + factors.Add(new HealthFactor("resource_utilization", resourceScore, 0.15)); + + // 5. Error rate + var errorRate = await GetErrorRateAsync(heartbeat.AgentId, ct); + factors.Add(new HealthFactor("error_rate", 1.0 - errorRate, 0.1)); + + // Weighted average + var overallScore = factors.Sum(f => f.Score * f.Weight); + + return overallScore switch + { + >= 0.9 => HealthLevel.Healthy, + >= 0.7 => HealthLevel.Degraded, + >= 0.5 => HealthLevel.Warning, + >= 0.3 => HealthLevel.Critical, + _ => HealthLevel.Failed + }; + } + + public async Task DetectFailuresAsync(CancellationToken ct) + { + var now = _timeProvider.GetUtcNow(); + + foreach (var (agentId, state) in _healthStates) + { + var timeSinceHeartbeat = now - state.LastHeartbeat; + + if (timeSinceHeartbeat > _config.FailureThreshold) + { + await HandleAgentFailureAsync(agentId, state, ct); + } + else if (timeSinceHeartbeat > _config.WarningThreshold) + { + await HandleAgentWarningAsync(agentId, state, ct); + } + } + } + + private async Task HandleAgentFailureAsync( + Guid agentId, + AgentHealthState state, + CancellationToken ct) + { + _logger.LogWarning("Agent {AgentId} detected as failed", agentId); + + // Update state + state.AssessedHealth = HealthLevel.Failed; + state.FailedAt = _timeProvider.GetUtcNow(); + + // Notify failover manager + await _eventPublisher.PublishAsync(new AgentFailedEvent(agentId, state), ct); + + // Mark agent as offline + await _agentStore.UpdateStatusAsync(agentId, AgentStatus.Offline, ct); + } +} + +public sealed class AgentHealthState +{ + public DateTimeOffset LastHeartbeat { get; set; } + public HealthLevel ReportedHealth { get; set; } + public HealthLevel AssessedHealth { get; set; } + public int CurrentLoad { get; set; } + public ResourceMetrics ResourceMetrics { get; set; } + public DateTimeOffset? FailedAt { get; set; } + public int ConsecutiveFailures { get; set; } +} + +public enum HealthLevel +{ + Healthy = 100, + Degraded = 75, + Warning = 50, + Critical = 25, + Failed = 0 +} +``` + +#### 3. FailoverManager + +Orchestrates failover between agents: + +```csharp +public sealed class FailoverManager +{ + public async Task PerformFailoverAsync( + FailoverRequest request, + CancellationToken ct) + { + var result = new FailoverResult + { + RequestId = Guid.NewGuid(), + FailedAgentId = request.FailedAgentId, + StartedAt = _timeProvider.GetUtcNow() + }; + + try + { + // 1. Find cluster + var cluster = await _clusterStore.GetByAgentAsync(request.FailedAgentId, ct); + if (cluster == null) + { + result.Status = FailoverStatus.NotInCluster; + return result; + } + + // 2. Select failover target + var target = await SelectFailoverTargetAsync(cluster, request, ct); + if (target == null) + { + result.Status = FailoverStatus.NoTargetAvailable; + await HandleNoTargetAsync(cluster, request, ct); + return result; + } + + result.TargetAgentId = target.AgentId; + + // 3. Transfer in-flight tasks + var tasksToTransfer = await GetInFlightTasksAsync(request.FailedAgentId, ct); + result.TasksTransferred = tasksToTransfer.Count; + + foreach (var task in tasksToTransfer) + { + await TransferTaskAsync(task, target.AgentId, ct); + } + + // 4. Update cluster membership + if (cluster.LeaderId == request.FailedAgentId) + { + await PromoteToLeaderAsync(cluster, target.AgentId, ct); + } + + // 5. Update target assignments + await ReassignTargetsAsync(request.FailedAgentId, target.AgentId, ct); + + result.Status = FailoverStatus.Succeeded; + result.CompletedAt = _timeProvider.GetUtcNow(); + + // Emit event + await _eventPublisher.PublishAsync(new FailoverCompletedEvent(result), ct); + } + catch (Exception ex) + { + result.Status = FailoverStatus.Failed; + result.Error = ex.Message; + _logger.LogError(ex, "Failover failed for agent {AgentId}", request.FailedAgentId); + } + + return result; + } + + private async Task SelectFailoverTargetAsync( + AgentCluster cluster, + FailoverRequest request, + CancellationToken ct) + { + var candidates = await _clusterManager.GetClusterMembersAsync(cluster.Id, ct); + + // Filter healthy agents + candidates = candidates + .Where(a => a.AgentId != request.FailedAgentId) + .Where(a => a.Status == AgentStatus.Online) + .Where(a => a.HasCapability(request.RequiredCapabilities)) + .ToList(); + + if (!candidates.Any()) + return null; + + // Apply selection strategy + return cluster.FailoverPolicy.SelectionStrategy switch + { + FailoverSelectionStrategy.Standby => + candidates.FirstOrDefault(a => a.Role == AgentRole.Standby) ?? + candidates.OrderBy(a => a.CurrentLoad).First(), + + FailoverSelectionStrategy.LeastLoaded => + candidates.OrderBy(a => a.CurrentLoad / (double)a.MaxLoad).First(), + + FailoverSelectionStrategy.RoundRobin => + SelectRoundRobin(cluster, candidates), + + FailoverSelectionStrategy.Affinity => + SelectByAffinity(candidates, request.AffinityHints), + + _ => candidates.First() + }; + } + + private async Task TransferTaskAsync( + AgentTask task, + Guid targetAgentId, + CancellationToken ct) + { + // Mark task as transferred + task.TransferredFrom = task.AssignedAgentId; + task.AssignedAgentId = targetAgentId; + task.TransferredAt = _timeProvider.GetUtcNow(); + + // Reset task state for retry + if (task.Status == TaskStatus.Running) + { + task.Status = TaskStatus.Pending; + task.RetryCount++; + } + + await _taskStore.SaveAsync(task, ct); + + // Notify target agent + await _agentNotifier.NotifyTaskAssignedAsync(targetAgentId, task, ct); + } +} + +public sealed record FailoverResult +{ + public Guid RequestId { get; init; } + public Guid FailedAgentId { get; init; } + public Guid? TargetAgentId { get; init; } + public FailoverStatus Status { get; init; } + public int TasksTransferred { get; init; } + public string? Error { get; init; } + public DateTimeOffset StartedAt { get; init; } + public DateTimeOffset? CompletedAt { get; init; } +} + +public enum FailoverStatus +{ + Succeeded, + NotInCluster, + NoTargetAvailable, + Failed +} +``` + +#### 4. LeaderElection + +Manages leader election for ActivePassive clusters: + +```csharp +public sealed class LeaderElection +{ + private readonly IDistributedLockProvider _lockProvider; + + public async Task RunElectionAsync( + Guid clusterId, + CancellationToken ct) + { + var cluster = await _clusterStore.GetAsync(clusterId, ct); + var members = await _clusterManager.GetClusterMembersAsync(clusterId, ct); + + var healthyMembers = members + .Where(m => m.Status == AgentStatus.Online) + .OrderByDescending(m => m.Role == AgentRole.Standby) // Prefer standbys + .ThenBy(m => m.CurrentLoad) // Then least loaded + .ToList(); + + if (!healthyMembers.Any()) + { + _logger.LogWarning("No healthy members for cluster {ClusterId}", clusterId); + return; + } + + // Acquire distributed lock for election + await using var @lock = await _lockProvider.AcquireAsync( + $"cluster:{clusterId}:election", ct); + + // Re-read cluster state under lock + cluster = await _clusterStore.GetAsync(clusterId, ct); + + // Check if current leader is healthy + var currentLeader = healthyMembers.FirstOrDefault(m => m.AgentId == cluster.LeaderId); + if (currentLeader != null) + { + _logger.LogDebug("Current leader {LeaderId} is healthy", cluster.LeaderId); + return; + } + + // Elect new leader + var newLeader = healthyMembers.First(); + await PromoteToLeaderAsync(cluster, newLeader.AgentId, ct); + + _logger.LogInformation( + "Elected new leader {NewLeaderId} for cluster {ClusterId}", + newLeader.AgentId, clusterId); + } + + private async Task PromoteToLeaderAsync( + AgentCluster cluster, + Guid newLeaderId, + CancellationToken ct) + { + var previousLeaderId = cluster.LeaderId; + + // Update cluster + cluster = cluster with { LeaderId = newLeaderId }; + + // Update standby list + var newStandbys = cluster.StandbyIds + .Where(id => id != newLeaderId) + .ToImmutableArray(); + + if (previousLeaderId.HasValue) + { + // Demote previous leader to standby if still healthy + var previousLeader = await _agentStore.GetAsync(previousLeaderId.Value, ct); + if (previousLeader?.Status == AgentStatus.Online) + { + newStandbys = newStandbys.Add(previousLeaderId.Value); + } + } + + cluster = cluster with { StandbyIds = newStandbys }; + await _clusterStore.SaveAsync(cluster, ct); + + // Notify agents + await _agentNotifier.NotifyLeaderChangeAsync(cluster.Id, newLeaderId, ct); + + // Emit event + await _eventPublisher.PublishAsync(new LeaderElectedEvent( + cluster.Id, newLeaderId, previousLeaderId), ct); + } +} +``` + +#### 5. TaskQueue + +Durable task queue for offline agents: + +```csharp +public sealed class TaskQueue +{ + private readonly ITaskQueueStore _store; + + public async Task EnqueueAsync( + AgentTask task, + EnqueueOptions options, + CancellationToken ct) + { + var queuedTask = new QueuedTask + { + Id = Guid.NewGuid(), + Task = task, + Priority = options.Priority, + EnqueuedAt = _timeProvider.GetUtcNow(), + ExpiresAt = options.ExpiresAt, + TargetAgentId = options.TargetAgentId, + TargetClusterId = options.TargetClusterId, + RequiredCapabilities = options.RequiredCapabilities, + DeliveryAttempts = 0, + MaxDeliveryAttempts = options.MaxDeliveryAttempts + }; + + await _store.SaveAsync(queuedTask, ct); + return queuedTask.Id; + } + + public async Task DequeueAsync( + Guid agentId, + ImmutableArray capabilities, + CancellationToken ct) + { + // Find eligible tasks + var tasks = await _store.GetPendingTasksAsync(agentId, capabilities, ct); + + foreach (var task in tasks.OrderByDescending(t => t.Priority)) + { + // Check expiration + if (task.ExpiresAt.HasValue && task.ExpiresAt < _timeProvider.GetUtcNow()) + { + await ExpireTaskAsync(task, ct); + continue; + } + + // Try to claim task + var claimed = await _store.TryClaimAsync(task.Id, agentId, ct); + if (claimed) + { + task.DeliveryAttempts++; + task.LastAttemptAt = _timeProvider.GetUtcNow(); + task.ClaimedBy = agentId; + await _store.SaveAsync(task, ct); + return task; + } + } + + return null; + } + + public async Task CompleteAsync(Guid taskId, TaskResult result, CancellationToken ct) + { + var task = await _store.GetAsync(taskId, ct); + if (task == null) + return; + + task.CompletedAt = _timeProvider.GetUtcNow(); + task.Result = result; + task.Status = result.Success ? QueuedTaskStatus.Completed : QueuedTaskStatus.Failed; + + await _store.SaveAsync(task, ct); + + // Archive or retry + if (task.Status == QueuedTaskStatus.Completed) + { + await _store.ArchiveAsync(taskId, ct); + } + else if (task.DeliveryAttempts < task.MaxDeliveryAttempts) + { + await RetryAsync(task, ct); + } + else + { + await _store.MoveToDeadLetterAsync(taskId, ct); + } + } + + private async Task RetryAsync(QueuedTask task, CancellationToken ct) + { + var delay = CalculateBackoff(task.DeliveryAttempts); + task.Status = QueuedTaskStatus.Pending; + task.ClaimedBy = null; + task.NextAttemptAt = _timeProvider.GetUtcNow().Add(delay); + await _store.SaveAsync(task, ct); + } + + private TimeSpan CalculateBackoff(int attempts) + { + // Exponential backoff with jitter + var baseDelay = TimeSpan.FromSeconds(Math.Pow(2, attempts)); + var jitter = TimeSpan.FromMilliseconds(Random.Shared.Next(0, 1000)); + var maxDelay = TimeSpan.FromMinutes(5); + return Min(baseDelay + jitter, maxDelay); + } +} + +public sealed record QueuedTask +{ + public Guid Id { get; init; } + public AgentTask Task { get; init; } + public TaskPriority Priority { get; init; } + public QueuedTaskStatus Status { get; init; } + + // Targeting + public Guid? TargetAgentId { get; init; } + public Guid? TargetClusterId { get; init; } + public ImmutableArray RequiredCapabilities { get; init; } + + // Timing + public DateTimeOffset EnqueuedAt { get; init; } + public DateTimeOffset? ExpiresAt { get; init; } + public DateTimeOffset? NextAttemptAt { get; init; } + public DateTimeOffset? CompletedAt { get; init; } + + // Delivery + public int DeliveryAttempts { get; set; } + public int MaxDeliveryAttempts { get; init; } + public DateTimeOffset? LastAttemptAt { get; set; } + public Guid? ClaimedBy { get; set; } + + // Result + public TaskResult? Result { get; set; } +} +``` + +#### 6. SelfHealer + +Automatic recovery and self-healing: + +```csharp +public sealed class SelfHealer +{ + public async Task RunHealingCycleAsync(CancellationToken ct) + { + var healingActions = new List(); + + // 1. Detect unhealthy agents + var unhealthyAgents = await DetectUnhealthyAgentsAsync(ct); + foreach (var agent in unhealthyAgents) + { + var action = await DetermineHealingActionAsync(agent, ct); + if (action != null) + { + healingActions.Add(action); + } + } + + // 2. Detect orphaned tasks + var orphanedTasks = await DetectOrphanedTasksAsync(ct); + foreach (var task in orphanedTasks) + { + healingActions.Add(new HealingAction + { + Type = HealingActionType.ReassignTask, + TargetId = task.Id, + Reason = "Task orphaned after agent failure" + }); + } + + // 3. Detect under-replicated clusters + var underReplicatedClusters = await DetectUnderReplicatedClustersAsync(ct); + foreach (var cluster in underReplicatedClusters) + { + healingActions.Add(new HealingAction + { + Type = HealingActionType.RebalanceCluster, + TargetId = cluster.Id, + Reason = $"Cluster has {cluster.HealthyAgentCount}/{cluster.DesiredAgents} agents" + }); + } + + // 4. Execute healing actions + foreach (var action in healingActions.OrderByDescending(a => a.Priority)) + { + await ExecuteHealingActionAsync(action, ct); + } + } + + private async Task DetermineHealingActionAsync( + Agent agent, + CancellationToken ct) + { + var health = await _healthMonitor.GetHealthStateAsync(agent.Id, ct); + + return health.AssessedHealth switch + { + HealthLevel.Degraded => new HealingAction + { + Type = HealingActionType.DrainAgent, + TargetId = agent.Id, + Reason = "Agent degraded, draining tasks" + }, + + HealthLevel.Warning => new HealingAction + { + Type = HealingActionType.ReduceLoad, + TargetId = agent.Id, + Reason = "Agent showing warnings, reducing load" + }, + + HealthLevel.Critical or HealthLevel.Failed => new HealingAction + { + Type = HealingActionType.FailoverAgent, + TargetId = agent.Id, + Reason = $"Agent health critical: {health.AssessedHealth}" + }, + + _ => null + }; + } + + private async Task ExecuteHealingActionAsync( + HealingAction action, + CancellationToken ct) + { + _logger.LogInformation( + "Executing healing action {ActionType} on {TargetId}: {Reason}", + action.Type, action.TargetId, action.Reason); + + switch (action.Type) + { + case HealingActionType.FailoverAgent: + await _failoverManager.PerformFailoverAsync( + new FailoverRequest { FailedAgentId = action.TargetId }, ct); + break; + + case HealingActionType.DrainAgent: + await DrainAgentAsync(action.TargetId, ct); + break; + + case HealingActionType.ReduceLoad: + await ReduceAgentLoadAsync(action.TargetId, ct); + break; + + case HealingActionType.ReassignTask: + await ReassignTaskAsync(action.TargetId, ct); + break; + + case HealingActionType.RebalanceCluster: + await RebalanceClusterAsync(action.TargetId, ct); + break; + } + + // Record healing action + await _healingStore.RecordAsync(action, ct); + } + + private async Task DrainAgentAsync(Guid agentId, CancellationToken ct) + { + // Stop accepting new tasks + await _agentStore.UpdateStatusAsync(agentId, AgentStatus.Draining, ct); + + // Wait for in-flight tasks to complete (with timeout) + var timeout = _timeProvider.GetUtcNow().AddMinutes(5); + while (_timeProvider.GetUtcNow() < timeout) + { + var inFlightTasks = await _taskStore.GetInFlightTasksAsync(agentId, ct); + if (!inFlightTasks.Any()) + break; + + await Task.Delay(TimeSpan.FromSeconds(5), ct); + } + + // Force transfer remaining tasks + var remainingTasks = await _taskStore.GetInFlightTasksAsync(agentId, ct); + foreach (var task in remainingTasks) + { + await _failoverManager.TransferTaskAsync(task, ct); + } + } +} +``` + +#### 7. StateSync + +Synchronizes state across cluster members: + +```csharp +public sealed class StateSync +{ + public async Task SyncClusterStateAsync( + Guid clusterId, + CancellationToken ct) + { + var cluster = await _clusterStore.GetAsync(clusterId, ct); + var members = await _clusterManager.GetClusterMembersAsync(clusterId, ct); + var leader = members.FirstOrDefault(m => m.Role == AgentRole.Leader); + + if (leader == null) + { + _logger.LogWarning("No leader for cluster {ClusterId}, skipping sync", clusterId); + return; + } + + // Get leader's state + var leaderState = await GetAgentStateAsync(leader.AgentId, ct); + + // Sync to other members + foreach (var member in members.Where(m => m.Role != AgentRole.Leader)) + { + await SyncToMemberAsync(member.AgentId, leaderState, ct); + } + } + + private async Task SyncToMemberAsync( + Guid agentId, + AgentState leaderState, + CancellationToken ct) + { + var memberState = await GetAgentStateAsync(agentId, ct); + var diff = CalculateStateDiff(leaderState, memberState); + + if (diff.HasChanges) + { + _logger.LogDebug( + "Syncing {ChangeCount} changes to agent {AgentId}", + diff.Changes.Count, agentId); + + await _agentNotifier.SendStateSyncAsync(agentId, diff, ct); + } + } +} + +public sealed record AgentState +{ + public Guid AgentId { get; init; } + public DateTimeOffset CapturedAt { get; init; } + + // Target assignments + public ImmutableArray AssignedTargets { get; init; } + + // Task state + public ImmutableArray TaskStates { get; init; } + + // Configuration + public AgentConfiguration Configuration { get; init; } + + // Cached data + public ImmutableDictionary CachedDigests { get; init; } +} +``` + +--- + +## Cluster Topologies + +### Active-Passive + +``` +┌─────────────────────────────────────────┐ +│ Agent Cluster │ +│ │ +│ ┌─────────┐ ┌─────────┐ │ +│ │ LEADER │ │ STANDBY │ │ +│ │ Agent A │ │ Agent B │ │ +│ │ (Active)│ │(Passive)│ │ +│ └────┬────┘ └────┬────┘ │ +│ │ │ │ +│ ▼ │ (failover) │ +│ ┌─────────┐ │ │ +│ │ Targets │◄────────┘ │ +│ └─────────┘ │ +└─────────────────────────────────────────┘ +``` + +### Active-Active + +``` +┌─────────────────────────────────────────┐ +│ Agent Cluster │ +│ │ +│ ┌─────────┐ ┌─────────┐ │ +│ │ Agent A │ │ Agent B │ │ +│ │ (Active)│ │ (Active)│ │ +│ └────┬────┘ └────┬────┘ │ +│ │ │ │ +│ └──────┬───────┘ │ +│ ▼ │ +│ ┌─────────────────────┐ │ +│ │ Targets (balanced) │ │ +│ └─────────────────────┘ │ +└─────────────────────────────────────────┘ +``` + +### Sharded + +``` +┌─────────────────────────────────────────┐ +│ Agent Cluster │ +│ │ +│ ┌─────────┐ ┌─────────┐ │ +│ │ Agent A │ │ Agent B │ │ +│ │ Shard 0 │ │ Shard 1 │ │ +│ └────┬────┘ └────┬────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌─────────┐ ┌─────────┐ │ +│ │Targets │ │Targets │ │ +│ │ 0-49 │ │ 50-99 │ │ +│ └─────────┘ └─────────┘ │ +└─────────────────────────────────────────┘ +``` + +--- + +## API Design + +### REST Endpoints + +``` +# Clusters +POST /api/v1/agents/clusters # Create cluster +GET /api/v1/agents/clusters # List clusters +GET /api/v1/agents/clusters/{id} # Get cluster +PUT /api/v1/agents/clusters/{id} # Update cluster +DELETE /api/v1/agents/clusters/{id} # Delete cluster +GET /api/v1/agents/clusters/{id}/members # Get members +POST /api/v1/agents/clusters/{id}/rebalance # Trigger rebalance + +# Failover +POST /api/v1/agents/{id}/failover # Manual failover +GET /api/v1/agents/failovers # Failover history +GET /api/v1/agents/failovers/{id} # Failover details + +# Health +GET /api/v1/agents/{id}/health # Get agent health +GET /api/v1/agents/clusters/{id}/health # Get cluster health + +# Task Queue +GET /api/v1/agents/tasks/queue # View queue +GET /api/v1/agents/tasks/queue/dead-letter # Dead letter queue +POST /api/v1/agents/tasks/{id}/retry # Retry task + +# Self-Healing +GET /api/v1/agents/healing/actions # Healing history +GET /api/v1/agents/healing/status # Current healing status +``` + +--- + +## Metrics & Observability + +### Prometheus Metrics + +``` +# Cluster Health +stella_agent_cluster_members{cluster_id, status} +stella_agent_cluster_leader{cluster_id, agent_id} +stella_agent_cluster_health{cluster_id} + +# Failover +stella_agent_failovers_total{cluster_id, status} +stella_agent_failover_duration_seconds{cluster_id} +stella_agent_tasks_transferred_total{cluster_id} + +# Task Queue +stella_agent_queue_depth{cluster_id, priority} +stella_agent_queue_latency_seconds{cluster_id} +stella_agent_dead_letter_queue_depth{cluster_id} + +# Self-Healing +stella_agent_healing_actions_total{action_type, status} +stella_agent_healing_cycle_duration_seconds + +# Agent Health +stella_agent_health_score{agent_id} +stella_agent_heartbeat_age_seconds{agent_id} +stella_agent_task_completion_rate{agent_id} +``` + +--- + +## Configuration + +```yaml +agent_cluster: + name: "production-docker-agents" + target_group_id: "prod-docker-hosts" + + membership: + minimum_agents: 2 + desired_agents: 3 + max_agents: 5 + + replication_mode: active_active + + failover: + selection_strategy: least_loaded + task_transfer_timeout: "00:05:00" + max_transfer_retries: 3 + + health_monitoring: + heartbeat_interval: "00:00:30" + warning_threshold: "00:01:00" + failure_threshold: "00:01:30" + health_check_interval: "00:00:10" + + task_queue: + max_delivery_attempts: 3 + default_expiration: "01:00:00" + dead_letter_retention: "7.00:00:00" + + self_healing: + enabled: true + cycle_interval: "00:01:00" + drain_timeout: "00:05:00" + + leader_election: + enabled: true # For ActivePassive mode + election_interval: "00:00:15" + lease_duration: "00:00:30" +``` + +--- + +## Test Strategy + +### Unit Tests +- Health score calculation +- Failover target selection +- Task queue operations +- Backoff calculation + +### Integration Tests +- Full failover flow +- Leader election +- State synchronization +- Task transfer + +### Chaos Tests +- Random agent failures +- Network partitions +- Split-brain scenarios +- Cascading failures + +### Load Tests +- High task throughput +- Many concurrent agents +- Rapid failover cycles + +--- + +## Migration Path + +### Phase 1: Foundation (Week 1-2) +- Cluster data model +- Basic cluster management +- Health monitoring enhancements + +### Phase 2: Failover (Week 3-4) +- Failover manager +- Task transfer +- Target reassignment + +### Phase 3: Leader Election (Week 5-6) +- Distributed lock integration +- Election algorithm +- ActivePassive support + +### Phase 4: Task Queue (Week 7-8) +- Durable queue implementation +- Dead letter handling +- Retry logic + +### Phase 5: Self-Healing (Week 9-10) +- Healing cycle +- Automatic actions +- Monitoring integration + +### Phase 6: State Sync (Week 11-12) +- State diffing +- Sync protocol +- Consistency verification diff --git a/docs/modules/release-orchestrator/enhancements/compliance-reporting.md b/docs/modules/release-orchestrator/enhancements/compliance-reporting.md new file mode 100644 index 000000000..d63d2f6fa --- /dev/null +++ b/docs/modules/release-orchestrator/enhancements/compliance-reporting.md @@ -0,0 +1,1187 @@ +# Compliance & Reporting + +## Overview + +Compliance & Reporting transforms the Release Orchestrator's audit capabilities into a comprehensive compliance management system. This enhancement provides pre-built compliance report templates, evidence chain visualization, audit query interface, regulatory framework alignment, and automated compliance checking. + +This is a best-in-class implementation designed to meet the needs of enterprises operating under strict regulatory requirements (SOC2, ISO 27001, PCI-DSS, HIPAA, FedRAMP, GDPR). + +--- + +## Design Principles + +1. **Continuous Compliance**: Real-time compliance status, not periodic audits +2. **Evidence-First**: All compliance claims backed by cryptographic evidence +3. **Framework-Agnostic**: Adaptable to any regulatory framework +4. **Auditor-Friendly**: Reports designed for external auditor consumption +5. **Immutable Records**: Tamper-proof audit trail +6. **Automated Where Possible**: Reduce manual compliance burden + +--- + +## Architecture + +### Component Overview + +``` +┌────────────────────────────────────────────────────────────────────────┐ +│ Compliance & Reporting System │ +├────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ ComplianceEngine │───▶│ ReportGenerator │───▶│ EvidenceChain │ │ +│ │ │ │ │ │ Visualizer │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ FrameworkMapper │ │ AuditQueryEngine │ │ ControlValidator│ │ +│ │ │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ ExportService │ │ ScheduledReports │ │ AlertManager │ │ +│ │ │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Components + +#### 1. ComplianceEngine + +Core compliance evaluation engine: + +```csharp +public sealed class ComplianceEngine +{ + private readonly ImmutableArray _frameworks; + private readonly IControlValidator _validator; + private readonly IEvidenceStore _evidenceStore; + + public async Task EvaluateAsync( + ComplianceEvaluationRequest request, + CancellationToken ct) + { + var status = new ComplianceStatus + { + TenantId = request.TenantId, + EvaluatedAt = _timeProvider.GetUtcNow(), + Frameworks = new List() + }; + + foreach (var frameworkId in request.Frameworks) + { + var framework = _frameworks.First(f => f.Id == frameworkId); + var frameworkStatus = await EvaluateFrameworkAsync(framework, request, ct); + status.Frameworks.Add(frameworkStatus); + } + + // Calculate overall compliance score + status.OverallScore = CalculateOverallScore(status.Frameworks); + status.ComplianceLevel = DetermineComplianceLevel(status.OverallScore); + + return status; + } + + private async Task EvaluateFrameworkAsync( + IComplianceFramework framework, + ComplianceEvaluationRequest request, + CancellationToken ct) + { + var frameworkStatus = new FrameworkStatus + { + FrameworkId = framework.Id, + FrameworkName = framework.Name, + Version = framework.Version, + Controls = new List() + }; + + foreach (var control in framework.Controls) + { + var controlStatus = await EvaluateControlAsync(control, request, ct); + frameworkStatus.Controls.Add(controlStatus); + } + + // Calculate framework compliance + frameworkStatus.TotalControls = framework.Controls.Count; + frameworkStatus.PassedControls = frameworkStatus.Controls.Count(c => c.Status == ControlEvaluationStatus.Passed); + frameworkStatus.FailedControls = frameworkStatus.Controls.Count(c => c.Status == ControlEvaluationStatus.Failed); + frameworkStatus.NotApplicableControls = frameworkStatus.Controls.Count(c => c.Status == ControlEvaluationStatus.NotApplicable); + frameworkStatus.Score = (double)frameworkStatus.PassedControls / + (frameworkStatus.TotalControls - frameworkStatus.NotApplicableControls) * 100; + + return frameworkStatus; + } + + private async Task EvaluateControlAsync( + ComplianceControl control, + ComplianceEvaluationRequest request, + CancellationToken ct) + { + var controlStatus = new ControlStatus + { + ControlId = control.Id, + ControlName = control.Name, + Category = control.Category, + Description = control.Description, + Evidence = new List() + }; + + // Validate control + var validationResult = await _validator.ValidateAsync(control, request, ct); + controlStatus.Status = validationResult.Status; + controlStatus.Findings = validationResult.Findings; + + // Collect evidence + var evidence = await _evidenceStore.GetEvidenceForControlAsync( + request.TenantId, control.Id, request.DateRange, ct); + controlStatus.Evidence = evidence.Select(e => new EvidenceReference + { + EvidenceId = e.Id, + Type = e.Type, + CollectedAt = e.CollectedAt, + Summary = e.Summary + }).ToList(); + + return controlStatus; + } +} + +public sealed record ComplianceStatus +{ + public Guid TenantId { get; init; } + public DateTimeOffset EvaluatedAt { get; init; } + public double OverallScore { get; init; } + public ComplianceLevel ComplianceLevel { get; init; } + public List Frameworks { get; init; } +} + +public enum ComplianceLevel +{ + FullyCompliant, // 100% + SubstantiallyCompliant, // 90-99% + PartiallyCompliant, // 70-89% + NonCompliant // <70% +} +``` + +#### 2. FrameworkMapper + +Maps organizational controls to compliance frameworks: + +```csharp +public sealed class FrameworkMapper +{ + private readonly ImmutableDictionary _frameworks; + + public FrameworkMapper() + { + _frameworks = LoadFrameworks().ToImmutableDictionary(f => f.Id); + } + + private IEnumerable LoadFrameworks() + { + yield return new Soc2Framework(); + yield return new Iso27001Framework(); + yield return new PciDssFramework(); + yield return new HipaaFramework(); + yield return new FedRampFramework(); + yield return new GdprFramework(); + yield return new NistCsfFramework(); + } + + public IReadOnlyList MapToFramework( + string frameworkId, + IReadOnlyList orgControls) + { + var framework = _frameworks[frameworkId]; + var mappings = new List(); + + foreach (var frameworkControl in framework.Controls) + { + var mapping = new ControlMapping + { + FrameworkControl = frameworkControl, + MappedOrgControls = new List() + }; + + // Find matching organizational controls + foreach (var orgControl in orgControls) + { + if (IsMatch(frameworkControl, orgControl)) + { + mapping.MappedOrgControls.Add(orgControl); + } + } + + mapping.CoverageStatus = mapping.MappedOrgControls.Any() + ? CoverageStatus.Covered + : CoverageStatus.Gap; + + mappings.Add(mapping); + } + + return mappings; + } + + private bool IsMatch(ComplianceControl frameworkControl, OrganizationalControl orgControl) + { + // Check explicit mappings + if (orgControl.FrameworkMappings?.Contains(frameworkControl.Id) == true) + return true; + + // Check keyword matching + var keywords = frameworkControl.Keywords ?? ImmutableArray.Empty; + return keywords.Any(k => orgControl.Description?.Contains(k, StringComparison.OrdinalIgnoreCase) == true); + } +} + +// SOC 2 Framework Implementation +public sealed class Soc2Framework : IComplianceFramework +{ + public string Id => "soc2-type2"; + public string Name => "SOC 2 Type II"; + public string Version => "2017"; + + public ImmutableArray Controls => new[] + { + // Security (Common Criteria) + new ComplianceControl + { + Id = "CC1.1", + Name = "COSO Principle 1", + Category = "Control Environment", + Description = "The entity demonstrates a commitment to integrity and ethical values.", + Keywords = new[] { "integrity", "ethics", "code of conduct" }.ToImmutableArray() + }, + new ComplianceControl + { + Id = "CC6.1", + Name = "Logical and Physical Access Controls", + Category = "Logical and Physical Access", + Description = "The entity implements logical access security software, infrastructure, and architectures.", + Keywords = new[] { "access control", "authentication", "authorization", "mTLS" }.ToImmutableArray(), + AutomatedChecks = new[] + { + new AutomatedCheck + { + Id = "cc6.1.1", + Description = "All agent connections use mTLS", + CheckType = CheckType.AgentSecurity + }, + new AutomatedCheck + { + Id = "cc6.1.2", + Description = "User authentication via SSO/OIDC", + CheckType = CheckType.AuthenticationMethod + } + }.ToImmutableArray() + }, + new ComplianceControl + { + Id = "CC7.2", + Name = "System Operations", + Category = "System Operations", + Description = "The entity monitors system components and the operation of those components for anomalies.", + Keywords = new[] { "monitoring", "alerting", "anomaly detection" }.ToImmutableArray() + }, + new ComplianceControl + { + Id = "CC8.1", + Name = "Change Management", + Category = "Change Management", + Description = "The entity authorizes, designs, develops, configures, documents, tests, approves, and implements changes.", + Keywords = new[] { "change management", "approval", "deployment", "release" }.ToImmutableArray(), + AutomatedChecks = new[] + { + new AutomatedCheck + { + Id = "cc8.1.1", + Description = "All production deployments require approval", + CheckType = CheckType.ApprovalRequired + }, + new AutomatedCheck + { + Id = "cc8.1.2", + Description = "All changes produce evidence packets", + CheckType = CheckType.EvidenceGenerated + } + }.ToImmutableArray() + } + // ... more controls + }.ToImmutableArray(); +} +``` + +#### 3. ReportGenerator + +Generates compliance reports: + +```csharp +public sealed class ReportGenerator +{ + public async Task GenerateAsync( + ReportRequest request, + CancellationToken ct) + { + var report = new ComplianceReport + { + Id = Guid.NewGuid(), + Type = request.ReportType, + GeneratedAt = _timeProvider.GetUtcNow(), + GeneratedBy = request.RequestedBy, + DateRange = request.DateRange + }; + + // Get compliance status + var status = await _complianceEngine.EvaluateAsync(new ComplianceEvaluationRequest + { + TenantId = request.TenantId, + Frameworks = request.Frameworks, + DateRange = request.DateRange + }, ct); + + report.ComplianceStatus = status; + + // Generate sections based on report type + switch (request.ReportType) + { + case ReportType.ExecutiveSummary: + report.Sections = await GenerateExecutiveSummaryAsync(status, ct); + break; + + case ReportType.DetailedCompliance: + report.Sections = await GenerateDetailedReportAsync(status, request, ct); + break; + + case ReportType.GapAnalysis: + report.Sections = await GenerateGapAnalysisAsync(status, ct); + break; + + case ReportType.AuditReadiness: + report.Sections = await GenerateAuditReadinessAsync(status, request, ct); + break; + + case ReportType.EvidencePackage: + report.Sections = await GenerateEvidencePackageAsync(status, request, ct); + break; + } + + // Add standard sections + report.Sections.Add(GenerateMethodologySection()); + report.Sections.Add(GenerateDisclaimerSection()); + + return report; + } + + private async Task> GenerateDetailedReportAsync( + ComplianceStatus status, + ReportRequest request, + CancellationToken ct) + { + var sections = new List(); + + // Overview section + sections.Add(new ReportSection + { + Title = "Compliance Overview", + Content = new OverviewContent + { + EvaluationDate = status.EvaluatedAt, + OverallScore = status.OverallScore, + ComplianceLevel = status.ComplianceLevel, + FrameworkSummaries = status.Frameworks.Select(f => new FrameworkSummary + { + Name = f.FrameworkName, + Score = f.Score, + PassedControls = f.PassedControls, + TotalControls = f.TotalControls + }).ToList() + } + }); + + // Per-framework sections + foreach (var framework in status.Frameworks) + { + var frameworkSection = new ReportSection + { + Title = $"{framework.FrameworkName} Compliance", + Subsections = new List() + }; + + // Group controls by category + var byCategory = framework.Controls.GroupBy(c => c.Category); + foreach (var category in byCategory) + { + var categorySection = new ReportSection + { + Title = category.Key, + Content = new ControlCategoryContent + { + Controls = category.Select(c => new ControlDetail + { + Id = c.ControlId, + Name = c.ControlName, + Status = c.Status, + Findings = c.Findings, + EvidenceCount = c.Evidence.Count, + EvidenceReferences = c.Evidence + }).ToList() + } + }; + frameworkSection.Subsections.Add(categorySection); + } + + sections.Add(frameworkSection); + } + + // Findings summary + var allFindings = status.Frameworks + .SelectMany(f => f.Controls) + .SelectMany(c => c.Findings ?? Enumerable.Empty()) + .ToList(); + + sections.Add(new ReportSection + { + Title = "Findings Summary", + Content = new FindingsSummaryContent + { + TotalFindings = allFindings.Count, + CriticalFindings = allFindings.Count(f => f.Severity == FindingSeverity.Critical), + HighFindings = allFindings.Count(f => f.Severity == FindingSeverity.High), + MediumFindings = allFindings.Count(f => f.Severity == FindingSeverity.Medium), + LowFindings = allFindings.Count(f => f.Severity == FindingSeverity.Low), + Findings = allFindings.OrderByDescending(f => f.Severity).ToList() + } + }); + + // Recommendations + sections.Add(await GenerateRecommendationsAsync(status, ct)); + + return sections; + } +} +``` + +#### 4. EvidenceChainVisualizer + +Visualizes evidence chains: + +```csharp +public sealed class EvidenceChainVisualizer +{ + public async Task VisualizeAsync( + Guid rootEvidenceId, + CancellationToken ct) + { + var root = await _evidenceStore.GetAsync(rootEvidenceId, ct); + var visualization = new EvidenceChainVisualization + { + RootEvidenceId = rootEvidenceId, + GeneratedAt = _timeProvider.GetUtcNow() + }; + + // Build the chain + var chain = await BuildChainAsync(root, ct); + visualization.Chain = chain; + + // Create graph representation + visualization.Graph = CreateGraph(chain); + + // Verify chain integrity + visualization.IntegrityVerification = await VerifyChainIntegrityAsync(chain, ct); + + // Generate narrative + visualization.Narrative = GenerateNarrative(chain); + + return visualization; + } + + private async Task BuildChainAsync( + EvidencePacket root, + CancellationToken ct) + { + var chain = new EvidenceChain + { + Nodes = new List(), + Edges = new List() + }; + + var visited = new HashSet(); + var queue = new Queue(); + queue.Enqueue(root); + + while (queue.Count > 0) + { + var current = queue.Dequeue(); + if (visited.Contains(current.Id)) + continue; + + visited.Add(current.Id); + + // Add node + chain.Nodes.Add(new EvidenceNode + { + Id = current.Id, + Type = current.SubjectType, + Subject = current.SubjectId, + CollectedAt = current.CollectedAt, + Summary = GenerateSummary(current), + Signature = current.Signature, + SignatureValid = await VerifySignatureAsync(current, ct) + }); + + // Add edges for dependencies + foreach (var depId in current.DependsOn) + { + chain.Edges.Add(new EvidenceEdge + { + FromId = depId, + ToId = current.Id, + Relationship = "depends_on" + }); + + // Load dependent evidence + var dep = await _evidenceStore.GetAsync(depId, ct); + if (dep != null && !visited.Contains(dep.Id)) + { + queue.Enqueue(dep); + } + } + } + + return chain; + } + + private EvidenceGraph CreateGraph(EvidenceChain chain) + { + var graph = new EvidenceGraph(); + + // Calculate layout (topological sort + horizontal levels) + var levels = CalculateLevels(chain); + + foreach (var (level, nodes) in levels) + { + var y = level * 100; + var x = 0; + foreach (var node in nodes) + { + graph.Nodes.Add(new GraphNode + { + Id = node.Id.ToString(), + Label = $"{node.Type}\n{node.CollectedAt:g}", + X = x, + Y = y, + Color = GetNodeColor(node) + }); + x += 150; + } + } + + foreach (var edge in chain.Edges) + { + graph.Edges.Add(new GraphEdge + { + From = edge.FromId.ToString(), + To = edge.ToId.ToString(), + Label = edge.Relationship + }); + } + + return graph; + } + + private string GenerateNarrative(EvidenceChain chain) + { + var sb = new StringBuilder(); + var ordered = chain.Nodes.OrderBy(n => n.CollectedAt).ToList(); + + sb.AppendLine("## Evidence Chain Narrative"); + sb.AppendLine(); + + foreach (var node in ordered) + { + sb.AppendLine($"### {node.CollectedAt:yyyy-MM-dd HH:mm:ss} UTC"); + sb.AppendLine(); + sb.AppendLine($"**{node.Type}** (ID: `{node.Id}`)"); + sb.AppendLine(); + sb.AppendLine(node.Summary); + sb.AppendLine(); + + if (node.SignatureValid) + { + sb.AppendLine($"✓ Signature verified"); + } + else + { + sb.AppendLine($"⚠ Signature verification failed"); + } + sb.AppendLine(); + } + + return sb.ToString(); + } +} +``` + +#### 5. AuditQueryEngine + +Powerful query interface for audit data: + +```csharp +public sealed class AuditQueryEngine +{ + public async Task QueryAsync( + AuditQuery query, + CancellationToken ct) + { + var result = new AuditQueryResult + { + QueryId = Guid.NewGuid(), + ExecutedAt = _timeProvider.GetUtcNow(), + Query = query + }; + + // Build SQL from query + var sql = BuildQuery(query); + + // Execute + var connection = await _connectionPool.GetReadReplicaAsync(ct); + var records = await connection.QueryAsync(sql.ToString(), query.Parameters, ct); + + result.Records = records.ToImmutableArray(); + result.TotalCount = records.Count(); + + // Apply aggregations if requested + if (query.Aggregations != null) + { + result.Aggregations = ApplyAggregations(records, query.Aggregations); + } + + return result; + } + + private string BuildQuery(AuditQuery query) + { + var sql = new StringBuilder(); + + // Base query + sql.AppendLine(@" + SELECT + e.id, + e.subject_type, + e.subject_id, + e.collected_at, + e.content, + e.signature, + u.email as actor_email, + u.name as actor_name + FROM evidence_packets e + LEFT JOIN users u ON e.actor_id = u.id + WHERE e.tenant_id = @TenantId"); + + // Date range + if (query.DateRange != null) + { + sql.AppendLine("AND e.collected_at >= @StartDate"); + sql.AppendLine("AND e.collected_at <= @EndDate"); + } + + // Subject type filter + if (query.SubjectTypes?.Any() == true) + { + sql.AppendLine("AND e.subject_type = ANY(@SubjectTypes)"); + } + + // Actor filter + if (query.ActorId.HasValue) + { + sql.AppendLine("AND e.actor_id = @ActorId"); + } + + // Text search + if (!string.IsNullOrEmpty(query.SearchText)) + { + sql.AppendLine("AND e.content_tsv @@ plainto_tsquery(@SearchText)"); + } + + // Custom filters + foreach (var filter in query.Filters ?? Enumerable.Empty()) + { + sql.AppendLine($"AND {BuildFilterClause(filter)}"); + } + + // Ordering + sql.AppendLine("ORDER BY e.collected_at DESC"); + + // Pagination + if (query.Limit.HasValue) + { + sql.AppendLine($"LIMIT {query.Limit}"); + } + if (query.Offset.HasValue) + { + sql.AppendLine($"OFFSET {query.Offset}"); + } + + return sql.ToString(); + } +} + +public sealed record AuditQuery +{ + public Guid TenantId { get; init; } + public DateRange? DateRange { get; init; } + public ImmutableArray? SubjectTypes { get; init; } + public Guid? ActorId { get; init; } + public string? SearchText { get; init; } + public ImmutableArray? Filters { get; init; } + public ImmutableArray? Aggregations { get; init; } + public int? Limit { get; init; } + public int? Offset { get; init; } +} +``` + +#### 6. ControlValidator + +Automated control validation: + +```csharp +public sealed class ControlValidator : IControlValidator +{ + private readonly ImmutableDictionary _checks; + + public async Task ValidateAsync( + ComplianceControl control, + ComplianceEvaluationRequest request, + CancellationToken ct) + { + var result = new ControlValidationResult + { + ControlId = control.Id, + Findings = new List() + }; + + // Run automated checks + if (control.AutomatedChecks?.Any() == true) + { + foreach (var check in control.AutomatedChecks) + { + var checkImpl = _checks.GetValueOrDefault(check.CheckType); + if (checkImpl == null) + { + result.Findings.Add(new Finding + { + Severity = FindingSeverity.Low, + Message = $"Automated check {check.Id} not implemented", + CheckId = check.Id + }); + continue; + } + + var checkResult = await checkImpl.ExecuteAsync(request, ct); + if (!checkResult.Passed) + { + result.Findings.Add(new Finding + { + Severity = checkResult.Severity, + Message = checkResult.Message, + CheckId = check.Id, + Details = checkResult.Details + }); + } + } + } + + // Determine overall status + if (result.Findings.Any(f => f.Severity >= FindingSeverity.High)) + { + result.Status = ControlEvaluationStatus.Failed; + } + else if (result.Findings.Any()) + { + result.Status = ControlEvaluationStatus.PartiallyMet; + } + else + { + result.Status = ControlEvaluationStatus.Passed; + } + + return result; + } +} + +// Example automated check implementations +public sealed class ApprovalRequiredCheck : IAutomatedCheck +{ + public CheckType Type => CheckType.ApprovalRequired; + + public async Task ExecuteAsync( + ComplianceEvaluationRequest request, + CancellationToken ct) + { + // Check that all production deployments required approval + var deployments = await _deploymentStore.GetByDateRangeAsync( + request.TenantId, request.DateRange, ct); + + var productionDeployments = deployments + .Where(d => d.Environment.Name.Equals("production", StringComparison.OrdinalIgnoreCase)); + + var withoutApproval = productionDeployments + .Where(d => d.ApprovalRecords?.Any() != true) + .ToList(); + + if (withoutApproval.Any()) + { + return new CheckResult + { + Passed = false, + Severity = FindingSeverity.Critical, + Message = $"{withoutApproval.Count} production deployments without approval", + Details = withoutApproval.Select(d => new + { + d.Id, + d.ReleaseId, + d.DeployedAt + }).ToList() + }; + } + + return CheckResult.Pass(); + } +} + +public sealed class EvidenceGeneratedCheck : IAutomatedCheck +{ + public CheckType Type => CheckType.EvidenceGenerated; + + public async Task ExecuteAsync( + ComplianceEvaluationRequest request, + CancellationToken ct) + { + // Check that all deployments generated evidence + var deployments = await _deploymentStore.GetByDateRangeAsync( + request.TenantId, request.DateRange, ct); + + var withoutEvidence = new List(); + foreach (var deployment in deployments) + { + var evidence = await _evidenceStore.GetBySubjectAsync( + "deployment", deployment.Id, ct); + + if (evidence == null) + { + withoutEvidence.Add(deployment); + } + } + + if (withoutEvidence.Any()) + { + return new CheckResult + { + Passed = false, + Severity = FindingSeverity.High, + Message = $"{withoutEvidence.Count} deployments without evidence packets", + Details = withoutEvidence.Select(d => d.Id).ToList() + }; + } + + return CheckResult.Pass(); + } +} +``` + +--- + +## Report Templates + +### Executive Summary Template + +```markdown +# Compliance Executive Summary + +**Organization:** {{organization.name}} +**Report Period:** {{date_range.start}} to {{date_range.end}} +**Generated:** {{generated_at}} + +## Overall Compliance Status + +| Framework | Score | Status | +|-----------|-------|--------| +{{#each frameworks}} +| {{name}} | {{score}}% | {{status}} | +{{/each}} + +**Overall Compliance Level:** {{compliance_level}} + +## Key Findings + +{{#if critical_findings}} +### Critical Issues ({{critical_findings.count}}) +{{#each critical_findings}} +- **{{control_id}}**: {{message}} +{{/each}} +{{/if}} + +{{#if high_findings}} +### High Priority Issues ({{high_findings.count}}) +{{#each high_findings}} +- **{{control_id}}**: {{message}} +{{/each}} +{{/if}} + +## Recommendations + +{{#each recommendations}} +1. **{{title}}** (Priority: {{priority}}) + {{description}} +{{/each}} + +## Next Steps + +1. Address critical findings within {{sla.critical}} days +2. Review and remediate high-priority findings +3. Schedule follow-up assessment for {{next_assessment_date}} +``` + +### Audit Readiness Report + +```markdown +# Audit Readiness Report + +**Framework:** {{framework.name}} {{framework.version}} +**Assessment Date:** {{generated_at}} + +## Readiness Summary + +**Ready for Audit:** {{#if ready}}Yes{{else}}No{{/if}} +**Controls Passing:** {{passing_controls}} / {{total_controls}} +**Evidence Coverage:** {{evidence_coverage}}% + +## Control-by-Control Assessment + +{{#each control_categories}} +### {{category_name}} + +{{#each controls}} +#### {{control_id}} - {{control_name}} + +**Status:** {{status}} +**Evidence Available:** {{evidence_count}} items + +{{#if findings}} +**Findings:** +{{#each findings}} +- [{{severity}}] {{message}} +{{/each}} +{{/if}} + +{{#if evidence}} +**Evidence Summary:** +{{#each evidence}} +- {{type}} ({{collected_at}}): {{summary}} +{{/each}} +{{/if}} + +--- +{{/each}} +{{/each}} + +## Gap Analysis + +{{#each gaps}} +| Control | Gap Description | Remediation Recommendation | +|---------|-----------------|---------------------------| +{{#each items}} +| {{control_id}} | {{gap}} | {{recommendation}} | +{{/each}} +{{/each}} + +## Evidence Package Checklist + +{{#each evidence_checklist}} +- [{{#if available}}x{{else}} {{/if}}] {{item}} +{{/each}} +``` + +--- + +## API Design + +### REST Endpoints + +``` +# Compliance Status +GET /api/v1/compliance/status # Current compliance status +GET /api/v1/compliance/status/history # Historical compliance + +# Reports +POST /api/v1/compliance/reports # Generate report +GET /api/v1/compliance/reports # List reports +GET /api/v1/compliance/reports/{id} # Get report +GET /api/v1/compliance/reports/{id}/download # Download report (PDF/HTML) + +# Evidence +GET /api/v1/compliance/evidence # List evidence +GET /api/v1/compliance/evidence/{id} # Get evidence +GET /api/v1/compliance/evidence/{id}/chain # Get evidence chain +GET /api/v1/compliance/evidence/{id}/verify # Verify evidence integrity + +# Audit Query +POST /api/v1/compliance/audit/query # Execute audit query +GET /api/v1/compliance/audit/saved-queries # List saved queries +POST /api/v1/compliance/audit/saved-queries # Save query + +# Frameworks +GET /api/v1/compliance/frameworks # List frameworks +GET /api/v1/compliance/frameworks/{id} # Get framework details +GET /api/v1/compliance/frameworks/{id}/controls # Get controls + +# Control Mappings +GET /api/v1/compliance/mappings # Get control mappings +PUT /api/v1/compliance/mappings # Update mappings + +# Scheduled Reports +POST /api/v1/compliance/reports/schedules # Create schedule +GET /api/v1/compliance/reports/schedules # List schedules +DELETE /api/v1/compliance/reports/schedules/{id} # Delete schedule +``` + +--- + +## Metrics & Observability + +### Prometheus Metrics + +``` +# Compliance Scores +stella_compliance_score{framework, tenant_id} +stella_compliance_controls_passed{framework, tenant_id} +stella_compliance_controls_failed{framework, tenant_id} + +# Findings +stella_compliance_findings_total{severity, framework} +stella_compliance_findings_open{severity, framework} +stella_compliance_findings_remediated{severity, framework} + +# Evidence +stella_evidence_collected_total{type} +stella_evidence_verification_total{status} +stella_evidence_chain_depth{type} + +# Reports +stella_reports_generated_total{type, framework} +stella_report_generation_duration_seconds{type} + +# Audit Queries +stella_audit_queries_total{status} +stella_audit_query_duration_seconds +``` + +--- + +## Configuration + +```yaml +compliance: + frameworks: + - id: soc2-type2 + enabled: true + controls_file: "./frameworks/soc2.yaml" + + - id: iso27001 + enabled: true + controls_file: "./frameworks/iso27001.yaml" + + automated_checks: + enabled: true + schedule: "0 0 * * *" # Daily at midnight + + reports: + scheduled: + - name: "Weekly Executive Summary" + type: executive_summary + schedule: "0 8 * * 1" # Monday 8am + recipients: + - compliance@example.com + - ciso@example.com + format: pdf + + - name: "Monthly Detailed Report" + type: detailed_compliance + schedule: "0 8 1 * *" # 1st of month + recipients: + - compliance@example.com + format: html + + evidence: + retention_days: 2555 # 7 years + verification_schedule: "0 */6 * * *" # Every 6 hours + + alerts: + compliance_drop_threshold: 90 + critical_finding_channels: + - type: slack + channel: "#compliance-alerts" + - type: email + recipients: + - compliance@example.com +``` + +--- + +## Test Strategy + +### Unit Tests +- Framework mapping logic +- Control validation +- Report generation +- Query building + +### Integration Tests +- Full compliance evaluation +- Evidence chain building +- Report export (PDF/HTML) +- Scheduled report execution + +### Compliance Tests +- Framework coverage validation +- Evidence completeness +- Signature verification + +--- + +## Migration Path + +### Phase 1: Framework Foundation (Week 1-2) +- Compliance engine +- Framework definitions +- Control models + +### Phase 2: Automated Checks (Week 3-4) +- Control validator +- Automated check implementations +- Check scheduling + +### Phase 3: Reporting (Week 5-6) +- Report generator +- Report templates +- Export formats + +### Phase 4: Evidence Chain (Week 7-8) +- Chain visualizer +- Integrity verification +- Narrative generation + +### Phase 5: Audit Query (Week 9-10) +- Query engine +- Query UI +- Saved queries + +### Phase 6: Polish (Week 11-12) +- Scheduled reports +- Alerts +- Documentation diff --git a/docs/modules/release-orchestrator/enhancements/developer-experience.md b/docs/modules/release-orchestrator/enhancements/developer-experience.md new file mode 100644 index 000000000..fa9c3b7d0 --- /dev/null +++ b/docs/modules/release-orchestrator/enhancements/developer-experience.md @@ -0,0 +1,1091 @@ +# Developer Experience + +## Overview + +Developer Experience transforms the Release Orchestrator from a web-first platform into a complete developer toolkit. This enhancement provides a powerful CLI for release operations, GitOps-native workflows, IDE integrations, and streamlined development workflows that integrate seamlessly with existing developer toolchains. + +This is a best-in-class implementation inspired by tools like GitHub CLI, Vercel CLI, and Argo CD CLI, tailored for release orchestration workflows. + +--- + +## Design Principles + +1. **CLI-First Operations**: Every action possible via CLI, not just UI +2. **GitOps Native**: Releases triggered by Git operations +3. **Developer Workflows**: Integrate into existing CI/CD and development patterns +4. **Zero-Friction Onboarding**: Quick start without extensive configuration +5. **Scriptable**: All commands output machine-parseable formats +6. **Offline Capable**: Local validation and preview without server + +--- + +## Architecture + +### Component Overview + +``` +┌────────────────────────────────────────────────────────────────────────┐ +│ Developer Experience System │ +├────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ CLI Application │───▶│ API Client │───▶│ Server API │ │ +│ │ (stella) │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ GitOps Controller│ │ IDE Extensions │ │ Webhook Handler │ │ +│ │ │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────┐ ┌───────────────────┐ ┌─────────────────┐ │ +│ │ Template Engine │ │ Local Validator │ │ Config Sync │ │ +│ │ │ │ │ │ │ │ +│ └──────────────────┘ └───────────────────┘ └─────────────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────────────┘ +``` + +### Key Components + +#### 1. CLI Application (stella) + +Full-featured command-line interface: + +```csharp +// CLI structure +public sealed class StellaCli +{ + // Root command + // stella --version + // stella --help + + // Auth commands + // stella auth login [--token] [--sso] + // stella auth logout + // stella auth status + // stella auth switch-context + + // Release commands + // stella release create --version [--component ]... + // stella release list [--env ] [--status ] + // stella release get + // stella release diff + // stella release history + + // Promotion commands + // stella promote --to [--approve] [--wait] + // stella promote status + // stella promote approve + // stella promote reject --reason + + // Deployment commands + // stella deploy --env [--strategy ] + // stella deploy status + // stella deploy logs [--follow] + // stella rollback [--to ] + + // Environment commands + // stella env list + // stella env get + // stella env freeze --until