up the blocking tasks
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Manifest Integrity / Validate Schema Integrity (push) Has been cancelled
Manifest Integrity / Validate Contract Documents (push) Has been cancelled
Manifest Integrity / Validate Pack Fixtures (push) Has been cancelled
Manifest Integrity / Audit SHA256SUMS Files (push) Has been cancelled
Manifest Integrity / Verify Merkle Roots (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Risk Bundle CI / risk-bundle-build (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Risk Bundle CI / risk-bundle-offline-kit (push) Has been cancelled
Risk Bundle CI / publish-checksums (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Mirror Thin Bundle Sign & Verify / mirror-sign (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-12-11 02:32:18 +02:00
parent 92bc4d3a07
commit 49922dff5a
474 changed files with 76071 additions and 12411 deletions

View File

@@ -0,0 +1,568 @@
using System.Collections.Concurrent;
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Console;
/// <summary>
/// Evidence bundle coordinator per SCHED-WORKER-CONSOLE-23-202.
/// Coordinates evidence bundle jobs (enqueue, track status, cleanup) and exposes job manifests to Web gateway.
/// Ensures idempotent reruns and cancellation support.
/// </summary>
/// <remarks>
/// Two loops run for the lifetime of the service: a job loop that dequeues batches and processes
/// each job in turn, and a cleanup loop that periodically asks the store to purge old bundles.
/// NOTE(review): jobs are awaited one at a time, so whenever <c>HandleCancellationAsync</c> runs no
/// job of this instance is registered as running; its "cancel a running job" branch looks
/// unreachable from the job loop — confirm whether concurrent job processing was intended.
/// </remarks>
public sealed class EvidenceBundleCoordinator : BackgroundService
{
    /// <summary>Pause before each cleanup pass.</summary>
    private static readonly TimeSpan CleanupInterval = TimeSpan.FromMinutes(5);

    /// <summary>Maximum bundle age handed to the store on each cleanup pass.</summary>
    private static readonly TimeSpan BundleRetention = TimeSpan.FromDays(7);

    private readonly IEvidenceBundleJobQueue _jobQueue;
    private readonly IEvidenceBundleGenerator _bundleGenerator;
    private readonly IEvidenceBundleStore _bundleStore;
    private readonly IJobManifestProvider _manifestProvider;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<EvidenceBundleCoordinator> _logger;

    // Per-job cancellation sources, keyed by job id, for jobs currently inside ProcessJobAsync.
    private readonly ConcurrentDictionary<string, CancellationTokenSource> _runningJobs = new();

    /// <summary>
    /// Creates the coordinator.
    /// </summary>
    /// <param name="jobQueue">Source of jobs and sink for status transitions.</param>
    /// <param name="bundleGenerator">Produces the bundle for a job.</param>
    /// <param name="bundleStore">Persists bundles and answers idempotency look-ups.</param>
    /// <param name="manifestProvider">Receives the manifest describing each job outcome.</param>
    /// <param name="options">Worker options; the dispatch batch size, idle delay and retry backoff are read from it.</param>
    /// <param name="timeProvider">Clock used for manifest timestamps; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics (stored; not used by the code in this class).</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public EvidenceBundleCoordinator(
        IEvidenceBundleJobQueue jobQueue,
        IEvidenceBundleGenerator bundleGenerator,
        IEvidenceBundleStore bundleStore,
        IJobManifestProvider manifestProvider,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<EvidenceBundleCoordinator> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _bundleGenerator = bundleGenerator ?? throw new ArgumentNullException(nameof(bundleGenerator));
        _bundleStore = bundleStore ?? throw new ArgumentNullException(nameof(bundleStore));
        _manifestProvider = manifestProvider ?? throw new ArgumentNullException(nameof(manifestProvider));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the job loop and the cleanup loop until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Evidence bundle coordinator started.");

        // The cleanup loop runs alongside job processing and is awaited on the way out.
        var cleanupTask = RunCleanupLoopAsync(stoppingToken);

        try
        {
            await RunJobProcessingLoopAsync(stoppingToken).ConfigureAwait(false);
        }
        finally
        {
            // Cancel whatever is still registered as running.
            foreach (var cts in _runningJobs.Values)
            {
                cts.Cancel();
            }

            await cleanupTask.ConfigureAwait(false);
        }

        _logger.LogInformation("Evidence bundle coordinator stopped.");
    }

    /// <summary>
    /// Dequeues batches and handles each job: cancellation requests first, then the idempotency
    /// check, then actual processing. Unexpected errors are logged and followed by a backoff.
    /// </summary>
    private async Task RunJobProcessingLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    if (job.Status == BundleJobStatus.CancellationRequested)
                    {
                        await HandleCancellationAsync(job, stoppingToken).ConfigureAwait(false);
                        continue;
                    }

                    // Idempotency: a completed bundle under the same tenant + key means the work is done.
                    var existingBundle = await _bundleStore.GetBundleAsync(
                        job.TenantId,
                        job.IdempotencyKey,
                        stoppingToken).ConfigureAwait(false);

                    if (existingBundle is not null && existingBundle.Status == BundleStatus.Completed)
                    {
                        _logger.LogInformation(
                            "Job {JobId} already completed (idempotency key: {IdempotencyKey}), skipping.",
                            job.JobId,
                            job.IdempotencyKey);
                        continue;
                    }

                    await ProcessJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in evidence bundle coordinator loop.");

                // The backoff itself may be interrupted by shutdown; treat that as a normal exit
                // instead of letting the cancellation escape from inside the catch block.
                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Generates, stores and publishes the bundle for <paramref name="job"/>, recording
    /// Running / Completed / Cancelled / Failed status transitions on the queue.
    /// </summary>
    /// <remarks>
    /// Cancellation caused by host shutdown is not handled here; it propagates to the caller.
    /// </remarks>
    private async Task ProcessJobAsync(EvidenceBundleJob job, CancellationToken stoppingToken)
    {
        var startedAt = _timeProvider.GetUtcNow();
        var jobCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken);

        if (!_runningJobs.TryAdd(job.JobId, jobCts))
        {
            // A linked source stays registered on the parent token until disposed,
            // so it must not be abandoned on this early exit.
            jobCts.Dispose();
            _logger.LogWarning("Job {JobId} is already running.", job.JobId);
            return;
        }

        _logger.LogInformation(
            "Processing evidence bundle job {JobId} for tenant {TenantId}.",
            job.JobId,
            job.TenantId);

        try
        {
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Running,
                stoppingToken).ConfigureAwait(false);

            // Only generation observes the per-job token; bookkeeping uses the host token.
            var bundle = await _bundleGenerator.GenerateAsync(
                job,
                jobCts.Token).ConfigureAwait(false);

            await _bundleStore.StoreBundleAsync(
                job.TenantId,
                job.IdempotencyKey,
                bundle,
                stoppingToken).ConfigureAwait(false);

            await _manifestProvider.UpdateManifestAsync(
                job.TenantId,
                job.JobId,
                new JobManifest(
                    JobId: job.JobId,
                    TenantId: job.TenantId,
                    Status: BundleJobStatus.Completed,
                    BundleUri: bundle.StorageUri,
                    BundleSize: bundle.SizeBytes,
                    BundleChecksum: bundle.Checksum,
                    StartedAt: startedAt,
                    CompletedAt: _timeProvider.GetUtcNow(),
                    Metadata: job.Metadata),
                stoppingToken).ConfigureAwait(false);

            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Completed,
                stoppingToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;
            _logger.LogInformation(
                "Evidence bundle job {JobId} completed: {BundleUri}, size={Size} bytes in {Duration}ms.",
                job.JobId,
                bundle.StorageUri,
                bundle.SizeBytes,
                duration.TotalMilliseconds);
        }
        catch (OperationCanceledException) when (jobCts.Token.IsCancellationRequested && !stoppingToken.IsCancellationRequested)
        {
            // The job token fired while the host is still running: a per-job cancellation.
            _logger.LogInformation("Job {JobId} was cancelled.", job.JobId);
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Cancelled,
                stoppingToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Evidence bundle job {JobId} failed.", job.JobId);
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Failed,
                stoppingToken).ConfigureAwait(false);

            await _manifestProvider.UpdateManifestAsync(
                job.TenantId,
                job.JobId,
                new JobManifest(
                    JobId: job.JobId,
                    TenantId: job.TenantId,
                    Status: BundleJobStatus.Failed,
                    BundleUri: null,
                    BundleSize: null,
                    BundleChecksum: null,
                    StartedAt: startedAt,
                    CompletedAt: _timeProvider.GetUtcNow(),
                    Error: ex.Message,
                    Metadata: job.Metadata),
                stoppingToken).ConfigureAwait(false);
        }
        finally
        {
            _runningJobs.TryRemove(job.JobId, out _);
            jobCts.Dispose();
        }
    }

    /// <summary>
    /// Handles a dequeued job whose status is <see cref="BundleJobStatus.CancellationRequested"/>:
    /// signals the running job's token if one is registered, otherwise marks the job Cancelled.
    /// </summary>
    private async Task HandleCancellationAsync(EvidenceBundleJob job, CancellationToken stoppingToken)
    {
        _logger.LogInformation("Cancelling job {JobId}.", job.JobId);

        if (_runningJobs.TryGetValue(job.JobId, out var cts))
        {
            cts.Cancel();
        }
        else
        {
            // Job not running, mark as cancelled directly.
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Cancelled,
                stoppingToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Waits <see cref="CleanupInterval"/>, asks the store to purge bundles older than
    /// <see cref="BundleRetention"/>, and repeats until shutdown. Errors are logged and the loop continues.
    /// </summary>
    private async Task RunCleanupLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(CleanupInterval, stoppingToken).ConfigureAwait(false);

                var expiredCount = await _bundleStore.CleanupExpiredAsync(
                    BundleRetention,
                    stoppingToken).ConfigureAwait(false);

                if (expiredCount > 0)
                {
                    _logger.LogInformation("Cleaned up {Count} expired evidence bundles.", expiredCount);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                // The delay sits at the top of the loop, so a failing store cannot cause a tight retry loop.
                _logger.LogError(ex, "Error in cleanup loop.");
            }
        }
    }

    /// <summary>
    /// Requests cancellation of a job by setting its queue status to
    /// <see cref="BundleJobStatus.CancellationRequested"/>.
    /// </summary>
    /// <param name="jobId">Identifier of the job to cancel.</param>
    /// <param name="cancellationToken">Token for the status update call.</param>
    public async ValueTask RequestCancellationAsync(string jobId, CancellationToken cancellationToken = default)
    {
        await _jobQueue.UpdateStatusAsync(
            jobId,
            BundleJobStatus.CancellationRequested,
            cancellationToken).ConfigureAwait(false);
    }
}
/// <summary>
/// Queue interface for evidence bundle jobs.
/// Used by <see cref="EvidenceBundleCoordinator"/> to fetch work and to record status transitions.
/// </summary>
public interface IEvidenceBundleJobQueue
{
/// <summary>
/// Returns up to <paramref name="maxCount"/> jobs to process.
/// The coordinator treats an empty list as "nothing to do" and idles.
/// </summary>
ValueTask<IReadOnlyList<EvidenceBundleJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default);
/// <summary>Adds <paramref name="job"/> to the queue.</summary>
ValueTask EnqueueAsync(EvidenceBundleJob job, CancellationToken cancellationToken = default);
/// <summary>Records <paramref name="status"/> as the current status of the job identified by <paramref name="jobId"/>.</summary>
ValueTask UpdateStatusAsync(string jobId, BundleJobStatus status, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for generating evidence bundles.
/// </summary>
public interface IEvidenceBundleGenerator
{
/// <summary>
/// Produces the bundle for <paramref name="job"/>.
/// The coordinator passes a per-job token as <paramref name="cancellationToken"/>.
/// </summary>
ValueTask<GeneratedBundle> GenerateAsync(EvidenceBundleJob job, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for storing evidence bundles.
/// Bundles are addressed by tenant id plus idempotency key.
/// </summary>
public interface IEvidenceBundleStore
{
/// <summary>Persists <paramref name="bundle"/> under the given tenant and idempotency key.</summary>
ValueTask StoreBundleAsync(string tenantId, string idempotencyKey, GeneratedBundle bundle, CancellationToken cancellationToken = default);
/// <summary>Looks up the bundle stored for the given tenant and idempotency key; the result is null when there is none.</summary>
ValueTask<StoredBundle?> GetBundleAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken = default);
/// <summary>Purges bundles older than <paramref name="maxAge"/> and returns the number removed.</summary>
ValueTask<int> CleanupExpiredAsync(TimeSpan maxAge, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for job manifest provider (exposed to Web gateway).
/// Manifests are addressed by tenant id plus job id.
/// </summary>
public interface IJobManifestProvider
{
/// <summary>Stores <paramref name="manifest"/> as the current manifest for the given tenant and job.</summary>
ValueTask UpdateManifestAsync(string tenantId, string jobId, JobManifest manifest, CancellationToken cancellationToken = default);
/// <summary>Returns the manifest for the given tenant and job; the result is null when there is none.</summary>
ValueTask<JobManifest?> GetManifestAsync(string tenantId, string jobId, CancellationToken cancellationToken = default);
/// <summary>Returns at most <paramref name="maxCount"/> manifests belonging to <paramref name="tenantId"/>.</summary>
ValueTask<IReadOnlyList<JobManifest>> ListManifestsAsync(string tenantId, int maxCount, CancellationToken cancellationToken = default);
}
/// <summary>
/// Represents an evidence bundle job.
/// </summary>
/// <param name="JobId">Job identifier; used for status updates and as the manifest key.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="IdempotencyKey">Together with the tenant, identifies the stored bundle so a rerun can be skipped.</param>
/// <param name="Status">Status of the job as seen when it was dequeued.</param>
/// <param name="BundleType">Kind of bundle requested.</param>
/// <param name="ArtifactIds">Identifiers of the artifacts to include.</param>
/// <param name="RequestedAt">Request timestamp (not read by the code in this file).</param>
/// <param name="Metadata">Optional key/value data; the coordinator copies it into the job manifest.</param>
public sealed record EvidenceBundleJob(
string JobId,
string TenantId,
string IdempotencyKey,
BundleJobStatus Status,
BundleType BundleType,
ImmutableArray<string> ArtifactIds,
DateTimeOffset RequestedAt,
ImmutableDictionary<string, string>? Metadata = null);
/// <summary>
/// Status of an evidence bundle job.
/// </summary>
public enum BundleJobStatus
{
/// <summary>Not set by any code in this file — presumably the initial state chosen by the enqueuer.</summary>
Pending,
/// <summary>Set by the coordinator just before bundle generation starts.</summary>
Running,
/// <summary>Set after the bundle has been stored and its manifest published.</summary>
Completed,
/// <summary>Set when processing threw an exception other than a cancellation.</summary>
Failed,
/// <summary>Set by <c>RequestCancellationAsync</c>; the coordinator acts on it when the job is dequeued.</summary>
CancellationRequested,
/// <summary>Set once a cancellation request has been honoured.</summary>
Cancelled
}
/// <summary>
/// Type of evidence bundle.
/// The value is carried on the job and copied onto the generated bundle; the meaning of the
/// individual members is not defined by the code in this file.
/// </summary>
public enum BundleType
{
Sbom,
Findings,
Attestation,
PolicyResult,
Combined
}
/// <summary>
/// A generated evidence bundle.
/// </summary>
/// <param name="BundleId">Identifier of the bundle.</param>
/// <param name="StorageUri">Location of the bundle; published in the job manifest.</param>
/// <param name="SizeBytes">Bundle size in bytes; published in the job manifest.</param>
/// <param name="Checksum">Checksum value; published in the job manifest.</param>
/// <param name="ChecksumAlgorithm">Name of the algorithm that produced <paramref name="Checksum"/>.</param>
/// <param name="BundleType">Kind of bundle.</param>
/// <param name="ArtifactCount">Number of artifacts in the bundle.</param>
/// <param name="GeneratedAt">Generation timestamp.</param>
public sealed record GeneratedBundle(
string BundleId,
string StorageUri,
long SizeBytes,
string Checksum,
string ChecksumAlgorithm,
BundleType BundleType,
int ArtifactCount,
DateTimeOffset GeneratedAt);
/// <summary>
/// A stored evidence bundle.
/// </summary>
/// <param name="BundleId">Identifier of the bundle.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="IdempotencyKey">Idempotency key the bundle was stored under.</param>
/// <param name="StorageUri">Location of the bundle.</param>
/// <param name="SizeBytes">Bundle size in bytes.</param>
/// <param name="Status">Storage status; the coordinator skips a job when this is <see cref="BundleStatus.Completed"/>.</param>
/// <param name="CreatedAt">Timestamp of storage; the in-memory store compares it against the cleanup cutoff.</param>
/// <param name="ExpiresAt">Optional expiry timestamp.</param>
public sealed record StoredBundle(
string BundleId,
string TenantId,
string IdempotencyKey,
string StorageUri,
long SizeBytes,
BundleStatus Status,
DateTimeOffset CreatedAt,
DateTimeOffset? ExpiresAt);
/// <summary>
/// Status of a stored bundle.
/// </summary>
public enum BundleStatus
{
/// <summary>Not assigned by any code in this file.</summary>
Pending,
/// <summary>Assigned by the in-memory store on every write; makes the coordinator skip reruns of the same job.</summary>
Completed,
/// <summary>Not assigned by any code in this file.</summary>
Expired
}
/// <summary>
/// Job manifest exposed to Web gateway.
/// </summary>
/// <param name="JobId">Job the manifest describes.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="Status">Outcome recorded for the job; the coordinator writes Completed or Failed.</param>
/// <param name="BundleUri">Bundle location; null in the manifest written for a failed job.</param>
/// <param name="BundleSize">Bundle size in bytes; null in the manifest written for a failed job.</param>
/// <param name="BundleChecksum">Bundle checksum; null in the manifest written for a failed job.</param>
/// <param name="StartedAt">When processing of the job began.</param>
/// <param name="CompletedAt">When the manifest was produced.</param>
/// <param name="Error">Exception message for a failed job; otherwise null.</param>
/// <param name="Metadata">Metadata copied from the job.</param>
public sealed record JobManifest(
string JobId,
string TenantId,
BundleJobStatus Status,
string? BundleUri,
long? BundleSize,
string? BundleChecksum,
DateTimeOffset StartedAt,
DateTimeOffset? CompletedAt,
string? Error = null,
ImmutableDictionary<string, string>? Metadata = null);
/// <summary>
/// Process-local <see cref="IEvidenceBundleJobQueue"/> backed by a concurrent queue plus a
/// side table holding the most recent status written for each job id.
/// </summary>
public sealed class InMemoryEvidenceBundleJobQueue : IEvidenceBundleJobQueue
{
    private readonly ConcurrentQueue<EvidenceBundleJob> _queue = new();
    private readonly ConcurrentDictionary<string, BundleJobStatus> _statuses = new();

    /// <summary>
    /// Takes up to <paramref name="maxCount"/> jobs off the queue. Each returned job carries the
    /// latest status recorded for its id (for example a cancellation request made after enqueueing).
    /// </summary>
    public ValueTask<IReadOnlyList<EvidenceBundleJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var batch = new List<EvidenceBundleJob>();

        while (batch.Count < maxCount)
        {
            if (!_queue.TryDequeue(out var next))
            {
                break;
            }

            // Overlay the status from the side table when one has been recorded.
            batch.Add(_statuses.TryGetValue(next.JobId, out var latest)
                ? next with { Status = latest }
                : next);
        }

        return new ValueTask<IReadOnlyList<EvidenceBundleJob>>(batch);
    }

    /// <summary>Appends <paramref name="job"/> to the queue and records its status.</summary>
    public ValueTask EnqueueAsync(EvidenceBundleJob job, CancellationToken cancellationToken = default)
    {
        _queue.Enqueue(job);
        _statuses.AddOrUpdate(job.JobId, job.Status, (_, _) => job.Status);
        return default;
    }

    /// <summary>Overwrites the recorded status for <paramref name="jobId"/>.</summary>
    public ValueTask UpdateStatusAsync(string jobId, BundleJobStatus status, CancellationToken cancellationToken = default)
    {
        _statuses.AddOrUpdate(jobId, status, (_, _) => status);
        return default;
    }
}
/// <summary>
/// In-memory implementation of evidence bundle store.
/// Bundles are keyed by the (tenant id, idempotency key) pair.
/// </summary>
public sealed class InMemoryEvidenceBundleStore : IEvidenceBundleStore
{
    /// <summary>Lifetime stamped onto <see cref="StoredBundle.ExpiresAt"/> at store time.</summary>
    private static readonly TimeSpan DefaultLifetime = TimeSpan.FromDays(7);

    // A tuple key keeps the two components separate. A concatenated "tenant:key" string would
    // make ("a:b", "c") and ("a", "b:c") collide and let one tenant read another's bundle.
    private readonly ConcurrentDictionary<(string TenantId, string IdempotencyKey), StoredBundle> _bundles = new();

    /// <summary>
    /// Stores <paramref name="bundle"/> for the tenant and idempotency key, replacing any previous
    /// entry. The stored record is marked <see cref="BundleStatus.Completed"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="bundle"/> is null.</exception>
    public ValueTask StoreBundleAsync(string tenantId, string idempotencyKey, GeneratedBundle bundle, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(bundle);

        // One clock read so that ExpiresAt is exactly CreatedAt + lifetime.
        var now = DateTimeOffset.UtcNow;

        _bundles[(tenantId, idempotencyKey)] = new StoredBundle(
            bundle.BundleId,
            tenantId,
            idempotencyKey,
            bundle.StorageUri,
            bundle.SizeBytes,
            BundleStatus.Completed,
            now,
            now + DefaultLifetime);

        return ValueTask.CompletedTask;
    }

    /// <summary>Returns the bundle stored for the tenant and idempotency key, or null when there is none.</summary>
    public ValueTask<StoredBundle?> GetBundleAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(_bundles.TryGetValue((tenantId, idempotencyKey), out var bundle) ? bundle : null);
    }

    /// <summary>
    /// Removes every bundle whose <see cref="StoredBundle.CreatedAt"/> is older than
    /// <paramref name="maxAge"/> and returns the number actually removed.
    /// </summary>
    public ValueTask<int> CleanupExpiredAsync(TimeSpan maxAge, CancellationToken cancellationToken = default)
    {
        var cutoff = DateTimeOffset.UtcNow - maxAge;
        var removed = 0;

        foreach (var entry in _bundles)
        {
            // Removing by key AND value leaves a bundle alone if it was re-stored after the age check.
            if (entry.Value.CreatedAt < cutoff && _bundles.TryRemove(entry))
            {
                removed++;
            }
        }

        return ValueTask.FromResult(removed);
    }
}
/// <summary>
/// In-memory implementation of job manifest provider.
/// Manifests are keyed by the (tenant id, job id) pair.
/// </summary>
public sealed class InMemoryJobManifestProvider : IJobManifestProvider
{
    // A tuple key keeps tenant and job id separate. With a concatenated "tenant:job" string,
    // a prefix match for tenant "a" would also return manifests of a tenant named "a:b".
    private readonly ConcurrentDictionary<(string TenantId, string JobId), JobManifest> _manifests = new();

    /// <summary>Stores <paramref name="manifest"/> for the tenant and job, replacing any previous one.</summary>
    public ValueTask UpdateManifestAsync(string tenantId, string jobId, JobManifest manifest, CancellationToken cancellationToken = default)
    {
        _manifests[(tenantId, jobId)] = manifest;
        return ValueTask.CompletedTask;
    }

    /// <summary>Returns the manifest for the tenant and job, or null when there is none.</summary>
    public ValueTask<JobManifest?> GetManifestAsync(string tenantId, string jobId, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(_manifests.TryGetValue((tenantId, jobId), out var manifest) ? manifest : null);
    }

    /// <summary>
    /// Returns up to <paramref name="maxCount"/> manifests stored for exactly
    /// <paramref name="tenantId"/>, newest <see cref="JobManifest.StartedAt"/> first.
    /// </summary>
    public ValueTask<IReadOnlyList<JobManifest>> ListManifestsAsync(string tenantId, int maxCount, CancellationToken cancellationToken = default)
    {
        var results = _manifests
            .Where(kvp => string.Equals(kvp.Key.TenantId, tenantId, StringComparison.Ordinal))
            .Select(kvp => kvp.Value)
            .OrderByDescending(m => m.StartedAt)
            .Take(maxCount)
            .ToList();

        return ValueTask.FromResult<IReadOnlyList<JobManifest>>(results);
    }
}
/// <summary>
/// Test stand-in for <see cref="IEvidenceBundleGenerator"/>: it does no work and returns a
/// zero-byte placeholder bundle derived from the job's identifiers.
/// </summary>
public sealed class NullEvidenceBundleGenerator : IEvidenceBundleGenerator
{
    /// <summary>Shared instance.</summary>
    public static NullEvidenceBundleGenerator Instance { get; } = new();

    /// <summary>
    /// Builds a placeholder bundle with a <c>mem://</c> URI, size 0, an all-zero checksum and
    /// the current UTC time as generation timestamp.
    /// </summary>
    public ValueTask<GeneratedBundle> GenerateAsync(EvidenceBundleJob job, CancellationToken cancellationToken = default)
    {
        var placeholder = new GeneratedBundle(
            $"bundle-{job.JobId}",
            $"mem://{job.TenantId}/bundles/{job.JobId}.zip",
            0,
            "0000000000000000000000000000000000000000000000000000000000000000",
            "SHA256",
            job.BundleType,
            job.ArtifactIds.Length,
            DateTimeOffset.UtcNow);

        return new ValueTask<GeneratedBundle>(placeholder);
    }
}

View File

@@ -0,0 +1,383 @@
using System.Collections.Concurrent;
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Console;
/// <summary>
/// Progress streaming worker per SCHED-WORKER-CONSOLE-23-201.
/// Streams run progress events (stage status, tuples processed, SLA hints) to Redis/NATS for Console SSE.
/// Includes heartbeat, dedupe, and retention policy. Publishes metrics and structured logs for queue lag.
/// </summary>
/// <remarks>
/// Two loops run for the lifetime of the service: an event loop (fetch, dedupe, publish per
/// tenant, acknowledge) and a heartbeat loop.
/// NOTE(review): events are acknowledged even when publishing for their tenant failed, and they
/// have already been marked as processed by the deduplicator, so a failed publish drops those
/// events — confirm that at-most-once delivery is intended.
/// </remarks>
public sealed class ProgressStreamingWorker : BackgroundService
{
    /// <summary>Maximum number of events requested from the source per iteration.</summary>
    private const int EventBatchSize = 100;

    /// <summary>Lag, in seconds, above which a warning is logged for a published event.</summary>
    private const double LagWarningSeconds = 5;

    /// <summary>Pause when the source returned no events.</summary>
    private static readonly TimeSpan IdleDelay = TimeSpan.FromMilliseconds(100);

    /// <summary>Pause after an unexpected error in the event loop.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(1);

    /// <summary>Pause between heartbeat attempts.</summary>
    private static readonly TimeSpan HeartbeatInterval = TimeSpan.FromSeconds(10);

    private readonly IProgressEventSource _eventSource;
    private readonly IProgressStreamPublisher _streamPublisher;
    private readonly IProgressEventDeduplicator _deduplicator;
    private readonly IHeartbeatService _heartbeatService;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ProgressStreamingWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="eventSource">Source of progress events and sink for acknowledgements.</param>
    /// <param name="streamPublisher">Publishes events per tenant.</param>
    /// <param name="deduplicator">Decides whether an event id has been seen before.</param>
    /// <param name="heartbeatService">Sends the periodic heartbeat.</param>
    /// <param name="options">Worker options (stored; not read by the code in this class).</param>
    /// <param name="timeProvider">Clock used for lag measurement; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics (stored; not used by the code in this class).</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public ProgressStreamingWorker(
        IProgressEventSource eventSource,
        IProgressStreamPublisher streamPublisher,
        IProgressEventDeduplicator deduplicator,
        IHeartbeatService heartbeatService,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ProgressStreamingWorker> logger)
    {
        _eventSource = eventSource ?? throw new ArgumentNullException(nameof(eventSource));
        _streamPublisher = streamPublisher ?? throw new ArgumentNullException(nameof(streamPublisher));
        _deduplicator = deduplicator ?? throw new ArgumentNullException(nameof(deduplicator));
        _heartbeatService = heartbeatService ?? throw new ArgumentNullException(nameof(heartbeatService));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the event loop and the heartbeat loop until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Progress streaming worker started.");

        // The heartbeat loop runs alongside event streaming and is awaited on the way out.
        var heartbeatTask = RunHeartbeatLoopAsync(stoppingToken);

        try
        {
            await RunEventStreamingLoopAsync(stoppingToken).ConfigureAwait(false);
        }
        finally
        {
            await heartbeatTask.ConfigureAwait(false);
        }

        _logger.LogInformation("Progress streaming worker stopped.");
    }

    /// <summary>
    /// Repeatedly fetches a batch of events, drops duplicates, publishes the remainder grouped
    /// by tenant, logs a warning for laggy events, and acknowledges the batch.
    /// </summary>
    private async Task RunEventStreamingLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var events = await _eventSource
                    .GetEventsAsync(EventBatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (events.Count == 0)
                {
                    await Task.Delay(IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                // Keep only events whose id the deduplicator has not seen before.
                var uniqueEvents = new List<ProgressEvent>();
                foreach (var evt in events)
                {
                    if (await _deduplicator.TryMarkAsProcessedAsync(evt.EventId, stoppingToken).ConfigureAwait(false))
                    {
                        uniqueEvents.Add(evt);
                    }
                    else
                    {
                        _logger.LogDebug("Skipping duplicate event {EventId}.", evt.EventId);
                    }
                }

                if (uniqueEvents.Count == 0)
                {
                    continue;
                }

                // One publish call per tenant; a failure for one tenant does not stop the others.
                var byTenant = uniqueEvents.GroupBy(e => e.TenantId);
                foreach (var tenantGroup in byTenant)
                {
                    var tenantId = tenantGroup.Key;
                    var tenantEvents = tenantGroup.ToList();

                    try
                    {
                        await _streamPublisher.PublishAsync(
                            tenantId,
                            tenantEvents,
                            stoppingToken).ConfigureAwait(false);

                        // Lag = time between the event's own timestamp and now.
                        foreach (var evt in tenantEvents)
                        {
                            var lag = _timeProvider.GetUtcNow() - evt.Timestamp;
                            if (lag.TotalSeconds > LagWarningSeconds)
                            {
                                _logger.LogWarning(
                                    "Progress event lag detected: {EventId}, lag={Lag}s",
                                    evt.EventId,
                                    lag.TotalSeconds);
                            }
                        }
                    }
                    catch (Exception ex) when (ex is not OperationCanceledException)
                    {
                        _logger.LogError(
                            ex,
                            "Failed to publish {Count} events for tenant {TenantId}.",
                            tenantEvents.Count,
                            tenantId);
                    }
                }

                await _eventSource.AcknowledgeAsync(
                    uniqueEvents.Select(e => e.EventId).ToList(),
                    stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in progress streaming loop.");

                // The backoff itself may be interrupted by shutdown; treat that as a normal exit
                // instead of letting the cancellation escape from inside the catch block.
                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Sends a heartbeat, waits <see cref="HeartbeatInterval"/>, and repeats until shutdown.
    /// </summary>
    private async Task RunHeartbeatLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await _heartbeatService.SendHeartbeatAsync(stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error sending heartbeat.");
            }

            // The wait happens after every attempt, successful or not, so a heartbeat service
            // that keeps failing cannot turn this loop into a tight error-logging spin.
            try
            {
                await Task.Delay(HeartbeatInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
        }
    }
}
/// <summary>
/// Source interface for progress events.
/// </summary>
public interface IProgressEventSource
{
/// <summary>
/// Returns up to <paramref name="maxCount"/> events. The streaming worker treats an empty
/// list as "nothing available" and idles briefly.
/// </summary>
ValueTask<IReadOnlyList<ProgressEvent>> GetEventsAsync(int maxCount, CancellationToken cancellationToken = default);
/// <summary>Marks the events with the given ids as handled.</summary>
ValueTask AcknowledgeAsync(IReadOnlyList<string> eventIds, CancellationToken cancellationToken = default);
}
/// <summary>
/// Publisher interface for progress streams.
/// </summary>
public interface IProgressStreamPublisher
{
/// <summary>
/// Publishes <paramref name="events"/> to the stream of <paramref name="tenantId"/>.
/// The streaming worker calls this once per tenant per batch.
/// </summary>
ValueTask PublishAsync(string tenantId, IReadOnlyList<ProgressEvent> events, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for event deduplication.
/// </summary>
public interface IProgressEventDeduplicator
{
/// <summary>
/// Tries to mark an event as processed. Returns true if this is the first time, false if duplicate.
/// </summary>
/// <param name="eventId">Identifier of the event being checked.</param>
/// <param name="cancellationToken">Token for cancelling the check.</param>
ValueTask<bool> TryMarkAsProcessedAsync(string eventId, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for heartbeat service.
/// </summary>
public interface IHeartbeatService
{
/// <summary>Sends one heartbeat. The streaming worker calls this periodically from its heartbeat loop.</summary>
ValueTask SendHeartbeatAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// A progress event for streaming.
/// </summary>
/// <param name="EventId">Event identifier; used for deduplication and acknowledgement.</param>
/// <param name="TenantId">Owning tenant; events are published grouped by this value.</param>
/// <param name="RunId">Identifier of the run the event belongs to.</param>
/// <param name="Type">Kind of event.</param>
/// <param name="Stage">Run stage the event refers to.</param>
/// <param name="TuplesProcessed">Tuples processed so far.</param>
/// <param name="TuplesTotal">Total tuples — presumably the expected total for the run; not read in this file.</param>
/// <param name="SlaHint">Optional SLA information attached to the event.</param>
/// <param name="Timestamp">Event time; the streaming worker measures lag against it.</param>
/// <param name="Metadata">Optional key/value data.</param>
public sealed record ProgressEvent(
string EventId,
string TenantId,
string RunId,
ProgressEventType Type,
RunStage Stage,
int TuplesProcessed,
int TuplesTotal,
SlaHint? SlaHint,
DateTimeOffset Timestamp,
ImmutableDictionary<string, string>? Metadata = null);
/// <summary>
/// Type of progress event.
/// Carried on <see cref="ProgressEvent"/>; no code in this file branches on the individual
/// members, so their exact meaning is defined by producers and consumers elsewhere.
/// </summary>
public enum ProgressEventType
{
RunStarted,
StageChanged,
ProgressUpdate,
SlaWarning,
RunCompleted,
RunFailed,
Heartbeat
}
/// <summary>
/// Stage of a run.
/// Carried on <see cref="ProgressEvent"/>; no code in this file branches on the individual
/// members. NOTE(review): the declaration order looks like the pipeline order — confirm before relying on it.
/// </summary>
public enum RunStage
{
Queued,
Initializing,
Scanning,
Resolving,
Evaluating,
Aggregating,
Finalizing,
Completed,
Failed,
Cancelled
}
/// <summary>
/// SLA hint for progress events.
/// The values are supplied by the event producer; no code in this file reads or computes them.
/// </summary>
/// <param name="EstimatedRemaining">Estimated time remaining — presumably for the run; confirm with the producer.</param>
/// <param name="SlaThreshold">The SLA limit the estimate is compared against — presumably; confirm with the producer.</param>
/// <param name="AtRisk">Whether the producer considers the SLA at risk.</param>
/// <param name="Message">Optional human-readable note.</param>
public sealed record SlaHint(
TimeSpan EstimatedRemaining,
TimeSpan SlaThreshold,
bool AtRisk,
string? Message = null);
/// <summary>
/// Process-local <see cref="IProgressEventSource"/>: a concurrent queue of events plus a set of
/// acknowledged event ids that are filtered out when events are handed out.
/// </summary>
public sealed class InMemoryProgressEventSource : IProgressEventSource
{
    private readonly ConcurrentQueue<ProgressEvent> _events = new();
    private readonly ConcurrentDictionary<string, bool> _acknowledged = new();

    /// <summary>
    /// Dequeues events until <paramref name="maxCount"/> have been collected or the queue is
    /// empty. Events whose id was already acknowledged are discarded rather than returned.
    /// </summary>
    public ValueTask<IReadOnlyList<ProgressEvent>> GetEventsAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var pending = new List<ProgressEvent>();

        while (pending.Count < maxCount)
        {
            if (!_events.TryDequeue(out var candidate))
            {
                break;
            }

            if (_acknowledged.ContainsKey(candidate.EventId))
            {
                continue;
            }

            pending.Add(candidate);
        }

        return new ValueTask<IReadOnlyList<ProgressEvent>>(pending);
    }

    /// <summary>Records every id in <paramref name="eventIds"/> as acknowledged.</summary>
    public ValueTask AcknowledgeAsync(IReadOnlyList<string> eventIds, CancellationToken cancellationToken = default)
    {
        foreach (var id in eventIds)
        {
            _acknowledged.AddOrUpdate(id, true, (_, _) => true);
        }

        return default;
    }

    /// <summary>
    /// Enqueues an event (for testing).
    /// </summary>
    public void Enqueue(ProgressEvent evt) => _events.Enqueue(evt);
}
/// <summary>
/// In-memory implementation of progress stream publisher.
/// Keeps one list of published events per tenant; each list is guarded by locking on the list itself.
/// </summary>
public sealed class InMemoryProgressStreamPublisher : IProgressStreamPublisher
{
    private readonly ConcurrentDictionary<string, List<ProgressEvent>> _streams = new();

    /// <summary>Appends <paramref name="events"/> to the list kept for <paramref name="tenantId"/>.</summary>
    public ValueTask PublishAsync(string tenantId, IReadOnlyList<ProgressEvent> events, CancellationToken cancellationToken = default)
    {
        var stream = _streams.GetOrAdd(tenantId, _ => []);
        lock (stream)
        {
            stream.AddRange(events);
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Gets published events for a tenant (for testing).
    /// </summary>
    /// <returns>A snapshot copy of the tenant's events, or an empty list when nothing was published.</returns>
    public IReadOnlyList<ProgressEvent> GetEvents(string tenantId)
    {
        if (!_streams.TryGetValue(tenantId, out var stream))
        {
            return [];
        }

        // Copy under the same lock PublishAsync takes; List<T> must not be enumerated
        // while another thread is appending to it.
        lock (stream)
        {
            return stream.ToList();
        }
    }
}
/// <summary>
/// In-memory implementation of event deduplicator with TTL.
/// An event id counts as a duplicate only while its last recorded sighting is within the
/// retention period; once that period has elapsed the id is accepted again.
/// </summary>
public sealed class InMemoryProgressEventDeduplicator : IProgressEventDeduplicator
{
    /// <summary>Entry count above which a call also sweeps out expired entries.</summary>
    private const int CleanupThreshold = 10000;

    private readonly ConcurrentDictionary<string, DateTimeOffset> _processed = new();
    private readonly TimeSpan _retentionPeriod;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates the deduplicator.
    /// </summary>
    /// <param name="retentionPeriod">How long an id is remembered; defaults to 30 minutes.</param>
    /// <param name="timeProvider">Clock to use; defaults to <see cref="TimeProvider.System"/>.</param>
    public InMemoryProgressEventDeduplicator(TimeSpan? retentionPeriod = null, TimeProvider? timeProvider = null)
    {
        _retentionPeriod = retentionPeriod ?? TimeSpan.FromMinutes(30);
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Records <paramref name="eventId"/> as processed.
    /// </summary>
    /// <returns>
    /// True when the id was not known, or was known but older than the retention period;
    /// false when it is a duplicate within the retention period.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="eventId"/> is null.</exception>
    public ValueTask<bool> TryMarkAsProcessedAsync(string eventId, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(eventId);

        var now = _timeProvider.GetUtcNow();
        var cutoff = now - _retentionPeriod;

        // Sweep expired entries only once the table has grown large.
        if (_processed.Count > CleanupThreshold)
        {
            foreach (var entry in _processed)
            {
                // Removing by key AND value leaves an entry alone if it was refreshed meanwhile.
                if (entry.Value < cutoff)
                {
                    _processed.TryRemove(entry);
                }
            }
        }

        if (_processed.TryAdd(eventId, now))
        {
            return ValueTask.FromResult(true);
        }

        // The id is known. Honour the TTL even when no sweep has run: an expired entry is
        // refreshed and reported as new. TryUpdate lets exactly one concurrent caller win.
        var expiredAndRefreshed =
            _processed.TryGetValue(eventId, out var seenAt)
            && seenAt < cutoff
            && _processed.TryUpdate(eventId, now, seenAt);

        return ValueTask.FromResult(expiredAndRefreshed);
    }
}
/// <summary>
/// No-op <see cref="IHeartbeatService"/>: every heartbeat completes immediately without doing anything.
/// </summary>
public sealed class NullHeartbeatService : IHeartbeatService
{
    /// <summary>Shared instance.</summary>
    public static NullHeartbeatService Instance { get; } = new();

    /// <summary>Returns an already-completed task.</summary>
    public ValueTask SendHeartbeatAsync(CancellationToken cancellationToken = default)
    {
        return default;
    }
}

View File

@@ -0,0 +1,276 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Exceptions;
/// <summary>
/// Exception lifecycle worker per SCHED-WORKER-25-101.
/// Handles auto-activation/expiry of exceptions and publishes exception.* events with retries/backoff.
/// </summary>
/// <remarks>
/// Each pass activates the exceptions the repository reports as pending activation and expires
/// the ones it reports as expired. The state change is written to the repository before the
/// event is published, so a publish that fails on every attempt leaves the record updated with no event sent.
/// </remarks>
public sealed class ExceptionLifecycleWorker : BackgroundService
{
    /// <summary>Pause between passes.</summary>
    private static readonly TimeSpan PollInterval = TimeSpan.FromMinutes(1);

    /// <summary>Pause after an unexpected error in a pass.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    private readonly IExceptionRepository _exceptionRepository;
    private readonly IExceptionEventPublisher _eventPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ExceptionLifecycleWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="exceptionRepository">Repository queried for due records and updated with new state.</param>
    /// <param name="eventPublisher">Publisher for lifecycle events.</param>
    /// <param name="options">Worker options (stored; not read by the code in this class).</param>
    /// <param name="timeProvider">Clock for the "as of" time of each pass; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics (stored; not used by the code in this class).</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public ExceptionLifecycleWorker(
        IExceptionRepository exceptionRepository,
        IExceptionEventPublisher eventPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ExceptionLifecycleWorker> logger)
    {
        _exceptionRepository = exceptionRepository ?? throw new ArgumentNullException(nameof(exceptionRepository));
        _eventPublisher = eventPublisher ?? throw new ArgumentNullException(nameof(eventPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs activation and expiry passes until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Exception lifecycle worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // One timestamp per pass so activation and expiry use the same "as of" instant.
                var now = _timeProvider.GetUtcNow();

                await ProcessPendingActivationsAsync(now, stoppingToken).ConfigureAwait(false);
                await ProcessExpiredExceptionsAsync(now, stoppingToken).ConfigureAwait(false);

                await Task.Delay(PollInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (System.Exception ex)
            {
                _logger.LogError(ex, "Error in exception lifecycle worker loop.");

                // The backoff itself may be interrupted by shutdown; treat that as a normal exit
                // so the "stopped" message below is still logged.
                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Exception lifecycle worker stopped.");
    }

    /// <summary>
    /// Moves every record the repository reports as pending activation at <paramref name="now"/>
    /// to <see cref="ExceptionState.Active"/> and publishes an Activated event for it.
    /// A failure for one record is logged and does not stop the remaining records.
    /// </summary>
    private async Task ProcessPendingActivationsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        var pendingActivations = await _exceptionRepository
            .GetPendingActivationsAsync(now, cancellationToken)
            .ConfigureAwait(false);

        foreach (var exception in pendingActivations)
        {
            try
            {
                var activated = exception with
                {
                    State = ExceptionState.Active,
                    ActivatedAt = now
                };

                await _exceptionRepository
                    .UpdateAsync(activated, cancellationToken)
                    .ConfigureAwait(false);

                await PublishEventWithRetryAsync(
                    ExceptionEventType.Activated,
                    activated,
                    cancellationToken).ConfigureAwait(false);

                _logger.LogInformation(
                    "Exception {ExceptionId} activated for tenant {TenantId}.",
                    exception.ExceptionId,
                    exception.TenantId);
            }
            catch (System.Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(
                    ex,
                    "Failed to activate exception {ExceptionId}.",
                    exception.ExceptionId);
            }
        }
    }

    /// <summary>
    /// Moves every record the repository reports as expired at <paramref name="now"/> to
    /// <see cref="ExceptionState.Expired"/> and publishes an Expired event for it.
    /// A failure for one record is logged and does not stop the remaining records.
    /// </summary>
    private async Task ProcessExpiredExceptionsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        var expired = await _exceptionRepository
            .GetExpiredExceptionsAsync(now, cancellationToken)
            .ConfigureAwait(false);

        foreach (var exception in expired)
        {
            try
            {
                var expiredRecord = exception with
                {
                    State = ExceptionState.Expired,
                    ExpiredAt = now
                };

                await _exceptionRepository
                    .UpdateAsync(expiredRecord, cancellationToken)
                    .ConfigureAwait(false);

                await PublishEventWithRetryAsync(
                    ExceptionEventType.Expired,
                    expiredRecord,
                    cancellationToken).ConfigureAwait(false);

                _logger.LogInformation(
                    "Exception {ExceptionId} expired for tenant {TenantId}.",
                    exception.ExceptionId,
                    exception.TenantId);
            }
            catch (System.Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(
                    ex,
                    "Failed to expire exception {ExceptionId}.",
                    exception.ExceptionId);
            }
        }
    }

    /// <summary>
    /// Publishes an event, retrying up to three attempts in total with a delay that starts at
    /// one second and doubles after each failure. The failure of the final attempt, and any
    /// cancellation, propagates to the caller.
    /// </summary>
    private async Task PublishEventWithRetryAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken)
    {
        const int maxRetries = 3;
        var delay = TimeSpan.FromSeconds(1);

        for (var attempt = 0; attempt < maxRetries; attempt++)
        {
            try
            {
                await _eventPublisher.PublishAsync(
                    eventType,
                    exception,
                    cancellationToken).ConfigureAwait(false);
                return;
            }
            // The filter excludes the last attempt, so its exception is not swallowed here.
            catch (System.Exception ex) when (ex is not OperationCanceledException && attempt < maxRetries - 1)
            {
                _logger.LogWarning(
                    ex,
                    "Failed to publish {EventType} event for exception {ExceptionId} (attempt {Attempt}), retrying...",
                    eventType,
                    exception.ExceptionId,
                    attempt + 1);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay *= 2; // Exponential backoff
            }
        }
    }
}
/// <summary>
/// Repository interface for exception records used by the exception lifecycle workers.
/// </summary>
/// <remarks>
/// NOTE(review): the exact selection criteria of the query methods are defined by the
/// implementation, which is not visible here; the member notes below describe how the
/// visible callers use them.
/// </remarks>
public interface IExceptionRepository
{
/// <summary>
/// Returns the records pending activation as of <paramref name="asOf"/>.
/// </summary>
ValueTask<IReadOnlyList<ExceptionRecord>> GetPendingActivationsAsync(
DateTimeOffset asOf,
CancellationToken cancellationToken = default);
/// <summary>
/// Returns the records considered expired as of <paramref name="asOf"/>.
/// The lifecycle worker passes the current time and moves every returned record
/// to <see cref="ExceptionState.Expired"/>.
/// </summary>
ValueTask<IReadOnlyList<ExceptionRecord>> GetExpiredExceptionsAsync(
DateTimeOffset asOf,
CancellationToken cancellationToken = default);
/// <summary>
/// Returns the records expiring between <paramref name="windowStart"/> and <paramref name="windowEnd"/>.
/// The expiring notification worker passes the current time and the current time plus the
/// configured notification window. NOTE(review): bound inclusivity is not visible here.
/// </summary>
ValueTask<IReadOnlyList<ExceptionRecord>> GetExpiringExceptionsAsync(
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
CancellationToken cancellationToken = default);
/// <summary>
/// Stores the given record. The lifecycle worker calls this with a copy whose state and
/// timestamp have been changed.
/// </summary>
ValueTask UpdateAsync(
ExceptionRecord record,
CancellationToken cancellationToken = default);
/// <summary>
/// Looks up a single record by identifier.
/// NOTE(review): presumably returns <c>null</c> when no record matches - confirm against the implementation.
/// </summary>
ValueTask<ExceptionRecord?> GetAsync(
string exceptionId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Record representing an exception in the system.
/// </summary>
/// <param name="ExceptionId">Identifier of the exception; it appears in every worker log message about the record.</param>
/// <param name="TenantId">Tenant the exception belongs to; expiring digests are grouped by this value.</param>
/// <param name="PolicyId">Identifier of the related policy (NOTE(review): semantics not visible here).</param>
/// <param name="VulnerabilityId">Identifier of the related vulnerability (NOTE(review): semantics not visible here).</param>
/// <param name="ComponentPurl">Optional component package URL, judging by the name - TODO confirm.</param>
/// <param name="State">Current lifecycle state.</param>
/// <param name="CreatedAt">Creation timestamp.</param>
/// <param name="ActivationDate">Optional activation date (presumably the scheduled one - confirm).</param>
/// <param name="ExpirationDate">Optional expiration date; logged by the expiring notification worker.</param>
/// <param name="ActivatedAt">Optional activation timestamp (presumably stamped on activation - confirm).</param>
/// <param name="ExpiredAt">Timestamp the lifecycle worker stamps when it moves the record to <see cref="ExceptionState.Expired"/>.</param>
/// <param name="Justification">Optional free-text justification.</param>
/// <param name="CreatedBy">Optional creator identity.</param>
public sealed record ExceptionRecord(
string ExceptionId,
string TenantId,
string PolicyId,
string VulnerabilityId,
string? ComponentPurl,
ExceptionState State,
DateTimeOffset CreatedAt,
DateTimeOffset? ActivationDate,
DateTimeOffset? ExpirationDate,
DateTimeOffset? ActivatedAt = null,
DateTimeOffset? ExpiredAt = null,
string? Justification = null,
string? CreatedBy = null);
/// <summary>
/// State of an exception.
/// </summary>
public enum ExceptionState
{
/// <summary>Presumably: created but not yet activated - TODO confirm.</summary>
Pending,
/// <summary>In effect. Only records in this state receive an <c>Expiring</c> event from the expiring notification worker.</summary>
Active,
/// <summary>Set by the lifecycle worker when it processes records the repository reports as expired.</summary>
Expired,
/// <summary>Not referenced by the visible workers; presumably set on manual revocation - TODO confirm.</summary>
Revoked
}
/// <summary>
/// Event types for exception lifecycle.
/// </summary>
public enum ExceptionEventType
{
/// <summary>Not published by the visible workers.</summary>
Created,
/// <summary>Published by the lifecycle worker after an activation.</summary>
Activated,
/// <summary>Published by the expiring notification worker for active exceptions inside the notification window.</summary>
Expiring,
/// <summary>Published by the lifecycle worker after a record has been stored as expired.</summary>
Expired,
/// <summary>Not published by the visible workers.</summary>
Revoked
}
/// <summary>
/// Publisher interface for exception events.
/// </summary>
/// <remarks>
/// The workers wrap calls to <see cref="PublishAsync"/> in a retry loop, so implementations may
/// see the same event more than once after a failed attempt.
/// </remarks>
public interface IExceptionEventPublisher
{
/// <summary>
/// Publishes one lifecycle event.
/// </summary>
/// <param name="eventType">Kind of lifecycle event.</param>
/// <param name="exception">Record the event is about.</param>
/// <param name="cancellationToken">Token that cancels the publish.</param>
ValueTask PublishAsync(
ExceptionEventType eventType,
ExceptionRecord exception,
CancellationToken cancellationToken = default);
}
/// <summary>
/// No-op <see cref="IExceptionEventPublisher"/> intended for tests: every event is accepted and discarded.
/// </summary>
public sealed class NullExceptionEventPublisher : IExceptionEventPublisher
{
    /// <summary>Shared instance; the type holds no state.</summary>
    public static NullExceptionEventPublisher Instance { get; } = new NullExceptionEventPublisher();

    /// <summary>
    /// Ignores all arguments and completes synchronously.
    /// </summary>
    public ValueTask PublishAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,313 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Exceptions;
/// <summary>
/// Expiring notification worker per SCHED-WORKER-25-102.
/// On every cycle it loads the exceptions that expire within the configured notification window,
/// publishes an <see cref="ExceptionEventType.Expiring"/> event for each active one, generates a
/// per-tenant digest and hands that digest to the alert service.
/// </summary>
/// <remarks>
/// NOTE(review): the metrics instance is injected and stored but no metric is recorded by this class.
/// </remarks>
public sealed class ExpiringNotificationWorker : BackgroundService
{
    /// <summary>Pause applied after an unexpected loop failure before the next cycle starts.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    private readonly IExceptionRepository _exceptionRepository;
    private readonly IExceptionEventPublisher _eventPublisher;
    private readonly IExpiringDigestService _digestService;
    private readonly IExpiringAlertService _alertService;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ExpiringNotificationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="timeProvider">Clock used for "now"; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">Any dependency other than <paramref name="timeProvider"/> is <c>null</c>.</exception>
    public ExpiringNotificationWorker(
        IExceptionRepository exceptionRepository,
        IExceptionEventPublisher eventPublisher,
        IExpiringDigestService digestService,
        IExpiringAlertService alertService,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ExpiringNotificationWorker> logger)
    {
        _exceptionRepository = exceptionRepository ?? throw new ArgumentNullException(nameof(exceptionRepository));
        _eventPublisher = eventPublisher ?? throw new ArgumentNullException(nameof(eventPublisher));
        _digestService = digestService ?? throw new ArgumentNullException(nameof(digestService));
        _alertService = alertService ?? throw new ArgumentNullException(nameof(alertService));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the check loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when the worker is disabled in the options.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Exception.ExpiringNotificationEnabled)
        {
            _logger.LogInformation("Expiring notification worker is disabled.");
            return;
        }

        _logger.LogInformation("Expiring notification worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var now = _timeProvider.GetUtcNow();

                // Process exceptions expiring within the notification window
                await ProcessExpiringExceptionsAsync(now, stoppingToken).ConfigureAwait(false);

                // Wait for the configured interval before next check
                await Task.Delay(_options.Exception.ExpiringCheckInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (System.Exception ex)
            {
                _logger.LogError(ex, "Error in expiring notification worker loop.");

                // The back-off itself can be cancelled by shutdown. Handle that here so the
                // worker leaves the loop normally instead of letting the cancellation escape
                // from inside this catch block.
                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Expiring notification worker stopped.");
    }

    /// <summary>
    /// Handles one cycle: query the window, notify per exception, then digest and alert per tenant.
    /// A failure for one tenant (other than cancellation) is logged and does not affect other tenants.
    /// </summary>
    private async Task ProcessExpiringExceptionsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        // Calculate the notification window
        var windowStart = now;
        var windowEnd = now.Add(_options.Exception.ExpiringNotificationWindow);

        // Get exceptions expiring within the window
        var expiringExceptions = await _exceptionRepository
            .GetExpiringExceptionsAsync(windowStart, windowEnd, cancellationToken)
            .ConfigureAwait(false);

        if (expiringExceptions.Count == 0)
        {
            _logger.LogDebug("No expiring exceptions found within notification window.");
            return;
        }

        _logger.LogInformation(
            "Found {Count} exceptions expiring within notification window ({WindowStart} - {WindowEnd}).",
            expiringExceptions.Count,
            windowStart,
            windowEnd);

        // Group by tenant for digest generation
        var byTenant = expiringExceptions
            .GroupBy(static e => e.TenantId)
            .ToList();

        foreach (var tenantGroup in byTenant)
        {
            var tenantId = tenantGroup.Key;
            var tenantExpiring = tenantGroup.ToList();

            try
            {
                // Publish an expiring event for each exception of the tenant
                foreach (var exception in tenantExpiring)
                {
                    await MarkAsExpiringAndNotifyAsync(exception, cancellationToken)
                        .ConfigureAwait(false);
                }

                // NOTE(review): the digest receives every record of the tenant, including ones
                // that are not Active and therefore got no event above - confirm this is intended.
                var digest = await _digestService.GenerateDigestAsync(
                    tenantId,
                    tenantExpiring,
                    windowEnd,
                    cancellationToken).ConfigureAwait(false);

                // Emit alert for the digest
                await _alertService.EmitExpiringAlertAsync(
                    tenantId,
                    digest,
                    cancellationToken).ConfigureAwait(false);

                _logger.LogInformation(
                    "Generated expiring digest for tenant {TenantId}: {ExceptionCount} exceptions, digest ID {DigestId}.",
                    tenantId,
                    tenantExpiring.Count,
                    digest.DigestId);
            }
            catch (System.Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(
                    ex,
                    "Failed to process expiring exceptions for tenant {TenantId}.",
                    tenantId);
            }
        }
    }

    /// <summary>
    /// Publishes an <see cref="ExceptionEventType.Expiring"/> event for an active exception.
    /// Records in any other state are ignored. A publish failure that survives the retries is
    /// logged as a warning and swallowed so the tenant digest is still produced.
    /// </summary>
    private async Task MarkAsExpiringAndNotifyAsync(
        ExceptionRecord exception,
        CancellationToken cancellationToken)
    {
        // Only active exceptions are announced as expiring
        if (exception.State != ExceptionState.Active)
        {
            return;
        }

        try
        {
            // Publish expiring event with retry
            await PublishEventWithRetryAsync(
                ExceptionEventType.Expiring,
                exception,
                cancellationToken).ConfigureAwait(false);

            _logger.LogDebug(
                "Exception {ExceptionId} for tenant {TenantId} marked as expiring (expires at {ExpirationDate}).",
                exception.ExceptionId,
                exception.TenantId,
                exception.ExpirationDate);
        }
        catch (System.Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogWarning(
                ex,
                "Failed to publish expiring event for exception {ExceptionId}.",
                exception.ExceptionId);
        }
    }

    /// <summary>
    /// Publishes an event, retrying according to the configured retry policy.
    /// </summary>
    /// <remarks>
    /// Up to <c>MaxPublishRetries</c> retries follow the first attempt; the pause starts at
    /// <c>PublishRetryDelay</c> and doubles after every failed attempt. Negative configured values
    /// are treated as zero. Cancellation is never retried, and the failure of the last attempt
    /// propagates to the caller.
    /// </remarks>
    private async Task PublishEventWithRetryAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken)
    {
        var maxRetries = Math.Max(0, _options.Exception.MaxPublishRetries);
        var delay = _options.Exception.PublishRetryDelay;
        if (delay < TimeSpan.Zero)
        {
            delay = TimeSpan.Zero;
        }

        // The loop ends by returning on success or by letting the final failure escape the filter.
        for (var attempt = 0; ; attempt++)
        {
            try
            {
                await _eventPublisher.PublishAsync(
                    eventType,
                    exception,
                    cancellationToken).ConfigureAwait(false);
                return;
            }
            catch (System.Exception ex) when (ex is not OperationCanceledException && attempt < maxRetries)
            {
                _logger.LogWarning(
                    ex,
                    "Failed to publish {EventType} event for exception {ExceptionId} (attempt {Attempt}), retrying...",
                    eventType,
                    exception.ExceptionId,
                    attempt + 1);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay *= 2; // Exponential backoff
            }
        }
    }
}
/// <summary>
/// Service for generating expiring exception digests.
/// </summary>
public interface IExpiringDigestService
{
/// <summary>
/// Generates a digest of expiring exceptions for a tenant.
/// </summary>
/// <param name="tenantId">Tenant the digest is for.</param>
/// <param name="expiringExceptions">Records to summarise; the worker passes all records of the tenant found in the notification window.</param>
/// <param name="windowEnd">End of the notification window the records were selected for.</param>
/// <param name="cancellationToken">Token that cancels the generation.</param>
/// <returns>The generated digest.</returns>
ValueTask<ExpiringDigest> GenerateDigestAsync(
string tenantId,
IReadOnlyList<ExceptionRecord> expiringExceptions,
DateTimeOffset windowEnd,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Service for emitting expiring exception alerts.
/// </summary>
public interface IExpiringAlertService
{
/// <summary>
/// Emits an alert for expiring exceptions.
/// </summary>
/// <param name="tenantId">Tenant the alert is for.</param>
/// <param name="digest">Digest produced for that tenant; the worker passes the result of the digest service unchanged.</param>
/// <param name="cancellationToken">Token that cancels the emission.</param>
ValueTask EmitExpiringAlertAsync(
string tenantId,
ExpiringDigest digest,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Digest of expiring exceptions for notification.
/// </summary>
/// <param name="DigestId">Identifier of the digest; logged by the worker after generation.</param>
/// <param name="TenantId">Tenant the digest belongs to.</param>
/// <param name="GeneratedAt">Timestamp at which the digest was produced.</param>
/// <param name="WindowEnd">End of the notification window covered by the digest.</param>
/// <param name="TotalCount">Number of exceptions covered.</param>
/// <param name="CriticalCount">NOTE(review): presumably the number of critical-severity entries; no severity field is visible on the exception record - confirm.</param>
/// <param name="HighCount">NOTE(review): presumably the number of high-severity entries - confirm.</param>
/// <param name="Entries">Per-exception entries of the digest.</param>
public sealed record ExpiringDigest(
string DigestId,
string TenantId,
DateTimeOffset GeneratedAt,
DateTimeOffset WindowEnd,
int TotalCount,
int CriticalCount,
int HighCount,
ImmutableArray<ExpiringDigestEntry> Entries);
/// <summary>
/// Individual entry in an expiring digest.
/// </summary>
/// <param name="ExceptionId">Identifier of the exception the entry describes.</param>
/// <param name="PolicyId">Identifier of the related policy.</param>
/// <param name="VulnerabilityId">Identifier of the related vulnerability.</param>
/// <param name="ComponentPurl">Optional component package URL, judging by the name - TODO confirm.</param>
/// <param name="ExpirationDate">When the exception expires.</param>
/// <param name="TimeUntilExpiry">Remaining time until expiry (NOTE(review): the reference instant is chosen by the digest service and is not visible here).</param>
public sealed record ExpiringDigestEntry(
string ExceptionId,
string PolicyId,
string VulnerabilityId,
string? ComponentPurl,
DateTimeOffset ExpirationDate,
TimeSpan TimeUntilExpiry);
/// <summary>
/// Placeholder <see cref="IExpiringDigestService"/> intended for tests: it returns a digest that
/// carries only the count of the supplied records, with no entries and zeroed severity counts.
/// </summary>
public sealed class NullExpiringDigestService : IExpiringDigestService
{
    /// <summary>Shared instance; the type holds no state.</summary>
    public static NullExpiringDigestService Instance { get; } = new NullExpiringDigestService();

    /// <summary>
    /// Builds an entry-less digest with a fresh random identifier, stamped with the current UTC time.
    /// </summary>
    public ValueTask<ExpiringDigest> GenerateDigestAsync(
        string tenantId,
        IReadOnlyList<ExceptionRecord> expiringExceptions,
        DateTimeOffset windowEnd,
        CancellationToken cancellationToken = default)
    {
        var identifier = Guid.NewGuid().ToString("N");
        var stamp = DateTimeOffset.UtcNow;

        var placeholder = new ExpiringDigest(
            identifier,
            tenantId,
            stamp,
            windowEnd,
            expiringExceptions.Count,
            CriticalCount: 0,
            HighCount: 0,
            Entries: ImmutableArray<ExpiringDigestEntry>.Empty);

        return ValueTask.FromResult(placeholder);
    }
}
/// <summary>
/// No-op <see cref="IExpiringAlertService"/> intended for tests: alerts are accepted and dropped.
/// </summary>
public sealed class NullExpiringAlertService : IExpiringAlertService
{
    /// <summary>Shared instance; the type holds no state.</summary>
    public static NullExpiringAlertService Instance { get; } = new NullExpiringAlertService();

    /// <summary>
    /// Ignores all arguments and completes synchronously.
    /// </summary>
    public ValueTask EmitExpiringAlertAsync(
        string tenantId,
        ExpiringDigest digest,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -20,7 +20,8 @@ public sealed class SchedulerWorkerMetrics : IDisposable
private readonly Counter<long> _runnerDeltaFindingsTotal;
private readonly Counter<long> _runnerKevHitsTotal;
private readonly Counter<long> _surfaceManifestPrefetchTotal;
private readonly Counter<long> _surfaceManifestPrefetchTotal;
private readonly Counter<long> _policyReEvaluationTotal;
private readonly Histogram<double> _policyReEvaluationDurationSeconds;
private readonly Histogram<double> _runDurationSeconds;
private readonly UpDownCounter<long> _runsActive;
private readonly Counter<long> _graphJobsTotal;
@@ -71,10 +72,14 @@ public sealed class SchedulerWorkerMetrics : IDisposable
"scheduler_surface_manifest_prefetch_total",
unit: "attempt",
description: "Surface manifest prefetch attempts grouped by result.");
_surfaceManifestPrefetchTotal = _meter.CreateCounter<long>(
"scheduler_surface_manifest_prefetch_total",
unit: "attempt",
description: "Surface manifest prefetch attempts grouped by result.");
_policyReEvaluationTotal = _meter.CreateCounter<long>(
"scheduler_policy_reevaluation_total",
unit: "count",
description: "Policy re-evaluation jobs grouped by tenant and status.");
_policyReEvaluationDurationSeconds = _meter.CreateHistogram<double>(
"scheduler_policy_reevaluation_duration_seconds",
unit: "s",
description: "Policy re-evaluation job durations grouped by tenant and status.");
_runDurationSeconds = _meter.CreateHistogram<double>(
"scheduler_run_duration_seconds",
unit: "s",
@@ -188,6 +193,18 @@ public sealed class SchedulerWorkerMetrics : IDisposable
_surfaceManifestPrefetchTotal.Add(1, tags);
}
/// <summary>
/// Counts one policy re-evaluation and records its duration in seconds; both measurements carry
/// the same <c>tenant</c> and <c>status</c> tags.
/// </summary>
/// <param name="tenantId">Value of the <c>tenant</c> tag.</param>
/// <param name="status">Value of the <c>status</c> tag.</param>
/// <param name="duration">Job duration; negative values are recorded as zero.</param>
public void RecordPolicyReEvaluation(string tenantId, string status, TimeSpan duration)
{
    KeyValuePair<string, object?>[] measurementTags =
    {
        new("tenant", tenantId),
        new("status", status)
    };

    // Clamp so a negative duration never reaches the histogram.
    var elapsedSeconds = Math.Max(duration.TotalSeconds, 0d);

    _policyReEvaluationTotal.Add(1, measurementTags);
    _policyReEvaluationDurationSeconds.Record(elapsedSeconds, measurementTags);
}
public void RecordDeltaSummaries(string mode, IReadOnlyList<DeltaSummary> deltas)
{
if (deltas.Count == 0)

View File

@@ -15,12 +15,21 @@ public sealed class SchedulerWorkerOptions
public GraphOptions Graph { get; set; } = new();
public SurfaceOptions Surface { get; set; } = new();
public ExceptionOptions Exception { get; set; } = new();
public ReachabilityOptions Reachability { get; set; } = new();
/// <summary>
/// Validates every option group by calling its own <c>Validate</c> method in a fixed order.
/// </summary>
/// <remarks>
/// The calls run sequentially, so an exception thrown by one group stops the sequence before
/// the remaining groups are checked.
/// </remarks>
public void Validate()
{
Planner.Validate();
Runner.Validate();
Policy.Validate();
Graph.Validate();
Surface.Validate();
Exception.Validate();
Reachability.Validate();
}
public sealed class PlannerOptions
@@ -280,21 +289,21 @@ public sealed class SchedulerWorkerOptions
/// </summary>
public bool Enabled { get; set; } = true;
public DispatchOptions Dispatch { get; set; } = new();
public ApiOptions Api { get; set; } = new();
public TargetingOptions Targeting { get; set; } = new();
public WebhookOptions Webhook { get; set; } = new();
/// <summary>
/// Validates the nested dispatch, API, targeting and webhook options, in that order.
/// An exception thrown by one group stops the sequence before the remaining groups are checked.
/// </summary>
public void Validate()
{
Dispatch.Validate();
Api.Validate();
Targeting.Validate();
Webhook.Validate();
}
public DispatchOptions Dispatch { get; set; } = new();
public ApiOptions Api { get; set; } = new();
public TargetingOptions Targeting { get; set; } = new();
public WebhookOptions Webhook { get; set; } = new();
/// <summary>
/// Validates the nested dispatch, API, targeting and webhook options, in that order.
/// An exception thrown by one group stops the sequence before the remaining groups are checked.
/// </summary>
public void Validate()
{
Dispatch.Validate();
Api.Validate();
Targeting.Validate();
Webhook.Validate();
}
public sealed class DispatchOptions
{
@@ -433,11 +442,11 @@ public sealed class SchedulerWorkerOptions
}
}
public sealed class TargetingOptions
{
/// <summary>
/// When disabled the worker skips policy delta targeting.
/// </summary>
public sealed class TargetingOptions
{
/// <summary>
/// When disabled the worker skips policy delta targeting.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
@@ -457,59 +466,59 @@ public sealed class SchedulerWorkerOptions
throw new InvalidOperationException("Policy targeting MaxSboms must be greater than zero.");
}
}
}
/// <summary>
/// Settings for the webhook callback emitted when simulations complete.
/// </summary>
public sealed class WebhookOptions
{
    /// <summary>
    /// Turns webhook callbacks on; they are off unless this is set.
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Absolute URI that receives the callback.
    /// </summary>
    public string? Endpoint { get; set; }

    /// <summary>
    /// Name of an optional header that carries the API key.
    /// </summary>
    public string? ApiKeyHeader { get; set; }

    /// <summary>
    /// Optional API key value that goes with <see cref="ApiKeyHeader"/>.
    /// </summary>
    public string? ApiKey { get; set; }

    /// <summary>
    /// Request timeout, in seconds.
    /// </summary>
    public int TimeoutSeconds { get; set; } = 10;

    /// <summary>
    /// Checks the settings when the webhook is enabled; disabled settings are never inspected.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The endpoint is missing or not an absolute URI, or the timeout is not positive.
    /// </exception>
    public void Validate()
    {
        if (!Enabled)
        {
            return;
        }

        var problem = DescribeProblem();
        if (problem is not null)
        {
            throw new InvalidOperationException(problem);
        }
    }

    /// <summary>
    /// Returns the message for the first violated rule, or <c>null</c> when all rules hold.
    /// </summary>
    private string? DescribeProblem()
    {
        if (string.IsNullOrWhiteSpace(Endpoint))
        {
            return "Policy webhook endpoint must be configured when enabled.";
        }

        if (!Uri.TryCreate(Endpoint, UriKind.Absolute, out _))
        {
            return "Policy webhook endpoint must be an absolute URI.";
        }

        return TimeoutSeconds <= 0
            ? "Policy webhook timeout must be greater than zero."
            : null;
    }
}
}
}
/// <summary>
/// Settings for the webhook callback emitted when simulations complete.
/// </summary>
public sealed class WebhookOptions
{
    /// <summary>
    /// Turns webhook callbacks on; they are off unless this is set.
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Absolute URI that receives the callback.
    /// </summary>
    public string? Endpoint { get; set; }

    /// <summary>
    /// Name of an optional header that carries the API key.
    /// </summary>
    public string? ApiKeyHeader { get; set; }

    /// <summary>
    /// Optional API key value that goes with <see cref="ApiKeyHeader"/>.
    /// </summary>
    public string? ApiKey { get; set; }

    /// <summary>
    /// Request timeout, in seconds.
    /// </summary>
    public int TimeoutSeconds { get; set; } = 10;

    /// <summary>
    /// Checks the settings when the webhook is enabled; disabled settings are never inspected.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The endpoint is missing or not an absolute URI, or the timeout is not positive.
    /// </exception>
    public void Validate()
    {
        if (!Enabled)
        {
            return;
        }

        var problem = DescribeProblem();
        if (problem is not null)
        {
            throw new InvalidOperationException(problem);
        }
    }

    /// <summary>
    /// Returns the message for the first violated rule, or <c>null</c> when all rules hold.
    /// </summary>
    private string? DescribeProblem()
    {
        if (string.IsNullOrWhiteSpace(Endpoint))
        {
            return "Policy webhook endpoint must be configured when enabled.";
        }

        if (!Uri.TryCreate(Endpoint, UriKind.Absolute, out _))
        {
            return "Policy webhook endpoint must be an absolute URI.";
        }

        return TimeoutSeconds <= 0
            ? "Policy webhook timeout must be greater than zero."
            : null;
    }
}
}
public sealed class GraphOptions
{
@@ -700,4 +709,174 @@ public sealed class SchedulerWorkerOptions
}
}
}
/// <summary>
/// Options for Surface.FS pointer evaluation per SCHED-SURFACE-01.
/// </summary>
public sealed class SurfaceOptions
{
    /// <summary>
    /// When enabled, Surface.FS pointers are evaluated during planning to detect drift.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// When enabled, the worker operates in sealed mode rejecting external storage URIs.
    /// </summary>
    public bool SealedMode { get; set; } = false;

    /// <summary>
    /// When enabled, images with unchanged versions are skipped to avoid redundant scans.
    /// </summary>
    public bool SkipRedundantScans { get; set; } = true;

    /// <summary>
    /// Allowed dataset types for Surface.FS pointers.
    /// The default set compares names case-insensitively; a replacement set keeps whatever
    /// comparer it was created with.
    /// </summary>
    public HashSet<string> AllowedDatasets { get; set; } = new(StringComparer.OrdinalIgnoreCase)
    {
        "sbom",
        "findings",
        "reachability",
        "policy",
        "attestation"
    };

    /// <summary>
    /// Time-to-live for cached pointer versions.
    /// </summary>
    public TimeSpan CacheTtl { get; set; } = TimeSpan.FromMinutes(30);

    /// <summary>
    /// Checks that at least one dataset is allowed and that the cache TTL is positive.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// <see cref="AllowedDatasets"/> is <c>null</c> or empty, or <see cref="CacheTtl"/> is zero or negative.
    /// </exception>
    public void Validate()
    {
        // The setter is public, so the set can be replaced by null; report that as the same
        // configuration error as an empty set rather than failing with a NullReferenceException.
        if (AllowedDatasets is not { Count: > 0 })
        {
            throw new InvalidOperationException("Surface allowed datasets must contain at least one value.");
        }

        if (CacheTtl <= TimeSpan.Zero)
        {
            throw new InvalidOperationException("Surface cache TTL must be greater than zero.");
        }
    }
}
/// <summary>
/// Settings for the exception lifecycle workers (SCHED-WORKER-25-101 / 25-102).
/// </summary>
public sealed class ExceptionOptions
{
    /// <summary>
    /// Switches the expiring notification worker on or off.
    /// </summary>
    public bool ExpiringNotificationEnabled { get; set; } = true;

    /// <summary>
    /// Look-ahead window: exceptions expiring within this span from "now" are included in digests.
    /// </summary>
    public TimeSpan ExpiringNotificationWindow { get; set; } = TimeSpan.FromDays(7);

    /// <summary>
    /// Pause between two expiring notification checks.
    /// </summary>
    public TimeSpan ExpiringCheckInterval { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// Upper bound on retries when publishing an exception event.
    /// </summary>
    public int MaxPublishRetries { get; set; } = 3;

    /// <summary>
    /// Starting delay of the exponential backoff used between publish retries.
    /// </summary>
    public TimeSpan PublishRetryDelay { get; set; } = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Checks the settings in declaration order and reports the first violation.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The window or the check interval is not positive, or the retry count or retry delay is negative.
    /// </exception>
    public void Validate()
    {
        Ensure(
            ExpiringNotificationWindow > TimeSpan.Zero,
            "Exception expiring notification window must be greater than zero.");
        Ensure(
            ExpiringCheckInterval > TimeSpan.Zero,
            "Exception expiring check interval must be greater than zero.");
        Ensure(
            MaxPublishRetries >= 0,
            "Exception max publish retries cannot be negative.");
        Ensure(
            PublishRetryDelay >= TimeSpan.Zero,
            "Exception publish retry delay cannot be negative.");

        // Throws the configuration error when a requirement does not hold.
        static void Ensure(bool requirement, string failureMessage)
        {
            if (!requirement)
            {
                throw new InvalidOperationException(failureMessage);
            }
        }
    }
}
/// <summary>
/// Settings for the reachability joiner worker (SCHED-WORKER-26-201).
/// </summary>
public sealed class ReachabilityOptions
{
    /// <summary>
    /// Switches the reachability joiner worker, which combines SBOM snapshots with signals, on or off.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Upper bound on SBOM snapshots handled in one batch.
    /// </summary>
    public int BatchSize { get; set; } = 50;

    /// <summary>
    /// Polling interval of the joiner loop.
    /// </summary>
    public TimeSpan PollInterval { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// Delay applied when there is no work to pick up.
    /// </summary>
    public TimeSpan IdleDelay { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Time-to-live of cached reachability facts.
    /// </summary>
    public TimeSpan FactCacheTtl { get; set; } = TimeSpan.FromHours(24);

    /// <summary>
    /// Upper bound on concurrently running signal processing tasks; defaults to the processor count.
    /// </summary>
    public int MaxConcurrency { get; set; } = Environment.ProcessorCount;

    /// <summary>
    /// Checks the settings in declaration order and reports the first violation.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Batch size, poll interval, fact cache TTL or max concurrency is not positive, or the idle delay is negative.
    /// </exception>
    public void Validate()
    {
        string? problem =
            BatchSize <= 0 ? "Reachability batch size must be greater than zero."
            : PollInterval <= TimeSpan.Zero ? "Reachability poll interval must be greater than zero."
            : IdleDelay < TimeSpan.Zero ? "Reachability idle delay cannot be negative."
            : FactCacheTtl <= TimeSpan.Zero ? "Reachability fact cache TTL must be greater than zero."
            : MaxConcurrency <= 0 ? "Reachability max concurrency must be greater than zero."
            : null;

        if (problem is not null)
        {
            throw new InvalidOperationException(problem);
        }
    }
}
}

View File

@@ -0,0 +1,140 @@
using System.Text.Json.Serialization;
using System.Text.RegularExpressions;
namespace StellaOps.Scheduler.Worker.Planning;
/// <summary>
/// Represents a Surface.FS pointer per SCHED-SURFACE-01 contract.
/// Format: surfacefs://&lt;tenant&gt;/&lt;dataset&gt;/&lt;version&gt;
/// </summary>
public sealed partial record SurfaceFsPointer
{
    /// <summary>
    /// Creates a pointer.
    /// </summary>
    /// <param name="tenantId">Tenant identifier; required.</param>
    /// <param name="dataset">Dataset type; required.</param>
    /// <param name="version">Version identifier; required.</param>
    /// <param name="storageUri">Optional storage URI.</param>
    /// <param name="createdAt">Optional creation timestamp.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="tenantId"/>, <paramref name="dataset"/> or <paramref name="version"/> is null, empty or whitespace.
    /// </exception>
    public SurfaceFsPointer(
        string tenantId,
        string dataset,
        string version,
        string? storageUri = null,
        DateTimeOffset? createdAt = null)
    {
        if (string.IsNullOrWhiteSpace(tenantId))
        {
            throw new ArgumentException("Tenant ID is required.", nameof(tenantId));
        }

        if (string.IsNullOrWhiteSpace(dataset))
        {
            throw new ArgumentException("Dataset is required.", nameof(dataset));
        }

        if (string.IsNullOrWhiteSpace(version))
        {
            throw new ArgumentException("Version is required.", nameof(version));
        }

        TenantId = tenantId;
        Dataset = dataset;
        Version = version;
        StorageUri = storageUri;
        CreatedAt = createdAt;
    }

    /// <summary>
    /// Tenant identifier.
    /// </summary>
    [JsonPropertyName("tenant_id")]
    public string TenantId { get; init; }

    /// <summary>
    /// Dataset type (e.g., "sbom", "findings", "reachability").
    /// </summary>
    [JsonPropertyName("dataset")]
    public string Dataset { get; init; }

    /// <summary>
    /// Version identifier (content hash or monotonic version).
    /// </summary>
    [JsonPropertyName("version")]
    public string Version { get; init; }

    /// <summary>
    /// Storage URI (unset/relative in sealed mode; content-addressed path recommended).
    /// </summary>
    [JsonPropertyName("storage_uri")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? StorageUri { get; init; }

    /// <summary>
    /// Creation timestamp (RFC3339 UTC).
    /// </summary>
    [JsonPropertyName("created_at")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public DateTimeOffset? CreatedAt { get; init; }

    /// <summary>
    /// Generates a canonical URI for this pointer.
    /// </summary>
    public string ToUri() => $"surfacefs://{TenantId}/{Dataset}/{Version}";

    /// <summary>
    /// Generates a cache key for this pointer.
    /// </summary>
    public string ToCacheKey() => $"surface_fs_pointer::{TenantId}::{Dataset}::{Version}";

    /// <summary>
    /// Parses a Surface.FS URI into a pointer.
    /// </summary>
    /// <param name="uri">Text of the form <c>surfacefs://tenant/dataset/version</c>; the scheme is matched case-insensitively.</param>
    /// <returns>
    /// The pointer, or <c>null</c> when <paramref name="uri"/> is blank, does not match the format,
    /// or contains a tenant, dataset or version segment that is only whitespace.
    /// </returns>
    public static SurfaceFsPointer? Parse(string uri)
    {
        if (string.IsNullOrWhiteSpace(uri))
        {
            return null;
        }

        var match = SurfaceFsUriRegex().Match(uri);
        if (!match.Success)
        {
            return null;
        }

        var tenant = match.Groups["tenant"].Value;
        var dataset = match.Groups["dataset"].Value;
        var version = match.Groups["version"].Value;

        // The pattern accepts whitespace-only segments (for example "surfacefs://acme/sbom/ "),
        // which the constructor rejects with ArgumentException. Parsing reports them as
        // "not a pointer" instead, so neither Parse nor TryParse throws on malformed input.
        if (string.IsNullOrWhiteSpace(tenant) ||
            string.IsNullOrWhiteSpace(dataset) ||
            string.IsNullOrWhiteSpace(version))
        {
            return null;
        }

        return new SurfaceFsPointer(
            tenantId: tenant,
            dataset: dataset,
            version: version);
    }

    /// <summary>
    /// Tries to parse a Surface.FS URI.
    /// </summary>
    /// <param name="uri">Text to parse.</param>
    /// <param name="pointer">The parsed pointer when the method returns <c>true</c>; otherwise <c>null</c>.</param>
    /// <returns><c>true</c> when <paramref name="uri"/> is a well-formed Surface.FS URI.</returns>
    public static bool TryParse(
        string uri,
        [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out SurfaceFsPointer? pointer)
    {
        pointer = Parse(uri);
        return pointer is not null;
    }

    [GeneratedRegex(@"^surfacefs://(?<tenant>[^/]+)/(?<dataset>[^/]+)/(?<version>.+)$", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
    private static partial Regex SurfaceFsUriRegex();
}
/// <summary>
/// Well-known dataset names used in Surface.FS pointers.
/// </summary>
public static class SurfaceFsDatasets
{
    public const string Sbom = "sbom";
    public const string Findings = "findings";
    public const string Reachability = "reachability";
    public const string Policy = "policy";
    public const string Attestation = "attestation";

    /// <summary>
    /// Datasets the scheduler accepts by default; membership checks ignore case.
    /// </summary>
    public static readonly IReadOnlySet<string> DefaultAllowlist = new HashSet<string>(
        new[] { Sbom, Findings, Reachability, Policy, Attestation },
        StringComparer.OrdinalIgnoreCase);
}

View File

@@ -0,0 +1,356 @@
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Planning;
/// <summary>
/// Service for evaluating Surface.FS pointers during delta scan planning.
/// Implements SCHED-SURFACE-01: prioritizes drift-triggered assets and avoids redundant work.
/// </summary>
public interface ISurfaceFsPointerEvaluator
{
/// <summary>
/// Validates a Surface.FS pointer against the allowlist and sealed mode rules.
/// </summary>
/// <param name="pointer">Pointer to check.</param>
/// <returns>A result whose <c>Error</c> describes the violated rule when the pointer is not valid.</returns>
SurfaceFsValidationResult Validate(SurfaceFsPointer pointer);
/// <summary>
/// Checks if the pointer represents drift from the cached version.
/// </summary>
/// <param name="pointer">Pointer to compare against the cache.</param>
/// <param name="cancellationToken">Token that cancels the cache lookup.</param>
/// <returns>The drift verdict together with the cached version details, when a cached pointer exists.</returns>
ValueTask<SurfaceFsDriftResult> CheckDriftAsync(
SurfaceFsPointer pointer,
CancellationToken cancellationToken = default);
/// <summary>
/// Evaluates pointers for a batch of images and prioritizes drift-triggered assets.
/// </summary>
/// <param name="images">Images under consideration for the plan.</param>
/// <param name="manifestPointers">Pointers keyed by image digest; the default evaluator keeps images without an entry, without a priority boost.</param>
/// <param name="cancellationToken">Token that cancels the evaluation.</param>
/// <returns>The ordered images to plan, the skipped images, and the drift and redundancy counts.</returns>
ValueTask<SurfaceFsEvaluationResult> EvaluateForPlanningAsync(
IReadOnlyList<ImpactImage> images,
IReadOnlyDictionary<string, SurfaceFsPointer> manifestPointers,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Outcome of validating a Surface.FS pointer.
/// </summary>
/// <param name="IsValid">Whether the pointer passed validation.</param>
/// <param name="Error">Description of the violated rule; <c>null</c> for a valid result.</param>
public sealed record SurfaceFsValidationResult(
    bool IsValid,
    string? Error = null)
{
    /// <summary>Shared result for a pointer that passed validation.</summary>
    public static SurfaceFsValidationResult Valid { get; } = new SurfaceFsValidationResult(IsValid: true);

    /// <summary>Creates a failed result carrying <paramref name="error"/>.</summary>
    public static SurfaceFsValidationResult Invalid(string error)
    {
        return new SurfaceFsValidationResult(IsValid: false, Error: error);
    }
}
/// <summary>
/// Outcome of comparing a pointer with its cached counterpart.
/// </summary>
/// <param name="Pointer">Pointer that was checked.</param>
/// <param name="HasDrift">Whether the pointer differs from the cache (or no cached pointer exists).</param>
/// <param name="CachedVersion">Version of the cached pointer, when one exists.</param>
/// <param name="CachedAt">Creation timestamp of the cached pointer, when one exists and carries one.</param>
public sealed record SurfaceFsDriftResult(
    SurfaceFsPointer Pointer,
    bool HasDrift,
    string? CachedVersion = null,
    DateTimeOffset? CachedAt = null)
{
    /// <summary>
    /// Priority boost for planning: 10 for drift-triggered assets, 0 otherwise (higher = more priority).
    /// </summary>
    public int PriorityBoost
    {
        get
        {
            const int driftBoost = 10;
            return HasDrift ? driftBoost : 0;
        }
    }
}
/// <summary>
/// Outcome of evaluating a batch of images for planning.
/// </summary>
/// <param name="PrioritizedImages">Images to plan, drift-triggered ones first.</param>
/// <param name="SkippedImages">Images left out of the plan.</param>
/// <param name="DriftTriggeredCount">Number of images whose pointer showed drift.</param>
/// <param name="RedundantCount">Number of images skipped because their version was unchanged.</param>
public sealed record SurfaceFsEvaluationResult(
    IReadOnlyList<ImpactImage> PrioritizedImages,
    IReadOnlyList<ImpactImage> SkippedImages,
    int DriftTriggeredCount,
    int RedundantCount)
{
    /// <summary>
    /// <c>true</c> when at least one image was drift-triggered.
    /// </summary>
    public bool HasDrift
    {
        get { return DriftTriggeredCount > 0; }
    }
}
/// <summary>
/// Default implementation of Surface.FS pointer evaluator.
/// </summary>
public sealed class SurfaceFsPointerEvaluator : ISurfaceFsPointerEvaluator
{
private readonly ISurfaceFsPointerCache _cache;
private readonly SchedulerWorkerOptions _options;
private readonly TimeProvider _timeProvider;
private readonly ILogger<SurfaceFsPointerEvaluator> _logger;
public SurfaceFsPointerEvaluator(
ISurfaceFsPointerCache cache,
SchedulerWorkerOptions options,
TimeProvider? timeProvider,
ILogger<SurfaceFsPointerEvaluator> logger)
{
_cache = cache ?? throw new ArgumentNullException(nameof(cache));
_options = options ?? throw new ArgumentNullException(nameof(options));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public SurfaceFsValidationResult Validate(SurfaceFsPointer pointer)
{
ArgumentNullException.ThrowIfNull(pointer);
var surfaceOptions = _options.Surface;
// Validate dataset against allowlist
if (!surfaceOptions.AllowedDatasets.Contains(pointer.Dataset))
{
return SurfaceFsValidationResult.Invalid(
$"Dataset '{pointer.Dataset}' is not in the allowed list.");
}
// In sealed mode, reject external storage URIs
if (surfaceOptions.SealedMode && !string.IsNullOrWhiteSpace(pointer.StorageUri))
{
if (!IsLocalOrContentAddressedUri(pointer.StorageUri))
{
return SurfaceFsValidationResult.Invalid(
$"External storage URI '{pointer.StorageUri}' not permitted in sealed mode.");
}
}
return SurfaceFsValidationResult.Valid;
}
public async ValueTask<SurfaceFsDriftResult> CheckDriftAsync(
SurfaceFsPointer pointer,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(pointer);
var cached = await _cache.GetAsync(pointer.TenantId, pointer.Dataset, cancellationToken).ConfigureAwait(false);
if (cached is null)
{
// No cached version means this is new - treat as drift
return new SurfaceFsDriftResult(pointer, HasDrift: true);
}
var hasDrift = !string.Equals(cached.Version, pointer.Version, StringComparison.Ordinal);
return new SurfaceFsDriftResult(
pointer,
HasDrift: hasDrift,
CachedVersion: cached.Version,
CachedAt: cached.CreatedAt);
}
public async ValueTask<SurfaceFsEvaluationResult> EvaluateForPlanningAsync(
IReadOnlyList<ImpactImage> images,
IReadOnlyDictionary<string, SurfaceFsPointer> manifestPointers,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(images);
ArgumentNullException.ThrowIfNull(manifestPointers);
if (images.Count == 0)
{
return new SurfaceFsEvaluationResult(
PrioritizedImages: [],
SkippedImages: [],
DriftTriggeredCount: 0,
RedundantCount: 0);
}
var driftImages = new List<(ImpactImage Image, int Priority)>();
var noDriftImages = new List<ImpactImage>();
var skippedImages = new List<ImpactImage>();
var driftCount = 0;
var redundantCount = 0;
foreach (var image in images)
{
if (!manifestPointers.TryGetValue(image.ImageDigest, out var pointer))
{
// No pointer for this image - include without priority boost
noDriftImages.Add(image);
continue;
}
var validation = Validate(pointer);
if (!validation.IsValid)
{
_logger.LogDebug(
"Skipping image {Digest} due to invalid pointer: {Error}",
image.ImageDigest,
validation.Error);
skippedImages.Add(image);
continue;
}
var drift = await CheckDriftAsync(pointer, cancellationToken).ConfigureAwait(false);
if (drift.HasDrift)
{
driftImages.Add((image, drift.PriorityBoost));
driftCount++;
_logger.LogDebug(
"Image {Digest} has drift: cached={CachedVersion}, new={NewVersion}",
image.ImageDigest,
drift.CachedVersion ?? "(none)",
pointer.Version);
}
else
{
// Check if this would be redundant work (same version already processed)
if (_options.Surface.SkipRedundantScans)
{
skippedImages.Add(image);
redundantCount++;
_logger.LogDebug(
"Skipping redundant scan for image {Digest} (version {Version} unchanged)",
image.ImageDigest,
pointer.Version);
}
else
{
noDriftImages.Add(image);
}
}
}
// Prioritize drift-triggered images first, then the rest
var prioritized = driftImages
.OrderByDescending(static x => x.Priority)
.ThenBy(static x => x.Image.ImageDigest, StringComparer.OrdinalIgnoreCase)
.Select(static x => x.Image)
.Concat(noDriftImages.OrderBy(static x => x.ImageDigest, StringComparer.OrdinalIgnoreCase))
.ToList();
_logger.LogInformation(
"Surface.FS evaluation: {Total} images, {DriftCount} drift-triggered, {RedundantCount} redundant, {SkippedCount} skipped",
images.Count,
driftCount,
redundantCount,
skippedImages.Count);
return new SurfaceFsEvaluationResult(
PrioritizedImages: prioritized,
SkippedImages: skippedImages,
DriftTriggeredCount: driftCount,
RedundantCount: redundantCount);
}
/// <summary>
/// Decides whether a URI is acceptable as a local or content-addressed location.
/// </summary>
/// <param name="uri">URI or path to classify; may be blank.</param>
/// <returns>
/// <c>true</c> for blank values, for values without a <c>://</c> separator (relative paths),
/// for <c>file://</c> URIs and for values starting with <c>sha256:</c>, <c>sha512:</c> or
/// <c>content:</c>; otherwise <c>false</c>.
/// </returns>
private static bool IsLocalOrContentAddressedUri(string uri)
{
    // Blank values and scheme-less (relative) paths are always accepted.
    if (string.IsNullOrWhiteSpace(uri) || !uri.Contains("://", StringComparison.Ordinal))
    {
        return true;
    }

    // Anything carrying a scheme must be file:// or one of the content-addressed schemes.
    return HasPrefix("file://")
        || HasPrefix("sha256:")
        || HasPrefix("sha512:")
        || HasPrefix("content:");

    bool HasPrefix(string prefix) => uri.StartsWith(prefix, StringComparison.OrdinalIgnoreCase);
}
}
/// <summary>
/// Cache interface for Surface.FS pointers.
/// </summary>
/// <remarks>
/// Entries are addressed by tenant and dataset. All operations are asynchronous and accept a cancellation token.
/// </remarks>
public interface ISurfaceFsPointerCache
{
/// <summary>
/// Gets a cached pointer for the specified tenant and dataset.
/// </summary>
/// <param name="tenantId">Tenant the pointer belongs to.</param>
/// <param name="dataset">Dataset the pointer refers to.</param>
/// <param name="cancellationToken">Token used to cancel the lookup.</param>
/// <returns>The cached pointer; the nullable return type allows for no pointer being available.</returns>
ValueTask<SurfaceFsPointer?> GetAsync(
string tenantId,
string dataset,
CancellationToken cancellationToken = default);
/// <summary>
/// Sets/updates a cached pointer.
/// </summary>
/// <param name="pointer">Pointer to store.</param>
/// <param name="cancellationToken">Token used to cancel the write.</param>
ValueTask SetAsync(
SurfaceFsPointer pointer,
CancellationToken cancellationToken = default);
/// <summary>
/// Removes a cached pointer.
/// </summary>
/// <param name="tenantId">Tenant the pointer belongs to.</param>
/// <param name="dataset">Dataset the pointer refers to.</param>
/// <param name="cancellationToken">Token used to cancel the removal.</param>
ValueTask RemoveAsync(
string tenantId,
string dataset,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Dictionary-backed <see cref="ISurfaceFsPointerCache"/> that keeps pointers in process memory.
/// </summary>
/// <remarks>
/// Entries are keyed by "tenant::dataset" and compared case-insensitively (ordinal).
/// Every access to the dictionary happens under a private lock.
/// </remarks>
public sealed class InMemorySurfaceFsPointerCache : ISurfaceFsPointerCache
{
    private readonly object _lock = new();
    private readonly Dictionary<string, SurfaceFsPointer> _cache = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>Looks up the pointer stored for a tenant/dataset pair.</summary>
    /// <returns>The stored pointer, or <c>null</c> when the pair has no entry.</returns>
    public ValueTask<SurfaceFsPointer?> GetAsync(
        string tenantId,
        string dataset,
        CancellationToken cancellationToken = default)
    {
        var cacheKey = BuildKey(tenantId, dataset);
        SurfaceFsPointer? cached;

        lock (_lock)
        {
            _cache.TryGetValue(cacheKey, out cached);
        }

        return ValueTask.FromResult(cached);
    }

    /// <summary>Stores the pointer, replacing any entry for the same tenant/dataset pair.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="pointer"/> is <c>null</c>.</exception>
    public ValueTask SetAsync(
        SurfaceFsPointer pointer,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(pointer);

        var cacheKey = BuildKey(pointer.TenantId, pointer.Dataset);

        lock (_lock)
        {
            _cache[cacheKey] = pointer;
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>Drops the entry for a tenant/dataset pair; does nothing when there is none.</summary>
    public ValueTask RemoveAsync(
        string tenantId,
        string dataset,
        CancellationToken cancellationToken = default)
    {
        var cacheKey = BuildKey(tenantId, dataset);

        lock (_lock)
        {
            _cache.Remove(cacheKey);
        }

        return ValueTask.CompletedTask;
    }

    // Composite key: tenant and dataset joined by "::".
    private static string BuildKey(string tenantId, string dataset)
    {
        return string.Concat(tenantId, "::", dataset);
    }
}

View File

@@ -0,0 +1,134 @@
using System.Text.Json.Serialization;
namespace StellaOps.Scheduler.Worker.Policy;
/// <summary>
/// Policy activation event per SCHED-WORKER-23-101 contract.
/// Event type: scheduler.policy.activation.requested
/// </summary>
public sealed record PolicyActivationEvent
{
    /// <summary>
    /// Creates an activation event.
    /// </summary>
    /// <param name="jobId">Unique job identifier; must not be blank.</param>
    /// <param name="policyRunId">Associated policy run identifier; must not be blank.</param>
    /// <param name="tenantId">Tenant scope; must not be blank.</param>
    /// <param name="priority">Processing priority (higher = more urgent).</param>
    /// <param name="requestedAtUtc">Timestamp of the activation request.</param>
    /// <param name="throttleSource">Source of throttle configuration.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="jobId"/>, <paramref name="policyRunId"/> or <paramref name="tenantId"/> is null, empty or whitespace.
    /// </exception>
    public PolicyActivationEvent(
        string jobId,
        string policyRunId,
        string tenantId,
        int priority,
        DateTimeOffset requestedAtUtc,
        PolicyThrottleSource throttleSource)
    {
        if (string.IsNullOrWhiteSpace(jobId))
        {
            throw new ArgumentException("Job ID is required.", nameof(jobId));
        }

        if (string.IsNullOrWhiteSpace(policyRunId))
        {
            throw new ArgumentException("Policy run ID is required.", nameof(policyRunId));
        }

        if (string.IsNullOrWhiteSpace(tenantId))
        {
            throw new ArgumentException("Tenant ID is required.", nameof(tenantId));
        }

        JobId = jobId;
        PolicyRunId = policyRunId;
        TenantId = tenantId;
        Priority = priority;
        RequestedAtUtc = requestedAtUtc;
        ThrottleSource = throttleSource;
    }

    /// <summary>
    /// Event type constant.
    /// </summary>
    public const string EventType = "scheduler.policy.activation.requested";

    /// <summary>
    /// Unique job identifier for idempotency.
    /// </summary>
    [JsonPropertyName("job_id")]
    public string JobId { get; init; }

    /// <summary>
    /// Associated policy run identifier.
    /// </summary>
    [JsonPropertyName("policy_run_id")]
    public string PolicyRunId { get; init; }

    /// <summary>
    /// Tenant scope for this activation.
    /// </summary>
    [JsonPropertyName("tenant_id")]
    public string TenantId { get; init; }

    /// <summary>
    /// Processing priority (higher = more urgent).
    /// </summary>
    [JsonPropertyName("priority")]
    public int Priority { get; init; }

    /// <summary>
    /// UTC timestamp when activation was requested.
    /// </summary>
    [JsonPropertyName("requested_at_utc")]
    public DateTimeOffset RequestedAtUtc { get; init; }

    /// <summary>
    /// Source of throttle configuration. Written as a kebab-case string
    /// (for example <c>"policy-signal"</c>).
    /// </summary>
    [JsonPropertyName("throttle_source")]
    [JsonConverter(typeof(PolicyThrottleSourceJsonConverter))]
    public PolicyThrottleSource ThrottleSource { get; init; }

    /// <summary>
    /// Optional bundle pointers for policy/export data. Omitted from the JSON when null.
    /// </summary>
    [JsonPropertyName("bundle_pointers")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public PolicyBundlePointers? BundlePointers { get; init; }
}

/// <summary>
/// String enum converter that writes <see cref="PolicyThrottleSource"/> members in lower kebab-case.
/// </summary>
/// <remarks>
/// <see cref="JsonStringEnumConverter"/> does not read <see cref="JsonPropertyNameAttribute"/> on enum
/// members, so the wire names are produced with a naming policy instead:
/// <c>SchedulerDefault</c> -> <c>scheduler-default</c>, <c>PolicySignal</c> -> <c>policy-signal</c>,
/// <c>ManualOverride</c> -> <c>manual-override</c>.
/// </remarks>
public sealed class PolicyThrottleSourceJsonConverter : JsonStringEnumConverter
{
    /// <summary>Creates the converter with the lower kebab-case naming policy.</summary>
    public PolicyThrottleSourceJsonConverter()
        : base(System.Text.Json.JsonNamingPolicy.KebabCaseLower)
    {
    }
}

/// <summary>
/// Throttle source configuration for policy activation.
/// </summary>
public enum PolicyThrottleSource
{
    /// <summary>
    /// Use default scheduler throttling rules. Wire name: <c>scheduler-default</c>.
    /// </summary>
    SchedulerDefault,

    /// <summary>
    /// Use policy-specific throttle signals. Wire name: <c>policy-signal</c>.
    /// </summary>
    PolicySignal,

    /// <summary>
    /// Manual override of throttle configuration. Wire name: <c>manual-override</c>.
    /// </summary>
    ManualOverride
}
/// <summary>
/// Optional bundle pointers for policy activation.
/// </summary>
/// <remarks>
/// Both pointers are optional; a null pointer is left out of the serialized JSON
/// (see the <c>WhenWritingNull</c> ignore condition on each property).
/// </remarks>
public sealed record PolicyBundlePointers
{
/// <summary>
/// Pointer to policy definition bundle.
/// </summary>
// Serialized as "policy_bundle"; omitted when null.
[JsonPropertyName("policy_bundle")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? PolicyBundle { get; init; }
/// <summary>
/// Pointer to export data bundle.
/// </summary>
// Serialized as "export_bundle"; omitted when null.
[JsonPropertyName("export_bundle")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? ExportBundle { get; init; }
}

View File

@@ -0,0 +1,501 @@
using System.Collections.Immutable;
using System.Threading.RateLimiting;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Policy;
/// <summary>
/// Policy re-evaluation worker per SCHED-WORKER-23-101.
/// Handles policy activation events, shards assets, honors rate limits, and updates progress.
/// </summary>
/// <remarks>
/// The worker polls <see cref="IPolicyActivationQueue"/> in batches and processes each event
/// sequentially. A failure while handling one event (including a failure to report that failure)
/// is logged and does not prevent the remaining events of the batch from being processed.
/// </remarks>
public sealed class PolicyReEvaluationWorker : BackgroundService
{
    private readonly IPolicyActivationQueue _activationQueue;
    private readonly IPolicyReEvaluationService _reEvaluationService;
    private readonly IPolicyProgressReporter _progressReporter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<PolicyReEvaluationWorker> _logger;

    /// <summary>Creates the worker.</summary>
    /// <param name="activationQueue">Source of activation events.</param>
    /// <param name="reEvaluationService">Service that performs the re-evaluation.</param>
    /// <param name="progressReporter">Sink for started/completed/failed notifications.</param>
    /// <param name="options">Worker options (policy enablement and dispatch settings are read).</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Metrics sink for re-evaluation outcomes.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any required dependency is null.</exception>
    public PolicyReEvaluationWorker(
        IPolicyActivationQueue activationQueue,
        IPolicyReEvaluationService reEvaluationService,
        IPolicyProgressReporter progressReporter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<PolicyReEvaluationWorker> logger)
    {
        _activationQueue = activationQueue ?? throw new ArgumentNullException(nameof(activationQueue));
        _reEvaluationService = reEvaluationService ?? throw new ArgumentNullException(nameof(reEvaluationService));
        _progressReporter = progressReporter ?? throw new ArgumentNullException(nameof(progressReporter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Main loop: dequeue a batch, process each event, idle when the queue is empty and back off
    /// after an unexpected error. Returns immediately when policy processing is disabled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Policy re-evaluation worker is disabled.");
            return;
        }

        _logger.LogInformation("Policy re-evaluation worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var events = await _activationQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (events.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var activationEvent in events)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessActivationEventAsync(activationEvent, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                _logger.LogInformation("Policy re-evaluation worker stopping due to cancellation.");
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in policy re-evaluation worker loop.");

                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested while backing off: leave the loop normally instead of
                    // letting the cancellation escape ExecuteAsync.
                    break;
                }
            }
        }

        _logger.LogInformation("Policy re-evaluation worker stopped.");
    }

    /// <summary>
    /// Runs one activation event end to end: report started, execute, report completed and record
    /// the outcome metric. Failures are logged, reported through the progress reporter and recorded
    /// as a "failed" metric.
    /// </summary>
    private async Task ProcessActivationEventAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing policy activation event: JobId={JobId}, PolicyRunId={PolicyRunId}, Tenant={TenantId}, Priority={Priority}",
            activationEvent.JobId,
            activationEvent.PolicyRunId,
            activationEvent.TenantId,
            activationEvent.Priority);

        try
        {
            // Report progress: started
            await _progressReporter.ReportStartedAsync(
                activationEvent.TenantId,
                activationEvent.PolicyRunId,
                activationEvent.JobId,
                cancellationToken).ConfigureAwait(false);

            // Execute re-evaluation
            var result = await _reEvaluationService.ExecuteAsync(
                activationEvent,
                cancellationToken).ConfigureAwait(false);

            // Report progress: completed
            await _progressReporter.ReportCompletedAsync(
                activationEvent.TenantId,
                activationEvent.PolicyRunId,
                activationEvent.JobId,
                result,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;
            _metrics.RecordPolicyReEvaluation(
                activationEvent.TenantId,
                result.Status.ToString().ToLowerInvariant(),
                duration);

            _logger.LogInformation(
                "Policy activation completed: JobId={JobId}, Status={Status}, AssetsProcessed={AssetsProcessed}, Duration={Duration}ms",
                activationEvent.JobId,
                result.Status,
                result.AssetsProcessed,
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (IsFailure(ex, cancellationToken))
        {
            _logger.LogError(
                ex,
                "Policy activation failed: JobId={JobId}, PolicyRunId={PolicyRunId}",
                activationEvent.JobId,
                activationEvent.PolicyRunId);

            try
            {
                await _progressReporter.ReportFailedAsync(
                    activationEvent.TenantId,
                    activationEvent.PolicyRunId,
                    activationEvent.JobId,
                    ex.Message,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception reportEx) when (IsFailure(reportEx, cancellationToken))
            {
                // The reporter itself is failing; keep going so the failure metric is still
                // recorded and the rest of the dequeued batch is not dropped.
                _logger.LogError(
                    reportEx,
                    "Failed to report policy activation failure: JobId={JobId}, PolicyRunId={PolicyRunId}",
                    activationEvent.JobId,
                    activationEvent.PolicyRunId);
            }

            var duration = _timeProvider.GetUtcNow() - startedAt;
            _metrics.RecordPolicyReEvaluation(
                activationEvent.TenantId,
                "failed",
                duration);
        }
    }

    // Everything is a failure except a cancellation caused by the worker's own token. An
    // OperationCanceledException raised while the token is not cancelled (e.g. a dependency
    // timeout) is handled like any other error.
    private static bool IsFailure(Exception exception, CancellationToken cancellationToken)
        => exception is not OperationCanceledException || !cancellationToken.IsCancellationRequested;
}
/// <summary>
/// Queue interface for policy activation events.
/// </summary>
public interface IPolicyActivationQueue
{
/// <summary>
/// Dequeues activation events for processing.
/// </summary>
/// <param name="maxCount">Upper bound on the number of events returned by one call.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The dequeued events; callers in this file treat an empty list as "queue is idle".</returns>
ValueTask<IReadOnlyList<PolicyActivationEvent>> DequeueAsync(
int maxCount,
CancellationToken cancellationToken = default);
/// <summary>
/// Enqueues an activation event for processing.
/// </summary>
/// <param name="activationEvent">Event to add to the queue.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
ValueTask EnqueueAsync(
PolicyActivationEvent activationEvent,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Service for executing policy re-evaluation.
/// </summary>
public interface IPolicyReEvaluationService
{
/// <summary>
/// Executes re-evaluation for a policy activation event.
/// </summary>
/// <param name="activationEvent">Event describing the policy run to re-evaluate.</param>
/// <param name="cancellationToken">Token used to cancel the re-evaluation.</param>
/// <returns>Summary of the run: status, processed asset count, shard counts and failed assets.</returns>
ValueTask<PolicyReEvaluationResult> ExecuteAsync(
PolicyActivationEvent activationEvent,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Outcome of one policy re-evaluation run.
/// </summary>
/// <param name="Status">Overall status of the run.</param>
/// <param name="AssetsProcessed">Number of assets processed.</param>
/// <param name="ShardsCompleted">Number of shards that finished.</param>
/// <param name="ShardsTotal">Number of shards the run was split into.</param>
/// <param name="FailedAssets">Identifiers of assets that failed.</param>
/// <param name="CompletedAt">Timestamp at which the result was produced.</param>
public sealed record PolicyReEvaluationResult(
    PolicyReEvaluationStatus Status,
    int AssetsProcessed,
    int ShardsCompleted,
    int ShardsTotal,
    ImmutableArray<string> FailedAssets,
    DateTimeOffset CompletedAt)
{
    /// <summary>Result for a run that had nothing to evaluate: all counters zero, no failed assets.</summary>
    public static PolicyReEvaluationResult NoWork(DateTimeOffset completedAt)
    {
        return new PolicyReEvaluationResult(
            PolicyReEvaluationStatus.NoWork,
            AssetsProcessed: 0,
            ShardsCompleted: 0,
            ShardsTotal: 0,
            FailedAssets: ImmutableArray<string>.Empty,
            CompletedAt: completedAt);
    }

    /// <summary>Result for a run that completed without any failed asset.</summary>
    public static PolicyReEvaluationResult Success(
        int assetsProcessed,
        int shardsCompleted,
        int shardsTotal,
        DateTimeOffset completedAt)
    {
        return new PolicyReEvaluationResult(
            PolicyReEvaluationStatus.Completed,
            assetsProcessed,
            shardsCompleted,
            shardsTotal,
            ImmutableArray<string>.Empty,
            completedAt);
    }

    /// <summary>Result for a run in which some assets failed; the failed identifiers are carried along.</summary>
    public static PolicyReEvaluationResult PartialSuccess(
        int assetsProcessed,
        int shardsCompleted,
        int shardsTotal,
        ImmutableArray<string> failedAssets,
        DateTimeOffset completedAt)
    {
        return new PolicyReEvaluationResult(
            PolicyReEvaluationStatus.PartiallyCompleted,
            assetsProcessed,
            shardsCompleted,
            shardsTotal,
            failedAssets,
            completedAt);
    }
}
/// <summary>
/// Status of policy re-evaluation.
/// </summary>
public enum PolicyReEvaluationStatus
{
// Used by PolicyReEvaluationResult.NoWork (nothing to evaluate).
NoWork,
// Used by PolicyReEvaluationResult.Success (no failed assets).
Completed,
// Used by PolicyReEvaluationResult.PartialSuccess (some assets failed).
PartiallyCompleted,
// NOTE(review): no factory in this file produces this value; presumably reserved for total failure - confirm.
Failed
}
/// <summary>
/// Reporter for policy re-evaluation progress.
/// </summary>
/// <remarks>
/// Every notification is scoped by tenant, policy run and job identifiers.
/// </remarks>
public interface IPolicyProgressReporter
{
/// <summary>Reports that processing of a job has started.</summary>
ValueTask ReportStartedAsync(
string tenantId,
string policyRunId,
string jobId,
CancellationToken cancellationToken = default);
/// <summary>Reports intermediate progress: shards finished so far, total shards and assets processed so far.</summary>
ValueTask ReportProgressAsync(
string tenantId,
string policyRunId,
string jobId,
int shardsCompleted,
int shardsTotal,
int assetsProcessed,
CancellationToken cancellationToken = default);
/// <summary>Reports that a job finished and passes along its result.</summary>
ValueTask ReportCompletedAsync(
string tenantId,
string policyRunId,
string jobId,
PolicyReEvaluationResult result,
CancellationToken cancellationToken = default);
/// <summary>Reports that a job failed, with an error description.</summary>
ValueTask ReportFailedAsync(
string tenantId,
string policyRunId,
string jobId,
string error,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Default implementation of policy re-evaluation service.
/// </summary>
/// <remarks>
/// Shards the assets of a policy run, processes the shards one at a time and acquires a
/// rate-limiter permit before each shard. A shard that throws is logged and all of its assets
/// are recorded as failed; the remaining shards are still processed.
/// </remarks>
public sealed class PolicyReEvaluationService : IPolicyReEvaluationService
{
    // Wait used between permit attempts when the limiter gives no retry hint.
    private static readonly TimeSpan DefaultRateLimitRetryDelay = TimeSpan.FromSeconds(1);

    private readonly IPolicyAssetSharder _sharder;
    private readonly IPolicyShardProcessor _shardProcessor;
    private readonly IPolicyProgressReporter _progressReporter;
    private readonly RateLimiter _rateLimiter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<PolicyReEvaluationService> _logger;

    /// <summary>Creates the service.</summary>
    /// <param name="sharder">Splits a policy run into shards.</param>
    /// <param name="shardProcessor">Processes a single shard.</param>
    /// <param name="progressReporter">Receives per-shard progress updates.</param>
    /// <param name="rateLimiter">Limiter consulted before each shard is processed.</param>
    /// <param name="options">Worker options.</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any required dependency is null.</exception>
    public PolicyReEvaluationService(
        IPolicyAssetSharder sharder,
        IPolicyShardProcessor shardProcessor,
        IPolicyProgressReporter progressReporter,
        RateLimiter rateLimiter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        ILogger<PolicyReEvaluationService> logger)
    {
        _sharder = sharder ?? throw new ArgumentNullException(nameof(sharder));
        _shardProcessor = shardProcessor ?? throw new ArgumentNullException(nameof(shardProcessor));
        _progressReporter = progressReporter ?? throw new ArgumentNullException(nameof(progressReporter));
        _rateLimiter = rateLimiter ?? throw new ArgumentNullException(nameof(rateLimiter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Re-evaluates all shards of the policy run referenced by <paramref name="activationEvent"/>.
    /// </summary>
    /// <param name="activationEvent">Activation event to execute.</param>
    /// <param name="cancellationToken">Token used to cancel the run.</param>
    /// <returns>
    /// <c>NoWork</c> when the sharder returns no shards, <c>Completed</c> when no asset failed,
    /// otherwise <c>PartiallyCompleted</c> with the failed asset identifiers.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="activationEvent"/> is null.</exception>
    public async ValueTask<PolicyReEvaluationResult> ExecuteAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(activationEvent);

        // Shard assets for processing
        var shards = await _sharder.ShardAssetsAsync(
            activationEvent.TenantId,
            activationEvent.PolicyRunId,
            cancellationToken).ConfigureAwait(false);

        if (shards.Count == 0)
        {
            _logger.LogDebug(
                "No assets to re-evaluate for policy run {PolicyRunId}",
                activationEvent.PolicyRunId);
            return PolicyReEvaluationResult.NoWork(_timeProvider.GetUtcNow());
        }

        _logger.LogInformation(
            "Processing {ShardCount} shards for policy run {PolicyRunId}",
            shards.Count,
            activationEvent.PolicyRunId);

        var shardsCompleted = 0;
        var assetsProcessed = 0;
        var failedAssets = new List<string>();

        foreach (var shard in shards)
        {
            // Honor rate limits: do not touch the shard until a permit has actually been granted.
            using var lease = await AcquireLeaseAsync(activationEvent.PolicyRunId, cancellationToken)
                .ConfigureAwait(false);

            try
            {
                var result = await _shardProcessor.ProcessShardAsync(
                    shard,
                    activationEvent,
                    cancellationToken).ConfigureAwait(false);

                assetsProcessed += result.AssetsProcessed;

                // A default (uninitialized) ImmutableArray cannot be enumerated; treat it as empty.
                if (!result.FailedAssetIds.IsDefaultOrEmpty)
                {
                    failedAssets.AddRange(result.FailedAssetIds);
                }

                shardsCompleted++;
            }
            catch (Exception ex) when (IsFailure(ex, cancellationToken))
            {
                _logger.LogError(
                    ex,
                    "Failed to process shard {ShardId} for policy run {PolicyRunId}",
                    shard.ShardId,
                    activationEvent.PolicyRunId);

                if (!shard.AssetIds.IsDefaultOrEmpty)
                {
                    failedAssets.AddRange(shard.AssetIds);
                }

                continue;
            }

            // Reported outside the processing try-block so that a reporter failure cannot mark
            // an already processed shard as failed.
            await ReportProgressSafelyAsync(
                activationEvent,
                shardsCompleted,
                shards.Count,
                assetsProcessed,
                cancellationToken).ConfigureAwait(false);
        }

        var completedAt = _timeProvider.GetUtcNow();

        if (failedAssets.Count == 0)
        {
            return PolicyReEvaluationResult.Success(
                assetsProcessed,
                shardsCompleted,
                shards.Count,
                completedAt);
        }

        return PolicyReEvaluationResult.PartialSuccess(
            assetsProcessed,
            shardsCompleted,
            shards.Count,
            [.. failedAssets],
            completedAt);
    }

    // Requests one permit and keeps retrying until it is granted or the token is cancelled.
    // Between attempts it waits for the limiter's RetryAfter hint when present, otherwise one second.
    private async ValueTask<RateLimitLease> AcquireLeaseAsync(
        string policyRunId,
        CancellationToken cancellationToken)
    {
        while (true)
        {
            var lease = await _rateLimiter.AcquireAsync(1, cancellationToken).ConfigureAwait(false);
            if (lease.IsAcquired)
            {
                return lease;
            }

            var delay = lease.TryGetMetadata(MetadataName.RetryAfter, out var retryAfter) && retryAfter > TimeSpan.Zero
                ? retryAfter
                : DefaultRateLimitRetryDelay;
            lease.Dispose();

            _logger.LogWarning(
                "Rate limit exceeded for policy run {PolicyRunId}, waiting...",
                policyRunId);

            await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
        }
    }

    // Sends a progress update; a reporter error is logged and otherwise ignored.
    private async ValueTask ReportProgressSafelyAsync(
        PolicyActivationEvent activationEvent,
        int shardsCompleted,
        int shardsTotal,
        int assetsProcessed,
        CancellationToken cancellationToken)
    {
        try
        {
            await _progressReporter.ReportProgressAsync(
                activationEvent.TenantId,
                activationEvent.PolicyRunId,
                activationEvent.JobId,
                shardsCompleted,
                shardsTotal,
                assetsProcessed,
                cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (IsFailure(ex, cancellationToken))
        {
            _logger.LogWarning(
                ex,
                "Failed to report progress for policy run {PolicyRunId}",
                activationEvent.PolicyRunId);
        }
    }

    // Everything is a failure except a cancellation caused by the caller's token.
    private static bool IsFailure(Exception exception, CancellationToken cancellationToken)
        => exception is not OperationCanceledException || !cancellationToken.IsCancellationRequested;
}
/// <summary>
/// Interface for sharding assets for policy re-evaluation.
/// </summary>
public interface IPolicyAssetSharder
{
/// <summary>
/// Shards assets for a policy run into processable chunks.
/// </summary>
/// <param name="tenantId">Tenant that owns the policy run.</param>
/// <param name="policyRunId">Policy run whose assets are sharded.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The shards to process; the default service treats an empty list as "no work".</returns>
ValueTask<IReadOnlyList<PolicyAssetShard>> ShardAssetsAsync(
string tenantId,
string policyRunId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Represents a shard of assets for policy re-evaluation.
/// </summary>
/// <param name="ShardId">Identifier of the shard (used in failure logs).</param>
/// <param name="TenantId">Tenant the shard belongs to.</param>
/// <param name="PolicyRunId">Policy run the shard belongs to.</param>
/// <param name="AssetIds">Identifiers of the assets in the shard; all are recorded as failed when the shard throws.</param>
/// <param name="ShardIndex">Position of the shard within the run (presumably zero-based - TODO confirm).</param>
/// <param name="TotalShards">Total number of shards in the run.</param>
public sealed record PolicyAssetShard(
string ShardId,
string TenantId,
string PolicyRunId,
ImmutableArray<string> AssetIds,
int ShardIndex,
int TotalShards);
/// <summary>
/// Interface for processing individual policy shards.
/// </summary>
public interface IPolicyShardProcessor
{
/// <summary>
/// Processes a single shard of assets.
/// </summary>
/// <param name="shard">Shard to process.</param>
/// <param name="activationEvent">Activation event the shard is processed for.</param>
/// <param name="cancellationToken">Token used to cancel processing.</param>
/// <returns>Per-shard outcome: processed asset count and identifiers of failed assets.</returns>
ValueTask<PolicyShardResult> ProcessShardAsync(
PolicyAssetShard shard,
PolicyActivationEvent activationEvent,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Result of processing a policy shard.
/// </summary>
/// <param name="ShardId">Identifier of the processed shard.</param>
/// <param name="AssetsProcessed">Number of assets processed in the shard; added to the run total by the default service.</param>
/// <param name="FailedAssetIds">Identifiers of assets that failed; appended to the run's failed-asset list.</param>
public sealed record PolicyShardResult(
string ShardId,
int AssetsProcessed,
ImmutableArray<string> FailedAssetIds);
/// <summary>
/// In-memory implementation of policy activation queue.
/// </summary>
/// <remarks>
/// First-in, first-out queue held in process memory and guarded by a private lock.
/// </remarks>
public sealed class InMemoryPolicyActivationQueue : IPolicyActivationQueue
{
    private readonly Queue<PolicyActivationEvent> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes and returns up to <paramref name="maxCount"/> events in the order they were enqueued.
    /// </summary>
    /// <param name="maxCount">Maximum number of events to return; zero or negative yields an empty list.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The dequeued events, or an empty list when the queue is empty.</returns>
    public ValueTask<IReadOnlyList<PolicyActivationEvent>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        IReadOnlyList<PolicyActivationEvent> batch;

        lock (_lock)
        {
            var take = Math.Min(maxCount, _queue.Count);
            if (take <= 0)
            {
                batch = Array.Empty<PolicyActivationEvent>();
            }
            else
            {
                // The batch size is known up front, so the list never has to grow.
                var items = new List<PolicyActivationEvent>(take);
                for (var i = 0; i < take; i++)
                {
                    items.Add(_queue.Dequeue());
                }

                batch = items;
            }
        }

        return ValueTask.FromResult(batch);
    }

    /// <summary>
    /// Appends an event to the end of the queue.
    /// </summary>
    /// <param name="activationEvent">Event to enqueue.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="activationEvent"/> is null.</exception>
    public ValueTask EnqueueAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken = default)
    {
        // Reject null at the boundary: a null entry would only surface later, when a consumer
        // dereferences the dequeued event.
        ArgumentNullException.ThrowIfNull(activationEvent);

        lock (_lock)
        {
            _queue.Enqueue(activationEvent);
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// Progress reporter that discards every notification; intended for testing.
/// </summary>
public sealed class NullPolicyProgressReporter : IPolicyProgressReporter
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyProgressReporter Instance { get; } = new();

    /// <summary>Does nothing and completes immediately.</summary>
    public ValueTask ReportStartedAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing and completes immediately.</summary>
    public ValueTask ReportProgressAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        int shardsCompleted,
        int shardsTotal,
        int assetsProcessed,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing and completes immediately.</summary>
    public ValueTask ReportCompletedAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        PolicyReEvaluationResult result,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing and completes immediately.</summary>
    public ValueTask ReportFailedAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        string error,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,198 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Policy;
/// <summary>
/// Reconciliation worker per SCHED-WORKER-23-102.
/// Periodically looks for policy runs that exceeded the SLA window, emits one backlog alert per
/// tenant and persists the breach on each overdue run.
/// </summary>
public sealed class PolicyReconciliationWorker : BackgroundService
{
    // A run is overdue once it is older than this window.
    private static readonly TimeSpan SlaWindow = TimeSpan.FromMinutes(30);

    // Pause after a successful reconciliation pass.
    private static readonly TimeSpan ReconcileInterval = TimeSpan.FromMinutes(1);

    // Pause after a failed reconciliation pass.
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    private readonly IPolicyRunRepository _policyRunRepository;
    private readonly IPolicyBacklogAlertService _alertService;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<PolicyReconciliationWorker> _logger;

    /// <summary>Creates the worker.</summary>
    /// <param name="policyRunRepository">Repository queried for overdue runs and used to persist breaches.</param>
    /// <param name="alertService">Receives one backlog alert per tenant with overdue runs.</param>
    /// <param name="options">Worker options (policy enablement is read).</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Metrics sink.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any required dependency is null.</exception>
    public PolicyReconciliationWorker(
        IPolicyRunRepository policyRunRepository,
        IPolicyBacklogAlertService alertService,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<PolicyReconciliationWorker> logger)
    {
        ArgumentNullException.ThrowIfNull(policyRunRepository);
        ArgumentNullException.ThrowIfNull(alertService);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(metrics);
        ArgumentNullException.ThrowIfNull(logger);

        _policyRunRepository = policyRunRepository;
        _alertService = alertService;
        _options = options;
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger;
    }

    /// <summary>
    /// Runs reconciliation passes until shutdown. Returns immediately when policy processing is disabled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Policy reconciliation worker is disabled.");
            return;
        }

        _logger.LogInformation("Policy reconciliation worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ReconcileAsync(stoppingToken).ConfigureAwait(false);
                await Task.Delay(ReconcileInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in policy reconciliation worker loop.");
                await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
            }
        }

        _logger.LogInformation("Policy reconciliation worker stopped.");
    }

    // One pass: fetch runs older than the SLA cutoff, alert per tenant, then mark each run as breached.
    private async Task ReconcileAsync(CancellationToken cancellationToken)
    {
        var evaluatedAt = _timeProvider.GetUtcNow();
        var cutoff = evaluatedAt - SlaWindow; // 30-minute SLA

        var overdue = await _policyRunRepository
            .GetOverdueRunsAsync(cutoff, cancellationToken)
            .ConfigureAwait(false);

        if (overdue.Count == 0)
        {
            _logger.LogDebug("No overdue policy runs found.");
            return;
        }

        _logger.LogWarning(
            "Found {Count} overdue policy runs exceeding SLA threshold.",
            overdue.Count);

        // Alerts are aggregated per tenant; runs are updated individually.
        foreach (var tenantRuns in overdue.GroupBy(static run => run.TenantId))
        {
            var breachedRuns = tenantRuns.ToArray();

            await _alertService.EmitBacklogAlertAsync(
                tenantRuns.Key,
                breachedRuns.Length,
                cutoff,
                cancellationToken).ConfigureAwait(false);

            foreach (var run in breachedRuns)
            {
                await MarkSlaBreachedAsync(run, tenantRuns.Key, evaluatedAt, cancellationToken)
                    .ConfigureAwait(false);
            }
        }
    }

    // Persists the SLA breach on a single run and logs it.
    private async Task MarkSlaBreachedAsync(
        PolicyRunRecord run,
        string tenantId,
        DateTimeOffset breachedAt,
        CancellationToken cancellationToken)
    {
        var breached = run with
        {
            Status = PolicyRunStatus.SlaBreached,
            SlaBreachedAt = breachedAt
        };

        await _policyRunRepository
            .UpdateAsync(breached, cancellationToken)
            .ConfigureAwait(false);

        _logger.LogWarning(
            "Policy run {RunId} for tenant {TenantId} marked as SLA breached (started at {StartedAt}).",
            run.RunId,
            tenantId,
            run.StartedAt);
    }
}
/// <summary>
/// Repository interface for policy runs.
/// </summary>
public interface IPolicyRunRepository
{
/// <summary>
/// Gets the policy runs considered overdue relative to <paramref name="threshold"/>.
/// The reconciliation worker passes "now minus 30 minutes" as the threshold.
/// NOTE(review): which statuses count as overdue is decided by the implementation - confirm.
/// </summary>
ValueTask<IReadOnlyList<PolicyRunRecord>> GetOverdueRunsAsync(
DateTimeOffset threshold,
CancellationToken cancellationToken = default);
/// <summary>
/// Persists the given policy run record.
/// </summary>
ValueTask UpdateAsync(
PolicyRunRecord record,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a policy run by identifier; the nullable return type allows for the run not being found.
/// </summary>
ValueTask<PolicyRunRecord?> GetAsync(
string runId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Record representing a policy run in the system.
/// </summary>
/// <param name="RunId">Identifier of the run.</param>
/// <param name="TenantId">Tenant that owns the run; overdue runs are grouped by this value for alerting.</param>
/// <param name="PolicyId">Identifier of the policy being run.</param>
/// <param name="Status">Current status of the run.</param>
/// <param name="StartedAt">When the run started.</param>
/// <param name="CompletedAt">When the run completed; null by default.</param>
/// <param name="SlaBreachedAt">Set by the reconciliation worker when it marks the run as SLA breached; null by default.</param>
/// <param name="AssetsTotal">Total number of assets in the run; 0 by default.</param>
/// <param name="AssetsCompleted">Number of assets completed; 0 by default.</param>
/// <param name="Error">Error description; null by default.</param>
public sealed record PolicyRunRecord(
string RunId,
string TenantId,
string PolicyId,
PolicyRunStatus Status,
DateTimeOffset StartedAt,
DateTimeOffset? CompletedAt = null,
DateTimeOffset? SlaBreachedAt = null,
int AssetsTotal = 0,
int AssetsCompleted = 0,
string? Error = null);
/// <summary>
/// Status of a policy run.
/// </summary>
public enum PolicyRunStatus
{
Pending,
Running,
Completed,
Failed,
// Assigned by PolicyReconciliationWorker to runs returned as overdue by the repository.
SlaBreached,
Cancelled
}
/// <summary>
/// Service for emitting backlog alerts.
/// </summary>
public interface IPolicyBacklogAlertService
{
/// <summary>
/// Emits a backlog alert for one tenant.
/// </summary>
/// <param name="tenantId">Tenant whose runs are overdue.</param>
/// <param name="overdueCount">Number of overdue runs for the tenant.</param>
/// <param name="threshold">SLA cutoff that was used to select the overdue runs.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
ValueTask EmitBacklogAlertAsync(
string tenantId,
int overdueCount,
DateTimeOffset threshold,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Backlog alert service that discards every alert; intended for testing.
/// </summary>
public sealed class NullPolicyBacklogAlertService : IPolicyBacklogAlertService
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyBacklogAlertService Instance { get; } = new();

    /// <summary>Does nothing and completes immediately.</summary>
    public ValueTask EmitBacklogAlertAsync(
        string tenantId,
        int overdueCount,
        DateTimeOffset threshold,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,470 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Reachability;
/// <summary>
/// Reachability joiner worker per SCHED-WORKER-26-201.
/// Combines SBOM snapshots with signals, writes cached facts, and schedules updates on new events.
/// </summary>
/// <remarks>
/// Snapshots are dequeued in batches and processed with bounded parallelism. A snapshot that fails
/// is logged and skipped; it does not stop the rest of the batch.
/// </remarks>
public sealed class ReachabilityJoinerWorker : BackgroundService
{
    private readonly ISbomSnapshotQueue _snapshotQueue;
    private readonly ISignalProvider _signalProvider;
    private readonly IReachabilityFactCache _factCache;
    private readonly IReachabilityUpdateScheduler _updateScheduler;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ReachabilityJoinerWorker> _logger;

    /// <summary>Creates the worker.</summary>
    /// <param name="snapshotQueue">Source of SBOM snapshots.</param>
    /// <param name="signalProvider">Provides reachability signals for component PURLs.</param>
    /// <param name="factCache">Destination for the produced facts.</param>
    /// <param name="updateScheduler">Schedules downstream updates for new facts.</param>
    /// <param name="options">Worker options (reachability settings are read).</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Metrics sink.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any required dependency is null.</exception>
    public ReachabilityJoinerWorker(
        ISbomSnapshotQueue snapshotQueue,
        ISignalProvider signalProvider,
        IReachabilityFactCache factCache,
        IReachabilityUpdateScheduler updateScheduler,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ReachabilityJoinerWorker> logger)
    {
        _snapshotQueue = snapshotQueue ?? throw new ArgumentNullException(nameof(snapshotQueue));
        _signalProvider = signalProvider ?? throw new ArgumentNullException(nameof(signalProvider));
        _factCache = factCache ?? throw new ArgumentNullException(nameof(factCache));
        _updateScheduler = updateScheduler ?? throw new ArgumentNullException(nameof(updateScheduler));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Main loop: dequeue a batch of snapshots, process it, then wait for the poll interval.
    /// Idles when the queue is empty and backs off for 30 seconds after an unexpected error.
    /// Returns immediately when reachability processing is disabled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Reachability.Enabled)
        {
            _logger.LogInformation("Reachability joiner worker is disabled.");
            return;
        }

        _logger.LogInformation("Reachability joiner worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue SBOM snapshots for processing
                var snapshots = await _snapshotQueue
                    .DequeueAsync(_options.Reachability.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (snapshots.Count == 0)
                {
                    await Task.Delay(_options.Reachability.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                _logger.LogDebug(
                    "Processing {Count} SBOM snapshots for reachability analysis.",
                    snapshots.Count);

                // Process snapshots concurrently with bounded parallelism
                await ProcessSnapshotsAsync(snapshots, stoppingToken).ConfigureAwait(false);

                await Task.Delay(_options.Reachability.PollInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (System.Exception ex)
            {
                _logger.LogError(ex, "Error in reachability joiner worker loop.");
                await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken).ConfigureAwait(false);
            }
        }

        _logger.LogInformation("Reachability joiner worker stopped.");
    }

    // Processes a batch with at most MaxConcurrency snapshots in flight. Parallel.ForEachAsync
    // owns the throttling and awaits in-flight work before completing, including on cancellation.
    private async Task ProcessSnapshotsAsync(
        IReadOnlyList<SbomSnapshot> snapshots,
        CancellationToken cancellationToken)
    {
        var parallelOptions = new ParallelOptions
        {
            // Clamp to 1 so a zero or negative setting cannot stall or fault the worker.
            MaxDegreeOfParallelism = Math.Max(1, _options.Reachability.MaxConcurrency),
            CancellationToken = cancellationToken,
        };

        await Parallel.ForEachAsync(
            snapshots,
            parallelOptions,
            async (snapshot, token) => await ProcessSnapshotAsync(snapshot, token).ConfigureAwait(false))
            .ConfigureAwait(false);
    }

    // Handles one snapshot: fetch signals, join them into facts, cache the facts and schedule
    // downstream updates. Errors other than cancellation are logged and swallowed.
    private async Task ProcessSnapshotAsync(
        SbomSnapshot snapshot,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        try
        {
            _logger.LogDebug(
                "Processing SBOM snapshot {SnapshotId} for tenant {TenantId}, artifact {ArtifactId}.",
                snapshot.SnapshotId,
                snapshot.TenantId,
                snapshot.ArtifactId);

            // Fetch signals for the snapshot's components
            var signals = await _signalProvider.GetSignalsAsync(
                snapshot.TenantId,
                snapshot.ComponentPurls,
                cancellationToken).ConfigureAwait(false);

            // Join snapshot with signals to produce reachability facts
            var facts = JoinSnapshotWithSignals(snapshot, signals, _timeProvider.GetUtcNow());

            if (facts.Count == 0)
            {
                _logger.LogDebug(
                    "No reachability facts produced for snapshot {SnapshotId}.",
                    snapshot.SnapshotId);
                return;
            }

            // Write facts to cache
            await _factCache.WriteFactsAsync(
                snapshot.TenantId,
                snapshot.ArtifactId,
                facts,
                _options.Reachability.FactCacheTtl,
                cancellationToken).ConfigureAwait(false);

            // Schedule downstream updates for affected policies
            await _updateScheduler.ScheduleUpdatesAsync(
                snapshot.TenantId,
                snapshot.ArtifactId,
                facts,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Processed SBOM snapshot {SnapshotId}: {FactCount} facts produced, {SignalCount} signals matched in {Duration}ms.",
                snapshot.SnapshotId,
                facts.Count,
                signals.Count,
                duration.TotalMilliseconds);
        }
        catch (System.Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            // A cancellation-type exception raised while the token is not cancelled (e.g. a
            // dependency timeout) is a per-snapshot failure, not a shutdown.
            _logger.LogError(
                ex,
                "Failed to process SBOM snapshot {SnapshotId} for tenant {TenantId}.",
                snapshot.SnapshotId,
                snapshot.TenantId);
        }
    }

    // Produces one fact per component PURL that has a signal; PURLs without a signal are skipped.
    // producedAt comes from the injected TimeProvider so that fact timestamps follow the same
    // clock as the rest of the worker.
    private static IReadOnlyList<ReachabilityFact> JoinSnapshotWithSignals(
        SbomSnapshot snapshot,
        IReadOnlyDictionary<string, ComponentSignal> signals,
        DateTimeOffset producedAt)
    {
        var facts = new List<ReachabilityFact>();

        foreach (var purl in snapshot.ComponentPurls)
        {
            if (!signals.TryGetValue(purl, out var signal))
            {
                continue;
            }

            facts.Add(new ReachabilityFact(
                FactId: $"{snapshot.SnapshotId}:{purl}",
                TenantId: snapshot.TenantId,
                ArtifactId: snapshot.ArtifactId,
                ComponentPurl: purl,
                IsReachable: signal.IsReachable,
                Confidence: signal.Confidence,
                Evidence: signal.Evidence,
                ProducedAt: producedAt));
        }

        return facts;
    }
}
/// <summary>
/// Queue interface for SBOM snapshots awaiting reachability analysis.
/// </summary>
public interface ISbomSnapshotQueue
{
/// <summary>
/// Dequeues SBOM snapshots for processing.
/// </summary>
/// <param name="maxCount">Upper bound on the number of snapshots returned by one call.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The dequeued snapshots; the joiner worker treats an empty list as "queue is idle".</returns>
ValueTask<IReadOnlyList<SbomSnapshot>> DequeueAsync(
int maxCount,
CancellationToken cancellationToken = default);
/// <summary>
/// Enqueues an SBOM snapshot for processing.
/// </summary>
/// <param name="snapshot">Snapshot to add to the queue.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
ValueTask EnqueueAsync(
SbomSnapshot snapshot,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Provider interface for component reachability signals.
/// </summary>
public interface ISignalProvider
{
/// <summary>
/// Gets reachability signals for the specified components.
/// </summary>
/// <param name="tenantId">Tenant the components belong to.</param>
/// <param name="componentPurls">Package URLs of the components to look up.</param>
/// <param name="cancellationToken">Token used to cancel the lookup.</param>
/// <returns>
/// Signals keyed by component PURL (the joiner worker looks entries up by PURL and skips
/// components that have no entry).
/// </returns>
ValueTask<IReadOnlyDictionary<string, ComponentSignal>> GetSignalsAsync(
string tenantId,
ImmutableArray<string> componentPurls,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Cache interface for storing reachability facts.
/// </summary>
/// <remarks>
/// Entries are addressed by the (tenant, artifact) pair. The in-memory implementation in this
/// file replaces any existing entry for the pair on write.
/// </remarks>
public interface IReachabilityFactCache
{
/// <summary>
/// Writes reachability facts to the cache.
/// </summary>
/// <param name="tenantId">Tenant that owns the facts.</param>
/// <param name="artifactId">Artifact the facts belong to.</param>
/// <param name="facts">Facts to store for the (tenant, artifact) pair.</param>
/// <param name="ttl">Time-to-live; the in-memory implementation stops returning the entry once this has elapsed.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask WriteFactsAsync(
string tenantId,
string artifactId,
IReadOnlyList<ReachabilityFact> facts,
TimeSpan ttl,
CancellationToken cancellationToken = default);
/// <summary>
/// Reads reachability facts from the cache.
/// </summary>
/// <param name="tenantId">Tenant that owns the facts.</param>
/// <param name="artifactId">Artifact to read facts for.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
/// <returns>
/// The cached facts. The in-memory implementation returns an empty list when there is no
/// entry or the entry has expired.
/// </returns>
ValueTask<IReadOnlyList<ReachabilityFact>> ReadFactsAsync(
string tenantId,
string artifactId,
CancellationToken cancellationToken = default);
/// <summary>
/// Invalidates cached facts for an artifact.
/// </summary>
/// <param name="tenantId">Tenant that owns the facts.</param>
/// <param name="artifactId">Artifact whose cached facts are dropped.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask InvalidateAsync(
string tenantId,
string artifactId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Scheduler interface for triggering downstream updates on new reachability facts.
/// </summary>
public interface IReachabilityUpdateScheduler
{
/// <summary>
/// Schedules policy re-evaluation updates based on new reachability facts.
/// </summary>
/// <param name="tenantId">Tenant the facts belong to.</param>
/// <param name="artifactId">Artifact the facts were produced for.</param>
/// <param name="facts">Newly produced facts that should trigger updates.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask ScheduleUpdatesAsync(
string tenantId,
string artifactId,
IReadOnlyList<ReachabilityFact> facts,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Represents an SBOM snapshot for reachability analysis.
/// </summary>
/// <param name="SnapshotId">Snapshot identifier; used as the prefix of generated fact ids and in processing logs.</param>
/// <param name="TenantId">Owning tenant; copied onto every fact produced from the snapshot.</param>
/// <param name="ArtifactId">Artifact the snapshot describes; copied onto every produced fact.</param>
/// <param name="ImageDigest">NOTE(review): presumably the digest of the scanned image — not read by the code visible here.</param>
/// <param name="ComponentPurls">Component purls in the snapshot; each is looked up in the signal dictionary.</param>
/// <param name="CreatedAt">NOTE(review): presumably the snapshot creation time — not read by the code visible here.</param>
public sealed record SbomSnapshot(
string SnapshotId,
string TenantId,
string ArtifactId,
string ImageDigest,
ImmutableArray<string> ComponentPurls,
DateTimeOffset CreatedAt);
/// <summary>
/// Represents a reachability signal for a component.
/// </summary>
/// <param name="ComponentPurl">Purl of the component the signal describes.</param>
/// <param name="IsReachable">Reachability verdict; copied onto the produced <see cref="ReachabilityFact"/>.</param>
/// <param name="Confidence">Confidence of the verdict; copied onto the produced fact. NOTE(review): scale/range is not specified here.</param>
/// <param name="Evidence">Supporting evidence; copied onto the produced fact.</param>
public sealed record ComponentSignal(
string ComponentPurl,
bool IsReachable,
float Confidence,
ImmutableArray<SignalEvidence> Evidence);
/// <summary>
/// Evidence supporting a reachability signal.
/// </summary>
/// <remarks>
/// NOTE(review): none of these fields are interpreted by the code visible in this file section;
/// the descriptions below are inferred from the names only and should be confirmed.
/// </remarks>
/// <param name="Source">Presumably the origin of the evidence.</param>
/// <param name="Type">Presumably the category of the evidence.</param>
/// <param name="Details">Presumably a free-form description.</param>
/// <param name="Weight">Presumably the relative weight of this evidence; scale not specified here.</param>
public sealed record SignalEvidence(
string Source,
string Type,
string Details,
float Weight);
/// <summary>
/// Represents a cached reachability fact.
/// </summary>
/// <param name="FactId">Fact identifier; the snapshot join in this file builds it as <c>{SnapshotId}:{purl}</c>.</param>
/// <param name="TenantId">Owning tenant, taken from the snapshot.</param>
/// <param name="ArtifactId">Artifact the fact applies to, taken from the snapshot.</param>
/// <param name="ComponentPurl">Purl of the component the fact describes.</param>
/// <param name="IsReachable">Reachability verdict, taken from the matching signal.</param>
/// <param name="Confidence">Confidence of the verdict, taken from the matching signal.</param>
/// <param name="Evidence">Evidence, taken from the matching signal.</param>
/// <param name="ProducedAt">When the fact was produced; the snapshot join uses the current UTC time.</param>
public sealed record ReachabilityFact(
string FactId,
string TenantId,
string ArtifactId,
string ComponentPurl,
bool IsReachable,
float Confidence,
ImmutableArray<SignalEvidence> Evidence,
DateTimeOffset ProducedAt);
/// <summary>
/// Process-local <see cref="ISbomSnapshotQueue"/>: a FIFO <see cref="Queue{T}"/> guarded by a lock.
/// Both operations complete synchronously.
/// </summary>
public sealed class InMemorySbomSnapshotQueue : ISbomSnapshotQueue
{
    private readonly object _lock = new();
    private readonly Queue<SbomSnapshot> _queue = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> snapshots from the head of the queue.
    /// </summary>
    /// <param name="maxCount">Upper bound on the batch size; zero or negative yields an empty batch.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>The dequeued snapshots in FIFO order; empty when the queue is empty.</returns>
    public ValueTask<IReadOnlyList<SbomSnapshot>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var batch = new List<SbomSnapshot>();

        lock (_lock)
        {
            // The size check comes first so nothing is dequeued once the batch is full.
            while (batch.Count < maxCount && _queue.TryDequeue(out var next))
            {
                batch.Add(next);
            }
        }

        IReadOnlyList<SbomSnapshot> result = batch;
        return ValueTask.FromResult(result);
    }

    /// <summary>
    /// Appends a snapshot to the tail of the queue.
    /// </summary>
    /// <param name="snapshot">Snapshot to enqueue.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask EnqueueAsync(
        SbomSnapshot snapshot,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _queue.Enqueue(snapshot);
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// In-memory implementation of reachability fact cache.
/// </summary>
/// <remarks>
/// Entries are keyed by the (tenant, artifact) tuple rather than a concatenated string, so ids
/// that contain the former separator character (<c>:</c>) can no longer collide across pairs.
/// Expiry is evaluated against <see cref="DateTimeOffset.UtcNow"/>; an expired entry is removed
/// the first time it is read.
/// </remarks>
public sealed class InMemoryReachabilityFactCache : IReachabilityFactCache
{
    private readonly Dictionary<(string TenantId, string ArtifactId), (IReadOnlyList<ReachabilityFact> Facts, DateTimeOffset ExpiresAt)> _cache = new();
    private readonly object _lock = new();

    /// <summary>
    /// Stores (or replaces) the facts for the given tenant/artifact pair.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the facts.</param>
    /// <param name="artifactId">Artifact the facts belong to.</param>
    /// <param name="facts">Facts to cache; the list reference is stored as-is.</param>
    /// <param name="ttl">Lifetime of the entry, measured from the time of the write.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask WriteFactsAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<ReachabilityFact> facts,
        TimeSpan ttl,
        CancellationToken cancellationToken = default)
    {
        var key = (tenantId, artifactId);
        lock (_lock)
        {
            _cache[key] = (facts, DateTimeOffset.UtcNow.Add(ttl));
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Returns the cached facts for the given tenant/artifact pair.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the facts.</param>
    /// <param name="artifactId">Artifact to read facts for.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>The cached facts, or an empty list when there is no entry or it has expired.</returns>
    public ValueTask<IReadOnlyList<ReachabilityFact>> ReadFactsAsync(
        string tenantId,
        string artifactId,
        CancellationToken cancellationToken = default)
    {
        var key = (tenantId, artifactId);
        lock (_lock)
        {
            if (_cache.TryGetValue(key, out var entry))
            {
                if (entry.ExpiresAt > DateTimeOffset.UtcNow)
                {
                    return ValueTask.FromResult(entry.Facts);
                }

                // Expired: evict so stale entries do not accumulate for artifacts that are never rewritten.
                _cache.Remove(key);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<ReachabilityFact>>([]);
    }

    /// <summary>
    /// Removes the entry for the given tenant/artifact pair, if any.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the facts.</param>
    /// <param name="artifactId">Artifact whose cached facts are dropped.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask InvalidateAsync(
        string tenantId,
        string artifactId,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _cache.Remove((tenantId, artifactId));
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// No-op <see cref="ISignalProvider"/> for testing: never reports any signal.
/// </summary>
public sealed class NullSignalProvider : ISignalProvider
{
    /// <summary>Shared instance.</summary>
    public static NullSignalProvider Instance { get; } = new();

    /// <summary>
    /// Returns a new, empty signal dictionary regardless of the arguments.
    /// </summary>
    public ValueTask<IReadOnlyDictionary<string, ComponentSignal>> GetSignalsAsync(
        string tenantId,
        ImmutableArray<string> componentPurls,
        CancellationToken cancellationToken = default)
    {
        IReadOnlyDictionary<string, ComponentSignal> noSignals = new Dictionary<string, ComponentSignal>();
        return ValueTask.FromResult(noSignals);
    }
}
/// <summary>
/// No-op <see cref="IReachabilityUpdateScheduler"/> for testing: schedules nothing.
/// </summary>
public sealed class NullReachabilityUpdateScheduler : IReachabilityUpdateScheduler
{
    /// <summary>Shared instance.</summary>
    public static NullReachabilityUpdateScheduler Instance { get; } = new();

    /// <summary>
    /// Ignores all arguments and completes immediately.
    /// </summary>
    public ValueTask ScheduleUpdatesAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<ReachabilityFact> facts,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,455 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Reachability;
/// <summary>
/// Staleness monitor per SCHED-WORKER-26-202.
/// Monitors reachability facts for staleness, publishes warnings, and updates dashboards.
/// </summary>
/// <remarks>
/// Each cycle enumerates the tenants known to the fact store, asks the store for a staleness
/// summary per tenant, reports it to the metrics reporter and publishes an alert when the
/// summary contains stale facts or facts approaching staleness. A failure for one tenant is
/// logged and does not stop the remaining tenants from being checked.
/// </remarks>
public sealed class ReachabilityStalenessMonitor : BackgroundService
{
    /// <summary>Pause after an unexpected loop failure before the next cycle.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    /// <summary>Fraction of the fact TTL after which a fact counts as "approaching staleness".</summary>
    private const double WarningTtlFraction = 0.8;

    private readonly IReachabilityFactStore _factStore;
    private readonly IStalenessAlertPublisher _alertPublisher;
    private readonly IStalenessMetricsReporter _metricsReporter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ReachabilityStalenessMonitor> _logger;

    /// <summary>
    /// Creates the monitor.
    /// </summary>
    /// <param name="timeProvider">Clock used for threshold computation; <see cref="TimeProvider.System"/> when null.</param>
    /// <exception cref="ArgumentNullException">Any other argument is null.</exception>
    public ReachabilityStalenessMonitor(
        IReachabilityFactStore factStore,
        IStalenessAlertPublisher alertPublisher,
        IStalenessMetricsReporter metricsReporter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ReachabilityStalenessMonitor> logger)
    {
        _factStore = factStore ?? throw new ArgumentNullException(nameof(factStore));
        _alertPublisher = alertPublisher ?? throw new ArgumentNullException(nameof(alertPublisher));
        _metricsReporter = metricsReporter ?? throw new ArgumentNullException(nameof(metricsReporter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the monitoring loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when reachability processing is disabled in the options.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Reachability.Enabled)
        {
            _logger.LogInformation("Reachability staleness monitor is disabled.");
            return;
        }

        _logger.LogInformation("Reachability staleness monitor started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var now = _timeProvider.GetUtcNow();

                // Check for stale facts across all tenants
                await CheckForStalenessAsync(now, stoppingToken).ConfigureAwait(false);

                // Wait for the configured check interval
                await Task.Delay(_options.Reachability.PollInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in reachability staleness monitor loop.");

                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the backoff: leave the loop normally so the
                    // cancellation does not escape from the catch handler and the stop is logged.
                    break;
                }
            }
        }

        _logger.LogInformation("Reachability staleness monitor stopped.");
    }

    /// <summary>
    /// Runs one staleness check over every tenant that has reachability facts.
    /// </summary>
    /// <param name="now">Reference time from which the staleness and warning cut-offs are derived.</param>
    /// <param name="cancellationToken">Token that aborts the check.</param>
    private async Task CheckForStalenessAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        // Get all tenants with reachability facts
        var tenants = await _factStore.GetTenantsWithFactsAsync(cancellationToken).ConfigureAwait(false);
        if (tenants.Count == 0)
        {
            _logger.LogDebug("No tenants with reachability facts to monitor.");
            return;
        }

        // Facts produced before `now - TTL` are stale; facts produced before `now - 0.8 * TTL` are in the warning band.
        var factTtl = _options.Reachability.FactCacheTtl;
        var stalenessThreshold = now.Subtract(factTtl);
        var warningThreshold = now.Subtract(factTtl.Multiply(WarningTtlFraction));

        foreach (var tenantId in tenants)
        {
            try
            {
                await CheckTenantStalenessAsync(
                    tenantId,
                    stalenessThreshold,
                    warningThreshold,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // One failing tenant must not prevent the remaining tenants from being checked.
                _logger.LogError(
                    ex,
                    "Failed to check staleness for tenant {TenantId}.",
                    tenantId);
            }
        }
    }

    /// <summary>
    /// Reports staleness metrics for one tenant and publishes an alert when needed.
    /// A <see cref="StalenessLevel.Stale"/> alert takes precedence over a
    /// <see cref="StalenessLevel.Warning"/> alert; at most one alert is published per call.
    /// </summary>
    private async Task CheckTenantStalenessAsync(
        string tenantId,
        DateTimeOffset stalenessThreshold,
        DateTimeOffset warningThreshold,
        CancellationToken cancellationToken)
    {
        // Get staleness summary for this tenant
        var summary = await _factStore.GetStalenessSummaryAsync(
            tenantId,
            stalenessThreshold,
            warningThreshold,
            cancellationToken).ConfigureAwait(false);

        // Report metrics
        await _metricsReporter.ReportStalenessMetricsAsync(
            tenantId,
            summary,
            cancellationToken).ConfigureAwait(false);

        // Publish alerts if necessary
        if (summary.StaleCount > 0)
        {
            _logger.LogWarning(
                "Tenant {TenantId} has {StaleCount} stale reachability facts (threshold: {Threshold}).",
                tenantId,
                summary.StaleCount,
                stalenessThreshold);

            await _alertPublisher.PublishStaleAlertAsync(
                tenantId,
                summary,
                StalenessLevel.Stale,
                cancellationToken).ConfigureAwait(false);
        }
        else if (summary.WarningCount > 0)
        {
            _logger.LogInformation(
                "Tenant {TenantId} has {WarningCount} reachability facts approaching staleness.",
                tenantId,
                summary.WarningCount);

            await _alertPublisher.PublishStaleAlertAsync(
                tenantId,
                summary,
                StalenessLevel.Warning,
                cancellationToken).ConfigureAwait(false);
        }
        else
        {
            _logger.LogDebug(
                "Tenant {TenantId} reachability facts are fresh ({FreshCount} facts).",
                tenantId,
                summary.FreshCount);
        }
    }
}
/// <summary>
/// Store interface for reachability facts with staleness queries.
/// </summary>
public interface IReachabilityFactStore
{
/// <summary>
/// Gets all tenant IDs that have reachability facts.
/// </summary>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
/// <returns>The tenant ids; the staleness monitor treats an empty list as "nothing to monitor".</returns>
ValueTask<IReadOnlyList<string>> GetTenantsWithFactsAsync(
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a staleness summary for a tenant.
/// </summary>
/// <param name="tenantId">Tenant to summarise.</param>
/// <param name="stalenessThreshold">Cut-off time; the in-memory implementation counts facts produced before it as stale.</param>
/// <param name="warningThreshold">
/// Cut-off time for the warning band; the in-memory implementation counts facts produced at or after
/// <paramref name="stalenessThreshold"/> but before this value as warnings, and later facts as fresh.
/// </param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
/// <returns>Counts and timestamps describing the tenant's facts.</returns>
ValueTask<StalenessSummary> GetStalenessSummaryAsync(
string tenantId,
DateTimeOffset stalenessThreshold,
DateTimeOffset warningThreshold,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets stale facts for a tenant.
/// </summary>
/// <param name="tenantId">Tenant to query.</param>
/// <param name="threshold">Cut-off time; the in-memory implementation returns facts produced before it.</param>
/// <param name="maxCount">Maximum number of facts to return; the in-memory implementation returns the oldest first.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
/// <returns>The stale facts; the in-memory implementation returns an empty list for an unknown tenant.</returns>
ValueTask<IReadOnlyList<StaleFact>> GetStaleFactsAsync(
string tenantId,
DateTimeOffset threshold,
int maxCount,
CancellationToken cancellationToken = default);
/// <summary>
/// Marks facts as requiring refresh.
/// </summary>
/// <param name="tenantId">Tenant that owns the facts.</param>
/// <param name="factIds">Ids of the facts to mark; the in-memory implementation ignores ids that match no stored fact.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask MarkForRefreshAsync(
string tenantId,
IReadOnlyList<string> factIds,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Publisher interface for staleness alerts.
/// </summary>
public interface IStalenessAlertPublisher
{
/// <summary>
/// Publishes an alert for stale reachability facts.
/// </summary>
/// <param name="tenantId">Tenant the alert concerns.</param>
/// <param name="summary">Staleness summary that triggered the alert.</param>
/// <param name="level">
/// Alert level. The staleness monitor in this file passes <see cref="StalenessLevel.Stale"/> when the
/// summary has stale facts and <see cref="StalenessLevel.Warning"/> when it only has warnings.
/// </param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask PublishStaleAlertAsync(
string tenantId,
StalenessSummary summary,
StalenessLevel level,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Reporter interface for staleness metrics.
/// </summary>
public interface IStalenessMetricsReporter
{
/// <summary>
/// Reports staleness metrics for dashboards.
/// </summary>
/// <param name="tenantId">Tenant the metrics belong to.</param>
/// <param name="summary">Summary to report; the staleness monitor calls this for every tenant before deciding on alerts.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask ReportStalenessMetricsAsync(
string tenantId,
StalenessSummary summary,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Summary of reachability fact staleness for a tenant.
/// </summary>
/// <remarks>
/// The parameter descriptions reflect how <see cref="InMemoryReachabilityFactStore"/> fills the
/// summary; other stores may differ.
/// </remarks>
/// <param name="TenantId">Tenant the summary describes.</param>
/// <param name="TotalCount">Number of facts stored for the tenant.</param>
/// <param name="FreshCount">Facts produced at or after the warning threshold.</param>
/// <param name="WarningCount">Facts produced at or after the staleness threshold but before the warning threshold.</param>
/// <param name="StaleCount">Facts produced before the staleness threshold.</param>
/// <param name="OldestFactTimestamp">Earliest production time among the facts; null when the tenant has no facts.</param>
/// <param name="NewestFactTimestamp">Latest production time among the facts; null when the tenant has no facts.</param>
/// <param name="StaleArtifactIds">Distinct artifact ids that have at least one stale fact.</param>
public sealed record StalenessSummary(
string TenantId,
int TotalCount,
int FreshCount,
int WarningCount,
int StaleCount,
DateTimeOffset? OldestFactTimestamp,
DateTimeOffset? NewestFactTimestamp,
ImmutableArray<string> StaleArtifactIds);
/// <summary>
/// Represents a stale reachability fact.
/// </summary>
/// <param name="FactId">Identifier of the fact.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="ArtifactId">Artifact the fact applies to.</param>
/// <param name="ComponentPurl">Purl of the component the fact describes.</param>
/// <param name="ProducedAt">When the fact was produced.</param>
/// <param name="Age">Age of the fact; the in-memory store computes it as current UTC time minus <paramref name="ProducedAt"/> at query time.</param>
public sealed record StaleFact(
string FactId,
string TenantId,
string ArtifactId,
string ComponentPurl,
DateTimeOffset ProducedAt,
TimeSpan Age);
/// <summary>
/// Level of staleness for alerts.
/// </summary>
/// <remarks>
/// The staleness monitor in this file only publishes <see cref="Warning"/> and <see cref="Stale"/>;
/// <see cref="Fresh"/> and <see cref="Critical"/> are not produced by the code visible here.
/// </remarks>
public enum StalenessLevel
{
/// <summary>
/// Facts are fresh and valid.
/// </summary>
Fresh,
/// <summary>
/// Facts are approaching staleness threshold.
/// </summary>
Warning,
/// <summary>
/// Facts have exceeded staleness threshold.
/// </summary>
Stale,
/// <summary>
/// Facts are critically stale and may affect policy decisions.
/// </summary>
Critical
}
/// <summary>
/// Process-local <see cref="IReachabilityFactStore"/> that keeps a list of facts per tenant
/// behind a single lock. All operations complete synchronously.
/// </summary>
public sealed class InMemoryReachabilityFactStore : IReachabilityFactStore
{
    private readonly object _lock = new();
    private readonly Dictionary<string, List<StoredFact>> _facts = new();

    /// <summary>
    /// Returns a snapshot of every tenant id that has had a fact added.
    /// </summary>
    public ValueTask<IReadOnlyList<string>> GetTenantsWithFactsAsync(
        CancellationToken cancellationToken = default)
    {
        IReadOnlyList<string> tenantIds;
        lock (_lock)
        {
            tenantIds = _facts.Keys.ToList();
        }

        return ValueTask.FromResult(tenantIds);
    }

    /// <summary>
    /// Classifies the tenant's facts against the two thresholds and returns the counts,
    /// the oldest/newest production times and the distinct artifact ids with stale facts.
    /// </summary>
    /// <param name="tenantId">Tenant to summarise.</param>
    /// <param name="stalenessThreshold">Facts produced before this instant are stale.</param>
    /// <param name="warningThreshold">Facts produced at or after this instant are fresh.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>An all-zero summary with null timestamps when the tenant has no facts.</returns>
    public ValueTask<StalenessSummary> GetStalenessSummaryAsync(
        string tenantId,
        DateTimeOffset stalenessThreshold,
        DateTimeOffset warningThreshold,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                return ValueTask.FromResult(EmptySummary(tenantId));
            }

            if (tenantFacts.Count == 0)
            {
                return ValueTask.FromResult(EmptySummary(tenantId));
            }

            var stale = 0;
            var warning = 0;
            var fresh = 0;
            var seenStaleArtifacts = new HashSet<string>();
            var staleArtifactIds = ImmutableArray.CreateBuilder<string>();

            foreach (var fact in tenantFacts)
            {
                var producedAt = fact.ProducedAt;

                // The three tests are deliberately independent (not else-if) so the counts match
                // the per-condition definitions even if the thresholds are passed in an unusual order.
                if (producedAt < stalenessThreshold)
                {
                    stale++;
                    if (seenStaleArtifacts.Add(fact.ArtifactId))
                    {
                        staleArtifactIds.Add(fact.ArtifactId);
                    }
                }

                if (producedAt >= stalenessThreshold && producedAt < warningThreshold)
                {
                    warning++;
                }

                if (producedAt >= warningThreshold)
                {
                    fresh++;
                }
            }

            var summary = new StalenessSummary(
                TenantId: tenantId,
                TotalCount: tenantFacts.Count,
                FreshCount: fresh,
                WarningCount: warning,
                StaleCount: stale,
                OldestFactTimestamp: tenantFacts.Min(f => f.ProducedAt),
                NewestFactTimestamp: tenantFacts.Max(f => f.ProducedAt),
                StaleArtifactIds: staleArtifactIds.ToImmutable());

            return ValueTask.FromResult(summary);
        }
    }

    /// <summary>
    /// Returns up to <paramref name="maxCount"/> facts produced before <paramref name="threshold"/>,
    /// oldest first. Each fact's age is measured against the UTC time captured when the call starts.
    /// </summary>
    public ValueTask<IReadOnlyList<StaleFact>> GetStaleFactsAsync(
        string tenantId,
        DateTimeOffset threshold,
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var referenceTime = DateTimeOffset.UtcNow;

        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                return ValueTask.FromResult<IReadOnlyList<StaleFact>>([]);
            }

            var oldestFirst =
                from fact in tenantFacts
                where fact.ProducedAt < threshold
                orderby fact.ProducedAt
                select fact;

            var page = new List<StaleFact>();
            foreach (var fact in oldestFirst.Take(maxCount))
            {
                page.Add(new StaleFact(
                    fact.FactId,
                    fact.TenantId,
                    fact.ArtifactId,
                    fact.ComponentPurl,
                    fact.ProducedAt,
                    referenceTime - fact.ProducedAt));
            }

            return ValueTask.FromResult<IReadOnlyList<StaleFact>>(page);
        }
    }

    /// <summary>
    /// Flags the tenant's facts whose id appears in <paramref name="factIds"/> as needing refresh.
    /// Unknown tenants and unknown ids are ignored.
    /// </summary>
    public ValueTask MarkForRefreshAsync(
        string tenantId,
        IReadOnlyList<string> factIds,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                return ValueTask.CompletedTask;
            }

            var requested = new HashSet<string>(factIds);
            foreach (var fact in tenantFacts)
            {
                if (requested.Contains(fact.FactId))
                {
                    fact.MarkedForRefresh = true;
                }
            }
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Adds a fact to the store (for testing).
    /// </summary>
    public void AddFact(string tenantId, string factId, string artifactId, string componentPurl, DateTimeOffset producedAt)
    {
        var stored = new StoredFact
        {
            FactId = factId,
            TenantId = tenantId,
            ArtifactId = artifactId,
            ComponentPurl = componentPurl,
            ProducedAt = producedAt,
            MarkedForRefresh = false
        };

        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                tenantFacts = new List<StoredFact>();
                _facts.Add(tenantId, tenantFacts);
            }

            tenantFacts.Add(stored);
        }
    }

    /// <summary>Summary returned for a tenant without any facts.</summary>
    private static StalenessSummary EmptySummary(string tenantId)
        => new(tenantId, 0, 0, 0, 0, null, null, []);

    /// <summary>Internal mutable representation of a stored fact.</summary>
    private sealed class StoredFact
    {
        public required string FactId { get; init; }
        public required string TenantId { get; init; }
        public required string ArtifactId { get; init; }
        public required string ComponentPurl { get; init; }
        public required DateTimeOffset ProducedAt { get; init; }
        public bool MarkedForRefresh { get; set; }
    }
}
/// <summary>
/// No-op <see cref="IStalenessAlertPublisher"/> for testing: discards every alert.
/// </summary>
public sealed class NullStalenessAlertPublisher : IStalenessAlertPublisher
{
    /// <summary>Shared instance.</summary>
    public static NullStalenessAlertPublisher Instance { get; } = new();

    /// <summary>
    /// Ignores all arguments and completes immediately.
    /// </summary>
    public ValueTask PublishStaleAlertAsync(
        string tenantId,
        StalenessSummary summary,
        StalenessLevel level,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// No-op <see cref="IStalenessMetricsReporter"/> for testing: reports nothing.
/// </summary>
public sealed class NullStalenessMetricsReporter : IStalenessMetricsReporter
{
    /// <summary>Shared instance.</summary>
    public static NullStalenessMetricsReporter Instance { get; } = new();

    /// <summary>
    /// Ignores all arguments and completes immediately.
    /// </summary>
    public ValueTask ReportStalenessMetricsAsync(
        string tenantId,
        StalenessSummary summary,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,452 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Resolver;
/// <summary>
/// Evaluation orchestration worker per SCHED-WORKER-29-002.
/// Invokes Policy Engine batch eval, writes results to Findings Ledger projector queue,
/// and handles retries/backoff.
/// </summary>
/// <remarks>
/// The loop dequeues jobs in batches, processes them one at a time and idles for the configured
/// delay when the queue is empty. A job that fails (after evaluation retries) is logged and
/// dropped; it is not re-enqueued by this worker.
/// </remarks>
public sealed class EvaluationOrchestrationWorker : BackgroundService
{
    private readonly IPolicyEvaluationJobQueue _jobQueue;
    private readonly ICandidateFindingStore _findingStore;
    private readonly IPolicyEngineEvaluator _policyEvaluator;
    private readonly IFindingsLedgerProjector _ledgerProjector;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<EvaluationOrchestrationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="timeProvider">Clock used for duration measurement; <see cref="TimeProvider.System"/> when null.</param>
    /// <exception cref="ArgumentNullException">Any other argument is null.</exception>
    public EvaluationOrchestrationWorker(
        IPolicyEvaluationJobQueue jobQueue,
        ICandidateFindingStore findingStore,
        IPolicyEngineEvaluator policyEvaluator,
        IFindingsLedgerProjector ledgerProjector,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<EvaluationOrchestrationWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _findingStore = findingStore ?? throw new ArgumentNullException(nameof(findingStore));
        _policyEvaluator = policyEvaluator ?? throw new ArgumentNullException(nameof(policyEvaluator));
        _ledgerProjector = ledgerProjector ?? throw new ArgumentNullException(nameof(ledgerProjector));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the dequeue/process loop until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Evaluation orchestration worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue evaluation jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    // NOTE(review): jobs already dequeued but not yet processed are dropped when
                    // shutdown is requested mid-batch; confirm the queue redelivers them.
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessEvaluationJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in evaluation orchestration worker loop.");

                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the backoff: leave the loop normally so the
                    // cancellation does not escape from the catch handler and the stop is logged.
                    break;
                }
            }
        }

        _logger.LogInformation("Evaluation orchestration worker stopped.");
    }

    /// <summary>
    /// Processes one job: loads its candidate findings, evaluates them with retries and
    /// enqueues one ledger entry per evaluated finding.
    /// </summary>
    /// <remarks>
    /// Any exception other than <see cref="OperationCanceledException"/> is logged and swallowed
    /// so that one failing job does not stop the batch.
    /// </remarks>
    private async Task ProcessEvaluationJobAsync(
        PolicyEvaluationJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing evaluation job {JobId} for tenant {TenantId}, artifact {ArtifactId} with {FindingCount} candidates.",
            job.JobId,
            job.TenantId,
            job.ArtifactId,
            job.CandidateFindingIds.Length);

        try
        {
            // 1. Load candidate findings
            var candidates = await _findingStore.GetFindingsAsync(
                job.TenantId,
                job.CandidateFindingIds,
                cancellationToken).ConfigureAwait(false);

            if (candidates.Count == 0)
            {
                _logger.LogWarning(
                    "No candidate findings found for evaluation job {JobId}.",
                    job.JobId);
                return;
            }

            // 2. Invoke Policy Engine batch eval with retries
            var evalResult = await EvaluateWithRetryAsync(
                job,
                candidates,
                cancellationToken).ConfigureAwait(false);

            // 3. Write results to Findings Ledger projector queue
            var projectionEntries = evalResult.EvaluatedFindings
                .Select(f => new FindingsLedgerEntry(
                    EntryId: $"{job.JobId}:{f.FindingId}",
                    TenantId: job.TenantId,
                    ArtifactId: job.ArtifactId,
                    FindingId: f.FindingId,
                    ComponentPurl: f.ComponentPurl,
                    VulnerabilityId: f.VulnerabilityId,
                    Severity: f.Severity,
                    PolicyOutcome: f.PolicyOutcome,
                    PolicyId: f.PolicyId,
                    ExceptionId: f.AppliedExceptionId,
                    IsReachable: f.IsReachable,
                    EvaluatedAt: f.EvaluatedAt,
                    Metadata: f.Metadata))
                .ToList();

            await _ledgerProjector.EnqueueAsync(
                job.TenantId,
                projectionEntries,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Evaluation job {JobId} completed: {EvaluatedCount}/{TotalCount} findings, {ViolationCount} violations in {Duration}ms.",
                job.JobId,
                evalResult.EvaluatedFindings.Length,
                candidates.Count,
                evalResult.EvaluatedFindings.Count(f => f.PolicyOutcome == PolicyOutcome.Violation),
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Evaluation job {JobId} failed.",
                job.JobId);
        }
    }

    /// <summary>
    /// Calls the policy evaluator, retrying failed attempts with a delay that doubles each time.
    /// </summary>
    /// <remarks>
    /// At least one attempt is always made, even when the configured maximum is zero or negative.
    /// The exception of the final attempt propagates unchanged; cancellation is never retried.
    /// </remarks>
    private async Task<BatchEvaluationResult> EvaluateWithRetryAsync(
        PolicyEvaluationJob job,
        IReadOnlyList<CandidateFinding> candidates,
        CancellationToken cancellationToken)
    {
        var maxAttempts = Math.Max(1, _options.Policy.Dispatch.MaxAttempts);
        var delay = _options.Policy.Dispatch.RetryBackoff;

        for (var attempt = 1; ; attempt++)
        {
            try
            {
                return await _policyEvaluator.EvaluateBatchAsync(
                    job.TenantId,
                    job.ArtifactId,
                    candidates,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException && attempt < maxAttempts)
            {
                // The filter lets the last attempt's exception escape, which ends the loop.
                _logger.LogWarning(
                    ex,
                    "Batch evaluation failed for job {JobId} (attempt {Attempt}/{MaxAttempts}), retrying...",
                    job.JobId,
                    attempt,
                    maxAttempts);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay = delay.Multiply(2); // Exponential backoff
            }
        }
    }
}
/// <summary>
/// Queue interface for policy evaluation jobs.
/// </summary>
public interface IPolicyEvaluationJobQueue
{
/// <summary>
/// Dequeues up to <paramref name="maxCount"/> jobs. The evaluation worker in this file passes its
/// configured batch size and idles when the returned list is empty.
/// </summary>
ValueTask<IReadOnlyList<PolicyEvaluationJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default);
/// <summary>
/// Adds a job to the queue.
/// </summary>
ValueTask EnqueueAsync(PolicyEvaluationJob job, CancellationToken cancellationToken = default);
}
/// <summary>
/// Store interface for candidate findings.
/// </summary>
public interface ICandidateFindingStore
{
/// <summary>
/// Loads the candidate findings with the given ids for a tenant.
/// </summary>
/// <param name="tenantId">Tenant that owns the findings.</param>
/// <param name="findingIds">Ids of the findings to load.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
/// <returns>
/// The findings that were found. The in-memory implementation omits ids it does not hold for the
/// tenant; the evaluation worker treats an empty result as "nothing to evaluate".
/// </returns>
ValueTask<IReadOnlyList<CandidateFinding>> GetFindingsAsync(
string tenantId,
ImmutableArray<string> findingIds,
CancellationToken cancellationToken = default);
/// <summary>
/// Stores candidate findings for a tenant.
/// </summary>
/// <param name="tenantId">Tenant that owns the findings.</param>
/// <param name="findings">Findings to store; the in-memory implementation skips findings whose tenant differs from <paramref name="tenantId"/>.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask StoreFindingsAsync(
string tenantId,
IReadOnlyList<CandidateFinding> findings,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for Policy Engine batch evaluation.
/// </summary>
public interface IPolicyEngineEvaluator
{
/// <summary>
/// Evaluates a batch of candidate findings for one artifact.
/// </summary>
/// <param name="tenantId">Tenant the evaluation runs for.</param>
/// <param name="artifactId">Artifact the candidates belong to.</param>
/// <param name="candidates">Findings to evaluate.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
/// <returns>The evaluation result for the batch.</returns>
/// <remarks>
/// The evaluation worker in this file retries the call when it throws anything other than
/// <see cref="OperationCanceledException"/>, so implementations may be invoked repeatedly with
/// the same arguments.
/// </remarks>
ValueTask<BatchEvaluationResult> EvaluateBatchAsync(
string tenantId,
string artifactId,
IReadOnlyList<CandidateFinding> candidates,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for Findings Ledger projector queue.
/// </summary>
public interface IFindingsLedgerProjector
{
/// <summary>
/// Enqueues ledger entries for projection.
/// </summary>
/// <param name="tenantId">Tenant the entries belong to.</param>
/// <param name="entries">Entries to enqueue; the in-memory implementation skips entries whose tenant differs from <paramref name="tenantId"/>.</param>
/// <param name="cancellationToken">Optional token for cancelling the operation.</param>
ValueTask EnqueueAsync(
string tenantId,
IReadOnlyList<FindingsLedgerEntry> entries,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Result of batch policy evaluation.
/// </summary>
/// <param name="BatchId">Identifier of the batch; the null evaluator in this file uses a fresh GUID in "N" format.</param>
/// <param name="EvaluatedFindings">Evaluated findings; the evaluation worker turns each into one ledger entry.</param>
/// <param name="SkippedCount">NOTE(review): presumably the number of candidates not evaluated — not read by the code visible here.</param>
/// <param name="EvaluatedAt">When the batch was evaluated.</param>
public sealed record BatchEvaluationResult(
string BatchId,
ImmutableArray<EvaluatedFinding> EvaluatedFindings,
int SkippedCount,
DateTimeOffset EvaluatedAt);
/// <summary>
/// A finding after policy evaluation.
/// </summary>
/// <remarks>
/// Every field is copied onto a <see cref="FindingsLedgerEntry"/> by the evaluation worker in this
/// file (<paramref name="AppliedExceptionId"/> becomes the entry's <c>ExceptionId</c>).
/// </remarks>
/// <param name="FindingId">Identifier of the evaluated finding.</param>
/// <param name="ComponentPurl">Purl of the affected component.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity as a string; NOTE(review): allowed values are not visible here.</param>
/// <param name="PolicyOutcome">Outcome assigned by the policy evaluation.</param>
/// <param name="PolicyId">Identifier of the policy that produced the outcome.</param>
/// <param name="AppliedExceptionId">Identifier of an applied exception, or null.</param>
/// <param name="IsReachable">Reachability of the finding, or null (presumably when unknown — confirm).</param>
/// <param name="EvaluatedAt">When the finding was evaluated.</param>
/// <param name="Metadata">Optional key/value metadata; defaults to null.</param>
public sealed record EvaluatedFinding(
string FindingId,
string ComponentPurl,
string VulnerabilityId,
string Severity,
PolicyOutcome PolicyOutcome,
string PolicyId,
string? AppliedExceptionId,
bool? IsReachable,
DateTimeOffset EvaluatedAt,
ImmutableDictionary<string, string>? Metadata = null);
/// <summary>
/// Policy evaluation outcome.
/// </summary>
/// <remarks>
/// Within this file only <see cref="Pass"/> (assigned by the null evaluator) and
/// <see cref="Violation"/> (counted in the evaluation worker's completion log) are referenced.
/// NOTE(review): the exact semantics of the remaining members are defined by the Policy Engine — confirm there.
/// </remarks>
public enum PolicyOutcome
{
// Assigned to every finding by the null evaluator.
Pass,
Warning,
// Counted as a violation in the evaluation worker's completion log.
Violation,
Skipped,
Error
}
/// <summary>
/// Entry for Findings Ledger projection.
/// </summary>
/// <remarks>
/// The evaluation worker in this file builds one entry per evaluated finding: tenant and artifact
/// come from the job, the remaining values from the <see cref="EvaluatedFinding"/>.
/// </remarks>
/// <param name="EntryId">Entry identifier; the evaluation worker builds it as <c>{JobId}:{FindingId}</c>.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="ArtifactId">Artifact the finding belongs to.</param>
/// <param name="FindingId">Identifier of the evaluated finding.</param>
/// <param name="ComponentPurl">Purl of the affected component.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity as a string.</param>
/// <param name="PolicyOutcome">Outcome assigned by the policy evaluation.</param>
/// <param name="PolicyId">Identifier of the policy that produced the outcome.</param>
/// <param name="ExceptionId">Identifier of the applied exception, or null.</param>
/// <param name="IsReachable">Reachability of the finding, or null.</param>
/// <param name="EvaluatedAt">When the finding was evaluated.</param>
/// <param name="Metadata">Optional key/value metadata, or null.</param>
public sealed record FindingsLedgerEntry(
string EntryId,
string TenantId,
string ArtifactId,
string FindingId,
string ComponentPurl,
string VulnerabilityId,
string Severity,
PolicyOutcome PolicyOutcome,
string PolicyId,
string? ExceptionId,
bool? IsReachable,
DateTimeOffset EvaluatedAt,
ImmutableDictionary<string, string>? Metadata);
/// <summary>
/// Process-local <see cref="IPolicyEvaluationJobQueue"/>: a FIFO <see cref="Queue{T}"/> guarded by a lock.
/// Both operations complete synchronously.
/// </summary>
public sealed class InMemoryPolicyEvaluationJobQueue : IPolicyEvaluationJobQueue
{
    private readonly object _lock = new();
    private readonly Queue<PolicyEvaluationJob> _queue = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> jobs from the head of the queue.
    /// </summary>
    /// <param name="maxCount">Upper bound on the batch size; zero or negative yields an empty batch.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>The dequeued jobs in FIFO order; empty when the queue is empty.</returns>
    public ValueTask<IReadOnlyList<PolicyEvaluationJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var batch = new List<PolicyEvaluationJob>();

        lock (_lock)
        {
            // The size check comes first so nothing is dequeued once the batch is full.
            while (batch.Count < maxCount && _queue.TryDequeue(out var next))
            {
                batch.Add(next);
            }
        }

        IReadOnlyList<PolicyEvaluationJob> result = batch;
        return ValueTask.FromResult(result);
    }

    /// <summary>
    /// Appends a job to the tail of the queue.
    /// </summary>
    /// <param name="job">Job to enqueue.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask EnqueueAsync(PolicyEvaluationJob job, CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// In-memory implementation of candidate finding store.
/// </summary>
/// <remarks>
/// Findings are keyed by (tenant id, finding id) so that two tenants using the same finding id
/// do not overwrite or hide each other's data.
/// </remarks>
public sealed class InMemoryCandidateFindingStore : ICandidateFindingStore
{
    private readonly Dictionary<(string TenantId, string FindingId), CandidateFinding> _findings = new();
    private readonly object _lock = new();

    /// <summary>
    /// Returns the stored findings of <paramref name="tenantId"/> whose ids are listed in
    /// <paramref name="findingIds"/>, in the order the ids are given.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the findings.</param>
    /// <param name="findingIds">Ids to look up; ids without a stored finding are skipped. A default (uninitialised) array is treated as empty.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>The matching findings; empty when none match.</returns>
    public ValueTask<IReadOnlyList<CandidateFinding>> GetFindingsAsync(
        string tenantId,
        ImmutableArray<string> findingIds,
        CancellationToken cancellationToken = default)
    {
        if (findingIds.IsDefaultOrEmpty)
        {
            return ValueTask.FromResult<IReadOnlyList<CandidateFinding>>([]);
        }

        var results = new List<CandidateFinding>(findingIds.Length);

        lock (_lock)
        {
            foreach (var findingId in findingIds)
            {
                // Single lookup on the composite key; tenant isolation is enforced by the key itself.
                if (_findings.TryGetValue((tenantId, findingId), out var finding))
                {
                    results.Add(finding);
                }
            }
        }

        return ValueTask.FromResult<IReadOnlyList<CandidateFinding>>(results);
    }

    /// <summary>
    /// Stores (or replaces) the given findings for <paramref name="tenantId"/>.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the findings.</param>
    /// <param name="findings">Findings to store; entries whose <c>TenantId</c> differs from <paramref name="tenantId"/> are ignored.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask StoreFindingsAsync(
        string tenantId,
        IReadOnlyList<CandidateFinding> findings,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            foreach (var finding in findings)
            {
                if (finding.TenantId == tenantId)
                {
                    _findings[(tenantId, finding.FindingId)] = finding;
                }
            }
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// Process-local <see cref="IFindingsLedgerProjector"/> that collects entries in a FIFO queue
/// guarded by a lock.
/// </summary>
public sealed class InMemoryFindingsLedgerProjector : IFindingsLedgerProjector
{
    private readonly object _lock = new();
    private readonly Queue<FindingsLedgerEntry> _queue = new();

    /// <summary>
    /// Enqueues the entries that belong to <paramref name="tenantId"/>; entries of other tenants are skipped.
    /// </summary>
    /// <param name="tenantId">Tenant the entries must belong to.</param>
    /// <param name="entries">Entries to enqueue, in order.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask EnqueueAsync(
        string tenantId,
        IReadOnlyList<FindingsLedgerEntry> entries,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            foreach (var candidate in entries)
            {
                if (candidate.TenantId != tenantId)
                {
                    continue;
                }

                _queue.Enqueue(candidate);
            }
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Gets queued entries (for testing). The result is a snapshot; the queue is not drained.
    /// </summary>
    public IReadOnlyList<FindingsLedgerEntry> GetQueuedEntries()
    {
        lock (_lock)
        {
            return _queue.ToArray();
        }
    }
}
/// <summary>
/// Stand-in <see cref="IPolicyEngineEvaluator"/> for testing: every candidate passes.
/// </summary>
public sealed class NullPolicyEngineEvaluator : IPolicyEngineEvaluator
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyEngineEvaluator Instance { get; } = new();

    /// <summary>
    /// Marks every candidate as <see cref="PolicyOutcome.Pass"/> under the policy id
    /// <c>default-policy</c>, with no exception and unknown reachability, and wraps the
    /// findings in a result with a fresh batch id and zero skipped candidates.
    /// </summary>
    public ValueTask<BatchEvaluationResult> EvaluateBatchAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<CandidateFinding> candidates,
        CancellationToken cancellationToken = default)
    {
        var passed =
            (from candidate in candidates
             select new EvaluatedFinding(
                 FindingId: candidate.FindingId,
                 ComponentPurl: candidate.ComponentPurl,
                 VulnerabilityId: candidate.VulnerabilityId,
                 Severity: candidate.Severity,
                 PolicyOutcome: PolicyOutcome.Pass,
                 PolicyId: "default-policy",
                 AppliedExceptionId: null,
                 IsReachable: null,
                 EvaluatedAt: DateTimeOffset.UtcNow))
            .ToImmutableArray();

        var batch = new BatchEvaluationResult(
            BatchId: Guid.NewGuid().ToString("N"),
            EvaluatedFindings: passed,
            SkippedCount: 0,
            EvaluatedAt: DateTimeOffset.UtcNow);

        return ValueTask.FromResult(batch);
    }
}

View File

@@ -0,0 +1,411 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Resolver;
/// <summary>
/// Resolver monitoring worker per SCHED-WORKER-29-003.
/// Monitors resolver/evaluation backlog, SLA breaches, and export job queue.
/// Exposes metrics and alerts for DevOps dashboards.
/// </summary>
public sealed class ResolverMonitoringWorker : BackgroundService
{
private readonly IResolverQueueMetrics _resolverMetrics;
private readonly IEvaluationQueueMetrics _evaluationMetrics;
private readonly IExportQueueMetrics _exportMetrics;
private readonly ISlaBreachDetector _slaBreachDetector;
private readonly IMonitoringAlertPublisher _alertPublisher;
private readonly SchedulerWorkerOptions _options;
private readonly TimeProvider _timeProvider;
private readonly SchedulerWorkerMetrics _metrics;
private readonly ILogger<ResolverMonitoringWorker> _logger;
/// <summary>
/// Creates the worker.
/// </summary>
/// <param name="timeProvider">Clock used by the monitoring loop; <see cref="TimeProvider.System"/> when null.</param>
/// <exception cref="ArgumentNullException">Any other argument is null.</exception>
public ResolverMonitoringWorker(
    IResolverQueueMetrics resolverMetrics,
    IEvaluationQueueMetrics evaluationMetrics,
    IExportQueueMetrics exportMetrics,
    ISlaBreachDetector slaBreachDetector,
    IMonitoringAlertPublisher alertPublisher,
    SchedulerWorkerOptions options,
    TimeProvider? timeProvider,
    SchedulerWorkerMetrics metrics,
    ILogger<ResolverMonitoringWorker> logger)
{
    // Validate in declaration order so the first missing dependency is the one reported.
    if (resolverMetrics is null)
    {
        throw new ArgumentNullException(nameof(resolverMetrics));
    }

    if (evaluationMetrics is null)
    {
        throw new ArgumentNullException(nameof(evaluationMetrics));
    }

    if (exportMetrics is null)
    {
        throw new ArgumentNullException(nameof(exportMetrics));
    }

    if (slaBreachDetector is null)
    {
        throw new ArgumentNullException(nameof(slaBreachDetector));
    }

    if (alertPublisher is null)
    {
        throw new ArgumentNullException(nameof(alertPublisher));
    }

    if (options is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    if (metrics is null)
    {
        throw new ArgumentNullException(nameof(metrics));
    }

    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _resolverMetrics = resolverMetrics;
    _evaluationMetrics = evaluationMetrics;
    _exportMetrics = exportMetrics;
    _slaBreachDetector = slaBreachDetector;
    _alertPublisher = alertPublisher;
    _options = options;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
    _metrics = metrics;
    _logger = logger;
}
/// <summary>
/// Runs the monitoring loop until <paramref name="stoppingToken"/> is cancelled: each cycle
/// collects queue metrics, checks for SLA breaches and then waits 30 seconds. After an
/// unexpected failure the loop logs the error and waits 10 seconds before the next cycle.
/// </summary>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    _logger.LogInformation("Resolver monitoring worker started.");

    while (!stoppingToken.IsCancellationRequested)
    {
        try
        {
            var now = _timeProvider.GetUtcNow();

            // Collect and report metrics
            await CollectAndReportMetricsAsync(now, stoppingToken).ConfigureAwait(false);

            // Check for SLA breaches
            await CheckSlaBreachesAsync(now, stoppingToken).ConfigureAwait(false);

            // Wait for next monitoring cycle
            await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            break;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error in resolver monitoring worker loop.");

            try
            {
                await Task.Delay(TimeSpan.FromSeconds(10), stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown requested during the backoff: leave the loop normally so the
                // cancellation does not escape from the catch handler and the stop is logged.
                break;
            }
        }
    }

    _logger.LogInformation("Resolver monitoring worker stopped.");
}
/// <summary>
/// Fetches the statistics of the resolver, evaluation and export queues (in that order),
/// reports each one, logs a combined summary and finally runs the backlog alert check.
/// </summary>
/// <param name="now">Time of the current monitoring cycle; not used by this method.</param>
/// <param name="cancellationToken">Token that aborts the collection.</param>
private async Task CollectAndReportMetricsAsync(DateTimeOffset now, CancellationToken cancellationToken)
{
    // Each queue is reported as soon as its stats arrive, before the next queue is queried.
    var resolver = await _resolverMetrics.GetQueueStatsAsync(cancellationToken).ConfigureAwait(false);
    ReportQueueMetrics("resolver", resolver);

    var evaluation = await _evaluationMetrics.GetQueueStatsAsync(cancellationToken).ConfigureAwait(false);
    ReportQueueMetrics("evaluation", evaluation);

    var export = await _exportMetrics.GetQueueStatsAsync(cancellationToken).ConfigureAwait(false);
    ReportQueueMetrics("export", export);

    // One-line summary of all three depths.
    _logger.LogDebug(
        "Queue stats - Resolver: {ResolverDepth}, Evaluation: {EvalDepth}, Export: {ExportDepth}",
        resolver.QueueDepth,
        evaluation.QueueDepth,
        export.QueueDepth);

    // Raise backlog alerts based on the freshly collected depths.
    await CheckBacklogAlertsAsync(resolver, evaluation, export, cancellationToken).ConfigureAwait(false);
}
/// <summary>
/// Writes one queue's depth, oldest-item age (seconds, 0 when unknown) and throughput
/// to the debug log. No metrics instrument is updated here; output is log-only.
/// </summary>
/// <param name="queueType">Label identifying the queue in the log line.</param>
/// <param name="stats">Statistics snapshot to report.</param>
private void ReportQueueMetrics(string queueType, QueueStats stats)
{
    // A missing oldest-item age is reported as zero seconds.
    double oldestAgeSeconds = stats.OldestItemAge is { } oldestAge
        ? oldestAge.TotalSeconds
        : 0;

    _logger.LogDebug(
        "{QueueType} queue: depth={Depth}, oldest={OldestAge}s, throughput={Throughput}/s",
        queueType,
        stats.QueueDepth,
        oldestAgeSeconds,
        stats.ThroughputPerSecond);
}
/// <summary>
/// Publishes backlog alerts for queues whose depth has reached a fixed threshold
/// (1000 items = warning, 5000 items = critical).
/// </summary>
/// <remarks>
/// The resolver queue is checked against both thresholds (critical wins), the evaluation
/// queue only against the critical threshold, and the export queue only against the
/// warning threshold.
/// NOTE(review): confirm this asymmetric coverage is intentional.
/// </remarks>
/// <param name="resolverStats">Resolver queue statistics.</param>
/// <param name="evalStats">Evaluation queue statistics.</param>
/// <param name="exportStats">Export queue statistics.</param>
/// <param name="cancellationToken">Token forwarded to the alert publisher.</param>
private async Task CheckBacklogAlertsAsync(
    QueueStats resolverStats,
    QueueStats evalStats,
    QueueStats exportStats,
    CancellationToken cancellationToken)
{
    const int backlogThreshold = 1000;
    const int criticalThreshold = 5000;

    // Resolver backlog: critical takes precedence over warning.
    if (resolverStats.QueueDepth >= criticalThreshold)
    {
        await PublishBacklogAsync("resolver", "Resolver", resolverStats.QueueDepth, criticalThreshold, isCritical: true)
            .ConfigureAwait(false);
    }
    else if (resolverStats.QueueDepth >= backlogThreshold)
    {
        await PublishBacklogAsync("resolver", "Resolver", resolverStats.QueueDepth, backlogThreshold, isCritical: false)
            .ConfigureAwait(false);
    }

    // Evaluation backlog: critical only.
    if (evalStats.QueueDepth >= criticalThreshold)
    {
        await PublishBacklogAsync("evaluation", "Evaluation", evalStats.QueueDepth, criticalThreshold, isCritical: true)
            .ConfigureAwait(false);
    }

    // Export backlog: warning only.
    if (exportStats.QueueDepth >= backlogThreshold)
    {
        await PublishBacklogAsync("export", "Export", exportStats.QueueDepth, backlogThreshold, isCritical: false)
            .ConfigureAwait(false);
    }

    // Builds and publishes a single backlog alert.
    ValueTask PublishBacklogAsync(string source, string displayName, int queueDepth, int threshold, bool isCritical)
    {
        // Read the clock once so the ticks embedded in AlertId always agree with Timestamp.
        var timestamp = _timeProvider.GetUtcNow();
        var level = isCritical ? "critical" : "warning";
        var condition = isCritical ? "critical" : "elevated";

        var alert = new MonitoringAlert(
            AlertId: $"{source}-backlog-{level}-{timestamp.Ticks}",
            Type: isCritical ? AlertType.BacklogCritical : AlertType.BacklogWarning,
            Source: source,
            Message: $"{displayName} queue backlog {condition}: {queueDepth} items",
            Severity: isCritical ? AlertSeverity.Critical : AlertSeverity.Warning,
            Value: queueDepth,
            Threshold: threshold,
            Timestamp: timestamp);

        return _alertPublisher.PublishAlertAsync(alert, cancellationToken);
    }
}
/// <summary>
/// Asks the SLA breach detector for resolver breaches and then evaluation breaches,
/// logging a warning and publishing a <see cref="AlertType.SlaBreach"/> alert for each one.
/// </summary>
/// <param name="now">Cycle time passed to the detector and stamped on every alert.</param>
/// <param name="cancellationToken">Token forwarded to the detector and the publisher.</param>
private async Task CheckSlaBreachesAsync(DateTimeOffset now, CancellationToken cancellationToken)
{
    // Resolver jobs first.
    var resolverBreaches = await _slaBreachDetector
        .DetectResolverBreachesAsync(now, cancellationToken)
        .ConfigureAwait(false);

    foreach (var resolverBreach in resolverBreaches)
    {
        _logger.LogWarning(
            "Resolver SLA breach: Job {JobId}, tenant {TenantId}, age {Age}",
            resolverBreach.JobId,
            resolverBreach.TenantId,
            resolverBreach.Age);

        var resolverAlert = BuildBreachAlert("resolver", "Resolver", resolverBreach, now);
        await _alertPublisher.PublishAlertAsync(resolverAlert, cancellationToken).ConfigureAwait(false);
    }

    // Then evaluation jobs.
    var evaluationBreaches = await _slaBreachDetector
        .DetectEvaluationBreachesAsync(now, cancellationToken)
        .ConfigureAwait(false);

    foreach (var evaluationBreach in evaluationBreaches)
    {
        _logger.LogWarning(
            "Evaluation SLA breach: Job {JobId}, tenant {TenantId}, age {Age}",
            evaluationBreach.JobId,
            evaluationBreach.TenantId,
            evaluationBreach.Age);

        var evaluationAlert = BuildBreachAlert("evaluation", "Evaluation", evaluationBreach, now);
        await _alertPublisher.PublishAlertAsync(evaluationAlert, cancellationToken).ConfigureAwait(false);
    }

    // Shapes a breach into an alert; Value/Threshold are expressed in whole seconds.
    static MonitoringAlert BuildBreachAlert(string source, string label, SlaBreach breach, DateTimeOffset timestamp)
    {
        var metadata = new Dictionary<string, string>
        {
            ["job_id"] = breach.JobId,
            ["tenant_id"] = breach.TenantId
        }.ToImmutableDictionary();

        return new MonitoringAlert(
            AlertId: $"{source}-sla-breach-{breach.JobId}",
            Type: AlertType.SlaBreach,
            Source: source,
            Message: $"{label} job {breach.JobId} exceeded SLA: {breach.Age.TotalMinutes:F1} minutes",
            Severity: AlertSeverity.High,
            Value: (long)breach.Age.TotalSeconds,
            Threshold: (long)breach.SlaThreshold.TotalSeconds,
            Timestamp: timestamp,
            Metadata: metadata);
    }
}
}
/// <summary>
/// Source of statistics for the resolver queue.
/// Polled once per monitoring cycle by <c>ResolverMonitoringWorker</c>.
/// </summary>
public interface IResolverQueueMetrics
{
/// <summary>
/// Gets the current statistics of the resolver queue.
/// </summary>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns>A <see cref="QueueStats"/> snapshot.</returns>
ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Source of statistics for the evaluation queue.
/// Polled once per monitoring cycle by <c>ResolverMonitoringWorker</c>.
/// </summary>
public interface IEvaluationQueueMetrics
{
/// <summary>
/// Gets the current statistics of the evaluation queue.
/// </summary>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns>A <see cref="QueueStats"/> snapshot.</returns>
ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Source of statistics for the export queue.
/// Polled once per monitoring cycle by <c>ResolverMonitoringWorker</c>.
/// </summary>
public interface IExportQueueMetrics
{
/// <summary>
/// Gets the current statistics of the export queue.
/// </summary>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns>A <see cref="QueueStats"/> snapshot.</returns>
ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Detects jobs that have exceeded their SLA.
/// <c>ResolverMonitoringWorker</c> calls both methods every cycle and publishes one
/// alert per returned <see cref="SlaBreach"/>.
/// </summary>
public interface ISlaBreachDetector
{
/// <summary>
/// Returns the resolver jobs considered in breach at <paramref name="now"/>.
/// </summary>
/// <param name="now">Reference time for the check (the monitoring cycle time).</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask<IReadOnlyList<SlaBreach>> DetectResolverBreachesAsync(DateTimeOffset now, CancellationToken cancellationToken = default);
/// <summary>
/// Returns the evaluation jobs considered in breach at <paramref name="now"/>.
/// </summary>
/// <param name="now">Reference time for the check (the monitoring cycle time).</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask<IReadOnlyList<SlaBreach>> DetectEvaluationBreachesAsync(DateTimeOffset now, CancellationToken cancellationToken = default);
}
/// <summary>
/// Sink for <see cref="MonitoringAlert"/> instances raised by the monitoring worker
/// (backlog and SLA breach alerts).
/// </summary>
public interface IMonitoringAlertPublisher
{
/// <summary>
/// Publishes a single alert.
/// </summary>
/// <param name="alert">The alert to publish.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask PublishAlertAsync(MonitoringAlert alert, CancellationToken cancellationToken = default);
}
/// <summary>
/// Queue statistics snapshot returned by the queue metrics interfaces.
/// </summary>
/// <param name="QueueDepth">Number of items in the queue; compared against the backlog alert thresholds.</param>
/// <param name="OldestItemAge">Age of the oldest item; logged as 0 seconds when <c>null</c> (presumably null means the queue is empty — confirm with implementations).</param>
/// <param name="ThroughputPerSecond">Throughput, logged with a "/s" suffix.</param>
/// <param name="ProcessedLastMinute">Not read by the monitoring worker; presumably items processed in the last minute — confirm.</param>
/// <param name="FailedLastMinute">Not read by the monitoring worker; presumably items failed in the last minute — confirm.</param>
public sealed record QueueStats(
int QueueDepth,
TimeSpan? OldestItemAge,
double ThroughputPerSecond,
int ProcessedLastMinute,
int FailedLastMinute);
/// <summary>
/// SLA breach information for a single job, as returned by <see cref="ISlaBreachDetector"/>.
/// </summary>
/// <param name="JobId">Identifier of the breaching job; used in the alert id, message and metadata.</param>
/// <param name="TenantId">Tenant owning the job; copied into the alert metadata.</param>
/// <param name="Age">Age of the job; reported as the alert value in whole seconds.</param>
/// <param name="SlaThreshold">SLA limit for the job; reported as the alert threshold in whole seconds.</param>
/// <param name="StartedAt">Not read by the monitoring worker; presumably when the job started — confirm.</param>
public sealed record SlaBreach(
string JobId,
string TenantId,
TimeSpan Age,
TimeSpan SlaThreshold,
DateTimeOffset StartedAt);
/// <summary>
/// A monitoring alert handed to <see cref="IMonitoringAlertPublisher"/>.
/// </summary>
/// <param name="AlertId">Identifier of the alert. Backlog alerts embed the clock ticks, SLA alerts embed the job id.</param>
/// <param name="Type">Kind of condition being reported.</param>
/// <param name="Source">Queue the alert refers to ("resolver", "evaluation" or "export" in the monitoring worker).</param>
/// <param name="Message">Human-readable description.</param>
/// <param name="Severity">Severity of the alert.</param>
/// <param name="Value">Observed value: queue depth for backlog alerts, job age in seconds for SLA alerts.</param>
/// <param name="Threshold">Limit that was reached, in the same unit as <paramref name="Value"/>.</param>
/// <param name="Timestamp">Time associated with the alert.</param>
/// <param name="Metadata">Optional key/value details (SLA alerts set <c>job_id</c> and <c>tenant_id</c>).</param>
public sealed record MonitoringAlert(
string AlertId,
AlertType Type,
string Source,
string Message,
AlertSeverity Severity,
long Value,
long Threshold,
DateTimeOffset Timestamp,
ImmutableDictionary<string, string>? Metadata = null);
/// <summary>
/// Type of monitoring alert.
/// Only the first three members are raised by the monitoring worker in this file.
/// </summary>
public enum AlertType
{
/// <summary>Queue depth reached the warning backlog threshold.</summary>
BacklogWarning,
/// <summary>Queue depth reached the critical backlog threshold.</summary>
BacklogCritical,
/// <summary>A job exceeded its SLA.</summary>
SlaBreach,
/// <summary>Not raised by code in this file; presumably a drop in queue throughput — confirm.</summary>
ThroughputDrop,
/// <summary>Not raised by code in this file; presumably an elevated failure rate — confirm.</summary>
ErrorRateHigh,
/// <summary>Not raised by code in this file; presumably a degraded dependency or service — confirm.</summary>
ServiceDegraded
}
/// <summary>
/// Severity of monitoring alert, declared from least to most severe.
/// </summary>
public enum AlertSeverity
{
/// <summary>Not used by the monitoring worker in this file.</summary>
Info,
/// <summary>Used for backlog warning alerts.</summary>
Warning,
/// <summary>Used for SLA breach alerts.</summary>
High,
/// <summary>Used for backlog critical alerts.</summary>
Critical
}
/// <summary>
/// No-op <see cref="IResolverQueueMetrics"/> that always reports an empty queue
/// (depth 0, no oldest item, zero throughput and counters).
/// </summary>
public sealed class NullResolverQueueMetrics : IResolverQueueMetrics
{
    /// <summary>Shared instance.</summary>
    public static NullResolverQueueMetrics Instance { get; } = new();

    /// <summary>Returns an all-zero <see cref="QueueStats"/>.</summary>
    public ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default)
    {
        var emptyStats = new QueueStats(
            QueueDepth: 0,
            OldestItemAge: null,
            ThroughputPerSecond: 0,
            ProcessedLastMinute: 0,
            FailedLastMinute: 0);

        return ValueTask.FromResult(emptyStats);
    }
}
/// <summary>
/// No-op <see cref="IEvaluationQueueMetrics"/> that always reports an empty queue
/// (depth 0, no oldest item, zero throughput and counters).
/// </summary>
public sealed class NullEvaluationQueueMetrics : IEvaluationQueueMetrics
{
    /// <summary>Shared instance.</summary>
    public static NullEvaluationQueueMetrics Instance { get; } = new();

    /// <summary>Returns an all-zero <see cref="QueueStats"/>.</summary>
    public ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default)
    {
        var emptyStats = new QueueStats(
            QueueDepth: 0,
            OldestItemAge: null,
            ThroughputPerSecond: 0,
            ProcessedLastMinute: 0,
            FailedLastMinute: 0);

        return ValueTask.FromResult(emptyStats);
    }
}
/// <summary>
/// No-op <see cref="IExportQueueMetrics"/> that always reports an empty queue
/// (depth 0, no oldest item, zero throughput and counters).
/// </summary>
public sealed class NullExportQueueMetrics : IExportQueueMetrics
{
    /// <summary>Shared instance.</summary>
    public static NullExportQueueMetrics Instance { get; } = new();

    /// <summary>Returns an all-zero <see cref="QueueStats"/>.</summary>
    public ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default)
    {
        var emptyStats = new QueueStats(
            QueueDepth: 0,
            OldestItemAge: null,
            ThroughputPerSecond: 0,
            ProcessedLastMinute: 0,
            FailedLastMinute: 0);

        return ValueTask.FromResult(emptyStats);
    }
}
/// <summary>
/// No-op <see cref="ISlaBreachDetector"/> that never reports a breach.
/// </summary>
public sealed class NullSlaBreachDetector : ISlaBreachDetector
{
    /// <summary>Shared instance.</summary>
    public static NullSlaBreachDetector Instance { get; } = new();

    /// <summary>Always returns an empty list.</summary>
    public ValueTask<IReadOnlyList<SlaBreach>> DetectResolverBreachesAsync(DateTimeOffset now, CancellationToken cancellationToken = default)
    {
        IReadOnlyList<SlaBreach> noBreaches = [];
        return ValueTask.FromResult(noBreaches);
    }

    /// <summary>Always returns an empty list.</summary>
    public ValueTask<IReadOnlyList<SlaBreach>> DetectEvaluationBreachesAsync(DateTimeOffset now, CancellationToken cancellationToken = default)
    {
        IReadOnlyList<SlaBreach> noBreaches = [];
        return ValueTask.FromResult(noBreaches);
    }
}
/// <summary>
/// No-op <see cref="IMonitoringAlertPublisher"/> that discards every alert.
/// </summary>
public sealed class NullMonitoringAlertPublisher : IMonitoringAlertPublisher
{
    /// <summary>Shared instance.</summary>
    public static NullMonitoringAlertPublisher Instance { get; } = new();

    /// <summary>Ignores <paramref name="alert"/> and completes immediately.</summary>
    public ValueTask PublishAlertAsync(MonitoringAlert alert, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,479 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Resolver;
/// <summary>
/// Resolver worker per SCHED-WORKER-29-001.
/// Generates candidate findings from inventory + advisory evidence,
/// respects ecosystem version semantics and path scope,
/// and emits jobs for policy evaluation.
/// </summary>
public sealed class ResolverWorker : BackgroundService
{
    private readonly IResolverJobQueue _jobQueue;
    private readonly IInventoryProvider _inventoryProvider;
    private readonly IAdvisoryProvider _advisoryProvider;
    private readonly IVersionMatcher _versionMatcher;
    private readonly ICandidateFindingEmitter _findingEmitter;
    private readonly IPolicyEvaluationJobEmitter _evaluationJobEmitter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ResolverWorker> _logger;

    /// <summary>
    /// Creates the worker. Every dependency is required except <paramref name="timeProvider"/>,
    /// which falls back to <see cref="TimeProvider.System"/> when <c>null</c>.
    /// </summary>
    /// <exception cref="ArgumentNullException">A required dependency is <c>null</c>.</exception>
    public ResolverWorker(
        IResolverJobQueue jobQueue,
        IInventoryProvider inventoryProvider,
        IAdvisoryProvider advisoryProvider,
        IVersionMatcher versionMatcher,
        ICandidateFindingEmitter findingEmitter,
        IPolicyEvaluationJobEmitter evaluationJobEmitter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ResolverWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _inventoryProvider = inventoryProvider ?? throw new ArgumentNullException(nameof(inventoryProvider));
        _advisoryProvider = advisoryProvider ?? throw new ArgumentNullException(nameof(advisoryProvider));
        _versionMatcher = versionMatcher ?? throw new ArgumentNullException(nameof(versionMatcher));
        _findingEmitter = findingEmitter ?? throw new ArgumentNullException(nameof(findingEmitter));
        _evaluationJobEmitter = evaluationJobEmitter ?? throw new ArgumentNullException(nameof(evaluationJobEmitter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Main loop: dequeues batches of resolver jobs and processes them one at a time until
    /// <paramref name="stoppingToken"/> is cancelled. An empty batch triggers the configured idle
    /// delay; an unexpected error is logged and followed by the configured retry backoff.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Resolver worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessResolverJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in resolver worker loop.");
                await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
            }
        }

        _logger.LogInformation("Resolver worker stopped.");
    }

    /// <summary>
    /// Processes one resolver job: loads the artifact inventory, fetches advisories for the
    /// ecosystems present, matches components against advisories, then emits the candidate
    /// findings and (when there are any) a follow-up policy evaluation job.
    /// Failures other than cancellation are logged and swallowed so the batch can continue.
    /// </summary>
    private async Task ProcessResolverJobAsync(
        ResolverJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing resolver job {JobId} for tenant {TenantId}, artifact {ArtifactId}.",
            job.JobId,
            job.TenantId,
            job.ArtifactId);

        try
        {
            // 1. Load inventory for the artifact
            var inventory = await _inventoryProvider.GetInventoryAsync(
                job.TenantId,
                job.ArtifactId,
                cancellationToken).ConfigureAwait(false);

            // IsDefaultOrEmpty also covers an uninitialized ImmutableArray, whose Length would throw.
            if (inventory.Components.IsDefaultOrEmpty)
            {
                _logger.LogDebug(
                    "No components found in inventory for artifact {ArtifactId}.",
                    job.ArtifactId);
                return;
            }

            // 2. Get relevant advisories
            var ecosystems = inventory.Components
                .Select(c => c.Ecosystem)
                .Distinct()
                .ToList();

            var advisories = await _advisoryProvider.GetAdvisoriesForEcosystemsAsync(
                ecosystems,
                job.AdvisoryFilter,
                cancellationToken).ConfigureAwait(false);

            _logger.LogDebug(
                "Found {ComponentCount} components and {AdvisoryCount} advisories for job {JobId}.",
                inventory.Components.Length,
                advisories.Count,
                job.JobId);

            // 3. Match components against advisories
            var candidateFindings = await MatchComponentsAsync(job, inventory, advisories, cancellationToken)
                .ConfigureAwait(false);

            _logger.LogInformation(
                "Generated {FindingCount} candidate findings for job {JobId}.",
                candidateFindings.Count,
                job.JobId);

            if (candidateFindings.Count > 0)
            {
                // 4. Emit candidate findings
                await _findingEmitter.EmitAsync(
                    job.TenantId,
                    candidateFindings,
                    cancellationToken).ConfigureAwait(false);

                // 5. Emit policy evaluation job
                await _evaluationJobEmitter.EmitAsync(
                    new PolicyEvaluationJob(
                        JobId: $"eval-{job.JobId}",
                        TenantId: job.TenantId,
                        ArtifactId: job.ArtifactId,
                        ResolverJobId: job.JobId,
                        CandidateFindingIds: [.. candidateFindings.Select(f => f.FindingId)],
                        RequestedAt: _timeProvider.GetUtcNow()),
                    cancellationToken).ConfigureAwait(false);
            }

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Resolver job {JobId} completed: {ComponentCount} components, {FindingCount} findings in {Duration}ms.",
                job.JobId,
                inventory.Components.Length,
                candidateFindings.Count,
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Resolver job {JobId} failed.",
                job.JobId);
        }
    }

    /// <summary>
    /// Builds the candidate findings for a job: every in-scope component is checked against the
    /// advisories of its own ecosystem, and a finding is produced for each affected pair.
    /// </summary>
    private async Task<List<CandidateFinding>> MatchComponentsAsync(
        ResolverJob job,
        ComponentInventory inventory,
        IReadOnlyList<SecurityAdvisory> advisories,
        CancellationToken cancellationToken)
    {
        // Group once instead of re-filtering the full advisory list for every component.
        var advisoriesByEcosystem = advisories.ToLookup(a => a.Ecosystem);
        var findings = new List<CandidateFinding>();

        foreach (var component in inventory.Components)
        {
            // Apply path scope filter if specified
            if (job.PathScope is not null && !MatchesPathScope(component.FilePath, job.PathScope))
            {
                continue;
            }

            // A lookup yields an empty sequence for ecosystems without advisories.
            foreach (var advisory in advisoriesByEcosystem[component.Ecosystem])
            {
                var isAffected = await _versionMatcher.IsAffectedAsync(
                    component,
                    advisory,
                    cancellationToken).ConfigureAwait(false);

                if (!isAffected)
                {
                    continue;
                }

                // NOTE(review): the id does not include the file path, so the same purl found at
                // two paths produces two findings with the same FindingId — confirm acceptable.
                findings.Add(new CandidateFinding(
                    FindingId: $"{job.JobId}:{component.Purl}:{advisory.AdvisoryId}",
                    JobId: job.JobId,
                    TenantId: job.TenantId,
                    ArtifactId: job.ArtifactId,
                    ComponentPurl: component.Purl,
                    ComponentVersion: component.Version,
                    ComponentEcosystem: component.Ecosystem,
                    VulnerabilityId: advisory.VulnerabilityId,
                    AdvisoryId: advisory.AdvisoryId,
                    Severity: advisory.Severity,
                    AffectedRange: advisory.AffectedRange,
                    FixedVersion: advisory.FixedVersion,
                    FilePath: component.FilePath,
                    MatchedAt: _timeProvider.GetUtcNow()));
            }
        }

        return findings;
    }

    /// <summary>
    /// Decides whether a component path is inside <paramref name="scope"/>.
    /// A missing path is governed by <c>IncludeRootLevel</c>; otherwise the path must match at
    /// least one include pattern (when any are given) and no exclude pattern.
    /// </summary>
    private static bool MatchesPathScope(string? filePath, PathScope scope)
    {
        if (string.IsNullOrEmpty(filePath))
        {
            return scope.IncludeRootLevel;
        }

        // IsDefaultOrEmpty treats an uninitialized (default) ImmutableArray as "no patterns"
        // instead of throwing when Length/Any is accessed.
        if (!scope.IncludePatterns.IsDefaultOrEmpty
            && !scope.IncludePatterns.Any(p => MatchesGlob(filePath, p)))
        {
            return false;
        }

        if (!scope.ExcludePatterns.IsDefaultOrEmpty
            && scope.ExcludePatterns.Any(p => MatchesGlob(filePath, p)))
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Simple case-insensitive glob match: <c>**</c> matches any characters (including '/'),
    /// <c>*</c> matches any characters except '/'; everything else is literal.
    /// </summary>
    private static bool MatchesGlob(string path, string pattern)
    {
        var regexPattern = "^" + System.Text.RegularExpressions.Regex.Escape(pattern)
            .Replace(@"\*\*", ".*")
            .Replace(@"\*", "[^/]*") + "$";

        // CultureInvariant keeps case folding independent of the host's current culture.
        return System.Text.RegularExpressions.Regex.IsMatch(
            path,
            regexPattern,
            System.Text.RegularExpressions.RegexOptions.IgnoreCase
                | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
    }
}
/// <summary>
/// Queue interface for resolver jobs.
/// </summary>
public interface IResolverJobQueue
{
/// <summary>
/// Removes and returns up to <paramref name="maxCount"/> jobs.
/// <c>ResolverWorker</c> treats an empty result as "queue idle".
/// </summary>
/// <param name="maxCount">Upper bound on the number of jobs returned.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask<IReadOnlyList<ResolverJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default);
/// <summary>
/// Adds a job to the queue.
/// </summary>
/// <param name="job">The job to enqueue.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask EnqueueAsync(ResolverJob job, CancellationToken cancellationToken = default);
}
/// <summary>
/// Provider interface for component inventory.
/// </summary>
public interface IInventoryProvider
{
/// <summary>
/// Gets the component inventory of one artifact.
/// </summary>
/// <param name="tenantId">Tenant that owns the artifact.</param>
/// <param name="artifactId">Artifact whose components are requested.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask<ComponentInventory> GetInventoryAsync(
string tenantId,
string artifactId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Provider interface for security advisories.
/// </summary>
public interface IAdvisoryProvider
{
/// <summary>
/// Gets the advisories relevant to the given ecosystems.
/// </summary>
/// <param name="ecosystems">Ecosystems to query; <c>ResolverWorker</c> passes the distinct ecosystems found in the inventory.</param>
/// <param name="filter">Optional filter taken from the resolver job; how it is applied is up to the implementation.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask<IReadOnlyList<SecurityAdvisory>> GetAdvisoriesForEcosystemsAsync(
IReadOnlyList<string> ecosystems,
AdvisoryFilter? filter,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for version matching against advisories.
/// </summary>
public interface IVersionMatcher
{
/// <summary>
/// Determines whether <paramref name="component"/> is affected by <paramref name="advisory"/>.
/// <c>ResolverWorker</c> only calls this for pairs that share the same ecosystem.
/// </summary>
/// <param name="component">The inventory component to test.</param>
/// <param name="advisory">The advisory to test against.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns><c>true</c> when a candidate finding should be created for the pair.</returns>
ValueTask<bool> IsAffectedAsync(
InventoryComponent component,
SecurityAdvisory advisory,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Emitter interface for candidate findings.
/// </summary>
public interface ICandidateFindingEmitter
{
/// <summary>
/// Emits the candidate findings produced for one resolver job.
/// <c>ResolverWorker</c> only calls this with a non-empty list.
/// </summary>
/// <param name="tenantId">Tenant the findings belong to.</param>
/// <param name="findings">Findings to emit.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask EmitAsync(
string tenantId,
IReadOnlyList<CandidateFinding> findings,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Emitter interface for policy evaluation jobs.
/// </summary>
public interface IPolicyEvaluationJobEmitter
{
/// <summary>
/// Emits a policy evaluation job; <c>ResolverWorker</c> calls this after the
/// candidate findings of a resolver job have been emitted.
/// </summary>
/// <param name="job">The evaluation job to emit.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask EmitAsync(
PolicyEvaluationJob job,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Represents a resolver job.
/// </summary>
/// <param name="JobId">Identifier of the job; prefixes every finding id and the derived evaluation job id.</param>
/// <param name="TenantId">Tenant the job runs for.</param>
/// <param name="ArtifactId">Artifact whose inventory is resolved.</param>
/// <param name="RequestedAt">Not read by <c>ResolverWorker</c>; presumably when the job was requested — confirm.</param>
/// <param name="AdvisoryFilter">Optional filter forwarded to the advisory provider.</param>
/// <param name="PathScope">Optional path filter; components outside the scope are skipped.</param>
public sealed record ResolverJob(
string JobId,
string TenantId,
string ArtifactId,
DateTimeOffset RequestedAt,
AdvisoryFilter? AdvisoryFilter = null,
PathScope? PathScope = null);
/// <summary>
/// Filter for advisories. The worker only forwards it to <see cref="IAdvisoryProvider"/>;
/// the field semantics below are inferred from names — confirm against the provider.
/// </summary>
/// <param name="Severities">Presumably the severities to include — confirm.</param>
/// <param name="PublishedAfter">Presumably a lower bound on the advisory publication time — confirm.</param>
/// <param name="IncludeWithdrawn">Presumably whether withdrawn advisories are returned — confirm.</param>
/// <param name="OnlyKev">Presumably restricts results to advisories flagged <c>IsKev</c> — confirm.</param>
public sealed record AdvisoryFilter(
ImmutableArray<string> Severities,
DateTimeOffset? PublishedAfter = null,
bool IncludeWithdrawn = false,
bool OnlyKev = false);
/// <summary>
/// Scope for path-based filtering of inventory components.
/// Patterns are simple case-insensitive globs: <c>**</c> matches any characters,
/// <c>*</c> matches any characters except '/'.
/// </summary>
/// <param name="IncludePatterns">When non-empty, a component path must match at least one of these.</param>
/// <param name="ExcludePatterns">A component path matching any of these is rejected.</param>
/// <param name="IncludeRootLevel">Result used for components that have no file path.</param>
public sealed record PathScope(
ImmutableArray<string> IncludePatterns,
ImmutableArray<string> ExcludePatterns,
bool IncludeRootLevel = true);
/// <summary>
/// Component inventory for an artifact, as returned by <see cref="IInventoryProvider"/>.
/// </summary>
/// <param name="ArtifactId">Artifact the inventory describes.</param>
/// <param name="TenantId">Tenant that owns the artifact.</param>
/// <param name="Components">Components found in the artifact; an empty list ends the resolver job early.</param>
/// <param name="GeneratedAt">Not read by <c>ResolverWorker</c>; presumably when the inventory was produced — confirm.</param>
public sealed record ComponentInventory(
string ArtifactId,
string TenantId,
ImmutableArray<InventoryComponent> Components,
DateTimeOffset GeneratedAt);
/// <summary>
/// A component in the inventory.
/// </summary>
/// <param name="Purl">Package URL of the component; part of the finding id.</param>
/// <param name="Name">Component name (not read by <c>ResolverWorker</c>).</param>
/// <param name="Version">Component version; copied onto findings.</param>
/// <param name="Ecosystem">Ecosystem used to select the advisories the component is checked against.</param>
/// <param name="FilePath">Optional path of the component inside the artifact; tested against the job's path scope.</param>
/// <param name="IsDirect">Not read by <c>ResolverWorker</c>; presumably marks direct (vs. transitive) dependencies — confirm.</param>
public sealed record InventoryComponent(
string Purl,
string Name,
string Version,
string Ecosystem,
string? FilePath = null,
bool IsDirect = true);
/// <summary>
/// A security advisory, as returned by <see cref="IAdvisoryProvider"/>.
/// </summary>
/// <param name="AdvisoryId">Identifier of the advisory; part of the finding id.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability; copied onto findings.</param>
/// <param name="Ecosystem">Ecosystem the advisory applies to; must equal the component ecosystem to be considered.</param>
/// <param name="Severity">Severity label; copied onto findings.</param>
/// <param name="AffectedRange">Affected version range; copied onto findings (interpreted by the version matcher, not by the worker).</param>
/// <param name="FixedVersion">Optional fixed version; copied onto findings.</param>
/// <param name="PublishedAt">Not read by <c>ResolverWorker</c>; presumably the publication time — confirm.</param>
/// <param name="IsKev">Not read by <c>ResolverWorker</c>; presumably a "known exploited vulnerability" flag — confirm.</param>
/// <param name="IsWithdrawn">Not read by <c>ResolverWorker</c>; presumably marks withdrawn advisories — confirm.</param>
public sealed record SecurityAdvisory(
string AdvisoryId,
string VulnerabilityId,
string Ecosystem,
string Severity,
string AffectedRange,
string? FixedVersion,
DateTimeOffset PublishedAt,
bool IsKev = false,
bool IsWithdrawn = false);
/// <summary>
/// A candidate finding from resolver: one component matched against one advisory.
/// </summary>
/// <param name="FindingId">Identifier; <c>ResolverWorker</c> builds it as <c>{JobId}:{Purl}:{AdvisoryId}</c>.</param>
/// <param name="JobId">Resolver job that produced the finding.</param>
/// <param name="TenantId">Tenant of the resolver job.</param>
/// <param name="ArtifactId">Artifact of the resolver job.</param>
/// <param name="ComponentPurl">Package URL of the matched component.</param>
/// <param name="ComponentVersion">Version of the matched component.</param>
/// <param name="ComponentEcosystem">Ecosystem of the matched component.</param>
/// <param name="VulnerabilityId">Vulnerability id taken from the advisory.</param>
/// <param name="AdvisoryId">Id of the matched advisory.</param>
/// <param name="Severity">Severity taken from the advisory.</param>
/// <param name="AffectedRange">Affected range taken from the advisory.</param>
/// <param name="FixedVersion">Fixed version taken from the advisory, if any.</param>
/// <param name="FilePath">File path of the matched component, if any.</param>
/// <param name="MatchedAt">Time at which the match was recorded.</param>
public sealed record CandidateFinding(
string FindingId,
string JobId,
string TenantId,
string ArtifactId,
string ComponentPurl,
string ComponentVersion,
string ComponentEcosystem,
string VulnerabilityId,
string AdvisoryId,
string Severity,
string AffectedRange,
string? FixedVersion,
string? FilePath,
DateTimeOffset MatchedAt);
/// <summary>
/// A policy evaluation job, emitted after a resolver job produced candidate findings.
/// </summary>
/// <param name="JobId">Identifier; <c>ResolverWorker</c> uses <c>eval-{resolver job id}</c>.</param>
/// <param name="TenantId">Tenant of the originating resolver job.</param>
/// <param name="ArtifactId">Artifact of the originating resolver job.</param>
/// <param name="ResolverJobId">Id of the originating resolver job.</param>
/// <param name="CandidateFindingIds">Ids of the candidate findings to evaluate.</param>
/// <param name="RequestedAt">Time at which the evaluation job was created.</param>
public sealed record PolicyEvaluationJob(
string JobId,
string TenantId,
string ArtifactId,
string ResolverJobId,
ImmutableArray<string> CandidateFindingIds,
DateTimeOffset RequestedAt);
/// <summary>
/// In-memory <see cref="IResolverJobQueue"/> backed by a FIFO <see cref="Queue{T}"/>;
/// every access to the queue happens under a private lock.
/// </summary>
public sealed class InMemoryResolverJobQueue : IResolverJobQueue
{
    private readonly Queue<ResolverJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Takes up to <paramref name="maxCount"/> jobs from the front of the queue.
    /// Returns an empty list when the queue is empty or <paramref name="maxCount"/> is not positive.
    /// </summary>
    public ValueTask<IReadOnlyList<ResolverJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var batch = new List<ResolverJob>();

        lock (_lock)
        {
            for (var remaining = maxCount; remaining > 0 && _queue.Count > 0; remaining--)
            {
                batch.Add(_queue.Dequeue());
            }
        }

        IReadOnlyList<ResolverJob> snapshot = batch;
        return ValueTask.FromResult(snapshot);
    }

    /// <summary>
    /// Appends <paramref name="job"/> to the back of the queue.
    /// </summary>
    public ValueTask EnqueueAsync(ResolverJob job, CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// No-op <see cref="ICandidateFindingEmitter"/> for testing; discards all findings.
/// </summary>
public sealed class NullCandidateFindingEmitter : ICandidateFindingEmitter
{
    /// <summary>Shared instance.</summary>
    public static NullCandidateFindingEmitter Instance { get; } = new();

    /// <summary>Ignores the findings and completes immediately.</summary>
    public ValueTask EmitAsync(string tenantId, IReadOnlyList<CandidateFinding> findings, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// No-op <see cref="IPolicyEvaluationJobEmitter"/> for testing; discards every job.
/// </summary>
public sealed class NullPolicyEvaluationJobEmitter : IPolicyEvaluationJobEmitter
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyEvaluationJobEmitter Instance { get; } = new();

    /// <summary>Ignores <paramref name="job"/> and completes immediately.</summary>
    public ValueTask EmitAsync(PolicyEvaluationJob job, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,563 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Simulation;
/// <summary>
/// Policy batch simulation worker per SCHED-WORKER-27-301.
/// Shards SBOM inventories, invokes Policy Engine, emits partial results,
/// handles retries/backoff, and publishes progress events.
/// </summary>
public sealed class PolicyBatchSimulationWorker : BackgroundService
{
    private readonly ISimulationJobQueue _jobQueue;
    private readonly ISimulationSharder _sharder;
    private readonly IPolicyEngineClient _policyEngine;
    private readonly ISimulationResultStore _resultStore;
    private readonly ISimulationProgressPublisher _progressPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<PolicyBatchSimulationWorker> _logger;

    /// <summary>
    /// Creates the worker. Every dependency is required except <paramref name="timeProvider"/>,
    /// which falls back to <see cref="TimeProvider.System"/> when <c>null</c>.
    /// </summary>
    /// <exception cref="ArgumentNullException">A required dependency is <c>null</c>.</exception>
    public PolicyBatchSimulationWorker(
        ISimulationJobQueue jobQueue,
        ISimulationSharder sharder,
        IPolicyEngineClient policyEngine,
        ISimulationResultStore resultStore,
        ISimulationProgressPublisher progressPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<PolicyBatchSimulationWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _sharder = sharder ?? throw new ArgumentNullException(nameof(sharder));
        _policyEngine = policyEngine ?? throw new ArgumentNullException(nameof(policyEngine));
        _resultStore = resultStore ?? throw new ArgumentNullException(nameof(resultStore));
        _progressPublisher = progressPublisher ?? throw new ArgumentNullException(nameof(progressPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Main loop. Returns immediately when policy processing is disabled in the options;
    /// otherwise dequeues batches of simulation jobs and processes them sequentially until
    /// <paramref name="stoppingToken"/> is cancelled. An empty batch triggers the idle delay,
    /// an unexpected error is logged and followed by the retry backoff.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Policy batch simulation worker is disabled.");
            return;
        }

        _logger.LogInformation("Policy batch simulation worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessSimulationJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in policy batch simulation worker loop.");
                await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
            }
        }

        _logger.LogInformation("Policy batch simulation worker stopped.");
    }

    /// <summary>
    /// Runs one simulation job: publishes "started", shards the SBOM list, evaluates each shard
    /// (with retry), stores the partial result and publishes progress, then publishes the final
    /// status. A shard failure marks only that shard as failed; any other non-cancellation
    /// failure is reported through <c>PublishFailedAsync</c>.
    /// </summary>
    private async Task ProcessSimulationJobAsync(
        SimulationJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing simulation job {JobId} for tenant {TenantId}, policy {PolicyId}.",
            job.JobId,
            job.TenantId,
            job.PolicyId);

        try
        {
            // Publish job started
            await _progressPublisher.PublishStartedAsync(job, cancellationToken).ConfigureAwait(false);

            // Shard the SBOM inventory
            var shards = await _sharder.ShardInventoryAsync(
                job.TenantId,
                job.SbomIds,
                cancellationToken).ConfigureAwait(false);

            _logger.LogDebug(
                "Sharded {SbomCount} SBOMs into {ShardCount} shards for job {JobId}.",
                job.SbomIds.Length,
                shards.Count,
                job.JobId);

            var completedShards = 0;
            var totalFindings = 0;
            var failedShards = new List<string>();

            foreach (var shard in shards)
            {
                try
                {
                    // Process shard with retry
                    var result = await ProcessShardWithRetryAsync(
                        job,
                        shard,
                        cancellationToken).ConfigureAwait(false);

                    // Store partial result
                    await _resultStore.StorePartialResultAsync(
                        job.JobId,
                        shard.ShardId,
                        result,
                        cancellationToken).ConfigureAwait(false);

                    completedShards++;
                    totalFindings += result.FindingsCount;

                    // Publish progress
                    await _progressPublisher.PublishProgressAsync(
                        job,
                        completedShards,
                        shards.Count,
                        totalFindings,
                        cancellationToken).ConfigureAwait(false);
                }
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    _logger.LogError(
                        ex,
                        "Failed to process shard {ShardId} for job {JobId}.",
                        shard.ShardId,
                        job.JobId);
                    failedShards.Add(shard.ShardId);
                }
            }

            var duration = _timeProvider.GetUtcNow() - startedAt;

            // No failures => Completed (also covers zero shards); all failed => Failed.
            var status = failedShards.Count == 0
                ? SimulationStatus.Completed
                : failedShards.Count == shards.Count
                    ? SimulationStatus.Failed
                    : SimulationStatus.PartiallyCompleted;

            // Publish completion
            await _progressPublisher.PublishCompletedAsync(
                job,
                status,
                completedShards,
                shards.Count,
                totalFindings,
                [.. failedShards],
                duration,
                cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Simulation job {JobId} completed with status {Status}: {CompletedShards}/{TotalShards} shards, {TotalFindings} findings in {Duration}ms.",
                job.JobId,
                status,
                completedShards,
                shards.Count,
                totalFindings,
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Simulation job {JobId} failed.",
                job.JobId);

            try
            {
                await _progressPublisher.PublishFailedAsync(
                    job,
                    ex.Message,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception publishError) when (publishError is not OperationCanceledException)
            {
                // A broken publisher must not escape to the worker loop: that would discard
                // the remaining jobs of the batch, which have already been dequeued.
                _logger.LogError(
                    publishError,
                    "Failed to publish failure event for simulation job {JobId}.",
                    job.JobId);
            }
        }
    }

    /// <summary>
    /// Evaluates one shard through the policy engine, retrying failed attempts with a delay that
    /// starts at the configured retry backoff and doubles after every failure.
    /// The exception of the final attempt (and any cancellation) propagates to the caller.
    /// </summary>
    private async Task<SimulationShardResult> ProcessShardWithRetryAsync(
        SimulationJob job,
        SimulationShard shard,
        CancellationToken cancellationToken)
    {
        // Always make at least one attempt, even when MaxAttempts is configured as zero or less.
        var maxAttempts = Math.Max(1, _options.Policy.Dispatch.MaxAttempts);
        var delay = _options.Policy.Dispatch.RetryBackoff;

        for (var attempt = 1; ; attempt++)
        {
            try
            {
                return await _policyEngine.EvaluateAsync(
                    job.TenantId,
                    job.PolicyId,
                    shard.SbomIds,
                    job.SimulationOptions,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException && attempt < maxAttempts)
            {
                // The filter lets the last attempt's exception through, which ends the loop.
                _logger.LogWarning(
                    ex,
                    "Shard {ShardId} evaluation failed (attempt {Attempt}/{MaxAttempts}), retrying...",
                    shard.ShardId,
                    attempt,
                    maxAttempts);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay = delay.Multiply(2); // Exponential backoff
            }
        }
    }
}
/// <summary>
/// Queue interface for simulation jobs.
/// </summary>
public interface ISimulationJobQueue
{
/// <summary>
/// Dequeues simulation jobs for processing.
/// </summary>
/// <param name="maxCount">Upper bound on the number of jobs returned.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns>The dequeued jobs; the worker treats an empty list as "queue idle".</returns>
ValueTask<IReadOnlyList<SimulationJob>> DequeueAsync(
int maxCount,
CancellationToken cancellationToken = default);
/// <summary>
/// Enqueues a simulation job.
/// </summary>
/// <param name="job">The job to enqueue.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask EnqueueAsync(
SimulationJob job,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for sharding SBOM inventories.
/// </summary>
public interface ISimulationSharder
{
/// <summary>
/// Shards SBOM IDs into processable chunks.
/// </summary>
/// <param name="tenantId">Tenant the SBOMs belong to.</param>
/// <param name="sbomIds">SBOM ids to split.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns>The shards; the worker evaluates them one after another in list order.</returns>
ValueTask<IReadOnlyList<SimulationShard>> ShardInventoryAsync(
string tenantId,
ImmutableArray<string> sbomIds,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Client interface for Policy Engine evaluation.
/// </summary>
public interface IPolicyEngineClient
{
/// <summary>
/// Evaluates a policy against SBOMs.
/// The simulation worker calls this once per shard and retries the call when it throws.
/// </summary>
/// <param name="tenantId">Tenant the evaluation runs for.</param>
/// <param name="policyId">Policy to evaluate.</param>
/// <param name="sbomIds">SBOMs of the shard being evaluated.</param>
/// <param name="options">Simulation options taken from the job.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
/// <returns>The evaluation result for the given SBOMs.</returns>
ValueTask<SimulationShardResult> EvaluateAsync(
string tenantId,
string policyId,
ImmutableArray<string> sbomIds,
SimulationOptions options,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Store interface for simulation results.
/// </summary>
public interface ISimulationResultStore
{
/// <summary>
/// Stores a partial result for a shard.
/// </summary>
/// <param name="jobId">Simulation job the shard belongs to.</param>
/// <param name="shardId">Shard the result was produced for.</param>
/// <param name="result">Evaluation result of the shard.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask StorePartialResultAsync(
string jobId,
string shardId,
SimulationShardResult result,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets all partial results for a job.
/// </summary>
/// <param name="jobId">Simulation job whose results are requested.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask<IReadOnlyList<SimulationShardResult>> GetPartialResultsAsync(
string jobId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Publisher interface for simulation progress events.
/// </summary>
public interface ISimulationProgressPublisher
{
/// <summary>
/// Announces that processing of <paramref name="job"/> has begun (published before sharding).
/// </summary>
ValueTask PublishStartedAsync(
SimulationJob job,
CancellationToken cancellationToken = default);
/// <summary>
/// Reports progress after a shard has been evaluated and its result stored.
/// </summary>
/// <param name="job">The job being processed.</param>
/// <param name="completedShards">Number of shards completed successfully so far.</param>
/// <param name="totalShards">Total number of shards of the job.</param>
/// <param name="totalFindings">Sum of findings over the completed shards.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask PublishProgressAsync(
SimulationJob job,
int completedShards,
int totalShards,
int totalFindings,
CancellationToken cancellationToken = default);
/// <summary>
/// Reports the final outcome once every shard has been attempted.
/// </summary>
/// <param name="job">The job that finished.</param>
/// <param name="status">Final status computed from the shard outcomes.</param>
/// <param name="completedShards">Number of shards completed successfully.</param>
/// <param name="totalShards">Total number of shards of the job.</param>
/// <param name="totalFindings">Sum of findings over the completed shards.</param>
/// <param name="failedShards">Ids of the shards that failed.</param>
/// <param name="duration">Elapsed time since processing of the job started.</param>
/// <param name="cancellationToken">Token supplied by the caller to cancel the request.</param>
ValueTask PublishCompletedAsync(
SimulationJob job,
SimulationStatus status,
int completedShards,
int totalShards,
int totalFindings,
ImmutableArray<string> failedShards,
TimeSpan duration,
CancellationToken cancellationToken = default);
/// <summary>
/// Reports that the job failed as a whole; the worker passes the exception message as
/// <paramref name="error"/>.
/// </summary>
ValueTask PublishFailedAsync(
SimulationJob job,
string error,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Represents a simulation job.
/// </summary>
/// <param name="JobId">Identifier of the job; keys the stored partial results.</param>
/// <param name="TenantId">Tenant the simulation runs for.</param>
/// <param name="PolicyId">Policy to evaluate.</param>
/// <param name="SbomIds">SBOMs to evaluate; split into shards by the sharder.</param>
/// <param name="SimulationOptions">Options forwarded to the policy engine for every shard.</param>
/// <param name="RequestedAt">Not read by the worker; presumably when the job was requested — confirm.</param>
/// <param name="RequestedBy">Not read by the worker; presumably the requesting principal — confirm.</param>
public sealed record SimulationJob(
string JobId,
string TenantId,
string PolicyId,
ImmutableArray<string> SbomIds,
SimulationOptions SimulationOptions,
DateTimeOffset RequestedAt,
string? RequestedBy = null);
/// <summary>
/// Options for policy simulation. The worker forwards them unchanged to
/// <see cref="IPolicyEngineClient"/>; the meanings below are inferred from names — confirm
/// against the policy engine.
/// </summary>
/// <param name="IncludeReachability">Presumably enables reachability analysis — confirm.</param>
/// <param name="IncludeExceptions">Presumably applies policy exceptions — confirm.</param>
/// <param name="DryRun">Presumably prevents the evaluation from persisting effects — confirm.</param>
/// <param name="MaxFindings">Presumably caps the number of findings returned — confirm.</param>
public sealed record SimulationOptions(
bool IncludeReachability = true,
bool IncludeExceptions = true,
bool DryRun = true,
int? MaxFindings = null);
/// <summary>
/// Represents a shard of SBOMs for simulation.
/// </summary>
/// <param name="ShardId">Identifier of the shard; used for result storage, logging and failure reporting.</param>
/// <param name="ShardIndex">Position of the shard (not read by the worker; presumably zero-based — confirm).</param>
/// <param name="TotalShards">Number of shards the inventory was split into (not read by the worker).</param>
/// <param name="SbomIds">SBOMs contained in the shard; passed to the policy engine.</param>
public sealed record SimulationShard(
string ShardId,
int ShardIndex,
int TotalShards,
ImmutableArray<string> SbomIds);
/// <summary>
/// Result of evaluating a simulation shard.
/// </summary>
/// <param name="ShardId">Shard the result belongs to.</param>
/// <param name="SbomsProcessed">Number of SBOMs processed (not read by the worker).</param>
/// <param name="FindingsCount">Number of findings; summed by the worker into the job's running total.</param>
/// <param name="ViolationsCount">Number of violations (not read by the worker).</param>
/// <param name="WarningsCount">Number of warnings (not read by the worker).</param>
/// <param name="Findings">The individual findings.</param>
/// <param name="EvaluatedAt">Evaluation time reported by the engine (not read by the worker).</param>
public sealed record SimulationShardResult(
string ShardId,
int SbomsProcessed,
int FindingsCount,
int ViolationsCount,
int WarningsCount,
ImmutableArray<SimulationFinding> Findings,
DateTimeOffset EvaluatedAt);
/// <summary>
/// A finding from policy simulation.
/// </summary>
/// <param name="FindingId">Identifier of the finding.</param>
/// <param name="SbomId">Identifier of the SBOM the finding refers to.</param>
/// <param name="ComponentPurl">Package URL of the affected component.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">
/// Severity label. The reducer ranks <c>critical</c>, <c>high</c>, <c>medium</c> and <c>low</c>
/// (case-insensitively) and gives every other value the lowest rank.
/// </param>
/// <param name="PolicyOutcome">
/// Outcome label. The reducer counts the exact values <c>violation</c>, <c>warning</c> and <c>pass</c>.
/// </param>
/// <param name="ExceptionId">Optional exception identifier; defaults to <c>null</c>.</param>
/// <param name="IsReachable">Optional reachability flag; defaults to <c>null</c> (presumably "unknown").</param>
public sealed record SimulationFinding(
string FindingId,
string SbomId,
string ComponentPurl,
string VulnerabilityId,
string Severity,
string PolicyOutcome,
string? ExceptionId = null,
bool? IsReachable = null);
/// <summary>
/// Status of a simulation job.
/// </summary>
/// <remarks>
/// A value of this type is passed to the progress publisher's <c>PublishCompletedAsync</c> method.
/// The members carry the default underlying values 0 through 5 in declaration order.
/// </remarks>
public enum SimulationStatus
{
Pending,
Running,
Completed,
PartiallyCompleted,
Failed,
Cancelled
}
/// <summary>
/// In-memory implementation of simulation job queue.
/// </summary>
/// <remarks>
/// Jobs are kept in a first-in/first-out <see cref="Queue{T}"/> guarded by a private lock.
/// Nothing is persisted; the queue content is lost when the instance is collected.
/// </remarks>
public sealed class InMemorySimulationJobQueue : ISimulationJobQueue
{
    private readonly Queue<SimulationJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes and returns up to <paramref name="maxCount"/> jobs in the order they were enqueued.
    /// </summary>
    /// <param name="maxCount">Maximum number of jobs to return; zero or a negative value yields an empty list.</param>
    /// <param name="cancellationToken">
    /// When already cancelled, a cancelled task is returned and no job is removed,
    /// so a shutdown request cannot silently drop queued work.
    /// </param>
    /// <returns>The dequeued jobs; empty when the queue has nothing to offer.</returns>
    public ValueTask<IReadOnlyList<SimulationJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return ValueTask.FromCanceled<IReadOnlyList<SimulationJob>>(cancellationToken);
        }

        var batch = new List<SimulationJob>();
        lock (_lock)
        {
            while (batch.Count < maxCount && _queue.TryDequeue(out var job))
            {
                batch.Add(job);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<SimulationJob>>(batch);
    }

    /// <summary>
    /// Appends <paramref name="job"/> to the end of the queue.
    /// </summary>
    /// <param name="job">The job to enqueue.</param>
    /// <param name="cancellationToken">Not used; enqueueing completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> is <c>null</c>.</exception>
    public ValueTask EnqueueAsync(
        SimulationJob job,
        CancellationToken cancellationToken = default)
    {
        // Reject null here, at the producer, instead of letting it surface as a
        // NullReferenceException inside the worker loop far away from the cause.
        ArgumentNullException.ThrowIfNull(job);

        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// Default implementation of simulation sharder.
/// </summary>
/// <remarks>
/// Splits the SBOM list into consecutive slices of at most <c>shardSize</c> entries, preserving input order.
/// </remarks>
public sealed class DefaultSimulationSharder : ISimulationSharder
{
    private readonly int _shardSize;

    /// <summary>
    /// Creates a sharder that produces shards of at most <paramref name="shardSize"/> SBOMs.
    /// </summary>
    /// <param name="shardSize">Maximum number of SBOMs per shard; must be greater than zero. Defaults to 100.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="shardSize"/> is zero or negative.</exception>
    public DefaultSimulationSharder(int shardSize = 100)
    {
        if (shardSize <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(shardSize), shardSize, "Shard size must be greater than zero.");
        }

        _shardSize = shardSize;
    }

    /// <summary>
    /// Partitions <paramref name="sbomIds"/> into shards.
    /// </summary>
    /// <param name="tenantId">Tenant identifier; used as the prefix of every shard id.</param>
    /// <param name="sbomIds">SBOM identifiers to partition. An empty or default (uninitialised) array yields no shards.</param>
    /// <param name="cancellationToken">Not used; sharding completes synchronously.</param>
    /// <returns>
    /// The shards in index order. Shard ids have the form <c>{tenantId}-shard-{index:D4}</c>
    /// with a zero-based index.
    /// </returns>
    public ValueTask<IReadOnlyList<SimulationShard>> ShardInventoryAsync(
        string tenantId,
        ImmutableArray<string> sbomIds,
        CancellationToken cancellationToken = default)
    {
        // IsDefaultOrEmpty also covers default(ImmutableArray<string>), whose Length would throw.
        if (sbomIds.IsDefaultOrEmpty)
        {
            return ValueTask.FromResult<IReadOnlyList<SimulationShard>>([]);
        }

        // Integer ceiling division; written so it cannot overflow for any non-empty array.
        var totalShards = ((sbomIds.Length - 1) / _shardSize) + 1;
        var shards = new List<SimulationShard>(totalShards);

        for (var index = 0; index < totalShards; index++)
        {
            var start = index * _shardSize;
            var length = Math.Min(_shardSize, sbomIds.Length - start);

            shards.Add(new SimulationShard(
                ShardId: $"{tenantId}-shard-{index:D4}",
                ShardIndex: index,
                TotalShards: totalShards,
                // Slice the array directly instead of boxing it through LINQ Skip/Take.
                SbomIds: ImmutableArray.Create(sbomIds, start, length)));
        }

        return ValueTask.FromResult<IReadOnlyList<SimulationShard>>(shards);
    }
}
/// <summary>
/// In-memory implementation of simulation result store.
/// </summary>
/// <remarks>
/// Results are grouped per job and keyed by shard id. Storing a result for a shard that already has one
/// replaces the earlier entry (keeping its position), so a retried shard is not counted twice
/// when the partial results are later aggregated.
/// </remarks>
public sealed class InMemorySimulationResultStore : ISimulationResultStore
{
    private readonly Dictionary<string, JobResults> _results = new(StringComparer.Ordinal);
    private readonly object _lock = new();

    /// <summary>
    /// Stores (or replaces) the partial result of one shard of a job.
    /// </summary>
    /// <param name="jobId">Identifier of the job the result belongs to.</param>
    /// <param name="shardId">Identifier of the shard; at most one result is kept per shard id and job.</param>
    /// <param name="result">The shard result to keep.</param>
    /// <param name="cancellationToken">Not used; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException">Any of the reference arguments is <c>null</c>.</exception>
    public ValueTask StorePartialResultAsync(
        string jobId,
        string shardId,
        SimulationShardResult result,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(jobId);
        ArgumentNullException.ThrowIfNull(shardId);
        ArgumentNullException.ThrowIfNull(result);

        lock (_lock)
        {
            if (!_results.TryGetValue(jobId, out var bucket))
            {
                bucket = new JobResults();
                _results[jobId] = bucket;
            }

            if (bucket.IndexByShard.TryGetValue(shardId, out var existingIndex))
            {
                // Same shard stored again (e.g. a retry): overwrite instead of duplicating.
                bucket.Ordered[existingIndex] = result;
            }
            else
            {
                bucket.IndexByShard[shardId] = bucket.Ordered.Count;
                bucket.Ordered.Add(result);
            }
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Gets a snapshot of the partial results stored for a job.
    /// </summary>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="cancellationToken">Not used; the operation completes synchronously.</param>
    /// <returns>
    /// A copy of the stored results in first-stored order, or an empty list when the job is unknown.
    /// Later writes to the store do not affect the returned list.
    /// </returns>
    public ValueTask<IReadOnlyList<SimulationShardResult>> GetPartialResultsAsync(
        string jobId,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            if (_results.TryGetValue(jobId, out var bucket))
            {
                return ValueTask.FromResult<IReadOnlyList<SimulationShardResult>>(bucket.Ordered.ToList());
            }
        }

        return ValueTask.FromResult<IReadOnlyList<SimulationShardResult>>([]);
    }

    /// <summary>Per-job storage: results in insertion order plus a shard-id index into that list.</summary>
    private sealed class JobResults
    {
        public List<SimulationShardResult> Ordered { get; } = [];

        public Dictionary<string, int> IndexByShard { get; } = new(StringComparer.Ordinal);
    }
}
/// <summary>
/// Null implementation of simulation progress publisher for testing.
/// Every method discards its arguments and returns an already-completed task.
/// </summary>
public sealed class NullSimulationProgressPublisher : ISimulationProgressPublisher
{
    /// <summary>Shared instance; the type holds no state.</summary>
    public static NullSimulationProgressPublisher Instance { get; } = new();

    /// <summary>Does nothing.</summary>
    public ValueTask PublishStartedAsync(
        SimulationJob job,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing.</summary>
    public ValueTask PublishProgressAsync(
        SimulationJob job,
        int completedShards,
        int totalShards,
        int totalFindings,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing.</summary>
    public ValueTask PublishCompletedAsync(
        SimulationJob job,
        SimulationStatus status,
        int completedShards,
        int totalShards,
        int totalFindings,
        ImmutableArray<string> failedShards,
        TimeSpan duration,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing.</summary>
    public ValueTask PublishFailedAsync(
        SimulationJob job,
        string error,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,502 @@
using System.Collections.Immutable;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Simulation;
/// <summary>
/// Simulation reducer worker per SCHED-WORKER-27-302.
/// Aggregates shard outputs into final manifests with counts, deltas, and samples.
/// Writes to object storage with checksums and emits completion events.
/// </summary>
/// <remarks>
/// The worker polls the reducer queue in batches. For each job it loads the partial shard results,
/// aggregates them into a <see cref="SimulationManifest"/>, hands the manifest to the manifest writer
/// and publishes a completion event (<see cref="ReducerStatus.Completed"/>,
/// <see cref="ReducerStatus.NoResults"/> or <see cref="ReducerStatus.Failed"/>).
/// A failure of one job never prevents the remaining jobs of the batch from being processed.
/// </remarks>
public sealed class SimulationReducerWorker : BackgroundService
{
    // Size limits of the manifest sections.
    private const int MaxSampleFindings = 100;
    private const int MaxComponentSummaries = 50;
    private const int MaxVulnerabilitySummaries = 50;

    // Policy outcome labels counted by the reducer (compared ordinally, case-sensitive).
    private const string ViolationOutcome = "violation";
    private const string WarningOutcome = "warning";
    private const string PassOutcome = "pass";

    private readonly IReducerJobQueue _jobQueue;
    private readonly ISimulationResultStore _resultStore;
    private readonly ISimulationManifestWriter _manifestWriter;
    private readonly IReducerCompletionPublisher _completionPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<SimulationReducerWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="jobQueue">Queue the reducer jobs are dequeued from.</param>
    /// <param name="resultStore">Store holding the partial shard results.</param>
    /// <param name="manifestWriter">Writer that persists the aggregated manifest.</param>
    /// <param name="completionPublisher">Publisher for completion events.</param>
    /// <param name="options">Worker options; the <c>Policy</c> section controls enablement, batch size and delays.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="metrics">Worker metrics.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is <c>null</c>.</exception>
    public SimulationReducerWorker(
        IReducerJobQueue jobQueue,
        ISimulationResultStore resultStore,
        ISimulationManifestWriter manifestWriter,
        IReducerCompletionPublisher completionPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<SimulationReducerWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _resultStore = resultStore ?? throw new ArgumentNullException(nameof(resultStore));
        _manifestWriter = manifestWriter ?? throw new ArgumentNullException(nameof(manifestWriter));
        _completionPublisher = completionPublisher ?? throw new ArgumentNullException(nameof(completionPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Poll loop: dequeues reducer jobs in batches and processes them one by one until
    /// <paramref name="stoppingToken"/> is signalled. Returns immediately when the policy section is disabled.
    /// </summary>
    /// <param name="stoppingToken">Signalled by the host when the worker should stop.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Simulation reducer worker is disabled.");
            return;
        }

        _logger.LogInformation("Simulation reducer worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue reducer jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessReducerJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in simulation reducer worker loop.");

                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the backoff: leave the loop normally so the
                    // "stopped" message below is still written.
                    break;
                }
            }
        }

        _logger.LogInformation("Simulation reducer worker stopped.");
    }

    /// <summary>
    /// Reduces one job: loads partial results, aggregates, writes the manifest and publishes the outcome.
    /// Any failure that is not caused by shutdown is logged and reported as <see cref="ReducerStatus.Failed"/>.
    /// </summary>
    private async Task ProcessReducerJobAsync(
        ReducerJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing reducer job for simulation {SimulationJobId}, tenant {TenantId}.",
            job.SimulationJobId,
            job.TenantId);

        try
        {
            // Get all partial results
            var partialResults = await _resultStore
                .GetPartialResultsAsync(job.SimulationJobId, cancellationToken)
                .ConfigureAwait(false);

            if (partialResults.Count == 0)
            {
                _logger.LogWarning(
                    "No partial results found for simulation {SimulationJobId}.",
                    job.SimulationJobId);

                await _completionPublisher.PublishCompletionAsync(
                    job,
                    ReducerStatus.NoResults,
                    null,
                    cancellationToken).ConfigureAwait(false);
                return;
            }

            // Aggregate results into manifest
            var manifest = AggregateResults(job, partialResults);

            // Write manifest to object storage
            var storageResult = await _manifestWriter.WriteManifestAsync(
                job.TenantId,
                job.SimulationJobId,
                manifest,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            // Publish completion
            await _completionPublisher.PublishCompletionAsync(
                job,
                ReducerStatus.Completed,
                storageResult,
                cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Reducer job completed for simulation {SimulationJobId}: {TotalFindings} findings, {TotalViolations} violations, manifest stored at {StorageUri} in {Duration}ms.",
                job.SimulationJobId,
                manifest.TotalFindings,
                manifest.TotalViolations,
                storageResult.StorageUri,
                duration.TotalMilliseconds);
        }
        // A cancellation that is not tied to our own token (for example a timeout inside a
        // dependency) is a job failure, not a shutdown, and must be reported as such.
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            _logger.LogError(
                ex,
                "Reducer job failed for simulation {SimulationJobId}.",
                job.SimulationJobId);

            await PublishFailureAsync(job, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Publishes the <see cref="ReducerStatus.Failed"/> event. A failure of the publication itself is
    /// logged and swallowed so that the remaining jobs of the current batch are still processed.
    /// </summary>
    private async Task PublishFailureAsync(ReducerJob job, CancellationToken cancellationToken)
    {
        try
        {
            await _completionPublisher.PublishCompletionAsync(
                job,
                ReducerStatus.Failed,
                null,
                cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            _logger.LogError(
                ex,
                "Failed to publish failure event for simulation {SimulationJobId}.",
                job.SimulationJobId);
        }
    }

    /// <summary>
    /// Builds the manifest for <paramref name="job"/> from the partial shard results.
    /// </summary>
    /// <remarks>
    /// All orderings use ordinal string comparison plus an identifier tie-breaker, so the content of the
    /// capped sections does not depend on the host culture or on the order in which shards were stored.
    /// </remarks>
    private SimulationManifest AggregateResults(
        ReducerJob job,
        IReadOnlyList<SimulationShardResult> partialResults)
    {
        // A default (uninitialised) Findings array cannot be enumerated; treat it as empty.
        var allFindings = partialResults
            .SelectMany(static r => r.Findings.IsDefault ? ImmutableArray<SimulationFinding>.Empty : r.Findings)
            .ToList();

        // Counts per policy outcome
        var totalViolations = allFindings.Count(static f => f.PolicyOutcome == ViolationOutcome);
        var totalWarnings = allFindings.Count(static f => f.PolicyOutcome == WarningOutcome);
        var totalPassed = allFindings.Count(static f => f.PolicyOutcome == PassOutcome);

        // Severity breakdown (grouped on the severity label exactly as reported)
        var severityCounts = allFindings
            .GroupBy(static f => f.Severity)
            .ToImmutableDictionary(static g => g.Key, static g => g.Count());

        // Delta from baseline, only when a baseline manifest was supplied
        var delta = job.BaselineManifestUri is { } baselineUri
            ? CalculateDelta(baselineUri, allFindings.Count)
            : null;

        // Sample findings (top N by severity)
        var samples = allFindings
            .OrderByDescending(static f => GetSeverityWeight(f.Severity))
            .ThenBy(static f => f.FindingId, StringComparer.Ordinal)
            .Take(MaxSampleFindings)
            .ToImmutableArray();

        // Summary per component, most violations first
        var byComponent = allFindings
            .GroupBy(static f => f.ComponentPurl)
            .Select(static g => new ComponentSummary(
                g.Key,
                g.Count(),
                g.Count(static f => f.PolicyOutcome == ViolationOutcome),
                g.Any(static f => f.IsReachable == true)))
            .OrderByDescending(static c => c.ViolationCount)
            .ThenBy(static c => c.ComponentPurl, StringComparer.Ordinal)
            .Take(MaxComponentSummaries)
            .ToImmutableArray();

        // Summary per vulnerability, most severe and most widespread first.
        // The severity is taken from the first finding of the group.
        var byVulnerability = allFindings
            .GroupBy(static f => f.VulnerabilityId)
            .Select(static g => new VulnerabilitySummary(
                g.Key,
                g.First().Severity,
                g.Count(),
                g.Select(static f => f.ComponentPurl).Distinct().Count()))
            .OrderByDescending(static v => GetSeverityWeight(v.Severity))
            .ThenByDescending(static v => v.AffectedComponentCount)
            .ThenBy(static v => v.VulnerabilityId, StringComparer.Ordinal)
            .Take(MaxVulnerabilitySummaries)
            .ToImmutableArray();

        return new SimulationManifest(
            ManifestId: $"{job.SimulationJobId}-manifest",
            SimulationJobId: job.SimulationJobId,
            TenantId: job.TenantId,
            PolicyId: job.PolicyId,
            GeneratedAt: _timeProvider.GetUtcNow(),
            TotalSboms: partialResults.Sum(static r => r.SbomsProcessed),
            TotalFindings: allFindings.Count,
            TotalViolations: totalViolations,
            TotalWarnings: totalWarnings,
            TotalPassed: totalPassed,
            SeverityCounts: severityCounts,
            Delta: delta,
            SampleFindings: samples,
            ComponentSummaries: byComponent,
            VulnerabilitySummaries: byVulnerability);
    }

    /// <summary>
    /// Placeholder delta: the baseline is not loaded yet, so every finding is reported as unchanged
    /// and all other counters are zero.
    /// </summary>
    private static SimulationDelta CalculateDelta(string baselineManifestUri, int findingCount)
    {
        // Placeholder - in real implementation, would load baseline and compare
        return new SimulationDelta(
            BaselineManifestUri: baselineManifestUri,
            NewFindings: 0,
            ResolvedFindings: 0,
            UnchangedFindings: findingCount,
            NewViolations: 0,
            ResolvedViolations: 0);
    }

    /// <summary>
    /// Maps a severity label (case-insensitive) to a sort weight: critical 4, high 3, medium 2, low 1,
    /// anything else 0.
    /// </summary>
    private static int GetSeverityWeight(string severity) => severity.ToLowerInvariant() switch
    {
        "critical" => 4,
        "high" => 3,
        "medium" => 2,
        "low" => 1,
        _ => 0
    };
}
/// <summary>
/// Queue interface for reducer jobs.
/// </summary>
public interface IReducerJobQueue
{
/// <summary>
/// Dequeues reducer jobs for processing.
/// </summary>
/// <param name="maxCount">Maximum number of jobs to return in one call.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The dequeued jobs; the reducer worker treats an empty list as "nothing to do" and idles.</returns>
ValueTask<IReadOnlyList<ReducerJob>> DequeueAsync(
int maxCount,
CancellationToken cancellationToken = default);
/// <summary>
/// Enqueues a reducer job.
/// </summary>
/// <param name="job">The job to enqueue.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
ValueTask EnqueueAsync(
ReducerJob job,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Writer interface for simulation manifests.
/// </summary>
public interface ISimulationManifestWriter
{
/// <summary>
/// Writes a manifest to object storage.
/// </summary>
/// <param name="tenantId">Tenant the manifest belongs to.</param>
/// <param name="simulationJobId">Identifier of the simulation job the manifest was produced for.</param>
/// <param name="manifest">The manifest to persist.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>Location, checksum and size information about the stored manifest.</returns>
ValueTask<ManifestStorageResult> WriteManifestAsync(
string tenantId,
string simulationJobId,
SimulationManifest manifest,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Publisher interface for reducer completion events.
/// </summary>
public interface IReducerCompletionPublisher
{
/// <summary>
/// Publishes reducer completion event.
/// </summary>
/// <param name="job">The reducer job the event refers to.</param>
/// <param name="status">Outcome of the job.</param>
/// <param name="storageResult">
/// Storage details of the written manifest. The reducer worker passes <c>null</c> together with
/// <see cref="ReducerStatus.NoResults"/> and <see cref="ReducerStatus.Failed"/>.
/// </param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
ValueTask PublishCompletionAsync(
ReducerJob job,
ReducerStatus status,
ManifestStorageResult? storageResult,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Represents a reducer job.
/// </summary>
/// <param name="ReducerJobId">Identifier of the reducer job itself.</param>
/// <param name="SimulationJobId">Identifier of the simulation whose partial results are reduced.</param>
/// <param name="TenantId">Tenant the simulation belongs to.</param>
/// <param name="PolicyId">Identifier of the simulated policy; copied into the manifest.</param>
/// <param name="BaselineManifestUri">
/// Optional baseline manifest location; defaults to <c>null</c>. The reducer attaches a delta section
/// to the manifest only when this value is set.
/// </param>
public sealed record ReducerJob(
string ReducerJobId,
string SimulationJobId,
string TenantId,
string PolicyId,
string? BaselineManifestUri = null);
/// <summary>
/// Result of storing a manifest.
/// </summary>
/// <param name="StorageUri">Location the manifest was stored at.</param>
/// <param name="Checksum">Checksum of the stored content (the in-memory writer emits lower-case hexadecimal).</param>
/// <param name="ChecksumAlgorithm">Name of the checksum algorithm (the in-memory writer reports <c>SHA256</c>).</param>
/// <param name="SizeBytes">Size of the stored content in bytes.</param>
/// <param name="StoredAt">Timestamp recorded when the manifest was stored.</param>
public sealed record ManifestStorageResult(
string StorageUri,
string Checksum,
string ChecksumAlgorithm,
long SizeBytes,
DateTimeOffset StoredAt);
/// <summary>
/// Aggregated simulation manifest.
/// </summary>
/// <param name="ManifestId">Identifier of the manifest (the reducer uses <c>{SimulationJobId}-manifest</c>).</param>
/// <param name="SimulationJobId">Identifier of the simulation the manifest summarises.</param>
/// <param name="TenantId">Tenant the simulation belongs to.</param>
/// <param name="PolicyId">Identifier of the simulated policy.</param>
/// <param name="GeneratedAt">Timestamp at which the manifest was generated.</param>
/// <param name="TotalSboms">Sum of the SBOMs processed across all shard results.</param>
/// <param name="TotalFindings">Number of findings across all shard results.</param>
/// <param name="TotalViolations">Number of findings whose policy outcome is <c>violation</c>.</param>
/// <param name="TotalWarnings">Number of findings whose policy outcome is <c>warning</c>.</param>
/// <param name="TotalPassed">Number of findings whose policy outcome is <c>pass</c>.</param>
/// <param name="SeverityCounts">Number of findings per severity label.</param>
/// <param name="Delta">Comparison with a baseline manifest, or <c>null</c> when no baseline was supplied.</param>
/// <param name="SampleFindings">Sample of findings, highest severity first (the reducer keeps at most 100).</param>
/// <param name="ComponentSummaries">Per-component summaries, most violations first (the reducer keeps at most 50).</param>
/// <param name="VulnerabilitySummaries">Per-vulnerability summaries, highest severity first (the reducer keeps at most 50).</param>
public sealed record SimulationManifest(
string ManifestId,
string SimulationJobId,
string TenantId,
string PolicyId,
DateTimeOffset GeneratedAt,
int TotalSboms,
int TotalFindings,
int TotalViolations,
int TotalWarnings,
int TotalPassed,
ImmutableDictionary<string, int> SeverityCounts,
SimulationDelta? Delta,
ImmutableArray<SimulationFinding> SampleFindings,
ImmutableArray<ComponentSummary> ComponentSummaries,
ImmutableArray<VulnerabilitySummary> VulnerabilitySummaries);
/// <summary>
/// Delta comparison with baseline.
/// </summary>
/// <remarks>
/// The reducer worker currently fills this record with placeholder values: every finding is counted
/// as unchanged and all other counters are zero.
/// </remarks>
/// <param name="BaselineManifestUri">Location of the baseline manifest the delta refers to.</param>
/// <param name="NewFindings">Number of findings not present in the baseline.</param>
/// <param name="ResolvedFindings">Number of baseline findings no longer present.</param>
/// <param name="UnchangedFindings">Number of findings present in both.</param>
/// <param name="NewViolations">Number of violations not present in the baseline.</param>
/// <param name="ResolvedViolations">Number of baseline violations no longer present.</param>
public sealed record SimulationDelta(
string BaselineManifestUri,
int NewFindings,
int ResolvedFindings,
int UnchangedFindings,
int NewViolations,
int ResolvedViolations);
/// <summary>
/// Summary of findings by component.
/// </summary>
/// <param name="ComponentPurl">Package URL of the component.</param>
/// <param name="FindingCount">Number of findings for the component.</param>
/// <param name="ViolationCount">Number of those findings whose policy outcome is <c>violation</c>.</param>
/// <param name="HasReachableFindings"><c>true</c> when at least one finding of the component is flagged reachable.</param>
public sealed record ComponentSummary(
string ComponentPurl,
int FindingCount,
int ViolationCount,
bool HasReachableFindings);
/// <summary>
/// Summary of findings by vulnerability.
/// </summary>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity label (the reducer takes it from the first finding of the vulnerability).</param>
/// <param name="FindingCount">Number of findings for the vulnerability.</param>
/// <param name="AffectedComponentCount">Number of distinct components with a finding for the vulnerability.</param>
public sealed record VulnerabilitySummary(
string VulnerabilityId,
string Severity,
int FindingCount,
int AffectedComponentCount);
/// <summary>
/// Status of a reducer job.
/// </summary>
/// <remarks>
/// The reducer worker publishes <c>Completed</c> after the manifest was written, <c>NoResults</c> when
/// no partial results exist for the simulation, and <c>Failed</c> when processing threw.
/// <c>Pending</c> and <c>Running</c> are not published by the worker code in this file.
/// </remarks>
public enum ReducerStatus
{
Pending,
Running,
Completed,
NoResults,
Failed
}
/// <summary>
/// In-memory implementation of reducer job queue.
/// </summary>
/// <remarks>
/// Jobs are kept in a first-in/first-out <see cref="Queue{T}"/> guarded by a private lock.
/// Nothing is persisted; the queue content is lost when the instance is collected.
/// </remarks>
public sealed class InMemoryReducerJobQueue : IReducerJobQueue
{
    private readonly Queue<ReducerJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes and returns up to <paramref name="maxCount"/> jobs in the order they were enqueued.
    /// </summary>
    /// <param name="maxCount">Maximum number of jobs to return; zero or a negative value yields an empty list.</param>
    /// <param name="cancellationToken">
    /// When already cancelled, a cancelled task is returned and no job is removed,
    /// so a shutdown request cannot silently drop queued work.
    /// </param>
    /// <returns>The dequeued jobs; empty when the queue has nothing to offer.</returns>
    public ValueTask<IReadOnlyList<ReducerJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return ValueTask.FromCanceled<IReadOnlyList<ReducerJob>>(cancellationToken);
        }

        var batch = new List<ReducerJob>();
        lock (_lock)
        {
            while (batch.Count < maxCount && _queue.TryDequeue(out var job))
            {
                batch.Add(job);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<ReducerJob>>(batch);
    }

    /// <summary>
    /// Appends <paramref name="job"/> to the end of the queue.
    /// </summary>
    /// <param name="job">The job to enqueue.</param>
    /// <param name="cancellationToken">Not used; enqueueing completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> is <c>null</c>.</exception>
    public ValueTask EnqueueAsync(
        ReducerJob job,
        CancellationToken cancellationToken = default)
    {
        // Reject null here, at the producer, instead of letting it surface as a
        // NullReferenceException inside the worker loop far away from the cause.
        ArgumentNullException.ThrowIfNull(job);

        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
/// <summary>
/// In-memory implementation of simulation manifest writer.
/// </summary>
/// <remarks>
/// Manifests are serialised to JSON only to compute size and checksum; the manifest object itself is kept
/// in a dictionary keyed by the (tenant, simulation job) pair. A later write for the same pair replaces
/// the earlier entry.
/// </remarks>
public sealed class InMemorySimulationManifestWriter : ISimulationManifestWriter
{
    // A tuple key keeps tenant and job id separate. A concatenated "tenant/job" string would let
    // ("a/b", "c") and ("a", "b/c") collide and leak a manifest across tenants.
    private readonly Dictionary<(string TenantId, string SimulationJobId), (SimulationManifest Manifest, ManifestStorageResult Result)> _manifests = new();
    private readonly object _lock = new();
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a writer that stamps results with the system clock.
    /// </summary>
    public InMemorySimulationManifestWriter()
        : this(null)
    {
    }

    /// <summary>
    /// Creates a writer that stamps results with <paramref name="timeProvider"/>.
    /// </summary>
    /// <param name="timeProvider">Clock used for <c>StoredAt</c>; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    public InMemorySimulationManifestWriter(TimeProvider? timeProvider)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Stores <paramref name="manifest"/> in memory and returns its storage details.
    /// </summary>
    /// <param name="tenantId">Tenant the manifest belongs to.</param>
    /// <param name="simulationJobId">Identifier of the simulation job.</param>
    /// <param name="manifest">The manifest to store.</param>
    /// <param name="cancellationToken">Not used; the operation completes synchronously.</param>
    /// <returns>
    /// A result with a <c>mem://</c> URI, the lower-case hexadecimal SHA-256 of the UTF-8 JSON,
    /// the JSON size in bytes and the storage timestamp.
    /// </returns>
    /// <exception cref="ArgumentNullException">Any of the reference arguments is <c>null</c>.</exception>
    public ValueTask<ManifestStorageResult> WriteManifestAsync(
        string tenantId,
        string simulationJobId,
        SimulationManifest manifest,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(tenantId);
        ArgumentNullException.ThrowIfNull(simulationJobId);
        ArgumentNullException.ThrowIfNull(manifest);

        // Serialise straight to UTF-8 instead of going through an intermediate string.
        var payload = JsonSerializer.SerializeToUtf8Bytes(manifest);
        var checksum = Convert.ToHexString(SHA256.HashData(payload)).ToLowerInvariant();

        var result = new ManifestStorageResult(
            StorageUri: $"mem://{tenantId}/simulations/{simulationJobId}/manifest.json",
            Checksum: checksum,
            ChecksumAlgorithm: "SHA256",
            SizeBytes: payload.Length,
            StoredAt: _timeProvider.GetUtcNow());

        lock (_lock)
        {
            _manifests[(tenantId, simulationJobId)] = (manifest, result);
        }

        return ValueTask.FromResult(result);
    }

    /// <summary>
    /// Gets a stored manifest (for testing).
    /// </summary>
    /// <param name="tenantId">Tenant the manifest was stored for.</param>
    /// <param name="simulationJobId">Identifier of the simulation job.</param>
    /// <returns>The stored manifest, or <c>null</c> when nothing was stored for the pair.</returns>
    public SimulationManifest? GetManifest(string tenantId, string simulationJobId)
    {
        lock (_lock)
        {
            return _manifests.TryGetValue((tenantId, simulationJobId), out var entry)
                ? entry.Manifest
                : null;
        }
    }
}
/// <summary>
/// Null implementation of reducer completion publisher for testing.
/// The completion event is discarded.
/// </summary>
public sealed class NullReducerCompletionPublisher : IReducerCompletionPublisher
{
    /// <summary>Shared instance; the type holds no state.</summary>
    public static NullReducerCompletionPublisher Instance { get; } = new();

    /// <summary>Does nothing and returns an already-completed task.</summary>
    public ValueTask PublishCompletionAsync(
        ReducerJob job,
        ReducerStatus status,
        ManifestStorageResult? storageResult,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,504 @@
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Simulation;
/// <summary>
/// Security enforcement per SCHED-WORKER-27-303.
/// Enforces tenant isolation, scope checks, and attestation integration for simulation jobs.
/// Includes secret scanning pipeline for uploaded policy sources.
/// </summary>
/// <remarks>
/// A job is rejected when at least one violation of severity <see cref="ViolationSeverity.High"/> or
/// <see cref="ViolationSeverity.Critical"/> is found. That covers tenant/ownership breaches, missing
/// permissions, an exceeded rate limit, an invalid policy attestation (when attestation is required) and
/// detected secrets. Lower-severity violations (for example an invalid attestation on a sampled SBOM)
/// are reported in the result but do not invalidate the job.
/// </remarks>
public sealed class SimulationSecurityEnforcer : ISimulationSecurityEnforcer
{
    private const string ValidatorVersion = "1.0.0";

    /// <summary>Violations at or above this severity make a job invalid.</summary>
    private const ViolationSeverity BlockingSeverity = ViolationSeverity.High;

    /// <summary>Maximum number of offending SBOM ids spelled out in a violation message.</summary>
    private const int MaxSbomIdsInMessage = 5;

    /// <summary>Number of SBOMs (taken from the start of the list) whose attestation is verified.</summary>
    private const int MaxSbomAttestationSamples = 10;

    private readonly ITenantScopeValidator _scopeValidator;
    private readonly IAttestationVerifier _attestationVerifier;
    private readonly ISecretScanner _secretScanner;
    private readonly SchedulerWorkerOptions _options;
    private readonly ILogger<SimulationSecurityEnforcer> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates the enforcer.
    /// </summary>
    /// <param name="scopeValidator">Validator for ownership and rate limits.</param>
    /// <param name="attestationVerifier">Verifier for policy and SBOM attestations.</param>
    /// <param name="secretScanner">Scanner applied to uploaded policy source.</param>
    /// <param name="options">Worker options.</param>
    /// <param name="logger">Logger.</param>
    /// <param name="timeProvider">Clock used for <c>ValidatedAt</c>; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is <c>null</c>.</exception>
    public SimulationSecurityEnforcer(
        ITenantScopeValidator scopeValidator,
        IAttestationVerifier attestationVerifier,
        ISecretScanner secretScanner,
        SchedulerWorkerOptions options,
        ILogger<SimulationSecurityEnforcer> logger,
        TimeProvider? timeProvider = null)
    {
        _scopeValidator = scopeValidator ?? throw new ArgumentNullException(nameof(scopeValidator));
        _attestationVerifier = attestationVerifier ?? throw new ArgumentNullException(nameof(attestationVerifier));
        _secretScanner = secretScanner ?? throw new ArgumentNullException(nameof(secretScanner));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Validates a simulation job for security compliance.
    /// </summary>
    /// <param name="job">The job to validate.</param>
    /// <param name="context">Caller identity, permissions and validation switches.</param>
    /// <param name="cancellationToken">Token used to cancel the validation.</param>
    /// <returns>
    /// The collected violations. <c>IsValid</c> is <c>false</c> when any violation has severity
    /// <see cref="ViolationSeverity.High"/> or above.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> or <paramref name="context"/> is <c>null</c>.</exception>
    public async ValueTask<SecurityValidationResult> ValidateJobAsync(
        SimulationJob job,
        SimulationSecurityContext context,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(job);
        ArgumentNullException.ThrowIfNull(context);

        var violations = new List<SecurityViolation>();

        // 1. Validate tenant isolation
        violations.AddRange(await ValidateTenantIsolationAsync(job, context, cancellationToken).ConfigureAwait(false));

        // 2. Validate scope permissions
        violations.AddRange(await ValidateScopePermissionsAsync(context, cancellationToken).ConfigureAwait(false));

        // 3. Validate attestations if required
        if (context.RequireAttestation)
        {
            violations.AddRange(await ValidateAttestationsAsync(job, context, cancellationToken).ConfigureAwait(false));
        }

        // 4. Scan policy source for secrets whenever one was supplied. The scan must not depend on
        //    any other part of the job, otherwise it could be skipped unintentionally.
        if (context.PolicySource is not null)
        {
            violations.AddRange(await ScanForSecretsAsync(context.PolicySource, cancellationToken).ConfigureAwait(false));
        }

        var isValid = !violations.Exists(static v => v.Severity >= BlockingSeverity);

        if (!isValid)
        {
            _logger.LogWarning(
                "Security validation failed for job {JobId}: {ViolationCount} violations found.",
                job.JobId,
                violations.Count);
        }
        else if (violations.Count > 0)
        {
            _logger.LogInformation(
                "Security validation passed for job {JobId} with {ViolationCount} non-blocking violations.",
                job.JobId,
                violations.Count);
        }

        return CreateResult(isValid, violations);
    }

    /// <summary>
    /// Validates simulation shard results for tenant isolation.
    /// </summary>
    /// <param name="result">The shard result whose findings are checked.</param>
    /// <param name="context">Security context supplying the expected tenant.</param>
    /// <param name="cancellationToken">Token used to cancel the validation.</param>
    /// <returns>
    /// One critical violation per finding that references an SBOM not owned by the tenant;
    /// <c>IsValid</c> is <c>true</c> only when there are none.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="result"/> or <paramref name="context"/> is <c>null</c>.</exception>
    public async ValueTask<SecurityValidationResult> ValidateShardResultAsync(
        SimulationShardResult result,
        SimulationSecurityContext context,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(result);
        ArgumentNullException.ThrowIfNull(context);

        var violations = new List<SecurityViolation>();

        // Many findings share an SBOM: ask the validator once per distinct SBOM id.
        var ownershipBySbom = new Dictionary<string, bool>(StringComparer.Ordinal);
        var findings = result.Findings.IsDefault ? ImmutableArray<SimulationFinding>.Empty : result.Findings;

        foreach (var finding in findings)
        {
            if (!ownershipBySbom.TryGetValue(finding.SbomId, out var belongsToTenant))
            {
                belongsToTenant = await _scopeValidator.ValidateFindingOwnershipAsync(
                    finding.SbomId,
                    context.TenantId,
                    cancellationToken).ConfigureAwait(false);
                ownershipBySbom[finding.SbomId] = belongsToTenant;
            }

            if (!belongsToTenant)
            {
                violations.Add(new SecurityViolation(
                    Code: "TENANT_ISOLATION_BREACH",
                    Message: $"Finding {finding.FindingId} references SBOM not owned by tenant {context.TenantId}.",
                    Severity: ViolationSeverity.Critical,
                    Source: "ShardResultValidator"));
            }
        }

        return CreateResult(violations.Count == 0, violations);
    }

    /// <summary>Wraps the violations into a result stamped with the current time and validator version.</summary>
    private SecurityValidationResult CreateResult(bool isValid, List<SecurityViolation> violations) =>
        new(
            IsValid: isValid,
            Violations: [.. violations],
            ValidatedAt: _timeProvider.GetUtcNow(),
            ValidatorVersion: ValidatorVersion);

    /// <summary>
    /// Checks that the job's tenant matches the context and that every SBOM and the policy belong to
    /// that tenant. All violations produced here are critical.
    /// </summary>
    private async ValueTask<List<SecurityViolation>> ValidateTenantIsolationAsync(
        SimulationJob job,
        SimulationSecurityContext context,
        CancellationToken cancellationToken)
    {
        var violations = new List<SecurityViolation>();

        // Verify job tenant matches context tenant
        if (!string.Equals(job.TenantId, context.TenantId, StringComparison.Ordinal))
        {
            violations.Add(new SecurityViolation(
                Code: "TENANT_MISMATCH",
                Message: $"Job tenant {job.TenantId} does not match context tenant {context.TenantId}.",
                Severity: ViolationSeverity.Critical,
                Source: "TenantIsolation"));
        }

        // Verify all SBOMs belong to the tenant (a default array is treated as "no SBOMs")
        var sbomIds = job.SbomIds.IsDefault ? ImmutableArray<string>.Empty : job.SbomIds;
        var invalidSboms = new List<string>();
        foreach (var sbomId in sbomIds)
        {
            var isOwned = await _scopeValidator.ValidateSbomOwnershipAsync(
                sbomId,
                context.TenantId,
                cancellationToken).ConfigureAwait(false);

            if (!isOwned)
            {
                invalidSboms.Add(sbomId);
            }
        }

        if (invalidSboms.Count > 0)
        {
            // Only signal truncation when ids were actually left out of the message.
            var listedIds = string.Join(", ", invalidSboms.Take(MaxSbomIdsInMessage));
            var truncationMark = invalidSboms.Count > MaxSbomIdsInMessage ? "..." : string.Empty;

            violations.Add(new SecurityViolation(
                Code: "SBOM_OWNERSHIP_VIOLATION",
                Message: $"{invalidSboms.Count} SBOM(s) not owned by tenant {context.TenantId}: {listedIds}{truncationMark}",
                Severity: ViolationSeverity.Critical,
                Source: "TenantIsolation"));
        }

        // Verify policy belongs to tenant
        var policyOwned = await _scopeValidator.ValidatePolicyOwnershipAsync(
            job.PolicyId,
            context.TenantId,
            cancellationToken).ConfigureAwait(false);

        if (!policyOwned)
        {
            violations.Add(new SecurityViolation(
                Code: "POLICY_OWNERSHIP_VIOLATION",
                Message: $"Policy {job.PolicyId} not owned by tenant {context.TenantId}.",
                Severity: ViolationSeverity.Critical,
                Source: "TenantIsolation"));
        }

        return violations;
    }

    /// <summary>
    /// Checks the caller's permissions (<c>simulation:execute</c>, <c>policy:read</c>) and the tenant's
    /// simulation rate limit.
    /// </summary>
    private async ValueTask<List<SecurityViolation>> ValidateScopePermissionsAsync(
        SimulationSecurityContext context,
        CancellationToken cancellationToken)
    {
        var violations = new List<SecurityViolation>();

        // Verify caller has simulation permission
        if (!context.Permissions.Contains("simulation:execute"))
        {
            violations.Add(new SecurityViolation(
                Code: "MISSING_PERMISSION",
                Message: "Caller lacks 'simulation:execute' permission.",
                Severity: ViolationSeverity.Critical,
                Source: "ScopeValidation"));
        }

        // Verify caller has read access to policy
        if (!context.Permissions.Contains("policy:read"))
        {
            violations.Add(new SecurityViolation(
                Code: "MISSING_PERMISSION",
                Message: "Caller lacks 'policy:read' permission.",
                Severity: ViolationSeverity.High,
                Source: "ScopeValidation"));
        }

        // Verify rate limits not exceeded
        var rateLimit = await _scopeValidator.CheckRateLimitAsync(
            context.TenantId,
            "simulation",
            cancellationToken).ConfigureAwait(false);

        if (!rateLimit.IsAllowed)
        {
            // The retry hint is optional; omit the sentence instead of printing "Retry after .".
            var retryHint = rateLimit.RetryAfter is { } retryAfter
                ? $" Retry after {retryAfter}."
                : string.Empty;

            violations.Add(new SecurityViolation(
                Code: "RATE_LIMIT_EXCEEDED",
                Message: $"Simulation rate limit exceeded for tenant {context.TenantId}.{retryHint}",
                Severity: ViolationSeverity.High,
                Source: "ScopeValidation"));
        }

        return violations;
    }

    /// <summary>
    /// Verifies the policy attestation and the attestations of the first
    /// <see cref="MaxSbomAttestationSamples"/> SBOMs of the job.
    /// </summary>
    private async ValueTask<List<SecurityViolation>> ValidateAttestationsAsync(
        SimulationJob job,
        SimulationSecurityContext context,
        CancellationToken cancellationToken)
    {
        var violations = new List<SecurityViolation>();

        // Verify policy has valid attestation
        var policyAttestation = await _attestationVerifier.VerifyPolicyAttestationAsync(
            job.PolicyId,
            context.TenantId,
            cancellationToken).ConfigureAwait(false);

        if (!policyAttestation.IsValid)
        {
            violations.Add(new SecurityViolation(
                Code: "INVALID_POLICY_ATTESTATION",
                Message: $"Policy {job.PolicyId} attestation invalid: {policyAttestation.Reason}.",
                Severity: ViolationSeverity.High,
                Source: "AttestationVerification"));
        }

        // Verify SBOMs have valid attestations (sample check for large sets)
        var sbomIds = job.SbomIds.IsDefault ? ImmutableArray<string>.Empty : job.SbomIds;
        foreach (var sbomId in sbomIds.Take(MaxSbomAttestationSamples))
        {
            var sbomAttestation = await _attestationVerifier.VerifySbomAttestationAsync(
                sbomId,
                context.TenantId,
                cancellationToken).ConfigureAwait(false);

            if (!sbomAttestation.IsValid)
            {
                violations.Add(new SecurityViolation(
                    Code: "INVALID_SBOM_ATTESTATION",
                    Message: $"SBOM {sbomId} attestation invalid: {sbomAttestation.Reason}.",
                    Severity: ViolationSeverity.Medium,
                    Source: "AttestationVerification"));
            }
        }

        return violations;
    }

    /// <summary>
    /// Runs the secret scanner over the policy source and reports every detected secret as a critical
    /// violation. Only the secret type and line number are put into the message, never the content.
    /// </summary>
    private async ValueTask<List<SecurityViolation>> ScanForSecretsAsync(
        string policySource,
        CancellationToken cancellationToken)
    {
        var violations = new List<SecurityViolation>();

        var scanResult = await _secretScanner.ScanAsync(policySource, cancellationToken).ConfigureAwait(false);

        foreach (var secret in scanResult.DetectedSecrets)
        {
            violations.Add(new SecurityViolation(
                Code: "SECRET_DETECTED",
                Message: $"Potential secret detected in policy source: {secret.Type} at line {secret.LineNumber}.",
                Severity: ViolationSeverity.Critical,
                Source: "SecretScanner"));
        }

        return violations;
    }
}
/// <summary>
/// Interface for simulation security enforcement.
/// </summary>
public interface ISimulationSecurityEnforcer
{
/// <summary>
/// Validates a simulation job against the supplied security context.
/// </summary>
/// <param name="job">The job to validate.</param>
/// <param name="context">Caller identity, permissions and validation switches.</param>
/// <param name="cancellationToken">Token used to cancel the validation.</param>
/// <returns>The validation outcome together with the violations that were found.</returns>
ValueTask<SecurityValidationResult> ValidateJobAsync(
SimulationJob job,
SimulationSecurityContext context,
CancellationToken cancellationToken = default);
/// <summary>
/// Validates the result of a simulation shard against the supplied security context.
/// </summary>
/// <param name="result">The shard result to validate.</param>
/// <param name="context">Security context supplying the expected tenant.</param>
/// <param name="cancellationToken">Token used to cancel the validation.</param>
/// <returns>The validation outcome together with the violations that were found.</returns>
ValueTask<SecurityValidationResult> ValidateShardResultAsync(
SimulationShardResult result,
SimulationSecurityContext context,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for tenant scope validation.
/// </summary>
public interface ITenantScopeValidator
{
/// <summary>Returns <c>true</c> when the SBOM <paramref name="sbomId"/> belongs to <paramref name="tenantId"/>.</summary>
ValueTask<bool> ValidateSbomOwnershipAsync(string sbomId, string tenantId, CancellationToken cancellationToken = default);
/// <summary>Returns <c>true</c> when the policy <paramref name="policyId"/> belongs to <paramref name="tenantId"/>.</summary>
ValueTask<bool> ValidatePolicyOwnershipAsync(string policyId, string tenantId, CancellationToken cancellationToken = default);
/// <summary>
/// Returns <c>true</c> when a finding that references the SBOM <paramref name="sbomId"/> belongs to
/// <paramref name="tenantId"/>. Note that the finding is identified by its SBOM id, not by a finding id.
/// </summary>
ValueTask<bool> ValidateFindingOwnershipAsync(string sbomId, string tenantId, CancellationToken cancellationToken = default);
/// <summary>
/// Checks the rate limit of <paramref name="tenantId"/> for <paramref name="operation"/>
/// (the security enforcer passes <c>"simulation"</c>).
/// </summary>
ValueTask<RateLimitResult> CheckRateLimitAsync(string tenantId, string operation, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for attestation verification.
/// </summary>
public interface IAttestationVerifier
{
/// <summary>Verifies the attestation of the policy <paramref name="policyId"/> for <paramref name="tenantId"/>.</summary>
ValueTask<AttestationResult> VerifyPolicyAttestationAsync(string policyId, string tenantId, CancellationToken cancellationToken = default);
/// <summary>Verifies the attestation of the SBOM <paramref name="sbomId"/> for <paramref name="tenantId"/>.</summary>
ValueTask<AttestationResult> VerifySbomAttestationAsync(string sbomId, string tenantId, CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for secret scanning.
/// </summary>
public interface ISecretScanner
{
/// <summary>
/// Scans <paramref name="content"/> for secrets.
/// </summary>
/// <param name="content">The text to scan (the security enforcer passes the uploaded policy source).</param>
/// <param name="cancellationToken">Token used to cancel the scan.</param>
/// <returns>The secrets that were detected.</returns>
ValueTask<SecretScanResult> ScanAsync(string content, CancellationToken cancellationToken = default);
}
/// <summary>
/// Security context for simulation jobs.
/// </summary>
/// <param name="TenantId">Tenant on whose behalf the validation runs; jobs, SBOMs and policies are checked against it.</param>
/// <param name="CallerId">Identity of the caller.</param>
/// <param name="Permissions">
/// Permissions granted to the caller. The enforcer looks for <c>simulation:execute</c> and <c>policy:read</c>.
/// </param>
/// <param name="RequireAttestation">
/// When <c>true</c>, job validation also verifies policy and SBOM attestations; defaults to <c>false</c>.
/// </param>
/// <param name="PolicySource">
/// Policy source text that the enforcer submits to the secret scanner; <c>null</c> (the default) means
/// there is nothing to scan.
/// </param>
public sealed record SimulationSecurityContext(
string TenantId,
string CallerId,
ImmutableHashSet<string> Permissions,
bool RequireAttestation = false,
string? PolicySource = null);
/// <summary>
/// Result of security validation.
/// </summary>
/// <param name="IsValid">Whether the validated item passed.</param>
/// <param name="Violations">All violations found, including those that did not cause a failure.</param>
/// <param name="ValidatedAt">Timestamp at which the validation finished.</param>
/// <param name="ValidatorVersion">Version string of the validator that produced the result.</param>
public sealed record SecurityValidationResult(
bool IsValid,
ImmutableArray<SecurityViolation> Violations,
DateTimeOffset ValidatedAt,
string ValidatorVersion);
/// <summary>
/// A security violation.
/// </summary>
/// <param name="Code">Machine-readable violation code, for example <c>TENANT_MISMATCH</c> or <c>SECRET_DETECTED</c>.</param>
/// <param name="Message">Human-readable description of the violation.</param>
/// <param name="Severity">Severity of the violation.</param>
/// <param name="Source">Name of the check that raised the violation, for example <c>TenantIsolation</c>.</param>
public sealed record SecurityViolation(
string Code,
string Message,
ViolationSeverity Severity,
string Source);
/// <summary>
/// Severity of a security violation.
/// </summary>
/// <remarks>
/// Members are declared in ascending order of severity, so their underlying values (0 through 3)
/// can be compared with relational operators.
/// </remarks>
public enum ViolationSeverity
{
Low,
Medium,
High,
Critical
}
/// <summary>
/// Result of rate limit check.
/// </summary>
/// <param name="IsAllowed">Whether the operation is within the rate limit.</param>
/// <param name="RemainingQuota">Quota remaining after the check.</param>
/// <param name="RetryAfter">Optional retry hint; defaults to <c>null</c>. The enforcer includes it in the violation message.</param>
public sealed record RateLimitResult(
bool IsAllowed,
int RemainingQuota,
TimeSpan? RetryAfter = null);
/// <summary>
/// Result of attestation verification.
/// </summary>
/// <param name="IsValid">Whether the attestation was accepted.</param>
/// <param name="Reason">Optional explanation; the enforcer includes it in the message when the attestation is invalid. Defaults to <c>null</c>.</param>
/// <param name="VerifiedAt">Optional verification timestamp; defaults to <c>null</c>.</param>
public sealed record AttestationResult(
bool IsValid,
string? Reason = null,
DateTimeOffset? VerifiedAt = null);
/// <summary>
/// Result of secret scanning.
/// </summary>
/// <param name="HasSecrets">Whether at least one secret was detected.</param>
/// <param name="DetectedSecrets">The detected secrets; the enforcer raises one violation per entry.</param>
public sealed record SecretScanResult(
bool HasSecrets,
ImmutableArray<DetectedSecret> DetectedSecrets);
/// <summary>
/// A detected secret.
/// </summary>
/// <param name="Type">Kind of secret, for example <c>AWS_ACCESS_KEY</c> or <c>PRIVATE_KEY</c>.</param>
/// <param name="LineNumber">Line of the scanned content on which the secret was found (the regex scanner counts from 1).</param>
/// <param name="Context">Surrounding text with the secret masked (the regex scanner substitutes <c>[REDACTED]</c>).</param>
public sealed record DetectedSecret(
string Type,
int LineNumber,
string Context);
/// <summary>
/// Default implementation of tenant scope validator.
/// </summary>
/// <remarks>
/// Placeholder: every ownership check succeeds and every rate-limit check is allowed with a remaining
/// quota of 100. No real validation is performed, so this type must not be relied on for isolation.
/// </remarks>
public sealed class DefaultTenantScopeValidator : ITenantScopeValidator
{
    /// <summary>Placeholder; always reports the SBOM as owned by the tenant.</summary>
    public ValueTask<bool> ValidateSbomOwnershipAsync(
        string sbomId,
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(true);
    }

    /// <summary>Placeholder; always reports the policy as owned by the tenant.</summary>
    public ValueTask<bool> ValidatePolicyOwnershipAsync(
        string policyId,
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(true);
    }

    /// <summary>Placeholder; always reports the finding as owned by the tenant.</summary>
    public ValueTask<bool> ValidateFindingOwnershipAsync(
        string sbomId,
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(true);
    }

    /// <summary>Placeholder; always allows the operation and reports a remaining quota of 100.</summary>
    public ValueTask<RateLimitResult> CheckRateLimitAsync(
        string tenantId,
        string operation,
        CancellationToken cancellationToken = default)
    {
        var allowed = new RateLimitResult(IsAllowed: true, RemainingQuota: 100);
        return ValueTask.FromResult(allowed);
    }
}
/// <summary>
/// Default implementation of attestation verifier.
/// </summary>
/// <remarks>
/// Placeholder: every attestation is reported as valid, without a reason and stamped with the current
/// UTC time. No real verification is performed.
/// </remarks>
public sealed class DefaultAttestationVerifier : IAttestationVerifier
{
    /// <summary>Placeholder; always reports the policy attestation as valid.</summary>
    public ValueTask<AttestationResult> VerifyPolicyAttestationAsync(
        string policyId,
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(AcceptNow());
    }

    /// <summary>Placeholder; always reports the SBOM attestation as valid.</summary>
    public ValueTask<AttestationResult> VerifySbomAttestationAsync(
        string sbomId,
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(AcceptNow());
    }

    /// <summary>Creates the "valid, verified now" result shared by both placeholder checks.</summary>
    private static AttestationResult AcceptNow() =>
        new(IsValid: true, Reason: null, VerifiedAt: DateTimeOffset.UtcNow);
}
/// <summary>
/// Regex-based secret scanner implementation.
/// </summary>
/// <remarks>
/// The content is scanned line by line (split on <c>'\n'</c>). Every pattern that matches a line yields
/// one <see cref="DetectedSecret"/> with a one-based line number. The context stored with a detection
/// is the line with the matches of <em>all</em> patterns replaced by <c>[REDACTED]</c>, so a second
/// secret on the same line is never exposed through the context of the first.
/// </remarks>
public sealed partial class RegexSecretScanner : ISecretScanner
{
    private const string RedactionMarker = "[REDACTED]";

    private static readonly (string Type, Regex Pattern)[] SecretPatterns =
    [
        ("AWS_ACCESS_KEY", AwsAccessKeyRegex()),
        ("AWS_SECRET_KEY", AwsSecretKeyRegex()),
        ("GITHUB_TOKEN", GithubTokenRegex()),
        ("GENERIC_API_KEY", GenericApiKeyRegex()),
        ("PRIVATE_KEY", PrivateKeyRegex()),
        ("PASSWORD_IN_URL", PasswordInUrlRegex())
    ];

    /// <summary>
    /// Scans <paramref name="content"/> for the known secret patterns.
    /// </summary>
    /// <param name="content">The text to scan.</param>
    /// <param name="cancellationToken">Checked once per line; cancellation surfaces as <see cref="OperationCanceledException"/>.</param>
    /// <returns>The detections in line order (and pattern order within a line).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is <c>null</c>.</exception>
    public ValueTask<SecretScanResult> ScanAsync(string content, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(content);

        var detections = new List<DetectedSecret>();
        var lines = content.Split('\n');

        for (var index = 0; index < lines.Length; index++)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var line = lines[index];
            string? redactedLine = null;

            foreach (var (type, pattern) in SecretPatterns)
            {
                if (!pattern.IsMatch(line))
                {
                    continue;
                }

                // Computed lazily and shared by all detections of this line.
                redactedLine ??= RedactAll(line);
                detections.Add(new DetectedSecret(type, index + 1, redactedLine));
            }
        }

        return ValueTask.FromResult(new SecretScanResult(
            HasSecrets: detections.Count > 0,
            DetectedSecrets: [.. detections]));
    }

    /// <summary>Replaces the matches of every secret pattern in <paramref name="line"/> with the redaction marker.</summary>
    private static string RedactAll(string line)
    {
        var redacted = line;
        foreach (var (_, pattern) in SecretPatterns)
        {
            redacted = pattern.Replace(redacted, RedactionMarker);
        }

        return redacted;
    }

    // RegexOptions.Compiled is not specified below: the regex source generator ignores it.

    [GeneratedRegex(@"AKIA[0-9A-Z]{16}")]
    private static partial Regex AwsAccessKeyRegex();

    // Exactly 40 characters of the key alphabet. The look-arounds stop the pattern from matching a
    // 40-character window inside a longer run such as a SHA-256 digest or a base64 blob.
    [GeneratedRegex(@"(?<![A-Za-z0-9/+=])[A-Za-z0-9/+=]{40}(?![A-Za-z0-9/+=])")]
    private static partial Regex AwsSecretKeyRegex();

    [GeneratedRegex(@"gh[pousr]_[A-Za-z0-9_]{36,}")]
    private static partial Regex GithubTokenRegex();

    // Case-insensitive via (?i); CultureInvariant keeps the casing rules independent of the host culture.
    [GeneratedRegex(@"(?i)(api[_-]?key|apikey|secret[_-]?key)\s*[:=]\s*['""]?[A-Za-z0-9_\-]{20,}['""]?", RegexOptions.CultureInvariant)]
    private static partial Regex GenericApiKeyRegex();

    [GeneratedRegex(@"-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")]
    private static partial Regex PrivateKeyRegex();

    // user:password@ directly after the scheme separator. Whitespace and '/' are excluded so the match
    // cannot stretch across a path or unrelated text until some later '@' on the line.
    [GeneratedRegex(@"://[^\s:/@]+:[^\s/@]+@")]
    private static partial Regex PasswordInUrlRegex();
}
/// <summary>
/// Null implementation of secret scanner for testing.
/// The content is never inspected; every scan reports no secrets.
/// </summary>
public sealed class NullSecretScanner : ISecretScanner
{
    /// <summary>Shared instance; the type holds no state.</summary>
    public static NullSecretScanner Instance { get; } = new();

    /// <summary>Returns an already-completed result without secrets.</summary>
    public ValueTask<SecretScanResult> ScanAsync(
        string content,
        CancellationToken cancellationToken = default)
    {
        var nothingFound = new SecretScanResult(HasSecrets: false, DetectedSecrets: []);
        return ValueTask.FromResult(nothingFound);
    }
}