up the blocking tasks
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Manifest Integrity / Validate Schema Integrity (push) Has been cancelled
Manifest Integrity / Validate Contract Documents (push) Has been cancelled
Manifest Integrity / Validate Pack Fixtures (push) Has been cancelled
Manifest Integrity / Audit SHA256SUMS Files (push) Has been cancelled
Manifest Integrity / Verify Merkle Roots (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Risk Bundle CI / risk-bundle-build (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Risk Bundle CI / risk-bundle-offline-kit (push) Has been cancelled
Risk Bundle CI / publish-checksums (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Mirror Thin Bundle Sign & Verify / mirror-sign (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Manifest Integrity / Validate Schema Integrity (push) Has been cancelled
Manifest Integrity / Validate Contract Documents (push) Has been cancelled
Manifest Integrity / Validate Pack Fixtures (push) Has been cancelled
Manifest Integrity / Audit SHA256SUMS Files (push) Has been cancelled
Manifest Integrity / Verify Merkle Roots (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Risk Bundle CI / risk-bundle-build (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Risk Bundle CI / risk-bundle-offline-kit (push) Has been cancelled
Risk Bundle CI / publish-checksums (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Mirror Thin Bundle Sign & Verify / mirror-sign (push) Has been cancelled
This commit is contained in:
@@ -0,0 +1,568 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Console;
|
||||
|
||||
/// <summary>
/// Evidence bundle coordinator per SCHED-WORKER-CONSOLE-23-202.
/// Coordinates evidence bundle jobs (enqueue, track status, cleanup) and exposes job manifests to Web gateway.
/// Ensures idempotent reruns and cancellation support.
/// </summary>
/// <remarks>
/// Two loops run for the lifetime of the service: a job-processing loop that drains
/// <see cref="IEvidenceBundleJobQueue"/> one job at a time, and a cleanup loop that
/// periodically asks <see cref="IEvidenceBundleStore"/> to purge old bundles.
/// </remarks>
public sealed class EvidenceBundleCoordinator : BackgroundService
{
    /// <summary>Pause before each cleanup pass.</summary>
    private static readonly TimeSpan CleanupInterval = TimeSpan.FromMinutes(5);

    /// <summary>Maximum bundle age handed to the store on each cleanup pass.</summary>
    private static readonly TimeSpan BundleRetention = TimeSpan.FromDays(7);

    private readonly IEvidenceBundleJobQueue _jobQueue;
    private readonly IEvidenceBundleGenerator _bundleGenerator;
    private readonly IEvidenceBundleStore _bundleStore;
    private readonly IJobManifestProvider _manifestProvider;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<EvidenceBundleCoordinator> _logger;

    // Per-job cancellation sources, keyed by job id, for jobs currently inside ProcessJobAsync.
    private readonly ConcurrentDictionary<string, CancellationTokenSource> _runningJobs = new();

    /// <summary>
    /// Creates the coordinator.
    /// </summary>
    /// <param name="jobQueue">Queue that jobs are dequeued from and whose status is updated.</param>
    /// <param name="bundleGenerator">Produces the bundle for a job.</param>
    /// <param name="bundleStore">Persists bundles and answers idempotency lookups.</param>
    /// <param name="manifestProvider">Receives the manifest written after each job finishes or fails.</param>
    /// <param name="options">Worker options; the dispatch batch size, idle delay and retry backoff are read from it.</param>
    /// <param name="timeProvider">Clock used for manifest timestamps; <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics instance (stored, not otherwise used by this class).</param>
    /// <param name="logger">Logger for coordinator events.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public EvidenceBundleCoordinator(
        IEvidenceBundleJobQueue jobQueue,
        IEvidenceBundleGenerator bundleGenerator,
        IEvidenceBundleStore bundleStore,
        IJobManifestProvider manifestProvider,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<EvidenceBundleCoordinator> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _bundleGenerator = bundleGenerator ?? throw new ArgumentNullException(nameof(bundleGenerator));
        _bundleStore = bundleStore ?? throw new ArgumentNullException(nameof(bundleStore));
        _manifestProvider = manifestProvider ?? throw new ArgumentNullException(nameof(manifestProvider));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the job-processing loop until <paramref name="stoppingToken"/> is cancelled, with the
    /// cleanup loop running alongside it. On exit every tracked job is cancelled and the cleanup
    /// loop is awaited.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Evidence bundle coordinator started.");

        // Start cleanup task
        var cleanupTask = RunCleanupLoopAsync(stoppingToken);

        try
        {
            await RunJobProcessingLoopAsync(stoppingToken).ConfigureAwait(false);
        }
        finally
        {
            // Cancel all running jobs
            foreach (var cts in _runningJobs.Values)
            {
                try
                {
                    cts.Cancel();
                }
                catch (ObjectDisposedException)
                {
                    // The job finished and disposed its source between the snapshot and this call.
                }
            }

            await cleanupTask.ConfigureAwait(false);
        }

        _logger.LogInformation("Evidence bundle coordinator stopped.");
    }

    /// <summary>
    /// Dequeues batches of jobs and handles each one in order: cancellation requests are routed to
    /// <see cref="HandleCancellationAsync"/>, jobs whose bundle already exists as completed are
    /// skipped, everything else goes to <see cref="ProcessJobAsync"/>.
    /// </summary>
    private async Task RunJobProcessingLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    // Check for cancellation request
                    if (job.Status == BundleJobStatus.CancellationRequested)
                    {
                        await HandleCancellationAsync(job, stoppingToken).ConfigureAwait(false);
                        continue;
                    }

                    // Check idempotency
                    var existingBundle = await _bundleStore.GetBundleAsync(
                        job.TenantId,
                        job.IdempotencyKey,
                        stoppingToken).ConfigureAwait(false);

                    if (existingBundle is not null && existingBundle.Status == BundleStatus.Completed)
                    {
                        // NOTE(review): the skipped job's queue status and manifest are left untouched here;
                        // confirm whether a rerun should also be marked Completed.
                        _logger.LogInformation(
                            "Job {JobId} already completed (idempotency key: {IdempotencyKey}), skipping.",
                            job.JobId,
                            job.IdempotencyKey);
                        continue;
                    }

                    // Process job
                    await ProcessJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in evidence bundle coordinator loop.");

                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested while backing off: leave the loop instead of
                    // letting the cancellation escape from this catch block.
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Generates, stores and publishes the bundle for one job, recording the outcome in the queue
    /// status and the job manifest.
    /// </summary>
    /// <param name="job">Job to process.</param>
    /// <param name="stoppingToken">Service shutdown token; the per-job token is linked to it.</param>
    private async Task ProcessJobAsync(EvidenceBundleJob job, CancellationToken stoppingToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        // Disposed on every exit path, including the early return below.
        using var jobCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken);

        if (!_runningJobs.TryAdd(job.JobId, jobCts))
        {
            _logger.LogWarning("Job {JobId} is already running.", job.JobId);
            return;
        }

        _logger.LogInformation(
            "Processing evidence bundle job {JobId} for tenant {TenantId}.",
            job.JobId,
            job.TenantId);

        try
        {
            // Update status to running
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Running,
                stoppingToken).ConfigureAwait(false);

            // Generate bundle
            var bundle = await _bundleGenerator.GenerateAsync(
                job,
                jobCts.Token).ConfigureAwait(false);

            // Store bundle
            await _bundleStore.StoreBundleAsync(
                job.TenantId,
                job.IdempotencyKey,
                bundle,
                stoppingToken).ConfigureAwait(false);

            // Update manifest
            await _manifestProvider.UpdateManifestAsync(
                job.TenantId,
                job.JobId,
                new JobManifest(
                    JobId: job.JobId,
                    TenantId: job.TenantId,
                    Status: BundleJobStatus.Completed,
                    BundleUri: bundle.StorageUri,
                    BundleSize: bundle.SizeBytes,
                    BundleChecksum: bundle.Checksum,
                    StartedAt: startedAt,
                    CompletedAt: _timeProvider.GetUtcNow(),
                    Metadata: job.Metadata),
                stoppingToken).ConfigureAwait(false);

            // Update job status
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Completed,
                stoppingToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Evidence bundle job {JobId} completed: {BundleUri}, size={Size} bytes in {Duration}ms.",
                job.JobId,
                bundle.StorageUri,
                bundle.SizeBytes,
                duration.TotalMilliseconds);
        }
        catch (OperationCanceledException) when (jobCts.Token.IsCancellationRequested && !stoppingToken.IsCancellationRequested)
        {
            _logger.LogInformation("Job {JobId} was cancelled.", job.JobId);

            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Cancelled,
                stoppingToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (ex is not OperationCanceledException || !stoppingToken.IsCancellationRequested)
        {
            // Also reached for an OperationCanceledException raised while neither token is
            // cancelled (for example a timeout inside a dependency); such a job is recorded
            // as failed rather than being left in the Running state.
            _logger.LogError(ex, "Evidence bundle job {JobId} failed.", job.JobId);

            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Failed,
                stoppingToken).ConfigureAwait(false);

            await _manifestProvider.UpdateManifestAsync(
                job.TenantId,
                job.JobId,
                new JobManifest(
                    JobId: job.JobId,
                    TenantId: job.TenantId,
                    Status: BundleJobStatus.Failed,
                    BundleUri: null,
                    BundleSize: null,
                    BundleChecksum: null,
                    StartedAt: startedAt,
                    CompletedAt: _timeProvider.GetUtcNow(),
                    Error: ex.Message,
                    Metadata: job.Metadata),
                stoppingToken).ConfigureAwait(false);
        }
        finally
        {
            _runningJobs.TryRemove(job.JobId, out _);
        }
    }

    /// <summary>
    /// Acts on a job dequeued with <see cref="BundleJobStatus.CancellationRequested"/>: signals the
    /// tracked cancellation source if the job is registered as running, otherwise marks it cancelled.
    /// </summary>
    private async Task HandleCancellationAsync(EvidenceBundleJob job, CancellationToken stoppingToken)
    {
        _logger.LogInformation("Cancelling job {JobId}.", job.JobId);

        if (_runningJobs.TryGetValue(job.JobId, out var cts))
        {
            cts.Cancel();
        }
        else
        {
            // Job not running, mark as cancelled directly
            await _jobQueue.UpdateStatusAsync(
                job.JobId,
                BundleJobStatus.Cancelled,
                stoppingToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Waits <see cref="CleanupInterval"/>, then asks the store to purge bundles older than
    /// <see cref="BundleRetention"/>; repeats until shutdown. Failures are logged and the loop continues.
    /// </summary>
    private async Task RunCleanupLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(CleanupInterval, stoppingToken).ConfigureAwait(false);

                // Cleanup expired bundles
                var expiredCount = await _bundleStore.CleanupExpiredAsync(
                    BundleRetention,
                    stoppingToken).ConfigureAwait(false);

                if (expiredCount > 0)
                {
                    _logger.LogInformation("Cleaned up {Count} expired evidence bundles.", expiredCount);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in cleanup loop.");
            }
        }
    }

    /// <summary>
    /// Requests cancellation of a running job.
    /// </summary>
    /// <param name="jobId">Identifier of the job to cancel.</param>
    /// <param name="cancellationToken">Token passed to the queue's status update.</param>
    /// <remarks>
    /// Only records <see cref="BundleJobStatus.CancellationRequested"/> in the queue; the
    /// processing loop acts on it when it next dequeues the job with that status.
    /// </remarks>
    public async ValueTask RequestCancellationAsync(string jobId, CancellationToken cancellationToken = default)
    {
        await _jobQueue.UpdateStatusAsync(
            jobId,
            BundleJobStatus.CancellationRequested,
            cancellationToken).ConfigureAwait(false);
    }
}
|
||||
|
||||
/// <summary>
/// Queue abstraction the coordinator uses to fetch evidence bundle jobs and record their status.
/// </summary>
public interface IEvidenceBundleJobQueue
{
    /// <summary>Takes jobs off the queue.</summary>
    /// <param name="maxCount">Upper bound on the number of jobs to return.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns>The dequeued jobs; the coordinator treats an empty list as "nothing to do".</returns>
    ValueTask<IReadOnlyList<EvidenceBundleJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>Adds a job to the queue.</summary>
    /// <param name="job">Job to enqueue.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask EnqueueAsync(
        EvidenceBundleJob job,
        CancellationToken cancellationToken = default);

    /// <summary>Records a new status for a job.</summary>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="status">Status to record.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask UpdateStatusAsync(
        string jobId,
        BundleJobStatus status,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Produces the evidence bundle for a job.
/// </summary>
public interface IEvidenceBundleGenerator
{
    /// <summary>Builds the bundle described by <paramref name="job"/>.</summary>
    /// <param name="job">Job to generate a bundle for.</param>
    /// <param name="cancellationToken">Token for abandoning generation; the coordinator passes its per-job token.</param>
    /// <returns>Description of the generated bundle.</returns>
    ValueTask<GeneratedBundle> GenerateAsync(
        EvidenceBundleJob job,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Storage abstraction for evidence bundles, addressed by tenant and idempotency key.
/// </summary>
public interface IEvidenceBundleStore
{
    /// <summary>Saves a generated bundle under the given tenant and idempotency key.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="idempotencyKey">Idempotency key of the job that produced the bundle.</param>
    /// <param name="bundle">Bundle to store.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask StoreBundleAsync(
        string tenantId,
        string idempotencyKey,
        GeneratedBundle bundle,
        CancellationToken cancellationToken = default);

    /// <summary>Looks up the bundle stored for a tenant and idempotency key.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="idempotencyKey">Idempotency key to look up.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns>The stored bundle, or <c>null</c> when none exists.</returns>
    ValueTask<StoredBundle?> GetBundleAsync(
        string tenantId,
        string idempotencyKey,
        CancellationToken cancellationToken = default);

    /// <summary>Removes bundles older than <paramref name="maxAge"/>.</summary>
    /// <param name="maxAge">Age beyond which a bundle is removed.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns>Number of bundles removed.</returns>
    ValueTask<int> CleanupExpiredAsync(
        TimeSpan maxAge,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Read/write access to job manifests (exposed to Web gateway).
/// </summary>
public interface IJobManifestProvider
{
    /// <summary>Writes the manifest for a job.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="manifest">Manifest to record.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask UpdateManifestAsync(
        string tenantId,
        string jobId,
        JobManifest manifest,
        CancellationToken cancellationToken = default);

    /// <summary>Fetches the manifest for a job.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns>The manifest, or <c>null</c> when none is recorded.</returns>
    ValueTask<JobManifest?> GetManifestAsync(
        string tenantId,
        string jobId,
        CancellationToken cancellationToken = default);

    /// <summary>Lists manifests belonging to a tenant.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="maxCount">Upper bound on the number of manifests returned.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns>Up to <paramref name="maxCount"/> manifests.</returns>
    ValueTask<IReadOnlyList<JobManifest>> ListManifestsAsync(
        string tenantId,
        int maxCount,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// A request to build an evidence bundle.
/// </summary>
/// <param name="JobId">Identifier used for status updates and manifests.</param>
/// <param name="TenantId">Tenant the job belongs to.</param>
/// <param name="IdempotencyKey">Key that, together with <paramref name="TenantId"/>, is used to look up an already stored bundle.</param>
/// <param name="Status">Status of the job at the time this value was created.</param>
/// <param name="BundleType">Kind of bundle requested.</param>
/// <param name="ArtifactIds">Artifact identifiers attached to the job.</param>
/// <param name="RequestedAt">Time the job was requested.</param>
/// <param name="Metadata">Optional key/value metadata; copied into the job manifest by the coordinator.</param>
public sealed record EvidenceBundleJob(
    string JobId,
    string TenantId,
    string IdempotencyKey,
    BundleJobStatus Status,
    BundleType BundleType,
    ImmutableArray<string> ArtifactIds,
    DateTimeOffset RequestedAt,
    ImmutableDictionary<string, string>? Metadata = null);
|
||||
|
||||
/// <summary>
/// Lifecycle states of an evidence bundle job.
/// </summary>
public enum BundleJobStatus
{
    /// <summary>Initial enum value (numeric 0).</summary>
    Pending = 0,

    /// <summary>Set by the coordinator when it starts processing the job.</summary>
    Running = 1,

    /// <summary>Set after the bundle was stored and the manifest written.</summary>
    Completed = 2,

    /// <summary>Set when processing threw an error.</summary>
    Failed = 3,

    /// <summary>Set when cancellation of the job has been requested.</summary>
    CancellationRequested = 4,

    /// <summary>Set once a cancellation request has been honoured.</summary>
    Cancelled = 5
}
|
||||
|
||||
/// <summary>
/// Kinds of evidence bundle that can be requested.
/// </summary>
public enum BundleType
{
    Sbom = 0,
    Findings = 1,
    Attestation = 2,
    PolicyResult = 3,
    Combined = 4
}
|
||||
|
||||
/// <summary>
/// Describes a bundle produced by an <see cref="IEvidenceBundleGenerator"/>.
/// </summary>
/// <param name="BundleId">Identifier of the bundle.</param>
/// <param name="StorageUri">Location of the bundle; reported in the job manifest.</param>
/// <param name="SizeBytes">Size of the bundle in bytes.</param>
/// <param name="Checksum">Checksum of the bundle; reported in the job manifest.</param>
/// <param name="ChecksumAlgorithm">Name of the algorithm that produced <paramref name="Checksum"/>.</param>
/// <param name="BundleType">Kind of bundle.</param>
/// <param name="ArtifactCount">Number of artifacts in the bundle.</param>
/// <param name="GeneratedAt">Time the bundle was generated.</param>
public sealed record GeneratedBundle(
    string BundleId,
    string StorageUri,
    long SizeBytes,
    string Checksum,
    string ChecksumAlgorithm,
    BundleType BundleType,
    int ArtifactCount,
    DateTimeOffset GeneratedAt);
|
||||
|
||||
/// <summary>
/// A bundle as held by an <see cref="IEvidenceBundleStore"/>.
/// </summary>
/// <param name="BundleId">Identifier of the bundle.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="IdempotencyKey">Idempotency key the bundle was stored under.</param>
/// <param name="StorageUri">Location of the bundle.</param>
/// <param name="SizeBytes">Size of the bundle in bytes.</param>
/// <param name="Status">Storage status; the coordinator skips jobs whose bundle is <see cref="BundleStatus.Completed"/>.</param>
/// <param name="CreatedAt">Time the bundle was stored.</param>
/// <param name="ExpiresAt">Expiry time, or <c>null</c> when none is set.</param>
public sealed record StoredBundle(
    string BundleId,
    string TenantId,
    string IdempotencyKey,
    string StorageUri,
    long SizeBytes,
    BundleStatus Status,
    DateTimeOffset CreatedAt,
    DateTimeOffset? ExpiresAt);
|
||||
|
||||
/// <summary>
/// Storage states of a stored bundle.
/// </summary>
public enum BundleStatus
{
    Pending = 0,
    Completed = 1,
    Expired = 2
}
|
||||
|
||||
/// <summary>
/// Outcome summary of a bundle job, as exposed to Web gateway.
/// </summary>
/// <param name="JobId">Identifier of the job.</param>
/// <param name="TenantId">Owning tenant.</param>
/// <param name="Status">Final or current job status.</param>
/// <param name="BundleUri">Location of the bundle; <c>null</c> in manifests written for failed jobs.</param>
/// <param name="BundleSize">Bundle size in bytes; <c>null</c> in manifests written for failed jobs.</param>
/// <param name="BundleChecksum">Bundle checksum; <c>null</c> in manifests written for failed jobs.</param>
/// <param name="StartedAt">Time processing started.</param>
/// <param name="CompletedAt">Time processing ended, or <c>null</c>.</param>
/// <param name="Error">Error message for failed jobs; <c>null</c> otherwise.</param>
/// <param name="Metadata">Optional key/value metadata carried over from the job.</param>
public sealed record JobManifest(
    string JobId,
    string TenantId,
    BundleJobStatus Status,
    string? BundleUri,
    long? BundleSize,
    string? BundleChecksum,
    DateTimeOffset StartedAt,
    DateTimeOffset? CompletedAt,
    string? Error = null,
    ImmutableDictionary<string, string>? Metadata = null);
|
||||
|
||||
/// <summary>
/// Process-local <see cref="IEvidenceBundleJobQueue"/> backed by a concurrent queue plus a
/// side table holding the most recently recorded status per job id.
/// </summary>
public sealed class InMemoryEvidenceBundleJobQueue : IEvidenceBundleJobQueue
{
    private readonly ConcurrentQueue<EvidenceBundleJob> _queue = new();
    private readonly ConcurrentDictionary<string, BundleJobStatus> _statuses = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> jobs from the queue. Each returned job carries
    /// the status currently recorded for its id (e.g. a cancellation request made after enqueue).
    /// </summary>
    public ValueTask<IReadOnlyList<EvidenceBundleJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var batch = new List<EvidenceBundleJob>();

        while (batch.Count < maxCount)
        {
            if (!_queue.TryDequeue(out var next))
            {
                break;
            }

            // Overlay the latest recorded status, if any, onto the queued snapshot.
            var current = _statuses.TryGetValue(next.JobId, out var latest)
                ? next with { Status = latest }
                : next;

            batch.Add(current);
        }

        IReadOnlyList<EvidenceBundleJob> dequeued = batch;
        return ValueTask.FromResult(dequeued);
    }

    /// <summary>Appends the job to the queue and records its status.</summary>
    public ValueTask EnqueueAsync(EvidenceBundleJob job, CancellationToken cancellationToken = default)
    {
        _queue.Enqueue(job);
        _statuses[job.JobId] = job.Status;

        return ValueTask.CompletedTask;
    }

    /// <summary>Overwrites the recorded status for <paramref name="jobId"/>.</summary>
    public ValueTask UpdateStatusAsync(string jobId, BundleJobStatus status, CancellationToken cancellationToken = default)
    {
        _statuses[jobId] = status;

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of evidence bundle store.
/// </summary>
/// <remarks>
/// Bundles are keyed by the (tenant, idempotency key) pair. A tuple key is used instead of a
/// concatenated string so that identifiers containing the separator cannot collide across tenants.
/// </remarks>
public sealed class InMemoryEvidenceBundleStore : IEvidenceBundleStore
{
    /// <summary>Lifetime assigned to every stored bundle.</summary>
    private static readonly TimeSpan DefaultLifetime = TimeSpan.FromDays(7);

    private readonly ConcurrentDictionary<(string TenantId, string IdempotencyKey), StoredBundle> _bundles = new();

    /// <summary>
    /// Stores (or replaces) the bundle for the given tenant and idempotency key with status
    /// <see cref="BundleStatus.Completed"/>.
    /// </summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="idempotencyKey">Idempotency key of the producing job.</param>
    /// <param name="bundle">Generated bundle to record.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public ValueTask StoreBundleAsync(string tenantId, string idempotencyKey, GeneratedBundle bundle, CancellationToken cancellationToken = default)
    {
        // Read the clock once so that ExpiresAt is exactly CreatedAt + lifetime.
        var now = DateTimeOffset.UtcNow;

        _bundles[(tenantId, idempotencyKey)] = new StoredBundle(
            bundle.BundleId,
            tenantId,
            idempotencyKey,
            bundle.StorageUri,
            bundle.SizeBytes,
            BundleStatus.Completed,
            now,
            now + DefaultLifetime);

        return ValueTask.CompletedTask;
    }

    /// <summary>Returns the bundle stored for the pair, or <c>null</c> when there is none.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="idempotencyKey">Idempotency key to look up.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public ValueTask<StoredBundle?> GetBundleAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(_bundles.TryGetValue((tenantId, idempotencyKey), out var bundle) ? bundle : null);
    }

    /// <summary>
    /// Removes every bundle whose <see cref="StoredBundle.CreatedAt"/> is older than
    /// <paramref name="maxAge"/>.
    /// </summary>
    /// <param name="maxAge">Age beyond which a bundle is removed.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    /// <returns>Number of bundles actually removed by this call.</returns>
    public ValueTask<int> CleanupExpiredAsync(TimeSpan maxAge, CancellationToken cancellationToken = default)
    {
        var cutoff = DateTimeOffset.UtcNow - maxAge;
        var removed = 0;

        foreach (var entry in _bundles)
        {
            // Key+value removal: an entry replaced by a fresh bundle since it was read is left
            // alone, and only removals that really happened are counted.
            if (entry.Value.CreatedAt < cutoff && _bundles.TryRemove(entry))
            {
                removed++;
            }
        }

        return ValueTask.FromResult(removed);
    }
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of job manifest provider.
/// </summary>
/// <remarks>
/// Manifests are keyed by the (tenant, job id) pair. A tuple key with ordinal comparison replaces
/// the earlier concatenated string key, whose prefix match could return another tenant's
/// manifests when a tenant id contained the separator.
/// </remarks>
public sealed class InMemoryJobManifestProvider : IJobManifestProvider
{
    private readonly ConcurrentDictionary<(string TenantId, string JobId), JobManifest> _manifests = new();

    /// <summary>Stores (or replaces) the manifest for the given tenant and job.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="manifest">Manifest to record.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public ValueTask UpdateManifestAsync(string tenantId, string jobId, JobManifest manifest, CancellationToken cancellationToken = default)
    {
        _manifests[(tenantId, jobId)] = manifest;
        return ValueTask.CompletedTask;
    }

    /// <summary>Returns the manifest recorded for the pair, or <c>null</c> when there is none.</summary>
    /// <param name="tenantId">Owning tenant.</param>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public ValueTask<JobManifest?> GetManifestAsync(string tenantId, string jobId, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(_manifests.TryGetValue((tenantId, jobId), out var manifest) ? manifest : null);
    }

    /// <summary>
    /// Returns up to <paramref name="maxCount"/> manifests stored under <paramref name="tenantId"/>,
    /// most recently started first.
    /// </summary>
    /// <param name="tenantId">Owning tenant; matched exactly (ordinal).</param>
    /// <param name="maxCount">Upper bound on the number of manifests returned.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public ValueTask<IReadOnlyList<JobManifest>> ListManifestsAsync(string tenantId, int maxCount, CancellationToken cancellationToken = default)
    {
        var results = _manifests
            .Where(kvp => string.Equals(kvp.Key.TenantId, tenantId, StringComparison.Ordinal))
            .Select(kvp => kvp.Value)
            .OrderByDescending(m => m.StartedAt)
            .Take(maxCount)
            .ToList();

        return ValueTask.FromResult<IReadOnlyList<JobManifest>>(results);
    }
}
|
||||
|
||||
/// <summary>
/// Generator for testing that fabricates a zero-byte placeholder bundle instead of building one.
/// </summary>
public sealed class NullEvidenceBundleGenerator : IEvidenceBundleGenerator
{
    // Checksum value reported for every placeholder bundle.
    private const string PlaceholderChecksum = "0000000000000000000000000000000000000000000000000000000000000000";

    /// <summary>Shared instance.</summary>
    public static NullEvidenceBundleGenerator Instance { get; } = new();

    /// <summary>
    /// Returns a placeholder bundle whose id and <c>mem://</c> URI are derived from the job,
    /// with size 0 and the current UTC time as generation time.
    /// </summary>
    public ValueTask<GeneratedBundle> GenerateAsync(EvidenceBundleJob job, CancellationToken cancellationToken = default)
    {
        var placeholder = new GeneratedBundle(
            $"bundle-{job.JobId}",
            $"mem://{job.TenantId}/bundles/{job.JobId}.zip",
            0,
            PlaceholderChecksum,
            "SHA256",
            job.BundleType,
            job.ArtifactIds.Length,
            DateTimeOffset.UtcNow);

        return ValueTask.FromResult(placeholder);
    }
}
|
||||
@@ -0,0 +1,383 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Console;
|
||||
|
||||
/// <summary>
/// Progress streaming worker per SCHED-WORKER-CONSOLE-23-201.
/// Streams run progress events (stage status, tuples processed, SLA hints) to Redis/NATS for Console SSE.
/// Includes heartbeat, dedupe, and retention policy. Publishes metrics and structured logs for queue lag.
/// </summary>
/// <remarks>
/// Two loops run for the lifetime of the service: an event loop that pulls batches from
/// <see cref="IProgressEventSource"/>, de-duplicates them, publishes them per tenant and
/// acknowledges them; and a heartbeat loop that calls <see cref="IHeartbeatService"/> periodically.
/// </remarks>
public sealed class ProgressStreamingWorker : BackgroundService
{
    /// <summary>Maximum number of events requested from the source per iteration.</summary>
    private const int EventBatchSize = 100;

    /// <summary>Pause when the source returned no events.</summary>
    private static readonly TimeSpan IdlePollDelay = TimeSpan.FromMilliseconds(100);

    /// <summary>Pause after an unexpected error in the event loop.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(1);

    /// <summary>Pause between two heartbeat attempts.</summary>
    private static readonly TimeSpan HeartbeatInterval = TimeSpan.FromSeconds(10);

    /// <summary>Event age above which a lag warning is logged.</summary>
    private static readonly TimeSpan LagWarningThreshold = TimeSpan.FromSeconds(5);

    private readonly IProgressEventSource _eventSource;
    private readonly IProgressStreamPublisher _streamPublisher;
    private readonly IProgressEventDeduplicator _deduplicator;
    private readonly IHeartbeatService _heartbeatService;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ProgressStreamingWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="eventSource">Source of progress events and target of acknowledgements.</param>
    /// <param name="streamPublisher">Publishes events per tenant.</param>
    /// <param name="deduplicator">Decides whether an event id has been seen before.</param>
    /// <param name="heartbeatService">Sends the periodic heartbeat.</param>
    /// <param name="options">Worker options (stored, not otherwise used by this class).</param>
    /// <param name="timeProvider">Clock used for lag measurement; <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics instance (stored, not otherwise used by this class).</param>
    /// <param name="logger">Logger for worker events.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public ProgressStreamingWorker(
        IProgressEventSource eventSource,
        IProgressStreamPublisher streamPublisher,
        IProgressEventDeduplicator deduplicator,
        IHeartbeatService heartbeatService,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ProgressStreamingWorker> logger)
    {
        _eventSource = eventSource ?? throw new ArgumentNullException(nameof(eventSource));
        _streamPublisher = streamPublisher ?? throw new ArgumentNullException(nameof(streamPublisher));
        _deduplicator = deduplicator ?? throw new ArgumentNullException(nameof(deduplicator));
        _heartbeatService = heartbeatService ?? throw new ArgumentNullException(nameof(heartbeatService));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the event loop until <paramref name="stoppingToken"/> is cancelled, with the heartbeat
    /// loop running alongside it; the heartbeat loop is awaited before returning.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Progress streaming worker started.");

        // Start heartbeat task
        var heartbeatTask = RunHeartbeatLoopAsync(stoppingToken);

        try
        {
            await RunEventStreamingLoopAsync(stoppingToken).ConfigureAwait(false);
        }
        finally
        {
            await heartbeatTask.ConfigureAwait(false);
        }

        _logger.LogInformation("Progress streaming worker stopped.");
    }

    /// <summary>
    /// Pulls event batches, drops duplicates, publishes the rest grouped by tenant and then
    /// acknowledges them at the source.
    /// </summary>
    private async Task RunEventStreamingLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Get next batch of progress events
                var events = await _eventSource
                    .GetEventsAsync(EventBatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (events.Count == 0)
                {
                    await Task.Delay(IdlePollDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                // Deduplicate events
                var uniqueEvents = new List<ProgressEvent>();
                foreach (var evt in events)
                {
                    if (await _deduplicator.TryMarkAsProcessedAsync(evt.EventId, stoppingToken).ConfigureAwait(false))
                    {
                        uniqueEvents.Add(evt);
                    }
                    else
                    {
                        _logger.LogDebug("Skipping duplicate event {EventId}.", evt.EventId);
                    }
                }

                if (uniqueEvents.Count == 0)
                {
                    continue;
                }

                // Group by tenant for efficient publishing
                foreach (var tenantGroup in uniqueEvents.GroupBy(e => e.TenantId))
                {
                    await PublishTenantBatchAsync(tenantGroup.Key, tenantGroup.ToList(), stoppingToken).ConfigureAwait(false);
                }

                // Acknowledge processed events
                // NOTE(review): events whose publish failed above are acknowledged as well;
                // confirm whether they should instead be left for redelivery.
                await _eventSource.AcknowledgeAsync(
                    uniqueEvents.Select(e => e.EventId).ToList(),
                    stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in progress streaming loop.");

                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested while backing off: leave the loop instead of
                    // letting the cancellation escape from this catch block.
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Publishes one tenant's events and logs a warning for each event older than
    /// <see cref="LagWarningThreshold"/>. Publish failures other than cancellation are logged
    /// and swallowed so that other tenants are still served.
    /// </summary>
    private async Task PublishTenantBatchAsync(string tenantId, List<ProgressEvent> tenantEvents, CancellationToken stoppingToken)
    {
        try
        {
            // Publish to stream
            await _streamPublisher.PublishAsync(
                tenantId,
                tenantEvents,
                stoppingToken).ConfigureAwait(false);

            // Log queue lag metrics
            foreach (var evt in tenantEvents)
            {
                var lag = _timeProvider.GetUtcNow() - evt.Timestamp;
                if (lag > LagWarningThreshold)
                {
                    _logger.LogWarning(
                        "Progress event lag detected: {EventId}, lag={Lag}s",
                        evt.EventId,
                        lag.TotalSeconds);
                }
            }
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Failed to publish {Count} events for tenant {TenantId}.",
                tenantEvents.Count,
                tenantId);
        }
    }

    /// <summary>
    /// Sends a heartbeat, waits <see cref="HeartbeatInterval"/>, and repeats until shutdown.
    /// The wait also applies after a failed attempt, so a failing heartbeat service is retried
    /// at the normal cadence rather than in a tight loop.
    /// </summary>
    private async Task RunHeartbeatLoopAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await _heartbeatService.SendHeartbeatAsync(stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error sending heartbeat.");
            }

            try
            {
                await Task.Delay(HeartbeatInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Supplies progress events to the streaming worker and receives acknowledgements for them.
/// </summary>
public interface IProgressEventSource
{
    /// <summary>Fetches the next batch of events.</summary>
    /// <param name="maxCount">Upper bound on the number of events returned.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns>The fetched events; the worker treats an empty list as "nothing to do".</returns>
    ValueTask<IReadOnlyList<ProgressEvent>> GetEventsAsync(
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>Marks the given events as handled.</summary>
    /// <param name="eventIds">Identifiers of the events to acknowledge.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask AcknowledgeAsync(
        IReadOnlyList<string> eventIds,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Publishes progress events to a tenant's stream.
/// </summary>
public interface IProgressStreamPublisher
{
    /// <summary>Publishes a batch of events for one tenant.</summary>
    /// <param name="tenantId">Tenant whose stream receives the events.</param>
    /// <param name="events">Events to publish.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask PublishAsync(
        string tenantId,
        IReadOnlyList<ProgressEvent> events,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Tracks which progress events have already been handled.
/// </summary>
public interface IProgressEventDeduplicator
{
    /// <summary>
    /// Records <paramref name="eventId"/> as processed unless it already was.
    /// </summary>
    /// <param name="eventId">Identifier of the event.</param>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    /// <returns><c>true</c> the first time an id is seen; <c>false</c> for a duplicate.</returns>
    ValueTask<bool> TryMarkAsProcessedAsync(
        string eventId,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Emits the worker's heartbeat.
/// </summary>
public interface IHeartbeatService
{
    /// <summary>Sends a single heartbeat.</summary>
    /// <param name="cancellationToken">Token for abandoning the call.</param>
    ValueTask SendHeartbeatAsync(
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// One progress update for a run, as streamed by the progress streaming worker.
/// </summary>
/// <param name="EventId">Identifier used for de-duplication and acknowledgement.</param>
/// <param name="TenantId">Tenant the event belongs to; events are published grouped by this value.</param>
/// <param name="RunId">Identifier of the run the event describes.</param>
/// <param name="Type">Kind of event.</param>
/// <param name="Stage">Stage of the run.</param>
/// <param name="TuplesProcessed">Number of tuples processed so far.</param>
/// <param name="TuplesTotal">Total number of tuples.</param>
/// <param name="SlaHint">Optional SLA information.</param>
/// <param name="Timestamp">Event time; the worker measures queue lag against it.</param>
/// <param name="Metadata">Optional key/value metadata.</param>
public sealed record ProgressEvent(
    string EventId,
    string TenantId,
    string RunId,
    ProgressEventType Type,
    RunStage Stage,
    int TuplesProcessed,
    int TuplesTotal,
    SlaHint? SlaHint,
    DateTimeOffset Timestamp,
    ImmutableDictionary<string, string>? Metadata = null);
|
||||
|
||||
/// <summary>
/// Kinds of progress event.
/// </summary>
public enum ProgressEventType
{
    RunStarted = 0,
    StageChanged = 1,
    ProgressUpdate = 2,
    SlaWarning = 3,
    RunCompleted = 4,
    RunFailed = 5,
    Heartbeat = 6
}
|
||||
|
||||
/// <summary>
/// Stages a run can be in.
/// </summary>
public enum RunStage
{
    Queued = 0,
    Initializing = 1,
    Scanning = 2,
    Resolving = 3,
    Evaluating = 4,
    Aggregating = 5,
    Finalizing = 6,
    Completed = 7,
    Failed = 8,
    Cancelled = 9
}
|
||||
|
||||
/// <summary>
/// SLA information attached to a progress event.
/// </summary>
/// <param name="EstimatedRemaining">Estimated time remaining.</param>
/// <param name="SlaThreshold">SLA threshold the estimate relates to.</param>
/// <param name="AtRisk">Whether the SLA is flagged as at risk.</param>
/// <param name="Message">Optional free-text message.</param>
public sealed record SlaHint(
    TimeSpan EstimatedRemaining,
    TimeSpan SlaThreshold,
    bool AtRisk,
    string? Message = null);
|
||||
|
||||
/// <summary>
/// Process-local <see cref="IProgressEventSource"/> backed by a concurrent queue and a set of
/// acknowledged event ids.
/// </summary>
public sealed class InMemoryProgressEventSource : IProgressEventSource
{
    private readonly ConcurrentQueue<ProgressEvent> _events = new();
    private readonly ConcurrentDictionary<string, bool> _acknowledged = new();

    /// <summary>
    /// Dequeues events until <paramref name="maxCount"/> have been collected or the queue is
    /// empty. Events whose id was already acknowledged are dequeued but not returned.
    /// </summary>
    public ValueTask<IReadOnlyList<ProgressEvent>> GetEventsAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var batch = new List<ProgressEvent>();

        while (batch.Count < maxCount && _events.TryDequeue(out var candidate))
        {
            if (_acknowledged.ContainsKey(candidate.EventId))
            {
                continue;
            }

            batch.Add(candidate);
        }

        IReadOnlyList<ProgressEvent> fetched = batch;
        return ValueTask.FromResult(fetched);
    }

    /// <summary>Remembers each given id as acknowledged.</summary>
    public ValueTask AcknowledgeAsync(IReadOnlyList<string> eventIds, CancellationToken cancellationToken = default)
    {
        foreach (var id in eventIds)
        {
            // The stored value is always true, so adding only when absent leaves the same state.
            _acknowledged.TryAdd(id, true);
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Enqueues an event (for testing).
    /// </summary>
    public void Enqueue(ProgressEvent evt) => _events.Enqueue(evt);
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of progress stream publisher.
/// Published events are appended to a per-tenant list.
/// </summary>
public sealed class InMemoryProgressStreamPublisher : IProgressStreamPublisher
{
    // Each per-tenant list doubles as its own lock object; it is never exposed to callers.
    private readonly ConcurrentDictionary<string, List<ProgressEvent>> _streams = new();

    /// <summary>
    /// Appends <paramref name="events"/> to the stream of <paramref name="tenantId"/>.
    /// </summary>
    /// <param name="tenantId">Tenant whose stream receives the events.</param>
    /// <param name="events">Events to append, in order.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    public ValueTask PublishAsync(string tenantId, IReadOnlyList<ProgressEvent> events, CancellationToken cancellationToken = default)
    {
        var stream = _streams.GetOrAdd(tenantId, static _ => []);

        lock (stream)
        {
            stream.AddRange(events);
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Gets published events for a tenant (for testing).
    /// </summary>
    /// <param name="tenantId">Tenant whose events are requested.</param>
    /// <returns>A snapshot copy of the tenant's events, or an empty list when nothing was published.</returns>
    public IReadOnlyList<ProgressEvent> GetEvents(string tenantId)
    {
        if (!_streams.TryGetValue(tenantId, out var stream))
        {
            return [];
        }

        // Copy under the same lock PublishAsync uses; List<T> is not safe to read
        // while another thread is appending to it.
        lock (stream)
        {
            return new List<ProgressEvent>(stream);
        }
    }
}
|
||||
|
||||
/// <summary>
/// In-memory event deduplicator that remembers processed event IDs together with the time they were recorded.
/// </summary>
/// <remarks>
/// Entries older than the retention period are only purged once more than 10000 IDs are being tracked.
/// </remarks>
public sealed class InMemoryProgressEventDeduplicator : IProgressEventDeduplicator
{
    private const int PurgeThreshold = 10000;

    private readonly ConcurrentDictionary<string, DateTimeOffset> _processed = new();
    private readonly TimeSpan _retentionPeriod;

    /// <summary>
    /// Creates the deduplicator.
    /// </summary>
    /// <param name="retentionPeriod">Retention period for recorded IDs; 30 minutes when <c>null</c>.</param>
    public InMemoryProgressEventDeduplicator(TimeSpan? retentionPeriod = null)
        => _retentionPeriod = retentionPeriod ?? TimeSpan.FromMinutes(30);

    /// <summary>
    /// Attempts to record <paramref name="eventId"/> as processed.
    /// </summary>
    /// <param name="eventId">ID of the event.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns><c>true</c> when the ID was newly recorded; <c>false</c> when it is already tracked.</returns>
    public ValueTask<bool> TryMarkAsProcessedAsync(string eventId, CancellationToken cancellationToken = default)
    {
        var timestamp = DateTimeOffset.UtcNow;

        if (_processed.Count > PurgeThreshold)
        {
            PurgeOlderThan(timestamp - _retentionPeriod);
        }

        var newlyRecorded = _processed.TryAdd(eventId, timestamp);
        return new ValueTask<bool>(newlyRecorded);
    }

    // Removes every entry recorded strictly before the cutoff.
    private void PurgeOlderThan(DateTimeOffset cutoff)
    {
        var staleKeys = new List<string>();

        foreach (var entry in _processed)
        {
            if (entry.Value < cutoff)
            {
                staleKeys.Add(entry.Key);
            }
        }

        foreach (var staleKey in staleKeys)
        {
            _processed.TryRemove(staleKey, out _);
        }
    }
}
|
||||
|
||||
/// <summary>
/// <see cref="IHeartbeatService"/> that does nothing.
/// </summary>
public sealed class NullHeartbeatService : IHeartbeatService
{
    /// <summary>
    /// Shared instance.
    /// </summary>
    public static NullHeartbeatService Instance { get; } = new NullHeartbeatService();

    /// <summary>
    /// Returns an already-completed task without doing any work.
    /// </summary>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask SendHeartbeatAsync(CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,276 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Exceptions;
|
||||
|
||||
/// <summary>
/// Exception lifecycle worker per SCHED-WORKER-25-101.
/// Handles auto-activation/expiry of exceptions and publishes exception.* events with retries/backoff.
/// </summary>
public sealed class ExceptionLifecycleWorker : BackgroundService
{
    // Pause between two lifecycle passes.
    private static readonly TimeSpan PollInterval = TimeSpan.FromMinutes(1);

    // Pause after a pass failed with an unexpected error.
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    // First delay of the exponential publish back-off; doubled after every failed attempt.
    private static readonly TimeSpan InitialPublishRetryDelay = TimeSpan.FromSeconds(1);

    private readonly IExceptionRepository _exceptionRepository;
    private readonly IExceptionEventPublisher _eventPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ExceptionLifecycleWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="exceptionRepository">Repository used to query and update exceptions.</param>
    /// <param name="eventPublisher">Publisher for lifecycle events.</param>
    /// <param name="options">Worker options.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="metrics">Worker metrics.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is <c>null</c>.</exception>
    public ExceptionLifecycleWorker(
        IExceptionRepository exceptionRepository,
        IExceptionEventPublisher eventPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ExceptionLifecycleWorker> logger)
    {
        _exceptionRepository = exceptionRepository ?? throw new ArgumentNullException(nameof(exceptionRepository));
        _eventPublisher = eventPublisher ?? throw new ArgumentNullException(nameof(eventPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs activation and expiry passes once per minute until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Token signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Exception lifecycle worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var now = _timeProvider.GetUtcNow();

                // Process pending activations
                await ProcessPendingActivationsAsync(now, stoppingToken).ConfigureAwait(false);

                // Process expired exceptions
                await ProcessExpiredExceptionsAsync(now, stoppingToken).ConfigureAwait(false);

                await Task.Delay(PollInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (System.Exception ex)
            {
                _logger.LogError(ex, "Error in exception lifecycle worker loop.");

                // Shutdown may be requested while backing off; leave the loop cleanly
                // instead of letting the cancellation escape ExecuteAsync.
                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Exception lifecycle worker stopped.");
    }

    // Moves every exception returned by GetPendingActivationsAsync to Active and publishes an Activated event.
    // A failure on one exception is logged and does not stop the remaining ones.
    private async Task ProcessPendingActivationsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        var pendingActivations = await _exceptionRepository
            .GetPendingActivationsAsync(now, cancellationToken)
            .ConfigureAwait(false);

        foreach (var exception in pendingActivations)
        {
            try
            {
                var activated = exception with
                {
                    State = ExceptionState.Active,
                    ActivatedAt = now
                };

                await _exceptionRepository
                    .UpdateAsync(activated, cancellationToken)
                    .ConfigureAwait(false);

                await PublishEventWithRetryAsync(
                    ExceptionEventType.Activated,
                    activated,
                    cancellationToken).ConfigureAwait(false);

                _logger.LogInformation(
                    "Exception {ExceptionId} activated for tenant {TenantId}.",
                    exception.ExceptionId,
                    exception.TenantId);
            }
            catch (System.Exception ex) when (!IsCancellation(ex, cancellationToken))
            {
                _logger.LogError(
                    ex,
                    "Failed to activate exception {ExceptionId}.",
                    exception.ExceptionId);
            }
        }
    }

    // Moves every exception returned by GetExpiredExceptionsAsync to Expired and publishes an Expired event.
    // A failure on one exception is logged and does not stop the remaining ones.
    private async Task ProcessExpiredExceptionsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        var expired = await _exceptionRepository
            .GetExpiredExceptionsAsync(now, cancellationToken)
            .ConfigureAwait(false);

        foreach (var exception in expired)
        {
            try
            {
                var expiredRecord = exception with
                {
                    State = ExceptionState.Expired,
                    ExpiredAt = now
                };

                await _exceptionRepository
                    .UpdateAsync(expiredRecord, cancellationToken)
                    .ConfigureAwait(false);

                await PublishEventWithRetryAsync(
                    ExceptionEventType.Expired,
                    expiredRecord,
                    cancellationToken).ConfigureAwait(false);

                _logger.LogInformation(
                    "Exception {ExceptionId} expired for tenant {TenantId}.",
                    exception.ExceptionId,
                    exception.TenantId);
            }
            catch (System.Exception ex) when (!IsCancellation(ex, cancellationToken))
            {
                _logger.LogError(
                    ex,
                    "Failed to expire exception {ExceptionId}.",
                    exception.ExceptionId);
            }
        }
    }

    // Publishes one event, retrying with exponential back-off. The number of attempts comes from
    // ExceptionOptions.MaxPublishRetries; the failure of the last attempt propagates to the caller.
    private async Task PublishEventWithRetryAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken)
    {
        // Always make at least one attempt, even when the option is configured as zero or negative.
        var maxAttempts = Math.Max(1, _options.Exception.MaxPublishRetries);
        var delay = InitialPublishRetryDelay;

        for (var attempt = 1; attempt <= maxAttempts; attempt++)
        {
            try
            {
                await _eventPublisher.PublishAsync(
                    eventType,
                    exception,
                    cancellationToken).ConfigureAwait(false);

                return;
            }
            catch (System.Exception ex) when (attempt < maxAttempts && !IsCancellation(ex, cancellationToken))
            {
                _logger.LogWarning(
                    ex,
                    "Failed to publish {EventType} event for exception {ExceptionId} (attempt {Attempt}), retrying...",
                    eventType,
                    exception.ExceptionId,
                    attempt);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay *= 2; // Exponential backoff
            }
        }
    }

    // An OperationCanceledException only counts as cancellation when our own token was cancelled;
    // otherwise (for example a timeout surfaced by a dependency) it is handled like any other failure.
    private static bool IsCancellation(System.Exception ex, CancellationToken cancellationToken)
        => ex is OperationCanceledException && cancellationToken.IsCancellationRequested;
}
|
||||
|
||||
/// <summary>
/// Storage abstraction for <see cref="ExceptionRecord"/> instances.
/// </summary>
public interface IExceptionRepository
{
    /// <summary>
    /// Gets the exceptions that are pending activation relative to <paramref name="asOf"/>.
    /// </summary>
    /// <param name="asOf">Reference instant for the query.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    ValueTask<IReadOnlyList<ExceptionRecord>> GetPendingActivationsAsync(
        DateTimeOffset asOf,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the exceptions that are expired relative to <paramref name="asOf"/>.
    /// </summary>
    /// <param name="asOf">Reference instant for the query.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    ValueTask<IReadOnlyList<ExceptionRecord>> GetExpiredExceptionsAsync(
        DateTimeOffset asOf,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the exceptions expiring within the given window.
    /// </summary>
    /// <param name="windowStart">Start of the window.</param>
    /// <param name="windowEnd">End of the window.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    ValueTask<IReadOnlyList<ExceptionRecord>> GetExpiringExceptionsAsync(
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Persists the given record.
    /// </summary>
    /// <param name="record">Record to store.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    ValueTask UpdateAsync(
        ExceptionRecord record,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Looks up a single exception by ID.
    /// </summary>
    /// <param name="exceptionId">ID of the exception.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    /// <returns>The record, or <c>null</c> (presumably when no such exception exists - confirm with implementations).</returns>
    ValueTask<ExceptionRecord?> GetAsync(
        string exceptionId,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Immutable snapshot of an exception tracked by the system.
/// </summary>
/// <param name="ExceptionId">Identifier of the exception.</param>
/// <param name="TenantId">Identifier of the owning tenant.</param>
/// <param name="PolicyId">Identifier of the related policy.</param>
/// <param name="VulnerabilityId">Identifier of the related vulnerability.</param>
/// <param name="ComponentPurl">Optional component package URL.</param>
/// <param name="State">Current lifecycle state.</param>
/// <param name="CreatedAt">Creation timestamp.</param>
/// <param name="ActivationDate">Optional activation date (presumably the scheduled one - confirm).</param>
/// <param name="ExpirationDate">Optional expiration date (presumably the scheduled one - confirm).</param>
/// <param name="ActivatedAt">Optional timestamp of the actual activation; defaults to <c>null</c>.</param>
/// <param name="ExpiredAt">Optional timestamp of the actual expiry; defaults to <c>null</c>.</param>
/// <param name="Justification">Optional justification text; defaults to <c>null</c>.</param>
/// <param name="CreatedBy">Optional creator identity; defaults to <c>null</c>.</param>
public sealed record ExceptionRecord(
    string ExceptionId,
    string TenantId,
    string PolicyId,
    string VulnerabilityId,
    string? ComponentPurl,
    ExceptionState State,
    DateTimeOffset CreatedAt,
    DateTimeOffset? ActivationDate,
    DateTimeOffset? ExpirationDate,
    DateTimeOffset? ActivatedAt = null,
    DateTimeOffset? ExpiredAt = null,
    string? Justification = null,
    string? CreatedBy = null);
|
||||
|
||||
/// <summary>
/// Lifecycle states of an exception.
/// </summary>
/// <remarks>
/// The numeric values are spelled out explicitly; they match the implicit ordinals of the declaration order.
/// </remarks>
public enum ExceptionState
{
    Pending = 0,
    Active = 1,
    Expired = 2,
    Revoked = 3,
}
|
||||
|
||||
/// <summary>
/// Kinds of event published over the exception lifecycle.
/// </summary>
/// <remarks>
/// The numeric values are spelled out explicitly; they match the implicit ordinals of the declaration order.
/// </remarks>
public enum ExceptionEventType
{
    Created = 0,
    Activated = 1,
    Expiring = 2,
    Expired = 3,
    Revoked = 4,
}
|
||||
|
||||
/// <summary>
/// Contract for publishing exception lifecycle events.
/// </summary>
public interface IExceptionEventPublisher
{
    /// <summary>
    /// Publishes one event for the given exception.
    /// </summary>
    /// <param name="eventType">Kind of event to publish.</param>
    /// <param name="exception">Exception record the event refers to.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    ValueTask PublishAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// <see cref="IExceptionEventPublisher"/> that discards every event (for testing).
/// </summary>
public sealed class NullExceptionEventPublisher : IExceptionEventPublisher
{
    /// <summary>
    /// Shared instance.
    /// </summary>
    public static NullExceptionEventPublisher Instance { get; } = new NullExceptionEventPublisher();

    /// <summary>
    /// Returns an already-completed task without publishing anything.
    /// </summary>
    /// <param name="eventType">Ignored.</param>
    /// <param name="exception">Ignored.</param>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask PublishAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,313 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Exceptions;
|
||||
|
||||
/// <summary>
/// Expiring notification worker per SCHED-WORKER-25-102.
/// Generates digests of soon-to-expire exceptions, publishes 'expiring' events for them,
/// and emits alerts for Console dashboards.
/// </summary>
public sealed class ExpiringNotificationWorker : BackgroundService
{
    // Pause after a pass failed with an unexpected error.
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    // First delay of the exponential publish back-off; doubled after every failed attempt.
    private static readonly TimeSpan InitialPublishRetryDelay = TimeSpan.FromSeconds(1);

    private readonly IExceptionRepository _exceptionRepository;
    private readonly IExceptionEventPublisher _eventPublisher;
    private readonly IExpiringDigestService _digestService;
    private readonly IExpiringAlertService _alertService;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ExpiringNotificationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="exceptionRepository">Repository used to query exceptions.</param>
    /// <param name="eventPublisher">Publisher for 'expiring' events.</param>
    /// <param name="digestService">Service that builds per-tenant digests.</param>
    /// <param name="alertService">Service that emits alerts for digests.</param>
    /// <param name="options">Worker options.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="metrics">Worker metrics.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is <c>null</c>.</exception>
    public ExpiringNotificationWorker(
        IExceptionRepository exceptionRepository,
        IExceptionEventPublisher eventPublisher,
        IExpiringDigestService digestService,
        IExpiringAlertService alertService,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ExpiringNotificationWorker> logger)
    {
        _exceptionRepository = exceptionRepository ?? throw new ArgumentNullException(nameof(exceptionRepository));
        _eventPublisher = eventPublisher ?? throw new ArgumentNullException(nameof(eventPublisher));
        _digestService = digestService ?? throw new ArgumentNullException(nameof(digestService));
        _alertService = alertService ?? throw new ArgumentNullException(nameof(alertService));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs an expiring-exceptions pass every <c>ExpiringCheckInterval</c> until
    /// <paramref name="stoppingToken"/> is cancelled. Returns immediately when the worker is disabled.
    /// </summary>
    /// <param name="stoppingToken">Token signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Exception.ExpiringNotificationEnabled)
        {
            _logger.LogInformation("Expiring notification worker is disabled.");
            return;
        }

        _logger.LogInformation("Expiring notification worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var now = _timeProvider.GetUtcNow();

                // Process exceptions expiring within the notification window
                await ProcessExpiringExceptionsAsync(now, stoppingToken).ConfigureAwait(false);

                // Wait for the configured interval before next check
                await Task.Delay(_options.Exception.ExpiringCheckInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (System.Exception ex)
            {
                _logger.LogError(ex, "Error in expiring notification worker loop.");

                // Shutdown may be requested while backing off; leave the loop cleanly
                // instead of letting the cancellation escape ExecuteAsync.
                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Expiring notification worker stopped.");
    }

    // Queries the exceptions expiring within [now, now + ExpiringNotificationWindow], then per tenant:
    // publishes an Expiring event for each one, generates a digest and emits an alert for it.
    // A failure for one tenant is logged and does not stop the remaining tenants.
    private async Task ProcessExpiringExceptionsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        // Calculate the notification window
        var windowStart = now;
        var windowEnd = now.Add(_options.Exception.ExpiringNotificationWindow);

        // Get exceptions expiring within the window
        var expiringExceptions = await _exceptionRepository
            .GetExpiringExceptionsAsync(windowStart, windowEnd, cancellationToken)
            .ConfigureAwait(false);

        if (expiringExceptions.Count == 0)
        {
            _logger.LogDebug("No expiring exceptions found within notification window.");
            return;
        }

        _logger.LogInformation(
            "Found {Count} exceptions expiring within notification window ({WindowStart} - {WindowEnd}).",
            expiringExceptions.Count,
            windowStart,
            windowEnd);

        // Group by tenant for digest generation
        var byTenant = expiringExceptions
            .GroupBy(static e => e.TenantId)
            .ToList();

        foreach (var tenantGroup in byTenant)
        {
            var tenantId = tenantGroup.Key;
            var tenantExpiring = tenantGroup.ToList();

            try
            {
                // Publish an expiring event for each exception
                foreach (var exception in tenantExpiring)
                {
                    await MarkAsExpiringAndNotifyAsync(exception, cancellationToken)
                        .ConfigureAwait(false);
                }

                // Generate digest for this tenant
                var digest = await _digestService.GenerateDigestAsync(
                    tenantId,
                    tenantExpiring,
                    windowEnd,
                    cancellationToken).ConfigureAwait(false);

                // Emit alert for the digest
                await _alertService.EmitExpiringAlertAsync(
                    tenantId,
                    digest,
                    cancellationToken).ConfigureAwait(false);

                _logger.LogInformation(
                    "Generated expiring digest for tenant {TenantId}: {ExceptionCount} exceptions, digest ID {DigestId}.",
                    tenantId,
                    tenantExpiring.Count,
                    digest.DigestId);
            }
            catch (System.Exception ex) when (!IsCancellation(ex, cancellationToken))
            {
                _logger.LogError(
                    ex,
                    "Failed to process expiring exceptions for tenant {TenantId}.",
                    tenantId);
            }
        }
    }

    // Publishes an Expiring event for an Active exception; other states are skipped.
    // No state is written back to the repository here. A publish failure is logged as a
    // warning and swallowed so the tenant's digest is still generated.
    private async Task MarkAsExpiringAndNotifyAsync(
        ExceptionRecord exception,
        CancellationToken cancellationToken)
    {
        // Only active exceptions are notified
        if (exception.State != ExceptionState.Active)
        {
            return;
        }

        try
        {
            // Publish expiring event with retry
            await PublishEventWithRetryAsync(
                ExceptionEventType.Expiring,
                exception,
                cancellationToken).ConfigureAwait(false);

            _logger.LogDebug(
                "Exception {ExceptionId} for tenant {TenantId} marked as expiring (expires at {ExpirationDate}).",
                exception.ExceptionId,
                exception.TenantId,
                exception.ExpirationDate);
        }
        catch (System.Exception ex) when (!IsCancellation(ex, cancellationToken))
        {
            _logger.LogWarning(
                ex,
                "Failed to publish expiring event for exception {ExceptionId}.",
                exception.ExceptionId);
        }
    }

    // Publishes one event, retrying with exponential back-off. The number of attempts comes from
    // ExceptionOptions.MaxPublishRetries; the failure of the last attempt propagates to the caller.
    private async Task PublishEventWithRetryAsync(
        ExceptionEventType eventType,
        ExceptionRecord exception,
        CancellationToken cancellationToken)
    {
        // Always make at least one attempt, even when the option is configured as zero or negative.
        var maxAttempts = Math.Max(1, _options.Exception.MaxPublishRetries);
        var delay = InitialPublishRetryDelay;

        for (var attempt = 1; attempt <= maxAttempts; attempt++)
        {
            try
            {
                await _eventPublisher.PublishAsync(
                    eventType,
                    exception,
                    cancellationToken).ConfigureAwait(false);

                return;
            }
            catch (System.Exception ex) when (attempt < maxAttempts && !IsCancellation(ex, cancellationToken))
            {
                _logger.LogWarning(
                    ex,
                    "Failed to publish {EventType} event for exception {ExceptionId} (attempt {Attempt}), retrying...",
                    eventType,
                    exception.ExceptionId,
                    attempt);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay *= 2; // Exponential backoff
            }
        }
    }

    // An OperationCanceledException only counts as cancellation when our own token was cancelled;
    // otherwise (for example a timeout surfaced by a dependency) it is handled like any other failure.
    private static bool IsCancellation(System.Exception ex, CancellationToken cancellationToken)
        => ex is OperationCanceledException && cancellationToken.IsCancellationRequested;
}
|
||||
|
||||
/// <summary>
/// Contract for building digests of expiring exceptions.
/// </summary>
public interface IExpiringDigestService
{
    /// <summary>
    /// Builds a digest of the expiring exceptions of one tenant.
    /// </summary>
    /// <param name="tenantId">Tenant the digest is generated for.</param>
    /// <param name="expiringExceptions">Exceptions to include.</param>
    /// <param name="windowEnd">End of the notification window.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    /// <returns>The generated digest.</returns>
    ValueTask<ExpiringDigest> GenerateDigestAsync(
        string tenantId,
        IReadOnlyList<ExceptionRecord> expiringExceptions,
        DateTimeOffset windowEnd,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Contract for emitting alerts about expiring exceptions.
/// </summary>
public interface IExpiringAlertService
{
    /// <summary>
    /// Emits an alert for the given digest.
    /// </summary>
    /// <param name="tenantId">Tenant the alert is emitted for.</param>
    /// <param name="digest">Digest the alert is based on.</param>
    /// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
    ValueTask EmitExpiringAlertAsync(
        string tenantId,
        ExpiringDigest digest,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Digest of expiring exceptions used for notification.
/// </summary>
/// <param name="DigestId">Identifier of the digest.</param>
/// <param name="TenantId">Tenant the digest belongs to.</param>
/// <param name="GeneratedAt">Time the digest was generated.</param>
/// <param name="WindowEnd">End of the notification window the digest covers.</param>
/// <param name="TotalCount">Total number of exceptions in the digest.</param>
/// <param name="CriticalCount">Count of critical entries (classification criteria are not visible here).</param>
/// <param name="HighCount">Count of high entries (classification criteria are not visible here).</param>
/// <param name="Entries">Individual digest entries.</param>
public sealed record ExpiringDigest(
    string DigestId,
    string TenantId,
    DateTimeOffset GeneratedAt,
    DateTimeOffset WindowEnd,
    int TotalCount,
    int CriticalCount,
    int HighCount,
    ImmutableArray<ExpiringDigestEntry> Entries);
|
||||
|
||||
/// <summary>
/// One exception listed in an expiring digest.
/// </summary>
/// <param name="ExceptionId">Identifier of the exception.</param>
/// <param name="PolicyId">Identifier of the related policy.</param>
/// <param name="VulnerabilityId">Identifier of the related vulnerability.</param>
/// <param name="ComponentPurl">Optional component package URL.</param>
/// <param name="ExpirationDate">Date the exception expires.</param>
/// <param name="TimeUntilExpiry">Time remaining until expiry.</param>
public sealed record ExpiringDigestEntry(
    string ExceptionId,
    string PolicyId,
    string VulnerabilityId,
    string? ComponentPurl,
    DateTimeOffset ExpirationDate,
    TimeSpan TimeUntilExpiry);
|
||||
|
||||
/// <summary>
/// <see cref="IExpiringDigestService"/> for testing that returns a digest without entries.
/// </summary>
public sealed class NullExpiringDigestService : IExpiringDigestService
{
    /// <summary>
    /// Shared instance.
    /// </summary>
    public static NullExpiringDigestService Instance { get; } = new NullExpiringDigestService();

    /// <summary>
    /// Builds a digest with a fresh GUID as ID, the current UTC time as generation time,
    /// the input count as total, zero critical/high counts and no entries.
    /// </summary>
    /// <param name="tenantId">Tenant copied into the digest.</param>
    /// <param name="expiringExceptions">Only its count is used.</param>
    /// <param name="windowEnd">Window end copied into the digest.</param>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask<ExpiringDigest> GenerateDigestAsync(
        string tenantId,
        IReadOnlyList<ExceptionRecord> expiringExceptions,
        DateTimeOffset windowEnd,
        CancellationToken cancellationToken = default)
    {
        var digestId = Guid.NewGuid().ToString("N");
        var generatedAt = DateTimeOffset.UtcNow;

        return new ValueTask<ExpiringDigest>(
            new ExpiringDigest(
                digestId,
                tenantId,
                generatedAt,
                windowEnd,
                expiringExceptions.Count,
                0,
                0,
                ImmutableArray<ExpiringDigestEntry>.Empty));
    }
}
|
||||
|
||||
/// <summary>
/// <see cref="IExpiringAlertService"/> that discards every alert (for testing).
/// </summary>
public sealed class NullExpiringAlertService : IExpiringAlertService
{
    /// <summary>
    /// Shared instance.
    /// </summary>
    public static NullExpiringAlertService Instance { get; } = new NullExpiringAlertService();

    /// <summary>
    /// Returns an already-completed task without emitting anything.
    /// </summary>
    /// <param name="tenantId">Ignored.</param>
    /// <param name="digest">Ignored.</param>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask EmitExpiringAlertAsync(
        string tenantId,
        ExpiringDigest digest,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -20,7 +20,8 @@ public sealed class SchedulerWorkerMetrics : IDisposable
|
||||
private readonly Counter<long> _runnerDeltaFindingsTotal;
|
||||
private readonly Counter<long> _runnerKevHitsTotal;
|
||||
private readonly Counter<long> _surfaceManifestPrefetchTotal;
|
||||
private readonly Counter<long> _surfaceManifestPrefetchTotal;
|
||||
private readonly Counter<long> _policyReEvaluationTotal;
|
||||
private readonly Histogram<double> _policyReEvaluationDurationSeconds;
|
||||
private readonly Histogram<double> _runDurationSeconds;
|
||||
private readonly UpDownCounter<long> _runsActive;
|
||||
private readonly Counter<long> _graphJobsTotal;
|
||||
@@ -71,10 +72,14 @@ public sealed class SchedulerWorkerMetrics : IDisposable
|
||||
"scheduler_surface_manifest_prefetch_total",
|
||||
unit: "attempt",
|
||||
description: "Surface manifest prefetch attempts grouped by result.");
|
||||
_surfaceManifestPrefetchTotal = _meter.CreateCounter<long>(
|
||||
"scheduler_surface_manifest_prefetch_total",
|
||||
unit: "attempt",
|
||||
description: "Surface manifest prefetch attempts grouped by result.");
|
||||
_policyReEvaluationTotal = _meter.CreateCounter<long>(
|
||||
"scheduler_policy_reevaluation_total",
|
||||
unit: "count",
|
||||
description: "Policy re-evaluation jobs grouped by tenant and status.");
|
||||
_policyReEvaluationDurationSeconds = _meter.CreateHistogram<double>(
|
||||
"scheduler_policy_reevaluation_duration_seconds",
|
||||
unit: "s",
|
||||
description: "Policy re-evaluation job durations grouped by tenant and status.");
|
||||
_runDurationSeconds = _meter.CreateHistogram<double>(
|
||||
"scheduler_run_duration_seconds",
|
||||
unit: "s",
|
||||
@@ -188,6 +193,18 @@ public sealed class SchedulerWorkerMetrics : IDisposable
|
||||
_surfaceManifestPrefetchTotal.Add(1, tags);
|
||||
}
|
||||
|
||||
/// <summary>
/// Records one policy re-evaluation: increments the re-evaluation counter and records its duration,
/// both tagged with <c>tenant</c> and <c>status</c>.
/// </summary>
/// <param name="tenantId">Value of the <c>tenant</c> tag.</param>
/// <param name="status">Value of the <c>status</c> tag.</param>
/// <param name="duration">Duration of the re-evaluation; negative values are recorded as zero seconds.</param>
public void RecordPolicyReEvaluation(string tenantId, string status, TimeSpan duration)
{
    var seconds = Math.Max(duration.TotalSeconds, 0d);

    var tags = new KeyValuePair<string, object?>[]
    {
        new("tenant", tenantId),
        new("status", status),
    };

    _policyReEvaluationTotal.Add(1, tags);
    _policyReEvaluationDurationSeconds.Record(seconds, tags);
}
|
||||
|
||||
public void RecordDeltaSummaries(string mode, IReadOnlyList<DeltaSummary> deltas)
|
||||
{
|
||||
if (deltas.Count == 0)
|
||||
|
||||
@@ -15,12 +15,21 @@ public sealed class SchedulerWorkerOptions
|
||||
|
||||
public GraphOptions Graph { get; set; } = new();
|
||||
|
||||
public SurfaceOptions Surface { get; set; } = new();
|
||||
|
||||
public ExceptionOptions Exception { get; set; } = new();
|
||||
|
||||
public ReachabilityOptions Reachability { get; set; } = new();
|
||||
|
||||
/// <summary>
/// Validates every option group by calling its own <c>Validate()</c>, in the order
/// Planner, Runner, Policy, Graph, Surface, Exception, Reachability.
/// An exception thrown by one group propagates and the remaining groups are not checked.
/// </summary>
public void Validate()
{
    // Scheduling pipeline options.
    Planner.Validate();
    Runner.Validate();

    // Policy and graph options.
    Policy.Validate();
    Graph.Validate();

    // Surface, exception lifecycle and reachability options.
    Surface.Validate();
    Exception.Validate();
    Reachability.Validate();
}
|
||||
|
||||
public sealed class PlannerOptions
|
||||
@@ -280,21 +289,21 @@ public sealed class SchedulerWorkerOptions
|
||||
/// </summary>
|
||||
public bool Enabled { get; set; } = true;
|
||||
|
||||
public DispatchOptions Dispatch { get; set; } = new();
|
||||
|
||||
public ApiOptions Api { get; set; } = new();
|
||||
|
||||
public TargetingOptions Targeting { get; set; } = new();
|
||||
|
||||
public WebhookOptions Webhook { get; set; } = new();
|
||||
|
||||
public void Validate()
|
||||
{
|
||||
Dispatch.Validate();
|
||||
Api.Validate();
|
||||
Targeting.Validate();
|
||||
Webhook.Validate();
|
||||
}
|
||||
public DispatchOptions Dispatch { get; set; } = new();
|
||||
|
||||
public ApiOptions Api { get; set; } = new();
|
||||
|
||||
public TargetingOptions Targeting { get; set; } = new();
|
||||
|
||||
public WebhookOptions Webhook { get; set; } = new();
|
||||
|
||||
public void Validate()
|
||||
{
|
||||
Dispatch.Validate();
|
||||
Api.Validate();
|
||||
Targeting.Validate();
|
||||
Webhook.Validate();
|
||||
}
|
||||
|
||||
public sealed class DispatchOptions
|
||||
{
|
||||
@@ -433,11 +442,11 @@ public sealed class SchedulerWorkerOptions
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class TargetingOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// When disabled the worker skips policy delta targeting.
|
||||
/// </summary>
|
||||
public sealed class TargetingOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// When disabled the worker skips policy delta targeting.
|
||||
/// </summary>
|
||||
public bool Enabled { get; set; } = true;
|
||||
|
||||
/// <summary>
|
||||
@@ -457,59 +466,59 @@ public sealed class SchedulerWorkerOptions
|
||||
throw new InvalidOperationException("Policy targeting MaxSboms must be greater than zero.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class WebhookOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Controls whether webhook callbacks are emitted when simulations complete.
|
||||
/// </summary>
|
||||
public bool Enabled { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Absolute endpoint to invoke for webhook callbacks.
|
||||
/// </summary>
|
||||
public string? Endpoint { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional header to carry an API key.
|
||||
/// </summary>
|
||||
public string? ApiKeyHeader { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional API key value aligned with <see cref="ApiKeyHeader"/>.
|
||||
/// </summary>
|
||||
public string? ApiKey { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Request timeout in seconds.
|
||||
/// </summary>
|
||||
public int TimeoutSeconds { get; set; } = 10;
|
||||
|
||||
public void Validate()
|
||||
{
|
||||
if (!Enabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(Endpoint))
|
||||
{
|
||||
throw new InvalidOperationException("Policy webhook endpoint must be configured when enabled.");
|
||||
}
|
||||
|
||||
if (!Uri.TryCreate(Endpoint, UriKind.Absolute, out _))
|
||||
{
|
||||
throw new InvalidOperationException("Policy webhook endpoint must be an absolute URI.");
|
||||
}
|
||||
|
||||
if (TimeoutSeconds <= 0)
|
||||
{
|
||||
throw new InvalidOperationException("Policy webhook timeout must be greater than zero.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class WebhookOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Controls whether webhook callbacks are emitted when simulations complete.
|
||||
/// </summary>
|
||||
public bool Enabled { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Absolute endpoint to invoke for webhook callbacks.
|
||||
/// </summary>
|
||||
public string? Endpoint { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional header to carry an API key.
|
||||
/// </summary>
|
||||
public string? ApiKeyHeader { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional API key value aligned with <see cref="ApiKeyHeader"/>.
|
||||
/// </summary>
|
||||
public string? ApiKey { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Request timeout in seconds.
|
||||
/// </summary>
|
||||
public int TimeoutSeconds { get; set; } = 10;
|
||||
|
||||
public void Validate()
|
||||
{
|
||||
if (!Enabled)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(Endpoint))
|
||||
{
|
||||
throw new InvalidOperationException("Policy webhook endpoint must be configured when enabled.");
|
||||
}
|
||||
|
||||
if (!Uri.TryCreate(Endpoint, UriKind.Absolute, out _))
|
||||
{
|
||||
throw new InvalidOperationException("Policy webhook endpoint must be an absolute URI.");
|
||||
}
|
||||
|
||||
if (TimeoutSeconds <= 0)
|
||||
{
|
||||
throw new InvalidOperationException("Policy webhook timeout must be greater than zero.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class GraphOptions
|
||||
{
|
||||
@@ -700,4 +709,174 @@ public sealed class SchedulerWorkerOptions
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Options for Surface.FS pointer evaluation per SCHED-SURFACE-01.
/// </summary>
public sealed class SurfaceOptions
{
    /// <summary>
    /// When enabled, Surface.FS pointers are evaluated during planning to detect drift.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// When enabled, the worker operates in sealed mode rejecting external storage URIs.
    /// </summary>
    public bool SealedMode { get; set; } = false;

    /// <summary>
    /// When enabled, images with unchanged versions are skipped to avoid redundant scans.
    /// </summary>
    public bool SkipRedundantScans { get; set; } = true;

    /// <summary>
    /// Allowed dataset types for Surface.FS pointers. The default set compares names case-insensitively.
    /// </summary>
    public HashSet<string> AllowedDatasets { get; set; } = new(StringComparer.OrdinalIgnoreCase)
    {
        "sbom",
        "findings",
        "reachability",
        "policy",
        "attestation"
    };

    /// <summary>
    /// Time-to-live for cached pointer versions.
    /// </summary>
    public TimeSpan CacheTtl { get; set; } = TimeSpan.FromMinutes(30);

    /// <summary>
    /// Checks that at least one dataset is allowed and that the cache TTL is positive.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// <see cref="AllowedDatasets"/> is <c>null</c> or empty, or <see cref="CacheTtl"/> is zero or negative.
    /// </exception>
    public void Validate()
    {
        // The setter is public, so the set can have been replaced with null;
        // report that as a validation failure rather than a NullReferenceException.
        if (AllowedDatasets is null || AllowedDatasets.Count == 0)
        {
            throw new InvalidOperationException("Surface allowed datasets must contain at least one value.");
        }

        if (CacheTtl <= TimeSpan.Zero)
        {
            throw new InvalidOperationException("Surface cache TTL must be greater than zero.");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Settings for the exception lifecycle workers (SCHED-WORKER-25-101/25-102).
/// </summary>
public sealed class ExceptionOptions
{
    /// <summary>
    /// Whether the expiring notification worker generates and sends digests. Defaults to <c>true</c>.
    /// </summary>
    public bool ExpiringNotificationEnabled { get; set; } = true;

    /// <summary>
    /// Look-ahead window for expiring exceptions: exceptions expiring within it are included
    /// in digests. Defaults to seven days.
    /// </summary>
    public TimeSpan ExpiringNotificationWindow { get; set; } = TimeSpan.FromDays(7);

    /// <summary>
    /// Time between two expiring notification checks. Defaults to one hour.
    /// </summary>
    public TimeSpan ExpiringCheckInterval { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// Upper bound on retries when publishing exception events. Defaults to 3.
    /// </summary>
    public int MaxPublishRetries { get; set; } = 3;

    /// <summary>
    /// Base delay of the exponential backoff used when retrying event publishing. Defaults to one second.
    /// </summary>
    public TimeSpan PublishRetryDelay { get; set; } = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Checks the option values in declaration order and throws on the first invalid one.
    /// </summary>
    /// <exception cref="InvalidOperationException">A window or interval is not positive, or a retry setting is negative.</exception>
    public void Validate()
    {
        RequirePositive(
            ExpiringNotificationWindow,
            "Exception expiring notification window must be greater than zero.");
        RequirePositive(
            ExpiringCheckInterval,
            "Exception expiring check interval must be greater than zero.");

        if (MaxPublishRetries < 0)
        {
            throw new InvalidOperationException("Exception max publish retries cannot be negative.");
        }

        if (PublishRetryDelay < TimeSpan.Zero)
        {
            throw new InvalidOperationException("Exception publish retry delay cannot be negative.");
        }
    }

    // Throws InvalidOperationException(message) unless the duration is strictly positive.
    private static void RequirePositive(TimeSpan duration, string message)
    {
        if (duration <= TimeSpan.Zero)
        {
            throw new InvalidOperationException(message);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Settings for the reachability joiner worker (SCHED-WORKER-26-201).
/// </summary>
public sealed class ReachabilityOptions
{
    /// <summary>
    /// Whether the reachability joiner worker combines SBOM snapshots with signals. Defaults to <c>true</c>.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Upper bound on SBOM snapshots processed per batch. Defaults to 50.
    /// </summary>
    public int BatchSize { get; set; } = 50;

    /// <summary>
    /// Polling interval of the reachability joiner loop. Defaults to ten seconds.
    /// </summary>
    public TimeSpan PollInterval { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// Delay applied when no work is available. Defaults to thirty seconds.
    /// </summary>
    public TimeSpan IdleDelay { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Time-to-live of cached reachability facts. Defaults to twenty-four hours.
    /// </summary>
    public TimeSpan FactCacheTtl { get; set; } = TimeSpan.FromHours(24);

    /// <summary>
    /// Upper bound on concurrent signal processing tasks. Defaults to the processor count.
    /// </summary>
    public int MaxConcurrency { get; set; } = Environment.ProcessorCount;

    /// <summary>
    /// Checks the option values in declaration order and throws on the first invalid one.
    /// </summary>
    /// <exception cref="InvalidOperationException">An option value is outside its permitted range.</exception>
    public void Validate()
    {
        // The first matching condition wins, mirroring a sequence of guard clauses.
        string? failure =
            BatchSize <= 0 ? "Reachability batch size must be greater than zero." :
            PollInterval <= TimeSpan.Zero ? "Reachability poll interval must be greater than zero." :
            IdleDelay < TimeSpan.Zero ? "Reachability idle delay cannot be negative." :
            FactCacheTtl <= TimeSpan.Zero ? "Reachability fact cache TTL must be greater than zero." :
            MaxConcurrency <= 0 ? "Reachability max concurrency must be greater than zero." :
            null;

        if (failure is not null)
        {
            throw new InvalidOperationException(failure);
        }
    }
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Planning;
|
||||
|
||||
/// <summary>
/// Represents a Surface.FS pointer per SCHED-SURFACE-01 contract.
/// Format: surfacefs://&lt;tenant&gt;/&lt;dataset&gt;/&lt;version&gt;
/// </summary>
public sealed partial record SurfaceFsPointer
{
    /// <summary>
    /// Creates a pointer from its components.
    /// </summary>
    /// <param name="tenantId">Tenant identifier; must not be null, empty or whitespace.</param>
    /// <param name="dataset">Dataset type; must not be null, empty or whitespace.</param>
    /// <param name="version">Version identifier; must not be null, empty or whitespace.</param>
    /// <param name="storageUri">Optional storage URI.</param>
    /// <param name="createdAt">Optional creation timestamp.</param>
    /// <exception cref="ArgumentException">A required component is null, empty or whitespace.</exception>
    public SurfaceFsPointer(
        string tenantId,
        string dataset,
        string version,
        string? storageUri = null,
        DateTimeOffset? createdAt = null)
    {
        if (string.IsNullOrWhiteSpace(tenantId))
        {
            throw new ArgumentException("Tenant ID is required.", nameof(tenantId));
        }

        if (string.IsNullOrWhiteSpace(dataset))
        {
            throw new ArgumentException("Dataset is required.", nameof(dataset));
        }

        if (string.IsNullOrWhiteSpace(version))
        {
            throw new ArgumentException("Version is required.", nameof(version));
        }

        TenantId = tenantId;
        Dataset = dataset;
        Version = version;
        StorageUri = storageUri;
        CreatedAt = createdAt;
    }

    /// <summary>
    /// Tenant identifier.
    /// </summary>
    [JsonPropertyName("tenant_id")]
    public string TenantId { get; init; }

    /// <summary>
    /// Dataset type (e.g., "sbom", "findings", "reachability").
    /// </summary>
    [JsonPropertyName("dataset")]
    public string Dataset { get; init; }

    /// <summary>
    /// Version identifier (content hash or monotonic version).
    /// </summary>
    [JsonPropertyName("version")]
    public string Version { get; init; }

    /// <summary>
    /// Storage URI (unset/relative in sealed mode; content-addressed path recommended).
    /// </summary>
    [JsonPropertyName("storage_uri")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? StorageUri { get; init; }

    /// <summary>
    /// Creation timestamp (RFC3339 UTC).
    /// </summary>
    [JsonPropertyName("created_at")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public DateTimeOffset? CreatedAt { get; init; }

    /// <summary>
    /// Generates a canonical URI for this pointer.
    /// </summary>
    public string ToUri() => $"surfacefs://{TenantId}/{Dataset}/{Version}";

    /// <summary>
    /// Generates a cache key for this pointer.
    /// </summary>
    public string ToCacheKey() => $"surface_fs_pointer::{TenantId}::{Dataset}::{Version}";

    /// <summary>
    /// Parses a Surface.FS URI into a pointer.
    /// </summary>
    /// <param name="uri">Text of the form surfacefs://tenant/dataset/version; may be null.</param>
    /// <returns>
    /// The parsed pointer (storage URI and creation timestamp unset), or <c>null</c> when
    /// <paramref name="uri"/> is null, blank, or not a well-formed Surface.FS URI.
    /// </returns>
    public static SurfaceFsPointer? Parse(string? uri)
    {
        if (string.IsNullOrWhiteSpace(uri))
        {
            return null;
        }

        var match = SurfaceFsUriRegex().Match(uri);
        if (!match.Success)
        {
            return null;
        }

        var tenant = match.Groups["tenant"].Value;
        var dataset = match.Groups["dataset"].Value;
        var version = match.Groups["version"].Value;

        // A segment consisting only of whitespace satisfies the pattern but is rejected by the
        // constructor; treat it as a parse failure so Parse/TryParse never throw on bad input.
        if (string.IsNullOrWhiteSpace(tenant)
            || string.IsNullOrWhiteSpace(dataset)
            || string.IsNullOrWhiteSpace(version))
        {
            return null;
        }

        return new SurfaceFsPointer(
            tenantId: tenant,
            dataset: dataset,
            version: version);
    }

    /// <summary>
    /// Tries to parse a Surface.FS URI.
    /// </summary>
    /// <param name="uri">Text to parse; may be null.</param>
    /// <param name="pointer">The parsed pointer on success; otherwise <c>null</c>.</param>
    /// <returns><c>true</c> when <paramref name="uri"/> was parsed successfully.</returns>
    public static bool TryParse(
        string? uri,
        [System.Diagnostics.CodeAnalysis.NotNullWhen(true)] out SurfaceFsPointer? pointer)
    {
        pointer = Parse(uri);
        return pointer is not null;
    }

    // Scheme match is case-insensitive; tenant and dataset may not contain '/', version takes the remainder.
    [GeneratedRegex(@"^surfacefs://(?<tenant>[^/]+)/(?<dataset>[^/]+)/(?<version>.+)$", RegexOptions.IgnoreCase | RegexOptions.CultureInvariant)]
    private static partial Regex SurfaceFsUriRegex();
}
|
||||
|
||||
/// <summary>
/// Well-known dataset type names used in Surface.FS pointers.
/// </summary>
public static class SurfaceFsDatasets
{
    public const string Sbom = "sbom";
    public const string Findings = "findings";
    public const string Reachability = "reachability";
    public const string Policy = "policy";
    public const string Attestation = "attestation";

    /// <summary>
    /// Datasets allowed by default for scheduler operations; lookups ignore case (ordinal).
    /// </summary>
    public static readonly IReadOnlySet<string> DefaultAllowlist = new HashSet<string>(
        new[] { Sbom, Findings, Reachability, Policy, Attestation },
        StringComparer.OrdinalIgnoreCase);
}
|
||||
@@ -0,0 +1,356 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Planning;
|
||||
|
||||
/// <summary>
/// Evaluates Surface.FS pointers while planning delta scans.
/// Implements SCHED-SURFACE-01: drift-triggered assets are prioritized and redundant work is avoided.
/// </summary>
public interface ISurfaceFsPointerEvaluator
{
    /// <summary>
    /// Checks a Surface.FS pointer against the allowlist and the sealed mode rules.
    /// </summary>
    /// <param name="pointer">Pointer to validate.</param>
    /// <returns>The validation outcome.</returns>
    SurfaceFsValidationResult Validate(SurfaceFsPointer pointer);

    /// <summary>
    /// Determines whether the pointer represents drift from the cached version.
    /// </summary>
    /// <param name="pointer">Pointer to compare with the cache.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The drift outcome for <paramref name="pointer"/>.</returns>
    ValueTask<SurfaceFsDriftResult> CheckDriftAsync(SurfaceFsPointer pointer, CancellationToken cancellationToken = default);

    /// <summary>
    /// Evaluates the pointers of a batch of images and prioritizes drift-triggered assets.
    /// </summary>
    /// <param name="images">Images considered for planning.</param>
    /// <param name="manifestPointers">Pointers to look up for the images.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The prioritized and skipped images together with their counters.</returns>
    ValueTask<SurfaceFsEvaluationResult> EvaluateForPlanningAsync(
        IReadOnlyList<ImpactImage> images,
        IReadOnlyDictionary<string, SurfaceFsPointer> manifestPointers,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Outcome of validating a Surface.FS pointer.
/// </summary>
/// <param name="IsValid">Whether the pointer passed validation.</param>
/// <param name="Error">Reason for rejection; <c>null</c> when not supplied.</param>
public sealed record SurfaceFsValidationResult(bool IsValid, string? Error = null)
{
    /// <summary>
    /// Shared instance representing a successful validation.
    /// </summary>
    public static SurfaceFsValidationResult Valid { get; } = new SurfaceFsValidationResult(IsValid: true);

    /// <summary>
    /// Creates a failed validation result carrying <paramref name="error"/>.
    /// </summary>
    public static SurfaceFsValidationResult Invalid(string error)
    {
        return new SurfaceFsValidationResult(IsValid: false, Error: error);
    }
}
|
||||
|
||||
/// <summary>
/// Outcome of a drift check for a single pointer.
/// </summary>
/// <param name="Pointer">Pointer that was checked.</param>
/// <param name="HasDrift">Whether drift was detected.</param>
/// <param name="CachedVersion">Version found in the cache, when supplied.</param>
/// <param name="CachedAt">Timestamp associated with the cached entry, when supplied.</param>
public sealed record SurfaceFsDriftResult(
    SurfaceFsPointer Pointer,
    bool HasDrift,
    string? CachedVersion = null,
    DateTimeOffset? CachedAt = null)
{
    /// <summary>
    /// Priority boost for drift-triggered assets (higher = more priority):
    /// 10 when drift was detected, otherwise 0.
    /// </summary>
    public int PriorityBoost
    {
        get
        {
            if (HasDrift)
            {
                return 10;
            }

            return 0;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Outcome of evaluating a batch of images for planning.
/// </summary>
/// <param name="PrioritizedImages">Images to plan, in priority order.</param>
/// <param name="SkippedImages">Images excluded from planning.</param>
/// <param name="DriftTriggeredCount">Number of images for which drift was detected.</param>
/// <param name="RedundantCount">Number of images counted as redundant.</param>
public sealed record SurfaceFsEvaluationResult(
    IReadOnlyList<ImpactImage> PrioritizedImages,
    IReadOnlyList<ImpactImage> SkippedImages,
    int DriftTriggeredCount,
    int RedundantCount)
{
    /// <summary>
    /// <c>true</c> when at least one image was drift-triggered.
    /// </summary>
    public bool HasDrift
    {
        get { return DriftTriggeredCount > 0; }
    }
}
|
||||
|
||||
/// <summary>
/// Default <see cref="ISurfaceFsPointerEvaluator"/>: validates pointers against the configured
/// Surface options and compares them with the pointer cache to detect drift.
/// </summary>
public sealed class SurfaceFsPointerEvaluator : ISurfaceFsPointerEvaluator
{
    private readonly ISurfaceFsPointerCache _cache;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<SurfaceFsPointerEvaluator> _logger;

    /// <summary>
    /// Creates the evaluator.
    /// </summary>
    /// <param name="cache">Pointer cache consulted for drift checks.</param>
    /// <param name="options">Worker options; the Surface section is read.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when null.</param>
    /// <param name="logger">Logger for evaluation diagnostics.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public SurfaceFsPointerEvaluator(
        ISurfaceFsPointerCache cache,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        ILogger<SurfaceFsPointerEvaluator> logger)
    {
        ArgumentNullException.ThrowIfNull(cache);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _cache = cache;
        _options = options;
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger;
    }

    /// <summary>
    /// Rejects pointers whose dataset is not allowlisted and, in sealed mode, pointers whose
    /// storage URI is neither local nor content-addressed.
    /// </summary>
    /// <param name="pointer">Pointer to validate.</param>
    /// <returns><see cref="SurfaceFsValidationResult.Valid"/> or a result describing the violation.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pointer"/> is null.</exception>
    public SurfaceFsValidationResult Validate(SurfaceFsPointer pointer)
    {
        ArgumentNullException.ThrowIfNull(pointer);

        var surface = _options.Surface;

        var datasetAllowed = surface.AllowedDatasets.Contains(pointer.Dataset);
        if (!datasetAllowed)
        {
            return SurfaceFsValidationResult.Invalid(
                $"Dataset '{pointer.Dataset}' is not in the allowed list.");
        }

        // Sealed mode only constrains pointers that actually carry a storage URI.
        var violatesSealedMode = surface.SealedMode
            && !string.IsNullOrWhiteSpace(pointer.StorageUri)
            && !IsLocalOrContentAddressedUri(pointer.StorageUri);

        return violatesSealedMode
            ? SurfaceFsValidationResult.Invalid(
                $"External storage URI '{pointer.StorageUri}' not permitted in sealed mode.")
            : SurfaceFsValidationResult.Valid;
    }

    /// <summary>
    /// Compares the pointer version with the cached pointer for the same tenant and dataset.
    /// A missing cache entry counts as drift.
    /// </summary>
    /// <param name="pointer">Pointer to check.</param>
    /// <param name="cancellationToken">Token passed to the cache lookup.</param>
    /// <exception cref="ArgumentNullException"><paramref name="pointer"/> is null.</exception>
    public async ValueTask<SurfaceFsDriftResult> CheckDriftAsync(
        SurfaceFsPointer pointer,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(pointer);

        var known = await _cache
            .GetAsync(pointer.TenantId, pointer.Dataset, cancellationToken)
            .ConfigureAwait(false);

        if (known is not null)
        {
            // Versions are compared case-sensitively.
            var versionChanged = !string.Equals(known.Version, pointer.Version, StringComparison.Ordinal);

            return new SurfaceFsDriftResult(
                pointer,
                HasDrift: versionChanged,
                CachedVersion: known.Version,
                CachedAt: known.CreatedAt);
        }

        // Nothing cached yet: the pointer is new, which is reported as drift.
        return new SurfaceFsDriftResult(pointer, HasDrift: true);
    }

    /// <summary>
    /// Classifies each image by its manifest pointer and returns drift-triggered images first,
    /// followed by the remaining images, each group ordered by digest (case-insensitive).
    /// Images with an invalid pointer are skipped; images without drift are skipped as
    /// redundant when SkipRedundantScans is enabled.
    /// </summary>
    /// <param name="images">Images to evaluate.</param>
    /// <param name="manifestPointers">Pointers keyed by image digest.</param>
    /// <param name="cancellationToken">Token passed to the drift checks.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public async ValueTask<SurfaceFsEvaluationResult> EvaluateForPlanningAsync(
        IReadOnlyList<ImpactImage> images,
        IReadOnlyDictionary<string, SurfaceFsPointer> manifestPointers,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(images);
        ArgumentNullException.ThrowIfNull(manifestPointers);

        if (images.Count == 0)
        {
            return new SurfaceFsEvaluationResult(
                PrioritizedImages: [],
                SkippedImages: [],
                DriftTriggeredCount: 0,
                RedundantCount: 0);
        }

        var drifted = new List<(ImpactImage Image, int Priority)>();
        var unchanged = new List<ImpactImage>();
        var skipped = new List<ImpactImage>();
        var redundant = 0;

        foreach (var candidate in images)
        {
            // Images without a pointer are planned without any priority boost.
            if (!manifestPointers.TryGetValue(candidate.ImageDigest, out var pointer))
            {
                unchanged.Add(candidate);
                continue;
            }

            var verdict = Validate(pointer);
            if (!verdict.IsValid)
            {
                _logger.LogDebug(
                    "Skipping image {Digest} due to invalid pointer: {Error}",
                    candidate.ImageDigest,
                    verdict.Error);
                skipped.Add(candidate);
                continue;
            }

            var drift = await CheckDriftAsync(pointer, cancellationToken).ConfigureAwait(false);
            if (drift.HasDrift)
            {
                drifted.Add((candidate, drift.PriorityBoost));

                _logger.LogDebug(
                    "Image {Digest} has drift: cached={CachedVersion}, new={NewVersion}",
                    candidate.ImageDigest,
                    drift.CachedVersion ?? "(none)",
                    pointer.Version);
                continue;
            }

            // Same version as cached: either plan it normally or drop it as redundant work.
            if (!_options.Surface.SkipRedundantScans)
            {
                unchanged.Add(candidate);
                continue;
            }

            skipped.Add(candidate);
            redundant++;

            _logger.LogDebug(
                "Skipping redundant scan for image {Digest} (version {Version} unchanged)",
                candidate.ImageDigest,
                pointer.Version);
        }

        var driftFirst = drifted
            .OrderByDescending(static entry => entry.Priority)
            .ThenBy(static entry => entry.Image.ImageDigest, StringComparer.OrdinalIgnoreCase)
            .Select(static entry => entry.Image);
        var remainder = unchanged
            .OrderBy(static image => image.ImageDigest, StringComparer.OrdinalIgnoreCase);
        var prioritized = driftFirst.Concat(remainder).ToList();

        _logger.LogInformation(
            "Surface.FS evaluation: {Total} images, {DriftCount} drift-triggered, {RedundantCount} redundant, {SkippedCount} skipped",
            images.Count,
            drifted.Count,
            redundant,
            skipped.Count);

        return new SurfaceFsEvaluationResult(
            PrioritizedImages: prioritized,
            SkippedImages: skipped,
            DriftTriggeredCount: drifted.Count,
            RedundantCount: redundant);
    }

    // True for blank input, for text without "://" (treated as a relative path), for file:// URIs,
    // and for text starting with sha256:, sha512: or content:.
    private static bool IsLocalOrContentAddressedUri(string uri)
    {
        return string.IsNullOrWhiteSpace(uri)
            || !uri.Contains("://", StringComparison.Ordinal)
            || uri.StartsWith("file://", StringComparison.OrdinalIgnoreCase)
            || uri.StartsWith("sha256:", StringComparison.OrdinalIgnoreCase)
            || uri.StartsWith("sha512:", StringComparison.OrdinalIgnoreCase)
            || uri.StartsWith("content:", StringComparison.OrdinalIgnoreCase);
    }
}
|
||||
|
||||
/// <summary>
/// Cache of Surface.FS pointers.
/// </summary>
public interface ISurfaceFsPointerCache
{
    /// <summary>
    /// Looks up the cached pointer for a tenant and dataset.
    /// </summary>
    /// <param name="tenantId">Tenant identifier.</param>
    /// <param name="dataset">Dataset type.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The cached pointer; the return type allows <c>null</c>.</returns>
    ValueTask<SurfaceFsPointer?> GetAsync(string tenantId, string dataset, CancellationToken cancellationToken = default);

    /// <summary>
    /// Stores or updates a cached pointer.
    /// </summary>
    /// <param name="pointer">Pointer to cache.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask SetAsync(SurfaceFsPointer pointer, CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes a cached pointer.
    /// </summary>
    /// <param name="tenantId">Tenant identifier.</param>
    /// <param name="dataset">Dataset type.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask RemoveAsync(string tenantId, string dataset, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// <see cref="ISurfaceFsPointerCache"/> backed by a dictionary guarded by a lock.
/// Entries are keyed by tenant and dataset, compared case-insensitively (ordinal).
/// </summary>
public sealed class InMemorySurfaceFsPointerCache : ISurfaceFsPointerCache
{
    private readonly object _gate = new();
    private readonly Dictionary<string, SurfaceFsPointer> _entries = new(StringComparer.OrdinalIgnoreCase);

    /// <summary>
    /// Returns the pointer stored for the tenant and dataset, or <c>null</c> when there is none.
    /// Completes synchronously; the token is not observed.
    /// </summary>
    public ValueTask<SurfaceFsPointer?> GetAsync(
        string tenantId,
        string dataset,
        CancellationToken cancellationToken = default)
    {
        var key = BuildKey(tenantId, dataset);

        SurfaceFsPointer? found;
        lock (_gate)
        {
            _entries.TryGetValue(key, out found);
        }

        return ValueTask.FromResult(found);
    }

    /// <summary>
    /// Stores the pointer, replacing any entry for the same tenant and dataset.
    /// Completes synchronously; the token is not observed.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="pointer"/> is null.</exception>
    public ValueTask SetAsync(
        SurfaceFsPointer pointer,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(pointer);

        var key = BuildKey(pointer.TenantId, pointer.Dataset);

        lock (_gate)
        {
            _entries[key] = pointer;
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Removes the entry for the tenant and dataset, if present.
    /// Completes synchronously; the token is not observed.
    /// </summary>
    public ValueTask RemoveAsync(
        string tenantId,
        string dataset,
        CancellationToken cancellationToken = default)
    {
        var key = BuildKey(tenantId, dataset);

        lock (_gate)
        {
            _entries.Remove(key);
        }

        return ValueTask.CompletedTask;
    }

    // Composite key: tenant and dataset joined with "::".
    private static string BuildKey(string tenantId, string dataset)
    {
        return string.Concat(tenantId, "::", dataset);
    }
}
|
||||
@@ -0,0 +1,134 @@
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
/// <summary>
/// Policy activation event defined by the SCHED-WORKER-23-101 contract.
/// Event type: scheduler.policy.activation.requested
/// </summary>
public sealed record PolicyActivationEvent
{
    /// <summary>
    /// Event type constant.
    /// </summary>
    public const string EventType = "scheduler.policy.activation.requested";

    /// <summary>
    /// Creates an activation event.
    /// </summary>
    /// <param name="jobId">Job identifier; must not be null, empty or whitespace.</param>
    /// <param name="policyRunId">Policy run identifier; must not be null, empty or whitespace.</param>
    /// <param name="tenantId">Tenant identifier; must not be null, empty or whitespace.</param>
    /// <param name="priority">Processing priority.</param>
    /// <param name="requestedAtUtc">Timestamp of the activation request.</param>
    /// <param name="throttleSource">Source of throttle configuration.</param>
    /// <exception cref="ArgumentException">A required identifier is null, empty or whitespace.</exception>
    public PolicyActivationEvent(
        string jobId,
        string policyRunId,
        string tenantId,
        int priority,
        DateTimeOffset requestedAtUtc,
        PolicyThrottleSource throttleSource)
    {
        JobId = RequireText(jobId, "Job ID is required.", nameof(jobId));
        PolicyRunId = RequireText(policyRunId, "Policy run ID is required.", nameof(policyRunId));
        TenantId = RequireText(tenantId, "Tenant ID is required.", nameof(tenantId));
        Priority = priority;
        RequestedAtUtc = requestedAtUtc;
        ThrottleSource = throttleSource;
    }

    /// <summary>
    /// Unique job identifier for idempotency.
    /// </summary>
    [JsonPropertyName("job_id")]
    public string JobId { get; init; }

    /// <summary>
    /// Identifier of the associated policy run.
    /// </summary>
    [JsonPropertyName("policy_run_id")]
    public string PolicyRunId { get; init; }

    /// <summary>
    /// Tenant this activation is scoped to.
    /// </summary>
    [JsonPropertyName("tenant_id")]
    public string TenantId { get; init; }

    /// <summary>
    /// Processing priority (higher = more urgent).
    /// </summary>
    [JsonPropertyName("priority")]
    public int Priority { get; init; }

    /// <summary>
    /// UTC timestamp at which the activation was requested.
    /// </summary>
    [JsonPropertyName("requested_at_utc")]
    public DateTimeOffset RequestedAtUtc { get; init; }

    /// <summary>
    /// Source of throttle configuration.
    /// </summary>
    [JsonPropertyName("throttle_source")]
    [JsonConverter(typeof(JsonStringEnumConverter))]
    public PolicyThrottleSource ThrottleSource { get; init; }

    /// <summary>
    /// Optional bundle pointers for policy/export data; omitted from JSON when null.
    /// </summary>
    [JsonPropertyName("bundle_pointers")]
    [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public PolicyBundlePointers? BundlePointers { get; init; }

    // Returns the value unchanged, or throws ArgumentException when it is null, empty or whitespace.
    private static string RequireText(string value, string message, string parameterName)
    {
        if (string.IsNullOrWhiteSpace(value))
        {
            throw new ArgumentException(message, parameterName);
        }

        return value;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Throttle source configuration for policy activation.
|
||||
/// </summary>
/// <remarks>
/// NOTE(review): each member carries [JsonPropertyName] with a kebab-case name. The stock
/// JsonStringEnumConverter does not appear to consult that attribute on enum members, so these
/// values would be written as "SchedulerDefault", "PolicySignal" and "ManualOverride" unless a
/// naming policy or custom converter is configured elsewhere. .NET 9 adds
/// [JsonStringEnumMemberName] for this purpose. Confirm the wire format expected by the
/// SCHED-WORKER-23-101 contract.
/// </remarks>
|
||||
public enum PolicyThrottleSource
|
||||
{
|
||||
/// <summary>
|
||||
/// Use default scheduler throttling rules.
|
||||
/// </summary>
|
||||
[JsonPropertyName("scheduler-default")]
|
||||
SchedulerDefault,
|
||||
|
||||
/// <summary>
|
||||
/// Use policy-specific throttle signals.
|
||||
/// </summary>
|
||||
[JsonPropertyName("policy-signal")]
|
||||
PolicySignal,
|
||||
|
||||
/// <summary>
|
||||
/// Manual override of throttle configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("manual-override")]
|
||||
ManualOverride
|
||||
}
|
||||
|
||||
/// <summary>
/// Optional bundle references attached to a policy activation.
/// Properties that are null are omitted when serialized to JSON.
/// </summary>
public sealed record PolicyBundlePointers
{
    /// <summary>
    /// Pointer to the policy definition bundle.
    /// </summary>
    [JsonPropertyName("policy_bundle"), JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? PolicyBundle { get; init; }

    /// <summary>
    /// Pointer to the export data bundle.
    /// </summary>
    [JsonPropertyName("export_bundle"), JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
    public string? ExportBundle { get; init; }
}
|
||||
@@ -0,0 +1,501 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Threading.RateLimiting;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
/// <summary>
/// Policy re-evaluation worker per SCHED-WORKER-23-101.
/// Dequeues policy activation events in batches, runs the re-evaluation service for each
/// event and reports progress and metrics.
/// </summary>
public sealed class PolicyReEvaluationWorker : BackgroundService
{
    private readonly IPolicyActivationQueue _activationQueue;
    private readonly IPolicyReEvaluationService _reEvaluationService;
    private readonly IPolicyProgressReporter _progressReporter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<PolicyReEvaluationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="activationQueue">Queue the activation events are dequeued from.</param>
    /// <param name="reEvaluationService">Service executing the re-evaluation of one event.</param>
    /// <param name="progressReporter">Receiver of started/completed/failed notifications.</param>
    /// <param name="options">Worker options; the Policy section is read.</param>
    /// <param name="timeProvider">Clock used to measure durations; <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Metrics sink.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public PolicyReEvaluationWorker(
        IPolicyActivationQueue activationQueue,
        IPolicyReEvaluationService reEvaluationService,
        IPolicyProgressReporter progressReporter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<PolicyReEvaluationWorker> logger)
    {
        _activationQueue = activationQueue ?? throw new ArgumentNullException(nameof(activationQueue));
        _reEvaluationService = reEvaluationService ?? throw new ArgumentNullException(nameof(reEvaluationService));
        _progressReporter = progressReporter ?? throw new ArgumentNullException(nameof(progressReporter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the dequeue/process loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when the policy worker is disabled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Policy re-evaluation worker is disabled.");
            return;
        }

        _logger.LogInformation("Policy re-evaluation worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var events = await _activationQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (events.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var activationEvent in events)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessActivationEventAsync(activationEvent, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                _logger.LogInformation("Policy re-evaluation worker stopping due to cancellation.");
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in policy re-evaluation worker loop.");

                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the back-off: leave the loop normally instead of
                    // letting the cancellation escape ExecuteAsync.
                    _logger.LogInformation("Policy re-evaluation worker stopping due to cancellation.");
                    break;
                }
            }
        }

        _logger.LogInformation("Policy re-evaluation worker stopped.");
    }

    /// <summary>
    /// Processes one activation event: reports start, executes the re-evaluation, reports
    /// completion and records the duration metric. Failures other than cancellation are logged,
    /// reported as failed and recorded with status "failed"; they do not propagate.
    /// </summary>
    private async Task ProcessActivationEventAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing policy activation event: JobId={JobId}, PolicyRunId={PolicyRunId}, Tenant={TenantId}, Priority={Priority}",
            activationEvent.JobId,
            activationEvent.PolicyRunId,
            activationEvent.TenantId,
            activationEvent.Priority);

        try
        {
            // Report progress: started
            await _progressReporter.ReportStartedAsync(
                activationEvent.TenantId,
                activationEvent.PolicyRunId,
                activationEvent.JobId,
                cancellationToken).ConfigureAwait(false);

            // Execute re-evaluation
            var result = await _reEvaluationService.ExecuteAsync(
                activationEvent,
                cancellationToken).ConfigureAwait(false);

            // Report progress: completed
            await _progressReporter.ReportCompletedAsync(
                activationEvent.TenantId,
                activationEvent.PolicyRunId,
                activationEvent.JobId,
                result,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;
            _metrics.RecordPolicyReEvaluation(
                activationEvent.TenantId,
                result.Status.ToString().ToLowerInvariant(),
                duration);

            _logger.LogInformation(
                "Policy activation completed: JobId={JobId}, Status={Status}, AssetsProcessed={AssetsProcessed}, Duration={Duration}ms",
                activationEvent.JobId,
                result.Status,
                result.AssetsProcessed,
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Policy activation failed: JobId={JobId}, PolicyRunId={PolicyRunId}",
                activationEvent.JobId,
                activationEvent.PolicyRunId);

            try
            {
                await _progressReporter.ReportFailedAsync(
                    activationEvent.TenantId,
                    activationEvent.PolicyRunId,
                    activationEvent.JobId,
                    ex.Message,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception reportEx) when (reportEx is not OperationCanceledException)
            {
                // A failing reporter must not abort the remaining events of the already
                // dequeued batch, nor suppress the failure metric recorded below.
                _logger.LogError(
                    reportEx,
                    "Failed to report policy activation failure: JobId={JobId}, PolicyRunId={PolicyRunId}",
                    activationEvent.JobId,
                    activationEvent.PolicyRunId);
            }

            var duration = _timeProvider.GetUtcNow() - startedAt;
            _metrics.RecordPolicyReEvaluation(
                activationEvent.TenantId,
                "failed",
                duration);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Queue of policy activation events.
/// </summary>
public interface IPolicyActivationQueue
{
    /// <summary>
    /// Dequeues activation events for processing.
    /// </summary>
    /// <param name="maxCount">Maximum count requested by the caller.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The dequeued events.</returns>
    ValueTask<IReadOnlyList<PolicyActivationEvent>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default);

    /// <summary>
    /// Enqueues an activation event for processing.
    /// </summary>
    /// <param name="activationEvent">Event to enqueue.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask EnqueueAsync(PolicyActivationEvent activationEvent, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Runs the policy re-evaluation triggered by an activation event.
/// </summary>
public interface IPolicyReEvaluationService
{
    /// <summary>
    /// Performs the re-evaluation for <paramref name="activationEvent"/>.
    /// </summary>
    /// <param name="activationEvent">Activation event describing the policy run to re-evaluate.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A summary of what the re-evaluation did.</returns>
    ValueTask<PolicyReEvaluationResult> ExecuteAsync(PolicyActivationEvent activationEvent, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Summary of one policy re-evaluation execution.
/// </summary>
/// <param name="Status">Overall outcome of the execution.</param>
/// <param name="AssetsProcessed">Number of assets that were processed.</param>
/// <param name="ShardsCompleted">Number of shards that finished processing.</param>
/// <param name="ShardsTotal">Number of shards the run was split into.</param>
/// <param name="FailedAssets">Identifiers of assets that failed.</param>
/// <param name="CompletedAt">Timestamp at which the execution finished.</param>
public sealed record PolicyReEvaluationResult(
    PolicyReEvaluationStatus Status,
    int AssetsProcessed,
    int ShardsCompleted,
    int ShardsTotal,
    ImmutableArray<string> FailedAssets,
    DateTimeOffset CompletedAt)
{
    /// <summary>
    /// Builds the result for a run that had nothing to process: all counters are zero
    /// and the failed-asset list is empty.
    /// </summary>
    /// <param name="completedAt">Timestamp to record on the result.</param>
    public static PolicyReEvaluationResult NoWork(DateTimeOffset completedAt)
    {
        return new PolicyReEvaluationResult(
            Status: PolicyReEvaluationStatus.NoWork,
            AssetsProcessed: 0,
            ShardsCompleted: 0,
            ShardsTotal: 0,
            FailedAssets: [],
            CompletedAt: completedAt);
    }

    /// <summary>
    /// Builds a <see cref="PolicyReEvaluationStatus.Completed"/> result with an empty failed-asset list.
    /// </summary>
    public static PolicyReEvaluationResult Success(
        int assetsProcessed,
        int shardsCompleted,
        int shardsTotal,
        DateTimeOffset completedAt)
    {
        return new PolicyReEvaluationResult(
            Status: PolicyReEvaluationStatus.Completed,
            AssetsProcessed: assetsProcessed,
            ShardsCompleted: shardsCompleted,
            ShardsTotal: shardsTotal,
            FailedAssets: [],
            CompletedAt: completedAt);
    }

    /// <summary>
    /// Builds a <see cref="PolicyReEvaluationStatus.PartiallyCompleted"/> result that carries
    /// the supplied failed-asset identifiers unchanged.
    /// </summary>
    public static PolicyReEvaluationResult PartialSuccess(
        int assetsProcessed,
        int shardsCompleted,
        int shardsTotal,
        ImmutableArray<string> failedAssets,
        DateTimeOffset completedAt)
    {
        return new PolicyReEvaluationResult(
            Status: PolicyReEvaluationStatus.PartiallyCompleted,
            AssetsProcessed: assetsProcessed,
            ShardsCompleted: shardsCompleted,
            ShardsTotal: shardsTotal,
            FailedAssets: failedAssets,
            CompletedAt: completedAt);
    }
}
|
||||
|
||||
/// <summary>
/// Possible outcomes of a policy re-evaluation.
/// </summary>
public enum PolicyReEvaluationStatus
{
    /// <summary>There was nothing to re-evaluate.</summary>
    NoWork = 0,

    /// <summary>The re-evaluation finished without failed assets.</summary>
    Completed = 1,

    /// <summary>The re-evaluation finished but some assets failed.</summary>
    PartiallyCompleted = 2,

    /// <summary>The re-evaluation failed.</summary>
    Failed = 3
}
|
||||
|
||||
/// <summary>
/// Sink for lifecycle and progress notifications of a policy re-evaluation job.
/// Every method identifies the job by tenant, policy run, and job id.
/// </summary>
public interface IPolicyProgressReporter
{
    /// <summary>Signals that the job has started.</summary>
    ValueTask ReportStartedAsync(string tenantId, string policyRunId, string jobId, CancellationToken cancellationToken = default);

    /// <summary>Publishes intermediate progress counters for the job.</summary>
    /// <param name="tenantId">Tenant that owns the job.</param>
    /// <param name="policyRunId">Policy run the job belongs to.</param>
    /// <param name="jobId">Identifier of the job.</param>
    /// <param name="shardsCompleted">Shards finished so far.</param>
    /// <param name="shardsTotal">Total number of shards in the run.</param>
    /// <param name="assetsProcessed">Assets processed so far.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask ReportProgressAsync(string tenantId, string policyRunId, string jobId, int shardsCompleted, int shardsTotal, int assetsProcessed, CancellationToken cancellationToken = default);

    /// <summary>Signals that the job finished and hands over its final result.</summary>
    ValueTask ReportCompletedAsync(string tenantId, string policyRunId, string jobId, PolicyReEvaluationResult result, CancellationToken cancellationToken = default);

    /// <summary>Signals that the job failed, with a textual error description.</summary>
    ValueTask ReportFailedAsync(string tenantId, string policyRunId, string jobId, string error, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Default <see cref="IPolicyReEvaluationService"/> implementation. It asks the sharder for the
/// shards of a policy run, processes them sequentially under a rate limiter, reports progress
/// after each processed shard, and returns a summary of the run.
/// </summary>
public sealed class PolicyReEvaluationService : IPolicyReEvaluationService
{
    /// <summary>Back-off used when a rejected lease carries no usable retry-after hint.</summary>
    private static readonly TimeSpan DefaultRateLimitBackoff = TimeSpan.FromSeconds(1);

    private readonly IPolicyAssetSharder _sharder;
    private readonly IPolicyShardProcessor _shardProcessor;
    private readonly IPolicyProgressReporter _progressReporter;
    private readonly RateLimiter _rateLimiter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<PolicyReEvaluationService> _logger;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">Any other dependency is <c>null</c>.</exception>
    public PolicyReEvaluationService(
        IPolicyAssetSharder sharder,
        IPolicyShardProcessor shardProcessor,
        IPolicyProgressReporter progressReporter,
        RateLimiter rateLimiter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        ILogger<PolicyReEvaluationService> logger)
    {
        _sharder = sharder ?? throw new ArgumentNullException(nameof(sharder));
        _shardProcessor = shardProcessor ?? throw new ArgumentNullException(nameof(shardProcessor));
        _progressReporter = progressReporter ?? throw new ArgumentNullException(nameof(progressReporter));
        _rateLimiter = rateLimiter ?? throw new ArgumentNullException(nameof(rateLimiter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Re-evaluates every shard of the policy run referenced by <paramref name="activationEvent"/>.
    /// </summary>
    /// <param name="activationEvent">Activation event that identifies tenant, policy run, and job.</param>
    /// <param name="cancellationToken">Token used to cancel the run; cancellation propagates to the caller.</param>
    /// <returns>
    /// <c>NoWork</c> when the sharder yields no shards, <c>Completed</c> when no asset failed,
    /// otherwise <c>PartiallyCompleted</c> with the failed asset ids.
    /// </returns>
    public async ValueTask<PolicyReEvaluationResult> ExecuteAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken = default)
    {
        // Shard assets for processing
        var shards = await _sharder.ShardAssetsAsync(
            activationEvent.TenantId,
            activationEvent.PolicyRunId,
            cancellationToken).ConfigureAwait(false);

        if (shards.Count == 0)
        {
            _logger.LogDebug(
                "No assets to re-evaluate for policy run {PolicyRunId}",
                activationEvent.PolicyRunId);

            return PolicyReEvaluationResult.NoWork(_timeProvider.GetUtcNow());
        }

        _logger.LogInformation(
            "Processing {ShardCount} shards for policy run {PolicyRunId}",
            shards.Count,
            activationEvent.PolicyRunId);

        var shardsCompleted = 0;
        var assetsProcessed = 0;
        var failedAssets = new List<string>();

        foreach (var shard in shards)
        {
            // Honor rate limits: do not touch the shard until a lease has actually been granted.
            using var lease = await AcquireLeaseAsync(activationEvent, cancellationToken).ConfigureAwait(false);

            try
            {
                var result = await _shardProcessor.ProcessShardAsync(
                    shard,
                    activationEvent,
                    cancellationToken).ConfigureAwait(false);

                // Record failures first so the counters are only bumped once the shard result
                // has been fully absorbed.
                failedAssets.AddRange(result.FailedAssetIds);
                assetsProcessed += result.AssetsProcessed;
                shardsCompleted++;
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(
                    ex,
                    "Failed to process shard {ShardId} for policy run {PolicyRunId}",
                    shard.ShardId,
                    activationEvent.PolicyRunId);

                failedAssets.AddRange(shard.AssetIds);
                continue;
            }

            // Progress reporting is kept outside the shard try-block: a reporter failure must not
            // turn an already-processed shard into a failed one.
            await ReportProgressSafelyAsync(
                activationEvent,
                shardsCompleted,
                shards.Count,
                assetsProcessed,
                cancellationToken).ConfigureAwait(false);
        }

        var completedAt = _timeProvider.GetUtcNow();

        if (failedAssets.Count == 0)
        {
            return PolicyReEvaluationResult.Success(
                assetsProcessed,
                shardsCompleted,
                shards.Count,
                completedAt);
        }

        return PolicyReEvaluationResult.PartialSuccess(
            assetsProcessed,
            shardsCompleted,
            shards.Count,
            [.. failedAssets],
            completedAt);
    }

    /// <summary>
    /// Waits until the rate limiter grants a lease. A rejected lease is disposed and the request is
    /// retried after the limiter's retry-after hint, or after <see cref="DefaultRateLimitBackoff"/>
    /// when no hint is available. The loop ends only on success or cancellation.
    /// </summary>
    private async ValueTask<RateLimitLease> AcquireLeaseAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken)
    {
        while (true)
        {
            var lease = await _rateLimiter.AcquireAsync(1, cancellationToken).ConfigureAwait(false);
            if (lease.IsAcquired)
            {
                return lease;
            }

            var backoff = lease.TryGetMetadata(MetadataName.RetryAfter, out var retryAfter) && retryAfter > TimeSpan.Zero
                ? retryAfter
                : DefaultRateLimitBackoff;
            lease.Dispose();

            _logger.LogWarning(
                "Rate limit exceeded for policy run {PolicyRunId}, waiting...",
                activationEvent.PolicyRunId);

            await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Reports progress on a best-effort basis: reporter failures are logged and swallowed,
    /// cancellation still propagates.
    /// </summary>
    private async ValueTask ReportProgressSafelyAsync(
        PolicyActivationEvent activationEvent,
        int shardsCompleted,
        int shardsTotal,
        int assetsProcessed,
        CancellationToken cancellationToken)
    {
        try
        {
            await _progressReporter.ReportProgressAsync(
                activationEvent.TenantId,
                activationEvent.PolicyRunId,
                activationEvent.JobId,
                shardsCompleted,
                shardsTotal,
                assetsProcessed,
                cancellationToken).ConfigureAwait(false);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogWarning(
                ex,
                "Failed to report progress for policy run {PolicyRunId}",
                activationEvent.PolicyRunId);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Splits the assets of a policy run into shards for re-evaluation.
/// </summary>
public interface IPolicyAssetSharder
{
    /// <summary>
    /// Produces the shards for the given policy run.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the policy run.</param>
    /// <param name="policyRunId">Identifier of the policy run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The shards to process.</returns>
    ValueTask<IReadOnlyList<PolicyAssetShard>> ShardAssetsAsync(string tenantId, string policyRunId, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// One shard of assets belonging to a policy re-evaluation run.
/// </summary>
/// <param name="ShardId">Identifier of the shard.</param>
/// <param name="TenantId">Tenant that owns the shard.</param>
/// <param name="PolicyRunId">Policy run the shard belongs to.</param>
/// <param name="AssetIds">Identifiers of the assets contained in the shard.</param>
/// <param name="ShardIndex">Index of this shard within the run.</param>
/// <param name="TotalShards">Total number of shards in the run.</param>
public sealed record PolicyAssetShard(
    string ShardId,
    string TenantId,
    string PolicyRunId,
    ImmutableArray<string> AssetIds,
    int ShardIndex,
    int TotalShards);
|
||||
|
||||
/// <summary>
/// Processes individual shards of a policy re-evaluation run.
/// </summary>
public interface IPolicyShardProcessor
{
    /// <summary>
    /// Processes one shard of assets.
    /// </summary>
    /// <param name="shard">Shard to process.</param>
    /// <param name="activationEvent">Activation event that triggered the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The per-shard outcome.</returns>
    ValueTask<PolicyShardResult> ProcessShardAsync(PolicyAssetShard shard, PolicyActivationEvent activationEvent, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Outcome of processing a single policy shard.
/// </summary>
/// <param name="ShardId">Identifier of the processed shard.</param>
/// <param name="AssetsProcessed">Number of assets processed in the shard.</param>
/// <param name="FailedAssetIds">Identifiers of the assets that failed.</param>
public sealed record PolicyShardResult(
    string ShardId,
    int AssetsProcessed,
    ImmutableArray<string> FailedAssetIds);
|
||||
|
||||
/// <summary>
/// <see cref="IPolicyActivationQueue"/> backed by an in-process FIFO queue guarded by a lock.
/// </summary>
public sealed class InMemoryPolicyActivationQueue : IPolicyActivationQueue
{
    private readonly object _lock = new();
    private readonly Queue<PolicyActivationEvent> _queue = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> events from the front of the queue.
    /// Completes synchronously; the cancellation token is not observed.
    /// </summary>
    /// <returns>The removed events in queue order; empty when the queue is empty or <paramref name="maxCount"/> is not positive.</returns>
    public ValueTask<IReadOnlyList<PolicyActivationEvent>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var batch = new List<PolicyActivationEvent>();

        lock (_lock)
        {
            for (var taken = 0; taken < maxCount; taken++)
            {
                if (!_queue.TryDequeue(out var next))
                {
                    break;
                }

                batch.Add(next);
            }
        }

        IReadOnlyList<PolicyActivationEvent> dequeued = batch;
        return ValueTask.FromResult(dequeued);
    }

    /// <summary>
    /// Appends <paramref name="activationEvent"/> to the back of the queue.
    /// Completes synchronously; the cancellation token is not observed.
    /// </summary>
    public ValueTask EnqueueAsync(
        PolicyActivationEvent activationEvent,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _queue.Enqueue(activationEvent);
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// No-op <see cref="IPolicyProgressReporter"/> intended for tests: every report completes
/// immediately and is discarded.
/// </summary>
public sealed class NullPolicyProgressReporter : IPolicyProgressReporter
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyProgressReporter Instance { get; } = new NullPolicyProgressReporter();

    /// <inheritdoc />
    public ValueTask ReportStartedAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <inheritdoc />
    public ValueTask ReportProgressAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        int shardsCompleted,
        int shardsTotal,
        int assetsProcessed,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <inheritdoc />
    public ValueTask ReportCompletedAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        PolicyReEvaluationResult result,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <inheritdoc />
    public ValueTask ReportFailedAsync(
        string tenantId,
        string policyRunId,
        string jobId,
        string error,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,198 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
/// <summary>
/// Reconciliation worker per SCHED-WORKER-23-102.
/// Periodically looks for policy runs that exceeded the SLA, emits one backlog alert per tenant,
/// and persists the SLA-breached status on each overdue run.
/// </summary>
public sealed class PolicyReconciliationWorker : BackgroundService
{
    /// <summary>Pause between two successful reconciliation passes.</summary>
    private static readonly TimeSpan ReconcileInterval = TimeSpan.FromMinutes(1);

    /// <summary>Pause after a failed reconciliation pass.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    /// <summary>Age after which a policy run counts as overdue (30-minute SLA).</summary>
    private static readonly TimeSpan SlaWindow = TimeSpan.FromMinutes(30);

    private readonly IPolicyRunRepository _policyRunRepository;
    private readonly IPolicyBacklogAlertService _alertService;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<PolicyReconciliationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">Any other dependency is <c>null</c>.</exception>
    public PolicyReconciliationWorker(
        IPolicyRunRepository policyRunRepository,
        IPolicyBacklogAlertService alertService,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<PolicyReconciliationWorker> logger)
    {
        _policyRunRepository = policyRunRepository ?? throw new ArgumentNullException(nameof(policyRunRepository));
        _alertService = alertService ?? throw new ArgumentNullException(nameof(alertService));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the reconciliation loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when the policy feature is disabled in the options.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Policy reconciliation worker is disabled.");
            return;
        }

        _logger.LogInformation("Policy reconciliation worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ReconcileAsync(stoppingToken).ConfigureAwait(false);
                await Task.Delay(ReconcileInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in policy reconciliation worker loop.");

                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the back-off: leave the loop normally so the
                    // cancellation does not escape and the stop message below is still logged.
                    break;
                }
            }
        }

        _logger.LogInformation("Policy reconciliation worker stopped.");
    }

    /// <summary>
    /// Performs one reconciliation pass: fetches the runs older than the SLA threshold, then for
    /// each tenant emits a backlog alert and marks that tenant's overdue runs as SLA breached.
    /// </summary>
    private async Task ReconcileAsync(CancellationToken cancellationToken)
    {
        var now = _timeProvider.GetUtcNow();
        var slaThreshold = now - SlaWindow;

        // Find policy runs that are overdue
        var overdueRuns = await _policyRunRepository
            .GetOverdueRunsAsync(slaThreshold, cancellationToken)
            .ConfigureAwait(false);

        if (overdueRuns.Count == 0)
        {
            _logger.LogDebug("No overdue policy runs found.");
            return;
        }

        _logger.LogWarning(
            "Found {Count} overdue policy runs exceeding SLA threshold.",
            overdueRuns.Count);

        // Group by tenant for alert aggregation
        foreach (var tenantGroup in overdueRuns.GroupBy(static r => r.TenantId))
        {
            var tenantId = tenantGroup.Key;
            var tenantOverdue = tenantGroup.ToList();

            // Emit backlog alert
            await _alertService.EmitBacklogAlertAsync(
                tenantId,
                tenantOverdue.Count,
                slaThreshold,
                cancellationToken).ConfigureAwait(false);

            // Update policy run status
            foreach (var run in tenantOverdue)
            {
                var updated = run with
                {
                    Status = PolicyRunStatus.SlaBreached,
                    SlaBreachedAt = now
                };

                await _policyRunRepository
                    .UpdateAsync(updated, cancellationToken)
                    .ConfigureAwait(false);

                _logger.LogWarning(
                    "Policy run {RunId} for tenant {TenantId} marked as SLA breached (started at {StartedAt}).",
                    run.RunId,
                    tenantId,
                    run.StartedAt);
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Persistence abstraction for policy run records.
/// </summary>
public interface IPolicyRunRepository
{
    /// <summary>Returns the policy runs considered overdue relative to <paramref name="threshold"/>.</summary>
    /// <param name="threshold">Cut-off timestamp supplied by the caller.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<IReadOnlyList<PolicyRunRecord>> GetOverdueRunsAsync(DateTimeOffset threshold, CancellationToken cancellationToken = default);

    /// <summary>Persists the supplied record.</summary>
    /// <param name="record">Record to store.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask UpdateAsync(PolicyRunRecord record, CancellationToken cancellationToken = default);

    /// <summary>Looks up a policy run by id.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The record, or <c>null</c>.</returns>
    ValueTask<PolicyRunRecord?> GetAsync(string runId, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Stored representation of a policy run.
/// </summary>
/// <param name="RunId">Identifier of the run.</param>
/// <param name="TenantId">Tenant that owns the run.</param>
/// <param name="PolicyId">Identifier of the policy.</param>
/// <param name="Status">Current status of the run.</param>
/// <param name="StartedAt">Timestamp at which the run started.</param>
/// <param name="CompletedAt">Completion timestamp, if any.</param>
/// <param name="SlaBreachedAt">Timestamp at which the run was marked as SLA breached, if any.</param>
/// <param name="AssetsTotal">Total asset count; defaults to 0.</param>
/// <param name="AssetsCompleted">Completed asset count; defaults to 0.</param>
/// <param name="Error">Error text, if any.</param>
public sealed record PolicyRunRecord(
    string RunId,
    string TenantId,
    string PolicyId,
    PolicyRunStatus Status,
    DateTimeOffset StartedAt,
    DateTimeOffset? CompletedAt = null,
    DateTimeOffset? SlaBreachedAt = null,
    int AssetsTotal = 0,
    int AssetsCompleted = 0,
    string? Error = null);
|
||||
|
||||
/// <summary>
/// Lifecycle states of a policy run.
/// </summary>
public enum PolicyRunStatus
{
    /// <summary>The run is pending.</summary>
    Pending = 0,

    /// <summary>The run is running.</summary>
    Running = 1,

    /// <summary>The run completed.</summary>
    Completed = 2,

    /// <summary>The run failed.</summary>
    Failed = 3,

    /// <summary>The run exceeded its SLA.</summary>
    SlaBreached = 4,

    /// <summary>The run was cancelled.</summary>
    Cancelled = 5
}
|
||||
|
||||
/// <summary>
/// Emits alerts about backlogged (overdue) policy runs.
/// </summary>
public interface IPolicyBacklogAlertService
{
    /// <summary>
    /// Emits a backlog alert for one tenant.
    /// </summary>
    /// <param name="tenantId">Tenant the alert is about.</param>
    /// <param name="overdueCount">Number of overdue runs for that tenant.</param>
    /// <param name="threshold">Threshold timestamp that was used to classify runs as overdue.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask EmitBacklogAlertAsync(string tenantId, int overdueCount, DateTimeOffset threshold, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// No-op <see cref="IPolicyBacklogAlertService"/> intended for tests: alerts are discarded.
/// </summary>
public sealed class NullPolicyBacklogAlertService : IPolicyBacklogAlertService
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyBacklogAlertService Instance { get; } = new NullPolicyBacklogAlertService();

    /// <summary>Does nothing and completes immediately.</summary>
    public ValueTask EmitBacklogAlertAsync(
        string tenantId,
        int overdueCount,
        DateTimeOffset threshold,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,470 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Reachability;
|
||||
|
||||
/// <summary>
/// Reachability joiner worker per SCHED-WORKER-26-201.
/// Dequeues SBOM snapshots, joins each snapshot's components with reachability signals,
/// writes the resulting facts to the cache, and schedules downstream updates.
/// </summary>
public sealed class ReachabilityJoinerWorker : BackgroundService
{
    /// <summary>Pause after an unexpected failure in the worker loop.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    private readonly ISbomSnapshotQueue _snapshotQueue;
    private readonly ISignalProvider _signalProvider;
    private readonly IReachabilityFactCache _factCache;
    private readonly IReachabilityUpdateScheduler _updateScheduler;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ReachabilityJoinerWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException">Any other dependency is <c>null</c>.</exception>
    public ReachabilityJoinerWorker(
        ISbomSnapshotQueue snapshotQueue,
        ISignalProvider signalProvider,
        IReachabilityFactCache factCache,
        IReachabilityUpdateScheduler updateScheduler,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ReachabilityJoinerWorker> logger)
    {
        _snapshotQueue = snapshotQueue ?? throw new ArgumentNullException(nameof(snapshotQueue));
        _signalProvider = signalProvider ?? throw new ArgumentNullException(nameof(signalProvider));
        _factCache = factCache ?? throw new ArgumentNullException(nameof(factCache));
        _updateScheduler = updateScheduler ?? throw new ArgumentNullException(nameof(updateScheduler));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the dequeue/process loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when the reachability feature is disabled in the options.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Reachability.Enabled)
        {
            _logger.LogInformation("Reachability joiner worker is disabled.");
            return;
        }

        _logger.LogInformation("Reachability joiner worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue SBOM snapshots for processing
                var snapshots = await _snapshotQueue
                    .DequeueAsync(_options.Reachability.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (snapshots.Count == 0)
                {
                    await Task.Delay(_options.Reachability.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                _logger.LogDebug(
                    "Processing {Count} SBOM snapshots for reachability analysis.",
                    snapshots.Count);

                // Process snapshots concurrently with bounded parallelism
                await ProcessSnapshotsAsync(snapshots, stoppingToken).ConfigureAwait(false);

                await Task.Delay(_options.Reachability.PollInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (System.Exception ex)
            {
                _logger.LogError(ex, "Error in reachability joiner worker loop.");

                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the back-off: leave the loop normally so the
                    // cancellation does not escape and the stop message below is still logged.
                    break;
                }
            }
        }

        _logger.LogInformation("Reachability joiner worker stopped.");
    }

    /// <summary>
    /// Processes the snapshots concurrently, with at most <c>Reachability.MaxConcurrency</c>
    /// snapshots in flight at a time.
    /// </summary>
    private async Task ProcessSnapshotsAsync(
        IReadOnlyList<SbomSnapshot> snapshots,
        CancellationToken cancellationToken)
    {
        using var semaphore = new SemaphoreSlim(_options.Reachability.MaxConcurrency);
        var tasks = new List<Task>(snapshots.Count);

        try
        {
            foreach (var snapshot in snapshots)
            {
                await semaphore.WaitAsync(cancellationToken).ConfigureAwait(false);
                tasks.Add(ProcessSnapshotWithSemaphoreAsync(snapshot, semaphore, cancellationToken));
            }
        }
        finally
        {
            // Always drain what was started, even when WaitAsync was cancelled: the in-flight
            // tasks release the semaphore, so it must not be disposed before they finish, and
            // their exceptions must not go unobserved.
            await Task.WhenAll(tasks).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Processes one snapshot and releases the concurrency slot afterwards, whatever the outcome.
    /// </summary>
    private async Task ProcessSnapshotWithSemaphoreAsync(
        SbomSnapshot snapshot,
        SemaphoreSlim semaphore,
        CancellationToken cancellationToken)
    {
        try
        {
            await ProcessSnapshotAsync(snapshot, cancellationToken).ConfigureAwait(false);
        }
        finally
        {
            semaphore.Release();
        }
    }

    /// <summary>
    /// Fetches signals for the snapshot's components, joins them into facts, and, when at least
    /// one fact was produced, writes the facts to the cache and schedules downstream updates.
    /// Failures other than cancellation are logged and swallowed so one bad snapshot does not
    /// fail the batch.
    /// </summary>
    private async Task ProcessSnapshotAsync(
        SbomSnapshot snapshot,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        try
        {
            _logger.LogDebug(
                "Processing SBOM snapshot {SnapshotId} for tenant {TenantId}, artifact {ArtifactId}.",
                snapshot.SnapshotId,
                snapshot.TenantId,
                snapshot.ArtifactId);

            // Fetch signals for the snapshot's components
            var signals = await _signalProvider.GetSignalsAsync(
                snapshot.TenantId,
                snapshot.ComponentPurls,
                cancellationToken).ConfigureAwait(false);

            // Join snapshot with signals to produce reachability facts; the timestamp comes from
            // the injected clock so every fact of one snapshot shares the same ProducedAt.
            var facts = JoinSnapshotWithSignals(snapshot, signals, _timeProvider.GetUtcNow());

            if (facts.Count == 0)
            {
                _logger.LogDebug(
                    "No reachability facts produced for snapshot {SnapshotId}.",
                    snapshot.SnapshotId);
                return;
            }

            // Write facts to cache
            await _factCache.WriteFactsAsync(
                snapshot.TenantId,
                snapshot.ArtifactId,
                facts,
                _options.Reachability.FactCacheTtl,
                cancellationToken).ConfigureAwait(false);

            // Schedule downstream updates for affected policies
            await _updateScheduler.ScheduleUpdatesAsync(
                snapshot.TenantId,
                snapshot.ArtifactId,
                facts,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Processed SBOM snapshot {SnapshotId}: {FactCount} facts produced, {SignalCount} signals matched in {Duration}ms.",
                snapshot.SnapshotId,
                facts.Count,
                signals.Count,
                duration.TotalMilliseconds);
        }
        catch (System.Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Failed to process SBOM snapshot {SnapshotId} for tenant {TenantId}.",
                snapshot.SnapshotId,
                snapshot.TenantId);
        }
    }

    /// <summary>
    /// Builds one fact per snapshot component that has a signal; components without a signal
    /// are skipped.
    /// </summary>
    /// <param name="snapshot">Snapshot whose components are joined.</param>
    /// <param name="signals">Signals keyed by component purl.</param>
    /// <param name="producedAt">Timestamp stamped on every produced fact.</param>
    private static IReadOnlyList<ReachabilityFact> JoinSnapshotWithSignals(
        SbomSnapshot snapshot,
        IReadOnlyDictionary<string, ComponentSignal> signals,
        DateTimeOffset producedAt)
    {
        var facts = new List<ReachabilityFact>();

        foreach (var purl in snapshot.ComponentPurls)
        {
            if (!signals.TryGetValue(purl, out var signal))
            {
                continue;
            }

            facts.Add(new ReachabilityFact(
                FactId: $"{snapshot.SnapshotId}:{purl}",
                TenantId: snapshot.TenantId,
                ArtifactId: snapshot.ArtifactId,
                ComponentPurl: purl,
                IsReachable: signal.IsReachable,
                Confidence: signal.Confidence,
                Evidence: signal.Evidence,
                ProducedAt: producedAt));
        }

        return facts;
    }
}
|
||||
|
||||
/// <summary>
/// Queue of SBOM snapshots waiting for reachability analysis.
/// </summary>
public interface ISbomSnapshotQueue
{
    /// <summary>
    /// Takes snapshots off the queue so they can be processed.
    /// </summary>
    /// <param name="maxCount">Maximum number of snapshots the caller wants in one call.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The dequeued snapshots.</returns>
    ValueTask<IReadOnlyList<SbomSnapshot>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default);

    /// <summary>
    /// Adds one snapshot to the queue.
    /// </summary>
    /// <param name="snapshot">Snapshot to enqueue.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask EnqueueAsync(SbomSnapshot snapshot, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Source of reachability signals for components.
/// </summary>
public interface ISignalProvider
{
    /// <summary>
    /// Retrieves the reachability signals for the given components.
    /// </summary>
    /// <param name="tenantId">Tenant the lookup is performed for.</param>
    /// <param name="componentPurls">Package URLs of the components to look up.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>Signals keyed by string; the joiner worker looks entries up by component purl.</returns>
    ValueTask<IReadOnlyDictionary<string, ComponentSignal>> GetSignalsAsync(string tenantId, ImmutableArray<string> componentPurls, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Cache of reachability facts, addressed by tenant and artifact.
/// </summary>
public interface IReachabilityFactCache
{
    /// <summary>
    /// Stores the facts for an artifact.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the artifact.</param>
    /// <param name="artifactId">Artifact the facts belong to.</param>
    /// <param name="facts">Facts to store.</param>
    /// <param name="ttl">Time-to-live for the stored facts.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask WriteFactsAsync(string tenantId, string artifactId, IReadOnlyList<ReachabilityFact> facts, TimeSpan ttl, CancellationToken cancellationToken = default);

    /// <summary>
    /// Retrieves the cached facts for an artifact.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the artifact.</param>
    /// <param name="artifactId">Artifact to read facts for.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<IReadOnlyList<ReachabilityFact>> ReadFactsAsync(string tenantId, string artifactId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Drops the cached facts for an artifact.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the artifact.</param>
    /// <param name="artifactId">Artifact whose facts are invalidated.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask InvalidateAsync(string tenantId, string artifactId, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Triggers downstream updates when new reachability facts become available.
/// </summary>
public interface IReachabilityUpdateScheduler
{
    /// <summary>
    /// Schedules policy re-evaluation updates for the given new facts.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the artifact.</param>
    /// <param name="artifactId">Artifact the facts belong to.</param>
    /// <param name="facts">Newly produced facts.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask ScheduleUpdatesAsync(string tenantId, string artifactId, IReadOnlyList<ReachabilityFact> facts, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// SBOM snapshot submitted for reachability analysis.
/// </summary>
/// <param name="SnapshotId">Identifier of the snapshot.</param>
/// <param name="TenantId">Tenant that owns the snapshot.</param>
/// <param name="ArtifactId">Artifact the snapshot describes.</param>
/// <param name="ImageDigest">Digest of the image.</param>
/// <param name="ComponentPurls">Package URLs of the snapshot's components.</param>
/// <param name="CreatedAt">Creation timestamp of the snapshot.</param>
public sealed record SbomSnapshot(
    string SnapshotId,
    string TenantId,
    string ArtifactId,
    string ImageDigest,
    ImmutableArray<string> ComponentPurls,
    DateTimeOffset CreatedAt);
|
||||
|
||||
/// <summary>
/// Reachability signal for one component.
/// </summary>
/// <param name="ComponentPurl">Package URL of the component.</param>
/// <param name="IsReachable">Whether the component is reachable.</param>
/// <param name="Confidence">Confidence value attached to the signal.</param>
/// <param name="Evidence">Evidence items supporting the signal.</param>
public sealed record ComponentSignal(
    string ComponentPurl,
    bool IsReachable,
    float Confidence,
    ImmutableArray<SignalEvidence> Evidence);
|
||||
|
||||
/// <summary>
/// One piece of evidence backing a reachability signal.
/// </summary>
/// <param name="Source">Source of the evidence.</param>
/// <param name="Type">Type of the evidence.</param>
/// <param name="Details">Details of the evidence.</param>
/// <param name="Weight">Weight of the evidence.</param>
public sealed record SignalEvidence(
    string Source,
    string Type,
    string Details,
    float Weight);
|
||||
|
||||
/// <summary>
/// Reachability fact as stored in the fact cache.
/// </summary>
/// <param name="FactId">Identifier of the fact.</param>
/// <param name="TenantId">Tenant that owns the fact.</param>
/// <param name="ArtifactId">Artifact the fact belongs to.</param>
/// <param name="ComponentPurl">Package URL of the component the fact is about.</param>
/// <param name="IsReachable">Whether the component is reachable.</param>
/// <param name="Confidence">Confidence value of the fact.</param>
/// <param name="Evidence">Evidence items supporting the fact.</param>
/// <param name="ProducedAt">Timestamp at which the fact was produced.</param>
public sealed record ReachabilityFact(
    string FactId,
    string TenantId,
    string ArtifactId,
    string ComponentPurl,
    bool IsReachable,
    float Confidence,
    ImmutableArray<SignalEvidence> Evidence,
    DateTimeOffset ProducedAt);
|
||||
|
||||
/// <summary>
/// <see cref="ISbomSnapshotQueue"/> backed by an in-process FIFO queue guarded by a lock.
/// </summary>
public sealed class InMemorySbomSnapshotQueue : ISbomSnapshotQueue
{
    private readonly object _lock = new();
    private readonly Queue<SbomSnapshot> _queue = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> snapshots from the front of the queue.
    /// Completes synchronously; the cancellation token is not observed.
    /// </summary>
    /// <returns>The removed snapshots in queue order; empty when the queue is empty or <paramref name="maxCount"/> is not positive.</returns>
    public ValueTask<IReadOnlyList<SbomSnapshot>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var batch = new List<SbomSnapshot>();

        lock (_lock)
        {
            for (var taken = 0; taken < maxCount; taken++)
            {
                if (!_queue.TryDequeue(out var next))
                {
                    break;
                }

                batch.Add(next);
            }
        }

        IReadOnlyList<SbomSnapshot> dequeued = batch;
        return ValueTask.FromResult(dequeued);
    }

    /// <summary>
    /// Appends <paramref name="snapshot"/> to the back of the queue.
    /// Completes synchronously; the cancellation token is not observed.
    /// </summary>
    public ValueTask EnqueueAsync(
        SbomSnapshot snapshot,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _queue.Enqueue(snapshot);
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// In-memory <see cref="IReachabilityFactCache"/>. Entries are keyed by the (tenant, artifact)
/// pair and carry an absolute expiry computed from the TTL at write time. All access is
/// serialized with a lock.
/// </summary>
public sealed class InMemoryReachabilityFactCache : IReachabilityFactCache
{
    // A tuple key is used instead of a "tenant:artifact" string so that ids containing ':'
    // cannot collide across tenants (e.g. ("a:b", "c") versus ("a", "b:c")).
    private readonly Dictionary<(string TenantId, string ArtifactId), (IReadOnlyList<ReachabilityFact> Facts, DateTimeOffset ExpiresAt)> _cache = new();
    private readonly object _lock = new();
    private readonly TimeProvider _timeProvider;

    /// <summary>Creates a cache that uses the system clock.</summary>
    public InMemoryReachabilityFactCache()
        : this(TimeProvider.System)
    {
    }

    /// <summary>Creates a cache that uses the supplied clock.</summary>
    /// <param name="timeProvider">Clock used for expiry; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    public InMemoryReachabilityFactCache(TimeProvider? timeProvider)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Stores <paramref name="facts"/> for the artifact, replacing any previous entry.
    /// The entry expires <paramref name="ttl"/> after the current time.
    /// </summary>
    public ValueTask WriteFactsAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<ReachabilityFact> facts,
        TimeSpan ttl,
        CancellationToken cancellationToken = default)
    {
        var expiresAt = _timeProvider.GetUtcNow().Add(ttl);

        lock (_lock)
        {
            _cache[(tenantId, artifactId)] = (facts, expiresAt);
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Returns the cached facts for the artifact, or an empty list when there is no entry or the
    /// entry has expired. An expired entry is removed when it is encountered; entries that are
    /// never read again stay in memory until overwritten or invalidated.
    /// </summary>
    public ValueTask<IReadOnlyList<ReachabilityFact>> ReadFactsAsync(
        string tenantId,
        string artifactId,
        CancellationToken cancellationToken = default)
    {
        var key = (tenantId, artifactId);
        var now = _timeProvider.GetUtcNow();

        lock (_lock)
        {
            if (_cache.TryGetValue(key, out var entry))
            {
                if (entry.ExpiresAt > now)
                {
                    return ValueTask.FromResult(entry.Facts);
                }

                // Expired: evict so stale entries do not accumulate.
                _cache.Remove(key);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<ReachabilityFact>>([]);
    }

    /// <summary>
    /// Removes the entry for the artifact, if present.
    /// </summary>
    public ValueTask InvalidateAsync(
        string tenantId,
        string artifactId,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _cache.Remove((tenantId, artifactId));
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// No-op signal provider intended for tests: it never reports any signal.
/// </summary>
public sealed class NullSignalProvider : ISignalProvider
{
    /// <summary>Shared instance.</summary>
    public static NullSignalProvider Instance { get; } = new();

    /// <summary>
    /// Returns a new, empty signal dictionary regardless of the arguments.
    /// </summary>
    /// <param name="tenantId">Ignored.</param>
    /// <param name="componentPurls">Ignored.</param>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask<IReadOnlyDictionary<string, ComponentSignal>> GetSignalsAsync(
        string tenantId,
        ImmutableArray<string> componentPurls,
        CancellationToken cancellationToken = default)
    {
        IReadOnlyDictionary<string, ComponentSignal> noSignals = new Dictionary<string, ComponentSignal>();
        return ValueTask.FromResult(noSignals);
    }
}
|
||||
|
||||
/// <summary>
/// No-op reachability update scheduler intended for tests: scheduling requests are discarded.
/// </summary>
public sealed class NullReachabilityUpdateScheduler : IReachabilityUpdateScheduler
{
    /// <summary>Shared instance.</summary>
    public static NullReachabilityUpdateScheduler Instance { get; } = new();

    /// <summary>
    /// Does nothing and completes immediately.
    /// </summary>
    /// <param name="tenantId">Ignored.</param>
    /// <param name="artifactId">Ignored.</param>
    /// <param name="facts">Ignored.</param>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask ScheduleUpdatesAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<ReachabilityFact> facts,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,455 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Reachability;
|
||||
|
||||
/// <summary>
/// Staleness monitor per SCHED-WORKER-26-202.
/// Monitors reachability facts for staleness, publishes warnings, and updates dashboards.
/// </summary>
/// <remarks>
/// Each cycle lists the tenants known to the fact store, requests a staleness summary per tenant,
/// forwards it to the metrics reporter, and publishes an alert when stale or near-stale facts exist.
/// A failure for one tenant is logged and does not abort the remaining tenants.
/// </remarks>
public sealed class ReachabilityStalenessMonitor : BackgroundService
{
    /// <summary>Pause applied after an unexpected loop failure before the next cycle is attempted.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(30);

    private readonly IReachabilityFactStore _factStore;
    private readonly IStalenessAlertPublisher _alertPublisher;
    private readonly IStalenessMetricsReporter _metricsReporter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ReachabilityStalenessMonitor> _logger;

    /// <summary>
    /// Creates the monitor.
    /// </summary>
    /// <param name="timeProvider">Clock used for staleness calculations; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <exception cref="ArgumentNullException">Any dependency other than <paramref name="timeProvider"/> is null.</exception>
    public ReachabilityStalenessMonitor(
        IReachabilityFactStore factStore,
        IStalenessAlertPublisher alertPublisher,
        IStalenessMetricsReporter metricsReporter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ReachabilityStalenessMonitor> logger)
    {
        _factStore = factStore ?? throw new ArgumentNullException(nameof(factStore));
        _alertPublisher = alertPublisher ?? throw new ArgumentNullException(nameof(alertPublisher));
        _metricsReporter = metricsReporter ?? throw new ArgumentNullException(nameof(metricsReporter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the monitoring loop until <paramref name="stoppingToken"/> is cancelled.
    /// Returns immediately when reachability processing is disabled in the options.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Reachability.Enabled)
        {
            _logger.LogInformation("Reachability staleness monitor is disabled.");
            return;
        }

        _logger.LogInformation("Reachability staleness monitor started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var now = _timeProvider.GetUtcNow();

                // Check for stale facts across all tenants
                await CheckForStalenessAsync(now, stoppingToken).ConfigureAwait(false);

                // Wait for the configured check interval
                await Task.Delay(_options.Reachability.PollInterval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in reachability staleness monitor loop.");

                try
                {
                    await Task.Delay(ErrorBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested while backing off: leave the loop normally so the
                    // "stopped" message below is still logged instead of the exception escaping.
                    break;
                }
            }
        }

        _logger.LogInformation("Reachability staleness monitor stopped.");
    }

    /// <summary>
    /// Performs one monitoring cycle over every tenant that has reachability facts.
    /// </summary>
    /// <param name="now">Reference instant from which the thresholds are derived.</param>
    /// <param name="cancellationToken">Token used to cancel the cycle.</param>
    private async Task CheckForStalenessAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        // Get all tenants with reachability facts
        var tenants = await _factStore.GetTenantsWithFactsAsync(cancellationToken).ConfigureAwait(false);

        if (tenants.Count == 0)
        {
            _logger.LogDebug("No tenants with reachability facts to monitor.");
            return;
        }

        // Stale threshold is one full TTL before now; the warning threshold is 80% of the TTL before now.
        var stalenessThreshold = now.Subtract(_options.Reachability.FactCacheTtl);
        var warningThreshold = now.Subtract(_options.Reachability.FactCacheTtl.Multiply(0.8)); // 80% of TTL

        foreach (var tenantId in tenants)
        {
            try
            {
                await CheckTenantStalenessAsync(
                    tenantId,
                    now,
                    stalenessThreshold,
                    warningThreshold,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // Isolate tenants from each other: log and continue with the next one.
                _logger.LogError(
                    ex,
                    "Failed to check staleness for tenant {TenantId}.",
                    tenantId);
            }
        }
    }

    /// <summary>
    /// Reports metrics for a single tenant and publishes an alert when needed.
    /// A stale alert takes precedence over a warning alert; no alert is published when neither count is positive.
    /// </summary>
    private async Task CheckTenantStalenessAsync(
        string tenantId,
        DateTimeOffset now,
        DateTimeOffset stalenessThreshold,
        DateTimeOffset warningThreshold,
        CancellationToken cancellationToken)
    {
        // Get staleness summary for this tenant
        var summary = await _factStore.GetStalenessSummaryAsync(
            tenantId,
            stalenessThreshold,
            warningThreshold,
            cancellationToken).ConfigureAwait(false);

        // Report metrics
        await _metricsReporter.ReportStalenessMetricsAsync(
            tenantId,
            summary,
            cancellationToken).ConfigureAwait(false);

        // Publish alerts if necessary
        if (summary.StaleCount > 0)
        {
            _logger.LogWarning(
                "Tenant {TenantId} has {StaleCount} stale reachability facts (threshold: {Threshold}).",
                tenantId,
                summary.StaleCount,
                stalenessThreshold);

            await _alertPublisher.PublishStaleAlertAsync(
                tenantId,
                summary,
                StalenessLevel.Stale,
                cancellationToken).ConfigureAwait(false);
        }
        else if (summary.WarningCount > 0)
        {
            _logger.LogInformation(
                "Tenant {TenantId} has {WarningCount} reachability facts approaching staleness.",
                tenantId,
                summary.WarningCount);

            await _alertPublisher.PublishStaleAlertAsync(
                tenantId,
                summary,
                StalenessLevel.Warning,
                cancellationToken).ConfigureAwait(false);
        }
        else
        {
            _logger.LogDebug(
                "Tenant {TenantId} reachability facts are fresh ({FreshCount} facts).",
                tenantId,
                summary.FreshCount);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Storage abstraction for reachability facts that supports staleness queries.
/// </summary>
public interface IReachabilityFactStore
{
    /// <summary>
    /// Lists the IDs of all tenants that have reachability facts.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<IReadOnlyList<string>> GetTenantsWithFactsAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Produces a staleness summary for one tenant.
    /// </summary>
    /// <param name="tenantId">Tenant to summarize.</param>
    /// <param name="stalenessThreshold">Cut-off instant for classifying facts as stale.</param>
    /// <param name="warningThreshold">Cut-off instant for classifying facts as approaching staleness.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<StalenessSummary> GetStalenessSummaryAsync(
        string tenantId,
        DateTimeOffset stalenessThreshold,
        DateTimeOffset warningThreshold,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Retrieves stale facts for one tenant.
    /// </summary>
    /// <param name="tenantId">Tenant to query.</param>
    /// <param name="threshold">Cut-off instant used to select stale facts.</param>
    /// <param name="maxCount">Maximum number of facts to return.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<IReadOnlyList<StaleFact>> GetStaleFactsAsync(
        string tenantId,
        DateTimeOffset threshold,
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Flags the given facts of a tenant as requiring refresh.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the facts.</param>
    /// <param name="factIds">IDs of the facts to flag.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask MarkForRefreshAsync(
        string tenantId,
        IReadOnlyList<string> factIds,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Publishes alerts about the staleness of reachability facts.
/// </summary>
public interface IStalenessAlertPublisher
{
    /// <summary>
    /// Publishes a staleness alert for a tenant.
    /// </summary>
    /// <param name="tenantId">Tenant the alert concerns.</param>
    /// <param name="summary">Staleness summary that triggered the alert.</param>
    /// <param name="level">Severity level of the alert.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask PublishStaleAlertAsync(
        string tenantId,
        StalenessSummary summary,
        StalenessLevel level,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Reports staleness metrics.
/// </summary>
public interface IStalenessMetricsReporter
{
    /// <summary>
    /// Reports the staleness metrics of a tenant for use by dashboards.
    /// </summary>
    /// <param name="tenantId">Tenant the metrics belong to.</param>
    /// <param name="summary">Staleness summary to report.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask ReportStalenessMetricsAsync(
        string tenantId,
        StalenessSummary summary,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Per-tenant summary of reachability fact staleness.
/// </summary>
/// <param name="TenantId">Tenant the summary describes.</param>
/// <param name="TotalCount">Total number of facts considered.</param>
/// <param name="FreshCount">Number of facts classified as fresh.</param>
/// <param name="WarningCount">Number of facts approaching staleness.</param>
/// <param name="StaleCount">Number of facts classified as stale.</param>
/// <param name="OldestFactTimestamp">Timestamp of the oldest fact, or null when there are no facts.</param>
/// <param name="NewestFactTimestamp">Timestamp of the newest fact, or null when there are no facts.</param>
/// <param name="StaleArtifactIds">IDs of the artifacts that have stale facts.</param>
public sealed record StalenessSummary(
    string TenantId,
    int TotalCount,
    int FreshCount,
    int WarningCount,
    int StaleCount,
    DateTimeOffset? OldestFactTimestamp,
    DateTimeOffset? NewestFactTimestamp,
    ImmutableArray<string> StaleArtifactIds);
|
||||
|
||||
/// <summary>
/// A reachability fact that has been identified as stale.
/// </summary>
/// <param name="FactId">Identifier of the fact.</param>
/// <param name="TenantId">Tenant that owns the fact.</param>
/// <param name="ArtifactId">Artifact the fact belongs to.</param>
/// <param name="ComponentPurl">Package URL of the component the fact refers to.</param>
/// <param name="ProducedAt">Instant at which the fact was produced.</param>
/// <param name="Age">Age of the fact.</param>
public sealed record StaleFact(
    string FactId,
    string TenantId,
    string ArtifactId,
    string ComponentPurl,
    DateTimeOffset ProducedAt,
    TimeSpan Age);
|
||||
|
||||
/// <summary>
/// Staleness level attached to alerts.
/// </summary>
public enum StalenessLevel
{
    /// <summary>Facts are fresh and valid.</summary>
    Fresh = 0,

    /// <summary>Facts are approaching the staleness threshold.</summary>
    Warning = 1,

    /// <summary>Facts have exceeded the staleness threshold.</summary>
    Stale = 2,

    /// <summary>Facts are critically stale and may affect policy decisions.</summary>
    Critical = 3
}
|
||||
|
||||
/// <summary>
/// Reachability fact store that keeps everything in process memory.
/// All access to the underlying dictionary is serialized through a private lock.
/// </summary>
public sealed class InMemoryReachabilityFactStore : IReachabilityFactStore
{
    private readonly Dictionary<string, List<StoredFact>> _facts = new();
    private readonly object _lock = new();

    /// <summary>
    /// Returns a snapshot of the tenant IDs that have an entry in the store.
    /// </summary>
    public ValueTask<IReadOnlyList<string>> GetTenantsWithFactsAsync(CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            List<string> tenantIds = [.. _facts.Keys];
            return ValueTask.FromResult<IReadOnlyList<string>>(tenantIds);
        }
    }

    /// <summary>
    /// Classifies the tenant's facts against the two thresholds and returns the counts,
    /// the oldest/newest production timestamps and the distinct artifact IDs with stale facts.
    /// </summary>
    /// <param name="tenantId">Tenant to summarize.</param>
    /// <param name="stalenessThreshold">Facts produced before this instant count as stale.</param>
    /// <param name="warningThreshold">Non-stale facts produced before this instant count as warnings; facts produced at or after it count as fresh.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The summary; all counts are zero and both timestamps are null when the tenant has no facts.</returns>
    public ValueTask<StalenessSummary> GetStalenessSummaryAsync(
        string tenantId,
        DateTimeOffset stalenessThreshold,
        DateTimeOffset warningThreshold,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts) || tenantFacts.Count == 0)
            {
                return ValueTask.FromResult(new StalenessSummary(
                    tenantId, 0, 0, 0, 0, null, null, []));
            }

            var stale = 0;
            var warning = 0;
            var fresh = 0;
            var oldest = tenantFacts[0].ProducedAt;
            var newest = oldest;
            var seenArtifacts = new HashSet<string>();
            var staleArtifactIds = ImmutableArray.CreateBuilder<string>();

            foreach (var fact in tenantFacts)
            {
                var producedAt = fact.ProducedAt;
                var isStale = producedAt < stalenessThreshold;

                if (isStale)
                {
                    stale++;

                    // Keep first-occurrence order while dropping duplicates.
                    if (seenArtifacts.Add(fact.ArtifactId))
                    {
                        staleArtifactIds.Add(fact.ArtifactId);
                    }
                }

                if (!isStale && producedAt < warningThreshold)
                {
                    warning++;
                }

                // Evaluated independently of the stale check, exactly as each count is defined.
                if (producedAt >= warningThreshold)
                {
                    fresh++;
                }

                if (producedAt < oldest)
                {
                    oldest = producedAt;
                }

                if (producedAt > newest)
                {
                    newest = producedAt;
                }
            }

            var summary = new StalenessSummary(
                TenantId: tenantId,
                TotalCount: tenantFacts.Count,
                FreshCount: fresh,
                WarningCount: warning,
                StaleCount: stale,
                OldestFactTimestamp: oldest,
                NewestFactTimestamp: newest,
                StaleArtifactIds: staleArtifactIds.ToImmutable());

            return ValueTask.FromResult(summary);
        }
    }

    /// <summary>
    /// Returns up to <paramref name="maxCount"/> facts produced before <paramref name="threshold"/>, oldest first.
    /// The age of each fact is measured against the current UTC time.
    /// </summary>
    public ValueTask<IReadOnlyList<StaleFact>> GetStaleFactsAsync(
        string tenantId,
        DateTimeOffset threshold,
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var referenceTime = DateTimeOffset.UtcNow;

        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                return ValueTask.FromResult<IReadOnlyList<StaleFact>>([]);
            }

            var oldestFirst = tenantFacts
                .Where(stored => stored.ProducedAt < threshold)
                .OrderBy(stored => stored.ProducedAt)
                .Take(maxCount);

            var staleFacts = new List<StaleFact>();
            foreach (var stored in oldestFirst)
            {
                staleFacts.Add(new StaleFact(
                    stored.FactId,
                    stored.TenantId,
                    stored.ArtifactId,
                    stored.ComponentPurl,
                    stored.ProducedAt,
                    referenceTime - stored.ProducedAt));
            }

            return ValueTask.FromResult<IReadOnlyList<StaleFact>>(staleFacts);
        }
    }

    /// <summary>
    /// Sets the refresh flag on every fact of the tenant whose ID is contained in <paramref name="factIds"/>.
    /// Unknown tenants are ignored.
    /// </summary>
    public ValueTask MarkForRefreshAsync(
        string tenantId,
        IReadOnlyList<string> factIds,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            if (_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                var requestedIds = new HashSet<string>(factIds);

                foreach (var stored in tenantFacts)
                {
                    if (requestedIds.Contains(stored.FactId))
                    {
                        stored.MarkedForRefresh = true;
                    }
                }
            }
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Adds a fact to the store (for testing).
    /// </summary>
    public void AddFact(string tenantId, string factId, string artifactId, string componentPurl, DateTimeOffset producedAt)
    {
        var stored = new StoredFact
        {
            FactId = factId,
            TenantId = tenantId,
            ArtifactId = artifactId,
            ComponentPurl = componentPurl,
            ProducedAt = producedAt,
            MarkedForRefresh = false
        };

        lock (_lock)
        {
            if (!_facts.TryGetValue(tenantId, out var tenantFacts))
            {
                tenantFacts = [];
                _facts[tenantId] = tenantFacts;
            }

            tenantFacts.Add(stored);
        }
    }

    /// <summary>Internal mutable representation of a stored fact.</summary>
    private sealed class StoredFact
    {
        public required string FactId { get; init; }
        public required string TenantId { get; init; }
        public required string ArtifactId { get; init; }
        public required string ComponentPurl { get; init; }
        public required DateTimeOffset ProducedAt { get; init; }
        public bool MarkedForRefresh { get; set; }
    }
}
|
||||
|
||||
/// <summary>
/// No-op staleness alert publisher intended for tests: alerts are discarded.
/// </summary>
public sealed class NullStalenessAlertPublisher : IStalenessAlertPublisher
{
    /// <summary>Shared instance.</summary>
    public static NullStalenessAlertPublisher Instance { get; } = new();

    /// <summary>
    /// Does nothing and completes immediately.
    /// </summary>
    public ValueTask PublishStaleAlertAsync(
        string tenantId,
        StalenessSummary summary,
        StalenessLevel level,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// No-op staleness metrics reporter intended for tests: metrics are discarded.
/// </summary>
public sealed class NullStalenessMetricsReporter : IStalenessMetricsReporter
{
    /// <summary>Shared instance.</summary>
    public static NullStalenessMetricsReporter Instance { get; } = new();

    /// <summary>
    /// Does nothing and completes immediately.
    /// </summary>
    public ValueTask ReportStalenessMetricsAsync(
        string tenantId,
        StalenessSummary summary,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,452 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Resolver;
|
||||
|
||||
/// <summary>
/// Evaluation orchestration worker per SCHED-WORKER-29-002.
/// Invokes Policy Engine batch eval, writes results to Findings Ledger projector queue,
/// and handles retries/backoff.
/// </summary>
public sealed class EvaluationOrchestrationWorker : BackgroundService
{
    private readonly IPolicyEvaluationJobQueue _jobQueue;
    private readonly ICandidateFindingStore _findingStore;
    private readonly IPolicyEngineEvaluator _policyEvaluator;
    private readonly IFindingsLedgerProjector _ledgerProjector;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<EvaluationOrchestrationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="timeProvider">Clock used to measure job duration; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <exception cref="ArgumentNullException">Any dependency other than <paramref name="timeProvider"/> is null.</exception>
    public EvaluationOrchestrationWorker(
        IPolicyEvaluationJobQueue jobQueue,
        ICandidateFindingStore findingStore,
        IPolicyEngineEvaluator policyEvaluator,
        IFindingsLedgerProjector ledgerProjector,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<EvaluationOrchestrationWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _findingStore = findingStore ?? throw new ArgumentNullException(nameof(findingStore));
        _policyEvaluator = policyEvaluator ?? throw new ArgumentNullException(nameof(policyEvaluator));
        _ledgerProjector = ledgerProjector ?? throw new ArgumentNullException(nameof(ledgerProjector));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Dequeues evaluation jobs in batches and processes them sequentially until
    /// <paramref name="stoppingToken"/> is cancelled. Sleeps for the configured idle delay when the queue is empty.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Evaluation orchestration worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue evaluation jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessEvaluationJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in evaluation orchestration worker loop.");

                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested while backing off: leave the loop normally so the
                    // "stopped" message below is still logged instead of the exception escaping.
                    break;
                }
            }
        }

        _logger.LogInformation("Evaluation orchestration worker stopped.");
    }

    /// <summary>
    /// Processes one job: loads its candidate findings, evaluates them with retries and
    /// enqueues one ledger entry per evaluated finding. Non-cancellation failures are logged and swallowed.
    /// </summary>
    private async Task ProcessEvaluationJobAsync(
        PolicyEvaluationJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing evaluation job {JobId} for tenant {TenantId}, artifact {ArtifactId} with {FindingCount} candidates.",
            job.JobId,
            job.TenantId,
            job.ArtifactId,
            job.CandidateFindingIds.Length);

        try
        {
            // 1. Load candidate findings
            var candidates = await _findingStore.GetFindingsAsync(
                job.TenantId,
                job.CandidateFindingIds,
                cancellationToken).ConfigureAwait(false);

            if (candidates.Count == 0)
            {
                _logger.LogWarning(
                    "No candidate findings found for evaluation job {JobId}.",
                    job.JobId);
                return;
            }

            // 2. Invoke Policy Engine batch eval with retries
            var evalResult = await EvaluateWithRetryAsync(
                job,
                candidates,
                cancellationToken).ConfigureAwait(false);

            // 3. Write results to Findings Ledger projector queue
            var projectionEntries = evalResult.EvaluatedFindings
                .Select(f => new FindingsLedgerEntry(
                    EntryId: $"{job.JobId}:{f.FindingId}",
                    TenantId: job.TenantId,
                    ArtifactId: job.ArtifactId,
                    FindingId: f.FindingId,
                    ComponentPurl: f.ComponentPurl,
                    VulnerabilityId: f.VulnerabilityId,
                    Severity: f.Severity,
                    PolicyOutcome: f.PolicyOutcome,
                    PolicyId: f.PolicyId,
                    ExceptionId: f.AppliedExceptionId,
                    IsReachable: f.IsReachable,
                    EvaluatedAt: f.EvaluatedAt,
                    Metadata: f.Metadata))
                .ToList();

            await _ledgerProjector.EnqueueAsync(
                job.TenantId,
                projectionEntries,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Evaluation job {JobId} completed: {EvaluatedCount}/{TotalCount} findings, {ViolationCount} violations in {Duration}ms.",
                job.JobId,
                evalResult.EvaluatedFindings.Length,
                candidates.Count,
                evalResult.EvaluatedFindings.Count(f => f.PolicyOutcome == PolicyOutcome.Violation),
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Evaluation job {JobId} failed.",
                job.JobId);
        }
    }

    /// <summary>
    /// Calls the policy evaluator, retrying failed attempts with a delay that doubles after each failure.
    /// The exception of the final attempt propagates to the caller unchanged.
    /// </summary>
    /// <exception cref="InvalidOperationException">The configured maximum attempt count is less than one, so no attempt was made.</exception>
    private async Task<BatchEvaluationResult> EvaluateWithRetryAsync(
        PolicyEvaluationJob job,
        IReadOnlyList<CandidateFinding> candidates,
        CancellationToken cancellationToken)
    {
        var maxAttempts = _options.Policy.Dispatch.MaxAttempts;
        var delay = _options.Policy.Dispatch.RetryBackoff;

        for (var attempt = 1; attempt <= maxAttempts; attempt++)
        {
            try
            {
                return await _policyEvaluator.EvaluateBatchAsync(
                    job.TenantId,
                    job.ArtifactId,
                    candidates,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException && attempt < maxAttempts)
            {
                // The filter excludes the last attempt, so its exception is not swallowed here.
                _logger.LogWarning(
                    ex,
                    "Batch evaluation failed for job {JobId} (attempt {Attempt}/{MaxAttempts}), retrying...",
                    job.JobId,
                    attempt,
                    maxAttempts);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay = delay.Multiply(2); // Exponential backoff
            }
        }

        // Only reached when the loop body never ran (maxAttempts < 1).
        throw new InvalidOperationException($"Batch evaluation failed after {maxAttempts} attempts for job {job.JobId}.");
    }
}
|
||||
|
||||
/// <summary>
/// Queue of policy evaluation jobs.
/// </summary>
public interface IPolicyEvaluationJobQueue
{
    /// <summary>Takes jobs from the queue.</summary>
    /// <param name="maxCount">Maximum number of jobs to take.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<IReadOnlyList<PolicyEvaluationJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>Adds a job to the queue.</summary>
    /// <param name="job">Job to add.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask EnqueueAsync(
        PolicyEvaluationJob job,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Storage abstraction for candidate findings.
/// </summary>
public interface ICandidateFindingStore
{
    /// <summary>Looks up candidate findings of a tenant by ID.</summary>
    /// <param name="tenantId">Tenant that owns the findings.</param>
    /// <param name="findingIds">IDs of the findings to load.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<IReadOnlyList<CandidateFinding>> GetFindingsAsync(
        string tenantId,
        ImmutableArray<string> findingIds,
        CancellationToken cancellationToken = default);

    /// <summary>Stores candidate findings for a tenant.</summary>
    /// <param name="tenantId">Tenant that owns the findings.</param>
    /// <param name="findings">Findings to store.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask StoreFindingsAsync(
        string tenantId,
        IReadOnlyList<CandidateFinding> findings,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Abstraction over Policy Engine batch evaluation.
/// </summary>
public interface IPolicyEngineEvaluator
{
    /// <summary>Evaluates a batch of candidate findings for an artifact of a tenant.</summary>
    /// <param name="tenantId">Tenant the evaluation runs for.</param>
    /// <param name="artifactId">Artifact the candidates belong to.</param>
    /// <param name="candidates">Candidate findings to evaluate.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask<BatchEvaluationResult> EvaluateBatchAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<CandidateFinding> candidates,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Abstraction over the Findings Ledger projector queue.
/// </summary>
public interface IFindingsLedgerProjector
{
    /// <summary>Enqueues ledger entries for a tenant.</summary>
    /// <param name="tenantId">Tenant the entries belong to.</param>
    /// <param name="entries">Entries to enqueue.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    ValueTask EnqueueAsync(
        string tenantId,
        IReadOnlyList<FindingsLedgerEntry> entries,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Outcome of one batch policy evaluation.
/// </summary>
/// <param name="BatchId">Identifier of the batch.</param>
/// <param name="EvaluatedFindings">Findings that were evaluated.</param>
/// <param name="SkippedCount">Number of findings that were skipped.</param>
/// <param name="EvaluatedAt">Instant at which the batch was evaluated.</param>
public sealed record BatchEvaluationResult(
    string BatchId,
    ImmutableArray<EvaluatedFinding> EvaluatedFindings,
    int SkippedCount,
    DateTimeOffset EvaluatedAt);
|
||||
|
||||
/// <summary>
/// A finding together with the result of its policy evaluation.
/// </summary>
/// <param name="FindingId">Identifier of the finding.</param>
/// <param name="ComponentPurl">Package URL of the affected component.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity of the finding.</param>
/// <param name="PolicyOutcome">Outcome of the policy evaluation.</param>
/// <param name="PolicyId">Identifier of the policy that was evaluated.</param>
/// <param name="AppliedExceptionId">Identifier of the exception that was applied, if any.</param>
/// <param name="IsReachable">Reachability of the finding, or null when unknown.</param>
/// <param name="EvaluatedAt">Instant at which the finding was evaluated.</param>
/// <param name="Metadata">Optional additional key/value metadata.</param>
public sealed record EvaluatedFinding(
    string FindingId,
    string ComponentPurl,
    string VulnerabilityId,
    string Severity,
    PolicyOutcome PolicyOutcome,
    string PolicyId,
    string? AppliedExceptionId,
    bool? IsReachable,
    DateTimeOffset EvaluatedAt,
    ImmutableDictionary<string, string>? Metadata = null);
|
||||
|
||||
/// <summary>
/// Possible outcomes of a policy evaluation.
/// </summary>
public enum PolicyOutcome
{
    Pass = 0,
    Warning = 1,
    Violation = 2,
    Skipped = 3,
    Error = 4
}
|
||||
|
||||
/// <summary>
/// A single entry destined for Findings Ledger projection.
/// </summary>
/// <param name="EntryId">Identifier of the entry.</param>
/// <param name="TenantId">Tenant the entry belongs to.</param>
/// <param name="ArtifactId">Artifact the finding was evaluated for.</param>
/// <param name="FindingId">Identifier of the finding.</param>
/// <param name="ComponentPurl">Package URL of the affected component.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity of the finding.</param>
/// <param name="PolicyOutcome">Outcome of the policy evaluation.</param>
/// <param name="PolicyId">Identifier of the policy that was evaluated.</param>
/// <param name="ExceptionId">Identifier of the exception that was applied, if any.</param>
/// <param name="IsReachable">Reachability of the finding, or null when unknown.</param>
/// <param name="EvaluatedAt">Instant at which the finding was evaluated.</param>
/// <param name="Metadata">Optional additional key/value metadata.</param>
public sealed record FindingsLedgerEntry(
    string EntryId,
    string TenantId,
    string ArtifactId,
    string FindingId,
    string ComponentPurl,
    string VulnerabilityId,
    string Severity,
    PolicyOutcome PolicyOutcome,
    string PolicyId,
    string? ExceptionId,
    bool? IsReachable,
    DateTimeOffset EvaluatedAt,
    ImmutableDictionary<string, string>? Metadata);
|
||||
|
||||
/// <summary>
/// Policy evaluation job queue backed by an in-process FIFO queue guarded by a lock.
/// </summary>
public sealed class InMemoryPolicyEvaluationJobQueue : IPolicyEvaluationJobQueue
{
    private readonly Queue<PolicyEvaluationJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> jobs from the front of the queue, in queue order.
    /// </summary>
    /// <returns>The removed jobs; empty when the queue is empty or <paramref name="maxCount"/> is not positive.</returns>
    public ValueTask<IReadOnlyList<PolicyEvaluationJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        var batch = new List<PolicyEvaluationJob>();

        lock (_lock)
        {
            // The count check runs first, so nothing is dequeued once the batch is full.
            while (batch.Count < maxCount && _queue.TryDequeue(out var next))
            {
                batch.Add(next);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<PolicyEvaluationJob>>(batch);
    }

    /// <summary>
    /// Appends a job to the back of the queue.
    /// </summary>
    public ValueTask EnqueueAsync(PolicyEvaluationJob job, CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of candidate finding store.
/// Findings are kept per (tenant, finding ID) pair and all access is serialized through a private lock.
/// </summary>
public sealed class InMemoryCandidateFindingStore : ICandidateFindingStore
{
    // Keyed by (tenant, finding ID) so that identical finding IDs in different tenants
    // never overwrite each other.
    private readonly Dictionary<(string TenantId, string FindingId), CandidateFinding> _findings = new();
    private readonly object _lock = new();

    /// <summary>
    /// Looks up the requested findings of a tenant.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the findings.</param>
    /// <param name="findingIds">IDs to look up; a default or empty array yields an empty result.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The findings that exist for the tenant, in the order of <paramref name="findingIds"/>; unknown IDs are skipped.</returns>
    public ValueTask<IReadOnlyList<CandidateFinding>> GetFindingsAsync(
        string tenantId,
        ImmutableArray<string> findingIds,
        CancellationToken cancellationToken = default)
    {
        // A default (uninitialized) ImmutableArray cannot be enumerated; treat it like an empty one.
        if (findingIds.IsDefaultOrEmpty)
        {
            return ValueTask.FromResult<IReadOnlyList<CandidateFinding>>([]);
        }

        var results = new List<CandidateFinding>(findingIds.Length);

        lock (_lock)
        {
            foreach (var findingId in findingIds)
            {
                // Single lookup per ID; the tenant is part of the key, so no post-filter is needed.
                if (_findings.TryGetValue((tenantId, findingId), out var finding))
                {
                    results.Add(finding);
                }
            }
        }

        return ValueTask.FromResult<IReadOnlyList<CandidateFinding>>(results);
    }

    /// <summary>
    /// Stores (or replaces) the findings that belong to the given tenant.
    /// Findings whose tenant differs from <paramref name="tenantId"/> are ignored.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the findings.</param>
    /// <param name="findings">Findings to store.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public ValueTask StoreFindingsAsync(
        string tenantId,
        IReadOnlyList<CandidateFinding> findings,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            foreach (var finding in findings)
            {
                if (finding.TenantId == tenantId)
                {
                    _findings[(tenantId, finding.FindingId)] = finding;
                }
            }
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// Findings ledger projector that collects entries in an in-process queue guarded by a lock.
/// </summary>
public sealed class InMemoryFindingsLedgerProjector : IFindingsLedgerProjector
{
    private readonly Queue<FindingsLedgerEntry> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Enqueues the entries that belong to the given tenant; entries of other tenants are skipped.
    /// </summary>
    /// <param name="tenantId">Tenant the entries must belong to.</param>
    /// <param name="entries">Entries to enqueue.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public ValueTask EnqueueAsync(
        string tenantId,
        IReadOnlyList<FindingsLedgerEntry> entries,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            foreach (var candidate in entries)
            {
                if (candidate.TenantId != tenantId)
                {
                    continue;
                }

                _queue.Enqueue(candidate);
            }
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Returns a snapshot of the queued entries in queue order (for testing).
    /// </summary>
    public IReadOnlyList<FindingsLedgerEntry> GetQueuedEntries()
    {
        lock (_lock)
        {
            IReadOnlyList<FindingsLedgerEntry> snapshot = [.. _queue];
            return snapshot;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Policy engine evaluator stand-in for tests: every candidate passes under "default-policy".
/// </summary>
public sealed class NullPolicyEngineEvaluator : IPolicyEngineEvaluator
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyEngineEvaluator Instance { get; } = new();

    /// <summary>
    /// Maps each candidate to a passing <see cref="EvaluatedFinding"/> and wraps them in a
    /// result with a freshly generated batch ID and a skipped count of zero.
    /// </summary>
    /// <param name="tenantId">Ignored.</param>
    /// <param name="artifactId">Ignored.</param>
    /// <param name="candidates">Candidates to convert.</param>
    /// <param name="cancellationToken">Ignored.</param>
    public ValueTask<BatchEvaluationResult> EvaluateBatchAsync(
        string tenantId,
        string artifactId,
        IReadOnlyList<CandidateFinding> candidates,
        CancellationToken cancellationToken = default)
    {
        var passed =
            from candidate in candidates
            select new EvaluatedFinding(
                candidate.FindingId,
                candidate.ComponentPurl,
                candidate.VulnerabilityId,
                candidate.Severity,
                PolicyOutcome.Pass,
                "default-policy",
                null,
                null,
                DateTimeOffset.UtcNow);

        var evaluatedFindings = passed.ToImmutableArray();

        var result = new BatchEvaluationResult(
            BatchId: Guid.NewGuid().ToString("N"),
            EvaluatedFindings: evaluatedFindings,
            SkippedCount: 0,
            EvaluatedAt: DateTimeOffset.UtcNow);

        return ValueTask.FromResult(result);
    }
}
|
||||
@@ -0,0 +1,411 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Resolver;
|
||||
|
||||
/// <summary>
/// Resolver monitoring worker per SCHED-WORKER-29-003.
/// Monitors resolver/evaluation backlog, SLA breaches, and export job queue.
/// Exposes metrics and alerts for DevOps dashboards.
/// </summary>
/// <remarks>
/// Each cycle samples the resolver, evaluation and export queues, publishes backlog
/// alerts, publishes one alert per detected SLA breach, and then waits for
/// the monitoring interval. An unexpected failure is logged and the cycle is retried
/// after a shorter back-off. Cancellation of the stopping token ends the loop cleanly
/// from any waiting point.
/// </remarks>
public sealed class ResolverMonitoringWorker : BackgroundService
{
    /// <summary>Queue depth at or above which a backlog warning is raised.</summary>
    private const int BacklogWarningThreshold = 1000;

    /// <summary>Queue depth at or above which a critical backlog alert is raised.</summary>
    private const int BacklogCriticalThreshold = 5000;

    /// <summary>Pause between two successful monitoring cycles.</summary>
    private static readonly TimeSpan MonitoringInterval = TimeSpan.FromSeconds(30);

    /// <summary>Pause after a failed monitoring cycle before trying again.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromSeconds(10);

    private readonly IResolverQueueMetrics _resolverMetrics;
    private readonly IEvaluationQueueMetrics _evaluationMetrics;
    private readonly IExportQueueMetrics _exportMetrics;
    private readonly ISlaBreachDetector _slaBreachDetector;
    private readonly IMonitoringAlertPublisher _alertPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ResolverMonitoringWorker> _logger;

    /// <summary>
    /// Creates the worker. Every dependency except <paramref name="timeProvider"/> is required.
    /// </summary>
    /// <param name="resolverMetrics">Source of resolver queue statistics.</param>
    /// <param name="evaluationMetrics">Source of evaluation queue statistics.</param>
    /// <param name="exportMetrics">Source of export queue statistics.</param>
    /// <param name="slaBreachDetector">Detector queried once per cycle for SLA breaches.</param>
    /// <param name="alertPublisher">Sink for backlog and SLA alerts.</param>
    /// <param name="options">Scheduler worker options.</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Scheduler worker metrics.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public ResolverMonitoringWorker(
        IResolverQueueMetrics resolverMetrics,
        IEvaluationQueueMetrics evaluationMetrics,
        IExportQueueMetrics exportMetrics,
        ISlaBreachDetector slaBreachDetector,
        IMonitoringAlertPublisher alertPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ResolverMonitoringWorker> logger)
    {
        _resolverMetrics = resolverMetrics ?? throw new ArgumentNullException(nameof(resolverMetrics));
        _evaluationMetrics = evaluationMetrics ?? throw new ArgumentNullException(nameof(evaluationMetrics));
        _exportMetrics = exportMetrics ?? throw new ArgumentNullException(nameof(exportMetrics));
        _slaBreachDetector = slaBreachDetector ?? throw new ArgumentNullException(nameof(slaBreachDetector));
        _alertPublisher = alertPublisher ?? throw new ArgumentNullException(nameof(alertPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs monitoring cycles until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signals host shutdown.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Resolver monitoring worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            TimeSpan pause;

            try
            {
                var now = _timeProvider.GetUtcNow();

                // Collect and report metrics
                await CollectAndReportMetricsAsync(now, stoppingToken).ConfigureAwait(false);

                // Check for SLA breaches
                await CheckSlaBreachesAsync(now, stoppingToken).ConfigureAwait(false);

                pause = MonitoringInterval;
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in resolver monitoring worker loop.");
                pause = ErrorBackoff;
            }

            // The wait happens outside the try block so that a shutdown during the error
            // back-off ends the loop normally instead of throwing out of ExecuteAsync.
            if (!await DelayAsync(pause, stoppingToken).ConfigureAwait(false))
            {
                break;
            }
        }

        _logger.LogInformation("Resolver monitoring worker stopped.");
    }

    /// <summary>
    /// Waits for <paramref name="delay"/>; returns false when the wait was cut short by cancellation.
    /// </summary>
    private static async Task<bool> DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
    {
        try
        {
            await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
            return true;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            return false;
        }
    }

    /// <summary>
    /// Samples all three queues, logs their statistics and raises backlog alerts.
    /// </summary>
    /// <param name="now">Timestamp of the current cycle; stamped onto backlog alerts.</param>
    /// <param name="cancellationToken">Cancels the queue queries and alert publishing.</param>
    private async Task CollectAndReportMetricsAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        // Resolver queue metrics
        var resolverStats = await _resolverMetrics.GetQueueStatsAsync(cancellationToken).ConfigureAwait(false);
        ReportQueueMetrics("resolver", resolverStats);

        // Evaluation queue metrics
        var evalStats = await _evaluationMetrics.GetQueueStatsAsync(cancellationToken).ConfigureAwait(false);
        ReportQueueMetrics("evaluation", evalStats);

        // Export queue metrics
        var exportStats = await _exportMetrics.GetQueueStatsAsync(cancellationToken).ConfigureAwait(false);
        ReportQueueMetrics("export", exportStats);

        // Log summary
        _logger.LogDebug(
            "Queue stats - Resolver: {ResolverDepth}, Evaluation: {EvalDepth}, Export: {ExportDepth}",
            resolverStats.QueueDepth,
            evalStats.QueueDepth,
            exportStats.QueueDepth);

        // Check for backlog alerts
        await CheckBacklogAlertsAsync(resolverStats, evalStats, exportStats, now, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Writes one debug log line for a queue. A missing oldest-item age is logged as 0 seconds.
    /// </summary>
    private void ReportQueueMetrics(string queueType, QueueStats stats)
    {
        // These would typically be reported via the metrics system
        // For now, we're using the existing SchedulerWorkerMetrics
        _logger.LogDebug(
            "{QueueType} queue: depth={Depth}, oldest={OldestAge}s, throughput={Throughput}/s",
            queueType,
            stats.QueueDepth,
            stats.OldestItemAge?.TotalSeconds ?? 0,
            stats.ThroughputPerSecond);
    }

    /// <summary>
    /// Raises at most one backlog alert per queue. The resolver queue has both a warning and
    /// a critical level, the evaluation queue only a critical level, and the export queue
    /// only a warning level.
    /// </summary>
    private async Task CheckBacklogAlertsAsync(
        QueueStats resolverStats,
        QueueStats evalStats,
        QueueStats exportStats,
        DateTimeOffset now,
        CancellationToken cancellationToken)
    {
        // Resolver backlog
        await PublishBacklogAlertAsync(
            "resolver", "Resolver", resolverStats.QueueDepth,
            BacklogWarningThreshold, BacklogCriticalThreshold,
            now, cancellationToken).ConfigureAwait(false);

        // Evaluation backlog
        await PublishBacklogAlertAsync(
            "evaluation", "Evaluation", evalStats.QueueDepth,
            null, BacklogCriticalThreshold,
            now, cancellationToken).ConfigureAwait(false);

        // Export backlog
        await PublishBacklogAlertAsync(
            "export", "Export", exportStats.QueueDepth,
            BacklogWarningThreshold, null,
            now, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Publishes a critical or warning backlog alert for one queue when its depth reaches the
    /// corresponding threshold; a null threshold disables that level. The critical level wins
    /// when both apply. Alert id and timestamp derive from the same <paramref name="now"/> value.
    /// </summary>
    private async Task PublishBacklogAlertAsync(
        string source,
        string label,
        int depth,
        int? warningThreshold,
        int? criticalThreshold,
        DateTimeOffset now,
        CancellationToken cancellationToken)
    {
        MonitoringAlert alert;

        if (criticalThreshold is { } critical && depth >= critical)
        {
            alert = new MonitoringAlert(
                AlertId: $"{source}-backlog-critical-{now.Ticks}",
                Type: AlertType.BacklogCritical,
                Source: source,
                Message: $"{label} queue backlog critical: {depth} items",
                Severity: AlertSeverity.Critical,
                Value: depth,
                Threshold: critical,
                Timestamp: now);
        }
        else if (warningThreshold is { } warning && depth >= warning)
        {
            alert = new MonitoringAlert(
                AlertId: $"{source}-backlog-warning-{now.Ticks}",
                Type: AlertType.BacklogWarning,
                Source: source,
                Message: $"{label} queue backlog elevated: {depth} items",
                Severity: AlertSeverity.Warning,
                Value: depth,
                Threshold: warning,
                Timestamp: now);
        }
        else
        {
            return;
        }

        await _alertPublisher.PublishAlertAsync(alert, cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Queries the breach detector for resolver and evaluation jobs and, for every breach,
    /// logs a warning and publishes a high-severity alert.
    /// </summary>
    /// <param name="now">Timestamp of the current cycle; passed to the detector and stamped onto alerts.</param>
    /// <param name="cancellationToken">Cancels detection and alert publishing.</param>
    private async Task CheckSlaBreachesAsync(DateTimeOffset now, CancellationToken cancellationToken)
    {
        // Check resolver SLA breaches
        var resolverBreaches = await _slaBreachDetector.DetectResolverBreachesAsync(
            now,
            cancellationToken).ConfigureAwait(false);

        foreach (var breach in resolverBreaches)
        {
            _logger.LogWarning(
                "Resolver SLA breach: Job {JobId}, tenant {TenantId}, age {Age}",
                breach.JobId,
                breach.TenantId,
                breach.Age);

            await _alertPublisher.PublishAlertAsync(
                CreateSlaBreachAlert("resolver", "Resolver", breach, now),
                cancellationToken).ConfigureAwait(false);
        }

        // Check evaluation SLA breaches
        var evalBreaches = await _slaBreachDetector.DetectEvaluationBreachesAsync(
            now,
            cancellationToken).ConfigureAwait(false);

        foreach (var breach in evalBreaches)
        {
            _logger.LogWarning(
                "Evaluation SLA breach: Job {JobId}, tenant {TenantId}, age {Age}",
                breach.JobId,
                breach.TenantId,
                breach.Age);

            await _alertPublisher.PublishAlertAsync(
                CreateSlaBreachAlert("evaluation", "Evaluation", breach, now),
                cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Builds the alert for a single SLA breach. Value and threshold are expressed in whole
    /// seconds; job and tenant ids are attached as metadata.
    /// </summary>
    private static MonitoringAlert CreateSlaBreachAlert(
        string source,
        string label,
        SlaBreach breach,
        DateTimeOffset now)
    {
        var metadata = new Dictionary<string, string>
        {
            ["job_id"] = breach.JobId,
            ["tenant_id"] = breach.TenantId
        }.ToImmutableDictionary();

        return new MonitoringAlert(
            AlertId: $"{source}-sla-breach-{breach.JobId}",
            Type: AlertType.SlaBreach,
            Source: source,
            Message: $"{label} job {breach.JobId} exceeded SLA: {breach.Age.TotalMinutes:F1} minutes",
            Severity: AlertSeverity.High,
            Value: (long)breach.Age.TotalSeconds,
            Threshold: (long)breach.SlaThreshold.TotalSeconds,
            Timestamp: now,
            Metadata: metadata);
    }
}
|
||||
|
||||
/// <summary>
/// Supplies statistics about the resolver queue.
/// </summary>
public interface IResolverQueueMetrics
{
    /// <summary>Gets the current statistics of the resolver queue.</summary>
    /// <param name="cancellationToken">Token that cancels the query.</param>
    /// <returns>The queue statistics.</returns>
    ValueTask<QueueStats> GetQueueStatsAsync(
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Supplies statistics about the evaluation queue.
/// </summary>
public interface IEvaluationQueueMetrics
{
    /// <summary>Gets the current statistics of the evaluation queue.</summary>
    /// <param name="cancellationToken">Token that cancels the query.</param>
    /// <returns>The queue statistics.</returns>
    ValueTask<QueueStats> GetQueueStatsAsync(
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Supplies statistics about the export queue.
/// </summary>
public interface IExportQueueMetrics
{
    /// <summary>Gets the current statistics of the export queue.</summary>
    /// <param name="cancellationToken">Token that cancels the query.</param>
    /// <returns>The queue statistics.</returns>
    ValueTask<QueueStats> GetQueueStatsAsync(
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Detects jobs that have exceeded their SLA.
/// </summary>
public interface ISlaBreachDetector
{
    /// <summary>Finds resolver jobs in breach of their SLA.</summary>
    /// <param name="now">Reference time supplied by the caller.</param>
    /// <param name="cancellationToken">Token that cancels the detection.</param>
    /// <returns>The detected breaches.</returns>
    ValueTask<IReadOnlyList<SlaBreach>> DetectResolverBreachesAsync(
        DateTimeOffset now,
        CancellationToken cancellationToken = default);

    /// <summary>Finds evaluation jobs in breach of their SLA.</summary>
    /// <param name="now">Reference time supplied by the caller.</param>
    /// <param name="cancellationToken">Token that cancels the detection.</param>
    /// <returns>The detected breaches.</returns>
    ValueTask<IReadOnlyList<SlaBreach>> DetectEvaluationBreachesAsync(
        DateTimeOffset now,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Publishes monitoring alerts.
/// </summary>
public interface IMonitoringAlertPublisher
{
    /// <summary>Publishes a single alert.</summary>
    /// <param name="alert">The alert to publish.</param>
    /// <param name="cancellationToken">Token that cancels publishing.</param>
    ValueTask PublishAlertAsync(
        MonitoringAlert alert,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Snapshot of a queue's statistics.
/// </summary>
/// <param name="QueueDepth">Number of items in the queue.</param>
/// <param name="OldestItemAge">Age of the oldest item, or null when no age is available.</param>
/// <param name="ThroughputPerSecond">Throughput in items per second.</param>
/// <param name="ProcessedLastMinute">Processed-item count for the last minute.</param>
/// <param name="FailedLastMinute">Failed-item count for the last minute.</param>
public sealed record QueueStats(
    int QueueDepth,
    TimeSpan? OldestItemAge,
    double ThroughputPerSecond,
    int ProcessedLastMinute,
    int FailedLastMinute);
|
||||
|
||||
/// <summary>
/// Describes one job that exceeded its SLA.
/// </summary>
/// <param name="JobId">Identifier of the breaching job.</param>
/// <param name="TenantId">Tenant the job belongs to.</param>
/// <param name="Age">Age of the job at detection time.</param>
/// <param name="SlaThreshold">The SLA limit that was exceeded.</param>
/// <param name="StartedAt">Start time of the job.</param>
public sealed record SlaBreach(
    string JobId,
    string TenantId,
    TimeSpan Age,
    TimeSpan SlaThreshold,
    DateTimeOffset StartedAt);
|
||||
|
||||
/// <summary>
/// An alert raised by monitoring.
/// </summary>
/// <param name="AlertId">Identifier of the alert.</param>
/// <param name="Type">Kind of condition being reported.</param>
/// <param name="Source">Name of the queue or component the alert concerns.</param>
/// <param name="Message">Human-readable description.</param>
/// <param name="Severity">Severity of the alert.</param>
/// <param name="Value">Observed value that triggered the alert.</param>
/// <param name="Threshold">Limit the observed value was compared against.</param>
/// <param name="Timestamp">Time associated with the alert.</param>
/// <param name="Metadata">Optional key/value details; null when none are attached.</param>
public sealed record MonitoringAlert(
    string AlertId,
    AlertType Type,
    string Source,
    string Message,
    AlertSeverity Severity,
    long Value,
    long Threshold,
    DateTimeOffset Timestamp,
    ImmutableDictionary<string, string>? Metadata = null);
|
||||
|
||||
/// <summary>
/// Kinds of monitoring alert.
/// </summary>
public enum AlertType
{
    /// <summary>Queue backlog reached the warning level.</summary>
    BacklogWarning,

    /// <summary>Queue backlog reached the critical level.</summary>
    BacklogCritical,

    /// <summary>A job exceeded its SLA.</summary>
    SlaBreach,

    /// <summary>Throughput dropped.</summary>
    ThroughputDrop,

    /// <summary>Error rate is high.</summary>
    ErrorRateHigh,

    /// <summary>Service is degraded.</summary>
    ServiceDegraded
}
|
||||
|
||||
/// <summary>
/// Severity levels of a monitoring alert, declared from least to most severe.
/// </summary>
public enum AlertSeverity
{
    /// <summary>Informational.</summary>
    Info,

    /// <summary>Warning.</summary>
    Warning,

    /// <summary>High severity.</summary>
    High,

    /// <summary>Critical severity.</summary>
    Critical
}
|
||||
|
||||
/// <summary>
/// Resolver queue metrics source that always reports an empty, idle queue.
/// </summary>
public sealed class NullResolverQueueMetrics : IResolverQueueMetrics
{
    /// <summary>Shared instance.</summary>
    public static NullResolverQueueMetrics Instance { get; } = new NullResolverQueueMetrics();

    /// <summary>Returns all-zero statistics with no oldest-item age.</summary>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    /// <returns>An already-completed task holding the empty statistics.</returns>
    public ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default)
    {
        var idle = new QueueStats(
            QueueDepth: 0,
            OldestItemAge: null,
            ThroughputPerSecond: 0,
            ProcessedLastMinute: 0,
            FailedLastMinute: 0);

        return ValueTask.FromResult(idle);
    }
}
|
||||
|
||||
/// <summary>
/// Evaluation queue metrics source that always reports an empty, idle queue.
/// </summary>
public sealed class NullEvaluationQueueMetrics : IEvaluationQueueMetrics
{
    /// <summary>Shared instance.</summary>
    public static NullEvaluationQueueMetrics Instance { get; } = new NullEvaluationQueueMetrics();

    /// <summary>Returns all-zero statistics with no oldest-item age.</summary>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    /// <returns>An already-completed task holding the empty statistics.</returns>
    public ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default)
    {
        var idle = new QueueStats(
            QueueDepth: 0,
            OldestItemAge: null,
            ThroughputPerSecond: 0,
            ProcessedLastMinute: 0,
            FailedLastMinute: 0);

        return ValueTask.FromResult(idle);
    }
}
|
||||
|
||||
/// <summary>
/// Export queue metrics source that always reports an empty, idle queue.
/// </summary>
public sealed class NullExportQueueMetrics : IExportQueueMetrics
{
    /// <summary>Shared instance.</summary>
    public static NullExportQueueMetrics Instance { get; } = new NullExportQueueMetrics();

    /// <summary>Returns all-zero statistics with no oldest-item age.</summary>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    /// <returns>An already-completed task holding the empty statistics.</returns>
    public ValueTask<QueueStats> GetQueueStatsAsync(CancellationToken cancellationToken = default)
    {
        var idle = new QueueStats(
            QueueDepth: 0,
            OldestItemAge: null,
            ThroughputPerSecond: 0,
            ProcessedLastMinute: 0,
            FailedLastMinute: 0);

        return ValueTask.FromResult(idle);
    }
}
|
||||
|
||||
/// <summary>
/// SLA breach detector that never reports a breach.
/// </summary>
public sealed class NullSlaBreachDetector : ISlaBreachDetector
{
    /// <summary>Shared instance.</summary>
    public static NullSlaBreachDetector Instance { get; } = new NullSlaBreachDetector();

    /// <summary>Returns an empty list of resolver breaches.</summary>
    /// <param name="now">Not consulted by this implementation.</param>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    public ValueTask<IReadOnlyList<SlaBreach>> DetectResolverBreachesAsync(
        DateTimeOffset now,
        CancellationToken cancellationToken = default)
    {
        IReadOnlyList<SlaBreach> none = [];
        return ValueTask.FromResult(none);
    }

    /// <summary>Returns an empty list of evaluation breaches.</summary>
    /// <param name="now">Not consulted by this implementation.</param>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    public ValueTask<IReadOnlyList<SlaBreach>> DetectEvaluationBreachesAsync(
        DateTimeOffset now,
        CancellationToken cancellationToken = default)
    {
        IReadOnlyList<SlaBreach> none = [];
        return ValueTask.FromResult(none);
    }
}
|
||||
|
||||
/// <summary>
/// Alert publisher that discards every alert.
/// </summary>
public sealed class NullMonitoringAlertPublisher : IMonitoringAlertPublisher
{
    /// <summary>Shared instance.</summary>
    public static NullMonitoringAlertPublisher Instance { get; } = new NullMonitoringAlertPublisher();

    /// <summary>Does nothing and completes immediately.</summary>
    /// <param name="alert">Ignored.</param>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    public ValueTask PublishAlertAsync(MonitoringAlert alert, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,479 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Resolver;
|
||||
|
||||
/// <summary>
/// Resolver worker per SCHED-WORKER-29-001.
/// Generates candidate findings from inventory + advisory evidence,
/// respects ecosystem version semantics and path scope,
/// and emits jobs for policy evaluation.
/// </summary>
/// <remarks>
/// The worker repeatedly dequeues a batch of resolver jobs and processes them one by one.
/// A failure inside a single job is logged and does not affect the remaining jobs of the
/// batch; a failure of the loop itself is logged and retried after the configured back-off.
/// </remarks>
public sealed class ResolverWorker : BackgroundService
{
    private readonly IResolverJobQueue _jobQueue;
    private readonly IInventoryProvider _inventoryProvider;
    private readonly IAdvisoryProvider _advisoryProvider;
    private readonly IVersionMatcher _versionMatcher;
    private readonly ICandidateFindingEmitter _findingEmitter;
    private readonly IPolicyEvaluationJobEmitter _evaluationJobEmitter;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<ResolverWorker> _logger;

    /// <summary>
    /// Creates the worker. Every dependency except <paramref name="timeProvider"/> is required.
    /// </summary>
    /// <param name="jobQueue">Queue the resolver jobs are dequeued from.</param>
    /// <param name="inventoryProvider">Loads the component inventory of an artifact.</param>
    /// <param name="advisoryProvider">Loads advisories for a set of ecosystems.</param>
    /// <param name="versionMatcher">Decides whether a component is affected by an advisory.</param>
    /// <param name="findingEmitter">Receives the generated candidate findings.</param>
    /// <param name="evaluationJobEmitter">Receives the follow-up policy evaluation job.</param>
    /// <param name="options">Scheduler worker options (batch size, idle delay, retry back-off).</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Scheduler worker metrics.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public ResolverWorker(
        IResolverJobQueue jobQueue,
        IInventoryProvider inventoryProvider,
        IAdvisoryProvider advisoryProvider,
        IVersionMatcher versionMatcher,
        ICandidateFindingEmitter findingEmitter,
        IPolicyEvaluationJobEmitter evaluationJobEmitter,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<ResolverWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _inventoryProvider = inventoryProvider ?? throw new ArgumentNullException(nameof(inventoryProvider));
        _advisoryProvider = advisoryProvider ?? throw new ArgumentNullException(nameof(advisoryProvider));
        _versionMatcher = versionMatcher ?? throw new ArgumentNullException(nameof(versionMatcher));
        _findingEmitter = findingEmitter ?? throw new ArgumentNullException(nameof(findingEmitter));
        _evaluationJobEmitter = evaluationJobEmitter ?? throw new ArgumentNullException(nameof(evaluationJobEmitter));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Dequeues and processes resolver jobs until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Signals host shutdown.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Resolver worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue resolver jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessResolverJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in resolver worker loop.");

                // A shutdown during the back-off must end the loop normally rather than
                // throw out of the catch block and skip the "stopped" log line.
                if (!await TryDelayAsync(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false))
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Resolver worker stopped.");
    }

    /// <summary>
    /// Waits for <paramref name="delay"/>; returns false when the wait was cut short by cancellation.
    /// </summary>
    private static async Task<bool> TryDelayAsync(TimeSpan delay, CancellationToken cancellationToken)
    {
        try
        {
            await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
            return true;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            return false;
        }
    }

    /// <summary>
    /// Resolves one job: loads the inventory, loads advisories for the ecosystems present,
    /// matches components against advisories, and emits the resulting candidate findings
    /// together with a policy evaluation job. Nothing is emitted when there are no findings.
    /// </summary>
    /// <param name="job">The job to process.</param>
    /// <param name="cancellationToken">Cancels the work; cancellation through this token propagates to the caller.</param>
    private async Task ProcessResolverJobAsync(
        ResolverJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing resolver job {JobId} for tenant {TenantId}, artifact {ArtifactId}.",
            job.JobId,
            job.TenantId,
            job.ArtifactId);

        try
        {
            // 1. Load inventory for the artifact
            var inventory = await _inventoryProvider.GetInventoryAsync(
                job.TenantId,
                job.ArtifactId,
                cancellationToken).ConfigureAwait(false);

            // IsDefaultOrEmpty also covers an uninitialised ImmutableArray, on which Length would throw.
            if (inventory.Components.IsDefaultOrEmpty)
            {
                _logger.LogDebug(
                    "No components found in inventory for artifact {ArtifactId}.",
                    job.ArtifactId);
                return;
            }

            // 2. Get relevant advisories
            var ecosystems = inventory.Components
                .Select(c => c.Ecosystem)
                .Distinct()
                .ToList();

            var advisories = await _advisoryProvider.GetAdvisoriesForEcosystemsAsync(
                ecosystems,
                job.AdvisoryFilter,
                cancellationToken).ConfigureAwait(false);

            _logger.LogDebug(
                "Found {ComponentCount} components and {AdvisoryCount} advisories for job {JobId}.",
                inventory.Components.Length,
                advisories.Count,
                job.JobId);

            // 3. Match components against advisories
            var candidateFindings = await MatchCandidatesAsync(
                job,
                inventory.Components,
                advisories,
                cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Generated {FindingCount} candidate findings for job {JobId}.",
                candidateFindings.Count,
                job.JobId);

            // 4. Emit candidate findings
            if (candidateFindings.Count > 0)
            {
                await _findingEmitter.EmitAsync(
                    job.TenantId,
                    candidateFindings,
                    cancellationToken).ConfigureAwait(false);

                // 5. Emit policy evaluation job
                await _evaluationJobEmitter.EmitAsync(
                    new PolicyEvaluationJob(
                        JobId: $"eval-{job.JobId}",
                        TenantId: job.TenantId,
                        ArtifactId: job.ArtifactId,
                        ResolverJobId: job.JobId,
                        CandidateFindingIds: [.. candidateFindings.Select(f => f.FindingId)],
                        RequestedAt: _timeProvider.GetUtcNow()),
                    cancellationToken).ConfigureAwait(false);
            }

            var duration = _timeProvider.GetUtcNow() - startedAt;

            _logger.LogInformation(
                "Resolver job {JobId} completed: {ComponentCount} components, {FindingCount} findings in {Duration}ms.",
                job.JobId,
                inventory.Components.Length,
                candidateFindings.Count,
                duration.TotalMilliseconds);
        }
        // Only cancellation requested through our own token propagates. A cancellation raised
        // for another reason (for example a dependency timing out) is a failure of this job
        // alone and must not abandon the remaining jobs of the dequeued batch.
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            _logger.LogError(
                ex,
                "Resolver job {JobId} failed.",
                job.JobId);
        }
    }

    /// <summary>
    /// Builds a candidate finding for every in-scope component / advisory pair of the same
    /// ecosystem that the version matcher reports as affected.
    /// </summary>
    /// <param name="job">The job being processed; supplies ids and the optional path scope.</param>
    /// <param name="components">Components of the artifact's inventory.</param>
    /// <param name="advisories">Advisories loaded for the job.</param>
    /// <param name="cancellationToken">Cancels the version matching.</param>
    /// <returns>The findings in component order, then advisory order.</returns>
    private async Task<List<CandidateFinding>> MatchCandidatesAsync(
        ResolverJob job,
        ImmutableArray<InventoryComponent> components,
        IReadOnlyList<SecurityAdvisory> advisories,
        CancellationToken cancellationToken)
    {
        // Group once instead of re-filtering the full advisory list for every component.
        // The lookup keeps advisory order within each ecosystem and yields an empty
        // sequence for ecosystems without advisories.
        var advisoriesByEcosystem = advisories.ToLookup(a => a.Ecosystem);

        var findings = new List<CandidateFinding>();

        foreach (var component in components)
        {
            // Apply path scope filter if specified
            if (job.PathScope is not null && !MatchesPathScope(component.FilePath, job.PathScope))
            {
                continue;
            }

            foreach (var advisory in advisoriesByEcosystem[component.Ecosystem])
            {
                // Check if component matches advisory affected range
                var isAffected = await _versionMatcher.IsAffectedAsync(
                    component,
                    advisory,
                    cancellationToken).ConfigureAwait(false);

                if (!isAffected)
                {
                    continue;
                }

                findings.Add(new CandidateFinding(
                    FindingId: $"{job.JobId}:{component.Purl}:{advisory.AdvisoryId}",
                    JobId: job.JobId,
                    TenantId: job.TenantId,
                    ArtifactId: job.ArtifactId,
                    ComponentPurl: component.Purl,
                    ComponentVersion: component.Version,
                    ComponentEcosystem: component.Ecosystem,
                    VulnerabilityId: advisory.VulnerabilityId,
                    AdvisoryId: advisory.AdvisoryId,
                    Severity: advisory.Severity,
                    AffectedRange: advisory.AffectedRange,
                    FixedVersion: advisory.FixedVersion,
                    FilePath: component.FilePath,
                    MatchedAt: _timeProvider.GetUtcNow()));
            }
        }

        return findings;
    }

    /// <summary>
    /// Decides whether a component's file path falls inside a path scope.
    /// </summary>
    /// <param name="filePath">Path of the component; null or empty means no path is known.</param>
    /// <param name="scope">The scope to test against.</param>
    /// <returns>
    /// For a missing path, the scope's <c>IncludeRootLevel</c> flag. Otherwise true when the path
    /// matches at least one include pattern (or no include patterns exist) and matches no
    /// exclude pattern.
    /// </returns>
    private static bool MatchesPathScope(string? filePath, PathScope scope)
    {
        if (string.IsNullOrEmpty(filePath))
        {
            return scope.IncludeRootLevel;
        }

        // Check include patterns (an uninitialised array counts as "no patterns")
        if (!scope.IncludePatterns.IsDefaultOrEmpty
            && !scope.IncludePatterns.Any(p => MatchesGlob(filePath, p)))
        {
            return false;
        }

        // Check exclude patterns
        if (!scope.ExcludePatterns.IsDefaultOrEmpty
            && scope.ExcludePatterns.Any(p => MatchesGlob(filePath, p)))
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Case-insensitive glob match of a whole path. <c>**</c> matches any characters,
    /// <c>*</c> matches any characters except <c>/</c>; everything else is literal.
    /// </summary>
    private static bool MatchesGlob(string path, string pattern)
    {
        // Simple glob matching (supports * and **).
        // "**" must be rewritten before "*": after escaping, both appear as "\*" sequences.
        var regexPattern = "^" + System.Text.RegularExpressions.Regex.Escape(pattern)
            .Replace(@"\*\*", ".*")
            .Replace(@"\*", "[^/]*") + "$";

        return System.Text.RegularExpressions.Regex.IsMatch(path, regexPattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }
}
|
||||
|
||||
/// <summary>
/// Queue of resolver jobs.
/// </summary>
public interface IResolverJobQueue
{
    /// <summary>Removes and returns up to <paramref name="maxCount"/> jobs.</summary>
    /// <param name="maxCount">Upper bound on the number of jobs returned.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    /// <returns>The dequeued jobs; callers treat an empty list as "nothing to do".</returns>
    ValueTask<IReadOnlyList<ResolverJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>Adds a job to the queue.</summary>
    /// <param name="job">The job to add.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    ValueTask EnqueueAsync(
        ResolverJob job,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Provides the component inventory of an artifact.
/// </summary>
public interface IInventoryProvider
{
    /// <summary>Loads the inventory of one artifact of one tenant.</summary>
    /// <param name="tenantId">Tenant that owns the artifact.</param>
    /// <param name="artifactId">Artifact whose inventory is requested.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    /// <returns>The component inventory.</returns>
    ValueTask<ComponentInventory> GetInventoryAsync(
        string tenantId,
        string artifactId,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Provides security advisories.
/// </summary>
public interface IAdvisoryProvider
{
    /// <summary>Loads the advisories for the given ecosystems.</summary>
    /// <param name="ecosystems">Ecosystems to load advisories for.</param>
    /// <param name="filter">Optional filter; null when the caller specified none.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    /// <returns>The advisories.</returns>
    ValueTask<IReadOnlyList<SecurityAdvisory>> GetAdvisoriesForEcosystemsAsync(
        IReadOnlyList<string> ecosystems,
        AdvisoryFilter? filter,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Matches component versions against advisories.
/// </summary>
public interface IVersionMatcher
{
    /// <summary>Determines whether a component is affected by an advisory.</summary>
    /// <param name="component">The inventory component to test.</param>
    /// <param name="advisory">The advisory to test against.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    /// <returns>True when the component is affected.</returns>
    ValueTask<bool> IsAffectedAsync(
        InventoryComponent component,
        SecurityAdvisory advisory,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Emits candidate findings.
/// </summary>
public interface ICandidateFindingEmitter
{
    /// <summary>Emits a set of candidate findings for a tenant.</summary>
    /// <param name="tenantId">Tenant the findings belong to.</param>
    /// <param name="findings">The findings to emit.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    ValueTask EmitAsync(
        string tenantId,
        IReadOnlyList<CandidateFinding> findings,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Emits policy evaluation jobs.
/// </summary>
public interface IPolicyEvaluationJobEmitter
{
    /// <summary>Emits one policy evaluation job.</summary>
    /// <param name="job">The job to emit.</param>
    /// <param name="cancellationToken">Token that cancels the operation.</param>
    ValueTask EmitAsync(
        PolicyEvaluationJob job,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// A unit of work for the resolver.
/// </summary>
/// <param name="JobId">Identifier of the job.</param>
/// <param name="TenantId">Tenant the job belongs to.</param>
/// <param name="ArtifactId">Artifact whose inventory is to be resolved.</param>
/// <param name="RequestedAt">Time the job was requested.</param>
/// <param name="AdvisoryFilter">Optional advisory filter; null for none.</param>
/// <param name="PathScope">Optional path scope; null means components are not filtered by path.</param>
public sealed record ResolverJob(
    string JobId,
    string TenantId,
    string ArtifactId,
    DateTimeOffset RequestedAt,
    AdvisoryFilter? AdvisoryFilter = null,
    PathScope? PathScope = null);
|
||||
|
||||
/// <summary>
/// Criteria for selecting advisories. How each criterion is applied is up to the
/// advisory provider that receives the filter.
/// </summary>
/// <param name="Severities">Severity values of interest.</param>
/// <param name="PublishedAfter">Optional publication-date bound; null for none.</param>
/// <param name="IncludeWithdrawn">Whether withdrawn advisories are wanted; defaults to false.</param>
/// <param name="OnlyKev">Whether only KEV advisories are wanted; defaults to false.</param>
public sealed record AdvisoryFilter(
    ImmutableArray<string> Severities,
    DateTimeOffset? PublishedAfter = null,
    bool IncludeWithdrawn = false,
    bool OnlyKev = false);
|
||||
|
||||
/// <summary>
/// Restricts processing to components by file path.
/// </summary>
/// <param name="IncludePatterns">Glob patterns; when non-empty, a path must match at least one.</param>
/// <param name="ExcludePatterns">Glob patterns; a path matching any of them is out of scope.</param>
/// <param name="IncludeRootLevel">Whether components without a file path are in scope; defaults to true.</param>
public sealed record PathScope(
    ImmutableArray<string> IncludePatterns,
    ImmutableArray<string> ExcludePatterns,
    bool IncludeRootLevel = true);
|
||||
|
||||
/// <summary>
/// The components found in one artifact.
/// </summary>
/// <param name="ArtifactId">Artifact the inventory describes.</param>
/// <param name="TenantId">Tenant that owns the artifact.</param>
/// <param name="Components">The components of the artifact.</param>
/// <param name="GeneratedAt">Time the inventory was generated.</param>
public sealed record ComponentInventory(
    string ArtifactId,
    string TenantId,
    ImmutableArray<InventoryComponent> Components,
    DateTimeOffset GeneratedAt);
|
||||
|
||||
/// <summary>
/// One component of an inventory.
/// </summary>
/// <param name="Purl">Package URL of the component.</param>
/// <param name="Name">Name of the component.</param>
/// <param name="Version">Version of the component.</param>
/// <param name="Ecosystem">Ecosystem of the component.</param>
/// <param name="FilePath">Optional file path of the component; null when unknown.</param>
/// <param name="IsDirect">Direct-dependency flag; defaults to true.</param>
public sealed record InventoryComponent(
    string Purl,
    string Name,
    string Version,
    string Ecosystem,
    string? FilePath = null,
    bool IsDirect = true);
|
||||
|
||||
/// <summary>
/// One security advisory.
/// </summary>
/// <param name="AdvisoryId">Identifier of the advisory.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability the advisory covers.</param>
/// <param name="Ecosystem">Ecosystem the advisory applies to.</param>
/// <param name="Severity">Severity of the advisory.</param>
/// <param name="AffectedRange">Affected version range.</param>
/// <param name="FixedVersion">Version containing the fix; null when none is recorded.</param>
/// <param name="PublishedAt">Publication time.</param>
/// <param name="IsKev">KEV flag; defaults to false.</param>
/// <param name="IsWithdrawn">Withdrawn flag; defaults to false.</param>
public sealed record SecurityAdvisory(
    string AdvisoryId,
    string VulnerabilityId,
    string Ecosystem,
    string Severity,
    string AffectedRange,
    string? FixedVersion,
    DateTimeOffset PublishedAt,
    bool IsKev = false,
    bool IsWithdrawn = false);
|
||||
|
||||
/// <summary>
/// A candidate finding produced by the resolver: one component matched against one advisory.
/// </summary>
/// <param name="FindingId">Identifier of the finding.</param>
/// <param name="JobId">Resolver job that produced the finding.</param>
/// <param name="TenantId">Tenant the finding belongs to.</param>
/// <param name="ArtifactId">Artifact the finding belongs to.</param>
/// <param name="ComponentPurl">Package URL of the matched component.</param>
/// <param name="ComponentVersion">Version of the matched component.</param>
/// <param name="ComponentEcosystem">Ecosystem of the matched component.</param>
/// <param name="VulnerabilityId">Vulnerability identifier taken from the advisory.</param>
/// <param name="AdvisoryId">Identifier of the matched advisory.</param>
/// <param name="Severity">Severity taken from the advisory.</param>
/// <param name="AffectedRange">Affected range taken from the advisory.</param>
/// <param name="FixedVersion">Fixed version taken from the advisory; may be null.</param>
/// <param name="FilePath">File path of the component; may be null.</param>
/// <param name="MatchedAt">Time the match was made.</param>
public sealed record CandidateFinding(
    string FindingId,
    string JobId,
    string TenantId,
    string ArtifactId,
    string ComponentPurl,
    string ComponentVersion,
    string ComponentEcosystem,
    string VulnerabilityId,
    string AdvisoryId,
    string Severity,
    string AffectedRange,
    string? FixedVersion,
    string? FilePath,
    DateTimeOffset MatchedAt);
|
||||
|
||||
/// <summary>
/// A request to evaluate candidate findings against policy.
/// </summary>
/// <param name="JobId">Identifier of the evaluation job.</param>
/// <param name="TenantId">Tenant the job belongs to.</param>
/// <param name="ArtifactId">Artifact the findings belong to.</param>
/// <param name="ResolverJobId">Resolver job that produced the findings.</param>
/// <param name="CandidateFindingIds">Identifiers of the findings to evaluate.</param>
/// <param name="RequestedAt">Time the evaluation was requested.</param>
public sealed record PolicyEvaluationJob(
    string JobId,
    string TenantId,
    string ArtifactId,
    string ResolverJobId,
    ImmutableArray<string> CandidateFindingIds,
    DateTimeOffset RequestedAt);
|
||||
|
||||
/// <summary>
/// In-memory implementation of resolver job queue.
/// </summary>
/// <remarks>
/// Jobs are kept in first-in, first-out order. Both operations take the same lock, so a
/// dequeued batch is removed as a unit with respect to concurrent callers.
/// </remarks>
public sealed class InMemoryResolverJobQueue : IResolverJobQueue
{
    private readonly Queue<ResolverJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes and returns up to <paramref name="maxCount"/> jobs, oldest first.
    /// </summary>
    /// <param name="maxCount">Upper bound on the batch size; zero or less yields an empty list.</param>
    /// <param name="cancellationToken">Not consulted; the operation completes synchronously.</param>
    /// <returns>An already-completed task holding the dequeued jobs (possibly none).</returns>
    public ValueTask<IReadOnlyList<ResolverJob>> DequeueAsync(int maxCount, CancellationToken cancellationToken = default)
    {
        List<ResolverJob> results;

        lock (_lock)
        {
            // Size the list for what will actually be returned.
            results = new List<ResolverJob>(Math.Max(0, Math.Min(maxCount, _queue.Count)));

            while (results.Count < maxCount && _queue.Count > 0)
            {
                results.Add(_queue.Dequeue());
            }
        }

        return ValueTask.FromResult<IReadOnlyList<ResolverJob>>(results);
    }

    /// <summary>
    /// Appends a job to the end of the queue.
    /// </summary>
    /// <param name="job">The job to add.</param>
    /// <param name="cancellationToken">Not consulted; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> is null.</exception>
    public ValueTask EnqueueAsync(ResolverJob job, CancellationToken cancellationToken = default)
    {
        // Reject null here: a queued null would only fail later, inside the consumer.
        ArgumentNullException.ThrowIfNull(job);

        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// Candidate finding emitter for testing that discards everything it is given.
/// </summary>
public sealed class NullCandidateFindingEmitter : ICandidateFindingEmitter
{
    /// <summary>Shared instance.</summary>
    public static NullCandidateFindingEmitter Instance { get; } = new NullCandidateFindingEmitter();

    /// <summary>Does nothing and completes immediately.</summary>
    /// <param name="tenantId">Ignored.</param>
    /// <param name="findings">Ignored.</param>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    public ValueTask EmitAsync(
        string tenantId,
        IReadOnlyList<CandidateFinding> findings,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// Policy evaluation job emitter for testing that discards every job.
/// </summary>
public sealed class NullPolicyEvaluationJobEmitter : IPolicyEvaluationJobEmitter
{
    /// <summary>Shared instance.</summary>
    public static NullPolicyEvaluationJobEmitter Instance { get; } = new NullPolicyEvaluationJobEmitter();

    /// <summary>Does nothing and completes immediately.</summary>
    /// <param name="job">Ignored.</param>
    /// <param name="cancellationToken">Not consulted by this implementation.</param>
    public ValueTask EmitAsync(PolicyEvaluationJob job, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,563 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Simulation;
|
||||
|
||||
/// <summary>
/// Policy batch simulation worker per SCHED-WORKER-27-301.
/// Shards SBOM inventories, invokes Policy Engine, emits partial results,
/// handles retries/backoff, and publishes progress events.
/// </summary>
public sealed class PolicyBatchSimulationWorker : BackgroundService
{
    private readonly ISimulationJobQueue _jobQueue;
    private readonly ISimulationSharder _sharder;
    private readonly IPolicyEngineClient _policyEngine;
    private readonly ISimulationResultStore _resultStore;
    private readonly ISimulationProgressPublisher _progressPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<PolicyBatchSimulationWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="jobQueue">Source of simulation jobs.</param>
    /// <param name="sharder">Splits a job's SBOM IDs into shards.</param>
    /// <param name="policyEngine">Evaluates a policy against one shard.</param>
    /// <param name="resultStore">Receives the per-shard results.</param>
    /// <param name="progressPublisher">Receives started/progress/completed/failed events.</param>
    /// <param name="options">Worker options; the <c>Policy</c> section is read.</param>
    /// <param name="timeProvider">Clock used for durations; <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics (stored; not used by the code in this class).</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public PolicyBatchSimulationWorker(
        ISimulationJobQueue jobQueue,
        ISimulationSharder sharder,
        IPolicyEngineClient policyEngine,
        ISimulationResultStore resultStore,
        ISimulationProgressPublisher progressPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<PolicyBatchSimulationWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _sharder = sharder ?? throw new ArgumentNullException(nameof(sharder));
        _policyEngine = policyEngine ?? throw new ArgumentNullException(nameof(policyEngine));
        _resultStore = resultStore ?? throw new ArgumentNullException(nameof(resultStore));
        _progressPublisher = progressPublisher ?? throw new ArgumentNullException(nameof(progressPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Polls the job queue until <paramref name="stoppingToken"/> is cancelled and processes
    /// each dequeued job sequentially. Returns immediately when the policy feature is disabled.
    /// </summary>
    /// <param name="stoppingToken">Signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Policy batch simulation worker is disabled.");
            return;
        }

        _logger.LogInformation("Policy batch simulation worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue simulation jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessSimulationJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in policy batch simulation worker loop.");

                // The backoff delay is itself cancellable. Without this guard a shutdown during
                // the delay would escape ExecuteAsync as an exception and skip the "stopped" log.
                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Policy batch simulation worker stopped.");
    }

    /// <summary>
    /// Runs one simulation job: publishes "started", shards the SBOM IDs, evaluates and stores
    /// each shard, publishes progress after each stored shard, then publishes completion.
    /// Any non-cancellation failure outside per-shard handling is reported via
    /// <see cref="ISimulationProgressPublisher.PublishFailedAsync"/>.
    /// </summary>
    /// <param name="job">Job to process.</param>
    /// <param name="cancellationToken">Cancellation token; cancellation propagates to the caller.</param>
    private async Task ProcessSimulationJobAsync(
        SimulationJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing simulation job {JobId} for tenant {TenantId}, policy {PolicyId}.",
            job.JobId,
            job.TenantId,
            job.PolicyId);

        try
        {
            // Publish job started
            await _progressPublisher.PublishStartedAsync(job, cancellationToken).ConfigureAwait(false);

            // Shard the SBOM inventory
            var shards = await _sharder.ShardInventoryAsync(
                job.TenantId,
                job.SbomIds,
                cancellationToken).ConfigureAwait(false);

            _logger.LogDebug(
                "Sharded {SbomCount} SBOMs into {ShardCount} shards for job {JobId}.",
                job.SbomIds.Length,
                shards.Count,
                job.JobId);

            var completedShards = 0;
            var totalFindings = 0;
            var failedShards = new List<string>();

            foreach (var shard in shards)
            {
                SimulationShardResult result;

                try
                {
                    // Process shard with retry
                    result = await ProcessShardWithRetryAsync(
                        job,
                        shard,
                        cancellationToken).ConfigureAwait(false);

                    // Store partial result
                    await _resultStore.StorePartialResultAsync(
                        job.JobId,
                        shard.ShardId,
                        result,
                        cancellationToken).ConfigureAwait(false);
                }
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    _logger.LogError(
                        ex,
                        "Failed to process shard {ShardId} for job {JobId}.",
                        shard.ShardId,
                        job.JobId);

                    failedShards.Add(shard.ShardId);
                    continue;
                }

                completedShards++;
                totalFindings += result.FindingsCount;

                // The shard is already evaluated and stored at this point. A failure to publish
                // the progress event must not also list it as failed, otherwise it would be
                // counted both as completed and as failed and skew the final status.
                try
                {
                    await _progressPublisher.PublishProgressAsync(
                        job,
                        completedShards,
                        shards.Count,
                        totalFindings,
                        cancellationToken).ConfigureAwait(false);
                }
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    _logger.LogWarning(
                        ex,
                        "Failed to publish progress for shard {ShardId} of job {JobId}.",
                        shard.ShardId,
                        job.JobId);
                }
            }

            var duration = _timeProvider.GetUtcNow() - startedAt;

            // No failures => Completed (this includes a job with zero shards);
            // every shard failed => Failed; anything in between => PartiallyCompleted.
            var status = failedShards.Count == 0
                ? SimulationStatus.Completed
                : failedShards.Count == shards.Count
                    ? SimulationStatus.Failed
                    : SimulationStatus.PartiallyCompleted;

            // Publish completion
            await _progressPublisher.PublishCompletedAsync(
                job,
                status,
                completedShards,
                shards.Count,
                totalFindings,
                [.. failedShards],
                duration,
                cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Simulation job {JobId} completed with status {Status}: {CompletedShards}/{TotalShards} shards, {TotalFindings} findings in {Duration}ms.",
                job.JobId,
                status,
                completedShards,
                shards.Count,
                totalFindings,
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Simulation job {JobId} failed.",
                job.JobId);

            await _progressPublisher.PublishFailedAsync(
                job,
                ex.Message,
                cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Evaluates one shard, retrying failed attempts with a delay that doubles after each failure.
    /// </summary>
    /// <param name="job">Owning job; supplies tenant, policy and simulation options.</param>
    /// <param name="shard">Shard whose SBOM IDs are evaluated.</param>
    /// <param name="cancellationToken">Cancellation token; cancellation is never retried.</param>
    /// <returns>The result returned by the policy engine.</returns>
    /// <remarks>
    /// The exception thrown by the final attempt propagates unchanged. At least one attempt is
    /// always made, even when the configured maximum is zero or negative.
    /// </remarks>
    private async Task<SimulationShardResult> ProcessShardWithRetryAsync(
        SimulationJob job,
        SimulationShard shard,
        CancellationToken cancellationToken)
    {
        // Clamp so that a misconfigured MaxAttempts (<= 0) still evaluates the shard once
        // instead of failing it without ever calling the policy engine.
        var maxAttempts = Math.Max(1, _options.Policy.Dispatch.MaxAttempts);
        var delay = _options.Policy.Dispatch.RetryBackoff;

        for (var attempt = 1; attempt <= maxAttempts; attempt++)
        {
            try
            {
                return await _policyEngine.EvaluateAsync(
                    job.TenantId,
                    job.PolicyId,
                    shard.SbomIds,
                    job.SimulationOptions,
                    cancellationToken).ConfigureAwait(false);
            }
            catch (Exception ex) when (ex is not OperationCanceledException && attempt < maxAttempts)
            {
                _logger.LogWarning(
                    ex,
                    "Shard {ShardId} evaluation failed (attempt {Attempt}/{MaxAttempts}), retrying...",
                    shard.ShardId,
                    attempt,
                    maxAttempts);

                await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
                delay = delay.Multiply(2); // Exponential backoff
            }
        }

        // Not reachable in practice: the last attempt either returns or rethrows.
        // Kept so every code path ends in a return or a throw.
        throw new InvalidOperationException($"Shard {shard.ShardId} failed after {maxAttempts} attempts.");
    }
}
|
||||
|
||||
/// <summary>
/// Queue of simulation jobs awaiting processing.
/// </summary>
public interface ISimulationJobQueue
{
    /// <summary>
    /// Removes and returns queued simulation jobs.
    /// </summary>
    /// <param name="maxCount">Upper bound on the number of jobs to return.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The dequeued jobs. The batch worker treats an empty list as "idle".</returns>
    ValueTask<IReadOnlyList<SimulationJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Adds a simulation job to the queue.
    /// </summary>
    /// <param name="job">Job to enqueue.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask EnqueueAsync(
        SimulationJob job,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Splits an SBOM inventory into shards.
/// </summary>
public interface ISimulationSharder
{
    /// <summary>
    /// Partitions the given SBOM IDs into processable chunks.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the SBOMs.</param>
    /// <param name="sbomIds">SBOM identifiers to partition.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The shards to evaluate.</returns>
    ValueTask<IReadOnlyList<SimulationShard>> ShardInventoryAsync(
        string tenantId,
        ImmutableArray<string> sbomIds,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Client used to run Policy Engine evaluations.
/// </summary>
public interface IPolicyEngineClient
{
    /// <summary>
    /// Evaluates a policy against a set of SBOMs.
    /// </summary>
    /// <param name="tenantId">Tenant the evaluation runs for.</param>
    /// <param name="policyId">Policy to evaluate.</param>
    /// <param name="sbomIds">SBOM identifiers to evaluate; the batch worker passes one shard's IDs.</param>
    /// <param name="options">Simulation options forwarded from the job.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The evaluation result for the supplied SBOMs.</returns>
    ValueTask<SimulationShardResult> EvaluateAsync(
        string tenantId,
        string policyId,
        ImmutableArray<string> sbomIds,
        SimulationOptions options,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Storage for per-shard simulation results.
/// </summary>
public interface ISimulationResultStore
{
    /// <summary>
    /// Stores the partial result produced for one shard of a job.
    /// </summary>
    /// <param name="jobId">Simulation job the shard belongs to.</param>
    /// <param name="shardId">Identifier of the shard.</param>
    /// <param name="result">Result to store.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask StorePartialResultAsync(
        string jobId,
        string shardId,
        SimulationShardResult result,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Returns every partial result stored for a job.
    /// </summary>
    /// <param name="jobId">Simulation job to look up.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The stored results. The reducer worker treats an empty list as "no results".</returns>
    ValueTask<IReadOnlyList<SimulationShardResult>> GetPartialResultsAsync(
        string jobId,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Publishes lifecycle and progress events for simulation jobs.
/// </summary>
public interface ISimulationProgressPublisher
{
    /// <summary>
    /// Announces that processing of a job has begun.
    /// </summary>
    /// <param name="job">Job that started.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask PublishStartedAsync(
        SimulationJob job,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Reports running totals for a job.
    /// </summary>
    /// <param name="job">Job being processed.</param>
    /// <param name="completedShards">Shards completed so far.</param>
    /// <param name="totalShards">Total shards in the job.</param>
    /// <param name="totalFindings">Findings accumulated so far.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask PublishProgressAsync(
        SimulationJob job,
        int completedShards,
        int totalShards,
        int totalFindings,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Reports the final outcome of a job whose shards have all been attempted.
    /// </summary>
    /// <param name="job">Job that finished.</param>
    /// <param name="status">Final status.</param>
    /// <param name="completedShards">Number of shards that completed.</param>
    /// <param name="totalShards">Total shards in the job.</param>
    /// <param name="totalFindings">Total findings across completed shards.</param>
    /// <param name="failedShards">Identifiers of the shards that failed.</param>
    /// <param name="duration">Elapsed processing time.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask PublishCompletedAsync(
        SimulationJob job,
        SimulationStatus status,
        int completedShards,
        int totalShards,
        int totalFindings,
        ImmutableArray<string> failedShards,
        TimeSpan duration,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Reports that a job failed with an error.
    /// </summary>
    /// <param name="job">Job that failed.</param>
    /// <param name="error">Error description; the batch worker passes the exception message.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask PublishFailedAsync(
        SimulationJob job,
        string error,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// A request to simulate one policy against a set of SBOMs.
/// </summary>
/// <param name="JobId">Identifier of the job.</param>
/// <param name="TenantId">Tenant the job belongs to.</param>
/// <param name="PolicyId">Policy to simulate.</param>
/// <param name="SbomIds">SBOM identifiers to evaluate.</param>
/// <param name="SimulationOptions">Options forwarded to the policy engine.</param>
/// <param name="RequestedAt">Timestamp of the request.</param>
/// <param name="RequestedBy">Optional requester; defaults to null.</param>
public sealed record SimulationJob(
    string JobId,
    string TenantId,
    string PolicyId,
    ImmutableArray<string> SbomIds,
    SimulationOptions SimulationOptions,
    DateTimeOffset RequestedAt,
    string? RequestedBy = null);
|
||||
|
||||
/// <summary>
/// Switches that control a policy simulation.
/// </summary>
/// <param name="IncludeReachability">Reachability flag; defaults to true.</param>
/// <param name="IncludeExceptions">Exceptions flag; defaults to true.</param>
/// <param name="DryRun">Dry-run flag; defaults to true.</param>
/// <param name="MaxFindings">Optional findings limit; defaults to null.</param>
public sealed record SimulationOptions(
    bool IncludeReachability = true,
    bool IncludeExceptions = true,
    bool DryRun = true,
    int? MaxFindings = null);
|
||||
|
||||
/// <summary>
/// One chunk of a job's SBOM inventory.
/// </summary>
/// <param name="ShardId">Identifier of the shard.</param>
/// <param name="ShardIndex">Position of the shard; the default sharder numbers shards from zero.</param>
/// <param name="TotalShards">Number of shards the inventory was split into.</param>
/// <param name="SbomIds">SBOM identifiers contained in this shard.</param>
public sealed record SimulationShard(
    string ShardId,
    int ShardIndex,
    int TotalShards,
    ImmutableArray<string> SbomIds);
|
||||
|
||||
/// <summary>
/// Outcome of evaluating a single simulation shard.
/// </summary>
/// <param name="ShardId">Identifier of the evaluated shard.</param>
/// <param name="SbomsProcessed">Number of SBOMs processed; the reducer sums this into the manifest's SBOM total.</param>
/// <param name="FindingsCount">Number of findings; the batch worker adds this to its running total.</param>
/// <param name="ViolationsCount">Number of violations reported for the shard.</param>
/// <param name="WarningsCount">Number of warnings reported for the shard.</param>
/// <param name="Findings">The individual findings; the reducer aggregates these.</param>
/// <param name="EvaluatedAt">Timestamp of the evaluation.</param>
public sealed record SimulationShardResult(
    string ShardId,
    int SbomsProcessed,
    int FindingsCount,
    int ViolationsCount,
    int WarningsCount,
    ImmutableArray<SimulationFinding> Findings,
    DateTimeOffset EvaluatedAt);
|
||||
|
||||
/// <summary>
/// A single finding produced by a policy simulation.
/// </summary>
/// <param name="FindingId">Identifier of the finding.</param>
/// <param name="SbomId">SBOM the finding was raised against.</param>
/// <param name="ComponentPurl">Package URL of the affected component.</param>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity label; the reducer ranks "critical", "high", "medium" and "low".</param>
/// <param name="PolicyOutcome">Policy verdict; the reducer counts "violation", "warning" and "pass".</param>
/// <param name="ExceptionId">Optional exception identifier; defaults to null.</param>
/// <param name="IsReachable">Optional reachability verdict; defaults to null.</param>
public sealed record SimulationFinding(
    string FindingId,
    string SbomId,
    string ComponentPurl,
    string VulnerabilityId,
    string Severity,
    string PolicyOutcome,
    string? ExceptionId = null,
    bool? IsReachable = null);
|
||||
|
||||
/// <summary>
/// Lifecycle states of a simulation job.
/// </summary>
public enum SimulationStatus
{
    /// <summary>Not yet started (presumed from the name; not assigned by the batch worker).</summary>
    Pending,

    /// <summary>In progress (presumed from the name; not assigned by the batch worker).</summary>
    Running,

    /// <summary>Finished with no failed shards.</summary>
    Completed,

    /// <summary>Finished with some, but not all, shards failed.</summary>
    PartiallyCompleted,

    /// <summary>Finished with every shard failed.</summary>
    Failed,

    /// <summary>Cancelled (presumed from the name; not assigned by the batch worker).</summary>
    Cancelled
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of simulation job queue.
/// Jobs are returned in the order they were enqueued; access is serialized with a private lock.
/// </summary>
public sealed class InMemorySimulationJobQueue : ISimulationJobQueue
{
    private readonly Queue<SimulationJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> jobs from the front of the queue.
    /// </summary>
    /// <param name="maxCount">Upper bound on the batch size; zero or negative yields an empty batch.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The dequeued jobs, possibly empty.</returns>
    public ValueTask<IReadOnlyList<SimulationJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var batch = new List<SimulationJob>();

        lock (_lock)
        {
            // The count check comes first so nothing is dequeued once the batch is full.
            while (batch.Count < maxCount && _queue.TryDequeue(out var job))
            {
                batch.Add(job);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<SimulationJob>>(batch);
    }

    /// <summary>
    /// Appends a job to the queue.
    /// </summary>
    /// <param name="job">Job to enqueue.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> is null.</exception>
    public ValueTask EnqueueAsync(
        SimulationJob job,
        CancellationToken cancellationToken = default)
    {
        // Reject null here: a queued null would only fail later, inside the worker loop,
        // far from the caller that enqueued it.
        ArgumentNullException.ThrowIfNull(job);

        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// Default implementation of simulation sharder.
/// Splits the SBOM IDs into consecutive fixed-size chunks; the last chunk may be smaller.
/// </summary>
public sealed class DefaultSimulationSharder : ISimulationSharder
{
    private readonly int _shardSize;

    /// <summary>
    /// Creates a sharder.
    /// </summary>
    /// <param name="shardSize">Maximum number of SBOM IDs per shard; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="shardSize"/> is zero or negative.</exception>
    public DefaultSimulationSharder(int shardSize = 100)
    {
        ArgumentOutOfRangeException.ThrowIfNegativeOrZero(shardSize);
        _shardSize = shardSize;
    }

    /// <summary>
    /// Partitions <paramref name="sbomIds"/> into shards of at most the configured size.
    /// </summary>
    /// <param name="tenantId">Tenant identifier; used as the prefix of each shard ID.</param>
    /// <param name="sbomIds">SBOM identifiers to partition, in order.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>
    /// The shards in input order, with IDs of the form <c>{tenantId}-shard-NNNN</c>;
    /// an empty list when <paramref name="sbomIds"/> is empty or uninitialized.
    /// </returns>
    public ValueTask<IReadOnlyList<SimulationShard>> ShardInventoryAsync(
        string tenantId,
        ImmutableArray<string> sbomIds,
        CancellationToken cancellationToken = default)
    {
        // IsDefaultOrEmpty also covers default(ImmutableArray<string>), whose Length throws.
        if (sbomIds.IsDefaultOrEmpty)
        {
            return ValueTask.FromResult<IReadOnlyList<SimulationShard>>([]);
        }

        // Integer ceiling division; Length >= 1 here, so the subtraction cannot go negative.
        var totalShards = ((sbomIds.Length - 1) / _shardSize) + 1;
        var shards = new List<SimulationShard>(totalShards);

        for (var i = 0; i < totalShards; i++)
        {
            var start = i * _shardSize;
            var length = Math.Min(_shardSize, sbomIds.Length - start);

            shards.Add(new SimulationShard(
                ShardId: $"{tenantId}-shard-{i:D4}",
                ShardIndex: i,
                TotalShards: totalShards,
                SbomIds: ImmutableArray.Create(sbomIds, start, length)));
        }

        return ValueTask.FromResult<IReadOnlyList<SimulationShard>>(shards);
    }
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of simulation result store.
/// Holds at most one result per (job, shard) pair; access is serialized with a private lock.
/// </summary>
public sealed class InMemorySimulationResultStore : ISimulationResultStore
{
    // Per job: results in first-stored order, each tagged with the shard ID it was stored under.
    private readonly Dictionary<string, List<(string ShardId, SimulationShardResult Result)>> _results = new();
    private readonly object _lock = new();

    /// <summary>
    /// Stores the result for a shard. Storing again under the same job and shard ID replaces
    /// the earlier result, so a re-processed shard is not counted twice by consumers.
    /// </summary>
    /// <param name="jobId">Simulation job the shard belongs to.</param>
    /// <param name="shardId">Identifier of the shard; compared ordinally.</param>
    /// <param name="result">Result to store.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="jobId"/> or <paramref name="result"/> is null.</exception>
    public ValueTask StorePartialResultAsync(
        string jobId,
        string shardId,
        SimulationShardResult result,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(jobId);
        ArgumentNullException.ThrowIfNull(result);

        lock (_lock)
        {
            if (!_results.TryGetValue(jobId, out var entries))
            {
                entries = [];
                _results[jobId] = entries;
            }

            var existingIndex = entries.FindIndex(e => string.Equals(e.ShardId, shardId, StringComparison.Ordinal));
            if (existingIndex >= 0)
            {
                entries[existingIndex] = (shardId, result);
            }
            else
            {
                entries.Add((shardId, result));
            }
        }

        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Returns a snapshot of the results stored for a job.
    /// </summary>
    /// <param name="jobId">Simulation job to look up.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>A copy of the stored results in first-stored order; empty when the job is unknown.</returns>
    public ValueTask<IReadOnlyList<SimulationShardResult>> GetPartialResultsAsync(
        string jobId,
        CancellationToken cancellationToken = default)
    {
        lock (_lock)
        {
            if (_results.TryGetValue(jobId, out var entries))
            {
                // Copy under the lock so later stores cannot mutate the returned list.
                return ValueTask.FromResult<IReadOnlyList<SimulationShardResult>>(
                    entries.ConvertAll(e => e.Result));
            }
        }

        return ValueTask.FromResult<IReadOnlyList<SimulationShardResult>>([]);
    }
}
|
||||
|
||||
/// <summary>
/// No-op simulation progress publisher, intended for tests. Every method discards its
/// arguments and returns an already-completed task.
/// </summary>
public sealed class NullSimulationProgressPublisher : ISimulationProgressPublisher
{
    /// <summary>
    /// Shared instance. The class declares no fields, so a single instance can be reused.
    /// </summary>
    public static NullSimulationProgressPublisher Instance { get; } = new NullSimulationProgressPublisher();

    /// <summary>Does nothing.</summary>
    public ValueTask PublishStartedAsync(SimulationJob job, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing.</summary>
    public ValueTask PublishProgressAsync(SimulationJob job, int completedShards, int totalShards, int totalFindings, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing.</summary>
    public ValueTask PublishCompletedAsync(SimulationJob job, SimulationStatus status, int completedShards, int totalShards, int totalFindings, ImmutableArray<string> failedShards, TimeSpan duration, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>Does nothing.</summary>
    public ValueTask PublishFailedAsync(SimulationJob job, string error, CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,502 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Simulation;
|
||||
|
||||
/// <summary>
/// Simulation reducer worker per SCHED-WORKER-27-302.
/// Aggregates shard outputs into final manifests with counts, deltas, and samples.
/// Writes to object storage with checksums and emits completion events.
/// </summary>
public sealed class SimulationReducerWorker : BackgroundService
{
    // Caps applied to the sample and summary sections of the manifest.
    private const int MaxSampleFindings = 100;
    private const int MaxSummaryEntries = 50;

    // Policy outcome labels counted by the reducer (matched exactly, case-sensitively).
    private const string ViolationOutcome = "violation";
    private const string WarningOutcome = "warning";
    private const string PassOutcome = "pass";

    private readonly IReducerJobQueue _jobQueue;
    private readonly ISimulationResultStore _resultStore;
    private readonly ISimulationManifestWriter _manifestWriter;
    private readonly IReducerCompletionPublisher _completionPublisher;
    private readonly SchedulerWorkerOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly SchedulerWorkerMetrics _metrics;
    private readonly ILogger<SimulationReducerWorker> _logger;

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="jobQueue">Source of reducer jobs.</param>
    /// <param name="resultStore">Store holding the per-shard results to aggregate.</param>
    /// <param name="manifestWriter">Writes the aggregated manifest.</param>
    /// <param name="completionPublisher">Receives the completion event for each job.</param>
    /// <param name="options">Worker options; the <c>Policy</c> section is read.</param>
    /// <param name="timeProvider">Clock used for timestamps and durations; <see cref="TimeProvider.System"/> when null.</param>
    /// <param name="metrics">Worker metrics (stored; not used by the code in this class).</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
    public SimulationReducerWorker(
        IReducerJobQueue jobQueue,
        ISimulationResultStore resultStore,
        ISimulationManifestWriter manifestWriter,
        IReducerCompletionPublisher completionPublisher,
        SchedulerWorkerOptions options,
        TimeProvider? timeProvider,
        SchedulerWorkerMetrics metrics,
        ILogger<SimulationReducerWorker> logger)
    {
        _jobQueue = jobQueue ?? throw new ArgumentNullException(nameof(jobQueue));
        _resultStore = resultStore ?? throw new ArgumentNullException(nameof(resultStore));
        _manifestWriter = manifestWriter ?? throw new ArgumentNullException(nameof(manifestWriter));
        _completionPublisher = completionPublisher ?? throw new ArgumentNullException(nameof(completionPublisher));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Polls the reducer queue until <paramref name="stoppingToken"/> is cancelled and processes
    /// each dequeued job sequentially. Returns immediately when the policy feature is disabled.
    /// </summary>
    /// <param name="stoppingToken">Signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Policy.Enabled)
        {
            _logger.LogInformation("Simulation reducer worker is disabled.");
            return;
        }

        _logger.LogInformation("Simulation reducer worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                // Dequeue reducer jobs
                var jobs = await _jobQueue
                    .DequeueAsync(_options.Policy.Dispatch.BatchSize, stoppingToken)
                    .ConfigureAwait(false);

                if (jobs.Count == 0)
                {
                    await Task.Delay(_options.Policy.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
                    continue;
                }

                foreach (var job in jobs)
                {
                    if (stoppingToken.IsCancellationRequested)
                    {
                        break;
                    }

                    await ProcessReducerJobAsync(job, stoppingToken).ConfigureAwait(false);
                }
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error in simulation reducer worker loop.");

                // The backoff delay is itself cancellable. Without this guard a shutdown during
                // the delay would escape ExecuteAsync as an exception and skip the "stopped" log.
                try
                {
                    await Task.Delay(_options.Policy.Dispatch.RetryBackoff, stoppingToken).ConfigureAwait(false);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    break;
                }
            }
        }

        _logger.LogInformation("Simulation reducer worker stopped.");
    }

    /// <summary>
    /// Reduces one simulation: loads its partial results, aggregates them into a manifest,
    /// writes the manifest and publishes a completion event. Publishes
    /// <see cref="ReducerStatus.NoResults"/> when nothing was stored for the simulation and
    /// <see cref="ReducerStatus.Failed"/> on any non-cancellation error.
    /// </summary>
    /// <param name="job">Reducer job to process.</param>
    /// <param name="cancellationToken">Cancellation token; cancellation propagates to the caller.</param>
    private async Task ProcessReducerJobAsync(
        ReducerJob job,
        CancellationToken cancellationToken)
    {
        var startedAt = _timeProvider.GetUtcNow();

        _logger.LogInformation(
            "Processing reducer job for simulation {SimulationJobId}, tenant {TenantId}.",
            job.SimulationJobId,
            job.TenantId);

        try
        {
            // Get all partial results
            var partialResults = await _resultStore
                .GetPartialResultsAsync(job.SimulationJobId, cancellationToken)
                .ConfigureAwait(false);

            if (partialResults.Count == 0)
            {
                _logger.LogWarning(
                    "No partial results found for simulation {SimulationJobId}.",
                    job.SimulationJobId);

                await _completionPublisher.PublishCompletionAsync(
                    job,
                    ReducerStatus.NoResults,
                    null,
                    cancellationToken).ConfigureAwait(false);

                return;
            }

            // Aggregate results into manifest
            var manifest = AggregateResults(job, partialResults);

            // Write manifest to object storage
            var storageResult = await _manifestWriter.WriteManifestAsync(
                job.TenantId,
                job.SimulationJobId,
                manifest,
                cancellationToken).ConfigureAwait(false);

            var duration = _timeProvider.GetUtcNow() - startedAt;

            // Publish completion
            await _completionPublisher.PublishCompletionAsync(
                job,
                ReducerStatus.Completed,
                storageResult,
                cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Reducer job completed for simulation {SimulationJobId}: {TotalFindings} findings, {TotalViolations} violations, manifest stored at {StorageUri} in {Duration}ms.",
                job.SimulationJobId,
                manifest.TotalFindings,
                manifest.TotalViolations,
                storageResult.StorageUri,
                duration.TotalMilliseconds);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Reducer job failed for simulation {SimulationJobId}.",
                job.SimulationJobId);

            await _completionPublisher.PublishCompletionAsync(
                job,
                ReducerStatus.Failed,
                null,
                cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>
    /// Builds the manifest for a simulation from its per-shard results: outcome counts, a
    /// per-severity breakdown, an optional baseline delta, the highest-severity sample findings
    /// and capped per-component and per-vulnerability summaries.
    /// </summary>
    /// <param name="job">Reducer job supplying identifiers and the optional baseline URI.</param>
    /// <param name="partialResults">Per-shard results to aggregate.</param>
    /// <returns>The aggregated manifest.</returns>
    private SimulationManifest AggregateResults(
        ReducerJob job,
        IReadOnlyList<SimulationShardResult> partialResults)
    {
        // An uninitialized (default) ImmutableArray throws when enumerated; treat it as "no findings".
        var allFindings = partialResults
            .Where(r => !r.Findings.IsDefault)
            .SelectMany(r => r.Findings)
            .ToList();

        // Calculate counts
        var totalFindings = allFindings.Count;
        var totalViolations = allFindings.Count(f => f.PolicyOutcome == ViolationOutcome);
        var totalWarnings = allFindings.Count(f => f.PolicyOutcome == WarningOutcome);
        var totalPassed = allFindings.Count(f => f.PolicyOutcome == PassOutcome);

        // Calculate severity breakdown
        var severityCounts = allFindings
            .GroupBy(f => f.Severity)
            .ToImmutableDictionary(g => g.Key, g => g.Count());

        // Calculate delta from baseline if available
        var delta = job.BaselineManifestUri is not null
            ? CalculateDelta(allFindings, job)
            : null;

        // Sample findings (top N by severity). The tie-break is ordinal so the selection does
        // not vary with the current culture.
        var samples = allFindings
            .OrderByDescending(f => GetSeverityWeight(f.Severity))
            .ThenBy(f => f.FindingId, StringComparer.Ordinal)
            .Take(MaxSampleFindings)
            .ToImmutableArray();

        // Group by component. The ordinal tie-break keeps the top-N stable regardless of the
        // order in which shard results arrive.
        var byComponent = allFindings
            .GroupBy(f => f.ComponentPurl)
            .Select(g => new ComponentSummary(
                g.Key,
                g.Count(),
                g.Count(f => f.PolicyOutcome == ViolationOutcome),
                g.Any(f => f.IsReachable == true)))
            .OrderByDescending(c => c.ViolationCount)
            .ThenBy(c => c.ComponentPurl, StringComparer.Ordinal)
            .Take(MaxSummaryEntries)
            .ToImmutableArray();

        // Group by vulnerability. The severity reported is that of the first finding in the group.
        var byVulnerability = allFindings
            .GroupBy(f => f.VulnerabilityId)
            .Select(g => new VulnerabilitySummary(
                g.Key,
                g.First().Severity,
                g.Count(),
                g.Select(f => f.ComponentPurl).Distinct().Count()))
            .OrderByDescending(v => GetSeverityWeight(v.Severity))
            .ThenByDescending(v => v.AffectedComponentCount)
            .ThenBy(v => v.VulnerabilityId, StringComparer.Ordinal)
            .Take(MaxSummaryEntries)
            .ToImmutableArray();

        return new SimulationManifest(
            ManifestId: $"{job.SimulationJobId}-manifest",
            SimulationJobId: job.SimulationJobId,
            TenantId: job.TenantId,
            PolicyId: job.PolicyId,
            GeneratedAt: _timeProvider.GetUtcNow(),
            TotalSboms: partialResults.Sum(r => r.SbomsProcessed),
            TotalFindings: totalFindings,
            TotalViolations: totalViolations,
            TotalWarnings: totalWarnings,
            TotalPassed: totalPassed,
            SeverityCounts: severityCounts,
            Delta: delta,
            SampleFindings: samples,
            ComponentSummaries: byComponent,
            VulnerabilitySummaries: byVulnerability);
    }

    /// <summary>
    /// Produces the baseline delta for a manifest. Currently a placeholder: the baseline is not
    /// loaded, so every finding is reported as unchanged and all other counters are zero.
    /// </summary>
    /// <param name="findings">Findings of the current simulation.</param>
    /// <param name="job">Reducer job; its baseline URI must be non-null.</param>
    /// <returns>The placeholder delta.</returns>
    private static SimulationDelta? CalculateDelta(
        IReadOnlyList<SimulationFinding> findings,
        ReducerJob job)
    {
        // Placeholder - in real implementation, would load baseline and compare
        return new SimulationDelta(
            BaselineManifestUri: job.BaselineManifestUri!,
            NewFindings: 0,
            ResolvedFindings: 0,
            UnchangedFindings: findings.Count,
            NewViolations: 0,
            ResolvedViolations: 0);
    }

    /// <summary>
    /// Maps a severity label to a sort weight, ignoring case.
    /// </summary>
    /// <param name="severity">Severity label; may be null.</param>
    /// <returns>4 for critical, 3 for high, 2 for medium, 1 for low, 0 for anything else (including null).</returns>
    private static int GetSeverityWeight(string severity)
    {
        // Ordinal, case-insensitive comparisons avoid allocating a lowered copy per call and
        // tolerate a null label instead of throwing.
        if (string.Equals(severity, "critical", StringComparison.OrdinalIgnoreCase))
        {
            return 4;
        }

        if (string.Equals(severity, "high", StringComparison.OrdinalIgnoreCase))
        {
            return 3;
        }

        if (string.Equals(severity, "medium", StringComparison.OrdinalIgnoreCase))
        {
            return 2;
        }

        if (string.Equals(severity, "low", StringComparison.OrdinalIgnoreCase))
        {
            return 1;
        }

        return 0;
    }
}
|
||||
|
||||
/// <summary>
/// Queue of reducer jobs awaiting processing.
/// </summary>
public interface IReducerJobQueue
{
    /// <summary>
    /// Removes and returns queued reducer jobs.
    /// </summary>
    /// <param name="maxCount">Upper bound on the number of jobs to return.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The dequeued jobs. The reducer worker treats an empty list as "idle".</returns>
    ValueTask<IReadOnlyList<ReducerJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Adds a reducer job to the queue.
    /// </summary>
    /// <param name="job">Job to enqueue.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask EnqueueAsync(
        ReducerJob job,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Persists simulation manifests.
/// </summary>
public interface ISimulationManifestWriter
{
    /// <summary>
    /// Writes a manifest to object storage.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the manifest.</param>
    /// <param name="simulationJobId">Simulation the manifest belongs to.</param>
    /// <param name="manifest">Manifest to write.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Location, checksum, size and timestamp of the stored manifest.</returns>
    ValueTask<ManifestStorageResult> WriteManifestAsync(
        string tenantId,
        string simulationJobId,
        SimulationManifest manifest,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Publishes the outcome of reducer jobs.
/// </summary>
public interface IReducerCompletionPublisher
{
    /// <summary>
    /// Publishes the completion event for a reducer job.
    /// </summary>
    /// <param name="job">Job that finished.</param>
    /// <param name="status">Outcome of the job.</param>
    /// <param name="storageResult">
    /// Details of the stored manifest. The reducer worker passes null for the
    /// <see cref="ReducerStatus.NoResults"/> and <see cref="ReducerStatus.Failed"/> outcomes.
    /// </param>
    /// <param name="cancellationToken">Cancellation token.</param>
    ValueTask PublishCompletionAsync(
        ReducerJob job,
        ReducerStatus status,
        ManifestStorageResult? storageResult,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// A request to reduce the shard results of one simulation into a manifest.
/// </summary>
/// <param name="ReducerJobId">Identifier of the reducer job.</param>
/// <param name="SimulationJobId">Simulation whose partial results are aggregated.</param>
/// <param name="TenantId">Tenant the simulation belongs to.</param>
/// <param name="PolicyId">Policy that was simulated.</param>
/// <param name="BaselineManifestUri">Optional baseline manifest; a delta is produced only when non-null. Defaults to null.</param>
public sealed record ReducerJob(
    string ReducerJobId,
    string SimulationJobId,
    string TenantId,
    string PolicyId,
    string? BaselineManifestUri = null);
|
||||
|
||||
/// <summary>
/// Describes a manifest after it has been stored.
/// </summary>
/// <param name="StorageUri">Location of the stored manifest.</param>
/// <param name="Checksum">Checksum of the stored bytes.</param>
/// <param name="ChecksumAlgorithm">Name of the algorithm that produced <paramref name="Checksum"/>.</param>
/// <param name="SizeBytes">Size of the stored manifest in bytes.</param>
/// <param name="StoredAt">Timestamp of the write.</param>
public sealed record ManifestStorageResult(
    string StorageUri,
    string Checksum,
    string ChecksumAlgorithm,
    long SizeBytes,
    DateTimeOffset StoredAt);
|
||||
|
||||
/// <summary>
/// Aggregated outcome of a simulation, built by the reducer from all shard results.
/// </summary>
/// <param name="ManifestId">Identifier of the manifest.</param>
/// <param name="SimulationJobId">Simulation the manifest summarizes.</param>
/// <param name="TenantId">Tenant the simulation belongs to.</param>
/// <param name="PolicyId">Policy that was simulated.</param>
/// <param name="GeneratedAt">Timestamp at which the manifest was built.</param>
/// <param name="TotalSboms">Number of SBOMs processed across all shards.</param>
/// <param name="TotalFindings">Number of findings across all shards.</param>
/// <param name="TotalViolations">Number of findings counted as violations.</param>
/// <param name="TotalWarnings">Number of findings counted as warnings.</param>
/// <param name="TotalPassed">Number of findings counted as passed.</param>
/// <param name="SeverityCounts">Finding count per severity label.</param>
/// <param name="Delta">Comparison with a baseline manifest, or null when there is none.</param>
/// <param name="SampleFindings">Subset of findings included as samples.</param>
/// <param name="ComponentSummaries">Per-component summaries.</param>
/// <param name="VulnerabilitySummaries">Per-vulnerability summaries.</param>
public sealed record SimulationManifest(
    string ManifestId,
    string SimulationJobId,
    string TenantId,
    string PolicyId,
    DateTimeOffset GeneratedAt,
    int TotalSboms,
    int TotalFindings,
    int TotalViolations,
    int TotalWarnings,
    int TotalPassed,
    ImmutableDictionary<string, int> SeverityCounts,
    SimulationDelta? Delta,
    ImmutableArray<SimulationFinding> SampleFindings,
    ImmutableArray<ComponentSummary> ComponentSummaries,
    ImmutableArray<VulnerabilitySummary> VulnerabilitySummaries);
|
||||
|
||||
/// <summary>
/// Difference between a simulation and a baseline manifest.
/// </summary>
/// <param name="BaselineManifestUri">Location of the baseline manifest compared against.</param>
/// <param name="NewFindings">Count of new findings.</param>
/// <param name="ResolvedFindings">Count of resolved findings.</param>
/// <param name="UnchangedFindings">Count of unchanged findings.</param>
/// <param name="NewViolations">Count of new violations.</param>
/// <param name="ResolvedViolations">Count of resolved violations.</param>
public sealed record SimulationDelta(
    string BaselineManifestUri,
    int NewFindings,
    int ResolvedFindings,
    int UnchangedFindings,
    int NewViolations,
    int ResolvedViolations);
|
||||
|
||||
/// <summary>
/// Per-component roll-up of findings.
/// </summary>
/// <param name="ComponentPurl">Package URL of the component.</param>
/// <param name="FindingCount">Number of findings for the component.</param>
/// <param name="ViolationCount">Number of those findings that are violations.</param>
/// <param name="HasReachableFindings">True when at least one finding is marked reachable.</param>
public sealed record ComponentSummary(
    string ComponentPurl,
    int FindingCount,
    int ViolationCount,
    bool HasReachableFindings);
|
||||
|
||||
/// <summary>
/// Per-vulnerability roll-up of findings.
/// </summary>
/// <param name="VulnerabilityId">Identifier of the vulnerability.</param>
/// <param name="Severity">Severity label reported for the vulnerability.</param>
/// <param name="FindingCount">Number of findings for the vulnerability.</param>
/// <param name="AffectedComponentCount">Number of distinct components affected.</param>
public sealed record VulnerabilitySummary(
    string VulnerabilityId,
    string Severity,
    int FindingCount,
    int AffectedComponentCount);
|
||||
|
||||
/// <summary>
/// Lifecycle states of a reducer job.
/// </summary>
public enum ReducerStatus
{
    /// <summary>Not yet started (presumed from the name; not published by the reducer worker).</summary>
    Pending,

    /// <summary>In progress (presumed from the name; not published by the reducer worker).</summary>
    Running,

    /// <summary>The manifest was aggregated and written.</summary>
    Completed,

    /// <summary>No partial results were found for the simulation.</summary>
    NoResults,

    /// <summary>Reduction failed with an error.</summary>
    Failed
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of reducer job queue.
/// Jobs are returned in the order they were enqueued; access is serialized with a private lock.
/// </summary>
public sealed class InMemoryReducerJobQueue : IReducerJobQueue
{
    private readonly Queue<ReducerJob> _queue = new();
    private readonly object _lock = new();

    /// <summary>
    /// Removes up to <paramref name="maxCount"/> jobs from the front of the queue.
    /// </summary>
    /// <param name="maxCount">Upper bound on the batch size; zero or negative yields an empty batch.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The dequeued jobs, possibly empty.</returns>
    public ValueTask<IReadOnlyList<ReducerJob>> DequeueAsync(
        int maxCount,
        CancellationToken cancellationToken = default)
    {
        var batch = new List<ReducerJob>();

        lock (_lock)
        {
            // The count check comes first so nothing is dequeued once the batch is full.
            while (batch.Count < maxCount && _queue.TryDequeue(out var job))
            {
                batch.Add(job);
            }
        }

        return ValueTask.FromResult<IReadOnlyList<ReducerJob>>(batch);
    }

    /// <summary>
    /// Appends a job to the queue.
    /// </summary>
    /// <param name="job">Job to enqueue.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> is null.</exception>
    public ValueTask EnqueueAsync(
        ReducerJob job,
        CancellationToken cancellationToken = default)
    {
        // Reject null here: a queued null would only fail later, inside the worker loop,
        // far from the caller that enqueued it.
        ArgumentNullException.ThrowIfNull(job);

        lock (_lock)
        {
            _queue.Enqueue(job);
        }

        return ValueTask.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of simulation manifest writer.
/// Serializes the manifest to JSON to compute its size and SHA-256 checksum, and keeps the
/// manifest object in a dictionary keyed by tenant and simulation job.
/// </summary>
public sealed class InMemorySimulationManifestWriter : ISimulationManifestWriter
{
    // A tuple key keeps (tenant, job) pairs distinct even when an identifier contains '/'.
    private readonly Dictionary<(string TenantId, string SimulationJobId), (SimulationManifest Manifest, ManifestStorageResult Result)> _manifests = new();
    private readonly object _lock = new();
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a writer that stamps results using the system clock.
    /// </summary>
    public InMemorySimulationManifestWriter()
        : this(null)
    {
    }

    /// <summary>
    /// Creates a writer that stamps results using the supplied clock.
    /// </summary>
    /// <param name="timeProvider">Clock for <see cref="ManifestStorageResult.StoredAt"/>; <see cref="TimeProvider.System"/> when null.</param>
    public InMemorySimulationManifestWriter(TimeProvider? timeProvider)
    {
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Stores the manifest in memory, replacing any manifest already held for the same tenant and job.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the manifest.</param>
    /// <param name="simulationJobId">Simulation the manifest belongs to.</param>
    /// <param name="manifest">Manifest to store.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>
    /// A result with a <c>mem://</c> URI, the lower-case hex SHA-256 of the UTF-8 JSON, the JSON
    /// length in bytes and the store timestamp.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="manifest"/> is null.</exception>
    public ValueTask<ManifestStorageResult> WriteManifestAsync(
        string tenantId,
        string simulationJobId,
        SimulationManifest manifest,
        CancellationToken cancellationToken = default)
    {
        // A null manifest would otherwise serialize to the literal "null" and be stored silently.
        ArgumentNullException.ThrowIfNull(manifest);

        var json = JsonSerializer.Serialize(manifest);
        var bytes = Encoding.UTF8.GetBytes(json);
        var checksum = Convert.ToHexString(SHA256.HashData(bytes)).ToLowerInvariant();

        var result = new ManifestStorageResult(
            StorageUri: $"mem://{tenantId}/simulations/{simulationJobId}/manifest.json",
            Checksum: checksum,
            ChecksumAlgorithm: "SHA256",
            SizeBytes: bytes.Length,
            StoredAt: _timeProvider.GetUtcNow());

        lock (_lock)
        {
            _manifests[(tenantId, simulationJobId)] = (manifest, result);
        }

        return ValueTask.FromResult(result);
    }

    /// <summary>
    /// Gets a stored manifest (for testing).
    /// </summary>
    /// <param name="tenantId">Tenant the manifest was stored under.</param>
    /// <param name="simulationJobId">Simulation the manifest was stored under.</param>
    /// <returns>The stored manifest, or null when none exists for the pair.</returns>
    public SimulationManifest? GetManifest(string tenantId, string simulationJobId)
    {
        lock (_lock)
        {
            return _manifests.TryGetValue((tenantId, simulationJobId), out var entry)
                ? entry.Manifest
                : null;
        }
    }
}
|
||||
|
||||
/// <summary>
/// No-op reducer completion publisher for testing: every publish request is accepted and discarded.
/// </summary>
public sealed class NullReducerCompletionPublisher : IReducerCompletionPublisher
{
    /// <summary>Shared instance.</summary>
    public static NullReducerCompletionPublisher Instance { get; } = new();

    /// <summary>
    /// Ignores all arguments and returns an already-completed task.
    /// </summary>
    public ValueTask PublishCompletionAsync(
        ReducerJob job,
        ReducerStatus status,
        ManifestStorageResult? storageResult,
        CancellationToken cancellationToken = default)
    {
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,504 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Simulation;
|
||||
|
||||
/// <summary>
|
||||
/// Security enforcement per SCHED-WORKER-27-303.
|
||||
/// Enforces tenant isolation, scope checks, and attestation integration for simulation jobs.
|
||||
/// Includes secret scanning pipeline for uploaded policy sources.
|
||||
/// </summary>
|
||||
public sealed class SimulationSecurityEnforcer : ISimulationSecurityEnforcer
|
||||
{
|
||||
private readonly ITenantScopeValidator _scopeValidator;
|
||||
private readonly IAttestationVerifier _attestationVerifier;
|
||||
private readonly ISecretScanner _secretScanner;
|
||||
private readonly SchedulerWorkerOptions _options;
|
||||
private readonly ILogger<SimulationSecurityEnforcer> _logger;
|
||||
|
||||
/// <summary>
/// Creates the enforcer from its collaborators.
/// </summary>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public SimulationSecurityEnforcer(
    ITenantScopeValidator scopeValidator,
    IAttestationVerifier attestationVerifier,
    ISecretScanner secretScanner,
    SchedulerWorkerOptions options,
    ILogger<SimulationSecurityEnforcer> logger)
{
    // Guards run in parameter order, so the first null argument is the one reported.
    ArgumentNullException.ThrowIfNull(scopeValidator);
    ArgumentNullException.ThrowIfNull(attestationVerifier);
    ArgumentNullException.ThrowIfNull(secretScanner);
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    _scopeValidator = scopeValidator;
    _attestationVerifier = attestationVerifier;
    _secretScanner = secretScanner;
    _options = options;
    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Validates a simulation job for security compliance.
/// </summary>
/// <remarks>
/// Runs, in order: tenant isolation, scope permissions, attestation verification (only when
/// <see cref="SimulationSecurityContext.RequireAttestation"/> is set) and a secret scan of
/// <see cref="SimulationSecurityContext.PolicySource"/> (whenever a source is supplied).
/// Every violation is reported, but only <see cref="ViolationSeverity.Critical"/> ones make
/// the result invalid.
/// </remarks>
/// <param name="job">Job to validate.</param>
/// <param name="context">Caller and tenant information the job is validated against.</param>
/// <param name="cancellationToken">Forwarded to every validation step.</param>
/// <returns>The aggregated validation result, stamped with the current UTC time.</returns>
/// <exception cref="ArgumentNullException"><paramref name="job"/> or <paramref name="context"/> is <c>null</c>.</exception>
public async ValueTask<SecurityValidationResult> ValidateJobAsync(
    SimulationJob job,
    SimulationSecurityContext context,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(job);
    ArgumentNullException.ThrowIfNull(context);

    var violations = new List<SecurityViolation>();

    // 1. Validate tenant isolation
    var tenantResult = await ValidateTenantIsolationAsync(job, context, cancellationToken).ConfigureAwait(false);
    violations.AddRange(tenantResult.Violations);

    // 2. Validate scope permissions
    var scopeResult = await ValidateScopePermissionsAsync(job, context, cancellationToken).ConfigureAwait(false);
    violations.AddRange(scopeResult.Violations);

    // 3. Validate attestations if required
    if (context.RequireAttestation)
    {
        var attestationResult = await ValidateAttestationsAsync(job, context, cancellationToken).ConfigureAwait(false);
        violations.AddRange(attestationResult.Violations);
    }

    // 4. Scan policy source for secrets if provided.
    // The scan depends on the source alone. It used to be additionally gated on
    // job.SimulationOptions being non-null (through an unused pattern variable), which let a
    // supplied source bypass the scan whenever the job carried no options.
    if (context.PolicySource is not null)
    {
        var secretResult = await ScanForSecretsAsync(context.PolicySource, cancellationToken).ConfigureAwait(false);
        violations.AddRange(secretResult.Violations);
    }

    // NOTE(review): only Critical violations fail validation; High findings such as
    // RATE_LIMIT_EXCEEDED or a missing 'policy:read' permission are reported but do not
    // block the job. Confirm this is intended.
    var isValid = violations.All(v => v.Severity != ViolationSeverity.Critical);

    if (!isValid)
    {
        _logger.LogWarning(
            "Security validation failed for job {JobId}: {ViolationCount} violations found.",
            job.JobId,
            violations.Count);
    }

    return new SecurityValidationResult(
        IsValid: isValid,
        Violations: [.. violations],
        ValidatedAt: DateTimeOffset.UtcNow,
        ValidatorVersion: "1.0.0");
}
|
||||
|
||||
/// <summary>
/// Checks that every finding in a simulation shard result references an SBOM owned by the
/// context tenant.
/// </summary>
/// <param name="result">Shard result whose findings are checked one by one.</param>
/// <param name="context">Supplies the tenant the findings must belong to.</param>
/// <param name="cancellationToken">Forwarded to each ownership lookup.</param>
/// <returns>A result that is valid only when no finding failed the ownership check.</returns>
public async ValueTask<SecurityValidationResult> ValidateShardResultAsync(
    SimulationShardResult result,
    SimulationSecurityContext context,
    CancellationToken cancellationToken = default)
{
    var breaches = new List<SecurityViolation>();

    foreach (var finding in result.Findings)
    {
        var ownedByTenant = await _scopeValidator
            .ValidateFindingOwnershipAsync(finding.SbomId, context.TenantId, cancellationToken)
            .ConfigureAwait(false);

        if (ownedByTenant)
        {
            continue;
        }

        // One violation per offending finding, each of them Critical.
        breaches.Add(new SecurityViolation(
            "TENANT_ISOLATION_BREACH",
            $"Finding {finding.FindingId} references SBOM not owned by tenant {context.TenantId}.",
            ViolationSeverity.Critical,
            "ShardResultValidator"));
    }

    var passed = breaches.Count == 0;
    return new SecurityValidationResult(passed, [.. breaches], DateTimeOffset.UtcNow, "1.0.0");
}
|
||||
|
||||
/// <summary>
/// Tenant-isolation step: the job's tenant must equal the context tenant (ordinal comparison),
/// and every SBOM as well as the policy referenced by the job must be owned by that tenant.
/// </summary>
/// <param name="job">Job whose tenant, SBOM ids and policy id are checked.</param>
/// <param name="context">Supplies the tenant to validate against.</param>
/// <param name="cancellationToken">Forwarded to each ownership lookup.</param>
/// <returns>The violations found by this step; all of them are Critical.</returns>
private async ValueTask<ValidationStepResult> ValidateTenantIsolationAsync(
    SimulationJob job,
    SimulationSecurityContext context,
    CancellationToken cancellationToken)
{
    // Maximum number of offending SBOM ids spelled out in the violation message.
    const int MaxListedSboms = 5;

    var violations = new List<SecurityViolation>();

    // Verify job tenant matches context tenant
    if (!string.Equals(job.TenantId, context.TenantId, StringComparison.Ordinal))
    {
        violations.Add(new SecurityViolation(
            Code: "TENANT_MISMATCH",
            Message: $"Job tenant {job.TenantId} does not match context tenant {context.TenantId}.",
            Severity: ViolationSeverity.Critical,
            Source: "TenantIsolation"));
    }

    // Verify all SBOMs belong to the tenant
    var invalidSboms = new List<string>();
    foreach (var sbomId in job.SbomIds)
    {
        var isOwned = await _scopeValidator.ValidateSbomOwnershipAsync(
            sbomId,
            context.TenantId,
            cancellationToken).ConfigureAwait(false);

        if (!isOwned)
        {
            invalidSboms.Add(sbomId);
        }
    }

    if (invalidSboms.Count > 0)
    {
        var listed = string.Join(", ", invalidSboms.Take(MaxListedSboms));

        // The ellipsis signals truncation, so it is added only when ids were actually left out.
        var ellipsis = invalidSboms.Count > MaxListedSboms ? "..." : string.Empty;

        violations.Add(new SecurityViolation(
            Code: "SBOM_OWNERSHIP_VIOLATION",
            Message: $"{invalidSboms.Count} SBOM(s) not owned by tenant {context.TenantId}: {listed}{ellipsis}",
            Severity: ViolationSeverity.Critical,
            Source: "TenantIsolation"));
    }

    // Verify policy belongs to tenant
    var policyOwned = await _scopeValidator.ValidatePolicyOwnershipAsync(
        job.PolicyId,
        context.TenantId,
        cancellationToken).ConfigureAwait(false);

    if (!policyOwned)
    {
        violations.Add(new SecurityViolation(
            Code: "POLICY_OWNERSHIP_VIOLATION",
            Message: $"Policy {job.PolicyId} not owned by tenant {context.TenantId}.",
            Severity: ViolationSeverity.Critical,
            Source: "TenantIsolation"));
    }

    return new ValidationStepResult(violations);
}
|
||||
|
||||
/// <summary>
/// Scope step: the caller must hold the 'simulation:execute' and 'policy:read' permissions,
/// and the tenant must be within its "simulation" rate limit.
/// </summary>
/// <param name="job">Not read by this step.</param>
/// <param name="context">Supplies the caller's permissions and the tenant id.</param>
/// <param name="cancellationToken">Forwarded to the rate-limit check.</param>
/// <returns>The violations found by this step.</returns>
private async ValueTask<ValidationStepResult> ValidateScopePermissionsAsync(
    SimulationJob job,
    SimulationSecurityContext context,
    CancellationToken cancellationToken)
{
    const string Origin = "ScopeValidation";
    var found = new List<SecurityViolation>();

    // Records a MISSING_PERMISSION violation when the caller does not hold the permission.
    void RequirePermission(string permission, ViolationSeverity severity)
    {
        if (context.Permissions.Contains(permission))
        {
            return;
        }

        found.Add(new SecurityViolation(
            "MISSING_PERMISSION",
            $"Caller lacks '{permission}' permission.",
            severity,
            Origin));
    }

    RequirePermission("simulation:execute", ViolationSeverity.Critical);
    RequirePermission("policy:read", ViolationSeverity.High);

    var quota = await _scopeValidator
        .CheckRateLimitAsync(context.TenantId, "simulation", cancellationToken)
        .ConfigureAwait(false);

    if (!quota.IsAllowed)
    {
        found.Add(new SecurityViolation(
            "RATE_LIMIT_EXCEEDED",
            $"Simulation rate limit exceeded for tenant {context.TenantId}. Retry after {quota.RetryAfter}.",
            ViolationSeverity.High,
            Origin));
    }

    return new ValidationStepResult(found);
}
|
||||
|
||||
/// <summary>
/// Attestation step: verifies the policy attestation, then the attestations of at most the
/// first ten SBOMs of the job (a sample, not the full set).
/// </summary>
/// <param name="job">Supplies the policy id and SBOM ids.</param>
/// <param name="context">Supplies the tenant id passed to the verifier.</param>
/// <param name="cancellationToken">Forwarded to every verification call.</param>
/// <returns>The violations found: High for the policy, Medium for each failing SBOM.</returns>
private async ValueTask<ValidationStepResult> ValidateAttestationsAsync(
    SimulationJob job,
    SimulationSecurityContext context,
    CancellationToken cancellationToken)
{
    const string Origin = "AttestationVerification";
    var found = new List<SecurityViolation>();

    var policyCheck = await _attestationVerifier
        .VerifyPolicyAttestationAsync(job.PolicyId, context.TenantId, cancellationToken)
        .ConfigureAwait(false);

    if (!policyCheck.IsValid)
    {
        found.Add(new SecurityViolation(
            "INVALID_POLICY_ATTESTATION",
            $"Policy {job.PolicyId} attestation invalid: {policyCheck.Reason}.",
            ViolationSeverity.High,
            Origin));
    }

    // Snapshot the sample up front so the loop works on a fixed set of ids.
    var limit = Math.Min(job.SbomIds.Length, 10);
    var sample = job.SbomIds.Take(limit).ToArray();

    for (var i = 0; i < sample.Length; i++)
    {
        var sbomId = sample[i];
        var sbomCheck = await _attestationVerifier
            .VerifySbomAttestationAsync(sbomId, context.TenantId, cancellationToken)
            .ConfigureAwait(false);

        if (sbomCheck.IsValid)
        {
            continue;
        }

        found.Add(new SecurityViolation(
            "INVALID_SBOM_ATTESTATION",
            $"SBOM {sbomId} attestation invalid: {sbomCheck.Reason}.",
            ViolationSeverity.Medium,
            Origin));
    }

    return new ValidationStepResult(found);
}
|
||||
|
||||
/// <summary>
/// Secret-scan step: runs the configured scanner over the policy source and turns every
/// detected secret into a Critical SECRET_DETECTED violation.
/// </summary>
/// <param name="policySource">Policy text to scan.</param>
/// <param name="cancellationToken">Forwarded to the scanner.</param>
/// <returns>One violation per detected secret, in the scanner's order.</returns>
private async ValueTask<ValidationStepResult> ScanForSecretsAsync(
    string policySource,
    CancellationToken cancellationToken)
{
    var scan = await _secretScanner.ScanAsync(policySource, cancellationToken).ConfigureAwait(false);

    // Only the secret's type and line number go into the message, never its text.
    var found = scan.DetectedSecrets
        .Select(hit => new SecurityViolation(
            "SECRET_DETECTED",
            $"Potential secret detected in policy source: {hit.Type} at line {hit.LineNumber}.",
            ViolationSeverity.Critical,
            "SecretScanner"))
        .ToList();

    return new ValidationStepResult(found);
}
|
||||
|
||||
// Carries the violations collected by one private validation step back to ValidateJobAsync,
// which appends them to its aggregate list.
private sealed record ValidationStepResult(List<SecurityViolation> Violations);
|
||||
}
|
||||
|
||||
/// <summary>
/// Contract for simulation security enforcement.
/// </summary>
public interface ISimulationSecurityEnforcer
{
    /// <summary>Validates a simulation job for security compliance.</summary>
    /// <param name="job">Job to validate.</param>
    /// <param name="context">Security context the job is validated against.</param>
    /// <param name="cancellationToken">Token used to cancel the validation.</param>
    ValueTask<SecurityValidationResult> ValidateJobAsync(SimulationJob job, SimulationSecurityContext context, CancellationToken cancellationToken = default);

    /// <summary>Validates simulation shard results for tenant isolation.</summary>
    /// <param name="result">Shard result to validate.</param>
    /// <param name="context">Security context the result is validated against.</param>
    /// <param name="cancellationToken">Token used to cancel the validation.</param>
    ValueTask<SecurityValidationResult> ValidateShardResultAsync(SimulationShardResult result, SimulationSecurityContext context, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Contract for tenant scope validation: ownership lookups plus a per-tenant rate-limit check.
/// </summary>
public interface ITenantScopeValidator
{
    /// <summary>Reports whether the SBOM <paramref name="sbomId"/> is owned by <paramref name="tenantId"/>.</summary>
    ValueTask<bool> ValidateSbomOwnershipAsync(
        string sbomId,
        string tenantId,
        CancellationToken cancellationToken = default);

    /// <summary>Reports whether the policy <paramref name="policyId"/> is owned by <paramref name="tenantId"/>.</summary>
    ValueTask<bool> ValidatePolicyOwnershipAsync(
        string policyId,
        string tenantId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Reports whether a finding that references the SBOM <paramref name="sbomId"/> belongs to
    /// <paramref name="tenantId"/>. The finding is identified by the SBOM id it references.
    /// </summary>
    ValueTask<bool> ValidateFindingOwnershipAsync(
        string sbomId,
        string tenantId,
        CancellationToken cancellationToken = default);

    /// <summary>Checks the rate limit of <paramref name="tenantId"/> for the named <paramref name="operation"/>.</summary>
    ValueTask<RateLimitResult> CheckRateLimitAsync(
        string tenantId,
        string operation,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Contract for attestation verification of policies and SBOMs.
/// </summary>
public interface IAttestationVerifier
{
    /// <summary>Verifies the attestation of the policy <paramref name="policyId"/> for <paramref name="tenantId"/>.</summary>
    ValueTask<AttestationResult> VerifyPolicyAttestationAsync(
        string policyId,
        string tenantId,
        CancellationToken cancellationToken = default);

    /// <summary>Verifies the attestation of the SBOM <paramref name="sbomId"/> for <paramref name="tenantId"/>.</summary>
    ValueTask<AttestationResult> VerifySbomAttestationAsync(
        string sbomId,
        string tenantId,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Contract for secret scanning of text content.
/// </summary>
public interface ISecretScanner
{
    /// <summary>Scans <paramref name="content"/> and reports the secrets detected in it.</summary>
    /// <param name="content">Text to scan.</param>
    /// <param name="cancellationToken">Token used to cancel the scan.</param>
    ValueTask<SecretScanResult> ScanAsync(
        string content,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Security context for simulation jobs.
/// </summary>
/// <param name="TenantId">Tenant that jobs, SBOMs, policies and findings are validated against.</param>
/// <param name="CallerId">Identifier of the caller. NOTE(review): not read by the enforcer in this file; presumably kept for auditing.</param>
/// <param name="Permissions">Permissions held by the caller; checked for 'simulation:execute' and 'policy:read'.</param>
/// <param name="RequireAttestation">When <c>true</c>, job validation also verifies attestations. Defaults to <c>false</c>.</param>
/// <param name="PolicySource">Optional policy source text that is scanned for secrets. Defaults to <c>null</c>.</param>
public sealed record SimulationSecurityContext(string TenantId, string CallerId, ImmutableHashSet<string> Permissions, bool RequireAttestation = false, string? PolicySource = null);
|
||||
|
||||
/// <summary>
/// Result of security validation.
/// </summary>
/// <param name="IsValid">Whether validation passed.</param>
/// <param name="Violations">Every violation that was found, including ones that did not fail validation.</param>
/// <param name="ValidatedAt">Time the result was produced.</param>
/// <param name="ValidatorVersion">Version string of the validator that produced the result.</param>
public sealed record SecurityValidationResult(bool IsValid, ImmutableArray<SecurityViolation> Violations, DateTimeOffset ValidatedAt, string ValidatorVersion);
|
||||
|
||||
/// <summary>
/// A security violation.
/// </summary>
/// <param name="Code">Machine-readable code, for example <c>TENANT_MISMATCH</c>.</param>
/// <param name="Message">Human-readable description.</param>
/// <param name="Severity">Severity of the violation.</param>
/// <param name="Source">Name of the validation step that raised the violation, for example <c>TenantIsolation</c>.</param>
public sealed record SecurityViolation(string Code, string Message, ViolationSeverity Severity, string Source);
|
||||
|
||||
/// <summary>
/// Severity of a security violation, declared in ascending order.
/// </summary>
public enum ViolationSeverity
{
    // Explicit values mirror the ordinals the compiler assigned implicitly.
    Low = 0,
    Medium = 1,
    High = 2,
    Critical = 3,
}
|
||||
|
||||
/// <summary>
/// Result of rate limit check.
/// </summary>
/// <param name="IsAllowed">Whether the operation may proceed.</param>
/// <param name="RemainingQuota">Quota left after the check. NOTE(review): unit and window are not visible in this file.</param>
/// <param name="RetryAfter">Optional wait before retrying; included in the rate-limit violation message. Defaults to <c>null</c>.</param>
public sealed record RateLimitResult(bool IsAllowed, int RemainingQuota, TimeSpan? RetryAfter = null);
|
||||
|
||||
/// <summary>
/// Result of attestation verification.
/// </summary>
/// <param name="IsValid">Whether the attestation verified successfully.</param>
/// <param name="Reason">Optional explanation; included in the violation message when verification fails. Defaults to <c>null</c>.</param>
/// <param name="VerifiedAt">Optional time of verification. Defaults to <c>null</c>.</param>
public sealed record AttestationResult(bool IsValid, string? Reason = null, DateTimeOffset? VerifiedAt = null);
|
||||
|
||||
/// <summary>
/// Result of secret scanning.
/// </summary>
/// <param name="HasSecrets">Whether any secret was detected.</param>
/// <param name="DetectedSecrets">The detected secrets; empty when none were found.</param>
public sealed record SecretScanResult(bool HasSecrets, ImmutableArray<DetectedSecret> DetectedSecrets);
|
||||
|
||||
/// <summary>
/// A detected secret.
/// </summary>
/// <param name="Type">Kind of secret, for example <c>AWS_ACCESS_KEY</c>.</param>
/// <param name="LineNumber">Line on which the secret was found (1-based when produced by the regex scanner in this file).</param>
/// <param name="Context">Surrounding text; the regex scanner in this file stores the line with the matched text replaced by <c>[REDACTED]</c>.</param>
public sealed record DetectedSecret(string Type, int LineNumber, string Context);
|
||||
|
||||
/// <summary>
/// Default implementation of tenant scope validator.
/// </summary>
/// <remarks>
/// Placeholder: every ownership check succeeds and the rate limit always allows the operation,
/// regardless of the arguments.
/// </remarks>
public sealed class DefaultTenantScopeValidator : ITenantScopeValidator
{
    /// <summary>Placeholder; always reports ownership.</summary>
    public ValueTask<bool> ValidateSbomOwnershipAsync(string sbomId, string tenantId, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(true);
    }

    /// <summary>Placeholder; always reports ownership.</summary>
    public ValueTask<bool> ValidatePolicyOwnershipAsync(string policyId, string tenantId, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(true);
    }

    /// <summary>Placeholder; always reports ownership.</summary>
    public ValueTask<bool> ValidateFindingOwnershipAsync(string sbomId, string tenantId, CancellationToken cancellationToken = default)
    {
        return ValueTask.FromResult(true);
    }

    /// <summary>Placeholder; always allows the operation with a remaining quota of 100.</summary>
    public ValueTask<RateLimitResult> CheckRateLimitAsync(string tenantId, string operation, CancellationToken cancellationToken = default)
    {
        var unlimited = new RateLimitResult(true, 100);
        return ValueTask.FromResult(unlimited);
    }
}
|
||||
|
||||
/// <summary>
/// Default implementation of attestation verifier.
/// </summary>
/// <remarks>
/// Placeholder: every attestation is reported as valid, with no reason and the current UTC
/// time as the verification time.
/// </remarks>
public sealed class DefaultAttestationVerifier : IAttestationVerifier
{
    /// <summary>Placeholder; always reports a valid policy attestation.</summary>
    public ValueTask<AttestationResult> VerifyPolicyAttestationAsync(string policyId, string tenantId, CancellationToken cancellationToken = default)
    {
        var accepted = new AttestationResult(true, null, DateTimeOffset.UtcNow);
        return ValueTask.FromResult(accepted);
    }

    /// <summary>Placeholder; always reports a valid SBOM attestation.</summary>
    public ValueTask<AttestationResult> VerifySbomAttestationAsync(string sbomId, string tenantId, CancellationToken cancellationToken = default)
    {
        var accepted = new AttestationResult(true, null, DateTimeOffset.UtcNow);
        return ValueTask.FromResult(accepted);
    }
}
|
||||
|
||||
/// <summary>
/// Regex-based secret scanner implementation.
/// </summary>
/// <remarks>
/// The content is scanned line by line against a fixed table of patterns. Each (line, pattern)
/// hit yields one <see cref="DetectedSecret"/> whose context is the line with the matches of
/// <em>every</em> pattern redacted, so a second secret on the same line is never echoed back
/// through the context of the first.
/// </remarks>
public sealed partial class RegexSecretScanner : ISecretScanner
{
    private const string RedactionMarker = "[REDACTED]";

    private static readonly (string Type, Regex Pattern)[] SecretPatterns =
    [
        ("AWS_ACCESS_KEY", AwsAccessKeyRegex()),
        ("AWS_SECRET_KEY", AwsSecretKeyRegex()),
        ("GITHUB_TOKEN", GithubTokenRegex()),
        ("GENERIC_API_KEY", GenericApiKeyRegex()),
        ("PRIVATE_KEY", PrivateKeyRegex()),
        ("PASSWORD_IN_URL", PasswordInUrlRegex())
    ];

    /// <summary>
    /// Scans <paramref name="content"/> for text that looks like a secret.
    /// </summary>
    /// <param name="content">Text to scan; split on '\n', with a trailing '\r' removed from each line.</param>
    /// <param name="cancellationToken">Accepted for interface compatibility; not observed.</param>
    /// <returns>The detected secrets with 1-based line numbers and redacted context.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="content"/> is <c>null</c>.</exception>
    public ValueTask<SecretScanResult> ScanAsync(string content, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(content);

        var detectedSecrets = new List<DetectedSecret>();
        var lines = content.Split('\n');

        for (var index = 0; index < lines.Length; index++)
        {
            // Drop the '\r' of CRLF line endings so it does not end up in the reported context.
            var line = lines[index].TrimEnd('\r');

            // Computed lazily, at most once per line, on the first hit.
            string? maskedContext = null;

            foreach (var (type, pattern) in SecretPatterns)
            {
                if (!pattern.IsMatch(line))
                {
                    continue;
                }

                maskedContext ??= RedactAll(line);
                detectedSecrets.Add(new DetectedSecret(type, index + 1, maskedContext));
            }
        }

        return ValueTask.FromResult(new SecretScanResult(
            HasSecrets: detectedSecrets.Count > 0,
            DetectedSecrets: [.. detectedSecrets]));
    }

    /// <summary>
    /// Replaces the matches of all known secret patterns in <paramref name="line"/> with the redaction marker.
    /// </summary>
    private static string RedactAll(string line)
    {
        var masked = line;
        foreach (var (_, pattern) in SecretPatterns)
        {
            masked = pattern.Replace(masked, RedactionMarker);
        }

        return masked;
    }

    [GeneratedRegex(@"AKIA[0-9A-Z]{16}", RegexOptions.Compiled)]
    private static partial Regex AwsAccessKeyRegex();

    // NOTE(review): this matches any run of 40 base64-alphabet characters, so hex digests and
    // similar long tokens are also reported. Confirm that false-positive rate is acceptable.
    [GeneratedRegex(@"[A-Za-z0-9/+=]{40}", RegexOptions.Compiled)]
    private static partial Regex AwsSecretKeyRegex();

    [GeneratedRegex(@"gh[pousr]_[A-Za-z0-9_]{36,}", RegexOptions.Compiled)]
    private static partial Regex GithubTokenRegex();

    [GeneratedRegex(@"(?i)(api[_-]?key|apikey|secret[_-]?key)\s*[:=]\s*['""]?[A-Za-z0-9_\-]{20,}['""]?", RegexOptions.Compiled)]
    private static partial Regex GenericApiKeyRegex();

    [GeneratedRegex(@"-----BEGIN (RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----", RegexOptions.Compiled)]
    private static partial Regex PrivateKeyRegex();

    // The user and password must sit inside the URL authority: no '/', whitespace or '@' may
    // appear before the terminating '@'. This stops "host:8080/path?x=a@b" and prose such as
    // "see https://host: mail me@x" from being reported as credentials.
    [GeneratedRegex(@"://[^/\s:@]+:[^/\s@]+@", RegexOptions.Compiled)]
    private static partial Regex PasswordInUrlRegex();
}
|
||||
|
||||
/// <summary>
/// No-op secret scanner for testing: reports no secrets for any content.
/// </summary>
public sealed class NullSecretScanner : ISecretScanner
{
    /// <summary>Shared instance.</summary>
    public static NullSecretScanner Instance { get; } = new();

    /// <summary>
    /// Ignores <paramref name="content"/> and returns an empty result.
    /// </summary>
    public ValueTask<SecretScanResult> ScanAsync(string content, CancellationToken cancellationToken = default)
    {
        var nothingFound = new SecretScanResult(false, []);
        return ValueTask.FromResult(nothingFound);
    }
}
|
||||
Reference in New Issue
Block a user