Doctor enhancements, setup enhancements, UI functionality and design consolidation, test project fixes, product advisory attestation/Rekor and delta verification enhancements
This commit is contained in:
@@ -0,0 +1,334 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// GreyQueueWatchdogService.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-004 - Add timeout watchdog for stuck processing
|
||||
// Description: Watchdog service to detect and handle stuck entries
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Diagnostics.Metrics;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Unknowns.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Watchdog service that detects and handles stuck entries in Processing status.
|
||||
/// </summary>
|
||||
public sealed class GreyQueueWatchdogService : BackgroundService
|
||||
{
|
||||
private readonly IGreyQueueRepository _repository;
|
||||
private readonly INotificationPublisher _notificationPublisher;
|
||||
private readonly GreyQueueWatchdogOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<GreyQueueWatchdogService> _logger;
|
||||
|
||||
private static readonly Meter WatchdogMeter = new("StellaOps.Unknowns.Watchdog", "1.0.0");
|
||||
private static readonly Counter<long> StuckTotal = WatchdogMeter.CreateCounter<long>(
|
||||
"greyqueue_stuck_total",
|
||||
"entries",
|
||||
"Total number of stuck entries detected");
|
||||
private static readonly Counter<long> TimeoutTotal = WatchdogMeter.CreateCounter<long>(
|
||||
"greyqueue_timeout_total",
|
||||
"entries",
|
||||
"Total number of entries that timed out");
|
||||
private static readonly Counter<long> RetryTotal = WatchdogMeter.CreateCounter<long>(
|
||||
"greyqueue_watchdog_retry_total",
|
||||
"entries",
|
||||
"Total number of forced retries by watchdog");
|
||||
private static readonly Counter<long> FailedTotal = WatchdogMeter.CreateCounter<long>(
|
||||
"greyqueue_watchdog_failed_total",
|
||||
"entries",
|
||||
"Total number of entries moved to Failed by watchdog");
|
||||
private static readonly Gauge<int> ProcessingCount = WatchdogMeter.CreateGauge<int>(
|
||||
"greyqueue_processing_count",
|
||||
"entries",
|
||||
"Current number of entries in Processing status");
|
||||
|
||||
/// <summary>
/// Creates the watchdog service.
/// </summary>
/// <param name="repository">Grey queue store; must not be null.</param>
/// <param name="notificationPublisher">Sink for watchdog notifications; must not be null.</param>
/// <param name="options">Watchdog settings; defaults are used when this or its value is null.</param>
/// <param name="timeProvider">Clock source; <see cref="TimeProvider.System"/> is used when null.</param>
/// <param name="logger">Logger; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="repository"/>, <paramref name="notificationPublisher"/> or <paramref name="logger"/> is null.
/// </exception>
public GreyQueueWatchdogService(
    IGreyQueueRepository repository,
    INotificationPublisher notificationPublisher,
    IOptions<GreyQueueWatchdogOptions> options,
    TimeProvider timeProvider,
    ILogger<GreyQueueWatchdogService> logger)
{
    ArgumentNullException.ThrowIfNull(repository);
    _repository = repository;

    ArgumentNullException.ThrowIfNull(notificationPublisher);
    _notificationPublisher = notificationPublisher;

    // Options and clock are optional: fall back to defaults instead of throwing.
    var configured = options?.Value;
    _options = configured is null ? new GreyQueueWatchdogOptions() : configured;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;

    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Runs the watchdog loop: performs one check of Processing entries, waits
/// <see cref="GreyQueueWatchdogOptions.CheckInterval"/>, and repeats until
/// <paramref name="stoppingToken"/> is signalled.
/// </summary>
/// <param name="stoppingToken">Token signalled when the service should stop.</param>
/// <returns>A task representing the lifetime of the loop.</returns>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    _logger.LogInformation(
        "Grey Queue Watchdog starting with interval {Interval}, alert threshold {AlertThreshold}, timeout {Timeout}",
        _options.CheckInterval,
        _options.ProcessingAlertThreshold,
        _options.ProcessingTimeout);

    while (!stoppingToken.IsCancellationRequested)
    {
        try
        {
            await CheckProcessingEntriesAsync(stoppingToken);
        }
        // Cancellation may only escape when shutdown was actually requested.
        // An OperationCanceledException raised for another reason (for example
        // a timeout inside a dependency) is a failed pass and must not end the loop.
        catch (Exception ex) when (ex is not OperationCanceledException || !stoppingToken.IsCancellationRequested)
        {
            _logger.LogError(ex, "Watchdog check failed");
        }

        // Wait on the injected clock so the schedule uses the same time source
        // as the duration checks (and can be driven by a fake clock in tests).
        await Task.Delay(_options.CheckInterval, _timeProvider, stoppingToken);
    }
}
|
||||
|
||||
/// <summary>
/// Performs one watchdog pass over all entries in Processing status.
/// Records the Processing count gauge, then for each entry measures the time
/// since <c>LastProcessedAt</c> (or <c>CreatedAt</c> when that is null):
/// at or beyond the timeout the entry is handed to <see cref="HandleTimeoutAsync"/>;
/// at or beyond the alert threshold (but below the timeout) a
/// <see cref="StuckProcessingAlert"/> is published.
/// </summary>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
/// <remarks>
/// Each entry is handled in isolation: a failure for one entry is logged and the
/// pass continues, so a single bad entry cannot block the entries after it.
/// NOTE(review): an entry inside the alert window is alerted (and counted) again
/// on every pass; no de-duplication is visible here.
/// </remarks>
private async Task CheckProcessingEntriesAsync(CancellationToken ct)
{
    var now = _timeProvider.GetUtcNow();
    var processingEntries = await _repository.GetByStatusAsync(GreyQueueStatus.Processing, ct);

    ProcessingCount.Record(processingEntries.Count);

    foreach (var entry in processingEntries)
    {
        try
        {
            var processingDuration = now - (entry.LastProcessedAt ?? entry.CreatedAt);

            if (processingDuration >= _options.ProcessingTimeout)
            {
                // Timed out: retry or fail the entry.
                await HandleTimeoutAsync(entry, processingDuration, ct);
            }
            else if (processingDuration >= _options.ProcessingAlertThreshold)
            {
                // Stuck: past the alert threshold but not yet timed out.
                _logger.LogWarning(
                    "Entry {EntryId} has been processing for {Duration}",
                    entry.Id, processingDuration);

                StuckTotal.Add(1);

                await _notificationPublisher.PublishAsync(new StuckProcessingAlert
                {
                    EntryId = entry.Id,
                    BomRef = entry.BomRef,
                    ProcessingDuration = processingDuration,
                    AlertedAt = now
                }, ct);
            }
        }
        // Let cancellation through only when the caller's token was cancelled.
        catch (Exception ex) when (ex is not OperationCanceledException || !ct.IsCancellationRequested)
        {
            _logger.LogError(ex, "Watchdog failed to handle entry {EntryId}", entry.Id);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Handles an entry whose processing duration reached the timeout.
/// When the entry has used up its attempts it is marked Failed and an
/// <see cref="EntryFailedNotification"/> is published; otherwise a retry is
/// forced with exponential backoff and a <see cref="ForcedRetryNotification"/>
/// is published.
/// </summary>
/// <param name="entry">The timed-out entry.</param>
/// <param name="processingDuration">How long the entry has been processing (used for logging).</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
private async Task HandleTimeoutAsync(
    GreyQueueEntry entry,
    TimeSpan processingDuration,
    CancellationToken ct)
{
    var now = _timeProvider.GetUtcNow();

    _logger.LogWarning(
        "Entry {EntryId} has timed out after {Duration}. Attempts: {Attempts}/{MaxAttempts}",
        entry.Id, processingDuration, entry.ProcessingAttempts, entry.MaxAttempts);

    TimeoutTotal.Add(1);

    // Out of attempts: mark as Failed and notify.
    if (entry.ProcessingAttempts >= entry.MaxAttempts)
    {
        _logger.LogError(
            "Entry {EntryId} has exceeded max attempts ({MaxAttempts}), marking as Failed",
            entry.Id, entry.MaxAttempts);

        await _repository.UpdateStatusAsync(entry.Id, GreyQueueStatus.Failed, ct);
        FailedTotal.Add(1);

        await _notificationPublisher.PublishAsync(new EntryFailedNotification
        {
            EntryId = entry.Id,
            BomRef = entry.BomRef,
            Reason = $"Timed out after {entry.ProcessingAttempts} attempts",
            FailedAt = now
        }, ct);

        return;
    }

    // Force retry.
    _logger.LogInformation(
        "Forcing retry for entry {EntryId} (attempt {Attempt})",
        entry.Id, entry.ProcessingAttempts + 1);

    // Backoff doubles with every attempt already made. For large attempt counts
    // the delay would exceed what DateTimeOffset can represent and AddMinutes
    // would throw, so saturate at DateTimeOffset.MaxValue instead.
    var delayMinutes = _options.BaseRetryDelayMinutes * Math.Pow(2, entry.ProcessingAttempts);
    var representableMinutes = (DateTimeOffset.MaxValue - now).TotalMinutes;
    var nextProcessingAt = delayMinutes < representableMinutes
        ? now.AddMinutes(delayMinutes)
        : DateTimeOffset.MaxValue;

    await _repository.ForceRetryAsync(entry.Id, nextProcessingAt, ct);
    RetryTotal.Add(1);

    await _notificationPublisher.PublishAsync(new ForcedRetryNotification
    {
        EntryId = entry.Id,
        BomRef = entry.BomRef,
        AttemptNumber = entry.ProcessingAttempts + 1,
        NextProcessingAt = nextProcessingAt
    }, ct);
}
|
||||
|
||||
/// <summary>
/// Requests an immediate retry for an entry that is currently in Processing or Failed status.
/// </summary>
/// <param name="entryId">Identifier of the entry to retry.</param>
/// <param name="ct">Cancellation token forwarded to the repository.</param>
/// <exception cref="InvalidOperationException">
/// The entry does not exist, or its status is neither Processing nor Failed.
/// </exception>
public async Task ManualRetryAsync(Guid entryId, CancellationToken ct = default)
{
    var target = await _repository.GetByIdAsync(entryId, ct);
    if (target == null)
    {
        throw new InvalidOperationException($"Entry {entryId} not found");
    }

    var notProcessing = target.Status != GreyQueueStatus.Processing;
    var notFailed = target.Status != GreyQueueStatus.Failed;
    if (notProcessing && notFailed)
    {
        throw new InvalidOperationException($"Entry {entryId} is not stuck (status: {target.Status})");
    }

    _logger.LogInformation("Manual retry triggered for entry {EntryId}", entryId);

    // Schedule the retry for "now" as seen by the injected clock.
    var retryAt = _timeProvider.GetUtcNow();
    await _repository.ForceRetryAsync(entryId, retryAt, ct);
}
|
||||
|
||||
/// <summary>
/// Computes a snapshot of the entries currently in Processing status.
/// </summary>
/// <param name="ct">Cancellation token forwarded to the repository.</param>
/// <returns>
/// Totals for Processing, stuck (at or past the alert threshold but below the timeout)
/// and timed-out entries, the longest processing duration seen (never below zero),
/// and the time of the check.
/// </returns>
public async Task<WatchdogStats> GetStatsAsync(CancellationToken ct = default)
{
    var checkedAt = _timeProvider.GetUtcNow();
    var inFlight = await _repository.GetByStatusAsync(GreyQueueStatus.Processing, ct);

    var stuck = 0;
    var timedOut = 0;
    var longest = TimeSpan.Zero;

    foreach (var item in inFlight)
    {
        // Age is measured from the last processing timestamp, or creation when there is none.
        var age = checkedAt - (item.LastProcessedAt ?? item.CreatedAt);

        longest = age > longest ? age : longest;

        // Timed-out entries are not also counted as stuck.
        if (age >= _options.ProcessingTimeout)
        {
            timedOut++;
            continue;
        }

        if (age >= _options.ProcessingAlertThreshold)
        {
            stuck++;
        }
    }

    return new WatchdogStats
    {
        TotalProcessing = inFlight.Count,
        StuckCount = stuck,
        TimedOutCount = timedOut,
        OldestProcessingDuration = longest,
        CheckedAt = checkedAt
    };
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Settings for <see cref="GreyQueueWatchdogService"/>.
/// </summary>
public sealed record GreyQueueWatchdogOptions
{
    /// <summary>Name of the configuration section these options are read from.</summary>
    public const string SectionName = "Unknowns:Watchdog";

    /// <summary>Pause between watchdog passes. Default: 5 minutes.</summary>
    public TimeSpan CheckInterval { get; init; } = new TimeSpan(0, 5, 0);

    /// <summary>Processing duration at which a stuck alert is raised. Default: 1 hour.</summary>
    public TimeSpan ProcessingAlertThreshold { get; init; } = new TimeSpan(1, 0, 0);

    /// <summary>Processing duration at which the entry is treated as timed out. Default: 4 hours.</summary>
    public TimeSpan ProcessingTimeout { get; init; } = new TimeSpan(4, 0, 0);

    /// <summary>
    /// Maximum processing attempts before marking as Failed. Default: 5.
    /// NOTE(review): the watchdog's timeout handling compares against the entry's
    /// own <c>MaxAttempts</c>; confirm where this option is consumed.
    /// </summary>
    public int MaxAttempts { get; init; } = 5;

    /// <summary>Base retry delay in minutes; doubled per attempt already made. Default: 15.</summary>
    public double BaseRetryDelayMinutes { get; init; } = 15.0;
}
|
||||
|
||||
/// <summary>
/// Point-in-time summary of entries in Processing status, as produced by
/// <see cref="GreyQueueWatchdogService.GetStatsAsync"/>.
/// </summary>
public sealed record WatchdogStats
{
    /// <summary>Number of entries in Processing status.</summary>
    public int TotalProcessing { get; init; }

    /// <summary>Entries at or past the alert threshold but not yet timed out.</summary>
    public int StuckCount { get; init; }

    /// <summary>Entries at or past the processing timeout.</summary>
    public int TimedOutCount { get; init; }

    /// <summary>Longest processing duration among the entries examined.</summary>
    public TimeSpan OldestProcessingDuration { get; init; }

    /// <summary>Time at which the statistics were computed.</summary>
    public DateTimeOffset CheckedAt { get; init; }
}
|
||||
|
||||
#region Notifications
|
||||
|
||||
/// <summary>
/// Alert published when an entry has been processing past the alert threshold
/// without yet reaching the timeout.
/// </summary>
public sealed record StuckProcessingAlert
{
    /// <summary>Identifier of the stuck entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference copied from the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>How long the entry had been processing when the alert was raised.</summary>
    public TimeSpan ProcessingDuration { get; init; }

    /// <summary>Time the alert was raised.</summary>
    public DateTimeOffset AlertedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Notification published when the watchdog marks an entry as Failed.
/// </summary>
public sealed record EntryFailedNotification
{
    /// <summary>Identifier of the failed entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference copied from the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Human-readable explanation of the failure.</summary>
    public required string Reason { get; init; }

    /// <summary>Time the entry was marked as failed.</summary>
    public DateTimeOffset FailedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Notification published when the watchdog forces a retry of a timed-out entry.
/// </summary>
public sealed record ForcedRetryNotification
{
    /// <summary>Identifier of the retried entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference copied from the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Number of the attempt being scheduled (attempts already made plus one).</summary>
    public int AttemptNumber { get; init; }

    /// <summary>Time at which the entry is scheduled to be processed again.</summary>
    public DateTimeOffset NextProcessingAt { get; init; }
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Repository Extensions
|
||||
|
||||
// Repository members required by the watchdog; intended to be added to IGreyQueueRepository.
// NOTE(review): other members used by services in this namespace are presumably
// declared in another part of this partial interface.
public partial interface IGreyQueueRepository
{
    /// <summary>Returns all entries whose status equals <paramref name="status"/>.</summary>
    /// <param name="status">Status to filter by.</param>
    /// <param name="ct">Cancellation token.</param>
    Task<IReadOnlyList<GreyQueueEntry>> GetByStatusAsync(GreyQueueStatus status, CancellationToken ct = default);

    /// <summary>Sets the status of the entry identified by <paramref name="entryId"/>.</summary>
    /// <param name="entryId">Entry to update.</param>
    /// <param name="status">New status.</param>
    /// <param name="ct">Cancellation token.</param>
    Task UpdateStatusAsync(Guid entryId, GreyQueueStatus status, CancellationToken ct = default);

    /// <summary>Forces a retry of an entry, scheduled for <paramref name="nextProcessingAt"/>.</summary>
    /// <param name="entryId">Entry to retry.</param>
    /// <param name="nextProcessingAt">When the entry should be processed next.</param>
    /// <param name="ct">Cancellation token.</param>
    Task ForceRetryAsync(Guid entryId, DateTimeOffset nextProcessingAt, CancellationToken ct = default);

    /// <summary>Looks up a single entry by identifier.</summary>
    /// <param name="entryId">Entry to fetch.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The entry, or null (callers treat null as "not found").</returns>
    Task<GreyQueueEntry?> GetByIdAsync(Guid entryId, CancellationToken ct = default);
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,423 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsLifecycleService.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-002 - Implement automatic escalation and demotion
|
||||
// Description: Background service for automatic band transitions based on events
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Diagnostics.Metrics;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Unknowns.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Background service that handles automatic band transitions for unknowns.
|
||||
/// Subscribes to EPSS, KEV, deployment, and runtime events.
|
||||
/// </summary>
|
||||
public sealed class UnknownsLifecycleService : BackgroundService
|
||||
{
|
||||
private readonly IGreyQueueRepository _repository;
|
||||
private readonly IEventSubscriber _eventSubscriber;
|
||||
private readonly INotificationPublisher _notificationPublisher;
|
||||
private readonly UnknownsLifecycleOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<UnknownsLifecycleService> _logger;
|
||||
|
||||
private static readonly Meter LifecycleMeter = new("StellaOps.Unknowns.Lifecycle", "1.0.0");
|
||||
private static readonly Counter<long> EscalatedTotal = LifecycleMeter.CreateCounter<long>(
|
||||
"unknowns_escalated_total",
|
||||
"entries",
|
||||
"Total number of unknowns escalated to higher band");
|
||||
private static readonly Counter<long> DemotedTotal = LifecycleMeter.CreateCounter<long>(
|
||||
"unknowns_demoted_total",
|
||||
"entries",
|
||||
"Total number of unknowns demoted to lower band");
|
||||
private static readonly Counter<long> ExpiredTotal = LifecycleMeter.CreateCounter<long>(
|
||||
"unknowns_expired_total",
|
||||
"entries",
|
||||
"Total number of unknowns expired");
|
||||
|
||||
/// <summary>
/// Creates the lifecycle service.
/// </summary>
/// <param name="repository">Grey queue store; must not be null.</param>
/// <param name="eventSubscriber">Source of EPSS/KEV/deployment/runtime events; must not be null.</param>
/// <param name="notificationPublisher">Sink for lifecycle notifications; must not be null.</param>
/// <param name="options">Lifecycle settings; defaults are used when this or its value is null.</param>
/// <param name="timeProvider">Clock source; <see cref="TimeProvider.System"/> is used when null.</param>
/// <param name="logger">Logger; must not be null.</param>
/// <exception cref="ArgumentNullException">A required dependency is null.</exception>
public UnknownsLifecycleService(
    IGreyQueueRepository repository,
    IEventSubscriber eventSubscriber,
    INotificationPublisher notificationPublisher,
    IOptions<UnknownsLifecycleOptions> options,
    TimeProvider timeProvider,
    ILogger<UnknownsLifecycleService> logger)
{
    ArgumentNullException.ThrowIfNull(repository);
    _repository = repository;

    ArgumentNullException.ThrowIfNull(eventSubscriber);
    _eventSubscriber = eventSubscriber;

    ArgumentNullException.ThrowIfNull(notificationPublisher);
    _notificationPublisher = notificationPublisher;

    // Options and clock are optional: fall back to defaults instead of throwing.
    var configured = options?.Value;
    _options = configured is null ? new UnknownsLifecycleOptions() : configured;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;

    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Registers the EPSS, KEV, deployment and runtime event handlers, then runs the
/// expiry loop: processes expired entries, waits
/// <see cref="UnknownsLifecycleOptions.ExpiryCheckInterval"/>, and repeats until
/// <paramref name="stoppingToken"/> is signalled.
/// </summary>
/// <param name="stoppingToken">Token signalled when the service should stop.</param>
/// <returns>A task representing the lifetime of the loop.</returns>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    _logger.LogInformation("Unknowns Lifecycle Service starting");

    // Subscribe to events
    _eventSubscriber.Subscribe<EpssUpdatedEvent>("epss.updated", HandleEpssUpdatedAsync);
    _eventSubscriber.Subscribe<KevAddedEvent>("kev.added", HandleKevAddedAsync);
    _eventSubscriber.Subscribe<DeploymentCreatedEvent>("deployment.created", HandleDeploymentCreatedAsync);
    _eventSubscriber.Subscribe<RuntimeUpdatedEvent>("runtime.updated", HandleRuntimeUpdatedAsync);

    // Start expiry processing loop
    while (!stoppingToken.IsCancellationRequested)
    {
        try
        {
            await ProcessExpiredEntriesAsync(stoppingToken);
        }
        // Cancellation may only escape when shutdown was actually requested.
        // An OperationCanceledException raised for another reason (for example
        // a timeout inside a dependency) is a failed pass and must not end the loop.
        catch (Exception ex) when (ex is not OperationCanceledException || !stoppingToken.IsCancellationRequested)
        {
            _logger.LogError(ex, "Error processing expired entries");
        }

        // Wait on the injected clock so the schedule uses the same time source
        // as the expiry cut-off (and can be driven by a fake clock in tests).
        await Task.Delay(_options.ExpiryCheckInterval, _timeProvider, stoppingToken);
    }
}
|
||||
|
||||
/// <summary>
/// Reacts to an EPSS score update: every entry linked to the CVE whose current
/// band is lower than the band of the new EPSS score is escalated to that band.
/// </summary>
/// <param name="evt">The EPSS update (CVE id, old and new score).</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
/// <remarks>
/// NOTE(review): the escalation reason always reads "EPSS increased", even when
/// the new score is lower than the old one — confirm this is intended.
/// </remarks>
private async Task HandleEpssUpdatedAsync(EpssUpdatedEvent evt, CancellationToken ct)
{
    _logger.LogDebug("Processing EPSS update for {CveId}: {OldScore} -> {NewScore}",
        evt.CveId, evt.OldScore, evt.NewScore);

    // Unknowns affected by this CVE.
    var affected = await _repository.GetByCveAsync(evt.CveId, ct);

    // The target band depends only on the event, so compute it once.
    var targetBand = GetBand(evt.NewScore);

    foreach (var candidate in affected)
    {
        var shouldEscalate = targetBand > GetBand(candidate.Score);
        if (!shouldEscalate)
        {
            continue;
        }

        await EscalateEntryAsync(candidate, targetBand, $"EPSS increased: {evt.OldScore:F2} -> {evt.NewScore:F2}", ct);
    }
}
|
||||
|
||||
/// <summary>
/// Reacts to a CVE being added to KEV: every entry linked to the CVE that is not
/// already in the Hot band is escalated to Hot.
/// </summary>
/// <param name="evt">The KEV event (CVE id and date added).</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
private async Task HandleKevAddedAsync(KevAddedEvent evt, CancellationToken ct)
{
    _logger.LogInformation("KEV added for {CveId}, escalating affected unknowns", evt.CveId);

    var affected = await _repository.GetByCveAsync(evt.CveId, ct);

    foreach (var candidate in affected)
    {
        var alreadyHot = GetBand(candidate.Score) == UnknownsBand.Hot;
        if (alreadyHot)
        {
            continue;
        }

        await EscalateEntryAsync(candidate, UnknownsBand.Hot, $"Added to CISA KEV on {evt.AddedDate:O}", ct);
    }
}
|
||||
|
||||
/// <summary>
/// Reacts to a new deployment: for each affected component, every entry with
/// that BOM reference that is currently in the Cold band is escalated to Warm.
/// </summary>
/// <param name="evt">The deployment event (deployment id and affected components).</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
private async Task HandleDeploymentCreatedAsync(DeploymentCreatedEvent evt, CancellationToken ct)
{
    _logger.LogDebug("Processing deployment {DeploymentId} for unknown escalation", evt.DeploymentId);

    foreach (var bomRef in evt.AffectedComponents)
    {
        var matches = await _repository.GetByBomRefAsync(bomRef, ct);

        foreach (var candidate in matches)
        {
            // Only Cold entries are raised; Warm and Hot are left as they are.
            if (GetBand(candidate.Score) != UnknownsBand.Cold)
            {
                continue;
            }

            await EscalateEntryAsync(
                candidate,
                UnknownsBand.Warm,
                $"New deployment {evt.DeploymentId} uses affected component",
                ct);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Reacts to a runtime observation: when the observation is
/// <see cref="RuntimeObservationType.ActiveExecution"/>, every entry with the
/// event's BOM reference that is currently in the Cold band is escalated to Warm.
/// Other observation types cause no changes.
/// </summary>
/// <param name="evt">The runtime event (BOM reference and observation type).</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
private async Task HandleRuntimeUpdatedAsync(RuntimeUpdatedEvent evt, CancellationToken ct)
{
    _logger.LogDebug("Processing runtime update for {BomRef}", evt.BomRef);

    // Only active execution triggers escalation; the check does not depend on
    // the entries, so skip the repository query entirely for other observations.
    if (evt.ObservationType != RuntimeObservationType.ActiveExecution)
    {
        return;
    }

    var entries = await _repository.GetByBomRefAsync(evt.BomRef, ct);

    foreach (var entry in entries)
    {
        // Escalate to at least WARM: only Cold entries need raising.
        if (GetBand(entry.Score) == UnknownsBand.Cold)
        {
            await EscalateEntryAsync(entry, UnknownsBand.Warm,
                "Runtime observation: active execution detected", ct);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Expires every entry the repository reports as expired at the current time:
/// sets its status to Expired, increments the expired counter and publishes an
/// <see cref="UnknownExpiredNotification"/>.
/// </summary>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
private async Task ProcessExpiredEntriesAsync(CancellationToken ct)
{
    var cutoff = _timeProvider.GetUtcNow();
    var stale = await _repository.GetExpiredAsync(cutoff, ct);

    foreach (var item in stale)
    {
        _logger.LogInformation("Expiring unknown {EntryId} (TTL exceeded)", item.Id);

        await _repository.UpdateStatusAsync(item.Id, GreyQueueStatus.Expired, ct);
        ExpiredTotal.Add(1);

        var notice = new UnknownExpiredNotification
        {
            EntryId = item.Id,
            BomRef = item.BomRef,
            ExpiredAt = cutoff
        };
        await _notificationPublisher.PublishAsync(notice, ct);
    }
}
|
||||
|
||||
/// <summary>
/// Escalates an entry to <paramref name="newBand"/>: raises its score to at
/// least the band's floor (0.70 for Hot, 0.40 for Warm; unchanged otherwise),
/// persists it, counts the escalation and publishes an
/// <see cref="UnknownEscalatedNotification"/>.
/// </summary>
/// <param name="entry">Entry to escalate.</param>
/// <param name="newBand">Band to escalate to.</param>
/// <param name="reason">Explanation stored with the score update and sent in the notification.</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
private async Task EscalateEntryAsync(
    GreyQueueEntry entry,
    UnknownsBand newBand,
    string reason,
    CancellationToken ct)
{
    var previousBand = GetBand(entry.Score);

    _logger.LogInformation(
        "Escalating unknown {EntryId} from {OldBand} to {NewBand}: {Reason}",
        entry.Id, previousBand, newBand, reason);

    // Never lower the score: take the larger of the current score and the band floor.
    double raisedScore;
    if (newBand == UnknownsBand.Hot)
    {
        raisedScore = Math.Max(entry.Score, 0.70);
    }
    else if (newBand == UnknownsBand.Warm)
    {
        raisedScore = Math.Max(entry.Score, 0.40);
    }
    else
    {
        raisedScore = entry.Score;
    }

    await _repository.UpdateScoreAsync(entry.Id, raisedScore, reason, ct);

    var fromTag = new KeyValuePair<string, object?>("from_band", previousBand.ToString());
    var toTag = new KeyValuePair<string, object?>("to_band", newBand.ToString());
    EscalatedTotal.Add(1, fromTag, toTag);

    var notice = new UnknownEscalatedNotification
    {
        EntryId = entry.Id,
        BomRef = entry.BomRef,
        OldBand = previousBand,
        NewBand = newBand,
        Reason = reason
    };
    await _notificationPublisher.PublishAsync(notice, ct);
}
|
||||
|
||||
/// <summary>
/// Attempts to demote an entry by one band (Hot to Warm, Warm to Cold).
/// Does nothing when the entry is not found, when any blocking factor is
/// present, or when the entry is already Cold. On demotion the score is set to
/// 0.50 (Warm) or 0.20 (Cold), the demotion is counted and an
/// <see cref="UnknownDemotedNotification"/> is published.
/// </summary>
/// <param name="entryId">Identifier of the entry to demote.</param>
/// <param name="ct">Cancellation token forwarded to repository and publisher calls.</param>
/// <remarks>
/// NOTE(review): <see cref="CheckBlockingFactorsAsync"/> reports a blocking factor
/// for any score of 0.7 or more, which is also the Hot band threshold, so the
/// Hot-to-Warm path appears unreachable as written — confirm the intent.
/// </remarks>
public async Task TryDemoteEntryAsync(
    Guid entryId,
    CancellationToken ct)
{
    var entry = await _repository.GetByIdAsync(entryId, ct);
    if (entry == null)
    {
        return;
    }

    // Any blocking factor vetoes the demotion.
    var blockers = await CheckBlockingFactorsAsync(entry, ct);
    if (blockers.Count > 0)
    {
        _logger.LogDebug("Cannot demote {EntryId}: blocking factors: {Factors}",
            entryId, string.Join(", ", blockers));
        return;
    }

    var currentBand = GetBand(entry.Score);
    if (currentBand == UnknownsBand.Cold)
    {
        // Already at lowest band
        return;
    }

    // One step down, together with the score that represents the new band.
    UnknownsBand targetBand;
    double targetScore;
    if (currentBand == UnknownsBand.Hot)
    {
        targetBand = UnknownsBand.Warm;
        targetScore = 0.50;
    }
    else if (currentBand == UnknownsBand.Warm)
    {
        targetBand = UnknownsBand.Cold;
        targetScore = 0.20;
    }
    else
    {
        targetBand = currentBand;
        targetScore = entry.Score;
    }

    _logger.LogInformation(
        "Demoting unknown {EntryId} from {OldBand} to {NewBand}",
        entryId, currentBand, targetBand);

    await _repository.UpdateScoreAsync(entry.Id, targetScore, "SLA met, no blocking factors", ct);
    DemotedTotal.Add(1);

    var notice = new UnknownDemotedNotification
    {
        EntryId = entry.Id,
        BomRef = entry.BomRef,
        OldBand = currentBand,
        NewBand = targetBand
    };
    await _notificationPublisher.PublishAsync(notice, ct);
}
|
||||
|
||||
/// <summary>
/// Collects the reasons that currently prevent an entry from being demoted.
/// </summary>
/// <param name="entry">Entry to inspect.</param>
/// <param name="ct">Cancellation token forwarded to the repository.</param>
/// <returns>
/// A list containing <c>"in_kev"</c> when the repository reports the entry as in KEV
/// and <c>"critical_epss"</c> when the entry's score is 0.7 or more; empty when
/// nothing blocks demotion.
/// </returns>
private async Task<List<string>> CheckBlockingFactorsAsync(GreyQueueEntry entry, CancellationToken ct)
{
    var blockers = new List<string>();

    // KEV membership blocks demotion.
    var inKev = await _repository.IsInKevAsync(entry.Id, ct);
    if (inKev)
    {
        blockers.Add("in_kev");
    }

    // A critical score (>= 0.7) blocks demotion.
    var isCritical = entry.Score >= 0.7;
    if (isCritical)
    {
        blockers.Add("critical_epss");
    }

    return blockers;
}
|
||||
|
||||
/// <summary>
/// Maps a score to its band: Hot at 0.70 and above, Warm at 0.40 and above,
/// Cold for everything else.
/// </summary>
/// <param name="score">Score to classify.</param>
/// <returns>The band the score falls into.</returns>
private static UnknownsBand GetBand(double score)
{
    if (score >= 0.70)
    {
        return UnknownsBand.Hot;
    }

    if (score >= 0.40)
    {
        return UnknownsBand.Warm;
    }

    return UnknownsBand.Cold;
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Settings for <see cref="UnknownsLifecycleService"/>.
/// </summary>
public sealed record UnknownsLifecycleOptions
{
    /// <summary>Name of the configuration section these options are read from.</summary>
    public const string SectionName = "Unknowns:Lifecycle";

    /// <summary>Pause between expiry passes. Default: 15 minutes.</summary>
    public TimeSpan ExpiryCheckInterval { get; init; } = new TimeSpan(0, 15, 0);

    /// <summary>
    /// Whether to auto-demote when blocking factors are removed. Default: true.
    /// NOTE(review): no read of this flag is visible in the lifecycle service — confirm where it is consumed.
    /// </summary>
    public bool AutoDemote { get; init; } = true;
}
|
||||
|
||||
#region Events
|
||||
|
||||
/// <summary>
/// Abstraction for registering asynchronous handlers for named events.
/// </summary>
public interface IEventSubscriber
{
    /// <summary>Registers <paramref name="handler"/> for events named <paramref name="eventType"/>.</summary>
    /// <typeparam name="T">Payload type passed to the handler.</typeparam>
    /// <param name="eventType">Event name, for example <c>"epss.updated"</c>.</param>
    /// <param name="handler">Callback receiving the payload and a cancellation token.</param>
    void Subscribe<T>(string eventType, Func<T, CancellationToken, Task> handler);
}
|
||||
|
||||
/// <summary>
/// Event raised when the EPSS score of a CVE changes.
/// </summary>
public sealed record EpssUpdatedEvent
{
    /// <summary>Identifier of the CVE whose score changed.</summary>
    public required string CveId { get; init; }

    /// <summary>Score before the update.</summary>
    public double OldScore { get; init; }

    /// <summary>Score after the update.</summary>
    public double NewScore { get; init; }
}
|
||||
|
||||
/// <summary>
/// Event raised when a CVE is added to KEV.
/// </summary>
public sealed record KevAddedEvent
{
    /// <summary>Identifier of the CVE that was added.</summary>
    public required string CveId { get; init; }

    /// <summary>Date the CVE was added.</summary>
    public DateTimeOffset AddedDate { get; init; }
}
|
||||
|
||||
/// <summary>
/// Event raised when a new deployment is created.
/// </summary>
public sealed record DeploymentCreatedEvent
{
    /// <summary>Identifier of the deployment.</summary>
    public required string DeploymentId { get; init; }

    /// <summary>Components touched by the deployment (used as BOM references for lookup); empty by default.</summary>
    public IReadOnlyList<string> AffectedComponents { get; init; } = Array.Empty<string>();
}
|
||||
|
||||
/// <summary>
/// Event raised when a runtime observation for a component is updated.
/// </summary>
public sealed record RuntimeUpdatedEvent
{
    /// <summary>BOM reference of the observed component.</summary>
    public required string BomRef { get; init; }

    /// <summary>Kind of runtime observation reported.</summary>
    public RuntimeObservationType ObservationType { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of runtime observation that can be reported for a component.
/// </summary>
public enum RuntimeObservationType
{
    /// <summary>Component is actively executing.</summary>
    ActiveExecution = 0,

    /// <summary>Component is loaded but not executing.</summary>
    Loaded = 1,

    /// <summary>Component is dormant.</summary>
    Dormant = 2
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Notifications
|
||||
|
||||
/// <summary>
/// Notification published when an unknown is escalated to a higher band.
/// </summary>
public sealed record UnknownEscalatedNotification
{
    /// <summary>Identifier of the escalated entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference copied from the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Band before the escalation.</summary>
    public UnknownsBand OldBand { get; init; }

    /// <summary>Band after the escalation.</summary>
    public UnknownsBand NewBand { get; init; }

    /// <summary>Explanation of why the entry was escalated, if any.</summary>
    public string? Reason { get; init; }
}
|
||||
|
||||
/// <summary>
/// Notification published when an unknown is demoted to a lower band.
/// </summary>
public sealed record UnknownDemotedNotification
{
    /// <summary>Identifier of the demoted entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference copied from the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Band before the demotion.</summary>
    public UnknownsBand OldBand { get; init; }

    /// <summary>Band after the demotion.</summary>
    public UnknownsBand NewBand { get; init; }
}
|
||||
|
||||
/// <summary>
/// Notification published when an unknown is marked as Expired.
/// </summary>
public sealed record UnknownExpiredNotification
{
    /// <summary>Identifier of the expired entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference copied from the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Time the entry was expired.</summary>
    public DateTimeOffset ExpiredAt { get; init; }
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Repository Extensions
|
||||
|
||||
/// <summary>
/// Placeholder for lifecycle-related repository extension methods.
/// Currently declares no members.
/// </summary>
public static class GreyQueueRepositoryExtensions
{
    // Intentionally empty: the operations are expected to be implemented in the actual repository.
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,330 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsMetricsService.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-007 - Add unknowns queue metrics and observability
|
||||
// Description: Prometheus metrics for unknowns queue operations
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Unknowns.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Service that collects and exposes Prometheus metrics for unknowns queue.
|
||||
/// </summary>
|
||||
public sealed class UnknownsMetricsService : BackgroundService
|
||||
{
|
||||
private readonly IGreyQueueRepository _repository;
|
||||
private readonly UnknownsMetricsOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<UnknownsMetricsService> _logger;
|
||||
|
||||
// Meter for all unknowns metrics
|
||||
private static readonly Meter UnknownsMeter = new("StellaOps.Unknowns", "1.0.0");
|
||||
|
||||
// Gauges
|
||||
private static readonly Gauge<int> QueueDepthHot = UnknownsMeter.CreateGauge<int>(
|
||||
"unknowns_queue_depth",
|
||||
"entries",
|
||||
"Number of unknowns in queue by band");
|
||||
|
||||
private static readonly Gauge<double> SlaComplianceRate = UnknownsMeter.CreateGauge<double>(
|
||||
"unknowns_sla_compliance_rate",
|
||||
"ratio",
|
||||
"Ratio of unknowns within SLA (0-1)");
|
||||
|
||||
// Histograms
|
||||
private static readonly Histogram<double> ProcessingTimeSeconds = UnknownsMeter.CreateHistogram<double>(
|
||||
"unknowns_processing_time_seconds",
|
||||
"seconds",
|
||||
"Time spent processing unknowns");
|
||||
|
||||
private static readonly Histogram<double> ResolutionTimeHours = UnknownsMeter.CreateHistogram<double>(
|
||||
"unknowns_resolution_time_hours",
|
||||
"hours",
|
||||
"Time from creation to resolution by band");
|
||||
|
||||
// Counters
|
||||
private static readonly Counter<long> StateTransitionsTotal = UnknownsMeter.CreateCounter<long>(
|
||||
"unknowns_state_transitions_total",
|
||||
"transitions",
|
||||
"Total state transitions by from_state and to_state");
|
||||
|
||||
// Activity source for distributed tracing
|
||||
private static readonly ActivitySource UnknownsActivitySource = new("StellaOps.Unknowns", "1.0.0");
|
||||
|
||||
// Band-specific gauges (tracked separately for Prometheus labels)
|
||||
private int _hotCount;
|
||||
private int _warmCount;
|
||||
private int _coldCount;
|
||||
private double _slaCompliance;
|
||||
|
||||
/// <summary>
/// Creates the metrics service and registers observable gauges that report the
/// most recently collected band counts and SLA compliance value.
/// </summary>
/// <param name="repository">Grey queue store; must not be null.</param>
/// <param name="options">Metrics settings; defaults are used when this or its value is null.</param>
/// <param name="timeProvider">Clock source; <see cref="TimeProvider.System"/> is used when null.</param>
/// <param name="logger">Logger; must not be null.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="repository"/> or <paramref name="logger"/> is null.
/// </exception>
/// <remarks>
/// NOTE(review): the gauges are created on the static meter from an instance
/// constructor and capture this instance's fields, so each constructed instance
/// registers another set of instruments — confirm the service is only ever
/// created once.
/// </remarks>
public UnknownsMetricsService(
    IGreyQueueRepository repository,
    IOptions<UnknownsMetricsOptions> options,
    TimeProvider timeProvider,
    ILogger<UnknownsMetricsService> logger)
{
    ArgumentNullException.ThrowIfNull(repository);
    _repository = repository;

    // Options and clock are optional: fall back to defaults instead of throwing.
    var configured = options?.Value;
    _options = configured is null ? new UnknownsMetricsOptions() : configured;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;

    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;

    // Observable gauges read the fields at observation time.
    UnknownsMeter.CreateObservableGauge(
        "unknowns_queue_depth_hot",
        () => _hotCount,
        "entries",
        "Number of HOT unknowns in queue");

    UnknownsMeter.CreateObservableGauge(
        "unknowns_queue_depth_warm",
        () => _warmCount,
        "entries",
        "Number of WARM unknowns in queue");

    UnknownsMeter.CreateObservableGauge(
        "unknowns_queue_depth_cold",
        () => _coldCount,
        "entries",
        "Number of COLD unknowns in queue");

    UnknownsMeter.CreateObservableGauge(
        "unknowns_sla_compliance",
        () => _slaCompliance,
        "ratio",
        "SLA compliance rate (0-1)");
}
|
||||
|
||||
/// <summary>
/// Runs the collection loop: collect metrics, wait for the configured interval,
/// repeat until <paramref name="stoppingToken"/> is cancelled.
/// </summary>
/// <param name="stoppingToken">Signals that the host is shutting down.</param>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    var interval = _options.CollectionInterval;
    _logger.LogInformation("Unknowns Metrics Service starting with interval {Interval}", interval);

    while (true)
    {
        if (stoppingToken.IsCancellationRequested)
        {
            return;
        }

        try
        {
            await CollectMetricsAsync(stoppingToken);
        }
        catch (OperationCanceledException)
        {
            // Cancellation is not a collection failure; let it propagate.
            throw;
        }
        catch (Exception failure)
        {
            // A failed pass is logged and the loop carries on with the next interval.
            _logger.LogError(failure, "Error collecting unknowns metrics");
        }

        await Task.Delay(interval, stoppingToken);
    }
}
|
||||
|
||||
/// <summary>
/// Loads the pending entries, counts them per band, computes the share that is
/// still within its SLA, and publishes the results to the gauge backing fields.
/// </summary>
/// <param name="ct">Cancellation token passed to the repository call.</param>
private async Task CollectMetricsAsync(CancellationToken ct)
{
    using var activity = UnknownsActivitySource.StartActivity("CollectMetrics");

    var now = _timeProvider.GetUtcNow();
    var entries = await _repository.GetPendingAsync(ct);

    // Tally into locals first. The observable gauges read the fields from another
    // thread at arbitrary times; resetting and incrementing the fields in place
    // would let a scrape observe zero or a partially counted value mid-pass.
    var hot = 0;
    var warm = 0;
    var cold = 0;
    var withinSla = 0;
    var totalPending = 0;

    foreach (var entry in entries)
    {
        var band = GetBand(entry.Score);

        switch (band)
        {
            case UnknownsBand.Hot:
                hot++;
                break;
            case UnknownsBand.Warm:
                warm++;
                break;
            case UnknownsBand.Cold:
                cold++;
                break;
        }

        totalPending++;

        // An entry is compliant while its age is strictly below the band's SLA limit.
        if (now - entry.CreatedAt < GetSlaLimit(band))
        {
            withinSla++;
        }
    }

    // An empty queue counts as fully compliant.
    var compliance = totalPending > 0 ? (double)withinSla / totalPending : 1.0;

    // Publish the finished snapshot; each field is written exactly once per pass.
    _hotCount = hot;
    _warmCount = warm;
    _coldCount = cold;
    _slaCompliance = compliance;

    _logger.LogDebug(
        "Metrics collected: HOT={Hot}, WARM={Warm}, COLD={Cold}, SLA={Sla:P1}",
        hot, warm, cold, compliance);

    activity?.SetTag("hot_count", hot);
    activity?.SetTag("warm_count", warm);
    activity?.SetTag("cold_count", cold);
    activity?.SetTag("sla_compliance", compliance);
}
|
||||
|
||||
/// <summary>
/// Adds one to the state-transition counter, tagged with the lower-cased
/// names of the source and target states.
/// </summary>
/// <param name="fromState">State the entry left.</param>
/// <param name="toState">State the entry entered.</param>
public static void RecordStateTransition(GreyQueueStatus fromState, GreyQueueStatus toState)
{
    var fromTag = new KeyValuePair<string, object?>("from_state", fromState.ToString().ToLowerInvariant());
    var toTag = new KeyValuePair<string, object?>("to_state", toState.ToString().ToLowerInvariant());

    StateTransitionsTotal.Add(1, fromTag, toTag);
}
|
||||
|
||||
/// <summary>
/// Records <paramref name="duration"/>, expressed in seconds, on the processing-time instrument.
/// </summary>
/// <param name="duration">Time spent processing an entry.</param>
public static void RecordProcessingTime(TimeSpan duration)
    => ProcessingTimeSeconds.Record(duration.TotalSeconds);
|
||||
|
||||
/// <summary>
/// Records <paramref name="duration"/>, expressed in hours, on the resolution-time
/// instrument, tagged with the lower-cased band name.
/// </summary>
/// <param name="duration">Time from creation to resolution.</param>
/// <param name="band">Band of the resolved entry.</param>
public static void RecordResolutionTime(TimeSpan duration, UnknownsBand band)
{
    var bandTag = new KeyValuePair<string, object?>("band", band.ToString().ToLowerInvariant());
    ResolutionTimeHours.Record(duration.TotalHours, bandTag);
}
|
||||
|
||||
/// <summary>
/// Starts a tracing activity on the service's activity source.
/// </summary>
/// <param name="name">Operation name of the activity.</param>
/// <param name="kind">Activity kind; defaults to <see cref="ActivityKind.Internal"/>.</param>
/// <returns>Whatever the activity source returns for the request, which may be null.</returns>
public static Activity? StartActivity(string name, ActivityKind kind = ActivityKind.Internal)
    => UnknownsActivitySource.StartActivity(name, kind);
|
||||
|
||||
// Maps a score to its band: 0.70 and above is Hot, 0.40 and above is Warm,
// anything else (including NaN, which fails both comparisons) is Cold.
private static UnknownsBand GetBand(double score)
{
    if (score >= 0.70)
    {
        return UnknownsBand.Hot;
    }

    if (score >= 0.40)
    {
        return UnknownsBand.Warm;
    }

    return UnknownsBand.Cold;
}
|
||||
|
||||
// Returns the fixed SLA window for a band: 24 hours for Hot, 7 days for Warm,
// and 30 days for Cold or any other value.
private static TimeSpan GetSlaLimit(UnknownsBand band)
{
    if (band == UnknownsBand.Hot)
    {
        return TimeSpan.FromHours(24);
    }

    if (band == UnknownsBand.Warm)
    {
        return TimeSpan.FromDays(7);
    }

    return TimeSpan.FromDays(30);
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Options for the unknowns metrics collection service.
/// </summary>
public sealed record UnknownsMetricsOptions
{
    /// <summary>Name of the configuration section these options are read from.</summary>
    public const string SectionName = "Unknowns:Metrics";

    /// <summary>Delay between two collection passes (default: one minute).</summary>
    public TimeSpan CollectionInterval { get; init; } = TimeSpan.FromMinutes(1);

    /// <summary>Whether detailed per-entry metrics are requested (default: off).</summary>
    public bool DetailedMetrics { get; init; }
}
|
||||
|
||||
/// <summary>
/// Source-generated, strongly typed log methods for unknowns operations.
/// </summary>
public static partial class UnknownsLogging
{
    /// <summary>Logs at Information level that an unknown was created.</summary>
    [LoggerMessage(
        Level = LogLevel.Information,
        Message = "Unknown {UnknownId} created for {BomRef} with score {Score:F2} (band: {Band})")]
    public static partial void LogUnknownCreated(this ILogger logger, Guid unknownId, string bomRef, double score, string band);

    /// <summary>Logs at Information level that an unknown moved between states.</summary>
    [LoggerMessage(
        Level = LogLevel.Information,
        Message = "Unknown {UnknownId} transitioned from {FromState} to {ToState}")]
    public static partial void LogStateTransition(this ILogger logger, Guid unknownId, string fromState, string toState);

    /// <summary>Logs at Warning level that an unknown is approaching its SLA limit.</summary>
    [LoggerMessage(
        Level = LogLevel.Warning,
        Message = "Unknown {UnknownId} SLA warning: {PercentElapsed:P1} elapsed, {RemainingHours:F1}h remaining")]
    public static partial void LogSlaWarning(this ILogger logger, Guid unknownId, double percentElapsed, double remainingHours);

    /// <summary>Logs at Error level that an unknown has exceeded its SLA.</summary>
    [LoggerMessage(
        Level = LogLevel.Error,
        Message = "Unknown {UnknownId} SLA breach: overdue by {OverdueHours:F1}h")]
    public static partial void LogSlaBreach(this ILogger logger, Guid unknownId, double overdueHours);

    /// <summary>Logs at Information level that an unknown was resolved.</summary>
    [LoggerMessage(
        Level = LogLevel.Information,
        Message = "Unknown {UnknownId} resolved via {Resolution} in {ResolutionHours:F1}h")]
    public static partial void LogResolution(this ILogger logger, Guid unknownId, string resolution, double resolutionHours);

    /// <summary>Logs at Information level the outcome of a gate check.</summary>
    [LoggerMessage(
        Level = LogLevel.Information,
        Message = "Gate check for {BomRef}: {Decision} ({Reason})")]
    public static partial void LogGateCheck(this ILogger logger, string bomRef, string decision, string? reason);
}
|
||||
|
||||
/// <summary>
/// Helpers that carry trace context across process boundaries through a
/// string dictionary such as message headers.
/// </summary>
public static class TraceContextPropagation
{
    private const string TraceParentHeader = "traceparent";
    private const string TraceStateHeader = "tracestate";

    /// <summary>
    /// Extracts trace context from a dictionary (e.g., message headers).
    /// </summary>
    /// <param name="headers">Headers to read <c>traceparent</c> and <c>tracestate</c> from.</param>
    /// <returns>
    /// The parsed context, or null when <c>traceparent</c> is missing or cannot be parsed.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="headers"/> is null.</exception>
    public static ActivityContext? ExtractContext(IDictionary<string, string> headers)
    {
        ArgumentNullException.ThrowIfNull(headers);

        if (!headers.TryGetValue(TraceParentHeader, out var traceparent))
        {
            return null;
        }

        // InjectContext writes tracestate next to traceparent, so read it back here;
        // otherwise the trace state is silently dropped on every hop.
        headers.TryGetValue(TraceStateHeader, out var tracestate);
        var traceState = string.IsNullOrEmpty(tracestate) ? null : tracestate;

        return ActivityContext.TryParse(traceparent, traceState, out var context)
            ? (ActivityContext?)context
            : null;
    }

    /// <summary>
    /// Injects trace context into a dictionary (e.g., message headers).
    /// </summary>
    /// <param name="activity">Activity whose context is written; nothing is written when null.</param>
    /// <param name="headers">Headers that receive <c>traceparent</c> and, when present, <c>tracestate</c>.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="headers"/> is null while <paramref name="activity"/> is not.
    /// </exception>
    public static void InjectContext(Activity? activity, IDictionary<string, string> headers)
    {
        if (activity is null)
        {
            return;
        }

        ArgumentNullException.ThrowIfNull(headers);

        // version - trace id - span id - flags, with only the "recorded" flag carried over.
        var flags = activity.Recorded ? "01" : "00";
        headers[TraceParentHeader] = $"00-{activity.TraceId}-{activity.SpanId}-{flags}";

        if (!string.IsNullOrEmpty(activity.TraceStateString))
        {
            headers[TraceStateHeader] = activity.TraceStateString;
        }
    }
}
|
||||
@@ -0,0 +1,136 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsSlaHealthCheck.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-001 - Health check endpoint reflects SLA status
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Diagnostics.HealthChecks;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Unknowns.Services;
|
||||
|
||||
/// <summary>
/// Health check that reports SLA compliance status for pending unknowns:
/// unhealthy when any entry has breached its SLA, degraded when any entry has
/// passed the warning threshold, healthy otherwise.
/// </summary>
public sealed class UnknownsSlaHealthCheck : IHealthCheck
{
    private readonly IGreyQueueRepository _repository;
    private readonly UnknownsSlaOptions _options;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates the health check.
    /// </summary>
    /// <param name="repository">Repository queried for pending entries; must not be null.</param>
    /// <param name="options">SLA options; defaults are used when null.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="repository"/> is null.</exception>
    public UnknownsSlaHealthCheck(
        IGreyQueueRepository repository,
        IOptions<UnknownsSlaOptions> options,
        TimeProvider? timeProvider = null)
    {
        _repository = repository ?? throw new ArgumentNullException(nameof(repository));
        _options = options?.Value ?? new UnknownsSlaOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
    }

    /// <summary>
    /// Classifies every pending entry by band and by SLA state and returns the
    /// aggregated health result together with the counts as diagnostic data.
    /// </summary>
    /// <param name="context">Health check context (not used by this check).</param>
    /// <param name="cancellationToken">Cancellation token passed to the repository call.</param>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        var now = _timeProvider.GetUtcNow();
        var entries = await _repository.GetPendingAsync(cancellationToken);

        var breachedCount = 0;
        var warningCount = 0;
        var healthyCount = 0;

        var hotCount = 0;
        var warmCount = 0;
        var coldCount = 0;

        foreach (var entry in entries)
        {
            var band = SlaCalculator.GetBand(entry.Score);
            var percentElapsed = SlaCalculator.CalculatePercentElapsed(entry, now, _options);

            switch (band)
            {
                case UnknownsBand.Hot:
                    hotCount++;
                    break;
                case UnknownsBand.Warm:
                    warmCount++;
                    break;
                case UnknownsBand.Cold:
                    coldCount++;
                    break;
            }

            if (percentElapsed >= 1.0)
            {
                breachedCount++;
            }
            else if (percentElapsed >= _options.WarningThreshold)
            {
                warningCount++;
            }
            else
            {
                healthyCount++;
            }
        }

        var total = entries.Count;

        // An entry in the warning zone has not breached yet, so it still counts as
        // compliant. This matches the unknowns_sla_compliance gauge, which treats
        // every entry whose age is below its SLA limit as within SLA.
        var withinSlaCount = total - breachedCount;

        var data = new Dictionary<string, object>
        {
            ["total_pending"] = total,
            ["hot_count"] = hotCount,
            ["warm_count"] = warmCount,
            ["cold_count"] = coldCount,
            ["breached_count"] = breachedCount,
            ["warning_count"] = warningCount,
            ["healthy_count"] = healthyCount,
            ["sla_compliance_rate"] = total > 0 ? (double)withinSlaCount / total : 1.0,
            ["checked_at"] = now.ToString("O")
        };

        // Unhealthy if any breaches exist
        if (breachedCount > 0)
        {
            return HealthCheckResult.Unhealthy(
                $"{breachedCount} unknown(s) have breached SLA",
                data: data);
        }

        // Degraded if warnings exist
        if (warningCount > 0)
        {
            return HealthCheckResult.Degraded(
                $"{warningCount} unknown(s) approaching SLA breach",
                data: data);
        }

        return HealthCheckResult.Healthy(
            $"All {entries.Count} unknown(s) within SLA",
            data: data);
    }
}
|
||||
|
||||
/// <summary>
/// Registration helpers for <see cref="UnknownsSlaHealthCheck"/>.
/// </summary>
public static class UnknownsSlaHealthCheckExtensions
{
    /// <summary>
    /// Registers the unknowns SLA health check on the builder.
    /// </summary>
    /// <param name="builder">Health checks builder to add the registration to.</param>
    /// <param name="name">Registration name (default: <c>unknowns-sla</c>).</param>
    /// <param name="failureStatus">Status passed to the registration as its failure status.</param>
    /// <param name="tags">Optional tags attached to the registration.</param>
    /// <returns>The value returned by <c>builder.Add</c>.</returns>
    public static IHealthChecksBuilder AddUnknownsSlaCheck(
        this IHealthChecksBuilder builder,
        string name = "unknowns-sla",
        HealthStatus? failureStatus = null,
        IEnumerable<string>? tags = null)
    {
        // Resolved lazily from the container; the clock is optional and may be absent.
        IHealthCheck CreateCheck(IServiceProvider services)
        {
            var repository = services.GetRequiredService<IGreyQueueRepository>();
            var slaOptions = services.GetRequiredService<IOptions<UnknownsSlaOptions>>();
            var clock = services.GetService<TimeProvider>();
            return new UnknownsSlaHealthCheck(repository, slaOptions, clock);
        }

        var registration = new HealthCheckRegistration(name, CreateCheck, failureStatus, tags);
        return builder.Add(registration);
    }
}
|
||||
@@ -0,0 +1,276 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsSlaMonitorService.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-001 - Implement SLA monitoring background service
|
||||
// Description: Background service for unknowns SLA monitoring and alerting
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Unknowns.Services;
|
||||
|
||||
/// <summary>
/// Background service that periodically scans pending unknowns, records SLA
/// metrics, and publishes a notification when an entry first crosses the
/// warning threshold or first breaches its SLA.
/// </summary>
public sealed class UnknownsSlaMonitorService : BackgroundService
{
    private readonly IGreyQueueRepository _repository;
    private readonly INotificationPublisher _notificationPublisher;
    private readonly UnknownsSlaOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<UnknownsSlaMonitorService> _logger;
    private readonly UnknownsMetrics _metrics;

    // Entries already handled in their current SLA state. Without these the breach
    // counter would grow, and a notification would be re-sent, on every polling
    // pass for as long as an entry stays pending. Only the ExecuteAsync loop
    // touches them, so no synchronization is needed.
    private readonly HashSet<Guid> _warnedEntryIds = new();
    private readonly HashSet<Guid> _breachCountedEntryIds = new();
    private readonly HashSet<Guid> _breachNotifiedEntryIds = new();

    /// <summary>
    /// Creates a new SLA monitor service.
    /// </summary>
    /// <param name="repository">Repository queried for pending entries; must not be null.</param>
    /// <param name="notificationPublisher">Publisher for warning and breach notifications; must not be null.</param>
    /// <param name="options">SLA options; must not be null and must carry a value.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when null.</param>
    /// <param name="metrics">Metrics sink; must not be null.</param>
    /// <param name="logger">Logger; must not be null.</param>
    /// <exception cref="ArgumentNullException">A required argument is null.</exception>
    public UnknownsSlaMonitorService(
        IGreyQueueRepository repository,
        INotificationPublisher notificationPublisher,
        IOptions<UnknownsSlaOptions> options,
        TimeProvider timeProvider,
        UnknownsMetrics metrics,
        ILogger<UnknownsSlaMonitorService> logger)
    {
        _repository = repository ?? throw new ArgumentNullException(nameof(repository));
        _notificationPublisher = notificationPublisher ?? throw new ArgumentNullException(nameof(notificationPublisher));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Unknowns SLA Monitor starting with interval {Interval}",
            _options.PollingInterval);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await CheckSlaBoundsAsync(stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // A failed pass is logged; the next pass retries anything not yet recorded.
                _logger.LogError(ex, "Error checking SLA bounds");
            }

            await Task.Delay(_options.PollingInterval, stoppingToken);
        }
    }

    /// <summary>
    /// Runs one monitoring pass over all pending entries.
    /// </summary>
    private async Task CheckSlaBoundsAsync(CancellationToken ct)
    {
        var now = _timeProvider.GetUtcNow();
        var pendingUnknowns = await _repository.GetPendingAsync(ct);

        var bandCounts = new Dictionary<UnknownsBand, int>
        {
            [UnknownsBand.Hot] = 0,
            [UnknownsBand.Warm] = 0,
            [UnknownsBand.Cold] = 0
        };

        var pendingIds = new HashSet<Guid>();

        foreach (var entry in pendingUnknowns)
        {
            pendingIds.Add(entry.Id);

            var band = GetBand(entry.Score);
            bandCounts[band]++;

            var slaLimit = GetSlaLimit(band);
            var elapsed = now - entry.CreatedAt;
            var remaining = slaLimit - elapsed;
            var percentElapsed = elapsed / slaLimit;

            // Update metrics
            _metrics.RecordSlaRemaining(entry.Id, remaining);

            if (percentElapsed >= 1.0)
            {
                // Breach (100% of the SLA elapsed).
                _warnedEntryIds.Remove(entry.Id);

                // Count each breach once, not once per polling pass.
                if (_breachCountedEntryIds.Add(entry.Id))
                {
                    _metrics.IncrementSlaBreaches(band);
                }

                if (!_breachNotifiedEntryIds.Contains(entry.Id))
                {
                    await _notificationPublisher.PublishAsync(new SlaBreachNotification
                    {
                        EntryId = entry.Id,
                        BomRef = entry.BomRef,
                        Band = band,
                        OverdueBy = elapsed - slaLimit
                    }, ct);

                    // Recorded only after a successful publish so a failure is retried.
                    _breachNotifiedEntryIds.Add(entry.Id);
                }
            }
            else if (percentElapsed >= _options.WarningThreshold)
            {
                // Warning zone. The threshold comes from configuration so it
                // agrees with the SLA health check rather than a fixed 0.80.
                _breachCountedEntryIds.Remove(entry.Id);
                _breachNotifiedEntryIds.Remove(entry.Id);

                if (!_warnedEntryIds.Contains(entry.Id))
                {
                    await _notificationPublisher.PublishAsync(new SlaWarningNotification
                    {
                        EntryId = entry.Id,
                        BomRef = entry.BomRef,
                        Band = band,
                        PercentElapsed = percentElapsed * 100,
                        RemainingTime = remaining
                    }, ct);

                    _warnedEntryIds.Add(entry.Id);
                }
            }
            else
            {
                // Back below the warning threshold: forget earlier state so a
                // later warning or breach is reported again.
                _warnedEntryIds.Remove(entry.Id);
                _breachCountedEntryIds.Remove(entry.Id);
                _breachNotifiedEntryIds.Remove(entry.Id);
            }
        }

        // Drop bookkeeping for entries that are no longer pending.
        _warnedEntryIds.IntersectWith(pendingIds);
        _breachCountedEntryIds.IntersectWith(pendingIds);
        _breachNotifiedEntryIds.IntersectWith(pendingIds);

        // Update band gauges
        foreach (var (band, count) in bandCounts)
        {
            _metrics.SetBandCount(band, count);
        }
    }

    // 0.70 and above is Hot, 0.40 and above is Warm, everything else is Cold.
    private static UnknownsBand GetBand(double score)
    {
        return score switch
        {
            >= 0.70 => UnknownsBand.Hot,
            >= 0.40 => UnknownsBand.Warm,
            _ => UnknownsBand.Cold
        };
    }

    // Configured SLA window for the band; unknown values fall back to the Cold window.
    private TimeSpan GetSlaLimit(UnknownsBand band)
    {
        return band switch
        {
            UnknownsBand.Hot => _options.HotSla,
            UnknownsBand.Warm => _options.WarmSla,
            UnknownsBand.Cold => _options.ColdSla,
            _ => _options.ColdSla
        };
    }
}
|
||||
|
||||
/// <summary>
/// SLA settings for unknowns: polling cadence, per-band SLA windows, and the warning threshold.
/// </summary>
public sealed record UnknownsSlaOptions
{
    /// <summary>Name of the configuration section these options are read from.</summary>
    public const string SectionName = "Unknowns:Sla";

    /// <summary>Delay between two SLA checks (default: 5 minutes).</summary>
    public TimeSpan PollingInterval { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>SLA window for the HOT band (default: 24 hours).</summary>
    public TimeSpan HotSla { get; init; } = TimeSpan.FromHours(24);

    /// <summary>SLA window for the WARM band (default: 7 days).</summary>
    public TimeSpan WarmSla { get; init; } = TimeSpan.FromDays(7);

    /// <summary>SLA window for the COLD band (default: 30 days).</summary>
    public TimeSpan ColdSla { get; init; } = TimeSpan.FromDays(30);

    /// <summary>
    /// Fraction of the SLA window (0-1, default 0.80) after which an entry is
    /// considered to be approaching a breach.
    /// </summary>
    public double WarningThreshold { get; init; } = 0.80;
}
|
||||
|
||||
/// <summary>
/// Priority band of an unknown, derived from its score.
/// </summary>
public enum UnknownsBand
{
    /// <summary>High priority (score &gt;= 0.70).</summary>
    Hot = 0,

    /// <summary>Medium priority (score &gt;= 0.40 and &lt; 0.70).</summary>
    Warm = 1,

    /// <summary>Low priority (score &lt; 0.40).</summary>
    Cold = 2
}
|
||||
|
||||
/// <summary>
/// Notification published when an unknown approaches its SLA limit.
/// </summary>
public sealed record SlaWarningNotification
{
    /// <summary>Identifier of the affected entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference of the affected entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Band the entry belongs to.</summary>
    public UnknownsBand Band { get; init; }

    /// <summary>Share of the SLA window that has elapsed.</summary>
    public double PercentElapsed { get; init; }

    /// <summary>Time left before the SLA is breached.</summary>
    public TimeSpan RemainingTime { get; init; }
}
|
||||
|
||||
/// <summary>
/// Notification published when an unknown has exceeded its SLA.
/// </summary>
public sealed record SlaBreachNotification
{
    /// <summary>Identifier of the affected entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference of the affected entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Band the entry belongs to.</summary>
    public UnknownsBand Band { get; init; }

    /// <summary>Amount of time by which the SLA has been exceeded.</summary>
    public TimeSpan OverdueBy { get; init; }
}
|
||||
|
||||
// Interface placeholders
|
||||
|
||||
/// <summary>
/// Read access to grey queue entries.
/// </summary>
public interface IGreyQueueRepository
{
    /// <summary>Returns the entries that are currently pending.</summary>
    /// <param name="ct">Cancellation token for the query.</param>
    Task<IReadOnlyList<GreyQueueEntry>> GetPendingAsync(CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// An entry in the grey queue.
/// </summary>
public sealed record GreyQueueEntry
{
    /// <summary>Identifier of the entry.</summary>
    public required Guid Id { get; init; }

    /// <summary>BOM reference the entry relates to.</summary>
    public required string BomRef { get; init; }

    /// <summary>Priority score of the entry.</summary>
    public double Score { get; init; }

    /// <summary>Timestamp at which the entry was created.</summary>
    public DateTimeOffset CreatedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Publishes notifications of arbitrary payload type.
/// </summary>
public interface INotificationPublisher
{
    /// <summary>Publishes <paramref name="notification"/>.</summary>
    /// <typeparam name="T">Type of the notification payload.</typeparam>
    /// <param name="notification">Payload to publish.</param>
    /// <param name="ct">Cancellation token for the publish operation.</param>
    Task PublishAsync<T>(T notification, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Metrics sink for unknowns SLA monitoring. Every method is currently an empty placeholder.
/// </summary>
public sealed class UnknownsMetrics
{
    /// <summary>Records the SLA time remaining for an entry (currently a no-op).</summary>
    public void RecordSlaRemaining(Guid entryId, TimeSpan remaining)
    {
    }

    /// <summary>Increments the SLA breach counter for a band (currently a no-op).</summary>
    public void IncrementSlaBreaches(UnknownsBand band)
    {
    }

    /// <summary>Sets the entry-count gauge for a band (currently a no-op).</summary>
    public void SetBandCount(UnknownsBand band, int count)
    {
    }
}
|
||||
@@ -99,9 +99,188 @@ public static class GreyQueueEndpoints
|
||||
.WithSummary("Get grey queue summary statistics")
|
||||
.WithDescription("Returns summary counts by status, reason, and performance metrics.");
|
||||
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - New state transitions
|
||||
group.MapPost("/{id:guid}/assign", AssignForReview)
|
||||
.WithName("AssignGreyQueueEntry")
|
||||
.WithSummary("Assign entry for review")
|
||||
.WithDescription("Assigns an entry to a reviewer, transitioning to UnderReview state.");
|
||||
|
||||
group.MapPost("/{id:guid}/escalate", EscalateEntry)
|
||||
.WithName("EscalateGreyQueueEntry")
|
||||
.WithSummary("Escalate entry to security team")
|
||||
.WithDescription("Escalates an entry to the security team, transitioning to Escalated state.");
|
||||
|
||||
group.MapPost("/{id:guid}/reject", RejectEntry)
|
||||
.WithName("RejectGreyQueueEntry")
|
||||
.WithSummary("Reject a grey queue entry")
|
||||
.WithDescription("Marks an entry as rejected (invalid or not actionable).");
|
||||
|
||||
group.MapPost("/{id:guid}/reopen", ReopenEntry)
|
||||
.WithName("ReopenGreyQueueEntry")
|
||||
.WithSummary("Reopen a closed entry")
|
||||
.WithDescription("Reopens a rejected, failed, or dismissed entry back to pending.");
|
||||
|
||||
group.MapGet("/{id:guid}/transitions", GetValidTransitions)
|
||||
.WithName("GetValidTransitions")
|
||||
.WithSummary("Get valid state transitions")
|
||||
.WithDescription("Returns the valid next states for an entry based on current state.");
|
||||
|
||||
return routes;
|
||||
}
|
||||
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - Assign for review
// Looks the entry up, validates the move to UnderReview for the requested
// assignee, performs the transition, and returns the updated entry.
// Responds 404 when the entry is missing and 400 when validation rejects the move.
private static async Task<Results<Ok<GreyQueueEntryDto>, NotFound, BadRequest<string>>> AssignForReview(
    Guid id,
    [FromHeader(Name = "X-Tenant-Id")] string tenantId,
    [FromBody] AssignForReviewRequest request,
    IGreyQueueRepository repository = null!,
    INotificationPublisher? notificationPublisher = null,
    CancellationToken ct = default)
{
    var current = await repository.GetByIdAsync(tenantId, id, ct);
    if (current is null)
    {
        return TypedResults.NotFound();
    }

    var assignee = request.Assignee;

    try
    {
        GreyQueueStateMachine.ValidateUnderReviewTransition(current.Status, assignee);
    }
    catch (InvalidOperationException invalid)
    {
        return TypedResults.BadRequest(invalid.Message);
    }

    var reviewed = await repository.TransitionToUnderReviewAsync(tenantId, id, assignee, ct);
    return TypedResults.Ok(MapToDto(reviewed));
}
|
||||
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - Escalate to security team
// Looks the entry up, validates the move to Escalated, performs the transition,
// publishes an escalation notification when a publisher is available, and
// returns the updated entry. Responds 404 when the entry is missing and 400
// when validation rejects the move.
private static async Task<Results<Ok<GreyQueueEntryDto>, NotFound, BadRequest<string>>> EscalateEntry(
    Guid id,
    [FromHeader(Name = "X-Tenant-Id")] string tenantId,
    [FromBody] EscalateRequest request,
    IGreyQueueRepository repository = null!,
    INotificationPublisher? notificationPublisher = null,
    CancellationToken ct = default)
{
    var current = await repository.GetByIdAsync(tenantId, id, ct);
    if (current is null)
    {
        return TypedResults.NotFound();
    }

    try
    {
        GreyQueueStateMachine.ValidateTransition(current.Status, GreyQueueStatus.Escalated);
    }
    catch (InvalidOperationException invalid)
    {
        return TypedResults.BadRequest(invalid.Message);
    }

    var escalated = await repository.TransitionToEscalatedAsync(tenantId, id, request.Reason, ct);

    // The publisher is optional; without one the escalation is still persisted.
    if (notificationPublisher is not null)
    {
        var notice = new EscalationNotification
        {
            EntryId = id,
            BomRef = current.BomRef,
            Reason = request.Reason,
            EscalatedAt = DateTimeOffset.UtcNow
        };

        await notificationPublisher.PublishAsync(notice, ct);
    }

    return TypedResults.Ok(MapToDto(escalated));
}
|
||||
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - Reject entry
// Looks the entry up, validates the move to Rejected, performs the transition,
// and returns the updated entry. Responds 404 when the entry is missing and
// 400 when validation rejects the move.
private static async Task<Results<Ok<GreyQueueEntryDto>, NotFound, BadRequest<string>>> RejectEntry(
    Guid id,
    [FromHeader(Name = "X-Tenant-Id")] string tenantId,
    [FromBody] RejectRequest request,
    IGreyQueueRepository repository = null!,
    CancellationToken ct = default)
{
    var current = await repository.GetByIdAsync(tenantId, id, ct);
    if (current is null)
    {
        return TypedResults.NotFound();
    }

    try
    {
        GreyQueueStateMachine.ValidateTransition(current.Status, GreyQueueStatus.Rejected);
    }
    catch (InvalidOperationException invalid)
    {
        return TypedResults.BadRequest(invalid.Message);
    }

    var rejected = await repository.TransitionToRejectedAsync(
        tenantId,
        id,
        request.Reason,
        request.RejectedBy,
        ct);

    return TypedResults.Ok(MapToDto(rejected));
}
|
||||
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - Reopen entry
// Looks the entry up, validates the move back to Pending, reopens it, and
// returns the updated entry. Responds 404 when the entry is missing and 400
// when validation rejects the move.
private static async Task<Results<Ok<GreyQueueEntryDto>, NotFound, BadRequest<string>>> ReopenEntry(
    Guid id,
    [FromHeader(Name = "X-Tenant-Id")] string tenantId,
    [FromBody] ReopenRequest request,
    IGreyQueueRepository repository = null!,
    CancellationToken ct = default)
{
    var current = await repository.GetByIdAsync(tenantId, id, ct);
    if (current is null)
    {
        return TypedResults.NotFound();
    }

    try
    {
        GreyQueueStateMachine.ValidateTransition(current.Status, GreyQueueStatus.Pending);
    }
    catch (InvalidOperationException invalid)
    {
        return TypedResults.BadRequest(invalid.Message);
    }

    var reopened = await repository.ReopenAsync(tenantId, id, request.Reason, ct);
    return TypedResults.Ok(MapToDto(reopened));
}
|
||||
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - Get valid transitions
// Returns the entry's current state together with the states the state machine
// allows next, both as strings. Responds 404 when the entry is missing.
private static async Task<Results<Ok<ValidTransitionsResponse>, NotFound>> GetValidTransitions(
    Guid id,
    [FromHeader(Name = "X-Tenant-Id")] string tenantId,
    IGreyQueueRepository repository = null!,
    CancellationToken ct = default)
{
    var current = await repository.GetByIdAsync(tenantId, id, ct);
    if (current is null)
    {
        return TypedResults.NotFound();
    }

    var nextStates = GreyQueueStateMachine.GetValidNextStates(current.Status);

    return TypedResults.Ok(new ValidTransitionsResponse
    {
        CurrentState = current.Status.ToString(),
        ValidNextStates = nextStates.Select(state => state.ToString()).ToList()
    });
}
|
||||
|
||||
// List entries with pagination
|
||||
private static async Task<Ok<GreyQueueListResponse>> ListEntries(
|
||||
[FromHeader(Name = "X-Tenant-Id")] string tenantId,
|
||||
@@ -580,3 +759,57 @@ public sealed record ExpireResultResponse
|
||||
{
|
||||
public required int ExpiredCount { get; init; }
|
||||
}
|
||||
// Sprint: SPRINT_20260118_018 (UQ-005) - New DTOs for state transitions
|
||||
|
||||
/// <summary>Request body for assigning an entry to a reviewer.</summary>
public sealed record AssignForReviewRequest
{
    /// <summary>Reviewer the entry is assigned to (required).</summary>
    public required string Assignee { get; init; }

    /// <summary>Free-form notes for the reviewer (optional).</summary>
    public string? Notes { get; init; }
}
|
||||
|
||||
/// <summary>Request body for escalating an entry.</summary>
public sealed record EscalateRequest
{
    /// <summary>Why the entry is being escalated (required).</summary>
    public required string Reason { get; init; }
}
|
||||
|
||||
/// <summary>Request body for rejecting an entry.</summary>
public sealed record RejectRequest
{
    /// <summary>Why the entry is being rejected (required).</summary>
    public required string Reason { get; init; }

    /// <summary>Identity of whoever rejected the entry (required).</summary>
    public required string RejectedBy { get; init; }
}
|
||||
|
||||
/// <summary>Request body for reopening a closed entry.</summary>
public sealed record ReopenRequest
{
    /// <summary>Why the entry is being reopened (required).</summary>
    public required string Reason { get; init; }
}
|
||||
|
||||
/// <summary>Response listing the states an entry may move to next.</summary>
public sealed record ValidTransitionsResponse
{
    /// <summary>Name of the entry's current state.</summary>
    public required string CurrentState { get; init; }

    /// <summary>Names of the states reachable from the current state.</summary>
    public required List<string> ValidNextStates { get; init; }
}
|
||||
|
||||
/// <summary>Notification published when an entry is escalated.</summary>
public sealed record EscalationNotification
{
    /// <summary>Identifier of the escalated entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference of the escalated entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Reason given for the escalation.</summary>
    public required string Reason { get; init; }

    /// <summary>Timestamp of the escalation.</summary>
    public DateTimeOffset EscalatedAt { get; init; }
}
|
||||
|
||||
/// <summary>Publishes notifications of arbitrary payload type.</summary>
public interface INotificationPublisher
{
    /// <summary>Publishes <paramref name="notification"/>.</summary>
    /// <typeparam name="T">Type of the notification payload.</typeparam>
    /// <param name="notification">Payload to publish.</param>
    /// <param name="ct">Cancellation token for the publish operation.</param>
    Task PublishAsync<T>(T notification, CancellationToken ct = default);
}
|
||||
@@ -148,9 +148,57 @@ public sealed record GreyQueueEntry
|
||||
IsPending &&
|
||||
!IsExhausted &&
|
||||
(NextProcessingAt is null || NextProcessingAt <= DateTimeOffset.UtcNow);
|
||||
|
||||
/// <summary>
/// Reviewer the entry is assigned to while under review; null when unassigned.
/// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-005)
/// </summary>
public string? Assignee { get; init; }

/// <summary>
/// Timestamp at which the entry was assigned for review; null when never assigned.
/// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-005)
/// </summary>
public DateTimeOffset? AssignedAt { get; init; }

/// <summary>
/// Timestamp at which the entry was escalated to the security team; null when never escalated.
/// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-005)
/// </summary>
public DateTimeOffset? EscalatedAt { get; init; }

/// <summary>
/// Reason recorded for the escalation; null when none was recorded.
/// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-005)
/// </summary>
public string? EscalationReason { get; init; }
|
||||
|
||||
/// <summary>
/// SLA window implied by the score: 24 hours at 0.70 and above (HOT),
/// 7 days at 0.40 and above (WARM), 30 days otherwise (COLD).
/// </summary>
public TimeSpan EffectiveSla
{
    get
    {
        if (Score >= 0.70)
        {
            return TimeSpan.FromHours(24);
        }

        if (Score >= 0.40)
        {
            return TimeSpan.FromDays(7);
        }

        return TimeSpan.FromDays(30);
    }
}
|
||||
|
||||
/// <summary>
/// Band name implied by the score: "hot" at 0.70 and above, "warm" at 0.40
/// and above, "cold" otherwise.
/// </summary>
public string Band
{
    get
    {
        if (Score >= 0.70)
        {
            return "hot";
        }

        if (Score >= 0.40)
        {
            return "warm";
        }

        return "cold";
    }
}
|
||||
}
|
||||
|
||||
/// <summary>Status of a grey queue entry.</summary>
|
||||
/// <remarks>
|
||||
/// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-005)
|
||||
/// Extended state machine with UnderReview, Escalated, and Rejected states.
|
||||
/// </remarks>
|
||||
public enum GreyQueueStatus
|
||||
{
|
||||
/// <summary>Pending initial processing.</summary>
|
||||
@@ -162,9 +210,27 @@ public enum GreyQueueStatus
|
||||
/// <summary>Waiting for retry after failed attempt.</summary>
|
||||
Retrying,
|
||||
|
||||
/// <summary>
|
||||
/// Under review by assigned reviewer.
|
||||
/// Sprint: UQ-005 - Requires assignee.
|
||||
/// </summary>
|
||||
UnderReview,
|
||||
|
||||
/// <summary>
|
||||
/// Escalated to security team.
|
||||
/// Sprint: UQ-005 - Triggers notification to security team.
|
||||
/// </summary>
|
||||
Escalated,
|
||||
|
||||
/// <summary>Successfully resolved - evidence now sufficient.</summary>
|
||||
Resolved,
|
||||
|
||||
/// <summary>
|
||||
/// Rejected - determined to be invalid or not actionable.
|
||||
/// Sprint: UQ-005 - Subset of Failed state.
|
||||
/// </summary>
|
||||
Rejected,
|
||||
|
||||
/// <summary>Failed after exhausting retries.</summary>
|
||||
Failed,
|
||||
|
||||
@@ -175,6 +241,78 @@ public enum GreyQueueStatus
|
||||
Dismissed
|
||||
}
|
||||
|
||||
/// <summary>
/// Transition rules for <see cref="GreyQueueStatus"/>: decides which status may follow which.
/// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement (UQ-005)
/// </summary>
public static class GreyQueueStateMachine
{
    /// <summary>
    /// Allowed target statuses keyed by source status. An empty set means no
    /// outgoing transition is permitted from that status.
    /// </summary>
    private static readonly Dictionary<GreyQueueStatus, HashSet<GreyQueueStatus>> ValidTransitions = new()
    {
        {
            GreyQueueStatus.Pending,
            new HashSet<GreyQueueStatus>
            {
                GreyQueueStatus.Processing,
                GreyQueueStatus.UnderReview,
                GreyQueueStatus.Expired,
                GreyQueueStatus.Dismissed,
            }
        },
        {
            GreyQueueStatus.Processing,
            new HashSet<GreyQueueStatus>
            {
                GreyQueueStatus.Retrying,
                GreyQueueStatus.UnderReview,
                GreyQueueStatus.Resolved,
                GreyQueueStatus.Failed,
            }
        },
        {
            GreyQueueStatus.Retrying,
            new HashSet<GreyQueueStatus>
            {
                GreyQueueStatus.Processing,
                GreyQueueStatus.Failed,
                GreyQueueStatus.Expired,
            }
        },
        {
            GreyQueueStatus.UnderReview,
            new HashSet<GreyQueueStatus>
            {
                GreyQueueStatus.Escalated,
                GreyQueueStatus.Resolved,
                GreyQueueStatus.Rejected,
                GreyQueueStatus.Pending,
            }
        },
        {
            GreyQueueStatus.Escalated,
            new HashSet<GreyQueueStatus>
            {
                GreyQueueStatus.Resolved,
                GreyQueueStatus.Rejected,
                GreyQueueStatus.UnderReview,
            }
        },

        // Terminal: nothing may follow.
        { GreyQueueStatus.Resolved, new HashSet<GreyQueueStatus>() },

        // Can be reopened.
        { GreyQueueStatus.Rejected, new HashSet<GreyQueueStatus> { GreyQueueStatus.Pending } },

        // Can be reset.
        { GreyQueueStatus.Failed, new HashSet<GreyQueueStatus> { GreyQueueStatus.Pending } },

        // Terminal: nothing may follow.
        { GreyQueueStatus.Expired, new HashSet<GreyQueueStatus>() },

        // Can be reopened.
        { GreyQueueStatus.Dismissed, new HashSet<GreyQueueStatus> { GreyQueueStatus.Pending } },
    };

    /// <summary>
    /// Reports whether moving from <paramref name="from"/> to <paramref name="to"/> is allowed.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the table lists <paramref name="to"/> under <paramref name="from"/>;
    /// <c>false</c> otherwise, including when <paramref name="from"/> has no table entry.
    /// </returns>
    public static bool CanTransition(GreyQueueStatus from, GreyQueueStatus to)
        => ValidTransitions.TryGetValue(from, out var allowed) && allowed.Contains(to);

    /// <summary>
    /// Guard form of <see cref="CanTransition"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">The transition is not allowed.</exception>
    public static void ValidateTransition(GreyQueueStatus from, GreyQueueStatus to)
    {
        if (CanTransition(from, to))
        {
            return;
        }

        throw new InvalidOperationException($"Invalid state transition from {from} to {to}");
    }

    /// <summary>
    /// Returns the statuses reachable in one step from <paramref name="current"/>.
    /// </summary>
    /// <returns>The table's set for the status, or a new empty set when the status has no entry.</returns>
    public static IReadOnlySet<GreyQueueStatus> GetValidNextStates(GreyQueueStatus current)
    {
        if (ValidTransitions.TryGetValue(current, out var reachable))
        {
            return reachable;
        }

        return new HashSet<GreyQueueStatus>();
    }

    /// <summary>
    /// Validates a move into <see cref="GreyQueueStatus.UnderReview"/>, which additionally
    /// needs a non-blank assignee. The transition itself is checked first.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The transition is not allowed, or <paramref name="assignee"/> is null, empty or whitespace.
    /// </exception>
    public static void ValidateUnderReviewTransition(GreyQueueStatus from, string? assignee)
    {
        ValidateTransition(from, GreyQueueStatus.UnderReview);

        if (!string.IsNullOrWhiteSpace(assignee))
        {
            return;
        }

        throw new InvalidOperationException("Transition to UnderReview requires an assignee");
    }
}
|
||||
|
||||
/// <summary>Reason why an entry is in the grey queue.</summary>
|
||||
public enum GreyQueueReason
|
||||
{
|
||||
|
||||
@@ -0,0 +1,347 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// GreyQueueWatchdogServiceTests.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-004 - Unit tests for watchdog service
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using NSubstitute;
|
||||
using StellaOps.Unknowns.Services;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Unknowns.Core.Tests.Services;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="GreyQueueWatchdogService"/>. The repository is an NSubstitute
/// mock, time comes from a fake clock fixed at 2026-01-19 12:00 UTC, and the options use a
/// 1 hour alert threshold, a 4 hour timeout, 5 max attempts and a 15 minute base retry delay.
/// </summary>
public sealed class GreyQueueWatchdogServiceTests
{
    private readonly IGreyQueueRepository _repository;
    private readonly TestNotificationPublisher _notificationPublisher;
    private readonly FakeTimeProvider _timeProvider;
    private readonly GreyQueueWatchdogOptions _options;

    public GreyQueueWatchdogServiceTests()
    {
        _repository = Substitute.For<IGreyQueueRepository>();
        _notificationPublisher = new TestNotificationPublisher();
        _timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 19, 12, 0, 0, TimeSpan.Zero));
        _options = new GreyQueueWatchdogOptions
        {
            CheckInterval = TimeSpan.FromMilliseconds(100),
            ProcessingAlertThreshold = TimeSpan.FromHours(1),
            ProcessingTimeout = TimeSpan.FromHours(4),
            MaxAttempts = 5,
            BaseRetryDelayMinutes = 15
        };
    }

    #region Stuck Detection Tests

    [Fact]
    public async Task Check_HealthyEntry_NoAlert()
    {
        // Arrange - processing for 30 min, below the 1h alert threshold
        var entry = CreateProcessingEntry(_timeProvider.GetUtcNow().AddMinutes(-30));
        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();

        // Act
        await TriggerCheck(service);

        // Assert - No alerts
        Assert.Empty(_notificationPublisher.StuckAlerts);
    }

    [Fact]
    public async Task Check_StuckEntry_GeneratesAlert()
    {
        // Arrange - Processing for 90 min (past 1h threshold)
        var entry = CreateProcessingEntry(_timeProvider.GetUtcNow().AddMinutes(-90));
        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();

        // Act
        await TriggerCheck(service);

        // Assert
        Assert.Single(_notificationPublisher.StuckAlerts);
        Assert.Equal(entry.Id, _notificationPublisher.StuckAlerts[0].EntryId);
    }

    [Fact]
    public async Task Check_TimedOutEntry_ForcesRetry()
    {
        // Arrange - Processing for 5 hours (past 4h timeout), attempts < max
        var entry = CreateProcessingEntry(_timeProvider.GetUtcNow().AddHours(-5), attempts: 2);
        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();

        // Act
        await TriggerCheck(service);

        // Assert
        await _repository.Received(1).ForceRetryAsync(
            entry.Id,
            Arg.Any<DateTimeOffset>(),
            Arg.Any<CancellationToken>());
    }

    [Fact]
    public async Task Check_ExhaustedEntry_MarksFailed()
    {
        // Arrange - Processing for 5 hours, max attempts reached
        var entry = CreateProcessingEntry(_timeProvider.GetUtcNow().AddHours(-5), attempts: 5);
        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();

        // Act
        await TriggerCheck(service);

        // Assert
        await _repository.Received(1).UpdateStatusAsync(
            entry.Id,
            GreyQueueStatus.Failed,
            Arg.Any<CancellationToken>());
        Assert.Single(_notificationPublisher.FailedNotifications);
    }

    #endregion

    #region Exponential Backoff Tests

    [Theory]
    [InlineData(0, 15)] // Attempt 0: 15 min
    [InlineData(1, 30)] // Attempt 1: 30 min
    [InlineData(2, 60)] // Attempt 2: 60 min
    [InlineData(3, 120)] // Attempt 3: 120 min
    [InlineData(4, 240)] // Attempt 4: 240 min
    public async Task ForceRetry_UsesExponentialBackoff(int attempts, int expectedMinutes)
    {
        // Arrange
        var entry = CreateProcessingEntry(_timeProvider.GetUtcNow().AddHours(-5), attempts: attempts);
        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        // Capture the next-processing timestamp handed to the repository.
        DateTimeOffset capturedNextProcessingAt = default;
        _repository.ForceRetryAsync(
            Arg.Any<Guid>(),
            Arg.Do<DateTimeOffset>(dt => capturedNextProcessingAt = dt),
            Arg.Any<CancellationToken>())
            .Returns(Task.CompletedTask);

        var service = CreateService();

        // Act
        await TriggerCheck(service);

        // Assert - delay is measured against the fake clock, which does not advance here
        var expectedDelay = TimeSpan.FromMinutes(expectedMinutes);
        var actualDelay = capturedNextProcessingAt - _timeProvider.GetUtcNow();
        Assert.Equal(expectedDelay.TotalMinutes, actualDelay.TotalMinutes, precision: 1);
    }

    #endregion

    #region Multiple Entries Tests

    [Fact]
    public async Task Check_MultipleEntries_ProcessesAll()
    {
        // Arrange
        var healthy = CreateProcessingEntry(_timeProvider.GetUtcNow().AddMinutes(-30));
        var stuck = CreateProcessingEntry(_timeProvider.GetUtcNow().AddMinutes(-90));
        var timedOut = CreateProcessingEntry(_timeProvider.GetUtcNow().AddHours(-5), attempts: 2);

        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([healthy, stuck, timedOut]));

        var service = CreateService();

        // Act
        await TriggerCheck(service);

        // Assert
        Assert.Single(_notificationPublisher.StuckAlerts); // stuck only
        await _repository.Received(1).ForceRetryAsync(
            timedOut.Id,
            Arg.Any<DateTimeOffset>(),
            Arg.Any<CancellationToken>());
    }

    #endregion

    #region Manual Retry Tests

    [Fact]
    public async Task ManualRetry_ProcessingEntry_Succeeds()
    {
        // Arrange
        var entry = CreateProcessingEntry(_timeProvider.GetUtcNow().AddHours(-2));
        _repository.GetByIdAsync(entry.Id, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<GreyQueueEntry?>(entry));

        var service = CreateService();

        // Act
        await service.ManualRetryAsync(entry.Id, CancellationToken.None);

        // Assert
        await _repository.Received(1).ForceRetryAsync(
            entry.Id,
            Arg.Any<DateTimeOffset>(),
            Arg.Any<CancellationToken>());
    }

    [Fact]
    public async Task ManualRetry_NonProcessingEntry_Throws()
    {
        // Arrange - entry is Pending, not Processing
        var entry = new GreyQueueEntry
        {
            Id = Guid.NewGuid(),
            BomRef = "pkg:test@1.0.0",
            Status = GreyQueueStatus.Pending,
            CreatedAt = _timeProvider.GetUtcNow()
        };
        _repository.GetByIdAsync(entry.Id, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<GreyQueueEntry?>(entry));

        var service = CreateService();

        // Act & Assert
        await Assert.ThrowsAsync<InvalidOperationException>(
            () => service.ManualRetryAsync(entry.Id, CancellationToken.None));
    }

    [Fact]
    public async Task ManualRetry_NotFound_Throws()
    {
        // Arrange - repository returns no entry for any id
        _repository.GetByIdAsync(Arg.Any<Guid>(), Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<GreyQueueEntry?>(null));

        var service = CreateService();

        // Act & Assert
        await Assert.ThrowsAsync<InvalidOperationException>(
            () => service.ManualRetryAsync(Guid.NewGuid(), CancellationToken.None));
    }

    #endregion

    #region Stats Tests

    [Fact]
    public async Task GetStats_ReturnsCorrectCounts()
    {
        // Arrange
        var healthy = CreateProcessingEntry(_timeProvider.GetUtcNow().AddMinutes(-30));
        var stuck = CreateProcessingEntry(_timeProvider.GetUtcNow().AddMinutes(-90));
        var timedOut = CreateProcessingEntry(_timeProvider.GetUtcNow().AddHours(-5));

        _repository.GetByStatusAsync(GreyQueueStatus.Processing, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([healthy, stuck, timedOut]));

        var service = CreateService();

        // Act
        var stats = await service.GetStatsAsync();

        // Assert
        Assert.Equal(3, stats.TotalProcessing);
        Assert.Equal(1, stats.StuckCount); // 90 min entry
        Assert.Equal(1, stats.TimedOutCount); // 5 hour entry
    }

    #endregion

    #region Helpers

    /// <summary>
    /// Builds the service under test from the shared mock repository, recording publisher,
    /// options and fake clock.
    /// </summary>
    private GreyQueueWatchdogService CreateService()
    {
        return new GreyQueueWatchdogService(
            _repository,
            _notificationPublisher,
            Options.Create(_options),
            _timeProvider,
            new NullLogger<GreyQueueWatchdogService>());
    }

    /// <summary>
    /// Runs one watchdog pass by invoking the service's private
    /// <c>CheckProcessingEntriesAsync</c> method through reflection.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The private method cannot be found or does not return a <see cref="Task"/>.
    /// </exception>
    private async Task TriggerCheck(GreyQueueWatchdogService service)
    {
        // Fail loudly when the reflection target is missing (e.g. after a rename):
        // silently skipping the call would let "nothing happened" tests pass vacuously.
        var method = typeof(GreyQueueWatchdogService)
            .GetMethod("CheckProcessingEntriesAsync", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)
            ?? throw new InvalidOperationException(
                "GreyQueueWatchdogService.CheckProcessingEntriesAsync was not found via reflection.");

        var pending = method.Invoke(service, [CancellationToken.None]) as Task
            ?? throw new InvalidOperationException(
                "GreyQueueWatchdogService.CheckProcessingEntriesAsync did not return a Task.");

        await pending;
    }

    /// <summary>
    /// Creates an entry in <see cref="GreyQueueStatus.Processing"/> with a random id and
    /// bom-ref, score 0.50, and a creation time one hour before
    /// <paramref name="lastProcessedAt"/>.
    /// </summary>
    private GreyQueueEntry CreateProcessingEntry(DateTimeOffset lastProcessedAt, int attempts = 0)
    {
        return new GreyQueueEntry
        {
            Id = Guid.NewGuid(),
            BomRef = $"pkg:npm/test-{Guid.NewGuid():N}@1.0.0",
            Status = GreyQueueStatus.Processing,
            Score = 0.50,
            CreatedAt = lastProcessedAt.AddHours(-1),
            LastProcessedAt = lastProcessedAt,
            ProcessingAttempts = attempts,
            MaxAttempts = _options.MaxAttempts
        };
    }

    #endregion
}
|
||||
|
||||
/// <summary>
/// Watchdog-related part of the test notification publisher: records stuck, failed and
/// forced-retry notifications next to the SLA warnings and breaches.
/// </summary>
/// <remarks>
/// NOTE(review): <c>Warnings</c> and <c>Breaches</c> are not declared in this part, and the
/// <c>new</c> modifiers below only make sense if another declaration supplies same-named
/// inherited members - confirm against the other part of this partial class.
/// </remarks>
public partial class TestNotificationPublisher
{
    public List<StuckProcessingAlert> StuckAlerts { get; } = new();

    public List<EntryFailedNotification> FailedNotifications { get; } = new();

    public List<ForcedRetryNotification> RetryNotifications { get; } = new();

    /// <summary>
    /// Stores <paramref name="notification"/> in the list matching its runtime type.
    /// Notifications of any other type are ignored. Always completes synchronously.
    /// </summary>
    public new Task PublishAsync<T>(T notification, CancellationToken ct = default)
    {
        if (notification is SlaWarningNotification slaWarning)
        {
            Warnings.Add(slaWarning);
        }
        else if (notification is SlaBreachNotification slaBreach)
        {
            Breaches.Add(slaBreach);
        }
        else if (notification is StuckProcessingAlert stuckAlert)
        {
            StuckAlerts.Add(stuckAlert);
        }
        else if (notification is EntryFailedNotification failure)
        {
            FailedNotifications.Add(failure);
        }
        else if (notification is ForcedRetryNotification forcedRetry)
        {
            RetryNotifications.Add(forcedRetry);
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Empties every recorded-notification list.
    /// </summary>
    public new void Clear()
    {
        Warnings.Clear();
        Breaches.Clear();
        StuckAlerts.Clear();
        FailedNotifications.Clear();
        RetryNotifications.Clear();
    }
}
|
||||
@@ -0,0 +1,367 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsLifecycleServiceIntegrationTests.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-002 - Integration tests for lifecycle service
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using NSubstitute;
|
||||
using StellaOps.Unknowns.Services;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Unknowns.Core.Tests.Services;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="UnknownsLifecycleService"/> driven through a mock repository, an
/// in-memory event subscriber and a fake clock fixed at 2026-01-19 12:00 UTC.
/// Band comments in the tests follow the score thresholds asserted here:
/// HOT at 0.70 and above, WARM at 0.40 and above, COLD below.
/// </summary>
public sealed class UnknownsLifecycleServiceIntegrationTests
{
    private readonly IGreyQueueRepository _repository;
    private readonly TestEventSubscriber _eventSubscriber;
    private readonly TestNotificationPublisher _notificationPublisher;
    private readonly FakeTimeProvider _timeProvider;
    private readonly UnknownsLifecycleOptions _options;

    public UnknownsLifecycleServiceIntegrationTests()
    {
        _repository = Substitute.For<IGreyQueueRepository>();
        _eventSubscriber = new TestEventSubscriber();
        _notificationPublisher = new TestNotificationPublisher();
        _timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 19, 12, 0, 0, TimeSpan.Zero));
        _options = new UnknownsLifecycleOptions
        {
            ExpiryCheckInterval = TimeSpan.FromMilliseconds(100)
        };
    }

    #region EPSS Update Tests

    [Fact]
    public async Task EpssUpdated_ScoreIncreases_EscalatesEntry()
    {
        // Arrange
        var entry = CreateEntry(0.50); // WARM
        _repository.GetByCveAsync("CVE-2026-1234", Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();
        await StartServiceAsync(service);

        // Act - Simulate EPSS update that would escalate to HOT
        await _eventSubscriber.PublishAsync(new EpssUpdatedEvent
        {
            CveId = "CVE-2026-1234",
            OldScore = 0.50,
            NewScore = 0.85
        });

        // Assert
        await _repository.Received(1).UpdateScoreAsync(
            entry.Id,
            Arg.Is<double>(s => s >= 0.70),
            Arg.Any<string>(),
            Arg.Any<CancellationToken>());
    }

    [Fact]
    public async Task EpssUpdated_ScoreDecreases_DoesNotDemote()
    {
        // Arrange - Demotion requires explicit call, not auto
        var entry = CreateEntry(0.75); // HOT
        _repository.GetByCveAsync("CVE-2026-1234", Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();
        await StartServiceAsync(service);

        // Act - EPSS decreases
        await _eventSubscriber.PublishAsync(new EpssUpdatedEvent
        {
            CveId = "CVE-2026-1234",
            OldScore = 0.75,
            NewScore = 0.30
        });

        // Assert - Should NOT auto-demote
        await _repository.DidNotReceive().UpdateScoreAsync(
            entry.Id,
            Arg.Is<double>(s => s < 0.70),
            Arg.Any<string>(),
            Arg.Any<CancellationToken>());
    }

    #endregion

    #region KEV Added Tests

    [Fact]
    public async Task KevAdded_EscalatesToHot()
    {
        // Arrange
        var entry = CreateEntry(0.30); // COLD
        _repository.GetByCveAsync("CVE-2026-5678", Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();
        await StartServiceAsync(service);

        // Act
        await _eventSubscriber.PublishAsync(new KevAddedEvent
        {
            CveId = "CVE-2026-5678",
            AddedDate = _timeProvider.GetUtcNow()
        });

        // Assert - Should escalate to HOT
        await _repository.Received(1).UpdateScoreAsync(
            entry.Id,
            Arg.Is<double>(s => s >= 0.70),
            Arg.Is<string>(r => r.Contains("KEV")),
            Arg.Any<CancellationToken>());
    }

    [Fact]
    public async Task KevAdded_AlreadyHot_NoChange()
    {
        // Arrange
        var entry = CreateEntry(0.85); // Already HOT
        _repository.GetByCveAsync("CVE-2026-5678", Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();
        await StartServiceAsync(service);

        // Act
        await _eventSubscriber.PublishAsync(new KevAddedEvent
        {
            CveId = "CVE-2026-5678",
            AddedDate = _timeProvider.GetUtcNow()
        });

        // Assert - Should NOT update (already HOT)
        await _repository.DidNotReceive().UpdateScoreAsync(
            Arg.Any<Guid>(),
            Arg.Any<double>(),
            Arg.Any<string>(),
            Arg.Any<CancellationToken>());
    }

    #endregion

    #region Deployment Created Tests

    [Fact]
    public async Task DeploymentCreated_ColdEntry_EscalatesToWarm()
    {
        // Arrange
        var entry = CreateEntry(0.20, "pkg:npm/vulnerable@1.0.0"); // COLD
        _repository.GetByBomRefAsync("pkg:npm/vulnerable@1.0.0", Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();
        await StartServiceAsync(service);

        // Act
        await _eventSubscriber.PublishAsync(new DeploymentCreatedEvent
        {
            DeploymentId = "deploy-123",
            AffectedComponents = ["pkg:npm/vulnerable@1.0.0"]
        });

        // Assert - Should escalate COLD to WARM
        await _repository.Received(1).UpdateScoreAsync(
            entry.Id,
            Arg.Is<double>(s => s >= 0.40),
            Arg.Is<string>(r => r.Contains("deployment")),
            Arg.Any<CancellationToken>());
    }

    [Fact]
    public async Task DeploymentCreated_WarmEntry_NoChange()
    {
        // Arrange
        var entry = CreateEntry(0.55, "pkg:npm/vulnerable@1.0.0"); // WARM
        _repository.GetByBomRefAsync("pkg:npm/vulnerable@1.0.0", Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([entry]));

        var service = CreateService();
        await StartServiceAsync(service);

        // Act
        await _eventSubscriber.PublishAsync(new DeploymentCreatedEvent
        {
            DeploymentId = "deploy-123",
            AffectedComponents = ["pkg:npm/vulnerable@1.0.0"]
        });

        // Assert - Already WARM, no change
        await _repository.DidNotReceive().UpdateScoreAsync(
            Arg.Any<Guid>(),
            Arg.Any<double>(),
            Arg.Any<string>(),
            Arg.Any<CancellationToken>());
    }

    #endregion

    #region Expiry Tests

    [Fact]
    public async Task ExpiredEntries_AreMarkedExpired()
    {
        // Arrange
        var expiredEntry = CreateEntry(0.20);
        _repository.GetExpiredAsync(Arg.Any<DateTimeOffset>(), Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<IReadOnlyList<GreyQueueEntry>>([expiredEntry]));

        var service = CreateService();

        // Act - Trigger expiry check
        await TriggerExpiryCheck(service);

        // Assert
        await _repository.Received(1).UpdateStatusAsync(
            expiredEntry.Id,
            GreyQueueStatus.Expired,
            Arg.Any<CancellationToken>());
    }

    #endregion

    #region Demotion with Blocking Factors Tests

    [Fact]
    public async Task TryDemote_WithKevBlockingFactor_DoesNotDemote()
    {
        // Arrange
        var entry = CreateEntry(0.75); // HOT
        _repository.GetByIdAsync(entry.Id, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<GreyQueueEntry?>(entry));
        _repository.IsInKevAsync(entry.Id, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult(true)); // In KEV!

        var service = CreateService();

        // Act
        await service.TryDemoteEntryAsync(entry.Id, CancellationToken.None);

        // Assert - Should NOT demote
        await _repository.DidNotReceive().UpdateScoreAsync(
            Arg.Any<Guid>(),
            Arg.Any<double>(),
            Arg.Any<string>(),
            Arg.Any<CancellationToken>());
    }

    [Fact]
    public async Task TryDemote_WithoutBlockingFactors_Demotes()
    {
        // Arrange
        var entry = CreateEntry(0.75); // HOT
        _repository.GetByIdAsync(entry.Id, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult<GreyQueueEntry?>(entry));
        _repository.IsInKevAsync(entry.Id, Arg.Any<CancellationToken>())
            .Returns(Task.FromResult(false));

        var service = CreateService();

        // Act
        await service.TryDemoteEntryAsync(entry.Id, CancellationToken.None);

        // Assert - Should demote to WARM
        await _repository.Received(1).UpdateScoreAsync(
            entry.Id,
            Arg.Is<double>(s => s >= 0.40 && s < 0.70),
            Arg.Any<string>(),
            Arg.Any<CancellationToken>());
    }

    #endregion

    #region Helpers

    /// <summary>
    /// Builds the service under test from the shared mock repository, event subscriber,
    /// recording publisher, options and fake clock.
    /// </summary>
    private UnknownsLifecycleService CreateService()
    {
        return new UnknownsLifecycleService(
            _repository,
            _eventSubscriber,
            _notificationPublisher,
            Options.Create(_options),
            _timeProvider,
            new NullLogger<UnknownsLifecycleService>());
    }

    /// <summary>
    /// Starts the service, waits briefly, then cancels the start token. Cancellation
    /// surfacing from the start task is swallowed.
    /// </summary>
    private static async Task StartServiceAsync(UnknownsLifecycleService service)
    {
        // Start the service (which registers event handlers).
        // The token source is disposed when this helper returns so it does not leak per test.
        using var cts = new CancellationTokenSource();
        var task = service.StartAsync(cts.Token);
        await Task.Delay(50); // Give it time to register handlers
        cts.Cancel();
        try { await task; } catch (OperationCanceledException) { }
    }

    /// <summary>
    /// Runs one expiry pass by invoking the service's private
    /// <c>ProcessExpiredEntriesAsync</c> method through reflection.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The private method cannot be found or does not return a <see cref="Task"/>.
    /// </exception>
    private async Task TriggerExpiryCheck(UnknownsLifecycleService service)
    {
        // Fail loudly when the reflection target is missing (e.g. after a rename)
        // instead of silently skipping the call.
        var method = typeof(UnknownsLifecycleService)
            .GetMethod("ProcessExpiredEntriesAsync", System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)
            ?? throw new InvalidOperationException(
                "UnknownsLifecycleService.ProcessExpiredEntriesAsync was not found via reflection.");

        var pending = method.Invoke(service, [CancellationToken.None]) as Task
            ?? throw new InvalidOperationException(
                "UnknownsLifecycleService.ProcessExpiredEntriesAsync did not return a Task.");

        await pending;
    }

    /// <summary>
    /// Creates an entry with a random id, the given score, and either the supplied bom-ref
    /// or a random one. The creation time is read from the fake clock so entries are
    /// deterministic and consistent with the time the service sees.
    /// </summary>
    private GreyQueueEntry CreateEntry(double score, string? bomRef = null)
    {
        return new GreyQueueEntry
        {
            Id = Guid.NewGuid(),
            BomRef = bomRef ?? $"pkg:npm/test-{Guid.NewGuid():N}@1.0.0",
            Score = score,
            CreatedAt = _timeProvider.GetUtcNow()
        };
    }

    #endregion
}
|
||||
|
||||
/// <summary>
/// In-memory <see cref="IEventSubscriber"/> for tests: handlers are stored per event-type
/// key and can be triggered on demand with <see cref="PublishAsync{T}"/>.
/// </summary>
public sealed class TestEventSubscriber : IEventSubscriber
{
    private readonly Dictionary<string, List<Delegate>> _handlers = new();

    /// <summary>
    /// Registers <paramref name="handler"/> under <paramref name="eventType"/>.
    /// </summary>
    public void Subscribe<T>(string eventType, Func<T, CancellationToken, Task> handler)
    {
        if (!_handlers.TryGetValue(eventType, out var registered))
        {
            registered = new List<Delegate>();
            _handlers[eventType] = registered;
        }

        registered.Add(handler);
    }

    /// <summary>
    /// Invokes, one after another, every handler registered for the key derived from
    /// <typeparamref name="T"/>. The key is the type name with "Event" removed and
    /// lower-cased, translated to a dotted form for the four known events. Handlers whose
    /// delegate type does not accept <typeparamref name="T"/> are skipped.
    /// </summary>
    public async Task PublishAsync<T>(T evt)
    {
        var shortName = typeof(T).Name.Replace("Event", "").ToLowerInvariant();

        string topic;
        switch (shortName)
        {
            case "epssupdated":
                topic = "epss.updated";
                break;
            case "kevadded":
                topic = "kev.added";
                break;
            case "deploymentcreated":
                topic = "deployment.created";
                break;
            case "runtimeupdated":
                topic = "runtime.updated";
                break;
            default:
                topic = shortName;
                break;
        }

        if (!_handlers.TryGetValue(topic, out var subscribers))
        {
            return;
        }

        foreach (var candidate in subscribers)
        {
            if (candidate is not Func<T, CancellationToken, Task> typed)
            {
                continue;
            }

            await typed(evt, CancellationToken.None);
        }
    }
}
|
||||
@@ -0,0 +1,269 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsSlaMonitorIntegrationTests.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-001 - Integration tests with test clock
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using NSubstitute;
|
||||
using StellaOps.Unknowns.Services;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Unknowns.Core.Tests.Services;
|
||||
|
||||
public sealed class UnknownsSlaMonitorIntegrationTests : IAsyncDisposable
|
||||
{
|
||||
private readonly FakeTimeProvider _timeProvider;
|
||||
private readonly IGreyQueueRepository _repository;
|
||||
private readonly TestNotificationPublisher _notificationPublisher;
|
||||
private readonly UnknownsMetrics _metrics;
|
||||
private readonly UnknownsSlaOptions _options;
|
||||
private readonly CancellationTokenSource _cts;
|
||||
|
||||
/// <summary>
/// Sets up the shared fixtures: a fake clock starting at 2026-01-19 12:00 UTC, a mock
/// repository, a recording publisher, metrics, SLA options and a cancellation source.
/// </summary>
public UnknownsSlaMonitorIntegrationTests()
{
    var clockStart = new DateTimeOffset(2026, 1, 19, 12, 0, 0, TimeSpan.Zero);
    _timeProvider = new FakeTimeProvider(clockStart);

    _repository = Substitute.For<IGreyQueueRepository>();
    _notificationPublisher = new TestNotificationPublisher();
    _metrics = new UnknownsMetrics();

    var slaOptions = new UnknownsSlaOptions
    {
        PollingInterval = TimeSpan.FromMilliseconds(100), // Fast for testing
        HotSla = TimeSpan.FromHours(24),
        WarmSla = TimeSpan.FromDays(7),
        ColdSla = TimeSpan.FromDays(30),
        WarningThreshold = 0.80
    };
    _options = slaOptions;

    _cts = new CancellationTokenSource();
}
|
||||
|
||||
/// <summary>
/// Cancels and disposes the cancellation source owned by this test class.
/// </summary>
/// <returns>An already-completed <see cref="ValueTask"/>; no asynchronous work is done.</returns>
public ValueTask DisposeAsync()
{
    _cts.Cancel();
    _cts.Dispose();

    // Nothing here is asynchronous, so return the completed task directly instead of
    // marking the method async just to await Task.CompletedTask.
    return ValueTask.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// With no pending entries, one polling cycle records neither a warning nor a breach.
/// </summary>
[Fact]
public async Task Monitor_WithNoEntries_DoesNotPublishNotifications()
{
    // Arrange - repository reports nothing pending
    IReadOnlyList<GreyQueueEntry> nothingPending = [];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(nothingPending));

    var monitor = CreateService();

    // Act - a single polling cycle
    await RunOnePollingCycle(monitor);

    // Assert
    Assert.Empty(_notificationPublisher.Warnings);
    Assert.Empty(_notificationPublisher.Breaches);
}
|
||||
|
||||
/// <summary>
/// An entry six hours old (a quarter of the 24h hot SLA configured in the constructor)
/// produces no notifications.
/// </summary>
[Fact]
public async Task Monitor_WithHealthyEntry_DoesNotPublishNotifications()
{
    // Arrange - 6h elapsed out of 24h = 25%
    var sixHoursAgo = _timeProvider.GetUtcNow().AddHours(-6);
    var healthy = CreateEntry(sixHoursAgo, 0.75);
    IReadOnlyList<GreyQueueEntry> pending = [healthy];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(pending));

    var monitor = CreateService();

    // Act
    await RunOnePollingCycle(monitor);

    // Assert
    Assert.Empty(_notificationPublisher.Warnings);
    Assert.Empty(_notificationPublisher.Breaches);
}
|
||||
|
||||
/// <summary>
/// An entry 19.2 hours old (80% of the 24h hot SLA, matching the configured warning
/// threshold) produces exactly one warning for that entry and no breach.
/// </summary>
[Fact]
public async Task Monitor_WithEntryAt80Percent_PublishesWarning()
{
    // Arrange - 19.2h elapsed out of 24h = 80%
    var atThreshold = _timeProvider.GetUtcNow().AddHours(-19.2);
    var entry = CreateEntry(atThreshold, 0.75);
    IReadOnlyList<GreyQueueEntry> pending = [entry];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(pending));

    var monitor = CreateService();

    // Act
    await RunOnePollingCycle(monitor);

    // Assert
    var warning = Assert.Single(_notificationPublisher.Warnings);
    Assert.Empty(_notificationPublisher.Breaches);
    Assert.Equal(entry.Id, warning.EntryId);
}
|
||||
|
||||
[Fact]
public async Task Monitor_WithBreachedEntry_PublishesBreach()
{
    // Arrange: entry created 25h ago (25h of 24h = breached).
    var entry = CreateEntry(_timeProvider.GetUtcNow().AddHours(-25), 0.75);
    IReadOnlyList<GreyQueueEntry> pending = [entry];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(pending));
    var sut = CreateService();

    // Act
    await RunOnePollingCycle(sut);

    // Assert: exactly one breach, for this entry, and no warning.
    Assert.Empty(_notificationPublisher.Warnings);
    var breach = Assert.Single(_notificationPublisher.Breaches);
    Assert.Equal(entry.Id, breach.EntryId);
}
|
||||
|
||||
[Fact]
public async Task Monitor_TimeAdvances_EntryMovesToWarning()
{
    // Arrange: entry starts at 50% (12h of 24h).
    var entry = CreateEntry(_timeProvider.GetUtcNow().AddHours(-12), 0.75);
    IReadOnlyList<GreyQueueEntry> pending = [entry];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(pending));
    var sut = CreateService();

    // Act 1: first check at 50% publishes no warning.
    await RunOnePollingCycle(sut);
    Assert.Empty(_notificationPublisher.Warnings);

    // Act 2: move the clock forward 7.2h so the entry sits at 80%.
    _timeProvider.Advance(TimeSpan.FromHours(7.2));
    await RunOnePollingCycle(sut);

    // Assert: the second check published a single warning.
    Assert.Single(_notificationPublisher.Warnings);
}
|
||||
|
||||
[Fact]
public async Task Monitor_TimeAdvances_EntryMovesToBreach()
{
    // Arrange: entry starts at 90% (21.6h of 24h).
    var entry = CreateEntry(_timeProvider.GetUtcNow().AddHours(-21.6), 0.75);
    IReadOnlyList<GreyQueueEntry> pending = [entry];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(pending));
    var sut = CreateService();

    // Act 1: at 90% the entry is in the warning zone.
    await RunOnePollingCycle(sut);
    Assert.Single(_notificationPublisher.Warnings);
    Assert.Empty(_notificationPublisher.Breaches);

    // Act 2: forget the earlier notifications, then advance 3h (now at 102.5%).
    _notificationPublisher.Clear();
    _timeProvider.Advance(TimeSpan.FromHours(3));
    await RunOnePollingCycle(sut);

    // Assert: only a breach is published after the SLA has elapsed.
    Assert.Empty(_notificationPublisher.Warnings);
    Assert.Single(_notificationPublisher.Breaches);
}
|
||||
|
||||
[Fact]
public async Task Monitor_MultipleEntries_ClassifiesByBand()
{
    // Arrange: one entry per band, each at a different share of its SLA.
    var hotEntry = CreateEntry(_timeProvider.GetUtcNow().AddHours(-20), 0.75);  // HOT at 83%
    var warmEntry = CreateEntry(_timeProvider.GetUtcNow().AddDays(-6), 0.50);   // WARM at 86%
    var coldEntry = CreateEntry(_timeProvider.GetUtcNow().AddDays(-10), 0.20);  // COLD at 33%

    IReadOnlyList<GreyQueueEntry> pending = [hotEntry, warmEntry, coldEntry];
    _repository.GetPendingAsync(Arg.Any<CancellationToken>())
        .Returns(Task.FromResult(pending));
    var sut = CreateService();

    // Act
    await RunOnePollingCycle(sut);

    // Assert: HOT and WARM are in the warning zone, COLD is fine.
    var warnings = _notificationPublisher.Warnings;
    Assert.Equal(2, warnings.Count);
    Assert.Contains(warnings, w => w.EntryId == hotEntry.Id);
    Assert.Contains(warnings, w => w.EntryId == warmEntry.Id);
}
|
||||
|
||||
#region Helpers
|
||||
|
||||
/// <summary>
/// Builds the service under test from this class's repository, notification publisher,
/// options, time provider and metrics, using a no-op logger.
/// </summary>
private UnknownsSlaMonitorService CreateService() =>
    new(
        _repository,
        _notificationPublisher,
        Options.Create(_options),
        _timeProvider,
        _metrics,
        new NullLogger<UnknownsSlaMonitorService>());
|
||||
|
||||
/// <summary>
/// Runs a single SLA check by invoking the service's private
/// <c>CheckSlaBoundsAsync</c> method through reflection and awaiting its task.
/// </summary>
/// <param name="service">The service instance to run the check on.</param>
/// <exception cref="InvalidOperationException">
/// The private method could not be found or did not return a <see cref="Task"/>.
/// Failing here stops tests from passing vacuously when the method is renamed
/// or its signature changes.
/// </exception>
private async Task RunOnePollingCycle(UnknownsSlaMonitorService service)
{
    const string methodName = "CheckSlaBoundsAsync";

    var method = typeof(UnknownsSlaMonitorService).GetMethod(
            methodName,
            System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance)
        ?? throw new InvalidOperationException(
            $"Private method '{methodName}' was not found on {nameof(UnknownsSlaMonitorService)}.");

    if (method.Invoke(service, [CancellationToken.None]) is not Task cycle)
    {
        throw new InvalidOperationException(
            $"'{methodName}' did not return a Task; the test helper cannot await it.");
    }

    await cycle;
}
|
||||
|
||||
/// <summary>
/// Creates a queue entry with a fresh id and a unique npm package reference.
/// </summary>
/// <param name="createdAt">Value assigned to the entry's <c>CreatedAt</c>.</param>
/// <param name="score">Value assigned to the entry's <c>Score</c>.</param>
/// <returns>The newly created entry.</returns>
private static GreyQueueEntry CreateEntry(DateTimeOffset createdAt, double score) =>
    new()
    {
        Id = Guid.NewGuid(),
        // A second, independent GUID keeps every BomRef distinct.
        BomRef = $"pkg:npm/test-{Guid.NewGuid():N}@1.0.0",
        Score = score,
        CreatedAt = createdAt
    };
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
/// In-memory <see cref="INotificationPublisher"/> that records the SLA warning and
/// breach notifications it is asked to publish, so tests can inspect them.
/// </summary>
public sealed class TestNotificationPublisher : INotificationPublisher
{
    /// <summary>Warning notifications captured so far, in publish order.</summary>
    public List<SlaWarningNotification> Warnings { get; } = new();

    /// <summary>Breach notifications captured so far, in publish order.</summary>
    public List<SlaBreachNotification> Breaches { get; } = new();

    /// <summary>
    /// Stores the notification in the matching list; any other notification type is ignored.
    /// </summary>
    /// <param name="notification">The notification being published.</param>
    /// <param name="ct">Unused.</param>
    /// <returns>An already-completed task.</returns>
    public Task PublishAsync<T>(T notification, CancellationToken ct = default)
    {
        if (notification is SlaWarningNotification warning)
        {
            Warnings.Add(warning);
        }
        else if (notification is SlaBreachNotification breach)
        {
            Breaches.Add(breach);
        }

        return Task.CompletedTask;
    }

    /// <summary>Discards every captured warning and breach.</summary>
    public void Clear()
    {
        Warnings.Clear();
        Breaches.Clear();
    }
}
|
||||
|
||||
/// <summary>
/// Logger for tests that discards everything: no level is enabled,
/// scopes are not tracked, and log calls do nothing.
/// </summary>
public sealed class NullLogger<T> : ILogger<T>
{
    /// <summary>Returns <c>null</c>; no scope object is created.</summary>
    public IDisposable? BeginScope<TState>(TState state) where TState : notnull
    {
        return null;
    }

    /// <summary>Reports every log level as disabled.</summary>
    public bool IsEnabled(LogLevel logLevel)
    {
        return false;
    }

    /// <summary>Drops the log entry without formatting it.</summary>
    public void Log<TState>(
        LogLevel logLevel,
        EventId eventId,
        TState state,
        Exception? exception,
        Func<TState, Exception?, string> formatter)
    {
    }
}
|
||||
@@ -0,0 +1,340 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// UnknownsSlaMonitorServiceTests.cs
|
||||
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
|
||||
// Task: UQ-001 - Unit tests for SLA calculation
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using NSubstitute;
|
||||
using StellaOps.Unknowns.Services;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Unknowns.Core.Tests.Services;
|
||||
|
||||
/// <summary>
/// Unit tests for the SLA arithmetic in <see cref="SlaCalculator"/>: band selection,
/// remaining time, elapsed fraction, warning-zone detection and breach detection.
/// All tests use a fixed clock and the options configured in the constructor
/// (HOT 24h, WARM 7d, COLD 30d, warning threshold 0.80).
/// </summary>
public sealed class UnknownsSlaMonitorServiceTests
{
    private readonly IGreyQueueRepository _repository;
    private readonly INotificationPublisher _notificationPublisher;
    private readonly UnknownsMetrics _metrics;
    private readonly FakeTimeProvider _timeProvider;
    private readonly IOptions<UnknownsSlaOptions> _options;

    public UnknownsSlaMonitorServiceTests()
    {
        _repository = Substitute.For<IGreyQueueRepository>();
        _notificationPublisher = Substitute.For<INotificationPublisher>();
        _metrics = new UnknownsMetrics();
        _timeProvider = new FakeTimeProvider(new DateTimeOffset(2026, 1, 19, 12, 0, 0, TimeSpan.Zero));
        _options = Options.Create(new UnknownsSlaOptions
        {
            PollingInterval = TimeSpan.FromMinutes(5),
            HotSla = TimeSpan.FromHours(24),
            WarmSla = TimeSpan.FromDays(7),
            ColdSla = TimeSpan.FromDays(30),
            WarningThreshold = 0.80
        });
    }

    /// <summary>Current time according to the fake clock.</summary>
    private DateTimeOffset Now => _timeProvider.GetUtcNow();

    /// <summary>SLA options shared by every test.</summary>
    private UnknownsSlaOptions Sla => _options.Value;

    #region Band Classification Tests

    [Theory]
    [InlineData(0.70, "hot")]
    [InlineData(0.85, "hot")]
    [InlineData(1.00, "hot")]
    [InlineData(0.40, "warm")]
    [InlineData(0.55, "warm")]
    [InlineData(0.69, "warm")]
    [InlineData(0.00, "cold")]
    [InlineData(0.20, "cold")]
    [InlineData(0.39, "cold")]
    public void GetBand_ReturnsCorrectBand(double score, string expectedBand)
    {
        // Arrange & Act
        var band = SlaCalculator.GetBand(score);

        // Assert
        Assert.Equal(expectedBand, band.ToString().ToLowerInvariant());
    }

    #endregion

    #region SLA Calculation Tests

    [Fact]
    public void CalculateSlaRemaining_HotBand_Returns24Hours()
    {
        // Arrange
        var entry = CreateEntry(Now, 0.75);

        // Act
        var remaining = SlaCalculator.CalculateRemaining(entry, Now, Sla);

        // Assert
        Assert.Equal(TimeSpan.FromHours(24), remaining);
    }

    [Fact]
    public void CalculateSlaRemaining_HotBand_After12Hours_Returns12Hours()
    {
        // Arrange
        var entry = CreateEntry(Now.AddHours(-12), 0.75);

        // Act
        var remaining = SlaCalculator.CalculateRemaining(entry, Now, Sla);

        // Assert
        Assert.Equal(TimeSpan.FromHours(12), remaining);
    }

    [Fact]
    public void CalculateSlaRemaining_WarmBand_Returns7Days()
    {
        // Arrange
        var entry = CreateEntry(Now, 0.50);

        // Act
        var remaining = SlaCalculator.CalculateRemaining(entry, Now, Sla);

        // Assert
        Assert.Equal(TimeSpan.FromDays(7), remaining);
    }

    [Fact]
    public void CalculateSlaRemaining_ColdBand_Returns30Days()
    {
        // Arrange
        var entry = CreateEntry(Now, 0.20);

        // Act
        var remaining = SlaCalculator.CalculateRemaining(entry, Now, Sla);

        // Assert
        Assert.Equal(TimeSpan.FromDays(30), remaining);
    }

    [Fact]
    public void CalculateSlaRemaining_Breached_ReturnsNegative()
    {
        // Arrange: created 25 hours ago in the HOT band (24h SLA).
        var entry = CreateEntry(Now.AddHours(-25), 0.75);

        // Act
        var remaining = SlaCalculator.CalculateRemaining(entry, Now, Sla);

        // Assert
        Assert.True(remaining < TimeSpan.Zero);
        Assert.Equal(TimeSpan.FromHours(-1), remaining);
    }

    #endregion

    #region Percentage Elapsed Tests

    [Theory]
    [InlineData(0, 0.0)]
    [InlineData(12, 0.5)]
    [InlineData(19.2, 0.8)] // 80% warning threshold
    [InlineData(24, 1.0)]
    [InlineData(48, 2.0)]
    public void CalculatePercentElapsed_HotBand_ReturnsCorrectPercentage(double hoursElapsed, double expectedPercent)
    {
        // Arrange
        var entry = CreateEntry(Now.AddHours(-hoursElapsed), 0.75);

        // Act
        var percent = SlaCalculator.CalculatePercentElapsed(entry, Now, Sla);

        // Assert
        Assert.Equal(expectedPercent, percent, precision: 2);
    }

    #endregion

    #region Warning Threshold Tests

    [Fact]
    public void IsInWarningZone_At80Percent_ReturnsTrue()
    {
        // Arrange: 19.2h is 80% of the 24h HOT SLA.
        var entry = CreateEntry(Now.AddHours(-19.2), 0.75);

        // Act
        var isWarning = SlaCalculator.IsInWarningZone(entry, Now, Sla);

        // Assert
        Assert.True(isWarning);
    }

    [Fact]
    public void IsInWarningZone_At50Percent_ReturnsFalse()
    {
        // Arrange: 12h is 50% of the 24h HOT SLA.
        var entry = CreateEntry(Now.AddHours(-12), 0.75);

        // Act
        var isWarning = SlaCalculator.IsInWarningZone(entry, Now, Sla);

        // Assert
        Assert.False(isWarning);
    }

    [Fact]
    public void IsInWarningZone_At100Percent_ReturnsFalse_BecauseBreached()
    {
        // Arrange: exactly 24h, i.e. exactly 100% of the HOT SLA. This pins the
        // exclusive upper bound of the warning zone: at 100% the entry is
        // breached, not merely in warning.
        var entry = CreateEntry(Now.AddHours(-24), 0.75);

        // Act
        var isWarning = SlaCalculator.IsInWarningZone(entry, Now, Sla);

        // Assert
        Assert.False(isWarning); // Past warning, now breached
    }

    #endregion

    #region Breach Detection Tests

    [Fact]
    public void IsBreached_BeforeSla_ReturnsFalse()
    {
        // Arrange
        var entry = CreateEntry(Now.AddHours(-12), 0.75);

        // Act
        var isBreached = SlaCalculator.IsBreached(entry, Now, Sla);

        // Assert
        Assert.False(isBreached);
    }

    [Fact]
    public void IsBreached_ExactlyAtSla_ReturnsTrue()
    {
        // Arrange
        var entry = CreateEntry(Now.AddHours(-24), 0.75);

        // Act
        var isBreached = SlaCalculator.IsBreached(entry, Now, Sla);

        // Assert
        Assert.True(isBreached);
    }

    [Fact]
    public void IsBreached_AfterSla_ReturnsTrue()
    {
        // Arrange
        var entry = CreateEntry(Now.AddHours(-48), 0.75);

        // Act
        var isBreached = SlaCalculator.IsBreached(entry, Now, Sla);

        // Assert
        Assert.True(isBreached);
    }

    #endregion

    #region Helpers

    /// <summary>
    /// Creates a queue entry with a fresh id, a fixed package reference,
    /// and the given creation time and score.
    /// </summary>
    private static GreyQueueEntry CreateEntry(DateTimeOffset createdAt, double score)
    {
        return new GreyQueueEntry
        {
            Id = Guid.NewGuid(),
            BomRef = "pkg:npm/test@1.0.0",
            Score = score,
            CreatedAt = createdAt
        };
    }

    #endregion
}
|
||||
|
||||
/// <summary>
/// Stateless SLA arithmetic for grey-queue entries, kept separate for testability.
/// An entry's score selects a band, the band selects an SLA limit from the options,
/// and elapsed time is measured from the entry's <c>CreatedAt</c>.
/// </summary>
public static class SlaCalculator
{
    /// <summary>
    /// Maps a score to its band: 0.70 and above is Hot, 0.40 and above is Warm,
    /// anything else is Cold.
    /// </summary>
    public static UnknownsBand GetBand(double score)
    {
        if (score >= 0.70)
        {
            return UnknownsBand.Hot;
        }

        if (score >= 0.40)
        {
            return UnknownsBand.Warm;
        }

        return UnknownsBand.Cold;
    }

    /// <summary>
    /// Returns the SLA limit configured for <paramref name="band"/>;
    /// any band other than Hot or Warm uses the Cold limit.
    /// </summary>
    public static TimeSpan GetSlaLimit(UnknownsBand band, UnknownsSlaOptions options) =>
        band switch
        {
            UnknownsBand.Hot => options.HotSla,
            UnknownsBand.Warm => options.WarmSla,
            _ => options.ColdSla
        };

    /// <summary>
    /// Time left before the entry's SLA limit is reached; negative once the limit has passed.
    /// </summary>
    public static TimeSpan CalculateRemaining(GreyQueueEntry entry, DateTimeOffset now, UnknownsSlaOptions options)
    {
        var limit = GetSlaLimit(GetBand(entry.Score), options);
        var age = now - entry.CreatedAt;
        return limit - age;
    }

    /// <summary>
    /// Elapsed time as a fraction of the entry's SLA limit (1.0 means the whole limit has elapsed).
    /// </summary>
    public static double CalculatePercentElapsed(GreyQueueEntry entry, DateTimeOffset now, UnknownsSlaOptions options)
    {
        var limit = GetSlaLimit(GetBand(entry.Score), options);
        var age = now - entry.CreatedAt;
        return age / limit;
    }

    /// <summary>
    /// True when the elapsed fraction has reached the configured warning threshold
    /// but is still below 1.0.
    /// </summary>
    public static bool IsInWarningZone(GreyQueueEntry entry, DateTimeOffset now, UnknownsSlaOptions options)
    {
        var fraction = CalculatePercentElapsed(entry, now, options);
        if (fraction >= options.WarningThreshold)
        {
            return fraction < 1.0;
        }

        return false;
    }

    /// <summary>
    /// True when the elapsed fraction is 1.0 or more.
    /// </summary>
    public static bool IsBreached(GreyQueueEntry entry, DateTimeOffset now, UnknownsSlaOptions options) =>
        CalculatePercentElapsed(entry, now, options) >= 1.0;
}
|
||||
|
||||
/// <summary>
/// Manually driven <see cref="TimeProvider"/> for tests: the reported time only
/// changes when <see cref="Advance"/> or <see cref="SetTime"/> is called.
/// </summary>
public sealed class FakeTimeProvider : TimeProvider
{
    private DateTimeOffset _current;

    /// <summary>Creates a provider whose clock starts at <paramref name="initialTime"/>.</summary>
    public FakeTimeProvider(DateTimeOffset initialTime)
    {
        _current = initialTime;
    }

    /// <summary>Returns the currently configured time.</summary>
    public override DateTimeOffset GetUtcNow()
    {
        return _current;
    }

    /// <summary>Moves the clock by <paramref name="duration"/>.</summary>
    public void Advance(TimeSpan duration)
    {
        _current += duration;
    }

    /// <summary>Replaces the current time with <paramref name="time"/>.</summary>
    public void SetTime(DateTimeOffset time)
    {
        _current = time;
    }
}
|
||||
Reference in New Issue
Block a user