335 lines
12 KiB
C#
335 lines
12 KiB
C#
// -----------------------------------------------------------------------------
// GreyQueueWatchdogService.cs
// Sprint: SPRINT_20260118_018_Unknowns_queue_enhancement
// Task: UQ-004 - Add timeout watchdog for stuck processing
// Description: Watchdog service to detect and handle stuck entries
// -----------------------------------------------------------------------------
|
|
|
|
using System.Diagnostics.Metrics;
|
|
using Microsoft.Extensions.Hosting;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
|
|
namespace StellaOps.Unknowns.Services;
|
|
|
|
/// <summary>
/// Watchdog service that detects and handles stuck entries in Processing status.
/// Entries past <see cref="GreyQueueWatchdogOptions.ProcessingAlertThreshold"/> trigger a
/// <see cref="StuckProcessingAlert"/>; entries past <see cref="GreyQueueWatchdogOptions.ProcessingTimeout"/>
/// are either force-retried with exponential backoff or marked Failed once their
/// per-entry attempt budget is exhausted.
/// </summary>
public sealed class GreyQueueWatchdogService : BackgroundService
{
    private readonly IGreyQueueRepository _repository;
    private readonly INotificationPublisher _notificationPublisher;
    private readonly GreyQueueWatchdogOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<GreyQueueWatchdogService> _logger;

    // Instruments are static so all instances in the process report to the same meter.
    private static readonly Meter WatchdogMeter = new("StellaOps.Unknowns.Watchdog", "1.0.0");

    private static readonly Counter<long> StuckTotal = WatchdogMeter.CreateCounter<long>(
        "greyqueue_stuck_total",
        "entries",
        "Total number of stuck entries detected");

    private static readonly Counter<long> TimeoutTotal = WatchdogMeter.CreateCounter<long>(
        "greyqueue_timeout_total",
        "entries",
        "Total number of entries that timed out");

    private static readonly Counter<long> RetryTotal = WatchdogMeter.CreateCounter<long>(
        "greyqueue_watchdog_retry_total",
        "entries",
        "Total number of forced retries by watchdog");

    private static readonly Counter<long> FailedTotal = WatchdogMeter.CreateCounter<long>(
        "greyqueue_watchdog_failed_total",
        "entries",
        "Total number of entries moved to Failed by watchdog");

    private static readonly Gauge<int> ProcessingCount = WatchdogMeter.CreateGauge<int>(
        "greyqueue_processing_count",
        "entries",
        "Current number of entries in Processing status");

    /// <summary>
    /// Creates the watchdog service.
    /// </summary>
    /// <param name="repository">Grey-queue persistence access. Required.</param>
    /// <param name="notificationPublisher">Publisher for alert/retry/failure notifications. Required.</param>
    /// <param name="options">Watchdog configuration; defaults are used when absent.</param>
    /// <param name="timeProvider">Clock abstraction; falls back to <see cref="TimeProvider.System"/>.</param>
    /// <param name="logger">Logger. Required.</param>
    public GreyQueueWatchdogService(
        IGreyQueueRepository repository,
        INotificationPublisher notificationPublisher,
        IOptions<GreyQueueWatchdogOptions> options,
        TimeProvider timeProvider,
        ILogger<GreyQueueWatchdogService> logger)
    {
        _repository = repository ?? throw new ArgumentNullException(nameof(repository));
        _notificationPublisher = notificationPublisher ?? throw new ArgumentNullException(nameof(notificationPublisher));
        // Options and clock deliberately fall back to defaults rather than throwing.
        _options = options?.Value ?? new GreyQueueWatchdogOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Main loop: runs one sweep immediately, then once per
    /// <see cref="GreyQueueWatchdogOptions.CheckInterval"/> until shutdown.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Grey Queue Watchdog starting with interval {Interval}, alert threshold {AlertThreshold}, timeout {Timeout}",
            _options.CheckInterval,
            _options.ProcessingAlertThreshold,
            _options.ProcessingTimeout);

        // PeriodicTimer keeps a steady cadence (no drift from sweep duration) and,
        // unlike Task.Delay(interval, stoppingToken), lets the loop end on shutdown
        // without an OperationCanceledException escaping ExecuteAsync.
        using var timer = new PeriodicTimer(_options.CheckInterval);
        try
        {
            do
            {
                try
                {
                    await CheckProcessingEntriesAsync(stoppingToken);
                }
                catch (Exception ex) when (ex is not OperationCanceledException)
                {
                    // Keep the watchdog alive across transient failures; the next
                    // tick simply retries the sweep.
                    _logger.LogError(ex, "Watchdog check failed");
                }
            }
            while (await timer.WaitForNextTickAsync(stoppingToken));
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown path — nothing to do.
        }
    }

    /// <summary>
    /// Performs one sweep over all entries in Processing status: records the gauge,
    /// times out overdue entries, and alerts on stuck (but not yet timed-out) ones.
    /// </summary>
    private async Task CheckProcessingEntriesAsync(CancellationToken ct)
    {
        var now = _timeProvider.GetUtcNow();
        var processingEntries = await _repository.GetByStatusAsync(GreyQueueStatus.Processing, ct);

        ProcessingCount.Record(processingEntries.Count);

        foreach (var entry in processingEntries)
        {
            // LastProcessedAt marks the start of the current attempt; fall back to
            // CreatedAt for entries that never recorded one.
            var processingDuration = now - (entry.LastProcessedAt ?? entry.CreatedAt);

            if (processingDuration >= _options.ProcessingTimeout)
            {
                // Entry has timed out — force retry or fail it.
                await HandleTimeoutAsync(entry, processingDuration, ct);
            }
            else if (processingDuration >= _options.ProcessingAlertThreshold)
            {
                // Stuck: past the alert threshold but not yet timed out.
                // NOTE(review): the alert is re-published on every sweep while the
                // entry remains stuck — confirm downstream consumers dedupe.
                _logger.LogWarning(
                    "Entry {EntryId} has been processing for {Duration}",
                    entry.Id, processingDuration);

                StuckTotal.Add(1);

                await _notificationPublisher.PublishAsync(new StuckProcessingAlert
                {
                    EntryId = entry.Id,
                    BomRef = entry.BomRef,
                    ProcessingDuration = processingDuration,
                    AlertedAt = now
                }, ct);
            }
        }
    }

    /// <summary>
    /// Handles an entry whose processing exceeded the timeout: marks it Failed when
    /// its per-entry attempt budget is exhausted, otherwise schedules a forced retry
    /// with exponential backoff.
    /// </summary>
    /// <param name="entry">The timed-out entry.</param>
    /// <param name="processingDuration">How long the entry has been processing.</param>
    /// <param name="ct">Cancellation token.</param>
    private async Task HandleTimeoutAsync(
        GreyQueueEntry entry,
        TimeSpan processingDuration,
        CancellationToken ct)
    {
        var now = _timeProvider.GetUtcNow();

        _logger.LogWarning(
            "Entry {EntryId} has timed out after {Duration}. Attempts: {Attempts}/{MaxAttempts}",
            entry.Id, processingDuration, entry.ProcessingAttempts, entry.MaxAttempts);

        TimeoutTotal.Add(1);

        // The per-entry MaxAttempts governs here (not the option of the same name).
        if (entry.ProcessingAttempts >= entry.MaxAttempts)
        {
            _logger.LogError(
                "Entry {EntryId} has exceeded max attempts ({MaxAttempts}), marking as Failed",
                entry.Id, entry.MaxAttempts);

            await _repository.UpdateStatusAsync(entry.Id, GreyQueueStatus.Failed, ct);
            FailedTotal.Add(1);

            await _notificationPublisher.PublishAsync(new EntryFailedNotification
            {
                EntryId = entry.Id,
                BomRef = entry.BomRef,
                Reason = $"Timed out after {entry.ProcessingAttempts} attempts",
                FailedAt = now
            }, ct);
        }
        else
        {
            _logger.LogInformation(
                "Forcing retry for entry {EntryId} (attempt {Attempt})",
                entry.Id, entry.ProcessingAttempts + 1);

            // Exponential backoff: base * 2^attempts. The exponent is capped so a
            // corrupt attempt counter cannot push AddMinutes out of range; the cap
            // is a no-op for any sane MaxAttempts.
            var backoffMultiplier = Math.Pow(2, Math.Min(entry.ProcessingAttempts, 20));
            var nextProcessingAt = now.AddMinutes(_options.BaseRetryDelayMinutes * backoffMultiplier);

            await _repository.ForceRetryAsync(entry.Id, nextProcessingAt, ct);
            RetryTotal.Add(1);

            await _notificationPublisher.PublishAsync(new ForcedRetryNotification
            {
                EntryId = entry.Id,
                BomRef = entry.BomRef,
                AttemptNumber = entry.ProcessingAttempts + 1,
                NextProcessingAt = nextProcessingAt
            }, ct);
        }
    }

    /// <summary>
    /// Manually triggers a retry for a stuck entry, scheduling it for immediate
    /// processing.
    /// </summary>
    /// <param name="entryId">Identifier of the entry to retry.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <exception cref="InvalidOperationException">
    /// The entry does not exist, or is not in Processing or Failed status.
    /// </exception>
    public async Task ManualRetryAsync(Guid entryId, CancellationToken ct = default)
    {
        var entry = await _repository.GetByIdAsync(entryId, ct);
        if (entry == null)
        {
            throw new InvalidOperationException($"Entry {entryId} not found");
        }

        if (entry.Status != GreyQueueStatus.Processing && entry.Status != GreyQueueStatus.Failed)
        {
            throw new InvalidOperationException($"Entry {entryId} is not stuck (status: {entry.Status})");
        }

        _logger.LogInformation("Manual retry triggered for entry {EntryId}", entryId);

        var now = _timeProvider.GetUtcNow();
        await _repository.ForceRetryAsync(entryId, now, ct);
    }

    /// <summary>
    /// Gets current watchdog statistics: a point-in-time snapshot computed from the
    /// entries currently in Processing status.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Counts of processing/stuck/timed-out entries and the oldest duration.</returns>
    public async Task<WatchdogStats> GetStatsAsync(CancellationToken ct = default)
    {
        var now = _timeProvider.GetUtcNow();
        var processingEntries = await _repository.GetByStatusAsync(GreyQueueStatus.Processing, ct);

        var stuckCount = 0;
        var timedOutCount = 0;
        var oldestProcessingDuration = TimeSpan.Zero;

        foreach (var entry in processingEntries)
        {
            var duration = now - (entry.LastProcessedAt ?? entry.CreatedAt);

            if (duration > oldestProcessingDuration)
            {
                oldestProcessingDuration = duration;
            }

            // Buckets are disjoint: an entry counts as timed-out OR stuck, not both.
            if (duration >= _options.ProcessingTimeout)
            {
                timedOutCount++;
            }
            else if (duration >= _options.ProcessingAlertThreshold)
            {
                stuckCount++;
            }
        }

        return new WatchdogStats
        {
            TotalProcessing = processingEntries.Count,
            StuckCount = stuckCount,
            TimedOutCount = timedOutCount,
            OldestProcessingDuration = oldestProcessingDuration,
            CheckedAt = now
        };
    }
}
|
|
|
|
/// <summary>
/// Configuration for the grey-queue watchdog, bound from the
/// <see cref="SectionName"/> configuration section.
/// </summary>
public sealed record GreyQueueWatchdogOptions
{
    /// <summary>Name of the configuration section these options bind to.</summary>
    public const string SectionName = "Unknowns:Watchdog";

    /// <summary>Interval between watchdog sweeps of Processing entries. Default: 5 minutes.</summary>
    public TimeSpan CheckInterval { get; init; } = new(0, 5, 0);

    /// <summary>Processing duration after which a stuck-entry alert is raised. Default: 1 hour.</summary>
    public TimeSpan ProcessingAlertThreshold { get; init; } = new(1, 0, 0);

    /// <summary>
    /// Processing duration after which an entry is treated as timed out
    /// (forced retry, or Failed once attempts are exhausted). Default: 4 hours.
    /// </summary>
    public TimeSpan ProcessingTimeout { get; init; } = new(4, 0, 0);

    /// <summary>Maximum processing attempts before an entry is marked Failed. Default: 5.</summary>
    public int MaxAttempts { get; init; } = 5;

    /// <summary>Base delay in minutes for the exponential retry backoff. Default: 15.</summary>
    public double BaseRetryDelayMinutes { get; init; } = 15;
}
|
|
|
|
/// <summary>
/// Watchdog statistics.
/// Point-in-time snapshot of the Processing backlog; all counts are computed
/// from the entries in Processing status at <see cref="CheckedAt"/>.
/// </summary>
public sealed record WatchdogStats
{
    /// <summary>Total entries in Processing status.</summary>
    public int TotalProcessing { get; init; }

    /// <summary>Number of stuck entries (alert threshold exceeded, but not yet timed out).</summary>
    public int StuckCount { get; init; }

    /// <summary>Number of timed out entries (processing timeout exceeded).</summary>
    public int TimedOutCount { get; init; }

    /// <summary>Duration of oldest processing entry; <see cref="TimeSpan.Zero"/> when none are processing.</summary>
    public TimeSpan OldestProcessingDuration { get; init; }

    /// <summary>When stats were checked (watchdog clock, UTC).</summary>
    public DateTimeOffset CheckedAt { get; init; }
}
|
|
|
|
#region Notifications
|
|
|
|
/// <summary>
/// Stuck processing alert.
/// Published when an entry has been in Processing longer than the alert
/// threshold but has not yet reached the processing timeout.
/// </summary>
public sealed record StuckProcessingAlert
{
    /// <summary>Identifier of the stuck queue entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference carried by the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>How long the entry had been processing when the alert was raised.</summary>
    public TimeSpan ProcessingDuration { get; init; }

    /// <summary>When the alert was raised (watchdog clock).</summary>
    public DateTimeOffset AlertedAt { get; init; }
}
|
|
|
|
/// <summary>
/// Entry failed notification.
/// Published when the watchdog moves a timed-out entry to Failed after its
/// processing attempts are exhausted.
/// </summary>
public sealed record EntryFailedNotification
{
    /// <summary>Identifier of the failed queue entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference carried by the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>Human-readable reason for the failure.</summary>
    public required string Reason { get; init; }

    /// <summary>When the entry was marked Failed (watchdog clock).</summary>
    public DateTimeOffset FailedAt { get; init; }
}
|
|
|
|
/// <summary>
/// Forced retry notification.
/// Published when the watchdog schedules a timed-out entry for another attempt.
/// </summary>
public sealed record ForcedRetryNotification
{
    /// <summary>Identifier of the retried queue entry.</summary>
    public required Guid EntryId { get; init; }

    /// <summary>BOM reference carried by the entry.</summary>
    public required string BomRef { get; init; }

    /// <summary>One-based number of the upcoming attempt.</summary>
    public int AttemptNumber { get; init; }

    /// <summary>Earliest time the entry should be processed again (includes backoff).</summary>
    public DateTimeOffset NextProcessingAt { get; init; }
}
|
|
|
|
#endregion
|
|
|
|
#region Repository Extensions
|
|
|
|
// These would be added to IGreyQueueRepository
// (declared partial here so the watchdog-specific members can live beside the service).
public partial interface IGreyQueueRepository
{
    /// <summary>Gets entries by status.</summary>
    /// <param name="status">Status to filter entries by.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Entries currently in the given status.</returns>
    Task<IReadOnlyList<GreyQueueEntry>> GetByStatusAsync(GreyQueueStatus status, CancellationToken ct = default);

    /// <summary>Updates entry status.</summary>
    /// <param name="entryId">Identifier of the entry to update.</param>
    /// <param name="status">New status to set.</param>
    /// <param name="ct">Cancellation token.</param>
    Task UpdateStatusAsync(Guid entryId, GreyQueueStatus status, CancellationToken ct = default);

    /// <summary>Forces a retry for an entry.</summary>
    /// <param name="entryId">Identifier of the entry to retry.</param>
    /// <param name="nextProcessingAt">Earliest time the entry should be processed again.</param>
    /// <param name="ct">Cancellation token.</param>
    Task ForceRetryAsync(Guid entryId, DateTimeOffset nextProcessingAt, CancellationToken ct = default);

    /// <summary>Gets entry by ID.</summary>
    /// <param name="entryId">Identifier of the entry.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The entry, or <c>null</c> when it does not exist.</returns>
    Task<GreyQueueEntry?> GetByIdAsync(Guid entryId, CancellationToken ct = default);
}
|
|
|
|
#endregion
|