Files
git.stella-ops.org/src/StellaOps.Scanner.Worker/Hosting/ScannerWorkerHostedService.cs
master daa6a4ae8c
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Build Test Deploy / build-test (push) Has been cancelled
Build Test Deploy / authority-container (push) Has been cancelled
Build Test Deploy / docs (push) Has been cancelled
Build Test Deploy / deploy (push) Has been cancelled
up
2025-10-19 10:38:55 +03:00

202 lines
8.4 KiB
C#

using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scanner.Worker.Diagnostics;
using StellaOps.Scanner.Worker.Options;
using StellaOps.Scanner.Worker.Processing;
namespace StellaOps.Scanner.Worker.Hosting;
public sealed partial class ScannerWorkerHostedService : BackgroundService
{
private readonly IScanJobSource _jobSource;
private readonly ScanJobProcessor _processor;
private readonly LeaseHeartbeatService _heartbeatService;
private readonly ScannerWorkerMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly IOptionsMonitor<ScannerWorkerOptions> _options;
private readonly ILogger<ScannerWorkerHostedService> _logger;
private readonly IDelayScheduler _delayScheduler;
public ScannerWorkerHostedService(
IScanJobSource jobSource,
ScanJobProcessor processor,
LeaseHeartbeatService heartbeatService,
ScannerWorkerMetrics metrics,
TimeProvider timeProvider,
IDelayScheduler delayScheduler,
IOptionsMonitor<ScannerWorkerOptions> options,
ILogger<ScannerWorkerHostedService> logger)
{
_jobSource = jobSource ?? throw new ArgumentNullException(nameof(jobSource));
_processor = processor ?? throw new ArgumentNullException(nameof(processor));
_heartbeatService = heartbeatService ?? throw new ArgumentNullException(nameof(heartbeatService));
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
_delayScheduler = delayScheduler ?? throw new ArgumentNullException(nameof(delayScheduler));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
var runningJobs = new HashSet<Task>();
var delayStrategy = new PollDelayStrategy(_options.CurrentValue.Polling);
WorkerStarted(_logger);
while (!stoppingToken.IsCancellationRequested)
{
runningJobs.RemoveWhere(static task => task.IsCompleted);
var options = _options.CurrentValue;
if (runningJobs.Count >= options.MaxConcurrentJobs)
{
var completed = await Task.WhenAny(runningJobs).ConfigureAwait(false);
runningJobs.Remove(completed);
continue;
}
IScanJobLease? lease = null;
try
{
lease = await _jobSource.TryAcquireAsync(stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Scanner worker failed to acquire job lease; backing off.");
}
if (lease is null)
{
var delay = delayStrategy.NextDelay();
await _delayScheduler.DelayAsync(delay, stoppingToken).ConfigureAwait(false);
continue;
}
delayStrategy.Reset();
runningJobs.Add(RunJobAsync(lease, stoppingToken));
}
if (runningJobs.Count > 0)
{
await Task.WhenAll(runningJobs).ConfigureAwait(false);
}
WorkerStopping(_logger);
}
private async Task RunJobAsync(IScanJobLease lease, CancellationToken stoppingToken)
{
var options = _options.CurrentValue;
var jobStart = _timeProvider.GetUtcNow();
var queueLatency = jobStart - lease.EnqueuedAtUtc;
var jobCts = CancellationTokenSource.CreateLinkedTokenSource(stoppingToken);
var jobToken = jobCts.Token;
var context = new ScanJobContext(lease, _timeProvider, jobStart, jobToken);
_metrics.RecordQueueLatency(context, queueLatency);
JobAcquired(_logger, lease.JobId, lease.ScanId, lease.Attempt, queueLatency.TotalMilliseconds);
var heartbeatTask = _heartbeatService.RunAsync(lease, jobToken);
Exception? processingException = null;
try
{
await _processor.ExecuteAsync(context, jobToken).ConfigureAwait(false);
jobCts.Cancel();
await heartbeatTask.ConfigureAwait(false);
await lease.CompleteAsync(stoppingToken).ConfigureAwait(false);
var duration = _timeProvider.GetUtcNow() - jobStart;
_metrics.RecordJobDuration(context, duration);
_metrics.IncrementJobCompleted(context);
JobCompleted(_logger, lease.JobId, lease.ScanId, duration.TotalMilliseconds);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
processingException = null;
await lease.AbandonAsync("host-stopping", CancellationToken.None).ConfigureAwait(false);
JobAbandoned(_logger, lease.JobId, lease.ScanId);
}
catch (Exception ex)
{
processingException = ex;
var duration = _timeProvider.GetUtcNow() - jobStart;
_metrics.RecordJobDuration(context, duration);
var reason = ex.GetType().Name;
var maxAttempts = options.Queue.MaxAttempts;
if (lease.Attempt >= maxAttempts)
{
await lease.PoisonAsync(reason, CancellationToken.None).ConfigureAwait(false);
_metrics.IncrementJobFailed(context, reason);
JobPoisoned(_logger, lease.JobId, lease.ScanId, lease.Attempt, maxAttempts, ex);
}
else
{
await lease.AbandonAsync(reason, CancellationToken.None).ConfigureAwait(false);
JobAbandonedWithError(_logger, lease.JobId, lease.ScanId, lease.Attempt, maxAttempts, ex);
}
}
finally
{
jobCts.Cancel();
try
{
await heartbeatTask.ConfigureAwait(false);
}
catch (Exception ex) when (processingException is null && ex is not OperationCanceledException)
{
_logger.LogWarning(ex, "Heartbeat loop ended with an exception for job {JobId}.", lease.JobId);
}
await lease.DisposeAsync().ConfigureAwait(false);
jobCts.Dispose();
}
}
[LoggerMessage(EventId = 2000, Level = LogLevel.Information, Message = "Scanner worker host started.")]
private static partial void WorkerStarted(ILogger logger);
[LoggerMessage(EventId = 2001, Level = LogLevel.Information, Message = "Scanner worker host stopping.")]
private static partial void WorkerStopping(ILogger logger);
[LoggerMessage(
EventId = 2002,
Level = LogLevel.Information,
Message = "Leased job {JobId} (scan {ScanId}) attempt {Attempt}; queue latency {LatencyMs:F0} ms.")]
private static partial void JobAcquired(ILogger logger, string jobId, string scanId, int attempt, double latencyMs);
[LoggerMessage(
EventId = 2003,
Level = LogLevel.Information,
Message = "Job {JobId} (scan {ScanId}) completed in {DurationMs:F0} ms.")]
private static partial void JobCompleted(ILogger logger, string jobId, string scanId, double durationMs);
[LoggerMessage(
EventId = 2004,
Level = LogLevel.Warning,
Message = "Job {JobId} (scan {ScanId}) abandoned due to host shutdown.")]
private static partial void JobAbandoned(ILogger logger, string jobId, string scanId);
[LoggerMessage(
EventId = 2005,
Level = LogLevel.Warning,
Message = "Job {JobId} (scan {ScanId}) attempt {Attempt}/{MaxAttempts} abandoned after failure; job will be retried.")]
private static partial void JobAbandonedWithError(ILogger logger, string jobId, string scanId, int attempt, int maxAttempts, Exception exception);
[LoggerMessage(
EventId = 2006,
Level = LogLevel.Error,
Message = "Job {JobId} (scan {ScanId}) attempt {Attempt}/{MaxAttempts} exceeded retry budget; quarantining job.")]
private static partial void JobPoisoned(ILogger logger, string jobId, string scanId, int attempt, int maxAttempts, Exception exception);
}