feat(rate-limiting): Implement core rate limiting functionality with configuration, decision-making, metrics, middleware, and service registration

- Add RateLimitConfig for configuration management with YAML binding support.
- Introduce RateLimitDecision to encapsulate the result of rate limit checks.
- Implement RateLimitMetrics for OpenTelemetry metrics tracking.
- Create RateLimitMiddleware for enforcing rate limits on incoming requests.
- Develop RateLimitService to orchestrate instance and environment rate limit checks (see the illustrative sketch after the commit metadata).
- Add RateLimitServiceCollectionExtensions for dependency injection registration.
This commit is contained in: master
2025-12-17 18:02:37 +02:00
parent 394b57f6bf
commit 8bbfe4d2d2
211 changed files with 47179 additions and 1590 deletions
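The rate-limiting types named in the message above do not appear in the excerpts below, which cover the partitioning and score-replay files from the same commit. As a rough illustration only — the member names and the fixed-window strategy here are assumptions, not the committed API — a RateLimitConfig/RateLimitDecision/RateLimitService trio along these lines would fit the description:

using System;
using System.Collections.Concurrent;
// Hypothetical shapes; the real types are defined elsewhere in this commit and may differ.
public sealed record RateLimitConfig(int PermitLimit, TimeSpan Window);
public sealed record RateLimitDecision(bool IsAllowed, int Remaining, TimeSpan? RetryAfter);
public sealed class RateLimitService
{
    private readonly RateLimitConfig _config;
    private readonly ConcurrentDictionary<string, (DateTimeOffset WindowStart, int Count)> _windows = new();
    public RateLimitService(RateLimitConfig config) => _config = config;
    // Fixed-window check for a single key; the committed service layers an instance check over an environment check.
    public RateLimitDecision Check(string key)
    {
        var now = DateTimeOffset.UtcNow;
        var entry = _windows.AddOrUpdate(
            key,
            _ => (now, 1),
            (_, e) => now - e.WindowStart >= _config.Window ? (now, 1) : (e.WindowStart, e.Count + 1));
        var allowed = entry.Count <= _config.PermitLimit;
        return new RateLimitDecision(
            allowed,
            Math.Max(0, _config.PermitLimit - entry.Count),
            allowed ? null : entry.WindowStart + _config.Window - now);
    }
}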

PartitionExhaustionAlert.cs

@@ -0,0 +1,203 @@
// -----------------------------------------------------------------------------
// PartitionExhaustionAlert.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.4 - Add alerting for partition exhaustion
// Description: Prometheus/OpenTelemetry metrics and alerts for partition health
// -----------------------------------------------------------------------------
using System.Diagnostics;
using System.Diagnostics.Metrics;
using Npgsql;
namespace StellaOps.Scheduler.Worker.Execution;
/// <summary>
/// Monitors partition health and emits alerts when partitions are running low.
/// Per Sprint 3422 - Time-Based Partitioning.
/// </summary>
public sealed class PartitionHealthMonitor
{
private static readonly Meter Meter = new("StellaOps.Partitions", "1.0.0");
private static readonly ActivitySource ActivitySource = new("StellaOps.Partitions");
// Gauges for partition metrics
private static readonly ObservableGauge<int> FuturePartitions = Meter.CreateObservableGauge<int>(
"stellaops.partitions.future_count",
() => _lastFuturePartitionCounts.Select(kv =>
new Measurement<int>(kv.Value, new KeyValuePair<string, object?>("table", kv.Key))),
description: "Number of future partitions available per table");
private static readonly ObservableGauge<int> DaysUntilExhaustion = Meter.CreateObservableGauge<int>(
"stellaops.partitions.days_until_exhaustion",
() => _lastDaysUntilExhaustion.Select(kv =>
new Measurement<int>(kv.Value, new KeyValuePair<string, object?>("table", kv.Key))),
description: "Days until partition exhaustion per table");
// Counters for alerts
private static readonly Counter<int> AlertsFired = Meter.CreateCounter<int>(
"stellaops.partitions.alerts_fired",
description: "Number of partition exhaustion alerts fired");
// State for observable gauges
private static Dictionary<string, int> _lastFuturePartitionCounts = new();
private static Dictionary<string, int> _lastDaysUntilExhaustion = new();
/// <summary>
/// Check partition health and fire alerts if needed.
/// </summary>
/// <param name="connection">PostgreSQL connection.</param>
/// <param name="alertThreshold">Days threshold for warning alert.</param>
/// <param name="criticalThreshold">Days threshold for critical alert.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>List of partition health status for each table.</returns>
public async Task<List<PartitionHealthStatus>> CheckHealthAsync(
NpgsqlConnection connection,
int alertThreshold = 30,
int criticalThreshold = 7,
CancellationToken cancellationToken = default)
{
using var activity = ActivitySource.StartActivity("partitions.health_check", ActivityKind.Internal);
var results = new List<PartitionHealthStatus>();
var futureCounts = new Dictionary<string, int>();
var daysUntil = new Dictionary<string, int>();
// Query partition health from partition_mgmt schema
await using var cmd = connection.CreateCommand();
cmd.CommandText = """
SELECT
mt.schema_name,
mt.table_name,
COUNT(*) FILTER (WHERE p.partition_start > NOW()) as future_partitions,
MAX(p.partition_start) as last_partition_start
FROM partition_mgmt.managed_tables mt
LEFT JOIN partition_mgmt.partition_stats p
ON mt.schema_name = p.schema_name
AND mt.table_name = p.table_name
GROUP BY mt.schema_name, mt.table_name, mt.months_ahead
ORDER BY mt.schema_name, mt.table_name
""";
try
{
await using var reader = await cmd.ExecuteReaderAsync(cancellationToken);
while (await reader.ReadAsync(cancellationToken))
{
var schema = reader.GetString(0);
var table = reader.GetString(1);
var futureCount = reader.IsDBNull(2) ? 0 : reader.GetInt32(2);
var lastPartitionStart = reader.IsDBNull(3) ? (DateTimeOffset?)null : reader.GetDateTime(3);
var tableKey = $"{schema}.{table}";
var daysUntilExhaustion = lastPartitionStart.HasValue
? Math.Max(0, (int)(lastPartitionStart.Value - DateTimeOffset.UtcNow).TotalDays)
: 0;
futureCounts[tableKey] = futureCount;
daysUntil[tableKey] = daysUntilExhaustion;
var severity = daysUntilExhaustion <= criticalThreshold ? AlertSeverity.Critical
: daysUntilExhaustion <= alertThreshold ? AlertSeverity.Warning
: AlertSeverity.None;
var status = new PartitionHealthStatus(
SchemaName: schema,
TableName: table,
FuturePartitions: futureCount,
DaysUntilExhaustion: daysUntilExhaustion,
LastPartitionStart: lastPartitionStart,
Severity: severity,
AlertMessage: severity != AlertSeverity.None
? $"Partition exhaustion {severity.ToString().ToLowerInvariant()}: {tableKey} has {daysUntilExhaustion} days until exhaustion"
: null);
results.Add(status);
if (severity != AlertSeverity.None)
{
AlertsFired.Add(1, new TagList
{
{ "table", tableKey },
{ "severity", severity.ToString().ToLowerInvariant() }
});
activity?.AddEvent(new ActivityEvent(
"partition.exhaustion.alert",
tags: new ActivityTagsCollection
{
{ "table", tableKey },
{ "severity", severity.ToString() },
{ "days_until_exhaustion", daysUntilExhaustion }
}));
}
}
}
catch (PostgresException ex) when (ex.SqlState == "42P01") // undefined_table
{
// partition_mgmt schema doesn't exist yet
activity?.SetStatus(ActivityStatusCode.Error, "partition_mgmt schema not found");
}
// Update observable gauge state
_lastFuturePartitionCounts = futureCounts;
_lastDaysUntilExhaustion = daysUntil;
return results;
}
/// <summary>
/// Get alert summary for integration with notification systems.
/// </summary>
public static PartitionAlertSummary GetAlertSummary(IEnumerable<PartitionHealthStatus> statuses)
{
var criticalTables = statuses.Where(s => s.Severity == AlertSeverity.Critical).ToList();
var warningTables = statuses.Where(s => s.Severity == AlertSeverity.Warning).ToList();
return new PartitionAlertSummary(
CriticalCount: criticalTables.Count,
WarningCount: warningTables.Count,
CriticalTables: criticalTables.Select(s => $"{s.SchemaName}.{s.TableName}").ToList(),
WarningTables: warningTables.Select(s => $"{s.SchemaName}.{s.TableName}").ToList(),
OverallSeverity: criticalTables.Count > 0 ? AlertSeverity.Critical
: warningTables.Count > 0 ? AlertSeverity.Warning
: AlertSeverity.None);
}
}
/// <summary>
/// Health status for a single partitioned table.
/// </summary>
public sealed record PartitionHealthStatus(
string SchemaName,
string TableName,
int FuturePartitions,
int DaysUntilExhaustion,
DateTimeOffset? LastPartitionStart,
AlertSeverity Severity,
string? AlertMessage);
/// <summary>
/// Summary of partition alerts.
/// </summary>
public sealed record PartitionAlertSummary(
int CriticalCount,
int WarningCount,
IReadOnlyList<string> CriticalTables,
IReadOnlyList<string> WarningTables,
AlertSeverity OverallSeverity);
/// <summary>
/// Alert severity levels.
/// </summary>
public enum AlertSeverity
{
/// <summary>No alert needed.</summary>
None,
/// <summary>Warning: action needed soon.</summary>
Warning,
/// <summary>Critical: immediate action required.</summary>
Critical
}
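A minimal usage sketch for the monitor above — the connection string, the explicit thresholds, and the console hand-off are assumptions, not part of this file:

using Npgsql;
using StellaOps.Scheduler.Worker.Execution;
// Connection string is a placeholder; CheckHealthAsync degrades gracefully if partition_mgmt is absent.
await using var conn = new NpgsqlConnection("Host=localhost;Database=stellaops");
await conn.OpenAsync();
var monitor = new PartitionHealthMonitor();
var statuses = await monitor.CheckHealthAsync(conn, alertThreshold: 30, criticalThreshold: 7);
var summary = PartitionHealthMonitor.GetAlertSummary(statuses);
if (summary.OverallSeverity != AlertSeverity.None)
{
    // Hand off to the real notification channel here; console output is just for the sketch.
    Console.WriteLine($"{summary.CriticalCount} critical / {summary.WarningCount} warning partitioned tables");
}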

PartitionMaintenanceWorker.cs

@@ -0,0 +1,250 @@
// -----------------------------------------------------------------------------
// PartitionMaintenanceWorker.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.1 - Create partition maintenance job
// Task: 6.2 - Create retention enforcement job
// Description: Background worker for partition creation and retention enforcement
// -----------------------------------------------------------------------------
using System.Data;
using System.Diagnostics;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Npgsql;
using StellaOps.Scheduler.Storage.Postgres;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Execution;
/// <summary>
/// Background worker that manages partition lifecycle:
/// - Creates future partitions to avoid insert failures
/// - Drops old partitions to enforce retention policy
/// Per advisory guidelines, runs hourly by default.
/// </summary>
public sealed class PartitionMaintenanceWorker : BackgroundService
{
private readonly SchedulerDataSource _dataSource;
private readonly IOptions<PartitionMaintenanceOptions> _options;
private readonly ILogger<PartitionMaintenanceWorker> _logger;
private readonly ActivitySource _activitySource = new("StellaOps.Scheduler.PartitionMaintenance");
public PartitionMaintenanceWorker(
SchedulerDataSource dataSource,
IOptions<PartitionMaintenanceOptions> options,
ILogger<PartitionMaintenanceWorker> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Partition maintenance worker started");
// Initial delay to let the system stabilize
await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken);
while (!stoppingToken.IsCancellationRequested)
{
var opts = _options.Value;
if (!opts.Enabled)
{
_logger.LogDebug("Partition maintenance is disabled");
await Task.Delay(opts.Interval, stoppingToken);
continue;
}
using var activity = _activitySource.StartActivity("partition.maintenance", ActivityKind.Internal);
try
{
await RunMaintenanceCycleAsync(opts, stoppingToken);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Partition maintenance cycle failed");
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
PartitionMaintenanceMetrics.RecordError("cycle_failed");
}
await Task.Delay(opts.Interval, stoppingToken);
}
_logger.LogInformation("Partition maintenance worker stopped");
}
private async Task RunMaintenanceCycleAsync(PartitionMaintenanceOptions opts, CancellationToken ct)
{
var sw = Stopwatch.StartNew();
var createdCount = 0;
var droppedCount = 0;
_logger.LogInformation("Starting partition maintenance cycle");
await using var conn = await _dataSource.GetConnectionAsync(ct);
await conn.OpenAsync(ct);
foreach (var (schemaTable, _) in opts.ManagedTables)
{
var parts = schemaTable.Split('.', 2);
if (parts.Length != 2)
{
_logger.LogWarning("Invalid managed table format: {Table}", schemaTable);
continue;
}
var schema = parts[0];
var table = parts[1];
try
{
// Step 1: Ensure future partitions exist
var created = await EnsureFuturePartitionsAsync(conn, schema, table, opts.MonthsAhead, ct);
createdCount += created;
// Step 2: Enforce retention policy
var retentionMonths = opts.GetRetentionMonths(schemaTable);
var dropped = await EnforceRetentionAsync(conn, schema, table, retentionMonths, ct);
droppedCount += dropped;
}
catch (PostgresException ex) when (ex.SqlState == "42P01") // undefined_table
{
_logger.LogDebug("Table {Schema}.{Table} does not exist (not partitioned yet), skipping", schema, table);
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to maintain partitions for {Schema}.{Table}", schema, table);
PartitionMaintenanceMetrics.RecordError($"{schema}.{table}");
}
}
sw.Stop();
_logger.LogInformation(
"Partition maintenance cycle completed in {ElapsedMs}ms: {Created} partitions created, {Dropped} partitions dropped",
sw.ElapsedMilliseconds, createdCount, droppedCount);
PartitionMaintenanceMetrics.RecordCycle(sw.Elapsed.TotalMilliseconds, createdCount, droppedCount);
}
private async Task<int> EnsureFuturePartitionsAsync(
NpgsqlConnection conn,
string schema,
string table,
int monthsAhead,
CancellationToken ct)
{
// Use the partition management function if available; if partition_mgmt is not installed yet, this is a no-op and returns 0
await using var cmd = conn.CreateCommand();
cmd.CommandText = @"
SELECT partition_mgmt.ensure_future_partitions($1, $2, $3)
WHERE EXISTS (
SELECT 1 FROM pg_proc p
JOIN pg_namespace n ON p.pronamespace = n.oid
WHERE n.nspname = 'partition_mgmt' AND p.proname = 'ensure_future_partitions'
)";
cmd.Parameters.AddWithValue(schema);
cmd.Parameters.AddWithValue(table);
cmd.Parameters.AddWithValue(monthsAhead);
var result = await cmd.ExecuteScalarAsync(ct);
var created = result is int count ? count : 0;
if (created > 0)
{
_logger.LogInformation("Created {Count} future partitions for {Schema}.{Table}", created, schema, table);
PartitionMaintenanceMetrics.RecordPartitionsCreated(schema, table, created);
}
return created;
}
private async Task<int> EnforceRetentionAsync(
NpgsqlConnection conn,
string schema,
string table,
int retentionMonths,
CancellationToken ct)
{
// Use the partition management function if available
await using var cmd = conn.CreateCommand();
cmd.CommandText = @"
SELECT partition_mgmt.enforce_retention($1, $2, $3)
WHERE EXISTS (
SELECT 1 FROM pg_proc p
JOIN pg_namespace n ON p.pronamespace = n.oid
WHERE n.nspname = 'partition_mgmt' AND p.proname = 'enforce_retention'
)";
cmd.Parameters.AddWithValue(schema);
cmd.Parameters.AddWithValue(table);
cmd.Parameters.AddWithValue(retentionMonths);
var result = await cmd.ExecuteScalarAsync(ct);
var dropped = result is int count ? count : 0;
if (dropped > 0)
{
_logger.LogInformation("Dropped {Count} old partitions for {Schema}.{Table} (retention: {Months} months)",
dropped, schema, table, retentionMonths);
PartitionMaintenanceMetrics.RecordPartitionsDropped(schema, table, dropped);
}
return dropped;
}
}
/// <summary>
/// Metrics for partition maintenance operations.
/// </summary>
public static class PartitionMaintenanceMetrics
{
private static readonly System.Diagnostics.Metrics.Meter Meter =
new("StellaOps.Scheduler.PartitionMaintenance", "1.0.0");
private static readonly System.Diagnostics.Metrics.Counter<int> PartitionsCreated =
Meter.CreateCounter<int>("stellaops.partitions.created", description: "Number of partitions created");
private static readonly System.Diagnostics.Metrics.Counter<int> PartitionsDropped =
Meter.CreateCounter<int>("stellaops.partitions.dropped", description: "Number of partitions dropped");
private static readonly System.Diagnostics.Metrics.Counter<int> Errors =
Meter.CreateCounter<int>("stellaops.partitions.errors", description: "Number of partition maintenance errors");
private static readonly System.Diagnostics.Metrics.Histogram<double> CycleDuration =
Meter.CreateHistogram<double>("stellaops.partitions.cycle_duration_ms", description: "Duration of maintenance cycle in ms");
public static void RecordPartitionsCreated(string schema, string table, int count)
{
PartitionsCreated.Add(count, new System.Diagnostics.TagList
{
{ "schema", schema },
{ "table", table }
});
}
public static void RecordPartitionsDropped(string schema, string table, int count)
{
PartitionsDropped.Add(count, new System.Diagnostics.TagList
{
{ "schema", schema },
{ "table", table }
});
}
public static void RecordError(string context)
{
Errors.Add(1, new System.Diagnostics.TagList { { "context", context } });
}
public static void RecordCycle(double durationMs, int created, int dropped)
{
CycleDuration.Record(durationMs);
}
}
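A hedged registration sketch for the worker and its options — the generic-host shape and the "PartitionMaintenance" configuration section name are assumptions; SchedulerDataSource is expected to be registered by the existing Postgres storage wiring:

using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using StellaOps.Scheduler.Worker.Execution;
using StellaOps.Scheduler.Worker.Options;
var builder = Host.CreateApplicationBuilder(args);
// Bind PartitionMaintenanceOptions from configuration; the section name is an assumption.
builder.Services.Configure<PartitionMaintenanceOptions>(builder.Configuration.GetSection("PartitionMaintenance"));
// SchedulerDataSource registration is assumed to happen in the existing storage setup.
builder.Services.AddHostedService<PartitionMaintenanceWorker>();
await builder.Build().RunAsync();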

PartitionMaintenanceOptions.cs

@@ -0,0 +1,78 @@
// -----------------------------------------------------------------------------
// PartitionMaintenanceOptions.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.1 - Create partition maintenance job
// Description: Configuration options for partition maintenance worker
// -----------------------------------------------------------------------------
namespace StellaOps.Scheduler.Worker.Options;
/// <summary>
/// Configuration options for partition maintenance.
/// </summary>
public sealed class PartitionMaintenanceOptions
{
/// <summary>
/// Whether partition maintenance is enabled. Default: true.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
/// Interval between maintenance runs. Default: 1 hour.
/// </summary>
public TimeSpan Interval { get; set; } = TimeSpan.FromHours(1);
/// <summary>
/// Number of months ahead to create partitions. Default: 3.
/// </summary>
public int MonthsAhead { get; set; } = 3;
/// <summary>
/// Retention period in months for scheduler tables. Default: 24 months.
/// </summary>
public int SchedulerRetentionMonths { get; set; } = 24;
/// <summary>
/// Retention period in months for vuln tables. Default: 36 months.
/// </summary>
public int VulnRetentionMonths { get; set; } = 36;
/// <summary>
/// Retention period in months for vex tables. Default: 36 months.
/// </summary>
public int VexRetentionMonths { get; set; } = 36;
/// <summary>
/// Retention period in months for notify tables. Default: 12 months.
/// </summary>
public int NotifyRetentionMonths { get; set; } = 12;
/// <summary>
/// Tables to manage with their schema. Key = schema.table, Value = retention months (0 = use default).
/// </summary>
public Dictionary<string, int> ManagedTables { get; set; } = new()
{
["scheduler.audit"] = 0, // Uses SchedulerRetentionMonths
["scheduler.runs"] = 0,
["scheduler.execution_logs"] = 0,
["vuln.merge_events"] = 0, // Uses VulnRetentionMonths
["vex.timeline_events"] = 0, // Uses VexRetentionMonths
["notify.deliveries"] = 0 // Uses NotifyRetentionMonths
};
/// <summary>
/// Get retention months for a specific table.
/// </summary>
public int GetRetentionMonths(string schemaTable)
{
if (ManagedTables.TryGetValue(schemaTable, out var months) && months > 0)
return months;
// Use schema-based defaults
return schemaTable.StartsWith("scheduler.") ? SchedulerRetentionMonths :
schemaTable.StartsWith("vuln.") ? VulnRetentionMonths :
schemaTable.StartsWith("vex.") ? VexRetentionMonths :
schemaTable.StartsWith("notify.") ? NotifyRetentionMonths :
24; // Default fallback
}
}
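To make the retention fallback above concrete, a small illustrative check (the printed values are just the defaults declared in this class; the 6-month override and the billing.events key are hypothetical):

using System;
using StellaOps.Scheduler.Worker.Options;
var opts = new PartitionMaintenanceOptions();
// An explicit per-table value wins as soon as it is greater than zero.
opts.ManagedTables["notify.deliveries"] = 6;
Console.WriteLine(opts.GetRetentionMonths("notify.deliveries"));   // 6
// Entries left at 0 fall back to the schema-prefix defaults.
Console.WriteLine(opts.GetRetentionMonths("scheduler.runs"));      // 24 (SchedulerRetentionMonths)
Console.WriteLine(opts.GetRetentionMonths("vex.timeline_events")); // 36 (VexRetentionMonths)
// Unknown schemas land on the 24-month fallback.
Console.WriteLine(opts.GetRetentionMonths("billing.events"));      // 24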

ScoreReplaySchedulerJob.cs

@@ -0,0 +1,317 @@
// =============================================================================
// ScoreReplaySchedulerJob.cs
// Sprint: SPRINT_3401_0002_0001
// Task: SCORE-REPLAY-011 - Add scheduled job to rescore when feed snapshots change
// =============================================================================
using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Planning;
/// <summary>
/// Configuration options for score replay scheduling.
/// </summary>
public sealed class ScoreReplaySchedulerOptions
{
/// <summary>
/// Whether automatic score replay is enabled.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
/// Maximum age in days for scans to be considered for replay.
/// </summary>
public int MaxAgeDays { get; set; } = 30;
/// <summary>
/// Whether to send notifications when scores change significantly.
/// </summary>
public bool NotifyOnDelta { get; set; } = true;
/// <summary>
/// Minimum score delta to trigger notification.
/// </summary>
public double DeltaThreshold { get; set; } = 0.5;
/// <summary>
/// Maximum number of scans to replay per run.
/// </summary>
public int MaxScansPerRun { get; set; } = 100;
/// <summary>
/// Parallelism for replay operations.
/// </summary>
public int Parallelism { get; set; } = 4;
}
/// <summary>
/// Result of a score replay operation.
/// </summary>
public sealed record ScoreReplayResult(
string ScanId,
string ReplayId,
bool Success,
double OriginalScore,
double ReplayedScore,
int FindingsAdded,
int FindingsRemoved,
int FindingsRescored,
TimeSpan Duration,
string? ErrorMessage = null);
/// <summary>
/// Summary of a score replay batch run.
/// </summary>
public sealed record ScoreReplayBatchSummary(
DateTimeOffset StartedAt,
DateTimeOffset CompletedAt,
string TriggerType,
string? FeedSnapshotHash,
int TotalScans,
int SuccessCount,
int FailureCount,
int SignificantDeltas,
IReadOnlyList<ScoreReplayResult> Results);
/// <summary>
/// Interface for the score replay scheduler.
/// </summary>
public interface IScoreReplayScheduler
{
/// <summary>
/// Triggers a score replay for all eligible scans.
/// </summary>
Task<ScoreReplayBatchSummary> ReplayAllAsync(
string triggerType,
string? feedSnapshotHash = null,
CancellationToken ct = default);
/// <summary>
/// Triggers a score replay for a specific scan.
/// </summary>
Task<ScoreReplayResult> ReplayScanAsync(
string scanId,
string triggerType,
string? feedSnapshotHash = null,
CancellationToken ct = default);
}
/// <summary>
/// Interface for the scanner replay client.
/// </summary>
public interface IScannerReplayClient
{
/// <summary>
/// Gets scans eligible for replay (within max age, has manifest).
/// </summary>
Task<IReadOnlyList<string>> GetEligibleScansAsync(
int maxAgeDays,
int limit,
CancellationToken ct = default);
/// <summary>
/// Triggers a score replay for a scan.
/// </summary>
Task<ScoreReplayResult> ReplayAsync(
string scanId,
string? feedSnapshotHash,
CancellationToken ct = default);
/// <summary>
/// Gets the current feed snapshot hash.
/// </summary>
Task<string> GetCurrentFeedSnapshotHashAsync(CancellationToken ct = default);
}
/// <summary>
/// Scheduled job that triggers score replays when feed snapshots change.
/// Per Sprint 3401.0002.0001 - Score Replay & Proof Bundle.
/// </summary>
public sealed class ScoreReplaySchedulerJob : IScoreReplayScheduler
{
private readonly IScannerReplayClient _scannerClient;
private readonly ScoreReplaySchedulerOptions _options;
private readonly ILogger<ScoreReplaySchedulerJob> _logger;
private string? _lastFeedSnapshotHash;
public ScoreReplaySchedulerJob(
IScannerReplayClient scannerClient,
IOptions<ScoreReplaySchedulerOptions> options,
ILogger<ScoreReplaySchedulerJob> logger)
{
_scannerClient = scannerClient ?? throw new ArgumentNullException(nameof(scannerClient));
_options = options?.Value ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <summary>
/// Checks if a new feed snapshot is available and triggers replay if needed.
/// Called periodically by the scheduler.
/// </summary>
public async Task<bool> CheckAndReplayAsync(CancellationToken ct = default)
{
if (!_options.Enabled)
{
_logger.LogDebug("Score replay scheduler is disabled");
return false;
}
try
{
var currentHash = await _scannerClient.GetCurrentFeedSnapshotHashAsync(ct);
if (_lastFeedSnapshotHash is not null && _lastFeedSnapshotHash != currentHash)
{
_logger.LogInformation(
"Feed snapshot changed from {Old} to {New}, triggering replay",
_lastFeedSnapshotHash[..16],
currentHash[..16]);
await ReplayAllAsync("feed_update", currentHash, ct);
_lastFeedSnapshotHash = currentHash;
return true;
}
_lastFeedSnapshotHash = currentHash;
return false;
}
catch (Exception ex)
{
_logger.LogError(ex, "Error checking for feed snapshot changes");
return false;
}
}
/// <inheritdoc/>
public async Task<ScoreReplayBatchSummary> ReplayAllAsync(
string triggerType,
string? feedSnapshotHash = null,
CancellationToken ct = default)
{
var startedAt = DateTimeOffset.UtcNow;
var results = new List<ScoreReplayResult>();
var successCount = 0;
var failureCount = 0;
var significantDeltas = 0;
_logger.LogInformation(
"Starting score replay batch. Trigger={Trigger}, MaxAge={Days}d, MaxScans={Max}",
triggerType,
_options.MaxAgeDays,
_options.MaxScansPerRun);
try
{
var eligibleScans = await _scannerClient.GetEligibleScansAsync(
_options.MaxAgeDays,
_options.MaxScansPerRun,
ct);
_logger.LogInformation("Found {Count} eligible scans for replay", eligibleScans.Count);
// Process with bounded parallelism (limited by the Parallelism option)
var semaphore = new SemaphoreSlim(_options.Parallelism);
var tasks = eligibleScans.Select(async scanId =>
{
await semaphore.WaitAsync(ct);
try
{
return await ReplayScanAsync(scanId, triggerType, feedSnapshotHash, ct);
}
finally
{
semaphore.Release();
}
});
var batchResults = await Task.WhenAll(tasks);
results.AddRange(batchResults);
foreach (var result in batchResults)
{
if (result.Success)
{
successCount++;
var delta = Math.Abs(result.ReplayedScore - result.OriginalScore);
if (delta >= _options.DeltaThreshold)
{
significantDeltas++;
}
}
else
{
failureCount++;
}
}
}
catch (Exception ex)
{
_logger.LogError(ex, "Error during batch score replay");
}
var completedAt = DateTimeOffset.UtcNow;
_logger.LogInformation(
"Score replay batch completed. Success={Success}, Failed={Failed}, SignificantDeltas={Deltas}, Duration={Duration}ms",
successCount,
failureCount,
significantDeltas,
(completedAt - startedAt).TotalMilliseconds);
return new ScoreReplayBatchSummary(
StartedAt: startedAt,
CompletedAt: completedAt,
TriggerType: triggerType,
FeedSnapshotHash: feedSnapshotHash,
TotalScans: results.Count,
SuccessCount: successCount,
FailureCount: failureCount,
SignificantDeltas: significantDeltas,
Results: results);
}
/// <inheritdoc/>
public async Task<ScoreReplayResult> ReplayScanAsync(
string scanId,
string triggerType,
string? feedSnapshotHash = null,
CancellationToken ct = default)
{
var sw = Stopwatch.StartNew();
try
{
_logger.LogDebug("Replaying scan {ScanId}", scanId);
var result = await _scannerClient.ReplayAsync(scanId, feedSnapshotHash, ct);
sw.Stop();
_logger.LogDebug(
"Scan {ScanId} replayed. Delta={Delta:F2}, Duration={Duration}ms",
scanId,
result.ReplayedScore - result.OriginalScore,
sw.ElapsedMilliseconds);
return result;
}
catch (Exception ex)
{
sw.Stop();
_logger.LogWarning(ex, "Failed to replay scan {ScanId}", scanId);
return new ScoreReplayResult(
ScanId: scanId,
ReplayId: string.Empty,
Success: false,
OriginalScore: 0,
ReplayedScore: 0,
FindingsAdded: 0,
FindingsRemoved: 0,
FindingsRescored: 0,
Duration: sw.Elapsed,
ErrorMessage: ex.Message);
}
}
}
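The file defines the job and its client contracts but not how the periodic check is driven. A minimal hosting sketch, assuming the job and an IScannerReplayClient implementation are registered elsewhere and that a fixed 15-minute poll is acceptable:

using Microsoft.Extensions.Hosting;
using StellaOps.Scheduler.Worker.Planning;
// Hypothetical driver; the real trigger cadence and host wiring may differ.
public sealed class ScoreReplayPollingService : BackgroundService
{
    private readonly ScoreReplaySchedulerJob _job;
    public ScoreReplayPollingService(ScoreReplaySchedulerJob job) => _job = job;
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        try
        {
            using var timer = new PeriodicTimer(TimeSpan.FromMinutes(15)); // interval is an assumption
            while (await timer.WaitForNextTickAsync(stoppingToken))
            {
                // Replays fire only when the feed snapshot hash changed since the previous tick.
                await _job.CheckAndReplayAsync(stoppingToken);
            }
        }
        catch (OperationCanceledException)
        {
            // Normal shutdown.
        }
    }
}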