feat(rate-limiting): Implement core rate limiting functionality with configuration, decision-making, metrics, middleware, and service registration
- Add RateLimitConfig for configuration management with YAML binding support.
- Introduce RateLimitDecision to encapsulate the result of rate limit checks.
- Implement RateLimitMetrics for OpenTelemetry metrics tracking.
- Create RateLimitMiddleware for enforcing rate limits on incoming requests.
- Develop RateLimitService to orchestrate instance and environment rate limit checks.
- Add RateLimitServiceCollectionExtensions for dependency injection registration.
@@ -0,0 +1,203 @@
// -----------------------------------------------------------------------------
// PartitionExhaustionAlert.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.4 - Add alerting for partition exhaustion
// Description: Prometheus/OpenTelemetry metrics and alerts for partition health
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Diagnostics.Metrics;
using Npgsql;

namespace StellaOps.Scheduler.Worker.Execution;

/// <summary>
/// Monitors partition health and emits alerts when partitions are running low.
/// Per Sprint 3422 - Time-Based Partitioning.
/// </summary>
public sealed class PartitionHealthMonitor
{
    private static readonly Meter Meter = new("StellaOps.Partitions", "1.0.0");
    private static readonly ActivitySource ActivitySource = new("StellaOps.Partitions");

    // Gauges for partition metrics
    private static readonly ObservableGauge<int> FuturePartitions = Meter.CreateObservableGauge<int>(
        "stellaops.partitions.future_count",
        () => _lastFuturePartitionCounts.Select(kv =>
            new Measurement<int>(kv.Value, new KeyValuePair<string, object?>("table", kv.Key))),
        description: "Number of future partitions available per table");

    private static readonly ObservableGauge<int> DaysUntilExhaustion = Meter.CreateObservableGauge<int>(
        "stellaops.partitions.days_until_exhaustion",
        () => _lastDaysUntilExhaustion.Select(kv =>
            new Measurement<int>(kv.Value, new KeyValuePair<string, object?>("table", kv.Key))),
        description: "Days until partition exhaustion per table");

    // Counters for alerts
    private static readonly Counter<int> AlertsFired = Meter.CreateCounter<int>(
        "stellaops.partitions.alerts_fired",
        description: "Number of partition exhaustion alerts fired");

    // State for observable gauges (replaced atomically on each health check)
    private static Dictionary<string, int> _lastFuturePartitionCounts = new();
    private static Dictionary<string, int> _lastDaysUntilExhaustion = new();

    /// <summary>
    /// Checks partition health and fires alerts if needed.
    /// </summary>
    /// <param name="connection">Open PostgreSQL connection.</param>
    /// <param name="alertThreshold">Days threshold for warning alert.</param>
    /// <param name="criticalThreshold">Days threshold for critical alert.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>List of partition health status for each table.</returns>
    public async Task<List<PartitionHealthStatus>> CheckHealthAsync(
        NpgsqlConnection connection,
        int alertThreshold = 30,
        int criticalThreshold = 7,
        CancellationToken cancellationToken = default)
    {
        using var activity = ActivitySource.StartActivity("partitions.health_check", ActivityKind.Internal);

        var results = new List<PartitionHealthStatus>();
        var futureCounts = new Dictionary<string, int>();
        var daysUntil = new Dictionary<string, int>();

        // Query partition health from the partition_mgmt schema
        await using var cmd = connection.CreateCommand();
        cmd.CommandText = """
            SELECT
                mt.schema_name,
                mt.table_name,
                COUNT(*) FILTER (WHERE p.partition_start > NOW()) as future_partitions,
                MAX(p.partition_start) as last_partition_start
            FROM partition_mgmt.managed_tables mt
            LEFT JOIN partition_mgmt.partition_stats p
                ON mt.schema_name = p.schema_name
                AND mt.table_name = p.table_name
            GROUP BY mt.schema_name, mt.table_name, mt.months_ahead
            ORDER BY mt.schema_name, mt.table_name
            """;

        try
        {
            await using var reader = await cmd.ExecuteReaderAsync(cancellationToken);

            while (await reader.ReadAsync(cancellationToken))
            {
                var schema = reader.GetString(0);
                var table = reader.GetString(1);
                var futureCount = reader.IsDBNull(2) ? 0 : reader.GetInt32(2);
                var lastPartitionStart = reader.IsDBNull(3) ? (DateTimeOffset?)null : reader.GetDateTime(3);

                var tableKey = $"{schema}.{table}";
                var daysUntilExhaustion = lastPartitionStart.HasValue
                    ? Math.Max(0, (int)(lastPartitionStart.Value - DateTimeOffset.UtcNow).TotalDays)
                    : 0;

                futureCounts[tableKey] = futureCount;
                daysUntil[tableKey] = daysUntilExhaustion;

                var severity = daysUntilExhaustion <= criticalThreshold ? AlertSeverity.Critical
                    : daysUntilExhaustion <= alertThreshold ? AlertSeverity.Warning
                    : AlertSeverity.None;

                var status = new PartitionHealthStatus(
                    SchemaName: schema,
                    TableName: table,
                    FuturePartitions: futureCount,
                    DaysUntilExhaustion: daysUntilExhaustion,
                    LastPartitionStart: lastPartitionStart,
                    Severity: severity,
                    AlertMessage: severity != AlertSeverity.None
                        ? $"Partition exhaustion {severity.ToString().ToLowerInvariant()}: {tableKey} has {daysUntilExhaustion} days until exhaustion"
                        : null);

                results.Add(status);

                if (severity != AlertSeverity.None)
                {
                    AlertsFired.Add(1, new TagList
                    {
                        { "table", tableKey },
                        { "severity", severity.ToString().ToLowerInvariant() }
                    });

                    activity?.AddEvent(new ActivityEvent(
                        "partition.exhaustion.alert",
                        tags: new ActivityTagsCollection
                        {
                            { "table", tableKey },
                            { "severity", severity.ToString() },
                            { "days_until_exhaustion", daysUntilExhaustion }
                        }));
                }
            }
        }
        catch (PostgresException ex) when (ex.SqlState == "42P01") // undefined_table
        {
            // The partition_mgmt schema doesn't exist yet
            activity?.SetStatus(ActivityStatusCode.Error, "partition_mgmt schema not found");
        }

        // Update observable gauge state
        _lastFuturePartitionCounts = futureCounts;
        _lastDaysUntilExhaustion = daysUntil;

        return results;
    }

    /// <summary>
    /// Gets an alert summary for integration with notification systems.
    /// </summary>
    public static PartitionAlertSummary GetAlertSummary(IEnumerable<PartitionHealthStatus> statuses)
    {
        // Materialize once so a deferred sequence is not enumerated twice
        var statusList = statuses.ToList();
        var criticalTables = statusList.Where(s => s.Severity == AlertSeverity.Critical).ToList();
        var warningTables = statusList.Where(s => s.Severity == AlertSeverity.Warning).ToList();

        return new PartitionAlertSummary(
            CriticalCount: criticalTables.Count,
            WarningCount: warningTables.Count,
            CriticalTables: criticalTables.Select(s => $"{s.SchemaName}.{s.TableName}").ToList(),
            WarningTables: warningTables.Select(s => $"{s.SchemaName}.{s.TableName}").ToList(),
            OverallSeverity: criticalTables.Count > 0 ? AlertSeverity.Critical
                : warningTables.Count > 0 ? AlertSeverity.Warning
                : AlertSeverity.None);
    }
}

/// <summary>
/// Health status for a single partitioned table.
/// </summary>
public sealed record PartitionHealthStatus(
    string SchemaName,
    string TableName,
    int FuturePartitions,
    int DaysUntilExhaustion,
    DateTimeOffset? LastPartitionStart,
    AlertSeverity Severity,
    string? AlertMessage);

/// <summary>
/// Summary of partition alerts.
/// </summary>
public sealed record PartitionAlertSummary(
    int CriticalCount,
    int WarningCount,
    IReadOnlyList<string> CriticalTables,
    IReadOnlyList<string> WarningTables,
    AlertSeverity OverallSeverity);

/// <summary>
/// Alert severity levels.
/// </summary>
public enum AlertSeverity
{
    /// <summary>No alert needed.</summary>
    None,

    /// <summary>Warning: action needed soon.</summary>
    Warning,

    /// <summary>Critical: immediate action required.</summary>
    Critical
}
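
A minimal caller sketch for the monitor above. The connection string and threshold values are illustrative assumptions, not part of this commit; only PartitionHealthMonitor, GetAlertSummary, and standard Npgsql calls are taken from the diff.

// Sketch: run a one-off health check and summarize alerts.
// The connection string and thresholds below are illustrative.
var monitor = new PartitionHealthMonitor();

await using var connection = new NpgsqlConnection("Host=localhost;Database=stellaops");
await connection.OpenAsync();

var statuses = await monitor.CheckHealthAsync(connection, alertThreshold: 30, criticalThreshold: 7);
var summary = PartitionHealthMonitor.GetAlertSummary(statuses);

if (summary.OverallSeverity == AlertSeverity.Critical)
{
    Console.WriteLine($"CRITICAL: {string.Join(", ", summary.CriticalTables)}");
}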
@@ -0,0 +1,250 @@
// -----------------------------------------------------------------------------
// PartitionMaintenanceWorker.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.1 - Create partition maintenance job
// Task: 6.2 - Create retention enforcement job
// Description: Background worker for partition creation and retention enforcement
// -----------------------------------------------------------------------------

using System.Data;
using System.Diagnostics;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Npgsql;
using StellaOps.Scheduler.Storage.Postgres;
using StellaOps.Scheduler.Worker.Options;

namespace StellaOps.Scheduler.Worker.Execution;

/// <summary>
/// Background worker that manages partition lifecycle:
/// - Creates future partitions to avoid insert failures
/// - Drops old partitions to enforce retention policy
/// Per advisory guidelines, runs hourly by default.
/// </summary>
public sealed class PartitionMaintenanceWorker : BackgroundService
{
    private readonly SchedulerDataSource _dataSource;
    private readonly IOptions<PartitionMaintenanceOptions> _options;
    private readonly ILogger<PartitionMaintenanceWorker> _logger;
    private readonly ActivitySource _activitySource = new("StellaOps.Scheduler.PartitionMaintenance");

    public PartitionMaintenanceWorker(
        SchedulerDataSource dataSource,
        IOptions<PartitionMaintenanceOptions> options,
        ILogger<PartitionMaintenanceWorker> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Partition maintenance worker started");

        // Initial delay to let the system stabilize
        await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            var opts = _options.Value;

            if (!opts.Enabled)
            {
                _logger.LogDebug("Partition maintenance is disabled");
                await Task.Delay(opts.Interval, stoppingToken);
                continue;
            }

            using var activity = _activitySource.StartActivity("partition.maintenance", ActivityKind.Internal);

            try
            {
                await RunMaintenanceCycleAsync(opts, stoppingToken);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Partition maintenance cycle failed");
                activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
                PartitionMaintenanceMetrics.RecordError("cycle_failed");
            }

            await Task.Delay(opts.Interval, stoppingToken);
        }

        _logger.LogInformation("Partition maintenance worker stopped");
    }

    private async Task RunMaintenanceCycleAsync(PartitionMaintenanceOptions opts, CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        var createdCount = 0;
        var droppedCount = 0;

        _logger.LogInformation("Starting partition maintenance cycle");

        await using var conn = await _dataSource.GetConnectionAsync(ct);
        await conn.OpenAsync(ct);

        foreach (var (schemaTable, _) in opts.ManagedTables)
        {
            var parts = schemaTable.Split('.', 2);
            if (parts.Length != 2)
            {
                _logger.LogWarning("Invalid managed table format: {Table}", schemaTable);
                continue;
            }

            var schema = parts[0];
            var table = parts[1];

            try
            {
                // Step 1: Ensure future partitions exist
                var created = await EnsureFuturePartitionsAsync(conn, schema, table, opts.MonthsAhead, ct);
                createdCount += created;

                // Step 2: Enforce retention policy
                var retentionMonths = opts.GetRetentionMonths(schemaTable);
                var dropped = await EnforceRetentionAsync(conn, schema, table, retentionMonths, ct);
                droppedCount += dropped;
            }
            catch (PostgresException ex) when (ex.SqlState == "42P01") // undefined_table
            {
                _logger.LogDebug("Table {Schema}.{Table} does not exist (not partitioned yet), skipping", schema, table);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to maintain partitions for {Schema}.{Table}", schema, table);
                PartitionMaintenanceMetrics.RecordError($"{schema}.{table}");
            }
        }

        sw.Stop();
        _logger.LogInformation(
            "Partition maintenance cycle completed in {ElapsedMs}ms: {Created} partitions created, {Dropped} partitions dropped",
            sw.ElapsedMilliseconds, createdCount, droppedCount);

        PartitionMaintenanceMetrics.RecordCycle(sw.Elapsed.TotalMilliseconds, createdCount, droppedCount);
    }

    private async Task<int> EnsureFuturePartitionsAsync(
        NpgsqlConnection conn,
        string schema,
        string table,
        int monthsAhead,
        CancellationToken ct)
    {
        // Use the partition management function if available, otherwise create partitions manually
        await using var cmd = conn.CreateCommand();
        cmd.CommandText = @"
            SELECT partition_mgmt.ensure_future_partitions($1, $2, $3)
            WHERE EXISTS (
                SELECT 1 FROM pg_proc p
                JOIN pg_namespace n ON p.pronamespace = n.oid
                WHERE n.nspname = 'partition_mgmt' AND p.proname = 'ensure_future_partitions'
            )";
        cmd.Parameters.AddWithValue(schema);
        cmd.Parameters.AddWithValue(table);
        cmd.Parameters.AddWithValue(monthsAhead);

        var result = await cmd.ExecuteScalarAsync(ct);
        var created = result is int count ? count : 0;

        if (created > 0)
        {
            _logger.LogInformation("Created {Count} future partitions for {Schema}.{Table}", created, schema, table);
            PartitionMaintenanceMetrics.RecordPartitionsCreated(schema, table, created);
        }

        return created;
    }

    private async Task<int> EnforceRetentionAsync(
        NpgsqlConnection conn,
        string schema,
        string table,
        int retentionMonths,
        CancellationToken ct)
    {
        // Use the partition management function if available
        await using var cmd = conn.CreateCommand();
        cmd.CommandText = @"
            SELECT partition_mgmt.enforce_retention($1, $2, $3)
            WHERE EXISTS (
                SELECT 1 FROM pg_proc p
                JOIN pg_namespace n ON p.pronamespace = n.oid
                WHERE n.nspname = 'partition_mgmt' AND p.proname = 'enforce_retention'
            )";
        cmd.Parameters.AddWithValue(schema);
        cmd.Parameters.AddWithValue(table);
        cmd.Parameters.AddWithValue(retentionMonths);

        var result = await cmd.ExecuteScalarAsync(ct);
        var dropped = result is int count ? count : 0;

        if (dropped > 0)
        {
            _logger.LogInformation("Dropped {Count} old partitions for {Schema}.{Table} (retention: {Months} months)",
                dropped, schema, table, retentionMonths);
            PartitionMaintenanceMetrics.RecordPartitionsDropped(schema, table, dropped);
        }

        return dropped;
    }
}

/// <summary>
/// Metrics for partition maintenance operations.
/// </summary>
public static class PartitionMaintenanceMetrics
{
    private static readonly System.Diagnostics.Metrics.Meter Meter =
        new("StellaOps.Scheduler.PartitionMaintenance", "1.0.0");

    private static readonly System.Diagnostics.Metrics.Counter<int> PartitionsCreated =
        Meter.CreateCounter<int>("stellaops.partitions.created", description: "Number of partitions created");

    private static readonly System.Diagnostics.Metrics.Counter<int> PartitionsDropped =
        Meter.CreateCounter<int>("stellaops.partitions.dropped", description: "Number of partitions dropped");

    private static readonly System.Diagnostics.Metrics.Counter<int> Errors =
        Meter.CreateCounter<int>("stellaops.partitions.errors", description: "Number of partition maintenance errors");

    private static readonly System.Diagnostics.Metrics.Histogram<double> CycleDuration =
        Meter.CreateHistogram<double>("stellaops.partitions.cycle_duration_ms", description: "Duration of maintenance cycle in ms");

    public static void RecordPartitionsCreated(string schema, string table, int count)
    {
        PartitionsCreated.Add(count, new System.Diagnostics.TagList
        {
            { "schema", schema },
            { "table", table }
        });
    }

    public static void RecordPartitionsDropped(string schema, string table, int count)
    {
        PartitionsDropped.Add(count, new System.Diagnostics.TagList
        {
            { "schema", schema },
            { "table", table }
        });
    }

    public static void RecordError(string context)
    {
        Errors.Add(1, new System.Diagnostics.TagList { { "context", context } });
    }

    public static void RecordCycle(double durationMs, int created, int dropped)
    {
        CycleDuration.Record(durationMs);
    }
}
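
A minimal host-registration sketch for the worker above. It assumes SchedulerDataSource is registered elsewhere in the composition root, and the "PartitionMaintenance" section name is an illustrative assumption; only the worker and options types come from this commit.

// Sketch: wire the worker and its options into a generic host.
// Assumes SchedulerDataSource is registered elsewhere; the section name is illustrative.
var builder = Host.CreateApplicationBuilder(args);

builder.Services.Configure<PartitionMaintenanceOptions>(
    builder.Configuration.GetSection("PartitionMaintenance"));
builder.Services.AddHostedService<PartitionMaintenanceWorker>();

await builder.Build().RunAsync();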
@@ -0,0 +1,78 @@
// -----------------------------------------------------------------------------
// PartitionMaintenanceOptions.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.1 - Create partition maintenance job
// Description: Configuration options for partition maintenance worker
// -----------------------------------------------------------------------------

namespace StellaOps.Scheduler.Worker.Options;

/// <summary>
/// Configuration options for partition maintenance.
/// </summary>
public sealed class PartitionMaintenanceOptions
{
    /// <summary>
    /// Whether partition maintenance is enabled. Default: true.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Interval between maintenance runs. Default: 1 hour.
    /// </summary>
    public TimeSpan Interval { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// Number of months ahead to create partitions. Default: 3.
    /// </summary>
    public int MonthsAhead { get; set; } = 3;

    /// <summary>
    /// Retention period in months for scheduler tables. Default: 24 months.
    /// </summary>
    public int SchedulerRetentionMonths { get; set; } = 24;

    /// <summary>
    /// Retention period in months for vuln tables. Default: 36 months.
    /// </summary>
    public int VulnRetentionMonths { get; set; } = 36;

    /// <summary>
    /// Retention period in months for vex tables. Default: 36 months.
    /// </summary>
    public int VexRetentionMonths { get; set; } = 36;

    /// <summary>
    /// Retention period in months for notify tables. Default: 12 months.
    /// </summary>
    public int NotifyRetentionMonths { get; set; } = 12;

    /// <summary>
    /// Tables to manage with their schema. Key = schema.table, Value = retention months (0 = use default).
    /// </summary>
    public Dictionary<string, int> ManagedTables { get; set; } = new()
    {
        ["scheduler.audit"] = 0,          // Uses SchedulerRetentionMonths
        ["scheduler.runs"] = 0,
        ["scheduler.execution_logs"] = 0,
        ["vuln.merge_events"] = 0,        // Uses VulnRetentionMonths
        ["vex.timeline_events"] = 0,      // Uses VexRetentionMonths
        ["notify.deliveries"] = 0         // Uses NotifyRetentionMonths
    };

    /// <summary>
    /// Gets the retention months for a specific table.
    /// </summary>
    public int GetRetentionMonths(string schemaTable)
    {
        // An explicit per-table override wins when set to a positive value
        if (ManagedTables.TryGetValue(schemaTable, out var months) && months > 0)
            return months;

        // Otherwise fall back to schema-based defaults
        return schemaTable.StartsWith("scheduler.") ? SchedulerRetentionMonths :
               schemaTable.StartsWith("vuln.") ? VulnRetentionMonths :
               schemaTable.StartsWith("vex.") ? VexRetentionMonths :
               schemaTable.StartsWith("notify.") ? NotifyRetentionMonths :
               24; // Default fallback
    }
}
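
A minimal sketch of overriding these options in code, assuming a host builder like the registration sketch earlier; the specific values and the per-table override for vuln.merge_events are illustrative. The same shape binds from a configuration section.

// Sketch: explicit option values, equivalent to binding a config section.
// All values here are illustrative assumptions.
builder.Services.Configure<PartitionMaintenanceOptions>(opts =>
{
    opts.Interval = TimeSpan.FromHours(6);        // run four times a day instead of hourly
    opts.MonthsAhead = 6;                         // keep six months of future partitions
    opts.ManagedTables["vuln.merge_events"] = 48; // per-table retention override (months)
});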
@@ -0,0 +1,317 @@
// =============================================================================
// ScoreReplaySchedulerJob.cs
// Sprint: SPRINT_3401_0002_0001
// Task: SCORE-REPLAY-011 - Add scheduled job to rescore when feed snapshots change
// =============================================================================

using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Worker.Options;

namespace StellaOps.Scheduler.Worker.Planning;

/// <summary>
/// Configuration options for score replay scheduling.
/// </summary>
public sealed class ScoreReplaySchedulerOptions
{
    /// <summary>
    /// Whether automatic score replay is enabled.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Maximum age in days for scans to be considered for replay.
    /// </summary>
    public int MaxAgeDays { get; set; } = 30;

    /// <summary>
    /// Whether to send notifications when scores change significantly.
    /// </summary>
    public bool NotifyOnDelta { get; set; } = true;

    /// <summary>
    /// Minimum score delta to trigger notification.
    /// </summary>
    public double DeltaThreshold { get; set; } = 0.5;

    /// <summary>
    /// Maximum number of scans to replay per run.
    /// </summary>
    public int MaxScansPerRun { get; set; } = 100;

    /// <summary>
    /// Parallelism for replay operations.
    /// </summary>
    public int Parallelism { get; set; } = 4;
}

/// <summary>
/// Result of a score replay operation.
/// </summary>
public sealed record ScoreReplayResult(
    string ScanId,
    string ReplayId,
    bool Success,
    double OriginalScore,
    double ReplayedScore,
    int FindingsAdded,
    int FindingsRemoved,
    int FindingsRescored,
    TimeSpan Duration,
    string? ErrorMessage = null);

/// <summary>
/// Summary of a score replay batch run.
/// </summary>
public sealed record ScoreReplayBatchSummary(
    DateTimeOffset StartedAt,
    DateTimeOffset CompletedAt,
    string TriggerType,
    string? FeedSnapshotHash,
    int TotalScans,
    int SuccessCount,
    int FailureCount,
    int SignificantDeltas,
    IReadOnlyList<ScoreReplayResult> Results);

/// <summary>
/// Interface for the score replay scheduler.
/// </summary>
public interface IScoreReplayScheduler
{
    /// <summary>
    /// Triggers a score replay for all eligible scans.
    /// </summary>
    Task<ScoreReplayBatchSummary> ReplayAllAsync(
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default);

    /// <summary>
    /// Triggers a score replay for a specific scan.
    /// </summary>
    Task<ScoreReplayResult> ReplayScanAsync(
        string scanId,
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default);
}

/// <summary>
/// Interface for the scanner replay client.
/// </summary>
public interface IScannerReplayClient
{
    /// <summary>
    /// Gets scans eligible for replay (within max age, has manifest).
    /// </summary>
    Task<IReadOnlyList<string>> GetEligibleScansAsync(
        int maxAgeDays,
        int limit,
        CancellationToken ct = default);

    /// <summary>
    /// Triggers a score replay for a scan.
    /// </summary>
    Task<ScoreReplayResult> ReplayAsync(
        string scanId,
        string? feedSnapshotHash,
        CancellationToken ct = default);

    /// <summary>
    /// Gets the current feed snapshot hash.
    /// </summary>
    Task<string> GetCurrentFeedSnapshotHashAsync(CancellationToken ct = default);
}

/// <summary>
/// Scheduled job that triggers score replays when feed snapshots change.
/// Per Sprint 3401.0002.0001 - Score Replay & Proof Bundle.
/// </summary>
public sealed class ScoreReplaySchedulerJob : IScoreReplayScheduler
{
    private readonly IScannerReplayClient _scannerClient;
    private readonly ScoreReplaySchedulerOptions _options;
    private readonly ILogger<ScoreReplaySchedulerJob> _logger;
    private string? _lastFeedSnapshotHash;

    public ScoreReplaySchedulerJob(
        IScannerReplayClient scannerClient,
        IOptions<ScoreReplaySchedulerOptions> options,
        ILogger<ScoreReplaySchedulerJob> logger)
    {
        _scannerClient = scannerClient ?? throw new ArgumentNullException(nameof(scannerClient));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Checks if a new feed snapshot is available and triggers replay if needed.
    /// Called periodically by the scheduler.
    /// </summary>
    public async Task<bool> CheckAndReplayAsync(CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            _logger.LogDebug("Score replay scheduler is disabled");
            return false;
        }

        try
        {
            var currentHash = await _scannerClient.GetCurrentFeedSnapshotHashAsync(ct);

            if (_lastFeedSnapshotHash is not null && _lastFeedSnapshotHash != currentHash)
            {
                _logger.LogInformation(
                    "Feed snapshot changed from {Old} to {New}, triggering replay",
                    _lastFeedSnapshotHash[..16],
                    currentHash[..16]);

                await ReplayAllAsync("feed_update", currentHash, ct);
                _lastFeedSnapshotHash = currentHash;
                return true;
            }

            _lastFeedSnapshotHash = currentHash;
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error checking for feed snapshot changes");
            return false;
        }
    }

    /// <inheritdoc/>
    public async Task<ScoreReplayBatchSummary> ReplayAllAsync(
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default)
    {
        var startedAt = DateTimeOffset.UtcNow;
        var results = new List<ScoreReplayResult>();
        var successCount = 0;
        var failureCount = 0;
        var significantDeltas = 0;

        _logger.LogInformation(
            "Starting score replay batch. Trigger={Trigger}, MaxAge={Days}d, MaxScans={Max}",
            triggerType,
            _options.MaxAgeDays,
            _options.MaxScansPerRun);

        try
        {
            var eligibleScans = await _scannerClient.GetEligibleScansAsync(
                _options.MaxAgeDays,
                _options.MaxScansPerRun,
                ct);

            _logger.LogInformation("Found {Count} eligible scans for replay", eligibleScans.Count);

            // Process in parallel batches
            var semaphore = new SemaphoreSlim(_options.Parallelism);
            var tasks = eligibleScans.Select(async scanId =>
            {
                await semaphore.WaitAsync(ct);
                try
                {
                    return await ReplayScanAsync(scanId, triggerType, feedSnapshotHash, ct);
                }
                finally
                {
                    semaphore.Release();
                }
            });

            var batchResults = await Task.WhenAll(tasks);
            results.AddRange(batchResults);

            foreach (var result in batchResults)
            {
                if (result.Success)
                {
                    successCount++;
                    var delta = Math.Abs(result.ReplayedScore - result.OriginalScore);
                    if (delta >= _options.DeltaThreshold)
                    {
                        significantDeltas++;
                    }
                }
                else
                {
                    failureCount++;
                }
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during batch score replay");
        }

        var completedAt = DateTimeOffset.UtcNow;

        _logger.LogInformation(
            "Score replay batch completed. Success={Success}, Failed={Failed}, SignificantDeltas={Deltas}, Duration={Duration}ms",
            successCount,
            failureCount,
            significantDeltas,
            (completedAt - startedAt).TotalMilliseconds);

        return new ScoreReplayBatchSummary(
            StartedAt: startedAt,
            CompletedAt: completedAt,
            TriggerType: triggerType,
            FeedSnapshotHash: feedSnapshotHash,
            TotalScans: results.Count,
            SuccessCount: successCount,
            FailureCount: failureCount,
            SignificantDeltas: significantDeltas,
            Results: results);
    }

    /// <inheritdoc/>
    public async Task<ScoreReplayResult> ReplayScanAsync(
        string scanId,
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default)
    {
        var sw = Stopwatch.StartNew();

        try
        {
            _logger.LogDebug("Replaying scan {ScanId}", scanId);
            var result = await _scannerClient.ReplayAsync(scanId, feedSnapshotHash, ct);
            sw.Stop();

            _logger.LogDebug(
                "Scan {ScanId} replayed. Delta={Delta:F2}, Duration={Duration}ms",
                scanId,
                result.ReplayedScore - result.OriginalScore,
                sw.ElapsedMilliseconds);

            return result;
        }
        catch (Exception ex)
        {
            sw.Stop();
            _logger.LogWarning(ex, "Failed to replay scan {ScanId}", scanId);

            return new ScoreReplayResult(
                ScanId: scanId,
                ReplayId: string.Empty,
                Success: false,
                OriginalScore: 0,
                ReplayedScore: 0,
                FindingsAdded: 0,
                FindingsRemoved: 0,
                FindingsRescored: 0,
                Duration: sw.Elapsed,
                ErrorMessage: ex.Message);
        }
    }
}
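
A minimal driver sketch for the polling loop that CheckAndReplayAsync expects. This commit does not include the periodic caller, so the hosted-service wrapper and the 15-minute cadence below are assumptions for illustration (requires Microsoft.Extensions.Hosting).

// Sketch: poll for feed snapshot changes on a fixed cadence.
// ScoreReplayPollingWorker and the 15-minute interval are illustrative assumptions.
public sealed class ScoreReplayPollingWorker : BackgroundService
{
    private readonly ScoreReplaySchedulerJob _job;

    public ScoreReplayPollingWorker(ScoreReplaySchedulerJob job) => _job = job;

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        using var timer = new PeriodicTimer(TimeSpan.FromMinutes(15));
        while (await timer.WaitForNextTickAsync(stoppingToken))
        {
            // Replays run only when GetCurrentFeedSnapshotHashAsync reports a new hash
            await _job.CheckAndReplayAsync(stoppingToken);
        }
    }
}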