feat(rate-limiting): Implement core rate limiting functionality with configuration, decision-making, metrics, middleware, and service registration
- Add RateLimitConfig for configuration management with YAML binding support.
- Introduce RateLimitDecision to encapsulate the result of rate limit checks.
- Implement RateLimitMetrics for OpenTelemetry metrics tracking.
- Create RateLimitMiddleware for enforcing rate limits on incoming requests.
- Develop RateLimitService to orchestrate instance and environment rate limit checks.
- Add RateLimitServiceCollectionExtensions for dependency injection registration.
@@ -0,0 +1,203 @@
// -----------------------------------------------------------------------------
// PartitionExhaustionAlert.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.4 - Add alerting for partition exhaustion
// Description: Prometheus/OpenTelemetry metrics and alerts for partition health
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Diagnostics.Metrics;
using Npgsql;

namespace StellaOps.Scheduler.Worker.Execution;

/// <summary>
/// Monitors partition health and emits alerts when partitions are running low.
/// Per Sprint 3422 - Time-Based Partitioning.
/// </summary>
public sealed class PartitionHealthMonitor
{
    private static readonly Meter Meter = new("StellaOps.Partitions", "1.0.0");
    private static readonly ActivitySource ActivitySource = new("StellaOps.Partitions");

    // Gauges for partition metrics
    private static readonly ObservableGauge<int> FuturePartitions = Meter.CreateObservableGauge<int>(
        "stellaops.partitions.future_count",
        () => _lastFuturePartitionCounts.Select(kv =>
            new Measurement<int>(kv.Value, new KeyValuePair<string, object?>("table", kv.Key))),
        description: "Number of future partitions available per table");

    private static readonly ObservableGauge<int> DaysUntilExhaustion = Meter.CreateObservableGauge<int>(
        "stellaops.partitions.days_until_exhaustion",
        () => _lastDaysUntilExhaustion.Select(kv =>
            new Measurement<int>(kv.Value, new KeyValuePair<string, object?>("table", kv.Key))),
        description: "Days until partition exhaustion per table");

    // Counters for alerts
    private static readonly Counter<int> AlertsFired = Meter.CreateCounter<int>(
        "stellaops.partitions.alerts_fired",
        description: "Number of partition exhaustion alerts fired");

    // State for observable gauges (replaced atomically on each health check)
    private static Dictionary<string, int> _lastFuturePartitionCounts = new();
    private static Dictionary<string, int> _lastDaysUntilExhaustion = new();

    /// <summary>
    /// Checks partition health and fires alerts if needed.
    /// </summary>
    /// <param name="connection">Open PostgreSQL connection.</param>
    /// <param name="alertThreshold">Days threshold for warning alert.</param>
    /// <param name="criticalThreshold">Days threshold for critical alert.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>List of partition health status for each table.</returns>
    public async Task<List<PartitionHealthStatus>> CheckHealthAsync(
        NpgsqlConnection connection,
        int alertThreshold = 30,
        int criticalThreshold = 7,
        CancellationToken cancellationToken = default)
    {
        using var activity = ActivitySource.StartActivity("partitions.health_check", ActivityKind.Internal);

        var results = new List<PartitionHealthStatus>();
        var futureCounts = new Dictionary<string, int>();
        var daysUntil = new Dictionary<string, int>();

        // Query partition health from the partition_mgmt schema
        await using var cmd = connection.CreateCommand();
        cmd.CommandText = """
            SELECT
                mt.schema_name,
                mt.table_name,
                COUNT(*) FILTER (WHERE p.partition_start > NOW()) as future_partitions,
                MAX(p.partition_start) as last_partition_start
            FROM partition_mgmt.managed_tables mt
            LEFT JOIN partition_mgmt.partition_stats p
                ON mt.schema_name = p.schema_name
                AND mt.table_name = p.table_name
            GROUP BY mt.schema_name, mt.table_name, mt.months_ahead
            ORDER BY mt.schema_name, mt.table_name
            """;

        try
        {
            await using var reader = await cmd.ExecuteReaderAsync(cancellationToken);

            while (await reader.ReadAsync(cancellationToken))
            {
                var schema = reader.GetString(0);
                var table = reader.GetString(1);
                var futureCount = reader.IsDBNull(2) ? 0 : reader.GetInt32(2);
                var lastPartitionStart = reader.IsDBNull(3) ? (DateTimeOffset?)null : reader.GetDateTime(3);

                var tableKey = $"{schema}.{table}";
                var daysUntilExhaustion = lastPartitionStart.HasValue
                    ? Math.Max(0, (int)(lastPartitionStart.Value - DateTimeOffset.UtcNow).TotalDays)
                    : 0;

                futureCounts[tableKey] = futureCount;
                daysUntil[tableKey] = daysUntilExhaustion;

                var severity = daysUntilExhaustion <= criticalThreshold ? AlertSeverity.Critical
                    : daysUntilExhaustion <= alertThreshold ? AlertSeverity.Warning
                    : AlertSeverity.None;

                var status = new PartitionHealthStatus(
                    SchemaName: schema,
                    TableName: table,
                    FuturePartitions: futureCount,
                    DaysUntilExhaustion: daysUntilExhaustion,
                    LastPartitionStart: lastPartitionStart,
                    Severity: severity,
                    AlertMessage: severity != AlertSeverity.None
                        ? $"Partition exhaustion {severity.ToString().ToLowerInvariant()}: {tableKey} has {daysUntilExhaustion} days until exhaustion"
                        : null);

                results.Add(status);

                if (severity != AlertSeverity.None)
                {
                    AlertsFired.Add(1, new TagList
                    {
                        { "table", tableKey },
                        { "severity", severity.ToString().ToLowerInvariant() }
                    });

                    activity?.AddEvent(new ActivityEvent(
                        "partition.exhaustion.alert",
                        tags: new ActivityTagsCollection
                        {
                            { "table", tableKey },
                            { "severity", severity.ToString() },
                            { "days_until_exhaustion", daysUntilExhaustion }
                        }));
                }
            }
        }
        catch (PostgresException ex) when (ex.SqlState == "42P01") // undefined_table
        {
            // The partition_mgmt schema doesn't exist yet
            activity?.SetStatus(ActivityStatusCode.Error, "partition_mgmt schema not found");
        }

        // Update observable gauge state
        _lastFuturePartitionCounts = futureCounts;
        _lastDaysUntilExhaustion = daysUntil;

        return results;
    }

    /// <summary>
    /// Gets an alert summary for integration with notification systems.
    /// </summary>
    public static PartitionAlertSummary GetAlertSummary(IEnumerable<PartitionHealthStatus> statuses)
    {
        // Materialize once so a deferred sequence is not enumerated twice
        var statusList = statuses.ToList();
        var criticalTables = statusList.Where(s => s.Severity == AlertSeverity.Critical).ToList();
        var warningTables = statusList.Where(s => s.Severity == AlertSeverity.Warning).ToList();

        return new PartitionAlertSummary(
            CriticalCount: criticalTables.Count,
            WarningCount: warningTables.Count,
            CriticalTables: criticalTables.Select(s => $"{s.SchemaName}.{s.TableName}").ToList(),
            WarningTables: warningTables.Select(s => $"{s.SchemaName}.{s.TableName}").ToList(),
            OverallSeverity: criticalTables.Count > 0 ? AlertSeverity.Critical
                : warningTables.Count > 0 ? AlertSeverity.Warning
                : AlertSeverity.None);
    }
}

/// <summary>
/// Health status for a single partitioned table.
/// </summary>
public sealed record PartitionHealthStatus(
    string SchemaName,
    string TableName,
    int FuturePartitions,
    int DaysUntilExhaustion,
    DateTimeOffset? LastPartitionStart,
    AlertSeverity Severity,
    string? AlertMessage);

/// <summary>
/// Summary of partition alerts.
/// </summary>
public sealed record PartitionAlertSummary(
    int CriticalCount,
    int WarningCount,
    IReadOnlyList<string> CriticalTables,
    IReadOnlyList<string> WarningTables,
    AlertSeverity OverallSeverity);

/// <summary>
/// Alert severity levels.
/// </summary>
public enum AlertSeverity
{
    /// <summary>No alert needed.</summary>
    None,

    /// <summary>Warning: action needed soon.</summary>
    Warning,

    /// <summary>Critical: immediate action required.</summary>
    Critical
}
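
A minimal caller sketch for the monitor above. The connection string and threshold values are illustrative assumptions, not part of this commit; only PartitionHealthMonitor, GetAlertSummary, and standard Npgsql calls are taken from the diff.

// Sketch: run a one-off health check and summarize alerts.
// The connection string and thresholds below are illustrative.
var monitor = new PartitionHealthMonitor();

await using var connection = new NpgsqlConnection("Host=localhost;Database=stellaops");
await connection.OpenAsync();

var statuses = await monitor.CheckHealthAsync(connection, alertThreshold: 30, criticalThreshold: 7);
var summary = PartitionHealthMonitor.GetAlertSummary(statuses);

if (summary.OverallSeverity == AlertSeverity.Critical)
{
    Console.WriteLine($"CRITICAL: {string.Join(", ", summary.CriticalTables)}");
}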
@@ -0,0 +1,250 @@
// -----------------------------------------------------------------------------
// PartitionMaintenanceWorker.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.1 - Create partition maintenance job
// Task: 6.2 - Create retention enforcement job
// Description: Background worker for partition creation and retention enforcement
// -----------------------------------------------------------------------------

using System.Data;
using System.Diagnostics;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Npgsql;
using StellaOps.Scheduler.Storage.Postgres;
using StellaOps.Scheduler.Worker.Options;

namespace StellaOps.Scheduler.Worker.Execution;

/// <summary>
/// Background worker that manages partition lifecycle:
/// - Creates future partitions to avoid insert failures
/// - Drops old partitions to enforce retention policy
/// Per advisory guidelines, runs hourly by default.
/// </summary>
public sealed class PartitionMaintenanceWorker : BackgroundService
{
    private readonly SchedulerDataSource _dataSource;
    private readonly IOptions<PartitionMaintenanceOptions> _options;
    private readonly ILogger<PartitionMaintenanceWorker> _logger;
    private readonly ActivitySource _activitySource = new("StellaOps.Scheduler.PartitionMaintenance");

    public PartitionMaintenanceWorker(
        SchedulerDataSource dataSource,
        IOptions<PartitionMaintenanceOptions> options,
        ILogger<PartitionMaintenanceWorker> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation("Partition maintenance worker started");

        // Initial delay to let the system stabilize
        await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken);

        while (!stoppingToken.IsCancellationRequested)
        {
            var opts = _options.Value;

            if (!opts.Enabled)
            {
                _logger.LogDebug("Partition maintenance is disabled");
                await Task.Delay(opts.Interval, stoppingToken);
                continue;
            }

            using var activity = _activitySource.StartActivity("partition.maintenance", ActivityKind.Internal);

            try
            {
                await RunMaintenanceCycleAsync(opts, stoppingToken);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Partition maintenance cycle failed");
                activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
                PartitionMaintenanceMetrics.RecordError("cycle_failed");
            }

            await Task.Delay(opts.Interval, stoppingToken);
        }

        _logger.LogInformation("Partition maintenance worker stopped");
    }

    private async Task RunMaintenanceCycleAsync(PartitionMaintenanceOptions opts, CancellationToken ct)
    {
        var sw = Stopwatch.StartNew();
        var createdCount = 0;
        var droppedCount = 0;

        _logger.LogInformation("Starting partition maintenance cycle");

        await using var conn = await _dataSource.GetConnectionAsync(ct);
        await conn.OpenAsync(ct);

        foreach (var (schemaTable, _) in opts.ManagedTables)
        {
            var parts = schemaTable.Split('.', 2);
            if (parts.Length != 2)
            {
                _logger.LogWarning("Invalid managed table format: {Table}", schemaTable);
                continue;
            }

            var schema = parts[0];
            var table = parts[1];

            try
            {
                // Step 1: Ensure future partitions exist
                var created = await EnsureFuturePartitionsAsync(conn, schema, table, opts.MonthsAhead, ct);
                createdCount += created;

                // Step 2: Enforce retention policy
                var retentionMonths = opts.GetRetentionMonths(schemaTable);
                var dropped = await EnforceRetentionAsync(conn, schema, table, retentionMonths, ct);
                droppedCount += dropped;
            }
            catch (PostgresException ex) when (ex.SqlState == "42P01") // undefined_table
            {
                _logger.LogDebug("Table {Schema}.{Table} does not exist (not partitioned yet), skipping", schema, table);
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to maintain partitions for {Schema}.{Table}", schema, table);
                PartitionMaintenanceMetrics.RecordError($"{schema}.{table}");
            }
        }

        sw.Stop();
        _logger.LogInformation(
            "Partition maintenance cycle completed in {ElapsedMs}ms: {Created} partitions created, {Dropped} partitions dropped",
            sw.ElapsedMilliseconds, createdCount, droppedCount);

        PartitionMaintenanceMetrics.RecordCycle(sw.Elapsed.TotalMilliseconds, createdCount, droppedCount);
    }

    private async Task<int> EnsureFuturePartitionsAsync(
        NpgsqlConnection conn,
        string schema,
        string table,
        int monthsAhead,
        CancellationToken ct)
    {
        // Use the partition management function if available, otherwise create partitions manually
        await using var cmd = conn.CreateCommand();
        cmd.CommandText = @"
            SELECT partition_mgmt.ensure_future_partitions($1, $2, $3)
            WHERE EXISTS (
                SELECT 1 FROM pg_proc p
                JOIN pg_namespace n ON p.pronamespace = n.oid
                WHERE n.nspname = 'partition_mgmt' AND p.proname = 'ensure_future_partitions'
            )";
        cmd.Parameters.AddWithValue(schema);
        cmd.Parameters.AddWithValue(table);
        cmd.Parameters.AddWithValue(monthsAhead);

        var result = await cmd.ExecuteScalarAsync(ct);
        var created = result is int count ? count : 0;

        if (created > 0)
        {
            _logger.LogInformation("Created {Count} future partitions for {Schema}.{Table}", created, schema, table);
            PartitionMaintenanceMetrics.RecordPartitionsCreated(schema, table, created);
        }

        return created;
    }

    private async Task<int> EnforceRetentionAsync(
        NpgsqlConnection conn,
        string schema,
        string table,
        int retentionMonths,
        CancellationToken ct)
    {
        // Use the partition management function if available
        await using var cmd = conn.CreateCommand();
        cmd.CommandText = @"
            SELECT partition_mgmt.enforce_retention($1, $2, $3)
            WHERE EXISTS (
                SELECT 1 FROM pg_proc p
                JOIN pg_namespace n ON p.pronamespace = n.oid
                WHERE n.nspname = 'partition_mgmt' AND p.proname = 'enforce_retention'
            )";
        cmd.Parameters.AddWithValue(schema);
        cmd.Parameters.AddWithValue(table);
        cmd.Parameters.AddWithValue(retentionMonths);

        var result = await cmd.ExecuteScalarAsync(ct);
        var dropped = result is int count ? count : 0;

        if (dropped > 0)
        {
            _logger.LogInformation("Dropped {Count} old partitions for {Schema}.{Table} (retention: {Months} months)",
                dropped, schema, table, retentionMonths);
            PartitionMaintenanceMetrics.RecordPartitionsDropped(schema, table, dropped);
        }

        return dropped;
    }
}

/// <summary>
/// Metrics for partition maintenance operations.
/// </summary>
public static class PartitionMaintenanceMetrics
{
    private static readonly System.Diagnostics.Metrics.Meter Meter =
        new("StellaOps.Scheduler.PartitionMaintenance", "1.0.0");

    private static readonly System.Diagnostics.Metrics.Counter<int> PartitionsCreated =
        Meter.CreateCounter<int>("stellaops.partitions.created", description: "Number of partitions created");

    private static readonly System.Diagnostics.Metrics.Counter<int> PartitionsDropped =
        Meter.CreateCounter<int>("stellaops.partitions.dropped", description: "Number of partitions dropped");

    private static readonly System.Diagnostics.Metrics.Counter<int> Errors =
        Meter.CreateCounter<int>("stellaops.partitions.errors", description: "Number of partition maintenance errors");

    private static readonly System.Diagnostics.Metrics.Histogram<double> CycleDuration =
        Meter.CreateHistogram<double>("stellaops.partitions.cycle_duration_ms", description: "Duration of maintenance cycle in ms");

    public static void RecordPartitionsCreated(string schema, string table, int count)
    {
        PartitionsCreated.Add(count, new System.Diagnostics.TagList
        {
            { "schema", schema },
            { "table", table }
        });
    }

    public static void RecordPartitionsDropped(string schema, string table, int count)
    {
        PartitionsDropped.Add(count, new System.Diagnostics.TagList
        {
            { "schema", schema },
            { "table", table }
        });
    }

    public static void RecordError(string context)
    {
        Errors.Add(1, new System.Diagnostics.TagList { { "context", context } });
    }

    public static void RecordCycle(double durationMs, int created, int dropped)
    {
        CycleDuration.Record(durationMs);
    }
}
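
A minimal host-registration sketch for the worker above. It assumes SchedulerDataSource is registered elsewhere in the composition root, and the "PartitionMaintenance" section name is an illustrative assumption; only the worker and options types come from this commit.

// Sketch: wire the worker and its options into a generic host.
// Assumes SchedulerDataSource is registered elsewhere; the section name is illustrative.
var builder = Host.CreateApplicationBuilder(args);

builder.Services.Configure<PartitionMaintenanceOptions>(
    builder.Configuration.GetSection("PartitionMaintenance"));
builder.Services.AddHostedService<PartitionMaintenanceWorker>();

await builder.Build().RunAsync();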
@@ -0,0 +1,78 @@
// -----------------------------------------------------------------------------
// PartitionMaintenanceOptions.cs
// Sprint: SPRINT_3422_0001_0001_time_based_partitioning
// Task: 6.1 - Create partition maintenance job
// Description: Configuration options for partition maintenance worker
// -----------------------------------------------------------------------------

namespace StellaOps.Scheduler.Worker.Options;

/// <summary>
/// Configuration options for partition maintenance.
/// </summary>
public sealed class PartitionMaintenanceOptions
{
    /// <summary>
    /// Whether partition maintenance is enabled. Default: true.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Interval between maintenance runs. Default: 1 hour.
    /// </summary>
    public TimeSpan Interval { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// Number of months ahead to create partitions. Default: 3.
    /// </summary>
    public int MonthsAhead { get; set; } = 3;

    /// <summary>
    /// Retention period in months for scheduler tables. Default: 24 months.
    /// </summary>
    public int SchedulerRetentionMonths { get; set; } = 24;

    /// <summary>
    /// Retention period in months for vuln tables. Default: 36 months.
    /// </summary>
    public int VulnRetentionMonths { get; set; } = 36;

    /// <summary>
    /// Retention period in months for vex tables. Default: 36 months.
    /// </summary>
    public int VexRetentionMonths { get; set; } = 36;

    /// <summary>
    /// Retention period in months for notify tables. Default: 12 months.
    /// </summary>
    public int NotifyRetentionMonths { get; set; } = 12;

    /// <summary>
    /// Tables to manage with their schema. Key = schema.table, Value = retention months (0 = use default).
    /// </summary>
    public Dictionary<string, int> ManagedTables { get; set; } = new()
    {
        ["scheduler.audit"] = 0,          // Uses SchedulerRetentionMonths
        ["scheduler.runs"] = 0,
        ["scheduler.execution_logs"] = 0,
        ["vuln.merge_events"] = 0,        // Uses VulnRetentionMonths
        ["vex.timeline_events"] = 0,      // Uses VexRetentionMonths
        ["notify.deliveries"] = 0         // Uses NotifyRetentionMonths
    };

    /// <summary>
    /// Gets the retention months for a specific table.
    /// </summary>
    public int GetRetentionMonths(string schemaTable)
    {
        // An explicit per-table override wins when set to a positive value
        if (ManagedTables.TryGetValue(schemaTable, out var months) && months > 0)
            return months;

        // Otherwise fall back to schema-based defaults
        return schemaTable.StartsWith("scheduler.") ? SchedulerRetentionMonths :
               schemaTable.StartsWith("vuln.") ? VulnRetentionMonths :
               schemaTable.StartsWith("vex.") ? VexRetentionMonths :
               schemaTable.StartsWith("notify.") ? NotifyRetentionMonths :
               24; // Default fallback
    }
}
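
A minimal sketch of overriding these options in code, assuming a host builder like the registration sketch earlier; the specific values and the per-table override for vuln.merge_events are illustrative. The same shape binds from a configuration section.

// Sketch: explicit option values, equivalent to binding a config section.
// All values here are illustrative assumptions.
builder.Services.Configure<PartitionMaintenanceOptions>(opts =>
{
    opts.Interval = TimeSpan.FromHours(6);        // run four times a day instead of hourly
    opts.MonthsAhead = 6;                         // keep six months of future partitions
    opts.ManagedTables["vuln.merge_events"] = 48; // per-table retention override (months)
});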
@@ -0,0 +1,317 @@
// =============================================================================
// ScoreReplaySchedulerJob.cs
// Sprint: SPRINT_3401_0002_0001
// Task: SCORE-REPLAY-011 - Add scheduled job to rescore when feed snapshots change
// =============================================================================

using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Worker.Options;

namespace StellaOps.Scheduler.Worker.Planning;

/// <summary>
/// Configuration options for score replay scheduling.
/// </summary>
public sealed class ScoreReplaySchedulerOptions
{
    /// <summary>
    /// Whether automatic score replay is enabled.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Maximum age in days for scans to be considered for replay.
    /// </summary>
    public int MaxAgeDays { get; set; } = 30;

    /// <summary>
    /// Whether to send notifications when scores change significantly.
    /// </summary>
    public bool NotifyOnDelta { get; set; } = true;

    /// <summary>
    /// Minimum score delta to trigger notification.
    /// </summary>
    public double DeltaThreshold { get; set; } = 0.5;

    /// <summary>
    /// Maximum number of scans to replay per run.
    /// </summary>
    public int MaxScansPerRun { get; set; } = 100;

    /// <summary>
    /// Parallelism for replay operations.
    /// </summary>
    public int Parallelism { get; set; } = 4;
}

/// <summary>
/// Result of a score replay operation.
/// </summary>
public sealed record ScoreReplayResult(
    string ScanId,
    string ReplayId,
    bool Success,
    double OriginalScore,
    double ReplayedScore,
    int FindingsAdded,
    int FindingsRemoved,
    int FindingsRescored,
    TimeSpan Duration,
    string? ErrorMessage = null);

/// <summary>
/// Summary of a score replay batch run.
/// </summary>
public sealed record ScoreReplayBatchSummary(
    DateTimeOffset StartedAt,
    DateTimeOffset CompletedAt,
    string TriggerType,
    string? FeedSnapshotHash,
    int TotalScans,
    int SuccessCount,
    int FailureCount,
    int SignificantDeltas,
    IReadOnlyList<ScoreReplayResult> Results);

/// <summary>
/// Interface for the score replay scheduler.
/// </summary>
public interface IScoreReplayScheduler
{
    /// <summary>
    /// Triggers a score replay for all eligible scans.
    /// </summary>
    Task<ScoreReplayBatchSummary> ReplayAllAsync(
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default);

    /// <summary>
    /// Triggers a score replay for a specific scan.
    /// </summary>
    Task<ScoreReplayResult> ReplayScanAsync(
        string scanId,
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default);
}

/// <summary>
/// Interface for the scanner replay client.
/// </summary>
public interface IScannerReplayClient
{
    /// <summary>
    /// Gets scans eligible for replay (within max age, has manifest).
    /// </summary>
    Task<IReadOnlyList<string>> GetEligibleScansAsync(
        int maxAgeDays,
        int limit,
        CancellationToken ct = default);

    /// <summary>
    /// Triggers a score replay for a scan.
    /// </summary>
    Task<ScoreReplayResult> ReplayAsync(
        string scanId,
        string? feedSnapshotHash,
        CancellationToken ct = default);

    /// <summary>
    /// Gets the current feed snapshot hash.
    /// </summary>
    Task<string> GetCurrentFeedSnapshotHashAsync(CancellationToken ct = default);
}

/// <summary>
/// Scheduled job that triggers score replays when feed snapshots change.
/// Per Sprint 3401.0002.0001 - Score Replay & Proof Bundle.
/// </summary>
public sealed class ScoreReplaySchedulerJob : IScoreReplayScheduler
{
    private readonly IScannerReplayClient _scannerClient;
    private readonly ScoreReplaySchedulerOptions _options;
    private readonly ILogger<ScoreReplaySchedulerJob> _logger;
    private string? _lastFeedSnapshotHash;

    public ScoreReplaySchedulerJob(
        IScannerReplayClient scannerClient,
        IOptions<ScoreReplaySchedulerOptions> options,
        ILogger<ScoreReplaySchedulerJob> logger)
    {
        _scannerClient = scannerClient ?? throw new ArgumentNullException(nameof(scannerClient));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Checks if a new feed snapshot is available and triggers replay if needed.
    /// Called periodically by the scheduler.
    /// </summary>
    public async Task<bool> CheckAndReplayAsync(CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            _logger.LogDebug("Score replay scheduler is disabled");
            return false;
        }

        try
        {
            var currentHash = await _scannerClient.GetCurrentFeedSnapshotHashAsync(ct);

            if (_lastFeedSnapshotHash is not null && _lastFeedSnapshotHash != currentHash)
            {
                _logger.LogInformation(
                    "Feed snapshot changed from {Old} to {New}, triggering replay",
                    _lastFeedSnapshotHash[..16],
                    currentHash[..16]);

                await ReplayAllAsync("feed_update", currentHash, ct);
                _lastFeedSnapshotHash = currentHash;
                return true;
            }

            _lastFeedSnapshotHash = currentHash;
            return false;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error checking for feed snapshot changes");
            return false;
        }
    }

    /// <inheritdoc/>
    public async Task<ScoreReplayBatchSummary> ReplayAllAsync(
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default)
    {
        var startedAt = DateTimeOffset.UtcNow;
        var results = new List<ScoreReplayResult>();
        var successCount = 0;
        var failureCount = 0;
        var significantDeltas = 0;

        _logger.LogInformation(
            "Starting score replay batch. Trigger={Trigger}, MaxAge={Days}d, MaxScans={Max}",
            triggerType,
            _options.MaxAgeDays,
            _options.MaxScansPerRun);

        try
        {
            var eligibleScans = await _scannerClient.GetEligibleScansAsync(
                _options.MaxAgeDays,
                _options.MaxScansPerRun,
                ct);

            _logger.LogInformation("Found {Count} eligible scans for replay", eligibleScans.Count);

            // Process in parallel batches
            var semaphore = new SemaphoreSlim(_options.Parallelism);
            var tasks = eligibleScans.Select(async scanId =>
            {
                await semaphore.WaitAsync(ct);
                try
                {
                    return await ReplayScanAsync(scanId, triggerType, feedSnapshotHash, ct);
                }
                finally
                {
                    semaphore.Release();
                }
            });

            var batchResults = await Task.WhenAll(tasks);
            results.AddRange(batchResults);

            foreach (var result in batchResults)
            {
                if (result.Success)
                {
                    successCount++;
                    var delta = Math.Abs(result.ReplayedScore - result.OriginalScore);
                    if (delta >= _options.DeltaThreshold)
                    {
                        significantDeltas++;
                    }
                }
                else
                {
                    failureCount++;
                }
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error during batch score replay");
        }

        var completedAt = DateTimeOffset.UtcNow;

        _logger.LogInformation(
            "Score replay batch completed. Success={Success}, Failed={Failed}, SignificantDeltas={Deltas}, Duration={Duration}ms",
            successCount,
            failureCount,
            significantDeltas,
            (completedAt - startedAt).TotalMilliseconds);

        return new ScoreReplayBatchSummary(
            StartedAt: startedAt,
            CompletedAt: completedAt,
            TriggerType: triggerType,
            FeedSnapshotHash: feedSnapshotHash,
            TotalScans: results.Count,
            SuccessCount: successCount,
            FailureCount: failureCount,
            SignificantDeltas: significantDeltas,
            Results: results);
    }

    /// <inheritdoc/>
    public async Task<ScoreReplayResult> ReplayScanAsync(
        string scanId,
        string triggerType,
        string? feedSnapshotHash = null,
        CancellationToken ct = default)
    {
        var sw = Stopwatch.StartNew();

        try
        {
            _logger.LogDebug("Replaying scan {ScanId}", scanId);
            var result = await _scannerClient.ReplayAsync(scanId, feedSnapshotHash, ct);
            sw.Stop();

            _logger.LogDebug(
                "Scan {ScanId} replayed. Delta={Delta:F2}, Duration={Duration}ms",
                scanId,
                result.ReplayedScore - result.OriginalScore,
                sw.ElapsedMilliseconds);

            return result;
        }
        catch (Exception ex)
        {
            sw.Stop();
            _logger.LogWarning(ex, "Failed to replay scan {ScanId}", scanId);

            return new ScoreReplayResult(
                ScanId: scanId,
                ReplayId: string.Empty,
                Success: false,
                OriginalScore: 0,
                ReplayedScore: 0,
                FindingsAdded: 0,
                FindingsRemoved: 0,
                FindingsRescored: 0,
                Duration: sw.Elapsed,
                ErrorMessage: ex.Message);
        }
    }
}
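
A minimal driver sketch for the polling loop that CheckAndReplayAsync expects. This commit does not include the periodic caller, so the hosted-service wrapper and the 15-minute cadence below are assumptions for illustration (requires Microsoft.Extensions.Hosting).

// Sketch: poll for feed snapshot changes on a fixed cadence.
// ScoreReplayPollingWorker and the 15-minute interval are illustrative assumptions.
public sealed class ScoreReplayPollingWorker : BackgroundService
{
    private readonly ScoreReplaySchedulerJob _job;

    public ScoreReplayPollingWorker(ScoreReplaySchedulerJob job) => _job = job;

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        using var timer = new PeriodicTimer(TimeSpan.FromMinutes(15));
        while (await timer.WaitForNextTickAsync(stoppingToken))
        {
            // Replays run only when GetCurrentFeedSnapshotHashAsync reports a new hash
            await _job.CheckAndReplayAsync(stoppingToken);
        }
    }
}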