@@ -0,0 +1,583 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.Backfill;

/// <summary>
/// Configuration options for the backfill manager.
/// </summary>
public sealed record BackfillManagerOptions
{
    /// <summary>
    /// Maximum number of events allowed in a single backfill request.
    /// </summary>
    public long MaxEventsPerBackfill { get; init; } = 1_000_000;

    /// <summary>
    /// Maximum duration allowed for a backfill operation.
    /// </summary>
    public TimeSpan MaxBackfillDuration { get; init; } = TimeSpan.FromHours(24);

    /// <summary>
    /// Data retention period - backfills cannot extend beyond this.
    /// </summary>
    public TimeSpan RetentionPeriod { get; init; } = TimeSpan.FromDays(90);

    /// <summary>
    /// Default TTL for processed event records.
    /// </summary>
    public TimeSpan DefaultProcessedEventTtl { get; init; } = TimeSpan.FromDays(30);

    /// <summary>
    /// Number of sample event keys to include in previews.
    /// </summary>
    public int PreviewSampleSize { get; init; } = 10;

    /// <summary>
    /// Estimated events per second for duration estimation.
    /// </summary>
    public double EstimatedEventsPerSecond { get; init; } = 100;
}

/// <summary>
/// Coordinates backfill operations with safety validations.
/// </summary>
public interface IBackfillManager
{
    /// <summary>
    /// Creates a new backfill request with validation.
    /// </summary>
    Task<BackfillRequest> CreateRequestAsync(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        string reason,
        string createdBy,
        int batchSize = 100,
        bool dryRun = false,
        bool forceReprocess = false,
        string? ticket = null,
        TimeSpan? maxDuration = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Validates a backfill request and runs safety checks.
    /// </summary>
    Task<BackfillRequest> ValidateRequestAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Generates a preview of what a backfill would process (dry-run).
    /// </summary>
    Task<BackfillPreview> PreviewAsync(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        int batchSize = 100,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Starts execution of a validated backfill request.
    /// </summary>
    Task<BackfillRequest> StartAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Pauses a running backfill.
    /// </summary>
    Task<BackfillRequest> PauseAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Resumes a paused backfill.
    /// </summary>
    Task<BackfillRequest> ResumeAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Cancels a backfill request.
    /// </summary>
    Task<BackfillRequest> CancelAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the current status of a backfill request.
    /// </summary>
    Task<BackfillRequest?> GetStatusAsync(
        string tenantId,
        Guid backfillId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists backfill requests with filters.
    /// </summary>
    Task<IReadOnlyList<BackfillRequest>> ListAsync(
        string tenantId,
        BackfillStatus? status = null,
        Guid? sourceId = null,
        string? jobType = null,
        int limit = 50,
        int offset = 0,
        CancellationToken cancellationToken = default);
}
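
// --- Illustrative usage (not part of the original commit) ---
// A minimal sketch of the create -> validate -> start lifecycle this
// interface implies. The tenant id, source id, and operator name are
// hypothetical placeholders.
public static class BackfillLifecycleExample
{
    public static async Task RunAsync(IBackfillManager manager, CancellationToken ct)
    {
        // 1. Create a request covering the last 24 hours for one source.
        var request = await manager.CreateRequestAsync(
            tenantId: "tenant-a",
            sourceId: Guid.NewGuid(),          // hypothetical source
            jobType: "advisory-sync",
            windowStart: DateTimeOffset.UtcNow.AddDays(-1),
            windowEnd: DateTimeOffset.UtcNow,
            reason: "Reprocess after parser fix",
            createdBy: "operator@example.com",
            cancellationToken: ct);

        // 2. Run the safety checks (event count, retention, overlap, duration).
        request = await manager.ValidateRequestAsync(
            request.TenantId, request.BackfillId, "operator@example.com", ct);

        // 3. Start execution; presumably Start rejects requests whose
        //    safety checks failed, so validation comes first.
        request = await manager.StartAsync(
            request.TenantId, request.BackfillId, "operator@example.com", ct);
    }
}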

/// <summary>
/// Provides event counting for backfill estimation.
/// </summary>
public interface IBackfillEventCounter
{
    /// <summary>
    /// Estimates the number of events in a time window.
    /// </summary>
    Task<long> EstimateEventCountAsync(
        string tenantId,
        string scopeKey,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        CancellationToken cancellationToken);

    /// <summary>
    /// Gets sample event keys from a time window.
    /// </summary>
    Task<IReadOnlyList<string>> GetSampleEventKeysAsync(
        string tenantId,
        string scopeKey,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        int sampleSize,
        CancellationToken cancellationToken);
}

/// <summary>
/// Validates backfill safety conditions.
/// </summary>
public interface IBackfillSafetyValidator
{
    /// <summary>
    /// Runs all safety validations for a backfill request.
    /// </summary>
    Task<BackfillSafetyChecks> ValidateAsync(
        BackfillRequest request,
        long estimatedEvents,
        TimeSpan estimatedDuration,
        CancellationToken cancellationToken);
}

/// <summary>
/// Default implementation of backfill safety validator.
/// </summary>
public sealed class DefaultBackfillSafetyValidator : IBackfillSafetyValidator
{
    private readonly ISourceValidator _sourceValidator;
    private readonly IOverlapChecker _overlapChecker;
    private readonly BackfillManagerOptions _options;

    public DefaultBackfillSafetyValidator(
        ISourceValidator sourceValidator,
        IOverlapChecker overlapChecker,
        BackfillManagerOptions options)
    {
        _sourceValidator = sourceValidator;
        _overlapChecker = overlapChecker;
        _options = options;
    }

    public async Task<BackfillSafetyChecks> ValidateAsync(
        BackfillRequest request,
        long estimatedEvents,
        TimeSpan estimatedDuration,
        CancellationToken cancellationToken)
    {
        var warnings = new List<string>();
        var errors = new List<string>();

        // Check source exists
        var sourceExists = true;
        if (request.SourceId.HasValue)
        {
            sourceExists = await _sourceValidator.ExistsAsync(
                request.TenantId, request.SourceId.Value, cancellationToken);
            if (!sourceExists)
            {
                errors.Add($"Source {request.SourceId} not found.");
            }
        }

        // Check for overlapping backfills
        var hasOverlap = await _overlapChecker.HasOverlapAsync(
            request.TenantId,
            request.ScopeKey,
            request.WindowStart,
            request.WindowEnd,
            request.BackfillId,
            cancellationToken);
        if (hasOverlap)
        {
            errors.Add("An active backfill already exists for this scope and time window.");
        }

        // Check retention period
        var retentionLimit = DateTimeOffset.UtcNow - _options.RetentionPeriod;
        var withinRetention = request.WindowStart >= retentionLimit;
        if (!withinRetention)
        {
            errors.Add($"Window start {request.WindowStart:O} is beyond the retention period ({_options.RetentionPeriod.TotalDays} days).");
        }

        // Check event limit
        var withinEventLimit = estimatedEvents <= _options.MaxEventsPerBackfill;
        if (!withinEventLimit)
        {
            errors.Add($"Estimated {estimatedEvents:N0} events exceeds maximum allowed ({_options.MaxEventsPerBackfill:N0}).");
        }
        else if (estimatedEvents > _options.MaxEventsPerBackfill * 0.8)
        {
            warnings.Add($"Estimated {estimatedEvents:N0} events is approaching the maximum limit.");
        }

        // Check duration limit
        var maxDuration = request.MaxDuration ?? _options.MaxBackfillDuration;
        var withinDurationLimit = estimatedDuration <= maxDuration;
        if (!withinDurationLimit)
        {
            errors.Add($"Estimated duration {estimatedDuration} exceeds maximum allowed ({maxDuration}).");
        }

        // Check quota availability (placeholder - always true for now)
        var quotaAvailable = true;

        // Add warnings for large backfills
        if (request.WindowDuration > TimeSpan.FromDays(7))
        {
            warnings.Add("Large time window may take significant time to process.");
        }

        if (request.ForceReprocess)
        {
            warnings.Add("Force reprocess is enabled - events will be processed even if already seen.");
        }

        return new BackfillSafetyChecks(
            SourceExists: sourceExists,
            HasOverlappingBackfill: hasOverlap,
            WithinRetention: withinRetention,
            WithinEventLimit: withinEventLimit,
            WithinDurationLimit: withinDurationLimit,
            QuotaAvailable: quotaAvailable,
            Warnings: warnings,
            Errors: errors);
    }
}
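
// --- Illustrative usage (not part of the original commit) ---
// Sketch of how a caller might interpret BackfillSafetyChecks. It assumes
// the type exposes an IsSafe flag that is false when Errors is non-empty;
// IsSafe is referenced by BackfillManager's logging below, the exact
// semantics are an assumption.
public static class SafetyCheckReportExample
{
    public static void Report(BackfillSafetyChecks checks, ILogger logger)
    {
        // Warnings are advisory and do not block execution.
        foreach (var warning in checks.Warnings)
        {
            logger.LogWarning("Backfill warning: {Warning}", warning);
        }

        if (!checks.IsSafe)
        {
            // Errors block execution; surface them to the operator.
            throw new InvalidOperationException(
                "Backfill rejected: " + string.Join("; ", checks.Errors));
        }
    }
}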

/// <summary>
/// Validates that a source exists.
/// </summary>
public interface ISourceValidator
{
    /// <summary>
    /// Checks if a source exists.
    /// </summary>
    Task<bool> ExistsAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);
}

/// <summary>
/// Checks for overlapping backfill operations.
/// </summary>
public interface IOverlapChecker
{
    /// <summary>
    /// Checks if there's an overlapping active backfill.
    /// </summary>
    Task<bool> HasOverlapAsync(
        string tenantId,
        string scopeKey,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        Guid? excludeBackfillId,
        CancellationToken cancellationToken);
}

/// <summary>
/// Default implementation of the backfill manager.
/// </summary>
public sealed class BackfillManager : IBackfillManager
{
    private readonly IBackfillRepository _backfillRepository;
    private readonly IBackfillSafetyValidator _safetyValidator;
    private readonly IBackfillEventCounter _eventCounter;
    private readonly IDuplicateSuppressor _duplicateSuppressor;
    private readonly BackfillManagerOptions _options;
    private readonly ILogger<BackfillManager> _logger;

    public BackfillManager(
        IBackfillRepository backfillRepository,
        IBackfillSafetyValidator safetyValidator,
        IBackfillEventCounter eventCounter,
        IDuplicateSuppressor duplicateSuppressor,
        BackfillManagerOptions options,
        ILogger<BackfillManager> logger)
    {
        _backfillRepository = backfillRepository;
        _safetyValidator = safetyValidator;
        _eventCounter = eventCounter;
        _duplicateSuppressor = duplicateSuppressor;
        _options = options;
        _logger = logger;
    }

    public async Task<BackfillRequest> CreateRequestAsync(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        string reason,
        string createdBy,
        int batchSize = 100,
        bool dryRun = false,
        bool forceReprocess = false,
        string? ticket = null,
        TimeSpan? maxDuration = null,
        CancellationToken cancellationToken = default)
    {
        var request = BackfillRequest.Create(
            tenantId: tenantId,
            sourceId: sourceId,
            jobType: jobType,
            windowStart: windowStart,
            windowEnd: windowEnd,
            reason: reason,
            createdBy: createdBy,
            batchSize: batchSize,
            dryRun: dryRun,
            forceReprocess: forceReprocess,
            ticket: ticket,
            maxDuration: maxDuration);

        await _backfillRepository.CreateAsync(request, cancellationToken);

        _logger.LogInformation(
            "Created backfill request {BackfillId} for scope {ScopeKey} from {WindowStart} to {WindowEnd}",
            request.BackfillId, request.ScopeKey, windowStart, windowEnd);

        return request;
    }

    public async Task<BackfillRequest> ValidateRequestAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default)
    {
        var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
            ?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");

        request = request.StartValidation(updatedBy);
        await _backfillRepository.UpdateAsync(request, cancellationToken);

        // Estimate event count
        var estimatedEvents = await _eventCounter.EstimateEventCountAsync(
            tenantId, request.ScopeKey, request.WindowStart, request.WindowEnd, cancellationToken);

        // Calculate estimated duration
        var estimatedDuration = TimeSpan.FromSeconds(estimatedEvents / _options.EstimatedEventsPerSecond);

        // Run safety validations
        var safetyChecks = await _safetyValidator.ValidateAsync(
            request, estimatedEvents, estimatedDuration, cancellationToken);

        request = request.WithSafetyChecks(safetyChecks, estimatedEvents, estimatedDuration, updatedBy);
        await _backfillRepository.UpdateAsync(request, cancellationToken);

        _logger.LogInformation(
            "Validated backfill request {BackfillId}: {EstimatedEvents} events, safe={IsSafe}",
            backfillId, estimatedEvents, safetyChecks.IsSafe);

        return request;
    }

    public async Task<BackfillPreview> PreviewAsync(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        int batchSize = 100,
        CancellationToken cancellationToken = default)
    {
        var scopeKey = GetScopeKey(sourceId, jobType);

        // Estimate total events
        var estimatedEvents = await _eventCounter.EstimateEventCountAsync(
            tenantId, scopeKey, windowStart, windowEnd, cancellationToken);

        // Get already processed count
        var processedCount = await _duplicateSuppressor.CountProcessedAsync(
            scopeKey, windowStart, windowEnd, cancellationToken);

        // Get sample event keys
        var sampleKeys = await _eventCounter.GetSampleEventKeysAsync(
            tenantId, scopeKey, windowStart, windowEnd, _options.PreviewSampleSize, cancellationToken);

        // Calculate estimates
        var processableEvents = Math.Max(0, estimatedEvents - processedCount);
        var estimatedDuration = TimeSpan.FromSeconds(processableEvents / _options.EstimatedEventsPerSecond);
        var estimatedBatches = (int)Math.Ceiling((double)processableEvents / batchSize);

        // Run safety checks
        var tempRequest = BackfillRequest.Create(
            tenantId, sourceId, jobType, windowStart, windowEnd,
            "preview", "system", batchSize);

        var safetyChecks = await _safetyValidator.ValidateAsync(
            tempRequest, estimatedEvents, estimatedDuration, cancellationToken);

        return new BackfillPreview(
            ScopeKey: scopeKey,
            WindowStart: windowStart,
            WindowEnd: windowEnd,
            EstimatedEvents: estimatedEvents,
            SkippedEvents: processedCount,
            ProcessableEvents: processableEvents,
            EstimatedDuration: estimatedDuration,
            EstimatedBatches: estimatedBatches,
            SafetyChecks: safetyChecks,
            SampleEventKeys: sampleKeys);
    }

    public async Task<BackfillRequest> StartAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default)
    {
        var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
            ?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");

        request = request.Start(updatedBy);
        await _backfillRepository.UpdateAsync(request, cancellationToken);

        _logger.LogInformation("Started backfill request {BackfillId}", backfillId);

        return request;
    }

    public async Task<BackfillRequest> PauseAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default)
    {
        var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
            ?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");

        request = request.Pause(updatedBy);
        await _backfillRepository.UpdateAsync(request, cancellationToken);

        _logger.LogInformation("Paused backfill request {BackfillId}", backfillId);

        return request;
    }

    public async Task<BackfillRequest> ResumeAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default)
    {
        var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
            ?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");

        request = request.Resume(updatedBy);
        await _backfillRepository.UpdateAsync(request, cancellationToken);

        _logger.LogInformation("Resumed backfill request {BackfillId}", backfillId);

        return request;
    }

    public async Task<BackfillRequest> CancelAsync(
        string tenantId,
        Guid backfillId,
        string updatedBy,
        CancellationToken cancellationToken = default)
    {
        var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
            ?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");

        request = request.Cancel(updatedBy);
        await _backfillRepository.UpdateAsync(request, cancellationToken);

        _logger.LogInformation("Canceled backfill request {BackfillId}", backfillId);

        return request;
    }

    public Task<BackfillRequest?> GetStatusAsync(
        string tenantId,
        Guid backfillId,
        CancellationToken cancellationToken = default)
    {
        return _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken);
    }

    public Task<IReadOnlyList<BackfillRequest>> ListAsync(
        string tenantId,
        BackfillStatus? status = null,
        Guid? sourceId = null,
        string? jobType = null,
        int limit = 50,
        int offset = 0,
        CancellationToken cancellationToken = default)
    {
        return _backfillRepository.ListAsync(tenantId, status, sourceId, jobType, limit, offset, cancellationToken);
    }

    private static string GetScopeKey(Guid? sourceId, string? jobType)
    {
        return (sourceId, jobType) switch
        {
            (Guid s, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(s, j),
            (Guid s, _) => Watermark.CreateScopeKey(s),
            (_, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(j),
            _ => throw new ArgumentException("Either sourceId or jobType must be specified.")
        };
    }
}

/// <summary>
/// Repository interface for backfill persistence (imported for convenience).
/// </summary>
public interface IBackfillRepository
{
    Task<BackfillRequest?> GetByIdAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);
    Task CreateAsync(BackfillRequest request, CancellationToken cancellationToken);
    Task UpdateAsync(BackfillRequest request, CancellationToken cancellationToken);
    Task<IReadOnlyList<BackfillRequest>> ListAsync(
        string tenantId,
        BackfillStatus? status,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset,
        CancellationToken cancellationToken);
}
@@ -0,0 +1,318 @@
namespace StellaOps.Orchestrator.Core.Backfill;

/// <summary>
/// Tracks processed events for duplicate suppression.
/// </summary>
public interface IDuplicateSuppressor
{
    /// <summary>
    /// Checks if an event has already been processed.
    /// </summary>
    /// <param name="scopeKey">Scope identifier.</param>
    /// <param name="eventKey">Unique event identifier.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>True if the event was already processed.</returns>
    Task<bool> HasProcessedAsync(string scopeKey, string eventKey, CancellationToken cancellationToken);

    /// <summary>
    /// Checks multiple events for duplicate status.
    /// </summary>
    /// <param name="scopeKey">Scope identifier.</param>
    /// <param name="eventKeys">Event identifiers to check.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Set of event keys that have already been processed.</returns>
    Task<IReadOnlySet<string>> GetProcessedAsync(string scopeKey, IEnumerable<string> eventKeys, CancellationToken cancellationToken);

    /// <summary>
    /// Marks an event as processed.
    /// </summary>
    /// <param name="scopeKey">Scope identifier.</param>
    /// <param name="eventKey">Unique event identifier.</param>
    /// <param name="eventTime">Event timestamp.</param>
    /// <param name="batchId">Optional batch/backfill identifier.</param>
    /// <param name="ttl">Time-to-live for the record.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task MarkProcessedAsync(
        string scopeKey,
        string eventKey,
        DateTimeOffset eventTime,
        Guid? batchId,
        TimeSpan ttl,
        CancellationToken cancellationToken);

    /// <summary>
    /// Marks multiple events as processed.
    /// </summary>
    /// <param name="scopeKey">Scope identifier.</param>
    /// <param name="events">Events to mark as processed.</param>
    /// <param name="batchId">Optional batch/backfill identifier.</param>
    /// <param name="ttl">Time-to-live for the records.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task MarkProcessedBatchAsync(
        string scopeKey,
        IEnumerable<ProcessedEvent> events,
        Guid? batchId,
        TimeSpan ttl,
        CancellationToken cancellationToken);

    /// <summary>
    /// Counts processed events within a time range.
    /// </summary>
    /// <param name="scopeKey">Scope identifier.</param>
    /// <param name="from">Start of time range.</param>
    /// <param name="to">End of time range.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Count of processed events.</returns>
    Task<long> CountProcessedAsync(string scopeKey, DateTimeOffset from, DateTimeOffset to, CancellationToken cancellationToken);

    /// <summary>
    /// Removes expired records (cleanup).
    /// </summary>
    /// <param name="batchLimit">Maximum records to remove per call.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Number of records removed.</returns>
    Task<int> CleanupExpiredAsync(int batchLimit, CancellationToken cancellationToken);
}
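
// --- Illustrative usage (not part of the original commit) ---
// Minimal sketch of the check-then-mark pattern this interface supports.
// The scope key format and the handler delegate are hypothetical.
public static class DuplicateSuppressionExample
{
    public static async Task<bool> ProcessOnceAsync(
        IDuplicateSuppressor suppressor,
        string scopeKey,
        string eventKey,
        Func<Task> handler,
        CancellationToken ct)
    {
        if (await suppressor.HasProcessedAsync(scopeKey, eventKey, ct))
        {
            return false; // duplicate within its TTL; skip
        }

        await handler();

        // Record with a 30-day TTL, mirroring
        // BackfillManagerOptions.DefaultProcessedEventTtl.
        await suppressor.MarkProcessedAsync(
            scopeKey, eventKey, DateTimeOffset.UtcNow, batchId: null,
            ttl: TimeSpan.FromDays(30), cancellationToken: ct);
        return true;
    }
}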

/// <summary>
/// Event data for duplicate tracking.
/// </summary>
public sealed record ProcessedEvent(
    /// <summary>Unique event identifier.</summary>
    string EventKey,

    /// <summary>Event timestamp.</summary>
    DateTimeOffset EventTime);

/// <summary>
/// In-memory duplicate suppressor for testing.
/// </summary>
public sealed class InMemoryDuplicateSuppressor : IDuplicateSuppressor
{
    private readonly Dictionary<string, Dictionary<string, ProcessedEventEntry>> _store = new();
    private readonly object _lock = new();

    private sealed record ProcessedEventEntry(
        DateTimeOffset EventTime,
        DateTimeOffset ProcessedAt,
        Guid? BatchId,
        DateTimeOffset ExpiresAt);

    public Task<bool> HasProcessedAsync(string scopeKey, string eventKey, CancellationToken cancellationToken)
    {
        lock (_lock)
        {
            if (!_store.TryGetValue(scopeKey, out var scopeStore))
                return Task.FromResult(false);

            if (!scopeStore.TryGetValue(eventKey, out var entry))
                return Task.FromResult(false);

            // Check if expired
            if (entry.ExpiresAt < DateTimeOffset.UtcNow)
            {
                scopeStore.Remove(eventKey);
                return Task.FromResult(false);
            }

            return Task.FromResult(true);
        }
    }

    public Task<IReadOnlySet<string>> GetProcessedAsync(string scopeKey, IEnumerable<string> eventKeys, CancellationToken cancellationToken)
    {
        var now = DateTimeOffset.UtcNow;
        var result = new HashSet<string>();

        lock (_lock)
        {
            if (!_store.TryGetValue(scopeKey, out var scopeStore))
                return Task.FromResult<IReadOnlySet<string>>(result);

            foreach (var eventKey in eventKeys)
            {
                if (scopeStore.TryGetValue(eventKey, out var entry) && entry.ExpiresAt >= now)
                {
                    result.Add(eventKey);
                }
            }
        }

        return Task.FromResult<IReadOnlySet<string>>(result);
    }

    public Task MarkProcessedAsync(
        string scopeKey,
        string eventKey,
        DateTimeOffset eventTime,
        Guid? batchId,
        TimeSpan ttl,
        CancellationToken cancellationToken)
    {
        var now = DateTimeOffset.UtcNow;
        var entry = new ProcessedEventEntry(eventTime, now, batchId, now + ttl);

        lock (_lock)
        {
            if (!_store.TryGetValue(scopeKey, out var scopeStore))
            {
                scopeStore = new Dictionary<string, ProcessedEventEntry>();
                _store[scopeKey] = scopeStore;
            }

            scopeStore[eventKey] = entry;
        }

        return Task.CompletedTask;
    }

    public Task MarkProcessedBatchAsync(
        string scopeKey,
        IEnumerable<ProcessedEvent> events,
        Guid? batchId,
        TimeSpan ttl,
        CancellationToken cancellationToken)
    {
        var now = DateTimeOffset.UtcNow;
        var expiresAt = now + ttl;

        lock (_lock)
        {
            if (!_store.TryGetValue(scopeKey, out var scopeStore))
            {
                scopeStore = new Dictionary<string, ProcessedEventEntry>();
                _store[scopeKey] = scopeStore;
            }

            foreach (var evt in events)
            {
                scopeStore[evt.EventKey] = new ProcessedEventEntry(evt.EventTime, now, batchId, expiresAt);
            }
        }

        return Task.CompletedTask;
    }

    public Task<long> CountProcessedAsync(string scopeKey, DateTimeOffset from, DateTimeOffset to, CancellationToken cancellationToken)
    {
        var now = DateTimeOffset.UtcNow;
        long count = 0;

        lock (_lock)
        {
            if (_store.TryGetValue(scopeKey, out var scopeStore))
            {
                count = scopeStore.Values
                    .Count(e => e.ExpiresAt >= now && e.EventTime >= from && e.EventTime < to);
            }
        }

        return Task.FromResult(count);
    }

    public Task<int> CleanupExpiredAsync(int batchLimit, CancellationToken cancellationToken)
    {
        var now = DateTimeOffset.UtcNow;
        var removed = 0;

        lock (_lock)
        {
            foreach (var scopeStore in _store.Values)
            {
                var expiredKeys = scopeStore
                    .Where(kvp => kvp.Value.ExpiresAt < now)
                    .Take(batchLimit - removed)
                    .Select(kvp => kvp.Key)
                    .ToList();

                foreach (var key in expiredKeys)
                {
                    scopeStore.Remove(key);
                    removed++;
                }

                if (removed >= batchLimit)
                    break;
            }
        }

        return Task.FromResult(removed);
    }
}

/// <summary>
/// Result of filtering events through duplicate suppression.
/// </summary>
public sealed record DuplicateFilterResult<T>(
    /// <summary>Events that should be processed (not duplicates).</summary>
    IReadOnlyList<T> ToProcess,

    /// <summary>Events that were filtered as duplicates.</summary>
    IReadOnlyList<T> Duplicates,

    /// <summary>Total events evaluated.</summary>
    int Total)
{
    /// <summary>
    /// Number of events that passed filtering.
    /// </summary>
    public int ProcessCount => ToProcess.Count;

    /// <summary>
    /// Number of duplicates filtered.
    /// </summary>
    public int DuplicateCount => Duplicates.Count;

    /// <summary>
    /// Duplicate percentage.
    /// </summary>
    public double DuplicatePercent => Total > 0 ? Math.Round((double)DuplicateCount / Total * 100, 2) : 0;
}

/// <summary>
/// Helper methods for duplicate suppression.
/// </summary>
public static class DuplicateSuppressorExtensions
{
    /// <summary>
    /// Filters a batch of events, removing duplicates.
    /// </summary>
    /// <typeparam name="T">Event type.</typeparam>
    /// <param name="suppressor">Duplicate suppressor.</param>
    /// <param name="scopeKey">Scope identifier.</param>
    /// <param name="events">Events to filter.</param>
    /// <param name="keySelector">Function to extract event key.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Filter result with events to process and duplicates.</returns>
    public static async Task<DuplicateFilterResult<T>> FilterAsync<T>(
        this IDuplicateSuppressor suppressor,
        string scopeKey,
        IReadOnlyList<T> events,
        Func<T, string> keySelector,
        CancellationToken cancellationToken)
    {
        if (events.Count == 0)
            return new DuplicateFilterResult<T>([], [], 0);

        var eventKeys = events.Select(keySelector).ToList();
        var processed = await suppressor.GetProcessedAsync(scopeKey, eventKeys, cancellationToken).ConfigureAwait(false);

        var toProcess = new List<T>();
        var duplicates = new List<T>();

        foreach (var evt in events)
        {
            var key = keySelector(evt);
            if (processed.Contains(key))
            {
                duplicates.Add(evt);
            }
            else
            {
                toProcess.Add(evt);
            }
        }

        return new DuplicateFilterResult<T>(toProcess, duplicates, events.Count);
    }
}
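
// --- Illustrative usage (not part of the original commit) ---
// Sketch of batch filtering with FilterAsync. The IncomingEvent record and
// the scope key format are hypothetical stand-ins for real orchestrator events.
public static class DuplicateFilterExample
{
    private sealed record IncomingEvent(string Id, DateTimeOffset OccurredAt);

    public static async Task HandleBatchAsync(
        IDuplicateSuppressor suppressor,
        IReadOnlyList<IncomingEvent> batch,
        CancellationToken ct)
    {
        var result = await suppressor.FilterAsync(
            scopeKey: "source:1234:advisory-sync",   // hypothetical scope key
            events: batch,
            keySelector: e => e.Id,
            cancellationToken: ct);

        // result.ToProcess holds first-seen events; result.Duplicates were
        // already recorded within their TTL.
        Console.WriteLine(
            $"{result.ProcessCount} to process, {result.DuplicateCount} duplicates " +
            $"({result.DuplicatePercent}%).");
    }
}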
@@ -0,0 +1,220 @@
namespace StellaOps.Orchestrator.Core.Backfill;

/// <summary>
/// Represents an event-time window for batch processing.
/// </summary>
public sealed record EventTimeWindow(
    /// <summary>Start of the window (inclusive).</summary>
    DateTimeOffset Start,

    /// <summary>End of the window (exclusive).</summary>
    DateTimeOffset End)
{
    /// <summary>
    /// Duration of the window.
    /// </summary>
    public TimeSpan Duration => End - Start;

    /// <summary>
    /// Whether the window is empty (zero or negative duration).
    /// </summary>
    public bool IsEmpty => End <= Start;

    /// <summary>
    /// Whether a timestamp falls within this window.
    /// </summary>
    public bool Contains(DateTimeOffset timestamp) => timestamp >= Start && timestamp < End;

    /// <summary>
    /// Whether this window overlaps with another.
    /// </summary>
    public bool Overlaps(EventTimeWindow other) =>
        Start < other.End && End > other.Start;

    /// <summary>
    /// Creates the intersection of two windows.
    /// </summary>
    public EventTimeWindow? Intersect(EventTimeWindow other)
    {
        var newStart = Start > other.Start ? Start : other.Start;
        var newEnd = End < other.End ? End : other.End;

        return newEnd > newStart ? new EventTimeWindow(newStart, newEnd) : null;
    }

    /// <summary>
    /// Splits the window into batches of the specified duration.
    /// </summary>
    public IEnumerable<EventTimeWindow> Split(TimeSpan batchDuration)
    {
        if (batchDuration <= TimeSpan.Zero)
            throw new ArgumentOutOfRangeException(nameof(batchDuration), "Batch duration must be positive.");

        var current = Start;
        while (current < End)
        {
            var batchEnd = current + batchDuration;
            if (batchEnd > End)
                batchEnd = End;

            yield return new EventTimeWindow(current, batchEnd);
            current = batchEnd;
        }
    }

    /// <summary>
    /// Creates a window from a duration ending at the specified time.
    /// </summary>
    public static EventTimeWindow FromDuration(DateTimeOffset end, TimeSpan duration) =>
        new(end - duration, end);

    /// <summary>
    /// Creates a window covering the last N hours from now.
    /// </summary>
    public static EventTimeWindow LastHours(int hours, DateTimeOffset? now = null)
    {
        var endTime = now ?? DateTimeOffset.UtcNow;
        return FromDuration(endTime, TimeSpan.FromHours(hours));
    }

    /// <summary>
    /// Creates a window covering the last N days from now.
    /// </summary>
    public static EventTimeWindow LastDays(int days, DateTimeOffset? now = null)
    {
        var endTime = now ?? DateTimeOffset.UtcNow;
        return FromDuration(endTime, TimeSpan.FromDays(days));
    }
}
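
// --- Illustrative usage (not part of the original commit) ---
// Sketch showing Split and Overlaps on concrete windows: a 24-hour window
// split into hourly batches yields 24 contiguous half-open [Start, End)
// sub-windows.
public static class EventTimeWindowExample
{
    public static void Demo()
    {
        var window = EventTimeWindow.LastHours(24);

        foreach (var batch in window.Split(TimeSpan.FromHours(1)))
        {
            // Each batch is contiguous with the next: batch.End == next.Start.
            Console.WriteLine($"{batch.Start:O} -> {batch.End:O}");
        }

        // Overlap uses half-open semantics, so windows that merely touch at
        // a boundary do not overlap.
        var t0 = new DateTimeOffset(2024, 1, 1, 0, 0, 0, TimeSpan.Zero);
        var a = new EventTimeWindow(t0, t0.AddHours(1));
        var b = new EventTimeWindow(t0.AddHours(1), t0.AddHours(2));
        Console.WriteLine(a.Overlaps(b)); // false: [0,1) and [1,2) only touch
    }
}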

/// <summary>
/// Configuration for event-time window computation.
/// </summary>
public sealed record EventTimeWindowOptions(
    /// <summary>Minimum window size (prevents too-small batches).</summary>
    TimeSpan MinWindowSize,

    /// <summary>Maximum window size (prevents too-large batches).</summary>
    TimeSpan MaxWindowSize,

    /// <summary>Overlap with previous window for late-arriving events.</summary>
    TimeSpan OverlapDuration,

    /// <summary>Maximum lag allowed before triggering alerts.</summary>
    TimeSpan MaxLag,

    /// <summary>Default lookback for initial fetch when no watermark exists.</summary>
    TimeSpan InitialLookback)
{
    /// <summary>
    /// Default options for hourly batching.
    /// </summary>
    public static EventTimeWindowOptions HourlyBatches => new(
        MinWindowSize: TimeSpan.FromMinutes(5),
        MaxWindowSize: TimeSpan.FromHours(1),
        OverlapDuration: TimeSpan.FromMinutes(5),
        MaxLag: TimeSpan.FromHours(2),
        InitialLookback: TimeSpan.FromDays(7));

    /// <summary>
    /// Default options for daily batching.
    /// </summary>
    public static EventTimeWindowOptions DailyBatches => new(
        MinWindowSize: TimeSpan.FromHours(1),
        MaxWindowSize: TimeSpan.FromDays(1),
        OverlapDuration: TimeSpan.FromHours(1),
        MaxLag: TimeSpan.FromDays(1),
        InitialLookback: TimeSpan.FromDays(30));
}

/// <summary>
/// Computes event-time windows for incremental processing.
/// </summary>
public static class EventTimeWindowPlanner
{
    /// <summary>
    /// Computes the next window to process based on the current watermark.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <param name="highWatermark">Current high watermark (null for initial fetch).</param>
    /// <param name="options">Window configuration options.</param>
    /// <returns>The next window to process, or null if caught up.</returns>
    public static EventTimeWindow? GetNextWindow(
        DateTimeOffset now,
        DateTimeOffset? highWatermark,
        EventTimeWindowOptions options)
    {
        DateTimeOffset windowStart;

        if (highWatermark is null)
        {
            // Initial fetch: start from initial lookback
            windowStart = now - options.InitialLookback;
        }
        else
        {
            // Incremental fetch: start from watermark minus overlap
            windowStart = highWatermark.Value - options.OverlapDuration;

            // If we're caught up (watermark + min window > now), no work needed
            if (highWatermark.Value + options.MinWindowSize > now)
            {
                return null;
            }
        }

        // Calculate window end (at most now, at most max window from start)
        var windowEnd = windowStart + options.MaxWindowSize;
        if (windowEnd > now)
        {
            windowEnd = now;
        }

        // Ensure minimum window size
        if (windowEnd - windowStart < options.MinWindowSize)
        {
            // If window would be too small, extend end (but not past now)
            windowEnd = windowStart + options.MinWindowSize;
            if (windowEnd > now)
            {
                return null; // Not enough data accumulated yet
            }
        }

        return new EventTimeWindow(windowStart, windowEnd);
    }

    /// <summary>
    /// Calculates the current lag from the high watermark.
    /// </summary>
    public static TimeSpan CalculateLag(DateTimeOffset now, DateTimeOffset highWatermark) =>
        now - highWatermark;

    /// <summary>
    /// Determines if the lag exceeds the maximum allowed.
    /// </summary>
    public static bool IsLagging(DateTimeOffset now, DateTimeOffset highWatermark, EventTimeWindowOptions options) =>
        CalculateLag(now, highWatermark) > options.MaxLag;

    /// <summary>
    /// Estimates the number of windows needed to catch up.
    /// </summary>
    public static int EstimateWindowsToProcess(
        DateTimeOffset now,
        DateTimeOffset? highWatermark,
        EventTimeWindowOptions options)
    {
        if (highWatermark is null)
        {
            // Initial fetch
            var totalDuration = options.InitialLookback;
            return (int)Math.Ceiling(totalDuration / options.MaxWindowSize);
        }

        var lag = CalculateLag(now, highWatermark.Value);
        if (lag <= options.MinWindowSize)
            return 0;

        return (int)Math.Ceiling(lag / options.MaxWindowSize);
    }
}
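
// --- Illustrative usage (not part of the original commit) ---
// Sketch of the incremental catch-up loop GetNextWindow is designed for,
// assuming hourly batching. The fetchAndProcessAsync delegate and the way
// the watermark is advanced are hypothetical; only the planner calls above
// are from this file.
public static class CatchUpLoopExample
{
    public static async Task RunAsync(
        Func<EventTimeWindow, Task> fetchAndProcessAsync,
        DateTimeOffset? highWatermark)
    {
        var options = EventTimeWindowOptions.HourlyBatches;

        while (true)
        {
            var window = EventTimeWindowPlanner.GetNextWindow(
                DateTimeOffset.UtcNow, highWatermark, options);

            if (window is null)
            {
                break; // caught up: less than MinWindowSize of new data
            }

            await fetchAndProcessAsync(window);

            // Advance the watermark to the window end; the next window will
            // re-read OverlapDuration of it to pick up late-arriving events.
            highWatermark = window.End;
        }
    }
}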
@@ -0,0 +1,502 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.DeadLetter;

/// <summary>
/// Notification channel types.
/// </summary>
public enum NotificationChannel
{
    Email,
    Slack,
    Teams,
    Webhook,
    PagerDuty
}

/// <summary>
/// Notification rule for dead-letter events.
/// </summary>
public sealed record NotificationRule(
    Guid RuleId,
    string TenantId,
    string? JobTypePattern,
    string? ErrorCodePattern,
    ErrorCategory? Category,
    Guid? SourceId,
    bool Enabled,
    NotificationChannel Channel,
    string Endpoint,
    int CooldownMinutes,
    int MaxPerHour,
    bool Aggregate,
    DateTimeOffset? LastNotifiedAt,
    int NotificationsSent,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt,
    string CreatedBy,
    string UpdatedBy)
{
    /// <summary>Creates a new notification rule.</summary>
    public static NotificationRule Create(
        string tenantId,
        NotificationChannel channel,
        string endpoint,
        string createdBy,
        string? jobTypePattern = null,
        string? errorCodePattern = null,
        ErrorCategory? category = null,
        Guid? sourceId = null,
        int cooldownMinutes = 15,
        int maxPerHour = 10,
        bool aggregate = true)
    {
        var now = DateTimeOffset.UtcNow;
        return new NotificationRule(
            RuleId: Guid.NewGuid(),
            TenantId: tenantId,
            JobTypePattern: jobTypePattern,
            ErrorCodePattern: errorCodePattern,
            Category: category,
            SourceId: sourceId,
            Enabled: true,
            Channel: channel,
            Endpoint: endpoint,
            CooldownMinutes: cooldownMinutes,
            MaxPerHour: maxPerHour,
            Aggregate: aggregate,
            LastNotifiedAt: null,
            NotificationsSent: 0,
            CreatedAt: now,
            UpdatedAt: now,
            CreatedBy: createdBy,
            UpdatedBy: createdBy);
    }

    /// <summary>Checks if this rule matches the given entry.</summary>
    public bool Matches(DeadLetterEntry entry)
    {
        if (!Enabled) return false;

        if (SourceId.HasValue && entry.SourceId != SourceId.Value) return false;
        if (Category.HasValue && entry.Category != Category.Value) return false;

        if (!string.IsNullOrEmpty(JobTypePattern))
        {
            if (!System.Text.RegularExpressions.Regex.IsMatch(entry.JobType, JobTypePattern))
                return false;
        }

        if (!string.IsNullOrEmpty(ErrorCodePattern))
        {
            if (!System.Text.RegularExpressions.Regex.IsMatch(entry.ErrorCode, ErrorCodePattern))
                return false;
        }

        return true;
    }

    /// <summary>Checks if this rule is within rate limits.</summary>
    public bool CanNotify(DateTimeOffset now, int notificationsSentThisHour)
    {
        if (!Enabled) return false;

        if (notificationsSentThisHour >= MaxPerHour) return false;

        if (LastNotifiedAt.HasValue)
        {
            var elapsed = now - LastNotifiedAt.Value;
            if (elapsed < TimeSpan.FromMinutes(CooldownMinutes))
                return false;
        }

        return true;
    }

    /// <summary>Records a notification sent.</summary>
    public NotificationRule RecordNotification(DateTimeOffset now) =>
        this with
        {
            LastNotifiedAt = now,
            NotificationsSent = NotificationsSent + 1,
            UpdatedAt = now
        };
}
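
// --- Illustrative usage (not part of the original commit) ---
// Sketch of creating a rule and checking it against an entry. The webhook
// endpoint and the pattern values are hypothetical; Matches applies the
// patterns as regular expressions over entry.JobType and entry.ErrorCode.
public static class NotificationRuleExample
{
    public static bool WouldNotify(DeadLetterEntry entry)
    {
        var rule = NotificationRule.Create(
            tenantId: entry.TenantId,
            channel: NotificationChannel.Slack,
            endpoint: "https://hooks.slack.com/services/T000/B000/XXXX", // hypothetical
            createdBy: "ops@example.com",
            jobTypePattern: "^advisory-",      // regex over entry.JobType
            errorCodePattern: "^ORCH-TRN-",    // regex over entry.ErrorCode
            cooldownMinutes: 30,
            maxPerHour: 4);

        // A fresh rule has no LastNotifiedAt, so only the hourly budget applies.
        return rule.Matches(entry)
            && rule.CanNotify(DateTimeOffset.UtcNow, notificationsSentThisHour: 0);
    }
}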

/// <summary>
/// Notification log entry.
/// </summary>
public sealed record NotificationLogEntry(
    Guid LogId,
    string TenantId,
    Guid RuleId,
    IReadOnlyList<Guid> EntryIds,
    NotificationChannel Channel,
    string Endpoint,
    bool Success,
    string? ErrorMessage,
    string? Subject,
    int EntryCount,
    DateTimeOffset SentAt);

/// <summary>
/// Notification payload for dead-letter events.
/// </summary>
public sealed record DeadLetterNotificationPayload(
    string TenantId,
    string EventType,
    IReadOnlyList<DeadLetterEntrySummary> Entries,
    DeadLetterStatsSnapshot? Stats,
    DateTimeOffset Timestamp,
    string? ActionUrl);

/// <summary>
/// Summary of a dead-letter entry for notifications.
/// </summary>
public sealed record DeadLetterEntrySummary(
    Guid EntryId,
    Guid OriginalJobId,
    string JobType,
    string ErrorCode,
    ErrorCategory Category,
    string FailureReason,
    string? RemediationHint,
    bool IsRetryable,
    int ReplayAttempts,
    DateTimeOffset FailedAt);

/// <summary>
/// Stats snapshot for notifications.
/// </summary>
public sealed record DeadLetterStatsSnapshot(
    long PendingCount,
    long RetryableCount,
    long ExhaustedCount);

/// <summary>
/// Interface for dead-letter event notifications.
/// </summary>
public interface IDeadLetterNotifier
{
    /// <summary>Notifies when a new entry is added to dead-letter store.</summary>
    Task NotifyNewEntryAsync(
        DeadLetterEntry entry,
        CancellationToken cancellationToken);

    /// <summary>Notifies when an entry is successfully replayed.</summary>
    Task NotifyReplaySuccessAsync(
        DeadLetterEntry entry,
        Guid newJobId,
        CancellationToken cancellationToken);

    /// <summary>Notifies when an entry exhausts all replay attempts.</summary>
    Task NotifyExhaustedAsync(
        DeadLetterEntry entry,
        CancellationToken cancellationToken);

    /// <summary>Sends aggregated notifications for pending entries.</summary>
    Task SendAggregatedNotificationsAsync(
        string tenantId,
        CancellationToken cancellationToken);
}

/// <summary>
/// Interface for notification delivery.
/// </summary>
public interface INotificationDelivery
{
    /// <summary>Sends a notification to the specified endpoint.</summary>
    Task<bool> SendAsync(
        NotificationChannel channel,
        string endpoint,
        DeadLetterNotificationPayload payload,
        CancellationToken cancellationToken);
}

/// <summary>
/// Repository for notification rules.
/// </summary>
public interface INotificationRuleRepository
{
    Task<NotificationRule?> GetByIdAsync(string tenantId, Guid ruleId, CancellationToken cancellationToken);
    Task<IReadOnlyList<NotificationRule>> ListAsync(string tenantId, bool enabledOnly, CancellationToken cancellationToken);
    Task<IReadOnlyList<NotificationRule>> GetMatchingRulesAsync(string tenantId, DeadLetterEntry entry, CancellationToken cancellationToken);
    Task CreateAsync(NotificationRule rule, CancellationToken cancellationToken);
    Task<bool> UpdateAsync(NotificationRule rule, CancellationToken cancellationToken);
    Task<bool> DeleteAsync(string tenantId, Guid ruleId, CancellationToken cancellationToken);
    Task<int> GetNotificationCountThisHourAsync(string tenantId, Guid ruleId, CancellationToken cancellationToken);
    Task LogNotificationAsync(NotificationLogEntry log, CancellationToken cancellationToken);
}

/// <summary>
/// Default dead-letter notifier implementation.
/// </summary>
public sealed class DeadLetterNotifier : IDeadLetterNotifier
{
    private readonly INotificationRuleRepository _ruleRepository;
    private readonly IDeadLetterRepository _deadLetterRepository;
    private readonly INotificationDelivery _delivery;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<DeadLetterNotifier> _logger;

    public DeadLetterNotifier(
        INotificationRuleRepository ruleRepository,
        IDeadLetterRepository deadLetterRepository,
        INotificationDelivery delivery,
        TimeProvider timeProvider,
        ILogger<DeadLetterNotifier> logger)
    {
        _ruleRepository = ruleRepository ?? throw new ArgumentNullException(nameof(ruleRepository));
        _deadLetterRepository = deadLetterRepository ?? throw new ArgumentNullException(nameof(deadLetterRepository));
        _delivery = delivery ?? throw new ArgumentNullException(nameof(delivery));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task NotifyNewEntryAsync(
        DeadLetterEntry entry,
        CancellationToken cancellationToken)
    {
        var rules = await _ruleRepository.GetMatchingRulesAsync(entry.TenantId, entry, cancellationToken)
            .ConfigureAwait(false);

        var now = _timeProvider.GetUtcNow();

        foreach (var rule in rules)
        {
            if (rule.Aggregate)
            {
                // Skip immediate notification for aggregated rules
                continue;
            }

            var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
                entry.TenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);

            if (!rule.CanNotify(now, notificationsThisHour))
            {
                continue;
            }

            await SendNotificationAsync(rule, "new_entry", [entry], null, cancellationToken)
                .ConfigureAwait(false);
        }
    }

    public async Task NotifyReplaySuccessAsync(
        DeadLetterEntry entry,
        Guid newJobId,
        CancellationToken cancellationToken)
    {
        var rules = await _ruleRepository.GetMatchingRulesAsync(entry.TenantId, entry, cancellationToken)
            .ConfigureAwait(false);

        var now = _timeProvider.GetUtcNow();

        foreach (var rule in rules)
        {
            var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
                entry.TenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);

            if (!rule.CanNotify(now, notificationsThisHour))
            {
                continue;
            }

            // Route through SendNotificationAsync so replay-success notifications
            // get the same exception handling and rate-limit bookkeeping
            // (RecordNotification) as the other event types, instead of the
            // previous inline send that skipped both.
            await SendNotificationAsync(rule, "replay_success", [entry], null, cancellationToken)
                .ConfigureAwait(false);
        }
    }

    public async Task NotifyExhaustedAsync(
        DeadLetterEntry entry,
        CancellationToken cancellationToken)
    {
        var rules = await _ruleRepository.GetMatchingRulesAsync(entry.TenantId, entry, cancellationToken)
            .ConfigureAwait(false);

        var now = _timeProvider.GetUtcNow();

        foreach (var rule in rules)
        {
            var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
                entry.TenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);

            if (!rule.CanNotify(now, notificationsThisHour))
            {
                continue;
            }

            await SendNotificationAsync(rule, "exhausted", [entry], null, cancellationToken)
                .ConfigureAwait(false);
        }
    }

    public async Task SendAggregatedNotificationsAsync(
        string tenantId,
        CancellationToken cancellationToken)
    {
        var rules = await _ruleRepository.ListAsync(tenantId, enabledOnly: true, cancellationToken)
            .ConfigureAwait(false);

        var now = _timeProvider.GetUtcNow();
        var stats = await _deadLetterRepository.GetStatsAsync(tenantId, cancellationToken).ConfigureAwait(false);

        foreach (var rule in rules.Where(r => r.Aggregate))
        {
            var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
                tenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);

            if (!rule.CanNotify(now, notificationsThisHour))
            {
                continue;
            }

            // Get pending entries matching this rule
            var options = new DeadLetterListOptions(
                Status: DeadLetterStatus.Pending,
                Category: rule.Category,
                Limit: 10);

            var entries = await _deadLetterRepository.ListAsync(tenantId, options, cancellationToken)
                .ConfigureAwait(false);

            // Filter to only matching entries
            var matchingEntries = entries.Where(e => rule.Matches(e)).ToList();

            if (matchingEntries.Count == 0)
            {
                continue;
            }

            var statsSnapshot = new DeadLetterStatsSnapshot(
                PendingCount: stats.PendingEntries,
                RetryableCount: stats.RetryableEntries,
                ExhaustedCount: stats.ExhaustedEntries);

            await SendNotificationAsync(rule, "aggregated", matchingEntries, statsSnapshot, cancellationToken)
                .ConfigureAwait(false);
        }
    }

    private async Task SendNotificationAsync(
        NotificationRule rule,
        string eventType,
        IReadOnlyList<DeadLetterEntry> entries,
        DeadLetterStatsSnapshot? stats,
        CancellationToken cancellationToken)
    {
        var now = _timeProvider.GetUtcNow();

        var payload = new DeadLetterNotificationPayload(
            TenantId: rule.TenantId,
            EventType: eventType,
            Entries: entries.Select(ToSummary).ToList(),
            Stats: stats,
            Timestamp: now,
            ActionUrl: null);

        string? errorMessage = null;
        bool success;

        try
        {
            success = await _delivery.SendAsync(rule.Channel, rule.Endpoint, payload, cancellationToken)
                .ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            success = false;
            errorMessage = ex.Message;
            _logger.LogError(ex, "Failed to send {EventType} notification for rule {RuleId}", eventType, rule.RuleId);
        }

        await LogNotificationAsync(rule, entries.Select(e => e.EntryId).ToList(), success, errorMessage, cancellationToken)
            .ConfigureAwait(false);

        if (success)
        {
            var updatedRule = rule.RecordNotification(now);
            await _ruleRepository.UpdateAsync(updatedRule, cancellationToken).ConfigureAwait(false);
            _logger.LogInformation(
                "Dead-letter notification sent: tenant={TenantId}, channel={Channel}, eventType={EventType}",
                rule.TenantId, rule.Channel, eventType);
        }
        else
        {
            _logger.LogWarning(
                "Dead-letter notification failed: tenant={TenantId}, channel={Channel}, eventType={EventType}",
                rule.TenantId, rule.Channel, eventType);
        }
    }

    private async Task LogNotificationAsync(
        NotificationRule rule,
        IReadOnlyList<Guid> entryIds,
        bool success,
        string? errorMessage,
        CancellationToken cancellationToken)
    {
        var log = new NotificationLogEntry(
            LogId: Guid.NewGuid(),
            TenantId: rule.TenantId,
            RuleId: rule.RuleId,
            EntryIds: entryIds,
            Channel: rule.Channel,
            Endpoint: rule.Endpoint,
            Success: success,
            ErrorMessage: errorMessage,
            Subject: null,
            EntryCount: entryIds.Count,
            SentAt: _timeProvider.GetUtcNow());

        await _ruleRepository.LogNotificationAsync(log, cancellationToken).ConfigureAwait(false);
    }

    private static DeadLetterEntrySummary ToSummary(DeadLetterEntry entry) =>
        new(
            EntryId: entry.EntryId,
            OriginalJobId: entry.OriginalJobId,
            JobType: entry.JobType,
            ErrorCode: entry.ErrorCode,
            Category: entry.Category,
            FailureReason: entry.FailureReason,
            RemediationHint: entry.RemediationHint,
            IsRetryable: entry.IsRetryable,
            ReplayAttempts: entry.ReplayAttempts,
            FailedAt: entry.FailedAt);
}

/// <summary>
/// No-op notifier for when notifications are disabled.
/// </summary>
public sealed class NullDeadLetterNotifier : IDeadLetterNotifier
{
    public static readonly NullDeadLetterNotifier Instance = new();

    private NullDeadLetterNotifier() { }

    public Task NotifyNewEntryAsync(DeadLetterEntry entry, CancellationToken cancellationToken) =>
        Task.CompletedTask;

    public Task NotifyReplaySuccessAsync(DeadLetterEntry entry, Guid newJobId, CancellationToken cancellationToken) =>
        Task.CompletedTask;

    public Task NotifyExhaustedAsync(DeadLetterEntry entry, CancellationToken cancellationToken) =>
        Task.CompletedTask;

    public Task SendAggregatedNotificationsAsync(string tenantId, CancellationToken cancellationToken) =>
        Task.CompletedTask;
}
@@ -0,0 +1,578 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.DeadLetter;

/// <summary>
/// Represents a classified error with remediation guidance.
/// </summary>
public sealed record ClassifiedError(
    /// <summary>Error code (e.g., "ORCH-TRN-001").</summary>
    string ErrorCode,

    /// <summary>Error category.</summary>
    ErrorCategory Category,

    /// <summary>Human-readable description.</summary>
    string Description,

    /// <summary>Remediation hint for operators.</summary>
    string RemediationHint,

    /// <summary>Whether this error is potentially retryable.</summary>
    bool IsRetryable,

    /// <summary>Suggested retry delay if retryable.</summary>
    TimeSpan? SuggestedRetryDelay);

/// <summary>
/// Classifies errors and provides remediation hints.
/// </summary>
public interface IErrorClassifier
{
    /// <summary>Classifies an exception into a categorized error.</summary>
    ClassifiedError Classify(Exception exception);

    /// <summary>Classifies an error code and message.</summary>
    ClassifiedError Classify(string errorCode, string message);

    /// <summary>Classifies based on HTTP status code and message.</summary>
    ClassifiedError ClassifyHttpError(int statusCode, string? message);
}

/// <summary>
/// Default error classifier with standard error codes and remediation hints.
/// </summary>
public sealed class DefaultErrorClassifier : IErrorClassifier
{
    /// <summary>Known error codes with classifications.</summary>
    public static class ErrorCodes
    {
        // Transient errors (ORCH-TRN-xxx)
        public const string NetworkTimeout = "ORCH-TRN-001";
        public const string ConnectionRefused = "ORCH-TRN-002";
        public const string DnsResolutionFailed = "ORCH-TRN-003";
        public const string ServiceUnavailable = "ORCH-TRN-004";
        public const string GatewayTimeout = "ORCH-TRN-005";
        public const string TemporaryFailure = "ORCH-TRN-099";

        // Not found errors (ORCH-NF-xxx)
        public const string ImageNotFound = "ORCH-NF-001";
        public const string SourceNotFound = "ORCH-NF-002";
        public const string RegistryNotFound = "ORCH-NF-003";
        public const string ManifestNotFound = "ORCH-NF-004";
        public const string ResourceNotFound = "ORCH-NF-099";

        // Auth errors (ORCH-AUTH-xxx)
        public const string InvalidCredentials = "ORCH-AUTH-001";
        public const string TokenExpired = "ORCH-AUTH-002";
        public const string InsufficientPermissions = "ORCH-AUTH-003";
        public const string CertificateError = "ORCH-AUTH-004";
        public const string AuthenticationFailed = "ORCH-AUTH-099";

        // Rate limit errors (ORCH-RL-xxx)
        public const string RateLimited = "ORCH-RL-001";
        public const string QuotaExceeded = "ORCH-RL-002";
        public const string ConcurrencyLimitReached = "ORCH-RL-003";
        public const string ThrottlingError = "ORCH-RL-099";

        // Validation errors (ORCH-VAL-xxx)
        public const string InvalidPayload = "ORCH-VAL-001";
        public const string InvalidConfiguration = "ORCH-VAL-002";
        public const string SchemaValidationFailed = "ORCH-VAL-003";
        public const string MissingRequiredField = "ORCH-VAL-004";
        public const string ValidationFailed = "ORCH-VAL-099";

        // Upstream errors (ORCH-UP-xxx)
        public const string RegistryError = "ORCH-UP-001";
        public const string AdvisoryFeedError = "ORCH-UP-002";
        public const string DatabaseError = "ORCH-UP-003";
        public const string ExternalServiceError = "ORCH-UP-099";

        // Internal errors (ORCH-INT-xxx)
        public const string InternalError = "ORCH-INT-001";
        public const string StateCorruption = "ORCH-INT-002";
        public const string ProcessingError = "ORCH-INT-003";
        public const string UnexpectedError = "ORCH-INT-099";

        // Conflict errors (ORCH-CON-xxx)
        public const string DuplicateJob = "ORCH-CON-001";
        public const string VersionMismatch = "ORCH-CON-002";
        public const string ConcurrentModification = "ORCH-CON-003";
        public const string ConflictError = "ORCH-CON-099";

        // Canceled errors (ORCH-CAN-xxx)
        public const string UserCanceled = "ORCH-CAN-001";
        public const string SystemCanceled = "ORCH-CAN-002";
        public const string TimeoutCanceled = "ORCH-CAN-003";
        public const string OperationCanceled = "ORCH-CAN-099";
    }

    private static readonly Dictionary<string, ClassifiedError> KnownErrors = new()
    {
        // Transient errors
        [ErrorCodes.NetworkTimeout] = new(
            ErrorCodes.NetworkTimeout,
            ErrorCategory.Transient,
            "Network operation timed out",
            "Check network connectivity and firewall rules. If the target service is healthy, increase timeout settings.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(1)),

        [ErrorCodes.ConnectionRefused] = new(
            ErrorCodes.ConnectionRefused,
            ErrorCategory.Transient,
            "Connection refused by target host",
            "Verify the target service is running and accessible. Check firewall rules and network policies.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(2)),

        [ErrorCodes.DnsResolutionFailed] = new(
            ErrorCodes.DnsResolutionFailed,
            ErrorCategory.Transient,
            "DNS resolution failed",
            "Verify the hostname is correct. Check DNS server configuration and network connectivity.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(1)),

        [ErrorCodes.ServiceUnavailable] = new(
            ErrorCodes.ServiceUnavailable,
            ErrorCategory.Transient,
            "Service temporarily unavailable (503)",
            "The target service is temporarily overloaded or under maintenance. Retry with exponential backoff.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(5)),

        [ErrorCodes.GatewayTimeout] = new(
            ErrorCodes.GatewayTimeout,
            ErrorCategory.Transient,
            "Gateway timeout (504)",
            "An upstream service took too long to respond. This is typically transient; retry with backoff.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(2)),

        [ErrorCodes.TemporaryFailure] = new(
            ErrorCodes.TemporaryFailure,
            ErrorCategory.Transient,
            "Temporary failure",
            "A transient error occurred. Retry the operation after a brief delay.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(1)),

        // Not found errors
        [ErrorCodes.ImageNotFound] = new(
            ErrorCodes.ImageNotFound,
            ErrorCategory.NotFound,
            "Container image not found",
            "Verify the image reference is correct (repository, tag, digest). Check registry access and that the image exists.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.SourceNotFound] = new(
            ErrorCodes.SourceNotFound,
            ErrorCategory.NotFound,
            "Source configuration not found",
            "The referenced source may have been deleted. Verify the source ID and recreate if necessary.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.RegistryNotFound] = new(
            ErrorCodes.RegistryNotFound,
            ErrorCategory.NotFound,
            "Container registry not found",
            "Verify the registry URL is correct. Check DNS resolution and that the registry is operational.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.ManifestNotFound] = new(
            ErrorCodes.ManifestNotFound,
            ErrorCategory.NotFound,
            "Image manifest not found",
            "The image exists but the manifest is missing. The image may have been deleted or the tag moved.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.ResourceNotFound] = new(
            ErrorCodes.ResourceNotFound,
            ErrorCategory.NotFound,
            "Resource not found",
            "The requested resource does not exist. Verify the resource identifier is correct.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        // Auth errors
        [ErrorCodes.InvalidCredentials] = new(
            ErrorCodes.InvalidCredentials,
            ErrorCategory.AuthFailure,
            "Invalid credentials",
            "The provided credentials are invalid. Update the registry credentials in the source configuration.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.TokenExpired] = new(
            ErrorCodes.TokenExpired,
            ErrorCategory.AuthFailure,
            "Authentication token expired",
            "The authentication token has expired. Refresh credentials or re-authenticate to obtain a new token.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(1)),

        [ErrorCodes.InsufficientPermissions] = new(
            ErrorCodes.InsufficientPermissions,
            ErrorCategory.AuthFailure,
            "Insufficient permissions",
            "The authenticated user lacks required permissions. Request access from the registry administrator.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.CertificateError] = new(
            ErrorCodes.CertificateError,
            ErrorCategory.AuthFailure,
            "TLS certificate error",
            "Certificate validation failed. Verify the CA bundle or add the registry's certificate to trusted roots.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.AuthenticationFailed] = new(
            ErrorCodes.AuthenticationFailed,
            ErrorCategory.AuthFailure,
            "Authentication failed",
            "Unable to authenticate with the target service. Verify credentials and authentication configuration.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        // Rate limit errors
        [ErrorCodes.RateLimited] = new(
            ErrorCodes.RateLimited,
            ErrorCategory.RateLimited,
            "Rate limit exceeded (429)",
            "Request rate limit exceeded. Reduce request frequency or upgrade service tier. Will auto-retry with backoff.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(5)),

        [ErrorCodes.QuotaExceeded] = new(
            ErrorCodes.QuotaExceeded,
            ErrorCategory.RateLimited,
            "Quota exceeded",
            "Usage quota has been exceeded. Wait for quota reset or request quota increase.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromHours(1)),

        [ErrorCodes.ConcurrencyLimitReached] = new(
            ErrorCodes.ConcurrencyLimitReached,
            ErrorCategory.RateLimited,
            "Concurrency limit reached",
            "Maximum concurrent operations limit reached. Reduce parallel operations or increase limit.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(1)),

        [ErrorCodes.ThrottlingError] = new(
            ErrorCodes.ThrottlingError,
            ErrorCategory.RateLimited,
            "Request throttled",
            "Request was throttled due to rate limits. Retry with exponential backoff.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(2)),

        // Validation errors
        [ErrorCodes.InvalidPayload] = new(
            ErrorCodes.InvalidPayload,
            ErrorCategory.ValidationError,
            "Invalid job payload",
            "The job payload is malformed or invalid. Review the payload structure and fix validation errors.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.InvalidConfiguration] = new(
            ErrorCodes.InvalidConfiguration,
            ErrorCategory.ValidationError,
            "Invalid configuration",
            "Source or job configuration is invalid. Review and correct the configuration settings.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.SchemaValidationFailed] = new(
            ErrorCodes.SchemaValidationFailed,
            ErrorCategory.ValidationError,
            "Schema validation failed",
            "Input data failed schema validation. Ensure data conforms to the expected schema.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.MissingRequiredField] = new(
            ErrorCodes.MissingRequiredField,
            ErrorCategory.ValidationError,
            "Missing required field",
            "A required field is missing from the input. Provide all required fields.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.ValidationFailed] = new(
            ErrorCodes.ValidationFailed,
            ErrorCategory.ValidationError,
            "Validation failed",
            "Input validation failed. Review the error details and correct the input.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        // Upstream errors
        [ErrorCodes.RegistryError] = new(
            ErrorCodes.RegistryError,
            ErrorCategory.UpstreamError,
            "Container registry error",
            "The container registry returned an error. Check registry status and logs for details.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(5)),

        [ErrorCodes.AdvisoryFeedError] = new(
            ErrorCodes.AdvisoryFeedError,
            ErrorCategory.UpstreamError,
            "Advisory feed error",
            "Error fetching from advisory feed. Check feed URL and authentication. May be temporary.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(15)),

        [ErrorCodes.DatabaseError] = new(
            ErrorCodes.DatabaseError,
            ErrorCategory.UpstreamError,
            "Database error",
            "Database operation failed. Check database connectivity and status.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(1)),

        [ErrorCodes.ExternalServiceError] = new(
            ErrorCodes.ExternalServiceError,
            ErrorCategory.UpstreamError,
            "External service error",
            "An external service dependency failed. Check service status and connectivity.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(5)),

        // Internal errors
        [ErrorCodes.InternalError] = new(
            ErrorCodes.InternalError,
            ErrorCategory.InternalError,
            "Internal processing error",
            "An internal error occurred. This may indicate a bug. Please report if persistent.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.StateCorruption] = new(
            ErrorCodes.StateCorruption,
            ErrorCategory.InternalError,
            "State corruption detected",
            "Internal state corruption detected. Manual intervention may be required.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.ProcessingError] = new(
            ErrorCodes.ProcessingError,
            ErrorCategory.InternalError,
            "Processing error",
            "Error during job processing. Review job payload and configuration.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.UnexpectedError] = new(
            ErrorCodes.UnexpectedError,
            ErrorCategory.InternalError,
            "Unexpected error",
            "An unexpected error occurred. This may indicate a bug. Please report with error details.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        // Conflict errors
        [ErrorCodes.DuplicateJob] = new(
            ErrorCodes.DuplicateJob,
            ErrorCategory.Conflict,
            "Duplicate job detected",
            "A job with the same idempotency key already exists. This is expected for retry scenarios.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.VersionMismatch] = new(
            ErrorCodes.VersionMismatch,
            ErrorCategory.Conflict,
            "Version mismatch",
            "Resource version conflict detected. Refresh and retry the operation.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromSeconds(5)),

        [ErrorCodes.ConcurrentModification] = new(
            ErrorCodes.ConcurrentModification,
            ErrorCategory.Conflict,
            "Concurrent modification",
            "Resource was modified concurrently. Refresh state and retry.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromSeconds(5)),

        [ErrorCodes.ConflictError] = new(
            ErrorCodes.ConflictError,
            ErrorCategory.Conflict,
            "Resource conflict",
            "A resource conflict occurred. Check for concurrent operations.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromSeconds(10)),

        // Canceled errors
        [ErrorCodes.UserCanceled] = new(
            ErrorCodes.UserCanceled,
            ErrorCategory.Canceled,
            "Canceled by user",
            "Operation was canceled by user request. No action required unless retry is desired.",
            IsRetryable: false,
            SuggestedRetryDelay: null),

        [ErrorCodes.SystemCanceled] = new(
            ErrorCodes.SystemCanceled,
            ErrorCategory.Canceled,
            "Canceled by system",
            "Operation was canceled by the system (e.g., shutdown, quota). May be automatically rescheduled.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(5)),

        [ErrorCodes.TimeoutCanceled] = new(
            ErrorCodes.TimeoutCanceled,
            ErrorCategory.Canceled,
            "Canceled due to timeout",
            "Operation exceeded its time limit. Consider increasing timeout or optimizing the operation.",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(2)),

        [ErrorCodes.OperationCanceled] = new(
            ErrorCodes.OperationCanceled,
            ErrorCategory.Canceled,
            "Operation canceled",
            "The operation was canceled. Check cancellation source for details.",
            IsRetryable: false,
            SuggestedRetryDelay: null)
    };

    /// <inheritdoc />
    public ClassifiedError Classify(Exception exception)
    {
        ArgumentNullException.ThrowIfNull(exception);

        return exception switch
        {
            OperationCanceledException => KnownErrors[ErrorCodes.OperationCanceled],
            TimeoutException => KnownErrors[ErrorCodes.NetworkTimeout],
            HttpRequestException httpEx => ClassifyHttpException(httpEx),
            _ when exception.Message.Contains("connection refused", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.ConnectionRefused],
            _ when exception.Message.Contains("DNS", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.DnsResolutionFailed],
            _ when exception.Message.Contains("timeout", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.NetworkTimeout],
            _ when exception.Message.Contains("certificate", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.CertificateError],
            _ when exception.Message.Contains("unauthorized", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.AuthenticationFailed],
            _ when exception.Message.Contains("forbidden", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.InsufficientPermissions],
            _ => new ClassifiedError(
                ErrorCodes.UnexpectedError,
                ErrorCategory.InternalError,
                exception.GetType().Name,
                $"Unexpected error: {exception.Message}. Review stack trace for details.",
                IsRetryable: false,
                SuggestedRetryDelay: null)
        };
    }

    /// <inheritdoc />
    public ClassifiedError Classify(string errorCode, string message)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(errorCode);

        if (KnownErrors.TryGetValue(errorCode, out var known))
        {
            return known;
        }

        // Try to infer from error code prefix
        var category = errorCode switch
        {
            _ when errorCode.StartsWith("ORCH-TRN-", StringComparison.Ordinal) => ErrorCategory.Transient,
            _ when errorCode.StartsWith("ORCH-NF-", StringComparison.Ordinal) => ErrorCategory.NotFound,
            _ when errorCode.StartsWith("ORCH-AUTH-", StringComparison.Ordinal) => ErrorCategory.AuthFailure,
            _ when errorCode.StartsWith("ORCH-RL-", StringComparison.Ordinal) => ErrorCategory.RateLimited,
            _ when errorCode.StartsWith("ORCH-VAL-", StringComparison.Ordinal) => ErrorCategory.ValidationError,
            _ when errorCode.StartsWith("ORCH-UP-", StringComparison.Ordinal) => ErrorCategory.UpstreamError,
            _ when errorCode.StartsWith("ORCH-INT-", StringComparison.Ordinal) => ErrorCategory.InternalError,
            _ when errorCode.StartsWith("ORCH-CON-", StringComparison.Ordinal) => ErrorCategory.Conflict,
            _ when errorCode.StartsWith("ORCH-CAN-", StringComparison.Ordinal) => ErrorCategory.Canceled,
            _ => ErrorCategory.Unknown
        };

        var isRetryable = category is ErrorCategory.Transient or ErrorCategory.RateLimited or ErrorCategory.UpstreamError;

        return new ClassifiedError(
            errorCode,
            category,
            message,
            "Unknown error code. Review the error message for details.",
            isRetryable,
            isRetryable ? TimeSpan.FromMinutes(5) : null);
    }

    /// <inheritdoc />
    public ClassifiedError ClassifyHttpError(int statusCode, string? message)
    {
        return statusCode switch
        {
            400 => KnownErrors[ErrorCodes.ValidationFailed],
            401 => KnownErrors[ErrorCodes.AuthenticationFailed],
            403 => KnownErrors[ErrorCodes.InsufficientPermissions],
            404 => KnownErrors[ErrorCodes.ResourceNotFound],
            408 => KnownErrors[ErrorCodes.NetworkTimeout],
            409 => KnownErrors[ErrorCodes.ConflictError],
            429 => KnownErrors[ErrorCodes.RateLimited],
            500 => KnownErrors[ErrorCodes.InternalError],
            502 => KnownErrors[ErrorCodes.ExternalServiceError],
            503 => KnownErrors[ErrorCodes.ServiceUnavailable],
            504 => KnownErrors[ErrorCodes.GatewayTimeout],
            _ when statusCode >= 400 && statusCode < 500 => new ClassifiedError(
                $"HTTP-{statusCode}",
                ErrorCategory.ValidationError,
                message ?? $"HTTP {statusCode} error",
                "Client error. Review request parameters.",
                IsRetryable: false,
                SuggestedRetryDelay: null),
            _ when statusCode >= 500 => new ClassifiedError(
                $"HTTP-{statusCode}",
                ErrorCategory.UpstreamError,
                message ?? $"HTTP {statusCode} error",
                "Server error. May be transient; retry with backoff.",
                IsRetryable: true,
                SuggestedRetryDelay: TimeSpan.FromMinutes(2)),
            _ => new ClassifiedError(
                $"HTTP-{statusCode}",
                ErrorCategory.Unknown,
                message ?? $"HTTP {statusCode}",
                "Unexpected HTTP status. Review response for details.",
                IsRetryable: false,
                SuggestedRetryDelay: null)
        };
    }

    private ClassifiedError ClassifyHttpException(HttpRequestException ex)
    {
        if (ex.StatusCode.HasValue)
        {
            return ClassifyHttpError((int)ex.StatusCode.Value, ex.Message);
        }

        // No status code - likely a connection error
        return ex.Message switch
        {
            _ when ex.Message.Contains("connection refused", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.ConnectionRefused],
            _ when ex.Message.Contains("name resolution", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.DnsResolutionFailed],
            _ when ex.Message.Contains("SSL", StringComparison.OrdinalIgnoreCase) ||
                   ex.Message.Contains("TLS", StringComparison.OrdinalIgnoreCase)
                => KnownErrors[ErrorCodes.CertificateError],
            _ => KnownErrors[ErrorCodes.ExternalServiceError]
        };
    }
}
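
// Usage sketch (illustrative, not part of the commit): classifying an HTTP
// status through DefaultErrorClassifier and honoring the suggested retry
// delay. A 429 maps to ErrorCodes.RateLimited (retryable, 5-minute delay);
// a 404 maps to ErrorCodes.ResourceNotFound (not retryable).
public static class ErrorClassifierUsageExample
{
    public static TimeSpan? RetryDelayFor(int statusCode, string? message)
    {
        IErrorClassifier classifier = new DefaultErrorClassifier();
        ClassifiedError error = classifier.ClassifyHttpError(statusCode, message);
        return error.IsRetryable ? error.SuggestedRetryDelay : null;
    }
}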
@@ -0,0 +1,221 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.DeadLetter;

/// <summary>
/// Repository for dead-letter entry persistence.
/// </summary>
public interface IDeadLetterRepository
{
    /// <summary>Gets a dead-letter entry by ID.</summary>
    Task<DeadLetterEntry?> GetByIdAsync(
        string tenantId,
        Guid entryId,
        CancellationToken cancellationToken);

    /// <summary>Gets a dead-letter entry by original job ID.</summary>
    Task<DeadLetterEntry?> GetByOriginalJobIdAsync(
        string tenantId,
        Guid originalJobId,
        CancellationToken cancellationToken);

    /// <summary>Lists dead-letter entries with filtering and pagination.</summary>
    Task<IReadOnlyList<DeadLetterEntry>> ListAsync(
        string tenantId,
        DeadLetterListOptions options,
        CancellationToken cancellationToken);

    /// <summary>Counts dead-letter entries with filtering.</summary>
    Task<long> CountAsync(
        string tenantId,
        DeadLetterListOptions options,
        CancellationToken cancellationToken);

    /// <summary>Creates a new dead-letter entry.</summary>
    Task CreateAsync(
        DeadLetterEntry entry,
        CancellationToken cancellationToken);

    /// <summary>Updates an existing dead-letter entry.</summary>
    Task<bool> UpdateAsync(
        DeadLetterEntry entry,
        CancellationToken cancellationToken);

    /// <summary>Gets entries pending replay that are retryable.</summary>
    Task<IReadOnlyList<DeadLetterEntry>> GetPendingRetryableAsync(
        string tenantId,
        int limit,
        CancellationToken cancellationToken);

    /// <summary>Gets entries by error code.</summary>
    Task<IReadOnlyList<DeadLetterEntry>> GetByErrorCodeAsync(
        string tenantId,
        string errorCode,
        DeadLetterStatus? status,
        int limit,
        CancellationToken cancellationToken);

    /// <summary>Gets entries by category.</summary>
    Task<IReadOnlyList<DeadLetterEntry>> GetByCategoryAsync(
        string tenantId,
        ErrorCategory category,
        DeadLetterStatus? status,
        int limit,
        CancellationToken cancellationToken);

    /// <summary>Gets aggregated statistics.</summary>
    Task<DeadLetterStats> GetStatsAsync(
        string tenantId,
        CancellationToken cancellationToken);

    /// <summary>Gets a summary of actionable entries grouped by error code.</summary>
    Task<IReadOnlyList<DeadLetterSummary>> GetActionableSummaryAsync(
        string tenantId,
        int limit,
        CancellationToken cancellationToken);

    /// <summary>Marks expired entries.</summary>
    Task<int> MarkExpiredAsync(
        int batchLimit,
        CancellationToken cancellationToken);

    /// <summary>Purges old resolved/expired entries.</summary>
    Task<int> PurgeOldEntriesAsync(
        int retentionDays,
        int batchLimit,
        CancellationToken cancellationToken);
}

/// <summary>
/// Options for listing dead-letter entries.
/// </summary>
public sealed record DeadLetterListOptions(
    DeadLetterStatus? Status = null,
    ErrorCategory? Category = null,
    string? JobType = null,
    string? ErrorCode = null,
    Guid? SourceId = null,
    Guid? RunId = null,
    bool? IsRetryable = null,
    DateTimeOffset? CreatedAfter = null,
    DateTimeOffset? CreatedBefore = null,
    string? Cursor = null,
    int Limit = 50,
    bool Ascending = false);

/// <summary>
/// Aggregated dead-letter statistics.
/// </summary>
public sealed record DeadLetterStats(
    long TotalEntries,
    long PendingEntries,
    long ReplayingEntries,
    long ReplayedEntries,
    long ResolvedEntries,
    long ExhaustedEntries,
    long ExpiredEntries,
    long RetryableEntries,
    IReadOnlyDictionary<ErrorCategory, long> ByCategory,
    IReadOnlyDictionary<string, long> TopErrorCodes,
    IReadOnlyDictionary<string, long> TopJobTypes);

/// <summary>
/// Summary of dead-letter entries grouped by error code.
/// </summary>
public sealed record DeadLetterSummary(
    string ErrorCode,
    ErrorCategory Category,
    long EntryCount,
    long RetryableCount,
    DateTimeOffset OldestEntry,
    string? SampleReason);

/// <summary>
/// Repository for replay audit records.
/// </summary>
public interface IReplayAuditRepository
{
    /// <summary>Gets audit records for an entry.</summary>
    Task<IReadOnlyList<ReplayAuditRecord>> GetByEntryAsync(
        string tenantId,
        Guid entryId,
        CancellationToken cancellationToken);

    /// <summary>Gets a specific audit record.</summary>
    Task<ReplayAuditRecord?> GetByIdAsync(
        string tenantId,
        Guid auditId,
        CancellationToken cancellationToken);

    /// <summary>Creates a new audit record.</summary>
    Task CreateAsync(
        ReplayAuditRecord record,
        CancellationToken cancellationToken);

    /// <summary>Updates an audit record (completion).</summary>
    Task<bool> UpdateAsync(
        ReplayAuditRecord record,
        CancellationToken cancellationToken);

    /// <summary>Gets the audit record for a new job ID (to find the replay source).</summary>
    Task<ReplayAuditRecord?> GetByNewJobIdAsync(
        string tenantId,
        Guid newJobId,
        CancellationToken cancellationToken);
}

/// <summary>
/// Replay attempt audit record.
/// </summary>
public sealed record ReplayAuditRecord(
    Guid AuditId,
    string TenantId,
    Guid EntryId,
    int AttemptNumber,
    bool Success,
    Guid? NewJobId,
    string? ErrorMessage,
    string TriggeredBy,
    DateTimeOffset TriggeredAt,
    DateTimeOffset? CompletedAt,
    string InitiatedBy)
{
    /// <summary>Creates a new audit record for a replay attempt.</summary>
    public static ReplayAuditRecord Create(
        string tenantId,
        Guid entryId,
        int attemptNumber,
        string triggeredBy,
        string initiatedBy,
        DateTimeOffset now) =>
        new(
            AuditId: Guid.NewGuid(),
            TenantId: tenantId,
            EntryId: entryId,
            AttemptNumber: attemptNumber,
            Success: false,
            NewJobId: null,
            ErrorMessage: null,
            TriggeredBy: triggeredBy,
            TriggeredAt: now,
            CompletedAt: null,
            InitiatedBy: initiatedBy);

    /// <summary>Marks the replay as successful.</summary>
    public ReplayAuditRecord Complete(Guid newJobId, DateTimeOffset now) =>
        this with
        {
            Success = true,
            NewJobId = newJobId,
            CompletedAt = now
        };

    /// <summary>Marks the replay as failed.</summary>
    public ReplayAuditRecord Fail(string errorMessage, DateTimeOffset now) =>
        this with
        {
            Success = false,
            ErrorMessage = errorMessage,
            CompletedAt = now
        };
}
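
// Usage sketch (illustrative, not part of the commit): the immutable
// ReplayAuditRecord lifecycle. Create() produces a pending record;
// Complete() and Fail() return updated copies via with-expressions,
// leaving the original untouched.
public static class ReplayAuditLifecycleExample
{
    public static ReplayAuditRecord RecordOutcome(
        ReplayAuditRecord pending,
        Guid? newJobId,
        string? errorMessage,
        DateTimeOffset now) =>
        newJobId is { } jobId
            ? pending.Complete(jobId, now)
            : pending.Fail(errorMessage ?? "unknown error", now);
}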
@@ -0,0 +1,472 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.DeadLetter;

/// <summary>
/// Options for replay manager configuration.
/// </summary>
public sealed record ReplayManagerOptions(
    /// <summary>Default maximum replay attempts.</summary>
    int DefaultMaxReplayAttempts = 3,

    /// <summary>Default retention period for dead-letter entries.</summary>
    TimeSpan DefaultRetention = default,

    /// <summary>Minimum delay between replay attempts.</summary>
    TimeSpan MinReplayDelay = default,

    /// <summary>Maximum batch size for bulk operations.</summary>
    int MaxBatchSize = 100,

    /// <summary>Enable automatic replay of retryable entries.</summary>
    bool AutoReplayEnabled = false,

    /// <summary>Delay before automatic replay.</summary>
    TimeSpan AutoReplayDelay = default)
{
    /// <summary>Default options.</summary>
    public static ReplayManagerOptions Default => new(
        DefaultMaxReplayAttempts: 3,
        DefaultRetention: TimeSpan.FromDays(30),
        MinReplayDelay: TimeSpan.FromMinutes(5),
        MaxBatchSize: 100,
        AutoReplayEnabled: false,
        AutoReplayDelay: TimeSpan.FromMinutes(15));
}

/// <summary>
/// Result of a replay operation.
/// </summary>
public sealed record ReplayResult(
    bool Success,
    Guid? NewJobId,
    string? ErrorMessage,
    DeadLetterEntry UpdatedEntry);

/// <summary>
/// Result of a batch replay operation.
/// </summary>
public sealed record BatchReplayResult(
    int Attempted,
    int Succeeded,
    int Failed,
    IReadOnlyList<ReplayResult> Results);

/// <summary>
/// Manages dead-letter entry replay operations.
/// </summary>
public interface IReplayManager
{
    /// <summary>Replays a single dead-letter entry.</summary>
    Task<ReplayResult> ReplayAsync(
        string tenantId,
        Guid entryId,
        string initiatedBy,
        CancellationToken cancellationToken);

    /// <summary>Replays multiple entries by ID.</summary>
    Task<BatchReplayResult> ReplayBatchAsync(
        string tenantId,
        IReadOnlyList<Guid> entryIds,
        string initiatedBy,
        CancellationToken cancellationToken);

    /// <summary>Replays all pending retryable entries matching criteria.</summary>
    Task<BatchReplayResult> ReplayPendingAsync(
        string tenantId,
        string? errorCode,
        ErrorCategory? category,
        int maxCount,
        string initiatedBy,
        CancellationToken cancellationToken);

    /// <summary>Resolves an entry without replay.</summary>
    Task<DeadLetterEntry> ResolveAsync(
        string tenantId,
        Guid entryId,
        string notes,
        string resolvedBy,
        CancellationToken cancellationToken);

    /// <summary>Resolves multiple entries without replay.</summary>
    Task<int> ResolveBatchAsync(
        string tenantId,
        IReadOnlyList<Guid> entryIds,
        string notes,
        string resolvedBy,
        CancellationToken cancellationToken);
}

/// <summary>
/// Job creator interface for replay operations.
/// </summary>
public interface IJobCreator
{
    /// <summary>Creates a new job from a dead-letter entry payload.</summary>
    Task<Job> CreateFromReplayAsync(
        string tenantId,
        string jobType,
        string payload,
        string payloadDigest,
        string idempotencyKey,
        string? correlationId,
        Guid replayOf,
        string createdBy,
        CancellationToken cancellationToken);
}

/// <summary>
/// Default replay manager implementation.
/// </summary>
public sealed class ReplayManager : IReplayManager
{
    private readonly IDeadLetterRepository _deadLetterRepository;
    private readonly IReplayAuditRepository _auditRepository;
    private readonly IJobCreator _jobCreator;
    private readonly IDeadLetterNotifier _notifier;
    private readonly TimeProvider _timeProvider;
    private readonly ReplayManagerOptions _options;
    private readonly ILogger<ReplayManager> _logger;

    public ReplayManager(
        IDeadLetterRepository deadLetterRepository,
        IReplayAuditRepository auditRepository,
        IJobCreator jobCreator,
        IDeadLetterNotifier notifier,
        TimeProvider timeProvider,
        ReplayManagerOptions options,
        ILogger<ReplayManager> logger)
    {
        _deadLetterRepository = deadLetterRepository ?? throw new ArgumentNullException(nameof(deadLetterRepository));
        _auditRepository = auditRepository ?? throw new ArgumentNullException(nameof(auditRepository));
        _jobCreator = jobCreator ?? throw new ArgumentNullException(nameof(jobCreator));
        _notifier = notifier ?? throw new ArgumentNullException(nameof(notifier));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _options = options ?? ReplayManagerOptions.Default;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<ReplayResult> ReplayAsync(
        string tenantId,
        Guid entryId,
        string initiatedBy,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        ArgumentException.ThrowIfNullOrWhiteSpace(initiatedBy);

        var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
            .ConfigureAwait(false);

        if (entry is null)
        {
            throw new InvalidOperationException($"Dead-letter entry {entryId} not found.");
        }

        return await ReplayEntryAsync(entry, "manual", initiatedBy, cancellationToken).ConfigureAwait(false);
    }

    public async Task<BatchReplayResult> ReplayBatchAsync(
        string tenantId,
        IReadOnlyList<Guid> entryIds,
        string initiatedBy,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        ArgumentNullException.ThrowIfNull(entryIds);
        ArgumentException.ThrowIfNullOrWhiteSpace(initiatedBy);

        if (entryIds.Count > _options.MaxBatchSize)
        {
            throw new ArgumentException($"Batch size {entryIds.Count} exceeds maximum {_options.MaxBatchSize}.");
        }

        var results = new List<ReplayResult>();
        var succeeded = 0;
        var failed = 0;

        foreach (var entryId in entryIds)
        {
            try
            {
                var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
                    .ConfigureAwait(false);

                if (entry is null)
                {
                    results.Add(new ReplayResult(
                        Success: false,
                        NewJobId: null,
                        ErrorMessage: $"Entry {entryId} not found.",
                        UpdatedEntry: null!));
                    failed++;
                    continue;
                }

                var result = await ReplayEntryAsync(entry, "batch", initiatedBy, cancellationToken)
                    .ConfigureAwait(false);
                results.Add(result);

                if (result.Success)
                    succeeded++;
                else
                    failed++;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to replay entry {EntryId}", entryId);
                results.Add(new ReplayResult(
                    Success: false,
                    NewJobId: null,
                    ErrorMessage: ex.Message,
                    UpdatedEntry: null!));
                failed++;
            }
        }

        return new BatchReplayResult(
            Attempted: entryIds.Count,
            Succeeded: succeeded,
            Failed: failed,
            Results: results);
    }

    public async Task<BatchReplayResult> ReplayPendingAsync(
        string tenantId,
        string? errorCode,
        ErrorCategory? category,
        int maxCount,
        string initiatedBy,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        ArgumentException.ThrowIfNullOrWhiteSpace(initiatedBy);

        var effectiveLimit = Math.Min(maxCount, _options.MaxBatchSize);

        IReadOnlyList<DeadLetterEntry> entries;
        if (!string.IsNullOrEmpty(errorCode))
        {
            entries = await _deadLetterRepository.GetByErrorCodeAsync(
                    tenantId, errorCode, DeadLetterStatus.Pending, effectiveLimit, cancellationToken)
                .ConfigureAwait(false);
        }
        else if (category.HasValue)
        {
            entries = await _deadLetterRepository.GetByCategoryAsync(
                    tenantId, category.Value, DeadLetterStatus.Pending, effectiveLimit, cancellationToken)
                .ConfigureAwait(false);
        }
        else
        {
            entries = await _deadLetterRepository.GetPendingRetryableAsync(tenantId, effectiveLimit, cancellationToken)
                .ConfigureAwait(false);
        }

        var results = new List<ReplayResult>();
        var succeeded = 0;
        var failed = 0;

        foreach (var entry in entries)
        {
            if (!entry.CanReplay)
            {
                continue;
            }

            try
            {
                var result = await ReplayEntryAsync(entry, "auto", initiatedBy, cancellationToken)
                    .ConfigureAwait(false);
                results.Add(result);

                if (result.Success)
                    succeeded++;
                else
                    failed++;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to replay entry {EntryId}", entry.EntryId);
                results.Add(new ReplayResult(
                    Success: false,
                    NewJobId: null,
                    ErrorMessage: ex.Message,
                    UpdatedEntry: entry));
                failed++;
            }
        }

        return new BatchReplayResult(
            Attempted: results.Count,
            Succeeded: succeeded,
            Failed: failed,
            Results: results);
    }

    public async Task<DeadLetterEntry> ResolveAsync(
        string tenantId,
        Guid entryId,
        string notes,
        string resolvedBy,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        ArgumentException.ThrowIfNullOrWhiteSpace(resolvedBy);

        var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
            .ConfigureAwait(false);

        if (entry is null)
        {
            throw new InvalidOperationException($"Dead-letter entry {entryId} not found.");
        }

        var now = _timeProvider.GetUtcNow();
        var resolved = entry.Resolve(notes, resolvedBy, now);

        await _deadLetterRepository.UpdateAsync(resolved, cancellationToken).ConfigureAwait(false);

        _logger.LogInformation(
            "Resolved dead-letter entry {EntryId} for job {JobId}. Notes: {Notes}",
            entryId, entry.OriginalJobId, notes);

        return resolved;
    }

    public async Task<int> ResolveBatchAsync(
        string tenantId,
        IReadOnlyList<Guid> entryIds,
        string notes,
        string resolvedBy,
        CancellationToken cancellationToken)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        ArgumentNullException.ThrowIfNull(entryIds);
        ArgumentException.ThrowIfNullOrWhiteSpace(resolvedBy);

        var resolved = 0;
        var now = _timeProvider.GetUtcNow();

        foreach (var entryId in entryIds)
        {
            try
            {
                var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
                    .ConfigureAwait(false);

                if (entry is null || entry.IsTerminal)
                {
                    continue;
                }

                var resolvedEntry = entry.Resolve(notes, resolvedBy, now);
                await _deadLetterRepository.UpdateAsync(resolvedEntry, cancellationToken).ConfigureAwait(false);
                resolved++;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Failed to resolve entry {EntryId}", entryId);
            }
        }

        return resolved;
    }

    private async Task<ReplayResult> ReplayEntryAsync(
        DeadLetterEntry entry,
        string triggeredBy,
        string initiatedBy,
        CancellationToken cancellationToken)
    {
        if (!entry.CanReplay)
        {
            return new ReplayResult(
                Success: false,
                NewJobId: null,
                ErrorMessage: $"Entry cannot be replayed: status={entry.Status}, attempts={entry.ReplayAttempts}/{entry.MaxReplayAttempts}, retryable={entry.IsRetryable}",
                UpdatedEntry: entry);
        }

        var now = _timeProvider.GetUtcNow();

        // Mark entry as replaying
        var replaying = entry.StartReplay(initiatedBy, now);
        await _deadLetterRepository.UpdateAsync(replaying, cancellationToken).ConfigureAwait(false);

        // Create audit record
        var auditRecord = ReplayAuditRecord.Create(
            entry.TenantId,
            entry.EntryId,
            replaying.ReplayAttempts,
            triggeredBy,
            initiatedBy,
            now);
        await _auditRepository.CreateAsync(auditRecord, cancellationToken).ConfigureAwait(false);

        try
        {
            // Create new job with updated idempotency key
            var newIdempotencyKey = $"{entry.IdempotencyKey}:replay:{replaying.ReplayAttempts}";
            var newJob = await _jobCreator.CreateFromReplayAsync(
                entry.TenantId,
                entry.JobType,
                entry.Payload,
                entry.PayloadDigest,
                newIdempotencyKey,
                entry.CorrelationId,
                entry.OriginalJobId,
                initiatedBy,
                cancellationToken).ConfigureAwait(false);

            // Mark replay successful
            now = _timeProvider.GetUtcNow();
            var completed = replaying.CompleteReplay(newJob.JobId, initiatedBy, now);
            await _deadLetterRepository.UpdateAsync(completed, cancellationToken).ConfigureAwait(false);

            // Update audit record
            var completedAudit = auditRecord.Complete(newJob.JobId, now);
            await _auditRepository.UpdateAsync(completedAudit, cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Replayed dead-letter entry {EntryId} as new job {NewJobId}",
                entry.EntryId, newJob.JobId);

            // Notify on success
            await _notifier.NotifyReplaySuccessAsync(completed, newJob.JobId, cancellationToken)
                .ConfigureAwait(false);

            return new ReplayResult(
                Success: true,
                NewJobId: newJob.JobId,
                ErrorMessage: null,
                UpdatedEntry: completed);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to replay entry {EntryId}", entry.EntryId);

            // Mark replay failed
            now = _timeProvider.GetUtcNow();
            var failed = replaying.FailReplay(ex.Message, initiatedBy, now);
            await _deadLetterRepository.UpdateAsync(failed, cancellationToken).ConfigureAwait(false);

            // Update audit record
            var failedAudit = auditRecord.Fail(ex.Message, now);
            await _auditRepository.UpdateAsync(failedAudit, cancellationToken).ConfigureAwait(false);

            // Notify on exhausted
            if (failed.Status == DeadLetterStatus.Exhausted)
            {
                await _notifier.NotifyExhaustedAsync(failed, cancellationToken).ConfigureAwait(false);
            }

            return new ReplayResult(
                Success: false,
                NewJobId: null,
                ErrorMessage: ex.Message,
                UpdatedEntry: failed);
        }
    }
}
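
// Usage sketch (illustrative, not part of the commit): draining pending
// transient failures through IReplayManager. The "ops-runbook" initiator
// string and the batch limit of 100 are assumed values for illustration.
public static class ReplayManagerUsageExample
{
    public static async Task<int> ReplayTransientAsync(
        IReplayManager replayManager,
        string tenantId,
        CancellationToken cancellationToken)
    {
        BatchReplayResult result = await replayManager.ReplayPendingAsync(
            tenantId,
            errorCode: null,
            category: ErrorCategory.Transient,
            maxCount: 100,
            initiatedBy: "ops-runbook",
            cancellationToken).ConfigureAwait(false);

        return result.Succeeded;
    }
}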
@@ -0,0 +1,39 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents an artifact produced by a job execution.
/// Artifacts are immutable outputs with content digests for provenance.
/// </summary>
public sealed record Artifact(
    /// <summary>Unique artifact identifier.</summary>
    Guid ArtifactId,

    /// <summary>Tenant owning this artifact.</summary>
    string TenantId,

    /// <summary>Job that produced this artifact.</summary>
    Guid JobId,

    /// <summary>Run containing the producing job (if any).</summary>
    Guid? RunId,

    /// <summary>Artifact type (e.g., "sbom", "scan-result", "attestation", "log").</summary>
    string ArtifactType,

    /// <summary>Storage URI (e.g., "s3://bucket/path", "file:///local/path").</summary>
    string Uri,

    /// <summary>Content digest (SHA-256) for integrity verification.</summary>
    string Digest,

    /// <summary>MIME type (e.g., "application/json", "application/vnd.cyclonedx+json").</summary>
    string? MimeType,

    /// <summary>Artifact size in bytes.</summary>
    long? SizeBytes,

    /// <summary>When the artifact was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>Optional metadata JSON blob.</summary>
    string? Metadata);
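
// Usage sketch (illustrative, not part of the commit): constructing an
// Artifact with a SHA-256 content digest as the Digest field documents.
// The "sha256:" prefix and the CycloneDX MIME type are assumptions; the
// commit does not pin a digest or MIME format.
public static class ArtifactConstructionExample
{
    public static Artifact ForSbom(string tenantId, Guid jobId, string uri, byte[] content) =>
        new(
            ArtifactId: Guid.NewGuid(),
            TenantId: tenantId,
            JobId: jobId,
            RunId: null,
            ArtifactType: "sbom",
            Uri: uri,
            Digest: "sha256:" + Convert.ToHexString(
                System.Security.Cryptography.SHA256.HashData(content)).ToLowerInvariant(),
            MimeType: "application/vnd.cyclonedx+json",
            SizeBytes: content.Length,
            CreatedAt: DateTimeOffset.UtcNow,
            Metadata: null);
}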
@@ -0,0 +1,250 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents an immutable audit log entry for orchestrator operations.
/// Captures who did what, when, and with what effect.
/// </summary>
public sealed record AuditEntry(
    /// <summary>Unique audit entry identifier.</summary>
    Guid EntryId,

    /// <summary>Tenant owning this entry.</summary>
    string TenantId,

    /// <summary>Type of audited event.</summary>
    AuditEventType EventType,

    /// <summary>Resource type being audited (job, run, source, quota, etc.).</summary>
    string ResourceType,

    /// <summary>Resource identifier being audited.</summary>
    Guid ResourceId,

    /// <summary>Actor who performed the action.</summary>
    string ActorId,

    /// <summary>Actor type (user, system, worker, api-key).</summary>
    ActorType ActorType,

    /// <summary>IP address of the actor (if applicable).</summary>
    string? ActorIp,

    /// <summary>User agent string (if applicable).</summary>
    string? UserAgent,

    /// <summary>HTTP method used (if applicable).</summary>
    string? HttpMethod,

    /// <summary>Request path (if applicable).</summary>
    string? RequestPath,

    /// <summary>State before the change (JSON).</summary>
    string? OldState,

    /// <summary>State after the change (JSON).</summary>
    string? NewState,

    /// <summary>Human-readable description of the change.</summary>
    string Description,

    /// <summary>Correlation ID for distributed tracing.</summary>
    string? CorrelationId,

    /// <summary>SHA-256 hash of the previous entry for chain integrity.</summary>
    string? PreviousEntryHash,

    /// <summary>SHA-256 hash of this entry's content for integrity.</summary>
    string ContentHash,

    /// <summary>Sequence number within the tenant's audit stream.</summary>
    long SequenceNumber,

    /// <summary>When the event occurred.</summary>
    DateTimeOffset OccurredAt,

    /// <summary>Optional metadata JSON blob.</summary>
    string? Metadata)
{
    /// <summary>
    /// Creates a new audit entry with computed hash.
    /// </summary>
    public static AuditEntry Create(
        string tenantId,
        AuditEventType eventType,
        string resourceType,
        Guid resourceId,
        string actorId,
        ActorType actorType,
        string description,
        string? oldState = null,
        string? newState = null,
        string? actorIp = null,
        string? userAgent = null,
        string? httpMethod = null,
        string? requestPath = null,
        string? correlationId = null,
        string? previousEntryHash = null,
        long sequenceNumber = 0,
        string? metadata = null)
    {
        var entryId = Guid.NewGuid();
        var occurredAt = DateTimeOffset.UtcNow;

        // Compute content hash from entry data
        var contentToHash = $"{entryId}|{tenantId}|{eventType}|{resourceType}|{resourceId}|{actorId}|{actorType}|{description}|{oldState}|{newState}|{occurredAt:O}|{sequenceNumber}";
        var contentHash = ComputeSha256(contentToHash);

        return new AuditEntry(
            EntryId: entryId,
            TenantId: tenantId,
            EventType: eventType,
            ResourceType: resourceType,
            ResourceId: resourceId,
            ActorId: actorId,
            ActorType: actorType,
            ActorIp: actorIp,
            UserAgent: userAgent,
            HttpMethod: httpMethod,
            RequestPath: requestPath,
            OldState: oldState,
            NewState: newState,
            Description: description,
            CorrelationId: correlationId,
            PreviousEntryHash: previousEntryHash,
            ContentHash: contentHash,
            SequenceNumber: sequenceNumber,
            OccurredAt: occurredAt,
            Metadata: metadata);
    }

    /// <summary>
    /// Verifies the integrity of this entry's content hash.
    /// </summary>
    public bool VerifyIntegrity()
    {
        var contentToHash = $"{EntryId}|{TenantId}|{EventType}|{ResourceType}|{ResourceId}|{ActorId}|{ActorType}|{Description}|{OldState}|{NewState}|{OccurredAt:O}|{SequenceNumber}";
        var computed = ComputeSha256(contentToHash);
        return string.Equals(ContentHash, computed, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Verifies the chain link to the previous entry.
    /// </summary>
    public bool VerifyChainLink(AuditEntry? previousEntry)
    {
        if (previousEntry is null)
        {
            return PreviousEntryHash is null || SequenceNumber == 1;
        }

        return string.Equals(PreviousEntryHash, previousEntry.ContentHash, StringComparison.OrdinalIgnoreCase);
    }

    private static string ComputeSha256(string content)
    {
        var bytes = System.Text.Encoding.UTF8.GetBytes(content);
        var hash = System.Security.Cryptography.SHA256.HashData(bytes);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}

/// <summary>
/// Types of auditable events in the orchestrator.
/// </summary>
public enum AuditEventType
{
    // Job lifecycle events
    JobCreated = 100,
    JobScheduled = 101,
    JobLeased = 102,
    JobCompleted = 103,
    JobFailed = 104,
    JobCanceled = 105,
    JobRetried = 106,

    // Run lifecycle events
    RunCreated = 200,
    RunStarted = 201,
    RunCompleted = 202,
    RunFailed = 203,
    RunCanceled = 204,

    // Source management events
    SourceCreated = 300,
    SourceUpdated = 301,
    SourcePaused = 302,
    SourceResumed = 303,
    SourceDeleted = 304,

    // Quota management events
    QuotaCreated = 400,
    QuotaUpdated = 401,
    QuotaPaused = 402,
    QuotaResumed = 403,
    QuotaDeleted = 404,

    // SLO management events
    SloCreated = 500,
    SloUpdated = 501,
    SloEnabled = 502,
    SloDisabled = 503,
    SloDeleted = 504,
    SloAlertTriggered = 505,
    SloAlertAcknowledged = 506,
    SloAlertResolved = 507,

    // Dead-letter events
    DeadLetterCreated = 600,
    DeadLetterReplayed = 601,
    DeadLetterResolved = 602,
    DeadLetterExpired = 603,

    // Backfill events
    BackfillCreated = 700,
    BackfillStarted = 701,
    BackfillCompleted = 702,
    BackfillFailed = 703,
    BackfillCanceled = 704,

    // Ledger events
    LedgerExportRequested = 800,
    LedgerExportCompleted = 801,
    LedgerExportFailed = 802,

    // Worker events
    WorkerClaimed = 900,
    WorkerHeartbeat = 901,
    WorkerProgressReported = 902,
    WorkerCompleted = 903,

    // Security events
    AuthenticationSuccess = 1000,
    AuthenticationFailure = 1001,
    AuthorizationDenied = 1002,
    ApiKeyCreated = 1003,
    ApiKeyRevoked = 1004
}

/// <summary>
/// Types of actors that can perform auditable actions.
/// </summary>
public enum ActorType
{
    /// <summary>Human user via UI or API.</summary>
    User = 0,

    /// <summary>System-initiated action (scheduler, background job).</summary>
    System = 1,

    /// <summary>Worker process.</summary>
    Worker = 2,

    /// <summary>API key authentication.</summary>
    ApiKey = 3,

    /// <summary>Service-to-service call.</summary>
    Service = 4,

    /// <summary>Unknown or unidentified actor.</summary>
    Unknown = 99
}
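
// Usage sketch (illustrative, not part of the commit): verifying a
// tenant's audit stream with the two checks defined above. Each entry's
// content hash must recompute correctly, and each PreviousEntryHash must
// match the preceding entry's ContentHash.
public static class AuditChainVerificationExample
{
    public static bool VerifyStream(IReadOnlyList<AuditEntry> entries)
    {
        AuditEntry? previous = null;
        foreach (var entry in entries)
        {
            if (!entry.VerifyIntegrity() || !entry.VerifyChainLink(previous))
            {
                return false;
            }

            previous = entry;
        }

        return true;
    }
}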
@@ -0,0 +1,429 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a request to backfill/reprocess events within a time window.
/// </summary>
public sealed record BackfillRequest(
    /// <summary>Unique backfill request identifier.</summary>
    Guid BackfillId,

    /// <summary>Tenant this backfill applies to.</summary>
    string TenantId,

    /// <summary>Source to backfill (null if job-type scoped).</summary>
    Guid? SourceId,

    /// <summary>Job type to backfill (null if source-scoped).</summary>
    string? JobType,

    /// <summary>Normalized scope key.</summary>
    string ScopeKey,

    /// <summary>Current status of the backfill.</summary>
    BackfillStatus Status,

    /// <summary>Start of the time window to backfill (inclusive).</summary>
    DateTimeOffset WindowStart,

    /// <summary>End of the time window to backfill (exclusive).</summary>
    DateTimeOffset WindowEnd,

    /// <summary>Current processing position within the window.</summary>
    DateTimeOffset? CurrentPosition,

    /// <summary>Total events estimated in the window.</summary>
    long? TotalEvents,

    /// <summary>Events successfully processed.</summary>
    long ProcessedEvents,

    /// <summary>Events skipped due to duplicate suppression.</summary>
    long SkippedEvents,

    /// <summary>Events that failed processing.</summary>
    long FailedEvents,

    /// <summary>Number of events to process per batch.</summary>
    int BatchSize,

    /// <summary>Whether this is a dry-run (preview only, no changes).</summary>
    bool DryRun,

    /// <summary>Whether to force reprocessing (ignore duplicate suppression).</summary>
    bool ForceReprocess,

    /// <summary>Estimated duration for the backfill.</summary>
    TimeSpan? EstimatedDuration,

    /// <summary>Maximum allowed duration (safety limit).</summary>
    TimeSpan? MaxDuration,

    /// <summary>Results of safety validation checks.</summary>
    BackfillSafetyChecks? SafetyChecks,

    /// <summary>Reason for the backfill request.</summary>
    string Reason,

    /// <summary>Optional ticket reference for audit.</summary>
    string? Ticket,

    /// <summary>When the request was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When processing started.</summary>
    DateTimeOffset? StartedAt,

    /// <summary>When processing completed.</summary>
    DateTimeOffset? CompletedAt,

    /// <summary>Actor who created the request.</summary>
    string CreatedBy,

    /// <summary>Actor who last modified the request.</summary>
    string UpdatedBy,

    /// <summary>Error message if failed.</summary>
    string? ErrorMessage)
{
    /// <summary>
    /// Window duration.
    /// </summary>
    public TimeSpan WindowDuration => WindowEnd - WindowStart;

    /// <summary>
    /// Progress percentage (0-100).
    /// </summary>
    public double ProgressPercent => TotalEvents > 0
        ? Math.Round((double)(ProcessedEvents + SkippedEvents + FailedEvents) / TotalEvents.Value * 100, 2)
        : 0;

    /// <summary>
    /// Whether the backfill is in a terminal state.
    /// </summary>
    public bool IsTerminal => Status is BackfillStatus.Completed or BackfillStatus.Failed or BackfillStatus.Canceled;

    /// <summary>
    /// Creates a new backfill request.
    /// </summary>
    public static BackfillRequest Create(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        string reason,
        string createdBy,
        int batchSize = 100,
        bool dryRun = false,
        bool forceReprocess = false,
        string? ticket = null,
        TimeSpan? maxDuration = null)
    {
        if (windowEnd <= windowStart)
            throw new ArgumentException("Window end must be after window start.", nameof(windowEnd));

        if (batchSize <= 0 || batchSize > 10000)
            throw new ArgumentOutOfRangeException(nameof(batchSize), "Batch size must be between 1 and 10000.");

        var scopeKey = (sourceId, jobType) switch
        {
            (Guid s, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(s, j),
            (Guid s, _) => Watermark.CreateScopeKey(s),
            (_, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(j),
            _ => throw new ArgumentException("Either sourceId or jobType must be specified.")
        };

        var now = DateTimeOffset.UtcNow;
        return new BackfillRequest(
            BackfillId: Guid.NewGuid(),
            TenantId: tenantId,
            SourceId: sourceId,
            JobType: jobType,
            ScopeKey: scopeKey,
            Status: BackfillStatus.Pending,
            WindowStart: windowStart,
            WindowEnd: windowEnd,
            CurrentPosition: null,
            TotalEvents: null,
            ProcessedEvents: 0,
            SkippedEvents: 0,
            FailedEvents: 0,
            BatchSize: batchSize,
            DryRun: dryRun,
            ForceReprocess: forceReprocess,
            EstimatedDuration: null,
            MaxDuration: maxDuration,
            SafetyChecks: null,
            Reason: reason,
            Ticket: ticket,
            CreatedAt: now,
            StartedAt: null,
            CompletedAt: null,
            CreatedBy: createdBy,
            UpdatedBy: createdBy,
            ErrorMessage: null);
    }

    /// <summary>
    /// Transitions to validating status.
    /// </summary>
    public BackfillRequest StartValidation(string updatedBy)
    {
        if (Status != BackfillStatus.Pending)
            throw new InvalidOperationException($"Cannot start validation from status {Status}.");

        return this with
        {
            Status = BackfillStatus.Validating,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Records safety check results.
    /// </summary>
    public BackfillRequest WithSafetyChecks(BackfillSafetyChecks checks, long? totalEvents, TimeSpan? estimatedDuration, string updatedBy)
    {
        return this with
        {
            SafetyChecks = checks,
            TotalEvents = totalEvents,
            EstimatedDuration = estimatedDuration,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Transitions to running status.
    /// </summary>
    public BackfillRequest Start(string updatedBy)
    {
        if (Status != BackfillStatus.Validating)
            throw new InvalidOperationException($"Cannot start from status {Status}.");

        if (SafetyChecks?.HasBlockingIssues == true)
            throw new InvalidOperationException("Cannot start backfill with blocking safety issues.");

        return this with
        {
            Status = BackfillStatus.Running,
            StartedAt = DateTimeOffset.UtcNow,
            CurrentPosition = WindowStart,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Updates progress after processing a batch.
    /// </summary>
    public BackfillRequest UpdateProgress(
        DateTimeOffset newPosition,
        long processed,
        long skipped,
        long failed,
        string updatedBy)
    {
        if (Status != BackfillStatus.Running)
            throw new InvalidOperationException($"Cannot update progress in status {Status}.");

        return this with
        {
            CurrentPosition = newPosition,
            ProcessedEvents = ProcessedEvents + processed,
            SkippedEvents = SkippedEvents + skipped,
            FailedEvents = FailedEvents + failed,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Pauses the backfill.
    /// </summary>
    public BackfillRequest Pause(string updatedBy)
    {
        if (Status != BackfillStatus.Running)
            throw new InvalidOperationException($"Cannot pause from status {Status}.");

        return this with
        {
            Status = BackfillStatus.Paused,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Resumes a paused backfill.
    /// </summary>
    public BackfillRequest Resume(string updatedBy)
    {
        if (Status != BackfillStatus.Paused)
            throw new InvalidOperationException($"Cannot resume from status {Status}.");

        return this with
        {
            Status = BackfillStatus.Running,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Completes the backfill successfully.
    /// </summary>
    public BackfillRequest Complete(string updatedBy)
    {
        if (Status != BackfillStatus.Running)
            throw new InvalidOperationException($"Cannot complete from status {Status}.");

        return this with
        {
            Status = BackfillStatus.Completed,
            CompletedAt = DateTimeOffset.UtcNow,
            CurrentPosition = WindowEnd,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Fails the backfill with an error.
    /// </summary>
    public BackfillRequest Fail(string error, string updatedBy)
    {
        return this with
        {
            Status = BackfillStatus.Failed,
            CompletedAt = DateTimeOffset.UtcNow,
            ErrorMessage = error,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Cancels the backfill.
    /// </summary>
    public BackfillRequest Cancel(string updatedBy)
    {
        if (IsTerminal)
            throw new InvalidOperationException($"Cannot cancel from terminal status {Status}.");

        return this with
        {
            Status = BackfillStatus.Canceled,
            CompletedAt = DateTimeOffset.UtcNow,
            UpdatedBy = updatedBy
        };
    }
}
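// Illustrative usage sketch (not part of this commit): drives the BackfillRequest
// state machine above end to end via its own transition methods. Tenant, actor,
// and window values are placeholders.
public static class BackfillRequestLifecycleExample
{
    public static BackfillRequest RunHappyPath(Guid sourceId)
    {
        var request = BackfillRequest.Create(
            tenantId: "tenant-a",
            sourceId: sourceId,
            jobType: null,
            windowStart: DateTimeOffset.UtcNow.AddDays(-7),
            windowEnd: DateTimeOffset.UtcNow,
            reason: "Reprocess after analyzer fix",
            createdBy: "ops@example.test");

        // Pending -> Validating, record safety results, then Validating -> Running.
        request = request.StartValidation("ops@example.test");
        request = request.WithSafetyChecks(
            BackfillSafetyChecks.AllPassed(),
            totalEvents: 50_000,
            estimatedDuration: TimeSpan.FromMinutes(10),
            updatedBy: "ops@example.test");
        request = request.Start("ops@example.test");

        // A worker reports one batch, then the backfill finishes.
        request = request.UpdateProgress(
            newPosition: request.WindowStart.AddHours(1),
            processed: 480,
            skipped: 15,
            failed: 5,
            updatedBy: "worker-01");
        return request.Complete("worker-01");
    }
}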
/// <summary>
/// Status of a backfill request.
/// </summary>
public enum BackfillStatus
{
    /// <summary>Request created, awaiting validation.</summary>
    Pending,

    /// <summary>Running safety validations.</summary>
    Validating,

    /// <summary>Actively processing events.</summary>
    Running,

    /// <summary>Temporarily paused.</summary>
    Paused,

    /// <summary>Successfully completed.</summary>
    Completed,

    /// <summary>Failed with error.</summary>
    Failed,

    /// <summary>Canceled by operator.</summary>
    Canceled
}

/// <summary>
/// Results of backfill safety validation checks.
/// </summary>
public sealed record BackfillSafetyChecks(
    /// <summary>Whether the source exists and is accessible.</summary>
    bool SourceExists,

    /// <summary>Whether there are overlapping active backfills.</summary>
    bool HasOverlappingBackfill,

    /// <summary>Whether the window is within retention period.</summary>
    bool WithinRetention,

    /// <summary>Whether the estimated event count is within limits.</summary>
    bool WithinEventLimit,

    /// <summary>Whether estimated duration is within max duration.</summary>
    bool WithinDurationLimit,

    /// <summary>Whether required quotas are available.</summary>
    bool QuotaAvailable,

    /// <summary>Warning messages (non-blocking).</summary>
    IReadOnlyList<string> Warnings,

    /// <summary>Error messages (blocking).</summary>
    IReadOnlyList<string> Errors)
{
    /// <summary>
    /// Whether there are any blocking issues.
    /// </summary>
    public bool HasBlockingIssues => !SourceExists || HasOverlappingBackfill || !WithinRetention
        || !WithinEventLimit || !WithinDurationLimit || Errors.Count > 0;

    /// <summary>
    /// Whether the backfill is safe to proceed.
    /// </summary>
    public bool IsSafe => !HasBlockingIssues;

    /// <summary>
    /// Creates successful safety checks with no issues.
    /// </summary>
    public static BackfillSafetyChecks AllPassed() => new(
        SourceExists: true,
        HasOverlappingBackfill: false,
        WithinRetention: true,
        WithinEventLimit: true,
        WithinDurationLimit: true,
        QuotaAvailable: true,
        Warnings: [],
        Errors: []);
}

/// <summary>
/// Preview result for dry-run backfill.
/// </summary>
public sealed record BackfillPreview(
    /// <summary>Scope being backfilled.</summary>
    string ScopeKey,

    /// <summary>Start of the time window to backfill (inclusive).</summary>
    DateTimeOffset WindowStart,

    /// <summary>End of the time window to backfill (exclusive).</summary>
    DateTimeOffset WindowEnd,

    /// <summary>Estimated total events in window.</summary>
    long EstimatedEvents,

    /// <summary>Events that would be skipped (already processed).</summary>
    long SkippedEvents,

    /// <summary>Events that would be processed.</summary>
    long ProcessableEvents,

    /// <summary>Estimated duration.</summary>
    TimeSpan EstimatedDuration,

    /// <summary>Number of batches required.</summary>
    int EstimatedBatches,

    /// <summary>Safety validation results.</summary>
    BackfillSafetyChecks SafetyChecks,

    /// <summary>Sample of event keys that would be processed.</summary>
    IReadOnlyList<string> SampleEventKeys);
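// Illustrative sketch (not part of this commit): how a preview's batch count and
// duration estimate can be derived from the fields above. The throughput figure
// is an assumed input, not something this record defines.
public static class BackfillPreviewMathExample
{
    public static (int Batches, TimeSpan Duration) Estimate(
        long processableEvents, int batchSize, double assumedEventsPerSecond)
    {
        // Ceiling division: a partial final batch still counts as one batch.
        var batches = (int)((processableEvents + batchSize - 1) / batchSize);
        var duration = TimeSpan.FromSeconds(processableEvents / assumedEventsPerSecond);
        return (batches, duration);
    }
}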
@@ -0,0 +1,42 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a dependency edge in a job DAG (Directed Acyclic Graph).
/// The child job cannot start until the parent job reaches the outcome
/// required by the edge type.
/// </summary>
public sealed record DagEdge(
    /// <summary>Unique edge identifier.</summary>
    Guid EdgeId,

    /// <summary>Tenant owning this edge.</summary>
    string TenantId,

    /// <summary>Run containing these jobs.</summary>
    Guid RunId,

    /// <summary>Parent job ID (must complete first).</summary>
    Guid ParentJobId,

    /// <summary>Child job ID (depends on parent).</summary>
    Guid ChildJobId,

    /// <summary>Edge type (e.g., "success", "always", "failure").</summary>
    string EdgeType,

    /// <summary>When this edge was created.</summary>
    DateTimeOffset CreatedAt);

/// <summary>
/// Edge types defining dependency semantics.
/// </summary>
public static class DagEdgeTypes
{
    /// <summary>Child runs only if parent succeeds.</summary>
    public const string Success = "success";

    /// <summary>Child runs regardless of parent outcome.</summary>
    public const string Always = "always";

    /// <summary>Child runs only if parent fails.</summary>
    public const string Failure = "failure";
}
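// Illustrative sketch (not part of this commit): deciding whether a child job is
// unblocked, given its incoming edges and the observed statuses of its parents.
// The JobStatus values referenced here are the ones defined later in this commit.
public static class DagReadinessExample
{
    public static bool IsChildReady(
        IReadOnlyList<DagEdge> incomingEdges,
        IReadOnlyDictionary<Guid, JobStatus> parentStatuses)
    {
        foreach (var edge in incomingEdges)
        {
            if (!parentStatuses.TryGetValue(edge.ParentJobId, out var status))
                return false; // Parent status unknown; treat as blocked.

            var satisfied = edge.EdgeType switch
            {
                DagEdgeTypes.Success => status == JobStatus.Succeeded,
                DagEdgeTypes.Always => status is JobStatus.Succeeded or JobStatus.Failed
                    or JobStatus.Canceled or JobStatus.TimedOut,
                DagEdgeTypes.Failure => status == JobStatus.Failed,
                _ => false // Unknown edge type: be conservative.
            };

            if (!satisfied)
                return false;
        }

        return true;
    }
}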
@@ -0,0 +1,292 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a job that has been moved to the dead-letter store after exhausting retries
/// or encountering a non-retryable error.
/// </summary>
public sealed record DeadLetterEntry(
    /// <summary>Unique dead-letter entry identifier.</summary>
    Guid EntryId,

    /// <summary>Tenant owning this entry.</summary>
    string TenantId,

    /// <summary>Original job that failed.</summary>
    Guid OriginalJobId,

    /// <summary>Run the job belonged to (if any).</summary>
    Guid? RunId,

    /// <summary>Source the job was processing (if any).</summary>
    Guid? SourceId,

    /// <summary>Job type (e.g., "scan.image", "advisory.nvd").</summary>
    string JobType,

    /// <summary>Job payload JSON (inputs, parameters).</summary>
    string Payload,

    /// <summary>SHA-256 digest of the payload.</summary>
    string PayloadDigest,

    /// <summary>Idempotency key from original job.</summary>
    string IdempotencyKey,

    /// <summary>Correlation ID for distributed tracing.</summary>
    string? CorrelationId,

    /// <summary>Current entry status.</summary>
    DeadLetterStatus Status,

    /// <summary>Classified error code.</summary>
    string ErrorCode,

    /// <summary>Human-readable failure reason.</summary>
    string FailureReason,

    /// <summary>Suggested remediation hint for operators.</summary>
    string? RemediationHint,

    /// <summary>Error classification category.</summary>
    ErrorCategory Category,

    /// <summary>Whether this error is potentially retryable.</summary>
    bool IsRetryable,

    /// <summary>Number of attempts made by original job.</summary>
    int OriginalAttempts,

    /// <summary>Number of replay attempts from dead-letter.</summary>
    int ReplayAttempts,

    /// <summary>Maximum replay attempts allowed.</summary>
    int MaxReplayAttempts,

    /// <summary>When the job originally failed.</summary>
    DateTimeOffset FailedAt,

    /// <summary>When the entry was created in dead-letter store.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the entry was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>When the entry expires and can be purged.</summary>
    DateTimeOffset ExpiresAt,

    /// <summary>When the entry was resolved (if applicable).</summary>
    DateTimeOffset? ResolvedAt,

    /// <summary>Resolution notes (if resolved).</summary>
    string? ResolutionNotes,

    /// <summary>Actor who created/submitted the original job.</summary>
    string CreatedBy,

    /// <summary>Actor who last updated the entry.</summary>
    string UpdatedBy)
{
    /// <summary>Default retention period for dead-letter entries.</summary>
    public static readonly TimeSpan DefaultRetention = TimeSpan.FromDays(30);

    /// <summary>Default maximum replay attempts.</summary>
    public const int DefaultMaxReplayAttempts = 3;

    /// <summary>Whether this entry is in a terminal state.</summary>
    public bool IsTerminal => Status is DeadLetterStatus.Replayed
        or DeadLetterStatus.Resolved
        or DeadLetterStatus.Exhausted
        or DeadLetterStatus.Expired;

    /// <summary>Whether more replay attempts are allowed.</summary>
    public bool CanReplay => !IsTerminal && IsRetryable && ReplayAttempts < MaxReplayAttempts;

    /// <summary>Creates a new dead-letter entry from a failed job.</summary>
    public static DeadLetterEntry FromFailedJob(
        Job job,
        string errorCode,
        string failureReason,
        string? remediationHint,
        ErrorCategory category,
        bool isRetryable,
        DateTimeOffset now,
        TimeSpan? retention = null,
        int? maxReplayAttempts = null)
    {
        ArgumentNullException.ThrowIfNull(job);
        ArgumentException.ThrowIfNullOrWhiteSpace(errorCode);
        ArgumentException.ThrowIfNullOrWhiteSpace(failureReason);

        var effectiveRetention = retention ?? DefaultRetention;
        var effectiveMaxReplays = maxReplayAttempts ?? DefaultMaxReplayAttempts;

        return new DeadLetterEntry(
            EntryId: Guid.NewGuid(),
            TenantId: job.TenantId,
            OriginalJobId: job.JobId,
            RunId: job.RunId,
            SourceId: null, // Would be extracted from payload if available
            JobType: job.JobType,
            Payload: job.Payload,
            PayloadDigest: job.PayloadDigest,
            IdempotencyKey: job.IdempotencyKey,
            CorrelationId: job.CorrelationId,
            Status: DeadLetterStatus.Pending,
            ErrorCode: errorCode,
            FailureReason: failureReason,
            RemediationHint: remediationHint,
            Category: category,
            IsRetryable: isRetryable,
            OriginalAttempts: job.Attempt,
            ReplayAttempts: 0,
            MaxReplayAttempts: effectiveMaxReplays,
            FailedAt: job.CompletedAt ?? now,
            CreatedAt: now,
            UpdatedAt: now,
            ExpiresAt: now.Add(effectiveRetention),
            ResolvedAt: null,
            ResolutionNotes: null,
            CreatedBy: job.CreatedBy,
            UpdatedBy: "system");
    }

    /// <summary>Marks entry as being replayed.</summary>
    public DeadLetterEntry StartReplay(string updatedBy, DateTimeOffset now)
    {
        if (!CanReplay)
            throw new InvalidOperationException($"Cannot replay entry in status {Status} with {ReplayAttempts}/{MaxReplayAttempts} attempts.");

        return this with
        {
            Status = DeadLetterStatus.Replaying,
            ReplayAttempts = ReplayAttempts + 1,
            UpdatedAt = now,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>Marks entry as successfully replayed.</summary>
    public DeadLetterEntry CompleteReplay(Guid newJobId, string updatedBy, DateTimeOffset now)
    {
        if (Status != DeadLetterStatus.Replaying)
            throw new InvalidOperationException($"Cannot complete replay from status {Status}.");

        return this with
        {
            Status = DeadLetterStatus.Replayed,
            ResolvedAt = now,
            ResolutionNotes = $"Replayed as job {newJobId}",
            UpdatedAt = now,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>Marks replay as failed.</summary>
    public DeadLetterEntry FailReplay(string reason, string updatedBy, DateTimeOffset now)
    {
        if (Status != DeadLetterStatus.Replaying)
            throw new InvalidOperationException($"Cannot fail replay from status {Status}.");

        var newStatus = ReplayAttempts >= MaxReplayAttempts
            ? DeadLetterStatus.Exhausted
            : DeadLetterStatus.Pending;

        return this with
        {
            Status = newStatus,
            FailureReason = reason,
            UpdatedAt = now,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>Manually resolves the entry without replay.</summary>
    public DeadLetterEntry Resolve(string notes, string updatedBy, DateTimeOffset now)
    {
        if (IsTerminal)
            throw new InvalidOperationException($"Cannot resolve entry in terminal status {Status}.");

        return this with
        {
            Status = DeadLetterStatus.Resolved,
            ResolvedAt = now,
            ResolutionNotes = notes,
            UpdatedAt = now,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>Marks entry as expired for cleanup.</summary>
    public DeadLetterEntry MarkExpired(DateTimeOffset now)
    {
        if (IsTerminal)
            throw new InvalidOperationException($"Cannot expire entry in terminal status {Status}.");

        return this with
        {
            Status = DeadLetterStatus.Expired,
            UpdatedAt = now,
            UpdatedBy = "system"
        };
    }
}
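// Illustrative sketch (not part of this commit): the replay round-trip. A failed
// job is captured, a replay is attempted, and the outcome lands in Replayed or
// back in Pending/Exhausted. Error code, reason, and actor are placeholders.
public static class DeadLetterReplayExample
{
    public static DeadLetterEntry Capture(Job failedJob, DateTimeOffset now) =>
        DeadLetterEntry.FromFailedJob(
            failedJob,
            errorCode: "UPSTREAM_5XX",                       // placeholder classification
            failureReason: "Registry returned 503",
            remediationHint: "Retry once the registry is healthy.",
            category: ErrorCategory.UpstreamError,
            isRetryable: true,
            now: now);

    public static DeadLetterEntry Replay(
        DeadLetterEntry entry, bool succeeded, Guid newJobId, DateTimeOffset now)
    {
        var inFlight = entry.StartReplay("ops@example.test", now);
        return succeeded
            ? inFlight.CompleteReplay(newJobId, "ops@example.test", now)
            : inFlight.FailReplay("Replay failed again", "ops@example.test", now);
    }
}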
/// <summary>
/// Dead-letter entry lifecycle states.
/// </summary>
public enum DeadLetterStatus
{
    /// <summary>Entry awaiting operator action or replay.</summary>
    Pending = 0,

    /// <summary>Entry currently being replayed.</summary>
    Replaying = 1,

    /// <summary>Entry successfully replayed as a new job.</summary>
    Replayed = 2,

    /// <summary>Entry manually resolved without replay.</summary>
    Resolved = 3,

    /// <summary>Entry exhausted all replay attempts.</summary>
    Exhausted = 4,

    /// <summary>Entry expired and eligible for purge.</summary>
    Expired = 5
}

/// <summary>
/// Error classification categories for dead-letter entries.
/// </summary>
public enum ErrorCategory
{
    /// <summary>Unknown or unclassified error.</summary>
    Unknown = 0,

    /// <summary>Transient infrastructure error (network, timeout).</summary>
    Transient = 1,

    /// <summary>Resource not found (image, source, etc.).</summary>
    NotFound = 2,

    /// <summary>Authentication or authorization failure.</summary>
    AuthFailure = 3,

    /// <summary>Rate limiting or quota exceeded.</summary>
    RateLimited = 4,

    /// <summary>Invalid input or configuration.</summary>
    ValidationError = 5,

    /// <summary>Upstream service error (registry, advisory feed).</summary>
    UpstreamError = 6,

    /// <summary>Internal processing error (bug, corruption).</summary>
    InternalError = 7,

    /// <summary>Resource conflict (duplicate, version mismatch).</summary>
    Conflict = 8,

    /// <summary>Operation canceled by user or system.</summary>
    Canceled = 9
}
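// Illustrative sketch (not part of this commit): one plausible mapping from
// common exception types onto ErrorCategory. Real classification would inspect
// richer error metadata; this only shows the shape of such a mapping.
public static class ErrorClassifierExample
{
    public static ErrorCategory Classify(Exception ex) => ex switch
    {
        TimeoutException => ErrorCategory.Transient,
        System.Net.Http.HttpRequestException => ErrorCategory.UpstreamError,
        UnauthorizedAccessException => ErrorCategory.AuthFailure,
        ArgumentException => ErrorCategory.ValidationError,
        OperationCanceledException => ErrorCategory.Canceled,
        _ => ErrorCategory.Unknown
    };
}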
@@ -0,0 +1,69 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents an operational incident triggered by threshold breaches.
/// Incidents are generated when failure rates exceed configured limits.
/// </summary>
public sealed record Incident(
    /// <summary>Unique incident identifier.</summary>
    Guid IncidentId,

    /// <summary>Tenant affected by this incident.</summary>
    string TenantId,

    /// <summary>Incident type (e.g., "failure_rate", "quota_exhausted", "circuit_open").</summary>
    string IncidentType,

    /// <summary>Incident severity (e.g., "warning", "critical").</summary>
    string Severity,

    /// <summary>Affected job type (if applicable).</summary>
    string? JobType,

    /// <summary>Affected source (if applicable).</summary>
    Guid? SourceId,

    /// <summary>Human-readable incident title.</summary>
    string Title,

    /// <summary>Detailed incident description.</summary>
    string Description,

    /// <summary>Current incident status.</summary>
    IncidentStatus Status,

    /// <summary>When the incident was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the incident was acknowledged.</summary>
    DateTimeOffset? AcknowledgedAt,

    /// <summary>Actor who acknowledged the incident.</summary>
    string? AcknowledgedBy,

    /// <summary>When the incident was resolved.</summary>
    DateTimeOffset? ResolvedAt,

    /// <summary>Actor who resolved the incident.</summary>
    string? ResolvedBy,

    /// <summary>Resolution notes.</summary>
    string? ResolutionNotes,

    /// <summary>Optional metadata JSON blob.</summary>
    string? Metadata);

/// <summary>
/// Incident lifecycle states.
/// </summary>
public enum IncidentStatus
{
    /// <summary>Incident is open and unacknowledged.</summary>
    Open = 0,

    /// <summary>Incident acknowledged by operator.</summary>
    Acknowledged = 1,

    /// <summary>Incident resolved.</summary>
    Resolved = 2
}
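// Illustrative sketch (not part of this commit): Incident carries no transition
// helpers, so acknowledgment and resolution would be expressed with record
// `with` expressions. Actor names are placeholders.
public static class IncidentTransitionsExample
{
    public static Incident Acknowledge(Incident incident, string actor, DateTimeOffset now) =>
        incident with
        {
            Status = IncidentStatus.Acknowledged,
            AcknowledgedAt = now,
            AcknowledgedBy = actor
        };

    public static Incident Resolve(Incident incident, string actor, string notes, DateTimeOffset now) =>
        incident with
        {
            Status = IncidentStatus.Resolved,
            ResolvedAt = now,
            ResolvedBy = actor,
            ResolutionNotes = notes
        };
}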
@@ -0,0 +1,81 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a unit of work to be executed by a worker.
/// Jobs are scheduled, leased to workers, and tracked through completion.
/// </summary>
public sealed record Job(
    /// <summary>Unique job identifier.</summary>
    Guid JobId,

    /// <summary>Tenant owning this job.</summary>
    string TenantId,

    /// <summary>Optional project scope within tenant.</summary>
    string? ProjectId,

    /// <summary>Run this job belongs to (if any).</summary>
    Guid? RunId,

    /// <summary>Job type (e.g., "scan.image", "advisory.nvd", "export.sbom").</summary>
    string JobType,

    /// <summary>Current job status.</summary>
    JobStatus Status,

    /// <summary>Priority (higher = more urgent). Default 0.</summary>
    int Priority,

    /// <summary>Current attempt number (1-based).</summary>
    int Attempt,

    /// <summary>Maximum retry attempts.</summary>
    int MaxAttempts,

    /// <summary>SHA-256 digest of the payload for determinism verification.</summary>
    string PayloadDigest,

    /// <summary>Job payload JSON (inputs, parameters).</summary>
    string Payload,

    /// <summary>Idempotency key for deduplication.</summary>
    string IdempotencyKey,

    /// <summary>Correlation ID for distributed tracing.</summary>
    string? CorrelationId,

    /// <summary>Current lease ID (if leased).</summary>
    Guid? LeaseId,

    /// <summary>Worker holding the lease (if leased).</summary>
    string? WorkerId,

    /// <summary>Task runner ID executing the job (if applicable).</summary>
    string? TaskRunnerId,

    /// <summary>Lease expiration time.</summary>
    DateTimeOffset? LeaseUntil,

    /// <summary>When the job was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the job was scheduled (quota cleared).</summary>
    DateTimeOffset? ScheduledAt,

    /// <summary>When the job was leased to a worker.</summary>
    DateTimeOffset? LeasedAt,

    /// <summary>When the job completed (terminal state).</summary>
    DateTimeOffset? CompletedAt,

    /// <summary>Earliest time the job can be scheduled (for backoff).</summary>
    DateTimeOffset? NotBefore,

    /// <summary>Terminal status reason (failure message, cancel reason, etc.).</summary>
    string? Reason,

    /// <summary>ID of the original job if this is a replay.</summary>
    Guid? ReplayOf,

    /// <summary>Actor who created/submitted the job.</summary>
    string CreatedBy);
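// Illustrative sketch (not part of this commit): computing a digest in the same
// lowercase-hex SHA-256 form used throughout these records, e.g. when populating
// Job.PayloadDigest for a new submission. This mirrors the ComputeSha256 helpers
// defined elsewhere in this commit.
public static class PayloadDigestExample
{
    public static string Compute(string payloadJson)
    {
        var bytes = System.Text.Encoding.UTF8.GetBytes(payloadJson);
        var hash = System.Security.Cryptography.SHA256.HashData(bytes);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}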
@@ -0,0 +1,48 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents an immutable history entry for job state changes.
/// Provides audit trail for all job lifecycle transitions.
/// </summary>
public sealed record JobHistory(
    /// <summary>Unique history entry identifier.</summary>
    Guid HistoryId,

    /// <summary>Tenant owning this entry.</summary>
    string TenantId,

    /// <summary>Job this history entry belongs to.</summary>
    Guid JobId,

    /// <summary>Sequence number within the job's history (1-based).</summary>
    int SequenceNo,

    /// <summary>Previous job status.</summary>
    JobStatus? FromStatus,

    /// <summary>New job status.</summary>
    JobStatus ToStatus,

    /// <summary>Attempt number at time of transition.</summary>
    int Attempt,

    /// <summary>Lease ID (if applicable).</summary>
    Guid? LeaseId,

    /// <summary>Worker ID (if applicable).</summary>
    string? WorkerId,

    /// <summary>Reason for the transition.</summary>
    string? Reason,

    /// <summary>When this transition occurred.</summary>
    DateTimeOffset OccurredAt,

    /// <summary>When this entry was recorded.</summary>
    DateTimeOffset RecordedAt,

    /// <summary>Actor who caused this transition.</summary>
    string ActorId,

    /// <summary>Actor type (system, operator, worker).</summary>
    string ActorType);
@@ -0,0 +1,30 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Job lifecycle states. Transitions follow the state machine:
/// Pending → Scheduled → Leased → (Succeeded | Failed | Canceled | TimedOut)
/// Failed jobs may transition to Pending via replay.
/// </summary>
public enum JobStatus
{
    /// <summary>Job enqueued but not yet scheduled (e.g., quota exceeded).</summary>
    Pending = 0,

    /// <summary>Job scheduled and awaiting worker lease.</summary>
    Scheduled = 1,

    /// <summary>Job leased to a worker for execution.</summary>
    Leased = 2,

    /// <summary>Job completed successfully.</summary>
    Succeeded = 3,

    /// <summary>Job failed after exhausting retries.</summary>
    Failed = 4,

    /// <summary>Job canceled by operator or system.</summary>
    Canceled = 5,

    /// <summary>Job timed out (lease expired without completion).</summary>
    TimedOut = 6
}
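// Illustrative sketch (not part of this commit): the legal transitions described
// in the summary above, written as a lookup a guard clause could consult. Only
// the edges named in that summary are included, plus the Failed -> Pending
// replay edge; any transitions beyond those are an open question here.
public static class JobStatusTransitionsExample
{
    private static readonly IReadOnlyDictionary<JobStatus, JobStatus[]> Allowed =
        new Dictionary<JobStatus, JobStatus[]>
        {
            [JobStatus.Pending] = [JobStatus.Scheduled],
            [JobStatus.Scheduled] = [JobStatus.Leased],
            [JobStatus.Leased] = [JobStatus.Succeeded, JobStatus.Failed, JobStatus.Canceled, JobStatus.TimedOut],
            [JobStatus.Failed] = [JobStatus.Pending] // replay
        };

    public static bool CanTransition(JobStatus from, JobStatus to) =>
        Allowed.TryGetValue(from, out var targets) && Array.IndexOf(targets, to) >= 0;
}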
@@ -0,0 +1,60 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents rate-limit and concurrency quotas for job scheduling.
/// Quotas are scoped to tenant and optionally job type.
/// </summary>
public sealed record Quota(
    /// <summary>Unique quota identifier.</summary>
    Guid QuotaId,

    /// <summary>Tenant this quota applies to.</summary>
    string TenantId,

    /// <summary>Job type this quota applies to (null = all job types).</summary>
    string? JobType,

    /// <summary>Maximum concurrent active (leased) jobs.</summary>
    int MaxActive,

    /// <summary>Maximum jobs per hour (sliding window).</summary>
    int MaxPerHour,

    /// <summary>Burst capacity for token bucket.</summary>
    int BurstCapacity,

    /// <summary>Token refill rate (tokens per second).</summary>
    double RefillRate,

    /// <summary>Current available tokens.</summary>
    double CurrentTokens,

    /// <summary>Last time tokens were refilled.</summary>
    DateTimeOffset LastRefillAt,

    /// <summary>Current count of active (leased) jobs.</summary>
    int CurrentActive,

    /// <summary>Jobs scheduled in current hour window.</summary>
    int CurrentHourCount,

    /// <summary>Start of current hour window.</summary>
    DateTimeOffset CurrentHourStart,

    /// <summary>Whether this quota is currently paused (operator override).</summary>
    bool Paused,

    /// <summary>Operator-provided reason for pause.</summary>
    string? PauseReason,

    /// <summary>Ticket reference for quota change audit.</summary>
    string? QuotaTicket,

    /// <summary>When the quota was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the quota was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>Actor who last modified the quota.</summary>
    string UpdatedBy);
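// Illustrative sketch (not part of this commit): the token-bucket behavior implied
// by RefillRate / CurrentTokens / LastRefillAt. Tokens accrue linearly with elapsed
// time, capped at BurstCapacity; taking a token requires at least one whole token.
public static class QuotaTokenBucketExample
{
    public static Quota Refill(Quota quota, DateTimeOffset now)
    {
        var elapsedSeconds = (now - quota.LastRefillAt).TotalSeconds;
        if (elapsedSeconds <= 0)
            return quota;

        var replenished = Math.Min(
            quota.BurstCapacity,
            quota.CurrentTokens + elapsedSeconds * quota.RefillRate);

        return quota with { CurrentTokens = replenished, LastRefillAt = now };
    }

    public static (Quota Quota, bool Granted) TryTake(Quota quota, DateTimeOffset now)
    {
        var refilled = Refill(quota, now);
        return refilled.CurrentTokens >= 1.0
            ? (refilled with { CurrentTokens = refilled.CurrentTokens - 1.0 }, true)
            : (refilled, false);
    }
}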
@@ -0,0 +1,78 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a run (batch/workflow execution) containing multiple jobs.
/// Runs group related jobs (e.g., scanning an image produces multiple analyzer jobs).
/// </summary>
public sealed record Run(
    /// <summary>Unique run identifier.</summary>
    Guid RunId,

    /// <summary>Tenant owning this run.</summary>
    string TenantId,

    /// <summary>Optional project scope within tenant.</summary>
    string? ProjectId,

    /// <summary>Source that initiated this run.</summary>
    Guid SourceId,

    /// <summary>Run type (e.g., "scan", "advisory-sync", "export").</summary>
    string RunType,

    /// <summary>Current aggregate status of the run.</summary>
    RunStatus Status,

    /// <summary>Correlation ID for distributed tracing.</summary>
    string? CorrelationId,

    /// <summary>Total number of jobs in this run.</summary>
    int TotalJobs,

    /// <summary>Number of completed jobs (succeeded + failed + canceled).</summary>
    int CompletedJobs,

    /// <summary>Number of succeeded jobs.</summary>
    int SucceededJobs,

    /// <summary>Number of failed jobs.</summary>
    int FailedJobs,

    /// <summary>When the run was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the run started executing (first job leased).</summary>
    DateTimeOffset? StartedAt,

    /// <summary>When the run completed (all jobs terminal).</summary>
    DateTimeOffset? CompletedAt,

    /// <summary>Actor who initiated the run.</summary>
    string CreatedBy,

    /// <summary>Optional metadata JSON blob.</summary>
    string? Metadata);

/// <summary>
/// Run lifecycle states.
/// </summary>
public enum RunStatus
{
    /// <summary>Run created, jobs being enqueued.</summary>
    Pending = 0,

    /// <summary>Run is executing (at least one job leased).</summary>
    Running = 1,

    /// <summary>All jobs completed successfully.</summary>
    Succeeded = 2,

    /// <summary>Run completed with some failures.</summary>
    PartiallySucceeded = 3,

    /// <summary>All jobs failed.</summary>
    Failed = 4,

    /// <summary>Run canceled by operator.</summary>
    Canceled = 5
}
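// Illustrative sketch (not part of this commit): deriving a terminal RunStatus
// from the job counters once every job has finished. It assumes canceled runs
// are set explicitly by operators rather than derived from counters.
public static class RunStatusAggregationExample
{
    public static RunStatus? DeriveTerminalStatus(Run run)
    {
        if (run.CompletedJobs < run.TotalJobs)
            return null; // Still running; no terminal status yet.

        if (run.SucceededJobs == run.TotalJobs)
            return RunStatus.Succeeded;

        return run.SucceededJobs > 0
            ? RunStatus.PartiallySucceeded
            : RunStatus.Failed;
    }
}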
@@ -0,0 +1,341 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Immutable ledger entry for run execution records.
/// Provides a tamper-evident history of run outcomes with provenance to artifacts.
/// </summary>
public sealed record RunLedgerEntry(
    /// <summary>Unique ledger entry identifier.</summary>
    Guid LedgerId,

    /// <summary>Tenant owning this entry.</summary>
    string TenantId,

    /// <summary>Run this entry records.</summary>
    Guid RunId,

    /// <summary>Source that initiated the run.</summary>
    Guid SourceId,

    /// <summary>Run type (scan, advisory-sync, export).</summary>
    string RunType,

    /// <summary>Final run status.</summary>
    RunStatus FinalStatus,

    /// <summary>Total jobs in the run.</summary>
    int TotalJobs,

    /// <summary>Successfully completed jobs.</summary>
    int SucceededJobs,

    /// <summary>Failed jobs.</summary>
    int FailedJobs,

    /// <summary>When the run was created.</summary>
    DateTimeOffset RunCreatedAt,

    /// <summary>When the run started executing.</summary>
    DateTimeOffset? RunStartedAt,

    /// <summary>When the run completed.</summary>
    DateTimeOffset RunCompletedAt,

    /// <summary>Total execution duration.</summary>
    TimeSpan ExecutionDuration,

    /// <summary>Actor who initiated the run.</summary>
    string InitiatedBy,

    /// <summary>SHA-256 digest of the run's input payload.</summary>
    string InputDigest,

    /// <summary>Aggregated SHA-256 digest of all outputs.</summary>
    string OutputDigest,

    /// <summary>JSON array of artifact references with their digests.</summary>
    string ArtifactManifest,

    /// <summary>Sequence number in the tenant's ledger.</summary>
    long SequenceNumber,

    /// <summary>SHA-256 hash of the previous ledger entry.</summary>
    string? PreviousEntryHash,

    /// <summary>SHA-256 hash of this entry's content.</summary>
    string ContentHash,

    /// <summary>When this ledger entry was created.</summary>
    DateTimeOffset LedgerCreatedAt,

    /// <summary>Correlation ID for tracing.</summary>
    string? CorrelationId,

    /// <summary>Optional metadata JSON.</summary>
    string? Metadata)
{
    /// <summary>
    /// Creates a ledger entry from a completed run.
    /// </summary>
    public static RunLedgerEntry FromCompletedRun(
        Run run,
        IReadOnlyList<Artifact> artifacts,
        string inputDigest,
        long sequenceNumber,
        string? previousEntryHash,
        string? metadata = null)
    {
        if (run.CompletedAt is null)
        {
            throw new InvalidOperationException("Cannot create ledger entry from an incomplete run.");
        }

        var ledgerId = Guid.NewGuid();
        var ledgerCreatedAt = DateTimeOffset.UtcNow;

        // Build artifact manifest
        var artifactManifest = BuildArtifactManifest(artifacts);

        // Compute output digest from all artifact digests
        var outputDigest = ComputeOutputDigest(artifacts);

        // Compute execution duration
        var startTime = run.StartedAt ?? run.CreatedAt;
        var executionDuration = run.CompletedAt.Value - startTime;

        // Compute content hash for tamper evidence
        var contentToHash = $"{ledgerId}|{run.TenantId}|{run.RunId}|{run.SourceId}|{run.RunType}|{run.Status}|{run.TotalJobs}|{run.SucceededJobs}|{run.FailedJobs}|{run.CreatedAt:O}|{run.StartedAt:O}|{run.CompletedAt:O}|{inputDigest}|{outputDigest}|{sequenceNumber}|{previousEntryHash}|{ledgerCreatedAt:O}";
        var contentHash = ComputeSha256(contentToHash);

        return new RunLedgerEntry(
            LedgerId: ledgerId,
            TenantId: run.TenantId,
            RunId: run.RunId,
            SourceId: run.SourceId,
            RunType: run.RunType,
            FinalStatus: run.Status,
            TotalJobs: run.TotalJobs,
            SucceededJobs: run.SucceededJobs,
            FailedJobs: run.FailedJobs,
            RunCreatedAt: run.CreatedAt,
            RunStartedAt: run.StartedAt,
            RunCompletedAt: run.CompletedAt.Value,
            ExecutionDuration: executionDuration,
            InitiatedBy: run.CreatedBy,
            InputDigest: inputDigest,
            OutputDigest: outputDigest,
            ArtifactManifest: artifactManifest,
            SequenceNumber: sequenceNumber,
            PreviousEntryHash: previousEntryHash,
            ContentHash: contentHash,
            LedgerCreatedAt: ledgerCreatedAt,
            CorrelationId: run.CorrelationId,
            Metadata: metadata);
    }

    /// <summary>
    /// Verifies the integrity of this ledger entry.
    /// </summary>
    public bool VerifyIntegrity()
    {
        var contentToHash = $"{LedgerId}|{TenantId}|{RunId}|{SourceId}|{RunType}|{FinalStatus}|{TotalJobs}|{SucceededJobs}|{FailedJobs}|{RunCreatedAt:O}|{RunStartedAt:O}|{RunCompletedAt:O}|{InputDigest}|{OutputDigest}|{SequenceNumber}|{PreviousEntryHash}|{LedgerCreatedAt:O}";
        var computed = ComputeSha256(contentToHash);
        return string.Equals(ContentHash, computed, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Verifies the chain link to the previous entry.
    /// </summary>
    public bool VerifyChainLink(RunLedgerEntry? previousEntry)
    {
        if (previousEntry is null)
        {
            return PreviousEntryHash is null || SequenceNumber == 1;
        }

        return string.Equals(PreviousEntryHash, previousEntry.ContentHash, StringComparison.OrdinalIgnoreCase);
    }

    private static string BuildArtifactManifest(IReadOnlyList<Artifact> artifacts)
    {
        var entries = artifacts.Select(a => new
        {
            a.ArtifactId,
            a.ArtifactType,
            a.Uri,
            a.Digest,
            a.MimeType,
            a.SizeBytes,
            a.CreatedAt
        });

        return System.Text.Json.JsonSerializer.Serialize(entries);
    }

    private static string ComputeOutputDigest(IReadOnlyList<Artifact> artifacts)
    {
        if (artifacts.Count == 0)
        {
            return ComputeSha256("(no artifacts)");
        }

        // Sort by artifact ID for deterministic ordering
        var sortedDigests = artifacts
            .OrderBy(a => a.ArtifactId)
            .Select(a => a.Digest)
            .ToList();

        var combined = string.Join("|", sortedDigests);
        return ComputeSha256(combined);
    }

    private static string ComputeSha256(string content)
    {
        var bytes = System.Text.Encoding.UTF8.GetBytes(content);
        var hash = System.Security.Cryptography.SHA256.HashData(bytes);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}
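// Illustrative sketch (not part of this commit): walking a tenant's ledger in
// sequence order and checking both per-entry integrity and the hash link to the
// predecessor, which is what makes the ledger tamper-evident.
public static class LedgerChainVerificationExample
{
    public static bool VerifyChain(IReadOnlyList<RunLedgerEntry> orderedEntries)
    {
        RunLedgerEntry? previous = null;
        foreach (var entry in orderedEntries)
        {
            if (!entry.VerifyIntegrity())
                return false; // Content hash no longer matches the entry.

            if (!entry.VerifyChainLink(previous))
                return false; // Link to the predecessor is broken.

            previous = entry;
        }

        return true;
    }
}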
/// <summary>
/// Represents a ledger export operation.
/// </summary>
public sealed record LedgerExport(
    /// <summary>Unique export identifier.</summary>
    Guid ExportId,

    /// <summary>Tenant requesting the export.</summary>
    string TenantId,

    /// <summary>Export status.</summary>
    LedgerExportStatus Status,

    /// <summary>Export format (json, ndjson, csv).</summary>
    string Format,

    /// <summary>Start of the time range to export.</summary>
    DateTimeOffset? StartTime,

    /// <summary>End of the time range to export.</summary>
    DateTimeOffset? EndTime,

    /// <summary>Run types to include (null = all).</summary>
    string? RunTypeFilter,

    /// <summary>Source ID filter (null = all).</summary>
    Guid? SourceIdFilter,

    /// <summary>Number of entries exported.</summary>
    int EntryCount,

    /// <summary>URI where the export is stored.</summary>
    string? OutputUri,

    /// <summary>SHA-256 digest of the export file.</summary>
    string? OutputDigest,

    /// <summary>Size of the export in bytes.</summary>
    long? OutputSizeBytes,

    /// <summary>Actor who requested the export.</summary>
    string RequestedBy,

    /// <summary>When the export was requested.</summary>
    DateTimeOffset RequestedAt,

    /// <summary>When the export started processing.</summary>
    DateTimeOffset? StartedAt,

    /// <summary>When the export completed.</summary>
    DateTimeOffset? CompletedAt,

    /// <summary>Error message if export failed.</summary>
    string? ErrorMessage)
{
    /// <summary>
    /// Creates a new pending export request.
    /// </summary>
    public static LedgerExport CreateRequest(
        string tenantId,
        string format,
        string requestedBy,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        string? runTypeFilter = null,
        Guid? sourceIdFilter = null)
    {
        if (string.IsNullOrWhiteSpace(format))
        {
            throw new ArgumentException("Format is required.", nameof(format));
        }

        var validFormats = new[] { "json", "ndjson", "csv" };
        if (!validFormats.Contains(format.ToLowerInvariant()))
        {
            throw new ArgumentException($"Invalid format. Must be one of: {string.Join(", ", validFormats)}", nameof(format));
        }

        return new LedgerExport(
            ExportId: Guid.NewGuid(),
            TenantId: tenantId,
            Status: LedgerExportStatus.Pending,
            Format: format.ToLowerInvariant(),
            StartTime: startTime,
            EndTime: endTime,
            RunTypeFilter: runTypeFilter,
            SourceIdFilter: sourceIdFilter,
            EntryCount: 0,
            OutputUri: null,
            OutputDigest: null,
            OutputSizeBytes: null,
            RequestedBy: requestedBy,
            RequestedAt: DateTimeOffset.UtcNow,
            StartedAt: null,
            CompletedAt: null,
            ErrorMessage: null);
    }

    /// <summary>
    /// Marks the export as started.
    /// </summary>
    public LedgerExport Start() => this with
    {
        Status = LedgerExportStatus.Processing,
        StartedAt = DateTimeOffset.UtcNow
    };

    /// <summary>
    /// Marks the export as completed.
    /// </summary>
    public LedgerExport Complete(string outputUri, string outputDigest, long outputSizeBytes, int entryCount) => this with
    {
        Status = LedgerExportStatus.Completed,
        OutputUri = outputUri,
        OutputDigest = outputDigest,
        OutputSizeBytes = outputSizeBytes,
        EntryCount = entryCount,
        CompletedAt = DateTimeOffset.UtcNow
    };

    /// <summary>
    /// Marks the export as failed.
    /// </summary>
    public LedgerExport Fail(string errorMessage) => this with
    {
        Status = LedgerExportStatus.Failed,
        ErrorMessage = errorMessage,
        CompletedAt = DateTimeOffset.UtcNow
    };
}

/// <summary>
/// Status of a ledger export operation.
/// </summary>
public enum LedgerExportStatus
{
    Pending = 0,
    Processing = 1,
    Completed = 2,
    Failed = 3,
    Canceled = 4
}
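// Illustrative sketch (not part of this commit): the export lifecycle as a
// straight-line flow. Output location, digest, size, and entry count are
// placeholder values a real exporter would produce while writing the file.
public static class LedgerExportFlowExample
{
    public static LedgerExport RunExport(string tenantId, string requestedBy)
    {
        var export = LedgerExport.CreateRequest(tenantId, "ndjson", requestedBy);
        export = export.Start();

        // ... write entries, hash the output stream, measure its size ...
        return export.Complete(
            outputUri: "s3://exports/ledger-0001.ndjson", // placeholder location
            outputDigest: "0f3a...",                      // placeholder digest
            outputSizeBytes: 1_048_576,
            entryCount: 1234);
    }
}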
@@ -0,0 +1,60 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a scheduled job trigger (cron-based or interval-based).
/// Schedules automatically create jobs at specified times.
/// </summary>
public sealed record Schedule(
    /// <summary>Unique schedule identifier.</summary>
    Guid ScheduleId,

    /// <summary>Tenant owning this schedule.</summary>
    string TenantId,

    /// <summary>Optional project scope within tenant.</summary>
    string? ProjectId,

    /// <summary>Source that will be used for jobs.</summary>
    Guid SourceId,

    /// <summary>Human-readable schedule name.</summary>
    string Name,

    /// <summary>Job type to create.</summary>
    string JobType,

    /// <summary>Cron expression (6-field, with seconds), evaluated in <see cref="Timezone"/>.</summary>
    string CronExpression,

    /// <summary>Timezone for cron evaluation (IANA, e.g., "UTC", "America/New_York").</summary>
    string Timezone,

    /// <summary>Whether the schedule is enabled.</summary>
    bool Enabled,

    /// <summary>Job payload template JSON.</summary>
    string PayloadTemplate,

    /// <summary>Job priority for scheduled jobs.</summary>
    int Priority,

    /// <summary>Maximum retry attempts for scheduled jobs.</summary>
    int MaxAttempts,

    /// <summary>Last time a job was triggered from this schedule.</summary>
    DateTimeOffset? LastTriggeredAt,

    /// <summary>Next scheduled trigger time.</summary>
    DateTimeOffset? NextTriggerAt,

    /// <summary>When the schedule was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the schedule was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>Actor who created the schedule.</summary>
    string CreatedBy,

    /// <summary>Actor who last modified the schedule.</summary>
    string UpdatedBy);
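// Illustrative sketch (not part of this commit): computing Schedule.NextTriggerAt
// with the Cronos library -- an assumption, since this commit does not reference
// any cron package. Cronos supports the 6-field seconds format and timezone-aware
// evaluation, and FindSystemTimeZoneById accepts IANA IDs on modern .NET.
public static class ScheduleNextTriggerExample
{
    public static DateTimeOffset? ComputeNextTrigger(Schedule schedule, DateTimeOffset from)
    {
        var cron = Cronos.CronExpression.Parse(
            schedule.CronExpression, Cronos.CronFormat.IncludeSeconds);
        var zone = TimeZoneInfo.FindSystemTimeZoneById(schedule.Timezone);
        return cron.GetNextOccurrence(from, zone);
    }
}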
@@ -0,0 +1,423 @@
|
||||
using System.Text.Json;
|
||||
|
||||
namespace StellaOps.Orchestrator.Core.Domain;
|
||||
|
||||
/// <summary>
|
||||
/// Signed manifest providing provenance chain from ledger entries to artifacts.
|
||||
/// Enables verification of artifact authenticity and integrity.
|
||||
/// </summary>
|
||||
public sealed record SignedManifest(
|
||||
/// <summary>Unique manifest identifier.</summary>
|
||||
Guid ManifestId,
|
||||
|
||||
/// <summary>Manifest schema version.</summary>
|
||||
string SchemaVersion,
|
||||
|
||||
/// <summary>Tenant owning this manifest.</summary>
|
||||
string TenantId,
|
||||
|
||||
/// <summary>Type of provenance (run, export, attestation).</summary>
|
||||
ProvenanceType ProvenanceType,
|
||||
|
||||
/// <summary>Subject of the provenance (run ID, export ID, etc.).</summary>
|
||||
Guid SubjectId,
|
||||
|
||||
/// <summary>Provenance statements (JSON array).</summary>
|
||||
string Statements,
|
||||
|
||||
/// <summary>Artifact references with digests (JSON array).</summary>
|
||||
string Artifacts,
|
||||
|
||||
/// <summary>Materials (inputs) used to produce the artifacts (JSON array).</summary>
|
||||
string Materials,
|
||||
|
||||
/// <summary>Build environment information (JSON object).</summary>
|
||||
string? BuildInfo,
|
||||
|
||||
/// <summary>SHA-256 digest of the manifest payload (excluding signature).</summary>
|
||||
string PayloadDigest,
|
||||
|
||||
/// <summary>Signature algorithm used.</summary>
|
||||
string SignatureAlgorithm,
|
||||
|
||||
/// <summary>Base64-encoded signature.</summary>
|
||||
string Signature,
|
||||
|
||||
/// <summary>Key ID used for signing.</summary>
|
||||
string KeyId,
|
||||
|
||||
/// <summary>When the manifest was created.</summary>
|
||||
DateTimeOffset CreatedAt,
|
||||
|
||||
/// <summary>Expiration time of the manifest (if any).</summary>
|
||||
DateTimeOffset? ExpiresAt,
|
||||
|
||||
/// <summary>Additional metadata (JSON object).</summary>
|
||||
string? Metadata)
|
||||
{
|
||||
/// <summary>
|
||||
/// Current schema version for manifests.
|
||||
/// </summary>
|
||||
public const string CurrentSchemaVersion = "1.0.0";
|
||||
|
||||
/// <summary>
|
||||
/// Creates an unsigned manifest from a ledger entry.
|
||||
/// The manifest must be signed separately using SigningService.
|
||||
/// </summary>
|
||||
public static SignedManifest CreateFromLedgerEntry(
|
||||
RunLedgerEntry ledger,
|
||||
string? buildInfo = null,
|
||||
string? metadata = null)
|
||||
{
|
||||
var statements = CreateStatementsFromLedger(ledger);
|
||||
var artifacts = ledger.ArtifactManifest;
|
        var materials = CreateMaterialsFromLedger(ledger);

        var payloadDigest = ComputePayloadDigest(
            ledger.TenantId,
            ProvenanceType.Run,
            ledger.RunId,
            statements,
            artifacts,
            materials);

        return new SignedManifest(
            ManifestId: Guid.NewGuid(),
            SchemaVersion: CurrentSchemaVersion,
            TenantId: ledger.TenantId,
            ProvenanceType: ProvenanceType.Run,
            SubjectId: ledger.RunId,
            Statements: statements,
            Artifacts: artifacts,
            Materials: materials,
            BuildInfo: buildInfo,
            PayloadDigest: payloadDigest,
            SignatureAlgorithm: "none",
            Signature: string.Empty,
            KeyId: string.Empty,
            CreatedAt: DateTimeOffset.UtcNow,
            ExpiresAt: null,
            Metadata: metadata);
    }

    /// <summary>
    /// Creates an unsigned manifest from a ledger export.
    /// </summary>
    public static SignedManifest CreateFromExport(
        LedgerExport export,
        IReadOnlyList<RunLedgerEntry> entries,
        string? buildInfo = null,
        string? metadata = null)
    {
        if (export.Status != LedgerExportStatus.Completed)
        {
            throw new InvalidOperationException("Cannot create manifest from incomplete export.");
        }

        var statements = CreateStatementsFromExport(export, entries);
        var artifacts = CreateExportArtifacts(export);
        var materials = CreateExportMaterials(entries);

        var payloadDigest = ComputePayloadDigest(
            export.TenantId,
            ProvenanceType.Export,
            export.ExportId,
            statements,
            artifacts,
            materials);

        return new SignedManifest(
            ManifestId: Guid.NewGuid(),
            SchemaVersion: CurrentSchemaVersion,
            TenantId: export.TenantId,
            ProvenanceType: ProvenanceType.Export,
            SubjectId: export.ExportId,
            Statements: statements,
            Artifacts: artifacts,
            Materials: materials,
            BuildInfo: buildInfo,
            PayloadDigest: payloadDigest,
            SignatureAlgorithm: "none",
            Signature: string.Empty,
            KeyId: string.Empty,
            CreatedAt: DateTimeOffset.UtcNow,
            ExpiresAt: null,
            Metadata: metadata);
    }

    /// <summary>
    /// Signs the manifest with the provided signature.
    /// </summary>
    public SignedManifest Sign(string signatureAlgorithm, string signature, string keyId, DateTimeOffset? expiresAt = null)
    {
        if (string.IsNullOrWhiteSpace(signatureAlgorithm))
        {
            throw new ArgumentException("Signature algorithm is required.", nameof(signatureAlgorithm));
        }

        if (string.IsNullOrWhiteSpace(signature))
        {
            throw new ArgumentException("Signature is required.", nameof(signature));
        }

        if (string.IsNullOrWhiteSpace(keyId))
        {
            throw new ArgumentException("Key ID is required.", nameof(keyId));
        }

        return this with
        {
            SignatureAlgorithm = signatureAlgorithm,
            Signature = signature,
            KeyId = keyId,
            ExpiresAt = expiresAt
        };
    }

    /// <summary>
    /// Checks if the manifest is signed.
    /// </summary>
    public bool IsSigned => !string.IsNullOrEmpty(Signature) && SignatureAlgorithm != "none";

    /// <summary>
    /// Checks if the manifest has expired.
    /// </summary>
    public bool IsExpired => ExpiresAt.HasValue && ExpiresAt.Value < DateTimeOffset.UtcNow;

    /// <summary>
    /// Verifies the payload digest integrity.
    /// </summary>
    public bool VerifyPayloadIntegrity()
    {
        var computed = ComputePayloadDigest(TenantId, ProvenanceType, SubjectId, Statements, Artifacts, Materials);
        return string.Equals(PayloadDigest, computed, StringComparison.OrdinalIgnoreCase);
    }
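
    // Usage sketch (illustrative, not part of this commit): a sign-then-verify round trip.
    // `signer` is a hypothetical signing abstraction; Sign, VerifyPayloadIntegrity, and
    // IsExpired below are the real APIs.
    //
    //   var signed = manifest.Sign("ecdsa-p256", signer.Sign(manifest.PayloadDigest), signer.KeyId);
    //   if (!signed.VerifyPayloadIntegrity() || signed.IsExpired)
    //       throw new InvalidOperationException("Manifest failed integrity or expiry checks.");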

    /// <summary>
    /// Parses the artifact manifest into typed objects.
    /// </summary>
    public IReadOnlyList<ArtifactReference> GetArtifactReferences()
    {
        if (string.IsNullOrEmpty(Artifacts) || Artifacts == "[]")
        {
            return Array.Empty<ArtifactReference>();
        }

        return JsonSerializer.Deserialize<List<ArtifactReference>>(Artifacts) ?? [];
    }

    /// <summary>
    /// Parses the material manifest into typed objects.
    /// </summary>
    public IReadOnlyList<MaterialReference> GetMaterialReferences()
    {
        if (string.IsNullOrEmpty(Materials) || Materials == "[]")
        {
            return Array.Empty<MaterialReference>();
        }

        return JsonSerializer.Deserialize<List<MaterialReference>>(Materials) ?? [];
    }

    /// <summary>
    /// Parses the statements into typed objects.
    /// </summary>
    public IReadOnlyList<ProvenanceStatement> GetStatements()
    {
        if (string.IsNullOrEmpty(Statements) || Statements == "[]")
        {
            return Array.Empty<ProvenanceStatement>();
        }

        return JsonSerializer.Deserialize<List<ProvenanceStatement>>(Statements) ?? [];
    }
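
    // Round-trip sketch (illustrative): the Statements/Artifacts/Materials columns are JSON
    // produced by the Create* helpers below using default JsonSerializer options, so the
    // typed accessors above deserialize them symmetrically.
    //
    //   foreach (var statement in manifest.GetStatements())
    //       Console.WriteLine($"{statement.Subject} {statement.Predicate} {statement.Object}");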

    private static string CreateStatementsFromLedger(RunLedgerEntry ledger)
    {
        var statements = new List<ProvenanceStatement>
        {
            new(
                StatementType: "run_completed",
                Subject: $"run:{ledger.RunId}",
                Predicate: "produced",
                Object: $"outputs:{ledger.OutputDigest}",
                Timestamp: ledger.RunCompletedAt,
                Metadata: JsonSerializer.Serialize(new
                {
                    ledger.RunType,
                    ledger.FinalStatus,
                    ledger.TotalJobs,
                    ledger.SucceededJobs,
                    ledger.FailedJobs,
                    ledger.ExecutionDuration
                })),
            new(
                StatementType: "chain_link",
                Subject: $"ledger:{ledger.LedgerId}",
                Predicate: "follows",
                Object: ledger.PreviousEntryHash ?? "(genesis)",
                Timestamp: ledger.LedgerCreatedAt,
                Metadata: JsonSerializer.Serialize(new
                {
                    ledger.SequenceNumber,
                    ledger.ContentHash
                }))
        };

        return JsonSerializer.Serialize(statements);
    }

    private static string CreateMaterialsFromLedger(RunLedgerEntry ledger)
    {
        var materials = new List<MaterialReference>
        {
            new(
                Uri: $"input:{ledger.RunId}",
                Digest: ledger.InputDigest,
                MediaType: "application/json",
                Name: "run_input")
        };

        return JsonSerializer.Serialize(materials);
    }

    private static string CreateStatementsFromExport(LedgerExport export, IReadOnlyList<RunLedgerEntry> entries)
    {
        var statements = new List<ProvenanceStatement>
        {
            new(
                StatementType: "export_completed",
                Subject: $"export:{export.ExportId}",
                Predicate: "contains",
                Object: $"entries:{entries.Count}",
                Timestamp: export.CompletedAt ?? DateTimeOffset.UtcNow,
                Metadata: JsonSerializer.Serialize(new
                {
                    export.Format,
                    export.EntryCount,
                    export.StartTime,
                    export.EndTime,
                    export.RunTypeFilter,
                    export.SourceIdFilter
                }))
        };

        // Add chain integrity statement
        if (entries.Count > 0)
        {
            var first = entries.MinBy(e => e.SequenceNumber);
            var last = entries.MaxBy(e => e.SequenceNumber);
            if (first is not null && last is not null)
            {
                statements.Add(new ProvenanceStatement(
                    StatementType: "chain_range",
                    Subject: $"export:{export.ExportId}",
                    Predicate: "covers",
                    Object: $"sequence:{first.SequenceNumber}-{last.SequenceNumber}",
                    Timestamp: export.CompletedAt ?? DateTimeOffset.UtcNow,
                    Metadata: JsonSerializer.Serialize(new
                    {
                        FirstEntryHash = first.ContentHash,
                        LastEntryHash = last.ContentHash
                    })));
            }
        }

        return JsonSerializer.Serialize(statements);
    }

    private static string CreateExportArtifacts(LedgerExport export)
    {
        var artifacts = new List<ArtifactReference>
        {
            new(
                ArtifactId: export.ExportId,
                ArtifactType: "ledger_export",
                Uri: export.OutputUri ?? string.Empty,
                Digest: export.OutputDigest ?? string.Empty,
                MediaType: GetMediaType(export.Format),
                SizeBytes: export.OutputSizeBytes ?? 0)
        };

        return JsonSerializer.Serialize(artifacts);
    }

    private static string CreateExportMaterials(IReadOnlyList<RunLedgerEntry> entries)
    {
        var materials = entries.Select(e => new MaterialReference(
            Uri: $"ledger:{e.LedgerId}",
            Digest: e.ContentHash,
            MediaType: "application/json",
            Name: $"run_{e.RunId}")).ToList();

        return JsonSerializer.Serialize(materials);
    }

    private static string GetMediaType(string format) => format.ToLowerInvariant() switch
    {
        "json" => "application/json",
        "ndjson" => "application/x-ndjson",
        "csv" => "text/csv",
        _ => "application/octet-stream"
    };

    private static string ComputePayloadDigest(
        string tenantId,
        ProvenanceType provenanceType,
        Guid subjectId,
        string statements,
        string artifacts,
        string materials)
    {
        var payload = $"{tenantId}|{provenanceType}|{subjectId}|{statements}|{artifacts}|{materials}";
        var bytes = System.Text.Encoding.UTF8.GetBytes(payload);
        var hash = System.Security.Cryptography.SHA256.HashData(bytes);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}

/// <summary>
/// Types of provenance tracked by manifests.
/// </summary>
public enum ProvenanceType
{
    /// <summary>Provenance for a completed run.</summary>
    Run = 0,

    /// <summary>Provenance for a ledger export.</summary>
    Export = 1,

    /// <summary>Provenance for an attestation.</summary>
    Attestation = 2
}

/// <summary>
/// Reference to an artifact in a manifest.
/// </summary>
public sealed record ArtifactReference(
    Guid ArtifactId,
    string ArtifactType,
    string Uri,
    string Digest,
    string MediaType,
    long SizeBytes);

/// <summary>
/// Reference to a material (input) in a manifest.
/// </summary>
public sealed record MaterialReference(
    string Uri,
    string Digest,
    string MediaType,
    string Name);

/// <summary>
/// A provenance statement in a manifest.
/// </summary>
public sealed record ProvenanceStatement(
    string StatementType,
    string Subject,
    string Predicate,
    string Object,
    DateTimeOffset Timestamp,
    string? Metadata);

@@ -0,0 +1,567 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Service Level Objective type.
/// </summary>
public enum SloType
{
    /// <summary>Availability SLO (percentage of successful requests).</summary>
    Availability,

    /// <summary>Latency SLO (percentile-based response time).</summary>
    Latency,

    /// <summary>Throughput SLO (minimum jobs processed per period).</summary>
    Throughput
}

/// <summary>
/// Time window for SLO computation.
/// </summary>
public enum SloWindow
{
    /// <summary>Rolling 1 hour window.</summary>
    OneHour,

    /// <summary>Rolling 1 day window.</summary>
    OneDay,

    /// <summary>Rolling 7 day window.</summary>
    SevenDays,

    /// <summary>Rolling 30 day window.</summary>
    ThirtyDays
}

/// <summary>
/// Alert severity for SLO violations.
/// </summary>
public enum AlertSeverity
{
    /// <summary>Informational - SLO approaching threshold.</summary>
    Info,

    /// <summary>Warning - SLO at risk.</summary>
    Warning,

    /// <summary>Critical - SLO likely to be breached.</summary>
    Critical,

    /// <summary>Emergency - SLO breached.</summary>
    Emergency
}

/// <summary>
/// Service Level Objective definition.
/// </summary>
public sealed record Slo(
    /// <summary>Unique SLO identifier.</summary>
    Guid SloId,

    /// <summary>Tenant this SLO belongs to.</summary>
    string TenantId,

    /// <summary>Human-readable name.</summary>
    string Name,

    /// <summary>Optional description.</summary>
    string? Description,

    /// <summary>Type of SLO.</summary>
    SloType Type,

    /// <summary>Job type this SLO applies to (null = all job types).</summary>
    string? JobType,

    /// <summary>Source ID this SLO applies to (null = all sources).</summary>
    Guid? SourceId,

    /// <summary>Target objective (e.g., 0.999 for 99.9% availability).</summary>
    double Target,

    /// <summary>Time window for SLO evaluation.</summary>
    SloWindow Window,

    /// <summary>For latency SLOs: the percentile (e.g., 0.95 for P95).</summary>
    double? LatencyPercentile,

    /// <summary>For latency SLOs: the target latency in seconds.</summary>
    double? LatencyTargetSeconds,

    /// <summary>For throughput SLOs: minimum jobs per period.</summary>
    int? ThroughputMinimum,

    /// <summary>Whether this SLO is actively monitored.</summary>
    bool Enabled,

    /// <summary>When the SLO was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the SLO was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>Actor who created the SLO.</summary>
    string CreatedBy,

    /// <summary>Actor who last modified the SLO.</summary>
    string UpdatedBy)
{
    /// <summary>Calculates the error budget as a decimal (1 - target).</summary>
    public double ErrorBudget => 1.0 - Target;
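
    // Worked example (illustrative): a 99.9% availability SLO has an error budget of
    // 1 - 0.999 = 0.001. Over a 30-day window (43,200 minutes) that allows roughly
    // 43,200 * 0.001 = 43.2 minutes of failed service before the SLO is breached.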

    /// <summary>Creates a new availability SLO.</summary>
    public static Slo CreateAvailability(
        string tenantId,
        string name,
        double target,
        SloWindow window,
        string createdBy,
        string? description = null,
        string? jobType = null,
        Guid? sourceId = null)
    {
        ValidateTarget(target);

        var now = DateTimeOffset.UtcNow;
        return new Slo(
            SloId: Guid.NewGuid(),
            TenantId: tenantId,
            Name: name,
            Description: description,
            Type: SloType.Availability,
            JobType: jobType,
            SourceId: sourceId,
            Target: target,
            Window: window,
            LatencyPercentile: null,
            LatencyTargetSeconds: null,
            ThroughputMinimum: null,
            Enabled: true,
            CreatedAt: now,
            UpdatedAt: now,
            CreatedBy: createdBy,
            UpdatedBy: createdBy);
    }

    /// <summary>Creates a new latency SLO.</summary>
    public static Slo CreateLatency(
        string tenantId,
        string name,
        double percentile,
        double targetSeconds,
        double target,
        SloWindow window,
        string createdBy,
        string? description = null,
        string? jobType = null,
        Guid? sourceId = null)
    {
        ValidateTarget(target);
        if (percentile < 0 || percentile > 1)
            throw new ArgumentOutOfRangeException(nameof(percentile), "Percentile must be between 0 and 1");
        if (targetSeconds <= 0)
            throw new ArgumentOutOfRangeException(nameof(targetSeconds), "Target latency must be positive");

        var now = DateTimeOffset.UtcNow;
        return new Slo(
            SloId: Guid.NewGuid(),
            TenantId: tenantId,
            Name: name,
            Description: description,
            Type: SloType.Latency,
            JobType: jobType,
            SourceId: sourceId,
            Target: target,
            Window: window,
            LatencyPercentile: percentile,
            LatencyTargetSeconds: targetSeconds,
            ThroughputMinimum: null,
            Enabled: true,
            CreatedAt: now,
            UpdatedAt: now,
            CreatedBy: createdBy,
            UpdatedBy: createdBy);
    }

    /// <summary>Creates a new throughput SLO.</summary>
    public static Slo CreateThroughput(
        string tenantId,
        string name,
        int minimum,
        double target,
        SloWindow window,
        string createdBy,
        string? description = null,
        string? jobType = null,
        Guid? sourceId = null)
    {
        ValidateTarget(target);
        if (minimum <= 0)
            throw new ArgumentOutOfRangeException(nameof(minimum), "Throughput minimum must be positive");

        var now = DateTimeOffset.UtcNow;
        return new Slo(
            SloId: Guid.NewGuid(),
            TenantId: tenantId,
            Name: name,
            Description: description,
            Type: SloType.Throughput,
            JobType: jobType,
            SourceId: sourceId,
            Target: target,
            Window: window,
            LatencyPercentile: null,
            LatencyTargetSeconds: null,
            ThroughputMinimum: minimum,
            Enabled: true,
            CreatedAt: now,
            UpdatedAt: now,
            CreatedBy: createdBy,
            UpdatedBy: createdBy);
    }

    /// <summary>Updates the SLO with new values.</summary>
    public Slo Update(
        string? name = null,
        string? description = null,
        double? target = null,
        bool? enabled = null,
        string? updatedBy = null)
    {
        if (target.HasValue)
            ValidateTarget(target.Value);

        return this with
        {
            Name = name ?? Name,
            Description = description ?? Description,
            Target = target ?? Target,
            Enabled = enabled ?? Enabled,
            UpdatedAt = DateTimeOffset.UtcNow,
            UpdatedBy = updatedBy ?? UpdatedBy
        };
    }

    /// <summary>Disables the SLO.</summary>
    public Slo Disable(string updatedBy) =>
        this with
        {
            Enabled = false,
            UpdatedAt = DateTimeOffset.UtcNow,
            UpdatedBy = updatedBy
        };

    /// <summary>Enables the SLO.</summary>
    public Slo Enable(string updatedBy) =>
        this with
        {
            Enabled = true,
            UpdatedAt = DateTimeOffset.UtcNow,
            UpdatedBy = updatedBy
        };

    /// <summary>Gets the window duration as a TimeSpan.</summary>
    public TimeSpan GetWindowDuration() => Window switch
    {
        SloWindow.OneHour => TimeSpan.FromHours(1),
        SloWindow.OneDay => TimeSpan.FromDays(1),
        SloWindow.SevenDays => TimeSpan.FromDays(7),
        SloWindow.ThirtyDays => TimeSpan.FromDays(30),
        _ => throw new InvalidOperationException($"Unknown window: {Window}")
    };

    private static void ValidateTarget(double target)
    {
        if (target <= 0 || target > 1)
            throw new ArgumentOutOfRangeException(nameof(target), "Target must be between 0 (exclusive) and 1 (inclusive)");
    }
}
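
// Usage sketch (illustrative, not part of this commit): a P95 latency SLO that should be
// met 99% of the time over a rolling 7-day window. Argument values are hypothetical.
//
//   var slo = Slo.CreateLatency(
//       tenantId: "tenant-a", name: "scan-p95", percentile: 0.95, targetSeconds: 30,
//       target: 0.99, window: SloWindow.SevenDays, createdBy: "ops@example");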

/// <summary>
/// Current state of an SLO including burn rate and budget consumption.
/// </summary>
public sealed record SloState(
    /// <summary>The SLO this state belongs to.</summary>
    Guid SloId,

    /// <summary>Tenant this state belongs to.</summary>
    string TenantId,

    /// <summary>Current SLI value (actual performance).</summary>
    double CurrentSli,

    /// <summary>Total events/requests in the window.</summary>
    long TotalEvents,

    /// <summary>Good events (successful) in the window.</summary>
    long GoodEvents,

    /// <summary>Bad events (failed) in the window.</summary>
    long BadEvents,

    /// <summary>Error budget consumed (0-1 where 1 = fully consumed).</summary>
    double BudgetConsumed,

    /// <summary>Error budget remaining (0-1 where 1 = fully available).</summary>
    double BudgetRemaining,

    /// <summary>Current burn rate (1.0 = consuming budget at sustainable rate).</summary>
    double BurnRate,

    /// <summary>Projected time until budget exhaustion (null if not burning).</summary>
    TimeSpan? TimeToExhaustion,

    /// <summary>Whether the SLO is currently met.</summary>
    bool IsMet,

    /// <summary>Current alert severity based on budget consumption.</summary>
    AlertSeverity AlertSeverity,

    /// <summary>When this state was computed.</summary>
    DateTimeOffset ComputedAt,

    /// <summary>Start of the evaluation window.</summary>
    DateTimeOffset WindowStart,

    /// <summary>End of the evaluation window.</summary>
    DateTimeOffset WindowEnd)
{
    /// <summary>Creates a state indicating no data is available.</summary>
    public static SloState NoData(Guid sloId, string tenantId, DateTimeOffset now, SloWindow window)
    {
        var windowDuration = GetWindowDuration(window);
        return new SloState(
            SloId: sloId,
            TenantId: tenantId,
            CurrentSli: 1.0, // Assume good when no data
            TotalEvents: 0,
            GoodEvents: 0,
            BadEvents: 0,
            BudgetConsumed: 0,
            BudgetRemaining: 1.0,
            BurnRate: 0,
            TimeToExhaustion: null,
            IsMet: true,
            AlertSeverity: AlertSeverity.Info,
            ComputedAt: now,
            WindowStart: now - windowDuration,
            WindowEnd: now);
    }

    private static TimeSpan GetWindowDuration(SloWindow window) => window switch
    {
        SloWindow.OneHour => TimeSpan.FromHours(1),
        SloWindow.OneDay => TimeSpan.FromDays(1),
        SloWindow.SevenDays => TimeSpan.FromDays(7),
        SloWindow.ThirtyDays => TimeSpan.FromDays(30),
        _ => TimeSpan.FromDays(1)
    };
}
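
// Burn-rate math (illustrative; the actual computation lives outside this record, and this
// is one common SRE definition assumed here, not necessarily the one used elsewhere):
// BurnRate = (BadEvents / TotalEvents) / ErrorBudget, so 1.0 consumes the budget exactly
// over the window. A sketch, assuming those inputs:
//
//   var burnRate = (badEvents / (double)totalEvents) / (1.0 - slo.Target);
//   TimeSpan? timeToExhaustion = burnRate > 0
//       ? slo.GetWindowDuration() * ((1.0 - budgetConsumed) / burnRate)
//       : null;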

/// <summary>
/// Alert budget threshold configuration.
/// </summary>
public sealed record AlertBudgetThreshold(
    /// <summary>Unique threshold identifier.</summary>
    Guid ThresholdId,

    /// <summary>SLO this threshold applies to.</summary>
    Guid SloId,

    /// <summary>Tenant this threshold belongs to.</summary>
    string TenantId,

    /// <summary>Budget consumed percentage that triggers this alert (0-1).</summary>
    double BudgetConsumedThreshold,

    /// <summary>Burn rate threshold that triggers this alert.</summary>
    double? BurnRateThreshold,

    /// <summary>Severity of the alert.</summary>
    AlertSeverity Severity,

    /// <summary>Whether this threshold is enabled.</summary>
    bool Enabled,

    /// <summary>Notification channel for this alert.</summary>
    string? NotificationChannel,

    /// <summary>Notification endpoint for this alert.</summary>
    string? NotificationEndpoint,

    /// <summary>Cooldown period between alerts.</summary>
    TimeSpan Cooldown,

    /// <summary>When an alert was last triggered.</summary>
    DateTimeOffset? LastTriggeredAt,

    /// <summary>When the threshold was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the threshold was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>Actor who created the threshold.</summary>
    string CreatedBy,

    /// <summary>Actor who last modified the threshold.</summary>
    string UpdatedBy)
{
    /// <summary>Creates a new alert threshold.</summary>
    public static AlertBudgetThreshold Create(
        Guid sloId,
        string tenantId,
        double budgetConsumedThreshold,
        AlertSeverity severity,
        string createdBy,
        double? burnRateThreshold = null,
        string? notificationChannel = null,
        string? notificationEndpoint = null,
        TimeSpan? cooldown = null)
    {
        if (budgetConsumedThreshold < 0 || budgetConsumedThreshold > 1)
            throw new ArgumentOutOfRangeException(nameof(budgetConsumedThreshold), "Threshold must be between 0 and 1");

        var now = DateTimeOffset.UtcNow;
        return new AlertBudgetThreshold(
            ThresholdId: Guid.NewGuid(),
            SloId: sloId,
            TenantId: tenantId,
            BudgetConsumedThreshold: budgetConsumedThreshold,
            BurnRateThreshold: burnRateThreshold,
            Severity: severity,
            Enabled: true,
            NotificationChannel: notificationChannel,
            NotificationEndpoint: notificationEndpoint,
            Cooldown: cooldown ?? TimeSpan.FromHours(1),
            LastTriggeredAt: null,
            CreatedAt: now,
            UpdatedAt: now,
            CreatedBy: createdBy,
            UpdatedBy: createdBy);
    }

    /// <summary>Checks if this threshold should trigger based on current state.</summary>
    public bool ShouldTrigger(SloState state, DateTimeOffset now)
    {
        if (!Enabled) return false;

        // Check cooldown
        if (LastTriggeredAt.HasValue && (now - LastTriggeredAt.Value) < Cooldown)
            return false;

        // Check budget consumed threshold
        if (state.BudgetConsumed >= BudgetConsumedThreshold)
            return true;

        // Check burn rate threshold if set
        if (BurnRateThreshold.HasValue && state.BurnRate >= BurnRateThreshold.Value)
            return true;

        return false;
    }

    /// <summary>Records that this threshold was triggered.</summary>
    public AlertBudgetThreshold RecordTrigger(DateTimeOffset now) =>
        this with
        {
            LastTriggeredAt = now,
            UpdatedAt = now
        };
}
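
// Evaluation sketch (illustrative): how a monitor might wire thresholds to alerts.
// `thresholds`, `state`, `slo`, `alerts`, and `updated` are assumed inputs/collections;
// ShouldTrigger, SloAlert.Create, and RecordTrigger are the real APIs in this file.
//
//   foreach (var threshold in thresholds)
//   {
//       if (!threshold.ShouldTrigger(state, now)) continue;
//       alerts.Add(SloAlert.Create(slo, state, threshold));
//       updated.Add(threshold.RecordTrigger(now)); // persist the cooldown timestamp
//   }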

/// <summary>
/// SLO alert event.
/// </summary>
public sealed record SloAlert(
    /// <summary>Unique alert identifier.</summary>
    Guid AlertId,

    /// <summary>SLO this alert relates to.</summary>
    Guid SloId,

    /// <summary>Threshold that triggered this alert.</summary>
    Guid ThresholdId,

    /// <summary>Tenant this alert belongs to.</summary>
    string TenantId,

    /// <summary>Severity of the alert.</summary>
    AlertSeverity Severity,

    /// <summary>Alert message.</summary>
    string Message,

    /// <summary>Budget consumed at time of alert.</summary>
    double BudgetConsumed,

    /// <summary>Burn rate at time of alert.</summary>
    double BurnRate,

    /// <summary>Current SLI value at time of alert.</summary>
    double CurrentSli,

    /// <summary>When the alert was triggered.</summary>
    DateTimeOffset TriggeredAt,

    /// <summary>When the alert was acknowledged (null if not acknowledged).</summary>
    DateTimeOffset? AcknowledgedAt,

    /// <summary>Who acknowledged the alert.</summary>
    string? AcknowledgedBy,

    /// <summary>When the alert was resolved (null if not resolved).</summary>
    DateTimeOffset? ResolvedAt,

    /// <summary>How the alert was resolved.</summary>
    string? ResolutionNotes)
{
    /// <summary>Creates a new alert from an SLO state and threshold.</summary>
    public static SloAlert Create(
        Slo slo,
        SloState state,
        AlertBudgetThreshold threshold)
    {
        var message = threshold.BurnRateThreshold.HasValue && state.BurnRate >= threshold.BurnRateThreshold.Value
            ? $"SLO '{slo.Name}' burn rate {state.BurnRate:F2}x exceeds threshold {threshold.BurnRateThreshold.Value:F2}x"
            : $"SLO '{slo.Name}' error budget {state.BudgetConsumed:P1} consumed exceeds threshold {threshold.BudgetConsumedThreshold:P1}";

        return new SloAlert(
            AlertId: Guid.NewGuid(),
            SloId: slo.SloId,
            ThresholdId: threshold.ThresholdId,
            TenantId: slo.TenantId,
            Severity: threshold.Severity,
            Message: message,
            BudgetConsumed: state.BudgetConsumed,
            BurnRate: state.BurnRate,
            CurrentSli: state.CurrentSli,
            TriggeredAt: state.ComputedAt,
            AcknowledgedAt: null,
            AcknowledgedBy: null,
            ResolvedAt: null,
            ResolutionNotes: null);
    }

    /// <summary>Acknowledges the alert.</summary>
    public SloAlert Acknowledge(string acknowledgedBy, DateTimeOffset now) =>
        this with
        {
            AcknowledgedAt = now,
            AcknowledgedBy = acknowledgedBy
        };

    /// <summary>Resolves the alert.</summary>
    public SloAlert Resolve(string notes, DateTimeOffset now) =>
        this with
        {
            ResolvedAt = now,
            ResolutionNotes = notes
        };

    /// <summary>Whether this alert has been acknowledged.</summary>
    public bool IsAcknowledged => AcknowledgedAt.HasValue;

    /// <summary>Whether this alert has been resolved.</summary>
    public bool IsResolved => ResolvedAt.HasValue;
}

@@ -0,0 +1,42 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a job source (producer) that submits jobs to the orchestrator.
/// Examples: Concelier, Excititor, Scheduler, Export Center, Policy Engine.
/// </summary>
public sealed record Source(
    /// <summary>Unique source identifier.</summary>
    Guid SourceId,

    /// <summary>Tenant owning this source.</summary>
    string TenantId,

    /// <summary>Human-readable source name (e.g., "concelier-nvd").</summary>
    string Name,

    /// <summary>Source type/category (e.g., "advisory-ingest", "scanner", "export").</summary>
    string SourceType,

    /// <summary>Whether the source is currently enabled.</summary>
    bool Enabled,

    /// <summary>Whether the source is paused (throttled by operator).</summary>
    bool Paused,

    /// <summary>Operator-provided reason for pause (if paused).</summary>
    string? PauseReason,

    /// <summary>Ticket reference for pause audit trail.</summary>
    string? PauseTicket,

    /// <summary>Optional configuration JSON blob.</summary>
    string? Configuration,

    /// <summary>When the source was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the source was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>Actor who last modified the source.</summary>
    string UpdatedBy);

@@ -0,0 +1,60 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents a dynamic rate-limit override (throttle) for a source or job type.
/// Throttles are temporary pause/slow-down mechanisms, often in response to upstream pressure.
/// </summary>
public sealed record Throttle(
    /// <summary>Unique throttle identifier.</summary>
    Guid ThrottleId,

    /// <summary>Tenant this throttle applies to.</summary>
    string TenantId,

    /// <summary>Source to throttle (null if job-type scoped).</summary>
    Guid? SourceId,

    /// <summary>Job type to throttle (null if source-scoped).</summary>
    string? JobType,

    /// <summary>Whether this throttle is currently active.</summary>
    bool Active,

    /// <summary>Reason for the throttle (e.g., "429 from upstream", "Manual pause").</summary>
    string Reason,

    /// <summary>Optional ticket reference for audit.</summary>
    string? Ticket,

    /// <summary>When the throttle was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the throttle expires (null = indefinite).</summary>
    DateTimeOffset? ExpiresAt,

    /// <summary>Actor who created the throttle.</summary>
    string CreatedBy);

/// <summary>
/// Reason categories for throttle creation.
/// </summary>
public static class ThrottleReasons
{
    /// <summary>Upstream returned 429 Too Many Requests.</summary>
    public const string UpstreamRateLimited = "upstream_429";

    /// <summary>Upstream returned 503 Service Unavailable.</summary>
    public const string UpstreamUnavailable = "upstream_503";

    /// <summary>Upstream returned 5xx errors repeatedly.</summary>
    public const string UpstreamErrors = "upstream_5xx";

    /// <summary>Manual operator intervention.</summary>
    public const string ManualPause = "manual_pause";

    /// <summary>Circuit breaker triggered.</summary>
    public const string CircuitBreaker = "circuit_breaker";

    /// <summary>Quota exhausted.</summary>
    public const string QuotaExhausted = "quota_exhausted";
}

@@ -0,0 +1,162 @@
namespace StellaOps.Orchestrator.Core.Domain;

/// <summary>
/// Represents an event-time watermark for tracking processing progress.
/// Watermarks are scoped by source, job type, or custom key.
/// </summary>
public sealed record Watermark(
    /// <summary>Unique watermark identifier.</summary>
    Guid WatermarkId,

    /// <summary>Tenant this watermark belongs to.</summary>
    string TenantId,

    /// <summary>Source this watermark tracks (null if job-type scoped).</summary>
    Guid? SourceId,

    /// <summary>Job type this watermark tracks (null if source-scoped).</summary>
    string? JobType,

    /// <summary>Normalized scope key for uniqueness.</summary>
    string ScopeKey,

    /// <summary>Latest processed event time (high watermark).</summary>
    DateTimeOffset HighWatermark,

    /// <summary>Earliest event time in current window (low watermark for windowing).</summary>
    DateTimeOffset? LowWatermark,

    /// <summary>Monotonic sequence number for ordering.</summary>
    long SequenceNumber,

    /// <summary>Total events processed through this watermark.</summary>
    long ProcessedCount,

    /// <summary>SHA-256 hash of last processed batch for integrity verification.</summary>
    string? LastBatchHash,

    /// <summary>When the watermark was created.</summary>
    DateTimeOffset CreatedAt,

    /// <summary>When the watermark was last updated.</summary>
    DateTimeOffset UpdatedAt,

    /// <summary>Actor who last modified the watermark.</summary>
    string UpdatedBy)
{
    /// <summary>
    /// Creates a scope key for source-scoped watermarks.
    /// </summary>
    public static string CreateScopeKey(Guid sourceId) =>
        $"source:{sourceId:N}";

    /// <summary>
    /// Creates a scope key for job-type-scoped watermarks.
    /// </summary>
    public static string CreateScopeKey(string jobType) =>
        $"job_type:{jobType.ToLowerInvariant()}";

    /// <summary>
    /// Creates a scope key for source+job-type scoped watermarks.
    /// </summary>
    public static string CreateScopeKey(Guid sourceId, string jobType) =>
        $"source:{sourceId:N}:job_type:{jobType.ToLowerInvariant()}";

    /// <summary>
    /// Creates a new watermark with initial values.
    /// </summary>
    public static Watermark Create(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        DateTimeOffset highWatermark,
        string createdBy)
    {
        var scopeKey = (sourceId, jobType) switch
        {
            (Guid s, string j) when !string.IsNullOrEmpty(j) => CreateScopeKey(s, j),
            (Guid s, _) => CreateScopeKey(s),
            (_, string j) when !string.IsNullOrEmpty(j) => CreateScopeKey(j),
            _ => throw new ArgumentException("Either sourceId or jobType must be specified.")
        };

        var now = DateTimeOffset.UtcNow;
        return new Watermark(
            WatermarkId: Guid.NewGuid(),
            TenantId: tenantId,
            SourceId: sourceId,
            JobType: jobType,
            ScopeKey: scopeKey,
            HighWatermark: highWatermark,
            LowWatermark: null,
            SequenceNumber: 0,
            ProcessedCount: 0,
            LastBatchHash: null,
            CreatedAt: now,
            UpdatedAt: now,
            UpdatedBy: createdBy);
    }

    /// <summary>
    /// Advances the watermark after successful batch processing.
    /// </summary>
    public Watermark Advance(
        DateTimeOffset newHighWatermark,
        long eventsProcessed,
        string? batchHash,
        string updatedBy)
    {
        if (newHighWatermark < HighWatermark)
            throw new ArgumentException("New high watermark cannot be before current high watermark.", nameof(newHighWatermark));

        return this with
        {
            HighWatermark = newHighWatermark,
            SequenceNumber = SequenceNumber + 1,
            ProcessedCount = ProcessedCount + eventsProcessed,
            LastBatchHash = batchHash,
            UpdatedAt = DateTimeOffset.UtcNow,
            UpdatedBy = updatedBy
        };
    }

    /// <summary>
    /// Sets the event-time window bounds.
    /// </summary>
    public Watermark WithWindow(DateTimeOffset lowWatermark, DateTimeOffset highWatermark)
    {
        if (highWatermark < lowWatermark)
            throw new ArgumentException("High watermark cannot be before low watermark.");

        return this with
        {
            LowWatermark = lowWatermark,
            HighWatermark = highWatermark,
            UpdatedAt = DateTimeOffset.UtcNow
        };
    }
}
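
// Usage sketch (illustrative, not part of this commit): advancing a watermark after a
// successfully processed batch. `batch` and `ComputeBatchHash` are hypothetical; Create
// and Advance are the real APIs above.
//
//   var wm = Watermark.Create("tenant-a", sourceId, "advisory-ingest", batch.MinEventTime, "worker-1");
//   wm = wm.Advance(batch.MaxEventTime, batch.Count, ComputeBatchHash(batch), "worker-1");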

/// <summary>
/// Snapshot of watermark state for observability.
/// </summary>
public sealed record WatermarkSnapshot(
    string ScopeKey,
    DateTimeOffset HighWatermark,
    DateTimeOffset? LowWatermark,
    long SequenceNumber,
    long ProcessedCount,
    TimeSpan? Lag)
{
    /// <summary>
    /// Creates a snapshot from a watermark with calculated lag.
    /// </summary>
    public static WatermarkSnapshot FromWatermark(Watermark watermark, DateTimeOffset now) =>
        new(
            ScopeKey: watermark.ScopeKey,
            HighWatermark: watermark.HighWatermark,
            LowWatermark: watermark.LowWatermark,
            SequenceNumber: watermark.SequenceNumber,
            ProcessedCount: watermark.ProcessedCount,
            Lag: now - watermark.HighWatermark);
}
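
// Monitoring sketch (illustrative): lag is wall-clock time minus the high watermark, so a
// per-scope gauge can drive alerting. `lagGauge` and its Record signature are assumptions,
// not an API from this commit.
//
//   var snapshot = WatermarkSnapshot.FromWatermark(watermark, DateTimeOffset.UtcNow);
//   lagGauge.Record(snapshot.Lag?.TotalSeconds ?? 0, new("scope", snapshot.ScopeKey));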

@@ -0,0 +1,450 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.RateLimiting;

/// <summary>
/// Adaptive rate limiter that combines token bucket, concurrency limiting, and backpressure handling.
/// Provides per-tenant/job-type rate limiting with automatic adaptation to upstream pressure.
/// </summary>
public sealed class AdaptiveRateLimiter
{
    private readonly TokenBucket _tokenBucket;
    private readonly ConcurrencyLimiter _concurrencyLimiter;
    private readonly BackpressureHandler _backpressureHandler;
    private readonly HourlyCounter _hourlyCounter;
    private readonly object _lock = new();

    /// <summary>
    /// Tenant ID this limiter applies to.
    /// </summary>
    public string TenantId { get; }

    /// <summary>
    /// Job type this limiter applies to (null = all types).
    /// </summary>
    public string? JobType { get; }

    /// <summary>
    /// Maximum jobs per hour.
    /// </summary>
    public int MaxPerHour { get; }

    /// <summary>
    /// Whether the limiter is paused by operator.
    /// </summary>
    public bool IsPaused { get; private set; }

    /// <summary>
    /// Reason for pause (if paused).
    /// </summary>
    public string? PauseReason { get; private set; }

    /// <summary>
    /// Creates a new adaptive rate limiter from quota configuration.
    /// </summary>
    public AdaptiveRateLimiter(Quota quota, TimeProvider? timeProvider = null)
    {
        ArgumentNullException.ThrowIfNull(quota);
        // Note: timeProvider is accepted for deterministic-clock scenarios but is not yet
        // consumed; the components below take explicit DateTimeOffset arguments instead.

        TenantId = quota.TenantId;
        JobType = quota.JobType;
        MaxPerHour = quota.MaxPerHour;
        IsPaused = quota.Paused;
        PauseReason = quota.PauseReason;

        _tokenBucket = new TokenBucket(
            quota.BurstCapacity,
            quota.RefillRate,
            quota.CurrentTokens,
            quota.LastRefillAt);

        _concurrencyLimiter = new ConcurrencyLimiter(
            quota.MaxActive,
            quota.CurrentActive);

        _backpressureHandler = new BackpressureHandler(
            baseDelay: TimeSpan.FromSeconds(1),
            maxDelay: TimeSpan.FromMinutes(5),
            failureThreshold: 3,
            jitterFactor: 0.2);

        _hourlyCounter = new HourlyCounter(
            quota.MaxPerHour,
            quota.CurrentHourCount,
            quota.CurrentHourStart);
    }

    /// <summary>
    /// Creates a new adaptive rate limiter with explicit configuration.
    /// </summary>
    public AdaptiveRateLimiter(
        string tenantId,
        string? jobType,
        int maxActive,
        int maxPerHour,
        int burstCapacity,
        double refillRate)
    {
        TenantId = tenantId ?? throw new ArgumentNullException(nameof(tenantId));
        JobType = jobType;
        MaxPerHour = maxPerHour;

        _tokenBucket = new TokenBucket(burstCapacity, refillRate);
        _concurrencyLimiter = new ConcurrencyLimiter(maxActive);
        _backpressureHandler = new BackpressureHandler();
        _hourlyCounter = new HourlyCounter(maxPerHour);
    }

    /// <summary>
    /// Attempts to acquire permission to execute a job.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <returns>Result indicating whether acquisition was successful and why.</returns>
    public RateLimitResult TryAcquire(DateTimeOffset now)
    {
        lock (_lock)
        {
            // Check if paused
            if (IsPaused)
            {
                return RateLimitResult.Denied(RateLimitDenialReason.Paused, PauseReason);
            }

            // Check backpressure
            if (!_backpressureHandler.ShouldAllow(now))
            {
                var snapshot = _backpressureHandler.GetSnapshot(now);
                return RateLimitResult.Denied(
                    RateLimitDenialReason.Backpressure,
                    snapshot.LastFailureReason,
                    retryAfter: snapshot.TimeRemaining);
            }

            // Check hourly limit
            if (!_hourlyCounter.TryIncrement(now))
            {
                var hourlySnapshot = _hourlyCounter.GetSnapshot(now);
                return RateLimitResult.Denied(
                    RateLimitDenialReason.HourlyLimitExceeded,
                    $"Hourly limit of {MaxPerHour} exceeded",
                    retryAfter: hourlySnapshot.TimeUntilReset);
            }

            // Check concurrency
            if (!_concurrencyLimiter.TryAcquire())
            {
                // Rollback hourly counter
                _hourlyCounter.Decrement();
                var concurrencySnapshot = _concurrencyLimiter.GetSnapshot();
                return RateLimitResult.Denied(
                    RateLimitDenialReason.ConcurrencyLimitExceeded,
                    $"Concurrency limit of {concurrencySnapshot.MaxActive} exceeded");
            }

            // Check token bucket
            if (!_tokenBucket.TryConsume(now))
            {
                // Rollback concurrency and hourly counter
                _concurrencyLimiter.Release();
                _hourlyCounter.Decrement();
                var waitTime = _tokenBucket.EstimatedWaitTime(now);
                return RateLimitResult.Denied(
                    RateLimitDenialReason.TokensExhausted,
                    "Token bucket exhausted",
                    retryAfter: waitTime);
            }

            return RateLimitResult.Allowed();
        }
    }
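
    // Usage sketch (illustrative, not part of this commit): pair every successful TryAcquire
    // with Release in a try/finally so a failed job still frees its concurrency slot.
    // `RunJobAsync` and `Requeue` are hypothetical; TryAcquire and Release are real APIs.
    //
    //   var result = limiter.TryAcquire(DateTimeOffset.UtcNow);
    //   if (!result.IsAllowed) return Requeue(result.RetryAfter);
    //   try { await RunJobAsync(job); }
    //   finally { limiter.Release(); }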

    /// <summary>
    /// Releases a concurrency slot when a job completes.
    /// </summary>
    public void Release()
    {
        lock (_lock)
        {
            _concurrencyLimiter.Release();
        }
    }

    /// <summary>
    /// Records an upstream failure for backpressure calculation.
    /// </summary>
    /// <param name="statusCode">HTTP status code from upstream.</param>
    /// <param name="retryAfter">Optional Retry-After header value.</param>
    /// <param name="now">Current time.</param>
    /// <returns>Backpressure result.</returns>
    public BackpressureResult RecordUpstreamFailure(int statusCode, TimeSpan? retryAfter = null, DateTimeOffset? now = null)
    {
        lock (_lock)
        {
            return _backpressureHandler.RecordFailure(statusCode, retryAfter, now);
        }
    }
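
    // Backpressure sketch (illustrative): feed upstream HTTP results back into the limiter.
    // `response` is a hypothetical HttpResponseMessage from the job's upstream call.
    //
    //   if ((int)response.StatusCode == 429)
    //       limiter.RecordUpstreamFailure(429, response.Headers.RetryAfter?.Delta);
    //   else if (response.IsSuccessStatusCode)
    //       limiter.RecordUpstreamSuccess();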

    /// <summary>
    /// Records a successful upstream request.
    /// </summary>
    public void RecordUpstreamSuccess()
    {
        lock (_lock)
        {
            _backpressureHandler.RecordSuccess();
        }
    }

    /// <summary>
    /// Pauses the limiter.
    /// </summary>
    /// <param name="reason">Reason for pause.</param>
    public void Pause(string reason)
    {
        lock (_lock)
        {
            IsPaused = true;
            PauseReason = reason;
        }
    }

    /// <summary>
    /// Resumes the limiter.
    /// </summary>
    public void Resume()
    {
        lock (_lock)
        {
            IsPaused = false;
            PauseReason = null;
        }
    }

    /// <summary>
    /// Gets a snapshot of the current limiter state.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <returns>Snapshot of limiter state.</returns>
    public AdaptiveRateLimiterSnapshot GetSnapshot(DateTimeOffset now)
    {
        lock (_lock)
        {
            return new AdaptiveRateLimiterSnapshot(
                TenantId: TenantId,
                JobType: JobType,
                IsPaused: IsPaused,
                PauseReason: PauseReason,
                TokenBucket: _tokenBucket.GetSnapshot(now),
                Concurrency: _concurrencyLimiter.GetSnapshot(),
                Backpressure: _backpressureHandler.GetSnapshot(now),
                HourlyCounter: _hourlyCounter.GetSnapshot(now));
        }
    }

    /// <summary>
    /// Exports the current state to a quota record for persistence.
    /// </summary>
    /// <param name="quotaId">Original quota ID.</param>
    /// <param name="now">Current time.</param>
    /// <param name="updatedBy">Actor performing the update.</param>
    /// <returns>Quota record with current state.</returns>
    public Quota ExportToQuota(Guid quotaId, DateTimeOffset now, string updatedBy)
    {
        lock (_lock)
        {
            var tokenSnapshot = _tokenBucket.GetSnapshot(now);
            var concurrencySnapshot = _concurrencyLimiter.GetSnapshot();
            var hourlySnapshot = _hourlyCounter.GetSnapshot(now);

            return new Quota(
                QuotaId: quotaId,
                TenantId: TenantId,
                JobType: JobType,
                MaxActive: concurrencySnapshot.MaxActive,
                MaxPerHour: MaxPerHour,
                BurstCapacity: tokenSnapshot.BurstCapacity,
                RefillRate: tokenSnapshot.RefillRate,
                CurrentTokens: tokenSnapshot.CurrentTokens,
                LastRefillAt: tokenSnapshot.LastRefillAt,
                CurrentActive: concurrencySnapshot.CurrentActive,
                CurrentHourCount: hourlySnapshot.CurrentCount,
                CurrentHourStart: hourlySnapshot.HourStart,
                Paused: IsPaused,
                PauseReason: PauseReason,
                QuotaTicket: null,
                // Known limitation: the limiter does not retain the quota's original CreatedAt;
                // callers should restore it from the source record when persisting.
                CreatedAt: now,
                UpdatedAt: now,
                UpdatedBy: updatedBy);
        }
    }
}

/// <summary>
/// Result of a rate limit acquisition attempt.
/// </summary>
public sealed record RateLimitResult(
    bool IsAllowed,
    RateLimitDenialReason? DenialReason,
    string? DenialMessage,
    TimeSpan? RetryAfter)
{
    /// <summary>
    /// Creates an allowed result.
    /// </summary>
    public static RateLimitResult Allowed() => new(true, null, null, null);

    /// <summary>
    /// Creates a denied result.
    /// </summary>
    public static RateLimitResult Denied(
        RateLimitDenialReason reason,
        string? message = null,
        TimeSpan? retryAfter = null) =>
        new(false, reason, message, retryAfter);
}

/// <summary>
/// Reasons for rate limit denial.
/// </summary>
public enum RateLimitDenialReason
{
    /// <summary>Limiter is paused by operator.</summary>
    Paused,

    /// <summary>In backpressure backoff period.</summary>
    Backpressure,

    /// <summary>Hourly request limit exceeded.</summary>
    HourlyLimitExceeded,

    /// <summary>Concurrency limit exceeded.</summary>
    ConcurrencyLimitExceeded,

    /// <summary>Token bucket exhausted.</summary>
    TokensExhausted
}

/// <summary>
/// Snapshot of adaptive rate limiter state.
/// </summary>
public sealed record AdaptiveRateLimiterSnapshot(
    string TenantId,
    string? JobType,
    bool IsPaused,
    string? PauseReason,
    TokenBucketSnapshot TokenBucket,
    ConcurrencySnapshot Concurrency,
    BackpressureSnapshot Backpressure,
    HourlyCounterSnapshot HourlyCounter);

/// <summary>
/// Tracks requests per hour with automatic reset.
/// </summary>
public sealed class HourlyCounter
{
    private readonly object _lock = new();
    private int _currentCount;
    private DateTimeOffset _hourStart;

    /// <summary>
    /// Maximum allowed requests per hour.
    /// </summary>
    public int MaxPerHour { get; }

    /// <summary>
    /// Creates a new hourly counter.
    /// </summary>
    public HourlyCounter(int maxPerHour, int currentCount = 0, DateTimeOffset? hourStart = null)
    {
        if (maxPerHour <= 0)
            throw new ArgumentOutOfRangeException(nameof(maxPerHour), "Max per hour must be positive.");

        MaxPerHour = maxPerHour;
        _currentCount = currentCount;
        _hourStart = hourStart ?? TruncateToHour(DateTimeOffset.UtcNow);
    }

    /// <summary>
    /// Attempts to increment the counter.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <returns>True if increment was allowed, false if limit reached.</returns>
    public bool TryIncrement(DateTimeOffset now)
    {
        lock (_lock)
        {
            MaybeResetHour(now);

            if (_currentCount < MaxPerHour)
            {
                _currentCount++;
                return true;
            }
            return false;
        }
    }

    /// <summary>
    /// Decrements the counter (for rollback).
    /// </summary>
    public void Decrement()
    {
        lock (_lock)
        {
            if (_currentCount > 0)
                _currentCount--;
        }
    }

    /// <summary>
    /// Gets a snapshot of the counter state.
    /// </summary>
    public HourlyCounterSnapshot GetSnapshot(DateTimeOffset now)
    {
        lock (_lock)
        {
            MaybeResetHour(now);
            var nextHour = _hourStart.AddHours(1);
            var timeUntilReset = nextHour - now;

            return new HourlyCounterSnapshot(
                MaxPerHour: MaxPerHour,
                CurrentCount: _currentCount,
                HourStart: _hourStart,
                TimeUntilReset: timeUntilReset > TimeSpan.Zero ? timeUntilReset : TimeSpan.Zero);
        }
    }

    private void MaybeResetHour(DateTimeOffset now)
    {
        var currentHour = TruncateToHour(now);
        if (currentHour > _hourStart)
        {
            _hourStart = currentHour;
            _currentCount = 0;
        }
    }

    private static DateTimeOffset TruncateToHour(DateTimeOffset dt) =>
        new(dt.Year, dt.Month, dt.Day, dt.Hour, 0, 0, dt.Offset);
}

/// <summary>
/// Snapshot of hourly counter state.
/// </summary>
public sealed record HourlyCounterSnapshot(
    int MaxPerHour,
    int CurrentCount,
    DateTimeOffset HourStart,
    TimeSpan TimeUntilReset)
{
    /// <summary>
    /// Remaining requests in current hour.
    /// </summary>
    public int Remaining => Math.Max(0, MaxPerHour - CurrentCount);

    /// <summary>
    /// Whether the hourly limit has been reached.
    /// </summary>
    public bool IsExhausted => CurrentCount >= MaxPerHour;
}

@@ -0,0 +1,273 @@
namespace StellaOps.Orchestrator.Core.RateLimiting;

/// <summary>
/// Handles backpressure from upstream services (429, 503, etc.).
/// Implements exponential backoff with jitter for retry timing.
/// </summary>
public sealed class BackpressureHandler
{
    private readonly object _lock = new();
    private int _consecutiveFailures;
    private DateTimeOffset? _backoffUntil;
    private DateTimeOffset _lastFailureAt;
    private string? _lastFailureReason;

    /// <summary>
    /// Base delay for backoff calculation.
    /// </summary>
    public TimeSpan BaseDelay { get; }

    /// <summary>
    /// Maximum delay cap.
    /// </summary>
    public TimeSpan MaxDelay { get; }

    /// <summary>
    /// Number of failures before triggering full backoff.
    /// </summary>
    public int FailureThreshold { get; }

    /// <summary>
    /// Maximum random jitter to add (0.0 to 1.0 fraction of delay).
    /// </summary>
    public double JitterFactor { get; }

    /// <summary>
    /// Whether currently in backoff state.
    /// </summary>
    public bool IsInBackoff
    {
        get
        {
            lock (_lock)
            {
                return _backoffUntil.HasValue && DateTimeOffset.UtcNow < _backoffUntil.Value;
            }
        }
    }

    /// <summary>
    /// Number of consecutive failures.
    /// </summary>
    public int ConsecutiveFailures
    {
        get
        {
            lock (_lock)
            {
                return _consecutiveFailures;
            }
        }
    }

    /// <summary>
    /// Time until backoff expires (or TimeSpan.Zero if not in backoff).
    /// </summary>
    public TimeSpan TimeUntilReady
    {
        get
        {
            lock (_lock)
            {
                if (!_backoffUntil.HasValue)
                    return TimeSpan.Zero;

                var remaining = _backoffUntil.Value - DateTimeOffset.UtcNow;
                return remaining > TimeSpan.Zero ? remaining : TimeSpan.Zero;
            }
        }
    }

    /// <summary>
    /// Creates a new backpressure handler.
    /// </summary>
    /// <param name="baseDelay">Base delay for exponential backoff.</param>
    /// <param name="maxDelay">Maximum delay cap.</param>
    /// <param name="failureThreshold">Failures before entering backoff.</param>
    /// <param name="jitterFactor">Random jitter factor (0.0 to 1.0).</param>
    public BackpressureHandler(
        TimeSpan? baseDelay = null,
        TimeSpan? maxDelay = null,
        int failureThreshold = 1,
        double jitterFactor = 0.2)
    {
        BaseDelay = baseDelay ?? TimeSpan.FromSeconds(1);
        MaxDelay = maxDelay ?? TimeSpan.FromMinutes(5);
        FailureThreshold = failureThreshold > 0 ? failureThreshold : 1;
        JitterFactor = Math.Clamp(jitterFactor, 0.0, 1.0);

        if (BaseDelay <= TimeSpan.Zero)
            throw new ArgumentOutOfRangeException(nameof(baseDelay), "Base delay must be positive.");
        if (MaxDelay < BaseDelay)
            throw new ArgumentOutOfRangeException(nameof(maxDelay), "Max delay must be >= base delay.");
    }

    /// <summary>
    /// Records an upstream failure and potentially triggers backoff.
    /// </summary>
    /// <param name="statusCode">HTTP status code from upstream.</param>
    /// <param name="retryAfter">Optional Retry-After header value.</param>
    /// <param name="now">Current time.</param>
    /// <returns>Backoff result with recommended delay.</returns>
    public BackpressureResult RecordFailure(int statusCode, TimeSpan? retryAfter = null, DateTimeOffset? now = null)
    {
        var timestamp = now ?? DateTimeOffset.UtcNow;

        lock (_lock)
        {
            _consecutiveFailures++;
            _lastFailureAt = timestamp;
            _lastFailureReason = GetFailureReason(statusCode);

            // Use Retry-After if provided and reasonable
            if (retryAfter.HasValue && retryAfter.Value > TimeSpan.Zero && retryAfter.Value <= MaxDelay)
            {
                _backoffUntil = timestamp + retryAfter.Value;
                return new BackpressureResult(
                    ShouldBackoff: true,
                    BackoffDuration: retryAfter.Value,
                    BackoffUntil: _backoffUntil.Value,
                    ConsecutiveFailures: _consecutiveFailures,
                    Reason: _lastFailureReason,
                    StatusCode: statusCode);
            }

            // Calculate exponential backoff with jitter. Note: the backoff deadline is set on
            // every failure and ShouldAllow honors it regardless; ShouldBackoff only reports
            // true once the failure threshold is reached.
            var delay = CalculateBackoffDelay(_consecutiveFailures);
            _backoffUntil = timestamp + delay;

            return new BackpressureResult(
                ShouldBackoff: _consecutiveFailures >= FailureThreshold,
                BackoffDuration: delay,
                BackoffUntil: _backoffUntil.Value,
                ConsecutiveFailures: _consecutiveFailures,
                Reason: _lastFailureReason,
                StatusCode: statusCode);
        }
    }

    /// <summary>
    /// Records a successful request, resetting failure count.
    /// </summary>
    public void RecordSuccess()
    {
        lock (_lock)
        {
            _consecutiveFailures = 0;
            _backoffUntil = null;
            _lastFailureReason = null;
        }
    }

    /// <summary>
    /// Checks if a request should be allowed based on backoff state.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <returns>True if request should proceed, false if in backoff.</returns>
    public bool ShouldAllow(DateTimeOffset? now = null)
    {
        var timestamp = now ?? DateTimeOffset.UtcNow;

        lock (_lock)
        {
            if (!_backoffUntil.HasValue)
                return true;

            if (timestamp >= _backoffUntil.Value)
            {
                // Backoff expired
                return true;
            }

            return false;
        }
    }

    /// <summary>
    /// Resets the handler to initial state.
    /// </summary>
    public void Reset()
    {
        lock (_lock)
        {
            _consecutiveFailures = 0;
            _backoffUntil = null;
            _lastFailureReason = null;
        }
    }

    /// <summary>
    /// Gets a snapshot of the current backpressure state.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <returns>Snapshot of backpressure state.</returns>
    public BackpressureSnapshot GetSnapshot(DateTimeOffset? now = null)
    {
        var timestamp = now ?? DateTimeOffset.UtcNow;

        lock (_lock)
        {
            var isInBackoff = _backoffUntil.HasValue && timestamp < _backoffUntil.Value;
            var timeRemaining = isInBackoff ? _backoffUntil!.Value - timestamp : TimeSpan.Zero;

            return new BackpressureSnapshot(
                IsInBackoff: isInBackoff,
                ConsecutiveFailures: _consecutiveFailures,
                BackoffUntil: _backoffUntil,
                TimeRemaining: timeRemaining > TimeSpan.Zero ? timeRemaining : TimeSpan.Zero,
                LastFailureAt: _lastFailureAt,
                LastFailureReason: _lastFailureReason);
        }
    }

    private TimeSpan CalculateBackoffDelay(int failures)
    {
        // Exponential backoff: baseDelay * 2^(failures-1)
        var exponent = Math.Min(failures - 1, 10); // Cap exponent to prevent overflow
        var delayMs = BaseDelay.TotalMilliseconds * Math.Pow(2, exponent);

        // Add jitter
        if (JitterFactor > 0)
        {
            var jitter = delayMs * JitterFactor * Random.Shared.NextDouble();
            delayMs += jitter;
        }

        // Cap at max delay
        var delay = TimeSpan.FromMilliseconds(Math.Min(delayMs, MaxDelay.TotalMilliseconds));
        return delay;
    }

    private static string GetFailureReason(int statusCode) => statusCode switch
    {
        429 => "upstream_rate_limited",
        503 => "upstream_unavailable",
        502 => "upstream_bad_gateway",
        504 => "upstream_timeout",
        >= 500 and < 600 => "upstream_server_error",
        >= 400 and < 500 => "upstream_client_error",
        _ => "upstream_error"
    };
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of recording a failure.
|
||||
/// </summary>
|
||||
public sealed record BackpressureResult(
|
||||
bool ShouldBackoff,
|
||||
TimeSpan BackoffDuration,
|
||||
DateTimeOffset BackoffUntil,
|
||||
int ConsecutiveFailures,
|
||||
string Reason,
|
||||
int StatusCode);
|
||||
|
||||
/// <summary>
|
||||
/// Snapshot of backpressure handler state.
|
||||
/// </summary>
|
||||
public sealed record BackpressureSnapshot(
|
||||
bool IsInBackoff,
|
||||
int ConsecutiveFailures,
|
||||
DateTimeOffset? BackoffUntil,
|
||||
TimeSpan TimeRemaining,
|
||||
DateTimeOffset LastFailureAt,
|
||||
string? LastFailureReason);
|
||||
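// --- Usage sketch (editor's illustration, not part of this commit). ---
// Gating upstream calls with the handler above. The RecordFailure signature
// (status code plus optional Retry-After) is assumed from the result
// construction shown here; check the declaration earlier in this file.
public static async Task<HttpResponseMessage> SendWithBackpressureAsync(
    HttpClient client, HttpRequestMessage request, BackpressureHandler handler)
{
    // Sleep out any remaining backoff window instead of hammering the upstream.
    while (!handler.ShouldAllow())
    {
        await Task.Delay(handler.GetSnapshot().TimeRemaining);
    }

    var response = await client.SendAsync(request);
    var status = (int)response.StatusCode;

    if (status == 429 || status >= 500)
    {
        // Feed the failure (and any Retry-After hint) back into the handler.
        handler.RecordFailure(status, response.Headers.RetryAfter?.Delta);
    }
    else
    {
        handler.RecordSuccess();
    }

    return response;
}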
@@ -0,0 +1,226 @@
namespace StellaOps.Orchestrator.Core.RateLimiting;

/// <summary>
/// Concurrency limiter that tracks active jobs and enforces maximum concurrent execution.
/// </summary>
public sealed class ConcurrencyLimiter
{
    private readonly object _lock = new();
    private int _currentActive;

    /// <summary>
    /// Maximum allowed concurrent active jobs.
    /// </summary>
    public int MaxActive { get; }

    /// <summary>
    /// Current count of active jobs.
    /// </summary>
    public int CurrentActive
    {
        get
        {
            lock (_lock)
            {
                return _currentActive;
            }
        }
    }

    /// <summary>
    /// Number of available slots.
    /// </summary>
    public int AvailableSlots
    {
        get
        {
            lock (_lock)
            {
                return Math.Max(0, MaxActive - _currentActive);
            }
        }
    }

    /// <summary>
    /// Creates a new concurrency limiter.
    /// </summary>
    /// <param name="maxActive">Maximum concurrent jobs allowed.</param>
    /// <param name="currentActive">Starting count of active jobs.</param>
    public ConcurrencyLimiter(int maxActive, int currentActive = 0)
    {
        if (maxActive <= 0)
            throw new ArgumentOutOfRangeException(nameof(maxActive), "Max active must be positive.");
        if (currentActive < 0)
            throw new ArgumentOutOfRangeException(nameof(currentActive), "Current active cannot be negative.");

        MaxActive = maxActive;
        _currentActive = currentActive;
    }

    /// <summary>
    /// Attempts to acquire a slot for a new active job.
    /// </summary>
    /// <returns>True if a slot was acquired, false if at capacity.</returns>
    public bool TryAcquire()
    {
        lock (_lock)
        {
            if (_currentActive < MaxActive)
            {
                _currentActive++;
                return true;
            }
            return false;
        }
    }

    /// <summary>
    /// Attempts to acquire multiple slots.
    /// </summary>
    /// <param name="count">Number of slots to acquire.</param>
    /// <returns>True if all slots were acquired, false otherwise (no partial acquisition).</returns>
    public bool TryAcquire(int count)
    {
        if (count <= 0)
            throw new ArgumentOutOfRangeException(nameof(count), "Count must be positive.");

        lock (_lock)
        {
            if (_currentActive + count <= MaxActive)
            {
                _currentActive += count;
                return true;
            }
            return false;
        }
    }

    /// <summary>
    /// Releases a slot when a job completes.
    /// </summary>
    /// <returns>True if a slot was released, false if already at zero.</returns>
    public bool Release()
    {
        lock (_lock)
        {
            if (_currentActive > 0)
            {
                _currentActive--;
                return true;
            }
            return false;
        }
    }

    /// <summary>
    /// Releases multiple slots.
    /// </summary>
    /// <param name="count">Number of slots to release.</param>
    /// <returns>Number of slots actually released.</returns>
    public int Release(int count)
    {
        if (count <= 0)
            throw new ArgumentOutOfRangeException(nameof(count), "Count must be positive.");

        lock (_lock)
        {
            var released = Math.Min(count, _currentActive);
            _currentActive -= released;
            return released;
        }
    }

    /// <summary>
    /// Checks if a slot is available without acquiring it.
    /// </summary>
    /// <returns>True if at least one slot is available.</returns>
    public bool HasCapacity()
    {
        lock (_lock)
        {
            return _currentActive < MaxActive;
        }
    }

    /// <summary>
    /// Checks if multiple slots are available without acquiring them.
    /// </summary>
    /// <param name="count">Number of slots to check for.</param>
    /// <returns>True if the requested slots are available.</returns>
    public bool HasCapacity(int count)
    {
        if (count <= 0)
            throw new ArgumentOutOfRangeException(nameof(count), "Count must be positive.");

        lock (_lock)
        {
            return _currentActive + count <= MaxActive;
        }
    }

    /// <summary>
    /// Resets the limiter to zero active jobs.
    /// </summary>
    /// <returns>Number of slots that were released.</returns>
    public int Reset()
    {
        lock (_lock)
        {
            var released = _currentActive;
            _currentActive = 0;
            return released;
        }
    }

    /// <summary>
    /// Sets the current active count directly (for recovery/sync scenarios).
    /// </summary>
    /// <param name="count">New active count.</param>
    public void SetActive(int count)
    {
        if (count < 0)
            throw new ArgumentOutOfRangeException(nameof(count), "Count cannot be negative.");

        lock (_lock)
        {
            _currentActive = count;
        }
    }

    /// <summary>
    /// Gets a snapshot of the current limiter state.
    /// </summary>
    /// <returns>Snapshot of limiter state.</returns>
    public ConcurrencySnapshot GetSnapshot()
    {
        lock (_lock)
        {
            return new ConcurrencySnapshot(MaxActive, _currentActive);
        }
    }
}

/// <summary>
/// Immutable snapshot of concurrency limiter state.
/// </summary>
public sealed record ConcurrencySnapshot(
    int MaxActive,
    int CurrentActive)
{
    /// <summary>
    /// Number of available slots.
    /// </summary>
    public int AvailableSlots => Math.Max(0, MaxActive - CurrentActive);

    /// <summary>
    /// Utilization percentage (0.0 to 1.0).
    /// </summary>
    public double Utilization => (double)CurrentActive / MaxActive;

    /// <summary>
    /// Whether the limiter is at capacity.
    /// </summary>
    public bool IsAtCapacity => CurrentActive >= MaxActive;

    /// <summary>
    /// Whether there are no active jobs.
    /// </summary>
    public bool IsIdle => CurrentActive == 0;
}
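// --- Usage sketch (editor's illustration, not part of this commit). ---
// Pair TryAcquire with a Release in a finally block so a faulting job can
// never leak its slot; RunJobAsync and job are placeholders for the caller's
// actual work.
var limiter = new ConcurrencyLimiter(maxActive: 8);

if (limiter.TryAcquire())
{
    try
    {
        await RunJobAsync(job);
    }
    finally
    {
        limiter.Release(); // Always return the slot, even on failure.
    }
}
else
{
    // At capacity: leave the job queued and try again on the next tick.
}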
@@ -0,0 +1,210 @@
namespace StellaOps.Orchestrator.Core.RateLimiting;

/// <summary>
/// Token bucket rate limiter implementation.
/// Tokens refill at a constant rate up to a burst capacity.
/// </summary>
public sealed class TokenBucket
{
    private readonly object _lock = new();
    private double _currentTokens;
    private DateTimeOffset _lastRefillAt;

    /// <summary>
    /// Maximum tokens the bucket can hold (burst capacity).
    /// </summary>
    public int BurstCapacity { get; }

    /// <summary>
    /// Rate at which tokens are added (tokens per second).
    /// </summary>
    public double RefillRate { get; }

    /// <summary>
    /// Current number of available tokens.
    /// </summary>
    public double CurrentTokens
    {
        get
        {
            lock (_lock)
            {
                return _currentTokens;
            }
        }
    }

    /// <summary>
    /// Last time the bucket was refilled.
    /// </summary>
    public DateTimeOffset LastRefillAt
    {
        get
        {
            lock (_lock)
            {
                return _lastRefillAt;
            }
        }
    }

    /// <summary>
    /// Creates a new token bucket.
    /// </summary>
    /// <param name="burstCapacity">Maximum tokens the bucket can hold.</param>
    /// <param name="refillRate">Tokens per second to add.</param>
    /// <param name="initialTokens">Starting number of tokens (defaults to burst capacity).</param>
    /// <param name="lastRefillAt">Starting time for refill calculation.</param>
    public TokenBucket(
        int burstCapacity,
        double refillRate,
        double? initialTokens = null,
        DateTimeOffset? lastRefillAt = null)
    {
        if (burstCapacity <= 0)
            throw new ArgumentOutOfRangeException(nameof(burstCapacity), "Burst capacity must be positive.");
        if (refillRate <= 0)
            throw new ArgumentOutOfRangeException(nameof(refillRate), "Refill rate must be positive.");

        BurstCapacity = burstCapacity;
        RefillRate = refillRate;
        _currentTokens = Math.Min(initialTokens ?? burstCapacity, burstCapacity);
        _lastRefillAt = lastRefillAt ?? DateTimeOffset.UtcNow;
    }

    /// <summary>
    /// Attempts to consume tokens from the bucket.
    /// </summary>
    /// <param name="now">Current time for refill calculation.</param>
    /// <param name="tokensRequired">Number of tokens to consume (default 1).</param>
    /// <returns>True if tokens were consumed, false if insufficient tokens.</returns>
    public bool TryConsume(DateTimeOffset now, int tokensRequired = 1)
    {
        if (tokensRequired <= 0)
            throw new ArgumentOutOfRangeException(nameof(tokensRequired), "Tokens required must be positive.");

        lock (_lock)
        {
            Refill(now);

            if (_currentTokens >= tokensRequired)
            {
                _currentTokens -= tokensRequired;
                return true;
            }

            return false;
        }
    }

    /// <summary>
    /// Checks if the bucket has enough tokens without consuming them.
    /// </summary>
    /// <param name="now">Current time for refill calculation.</param>
    /// <param name="tokensRequired">Number of tokens to check for.</param>
    /// <returns>True if sufficient tokens are available.</returns>
    public bool HasTokens(DateTimeOffset now, int tokensRequired = 1)
    {
        lock (_lock)
        {
            Refill(now);
            return _currentTokens >= tokensRequired;
        }
    }

    /// <summary>
    /// Gets the estimated time until the specified number of tokens will be available.
    /// </summary>
    /// <param name="now">Current time for calculation.</param>
    /// <param name="tokensRequired">Number of tokens needed.</param>
    /// <returns>Time until tokens are available, or TimeSpan.Zero if already available.</returns>
    public TimeSpan EstimatedWaitTime(DateTimeOffset now, int tokensRequired = 1)
    {
        lock (_lock)
        {
            Refill(now);

            if (_currentTokens >= tokensRequired)
                return TimeSpan.Zero;

            var tokensNeeded = tokensRequired - _currentTokens;
            var secondsToWait = tokensNeeded / RefillRate;
            return TimeSpan.FromSeconds(secondsToWait);
        }
    }

    /// <summary>
    /// Refills tokens based on elapsed time.
    /// </summary>
    /// <param name="now">Current time.</param>
    public void Refill(DateTimeOffset now)
    {
        lock (_lock)
        {
            if (now <= _lastRefillAt)
                return;

            var elapsed = (now - _lastRefillAt).TotalSeconds;
            var tokensToAdd = elapsed * RefillRate;

            _currentTokens = Math.Min(_currentTokens + tokensToAdd, BurstCapacity);
            _lastRefillAt = now;
        }
    }

    /// <summary>
    /// Resets the bucket to full capacity.
    /// </summary>
    /// <param name="now">Current time.</param>
    public void Reset(DateTimeOffset now)
    {
        lock (_lock)
        {
            _currentTokens = BurstCapacity;
            _lastRefillAt = now;
        }
    }

    /// <summary>
    /// Creates a snapshot of the current bucket state.
    /// </summary>
    /// <param name="now">Current time for refill calculation.</param>
    /// <returns>Snapshot of bucket state.</returns>
    public TokenBucketSnapshot GetSnapshot(DateTimeOffset now)
    {
        lock (_lock)
        {
            Refill(now);
            return new TokenBucketSnapshot(
                BurstCapacity,
                RefillRate,
                _currentTokens,
                _lastRefillAt);
        }
    }
}

/// <summary>
/// Immutable snapshot of token bucket state.
/// </summary>
public sealed record TokenBucketSnapshot(
    int BurstCapacity,
    double RefillRate,
    double CurrentTokens,
    DateTimeOffset LastRefillAt)
{
    /// <summary>
    /// Fraction of the bucket that is full (0.0 to 1.0).
    /// </summary>
    public double FillPercent => CurrentTokens / BurstCapacity;

    /// <summary>
    /// Whether the bucket is empty (holds less than one whole token).
    /// </summary>
    public bool IsEmpty => CurrentTokens < 1;

    /// <summary>
    /// Whether the bucket is full.
    /// </summary>
    public bool IsFull => CurrentTokens >= BurstCapacity;
}
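// --- Usage sketch (editor's illustration, not part of this commit). ---
// A bucket refilling 5 tokens/second with a burst of 20: dispatch when a
// token is available, otherwise wait exactly as long as the refill math says.
var bucket = new TokenBucket(burstCapacity: 20, refillRate: 5.0);

if (!bucket.TryConsume(DateTimeOffset.UtcNow))
{
    // EstimatedWaitTime already accounts for partial tokens in the bucket.
    await Task.Delay(bucket.EstimatedWaitTime(DateTimeOffset.UtcNow));
    bucket.TryConsume(DateTimeOffset.UtcNow);
}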
@@ -0,0 +1,399 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.Scheduling;

/// <summary>
/// Plans and manages job DAG (Directed Acyclic Graph) execution.
/// Handles dependency resolution, topological sorting, and critical path analysis.
/// </summary>
public sealed class DagPlanner
{
    /// <summary>
    /// Validates that the given edges form a valid DAG (no cycles).
    /// </summary>
    /// <param name="edges">DAG edges to validate.</param>
    /// <returns>Validation result with any detected cycle.</returns>
    public static DagValidationResult ValidateDag(IEnumerable<DagEdge> edges)
    {
        ArgumentNullException.ThrowIfNull(edges);

        var edgeList = edges.ToList();
        if (edgeList.Count == 0)
        {
            return DagValidationResult.Valid();
        }

        // Build adjacency list
        var adjacency = new Dictionary<Guid, List<Guid>>();
        var allNodes = new HashSet<Guid>();

        foreach (var edge in edgeList)
        {
            allNodes.Add(edge.ParentJobId);
            allNodes.Add(edge.ChildJobId);

            if (!adjacency.TryGetValue(edge.ParentJobId, out var children))
            {
                children = [];
                adjacency[edge.ParentJobId] = children;
            }
            children.Add(edge.ChildJobId);
        }

        // Detect cycles using DFS with coloring
        var white = new HashSet<Guid>(allNodes); // Unvisited
        var gray = new HashSet<Guid>();          // In progress
        var cycleNodes = new List<Guid>();

        foreach (var node in allNodes)
        {
            if (white.Contains(node))
            {
                if (HasCycleDfs(node, adjacency, white, gray, cycleNodes))
                {
                    return DagValidationResult.CycleDetected(cycleNodes);
                }
            }
        }

        return DagValidationResult.Valid();
    }

    private static bool HasCycleDfs(
        Guid node,
        Dictionary<Guid, List<Guid>> adjacency,
        HashSet<Guid> white,
        HashSet<Guid> gray,
        List<Guid> cycleNodes)
    {
        white.Remove(node);
        gray.Add(node);

        if (adjacency.TryGetValue(node, out var children))
        {
            foreach (var child in children)
            {
                if (gray.Contains(child))
                {
                    // Back edge found - cycle detected
                    cycleNodes.Add(child);
                    cycleNodes.Add(node);
                    return true;
                }

                if (white.Contains(child) && HasCycleDfs(child, adjacency, white, gray, cycleNodes))
                {
                    cycleNodes.Add(node);
                    return true;
                }
            }
        }

        gray.Remove(node);
        return false;
    }

    /// <summary>
    /// Performs a topological sort on jobs based on their dependencies.
    /// </summary>
    /// <param name="jobIds">Job IDs to sort.</param>
    /// <param name="edges">Dependency edges.</param>
    /// <returns>Jobs in topologically sorted order (parents before children).</returns>
    public static IReadOnlyList<Guid> TopologicalSort(IEnumerable<Guid> jobIds, IEnumerable<DagEdge> edges)
    {
        ArgumentNullException.ThrowIfNull(jobIds);
        ArgumentNullException.ThrowIfNull(edges);

        var jobs = jobIds.ToHashSet();
        var edgeList = edges.ToList();

        // Build in-degree map and adjacency list
        var inDegree = jobs.ToDictionary(j => j, _ => 0);
        var adjacency = new Dictionary<Guid, List<Guid>>();

        foreach (var edge in edgeList)
        {
            if (!jobs.Contains(edge.ParentJobId) || !jobs.Contains(edge.ChildJobId))
            {
                continue; // Skip edges for jobs not in our set
            }

            inDegree[edge.ChildJobId]++;

            if (!adjacency.TryGetValue(edge.ParentJobId, out var children))
            {
                children = [];
                adjacency[edge.ParentJobId] = children;
            }
            children.Add(edge.ChildJobId);
        }

        // Kahn's algorithm
        var queue = new Queue<Guid>(inDegree.Where(kv => kv.Value == 0).Select(kv => kv.Key));
        var result = new List<Guid>(jobs.Count);

        while (queue.Count > 0)
        {
            var current = queue.Dequeue();
            result.Add(current);

            if (adjacency.TryGetValue(current, out var children))
            {
                foreach (var child in children)
                {
                    inDegree[child]--;
                    if (inDegree[child] == 0)
                    {
                        queue.Enqueue(child);
                    }
                }
            }
        }

        if (result.Count != jobs.Count)
        {
            throw new InvalidOperationException("Cycle detected in job DAG - topological sort failed.");
        }

        return result;
    }

    /// <summary>
    /// Gets all jobs that have no unmet dependencies (ready to schedule).
    /// </summary>
    /// <param name="jobs">All jobs in the DAG.</param>
    /// <param name="edges">Dependency edges.</param>
    /// <returns>Jobs with all dependencies satisfied or no dependencies.</returns>
    public static IReadOnlyList<Job> GetReadyJobs(IEnumerable<Job> jobs, IEnumerable<DagEdge> edges)
    {
        ArgumentNullException.ThrowIfNull(jobs);
        ArgumentNullException.ThrowIfNull(edges);

        var jobList = jobs.ToList();
        var edgeList = edges.ToList();

        // Build map of job ID to job and set of succeeded job IDs
        var jobMap = jobList.ToDictionary(j => j.JobId);
        var succeededJobs = jobList
            .Where(j => JobStateMachine.IsSuccess(j.Status))
            .Select(j => j.JobId)
            .ToHashSet();

        // Build map of job ID to parent dependencies
        var dependencies = new Dictionary<Guid, List<DagEdge>>();
        foreach (var edge in edgeList)
        {
            if (!dependencies.TryGetValue(edge.ChildJobId, out var deps))
            {
                deps = [];
                dependencies[edge.ChildJobId] = deps;
            }
            deps.Add(edge);
        }

        var ready = new List<Job>();

        foreach (var job in jobList)
        {
            // Skip jobs that aren't pending
            if (!JobStateMachine.IsPending(job.Status))
            {
                continue;
            }

            // Check if all dependencies are satisfied
            if (!dependencies.TryGetValue(job.JobId, out var deps))
            {
                // No dependencies - ready to go
                ready.Add(job);
                continue;
            }

            var allSatisfied = deps.All(edge => IsDependencySatisfied(edge, jobMap, succeededJobs));
            if (allSatisfied)
            {
                ready.Add(job);
            }
        }

        return ready;
    }

    private static bool IsDependencySatisfied(DagEdge edge, Dictionary<Guid, Job> jobMap, HashSet<Guid> succeededJobs)
    {
        if (!jobMap.TryGetValue(edge.ParentJobId, out var parentJob))
        {
            // Parent job doesn't exist - treat as satisfied (orphan edge)
            return true;
        }

        return edge.EdgeType switch
        {
            DagEdgeTypes.Success => succeededJobs.Contains(edge.ParentJobId),
            DagEdgeTypes.Always => JobStateMachine.IsTerminal(parentJob.Status),
            DagEdgeTypes.Failure => parentJob.Status == JobStatus.Failed,
            _ => false
        };
    }

    /// <summary>
    /// Calculates the critical path through the DAG based on estimated durations.
    /// </summary>
    /// <param name="jobs">Jobs with estimated durations.</param>
    /// <param name="edges">Dependency edges.</param>
    /// <param name="getDuration">Function to get the estimated duration for a job.</param>
    /// <returns>Critical path information.</returns>
    public static CriticalPathResult CalculateCriticalPath(
        IEnumerable<Job> jobs,
        IEnumerable<DagEdge> edges,
        Func<Job, TimeSpan> getDuration)
    {
        ArgumentNullException.ThrowIfNull(jobs);
        ArgumentNullException.ThrowIfNull(edges);
        ArgumentNullException.ThrowIfNull(getDuration);

        var jobList = jobs.ToList();
        var edgeList = edges.ToList();

        if (jobList.Count == 0)
        {
            return new CriticalPathResult([], TimeSpan.Zero);
        }

        var jobMap = jobList.ToDictionary(j => j.JobId);
        var sortedIds = TopologicalSort(jobList.Select(j => j.JobId), edgeList);

        // Build reverse adjacency (child -> parents)
        var parents = new Dictionary<Guid, List<Guid>>();
        foreach (var edge in edgeList)
        {
            if (!parents.TryGetValue(edge.ChildJobId, out var parentList))
            {
                parentList = [];
                parents[edge.ChildJobId] = parentList;
            }
            parentList.Add(edge.ParentJobId);
        }

        // Forward pass: calculate earliest start times
        var earliestStart = new Dictionary<Guid, TimeSpan>();
        var earliestFinish = new Dictionary<Guid, TimeSpan>();

        foreach (var jobId in sortedIds)
        {
            var job = jobMap[jobId];
            var duration = getDuration(job);

            var maxParentFinish = TimeSpan.Zero;
            if (parents.TryGetValue(jobId, out var parentIds))
            {
                foreach (var parentId in parentIds)
                {
                    if (earliestFinish.TryGetValue(parentId, out var pf) && pf > maxParentFinish)
                    {
                        maxParentFinish = pf;
                    }
                }
            }

            earliestStart[jobId] = maxParentFinish;
            earliestFinish[jobId] = maxParentFinish + duration;
        }

        // Find total duration and identify critical path
        var totalDuration = earliestFinish.Values.DefaultIfEmpty(TimeSpan.Zero).Max();

        // Backward pass: identify critical path (jobs where slack = 0)
        var criticalPath = new List<Guid>();
        var latestFinish = new Dictionary<Guid, TimeSpan>();

        foreach (var jobId in sortedIds.Reverse())
        {
            // Find minimum latest start of children
            var minChildStart = totalDuration;
            var childIds = edgeList.Where(e => e.ParentJobId == jobId).Select(e => e.ChildJobId);
            foreach (var childId in childIds)
            {
                if (latestFinish.TryGetValue(childId, out var lf))
                {
                    var childLatestStart = lf - getDuration(jobMap[childId]);
                    if (childLatestStart < minChildStart)
                    {
                        minChildStart = childLatestStart;
                    }
                }
            }

            latestFinish[jobId] = minChildStart;

            // Check if on critical path (slack = 0)
            var slack = minChildStart - earliestFinish[jobId];
            if (slack <= TimeSpan.Zero)
            {
                criticalPath.Add(jobId);
            }
        }

        criticalPath.Reverse();
        return new CriticalPathResult(criticalPath, totalDuration);
    }

    /// <summary>
    /// Gets jobs that are blocked by a specific failed job.
    /// </summary>
    /// <param name="failedJobId">The failed job ID.</param>
    /// <param name="edges">Dependency edges.</param>
    /// <returns>All job IDs that are transitively blocked.</returns>
    public static IReadOnlySet<Guid> GetBlockedJobs(Guid failedJobId, IEnumerable<DagEdge> edges)
    {
        ArgumentNullException.ThrowIfNull(edges);

        var edgeList = edges.ToList();
        var blocked = new HashSet<Guid>();
        var queue = new Queue<Guid>();

        // Find direct children with a "success" dependency
        foreach (var edge in edgeList.Where(e => e.ParentJobId == failedJobId && e.EdgeType == DagEdgeTypes.Success))
        {
            queue.Enqueue(edge.ChildJobId);
        }

        // BFS to find all transitively blocked jobs
        while (queue.Count > 0)
        {
            var current = queue.Dequeue();
            if (!blocked.Add(current))
            {
                continue;
            }

            foreach (var edge in edgeList.Where(e => e.ParentJobId == current))
            {
                queue.Enqueue(edge.ChildJobId);
            }
        }

        return blocked;
    }
}

/// <summary>
/// Result of DAG validation.
/// </summary>
public sealed record DagValidationResult(
    bool IsValid,
    IReadOnlyList<Guid> CycleNodes)
{
    public static DagValidationResult Valid() => new(true, []);
    public static DagValidationResult CycleDetected(IReadOnlyList<Guid> cycleNodes) => new(false, cycleNodes);
}

/// <summary>
/// Result of critical path calculation.
/// </summary>
public sealed record CriticalPathResult(
    IReadOnlyList<Guid> CriticalPathJobIds,
    TimeSpan TotalDuration);
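// --- Usage sketch (editor's illustration, not part of this commit). ---
// A typical planning pass over a run: reject cyclic graphs up front, then use
// the topological order and the ready set. `jobs` and `edges` are placeholders
// for the run's loaded Job and DagEdge collections, and the fixed five-minute
// duration is a stand-in for a real estimate.
var validation = DagPlanner.ValidateDag(edges);
if (!validation.IsValid)
{
    throw new InvalidOperationException(
        $"Run contains a cycle through: {string.Join(", ", validation.CycleNodes)}");
}

var order = DagPlanner.TopologicalSort(jobs.Select(j => j.JobId), edges);
var ready = DagPlanner.GetReadyJobs(jobs, edges); // Schedulable right now.
var path = DagPlanner.CalculateCriticalPath(jobs, edges, j => TimeSpan.FromMinutes(5));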
@@ -0,0 +1,223 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.Scheduling;

/// <summary>
/// Coordinates job scheduling decisions including quota checks,
/// dependency resolution, and status transitions.
/// </summary>
public interface IJobScheduler
{
    /// <summary>
    /// Evaluates whether a job can be scheduled.
    /// </summary>
    ScheduleDecision EvaluateScheduling(Job job, SchedulingContext context);

    /// <summary>
    /// Evaluates the outcome of a job completion and determines next steps.
    /// </summary>
    CompletionDecision EvaluateCompletion(Job job, JobStatus outcome, string? reason, CompletionContext context);

    /// <summary>
    /// Evaluates which pending jobs are ready to be scheduled.
    /// </summary>
    IReadOnlyList<Job> GetSchedulableJobs(IEnumerable<Job> pendingJobs, SchedulingContext context);
}

/// <summary>
/// Default implementation of the job scheduler.
/// </summary>
public sealed class JobScheduler : IJobScheduler
{
    /// <summary>
    /// Evaluates whether a job can transition from Pending to Scheduled.
    /// </summary>
    public ScheduleDecision EvaluateScheduling(Job job, SchedulingContext context)
    {
        ArgumentNullException.ThrowIfNull(job);
        ArgumentNullException.ThrowIfNull(context);

        // Check current status
        if (job.Status != JobStatus.Pending)
        {
            return ScheduleDecision.Reject($"Job is not pending (current: {job.Status})");
        }

        // Check if the job has a not-before time that hasn't passed
        if (job.NotBefore.HasValue && job.NotBefore.Value > context.Now)
        {
            return ScheduleDecision.Defer(job.NotBefore.Value, "Backoff period not elapsed");
        }

        // Check dependencies
        if (!context.AreDependenciesSatisfied)
        {
            return ScheduleDecision.Defer(null, "Dependencies not satisfied");
        }

        // Check quota
        if (!context.HasQuotaAvailable)
        {
            return ScheduleDecision.Defer(context.QuotaAvailableAt, "Quota exhausted");
        }

        // Check if the source/job type is throttled
        if (context.IsThrottled)
        {
            return ScheduleDecision.Defer(context.ThrottleExpiresAt, context.ThrottleReason ?? "Throttled");
        }

        return ScheduleDecision.Schedule();
    }

    /// <summary>
    /// Evaluates the outcome of a job completion.
    /// </summary>
    public CompletionDecision EvaluateCompletion(Job job, JobStatus outcome, string? reason, CompletionContext context)
    {
        ArgumentNullException.ThrowIfNull(job);
        ArgumentNullException.ThrowIfNull(context);

        // Validate transition
        if (!JobStateMachine.IsValidTransition(job.Status, outcome))
        {
            throw new InvalidJobTransitionException(job.Status, outcome);
        }

        // Success - job is done
        if (outcome == JobStatus.Succeeded)
        {
            return CompletionDecision.Complete(outcome, reason);
        }

        // Canceled - no retry
        if (outcome == JobStatus.Canceled)
        {
            return CompletionDecision.Complete(outcome, reason ?? "Canceled");
        }

        // Failed or TimedOut - check retry policy
        if (outcome == JobStatus.Failed || outcome == JobStatus.TimedOut)
        {
            var retryDecision = RetryEvaluator.Evaluate(job.Attempt, context.RetryPolicy, context.Now);

            if (retryDecision.ShouldRetry)
            {
                return CompletionDecision.Retry(
                    retryDecision.NextAttempt,
                    retryDecision.NotBefore!.Value,
                    $"{outcome}: {reason ?? "Unknown error"}. Retry scheduled.");
            }

            return CompletionDecision.Complete(
                JobStatus.Failed,
                $"{outcome}: {reason ?? "Unknown error"}. {retryDecision.Reason}");
        }

        return CompletionDecision.Complete(outcome, reason);
    }

    /// <summary>
    /// Gets all pending jobs that are ready to be scheduled.
    /// </summary>
    public IReadOnlyList<Job> GetSchedulableJobs(IEnumerable<Job> pendingJobs, SchedulingContext context)
    {
        ArgumentNullException.ThrowIfNull(pendingJobs);
        ArgumentNullException.ThrowIfNull(context);

        var schedulable = new List<Job>();

        foreach (var job in pendingJobs)
        {
            if (job.Status != JobStatus.Pending)
            {
                continue;
            }

            // Skip if in backoff period
            if (job.NotBefore.HasValue && job.NotBefore.Value > context.Now)
            {
                continue;
            }

            // Dependencies are checked via context.ReadyJobIds
            if (context.ReadyJobIds != null && !context.ReadyJobIds.Contains(job.JobId))
            {
                continue;
            }

            schedulable.Add(job);
        }

        // Sort by priority (descending), then created time (ascending)
        return schedulable
            .OrderByDescending(j => j.Priority)
            .ThenBy(j => j.CreatedAt)
            .ToList();
    }
}

/// <summary>
/// Context for scheduling decisions.
/// </summary>
public sealed record SchedulingContext(
    DateTimeOffset Now,
    bool AreDependenciesSatisfied,
    bool HasQuotaAvailable,
    DateTimeOffset? QuotaAvailableAt,
    bool IsThrottled,
    string? ThrottleReason,
    DateTimeOffset? ThrottleExpiresAt,
    IReadOnlySet<Guid>? ReadyJobIds = null)
{
    /// <summary>
    /// Creates a context where scheduling is allowed.
    /// </summary>
    public static SchedulingContext AllowScheduling(DateTimeOffset now) => new(
        now,
        AreDependenciesSatisfied: true,
        HasQuotaAvailable: true,
        QuotaAvailableAt: null,
        IsThrottled: false,
        ThrottleReason: null,
        ThrottleExpiresAt: null);
}

/// <summary>
/// Context for completion decisions.
/// </summary>
public sealed record CompletionContext(
    DateTimeOffset Now,
    RetryPolicy RetryPolicy);

/// <summary>
/// Decision about whether to schedule a job.
/// </summary>
public sealed record ScheduleDecision(
    bool CanSchedule,
    bool ShouldDefer,
    DateTimeOffset? DeferUntil,
    string? Reason)
{
    public static ScheduleDecision Schedule() => new(true, false, null, null);
    public static ScheduleDecision Defer(DateTimeOffset? until, string reason) => new(false, true, until, reason);
    public static ScheduleDecision Reject(string reason) => new(false, false, null, reason);
}

/// <summary>
/// Decision about a job completion outcome.
/// </summary>
public sealed record CompletionDecision(
    bool IsComplete,
    bool ShouldRetry,
    JobStatus FinalStatus,
    int? NextAttempt,
    DateTimeOffset? RetryNotBefore,
    string? Reason)
{
    public static CompletionDecision Complete(JobStatus status, string? reason)
        => new(true, false, status, null, null, reason);

    public static CompletionDecision Retry(int nextAttempt, DateTimeOffset notBefore, string reason)
        => new(false, true, JobStatus.Pending, nextAttempt, notBefore, reason);
}
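// --- Usage sketch (editor's illustration, not part of this commit). ---
// Driving a single job through EvaluateScheduling; the branch bodies are
// placeholders for whatever persistence the caller performs.
var scheduler = new JobScheduler();
var decision = scheduler.EvaluateScheduling(
    job, SchedulingContext.AllowScheduling(DateTimeOffset.UtcNow));

if (decision.CanSchedule)
{
    // Persist the Pending -> Scheduled transition and enqueue the job.
}
else if (decision.ShouldDefer)
{
    // Requeue at decision.DeferUntil (null means "poll again later").
}
else
{
    // Rejected outright; decision.Reason explains why.
}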
@@ -0,0 +1,141 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Core.Scheduling;

/// <summary>
/// Manages job status transitions and validates state machine rules.
///
/// State machine:
/// Pending → Scheduled (quota cleared, dependencies satisfied)
/// Scheduled → Leased (worker acquired lease)
/// Leased → Succeeded | Failed | Canceled | TimedOut
/// Failed → Pending (retry) | Failed (exhausted)
/// TimedOut → Pending (retry) | Failed (exhausted)
/// </summary>
public static class JobStateMachine
{
    /// <summary>
    /// Validates whether a status transition is allowed.
    /// </summary>
    /// <param name="from">Current status.</param>
    /// <param name="to">Target status.</param>
    /// <returns>True if transition is valid.</returns>
    public static bool IsValidTransition(JobStatus from, JobStatus to)
    {
        return (from, to) switch
        {
            // From Pending
            (JobStatus.Pending, JobStatus.Scheduled) => true,
            (JobStatus.Pending, JobStatus.Canceled) => true,

            // From Scheduled
            (JobStatus.Scheduled, JobStatus.Leased) => true,
            (JobStatus.Scheduled, JobStatus.Canceled) => true,
            (JobStatus.Scheduled, JobStatus.Pending) => true, // Back to pending (quota exceeded, dependency failed)

            // From Leased
            (JobStatus.Leased, JobStatus.Succeeded) => true,
            (JobStatus.Leased, JobStatus.Failed) => true,
            (JobStatus.Leased, JobStatus.Canceled) => true,
            (JobStatus.Leased, JobStatus.TimedOut) => true,

            // Retry transitions (Failed/TimedOut back to Pending)
            (JobStatus.Failed, JobStatus.Pending) => true,
            (JobStatus.TimedOut, JobStatus.Pending) => true,

            // Same status (idempotent)
            _ when from == to => true,

            // All other transitions are invalid
            _ => false
        };
    }

    /// <summary>
    /// Determines if a job status is terminal (no further transitions except replay).
    /// </summary>
    public static bool IsTerminal(JobStatus status) => status switch
    {
        JobStatus.Succeeded => true,
        JobStatus.Failed => true,
        JobStatus.Canceled => true,
        JobStatus.TimedOut => true,
        _ => false
    };

    /// <summary>
    /// Determines if a job status represents a successful completion.
    /// </summary>
    public static bool IsSuccess(JobStatus status) => status == JobStatus.Succeeded;

    /// <summary>
    /// Determines if a job status represents a failure that may be retried.
    /// </summary>
    public static bool IsRetryable(JobStatus status) => status switch
    {
        JobStatus.Failed => true,
        JobStatus.TimedOut => true,
        _ => false
    };

    /// <summary>
    /// Determines if a job is in a state where it can be leased by a worker.
    /// </summary>
    public static bool IsLeasable(JobStatus status) => status == JobStatus.Scheduled;

    /// <summary>
    /// Determines if a job is waiting to be scheduled.
    /// </summary>
    public static bool IsPending(JobStatus status) => status == JobStatus.Pending;

    /// <summary>
    /// Determines if a job is currently being executed.
    /// </summary>
    public static bool IsActive(JobStatus status) => status == JobStatus.Leased;

    /// <summary>
    /// Gets all valid transitions from a given status.
    /// </summary>
    public static IReadOnlyList<JobStatus> GetValidTransitions(JobStatus from)
    {
        return from switch
        {
            JobStatus.Pending => [JobStatus.Scheduled, JobStatus.Canceled],
            JobStatus.Scheduled => [JobStatus.Leased, JobStatus.Canceled, JobStatus.Pending],
            JobStatus.Leased => [JobStatus.Succeeded, JobStatus.Failed, JobStatus.Canceled, JobStatus.TimedOut],
            JobStatus.Failed => [JobStatus.Pending], // Retry only
            JobStatus.TimedOut => [JobStatus.Pending], // Retry only
            JobStatus.Succeeded => [],
            JobStatus.Canceled => [],
            _ => []
        };
    }

    /// <summary>
    /// Validates a transition and throws if invalid.
    /// </summary>
    /// <exception cref="InvalidJobTransitionException">Thrown when transition is not allowed.</exception>
    public static void ValidateTransition(JobStatus from, JobStatus to)
    {
        if (!IsValidTransition(from, to))
        {
            throw new InvalidJobTransitionException(from, to);
        }
    }
}

/// <summary>
/// Exception thrown when an invalid job status transition is attempted.
/// </summary>
public sealed class InvalidJobTransitionException : Exception
{
    public JobStatus FromStatus { get; }
    public JobStatus ToStatus { get; }

    public InvalidJobTransitionException(JobStatus from, JobStatus to)
        : base($"Invalid job status transition from '{from}' to '{to}'.")
    {
        FromStatus = from;
        ToStatus = to;
    }
}
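// --- Usage sketch (editor's illustration, not part of this commit). ---
// Guard a status update before persisting it; ValidateTransition throws
// InvalidJobTransitionException on an illegal move such as Succeeded -> Leased.
JobStateMachine.ValidateTransition(job.Status, JobStatus.Leased);

if (JobStateMachine.IsTerminal(job.Status))
{
    // Nothing left to do; only a replay can touch this job again.
}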
@@ -0,0 +1,173 @@
namespace StellaOps.Orchestrator.Core.Scheduling;

/// <summary>
/// Defines retry behavior for failed jobs.
/// </summary>
public sealed record RetryPolicy(
    /// <summary>Maximum total number of attempts, including the initial attempt.</summary>
    int MaxAttempts,

    /// <summary>Initial backoff delay in seconds.</summary>
    double InitialBackoffSeconds,

    /// <summary>Maximum backoff delay in seconds.</summary>
    double MaxBackoffSeconds,

    /// <summary>Backoff multiplier for exponential growth.</summary>
    double BackoffMultiplier,

    /// <summary>Jitter factor (0.0-1.0) to add randomness to backoff.</summary>
    double JitterFactor)
{
    /// <summary>
    /// Default retry policy: 3 attempts, exponential backoff from 5s to 300s.
    /// </summary>
    public static RetryPolicy Default { get; } = new(
        MaxAttempts: 3,
        InitialBackoffSeconds: 5.0,
        MaxBackoffSeconds: 300.0,
        BackoffMultiplier: 2.0,
        JitterFactor: 0.1);

    /// <summary>
    /// Aggressive retry policy for critical jobs: 5 attempts, quick retries.
    /// </summary>
    public static RetryPolicy Aggressive { get; } = new(
        MaxAttempts: 5,
        InitialBackoffSeconds: 1.0,
        MaxBackoffSeconds: 60.0,
        BackoffMultiplier: 1.5,
        JitterFactor: 0.2);

    /// <summary>
    /// Conservative retry policy: 2 attempts, longer delays.
    /// </summary>
    public static RetryPolicy Conservative { get; } = new(
        MaxAttempts: 2,
        InitialBackoffSeconds: 30.0,
        MaxBackoffSeconds: 600.0,
        BackoffMultiplier: 3.0,
        JitterFactor: 0.1);

    /// <summary>
    /// No-retry policy: single attempt only.
    /// </summary>
    public static RetryPolicy NoRetry { get; } = new(
        MaxAttempts: 1,
        InitialBackoffSeconds: 0,
        MaxBackoffSeconds: 0,
        BackoffMultiplier: 1.0,
        JitterFactor: 0);

    /// <summary>
    /// Determines if a job should be retried based on the current attempt.
    /// </summary>
    /// <param name="currentAttempt">Current attempt number (1-based).</param>
    /// <returns>True if retry is allowed.</returns>
    public bool ShouldRetry(int currentAttempt) => currentAttempt < MaxAttempts;

    /// <summary>
    /// Calculates the next retry time based on the current attempt.
    /// </summary>
    /// <param name="currentAttempt">Current attempt number (1-based).</param>
    /// <param name="now">Current time.</param>
    /// <returns>Earliest time for the next retry attempt.</returns>
    public DateTimeOffset CalculateNextRetryTime(int currentAttempt, DateTimeOffset now)
    {
        if (!ShouldRetry(currentAttempt))
        {
            throw new InvalidOperationException($"No retry allowed after attempt {currentAttempt} (max: {MaxAttempts}).");
        }

        var backoffSeconds = CalculateBackoffSeconds(currentAttempt);
        return now.AddSeconds(backoffSeconds);
    }

    /// <summary>
    /// Calculates the backoff duration in seconds for a given attempt.
    /// </summary>
    /// <param name="attempt">Attempt number (1-based).</param>
    /// <returns>Backoff duration in seconds.</returns>
    public double CalculateBackoffSeconds(int attempt)
    {
        if (attempt < 1)
        {
            throw new ArgumentOutOfRangeException(nameof(attempt), "Attempt must be >= 1.");
        }

        // Exponential backoff: initial * multiplier^(attempt-1)
        var exponentialBackoff = InitialBackoffSeconds * Math.Pow(BackoffMultiplier, attempt - 1);

        // Cap at maximum
        var cappedBackoff = Math.Min(exponentialBackoff, MaxBackoffSeconds);

        // Add symmetric jitter to prevent thundering herd
        var jitter = cappedBackoff * JitterFactor * (Random.Shared.NextDouble() * 2 - 1);
        var finalBackoff = Math.Max(0, cappedBackoff + jitter);

        return finalBackoff;
    }
}

/// <summary>
/// Result of evaluating the retry policy for a failed job.
/// </summary>
public sealed record RetryDecision(
    /// <summary>Whether the job should be retried.</summary>
    bool ShouldRetry,

    /// <summary>Next attempt number (if retrying).</summary>
    int NextAttempt,

    /// <summary>Earliest time for the next attempt (if retrying).</summary>
    DateTimeOffset? NotBefore,

    /// <summary>Reason for the decision.</summary>
    string Reason)
{
    /// <summary>
    /// Creates a retry decision.
    /// </summary>
    public static RetryDecision Retry(int nextAttempt, DateTimeOffset notBefore)
        => new(true, nextAttempt, notBefore, $"Scheduling retry attempt {nextAttempt}");

    /// <summary>
    /// Creates a no-retry decision (exhausted).
    /// </summary>
    public static RetryDecision Exhausted(int maxAttempts)
        => new(false, 0, null, $"Max attempts ({maxAttempts}) exhausted");

    /// <summary>
    /// Creates a no-retry decision (not retryable status).
    /// </summary>
    public static RetryDecision NotRetryable(string reason)
        => new(false, 0, null, reason);
}

/// <summary>
/// Service for evaluating retry decisions.
/// </summary>
public static class RetryEvaluator
{
    /// <summary>
    /// Evaluates whether a job should be retried and calculates timing.
    /// </summary>
    /// <param name="currentAttempt">Current attempt number.</param>
    /// <param name="policy">Retry policy to apply.</param>
    /// <param name="now">Current time.</param>
    /// <returns>Retry decision.</returns>
    public static RetryDecision Evaluate(int currentAttempt, RetryPolicy policy, DateTimeOffset now)
    {
        ArgumentNullException.ThrowIfNull(policy);

        if (!policy.ShouldRetry(currentAttempt))
        {
            return RetryDecision.Exhausted(policy.MaxAttempts);
        }

        var nextAttempt = currentAttempt + 1;
        var notBefore = policy.CalculateNextRetryTime(currentAttempt, now);

        return RetryDecision.Retry(nextAttempt, notBefore);
    }
}
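// --- Worked example (editor's illustration, not part of this commit). ---
// RetryPolicy.Default is 3 attempts, 5 s initial backoff, 2x multiplier,
// +/-10% jitter, so CalculateBackoffSeconds yields roughly 5 s then 10 s.
var policy = RetryPolicy.Default;
var now = DateTimeOffset.UtcNow;

var afterFirst = RetryEvaluator.Evaluate(currentAttempt: 1, policy, now);
// afterFirst.ShouldRetry == true, NextAttempt == 2, NotBefore ~ now + 4.5-5.5 s

var afterSecond = RetryEvaluator.Evaluate(currentAttempt: 2, policy, now);
// afterSecond.NotBefore ~ now + 9-11 s

var afterThird = RetryEvaluator.Evaluate(currentAttempt: 3, policy, now);
// afterThird.ShouldRetry == false, Reason == "Max attempts (3) exhausted"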
@@ -0,0 +1,341 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Orchestrator.Core.Domain;
|
||||
|
||||
namespace StellaOps.Orchestrator.Core.SloManagement;
|
||||
|
||||
/// <summary>
|
||||
/// Options for burn rate computation.
|
||||
/// </summary>
|
||||
public sealed record BurnRateOptions
|
||||
{
|
||||
/// <summary>Short window multiplier for multi-window burn rate.</summary>
|
||||
public double ShortWindowMultiplier { get; init; } = 14.4; // 5% budget in 1 hour
|
||||
|
||||
/// <summary>Long window multiplier for multi-window burn rate.</summary>
|
||||
public double LongWindowMultiplier { get; init; } = 6.0; // 20% budget in 6 hours
|
||||
|
||||
/// <summary>Minimum events required for meaningful computation.</summary>
|
||||
public int MinimumEvents { get; init; } = 10;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event counts for SLO computation.
|
||||
/// </summary>
|
||||
public sealed record SloEventCounts(
|
||||
/// <summary>Total events in the window.</summary>
|
||||
long TotalEvents,
|
||||
|
||||
/// <summary>Good events (successful) in the window.</summary>
|
||||
long GoodEvents,
|
||||
|
||||
/// <summary>Bad events (failed) in the window.</summary>
|
||||
long BadEvents,
|
||||
|
||||
/// <summary>Start of the evaluation window.</summary>
|
||||
DateTimeOffset WindowStart,
|
||||
|
||||
/// <summary>End of the evaluation window.</summary>
|
||||
DateTimeOffset WindowEnd);
|
||||
|
||||
/// <summary>
|
||||
/// Interface for retrieving SLO event counts.
|
||||
/// </summary>
|
||||
public interface ISloEventSource
|
||||
{
|
||||
/// <summary>Gets event counts for an availability SLO.</summary>
|
||||
Task<SloEventCounts> GetAvailabilityCountsAsync(
|
||||
string tenantId,
|
||||
string? jobType,
|
||||
Guid? sourceId,
|
||||
DateTimeOffset windowStart,
|
||||
DateTimeOffset windowEnd,
|
||||
CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>Gets event counts for a latency SLO.</summary>
|
||||
Task<SloEventCounts> GetLatencyCountsAsync(
|
||||
string tenantId,
|
||||
string? jobType,
|
||||
Guid? sourceId,
|
||||
double percentile,
|
||||
double targetSeconds,
|
||||
DateTimeOffset windowStart,
|
||||
DateTimeOffset windowEnd,
|
||||
CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>Gets event counts for a throughput SLO.</summary>
|
||||
Task<SloEventCounts> GetThroughputCountsAsync(
|
||||
string tenantId,
|
||||
string? jobType,
|
||||
Guid? sourceId,
|
||||
int minimumRequired,
|
||||
DateTimeOffset windowStart,
|
||||
DateTimeOffset windowEnd,
|
||||
CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Engine for computing SLO burn rates and error budget consumption.
|
||||
/// </summary>
|
||||
public interface IBurnRateEngine
|
||||
{
|
||||
/// <summary>Computes the current state of an SLO.</summary>
|
||||
Task<SloState> ComputeStateAsync(
|
||||
Slo slo,
|
||||
CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>Computes states for all enabled SLOs for a tenant.</summary>
|
||||
Task<IReadOnlyList<SloState>> ComputeAllStatesAsync(
|
||||
string tenantId,
|
||||
CancellationToken cancellationToken);
|
||||
|
||||
/// <summary>Evaluates alert thresholds and creates alerts if needed.</summary>
|
||||
Task<IReadOnlyList<SloAlert>> EvaluateAlertsAsync(
|
||||
Slo slo,
|
||||
SloState state,
|
||||
CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of burn rate computation engine.
|
||||
/// </summary>
|
||||
public sealed class BurnRateEngine : IBurnRateEngine
|
||||
{
|
||||
private readonly ISloRepository _sloRepository;
|
||||
private readonly ISloEventSource _eventSource;
|
||||
private readonly IAlertThresholdRepository _thresholdRepository;
|
||||
private readonly ISloAlertRepository _alertRepository;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly BurnRateOptions _options;
|
||||
private readonly ILogger<BurnRateEngine> _logger;
|
||||
|
||||
public BurnRateEngine(
|
||||
ISloRepository sloRepository,
|
||||
ISloEventSource eventSource,
|
||||
IAlertThresholdRepository thresholdRepository,
|
||||
ISloAlertRepository alertRepository,
|
||||
TimeProvider timeProvider,
|
||||
BurnRateOptions options,
|
||||
ILogger<BurnRateEngine> logger)
|
||||
{
|
||||
_sloRepository = sloRepository ?? throw new ArgumentNullException(nameof(sloRepository));
|
||||
_eventSource = eventSource ?? throw new ArgumentNullException(nameof(eventSource));
|
||||
_thresholdRepository = thresholdRepository ?? throw new ArgumentNullException(nameof(thresholdRepository));
|
||||
_alertRepository = alertRepository ?? throw new ArgumentNullException(nameof(alertRepository));
|
||||
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<SloState> ComputeStateAsync(
|
||||
Slo slo,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var windowDuration = slo.GetWindowDuration();
|
||||
var windowStart = now - windowDuration;
|
||||
|
||||
// Get event counts based on SLO type
|
||||
var counts = slo.Type switch
|
||||
{
|
||||
SloType.Availability => await _eventSource.GetAvailabilityCountsAsync(
|
||||
slo.TenantId, slo.JobType, slo.SourceId, windowStart, now, cancellationToken).ConfigureAwait(false),
|
||||
|
||||
SloType.Latency => await _eventSource.GetLatencyCountsAsync(
|
||||
slo.TenantId, slo.JobType, slo.SourceId,
|
||||
slo.LatencyPercentile ?? 0.95,
|
||||
slo.LatencyTargetSeconds ?? 1.0,
|
||||
windowStart, now, cancellationToken).ConfigureAwait(false),
|
||||
|
||||
SloType.Throughput => await _eventSource.GetThroughputCountsAsync(
|
||||
slo.TenantId, slo.JobType, slo.SourceId,
|
||||
slo.ThroughputMinimum ?? 1,
|
||||
windowStart, now, cancellationToken).ConfigureAwait(false),
|
||||
|
||||
_ => throw new InvalidOperationException($"Unknown SLO type: {slo.Type}")
|
||||
};
|
||||
|
||||
// Handle no data case
|
||||
if (counts.TotalEvents < _options.MinimumEvents)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"SLO {SloId} has insufficient data ({Events} events, minimum {Min})",
|
||||
slo.SloId, counts.TotalEvents, _options.MinimumEvents);
|
||||
return SloState.NoData(slo.SloId, slo.TenantId, now, slo.Window);
|
||||
}
|
||||
|
||||
// Compute SLI (Service Level Indicator)
|
||||
var sli = (double)counts.GoodEvents / counts.TotalEvents;
|
||||
|
||||
// Compute error budget consumption
|
||||
var errorBudget = slo.ErrorBudget;
|
||||
var errorRate = 1.0 - sli;
|
||||
var budgetConsumed = errorBudget > 0 ? errorRate / errorBudget : (errorRate > 0 ? 1.0 : 0.0);
|
||||
budgetConsumed = Math.Clamp(budgetConsumed, 0, 2.0); // Allow showing overconsumption up to 200%
|
||||
|
||||
var budgetRemaining = Math.Max(0, 1.0 - budgetConsumed);
|
||||
|
||||
// Compute burn rate
|
||||
// Burn rate = (actual error rate) / (allowed error rate for sustainable consumption)
|
||||
// Sustainable consumption = error budget / window duration * elapsed time
|
||||
var elapsedRatio = (now - counts.WindowStart).TotalSeconds / windowDuration.TotalSeconds;
|
||||
var sustainableErrorRate = errorBudget * elapsedRatio;
|
||||
var burnRate = sustainableErrorRate > 0 ? errorRate / sustainableErrorRate : 0;
|
||||
|
||||
// Compute time to exhaustion
|
||||
TimeSpan? timeToExhaustion = null;
|
||||
if (burnRate > 0 && budgetRemaining > 0)
|
||||
{
|
||||
var remainingBudget = errorBudget * budgetRemaining;
|
||||
var currentErrorRatePerSecond = errorRate / (now - counts.WindowStart).TotalSeconds;
|
||||
if (currentErrorRatePerSecond > 0)
|
||||
{
|
||||
var secondsToExhaustion = remainingBudget / currentErrorRatePerSecond;
|
||||
timeToExhaustion = TimeSpan.FromSeconds(Math.Min(secondsToExhaustion, windowDuration.TotalSeconds));
|
||||
}
|
||||
}
|
||||
|
||||
// Determine if SLO is met
|
||||
var isMet = sli >= slo.Target;
|
||||
|
||||
// Determine alert severity
|
||||
var alertSeverity = DetermineAlertSeverity(budgetConsumed, burnRate);
|
||||
|
||||
var state = new SloState(
|
||||
SloId: slo.SloId,
|
||||
TenantId: slo.TenantId,
|
||||
CurrentSli: sli,
|
||||
TotalEvents: counts.TotalEvents,
|
||||
GoodEvents: counts.GoodEvents,
|
||||
BadEvents: counts.BadEvents,
|
||||
BudgetConsumed: budgetConsumed,
|
||||
BudgetRemaining: budgetRemaining,
|
||||
BurnRate: burnRate,
|
||||
TimeToExhaustion: timeToExhaustion,
|
||||
IsMet: isMet,
|
||||
AlertSeverity: alertSeverity,
|
||||
ComputedAt: now,
|
||||
WindowStart: counts.WindowStart,
|
||||
WindowEnd: counts.WindowEnd);
|
||||
|
||||
_logger.LogDebug(
|
||||
"SLO {SloId} state computed: SLI={Sli:P2}, BudgetConsumed={BudgetConsumed:P1}, BurnRate={BurnRate:F2}x",
|
||||
slo.SloId, state.CurrentSli, state.BudgetConsumed, state.BurnRate);
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<SloState>> ComputeAllStatesAsync(
|
||||
string tenantId,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var slos = await _sloRepository.ListAsync(tenantId, enabledOnly: true, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
var states = new List<SloState>(slos.Count);
|
||||
|
||||
foreach (var slo in slos)
|
||||
{
|
||||
try
|
||||
{
|
||||
var state = await ComputeStateAsync(slo, cancellationToken).ConfigureAwait(false);
|
||||
states.Add(state);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to compute state for SLO {SloId}", slo.SloId);
|
||||
// Add no-data state for failed computation
|
||||
states.Add(SloState.NoData(slo.SloId, slo.TenantId, _timeProvider.GetUtcNow(), slo.Window));
|
||||
}
|
||||
}
|
||||
|
||||
return states;
|
||||
}

    public async Task<IReadOnlyList<SloAlert>> EvaluateAlertsAsync(
        Slo slo,
        SloState state,
        CancellationToken cancellationToken)
    {
        var now = _timeProvider.GetUtcNow();
        var thresholds = await _thresholdRepository.ListBySloAsync(slo.SloId, cancellationToken)
            .ConfigureAwait(false);

        var alerts = new List<SloAlert>();

        foreach (var threshold in thresholds)
        {
            if (!threshold.ShouldTrigger(state, now))
            {
                continue;
            }

            var alert = SloAlert.Create(slo, state, threshold);
            await _alertRepository.CreateAsync(alert, cancellationToken).ConfigureAwait(false);

            var updatedThreshold = threshold.RecordTrigger(now);
            await _thresholdRepository.UpdateAsync(updatedThreshold, cancellationToken).ConfigureAwait(false);

            alerts.Add(alert);

            _logger.LogWarning(
                "SLO alert triggered: SloId={SloId}, Severity={Severity}, Message={Message}",
                slo.SloId, alert.Severity, alert.Message);
        }

        return alerts;
    }

    private static AlertSeverity DetermineAlertSeverity(double budgetConsumed, double burnRate)
    {
        // Emergency: Budget exhausted or burn rate extremely high
        if (budgetConsumed >= 1.0 || burnRate >= 10.0)
            return AlertSeverity.Emergency;

        // Critical: Budget nearly exhausted or burn rate very high
        if (budgetConsumed >= 0.8 || burnRate >= 5.0)
            return AlertSeverity.Critical;

        // Warning: Budget significantly consumed or elevated burn rate
        if (budgetConsumed >= 0.5 || burnRate >= 2.0)
            return AlertSeverity.Warning;

        // Info: Everything is normal
        return AlertSeverity.Info;
    }
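
    // Example (illustrative, not from the source): budgetConsumed = 0.6 with
    // burnRate = 1.2x falls through the Emergency and Critical checks and
    // returns Warning via the 0.5 budget threshold, while 0.1 consumed at 0.8x
    // burn returns Info. The cutoffs are in the spirit of common multi-window
    // burn-rate alerting guidance, but the specific values are project-defined.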
}

/// <summary>
/// Repository interface for SLO persistence.
/// </summary>
public interface ISloRepository
{
    Task<Slo?> GetByIdAsync(string tenantId, Guid sloId, CancellationToken cancellationToken);
    Task<IReadOnlyList<Slo>> ListAsync(string tenantId, bool enabledOnly, string? jobType = null, CancellationToken cancellationToken = default);
    Task CreateAsync(Slo slo, CancellationToken cancellationToken);
    Task UpdateAsync(Slo slo, CancellationToken cancellationToken);
    Task<bool> DeleteAsync(string tenantId, Guid sloId, CancellationToken cancellationToken);
}

/// <summary>
/// Repository interface for alert threshold persistence.
/// </summary>
public interface IAlertThresholdRepository
{
    Task<AlertBudgetThreshold?> GetByIdAsync(string tenantId, Guid thresholdId, CancellationToken cancellationToken);
    Task<IReadOnlyList<AlertBudgetThreshold>> ListBySloAsync(Guid sloId, CancellationToken cancellationToken);
    Task CreateAsync(AlertBudgetThreshold threshold, CancellationToken cancellationToken);
    Task UpdateAsync(AlertBudgetThreshold threshold, CancellationToken cancellationToken);
    Task<bool> DeleteAsync(string tenantId, Guid thresholdId, CancellationToken cancellationToken);
}

/// <summary>
/// Repository interface for SLO alert persistence.
/// </summary>
public interface ISloAlertRepository
{
    Task<SloAlert?> GetByIdAsync(string tenantId, Guid alertId, CancellationToken cancellationToken);
    Task<IReadOnlyList<SloAlert>> ListAsync(string tenantId, Guid? sloId, bool? acknowledged, bool? resolved, int limit, int offset, CancellationToken cancellationToken);
    Task CreateAsync(SloAlert alert, CancellationToken cancellationToken);
    Task UpdateAsync(SloAlert alert, CancellationToken cancellationToken);
    Task<int> GetActiveAlertCountAsync(string tenantId, CancellationToken cancellationToken);
}

@@ -1,18 +1,20 @@
<?xml version="1.0" ?>
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.0-rc.2.25502.107" />
  </ItemGroup>

</Project>

@@ -1,6 +0,0 @@
namespace StellaOps.Orchestrator.Infrastructure;

public class Class1
{

}

@@ -0,0 +1,45 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Ledger;

/// <summary>
/// Service for exporting ledger data in various formats.
/// </summary>
public interface ILedgerExporter
{
    /// <summary>
    /// Exports ledger entries to a file.
    /// </summary>
    /// <param name="export">The export request.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The completed export with output details.</returns>
    Task<LedgerExport> ExportAsync(
        LedgerExport export,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Generates a signed manifest for a ledger entry.
    /// </summary>
    /// <param name="entry">The ledger entry.</param>
    /// <param name="artifacts">The artifacts from the run.</param>
    /// <param name="buildInfo">Optional build information.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The generated manifest.</returns>
    Task<SignedManifest> GenerateManifestAsync(
        RunLedgerEntry entry,
        IReadOnlyList<Artifact> artifacts,
        string? buildInfo = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Generates a signed manifest for an export.
    /// </summary>
    /// <param name="export">The completed export.</param>
    /// <param name="entries">The entries included in the export.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The generated manifest.</returns>
    Task<SignedManifest> GenerateExportManifestAsync(
        LedgerExport export,
        IReadOnlyList<RunLedgerEntry> entries,
        CancellationToken cancellationToken = default);
}
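
// Illustrative call flow (not part of the original interface; the variable
// names below are assumptions):
//
//     LedgerExport export = await exporter.ExportAsync(pendingExport, ct);
//     SignedManifest manifest = await exporter.GenerateExportManifestAsync(export, entries, ct);
//
// ExportAsync is documented to return the completed export with output
// details, so the manifest call can then cover exactly the exported entries.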

@@ -0,0 +1,309 @@
using System.Globalization;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Ledger;

/// <summary>
/// Service for exporting ledger data in various formats.
/// </summary>
public sealed class LedgerExporter : ILedgerExporter
{
    private readonly ILedgerRepository _ledgerRepository;
    private readonly ILedgerExportRepository _exportRepository;
    private readonly ILogger<LedgerExporter> _logger;

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        WriteIndented = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    private static readonly JsonSerializerOptions NdjsonOptions = new()
    {
        WriteIndented = false,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    public LedgerExporter(
        ILedgerRepository ledgerRepository,
        ILedgerExportRepository exportRepository,
        ILogger<LedgerExporter> logger)
    {
        _ledgerRepository = ledgerRepository;
        _exportRepository = exportRepository;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<LedgerExport> ExportAsync(
        LedgerExport export,
        CancellationToken cancellationToken = default)
    {
        var startTime = DateTimeOffset.UtcNow;

        try
        {
            _logger.LogInformation(
                "Starting ledger export {ExportId} for tenant {TenantId} in format {Format}",
                export.ExportId, export.TenantId, export.Format);

            // Mark export as started
            export = export.Start();
            export = await _exportRepository.UpdateAsync(export, cancellationToken).ConfigureAwait(false);

            // Fetch entries based on filters
            var entries = await _ledgerRepository.ListAsync(
                export.TenantId,
                export.RunTypeFilter,
                export.SourceIdFilter,
                finalStatus: null,
                export.StartTime,
                export.EndTime,
                limit: int.MaxValue,
                offset: 0,
                cancellationToken).ConfigureAwait(false);

            _logger.LogInformation(
                "Found {EntryCount} ledger entries for export {ExportId}",
                entries.Count, export.ExportId);

            // Generate output based on format
            var (content, digest) = await GenerateOutputAsync(entries, export.Format, cancellationToken).ConfigureAwait(false);

            // Generate output path (in production, this would write to storage)
            var outputUri = GenerateOutputUri(export);
            var sizeBytes = Encoding.UTF8.GetByteCount(content);

            // Complete the export
            export = export.Complete(outputUri, digest, sizeBytes, entries.Count);
            export = await _exportRepository.UpdateAsync(export, cancellationToken).ConfigureAwait(false);

            var duration = DateTimeOffset.UtcNow - startTime;
            OrchestratorMetrics.LedgerExportCompleted(export.TenantId, export.Format);
            OrchestratorMetrics.RecordLedgerExportDuration(export.TenantId, export.Format, duration.TotalSeconds);
            OrchestratorMetrics.RecordLedgerExportSize(export.TenantId, export.Format, sizeBytes);

            _logger.LogInformation(
                "Completed ledger export {ExportId} with {EntryCount} entries, {SizeBytes} bytes",
                export.ExportId, entries.Count, sizeBytes);

            return export;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex,
                "Failed to export ledger {ExportId} for tenant {TenantId}",
                export.ExportId, export.TenantId);

            OrchestratorMetrics.LedgerExportFailed(export.TenantId, export.Format);

            export = export.Fail(ex.Message);
            export = await _exportRepository.UpdateAsync(export, cancellationToken).ConfigureAwait(false);

            throw;
        }
    }

    /// <inheritdoc />
    public Task<SignedManifest> GenerateManifestAsync(
        RunLedgerEntry entry,
        IReadOnlyList<Artifact> artifacts,
        string? buildInfo = null,
        CancellationToken cancellationToken = default)
    {
        _logger.LogInformation(
            "Generating manifest for ledger entry {LedgerId}, run {RunId}",
            entry.LedgerId, entry.RunId);

        var manifest = SignedManifest.CreateFromLedgerEntry(entry, buildInfo);

        OrchestratorMetrics.ManifestCreated(entry.TenantId, "run");

        return Task.FromResult(manifest);
    }

    /// <inheritdoc />
    public Task<SignedManifest> GenerateExportManifestAsync(
        LedgerExport export,
        IReadOnlyList<RunLedgerEntry> entries,
        CancellationToken cancellationToken = default)
    {
        _logger.LogInformation(
            "Generating manifest for export {ExportId} with {EntryCount} entries",
            export.ExportId, entries.Count);

        var manifest = SignedManifest.CreateFromExport(export, entries);

        OrchestratorMetrics.ManifestCreated(export.TenantId, "export");

        return Task.FromResult(manifest);
    }

    private async Task<(string Content, string Digest)> GenerateOutputAsync(
        IReadOnlyList<RunLedgerEntry> entries,
        string format,
        CancellationToken cancellationToken)
    {
        var content = format.ToLowerInvariant() switch
        {
            "json" => GenerateJson(entries),
            "ndjson" => GenerateNdjson(entries),
            "csv" => GenerateCsv(entries),
            _ => throw new ArgumentException($"Unsupported export format: {format}", nameof(format))
        };

        // Compute digest
        var bytes = Encoding.UTF8.GetBytes(content);
        var hash = await Task.Run(() => SHA256.HashData(bytes), cancellationToken).ConfigureAwait(false);
        var digest = $"sha256:{Convert.ToHexStringLower(hash)}";

        return (content, digest);
    }
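
    // Illustrative check (not part of the original): the digest is the
    // lowercase hex SHA-256 of the exact UTF-8 bytes, prefixed with "sha256:",
    // so a consumer could re-verify a downloaded export with the same recipe.
    // The property name on the completed export is an assumption here:
    //
    //     var expected = $"sha256:{Convert.ToHexStringLower(SHA256.HashData(File.ReadAllBytes(path)))}";
    //     var ok = expected == completedExport.Digest;  // property name assumed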

    private static string GenerateJson(IReadOnlyList<RunLedgerEntry> entries)
    {
        var exportData = new LedgerExportData
        {
            SchemaVersion = "1.0.0",
            ExportedAt = DateTimeOffset.UtcNow,
            EntryCount = entries.Count,
            Entries = entries.Select(MapEntry).ToList()
        };

        return JsonSerializer.Serialize(exportData, JsonOptions);
    }

    private static string GenerateNdjson(IReadOnlyList<RunLedgerEntry> entries)
    {
        var sb = new StringBuilder();

        foreach (var entry in entries)
        {
            var mapped = MapEntry(entry);
            sb.AppendLine(JsonSerializer.Serialize(mapped, NdjsonOptions));
        }

        return sb.ToString();
    }

    private static string GenerateCsv(IReadOnlyList<RunLedgerEntry> entries)
    {
        var sb = new StringBuilder();

        // Header
        sb.AppendLine("LedgerId,TenantId,RunId,SourceId,RunType,FinalStatus,TotalJobs,SucceededJobs,FailedJobs,ExecutionDurationMs,InputDigest,OutputDigest,SequenceNumber,ContentHash,PreviousEntryHash,RunCreatedAt,RunCompletedAt,LedgerCreatedAt");

        // Data rows
        foreach (var entry in entries)
        {
            sb.AppendLine(string.Join(",",
                EscapeCsv(entry.LedgerId.ToString()),
                EscapeCsv(entry.TenantId),
                EscapeCsv(entry.RunId.ToString()),
                EscapeCsv(entry.SourceId.ToString()),
                EscapeCsv(entry.RunType),
                EscapeCsv(entry.FinalStatus.ToString()),
                entry.TotalJobs,
                entry.SucceededJobs,
                entry.FailedJobs,
                entry.ExecutionDuration.TotalMilliseconds.ToString(CultureInfo.InvariantCulture),
                EscapeCsv(entry.InputDigest),
                EscapeCsv(entry.OutputDigest),
                entry.SequenceNumber,
                EscapeCsv(entry.ContentHash),
                EscapeCsv(entry.PreviousEntryHash ?? ""),
                EscapeCsv(entry.RunCreatedAt.ToString("O")),
                EscapeCsv(entry.RunCompletedAt.ToString("O")),
                EscapeCsv(entry.LedgerCreatedAt.ToString("O"))));
        }

        return sb.ToString();
    }

    private static string EscapeCsv(string value)
    {
        if (string.IsNullOrEmpty(value))
            return "";

        // Quote fields containing the delimiter, quotes, or line breaks (CR or LF),
        // doubling embedded quotes per RFC 4180.
        if (value.Contains(',') || value.Contains('"') || value.Contains('\n') || value.Contains('\r'))
        {
            return $"\"{value.Replace("\"", "\"\"")}\"";
        }

        return value;
    }
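
    // Illustrative examples (not part of the original) of the escaping rule:
    //   plain         -> plain
    //   a,b           -> "a,b"
    //   say "hi"      -> "say ""hi"""
    //   line1\nline2  -> quoted as a single field because of the line break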

    private static LedgerEntryDto MapEntry(RunLedgerEntry entry) => new()
    {
        LedgerId = entry.LedgerId,
        TenantId = entry.TenantId,
        RunId = entry.RunId,
        SourceId = entry.SourceId,
        RunType = entry.RunType,
        FinalStatus = entry.FinalStatus.ToString(),
        TotalJobs = entry.TotalJobs,
        SucceededJobs = entry.SucceededJobs,
        FailedJobs = entry.FailedJobs,
        ExecutionDurationMs = entry.ExecutionDuration.TotalMilliseconds,
        InputDigest = entry.InputDigest,
        OutputDigest = entry.OutputDigest,
        ArtifactManifest = entry.ArtifactManifest,
        SequenceNumber = entry.SequenceNumber,
        ContentHash = entry.ContentHash,
        PreviousEntryHash = entry.PreviousEntryHash,
        RunCreatedAt = entry.RunCreatedAt,
        RunCompletedAt = entry.RunCompletedAt,
        LedgerCreatedAt = entry.LedgerCreatedAt,
        Metadata = entry.Metadata
    };

    private static string GenerateOutputUri(LedgerExport export)
    {
        var extension = export.Format.ToLowerInvariant() switch
        {
            "json" => "json",
            "ndjson" => "ndjson",
            "csv" => "csv",
            _ => "dat"
        };

        return $"ledger://exports/{export.TenantId}/{export.ExportId}.{extension}";
    }

    private sealed class LedgerExportData
    {
        public required string SchemaVersion { get; init; }
        public required DateTimeOffset ExportedAt { get; init; }
        public required int EntryCount { get; init; }
        public required List<LedgerEntryDto> Entries { get; init; }
    }

    private sealed class LedgerEntryDto
    {
        public required Guid LedgerId { get; init; }
        public required string TenantId { get; init; }
        public required Guid RunId { get; init; }
        public required Guid SourceId { get; init; }
        public required string RunType { get; init; }
        public required string FinalStatus { get; init; }
        public required int TotalJobs { get; init; }
        public required int SucceededJobs { get; init; }
        public required int FailedJobs { get; init; }
        public required double ExecutionDurationMs { get; init; }
        public required string InputDigest { get; init; }
        public required string OutputDigest { get; init; }
        public required string ArtifactManifest { get; init; }
        public required long SequenceNumber { get; init; }
        public required string ContentHash { get; init; }
        public string? PreviousEntryHash { get; init; }
        public required DateTimeOffset RunCreatedAt { get; init; }
        public required DateTimeOffset RunCompletedAt { get; init; }
        public required DateTimeOffset LedgerCreatedAt { get; init; }
        public string? Metadata { get; init; }
    }
}

@@ -0,0 +1,660 @@
using System.Diagnostics.Metrics;

namespace StellaOps.Orchestrator.Infrastructure;

/// <summary>
/// Metrics instrumentation for the Orchestrator service.
/// </summary>
public static class OrchestratorMetrics
{
    private static readonly Meter Meter = new("StellaOps.Orchestrator", "1.0.0");

    private static readonly Counter<long> JobsEnqueued = Meter.CreateCounter<long>(
        "orchestrator.jobs.enqueued",
        description: "Total jobs enqueued");

    private static readonly Counter<long> JobsScheduled = Meter.CreateCounter<long>(
        "orchestrator.jobs.scheduled",
        description: "Total jobs scheduled");

    private static readonly Counter<long> JobsLeased = Meter.CreateCounter<long>(
        "orchestrator.jobs.leased",
        description: "Total jobs leased to workers");

    private static readonly Counter<long> JobsCompleted = Meter.CreateCounter<long>(
        "orchestrator.jobs.completed",
        description: "Total jobs completed");

    private static readonly Counter<long> JobsFailed = Meter.CreateCounter<long>(
        "orchestrator.jobs.failed",
        description: "Total jobs failed");

    private static readonly Counter<long> JobsRetried = Meter.CreateCounter<long>(
        "orchestrator.jobs.retried",
        description: "Total job retry attempts");

    private static readonly Counter<long> LeaseExtensions = Meter.CreateCounter<long>(
        "orchestrator.lease.extensions",
        description: "Total lease extensions");

    private static readonly Counter<long> LeaseExpirations = Meter.CreateCounter<long>(
        "orchestrator.lease.expirations",
        description: "Total lease expirations");

    private static readonly Histogram<double> JobDuration = Meter.CreateHistogram<double>(
        "orchestrator.job.duration.seconds",
        unit: "s",
        description: "Job execution duration");

    private static readonly Histogram<double> SchedulingLatency = Meter.CreateHistogram<double>(
        "orchestrator.scheduling.latency.seconds",
        unit: "s",
        description: "Time from job creation to scheduling");

    private static readonly UpDownCounter<long> ActiveConnections = Meter.CreateUpDownCounter<long>(
        "orchestrator.db.connections.active",
        description: "Active database connections");

    private static readonly UpDownCounter<long> QueueDepth = Meter.CreateUpDownCounter<long>(
        "orchestrator.queue.depth",
        description: "Number of pending jobs in queue");

    private static readonly Counter<long> ArtifactsCreated = Meter.CreateCounter<long>(
        "orchestrator.artifacts.created",
        description: "Total artifacts created");

    private static readonly Counter<long> HeartbeatsReceived = Meter.CreateCounter<long>(
        "orchestrator.heartbeats.received",
        description: "Total worker heartbeats received");

    private static readonly Counter<long> ProgressReports = Meter.CreateCounter<long>(
        "orchestrator.progress.reports",
        description: "Total job progress reports");

    private static readonly Counter<long> SourcesCreated = Meter.CreateCounter<long>(
        "orchestrator.sources.created",
        description: "Total sources created");

    private static readonly Counter<long> SourcesPaused = Meter.CreateCounter<long>(
        "orchestrator.sources.paused",
        description: "Total source pause operations");

    private static readonly Counter<long> SourcesResumed = Meter.CreateCounter<long>(
        "orchestrator.sources.resumed",
        description: "Total source resume operations");

    private static readonly Counter<long> RunsCreated = Meter.CreateCounter<long>(
        "orchestrator.runs.created",
        description: "Total runs created");

    private static readonly Counter<long> RunsCompleted = Meter.CreateCounter<long>(
        "orchestrator.runs.completed",
        description: "Total runs completed");

    private static readonly Counter<long> QuotasCreated = Meter.CreateCounter<long>(
        "orchestrator.quotas.created",
        description: "Total quotas created");

    private static readonly Counter<long> QuotasPaused = Meter.CreateCounter<long>(
        "orchestrator.quotas.paused",
        description: "Total quota pause operations");

    private static readonly Counter<long> QuotasResumed = Meter.CreateCounter<long>(
        "orchestrator.quotas.resumed",
        description: "Total quota resume operations");

    private static readonly Counter<long> ThrottlesCreated = Meter.CreateCounter<long>(
        "orchestrator.throttles.created",
        description: "Total throttles created");

    private static readonly Counter<long> ThrottlesDeactivated = Meter.CreateCounter<long>(
        "orchestrator.throttles.deactivated",
        description: "Total throttles deactivated");

    private static readonly Counter<long> RateLimitDenials = Meter.CreateCounter<long>(
        "orchestrator.ratelimit.denials",
        description: "Total rate limit denials");

    private static readonly Counter<long> BackpressureEvents = Meter.CreateCounter<long>(
        "orchestrator.backpressure.events",
        description: "Total backpressure events from upstream");

    private static readonly Histogram<double> TokenBucketUtilization = Meter.CreateHistogram<double>(
        "orchestrator.ratelimit.token_utilization",
        unit: "ratio",
        description: "Token bucket utilization ratio (0-1)");

    private static readonly Histogram<double> ConcurrencyUtilization = Meter.CreateHistogram<double>(
        "orchestrator.ratelimit.concurrency_utilization",
        unit: "ratio",
        description: "Concurrency limiter utilization ratio (0-1)");

    public static void JobEnqueued(string tenantId, string jobType)
        => JobsEnqueued.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void JobScheduled(string tenantId, string jobType)
        => JobsScheduled.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void JobLeased(string tenantId, string jobType)
        => JobsLeased.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void JobCompleted(string tenantId, string jobType, string status)
        => JobsCompleted.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType),
            new KeyValuePair<string, object?>("status", status));

    public static void JobFailed(string tenantId, string jobType)
        => JobsFailed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void JobRetried(string tenantId, string jobType, int attempt)
        => JobsRetried.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType),
            new KeyValuePair<string, object?>("attempt", attempt));

    public static void LeaseExtended(string tenantId, string jobType)
        => LeaseExtensions.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void LeaseExpired(string tenantId, string jobType)
        => LeaseExpirations.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void RecordJobDuration(string tenantId, string jobType, double durationSeconds)
        => JobDuration.Record(durationSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void RecordSchedulingLatency(string tenantId, string jobType, double latencySeconds)
        => SchedulingLatency.Record(latencySeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void ConnectionOpened(string role)
        => ActiveConnections.Add(1, new KeyValuePair<string, object?>("role", role));

    public static void ConnectionClosed(string role)
        => ActiveConnections.Add(-1, new KeyValuePair<string, object?>("role", role));

    public static void QueueDepthChanged(string tenantId, string jobType, long delta)
        => QueueDepth.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void ArtifactCreated(string tenantId, string artifactType)
        => ArtifactsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("artifact_type", artifactType));

    public static void HeartbeatReceived(string tenantId, string jobType)
        => HeartbeatsReceived.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void ProgressReported(string tenantId, string jobType)
        => ProgressReports.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType));

    public static void SourceCreated(string tenantId, string sourceType)
        => SourcesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("source_type", sourceType));

    public static void SourcePaused(string tenantId)
        => SourcesPaused.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void SourceResumed(string tenantId)
        => SourcesResumed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void RunCreated(string tenantId, string runType)
        => RunsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("run_type", runType));

    public static void RunCompleted(string tenantId, string runType, string status)
        => RunsCompleted.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("run_type", runType),
            new KeyValuePair<string, object?>("status", status));

    public static void QuotaCreated(string tenantId, string? jobType)
        => QuotasCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));

    public static void QuotaPaused(string tenantId)
        => QuotasPaused.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void QuotaResumed(string tenantId)
        => QuotasResumed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void ThrottleCreated(string tenantId, string reason)
        => ThrottlesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("reason", reason));

    public static void ThrottleDeactivated(string tenantId)
        => ThrottlesDeactivated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void RateLimitDenied(string tenantId, string? jobType, string reason)
        => RateLimitDenials.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"),
            new KeyValuePair<string, object?>("reason", reason));

    public static void BackpressureEvent(string tenantId, int statusCode, string reason)
        => BackpressureEvents.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("status_code", statusCode),
            new KeyValuePair<string, object?>("reason", reason));

    public static void RecordTokenBucketUtilization(string tenantId, string? jobType, double utilization)
        => TokenBucketUtilization.Record(utilization, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));

    public static void RecordConcurrencyUtilization(string tenantId, string? jobType, double utilization)
        => ConcurrencyUtilization.Record(utilization, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));

    // Watermark metrics
    private static readonly Counter<long> WatermarksCreatedCounter = Meter.CreateCounter<long>(
        "orchestrator.watermarks.created",
        description: "Total watermarks created");

    private static readonly Counter<long> WatermarksAdvanced = Meter.CreateCounter<long>(
        "orchestrator.watermarks.advanced",
        description: "Total watermark advancement operations");

    private static readonly Histogram<double> WatermarkLag = Meter.CreateHistogram<double>(
        "orchestrator.watermark.lag.seconds",
        unit: "s",
        description: "Watermark lag from current time");

    // Backfill metrics
    private static readonly Counter<long> BackfillsCreated = Meter.CreateCounter<long>(
        "orchestrator.backfills.created",
        description: "Total backfill requests created");

    private static readonly Counter<long> BackfillStatusChanges = Meter.CreateCounter<long>(
        "orchestrator.backfills.status_changes",
        description: "Total backfill status changes");

    private static readonly Counter<long> BackfillEventsProcessed = Meter.CreateCounter<long>(
        "orchestrator.backfills.events_processed",
        description: "Total events processed by backfills");

    private static readonly Counter<long> BackfillEventsSkipped = Meter.CreateCounter<long>(
        "orchestrator.backfills.events_skipped",
        description: "Total events skipped by backfills (duplicates)");

    private static readonly Histogram<double> BackfillDuration = Meter.CreateHistogram<double>(
        "orchestrator.backfill.duration.seconds",
        unit: "s",
        description: "Backfill execution duration");

    private static readonly Histogram<double> BackfillProgress = Meter.CreateHistogram<double>(
        "orchestrator.backfill.progress",
        unit: "percent",
        description: "Backfill progress percentage");

    // Duplicate suppression metrics
    private static readonly Counter<long> ProcessedEventsMarkedCounter = Meter.CreateCounter<long>(
        "orchestrator.processed_events.marked",
        description: "Total processed events marked for duplicate suppression");

    private static readonly Counter<long> ProcessedEventsCleanedUpCounter = Meter.CreateCounter<long>(
        "orchestrator.processed_events.cleaned_up",
        description: "Total expired processed events cleaned up");

    private static readonly Counter<long> DuplicatesDetected = Meter.CreateCounter<long>(
        "orchestrator.duplicates.detected",
        description: "Total duplicate events detected");

    public static void WatermarkCreated(string tenantId, string scopeKey)
        => WatermarksCreatedCounter.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void WatermarkAdvanced(string tenantId, string scopeKey)
        => WatermarksAdvanced.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void RecordWatermarkLag(string tenantId, string scopeKey, double lagSeconds)
        => WatermarkLag.Record(lagSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void BackfillCreated(string tenantId, string scopeKey)
        => BackfillsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void BackfillStatusChanged(string tenantId, string scopeKey, string status)
        => BackfillStatusChanges.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey),
            new KeyValuePair<string, object?>("status", status));

    public static void BackfillEventProcessed(string tenantId, string scopeKey, long count)
        => BackfillEventsProcessed.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void BackfillEventSkipped(string tenantId, string scopeKey, long count)
        => BackfillEventsSkipped.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void RecordBackfillDuration(string tenantId, string scopeKey, double durationSeconds)
        => BackfillDuration.Record(durationSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void RecordBackfillProgress(string tenantId, string scopeKey, double progressPercent)
        => BackfillProgress.Record(progressPercent, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void ProcessedEventsMarked(string tenantId, string scopeKey, long count)
        => ProcessedEventsMarkedCounter.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    public static void ProcessedEventsCleanedUp(string tenantId, long count)
        => ProcessedEventsCleanedUpCounter.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void DuplicateDetected(string tenantId, string scopeKey)
        => DuplicatesDetected.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("scope_key", scopeKey));

    // Dead-letter metrics
    private static readonly Counter<long> DeadLetterEntriesCreated = Meter.CreateCounter<long>(
        "orchestrator.deadletter.created",
        description: "Total dead-letter entries created");

    private static readonly Counter<long> DeadLetterStatusChanges = Meter.CreateCounter<long>(
        "orchestrator.deadletter.status_changes",
        description: "Total dead-letter status changes");

    private static readonly Counter<long> DeadLetterReplayAttempts = Meter.CreateCounter<long>(
        "orchestrator.deadletter.replay_attempts",
        description: "Total dead-letter replay attempts");

    private static readonly Counter<long> DeadLetterReplaySuccesses = Meter.CreateCounter<long>(
        "orchestrator.deadletter.replay_successes",
        description: "Total successful dead-letter replays");

    private static readonly Counter<long> DeadLetterReplayFailures = Meter.CreateCounter<long>(
        "orchestrator.deadletter.replay_failures",
        description: "Total failed dead-letter replays");

    private static readonly Counter<long> DeadLetterEntriesExpired = Meter.CreateCounter<long>(
        "orchestrator.deadletter.expired",
        description: "Total dead-letter entries marked as expired");

    private static readonly Counter<long> DeadLetterEntriesPurged = Meter.CreateCounter<long>(
        "orchestrator.deadletter.purged",
        description: "Total dead-letter entries purged");

    private static readonly Counter<long> DeadLetterNotificationsSent = Meter.CreateCounter<long>(
        "orchestrator.deadletter.notifications_sent",
        description: "Total dead-letter notifications sent");

    private static readonly Counter<long> DeadLetterNotificationsFailed = Meter.CreateCounter<long>(
        "orchestrator.deadletter.notifications_failed",
        description: "Total failed dead-letter notifications");

    private static readonly UpDownCounter<long> DeadLetterPendingCount = Meter.CreateUpDownCounter<long>(
        "orchestrator.deadletter.pending",
        description: "Current number of pending dead-letter entries");

    public static void DeadLetterCreated(string tenantId, string jobType, string errorCode, string category)
        => DeadLetterEntriesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType),
            new KeyValuePair<string, object?>("error_code", errorCode),
            new KeyValuePair<string, object?>("category", category));

    public static void DeadLetterStatusChanged(string tenantId, string jobType, string status)
        => DeadLetterStatusChanges.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("job_type", jobType),
            new KeyValuePair<string, object?>("status", status));

    public static void DeadLetterReplayAttempted(string tenantId, string triggeredBy)
        => DeadLetterReplayAttempts.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("triggered_by", triggeredBy));

    public static void DeadLetterReplaySucceeded(string tenantId)
        => DeadLetterReplaySuccesses.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void DeadLetterReplayFailed(string tenantId)
        => DeadLetterReplayFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void DeadLetterExpired(int count)
        => DeadLetterEntriesExpired.Add(count);

    public static void DeadLetterPurged(int count)
        => DeadLetterEntriesPurged.Add(count);

    public static void DeadLetterNotificationSent(string tenantId, string channel, string eventType)
        => DeadLetterNotificationsSent.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("channel", channel),
            new KeyValuePair<string, object?>("event_type", eventType));

    public static void DeadLetterNotificationFailed(string tenantId, string channel, string eventType)
        => DeadLetterNotificationsFailed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("channel", channel),
            new KeyValuePair<string, object?>("event_type", eventType));

    public static void DeadLetterPendingChanged(string tenantId, long delta)
        => DeadLetterPendingCount.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId));

    // SLO metrics
    private static readonly Counter<long> SlosCreated = Meter.CreateCounter<long>(
        "orchestrator.slos.created",
        description: "Total SLOs created");

    private static readonly Counter<long> SlosUpdated = Meter.CreateCounter<long>(
        "orchestrator.slos.updated",
        description: "Total SLO updates");

    private static readonly Counter<long> SloAlertsTriggered = Meter.CreateCounter<long>(
        "orchestrator.slo.alerts_triggered",
        description: "Total SLO alerts triggered");

    private static readonly Counter<long> SloAlertsAcknowledged = Meter.CreateCounter<long>(
        "orchestrator.slo.alerts_acknowledged",
        description: "Total SLO alerts acknowledged");

    private static readonly Counter<long> SloAlertsResolved = Meter.CreateCounter<long>(
        "orchestrator.slo.alerts_resolved",
        description: "Total SLO alerts resolved");

    private static readonly Histogram<double> SloBudgetConsumed = Meter.CreateHistogram<double>(
        "orchestrator.slo.budget_consumed",
        unit: "ratio",
        description: "SLO error budget consumed (0-1)");

    private static readonly Histogram<double> SloBurnRate = Meter.CreateHistogram<double>(
        "orchestrator.slo.burn_rate",
        unit: "ratio",
        description: "SLO burn rate (1.0 = sustainable)");

    private static readonly Histogram<double> SloCurrentSli = Meter.CreateHistogram<double>(
        "orchestrator.slo.current_sli",
        unit: "ratio",
        description: "Current SLI value (0-1)");

    private static readonly UpDownCounter<long> SloActiveAlerts = Meter.CreateUpDownCounter<long>(
        "orchestrator.slo.active_alerts",
        description: "Current number of active SLO alerts");

    private static readonly Histogram<double> SloBudgetRemaining = Meter.CreateHistogram<double>(
        "orchestrator.slo.budget_remaining",
        unit: "ratio",
        description: "SLO error budget remaining (0-1)");

    private static readonly Histogram<double> SloTimeToExhaustion = Meter.CreateHistogram<double>(
        "orchestrator.slo.time_to_exhaustion.seconds",
        unit: "s",
        description: "Estimated time until error budget exhaustion");

    public static void SloCreated(string tenantId, string sloType, string? jobType)
        => SlosCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_type", sloType),
            new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));

    public static void SloUpdated(string tenantId, string sloName)
        => SlosUpdated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName));

    public static void SloAlertTriggered(string tenantId, string sloName, string severity)
        => SloAlertsTriggered.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName),
            new KeyValuePair<string, object?>("severity", severity));

    public static void SloAlertAcknowledged(string tenantId, string sloName)
        => SloAlertsAcknowledged.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName));

    public static void SloAlertResolved(string tenantId, string sloName)
        => SloAlertsResolved.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName));

    public static void RecordSloBudgetConsumed(string tenantId, string sloName, string sloType, double consumed)
        => SloBudgetConsumed.Record(consumed, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName),
            new KeyValuePair<string, object?>("slo_type", sloType));

    public static void RecordSloBurnRate(string tenantId, string sloName, string sloType, double burnRate)
        => SloBurnRate.Record(burnRate, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName),
            new KeyValuePair<string, object?>("slo_type", sloType));

    public static void RecordSloCurrentSli(string tenantId, string sloName, string sloType, double sli)
        => SloCurrentSli.Record(sli, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName),
            new KeyValuePair<string, object?>("slo_type", sloType));

    public static void SloActiveAlertsChanged(string tenantId, long delta)
        => SloActiveAlerts.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void RecordSloBudgetRemaining(string tenantId, string sloName, string sloType, double remaining)
        => SloBudgetRemaining.Record(remaining, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName),
            new KeyValuePair<string, object?>("slo_type", sloType));

    public static void RecordSloTimeToExhaustion(string tenantId, string sloName, double seconds)
        => SloTimeToExhaustion.Record(seconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("slo_name", sloName));

    // Audit log metrics
    private static readonly Counter<long> AuditEntriesCreated = Meter.CreateCounter<long>(
        "orchestrator.audit.entries_created",
        description: "Total audit log entries created");

    private static readonly Counter<long> AuditChainVerifications = Meter.CreateCounter<long>(
        "orchestrator.audit.chain_verifications",
        description: "Total audit chain verification operations");

    private static readonly Counter<long> AuditChainFailures = Meter.CreateCounter<long>(
        "orchestrator.audit.chain_failures",
        description: "Total audit chain verification failures");

    private static readonly UpDownCounter<long> AuditEntryCount = Meter.CreateUpDownCounter<long>(
        "orchestrator.audit.entry_count",
        description: "Current number of audit entries");

    // Ledger metrics
    private static readonly Counter<long> LedgerEntriesCreated = Meter.CreateCounter<long>(
        "orchestrator.ledger.entries_created",
        description: "Total ledger entries created");

    private static readonly Counter<long> LedgerChainVerifications = Meter.CreateCounter<long>(
        "orchestrator.ledger.chain_verifications",
        description: "Total ledger chain verification operations");

    private static readonly Counter<long> LedgerChainFailures = Meter.CreateCounter<long>(
        "orchestrator.ledger.chain_failures",
        description: "Total ledger chain verification failures");

    private static readonly Counter<long> LedgerExportsRequested = Meter.CreateCounter<long>(
        "orchestrator.ledger.exports_requested",
        description: "Total ledger export requests");

    private static readonly Counter<long> LedgerExportsCompleted = Meter.CreateCounter<long>(
        "orchestrator.ledger.exports_completed",
        description: "Total ledger exports completed successfully");

    private static readonly Counter<long> LedgerExportsFailed = Meter.CreateCounter<long>(
        "orchestrator.ledger.exports_failed",
        description: "Total ledger exports that failed");

    private static readonly Histogram<double> LedgerExportDuration = Meter.CreateHistogram<double>(
        "orchestrator.ledger.export_duration.seconds",
        unit: "s",
        description: "Ledger export duration");

    private static readonly Histogram<long> LedgerExportSize = Meter.CreateHistogram<long>(
        "orchestrator.ledger.export_size.bytes",
        unit: "bytes",
        description: "Ledger export file size");

    // Manifest metrics
    private static readonly Counter<long> ManifestsCreated = Meter.CreateCounter<long>(
        "orchestrator.manifests.created",
        description: "Total signed manifests created");

    private static readonly Counter<long> ManifestVerifications = Meter.CreateCounter<long>(
        "orchestrator.manifests.verifications",
        description: "Total manifest verification operations");

    private static readonly Counter<long> ManifestVerificationFailures = Meter.CreateCounter<long>(
        "orchestrator.manifests.verification_failures",
        description: "Total manifest verification failures");

    public static void AuditEntryCreated(string tenantId, string eventType, string resourceType)
        => AuditEntriesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("event_type", eventType),
            new KeyValuePair<string, object?>("resource_type", resourceType));

    public static void AuditChainVerified(string tenantId, bool success)
    {
        AuditChainVerifications.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
        if (!success)
        {
            AuditChainFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
        }
    }

    public static void AuditEntryCountChanged(string tenantId, long delta)
        => AuditEntryCount.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId));

    public static void LedgerEntryCreated(string tenantId, string runType, string finalStatus)
        => LedgerEntriesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("run_type", runType),
            new KeyValuePair<string, object?>("final_status", finalStatus));

    public static void LedgerChainVerified(string tenantId, bool success)
    {
        LedgerChainVerifications.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
        if (!success)
        {
            LedgerChainFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
        }
    }

    public static void LedgerExportRequested(string tenantId, string format)
        => LedgerExportsRequested.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("format", format));

    public static void LedgerExportCompleted(string tenantId, string format)
        => LedgerExportsCompleted.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("format", format));

    public static void LedgerExportFailed(string tenantId, string format)
        => LedgerExportsFailed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("format", format));

    public static void RecordLedgerExportDuration(string tenantId, string format, double durationSeconds)
        => LedgerExportDuration.Record(durationSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("format", format));

    public static void RecordLedgerExportSize(string tenantId, string format, long sizeBytes)
        => LedgerExportSize.Record(sizeBytes, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("format", format));

    public static void ManifestCreated(string tenantId, string provenanceType)
        => ManifestsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
            new KeyValuePair<string, object?>("provenance_type", provenanceType));

    public static void ManifestVerified(string tenantId, bool success)
    {
        ManifestVerifications.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
        if (!success)
        {
            ManifestVerificationFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
        }
    }
}
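
// Illustrative wiring sketch (not part of the original file): because the
// meter is named "StellaOps.Orchestrator", an OpenTelemetry host could
// subscribe to these instruments roughly like this. The registration methods
// assume the standard OpenTelemetry .NET SDK packages:
//
//     services.AddOpenTelemetry()
//         .WithMetrics(metrics => metrics
//             .AddMeter("StellaOps.Orchestrator")
//             .AddPrometheusExporter());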

@@ -0,0 +1,130 @@
namespace StellaOps.Orchestrator.Infrastructure.Options;

/// <summary>
/// Configuration options for the Orchestrator service.
/// </summary>
public sealed class OrchestratorServiceOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Orchestrator";

    /// <summary>
    /// HTTP header name for tenant identification.
    /// </summary>
    public string TenantHeader { get; set; } = "X-Tenant-Id";

    /// <summary>
    /// Database connection options.
    /// </summary>
    public DatabaseOptions Database { get; set; } = new();

    /// <summary>
    /// Lease management options.
    /// </summary>
    public LeaseOptions Lease { get; set; } = new();

    /// <summary>
    /// Rate-limiting options.
    /// </summary>
    public RateLimitOptions RateLimit { get; set; } = new();

    /// <summary>
    /// Database connection options.
    /// </summary>
    public sealed class DatabaseOptions
    {
        /// <summary>
        /// PostgreSQL connection string.
        /// </summary>
        public string ConnectionString { get; set; } = string.Empty;

        /// <summary>
        /// Command timeout in seconds.
        /// </summary>
        public int CommandTimeoutSeconds { get; set; } = 30;

        /// <summary>
        /// Enable connection pooling.
        /// </summary>
        public bool EnablePooling { get; set; } = true;

        /// <summary>
        /// Minimum pool size.
        /// </summary>
        public int MinPoolSize { get; set; } = 1;

        /// <summary>
        /// Maximum pool size.
        /// </summary>
        public int MaxPoolSize { get; set; } = 100;
    }

    /// <summary>
    /// Lease management options.
    /// </summary>
    public sealed class LeaseOptions
    {
        /// <summary>
        /// Default lease duration in seconds.
        /// </summary>
        public int DefaultLeaseDurationSeconds { get; set; } = 300;

        /// <summary>
        /// Maximum lease duration in seconds.
        /// </summary>
        public int MaxLeaseDurationSeconds { get; set; } = 3600;

        /// <summary>
        /// Lease renewal threshold (renew when this fraction of lease remains).
        /// </summary>
        public double RenewalThreshold { get; set; } = 0.5;

        /// <summary>
        /// Interval for checking expired leases in seconds.
        /// </summary>
        public int ExpiryCheckIntervalSeconds { get; set; } = 30;
    }

    /// <summary>
    /// Rate-limiting options.
    /// </summary>
    public sealed class RateLimitOptions
    {
        /// <summary>
        /// Default maximum concurrent active jobs per tenant.
        /// </summary>
        public int DefaultMaxActive { get; set; } = 10;

        /// <summary>
        /// Default maximum jobs per hour per tenant.
        /// </summary>
        public int DefaultMaxPerHour { get; set; } = 1000;

        /// <summary>
        /// Default burst capacity for token bucket.
        /// </summary>
        public int DefaultBurstCapacity { get; set; } = 50;

        /// <summary>
        /// Default token refill rate (tokens per second).
        /// </summary>
        public double DefaultRefillRate { get; set; } = 1.0;

        /// <summary>
        /// Failure rate threshold for circuit breaker (0.0-1.0).
        /// </summary>
        public double CircuitBreakerThreshold { get; set; } = 0.5;

        /// <summary>
        /// Window size in minutes for failure rate calculation.
        /// </summary>
        public int CircuitBreakerWindowMinutes { get; set; } = 5;

        /// <summary>
        /// Minimum sample size before circuit breaker can trip.
        /// </summary>
        public int CircuitBreakerMinSamples { get; set; } = 10;
    }
}
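
// Illustrative binding sketch (not part of the original file): the SectionName
// constant suggests configuration shaped like
//
//     { "Orchestrator": { "TenantHeader": "X-Tenant-Id", "Database": { "ConnectionString": "..." } } }
//
// bound at startup with the standard options pattern:
//
//     services.Configure<OrchestratorServiceOptions>(
//         configuration.GetSection(OrchestratorServiceOptions.SectionName));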
|
||||
@@ -0,0 +1,118 @@
|
||||
using System.Data;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using Npgsql;
|
||||
using StellaOps.Orchestrator.Infrastructure.Options;
|
||||
|
||||
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
|
||||
|
||||
/// <summary>
|
||||
/// Manages PostgreSQL connections for the Orchestrator service.
|
||||
/// Configures session-level tenant context for row-level security.
|
||||
/// </summary>
|
||||
public sealed class OrchestratorDataSource : IAsyncDisposable
|
||||
{
|
||||
private readonly NpgsqlDataSource _dataSource;
|
||||
private readonly OrchestratorServiceOptions.DatabaseOptions _options;
|
||||
private readonly ILogger<OrchestratorDataSource> _logger;
|
||||
|
||||
public OrchestratorDataSource(
|
||||
IOptions<OrchestratorServiceOptions> options,
|
||||
ILogger<OrchestratorDataSource> logger)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(options);
|
||||
_options = options.Value.Database;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
|
||||
var builder = new NpgsqlDataSourceBuilder(_options.ConnectionString);
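        // Assumption: pooling settings (EnablePooling, Min/MaxPoolSize) travel
        // inside the connection string; the builder is not configured with
        // them explicitly here.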
        _dataSource = builder.Build();
    }

    /// <summary>
    /// Command timeout in seconds.
    /// </summary>
    public int CommandTimeoutSeconds => _options.CommandTimeoutSeconds;

    /// <summary>
    /// Disposes the data source and releases all connections.
    /// </summary>
    public async ValueTask DisposeAsync()
    {
        await _dataSource.DisposeAsync().ConfigureAwait(false);
    }

    /// <summary>
    /// Opens a connection with tenant context configured.
    /// </summary>
    /// <param name="tenantId">Tenant identifier for session configuration.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Open PostgreSQL connection.</returns>
    public Task<NpgsqlConnection> OpenConnectionAsync(string tenantId, CancellationToken cancellationToken)
        => OpenConnectionInternalAsync(tenantId, "unspecified", cancellationToken);

    /// <summary>
    /// Opens a connection with tenant context and role label configured.
    /// </summary>
    /// <param name="tenantId">Tenant identifier for session configuration.</param>
    /// <param name="role">Role label for metrics/logging (e.g., "reader", "writer").</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Open PostgreSQL connection.</returns>
    public Task<NpgsqlConnection> OpenConnectionAsync(string tenantId, string role, CancellationToken cancellationToken)
        => OpenConnectionInternalAsync(tenantId, role, cancellationToken);

    private async Task<NpgsqlConnection> OpenConnectionInternalAsync(string tenantId, string role, CancellationToken cancellationToken)
    {
        var connection = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);

        try
        {
            await ConfigureSessionAsync(connection, tenantId, cancellationToken).ConfigureAwait(false);
            OrchestratorMetrics.ConnectionOpened(role);
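            // Balance the ConnectionOpened metric above: emit ConnectionClosed
            // for the same role label whenever the connection transitions to Closed.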
            connection.StateChange += (_, args) =>
            {
                if (args.CurrentState == ConnectionState.Closed)
                {
                    OrchestratorMetrics.ConnectionClosed(role);
                }
            };
        }
        catch
        {
            await connection.DisposeAsync().ConfigureAwait(false);
            throw;
        }

        return connection;
    }

    private async Task ConfigureSessionAsync(NpgsqlConnection connection, string tenantId, CancellationToken cancellationToken)
    {
        try
        {
            // Set UTC timezone for deterministic timestamps
            await using (var command = new NpgsqlCommand("SET TIME ZONE 'UTC';", connection))
            {
                command.CommandTimeout = _options.CommandTimeoutSeconds;
                await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            }

            // Set tenant context for row-level security
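            // (set_config's third argument false means the setting is
            // session-scoped rather than transaction-scoped; RLS policies are
            // assumed to read it via current_setting('app.current_tenant').)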
            if (!string.IsNullOrWhiteSpace(tenantId))
            {
                await using var tenantCommand = new NpgsqlCommand("SELECT set_config('app.current_tenant', @tenant, false);", connection);
                tenantCommand.CommandTimeout = _options.CommandTimeoutSeconds;
                tenantCommand.Parameters.AddWithValue("tenant", tenantId);
                await tenantCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            }
        }
        catch (Exception ex)
        {
            if (_logger.IsEnabled(LogLevel.Error))
            {
                _logger.LogError(ex, "Failed to configure PostgreSQL session for tenant {TenantId}.", tenantId);
            }

            throw;
        }
    }
}
@@ -0,0 +1,362 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of artifact repository.
/// </summary>
public sealed class PostgresArtifactRepository : IArtifactRepository
{
    private const string SelectArtifactColumns = """
        artifact_id, tenant_id, job_id, run_id, artifact_type, uri, digest,
        mime_type, size_bytes, created_at, metadata
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectArtifactColumns}
        FROM artifacts
        WHERE tenant_id = @tenant_id AND artifact_id = @artifact_id
        """;

    private const string SelectByJobIdSql = $"""
        SELECT {SelectArtifactColumns}
        FROM artifacts
        WHERE tenant_id = @tenant_id AND job_id = @job_id
        ORDER BY created_at
        """;

    private const string SelectByRunIdSql = $"""
        SELECT {SelectArtifactColumns}
        FROM artifacts
        WHERE tenant_id = @tenant_id AND run_id = @run_id
        ORDER BY created_at
        """;

    private const string SelectByDigestSql = $"""
        SELECT {SelectArtifactColumns}
        FROM artifacts
        WHERE tenant_id = @tenant_id AND digest = @digest
        """;

    private const string InsertArtifactSql = """
        INSERT INTO artifacts (
            artifact_id, tenant_id, job_id, run_id, artifact_type, uri, digest,
            mime_type, size_bytes, created_at, metadata)
        VALUES (
            @artifact_id, @tenant_id, @job_id, @run_id, @artifact_type, @uri, @digest,
            @mime_type, @size_bytes, @created_at, @metadata)
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresArtifactRepository> _logger;

    public PostgresArtifactRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresArtifactRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<Artifact?> GetByIdAsync(string tenantId, Guid artifactId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("artifact_id", artifactId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapArtifact(reader);
    }

    public async Task<IReadOnlyList<Artifact>> GetByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByJobIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("job_id", jobId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var artifacts = new List<Artifact>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            artifacts.Add(MapArtifact(reader));
        }
        return artifacts;
    }

    public async Task<IReadOnlyList<Artifact>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByRunIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("run_id", runId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var artifacts = new List<Artifact>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            artifacts.Add(MapArtifact(reader));
        }
        return artifacts;
    }

    public async Task<Artifact?> GetByDigestAsync(string tenantId, string digest, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByDigestSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("digest", digest);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapArtifact(reader);
    }

    public async Task CreateAsync(Artifact artifact, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(artifact.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertArtifactSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddArtifactParameters(command, artifact);

        try
        {
            await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            OrchestratorMetrics.ArtifactCreated(artifact.TenantId, artifact.ArtifactType);
        }
        catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
        {
            _logger.LogWarning("Duplicate artifact ID or digest: {ArtifactId}, {Digest}", artifact.ArtifactId, artifact.Digest);
            throw new DuplicateArtifactException(artifact.ArtifactId, artifact.Digest, ex);
        }
    }

    public async Task CreateBatchAsync(IEnumerable<Artifact> artifacts, CancellationToken cancellationToken)
    {
        var artifactList = artifacts.ToList();
        if (artifactList.Count == 0)
        {
            return;
        }

        var tenantId = artifactList[0].TenantId;
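        // Assumption: a batch is single-tenant; every row below is inserted on
        // the connection (and RLS tenant context) of the first artifact's tenant.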
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);

        try
        {
            foreach (var artifact in artifactList)
            {
                await using var command = new NpgsqlCommand(InsertArtifactSql, connection, transaction);
                command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                AddArtifactParameters(command, artifact);
                await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
                OrchestratorMetrics.ArtifactCreated(artifact.TenantId, artifact.ArtifactType);
            }

            await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);
        }
        catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
        {
            await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
            _logger.LogWarning(ex, "Duplicate artifact in batch insert");
            throw;
        }
        catch
        {
            await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
            throw;
        }
    }

    public async Task<IReadOnlyList<Artifact>> ListAsync(
        string tenantId,
        string? artifactType,
        string? jobType,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildListQuery(tenantId, artifactType, jobType, createdAfter, createdBefore, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var artifacts = new List<Artifact>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            artifacts.Add(MapArtifact(reader));
        }
        return artifacts;
    }

    public async Task<int> CountAsync(
        string tenantId,
        string? artifactType,
        string? jobType,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildCountQuery(tenantId, artifactType, jobType);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
        return Convert.ToInt32(result);
    }

    private static void AddArtifactParameters(NpgsqlCommand command, Artifact artifact)
    {
        command.Parameters.AddWithValue("artifact_id", artifact.ArtifactId);
        command.Parameters.AddWithValue("tenant_id", artifact.TenantId);
        command.Parameters.AddWithValue("job_id", artifact.JobId);
        command.Parameters.AddWithValue("run_id", (object?)artifact.RunId ?? DBNull.Value);
        command.Parameters.AddWithValue("artifact_type", artifact.ArtifactType);
        command.Parameters.AddWithValue("uri", artifact.Uri);
        command.Parameters.AddWithValue("digest", artifact.Digest);
        command.Parameters.AddWithValue("mime_type", (object?)artifact.MimeType ?? DBNull.Value);
        command.Parameters.AddWithValue("size_bytes", (object?)artifact.SizeBytes ?? DBNull.Value);
        command.Parameters.AddWithValue("created_at", artifact.CreatedAt);
        command.Parameters.Add(new NpgsqlParameter("metadata", NpgsqlDbType.Jsonb)
        {
            Value = (object?)artifact.Metadata ?? DBNull.Value
        });
    }

    private static Artifact MapArtifact(NpgsqlDataReader reader)
    {
        return new Artifact(
            ArtifactId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            JobId: reader.GetGuid(2),
            RunId: reader.IsDBNull(3) ? null : reader.GetGuid(3),
            ArtifactType: reader.GetString(4),
            Uri: reader.GetString(5),
            Digest: reader.GetString(6),
            MimeType: reader.IsDBNull(7) ? null : reader.GetString(7),
            SizeBytes: reader.IsDBNull(8) ? null : reader.GetInt64(8),
            CreatedAt: reader.GetFieldValue<DateTimeOffset>(9),
            Metadata: reader.IsDBNull(10) ? null : reader.GetString(10));
    }

    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        string? artifactType,
        string? jobType,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectArtifactColumns} FROM artifacts a WHERE a.tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (!string.IsNullOrEmpty(artifactType))
        {
            sb.Append(" AND a.artifact_type = @artifact_type");
            parameters.Add(("artifact_type", artifactType));
        }

        if (!string.IsNullOrEmpty(jobType))
        {
            sb.Append(" AND EXISTS (SELECT 1 FROM jobs j WHERE j.job_id = a.job_id AND j.tenant_id = a.tenant_id AND j.job_type = @job_type)");
            parameters.Add(("job_type", jobType));
        }

        if (createdAfter.HasValue)
        {
            sb.Append(" AND a.created_at >= @created_after");
            parameters.Add(("created_after", createdAfter.Value));
        }

        if (createdBefore.HasValue)
        {
            sb.Append(" AND a.created_at < @created_before");
            parameters.Add(("created_before", createdBefore.Value));
        }

        sb.Append(" ORDER BY a.created_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }

    private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
        string tenantId,
        string? artifactType,
        string? jobType)
    {
        var sb = new StringBuilder();
        sb.Append("SELECT COUNT(*) FROM artifacts a WHERE a.tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (!string.IsNullOrEmpty(artifactType))
        {
            sb.Append(" AND a.artifact_type = @artifact_type");
            parameters.Add(("artifact_type", artifactType));
        }

        if (!string.IsNullOrEmpty(jobType))
        {
            sb.Append(" AND EXISTS (SELECT 1 FROM jobs j WHERE j.job_id = a.job_id AND j.tenant_id = a.tenant_id AND j.job_type = @job_type)");
            parameters.Add(("job_type", jobType));
        }

        return (sb.ToString(), parameters);
    }
}

/// <summary>
/// Exception thrown when attempting to create a duplicate artifact.
/// </summary>
public sealed class DuplicateArtifactException : Exception
{
    public Guid ArtifactId { get; }
    public string Digest { get; }

    public DuplicateArtifactException(Guid artifactId, string digest, Exception innerException)
        : base($"Artifact with ID '{artifactId}' or digest '{digest}' already exists.", innerException)
    {
        ArtifactId = artifactId;
        Digest = digest;
    }
}
@@ -0,0 +1,504 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of the audit repository.
/// </summary>
public sealed class PostgresAuditRepository : IAuditRepository
{
    private const string SelectAuditColumns = """
        entry_id, tenant_id, event_type, resource_type, resource_id, actor_id, actor_type,
        actor_ip, user_agent, http_method, request_path, old_state, new_state, description,
        correlation_id, previous_entry_hash, content_hash, sequence_number, occurred_at, metadata
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectAuditColumns}
        FROM audit_entries
        WHERE tenant_id = @tenant_id AND entry_id = @entry_id
        """;

    private const string InsertEntrySql = """
        INSERT INTO audit_entries (
            entry_id, tenant_id, event_type, resource_type, resource_id, actor_id, actor_type,
            actor_ip, user_agent, http_method, request_path, old_state, new_state, description,
            correlation_id, previous_entry_hash, content_hash, sequence_number, occurred_at, metadata)
        VALUES (
            @entry_id, @tenant_id, @event_type, @resource_type, @resource_id, @actor_id, @actor_type,
            @actor_ip, @user_agent, @http_method, @request_path, @old_state::jsonb, @new_state::jsonb, @description,
            @correlation_id, @previous_entry_hash, @content_hash, @sequence_number, @occurred_at, @metadata::jsonb)
        """;

    private const string SelectLatestSql = $"""
        SELECT {SelectAuditColumns}
        FROM audit_entries
        WHERE tenant_id = @tenant_id
        ORDER BY sequence_number DESC
        LIMIT 1
        """;

    private const string GetSequenceSql = """
        SELECT next_seq, prev_hash FROM next_audit_sequence(@tenant_id)
        """;

    private const string UpdateSequenceHashSql = """
        SELECT update_audit_sequence_hash(@tenant_id, @content_hash)
        """;

    private const string VerifyChainSql = """
        SELECT is_valid, invalid_entry_id, invalid_sequence, error_message
        FROM verify_audit_chain(@tenant_id, @start_seq, @end_seq)
        """;

    private const string GetSummarySql = """
        SELECT total_entries, entries_since, event_types, unique_actors, unique_resources, earliest_entry, latest_entry
        FROM get_audit_summary(@tenant_id, @since)
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresAuditRepository> _logger;

    public PostgresAuditRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresAuditRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<AuditEntry> AppendAsync(
        string tenantId,
        AuditEventType eventType,
        string resourceType,
        Guid resourceId,
        string actorId,
        ActorType actorType,
        string description,
        string? oldState = null,
        string? newState = null,
        string? actorIp = null,
        string? userAgent = null,
        string? httpMethod = null,
        string? requestPath = null,
        string? correlationId = null,
        string? metadata = null,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);

        try
        {
            // Get next sequence number and previous hash
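            // (Chain sketch, inferred from the SQL helpers: next_audit_sequence
            // yields the tenant's next sequence number and the previous entry's
            // hash; AuditEntry.Create is assumed to fold both into this entry's
            // content_hash, and update_audit_sequence_hash then records that
            // hash as the new chain head, giving a tamper-evident hash chain.)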
            long sequenceNumber;
            string? previousEntryHash;

            await using (var seqCommand = new NpgsqlCommand(GetSequenceSql, connection, transaction))
            {
                seqCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                seqCommand.Parameters.AddWithValue("tenant_id", tenantId);

                await using var reader = await seqCommand.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
                if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
                {
                    throw new InvalidOperationException("Failed to get next audit sequence.");
                }

                sequenceNumber = reader.GetInt64(0);
                previousEntryHash = reader.IsDBNull(1) ? null : reader.GetString(1);
            }

            // Create the entry
            var entry = AuditEntry.Create(
                tenantId: tenantId,
                eventType: eventType,
                resourceType: resourceType,
                resourceId: resourceId,
                actorId: actorId,
                actorType: actorType,
                description: description,
                oldState: oldState,
                newState: newState,
                actorIp: actorIp,
                userAgent: userAgent,
                httpMethod: httpMethod,
                requestPath: requestPath,
                correlationId: correlationId,
                previousEntryHash: previousEntryHash,
                sequenceNumber: sequenceNumber,
                metadata: metadata);

            // Insert the entry
            await using (var insertCommand = new NpgsqlCommand(InsertEntrySql, connection, transaction))
            {
                insertCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                AddEntryParameters(insertCommand, entry);
                await insertCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            }

            // Update sequence hash
            await using (var updateCommand = new NpgsqlCommand(UpdateSequenceHashSql, connection, transaction))
            {
                updateCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                updateCommand.Parameters.AddWithValue("tenant_id", tenantId);
                updateCommand.Parameters.AddWithValue("content_hash", entry.ContentHash);
                await updateCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            }

            await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);

            OrchestratorMetrics.AuditEntryCreated(tenantId, eventType.ToString(), resourceType);
            _logger.LogDebug("Audit entry {EntryId} appended for tenant {TenantId}, sequence {Sequence}",
                entry.EntryId, tenantId, sequenceNumber);

            return entry;
        }
        catch
        {
            await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
            throw;
        }
    }

    public async Task<AuditEntry?> GetByIdAsync(
        string tenantId,
        Guid entryId,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("entry_id", entryId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapEntry(reader);
    }

    public async Task<IReadOnlyList<AuditEntry>> ListAsync(
        string tenantId,
        AuditEventType? eventType = null,
        string? resourceType = null,
        Guid? resourceId = null,
        string? actorId = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default)
    {
        var (sql, parameters) = BuildListQuery(tenantId, eventType, resourceType, resourceId, actorId, startTime, endTime, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var entries = new List<AuditEntry>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            entries.Add(MapEntry(reader));
        }
        return entries;
    }

    public async Task<IReadOnlyList<AuditEntry>> GetBySequenceRangeAsync(
        string tenantId,
        long startSequence,
        long endSequence,
        CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectAuditColumns}
            FROM audit_entries
            WHERE tenant_id = @tenant_id
              AND sequence_number >= @start_seq
              AND sequence_number <= @end_seq
            ORDER BY sequence_number ASC
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("start_seq", startSequence);
        command.Parameters.AddWithValue("end_seq", endSequence);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var entries = new List<AuditEntry>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            entries.Add(MapEntry(reader));
        }
        return entries;
    }

    public async Task<AuditEntry?> GetLatestAsync(
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectLatestSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapEntry(reader);
    }

    public async Task<IReadOnlyList<AuditEntry>> GetByResourceAsync(
        string tenantId,
        string resourceType,
        Guid resourceId,
        int limit = 100,
        CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectAuditColumns}
            FROM audit_entries
            WHERE tenant_id = @tenant_id
              AND resource_type = @resource_type
              AND resource_id = @resource_id
            ORDER BY occurred_at DESC
            LIMIT @limit
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("resource_type", resourceType);
        command.Parameters.AddWithValue("resource_id", resourceId);
        command.Parameters.AddWithValue("limit", limit);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var entries = new List<AuditEntry>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            entries.Add(MapEntry(reader));
        }
        return entries;
    }

    public async Task<long> GetCountAsync(
        string tenantId,
        AuditEventType? eventType = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        CancellationToken cancellationToken = default)
    {
        var sb = new StringBuilder("SELECT COUNT(*) FROM audit_entries WHERE tenant_id = @tenant_id");
        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (eventType.HasValue)
        {
            sb.Append(" AND event_type = @event_type");
            parameters.Add(("event_type", (int)eventType.Value));
        }

        if (startTime.HasValue)
        {
            sb.Append(" AND occurred_at >= @start_time");
            parameters.Add(("start_time", startTime.Value));
        }

        if (endTime.HasValue)
        {
            sb.Append(" AND occurred_at <= @end_time");
            parameters.Add(("end_time", endTime.Value));
        }

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sb.ToString(), connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
        return Convert.ToInt64(result);
    }

    public async Task<ChainVerificationResult> VerifyChainAsync(
        string tenantId,
        long? startSequence = null,
        long? endSequence = null,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(VerifyChainSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("start_seq", (object?)startSequence ?? 1L);
        command.Parameters.AddWithValue("end_seq", (object?)endSequence ?? DBNull.Value);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return new ChainVerificationResult(true, null, null, null);
        }

        return new ChainVerificationResult(
            IsValid: reader.GetBoolean(0),
            InvalidEntryId: reader.IsDBNull(1) ? null : reader.GetGuid(1),
            InvalidSequence: reader.IsDBNull(2) ? null : reader.GetInt64(2),
            ErrorMessage: reader.IsDBNull(3) ? null : reader.GetString(3));
    }

    public async Task<AuditSummary> GetSummaryAsync(
        string tenantId,
        DateTimeOffset? since = null,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(GetSummarySql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("since", (object?)since ?? DBNull.Value);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return new AuditSummary(0, 0, 0, 0, 0, null, null);
        }

        return new AuditSummary(
            TotalEntries: reader.GetInt64(0),
            EntriesSince: reader.GetInt64(1),
            EventTypes: reader.GetInt64(2),
            UniqueActors: reader.GetInt64(3),
            UniqueResources: reader.GetInt64(4),
            EarliestEntry: reader.IsDBNull(5) ? null : reader.GetFieldValue<DateTimeOffset>(5),
            LatestEntry: reader.IsDBNull(6) ? null : reader.GetFieldValue<DateTimeOffset>(6));
    }

    private static void AddEntryParameters(NpgsqlCommand command, AuditEntry entry)
    {
        command.Parameters.AddWithValue("entry_id", entry.EntryId);
        command.Parameters.AddWithValue("tenant_id", entry.TenantId);
        command.Parameters.AddWithValue("event_type", (int)entry.EventType);
        command.Parameters.AddWithValue("resource_type", entry.ResourceType);
        command.Parameters.AddWithValue("resource_id", entry.ResourceId);
        command.Parameters.AddWithValue("actor_id", entry.ActorId);
        command.Parameters.AddWithValue("actor_type", (int)entry.ActorType);
        command.Parameters.AddWithValue("actor_ip", (object?)entry.ActorIp ?? DBNull.Value);
        command.Parameters.AddWithValue("user_agent", (object?)entry.UserAgent ?? DBNull.Value);
        command.Parameters.AddWithValue("http_method", (object?)entry.HttpMethod ?? DBNull.Value);
        command.Parameters.AddWithValue("request_path", (object?)entry.RequestPath ?? DBNull.Value);
        command.Parameters.AddWithValue("old_state", (object?)entry.OldState ?? DBNull.Value);
        command.Parameters.AddWithValue("new_state", (object?)entry.NewState ?? DBNull.Value);
        command.Parameters.AddWithValue("description", entry.Description);
        command.Parameters.AddWithValue("correlation_id", (object?)entry.CorrelationId ?? DBNull.Value);
        command.Parameters.AddWithValue("previous_entry_hash", (object?)entry.PreviousEntryHash ?? DBNull.Value);
        command.Parameters.AddWithValue("content_hash", entry.ContentHash);
        command.Parameters.AddWithValue("sequence_number", entry.SequenceNumber);
        command.Parameters.AddWithValue("occurred_at", entry.OccurredAt);
        command.Parameters.AddWithValue("metadata", (object?)entry.Metadata ?? DBNull.Value);
    }

    private static AuditEntry MapEntry(NpgsqlDataReader reader)
    {
        return new AuditEntry(
            EntryId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            EventType: (AuditEventType)reader.GetInt32(2),
            ResourceType: reader.GetString(3),
            ResourceId: reader.GetGuid(4),
            ActorId: reader.GetString(5),
            ActorType: (ActorType)reader.GetInt32(6),
            ActorIp: reader.IsDBNull(7) ? null : reader.GetString(7),
            UserAgent: reader.IsDBNull(8) ? null : reader.GetString(8),
            HttpMethod: reader.IsDBNull(9) ? null : reader.GetString(9),
            RequestPath: reader.IsDBNull(10) ? null : reader.GetString(10),
            OldState: reader.IsDBNull(11) ? null : reader.GetString(11),
            NewState: reader.IsDBNull(12) ? null : reader.GetString(12),
            Description: reader.GetString(13),
            CorrelationId: reader.IsDBNull(14) ? null : reader.GetString(14),
            PreviousEntryHash: reader.IsDBNull(15) ? null : reader.GetString(15),
            ContentHash: reader.GetString(16),
            SequenceNumber: reader.GetInt64(17),
            OccurredAt: reader.GetFieldValue<DateTimeOffset>(18),
            Metadata: reader.IsDBNull(19) ? null : reader.GetString(19));
    }

    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        AuditEventType? eventType,
        string? resourceType,
        Guid? resourceId,
        string? actorId,
        DateTimeOffset? startTime,
        DateTimeOffset? endTime,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectAuditColumns} FROM audit_entries WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (eventType.HasValue)
        {
            sb.Append(" AND event_type = @event_type");
            parameters.Add(("event_type", (int)eventType.Value));
        }

        if (resourceType is not null)
        {
            sb.Append(" AND resource_type = @resource_type");
            parameters.Add(("resource_type", resourceType));
        }

        if (resourceId.HasValue)
        {
            sb.Append(" AND resource_id = @resource_id");
            parameters.Add(("resource_id", resourceId.Value));
        }

        if (actorId is not null)
        {
            sb.Append(" AND actor_id = @actor_id");
            parameters.Add(("actor_id", actorId));
        }

        if (startTime.HasValue)
        {
            sb.Append(" AND occurred_at >= @start_time");
            parameters.Add(("start_time", startTime.Value));
        }

        if (endTime.HasValue)
        {
            sb.Append(" AND occurred_at <= @end_time");
            parameters.Add(("end_time", endTime.Value));
        }

        sb.Append(" ORDER BY occurred_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }
}
@@ -0,0 +1,395 @@
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of backfill request repository.
/// </summary>
public sealed class PostgresBackfillRepository : IBackfillRepository
{
    private const string SelectBackfillColumns = """
        backfill_id, tenant_id, source_id, job_type, scope_key, status,
        window_start, window_end, current_position, total_events,
        processed_events, skipped_events, failed_events, batch_size,
        dry_run, force_reprocess, estimated_duration, max_duration,
        safety_checks, reason, ticket, created_at, started_at, completed_at,
        created_by, updated_by, error_message
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectBackfillColumns}
        FROM backfill_requests
        WHERE tenant_id = @tenant_id AND backfill_id = @backfill_id
        """;

    private const string InsertBackfillSql = """
        INSERT INTO backfill_requests (
            backfill_id, tenant_id, source_id, job_type, scope_key, status,
            window_start, window_end, current_position, total_events,
            processed_events, skipped_events, failed_events, batch_size,
            dry_run, force_reprocess, estimated_duration, max_duration,
            safety_checks, reason, ticket, created_at, started_at, completed_at,
            created_by, updated_by, error_message)
        VALUES (
            @backfill_id, @tenant_id, @source_id, @job_type, @scope_key, @status,
            @window_start, @window_end, @current_position, @total_events,
            @processed_events, @skipped_events, @failed_events, @batch_size,
            @dry_run, @force_reprocess, @estimated_duration, @max_duration,
            @safety_checks, @reason, @ticket, @created_at, @started_at, @completed_at,
            @created_by, @updated_by, @error_message)
        """;

    private const string UpdateBackfillSql = """
        UPDATE backfill_requests
        SET status = @status,
            current_position = @current_position,
            total_events = @total_events,
            processed_events = @processed_events,
            skipped_events = @skipped_events,
            failed_events = @failed_events,
            estimated_duration = @estimated_duration,
            safety_checks = @safety_checks,
            started_at = @started_at,
            completed_at = @completed_at,
            updated_by = @updated_by,
            error_message = @error_message
        WHERE tenant_id = @tenant_id AND backfill_id = @backfill_id
        """;

    private const string SelectOverlappingSql = """
        SELECT COUNT(*) FROM backfill_requests
        WHERE tenant_id = @tenant_id
          AND scope_key = @scope_key
          AND status IN ('pending', 'validating', 'running', 'paused')
          AND window_start < @window_end
          AND window_end > @window_start
          AND (@exclude_backfill_id IS NULL OR backfill_id != @exclude_backfill_id)
        """;

    private const string SelectActiveByScopeSql = $"""
        SELECT {SelectBackfillColumns}
        FROM backfill_requests
        WHERE tenant_id = @tenant_id
          AND scope_key = @scope_key
          AND status IN ('pending', 'validating', 'running', 'paused')
        ORDER BY created_at DESC
        """;

    private const string CountByStatusSql = """
        SELECT status, COUNT(*) as count
        FROM backfill_requests
        WHERE tenant_id = @tenant_id
        GROUP BY status
        """;

    private const string SelectNextPendingSql = $"""
        SELECT {SelectBackfillColumns}
        FROM backfill_requests
        WHERE tenant_id = @tenant_id
          AND status = 'pending'
        ORDER BY created_at ASC
        LIMIT 1
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresBackfillRepository> _logger;
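
    // Shared JSON policy for the safety_checks jsonb column: serialization in
    // AddBackfillParameters/UpdateAsync and deserialization in MapBackfillRequest
    // must agree, so both use this camelCase options instance.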
    private static readonly JsonSerializerOptions JsonOptions = new() { PropertyNamingPolicy = JsonNamingPolicy.CamelCase };

    public PostgresBackfillRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresBackfillRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<BackfillRequest?> GetByIdAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("backfill_id", backfillId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapBackfillRequest(reader);
    }

    public async Task CreateAsync(BackfillRequest request, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(request.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertBackfillSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddBackfillParameters(command, request);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        OrchestratorMetrics.BackfillCreated(request.TenantId, request.ScopeKey);
    }

    public async Task UpdateAsync(BackfillRequest request, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(request.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateBackfillSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", request.TenantId);
        command.Parameters.AddWithValue("backfill_id", request.BackfillId);
        command.Parameters.AddWithValue("status", request.Status.ToString().ToLowerInvariant());
        command.Parameters.AddWithValue("current_position", (object?)request.CurrentPosition ?? DBNull.Value);
        command.Parameters.AddWithValue("total_events", (object?)request.TotalEvents ?? DBNull.Value);
        command.Parameters.AddWithValue("processed_events", request.ProcessedEvents);
        command.Parameters.AddWithValue("skipped_events", request.SkippedEvents);
        command.Parameters.AddWithValue("failed_events", request.FailedEvents);
        command.Parameters.AddWithValue("estimated_duration", (object?)request.EstimatedDuration ?? DBNull.Value);
        command.Parameters.AddWithValue("safety_checks", request.SafetyChecks is not null
            ? JsonSerializer.Serialize(request.SafetyChecks, JsonOptions)
            : DBNull.Value);
        command.Parameters.AddWithValue("started_at", (object?)request.StartedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)request.CompletedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("updated_by", request.UpdatedBy);
        command.Parameters.AddWithValue("error_message", (object?)request.ErrorMessage ?? DBNull.Value);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        if (rows == 0)
        {
            _logger.LogWarning("Backfill request not found for update: {BackfillId}", request.BackfillId);
        }
        else
        {
            OrchestratorMetrics.BackfillStatusChanged(request.TenantId, request.ScopeKey, request.Status.ToString());
        }
    }

    public async Task<IReadOnlyList<BackfillRequest>> ListAsync(
        string tenantId,
        BackfillStatus? status,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildListQuery(tenantId, status, sourceId, jobType, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var requests = new List<BackfillRequest>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            requests.Add(MapBackfillRequest(reader));
        }
        return requests;
    }

    public async Task<bool> HasOverlappingActiveAsync(
        string tenantId,
        string scopeKey,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        Guid? excludeBackfillId,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectOverlappingSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("scope_key", scopeKey);
        command.Parameters.AddWithValue("window_start", windowStart);
        command.Parameters.AddWithValue("window_end", windowEnd);
        command.Parameters.AddWithValue("exclude_backfill_id", (object?)excludeBackfillId ?? DBNull.Value);

        var count = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
        return Convert.ToInt64(count) > 0;
    }

    public async Task<IReadOnlyList<BackfillRequest>> GetActiveByScope(
        string tenantId,
        string scopeKey,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectActiveByScopeSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("scope_key", scopeKey);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var requests = new List<BackfillRequest>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            requests.Add(MapBackfillRequest(reader));
        }
        return requests;
    }

    public async Task<IDictionary<BackfillStatus, int>> CountByStatusAsync(
        string tenantId,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(CountByStatusSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var counts = new Dictionary<BackfillStatus, int>();

        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            var statusStr = reader.GetString(0);
            var count = reader.GetInt32(1);
            if (Enum.TryParse<BackfillStatus>(statusStr, true, out var status))
            {
                counts[status] = count;
            }
        }

        return counts;
    }
|
||||
|
||||
public async Task<BackfillRequest?> GetNextPendingAsync(string tenantId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectNextPendingSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}

return MapBackfillRequest(reader);
}

private static void AddBackfillParameters(NpgsqlCommand command, BackfillRequest request)
{
command.Parameters.AddWithValue("backfill_id", request.BackfillId);
command.Parameters.AddWithValue("tenant_id", request.TenantId);
command.Parameters.AddWithValue("source_id", (object?)request.SourceId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", (object?)request.JobType ?? DBNull.Value);
command.Parameters.AddWithValue("scope_key", request.ScopeKey);
command.Parameters.AddWithValue("status", request.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("window_start", request.WindowStart);
command.Parameters.AddWithValue("window_end", request.WindowEnd);
command.Parameters.AddWithValue("current_position", (object?)request.CurrentPosition ?? DBNull.Value);
command.Parameters.AddWithValue("total_events", (object?)request.TotalEvents ?? DBNull.Value);
command.Parameters.AddWithValue("processed_events", request.ProcessedEvents);
command.Parameters.AddWithValue("skipped_events", request.SkippedEvents);
command.Parameters.AddWithValue("failed_events", request.FailedEvents);
command.Parameters.AddWithValue("batch_size", request.BatchSize);
command.Parameters.AddWithValue("dry_run", request.DryRun);
command.Parameters.AddWithValue("force_reprocess", request.ForceReprocess);
command.Parameters.AddWithValue("estimated_duration", (object?)request.EstimatedDuration ?? DBNull.Value);
command.Parameters.AddWithValue("max_duration", (object?)request.MaxDuration ?? DBNull.Value);
command.Parameters.AddWithValue("safety_checks", request.SafetyChecks is not null
? JsonSerializer.Serialize(request.SafetyChecks, JsonOptions)
: DBNull.Value);
command.Parameters.AddWithValue("reason", request.Reason);
command.Parameters.AddWithValue("ticket", (object?)request.Ticket ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", request.CreatedAt);
command.Parameters.AddWithValue("started_at", (object?)request.StartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)request.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", request.CreatedBy);
command.Parameters.AddWithValue("updated_by", request.UpdatedBy);
command.Parameters.AddWithValue("error_message", (object?)request.ErrorMessage ?? DBNull.Value);
}

private static BackfillRequest MapBackfillRequest(NpgsqlDataReader reader)
{
var safetyChecksJson = reader.IsDBNull(18) ? null : reader.GetString(18);
var safetyChecks = safetyChecksJson is not null
? JsonSerializer.Deserialize<BackfillSafetyChecks>(safetyChecksJson, JsonOptions)
: null;

return new BackfillRequest(
BackfillId: reader.GetGuid(0),
TenantId: reader.GetString(1),
SourceId: reader.IsDBNull(2) ? null : reader.GetGuid(2),
JobType: reader.IsDBNull(3) ? null : reader.GetString(3),
ScopeKey: reader.GetString(4),
Status: Enum.Parse<BackfillStatus>(reader.GetString(5), ignoreCase: true),
WindowStart: reader.GetFieldValue<DateTimeOffset>(6),
WindowEnd: reader.GetFieldValue<DateTimeOffset>(7),
CurrentPosition: reader.IsDBNull(8) ? null : reader.GetFieldValue<DateTimeOffset>(8),
TotalEvents: reader.IsDBNull(9) ? null : reader.GetInt64(9),
ProcessedEvents: reader.GetInt64(10),
SkippedEvents: reader.GetInt64(11),
FailedEvents: reader.GetInt64(12),
BatchSize: reader.GetInt32(13),
DryRun: reader.GetBoolean(14),
ForceReprocess: reader.GetBoolean(15),
EstimatedDuration: reader.IsDBNull(16) ? null : reader.GetFieldValue<TimeSpan>(16),
MaxDuration: reader.IsDBNull(17) ? null : reader.GetFieldValue<TimeSpan>(17),
SafetyChecks: safetyChecks,
Reason: reader.GetString(19),
Ticket: reader.IsDBNull(20) ? null : reader.GetString(20),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(21),
StartedAt: reader.IsDBNull(22) ? null : reader.GetFieldValue<DateTimeOffset>(22),
CompletedAt: reader.IsDBNull(23) ? null : reader.GetFieldValue<DateTimeOffset>(23),
CreatedBy: reader.GetString(24),
UpdatedBy: reader.GetString(25),
ErrorMessage: reader.IsDBNull(26) ? null : reader.GetString(26));
}

private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
BackfillStatus? status,
Guid? sourceId,
string? jobType,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectBackfillColumns} FROM backfill_requests WHERE tenant_id = @tenant_id");

var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

if (status.HasValue)
{
sb.Append(" AND status = @status");
parameters.Add(("status", status.Value.ToString().ToLowerInvariant()));
}

if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}

if (jobType is not null)
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}

sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));

return (sb.ToString(), parameters);
}
}
@@ -0,0 +1,678 @@
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.DeadLetter;
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of dead-letter entry repository.
/// </summary>
public sealed class PostgresDeadLetterRepository : IDeadLetterRepository
{
private const string SelectEntryColumns = """
entry_id, tenant_id, original_job_id, run_id, source_id, job_type,
payload, payload_digest, idempotency_key, correlation_id,
status, error_code, failure_reason, remediation_hint, category, is_retryable,
original_attempts, replay_attempts, max_replay_attempts,
failed_at, created_at, updated_at, expires_at, resolved_at,
resolution_notes, created_by, updated_by
""";

private const string SelectByIdSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND entry_id = @entry_id
""";

private const string SelectByJobIdSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND original_job_id = @original_job_id
ORDER BY created_at DESC
LIMIT 1
""";

private const string InsertEntrySql = """
INSERT INTO dead_letter_entries (
entry_id, tenant_id, original_job_id, run_id, source_id, job_type,
payload, payload_digest, idempotency_key, correlation_id,
status, error_code, failure_reason, remediation_hint, category, is_retryable,
original_attempts, replay_attempts, max_replay_attempts,
failed_at, created_at, updated_at, expires_at, resolved_at,
resolution_notes, created_by, updated_by)
VALUES (
@entry_id, @tenant_id, @original_job_id, @run_id, @source_id, @job_type,
@payload::jsonb, @payload_digest, @idempotency_key, @correlation_id,
@status, @error_code, @failure_reason, @remediation_hint, @category, @is_retryable,
@original_attempts, @replay_attempts, @max_replay_attempts,
@failed_at, @created_at, @updated_at, @expires_at, @resolved_at,
@resolution_notes, @created_by, @updated_by)
""";

private const string UpdateEntrySql = """
UPDATE dead_letter_entries
SET status = @status,
replay_attempts = @replay_attempts,
failure_reason = @failure_reason,
updated_at = @updated_at,
resolved_at = @resolved_at,
resolution_notes = @resolution_notes,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND entry_id = @entry_id
""";

private const string SelectPendingRetryableSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
AND status = 'pending'
AND is_retryable = TRUE
AND replay_attempts < max_replay_attempts
ORDER BY created_at ASC
LIMIT @limit
""";

private const string SelectByErrorCodeSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
AND error_code = @error_code
AND (@status IS NULL OR status = @status)
ORDER BY created_at DESC
LIMIT @limit
""";

private const string SelectByCategorySql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
AND category = @category
AND (@status IS NULL OR status = @status)
ORDER BY created_at DESC
LIMIT @limit
""";

private const string MarkExpiredSql = """
SELECT mark_expired_dead_letter_entries(@batch_limit)
""";

private const string PurgeSql = """
SELECT purge_dead_letter_entries(@retention_days, @batch_limit)
""";

private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresDeadLetterRepository> _logger;
private static readonly JsonSerializerOptions JsonOptions = new() { PropertyNamingPolicy = JsonNamingPolicy.CamelCase };

public PostgresDeadLetterRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresDeadLetterRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}

public async Task<DeadLetterEntry?> GetByIdAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("entry_id", entryId);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}

return MapEntry(reader);
}

public async Task<DeadLetterEntry?> GetByOriginalJobIdAsync(
string tenantId,
Guid originalJobId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByJobIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("original_job_id", originalJobId);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}

return MapEntry(reader);
}

public async Task<IReadOnlyList<DeadLetterEntry>> ListAsync(
string tenantId,
DeadLetterListOptions options,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, options);

await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}

public async Task<long> CountAsync(
string tenantId,
DeadLetterListOptions options,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildCountQuery(tenantId, options);

await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}

var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(result);
}

public async Task CreateAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(entry.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertEntrySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

AddEntryParameters(command, entry);

await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.DeadLetterCreated(entry.TenantId, entry.JobType, entry.ErrorCode, entry.Category.ToString());
}

public async Task<bool> UpdateAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(entry.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateEntrySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("tenant_id", entry.TenantId);
command.Parameters.AddWithValue("entry_id", entry.EntryId);
command.Parameters.AddWithValue("status", entry.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("replay_attempts", entry.ReplayAttempts);
command.Parameters.AddWithValue("failure_reason", entry.FailureReason);
command.Parameters.AddWithValue("updated_at", entry.UpdatedAt);
command.Parameters.AddWithValue("resolved_at", (object?)entry.ResolvedAt ?? DBNull.Value);
command.Parameters.AddWithValue("resolution_notes", (object?)entry.ResolutionNotes ?? DBNull.Value);
command.Parameters.AddWithValue("updated_by", entry.UpdatedBy);

var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.DeadLetterStatusChanged(entry.TenantId, entry.JobType, entry.Status.ToString());
}
return rows > 0;
}

public async Task<IReadOnlyList<DeadLetterEntry>> GetPendingRetryableAsync(
string tenantId,
int limit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectPendingRetryableSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("limit", limit);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}

public async Task<IReadOnlyList<DeadLetterEntry>> GetByErrorCodeAsync(
string tenantId,
string errorCode,
DeadLetterStatus? status,
int limit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByErrorCodeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("error_code", errorCode);
command.Parameters.AddWithValue("status", status.HasValue ? status.Value.ToString().ToLowerInvariant() : DBNull.Value);
command.Parameters.AddWithValue("limit", limit);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}

public async Task<IReadOnlyList<DeadLetterEntry>> GetByCategoryAsync(
string tenantId,
ErrorCategory category,
DeadLetterStatus? status,
int limit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByCategorySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("category", category.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("status", status.HasValue ? status.Value.ToString().ToLowerInvariant() : DBNull.Value);
command.Parameters.AddWithValue("limit", limit);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}

public async Task<DeadLetterStats> GetStatsAsync(
string tenantId,
CancellationToken cancellationToken)
{
const string statsSql = """
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status = 'pending') AS pending,
COUNT(*) FILTER (WHERE status = 'replaying') AS replaying,
COUNT(*) FILTER (WHERE status = 'replayed') AS replayed,
COUNT(*) FILTER (WHERE status = 'resolved') AS resolved,
COUNT(*) FILTER (WHERE status = 'exhausted') AS exhausted,
COUNT(*) FILTER (WHERE status = 'expired') AS expired,
COUNT(*) FILTER (WHERE is_retryable = TRUE AND status = 'pending') AS retryable
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
""";

const string byCategorySql = """
SELECT category, COUNT(*) AS cnt
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
GROUP BY category
""";

const string topErrorCodesSql = """
SELECT error_code, COUNT(*) AS cnt
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND status = 'pending'
GROUP BY error_code
ORDER BY cnt DESC
LIMIT 10
""";

const string topJobTypesSql = """
SELECT job_type, COUNT(*) AS cnt
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND status = 'pending'
GROUP BY job_type
ORDER BY cnt DESC
LIMIT 10
""";

await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);

// Get counts
long total = 0, pending = 0, replaying = 0, replayed = 0, resolved = 0, exhausted = 0, expired = 0, retryable = 0;
await using (var command = new NpgsqlCommand(statsSql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
total = reader.GetInt64(0);
pending = reader.GetInt64(1);
replaying = reader.GetInt64(2);
replayed = reader.GetInt64(3);
resolved = reader.GetInt64(4);
exhausted = reader.GetInt64(5);
expired = reader.GetInt64(6);
retryable = reader.GetInt64(7);
}
}

// Get by category
var byCategory = new Dictionary<ErrorCategory, long>();
await using (var command = new NpgsqlCommand(byCategorySql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
if (Enum.TryParse<ErrorCategory>(reader.GetString(0), true, out var cat))
{
byCategory[cat] = reader.GetInt64(1);
}
}
}

// Get top error codes
var topErrorCodes = new Dictionary<string, long>();
await using (var command = new NpgsqlCommand(topErrorCodesSql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
topErrorCodes[reader.GetString(0)] = reader.GetInt64(1);
}
}

// Get top job types
var topJobTypes = new Dictionary<string, long>();
await using (var command = new NpgsqlCommand(topJobTypesSql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
topJobTypes[reader.GetString(0)] = reader.GetInt64(1);
}
}

return new DeadLetterStats(
TotalEntries: total,
PendingEntries: pending,
ReplayingEntries: replaying,
ReplayedEntries: replayed,
ResolvedEntries: resolved,
ExhaustedEntries: exhausted,
ExpiredEntries: expired,
RetryableEntries: retryable,
ByCategory: byCategory,
TopErrorCodes: topErrorCodes,
TopJobTypes: topJobTypes);
}

public async Task<IReadOnlyList<DeadLetterSummary>> GetActionableSummaryAsync(
string tenantId,
int limit,
CancellationToken cancellationToken)
{
const string sql = """
SELECT error_code, category, entry_count, retryable_count, oldest_entry, sample_reason
FROM get_actionable_dead_letter_summary(@tenant_id, @limit)
""";

await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("limit", limit);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var summaries = new List<DeadLetterSummary>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
var categoryStr = reader.GetString(1);
var category = Enum.TryParse<ErrorCategory>(categoryStr, true, out var cat) ? cat : ErrorCategory.Unknown;

summaries.Add(new DeadLetterSummary(
ErrorCode: reader.GetString(0),
Category: category,
EntryCount: reader.GetInt64(2),
RetryableCount: reader.GetInt64(3),
OldestEntry: reader.GetFieldValue<DateTimeOffset>(4),
SampleReason: reader.IsDBNull(5) ? null : reader.GetString(5)));
}
return summaries;
}

public async Task<int> MarkExpiredAsync(
int batchLimit,
CancellationToken cancellationToken)
{
// Use a system-level connection (no tenant context needed for maintenance)
await using var connection = await _dataSource.OpenConnectionAsync("system", "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(MarkExpiredSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("batch_limit", batchLimit);

var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
var marked = Convert.ToInt32(result);

if (marked > 0)
{
OrchestratorMetrics.DeadLetterExpired(marked);
_logger.LogInformation("Marked {Count} dead-letter entries as expired", marked);
}

return marked;
}

public async Task<int> PurgeOldEntriesAsync(
int retentionDays,
int batchLimit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync("system", "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(PurgeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("retention_days", retentionDays);
command.Parameters.AddWithValue("batch_limit", batchLimit);

var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
var purged = Convert.ToInt32(result);

if (purged > 0)
{
OrchestratorMetrics.DeadLetterPurged(purged);
_logger.LogInformation("Purged {Count} old dead-letter entries (retention: {RetentionDays} days)", purged, retentionDays);
}

return purged;
}

private static void AddEntryParameters(NpgsqlCommand command, DeadLetterEntry entry)
{
command.Parameters.AddWithValue("entry_id", entry.EntryId);
command.Parameters.AddWithValue("tenant_id", entry.TenantId);
command.Parameters.AddWithValue("original_job_id", entry.OriginalJobId);
command.Parameters.AddWithValue("run_id", (object?)entry.RunId ?? DBNull.Value);
command.Parameters.AddWithValue("source_id", (object?)entry.SourceId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", entry.JobType);
command.Parameters.AddWithValue("payload", entry.Payload);
command.Parameters.AddWithValue("payload_digest", entry.PayloadDigest);
command.Parameters.AddWithValue("idempotency_key", entry.IdempotencyKey);
command.Parameters.AddWithValue("correlation_id", (object?)entry.CorrelationId ?? DBNull.Value);
command.Parameters.AddWithValue("status", entry.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("error_code", entry.ErrorCode);
command.Parameters.AddWithValue("failure_reason", entry.FailureReason);
command.Parameters.AddWithValue("remediation_hint", (object?)entry.RemediationHint ?? DBNull.Value);
command.Parameters.AddWithValue("category", entry.Category.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("is_retryable", entry.IsRetryable);
command.Parameters.AddWithValue("original_attempts", entry.OriginalAttempts);
command.Parameters.AddWithValue("replay_attempts", entry.ReplayAttempts);
command.Parameters.AddWithValue("max_replay_attempts", entry.MaxReplayAttempts);
command.Parameters.AddWithValue("failed_at", entry.FailedAt);
command.Parameters.AddWithValue("created_at", entry.CreatedAt);
command.Parameters.AddWithValue("updated_at", entry.UpdatedAt);
command.Parameters.AddWithValue("expires_at", entry.ExpiresAt);
command.Parameters.AddWithValue("resolved_at", (object?)entry.ResolvedAt ?? DBNull.Value);
command.Parameters.AddWithValue("resolution_notes", (object?)entry.ResolutionNotes ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", entry.CreatedBy);
command.Parameters.AddWithValue("updated_by", entry.UpdatedBy);
}

private static DeadLetterEntry MapEntry(NpgsqlDataReader reader)
{
var statusStr = reader.GetString(10);
var categoryStr = reader.GetString(14);

return new DeadLetterEntry(
EntryId: reader.GetGuid(0),
TenantId: reader.GetString(1),
OriginalJobId: reader.GetGuid(2),
RunId: reader.IsDBNull(3) ? null : reader.GetGuid(3),
SourceId: reader.IsDBNull(4) ? null : reader.GetGuid(4),
JobType: reader.GetString(5),
Payload: reader.GetString(6),
PayloadDigest: reader.GetString(7),
IdempotencyKey: reader.GetString(8),
CorrelationId: reader.IsDBNull(9) ? null : reader.GetString(9),
Status: Enum.TryParse<DeadLetterStatus>(statusStr, true, out var status) ? status : DeadLetterStatus.Pending,
ErrorCode: reader.GetString(11),
FailureReason: reader.GetString(12),
RemediationHint: reader.IsDBNull(13) ? null : reader.GetString(13),
Category: Enum.TryParse<ErrorCategory>(categoryStr, true, out var cat) ? cat : ErrorCategory.Unknown,
IsRetryable: reader.GetBoolean(15),
OriginalAttempts: reader.GetInt32(16),
ReplayAttempts: reader.GetInt32(17),
MaxReplayAttempts: reader.GetInt32(18),
FailedAt: reader.GetFieldValue<DateTimeOffset>(19),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(20),
UpdatedAt: reader.GetFieldValue<DateTimeOffset>(21),
ExpiresAt: reader.GetFieldValue<DateTimeOffset>(22),
ResolvedAt: reader.IsDBNull(23) ? null : reader.GetFieldValue<DateTimeOffset>(23),
ResolutionNotes: reader.IsDBNull(24) ? null : reader.GetString(24),
CreatedBy: reader.GetString(25),
UpdatedBy: reader.GetString(26));
}

private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
DeadLetterListOptions options)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectEntryColumns} FROM dead_letter_entries WHERE tenant_id = @tenant_id");

var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

AppendFilters(sb, parameters, options);

// Cursor is the created_at timestamp of the last row on the previous page.
// The cursor predicate must go before ORDER BY (appending it afterwards would
// produce invalid SQL), and the statement must only reference @cursor when the
// value actually parses, so the SQL and the parameter list always match.
if (!string.IsNullOrEmpty(options.Cursor)
&& DateTimeOffset.TryParse(options.Cursor, out var cursor))
{
var op = options.Ascending ? ">" : "<";
sb.Append($" AND created_at {op} @cursor");
parameters.Add(("cursor", cursor));
}

var order = options.Ascending ? "ASC" : "DESC";
sb.Append($" ORDER BY created_at {order}");
sb.Append(" LIMIT @limit");
parameters.Add(("limit", options.Limit));

return (sb.ToString(), parameters);
}

private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
string tenantId,
DeadLetterListOptions options)
{
var sb = new StringBuilder();
sb.Append("SELECT COUNT(*) FROM dead_letter_entries WHERE tenant_id = @tenant_id");

var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

AppendFilters(sb, parameters, options);

return (sb.ToString(), parameters);
}

private static void AppendFilters(StringBuilder sb, List<(string, object)> parameters, DeadLetterListOptions options)
{
if (options.Status.HasValue)
{
sb.Append(" AND status = @status");
parameters.Add(("status", options.Status.Value.ToString().ToLowerInvariant()));
}

if (options.Category.HasValue)
{
sb.Append(" AND category = @category");
parameters.Add(("category", options.Category.Value.ToString().ToLowerInvariant()));
}

if (!string.IsNullOrEmpty(options.JobType))
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", options.JobType));
}

if (!string.IsNullOrEmpty(options.ErrorCode))
{
sb.Append(" AND error_code = @error_code");
parameters.Add(("error_code", options.ErrorCode));
}

if (options.SourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", options.SourceId.Value));
}

if (options.RunId.HasValue)
{
sb.Append(" AND run_id = @run_id");
parameters.Add(("run_id", options.RunId.Value));
}

if (options.IsRetryable.HasValue)
{
sb.Append(" AND is_retryable = @is_retryable");
parameters.Add(("is_retryable", options.IsRetryable.Value));
}

if (options.CreatedAfter.HasValue)
{
sb.Append(" AND created_at >= @created_after");
parameters.Add(("created_after", options.CreatedAfter.Value));
}

if (options.CreatedBefore.HasValue)
{
sb.Append(" AND created_at <= @created_before");
parameters.Add(("created_before", options.CreatedBefore.Value));
}
}
}
@@ -0,0 +1,247 @@
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Backfill;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of duplicate suppressor.
/// </summary>
public sealed class PostgresDuplicateSuppressor : IDuplicateSuppressor
{
private const string SelectProcessedSql = """
SELECT 1 FROM processed_events
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND event_key = @event_key
AND expires_at > NOW()
""";

private const string SelectMultipleProcessedSql = """
SELECT event_key FROM processed_events
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND event_key = ANY(@event_keys)
AND expires_at > NOW()
""";

private const string UpsertProcessedSql = """
INSERT INTO processed_events (tenant_id, scope_key, event_key, event_time, processed_at, batch_id, expires_at)
VALUES (@tenant_id, @scope_key, @event_key, @event_time, NOW(), @batch_id, @expires_at)
ON CONFLICT (tenant_id, scope_key, event_key) DO UPDATE
SET event_time = EXCLUDED.event_time,
processed_at = NOW(),
batch_id = EXCLUDED.batch_id,
expires_at = EXCLUDED.expires_at
""";

private const string CountProcessedSql = """
SELECT COUNT(*) FROM processed_events
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND event_time >= @from
AND event_time < @to
AND expires_at > NOW()
""";

private const string CleanupExpiredSql = """
DELETE FROM processed_events
WHERE ctid IN (
SELECT ctid FROM processed_events
WHERE expires_at < NOW()
LIMIT @batch_limit
)
""";

private readonly OrchestratorDataSource _dataSource;
private readonly string _tenantId;
private readonly ILogger<PostgresDuplicateSuppressor> _logger;

public PostgresDuplicateSuppressor(
OrchestratorDataSource dataSource,
string tenantId,
ILogger<PostgresDuplicateSuppressor> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_tenantId = tenantId ?? throw new ArgumentNullException(nameof(tenantId));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}

public async Task<bool> HasProcessedAsync(string scopeKey, string eventKey, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_key", eventKey);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
return await reader.ReadAsync(cancellationToken).ConfigureAwait(false);
}

public async Task<IReadOnlySet<string>> GetProcessedAsync(string scopeKey, IEnumerable<string> eventKeys, CancellationToken cancellationToken)
{
var keyList = eventKeys.ToArray();
if (keyList.Length == 0)
{
return new HashSet<string>();
}

await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectMultipleProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_keys", keyList);

await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var result = new HashSet<string>();

while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
result.Add(reader.GetString(0));
}

return result;
}

public async Task MarkProcessedAsync(
string scopeKey,
string eventKey,
DateTimeOffset eventTime,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpsertProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_key", eventKey);
command.Parameters.AddWithValue("event_time", eventTime);
command.Parameters.AddWithValue("batch_id", (object?)batchId ?? DBNull.Value);
command.Parameters.AddWithValue("expires_at", DateTimeOffset.UtcNow + ttl);

await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}

public async Task MarkProcessedBatchAsync(
string scopeKey,
IEnumerable<ProcessedEvent> events,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken)
{
var eventList = events.ToList();
if (eventList.Count == 0)
{
return;
}

var expiresAt = DateTimeOffset.UtcNow + ttl;

await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);

try
{
foreach (var evt in eventList)
{
await using var command = new NpgsqlCommand(UpsertProcessedSql, connection, transaction);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_key", evt.EventKey);
command.Parameters.AddWithValue("event_time", evt.EventTime);
command.Parameters.AddWithValue("batch_id", (object?)batchId ?? DBNull.Value);
command.Parameters.AddWithValue("expires_at", expiresAt);

await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}

await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ProcessedEventsMarked(_tenantId, scopeKey, eventList.Count);
}
catch
{
await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
throw;
}
}
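
// Possible optimization (not implemented here): for large batches the
// per-event round trips inside the transaction could be collapsed with
// NpgsqlBatch or a COPY into a staging table.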

public async Task<long> CountProcessedAsync(string scopeKey, DateTimeOffset from, DateTimeOffset to, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(CountProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("from", from);
command.Parameters.AddWithValue("to", to);

var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(result);
}

public async Task<int> CleanupExpiredAsync(int batchLimit, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(CleanupExpiredSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

command.Parameters.AddWithValue("batch_limit", batchLimit);

var deleted = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

if (deleted > 0)
{
_logger.LogInformation("Cleaned up {DeletedCount} expired processed events", deleted);
OrchestratorMetrics.ProcessedEventsCleanedUp(_tenantId, deleted);
}

return deleted;
}
}

/// <summary>
/// Factory for creating tenant-scoped duplicate suppressors.
/// </summary>
public interface IDuplicateSuppressorFactory
{
/// <summary>
/// Creates a duplicate suppressor for the specified tenant.
/// </summary>
IDuplicateSuppressor Create(string tenantId);
}

/// <summary>
/// Factory implementation for PostgreSQL duplicate suppressors.
/// </summary>
public sealed class PostgresDuplicateSuppressorFactory : IDuplicateSuppressorFactory
{
private readonly OrchestratorDataSource _dataSource;
private readonly ILoggerFactory _loggerFactory;

public PostgresDuplicateSuppressorFactory(
OrchestratorDataSource dataSource,
ILoggerFactory loggerFactory)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
}

public IDuplicateSuppressor Create(string tenantId)
{
return new PostgresDuplicateSuppressor(
_dataSource,
tenantId,
_loggerFactory.CreateLogger<PostgresDuplicateSuppressor>());
}
}
@@ -0,0 +1,540 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of job repository.
/// </summary>
public sealed class PostgresJobRepository : IJobRepository
{
private const string SelectJobColumns = """
job_id, tenant_id, project_id, run_id, job_type, status, priority, attempt, max_attempts,
payload_digest, payload, idempotency_key, correlation_id, lease_id, worker_id, task_runner_id,
lease_until, created_at, scheduled_at, leased_at, completed_at, not_before, reason, replay_of, created_by
""";

private const string SelectByIdSql = $"""
SELECT {SelectJobColumns}
FROM jobs
WHERE tenant_id = @tenant_id AND job_id = @job_id
""";

private const string SelectByIdempotencyKeySql = $"""
SELECT {SelectJobColumns}
FROM jobs
WHERE tenant_id = @tenant_id AND idempotency_key = @idempotency_key
""";

private const string InsertJobSql = """
INSERT INTO jobs (
job_id, tenant_id, project_id, run_id, job_type, status, priority, attempt, max_attempts,
payload_digest, payload, idempotency_key, correlation_id, lease_id, worker_id, task_runner_id,
lease_until, created_at, scheduled_at, leased_at, completed_at, not_before, reason, replay_of, created_by)
VALUES (
@job_id, @tenant_id, @project_id, @run_id, @job_type, @status::job_status, @priority, @attempt, @max_attempts,
@payload_digest, @payload, @idempotency_key, @correlation_id, @lease_id, @worker_id, @task_runner_id,
@lease_until, @created_at, @scheduled_at, @leased_at, @completed_at, @not_before, @reason, @replay_of, @created_by)
""";

private const string UpdateStatusSql = """
UPDATE jobs
SET status = @status::job_status,
attempt = @attempt,
lease_id = @lease_id,
worker_id = @worker_id,
task_runner_id = @task_runner_id,
lease_until = @lease_until,
scheduled_at = @scheduled_at,
leased_at = @leased_at,
completed_at = @completed_at,
not_before = @not_before,
reason = @reason
WHERE tenant_id = @tenant_id AND job_id = @job_id
""";

private const string LeaseNextSqlTemplate = """
UPDATE jobs
SET status = 'leased'::job_status,
lease_id = @lease_id,
worker_id = @worker_id,
lease_until = @lease_until,
leased_at = @leased_at
WHERE tenant_id = @tenant_id
AND job_id = (
SELECT job_id
FROM jobs
WHERE tenant_id = @tenant_id
AND status = 'scheduled'::job_status
AND (not_before IS NULL OR not_before <= @now)
{0}
ORDER BY priority DESC, created_at
LIMIT 1
FOR UPDATE SKIP LOCKED
)
RETURNING
""";

private const string ExtendLeaseSql = """
UPDATE jobs
SET lease_until = @new_lease_until
WHERE tenant_id = @tenant_id
AND job_id = @job_id
AND lease_id = @lease_id
AND status = 'leased'::job_status
AND lease_until > @now
""";
|
||||
|
||||
private const string SelectByRunIdSql = $"""
|
||||
SELECT {SelectJobColumns}
|
||||
FROM jobs
|
||||
WHERE tenant_id = @tenant_id AND run_id = @run_id
|
||||
ORDER BY created_at
|
||||
""";
|
||||
|
||||
private const string SelectExpiredLeasesSql = $"""
|
||||
SELECT {SelectJobColumns}
|
||||
FROM jobs
|
||||
WHERE tenant_id = @tenant_id
|
||||
AND status = 'leased'::job_status
|
||||
AND lease_until < @cutoff
|
||||
ORDER BY lease_until
|
||||
LIMIT @limit
|
||||
""";
|
||||
|
||||
private readonly OrchestratorDataSource _dataSource;
|
||||
private readonly ILogger<PostgresJobRepository> _logger;
|
||||
|
||||
public PostgresJobRepository(
|
||||
OrchestratorDataSource dataSource,
|
||||
ILogger<PostgresJobRepository> logger)
|
||||
{
|
||||
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<Job?> GetByIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("job_id", jobId);
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
return MapJob(reader);
|
||||
}
|
||||
|
||||
public async Task<Job?> GetByIdempotencyKeyAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(SelectByIdempotencyKeySql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("idempotency_key", idempotencyKey);
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
return MapJob(reader);
|
||||
}
|
||||
|
||||
public async Task CreateAsync(Job job, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(job.TenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(InsertJobSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
AddJobParameters(command, job);
|
||||
|
||||
try
|
||||
{
|
||||
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
OrchestratorMetrics.JobEnqueued(job.TenantId, job.JobType);
|
||||
OrchestratorMetrics.QueueDepthChanged(job.TenantId, job.JobType, 1);
|
||||
}
|
||||
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
|
||||
{
|
||||
_logger.LogWarning("Duplicate job idempotency key: {IdempotencyKey}", job.IdempotencyKey);
|
||||
throw new DuplicateJobException(job.IdempotencyKey, ex);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task UpdateStatusAsync(
|
||||
string tenantId,
|
||||
Guid jobId,
|
||||
JobStatus status,
|
||||
int attempt,
|
||||
Guid? leaseId,
|
||||
string? workerId,
|
||||
string? taskRunnerId,
|
||||
DateTimeOffset? leaseUntil,
|
||||
DateTimeOffset? scheduledAt,
|
||||
DateTimeOffset? leasedAt,
|
||||
DateTimeOffset? completedAt,
|
||||
DateTimeOffset? notBefore,
|
||||
string? reason,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(UpdateStatusSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("job_id", jobId);
|
||||
command.Parameters.AddWithValue("status", StatusToString(status));
|
||||
command.Parameters.AddWithValue("attempt", attempt);
|
||||
command.Parameters.AddWithValue("lease_id", (object?)leaseId ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("worker_id", (object?)workerId ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("task_runner_id", (object?)taskRunnerId ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("lease_until", (object?)leaseUntil ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("scheduled_at", (object?)scheduledAt ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("leased_at", (object?)leasedAt ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("completed_at", (object?)completedAt ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("not_before", (object?)notBefore ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("reason", (object?)reason ?? DBNull.Value);
|
||||
|
||||
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public async Task<Job?> LeaseNextAsync(
|
||||
string tenantId,
|
||||
string? jobType,
|
||||
Guid leaseId,
|
||||
string workerId,
|
||||
DateTimeOffset leaseUntil,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var jobTypeFilter = jobType != null ? "AND job_type = @job_type" : "";
|
||||
var sql = string.Format(LeaseNextSqlTemplate, jobTypeFilter) + " " + SelectJobColumns;
|
||||
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(sql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("lease_id", leaseId);
|
||||
command.Parameters.AddWithValue("worker_id", workerId);
|
||||
command.Parameters.AddWithValue("lease_until", leaseUntil);
|
||||
command.Parameters.AddWithValue("leased_at", DateTimeOffset.UtcNow);
|
||||
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
|
||||
|
||||
if (jobType != null)
|
||||
{
|
||||
command.Parameters.AddWithValue("job_type", jobType);
|
||||
}
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var job = MapJob(reader);
|
||||
OrchestratorMetrics.JobLeased(job.TenantId, job.JobType);
|
||||
OrchestratorMetrics.QueueDepthChanged(job.TenantId, job.JobType, -1);
|
||||
return job;
|
||||
}
|
||||
|
||||
public async Task<bool> ExtendLeaseAsync(
|
||||
string tenantId,
|
||||
Guid jobId,
|
||||
Guid leaseId,
|
||||
DateTimeOffset newLeaseUntil,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(ExtendLeaseSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("job_id", jobId);
|
||||
command.Parameters.AddWithValue("lease_id", leaseId);
|
||||
command.Parameters.AddWithValue("new_lease_until", newLeaseUntil);
|
||||
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
return rows > 0;
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<Job>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(SelectByRunIdSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("run_id", runId);
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
var jobs = new List<Job>();
|
||||
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
jobs.Add(MapJob(reader));
|
||||
}
|
||||
return jobs;
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<Job>> GetExpiredLeasesAsync(string tenantId, DateTimeOffset cutoff, int limit, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(SelectExpiredLeasesSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("cutoff", cutoff);
|
||||
command.Parameters.AddWithValue("limit", limit);
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
var jobs = new List<Job>();
|
||||
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
jobs.Add(MapJob(reader));
|
||||
}
|
||||
return jobs;
|
||||
}
|
||||
|
||||
    public async Task<IReadOnlyList<Job>> ListAsync(
        string tenantId,
        JobStatus? status,
        string? jobType,
        string? projectId,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildListQuery(tenantId, status, jobType, projectId, createdAfter, createdBefore, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var jobs = new List<Job>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            jobs.Add(MapJob(reader));
        }
        return jobs;
    }

    public async Task<int> CountAsync(
        string tenantId,
        JobStatus? status,
        string? jobType,
        string? projectId,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildCountQuery(tenantId, status, jobType, projectId);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
        return Convert.ToInt32(result);
    }

    private static void AddJobParameters(NpgsqlCommand command, Job job)
    {
        command.Parameters.AddWithValue("job_id", job.JobId);
        command.Parameters.AddWithValue("tenant_id", job.TenantId);
        command.Parameters.AddWithValue("project_id", (object?)job.ProjectId ?? DBNull.Value);
        command.Parameters.AddWithValue("run_id", (object?)job.RunId ?? DBNull.Value);
        command.Parameters.AddWithValue("job_type", job.JobType);
        command.Parameters.AddWithValue("status", StatusToString(job.Status));
        command.Parameters.AddWithValue("priority", job.Priority);
        command.Parameters.AddWithValue("attempt", job.Attempt);
        command.Parameters.AddWithValue("max_attempts", job.MaxAttempts);
        command.Parameters.AddWithValue("payload_digest", job.PayloadDigest);
        command.Parameters.Add(new NpgsqlParameter<string>("payload", NpgsqlDbType.Jsonb) { TypedValue = job.Payload });
        command.Parameters.AddWithValue("idempotency_key", job.IdempotencyKey);
        command.Parameters.AddWithValue("correlation_id", (object?)job.CorrelationId ?? DBNull.Value);
        command.Parameters.AddWithValue("lease_id", (object?)job.LeaseId ?? DBNull.Value);
        command.Parameters.AddWithValue("worker_id", (object?)job.WorkerId ?? DBNull.Value);
        command.Parameters.AddWithValue("task_runner_id", (object?)job.TaskRunnerId ?? DBNull.Value);
        command.Parameters.AddWithValue("lease_until", (object?)job.LeaseUntil ?? DBNull.Value);
        command.Parameters.AddWithValue("created_at", job.CreatedAt);
        command.Parameters.AddWithValue("scheduled_at", (object?)job.ScheduledAt ?? DBNull.Value);
        command.Parameters.AddWithValue("leased_at", (object?)job.LeasedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)job.CompletedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("not_before", (object?)job.NotBefore ?? DBNull.Value);
        command.Parameters.AddWithValue("reason", (object?)job.Reason ?? DBNull.Value);
        command.Parameters.AddWithValue("replay_of", (object?)job.ReplayOf ?? DBNull.Value);
        command.Parameters.AddWithValue("created_by", job.CreatedBy);
    }

    private static Job MapJob(NpgsqlDataReader reader)
    {
        return new Job(
            JobId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            ProjectId: reader.IsDBNull(2) ? null : reader.GetString(2),
            RunId: reader.IsDBNull(3) ? null : reader.GetGuid(3),
            JobType: reader.GetString(4),
            Status: ParseStatus(reader.GetString(5)),
            Priority: reader.GetInt32(6),
            Attempt: reader.GetInt32(7),
            MaxAttempts: reader.GetInt32(8),
            PayloadDigest: reader.GetString(9),
            Payload: reader.GetString(10),
            IdempotencyKey: reader.GetString(11),
            CorrelationId: reader.IsDBNull(12) ? null : reader.GetString(12),
            LeaseId: reader.IsDBNull(13) ? null : reader.GetGuid(13),
            WorkerId: reader.IsDBNull(14) ? null : reader.GetString(14),
            TaskRunnerId: reader.IsDBNull(15) ? null : reader.GetString(15),
            LeaseUntil: reader.IsDBNull(16) ? null : reader.GetFieldValue<DateTimeOffset>(16),
            CreatedAt: reader.GetFieldValue<DateTimeOffset>(17),
            ScheduledAt: reader.IsDBNull(18) ? null : reader.GetFieldValue<DateTimeOffset>(18),
            LeasedAt: reader.IsDBNull(19) ? null : reader.GetFieldValue<DateTimeOffset>(19),
            CompletedAt: reader.IsDBNull(20) ? null : reader.GetFieldValue<DateTimeOffset>(20),
            NotBefore: reader.IsDBNull(21) ? null : reader.GetFieldValue<DateTimeOffset>(21),
            Reason: reader.IsDBNull(22) ? null : reader.GetString(22),
            ReplayOf: reader.IsDBNull(23) ? null : reader.GetGuid(23),
            CreatedBy: reader.GetString(24));
    }

    private static string StatusToString(JobStatus status) => status switch
    {
        JobStatus.Pending => "pending",
        JobStatus.Scheduled => "scheduled",
        JobStatus.Leased => "leased",
        JobStatus.Succeeded => "succeeded",
        JobStatus.Failed => "failed",
        JobStatus.Canceled => "canceled",
        JobStatus.TimedOut => "timed_out",
        _ => throw new ArgumentOutOfRangeException(nameof(status))
    };

    private static JobStatus ParseStatus(string status) => status switch
    {
        "pending" => JobStatus.Pending,
        "scheduled" => JobStatus.Scheduled,
        "leased" => JobStatus.Leased,
        "succeeded" => JobStatus.Succeeded,
        "failed" => JobStatus.Failed,
        "canceled" => JobStatus.Canceled,
        "timed_out" => JobStatus.TimedOut,
        _ => throw new ArgumentOutOfRangeException(nameof(status))
    };

    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        JobStatus? status,
        string? jobType,
        string? projectId,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectJobColumns} FROM jobs WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (status.HasValue)
        {
            sb.Append(" AND status = @status::job_status");
            parameters.Add(("status", StatusToString(status.Value)));
        }

        if (!string.IsNullOrEmpty(jobType))
        {
            sb.Append(" AND job_type = @job_type");
            parameters.Add(("job_type", jobType));
        }

        if (!string.IsNullOrEmpty(projectId))
        {
            sb.Append(" AND project_id = @project_id");
            parameters.Add(("project_id", projectId));
        }

        if (createdAfter.HasValue)
        {
            sb.Append(" AND created_at >= @created_after");
            parameters.Add(("created_after", createdAfter.Value));
        }

        if (createdBefore.HasValue)
        {
            sb.Append(" AND created_at < @created_before");
            parameters.Add(("created_before", createdBefore.Value));
        }

        sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }

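    // Editorial note (not part of the original commit): for a call such as
    // ListAsync(tenant, JobStatus.Leased, "scan", null, after, null, 50, 0, ct),
    // BuildListQuery produces, with every filter bound as a parameter:
    //
    //     SELECT <columns> FROM jobs
    //     WHERE tenant_id = @tenant_id
    //       AND status = @status::job_status
    //       AND job_type = @job_type
    //       AND created_at >= @created_after
    //     ORDER BY created_at DESC LIMIT @limit OFFSET @offset
    //
    // The @status::job_status cast matches the PostgreSQL enum column: the
    // value crosses the wire as text and is cast server-side.
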
    private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
        string tenantId,
        JobStatus? status,
        string? jobType,
        string? projectId)
    {
        var sb = new StringBuilder();
        sb.Append("SELECT COUNT(*) FROM jobs WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (status.HasValue)
        {
            sb.Append(" AND status = @status::job_status");
            parameters.Add(("status", StatusToString(status.Value)));
        }

        if (!string.IsNullOrEmpty(jobType))
        {
            sb.Append(" AND job_type = @job_type");
            parameters.Add(("job_type", jobType));
        }

        if (!string.IsNullOrEmpty(projectId))
        {
            sb.Append(" AND project_id = @project_id");
            parameters.Add(("project_id", projectId));
        }

        return (sb.ToString(), parameters);
    }
}

/// <summary>
/// Exception thrown when attempting to create a job with a duplicate idempotency key.
/// </summary>
public sealed class DuplicateJobException : Exception
{
    public string IdempotencyKey { get; }

    public DuplicateJobException(string idempotencyKey, Exception innerException)
        : base($"Job with idempotency key '{idempotencyKey}' already exists.", innerException)
    {
        IdempotencyKey = idempotencyKey;
    }
}

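// Editorial sketch (not part of the original commit): callers can treat
// DuplicateJobException as the "already submitted" signal for idempotent
// enqueue. CreateAsync and GetByIdempotencyKeyAsync are assumed repository
// method names; only the exception shape above is taken from this file.
//
//     try
//     {
//         return await jobs.CreateAsync(job, ct);
//     }
//     catch (DuplicateJobException ex)
//     {
//         // First writer won; fetch the existing job by its idempotency key.
//         return await jobs.GetByIdempotencyKeyAsync(job.TenantId, ex.IdempotencyKey, ct);
//     }
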
@@ -0,0 +1,949 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of the ledger repository.
/// </summary>
public sealed class PostgresLedgerRepository : ILedgerRepository
{
    private const string SelectLedgerColumns = """
        ledger_id, tenant_id, run_id, source_id, run_type, final_status, total_jobs,
        succeeded_jobs, failed_jobs, run_created_at, run_started_at, run_completed_at,
        execution_duration_ms, initiated_by, input_digest, output_digest, artifact_manifest,
        sequence_number, previous_entry_hash, content_hash, ledger_created_at, correlation_id, metadata
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectLedgerColumns}
        FROM run_ledger_entries
        WHERE tenant_id = @tenant_id AND ledger_id = @ledger_id
        """;

    private const string SelectByRunIdSql = $"""
        SELECT {SelectLedgerColumns}
        FROM run_ledger_entries
        WHERE tenant_id = @tenant_id AND run_id = @run_id
        """;

    private const string InsertEntrySql = """
        INSERT INTO run_ledger_entries (
            ledger_id, tenant_id, run_id, source_id, run_type, final_status, total_jobs,
            succeeded_jobs, failed_jobs, run_created_at, run_started_at, run_completed_at,
            execution_duration_ms, initiated_by, input_digest, output_digest, artifact_manifest,
            sequence_number, previous_entry_hash, content_hash, ledger_created_at, correlation_id, metadata)
        VALUES (
            @ledger_id, @tenant_id, @run_id, @source_id, @run_type, @final_status, @total_jobs,
            @succeeded_jobs, @failed_jobs, @run_created_at, @run_started_at, @run_completed_at,
            @execution_duration_ms, @initiated_by, @input_digest, @output_digest, @artifact_manifest::jsonb,
            @sequence_number, @previous_entry_hash, @content_hash, @ledger_created_at, @correlation_id, @metadata::jsonb)
        """;

    private const string SelectLatestSql = $"""
        SELECT {SelectLedgerColumns}
        FROM run_ledger_entries
        WHERE tenant_id = @tenant_id
        ORDER BY sequence_number DESC
        LIMIT 1
        """;

    private const string GetSequenceSql = """
        SELECT next_seq, prev_hash FROM next_ledger_sequence(@tenant_id)
        """;

    private const string UpdateSequenceHashSql = """
        SELECT update_ledger_sequence_hash(@tenant_id, @content_hash)
        """;

    private const string VerifyChainSql = """
        SELECT is_valid, invalid_ledger_id, invalid_sequence, error_message
        FROM verify_ledger_chain(@tenant_id, @start_seq, @end_seq)
        """;

    private const string GetSummarySql = """
        SELECT total_entries, entries_since, total_runs, successful_runs, failed_runs,
               total_jobs, unique_sources, unique_run_types, earliest_entry, latest_entry
        FROM get_ledger_summary(@tenant_id, @since)
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresLedgerRepository> _logger;

    public PostgresLedgerRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresLedgerRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<RunLedgerEntry> AppendAsync(
        Run run,
        IReadOnlyList<Artifact> artifacts,
        string inputDigest,
        string? metadata = null,
        CancellationToken cancellationToken = default)
    {
        if (run.CompletedAt is null)
        {
            throw new InvalidOperationException("Cannot create ledger entry from an incomplete run.");
        }

        await using var connection = await _dataSource.OpenConnectionAsync(run.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);

        try
        {
            // Get next sequence number and previous hash
            long sequenceNumber;
            string? previousEntryHash;

            await using (var seqCommand = new NpgsqlCommand(GetSequenceSql, connection, transaction))
            {
                seqCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                seqCommand.Parameters.AddWithValue("tenant_id", run.TenantId);

                await using var reader = await seqCommand.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
                if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
                {
                    throw new InvalidOperationException("Failed to get next ledger sequence.");
                }

                sequenceNumber = reader.GetInt64(0);
                previousEntryHash = reader.IsDBNull(1) ? null : reader.GetString(1);
            }

            // Create the ledger entry
            var entry = RunLedgerEntry.FromCompletedRun(
                run: run,
                artifacts: artifacts,
                inputDigest: inputDigest,
                sequenceNumber: sequenceNumber,
                previousEntryHash: previousEntryHash,
                metadata: metadata);

            // Insert the entry
            await using (var insertCommand = new NpgsqlCommand(InsertEntrySql, connection, transaction))
            {
                insertCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                AddEntryParameters(insertCommand, entry);
                await insertCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            }

            // Update sequence hash
            await using (var updateCommand = new NpgsqlCommand(UpdateSequenceHashSql, connection, transaction))
            {
                updateCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
                updateCommand.Parameters.AddWithValue("tenant_id", run.TenantId);
                updateCommand.Parameters.AddWithValue("content_hash", entry.ContentHash);
                await updateCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            }

            await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);

            OrchestratorMetrics.LedgerEntryCreated(run.TenantId, run.RunType, entry.FinalStatus.ToString());
            _logger.LogDebug("Ledger entry {LedgerId} appended for run {RunId}, sequence {Sequence}",
                entry.LedgerId, run.RunId, sequenceNumber);

            return entry;
        }
        catch
        {
            await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
            throw;
        }
    }

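    // Editorial note (not part of the original commit): AppendAsync builds a
    // simple hash chain. For entry N, previous_entry_hash is entry N-1's
    // content_hash (NULL for the first entry), so tampering with any stored
    // entry breaks every later link. Taking the sequence number, inserting the
    // entry, and recording the new head hash all happen in one transaction, so
    // a concurrent append cannot interleave between those steps.
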
    public async Task<RunLedgerEntry?> GetByIdAsync(
        string tenantId,
        Guid ledgerId,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("ledger_id", ledgerId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapEntry(reader);
    }

    public async Task<RunLedgerEntry?> GetByRunIdAsync(
        string tenantId,
        Guid runId,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByRunIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("run_id", runId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapEntry(reader);
    }

    public async Task<IReadOnlyList<RunLedgerEntry>> ListAsync(
        string tenantId,
        string? runType = null,
        Guid? sourceId = null,
        RunStatus? finalStatus = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default)
    {
        var (sql, parameters) = BuildListQuery(tenantId, runType, sourceId, finalStatus, startTime, endTime, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var entries = new List<RunLedgerEntry>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            entries.Add(MapEntry(reader));
        }
        return entries;
    }

    public async Task<IReadOnlyList<RunLedgerEntry>> GetBySequenceRangeAsync(
        string tenantId,
        long startSequence,
        long endSequence,
        CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectLedgerColumns}
            FROM run_ledger_entries
            WHERE tenant_id = @tenant_id
              AND sequence_number >= @start_seq
              AND sequence_number <= @end_seq
            ORDER BY sequence_number ASC
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("start_seq", startSequence);
        command.Parameters.AddWithValue("end_seq", endSequence);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var entries = new List<RunLedgerEntry>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            entries.Add(MapEntry(reader));
        }
        return entries;
    }

    public async Task<RunLedgerEntry?> GetLatestAsync(
        string tenantId,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectLatestSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapEntry(reader);
    }

    public async Task<IReadOnlyList<RunLedgerEntry>> GetBySourceAsync(
        string tenantId,
        Guid sourceId,
        int limit = 100,
        CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectLedgerColumns}
            FROM run_ledger_entries
            WHERE tenant_id = @tenant_id
              AND source_id = @source_id
            ORDER BY ledger_created_at DESC
            LIMIT @limit
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);
        command.Parameters.AddWithValue("limit", limit);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var entries = new List<RunLedgerEntry>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            entries.Add(MapEntry(reader));
        }
        return entries;
    }

    public async Task<long> GetCountAsync(
        string tenantId,
        string? runType = null,
        Guid? sourceId = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        CancellationToken cancellationToken = default)
    {
        var sb = new StringBuilder("SELECT COUNT(*) FROM run_ledger_entries WHERE tenant_id = @tenant_id");
        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (runType is not null)
        {
            sb.Append(" AND run_type = @run_type");
            parameters.Add(("run_type", runType));
        }

        if (sourceId.HasValue)
        {
            sb.Append(" AND source_id = @source_id");
            parameters.Add(("source_id", sourceId.Value));
        }

        if (startTime.HasValue)
        {
            sb.Append(" AND ledger_created_at >= @start_time");
            parameters.Add(("start_time", startTime.Value));
        }

        if (endTime.HasValue)
        {
            sb.Append(" AND ledger_created_at <= @end_time");
            parameters.Add(("end_time", endTime.Value));
        }

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sb.ToString(), connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
        return Convert.ToInt64(result);
    }

    public async Task<ChainVerificationResult> VerifyChainAsync(
        string tenantId,
        long? startSequence = null,
        long? endSequence = null,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(VerifyChainSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("start_seq", (object?)startSequence ?? 1L);
        command.Parameters.AddWithValue("end_seq", (object?)endSequence ?? DBNull.Value);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return new ChainVerificationResult(true, null, null, null);
        }

        return new ChainVerificationResult(
            IsValid: reader.GetBoolean(0),
            InvalidEntryId: reader.IsDBNull(1) ? null : reader.GetGuid(1),
            InvalidSequence: reader.IsDBNull(2) ? null : reader.GetInt64(2),
            ErrorMessage: reader.IsDBNull(3) ? null : reader.GetString(3));
    }

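    // Editorial sketch (not part of the original commit): a scheduled
    // integrity check might verify the whole chain and alert on the first
    // broken link. The alerts sink is a placeholder; the result shape is
    // taken from this file.
    //
    //     var result = await ledger.VerifyChainAsync(tenantId, cancellationToken: ct);
    //     if (!result.IsValid)
    //     {
    //         alerts.Raise(
    //             $"Ledger chain broken at sequence {result.InvalidSequence} " +
    //             $"(entry {result.InvalidEntryId}): {result.ErrorMessage}");
    //     }
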
    public async Task<LedgerSummary> GetSummaryAsync(
        string tenantId,
        DateTimeOffset? since = null,
        CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(GetSummarySql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("since", (object?)since ?? DBNull.Value);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return new LedgerSummary(0, 0, 0, 0, 0, 0, 0, 0, null, null);
        }

        return new LedgerSummary(
            TotalEntries: reader.GetInt64(0),
            EntriesSince: reader.GetInt64(1),
            TotalRuns: reader.GetInt64(2),
            SuccessfulRuns: reader.GetInt64(3),
            FailedRuns: reader.GetInt64(4),
            TotalJobs: reader.GetInt64(5),
            UniqueSources: reader.GetInt64(6),
            UniqueRunTypes: reader.GetInt64(7),
            EarliestEntry: reader.IsDBNull(8) ? null : reader.GetFieldValue<DateTimeOffset>(8),
            LatestEntry: reader.IsDBNull(9) ? null : reader.GetFieldValue<DateTimeOffset>(9));
    }

    private static void AddEntryParameters(NpgsqlCommand command, RunLedgerEntry entry)
    {
        command.Parameters.AddWithValue("ledger_id", entry.LedgerId);
        command.Parameters.AddWithValue("tenant_id", entry.TenantId);
        command.Parameters.AddWithValue("run_id", entry.RunId);
        command.Parameters.AddWithValue("source_id", entry.SourceId);
        command.Parameters.AddWithValue("run_type", entry.RunType);
        command.Parameters.AddWithValue("final_status", (int)entry.FinalStatus);
        command.Parameters.AddWithValue("total_jobs", entry.TotalJobs);
        command.Parameters.AddWithValue("succeeded_jobs", entry.SucceededJobs);
        command.Parameters.AddWithValue("failed_jobs", entry.FailedJobs);
        command.Parameters.AddWithValue("run_created_at", entry.RunCreatedAt);
        command.Parameters.AddWithValue("run_started_at", (object?)entry.RunStartedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("run_completed_at", entry.RunCompletedAt);
        command.Parameters.AddWithValue("execution_duration_ms", (long)entry.ExecutionDuration.TotalMilliseconds);
        command.Parameters.AddWithValue("initiated_by", entry.InitiatedBy);
        command.Parameters.AddWithValue("input_digest", entry.InputDigest);
        command.Parameters.AddWithValue("output_digest", entry.OutputDigest);
        command.Parameters.AddWithValue("artifact_manifest", entry.ArtifactManifest);
        command.Parameters.AddWithValue("sequence_number", entry.SequenceNumber);
        command.Parameters.AddWithValue("previous_entry_hash", (object?)entry.PreviousEntryHash ?? DBNull.Value);
        command.Parameters.AddWithValue("content_hash", entry.ContentHash);
        command.Parameters.AddWithValue("ledger_created_at", entry.LedgerCreatedAt);
        command.Parameters.AddWithValue("correlation_id", (object?)entry.CorrelationId ?? DBNull.Value);
        command.Parameters.AddWithValue("metadata", (object?)entry.Metadata ?? DBNull.Value);
    }

    private static RunLedgerEntry MapEntry(NpgsqlDataReader reader)
    {
        return new RunLedgerEntry(
            LedgerId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            RunId: reader.GetGuid(2),
            SourceId: reader.GetGuid(3),
            RunType: reader.GetString(4),
            FinalStatus: (RunStatus)reader.GetInt32(5),
            TotalJobs: reader.GetInt32(6),
            SucceededJobs: reader.GetInt32(7),
            FailedJobs: reader.GetInt32(8),
            RunCreatedAt: reader.GetFieldValue<DateTimeOffset>(9),
            RunStartedAt: reader.IsDBNull(10) ? null : reader.GetFieldValue<DateTimeOffset>(10),
            RunCompletedAt: reader.GetFieldValue<DateTimeOffset>(11),
            ExecutionDuration: TimeSpan.FromMilliseconds(reader.GetInt64(12)),
            InitiatedBy: reader.GetString(13),
            InputDigest: reader.GetString(14),
            OutputDigest: reader.GetString(15),
            ArtifactManifest: reader.GetString(16),
            SequenceNumber: reader.GetInt64(17),
            PreviousEntryHash: reader.IsDBNull(18) ? null : reader.GetString(18),
            ContentHash: reader.GetString(19),
            LedgerCreatedAt: reader.GetFieldValue<DateTimeOffset>(20),
            CorrelationId: reader.IsDBNull(21) ? null : reader.GetString(21),
            Metadata: reader.IsDBNull(22) ? null : reader.GetString(22));
    }

    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        string? runType,
        Guid? sourceId,
        RunStatus? finalStatus,
        DateTimeOffset? startTime,
        DateTimeOffset? endTime,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectLedgerColumns} FROM run_ledger_entries WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (runType is not null)
        {
            sb.Append(" AND run_type = @run_type");
            parameters.Add(("run_type", runType));
        }

        if (sourceId.HasValue)
        {
            sb.Append(" AND source_id = @source_id");
            parameters.Add(("source_id", sourceId.Value));
        }

        if (finalStatus.HasValue)
        {
            sb.Append(" AND final_status = @final_status");
            parameters.Add(("final_status", (int)finalStatus.Value));
        }

        if (startTime.HasValue)
        {
            sb.Append(" AND ledger_created_at >= @start_time");
            parameters.Add(("start_time", startTime.Value));
        }

        if (endTime.HasValue)
        {
            sb.Append(" AND ledger_created_at <= @end_time");
            parameters.Add(("end_time", endTime.Value));
        }

        sb.Append(" ORDER BY ledger_created_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }
}

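// Editorial sketch (not part of the original commit): GetBySequenceRangeAsync
// lends itself to batched, in-order export of the chain. The batch size and
// the writer sink are assumptions.
//
//     for (var seq = 1L; ; seq += 1000)
//     {
//         var batch = await ledger.GetBySequenceRangeAsync(tenantId, seq, seq + 999, ct);
//         if (batch.Count == 0) break;
//         await writer.WriteAsync(batch, ct); // placeholder sink
//     }
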
/// <summary>
/// PostgreSQL implementation of the ledger export repository.
/// </summary>
public sealed class PostgresLedgerExportRepository : ILedgerExportRepository
{
    private const string SelectExportColumns = """
        export_id, tenant_id, status, format, start_time, end_time, run_type_filter,
        source_id_filter, entry_count, output_uri, output_digest, output_size_bytes,
        requested_by, requested_at, started_at, completed_at, error_message
        """;

    private const string InsertExportSql = """
        INSERT INTO ledger_exports (
            export_id, tenant_id, status, format, start_time, end_time, run_type_filter,
            source_id_filter, entry_count, output_uri, output_digest, output_size_bytes,
            requested_by, requested_at, started_at, completed_at, error_message)
        VALUES (
            @export_id, @tenant_id, @status, @format, @start_time, @end_time, @run_type_filter,
            @source_id_filter, @entry_count, @output_uri, @output_digest, @output_size_bytes,
            @requested_by, @requested_at, @started_at, @completed_at, @error_message)
        """;

    private const string UpdateExportSql = """
        UPDATE ledger_exports
        SET status = @status,
            entry_count = @entry_count,
            output_uri = @output_uri,
            output_digest = @output_digest,
            output_size_bytes = @output_size_bytes,
            started_at = @started_at,
            completed_at = @completed_at,
            error_message = @error_message
        WHERE tenant_id = @tenant_id AND export_id = @export_id
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresLedgerExportRepository> _logger;

    public PostgresLedgerExportRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresLedgerExportRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<LedgerExport> CreateAsync(LedgerExport export, CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(export.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertExportSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        AddExportParameters(command, export);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

        OrchestratorMetrics.LedgerExportRequested(export.TenantId, export.Format);
        _logger.LogDebug("Ledger export {ExportId} created for tenant {TenantId}", export.ExportId, export.TenantId);

        return export;
    }

    public async Task<LedgerExport?> GetByIdAsync(string tenantId, Guid exportId, CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectExportColumns}
            FROM ledger_exports
            WHERE tenant_id = @tenant_id AND export_id = @export_id
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("export_id", exportId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapExport(reader);
    }

    public async Task<IReadOnlyList<LedgerExport>> ListAsync(
        string tenantId,
        LedgerExportStatus? status = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default)
    {
        var sb = new StringBuilder($"SELECT {SelectExportColumns} FROM ledger_exports WHERE tenant_id = @tenant_id");
        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (status.HasValue)
        {
            sb.Append(" AND status = @status");
            parameters.Add(("status", (int)status.Value));
        }

        sb.Append(" ORDER BY requested_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sb.ToString(), connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var exports = new List<LedgerExport>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            exports.Add(MapExport(reader));
        }
        return exports;
    }

    public async Task<LedgerExport> UpdateAsync(LedgerExport export, CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(export.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateExportSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("export_id", export.ExportId);
        command.Parameters.AddWithValue("tenant_id", export.TenantId);
        command.Parameters.AddWithValue("status", (int)export.Status);
        command.Parameters.AddWithValue("entry_count", export.EntryCount);
        command.Parameters.AddWithValue("output_uri", (object?)export.OutputUri ?? DBNull.Value);
        command.Parameters.AddWithValue("output_digest", (object?)export.OutputDigest ?? DBNull.Value);
        command.Parameters.AddWithValue("output_size_bytes", (object?)export.OutputSizeBytes ?? DBNull.Value);
        command.Parameters.AddWithValue("started_at", (object?)export.StartedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)export.CompletedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("error_message", (object?)export.ErrorMessage ?? DBNull.Value);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

        if (export.Status == LedgerExportStatus.Completed)
        {
            OrchestratorMetrics.LedgerExportCompleted(export.TenantId, export.Format);
        }
        else if (export.Status == LedgerExportStatus.Failed)
        {
            OrchestratorMetrics.LedgerExportFailed(export.TenantId, export.Format);
        }

        return export;
    }

    public async Task<IReadOnlyList<LedgerExport>> GetPendingAsync(int limit = 10, CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectExportColumns}
            FROM ledger_exports
            WHERE status = @status
            ORDER BY requested_at ASC
            LIMIT @limit
            """;

        await using var connection = await _dataSource.OpenConnectionAsync("_system", "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("status", (int)LedgerExportStatus.Pending);
        command.Parameters.AddWithValue("limit", limit);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var exports = new List<LedgerExport>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            exports.Add(MapExport(reader));
        }
        return exports;
    }

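    // Editorial sketch (not part of the original commit): a background worker
    // can drain pending exports via this method. LedgerExportStatus.Running
    // and LedgerExport being a record (for the with-expression) are
    // assumptions; only GetPendingAsync and UpdateAsync exist in this file.
    //
    //     var pending = await exports.GetPendingAsync(limit: 10, ct);
    //     foreach (var export in pending)
    //     {
    //         var running = export with
    //         {
    //             Status = LedgerExportStatus.Running,
    //             StartedAt = DateTimeOffset.UtcNow,
    //         };
    //         await exports.UpdateAsync(running, ct);
    //         // ... produce the export file, then record completion or failure ...
    //     }
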
    private static void AddExportParameters(NpgsqlCommand command, LedgerExport export)
    {
        command.Parameters.AddWithValue("export_id", export.ExportId);
        command.Parameters.AddWithValue("tenant_id", export.TenantId);
        command.Parameters.AddWithValue("status", (int)export.Status);
        command.Parameters.AddWithValue("format", export.Format);
        command.Parameters.AddWithValue("start_time", (object?)export.StartTime ?? DBNull.Value);
        command.Parameters.AddWithValue("end_time", (object?)export.EndTime ?? DBNull.Value);
        command.Parameters.AddWithValue("run_type_filter", (object?)export.RunTypeFilter ?? DBNull.Value);
        command.Parameters.AddWithValue("source_id_filter", (object?)export.SourceIdFilter ?? DBNull.Value);
        command.Parameters.AddWithValue("entry_count", export.EntryCount);
        command.Parameters.AddWithValue("output_uri", (object?)export.OutputUri ?? DBNull.Value);
        command.Parameters.AddWithValue("output_digest", (object?)export.OutputDigest ?? DBNull.Value);
        command.Parameters.AddWithValue("output_size_bytes", (object?)export.OutputSizeBytes ?? DBNull.Value);
        command.Parameters.AddWithValue("requested_by", export.RequestedBy);
        command.Parameters.AddWithValue("requested_at", export.RequestedAt);
        command.Parameters.AddWithValue("started_at", (object?)export.StartedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)export.CompletedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("error_message", (object?)export.ErrorMessage ?? DBNull.Value);
    }

    private static LedgerExport MapExport(NpgsqlDataReader reader)
    {
        return new LedgerExport(
            ExportId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            Status: (LedgerExportStatus)reader.GetInt32(2),
            Format: reader.GetString(3),
            StartTime: reader.IsDBNull(4) ? null : reader.GetFieldValue<DateTimeOffset>(4),
            EndTime: reader.IsDBNull(5) ? null : reader.GetFieldValue<DateTimeOffset>(5),
            RunTypeFilter: reader.IsDBNull(6) ? null : reader.GetString(6),
            SourceIdFilter: reader.IsDBNull(7) ? null : reader.GetGuid(7),
            EntryCount: reader.GetInt32(8),
            OutputUri: reader.IsDBNull(9) ? null : reader.GetString(9),
            OutputDigest: reader.IsDBNull(10) ? null : reader.GetString(10),
            OutputSizeBytes: reader.IsDBNull(11) ? null : reader.GetInt64(11),
            RequestedBy: reader.GetString(12),
            RequestedAt: reader.GetFieldValue<DateTimeOffset>(13),
            StartedAt: reader.IsDBNull(14) ? null : reader.GetFieldValue<DateTimeOffset>(14),
            CompletedAt: reader.IsDBNull(15) ? null : reader.GetFieldValue<DateTimeOffset>(15),
            ErrorMessage: reader.IsDBNull(16) ? null : reader.GetString(16));
    }
}

/// <summary>
/// PostgreSQL implementation of the manifest repository.
/// </summary>
public sealed class PostgresManifestRepository : IManifestRepository
{
    private const string SelectManifestColumns = """
        manifest_id, schema_version, tenant_id, provenance_type, subject_id, statements,
        artifacts, materials, build_info, payload_digest, signature_algorithm, signature,
        key_id, created_at, expires_at, metadata
        """;

    private const string InsertManifestSql = """
        INSERT INTO signed_manifests (
            manifest_id, schema_version, tenant_id, provenance_type, subject_id, statements,
            artifacts, materials, build_info, payload_digest, signature_algorithm, signature,
            key_id, created_at, expires_at, metadata)
        VALUES (
            @manifest_id, @schema_version, @tenant_id, @provenance_type, @subject_id, @statements::jsonb,
            @artifacts::jsonb, @materials::jsonb, @build_info::jsonb, @payload_digest, @signature_algorithm, @signature,
            @key_id, @created_at, @expires_at, @metadata::jsonb)
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresManifestRepository> _logger;

    public PostgresManifestRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresManifestRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<SignedManifest> CreateAsync(SignedManifest manifest, CancellationToken cancellationToken = default)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(manifest.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertManifestSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("manifest_id", manifest.ManifestId);
        command.Parameters.AddWithValue("schema_version", manifest.SchemaVersion);
        command.Parameters.AddWithValue("tenant_id", manifest.TenantId);
        command.Parameters.AddWithValue("provenance_type", (int)manifest.ProvenanceType);
        command.Parameters.AddWithValue("subject_id", manifest.SubjectId);
        command.Parameters.AddWithValue("statements", manifest.Statements);
        command.Parameters.AddWithValue("artifacts", manifest.Artifacts);
        command.Parameters.AddWithValue("materials", manifest.Materials);
        command.Parameters.AddWithValue("build_info", (object?)manifest.BuildInfo ?? DBNull.Value);
        command.Parameters.AddWithValue("payload_digest", manifest.PayloadDigest);
        command.Parameters.AddWithValue("signature_algorithm", manifest.SignatureAlgorithm);
        command.Parameters.AddWithValue("signature", manifest.Signature);
        command.Parameters.AddWithValue("key_id", manifest.KeyId);
        command.Parameters.AddWithValue("created_at", manifest.CreatedAt);
        command.Parameters.AddWithValue("expires_at", (object?)manifest.ExpiresAt ?? DBNull.Value);
        command.Parameters.AddWithValue("metadata", (object?)manifest.Metadata ?? DBNull.Value);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

        OrchestratorMetrics.ManifestCreated(manifest.TenantId, manifest.ProvenanceType.ToString());
        _logger.LogDebug("Manifest {ManifestId} created for subject {SubjectId}", manifest.ManifestId, manifest.SubjectId);

        return manifest;
    }

    public async Task<SignedManifest?> GetByIdAsync(string tenantId, Guid manifestId, CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectManifestColumns}
            FROM signed_manifests
            WHERE tenant_id = @tenant_id AND manifest_id = @manifest_id
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("manifest_id", manifestId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapManifest(reader);
    }

    public async Task<SignedManifest?> GetBySubjectAsync(
        string tenantId,
        ProvenanceType provenanceType,
        Guid subjectId,
        CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectManifestColumns}
            FROM signed_manifests
            WHERE tenant_id = @tenant_id
              AND provenance_type = @provenance_type
              AND subject_id = @subject_id
            ORDER BY created_at DESC
            LIMIT 1
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("provenance_type", (int)provenanceType);
        command.Parameters.AddWithValue("subject_id", subjectId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapManifest(reader);
    }

    public async Task<IReadOnlyList<SignedManifest>> ListAsync(
        string tenantId,
        ProvenanceType? provenanceType = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default)
    {
        var sb = new StringBuilder($"SELECT {SelectManifestColumns} FROM signed_manifests WHERE tenant_id = @tenant_id");
        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (provenanceType.HasValue)
        {
            sb.Append(" AND provenance_type = @provenance_type");
            parameters.Add(("provenance_type", (int)provenanceType.Value));
        }

        sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sb.ToString(), connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var manifests = new List<SignedManifest>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            manifests.Add(MapManifest(reader));
        }
        return manifests;
    }

    public async Task<SignedManifest?> GetByPayloadDigestAsync(
        string tenantId,
        string payloadDigest,
        CancellationToken cancellationToken = default)
    {
        var sql = $"""
            SELECT {SelectManifestColumns}
            FROM signed_manifests
            WHERE tenant_id = @tenant_id AND payload_digest = @payload_digest
            """;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("payload_digest", payloadDigest);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapManifest(reader);
    }

    private static SignedManifest MapManifest(NpgsqlDataReader reader)
    {
        return new SignedManifest(
            ManifestId: reader.GetGuid(0),
            SchemaVersion: reader.GetString(1),
            TenantId: reader.GetString(2),
            ProvenanceType: (ProvenanceType)reader.GetInt32(3),
            SubjectId: reader.GetGuid(4),
            Statements: reader.GetString(5),
            Artifacts: reader.GetString(6),
            Materials: reader.GetString(7),
            BuildInfo: reader.IsDBNull(8) ? null : reader.GetString(8),
            PayloadDigest: reader.GetString(9),
            SignatureAlgorithm: reader.GetString(10),
            Signature: reader.GetString(11),
            KeyId: reader.GetString(12),
            CreatedAt: reader.GetFieldValue<DateTimeOffset>(13),
            ExpiresAt: reader.IsDBNull(14) ? null : reader.GetFieldValue<DateTimeOffset>(14),
            Metadata: reader.IsDBNull(15) ? null : reader.GetString(15));
    }
}

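// Editorial sketch (not part of the original commit): payload_digest gives a
// content address, so callers can deduplicate before signing and storing a
// manifest. Only methods defined in this class are used here.
//
//     var existing = await manifests.GetByPayloadDigestAsync(tenantId, digest, ct);
//     if (existing is not null)
//     {
//         return existing; // identical payload already signed and stored
//     }
//     return await manifests.CreateAsync(manifest, ct);
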
@@ -0,0 +1,434 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of the quota repository.
/// </summary>
public sealed class PostgresQuotaRepository : IQuotaRepository
{
    private const string SelectQuotaColumns = """
        quota_id, tenant_id, job_type, max_active, max_per_hour, burst_capacity,
        refill_rate, current_tokens, last_refill_at, current_active, current_hour_count,
        current_hour_start, paused, pause_reason, quota_ticket, created_at, updated_at, updated_by
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectQuotaColumns}
        FROM quotas
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string SelectByTenantAndJobTypeSql = $"""
        SELECT {SelectQuotaColumns}
        FROM quotas
        WHERE tenant_id = @tenant_id AND (job_type = @job_type OR (job_type IS NULL AND @job_type IS NULL))
        """;

    private const string InsertQuotaSql = """
        INSERT INTO quotas (
            quota_id, tenant_id, job_type, max_active, max_per_hour, burst_capacity,
            refill_rate, current_tokens, last_refill_at, current_active, current_hour_count,
            current_hour_start, paused, pause_reason, quota_ticket, created_at, updated_at, updated_by)
        VALUES (
            @quota_id, @tenant_id, @job_type, @max_active, @max_per_hour, @burst_capacity,
            @refill_rate, @current_tokens, @last_refill_at, @current_active, @current_hour_count,
            @current_hour_start, @paused, @pause_reason, @quota_ticket, @created_at, @updated_at, @updated_by)
        """;

    private const string UpdateQuotaSql = """
        UPDATE quotas
        SET job_type = @job_type,
            max_active = @max_active,
            max_per_hour = @max_per_hour,
            burst_capacity = @burst_capacity,
            refill_rate = @refill_rate,
            current_tokens = @current_tokens,
            last_refill_at = @last_refill_at,
            current_active = @current_active,
            current_hour_count = @current_hour_count,
            current_hour_start = @current_hour_start,
            paused = @paused,
            pause_reason = @pause_reason,
            quota_ticket = @quota_ticket,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string UpdateStateSql = """
        UPDATE quotas
        SET current_tokens = @current_tokens,
            last_refill_at = @last_refill_at,
            current_active = @current_active,
            current_hour_count = @current_hour_count,
            current_hour_start = @current_hour_start,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string PauseQuotaSql = """
        UPDATE quotas
        SET paused = TRUE,
            pause_reason = @pause_reason,
            quota_ticket = @quota_ticket,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string ResumeQuotaSql = """
        UPDATE quotas
        SET paused = FALSE,
            pause_reason = NULL,
            quota_ticket = NULL,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string IncrementActiveSql = """
        UPDATE quotas
        SET current_active = current_active + 1,
            updated_at = @updated_at
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string DecrementActiveSql = """
        UPDATE quotas
        SET current_active = GREATEST(current_active - 1, 0),
            updated_at = @updated_at
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private const string DeleteQuotaSql = """
        DELETE FROM quotas
        WHERE tenant_id = @tenant_id AND quota_id = @quota_id
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresQuotaRepository> _logger;

    public PostgresQuotaRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresQuotaRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

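    // Editorial note (not part of the original commit): the column set models
    // a token bucket layered over hourly and concurrency caps. Assuming
    // refill_rate is tokens per second, a refill step would be:
    //
    //     var elapsed = (now - quota.LastRefillAt).TotalSeconds;
    //     var tokens = Math.Min(
    //         quota.BurstCapacity,
    //         quota.CurrentTokens + elapsed * quota.RefillRate);
    //
    // Admission then requires tokens >= 1, current_active < max_active, and
    // current_hour_count < max_per_hour, with UpdateStateSql persisting the
    // post-admission state.
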
    public async Task<Quota?> GetByIdAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("quota_id", quotaId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapQuota(reader);
    }

    public async Task<Quota?> GetByTenantAndJobTypeAsync(string tenantId, string? jobType, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByTenantAndJobTypeSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("job_type", (object?)jobType ?? DBNull.Value);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapQuota(reader);
    }

    public async Task CreateAsync(Quota quota, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(quota.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertQuotaSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddQuotaParameters(command, quota);

        try
        {
            await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            OrchestratorMetrics.QuotaCreated(quota.TenantId, quota.JobType);
        }
        catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
        {
            _logger.LogWarning("Duplicate quota for tenant {TenantId} job type {JobType}", quota.TenantId, quota.JobType);
            throw new DuplicateQuotaException(quota.TenantId, quota.JobType, ex);
        }
    }

    public async Task UpdateAsync(Quota quota, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(quota.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateQuotaSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", quota.TenantId);
        command.Parameters.AddWithValue("quota_id", quota.QuotaId);
        command.Parameters.AddWithValue("job_type", (object?)quota.JobType ?? DBNull.Value);
        command.Parameters.AddWithValue("max_active", quota.MaxActive);
        command.Parameters.AddWithValue("max_per_hour", quota.MaxPerHour);
        command.Parameters.AddWithValue("burst_capacity", quota.BurstCapacity);
        command.Parameters.AddWithValue("refill_rate", quota.RefillRate);
        command.Parameters.AddWithValue("current_tokens", quota.CurrentTokens);
        command.Parameters.AddWithValue("last_refill_at", quota.LastRefillAt);
        command.Parameters.AddWithValue("current_active", quota.CurrentActive);
        command.Parameters.AddWithValue("current_hour_count", quota.CurrentHourCount);
        command.Parameters.AddWithValue("current_hour_start", quota.CurrentHourStart);
|
||||
command.Parameters.AddWithValue("paused", quota.Paused);
|
||||
command.Parameters.AddWithValue("pause_reason", (object?)quota.PauseReason ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("quota_ticket", (object?)quota.QuotaTicket ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("updated_at", quota.UpdatedAt);
|
||||
command.Parameters.AddWithValue("updated_by", quota.UpdatedBy);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows == 0)
|
||||
{
|
||||
_logger.LogWarning("Quota not found for update: {QuotaId}", quota.QuotaId);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task UpdateStateAsync(
|
||||
string tenantId,
|
||||
Guid quotaId,
|
||||
double currentTokens,
|
||||
DateTimeOffset lastRefillAt,
|
||||
int currentActive,
|
||||
int currentHourCount,
|
||||
DateTimeOffset currentHourStart,
|
||||
string updatedBy,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(UpdateStateSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("quota_id", quotaId);
|
||||
command.Parameters.AddWithValue("current_tokens", currentTokens);
|
||||
command.Parameters.AddWithValue("last_refill_at", lastRefillAt);
|
||||
command.Parameters.AddWithValue("current_active", currentActive);
|
||||
command.Parameters.AddWithValue("current_hour_count", currentHourCount);
|
||||
command.Parameters.AddWithValue("current_hour_start", currentHourStart);
|
||||
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
|
||||
command.Parameters.AddWithValue("updated_by", updatedBy);
|
||||
|
||||
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public async Task PauseAsync(string tenantId, Guid quotaId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(PauseQuotaSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("quota_id", quotaId);
|
||||
command.Parameters.AddWithValue("pause_reason", reason);
|
||||
command.Parameters.AddWithValue("quota_ticket", (object?)ticket ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
|
||||
command.Parameters.AddWithValue("updated_by", updatedBy);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows > 0)
|
||||
{
|
||||
OrchestratorMetrics.QuotaPaused(tenantId);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task ResumeAsync(string tenantId, Guid quotaId, string updatedBy, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(ResumeQuotaSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("quota_id", quotaId);
|
||||
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
|
||||
command.Parameters.AddWithValue("updated_by", updatedBy);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows > 0)
|
||||
{
|
||||
OrchestratorMetrics.QuotaResumed(tenantId);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task IncrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(IncrementActiveSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("quota_id", quotaId);
|
||||
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
|
||||
|
||||
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public async Task DecrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(DecrementActiveSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("quota_id", quotaId);
|
||||
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
|
||||
|
||||
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<Quota>> ListAsync(
|
||||
string tenantId,
|
||||
string? jobType,
|
||||
bool? paused,
|
||||
int limit,
|
||||
int offset,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var (sql, parameters) = BuildListQuery(tenantId, jobType, paused, limit, offset);
|
||||
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(sql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
foreach (var (name, value) in parameters)
|
||||
{
|
||||
command.Parameters.AddWithValue(name, value);
|
||||
}
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
var quotas = new List<Quota>();
|
||||
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
quotas.Add(MapQuota(reader));
|
||||
}
|
||||
return quotas;
|
||||
}
|
||||
|
||||
public async Task<bool> DeleteAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(DeleteQuotaSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("quota_id", quotaId);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
return rows > 0;
|
||||
}
|
||||
|
||||
private static void AddQuotaParameters(NpgsqlCommand command, Quota quota)
|
||||
{
|
||||
command.Parameters.AddWithValue("quota_id", quota.QuotaId);
|
||||
command.Parameters.AddWithValue("tenant_id", quota.TenantId);
|
||||
command.Parameters.AddWithValue("job_type", (object?)quota.JobType ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("max_active", quota.MaxActive);
|
||||
command.Parameters.AddWithValue("max_per_hour", quota.MaxPerHour);
|
||||
command.Parameters.AddWithValue("burst_capacity", quota.BurstCapacity);
|
||||
command.Parameters.AddWithValue("refill_rate", quota.RefillRate);
|
||||
command.Parameters.AddWithValue("current_tokens", quota.CurrentTokens);
|
||||
command.Parameters.AddWithValue("last_refill_at", quota.LastRefillAt);
|
||||
command.Parameters.AddWithValue("current_active", quota.CurrentActive);
|
||||
command.Parameters.AddWithValue("current_hour_count", quota.CurrentHourCount);
|
||||
command.Parameters.AddWithValue("current_hour_start", quota.CurrentHourStart);
|
||||
command.Parameters.AddWithValue("paused", quota.Paused);
|
||||
command.Parameters.AddWithValue("pause_reason", (object?)quota.PauseReason ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("quota_ticket", (object?)quota.QuotaTicket ?? DBNull.Value);
|
||||
command.Parameters.AddWithValue("created_at", quota.CreatedAt);
|
||||
command.Parameters.AddWithValue("updated_at", quota.UpdatedAt);
|
||||
command.Parameters.AddWithValue("updated_by", quota.UpdatedBy);
|
||||
}
|
||||
|
||||
private static Quota MapQuota(NpgsqlDataReader reader)
|
||||
{
|
||||
return new Quota(
|
||||
QuotaId: reader.GetGuid(0),
|
||||
TenantId: reader.GetString(1),
|
||||
JobType: reader.IsDBNull(2) ? null : reader.GetString(2),
|
||||
MaxActive: reader.GetInt32(3),
|
||||
MaxPerHour: reader.GetInt32(4),
|
||||
BurstCapacity: reader.GetInt32(5),
|
||||
RefillRate: reader.GetDouble(6),
|
||||
CurrentTokens: reader.GetDouble(7),
|
||||
LastRefillAt: reader.GetFieldValue<DateTimeOffset>(8),
|
||||
CurrentActive: reader.GetInt32(9),
|
||||
CurrentHourCount: reader.GetInt32(10),
|
||||
CurrentHourStart: reader.GetFieldValue<DateTimeOffset>(11),
|
||||
Paused: reader.GetBoolean(12),
|
||||
PauseReason: reader.IsDBNull(13) ? null : reader.GetString(13),
|
||||
QuotaTicket: reader.IsDBNull(14) ? null : reader.GetString(14),
|
||||
CreatedAt: reader.GetFieldValue<DateTimeOffset>(15),
|
||||
UpdatedAt: reader.GetFieldValue<DateTimeOffset>(16),
|
||||
UpdatedBy: reader.GetString(17));
|
||||
}
|
||||
|
||||
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
|
||||
string tenantId,
|
||||
string? jobType,
|
||||
bool? paused,
|
||||
int limit,
|
||||
int offset)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
sb.Append($"SELECT {SelectQuotaColumns} FROM quotas WHERE tenant_id = @tenant_id");
|
||||
|
||||
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
|
||||
|
||||
if (jobType is not null)
|
||||
{
|
||||
sb.Append(" AND job_type = @job_type");
|
||||
parameters.Add(("job_type", jobType));
|
||||
}
|
||||
|
||||
if (paused.HasValue)
|
||||
{
|
||||
sb.Append(" AND paused = @paused");
|
||||
parameters.Add(("paused", paused.Value));
|
||||
}
|
||||
|
||||
sb.Append(" ORDER BY job_type NULLS FIRST LIMIT @limit OFFSET @offset");
|
||||
parameters.Add(("limit", limit));
|
||||
parameters.Add(("offset", offset));
|
||||
|
||||
return (sb.ToString(), parameters);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when attempting to create a duplicate quota.
|
||||
/// </summary>
|
||||
public sealed class DuplicateQuotaException : Exception
|
||||
{
|
||||
public string TenantId { get; }
|
||||
public string? JobType { get; }
|
||||
|
||||
public DuplicateQuotaException(string tenantId, string? jobType, Exception innerException)
|
||||
: base($"Quota for tenant '{tenantId}' and job type '{jobType ?? "(all)"}' already exists.", innerException)
|
||||
{
|
||||
TenantId = tenantId;
|
||||
JobType = jobType;
|
||||
}
|
||||
}
|
||||
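
// Illustrative usage sketch (not part of the commit): create a quota and fall
// back to the existing row when the (tenant_id, job_type) pair is already taken.
// `repository` and `quota` are hypothetical local names.
//
//     try
//     {
//         await repository.CreateAsync(quota, cancellationToken);
//     }
//     catch (DuplicateQuotaException)
//     {
//         var existing = await repository.GetByTenantAndJobTypeAsync(
//             quota.TenantId, quota.JobType, cancellationToken);
//         // Merge the requested limits into `existing` and call UpdateAsync instead.
//     }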
@@ -0,0 +1,199 @@
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.DeadLetter;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of replay audit repository.
/// </summary>
public sealed class PostgresReplayAuditRepository : IReplayAuditRepository
{
    private const string SelectAuditColumns = """
        audit_id, tenant_id, entry_id, attempt_number,
        success, new_job_id, error_message,
        triggered_by, triggered_at, completed_at, initiated_by
        """;

    private const string SelectByEntrySql = $"""
        SELECT {SelectAuditColumns}
        FROM dead_letter_replay_audit
        WHERE tenant_id = @tenant_id AND entry_id = @entry_id
        ORDER BY attempt_number ASC
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectAuditColumns}
        FROM dead_letter_replay_audit
        WHERE tenant_id = @tenant_id AND audit_id = @audit_id
        """;

    private const string SelectByNewJobIdSql = $"""
        SELECT {SelectAuditColumns}
        FROM dead_letter_replay_audit
        WHERE tenant_id = @tenant_id AND new_job_id = @new_job_id
        """;

    private const string InsertAuditSql = """
        INSERT INTO dead_letter_replay_audit (
            audit_id, tenant_id, entry_id, attempt_number,
            success, new_job_id, error_message,
            triggered_by, triggered_at, completed_at, initiated_by)
        VALUES (
            @audit_id, @tenant_id, @entry_id, @attempt_number,
            @success, @new_job_id, @error_message,
            @triggered_by, @triggered_at, @completed_at, @initiated_by)
        """;

    private const string UpdateAuditSql = """
        UPDATE dead_letter_replay_audit
        SET success = @success,
            new_job_id = @new_job_id,
            error_message = @error_message,
            completed_at = @completed_at
        WHERE tenant_id = @tenant_id AND audit_id = @audit_id
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresReplayAuditRepository> _logger;

    public PostgresReplayAuditRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresReplayAuditRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<IReadOnlyList<ReplayAuditRecord>> GetByEntryAsync(
        string tenantId,
        Guid entryId,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByEntrySql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("entry_id", entryId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var records = new List<ReplayAuditRecord>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            records.Add(MapRecord(reader));
        }

        return records;
    }

    public async Task<ReplayAuditRecord?> GetByIdAsync(
        string tenantId,
        Guid auditId,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("audit_id", auditId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapRecord(reader);
    }

    public async Task<ReplayAuditRecord?> GetByNewJobIdAsync(
        string tenantId,
        Guid newJobId,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByNewJobIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("new_job_id", newJobId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapRecord(reader);
    }

    public async Task CreateAsync(
        ReplayAuditRecord record,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(record.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertAuditSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddParameters(command, record);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        OrchestratorMetrics.DeadLetterReplayAttempted(record.TenantId, record.TriggeredBy);
    }

    public async Task<bool> UpdateAsync(
        ReplayAuditRecord record,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(record.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateAuditSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", record.TenantId);
        command.Parameters.AddWithValue("audit_id", record.AuditId);
        command.Parameters.AddWithValue("success", record.Success);
        command.Parameters.AddWithValue("new_job_id", (object?)record.NewJobId ?? DBNull.Value);
        command.Parameters.AddWithValue("error_message", (object?)record.ErrorMessage ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)record.CompletedAt ?? DBNull.Value);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

        if (rows > 0 && record.Success)
        {
            OrchestratorMetrics.DeadLetterReplaySucceeded(record.TenantId);
        }
        else if (rows > 0 && !record.Success)
        {
            OrchestratorMetrics.DeadLetterReplayFailed(record.TenantId);
        }

        return rows > 0;
    }

    private static void AddParameters(NpgsqlCommand command, ReplayAuditRecord record)
    {
        command.Parameters.AddWithValue("audit_id", record.AuditId);
        command.Parameters.AddWithValue("tenant_id", record.TenantId);
        command.Parameters.AddWithValue("entry_id", record.EntryId);
        command.Parameters.AddWithValue("attempt_number", record.AttemptNumber);
        command.Parameters.AddWithValue("success", record.Success);
        command.Parameters.AddWithValue("new_job_id", (object?)record.NewJobId ?? DBNull.Value);
        command.Parameters.AddWithValue("error_message", (object?)record.ErrorMessage ?? DBNull.Value);
        command.Parameters.AddWithValue("triggered_by", record.TriggeredBy);
        command.Parameters.AddWithValue("triggered_at", record.TriggeredAt);
        command.Parameters.AddWithValue("completed_at", (object?)record.CompletedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("initiated_by", record.InitiatedBy);
    }

    private static ReplayAuditRecord MapRecord(NpgsqlDataReader reader) =>
        new(
            AuditId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            EntryId: reader.GetGuid(2),
            AttemptNumber: reader.GetInt32(3),
            Success: reader.GetBoolean(4),
            NewJobId: reader.IsDBNull(5) ? null : reader.GetGuid(5),
            ErrorMessage: reader.IsDBNull(6) ? null : reader.GetString(6),
            TriggeredBy: reader.GetString(7),
            TriggeredAt: reader.GetFieldValue<DateTimeOffset>(8),
            CompletedAt: reader.IsDBNull(9) ? null : reader.GetFieldValue<DateTimeOffset>(9),
            InitiatedBy: reader.GetString(10));
}
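
// Illustrative lifecycle sketch (not part of the commit): a replay inserts a
// pending audit row first, then stamps the outcome once the replayed job ends.
// `auditRepository`, `record`, `newJobId`, and `now` are hypothetical names.
//
//     await auditRepository.CreateAsync(record, cancellationToken);     // emits DeadLetterReplayAttempted
//     var completed = record with { Success = true, NewJobId = newJobId, CompletedAt = now };
//     await auditRepository.UpdateAsync(completed, cancellationToken);  // emits DeadLetterReplaySucceeded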
@@ -0,0 +1,388 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of run repository.
/// </summary>
public sealed class PostgresRunRepository : IRunRepository
{
    private const string SelectRunColumns = """
        run_id, tenant_id, project_id, source_id, run_type, status, correlation_id,
        total_jobs, completed_jobs, succeeded_jobs, failed_jobs, created_at,
        started_at, completed_at, created_by, metadata
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectRunColumns}
        FROM runs
        WHERE tenant_id = @tenant_id AND run_id = @run_id
        """;

    private const string InsertRunSql = """
        INSERT INTO runs (
            run_id, tenant_id, project_id, source_id, run_type, status, correlation_id,
            total_jobs, completed_jobs, succeeded_jobs, failed_jobs, created_at,
            started_at, completed_at, created_by, metadata)
        VALUES (
            @run_id, @tenant_id, @project_id, @source_id, @run_type, @status::run_status, @correlation_id,
            @total_jobs, @completed_jobs, @succeeded_jobs, @failed_jobs, @created_at,
            @started_at, @completed_at, @created_by, @metadata)
        """;

    private const string UpdateStatusSql = """
        UPDATE runs
        SET status = @status::run_status,
            total_jobs = @total_jobs,
            completed_jobs = @completed_jobs,
            succeeded_jobs = @succeeded_jobs,
            failed_jobs = @failed_jobs,
            started_at = @started_at,
            completed_at = @completed_at
        WHERE tenant_id = @tenant_id AND run_id = @run_id
        """;

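    // Records one job completion and derives the run's status atomically in a
    // single statement: the run stays 'running' until the final completion, then
    // becomes 'succeeded' (no failures), 'failed' (no successes), or
    // 'partially_succeeded' (a mix). RETURNING status lets the caller observe
    // the transition without a second round trip.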
    private const string IncrementJobCountsSql = """
        UPDATE runs
        SET completed_jobs = completed_jobs + 1,
            succeeded_jobs = CASE WHEN @succeeded THEN succeeded_jobs + 1 ELSE succeeded_jobs END,
            failed_jobs = CASE WHEN NOT @succeeded THEN failed_jobs + 1 ELSE failed_jobs END,
            started_at = COALESCE(started_at, @now),
            status = CASE
                WHEN completed_jobs + 1 >= total_jobs THEN
                    CASE
                        WHEN @succeeded AND failed_jobs = 0 THEN 'succeeded'::run_status
                        WHEN NOT @succeeded AND succeeded_jobs = 0 THEN 'failed'::run_status
                        ELSE 'partially_succeeded'::run_status
                    END
                ELSE 'running'::run_status
            END,
            completed_at = CASE WHEN completed_jobs + 1 >= total_jobs THEN @now ELSE completed_at END
        WHERE tenant_id = @tenant_id AND run_id = @run_id
        RETURNING status
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresRunRepository> _logger;

    public PostgresRunRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresRunRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<Run?> GetByIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("run_id", runId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapRun(reader);
    }

    public async Task CreateAsync(Run run, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(run.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertRunSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddRunParameters(command, run);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        OrchestratorMetrics.RunCreated(run.TenantId, run.RunType);
    }

    public async Task UpdateStatusAsync(
        string tenantId,
        Guid runId,
        RunStatus status,
        int totalJobs,
        int completedJobs,
        int succeededJobs,
        int failedJobs,
        DateTimeOffset? startedAt,
        DateTimeOffset? completedAt,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateStatusSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("run_id", runId);
        command.Parameters.AddWithValue("status", StatusToString(status));
        command.Parameters.AddWithValue("total_jobs", totalJobs);
        command.Parameters.AddWithValue("completed_jobs", completedJobs);
        command.Parameters.AddWithValue("succeeded_jobs", succeededJobs);
        command.Parameters.AddWithValue("failed_jobs", failedJobs);
        command.Parameters.AddWithValue("started_at", (object?)startedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)completedAt ?? DBNull.Value);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
    }

    public async Task IncrementJobCountsAsync(
        string tenantId,
        Guid runId,
        bool succeeded,
        CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(IncrementJobCountsSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("run_id", runId);
        command.Parameters.AddWithValue("succeeded", succeeded);
        command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            var newStatus = reader.GetString(0);
            if (newStatus is "succeeded" or "failed" or "partially_succeeded")
            {
                // Run just completed - fetch the full run so the metric can include its type.
                var run = await GetByIdAsync(tenantId, runId, cancellationToken).ConfigureAwait(false);
                if (run is not null)
                {
                    OrchestratorMetrics.RunCompleted(tenantId, run.RunType, newStatus);
                }
            }
        }
    }

    public async Task<IReadOnlyList<Run>> ListAsync(
        string tenantId,
        Guid? sourceId,
        string? runType,
        RunStatus? status,
        string? projectId,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildListQuery(tenantId, sourceId, runType, status, projectId, createdAfter, createdBefore, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var runs = new List<Run>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            runs.Add(MapRun(reader));
        }

        return runs;
    }

    public async Task<int> CountAsync(
        string tenantId,
        Guid? sourceId,
        string? runType,
        RunStatus? status,
        string? projectId,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildCountQuery(tenantId, sourceId, runType, status, projectId);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
        return Convert.ToInt32(result);
    }

    private static void AddRunParameters(NpgsqlCommand command, Run run)
    {
        command.Parameters.AddWithValue("run_id", run.RunId);
        command.Parameters.AddWithValue("tenant_id", run.TenantId);
        command.Parameters.AddWithValue("project_id", (object?)run.ProjectId ?? DBNull.Value);
        command.Parameters.AddWithValue("source_id", run.SourceId);
        command.Parameters.AddWithValue("run_type", run.RunType);
        command.Parameters.AddWithValue("status", StatusToString(run.Status));
        command.Parameters.AddWithValue("correlation_id", (object?)run.CorrelationId ?? DBNull.Value);
        command.Parameters.AddWithValue("total_jobs", run.TotalJobs);
        command.Parameters.AddWithValue("completed_jobs", run.CompletedJobs);
        command.Parameters.AddWithValue("succeeded_jobs", run.SucceededJobs);
        command.Parameters.AddWithValue("failed_jobs", run.FailedJobs);
        command.Parameters.AddWithValue("created_at", run.CreatedAt);
        command.Parameters.AddWithValue("started_at", (object?)run.StartedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("completed_at", (object?)run.CompletedAt ?? DBNull.Value);
        command.Parameters.AddWithValue("created_by", run.CreatedBy);
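        // metadata targets a jsonb column; the parameter type is pinned to
        // NpgsqlDbType.Jsonb because a plain string parameter is sent as text,
        // which PostgreSQL will not implicitly coerce to jsonb.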
        command.Parameters.Add(new NpgsqlParameter("metadata", NpgsqlDbType.Jsonb)
        {
            Value = (object?)run.Metadata ?? DBNull.Value
        });
    }

    private static Run MapRun(NpgsqlDataReader reader)
    {
        return new Run(
            RunId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            ProjectId: reader.IsDBNull(2) ? null : reader.GetString(2),
            SourceId: reader.GetGuid(3),
            RunType: reader.GetString(4),
            Status: ParseStatus(reader.GetString(5)),
            CorrelationId: reader.IsDBNull(6) ? null : reader.GetString(6),
            TotalJobs: reader.GetInt32(7),
            CompletedJobs: reader.GetInt32(8),
            SucceededJobs: reader.GetInt32(9),
            FailedJobs: reader.GetInt32(10),
            CreatedAt: reader.GetFieldValue<DateTimeOffset>(11),
            StartedAt: reader.IsDBNull(12) ? null : reader.GetFieldValue<DateTimeOffset>(12),
            CompletedAt: reader.IsDBNull(13) ? null : reader.GetFieldValue<DateTimeOffset>(13),
            CreatedBy: reader.GetString(14),
            Metadata: reader.IsDBNull(15) ? null : reader.GetString(15));
    }

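    // String round-trip for the PostgreSQL run_status enum; StatusToString and
    // ParseStatus must stay in sync with each other and with the database
    // type's labels.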
    private static string StatusToString(RunStatus status) => status switch
    {
        RunStatus.Pending => "pending",
        RunStatus.Running => "running",
        RunStatus.Succeeded => "succeeded",
        RunStatus.PartiallySucceeded => "partially_succeeded",
        RunStatus.Failed => "failed",
        RunStatus.Canceled => "canceled",
        _ => throw new ArgumentOutOfRangeException(nameof(status))
    };

    private static RunStatus ParseStatus(string status) => status switch
    {
        "pending" => RunStatus.Pending,
        "running" => RunStatus.Running,
        "succeeded" => RunStatus.Succeeded,
        "partially_succeeded" => RunStatus.PartiallySucceeded,
        "failed" => RunStatus.Failed,
        "canceled" => RunStatus.Canceled,
        _ => throw new ArgumentOutOfRangeException(nameof(status))
    };

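    // The WHERE clause is assembled dynamically, but every filter value travels
    // as a named parameter, so the generated SQL never embeds caller input.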
    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        Guid? sourceId,
        string? runType,
        RunStatus? status,
        string? projectId,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectRunColumns} FROM runs WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (sourceId.HasValue)
        {
            sb.Append(" AND source_id = @source_id");
            parameters.Add(("source_id", sourceId.Value));
        }

        if (!string.IsNullOrEmpty(runType))
        {
            sb.Append(" AND run_type = @run_type");
            parameters.Add(("run_type", runType));
        }

        if (status.HasValue)
        {
            sb.Append(" AND status = @status::run_status");
            parameters.Add(("status", StatusToString(status.Value)));
        }

        if (!string.IsNullOrEmpty(projectId))
        {
            sb.Append(" AND project_id = @project_id");
            parameters.Add(("project_id", projectId));
        }

        if (createdAfter.HasValue)
        {
            sb.Append(" AND created_at >= @created_after");
            parameters.Add(("created_after", createdAfter.Value));
        }

        if (createdBefore.HasValue)
        {
            sb.Append(" AND created_at < @created_before");
            parameters.Add(("created_before", createdBefore.Value));
        }

        sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }

    private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
        string tenantId,
        Guid? sourceId,
        string? runType,
        RunStatus? status,
        string? projectId)
    {
        var sb = new StringBuilder();
        sb.Append("SELECT COUNT(*) FROM runs WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (sourceId.HasValue)
        {
            sb.Append(" AND source_id = @source_id");
            parameters.Add(("source_id", sourceId.Value));
        }

        if (!string.IsNullOrEmpty(runType))
        {
            sb.Append(" AND run_type = @run_type");
            parameters.Add(("run_type", runType));
        }

        if (status.HasValue)
        {
            sb.Append(" AND status = @status::run_status");
            parameters.Add(("status", StatusToString(status.Value)));
        }

        if (!string.IsNullOrEmpty(projectId))
        {
            sb.Append(" AND project_id = @project_id");
            parameters.Add(("project_id", projectId));
        }

        return (sb.ToString(), parameters);
    }
}
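
// Illustrative usage sketch (not part of the commit): a worker reporting one
// finished job. The single UPDATE advances the counters, flips the status, and
// the repository emits the run-completion metric when the last job lands.
// `runRepository`, `tenantId`, and `runId` are hypothetical names.
//
//     await runRepository.IncrementJobCountsAsync(tenantId, runId, succeeded: true, cancellationToken);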
@@ -0,0 +1,314 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of source repository.
/// </summary>
public sealed class PostgresSourceRepository : ISourceRepository
{
    private const string SelectSourceColumns = """
        source_id, tenant_id, name, source_type, enabled, paused, pause_reason,
        pause_ticket, configuration, created_at, updated_at, updated_by
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectSourceColumns}
        FROM sources
        WHERE tenant_id = @tenant_id AND source_id = @source_id
        """;

    private const string SelectByNameSql = $"""
        SELECT {SelectSourceColumns}
        FROM sources
        WHERE tenant_id = @tenant_id AND name = @name
        """;

    private const string InsertSourceSql = """
        INSERT INTO sources (
            source_id, tenant_id, name, source_type, enabled, paused, pause_reason,
            pause_ticket, configuration, created_at, updated_at, updated_by)
        VALUES (
            @source_id, @tenant_id, @name, @source_type, @enabled, @paused, @pause_reason,
            @pause_ticket, @configuration, @created_at, @updated_at, @updated_by)
        """;

    private const string UpdateSourceSql = """
        UPDATE sources
        SET name = @name,
            source_type = @source_type,
            enabled = @enabled,
            paused = @paused,
            pause_reason = @pause_reason,
            pause_ticket = @pause_ticket,
            configuration = @configuration,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND source_id = @source_id
        """;

    private const string PauseSourceSql = """
        UPDATE sources
        SET paused = TRUE,
            pause_reason = @pause_reason,
            pause_ticket = @pause_ticket,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND source_id = @source_id
        """;

    private const string ResumeSourceSql = """
        UPDATE sources
        SET paused = FALSE,
            pause_reason = NULL,
            pause_ticket = NULL,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND source_id = @source_id
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresSourceRepository> _logger;

    public PostgresSourceRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresSourceRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<Source?> GetByIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapSource(reader);
    }

    public async Task<Source?> GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByNameSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("name", name);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapSource(reader);
    }

    public async Task CreateAsync(Source source, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(source.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertSourceSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddSourceParameters(command, source);

        try
        {
            await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            OrchestratorMetrics.SourceCreated(source.TenantId, source.SourceType);
        }
        catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
        {
            _logger.LogWarning("Duplicate source name: {Name}", source.Name);
            throw new DuplicateSourceException(source.Name, ex);
        }
    }

    public async Task UpdateAsync(Source source, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(source.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateSourceSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", source.TenantId);
        command.Parameters.AddWithValue("source_id", source.SourceId);
        command.Parameters.AddWithValue("name", source.Name);
        command.Parameters.AddWithValue("source_type", source.SourceType);
        command.Parameters.AddWithValue("enabled", source.Enabled);
        command.Parameters.AddWithValue("paused", source.Paused);
        command.Parameters.AddWithValue("pause_reason", (object?)source.PauseReason ?? DBNull.Value);
        command.Parameters.AddWithValue("pause_ticket", (object?)source.PauseTicket ?? DBNull.Value);
        command.Parameters.Add(new NpgsqlParameter("configuration", NpgsqlDbType.Jsonb)
        {
            Value = (object?)source.Configuration ?? DBNull.Value
        });
        command.Parameters.AddWithValue("updated_at", source.UpdatedAt);
        command.Parameters.AddWithValue("updated_by", source.UpdatedBy);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        if (rows == 0)
        {
            _logger.LogWarning("Source not found for update: {SourceId}", source.SourceId);
        }
    }

    public async Task PauseAsync(string tenantId, Guid sourceId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(PauseSourceSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);
        command.Parameters.AddWithValue("pause_reason", reason);
        command.Parameters.AddWithValue("pause_ticket", (object?)ticket ?? DBNull.Value);
        command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
        command.Parameters.AddWithValue("updated_by", updatedBy);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        if (rows > 0)
        {
            OrchestratorMetrics.SourcePaused(tenantId);
        }
    }

    public async Task ResumeAsync(string tenantId, Guid sourceId, string updatedBy, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(ResumeSourceSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);
        command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
        command.Parameters.AddWithValue("updated_by", updatedBy);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        if (rows > 0)
        {
            OrchestratorMetrics.SourceResumed(tenantId);
        }
    }

    public async Task<IReadOnlyList<Source>> ListAsync(
        string tenantId,
        string? sourceType,
        bool? enabled,
        int limit,
        int offset,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildListQuery(tenantId, sourceType, enabled, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var sources = new List<Source>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            sources.Add(MapSource(reader));
        }

        return sources;
    }

    private static void AddSourceParameters(NpgsqlCommand command, Source source)
    {
        command.Parameters.AddWithValue("source_id", source.SourceId);
        command.Parameters.AddWithValue("tenant_id", source.TenantId);
        command.Parameters.AddWithValue("name", source.Name);
        command.Parameters.AddWithValue("source_type", source.SourceType);
        command.Parameters.AddWithValue("enabled", source.Enabled);
        command.Parameters.AddWithValue("paused", source.Paused);
        command.Parameters.AddWithValue("pause_reason", (object?)source.PauseReason ?? DBNull.Value);
        command.Parameters.AddWithValue("pause_ticket", (object?)source.PauseTicket ?? DBNull.Value);
        command.Parameters.Add(new NpgsqlParameter("configuration", NpgsqlDbType.Jsonb)
        {
            Value = (object?)source.Configuration ?? DBNull.Value
        });
        command.Parameters.AddWithValue("created_at", source.CreatedAt);
        command.Parameters.AddWithValue("updated_at", source.UpdatedAt);
        command.Parameters.AddWithValue("updated_by", source.UpdatedBy);
    }

    private static Source MapSource(NpgsqlDataReader reader)
    {
        return new Source(
            SourceId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            Name: reader.GetString(2),
            SourceType: reader.GetString(3),
            Enabled: reader.GetBoolean(4),
            Paused: reader.GetBoolean(5),
            PauseReason: reader.IsDBNull(6) ? null : reader.GetString(6),
            PauseTicket: reader.IsDBNull(7) ? null : reader.GetString(7),
            Configuration: reader.IsDBNull(8) ? null : reader.GetString(8),
            CreatedAt: reader.GetFieldValue<DateTimeOffset>(9),
            UpdatedAt: reader.GetFieldValue<DateTimeOffset>(10),
            UpdatedBy: reader.GetString(11));
    }

    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        string? sourceType,
        bool? enabled,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectSourceColumns} FROM sources WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (!string.IsNullOrEmpty(sourceType))
        {
            sb.Append(" AND source_type = @source_type");
            parameters.Add(("source_type", sourceType));
        }

        if (enabled.HasValue)
        {
            sb.Append(" AND enabled = @enabled");
            parameters.Add(("enabled", enabled.Value));
        }

        sb.Append(" ORDER BY name LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }
}

/// <summary>
/// Exception thrown when attempting to create a source with a duplicate name.
/// </summary>
public sealed class DuplicateSourceException : Exception
{
    public string Name { get; }

    public DuplicateSourceException(string name, Exception innerException)
        : base($"Source with name '{name}' already exists.", innerException)
    {
        Name = name;
    }
}
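
// Illustrative usage sketch (not part of the commit): pausing a misbehaving
// source under an operator ticket, then resuming it after remediation.
// `sourceRepository` and the literal arguments are hypothetical.
//
//     await sourceRepository.PauseAsync(tenantId, sourceId, "upstream rate limiting", "OPS-1234", "ops@example.com", cancellationToken);
//     await sourceRepository.ResumeAsync(tenantId, sourceId, "ops@example.com", cancellationToken);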
@@ -0,0 +1,310 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of throttle repository.
/// </summary>
public sealed class PostgresThrottleRepository : IThrottleRepository
{
    private const string SelectThrottleColumns = """
        throttle_id, tenant_id, source_id, job_type, active, reason, ticket,
        created_at, expires_at, created_by
        """;

    private const string SelectByIdSql = $"""
        SELECT {SelectThrottleColumns}
        FROM throttles
        WHERE tenant_id = @tenant_id AND throttle_id = @throttle_id
        """;

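    // "Active" requires both the flag and an unexpired window relative to the
    // caller-supplied @now; the clock is passed in as a parameter rather than
    // read from the database.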
    private const string SelectActiveBySourceSql = $"""
        SELECT {SelectThrottleColumns}
        FROM throttles
        WHERE tenant_id = @tenant_id
          AND source_id = @source_id
          AND active = TRUE
          AND (expires_at IS NULL OR expires_at > @now)
        ORDER BY created_at DESC
        """;

    private const string SelectActiveByJobTypeSql = $"""
        SELECT {SelectThrottleColumns}
        FROM throttles
        WHERE tenant_id = @tenant_id
          AND job_type = @job_type
          AND active = TRUE
          AND (expires_at IS NULL OR expires_at > @now)
        ORDER BY created_at DESC
        """;

    private const string InsertThrottleSql = """
        INSERT INTO throttles (
            throttle_id, tenant_id, source_id, job_type, active, reason, ticket,
            created_at, expires_at, created_by)
        VALUES (
            @throttle_id, @tenant_id, @source_id, @job_type, @active, @reason, @ticket,
            @created_at, @expires_at, @created_by)
        """;

    private const string DeactivateSql = """
        UPDATE throttles
        SET active = FALSE
        WHERE tenant_id = @tenant_id AND throttle_id = @throttle_id
        """;

    private const string DeactivateBySourceSql = """
        UPDATE throttles
        SET active = FALSE
        WHERE tenant_id = @tenant_id AND source_id = @source_id AND active = TRUE
        """;

    private const string DeactivateByJobTypeSql = """
        UPDATE throttles
        SET active = FALSE
        WHERE tenant_id = @tenant_id AND job_type = @job_type AND active = TRUE
        """;

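    // Background sweep: unlike the other statements this one carries no tenant
    // filter, so a single pass retires expired throttles across all tenants.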
    private const string CleanupExpiredSql = """
        UPDATE throttles
        SET active = FALSE
        WHERE active = TRUE AND expires_at IS NOT NULL AND expires_at <= @now
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresThrottleRepository> _logger;

    public PostgresThrottleRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresThrottleRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<Throttle?> GetByIdAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("throttle_id", throttleId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapThrottle(reader);
    }

    public async Task<IReadOnlyList<Throttle>> GetActiveBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectActiveBySourceSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);
        command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var throttles = new List<Throttle>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            throttles.Add(MapThrottle(reader));
        }

        return throttles;
    }

    public async Task<IReadOnlyList<Throttle>> GetActiveByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectActiveByJobTypeSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("job_type", jobType);
        command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var throttles = new List<Throttle>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            throttles.Add(MapThrottle(reader));
        }

        return throttles;
    }

    public async Task CreateAsync(Throttle throttle, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(throttle.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertThrottleSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("throttle_id", throttle.ThrottleId);
        command.Parameters.AddWithValue("tenant_id", throttle.TenantId);
        command.Parameters.AddWithValue("source_id", (object?)throttle.SourceId ?? DBNull.Value);
        command.Parameters.AddWithValue("job_type", (object?)throttle.JobType ?? DBNull.Value);
        command.Parameters.AddWithValue("active", throttle.Active);
        command.Parameters.AddWithValue("reason", throttle.Reason);
        command.Parameters.AddWithValue("ticket", (object?)throttle.Ticket ?? DBNull.Value);
        command.Parameters.AddWithValue("created_at", throttle.CreatedAt);
        command.Parameters.AddWithValue("expires_at", (object?)throttle.ExpiresAt ?? DBNull.Value);
        command.Parameters.AddWithValue("created_by", throttle.CreatedBy);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        OrchestratorMetrics.ThrottleCreated(throttle.TenantId, throttle.Reason);
    }

    public async Task DeactivateAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(DeactivateSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("throttle_id", throttleId);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows > 0)
|
||||
{
|
||||
OrchestratorMetrics.ThrottleDeactivated(tenantId);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task DeactivateBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(DeactivateBySourceSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("source_id", sourceId);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows > 0)
|
||||
{
|
||||
_logger.LogInformation("Deactivated {Count} throttles for source {SourceId}", rows, sourceId);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task DeactivateByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken)
|
||||
{
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(DeactivateByJobTypeSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("tenant_id", tenantId);
|
||||
command.Parameters.AddWithValue("job_type", jobType);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows > 0)
|
||||
{
|
||||
_logger.LogInformation("Deactivated {Count} throttles for job type {JobType}", rows, jobType);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task<int> CleanupExpiredAsync(DateTimeOffset now, CancellationToken cancellationToken)
|
||||
{
|
||||
// Use system tenant for cross-tenant cleanup operations
|
||||
// In production, this should use a dedicated admin connection or be run by a background service
|
||||
await using var connection = await _dataSource.OpenConnectionAsync("system", "admin", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(CleanupExpiredSql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
command.Parameters.AddWithValue("now", now);
|
||||
|
||||
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
|
||||
if (rows > 0)
|
||||
{
|
||||
_logger.LogInformation("Cleaned up {Count} expired throttles", rows);
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<Throttle>> ListAsync(
|
||||
string tenantId,
|
||||
bool? active,
|
||||
Guid? sourceId,
|
||||
string? jobType,
|
||||
int limit,
|
||||
int offset,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var (sql, parameters) = BuildListQuery(tenantId, active, sourceId, jobType, limit, offset);
|
||||
|
||||
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
|
||||
await using var command = new NpgsqlCommand(sql, connection);
|
||||
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
|
||||
|
||||
foreach (var (name, value) in parameters)
|
||||
{
|
||||
command.Parameters.AddWithValue(name, value);
|
||||
}
|
||||
|
||||
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
|
||||
var throttles = new List<Throttle>();
|
||||
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
throttles.Add(MapThrottle(reader));
|
||||
}
|
||||
return throttles;
|
||||
}
|
||||
|
||||
private static Throttle MapThrottle(NpgsqlDataReader reader)
|
||||
{
|
||||
return new Throttle(
|
||||
ThrottleId: reader.GetGuid(0),
|
||||
TenantId: reader.GetString(1),
|
||||
SourceId: reader.IsDBNull(2) ? null : reader.GetGuid(2),
|
||||
JobType: reader.IsDBNull(3) ? null : reader.GetString(3),
|
||||
Active: reader.GetBoolean(4),
|
||||
Reason: reader.GetString(5),
|
||||
Ticket: reader.IsDBNull(6) ? null : reader.GetString(6),
|
||||
CreatedAt: reader.GetFieldValue<DateTimeOffset>(7),
|
||||
ExpiresAt: reader.IsDBNull(8) ? null : reader.GetFieldValue<DateTimeOffset>(8),
|
||||
CreatedBy: reader.GetString(9));
|
||||
}
|
||||
|
||||
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
|
||||
string tenantId,
|
||||
bool? active,
|
||||
Guid? sourceId,
|
||||
string? jobType,
|
||||
int limit,
|
||||
int offset)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
sb.Append($"SELECT {SelectThrottleColumns} FROM throttles WHERE tenant_id = @tenant_id");
|
||||
|
||||
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
|
||||
|
||||
if (active.HasValue)
|
||||
{
|
||||
sb.Append(" AND active = @active");
|
||||
parameters.Add(("active", active.Value));
|
||||
}
|
||||
|
||||
if (sourceId.HasValue)
|
||||
{
|
||||
sb.Append(" AND source_id = @source_id");
|
||||
parameters.Add(("source_id", sourceId.Value));
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(jobType))
|
||||
{
|
||||
sb.Append(" AND job_type = @job_type");
|
||||
parameters.Add(("job_type", jobType));
|
||||
}
|
||||
|
||||
sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
|
||||
parameters.Add(("limit", limit));
|
||||
parameters.Add(("offset", offset));
|
||||
|
||||
return (sb.ToString(), parameters);
|
||||
}
|
||||
}
|
||||
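// Usage sketch: how a dispatcher might consult throttles before releasing
// work. Only the IThrottleRepository surface declared later in this commit
// is assumed; the ThrottleGateExample wrapper itself is hypothetical.
internal static class ThrottleGateExample
{
    // A job is dispatchable only when no active throttle matches its
    // source or its job type.
    internal static async Task<bool> IsDispatchableAsync(
        IThrottleRepository throttles,
        string tenantId,
        Guid sourceId,
        string jobType,
        CancellationToken ct)
    {
        var bySource = await throttles.GetActiveBySourceAsync(tenantId, sourceId, ct);
        if (bySource.Count > 0)
        {
            return false;
        }

        var byJobType = await throttles.GetActiveByJobTypeAsync(tenantId, jobType, ct);
        return byJobType.Count == 0;
    }
}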
@@ -0,0 +1,386 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure.Postgres;

/// <summary>
/// PostgreSQL implementation of watermark repository.
/// </summary>
public sealed class PostgresWatermarkRepository : IWatermarkRepository
{
    private const string SelectWatermarkColumns = """
        watermark_id, tenant_id, source_id, job_type, scope_key,
        high_watermark, low_watermark, sequence_number, processed_count,
        last_batch_hash, created_at, updated_at, updated_by
        """;

    private const string SelectByScopeKeySql = $"""
        SELECT {SelectWatermarkColumns}
        FROM watermarks
        WHERE tenant_id = @tenant_id AND scope_key = @scope_key
        """;

    private const string SelectBySourceIdSql = $"""
        SELECT {SelectWatermarkColumns}
        FROM watermarks
        WHERE tenant_id = @tenant_id AND source_id = @source_id AND job_type IS NULL
        """;

    private const string SelectByJobTypeSql = $"""
        SELECT {SelectWatermarkColumns}
        FROM watermarks
        WHERE tenant_id = @tenant_id AND job_type = @job_type AND source_id IS NULL
        """;

    private const string SelectBySourceAndJobTypeSql = $"""
        SELECT {SelectWatermarkColumns}
        FROM watermarks
        WHERE tenant_id = @tenant_id AND source_id = @source_id AND job_type = @job_type
        """;

    private const string InsertWatermarkSql = """
        INSERT INTO watermarks (
            watermark_id, tenant_id, source_id, job_type, scope_key,
            high_watermark, low_watermark, sequence_number, processed_count,
            last_batch_hash, created_at, updated_at, updated_by)
        VALUES (
            @watermark_id, @tenant_id, @source_id, @job_type, @scope_key,
            @high_watermark, @low_watermark, @sequence_number, @processed_count,
            @last_batch_hash, @created_at, @updated_at, @updated_by)
        """;

    private const string UpdateWatermarkSql = """
        UPDATE watermarks
        SET high_watermark = @high_watermark,
            low_watermark = @low_watermark,
            sequence_number = @sequence_number,
            processed_count = @processed_count,
            last_batch_hash = @last_batch_hash,
            updated_at = @updated_at,
            updated_by = @updated_by
        WHERE tenant_id = @tenant_id AND watermark_id = @watermark_id
          AND sequence_number = @expected_sequence_number
        """;

    private const string UpsertWatermarkSql = """
        INSERT INTO watermarks (
            watermark_id, tenant_id, source_id, job_type, scope_key,
            high_watermark, low_watermark, sequence_number, processed_count,
            last_batch_hash, created_at, updated_at, updated_by)
        VALUES (
            @watermark_id, @tenant_id, @source_id, @job_type, @scope_key,
            @high_watermark, @low_watermark, @sequence_number, @processed_count,
            @last_batch_hash, @created_at, @updated_at, @updated_by)
        ON CONFLICT (tenant_id, scope_key) DO UPDATE
        SET high_watermark = EXCLUDED.high_watermark,
            low_watermark = EXCLUDED.low_watermark,
            sequence_number = EXCLUDED.sequence_number,
            processed_count = EXCLUDED.processed_count,
            last_batch_hash = EXCLUDED.last_batch_hash,
            updated_at = EXCLUDED.updated_at,
            updated_by = EXCLUDED.updated_by
        """;

    private const string DeleteWatermarkSql = """
        DELETE FROM watermarks
        WHERE tenant_id = @tenant_id AND scope_key = @scope_key
        """;

    private const string SelectLaggingSql = $"""
        SELECT {SelectWatermarkColumns}
        FROM watermarks
        WHERE tenant_id = @tenant_id
          AND high_watermark < @lag_threshold
        ORDER BY high_watermark ASC
        LIMIT @limit
        """;

    private readonly OrchestratorDataSource _dataSource;
    private readonly ILogger<PostgresWatermarkRepository> _logger;

    public PostgresWatermarkRepository(
        OrchestratorDataSource dataSource,
        ILogger<PostgresWatermarkRepository> logger)
    {
        _dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    public async Task<Watermark?> GetByScopeKeyAsync(string tenantId, string scopeKey, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByScopeKeySql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("scope_key", scopeKey);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapWatermark(reader);
    }

    public async Task<Watermark?> GetBySourceIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectBySourceIdSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapWatermark(reader);
    }

    public async Task<Watermark?> GetByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectByJobTypeSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("job_type", jobType);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapWatermark(reader);
    }

    public async Task<Watermark?> GetBySourceAndJobTypeAsync(string tenantId, Guid sourceId, string jobType, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectBySourceAndJobTypeSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("source_id", sourceId);
        command.Parameters.AddWithValue("job_type", jobType);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            return null;
        }

        return MapWatermark(reader);
    }

    public async Task CreateAsync(Watermark watermark, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(watermark.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(InsertWatermarkSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddWatermarkParameters(command, watermark);

        try
        {
            await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
            OrchestratorMetrics.WatermarkCreated(watermark.TenantId, watermark.ScopeKey);
        }
        catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
        {
            _logger.LogWarning("Duplicate watermark for tenant {TenantId} scope {ScopeKey}", watermark.TenantId, watermark.ScopeKey);
            throw new DuplicateWatermarkException(watermark.TenantId, watermark.ScopeKey, ex);
        }
    }

    public async Task<bool> UpdateAsync(Watermark watermark, long expectedSequenceNumber, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(watermark.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpdateWatermarkSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", watermark.TenantId);
        command.Parameters.AddWithValue("watermark_id", watermark.WatermarkId);
        command.Parameters.AddWithValue("high_watermark", watermark.HighWatermark);
        command.Parameters.AddWithValue("low_watermark", (object?)watermark.LowWatermark ?? DBNull.Value);
        command.Parameters.AddWithValue("sequence_number", watermark.SequenceNumber);
        command.Parameters.AddWithValue("processed_count", watermark.ProcessedCount);
        command.Parameters.AddWithValue("last_batch_hash", (object?)watermark.LastBatchHash ?? DBNull.Value);
        command.Parameters.AddWithValue("updated_at", watermark.UpdatedAt);
        command.Parameters.AddWithValue("updated_by", watermark.UpdatedBy);
        command.Parameters.AddWithValue("expected_sequence_number", expectedSequenceNumber);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);

        if (rows > 0)
        {
            OrchestratorMetrics.WatermarkAdvanced(watermark.TenantId, watermark.ScopeKey);
        }

        return rows > 0;
    }

    public async Task UpsertAsync(Watermark watermark, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(watermark.TenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(UpsertWatermarkSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        AddWatermarkParameters(command, watermark);

        await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        OrchestratorMetrics.WatermarkAdvanced(watermark.TenantId, watermark.ScopeKey);
    }

    public async Task<IReadOnlyList<Watermark>> ListAsync(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset,
        CancellationToken cancellationToken)
    {
        var (sql, parameters) = BuildListQuery(tenantId, sourceId, jobType, limit, offset);

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(sql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        foreach (var (name, value) in parameters)
        {
            command.Parameters.AddWithValue(name, value);
        }

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var watermarks = new List<Watermark>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            watermarks.Add(MapWatermark(reader));
        }
        return watermarks;
    }

    public async Task<IReadOnlyList<Watermark>> GetLaggingAsync(
        string tenantId,
        TimeSpan lagThreshold,
        int limit,
        CancellationToken cancellationToken)
    {
        var thresholdTime = DateTimeOffset.UtcNow - lagThreshold;

        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(SelectLaggingSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("lag_threshold", thresholdTime);
        command.Parameters.AddWithValue("limit", limit);

        await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
        var watermarks = new List<Watermark>();
        while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
        {
            watermarks.Add(MapWatermark(reader));
        }
        return watermarks;
    }

    public async Task<bool> DeleteAsync(string tenantId, string scopeKey, CancellationToken cancellationToken)
    {
        await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
        await using var command = new NpgsqlCommand(DeleteWatermarkSql, connection);
        command.CommandTimeout = _dataSource.CommandTimeoutSeconds;

        command.Parameters.AddWithValue("tenant_id", tenantId);
        command.Parameters.AddWithValue("scope_key", scopeKey);

        var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
        return rows > 0;
    }

    private static void AddWatermarkParameters(NpgsqlCommand command, Watermark watermark)
    {
        command.Parameters.AddWithValue("watermark_id", watermark.WatermarkId);
        command.Parameters.AddWithValue("tenant_id", watermark.TenantId);
        command.Parameters.AddWithValue("source_id", (object?)watermark.SourceId ?? DBNull.Value);
        command.Parameters.AddWithValue("job_type", (object?)watermark.JobType ?? DBNull.Value);
        command.Parameters.AddWithValue("scope_key", watermark.ScopeKey);
        command.Parameters.AddWithValue("high_watermark", watermark.HighWatermark);
        command.Parameters.AddWithValue("low_watermark", (object?)watermark.LowWatermark ?? DBNull.Value);
        command.Parameters.AddWithValue("sequence_number", watermark.SequenceNumber);
        command.Parameters.AddWithValue("processed_count", watermark.ProcessedCount);
        command.Parameters.AddWithValue("last_batch_hash", (object?)watermark.LastBatchHash ?? DBNull.Value);
        command.Parameters.AddWithValue("created_at", watermark.CreatedAt);
        command.Parameters.AddWithValue("updated_at", watermark.UpdatedAt);
        command.Parameters.AddWithValue("updated_by", watermark.UpdatedBy);
    }

    private static Watermark MapWatermark(NpgsqlDataReader reader)
    {
        return new Watermark(
            WatermarkId: reader.GetGuid(0),
            TenantId: reader.GetString(1),
            SourceId: reader.IsDBNull(2) ? null : reader.GetGuid(2),
            JobType: reader.IsDBNull(3) ? null : reader.GetString(3),
            ScopeKey: reader.GetString(4),
            HighWatermark: reader.GetFieldValue<DateTimeOffset>(5),
            LowWatermark: reader.IsDBNull(6) ? null : reader.GetFieldValue<DateTimeOffset>(6),
            SequenceNumber: reader.GetInt64(7),
            ProcessedCount: reader.GetInt64(8),
            LastBatchHash: reader.IsDBNull(9) ? null : reader.GetString(9),
            CreatedAt: reader.GetFieldValue<DateTimeOffset>(10),
            UpdatedAt: reader.GetFieldValue<DateTimeOffset>(11),
            UpdatedBy: reader.GetString(12));
    }

    private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset)
    {
        var sb = new StringBuilder();
        sb.Append($"SELECT {SelectWatermarkColumns} FROM watermarks WHERE tenant_id = @tenant_id");

        var parameters = new List<(string, object)> { ("tenant_id", tenantId) };

        if (sourceId.HasValue)
        {
            sb.Append(" AND source_id = @source_id");
            parameters.Add(("source_id", sourceId.Value));
        }

        if (jobType is not null)
        {
            sb.Append(" AND job_type = @job_type");
            parameters.Add(("job_type", jobType));
        }

        sb.Append(" ORDER BY updated_at DESC LIMIT @limit OFFSET @offset");
        parameters.Add(("limit", limit));
        parameters.Add(("offset", offset));

        return (sb.ToString(), parameters);
    }
}

/// <summary>
/// Exception thrown when attempting to create a duplicate watermark.
/// </summary>
public sealed class DuplicateWatermarkException : Exception
{
    public string TenantId { get; }
    public string ScopeKey { get; }

    public DuplicateWatermarkException(string tenantId, string scopeKey, Exception innerException)
        : base($"Watermark for tenant '{tenantId}' and scope '{scopeKey}' already exists.", innerException)
    {
        TenantId = tenantId;
        ScopeKey = scopeKey;
    }
}
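// Usage sketch: advancing a watermark under the optimistic-concurrency
// contract of UpdateAsync (the WHERE clause on @expected_sequence_number).
// Assumes Watermark is a positional record, as its construction in
// MapWatermark suggests; the bounded retry policy is illustrative.
internal static class WatermarkAdvanceExample
{
    internal static async Task<bool> TryAdvanceAsync(
        IWatermarkRepository watermarks,
        string tenantId,
        string scopeKey,
        DateTimeOffset newHighWatermark,
        string updatedBy,
        CancellationToken ct)
    {
        for (var attempt = 0; attempt < 3; attempt++)
        {
            var current = await watermarks.GetByScopeKeyAsync(tenantId, scopeKey, ct);
            if (current is null)
            {
                return false;
            }

            var advanced = current with
            {
                HighWatermark = newHighWatermark,
                SequenceNumber = current.SequenceNumber + 1,
                UpdatedAt = DateTimeOffset.UtcNow,
                UpdatedBy = updatedBy
            };

            // UpdateAsync returns false when another writer advanced the
            // sequence first; reload and retry with the fresh sequence.
            if (await watermarks.UpdateAsync(advanced, current.SequenceNumber, ct))
            {
                return true;
            }
        }

        return false;
    }
}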
@@ -0,0 +1,61 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for artifact persistence operations.
/// </summary>
public interface IArtifactRepository
{
    /// <summary>
    /// Gets an artifact by ID.
    /// </summary>
    Task<Artifact?> GetByIdAsync(string tenantId, Guid artifactId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets artifacts by job ID.
    /// </summary>
    Task<IReadOnlyList<Artifact>> GetByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets artifacts by run ID.
    /// </summary>
    Task<IReadOnlyList<Artifact>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets an artifact by its content digest.
    /// </summary>
    Task<Artifact?> GetByDigestAsync(string tenantId, string digest, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new artifact.
    /// </summary>
    Task CreateAsync(Artifact artifact, CancellationToken cancellationToken);

    /// <summary>
    /// Creates multiple artifacts in a batch.
    /// </summary>
    Task CreateBatchAsync(IEnumerable<Artifact> artifacts, CancellationToken cancellationToken);

    /// <summary>
    /// Lists artifacts with pagination and filters.
    /// </summary>
    Task<IReadOnlyList<Artifact>> ListAsync(
        string tenantId,
        string? artifactType,
        string? jobType,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset,
        CancellationToken cancellationToken);

    /// <summary>
    /// Counts artifacts matching the filters.
    /// </summary>
    Task<int> CountAsync(
        string tenantId,
        string? artifactType,
        string? jobType,
        CancellationToken cancellationToken);
}
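// Usage sketch: content-addressed dedupe built on GetByDigestAsync. The
// helper is hypothetical; only the IArtifactRepository members above are
// taken as given.
internal static class ArtifactDedupeExample
{
    internal static async Task<Artifact> GetOrCreateAsync(
        IArtifactRepository artifacts,
        Artifact candidate,
        string tenantId,
        string digest,
        CancellationToken ct)
    {
        // Reuse an existing row with the same content digest instead of
        // inserting a duplicate.
        var existing = await artifacts.GetByDigestAsync(tenantId, digest, ct);
        if (existing is not null)
        {
            return existing;
        }

        await artifacts.CreateAsync(candidate, ct);
        return candidate;
    }
}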
@@ -0,0 +1,127 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository for audit log entries.
/// </summary>
public interface IAuditRepository
{
    /// <summary>
    /// Appends a new audit entry to the log.
    /// </summary>
    Task<AuditEntry> AppendAsync(
        string tenantId,
        AuditEventType eventType,
        string resourceType,
        Guid resourceId,
        string actorId,
        ActorType actorType,
        string description,
        string? oldState = null,
        string? newState = null,
        string? actorIp = null,
        string? userAgent = null,
        string? httpMethod = null,
        string? requestPath = null,
        string? correlationId = null,
        string? metadata = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets an audit entry by ID.
    /// </summary>
    Task<AuditEntry?> GetByIdAsync(
        string tenantId,
        Guid entryId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists audit entries with optional filters.
    /// </summary>
    Task<IReadOnlyList<AuditEntry>> ListAsync(
        string tenantId,
        AuditEventType? eventType = null,
        string? resourceType = null,
        Guid? resourceId = null,
        string? actorId = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets audit entries by sequence range.
    /// </summary>
    Task<IReadOnlyList<AuditEntry>> GetBySequenceRangeAsync(
        string tenantId,
        long startSequence,
        long endSequence,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the latest audit entry for a tenant.
    /// </summary>
    Task<AuditEntry?> GetLatestAsync(
        string tenantId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets audit entries for a specific resource.
    /// </summary>
    Task<IReadOnlyList<AuditEntry>> GetByResourceAsync(
        string tenantId,
        string resourceType,
        Guid resourceId,
        int limit = 100,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the count of audit entries.
    /// </summary>
    Task<long> GetCountAsync(
        string tenantId,
        AuditEventType? eventType = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Verifies the chain integrity for a range of entries.
    /// </summary>
    Task<ChainVerificationResult> VerifyChainAsync(
        string tenantId,
        long? startSequence = null,
        long? endSequence = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets audit summary statistics.
    /// </summary>
    Task<AuditSummary> GetSummaryAsync(
        string tenantId,
        DateTimeOffset? since = null,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Result of chain verification.
/// </summary>
public sealed record ChainVerificationResult(
    bool IsValid,
    Guid? InvalidEntryId,
    long? InvalidSequence,
    string? ErrorMessage);

/// <summary>
/// Audit summary statistics.
/// </summary>
public sealed record AuditSummary(
    long TotalEntries,
    long EntriesSince,
    long EventTypes,
    long UniqueActors,
    long UniqueResources,
    DateTimeOffset? EarliestEntry,
    DateTimeOffset? LatestEntry);
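// Usage sketch: a periodic integrity check over the hash-chained audit log.
// Only the IAuditRepository members and ChainVerificationResult shape above
// are assumed; the logging hook is illustrative and assumes
// using Microsoft.Extensions.Logging;.
internal static class AuditChainCheckExample
{
    internal static async Task CheckAsync(
        IAuditRepository audit,
        string tenantId,
        Microsoft.Extensions.Logging.ILogger logger,
        CancellationToken ct)
    {
        var result = await audit.VerifyChainAsync(tenantId, cancellationToken: ct);
        if (result.IsValid)
        {
            return;
        }

        // InvalidSequence identifies the first entry whose chain link broke.
        Microsoft.Extensions.Logging.LoggerExtensions.LogError(
            logger,
            "Audit chain for {TenantId} broken at sequence {Sequence} (entry {EntryId}): {Error}",
            tenantId, result.InvalidSequence, result.InvalidEntryId, result.ErrorMessage);
    }
}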
@@ -0,0 +1,200 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for backfill request persistence operations.
/// </summary>
public interface IBackfillRepository
{
    /// <summary>
    /// Gets a backfill request by ID.
    /// </summary>
    Task<BackfillRequest?> GetByIdAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new backfill request.
    /// </summary>
    Task CreateAsync(BackfillRequest request, CancellationToken cancellationToken);

    /// <summary>
    /// Updates a backfill request.
    /// </summary>
    Task UpdateAsync(BackfillRequest request, CancellationToken cancellationToken);

    /// <summary>
    /// Lists backfill requests with filters.
    /// </summary>
    Task<IReadOnlyList<BackfillRequest>> ListAsync(
        string tenantId,
        BackfillStatus? status,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset,
        CancellationToken cancellationToken);

    /// <summary>
    /// Checks for overlapping active backfills.
    /// </summary>
    Task<bool> HasOverlappingActiveAsync(
        string tenantId,
        string scopeKey,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        Guid? excludeBackfillId,
        CancellationToken cancellationToken);

    /// <summary>
    /// Gets running backfills for a scope.
    /// </summary>
    Task<IReadOnlyList<BackfillRequest>> GetActiveByScopeAsync(
        string tenantId,
        string scopeKey,
        CancellationToken cancellationToken);

    /// <summary>
    /// Counts backfill requests by status.
    /// </summary>
    Task<IDictionary<BackfillStatus, int>> CountByStatusAsync(
        string tenantId,
        CancellationToken cancellationToken);

    /// <summary>
    /// Gets the next backfill ready for processing.
    /// </summary>
    Task<BackfillRequest?> GetNextPendingAsync(string tenantId, CancellationToken cancellationToken);
}

/// <summary>
/// Repository interface for backfill checkpoint persistence.
/// </summary>
public interface IBackfillCheckpointRepository
{
    /// <summary>
    /// Gets the latest checkpoint for a backfill.
    /// </summary>
    Task<BackfillCheckpoint?> GetLatestAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets all checkpoints for a backfill.
    /// </summary>
    Task<IReadOnlyList<BackfillCheckpoint>> GetAllAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new checkpoint.
    /// </summary>
    Task CreateAsync(BackfillCheckpoint checkpoint, CancellationToken cancellationToken);

    /// <summary>
    /// Updates a checkpoint (e.g., mark complete).
    /// </summary>
    Task UpdateAsync(BackfillCheckpoint checkpoint, CancellationToken cancellationToken);
}

/// <summary>
/// Represents a backfill processing checkpoint.
/// </summary>
public sealed record BackfillCheckpoint(
    /// <summary>Unique checkpoint identifier.</summary>
    Guid CheckpointId,

    /// <summary>Tenant this checkpoint belongs to.</summary>
    string TenantId,

    /// <summary>Parent backfill request ID.</summary>
    Guid BackfillId,

    /// <summary>Batch sequence number.</summary>
    int BatchNumber,

    /// <summary>Start of batch time window.</summary>
    DateTimeOffset BatchStart,

    /// <summary>End of batch time window.</summary>
    DateTimeOffset BatchEnd,

    /// <summary>Total events in batch.</summary>
    int EventsInBatch,

    /// <summary>Events processed in batch.</summary>
    int EventsProcessed,

    /// <summary>Events skipped as duplicates.</summary>
    int EventsSkipped,

    /// <summary>Events that failed processing.</summary>
    int EventsFailed,

    /// <summary>Hash of the batch for integrity verification.</summary>
    string? BatchHash,

    /// <summary>When batch processing started.</summary>
    DateTimeOffset StartedAt,

    /// <summary>When batch processing completed.</summary>
    DateTimeOffset? CompletedAt,

    /// <summary>Error message if batch failed.</summary>
    string? ErrorMessage)
{
    /// <summary>
    /// Whether this checkpoint is complete.
    /// </summary>
    public bool IsComplete => CompletedAt.HasValue;

    /// <summary>
    /// Creates a new checkpoint for a batch.
    /// </summary>
    public static BackfillCheckpoint Create(
        string tenantId,
        Guid backfillId,
        int batchNumber,
        DateTimeOffset batchStart,
        DateTimeOffset batchEnd,
        int eventsInBatch)
    {
        return new BackfillCheckpoint(
            CheckpointId: Guid.NewGuid(),
            TenantId: tenantId,
            BackfillId: backfillId,
            BatchNumber: batchNumber,
            BatchStart: batchStart,
            BatchEnd: batchEnd,
            EventsInBatch: eventsInBatch,
            EventsProcessed: 0,
            EventsSkipped: 0,
            EventsFailed: 0,
            BatchHash: null,
            StartedAt: DateTimeOffset.UtcNow,
            CompletedAt: null,
            ErrorMessage: null);
    }

    /// <summary>
    /// Marks the checkpoint as complete.
    /// </summary>
    public BackfillCheckpoint Complete(int processed, int skipped, int failed, string? batchHash)
    {
        return this with
        {
            EventsProcessed = processed,
            EventsSkipped = skipped,
            EventsFailed = failed,
            BatchHash = batchHash,
            CompletedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Marks the checkpoint as failed.
    /// </summary>
    public BackfillCheckpoint Fail(string error)
    {
        return this with
        {
            CompletedAt = DateTimeOffset.UtcNow,
            ErrorMessage = error
        };
    }
}
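// Usage sketch: the intended checkpoint lifecycle — Create before a batch,
// persist, then Complete or Fail. Batch processing itself is elided; all
// members used here are defined above.
internal static class BackfillBatchExample
{
    internal static async Task RunBatchAsync(
        IBackfillCheckpointRepository checkpoints,
        string tenantId,
        Guid backfillId,
        int batchNumber,
        DateTimeOffset batchStart,
        DateTimeOffset batchEnd,
        int eventsInBatch,
        CancellationToken ct)
    {
        var checkpoint = BackfillCheckpoint.Create(
            tenantId, backfillId, batchNumber, batchStart, batchEnd, eventsInBatch);
        await checkpoints.CreateAsync(checkpoint, ct);

        try
        {
            // ... process the batch, accumulating processed/skipped/failed ...
            await checkpoints.UpdateAsync(
                checkpoint.Complete(processed: eventsInBatch, skipped: 0, failed: 0, batchHash: null),
                ct);
        }
        catch (Exception ex)
        {
            // Record the failure on the checkpoint before propagating.
            await checkpoints.UpdateAsync(checkpoint.Fail(ex.Message), ct);
            throw;
        }
    }
}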
@@ -0,0 +1,43 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for DAG edge persistence operations.
/// </summary>
public interface IDagEdgeRepository
{
    /// <summary>
    /// Creates a new DAG edge.
    /// </summary>
    Task CreateAsync(DagEdge edge, CancellationToken cancellationToken);

    /// <summary>
    /// Creates multiple DAG edges in a batch.
    /// </summary>
    Task CreateBatchAsync(IEnumerable<DagEdge> edges, CancellationToken cancellationToken);

    /// <summary>
    /// Gets all edges for a run.
    /// </summary>
    Task<IReadOnlyList<DagEdge>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets parent edges (incoming) for a job.
    /// </summary>
    Task<IReadOnlyList<DagEdge>> GetParentEdgesAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets child edges (outgoing) for a job.
    /// </summary>
    Task<IReadOnlyList<DagEdge>> GetChildEdgesAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);

    /// <summary>
    /// Checks if all parent dependencies are satisfied for a job.
    /// </summary>
    /// <param name="tenantId">Tenant ID.</param>
    /// <param name="jobId">Job to check dependencies for.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>True if all dependencies are satisfied.</returns>
    Task<bool> AreDependenciesSatisfiedAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
}
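// Usage sketch: gating job release on DAG dependencies. A job with no
// incoming edges is a root and always schedulable; otherwise the bulk
// check decides. The scheduler wrapper is hypothetical.
internal static class DagGateExample
{
    internal static async Task<bool> CanScheduleAsync(
        IDagEdgeRepository edges,
        string tenantId,
        Guid jobId,
        CancellationToken ct)
    {
        var parents = await edges.GetParentEdgesAsync(tenantId, jobId, ct);
        if (parents.Count == 0)
        {
            return true;
        }

        return await edges.AreDependenciesSatisfiedAsync(tenantId, jobId, ct);
    }
}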
@@ -0,0 +1,29 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for job history persistence operations.
/// </summary>
public interface IJobHistoryRepository
{
    /// <summary>
    /// Appends a history entry for a job state change.
    /// </summary>
    Task AppendAsync(JobHistory history, CancellationToken cancellationToken);

    /// <summary>
    /// Gets the history for a job.
    /// </summary>
    Task<IReadOnlyList<JobHistory>> GetByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets the latest history entry for a job.
    /// </summary>
    Task<JobHistory?> GetLatestByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets the next sequence number for a job's history.
    /// </summary>
    Task<int> GetNextSequenceNoAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
}
@@ -0,0 +1,100 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for job persistence operations.
/// </summary>
public interface IJobRepository
{
    /// <summary>
    /// Gets a job by ID.
    /// </summary>
    Task<Job?> GetByIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets a job by idempotency key.
    /// </summary>
    Task<Job?> GetByIdempotencyKeyAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new job.
    /// </summary>
    Task CreateAsync(Job job, CancellationToken cancellationToken);

    /// <summary>
    /// Updates a job's status and related fields.
    /// </summary>
    Task UpdateStatusAsync(
        string tenantId,
        Guid jobId,
        JobStatus status,
        int attempt,
        Guid? leaseId,
        string? workerId,
        string? taskRunnerId,
        DateTimeOffset? leaseUntil,
        DateTimeOffset? scheduledAt,
        DateTimeOffset? leasedAt,
        DateTimeOffset? completedAt,
        DateTimeOffset? notBefore,
        string? reason,
        CancellationToken cancellationToken);

    /// <summary>
    /// Acquires a lease on a pending/scheduled job for worker execution.
    /// </summary>
    /// <returns>The leased job, or null if no jobs available.</returns>
    Task<Job?> LeaseNextAsync(
        string tenantId,
        string? jobType,
        Guid leaseId,
        string workerId,
        DateTimeOffset leaseUntil,
        CancellationToken cancellationToken);

    /// <summary>
    /// Extends an existing lease.
    /// </summary>
    /// <returns>True if lease was extended, false if lease not found or expired.</returns>
    Task<bool> ExtendLeaseAsync(
        string tenantId,
        Guid jobId,
        Guid leaseId,
        DateTimeOffset newLeaseUntil,
        CancellationToken cancellationToken);

    /// <summary>
    /// Gets jobs by run ID.
    /// </summary>
    Task<IReadOnlyList<Job>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets jobs with expired leases.
    /// </summary>
    Task<IReadOnlyList<Job>> GetExpiredLeasesAsync(string tenantId, DateTimeOffset cutoff, int limit, CancellationToken cancellationToken);

    /// <summary>
    /// Lists jobs with pagination and filters.
    /// </summary>
    Task<IReadOnlyList<Job>> ListAsync(
        string tenantId,
        JobStatus? status,
        string? jobType,
        string? projectId,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset,
        CancellationToken cancellationToken);

    /// <summary>
    /// Counts jobs matching the filters.
    /// </summary>
    Task<int> CountAsync(
        string tenantId,
        JobStatus? status,
        string? jobType,
        string? projectId,
        CancellationToken cancellationToken);
}
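// Usage sketch: a worker acquiring and heartbeating a lease. The five-minute
// lease window and the Job.JobId property name are assumptions; the
// repository calls match the interface above.
internal static class WorkerLeaseExample
{
    internal static async Task<Job?> TryLeaseAsync(
        IJobRepository jobs,
        string tenantId,
        string workerId,
        CancellationToken ct)
    {
        var leaseId = Guid.NewGuid();
        var leaseUntil = DateTimeOffset.UtcNow.AddMinutes(5);

        var job = await jobs.LeaseNextAsync(tenantId, jobType: null, leaseId, workerId, leaseUntil, ct);
        if (job is null)
        {
            return null; // nothing pending for this tenant
        }

        // Heartbeat before the lease lapses; false means the lease expired
        // or was reaped, and the job must not be touched further.
        var extended = await jobs.ExtendLeaseAsync(
            tenantId, job.JobId, leaseId, DateTimeOffset.UtcNow.AddMinutes(5), ct);
        return extended ? job : null;
    }
}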
@@ -0,0 +1,210 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository for run ledger entries.
/// </summary>
public interface ILedgerRepository
{
    /// <summary>
    /// Appends a new ledger entry from a completed run.
    /// </summary>
    Task<RunLedgerEntry> AppendAsync(
        Run run,
        IReadOnlyList<Artifact> artifacts,
        string inputDigest,
        string? metadata = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets a ledger entry by ID.
    /// </summary>
    Task<RunLedgerEntry?> GetByIdAsync(
        string tenantId,
        Guid ledgerId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets a ledger entry by run ID.
    /// </summary>
    Task<RunLedgerEntry?> GetByRunIdAsync(
        string tenantId,
        Guid runId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists ledger entries with optional filters.
    /// </summary>
    Task<IReadOnlyList<RunLedgerEntry>> ListAsync(
        string tenantId,
        string? runType = null,
        Guid? sourceId = null,
        RunStatus? finalStatus = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets ledger entries by sequence range.
    /// </summary>
    Task<IReadOnlyList<RunLedgerEntry>> GetBySequenceRangeAsync(
        string tenantId,
        long startSequence,
        long endSequence,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the latest ledger entry for a tenant.
    /// </summary>
    Task<RunLedgerEntry?> GetLatestAsync(
        string tenantId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets ledger entries for a specific source.
    /// </summary>
    Task<IReadOnlyList<RunLedgerEntry>> GetBySourceAsync(
        string tenantId,
        Guid sourceId,
        int limit = 100,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the count of ledger entries.
    /// </summary>
    Task<long> GetCountAsync(
        string tenantId,
        string? runType = null,
        Guid? sourceId = null,
        DateTimeOffset? startTime = null,
        DateTimeOffset? endTime = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Verifies the chain integrity for a range of entries.
    /// </summary>
    Task<ChainVerificationResult> VerifyChainAsync(
        string tenantId,
        long? startSequence = null,
        long? endSequence = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets ledger summary statistics.
    /// </summary>
    Task<LedgerSummary> GetSummaryAsync(
        string tenantId,
        DateTimeOffset? since = null,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Ledger summary statistics.
/// </summary>
public sealed record LedgerSummary(
    long TotalEntries,
    long EntriesSince,
    long TotalRuns,
    long SuccessfulRuns,
    long FailedRuns,
    long TotalJobs,
    long UniqueSources,
    long UniqueRunTypes,
    DateTimeOffset? EarliestEntry,
    DateTimeOffset? LatestEntry);

/// <summary>
/// Repository for ledger exports.
/// </summary>
public interface ILedgerExportRepository
{
    /// <summary>
    /// Creates a new export request.
    /// </summary>
    Task<LedgerExport> CreateAsync(
        LedgerExport export,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets an export by ID.
    /// </summary>
    Task<LedgerExport?> GetByIdAsync(
        string tenantId,
        Guid exportId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists exports for a tenant.
    /// </summary>
    Task<IReadOnlyList<LedgerExport>> ListAsync(
        string tenantId,
        LedgerExportStatus? status = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Updates an export.
    /// </summary>
    Task<LedgerExport> UpdateAsync(
        LedgerExport export,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets pending exports.
    /// </summary>
    Task<IReadOnlyList<LedgerExport>> GetPendingAsync(
        int limit = 10,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Repository for signed manifests.
/// </summary>
public interface IManifestRepository
{
    /// <summary>
    /// Creates a new manifest.
    /// </summary>
    Task<SignedManifest> CreateAsync(
        SignedManifest manifest,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets a manifest by ID.
    /// </summary>
    Task<SignedManifest?> GetByIdAsync(
        string tenantId,
        Guid manifestId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets a manifest by subject.
    /// </summary>
    Task<SignedManifest?> GetBySubjectAsync(
        string tenantId,
        ProvenanceType provenanceType,
        Guid subjectId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists manifests for a tenant.
    /// </summary>
    Task<IReadOnlyList<SignedManifest>> ListAsync(
        string tenantId,
        ProvenanceType? provenanceType = null,
        int limit = 100,
        int offset = 0,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets a manifest by payload digest.
    /// </summary>
    Task<SignedManifest?> GetByPayloadDigestAsync(
        string tenantId,
        string payloadDigest,
        CancellationToken cancellationToken = default);
}
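// Usage sketch: sealing a completed run into the append-only ledger. The
// helper is hypothetical and takes tenant/run identifiers explicitly;
// inputDigest computation is out of scope here.
internal static class LedgerSealExample
{
    internal static async Task<RunLedgerEntry> SealRunAsync(
        ILedgerRepository ledger,
        IArtifactRepository artifacts,
        Run run,
        string tenantId,
        Guid runId,
        string inputDigest,
        CancellationToken ct)
    {
        // Collect the artifacts the run produced, then append one ledger
        // entry covering the whole run.
        var produced = await artifacts.GetByRunIdAsync(tenantId, runId, ct);
        return await ledger.AppendAsync(run, produced, inputDigest, cancellationToken: ct);
    }
}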
@@ -0,0 +1,79 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for quota persistence operations.
/// </summary>
public interface IQuotaRepository
{
    /// <summary>
    /// Gets a quota by ID.
    /// </summary>
    Task<Quota?> GetByIdAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets the quota for a tenant and optional job type.
    /// </summary>
    Task<Quota?> GetByTenantAndJobTypeAsync(string tenantId, string? jobType, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new quota.
    /// </summary>
    Task CreateAsync(Quota quota, CancellationToken cancellationToken);

    /// <summary>
    /// Updates a quota (including token/counter state).
    /// </summary>
    Task UpdateAsync(Quota quota, CancellationToken cancellationToken);

    /// <summary>
    /// Pauses a quota with reason.
    /// </summary>
    Task PauseAsync(string tenantId, Guid quotaId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken);

    /// <summary>
    /// Resumes a paused quota.
    /// </summary>
    Task ResumeAsync(string tenantId, Guid quotaId, string updatedBy, CancellationToken cancellationToken);

    /// <summary>
    /// Updates the rate limiter state (tokens, counters) without changing configuration.
    /// </summary>
    Task UpdateStateAsync(
        string tenantId,
        Guid quotaId,
        double currentTokens,
        DateTimeOffset lastRefillAt,
        int currentActive,
        int currentHourCount,
        DateTimeOffset currentHourStart,
        string updatedBy,
        CancellationToken cancellationToken);

    /// <summary>
    /// Increments the current active count.
    /// </summary>
    Task IncrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);

    /// <summary>
    /// Decrements the current active count.
    /// </summary>
    Task DecrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);

    /// <summary>
    /// Lists quotas for a tenant with pagination.
    /// </summary>
    Task<IReadOnlyList<Quota>> ListAsync(
        string tenantId,
        string? jobType,
        bool? paused,
        int limit,
        int offset,
        CancellationToken cancellationToken);

    /// <summary>
    /// Deletes a quota.
    /// </summary>
    Task<bool> DeleteAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);
}
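// Usage sketch: persisting token-bucket state via UpdateStateAsync after an
// in-memory refill. The refill math and parameter plumbing are illustrative;
// only the UpdateStateAsync signature above is taken as given.
internal static class QuotaRefillExample
{
    internal static Task RefillAsync(
        IQuotaRepository quotas,
        string tenantId,
        Guid quotaId,
        double currentTokens,
        DateTimeOffset lastRefillAt,
        double ratePerSecond,
        double burstCapacity,
        int currentActive,
        int currentHourCount,
        DateTimeOffset currentHourStart,
        string updatedBy,
        CancellationToken ct)
    {
        var now = DateTimeOffset.UtcNow;
        var elapsed = (now - lastRefillAt).TotalSeconds;
        // Standard token-bucket refill, capped at the burst capacity.
        var tokens = Math.Min(burstCapacity, currentTokens + elapsed * ratePerSecond);

        return quotas.UpdateStateAsync(
            tenantId, quotaId, tokens, now,
            currentActive, currentHourCount, currentHourStart,
            updatedBy, ct);
    }
}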
@@ -0,0 +1,69 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for run persistence operations.
/// </summary>
public interface IRunRepository
{
    /// <summary>
    /// Gets a run by ID.
    /// </summary>
    Task<Run?> GetByIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new run.
    /// </summary>
    Task CreateAsync(Run run, CancellationToken cancellationToken);

    /// <summary>
    /// Updates run status and job counts.
    /// </summary>
    Task UpdateStatusAsync(
        string tenantId,
        Guid runId,
        RunStatus status,
        int totalJobs,
        int completedJobs,
        int succeededJobs,
        int failedJobs,
        DateTimeOffset? startedAt,
        DateTimeOffset? completedAt,
        CancellationToken cancellationToken);

    /// <summary>
    /// Increments job counters when a job completes.
    /// </summary>
    Task IncrementJobCountsAsync(
        string tenantId,
        Guid runId,
        bool succeeded,
        CancellationToken cancellationToken);

    /// <summary>
    /// Lists runs with pagination and filters.
    /// </summary>
    Task<IReadOnlyList<Run>> ListAsync(
        string tenantId,
        Guid? sourceId,
        string? runType,
        RunStatus? status,
        string? projectId,
        DateTimeOffset? createdAfter,
        DateTimeOffset? createdBefore,
        int limit,
        int offset,
        CancellationToken cancellationToken);

    /// <summary>
    /// Counts runs matching the filters.
    /// </summary>
    Task<int> CountAsync(
        string tenantId,
        Guid? sourceId,
        string? runType,
        RunStatus? status,
        string? projectId,
        CancellationToken cancellationToken);
}
@@ -0,0 +1,50 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for source persistence operations.
/// </summary>
public interface ISourceRepository
{
    /// <summary>
    /// Gets a source by ID.
    /// </summary>
    Task<Source?> GetByIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets a source by name.
    /// </summary>
    Task<Source?> GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new source.
    /// </summary>
    Task CreateAsync(Source source, CancellationToken cancellationToken);

    /// <summary>
    /// Updates a source.
    /// </summary>
    Task UpdateAsync(Source source, CancellationToken cancellationToken);

    /// <summary>
    /// Pauses a source with a reason.
    /// </summary>
    Task PauseAsync(string tenantId, Guid sourceId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken);

    /// <summary>
    /// Resumes a paused source.
    /// </summary>
    Task ResumeAsync(string tenantId, Guid sourceId, string updatedBy, CancellationToken cancellationToken);

    /// <summary>
    /// Lists sources with pagination.
    /// </summary>
    Task<IReadOnlyList<Source>> ListAsync(
        string tenantId,
        string? sourceType,
        bool? enabled,
        int limit,
        int offset,
        CancellationToken cancellationToken);
}
@@ -0,0 +1,62 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for throttle persistence operations.
/// </summary>
public interface IThrottleRepository
{
    /// <summary>
    /// Gets a throttle by ID.
    /// </summary>
    Task<Throttle?> GetByIdAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets active throttles for a source.
    /// </summary>
    Task<IReadOnlyList<Throttle>> GetActiveBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets active throttles for a job type.
    /// </summary>
    Task<IReadOnlyList<Throttle>> GetActiveByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new throttle.
    /// </summary>
    Task CreateAsync(Throttle throttle, CancellationToken cancellationToken);

    /// <summary>
    /// Deactivates a throttle.
    /// </summary>
    Task DeactivateAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken);

    /// <summary>
    /// Deactivates all throttles for a source.
    /// </summary>
    Task DeactivateBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);

    /// <summary>
    /// Deactivates all throttles for a job type.
    /// </summary>
    Task DeactivateByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken);

    /// <summary>
    /// Cleans up expired throttles.
    /// </summary>
    /// <returns>Number of throttles deactivated.</returns>
    Task<int> CleanupExpiredAsync(DateTimeOffset now, CancellationToken cancellationToken);

    /// <summary>
    /// Lists throttles for a tenant with pagination.
    /// </summary>
    Task<IReadOnlyList<Throttle>> ListAsync(
        string tenantId,
        bool? active,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset,
        CancellationToken cancellationToken);
}
@@ -0,0 +1,70 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Infrastructure.Repositories;

/// <summary>
/// Repository interface for watermark persistence operations.
/// </summary>
public interface IWatermarkRepository
{
    /// <summary>
    /// Gets a watermark by scope key.
    /// </summary>
    Task<Watermark?> GetByScopeKeyAsync(string tenantId, string scopeKey, CancellationToken cancellationToken);

    /// <summary>
    /// Gets a watermark by source ID.
    /// </summary>
    Task<Watermark?> GetBySourceIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);

    /// <summary>
    /// Gets a watermark by job type.
    /// </summary>
    Task<Watermark?> GetByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken);

    /// <summary>
    /// Gets a watermark by source ID and job type.
    /// </summary>
    Task<Watermark?> GetBySourceAndJobTypeAsync(string tenantId, Guid sourceId, string jobType, CancellationToken cancellationToken);

    /// <summary>
    /// Creates a new watermark.
    /// </summary>
    Task CreateAsync(Watermark watermark, CancellationToken cancellationToken);

    /// <summary>
    /// Updates a watermark using optimistic concurrency.
    /// </summary>
    /// <returns>True if update succeeded, false if concurrent modification detected.</returns>
    Task<bool> UpdateAsync(Watermark watermark, long expectedSequenceNumber, CancellationToken cancellationToken);

    /// <summary>
    /// Creates or updates a watermark (upsert).
    /// </summary>
    Task UpsertAsync(Watermark watermark, CancellationToken cancellationToken);

    /// <summary>
    /// Lists watermarks for a tenant.
    /// </summary>
    Task<IReadOnlyList<Watermark>> ListAsync(
        string tenantId,
        Guid? sourceId,
        string? jobType,
        int limit,
        int offset,
        CancellationToken cancellationToken);

    /// <summary>
    /// Gets watermarks with lag exceeding the threshold.
    /// </summary>
    Task<IReadOnlyList<Watermark>> GetLaggingAsync(
        string tenantId,
        TimeSpan lagThreshold,
        int limit,
        CancellationToken cancellationToken);

    /// <summary>
    /// Deletes a watermark by scope key.
    /// </summary>
    Task<bool> DeleteAsync(string tenantId, string scopeKey, CancellationToken cancellationToken);
}
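
A sketch of how a caller might drive the optimistic-concurrency contract of `UpdateAsync` above. It assumes `Watermark` is a record exposing `SequenceNumber` and `HighWatermark` with `with`-copy support; both are assumptions, since the domain type lives in StellaOps.Orchestrator.Core.Domain outside this diff.

using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Examples;

public static class WatermarkAdvanceSketch
{
    // Advances a watermark with bounded retries under optimistic concurrency.
    public static async Task<bool> TryAdvanceAsync(
        IWatermarkRepository watermarks,
        string tenantId,
        string scopeKey,
        DateTimeOffset newHighWatermark,
        CancellationToken ct,
        int maxAttempts = 3)
    {
        for (var attempt = 0; attempt < maxAttempts; attempt++)
        {
            var current = await watermarks.GetByScopeKeyAsync(tenantId, scopeKey, ct);
            if (current is null)
            {
                return false; // Nothing to advance; the caller may create one instead.
            }

            var updated = current with
            {
                HighWatermark = newHighWatermark,
                SequenceNumber = current.SequenceNumber + 1,
            };

            // UpdateAsync returns false when another writer advanced the
            // watermark first; reload and retry.
            if (await watermarks.UpdateAsync(updated, current.SequenceNumber, ct))
            {
                return true;
            }
        }

        return false;
    }
}
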
@@ -0,0 +1,57 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Orchestrator.Core.Backfill;
using StellaOps.Orchestrator.Infrastructure.Ledger;
using StellaOps.Orchestrator.Infrastructure.Options;
using StellaOps.Orchestrator.Infrastructure.Postgres;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.Infrastructure;

/// <summary>
/// Extension methods for registering Orchestrator infrastructure services.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds Orchestrator infrastructure services to the service collection.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configuration">The configuration.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddOrchestratorInfrastructure(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        // Register configuration options
        services.Configure<OrchestratorServiceOptions>(
            configuration.GetSection(OrchestratorServiceOptions.SectionName));

        // Register data source
        services.AddSingleton<OrchestratorDataSource>();

        // Register repositories
        services.AddScoped<IJobRepository, PostgresJobRepository>();
        services.AddScoped<IArtifactRepository, PostgresArtifactRepository>();
        services.AddScoped<ISourceRepository, PostgresSourceRepository>();
        services.AddScoped<IRunRepository, PostgresRunRepository>();
        services.AddScoped<IQuotaRepository, PostgresQuotaRepository>();
        services.AddScoped<IThrottleRepository, PostgresThrottleRepository>();
        services.AddScoped<IWatermarkRepository, PostgresWatermarkRepository>();
        services.AddScoped<Infrastructure.Repositories.IBackfillRepository, PostgresBackfillRepository>();

        // Register audit and ledger repositories
        services.AddScoped<IAuditRepository, PostgresAuditRepository>();
        services.AddScoped<ILedgerRepository, PostgresLedgerRepository>();
        services.AddScoped<ILedgerExportRepository, PostgresLedgerExportRepository>();
        services.AddScoped<IManifestRepository, PostgresManifestRepository>();

        // Register ledger exporter service
        services.AddScoped<ILedgerExporter, LedgerExporter>();

        // Register duplicate suppression factory
        services.AddSingleton<IDuplicateSuppressorFactory, PostgresDuplicateSuppressorFactory>();

        return services;
    }
}
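
Typical consumption, shown only for context (the ASP.NET Core host wiring is an assumption; `AddOrchestratorInfrastructure` is the only call taken from this file):

using StellaOps.Orchestrator.Infrastructure;

var builder = WebApplication.CreateBuilder(args);

// Binds OrchestratorServiceOptions and registers the Postgres-backed repositories.
builder.Services.AddOrchestratorInfrastructure(builder.Configuration);

var app = builder.Build();
app.Run();
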
@@ -1,28 +1,30 @@
<?xml version="1.0" ?>
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
  </PropertyGroup>

  <ItemGroup>
    <None Include="migrations\**\*" Pack="false" CopyToOutputDirectory="Never" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\StellaOps.Orchestrator.Core\StellaOps.Orchestrator.Core.csproj"/>
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="10.0.0-rc.2.25502.107" />
    <PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="10.0.0-rc.2.25502.107" />
    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.0-rc.2.25502.107" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.0-rc.2.25502.107" />
    <PackageReference Include="Microsoft.Extensions.Options" Version="10.0.0-rc.2.25502.107" />
    <PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="10.0.0-rc.2.25502.107" />
    <PackageReference Include="Npgsql" Version="7.0.7" />
  </ItemGroup>

</Project>
@@ -0,0 +1,323 @@
-- 001_initial.sql
-- Orchestrator bootstrap schema (ORCH-SVC-32-001)
-- Creates core tables for sources, runs, jobs, DAG edges, artifacts, quotas, schedules, and incidents.

BEGIN;

-- Enum types for job and run statuses
CREATE TYPE job_status AS ENUM (
    'pending',
    'scheduled',
    'leased',
    'succeeded',
    'failed',
    'canceled',
    'timed_out'
);

CREATE TYPE run_status AS ENUM (
    'pending',
    'running',
    'succeeded',
    'partially_succeeded',
    'failed',
    'canceled'
);

CREATE TYPE incident_status AS ENUM (
    'open',
    'acknowledged',
    'resolved'
);

CREATE TYPE dag_edge_type AS ENUM (
    'success',
    'always',
    'failure'
);

-- Sources: Job producers (Concelier, Scanner, Export, etc.)
CREATE TABLE sources (
    source_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    name TEXT NOT NULL,
    source_type TEXT NOT NULL,
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    paused BOOLEAN NOT NULL DEFAULT FALSE,
    pause_reason TEXT,
    pause_ticket TEXT,
    configuration JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_by TEXT NOT NULL,
    CONSTRAINT pk_sources PRIMARY KEY (tenant_id, source_id),
    CONSTRAINT uq_sources_name UNIQUE (tenant_id, name)
) PARTITION BY LIST (tenant_id);

CREATE TABLE sources_default PARTITION OF sources DEFAULT;

CREATE INDEX ix_sources_type ON sources (tenant_id, source_type);
CREATE INDEX ix_sources_enabled ON sources (tenant_id, enabled) WHERE enabled = TRUE;

-- Runs: Batch/workflow executions containing jobs
CREATE TABLE runs (
    run_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    project_id TEXT,
    source_id UUID NOT NULL,
    run_type TEXT NOT NULL,
    status run_status NOT NULL DEFAULT 'pending',
    correlation_id TEXT,
    total_jobs INTEGER NOT NULL DEFAULT 0,
    completed_jobs INTEGER NOT NULL DEFAULT 0,
    succeeded_jobs INTEGER NOT NULL DEFAULT 0,
    failed_jobs INTEGER NOT NULL DEFAULT 0,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    created_by TEXT NOT NULL,
    metadata JSONB,
    CONSTRAINT pk_runs PRIMARY KEY (tenant_id, run_id),
    CONSTRAINT fk_runs_source FOREIGN KEY (tenant_id, source_id) REFERENCES sources (tenant_id, source_id)
) PARTITION BY LIST (tenant_id);

CREATE TABLE runs_default PARTITION OF runs DEFAULT;

CREATE INDEX ix_runs_status ON runs (tenant_id, status, created_at DESC);
CREATE INDEX ix_runs_source ON runs (tenant_id, source_id, created_at DESC);
CREATE INDEX ix_runs_project ON runs (tenant_id, project_id, created_at DESC) WHERE project_id IS NOT NULL;
CREATE INDEX ix_runs_correlation ON runs (tenant_id, correlation_id) WHERE correlation_id IS NOT NULL;

-- Jobs: Individual units of work
CREATE TABLE jobs (
    job_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    project_id TEXT,
    run_id UUID,
    job_type TEXT NOT NULL,
    status job_status NOT NULL DEFAULT 'pending',
    priority INTEGER NOT NULL DEFAULT 0,
    attempt INTEGER NOT NULL DEFAULT 1,
    max_attempts INTEGER NOT NULL DEFAULT 3,
    payload_digest CHAR(64) NOT NULL,
    payload JSONB NOT NULL,
    idempotency_key TEXT NOT NULL,
    correlation_id TEXT,
    lease_id UUID,
    worker_id TEXT,
    task_runner_id TEXT,
    lease_until TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    scheduled_at TIMESTAMPTZ,
    leased_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    not_before TIMESTAMPTZ,
    reason TEXT,
    replay_of UUID,
    created_by TEXT NOT NULL,
    CONSTRAINT pk_jobs PRIMARY KEY (tenant_id, job_id),
    CONSTRAINT uq_jobs_idempotency UNIQUE (tenant_id, idempotency_key),
    CONSTRAINT ck_jobs_payload_digest_hex CHECK (payload_digest ~ '^[0-9a-f]{64}$'),
    CONSTRAINT ck_jobs_attempt_positive CHECK (attempt >= 1),
    CONSTRAINT ck_jobs_max_attempts_positive CHECK (max_attempts >= 1)
) PARTITION BY LIST (tenant_id);

CREATE TABLE jobs_default PARTITION OF jobs DEFAULT;

CREATE INDEX ix_jobs_status ON jobs (tenant_id, status, priority DESC, created_at);
CREATE INDEX ix_jobs_type_status ON jobs (tenant_id, job_type, status, created_at);
CREATE INDEX ix_jobs_run ON jobs (tenant_id, run_id) WHERE run_id IS NOT NULL;
CREATE INDEX ix_jobs_lease ON jobs (tenant_id, lease_id) WHERE lease_id IS NOT NULL;
CREATE INDEX ix_jobs_lease_expiry ON jobs (tenant_id, lease_until) WHERE status = 'leased' AND lease_until IS NOT NULL;
CREATE INDEX ix_jobs_not_before ON jobs (tenant_id, not_before) WHERE status = 'pending' AND not_before IS NOT NULL;
CREATE INDEX ix_jobs_scheduled ON jobs (tenant_id, job_type, status, scheduled_at) WHERE status = 'scheduled';
CREATE INDEX ix_jobs_replay ON jobs (tenant_id, replay_of) WHERE replay_of IS NOT NULL;
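
-- Context note (illustrative, not part of the migration): ix_jobs_status plus
-- FOR UPDATE SKIP LOCKED supports lease acquisition without worker contention.
-- A sketch of the kind of query a dispatcher might run against this schema,
-- not the orchestrator's actual query:
--
--   WITH next_job AS (
--       SELECT tenant_id, job_id
--       FROM jobs
--       WHERE tenant_id = $1
--         AND status = 'pending'
--         AND (not_before IS NULL OR not_before <= NOW())
--       ORDER BY priority DESC, created_at
--       LIMIT 1
--       FOR UPDATE SKIP LOCKED
--   )
--   UPDATE jobs j
--   SET status = 'leased',
--       lease_id = gen_random_uuid(),
--       worker_id = $2,
--       leased_at = NOW(),
--       lease_until = NOW() + INTERVAL '5 minutes'
--   FROM next_job n
--   WHERE j.tenant_id = n.tenant_id AND j.job_id = n.job_id
--   RETURNING j.job_id, j.lease_id;
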
-- Job History: Immutable audit trail for job state changes
CREATE TABLE job_history (
    history_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    job_id UUID NOT NULL,
    sequence_no INTEGER NOT NULL,
    from_status job_status,
    to_status job_status NOT NULL,
    attempt INTEGER NOT NULL,
    lease_id UUID,
    worker_id TEXT,
    reason TEXT,
    occurred_at TIMESTAMPTZ NOT NULL,
    recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    actor_id TEXT NOT NULL,
    actor_type TEXT NOT NULL,
    CONSTRAINT pk_job_history PRIMARY KEY (tenant_id, job_id, sequence_no),
    CONSTRAINT ck_job_history_actor_type CHECK (actor_type IN ('system', 'operator', 'worker'))
) PARTITION BY LIST (tenant_id);

CREATE TABLE job_history_default PARTITION OF job_history DEFAULT;

CREATE INDEX ix_job_history_occurred ON job_history (tenant_id, job_id, occurred_at DESC);

-- DAG Edges: Job dependencies within a run
CREATE TABLE dag_edges (
    edge_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    run_id UUID NOT NULL,
    parent_job_id UUID NOT NULL,
    child_job_id UUID NOT NULL,
    edge_type dag_edge_type NOT NULL DEFAULT 'success',
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    CONSTRAINT pk_dag_edges PRIMARY KEY (tenant_id, edge_id),
    CONSTRAINT uq_dag_edges_parent_child UNIQUE (tenant_id, run_id, parent_job_id, child_job_id),
    CONSTRAINT fk_dag_edges_run FOREIGN KEY (tenant_id, run_id) REFERENCES runs (tenant_id, run_id),
    CONSTRAINT fk_dag_edges_parent FOREIGN KEY (tenant_id, parent_job_id) REFERENCES jobs (tenant_id, job_id),
    CONSTRAINT fk_dag_edges_child FOREIGN KEY (tenant_id, child_job_id) REFERENCES jobs (tenant_id, job_id),
    CONSTRAINT ck_dag_edges_no_self_loop CHECK (parent_job_id <> child_job_id)
) PARTITION BY LIST (tenant_id);

CREATE TABLE dag_edges_default PARTITION OF dag_edges DEFAULT;

CREATE INDEX ix_dag_edges_run ON dag_edges (tenant_id, run_id);
CREATE INDEX ix_dag_edges_parent ON dag_edges (tenant_id, parent_job_id);
CREATE INDEX ix_dag_edges_child ON dag_edges (tenant_id, child_job_id);

-- Artifacts: Job outputs with provenance
CREATE TABLE artifacts (
    artifact_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    job_id UUID NOT NULL,
    run_id UUID,
    artifact_type TEXT NOT NULL,
    uri TEXT NOT NULL,
    digest CHAR(64) NOT NULL,
    mime_type TEXT,
    size_bytes BIGINT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    metadata JSONB,
    CONSTRAINT pk_artifacts PRIMARY KEY (tenant_id, artifact_id),
    CONSTRAINT fk_artifacts_job FOREIGN KEY (tenant_id, job_id) REFERENCES jobs (tenant_id, job_id),
    CONSTRAINT ck_artifacts_digest_hex CHECK (digest ~ '^[0-9a-f]{64}$')
) PARTITION BY LIST (tenant_id);

CREATE TABLE artifacts_default PARTITION OF artifacts DEFAULT;

CREATE INDEX ix_artifacts_job ON artifacts (tenant_id, job_id);
CREATE INDEX ix_artifacts_run ON artifacts (tenant_id, run_id) WHERE run_id IS NOT NULL;
CREATE INDEX ix_artifacts_type ON artifacts (tenant_id, artifact_type, created_at DESC);
CREATE INDEX ix_artifacts_digest ON artifacts (tenant_id, digest);

-- Quotas: Rate-limit and concurrency controls
CREATE TABLE quotas (
    quota_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    job_type TEXT,
    max_active INTEGER NOT NULL DEFAULT 10,
    max_per_hour INTEGER NOT NULL DEFAULT 1000,
    burst_capacity INTEGER NOT NULL DEFAULT 50,
    refill_rate DOUBLE PRECISION NOT NULL DEFAULT 1.0,
    current_tokens DOUBLE PRECISION NOT NULL DEFAULT 50.0,
    last_refill_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    current_active INTEGER NOT NULL DEFAULT 0,
    current_hour_count INTEGER NOT NULL DEFAULT 0,
    current_hour_start TIMESTAMPTZ NOT NULL DEFAULT DATE_TRUNC('hour', NOW()),
    paused BOOLEAN NOT NULL DEFAULT FALSE,
    pause_reason TEXT,
    quota_ticket TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_by TEXT NOT NULL,
    CONSTRAINT pk_quotas PRIMARY KEY (tenant_id, quota_id),
    CONSTRAINT uq_quotas_tenant_type UNIQUE (tenant_id, job_type),
    CONSTRAINT ck_quotas_max_active_positive CHECK (max_active > 0),
    CONSTRAINT ck_quotas_max_per_hour_positive CHECK (max_per_hour > 0),
    CONSTRAINT ck_quotas_burst_positive CHECK (burst_capacity > 0),
    CONSTRAINT ck_quotas_refill_positive CHECK (refill_rate > 0)
) PARTITION BY LIST (tenant_id);

CREATE TABLE quotas_default PARTITION OF quotas DEFAULT;

CREATE INDEX ix_quotas_type ON quotas (tenant_id, job_type);
CREATE INDEX ix_quotas_paused ON quotas (tenant_id, paused) WHERE paused = TRUE;
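
-- Context note (illustrative, not part of the migration): the token-bucket
-- columns above allow refill and consumption in one atomic statement, e.g.:
--
--   UPDATE quotas
--   SET current_tokens = LEAST(
--           burst_capacity,
--           current_tokens + refill_rate * EXTRACT(EPOCH FROM (NOW() - last_refill_at))
--       ) - 1,
--       last_refill_at = NOW()
--   WHERE tenant_id = $1
--     AND quota_id = $2
--     AND paused = FALSE
--     AND LEAST(
--           burst_capacity,
--           current_tokens + refill_rate * EXTRACT(EPOCH FROM (NOW() - last_refill_at))
--         ) >= 1;
--
-- A zero row count means the bucket is empty (or the quota is paused) and the
-- job should wait.
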
-- Schedules: Cron-based job triggers
CREATE TABLE schedules (
    schedule_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    project_id TEXT,
    source_id UUID NOT NULL,
    name TEXT NOT NULL,
    job_type TEXT NOT NULL,
    cron_expression TEXT NOT NULL,
    timezone TEXT NOT NULL DEFAULT 'UTC',
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    payload_template JSONB NOT NULL,
    priority INTEGER NOT NULL DEFAULT 0,
    max_attempts INTEGER NOT NULL DEFAULT 3,
    last_triggered_at TIMESTAMPTZ,
    next_trigger_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by TEXT NOT NULL,
    updated_by TEXT NOT NULL,
    CONSTRAINT pk_schedules PRIMARY KEY (tenant_id, schedule_id),
    CONSTRAINT uq_schedules_name UNIQUE (tenant_id, name),
    CONSTRAINT fk_schedules_source FOREIGN KEY (tenant_id, source_id) REFERENCES sources (tenant_id, source_id),
    CONSTRAINT ck_schedules_max_attempts_positive CHECK (max_attempts >= 1)
) PARTITION BY LIST (tenant_id);

CREATE TABLE schedules_default PARTITION OF schedules DEFAULT;

CREATE INDEX ix_schedules_enabled ON schedules (tenant_id, enabled, next_trigger_at) WHERE enabled = TRUE;
CREATE INDEX ix_schedules_next_trigger ON schedules (tenant_id, next_trigger_at) WHERE enabled = TRUE AND next_trigger_at IS NOT NULL;
CREATE INDEX ix_schedules_source ON schedules (tenant_id, source_id);

-- Incidents: Operational alerts and escalations
CREATE TABLE incidents (
    incident_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    incident_type TEXT NOT NULL,
    severity TEXT NOT NULL,
    job_type TEXT,
    source_id UUID,
    title TEXT NOT NULL,
    description TEXT NOT NULL,
    status incident_status NOT NULL DEFAULT 'open',
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    acknowledged_at TIMESTAMPTZ,
    acknowledged_by TEXT,
    resolved_at TIMESTAMPTZ,
    resolved_by TEXT,
    resolution_notes TEXT,
    metadata JSONB,
    CONSTRAINT pk_incidents PRIMARY KEY (tenant_id, incident_id),
    CONSTRAINT ck_incidents_severity CHECK (severity IN ('warning', 'critical'))
) PARTITION BY LIST (tenant_id);

CREATE TABLE incidents_default PARTITION OF incidents DEFAULT;

CREATE INDEX ix_incidents_status ON incidents (tenant_id, status, created_at DESC);
CREATE INDEX ix_incidents_type ON incidents (tenant_id, incident_type, status);
CREATE INDEX ix_incidents_open ON incidents (tenant_id, severity, created_at DESC) WHERE status = 'open';

-- Throttles: Dynamic rate-limit overrides (pause/resume per source or job type)
CREATE TABLE throttles (
    throttle_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    source_id UUID,
    job_type TEXT,
    active BOOLEAN NOT NULL DEFAULT TRUE,
    reason TEXT NOT NULL,
    ticket TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    expires_at TIMESTAMPTZ,
    created_by TEXT NOT NULL,
    CONSTRAINT pk_throttles PRIMARY KEY (tenant_id, throttle_id),
    CONSTRAINT ck_throttles_scope CHECK (source_id IS NOT NULL OR job_type IS NOT NULL)
) PARTITION BY LIST (tenant_id);

CREATE TABLE throttles_default PARTITION OF throttles DEFAULT;

CREATE INDEX ix_throttles_active ON throttles (tenant_id, active, expires_at) WHERE active = TRUE;
CREATE INDEX ix_throttles_source ON throttles (tenant_id, source_id) WHERE source_id IS NOT NULL;
CREATE INDEX ix_throttles_type ON throttles (tenant_id, job_type) WHERE job_type IS NOT NULL;

COMMIT;
@@ -0,0 +1,154 @@
-- 002_backfill.sql
-- Backfill and watermark tables for event-time window tracking (ORCH-SVC-33-003)
-- Adds watermarks, backfill_requests, and processed_events for duplicate suppression.

BEGIN;

-- Backfill request status
CREATE TYPE backfill_status AS ENUM (
    'pending',
    'validating',
    'running',
    'paused',
    'completed',
    'failed',
    'canceled'
);

-- Watermarks: Per-source/job-type event-time cursors
CREATE TABLE watermarks (
    watermark_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    source_id UUID,
    job_type TEXT,
    scope_key TEXT NOT NULL,                -- Normalized scope identifier
    high_watermark TIMESTAMPTZ NOT NULL,    -- Latest processed event time
    low_watermark TIMESTAMPTZ,              -- Earliest event time in current window
    sequence_number BIGINT NOT NULL DEFAULT 0,
    processed_count BIGINT NOT NULL DEFAULT 0,
    last_batch_hash CHAR(64),               -- SHA-256 of last processed batch for integrity
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_by TEXT NOT NULL,
    CONSTRAINT pk_watermarks PRIMARY KEY (tenant_id, watermark_id),
    CONSTRAINT uq_watermarks_scope UNIQUE (tenant_id, scope_key),
    CONSTRAINT ck_watermarks_hash_hex CHECK (last_batch_hash IS NULL OR last_batch_hash ~ '^[0-9a-f]{64}$')
) PARTITION BY LIST (tenant_id);

CREATE TABLE watermarks_default PARTITION OF watermarks DEFAULT;

CREATE INDEX ix_watermarks_source ON watermarks (tenant_id, source_id) WHERE source_id IS NOT NULL;
CREATE INDEX ix_watermarks_job_type ON watermarks (tenant_id, job_type) WHERE job_type IS NOT NULL;

-- Backfill Requests: Batch reprocessing operations
CREATE TABLE backfill_requests (
    backfill_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    source_id UUID,
    job_type TEXT,
    scope_key TEXT NOT NULL,
    status backfill_status NOT NULL DEFAULT 'pending',
    -- Time window for backfill
    window_start TIMESTAMPTZ NOT NULL,
    window_end TIMESTAMPTZ NOT NULL,
    -- Progress tracking
    current_position TIMESTAMPTZ,
    total_events BIGINT,
    processed_events BIGINT NOT NULL DEFAULT 0,
    skipped_events BIGINT NOT NULL DEFAULT 0,       -- Duplicates skipped
    failed_events BIGINT NOT NULL DEFAULT 0,
    -- Configuration
    batch_size INTEGER NOT NULL DEFAULT 100,
    dry_run BOOLEAN NOT NULL DEFAULT FALSE,
    force_reprocess BOOLEAN NOT NULL DEFAULT FALSE, -- Ignore duplicate suppression
    -- Safety validations
    estimated_duration INTERVAL,
    max_duration INTERVAL,
    safety_checks JSONB,                            -- Validation results
    -- Audit
    reason TEXT NOT NULL,
    ticket TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    created_by TEXT NOT NULL,
    updated_by TEXT NOT NULL,
    error_message TEXT,
    CONSTRAINT pk_backfill_requests PRIMARY KEY (tenant_id, backfill_id),
    CONSTRAINT ck_backfill_window_order CHECK (window_end > window_start),
    CONSTRAINT ck_backfill_batch_size CHECK (batch_size > 0 AND batch_size <= 10000)
) PARTITION BY LIST (tenant_id);

CREATE TABLE backfill_requests_default PARTITION OF backfill_requests DEFAULT;

CREATE INDEX ix_backfill_status ON backfill_requests (tenant_id, status, created_at DESC);
CREATE INDEX ix_backfill_scope ON backfill_requests (tenant_id, scope_key, created_at DESC);
CREATE INDEX ix_backfill_running ON backfill_requests (tenant_id, source_id, job_type) WHERE status IN ('running', 'validating');

-- Processed Events: Duplicate suppression tracking (TTL-managed)
CREATE TABLE processed_events (
    tenant_id TEXT NOT NULL,
    scope_key TEXT NOT NULL,
    event_key TEXT NOT NULL,                            -- Unique identifier for deduplication
    event_time TIMESTAMPTZ NOT NULL,
    processed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    batch_id UUID,                                      -- Backfill batch or run ID
    expires_at TIMESTAMPTZ NOT NULL,                    -- TTL for automatic cleanup
    CONSTRAINT pk_processed_events PRIMARY KEY (tenant_id, scope_key, event_key)
) PARTITION BY LIST (tenant_id);

CREATE TABLE processed_events_default PARTITION OF processed_events DEFAULT;

-- Plain index on expires_at: a partial-index predicate must be immutable,
-- so NOW() cannot appear in it.
CREATE INDEX ix_processed_events_expires ON processed_events (expires_at);
CREATE INDEX ix_processed_events_time ON processed_events (tenant_id, scope_key, event_time DESC);
CREATE INDEX ix_processed_events_batch ON processed_events (tenant_id, batch_id) WHERE batch_id IS NOT NULL;

-- Backfill Checkpoints: Resumable batch processing state
CREATE TABLE backfill_checkpoints (
    checkpoint_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    backfill_id UUID NOT NULL,
    batch_number INTEGER NOT NULL,
    batch_start TIMESTAMPTZ NOT NULL,
    batch_end TIMESTAMPTZ NOT NULL,
    events_in_batch INTEGER NOT NULL,
    events_processed INTEGER NOT NULL DEFAULT 0,
    events_skipped INTEGER NOT NULL DEFAULT 0,
    events_failed INTEGER NOT NULL DEFAULT 0,
    batch_hash CHAR(64),
    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMPTZ,
    error_message TEXT,
    CONSTRAINT pk_backfill_checkpoints PRIMARY KEY (tenant_id, checkpoint_id),
    CONSTRAINT fk_backfill_checkpoints_request FOREIGN KEY (tenant_id, backfill_id)
        REFERENCES backfill_requests (tenant_id, backfill_id) ON DELETE CASCADE,
    CONSTRAINT uq_backfill_checkpoints_batch UNIQUE (tenant_id, backfill_id, batch_number),
    CONSTRAINT ck_backfill_checkpoints_hash_hex CHECK (batch_hash IS NULL OR batch_hash ~ '^[0-9a-f]{64}$')
) PARTITION BY LIST (tenant_id);

CREATE TABLE backfill_checkpoints_default PARTITION OF backfill_checkpoints DEFAULT;

CREATE INDEX ix_backfill_checkpoints_request ON backfill_checkpoints (tenant_id, backfill_id, batch_number);

-- Function to clean up expired processed events (called by background job)
CREATE OR REPLACE FUNCTION cleanup_expired_processed_events(batch_limit INTEGER DEFAULT 10000)
RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
BEGIN
    WITH deleted AS (
        DELETE FROM processed_events
        WHERE ctid IN (
            SELECT ctid FROM processed_events
            WHERE expires_at < NOW()
            LIMIT batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO deleted_count FROM deleted;

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
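
-- Example invocation (illustrative). Scheduling is left to the host: a worker
-- loop can call the function until it returns 0, or pg_cron can drive it if
-- that extension happens to be installed:
--
--   SELECT cleanup_expired_processed_events(5000);
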
COMMIT;
@@ -0,0 +1,278 @@
-- 003_dead_letter.sql
-- Dead-letter store for failed jobs with error classification and replay (ORCH-SVC-33-004)
-- Adds dead_letter_entries, replay_audit, and notification_rules tables.

BEGIN;

-- Dead-letter entry status
CREATE TYPE dead_letter_status AS ENUM (
    'pending',      -- Awaiting operator action or auto-replay
    'replaying',    -- Currently being replayed
    'replayed',     -- Successfully replayed as new job
    'resolved',     -- Manually resolved without replay
    'exhausted',    -- All replay attempts exhausted
    'expired'       -- Expired and eligible for purge
);

-- Error classification category
CREATE TYPE error_category AS ENUM (
    'unknown',          -- Unclassified error
    'transient',        -- Transient infrastructure error
    'not_found',        -- Resource not found
    'auth_failure',     -- Authentication/authorization failure
    'rate_limited',     -- Rate limiting or quota exceeded
    'validation_error', -- Invalid input or configuration
    'upstream_error',   -- External service error
    'internal_error',   -- Internal processing error
    'conflict',         -- Resource conflict
    'canceled'          -- Operation canceled
);

-- Dead-letter Entries: Failed jobs awaiting remediation
CREATE TABLE dead_letter_entries (
    entry_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    -- Original job reference
    original_job_id UUID NOT NULL,
    run_id UUID,
    source_id UUID,
    job_type TEXT NOT NULL,
    -- Payload preservation
    payload JSONB NOT NULL,
    payload_digest CHAR(64) NOT NULL,   -- SHA-256 of payload
    idempotency_key TEXT NOT NULL,
    correlation_id TEXT,
    -- Status and classification
    status dead_letter_status NOT NULL DEFAULT 'pending',
    error_code TEXT NOT NULL,
    failure_reason TEXT NOT NULL,
    remediation_hint TEXT,
    category error_category NOT NULL DEFAULT 'unknown',
    is_retryable BOOLEAN NOT NULL DEFAULT FALSE,
    -- Attempt tracking
    original_attempts INTEGER NOT NULL,
    replay_attempts INTEGER NOT NULL DEFAULT 0,
    max_replay_attempts INTEGER NOT NULL DEFAULT 3,
    -- Timestamps
    failed_at TIMESTAMPTZ NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    expires_at TIMESTAMPTZ NOT NULL,
    resolved_at TIMESTAMPTZ,
    -- Resolution
    resolution_notes TEXT,
    -- Audit
    created_by TEXT NOT NULL,
    updated_by TEXT NOT NULL,
    CONSTRAINT pk_dead_letter_entries PRIMARY KEY (tenant_id, entry_id),
    CONSTRAINT ck_dead_letter_payload_digest CHECK (payload_digest ~ '^[0-9a-f]{64}$'),
    CONSTRAINT ck_dead_letter_attempts CHECK (replay_attempts >= 0 AND replay_attempts <= max_replay_attempts + 1)
) PARTITION BY LIST (tenant_id);

CREATE TABLE dead_letter_entries_default PARTITION OF dead_letter_entries DEFAULT;

-- Indexes for common query patterns
CREATE INDEX ix_dead_letter_status ON dead_letter_entries (tenant_id, status, created_at DESC);
CREATE INDEX ix_dead_letter_job ON dead_letter_entries (tenant_id, original_job_id);
CREATE INDEX ix_dead_letter_job_type ON dead_letter_entries (tenant_id, job_type, status, created_at DESC);
CREATE INDEX ix_dead_letter_category ON dead_letter_entries (tenant_id, category, status);
CREATE INDEX ix_dead_letter_error_code ON dead_letter_entries (tenant_id, error_code, status);
CREATE INDEX ix_dead_letter_expires ON dead_letter_entries (expires_at) WHERE status NOT IN ('replayed', 'resolved', 'exhausted');
CREATE INDEX ix_dead_letter_source ON dead_letter_entries (tenant_id, source_id, status) WHERE source_id IS NOT NULL;
CREATE INDEX ix_dead_letter_run ON dead_letter_entries (tenant_id, run_id, status) WHERE run_id IS NOT NULL;
CREATE INDEX ix_dead_letter_retryable ON dead_letter_entries (tenant_id, is_retryable, status) WHERE is_retryable = TRUE AND status = 'pending';

-- Replay Audit: Track replay attempts for auditing and debugging
CREATE TABLE dead_letter_replay_audit (
    audit_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    entry_id UUID NOT NULL,
    attempt_number INTEGER NOT NULL,
    -- Outcome
    success BOOLEAN NOT NULL,
    new_job_id UUID,                -- If successful, the new job ID
    error_message TEXT,             -- If failed, the reason
    -- Context
    triggered_by TEXT NOT NULL,     -- 'auto', 'manual', 'batch'
    triggered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMPTZ,
    -- Audit
    initiated_by TEXT NOT NULL,
    CONSTRAINT pk_dead_letter_replay_audit PRIMARY KEY (tenant_id, audit_id),
    CONSTRAINT fk_dead_letter_replay_audit_entry FOREIGN KEY (tenant_id, entry_id)
        REFERENCES dead_letter_entries (tenant_id, entry_id) ON DELETE CASCADE,
    CONSTRAINT uq_dead_letter_replay_audit_attempt UNIQUE (tenant_id, entry_id, attempt_number)
) PARTITION BY LIST (tenant_id);

CREATE TABLE dead_letter_replay_audit_default PARTITION OF dead_letter_replay_audit DEFAULT;

CREATE INDEX ix_dead_letter_replay_audit_entry ON dead_letter_replay_audit (tenant_id, entry_id, attempt_number);
CREATE INDEX ix_dead_letter_replay_audit_job ON dead_letter_replay_audit (tenant_id, new_job_id) WHERE new_job_id IS NOT NULL;

-- Notification Rules: Configure alerting for dead-letter events
CREATE TABLE dead_letter_notification_rules (
    rule_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    -- Filter criteria (all optional - match any if not specified)
    job_type_pattern TEXT,          -- Regex pattern for job types
    error_code_pattern TEXT,        -- Regex pattern for error codes
    category error_category,
    source_id UUID,
    -- Notification settings
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    channel TEXT NOT NULL,          -- 'email', 'slack', 'teams', 'webhook', 'pagerduty'
    endpoint TEXT NOT NULL,         -- Email address, webhook URL, etc.
    -- Throttling
    cooldown_minutes INTEGER NOT NULL DEFAULT 15,
    max_per_hour INTEGER NOT NULL DEFAULT 10,
    aggregate BOOLEAN NOT NULL DEFAULT TRUE,    -- Aggregate notifications
    -- State
    last_notified_at TIMESTAMPTZ,
    notifications_sent INTEGER NOT NULL DEFAULT 0,
    -- Audit
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by TEXT NOT NULL,
    updated_by TEXT NOT NULL,
    CONSTRAINT pk_dead_letter_notification_rules PRIMARY KEY (tenant_id, rule_id),
    CONSTRAINT ck_dead_letter_notification_channel CHECK (channel IN ('email', 'slack', 'teams', 'webhook', 'pagerduty')),
    CONSTRAINT ck_dead_letter_notification_cooldown CHECK (cooldown_minutes >= 0),
    CONSTRAINT ck_dead_letter_notification_max_per_hour CHECK (max_per_hour > 0)
) PARTITION BY LIST (tenant_id);

CREATE TABLE dead_letter_notification_rules_default PARTITION OF dead_letter_notification_rules DEFAULT;

CREATE INDEX ix_dead_letter_notification_rules_enabled ON dead_letter_notification_rules (tenant_id, enabled) WHERE enabled = TRUE;
CREATE INDEX ix_dead_letter_notification_rules_source ON dead_letter_notification_rules (tenant_id, source_id) WHERE source_id IS NOT NULL;
CREATE INDEX ix_dead_letter_notification_rules_category ON dead_letter_notification_rules (tenant_id, category) WHERE category IS NOT NULL;

-- Notification Log: Track sent notifications for throttling and auditing
CREATE TABLE dead_letter_notification_log (
    log_id UUID NOT NULL,
    tenant_id TEXT NOT NULL,
    rule_id UUID NOT NULL,
    entry_ids UUID[] NOT NULL,      -- Entries included in this notification
    channel TEXT NOT NULL,
    endpoint TEXT NOT NULL,
    -- Outcome
    success BOOLEAN NOT NULL,
    error_message TEXT,
    -- Context
    subject TEXT,
    entry_count INTEGER NOT NULL,
    sent_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    CONSTRAINT pk_dead_letter_notification_log PRIMARY KEY (tenant_id, log_id),
    CONSTRAINT fk_dead_letter_notification_log_rule FOREIGN KEY (tenant_id, rule_id)
        REFERENCES dead_letter_notification_rules (tenant_id, rule_id) ON DELETE CASCADE
) PARTITION BY LIST (tenant_id);

CREATE TABLE dead_letter_notification_log_default PARTITION OF dead_letter_notification_log DEFAULT;

CREATE INDEX ix_dead_letter_notification_log_rule ON dead_letter_notification_log (tenant_id, rule_id, sent_at DESC);
CREATE INDEX ix_dead_letter_notification_log_sent ON dead_letter_notification_log (tenant_id, sent_at DESC);

-- Dead-letter statistics view
CREATE OR REPLACE VIEW dead_letter_stats AS
SELECT
    tenant_id,
    status,
    category,
    error_code,
    job_type,
    is_retryable,
    COUNT(*) AS entry_count,
    COUNT(*) FILTER (WHERE replay_attempts = 0) AS never_replayed,
    AVG(replay_attempts)::NUMERIC(5,2) AS avg_replay_attempts,
    MIN(created_at) AS oldest_entry,
    MAX(created_at) AS newest_entry,
    COUNT(*) FILTER (WHERE expires_at < NOW()) AS expired_count
FROM dead_letter_entries
GROUP BY tenant_id, status, category, error_code, job_type, is_retryable;

-- Function to mark expired entries
CREATE OR REPLACE FUNCTION mark_expired_dead_letter_entries(batch_limit INTEGER DEFAULT 1000)
RETURNS INTEGER AS $$
DECLARE
    updated_count INTEGER;
BEGIN
    WITH expired AS (
        UPDATE dead_letter_entries
        SET status = 'expired',
            updated_at = NOW(),
            updated_by = 'system'
        WHERE ctid IN (
            SELECT ctid FROM dead_letter_entries
            WHERE status NOT IN ('replayed', 'resolved', 'exhausted', 'expired')
              AND expires_at < NOW()
            LIMIT batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO updated_count FROM expired;

    RETURN updated_count;
END;
$$ LANGUAGE plpgsql;

-- Function to purge old resolved/expired entries (retention cleanup)
CREATE OR REPLACE FUNCTION purge_dead_letter_entries(
    retention_days INTEGER DEFAULT 90,
    batch_limit INTEGER DEFAULT 1000
)
RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
    cutoff_date TIMESTAMPTZ;
BEGIN
    cutoff_date := NOW() - (retention_days || ' days')::INTERVAL;

    WITH deleted AS (
        DELETE FROM dead_letter_entries
        WHERE ctid IN (
            SELECT ctid FROM dead_letter_entries
            WHERE status IN ('replayed', 'resolved', 'exhausted', 'expired')
              AND updated_at < cutoff_date
            LIMIT batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO deleted_count FROM deleted;

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;

-- Function to get actionable dead-letter entries (for dashboard)
CREATE OR REPLACE FUNCTION get_actionable_dead_letter_summary(
    p_tenant_id TEXT,
    p_limit INTEGER DEFAULT 10
)
RETURNS TABLE (
    error_code TEXT,
    category error_category,
    entry_count BIGINT,
    retryable_count BIGINT,
    oldest_entry TIMESTAMPTZ,
    sample_reason TEXT
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        dle.error_code,
        dle.category,
        COUNT(*)::BIGINT AS entry_count,
        COUNT(*) FILTER (WHERE dle.is_retryable)::BIGINT AS retryable_count,
        MIN(dle.created_at) AS oldest_entry,
        -- The inner table is aliased and fully qualified: unqualified column
        -- names here would be ambiguous with the function's OUT parameters.
        (SELECT s.failure_reason FROM dead_letter_entries s
         WHERE s.tenant_id = p_tenant_id AND s.error_code = dle.error_code AND s.status = 'pending'
         ORDER BY s.created_at DESC LIMIT 1) AS sample_reason
    FROM dead_letter_entries dle
    WHERE dle.tenant_id = p_tenant_id
      AND dle.status = 'pending'
    GROUP BY dle.error_code, dle.category
    ORDER BY COUNT(*) DESC
    LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
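
-- Example invocation (illustrative): the five largest groups of pending
-- failures for one tenant, with a sample failure reason for each group.
--
--   SELECT * FROM get_actionable_dead_letter_summary('tenant-a', 5);
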
COMMIT;
@@ -0,0 +1,243 @@
-- Migration: 004_slo_quotas
-- Creates tables for SLO management and quota APIs

-- SLO definitions table
CREATE TABLE IF NOT EXISTS slos (
    slo_id UUID PRIMARY KEY,
    tenant_id TEXT NOT NULL,
    name TEXT NOT NULL,
    description TEXT,
    slo_type TEXT NOT NULL CHECK (slo_type IN ('availability', 'latency', 'throughput')),
    job_type TEXT,
    source_id UUID,
    target DOUBLE PRECISION NOT NULL CHECK (target > 0 AND target <= 1),
    -- "window" is a reserved word in PostgreSQL and must be quoted.
    "window" TEXT NOT NULL CHECK ("window" IN ('one_hour', 'one_day', 'seven_days', 'thirty_days')),
    latency_percentile DOUBLE PRECISION CHECK (latency_percentile IS NULL OR (latency_percentile >= 0 AND latency_percentile <= 1)),
    latency_target_seconds DOUBLE PRECISION CHECK (latency_target_seconds IS NULL OR latency_target_seconds > 0),
    throughput_minimum INTEGER CHECK (throughput_minimum IS NULL OR throughput_minimum > 0),
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by TEXT NOT NULL,
    updated_by TEXT NOT NULL,
    UNIQUE (tenant_id, name)
);

-- Indexes for SLOs
CREATE INDEX IF NOT EXISTS idx_slos_tenant ON slos(tenant_id);
CREATE INDEX IF NOT EXISTS idx_slos_tenant_enabled ON slos(tenant_id, enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_slos_tenant_job_type ON slos(tenant_id, job_type);
CREATE INDEX IF NOT EXISTS idx_slos_tenant_source ON slos(tenant_id, source_id);

-- Alert budget thresholds table
CREATE TABLE IF NOT EXISTS alert_budget_thresholds (
    threshold_id UUID PRIMARY KEY,
    slo_id UUID NOT NULL REFERENCES slos(slo_id) ON DELETE CASCADE,
    tenant_id TEXT NOT NULL,
    budget_consumed_threshold DOUBLE PRECISION NOT NULL CHECK (budget_consumed_threshold >= 0 AND budget_consumed_threshold <= 1),
    burn_rate_threshold DOUBLE PRECISION CHECK (burn_rate_threshold IS NULL OR burn_rate_threshold > 0),
    severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical', 'emergency')),
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    notification_channel TEXT,
    notification_endpoint TEXT,
    cooldown_seconds INTEGER NOT NULL DEFAULT 3600,
    last_triggered_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    created_by TEXT NOT NULL,
    updated_by TEXT NOT NULL
);

-- Indexes for alert thresholds
CREATE INDEX IF NOT EXISTS idx_alert_thresholds_slo ON alert_budget_thresholds(slo_id);
CREATE INDEX IF NOT EXISTS idx_alert_thresholds_tenant ON alert_budget_thresholds(tenant_id);
CREATE INDEX IF NOT EXISTS idx_alert_thresholds_enabled ON alert_budget_thresholds(slo_id, enabled) WHERE enabled = TRUE;

-- SLO alerts table
CREATE TABLE IF NOT EXISTS slo_alerts (
    alert_id UUID PRIMARY KEY,
    slo_id UUID NOT NULL REFERENCES slos(slo_id) ON DELETE CASCADE,
    threshold_id UUID NOT NULL REFERENCES alert_budget_thresholds(threshold_id) ON DELETE CASCADE,
    tenant_id TEXT NOT NULL,
    severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical', 'emergency')),
    message TEXT NOT NULL,
    budget_consumed DOUBLE PRECISION NOT NULL,
    burn_rate DOUBLE PRECISION NOT NULL,
    current_sli DOUBLE PRECISION NOT NULL,
    triggered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    acknowledged_at TIMESTAMPTZ,
    acknowledged_by TEXT,
    resolved_at TIMESTAMPTZ,
    resolution_notes TEXT
);

-- Indexes for SLO alerts
CREATE INDEX IF NOT EXISTS idx_slo_alerts_tenant ON slo_alerts(tenant_id);
CREATE INDEX IF NOT EXISTS idx_slo_alerts_slo ON slo_alerts(slo_id);
CREATE INDEX IF NOT EXISTS idx_slo_alerts_tenant_triggered ON slo_alerts(tenant_id, triggered_at DESC);
CREATE INDEX IF NOT EXISTS idx_slo_alerts_active ON slo_alerts(tenant_id, resolved_at) WHERE resolved_at IS NULL;
CREATE INDEX IF NOT EXISTS idx_slo_alerts_unacknowledged ON slo_alerts(tenant_id, acknowledged_at) WHERE acknowledged_at IS NULL;

-- SLO state snapshots for historical tracking
CREATE TABLE IF NOT EXISTS slo_state_snapshots (
    snapshot_id UUID PRIMARY KEY,
    slo_id UUID NOT NULL REFERENCES slos(slo_id) ON DELETE CASCADE,
    tenant_id TEXT NOT NULL,
    current_sli DOUBLE PRECISION NOT NULL,
    total_events BIGINT NOT NULL,
    good_events BIGINT NOT NULL,
    bad_events BIGINT NOT NULL,
    budget_consumed DOUBLE PRECISION NOT NULL,
    budget_remaining DOUBLE PRECISION NOT NULL,
    burn_rate DOUBLE PRECISION NOT NULL,
    is_met BOOLEAN NOT NULL,
    alert_severity TEXT NOT NULL,
    computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    window_start TIMESTAMPTZ NOT NULL,
    window_end TIMESTAMPTZ NOT NULL
);

-- Indexes for state snapshots
CREATE INDEX IF NOT EXISTS idx_slo_snapshots_slo ON slo_state_snapshots(slo_id, computed_at DESC);
CREATE INDEX IF NOT EXISTS idx_slo_snapshots_tenant ON slo_state_snapshots(tenant_id, computed_at DESC);
CREATE INDEX IF NOT EXISTS idx_slo_snapshots_cleanup ON slo_state_snapshots(computed_at);

-- Quota audit log for tracking changes
CREATE TABLE IF NOT EXISTS quota_audit_log (
    audit_id UUID PRIMARY KEY,
    tenant_id TEXT NOT NULL,
    quota_id UUID NOT NULL,
    action TEXT NOT NULL CHECK (action IN ('created', 'updated', 'paused', 'resumed', 'deleted')),
    old_values JSONB,
    new_values JSONB,
    reason TEXT,
    ticket TEXT,
    performed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    performed_by TEXT NOT NULL
);

-- Indexes for quota audit log
CREATE INDEX IF NOT EXISTS idx_quota_audit_tenant ON quota_audit_log(tenant_id);
CREATE INDEX IF NOT EXISTS idx_quota_audit_quota ON quota_audit_log(quota_id);
CREATE INDEX IF NOT EXISTS idx_quota_audit_time ON quota_audit_log(performed_at DESC);

-- Job metrics aggregation table for SLO computation
-- Stores pre-aggregated metrics per hour for efficient SLO queries
CREATE TABLE IF NOT EXISTS job_metrics_hourly (
    metric_id UUID PRIMARY KEY,
    tenant_id TEXT NOT NULL,
    job_type TEXT,
    source_id UUID,
    hour_start TIMESTAMPTZ NOT NULL,
    total_jobs BIGINT NOT NULL DEFAULT 0,
    successful_jobs BIGINT NOT NULL DEFAULT 0,
    failed_jobs BIGINT NOT NULL DEFAULT 0,
    latency_p50_seconds DOUBLE PRECISION,
    latency_p95_seconds DOUBLE PRECISION,
    latency_p99_seconds DOUBLE PRECISION,
    avg_latency_seconds DOUBLE PRECISION,
    min_latency_seconds DOUBLE PRECISION,
    max_latency_seconds DOUBLE PRECISION,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE (tenant_id, job_type, source_id, hour_start)
);

-- Indexes for job metrics
CREATE INDEX IF NOT EXISTS idx_job_metrics_tenant ON job_metrics_hourly(tenant_id, hour_start DESC);
CREATE INDEX IF NOT EXISTS idx_job_metrics_tenant_type ON job_metrics_hourly(tenant_id, job_type, hour_start DESC);
CREATE INDEX IF NOT EXISTS idx_job_metrics_cleanup ON job_metrics_hourly(hour_start);

-- Function to aggregate job metrics for SLO computation
CREATE OR REPLACE FUNCTION get_slo_availability_counts(
    p_tenant_id TEXT,
    p_job_type TEXT,
    p_source_id UUID,
    p_window_start TIMESTAMPTZ,
    p_window_end TIMESTAMPTZ
) RETURNS TABLE (
    total_events BIGINT,
    good_events BIGINT,
    bad_events BIGINT
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        COALESCE(SUM(total_jobs), 0)::BIGINT AS total_events,
        COALESCE(SUM(successful_jobs), 0)::BIGINT AS good_events,
        COALESCE(SUM(failed_jobs), 0)::BIGINT AS bad_events
    FROM job_metrics_hourly
    WHERE tenant_id = p_tenant_id
      AND hour_start >= p_window_start
      AND hour_start < p_window_end
      AND (p_job_type IS NULL OR job_type = p_job_type)
      AND (p_source_id IS NULL OR source_id = p_source_id);
END;
$$ LANGUAGE plpgsql;
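
-- Example (illustrative): availability counts for one job type over the last
-- seven days; the SLI is then good_events::DOUBLE PRECISION / NULLIF(total_events, 0).
--
--   SELECT * FROM get_slo_availability_counts(
--       'tenant-a', 'scan', NULL, NOW() - INTERVAL '7 days', NOW());
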
-- Function to clean up old SLO state snapshots
CREATE OR REPLACE FUNCTION cleanup_slo_snapshots(
    p_retention_days INTEGER DEFAULT 90,
    p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
BEGIN
    -- DELETE does not support LIMIT directly; bound the batch via a ctid subquery.
    WITH deleted AS (
        DELETE FROM slo_state_snapshots
        WHERE ctid IN (
            SELECT ctid FROM slo_state_snapshots
            WHERE computed_at < NOW() - (p_retention_days || ' days')::INTERVAL
            LIMIT p_batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO deleted_count FROM deleted;

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;

-- Function to clean up old quota audit logs
CREATE OR REPLACE FUNCTION cleanup_quota_audit_log(
    p_retention_days INTEGER DEFAULT 365,
    p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
BEGIN
    WITH deleted AS (
        DELETE FROM quota_audit_log
        WHERE ctid IN (
            SELECT ctid FROM quota_audit_log
            WHERE performed_at < NOW() - (p_retention_days || ' days')::INTERVAL
            LIMIT p_batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO deleted_count FROM deleted;

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;

-- Function to get SLO summary for a tenant
CREATE OR REPLACE FUNCTION get_slo_summary(
    p_tenant_id TEXT
) RETURNS TABLE (
    total_slos BIGINT,
    enabled_slos BIGINT,
    active_alerts BIGINT,
    unacknowledged_alerts BIGINT,
    critical_alerts BIGINT
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        (SELECT COUNT(*) FROM slos WHERE tenant_id = p_tenant_id)::BIGINT AS total_slos,
        (SELECT COUNT(*) FROM slos WHERE tenant_id = p_tenant_id AND enabled = TRUE)::BIGINT AS enabled_slos,
        (SELECT COUNT(*) FROM slo_alerts WHERE tenant_id = p_tenant_id AND resolved_at IS NULL)::BIGINT AS active_alerts,
        (SELECT COUNT(*) FROM slo_alerts WHERE tenant_id = p_tenant_id AND acknowledged_at IS NULL AND resolved_at IS NULL)::BIGINT AS unacknowledged_alerts,
        (SELECT COUNT(*) FROM slo_alerts WHERE tenant_id = p_tenant_id AND severity IN ('critical', 'emergency') AND resolved_at IS NULL)::BIGINT AS critical_alerts;
END;
$$ LANGUAGE plpgsql;

COMMENT ON TABLE slos IS 'Service Level Objective definitions for tenants';
COMMENT ON TABLE alert_budget_thresholds IS 'Alert thresholds for SLO error budget consumption';
COMMENT ON TABLE slo_alerts IS 'SLO alert events triggered by threshold violations';
COMMENT ON TABLE slo_state_snapshots IS 'Historical snapshots of SLO state for trend analysis';
COMMENT ON TABLE quota_audit_log IS 'Audit trail for quota configuration changes';
COMMENT ON TABLE job_metrics_hourly IS 'Pre-aggregated hourly job metrics for efficient SLO computation';
@@ -0,0 +1,417 @@
-- Migration: 005_audit_ledger
-- Creates tables for audit logging and immutable run ledger

-- Audit log entries table (immutable append-only log)
CREATE TABLE IF NOT EXISTS audit_entries (
    entry_id UUID PRIMARY KEY,
    tenant_id TEXT NOT NULL,
    event_type INTEGER NOT NULL,
    resource_type TEXT NOT NULL,
    resource_id UUID NOT NULL,
    actor_id TEXT NOT NULL,
    actor_type INTEGER NOT NULL,
    actor_ip TEXT,
    user_agent TEXT,
    http_method TEXT,
    request_path TEXT,
    old_state JSONB,
    new_state JSONB,
    description TEXT NOT NULL,
    correlation_id TEXT,
    previous_entry_hash TEXT,
    content_hash TEXT NOT NULL,
    sequence_number BIGINT NOT NULL,
    occurred_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    metadata JSONB
);
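
-- Chain columns: content_hash is a SHA-256 hex digest over the entry payload, and
-- previous_entry_hash links to the prior entry's content_hash for the same tenant,
-- so any edit or deletion breaks the chain (see verify_audit_chain below).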

-- Indexes for audit log
CREATE INDEX IF NOT EXISTS idx_audit_tenant ON audit_entries(tenant_id);
CREATE INDEX IF NOT EXISTS idx_audit_tenant_time ON audit_entries(tenant_id, occurred_at DESC);
CREATE INDEX IF NOT EXISTS idx_audit_tenant_seq ON audit_entries(tenant_id, sequence_number DESC);
CREATE INDEX IF NOT EXISTS idx_audit_resource ON audit_entries(tenant_id, resource_type, resource_id);
CREATE INDEX IF NOT EXISTS idx_audit_actor ON audit_entries(tenant_id, actor_id);
CREATE INDEX IF NOT EXISTS idx_audit_event_type ON audit_entries(tenant_id, event_type);
CREATE INDEX IF NOT EXISTS idx_audit_correlation ON audit_entries(correlation_id) WHERE correlation_id IS NOT NULL;

-- Run ledger entries table (immutable run execution records)
CREATE TABLE IF NOT EXISTS run_ledger_entries (
    ledger_id UUID PRIMARY KEY,
    tenant_id TEXT NOT NULL,
    run_id UUID NOT NULL,
    source_id UUID NOT NULL,
    run_type TEXT NOT NULL,
    final_status INTEGER NOT NULL,
    total_jobs INTEGER NOT NULL,
    succeeded_jobs INTEGER NOT NULL,
    failed_jobs INTEGER NOT NULL,
    run_created_at TIMESTAMPTZ NOT NULL,
    run_started_at TIMESTAMPTZ,
    run_completed_at TIMESTAMPTZ NOT NULL,
    execution_duration_ms BIGINT NOT NULL,
    initiated_by TEXT NOT NULL,
    input_digest TEXT NOT NULL,
    output_digest TEXT NOT NULL,
    artifact_manifest JSONB NOT NULL,
    sequence_number BIGINT NOT NULL,
    previous_entry_hash TEXT,
    content_hash TEXT NOT NULL,
    ledger_created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    correlation_id TEXT,
    metadata JSONB
);

-- Indexes for run ledger
CREATE INDEX IF NOT EXISTS idx_ledger_tenant ON run_ledger_entries(tenant_id);
CREATE INDEX IF NOT EXISTS idx_ledger_tenant_time ON run_ledger_entries(tenant_id, ledger_created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ledger_tenant_seq ON run_ledger_entries(tenant_id, sequence_number DESC);
CREATE INDEX IF NOT EXISTS idx_ledger_run ON run_ledger_entries(run_id);
CREATE INDEX IF NOT EXISTS idx_ledger_source ON run_ledger_entries(tenant_id, source_id);
CREATE INDEX IF NOT EXISTS idx_ledger_run_type ON run_ledger_entries(tenant_id, run_type);
CREATE INDEX IF NOT EXISTS idx_ledger_content_hash ON run_ledger_entries(content_hash);
CREATE UNIQUE INDEX IF NOT EXISTS idx_ledger_tenant_run ON run_ledger_entries(tenant_id, run_id);

-- Ledger exports table
CREATE TABLE IF NOT EXISTS ledger_exports (
    export_id UUID PRIMARY KEY,
    tenant_id TEXT NOT NULL,
    status INTEGER NOT NULL DEFAULT 0,
    format TEXT NOT NULL CHECK (format IN ('json', 'ndjson', 'csv')),
    start_time TIMESTAMPTZ,
    end_time TIMESTAMPTZ,
    run_type_filter TEXT,
    source_id_filter UUID,
    entry_count INTEGER NOT NULL DEFAULT 0,
    output_uri TEXT,
    output_digest TEXT,
    output_size_bytes BIGINT,
    requested_by TEXT NOT NULL,
    requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    started_at TIMESTAMPTZ,
    completed_at TIMESTAMPTZ,
    error_message TEXT
);

-- Indexes for ledger exports
CREATE INDEX IF NOT EXISTS idx_exports_tenant ON ledger_exports(tenant_id);
CREATE INDEX IF NOT EXISTS idx_exports_tenant_time ON ledger_exports(tenant_id, requested_at DESC);
CREATE INDEX IF NOT EXISTS idx_exports_status ON ledger_exports(tenant_id, status);

-- Signed manifests table
CREATE TABLE IF NOT EXISTS signed_manifests (
    manifest_id UUID PRIMARY KEY,
    schema_version TEXT NOT NULL,
    tenant_id TEXT NOT NULL,
    provenance_type INTEGER NOT NULL,
    subject_id UUID NOT NULL,
    statements JSONB NOT NULL,
    artifacts JSONB NOT NULL,
    materials JSONB NOT NULL,
    build_info JSONB,
    payload_digest TEXT NOT NULL,
    signature_algorithm TEXT NOT NULL,
    signature TEXT NOT NULL,
    key_id TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    expires_at TIMESTAMPTZ,
    metadata JSONB
);

-- Indexes for signed manifests
CREATE INDEX IF NOT EXISTS idx_manifests_tenant ON signed_manifests(tenant_id);
CREATE INDEX IF NOT EXISTS idx_manifests_subject ON signed_manifests(tenant_id, provenance_type, subject_id);
CREATE INDEX IF NOT EXISTS idx_manifests_payload ON signed_manifests(payload_digest);
CREATE INDEX IF NOT EXISTS idx_manifests_key ON signed_manifests(key_id);
CREATE INDEX IF NOT EXISTS idx_manifests_expiry ON signed_manifests(expires_at) WHERE expires_at IS NOT NULL;

-- Sequence tracking for audit entries per tenant
CREATE TABLE IF NOT EXISTS audit_sequences (
    tenant_id TEXT PRIMARY KEY,
    last_sequence_number BIGINT NOT NULL DEFAULT 0,
    last_entry_hash TEXT,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Sequence tracking for ledger entries per tenant
CREATE TABLE IF NOT EXISTS ledger_sequences (
    tenant_id TEXT PRIMARY KEY,
    last_sequence_number BIGINT NOT NULL DEFAULT 0,
    last_entry_hash TEXT,
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Function to get the next audit sequence number for a tenant
CREATE OR REPLACE FUNCTION next_audit_sequence(
    p_tenant_id TEXT
) RETURNS TABLE (
    next_seq BIGINT,
    prev_hash TEXT
) AS $$
DECLARE
    v_next_seq BIGINT;
    v_prev_hash TEXT;
BEGIN
    -- Lock and update the sequence
    INSERT INTO audit_sequences (tenant_id, last_sequence_number, last_entry_hash, updated_at)
    VALUES (p_tenant_id, 1, NULL, NOW())
    ON CONFLICT (tenant_id)
    DO UPDATE SET
        last_sequence_number = audit_sequences.last_sequence_number + 1,
        updated_at = NOW()
    RETURNING audit_sequences.last_sequence_number, audit_sequences.last_entry_hash
    INTO v_next_seq, v_prev_hash;

    RETURN QUERY SELECT v_next_seq, v_prev_hash;
END;
$$ LANGUAGE plpgsql;
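
-- Illustrative append flow (assumed application pattern, shown as comments):
--   1) SELECT * FROM next_audit_sequence('tenant-a');   -- yields next_seq and prev_hash
--   2) INSERT INTO audit_entries (..., sequence_number, previous_entry_hash, content_hash, ...)
--      using next_seq, prev_hash, and a hash computed over the new entry;
--   3) SELECT update_audit_sequence_hash('tenant-a', '<new content hash>');
-- Steps 1-3 should run in one transaction so concurrent writers cannot interleave.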

-- Function to update audit sequence with new hash after insertion
CREATE OR REPLACE FUNCTION update_audit_sequence_hash(
    p_tenant_id TEXT,
    p_content_hash TEXT
) RETURNS VOID AS $$
BEGIN
    UPDATE audit_sequences
    SET last_entry_hash = p_content_hash,
        updated_at = NOW()
    WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;

-- Function to get the next ledger sequence number for a tenant
CREATE OR REPLACE FUNCTION next_ledger_sequence(
    p_tenant_id TEXT
) RETURNS TABLE (
    next_seq BIGINT,
    prev_hash TEXT
) AS $$
DECLARE
    v_next_seq BIGINT;
    v_prev_hash TEXT;
BEGIN
    -- Lock and update the sequence
    INSERT INTO ledger_sequences (tenant_id, last_sequence_number, last_entry_hash, updated_at)
    VALUES (p_tenant_id, 1, NULL, NOW())
    ON CONFLICT (tenant_id)
    DO UPDATE SET
        last_sequence_number = ledger_sequences.last_sequence_number + 1,
        updated_at = NOW()
    RETURNING ledger_sequences.last_sequence_number, ledger_sequences.last_entry_hash
    INTO v_next_seq, v_prev_hash;

    RETURN QUERY SELECT v_next_seq, v_prev_hash;
END;
$$ LANGUAGE plpgsql;

-- Function to update ledger sequence with new hash after insertion
CREATE OR REPLACE FUNCTION update_ledger_sequence_hash(
    p_tenant_id TEXT,
    p_content_hash TEXT
) RETURNS VOID AS $$
BEGIN
    UPDATE ledger_sequences
    SET last_entry_hash = p_content_hash,
        updated_at = NOW()
    WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;

-- Function to verify audit chain integrity
CREATE OR REPLACE FUNCTION verify_audit_chain(
    p_tenant_id TEXT,
    p_start_seq BIGINT DEFAULT 1,
    p_end_seq BIGINT DEFAULT NULL
) RETURNS TABLE (
    is_valid BOOLEAN,
    invalid_entry_id UUID,
    invalid_sequence BIGINT,
    error_message TEXT
) AS $$
DECLARE
    v_prev_hash TEXT;
    v_entry RECORD;
BEGIN
    FOR v_entry IN
        SELECT entry_id, sequence_number, previous_entry_hash, content_hash
        FROM audit_entries
        WHERE tenant_id = p_tenant_id
          AND sequence_number >= p_start_seq
          AND (p_end_seq IS NULL OR sequence_number <= p_end_seq)
        ORDER BY sequence_number ASC
    LOOP
        -- The first entry of a chain (sequence 1) must have a null previous hash
        IF v_entry.sequence_number = 1 AND v_entry.previous_entry_hash IS NOT NULL THEN
            RETURN QUERY SELECT FALSE, v_entry.entry_id, v_entry.sequence_number,
                'First entry should have null previous_entry_hash'::TEXT;
            RETURN;
        END IF;

        -- Check chain link
        IF v_prev_hash IS NOT NULL AND v_entry.previous_entry_hash != v_prev_hash THEN
            RETURN QUERY SELECT FALSE, v_entry.entry_id, v_entry.sequence_number,
                format('Chain break: expected %s, got %s', v_prev_hash, v_entry.previous_entry_hash);
            RETURN;
        END IF;

        v_prev_hash := v_entry.content_hash;
    END LOOP;

    RETURN QUERY SELECT TRUE, NULL::UUID, NULL::BIGINT, NULL::TEXT;
END;
$$ LANGUAGE plpgsql;
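
-- Illustrative usage: verify the whole chain, or a bounded sequence range.
-- SELECT * FROM verify_audit_chain('tenant-a');
-- SELECT * FROM verify_audit_chain('tenant-a', 100, 200);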

-- Function to verify ledger chain integrity
CREATE OR REPLACE FUNCTION verify_ledger_chain(
    p_tenant_id TEXT,
    p_start_seq BIGINT DEFAULT 1,
    p_end_seq BIGINT DEFAULT NULL
) RETURNS TABLE (
    is_valid BOOLEAN,
    invalid_ledger_id UUID,
    invalid_sequence BIGINT,
    error_message TEXT
) AS $$
DECLARE
    v_prev_hash TEXT;
    v_entry RECORD;
BEGIN
    FOR v_entry IN
        SELECT ledger_id, sequence_number, previous_entry_hash, content_hash
        FROM run_ledger_entries
        WHERE tenant_id = p_tenant_id
          AND sequence_number >= p_start_seq
          AND (p_end_seq IS NULL OR sequence_number <= p_end_seq)
        ORDER BY sequence_number ASC
    LOOP
        -- The first entry of a chain (sequence 1) must have a null previous hash
        IF v_entry.sequence_number = 1 AND v_entry.previous_entry_hash IS NOT NULL THEN
            RETURN QUERY SELECT FALSE, v_entry.ledger_id, v_entry.sequence_number,
                'First entry should have null previous_entry_hash'::TEXT;
            RETURN;
        END IF;

        -- Check chain link
        IF v_prev_hash IS NOT NULL AND v_entry.previous_entry_hash != v_prev_hash THEN
            RETURN QUERY SELECT FALSE, v_entry.ledger_id, v_entry.sequence_number,
                format('Chain break: expected %s, got %s', v_prev_hash, v_entry.previous_entry_hash);
            RETURN;
        END IF;

        v_prev_hash := v_entry.content_hash;
    END LOOP;

    RETURN QUERY SELECT TRUE, NULL::UUID, NULL::BIGINT, NULL::TEXT;
END;
$$ LANGUAGE plpgsql;

-- Function to get audit summary statistics
CREATE OR REPLACE FUNCTION get_audit_summary(
    p_tenant_id TEXT,
    p_since TIMESTAMPTZ DEFAULT NULL
) RETURNS TABLE (
    total_entries BIGINT,
    entries_since BIGINT,
    event_types BIGINT,
    unique_actors BIGINT,
    unique_resources BIGINT,
    earliest_entry TIMESTAMPTZ,
    latest_entry TIMESTAMPTZ
) AS $$
BEGIN
    RETURN QUERY
    SELECT
        COUNT(*)::BIGINT AS total_entries,
        COUNT(*) FILTER (WHERE p_since IS NULL OR occurred_at >= p_since)::BIGINT AS entries_since,
        COUNT(DISTINCT event_type)::BIGINT AS event_types,
        COUNT(DISTINCT actor_id)::BIGINT AS unique_actors,
        COUNT(DISTINCT (resource_type, resource_id))::BIGINT AS unique_resources,
        MIN(occurred_at) AS earliest_entry,
        MAX(occurred_at) AS latest_entry
    FROM audit_entries
    WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;
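
-- Illustrative usage: all-time totals plus counts restricted to the last 7 days.
-- SELECT * FROM get_audit_summary('tenant-a', NOW() - INTERVAL '7 days');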

-- Function to get ledger summary statistics
CREATE OR REPLACE FUNCTION get_ledger_summary(
    p_tenant_id TEXT,
    p_since TIMESTAMPTZ DEFAULT NULL
) RETURNS TABLE (
    total_entries BIGINT,
    entries_since BIGINT,
    total_runs BIGINT,
    successful_runs BIGINT,
    failed_runs BIGINT,
    total_jobs BIGINT,
    unique_sources BIGINT,
    unique_run_types BIGINT,
    earliest_entry TIMESTAMPTZ,
    latest_entry TIMESTAMPTZ
) AS $$
BEGIN
    -- Column references are table-qualified because the OUT parameter total_jobs
    -- would otherwise be ambiguous with the run_ledger_entries.total_jobs column.
    RETURN QUERY
    SELECT
        COUNT(*)::BIGINT AS total_entries,
        COUNT(*) FILTER (WHERE p_since IS NULL OR r.ledger_created_at >= p_since)::BIGINT AS entries_since,
        COUNT(*)::BIGINT AS total_runs, -- equals total_entries: one ledger entry per run (idx_ledger_tenant_run)
        COUNT(*) FILTER (WHERE r.final_status = 2)::BIGINT AS successful_runs, -- RunStatus.Succeeded = 2
        COUNT(*) FILTER (WHERE r.final_status IN (3, 4))::BIGINT AS failed_runs, -- PartiallySucceeded = 3, Failed = 4
        COALESCE(SUM(r.total_jobs), 0)::BIGINT AS total_jobs,
        COUNT(DISTINCT r.source_id)::BIGINT AS unique_sources,
        COUNT(DISTINCT r.run_type)::BIGINT AS unique_run_types,
        MIN(r.ledger_created_at) AS earliest_entry,
        MAX(r.ledger_created_at) AS latest_entry
    FROM run_ledger_entries r
    WHERE r.tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;

-- Function to cleanup old audit entries (respecting retention)
CREATE OR REPLACE FUNCTION cleanup_audit_entries(
    p_retention_days INTEGER DEFAULT 365,
    p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
BEGIN
    WITH deleted AS (
        DELETE FROM audit_entries
        WHERE ctid IN (
            SELECT ctid FROM audit_entries
            WHERE occurred_at < NOW() - (p_retention_days || ' days')::INTERVAL
            LIMIT p_batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO deleted_count FROM deleted;

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;

-- Function to cleanup old ledger entries (respecting retention)
CREATE OR REPLACE FUNCTION cleanup_ledger_entries(
    p_retention_days INTEGER DEFAULT 2555, -- ~7 years for compliance
    p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
    deleted_count INTEGER;
BEGIN
    WITH deleted AS (
        DELETE FROM run_ledger_entries
        WHERE ctid IN (
            SELECT ctid FROM run_ledger_entries
            WHERE ledger_created_at < NOW() - (p_retention_days || ' days')::INTERVAL
            LIMIT p_batch_limit
        )
        RETURNING 1
    )
    SELECT COUNT(*) INTO deleted_count FROM deleted;

    RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
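
-- Illustrative scheduling sketch, assuming the pg_cron extension is available;
-- job names and schedules here are examples, not part of this migration.
-- SELECT cron.schedule('audit-cleanup',  '0 3 * * *', 'SELECT cleanup_audit_entries(365, 10000)');
-- SELECT cron.schedule('ledger-cleanup', '0 4 * * *', 'SELECT cleanup_ledger_entries(2555, 10000)');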

-- Comments
COMMENT ON TABLE audit_entries IS 'Immutable audit log with hash chain for tamper evidence';
COMMENT ON TABLE run_ledger_entries IS 'Immutable run execution ledger with provenance tracking';
COMMENT ON TABLE ledger_exports IS 'Ledger export operations tracking';
COMMENT ON TABLE signed_manifests IS 'Signed provenance manifests for artifacts and exports';
COMMENT ON TABLE audit_sequences IS 'Sequence tracking for audit entry chain integrity';
COMMENT ON TABLE ledger_sequences IS 'Sequence tracking for ledger entry chain integrity';
COMMENT ON FUNCTION verify_audit_chain IS 'Verifies the hash chain integrity of audit entries';
COMMENT ON FUNCTION verify_ledger_chain IS 'Verifies the hash chain integrity of ledger entries';
@@ -0,0 +1,321 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;

namespace StellaOps.Orchestrator.Tests.AuditLedger;

/// <summary>
/// Tests for AuditEntry domain model.
/// </summary>
public sealed class AuditEntryTests
{
    [Fact]
    public void Create_WithValidParameters_SetsAllProperties()
    {
        // Arrange
        var tenantId = "test-tenant";
        var resourceId = Guid.NewGuid();

        // Act
        var entry = AuditEntry.Create(
            tenantId: tenantId,
            eventType: AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: resourceId,
            actorId: "user@example.com",
            actorType: ActorType.User,
            description: "Job created",
            oldState: null,
            newState: """{"status":"pending"}""",
            actorIp: "192.168.1.1",
            userAgent: "TestClient/1.0",
            httpMethod: "POST",
            requestPath: "/api/v1/jobs",
            correlationId: "corr-123",
            previousEntryHash: null,
            sequenceNumber: 1,
            metadata: """{"extra":"data"}""");

        // Assert
        Assert.NotEqual(Guid.Empty, entry.EntryId);
        Assert.Equal(tenantId, entry.TenantId);
        Assert.Equal(AuditEventType.JobCreated, entry.EventType);
        Assert.Equal("job", entry.ResourceType);
        Assert.Equal(resourceId, entry.ResourceId);
        Assert.Equal("user@example.com", entry.ActorId);
        Assert.Equal(ActorType.User, entry.ActorType);
        Assert.Equal("192.168.1.1", entry.ActorIp);
        Assert.Equal("TestClient/1.0", entry.UserAgent);
        Assert.Equal("POST", entry.HttpMethod);
        Assert.Equal("/api/v1/jobs", entry.RequestPath);
        Assert.Null(entry.OldState);
        Assert.Equal("""{"status":"pending"}""", entry.NewState);
        Assert.Equal("Job created", entry.Description);
        Assert.Equal("corr-123", entry.CorrelationId);
        Assert.Null(entry.PreviousEntryHash);
        Assert.NotEmpty(entry.ContentHash);
        Assert.Equal(1, entry.SequenceNumber);
        Assert.Equal("""{"extra":"data"}""", entry.Metadata);
        Assert.True(entry.OccurredAt > DateTimeOffset.MinValue);
    }

    [Fact]
    public void Create_GeneratesValidContentHash()
    {
        // Arrange & Act
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.RunCreated,
            resourceType: "run",
            resourceId: Guid.NewGuid(),
            actorId: "system",
            actorType: ActorType.System,
            description: "Run created",
            sequenceNumber: 1);

        // Assert
        Assert.NotEmpty(entry.ContentHash);
        Assert.Equal(64, entry.ContentHash.Length); // SHA-256 produces 64 hex chars
        Assert.True(entry.ContentHash.All(char.IsAsciiHexDigit));
    }

    [Fact]
    public void VerifyIntegrity_WithValidEntry_ReturnsTrue()
    {
        // Arrange
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.SourceCreated,
            resourceType: "source",
            resourceId: Guid.NewGuid(),
            actorId: "admin",
            actorType: ActorType.User,
            description: "Source created",
            sequenceNumber: 5);

        // Act
        var isValid = entry.VerifyIntegrity();

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyIntegrity_WithTamperedEntry_ReturnsFalse()
    {
        // Arrange
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.QuotaCreated,
            resourceType: "quota",
            resourceId: Guid.NewGuid(),
            actorId: "admin",
            actorType: ActorType.User,
            description: "Original description",
            sequenceNumber: 1);

        // Tamper with the entry by changing description but keeping original hash
        var tamperedEntry = entry with { Description = "Tampered description" };

        // Act
        var isValid = tamperedEntry.VerifyIntegrity();

        // Assert
        Assert.False(isValid);
    }

    [Fact]
    public void VerifyChainLink_WithNullPrevious_AndFirstEntry_ReturnsTrue()
    {
        // Arrange
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobScheduled,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "scheduler",
            actorType: ActorType.System,
            description: "Job scheduled",
            previousEntryHash: null,
            sequenceNumber: 1);

        // Act
        var isValid = entry.VerifyChainLink(null);

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyChainLink_WithValidPreviousEntry_ReturnsTrue()
    {
        // Arrange
        var first = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "user",
            actorType: ActorType.User,
            description: "First entry",
            previousEntryHash: null,
            sequenceNumber: 1);

        var second = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobLeased,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "worker",
            actorType: ActorType.Worker,
            description: "Second entry",
            previousEntryHash: first.ContentHash,
            sequenceNumber: 2);

        // Act
        var isValid = second.VerifyChainLink(first);

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyChainLink_WithInvalidPreviousHash_ReturnsFalse()
    {
        // Arrange
        var first = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "user",
            actorType: ActorType.User,
            description: "First entry",
            previousEntryHash: null,
            sequenceNumber: 1);

        var second = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobCompleted,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "worker",
            actorType: ActorType.Worker,
            description: "Second entry with wrong hash",
            previousEntryHash: "wrong_hash_value",
            sequenceNumber: 2);

        // Act
        var isValid = second.VerifyChainLink(first);

        // Assert
        Assert.False(isValid);
    }

    [Theory]
    [InlineData(AuditEventType.JobCreated, "job")]
    [InlineData(AuditEventType.RunStarted, "run")]
    [InlineData(AuditEventType.SourcePaused, "source")]
    [InlineData(AuditEventType.QuotaUpdated, "quota")]
    [InlineData(AuditEventType.SloAlertTriggered, "slo")]
    [InlineData(AuditEventType.DeadLetterReplayed, "deadletter")]
    [InlineData(AuditEventType.BackfillStarted, "backfill")]
    [InlineData(AuditEventType.LedgerExportRequested, "export")]
    [InlineData(AuditEventType.WorkerHeartbeat, "worker")]
    [InlineData(AuditEventType.AuthorizationDenied, "security")]
    public void Create_WithDifferentEventTypes_CreatesValidEntries(AuditEventType eventType, string resourceType)
    {
        // Act
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: eventType,
            resourceType: resourceType,
            resourceId: Guid.NewGuid(),
            actorId: "test-actor",
            actorType: ActorType.System,
            description: $"Testing {eventType}",
            sequenceNumber: 1);

        // Assert
        Assert.Equal(eventType, entry.EventType);
        Assert.Equal(resourceType, entry.ResourceType);
        Assert.True(entry.VerifyIntegrity());
    }

    [Theory]
    [InlineData(ActorType.User)]
    [InlineData(ActorType.System)]
    [InlineData(ActorType.Worker)]
    [InlineData(ActorType.ApiKey)]
    [InlineData(ActorType.Service)]
    [InlineData(ActorType.Unknown)]
    public void Create_WithDifferentActorTypes_CreatesValidEntries(ActorType actorType)
    {
        // Act
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "test-actor",
            actorType: actorType,
            description: $"Testing actor type {actorType}",
            sequenceNumber: 1);

        // Assert
        Assert.Equal(actorType, entry.ActorType);
        Assert.True(entry.VerifyIntegrity());
    }

    [Fact]
    public void Create_WithOldAndNewState_TracksChanges()
    {
        // Arrange
        var oldState = """{"status":"pending","priority":0}""";
        var newState = """{"status":"running","priority":1}""";

        // Act
        var entry = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobLeased,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "worker-1",
            actorType: ActorType.Worker,
            description: "Job leased",
            oldState: oldState,
            newState: newState,
            sequenceNumber: 1);

        // Assert
        Assert.Equal(oldState, entry.OldState);
        Assert.Equal(newState, entry.NewState);
    }

    [Fact]
    public void Create_MultipleEntries_GeneratesDifferentHashes()
    {
        // Act
        var entry1 = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "user1",
            actorType: ActorType.User,
            description: "First job",
            sequenceNumber: 1);

        var entry2 = AuditEntry.Create(
            tenantId: "test-tenant",
            eventType: AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: Guid.NewGuid(),
            actorId: "user2",
            actorType: ActorType.User,
            description: "Second job",
            sequenceNumber: 2);

        // Assert
        Assert.NotEqual(entry1.ContentHash, entry2.ContentHash);
        Assert.NotEqual(entry1.EntryId, entry2.EntryId);
    }
}
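
// Illustrative sketch (not part of the original suite): chaining entries in order
// using only the AuditEntry APIs exercised above; "tenant-a" and the loop are
// assumptions for illustration.
//
//   AuditEntry? previous = null;
//   for (var seq = 1; seq <= 3; seq++)
//   {
//       var entry = AuditEntry.Create(
//           tenantId: "tenant-a",
//           eventType: AuditEventType.JobCreated,
//           resourceType: "job",
//           resourceId: Guid.NewGuid(),
//           actorId: "system",
//           actorType: ActorType.System,
//           description: $"Entry {seq}",
//           previousEntryHash: previous?.ContentHash,
//           sequenceNumber: seq);
//       entry.VerifyChainLink(previous); // true for a well-formed chain
//       previous = entry;
//   }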
@@ -0,0 +1,238 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;

namespace StellaOps.Orchestrator.Tests.AuditLedger;

/// <summary>
/// Tests for LedgerExport domain model.
/// </summary>
public sealed class LedgerExportTests
{
    [Fact]
    public void CreateRequest_WithValidParameters_CreatesExport()
    {
        // Act
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user@example.com",
            startTime: DateTimeOffset.UtcNow.AddDays(-7),
            endTime: DateTimeOffset.UtcNow,
            runTypeFilter: "scan",
            sourceIdFilter: Guid.NewGuid());

        // Assert
        Assert.NotEqual(Guid.Empty, export.ExportId);
        Assert.Equal("test-tenant", export.TenantId);
        Assert.Equal(LedgerExportStatus.Pending, export.Status);
        Assert.Equal("json", export.Format);
        Assert.NotNull(export.StartTime);
        Assert.NotNull(export.EndTime);
        Assert.Equal("scan", export.RunTypeFilter);
        Assert.NotNull(export.SourceIdFilter);
        Assert.Equal("user@example.com", export.RequestedBy);
        Assert.True(export.RequestedAt > DateTimeOffset.MinValue);
        Assert.Null(export.StartedAt);
        Assert.Null(export.CompletedAt);
        Assert.Equal(0, export.EntryCount);
    }

    [Theory]
    [InlineData("json")]
    [InlineData("ndjson")]
    [InlineData("csv")]
    [InlineData("JSON")]
    [InlineData("NDJSON")]
    [InlineData("CSV")]
    public void CreateRequest_WithValidFormats_NormalizesToLowerCase(string format)
    {
        // Act
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: format,
            requestedBy: "user");

        // Assert
        Assert.Equal(format.ToLowerInvariant(), export.Format);
    }

    [Theory]
    [InlineData("xml")]
    [InlineData("yaml")]
    [InlineData("parquet")]
    [InlineData("invalid")]
    public void CreateRequest_WithInvalidFormat_ThrowsException(string format)
    {
        // Act & Assert
        Assert.Throws<ArgumentException>(() =>
            LedgerExport.CreateRequest(
                tenantId: "test-tenant",
                format: format,
                requestedBy: "user"));
    }

    [Fact]
    public void CreateRequest_WithNullFormat_ThrowsException()
    {
        // Act & Assert
        Assert.Throws<ArgumentException>(() =>
            LedgerExport.CreateRequest(
                tenantId: "test-tenant",
                format: null!,
                requestedBy: "user"));
    }

    [Fact]
    public void CreateRequest_WithEmptyFormat_ThrowsException()
    {
        // Act & Assert
        Assert.Throws<ArgumentException>(() =>
            LedgerExport.CreateRequest(
                tenantId: "test-tenant",
                format: "",
                requestedBy: "user"));
    }

    [Fact]
    public void Start_SetsStatusAndStartedAt()
    {
        // Arrange
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user");

        // Act
        var started = export.Start();

        // Assert
        Assert.Equal(LedgerExportStatus.Processing, started.Status);
        Assert.NotNull(started.StartedAt);
        Assert.True(started.StartedAt >= export.RequestedAt);
    }

    [Fact]
    public void Complete_SetsAllProperties()
    {
        // Arrange
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user").Start();

        // Act
        var completed = export.Complete(
            outputUri: "file:///exports/test.json",
            outputDigest: "sha256:abc123",
            outputSizeBytes: 1024,
            entryCount: 100);

        // Assert
        Assert.Equal(LedgerExportStatus.Completed, completed.Status);
        Assert.Equal("file:///exports/test.json", completed.OutputUri);
        Assert.Equal("sha256:abc123", completed.OutputDigest);
        Assert.Equal(1024, completed.OutputSizeBytes);
        Assert.Equal(100, completed.EntryCount);
        Assert.NotNull(completed.CompletedAt);
        Assert.Null(completed.ErrorMessage);
    }

    [Fact]
    public void Fail_SetsStatusAndErrorMessage()
    {
        // Arrange
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user").Start();

        // Act
        var failed = export.Fail("Database connection failed");

        // Assert
        Assert.Equal(LedgerExportStatus.Failed, failed.Status);
        Assert.Equal("Database connection failed", failed.ErrorMessage);
        Assert.NotNull(failed.CompletedAt);
        Assert.Null(failed.OutputUri);
    }

    [Fact]
    public void CreateRequest_WithMinimalParameters_CreatesExport()
    {
        // Act
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "ndjson",
            requestedBy: "system");

        // Assert
        Assert.NotEqual(Guid.Empty, export.ExportId);
        Assert.Null(export.StartTime);
        Assert.Null(export.EndTime);
        Assert.Null(export.RunTypeFilter);
        Assert.Null(export.SourceIdFilter);
    }

    [Fact]
    public void ExportLifecycle_FullFlow_TracksAllStates()
    {
        // Create
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "csv",
            requestedBy: "user");
        Assert.Equal(LedgerExportStatus.Pending, export.Status);

        // Start
        export = export.Start();
        Assert.Equal(LedgerExportStatus.Processing, export.Status);
        Assert.NotNull(export.StartedAt);

        // Complete
        export = export.Complete("file:///out.csv", "sha256:xyz", 2048, 50);
        Assert.Equal(LedgerExportStatus.Completed, export.Status);
        Assert.NotNull(export.CompletedAt);
    }

    [Fact]
    public void ExportLifecycle_FailedFlow_TracksStates()
    {
        // Create
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user");

        // Start
        export = export.Start();

        // Fail
        export = export.Fail("Out of disk space");
        Assert.Equal(LedgerExportStatus.Failed, export.Status);
        Assert.Equal("Out of disk space", export.ErrorMessage);
    }

    [Fact]
    public void Complete_PreservesOriginalProperties()
    {
        // Arrange
        var sourceId = Guid.NewGuid();
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user",
            startTime: DateTimeOffset.UtcNow.AddDays(-1),
            endTime: DateTimeOffset.UtcNow,
            runTypeFilter: "scan",
            sourceIdFilter: sourceId).Start();

        // Act
        var completed = export.Complete("uri", "digest", 100, 10);

        // Assert
        Assert.Equal("test-tenant", completed.TenantId);
        Assert.Equal("json", completed.Format);
        Assert.Equal("scan", completed.RunTypeFilter);
        Assert.Equal(sourceId, completed.SourceIdFilter);
        Assert.Equal("user", completed.RequestedBy);
    }
}
@@ -0,0 +1,318 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;

namespace StellaOps.Orchestrator.Tests.AuditLedger;

/// <summary>
/// Tests for RunLedgerEntry domain model.
/// </summary>
public sealed class RunLedgerTests
{
    [Fact]
    public void FromCompletedRun_WithValidRun_CreatesLedgerEntry()
    {
        // Arrange
        var run = CreateCompletedRun();
        var artifacts = CreateArtifacts(run.RunId, 2);

        // Act
        var entry = RunLedgerEntry.FromCompletedRun(
            run: run,
            artifacts: artifacts,
            inputDigest: "abc123",
            sequenceNumber: 1,
            previousEntryHash: null);

        // Assert
        Assert.NotEqual(Guid.Empty, entry.LedgerId);
        Assert.Equal(run.TenantId, entry.TenantId);
        Assert.Equal(run.RunId, entry.RunId);
        Assert.Equal(run.SourceId, entry.SourceId);
        Assert.Equal(run.RunType, entry.RunType);
        Assert.Equal(run.Status, entry.FinalStatus);
        Assert.Equal(run.TotalJobs, entry.TotalJobs);
        Assert.Equal(run.SucceededJobs, entry.SucceededJobs);
        Assert.Equal(run.FailedJobs, entry.FailedJobs);
        Assert.Equal(run.CreatedAt, entry.RunCreatedAt);
        Assert.Equal(run.CompletedAt, entry.RunCompletedAt);
        Assert.Equal("abc123", entry.InputDigest);
        Assert.NotEmpty(entry.OutputDigest);
        Assert.NotEmpty(entry.ArtifactManifest);
        Assert.Equal(1, entry.SequenceNumber);
        Assert.Null(entry.PreviousEntryHash);
        Assert.NotEmpty(entry.ContentHash);
    }

    [Fact]
    public void FromCompletedRun_WithIncompleteRun_ThrowsException()
    {
        // Arrange
        var run = new Run(
            RunId: Guid.NewGuid(),
            TenantId: "test-tenant",
            ProjectId: null,
            SourceId: Guid.NewGuid(),
            RunType: "scan",
            Status: RunStatus.Running,
            CorrelationId: null,
            TotalJobs: 5,
            CompletedJobs: 2,
            SucceededJobs: 2,
            FailedJobs: 0,
            CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
            StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
            CompletedAt: null, // Not completed
            CreatedBy: "user",
            Metadata: null);

        // Act & Assert
        Assert.Throws<InvalidOperationException>(() =>
            RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null));
    }

    [Fact]
    public void VerifyIntegrity_WithValidEntry_ReturnsTrue()
    {
        // Arrange
        var run = CreateCompletedRun();
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Act
        var isValid = entry.VerifyIntegrity();

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyIntegrity_WithTamperedEntry_ReturnsFalse()
    {
        // Arrange
        var run = CreateCompletedRun();
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Tamper with the entry
        var tamperedEntry = entry with { TotalJobs = 999 };

        // Act
        var isValid = tamperedEntry.VerifyIntegrity();

        // Assert
        Assert.False(isValid);
    }

    [Fact]
    public void VerifyChainLink_WithNullPrevious_AndFirstEntry_ReturnsTrue()
    {
        // Arrange
        var run = CreateCompletedRun();
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Act
        var isValid = entry.VerifyChainLink(null);

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyChainLink_WithValidPreviousEntry_ReturnsTrue()
    {
        // Arrange
        var run1 = CreateCompletedRun();
        var first = RunLedgerEntry.FromCompletedRun(run1, [], "input1", 1, null);

        var run2 = CreateCompletedRun();
        var second = RunLedgerEntry.FromCompletedRun(run2, [], "input2", 2, first.ContentHash);

        // Act
        var isValid = second.VerifyChainLink(first);

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyChainLink_WithInvalidPreviousHash_ReturnsFalse()
    {
        // Arrange
        var run1 = CreateCompletedRun();
        var first = RunLedgerEntry.FromCompletedRun(run1, [], "input1", 1, null);

        var run2 = CreateCompletedRun();
        var second = RunLedgerEntry.FromCompletedRun(run2, [], "input2", 2, "invalid_hash");

        // Act
        var isValid = second.VerifyChainLink(first);

        // Assert
        Assert.False(isValid);
    }

    [Fact]
    public void FromCompletedRun_CalculatesExecutionDuration()
    {
        // Arrange
        var startedAt = DateTimeOffset.UtcNow.AddMinutes(-5);
        var completedAt = DateTimeOffset.UtcNow;
        var run = new Run(
            RunId: Guid.NewGuid(),
            TenantId: "test-tenant",
            ProjectId: null,
            SourceId: Guid.NewGuid(),
            RunType: "scan",
            Status: RunStatus.Succeeded,
            CorrelationId: null,
            TotalJobs: 10,
            CompletedJobs: 10,
            SucceededJobs: 10,
            FailedJobs: 0,
            CreatedAt: startedAt.AddMinutes(-1),
            StartedAt: startedAt,
            CompletedAt: completedAt,
            CreatedBy: "user",
            Metadata: null);

        // Act
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Assert
        Assert.Equal(completedAt - startedAt, entry.ExecutionDuration);
        Assert.True(entry.ExecutionDuration.TotalMinutes >= 4.9);
        Assert.True(entry.ExecutionDuration.TotalMinutes <= 5.1);
    }

    [Fact]
    public void FromCompletedRun_WithArtifacts_GeneratesManifestAndDigest()
    {
        // Arrange
        var run = CreateCompletedRun();
        var artifacts = CreateArtifacts(run.RunId, 3);

        // Act
        var entry = RunLedgerEntry.FromCompletedRun(run, artifacts, "input", 1, null);

        // Assert
        Assert.NotEmpty(entry.ArtifactManifest);
        Assert.Contains("ArtifactId", entry.ArtifactManifest);
        Assert.NotEmpty(entry.OutputDigest);
    }

    [Fact]
    public void FromCompletedRun_WithNoArtifacts_GeneratesEmptyManifest()
    {
        // Arrange
        var run = CreateCompletedRun();

        // Act
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Assert
        Assert.Equal("[]", entry.ArtifactManifest);
        Assert.NotEmpty(entry.OutputDigest);
    }

    [Theory]
    [InlineData(RunStatus.Succeeded)]
    [InlineData(RunStatus.PartiallySucceeded)]
    [InlineData(RunStatus.Failed)]
    [InlineData(RunStatus.Canceled)]
    public void FromCompletedRun_WithDifferentStatuses_CreatesValidEntries(RunStatus status)
    {
        // Arrange
        var run = new Run(
            RunId: Guid.NewGuid(),
            TenantId: "test-tenant",
            ProjectId: null,
            SourceId: Guid.NewGuid(),
            RunType: "scan",
            Status: status,
            CorrelationId: null,
            TotalJobs: 10,
            CompletedJobs: 10,
            SucceededJobs: status == RunStatus.Succeeded ? 10 : 5,
            FailedJobs: status == RunStatus.Failed ? 10 : (status == RunStatus.PartiallySucceeded ? 5 : 0),
            CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
            StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
            CompletedAt: DateTimeOffset.UtcNow,
            CreatedBy: "user",
            Metadata: null);

        // Act
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Assert
        Assert.Equal(status, entry.FinalStatus);
        Assert.True(entry.VerifyIntegrity());
    }

    [Fact]
    public void FromCompletedRun_WithMetadata_IncludesMetadata()
    {
        // Arrange
        var run = CreateCompletedRun();
        var metadata = """{"custom":"metadata","count":42}""";

        // Act
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null, metadata);

        // Assert
        Assert.Equal(metadata, entry.Metadata);
    }

    [Fact]
    public void ContentHash_HasExpectedFormat()
    {
        // Arrange - exact determinism cannot be asserted here because LedgerId and
        // LedgerCreatedAt are generated per entry, so only the hash format is verified.
        var run = CreateCompletedRun();
        var entry = RunLedgerEntry.FromCompletedRun(run, [], "same-input", 1, null);

        // Assert
        Assert.Equal(64, entry.ContentHash.Length); // SHA-256 produces 64 hex chars
        Assert.True(entry.ContentHash.All(char.IsAsciiHexDigit));
    }

    private static Run CreateCompletedRun(string runType = "scan") => new(
        RunId: Guid.NewGuid(),
        TenantId: "test-tenant",
        ProjectId: null,
        SourceId: Guid.NewGuid(),
        RunType: runType,
        Status: RunStatus.Succeeded,
        CorrelationId: "corr-123",
        TotalJobs: 10,
        CompletedJobs: 10,
        SucceededJobs: 8,
        FailedJobs: 2,
        CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
        StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
        CompletedAt: DateTimeOffset.UtcNow,
        CreatedBy: "test-user",
        Metadata: null);

    private static List<Artifact> CreateArtifacts(Guid runId, int count)
    {
        var artifacts = new List<Artifact>();
        for (var i = 0; i < count; i++)
        {
            artifacts.Add(new Artifact(
                ArtifactId: Guid.NewGuid(),
                TenantId: "test-tenant",
                JobId: Guid.NewGuid(),
                RunId: runId,
                ArtifactType: "sbom",
                Uri: $"file:///artifacts/{Guid.NewGuid()}.json",
                Digest: $"sha256:{Guid.NewGuid():N}",
                MimeType: "application/json",
                SizeBytes: 1024 * (i + 1),
                CreatedAt: DateTimeOffset.UtcNow,
                Metadata: null));
        }
        return artifacts;
    }
}
@@ -0,0 +1,398 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;

namespace StellaOps.Orchestrator.Tests.AuditLedger;

/// <summary>
/// Tests for SignedManifest domain model.
/// </summary>
public sealed class SignedManifestTests
{
    [Fact]
    public void CreateFromLedgerEntry_WithValidEntry_CreatesManifest()
    {
        // Arrange
        var run = CreateCompletedRun();
        var artifacts = CreateArtifacts(run.RunId, 2);
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, artifacts, "input-digest", 1, null);

        // Act
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Assert
        Assert.NotEqual(Guid.Empty, manifest.ManifestId);
        Assert.Equal(SignedManifest.CurrentSchemaVersion, manifest.SchemaVersion);
        Assert.Equal(ledgerEntry.TenantId, manifest.TenantId);
        Assert.Equal(ProvenanceType.Run, manifest.ProvenanceType);
        Assert.Equal(ledgerEntry.RunId, manifest.SubjectId);
        Assert.NotEmpty(manifest.Statements);
        Assert.NotEmpty(manifest.Artifacts);
        Assert.NotEmpty(manifest.Materials);
        Assert.NotEmpty(manifest.PayloadDigest);
        Assert.Equal("none", manifest.SignatureAlgorithm);
        Assert.Empty(manifest.Signature);
        Assert.Empty(manifest.KeyId);
        Assert.False(manifest.IsSigned);
        Assert.False(manifest.IsExpired);
    }

    [Fact]
    public void CreateFromExport_WithValidExport_CreatesManifest()
    {
        // Arrange
        var export = CreateCompletedExport();
        var entries = CreateLedgerEntries(3);

        // Act
        var manifest = SignedManifest.CreateFromExport(export, entries);

        // Assert
        Assert.NotEqual(Guid.Empty, manifest.ManifestId);
        Assert.Equal(ProvenanceType.Export, manifest.ProvenanceType);
        Assert.Equal(export.ExportId, manifest.SubjectId);
        Assert.NotEmpty(manifest.Statements);
        Assert.NotEmpty(manifest.Materials);
    }

    [Fact]
    public void CreateFromExport_WithIncompleteExport_ThrowsException()
    {
        // Arrange
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user");

        // Act & Assert
        Assert.Throws<InvalidOperationException>(() =>
            SignedManifest.CreateFromExport(export, []));
    }

    [Fact]
    public void Sign_WithValidSignature_SetsSignatureProperties()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act
        var signed = manifest.Sign(
            signatureAlgorithm: "ES256",
            signature: "base64-encoded-signature",
            keyId: "key-001",
            expiresAt: DateTimeOffset.UtcNow.AddDays(30));

        // Assert
        Assert.Equal("ES256", signed.SignatureAlgorithm);
        Assert.Equal("base64-encoded-signature", signed.Signature);
        Assert.Equal("key-001", signed.KeyId);
        Assert.True(signed.IsSigned);
        Assert.False(signed.IsExpired);
        Assert.NotNull(signed.ExpiresAt);
    }

    [Fact]
    public void Sign_WithEmptyAlgorithm_ThrowsException()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act & Assert
        Assert.Throws<ArgumentException>(() =>
            manifest.Sign("", "signature", "key-001"));
    }

    [Fact]
    public void Sign_WithEmptySignature_ThrowsException()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act & Assert
        Assert.Throws<ArgumentException>(() =>
            manifest.Sign("ES256", "", "key-001"));
    }

    [Fact]
    public void Sign_WithEmptyKeyId_ThrowsException()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act & Assert
        Assert.Throws<ArgumentException>(() =>
            manifest.Sign("ES256", "signature", ""));
    }

    [Fact]
    public void IsSigned_WithUnsignedManifest_ReturnsFalse()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Assert
        Assert.False(manifest.IsSigned);
    }

    [Fact]
    public void IsExpired_WithNoExpiration_ReturnsFalse()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Assert
        Assert.False(manifest.IsExpired);
    }

    [Fact]
    public void IsExpired_WithFutureExpiration_ReturnsFalse()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry)
            .Sign("ES256", "sig", "key", DateTimeOffset.UtcNow.AddDays(30));

        // Assert
        Assert.False(manifest.IsExpired);
    }

    [Fact]
    public void IsExpired_WithPastExpiration_ReturnsTrue()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry)
            .Sign("ES256", "sig", "key", DateTimeOffset.UtcNow.AddDays(-1));

        // Assert
        Assert.True(manifest.IsExpired);
    }

    [Fact]
    public void VerifyPayloadIntegrity_WithValidManifest_ReturnsTrue()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act
        var isValid = manifest.VerifyPayloadIntegrity();

        // Assert
        Assert.True(isValid);
    }

    [Fact]
    public void VerifyPayloadIntegrity_WithTamperedManifest_ReturnsFalse()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Tamper with the manifest
        var tampered = manifest with { Statements = "[]" };

        // Act
        var isValid = tampered.VerifyPayloadIntegrity();

        // Assert
        Assert.False(isValid);
    }

    [Fact]
    public void GetArtifactReferences_ReturnsTypedObjects()
    {
        // Arrange
        var run = CreateCompletedRun();
        var artifacts = CreateArtifacts(run.RunId, 2);
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, artifacts, "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act
        var references = manifest.GetArtifactReferences();

        // Assert
        Assert.Equal(2, references.Count);
        Assert.All(references, r =>
        {
            Assert.NotEqual(Guid.Empty, r.ArtifactId);
            Assert.NotEmpty(r.ArtifactType);
            Assert.NotEmpty(r.Uri);
            Assert.NotEmpty(r.Digest);
        });
    }

    [Fact]
    public void GetMaterialReferences_ReturnsTypedObjects()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input-digest", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act
        var materials = manifest.GetMaterialReferences();

        // Assert
        Assert.Single(materials);
        Assert.Contains("input:", materials[0].Uri);
        Assert.Equal("input-digest", materials[0].Digest);
    }

    [Fact]
    public void GetStatements_ReturnsTypedObjects()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // Act
        var statements = manifest.GetStatements();

        // Assert
        Assert.Equal(2, statements.Count);
        Assert.Contains(statements, s => s.StatementType == "run_completed");
        Assert.Contains(statements, s => s.StatementType == "chain_link");
    }

    [Theory]
    [InlineData(ProvenanceType.Run)]
    [InlineData(ProvenanceType.Export)]
    public void CreateManifest_WithDifferentProvenanceTypes_CreatesValidManifests(ProvenanceType expectedType)
    {
        // Arrange & Act
        SignedManifest manifest;

        if (expectedType == ProvenanceType.Run)
        {
            var run = CreateCompletedRun();
            var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
            manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
        }
        else
        {
            var export = CreateCompletedExport();
            manifest = SignedManifest.CreateFromExport(export, []);
        }

        // Assert
        Assert.Equal(expectedType, manifest.ProvenanceType);
        Assert.True(manifest.VerifyPayloadIntegrity());
    }

    [Fact]
    public void CreateFromLedgerEntry_WithBuildInfo_IncludesBuildInfo()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
        var buildInfo = """{"version":"1.0.0","builder":"test"}""";

        // Act
        var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry, buildInfo);

        // Assert
        Assert.Equal(buildInfo, manifest.BuildInfo);
    }

    [Fact]
    public void PayloadDigest_HasExpectedFormat()
    {
        // Arrange
        var run = CreateCompletedRun();
        var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);

        // Act
        var manifest1 = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
        var manifest2 = SignedManifest.CreateFromLedgerEntry(ledgerEntry);

        // ManifestId and the timestamps embedded in the generated statements differ
        // between the two manifests, so digest equality is not asserted here;
        // only the SHA-256 hex format is verified.
        Assert.NotEmpty(manifest1.PayloadDigest);
        Assert.NotEmpty(manifest2.PayloadDigest);
        Assert.Equal(64, manifest1.PayloadDigest.Length);
    }

    private static Run CreateCompletedRun(string runType = "scan") => new(
        RunId: Guid.NewGuid(),
        TenantId: "test-tenant",
        ProjectId: null,
        SourceId: Guid.NewGuid(),
        RunType: runType,
        Status: RunStatus.Succeeded,
        CorrelationId: "corr-123",
        TotalJobs: 10,
        CompletedJobs: 10,
        SucceededJobs: 8,
        FailedJobs: 2,
        CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
        StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
        CompletedAt: DateTimeOffset.UtcNow,
        CreatedBy: "test-user",
        Metadata: null);

    private static LedgerExport CreateCompletedExport()
    {
        var export = LedgerExport.CreateRequest(
            tenantId: "test-tenant",
            format: "json",
            requestedBy: "user");

        return export
            .Start()
            .Complete("file:///exports/test.json", "sha256:abc123", 1024, 10);
    }

    private static List<Artifact> CreateArtifacts(Guid runId, int count)
    {
        var artifacts = new List<Artifact>();
        for (var i = 0; i < count; i++)
        {
            artifacts.Add(new Artifact(
                ArtifactId: Guid.NewGuid(),
                TenantId: "test-tenant",
                JobId: Guid.NewGuid(),
                RunId: runId,
                ArtifactType: "sbom",
                Uri: $"file:///artifacts/{Guid.NewGuid()}.json",
                Digest: $"sha256:{Guid.NewGuid():N}",
                MimeType: "application/json",
                SizeBytes: 1024 * (i + 1),
                CreatedAt: DateTimeOffset.UtcNow,
                Metadata: null));
        }
        return artifacts;
    }

    private static List<RunLedgerEntry> CreateLedgerEntries(int count)
    {
        var entries = new List<RunLedgerEntry>();
        string? previousHash = null;

        for (var i = 0; i < count; i++)
        {
            var run = CreateCompletedRun();
            var entry = RunLedgerEntry.FromCompletedRun(run, [], $"input-{i}", i + 1, previousHash);
            entries.Add(entry);
            previousHash = entry.ContentHash;
        }

        return entries;
    }
}
|
||||
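// CreateLedgerEntries above chains each entry to its predecessor's ContentHash.
// A sketch of how such a chain could be verified, assuming each entry also exposes
// the previous hash it was created with (the PreviousHash property is hypothetical):
//
//     static bool VerifyChain(IReadOnlyList<RunLedgerEntry> entries)
//     {
//         string? previous = null;
//         foreach (var entry in entries)
//         {
//             if (entry.PreviousHash != previous) return false; // assumed property
//             previous = entry.ContentHash;
//         }
//         return true;
//     }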
@@ -0,0 +1,407 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.Backfill;

public class BackfillRequestTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string TenantId = "test-tenant";
    private static readonly Guid SourceId = Guid.NewGuid();
    private const string JobType = "scan";

    [Fact]
    public void Create_WithValidParameters_CreatesRequest()
    {
        var windowStart = BaseTime;
        var windowEnd = BaseTime.AddDays(7);

        var request = BackfillRequest.Create(
            tenantId: TenantId,
            sourceId: SourceId,
            jobType: null,
            windowStart: windowStart,
            windowEnd: windowEnd,
            reason: "Reprocess after bug fix",
            createdBy: "admin");

        Assert.NotEqual(Guid.Empty, request.BackfillId);
        Assert.Equal(TenantId, request.TenantId);
        Assert.Equal(SourceId, request.SourceId);
        Assert.Null(request.JobType);
        Assert.Equal(BackfillStatus.Pending, request.Status);
        Assert.Equal(windowStart, request.WindowStart);
        Assert.Equal(windowEnd, request.WindowEnd);
        Assert.Null(request.CurrentPosition);
        Assert.Null(request.TotalEvents);
        Assert.Equal(0, request.ProcessedEvents);
        Assert.Equal(0, request.SkippedEvents);
        Assert.Equal(0, request.FailedEvents);
        Assert.Equal(100, request.BatchSize);
        Assert.False(request.DryRun);
        Assert.False(request.ForceReprocess);
        Assert.Equal("admin", request.CreatedBy);
        Assert.Equal("admin", request.UpdatedBy);
    }

    [Fact]
    public void Create_WithDryRunAndForceReprocess_SetsFlags()
    {
        var request = BackfillRequest.Create(
            TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
            "Test", "admin", dryRun: true, forceReprocess: true);

        Assert.True(request.DryRun);
        Assert.True(request.ForceReprocess);
    }

    [Fact]
    public void Create_WithCustomBatchSize_SetsBatchSize()
    {
        var request = BackfillRequest.Create(
            TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
            "Test", "admin", batchSize: 500);

        Assert.Equal(500, request.BatchSize);
    }

    [Fact]
    public void Create_WithInvalidBatchSize_Throws()
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            BackfillRequest.Create(TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
                "Test", "admin", batchSize: 0));

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            BackfillRequest.Create(TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
                "Test", "admin", batchSize: 10001));
    }

    [Fact]
    public void Create_WithInvalidWindow_Throws()
    {
        Assert.Throws<ArgumentException>(() =>
            BackfillRequest.Create(TenantId, SourceId, null,
                windowStart: BaseTime.AddDays(1),
                windowEnd: BaseTime,
                reason: "Test",
                createdBy: "admin"));
    }

    [Fact]
    public void Create_WithoutSourceOrJobType_Throws()
    {
        Assert.Throws<ArgumentException>(() =>
            BackfillRequest.Create(TenantId, null, null, BaseTime, BaseTime.AddDays(1),
                "Test", "admin"));
    }

    [Fact]
    public void WindowDuration_ReturnsCorrectDuration()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(7), "Test", "admin");

        Assert.Equal(TimeSpan.FromDays(7), request.WindowDuration);
    }

    [Fact]
    public void StartValidation_TransitionsToValidating()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin");

        var validating = request.StartValidation("validator");

        Assert.Equal(BackfillStatus.Validating, validating.Status);
        Assert.Equal("validator", validating.UpdatedBy);
    }

    [Fact]
    public void StartValidation_FromNonPending_Throws()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin");
        var validating = request.StartValidation("v");

        Assert.Throws<InvalidOperationException>(() =>
            validating.StartValidation("v"));
    }

    [Fact]
    public void WithSafetyChecks_RecordsSafetyResults()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v");

        var checks = BackfillSafetyChecks.AllPassed();
        var result = request.WithSafetyChecks(checks, 1000, TimeSpan.FromMinutes(10), "v");

        Assert.Equal(checks, result.SafetyChecks);
        Assert.Equal(1000, result.TotalEvents);
        Assert.Equal(TimeSpan.FromMinutes(10), result.EstimatedDuration);
    }

    [Fact]
    public void Start_TransitionsToRunning()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v");

        var running = request.Start("worker");

        Assert.Equal(BackfillStatus.Running, running.Status);
        Assert.NotNull(running.StartedAt);
        Assert.Equal(request.WindowStart, running.CurrentPosition);
        Assert.Equal("worker", running.UpdatedBy);
    }

    [Fact]
    public void Start_WithBlockingIssues_Throws()
    {
        var checks = new BackfillSafetyChecks(
            SourceExists: false,
            HasOverlappingBackfill: false,
            WithinRetention: true,
            WithinEventLimit: true,
            WithinDurationLimit: true,
            QuotaAvailable: true,
            Warnings: [],
            Errors: ["Source not found"]);

        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(checks, 1000, TimeSpan.FromMinutes(10), "v");

        Assert.Throws<InvalidOperationException>(() => request.Start("worker"));
    }

    [Fact]
    public void UpdateProgress_UpdatesCounters()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker");

        var newPosition = BaseTime.AddHours(6);
        var updated = request.UpdateProgress(newPosition, processed: 500, skipped: 50, failed: 5, "worker");

        Assert.Equal(newPosition, updated.CurrentPosition);
        Assert.Equal(500, updated.ProcessedEvents);
        Assert.Equal(50, updated.SkippedEvents);
        Assert.Equal(5, updated.FailedEvents);
    }

    [Fact]
    public void UpdateProgress_AccumulatesCounts()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker");

        var after1 = request.UpdateProgress(BaseTime.AddHours(1), 100, 10, 1, "w");
        var after2 = after1.UpdateProgress(BaseTime.AddHours(2), 200, 20, 2, "w");

        Assert.Equal(300, after2.ProcessedEvents);
        Assert.Equal(30, after2.SkippedEvents);
        Assert.Equal(3, after2.FailedEvents);
    }

    [Fact]
    public void ProgressPercent_CalculatesCorrectly()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker")
            .UpdateProgress(BaseTime.AddHours(12), 400, 50, 50, "w");

        Assert.Equal(50.0, request.ProgressPercent);
    }

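    // In the test above, 400 processed + 50 skipped + 50 failed out of 1,000 total
    // events yields 50%. A sketch of that arithmetic, assuming progress counts every
    // event the backfill has moved past (the domain implementation is not shown here):
    private static double ProgressPercentSketch(BackfillRequest request) =>
        request.TotalEvents is { } total && total > 0
            ? (request.ProcessedEvents + request.SkippedEvents + request.FailedEvents) * 100.0 / total
            : 0.0;
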
    [Fact]
    public void Pause_TransitionsToPaused()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker");

        var paused = request.Pause("admin");

        Assert.Equal(BackfillStatus.Paused, paused.Status);
    }

    [Fact]
    public void Resume_TransitionsToRunning()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker")
            .Pause("admin");

        var resumed = request.Resume("admin");

        Assert.Equal(BackfillStatus.Running, resumed.Status);
    }

    [Fact]
    public void Complete_TransitionsToCompleted()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker");

        var completed = request.Complete("worker");

        Assert.Equal(BackfillStatus.Completed, completed.Status);
        Assert.NotNull(completed.CompletedAt);
        Assert.Equal(request.WindowEnd, completed.CurrentPosition);
        Assert.True(completed.IsTerminal);
    }

    [Fact]
    public void Fail_TransitionsToFailed()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker");

        var failed = request.Fail("Connection timeout", "worker");

        Assert.Equal(BackfillStatus.Failed, failed.Status);
        Assert.Equal("Connection timeout", failed.ErrorMessage);
        Assert.NotNull(failed.CompletedAt);
        Assert.True(failed.IsTerminal);
    }

    [Fact]
    public void Cancel_TransitionsToCanceled()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker");

        var canceled = request.Cancel("admin");

        Assert.Equal(BackfillStatus.Canceled, canceled.Status);
        Assert.NotNull(canceled.CompletedAt);
        Assert.True(canceled.IsTerminal);
    }

    [Fact]
    public void Cancel_FromTerminalState_Throws()
    {
        var request = BackfillRequest.Create(TenantId, SourceId, null,
            BaseTime, BaseTime.AddDays(1), "Test", "admin")
            .StartValidation("v")
            .WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
            .Start("worker")
            .Complete("worker");

        Assert.Throws<InvalidOperationException>(() => request.Cancel("admin"));
    }
}

public class BackfillSafetyChecksTests
{
    [Fact]
    public void AllPassed_ReturnsValidChecks()
    {
        var checks = BackfillSafetyChecks.AllPassed();

        Assert.True(checks.SourceExists);
        Assert.False(checks.HasOverlappingBackfill);
        Assert.True(checks.WithinRetention);
        Assert.True(checks.WithinEventLimit);
        Assert.True(checks.WithinDurationLimit);
        Assert.True(checks.QuotaAvailable);
        Assert.Empty(checks.Warnings);
        Assert.Empty(checks.Errors);
        Assert.True(checks.IsSafe);
        Assert.False(checks.HasBlockingIssues);
    }

    [Fact]
    public void HasBlockingIssues_WithMissingSource_ReturnsTrue()
    {
        var checks = new BackfillSafetyChecks(
            SourceExists: false,
            HasOverlappingBackfill: false,
            WithinRetention: true,
            WithinEventLimit: true,
            WithinDurationLimit: true,
            QuotaAvailable: true,
            Warnings: [],
            Errors: []);

        Assert.True(checks.HasBlockingIssues);
        Assert.False(checks.IsSafe);
    }

    [Fact]
    public void HasBlockingIssues_WithOverlap_ReturnsTrue()
    {
        var checks = new BackfillSafetyChecks(
            SourceExists: true,
            HasOverlappingBackfill: true,
            WithinRetention: true,
            WithinEventLimit: true,
            WithinDurationLimit: true,
            QuotaAvailable: true,
            Warnings: [],
            Errors: []);

        Assert.True(checks.HasBlockingIssues);
    }

    [Fact]
    public void HasBlockingIssues_WithErrors_ReturnsTrue()
    {
        var checks = new BackfillSafetyChecks(
            SourceExists: true,
            HasOverlappingBackfill: false,
            WithinRetention: true,
            WithinEventLimit: true,
            WithinDurationLimit: true,
            QuotaAvailable: true,
            Warnings: [],
            Errors: ["Custom error"]);

        Assert.True(checks.HasBlockingIssues);
    }

    [Fact]
    public void IsSafe_WithOnlyWarnings_ReturnsTrue()
    {
        var checks = new BackfillSafetyChecks(
            SourceExists: true,
            HasOverlappingBackfill: false,
            WithinRetention: true,
            WithinEventLimit: true,
            WithinDurationLimit: true,
            QuotaAvailable: true,
            Warnings: ["Large window may take time"],
            Errors: []);

        Assert.True(checks.IsSafe);
        Assert.False(checks.HasBlockingIssues);
    }
}
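// The five tests above jointly pin down the safety gate. A sketch of derived
// properties consistent with them (the record's real computed properties are not
// shown in this commit):
//
//     public bool HasBlockingIssues =>
//         !SourceExists
//         || HasOverlappingBackfill
//         || !WithinRetention
//         || !WithinEventLimit
//         || !WithinDurationLimit
//         || !QuotaAvailable
//         || Errors.Count > 0;
//
//     public bool IsSafe => !HasBlockingIssues; // warnings alone never block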
@@ -0,0 +1,210 @@
using StellaOps.Orchestrator.Core.Backfill;

namespace StellaOps.Orchestrator.Tests.Backfill;

public class DuplicateSuppressorTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string ScopeKey = "source:test123";
    private static readonly TimeSpan DefaultTtl = TimeSpan.FromDays(30);

    [Fact]
    public async Task HasProcessedAsync_NewEvent_ReturnsFalse()
    {
        var suppressor = new InMemoryDuplicateSuppressor();

        var result = await suppressor.HasProcessedAsync(ScopeKey, "event-1", CancellationToken.None);

        Assert.False(result);
    }

    [Fact]
    public async Task HasProcessedAsync_MarkedEvent_ReturnsTrue()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);

        var result = await suppressor.HasProcessedAsync(ScopeKey, "event-1", CancellationToken.None);

        Assert.True(result);
    }

    [Fact]
    public async Task HasProcessedAsync_DifferentScope_ReturnsFalse()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);

        var result = await suppressor.HasProcessedAsync("other-scope", "event-1", CancellationToken.None);

        Assert.False(result);
    }

    [Fact]
    public async Task GetProcessedAsync_ReturnsOnlyProcessedKeys()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
        await suppressor.MarkProcessedAsync(ScopeKey, "event-3", BaseTime, null, DefaultTtl, CancellationToken.None);

        var eventKeys = new[] { "event-1", "event-2", "event-3", "event-4" };
        var result = await suppressor.GetProcessedAsync(ScopeKey, eventKeys, CancellationToken.None);

        Assert.Equal(2, result.Count);
        Assert.Contains("event-1", result);
        Assert.Contains("event-3", result);
        Assert.DoesNotContain("event-2", result);
        Assert.DoesNotContain("event-4", result);
    }

    [Fact]
    public async Task GetProcessedAsync_EmptyInput_ReturnsEmptySet()
    {
        var suppressor = new InMemoryDuplicateSuppressor();

        var result = await suppressor.GetProcessedAsync(ScopeKey, [], CancellationToken.None);

        Assert.Empty(result);
    }

    [Fact]
    public async Task MarkProcessedBatchAsync_MarksAllEvents()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        var events = new[]
        {
            new ProcessedEvent("event-1", BaseTime),
            new ProcessedEvent("event-2", BaseTime.AddMinutes(1)),
            new ProcessedEvent("event-3", BaseTime.AddMinutes(2))
        };

        await suppressor.MarkProcessedBatchAsync(ScopeKey, events, Guid.NewGuid(), DefaultTtl, CancellationToken.None);

        Assert.True(await suppressor.HasProcessedAsync(ScopeKey, "event-1", CancellationToken.None));
        Assert.True(await suppressor.HasProcessedAsync(ScopeKey, "event-2", CancellationToken.None));
        Assert.True(await suppressor.HasProcessedAsync(ScopeKey, "event-3", CancellationToken.None));
    }

    [Fact]
    public async Task CountProcessedAsync_ReturnsCorrectCount()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        var events = new[]
        {
            new ProcessedEvent("event-1", BaseTime.AddHours(1)),
            new ProcessedEvent("event-2", BaseTime.AddHours(2)),
            new ProcessedEvent("event-3", BaseTime.AddHours(3)),
            new ProcessedEvent("event-4", BaseTime.AddHours(5)) // Outside range
        };
        await suppressor.MarkProcessedBatchAsync(ScopeKey, events, null, DefaultTtl, CancellationToken.None);

        var count = await suppressor.CountProcessedAsync(
            ScopeKey,
            BaseTime,
            BaseTime.AddHours(4),
            CancellationToken.None);

        Assert.Equal(3, count);
    }

    [Fact]
    public async Task CountProcessedAsync_DifferentScope_ReturnsZero()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);

        var count = await suppressor.CountProcessedAsync(
            "other-scope",
            BaseTime.AddHours(-1),
            BaseTime.AddHours(1),
            CancellationToken.None);

        Assert.Equal(0, count);
    }

    [Fact]
    public async Task FilterAsync_SeparatesDuplicatesFromNew()
    {
        var suppressor = new InMemoryDuplicateSuppressor();
        await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
        await suppressor.MarkProcessedAsync(ScopeKey, "event-3", BaseTime, null, DefaultTtl, CancellationToken.None);

        var events = new[] { "event-1", "event-2", "event-3", "event-4" };
        var result = await suppressor.FilterAsync(
            ScopeKey,
            events,
            e => e,
            CancellationToken.None);

        Assert.Equal(4, result.Total);
        Assert.Equal(2, result.ProcessCount);
        Assert.Equal(2, result.DuplicateCount);
        Assert.Contains("event-2", result.ToProcess);
        Assert.Contains("event-4", result.ToProcess);
        Assert.Contains("event-1", result.Duplicates);
        Assert.Contains("event-3", result.Duplicates);
    }

    [Fact]
    public async Task FilterAsync_WithEmptyList_ReturnsEmptyResult()
    {
        var suppressor = new InMemoryDuplicateSuppressor();

        var result = await suppressor.FilterAsync<string>(
            ScopeKey,
            [],
            e => e,
            CancellationToken.None);

        Assert.Equal(0, result.Total);
        Assert.Empty(result.ToProcess);
        Assert.Empty(result.Duplicates);
    }

    [Fact]
    public void DuplicateFilterResult_CalculatesDuplicatePercent()
    {
        var result = new DuplicateFilterResult<string>(
            ToProcess: ["a", "b"],
            Duplicates: ["c", "d", "e"],
            Total: 5);

        Assert.Equal(60.0, result.DuplicatePercent);
    }

    [Fact]
    public void DuplicateFilterResult_WithZeroTotal_ReturnsZeroPercent()
    {
        var result = new DuplicateFilterResult<string>(
            ToProcess: [],
            Duplicates: [],
            Total: 0);

        Assert.Equal(0.0, result.DuplicatePercent);
    }
}

public class ProcessedEventTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);

    [Fact]
    public void ProcessedEvent_StoresProperties()
    {
        var evt = new ProcessedEvent("event-123", BaseTime);

        Assert.Equal("event-123", evt.EventKey);
        Assert.Equal(BaseTime, evt.EventTime);
    }

    [Fact]
    public void ProcessedEvent_EqualsComparison()
    {
        var evt1 = new ProcessedEvent("event-123", BaseTime);
        var evt2 = new ProcessedEvent("event-123", BaseTime);
        var evt3 = new ProcessedEvent("event-456", BaseTime);

        Assert.Equal(evt1, evt2);
        Assert.NotEqual(evt1, evt3);
    }
}
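// Sketches consistent with the filter tests above (illustrative; the suppressor's
// real code is not part of this commit). FilterAsync can be built on GetProcessedAsync:
//
//     var keys = events.Select(keySelector).ToArray();
//     var processed = await GetProcessedAsync(scopeKey, keys, cancellationToken);
//     var toProcess = events.Where(e => !processed.Contains(keySelector(e))).ToList();
//     var duplicates = events.Where(e => processed.Contains(keySelector(e))).ToList();
//     return new DuplicateFilterResult<T>(toProcess, duplicates, events.Count);
//
// and DuplicatePercent guards the zero-total case:
//
//     public double DuplicatePercent => Total == 0 ? 0.0 : Duplicates.Count * 100.0 / Total;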
@@ -0,0 +1,355 @@
using StellaOps.Orchestrator.Core.Backfill;

namespace StellaOps.Orchestrator.Tests.Backfill;

public class EventTimeWindowTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);

    [Fact]
    public void Duration_ReturnsCorrectValue()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));

        Assert.Equal(TimeSpan.FromHours(2), window.Duration);
    }

    [Fact]
    public void IsEmpty_WithEqualStartEnd_ReturnsTrue()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime);

        Assert.True(window.IsEmpty);
    }

    [Fact]
    public void IsEmpty_WithEndBeforeStart_ReturnsTrue()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(-1));

        Assert.True(window.IsEmpty);
    }

    [Fact]
    public void IsEmpty_WithValidWindow_ReturnsFalse()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(1));

        Assert.False(window.IsEmpty);
    }

    [Fact]
    public void Contains_TimestampInWindow_ReturnsTrue()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));

        Assert.True(window.Contains(BaseTime));
        Assert.True(window.Contains(BaseTime.AddHours(1)));
    }

    [Fact]
    public void Contains_TimestampAtEnd_ReturnsFalse()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));

        Assert.False(window.Contains(BaseTime.AddHours(2)));
    }

    [Fact]
    public void Contains_TimestampOutsideWindow_ReturnsFalse()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));

        Assert.False(window.Contains(BaseTime.AddHours(-1)));
        Assert.False(window.Contains(BaseTime.AddHours(3)));
    }

    [Fact]
    public void Overlaps_WithOverlappingWindow_ReturnsTrue()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
        var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(3));

        Assert.True(window1.Overlaps(window2));
        Assert.True(window2.Overlaps(window1));
    }

    [Fact]
    public void Overlaps_WithContainedWindow_ReturnsTrue()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(4));
        var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(2));

        Assert.True(window1.Overlaps(window2));
        Assert.True(window2.Overlaps(window1));
    }

    [Fact]
    public void Overlaps_WithAdjacentWindow_ReturnsFalse()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
        var window2 = new EventTimeWindow(BaseTime.AddHours(2), BaseTime.AddHours(4));

        Assert.False(window1.Overlaps(window2));
        Assert.False(window2.Overlaps(window1));
    }

    [Fact]
    public void Overlaps_WithDisjointWindow_ReturnsFalse()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(1));
        var window2 = new EventTimeWindow(BaseTime.AddHours(3), BaseTime.AddHours(4));

        Assert.False(window1.Overlaps(window2));
        Assert.False(window2.Overlaps(window1));
    }

    [Fact]
    public void Intersect_WithOverlappingWindow_ReturnsIntersection()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(3));
        var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(4));

        var intersection = window1.Intersect(window2);

        Assert.NotNull(intersection);
        Assert.Equal(BaseTime.AddHours(1), intersection.Start);
        Assert.Equal(BaseTime.AddHours(3), intersection.End);
    }

    [Fact]
    public void Intersect_WithContainedWindow_ReturnsContained()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(4));
        var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(2));

        var intersection = window1.Intersect(window2);

        Assert.NotNull(intersection);
        Assert.Equal(window2, intersection);
    }

    [Fact]
    public void Intersect_WithDisjointWindow_ReturnsNull()
    {
        var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(1));
        var window2 = new EventTimeWindow(BaseTime.AddHours(2), BaseTime.AddHours(3));

        var intersection = window1.Intersect(window2);

        Assert.Null(intersection);
    }

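    // Sketches of semantics consistent with the Overlaps/Intersect assertions above,
    // assuming half-open [Start, End) windows; the production implementation is not
    // shown in this commit.
    private static bool OverlapsSketch(EventTimeWindow a, EventTimeWindow b) =>
        a.Start < b.End && b.Start < a.End; // adjacent windows do not overlap

    private static EventTimeWindow? IntersectSketch(EventTimeWindow a, EventTimeWindow b)
    {
        var start = a.Start > b.Start ? a.Start : b.Start;
        var end = a.End < b.End ? a.End : b.End;
        return start < end ? new EventTimeWindow(start, end) : null; // null when disjoint
    }
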
    [Fact]
    public void Split_DividesIntoEqualBatches()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(6));

        var batches = window.Split(TimeSpan.FromHours(2)).ToList();

        Assert.Equal(3, batches.Count);
        Assert.Equal(BaseTime, batches[0].Start);
        Assert.Equal(BaseTime.AddHours(2), batches[0].End);
        Assert.Equal(BaseTime.AddHours(2), batches[1].Start);
        Assert.Equal(BaseTime.AddHours(4), batches[1].End);
        Assert.Equal(BaseTime.AddHours(4), batches[2].Start);
        Assert.Equal(BaseTime.AddHours(6), batches[2].End);
    }

    [Fact]
    public void Split_WithRemainder_CreatesPartialFinalBatch()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(5));

        var batches = window.Split(TimeSpan.FromHours(2)).ToList();

        Assert.Equal(3, batches.Count);
        Assert.Equal(BaseTime.AddHours(4), batches[2].Start);
        Assert.Equal(BaseTime.AddHours(5), batches[2].End);
        Assert.Equal(TimeSpan.FromHours(1), batches[2].Duration);
    }

    [Fact]
    public void Split_WithZeroDuration_Throws()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            window.Split(TimeSpan.Zero).ToList());
    }

    [Fact]
    public void Split_WithNegativeDuration_Throws()
    {
        var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            window.Split(TimeSpan.FromHours(-1)).ToList());
    }

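    // Sketch of the splitting behavior the four tests above exercise: fixed-size
    // batches with a shorter final batch, and the batch duration validated up front.
    // An illustration under those assumptions, not the shipped implementation.
    private static IEnumerable<EventTimeWindow> SplitSketch(EventTimeWindow window, TimeSpan batchSize)
    {
        if (batchSize <= TimeSpan.Zero)
            throw new ArgumentOutOfRangeException(nameof(batchSize));

        var cursor = window.Start;
        while (cursor < window.End)
        {
            var next = cursor + batchSize;
            if (next > window.End)
            {
                next = window.End; // partial final batch
            }
            yield return new EventTimeWindow(cursor, next);
            cursor = next;
        }
    }
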
    [Fact]
    public void FromDuration_CreatesCorrectWindow()
    {
        var window = EventTimeWindow.FromDuration(BaseTime, TimeSpan.FromHours(3));

        Assert.Equal(BaseTime.AddHours(-3), window.Start);
        Assert.Equal(BaseTime, window.End);
    }

    [Fact]
    public void LastHours_CreatesCorrectWindow()
    {
        var window = EventTimeWindow.LastHours(6, BaseTime);

        Assert.Equal(BaseTime.AddHours(-6), window.Start);
        Assert.Equal(BaseTime, window.End);
    }

    [Fact]
    public void LastDays_CreatesCorrectWindow()
    {
        var window = EventTimeWindow.LastDays(7, BaseTime);

        Assert.Equal(BaseTime.AddDays(-7), window.Start);
        Assert.Equal(BaseTime, window.End);
    }
}

public class EventTimeWindowPlannerTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private static readonly EventTimeWindowOptions TestOptions = new(
        MinWindowSize: TimeSpan.FromMinutes(5),
        MaxWindowSize: TimeSpan.FromHours(1),
        OverlapDuration: TimeSpan.FromMinutes(5),
        MaxLag: TimeSpan.FromHours(2),
        InitialLookback: TimeSpan.FromDays(7));

    [Fact]
    public void GetNextWindow_WithNoWatermark_ReturnsInitialWindow()
    {
        var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, null, TestOptions);

        Assert.NotNull(window);
        Assert.Equal(BaseTime - TestOptions.InitialLookback, window.Start);
        Assert.Equal(window.Start + TestOptions.MaxWindowSize, window.End);
    }

    [Fact]
    public void GetNextWindow_WithWatermark_ReturnsIncrementalWindow()
    {
        var watermark = BaseTime.AddHours(-2);

        var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, watermark, TestOptions);

        Assert.NotNull(window);
        Assert.Equal(watermark - TestOptions.OverlapDuration, window.Start);
    }

    [Fact]
    public void GetNextWindow_WhenCaughtUp_ReturnsNull()
    {
        var watermark = BaseTime.AddMinutes(-3); // Less than MinWindowSize from now

        var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, watermark, TestOptions);

        Assert.Null(window);
    }

    [Fact]
    public void GetNextWindow_CapsAtNow()
    {
        var watermark = BaseTime.AddMinutes(-30); // 30 minutes ago

        var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, watermark, TestOptions);

        Assert.NotNull(window);
        Assert.True(window.End <= BaseTime);
    }

    [Fact]
    public void CalculateLag_ReturnsCorrectValue()
    {
        var watermark = BaseTime.AddHours(-2);

        var lag = EventTimeWindowPlanner.CalculateLag(BaseTime, watermark);

        Assert.Equal(TimeSpan.FromHours(2), lag);
    }

    [Fact]
    public void IsLagging_WithinThreshold_ReturnsFalse()
    {
        var watermark = BaseTime.AddHours(-1);

        var isLagging = EventTimeWindowPlanner.IsLagging(BaseTime, watermark, TestOptions);

        Assert.False(isLagging);
    }

    [Fact]
    public void IsLagging_ExceedsThreshold_ReturnsTrue()
    {
        var watermark = BaseTime.AddHours(-3);

        var isLagging = EventTimeWindowPlanner.IsLagging(BaseTime, watermark, TestOptions);

        Assert.True(isLagging);
    }

    [Fact]
    public void EstimateWindowsToProcess_WithNoWatermark_ReturnsInitialCount()
    {
        var count = EventTimeWindowPlanner.EstimateWindowsToProcess(BaseTime, null, TestOptions);

        // 7 days / 1 hour = 168 windows
        Assert.Equal(168, count);
    }

    [Fact]
    public void EstimateWindowsToProcess_WithWatermark_ReturnsLagCount()
    {
        var watermark = BaseTime.AddHours(-3);

        var count = EventTimeWindowPlanner.EstimateWindowsToProcess(BaseTime, watermark, TestOptions);

        Assert.Equal(3, count);
    }

    [Fact]
    public void EstimateWindowsToProcess_WhenCaughtUp_ReturnsZero()
    {
        var watermark = BaseTime.AddMinutes(-3);

        var count = EventTimeWindowPlanner.EstimateWindowsToProcess(BaseTime, watermark, TestOptions);

        Assert.Equal(0, count);
    }
}

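// Sketch of the planning rule the tests above pin down: treat the source as caught
// up when the watermark is within MinWindowSize of now; otherwise start just behind
// the watermark (or InitialLookback back on first run) and cap the window at now.
// Illustrative only; the production planner is not shown in this commit.
internal static class EventTimeWindowPlannerSketch
{
    public static EventTimeWindow? GetNextWindow(
        DateTimeOffset now, DateTimeOffset? watermark, EventTimeWindowOptions options)
    {
        if (watermark is { } wm && now - wm < options.MinWindowSize)
        {
            return null; // caught up: nothing new worth planning yet
        }

        var start = watermark is { } w
            ? w - options.OverlapDuration      // re-read a small overlap for late events
            : now - options.InitialLookback;   // first run: full lookback

        var end = start + options.MaxWindowSize;
        if (end > now)
        {
            end = now; // never plan into the future
        }

        return new EventTimeWindow(start, end);
    }
}
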
public class EventTimeWindowOptionsTests
{
    [Fact]
    public void HourlyBatches_HasCorrectDefaults()
    {
        var options = EventTimeWindowOptions.HourlyBatches;

        Assert.Equal(TimeSpan.FromMinutes(5), options.MinWindowSize);
        Assert.Equal(TimeSpan.FromHours(1), options.MaxWindowSize);
        Assert.Equal(TimeSpan.FromMinutes(5), options.OverlapDuration);
        Assert.Equal(TimeSpan.FromHours(2), options.MaxLag);
        Assert.Equal(TimeSpan.FromDays(7), options.InitialLookback);
    }

    [Fact]
    public void DailyBatches_HasCorrectDefaults()
    {
        var options = EventTimeWindowOptions.DailyBatches;

        Assert.Equal(TimeSpan.FromHours(1), options.MinWindowSize);
        Assert.Equal(TimeSpan.FromDays(1), options.MaxWindowSize);
        Assert.Equal(TimeSpan.FromHours(1), options.OverlapDuration);
        Assert.Equal(TimeSpan.FromDays(1), options.MaxLag);
        Assert.Equal(TimeSpan.FromDays(30), options.InitialLookback);
    }
}
@@ -0,0 +1,157 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.Backfill;

public class WatermarkTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string TenantId = "test-tenant";
    private static readonly Guid SourceId = Guid.NewGuid();
    private const string JobType = "scan";

    [Fact]
    public void CreateScopeKey_WithSourceId_ReturnsCorrectFormat()
    {
        var sourceId = Guid.Parse("12345678-1234-1234-1234-123456789abc");
        var scopeKey = Watermark.CreateScopeKey(sourceId);

        Assert.Equal("source:12345678123412341234123456789abc", scopeKey);
    }

    [Fact]
    public void CreateScopeKey_WithJobType_ReturnsCorrectFormat()
    {
        var scopeKey = Watermark.CreateScopeKey("Scan");

        Assert.Equal("job_type:scan", scopeKey);
    }

    [Fact]
    public void CreateScopeKey_WithSourceIdAndJobType_ReturnsCorrectFormat()
    {
        var sourceId = Guid.Parse("12345678-1234-1234-1234-123456789abc");
        var scopeKey = Watermark.CreateScopeKey(sourceId, "Scan");

        Assert.Equal("source:12345678123412341234123456789abc:job_type:scan", scopeKey);
    }

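    // The expected strings above imply "N"-formatted (lowercase, dashless) GUIDs and
    // lower-cased job types. A sketch consistent with those assertions; the actual
    // Watermark.CreateScopeKey implementation is not shown in this commit.
    private static string ScopeKeySketch(Guid? sourceId, string? jobType) =>
        (sourceId, jobType) switch
        {
            ({ } id, null) => $"source:{id:N}",
            (null, { } jt) => $"job_type:{jt.ToLowerInvariant()}",
            ({ } id, { } jt) => $"source:{id:N}:job_type:{jt.ToLowerInvariant()}",
            _ => throw new ArgumentException("Either sourceId or jobType is required."),
        };
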
    [Fact]
    public void Create_WithSourceId_CreatesValidWatermark()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");

        Assert.NotEqual(Guid.Empty, watermark.WatermarkId);
        Assert.Equal(TenantId, watermark.TenantId);
        Assert.Equal(SourceId, watermark.SourceId);
        Assert.Null(watermark.JobType);
        Assert.Equal(BaseTime, watermark.HighWatermark);
        Assert.Null(watermark.LowWatermark);
        Assert.Equal(0, watermark.SequenceNumber);
        Assert.Equal(0, watermark.ProcessedCount);
        Assert.Null(watermark.LastBatchHash);
        Assert.Equal("system", watermark.UpdatedBy);
    }

    [Fact]
    public void Create_WithJobType_CreatesValidWatermark()
    {
        var watermark = Watermark.Create(TenantId, null, JobType, BaseTime, "system");

        Assert.NotEqual(Guid.Empty, watermark.WatermarkId);
        Assert.Equal(TenantId, watermark.TenantId);
        Assert.Null(watermark.SourceId);
        Assert.Equal(JobType, watermark.JobType);
        Assert.Equal($"job_type:{JobType}", watermark.ScopeKey);
    }

    [Fact]
    public void Create_WithBothSourceIdAndJobType_CreatesCombinedScopeKey()
    {
        var watermark = Watermark.Create(TenantId, SourceId, JobType, BaseTime, "system");

        Assert.Equal(SourceId, watermark.SourceId);
        Assert.Equal(JobType, watermark.JobType);
        Assert.Contains("source:", watermark.ScopeKey);
        Assert.Contains("job_type:", watermark.ScopeKey);
    }

    [Fact]
    public void Create_WithoutSourceIdOrJobType_Throws()
    {
        Assert.Throws<ArgumentException>(() =>
            Watermark.Create(TenantId, null, null, BaseTime, "system"));
    }

    [Fact]
    public void Advance_IncreasesHighWatermarkAndSequence()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
        var newTime = BaseTime.AddHours(1);
        var batchHash = "abc123def456";

        var advanced = watermark.Advance(newTime, 100, batchHash, "worker-1");

        Assert.Equal(newTime, advanced.HighWatermark);
        Assert.Equal(1, advanced.SequenceNumber);
        Assert.Equal(100, advanced.ProcessedCount);
        Assert.Equal(batchHash, advanced.LastBatchHash);
        Assert.Equal("worker-1", advanced.UpdatedBy);
    }

    [Fact]
    public void Advance_AccumulatesProcessedCount()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");

        var after1 = watermark.Advance(BaseTime.AddHours(1), 100, null, "worker");
        var after2 = after1.Advance(BaseTime.AddHours(2), 150, null, "worker");

        Assert.Equal(250, after2.ProcessedCount);
        Assert.Equal(2, after2.SequenceNumber);
    }

    [Fact]
    public void Advance_WithEarlierTime_Throws()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
        var earlierTime = BaseTime.AddHours(-1);

        Assert.Throws<ArgumentException>(() =>
            watermark.Advance(earlierTime, 100, null, "worker"));
    }

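    // Sketch of the advance semantics the three tests above encode: monotonic high
    // watermark, incrementing sequence, accumulating processed count. Assumes
    // Watermark is a record usable with a `with` expression; illustrative, not the
    // shipped implementation.
    private static Watermark AdvanceSketch(
        Watermark watermark, DateTimeOffset newHighWatermark, long batchCount, string? batchHash, string updatedBy)
    {
        if (newHighWatermark < watermark.HighWatermark)
        {
            throw new ArgumentException("High watermark cannot move backwards.", nameof(newHighWatermark));
        }

        return watermark with
        {
            HighWatermark = newHighWatermark,
            SequenceNumber = watermark.SequenceNumber + 1,
            ProcessedCount = watermark.ProcessedCount + batchCount,
            LastBatchHash = batchHash,
            UpdatedBy = updatedBy,
        };
    }
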
    [Fact]
    public void WithWindow_SetsWindowBounds()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
        var lowWm = BaseTime.AddHours(-1);
        var highWm = BaseTime.AddHours(1);

        var windowed = watermark.WithWindow(lowWm, highWm);

        Assert.Equal(lowWm, windowed.LowWatermark);
        Assert.Equal(highWm, windowed.HighWatermark);
    }

    [Fact]
    public void WithWindow_HighBeforeLow_Throws()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");

        Assert.Throws<ArgumentException>(() =>
            watermark.WithWindow(BaseTime.AddHours(1), BaseTime.AddHours(-1)));
    }

    [Fact]
    public void WatermarkSnapshot_CalculatesLag()
    {
        var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
        var now = BaseTime.AddHours(2);

        var snapshot = WatermarkSnapshot.FromWatermark(watermark, now);

        Assert.Equal(watermark.ScopeKey, snapshot.ScopeKey);
        Assert.Equal(watermark.HighWatermark, snapshot.HighWatermark);
        Assert.Equal(TimeSpan.FromHours(2), snapshot.Lag);
    }
}
@@ -0,0 +1,355 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.ControlPlane;

/// <summary>
/// Control-plane validation tests for Run domain and lifecycle operations.
/// These tests validate the Run record, status transitions, and job counting.
/// </summary>
public sealed class RunTests
{
    private const string TestTenantId = "test-tenant";

    [Fact]
    public void Run_Creation_WithValidData_Succeeds()
    {
        var runId = Guid.NewGuid();
        var sourceId = Guid.NewGuid();
        var now = DateTimeOffset.UtcNow;

        var run = new Run(
            RunId: runId,
            TenantId: TestTenantId,
            ProjectId: "project-1",
            SourceId: sourceId,
            RunType: "scan",
            Status: RunStatus.Pending,
            CorrelationId: "corr-123",
            TotalJobs: 5,
            CompletedJobs: 0,
            SucceededJobs: 0,
            FailedJobs: 0,
            CreatedAt: now,
            StartedAt: null,
            CompletedAt: null,
            CreatedBy: "system",
            Metadata: """{"image":"alpine:3.18"}""");

        Assert.Equal(runId, run.RunId);
        Assert.Equal(TestTenantId, run.TenantId);
        Assert.Equal("project-1", run.ProjectId);
        Assert.Equal(sourceId, run.SourceId);
        Assert.Equal("scan", run.RunType);
        Assert.Equal(RunStatus.Pending, run.Status);
        Assert.Equal(5, run.TotalJobs);
        Assert.Equal(0, run.CompletedJobs);
        Assert.Null(run.StartedAt);
        Assert.Null(run.CompletedAt);
    }

    [Fact]
    public void Run_StatusTransition_PendingToRunning()
    {
        var run = CreateRun(RunStatus.Pending);
        var started = run with
        {
            Status = RunStatus.Running,
            StartedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(RunStatus.Running, started.Status);
        Assert.NotNull(started.StartedAt);
    }

    [Fact]
    public void Run_StatusTransition_RunningToSucceeded()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 3, completedJobs: 2, succeededJobs: 2);
        var completed = run with
        {
            Status = RunStatus.Succeeded,
            CompletedJobs = 3,
            SucceededJobs = 3,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(RunStatus.Succeeded, completed.Status);
        Assert.Equal(3, completed.CompletedJobs);
        Assert.Equal(3, completed.SucceededJobs);
        Assert.Equal(0, completed.FailedJobs);
        Assert.NotNull(completed.CompletedAt);
    }

    [Fact]
    public void Run_StatusTransition_RunningToPartiallySucceeded()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 5, completedJobs: 4, succeededJobs: 3, failedJobs: 1);
        var completed = run with
        {
            Status = RunStatus.PartiallySucceeded,
            CompletedJobs = 5,
            SucceededJobs = 4,
            FailedJobs = 1,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(RunStatus.PartiallySucceeded, completed.Status);
        Assert.Equal(5, completed.CompletedJobs);
        Assert.Equal(4, completed.SucceededJobs);
        Assert.Equal(1, completed.FailedJobs);
    }

    [Fact]
    public void Run_StatusTransition_RunningToFailed()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 3, completedJobs: 2, failedJobs: 2);
        var failed = run with
        {
            Status = RunStatus.Failed,
            CompletedJobs = 3,
            FailedJobs = 3,
            SucceededJobs = 0,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(RunStatus.Failed, failed.Status);
        Assert.Equal(0, failed.SucceededJobs);
        Assert.Equal(3, failed.FailedJobs);
    }

    [Fact]
    public void Run_StatusTransition_ToCanceled()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 5, completedJobs: 2);
        var canceled = run with
        {
            Status = RunStatus.Canceled,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(RunStatus.Canceled, canceled.Status);
        Assert.Equal(2, canceled.CompletedJobs); // Preserves completed count
        Assert.NotNull(canceled.CompletedAt);
    }

    [Theory]
    [InlineData(RunStatus.Pending)]
    [InlineData(RunStatus.Running)]
    [InlineData(RunStatus.Succeeded)]
    [InlineData(RunStatus.PartiallySucceeded)]
    [InlineData(RunStatus.Failed)]
    [InlineData(RunStatus.Canceled)]
    public void RunStatus_AllValues_AreValid(RunStatus status)
    {
        var run = CreateRun(status);
        Assert.Equal(status, run.Status);
    }

    [Fact]
    public void Run_JobCounting_IncrementSucceeded()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 3, completedJobs: 0);

        var afterOne = run with
        {
            CompletedJobs = 1,
            SucceededJobs = 1
        };

        var afterTwo = afterOne with
        {
            CompletedJobs = 2,
            SucceededJobs = 2
        };

        var afterThree = afterTwo with
        {
            CompletedJobs = 3,
            SucceededJobs = 3,
            Status = RunStatus.Succeeded,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(1, afterOne.CompletedJobs);
        Assert.Equal(2, afterTwo.CompletedJobs);
        Assert.Equal(3, afterThree.CompletedJobs);
        Assert.Equal(RunStatus.Succeeded, afterThree.Status);
    }

    [Fact]
    public void Run_JobCounting_IncrementFailed()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 2, completedJobs: 0);

        var afterOne = run with
        {
            CompletedJobs = 1,
            FailedJobs = 1
        };

        var afterTwo = afterOne with
        {
            CompletedJobs = 2,
            FailedJobs = 2,
            Status = RunStatus.Failed,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(2, afterTwo.FailedJobs);
        Assert.Equal(0, afterTwo.SucceededJobs);
        Assert.Equal(RunStatus.Failed, afterTwo.Status);
    }

    [Fact]
    public void Run_JobCounting_MixedResults_PartialSuccess()
    {
        var run = CreateRun(RunStatus.Running, totalJobs: 4);

        var final = run with
        {
            CompletedJobs = 4,
            SucceededJobs = 3,
            FailedJobs = 1,
            Status = RunStatus.PartiallySucceeded,
            CompletedAt = DateTimeOffset.UtcNow
        };

        Assert.Equal(4, final.CompletedJobs);
        Assert.Equal(3, final.SucceededJobs);
        Assert.Equal(1, final.FailedJobs);
        Assert.Equal(RunStatus.PartiallySucceeded, final.Status);
    }

    [Fact]
    public void Run_JobCounting_Invariant_CompletedEqualsSucceededPlusFailed()
    {
        var run = CreateRun(
            RunStatus.Running,
            totalJobs: 10,
            completedJobs: 7,
            succeededJobs: 5,
            failedJobs: 2);

        Assert.Equal(run.SucceededJobs + run.FailedJobs, run.CompletedJobs);
    }

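    // A plausible terminal-status rule consistent with the transitions exercised
    // above (no failures => Succeeded, no successes => Failed, mixed =>
    // PartiallySucceeded); the orchestration layer's actual rule is not shown in
    // this commit.
    private static RunStatus FinalStatusSketch(int succeededJobs, int failedJobs) =>
        (succeededJobs, failedJobs) switch
        {
            (_, 0) => RunStatus.Succeeded,
            (0, _) => RunStatus.Failed,
            _ => RunStatus.PartiallySucceeded,
        };
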
    [Fact]
    public void Run_Duration_CanBeCalculated()
    {
        var startedAt = new DateTimeOffset(2025, 1, 1, 10, 0, 0, TimeSpan.Zero);
        var completedAt = new DateTimeOffset(2025, 1, 1, 10, 5, 30, TimeSpan.Zero);

        var run = new Run(
            Guid.NewGuid(), TestTenantId, null, Guid.NewGuid(), "scan",
            RunStatus.Succeeded, null, 5, 5, 5, 0,
            startedAt.AddMinutes(-1), startedAt, completedAt, "system", null);

        var duration = run.CompletedAt!.Value - run.StartedAt!.Value;

        Assert.Equal(TimeSpan.FromMinutes(5.5), duration);
    }

    [Theory]
    [InlineData("scan")]
    [InlineData("advisory-sync")]
    [InlineData("export")]
    [InlineData("policy-evaluation")]
    public void Run_RunType_AcceptsValidTypes(string runType)
    {
        var run = CreateRun(runType: runType);
        Assert.Equal(runType, run.RunType);
    }

    [Fact]
    public void Run_ProjectId_CanBeNull()
    {
        var run = CreateRun(projectId: null);
        Assert.Null(run.ProjectId);
    }

    [Fact]
    public void Run_CorrelationId_ForDistributedTracing()
    {
        var correlationId = "trace-" + Guid.NewGuid().ToString("N")[..8];
        var run = CreateRun(correlationId: correlationId);

        Assert.Equal(correlationId, run.CorrelationId);
    }

    [Fact]
    public void Run_Metadata_CanContainJsonBlob()
    {
        var metadata = """
            {
              "image": "alpine:3.18",
              "analyzers": ["syft", "grype", "trivy"],
              "priority": "high"
            }
            """;

        var run = CreateRun(metadata: metadata);
        Assert.Contains("alpine:3.18", run.Metadata);
        Assert.Contains("analyzers", run.Metadata);
    }

    [Fact]
    public void Run_Equality_BasedOnRecordSemantics()
    {
        var runId = Guid.NewGuid();
        var sourceId = Guid.NewGuid();
        var now = DateTimeOffset.UtcNow;

        var run1 = new Run(
            runId, TestTenantId, null, sourceId, "scan",
            RunStatus.Pending, null, 5, 0, 0, 0,
            now, null, null, "system", null);

        var run2 = new Run(
            runId, TestTenantId, null, sourceId, "scan",
            RunStatus.Pending, null, 5, 0, 0, 0,
            now, null, null, "system", null);

        Assert.Equal(run1, run2);
    }

    [Fact]
    public void Run_ZeroTotalJobs_IsValid()
    {
        // Edge case: run with no jobs (perhaps all filtered out)
        var run = CreateRun(totalJobs: 0);

        Assert.Equal(0, run.TotalJobs);
        Assert.Equal(0, run.CompletedJobs);
    }

    private static Run CreateRun(
        RunStatus status = RunStatus.Pending,
        int totalJobs = 5,
        int completedJobs = 0,
        int succeededJobs = 0,
        int failedJobs = 0,
        string runType = "test-run",
        string? projectId = "test-project",
        string? correlationId = null,
        string? metadata = null)
    {
        var now = DateTimeOffset.UtcNow;
        return new Run(
            RunId: Guid.NewGuid(),
            TenantId: TestTenantId,
            ProjectId: projectId,
            SourceId: Guid.NewGuid(),
            RunType: runType,
            Status: status,
            CorrelationId: correlationId,
            TotalJobs: totalJobs,
            CompletedJobs: completedJobs,
            SucceededJobs: succeededJobs,
            FailedJobs: failedJobs,
            CreatedAt: now,
            StartedAt: status == RunStatus.Running ? now : null,
            CompletedAt: null,
            CreatedBy: "system",
            Metadata: metadata);
    }
}
@@ -0,0 +1,260 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.ControlPlane;

/// <summary>
/// Control-plane validation tests for Source domain and operations.
/// These tests validate the Source record, its invariants, and business rules.
/// </summary>
public sealed class SourceTests
{
    private const string TestTenantId = "test-tenant";

    [Fact]
    public void Source_Creation_WithValidData_Succeeds()
    {
        var sourceId = Guid.NewGuid();
        var now = DateTimeOffset.UtcNow;

        var source = new Source(
            SourceId: sourceId,
            TenantId: TestTenantId,
            Name: "concelier-nvd",
            SourceType: "advisory-ingest",
            Enabled: true,
            Paused: false,
            PauseReason: null,
            PauseTicket: null,
            Configuration: """{"feed_url":"https://nvd.nist.gov"}""",
            CreatedAt: now,
            UpdatedAt: now,
            UpdatedBy: "system");

        Assert.Equal(sourceId, source.SourceId);
        Assert.Equal(TestTenantId, source.TenantId);
        Assert.Equal("concelier-nvd", source.Name);
        Assert.Equal("advisory-ingest", source.SourceType);
        Assert.True(source.Enabled);
        Assert.False(source.Paused);
        Assert.Null(source.PauseReason);
        Assert.NotNull(source.Configuration);
    }

    [Fact]
    public void Source_Creation_WithPausedState_HasReasonAndTicket()
    {
        var source = CreatePausedSource(
            "Maintenance window",
            "OPS-1234");

        Assert.True(source.Paused);
        Assert.Equal("Maintenance window", source.PauseReason);
        Assert.Equal("OPS-1234", source.PauseTicket);
    }

    [Fact]
    public void Source_Creation_DisabledSource_IsNotPaused()
    {
        var source = CreateSource(enabled: false, paused: false);

        Assert.False(source.Enabled);
        Assert.False(source.Paused);
    }

    [Fact]
    public void Source_WithRecord_AllowsImmutableUpdates()
    {
        var original = CreateSource();
        var updated = original with { Enabled = false, UpdatedAt = DateTimeOffset.UtcNow };

        Assert.True(original.Enabled);
        Assert.False(updated.Enabled);
        Assert.Equal(original.SourceId, updated.SourceId);
        Assert.Equal(original.Name, updated.Name);
    }

    [Fact]
    public void Source_Pause_UpdatesStateCorrectly()
    {
        var original = CreateSource();
        var now = DateTimeOffset.UtcNow;

        var paused = original with
        {
            Paused = true,
            PauseReason = "Rate limit exceeded",
            PauseTicket = "INC-5678",
            UpdatedAt = now,
            UpdatedBy = "operator"
        };

        Assert.False(original.Paused);
        Assert.True(paused.Paused);
        Assert.Equal("Rate limit exceeded", paused.PauseReason);
        Assert.Equal("INC-5678", paused.PauseTicket);
        Assert.Equal("operator", paused.UpdatedBy);
    }

    [Fact]
    public void Source_Resume_ClearsReasonAndTicket()
    {
        var paused = CreatePausedSource("Test reason", "TICKET-123");
        var now = DateTimeOffset.UtcNow;

        var resumed = paused with
        {
            Paused = false,
            PauseReason = null,
            PauseTicket = null,
            UpdatedAt = now,
            UpdatedBy = "operator"
        };

        Assert.False(resumed.Paused);
        Assert.Null(resumed.PauseReason);
        Assert.Null(resumed.PauseTicket);
    }

    [Theory]
    [InlineData("advisory-ingest")]
    [InlineData("scanner")]
    [InlineData("export")]
    [InlineData("scheduler")]
    [InlineData("policy")]
    public void Source_SourceType_AcceptsValidTypes(string sourceType)
    {
        var source = CreateSource(sourceType: sourceType);
        Assert.Equal(sourceType, source.SourceType);
    }

    [Fact]
    public void Source_Configuration_CanBeNull()
    {
        var source = CreateSource(configuration: null);
        Assert.Null(source.Configuration);
    }

    [Fact]
    public void Source_Configuration_CanContainJsonBlob()
    {
        var config = """
            {
              "feed_url": "https://nvd.nist.gov",
              "poll_interval_seconds": 3600,
              "retry_policy": {
                "max_attempts": 3,
                "backoff_multiplier": 2.0
              }
            }
            """;

        var source = CreateSource(configuration: config);
        Assert.Contains("feed_url", source.Configuration);
        Assert.Contains("retry_policy", source.Configuration);
    }

    [Fact]
    public void Source_Equality_BasedOnRecordSemantics()
    {
        var sourceId = Guid.NewGuid();
        var now = DateTimeOffset.UtcNow;

        var source1 = new Source(
            sourceId, TestTenantId, "test", "type", true, false,
            null, null, null, now, now, "user");

        var source2 = new Source(
            sourceId, TestTenantId, "test", "type", true, false,
            null, null, null, now, now, "user");

        Assert.Equal(source1, source2);
        Assert.Equal(source1.GetHashCode(), source2.GetHashCode());
    }

    [Fact]
    public void Source_Inequality_WhenDifferentFields()
    {
        var source1 = CreateSource(name: "source-a");
        var source2 = CreateSource(name: "source-b");

        Assert.NotEqual(source1, source2);
    }

    [Fact]
    public void Source_CanBeDisabledWhilePaused()
    {
        var source = CreateSource(enabled: false, paused: true)
            with { PauseReason = "Permanently retired" };

        Assert.False(source.Enabled);
        Assert.True(source.Paused);
        Assert.Equal("Permanently retired", source.PauseReason);
    }

    [Fact]
    public void Source_UpdatedBy_TracksLastModifier()
    {
        var source = CreateSource(updatedBy: "system");
        var modified = source with { UpdatedBy = "admin@example.com" };

        Assert.Equal("system", source.UpdatedBy);
        Assert.Equal("admin@example.com", modified.UpdatedBy);
    }

    [Fact]
    public void Source_Timestamps_ArePreserved()
    {
        var createdAt = new DateTimeOffset(2025, 1, 1, 0, 0, 0, TimeSpan.Zero);
        var updatedAt = new DateTimeOffset(2025, 6, 15, 12, 30, 0, TimeSpan.Zero);

        var source = new Source(
            Guid.NewGuid(), TestTenantId, "test", "type", true, false,
            null, null, null, createdAt, updatedAt, "user");

        Assert.Equal(createdAt, source.CreatedAt);
        Assert.Equal(updatedAt, source.UpdatedAt);
        Assert.True(source.UpdatedAt > source.CreatedAt);
    }

    private static Source CreateSource(
        string name = "test-source",
        string sourceType = "test-type",
        bool enabled = true,
        bool paused = false,
        string? configuration = null,
        string updatedBy = "system")
    {
        var now = DateTimeOffset.UtcNow;
        return new Source(
            SourceId: Guid.NewGuid(),
            TenantId: TestTenantId,
            Name: name,
            SourceType: sourceType,
            Enabled: enabled,
            Paused: paused,
            PauseReason: null,
            PauseTicket: null,
            Configuration: configuration,
            CreatedAt: now,
            UpdatedAt: now,
            UpdatedBy: updatedBy);
    }

    private static Source CreatePausedSource(string reason, string? ticket = null)
    {
        var now = DateTimeOffset.UtcNow;
        return new Source(
            SourceId: Guid.NewGuid(),
            TenantId: TestTenantId,
            Name: "paused-source",
            SourceType: "test-type",
            Enabled: true,
            Paused: true,
            PauseReason: reason,
            PauseTicket: ticket,
            Configuration: null,
            CreatedAt: now,
            UpdatedAt: now,
            UpdatedBy: "operator");
    }
}
@@ -0,0 +1,320 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.DeadLetter;

public class DeadLetterEntryTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string TenantId = "test-tenant";

    private static Job CreateTestJob() =>
        new(
            JobId: Guid.NewGuid(),
            TenantId: TenantId,
            ProjectId: null,
            RunId: Guid.NewGuid(),
            JobType: "scan.image",
            Status: JobStatus.Failed,
            Priority: 0,
            Attempt: 3,
            MaxAttempts: 3,
            PayloadDigest: "abcd1234" + new string('0', 56),
            Payload: """{"image":"test:latest"}""",
            IdempotencyKey: "test-key-123",
            CorrelationId: "trace-456",
            LeaseId: null,
            WorkerId: null,
            TaskRunnerId: null,
            LeaseUntil: null,
            CreatedAt: BaseTime.AddHours(-1),
            ScheduledAt: BaseTime.AddMinutes(-50),
            LeasedAt: BaseTime.AddMinutes(-45),
            CompletedAt: BaseTime,
            NotBefore: null,
            Reason: "Connection timeout",
            ReplayOf: null,
            CreatedBy: "test-user");

    [Fact]
    public void FromFailedJob_CreatesValidEntry()
    {
        var job = CreateTestJob();

        var entry = DeadLetterEntry.FromFailedJob(
            job,
            errorCode: "ORCH-TRN-001",
            failureReason: "Network timeout",
            remediationHint: "Check connectivity",
            category: ErrorCategory.Transient,
            isRetryable: true,
            now: BaseTime);

        Assert.NotEqual(Guid.Empty, entry.EntryId);
        Assert.Equal(TenantId, entry.TenantId);
        Assert.Equal(job.JobId, entry.OriginalJobId);
        Assert.Equal(job.RunId, entry.RunId);
        Assert.Equal(job.JobType, entry.JobType);
        Assert.Equal(job.Payload, entry.Payload);
        Assert.Equal(job.PayloadDigest, entry.PayloadDigest);
        Assert.Equal(job.IdempotencyKey, entry.IdempotencyKey);
        Assert.Equal(job.CorrelationId, entry.CorrelationId);
        Assert.Equal(DeadLetterStatus.Pending, entry.Status);
        Assert.Equal("ORCH-TRN-001", entry.ErrorCode);
        Assert.Equal("Network timeout", entry.FailureReason);
        Assert.Equal("Check connectivity", entry.RemediationHint);
        Assert.Equal(ErrorCategory.Transient, entry.Category);
        Assert.True(entry.IsRetryable);
        Assert.Equal(3, entry.OriginalAttempts);
        Assert.Equal(0, entry.ReplayAttempts);
        Assert.Equal(3, entry.MaxReplayAttempts);
        Assert.Equal(BaseTime, entry.FailedAt);
        Assert.Equal(BaseTime, entry.CreatedAt);
        Assert.False(entry.IsTerminal);
        Assert.True(entry.CanReplay);
    }

    [Fact]
    public void FromFailedJob_WithCustomRetention_SetsExpiresAt()
    {
        var job = CreateTestJob();
        var retention = TimeSpan.FromDays(60);

        var entry = DeadLetterEntry.FromFailedJob(
            job, "ERR", "Failed", null, ErrorCategory.Unknown, false, BaseTime,
            retention: retention);

        Assert.Equal(BaseTime.AddDays(60), entry.ExpiresAt);
    }

    [Fact]
    public void FromFailedJob_WithCustomMaxReplays_SetsMaxReplayAttempts()
    {
        var job = CreateTestJob();

        var entry = DeadLetterEntry.FromFailedJob(
            job, "ERR", "Failed", null, ErrorCategory.Unknown, true, BaseTime,
            maxReplayAttempts: 5);

        Assert.Equal(5, entry.MaxReplayAttempts);
    }
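
    // Lifecycle exercised by the tests below: Pending -> Replaying via StartReplay;
    // CompleteReplay -> Replayed (terminal); FailReplay -> back to Pending while
    // attempts remain, else Exhausted (terminal). Resolve and MarkExpired are the
    // manual and system-driven terminal transitions, respectively.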

    [Fact]
    public void StartReplay_TransitionsToReplaying()
    {
        var entry = CreatePendingEntry();

        var replaying = entry.StartReplay("operator", BaseTime.AddMinutes(5));

        Assert.Equal(DeadLetterStatus.Replaying, replaying.Status);
        Assert.Equal(1, replaying.ReplayAttempts);
        Assert.Equal("operator", replaying.UpdatedBy);
        Assert.False(replaying.IsTerminal);
    }

    [Fact]
    public void StartReplay_IncreasesAttemptCount()
    {
        var entry = CreatePendingEntry() with { ReplayAttempts = 1 };

        var replaying = entry.StartReplay("operator", BaseTime);

        Assert.Equal(2, replaying.ReplayAttempts);
    }

    [Fact]
    public void StartReplay_WhenNotRetryable_Throws()
    {
        var entry = CreatePendingEntry() with { IsRetryable = false };

        Assert.Throws<InvalidOperationException>(() =>
            entry.StartReplay("operator", BaseTime));
    }

    [Fact]
    public void StartReplay_WhenExhausted_Throws()
    {
        var entry = CreatePendingEntry() with { ReplayAttempts = 3 };

        Assert.Throws<InvalidOperationException>(() =>
            entry.StartReplay("operator", BaseTime));
    }

    [Fact]
    public void StartReplay_WhenTerminal_Throws()
    {
        var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Resolved };

        Assert.Throws<InvalidOperationException>(() =>
            entry.StartReplay("operator", BaseTime));
    }

    [Fact]
    public void CompleteReplay_TransitionsToReplayed()
    {
        var entry = CreatePendingEntry().StartReplay("op", BaseTime);
        var newJobId = Guid.NewGuid();

        var completed = entry.CompleteReplay(newJobId, "op", BaseTime.AddMinutes(1));

        Assert.Equal(DeadLetterStatus.Replayed, completed.Status);
        Assert.Equal(BaseTime.AddMinutes(1), completed.ResolvedAt);
        Assert.Contains(newJobId.ToString(), completed.ResolutionNotes);
        Assert.True(completed.IsTerminal);
        Assert.False(completed.CanReplay);
    }

    [Fact]
    public void CompleteReplay_WhenNotReplaying_Throws()
    {
        var entry = CreatePendingEntry();

        Assert.Throws<InvalidOperationException>(() =>
            entry.CompleteReplay(Guid.NewGuid(), "op", BaseTime));
    }

    [Fact]
    public void FailReplay_WithAttemptsRemaining_ReturnsToPending()
    {
        var entry = CreatePendingEntry().StartReplay("op", BaseTime);

        var failed = entry.FailReplay("Timeout", "op", BaseTime.AddMinutes(1));

        Assert.Equal(DeadLetterStatus.Pending, failed.Status);
        Assert.Equal("Timeout", failed.FailureReason);
        Assert.False(failed.IsTerminal);
        Assert.True(failed.CanReplay); // Still has 2 more attempts
    }

    [Fact]
    public void FailReplay_WithNoAttemptsRemaining_TransitionsToExhausted()
    {
        var entry = CreatePendingEntry() with { ReplayAttempts = 2 };
        var replaying = entry.StartReplay("op", BaseTime); // Now at 3 attempts

        var failed = replaying.FailReplay("Final failure", "op", BaseTime);

        Assert.Equal(DeadLetterStatus.Exhausted, failed.Status);
        Assert.True(failed.IsTerminal);
        Assert.False(failed.CanReplay);
    }
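
    // Replay budget arithmetic, as the CanReplay tests below confirm piecewise:
    // CanReplay == IsRetryable && !IsTerminal && ReplayAttempts < MaxReplayAttempts.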

    [Fact]
    public void Resolve_TransitionsToResolved()
    {
        var entry = CreatePendingEntry();

        var resolved = entry.Resolve("Manually verified as expected", "admin", BaseTime);

        Assert.Equal(DeadLetterStatus.Resolved, resolved.Status);
        Assert.Equal(BaseTime, resolved.ResolvedAt);
        Assert.Equal("Manually verified as expected", resolved.ResolutionNotes);
        Assert.Equal("admin", resolved.UpdatedBy);
        Assert.True(resolved.IsTerminal);
    }

    [Fact]
    public void Resolve_WhenTerminal_Throws()
    {
        var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Replayed };

        Assert.Throws<InvalidOperationException>(() =>
            entry.Resolve("Notes", "admin", BaseTime));
    }

    [Fact]
    public void MarkExpired_TransitionsToExpired()
    {
        var entry = CreatePendingEntry();

        var expired = entry.MarkExpired(BaseTime.AddDays(31));

        Assert.Equal(DeadLetterStatus.Expired, expired.Status);
        Assert.Equal("system", expired.UpdatedBy);
        Assert.True(expired.IsTerminal);
    }

    [Fact]
    public void MarkExpired_WhenTerminal_Throws()
    {
        var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Resolved };

        Assert.Throws<InvalidOperationException>(() =>
            entry.MarkExpired(BaseTime));
    }

    [Fact]
    public void CanReplay_WhenRetryableAndNotTerminalAndAttemptsAvailable_ReturnsTrue()
    {
        var entry = CreatePendingEntry();

        Assert.True(entry.CanReplay);
    }

    [Fact]
    public void CanReplay_WhenNotRetryable_ReturnsFalse()
    {
        var entry = CreatePendingEntry() with { IsRetryable = false };

        Assert.False(entry.CanReplay);
    }

    [Fact]
    public void CanReplay_WhenTerminal_ReturnsFalse()
    {
        var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Replayed };

        Assert.False(entry.CanReplay);
    }

    [Fact]
    public void CanReplay_WhenMaxAttemptsReached_ReturnsFalse()
    {
        var entry = CreatePendingEntry() with { ReplayAttempts = 3 };

        Assert.False(entry.CanReplay);
    }

    [Theory]
    [InlineData(DeadLetterStatus.Pending, false)]
    [InlineData(DeadLetterStatus.Replaying, false)]
    [InlineData(DeadLetterStatus.Replayed, true)]
    [InlineData(DeadLetterStatus.Resolved, true)]
    [InlineData(DeadLetterStatus.Exhausted, true)]
    [InlineData(DeadLetterStatus.Expired, true)]
    public void IsTerminal_ReturnsCorrectValue(DeadLetterStatus status, bool expectedTerminal)
    {
        var entry = CreatePendingEntry() with { Status = status };

        Assert.Equal(expectedTerminal, entry.IsTerminal);
    }

    private static DeadLetterEntry CreatePendingEntry() =>
        new(
            EntryId: Guid.NewGuid(),
            TenantId: TenantId,
            OriginalJobId: Guid.NewGuid(),
            RunId: Guid.NewGuid(),
            SourceId: null,
            JobType: "scan.image",
            Payload: "{}",
            PayloadDigest: new string('a', 64),
            IdempotencyKey: "key-123",
            CorrelationId: "trace-456",
            Status: DeadLetterStatus.Pending,
            ErrorCode: "ORCH-TRN-001",
            FailureReason: "Network timeout",
            RemediationHint: "Check connectivity",
            Category: ErrorCategory.Transient,
            IsRetryable: true,
            OriginalAttempts: 3,
            ReplayAttempts: 0,
            MaxReplayAttempts: 3,
            FailedAt: BaseTime,
            CreatedAt: BaseTime,
            UpdatedAt: BaseTime,
            ExpiresAt: BaseTime.AddDays(30),
            ResolvedAt: null,
            ResolutionNotes: null,
            CreatedBy: "test-user",
            UpdatedBy: "system");
}
@@ -0,0 +1,265 @@
using StellaOps.Orchestrator.Core.DeadLetter;
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.DeadLetter;

public class ErrorClassificationTests
{
    private readonly DefaultErrorClassifier _classifier = new();

    [Fact]
    public void Classify_KnownErrorCode_ReturnsCorrectClassification()
    {
        var result = _classifier.Classify(DefaultErrorClassifier.ErrorCodes.NetworkTimeout, "test");

        Assert.Equal(DefaultErrorClassifier.ErrorCodes.NetworkTimeout, result.ErrorCode);
        Assert.Equal(ErrorCategory.Transient, result.Category);
        Assert.True(result.IsRetryable);
        Assert.NotNull(result.SuggestedRetryDelay);
    }

    [Fact]
    public void Classify_UnknownErrorCode_InfersFromPrefix()
    {
        var result = _classifier.Classify("ORCH-TRN-999", "Custom transient error");

        Assert.Equal("ORCH-TRN-999", result.ErrorCode);
        Assert.Equal(ErrorCategory.Transient, result.Category);
        Assert.True(result.IsRetryable);
    }

    [Fact]
    public void Classify_UnknownPrefix_ReturnsUnknownCategory()
    {
        var result = _classifier.Classify("CUSTOM-ERR-001", "Unknown error");

        Assert.Equal("CUSTOM-ERR-001", result.ErrorCode);
        Assert.Equal(ErrorCategory.Unknown, result.Category);
        Assert.False(result.IsRetryable);
    }

    [Theory]
    [InlineData(DefaultErrorClassifier.ErrorCodes.NetworkTimeout, ErrorCategory.Transient, true)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.ImageNotFound, ErrorCategory.NotFound, false)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.InvalidCredentials, ErrorCategory.AuthFailure, false)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.RateLimited, ErrorCategory.RateLimited, true)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.InvalidPayload, ErrorCategory.ValidationError, false)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.RegistryError, ErrorCategory.UpstreamError, true)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.InternalError, ErrorCategory.InternalError, false)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.DuplicateJob, ErrorCategory.Conflict, false)]
    [InlineData(DefaultErrorClassifier.ErrorCodes.UserCanceled, ErrorCategory.Canceled, false)]
    public void Classify_ErrorCode_ReturnsExpectedCategory(string errorCode, ErrorCategory expectedCategory, bool expectedRetryable)
    {
        var result = _classifier.Classify(errorCode, "test");

        Assert.Equal(expectedCategory, result.Category);
        Assert.Equal(expectedRetryable, result.IsRetryable);
    }
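
    // Error codes follow an ORCH-<PREFIX>-<NNN> scheme; as the prefix-inference
    // test above shows, unknown codes are bucketed by prefix (e.g. TRN ->
    // Transient), and unrecognized prefixes fall back to Unknown/non-retryable.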

    [Fact]
    public void Classify_TimeoutException_ReturnsTransient()
    {
        var exception = new TimeoutException("Operation timed out");

        var result = _classifier.Classify(exception);

        Assert.Equal(ErrorCategory.Transient, result.Category);
        Assert.True(result.IsRetryable);
    }

    [Fact]
    public void Classify_OperationCanceledException_ReturnsCanceled()
    {
        var exception = new OperationCanceledException();

        var result = _classifier.Classify(exception);

        Assert.Equal(ErrorCategory.Canceled, result.Category);
        Assert.False(result.IsRetryable);
    }

    [Fact]
    public void Classify_ExceptionWithConnectionRefused_ReturnsTransient()
    {
        var exception = new Exception("connection refused by remote host");

        var result = _classifier.Classify(exception);

        Assert.Equal(DefaultErrorClassifier.ErrorCodes.ConnectionRefused, result.ErrorCode);
        Assert.Equal(ErrorCategory.Transient, result.Category);
    }

    [Fact]
    public void Classify_ExceptionWithDns_ReturnsTransient()
    {
        var exception = new Exception("DNS resolution failed");

        var result = _classifier.Classify(exception);

        Assert.Equal(DefaultErrorClassifier.ErrorCodes.DnsResolutionFailed, result.ErrorCode);
        Assert.Equal(ErrorCategory.Transient, result.Category);
    }

    [Fact]
    public void Classify_ExceptionWithCertificate_ReturnsAuthFailure()
    {
        var exception = new Exception("SSL certificate validation failed");

        var result = _classifier.Classify(exception);

        Assert.Equal(DefaultErrorClassifier.ErrorCodes.CertificateError, result.ErrorCode);
        Assert.Equal(ErrorCategory.AuthFailure, result.Category);
    }

    [Fact]
    public void Classify_GenericException_ReturnsUnexpectedError()
    {
        var exception = new Exception("Something unexpected happened");

        var result = _classifier.Classify(exception);

        Assert.Equal(DefaultErrorClassifier.ErrorCodes.UnexpectedError, result.ErrorCode);
        Assert.Equal(ErrorCategory.InternalError, result.Category);
        Assert.False(result.IsRetryable);
    }

    [Theory]
    [InlineData(400, ErrorCategory.ValidationError)]
    [InlineData(401, ErrorCategory.AuthFailure)]
    [InlineData(403, ErrorCategory.AuthFailure)]
    [InlineData(404, ErrorCategory.NotFound)]
    [InlineData(408, ErrorCategory.Transient)]
    [InlineData(409, ErrorCategory.Conflict)]
    [InlineData(429, ErrorCategory.RateLimited)]
    [InlineData(500, ErrorCategory.InternalError)]
    [InlineData(502, ErrorCategory.UpstreamError)]
    [InlineData(503, ErrorCategory.Transient)]
    [InlineData(504, ErrorCategory.Transient)]
    public void ClassifyHttpError_ReturnsExpectedCategory(int statusCode, ErrorCategory expectedCategory)
    {
        var result = _classifier.ClassifyHttpError(statusCode, "HTTP error");

        Assert.Equal(expectedCategory, result.Category);
    }

    [Fact]
    public void ClassifyHttpError_429_IsRetryable()
    {
        var result = _classifier.ClassifyHttpError(429, "Too many requests");

        Assert.True(result.IsRetryable);
        Assert.NotNull(result.SuggestedRetryDelay);
    }

    [Fact]
    public void ClassifyHttpError_503_IsRetryable()
    {
        var result = _classifier.ClassifyHttpError(503, "Service unavailable");

        Assert.True(result.IsRetryable);
        Assert.NotNull(result.SuggestedRetryDelay);
    }

    [Fact]
    public void ClassifyHttpError_400_IsNotRetryable()
    {
        var result = _classifier.ClassifyHttpError(400, "Bad request");

        Assert.False(result.IsRetryable);
        Assert.Null(result.SuggestedRetryDelay);
    }

    [Fact]
    public void ClassifyHttpError_Unknown4xx_ReturnsValidationError()
    {
        var result = _classifier.ClassifyHttpError(418, "I'm a teapot");

        Assert.Equal(ErrorCategory.ValidationError, result.Category);
        Assert.Equal("HTTP-418", result.ErrorCode);
    }

    [Fact]
    public void ClassifyHttpError_Unknown5xx_ReturnsUpstreamError()
    {
        var result = _classifier.ClassifyHttpError(599, "Custom server error");

        Assert.Equal(ErrorCategory.UpstreamError, result.Category);
        Assert.Equal("HTTP-599", result.ErrorCode);
        Assert.True(result.IsRetryable);
    }

    [Fact]
    public void AllKnownErrorCodes_HaveRemediationHints()
    {
        var errorCodes = new[]
        {
            DefaultErrorClassifier.ErrorCodes.NetworkTimeout,
            DefaultErrorClassifier.ErrorCodes.ConnectionRefused,
            DefaultErrorClassifier.ErrorCodes.ServiceUnavailable,
            DefaultErrorClassifier.ErrorCodes.ImageNotFound,
            DefaultErrorClassifier.ErrorCodes.InvalidCredentials,
            DefaultErrorClassifier.ErrorCodes.RateLimited,
            DefaultErrorClassifier.ErrorCodes.InvalidPayload,
            DefaultErrorClassifier.ErrorCodes.InternalError
        };

        foreach (var code in errorCodes)
        {
            var result = _classifier.Classify(code, "test");
            Assert.NotNull(result.RemediationHint);
            Assert.NotEmpty(result.RemediationHint);
        }
    }

    [Fact]
    public void TransientErrors_HaveSuggestedRetryDelay()
    {
        var transientCodes = new[]
        {
            DefaultErrorClassifier.ErrorCodes.NetworkTimeout,
            DefaultErrorClassifier.ErrorCodes.ConnectionRefused,
            DefaultErrorClassifier.ErrorCodes.ServiceUnavailable,
            DefaultErrorClassifier.ErrorCodes.GatewayTimeout
        };

        foreach (var code in transientCodes)
        {
            var result = _classifier.Classify(code, "test");
            Assert.NotNull(result.SuggestedRetryDelay);
            Assert.True(result.SuggestedRetryDelay.Value > TimeSpan.Zero);
        }
    }
}

public class ClassifiedErrorTests
{
    [Fact]
    public void ClassifiedError_StoresAllProperties()
    {
        var error = new ClassifiedError(
            ErrorCode: "TEST-001",
            Category: ErrorCategory.Transient,
            Description: "Test error",
            RemediationHint: "Try again",
            IsRetryable: true,
            SuggestedRetryDelay: TimeSpan.FromMinutes(5));

        Assert.Equal("TEST-001", error.ErrorCode);
        Assert.Equal(ErrorCategory.Transient, error.Category);
        Assert.Equal("Test error", error.Description);
        Assert.Equal("Try again", error.RemediationHint);
        Assert.True(error.IsRetryable);
        Assert.Equal(TimeSpan.FromMinutes(5), error.SuggestedRetryDelay);
    }

    [Fact]
    public void ClassifiedError_EqualsComparison()
    {
        var error1 = new ClassifiedError("TEST", ErrorCategory.Unknown, "Desc", "Hint", false, null);
        var error2 = new ClassifiedError("TEST", ErrorCategory.Unknown, "Desc", "Hint", false, null);
        var error3 = new ClassifiedError("OTHER", ErrorCategory.Unknown, "Desc", "Hint", false, null);

        Assert.Equal(error1, error2);
        Assert.NotEqual(error1, error3);
    }
}
@@ -0,0 +1,309 @@
using StellaOps.Orchestrator.Core.DeadLetter;
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.Tests.DeadLetter;

public class NotificationRuleTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string TenantId = "test-tenant";

    [Fact]
    public void Create_SetsDefaultValues()
    {
        var rule = NotificationRule.Create(
            TenantId,
            NotificationChannel.Slack,
            "https://hooks.slack.com/test",
            "admin");

        Assert.NotEqual(Guid.Empty, rule.RuleId);
        Assert.Equal(TenantId, rule.TenantId);
        Assert.Equal(NotificationChannel.Slack, rule.Channel);
        Assert.Equal("https://hooks.slack.com/test", rule.Endpoint);
        Assert.True(rule.Enabled);
        Assert.Equal(15, rule.CooldownMinutes);
        Assert.Equal(10, rule.MaxPerHour);
        Assert.True(rule.Aggregate);
        Assert.Null(rule.LastNotifiedAt);
        Assert.Equal(0, rule.NotificationsSent);
        Assert.Equal("admin", rule.CreatedBy);
    }

    [Fact]
    public void Create_WithFilters_SetsFilters()
    {
        var sourceId = Guid.NewGuid();

        var rule = NotificationRule.Create(
            TenantId,
            NotificationChannel.Email,
            "alerts@example.com",
            "admin",
            jobTypePattern: "scan\\.*",
            errorCodePattern: "ORCH-TRN-.*",
            category: ErrorCategory.Transient,
            sourceId: sourceId);

        Assert.Equal("scan\\.*", rule.JobTypePattern);
        Assert.Equal("ORCH-TRN-.*", rule.ErrorCodePattern);
        Assert.Equal(ErrorCategory.Transient, rule.Category);
        Assert.Equal(sourceId, rule.SourceId);
    }

    [Fact]
    public void Create_WithCustomRateLimits_SetsLimits()
    {
        var rule = NotificationRule.Create(
            TenantId,
            NotificationChannel.Webhook,
            "https://webhook.example.com",
            "admin",
            cooldownMinutes: 30,
            maxPerHour: 5,
            aggregate: false);

        Assert.Equal(30, rule.CooldownMinutes);
        Assert.Equal(5, rule.MaxPerHour);
        Assert.False(rule.Aggregate);
    }

    [Fact]
    public void Matches_WithNoFilters_MatchesAll()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin");
        var entry = CreateTestEntry();

        Assert.True(rule.Matches(entry));
    }

    [Fact]
    public void Matches_WhenDisabled_ReturnsFalse()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin")
            with { Enabled = false };
        var entry = CreateTestEntry();

        Assert.False(rule.Matches(entry));
    }

    [Fact]
    public void Matches_WithSourceIdFilter_MatchesOnlyMatchingSource()
    {
        var sourceId = Guid.NewGuid();
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            sourceId: sourceId);

        var matchingEntry = CreateTestEntry() with { SourceId = sourceId };
        var nonMatchingEntry = CreateTestEntry() with { SourceId = Guid.NewGuid() };

        Assert.True(rule.Matches(matchingEntry));
        Assert.False(rule.Matches(nonMatchingEntry));
    }

    [Fact]
    public void Matches_WithCategoryFilter_MatchesOnlyMatchingCategory()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            category: ErrorCategory.Transient);

        var matchingEntry = CreateTestEntry() with { Category = ErrorCategory.Transient };
        var nonMatchingEntry = CreateTestEntry() with { Category = ErrorCategory.NotFound };

        Assert.True(rule.Matches(matchingEntry));
        Assert.False(rule.Matches(nonMatchingEntry));
    }

    [Fact]
    public void Matches_WithJobTypePattern_MatchesRegex()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            jobTypePattern: @"scan\..*");

        var matchingEntry1 = CreateTestEntry() with { JobType = "scan.image" };
        var matchingEntry2 = CreateTestEntry() with { JobType = "scan.sbom" };
        var nonMatchingEntry = CreateTestEntry() with { JobType = "export.report" };

        Assert.True(rule.Matches(matchingEntry1));
        Assert.True(rule.Matches(matchingEntry2));
        Assert.False(rule.Matches(nonMatchingEntry));
    }

    [Fact]
    public void Matches_WithErrorCodePattern_MatchesRegex()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            errorCodePattern: @"ORCH-TRN-\d+");

        var matchingEntry = CreateTestEntry() with { ErrorCode = "ORCH-TRN-001" };
        var nonMatchingEntry = CreateTestEntry() with { ErrorCode = "ORCH-NF-001" };

        Assert.True(rule.Matches(matchingEntry));
        Assert.False(rule.Matches(nonMatchingEntry));
    }

    [Fact]
    public void CanNotify_WhenDisabled_ReturnsFalse()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin")
            with { Enabled = false };

        Assert.False(rule.CanNotify(BaseTime, 0));
    }

    [Fact]
    public void CanNotify_WithinCooldown_ReturnsFalse()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            cooldownMinutes: 15) with { LastNotifiedAt = BaseTime };

        Assert.False(rule.CanNotify(BaseTime.AddMinutes(10), 0));
    }

    [Fact]
    public void CanNotify_AfterCooldown_ReturnsTrue()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            cooldownMinutes: 15) with { LastNotifiedAt = BaseTime };

        Assert.True(rule.CanNotify(BaseTime.AddMinutes(20), 0));
    }

    [Fact]
    public void CanNotify_AtMaxPerHour_ReturnsFalse()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            maxPerHour: 5);

        Assert.False(rule.CanNotify(BaseTime, 5));
    }

    [Fact]
    public void CanNotify_BelowMaxPerHour_ReturnsTrue()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
            maxPerHour: 5);

        Assert.True(rule.CanNotify(BaseTime, 4));
    }

    [Fact]
    public void CanNotify_WithNoLastNotification_ReturnsTrue()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin");

        Assert.True(rule.CanNotify(BaseTime, 0));
    }
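
    // CanNotify(now, sentThisHour) is a pure check combining three throttles:
    // the Enabled flag, the per-rule cooldown since LastNotifiedAt, and the
    // caller-supplied hourly count against MaxPerHour. RecordNotification (below)
    // returns an updated copy rather than mutating the rule.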

    [Fact]
    public void RecordNotification_UpdatesFields()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin");

        var updated = rule.RecordNotification(BaseTime);

        Assert.Equal(BaseTime, updated.LastNotifiedAt);
        Assert.Equal(1, updated.NotificationsSent);
        Assert.Equal(BaseTime, updated.UpdatedAt);
    }

    [Fact]
    public void RecordNotification_IncrementsCount()
    {
        var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin")
            with { NotificationsSent = 5 };

        var updated = rule.RecordNotification(BaseTime);

        Assert.Equal(6, updated.NotificationsSent);
    }

    private static DeadLetterEntry CreateTestEntry() =>
        new(
            EntryId: Guid.NewGuid(),
            TenantId: TenantId,
            OriginalJobId: Guid.NewGuid(),
            RunId: null,
            SourceId: null,
            JobType: "scan.image",
            Payload: "{}",
            PayloadDigest: new string('a', 64),
            IdempotencyKey: "key",
            CorrelationId: null,
            Status: DeadLetterStatus.Pending,
            ErrorCode: "ORCH-TRN-001",
            FailureReason: "Timeout",
            RemediationHint: null,
            Category: ErrorCategory.Transient,
            IsRetryable: true,
            OriginalAttempts: 3,
            ReplayAttempts: 0,
            MaxReplayAttempts: 3,
            FailedAt: BaseTime,
            CreatedAt: BaseTime,
            UpdatedAt: BaseTime,
            ExpiresAt: BaseTime.AddDays(30),
            ResolvedAt: null,
            ResolutionNotes: null,
            CreatedBy: "test",
            UpdatedBy: "system");
}

public class ReplayAuditRecordTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string TenantId = "test-tenant";

    [Fact]
    public void Create_SetsInitialValues()
    {
        var entryId = Guid.NewGuid();

        var record = ReplayAuditRecord.Create(
            TenantId,
            entryId,
            attemptNumber: 1,
            triggeredBy: "manual",
            initiatedBy: "operator",
            now: BaseTime);

        Assert.NotEqual(Guid.Empty, record.AuditId);
        Assert.Equal(TenantId, record.TenantId);
        Assert.Equal(entryId, record.EntryId);
        Assert.Equal(1, record.AttemptNumber);
        Assert.False(record.Success);
        Assert.Null(record.NewJobId);
        Assert.Null(record.ErrorMessage);
        Assert.Equal("manual", record.TriggeredBy);
        Assert.Equal(BaseTime, record.TriggeredAt);
        Assert.Null(record.CompletedAt);
        Assert.Equal("operator", record.InitiatedBy);
    }

    [Fact]
    public void Complete_SetsSuccessAndJobId()
    {
        var record = ReplayAuditRecord.Create(TenantId, Guid.NewGuid(), 1, "auto", "system", BaseTime);
        var newJobId = Guid.NewGuid();

        var completed = record.Complete(newJobId, BaseTime.AddMinutes(1));

        Assert.True(completed.Success);
        Assert.Equal(newJobId, completed.NewJobId);
        Assert.Equal(BaseTime.AddMinutes(1), completed.CompletedAt);
        Assert.Null(completed.ErrorMessage);
    }

    [Fact]
    public void Fail_SetsErrorMessage()
    {
        var record = ReplayAuditRecord.Create(TenantId, Guid.NewGuid(), 1, "auto", "system", BaseTime);

        var failed = record.Fail("Connection timeout", BaseTime.AddMinutes(1));

        Assert.False(failed.Success);
        Assert.Null(failed.NewJobId);
        Assert.Equal("Connection timeout", failed.ErrorMessage);
        Assert.Equal(BaseTime.AddMinutes(1), failed.CompletedAt);
    }
}
@@ -0,0 +1,391 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Core.RateLimiting;

namespace StellaOps.Orchestrator.Tests.RateLimiting;

public class AdaptiveRateLimiterTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);

    private static Quota CreateDefaultQuota() => new(
        QuotaId: Guid.NewGuid(),
        TenantId: "tenant-1",
        JobType: "scan",
        MaxActive: 5,
        MaxPerHour: 100,
        BurstCapacity: 10,
        RefillRate: 2.0,
        CurrentTokens: 10,
        LastRefillAt: BaseTime,
        CurrentActive: 0,
        CurrentHourCount: 0,
        CurrentHourStart: BaseTime,
        Paused: false,
        PauseReason: null,
        QuotaTicket: null,
        CreatedAt: BaseTime,
        UpdatedAt: BaseTime,
        UpdatedBy: "system");

    [Fact]
    public void Constructor_FromQuota_InitializesCorrectly()
    {
        var quota = CreateDefaultQuota();

        var limiter = new AdaptiveRateLimiter(quota);

        Assert.Equal("tenant-1", limiter.TenantId);
        Assert.Equal("scan", limiter.JobType);
        Assert.Equal(100, limiter.MaxPerHour);
        Assert.False(limiter.IsPaused);
    }

    [Fact]
    public void Constructor_WithExplicitParameters_InitializesCorrectly()
    {
        var limiter = new AdaptiveRateLimiter(
            tenantId: "tenant-2",
            jobType: "analyze",
            maxActive: 3,
            maxPerHour: 50,
            burstCapacity: 5,
            refillRate: 1.0);

        Assert.Equal("tenant-2", limiter.TenantId);
        Assert.Equal("analyze", limiter.JobType);
        Assert.Equal(50, limiter.MaxPerHour);
    }

    [Fact]
    public void Constructor_WithNullQuota_Throws()
    {
        Assert.Throws<ArgumentNullException>(() =>
            new AdaptiveRateLimiter(null!));
    }

    [Fact]
    public void Constructor_WithNullTenantId_Throws()
    {
        Assert.Throws<ArgumentNullException>(() =>
            new AdaptiveRateLimiter(
                tenantId: null!,
                jobType: "scan",
                maxActive: 5,
                maxPerHour: 100,
                burstCapacity: 10,
                refillRate: 2.0));
    }

    [Fact]
    public void TryAcquire_WithCapacity_ReturnsAllowed()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.True(result.IsAllowed);
        Assert.Null(result.DenialReason);
        Assert.Null(result.DenialMessage);
        Assert.Null(result.RetryAfter);
    }

    [Fact]
    public void TryAcquire_WhenPaused_ReturnsDenied()
    {
        var quota = CreateDefaultQuota() with { Paused = true, PauseReason = "Manual pause" };
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.Paused, result.DenialReason);
        Assert.Equal("Manual pause", result.DenialMessage);
    }

    [Fact]
    public void TryAcquire_WhenConcurrencyExceeded_ReturnsDenied()
    {
        var quota = CreateDefaultQuota() with { MaxActive = 2, CurrentActive = 2 };
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.ConcurrencyLimitExceeded, result.DenialReason);
        Assert.Contains("Concurrency limit of 2", result.DenialMessage);
    }

    [Fact]
    public void TryAcquire_WhenTokensExhausted_ReturnsDenied()
    {
        var quota = CreateDefaultQuota() with { CurrentTokens = 0 };
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.TokensExhausted, result.DenialReason);
        Assert.NotNull(result.RetryAfter);
    }

    [Fact]
    public void TryAcquire_WhenHourlyLimitExceeded_ReturnsDenied()
    {
        var quota = CreateDefaultQuota() with { CurrentHourCount = 100 }; // MaxPerHour = 100
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.HourlyLimitExceeded, result.DenialReason);
        Assert.Contains("Hourly limit of 100", result.DenialMessage);
        Assert.NotNull(result.RetryAfter);
    }

    [Fact]
    public void TryAcquire_InBackpressure_ReturnsDenied()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        // Record failure to trigger backpressure
        limiter.RecordUpstreamFailure(429, TimeSpan.FromMinutes(1), BaseTime);

        var result = limiter.TryAcquire(BaseTime.AddSeconds(10));

        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.Backpressure, result.DenialReason);
        Assert.NotNull(result.RetryAfter);
    }

    [Fact]
    public void TryAcquire_ConsumesTokenAndConcurrency()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        limiter.TryAcquire(BaseTime);

        var snapshot = limiter.GetSnapshot(BaseTime);
        Assert.Equal(9, snapshot.TokenBucket.CurrentTokens);
        Assert.Equal(1, snapshot.Concurrency.CurrentActive);
        Assert.Equal(1, snapshot.HourlyCounter.CurrentCount);
    }
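
    // As the denial tests above show, admission is gated on the pause flag,
    // concurrency (MaxActive), the token bucket (BurstCapacity/RefillRate), and
    // the rolling hourly counter (MaxPerHour); a successful acquire debits one
    // unit from each of the last three.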

    [Fact]
    public void Release_DecrementsConcurrency()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        limiter.TryAcquire(BaseTime);
        limiter.Release();

        var snapshot = limiter.GetSnapshot(BaseTime);
        Assert.Equal(0, snapshot.Concurrency.CurrentActive);
    }

    [Fact]
    public void RecordUpstreamFailure_TriggersBackpressure()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.RecordUpstreamFailure(429, TimeSpan.FromSeconds(30), BaseTime);

        Assert.True(result.ShouldBackoff);
        Assert.Equal(TimeSpan.FromSeconds(30), result.BackoffDuration);
        Assert.Equal(429, result.StatusCode);
    }

    [Fact]
    public void RecordUpstreamSuccess_ClearsBackpressure()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        limiter.RecordUpstreamFailure(429, TimeSpan.FromMinutes(1), BaseTime);
        limiter.RecordUpstreamSuccess();

        var snapshot = limiter.GetSnapshot(BaseTime.AddSeconds(10));
        Assert.False(snapshot.Backpressure.IsInBackoff);
    }

    [Fact]
    public void Pause_PausesLimiter()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        limiter.Pause("Maintenance");

        Assert.True(limiter.IsPaused);
        Assert.Equal("Maintenance", limiter.PauseReason);

        var result = limiter.TryAcquire(BaseTime);
        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.Paused, result.DenialReason);
    }

    [Fact]
    public void Resume_ResumesLimiter()
    {
        var quota = CreateDefaultQuota() with { Paused = true, PauseReason = "Maintenance" };
        var limiter = new AdaptiveRateLimiter(quota);

        limiter.Resume();

        Assert.False(limiter.IsPaused);
        Assert.Null(limiter.PauseReason);

        var result = limiter.TryAcquire(BaseTime);
        Assert.True(result.IsAllowed);
    }

    [Fact]
    public void GetSnapshot_ReturnsCompleteState()
    {
        var quota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(quota);

        limiter.TryAcquire(BaseTime);
        limiter.RecordUpstreamFailure(503, now: BaseTime);

        var snapshot = limiter.GetSnapshot(BaseTime);

        Assert.Equal("tenant-1", snapshot.TenantId);
        Assert.Equal("scan", snapshot.JobType);
        Assert.False(snapshot.IsPaused);
        Assert.Equal(9, snapshot.TokenBucket.CurrentTokens);
        Assert.Equal(1, snapshot.Concurrency.CurrentActive);
        Assert.True(snapshot.Backpressure.IsInBackoff);
        Assert.Equal(1, snapshot.HourlyCounter.CurrentCount);
    }

    [Fact]
    public void ExportToQuota_PreservesState()
    {
        var originalQuota = CreateDefaultQuota();
        var limiter = new AdaptiveRateLimiter(originalQuota);

        limiter.TryAcquire(BaseTime);
        limiter.TryAcquire(BaseTime);
        limiter.Release();
        limiter.Pause("Testing");

        var exportedQuota = limiter.ExportToQuota(originalQuota.QuotaId, BaseTime.AddSeconds(10), "test-user");

        Assert.Equal(originalQuota.QuotaId, exportedQuota.QuotaId);
        Assert.Equal("tenant-1", exportedQuota.TenantId);
        Assert.Equal("scan", exportedQuota.JobType);
        Assert.Equal(1, exportedQuota.CurrentActive); // 2 acquired, 1 released
        Assert.Equal(2, exportedQuota.CurrentHourCount);
        Assert.True(exportedQuota.Paused);
        Assert.Equal("Testing", exportedQuota.PauseReason);
        Assert.Equal("test-user", exportedQuota.UpdatedBy);
    }

    [Fact]
    public void MultipleAcquires_TrackCorrectly()
    {
        var quota = CreateDefaultQuota() with { MaxActive = 3, BurstCapacity = 5 };
        var limiter = new AdaptiveRateLimiter(quota);

        var result1 = limiter.TryAcquire(BaseTime);
        var result2 = limiter.TryAcquire(BaseTime);
        var result3 = limiter.TryAcquire(BaseTime);
        var result4 = limiter.TryAcquire(BaseTime);

        Assert.True(result1.IsAllowed);
        Assert.True(result2.IsAllowed);
        Assert.True(result3.IsAllowed);
        Assert.False(result4.IsAllowed);
        Assert.Equal(RateLimitDenialReason.ConcurrencyLimitExceeded, result4.DenialReason);
    }

    [Fact]
    public void RollbackOnConcurrencyFailure_DoesNotAffectHourlyCounter()
    {
        var quota = CreateDefaultQuota() with { MaxActive = 1, CurrentActive = 1 };
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.False(result.IsAllowed);
        var snapshot = limiter.GetSnapshot(BaseTime);
        Assert.Equal(0, snapshot.HourlyCounter.CurrentCount); // Should be rolled back
    }

    [Fact]
    public void RollbackOnTokenBucketFailure_DoesNotAffectOtherCounters()
    {
        var quota = CreateDefaultQuota() with { CurrentTokens = 0 };
        var limiter = new AdaptiveRateLimiter(quota);

        var result = limiter.TryAcquire(BaseTime);

        Assert.False(result.IsAllowed);
        var snapshot = limiter.GetSnapshot(BaseTime);
        Assert.Equal(0, snapshot.Concurrency.CurrentActive); // Should be rolled back
        Assert.Equal(0, snapshot.HourlyCounter.CurrentCount); // Should be rolled back
    }
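
    // Acquisition is all-or-nothing: when any gate denies, increments already
    // applied to the other counters are rolled back, so a denied request leaves
    // the limiter state exactly as it found it.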

    [Fact]
    public void HourlyCounter_ResetsAfterHour()
    {
        var quota = CreateDefaultQuota() with { CurrentHourCount = 50 };
        var limiter = new AdaptiveRateLimiter(quota);

        // Try acquire after an hour has passed
        var result = limiter.TryAcquire(BaseTime.AddHours(1).AddMinutes(1));

        Assert.True(result.IsAllowed);
        var snapshot = limiter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(1));
        Assert.Equal(1, snapshot.HourlyCounter.CurrentCount); // Reset and then 1 new
    }

    [Fact]
    public void ConcurrentAccess_IsThreadSafe()
    {
        var quota = CreateDefaultQuota() with { MaxActive = 50, MaxPerHour = 1000, BurstCapacity = 100 };
        var limiter = new AdaptiveRateLimiter(quota);
        var successes = 0;

        Parallel.For(0, 100, _ =>
        {
            var result = limiter.TryAcquire(DateTimeOffset.UtcNow);
            if (result.IsAllowed)
            {
                Interlocked.Increment(ref successes);
            }
        });

        Assert.Equal(50, successes); // Limited by MaxActive
    }

    [Fact]
    public void RateLimitResult_AllowedFactory_CreatesCorrectResult()
    {
        var result = RateLimitResult.Allowed();

        Assert.True(result.IsAllowed);
        Assert.Null(result.DenialReason);
        Assert.Null(result.DenialMessage);
        Assert.Null(result.RetryAfter);
    }

    [Fact]
    public void RateLimitResult_DeniedFactory_CreatesCorrectResult()
    {
        var result = RateLimitResult.Denied(
            RateLimitDenialReason.TokensExhausted,
            "No tokens available",
            TimeSpan.FromSeconds(5));

        Assert.False(result.IsAllowed);
        Assert.Equal(RateLimitDenialReason.TokensExhausted, result.DenialReason);
        Assert.Equal("No tokens available", result.DenialMessage);
        Assert.Equal(TimeSpan.FromSeconds(5), result.RetryAfter);
    }
}
@@ -0,0 +1,313 @@
using StellaOps.Orchestrator.Core.RateLimiting;

namespace StellaOps.Orchestrator.Tests.RateLimiting;

public class BackpressureHandlerTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);

    [Fact]
    public void Constructor_WithDefaults_SetsCorrectValues()
    {
        var handler = new BackpressureHandler();

        Assert.Equal(TimeSpan.FromSeconds(1), handler.BaseDelay);
        Assert.Equal(TimeSpan.FromMinutes(5), handler.MaxDelay);
        Assert.Equal(1, handler.FailureThreshold);
        Assert.Equal(0.2, handler.JitterFactor);
    }

    [Fact]
    public void Constructor_WithCustomValues_SetsCorrectly()
    {
        var handler = new BackpressureHandler(
            baseDelay: TimeSpan.FromSeconds(2),
            maxDelay: TimeSpan.FromMinutes(10),
            failureThreshold: 3,
            jitterFactor: 0.5);

        Assert.Equal(TimeSpan.FromSeconds(2), handler.BaseDelay);
        Assert.Equal(TimeSpan.FromMinutes(10), handler.MaxDelay);
        Assert.Equal(3, handler.FailureThreshold);
        Assert.Equal(0.5, handler.JitterFactor);
    }

    [Fact]
    public void Constructor_WithInvalidBaseDelay_Throws()
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new BackpressureHandler(baseDelay: TimeSpan.Zero));
    }

    [Fact]
    public void Constructor_WithMaxDelayLessThanBase_Throws()
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new BackpressureHandler(
                baseDelay: TimeSpan.FromSeconds(10),
                maxDelay: TimeSpan.FromSeconds(5)));
    }

    [Fact]
    public void Constructor_WithJitterOutOfRange_Clamps()
    {
        var handler1 = new BackpressureHandler(jitterFactor: -0.5);
        var handler2 = new BackpressureHandler(jitterFactor: 1.5);

        Assert.Equal(0.0, handler1.JitterFactor);
        Assert.Equal(1.0, handler2.JitterFactor);
    }

    [Fact]
    public void ShouldAllow_Initially_ReturnsTrue()
    {
        var handler = new BackpressureHandler();

        Assert.True(handler.ShouldAllow(BaseTime));
        Assert.False(handler.IsInBackoff);
    }

    [Fact]
    public void RecordFailure_Returns429Reason()
    {
        var handler = new BackpressureHandler(jitterFactor: 0);

        var result = handler.RecordFailure(429, now: BaseTime);

        Assert.True(result.ShouldBackoff);
        Assert.Equal("upstream_rate_limited", result.Reason);
        Assert.Equal(429, result.StatusCode);
        Assert.Equal(1, result.ConsecutiveFailures);
    }

    [Fact]
    public void RecordFailure_Returns503Reason()
    {
        var handler = new BackpressureHandler(jitterFactor: 0);

        var result = handler.RecordFailure(503, now: BaseTime);

        Assert.Equal("upstream_unavailable", result.Reason);
    }

    [Theory]
    [InlineData(502, "upstream_bad_gateway")]
    [InlineData(504, "upstream_timeout")]
    [InlineData(500, "upstream_server_error")]
    [InlineData(501, "upstream_server_error")]
    [InlineData(400, "upstream_client_error")]
    [InlineData(404, "upstream_client_error")]
    [InlineData(200, "upstream_error")]
    public void RecordFailure_MapsStatusCodeToReason(int statusCode, string expectedReason)
    {
        var handler = new BackpressureHandler();

        var result = handler.RecordFailure(statusCode, now: BaseTime);

        Assert.Equal(expectedReason, result.Reason);
    }

    [Fact]
    public void RecordFailure_WithRetryAfter_UsesProvidedDelay()
    {
        var handler = new BackpressureHandler(jitterFactor: 0);
        var retryAfter = TimeSpan.FromSeconds(30);

        var result = handler.RecordFailure(429, retryAfter: retryAfter, now: BaseTime);

        Assert.Equal(retryAfter, result.BackoffDuration);
        Assert.Equal(BaseTime.AddSeconds(30), result.BackoffUntil);
    }

    [Fact]
    public void RecordFailure_WithRetryAfterExceedingMax_UsesCalculatedDelay()
    {
        var handler = new BackpressureHandler(
            maxDelay: TimeSpan.FromMinutes(5),
            jitterFactor: 0);
        var retryAfter = TimeSpan.FromMinutes(10); // Exceeds max

        var result = handler.RecordFailure(429, retryAfter: retryAfter, now: BaseTime);

        Assert.True(result.BackoffDuration <= TimeSpan.FromMinutes(5));
    }

    [Fact]
    public void RecordFailure_ExponentialBackoff_IncreasesDelay()
    {
        var handler = new BackpressureHandler(
            baseDelay: TimeSpan.FromSeconds(1),
            maxDelay: TimeSpan.FromMinutes(5),
            jitterFactor: 0);

        var result1 = handler.RecordFailure(429, now: BaseTime);
        var result2 = handler.RecordFailure(429, now: BaseTime.AddSeconds(10));
        var result3 = handler.RecordFailure(429, now: BaseTime.AddSeconds(20));

        // base * 2^0 = 1s, base * 2^1 = 2s, base * 2^2 = 4s
        Assert.Equal(TimeSpan.FromSeconds(1), result1.BackoffDuration);
        Assert.Equal(TimeSpan.FromSeconds(2), result2.BackoffDuration);
        Assert.Equal(TimeSpan.FromSeconds(4), result3.BackoffDuration);
    }
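
    // With jitter disabled the schedule is deterministic:
    // delay(n) = min(BaseDelay * 2^(n-1), MaxDelay) for the n-th consecutive
    // failure; a non-zero JitterFactor would spread each delay by up to that
    // fraction (assumed from the JitterFactor clamp test above, not asserted here).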

    [Fact]
    public void RecordFailure_CapsAtMaxDelay()
    {
        var handler = new BackpressureHandler(
            baseDelay: TimeSpan.FromSeconds(1),
            maxDelay: TimeSpan.FromSeconds(10),
            jitterFactor: 0);

        // Record many failures to exceed max
        for (var i = 0; i < 10; i++)
        {
            handler.RecordFailure(429, now: BaseTime.AddSeconds(i * 20));
        }

        var result = handler.RecordFailure(429, now: BaseTime.AddSeconds(200));

        Assert.Equal(TimeSpan.FromSeconds(10), result.BackoffDuration);
    }

    [Fact]
    public void ShouldAllow_InBackoff_ReturnsFalse()
    {
        var handler = new BackpressureHandler(jitterFactor: 0);

        handler.RecordFailure(429, now: BaseTime);

        Assert.False(handler.ShouldAllow(BaseTime.AddMilliseconds(500)));
    }

    [Fact]
    public void ShouldAllow_AfterBackoffExpires_ReturnsTrue()
    {
        var handler = new BackpressureHandler(
            baseDelay: TimeSpan.FromSeconds(1),
            jitterFactor: 0);

        handler.RecordFailure(429, now: BaseTime);

        Assert.True(handler.ShouldAllow(BaseTime.AddSeconds(2)));
    }

    [Fact]
    public void RecordSuccess_ResetsFailureCount()
    {
        var handler = new BackpressureHandler();

        handler.RecordFailure(429, now: BaseTime);
        handler.RecordFailure(429, now: BaseTime.AddSeconds(5));
        Assert.Equal(2, handler.ConsecutiveFailures);

        handler.RecordSuccess();

        Assert.Equal(0, handler.ConsecutiveFailures);
        Assert.True(handler.ShouldAllow(BaseTime.AddSeconds(10)));
    }

    [Fact]
    public void Reset_ClearsAllState()
    {
        var handler = new BackpressureHandler();

        handler.RecordFailure(429, now: BaseTime);
        handler.RecordFailure(429, now: BaseTime.AddSeconds(5));

        handler.Reset();

        Assert.Equal(0, handler.ConsecutiveFailures);
        Assert.False(handler.IsInBackoff);
        Assert.Equal(TimeSpan.Zero, handler.TimeUntilReady);
    }

    [Fact]
    public void TimeUntilReady_ReturnsCorrectValue()
    {
        var handler = new BackpressureHandler(
            baseDelay: TimeSpan.FromSeconds(10),
            jitterFactor: 0);

        // Use current time so TimeUntilReady (which uses UtcNow internally) works correctly
        var now = DateTimeOffset.UtcNow;
        handler.RecordFailure(429, now: now);

        var remaining = handler.TimeUntilReady;

        // Should be positive and up to 10 seconds
        Assert.True(remaining > TimeSpan.Zero, $"Expected > 0, got {remaining}");
        Assert.True(remaining <= TimeSpan.FromSeconds(10), $"Expected <= 10s, got {remaining}");
    }

    [Fact]
    public void GetSnapshot_ReturnsCorrectState()
    {
        var handler = new BackpressureHandler(jitterFactor: 0);

        handler.RecordFailure(429, now: BaseTime);
        handler.RecordFailure(503, now: BaseTime.AddSeconds(5));

        var snapshot = handler.GetSnapshot(BaseTime.AddSeconds(5));

        Assert.True(snapshot.IsInBackoff);
        Assert.Equal(2, snapshot.ConsecutiveFailures);
        Assert.NotNull(snapshot.BackoffUntil);
        Assert.Equal("upstream_unavailable", snapshot.LastFailureReason);
        Assert.True(snapshot.TimeRemaining > TimeSpan.Zero);
    }

    [Fact]
    public void GetSnapshot_WhenNotInBackoff_ShowsNotInBackoff()
    {
        var handler = new BackpressureHandler();

        var snapshot = handler.GetSnapshot(BaseTime);

        Assert.False(snapshot.IsInBackoff);
        Assert.Null(snapshot.BackoffUntil);
        Assert.Equal(TimeSpan.Zero, snapshot.TimeRemaining);
    }

    [Fact]
    public void FailureThreshold_DelaysBackoffUntilThreshold()
    {
        var handler = new BackpressureHandler(
            failureThreshold: 3,
            jitterFactor: 0);

        var result1 = handler.RecordFailure(429, now: BaseTime);
        var result2 = handler.RecordFailure(429, now: BaseTime.AddSeconds(1));
        var result3 = handler.RecordFailure(429, now: BaseTime.AddSeconds(2));

        Assert.False(result1.ShouldBackoff);
        Assert.False(result2.ShouldBackoff);
        Assert.True(result3.ShouldBackoff);
    }
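
    // FailureThreshold tolerates isolated blips: failures are counted, but backoff
    // only engages once the consecutive-failure count reaches the threshold
    // (the default threshold of 1 backs off on the first failure).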
|
||||
|
||||
[Fact]
|
||||
public void ConcurrentAccess_IsThreadSafe()
|
||||
{
|
||||
var handler = new BackpressureHandler(failureThreshold: 5);
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
|
||||
Parallel.For(0, 100, i =>
|
||||
{
|
||||
if (i % 3 == 0)
|
||||
{
|
||||
handler.RecordFailure(429, now: now.AddMilliseconds(i));
|
||||
}
|
||||
else if (i % 3 == 1)
|
||||
{
|
||||
handler.RecordSuccess();
|
||||
}
|
||||
else
|
||||
{
|
||||
handler.ShouldAllow(now.AddMilliseconds(i));
|
||||
}
|
||||
});
|
||||
|
||||
// Should complete without exceptions
|
||||
var snapshot = handler.GetSnapshot(now.AddSeconds(100));
|
||||
Assert.True(snapshot.ConsecutiveFailures >= 0);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,279 @@
using StellaOps.Orchestrator.Core.RateLimiting;

namespace StellaOps.Orchestrator.Tests.RateLimiting;

public class ConcurrencyLimiterTests
{
    [Fact]
    public void Constructor_WithValidMaxActive_CreatesLimiter()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);

        Assert.Equal(10, limiter.MaxActive);
        Assert.Equal(0, limiter.CurrentActive);
        Assert.Equal(10, limiter.AvailableSlots);
    }

    [Fact]
    public void Constructor_WithInitialActive_SetsCorrectly()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 3);

        Assert.Equal(3, limiter.CurrentActive);
        Assert.Equal(7, limiter.AvailableSlots);
    }

    [Theory]
    [InlineData(0)]
    [InlineData(-1)]
    public void Constructor_WithInvalidMaxActive_Throws(int maxActive)
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new ConcurrencyLimiter(maxActive: maxActive));
    }

    [Fact]
    public void Constructor_WithNegativeCurrentActive_Throws()
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new ConcurrencyLimiter(maxActive: 10, currentActive: -1));
    }

    [Fact]
    public void TryAcquire_WithCapacity_ReturnsTrue()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);

        var result = limiter.TryAcquire();

        Assert.True(result);
        Assert.Equal(1, limiter.CurrentActive);
    }

    [Fact]
    public void TryAcquire_AtCapacity_ReturnsFalse()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 2, currentActive: 2);

        var result = limiter.TryAcquire();

        Assert.False(result);
        Assert.Equal(2, limiter.CurrentActive);
    }

    [Fact]
    public void TryAcquire_MultipleSlots_WithCapacity_ReturnsTrue()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);

        var result = limiter.TryAcquire(count: 5);

        Assert.True(result);
        Assert.Equal(5, limiter.CurrentActive);
    }

    [Fact]
    public void TryAcquire_MultipleSlots_WithoutCapacity_ReturnsFalse()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 8);

        var result = limiter.TryAcquire(count: 5);

        Assert.False(result);
        Assert.Equal(8, limiter.CurrentActive); // Unchanged (no partial acquisition)
    }

    [Fact]
    public void TryAcquire_ZeroSlots_Throws()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            limiter.TryAcquire(count: 0));
    }

    [Fact]
    public void Release_WithActiveSlots_ReturnsTrue()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);

        var result = limiter.Release();

        Assert.True(result);
        Assert.Equal(4, limiter.CurrentActive);
    }

    [Fact]
    public void Release_WithNoActiveSlots_ReturnsFalse()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 0);

        var result = limiter.Release();

        Assert.False(result);
        Assert.Equal(0, limiter.CurrentActive);
    }

    [Fact]
    public void Release_MultipleSlots_ReleasesCorrectAmount()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);

        var released = limiter.Release(count: 3);

        Assert.Equal(3, released);
        Assert.Equal(2, limiter.CurrentActive);
    }

    [Fact]
    public void Release_MultipleSlots_CapsAtCurrentActive()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 2);

        var released = limiter.Release(count: 5);

        Assert.Equal(2, released); // Only 2 were available to release
        Assert.Equal(0, limiter.CurrentActive);
    }

    [Fact]
    public void Release_ZeroSlots_Throws()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            limiter.Release(count: 0));
    }

    [Fact]
    public void HasCapacity_WithAvailableSlots_ReturnsTrue()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);

        Assert.True(limiter.HasCapacity());
        Assert.True(limiter.HasCapacity(count: 5));
    }

    [Fact]
    public void HasCapacity_WithoutAvailableSlots_ReturnsFalse()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 10);

        Assert.False(limiter.HasCapacity());
    }

    [Fact]
    public void HasCapacity_ForMultipleSlots_ChecksCorrectly()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 8);

        Assert.True(limiter.HasCapacity(count: 2));
        Assert.False(limiter.HasCapacity(count: 3));
    }

    [Fact]
    public void Reset_SetsToZero()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);

        var released = limiter.Reset();

        Assert.Equal(5, released);
        Assert.Equal(0, limiter.CurrentActive);
    }

    [Fact]
    public void SetActive_SetsCorrectCount()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);

        limiter.SetActive(7);

        Assert.Equal(7, limiter.CurrentActive);
    }

    [Fact]
    public void SetActive_NegativeCount_Throws()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            limiter.SetActive(-1));
    }

    [Fact]
    public void GetSnapshot_ReturnsCorrectState()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 4);

        var snapshot = limiter.GetSnapshot();

        Assert.Equal(10, snapshot.MaxActive);
        Assert.Equal(4, snapshot.CurrentActive);
        Assert.Equal(6, snapshot.AvailableSlots);
        Assert.Equal(0.4, snapshot.Utilization);
        Assert.False(snapshot.IsAtCapacity);
        Assert.False(snapshot.IsIdle);
    }

    [Fact]
    public void GetSnapshot_AtCapacity_ShowsAtCapacity()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 10);

        var snapshot = limiter.GetSnapshot();

        Assert.True(snapshot.IsAtCapacity);
        Assert.Equal(1.0, snapshot.Utilization);
    }

    [Fact]
    public void GetSnapshot_WhenIdle_ShowsIdle()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 0);

        var snapshot = limiter.GetSnapshot();

        Assert.True(snapshot.IsIdle);
        Assert.Equal(0.0, snapshot.Utilization);
    }

    [Fact]
    public void ConcurrentAccess_IsThreadSafe()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 50);
        var acquired = 0;

        Parallel.For(0, 100, _ =>
        {
            if (limiter.TryAcquire())
            {
                Interlocked.Increment(ref acquired);
            }
        });

        Assert.Equal(50, acquired);
        Assert.Equal(50, limiter.CurrentActive);
    }

    [Fact]
    public void ConcurrentAcquireAndRelease_MaintainsInvariants()
    {
        var limiter = new ConcurrencyLimiter(maxActive: 10);
        var completed = 0;

        Parallel.For(0, 100, _ =>
        {
            if (limiter.TryAcquire())
            {
                Interlocked.Increment(ref completed);
                limiter.Release();
            }
        });

        // All operations should complete without deadlock
        Assert.True(completed > 0);
        // After all parallel operations complete, should be back to 0
        Assert.Equal(0, limiter.CurrentActive);
    }
}
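
// ---------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of this commit): the
// all-or-nothing TryAcquire semantics asserted above -- no partial
// acquisition, correct under Parallel.For -- can be implemented with a
// lock-free compare-and-swap loop. The shape below is an assumption, not
// the actual StellaOps ConcurrencyLimiter.
// ---------------------------------------------------------------------------
public sealed class ConcurrencyLimiterSketch
{
    private readonly int _maxActive;
    private int _currentActive;

    public ConcurrencyLimiterSketch(int maxActive)
    {
        if (maxActive <= 0) throw new ArgumentOutOfRangeException(nameof(maxActive));
        _maxActive = maxActive;
    }

    public int CurrentActive => Volatile.Read(ref _currentActive);

    public bool TryAcquire(int count = 1)
    {
        while (true)
        {
            var current = Volatile.Read(ref _currentActive);
            if (current + count > _maxActive) return false; // would exceed: acquire nothing
            if (Interlocked.CompareExchange(ref _currentActive, current + count, current) == current)
                return true; // CAS succeeded: all requested slots taken atomically
        }
    }

    public int Release(int count = 1)
    {
        while (true)
        {
            var current = Volatile.Read(ref _currentActive);
            var toRelease = Math.Min(count, current); // cap at what is actually held
            if (Interlocked.CompareExchange(ref _currentActive, current - toRelease, current) == current)
                return toRelease;
        }
    }
}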
@@ -0,0 +1,196 @@
using StellaOps.Orchestrator.Core.RateLimiting;

namespace StellaOps.Orchestrator.Tests.RateLimiting;

public class HourlyCounterTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);

    [Fact]
    public void Constructor_WithValidMaxPerHour_CreatesCounter()
    {
        var counter = new HourlyCounter(maxPerHour: 100);

        Assert.Equal(100, counter.MaxPerHour);
    }

    [Fact]
    public void Constructor_WithInitialCount_SetsCorrectly()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);

        var snapshot = counter.GetSnapshot(BaseTime);
        Assert.Equal(50, snapshot.CurrentCount);
        Assert.Equal(50, snapshot.Remaining);
    }

    [Theory]
    [InlineData(0)]
    [InlineData(-1)]
    public void Constructor_WithInvalidMaxPerHour_Throws(int maxPerHour)
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new HourlyCounter(maxPerHour: maxPerHour));
    }

    [Fact]
    public void TryIncrement_WithinLimit_ReturnsTrue()
    {
        var counter = new HourlyCounter(maxPerHour: 100);

        var result = counter.TryIncrement(BaseTime);

        Assert.True(result);
        var snapshot = counter.GetSnapshot(BaseTime);
        Assert.Equal(1, snapshot.CurrentCount);
    }

    [Fact]
    public void TryIncrement_AtLimit_ReturnsFalse()
    {
        var counter = new HourlyCounter(maxPerHour: 2, currentCount: 2, hourStart: BaseTime);

        var result = counter.TryIncrement(BaseTime);

        Assert.False(result);
        var snapshot = counter.GetSnapshot(BaseTime);
        Assert.Equal(2, snapshot.CurrentCount); // Unchanged
    }

    [Fact]
    public void TryIncrement_AfterHourReset_IncrementsFromZero()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);

        var result = counter.TryIncrement(BaseTime.AddHours(1).AddMinutes(1));

        Assert.True(result);
        var snapshot = counter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(1));
        Assert.Equal(1, snapshot.CurrentCount);
    }

    [Fact]
    public void TryIncrement_AtLimitAfterHourReset_Succeeds()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 100, hourStart: BaseTime);

        var result = counter.TryIncrement(BaseTime.AddHours(1).AddMinutes(1));

        Assert.True(result);
    }

    [Fact]
    public void Decrement_DecreasesCount()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 5, hourStart: BaseTime);

        counter.Decrement();

        var snapshot = counter.GetSnapshot(BaseTime);
        Assert.Equal(4, snapshot.CurrentCount);
    }

    [Fact]
    public void Decrement_AtZero_StaysAtZero()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 0, hourStart: BaseTime);

        counter.Decrement();

        var snapshot = counter.GetSnapshot(BaseTime);
        Assert.Equal(0, snapshot.CurrentCount);
    }

    [Fact]
    public void GetSnapshot_CalculatesRemainingCorrectly()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 30, hourStart: BaseTime);

        var snapshot = counter.GetSnapshot(BaseTime);

        Assert.Equal(70, snapshot.Remaining);
        Assert.False(snapshot.IsExhausted);
    }

    [Fact]
    public void GetSnapshot_AtLimit_ShowsExhausted()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 100, hourStart: BaseTime);

        var snapshot = counter.GetSnapshot(BaseTime);

        Assert.Equal(0, snapshot.Remaining);
        Assert.True(snapshot.IsExhausted);
    }

    [Fact]
    public void GetSnapshot_CalculatesTimeUntilReset()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 0, hourStart: BaseTime);

        var snapshot = counter.GetSnapshot(BaseTime.AddMinutes(15));

        Assert.Equal(TimeSpan.FromMinutes(45), snapshot.TimeUntilReset);
    }

    [Fact]
    public void GetSnapshot_AfterHourBoundary_ResetsAndReturnsNewHour()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);

        var snapshot = counter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(30));

        Assert.Equal(0, snapshot.CurrentCount);
        Assert.Equal(BaseTime.AddHours(1), snapshot.HourStart);
    }

    [Fact]
    public void GetSnapshot_ResetsHourCorrectly()
    {
        var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);

        // Check at 12:30 - same hour
        var snapshot1 = counter.GetSnapshot(BaseTime.AddMinutes(30));
        Assert.Equal(50, snapshot1.CurrentCount);
        Assert.Equal(BaseTime, snapshot1.HourStart);

        // Check at 13:15 - new hour
        var snapshot2 = counter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(15));
        Assert.Equal(0, snapshot2.CurrentCount);
        Assert.Equal(BaseTime.AddHours(1), snapshot2.HourStart);
    }

    [Fact]
    public void ConcurrentAccess_IsThreadSafe()
    {
        var counter = new HourlyCounter(maxPerHour: 50);
        var successes = 0;
        var now = DateTimeOffset.UtcNow;

        Parallel.For(0, 100, _ =>
        {
            if (counter.TryIncrement(now))
            {
                Interlocked.Increment(ref successes);
            }
        });

        Assert.Equal(50, successes);
        var snapshot = counter.GetSnapshot(now);
        Assert.Equal(50, snapshot.CurrentCount);
    }

    [Fact]
    public void HourlyCounterSnapshot_Remaining_NeverNegative()
    {
        // Edge case: if CurrentCount somehow exceeds MaxPerHour
        var snapshot = new HourlyCounterSnapshot(
            MaxPerHour: 100,
            CurrentCount: 150,
            HourStart: BaseTime,
            TimeUntilReset: TimeSpan.FromMinutes(30));

        Assert.Equal(0, snapshot.Remaining);
        Assert.True(snapshot.IsExhausted);
    }
}
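
// ---------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of this commit): the rollover
// behavior asserted above -- the count resets at the top of each clock hour
// and HourStart advances to the new boundary -- amounts to flooring the probe
// time to the hour and comparing. Assumed shape only.
// ---------------------------------------------------------------------------
public static class HourWindowSketch
{
    public static DateTimeOffset FloorToHour(DateTimeOffset t) =>
        new(t.Year, t.Month, t.Day, t.Hour, 0, 0, t.Offset);

    // Returns the effective (count, hourStart) for a probe time: the stored
    // count survives only while the probe falls inside the same clock hour.
    public static (int Count, DateTimeOffset HourStart) Observe(
        int storedCount, DateTimeOffset storedHourStart, DateTimeOffset now)
    {
        var currentHour = FloorToHour(now);
        return currentHour > storedHourStart
            ? (0, currentHour)            // new hour: counter resets
            : (storedCount, storedHourStart);
    }
}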
@@ -0,0 +1,258 @@
using StellaOps.Orchestrator.Core.RateLimiting;

namespace StellaOps.Orchestrator.Tests.RateLimiting;

public class TokenBucketTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);

    [Fact]
    public void Constructor_WithValidParameters_CreatesBucket()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);

        Assert.Equal(10, bucket.BurstCapacity);
        Assert.Equal(2.0, bucket.RefillRate);
        Assert.Equal(10, bucket.CurrentTokens);
    }

    [Fact]
    public void Constructor_WithInitialTokens_SetsCorrectly()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);

        Assert.Equal(5, bucket.CurrentTokens);
    }

    [Fact]
    public void Constructor_WithInitialTokensExceedingCapacity_CapsAtCapacity()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 15);

        Assert.Equal(10, bucket.CurrentTokens);
    }

    [Theory]
    [InlineData(0)]
    [InlineData(-1)]
    public void Constructor_WithInvalidBurstCapacity_Throws(int burstCapacity)
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new TokenBucket(burstCapacity: burstCapacity, refillRate: 2.0));
    }

    [Theory]
    [InlineData(0)]
    [InlineData(-1)]
    public void Constructor_WithInvalidRefillRate_Throws(double refillRate)
    {
        Assert.Throws<ArgumentOutOfRangeException>(() =>
            new TokenBucket(burstCapacity: 10, refillRate: refillRate));
    }

    [Fact]
    public void TryConsume_WithAvailableTokens_ReturnsTrue()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);

        var result = bucket.TryConsume(BaseTime);

        Assert.True(result);
        Assert.Equal(9, bucket.CurrentTokens);
    }

    [Fact]
    public void TryConsume_WithMultipleTokens_ConsumesCorrectAmount()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);

        var result = bucket.TryConsume(BaseTime, tokensRequired: 5);

        Assert.True(result);
        Assert.Equal(5, bucket.CurrentTokens);
    }

    [Fact]
    public void TryConsume_WithInsufficientTokens_ReturnsFalse()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 2);

        var result = bucket.TryConsume(BaseTime, tokensRequired: 5);

        Assert.False(result);
        Assert.Equal(2, bucket.CurrentTokens); // Unchanged
    }

    [Fact]
    public void TryConsume_WithExactTokens_ConsumesAll()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);

        var result = bucket.TryConsume(BaseTime, tokensRequired: 5);

        Assert.True(result);
        Assert.Equal(0, bucket.CurrentTokens);
    }

    [Fact]
    public void TryConsume_WithZeroTokensRequired_Throws()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);

        Assert.Throws<ArgumentOutOfRangeException>(() =>
            bucket.TryConsume(BaseTime, tokensRequired: 0));
    }

    [Fact]
    public void Refill_AfterTimeElapsed_AddsTokens()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5, lastRefillAt: BaseTime);

        bucket.Refill(BaseTime.AddSeconds(2));

        Assert.Equal(9, bucket.CurrentTokens); // 5 + (2 * 2.0)
    }

    [Fact]
    public void Refill_CapsAtBurstCapacity()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 8, lastRefillAt: BaseTime);

        bucket.Refill(BaseTime.AddSeconds(10));

        Assert.Equal(10, bucket.CurrentTokens); // Capped at burst capacity
    }

    [Fact]
    public void Refill_WithPastTime_DoesNothing()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5, lastRefillAt: BaseTime);

        bucket.Refill(BaseTime.AddSeconds(-1));

        Assert.Equal(5, bucket.CurrentTokens);
    }

    [Fact]
    public void TryConsume_RefillsBeforeConsuming()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 0, lastRefillAt: BaseTime);

        // After 3 seconds, should have 6 tokens (3 * 2.0)
        var result = bucket.TryConsume(BaseTime.AddSeconds(3), tokensRequired: 5);

        Assert.True(result);
        Assert.Equal(1, bucket.CurrentTokens); // 6 - 5
    }

    [Fact]
    public void HasTokens_WithSufficientTokens_ReturnsTrue()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);

        var result = bucket.HasTokens(BaseTime, tokensRequired: 3);

        Assert.True(result);
        Assert.Equal(5, bucket.CurrentTokens); // Unchanged
    }

    [Fact]
    public void HasTokens_WithInsufficientTokens_ReturnsFalse()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 2);

        var result = bucket.HasTokens(BaseTime, tokensRequired: 5);

        Assert.False(result);
    }

    [Fact]
    public void EstimatedWaitTime_WithAvailableTokens_ReturnsZero()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);

        var wait = bucket.EstimatedWaitTime(BaseTime, tokensRequired: 3);

        Assert.Equal(TimeSpan.Zero, wait);
    }

    [Fact]
    public void EstimatedWaitTime_WithInsufficientTokens_ReturnsCorrectTime()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 2, lastRefillAt: BaseTime);

        // Need 5 tokens, have 2, need 3 more at rate 2.0 = 1.5 seconds
        var wait = bucket.EstimatedWaitTime(BaseTime, tokensRequired: 5);

        Assert.Equal(TimeSpan.FromSeconds(1.5), wait);
    }

    [Fact]
    public void Reset_SetsToFullCapacity()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 3);

        bucket.Reset(BaseTime);

        Assert.Equal(10, bucket.CurrentTokens);
    }

    [Fact]
    public void GetSnapshot_ReturnsCorrectState()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5, lastRefillAt: BaseTime);

        var snapshot = bucket.GetSnapshot(BaseTime);

        Assert.Equal(10, snapshot.BurstCapacity);
        Assert.Equal(2.0, snapshot.RefillRate);
        Assert.Equal(5, snapshot.CurrentTokens);
        Assert.Equal(BaseTime, snapshot.LastRefillAt);
        Assert.Equal(0.5, snapshot.FillPercent);
        Assert.False(snapshot.IsEmpty);
        Assert.False(snapshot.IsFull);
    }

    [Fact]
    public void GetSnapshot_WithEmptyBucket_ShowsEmpty()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 0, lastRefillAt: BaseTime);

        var snapshot = bucket.GetSnapshot(BaseTime);

        Assert.True(snapshot.IsEmpty);
        Assert.False(snapshot.IsFull);
    }

    [Fact]
    public void GetSnapshot_WithFullBucket_ShowsFull()
    {
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 10);

        var snapshot = bucket.GetSnapshot(BaseTime);

        Assert.False(snapshot.IsEmpty);
        Assert.True(snapshot.IsFull);
    }

    [Fact]
    public void ConcurrentAccess_IsThreadSafe()
    {
        // Use a fixed timestamp and a negligible refill rate so no meaningful refill occurs during the test
        var fixedTime = DateTimeOffset.UtcNow;
        var bucket = new TokenBucket(burstCapacity: 100, refillRate: 0.001, initialTokens: 100, lastRefillAt: fixedTime);
        var successes = 0;

        Parallel.For(0, 100, _ =>
        {
            if (bucket.TryConsume(fixedTime))
            {
                Interlocked.Increment(ref successes);
            }
        });

        Assert.Equal(100, successes);
        // Thread timing may leave a tiny fractional refill, so just check the count is close to 0
        Assert.True(bucket.CurrentTokens < 1, $"Expected < 1 tokens remaining, got {bucket.CurrentTokens}");
    }
}
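
// ---------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of this commit): the refill
// arithmetic the tests above pin down -- tokens grow linearly with elapsed
// time at refillRate per second, cap at burstCapacity, and the wait estimate
// is the token deficit divided by the rate. Assumed shape of the math only.
// ---------------------------------------------------------------------------
public static class TokenBucketMathSketch
{
    public static double Refill(
        double tokens, double burstCapacity, double refillRate,
        DateTimeOffset lastRefillAt, DateTimeOffset now)
    {
        var elapsed = (now - lastRefillAt).TotalSeconds;
        if (elapsed <= 0) return tokens;               // past timestamps are ignored
        return Math.Min(burstCapacity, tokens + elapsed * refillRate);
    }

    // e.g. 2 tokens on hand, 5 required, rate 2/s => (5 - 2) / 2 = 1.5 seconds
    public static TimeSpan EstimatedWait(double tokens, double refillRate, int required) =>
        tokens >= required
            ? TimeSpan.Zero
            : TimeSpan.FromSeconds((required - tokens) / refillRate);
}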
@@ -0,0 +1,284 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Core.Scheduling;

namespace StellaOps.Orchestrator.Tests.Scheduling;

public sealed class DagPlannerTests
{
    private static readonly string TenantId = "test-tenant";
    private static readonly Guid RunId = Guid.NewGuid();

    [Fact]
    public void ValidateDag_EmptyEdges_ReturnsValid()
    {
        var result = DagPlanner.ValidateDag([]);
        Assert.True(result.IsValid);
        Assert.Empty(result.CycleNodes);
    }

    [Fact]
    public void ValidateDag_LinearChain_ReturnsValid()
    {
        var jobA = Guid.NewGuid();
        var jobB = Guid.NewGuid();
        var jobC = Guid.NewGuid();

        var edges = new[]
        {
            CreateEdge(jobA, jobB),
            CreateEdge(jobB, jobC)
        };

        var result = DagPlanner.ValidateDag(edges);
        Assert.True(result.IsValid);
    }

    [Fact]
    public void ValidateDag_DiamondShape_ReturnsValid()
    {
        // A -> B -> D
        // A -> C -> D
        var jobA = Guid.NewGuid();
        var jobB = Guid.NewGuid();
        var jobC = Guid.NewGuid();
        var jobD = Guid.NewGuid();

        var edges = new[]
        {
            CreateEdge(jobA, jobB),
            CreateEdge(jobA, jobC),
            CreateEdge(jobB, jobD),
            CreateEdge(jobC, jobD)
        };

        var result = DagPlanner.ValidateDag(edges);
        Assert.True(result.IsValid);
    }

    [Fact]
    public void ValidateDag_SimpleCycle_ReturnsCycleDetected()
    {
        var jobA = Guid.NewGuid();
        var jobB = Guid.NewGuid();

        var edges = new[]
        {
            CreateEdge(jobA, jobB),
            CreateEdge(jobB, jobA) // Cycle!
        };

        var result = DagPlanner.ValidateDag(edges);
        Assert.False(result.IsValid);
        Assert.NotEmpty(result.CycleNodes);
    }

    [Fact]
    public void ValidateDag_SelfLoop_ReturnsCycleDetected()
    {
        var jobA = Guid.NewGuid();

        var edges = new[]
        {
            CreateEdge(jobA, jobA) // Self-loop!
        };

        var result = DagPlanner.ValidateDag(edges);
        Assert.False(result.IsValid);
    }

    [Fact]
    public void TopologicalSort_LinearChain_ReturnsCorrectOrder()
    {
        var jobA = Guid.NewGuid();
        var jobB = Guid.NewGuid();
        var jobC = Guid.NewGuid();

        var jobs = new[] { jobC, jobA, jobB }; // Unordered
        var edges = new[]
        {
            CreateEdge(jobA, jobB),
            CreateEdge(jobB, jobC)
        };

        var sorted = DagPlanner.TopologicalSort(jobs, edges).ToList();

        Assert.Equal(3, sorted.Count);
        Assert.True(sorted.IndexOf(jobA) < sorted.IndexOf(jobB));
        Assert.True(sorted.IndexOf(jobB) < sorted.IndexOf(jobC));
    }

    [Fact]
    public void TopologicalSort_DiamondShape_ReturnsValidOrder()
    {
        var jobA = Guid.NewGuid();
        var jobB = Guid.NewGuid();
        var jobC = Guid.NewGuid();
        var jobD = Guid.NewGuid();

        var jobs = new[] { jobD, jobC, jobB, jobA }; // Reverse order
        var edges = new[]
        {
            CreateEdge(jobA, jobB),
            CreateEdge(jobA, jobC),
            CreateEdge(jobB, jobD),
            CreateEdge(jobC, jobD)
        };

        var sorted = DagPlanner.TopologicalSort(jobs, edges).ToList();

        Assert.Equal(4, sorted.Count);
        Assert.True(sorted.IndexOf(jobA) < sorted.IndexOf(jobB));
        Assert.True(sorted.IndexOf(jobA) < sorted.IndexOf(jobC));
        Assert.True(sorted.IndexOf(jobB) < sorted.IndexOf(jobD));
        Assert.True(sorted.IndexOf(jobC) < sorted.IndexOf(jobD));
    }

    [Fact]
    public void TopologicalSort_NoEdges_ReturnsAllJobs()
    {
        var jobA = Guid.NewGuid();
        var jobB = Guid.NewGuid();

        var jobs = new[] { jobA, jobB };
        var sorted = DagPlanner.TopologicalSort(jobs, []);

        Assert.Equal(2, sorted.Count);
        Assert.Contains(jobA, sorted);
        Assert.Contains(jobB, sorted);
    }

    [Fact]
    public void GetReadyJobs_NoDependencies_ReturnsAllPendingJobs()
    {
        var job1 = CreateJob(JobStatus.Pending);
        var job2 = CreateJob(JobStatus.Pending);
        var job3 = CreateJob(JobStatus.Scheduled); // Not pending

        var ready = DagPlanner.GetReadyJobs([job1, job2, job3], []);

        Assert.Equal(2, ready.Count);
        Assert.Contains(job1, ready);
        Assert.Contains(job2, ready);
    }

    [Fact]
    public void GetReadyJobs_WithUnsatisfiedDependency_FiltersBlockedJobs()
    {
        var job1 = CreateJob(JobStatus.Pending);
        var job2 = CreateJob(JobStatus.Pending);

        var edges = new[]
        {
            CreateEdge(job1.JobId, job2.JobId) // job2 depends on job1
        };

        var ready = DagPlanner.GetReadyJobs([job1, job2], edges);

        Assert.Single(ready);
        Assert.Contains(job1, ready);
    }

    [Fact]
    public void GetReadyJobs_WithSatisfiedDependency_IncludesDependentJob()
    {
        var job1 = CreateJob(JobStatus.Succeeded); // Parent completed
        var job2 = CreateJob(JobStatus.Pending);   // Can now run

        var edges = new[]
        {
            CreateEdge(job1.JobId, job2.JobId)
        };

        var ready = DagPlanner.GetReadyJobs([job1, job2], edges);

        Assert.Single(ready);
        Assert.Contains(job2, ready);
    }

    [Fact]
    public void GetBlockedJobs_SingleFailure_ReturnsDirectAndTransitiveChildren()
    {
        var failed = Guid.NewGuid();
        var child1 = Guid.NewGuid();
        var child2 = Guid.NewGuid();
        var grandchild = Guid.NewGuid();

        var edges = new[]
        {
            CreateEdge(failed, child1),
            CreateEdge(failed, child2),
            CreateEdge(child1, grandchild)
        };

        var blocked = DagPlanner.GetBlockedJobs(failed, edges);

        Assert.Equal(3, blocked.Count);
        Assert.Contains(child1, blocked);
        Assert.Contains(child2, blocked);
        Assert.Contains(grandchild, blocked);
    }

    [Fact]
    public void CalculateCriticalPath_LinearChain_ReturnsEntireChain()
    {
        var job1 = CreateJob(JobStatus.Pending);
        var job2 = CreateJob(JobStatus.Pending);
        var job3 = CreateJob(JobStatus.Pending);

        var edges = new[]
        {
            CreateEdge(job1.JobId, job2.JobId),
            CreateEdge(job2.JobId, job3.JobId)
        };

        var result = DagPlanner.CalculateCriticalPath(
            [job1, job2, job3],
            edges,
            _ => TimeSpan.FromMinutes(10));

        Assert.Equal(TimeSpan.FromMinutes(30), result.TotalDuration);
        Assert.Equal(3, result.CriticalPathJobIds.Count);
    }

    private static DagEdge CreateEdge(Guid parent, Guid child, string edgeType = DagEdgeTypes.Success)
    {
        return new DagEdge(
            EdgeId: Guid.NewGuid(),
            TenantId: TenantId,
            RunId: RunId,
            ParentJobId: parent,
            ChildJobId: child,
            EdgeType: edgeType,
            CreatedAt: DateTimeOffset.UtcNow);
    }

    private static Job CreateJob(JobStatus status, int priority = 0)
    {
        return new Job(
            JobId: Guid.NewGuid(),
            TenantId: TenantId,
            ProjectId: null,
            RunId: RunId,
            JobType: "test.job",
            Status: status,
            Priority: priority,
            Attempt: 1,
            MaxAttempts: 3,
            PayloadDigest: "0".PadLeft(64, '0'),
            Payload: "{}",
            IdempotencyKey: Guid.NewGuid().ToString(),
            CorrelationId: null,
            LeaseId: null,
            WorkerId: null,
            TaskRunnerId: null,
            LeaseUntil: null,
            CreatedAt: DateTimeOffset.UtcNow,
            ScheduledAt: null,
            LeasedAt: null,
            CompletedAt: null,
            NotBefore: null,
            Reason: null,
            ReplayOf: null,
            CreatedBy: "test");
    }
}
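
// ---------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of this commit): Kahn's
// algorithm covers both behaviors tested above -- it yields a valid
// topological order on a DAG, and leaves nodes unprocessed exactly when a
// cycle exists. One standard way to implement ValidateDag/TopologicalSort,
// not necessarily how DagPlanner does it.
// ---------------------------------------------------------------------------
public static class KahnSketch
{
    public static (List<Guid> Order, bool IsDag) Sort(
        IReadOnlyCollection<Guid> nodes,
        IReadOnlyCollection<(Guid Parent, Guid Child)> edges)
    {
        var inDegree = nodes.ToDictionary(n => n, _ => 0);
        var children = nodes.ToDictionary(n => n, _ => new List<Guid>());
        foreach (var (parent, child) in edges)
        {
            children[parent].Add(child);
            inDegree[child]++;
        }

        // Seed with all roots, then peel nodes whose dependencies are satisfied.
        var ready = new Queue<Guid>(nodes.Where(n => inDegree[n] == 0));
        var order = new List<Guid>();
        while (ready.Count > 0)
        {
            var node = ready.Dequeue();
            order.Add(node);
            foreach (var child in children[node])
                if (--inDegree[child] == 0)
                    ready.Enqueue(child);
        }

        // Any node that never reaches in-degree zero sits on (or behind) a cycle.
        return (order, order.Count == nodes.Count);
    }
}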
@@ -0,0 +1,109 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Core.Scheduling;

namespace StellaOps.Orchestrator.Tests.Scheduling;

public sealed class JobStateMachineTests
{
    [Theory]
    [InlineData(JobStatus.Pending, JobStatus.Scheduled, true)]
    [InlineData(JobStatus.Pending, JobStatus.Canceled, true)]
    [InlineData(JobStatus.Pending, JobStatus.Leased, false)]
    [InlineData(JobStatus.Scheduled, JobStatus.Leased, true)]
    [InlineData(JobStatus.Scheduled, JobStatus.Canceled, true)]
    [InlineData(JobStatus.Scheduled, JobStatus.Pending, true)]
    [InlineData(JobStatus.Leased, JobStatus.Succeeded, true)]
    [InlineData(JobStatus.Leased, JobStatus.Failed, true)]
    [InlineData(JobStatus.Leased, JobStatus.Canceled, true)]
    [InlineData(JobStatus.Leased, JobStatus.TimedOut, true)]
    [InlineData(JobStatus.Leased, JobStatus.Pending, false)]
    [InlineData(JobStatus.Failed, JobStatus.Pending, true)]
    [InlineData(JobStatus.Failed, JobStatus.Scheduled, false)]
    [InlineData(JobStatus.TimedOut, JobStatus.Pending, true)]
    [InlineData(JobStatus.Succeeded, JobStatus.Pending, false)]
    [InlineData(JobStatus.Canceled, JobStatus.Pending, false)]
    public void IsValidTransition_ReturnsExpectedResult(JobStatus from, JobStatus to, bool expected)
    {
        var result = JobStateMachine.IsValidTransition(from, to);
        Assert.Equal(expected, result);
    }

    [Theory]
    [InlineData(JobStatus.Pending, JobStatus.Pending)]
    [InlineData(JobStatus.Scheduled, JobStatus.Scheduled)]
    [InlineData(JobStatus.Leased, JobStatus.Leased)]
    [InlineData(JobStatus.Succeeded, JobStatus.Succeeded)]
    public void IsValidTransition_SameStatus_ReturnsTrue(JobStatus status, JobStatus same)
    {
        Assert.True(JobStateMachine.IsValidTransition(status, same));
    }

    [Theory]
    [InlineData(JobStatus.Succeeded, true)]
    [InlineData(JobStatus.Failed, true)]
    [InlineData(JobStatus.Canceled, true)]
    [InlineData(JobStatus.TimedOut, true)]
    [InlineData(JobStatus.Pending, false)]
    [InlineData(JobStatus.Scheduled, false)]
    [InlineData(JobStatus.Leased, false)]
    public void IsTerminal_ReturnsExpectedResult(JobStatus status, bool expected)
    {
        Assert.Equal(expected, JobStateMachine.IsTerminal(status));
    }

    [Theory]
    [InlineData(JobStatus.Failed, true)]
    [InlineData(JobStatus.TimedOut, true)]
    [InlineData(JobStatus.Succeeded, false)]
    [InlineData(JobStatus.Canceled, false)]
    [InlineData(JobStatus.Pending, false)]
    public void IsRetryable_ReturnsExpectedResult(JobStatus status, bool expected)
    {
        Assert.Equal(expected, JobStateMachine.IsRetryable(status));
    }

    [Fact]
    public void ValidateTransition_InvalidTransition_ThrowsException()
    {
        var ex = Assert.Throws<InvalidJobTransitionException>(
            () => JobStateMachine.ValidateTransition(JobStatus.Pending, JobStatus.Succeeded));

        Assert.Equal(JobStatus.Pending, ex.FromStatus);
        Assert.Equal(JobStatus.Succeeded, ex.ToStatus);
    }

    [Fact]
    public void ValidateTransition_ValidTransition_DoesNotThrow()
    {
        JobStateMachine.ValidateTransition(JobStatus.Pending, JobStatus.Scheduled);
    }

    [Fact]
    public void GetValidTransitions_Pending_ReturnsScheduledAndCanceled()
    {
        var transitions = JobStateMachine.GetValidTransitions(JobStatus.Pending);

        Assert.Contains(JobStatus.Scheduled, transitions);
        Assert.Contains(JobStatus.Canceled, transitions);
        Assert.Equal(2, transitions.Count);
    }

    [Fact]
    public void GetValidTransitions_Leased_ReturnsFourOptions()
    {
        var transitions = JobStateMachine.GetValidTransitions(JobStatus.Leased);

        Assert.Contains(JobStatus.Succeeded, transitions);
        Assert.Contains(JobStatus.Failed, transitions);
        Assert.Contains(JobStatus.Canceled, transitions);
        Assert.Contains(JobStatus.TimedOut, transitions);
        Assert.Equal(4, transitions.Count);
    }

    [Fact]
    public void GetValidTransitions_Terminal_ReturnsEmpty()
    {
        Assert.Empty(JobStateMachine.GetValidTransitions(JobStatus.Succeeded));
        Assert.Empty(JobStateMachine.GetValidTransitions(JobStatus.Canceled));
    }
}
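
// ---------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of this commit): the
// transition matrix enumerated in the theories above fits a simple dictionary
// from state to allowed successors, with self-transitions always permitted.
// Assumed representation; the real JobStateMachine may store it differently.
// ---------------------------------------------------------------------------
public static class JobTransitionTableSketch
{
    private static readonly Dictionary<JobStatus, JobStatus[]> Allowed = new()
    {
        [JobStatus.Pending]   = [JobStatus.Scheduled, JobStatus.Canceled],
        [JobStatus.Scheduled] = [JobStatus.Leased, JobStatus.Canceled, JobStatus.Pending],
        [JobStatus.Leased]    = [JobStatus.Succeeded, JobStatus.Failed, JobStatus.Canceled, JobStatus.TimedOut],
        [JobStatus.Failed]    = [JobStatus.Pending],   // retry path
        [JobStatus.TimedOut]  = [JobStatus.Pending],   // retry path
        [JobStatus.Succeeded] = [],                    // terminal
        [JobStatus.Canceled]  = [],                    // terminal
    };

    public static bool IsValid(JobStatus from, JobStatus to) =>
        from == to || Allowed[from].Contains(to);
}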
@@ -0,0 +1,143 @@
using StellaOps.Orchestrator.Core.Scheduling;

namespace StellaOps.Orchestrator.Tests.Scheduling;

public sealed class RetryPolicyTests
{
    [Theory]
    [InlineData(1, true)]  // First attempt, can retry
    [InlineData(2, true)]  // Second attempt, can retry (3 max)
    [InlineData(3, false)] // Third attempt, cannot retry (3 max)
    [InlineData(4, false)] // Beyond max
    public void ShouldRetry_DefaultPolicy_ReturnsExpected(int attempt, bool expected)
    {
        var policy = RetryPolicy.Default; // 3 max attempts
        Assert.Equal(expected, policy.ShouldRetry(attempt));
    }

    [Fact]
    public void ShouldRetry_NoRetryPolicy_NeverRetries()
    {
        var policy = RetryPolicy.NoRetry;
        Assert.False(policy.ShouldRetry(1));
    }

    [Theory]
    [InlineData(1, 5.0)]  // First attempt: 5 * 2^0 = 5
    [InlineData(2, 10.0)] // Second attempt: 5 * 2^1 = 10
    [InlineData(3, 20.0)] // Third attempt: 5 * 2^2 = 20
    public void CalculateBackoffSeconds_ExponentialGrowth_ReturnsExpected(int attempt, double expectedBase)
    {
        // Use a policy with no jitter for deterministic testing
        var policy = new RetryPolicy(
            MaxAttempts: 5,
            InitialBackoffSeconds: 5.0,
            MaxBackoffSeconds: 300.0,
            BackoffMultiplier: 2.0,
            JitterFactor: 0.0); // No jitter

        var backoff = policy.CalculateBackoffSeconds(attempt);
        Assert.Equal(expectedBase, backoff, precision: 1);
    }

    [Fact]
    public void CalculateBackoffSeconds_CapsAtMaximum()
    {
        var policy = new RetryPolicy(
            MaxAttempts: 10,
            InitialBackoffSeconds: 100.0,
            MaxBackoffSeconds: 200.0,
            BackoffMultiplier: 2.0,
            JitterFactor: 0.0);

        // 100 * 2^5 = 3200, but capped at 200
        var backoff = policy.CalculateBackoffSeconds(6);
        Assert.Equal(200.0, backoff);
    }

    [Fact]
    public void CalculateBackoffSeconds_WithJitter_VariesWithinRange()
    {
        var policy = new RetryPolicy(
            MaxAttempts: 5,
            InitialBackoffSeconds: 10.0,
            MaxBackoffSeconds: 300.0,
            BackoffMultiplier: 2.0,
            JitterFactor: 0.2); // 20% jitter

        // Run multiple times to verify jitter adds variance
        var backoffs = Enumerable.Range(0, 100)
            .Select(_ => policy.CalculateBackoffSeconds(1))
            .ToList();

        var minExpected = 10.0 * 0.8; // 10 - 20%
        var maxExpected = 10.0 * 1.2; // 10 + 20%

        Assert.True(backoffs.All(b => b >= minExpected && b <= maxExpected));
        // Should have some variance (not all equal)
        Assert.True(backoffs.Distinct().Count() > 1);
    }

    [Fact]
    public void CalculateNextRetryTime_ReturnsCorrectTime()
    {
        var policy = new RetryPolicy(
            MaxAttempts: 3,
            InitialBackoffSeconds: 30.0,
            MaxBackoffSeconds: 300.0,
            BackoffMultiplier: 2.0,
            JitterFactor: 0.0);

        var now = DateTimeOffset.UtcNow;
        var nextRetry = policy.CalculateNextRetryTime(1, now);

        Assert.Equal(now.AddSeconds(30), nextRetry);
    }

    [Fact]
    public void CalculateNextRetryTime_WhenExhausted_ThrowsException()
    {
        var policy = RetryPolicy.Default; // 3 max

        Assert.Throws<InvalidOperationException>(
            () => policy.CalculateNextRetryTime(3, DateTimeOffset.UtcNow));
    }

    [Fact]
    public void RetryEvaluator_WhenShouldRetry_ReturnsRetryDecision()
    {
        var policy = RetryPolicy.Default;
        var now = DateTimeOffset.UtcNow;

        var decision = RetryEvaluator.Evaluate(1, policy, now);

        Assert.True(decision.ShouldRetry);
        Assert.Equal(2, decision.NextAttempt);
        Assert.NotNull(decision.NotBefore);
        Assert.True(decision.NotBefore > now);
    }

    [Fact]
    public void RetryEvaluator_WhenExhausted_ReturnsExhaustedDecision()
    {
        var policy = RetryPolicy.Default; // 3 max
        var now = DateTimeOffset.UtcNow;

        var decision = RetryEvaluator.Evaluate(3, policy, now);

        Assert.False(decision.ShouldRetry);
        Assert.Null(decision.NotBefore);
        Assert.Contains("exhausted", decision.Reason, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public void DefaultPolicy_HasReasonableValues()
    {
        var policy = RetryPolicy.Default;

        Assert.Equal(3, policy.MaxAttempts);
        Assert.Equal(5.0, policy.InitialBackoffSeconds);
        Assert.Equal(300.0, policy.MaxBackoffSeconds);
        Assert.Equal(2.0, policy.BackoffMultiplier);
    }
}
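
// ---------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of this commit): the backoff
// curve the tests above assert -- initial * multiplier^(attempt - 1), capped
// at the maximum, then widened by +/- jitterFactor. The formula is inferred
// from the expected values (5, 10, 20 for attempts 1..3 at base 5, x2);
// where jitter is applied relative to the cap is an assumption.
// ---------------------------------------------------------------------------
public static class BackoffMathSketch
{
    public static double BackoffSeconds(
        int attempt, double initial, double max, double multiplier, double jitterFactor)
    {
        var raw = initial * Math.Pow(multiplier, attempt - 1);
        var capped = Math.Min(raw, max);
        // Uniform jitter in [1 - j, 1 + j]; zero jitter keeps it deterministic.
        var jitter = 1.0 + jitterFactor * (Random.Shared.NextDouble() * 2.0 - 1.0);
        return capped * jitter;
    }
}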
@@ -0,0 +1,531 @@
|
||||
using StellaOps.Orchestrator.Core.Domain;
|
||||
|
||||
namespace StellaOps.Orchestrator.Tests.SloManagement;
|
||||
|
||||
public class SloTests
|
||||
{
|
||||
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
|
||||
private const string TenantId = "test-tenant";
|
||||
|
||||
// =========================================================================
|
||||
// Slo Creation Tests
|
||||
// =========================================================================
|
||||
|
||||
[Fact]
|
||||
public void CreateAvailability_SetsCorrectProperties()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(
|
||||
TenantId,
|
||||
"API Availability",
|
||||
target: 0.999,
|
||||
window: SloWindow.ThirtyDays,
|
||||
createdBy: "admin",
|
||||
description: "99.9% uptime target");
|
||||
|
||||
Assert.NotEqual(Guid.Empty, slo.SloId);
|
||||
Assert.Equal(TenantId, slo.TenantId);
|
||||
Assert.Equal("API Availability", slo.Name);
|
||||
Assert.Equal("99.9% uptime target", slo.Description);
|
||||
Assert.Equal(SloType.Availability, slo.Type);
|
||||
Assert.Equal(0.999, slo.Target);
|
||||
Assert.Equal(SloWindow.ThirtyDays, slo.Window);
|
||||
Assert.True(slo.Enabled);
|
||||
Assert.Null(slo.JobType);
|
||||
Assert.Null(slo.SourceId);
|
||||
Assert.Equal("admin", slo.CreatedBy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CreateAvailability_WithJobType_SetsJobType()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(
|
||||
TenantId,
|
||||
"Scan Availability",
|
||||
0.99,
|
||||
SloWindow.SevenDays,
|
||||
"admin",
|
||||
jobType: "scan.image");
|
||||
|
||||
Assert.Equal("scan.image", slo.JobType);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CreateAvailability_WithSourceId_SetsSourceId()
|
||||
{
|
||||
var sourceId = Guid.NewGuid();
|
||||
var slo = Slo.CreateAvailability(
|
||||
TenantId,
|
||||
"Source Availability",
|
||||
0.995,
|
||||
SloWindow.OneDay,
|
||||
"admin",
|
||||
sourceId: sourceId);
|
||||
|
||||
Assert.Equal(sourceId, slo.SourceId);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CreateLatency_SetsCorrectProperties()
|
||||
{
|
||||
var slo = Slo.CreateLatency(
|
||||
TenantId,
|
||||
"API Latency P95",
|
||||
percentile: 0.95,
|
||||
targetSeconds: 0.5,
|
||||
target: 0.99,
|
||||
window: SloWindow.OneDay,
|
||||
createdBy: "admin");
|
||||
|
||||
Assert.Equal(SloType.Latency, slo.Type);
|
||||
Assert.Equal(0.95, slo.LatencyPercentile);
|
||||
Assert.Equal(0.5, slo.LatencyTargetSeconds);
|
||||
Assert.Equal(0.99, slo.Target);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void CreateThroughput_SetsCorrectProperties()
|
||||
{
|
||||
var slo = Slo.CreateThroughput(
|
||||
TenantId,
|
||||
"Scan Throughput",
|
||||
minimum: 1000,
|
||||
target: 0.95,
|
||||
window: SloWindow.OneHour,
|
||||
createdBy: "admin");
|
||||
|
||||
Assert.Equal(SloType.Throughput, slo.Type);
|
||||
Assert.Equal(1000, slo.ThroughputMinimum);
|
||||
Assert.Equal(0.95, slo.Target);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Slo Validation Tests
|
||||
// =========================================================================
|
||||
|
||||
[Theory]
|
||||
[InlineData(0)]
|
||||
[InlineData(-0.1)]
|
||||
[InlineData(1.1)]
|
||||
public void CreateAvailability_WithInvalidTarget_Throws(double target)
|
||||
{
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
Slo.CreateAvailability(TenantId, "Test", target, SloWindow.OneDay, "admin"));
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(-0.1)]
|
||||
[InlineData(1.1)]
|
||||
public void CreateLatency_WithInvalidPercentile_Throws(double percentile)
|
||||
{
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
Slo.CreateLatency(TenantId, "Test", percentile, 1.0, 0.99, SloWindow.OneDay, "admin"));
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(0)]
|
||||
[InlineData(-1.0)]
|
||||
public void CreateLatency_WithInvalidTargetSeconds_Throws(double targetSeconds)
|
||||
{
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
Slo.CreateLatency(TenantId, "Test", 0.95, targetSeconds, 0.99, SloWindow.OneDay, "admin"));
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(0)]
|
||||
[InlineData(-1)]
|
||||
public void CreateThroughput_WithInvalidMinimum_Throws(int minimum)
|
||||
{
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
Slo.CreateThroughput(TenantId, "Test", minimum, 0.99, SloWindow.OneDay, "admin"));
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Error Budget Tests
|
||||
// =========================================================================
|
||||
|
||||
[Theory]
|
||||
[InlineData(0.999, 0.001)]
|
||||
[InlineData(0.99, 0.01)]
|
||||
[InlineData(0.95, 0.05)]
|
||||
[InlineData(0.9, 0.1)]
|
||||
public void ErrorBudget_CalculatesCorrectly(double target, double expectedBudget)
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Test", target, SloWindow.OneDay, "admin");
|
||||
|
||||
Assert.Equal(expectedBudget, slo.ErrorBudget, precision: 10);
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Window Duration Tests
|
||||
// =========================================================================
|
||||
|
||||
[Theory]
|
||||
[InlineData(SloWindow.OneHour, 1)]
|
||||
[InlineData(SloWindow.OneDay, 24)]
|
||||
[InlineData(SloWindow.SevenDays, 168)]
|
||||
[InlineData(SloWindow.ThirtyDays, 720)]
|
||||
public void GetWindowDuration_ReturnsCorrectHours(SloWindow window, int expectedHours)
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, window, "admin");
|
||||
|
||||
Assert.Equal(TimeSpan.FromHours(expectedHours), slo.GetWindowDuration());
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Update Tests
|
||||
// =========================================================================
|
||||
|
||||
[Fact]
|
||||
public void Update_UpdatesOnlySpecifiedFields()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Original", 0.99, SloWindow.OneDay, "admin");
|
||||
|
||||
var updated = slo.Update(name: "Updated", updatedBy: "operator");
|
||||
|
||||
Assert.Equal("Updated", updated.Name);
|
||||
Assert.Equal(0.99, updated.Target); // Unchanged
|
||||
Assert.True(updated.Enabled); // Unchanged
|
||||
Assert.Equal("operator", updated.UpdatedBy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Update_WithNewTarget_UpdatesTarget()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin");
|
||||
|
||||
var updated = slo.Update(target: 0.999, updatedBy: "operator");
|
||||
|
||||
Assert.Equal(0.999, updated.Target);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Update_WithInvalidTarget_Throws()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin");
|
||||
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
slo.Update(target: 1.5, updatedBy: "operator"));
|
||||
}
|
||||
|
||||
// =========================================================================
|
||||
// Enable/Disable Tests
|
||||
// =========================================================================
|
||||
|
||||
[Fact]
|
||||
public void Disable_SetsEnabledToFalse()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin");
|
||||
|
||||
var disabled = slo.Disable("operator");
|
||||
|
||||
Assert.False(disabled.Enabled);
|
||||
Assert.Equal("operator", disabled.UpdatedBy);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Enable_SetsEnabledToTrue()
|
||||
{
|
||||
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin")
|
||||
.Disable("operator");
|
||||
|
||||
var enabled = slo.Enable("operator");
|
||||
|
||||
Assert.True(enabled.Enabled);
|
||||
}
|
||||
}
|
||||
|
||||
public class SloStateTests
|
||||
{
|
||||
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
|
||||
private const string TenantId = "test-tenant";
|
||||
|
||||
[Fact]
|
||||
public void NoData_CreatesCorrectState()
|
||||
{
|
||||
var sloId = Guid.NewGuid();
|
||||
|
||||
var state = SloState.NoData(sloId, TenantId, BaseTime, SloWindow.OneDay);
|
||||
|
||||
Assert.Equal(sloId, state.SloId);
|
||||
Assert.Equal(TenantId, state.TenantId);
|
||||
Assert.Equal(1.0, state.CurrentSli);
|
||||
Assert.Equal(0, state.TotalEvents);
|
||||
Assert.Equal(0, state.GoodEvents);
|
||||
Assert.Equal(0, state.BadEvents);
|
||||
Assert.Equal(0, state.BudgetConsumed);
|
||||
Assert.Equal(1.0, state.BudgetRemaining);
|
||||
Assert.Equal(0, state.BurnRate);
|
||||
Assert.Null(state.TimeToExhaustion);
|
||||
Assert.True(state.IsMet);
|
||||
Assert.Equal(AlertSeverity.Info, state.AlertSeverity);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(SloWindow.OneHour)]
|
||||
[InlineData(SloWindow.OneDay)]
|
||||
[InlineData(SloWindow.SevenDays)]
|
||||
[InlineData(SloWindow.ThirtyDays)]
|
||||
public void NoData_SetsCorrectWindowBounds(SloWindow window)
|
||||
{
|
||||
var state = SloState.NoData(Guid.NewGuid(), TenantId, BaseTime, window);
|
||||
|
||||
Assert.Equal(BaseTime, state.WindowEnd);
|
||||
Assert.True(state.WindowStart < state.WindowEnd);
|
||||
}
|
||||
}
|
||||
|
||||
public class AlertBudgetThresholdTests
|
||||
{
|
||||
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
|
||||
private const string TenantId = "test-tenant";
|
||||
|
||||
[Fact]
|
||||
public void Create_SetsCorrectProperties()
|
||||
{
|
||||
var sloId = Guid.NewGuid();
|
||||
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
sloId,
|
||||
TenantId,
|
||||
budgetConsumedThreshold: 0.5,
|
||||
severity: AlertSeverity.Warning,
|
||||
createdBy: "admin");
|
||||
|
||||
Assert.NotEqual(Guid.Empty, threshold.ThresholdId);
|
||||
Assert.Equal(sloId, threshold.SloId);
|
||||
Assert.Equal(TenantId, threshold.TenantId);
|
||||
Assert.Equal(0.5, threshold.BudgetConsumedThreshold);
|
||||
Assert.Equal(AlertSeverity.Warning, threshold.Severity);
|
||||
Assert.True(threshold.Enabled);
|
||||
Assert.Null(threshold.BurnRateThreshold);
|
||||
Assert.Equal(TimeSpan.FromHours(1), threshold.Cooldown);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Create_WithBurnRateThreshold_SetsBurnRate()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(),
|
||||
TenantId,
|
||||
0.8,
|
||||
AlertSeverity.Critical,
|
||||
"admin",
|
||||
burnRateThreshold: 5.0);
|
||||
|
||||
Assert.Equal(5.0, threshold.BurnRateThreshold);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void Create_WithCustomCooldown_SetsCooldown()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(),
|
||||
TenantId,
|
||||
0.5,
|
||||
AlertSeverity.Warning,
|
||||
"admin",
|
||||
cooldown: TimeSpan.FromMinutes(30));
|
||||
|
||||
Assert.Equal(TimeSpan.FromMinutes(30), threshold.Cooldown);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(-0.1)]
|
||||
[InlineData(1.1)]
|
||||
public void Create_WithInvalidThreshold_Throws(double threshold)
|
||||
{
|
||||
Assert.Throws<ArgumentOutOfRangeException>(() =>
|
||||
AlertBudgetThreshold.Create(Guid.NewGuid(), TenantId, threshold, AlertSeverity.Warning, "admin"));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShouldTrigger_WhenDisabled_ReturnsFalse()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin")
|
||||
with { Enabled = false };
|
||||
|
||||
var state = CreateTestState(budgetConsumed: 0.6);
|
||||
|
||||
Assert.False(threshold.ShouldTrigger(state, BaseTime));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShouldTrigger_WhenBudgetExceedsThreshold_ReturnsTrue()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin");
|
||||
|
||||
var state = CreateTestState(budgetConsumed: 0.6);
|
||||
|
||||
Assert.True(threshold.ShouldTrigger(state, BaseTime));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShouldTrigger_WhenBudgetBelowThreshold_ReturnsFalse()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin");
|
||||
|
||||
var state = CreateTestState(budgetConsumed: 0.3);
|
||||
|
||||
Assert.False(threshold.ShouldTrigger(state, BaseTime));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShouldTrigger_WhenBurnRateExceedsThreshold_ReturnsTrue()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.9, AlertSeverity.Critical, "admin",
|
||||
burnRateThreshold: 3.0);
|
||||
|
||||
var state = CreateTestState(budgetConsumed: 0.3, burnRate: 4.0);
|
||||
|
||||
Assert.True(threshold.ShouldTrigger(state, BaseTime));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShouldTrigger_WhenWithinCooldown_ReturnsFalse()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin")
|
||||
with { LastTriggeredAt = BaseTime, Cooldown = TimeSpan.FromHours(1) };
|
||||
|
||||
var state = CreateTestState(budgetConsumed: 0.6);
|
||||
|
||||
Assert.False(threshold.ShouldTrigger(state, BaseTime.AddMinutes(30)));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ShouldTrigger_WhenCooldownExpired_ReturnsTrue()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin")
|
||||
with { LastTriggeredAt = BaseTime, Cooldown = TimeSpan.FromHours(1) };
|
||||
|
||||
var state = CreateTestState(budgetConsumed: 0.6);
|
||||
|
||||
Assert.True(threshold.ShouldTrigger(state, BaseTime.AddMinutes(90)));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RecordTrigger_UpdatesLastTriggeredAt()
|
||||
{
|
||||
var threshold = AlertBudgetThreshold.Create(
|
||||
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin");
|
||||
|
||||
var updated = threshold.RecordTrigger(BaseTime);
|
||||
|
||||
Assert.Equal(BaseTime, updated.LastTriggeredAt);
|
||||
Assert.Equal(BaseTime, updated.UpdatedAt);
|
||||
}
|
||||
|
||||
private static SloState CreateTestState(double budgetConsumed = 0.5, double burnRate = 1.0) =>
|
||||
new(
|
||||
SloId: Guid.NewGuid(),
|
||||
TenantId: TenantId,
|
||||
CurrentSli: 0.99,
|
||||
TotalEvents: 1000,
|
||||
GoodEvents: 990,
|
||||
BadEvents: 10,
|
||||
BudgetConsumed: budgetConsumed,
|
||||
BudgetRemaining: 1 - budgetConsumed,
|
||||
BurnRate: burnRate,
|
||||
TimeToExhaustion: TimeSpan.FromHours(10),
|
||||
IsMet: true,
|
||||
AlertSeverity: AlertSeverity.Info,
|
||||
ComputedAt: BaseTime,
|
||||
WindowStart: BaseTime.AddDays(-1),
|
||||
WindowEnd: BaseTime);
|
||||
}
|
||||
|
||||
public class SloAlertTests
{
    private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
    private const string TenantId = "test-tenant";

    [Fact]
    public void Create_FromSloAndState_CreatesAlert()
    {
        var slo = Slo.CreateAvailability(TenantId, "API Availability", 0.999, SloWindow.ThirtyDays, "admin");
        var state = CreateTestState(slo.SloId, budgetConsumed: 0.8);
        var threshold = AlertBudgetThreshold.Create(slo.SloId, TenantId, 0.5, AlertSeverity.Warning, "admin");

        var alert = SloAlert.Create(slo, state, threshold);

        Assert.NotEqual(Guid.Empty, alert.AlertId);
        Assert.Equal(slo.SloId, alert.SloId);
        Assert.Equal(threshold.ThresholdId, alert.ThresholdId);
        Assert.Equal(TenantId, alert.TenantId);
        Assert.Equal(AlertSeverity.Warning, alert.Severity);
        Assert.Contains("API Availability", alert.Message);
        Assert.Equal(0.8, alert.BudgetConsumed);
        Assert.False(alert.IsAcknowledged);
        Assert.False(alert.IsResolved);
    }

    [Fact]
    public void Create_WithBurnRateTrigger_IncludesBurnRateInMessage()
    {
        var slo = Slo.CreateAvailability(TenantId, "Test SLO", 0.99, SloWindow.OneDay, "admin");
        var state = CreateTestState(slo.SloId, budgetConsumed: 0.3, burnRate: 6.0);
        var threshold = AlertBudgetThreshold.Create(slo.SloId, TenantId, 0.9, AlertSeverity.Critical, "admin",
            burnRateThreshold: 5.0);

        var alert = SloAlert.Create(slo, state, threshold);

        Assert.Contains("burn rate", alert.Message);
        Assert.Contains("6.00", alert.Message);
    }

    [Fact]
    public void Acknowledge_SetsAcknowledgedFields()
    {
        var alert = CreateTestAlert();

        var acknowledged = alert.Acknowledge("operator", BaseTime.AddHours(1));

        Assert.True(acknowledged.IsAcknowledged);
        Assert.Equal(BaseTime.AddHours(1), acknowledged.AcknowledgedAt);
        Assert.Equal("operator", acknowledged.AcknowledgedBy);
        Assert.False(acknowledged.IsResolved);
    }

    [Fact]
    public void Resolve_SetsResolvedFields()
    {
        var alert = CreateTestAlert();

        var resolved = alert.Resolve("Fixed by scaling up", BaseTime.AddHours(2));

        Assert.True(resolved.IsResolved);
        Assert.Equal(BaseTime.AddHours(2), resolved.ResolvedAt);
        Assert.Equal("Fixed by scaling up", resolved.ResolutionNotes);
    }

    private static SloAlert CreateTestAlert()
    {
        var slo = Slo.CreateAvailability(TenantId, "Test SLO", 0.99, SloWindow.OneDay, "admin");
        var state = CreateTestState(slo.SloId, budgetConsumed: 0.6);
        var threshold = AlertBudgetThreshold.Create(slo.SloId, TenantId, 0.5, AlertSeverity.Warning, "admin");
        return SloAlert.Create(slo, state, threshold);
    }

    private static SloState CreateTestState(Guid sloId, double budgetConsumed = 0.5, double burnRate = 1.0) =>
        new(
            SloId: sloId,
            TenantId: TenantId,
            CurrentSli: 0.99,
            TotalEvents: 1000,
            GoodEvents: 990,
            BadEvents: 10,
            BudgetConsumed: budgetConsumed,
            BudgetRemaining: 1 - budgetConsumed,
            BurnRate: burnRate,
            TimeToExhaustion: TimeSpan.FromHours(10),
            IsMet: true,
            AlertSeverity: AlertSeverity.Info,
            ComputedAt: BaseTime,
            WindowStart: BaseTime.AddDays(-1),
            WindowEnd: BaseTime);
}
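Acknowledge and Resolve are non-destructive in these tests (acknowledging leaves IsResolved false), which is exactly what record with-expressions give for free. A minimal sketch under that assumption; only the member names are taken from the assertions above, the bodies are assumptions:

    public SloAlert Acknowledge(string acknowledgedBy, DateTimeOffset now) =>
        this with { AcknowledgedAt = now, AcknowledgedBy = acknowledgedBy };

    public SloAlert Resolve(string notes, DateTimeOffset now) =>
        this with { ResolvedAt = now, ResolutionNotes = notes };

    // The boolean views the tests assert on can then be derived:
    public bool IsAcknowledged => AcknowledgedAt is not null;
    public bool IsResolved => ResolvedAt is not null;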
@@ -0,0 +1,338 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

namespace StellaOps.Orchestrator.WebService.Contracts;

// ===== Audit Contracts =====

/// <summary>
/// Response for an audit entry.
/// </summary>
public sealed record AuditEntryResponse(
    Guid EntryId,
    string TenantId,
    string EventType,
    string ResourceType,
    Guid ResourceId,
    string ActorId,
    string ActorType,
    string? ActorIp,
    string? UserAgent,
    string? HttpMethod,
    string? RequestPath,
    string? OldState,
    string? NewState,
    string Description,
    string? CorrelationId,
    string? PreviousEntryHash,
    string ContentHash,
    long SequenceNumber,
    DateTimeOffset OccurredAt,
    string? Metadata)
{
    public static AuditEntryResponse FromDomain(AuditEntry entry) => new(
        EntryId: entry.EntryId,
        TenantId: entry.TenantId,
        EventType: entry.EventType.ToString(),
        ResourceType: entry.ResourceType,
        ResourceId: entry.ResourceId,
        ActorId: entry.ActorId,
        ActorType: entry.ActorType.ToString(),
        ActorIp: entry.ActorIp,
        UserAgent: entry.UserAgent,
        HttpMethod: entry.HttpMethod,
        RequestPath: entry.RequestPath,
        OldState: entry.OldState,
        NewState: entry.NewState,
        Description: entry.Description,
        CorrelationId: entry.CorrelationId,
        PreviousEntryHash: entry.PreviousEntryHash,
        ContentHash: entry.ContentHash,
        SequenceNumber: entry.SequenceNumber,
        OccurredAt: entry.OccurredAt,
        Metadata: entry.Metadata);
}

/// <summary>
/// List response for audit entries.
/// </summary>
public sealed record AuditEntryListResponse(
    IReadOnlyList<AuditEntryResponse> Entries,
    string? NextCursor);

/// <summary>
/// Response for audit summary.
/// </summary>
public sealed record AuditSummaryResponse(
    long TotalEntries,
    long EntriesSince,
    long EventTypes,
    long UniqueActors,
    long UniqueResources,
    DateTimeOffset? EarliestEntry,
    DateTimeOffset? LatestEntry)
{
    public static AuditSummaryResponse FromDomain(AuditSummary summary) => new(
        TotalEntries: summary.TotalEntries,
        EntriesSince: summary.EntriesSince,
        EventTypes: summary.EventTypes,
        UniqueActors: summary.UniqueActors,
        UniqueResources: summary.UniqueResources,
        EarliestEntry: summary.EarliestEntry,
        LatestEntry: summary.LatestEntry);
}

/// <summary>
/// Response for chain verification.
/// </summary>
public sealed record ChainVerificationResponse(
    bool IsValid,
    Guid? InvalidEntryId,
    long? InvalidSequence,
    string? ErrorMessage)
{
    public static ChainVerificationResponse FromDomain(ChainVerificationResult result) => new(
        IsValid: result.IsValid,
        InvalidEntryId: result.InvalidEntryId,
        InvalidSequence: result.InvalidSequence,
        ErrorMessage: result.ErrorMessage);
}
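The hash-chain fields above (SequenceNumber, PreviousEntryHash, ContentHash) are what ChainVerificationResponse reports on. A minimal sketch of the verification walk, assuming entries arrive ordered by sequence number; RecomputeContentHash is a hypothetical helper that re-derives ContentHash from the entry payload, and the positional ChainVerificationResult constructor mirroring the response record is likewise an assumption:

    public static ChainVerificationResult Verify(IReadOnlyList<AuditEntry> entries)
    {
        string? previousHash = null;

        foreach (var entry in entries)
        {
            // Each entry must point back at the hash of its predecessor...
            if (entry.PreviousEntryHash != previousHash)
                return Invalid(entry, "previous-entry hash mismatch");

            // ...and its own content hash must still match its payload.
            if (RecomputeContentHash(entry) != entry.ContentHash)   // hypothetical re-hash
                return Invalid(entry, "content hash mismatch");

            previousHash = entry.ContentHash;
        }

        return new ChainVerificationResult(true, null, null, null);

        static ChainVerificationResult Invalid(AuditEntry entry, string reason) =>
            new(false, entry.EntryId, entry.SequenceNumber, reason);
    }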
// ===== Ledger Contracts =====

/// <summary>
/// Response for a ledger entry.
/// </summary>
public sealed record LedgerEntryResponse(
    Guid LedgerId,
    string TenantId,
    Guid RunId,
    Guid SourceId,
    string RunType,
    string FinalStatus,
    int TotalJobs,
    int SucceededJobs,
    int FailedJobs,
    DateTimeOffset RunCreatedAt,
    DateTimeOffset? RunStartedAt,
    DateTimeOffset RunCompletedAt,
    long ExecutionDurationMs,
    string InitiatedBy,
    string InputDigest,
    string OutputDigest,
    long SequenceNumber,
    string? PreviousEntryHash,
    string ContentHash,
    DateTimeOffset LedgerCreatedAt,
    string? CorrelationId)
{
    public static LedgerEntryResponse FromDomain(RunLedgerEntry entry) => new(
        LedgerId: entry.LedgerId,
        TenantId: entry.TenantId,
        RunId: entry.RunId,
        SourceId: entry.SourceId,
        RunType: entry.RunType,
        FinalStatus: entry.FinalStatus.ToString(),
        TotalJobs: entry.TotalJobs,
        SucceededJobs: entry.SucceededJobs,
        FailedJobs: entry.FailedJobs,
        RunCreatedAt: entry.RunCreatedAt,
        RunStartedAt: entry.RunStartedAt,
        RunCompletedAt: entry.RunCompletedAt,
        ExecutionDurationMs: (long)entry.ExecutionDuration.TotalMilliseconds,
        InitiatedBy: entry.InitiatedBy,
        InputDigest: entry.InputDigest,
        OutputDigest: entry.OutputDigest,
        SequenceNumber: entry.SequenceNumber,
        PreviousEntryHash: entry.PreviousEntryHash,
        ContentHash: entry.ContentHash,
        LedgerCreatedAt: entry.LedgerCreatedAt,
        CorrelationId: entry.CorrelationId);
}

/// <summary>
/// List response for ledger entries.
/// </summary>
public sealed record LedgerEntryListResponse(
    IReadOnlyList<LedgerEntryResponse> Entries,
    string? NextCursor);

/// <summary>
/// Response for ledger summary.
/// </summary>
public sealed record LedgerSummaryResponse(
    long TotalEntries,
    long EntriesSince,
    long TotalRuns,
    long SuccessfulRuns,
    long FailedRuns,
    long TotalJobs,
    long UniqueSources,
    long UniqueRunTypes,
    DateTimeOffset? EarliestEntry,
    DateTimeOffset? LatestEntry)
{
    public static LedgerSummaryResponse FromDomain(LedgerSummary summary) => new(
        TotalEntries: summary.TotalEntries,
        EntriesSince: summary.EntriesSince,
        TotalRuns: summary.TotalRuns,
        SuccessfulRuns: summary.SuccessfulRuns,
        FailedRuns: summary.FailedRuns,
        TotalJobs: summary.TotalJobs,
        UniqueSources: summary.UniqueSources,
        UniqueRunTypes: summary.UniqueRunTypes,
        EarliestEntry: summary.EarliestEntry,
        LatestEntry: summary.LatestEntry);
}
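A small usage sketch: the summary fields compose directly into the rates a dashboard needs, guarding the empty-window case:

    // Success rate over the summarized window; 1.0 when no runs were recorded.
    static double SuccessRate(LedgerSummaryResponse summary) =>
        summary.TotalRuns == 0
            ? 1.0
            : (double)summary.SuccessfulRuns / summary.TotalRuns;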
// ===== Export Contracts =====

/// <summary>
/// Request to create a ledger export.
/// </summary>
public sealed record CreateLedgerExportRequest(
    string Format,
    DateTimeOffset? StartTime,
    DateTimeOffset? EndTime,
    string? RunTypeFilter,
    Guid? SourceIdFilter);

/// <summary>
/// Response for a ledger export.
/// </summary>
public sealed record LedgerExportResponse(
    Guid ExportId,
    string TenantId,
    string Status,
    string Format,
    DateTimeOffset? StartTime,
    DateTimeOffset? EndTime,
    string? RunTypeFilter,
    Guid? SourceIdFilter,
    int EntryCount,
    string? OutputUri,
    string? OutputDigest,
    long? OutputSizeBytes,
    string RequestedBy,
    DateTimeOffset RequestedAt,
    DateTimeOffset? StartedAt,
    DateTimeOffset? CompletedAt,
    string? ErrorMessage)
{
    public static LedgerExportResponse FromDomain(LedgerExport export) => new(
        ExportId: export.ExportId,
        TenantId: export.TenantId,
        Status: export.Status.ToString(),
        Format: export.Format,
        StartTime: export.StartTime,
        EndTime: export.EndTime,
        RunTypeFilter: export.RunTypeFilter,
        SourceIdFilter: export.SourceIdFilter,
        EntryCount: export.EntryCount,
        OutputUri: export.OutputUri,
        OutputDigest: export.OutputDigest,
        OutputSizeBytes: export.OutputSizeBytes,
        RequestedBy: export.RequestedBy,
        RequestedAt: export.RequestedAt,
        StartedAt: export.StartedAt,
        CompletedAt: export.CompletedAt,
        ErrorMessage: export.ErrorMessage);
}

/// <summary>
/// List response for ledger exports.
/// </summary>
public sealed record LedgerExportListResponse(
    IReadOnlyList<LedgerExportResponse> Exports,
    string? NextCursor);
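Exports run asynchronously (RequestedAt, StartedAt, CompletedAt), so a caller typically polls until the status settles. A sketch against a caller-supplied fetch delegate; the terminal status strings ("Completed", "Failed") are assumptions about the Status enum, not confirmed by this diff:

    static async Task<LedgerExportResponse> WaitForExportAsync(
        Guid exportId,
        Func<Guid, Task<LedgerExportResponse>> fetchAsync,
        CancellationToken cancellationToken)
    {
        while (true)
        {
            var export = await fetchAsync(exportId);
            if (export.Status is "Completed" or "Failed")
                return export;

            // Back off between polls; tune to the expected export size.
            await Task.Delay(TimeSpan.FromSeconds(5), cancellationToken);
        }
    }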
// ===== Manifest Contracts =====

/// <summary>
/// Response for a signed manifest.
/// </summary>
public sealed record ManifestResponse(
    Guid ManifestId,
    string SchemaVersion,
    string TenantId,
    string ProvenanceType,
    Guid SubjectId,
    string PayloadDigest,
    string SignatureAlgorithm,
    bool IsSigned,
    bool IsExpired,
    string KeyId,
    DateTimeOffset CreatedAt,
    DateTimeOffset? ExpiresAt)
{
    public static ManifestResponse FromDomain(SignedManifest manifest) => new(
        ManifestId: manifest.ManifestId,
        SchemaVersion: manifest.SchemaVersion,
        TenantId: manifest.TenantId,
        ProvenanceType: manifest.ProvenanceType.ToString(),
        SubjectId: manifest.SubjectId,
        PayloadDigest: manifest.PayloadDigest,
        SignatureAlgorithm: manifest.SignatureAlgorithm,
        IsSigned: manifest.IsSigned,
        IsExpired: manifest.IsExpired,
        KeyId: manifest.KeyId,
        CreatedAt: manifest.CreatedAt,
        ExpiresAt: manifest.ExpiresAt);
}

/// <summary>
/// Response with full manifest details including statements and artifacts.
/// </summary>
public sealed record ManifestDetailResponse(
    Guid ManifestId,
    string SchemaVersion,
    string TenantId,
    string ProvenanceType,
    Guid SubjectId,
    string Statements,
    string Artifacts,
    string Materials,
    string? BuildInfo,
    string PayloadDigest,
    string SignatureAlgorithm,
    string Signature,
    string KeyId,
    DateTimeOffset CreatedAt,
    DateTimeOffset? ExpiresAt,
    string? Metadata)
{
    public static ManifestDetailResponse FromDomain(SignedManifest manifest) => new(
        ManifestId: manifest.ManifestId,
        SchemaVersion: manifest.SchemaVersion,
        TenantId: manifest.TenantId,
        ProvenanceType: manifest.ProvenanceType.ToString(),
        SubjectId: manifest.SubjectId,
        Statements: manifest.Statements,
        Artifacts: manifest.Artifacts,
        Materials: manifest.Materials,
        BuildInfo: manifest.BuildInfo,
        PayloadDigest: manifest.PayloadDigest,
        SignatureAlgorithm: manifest.SignatureAlgorithm,
        Signature: manifest.Signature,
        KeyId: manifest.KeyId,
        CreatedAt: manifest.CreatedAt,
        ExpiresAt: manifest.ExpiresAt,
        Metadata: manifest.Metadata);
}

/// <summary>
/// List response for manifests.
/// </summary>
public sealed record ManifestListResponse(
    IReadOnlyList<ManifestResponse> Manifests,
    string? NextCursor);

/// <summary>
/// Response for manifest verification.
/// </summary>
public sealed record ManifestVerificationResponse(
    Guid ManifestId,
    bool PayloadIntegrityValid,
    bool IsExpired,
    bool IsSigned,
    string? ValidationError);
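PayloadIntegrityValid in the verification response is separable from the expiry and signature checks. A minimal sketch of the digest comparison, assuming PayloadDigest is a "sha256:"-prefixed lowercase hex digest of the canonical payload bytes (both the prefix and the canonicalization are assumptions):

    using System.Security.Cryptography;

    static bool PayloadIntegrityValid(byte[] canonicalPayload, string payloadDigest)
    {
        // Recompute the digest and compare against the stored value.
        var hex = Convert.ToHexString(SHA256.HashData(canonicalPayload))
            .ToLowerInvariant();
        return payloadDigest == $"sha256:{hex}";
    }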
@@ -0,0 +1,46 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.WebService.Contracts;

/// <summary>
/// Response representing a DAG edge (job dependency).
/// </summary>
public sealed record DagEdgeResponse(
    Guid EdgeId,
    Guid RunId,
    Guid ParentJobId,
    Guid ChildJobId,
    string EdgeType,
    DateTimeOffset CreatedAt)
{
    public static DagEdgeResponse FromDomain(DagEdge edge) => new(
        edge.EdgeId,
        edge.RunId,
        edge.ParentJobId,
        edge.ChildJobId,
        edge.EdgeType,
        edge.CreatedAt);
}

/// <summary>
/// Response containing the DAG structure for a run.
/// </summary>
public sealed record DagResponse(
    Guid RunId,
    IReadOnlyList<DagEdgeResponse> Edges,
    IReadOnlyList<Guid> TopologicalOrder,
    IReadOnlyList<Guid> CriticalPath,
    TimeSpan? EstimatedDuration);

/// <summary>
/// Response containing a list of edges.
/// </summary>
public sealed record DagEdgeListResponse(
    IReadOnlyList<DagEdgeResponse> Edges);

/// <summary>
/// Response for blocked jobs (transitively affected by a failure).
/// </summary>
public sealed record BlockedJobsResponse(
    Guid FailedJobId,
    IReadOnlyList<Guid> BlockedJobIds);
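BlockedJobsResponse is the transitive closure of a failure over the edges above. A minimal sketch of that walk, breadth-first over parent-to-child edges (System.Linq and System.Collections.Generic assumed imported):

    static IReadOnlyList<Guid> ComputeBlockedJobs(
        Guid failedJobId,
        IReadOnlyList<DagEdgeResponse> edges)
    {
        // Index children by parent, then flood outward from the failed job.
        var childrenByParent = edges
            .GroupBy(e => e.ParentJobId)
            .ToDictionary(g => g.Key, g => g.Select(e => e.ChildJobId).ToList());

        var blocked = new HashSet<Guid>();
        var queue = new Queue<Guid>();
        queue.Enqueue(failedJobId);

        while (queue.Count > 0)
        {
            if (!childrenByParent.TryGetValue(queue.Dequeue(), out var children))
                continue;
            foreach (var child in children)
                if (blocked.Add(child))
                    queue.Enqueue(child);
        }

        return blocked.ToList();
    }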
@@ -0,0 +1,121 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.WebService.Contracts;

/// <summary>
/// Response representing a job.
/// </summary>
public sealed record JobResponse(
    Guid JobId,
    Guid? RunId,
    string JobType,
    string Status,
    int Priority,
    int Attempt,
    int MaxAttempts,
    string? CorrelationId,
    string? WorkerId,
    string? TaskRunnerId,
    DateTimeOffset CreatedAt,
    DateTimeOffset? ScheduledAt,
    DateTimeOffset? LeasedAt,
    DateTimeOffset? CompletedAt,
    DateTimeOffset? NotBefore,
    string? Reason,
    Guid? ReplayOf,
    string CreatedBy)
{
    public static JobResponse FromDomain(Job job) => new(
        job.JobId,
        job.RunId,
        job.JobType,
        job.Status.ToString().ToLowerInvariant(),
        job.Priority,
        job.Attempt,
        job.MaxAttempts,
        job.CorrelationId,
        job.WorkerId,
        job.TaskRunnerId,
        job.CreatedAt,
        job.ScheduledAt,
        job.LeasedAt,
        job.CompletedAt,
        job.NotBefore,
        job.Reason,
        job.ReplayOf,
        job.CreatedBy);
}

/// <summary>
/// Response representing a job with its full payload.
/// </summary>
public sealed record JobDetailResponse(
    Guid JobId,
    Guid? RunId,
    string JobType,
    string Status,
    int Priority,
    int Attempt,
    int MaxAttempts,
    string PayloadDigest,
    string Payload,
    string IdempotencyKey,
    string? CorrelationId,
    Guid? LeaseId,
    string? WorkerId,
    string? TaskRunnerId,
    DateTimeOffset? LeaseUntil,
    DateTimeOffset CreatedAt,
    DateTimeOffset? ScheduledAt,
    DateTimeOffset? LeasedAt,
    DateTimeOffset? CompletedAt,
    DateTimeOffset? NotBefore,
    string? Reason,
    Guid? ReplayOf,
    string CreatedBy)
{
    public static JobDetailResponse FromDomain(Job job) => new(
        job.JobId,
        job.RunId,
        job.JobType,
        job.Status.ToString().ToLowerInvariant(),
        job.Priority,
        job.Attempt,
        job.MaxAttempts,
        job.PayloadDigest,
        job.Payload,
        job.IdempotencyKey,
        job.CorrelationId,
        job.LeaseId,
        job.WorkerId,
        job.TaskRunnerId,
        job.LeaseUntil,
        job.CreatedAt,
        job.ScheduledAt,
        job.LeasedAt,
        job.CompletedAt,
        job.NotBefore,
        job.Reason,
        job.ReplayOf,
        job.CreatedBy);
}

/// <summary>
/// Response containing a list of jobs.
/// </summary>
public sealed record JobListResponse(
    IReadOnlyList<JobResponse> Jobs,
    string? NextCursor);

/// <summary>
/// Summary statistics for jobs.
/// </summary>
public sealed record JobSummary(
    int TotalJobs,
    int PendingJobs,
    int ScheduledJobs,
    int LeasedJobs,
    int SucceededJobs,
    int FailedJobs,
    int CanceledJobs,
    int TimedOutJobs);
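JobSummary can be folded from a page of JobResponse items. A sketch assuming the lowercase status strings produced by FromDomain above; "timedout" for the TimedOut state is a guess at the enum name, and System.Linq is assumed imported:

    static JobSummary Summarize(IReadOnlyList<JobResponse> jobs) => new(
        TotalJobs: jobs.Count,
        PendingJobs: jobs.Count(j => j.Status == "pending"),
        ScheduledJobs: jobs.Count(j => j.Status == "scheduled"),
        LeasedJobs: jobs.Count(j => j.Status == "leased"),
        SucceededJobs: jobs.Count(j => j.Status == "succeeded"),
        FailedJobs: jobs.Count(j => j.Status == "failed"),
        CanceledJobs: jobs.Count(j => j.Status == "canceled"),
        TimedOutJobs: jobs.Count(j => j.Status == "timedout"));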
@@ -0,0 +1,22 @@
namespace StellaOps.Orchestrator.WebService.Contracts;

/// <summary>
/// Common query options for pagination.
/// </summary>
public sealed record QueryOptions
{
    /// <summary>Maximum number of results to return. Defaults to 50.</summary>
    public int Limit { get; init; } = 50;

    /// <summary>Cursor for pagination (opaque token).</summary>
    public string? Cursor { get; init; }

    /// <summary>Sort order: "asc" or "desc". Defaults to "desc".</summary>
    public string? Sort { get; init; }

    /// <summary>Only return items created after this timestamp.</summary>
    public DateTimeOffset? CreatedAfter { get; init; }

    /// <summary>Only return items created before this timestamp.</summary>
    public DateTimeOffset? CreatedBefore { get; init; }
}
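Cursor plus NextCursor gives straightforward page draining. A sketch with a caller-supplied fetch delegate, using the JobListResponse contract defined earlier in this diff; the loop terminates because the service returns a null NextCursor on the final page:

    static async Task<List<JobResponse>> FetchAllJobsAsync(
        Func<QueryOptions, Task<JobListResponse>> fetchPageAsync)
    {
        var all = new List<JobResponse>();
        string? cursor = null;

        do
        {
            var page = await fetchPageAsync(new QueryOptions { Limit = 50, Cursor = cursor });
            all.AddRange(page.Jobs);
            cursor = page.NextCursor;   // null on the last page
        } while (cursor is not null);

        return all;
    }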
@@ -0,0 +1,352 @@
using StellaOps.Orchestrator.Core.Domain;

namespace StellaOps.Orchestrator.WebService.Contracts;

// ============================================================================
// Quota Contracts
// ============================================================================

/// <summary>
/// Request to create a quota.
/// </summary>
public sealed record CreateQuotaRequest(
    string? JobType,
    int MaxActive,
    int MaxPerHour,
    int BurstCapacity,
    double RefillRate);

/// <summary>
/// Request to update a quota.
/// </summary>
public sealed record UpdateQuotaRequest(
    int? MaxActive,
    int? MaxPerHour,
    int? BurstCapacity,
    double? RefillRate);

/// <summary>
/// Request to pause a quota.
/// </summary>
public sealed record PauseQuotaRequest(
    string Reason,
    string? Ticket);

/// <summary>
/// Response for a quota.
/// </summary>
public sealed record QuotaResponse(
    Guid QuotaId,
    string TenantId,
    string? JobType,
    int MaxActive,
    int MaxPerHour,
    int BurstCapacity,
    double RefillRate,
    double CurrentTokens,
    int CurrentActive,
    int CurrentHourCount,
    bool Paused,
    string? PauseReason,
    string? QuotaTicket,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt,
    string UpdatedBy)
{
    public static QuotaResponse FromDomain(Quota quota) =>
        new(
            QuotaId: quota.QuotaId,
            TenantId: quota.TenantId,
            JobType: quota.JobType,
            MaxActive: quota.MaxActive,
            MaxPerHour: quota.MaxPerHour,
            BurstCapacity: quota.BurstCapacity,
            RefillRate: quota.RefillRate,
            CurrentTokens: quota.CurrentTokens,
            CurrentActive: quota.CurrentActive,
            CurrentHourCount: quota.CurrentHourCount,
            Paused: quota.Paused,
            PauseReason: quota.PauseReason,
            QuotaTicket: quota.QuotaTicket,
            CreatedAt: quota.CreatedAt,
            UpdatedAt: quota.UpdatedAt,
            UpdatedBy: quota.UpdatedBy);
}

/// <summary>
/// Response for quota list.
/// </summary>
public sealed record QuotaListResponse(
    IReadOnlyList<QuotaResponse> Items,
    string? NextCursor);
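BurstCapacity, RefillRate, and CurrentTokens describe a token bucket. A minimal sketch of the refill-then-admit step, assuming RefillRate is tokens per second (the unit is not stated in this diff) and one token per admitted job:

    static (double Tokens, bool Admitted) TryAdmit(
        double currentTokens,
        double refillRate,
        int burstCapacity,
        DateTimeOffset lastRefill,
        DateTimeOffset now)
    {
        // Refill for the elapsed time, capped at the burst capacity...
        var elapsedSeconds = (now - lastRefill).TotalSeconds;
        var tokens = Math.Min(burstCapacity, currentTokens + elapsedSeconds * refillRate);

        // ...then admit only if a whole token is available.
        return tokens >= 1.0 ? (tokens - 1.0, true) : (tokens, false);
    }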
// ============================================================================
// SLO Contracts
// ============================================================================

/// <summary>
/// Request to create an SLO.
/// </summary>
public sealed record CreateSloRequest(
    string Name,
    string? Description,
    string Type,
    string? JobType,
    Guid? SourceId,
    double Target,
    string Window,
    double? LatencyPercentile,
    double? LatencyTargetSeconds,
    int? ThroughputMinimum);

/// <summary>
/// Request to update an SLO.
/// </summary>
public sealed record UpdateSloRequest(
    string? Name,
    string? Description,
    double? Target,
    bool? Enabled);

/// <summary>
/// Response for an SLO.
/// </summary>
public sealed record SloResponse(
    Guid SloId,
    string TenantId,
    string Name,
    string? Description,
    string Type,
    string? JobType,
    Guid? SourceId,
    double Target,
    string Window,
    double ErrorBudget,
    double? LatencyPercentile,
    double? LatencyTargetSeconds,
    int? ThroughputMinimum,
    bool Enabled,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt)
{
    public static SloResponse FromDomain(Slo slo) =>
        new(
            SloId: slo.SloId,
            TenantId: slo.TenantId,
            Name: slo.Name,
            Description: slo.Description,
            Type: slo.Type.ToString().ToLowerInvariant(),
            JobType: slo.JobType,
            SourceId: slo.SourceId,
            Target: slo.Target,
            Window: FormatWindow(slo.Window),
            ErrorBudget: slo.ErrorBudget,
            LatencyPercentile: slo.LatencyPercentile,
            LatencyTargetSeconds: slo.LatencyTargetSeconds,
            ThroughputMinimum: slo.ThroughputMinimum,
            Enabled: slo.Enabled,
            CreatedAt: slo.CreatedAt,
            UpdatedAt: slo.UpdatedAt);

    private static string FormatWindow(SloWindow window) => window switch
    {
        SloWindow.OneHour => "1h",
        SloWindow.OneDay => "1d",
        SloWindow.SevenDays => "7d",
        SloWindow.ThirtyDays => "30d",
        _ => window.ToString()
    };
}

/// <summary>
/// Response for SLO list.
/// </summary>
public sealed record SloListResponse(
    IReadOnlyList<SloResponse> Items,
    string? NextCursor);

/// <summary>
/// Response for SLO state (current metrics).
/// </summary>
public sealed record SloStateResponse(
    Guid SloId,
    double CurrentSli,
    long TotalEvents,
    long GoodEvents,
    long BadEvents,
    double BudgetConsumed,
    double BudgetRemaining,
    double BurnRate,
    double? TimeToExhaustionSeconds,
    bool IsMet,
    string AlertSeverity,
    DateTimeOffset ComputedAt,
    DateTimeOffset WindowStart,
    DateTimeOffset WindowEnd)
{
    public static SloStateResponse FromDomain(SloState state) =>
        new(
            SloId: state.SloId,
            CurrentSli: state.CurrentSli,
            TotalEvents: state.TotalEvents,
            GoodEvents: state.GoodEvents,
            BadEvents: state.BadEvents,
            BudgetConsumed: state.BudgetConsumed,
            BudgetRemaining: state.BudgetRemaining,
            BurnRate: state.BurnRate,
            TimeToExhaustionSeconds: state.TimeToExhaustion?.TotalSeconds,
            IsMet: state.IsMet,
            AlertSeverity: state.AlertSeverity.ToString().ToLowerInvariant(),
            ComputedAt: state.ComputedAt,
            WindowStart: state.WindowStart,
            WindowEnd: state.WindowEnd);
}

/// <summary>
/// Response with SLO and its current state.
/// </summary>
public sealed record SloWithStateResponse(
    SloResponse Slo,
    SloStateResponse State);
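The state fields follow standard error-budget accounting: the budget is 1 - Target, BudgetConsumed is the fraction of that budget already spent, and BurnRate is consumption speed relative to the window (a rate of 1.0 exhausts the budget in exactly one window). A sketch of the TimeToExhaustionSeconds projection under that reading; the service's exact formula is an assumption:

    static TimeSpan? TimeToExhaustion(double budgetRemaining, double burnRate, TimeSpan window)
    {
        if (burnRate <= 0)
            return null;   // nothing is burning, so the budget never exhausts

        // Proportional projection: the remaining budget stretched over one
        // window, shortened by how much faster than 1x it is burning.
        return window * (budgetRemaining / burnRate);
    }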
// ============================================================================
// Alert Threshold Contracts
// ============================================================================

/// <summary>
/// Request to create an alert threshold.
/// </summary>
public sealed record CreateAlertThresholdRequest(
    double BudgetConsumedThreshold,
    double? BurnRateThreshold,
    string Severity,
    string? NotificationChannel,
    string? NotificationEndpoint,
    int? CooldownMinutes);

/// <summary>
/// Response for an alert threshold.
/// </summary>
public sealed record AlertThresholdResponse(
    Guid ThresholdId,
    Guid SloId,
    double BudgetConsumedThreshold,
    double? BurnRateThreshold,
    string Severity,
    bool Enabled,
    string? NotificationChannel,
    string? NotificationEndpoint,
    int CooldownMinutes,
    DateTimeOffset? LastTriggeredAt,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt)
{
    public static AlertThresholdResponse FromDomain(AlertBudgetThreshold threshold) =>
        new(
            ThresholdId: threshold.ThresholdId,
            SloId: threshold.SloId,
            BudgetConsumedThreshold: threshold.BudgetConsumedThreshold,
            BurnRateThreshold: threshold.BurnRateThreshold,
            Severity: threshold.Severity.ToString().ToLowerInvariant(),
            Enabled: threshold.Enabled,
            NotificationChannel: threshold.NotificationChannel,
            NotificationEndpoint: threshold.NotificationEndpoint,
            CooldownMinutes: (int)threshold.Cooldown.TotalMinutes,
            LastTriggeredAt: threshold.LastTriggeredAt,
            CreatedAt: threshold.CreatedAt,
            UpdatedAt: threshold.UpdatedAt);
}

// ============================================================================
// Alert Contracts
// ============================================================================

/// <summary>
/// Response for an SLO alert.
/// </summary>
public sealed record SloAlertResponse(
    Guid AlertId,
    Guid SloId,
    Guid ThresholdId,
    string Severity,
    string Message,
    double BudgetConsumed,
    double BurnRate,
    double CurrentSli,
    DateTimeOffset TriggeredAt,
    DateTimeOffset? AcknowledgedAt,
    string? AcknowledgedBy,
    DateTimeOffset? ResolvedAt,
    string? ResolutionNotes)
{
    public static SloAlertResponse FromDomain(SloAlert alert) =>
        new(
            AlertId: alert.AlertId,
            SloId: alert.SloId,
            ThresholdId: alert.ThresholdId,
            Severity: alert.Severity.ToString().ToLowerInvariant(),
            Message: alert.Message,
            BudgetConsumed: alert.BudgetConsumed,
            BurnRate: alert.BurnRate,
            CurrentSli: alert.CurrentSli,
            TriggeredAt: alert.TriggeredAt,
            AcknowledgedAt: alert.AcknowledgedAt,
            AcknowledgedBy: alert.AcknowledgedBy,
            ResolvedAt: alert.ResolvedAt,
            ResolutionNotes: alert.ResolutionNotes);
}

/// <summary>
/// Response for alert list.
/// </summary>
public sealed record SloAlertListResponse(
    IReadOnlyList<SloAlertResponse> Items,
    string? NextCursor);

/// <summary>
/// Request to acknowledge an alert.
/// </summary>
public sealed record AcknowledgeAlertRequest(
    string AcknowledgedBy);

/// <summary>
/// Request to resolve an alert.
/// </summary>
public sealed record ResolveAlertRequest(
    string ResolutionNotes);

// ============================================================================
// Summary Contracts
// ============================================================================

/// <summary>
/// Summary response for SLO health.
/// </summary>
public sealed record SloSummaryResponse(
    long TotalSlos,
    long EnabledSlos,
    long ActiveAlerts,
    long UnacknowledgedAlerts,
    long CriticalAlerts,
    IReadOnlyList<SloWithStateResponse> SlosAtRisk);

/// <summary>
/// Summary response for quota usage.
/// </summary>
public sealed record QuotaSummaryResponse(
    long TotalQuotas,
    long PausedQuotas,
    double AverageTokenUtilization,
    double AverageConcurrencyUtilization,
    IReadOnlyList<QuotaUtilizationResponse> Quotas);

/// <summary>
/// Quota utilization response.
/// </summary>
public sealed record QuotaUtilizationResponse(
    Guid QuotaId,
    string? JobType,
    double TokenUtilization,
    double ConcurrencyUtilization,
    double HourlyUtilization,
    bool Paused);
Some files were not shown because too many files have changed in this diff.