
Commit 2548abc56f (parent d040c001ac) by StellaOps Bot, 2025-11-28 20:55:22 +02:00
231 changed files with 47,468 additions and 68 deletions


@@ -0,0 +1,583 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.Backfill;
/// <summary>
/// Configuration options for the backfill manager.
/// </summary>
public sealed record BackfillManagerOptions
{
/// <summary>
/// Maximum number of events allowed in a single backfill request.
/// </summary>
public long MaxEventsPerBackfill { get; init; } = 1_000_000;
/// <summary>
/// Maximum duration allowed for a backfill operation.
/// </summary>
public TimeSpan MaxBackfillDuration { get; init; } = TimeSpan.FromHours(24);
/// <summary>
/// Data retention period; backfills cannot extend beyond this.
/// </summary>
public TimeSpan RetentionPeriod { get; init; } = TimeSpan.FromDays(90);
/// <summary>
/// Default TTL for processed event records.
/// </summary>
public TimeSpan DefaultProcessedEventTtl { get; init; } = TimeSpan.FromDays(30);
/// <summary>
/// Number of sample event keys to include in previews.
/// </summary>
public int PreviewSampleSize { get; init; } = 10;
/// <summary>
/// Estimated events per second for duration estimation.
/// </summary>
public double EstimatedEventsPerSecond { get; init; } = 100;
}
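
// Hypothetical configuration sketch (not part of this commit): tightening the
// defaults above for a smaller deployment. The property names come from
// BackfillManagerOptions; the chosen values are illustrative only.
internal static class BackfillManagerOptionsExamples
{
    internal static readonly BackfillManagerOptions Conservative = new()
    {
        MaxEventsPerBackfill = 250_000,
        MaxBackfillDuration = TimeSpan.FromHours(6),
        RetentionPeriod = TimeSpan.FromDays(30),
        EstimatedEventsPerSecond = 50,
    };
}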
/// <summary>
/// Coordinates backfill operations with safety validations.
/// </summary>
public interface IBackfillManager
{
/// <summary>
/// Creates a new backfill request with validation.
/// </summary>
Task<BackfillRequest> CreateRequestAsync(
string tenantId,
Guid? sourceId,
string? jobType,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
string reason,
string createdBy,
int batchSize = 100,
bool dryRun = false,
bool forceReprocess = false,
string? ticket = null,
TimeSpan? maxDuration = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Validates a backfill request and runs safety checks.
/// </summary>
Task<BackfillRequest> ValidateRequestAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default);
/// <summary>
/// Generates a preview of what a backfill would process (dry-run).
/// </summary>
Task<BackfillPreview> PreviewAsync(
string tenantId,
Guid? sourceId,
string? jobType,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
int batchSize = 100,
CancellationToken cancellationToken = default);
/// <summary>
/// Starts execution of a validated backfill request.
/// </summary>
Task<BackfillRequest> StartAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default);
/// <summary>
/// Pauses a running backfill.
/// </summary>
Task<BackfillRequest> PauseAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default);
/// <summary>
/// Resumes a paused backfill.
/// </summary>
Task<BackfillRequest> ResumeAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default);
/// <summary>
/// Cancels a backfill request.
/// </summary>
Task<BackfillRequest> CancelAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the current status of a backfill request.
/// </summary>
Task<BackfillRequest?> GetStatusAsync(
string tenantId,
Guid backfillId,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists backfill requests with filters.
/// </summary>
Task<IReadOnlyList<BackfillRequest>> ListAsync(
string tenantId,
BackfillStatus? status = null,
Guid? sourceId = null,
string? jobType = null,
int limit = 50,
int offset = 0,
CancellationToken cancellationToken = default);
}
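
// Hedged end-to-end sketch (not part of this commit) of the intended call
// sequence: create a request, validate it, then start it once safety checks
// pass. The tenant, job type, and actor strings below are illustrative.
internal static class BackfillFlowExample
{
    internal static async Task<BackfillRequest> RunAsync(
        IBackfillManager manager, Guid sourceId, CancellationToken ct)
    {
        var request = await manager.CreateRequestAsync(
            tenantId: "tenant-1",
            sourceId: sourceId,
            jobType: "advisory-ingest",
            windowStart: DateTimeOffset.UtcNow.AddDays(-1),
            windowEnd: DateTimeOffset.UtcNow,
            reason: "Reprocess yesterday's advisory events",
            createdBy: "ops@example.com",
            cancellationToken: ct);

        // Validation estimates volume/duration and records the safety checks.
        request = await manager.ValidateRequestAsync("tenant-1", request.BackfillId, "ops@example.com", ct);

        // Start only after validation; Start is expected to reject unsafe requests.
        return await manager.StartAsync("tenant-1", request.BackfillId, "ops@example.com", ct);
    }
}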
/// <summary>
/// Provides event counting for backfill estimation.
/// </summary>
public interface IBackfillEventCounter
{
/// <summary>
/// Estimates the number of events in a time window.
/// </summary>
Task<long> EstimateEventCountAsync(
string tenantId,
string scopeKey,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
CancellationToken cancellationToken);
/// <summary>
/// Gets sample event keys from a time window.
/// </summary>
Task<IReadOnlyList<string>> GetSampleEventKeysAsync(
string tenantId,
string scopeKey,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
int sampleSize,
CancellationToken cancellationToken);
}
/// <summary>
/// Validates backfill safety conditions.
/// </summary>
public interface IBackfillSafetyValidator
{
/// <summary>
/// Runs all safety validations for a backfill request.
/// </summary>
Task<BackfillSafetyChecks> ValidateAsync(
BackfillRequest request,
long estimatedEvents,
TimeSpan estimatedDuration,
CancellationToken cancellationToken);
}
/// <summary>
/// Default implementation of backfill safety validator.
/// </summary>
public sealed class DefaultBackfillSafetyValidator : IBackfillSafetyValidator
{
private readonly ISourceValidator _sourceValidator;
private readonly IOverlapChecker _overlapChecker;
private readonly BackfillManagerOptions _options;
public DefaultBackfillSafetyValidator(
ISourceValidator sourceValidator,
IOverlapChecker overlapChecker,
BackfillManagerOptions options)
{
_sourceValidator = sourceValidator;
_overlapChecker = overlapChecker;
_options = options;
}
public async Task<BackfillSafetyChecks> ValidateAsync(
BackfillRequest request,
long estimatedEvents,
TimeSpan estimatedDuration,
CancellationToken cancellationToken)
{
var warnings = new List<string>();
var errors = new List<string>();
// Check source exists
var sourceExists = true;
if (request.SourceId.HasValue)
{
sourceExists = await _sourceValidator.ExistsAsync(
request.TenantId, request.SourceId.Value, cancellationToken);
if (!sourceExists)
{
errors.Add($"Source {request.SourceId} not found.");
}
}
// Check for overlapping backfills
var hasOverlap = await _overlapChecker.HasOverlapAsync(
request.TenantId,
request.ScopeKey,
request.WindowStart,
request.WindowEnd,
request.BackfillId,
cancellationToken);
if (hasOverlap)
{
errors.Add("An active backfill already exists for this scope and time window.");
}
// Check retention period
var retentionLimit = DateTimeOffset.UtcNow - _options.RetentionPeriod;
var withinRetention = request.WindowStart >= retentionLimit;
if (!withinRetention)
{
errors.Add($"Window start {request.WindowStart:O} is beyond the retention period ({_options.RetentionPeriod.TotalDays} days).");
}
// Check event limit
var withinEventLimit = estimatedEvents <= _options.MaxEventsPerBackfill;
if (!withinEventLimit)
{
errors.Add($"Estimated {estimatedEvents:N0} events exceeds maximum allowed ({_options.MaxEventsPerBackfill:N0}).");
}
else if (estimatedEvents > _options.MaxEventsPerBackfill * 0.8)
{
warnings.Add($"Estimated {estimatedEvents:N0} events is approaching the maximum limit.");
}
// Check duration limit
var maxDuration = request.MaxDuration ?? _options.MaxBackfillDuration;
var withinDurationLimit = estimatedDuration <= maxDuration;
if (!withinDurationLimit)
{
errors.Add($"Estimated duration {estimatedDuration} exceeds maximum allowed ({maxDuration}).");
}
// Check quota availability (placeholder: always true for now)
var quotaAvailable = true;
// Add warnings for large backfills
if (request.WindowDuration > TimeSpan.FromDays(7))
{
warnings.Add("Large time window may take significant time to process.");
}
if (request.ForceReprocess)
{
warnings.Add("Force reprocess is enabled - events will be processed even if already seen.");
}
return new BackfillSafetyChecks(
SourceExists: sourceExists,
HasOverlappingBackfill: hasOverlap,
WithinRetention: withinRetention,
WithinEventLimit: withinEventLimit,
WithinDurationLimit: withinDurationLimit,
QuotaAvailable: quotaAvailable,
Warnings: warnings,
Errors: errors);
}
}
/// <summary>
/// Validates that a source exists.
/// </summary>
public interface ISourceValidator
{
/// <summary>
/// Checks if a source exists.
/// </summary>
Task<bool> ExistsAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);
}
/// <summary>
/// Checks for overlapping backfill operations.
/// </summary>
public interface IOverlapChecker
{
/// <summary>
/// Checks if there's an overlapping active backfill.
/// </summary>
Task<bool> HasOverlapAsync(
string tenantId,
string scopeKey,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
Guid? excludeBackfillId,
CancellationToken cancellationToken);
}
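
// Hypothetical test doubles (not part of this commit) for the two dependencies
// above; handy when unit-testing DefaultBackfillSafetyValidator in isolation.
internal sealed class AllowAllSourceValidator : ISourceValidator
{
    // Reports every source as existing.
    public Task<bool> ExistsAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken) =>
        Task.FromResult(true);
}

internal sealed class NoOverlapChecker : IOverlapChecker
{
    // Reports no overlapping backfills.
    public Task<bool> HasOverlapAsync(
        string tenantId,
        string scopeKey,
        DateTimeOffset windowStart,
        DateTimeOffset windowEnd,
        Guid? excludeBackfillId,
        CancellationToken cancellationToken) =>
        Task.FromResult(false);
}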
/// <summary>
/// Default implementation of the backfill manager.
/// </summary>
public sealed class BackfillManager : IBackfillManager
{
private readonly IBackfillRepository _backfillRepository;
private readonly IBackfillSafetyValidator _safetyValidator;
private readonly IBackfillEventCounter _eventCounter;
private readonly IDuplicateSuppressor _duplicateSuppressor;
private readonly BackfillManagerOptions _options;
private readonly ILogger<BackfillManager> _logger;
public BackfillManager(
IBackfillRepository backfillRepository,
IBackfillSafetyValidator safetyValidator,
IBackfillEventCounter eventCounter,
IDuplicateSuppressor duplicateSuppressor,
BackfillManagerOptions options,
ILogger<BackfillManager> logger)
{
_backfillRepository = backfillRepository;
_safetyValidator = safetyValidator;
_eventCounter = eventCounter;
_duplicateSuppressor = duplicateSuppressor;
_options = options;
_logger = logger;
}
public async Task<BackfillRequest> CreateRequestAsync(
string tenantId,
Guid? sourceId,
string? jobType,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
string reason,
string createdBy,
int batchSize = 100,
bool dryRun = false,
bool forceReprocess = false,
string? ticket = null,
TimeSpan? maxDuration = null,
CancellationToken cancellationToken = default)
{
var request = BackfillRequest.Create(
tenantId: tenantId,
sourceId: sourceId,
jobType: jobType,
windowStart: windowStart,
windowEnd: windowEnd,
reason: reason,
createdBy: createdBy,
batchSize: batchSize,
dryRun: dryRun,
forceReprocess: forceReprocess,
ticket: ticket,
maxDuration: maxDuration);
await _backfillRepository.CreateAsync(request, cancellationToken);
_logger.LogInformation(
"Created backfill request {BackfillId} for scope {ScopeKey} from {WindowStart} to {WindowEnd}",
request.BackfillId, request.ScopeKey, windowStart, windowEnd);
return request;
}
public async Task<BackfillRequest> ValidateRequestAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default)
{
var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");
request = request.StartValidation(updatedBy);
await _backfillRepository.UpdateAsync(request, cancellationToken);
// Estimate event count
var estimatedEvents = await _eventCounter.EstimateEventCountAsync(
tenantId, request.ScopeKey, request.WindowStart, request.WindowEnd, cancellationToken);
// Calculate estimated duration
var estimatedDuration = TimeSpan.FromSeconds(estimatedEvents / _options.EstimatedEventsPerSecond);
// Run safety validations
var safetyChecks = await _safetyValidator.ValidateAsync(
request, estimatedEvents, estimatedDuration, cancellationToken);
request = request.WithSafetyChecks(safetyChecks, estimatedEvents, estimatedDuration, updatedBy);
await _backfillRepository.UpdateAsync(request, cancellationToken);
_logger.LogInformation(
"Validated backfill request {BackfillId}: {EstimatedEvents} events, safe={IsSafe}",
backfillId, estimatedEvents, safetyChecks.IsSafe);
return request;
}
public async Task<BackfillPreview> PreviewAsync(
string tenantId,
Guid? sourceId,
string? jobType,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
int batchSize = 100,
CancellationToken cancellationToken = default)
{
var scopeKey = GetScopeKey(sourceId, jobType);
// Estimate total events
var estimatedEvents = await _eventCounter.EstimateEventCountAsync(
tenantId, scopeKey, windowStart, windowEnd, cancellationToken);
// Get already processed count
var processedCount = await _duplicateSuppressor.CountProcessedAsync(
scopeKey, windowStart, windowEnd, cancellationToken);
// Get sample event keys
var sampleKeys = await _eventCounter.GetSampleEventKeysAsync(
tenantId, scopeKey, windowStart, windowEnd, _options.PreviewSampleSize, cancellationToken);
// Calculate estimates
var processableEvents = Math.Max(0, estimatedEvents - processedCount);
var estimatedDuration = TimeSpan.FromSeconds(processableEvents / _options.EstimatedEventsPerSecond);
var estimatedBatches = (int)Math.Ceiling((double)processableEvents / batchSize);
// Run safety checks
var tempRequest = BackfillRequest.Create(
tenantId, sourceId, jobType, windowStart, windowEnd,
"preview", "system", batchSize);
var safetyChecks = await _safetyValidator.ValidateAsync(
tempRequest, estimatedEvents, estimatedDuration, cancellationToken);
return new BackfillPreview(
ScopeKey: scopeKey,
WindowStart: windowStart,
WindowEnd: windowEnd,
EstimatedEvents: estimatedEvents,
SkippedEvents: processedCount,
ProcessableEvents: processableEvents,
EstimatedDuration: estimatedDuration,
EstimatedBatches: estimatedBatches,
SafetyChecks: safetyChecks,
SampleEventKeys: sampleKeys);
}
public async Task<BackfillRequest> StartAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default)
{
var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");
request = request.Start(updatedBy);
await _backfillRepository.UpdateAsync(request, cancellationToken);
_logger.LogInformation("Started backfill request {BackfillId}", backfillId);
return request;
}
public async Task<BackfillRequest> PauseAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default)
{
var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");
request = request.Pause(updatedBy);
await _backfillRepository.UpdateAsync(request, cancellationToken);
_logger.LogInformation("Paused backfill request {BackfillId}", backfillId);
return request;
}
public async Task<BackfillRequest> ResumeAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default)
{
var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");
request = request.Resume(updatedBy);
await _backfillRepository.UpdateAsync(request, cancellationToken);
_logger.LogInformation("Resumed backfill request {BackfillId}", backfillId);
return request;
}
public async Task<BackfillRequest> CancelAsync(
string tenantId,
Guid backfillId,
string updatedBy,
CancellationToken cancellationToken = default)
{
var request = await _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken)
?? throw new InvalidOperationException($"Backfill request {backfillId} not found.");
request = request.Cancel(updatedBy);
await _backfillRepository.UpdateAsync(request, cancellationToken);
_logger.LogInformation("Canceled backfill request {BackfillId}", backfillId);
return request;
}
public Task<BackfillRequest?> GetStatusAsync(
string tenantId,
Guid backfillId,
CancellationToken cancellationToken = default)
{
return _backfillRepository.GetByIdAsync(tenantId, backfillId, cancellationToken);
}
public Task<IReadOnlyList<BackfillRequest>> ListAsync(
string tenantId,
BackfillStatus? status = null,
Guid? sourceId = null,
string? jobType = null,
int limit = 50,
int offset = 0,
CancellationToken cancellationToken = default)
{
return _backfillRepository.ListAsync(tenantId, status, sourceId, jobType, limit, offset, cancellationToken);
}
private static string GetScopeKey(Guid? sourceId, string? jobType)
{
return (sourceId, jobType) switch
{
(Guid s, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(s, j),
(Guid s, _) => Watermark.CreateScopeKey(s),
(_, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(j),
_ => throw new ArgumentException("Either sourceId or jobType must be specified.")
};
}
}
/// <summary>
/// Repository interface for backfill persistence (declared here for convenience).
/// </summary>
public interface IBackfillRepository
{
Task<BackfillRequest?> GetByIdAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);
Task CreateAsync(BackfillRequest request, CancellationToken cancellationToken);
Task UpdateAsync(BackfillRequest request, CancellationToken cancellationToken);
Task<IReadOnlyList<BackfillRequest>> ListAsync(
string tenantId,
BackfillStatus? status,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken);
}


@@ -0,0 +1,318 @@
namespace StellaOps.Orchestrator.Core.Backfill;
/// <summary>
/// Tracks processed events for duplicate suppression.
/// </summary>
public interface IDuplicateSuppressor
{
/// <summary>
/// Checks if an event has already been processed.
/// </summary>
/// <param name="scopeKey">Scope identifier.</param>
/// <param name="eventKey">Unique event identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>True if the event was already processed.</returns>
Task<bool> HasProcessedAsync(string scopeKey, string eventKey, CancellationToken cancellationToken);
/// <summary>
/// Checks multiple events for duplicate status.
/// </summary>
/// <param name="scopeKey">Scope identifier.</param>
/// <param name="eventKeys">Event identifiers to check.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Set of event keys that have already been processed.</returns>
Task<IReadOnlySet<string>> GetProcessedAsync(string scopeKey, IEnumerable<string> eventKeys, CancellationToken cancellationToken);
/// <summary>
/// Marks an event as processed.
/// </summary>
/// <param name="scopeKey">Scope identifier.</param>
/// <param name="eventKey">Unique event identifier.</param>
/// <param name="eventTime">Event timestamp.</param>
/// <param name="batchId">Optional batch/backfill identifier.</param>
/// <param name="ttl">Time-to-live for the record.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task MarkProcessedAsync(
string scopeKey,
string eventKey,
DateTimeOffset eventTime,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken);
/// <summary>
/// Marks multiple events as processed.
/// </summary>
/// <param name="scopeKey">Scope identifier.</param>
/// <param name="events">Events to mark as processed.</param>
/// <param name="batchId">Optional batch/backfill identifier.</param>
/// <param name="ttl">Time-to-live for the records.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task MarkProcessedBatchAsync(
string scopeKey,
IEnumerable<ProcessedEvent> events,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken);
/// <summary>
/// Counts processed events within a time range.
/// </summary>
/// <param name="scopeKey">Scope identifier.</param>
/// <param name="from">Start of time range.</param>
/// <param name="to">End of time range.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Count of processed events.</returns>
Task<long> CountProcessedAsync(string scopeKey, DateTimeOffset from, DateTimeOffset to, CancellationToken cancellationToken);
/// <summary>
/// Removes expired records (cleanup).
/// </summary>
/// <param name="batchLimit">Maximum records to remove per call.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Number of records removed.</returns>
Task<int> CleanupExpiredAsync(int batchLimit, CancellationToken cancellationToken);
}
/// <summary>
/// Event data for duplicate tracking.
/// </summary>
public sealed record ProcessedEvent(
/// <summary>Unique event identifier.</summary>
string EventKey,
/// <summary>Event timestamp.</summary>
DateTimeOffset EventTime);
/// <summary>
/// In-memory duplicate suppressor for testing.
/// </summary>
public sealed class InMemoryDuplicateSuppressor : IDuplicateSuppressor
{
private readonly Dictionary<string, Dictionary<string, ProcessedEventEntry>> _store = new();
private readonly object _lock = new();
private sealed record ProcessedEventEntry(
DateTimeOffset EventTime,
DateTimeOffset ProcessedAt,
Guid? BatchId,
DateTimeOffset ExpiresAt);
public Task<bool> HasProcessedAsync(string scopeKey, string eventKey, CancellationToken cancellationToken)
{
lock (_lock)
{
if (!_store.TryGetValue(scopeKey, out var scopeStore))
return Task.FromResult(false);
if (!scopeStore.TryGetValue(eventKey, out var entry))
return Task.FromResult(false);
// Check if expired
if (entry.ExpiresAt < DateTimeOffset.UtcNow)
{
scopeStore.Remove(eventKey);
return Task.FromResult(false);
}
return Task.FromResult(true);
}
}
public Task<IReadOnlySet<string>> GetProcessedAsync(string scopeKey, IEnumerable<string> eventKeys, CancellationToken cancellationToken)
{
var now = DateTimeOffset.UtcNow;
var result = new HashSet<string>();
lock (_lock)
{
if (!_store.TryGetValue(scopeKey, out var scopeStore))
return Task.FromResult<IReadOnlySet<string>>(result);
foreach (var eventKey in eventKeys)
{
if (scopeStore.TryGetValue(eventKey, out var entry) && entry.ExpiresAt >= now)
{
result.Add(eventKey);
}
}
}
return Task.FromResult<IReadOnlySet<string>>(result);
}
public Task MarkProcessedAsync(
string scopeKey,
string eventKey,
DateTimeOffset eventTime,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken)
{
var now = DateTimeOffset.UtcNow;
var entry = new ProcessedEventEntry(eventTime, now, batchId, now + ttl);
lock (_lock)
{
if (!_store.TryGetValue(scopeKey, out var scopeStore))
{
scopeStore = new Dictionary<string, ProcessedEventEntry>();
_store[scopeKey] = scopeStore;
}
scopeStore[eventKey] = entry;
}
return Task.CompletedTask;
}
public Task MarkProcessedBatchAsync(
string scopeKey,
IEnumerable<ProcessedEvent> events,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken)
{
var now = DateTimeOffset.UtcNow;
var expiresAt = now + ttl;
lock (_lock)
{
if (!_store.TryGetValue(scopeKey, out var scopeStore))
{
scopeStore = new Dictionary<string, ProcessedEventEntry>();
_store[scopeKey] = scopeStore;
}
foreach (var evt in events)
{
scopeStore[evt.EventKey] = new ProcessedEventEntry(evt.EventTime, now, batchId, expiresAt);
}
}
return Task.CompletedTask;
}
public Task<long> CountProcessedAsync(string scopeKey, DateTimeOffset from, DateTimeOffset to, CancellationToken cancellationToken)
{
var now = DateTimeOffset.UtcNow;
long count = 0;
lock (_lock)
{
if (_store.TryGetValue(scopeKey, out var scopeStore))
{
count = scopeStore.Values
.Count(e => e.ExpiresAt >= now && e.EventTime >= from && e.EventTime < to);
}
}
return Task.FromResult(count);
}
public Task<int> CleanupExpiredAsync(int batchLimit, CancellationToken cancellationToken)
{
var now = DateTimeOffset.UtcNow;
var removed = 0;
lock (_lock)
{
foreach (var scopeStore in _store.Values)
{
var expiredKeys = scopeStore
.Where(kvp => kvp.Value.ExpiresAt < now)
.Take(batchLimit - removed)
.Select(kvp => kvp.Key)
.ToList();
foreach (var key in expiredKeys)
{
scopeStore.Remove(key);
removed++;
}
if (removed >= batchLimit)
break;
}
}
return Task.FromResult(removed);
}
}
/// <summary>
/// Result of filtering events through duplicate suppression.
/// </summary>
public sealed record DuplicateFilterResult<T>(
/// <summary>Events that should be processed (not duplicates).</summary>
IReadOnlyList<T> ToProcess,
/// <summary>Events that were filtered as duplicates.</summary>
IReadOnlyList<T> Duplicates,
/// <summary>Total events evaluated.</summary>
int Total)
{
/// <summary>
/// Number of events that passed filtering.
/// </summary>
public int ProcessCount => ToProcess.Count;
/// <summary>
/// Number of duplicates filtered.
/// </summary>
public int DuplicateCount => Duplicates.Count;
/// <summary>
/// Duplicate percentage.
/// </summary>
public double DuplicatePercent => Total > 0 ? Math.Round((double)DuplicateCount / Total * 100, 2) : 0;
}
/// <summary>
/// Helper methods for duplicate suppression.
/// </summary>
public static class DuplicateSuppressorExtensions
{
/// <summary>
/// Filters a batch of events, removing duplicates.
/// </summary>
/// <typeparam name="T">Event type.</typeparam>
/// <param name="suppressor">Duplicate suppressor.</param>
/// <param name="scopeKey">Scope identifier.</param>
/// <param name="events">Events to filter.</param>
/// <param name="keySelector">Function to extract event key.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Filter result with events to process and duplicates.</returns>
public static async Task<DuplicateFilterResult<T>> FilterAsync<T>(
this IDuplicateSuppressor suppressor,
string scopeKey,
IReadOnlyList<T> events,
Func<T, string> keySelector,
CancellationToken cancellationToken)
{
if (events.Count == 0)
return new DuplicateFilterResult<T>([], [], 0);
var eventKeys = events.Select(keySelector).ToList();
var processed = await suppressor.GetProcessedAsync(scopeKey, eventKeys, cancellationToken).ConfigureAwait(false);
var toProcess = new List<T>();
var duplicates = new List<T>();
foreach (var evt in events)
{
var key = keySelector(evt);
if (processed.Contains(key))
{
duplicates.Add(evt);
}
else
{
toProcess.Add(evt);
}
}
return new DuplicateFilterResult<T>(toProcess, duplicates, events.Count);
}
}
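
A brief usage sketch for the pieces above (the scope key and event keys are illustrative, and the snippet assumes an enclosing async method with System.Linq in scope):

var suppressor = new InMemoryDuplicateSuppressor();
var keys = new List<string> { "evt-1", "evt-2" };

// First pass: nothing recorded yet, so both events pass through.
var first = await suppressor.FilterAsync("source:abc", keys, k => k, CancellationToken.None);
// first.ProcessCount == 2, first.DuplicateCount == 0

await suppressor.MarkProcessedBatchAsync(
    "source:abc",
    keys.Select(k => new ProcessedEvent(k, DateTimeOffset.UtcNow)),
    batchId: null,
    ttl: TimeSpan.FromDays(30),
    CancellationToken.None);

// Second pass: both events are now suppressed as duplicates.
var second = await suppressor.FilterAsync("source:abc", keys, k => k, CancellationToken.None);
// second.ProcessCount == 0, second.DuplicateCount == 2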


@@ -0,0 +1,220 @@
namespace StellaOps.Orchestrator.Core.Backfill;
/// <summary>
/// Represents an event-time window for batch processing.
/// </summary>
public sealed record EventTimeWindow(
/// <summary>Start of the window (inclusive).</summary>
DateTimeOffset Start,
/// <summary>End of the window (exclusive).</summary>
DateTimeOffset End)
{
/// <summary>
/// Duration of the window.
/// </summary>
public TimeSpan Duration => End - Start;
/// <summary>
/// Whether the window is empty (zero duration).
/// </summary>
public bool IsEmpty => End <= Start;
/// <summary>
/// Whether a timestamp falls within this window.
/// </summary>
public bool Contains(DateTimeOffset timestamp) => timestamp >= Start && timestamp < End;
/// <summary>
/// Whether this window overlaps with another.
/// </summary>
public bool Overlaps(EventTimeWindow other) =>
Start < other.End && End > other.Start;
/// <summary>
/// Creates the intersection of two windows.
/// </summary>
public EventTimeWindow? Intersect(EventTimeWindow other)
{
var newStart = Start > other.Start ? Start : other.Start;
var newEnd = End < other.End ? End : other.End;
return newEnd > newStart ? new EventTimeWindow(newStart, newEnd) : null;
}
/// <summary>
/// Splits the window into batches of the specified duration.
/// </summary>
public IEnumerable<EventTimeWindow> Split(TimeSpan batchDuration)
{
if (batchDuration <= TimeSpan.Zero)
throw new ArgumentOutOfRangeException(nameof(batchDuration), "Batch duration must be positive.");
var current = Start;
while (current < End)
{
var batchEnd = current + batchDuration;
if (batchEnd > End)
batchEnd = End;
yield return new EventTimeWindow(current, batchEnd);
current = batchEnd;
}
}
/// <summary>
/// Creates a window from a duration ending at the specified time.
/// </summary>
public static EventTimeWindow FromDuration(DateTimeOffset end, TimeSpan duration) =>
new(end - duration, end);
/// <summary>
/// Creates a window covering the last N hours from now.
/// </summary>
public static EventTimeWindow LastHours(int hours, DateTimeOffset? now = null)
{
var endTime = now ?? DateTimeOffset.UtcNow;
return FromDuration(endTime, TimeSpan.FromHours(hours));
}
/// <summary>
/// Creates a window covering the last N days from now.
/// </summary>
public static EventTimeWindow LastDays(int days, DateTimeOffset? now = null)
{
var endTime = now ?? DateTimeOffset.UtcNow;
return FromDuration(endTime, TimeSpan.FromDays(days));
}
}
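
// Hypothetical sketch (not part of this commit): carving the last 24 hours
// into hourly batches with the helpers above. Requires System.Linq for ToList.
internal static class EventTimeWindowExamples
{
    internal static IReadOnlyList<EventTimeWindow> LastDayHourly(DateTimeOffset now) =>
        EventTimeWindow.LastHours(24, now).Split(TimeSpan.FromHours(1)).ToList();
}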
/// <summary>
/// Configuration for event-time window computation.
/// </summary>
public sealed record EventTimeWindowOptions(
/// <summary>Minimum window size (prevents too-small batches).</summary>
TimeSpan MinWindowSize,
/// <summary>Maximum window size (prevents too-large batches).</summary>
TimeSpan MaxWindowSize,
/// <summary>Overlap with previous window for late-arriving events.</summary>
TimeSpan OverlapDuration,
/// <summary>Maximum lag allowed before triggering alerts.</summary>
TimeSpan MaxLag,
/// <summary>Default lookback for initial fetch when no watermark exists.</summary>
TimeSpan InitialLookback)
{
/// <summary>
/// Default options for hourly batching.
/// </summary>
public static EventTimeWindowOptions HourlyBatches => new(
MinWindowSize: TimeSpan.FromMinutes(5),
MaxWindowSize: TimeSpan.FromHours(1),
OverlapDuration: TimeSpan.FromMinutes(5),
MaxLag: TimeSpan.FromHours(2),
InitialLookback: TimeSpan.FromDays(7));
/// <summary>
/// Default options for daily batching.
/// </summary>
public static EventTimeWindowOptions DailyBatches => new(
MinWindowSize: TimeSpan.FromHours(1),
MaxWindowSize: TimeSpan.FromDays(1),
OverlapDuration: TimeSpan.FromHours(1),
MaxLag: TimeSpan.FromDays(1),
InitialLookback: TimeSpan.FromDays(30));
}
/// <summary>
/// Computes event-time windows for incremental processing.
/// </summary>
public static class EventTimeWindowPlanner
{
/// <summary>
/// Computes the next window to process based on current watermark.
/// </summary>
/// <param name="now">Current time.</param>
/// <param name="highWatermark">Current high watermark (null for initial fetch).</param>
/// <param name="options">Window configuration options.</param>
/// <returns>The next window to process, or null if caught up.</returns>
public static EventTimeWindow? GetNextWindow(
DateTimeOffset now,
DateTimeOffset? highWatermark,
EventTimeWindowOptions options)
{
DateTimeOffset windowStart;
if (highWatermark is null)
{
// Initial fetch: start from initial lookback
windowStart = now - options.InitialLookback;
}
else
{
// Incremental fetch: start from watermark minus overlap
windowStart = highWatermark.Value - options.OverlapDuration;
// If we're caught up (watermark + min window > now), no work needed
if (highWatermark.Value + options.MinWindowSize > now)
{
return null;
}
}
// Calculate window end (at most now, at most max window from start)
var windowEnd = windowStart + options.MaxWindowSize;
if (windowEnd > now)
{
windowEnd = now;
}
// Ensure minimum window size
if (windowEnd - windowStart < options.MinWindowSize)
{
// If window would be too small, extend end (but not past now)
windowEnd = windowStart + options.MinWindowSize;
if (windowEnd > now)
{
return null; // Not enough data accumulated yet
}
}
return new EventTimeWindow(windowStart, windowEnd);
}
/// <summary>
/// Calculates the current lag from the high watermark.
/// </summary>
public static TimeSpan CalculateLag(DateTimeOffset now, DateTimeOffset highWatermark) =>
now - highWatermark;
/// <summary>
/// Determines if the lag exceeds the maximum allowed.
/// </summary>
public static bool IsLagging(DateTimeOffset now, DateTimeOffset highWatermark, EventTimeWindowOptions options) =>
CalculateLag(now, highWatermark) > options.MaxLag;
/// <summary>
/// Estimates the number of windows needed to catch up.
/// </summary>
public static int EstimateWindowsToProcess(
DateTimeOffset now,
DateTimeOffset? highWatermark,
EventTimeWindowOptions options)
{
if (highWatermark is null)
{
// Initial fetch
var totalDuration = options.InitialLookback;
return (int)Math.Ceiling(totalDuration / options.MaxWindowSize);
}
var lag = CalculateLag(now, highWatermark.Value);
if (lag <= options.MinWindowSize)
return 0;
return (int)Math.Ceiling(lag / options.MaxWindowSize);
}
}
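
A hedged catch-up loop sketch built on the planner (LoadWatermarkAsync, SaveWatermarkAsync, ProcessWindowAsync, and scopeKey are illustrative placeholders, not part of this commit):

var options = EventTimeWindowOptions.HourlyBatches;
DateTimeOffset? watermark = await LoadWatermarkAsync(scopeKey);
while (EventTimeWindowPlanner.GetNextWindow(DateTimeOffset.UtcNow, watermark, options) is { } window)
{
    await ProcessWindowAsync(window);              // handle events in [Start, End)
    watermark = window.End;                        // advance the high watermark
    await SaveWatermarkAsync(scopeKey, window.End);
}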


@@ -0,0 +1,502 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.DeadLetter;
/// <summary>
/// Notification channel types.
/// </summary>
public enum NotificationChannel
{
Email,
Slack,
Teams,
Webhook,
PagerDuty
}
/// <summary>
/// Notification rule for dead-letter events.
/// </summary>
public sealed record NotificationRule(
Guid RuleId,
string TenantId,
string? JobTypePattern,
string? ErrorCodePattern,
ErrorCategory? Category,
Guid? SourceId,
bool Enabled,
NotificationChannel Channel,
string Endpoint,
int CooldownMinutes,
int MaxPerHour,
bool Aggregate,
DateTimeOffset? LastNotifiedAt,
int NotificationsSent,
DateTimeOffset CreatedAt,
DateTimeOffset UpdatedAt,
string CreatedBy,
string UpdatedBy)
{
/// <summary>Creates a new notification rule.</summary>
public static NotificationRule Create(
string tenantId,
NotificationChannel channel,
string endpoint,
string createdBy,
string? jobTypePattern = null,
string? errorCodePattern = null,
ErrorCategory? category = null,
Guid? sourceId = null,
int cooldownMinutes = 15,
int maxPerHour = 10,
bool aggregate = true)
{
var now = DateTimeOffset.UtcNow;
return new NotificationRule(
RuleId: Guid.NewGuid(),
TenantId: tenantId,
JobTypePattern: jobTypePattern,
ErrorCodePattern: errorCodePattern,
Category: category,
SourceId: sourceId,
Enabled: true,
Channel: channel,
Endpoint: endpoint,
CooldownMinutes: cooldownMinutes,
MaxPerHour: maxPerHour,
Aggregate: aggregate,
LastNotifiedAt: null,
NotificationsSent: 0,
CreatedAt: now,
UpdatedAt: now,
CreatedBy: createdBy,
UpdatedBy: createdBy);
}
/// <summary>Checks if this rule matches the given entry.</summary>
public bool Matches(DeadLetterEntry entry)
{
if (!Enabled) return false;
if (SourceId.HasValue && entry.SourceId != SourceId.Value) return false;
if (Category.HasValue && entry.Category != Category.Value) return false;
if (!string.IsNullOrEmpty(JobTypePattern))
{
if (!System.Text.RegularExpressions.Regex.IsMatch(entry.JobType, JobTypePattern))
return false;
}
if (!string.IsNullOrEmpty(ErrorCodePattern))
{
if (!System.Text.RegularExpressions.Regex.IsMatch(entry.ErrorCode, ErrorCodePattern))
return false;
}
return true;
}
/// <summary>Checks if this rule is within rate limits.</summary>
public bool CanNotify(DateTimeOffset now, int notificationsSentThisHour)
{
if (!Enabled) return false;
if (notificationsSentThisHour >= MaxPerHour) return false;
if (LastNotifiedAt.HasValue)
{
var elapsed = now - LastNotifiedAt.Value;
if (elapsed < TimeSpan.FromMinutes(CooldownMinutes))
return false;
}
return true;
}
/// <summary>Records a notification sent.</summary>
public NotificationRule RecordNotification(DateTimeOffset now) =>
this with
{
LastNotifiedAt = now,
NotificationsSent = NotificationsSent + 1,
UpdatedAt = now
};
}
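
// Hypothetical sketch (not part of this commit): an aggregating webhook rule
// for transient failures. The endpoint and actor strings are illustrative only.
internal static class NotificationRuleExamples
{
    internal static NotificationRule TransientWebhookRule(string tenantId) =>
        NotificationRule.Create(
            tenantId,
            NotificationChannel.Webhook,
            endpoint: "https://hooks.example.com/dead-letter",
            createdBy: "ops@example.com",
            category: ErrorCategory.Transient,
            cooldownMinutes: 30,
            maxPerHour: 4,
            aggregate: true);
}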
/// <summary>
/// Notification log entry.
/// </summary>
public sealed record NotificationLogEntry(
Guid LogId,
string TenantId,
Guid RuleId,
IReadOnlyList<Guid> EntryIds,
NotificationChannel Channel,
string Endpoint,
bool Success,
string? ErrorMessage,
string? Subject,
int EntryCount,
DateTimeOffset SentAt);
/// <summary>
/// Notification payload for dead-letter events.
/// </summary>
public sealed record DeadLetterNotificationPayload(
string TenantId,
string EventType,
IReadOnlyList<DeadLetterEntrySummary> Entries,
DeadLetterStatsSnapshot? Stats,
DateTimeOffset Timestamp,
string? ActionUrl);
/// <summary>
/// Summary of a dead-letter entry for notifications.
/// </summary>
public sealed record DeadLetterEntrySummary(
Guid EntryId,
Guid OriginalJobId,
string JobType,
string ErrorCode,
ErrorCategory Category,
string FailureReason,
string? RemediationHint,
bool IsRetryable,
int ReplayAttempts,
DateTimeOffset FailedAt);
/// <summary>
/// Stats snapshot for notifications.
/// </summary>
public sealed record DeadLetterStatsSnapshot(
long PendingCount,
long RetryableCount,
long ExhaustedCount);
/// <summary>
/// Interface for dead-letter event notifications.
/// </summary>
public interface IDeadLetterNotifier
{
/// <summary>Notifies when a new entry is added to dead-letter store.</summary>
Task NotifyNewEntryAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken);
/// <summary>Notifies when an entry is successfully replayed.</summary>
Task NotifyReplaySuccessAsync(
DeadLetterEntry entry,
Guid newJobId,
CancellationToken cancellationToken);
/// <summary>Notifies when an entry exhausts all replay attempts.</summary>
Task NotifyExhaustedAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken);
/// <summary>Sends aggregated notifications for pending entries.</summary>
Task SendAggregatedNotificationsAsync(
string tenantId,
CancellationToken cancellationToken);
}
/// <summary>
/// Interface for notification delivery.
/// </summary>
public interface INotificationDelivery
{
/// <summary>Sends a notification to the specified endpoint.</summary>
Task<bool> SendAsync(
NotificationChannel channel,
string endpoint,
DeadLetterNotificationPayload payload,
CancellationToken cancellationToken);
}
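
// Hedged webhook-only delivery sketch (not part of this commit). It assumes
// System.Net.Http and System.Text.Json are available and simply POSTs the
// payload as JSON, reporting success from the HTTP status code.
public sealed class WebhookNotificationDelivery : INotificationDelivery
{
    private readonly System.Net.Http.HttpClient _httpClient;

    public WebhookNotificationDelivery(System.Net.Http.HttpClient httpClient) =>
        _httpClient = httpClient;

    public async Task<bool> SendAsync(
        NotificationChannel channel,
        string endpoint,
        DeadLetterNotificationPayload payload,
        CancellationToken cancellationToken)
    {
        if (channel != NotificationChannel.Webhook)
            return false; // this sketch handles webhooks only
        var json = System.Text.Json.JsonSerializer.Serialize(payload);
        using var content = new System.Net.Http.StringContent(
            json, System.Text.Encoding.UTF8, "application/json");
        using var response = await _httpClient
            .PostAsync(endpoint, content, cancellationToken)
            .ConfigureAwait(false);
        return response.IsSuccessStatusCode;
    }
}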
/// <summary>
/// Repository for notification rules.
/// </summary>
public interface INotificationRuleRepository
{
Task<NotificationRule?> GetByIdAsync(string tenantId, Guid ruleId, CancellationToken cancellationToken);
Task<IReadOnlyList<NotificationRule>> ListAsync(string tenantId, bool enabledOnly, CancellationToken cancellationToken);
Task<IReadOnlyList<NotificationRule>> GetMatchingRulesAsync(string tenantId, DeadLetterEntry entry, CancellationToken cancellationToken);
Task CreateAsync(NotificationRule rule, CancellationToken cancellationToken);
Task<bool> UpdateAsync(NotificationRule rule, CancellationToken cancellationToken);
Task<bool> DeleteAsync(string tenantId, Guid ruleId, CancellationToken cancellationToken);
Task<int> GetNotificationCountThisHourAsync(string tenantId, Guid ruleId, CancellationToken cancellationToken);
Task LogNotificationAsync(NotificationLogEntry log, CancellationToken cancellationToken);
}
/// <summary>
/// Default dead-letter notifier implementation.
/// </summary>
public sealed class DeadLetterNotifier : IDeadLetterNotifier
{
private readonly INotificationRuleRepository _ruleRepository;
private readonly IDeadLetterRepository _deadLetterRepository;
private readonly INotificationDelivery _delivery;
private readonly TimeProvider _timeProvider;
private readonly ILogger<DeadLetterNotifier> _logger;
public DeadLetterNotifier(
INotificationRuleRepository ruleRepository,
IDeadLetterRepository deadLetterRepository,
INotificationDelivery delivery,
TimeProvider timeProvider,
ILogger<DeadLetterNotifier> logger)
{
_ruleRepository = ruleRepository ?? throw new ArgumentNullException(nameof(ruleRepository));
_deadLetterRepository = deadLetterRepository ?? throw new ArgumentNullException(nameof(deadLetterRepository));
_delivery = delivery ?? throw new ArgumentNullException(nameof(delivery));
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task NotifyNewEntryAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken)
{
var rules = await _ruleRepository.GetMatchingRulesAsync(entry.TenantId, entry, cancellationToken)
.ConfigureAwait(false);
var now = _timeProvider.GetUtcNow();
foreach (var rule in rules)
{
if (rule.Aggregate)
{
// Skip immediate notification for aggregated rules
continue;
}
var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
entry.TenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);
if (!rule.CanNotify(now, notificationsThisHour))
{
continue;
}
await SendNotificationAsync(rule, "new_entry", [entry], null, cancellationToken)
.ConfigureAwait(false);
}
}
public async Task NotifyReplaySuccessAsync(
DeadLetterEntry entry,
Guid newJobId,
CancellationToken cancellationToken)
{
var rules = await _ruleRepository.GetMatchingRulesAsync(entry.TenantId, entry, cancellationToken)
.ConfigureAwait(false);
var now = _timeProvider.GetUtcNow();
foreach (var rule in rules)
{
var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
entry.TenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);
if (!rule.CanNotify(now, notificationsThisHour))
{
continue;
}
var payload = new DeadLetterNotificationPayload(
TenantId: entry.TenantId,
EventType: "replay_success",
Entries: [ToSummary(entry)],
Stats: null,
Timestamp: now,
ActionUrl: null);
var success = await _delivery.SendAsync(rule.Channel, rule.Endpoint, payload, cancellationToken)
.ConfigureAwait(false);
await LogNotificationAsync(rule, [entry.EntryId], success, null, cancellationToken)
.ConfigureAwait(false);
}
}
public async Task NotifyExhaustedAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken)
{
var rules = await _ruleRepository.GetMatchingRulesAsync(entry.TenantId, entry, cancellationToken)
.ConfigureAwait(false);
var now = _timeProvider.GetUtcNow();
foreach (var rule in rules)
{
var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
entry.TenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);
if (!rule.CanNotify(now, notificationsThisHour))
{
continue;
}
await SendNotificationAsync(rule, "exhausted", [entry], null, cancellationToken)
.ConfigureAwait(false);
}
}
public async Task SendAggregatedNotificationsAsync(
string tenantId,
CancellationToken cancellationToken)
{
var rules = await _ruleRepository.ListAsync(tenantId, enabledOnly: true, cancellationToken)
.ConfigureAwait(false);
var now = _timeProvider.GetUtcNow();
var stats = await _deadLetterRepository.GetStatsAsync(tenantId, cancellationToken).ConfigureAwait(false);
foreach (var rule in rules.Where(r => r.Aggregate))
{
var notificationsThisHour = await _ruleRepository.GetNotificationCountThisHourAsync(
tenantId, rule.RuleId, cancellationToken).ConfigureAwait(false);
if (!rule.CanNotify(now, notificationsThisHour))
{
continue;
}
// Get pending entries matching this rule
var options = new DeadLetterListOptions(
Status: DeadLetterStatus.Pending,
Category: rule.Category,
Limit: 10);
var entries = await _deadLetterRepository.ListAsync(tenantId, options, cancellationToken)
.ConfigureAwait(false);
// Filter to only matching entries
var matchingEntries = entries.Where(e => rule.Matches(e)).ToList();
if (matchingEntries.Count == 0)
{
continue;
}
var statsSnapshot = new DeadLetterStatsSnapshot(
PendingCount: stats.PendingEntries,
RetryableCount: stats.RetryableEntries,
ExhaustedCount: stats.ExhaustedEntries);
await SendNotificationAsync(rule, "aggregated", matchingEntries, statsSnapshot, cancellationToken)
.ConfigureAwait(false);
}
}
private async Task SendNotificationAsync(
NotificationRule rule,
string eventType,
IReadOnlyList<DeadLetterEntry> entries,
DeadLetterStatsSnapshot? stats,
CancellationToken cancellationToken)
{
var now = _timeProvider.GetUtcNow();
var payload = new DeadLetterNotificationPayload(
TenantId: rule.TenantId,
EventType: eventType,
Entries: entries.Select(ToSummary).ToList(),
Stats: stats,
Timestamp: now,
ActionUrl: null);
string? errorMessage = null;
bool success;
try
{
success = await _delivery.SendAsync(rule.Channel, rule.Endpoint, payload, cancellationToken)
.ConfigureAwait(false);
}
catch (Exception ex)
{
success = false;
errorMessage = ex.Message;
_logger.LogError(ex, "Failed to send {EventType} notification for rule {RuleId}", eventType, rule.RuleId);
}
await LogNotificationAsync(rule, entries.Select(e => e.EntryId).ToList(), success, errorMessage, cancellationToken)
.ConfigureAwait(false);
if (success)
{
var updatedRule = rule.RecordNotification(now);
await _ruleRepository.UpdateAsync(updatedRule, cancellationToken).ConfigureAwait(false);
_logger.LogInformation(
"Dead-letter notification sent: tenant={TenantId}, channel={Channel}, eventType={EventType}",
rule.TenantId, rule.Channel, eventType);
}
else
{
_logger.LogWarning(
"Dead-letter notification failed: tenant={TenantId}, channel={Channel}, eventType={EventType}",
rule.TenantId, rule.Channel, eventType);
}
}
private async Task LogNotificationAsync(
NotificationRule rule,
IReadOnlyList<Guid> entryIds,
bool success,
string? errorMessage,
CancellationToken cancellationToken)
{
var log = new NotificationLogEntry(
LogId: Guid.NewGuid(),
TenantId: rule.TenantId,
RuleId: rule.RuleId,
EntryIds: entryIds,
Channel: rule.Channel,
Endpoint: rule.Endpoint,
Success: success,
ErrorMessage: errorMessage,
Subject: null,
EntryCount: entryIds.Count,
SentAt: _timeProvider.GetUtcNow());
await _ruleRepository.LogNotificationAsync(log, cancellationToken).ConfigureAwait(false);
}
private static DeadLetterEntrySummary ToSummary(DeadLetterEntry entry) =>
new(
EntryId: entry.EntryId,
OriginalJobId: entry.OriginalJobId,
JobType: entry.JobType,
ErrorCode: entry.ErrorCode,
Category: entry.Category,
FailureReason: entry.FailureReason,
RemediationHint: entry.RemediationHint,
IsRetryable: entry.IsRetryable,
ReplayAttempts: entry.ReplayAttempts,
FailedAt: entry.FailedAt);
}
/// <summary>
/// No-op notifier for when notifications are disabled.
/// </summary>
public sealed class NullDeadLetterNotifier : IDeadLetterNotifier
{
public static readonly NullDeadLetterNotifier Instance = new();
private NullDeadLetterNotifier() { }
public Task NotifyNewEntryAsync(DeadLetterEntry entry, CancellationToken cancellationToken) =>
Task.CompletedTask;
public Task NotifyReplaySuccessAsync(DeadLetterEntry entry, Guid newJobId, CancellationToken cancellationToken) =>
Task.CompletedTask;
public Task NotifyExhaustedAsync(DeadLetterEntry entry, CancellationToken cancellationToken) =>
Task.CompletedTask;
public Task SendAggregatedNotificationsAsync(string tenantId, CancellationToken cancellationToken) =>
Task.CompletedTask;
}
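
A hedged wiring sketch showing where the null object fits (assumes Microsoft.Extensions.DependencyInjection and the configuration binder; the "DeadLetter:NotificationsEnabled" key is illustrative):

services.AddSingleton<IDeadLetterNotifier>(sp =>
    configuration.GetValue<bool>("DeadLetter:NotificationsEnabled")
        ? ActivatorUtilities.CreateInstance<DeadLetterNotifier>(sp)
        : (IDeadLetterNotifier)NullDeadLetterNotifier.Instance);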


@@ -0,0 +1,578 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.DeadLetter;
/// <summary>
/// Represents a classified error with remediation guidance.
/// </summary>
public sealed record ClassifiedError(
/// <summary>Error code (e.g., "ORCH-ERR-001").</summary>
string ErrorCode,
/// <summary>Error category.</summary>
ErrorCategory Category,
/// <summary>Human-readable description.</summary>
string Description,
/// <summary>Remediation hint for operators.</summary>
string RemediationHint,
/// <summary>Whether this error is potentially retryable.</summary>
bool IsRetryable,
/// <summary>Suggested retry delay if retryable.</summary>
TimeSpan? SuggestedRetryDelay);
/// <summary>
/// Classifies errors and provides remediation hints.
/// </summary>
public interface IErrorClassifier
{
/// <summary>Classifies an exception into a categorized error.</summary>
ClassifiedError Classify(Exception exception);
/// <summary>Classifies an error code and message.</summary>
ClassifiedError Classify(string errorCode, string message);
/// <summary>Classifies based on HTTP status code and message.</summary>
ClassifiedError ClassifyHttpError(int statusCode, string? message);
}
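
// Hypothetical consumption sketch (not part of this commit): routing a caught
// exception through the classifier to decide whether a job should be requeued.
internal static class ErrorClassifierExample
{
    internal static TimeSpan? GetRetryDelayOrNull(IErrorClassifier classifier, Exception exception)
    {
        var classified = classifier.Classify(exception);
        return classified.IsRetryable ? classified.SuggestedRetryDelay : null;
    }
}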
/// <summary>
/// Default error classifier with standard error codes and remediation hints.
/// </summary>
public sealed class DefaultErrorClassifier : IErrorClassifier
{
/// <summary>Known error codes with classifications.</summary>
public static class ErrorCodes
{
// Transient errors (ORCH-TRN-xxx)
public const string NetworkTimeout = "ORCH-TRN-001";
public const string ConnectionRefused = "ORCH-TRN-002";
public const string DnsResolutionFailed = "ORCH-TRN-003";
public const string ServiceUnavailable = "ORCH-TRN-004";
public const string GatewayTimeout = "ORCH-TRN-005";
public const string TemporaryFailure = "ORCH-TRN-099";
// Not found errors (ORCH-NF-xxx)
public const string ImageNotFound = "ORCH-NF-001";
public const string SourceNotFound = "ORCH-NF-002";
public const string RegistryNotFound = "ORCH-NF-003";
public const string ManifestNotFound = "ORCH-NF-004";
public const string ResourceNotFound = "ORCH-NF-099";
// Auth errors (ORCH-AUTH-xxx)
public const string InvalidCredentials = "ORCH-AUTH-001";
public const string TokenExpired = "ORCH-AUTH-002";
public const string InsufficientPermissions = "ORCH-AUTH-003";
public const string CertificateError = "ORCH-AUTH-004";
public const string AuthenticationFailed = "ORCH-AUTH-099";
// Rate limit errors (ORCH-RL-xxx)
public const string RateLimited = "ORCH-RL-001";
public const string QuotaExceeded = "ORCH-RL-002";
public const string ConcurrencyLimitReached = "ORCH-RL-003";
public const string ThrottlingError = "ORCH-RL-099";
// Validation errors (ORCH-VAL-xxx)
public const string InvalidPayload = "ORCH-VAL-001";
public const string InvalidConfiguration = "ORCH-VAL-002";
public const string SchemaValidationFailed = "ORCH-VAL-003";
public const string MissingRequiredField = "ORCH-VAL-004";
public const string ValidationFailed = "ORCH-VAL-099";
// Upstream errors (ORCH-UP-xxx)
public const string RegistryError = "ORCH-UP-001";
public const string AdvisoryFeedError = "ORCH-UP-002";
public const string DatabaseError = "ORCH-UP-003";
public const string ExternalServiceError = "ORCH-UP-099";
// Internal errors (ORCH-INT-xxx)
public const string InternalError = "ORCH-INT-001";
public const string StateCorruption = "ORCH-INT-002";
public const string ProcessingError = "ORCH-INT-003";
public const string UnexpectedError = "ORCH-INT-099";
// Conflict errors (ORCH-CON-xxx)
public const string DuplicateJob = "ORCH-CON-001";
public const string VersionMismatch = "ORCH-CON-002";
public const string ConcurrentModification = "ORCH-CON-003";
public const string ConflictError = "ORCH-CON-099";
// Canceled errors (ORCH-CAN-xxx)
public const string UserCanceled = "ORCH-CAN-001";
public const string SystemCanceled = "ORCH-CAN-002";
public const string TimeoutCanceled = "ORCH-CAN-003";
public const string OperationCanceled = "ORCH-CAN-099";
}
private static readonly Dictionary<string, ClassifiedError> KnownErrors = new()
{
// Transient errors
[ErrorCodes.NetworkTimeout] = new(
ErrorCodes.NetworkTimeout,
ErrorCategory.Transient,
"Network operation timed out",
"Check network connectivity and firewall rules. If the target service is healthy, increase timeout settings.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(1)),
[ErrorCodes.ConnectionRefused] = new(
ErrorCodes.ConnectionRefused,
ErrorCategory.Transient,
"Connection refused by target host",
"Verify the target service is running and accessible. Check firewall rules and network policies.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(2)),
[ErrorCodes.DnsResolutionFailed] = new(
ErrorCodes.DnsResolutionFailed,
ErrorCategory.Transient,
"DNS resolution failed",
"Verify the hostname is correct. Check DNS server configuration and network connectivity.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(1)),
[ErrorCodes.ServiceUnavailable] = new(
ErrorCodes.ServiceUnavailable,
ErrorCategory.Transient,
"Service temporarily unavailable (503)",
"The target service is temporarily overloaded or under maintenance. Retry with exponential backoff.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(5)),
[ErrorCodes.GatewayTimeout] = new(
ErrorCodes.GatewayTimeout,
ErrorCategory.Transient,
"Gateway timeout (504)",
"An upstream service took too long to respond. This is typically transient; retry with backoff.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(2)),
[ErrorCodes.TemporaryFailure] = new(
ErrorCodes.TemporaryFailure,
ErrorCategory.Transient,
"Temporary failure",
"A transient error occurred. Retry the operation after a brief delay.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(1)),
// Not found errors
[ErrorCodes.ImageNotFound] = new(
ErrorCodes.ImageNotFound,
ErrorCategory.NotFound,
"Container image not found",
"Verify the image reference is correct (repository, tag, digest). Check registry access and that the image exists.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.SourceNotFound] = new(
ErrorCodes.SourceNotFound,
ErrorCategory.NotFound,
"Source configuration not found",
"The referenced source may have been deleted. Verify the source ID and recreate if necessary.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.RegistryNotFound] = new(
ErrorCodes.RegistryNotFound,
ErrorCategory.NotFound,
"Container registry not found",
"Verify the registry URL is correct. Check DNS resolution and that the registry is operational.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.ManifestNotFound] = new(
ErrorCodes.ManifestNotFound,
ErrorCategory.NotFound,
"Image manifest not found",
"The image exists but the manifest is missing. The image may have been deleted or the tag moved.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.ResourceNotFound] = new(
ErrorCodes.ResourceNotFound,
ErrorCategory.NotFound,
"Resource not found",
"The requested resource does not exist. Verify the resource identifier is correct.",
IsRetryable: false,
SuggestedRetryDelay: null),
// Auth errors
[ErrorCodes.InvalidCredentials] = new(
ErrorCodes.InvalidCredentials,
ErrorCategory.AuthFailure,
"Invalid credentials",
"The provided credentials are invalid. Update the registry credentials in the source configuration.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.TokenExpired] = new(
ErrorCodes.TokenExpired,
ErrorCategory.AuthFailure,
"Authentication token expired",
"The authentication token has expired. Refresh credentials or re-authenticate to obtain a new token.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(1)),
[ErrorCodes.InsufficientPermissions] = new(
ErrorCodes.InsufficientPermissions,
ErrorCategory.AuthFailure,
"Insufficient permissions",
"The authenticated user lacks required permissions. Request access from the registry administrator.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.CertificateError] = new(
ErrorCodes.CertificateError,
ErrorCategory.AuthFailure,
"TLS certificate error",
"Certificate validation failed. Verify the CA bundle or add the registry's certificate to trusted roots.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.AuthenticationFailed] = new(
ErrorCodes.AuthenticationFailed,
ErrorCategory.AuthFailure,
"Authentication failed",
"Unable to authenticate with the target service. Verify credentials and authentication configuration.",
IsRetryable: false,
SuggestedRetryDelay: null),
// Rate limit errors
[ErrorCodes.RateLimited] = new(
ErrorCodes.RateLimited,
ErrorCategory.RateLimited,
"Rate limit exceeded (429)",
"Request rate limit exceeded. Reduce request frequency or upgrade service tier. Will auto-retry with backoff.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(5)),
[ErrorCodes.QuotaExceeded] = new(
ErrorCodes.QuotaExceeded,
ErrorCategory.RateLimited,
"Quota exceeded",
"Usage quota has been exceeded. Wait for quota reset or request quota increase.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromHours(1)),
[ErrorCodes.ConcurrencyLimitReached] = new(
ErrorCodes.ConcurrencyLimitReached,
ErrorCategory.RateLimited,
"Concurrency limit reached",
"Maximum concurrent operations limit reached. Reduce parallel operations or increase limit.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(1)),
[ErrorCodes.ThrottlingError] = new(
ErrorCodes.ThrottlingError,
ErrorCategory.RateLimited,
"Request throttled",
"Request was throttled due to rate limits. Retry with exponential backoff.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(2)),
// Validation errors
[ErrorCodes.InvalidPayload] = new(
ErrorCodes.InvalidPayload,
ErrorCategory.ValidationError,
"Invalid job payload",
"The job payload is malformed or invalid. Review the payload structure and fix validation errors.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.InvalidConfiguration] = new(
ErrorCodes.InvalidConfiguration,
ErrorCategory.ValidationError,
"Invalid configuration",
"Source or job configuration is invalid. Review and correct the configuration settings.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.SchemaValidationFailed] = new(
ErrorCodes.SchemaValidationFailed,
ErrorCategory.ValidationError,
"Schema validation failed",
"Input data failed schema validation. Ensure data conforms to the expected schema.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.MissingRequiredField] = new(
ErrorCodes.MissingRequiredField,
ErrorCategory.ValidationError,
"Missing required field",
"A required field is missing from the input. Provide all required fields.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.ValidationFailed] = new(
ErrorCodes.ValidationFailed,
ErrorCategory.ValidationError,
"Validation failed",
"Input validation failed. Review the error details and correct the input.",
IsRetryable: false,
SuggestedRetryDelay: null),
// Upstream errors
[ErrorCodes.RegistryError] = new(
ErrorCodes.RegistryError,
ErrorCategory.UpstreamError,
"Container registry error",
"The container registry returned an error. Check registry status and logs for details.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(5)),
[ErrorCodes.AdvisoryFeedError] = new(
ErrorCodes.AdvisoryFeedError,
ErrorCategory.UpstreamError,
"Advisory feed error",
"Error fetching from advisory feed. Check feed URL and authentication. May be temporary.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(15)),
[ErrorCodes.DatabaseError] = new(
ErrorCodes.DatabaseError,
ErrorCategory.UpstreamError,
"Database error",
"Database operation failed. Check database connectivity and status.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(1)),
[ErrorCodes.ExternalServiceError] = new(
ErrorCodes.ExternalServiceError,
ErrorCategory.UpstreamError,
"External service error",
"An external service dependency failed. Check service status and connectivity.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(5)),
// Internal errors
[ErrorCodes.InternalError] = new(
ErrorCodes.InternalError,
ErrorCategory.InternalError,
"Internal processing error",
"An internal error occurred. This may indicate a bug. Please report if persistent.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.StateCorruption] = new(
ErrorCodes.StateCorruption,
ErrorCategory.InternalError,
"State corruption detected",
"Internal state corruption detected. Manual intervention may be required.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.ProcessingError] = new(
ErrorCodes.ProcessingError,
ErrorCategory.InternalError,
"Processing error",
"Error during job processing. Review job payload and configuration.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.UnexpectedError] = new(
ErrorCodes.UnexpectedError,
ErrorCategory.InternalError,
"Unexpected error",
"An unexpected error occurred. This may indicate a bug. Please report with error details.",
IsRetryable: false,
SuggestedRetryDelay: null),
// Conflict errors
[ErrorCodes.DuplicateJob] = new(
ErrorCodes.DuplicateJob,
ErrorCategory.Conflict,
"Duplicate job detected",
"A job with the same idempotency key already exists. This is expected for retry scenarios.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.VersionMismatch] = new(
ErrorCodes.VersionMismatch,
ErrorCategory.Conflict,
"Version mismatch",
"Resource version conflict detected. Refresh and retry the operation.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromSeconds(5)),
[ErrorCodes.ConcurrentModification] = new(
ErrorCodes.ConcurrentModification,
ErrorCategory.Conflict,
"Concurrent modification",
"Resource was modified concurrently. Refresh state and retry.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromSeconds(5)),
[ErrorCodes.ConflictError] = new(
ErrorCodes.ConflictError,
ErrorCategory.Conflict,
"Resource conflict",
"A resource conflict occurred. Check for concurrent operations.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromSeconds(10)),
// Canceled errors
[ErrorCodes.UserCanceled] = new(
ErrorCodes.UserCanceled,
ErrorCategory.Canceled,
"Canceled by user",
"Operation was canceled by user request. No action required unless retry is desired.",
IsRetryable: false,
SuggestedRetryDelay: null),
[ErrorCodes.SystemCanceled] = new(
ErrorCodes.SystemCanceled,
ErrorCategory.Canceled,
"Canceled by system",
"Operation was canceled by the system (e.g., shutdown, quota). May be automatically rescheduled.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(5)),
[ErrorCodes.TimeoutCanceled] = new(
ErrorCodes.TimeoutCanceled,
ErrorCategory.Canceled,
"Canceled due to timeout",
"Operation exceeded its time limit. Consider increasing timeout or optimizing the operation.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(2)),
[ErrorCodes.OperationCanceled] = new(
ErrorCodes.OperationCanceled,
ErrorCategory.Canceled,
"Operation canceled",
"The operation was canceled. Check cancellation source for details.",
IsRetryable: false,
SuggestedRetryDelay: null)
};
/// <inheritdoc />
public ClassifiedError Classify(Exception exception)
{
ArgumentNullException.ThrowIfNull(exception);
return exception switch
{
OperationCanceledException => KnownErrors[ErrorCodes.OperationCanceled],
TimeoutException => KnownErrors[ErrorCodes.NetworkTimeout],
HttpRequestException httpEx => ClassifyHttpException(httpEx),
_ when exception.Message.Contains("connection refused", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.ConnectionRefused],
_ when exception.Message.Contains("DNS", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.DnsResolutionFailed],
_ when exception.Message.Contains("timeout", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.NetworkTimeout],
_ when exception.Message.Contains("certificate", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.CertificateError],
_ when exception.Message.Contains("unauthorized", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.AuthenticationFailed],
_ when exception.Message.Contains("forbidden", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.InsufficientPermissions],
_ => new ClassifiedError(
ErrorCodes.UnexpectedError,
ErrorCategory.InternalError,
exception.GetType().Name,
$"Unexpected error: {exception.Message}. Review stack trace for details.",
IsRetryable: false,
SuggestedRetryDelay: null)
};
}
/// <inheritdoc />
public ClassifiedError Classify(string errorCode, string message)
{
ArgumentException.ThrowIfNullOrWhiteSpace(errorCode);
if (KnownErrors.TryGetValue(errorCode, out var known))
{
return known;
}
// Try to infer from error code prefix
var category = errorCode switch
{
_ when errorCode.StartsWith("ORCH-TRN-", StringComparison.Ordinal) => ErrorCategory.Transient,
_ when errorCode.StartsWith("ORCH-NF-", StringComparison.Ordinal) => ErrorCategory.NotFound,
_ when errorCode.StartsWith("ORCH-AUTH-", StringComparison.Ordinal) => ErrorCategory.AuthFailure,
_ when errorCode.StartsWith("ORCH-RL-", StringComparison.Ordinal) => ErrorCategory.RateLimited,
_ when errorCode.StartsWith("ORCH-VAL-", StringComparison.Ordinal) => ErrorCategory.ValidationError,
_ when errorCode.StartsWith("ORCH-UP-", StringComparison.Ordinal) => ErrorCategory.UpstreamError,
_ when errorCode.StartsWith("ORCH-INT-", StringComparison.Ordinal) => ErrorCategory.InternalError,
_ when errorCode.StartsWith("ORCH-CON-", StringComparison.Ordinal) => ErrorCategory.Conflict,
_ when errorCode.StartsWith("ORCH-CAN-", StringComparison.Ordinal) => ErrorCategory.Canceled,
_ => ErrorCategory.Unknown
};
var isRetryable = category is ErrorCategory.Transient or ErrorCategory.RateLimited or ErrorCategory.UpstreamError;
return new ClassifiedError(
errorCode,
category,
message,
"Unknown error code. Review the error message for details.",
isRetryable,
isRetryable ? TimeSpan.FromMinutes(5) : null);
}
/// <inheritdoc />
public ClassifiedError ClassifyHttpError(int statusCode, string? message)
{
return statusCode switch
{
400 => KnownErrors[ErrorCodes.ValidationFailed],
401 => KnownErrors[ErrorCodes.AuthenticationFailed],
403 => KnownErrors[ErrorCodes.InsufficientPermissions],
404 => KnownErrors[ErrorCodes.ResourceNotFound],
408 => KnownErrors[ErrorCodes.NetworkTimeout],
409 => KnownErrors[ErrorCodes.ConflictError],
429 => KnownErrors[ErrorCodes.RateLimited],
500 => KnownErrors[ErrorCodes.InternalError],
502 => KnownErrors[ErrorCodes.ExternalServiceError],
503 => KnownErrors[ErrorCodes.ServiceUnavailable],
504 => KnownErrors[ErrorCodes.GatewayTimeout],
_ when statusCode >= 400 && statusCode < 500 => new ClassifiedError(
$"HTTP-{statusCode}",
ErrorCategory.ValidationError,
message ?? $"HTTP {statusCode} error",
"Client error. Review request parameters.",
IsRetryable: false,
SuggestedRetryDelay: null),
_ when statusCode >= 500 => new ClassifiedError(
$"HTTP-{statusCode}",
ErrorCategory.UpstreamError,
message ?? $"HTTP {statusCode} error",
"Server error. May be transient; retry with backoff.",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(2)),
_ => new ClassifiedError(
$"HTTP-{statusCode}",
ErrorCategory.Unknown,
message ?? $"HTTP {statusCode}",
"Unexpected HTTP status. Review response for details.",
IsRetryable: false,
SuggestedRetryDelay: null)
};
}
private ClassifiedError ClassifyHttpException(HttpRequestException ex)
{
if (ex.StatusCode.HasValue)
{
return ClassifyHttpError((int)ex.StatusCode.Value, ex.Message);
}
// No status code - likely a connection error
return ex.Message switch
{
_ when ex.Message.Contains("connection refused", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.ConnectionRefused],
_ when ex.Message.Contains("name resolution", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.DnsResolutionFailed],
_ when ex.Message.Contains("SSL", StringComparison.OrdinalIgnoreCase) ||
ex.Message.Contains("TLS", StringComparison.OrdinalIgnoreCase)
=> KnownErrors[ErrorCodes.CertificateError],
_ => KnownErrors[ErrorCodes.ExternalServiceError]
};
}
}
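// Illustrative sketch (not part of this commit): consuming the classifier from
// worker retry logic. "IErrorClassifier" is assumed from the <inheritdoc />
// markers above; only the Classify signature shown in this file is used.
public static class ErrorClassifierUsageSketch
{
    /// <summary>Decides whether a failed attempt should be retried, and after what delay.</summary>
    public static (bool Retry, TimeSpan Delay) ShouldRetry(IErrorClassifier classifier, Exception exception)
    {
        var classified = classifier.Classify(exception);

        // Transient, rate-limited, and upstream errors carry a suggested backoff;
        // validation, auth, and internal errors come back non-retryable.
        return classified.IsRetryable
            ? (true, classified.SuggestedRetryDelay ?? TimeSpan.FromMinutes(1))
            : (false, TimeSpan.Zero);
    }
}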


@@ -0,0 +1,221 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.DeadLetter;
/// <summary>
/// Repository for dead-letter entry persistence.
/// </summary>
public interface IDeadLetterRepository
{
/// <summary>Gets a dead-letter entry by ID.</summary>
Task<DeadLetterEntry?> GetByIdAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken);
/// <summary>Gets a dead-letter entry by original job ID.</summary>
Task<DeadLetterEntry?> GetByOriginalJobIdAsync(
string tenantId,
Guid originalJobId,
CancellationToken cancellationToken);
/// <summary>Lists dead-letter entries with filtering and pagination.</summary>
Task<IReadOnlyList<DeadLetterEntry>> ListAsync(
string tenantId,
DeadLetterListOptions options,
CancellationToken cancellationToken);
/// <summary>Counts dead-letter entries with filtering.</summary>
Task<long> CountAsync(
string tenantId,
DeadLetterListOptions options,
CancellationToken cancellationToken);
/// <summary>Creates a new dead-letter entry.</summary>
Task CreateAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken);
/// <summary>Updates an existing dead-letter entry.</summary>
Task<bool> UpdateAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken);
/// <summary>Gets entries pending replay that are retryable.</summary>
Task<IReadOnlyList<DeadLetterEntry>> GetPendingRetryableAsync(
string tenantId,
int limit,
CancellationToken cancellationToken);
/// <summary>Gets entries by error code.</summary>
Task<IReadOnlyList<DeadLetterEntry>> GetByErrorCodeAsync(
string tenantId,
string errorCode,
DeadLetterStatus? status,
int limit,
CancellationToken cancellationToken);
/// <summary>Gets entries by category.</summary>
Task<IReadOnlyList<DeadLetterEntry>> GetByCategoryAsync(
string tenantId,
ErrorCategory category,
DeadLetterStatus? status,
int limit,
CancellationToken cancellationToken);
/// <summary>Gets aggregated statistics.</summary>
Task<DeadLetterStats> GetStatsAsync(
string tenantId,
CancellationToken cancellationToken);
/// <summary>Gets a summary of actionable entries grouped by error code.</summary>
Task<IReadOnlyList<DeadLetterSummary>> GetActionableSummaryAsync(
string tenantId,
int limit,
CancellationToken cancellationToken);
/// <summary>Marks expired entries.</summary>
Task<int> MarkExpiredAsync(
int batchLimit,
CancellationToken cancellationToken);
/// <summary>Purges old resolved/expired entries.</summary>
Task<int> PurgeOldEntriesAsync(
int retentionDays,
int batchLimit,
CancellationToken cancellationToken);
}
/// <summary>
/// Options for listing dead-letter entries.
/// </summary>
public sealed record DeadLetterListOptions(
DeadLetterStatus? Status = null,
ErrorCategory? Category = null,
string? JobType = null,
string? ErrorCode = null,
Guid? SourceId = null,
Guid? RunId = null,
bool? IsRetryable = null,
DateTimeOffset? CreatedAfter = null,
DateTimeOffset? CreatedBefore = null,
string? Cursor = null,
int Limit = 50,
bool Ascending = false);
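// Illustrative sketch (not part of this commit): filtering the dead-letter list
// to pending, retryable transient failures via the options record above. The
// limit and category choice are placeholders.
public static class DeadLetterQuerySketch
{
    public static Task<IReadOnlyList<DeadLetterEntry>> PendingTransientsAsync(
        IDeadLetterRepository repository,
        string tenantId,
        CancellationToken cancellationToken)
        => repository.ListAsync(
            tenantId,
            new DeadLetterListOptions(
                Status: DeadLetterStatus.Pending,
                Category: ErrorCategory.Transient,
                IsRetryable: true,
                Limit: 25),
            cancellationToken);
}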
/// <summary>
/// Aggregated dead-letter statistics.
/// </summary>
public sealed record DeadLetterStats(
long TotalEntries,
long PendingEntries,
long ReplayingEntries,
long ReplayedEntries,
long ResolvedEntries,
long ExhaustedEntries,
long ExpiredEntries,
long RetryableEntries,
IReadOnlyDictionary<ErrorCategory, long> ByCategory,
IReadOnlyDictionary<string, long> TopErrorCodes,
IReadOnlyDictionary<string, long> TopJobTypes);
/// <summary>
/// Summary of dead-letter entries grouped by error code.
/// </summary>
public sealed record DeadLetterSummary(
string ErrorCode,
ErrorCategory Category,
long EntryCount,
long RetryableCount,
DateTimeOffset OldestEntry,
string? SampleReason);
/// <summary>
/// Repository for replay audit records.
/// </summary>
public interface IReplayAuditRepository
{
/// <summary>Gets audit records for an entry.</summary>
Task<IReadOnlyList<ReplayAuditRecord>> GetByEntryAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken);
/// <summary>Gets a specific audit record.</summary>
Task<ReplayAuditRecord?> GetByIdAsync(
string tenantId,
Guid auditId,
CancellationToken cancellationToken);
/// <summary>Creates a new audit record.</summary>
Task CreateAsync(
ReplayAuditRecord record,
CancellationToken cancellationToken);
/// <summary>Updates an audit record (completion).</summary>
Task<bool> UpdateAsync(
ReplayAuditRecord record,
CancellationToken cancellationToken);
/// <summary>Gets audit records for a new job ID (to find replay source).</summary>
Task<ReplayAuditRecord?> GetByNewJobIdAsync(
string tenantId,
Guid newJobId,
CancellationToken cancellationToken);
}
/// <summary>
/// Replay attempt audit record.
/// </summary>
public sealed record ReplayAuditRecord(
Guid AuditId,
string TenantId,
Guid EntryId,
int AttemptNumber,
bool Success,
Guid? NewJobId,
string? ErrorMessage,
string TriggeredBy,
DateTimeOffset TriggeredAt,
DateTimeOffset? CompletedAt,
string InitiatedBy)
{
/// <summary>Creates a new audit record for a replay attempt.</summary>
public static ReplayAuditRecord Create(
string tenantId,
Guid entryId,
int attemptNumber,
string triggeredBy,
string initiatedBy,
DateTimeOffset now) =>
new(
AuditId: Guid.NewGuid(),
TenantId: tenantId,
EntryId: entryId,
AttemptNumber: attemptNumber,
Success: false,
NewJobId: null,
ErrorMessage: null,
TriggeredBy: triggeredBy,
TriggeredAt: now,
CompletedAt: null,
InitiatedBy: initiatedBy);
/// <summary>Marks the replay as successful.</summary>
public ReplayAuditRecord Complete(Guid newJobId, DateTimeOffset now) =>
this with
{
Success = true,
NewJobId = newJobId,
CompletedAt = now
};
/// <summary>Marks the replay as failed.</summary>
public ReplayAuditRecord Fail(string errorMessage, DateTimeOffset now) =>
this with
{
Success = false,
ErrorMessage = errorMessage,
CompletedAt = now
};
}
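// Illustrative sketch (not part of this commit): the audit lifecycle the replay
// manager drives — create a pending record, then close it as completed or
// failed. Tenant and actor values are placeholders.
public static class ReplayAuditSketch
{
    public static ReplayAuditRecord RecordSuccessfulAttempt(TimeProvider time, Guid entryId, Guid newJobId)
    {
        var record = ReplayAuditRecord.Create(
            tenantId: "tenant-a",
            entryId: entryId,
            attemptNumber: 1,
            triggeredBy: "manual",
            initiatedBy: "operator@example.com",
            now: time.GetUtcNow());

        // On failure this would be record.Fail(errorMessage, now) instead.
        return record.Complete(newJobId, time.GetUtcNow());
    }
}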


@@ -0,0 +1,472 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.DeadLetter;
/// <summary>
/// Options for replay manager configuration.
/// </summary>
public sealed record ReplayManagerOptions(
/// <summary>Default maximum replay attempts.</summary>
int DefaultMaxReplayAttempts = 3,
/// <summary>Default retention period for dead-letter entries.</summary>
TimeSpan DefaultRetention = default,
/// <summary>Minimum delay between replay attempts.</summary>
TimeSpan MinReplayDelay = default,
/// <summary>Maximum batch size for bulk operations.</summary>
int MaxBatchSize = 100,
/// <summary>Enable automatic replay of retryable entries.</summary>
bool AutoReplayEnabled = false,
/// <summary>Delay before automatic replay.</summary>
TimeSpan AutoReplayDelay = default)
{
/// <summary>Default options.</summary>
public static ReplayManagerOptions Default => new(
DefaultMaxReplayAttempts: 3,
DefaultRetention: TimeSpan.FromDays(30),
MinReplayDelay: TimeSpan.FromMinutes(5),
MaxBatchSize: 100,
AutoReplayEnabled: false,
AutoReplayDelay: TimeSpan.FromMinutes(15));
}
/// <summary>
/// Result of a replay operation.
/// </summary>
public sealed record ReplayResult(
bool Success,
Guid? NewJobId,
string? ErrorMessage,
    DeadLetterEntry? UpdatedEntry);
/// <summary>
/// Result of a batch replay operation.
/// </summary>
public sealed record BatchReplayResult(
int Attempted,
int Succeeded,
int Failed,
IReadOnlyList<ReplayResult> Results);
/// <summary>
/// Manages dead-letter entry replay operations.
/// </summary>
public interface IReplayManager
{
/// <summary>Replays a single dead-letter entry.</summary>
Task<ReplayResult> ReplayAsync(
string tenantId,
Guid entryId,
string initiatedBy,
CancellationToken cancellationToken);
/// <summary>Replays multiple entries by ID.</summary>
Task<BatchReplayResult> ReplayBatchAsync(
string tenantId,
IReadOnlyList<Guid> entryIds,
string initiatedBy,
CancellationToken cancellationToken);
/// <summary>Replays all pending retryable entries matching criteria.</summary>
Task<BatchReplayResult> ReplayPendingAsync(
string tenantId,
string? errorCode,
ErrorCategory? category,
int maxCount,
string initiatedBy,
CancellationToken cancellationToken);
/// <summary>Resolves an entry without replay.</summary>
Task<DeadLetterEntry> ResolveAsync(
string tenantId,
Guid entryId,
string notes,
string resolvedBy,
CancellationToken cancellationToken);
/// <summary>Resolves multiple entries without replay.</summary>
Task<int> ResolveBatchAsync(
string tenantId,
IReadOnlyList<Guid> entryIds,
string notes,
string resolvedBy,
CancellationToken cancellationToken);
}
/// <summary>
/// Job creator interface for replay operations.
/// </summary>
public interface IJobCreator
{
/// <summary>Creates a new job from a dead-letter entry payload.</summary>
Task<Job> CreateFromReplayAsync(
string tenantId,
string jobType,
string payload,
string payloadDigest,
string idempotencyKey,
string? correlationId,
Guid replayOf,
string createdBy,
CancellationToken cancellationToken);
}
/// <summary>
/// Default replay manager implementation.
/// </summary>
public sealed class ReplayManager : IReplayManager
{
private readonly IDeadLetterRepository _deadLetterRepository;
private readonly IReplayAuditRepository _auditRepository;
private readonly IJobCreator _jobCreator;
private readonly IDeadLetterNotifier _notifier;
private readonly TimeProvider _timeProvider;
private readonly ReplayManagerOptions _options;
private readonly ILogger<ReplayManager> _logger;
public ReplayManager(
IDeadLetterRepository deadLetterRepository,
IReplayAuditRepository auditRepository,
IJobCreator jobCreator,
IDeadLetterNotifier notifier,
TimeProvider timeProvider,
ReplayManagerOptions options,
ILogger<ReplayManager> logger)
{
_deadLetterRepository = deadLetterRepository ?? throw new ArgumentNullException(nameof(deadLetterRepository));
_auditRepository = auditRepository ?? throw new ArgumentNullException(nameof(auditRepository));
_jobCreator = jobCreator ?? throw new ArgumentNullException(nameof(jobCreator));
_notifier = notifier ?? throw new ArgumentNullException(nameof(notifier));
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
_options = options ?? ReplayManagerOptions.Default;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<ReplayResult> ReplayAsync(
string tenantId,
Guid entryId,
string initiatedBy,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(initiatedBy);
var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
.ConfigureAwait(false);
if (entry is null)
{
throw new InvalidOperationException($"Dead-letter entry {entryId} not found.");
}
return await ReplayEntryAsync(entry, "manual", initiatedBy, cancellationToken).ConfigureAwait(false);
}
public async Task<BatchReplayResult> ReplayBatchAsync(
string tenantId,
IReadOnlyList<Guid> entryIds,
string initiatedBy,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentNullException.ThrowIfNull(entryIds);
ArgumentException.ThrowIfNullOrWhiteSpace(initiatedBy);
if (entryIds.Count > _options.MaxBatchSize)
{
throw new ArgumentException($"Batch size {entryIds.Count} exceeds maximum {_options.MaxBatchSize}.");
}
var results = new List<ReplayResult>();
var succeeded = 0;
var failed = 0;
foreach (var entryId in entryIds)
{
try
{
var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
.ConfigureAwait(false);
if (entry is null)
{
results.Add(new ReplayResult(
Success: false,
NewJobId: null,
ErrorMessage: $"Entry {entryId} not found.",
                        UpdatedEntry: null));
failed++;
continue;
}
var result = await ReplayEntryAsync(entry, "batch", initiatedBy, cancellationToken)
.ConfigureAwait(false);
results.Add(result);
if (result.Success)
succeeded++;
else
failed++;
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to replay entry {EntryId}", entryId);
results.Add(new ReplayResult(
Success: false,
NewJobId: null,
ErrorMessage: ex.Message,
                    UpdatedEntry: null));
failed++;
}
}
return new BatchReplayResult(
Attempted: entryIds.Count,
Succeeded: succeeded,
Failed: failed,
Results: results);
}
public async Task<BatchReplayResult> ReplayPendingAsync(
string tenantId,
string? errorCode,
ErrorCategory? category,
int maxCount,
string initiatedBy,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(initiatedBy);
var effectiveLimit = Math.Min(maxCount, _options.MaxBatchSize);
IReadOnlyList<DeadLetterEntry> entries;
if (!string.IsNullOrEmpty(errorCode))
{
entries = await _deadLetterRepository.GetByErrorCodeAsync(
tenantId, errorCode, DeadLetterStatus.Pending, effectiveLimit, cancellationToken)
.ConfigureAwait(false);
}
else if (category.HasValue)
{
entries = await _deadLetterRepository.GetByCategoryAsync(
tenantId, category.Value, DeadLetterStatus.Pending, effectiveLimit, cancellationToken)
.ConfigureAwait(false);
}
else
{
entries = await _deadLetterRepository.GetPendingRetryableAsync(tenantId, effectiveLimit, cancellationToken)
.ConfigureAwait(false);
}
var results = new List<ReplayResult>();
var succeeded = 0;
var failed = 0;
foreach (var entry in entries)
{
if (!entry.CanReplay)
{
continue;
}
try
{
var result = await ReplayEntryAsync(entry, "auto", initiatedBy, cancellationToken)
.ConfigureAwait(false);
results.Add(result);
if (result.Success)
succeeded++;
else
failed++;
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to replay entry {EntryId}", entry.EntryId);
results.Add(new ReplayResult(
Success: false,
NewJobId: null,
ErrorMessage: ex.Message,
UpdatedEntry: entry));
failed++;
}
}
return new BatchReplayResult(
Attempted: results.Count,
Succeeded: succeeded,
Failed: failed,
Results: results);
}
public async Task<DeadLetterEntry> ResolveAsync(
string tenantId,
Guid entryId,
string notes,
string resolvedBy,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(resolvedBy);
var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
.ConfigureAwait(false);
if (entry is null)
{
throw new InvalidOperationException($"Dead-letter entry {entryId} not found.");
}
var now = _timeProvider.GetUtcNow();
var resolved = entry.Resolve(notes, resolvedBy, now);
await _deadLetterRepository.UpdateAsync(resolved, cancellationToken).ConfigureAwait(false);
_logger.LogInformation(
"Resolved dead-letter entry {EntryId} for job {JobId}. Notes: {Notes}",
entryId, entry.OriginalJobId, notes);
return resolved;
}
public async Task<int> ResolveBatchAsync(
string tenantId,
IReadOnlyList<Guid> entryIds,
string notes,
string resolvedBy,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentNullException.ThrowIfNull(entryIds);
ArgumentException.ThrowIfNullOrWhiteSpace(resolvedBy);
var resolved = 0;
var now = _timeProvider.GetUtcNow();
foreach (var entryId in entryIds)
{
try
{
var entry = await _deadLetterRepository.GetByIdAsync(tenantId, entryId, cancellationToken)
.ConfigureAwait(false);
if (entry is null || entry.IsTerminal)
{
continue;
}
var resolvedEntry = entry.Resolve(notes, resolvedBy, now);
await _deadLetterRepository.UpdateAsync(resolvedEntry, cancellationToken).ConfigureAwait(false);
resolved++;
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to resolve entry {EntryId}", entryId);
}
}
return resolved;
}
private async Task<ReplayResult> ReplayEntryAsync(
DeadLetterEntry entry,
string triggeredBy,
string initiatedBy,
CancellationToken cancellationToken)
{
if (!entry.CanReplay)
{
return new ReplayResult(
Success: false,
NewJobId: null,
ErrorMessage: $"Entry cannot be replayed: status={entry.Status}, attempts={entry.ReplayAttempts}/{entry.MaxReplayAttempts}, retryable={entry.IsRetryable}",
UpdatedEntry: entry);
}
var now = _timeProvider.GetUtcNow();
// Mark entry as replaying
var replaying = entry.StartReplay(initiatedBy, now);
await _deadLetterRepository.UpdateAsync(replaying, cancellationToken).ConfigureAwait(false);
// Create audit record
var auditRecord = ReplayAuditRecord.Create(
entry.TenantId,
entry.EntryId,
replaying.ReplayAttempts,
triggeredBy,
initiatedBy,
now);
await _auditRepository.CreateAsync(auditRecord, cancellationToken).ConfigureAwait(false);
try
{
// Create new job with updated idempotency key
var newIdempotencyKey = $"{entry.IdempotencyKey}:replay:{replaying.ReplayAttempts}";
var newJob = await _jobCreator.CreateFromReplayAsync(
entry.TenantId,
entry.JobType,
entry.Payload,
entry.PayloadDigest,
newIdempotencyKey,
entry.CorrelationId,
entry.OriginalJobId,
initiatedBy,
cancellationToken).ConfigureAwait(false);
// Mark replay successful
now = _timeProvider.GetUtcNow();
var completed = replaying.CompleteReplay(newJob.JobId, initiatedBy, now);
await _deadLetterRepository.UpdateAsync(completed, cancellationToken).ConfigureAwait(false);
// Update audit record
var completedAudit = auditRecord.Complete(newJob.JobId, now);
await _auditRepository.UpdateAsync(completedAudit, cancellationToken).ConfigureAwait(false);
_logger.LogInformation(
"Replayed dead-letter entry {EntryId} as new job {NewJobId}",
entry.EntryId, newJob.JobId);
// Notify on success
await _notifier.NotifyReplaySuccessAsync(completed, newJob.JobId, cancellationToken)
.ConfigureAwait(false);
return new ReplayResult(
Success: true,
NewJobId: newJob.JobId,
ErrorMessage: null,
UpdatedEntry: completed);
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to replay entry {EntryId}", entry.EntryId);
// Mark replay failed
now = _timeProvider.GetUtcNow();
var failed = replaying.FailReplay(ex.Message, initiatedBy, now);
await _deadLetterRepository.UpdateAsync(failed, cancellationToken).ConfigureAwait(false);
// Update audit record
var failedAudit = auditRecord.Fail(ex.Message, now);
await _auditRepository.UpdateAsync(failedAudit, cancellationToken).ConfigureAwait(false);
// Notify on exhausted
if (failed.Status == DeadLetterStatus.Exhausted)
{
await _notifier.NotifyExhaustedAsync(failed, cancellationToken).ConfigureAwait(false);
}
return new ReplayResult(
Success: false,
NewJobId: null,
ErrorMessage: ex.Message,
UpdatedEntry: failed);
}
}
}
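// Illustrative sketch (not part of this commit): invoking the replay manager
// from an operator-facing handler. The handler shape and actor identity are
// assumptions; ReplayAsync and ReplayResult come from the types above.
public static class ReplayEndpointSketch
{
    public static async Task<string> ReplayOneAsync(
        IReplayManager replayManager,
        string tenantId,
        Guid entryId,
        CancellationToken cancellationToken)
    {
        var result = await replayManager.ReplayAsync(tenantId, entryId, "operator@example.com", cancellationToken)
            .ConfigureAwait(false);

        return result.Success
            ? $"Replayed as job {result.NewJobId}."
            : $"Replay failed: {result.ErrorMessage}";
    }
}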


@@ -0,0 +1,39 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents an artifact produced by a job execution.
/// Artifacts are immutable outputs with content digests for provenance.
/// </summary>
public sealed record Artifact(
/// <summary>Unique artifact identifier.</summary>
Guid ArtifactId,
/// <summary>Tenant owning this artifact.</summary>
string TenantId,
/// <summary>Job that produced this artifact.</summary>
Guid JobId,
/// <summary>Run containing the producing job (if any).</summary>
Guid? RunId,
/// <summary>Artifact type (e.g., "sbom", "scan-result", "attestation", "log").</summary>
string ArtifactType,
/// <summary>Storage URI (e.g., "s3://bucket/path", "file:///local/path").</summary>
string Uri,
/// <summary>Content digest (SHA-256) for integrity verification.</summary>
string Digest,
/// <summary>MIME type (e.g., "application/json", "application/vnd.cyclonedx+json").</summary>
string? MimeType,
/// <summary>Artifact size in bytes.</summary>
long? SizeBytes,
/// <summary>When the artifact was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>Optional metadata JSON blob.</summary>
string? Metadata);
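// Illustrative sketch (not part of this commit): computing the content digest
// recorded on an Artifact. The "sha256:" prefix convention is an assumption;
// the record above only requires a digest string.
public static class ArtifactDigestSketch
{
    public static string ComputeDigest(byte[] content)
    {
        // SHA-256 over the raw artifact bytes, lower-case hex, algorithm-prefixed.
        var hash = System.Security.Cryptography.SHA256.HashData(content);
        return "sha256:" + Convert.ToHexString(hash).ToLowerInvariant();
    }
}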


@@ -0,0 +1,250 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents an immutable audit log entry for orchestrator operations.
/// Captures who did what, when, and with what effect.
/// </summary>
public sealed record AuditEntry(
/// <summary>Unique audit entry identifier.</summary>
Guid EntryId,
/// <summary>Tenant owning this entry.</summary>
string TenantId,
/// <summary>Type of audited event.</summary>
AuditEventType EventType,
/// <summary>Resource type being audited (job, run, source, quota, etc.).</summary>
string ResourceType,
/// <summary>Resource identifier being audited.</summary>
Guid ResourceId,
/// <summary>Actor who performed the action.</summary>
string ActorId,
/// <summary>Actor type (user, system, worker, api-key).</summary>
ActorType ActorType,
/// <summary>IP address of the actor (if applicable).</summary>
string? ActorIp,
/// <summary>User agent string (if applicable).</summary>
string? UserAgent,
/// <summary>HTTP method used (if applicable).</summary>
string? HttpMethod,
/// <summary>Request path (if applicable).</summary>
string? RequestPath,
/// <summary>State before the change (JSON).</summary>
string? OldState,
/// <summary>State after the change (JSON).</summary>
string? NewState,
/// <summary>Human-readable description of the change.</summary>
string Description,
/// <summary>Correlation ID for distributed tracing.</summary>
string? CorrelationId,
/// <summary>SHA-256 hash of the previous entry for chain integrity.</summary>
string? PreviousEntryHash,
/// <summary>SHA-256 hash of this entry's content for integrity.</summary>
string ContentHash,
/// <summary>Sequence number within the tenant's audit stream.</summary>
long SequenceNumber,
/// <summary>When the event occurred.</summary>
DateTimeOffset OccurredAt,
/// <summary>Optional metadata JSON blob.</summary>
string? Metadata)
{
/// <summary>
/// Creates a new audit entry with computed hash.
/// </summary>
public static AuditEntry Create(
string tenantId,
AuditEventType eventType,
string resourceType,
Guid resourceId,
string actorId,
ActorType actorType,
string description,
string? oldState = null,
string? newState = null,
string? actorIp = null,
string? userAgent = null,
string? httpMethod = null,
string? requestPath = null,
string? correlationId = null,
string? previousEntryHash = null,
long sequenceNumber = 0,
string? metadata = null)
{
var entryId = Guid.NewGuid();
var occurredAt = DateTimeOffset.UtcNow;
// Compute content hash from entry data
var contentToHash = $"{entryId}|{tenantId}|{eventType}|{resourceType}|{resourceId}|{actorId}|{actorType}|{description}|{oldState}|{newState}|{occurredAt:O}|{sequenceNumber}";
var contentHash = ComputeSha256(contentToHash);
return new AuditEntry(
EntryId: entryId,
TenantId: tenantId,
EventType: eventType,
ResourceType: resourceType,
ResourceId: resourceId,
ActorId: actorId,
ActorType: actorType,
ActorIp: actorIp,
UserAgent: userAgent,
HttpMethod: httpMethod,
RequestPath: requestPath,
OldState: oldState,
NewState: newState,
Description: description,
CorrelationId: correlationId,
PreviousEntryHash: previousEntryHash,
ContentHash: contentHash,
SequenceNumber: sequenceNumber,
OccurredAt: occurredAt,
Metadata: metadata);
}
/// <summary>
/// Verifies the integrity of this entry's content hash.
/// </summary>
public bool VerifyIntegrity()
{
var contentToHash = $"{EntryId}|{TenantId}|{EventType}|{ResourceType}|{ResourceId}|{ActorId}|{ActorType}|{Description}|{OldState}|{NewState}|{OccurredAt:O}|{SequenceNumber}";
var computed = ComputeSha256(contentToHash);
return string.Equals(ContentHash, computed, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Verifies the chain link to the previous entry.
/// </summary>
public bool VerifyChainLink(AuditEntry? previousEntry)
{
if (previousEntry is null)
{
return PreviousEntryHash is null || SequenceNumber == 1;
}
return string.Equals(PreviousEntryHash, previousEntry.ContentHash, StringComparison.OrdinalIgnoreCase);
}
private static string ComputeSha256(string content)
{
var bytes = System.Text.Encoding.UTF8.GetBytes(content);
var hash = System.Security.Cryptography.SHA256.HashData(bytes);
return Convert.ToHexString(hash).ToLowerInvariant();
}
}
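// Illustrative sketch (not part of this commit): appending to and verifying the
// tamper-evident chain. Linking each entry's PreviousEntryHash to the prior
// ContentHash is exactly what VerifyChainLink above checks; names are placeholders.
public static class AuditChainSketch
{
    public static AuditEntry Append(AuditEntry? last, string tenantId, Guid jobId)
        => AuditEntry.Create(
            tenantId,
            AuditEventType.JobCreated,
            resourceType: "job",
            resourceId: jobId,
            actorId: "scheduler",
            actorType: ActorType.System,
            description: "Job created",
            previousEntryHash: last?.ContentHash,
            sequenceNumber: (last?.SequenceNumber ?? 0) + 1);

    public static bool VerifyChain(IReadOnlyList<AuditEntry> entries)
    {
        AuditEntry? previous = null;
        foreach (var entry in entries)
        {
            // Both the per-entry hash and the link to the previous entry must hold.
            if (!entry.VerifyIntegrity() || !entry.VerifyChainLink(previous))
            {
                return false;
            }
            previous = entry;
        }
        return true;
    }
}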
/// <summary>
/// Types of auditable events in the orchestrator.
/// </summary>
public enum AuditEventType
{
// Job lifecycle events
JobCreated = 100,
JobScheduled = 101,
JobLeased = 102,
JobCompleted = 103,
JobFailed = 104,
JobCanceled = 105,
JobRetried = 106,
// Run lifecycle events
RunCreated = 200,
RunStarted = 201,
RunCompleted = 202,
RunFailed = 203,
RunCanceled = 204,
// Source management events
SourceCreated = 300,
SourceUpdated = 301,
SourcePaused = 302,
SourceResumed = 303,
SourceDeleted = 304,
// Quota management events
QuotaCreated = 400,
QuotaUpdated = 401,
QuotaPaused = 402,
QuotaResumed = 403,
QuotaDeleted = 404,
// SLO management events
SloCreated = 500,
SloUpdated = 501,
SloEnabled = 502,
SloDisabled = 503,
SloDeleted = 504,
SloAlertTriggered = 505,
SloAlertAcknowledged = 506,
SloAlertResolved = 507,
// Dead-letter events
DeadLetterCreated = 600,
DeadLetterReplayed = 601,
DeadLetterResolved = 602,
DeadLetterExpired = 603,
// Backfill events
BackfillCreated = 700,
BackfillStarted = 701,
BackfillCompleted = 702,
BackfillFailed = 703,
BackfillCanceled = 704,
// Ledger events
LedgerExportRequested = 800,
LedgerExportCompleted = 801,
LedgerExportFailed = 802,
// Worker events
WorkerClaimed = 900,
WorkerHeartbeat = 901,
WorkerProgressReported = 902,
WorkerCompleted = 903,
// Security events
AuthenticationSuccess = 1000,
AuthenticationFailure = 1001,
AuthorizationDenied = 1002,
ApiKeyCreated = 1003,
ApiKeyRevoked = 1004
}
/// <summary>
/// Types of actors that can perform auditable actions.
/// </summary>
public enum ActorType
{
/// <summary>Human user via UI or API.</summary>
User = 0,
/// <summary>System-initiated action (scheduler, background job).</summary>
System = 1,
/// <summary>Worker process.</summary>
Worker = 2,
/// <summary>API key authentication.</summary>
ApiKey = 3,
/// <summary>Service-to-service call.</summary>
Service = 4,
/// <summary>Unknown or unidentified actor.</summary>
Unknown = 99
}


@@ -0,0 +1,429 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a request to backfill/reprocess events within a time window.
/// </summary>
public sealed record BackfillRequest(
/// <summary>Unique backfill request identifier.</summary>
Guid BackfillId,
/// <summary>Tenant this backfill applies to.</summary>
string TenantId,
/// <summary>Source to backfill (null if job-type scoped).</summary>
Guid? SourceId,
/// <summary>Job type to backfill (null if source-scoped).</summary>
string? JobType,
/// <summary>Normalized scope key.</summary>
string ScopeKey,
/// <summary>Current status of the backfill.</summary>
BackfillStatus Status,
/// <summary>Start of the time window to backfill (inclusive).</summary>
DateTimeOffset WindowStart,
/// <summary>End of the time window to backfill (exclusive).</summary>
DateTimeOffset WindowEnd,
/// <summary>Current processing position within the window.</summary>
DateTimeOffset? CurrentPosition,
/// <summary>Total events estimated in the window.</summary>
long? TotalEvents,
/// <summary>Events successfully processed.</summary>
long ProcessedEvents,
/// <summary>Events skipped due to duplicate suppression.</summary>
long SkippedEvents,
/// <summary>Events that failed processing.</summary>
long FailedEvents,
/// <summary>Number of events to process per batch.</summary>
int BatchSize,
/// <summary>Whether this is a dry-run (preview only, no changes).</summary>
bool DryRun,
/// <summary>Whether to force reprocessing (ignore duplicate suppression).</summary>
bool ForceReprocess,
/// <summary>Estimated duration for the backfill.</summary>
TimeSpan? EstimatedDuration,
/// <summary>Maximum allowed duration (safety limit).</summary>
TimeSpan? MaxDuration,
/// <summary>Results of safety validation checks.</summary>
BackfillSafetyChecks? SafetyChecks,
/// <summary>Reason for the backfill request.</summary>
string Reason,
/// <summary>Optional ticket reference for audit.</summary>
string? Ticket,
/// <summary>When the request was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When processing started.</summary>
DateTimeOffset? StartedAt,
/// <summary>When processing completed.</summary>
DateTimeOffset? CompletedAt,
/// <summary>Actor who created the request.</summary>
string CreatedBy,
/// <summary>Actor who last modified the request.</summary>
string UpdatedBy,
/// <summary>Error message if failed.</summary>
string? ErrorMessage)
{
/// <summary>
/// Window duration.
/// </summary>
public TimeSpan WindowDuration => WindowEnd - WindowStart;
/// <summary>
/// Progress percentage (0-100).
/// </summary>
public double ProgressPercent => TotalEvents > 0
? Math.Round((double)(ProcessedEvents + SkippedEvents + FailedEvents) / TotalEvents.Value * 100, 2)
: 0;
/// <summary>
/// Whether the backfill is in a terminal state.
/// </summary>
public bool IsTerminal => Status is BackfillStatus.Completed or BackfillStatus.Failed or BackfillStatus.Canceled;
/// <summary>
/// Creates a new backfill request.
/// </summary>
public static BackfillRequest Create(
string tenantId,
Guid? sourceId,
string? jobType,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
string reason,
string createdBy,
int batchSize = 100,
bool dryRun = false,
bool forceReprocess = false,
string? ticket = null,
TimeSpan? maxDuration = null)
{
if (windowEnd <= windowStart)
throw new ArgumentException("Window end must be after window start.", nameof(windowEnd));
if (batchSize <= 0 || batchSize > 10000)
throw new ArgumentOutOfRangeException(nameof(batchSize), "Batch size must be between 1 and 10000.");
var scopeKey = (sourceId, jobType) switch
{
(Guid s, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(s, j),
(Guid s, _) => Watermark.CreateScopeKey(s),
(_, string j) when !string.IsNullOrEmpty(j) => Watermark.CreateScopeKey(j),
_ => throw new ArgumentException("Either sourceId or jobType must be specified.")
};
var now = DateTimeOffset.UtcNow;
return new BackfillRequest(
BackfillId: Guid.NewGuid(),
TenantId: tenantId,
SourceId: sourceId,
JobType: jobType,
ScopeKey: scopeKey,
Status: BackfillStatus.Pending,
WindowStart: windowStart,
WindowEnd: windowEnd,
CurrentPosition: null,
TotalEvents: null,
ProcessedEvents: 0,
SkippedEvents: 0,
FailedEvents: 0,
BatchSize: batchSize,
DryRun: dryRun,
ForceReprocess: forceReprocess,
EstimatedDuration: null,
MaxDuration: maxDuration,
SafetyChecks: null,
Reason: reason,
Ticket: ticket,
CreatedAt: now,
StartedAt: null,
CompletedAt: null,
CreatedBy: createdBy,
UpdatedBy: createdBy,
ErrorMessage: null);
}
/// <summary>
/// Transitions to validating status.
/// </summary>
public BackfillRequest StartValidation(string updatedBy)
{
if (Status != BackfillStatus.Pending)
throw new InvalidOperationException($"Cannot start validation from status {Status}.");
return this with
{
Status = BackfillStatus.Validating,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Records safety check results.
/// </summary>
public BackfillRequest WithSafetyChecks(BackfillSafetyChecks checks, long? totalEvents, TimeSpan? estimatedDuration, string updatedBy)
{
return this with
{
SafetyChecks = checks,
TotalEvents = totalEvents,
EstimatedDuration = estimatedDuration,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Transitions to running status.
/// </summary>
public BackfillRequest Start(string updatedBy)
{
if (Status != BackfillStatus.Validating)
throw new InvalidOperationException($"Cannot start from status {Status}.");
if (SafetyChecks?.HasBlockingIssues == true)
throw new InvalidOperationException("Cannot start backfill with blocking safety issues.");
return this with
{
Status = BackfillStatus.Running,
StartedAt = DateTimeOffset.UtcNow,
CurrentPosition = WindowStart,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Updates progress after processing a batch.
/// </summary>
public BackfillRequest UpdateProgress(
DateTimeOffset newPosition,
long processed,
long skipped,
long failed,
string updatedBy)
{
if (Status != BackfillStatus.Running)
throw new InvalidOperationException($"Cannot update progress in status {Status}.");
return this with
{
CurrentPosition = newPosition,
ProcessedEvents = ProcessedEvents + processed,
SkippedEvents = SkippedEvents + skipped,
FailedEvents = FailedEvents + failed,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Pauses the backfill.
/// </summary>
public BackfillRequest Pause(string updatedBy)
{
if (Status != BackfillStatus.Running)
throw new InvalidOperationException($"Cannot pause from status {Status}.");
return this with
{
Status = BackfillStatus.Paused,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Resumes a paused backfill.
/// </summary>
public BackfillRequest Resume(string updatedBy)
{
if (Status != BackfillStatus.Paused)
throw new InvalidOperationException($"Cannot resume from status {Status}.");
return this with
{
Status = BackfillStatus.Running,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Completes the backfill successfully.
/// </summary>
public BackfillRequest Complete(string updatedBy)
{
if (Status != BackfillStatus.Running)
throw new InvalidOperationException($"Cannot complete from status {Status}.");
return this with
{
Status = BackfillStatus.Completed,
CompletedAt = DateTimeOffset.UtcNow,
CurrentPosition = WindowEnd,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Fails the backfill with an error.
/// </summary>
public BackfillRequest Fail(string error, string updatedBy)
{
return this with
{
Status = BackfillStatus.Failed,
CompletedAt = DateTimeOffset.UtcNow,
ErrorMessage = error,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Cancels the backfill.
/// </summary>
public BackfillRequest Cancel(string updatedBy)
{
if (IsTerminal)
throw new InvalidOperationException($"Cannot cancel from terminal status {Status}.");
return this with
{
Status = BackfillStatus.Canceled,
CompletedAt = DateTimeOffset.UtcNow,
UpdatedBy = updatedBy
};
}
}
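// Illustrative sketch (not part of this commit): one full pass through the
// state machine above. Tenant, actor, window, and counts are placeholders.
public static class BackfillLifecycleSketch
{
    public static BackfillRequest Example()
    {
        var actor = "operator@example.com";
        var request = BackfillRequest.Create(
            tenantId: "tenant-a",
            sourceId: Guid.NewGuid(),
            jobType: null,
            windowStart: DateTimeOffset.UtcNow.AddDays(-1),
            windowEnd: DateTimeOffset.UtcNow,
            reason: "Reprocess after parser fix",
            createdBy: actor);

        // Pending -> Validating -> Running, gated on safety checks.
        request = request
            .StartValidation(actor)
            .WithSafetyChecks(
                BackfillSafetyChecks.AllPassed(),
                totalEvents: 5_000,
                estimatedDuration: TimeSpan.FromMinutes(50),
                updatedBy: actor)
            .Start(actor);

        // One batch processed, then the terminal Completed state.
        request = request.UpdateProgress(
            newPosition: request.WindowStart.AddHours(1),
            processed: 90,
            skipped: 8,
            failed: 2,
            updatedBy: actor);
        return request.Complete(actor);
    }
}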
/// <summary>
/// Status of a backfill request.
/// </summary>
public enum BackfillStatus
{
/// <summary>Request created, awaiting validation.</summary>
Pending,
/// <summary>Running safety validations.</summary>
Validating,
/// <summary>Actively processing events.</summary>
Running,
/// <summary>Temporarily paused.</summary>
Paused,
/// <summary>Successfully completed.</summary>
Completed,
/// <summary>Failed with error.</summary>
Failed,
/// <summary>Canceled by operator.</summary>
Canceled
}
/// <summary>
/// Results of backfill safety validation checks.
/// </summary>
public sealed record BackfillSafetyChecks(
/// <summary>Whether the source exists and is accessible.</summary>
bool SourceExists,
/// <summary>Whether there are overlapping active backfills.</summary>
bool HasOverlappingBackfill,
/// <summary>Whether the window is within retention period.</summary>
bool WithinRetention,
/// <summary>Whether the estimated event count is within limits.</summary>
bool WithinEventLimit,
/// <summary>Whether estimated duration is within max duration.</summary>
bool WithinDurationLimit,
/// <summary>Whether required quotas are available.</summary>
bool QuotaAvailable,
/// <summary>Warning messages (non-blocking).</summary>
IReadOnlyList<string> Warnings,
/// <summary>Error messages (blocking).</summary>
IReadOnlyList<string> Errors)
{
/// <summary>
/// Whether there are any blocking issues.
/// </summary>
public bool HasBlockingIssues => !SourceExists || HasOverlappingBackfill || !WithinRetention
|| !WithinEventLimit || !WithinDurationLimit || Errors.Count > 0;
/// <summary>
/// Whether the backfill is safe to proceed.
/// </summary>
public bool IsSafe => !HasBlockingIssues;
/// <summary>
/// Creates successful safety checks with no issues.
/// </summary>
public static BackfillSafetyChecks AllPassed() => new(
SourceExists: true,
HasOverlappingBackfill: false,
WithinRetention: true,
WithinEventLimit: true,
WithinDurationLimit: true,
QuotaAvailable: true,
Warnings: [],
Errors: []);
}
/// <summary>
/// Preview result for dry-run backfill.
/// </summary>
public sealed record BackfillPreview(
/// <summary>Scope being backfilled.</summary>
string ScopeKey,
    /// <summary>Start of the time window to backfill (inclusive).</summary>
    DateTimeOffset WindowStart,
    /// <summary>End of the time window to backfill (exclusive).</summary>
    DateTimeOffset WindowEnd,
/// <summary>Estimated total events in window.</summary>
long EstimatedEvents,
/// <summary>Events that would be skipped (already processed).</summary>
long SkippedEvents,
/// <summary>Events that would be processed.</summary>
long ProcessableEvents,
/// <summary>Estimated duration.</summary>
TimeSpan EstimatedDuration,
/// <summary>Number of batches required.</summary>
int EstimatedBatches,
/// <summary>Safety validation results.</summary>
BackfillSafetyChecks SafetyChecks,
/// <summary>Sample of event keys that would be processed.</summary>
IReadOnlyList<string> SampleEventKeys);


@@ -0,0 +1,42 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a dependency edge in a job DAG (Directed Acyclic Graph).
/// When the child job may start depends on the parent's outcome and the edge type
/// (see <see cref="DagEdgeTypes"/>).
/// </summary>
public sealed record DagEdge(
/// <summary>Unique edge identifier.</summary>
Guid EdgeId,
/// <summary>Tenant owning this edge.</summary>
string TenantId,
/// <summary>Run containing these jobs.</summary>
Guid RunId,
/// <summary>Parent job ID (must complete first).</summary>
Guid ParentJobId,
/// <summary>Child job ID (depends on parent).</summary>
Guid ChildJobId,
/// <summary>Edge type (e.g., "success", "always", "failure").</summary>
string EdgeType,
/// <summary>When this edge was created.</summary>
DateTimeOffset CreatedAt);
/// <summary>
/// Edge types defining dependency semantics.
/// </summary>
public static class DagEdgeTypes
{
/// <summary>Child runs only if parent succeeds.</summary>
public const string Success = "success";
/// <summary>Child runs regardless of parent outcome.</summary>
public const string Always = "always";
/// <summary>Child runs only if parent fails.</summary>
public const string Failure = "failure";
}
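// Illustrative sketch (not part of this commit): a two-edge fragment where a
// report job depends on the scan succeeding and a cleanup job always runs.
// Identifiers are placeholders.
public static class DagEdgeSketch
{
    public static DagEdge[] Link(string tenantId, Guid runId, Guid scanJobId, Guid reportJobId, Guid cleanupJobId)
    {
        var now = DateTimeOffset.UtcNow;
        return
        [
            new DagEdge(Guid.NewGuid(), tenantId, runId, scanJobId, reportJobId, DagEdgeTypes.Success, now),
            new DagEdge(Guid.NewGuid(), tenantId, runId, scanJobId, cleanupJobId, DagEdgeTypes.Always, now)
        ];
    }
}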


@@ -0,0 +1,292 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a job that has been moved to the dead-letter store after exhausting retries
/// or encountering a non-retryable error.
/// </summary>
public sealed record DeadLetterEntry(
/// <summary>Unique dead-letter entry identifier.</summary>
Guid EntryId,
/// <summary>Tenant owning this entry.</summary>
string TenantId,
/// <summary>Original job that failed.</summary>
Guid OriginalJobId,
/// <summary>Run the job belonged to (if any).</summary>
Guid? RunId,
/// <summary>Source the job was processing (if any).</summary>
Guid? SourceId,
/// <summary>Job type (e.g., "scan.image", "advisory.nvd").</summary>
string JobType,
/// <summary>Job payload JSON (inputs, parameters).</summary>
string Payload,
/// <summary>SHA-256 digest of the payload.</summary>
string PayloadDigest,
/// <summary>Idempotency key from original job.</summary>
string IdempotencyKey,
/// <summary>Correlation ID for distributed tracing.</summary>
string? CorrelationId,
/// <summary>Current entry status.</summary>
DeadLetterStatus Status,
/// <summary>Classified error code.</summary>
string ErrorCode,
/// <summary>Human-readable failure reason.</summary>
string FailureReason,
/// <summary>Suggested remediation hint for operators.</summary>
string? RemediationHint,
/// <summary>Error classification category.</summary>
ErrorCategory Category,
/// <summary>Whether this error is potentially retryable.</summary>
bool IsRetryable,
/// <summary>Number of attempts made by original job.</summary>
int OriginalAttempts,
/// <summary>Number of replay attempts from dead-letter.</summary>
int ReplayAttempts,
/// <summary>Maximum replay attempts allowed.</summary>
int MaxReplayAttempts,
/// <summary>When the job originally failed.</summary>
DateTimeOffset FailedAt,
/// <summary>When the entry was created in dead-letter store.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the entry was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>When the entry expires and can be purged.</summary>
DateTimeOffset ExpiresAt,
/// <summary>When the entry was resolved (if applicable).</summary>
DateTimeOffset? ResolvedAt,
/// <summary>Resolution notes (if resolved).</summary>
string? ResolutionNotes,
/// <summary>Actor who created/submitted the original job.</summary>
string CreatedBy,
/// <summary>Actor who last updated the entry.</summary>
string UpdatedBy)
{
/// <summary>Default retention period for dead-letter entries.</summary>
public static readonly TimeSpan DefaultRetention = TimeSpan.FromDays(30);
/// <summary>Default maximum replay attempts.</summary>
public const int DefaultMaxReplayAttempts = 3;
/// <summary>Whether this entry is in a terminal state.</summary>
public bool IsTerminal => Status is DeadLetterStatus.Replayed
or DeadLetterStatus.Resolved
or DeadLetterStatus.Exhausted
or DeadLetterStatus.Expired;
/// <summary>Whether more replay attempts are allowed.</summary>
public bool CanReplay => !IsTerminal && IsRetryable && ReplayAttempts < MaxReplayAttempts;
/// <summary>Creates a new dead-letter entry from a failed job.</summary>
public static DeadLetterEntry FromFailedJob(
Job job,
string errorCode,
string failureReason,
string? remediationHint,
ErrorCategory category,
bool isRetryable,
DateTimeOffset now,
TimeSpan? retention = null,
int? maxReplayAttempts = null)
{
ArgumentNullException.ThrowIfNull(job);
ArgumentException.ThrowIfNullOrWhiteSpace(errorCode);
ArgumentException.ThrowIfNullOrWhiteSpace(failureReason);
var effectiveRetention = retention ?? DefaultRetention;
var effectiveMaxReplays = maxReplayAttempts ?? DefaultMaxReplayAttempts;
return new DeadLetterEntry(
EntryId: Guid.NewGuid(),
TenantId: job.TenantId,
OriginalJobId: job.JobId,
RunId: job.RunId,
SourceId: null, // Would be extracted from payload if available
JobType: job.JobType,
Payload: job.Payload,
PayloadDigest: job.PayloadDigest,
IdempotencyKey: job.IdempotencyKey,
CorrelationId: job.CorrelationId,
Status: DeadLetterStatus.Pending,
ErrorCode: errorCode,
FailureReason: failureReason,
RemediationHint: remediationHint,
Category: category,
IsRetryable: isRetryable,
OriginalAttempts: job.Attempt,
ReplayAttempts: 0,
MaxReplayAttempts: effectiveMaxReplays,
FailedAt: job.CompletedAt ?? now,
CreatedAt: now,
UpdatedAt: now,
ExpiresAt: now.Add(effectiveRetention),
ResolvedAt: null,
ResolutionNotes: null,
CreatedBy: job.CreatedBy,
UpdatedBy: "system");
}
/// <summary>Marks entry as being replayed.</summary>
public DeadLetterEntry StartReplay(string updatedBy, DateTimeOffset now)
{
if (!CanReplay)
throw new InvalidOperationException($"Cannot replay entry in status {Status} with {ReplayAttempts}/{MaxReplayAttempts} attempts.");
return this with
{
Status = DeadLetterStatus.Replaying,
ReplayAttempts = ReplayAttempts + 1,
UpdatedAt = now,
UpdatedBy = updatedBy
};
}
/// <summary>Marks entry as successfully replayed.</summary>
public DeadLetterEntry CompleteReplay(Guid newJobId, string updatedBy, DateTimeOffset now)
{
if (Status != DeadLetterStatus.Replaying)
throw new InvalidOperationException($"Cannot complete replay from status {Status}.");
return this with
{
Status = DeadLetterStatus.Replayed,
ResolvedAt = now,
ResolutionNotes = $"Replayed as job {newJobId}",
UpdatedAt = now,
UpdatedBy = updatedBy
};
}
/// <summary>Marks replay as failed.</summary>
public DeadLetterEntry FailReplay(string reason, string updatedBy, DateTimeOffset now)
{
if (Status != DeadLetterStatus.Replaying)
throw new InvalidOperationException($"Cannot fail replay from status {Status}.");
var newStatus = ReplayAttempts >= MaxReplayAttempts
? DeadLetterStatus.Exhausted
: DeadLetterStatus.Pending;
return this with
{
Status = newStatus,
FailureReason = reason,
UpdatedAt = now,
UpdatedBy = updatedBy
};
}
/// <summary>Manually resolves the entry without replay.</summary>
public DeadLetterEntry Resolve(string notes, string updatedBy, DateTimeOffset now)
{
if (IsTerminal)
throw new InvalidOperationException($"Cannot resolve entry in terminal status {Status}.");
return this with
{
Status = DeadLetterStatus.Resolved,
ResolvedAt = now,
ResolutionNotes = notes,
UpdatedAt = now,
UpdatedBy = updatedBy
};
}
/// <summary>Marks entry as expired for cleanup.</summary>
public DeadLetterEntry MarkExpired(DateTimeOffset now)
{
if (IsTerminal)
throw new InvalidOperationException($"Cannot expire entry in terminal status {Status}.");
return this with
{
Status = DeadLetterStatus.Expired,
UpdatedAt = now,
UpdatedBy = "system"
};
}
}
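// Illustrative sketch (not part of this commit): one failed replay attempt
// against the state machine above. The entry is assumed to be a Pending,
// retryable entry loaded from the dead-letter repository.
public static class DeadLetterLifecycleSketch
{
    public static DeadLetterEntry AttemptReplay(DeadLetterEntry entry, TimeProvider time)
    {
        var replaying = entry.StartReplay("operator@example.com", time.GetUtcNow());

        // A successful attempt would call CompleteReplay(newJobId, ...) instead.
        // A failed attempt returns the entry to Pending, or to Exhausted once
        // MaxReplayAttempts is reached.
        return replaying.FailReplay("Registry still unreachable", "operator@example.com", time.GetUtcNow());
    }
}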
/// <summary>
/// Dead-letter entry lifecycle states.
/// </summary>
public enum DeadLetterStatus
{
/// <summary>Entry awaiting operator action or replay.</summary>
Pending = 0,
/// <summary>Entry currently being replayed.</summary>
Replaying = 1,
/// <summary>Entry successfully replayed as a new job.</summary>
Replayed = 2,
/// <summary>Entry manually resolved without replay.</summary>
Resolved = 3,
/// <summary>Entry exhausted all replay attempts.</summary>
Exhausted = 4,
/// <summary>Entry expired and eligible for purge.</summary>
Expired = 5
}
/// <summary>
/// Error classification categories for dead-letter entries.
/// </summary>
public enum ErrorCategory
{
/// <summary>Unknown or unclassified error.</summary>
Unknown = 0,
/// <summary>Transient infrastructure error (network, timeout).</summary>
Transient = 1,
/// <summary>Resource not found (image, source, etc.).</summary>
NotFound = 2,
/// <summary>Authentication or authorization failure.</summary>
AuthFailure = 3,
/// <summary>Rate limiting or quota exceeded.</summary>
RateLimited = 4,
/// <summary>Invalid input or configuration.</summary>
ValidationError = 5,
/// <summary>Upstream service error (registry, advisory feed).</summary>
UpstreamError = 6,
/// <summary>Internal processing error (bug, corruption).</summary>
InternalError = 7,
/// <summary>Resource conflict (duplicate, version mismatch).</summary>
Conflict = 8,
/// <summary>Operation canceled by user or system.</summary>
Canceled = 9
}


@@ -0,0 +1,69 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents an operational incident triggered by threshold breaches.
/// Incidents are generated when failure rates exceed configured limits.
/// </summary>
public sealed record Incident(
/// <summary>Unique incident identifier.</summary>
Guid IncidentId,
/// <summary>Tenant affected by this incident.</summary>
string TenantId,
/// <summary>Incident type (e.g., "failure_rate", "quota_exhausted", "circuit_open").</summary>
string IncidentType,
/// <summary>Incident severity (e.g., "warning", "critical").</summary>
string Severity,
/// <summary>Affected job type (if applicable).</summary>
string? JobType,
/// <summary>Affected source (if applicable).</summary>
Guid? SourceId,
/// <summary>Human-readable incident title.</summary>
string Title,
/// <summary>Detailed incident description.</summary>
string Description,
/// <summary>Current incident status.</summary>
IncidentStatus Status,
/// <summary>When the incident was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the incident was acknowledged.</summary>
DateTimeOffset? AcknowledgedAt,
/// <summary>Actor who acknowledged the incident.</summary>
string? AcknowledgedBy,
/// <summary>When the incident was resolved.</summary>
DateTimeOffset? ResolvedAt,
/// <summary>Actor who resolved the incident.</summary>
string? ResolvedBy,
/// <summary>Resolution notes.</summary>
string? ResolutionNotes,
/// <summary>Optional metadata JSON blob.</summary>
string? Metadata);
/// <summary>
/// Incident lifecycle states.
/// </summary>
public enum IncidentStatus
{
/// <summary>Incident is open and unacknowledged.</summary>
Open = 0,
/// <summary>Incident acknowledged by operator.</summary>
Acknowledged = 1,
/// <summary>Incident resolved.</summary>
Resolved = 2
}


@@ -0,0 +1,81 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a unit of work to be executed by a worker.
/// Jobs are scheduled, leased to workers, and tracked through completion.
/// </summary>
public sealed record Job(
/// <summary>Unique job identifier.</summary>
Guid JobId,
/// <summary>Tenant owning this job.</summary>
string TenantId,
/// <summary>Optional project scope within tenant.</summary>
string? ProjectId,
/// <summary>Run this job belongs to (if any).</summary>
Guid? RunId,
/// <summary>Job type (e.g., "scan.image", "advisory.nvd", "export.sbom").</summary>
string JobType,
/// <summary>Current job status.</summary>
JobStatus Status,
/// <summary>Priority (higher = more urgent). Default 0.</summary>
int Priority,
/// <summary>Current attempt number (1-based).</summary>
int Attempt,
/// <summary>Maximum retry attempts.</summary>
int MaxAttempts,
/// <summary>SHA-256 digest of the payload for determinism verification.</summary>
string PayloadDigest,
/// <summary>Job payload JSON (inputs, parameters).</summary>
string Payload,
/// <summary>Idempotency key for deduplication.</summary>
string IdempotencyKey,
/// <summary>Correlation ID for distributed tracing.</summary>
string? CorrelationId,
/// <summary>Current lease ID (if leased).</summary>
Guid? LeaseId,
/// <summary>Worker holding the lease (if leased).</summary>
string? WorkerId,
/// <summary>Task runner ID executing the job (if applicable).</summary>
string? TaskRunnerId,
/// <summary>Lease expiration time.</summary>
DateTimeOffset? LeaseUntil,
/// <summary>When the job was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the job was scheduled (quota cleared).</summary>
DateTimeOffset? ScheduledAt,
/// <summary>When the job was leased to a worker.</summary>
DateTimeOffset? LeasedAt,
/// <summary>When the job completed (terminal state).</summary>
DateTimeOffset? CompletedAt,
/// <summary>Earliest time the job can be scheduled (for backoff).</summary>
DateTimeOffset? NotBefore,
/// <summary>Terminal status reason (failure message, cancel reason, etc.).</summary>
string? Reason,
/// <summary>ID of the original job if this is a replay.</summary>
Guid? ReplayOf,
/// <summary>Actor who created/submitted the job.</summary>
string CreatedBy);
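PayloadDigest and IdempotencyKey are both producer-supplied. A minimal sketch of how a producer might derive them, assuming the same lowercase-hex SHA-256 convention used by the ledger code later in this commit; the idempotency-key recipe itself is an assumption:

static string Sha256Hex(string value)
{
    var bytes = System.Text.Encoding.UTF8.GetBytes(value);
    var hash = System.Security.Cryptography.SHA256.HashData(bytes);
    return Convert.ToHexString(hash).ToLowerInvariant();
}

// payloadJson, tenantId, and jobType are assumed inputs.
var payloadDigest = Sha256Hex(payloadJson);
var idempotencyKey = Sha256Hex($"{tenantId}|{jobType}|{payloadDigest}");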

View File

@@ -0,0 +1,48 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents an immutable history entry for job state changes.
/// Provides audit trail for all job lifecycle transitions.
/// </summary>
public sealed record JobHistory(
/// <summary>Unique history entry identifier.</summary>
Guid HistoryId,
/// <summary>Tenant owning this entry.</summary>
string TenantId,
/// <summary>Job this history entry belongs to.</summary>
Guid JobId,
/// <summary>Sequence number within the job's history (1-based).</summary>
int SequenceNo,
/// <summary>Previous job status.</summary>
JobStatus? FromStatus,
/// <summary>New job status.</summary>
JobStatus ToStatus,
/// <summary>Attempt number at time of transition.</summary>
int Attempt,
/// <summary>Lease ID (if applicable).</summary>
Guid? LeaseId,
/// <summary>Worker ID (if applicable).</summary>
string? WorkerId,
/// <summary>Reason for the transition.</summary>
string? Reason,
/// <summary>When this transition occurred.</summary>
DateTimeOffset OccurredAt,
/// <summary>When this entry was recorded.</summary>
DateTimeOffset RecordedAt,
/// <summary>Actor who caused this transition.</summary>
string ActorId,
/// <summary>Actor type (system, operator, worker).</summary>
string ActorType);

View File

@@ -0,0 +1,30 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Job lifecycle states. Transitions follow the state machine:
/// Pending → Scheduled → Leased → (Succeeded | Failed | Canceled | TimedOut)
/// Failed jobs may transition to Pending via replay.
/// </summary>
public enum JobStatus
{
/// <summary>Job enqueued but not yet scheduled (e.g., quota exceeded).</summary>
Pending = 0,
/// <summary>Job scheduled and awaiting worker lease.</summary>
Scheduled = 1,
/// <summary>Job leased to a worker for execution.</summary>
Leased = 2,
/// <summary>Job completed successfully.</summary>
Succeeded = 3,
/// <summary>Job failed after exhausting retries.</summary>
Failed = 4,
/// <summary>Job canceled by operator or system.</summary>
Canceled = 5,
/// <summary>Job timed out (lease expired without completion).</summary>
TimedOut = 6
}
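A guard derived from the documented machine makes the legal edges explicit. This sketch encodes only the transitions listed in the summary above; real deployments may additionally allow cancellation from Pending or Scheduled:

static bool CanTransition(JobStatus from, JobStatus to) => (from, to) switch
{
    (JobStatus.Pending, JobStatus.Scheduled) => true,
    (JobStatus.Scheduled, JobStatus.Leased) => true,
    (JobStatus.Leased, JobStatus.Succeeded or JobStatus.Failed
                     or JobStatus.Canceled or JobStatus.TimedOut) => true,
    (JobStatus.Failed, JobStatus.Pending) => true, // replay path
    _ => false
};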

View File

@@ -0,0 +1,60 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents rate-limit and concurrency quotas for job scheduling.
/// Quotas are scoped to tenant and optionally job type.
/// </summary>
public sealed record Quota(
/// <summary>Unique quota identifier.</summary>
Guid QuotaId,
/// <summary>Tenant this quota applies to.</summary>
string TenantId,
/// <summary>Job type this quota applies to (null = all job types).</summary>
string? JobType,
/// <summary>Maximum concurrent active (leased) jobs.</summary>
int MaxActive,
/// <summary>Maximum jobs per hour (sliding window).</summary>
int MaxPerHour,
/// <summary>Burst capacity for token bucket.</summary>
int BurstCapacity,
/// <summary>Token refill rate (tokens per second).</summary>
double RefillRate,
/// <summary>Current available tokens.</summary>
double CurrentTokens,
/// <summary>Last time tokens were refilled.</summary>
DateTimeOffset LastRefillAt,
/// <summary>Current count of active (leased) jobs.</summary>
int CurrentActive,
/// <summary>Jobs scheduled in current hour window.</summary>
int CurrentHourCount,
/// <summary>Start of current hour window.</summary>
DateTimeOffset CurrentHourStart,
/// <summary>Whether this quota is currently paused (operator override).</summary>
bool Paused,
/// <summary>Operator-provided reason for pause.</summary>
string? PauseReason,
/// <summary>Ticket reference for quota change audit.</summary>
string? QuotaTicket,
/// <summary>When the quota was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the quota was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>Actor who last modified the quota.</summary>
string UpdatedBy);
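The token-bucket fields imply the standard refill arithmetic; a sketch of the refill step a scheduler would run before consuming a token (the actual refill implementation lives in the rate-limiting code further down):

// Tokens accrue at RefillRate per second and are capped at BurstCapacity.
static (double Tokens, DateTimeOffset RefilledAt) Refill(Quota quota, DateTimeOffset now)
{
    var elapsedSeconds = Math.Max(0, (now - quota.LastRefillAt).TotalSeconds);
    var tokens = Math.Min(quota.BurstCapacity,
        quota.CurrentTokens + elapsedSeconds * quota.RefillRate);
    return (tokens, now);
}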

View File

@@ -0,0 +1,78 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a run (batch/workflow execution) containing multiple jobs.
/// Runs group related jobs (e.g., scanning an image produces multiple analyzer jobs).
/// </summary>
public sealed record Run(
/// <summary>Unique run identifier.</summary>
Guid RunId,
/// <summary>Tenant owning this run.</summary>
string TenantId,
/// <summary>Optional project scope within tenant.</summary>
string? ProjectId,
/// <summary>Source that initiated this run.</summary>
Guid SourceId,
/// <summary>Run type (e.g., "scan", "advisory-sync", "export").</summary>
string RunType,
/// <summary>Current aggregate status of the run.</summary>
RunStatus Status,
/// <summary>Correlation ID for distributed tracing.</summary>
string? CorrelationId,
/// <summary>Total number of jobs in this run.</summary>
int TotalJobs,
/// <summary>Number of completed jobs (succeeded + failed + canceled).</summary>
int CompletedJobs,
/// <summary>Number of succeeded jobs.</summary>
int SucceededJobs,
/// <summary>Number of failed jobs.</summary>
int FailedJobs,
/// <summary>When the run was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the run started executing (first job leased).</summary>
DateTimeOffset? StartedAt,
/// <summary>When the run completed (all jobs terminal).</summary>
DateTimeOffset? CompletedAt,
/// <summary>Actor who initiated the run.</summary>
string CreatedBy,
/// <summary>Optional metadata JSON blob.</summary>
string? Metadata);
/// <summary>
/// Run lifecycle states.
/// </summary>
public enum RunStatus
{
/// <summary>Run created, jobs being enqueued.</summary>
Pending = 0,
/// <summary>Run is executing (at least one job leased).</summary>
Running = 1,
/// <summary>All jobs completed successfully.</summary>
Succeeded = 2,
/// <summary>Run completed with some failures.</summary>
PartiallySucceeded = 3,
/// <summary>All jobs failed.</summary>
Failed = 4,
/// <summary>Run canceled by operator.</summary>
Canceled = 5
}
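The counters support deriving the aggregate status. One plausible derivation is sketched below; note it is an assumption that Pending is detected via StartedAt, and Canceled cannot be inferred from counters alone (it is set explicitly by the operator path):

static RunStatus DeriveStatus(Run run)
{
    if (run.CompletedJobs < run.TotalJobs)
        return run.StartedAt is null ? RunStatus.Pending : RunStatus.Running;
    if (run.FailedJobs == 0) return RunStatus.Succeeded;
    return run.SucceededJobs == 0 ? RunStatus.Failed : RunStatus.PartiallySucceeded;
}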

View File

@@ -0,0 +1,341 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Immutable ledger entry for run execution records.
/// Provides a tamper-evident history of run outcomes with provenance to artifacts.
/// </summary>
public sealed record RunLedgerEntry(
/// <summary>Unique ledger entry identifier.</summary>
Guid LedgerId,
/// <summary>Tenant owning this entry.</summary>
string TenantId,
/// <summary>Run this entry records.</summary>
Guid RunId,
/// <summary>Source that initiated the run.</summary>
Guid SourceId,
/// <summary>Run type (scan, advisory-sync, export).</summary>
string RunType,
/// <summary>Final run status.</summary>
RunStatus FinalStatus,
/// <summary>Total jobs in the run.</summary>
int TotalJobs,
/// <summary>Successfully completed jobs.</summary>
int SucceededJobs,
/// <summary>Failed jobs.</summary>
int FailedJobs,
/// <summary>When the run was created.</summary>
DateTimeOffset RunCreatedAt,
/// <summary>When the run started executing.</summary>
DateTimeOffset? RunStartedAt,
/// <summary>When the run completed.</summary>
DateTimeOffset RunCompletedAt,
/// <summary>Total execution duration.</summary>
TimeSpan ExecutionDuration,
/// <summary>Actor who initiated the run.</summary>
string InitiatedBy,
/// <summary>SHA-256 digest of the run's input payload.</summary>
string InputDigest,
/// <summary>Aggregated SHA-256 digest of all outputs.</summary>
string OutputDigest,
/// <summary>JSON array of artifact references with their digests.</summary>
string ArtifactManifest,
/// <summary>Sequence number in the tenant's ledger.</summary>
long SequenceNumber,
/// <summary>SHA-256 hash of the previous ledger entry.</summary>
string? PreviousEntryHash,
/// <summary>SHA-256 hash of this entry's content.</summary>
string ContentHash,
/// <summary>When this ledger entry was created.</summary>
DateTimeOffset LedgerCreatedAt,
/// <summary>Correlation ID for tracing.</summary>
string? CorrelationId,
/// <summary>Optional metadata JSON.</summary>
string? Metadata)
{
/// <summary>
/// Creates a ledger entry from a completed run.
/// </summary>
public static RunLedgerEntry FromCompletedRun(
Run run,
IReadOnlyList<Artifact> artifacts,
string inputDigest,
long sequenceNumber,
string? previousEntryHash,
string? metadata = null)
{
if (run.CompletedAt is null)
{
throw new InvalidOperationException("Cannot create ledger entry from an incomplete run.");
}
var ledgerId = Guid.NewGuid();
var ledgerCreatedAt = DateTimeOffset.UtcNow;
// Build artifact manifest
var artifactManifest = BuildArtifactManifest(artifacts);
// Compute output digest from all artifact digests
var outputDigest = ComputeOutputDigest(artifacts);
// Compute execution duration
var startTime = run.StartedAt ?? run.CreatedAt;
var executionDuration = run.CompletedAt.Value - startTime;
// Compute content hash for tamper evidence
var contentToHash = $"{ledgerId}|{run.TenantId}|{run.RunId}|{run.SourceId}|{run.RunType}|{run.Status}|{run.TotalJobs}|{run.SucceededJobs}|{run.FailedJobs}|{run.CreatedAt:O}|{run.StartedAt:O}|{run.CompletedAt:O}|{inputDigest}|{outputDigest}|{sequenceNumber}|{previousEntryHash}|{ledgerCreatedAt:O}";
var contentHash = ComputeSha256(contentToHash);
return new RunLedgerEntry(
LedgerId: ledgerId,
TenantId: run.TenantId,
RunId: run.RunId,
SourceId: run.SourceId,
RunType: run.RunType,
FinalStatus: run.Status,
TotalJobs: run.TotalJobs,
SucceededJobs: run.SucceededJobs,
FailedJobs: run.FailedJobs,
RunCreatedAt: run.CreatedAt,
RunStartedAt: run.StartedAt,
RunCompletedAt: run.CompletedAt.Value,
ExecutionDuration: executionDuration,
InitiatedBy: run.CreatedBy,
InputDigest: inputDigest,
OutputDigest: outputDigest,
ArtifactManifest: artifactManifest,
SequenceNumber: sequenceNumber,
PreviousEntryHash: previousEntryHash,
ContentHash: contentHash,
LedgerCreatedAt: ledgerCreatedAt,
CorrelationId: run.CorrelationId,
Metadata: metadata);
}
/// <summary>
/// Verifies the integrity of this ledger entry.
/// </summary>
public bool VerifyIntegrity()
{
var contentToHash = $"{LedgerId}|{TenantId}|{RunId}|{SourceId}|{RunType}|{FinalStatus}|{TotalJobs}|{SucceededJobs}|{FailedJobs}|{RunCreatedAt:O}|{RunStartedAt:O}|{RunCompletedAt:O}|{InputDigest}|{OutputDigest}|{SequenceNumber}|{PreviousEntryHash}|{LedgerCreatedAt:O}";
var computed = ComputeSha256(contentToHash);
return string.Equals(ContentHash, computed, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Verifies the chain link to the previous entry.
/// </summary>
public bool VerifyChainLink(RunLedgerEntry? previousEntry)
{
if (previousEntry is null)
{
return PreviousEntryHash is null || SequenceNumber == 1;
}
return string.Equals(PreviousEntryHash, previousEntry.ContentHash, StringComparison.OrdinalIgnoreCase);
}
private static string BuildArtifactManifest(IReadOnlyList<Artifact> artifacts)
{
var entries = artifacts.Select(a => new
{
a.ArtifactId,
a.ArtifactType,
a.Uri,
a.Digest,
a.MimeType,
a.SizeBytes,
a.CreatedAt
});
return System.Text.Json.JsonSerializer.Serialize(entries);
}
private static string ComputeOutputDigest(IReadOnlyList<Artifact> artifacts)
{
if (artifacts.Count == 0)
{
return ComputeSha256("(no artifacts)");
}
// Sort by artifact ID for deterministic ordering
var sortedDigests = artifacts
.OrderBy(a => a.ArtifactId)
.Select(a => a.Digest)
.ToList();
var combined = string.Join("|", sortedDigests);
return ComputeSha256(combined);
}
private static string ComputeSha256(string content)
{
var bytes = System.Text.Encoding.UTF8.GetBytes(content);
var hash = System.Security.Cryptography.SHA256.HashData(bytes);
return Convert.ToHexString(hash).ToLowerInvariant();
}
}
/// <summary>
/// Represents a ledger export operation.
/// </summary>
public sealed record LedgerExport(
/// <summary>Unique export identifier.</summary>
Guid ExportId,
/// <summary>Tenant requesting the export.</summary>
string TenantId,
/// <summary>Export status.</summary>
LedgerExportStatus Status,
/// <summary>Export format (json, ndjson, csv).</summary>
string Format,
/// <summary>Start of the time range to export.</summary>
DateTimeOffset? StartTime,
/// <summary>End of the time range to export.</summary>
DateTimeOffset? EndTime,
/// <summary>Run types to include (null = all).</summary>
string? RunTypeFilter,
/// <summary>Source ID filter (null = all).</summary>
Guid? SourceIdFilter,
/// <summary>Number of entries exported.</summary>
int EntryCount,
/// <summary>URI where the export is stored.</summary>
string? OutputUri,
/// <summary>SHA-256 digest of the export file.</summary>
string? OutputDigest,
/// <summary>Size of the export in bytes.</summary>
long? OutputSizeBytes,
/// <summary>Actor who requested the export.</summary>
string RequestedBy,
/// <summary>When the export was requested.</summary>
DateTimeOffset RequestedAt,
/// <summary>When the export started processing.</summary>
DateTimeOffset? StartedAt,
/// <summary>When the export completed.</summary>
DateTimeOffset? CompletedAt,
/// <summary>Error message if export failed.</summary>
string? ErrorMessage)
{
/// <summary>
/// Creates a new pending export request.
/// </summary>
public static LedgerExport CreateRequest(
string tenantId,
string format,
string requestedBy,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
string? runTypeFilter = null,
Guid? sourceIdFilter = null)
{
if (string.IsNullOrWhiteSpace(format))
{
throw new ArgumentException("Format is required.", nameof(format));
}
var validFormats = new[] { "json", "ndjson", "csv" };
if (!validFormats.Contains(format.ToLowerInvariant()))
{
throw new ArgumentException($"Invalid format. Must be one of: {string.Join(", ", validFormats)}", nameof(format));
}
return new LedgerExport(
ExportId: Guid.NewGuid(),
TenantId: tenantId,
Status: LedgerExportStatus.Pending,
Format: format.ToLowerInvariant(),
StartTime: startTime,
EndTime: endTime,
RunTypeFilter: runTypeFilter,
SourceIdFilter: sourceIdFilter,
EntryCount: 0,
OutputUri: null,
OutputDigest: null,
OutputSizeBytes: null,
RequestedBy: requestedBy,
RequestedAt: DateTimeOffset.UtcNow,
StartedAt: null,
CompletedAt: null,
ErrorMessage: null);
}
/// <summary>
/// Marks the export as started.
/// </summary>
public LedgerExport Start() => this with
{
Status = LedgerExportStatus.Processing,
StartedAt = DateTimeOffset.UtcNow
};
/// <summary>
/// Marks the export as completed.
/// </summary>
public LedgerExport Complete(string outputUri, string outputDigest, long outputSizeBytes, int entryCount) => this with
{
Status = LedgerExportStatus.Completed,
OutputUri = outputUri,
OutputDigest = outputDigest,
OutputSizeBytes = outputSizeBytes,
EntryCount = entryCount,
CompletedAt = DateTimeOffset.UtcNow
};
/// <summary>
/// Marks the export as failed.
/// </summary>
public LedgerExport Fail(string errorMessage) => this with
{
Status = LedgerExportStatus.Failed,
ErrorMessage = errorMessage,
CompletedAt = DateTimeOffset.UtcNow
};
}
/// <summary>
/// Status of a ledger export operation.
/// </summary>
public enum LedgerExportStatus
{
Pending = 0,
Processing = 1,
Completed = 2,
Failed = 3,
Canceled = 4
}
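VerifyIntegrity and VerifyChainLink compose into a full audit walk over a tenant's ledger, assuming entries are returned ordered by SequenceNumber:

static bool VerifyChain(IReadOnlyList<RunLedgerEntry> entries)
{
    RunLedgerEntry? previous = null;
    foreach (var entry in entries)
    {
        if (!entry.VerifyIntegrity()) return false;         // content hash intact
        if (!entry.VerifyChainLink(previous)) return false; // linked to predecessor
        previous = entry;
    }
    return true;
}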

View File

@@ -0,0 +1,60 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a scheduled job trigger (cron-based or interval-based).
/// Schedules automatically create jobs at specified times.
/// </summary>
public sealed record Schedule(
/// <summary>Unique schedule identifier.</summary>
Guid ScheduleId,
/// <summary>Tenant owning this schedule.</summary>
string TenantId,
/// <summary>Optional project scope within tenant.</summary>
string? ProjectId,
/// <summary>Source that will be used for jobs.</summary>
Guid SourceId,
/// <summary>Human-readable schedule name.</summary>
string Name,
/// <summary>Job type to create.</summary>
string JobType,
/// <summary>Cron expression (6-field, with seconds), evaluated in the schedule's Timezone.</summary>
string CronExpression,
/// <summary>Timezone for cron evaluation (IANA, e.g., "UTC", "America/New_York").</summary>
string Timezone,
/// <summary>Whether the schedule is enabled.</summary>
bool Enabled,
/// <summary>Job payload template JSON.</summary>
string PayloadTemplate,
/// <summary>Job priority for scheduled jobs.</summary>
int Priority,
/// <summary>Maximum retry attempts for scheduled jobs.</summary>
int MaxAttempts,
/// <summary>Last time a job was triggered from this schedule.</summary>
DateTimeOffset? LastTriggeredAt,
/// <summary>Next scheduled trigger time.</summary>
DateTimeOffset? NextTriggerAt,
/// <summary>When the schedule was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the schedule was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>Actor who created the schedule.</summary>
string CreatedBy,
/// <summary>Actor who last modified the schedule.</summary>
string UpdatedBy);
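Nothing in this file computes NextTriggerAt. A sketch using the Cronos package follows; the choice of library is an assumption, since the commit does not show which cron implementation the scheduler uses:

using Cronos; // assumed dependency

static DateTimeOffset? NextTrigger(Schedule schedule, DateTimeOffset from)
{
    var expression = CronExpression.Parse(schedule.CronExpression, CronFormat.IncludeSeconds);
    var zone = TimeZoneInfo.FindSystemTimeZoneById(schedule.Timezone);
    return expression.GetNextOccurrence(from, zone);
}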

View File

@@ -0,0 +1,423 @@
using System.Text.Json;
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Signed manifest providing provenance chain from ledger entries to artifacts.
/// Enables verification of artifact authenticity and integrity.
/// </summary>
public sealed record SignedManifest(
/// <summary>Unique manifest identifier.</summary>
Guid ManifestId,
/// <summary>Manifest schema version.</summary>
string SchemaVersion,
/// <summary>Tenant owning this manifest.</summary>
string TenantId,
/// <summary>Type of provenance (run, export, attestation).</summary>
ProvenanceType ProvenanceType,
/// <summary>Subject of the provenance (run ID, export ID, etc.).</summary>
Guid SubjectId,
/// <summary>Provenance statements (JSON array).</summary>
string Statements,
/// <summary>Artifact references with digests (JSON array).</summary>
string Artifacts,
/// <summary>Materials (inputs) used to produce the artifacts (JSON array).</summary>
string Materials,
/// <summary>Build environment information (JSON object).</summary>
string? BuildInfo,
/// <summary>SHA-256 digest of the manifest payload (excluding signature).</summary>
string PayloadDigest,
/// <summary>Signature algorithm used.</summary>
string SignatureAlgorithm,
/// <summary>Base64-encoded signature.</summary>
string Signature,
/// <summary>Key ID used for signing.</summary>
string KeyId,
/// <summary>When the manifest was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>Expiration time of the manifest (if any).</summary>
DateTimeOffset? ExpiresAt,
/// <summary>Additional metadata (JSON object).</summary>
string? Metadata)
{
/// <summary>
/// Current schema version for manifests.
/// </summary>
public const string CurrentSchemaVersion = "1.0.0";
/// <summary>
/// Creates an unsigned manifest from a ledger entry.
/// The manifest must be signed separately using SigningService.
/// </summary>
public static SignedManifest CreateFromLedgerEntry(
RunLedgerEntry ledger,
string? buildInfo = null,
string? metadata = null)
{
var statements = CreateStatementsFromLedger(ledger);
var artifacts = ledger.ArtifactManifest;
var materials = CreateMaterialsFromLedger(ledger);
var payloadDigest = ComputePayloadDigest(
ledger.TenantId,
ProvenanceType.Run,
ledger.RunId,
statements,
artifacts,
materials);
return new SignedManifest(
ManifestId: Guid.NewGuid(),
SchemaVersion: CurrentSchemaVersion,
TenantId: ledger.TenantId,
ProvenanceType: ProvenanceType.Run,
SubjectId: ledger.RunId,
Statements: statements,
Artifacts: artifacts,
Materials: materials,
BuildInfo: buildInfo,
PayloadDigest: payloadDigest,
SignatureAlgorithm: "none",
Signature: string.Empty,
KeyId: string.Empty,
CreatedAt: DateTimeOffset.UtcNow,
ExpiresAt: null,
Metadata: metadata);
}
/// <summary>
/// Creates an unsigned manifest from a ledger export.
/// </summary>
public static SignedManifest CreateFromExport(
LedgerExport export,
IReadOnlyList<RunLedgerEntry> entries,
string? buildInfo = null,
string? metadata = null)
{
if (export.Status != LedgerExportStatus.Completed)
{
throw new InvalidOperationException("Cannot create manifest from incomplete export.");
}
var statements = CreateStatementsFromExport(export, entries);
var artifacts = CreateExportArtifacts(export);
var materials = CreateExportMaterials(entries);
var payloadDigest = ComputePayloadDigest(
export.TenantId,
ProvenanceType.Export,
export.ExportId,
statements,
artifacts,
materials);
return new SignedManifest(
ManifestId: Guid.NewGuid(),
SchemaVersion: CurrentSchemaVersion,
TenantId: export.TenantId,
ProvenanceType: ProvenanceType.Export,
SubjectId: export.ExportId,
Statements: statements,
Artifacts: artifacts,
Materials: materials,
BuildInfo: buildInfo,
PayloadDigest: payloadDigest,
SignatureAlgorithm: "none",
Signature: string.Empty,
KeyId: string.Empty,
CreatedAt: DateTimeOffset.UtcNow,
ExpiresAt: null,
Metadata: metadata);
}
/// <summary>
/// Signs the manifest with the provided signature.
/// </summary>
public SignedManifest Sign(string signatureAlgorithm, string signature, string keyId, DateTimeOffset? expiresAt = null)
{
if (string.IsNullOrWhiteSpace(signatureAlgorithm))
{
throw new ArgumentException("Signature algorithm is required.", nameof(signatureAlgorithm));
}
if (string.IsNullOrWhiteSpace(signature))
{
throw new ArgumentException("Signature is required.", nameof(signature));
}
if (string.IsNullOrWhiteSpace(keyId))
{
throw new ArgumentException("Key ID is required.", nameof(keyId));
}
return this with
{
SignatureAlgorithm = signatureAlgorithm,
Signature = signature,
KeyId = keyId,
ExpiresAt = expiresAt
};
}
/// <summary>
/// Checks if the manifest is signed.
/// </summary>
public bool IsSigned => !string.IsNullOrEmpty(Signature) && SignatureAlgorithm != "none";
/// <summary>
/// Checks if the manifest has expired.
/// </summary>
public bool IsExpired => ExpiresAt.HasValue && ExpiresAt.Value < DateTimeOffset.UtcNow;
/// <summary>
/// Verifies the payload digest integrity.
/// </summary>
public bool VerifyPayloadIntegrity()
{
var computed = ComputePayloadDigest(TenantId, ProvenanceType, SubjectId, Statements, Artifacts, Materials);
return string.Equals(PayloadDigest, computed, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Parses the artifact manifest into typed objects.
/// </summary>
public IReadOnlyList<ArtifactReference> GetArtifactReferences()
{
if (string.IsNullOrEmpty(Artifacts) || Artifacts == "[]")
{
return Array.Empty<ArtifactReference>();
}
return JsonSerializer.Deserialize<List<ArtifactReference>>(Artifacts) ?? [];
}
/// <summary>
/// Parses the material manifest into typed objects.
/// </summary>
public IReadOnlyList<MaterialReference> GetMaterialReferences()
{
if (string.IsNullOrEmpty(Materials) || Materials == "[]")
{
return Array.Empty<MaterialReference>();
}
return JsonSerializer.Deserialize<List<MaterialReference>>(Materials) ?? [];
}
/// <summary>
/// Parses the statements into typed objects.
/// </summary>
public IReadOnlyList<ProvenanceStatement> GetStatements()
{
if (string.IsNullOrEmpty(Statements) || Statements == "[]")
{
return Array.Empty<ProvenanceStatement>();
}
return JsonSerializer.Deserialize<List<ProvenanceStatement>>(Statements) ?? [];
}
private static string CreateStatementsFromLedger(RunLedgerEntry ledger)
{
var statements = new List<ProvenanceStatement>
{
new(
StatementType: "run_completed",
Subject: $"run:{ledger.RunId}",
Predicate: "produced",
Object: $"outputs:{ledger.OutputDigest}",
Timestamp: ledger.RunCompletedAt,
Metadata: JsonSerializer.Serialize(new
{
ledger.RunType,
ledger.FinalStatus,
ledger.TotalJobs,
ledger.SucceededJobs,
ledger.FailedJobs,
ledger.ExecutionDuration
})),
new(
StatementType: "chain_link",
Subject: $"ledger:{ledger.LedgerId}",
Predicate: "follows",
Object: ledger.PreviousEntryHash ?? "(genesis)",
Timestamp: ledger.LedgerCreatedAt,
Metadata: JsonSerializer.Serialize(new
{
ledger.SequenceNumber,
ledger.ContentHash
}))
};
return JsonSerializer.Serialize(statements);
}
private static string CreateMaterialsFromLedger(RunLedgerEntry ledger)
{
var materials = new List<MaterialReference>
{
new(
Uri: $"input:{ledger.RunId}",
Digest: ledger.InputDigest,
MediaType: "application/json",
Name: "run_input")
};
return JsonSerializer.Serialize(materials);
}
private static string CreateStatementsFromExport(LedgerExport export, IReadOnlyList<RunLedgerEntry> entries)
{
var statements = new List<ProvenanceStatement>
{
new(
StatementType: "export_completed",
Subject: $"export:{export.ExportId}",
Predicate: "contains",
Object: $"entries:{entries.Count}",
Timestamp: export.CompletedAt ?? DateTimeOffset.UtcNow,
Metadata: JsonSerializer.Serialize(new
{
export.Format,
export.EntryCount,
export.StartTime,
export.EndTime,
export.RunTypeFilter,
export.SourceIdFilter
}))
};
// Add chain integrity statement
if (entries.Count > 0)
{
var first = entries.MinBy(e => e.SequenceNumber);
var last = entries.MaxBy(e => e.SequenceNumber);
if (first is not null && last is not null)
{
statements.Add(new ProvenanceStatement(
StatementType: "chain_range",
Subject: $"export:{export.ExportId}",
Predicate: "covers",
Object: $"sequence:{first.SequenceNumber}-{last.SequenceNumber}",
Timestamp: export.CompletedAt ?? DateTimeOffset.UtcNow,
Metadata: JsonSerializer.Serialize(new
{
FirstEntryHash = first.ContentHash,
LastEntryHash = last.ContentHash
})));
}
}
return JsonSerializer.Serialize(statements);
}
private static string CreateExportArtifacts(LedgerExport export)
{
var artifacts = new List<ArtifactReference>
{
new(
ArtifactId: export.ExportId,
ArtifactType: "ledger_export",
Uri: export.OutputUri ?? string.Empty,
Digest: export.OutputDigest ?? string.Empty,
MediaType: GetMediaType(export.Format),
SizeBytes: export.OutputSizeBytes ?? 0)
};
return JsonSerializer.Serialize(artifacts);
}
private static string CreateExportMaterials(IReadOnlyList<RunLedgerEntry> entries)
{
var materials = entries.Select(e => new MaterialReference(
Uri: $"ledger:{e.LedgerId}",
Digest: e.ContentHash,
MediaType: "application/json",
Name: $"run_{e.RunId}")).ToList();
return JsonSerializer.Serialize(materials);
}
private static string GetMediaType(string format) => format.ToLowerInvariant() switch
{
"json" => "application/json",
"ndjson" => "application/x-ndjson",
"csv" => "text/csv",
_ => "application/octet-stream"
};
private static string ComputePayloadDigest(
string tenantId,
ProvenanceType provenanceType,
Guid subjectId,
string statements,
string artifacts,
string materials)
{
var payload = $"{tenantId}|{provenanceType}|{subjectId}|{statements}|{artifacts}|{materials}";
var bytes = System.Text.Encoding.UTF8.GetBytes(payload);
var hash = System.Security.Cryptography.SHA256.HashData(bytes);
return Convert.ToHexString(hash).ToLowerInvariant();
}
}
/// <summary>
/// Types of provenance tracked by manifests.
/// </summary>
public enum ProvenanceType
{
/// <summary>Provenance for a completed run.</summary>
Run = 0,
/// <summary>Provenance for a ledger export.</summary>
Export = 1,
/// <summary>Provenance for an attestation.</summary>
Attestation = 2
}
/// <summary>
/// Reference to an artifact in a manifest.
/// </summary>
public sealed record ArtifactReference(
Guid ArtifactId,
string ArtifactType,
string Uri,
string Digest,
string MediaType,
long SizeBytes);
/// <summary>
/// Reference to a material (input) in a manifest.
/// </summary>
public sealed record MaterialReference(
string Uri,
string Digest,
string MediaType,
string Name);
/// <summary>
/// A provenance statement in a manifest.
/// </summary>
public sealed record ProvenanceStatement(
string StatementType,
string Subject,
string Predicate,
string Object,
DateTimeOffset Timestamp,
string? Metadata);
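The SigningService referenced above is not part of this commit. A minimal sketch of the intended sign-then-verify flow using BCL ECDSA; the key handling and key ID are illustrative only:

using System.Security.Cryptography;
using System.Text;

var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry); // ledgerEntry assumed
using var key = ECDsa.Create(ECCurve.NamedCurves.nistP256);       // illustrative key
var payload = Encoding.UTF8.GetBytes(manifest.PayloadDigest);
var signature = Convert.ToBase64String(key.SignData(payload, HashAlgorithmName.SHA256));
var signed = manifest.Sign("ES256", signature, keyId: "example-key-1");

// Before trusting: confirm the payload digest still matches and the manifest is live.
var trusted = signed.IsSigned && !signed.IsExpired && signed.VerifyPayloadIntegrity();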

View File

@@ -0,0 +1,567 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Service Level Objective type.
/// </summary>
public enum SloType
{
/// <summary>Availability SLO (percentage of successful requests).</summary>
Availability,
/// <summary>Latency SLO (percentile-based response time).</summary>
Latency,
/// <summary>Throughput SLO (minimum jobs processed per period).</summary>
Throughput
}
/// <summary>
/// Time window for SLO computation.
/// </summary>
public enum SloWindow
{
/// <summary>Rolling 1 hour window.</summary>
OneHour,
/// <summary>Rolling 1 day window.</summary>
OneDay,
/// <summary>Rolling 7 day window.</summary>
SevenDays,
/// <summary>Rolling 30 day window.</summary>
ThirtyDays
}
/// <summary>
/// Alert severity for SLO violations.
/// </summary>
public enum AlertSeverity
{
/// <summary>Informational - SLO approaching threshold.</summary>
Info,
/// <summary>Warning - SLO at risk.</summary>
Warning,
/// <summary>Critical - SLO likely to be breached.</summary>
Critical,
/// <summary>Emergency - SLO breached.</summary>
Emergency
}
/// <summary>
/// Service Level Objective definition.
/// </summary>
public sealed record Slo(
/// <summary>Unique SLO identifier.</summary>
Guid SloId,
/// <summary>Tenant this SLO belongs to.</summary>
string TenantId,
/// <summary>Human-readable name.</summary>
string Name,
/// <summary>Optional description.</summary>
string? Description,
/// <summary>Type of SLO.</summary>
SloType Type,
/// <summary>Job type this SLO applies to (null = all job types).</summary>
string? JobType,
/// <summary>Source ID this SLO applies to (null = all sources).</summary>
Guid? SourceId,
/// <summary>Target objective (e.g., 0.999 for 99.9% availability).</summary>
double Target,
/// <summary>Time window for SLO evaluation.</summary>
SloWindow Window,
/// <summary>For latency SLOs: the percentile (e.g., 0.95 for P95).</summary>
double? LatencyPercentile,
/// <summary>For latency SLOs: the target latency in seconds.</summary>
double? LatencyTargetSeconds,
/// <summary>For throughput SLOs: minimum jobs per period.</summary>
int? ThroughputMinimum,
/// <summary>Whether this SLO is actively monitored.</summary>
bool Enabled,
/// <summary>When the SLO was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the SLO was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>Actor who created the SLO.</summary>
string CreatedBy,
/// <summary>Actor who last modified the SLO.</summary>
string UpdatedBy)
{
/// <summary>Calculates the error budget as a decimal (1 - target).</summary>
public double ErrorBudget => 1.0 - Target;
/// <summary>Creates a new availability SLO.</summary>
public static Slo CreateAvailability(
string tenantId,
string name,
double target,
SloWindow window,
string createdBy,
string? description = null,
string? jobType = null,
Guid? sourceId = null)
{
ValidateTarget(target);
var now = DateTimeOffset.UtcNow;
return new Slo(
SloId: Guid.NewGuid(),
TenantId: tenantId,
Name: name,
Description: description,
Type: SloType.Availability,
JobType: jobType,
SourceId: sourceId,
Target: target,
Window: window,
LatencyPercentile: null,
LatencyTargetSeconds: null,
ThroughputMinimum: null,
Enabled: true,
CreatedAt: now,
UpdatedAt: now,
CreatedBy: createdBy,
UpdatedBy: createdBy);
}
/// <summary>Creates a new latency SLO.</summary>
public static Slo CreateLatency(
string tenantId,
string name,
double percentile,
double targetSeconds,
double target,
SloWindow window,
string createdBy,
string? description = null,
string? jobType = null,
Guid? sourceId = null)
{
ValidateTarget(target);
if (percentile < 0 || percentile > 1)
throw new ArgumentOutOfRangeException(nameof(percentile), "Percentile must be between 0 and 1");
if (targetSeconds <= 0)
throw new ArgumentOutOfRangeException(nameof(targetSeconds), "Target latency must be positive");
var now = DateTimeOffset.UtcNow;
return new Slo(
SloId: Guid.NewGuid(),
TenantId: tenantId,
Name: name,
Description: description,
Type: SloType.Latency,
JobType: jobType,
SourceId: sourceId,
Target: target,
Window: window,
LatencyPercentile: percentile,
LatencyTargetSeconds: targetSeconds,
ThroughputMinimum: null,
Enabled: true,
CreatedAt: now,
UpdatedAt: now,
CreatedBy: createdBy,
UpdatedBy: createdBy);
}
/// <summary>Creates a new throughput SLO.</summary>
public static Slo CreateThroughput(
string tenantId,
string name,
int minimum,
double target,
SloWindow window,
string createdBy,
string? description = null,
string? jobType = null,
Guid? sourceId = null)
{
ValidateTarget(target);
if (minimum <= 0)
throw new ArgumentOutOfRangeException(nameof(minimum), "Throughput minimum must be positive");
var now = DateTimeOffset.UtcNow;
return new Slo(
SloId: Guid.NewGuid(),
TenantId: tenantId,
Name: name,
Description: description,
Type: SloType.Throughput,
JobType: jobType,
SourceId: sourceId,
Target: target,
Window: window,
LatencyPercentile: null,
LatencyTargetSeconds: null,
ThroughputMinimum: minimum,
Enabled: true,
CreatedAt: now,
UpdatedAt: now,
CreatedBy: createdBy,
UpdatedBy: createdBy);
}
/// <summary>Updates the SLO with new values.</summary>
public Slo Update(
string? name = null,
string? description = null,
double? target = null,
bool? enabled = null,
string? updatedBy = null)
{
if (target.HasValue)
ValidateTarget(target.Value);
return this with
{
Name = name ?? Name,
Description = description ?? Description,
Target = target ?? Target,
Enabled = enabled ?? Enabled,
UpdatedAt = DateTimeOffset.UtcNow,
UpdatedBy = updatedBy ?? UpdatedBy
};
}
/// <summary>Disables the SLO.</summary>
public Slo Disable(string updatedBy) =>
this with
{
Enabled = false,
UpdatedAt = DateTimeOffset.UtcNow,
UpdatedBy = updatedBy
};
/// <summary>Enables the SLO.</summary>
public Slo Enable(string updatedBy) =>
this with
{
Enabled = true,
UpdatedAt = DateTimeOffset.UtcNow,
UpdatedBy = updatedBy
};
/// <summary>Gets the window duration as a TimeSpan.</summary>
public TimeSpan GetWindowDuration() => Window switch
{
SloWindow.OneHour => TimeSpan.FromHours(1),
SloWindow.OneDay => TimeSpan.FromDays(1),
SloWindow.SevenDays => TimeSpan.FromDays(7),
SloWindow.ThirtyDays => TimeSpan.FromDays(30),
_ => throw new InvalidOperationException($"Unknown window: {Window}")
};
private static void ValidateTarget(double target)
{
if (target <= 0 || target > 1)
throw new ArgumentOutOfRangeException(nameof(target), "Target must be between 0 (exclusive) and 1 (inclusive)");
}
}
/// <summary>
/// Current state of an SLO including burn rate and budget consumption.
/// </summary>
public sealed record SloState(
/// <summary>The SLO this state belongs to.</summary>
Guid SloId,
/// <summary>Tenant this state belongs to.</summary>
string TenantId,
/// <summary>Current SLI value (actual performance).</summary>
double CurrentSli,
/// <summary>Total events/requests in the window.</summary>
long TotalEvents,
/// <summary>Good events (successful) in the window.</summary>
long GoodEvents,
/// <summary>Bad events (failed) in the window.</summary>
long BadEvents,
/// <summary>Error budget consumed (0-1 where 1 = fully consumed).</summary>
double BudgetConsumed,
/// <summary>Error budget remaining (0-1 where 1 = fully available).</summary>
double BudgetRemaining,
/// <summary>Current burn rate (1.0 = consuming budget at sustainable rate).</summary>
double BurnRate,
/// <summary>Projected time until budget exhaustion (null if not burning).</summary>
TimeSpan? TimeToExhaustion,
/// <summary>Whether the SLO is currently met.</summary>
bool IsMet,
/// <summary>Current alert severity based on budget consumption.</summary>
AlertSeverity AlertSeverity,
/// <summary>When this state was computed.</summary>
DateTimeOffset ComputedAt,
/// <summary>Start of the evaluation window.</summary>
DateTimeOffset WindowStart,
/// <summary>End of the evaluation window.</summary>
DateTimeOffset WindowEnd)
{
/// <summary>Creates a state indicating no data is available.</summary>
public static SloState NoData(Guid sloId, string tenantId, DateTimeOffset now, SloWindow window)
{
var windowDuration = GetWindowDuration(window);
return new SloState(
SloId: sloId,
TenantId: tenantId,
CurrentSli: 1.0, // Assume good when no data
TotalEvents: 0,
GoodEvents: 0,
BadEvents: 0,
BudgetConsumed: 0,
BudgetRemaining: 1.0,
BurnRate: 0,
TimeToExhaustion: null,
IsMet: true,
AlertSeverity: AlertSeverity.Info,
ComputedAt: now,
WindowStart: now - windowDuration,
WindowEnd: now);
}
private static TimeSpan GetWindowDuration(SloWindow window) => window switch
{
SloWindow.OneHour => TimeSpan.FromHours(1),
SloWindow.OneDay => TimeSpan.FromDays(1),
SloWindow.SevenDays => TimeSpan.FromDays(7),
SloWindow.ThirtyDays => TimeSpan.FromDays(30),
_ => TimeSpan.FromDays(1)
};
}
/// <summary>
/// Alert budget threshold configuration.
/// </summary>
public sealed record AlertBudgetThreshold(
/// <summary>Unique threshold identifier.</summary>
Guid ThresholdId,
/// <summary>SLO this threshold applies to.</summary>
Guid SloId,
/// <summary>Tenant this threshold belongs to.</summary>
string TenantId,
/// <summary>Budget consumed percentage that triggers this alert (0-1).</summary>
double BudgetConsumedThreshold,
/// <summary>Burn rate threshold that triggers this alert.</summary>
double? BurnRateThreshold,
/// <summary>Severity of the alert.</summary>
AlertSeverity Severity,
/// <summary>Whether this threshold is enabled.</summary>
bool Enabled,
/// <summary>Notification channel for this alert.</summary>
string? NotificationChannel,
/// <summary>Notification endpoint for this alert.</summary>
string? NotificationEndpoint,
/// <summary>Cooldown period between alerts.</summary>
TimeSpan Cooldown,
/// <summary>When an alert was last triggered.</summary>
DateTimeOffset? LastTriggeredAt,
/// <summary>When the threshold was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the threshold was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>Actor who created the threshold.</summary>
string CreatedBy,
/// <summary>Actor who last modified the threshold.</summary>
string UpdatedBy)
{
/// <summary>Creates a new alert threshold.</summary>
public static AlertBudgetThreshold Create(
Guid sloId,
string tenantId,
double budgetConsumedThreshold,
AlertSeverity severity,
string createdBy,
double? burnRateThreshold = null,
string? notificationChannel = null,
string? notificationEndpoint = null,
TimeSpan? cooldown = null)
{
if (budgetConsumedThreshold < 0 || budgetConsumedThreshold > 1)
throw new ArgumentOutOfRangeException(nameof(budgetConsumedThreshold), "Threshold must be between 0 and 1");
var now = DateTimeOffset.UtcNow;
return new AlertBudgetThreshold(
ThresholdId: Guid.NewGuid(),
SloId: sloId,
TenantId: tenantId,
BudgetConsumedThreshold: budgetConsumedThreshold,
BurnRateThreshold: burnRateThreshold,
Severity: severity,
Enabled: true,
NotificationChannel: notificationChannel,
NotificationEndpoint: notificationEndpoint,
Cooldown: cooldown ?? TimeSpan.FromHours(1),
LastTriggeredAt: null,
CreatedAt: now,
UpdatedAt: now,
CreatedBy: createdBy,
UpdatedBy: createdBy);
}
/// <summary>Checks if this threshold should trigger based on current state.</summary>
public bool ShouldTrigger(SloState state, DateTimeOffset now)
{
if (!Enabled) return false;
// Check cooldown
if (LastTriggeredAt.HasValue && (now - LastTriggeredAt.Value) < Cooldown)
return false;
// Check budget consumed threshold
if (state.BudgetConsumed >= BudgetConsumedThreshold)
return true;
// Check burn rate threshold if set
if (BurnRateThreshold.HasValue && state.BurnRate >= BurnRateThreshold.Value)
return true;
return false;
}
/// <summary>Records that this threshold was triggered.</summary>
public AlertBudgetThreshold RecordTrigger(DateTimeOffset now) =>
this with
{
LastTriggeredAt = now,
UpdatedAt = now
};
}
/// <summary>
/// SLO alert event.
/// </summary>
public sealed record SloAlert(
/// <summary>Unique alert identifier.</summary>
Guid AlertId,
/// <summary>SLO this alert relates to.</summary>
Guid SloId,
/// <summary>Threshold that triggered this alert.</summary>
Guid ThresholdId,
/// <summary>Tenant this alert belongs to.</summary>
string TenantId,
/// <summary>Severity of the alert.</summary>
AlertSeverity Severity,
/// <summary>Alert message.</summary>
string Message,
/// <summary>Budget consumed at time of alert.</summary>
double BudgetConsumed,
/// <summary>Burn rate at time of alert.</summary>
double BurnRate,
/// <summary>Current SLI value at time of alert.</summary>
double CurrentSli,
/// <summary>When the alert was triggered.</summary>
DateTimeOffset TriggeredAt,
/// <summary>When the alert was acknowledged (null if not acknowledged).</summary>
DateTimeOffset? AcknowledgedAt,
/// <summary>Who acknowledged the alert.</summary>
string? AcknowledgedBy,
/// <summary>When the alert was resolved (null if not resolved).</summary>
DateTimeOffset? ResolvedAt,
/// <summary>How the alert was resolved.</summary>
string? ResolutionNotes)
{
/// <summary>Creates a new alert from an SLO state and threshold.</summary>
public static SloAlert Create(
Slo slo,
SloState state,
AlertBudgetThreshold threshold)
{
var message = threshold.BurnRateThreshold.HasValue && state.BurnRate >= threshold.BurnRateThreshold.Value
? $"SLO '{slo.Name}' burn rate {state.BurnRate:F2}x exceeds threshold {threshold.BurnRateThreshold.Value:F2}x"
: $"SLO '{slo.Name}' error budget {state.BudgetConsumed:P1} consumed exceeds threshold {threshold.BudgetConsumedThreshold:P1}";
return new SloAlert(
AlertId: Guid.NewGuid(),
SloId: slo.SloId,
ThresholdId: threshold.ThresholdId,
TenantId: slo.TenantId,
Severity: threshold.Severity,
Message: message,
BudgetConsumed: state.BudgetConsumed,
BurnRate: state.BurnRate,
CurrentSli: state.CurrentSli,
TriggeredAt: state.ComputedAt,
AcknowledgedAt: null,
AcknowledgedBy: null,
ResolvedAt: null,
ResolutionNotes: null);
}
/// <summary>Acknowledges the alert.</summary>
public SloAlert Acknowledge(string acknowledgedBy, DateTimeOffset now) =>
this with
{
AcknowledgedAt = now,
AcknowledgedBy = acknowledgedBy
};
/// <summary>Resolves the alert.</summary>
public SloAlert Resolve(string notes, DateTimeOffset now) =>
this with
{
ResolvedAt = now,
ResolutionNotes = notes
};
/// <summary>Whether this alert has been acknowledged.</summary>
public bool IsAcknowledged => AcknowledgedAt.HasValue;
/// <summary>Whether this alert has been resolved.</summary>
public bool IsResolved => ResolvedAt.HasValue;
}
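The SloState fields follow standard error-budget arithmetic. A sketch for an availability SLO evaluated over a full trailing window; the severity cut-offs here are assumptions, since production alerting is driven by AlertBudgetThreshold rather than fixed constants:

static SloState ComputeAvailability(Slo slo, long good, long bad, DateTimeOffset now)
{
    var total = good + bad;
    if (total == 0) return SloState.NoData(slo.SloId, slo.TenantId, now, slo.Window);
    var sli = (double)good / total;
    var burnRate = (1.0 - sli) / slo.ErrorBudget;  // 1.0 = exactly sustainable
    var consumed = Math.Min(1.0, burnRate);        // budget share over a full window
    var remaining = 1.0 - consumed;
    var window = slo.GetWindowDuration();
    TimeSpan? toExhaustion = burnRate > 0
        ? TimeSpan.FromTicks((long)(window.Ticks * remaining / burnRate))
        : null;
    return new SloState(slo.SloId, slo.TenantId, sli, total, good, bad,
        consumed, remaining, burnRate, toExhaustion,
        IsMet: sli >= slo.Target,
        AlertSeverity: consumed >= 1.0 ? AlertSeverity.Emergency  // assumed cut-offs
            : consumed >= 0.9 ? AlertSeverity.Critical
            : consumed >= 0.5 ? AlertSeverity.Warning
            : AlertSeverity.Info,
        ComputedAt: now, WindowStart: now - window, WindowEnd: now);
}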

View File

@@ -0,0 +1,42 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a job source (producer) that submits jobs to the orchestrator.
/// Examples: Concelier, Excititor, Scheduler, Export Center, Policy Engine.
/// </summary>
public sealed record Source(
/// <summary>Unique source identifier.</summary>
Guid SourceId,
/// <summary>Tenant owning this source.</summary>
string TenantId,
/// <summary>Human-readable source name (e.g., "concelier-nvd").</summary>
string Name,
/// <summary>Source type/category (e.g., "advisory-ingest", "scanner", "export").</summary>
string SourceType,
/// <summary>Whether the source is currently enabled.</summary>
bool Enabled,
/// <summary>Whether the source is paused (throttled by operator).</summary>
bool Paused,
/// <summary>Operator-provided reason for pause (if paused).</summary>
string? PauseReason,
/// <summary>Ticket reference for pause audit trail.</summary>
string? PauseTicket,
/// <summary>Optional configuration JSON blob.</summary>
string? Configuration,
/// <summary>When the source was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the source was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>Actor who last modified the source.</summary>
string UpdatedBy);

View File

@@ -0,0 +1,60 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents a dynamic rate-limit override (throttle) for a source or job type.
/// Throttles are temporary pause/slow-down mechanisms, often in response to upstream pressure.
/// </summary>
public sealed record Throttle(
/// <summary>Unique throttle identifier.</summary>
Guid ThrottleId,
/// <summary>Tenant this throttle applies to.</summary>
string TenantId,
/// <summary>Source to throttle (null if job-type scoped).</summary>
Guid? SourceId,
/// <summary>Job type to throttle (null if source-scoped).</summary>
string? JobType,
/// <summary>Whether this throttle is currently active.</summary>
bool Active,
/// <summary>Reason for the throttle (e.g., "429 from upstream", "Manual pause").</summary>
string Reason,
/// <summary>Optional ticket reference for audit.</summary>
string? Ticket,
/// <summary>When the throttle was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the throttle expires (null = indefinite).</summary>
DateTimeOffset? ExpiresAt,
/// <summary>Actor who created the throttle.</summary>
string CreatedBy);
/// <summary>
/// Reason categories for throttle creation.
/// </summary>
public static class ThrottleReasons
{
/// <summary>Upstream returned 429 Too Many Requests.</summary>
public const string UpstreamRateLimited = "upstream_429";
/// <summary>Upstream returned 503 Service Unavailable.</summary>
public const string UpstreamUnavailable = "upstream_503";
/// <summary>Upstream returned 5xx error repeatedly.</summary>
public const string UpstreamErrors = "upstream_5xx";
/// <summary>Manual operator intervention.</summary>
public const string ManualPause = "manual_pause";
/// <summary>Circuit breaker triggered.</summary>
public const string CircuitBreaker = "circuit_breaker";
/// <summary>Quota exhausted.</summary>
public const string QuotaExhausted = "quota_exhausted";
}
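Putting the two together, a typical reaction to an upstream 429 looks like the following, honoring Retry-After when supplied; the five-minute fallback is an assumption:

// tenantId, sourceId, and retryAfter (parsed from the Retry-After header) are assumed inputs.
var throttle = new Throttle(
    ThrottleId: Guid.NewGuid(),
    TenantId: tenantId,
    SourceId: sourceId,
    JobType: null, // source-scoped
    Active: true,
    Reason: ThrottleReasons.UpstreamRateLimited,
    Ticket: null,
    CreatedAt: DateTimeOffset.UtcNow,
    ExpiresAt: DateTimeOffset.UtcNow + (retryAfter ?? TimeSpan.FromMinutes(5)),
    CreatedBy: "system");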

View File

@@ -0,0 +1,162 @@
namespace StellaOps.Orchestrator.Core.Domain;
/// <summary>
/// Represents an event-time watermark for tracking processing progress.
/// Watermarks are scoped by source, job type, or custom key.
/// </summary>
public sealed record Watermark(
/// <summary>Unique watermark identifier.</summary>
Guid WatermarkId,
/// <summary>Tenant this watermark belongs to.</summary>
string TenantId,
/// <summary>Source this watermark tracks (null if job-type scoped).</summary>
Guid? SourceId,
/// <summary>Job type this watermark tracks (null if source-scoped).</summary>
string? JobType,
/// <summary>Normalized scope key for uniqueness.</summary>
string ScopeKey,
/// <summary>Latest processed event time (high watermark).</summary>
DateTimeOffset HighWatermark,
/// <summary>Earliest event time in current window (low watermark for windowing).</summary>
DateTimeOffset? LowWatermark,
/// <summary>Monotonic sequence number for ordering.</summary>
long SequenceNumber,
/// <summary>Total events processed through this watermark.</summary>
long ProcessedCount,
/// <summary>SHA-256 hash of last processed batch for integrity verification.</summary>
string? LastBatchHash,
/// <summary>When the watermark was created.</summary>
DateTimeOffset CreatedAt,
/// <summary>When the watermark was last updated.</summary>
DateTimeOffset UpdatedAt,
/// <summary>Actor who last modified the watermark.</summary>
string UpdatedBy)
{
/// <summary>
/// Creates a scope key for source-scoped watermarks.
/// </summary>
public static string CreateScopeKey(Guid sourceId) =>
$"source:{sourceId:N}";
/// <summary>
/// Creates a scope key for job-type-scoped watermarks.
/// </summary>
public static string CreateScopeKey(string jobType) =>
$"job_type:{jobType.ToLowerInvariant()}";
/// <summary>
/// Creates a scope key for source+job-type scoped watermarks.
/// </summary>
public static string CreateScopeKey(Guid sourceId, string jobType) =>
$"source:{sourceId:N}:job_type:{jobType.ToLowerInvariant()}";
/// <summary>
/// Creates a new watermark with initial values.
/// </summary>
public static Watermark Create(
string tenantId,
Guid? sourceId,
string? jobType,
DateTimeOffset highWatermark,
string createdBy)
{
var scopeKey = (sourceId, jobType) switch
{
(Guid s, string j) when !string.IsNullOrEmpty(j) => CreateScopeKey(s, j),
(Guid s, _) => CreateScopeKey(s),
(_, string j) when !string.IsNullOrEmpty(j) => CreateScopeKey(j),
_ => throw new ArgumentException("Either sourceId or jobType must be specified.")
};
var now = DateTimeOffset.UtcNow;
return new Watermark(
WatermarkId: Guid.NewGuid(),
TenantId: tenantId,
SourceId: sourceId,
JobType: jobType,
ScopeKey: scopeKey,
HighWatermark: highWatermark,
LowWatermark: null,
SequenceNumber: 0,
ProcessedCount: 0,
LastBatchHash: null,
CreatedAt: now,
UpdatedAt: now,
UpdatedBy: createdBy);
}
/// <summary>
/// Advances the watermark after successful batch processing.
/// </summary>
public Watermark Advance(
DateTimeOffset newHighWatermark,
long eventsProcessed,
string? batchHash,
string updatedBy)
{
if (newHighWatermark < HighWatermark)
throw new ArgumentException("New high watermark cannot be before current high watermark.", nameof(newHighWatermark));
return this with
{
HighWatermark = newHighWatermark,
SequenceNumber = SequenceNumber + 1,
ProcessedCount = ProcessedCount + eventsProcessed,
LastBatchHash = batchHash,
UpdatedAt = DateTimeOffset.UtcNow,
UpdatedBy = updatedBy
};
}
/// <summary>
/// Sets the event-time window bounds.
/// </summary>
public Watermark WithWindow(DateTimeOffset lowWatermark, DateTimeOffset highWatermark)
{
if (highWatermark < lowWatermark)
throw new ArgumentException("High watermark cannot be before low watermark.");
return this with
{
LowWatermark = lowWatermark,
HighWatermark = highWatermark,
UpdatedAt = DateTimeOffset.UtcNow
};
}
}
/// <summary>
/// Snapshot of watermark state for observability.
/// </summary>
public sealed record WatermarkSnapshot(
string ScopeKey,
DateTimeOffset HighWatermark,
DateTimeOffset? LowWatermark,
long SequenceNumber,
long ProcessedCount,
TimeSpan? Lag)
{
/// <summary>
/// Creates a snapshot from a watermark with calculated lag.
/// </summary>
public static WatermarkSnapshot FromWatermark(Watermark watermark, DateTimeOffset now) =>
new(
ScopeKey: watermark.ScopeKey,
HighWatermark: watermark.HighWatermark,
LowWatermark: watermark.LowWatermark,
SequenceNumber: watermark.SequenceNumber,
ProcessedCount: watermark.ProcessedCount,
Lag: now - watermark.HighWatermark);
}
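Checkpointing with these records is Create once, then Advance per batch. A sketch follows; the batch shape and its hash recipe are assumptions — Advance only requires some deterministic digest of the processed batch:

var wm = Watermark.Create(tenantId, sourceId, jobType: null,
    highWatermark: DateTimeOffset.UnixEpoch, createdBy: "system");

foreach (var batch in batches) // batches assumed ordered by event time
{
    ProcessBatch(batch);       // assumed processing hook
    wm = wm.Advance(
        newHighWatermark: batch.MaxEventTime,
        eventsProcessed: batch.Count,
        batchHash: batch.ContentSha256,
        updatedBy: "worker-01");
}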

View File

@@ -0,0 +1,450 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.RateLimiting;
/// <summary>
/// Adaptive rate limiter that combines token bucket, concurrency limiting, and backpressure handling.
/// Provides per-tenant/job-type rate limiting with automatic adaptation to upstream pressure.
/// </summary>
public sealed class AdaptiveRateLimiter
{
private readonly TokenBucket _tokenBucket;
private readonly ConcurrencyLimiter _concurrencyLimiter;
private readonly BackpressureHandler _backpressureHandler;
private readonly HourlyCounter _hourlyCounter;
private readonly object _lock = new();
/// <summary>
/// Tenant ID this limiter applies to.
/// </summary>
public string TenantId { get; }
/// <summary>
/// Job type this limiter applies to (null = all types).
/// </summary>
public string? JobType { get; }
/// <summary>
/// Maximum jobs per hour.
/// </summary>
public int MaxPerHour { get; }
/// <summary>
/// Whether the limiter is paused by operator.
/// </summary>
public bool IsPaused { get; private set; }
/// <summary>
/// Reason for pause (if paused).
/// </summary>
public string? PauseReason { get; private set; }
/// <summary>
/// Creates a new adaptive rate limiter from quota configuration.
/// </summary>
public AdaptiveRateLimiter(Quota quota, TimeProvider? timeProvider = null)
{
ArgumentNullException.ThrowIfNull(quota);
TenantId = quota.TenantId;
JobType = quota.JobType;
MaxPerHour = quota.MaxPerHour;
IsPaused = quota.Paused;
PauseReason = quota.PauseReason;
_tokenBucket = new TokenBucket(
quota.BurstCapacity,
quota.RefillRate,
quota.CurrentTokens,
quota.LastRefillAt);
_concurrencyLimiter = new ConcurrencyLimiter(
quota.MaxActive,
quota.CurrentActive);
_backpressureHandler = new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(1),
maxDelay: TimeSpan.FromMinutes(5),
failureThreshold: 3,
jitterFactor: 0.2);
_hourlyCounter = new HourlyCounter(
quota.MaxPerHour,
quota.CurrentHourCount,
quota.CurrentHourStart);
}
/// <summary>
/// Creates a new adaptive rate limiter with explicit configuration.
/// </summary>
public AdaptiveRateLimiter(
string tenantId,
string? jobType,
int maxActive,
int maxPerHour,
int burstCapacity,
double refillRate)
{
TenantId = tenantId ?? throw new ArgumentNullException(nameof(tenantId));
JobType = jobType;
MaxPerHour = maxPerHour;
_tokenBucket = new TokenBucket(burstCapacity, refillRate);
_concurrencyLimiter = new ConcurrencyLimiter(maxActive);
_backpressureHandler = new BackpressureHandler();
_hourlyCounter = new HourlyCounter(maxPerHour);
}
/// <summary>
/// Attempts to acquire permission to execute a job.
/// </summary>
/// <param name="now">Current time.</param>
/// <returns>Result indicating whether acquisition was successful and why.</returns>
public RateLimitResult TryAcquire(DateTimeOffset now)
{
lock (_lock)
{
// Check if paused
if (IsPaused)
{
return RateLimitResult.Denied(RateLimitDenialReason.Paused, PauseReason);
}
// Check backpressure
if (!_backpressureHandler.ShouldAllow(now))
{
var snapshot = _backpressureHandler.GetSnapshot(now);
return RateLimitResult.Denied(
RateLimitDenialReason.Backpressure,
snapshot.LastFailureReason,
retryAfter: snapshot.TimeRemaining);
}
// Check hourly limit
if (!_hourlyCounter.TryIncrement(now))
{
var hourlySnapshot = _hourlyCounter.GetSnapshot(now);
return RateLimitResult.Denied(
RateLimitDenialReason.HourlyLimitExceeded,
$"Hourly limit of {MaxPerHour} exceeded",
retryAfter: hourlySnapshot.TimeUntilReset);
}
// Check concurrency
if (!_concurrencyLimiter.TryAcquire())
{
// Rollback hourly counter
_hourlyCounter.Decrement();
var concurrencySnapshot = _concurrencyLimiter.GetSnapshot();
return RateLimitResult.Denied(
RateLimitDenialReason.ConcurrencyLimitExceeded,
$"Concurrency limit of {concurrencySnapshot.MaxActive} exceeded");
}
// Check token bucket
if (!_tokenBucket.TryConsume(now))
{
// Rollback concurrency and hourly counter
_concurrencyLimiter.Release();
_hourlyCounter.Decrement();
var waitTime = _tokenBucket.EstimatedWaitTime(now);
return RateLimitResult.Denied(
RateLimitDenialReason.TokensExhausted,
"Token bucket exhausted",
retryAfter: waitTime);
}
return RateLimitResult.Allowed();
}
}
/// <summary>
/// Releases a concurrency slot when a job completes.
/// </summary>
public void Release()
{
lock (_lock)
{
_concurrencyLimiter.Release();
}
}
/// <summary>
/// Records an upstream failure for backpressure calculation.
/// </summary>
/// <param name="statusCode">HTTP status code from upstream.</param>
/// <param name="retryAfter">Optional Retry-After header value.</param>
/// <param name="now">Current time.</param>
/// <returns>Backpressure result.</returns>
public BackpressureResult RecordUpstreamFailure(int statusCode, TimeSpan? retryAfter = null, DateTimeOffset? now = null)
{
lock (_lock)
{
return _backpressureHandler.RecordFailure(statusCode, retryAfter, now);
}
}
/// <summary>
/// Records a successful upstream request.
/// </summary>
public void RecordUpstreamSuccess()
{
lock (_lock)
{
_backpressureHandler.RecordSuccess();
}
}
/// <summary>
/// Pauses the limiter.
/// </summary>
/// <param name="reason">Reason for pause.</param>
public void Pause(string reason)
{
lock (_lock)
{
IsPaused = true;
PauseReason = reason;
}
}
/// <summary>
/// Resumes the limiter.
/// </summary>
public void Resume()
{
lock (_lock)
{
IsPaused = false;
PauseReason = null;
}
}
/// <summary>
/// Gets a snapshot of the current limiter state.
/// </summary>
/// <param name="now">Current time.</param>
/// <returns>Snapshot of limiter state.</returns>
public AdaptiveRateLimiterSnapshot GetSnapshot(DateTimeOffset now)
{
lock (_lock)
{
return new AdaptiveRateLimiterSnapshot(
TenantId: TenantId,
JobType: JobType,
IsPaused: IsPaused,
PauseReason: PauseReason,
TokenBucket: _tokenBucket.GetSnapshot(now),
Concurrency: _concurrencyLimiter.GetSnapshot(),
Backpressure: _backpressureHandler.GetSnapshot(now),
HourlyCounter: _hourlyCounter.GetSnapshot(now));
}
}
/// <summary>
/// Exports the current state to a quota record for persistence.
/// </summary>
/// <param name="quotaId">Original quota ID.</param>
/// <param name="now">Current time.</param>
/// <param name="updatedBy">Actor performing the update.</param>
/// <returns>Quota record with current state.</returns>
public Quota ExportToQuota(Guid quotaId, DateTimeOffset now, string updatedBy)
{
lock (_lock)
{
var tokenSnapshot = _tokenBucket.GetSnapshot(now);
var concurrencySnapshot = _concurrencyLimiter.GetSnapshot();
var hourlySnapshot = _hourlyCounter.GetSnapshot(now);
return new Quota(
QuotaId: quotaId,
TenantId: TenantId,
JobType: JobType,
MaxActive: concurrencySnapshot.MaxActive,
MaxPerHour: MaxPerHour,
BurstCapacity: tokenSnapshot.BurstCapacity,
RefillRate: tokenSnapshot.RefillRate,
CurrentTokens: tokenSnapshot.CurrentTokens,
LastRefillAt: tokenSnapshot.LastRefillAt,
CurrentActive: concurrencySnapshot.CurrentActive,
CurrentHourCount: hourlySnapshot.CurrentCount,
CurrentHourStart: hourlySnapshot.HourStart,
Paused: IsPaused,
PauseReason: PauseReason,
                QuotaTicket: null,  // Not tracked by the limiter; caller should carry over the original ticket.
                CreatedAt: now,     // Likewise: caller should preserve CreatedAt from the original quota record.
UpdatedAt: now,
UpdatedBy: updatedBy);
}
}
}
/// <summary>
/// Result of a rate limit acquisition attempt.
/// </summary>
public sealed record RateLimitResult(
bool IsAllowed,
RateLimitDenialReason? DenialReason,
string? DenialMessage,
TimeSpan? RetryAfter)
{
/// <summary>
/// Creates an allowed result.
/// </summary>
public static RateLimitResult Allowed() => new(true, null, null, null);
/// <summary>
/// Creates a denied result.
/// </summary>
public static RateLimitResult Denied(
RateLimitDenialReason reason,
string? message = null,
TimeSpan? retryAfter = null) =>
new(false, reason, message, retryAfter);
}
/// <summary>
/// Reasons for rate limit denial.
/// </summary>
public enum RateLimitDenialReason
{
/// <summary>Limiter is paused by operator.</summary>
Paused,
/// <summary>In backpressure backoff period.</summary>
Backpressure,
/// <summary>Hourly request limit exceeded.</summary>
HourlyLimitExceeded,
/// <summary>Concurrency limit exceeded.</summary>
ConcurrencyLimitExceeded,
/// <summary>Token bucket exhausted.</summary>
TokensExhausted
}
/// <summary>
/// Snapshot of adaptive rate limiter state.
/// </summary>
public sealed record AdaptiveRateLimiterSnapshot(
string TenantId,
string? JobType,
bool IsPaused,
string? PauseReason,
TokenBucketSnapshot TokenBucket,
ConcurrencySnapshot Concurrency,
BackpressureSnapshot Backpressure,
HourlyCounterSnapshot HourlyCounter);
/// <summary>
/// Tracks requests per hour with automatic reset.
/// </summary>
public sealed class HourlyCounter
{
private readonly object _lock = new();
private int _currentCount;
private DateTimeOffset _hourStart;
/// <summary>
/// Maximum allowed requests per hour.
/// </summary>
public int MaxPerHour { get; }
/// <summary>
/// Creates a new hourly counter.
/// </summary>
public HourlyCounter(int maxPerHour, int currentCount = 0, DateTimeOffset? hourStart = null)
{
if (maxPerHour <= 0)
throw new ArgumentOutOfRangeException(nameof(maxPerHour), "Max per hour must be positive.");
MaxPerHour = maxPerHour;
_currentCount = currentCount;
_hourStart = hourStart ?? TruncateToHour(DateTimeOffset.UtcNow);
}
/// <summary>
/// Attempts to increment the counter.
/// </summary>
/// <param name="now">Current time.</param>
/// <returns>True if increment was allowed, false if limit reached.</returns>
public bool TryIncrement(DateTimeOffset now)
{
lock (_lock)
{
MaybeResetHour(now);
if (_currentCount < MaxPerHour)
{
_currentCount++;
return true;
}
return false;
}
}
/// <summary>
/// Decrements the counter (for rollback).
/// </summary>
public void Decrement()
{
lock (_lock)
{
if (_currentCount > 0)
_currentCount--;
}
}
/// <summary>
/// Gets a snapshot of the counter state.
/// </summary>
public HourlyCounterSnapshot GetSnapshot(DateTimeOffset now)
{
lock (_lock)
{
MaybeResetHour(now);
var nextHour = _hourStart.AddHours(1);
var timeUntilReset = nextHour - now;
return new HourlyCounterSnapshot(
MaxPerHour: MaxPerHour,
CurrentCount: _currentCount,
HourStart: _hourStart,
TimeUntilReset: timeUntilReset > TimeSpan.Zero ? timeUntilReset : TimeSpan.Zero);
}
}
private void MaybeResetHour(DateTimeOffset now)
{
var currentHour = TruncateToHour(now);
if (currentHour > _hourStart)
{
_hourStart = currentHour;
_currentCount = 0;
}
}
private static DateTimeOffset TruncateToHour(DateTimeOffset dt) =>
new(dt.Year, dt.Month, dt.Day, dt.Hour, 0, 0, dt.Offset);
}
/// <summary>
/// Snapshot of hourly counter state.
/// </summary>
public sealed record HourlyCounterSnapshot(
int MaxPerHour,
int CurrentCount,
DateTimeOffset HourStart,
TimeSpan TimeUntilReset)
{
/// <summary>
/// Remaining requests in current hour.
/// </summary>
public int Remaining => Math.Max(0, MaxPerHour - CurrentCount);
/// <summary>
/// Whether the hourly limit has been reached.
/// </summary>
public bool IsExhausted => CurrentCount >= MaxPerHour;
}
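// Illustrative usage sketch (not part of the commit): the intended
// acquire/execute/release pattern around a job. Only members visible above
// (TryAcquire, Release, RecordUpstreamSuccess) are used; the shape of the
// job delegate is an assumption for the example.
internal static class AdaptiveRateLimiterUsageExample
{
    internal static void RunJob(AdaptiveRateLimiter limiter, Action job)
    {
        var result = limiter.TryAcquire(DateTimeOffset.UtcNow);
        if (!result.IsAllowed)
        {
            // Denied: requeue using result.DenialReason / result.RetryAfter.
            return;
        }
        try
        {
            job();
            limiter.RecordUpstreamSuccess();
        }
        finally
        {
            // Always return the concurrency slot taken by TryAcquire.
            limiter.Release();
        }
    }
}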


@@ -0,0 +1,273 @@
namespace StellaOps.Orchestrator.Core.RateLimiting;
/// <summary>
/// Handles backpressure from upstream services (429, 503, etc.).
/// Implements exponential backoff with jitter for retry timing.
/// </summary>
public sealed class BackpressureHandler
{
private readonly object _lock = new();
private int _consecutiveFailures;
private DateTimeOffset? _backoffUntil;
private DateTimeOffset _lastFailureAt;
private string? _lastFailureReason;
/// <summary>
/// Base delay for backoff calculation.
/// </summary>
public TimeSpan BaseDelay { get; }
/// <summary>
/// Maximum delay cap.
/// </summary>
public TimeSpan MaxDelay { get; }
/// <summary>
/// Number of failures before triggering full backoff.
/// </summary>
public int FailureThreshold { get; }
/// <summary>
/// Maximum random jitter to add (0.0 to 1.0 fraction of delay).
/// </summary>
public double JitterFactor { get; }
/// <summary>
/// Whether currently in backoff state.
/// </summary>
public bool IsInBackoff
{
get
{
lock (_lock)
{
return _backoffUntil.HasValue && DateTimeOffset.UtcNow < _backoffUntil.Value;
}
}
}
/// <summary>
/// Number of consecutive failures.
/// </summary>
public int ConsecutiveFailures
{
get
{
lock (_lock)
{
return _consecutiveFailures;
}
}
}
/// <summary>
/// Time until backoff expires (or TimeSpan.Zero if not in backoff).
/// </summary>
public TimeSpan TimeUntilReady
{
get
{
lock (_lock)
{
if (!_backoffUntil.HasValue)
return TimeSpan.Zero;
var remaining = _backoffUntil.Value - DateTimeOffset.UtcNow;
return remaining > TimeSpan.Zero ? remaining : TimeSpan.Zero;
}
}
}
/// <summary>
/// Creates a new backpressure handler.
/// </summary>
/// <param name="baseDelay">Base delay for exponential backoff.</param>
/// <param name="maxDelay">Maximum delay cap.</param>
/// <param name="failureThreshold">Failures before entering backoff.</param>
/// <param name="jitterFactor">Random jitter factor (0.0 to 1.0).</param>
public BackpressureHandler(
TimeSpan? baseDelay = null,
TimeSpan? maxDelay = null,
int failureThreshold = 1,
double jitterFactor = 0.2)
{
BaseDelay = baseDelay ?? TimeSpan.FromSeconds(1);
MaxDelay = maxDelay ?? TimeSpan.FromMinutes(5);
FailureThreshold = failureThreshold > 0 ? failureThreshold : 1;
JitterFactor = Math.Clamp(jitterFactor, 0.0, 1.0);
if (BaseDelay <= TimeSpan.Zero)
throw new ArgumentOutOfRangeException(nameof(baseDelay), "Base delay must be positive.");
if (MaxDelay < BaseDelay)
throw new ArgumentOutOfRangeException(nameof(maxDelay), "Max delay must be >= base delay.");
}
/// <summary>
/// Records an upstream failure and potentially triggers backoff.
/// </summary>
/// <param name="statusCode">HTTP status code from upstream.</param>
/// <param name="retryAfter">Optional Retry-After header value.</param>
/// <param name="now">Current time.</param>
/// <returns>Backoff result with recommended delay.</returns>
public BackpressureResult RecordFailure(int statusCode, TimeSpan? retryAfter = null, DateTimeOffset? now = null)
{
var timestamp = now ?? DateTimeOffset.UtcNow;
lock (_lock)
{
_consecutiveFailures++;
_lastFailureAt = timestamp;
_lastFailureReason = GetFailureReason(statusCode);
// Use Retry-After if provided and reasonable
if (retryAfter.HasValue && retryAfter.Value > TimeSpan.Zero && retryAfter.Value <= MaxDelay)
{
_backoffUntil = timestamp + retryAfter.Value;
return new BackpressureResult(
ShouldBackoff: true,
BackoffDuration: retryAfter.Value,
BackoffUntil: _backoffUntil.Value,
ConsecutiveFailures: _consecutiveFailures,
Reason: _lastFailureReason,
StatusCode: statusCode);
}
            // Calculate exponential backoff with jitter. Backoff is only
            // entered once the failure threshold is reached, so sub-threshold
            // failures are recorded without blocking callers.
            var delay = CalculateBackoffDelay(_consecutiveFailures, timestamp);
            var shouldBackoff = _consecutiveFailures >= FailureThreshold;
            if (shouldBackoff)
            {
                _backoffUntil = timestamp + delay;
            }
            return new BackpressureResult(
                ShouldBackoff: shouldBackoff,
                BackoffDuration: delay,
                BackoffUntil: timestamp + delay,
                ConsecutiveFailures: _consecutiveFailures,
                Reason: _lastFailureReason,
                StatusCode: statusCode);
}
}
/// <summary>
/// Records a successful request, resetting failure count.
/// </summary>
public void RecordSuccess()
{
lock (_lock)
{
_consecutiveFailures = 0;
_backoffUntil = null;
_lastFailureReason = null;
}
}
/// <summary>
/// Checks if a request should be allowed based on backoff state.
/// </summary>
/// <param name="now">Current time.</param>
/// <returns>True if request should proceed, false if in backoff.</returns>
public bool ShouldAllow(DateTimeOffset? now = null)
{
var timestamp = now ?? DateTimeOffset.UtcNow;
lock (_lock)
{
if (!_backoffUntil.HasValue)
return true;
if (timestamp >= _backoffUntil.Value)
{
// Backoff expired
return true;
}
return false;
}
}
/// <summary>
/// Resets the handler to initial state.
/// </summary>
public void Reset()
{
lock (_lock)
{
_consecutiveFailures = 0;
_backoffUntil = null;
_lastFailureReason = null;
}
}
/// <summary>
/// Gets a snapshot of the current backpressure state.
/// </summary>
/// <param name="now">Current time.</param>
/// <returns>Snapshot of backpressure state.</returns>
public BackpressureSnapshot GetSnapshot(DateTimeOffset? now = null)
{
var timestamp = now ?? DateTimeOffset.UtcNow;
lock (_lock)
{
var isInBackoff = _backoffUntil.HasValue && timestamp < _backoffUntil.Value;
var timeRemaining = isInBackoff ? _backoffUntil!.Value - timestamp : TimeSpan.Zero;
return new BackpressureSnapshot(
IsInBackoff: isInBackoff,
ConsecutiveFailures: _consecutiveFailures,
BackoffUntil: _backoffUntil,
TimeRemaining: timeRemaining > TimeSpan.Zero ? timeRemaining : TimeSpan.Zero,
LastFailureAt: _lastFailureAt,
LastFailureReason: _lastFailureReason);
}
}
private TimeSpan CalculateBackoffDelay(int failures, DateTimeOffset now)
{
// Exponential backoff: baseDelay * 2^(failures-1)
var exponent = Math.Min(failures - 1, 10); // Cap exponent to prevent overflow
var delayMs = BaseDelay.TotalMilliseconds * Math.Pow(2, exponent);
// Add jitter
if (JitterFactor > 0)
{
var jitter = delayMs * JitterFactor * Random.Shared.NextDouble();
delayMs += jitter;
}
// Cap at max delay
var delay = TimeSpan.FromMilliseconds(Math.Min(delayMs, MaxDelay.TotalMilliseconds));
return delay;
}
private static string GetFailureReason(int statusCode) => statusCode switch
{
429 => "upstream_rate_limited",
503 => "upstream_unavailable",
502 => "upstream_bad_gateway",
504 => "upstream_timeout",
>= 500 and < 600 => "upstream_server_error",
>= 400 and < 500 => "upstream_client_error",
_ => "upstream_error"
};
}
/// <summary>
/// Result of recording a failure.
/// </summary>
public sealed record BackpressureResult(
bool ShouldBackoff,
TimeSpan BackoffDuration,
DateTimeOffset BackoffUntil,
int ConsecutiveFailures,
string Reason,
int StatusCode);
/// <summary>
/// Snapshot of backpressure handler state.
/// </summary>
public sealed record BackpressureSnapshot(
bool IsInBackoff,
int ConsecutiveFailures,
DateTimeOffset? BackoffUntil,
TimeSpan TimeRemaining,
DateTimeOffset LastFailureAt,
string? LastFailureReason);
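// Illustrative usage sketch (not part of the commit): wrapping an upstream
// HTTP call with the handler's bookkeeping. The HttpClient plumbing and the
// endpoint are assumptions for the example; only ShouldAllow, RecordSuccess,
// and RecordFailure come from the handler above.
internal static class BackpressureHandlerUsageExample
{
    internal static async Task<bool> TryCallUpstreamAsync(
        BackpressureHandler handler, HttpClient client, Uri endpoint)
    {
        if (!handler.ShouldAllow())
        {
            return false; // Still inside the backoff window.
        }
        var response = await client.GetAsync(endpoint).ConfigureAwait(false);
        if (response.IsSuccessStatusCode)
        {
            handler.RecordSuccess();
            return true;
        }
        // Prefer the server-provided Retry-After when present.
        var retryAfter = response.Headers.RetryAfter?.Delta;
        handler.RecordFailure((int)response.StatusCode, retryAfter);
        return false;
    }
}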


@@ -0,0 +1,226 @@
namespace StellaOps.Orchestrator.Core.RateLimiting;
/// <summary>
/// Concurrency limiter that tracks active jobs and enforces maximum concurrent execution.
/// </summary>
public sealed class ConcurrencyLimiter
{
private readonly object _lock = new();
private int _currentActive;
/// <summary>
/// Maximum allowed concurrent active jobs.
/// </summary>
public int MaxActive { get; }
/// <summary>
/// Current count of active jobs.
/// </summary>
public int CurrentActive
{
get
{
lock (_lock)
{
return _currentActive;
}
}
}
/// <summary>
/// Number of available slots.
/// </summary>
public int AvailableSlots
{
get
{
lock (_lock)
{
return Math.Max(0, MaxActive - _currentActive);
}
}
}
/// <summary>
/// Creates a new concurrency limiter.
/// </summary>
/// <param name="maxActive">Maximum concurrent jobs allowed.</param>
/// <param name="currentActive">Starting count of active jobs.</param>
public ConcurrencyLimiter(int maxActive, int currentActive = 0)
{
if (maxActive <= 0)
throw new ArgumentOutOfRangeException(nameof(maxActive), "Max active must be positive.");
if (currentActive < 0)
throw new ArgumentOutOfRangeException(nameof(currentActive), "Current active cannot be negative.");
MaxActive = maxActive;
_currentActive = currentActive;
}
/// <summary>
/// Attempts to acquire a slot for a new active job.
/// </summary>
/// <returns>True if slot was acquired, false if at capacity.</returns>
public bool TryAcquire()
{
lock (_lock)
{
if (_currentActive < MaxActive)
{
_currentActive++;
return true;
}
return false;
}
}
/// <summary>
/// Attempts to acquire multiple slots.
/// </summary>
/// <param name="count">Number of slots to acquire.</param>
/// <returns>True if all slots were acquired, false otherwise (no partial acquisition).</returns>
public bool TryAcquire(int count)
{
if (count <= 0)
throw new ArgumentOutOfRangeException(nameof(count), "Count must be positive.");
lock (_lock)
{
if (_currentActive + count <= MaxActive)
{
_currentActive += count;
return true;
}
return false;
}
}
/// <summary>
/// Releases a slot when a job completes.
/// </summary>
/// <returns>True if slot was released, false if already at zero.</returns>
public bool Release()
{
lock (_lock)
{
if (_currentActive > 0)
{
_currentActive--;
return true;
}
return false;
}
}
/// <summary>
/// Releases multiple slots.
/// </summary>
/// <param name="count">Number of slots to release.</param>
/// <returns>Number of slots actually released.</returns>
public int Release(int count)
{
if (count <= 0)
throw new ArgumentOutOfRangeException(nameof(count), "Count must be positive.");
lock (_lock)
{
var released = Math.Min(count, _currentActive);
_currentActive -= released;
return released;
}
}
/// <summary>
/// Checks if a slot is available without acquiring it.
/// </summary>
/// <returns>True if at least one slot is available.</returns>
public bool HasCapacity()
{
lock (_lock)
{
return _currentActive < MaxActive;
}
}
/// <summary>
/// Checks if multiple slots are available without acquiring them.
/// </summary>
/// <param name="count">Number of slots to check for.</param>
/// <returns>True if requested slots are available.</returns>
public bool HasCapacity(int count)
{
lock (_lock)
{
return _currentActive + count <= MaxActive;
}
}
/// <summary>
/// Resets the limiter to zero active jobs.
/// </summary>
/// <returns>Number of slots that were released.</returns>
public int Reset()
{
lock (_lock)
{
var released = _currentActive;
_currentActive = 0;
return released;
}
}
/// <summary>
/// Sets the current active count directly (for recovery/sync scenarios).
/// </summary>
/// <param name="count">New active count.</param>
public void SetActive(int count)
{
if (count < 0)
throw new ArgumentOutOfRangeException(nameof(count), "Count cannot be negative.");
lock (_lock)
{
_currentActive = count;
}
}
/// <summary>
/// Gets a snapshot of the current limiter state.
/// </summary>
/// <returns>Snapshot of limiter state.</returns>
public ConcurrencySnapshot GetSnapshot()
{
lock (_lock)
{
return new ConcurrencySnapshot(MaxActive, _currentActive);
}
}
}
/// <summary>
/// Immutable snapshot of concurrency limiter state.
/// </summary>
public sealed record ConcurrencySnapshot(
int MaxActive,
int CurrentActive)
{
/// <summary>
/// Number of available slots.
/// </summary>
public int AvailableSlots => Math.Max(0, MaxActive - CurrentActive);
/// <summary>
/// Utilization percentage (0.0 to 1.0).
/// </summary>
public double Utilization => (double)CurrentActive / MaxActive;
/// <summary>
/// Whether the limiter is at capacity.
/// </summary>
public bool IsAtCapacity => CurrentActive >= MaxActive;
/// <summary>
/// Whether there are no active jobs.
/// </summary>
public bool IsIdle => CurrentActive == 0;
}
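// Illustrative usage sketch (not part of the commit): the try/finally shape
// that keeps TryAcquire and Release balanced even when the work throws.
internal static class ConcurrencyLimiterUsageExample
{
    internal static bool TryRun(ConcurrencyLimiter limiter, Action work)
    {
        if (!limiter.TryAcquire())
        {
            return false; // At capacity; caller should retry later.
        }
        try
        {
            work();
            return true;
        }
        finally
        {
            limiter.Release();
        }
    }
}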


@@ -0,0 +1,210 @@
namespace StellaOps.Orchestrator.Core.RateLimiting;
/// <summary>
/// Token bucket rate limiter implementation.
/// Tokens refill at a constant rate up to a burst capacity.
/// </summary>
public sealed class TokenBucket
{
private readonly object _lock = new();
private double _currentTokens;
private DateTimeOffset _lastRefillAt;
/// <summary>
/// Maximum tokens the bucket can hold (burst capacity).
/// </summary>
public int BurstCapacity { get; }
/// <summary>
/// Rate at which tokens are added (tokens per second).
/// </summary>
public double RefillRate { get; }
/// <summary>
/// Current number of available tokens.
/// </summary>
public double CurrentTokens
{
get
{
lock (_lock)
{
return _currentTokens;
}
}
}
/// <summary>
/// Last time the bucket was refilled.
/// </summary>
public DateTimeOffset LastRefillAt
{
get
{
lock (_lock)
{
return _lastRefillAt;
}
}
}
/// <summary>
/// Creates a new token bucket.
/// </summary>
/// <param name="burstCapacity">Maximum tokens the bucket can hold.</param>
/// <param name="refillRate">Tokens per second to add.</param>
/// <param name="initialTokens">Starting number of tokens (defaults to burst capacity).</param>
/// <param name="lastRefillAt">Starting time for refill calculation.</param>
public TokenBucket(
int burstCapacity,
double refillRate,
double? initialTokens = null,
DateTimeOffset? lastRefillAt = null)
{
if (burstCapacity <= 0)
throw new ArgumentOutOfRangeException(nameof(burstCapacity), "Burst capacity must be positive.");
if (refillRate <= 0)
throw new ArgumentOutOfRangeException(nameof(refillRate), "Refill rate must be positive.");
BurstCapacity = burstCapacity;
RefillRate = refillRate;
_currentTokens = Math.Min(initialTokens ?? burstCapacity, burstCapacity);
_lastRefillAt = lastRefillAt ?? DateTimeOffset.UtcNow;
}
/// <summary>
/// Attempts to consume a token from the bucket.
/// </summary>
/// <param name="now">Current time for refill calculation.</param>
/// <param name="tokensRequired">Number of tokens to consume (default 1).</param>
/// <returns>True if tokens were consumed, false if insufficient tokens.</returns>
public bool TryConsume(DateTimeOffset now, int tokensRequired = 1)
{
if (tokensRequired <= 0)
throw new ArgumentOutOfRangeException(nameof(tokensRequired), "Tokens required must be positive.");
lock (_lock)
{
Refill(now);
if (_currentTokens >= tokensRequired)
{
_currentTokens -= tokensRequired;
return true;
}
return false;
}
}
/// <summary>
/// Checks if the bucket has enough tokens without consuming them.
/// </summary>
/// <param name="now">Current time for refill calculation.</param>
/// <param name="tokensRequired">Number of tokens to check for.</param>
/// <returns>True if sufficient tokens are available.</returns>
public bool HasTokens(DateTimeOffset now, int tokensRequired = 1)
{
lock (_lock)
{
Refill(now);
return _currentTokens >= tokensRequired;
}
}
/// <summary>
/// Gets estimated time until the specified number of tokens will be available.
/// </summary>
/// <param name="now">Current time for calculation.</param>
/// <param name="tokensRequired">Number of tokens needed.</param>
/// <returns>Time until tokens available, or TimeSpan.Zero if already available.</returns>
public TimeSpan EstimatedWaitTime(DateTimeOffset now, int tokensRequired = 1)
{
lock (_lock)
{
Refill(now);
if (_currentTokens >= tokensRequired)
return TimeSpan.Zero;
var tokensNeeded = tokensRequired - _currentTokens;
var secondsToWait = tokensNeeded / RefillRate;
return TimeSpan.FromSeconds(secondsToWait);
}
}
/// <summary>
/// Refills tokens based on elapsed time.
/// </summary>
/// <param name="now">Current time.</param>
public void Refill(DateTimeOffset now)
{
lock (_lock)
{
if (now <= _lastRefillAt)
return;
var elapsed = (now - _lastRefillAt).TotalSeconds;
var tokensToAdd = elapsed * RefillRate;
_currentTokens = Math.Min(_currentTokens + tokensToAdd, BurstCapacity);
_lastRefillAt = now;
}
}
/// <summary>
/// Resets the bucket to full capacity.
/// </summary>
/// <param name="now">Current time.</param>
public void Reset(DateTimeOffset now)
{
lock (_lock)
{
_currentTokens = BurstCapacity;
_lastRefillAt = now;
}
}
/// <summary>
/// Creates a snapshot of the current bucket state.
/// </summary>
/// <param name="now">Current time for refill calculation.</param>
/// <returns>Snapshot of bucket state.</returns>
public TokenBucketSnapshot GetSnapshot(DateTimeOffset now)
{
lock (_lock)
{
Refill(now);
return new TokenBucketSnapshot(
BurstCapacity,
RefillRate,
_currentTokens,
_lastRefillAt);
}
}
}
/// <summary>
/// Immutable snapshot of token bucket state.
/// </summary>
public sealed record TokenBucketSnapshot(
int BurstCapacity,
double RefillRate,
double CurrentTokens,
DateTimeOffset LastRefillAt)
{
/// <summary>
/// Percentage of bucket that is full (0.0 to 1.0).
/// </summary>
public double FillPercent => CurrentTokens / BurstCapacity;
/// <summary>
/// Whether the bucket is empty.
/// </summary>
public bool IsEmpty => CurrentTokens < 1;
/// <summary>
/// Whether the bucket is full.
/// </summary>
public bool IsFull => CurrentTokens >= BurstCapacity;
}
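// Illustrative usage sketch (not part of the commit); the capacity and refill
// rate below are arbitrary example numbers.
internal static class TokenBucketUsageExample
{
    internal static TimeSpan ConsumeOrEstimateWait()
    {
        // Burst of 10 tokens, refilling at 2 tokens per second.
        var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);
        var now = DateTimeOffset.UtcNow;
        return bucket.TryConsume(now)
            ? TimeSpan.Zero                   // Token consumed; proceed now.
            : bucket.EstimatedWaitTime(now);  // Empty; wait roughly this long.
    }
}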


@@ -0,0 +1,399 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.Scheduling;
/// <summary>
/// Plans and manages job DAG (Directed Acyclic Graph) execution.
/// Handles dependency resolution, topological sorting, and critical path analysis.
/// </summary>
public sealed class DagPlanner
{
/// <summary>
/// Validates that the given edges form a valid DAG (no cycles).
/// </summary>
/// <param name="edges">DAG edges to validate.</param>
/// <returns>Validation result with any detected cycles.</returns>
public static DagValidationResult ValidateDag(IEnumerable<DagEdge> edges)
{
ArgumentNullException.ThrowIfNull(edges);
var edgeList = edges.ToList();
if (edgeList.Count == 0)
{
return DagValidationResult.Valid();
}
// Build adjacency list
var adjacency = new Dictionary<Guid, List<Guid>>();
var allNodes = new HashSet<Guid>();
foreach (var edge in edgeList)
{
allNodes.Add(edge.ParentJobId);
allNodes.Add(edge.ChildJobId);
if (!adjacency.TryGetValue(edge.ParentJobId, out var children))
{
children = [];
adjacency[edge.ParentJobId] = children;
}
children.Add(edge.ChildJobId);
}
// Detect cycles using DFS with coloring
var white = new HashSet<Guid>(allNodes); // Unvisited
var gray = new HashSet<Guid>(); // In progress
var cycleNodes = new List<Guid>();
foreach (var node in allNodes)
{
if (white.Contains(node))
{
if (HasCycleDfs(node, adjacency, white, gray, cycleNodes))
{
return DagValidationResult.CycleDetected(cycleNodes);
}
}
}
return DagValidationResult.Valid();
}
private static bool HasCycleDfs(
Guid node,
Dictionary<Guid, List<Guid>> adjacency,
HashSet<Guid> white,
HashSet<Guid> gray,
List<Guid> cycleNodes)
{
white.Remove(node);
gray.Add(node);
if (adjacency.TryGetValue(node, out var children))
{
foreach (var child in children)
{
if (gray.Contains(child))
{
// Back edge found - cycle detected
cycleNodes.Add(child);
cycleNodes.Add(node);
return true;
}
if (white.Contains(child) && HasCycleDfs(child, adjacency, white, gray, cycleNodes))
{
cycleNodes.Add(node);
return true;
}
}
}
gray.Remove(node);
return false;
}
/// <summary>
/// Performs topological sort on jobs based on their dependencies.
/// </summary>
/// <param name="jobIds">Job IDs to sort.</param>
/// <param name="edges">Dependency edges.</param>
/// <returns>Jobs in topologically sorted order (parents before children).</returns>
public static IReadOnlyList<Guid> TopologicalSort(IEnumerable<Guid> jobIds, IEnumerable<DagEdge> edges)
{
ArgumentNullException.ThrowIfNull(jobIds);
ArgumentNullException.ThrowIfNull(edges);
var jobs = jobIds.ToHashSet();
var edgeList = edges.ToList();
// Build in-degree map and adjacency list
var inDegree = jobs.ToDictionary(j => j, _ => 0);
var adjacency = new Dictionary<Guid, List<Guid>>();
foreach (var edge in edgeList)
{
if (!jobs.Contains(edge.ParentJobId) || !jobs.Contains(edge.ChildJobId))
{
continue; // Skip edges for jobs not in our set
}
inDegree[edge.ChildJobId]++;
if (!adjacency.TryGetValue(edge.ParentJobId, out var children))
{
children = [];
adjacency[edge.ParentJobId] = children;
}
children.Add(edge.ChildJobId);
}
// Kahn's algorithm
var queue = new Queue<Guid>(inDegree.Where(kv => kv.Value == 0).Select(kv => kv.Key));
var result = new List<Guid>(jobs.Count);
while (queue.Count > 0)
{
var current = queue.Dequeue();
result.Add(current);
if (adjacency.TryGetValue(current, out var children))
{
foreach (var child in children)
{
inDegree[child]--;
if (inDegree[child] == 0)
{
queue.Enqueue(child);
}
}
}
}
if (result.Count != jobs.Count)
{
throw new InvalidOperationException("Cycle detected in job DAG - topological sort failed.");
}
return result;
}
/// <summary>
/// Gets all jobs that have no unmet dependencies (ready to schedule).
/// </summary>
/// <param name="jobs">All jobs in the DAG.</param>
/// <param name="edges">Dependency edges.</param>
/// <returns>Jobs with all dependencies satisfied or no dependencies.</returns>
public static IReadOnlyList<Job> GetReadyJobs(IEnumerable<Job> jobs, IEnumerable<DagEdge> edges)
{
ArgumentNullException.ThrowIfNull(jobs);
ArgumentNullException.ThrowIfNull(edges);
var jobList = jobs.ToList();
var edgeList = edges.ToList();
// Build map of job ID to job and set of succeeded job IDs
var jobMap = jobList.ToDictionary(j => j.JobId);
var succeededJobs = jobList
.Where(j => JobStateMachine.IsSuccess(j.Status))
.Select(j => j.JobId)
.ToHashSet();
// Build map of job ID to parent dependencies
var dependencies = new Dictionary<Guid, List<DagEdge>>();
foreach (var edge in edgeList)
{
if (!dependencies.TryGetValue(edge.ChildJobId, out var deps))
{
deps = [];
dependencies[edge.ChildJobId] = deps;
}
deps.Add(edge);
}
var ready = new List<Job>();
foreach (var job in jobList)
{
// Skip jobs that aren't pending
if (!JobStateMachine.IsPending(job.Status))
{
continue;
}
// Check if all dependencies are satisfied
if (!dependencies.TryGetValue(job.JobId, out var deps))
{
// No dependencies - ready to go
ready.Add(job);
continue;
}
var allSatisfied = deps.All(edge => IsDependencySatisfied(edge, jobMap, succeededJobs));
if (allSatisfied)
{
ready.Add(job);
}
}
return ready;
}
private static bool IsDependencySatisfied(DagEdge edge, Dictionary<Guid, Job> jobMap, HashSet<Guid> succeededJobs)
{
if (!jobMap.TryGetValue(edge.ParentJobId, out var parentJob))
{
// Parent job doesn't exist - treat as satisfied (orphan edge)
return true;
}
return edge.EdgeType switch
{
DagEdgeTypes.Success => succeededJobs.Contains(edge.ParentJobId),
DagEdgeTypes.Always => JobStateMachine.IsTerminal(parentJob.Status),
DagEdgeTypes.Failure => parentJob.Status == JobStatus.Failed,
_ => false
};
}
/// <summary>
/// Calculates the critical path through the DAG based on estimated durations.
/// </summary>
/// <param name="jobs">Jobs with estimated durations.</param>
/// <param name="edges">Dependency edges.</param>
/// <param name="getDuration">Function to get estimated duration for a job.</param>
/// <returns>Critical path information.</returns>
public static CriticalPathResult CalculateCriticalPath(
IEnumerable<Job> jobs,
IEnumerable<DagEdge> edges,
Func<Job, TimeSpan> getDuration)
{
ArgumentNullException.ThrowIfNull(jobs);
ArgumentNullException.ThrowIfNull(edges);
ArgumentNullException.ThrowIfNull(getDuration);
var jobList = jobs.ToList();
var edgeList = edges.ToList();
if (jobList.Count == 0)
{
return new CriticalPathResult([], TimeSpan.Zero);
}
var jobMap = jobList.ToDictionary(j => j.JobId);
var sortedIds = TopologicalSort(jobList.Select(j => j.JobId), edgeList);
// Build reverse adjacency (child -> parents)
var parents = new Dictionary<Guid, List<Guid>>();
foreach (var edge in edgeList)
{
if (!parents.TryGetValue(edge.ChildJobId, out var parentList))
{
parentList = [];
parents[edge.ChildJobId] = parentList;
}
parentList.Add(edge.ParentJobId);
}
// Forward pass: calculate earliest start times
var earliestStart = new Dictionary<Guid, TimeSpan>();
var earliestFinish = new Dictionary<Guid, TimeSpan>();
foreach (var jobId in sortedIds)
{
var job = jobMap[jobId];
var duration = getDuration(job);
var maxParentFinish = TimeSpan.Zero;
if (parents.TryGetValue(jobId, out var parentIds))
{
foreach (var parentId in parentIds)
{
if (earliestFinish.TryGetValue(parentId, out var pf) && pf > maxParentFinish)
{
maxParentFinish = pf;
}
}
}
earliestStart[jobId] = maxParentFinish;
earliestFinish[jobId] = maxParentFinish + duration;
}
// Find total duration and identify critical path
var totalDuration = earliestFinish.Values.DefaultIfEmpty(TimeSpan.Zero).Max();
// Backward pass: identify critical path (jobs where slack = 0)
var criticalPath = new List<Guid>();
var latestFinish = new Dictionary<Guid, TimeSpan>();
foreach (var jobId in sortedIds.Reverse())
{
var job = jobMap[jobId];
var duration = getDuration(job);
// Find minimum latest start of children
var minChildStart = totalDuration;
            var childIds = edgeList.Where(e => e.ParentJobId == jobId).Select(e => e.ChildJobId);
            foreach (var childId in childIds)
{
if (latestFinish.TryGetValue(childId, out var lf))
{
var childLatestStart = lf - getDuration(jobMap[childId]);
if (childLatestStart < minChildStart)
{
minChildStart = childLatestStart;
}
}
}
latestFinish[jobId] = minChildStart;
// Check if on critical path (slack = 0)
var slack = minChildStart - earliestFinish[jobId];
if (slack <= TimeSpan.Zero)
{
criticalPath.Add(jobId);
}
}
criticalPath.Reverse();
return new CriticalPathResult(criticalPath, totalDuration);
}
/// <summary>
/// Gets jobs that are blocked by a specific failed job.
/// </summary>
/// <param name="failedJobId">The failed job ID.</param>
/// <param name="edges">Dependency edges.</param>
/// <returns>All job IDs that are transitively blocked.</returns>
public static IReadOnlySet<Guid> GetBlockedJobs(Guid failedJobId, IEnumerable<DagEdge> edges)
{
ArgumentNullException.ThrowIfNull(edges);
var edgeList = edges.ToList();
var blocked = new HashSet<Guid>();
var queue = new Queue<Guid>();
// Find direct children with "success" dependency
foreach (var edge in edgeList.Where(e => e.ParentJobId == failedJobId && e.EdgeType == DagEdgeTypes.Success))
{
queue.Enqueue(edge.ChildJobId);
}
// BFS to find all transitively blocked jobs
while (queue.Count > 0)
{
var current = queue.Dequeue();
if (!blocked.Add(current))
{
continue;
}
foreach (var edge in edgeList.Where(e => e.ParentJobId == current))
{
queue.Enqueue(edge.ChildJobId);
}
}
return blocked;
}
}
/// <summary>
/// Result of DAG validation.
/// </summary>
public sealed record DagValidationResult(
bool IsValid,
IReadOnlyList<Guid> CycleNodes)
{
public static DagValidationResult Valid() => new(true, []);
public static DagValidationResult CycleDetected(IReadOnlyList<Guid> cycleNodes) => new(false, cycleNodes);
}
/// <summary>
/// Result of critical path calculation.
/// </summary>
public sealed record CriticalPathResult(
IReadOnlyList<Guid> CriticalPathJobIds,
TimeSpan TotalDuration);
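// Illustrative usage sketch (not part of the commit): validate the edges,
// estimate the critical path, and order jobs for dispatch. The one-minute
// per-job duration estimate is a placeholder assumption.
internal static class DagPlannerUsageExample
{
    internal static IReadOnlyList<Guid> PlanOrder(IReadOnlyList<Job> jobs, IReadOnlyList<DagEdge> edges)
    {
        var validation = DagPlanner.ValidateDag(edges);
        if (!validation.IsValid)
        {
            throw new InvalidOperationException(
                $"DAG has a cycle through: {string.Join(", ", validation.CycleNodes)}");
        }
        // Lower bound on wall-clock completion under the placeholder estimate.
        var critical = DagPlanner.CalculateCriticalPath(jobs, edges, _ => TimeSpan.FromMinutes(1));
        Console.WriteLine($"Critical path: {critical.CriticalPathJobIds.Count} jobs, ~{critical.TotalDuration}.");
        return DagPlanner.TopologicalSort(jobs.Select(j => j.JobId), edges);
    }
}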


@@ -0,0 +1,223 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.Scheduling;
/// <summary>
/// Coordinates job scheduling decisions including quota checks,
/// dependency resolution, and status transitions.
/// </summary>
public interface IJobScheduler
{
/// <summary>
/// Evaluates whether a job can be scheduled.
/// </summary>
ScheduleDecision EvaluateScheduling(Job job, SchedulingContext context);
/// <summary>
/// Evaluates the outcome of a job completion and determines next steps.
/// </summary>
CompletionDecision EvaluateCompletion(Job job, JobStatus outcome, string? reason, CompletionContext context);
/// <summary>
/// Evaluates which pending jobs are ready to be scheduled.
/// </summary>
IReadOnlyList<Job> GetSchedulableJobs(IEnumerable<Job> pendingJobs, SchedulingContext context);
}
/// <summary>
/// Default implementation of job scheduler.
/// </summary>
public sealed class JobScheduler : IJobScheduler
{
/// <summary>
/// Evaluates whether a job can transition from Pending to Scheduled.
/// </summary>
public ScheduleDecision EvaluateScheduling(Job job, SchedulingContext context)
{
ArgumentNullException.ThrowIfNull(job);
ArgumentNullException.ThrowIfNull(context);
// Check current status
if (job.Status != JobStatus.Pending)
{
return ScheduleDecision.Reject($"Job is not pending (current: {job.Status})");
}
// Check if job has a not-before time that hasn't passed
if (job.NotBefore.HasValue && job.NotBefore.Value > context.Now)
{
return ScheduleDecision.Defer(job.NotBefore.Value, "Backoff period not elapsed");
}
// Check dependencies
if (!context.AreDependenciesSatisfied)
{
return ScheduleDecision.Defer(null, "Dependencies not satisfied");
}
// Check quota
if (!context.HasQuotaAvailable)
{
return ScheduleDecision.Defer(context.QuotaAvailableAt, "Quota exhausted");
}
// Check if source/job type is throttled
if (context.IsThrottled)
{
return ScheduleDecision.Defer(context.ThrottleExpiresAt, context.ThrottleReason ?? "Throttled");
}
return ScheduleDecision.Schedule();
}
/// <summary>
/// Evaluates the outcome of a job completion.
/// </summary>
public CompletionDecision EvaluateCompletion(Job job, JobStatus outcome, string? reason, CompletionContext context)
{
ArgumentNullException.ThrowIfNull(job);
ArgumentNullException.ThrowIfNull(context);
// Validate transition
if (!JobStateMachine.IsValidTransition(job.Status, outcome))
{
throw new InvalidJobTransitionException(job.Status, outcome);
}
// Success - job is done
if (outcome == JobStatus.Succeeded)
{
return CompletionDecision.Complete(outcome, reason);
}
// Canceled - no retry
if (outcome == JobStatus.Canceled)
{
return CompletionDecision.Complete(outcome, reason ?? "Canceled");
}
// Failed or TimedOut - check retry policy
if (outcome == JobStatus.Failed || outcome == JobStatus.TimedOut)
{
var retryDecision = RetryEvaluator.Evaluate(job.Attempt, context.RetryPolicy, context.Now);
if (retryDecision.ShouldRetry)
{
return CompletionDecision.Retry(
retryDecision.NextAttempt,
retryDecision.NotBefore!.Value,
$"{outcome}: {reason ?? "Unknown error"}. Retry scheduled.");
}
return CompletionDecision.Complete(
JobStatus.Failed,
$"{outcome}: {reason ?? "Unknown error"}. {retryDecision.Reason}");
}
return CompletionDecision.Complete(outcome, reason);
}
/// <summary>
/// Gets all pending jobs that are ready to be scheduled.
/// </summary>
public IReadOnlyList<Job> GetSchedulableJobs(IEnumerable<Job> pendingJobs, SchedulingContext context)
{
ArgumentNullException.ThrowIfNull(pendingJobs);
ArgumentNullException.ThrowIfNull(context);
var schedulable = new List<Job>();
foreach (var job in pendingJobs)
{
if (job.Status != JobStatus.Pending)
{
continue;
}
// Skip if in backoff period
if (job.NotBefore.HasValue && job.NotBefore.Value > context.Now)
{
continue;
}
// Dependencies are checked via context.ReadyJobIds
if (context.ReadyJobIds != null && !context.ReadyJobIds.Contains(job.JobId))
{
continue;
}
schedulable.Add(job);
}
// Sort by priority (descending) then created time (ascending)
return schedulable
.OrderByDescending(j => j.Priority)
.ThenBy(j => j.CreatedAt)
.ToList();
}
}
/// <summary>
/// Context for scheduling decisions.
/// </summary>
public sealed record SchedulingContext(
DateTimeOffset Now,
bool AreDependenciesSatisfied,
bool HasQuotaAvailable,
DateTimeOffset? QuotaAvailableAt,
bool IsThrottled,
string? ThrottleReason,
DateTimeOffset? ThrottleExpiresAt,
IReadOnlySet<Guid>? ReadyJobIds = null)
{
/// <summary>
/// Creates a context where scheduling is allowed.
/// </summary>
public static SchedulingContext AllowScheduling(DateTimeOffset now) => new(
now,
AreDependenciesSatisfied: true,
HasQuotaAvailable: true,
QuotaAvailableAt: null,
IsThrottled: false,
ThrottleReason: null,
ThrottleExpiresAt: null);
}
/// <summary>
/// Context for completion decisions.
/// </summary>
public sealed record CompletionContext(
DateTimeOffset Now,
RetryPolicy RetryPolicy);
/// <summary>
/// Decision about whether to schedule a job.
/// </summary>
public sealed record ScheduleDecision(
bool CanSchedule,
bool ShouldDefer,
DateTimeOffset? DeferUntil,
string? Reason)
{
public static ScheduleDecision Schedule() => new(true, false, null, null);
public static ScheduleDecision Defer(DateTimeOffset? until, string reason) => new(false, true, until, reason);
public static ScheduleDecision Reject(string reason) => new(false, false, null, reason);
}
/// <summary>
/// Decision about job completion outcome.
/// </summary>
public sealed record CompletionDecision(
bool IsComplete,
bool ShouldRetry,
JobStatus FinalStatus,
int? NextAttempt,
DateTimeOffset? RetryNotBefore,
string? Reason)
{
public static CompletionDecision Complete(JobStatus status, string? reason)
=> new(true, false, status, null, null, reason);
public static CompletionDecision Retry(int nextAttempt, DateTimeOffset notBefore, string reason)
=> new(false, true, JobStatus.Pending, nextAttempt, notBefore, reason);
}
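// Illustrative usage sketch (not part of the commit): one scheduling pass.
// The persistence steps are elided; only the decision-making API above is used.
internal static class JobSchedulerUsageExample
{
    internal static void SchedulePending(IJobScheduler scheduler, IEnumerable<Job> pending)
    {
        var context = SchedulingContext.AllowScheduling(DateTimeOffset.UtcNow);
        foreach (var job in scheduler.GetSchedulableJobs(pending, context))
        {
            var decision = scheduler.EvaluateScheduling(job, context);
            if (decision.CanSchedule)
            {
                // Persist the Pending -> Scheduled transition here.
            }
            else if (decision.ShouldDefer)
            {
                // Requeue with decision.DeferUntil as the earliest retry time.
            }
        }
    }
}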


@@ -0,0 +1,141 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.Scheduling;
/// <summary>
/// Manages job status transitions and validates state machine rules.
///
/// State machine:
/// Pending → Scheduled (quota cleared, dependencies satisfied)
/// Scheduled → Leased (worker acquired lease)
/// Leased → Succeeded | Failed | Canceled | TimedOut
/// Failed → Pending (retry) | Failed (exhausted)
/// TimedOut → Pending (retry) | Failed (exhausted)
/// </summary>
public static class JobStateMachine
{
/// <summary>
/// Validates whether a status transition is allowed.
/// </summary>
/// <param name="from">Current status.</param>
/// <param name="to">Target status.</param>
/// <returns>True if transition is valid.</returns>
public static bool IsValidTransition(JobStatus from, JobStatus to)
{
return (from, to) switch
{
// From Pending
(JobStatus.Pending, JobStatus.Scheduled) => true,
(JobStatus.Pending, JobStatus.Canceled) => true,
// From Scheduled
(JobStatus.Scheduled, JobStatus.Leased) => true,
(JobStatus.Scheduled, JobStatus.Canceled) => true,
            (JobStatus.Scheduled, JobStatus.Pending) => true, // Back to pending (e.g. quota exceeded or a dependency failed)
// From Leased
(JobStatus.Leased, JobStatus.Succeeded) => true,
(JobStatus.Leased, JobStatus.Failed) => true,
(JobStatus.Leased, JobStatus.Canceled) => true,
(JobStatus.Leased, JobStatus.TimedOut) => true,
// Retry transitions (Failed/TimedOut back to Pending)
(JobStatus.Failed, JobStatus.Pending) => true,
(JobStatus.TimedOut, JobStatus.Pending) => true,
// Same status (idempotent)
_ when from == to => true,
// All other transitions are invalid
_ => false
};
}
/// <summary>
/// Determines if a job status is terminal (no further transitions except replay).
/// </summary>
public static bool IsTerminal(JobStatus status) => status switch
{
JobStatus.Succeeded => true,
JobStatus.Failed => true,
JobStatus.Canceled => true,
JobStatus.TimedOut => true,
_ => false
};
/// <summary>
/// Determines if a job status represents a successful completion.
/// </summary>
public static bool IsSuccess(JobStatus status) => status == JobStatus.Succeeded;
/// <summary>
/// Determines if a job status represents a failure that may be retried.
/// </summary>
public static bool IsRetryable(JobStatus status) => status switch
{
JobStatus.Failed => true,
JobStatus.TimedOut => true,
_ => false
};
/// <summary>
/// Determines if a job is in a state where it can be leased by a worker.
/// </summary>
public static bool IsLeasable(JobStatus status) => status == JobStatus.Scheduled;
/// <summary>
/// Determines if a job is waiting to be scheduled.
/// </summary>
public static bool IsPending(JobStatus status) => status == JobStatus.Pending;
/// <summary>
/// Determines if a job is currently being executed.
/// </summary>
public static bool IsActive(JobStatus status) => status == JobStatus.Leased;
/// <summary>
/// Gets all valid transitions from a given status.
/// </summary>
public static IReadOnlyList<JobStatus> GetValidTransitions(JobStatus from)
{
return from switch
{
JobStatus.Pending => [JobStatus.Scheduled, JobStatus.Canceled],
JobStatus.Scheduled => [JobStatus.Leased, JobStatus.Canceled, JobStatus.Pending],
JobStatus.Leased => [JobStatus.Succeeded, JobStatus.Failed, JobStatus.Canceled, JobStatus.TimedOut],
JobStatus.Failed => [JobStatus.Pending], // Retry only
JobStatus.TimedOut => [JobStatus.Pending], // Retry only
JobStatus.Succeeded => [],
JobStatus.Canceled => [],
_ => []
};
}
/// <summary>
/// Validates a transition and throws if invalid.
/// </summary>
/// <exception cref="InvalidJobTransitionException">Thrown when transition is not allowed.</exception>
public static void ValidateTransition(JobStatus from, JobStatus to)
{
if (!IsValidTransition(from, to))
{
throw new InvalidJobTransitionException(from, to);
}
}
}
/// <summary>
/// Exception thrown when an invalid job status transition is attempted.
/// </summary>
public sealed class InvalidJobTransitionException : Exception
{
public JobStatus FromStatus { get; }
public JobStatus ToStatus { get; }
public InvalidJobTransitionException(JobStatus from, JobStatus to)
: base($"Invalid job status transition from '{from}' to '{to}'.")
{
FromStatus = from;
ToStatus = to;
}
}
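// Illustrative usage sketch (not part of the commit): guard a status update
// with the state machine before persisting; persistence itself is elided.
internal static class JobStateMachineUsageExample
{
    internal static void ApplyStatus(Job job, JobStatus next)
    {
        // Throws InvalidJobTransitionException for a disallowed transition.
        JobStateMachine.ValidateTransition(job.Status, next);
        // Persist the new status here.
    }
}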


@@ -0,0 +1,173 @@
namespace StellaOps.Orchestrator.Core.Scheduling;
/// <summary>
/// Defines retry behavior for failed jobs.
/// </summary>
public sealed record RetryPolicy(
/// <summary>Maximum number of retry attempts (including initial attempt).</summary>
int MaxAttempts,
/// <summary>Initial backoff delay in seconds.</summary>
double InitialBackoffSeconds,
/// <summary>Maximum backoff delay in seconds.</summary>
double MaxBackoffSeconds,
/// <summary>Backoff multiplier for exponential growth.</summary>
double BackoffMultiplier,
/// <summary>Jitter factor (0.0-1.0) to add randomness to backoff.</summary>
double JitterFactor)
{
/// <summary>
/// Default retry policy: 3 attempts, exponential backoff from 5s to 300s.
/// </summary>
public static RetryPolicy Default { get; } = new(
MaxAttempts: 3,
InitialBackoffSeconds: 5.0,
MaxBackoffSeconds: 300.0,
BackoffMultiplier: 2.0,
JitterFactor: 0.1);
/// <summary>
/// Aggressive retry policy for critical jobs: 5 attempts, quick retries.
/// </summary>
public static RetryPolicy Aggressive { get; } = new(
MaxAttempts: 5,
InitialBackoffSeconds: 1.0,
MaxBackoffSeconds: 60.0,
BackoffMultiplier: 1.5,
JitterFactor: 0.2);
/// <summary>
/// Conservative retry policy: 2 attempts, longer delays.
/// </summary>
public static RetryPolicy Conservative { get; } = new(
MaxAttempts: 2,
InitialBackoffSeconds: 30.0,
MaxBackoffSeconds: 600.0,
BackoffMultiplier: 3.0,
JitterFactor: 0.1);
/// <summary>
/// No retry policy: single attempt only.
/// </summary>
public static RetryPolicy NoRetry { get; } = new(
MaxAttempts: 1,
InitialBackoffSeconds: 0,
MaxBackoffSeconds: 0,
BackoffMultiplier: 1.0,
JitterFactor: 0);
/// <summary>
/// Determines if a job should be retried based on current attempt.
/// </summary>
/// <param name="currentAttempt">Current attempt number (1-based).</param>
/// <returns>True if retry is allowed.</returns>
public bool ShouldRetry(int currentAttempt) => currentAttempt < MaxAttempts;
/// <summary>
/// Calculates the next retry time based on current attempt.
/// </summary>
/// <param name="currentAttempt">Current attempt number (1-based).</param>
/// <param name="now">Current time.</param>
/// <returns>Earliest time for next retry attempt.</returns>
public DateTimeOffset CalculateNextRetryTime(int currentAttempt, DateTimeOffset now)
{
if (!ShouldRetry(currentAttempt))
{
throw new InvalidOperationException($"No retry allowed after attempt {currentAttempt} (max: {MaxAttempts}).");
}
var backoffSeconds = CalculateBackoffSeconds(currentAttempt);
return now.AddSeconds(backoffSeconds);
}
/// <summary>
/// Calculates backoff duration in seconds for a given attempt.
/// </summary>
/// <param name="attempt">Attempt number (1-based).</param>
/// <returns>Backoff duration in seconds.</returns>
public double CalculateBackoffSeconds(int attempt)
{
if (attempt < 1)
{
throw new ArgumentOutOfRangeException(nameof(attempt), "Attempt must be >= 1.");
}
// Exponential backoff: initial * multiplier^(attempt-1)
var exponentialBackoff = InitialBackoffSeconds * Math.Pow(BackoffMultiplier, attempt - 1);
// Cap at maximum
var cappedBackoff = Math.Min(exponentialBackoff, MaxBackoffSeconds);
// Add jitter to prevent thundering herd
var jitter = cappedBackoff * JitterFactor * (Random.Shared.NextDouble() * 2 - 1);
var finalBackoff = Math.Max(0, cappedBackoff + jitter);
return finalBackoff;
}
}
/// <summary>
/// Result of evaluating retry policy for a failed job.
/// </summary>
public sealed record RetryDecision(
/// <summary>Whether the job should be retried.</summary>
bool ShouldRetry,
/// <summary>Next attempt number (if retrying).</summary>
int NextAttempt,
/// <summary>Earliest time for next attempt (if retrying).</summary>
DateTimeOffset? NotBefore,
/// <summary>Reason for the decision.</summary>
string Reason)
{
/// <summary>
/// Creates a retry decision.
/// </summary>
public static RetryDecision Retry(int nextAttempt, DateTimeOffset notBefore)
=> new(true, nextAttempt, notBefore, $"Scheduling retry attempt {nextAttempt}");
/// <summary>
/// Creates a no-retry decision (exhausted).
/// </summary>
public static RetryDecision Exhausted(int maxAttempts)
=> new(false, 0, null, $"Max attempts ({maxAttempts}) exhausted");
/// <summary>
/// Creates a no-retry decision (not retryable status).
/// </summary>
public static RetryDecision NotRetryable(string reason)
=> new(false, 0, null, reason);
}
/// <summary>
/// Service for evaluating retry decisions.
/// </summary>
public static class RetryEvaluator
{
/// <summary>
/// Evaluates whether a job should be retried and calculates timing.
/// </summary>
/// <param name="currentAttempt">Current attempt number.</param>
/// <param name="policy">Retry policy to apply.</param>
/// <param name="now">Current time.</param>
/// <returns>Retry decision.</returns>
public static RetryDecision Evaluate(int currentAttempt, RetryPolicy policy, DateTimeOffset now)
{
ArgumentNullException.ThrowIfNull(policy);
if (!policy.ShouldRetry(currentAttempt))
{
return RetryDecision.Exhausted(policy.MaxAttempts);
}
var nextAttempt = currentAttempt + 1;
var notBefore = policy.CalculateNextRetryTime(currentAttempt, now);
return RetryDecision.Retry(nextAttempt, notBefore);
}
}
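// Illustrative usage sketch (not part of the commit): with RetryPolicy.Default
// (3 attempts, 5s initial backoff, 2.0 multiplier), attempt 1 retries after
// roughly 5s and attempt 2 after roughly 10s, each with up to 10% jitter.
internal static class RetryEvaluatorUsageExample
{
    internal static void Demo()
    {
        var now = DateTimeOffset.UtcNow;
        var first = RetryEvaluator.Evaluate(currentAttempt: 1, RetryPolicy.Default, now);
        // first.ShouldRetry == true, first.NextAttempt == 2, NotBefore ~ now + 5s
        var last = RetryEvaluator.Evaluate(currentAttempt: 3, RetryPolicy.Default, now);
        // last.ShouldRetry == false: "Max attempts (3) exhausted"
        _ = (first, last);
    }
}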


@@ -0,0 +1,341 @@
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Core.SloManagement;
/// <summary>
/// Options for burn rate computation.
/// </summary>
public sealed record BurnRateOptions
{
    /// <summary>Short window multiplier for multi-window burn rate.</summary>
    public double ShortWindowMultiplier { get; init; } = 14.4; // Consumes 2% of a 30-day budget in 1 hour
    /// <summary>Long window multiplier for multi-window burn rate.</summary>
    public double LongWindowMultiplier { get; init; } = 6.0; // Consumes 5% of a 30-day budget in 6 hours
/// <summary>Minimum events required for meaningful computation.</summary>
public int MinimumEvents { get; init; } = 10;
}
/// <summary>
/// Event counts for SLO computation.
/// </summary>
public sealed record SloEventCounts(
/// <summary>Total events in the window.</summary>
long TotalEvents,
/// <summary>Good events (successful) in the window.</summary>
long GoodEvents,
/// <summary>Bad events (failed) in the window.</summary>
long BadEvents,
/// <summary>Start of the evaluation window.</summary>
DateTimeOffset WindowStart,
/// <summary>End of the evaluation window.</summary>
DateTimeOffset WindowEnd);
/// <summary>
/// Interface for retrieving SLO event counts.
/// </summary>
public interface ISloEventSource
{
/// <summary>Gets event counts for an availability SLO.</summary>
Task<SloEventCounts> GetAvailabilityCountsAsync(
string tenantId,
string? jobType,
Guid? sourceId,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
CancellationToken cancellationToken);
/// <summary>Gets event counts for a latency SLO.</summary>
Task<SloEventCounts> GetLatencyCountsAsync(
string tenantId,
string? jobType,
Guid? sourceId,
double percentile,
double targetSeconds,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
CancellationToken cancellationToken);
/// <summary>Gets event counts for a throughput SLO.</summary>
Task<SloEventCounts> GetThroughputCountsAsync(
string tenantId,
string? jobType,
Guid? sourceId,
int minimumRequired,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
CancellationToken cancellationToken);
}
/// <summary>
/// Engine for computing SLO burn rates and error budget consumption.
/// </summary>
public interface IBurnRateEngine
{
/// <summary>Computes the current state of an SLO.</summary>
Task<SloState> ComputeStateAsync(
Slo slo,
CancellationToken cancellationToken);
/// <summary>Computes states for all enabled SLOs for a tenant.</summary>
Task<IReadOnlyList<SloState>> ComputeAllStatesAsync(
string tenantId,
CancellationToken cancellationToken);
/// <summary>Evaluates alert thresholds and creates alerts if needed.</summary>
Task<IReadOnlyList<SloAlert>> EvaluateAlertsAsync(
Slo slo,
SloState state,
CancellationToken cancellationToken);
}
/// <summary>
/// Default implementation of burn rate computation engine.
/// </summary>
public sealed class BurnRateEngine : IBurnRateEngine
{
private readonly ISloRepository _sloRepository;
private readonly ISloEventSource _eventSource;
private readonly IAlertThresholdRepository _thresholdRepository;
private readonly ISloAlertRepository _alertRepository;
private readonly TimeProvider _timeProvider;
private readonly BurnRateOptions _options;
private readonly ILogger<BurnRateEngine> _logger;
public BurnRateEngine(
ISloRepository sloRepository,
ISloEventSource eventSource,
IAlertThresholdRepository thresholdRepository,
ISloAlertRepository alertRepository,
TimeProvider timeProvider,
BurnRateOptions options,
ILogger<BurnRateEngine> logger)
{
_sloRepository = sloRepository ?? throw new ArgumentNullException(nameof(sloRepository));
_eventSource = eventSource ?? throw new ArgumentNullException(nameof(eventSource));
_thresholdRepository = thresholdRepository ?? throw new ArgumentNullException(nameof(thresholdRepository));
_alertRepository = alertRepository ?? throw new ArgumentNullException(nameof(alertRepository));
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<SloState> ComputeStateAsync(
Slo slo,
CancellationToken cancellationToken)
{
var now = _timeProvider.GetUtcNow();
var windowDuration = slo.GetWindowDuration();
var windowStart = now - windowDuration;
// Get event counts based on SLO type
var counts = slo.Type switch
{
SloType.Availability => await _eventSource.GetAvailabilityCountsAsync(
slo.TenantId, slo.JobType, slo.SourceId, windowStart, now, cancellationToken).ConfigureAwait(false),
SloType.Latency => await _eventSource.GetLatencyCountsAsync(
slo.TenantId, slo.JobType, slo.SourceId,
slo.LatencyPercentile ?? 0.95,
slo.LatencyTargetSeconds ?? 1.0,
windowStart, now, cancellationToken).ConfigureAwait(false),
SloType.Throughput => await _eventSource.GetThroughputCountsAsync(
slo.TenantId, slo.JobType, slo.SourceId,
slo.ThroughputMinimum ?? 1,
windowStart, now, cancellationToken).ConfigureAwait(false),
_ => throw new InvalidOperationException($"Unknown SLO type: {slo.Type}")
};
// Handle no data case
if (counts.TotalEvents < _options.MinimumEvents)
{
_logger.LogDebug(
"SLO {SloId} has insufficient data ({Events} events, minimum {Min})",
slo.SloId, counts.TotalEvents, _options.MinimumEvents);
return SloState.NoData(slo.SloId, slo.TenantId, now, slo.Window);
}
// Compute SLI (Service Level Indicator)
var sli = (double)counts.GoodEvents / counts.TotalEvents;
// Compute error budget consumption
var errorBudget = slo.ErrorBudget;
var errorRate = 1.0 - sli;
var budgetConsumed = errorBudget > 0 ? errorRate / errorBudget : (errorRate > 0 ? 1.0 : 0.0);
budgetConsumed = Math.Clamp(budgetConsumed, 0, 2.0); // Allow showing overconsumption up to 200%
var budgetRemaining = Math.Max(0, 1.0 - budgetConsumed);
        // Compute burn rate: actual error rate divided by the sustainable
        // error rate, where sustainable = errorBudget * (elapsed / window duration).
var elapsedRatio = (now - counts.WindowStart).TotalSeconds / windowDuration.TotalSeconds;
var sustainableErrorRate = errorBudget * elapsedRatio;
var burnRate = sustainableErrorRate > 0 ? errorRate / sustainableErrorRate : 0;
// Compute time to exhaustion
TimeSpan? timeToExhaustion = null;
if (burnRate > 0 && budgetRemaining > 0)
{
var remainingBudget = errorBudget * budgetRemaining;
var currentErrorRatePerSecond = errorRate / (now - counts.WindowStart).TotalSeconds;
if (currentErrorRatePerSecond > 0)
{
var secondsToExhaustion = remainingBudget / currentErrorRatePerSecond;
timeToExhaustion = TimeSpan.FromSeconds(Math.Min(secondsToExhaustion, windowDuration.TotalSeconds));
}
}
// Determine if SLO is met
var isMet = sli >= slo.Target;
// Determine alert severity
var alertSeverity = DetermineAlertSeverity(budgetConsumed, burnRate);
var state = new SloState(
SloId: slo.SloId,
TenantId: slo.TenantId,
CurrentSli: sli,
TotalEvents: counts.TotalEvents,
GoodEvents: counts.GoodEvents,
BadEvents: counts.BadEvents,
BudgetConsumed: budgetConsumed,
BudgetRemaining: budgetRemaining,
BurnRate: burnRate,
TimeToExhaustion: timeToExhaustion,
IsMet: isMet,
AlertSeverity: alertSeverity,
ComputedAt: now,
WindowStart: counts.WindowStart,
WindowEnd: counts.WindowEnd);
_logger.LogDebug(
"SLO {SloId} state computed: SLI={Sli:P2}, BudgetConsumed={BudgetConsumed:P1}, BurnRate={BurnRate:F2}x",
slo.SloId, state.CurrentSli, state.BudgetConsumed, state.BurnRate);
return state;
}
public async Task<IReadOnlyList<SloState>> ComputeAllStatesAsync(
string tenantId,
CancellationToken cancellationToken)
{
var slos = await _sloRepository.ListAsync(tenantId, enabledOnly: true, cancellationToken: cancellationToken)
.ConfigureAwait(false);
var states = new List<SloState>(slos.Count);
foreach (var slo in slos)
{
try
{
var state = await ComputeStateAsync(slo, cancellationToken).ConfigureAwait(false);
states.Add(state);
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to compute state for SLO {SloId}", slo.SloId);
// Add no-data state for failed computation
states.Add(SloState.NoData(slo.SloId, slo.TenantId, _timeProvider.GetUtcNow(), slo.Window));
}
}
return states;
}
public async Task<IReadOnlyList<SloAlert>> EvaluateAlertsAsync(
Slo slo,
SloState state,
CancellationToken cancellationToken)
{
var now = _timeProvider.GetUtcNow();
var thresholds = await _thresholdRepository.ListBySloAsync(slo.SloId, cancellationToken)
.ConfigureAwait(false);
var alerts = new List<SloAlert>();
foreach (var threshold in thresholds)
{
if (!threshold.ShouldTrigger(state, now))
{
continue;
}
var alert = SloAlert.Create(slo, state, threshold);
await _alertRepository.CreateAsync(alert, cancellationToken).ConfigureAwait(false);
var updatedThreshold = threshold.RecordTrigger(now);
await _thresholdRepository.UpdateAsync(updatedThreshold, cancellationToken).ConfigureAwait(false);
alerts.Add(alert);
_logger.LogWarning(
"SLO alert triggered: SloId={SloId}, Severity={Severity}, Message={Message}",
slo.SloId, alert.Severity, alert.Message);
}
return alerts;
}
private static AlertSeverity DetermineAlertSeverity(double budgetConsumed, double burnRate)
{
// Emergency: Budget exhausted or burn rate extremely high
if (budgetConsumed >= 1.0 || burnRate >= 10.0)
return AlertSeverity.Emergency;
// Critical: Budget nearly exhausted or burn rate very high
if (budgetConsumed >= 0.8 || burnRate >= 5.0)
return AlertSeverity.Critical;
// Warning: Budget significantly consumed or elevated burn rate
if (budgetConsumed >= 0.5 || burnRate >= 2.0)
return AlertSeverity.Warning;
// Info: Everything is normal
return AlertSeverity.Info;
}
}
/// <summary>
/// Repository interface for SLO persistence.
/// </summary>
public interface ISloRepository
{
Task<Slo?> GetByIdAsync(string tenantId, Guid sloId, CancellationToken cancellationToken);
Task<IReadOnlyList<Slo>> ListAsync(string tenantId, bool enabledOnly, string? jobType = null, CancellationToken cancellationToken = default);
Task CreateAsync(Slo slo, CancellationToken cancellationToken);
Task UpdateAsync(Slo slo, CancellationToken cancellationToken);
Task<bool> DeleteAsync(string tenantId, Guid sloId, CancellationToken cancellationToken);
}
/// <summary>
/// Repository interface for alert threshold persistence.
/// </summary>
public interface IAlertThresholdRepository
{
Task<AlertBudgetThreshold?> GetByIdAsync(string tenantId, Guid thresholdId, CancellationToken cancellationToken);
Task<IReadOnlyList<AlertBudgetThreshold>> ListBySloAsync(Guid sloId, CancellationToken cancellationToken);
Task CreateAsync(AlertBudgetThreshold threshold, CancellationToken cancellationToken);
Task UpdateAsync(AlertBudgetThreshold threshold, CancellationToken cancellationToken);
Task<bool> DeleteAsync(string tenantId, Guid thresholdId, CancellationToken cancellationToken);
}
/// <summary>
/// Repository interface for SLO alert persistence.
/// </summary>
public interface ISloAlertRepository
{
Task<SloAlert?> GetByIdAsync(string tenantId, Guid alertId, CancellationToken cancellationToken);
Task<IReadOnlyList<SloAlert>> ListAsync(string tenantId, Guid? sloId, bool? acknowledged, bool? resolved, int limit, int offset, CancellationToken cancellationToken);
Task CreateAsync(SloAlert alert, CancellationToken cancellationToken);
Task UpdateAsync(SloAlert alert, CancellationToken cancellationToken);
Task<int> GetActiveAlertCountAsync(string tenantId, CancellationToken cancellationToken);
}
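The severity ladder in DetermineAlertSeverity (Info below 50% budget / 2x burn, then Warning, Critical, Emergency) pairs with the repository contracts above in a straightforward polling pass. A minimal sketch follows; the interface name ISloEvaluationService and the hosting class are assumptions, and only the two method signatures visible above are relied on:

// Assumed abstraction over the evaluation service above; the member signatures
// mirror ComputeAllStatesAsync/EvaluateAlertsAsync exactly.
public interface ISloEvaluationService
{
Task<IReadOnlyList<SloState>> ComputeAllStatesAsync(string tenantId, CancellationToken cancellationToken);
Task<IReadOnlyList<SloAlert>> EvaluateAlertsAsync(Slo slo, SloState state, CancellationToken cancellationToken);
}
/// <summary>
/// Hypothetical per-tenant polling pass: compute all SLO states, then fan out
/// to alert evaluation for each one.
/// </summary>
public sealed class SloEvaluationPass
{
private readonly ISloEvaluationService _evaluation;
private readonly ISloRepository _slos;
public SloEvaluationPass(ISloEvaluationService evaluation, ISloRepository slos)
{
_evaluation = evaluation;
_slos = slos;
}
public async Task RunAsync(string tenantId, CancellationToken cancellationToken)
{
var states = await _evaluation.ComputeAllStatesAsync(tenantId, cancellationToken).ConfigureAwait(false);
foreach (var state in states)
{
var slo = await _slos.GetByIdAsync(tenantId, state.SloId, cancellationToken).ConfigureAwait(false);
if (slo is null)
{
continue; // SLO deleted between computation and evaluation.
}
await _evaluation.EvaluateAlertsAsync(slo, state, cancellationToken).ConfigureAwait(false);
}
}
}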

View File

@@ -1,18 +1,20 @@
<?xml version="1.0" ?>
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.0-rc.2.25502.107" />
</ItemGroup>
</Project>

View File

@@ -1,6 +0,0 @@
namespace StellaOps.Orchestrator.Infrastructure;
public class Class1
{
}

View File

@@ -0,0 +1,45 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Ledger;
/// <summary>
/// Service for exporting ledger data in various formats.
/// </summary>
public interface ILedgerExporter
{
/// <summary>
/// Exports ledger entries to a file.
/// </summary>
/// <param name="export">The export request.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The completed export with output details.</returns>
Task<LedgerExport> ExportAsync(
LedgerExport export,
CancellationToken cancellationToken = default);
/// <summary>
/// Generates a signed manifest for a ledger entry.
/// </summary>
/// <param name="entry">The ledger entry.</param>
/// <param name="artifacts">The artifacts from the run.</param>
/// <param name="buildInfo">Optional build information.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The generated manifest.</returns>
Task<SignedManifest> GenerateManifestAsync(
RunLedgerEntry entry,
IReadOnlyList<Artifact> artifacts,
string? buildInfo = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Generates a signed manifest for an export.
/// </summary>
/// <param name="export">The completed export.</param>
/// <param name="entries">The entries included in the export.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The generated manifest.</returns>
Task<SignedManifest> GenerateExportManifestAsync(
LedgerExport export,
IReadOnlyList<RunLedgerEntry> entries,
CancellationToken cancellationToken = default);
}
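For orientation, a hedged usage sketch of this interface: run the export, then attest the result. How the LedgerExport and its entries are obtained is assumed; only the interface methods above are relied on.

// Sketch: run an export and produce a signed manifest over the completed result.
public static async Task<SignedManifest> ExportAndAttestAsync(
ILedgerExporter exporter,
LedgerExport export,
IReadOnlyList<RunLedgerEntry> entries,
CancellationToken cancellationToken)
{
var completed = await exporter.ExportAsync(export, cancellationToken).ConfigureAwait(false);
return await exporter.GenerateExportManifestAsync(completed, entries, cancellationToken).ConfigureAwait(false);
}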

View File

@@ -0,0 +1,309 @@
using System.Globalization;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Ledger;
/// <summary>
/// Service for exporting ledger data in various formats.
/// </summary>
public sealed class LedgerExporter : ILedgerExporter
{
private readonly ILedgerRepository _ledgerRepository;
private readonly ILedgerExportRepository _exportRepository;
private readonly ILogger<LedgerExporter> _logger;
private static readonly JsonSerializerOptions JsonOptions = new()
{
WriteIndented = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
private static readonly JsonSerializerOptions NdjsonOptions = new()
{
WriteIndented = false,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
public LedgerExporter(
ILedgerRepository ledgerRepository,
ILedgerExportRepository exportRepository,
ILogger<LedgerExporter> logger)
{
_ledgerRepository = ledgerRepository;
_exportRepository = exportRepository;
_logger = logger;
}
/// <inheritdoc />
public async Task<LedgerExport> ExportAsync(
LedgerExport export,
CancellationToken cancellationToken = default)
{
var startTime = DateTimeOffset.UtcNow;
try
{
_logger.LogInformation(
"Starting ledger export {ExportId} for tenant {TenantId} in format {Format}",
export.ExportId, export.TenantId, export.Format);
// Mark export as started
export = export.Start();
export = await _exportRepository.UpdateAsync(export, cancellationToken).ConfigureAwait(false);
// Fetch entries based on filters
var entries = await _ledgerRepository.ListAsync(
export.TenantId,
export.RunTypeFilter,
export.SourceIdFilter,
finalStatus: null,
export.StartTime,
export.EndTime,
limit: int.MaxValue,
offset: 0,
cancellationToken).ConfigureAwait(false);
_logger.LogInformation(
"Found {EntryCount} ledger entries for export {ExportId}",
entries.Count, export.ExportId);
// Generate output based on format
var (content, digest) = GenerateOutput(entries, export.Format);
// Generate output path (in production, this would write to storage)
var outputUri = GenerateOutputUri(export);
var sizeBytes = Encoding.UTF8.GetByteCount(content);
// Complete the export
export = export.Complete(outputUri, digest, sizeBytes, entries.Count);
export = await _exportRepository.UpdateAsync(export, cancellationToken).ConfigureAwait(false);
var duration = DateTimeOffset.UtcNow - startTime;
OrchestratorMetrics.LedgerExportCompleted(export.TenantId, export.Format);
OrchestratorMetrics.RecordLedgerExportDuration(export.TenantId, export.Format, duration.TotalSeconds);
OrchestratorMetrics.RecordLedgerExportSize(export.TenantId, export.Format, sizeBytes);
_logger.LogInformation(
"Completed ledger export {ExportId} with {EntryCount} entries, {SizeBytes} bytes",
export.ExportId, entries.Count, sizeBytes);
return export;
}
catch (Exception ex)
{
_logger.LogError(ex,
"Failed to export ledger {ExportId} for tenant {TenantId}",
export.ExportId, export.TenantId);
OrchestratorMetrics.LedgerExportFailed(export.TenantId, export.Format);
export = export.Fail(ex.Message);
export = await _exportRepository.UpdateAsync(export, cancellationToken).ConfigureAwait(false);
throw;
}
}
/// <inheritdoc />
public Task<SignedManifest> GenerateManifestAsync(
RunLedgerEntry entry,
IReadOnlyList<Artifact> artifacts,
string? buildInfo = null,
CancellationToken cancellationToken = default)
{
_logger.LogInformation(
"Generating manifest for ledger entry {LedgerId}, run {RunId}",
entry.LedgerId, entry.RunId);
var manifest = SignedManifest.CreateFromLedgerEntry(entry, buildInfo);
OrchestratorMetrics.ManifestCreated(entry.TenantId, "run");
return Task.FromResult(manifest);
}
/// <inheritdoc />
public Task<SignedManifest> GenerateExportManifestAsync(
LedgerExport export,
IReadOnlyList<RunLedgerEntry> entries,
CancellationToken cancellationToken = default)
{
_logger.LogInformation(
"Generating manifest for export {ExportId} with {EntryCount} entries",
export.ExportId, entries.Count);
var manifest = SignedManifest.CreateFromExport(export, entries);
OrchestratorMetrics.ManifestCreated(export.TenantId, "export");
return Task.FromResult(manifest);
}
private static (string Content, string Digest) GenerateOutput(
IReadOnlyList<RunLedgerEntry> entries,
string format)
{
var content = format.ToLowerInvariant() switch
{
"json" => GenerateJson(entries),
"ndjson" => GenerateNdjson(entries),
"csv" => GenerateCsv(entries),
_ => throw new ArgumentException($"Unsupported export format: {format}", nameof(format))
};
// Compute digest over the UTF-8 bytes. SHA-256 is fast enough to run inline,
// so there is no need to offload it to the thread pool.
var bytes = Encoding.UTF8.GetBytes(content);
var hash = SHA256.HashData(bytes);
var digest = $"sha256:{Convert.ToHexStringLower(hash)}";
return (content, digest);
}
private static string GenerateJson(IReadOnlyList<RunLedgerEntry> entries)
{
var exportData = new LedgerExportData
{
SchemaVersion = "1.0.0",
ExportedAt = DateTimeOffset.UtcNow,
EntryCount = entries.Count,
Entries = entries.Select(MapEntry).ToList()
};
return JsonSerializer.Serialize(exportData, JsonOptions);
}
private static string GenerateNdjson(IReadOnlyList<RunLedgerEntry> entries)
{
var sb = new StringBuilder();
foreach (var entry in entries)
{
var mapped = MapEntry(entry);
sb.AppendLine(JsonSerializer.Serialize(mapped, NdjsonOptions));
}
return sb.ToString();
}
private static string GenerateCsv(IReadOnlyList<RunLedgerEntry> entries)
{
var sb = new StringBuilder();
// Header
sb.AppendLine("LedgerId,TenantId,RunId,SourceId,RunType,FinalStatus,TotalJobs,SucceededJobs,FailedJobs,ExecutionDurationMs,InputDigest,OutputDigest,SequenceNumber,ContentHash,PreviousEntryHash,RunCreatedAt,RunCompletedAt,LedgerCreatedAt");
// Data rows
foreach (var entry in entries)
{
sb.AppendLine(string.Join(",",
EscapeCsv(entry.LedgerId.ToString()),
EscapeCsv(entry.TenantId),
EscapeCsv(entry.RunId.ToString()),
EscapeCsv(entry.SourceId.ToString()),
EscapeCsv(entry.RunType),
EscapeCsv(entry.FinalStatus.ToString()),
entry.TotalJobs,
entry.SucceededJobs,
entry.FailedJobs,
entry.ExecutionDuration.TotalMilliseconds.ToString(CultureInfo.InvariantCulture),
EscapeCsv(entry.InputDigest),
EscapeCsv(entry.OutputDigest),
entry.SequenceNumber,
EscapeCsv(entry.ContentHash),
EscapeCsv(entry.PreviousEntryHash ?? ""),
EscapeCsv(entry.RunCreatedAt.ToString("O")),
EscapeCsv(entry.RunCompletedAt.ToString("O")),
EscapeCsv(entry.LedgerCreatedAt.ToString("O"))));
}
return sb.ToString();
}
private static string EscapeCsv(string value)
{
if (string.IsNullOrEmpty(value))
return "";
if (value.Contains(',') || value.Contains('"') || value.Contains('\n'))
{
return $"\"{value.Replace("\"", "\"\"")}\"";
}
return value;
}
private static LedgerEntryDto MapEntry(RunLedgerEntry entry) => new()
{
LedgerId = entry.LedgerId,
TenantId = entry.TenantId,
RunId = entry.RunId,
SourceId = entry.SourceId,
RunType = entry.RunType,
FinalStatus = entry.FinalStatus.ToString(),
TotalJobs = entry.TotalJobs,
SucceededJobs = entry.SucceededJobs,
FailedJobs = entry.FailedJobs,
ExecutionDurationMs = entry.ExecutionDuration.TotalMilliseconds,
InputDigest = entry.InputDigest,
OutputDigest = entry.OutputDigest,
ArtifactManifest = entry.ArtifactManifest,
SequenceNumber = entry.SequenceNumber,
ContentHash = entry.ContentHash,
PreviousEntryHash = entry.PreviousEntryHash,
RunCreatedAt = entry.RunCreatedAt,
RunCompletedAt = entry.RunCompletedAt,
LedgerCreatedAt = entry.LedgerCreatedAt,
Metadata = entry.Metadata
};
private static string GenerateOutputUri(LedgerExport export)
{
var extension = export.Format.ToLowerInvariant() switch
{
"json" => "json",
"ndjson" => "ndjson",
"csv" => "csv",
_ => "dat"
};
return $"ledger://exports/{export.TenantId}/{export.ExportId}.{extension}";
}
private sealed class LedgerExportData
{
public required string SchemaVersion { get; init; }
public required DateTimeOffset ExportedAt { get; init; }
public required int EntryCount { get; init; }
public required List<LedgerEntryDto> Entries { get; init; }
}
private sealed class LedgerEntryDto
{
public required Guid LedgerId { get; init; }
public required string TenantId { get; init; }
public required Guid RunId { get; init; }
public required Guid SourceId { get; init; }
public required string RunType { get; init; }
public required string FinalStatus { get; init; }
public required int TotalJobs { get; init; }
public required int SucceededJobs { get; init; }
public required int FailedJobs { get; init; }
public required double ExecutionDurationMs { get; init; }
public required string InputDigest { get; init; }
public required string OutputDigest { get; init; }
public required string ArtifactManifest { get; init; }
public required long SequenceNumber { get; init; }
public required string ContentHash { get; init; }
public string? PreviousEntryHash { get; init; }
public required DateTimeOffset RunCreatedAt { get; init; }
public required DateTimeOffset RunCompletedAt { get; init; }
public required DateTimeOffset LedgerCreatedAt { get; init; }
public string? Metadata { get; init; }
}
}
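Because the exporter records digests as sha256:&lt;lowercase hex&gt; over the UTF-8 output, consumers can re-verify an export offline. A minimal sketch, assuming the export content has already been fetched from storage:

using System.Security.Cryptography;
using System.Text;
// Recompute the digest of exported content and compare it with the recorded value.
public static bool VerifyExportDigest(string content, string recordedDigest)
{
var hash = SHA256.HashData(Encoding.UTF8.GetBytes(content));
var computed = $"sha256:{Convert.ToHexStringLower(hash)}";
return string.Equals(computed, recordedDigest, StringComparison.Ordinal);
}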

View File

@@ -0,0 +1,660 @@
using System.Diagnostics.Metrics;
namespace StellaOps.Orchestrator.Infrastructure;
/// <summary>
/// Metrics instrumentation for the Orchestrator service.
/// </summary>
public static class OrchestratorMetrics
{
private static readonly Meter Meter = new("StellaOps.Orchestrator", "1.0.0");
private static readonly Counter<long> JobsEnqueued = Meter.CreateCounter<long>(
"orchestrator.jobs.enqueued",
description: "Total jobs enqueued");
private static readonly Counter<long> JobsScheduled = Meter.CreateCounter<long>(
"orchestrator.jobs.scheduled",
description: "Total jobs scheduled");
private static readonly Counter<long> JobsLeased = Meter.CreateCounter<long>(
"orchestrator.jobs.leased",
description: "Total jobs leased to workers");
private static readonly Counter<long> JobsCompleted = Meter.CreateCounter<long>(
"orchestrator.jobs.completed",
description: "Total jobs completed");
private static readonly Counter<long> JobsFailed = Meter.CreateCounter<long>(
"orchestrator.jobs.failed",
description: "Total jobs failed");
private static readonly Counter<long> JobsRetried = Meter.CreateCounter<long>(
"orchestrator.jobs.retried",
description: "Total job retry attempts");
private static readonly Counter<long> LeaseExtensions = Meter.CreateCounter<long>(
"orchestrator.lease.extensions",
description: "Total lease extensions");
private static readonly Counter<long> LeaseExpirations = Meter.CreateCounter<long>(
"orchestrator.lease.expirations",
description: "Total lease expirations");
private static readonly Histogram<double> JobDuration = Meter.CreateHistogram<double>(
"orchestrator.job.duration.seconds",
unit: "s",
description: "Job execution duration");
private static readonly Histogram<double> SchedulingLatency = Meter.CreateHistogram<double>(
"orchestrator.scheduling.latency.seconds",
unit: "s",
description: "Time from job creation to scheduling");
private static readonly UpDownCounter<long> ActiveConnections = Meter.CreateUpDownCounter<long>(
"orchestrator.db.connections.active",
description: "Active database connections");
private static readonly UpDownCounter<long> QueueDepth = Meter.CreateUpDownCounter<long>(
"orchestrator.queue.depth",
description: "Number of pending jobs in queue");
private static readonly Counter<long> ArtifactsCreated = Meter.CreateCounter<long>(
"orchestrator.artifacts.created",
description: "Total artifacts created");
private static readonly Counter<long> HeartbeatsReceived = Meter.CreateCounter<long>(
"orchestrator.heartbeats.received",
description: "Total worker heartbeats received");
private static readonly Counter<long> ProgressReports = Meter.CreateCounter<long>(
"orchestrator.progress.reports",
description: "Total job progress reports");
private static readonly Counter<long> SourcesCreated = Meter.CreateCounter<long>(
"orchestrator.sources.created",
description: "Total sources created");
private static readonly Counter<long> SourcesPaused = Meter.CreateCounter<long>(
"orchestrator.sources.paused",
description: "Total source pause operations");
private static readonly Counter<long> SourcesResumed = Meter.CreateCounter<long>(
"orchestrator.sources.resumed",
description: "Total source resume operations");
private static readonly Counter<long> RunsCreated = Meter.CreateCounter<long>(
"orchestrator.runs.created",
description: "Total runs created");
private static readonly Counter<long> RunsCompleted = Meter.CreateCounter<long>(
"orchestrator.runs.completed",
description: "Total runs completed");
private static readonly Counter<long> QuotasCreated = Meter.CreateCounter<long>(
"orchestrator.quotas.created",
description: "Total quotas created");
private static readonly Counter<long> QuotasPaused = Meter.CreateCounter<long>(
"orchestrator.quotas.paused",
description: "Total quota pause operations");
private static readonly Counter<long> QuotasResumed = Meter.CreateCounter<long>(
"orchestrator.quotas.resumed",
description: "Total quota resume operations");
private static readonly Counter<long> ThrottlesCreated = Meter.CreateCounter<long>(
"orchestrator.throttles.created",
description: "Total throttles created");
private static readonly Counter<long> ThrottlesDeactivated = Meter.CreateCounter<long>(
"orchestrator.throttles.deactivated",
description: "Total throttles deactivated");
private static readonly Counter<long> RateLimitDenials = Meter.CreateCounter<long>(
"orchestrator.ratelimit.denials",
description: "Total rate limit denials");
private static readonly Counter<long> BackpressureEvents = Meter.CreateCounter<long>(
"orchestrator.backpressure.events",
description: "Total backpressure events from upstream");
private static readonly Histogram<double> TokenBucketUtilization = Meter.CreateHistogram<double>(
"orchestrator.ratelimit.token_utilization",
unit: "ratio",
description: "Token bucket utilization ratio (0-1)");
private static readonly Histogram<double> ConcurrencyUtilization = Meter.CreateHistogram<double>(
"orchestrator.ratelimit.concurrency_utilization",
unit: "ratio",
description: "Concurrency limiter utilization ratio (0-1)");
public static void JobEnqueued(string tenantId, string jobType)
=> JobsEnqueued.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void JobScheduled(string tenantId, string jobType)
=> JobsScheduled.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void JobLeased(string tenantId, string jobType)
=> JobsLeased.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void JobCompleted(string tenantId, string jobType, string status)
=> JobsCompleted.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType),
new KeyValuePair<string, object?>("status", status));
public static void JobFailed(string tenantId, string jobType)
=> JobsFailed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void JobRetried(string tenantId, string jobType, int attempt)
=> JobsRetried.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType),
new KeyValuePair<string, object?>("attempt", attempt));
public static void LeaseExtended(string tenantId, string jobType)
=> LeaseExtensions.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void LeaseExpired(string tenantId, string jobType)
=> LeaseExpirations.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void RecordJobDuration(string tenantId, string jobType, double durationSeconds)
=> JobDuration.Record(durationSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void RecordSchedulingLatency(string tenantId, string jobType, double latencySeconds)
=> SchedulingLatency.Record(latencySeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void ConnectionOpened(string role)
=> ActiveConnections.Add(1, new KeyValuePair<string, object?>("role", role));
public static void ConnectionClosed(string role)
=> ActiveConnections.Add(-1, new KeyValuePair<string, object?>("role", role));
public static void QueueDepthChanged(string tenantId, string jobType, long delta)
=> QueueDepth.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void ArtifactCreated(string tenantId, string artifactType)
=> ArtifactsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("artifact_type", artifactType));
public static void HeartbeatReceived(string tenantId, string jobType)
=> HeartbeatsReceived.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void ProgressReported(string tenantId, string jobType)
=> ProgressReports.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType));
public static void SourceCreated(string tenantId, string sourceType)
=> SourcesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("source_type", sourceType));
public static void SourcePaused(string tenantId)
=> SourcesPaused.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void SourceResumed(string tenantId)
=> SourcesResumed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void RunCreated(string tenantId, string runType)
=> RunsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("run_type", runType));
public static void RunCompleted(string tenantId, string runType, string status)
=> RunsCompleted.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("run_type", runType),
new KeyValuePair<string, object?>("status", status));
public static void QuotaCreated(string tenantId, string? jobType)
=> QuotasCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));
public static void QuotaPaused(string tenantId)
=> QuotasPaused.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void QuotaResumed(string tenantId)
=> QuotasResumed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void ThrottleCreated(string tenantId, string reason)
=> ThrottlesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("reason", reason));
public static void ThrottleDeactivated(string tenantId)
=> ThrottlesDeactivated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void RateLimitDenied(string tenantId, string? jobType, string reason)
=> RateLimitDenials.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"),
new KeyValuePair<string, object?>("reason", reason));
public static void BackpressureEvent(string tenantId, int statusCode, string reason)
=> BackpressureEvents.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("status_code", statusCode),
new KeyValuePair<string, object?>("reason", reason));
public static void RecordTokenBucketUtilization(string tenantId, string? jobType, double utilization)
=> TokenBucketUtilization.Record(utilization, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));
public static void RecordConcurrencyUtilization(string tenantId, string? jobType, double utilization)
=> ConcurrencyUtilization.Record(utilization, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));
// Watermark metrics
private static readonly Counter<long> WatermarksCreatedCounter = Meter.CreateCounter<long>(
"orchestrator.watermarks.created",
description: "Total watermarks created");
private static readonly Counter<long> WatermarksAdvanced = Meter.CreateCounter<long>(
"orchestrator.watermarks.advanced",
description: "Total watermark advancement operations");
private static readonly Histogram<double> WatermarkLag = Meter.CreateHistogram<double>(
"orchestrator.watermark.lag.seconds",
unit: "s",
description: "Watermark lag from current time");
// Backfill metrics
private static readonly Counter<long> BackfillsCreated = Meter.CreateCounter<long>(
"orchestrator.backfills.created",
description: "Total backfill requests created");
private static readonly Counter<long> BackfillStatusChanges = Meter.CreateCounter<long>(
"orchestrator.backfills.status_changes",
description: "Total backfill status changes");
private static readonly Counter<long> BackfillEventsProcessed = Meter.CreateCounter<long>(
"orchestrator.backfills.events_processed",
description: "Total events processed by backfills");
private static readonly Counter<long> BackfillEventsSkipped = Meter.CreateCounter<long>(
"orchestrator.backfills.events_skipped",
description: "Total events skipped by backfills (duplicates)");
private static readonly Histogram<double> BackfillDuration = Meter.CreateHistogram<double>(
"orchestrator.backfill.duration.seconds",
unit: "s",
description: "Backfill execution duration");
private static readonly Histogram<double> BackfillProgress = Meter.CreateHistogram<double>(
"orchestrator.backfill.progress",
unit: "percent",
description: "Backfill progress percentage");
// Duplicate suppression metrics
private static readonly Counter<long> ProcessedEventsMarkedCounter = Meter.CreateCounter<long>(
"orchestrator.processed_events.marked",
description: "Total processed events marked for duplicate suppression");
private static readonly Counter<long> ProcessedEventsCleanedUpCounter = Meter.CreateCounter<long>(
"orchestrator.processed_events.cleaned_up",
description: "Total expired processed events cleaned up");
private static readonly Counter<long> DuplicatesDetected = Meter.CreateCounter<long>(
"orchestrator.duplicates.detected",
description: "Total duplicate events detected");
public static void WatermarkCreated(string tenantId, string scopeKey)
=> WatermarksCreatedCounter.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void WatermarkAdvanced(string tenantId, string scopeKey)
=> WatermarksAdvanced.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void RecordWatermarkLag(string tenantId, string scopeKey, double lagSeconds)
=> WatermarkLag.Record(lagSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void BackfillCreated(string tenantId, string scopeKey)
=> BackfillsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void BackfillStatusChanged(string tenantId, string scopeKey, string status)
=> BackfillStatusChanges.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey),
new KeyValuePair<string, object?>("status", status));
public static void BackfillEventProcessed(string tenantId, string scopeKey, long count)
=> BackfillEventsProcessed.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void BackfillEventSkipped(string tenantId, string scopeKey, long count)
=> BackfillEventsSkipped.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void RecordBackfillDuration(string tenantId, string scopeKey, double durationSeconds)
=> BackfillDuration.Record(durationSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void RecordBackfillProgress(string tenantId, string scopeKey, double progressPercent)
=> BackfillProgress.Record(progressPercent, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void ProcessedEventsMarked(string tenantId, string scopeKey, long count)
=> ProcessedEventsMarkedCounter.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
public static void ProcessedEventsCleanedUp(string tenantId, long count)
=> ProcessedEventsCleanedUpCounter.Add(count, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void DuplicateDetected(string tenantId, string scopeKey)
=> DuplicatesDetected.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("scope_key", scopeKey));
// Dead-letter metrics
private static readonly Counter<long> DeadLetterEntriesCreated = Meter.CreateCounter<long>(
"orchestrator.deadletter.created",
description: "Total dead-letter entries created");
private static readonly Counter<long> DeadLetterStatusChanges = Meter.CreateCounter<long>(
"orchestrator.deadletter.status_changes",
description: "Total dead-letter status changes");
private static readonly Counter<long> DeadLetterReplayAttempts = Meter.CreateCounter<long>(
"orchestrator.deadletter.replay_attempts",
description: "Total dead-letter replay attempts");
private static readonly Counter<long> DeadLetterReplaySuccesses = Meter.CreateCounter<long>(
"orchestrator.deadletter.replay_successes",
description: "Total successful dead-letter replays");
private static readonly Counter<long> DeadLetterReplayFailures = Meter.CreateCounter<long>(
"orchestrator.deadletter.replay_failures",
description: "Total failed dead-letter replays");
private static readonly Counter<long> DeadLetterEntriesExpired = Meter.CreateCounter<long>(
"orchestrator.deadletter.expired",
description: "Total dead-letter entries marked as expired");
private static readonly Counter<long> DeadLetterEntriesPurged = Meter.CreateCounter<long>(
"orchestrator.deadletter.purged",
description: "Total dead-letter entries purged");
private static readonly Counter<long> DeadLetterNotificationsSent = Meter.CreateCounter<long>(
"orchestrator.deadletter.notifications_sent",
description: "Total dead-letter notifications sent");
private static readonly Counter<long> DeadLetterNotificationsFailed = Meter.CreateCounter<long>(
"orchestrator.deadletter.notifications_failed",
description: "Total failed dead-letter notifications");
private static readonly UpDownCounter<long> DeadLetterPendingCount = Meter.CreateUpDownCounter<long>(
"orchestrator.deadletter.pending",
description: "Current number of pending dead-letter entries");
public static void DeadLetterCreated(string tenantId, string jobType, string errorCode, string category)
=> DeadLetterEntriesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType),
new KeyValuePair<string, object?>("error_code", errorCode),
new KeyValuePair<string, object?>("category", category));
public static void DeadLetterStatusChanged(string tenantId, string jobType, string status)
=> DeadLetterStatusChanges.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("job_type", jobType),
new KeyValuePair<string, object?>("status", status));
public static void DeadLetterReplayAttempted(string tenantId, string triggeredBy)
=> DeadLetterReplayAttempts.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("triggered_by", triggeredBy));
public static void DeadLetterReplaySucceeded(string tenantId)
=> DeadLetterReplaySuccesses.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void DeadLetterReplayFailed(string tenantId)
=> DeadLetterReplayFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void DeadLetterExpired(int count)
=> DeadLetterEntriesExpired.Add(count);
public static void DeadLetterPurged(int count)
=> DeadLetterEntriesPurged.Add(count);
public static void DeadLetterNotificationSent(string tenantId, string channel, string eventType)
=> DeadLetterNotificationsSent.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("channel", channel),
new KeyValuePair<string, object?>("event_type", eventType));
public static void DeadLetterNotificationFailed(string tenantId, string channel, string eventType)
=> DeadLetterNotificationsFailed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("channel", channel),
new KeyValuePair<string, object?>("event_type", eventType));
public static void DeadLetterPendingChanged(string tenantId, long delta)
=> DeadLetterPendingCount.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId));
// SLO metrics
private static readonly Counter<long> SlosCreated = Meter.CreateCounter<long>(
"orchestrator.slos.created",
description: "Total SLOs created");
private static readonly Counter<long> SlosUpdated = Meter.CreateCounter<long>(
"orchestrator.slos.updated",
description: "Total SLO updates");
private static readonly Counter<long> SloAlertsTriggered = Meter.CreateCounter<long>(
"orchestrator.slo.alerts_triggered",
description: "Total SLO alerts triggered");
private static readonly Counter<long> SloAlertsAcknowledged = Meter.CreateCounter<long>(
"orchestrator.slo.alerts_acknowledged",
description: "Total SLO alerts acknowledged");
private static readonly Counter<long> SloAlertsResolved = Meter.CreateCounter<long>(
"orchestrator.slo.alerts_resolved",
description: "Total SLO alerts resolved");
private static readonly Histogram<double> SloBudgetConsumed = Meter.CreateHistogram<double>(
"orchestrator.slo.budget_consumed",
unit: "ratio",
description: "SLO error budget consumed (0-1)");
private static readonly Histogram<double> SloBurnRate = Meter.CreateHistogram<double>(
"orchestrator.slo.burn_rate",
unit: "ratio",
description: "SLO burn rate (1.0 = sustainable)");
private static readonly Histogram<double> SloCurrentSli = Meter.CreateHistogram<double>(
"orchestrator.slo.current_sli",
unit: "ratio",
description: "Current SLI value (0-1)");
private static readonly UpDownCounter<long> SloActiveAlerts = Meter.CreateUpDownCounter<long>(
"orchestrator.slo.active_alerts",
description: "Current number of active SLO alerts");
private static readonly Histogram<double> SloBudgetRemaining = Meter.CreateHistogram<double>(
"orchestrator.slo.budget_remaining",
unit: "ratio",
description: "SLO error budget remaining (0-1)");
private static readonly Histogram<double> SloTimeToExhaustion = Meter.CreateHistogram<double>(
"orchestrator.slo.time_to_exhaustion.seconds",
unit: "s",
description: "Estimated time until error budget exhaustion");
public static void SloCreated(string tenantId, string sloType, string? jobType)
=> SlosCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_type", sloType),
new KeyValuePair<string, object?>("job_type", jobType ?? "(all)"));
public static void SloUpdated(string tenantId, string sloName)
=> SlosUpdated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName));
public static void SloAlertTriggered(string tenantId, string sloName, string severity)
=> SloAlertsTriggered.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName),
new KeyValuePair<string, object?>("severity", severity));
public static void SloAlertAcknowledged(string tenantId, string sloName)
=> SloAlertsAcknowledged.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName));
public static void SloAlertResolved(string tenantId, string sloName)
=> SloAlertsResolved.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName));
public static void RecordSloBudgetConsumed(string tenantId, string sloName, string sloType, double consumed)
=> SloBudgetConsumed.Record(consumed, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName),
new KeyValuePair<string, object?>("slo_type", sloType));
public static void RecordSloBurnRate(string tenantId, string sloName, string sloType, double burnRate)
=> SloBurnRate.Record(burnRate, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName),
new KeyValuePair<string, object?>("slo_type", sloType));
public static void RecordSloCurrentSli(string tenantId, string sloName, string sloType, double sli)
=> SloCurrentSli.Record(sli, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName),
new KeyValuePair<string, object?>("slo_type", sloType));
public static void SloActiveAlertsChanged(string tenantId, long delta)
=> SloActiveAlerts.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void RecordSloBudgetRemaining(string tenantId, string sloName, string sloType, double remaining)
=> SloBudgetRemaining.Record(remaining, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName),
new KeyValuePair<string, object?>("slo_type", sloType));
public static void RecordSloTimeToExhaustion(string tenantId, string sloName, double seconds)
=> SloTimeToExhaustion.Record(seconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("slo_name", sloName));
// Audit log metrics
private static readonly Counter<long> AuditEntriesCreated = Meter.CreateCounter<long>(
"orchestrator.audit.entries_created",
description: "Total audit log entries created");
private static readonly Counter<long> AuditChainVerifications = Meter.CreateCounter<long>(
"orchestrator.audit.chain_verifications",
description: "Total audit chain verification operations");
private static readonly Counter<long> AuditChainFailures = Meter.CreateCounter<long>(
"orchestrator.audit.chain_failures",
description: "Total audit chain verification failures");
private static readonly UpDownCounter<long> AuditEntryCount = Meter.CreateUpDownCounter<long>(
"orchestrator.audit.entry_count",
description: "Current number of audit entries");
// Ledger metrics
private static readonly Counter<long> LedgerEntriesCreated = Meter.CreateCounter<long>(
"orchestrator.ledger.entries_created",
description: "Total ledger entries created");
private static readonly Counter<long> LedgerChainVerifications = Meter.CreateCounter<long>(
"orchestrator.ledger.chain_verifications",
description: "Total ledger chain verification operations");
private static readonly Counter<long> LedgerChainFailures = Meter.CreateCounter<long>(
"orchestrator.ledger.chain_failures",
description: "Total ledger chain verification failures");
private static readonly Counter<long> LedgerExportsRequested = Meter.CreateCounter<long>(
"orchestrator.ledger.exports_requested",
description: "Total ledger export requests");
private static readonly Counter<long> LedgerExportsCompleted = Meter.CreateCounter<long>(
"orchestrator.ledger.exports_completed",
description: "Total ledger exports completed successfully");
private static readonly Counter<long> LedgerExportsFailed = Meter.CreateCounter<long>(
"orchestrator.ledger.exports_failed",
description: "Total ledger exports that failed");
private static readonly Histogram<double> LedgerExportDuration = Meter.CreateHistogram<double>(
"orchestrator.ledger.export_duration.seconds",
unit: "s",
description: "Ledger export duration");
private static readonly Histogram<long> LedgerExportSize = Meter.CreateHistogram<long>(
"orchestrator.ledger.export_size.bytes",
unit: "bytes",
description: "Ledger export file size");
// Manifest metrics
private static readonly Counter<long> ManifestsCreated = Meter.CreateCounter<long>(
"orchestrator.manifests.created",
description: "Total signed manifests created");
private static readonly Counter<long> ManifestVerifications = Meter.CreateCounter<long>(
"orchestrator.manifests.verifications",
description: "Total manifest verification operations");
private static readonly Counter<long> ManifestVerificationFailures = Meter.CreateCounter<long>(
"orchestrator.manifests.verification_failures",
description: "Total manifest verification failures");
public static void AuditEntryCreated(string tenantId, string eventType, string resourceType)
=> AuditEntriesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("event_type", eventType),
new KeyValuePair<string, object?>("resource_type", resourceType));
public static void AuditChainVerified(string tenantId, bool success)
{
AuditChainVerifications.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
if (!success)
{
AuditChainFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
}
}
public static void AuditEntryCountChanged(string tenantId, long delta)
=> AuditEntryCount.Add(delta, new KeyValuePair<string, object?>("tenant_id", tenantId));
public static void LedgerEntryCreated(string tenantId, string runType, string finalStatus)
=> LedgerEntriesCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("run_type", runType),
new KeyValuePair<string, object?>("final_status", finalStatus));
public static void LedgerChainVerified(string tenantId, bool success)
{
LedgerChainVerifications.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
if (!success)
{
LedgerChainFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
}
}
public static void LedgerExportRequested(string tenantId, string format)
=> LedgerExportsRequested.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("format", format));
public static void LedgerExportCompleted(string tenantId, string format)
=> LedgerExportsCompleted.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("format", format));
public static void LedgerExportFailed(string tenantId, string format)
=> LedgerExportsFailed.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("format", format));
public static void RecordLedgerExportDuration(string tenantId, string format, double durationSeconds)
=> LedgerExportDuration.Record(durationSeconds, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("format", format));
public static void RecordLedgerExportSize(string tenantId, string format, long sizeBytes)
=> LedgerExportSize.Record(sizeBytes, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("format", format));
public static void ManifestCreated(string tenantId, string provenanceType)
=> ManifestsCreated.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId),
new KeyValuePair<string, object?>("provenance_type", provenanceType));
public static void ManifestVerified(string tenantId, bool success)
{
ManifestVerifications.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
if (!success)
{
ManifestVerificationFailures.Add(1, new KeyValuePair<string, object?>("tenant_id", tenantId));
}
}
}
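All of the above are plain System.Diagnostics.Metrics instruments on the StellaOps.Orchestrator meter, so they can be observed in-process (for tests or ad-hoc debugging) without an OpenTelemetry exporter. A sketch using MeterListener; the tenant and job type values are placeholders:

using System.Diagnostics.Metrics;
// Subscribe to long-valued instruments (the counters) on the orchestrator meter.
var listener = new MeterListener();
listener.InstrumentPublished = (instrument, l) =>
{
if (instrument.Meter.Name == "StellaOps.Orchestrator")
{
l.EnableMeasurementEvents(instrument);
}
};
listener.SetMeasurementEventCallback<long>((instrument, value, tags, state) =>
Console.WriteLine($"{instrument.Name} += {value}"));
listener.Start();
OrchestratorMetrics.JobEnqueued("tenant-a", "scan"); // prints: orchestrator.jobs.enqueued += 1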

View File

@@ -0,0 +1,130 @@
namespace StellaOps.Orchestrator.Infrastructure.Options;
/// <summary>
/// Configuration options for the Orchestrator service.
/// </summary>
public sealed class OrchestratorServiceOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "Orchestrator";
/// <summary>
/// HTTP header name for tenant identification.
/// </summary>
public string TenantHeader { get; set; } = "X-Tenant-Id";
/// <summary>
/// Database connection options.
/// </summary>
public DatabaseOptions Database { get; set; } = new();
/// <summary>
/// Lease management options.
/// </summary>
public LeaseOptions Lease { get; set; } = new();
/// <summary>
/// Rate-limiting options.
/// </summary>
public RateLimitOptions RateLimit { get; set; } = new();
/// <summary>
/// Database connection options.
/// </summary>
public sealed class DatabaseOptions
{
/// <summary>
/// PostgreSQL connection string.
/// </summary>
public string ConnectionString { get; set; } = string.Empty;
/// <summary>
/// Command timeout in seconds.
/// </summary>
public int CommandTimeoutSeconds { get; set; } = 30;
/// <summary>
/// Enable connection pooling.
/// </summary>
public bool EnablePooling { get; set; } = true;
/// <summary>
/// Minimum pool size.
/// </summary>
public int MinPoolSize { get; set; } = 1;
/// <summary>
/// Maximum pool size.
/// </summary>
public int MaxPoolSize { get; set; } = 100;
}
/// <summary>
/// Lease management options.
/// </summary>
public sealed class LeaseOptions
{
/// <summary>
/// Default lease duration in seconds.
/// </summary>
public int DefaultLeaseDurationSeconds { get; set; } = 300;
/// <summary>
/// Maximum lease duration in seconds.
/// </summary>
public int MaxLeaseDurationSeconds { get; set; } = 3600;
/// <summary>
/// Lease renewal threshold (renew when this fraction of lease remains).
/// </summary>
public double RenewalThreshold { get; set; } = 0.5;
/// <summary>
/// Interval for checking expired leases in seconds.
/// </summary>
public int ExpiryCheckIntervalSeconds { get; set; } = 30;
}
/// <summary>
/// Rate-limiting options.
/// </summary>
public sealed class RateLimitOptions
{
/// <summary>
/// Default maximum concurrent active jobs per tenant.
/// </summary>
public int DefaultMaxActive { get; set; } = 10;
/// <summary>
/// Default maximum jobs per hour per tenant.
/// </summary>
public int DefaultMaxPerHour { get; set; } = 1000;
/// <summary>
/// Default burst capacity for token bucket.
/// </summary>
public int DefaultBurstCapacity { get; set; } = 50;
/// <summary>
/// Default token refill rate (tokens per second).
/// </summary>
public double DefaultRefillRate { get; set; } = 1.0;
/// <summary>
/// Failure rate threshold for circuit breaker (0.0-1.0).
/// </summary>
public double CircuitBreakerThreshold { get; set; } = 0.5;
/// <summary>
/// Window size in minutes for failure rate calculation.
/// </summary>
public int CircuitBreakerWindowMinutes { get; set; } = 5;
/// <summary>
/// Minimum sample size before circuit breaker can trip.
/// </summary>
public int CircuitBreakerMinSamples { get; set; } = 10;
}
}
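These options follow the standard Microsoft.Extensions.Options pattern and bind from the "Orchestrator" configuration section. A registration sketch; builder here is an assumed WebApplicationBuilder (or any host builder exposing Services and Configuration):

// Bind the "Orchestrator" section, e.g. Orchestrator:Database:ConnectionString,
// Orchestrator:Lease:DefaultLeaseDurationSeconds, Orchestrator:RateLimit:DefaultMaxActive.
builder.Services.Configure<OrchestratorServiceOptions>(
builder.Configuration.GetSection(OrchestratorServiceOptions.SectionName));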

View File

@@ -0,0 +1,118 @@
using System.Data;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Npgsql;
using StellaOps.Orchestrator.Infrastructure.Options;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// Manages PostgreSQL connections for the Orchestrator service.
/// Configures session-level tenant context for row-level security.
/// </summary>
public sealed class OrchestratorDataSource : IAsyncDisposable
{
private readonly NpgsqlDataSource _dataSource;
private readonly OrchestratorServiceOptions.DatabaseOptions _options;
private readonly ILogger<OrchestratorDataSource> _logger;
public OrchestratorDataSource(
IOptions<OrchestratorServiceOptions> options,
ILogger<OrchestratorDataSource> logger)
{
ArgumentNullException.ThrowIfNull(options);
_options = options.Value.Database;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
// Overlay pooling settings from configuration onto the connection string so
// DatabaseOptions.EnablePooling/MinPoolSize/MaxPoolSize actually take effect.
var connectionStringBuilder = new NpgsqlConnectionStringBuilder(_options.ConnectionString)
{
Pooling = _options.EnablePooling,
MinPoolSize = _options.MinPoolSize,
MaxPoolSize = _options.MaxPoolSize
};
var builder = new NpgsqlDataSourceBuilder(connectionStringBuilder.ConnectionString);
_dataSource = builder.Build();
}
/// <summary>
/// Command timeout in seconds.
/// </summary>
public int CommandTimeoutSeconds => _options.CommandTimeoutSeconds;
/// <summary>
/// Disposes the data source and releases all connections.
/// </summary>
public async ValueTask DisposeAsync()
{
await _dataSource.DisposeAsync().ConfigureAwait(false);
}
/// <summary>
/// Opens a connection with tenant context configured.
/// </summary>
/// <param name="tenantId">Tenant identifier for session configuration.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Open PostgreSQL connection.</returns>
public Task<NpgsqlConnection> OpenConnectionAsync(string tenantId, CancellationToken cancellationToken)
=> OpenConnectionInternalAsync(tenantId, "unspecified", cancellationToken);
/// <summary>
/// Opens a connection with tenant context and role label configured.
/// </summary>
/// <param name="tenantId">Tenant identifier for session configuration.</param>
/// <param name="role">Role label for metrics/logging (e.g., "reader", "writer").</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Open PostgreSQL connection.</returns>
public Task<NpgsqlConnection> OpenConnectionAsync(string tenantId, string role, CancellationToken cancellationToken)
=> OpenConnectionInternalAsync(tenantId, role, cancellationToken);
private async Task<NpgsqlConnection> OpenConnectionInternalAsync(string tenantId, string role, CancellationToken cancellationToken)
{
var connection = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
try
{
await ConfigureSessionAsync(connection, tenantId, cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ConnectionOpened(role);
connection.StateChange += (_, args) =>
{
if (args.CurrentState == ConnectionState.Closed)
{
OrchestratorMetrics.ConnectionClosed(role);
}
};
}
catch
{
await connection.DisposeAsync().ConfigureAwait(false);
throw;
}
return connection;
}
private async Task ConfigureSessionAsync(NpgsqlConnection connection, string tenantId, CancellationToken cancellationToken)
{
try
{
// Set UTC timezone for deterministic timestamps
await using (var command = new NpgsqlCommand("SET TIME ZONE 'UTC';", connection))
{
command.CommandTimeout = _options.CommandTimeoutSeconds;
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
// Set tenant context for row-level security
if (!string.IsNullOrWhiteSpace(tenantId))
{
await using var tenantCommand = new NpgsqlCommand("SELECT set_config('app.current_tenant', @tenant, false);", connection);
tenantCommand.CommandTimeout = _options.CommandTimeoutSeconds;
tenantCommand.Parameters.AddWithValue("tenant", tenantId);
await tenantCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
}
catch (Exception ex)
{
if (_logger.IsEnabled(LogLevel.Error))
{
_logger.LogError(ex, "Failed to configure PostgreSQL session for tenant {TenantId}.", tenantId);
}
throw;
}
}
}
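Repositories consume the data source as below; every connection arrives with the UTC timezone and app.current_tenant already set, so row-level security policies can filter on it. A sketch — the jobs table and tenant value are placeholders:

// Sketch: open a tenant-scoped reader connection and run a query under RLS.
await using var connection = await dataSource
.OpenConnectionAsync("tenant-a", "reader", cancellationToken)
.ConfigureAwait(false);
await using var command = new NpgsqlCommand("SELECT count(*) FROM jobs", connection);
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
var jobCount = (long)result!; // count(*) returns bigint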

View File

@@ -0,0 +1,362 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of artifact repository.
/// </summary>
public sealed class PostgresArtifactRepository : IArtifactRepository
{
private const string SelectArtifactColumns = """
artifact_id, tenant_id, job_id, run_id, artifact_type, uri, digest,
mime_type, size_bytes, created_at, metadata
""";
private const string SelectByIdSql = $"""
SELECT {SelectArtifactColumns}
FROM artifacts
WHERE tenant_id = @tenant_id AND artifact_id = @artifact_id
""";
private const string SelectByJobIdSql = $"""
SELECT {SelectArtifactColumns}
FROM artifacts
WHERE tenant_id = @tenant_id AND job_id = @job_id
ORDER BY created_at
""";
private const string SelectByRunIdSql = $"""
SELECT {SelectArtifactColumns}
FROM artifacts
WHERE tenant_id = @tenant_id AND run_id = @run_id
ORDER BY created_at
""";
private const string SelectByDigestSql = $"""
SELECT {SelectArtifactColumns}
FROM artifacts
WHERE tenant_id = @tenant_id AND digest = @digest
""";
private const string InsertArtifactSql = """
INSERT INTO artifacts (
artifact_id, tenant_id, job_id, run_id, artifact_type, uri, digest,
mime_type, size_bytes, created_at, metadata)
VALUES (
@artifact_id, @tenant_id, @job_id, @run_id, @artifact_type, @uri, @digest,
@mime_type, @size_bytes, @created_at, @metadata)
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresArtifactRepository> _logger;
public PostgresArtifactRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresArtifactRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Artifact?> GetByIdAsync(string tenantId, Guid artifactId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("artifact_id", artifactId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapArtifact(reader);
}
public async Task<IReadOnlyList<Artifact>> GetByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByJobIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_id", jobId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var artifacts = new List<Artifact>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
artifacts.Add(MapArtifact(reader));
}
return artifacts;
}
public async Task<IReadOnlyList<Artifact>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByRunIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("run_id", runId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var artifacts = new List<Artifact>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
artifacts.Add(MapArtifact(reader));
}
return artifacts;
}
public async Task<Artifact?> GetByDigestAsync(string tenantId, string digest, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByDigestSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("digest", digest);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapArtifact(reader);
}
public async Task CreateAsync(Artifact artifact, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(artifact.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertArtifactSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddArtifactParameters(command, artifact);
try
{
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ArtifactCreated(artifact.TenantId, artifact.ArtifactType);
}
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
{
_logger.LogWarning("Duplicate artifact ID or digest: {ArtifactId}, {Digest}", artifact.ArtifactId, artifact.Digest);
throw new DuplicateArtifactException(artifact.ArtifactId, artifact.Digest, ex);
}
}
public async Task CreateBatchAsync(IEnumerable<Artifact> artifacts, CancellationToken cancellationToken)
{
var artifactList = artifacts.ToList();
if (artifactList.Count == 0)
{
return;
}
// Batches are assumed to be homogeneous per tenant; the first entry's tenant scopes the connection.
var tenantId = artifactList[0].TenantId;
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);
try
{
foreach (var artifact in artifactList)
{
await using var command = new NpgsqlCommand(InsertArtifactSql, connection, transaction);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddArtifactParameters(command, artifact);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ArtifactCreated(artifact.TenantId, artifact.ArtifactType);
}
await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);
}
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
{
await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
_logger.LogWarning(ex, "Duplicate artifact in batch insert");
throw;
}
catch
{
await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
throw;
}
}
public async Task<IReadOnlyList<Artifact>> ListAsync(
string tenantId,
string? artifactType,
string? jobType,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, artifactType, jobType, createdAfter, createdBefore, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var artifacts = new List<Artifact>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
artifacts.Add(MapArtifact(reader));
}
return artifacts;
}
public async Task<int> CountAsync(
string tenantId,
string? artifactType,
string? jobType,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildCountQuery(tenantId, artifactType, jobType);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt32(result);
}
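// Hedged pagination sketch (illustrative names, not part of the committed file): CountAsync shares
// the tenant/type filters with ListAsync, so a simple pager can be layered on top of both.
//   var total = await artifactRepository.CountAsync(tenantId, "sbom", jobType: null, ct);
//   for (var offset = 0; offset < total; offset += 100)
//   {
//       var page = await artifactRepository.ListAsync(tenantId, "sbom", null, null, null, 100, offset, ct);
//       // ... render page ...
//   }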
private static void AddArtifactParameters(NpgsqlCommand command, Artifact artifact)
{
command.Parameters.AddWithValue("artifact_id", artifact.ArtifactId);
command.Parameters.AddWithValue("tenant_id", artifact.TenantId);
command.Parameters.AddWithValue("job_id", artifact.JobId);
command.Parameters.AddWithValue("run_id", (object?)artifact.RunId ?? DBNull.Value);
command.Parameters.AddWithValue("artifact_type", artifact.ArtifactType);
command.Parameters.AddWithValue("uri", artifact.Uri);
command.Parameters.AddWithValue("digest", artifact.Digest);
command.Parameters.AddWithValue("mime_type", (object?)artifact.MimeType ?? DBNull.Value);
command.Parameters.AddWithValue("size_bytes", (object?)artifact.SizeBytes ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", artifact.CreatedAt);
command.Parameters.Add(new NpgsqlParameter("metadata", NpgsqlDbType.Jsonb)
{
Value = (object?)artifact.Metadata ?? DBNull.Value
});
}
private static Artifact MapArtifact(NpgsqlDataReader reader)
{
return new Artifact(
ArtifactId: reader.GetGuid(0),
TenantId: reader.GetString(1),
JobId: reader.GetGuid(2),
RunId: reader.IsDBNull(3) ? null : reader.GetGuid(3),
ArtifactType: reader.GetString(4),
Uri: reader.GetString(5),
Digest: reader.GetString(6),
MimeType: reader.IsDBNull(7) ? null : reader.GetString(7),
SizeBytes: reader.IsDBNull(8) ? null : reader.GetInt64(8),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(9),
Metadata: reader.IsDBNull(10) ? null : reader.GetString(10));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
string? artifactType,
string? jobType,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectArtifactColumns} FROM artifacts a WHERE a.tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (!string.IsNullOrEmpty(artifactType))
{
sb.Append(" AND a.artifact_type = @artifact_type");
parameters.Add(("artifact_type", artifactType));
}
if (!string.IsNullOrEmpty(jobType))
{
sb.Append(" AND EXISTS (SELECT 1 FROM jobs j WHERE j.job_id = a.job_id AND j.tenant_id = a.tenant_id AND j.job_type = @job_type)");
parameters.Add(("job_type", jobType));
}
if (createdAfter.HasValue)
{
sb.Append(" AND a.created_at >= @created_after");
parameters.Add(("created_after", createdAfter.Value));
}
if (createdBefore.HasValue)
{
sb.Append(" AND a.created_at < @created_before");
parameters.Add(("created_before", createdBefore.Value));
}
sb.Append(" ORDER BY a.created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
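// For illustration, artifactType = "sbom" plus jobType = "scan" yields the SQL below (values are
// always bound as parameters, never concatenated into the statement):
//   SELECT ... FROM artifacts a WHERE a.tenant_id = @tenant_id
//     AND a.artifact_type = @artifact_type
//     AND EXISTS (SELECT 1 FROM jobs j WHERE j.job_id = a.job_id AND j.tenant_id = a.tenant_id AND j.job_type = @job_type)
//   ORDER BY a.created_at DESC LIMIT @limit OFFSET @offset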
private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
string tenantId,
string? artifactType,
string? jobType)
{
var sb = new StringBuilder();
sb.Append("SELECT COUNT(*) FROM artifacts a WHERE a.tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (!string.IsNullOrEmpty(artifactType))
{
sb.Append(" AND a.artifact_type = @artifact_type");
parameters.Add(("artifact_type", artifactType));
}
if (!string.IsNullOrEmpty(jobType))
{
sb.Append(" AND EXISTS (SELECT 1 FROM jobs j WHERE j.job_id = a.job_id AND j.tenant_id = a.tenant_id AND j.job_type = @job_type)");
parameters.Add(("job_type", jobType));
}
return (sb.ToString(), parameters);
}
}
/// <summary>
/// Exception thrown when attempting to create a duplicate artifact.
/// </summary>
public sealed class DuplicateArtifactException : Exception
{
public Guid ArtifactId { get; }
public string Digest { get; }
public DuplicateArtifactException(Guid artifactId, string digest, Exception innerException)
: base($"Artifact with ID '{artifactId}' or digest '{digest}' already exists.", innerException)
{
ArtifactId = artifactId;
Digest = digest;
}
}
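// Hedged sketch (illustrative helper, not part of the repository contract): one idempotent-producer
// policy built on DuplicateArtifactException — treat an ID/digest collision as "already stored".
public static class ArtifactStoreExamples
{
    public static async Task<bool> TryStoreAsync(
        PostgresArtifactRepository repository,
        ILogger logger,
        Artifact artifact,
        CancellationToken cancellationToken)
    {
        try
        {
            await repository.CreateAsync(artifact, cancellationToken).ConfigureAwait(false);
            return true; // newly stored
        }
        catch (DuplicateArtifactException)
        {
            // Unique violation on artifact_id or digest: safe to skip for idempotent producers.
            logger.LogDebug("Artifact {Digest} already stored; skipping", artifact.Digest);
            return false;
        }
    }
}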

@@ -0,0 +1,504 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of the audit repository.
/// </summary>
public sealed class PostgresAuditRepository : IAuditRepository
{
private const string SelectAuditColumns = """
entry_id, tenant_id, event_type, resource_type, resource_id, actor_id, actor_type,
actor_ip, user_agent, http_method, request_path, old_state, new_state, description,
correlation_id, previous_entry_hash, content_hash, sequence_number, occurred_at, metadata
""";
private const string SelectByIdSql = $"""
SELECT {SelectAuditColumns}
FROM audit_entries
WHERE tenant_id = @tenant_id AND entry_id = @entry_id
""";
private const string InsertEntrySql = """
INSERT INTO audit_entries (
entry_id, tenant_id, event_type, resource_type, resource_id, actor_id, actor_type,
actor_ip, user_agent, http_method, request_path, old_state, new_state, description,
correlation_id, previous_entry_hash, content_hash, sequence_number, occurred_at, metadata)
VALUES (
@entry_id, @tenant_id, @event_type, @resource_type, @resource_id, @actor_id, @actor_type,
@actor_ip, @user_agent, @http_method, @request_path, @old_state::jsonb, @new_state::jsonb, @description,
@correlation_id, @previous_entry_hash, @content_hash, @sequence_number, @occurred_at, @metadata::jsonb)
""";
private const string SelectLatestSql = $"""
SELECT {SelectAuditColumns}
FROM audit_entries
WHERE tenant_id = @tenant_id
ORDER BY sequence_number DESC
LIMIT 1
""";
private const string GetSequenceSql = """
SELECT next_seq, prev_hash FROM next_audit_sequence(@tenant_id)
""";
private const string UpdateSequenceHashSql = """
SELECT update_audit_sequence_hash(@tenant_id, @content_hash)
""";
private const string VerifyChainSql = """
SELECT is_valid, invalid_entry_id, invalid_sequence, error_message
FROM verify_audit_chain(@tenant_id, @start_seq, @end_seq)
""";
private const string GetSummarySql = """
SELECT total_entries, entries_since, event_types, unique_actors, unique_resources, earliest_entry, latest_entry
FROM get_audit_summary(@tenant_id, @since)
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresAuditRepository> _logger;
public PostgresAuditRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresAuditRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<AuditEntry> AppendAsync(
string tenantId,
AuditEventType eventType,
string resourceType,
Guid resourceId,
string actorId,
ActorType actorType,
string description,
string? oldState = null,
string? newState = null,
string? actorIp = null,
string? userAgent = null,
string? httpMethod = null,
string? requestPath = null,
string? correlationId = null,
string? metadata = null,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);
try
{
// Get next sequence number and previous hash
long sequenceNumber;
string? previousEntryHash;
await using (var seqCommand = new NpgsqlCommand(GetSequenceSql, connection, transaction))
{
seqCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
seqCommand.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await seqCommand.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
throw new InvalidOperationException("Failed to get next audit sequence.");
}
sequenceNumber = reader.GetInt64(0);
previousEntryHash = reader.IsDBNull(1) ? null : reader.GetString(1);
}
// Create the entry
var entry = AuditEntry.Create(
tenantId: tenantId,
eventType: eventType,
resourceType: resourceType,
resourceId: resourceId,
actorId: actorId,
actorType: actorType,
description: description,
oldState: oldState,
newState: newState,
actorIp: actorIp,
userAgent: userAgent,
httpMethod: httpMethod,
requestPath: requestPath,
correlationId: correlationId,
previousEntryHash: previousEntryHash,
sequenceNumber: sequenceNumber,
metadata: metadata);
// Insert the entry
await using (var insertCommand = new NpgsqlCommand(InsertEntrySql, connection, transaction))
{
insertCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddEntryParameters(insertCommand, entry);
await insertCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
// Update sequence hash
await using (var updateCommand = new NpgsqlCommand(UpdateSequenceHashSql, connection, transaction))
{
updateCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
updateCommand.Parameters.AddWithValue("tenant_id", tenantId);
updateCommand.Parameters.AddWithValue("content_hash", entry.ContentHash);
await updateCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.AuditEntryCreated(tenantId, eventType.ToString(), resourceType);
_logger.LogDebug("Audit entry {EntryId} appended for tenant {TenantId}, sequence {Sequence}",
entry.EntryId, tenantId, sequenceNumber);
return entry;
}
catch
{
await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
throw;
}
}
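// Chain invariant, in brief: entry N persists content_hash(N) and previous_entry_hash = content_hash(N - 1),
// with next_audit_sequence / update_audit_sequence_hash advancing the per-tenant cursor inside the same
// transaction. Tampering with any historical row therefore breaks every later link:
//   entries[n].PreviousEntryHash == entries[n - 1].ContentHash   for all n after the first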
public async Task<AuditEntry?> GetByIdAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("entry_id", entryId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<IReadOnlyList<AuditEntry>> ListAsync(
string tenantId,
AuditEventType? eventType = null,
string? resourceType = null,
Guid? resourceId = null,
string? actorId = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default)
{
var (sql, parameters) = BuildListQuery(tenantId, eventType, resourceType, resourceId, actorId, startTime, endTime, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<AuditEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<IReadOnlyList<AuditEntry>> GetBySequenceRangeAsync(
string tenantId,
long startSequence,
long endSequence,
CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectAuditColumns}
FROM audit_entries
WHERE tenant_id = @tenant_id
AND sequence_number >= @start_seq
AND sequence_number <= @end_seq
ORDER BY sequence_number ASC
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("start_seq", startSequence);
command.Parameters.AddWithValue("end_seq", endSequence);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<AuditEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<AuditEntry?> GetLatestAsync(
string tenantId,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectLatestSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<IReadOnlyList<AuditEntry>> GetByResourceAsync(
string tenantId,
string resourceType,
Guid resourceId,
int limit = 100,
CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectAuditColumns}
FROM audit_entries
WHERE tenant_id = @tenant_id
AND resource_type = @resource_type
AND resource_id = @resource_id
ORDER BY occurred_at DESC
LIMIT @limit
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("resource_type", resourceType);
command.Parameters.AddWithValue("resource_id", resourceId);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<AuditEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<long> GetCountAsync(
string tenantId,
AuditEventType? eventType = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
CancellationToken cancellationToken = default)
{
var sb = new StringBuilder("SELECT COUNT(*) FROM audit_entries WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (eventType.HasValue)
{
sb.Append(" AND event_type = @event_type");
parameters.Add(("event_type", (int)eventType.Value));
}
if (startTime.HasValue)
{
sb.Append(" AND occurred_at >= @start_time");
parameters.Add(("start_time", startTime.Value));
}
if (endTime.HasValue)
{
sb.Append(" AND occurred_at <= @end_time");
parameters.Add(("end_time", endTime.Value));
}
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sb.ToString(), connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(result);
}
public async Task<ChainVerificationResult> VerifyChainAsync(
string tenantId,
long? startSequence = null,
long? endSequence = null,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(VerifyChainSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("start_seq", (object?)startSequence ?? 1L);
command.Parameters.AddWithValue("end_seq", (object?)endSequence ?? DBNull.Value);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return new ChainVerificationResult(true, null, null, null);
}
return new ChainVerificationResult(
IsValid: reader.GetBoolean(0),
InvalidEntryId: reader.IsDBNull(1) ? null : reader.GetGuid(1),
InvalidSequence: reader.IsDBNull(2) ? null : reader.GetInt64(2),
ErrorMessage: reader.IsDBNull(3) ? null : reader.GetString(3));
}
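// Hedged usage sketch (illustrative names): spot-check only the most recent segment of the chain.
//   var latest = await auditRepository.GetLatestAsync(tenantId, ct);
//   if (latest is not null)
//   {
//       var from = Math.Max(1, latest.SequenceNumber - 999);
//       var result = await auditRepository.VerifyChainAsync(tenantId, from, latest.SequenceNumber, ct);
//       // result.IsValid == false pinpoints InvalidSequence/InvalidEntryId for investigation.
//   }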
public async Task<AuditSummary> GetSummaryAsync(
string tenantId,
DateTimeOffset? since = null,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(GetSummarySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("since", (object?)since ?? DBNull.Value);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return new AuditSummary(0, 0, 0, 0, 0, null, null);
}
return new AuditSummary(
TotalEntries: reader.GetInt64(0),
EntriesSince: reader.GetInt64(1),
EventTypes: reader.GetInt64(2),
UniqueActors: reader.GetInt64(3),
UniqueResources: reader.GetInt64(4),
EarliestEntry: reader.IsDBNull(5) ? null : reader.GetFieldValue<DateTimeOffset>(5),
LatestEntry: reader.IsDBNull(6) ? null : reader.GetFieldValue<DateTimeOffset>(6));
}
private static void AddEntryParameters(NpgsqlCommand command, AuditEntry entry)
{
command.Parameters.AddWithValue("entry_id", entry.EntryId);
command.Parameters.AddWithValue("tenant_id", entry.TenantId);
command.Parameters.AddWithValue("event_type", (int)entry.EventType);
command.Parameters.AddWithValue("resource_type", entry.ResourceType);
command.Parameters.AddWithValue("resource_id", entry.ResourceId);
command.Parameters.AddWithValue("actor_id", entry.ActorId);
command.Parameters.AddWithValue("actor_type", (int)entry.ActorType);
command.Parameters.AddWithValue("actor_ip", (object?)entry.ActorIp ?? DBNull.Value);
command.Parameters.AddWithValue("user_agent", (object?)entry.UserAgent ?? DBNull.Value);
command.Parameters.AddWithValue("http_method", (object?)entry.HttpMethod ?? DBNull.Value);
command.Parameters.AddWithValue("request_path", (object?)entry.RequestPath ?? DBNull.Value);
command.Parameters.AddWithValue("old_state", (object?)entry.OldState ?? DBNull.Value);
command.Parameters.AddWithValue("new_state", (object?)entry.NewState ?? DBNull.Value);
command.Parameters.AddWithValue("description", entry.Description);
command.Parameters.AddWithValue("correlation_id", (object?)entry.CorrelationId ?? DBNull.Value);
command.Parameters.AddWithValue("previous_entry_hash", (object?)entry.PreviousEntryHash ?? DBNull.Value);
command.Parameters.AddWithValue("content_hash", entry.ContentHash);
command.Parameters.AddWithValue("sequence_number", entry.SequenceNumber);
command.Parameters.AddWithValue("occurred_at", entry.OccurredAt);
command.Parameters.AddWithValue("metadata", (object?)entry.Metadata ?? DBNull.Value);
}
private static AuditEntry MapEntry(NpgsqlDataReader reader)
{
return new AuditEntry(
EntryId: reader.GetGuid(0),
TenantId: reader.GetString(1),
EventType: (AuditEventType)reader.GetInt32(2),
ResourceType: reader.GetString(3),
ResourceId: reader.GetGuid(4),
ActorId: reader.GetString(5),
ActorType: (ActorType)reader.GetInt32(6),
ActorIp: reader.IsDBNull(7) ? null : reader.GetString(7),
UserAgent: reader.IsDBNull(8) ? null : reader.GetString(8),
HttpMethod: reader.IsDBNull(9) ? null : reader.GetString(9),
RequestPath: reader.IsDBNull(10) ? null : reader.GetString(10),
OldState: reader.IsDBNull(11) ? null : reader.GetString(11),
NewState: reader.IsDBNull(12) ? null : reader.GetString(12),
Description: reader.GetString(13),
CorrelationId: reader.IsDBNull(14) ? null : reader.GetString(14),
PreviousEntryHash: reader.IsDBNull(15) ? null : reader.GetString(15),
ContentHash: reader.GetString(16),
SequenceNumber: reader.GetInt64(17),
OccurredAt: reader.GetFieldValue<DateTimeOffset>(18),
Metadata: reader.IsDBNull(19) ? null : reader.GetString(19));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
AuditEventType? eventType,
string? resourceType,
Guid? resourceId,
string? actorId,
DateTimeOffset? startTime,
DateTimeOffset? endTime,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectAuditColumns} FROM audit_entries WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (eventType.HasValue)
{
sb.Append(" AND event_type = @event_type");
parameters.Add(("event_type", (int)eventType.Value));
}
if (resourceType is not null)
{
sb.Append(" AND resource_type = @resource_type");
parameters.Add(("resource_type", resourceType));
}
if (resourceId.HasValue)
{
sb.Append(" AND resource_id = @resource_id");
parameters.Add(("resource_id", resourceId.Value));
}
if (actorId is not null)
{
sb.Append(" AND actor_id = @actor_id");
parameters.Add(("actor_id", actorId));
}
if (startTime.HasValue)
{
sb.Append(" AND occurred_at >= @start_time");
parameters.Add(("start_time", startTime.Value));
}
if (endTime.HasValue)
{
sb.Append(" AND occurred_at <= @end_time");
parameters.Add(("end_time", endTime.Value));
}
sb.Append(" ORDER BY occurred_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
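// Hedged sketch (illustrative, not part of the committed file): a periodic integrity sweep built
// only on the repository surface above. The tenant list is a hypothetical input — typically it
// would come from a tenant registry or configuration.
public static class AuditChainSweepExample
{
    public static async Task VerifyAllAsync(
        IAuditRepository audit,
        ILogger logger,
        IReadOnlyList<string> tenantIds, // hypothetical: supplied by the caller
        CancellationToken cancellationToken)
    {
        foreach (var tenantId in tenantIds)
        {
            var result = await audit.VerifyChainAsync(tenantId, cancellationToken: cancellationToken).ConfigureAwait(false);
            if (!result.IsValid)
            {
                logger.LogError(
                    "Audit chain invalid for tenant {TenantId} at sequence {Sequence} (entry {EntryId}): {Error}",
                    tenantId, result.InvalidSequence, result.InvalidEntryId, result.ErrorMessage);
            }
        }
    }
}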

@@ -0,0 +1,395 @@
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of backfill request repository.
/// </summary>
public sealed class PostgresBackfillRepository : IBackfillRepository
{
private const string SelectBackfillColumns = """
backfill_id, tenant_id, source_id, job_type, scope_key, status,
window_start, window_end, current_position, total_events,
processed_events, skipped_events, failed_events, batch_size,
dry_run, force_reprocess, estimated_duration, max_duration,
safety_checks, reason, ticket, created_at, started_at, completed_at,
created_by, updated_by, error_message
""";
private const string SelectByIdSql = $"""
SELECT {SelectBackfillColumns}
FROM backfill_requests
WHERE tenant_id = @tenant_id AND backfill_id = @backfill_id
""";
private const string InsertBackfillSql = """
INSERT INTO backfill_requests (
backfill_id, tenant_id, source_id, job_type, scope_key, status,
window_start, window_end, current_position, total_events,
processed_events, skipped_events, failed_events, batch_size,
dry_run, force_reprocess, estimated_duration, max_duration,
safety_checks, reason, ticket, created_at, started_at, completed_at,
created_by, updated_by, error_message)
VALUES (
@backfill_id, @tenant_id, @source_id, @job_type, @scope_key, @status,
@window_start, @window_end, @current_position, @total_events,
@processed_events, @skipped_events, @failed_events, @batch_size,
@dry_run, @force_reprocess, @estimated_duration, @max_duration,
@safety_checks::jsonb, @reason, @ticket, @created_at, @started_at, @completed_at,
@created_by, @updated_by, @error_message)
""";
private const string UpdateBackfillSql = """
UPDATE backfill_requests
SET status = @status,
current_position = @current_position,
total_events = @total_events,
processed_events = @processed_events,
skipped_events = @skipped_events,
failed_events = @failed_events,
estimated_duration = @estimated_duration,
safety_checks = @safety_checks::jsonb,
started_at = @started_at,
completed_at = @completed_at,
updated_by = @updated_by,
error_message = @error_message
WHERE tenant_id = @tenant_id AND backfill_id = @backfill_id
""";
private const string SelectOverlappingSql = """
SELECT COUNT(*) FROM backfill_requests
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND status IN ('pending', 'validating', 'running', 'paused')
AND window_start < @window_end
AND window_end > @window_start
AND (@exclude_backfill_id IS NULL OR backfill_id != @exclude_backfill_id)
""";
private const string SelectActiveByScopeSql = $"""
SELECT {SelectBackfillColumns}
FROM backfill_requests
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND status IN ('pending', 'validating', 'running', 'paused')
ORDER BY created_at DESC
""";
private const string CountByStatusSql = """
SELECT status, COUNT(*) as count
FROM backfill_requests
WHERE tenant_id = @tenant_id
GROUP BY status
""";
private const string SelectNextPendingSql = $"""
SELECT {SelectBackfillColumns}
FROM backfill_requests
WHERE tenant_id = @tenant_id
AND status = 'pending'
ORDER BY created_at ASC
LIMIT 1
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresBackfillRepository> _logger;
private static readonly JsonSerializerOptions JsonOptions = new() { PropertyNamingPolicy = JsonNamingPolicy.CamelCase };
public PostgresBackfillRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresBackfillRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<BackfillRequest?> GetByIdAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("backfill_id", backfillId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapBackfillRequest(reader);
}
public async Task CreateAsync(BackfillRequest request, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(request.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertBackfillSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddBackfillParameters(command, request);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.BackfillCreated(request.TenantId, request.ScopeKey);
}
public async Task UpdateAsync(BackfillRequest request, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(request.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateBackfillSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", request.TenantId);
command.Parameters.AddWithValue("backfill_id", request.BackfillId);
command.Parameters.AddWithValue("status", request.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("current_position", (object?)request.CurrentPosition ?? DBNull.Value);
command.Parameters.AddWithValue("total_events", (object?)request.TotalEvents ?? DBNull.Value);
command.Parameters.AddWithValue("processed_events", request.ProcessedEvents);
command.Parameters.AddWithValue("skipped_events", request.SkippedEvents);
command.Parameters.AddWithValue("failed_events", request.FailedEvents);
command.Parameters.AddWithValue("estimated_duration", (object?)request.EstimatedDuration ?? DBNull.Value);
command.Parameters.AddWithValue("safety_checks", request.SafetyChecks is not null
? JsonSerializer.Serialize(request.SafetyChecks, JsonOptions)
: DBNull.Value);
command.Parameters.AddWithValue("started_at", (object?)request.StartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)request.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("updated_by", request.UpdatedBy);
command.Parameters.AddWithValue("error_message", (object?)request.ErrorMessage ?? DBNull.Value);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows == 0)
{
_logger.LogWarning("Backfill request not found for update: {BackfillId}", request.BackfillId);
}
else
{
OrchestratorMetrics.BackfillStatusChanged(request.TenantId, request.ScopeKey, request.Status.ToString());
}
}
public async Task<IReadOnlyList<BackfillRequest>> ListAsync(
string tenantId,
BackfillStatus? status,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, status, sourceId, jobType, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var requests = new List<BackfillRequest>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
requests.Add(MapBackfillRequest(reader));
}
return requests;
}
public async Task<bool> HasOverlappingActiveAsync(
string tenantId,
string scopeKey,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
Guid? excludeBackfillId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectOverlappingSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("window_start", windowStart);
command.Parameters.AddWithValue("window_end", windowEnd);
command.Parameters.AddWithValue("exclude_backfill_id", (object?)excludeBackfillId ?? DBNull.Value);
var count = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(count) > 0;
}
public async Task<IReadOnlyList<BackfillRequest>> GetActiveByScope(
string tenantId,
string scopeKey,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectActiveByScopeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var requests = new List<BackfillRequest>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
requests.Add(MapBackfillRequest(reader));
}
return requests;
}
public async Task<IDictionary<BackfillStatus, int>> CountByStatusAsync(
string tenantId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(CountByStatusSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var counts = new Dictionary<BackfillStatus, int>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
var statusStr = reader.GetString(0);
var count = reader.GetInt32(1);
if (Enum.TryParse<BackfillStatus>(statusStr, true, out var status))
{
counts[status] = count;
}
}
return counts;
}
public async Task<BackfillRequest?> GetNextPendingAsync(string tenantId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectNextPendingSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapBackfillRequest(reader);
}
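// Hedged note: SelectNextPendingSql carries no FOR UPDATE SKIP LOCKED, so this read assumes a single
// dispatcher polls per tenant; with concurrent pollers, two callers could observe the same pending
// request and must be serialized by the status transition applied through UpdateAsync.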
private static void AddBackfillParameters(NpgsqlCommand command, BackfillRequest request)
{
command.Parameters.AddWithValue("backfill_id", request.BackfillId);
command.Parameters.AddWithValue("tenant_id", request.TenantId);
command.Parameters.AddWithValue("source_id", (object?)request.SourceId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", (object?)request.JobType ?? DBNull.Value);
command.Parameters.AddWithValue("scope_key", request.ScopeKey);
command.Parameters.AddWithValue("status", request.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("window_start", request.WindowStart);
command.Parameters.AddWithValue("window_end", request.WindowEnd);
command.Parameters.AddWithValue("current_position", (object?)request.CurrentPosition ?? DBNull.Value);
command.Parameters.AddWithValue("total_events", (object?)request.TotalEvents ?? DBNull.Value);
command.Parameters.AddWithValue("processed_events", request.ProcessedEvents);
command.Parameters.AddWithValue("skipped_events", request.SkippedEvents);
command.Parameters.AddWithValue("failed_events", request.FailedEvents);
command.Parameters.AddWithValue("batch_size", request.BatchSize);
command.Parameters.AddWithValue("dry_run", request.DryRun);
command.Parameters.AddWithValue("force_reprocess", request.ForceReprocess);
command.Parameters.AddWithValue("estimated_duration", (object?)request.EstimatedDuration ?? DBNull.Value);
command.Parameters.AddWithValue("max_duration", (object?)request.MaxDuration ?? DBNull.Value);
command.Parameters.AddWithValue("safety_checks", request.SafetyChecks is not null
? JsonSerializer.Serialize(request.SafetyChecks, JsonOptions)
: DBNull.Value);
command.Parameters.AddWithValue("reason", request.Reason);
command.Parameters.AddWithValue("ticket", (object?)request.Ticket ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", request.CreatedAt);
command.Parameters.AddWithValue("started_at", (object?)request.StartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)request.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", request.CreatedBy);
command.Parameters.AddWithValue("updated_by", request.UpdatedBy);
command.Parameters.AddWithValue("error_message", (object?)request.ErrorMessage ?? DBNull.Value);
}
private static BackfillRequest MapBackfillRequest(NpgsqlDataReader reader)
{
var safetyChecksJson = reader.IsDBNull(18) ? null : reader.GetString(18);
var safetyChecks = safetyChecksJson is not null
? JsonSerializer.Deserialize<BackfillSafetyChecks>(safetyChecksJson, JsonOptions)
: null;
return new BackfillRequest(
BackfillId: reader.GetGuid(0),
TenantId: reader.GetString(1),
SourceId: reader.IsDBNull(2) ? null : reader.GetGuid(2),
JobType: reader.IsDBNull(3) ? null : reader.GetString(3),
ScopeKey: reader.GetString(4),
Status: Enum.Parse<BackfillStatus>(reader.GetString(5), ignoreCase: true),
WindowStart: reader.GetFieldValue<DateTimeOffset>(6),
WindowEnd: reader.GetFieldValue<DateTimeOffset>(7),
CurrentPosition: reader.IsDBNull(8) ? null : reader.GetFieldValue<DateTimeOffset>(8),
TotalEvents: reader.IsDBNull(9) ? null : reader.GetInt64(9),
ProcessedEvents: reader.GetInt64(10),
SkippedEvents: reader.GetInt64(11),
FailedEvents: reader.GetInt64(12),
BatchSize: reader.GetInt32(13),
DryRun: reader.GetBoolean(14),
ForceReprocess: reader.GetBoolean(15),
EstimatedDuration: reader.IsDBNull(16) ? null : reader.GetFieldValue<TimeSpan>(16),
MaxDuration: reader.IsDBNull(17) ? null : reader.GetFieldValue<TimeSpan>(17),
SafetyChecks: safetyChecks,
Reason: reader.GetString(19),
Ticket: reader.IsDBNull(20) ? null : reader.GetString(20),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(21),
StartedAt: reader.IsDBNull(22) ? null : reader.GetFieldValue<DateTimeOffset>(22),
CompletedAt: reader.IsDBNull(23) ? null : reader.GetFieldValue<DateTimeOffset>(23),
CreatedBy: reader.GetString(24),
UpdatedBy: reader.GetString(25),
ErrorMessage: reader.IsDBNull(26) ? null : reader.GetString(26));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
BackfillStatus? status,
Guid? sourceId,
string? jobType,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectBackfillColumns} FROM backfill_requests WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (status.HasValue)
{
sb.Append(" AND status = @status");
parameters.Add(("status", status.Value.ToString().ToLowerInvariant()));
}
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (jobType is not null)
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}
sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
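// Hedged sketch (illustrative console dump, not part of the committed file): an operator-facing
// status summary built only on the read methods above. BackfillStatus.Running mirrors the
// lowercase 'running' stored in the status column.
public static class BackfillStatusExample
{
    public static async Task PrintAsync(
        PostgresBackfillRepository repository,
        string tenantId,
        CancellationToken cancellationToken)
    {
        var byStatus = await repository.CountByStatusAsync(tenantId, cancellationToken).ConfigureAwait(false);
        foreach (var (status, count) in byStatus)
        {
            Console.WriteLine($"{status}: {count}");
        }

        // First page of running backfills, newest first (ListAsync orders by created_at DESC).
        var running = await repository.ListAsync(
            tenantId, BackfillStatus.Running, sourceId: null, jobType: null, limit: 20, offset: 0, cancellationToken).ConfigureAwait(false);
        foreach (var request in running)
        {
            Console.WriteLine(
                $"{request.BackfillId}: {request.ProcessedEvents}/{request.TotalEvents?.ToString() ?? "?"} events, window {request.WindowStart:O}..{request.WindowEnd:O}");
        }
    }
}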

@@ -0,0 +1,678 @@
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.DeadLetter;
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of dead-letter entry repository.
/// </summary>
public sealed class PostgresDeadLetterRepository : IDeadLetterRepository
{
private const string SelectEntryColumns = """
entry_id, tenant_id, original_job_id, run_id, source_id, job_type,
payload, payload_digest, idempotency_key, correlation_id,
status, error_code, failure_reason, remediation_hint, category, is_retryable,
original_attempts, replay_attempts, max_replay_attempts,
failed_at, created_at, updated_at, expires_at, resolved_at,
resolution_notes, created_by, updated_by
""";
private const string SelectByIdSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND entry_id = @entry_id
""";
private const string SelectByJobIdSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND original_job_id = @original_job_id
ORDER BY created_at DESC
LIMIT 1
""";
private const string InsertEntrySql = """
INSERT INTO dead_letter_entries (
entry_id, tenant_id, original_job_id, run_id, source_id, job_type,
payload, payload_digest, idempotency_key, correlation_id,
status, error_code, failure_reason, remediation_hint, category, is_retryable,
original_attempts, replay_attempts, max_replay_attempts,
failed_at, created_at, updated_at, expires_at, resolved_at,
resolution_notes, created_by, updated_by)
VALUES (
@entry_id, @tenant_id, @original_job_id, @run_id, @source_id, @job_type,
@payload::jsonb, @payload_digest, @idempotency_key, @correlation_id,
@status, @error_code, @failure_reason, @remediation_hint, @category, @is_retryable,
@original_attempts, @replay_attempts, @max_replay_attempts,
@failed_at, @created_at, @updated_at, @expires_at, @resolved_at,
@resolution_notes, @created_by, @updated_by)
""";
private const string UpdateEntrySql = """
UPDATE dead_letter_entries
SET status = @status,
replay_attempts = @replay_attempts,
failure_reason = @failure_reason,
updated_at = @updated_at,
resolved_at = @resolved_at,
resolution_notes = @resolution_notes,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND entry_id = @entry_id
""";
private const string SelectPendingRetryableSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
AND status = 'pending'
AND is_retryable = TRUE
AND replay_attempts < max_replay_attempts
ORDER BY created_at ASC
LIMIT @limit
""";
private const string SelectByErrorCodeSql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
AND error_code = @error_code
AND (@status IS NULL OR status = @status)
ORDER BY created_at DESC
LIMIT @limit
""";
private const string SelectByCategorySql = $"""
SELECT {SelectEntryColumns}
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
AND category = @category
AND (@status IS NULL OR status = @status)
ORDER BY created_at DESC
LIMIT @limit
""";
private const string MarkExpiredSql = """
SELECT mark_expired_dead_letter_entries(@batch_limit)
""";
private const string PurgeSql = """
SELECT purge_dead_letter_entries(@retention_days, @batch_limit)
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresDeadLetterRepository> _logger;
private static readonly JsonSerializerOptions JsonOptions = new() { PropertyNamingPolicy = JsonNamingPolicy.CamelCase };
public PostgresDeadLetterRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresDeadLetterRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<DeadLetterEntry?> GetByIdAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("entry_id", entryId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<DeadLetterEntry?> GetByOriginalJobIdAsync(
string tenantId,
Guid originalJobId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByJobIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("original_job_id", originalJobId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<IReadOnlyList<DeadLetterEntry>> ListAsync(
string tenantId,
DeadLetterListOptions options,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, options);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<long> CountAsync(
string tenantId,
DeadLetterListOptions options,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildCountQuery(tenantId, options);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(result);
}
public async Task CreateAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(entry.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertEntrySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddEntryParameters(command, entry);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.DeadLetterCreated(entry.TenantId, entry.JobType, entry.ErrorCode, entry.Category.ToString());
}
public async Task<bool> UpdateAsync(
DeadLetterEntry entry,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(entry.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateEntrySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", entry.TenantId);
command.Parameters.AddWithValue("entry_id", entry.EntryId);
command.Parameters.AddWithValue("status", entry.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("replay_attempts", entry.ReplayAttempts);
command.Parameters.AddWithValue("failure_reason", entry.FailureReason);
command.Parameters.AddWithValue("updated_at", entry.UpdatedAt);
command.Parameters.AddWithValue("resolved_at", (object?)entry.ResolvedAt ?? DBNull.Value);
command.Parameters.AddWithValue("resolution_notes", (object?)entry.ResolutionNotes ?? DBNull.Value);
command.Parameters.AddWithValue("updated_by", entry.UpdatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.DeadLetterStatusChanged(entry.TenantId, entry.JobType, entry.Status.ToString());
}
return rows > 0;
}
public async Task<IReadOnlyList<DeadLetterEntry>> GetPendingRetryableAsync(
string tenantId,
int limit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectPendingRetryableSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<IReadOnlyList<DeadLetterEntry>> GetByErrorCodeAsync(
string tenantId,
string errorCode,
DeadLetterStatus? status,
int limit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByErrorCodeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("error_code", errorCode);
command.Parameters.AddWithValue("status", status.HasValue ? status.Value.ToString().ToLowerInvariant() : DBNull.Value);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<IReadOnlyList<DeadLetterEntry>> GetByCategoryAsync(
string tenantId,
ErrorCategory category,
DeadLetterStatus? status,
int limit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByCategorySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("category", category.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("status", status.HasValue ? status.Value.ToString().ToLowerInvariant() : DBNull.Value);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<DeadLetterEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<DeadLetterStats> GetStatsAsync(
string tenantId,
CancellationToken cancellationToken)
{
const string statsSql = """
SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE status = 'pending') AS pending,
COUNT(*) FILTER (WHERE status = 'replaying') AS replaying,
COUNT(*) FILTER (WHERE status = 'replayed') AS replayed,
COUNT(*) FILTER (WHERE status = 'resolved') AS resolved,
COUNT(*) FILTER (WHERE status = 'exhausted') AS exhausted,
COUNT(*) FILTER (WHERE status = 'expired') AS expired,
COUNT(*) FILTER (WHERE is_retryable = TRUE AND status = 'pending') AS retryable
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
""";
const string byCategorySql = """
SELECT category, COUNT(*) as cnt
FROM dead_letter_entries
WHERE tenant_id = @tenant_id
GROUP BY category
""";
const string topErrorCodesSql = """
SELECT error_code, COUNT(*) as cnt
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND status = 'pending'
GROUP BY error_code
ORDER BY cnt DESC
LIMIT 10
""";
const string topJobTypesSql = """
SELECT job_type, COUNT(*) as cnt
FROM dead_letter_entries
WHERE tenant_id = @tenant_id AND status = 'pending'
GROUP BY job_type
ORDER BY cnt DESC
LIMIT 10
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
// Get counts
long total = 0, pending = 0, replaying = 0, replayed = 0, resolved = 0, exhausted = 0, expired = 0, retryable = 0;
await using (var command = new NpgsqlCommand(statsSql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
total = reader.GetInt64(0);
pending = reader.GetInt64(1);
replaying = reader.GetInt64(2);
replayed = reader.GetInt64(3);
resolved = reader.GetInt64(4);
exhausted = reader.GetInt64(5);
expired = reader.GetInt64(6);
retryable = reader.GetInt64(7);
}
}
// Get by category
var byCategory = new Dictionary<ErrorCategory, long>();
await using (var command = new NpgsqlCommand(byCategorySql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
if (Enum.TryParse<ErrorCategory>(reader.GetString(0), true, out var cat))
{
byCategory[cat] = reader.GetInt64(1);
}
}
}
// Get top error codes
var topErrorCodes = new Dictionary<string, long>();
await using (var command = new NpgsqlCommand(topErrorCodesSql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
topErrorCodes[reader.GetString(0)] = reader.GetInt64(1);
}
}
// Get top job types
var topJobTypes = new Dictionary<string, long>();
await using (var command = new NpgsqlCommand(topJobTypesSql, connection))
{
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
topJobTypes[reader.GetString(0)] = reader.GetInt64(1);
}
}
return new DeadLetterStats(
TotalEntries: total,
PendingEntries: pending,
ReplayingEntries: replaying,
ReplayedEntries: replayed,
ResolvedEntries: resolved,
ExhaustedEntries: exhausted,
ExpiredEntries: expired,
RetryableEntries: retryable,
ByCategory: byCategory,
TopErrorCodes: topErrorCodes,
TopJobTypes: topJobTypes);
}
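// Hedged usage sketch (illustrative names): surface the pending backlog and its biggest offenders.
//   var stats = await deadLetterRepository.GetStatsAsync(tenantId, ct);
//   logger.LogInformation("DLQ: {Pending} pending / {Total} total ({Retryable} retryable)",
//       stats.PendingEntries, stats.TotalEntries, stats.RetryableEntries);
//   foreach (var (code, count) in stats.TopErrorCodes)
//   {
//       logger.LogInformation("  {Code}: {Count}", code, count);
//   }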
public async Task<IReadOnlyList<DeadLetterSummary>> GetActionableSummaryAsync(
string tenantId,
int limit,
CancellationToken cancellationToken)
{
const string sql = """
SELECT error_code, category, entry_count, retryable_count, oldest_entry, sample_reason
FROM get_actionable_dead_letter_summary(@tenant_id, @limit)
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var summaries = new List<DeadLetterSummary>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
var categoryStr = reader.GetString(1);
var category = Enum.TryParse<ErrorCategory>(categoryStr, true, out var cat) ? cat : ErrorCategory.Unknown;
summaries.Add(new DeadLetterSummary(
ErrorCode: reader.GetString(0),
Category: category,
EntryCount: reader.GetInt64(2),
RetryableCount: reader.GetInt64(3),
OldestEntry: reader.GetFieldValue<DateTimeOffset>(4),
SampleReason: reader.IsDBNull(5) ? null : reader.GetString(5)));
}
return summaries;
}
public async Task<int> MarkExpiredAsync(
int batchLimit,
CancellationToken cancellationToken)
{
// Use a system-level connection (no tenant context needed for maintenance)
await using var connection = await _dataSource.OpenConnectionAsync("system", "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(MarkExpiredSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("batch_limit", batchLimit);
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
var marked = Convert.ToInt32(result);
if (marked > 0)
{
OrchestratorMetrics.DeadLetterExpired(marked);
_logger.LogInformation("Marked {Count} dead-letter entries as expired", marked);
}
return marked;
}
public async Task<int> PurgeOldEntriesAsync(
int retentionDays,
int batchLimit,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync("system", "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(PurgeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("retention_days", retentionDays);
command.Parameters.AddWithValue("batch_limit", batchLimit);
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
var purged = Convert.ToInt32(result);
if (purged > 0)
{
OrchestratorMetrics.DeadLetterPurged(purged);
_logger.LogInformation("Purged {Count} old dead-letter entries (retention: {RetentionDays} days)", purged, retentionDays);
}
return purged;
}
private static void AddEntryParameters(NpgsqlCommand command, DeadLetterEntry entry)
{
command.Parameters.AddWithValue("entry_id", entry.EntryId);
command.Parameters.AddWithValue("tenant_id", entry.TenantId);
command.Parameters.AddWithValue("original_job_id", entry.OriginalJobId);
command.Parameters.AddWithValue("run_id", (object?)entry.RunId ?? DBNull.Value);
command.Parameters.AddWithValue("source_id", (object?)entry.SourceId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", entry.JobType);
command.Parameters.AddWithValue("payload", entry.Payload);
command.Parameters.AddWithValue("payload_digest", entry.PayloadDigest);
command.Parameters.AddWithValue("idempotency_key", entry.IdempotencyKey);
command.Parameters.AddWithValue("correlation_id", (object?)entry.CorrelationId ?? DBNull.Value);
command.Parameters.AddWithValue("status", entry.Status.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("error_code", entry.ErrorCode);
command.Parameters.AddWithValue("failure_reason", entry.FailureReason);
command.Parameters.AddWithValue("remediation_hint", (object?)entry.RemediationHint ?? DBNull.Value);
command.Parameters.AddWithValue("category", entry.Category.ToString().ToLowerInvariant());
command.Parameters.AddWithValue("is_retryable", entry.IsRetryable);
command.Parameters.AddWithValue("original_attempts", entry.OriginalAttempts);
command.Parameters.AddWithValue("replay_attempts", entry.ReplayAttempts);
command.Parameters.AddWithValue("max_replay_attempts", entry.MaxReplayAttempts);
command.Parameters.AddWithValue("failed_at", entry.FailedAt);
command.Parameters.AddWithValue("created_at", entry.CreatedAt);
command.Parameters.AddWithValue("updated_at", entry.UpdatedAt);
command.Parameters.AddWithValue("expires_at", entry.ExpiresAt);
command.Parameters.AddWithValue("resolved_at", (object?)entry.ResolvedAt ?? DBNull.Value);
command.Parameters.AddWithValue("resolution_notes", (object?)entry.ResolutionNotes ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", entry.CreatedBy);
command.Parameters.AddWithValue("updated_by", entry.UpdatedBy);
}
private static DeadLetterEntry MapEntry(NpgsqlDataReader reader)
{
var statusStr = reader.GetString(10);
var categoryStr = reader.GetString(14);
return new DeadLetterEntry(
EntryId: reader.GetGuid(0),
TenantId: reader.GetString(1),
OriginalJobId: reader.GetGuid(2),
RunId: reader.IsDBNull(3) ? null : reader.GetGuid(3),
SourceId: reader.IsDBNull(4) ? null : reader.GetGuid(4),
JobType: reader.GetString(5),
Payload: reader.GetString(6),
PayloadDigest: reader.GetString(7),
IdempotencyKey: reader.GetString(8),
CorrelationId: reader.IsDBNull(9) ? null : reader.GetString(9),
Status: Enum.TryParse<DeadLetterStatus>(statusStr, true, out var status) ? status : DeadLetterStatus.Pending,
ErrorCode: reader.GetString(11),
FailureReason: reader.GetString(12),
RemediationHint: reader.IsDBNull(13) ? null : reader.GetString(13),
Category: Enum.TryParse<ErrorCategory>(categoryStr, true, out var cat) ? cat : ErrorCategory.Unknown,
IsRetryable: reader.GetBoolean(15),
OriginalAttempts: reader.GetInt32(16),
ReplayAttempts: reader.GetInt32(17),
MaxReplayAttempts: reader.GetInt32(18),
FailedAt: reader.GetFieldValue<DateTimeOffset>(19),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(20),
UpdatedAt: reader.GetFieldValue<DateTimeOffset>(21),
ExpiresAt: reader.GetFieldValue<DateTimeOffset>(22),
ResolvedAt: reader.IsDBNull(23) ? null : reader.GetFieldValue<DateTimeOffset>(23),
ResolutionNotes: reader.IsDBNull(24) ? null : reader.GetString(24),
CreatedBy: reader.GetString(25),
UpdatedBy: reader.GetString(26));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
DeadLetterListOptions options)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectEntryColumns} FROM dead_letter_entries WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
AppendFilters(sb, parameters, options);
// Keyset pagination: the cursor carries the created_at of the last row seen.
// The cursor predicate must be appended before ORDER BY for the SQL to be valid,
// and only when the cursor actually parses, so @cursor is never left unbound.
if (!string.IsNullOrEmpty(options.Cursor) && DateTimeOffset.TryParse(options.Cursor, out var cursor))
{
var op = options.Ascending ? ">" : "<";
sb.Append($" AND created_at {op} @cursor");
parameters.Add(("cursor", cursor));
}
var order = options.Ascending ? "ASC" : "DESC";
sb.Append($" ORDER BY created_at {order}");
sb.Append(" LIMIT @limit");
parameters.Add(("limit", options.Limit));
return (sb.ToString(), parameters);
}
private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
string tenantId,
DeadLetterListOptions options)
{
var sb = new StringBuilder();
sb.Append("SELECT COUNT(*) FROM dead_letter_entries WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
AppendFilters(sb, parameters, options);
return (sb.ToString(), parameters);
}
private static void AppendFilters(StringBuilder sb, List<(string, object)> parameters, DeadLetterListOptions options)
{
if (options.Status.HasValue)
{
sb.Append(" AND status = @status");
parameters.Add(("status", options.Status.Value.ToString().ToLowerInvariant()));
}
if (options.Category.HasValue)
{
sb.Append(" AND category = @category");
parameters.Add(("category", options.Category.Value.ToString().ToLowerInvariant()));
}
if (!string.IsNullOrEmpty(options.JobType))
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", options.JobType));
}
if (!string.IsNullOrEmpty(options.ErrorCode))
{
sb.Append(" AND error_code = @error_code");
parameters.Add(("error_code", options.ErrorCode));
}
if (options.SourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", options.SourceId.Value));
}
if (options.RunId.HasValue)
{
sb.Append(" AND run_id = @run_id");
parameters.Add(("run_id", options.RunId.Value));
}
if (options.IsRetryable.HasValue)
{
sb.Append(" AND is_retryable = @is_retryable");
parameters.Add(("is_retryable", options.IsRetryable.Value));
}
if (options.CreatedAfter.HasValue)
{
sb.Append(" AND created_at >= @created_after");
parameters.Add(("created_after", options.CreatedAfter.Value));
}
if (options.CreatedBefore.HasValue)
{
sb.Append(" AND created_at <= @created_before");
parameters.Add(("created_before", options.CreatedBefore.Value));
}
}
}
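// Illustrative sketch only: paging through pending dead-letter entries with the
// created_at keyset cursor built by BuildListQuery. IDeadLetterRepository.ListAsync
// and the init-style construction of DeadLetterListOptions are assumptions for the
// example, not part of this commit.
internal static class DeadLetterPagingExample
{
public static async Task<IReadOnlyList<DeadLetterEntry>> ReadAllPendingAsync(
IDeadLetterRepository repository,
string tenantId,
CancellationToken cancellationToken)
{
var all = new List<DeadLetterEntry>();
string? cursor = null;
while (true)
{
var options = new DeadLetterListOptions
{
Status = DeadLetterStatus.Pending,
Limit = 100,
Cursor = cursor
};
var page = await repository.ListAsync(tenantId, options, cancellationToken).ConfigureAwait(false);
if (page.Count == 0)
{
break;
}
all.AddRange(page);
// The cursor is the created_at of the last row, echoed in round-trippable form.
cursor = page[^1].CreatedAt.ToString("O");
}
return all;
}
}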


@@ -0,0 +1,247 @@
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Backfill;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of duplicate suppressor.
/// </summary>
public sealed class PostgresDuplicateSuppressor : IDuplicateSuppressor
{
private const string SelectProcessedSql = """
SELECT 1 FROM processed_events
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND event_key = @event_key
AND expires_at > NOW()
""";
private const string SelectMultipleProcessedSql = """
SELECT event_key FROM processed_events
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND event_key = ANY(@event_keys)
AND expires_at > NOW()
""";
private const string UpsertProcessedSql = """
INSERT INTO processed_events (tenant_id, scope_key, event_key, event_time, processed_at, batch_id, expires_at)
VALUES (@tenant_id, @scope_key, @event_key, @event_time, NOW(), @batch_id, @expires_at)
ON CONFLICT (tenant_id, scope_key, event_key) DO UPDATE
SET event_time = EXCLUDED.event_time,
processed_at = NOW(),
batch_id = EXCLUDED.batch_id,
expires_at = EXCLUDED.expires_at
""";
private const string CountProcessedSql = """
SELECT COUNT(*) FROM processed_events
WHERE tenant_id = @tenant_id
AND scope_key = @scope_key
AND event_time >= @from
AND event_time < @to
AND expires_at > NOW()
""";
private const string CleanupExpiredSql = """
DELETE FROM processed_events
WHERE ctid IN (
SELECT ctid FROM processed_events
WHERE expires_at < NOW()
LIMIT @batch_limit
)
""";
private readonly OrchestratorDataSource _dataSource;
private readonly string _tenantId;
private readonly ILogger<PostgresDuplicateSuppressor> _logger;
public PostgresDuplicateSuppressor(
OrchestratorDataSource dataSource,
string tenantId,
ILogger<PostgresDuplicateSuppressor> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_tenantId = tenantId ?? throw new ArgumentNullException(nameof(tenantId));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<bool> HasProcessedAsync(string scopeKey, string eventKey, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_key", eventKey);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
return await reader.ReadAsync(cancellationToken).ConfigureAwait(false);
}
public async Task<IReadOnlySet<string>> GetProcessedAsync(string scopeKey, IEnumerable<string> eventKeys, CancellationToken cancellationToken)
{
var keyList = eventKeys.ToArray();
if (keyList.Length == 0)
{
return new HashSet<string>();
}
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectMultipleProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_keys", keyList);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var result = new HashSet<string>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
result.Add(reader.GetString(0));
}
return result;
}
public async Task MarkProcessedAsync(
string scopeKey,
string eventKey,
DateTimeOffset eventTime,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpsertProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_key", eventKey);
command.Parameters.AddWithValue("event_time", eventTime);
command.Parameters.AddWithValue("batch_id", (object?)batchId ?? DBNull.Value);
command.Parameters.AddWithValue("expires_at", DateTimeOffset.UtcNow + ttl);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
public async Task MarkProcessedBatchAsync(
string scopeKey,
IEnumerable<ProcessedEvent> events,
Guid? batchId,
TimeSpan ttl,
CancellationToken cancellationToken)
{
var eventList = events.ToList();
if (eventList.Count == 0)
{
return;
}
var expiresAt = DateTimeOffset.UtcNow + ttl;
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);
try
{
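// Upsert every key inside one transaction so a partially written batch can never
// be recorded; either the whole batch suppresses future replays or none of it does.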
foreach (var evt in eventList)
{
await using var command = new NpgsqlCommand(UpsertProcessedSql, connection, transaction);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("event_key", evt.EventKey);
command.Parameters.AddWithValue("event_time", evt.EventTime);
command.Parameters.AddWithValue("batch_id", (object?)batchId ?? DBNull.Value);
command.Parameters.AddWithValue("expires_at", expiresAt);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ProcessedEventsMarked(_tenantId, scopeKey, eventList.Count);
}
catch
{
await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
throw;
}
}
public async Task<long> CountProcessedAsync(string scopeKey, DateTimeOffset from, DateTimeOffset to, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(CountProcessedSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", _tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
command.Parameters.AddWithValue("from", from);
command.Parameters.AddWithValue("to", to);
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(result);
}
public async Task<int> CleanupExpiredAsync(int batchLimit, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(_tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(CleanupExpiredSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("batch_limit", batchLimit);
var deleted = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (deleted > 0)
{
_logger.LogInformation("Cleaned up {DeletedCount} expired processed events", deleted);
OrchestratorMetrics.ProcessedEventsCleanedUp(_tenantId, deleted);
}
return deleted;
}
}
/// <summary>
/// Factory for creating tenant-scoped duplicate suppressors.
/// </summary>
public interface IDuplicateSuppressorFactory
{
/// <summary>
/// Creates a duplicate suppressor for the specified tenant.
/// </summary>
IDuplicateSuppressor Create(string tenantId);
}
/// <summary>
/// Factory implementation for PostgreSQL duplicate suppressors.
/// </summary>
public sealed class PostgresDuplicateSuppressorFactory : IDuplicateSuppressorFactory
{
private readonly OrchestratorDataSource _dataSource;
private readonly ILoggerFactory _loggerFactory;
public PostgresDuplicateSuppressorFactory(
OrchestratorDataSource dataSource,
ILoggerFactory loggerFactory)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
}
public IDuplicateSuppressor Create(string tenantId)
{
return new PostgresDuplicateSuppressor(
_dataSource,
tenantId,
_loggerFactory.CreateLogger<PostgresDuplicateSuppressor>());
}
}
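// Illustrative sketch only: combining the factory with batch suppression on an
// event-ingest path. The incoming event shape and the 30-day TTL are assumptions
// for the example, not part of this commit.
internal static class DuplicateSuppressionExample
{
public static async Task<IReadOnlyList<ProcessedEvent>> FilterAndRecordAsync(
IDuplicateSuppressorFactory factory,
string tenantId,
string scopeKey,
IReadOnlyList<ProcessedEvent> incoming,
CancellationToken cancellationToken)
{
var suppressor = factory.Create(tenantId);
// Drop keys already recorded inside the TTL window.
var seen = await suppressor.GetProcessedAsync(scopeKey, incoming.Select(e => e.EventKey), cancellationToken).ConfigureAwait(false);
var fresh = incoming.Where(e => !seen.Contains(e.EventKey)).ToList();
// Record the survivors atomically so replays within the TTL are suppressed.
await suppressor.MarkProcessedBatchAsync(scopeKey, fresh, null, TimeSpan.FromDays(30), cancellationToken).ConfigureAwait(false);
return fresh;
}
}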


@@ -0,0 +1,540 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of job repository.
/// </summary>
public sealed class PostgresJobRepository : IJobRepository
{
private const string SelectJobColumns = """
job_id, tenant_id, project_id, run_id, job_type, status, priority, attempt, max_attempts,
payload_digest, payload, idempotency_key, correlation_id, lease_id, worker_id, task_runner_id,
lease_until, created_at, scheduled_at, leased_at, completed_at, not_before, reason, replay_of, created_by
""";
private const string SelectByIdSql = $"""
SELECT {SelectJobColumns}
FROM jobs
WHERE tenant_id = @tenant_id AND job_id = @job_id
""";
private const string SelectByIdempotencyKeySql = $"""
SELECT {SelectJobColumns}
FROM jobs
WHERE tenant_id = @tenant_id AND idempotency_key = @idempotency_key
""";
private const string InsertJobSql = """
INSERT INTO jobs (
job_id, tenant_id, project_id, run_id, job_type, status, priority, attempt, max_attempts,
payload_digest, payload, idempotency_key, correlation_id, lease_id, worker_id, task_runner_id,
lease_until, created_at, scheduled_at, leased_at, completed_at, not_before, reason, replay_of, created_by)
VALUES (
@job_id, @tenant_id, @project_id, @run_id, @job_type, @status::job_status, @priority, @attempt, @max_attempts,
@payload_digest, @payload, @idempotency_key, @correlation_id, @lease_id, @worker_id, @task_runner_id,
@lease_until, @created_at, @scheduled_at, @leased_at, @completed_at, @not_before, @reason, @replay_of, @created_by)
""";
private const string UpdateStatusSql = """
UPDATE jobs
SET status = @status::job_status,
attempt = @attempt,
lease_id = @lease_id,
worker_id = @worker_id,
task_runner_id = @task_runner_id,
lease_until = @lease_until,
scheduled_at = @scheduled_at,
leased_at = @leased_at,
completed_at = @completed_at,
not_before = @not_before,
reason = @reason
WHERE tenant_id = @tenant_id AND job_id = @job_id
""";
private const string LeaseNextSqlTemplate = """
UPDATE jobs
SET status = 'leased'::job_status,
lease_id = @lease_id,
worker_id = @worker_id,
lease_until = @lease_until,
leased_at = @leased_at
WHERE tenant_id = @tenant_id
AND job_id = (
SELECT job_id
FROM jobs
WHERE tenant_id = @tenant_id
AND status = 'scheduled'::job_status
AND (not_before IS NULL OR not_before <= @now)
{0}
ORDER BY priority DESC, created_at
LIMIT 1
FOR UPDATE SKIP LOCKED
)
RETURNING
""";
private const string ExtendLeaseSql = """
UPDATE jobs
SET lease_until = @new_lease_until
WHERE tenant_id = @tenant_id
AND job_id = @job_id
AND lease_id = @lease_id
AND status = 'leased'::job_status
AND lease_until > @now
""";
private const string SelectByRunIdSql = $"""
SELECT {SelectJobColumns}
FROM jobs
WHERE tenant_id = @tenant_id AND run_id = @run_id
ORDER BY created_at
""";
private const string SelectExpiredLeasesSql = $"""
SELECT {SelectJobColumns}
FROM jobs
WHERE tenant_id = @tenant_id
AND status = 'leased'::job_status
AND lease_until < @cutoff
ORDER BY lease_until
LIMIT @limit
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresJobRepository> _logger;
public PostgresJobRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresJobRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Job?> GetByIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_id", jobId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapJob(reader);
}
public async Task<Job?> GetByIdempotencyKeyAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdempotencyKeySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("idempotency_key", idempotencyKey);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapJob(reader);
}
public async Task CreateAsync(Job job, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(job.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertJobSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddJobParameters(command, job);
try
{
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.JobEnqueued(job.TenantId, job.JobType);
OrchestratorMetrics.QueueDepthChanged(job.TenantId, job.JobType, 1);
}
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
{
_logger.LogWarning("Duplicate job idempotency key: {IdempotencyKey}", job.IdempotencyKey);
throw new DuplicateJobException(job.IdempotencyKey, ex);
}
}
public async Task UpdateStatusAsync(
string tenantId,
Guid jobId,
JobStatus status,
int attempt,
Guid? leaseId,
string? workerId,
string? taskRunnerId,
DateTimeOffset? leaseUntil,
DateTimeOffset? scheduledAt,
DateTimeOffset? leasedAt,
DateTimeOffset? completedAt,
DateTimeOffset? notBefore,
string? reason,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateStatusSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_id", jobId);
command.Parameters.AddWithValue("status", StatusToString(status));
command.Parameters.AddWithValue("attempt", attempt);
command.Parameters.AddWithValue("lease_id", (object?)leaseId ?? DBNull.Value);
command.Parameters.AddWithValue("worker_id", (object?)workerId ?? DBNull.Value);
command.Parameters.AddWithValue("task_runner_id", (object?)taskRunnerId ?? DBNull.Value);
command.Parameters.AddWithValue("lease_until", (object?)leaseUntil ?? DBNull.Value);
command.Parameters.AddWithValue("scheduled_at", (object?)scheduledAt ?? DBNull.Value);
command.Parameters.AddWithValue("leased_at", (object?)leasedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)completedAt ?? DBNull.Value);
command.Parameters.AddWithValue("not_before", (object?)notBefore ?? DBNull.Value);
command.Parameters.AddWithValue("reason", (object?)reason ?? DBNull.Value);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
public async Task<Job?> LeaseNextAsync(
string tenantId,
string? jobType,
Guid leaseId,
string workerId,
DateTimeOffset leaseUntil,
CancellationToken cancellationToken)
{
var jobTypeFilter = jobType != null ? "AND job_type = @job_type" : "";
var sql = string.Format(LeaseNextSqlTemplate, jobTypeFilter) + " " + SelectJobColumns;
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("lease_id", leaseId);
command.Parameters.AddWithValue("worker_id", workerId);
command.Parameters.AddWithValue("lease_until", leaseUntil);
command.Parameters.AddWithValue("leased_at", DateTimeOffset.UtcNow);
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
if (jobType != null)
{
command.Parameters.AddWithValue("job_type", jobType);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
var job = MapJob(reader);
OrchestratorMetrics.JobLeased(job.TenantId, job.JobType);
OrchestratorMetrics.QueueDepthChanged(job.TenantId, job.JobType, -1);
return job;
}
public async Task<bool> ExtendLeaseAsync(
string tenantId,
Guid jobId,
Guid leaseId,
DateTimeOffset newLeaseUntil,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(ExtendLeaseSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_id", jobId);
command.Parameters.AddWithValue("lease_id", leaseId);
command.Parameters.AddWithValue("new_lease_until", newLeaseUntil);
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
return rows > 0;
}
public async Task<IReadOnlyList<Job>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByRunIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("run_id", runId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var jobs = new List<Job>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
jobs.Add(MapJob(reader));
}
return jobs;
}
public async Task<IReadOnlyList<Job>> GetExpiredLeasesAsync(string tenantId, DateTimeOffset cutoff, int limit, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectExpiredLeasesSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("cutoff", cutoff);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var jobs = new List<Job>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
jobs.Add(MapJob(reader));
}
return jobs;
}
public async Task<IReadOnlyList<Job>> ListAsync(
string tenantId,
JobStatus? status,
string? jobType,
string? projectId,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, status, jobType, projectId, createdAfter, createdBefore, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var jobs = new List<Job>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
jobs.Add(MapJob(reader));
}
return jobs;
}
public async Task<int> CountAsync(
string tenantId,
JobStatus? status,
string? jobType,
string? projectId,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildCountQuery(tenantId, status, jobType, projectId);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt32(result);
}
private static void AddJobParameters(NpgsqlCommand command, Job job)
{
command.Parameters.AddWithValue("job_id", job.JobId);
command.Parameters.AddWithValue("tenant_id", job.TenantId);
command.Parameters.AddWithValue("project_id", (object?)job.ProjectId ?? DBNull.Value);
command.Parameters.AddWithValue("run_id", (object?)job.RunId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", job.JobType);
command.Parameters.AddWithValue("status", StatusToString(job.Status));
command.Parameters.AddWithValue("priority", job.Priority);
command.Parameters.AddWithValue("attempt", job.Attempt);
command.Parameters.AddWithValue("max_attempts", job.MaxAttempts);
command.Parameters.AddWithValue("payload_digest", job.PayloadDigest);
command.Parameters.Add(new NpgsqlParameter<string>("payload", NpgsqlDbType.Jsonb) { TypedValue = job.Payload });
command.Parameters.AddWithValue("idempotency_key", job.IdempotencyKey);
command.Parameters.AddWithValue("correlation_id", (object?)job.CorrelationId ?? DBNull.Value);
command.Parameters.AddWithValue("lease_id", (object?)job.LeaseId ?? DBNull.Value);
command.Parameters.AddWithValue("worker_id", (object?)job.WorkerId ?? DBNull.Value);
command.Parameters.AddWithValue("task_runner_id", (object?)job.TaskRunnerId ?? DBNull.Value);
command.Parameters.AddWithValue("lease_until", (object?)job.LeaseUntil ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", job.CreatedAt);
command.Parameters.AddWithValue("scheduled_at", (object?)job.ScheduledAt ?? DBNull.Value);
command.Parameters.AddWithValue("leased_at", (object?)job.LeasedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)job.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("not_before", (object?)job.NotBefore ?? DBNull.Value);
command.Parameters.AddWithValue("reason", (object?)job.Reason ?? DBNull.Value);
command.Parameters.AddWithValue("replay_of", (object?)job.ReplayOf ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", job.CreatedBy);
}
private static Job MapJob(NpgsqlDataReader reader)
{
return new Job(
JobId: reader.GetGuid(0),
TenantId: reader.GetString(1),
ProjectId: reader.IsDBNull(2) ? null : reader.GetString(2),
RunId: reader.IsDBNull(3) ? null : reader.GetGuid(3),
JobType: reader.GetString(4),
Status: ParseStatus(reader.GetString(5)),
Priority: reader.GetInt32(6),
Attempt: reader.GetInt32(7),
MaxAttempts: reader.GetInt32(8),
PayloadDigest: reader.GetString(9),
Payload: reader.GetString(10),
IdempotencyKey: reader.GetString(11),
CorrelationId: reader.IsDBNull(12) ? null : reader.GetString(12),
LeaseId: reader.IsDBNull(13) ? null : reader.GetGuid(13),
WorkerId: reader.IsDBNull(14) ? null : reader.GetString(14),
TaskRunnerId: reader.IsDBNull(15) ? null : reader.GetString(15),
LeaseUntil: reader.IsDBNull(16) ? null : reader.GetFieldValue<DateTimeOffset>(16),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(17),
ScheduledAt: reader.IsDBNull(18) ? null : reader.GetFieldValue<DateTimeOffset>(18),
LeasedAt: reader.IsDBNull(19) ? null : reader.GetFieldValue<DateTimeOffset>(19),
CompletedAt: reader.IsDBNull(20) ? null : reader.GetFieldValue<DateTimeOffset>(20),
NotBefore: reader.IsDBNull(21) ? null : reader.GetFieldValue<DateTimeOffset>(21),
Reason: reader.IsDBNull(22) ? null : reader.GetString(22),
ReplayOf: reader.IsDBNull(23) ? null : reader.GetGuid(23),
CreatedBy: reader.GetString(24));
}
private static string StatusToString(JobStatus status) => status switch
{
JobStatus.Pending => "pending",
JobStatus.Scheduled => "scheduled",
JobStatus.Leased => "leased",
JobStatus.Succeeded => "succeeded",
JobStatus.Failed => "failed",
JobStatus.Canceled => "canceled",
JobStatus.TimedOut => "timed_out",
_ => throw new ArgumentOutOfRangeException(nameof(status))
};
private static JobStatus ParseStatus(string status) => status switch
{
"pending" => JobStatus.Pending,
"scheduled" => JobStatus.Scheduled,
"leased" => JobStatus.Leased,
"succeeded" => JobStatus.Succeeded,
"failed" => JobStatus.Failed,
"canceled" => JobStatus.Canceled,
"timed_out" => JobStatus.TimedOut,
_ => throw new ArgumentOutOfRangeException(nameof(status))
};
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
JobStatus? status,
string? jobType,
string? projectId,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectJobColumns} FROM jobs WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (status.HasValue)
{
sb.Append(" AND status = @status::job_status");
parameters.Add(("status", StatusToString(status.Value)));
}
if (!string.IsNullOrEmpty(jobType))
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}
if (!string.IsNullOrEmpty(projectId))
{
sb.Append(" AND project_id = @project_id");
parameters.Add(("project_id", projectId));
}
if (createdAfter.HasValue)
{
sb.Append(" AND created_at >= @created_after");
parameters.Add(("created_after", createdAfter.Value));
}
if (createdBefore.HasValue)
{
sb.Append(" AND created_at < @created_before");
parameters.Add(("created_before", createdBefore.Value));
}
sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
string tenantId,
JobStatus? status,
string? jobType,
string? projectId)
{
var sb = new StringBuilder();
sb.Append("SELECT COUNT(*) FROM jobs WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (status.HasValue)
{
sb.Append(" AND status = @status::job_status");
parameters.Add(("status", StatusToString(status.Value)));
}
if (!string.IsNullOrEmpty(jobType))
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}
if (!string.IsNullOrEmpty(projectId))
{
sb.Append(" AND project_id = @project_id");
parameters.Add(("project_id", projectId));
}
return (sb.ToString(), parameters);
}
}
/// <summary>
/// Exception thrown when attempting to create a job with a duplicate idempotency key.
/// </summary>
public sealed class DuplicateJobException : Exception
{
public string IdempotencyKey { get; }
public DuplicateJobException(string idempotencyKey, Exception innerException)
: base($"Job with idempotency key '{idempotencyKey}' already exists.", innerException)
{
IdempotencyKey = idempotencyKey;
}
}
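// Illustrative sketch only: a minimal enqueue/lease/complete flow over IJobRepository.
// The worker identity, five-minute lease, and doWorkAsync callback are assumptions
// for the example, not part of this commit.
internal static class JobRepositoryExample
{
public static async Task EnqueueIdempotentAsync(IJobRepository jobs, Job job, CancellationToken cancellationToken)
{
try
{
await jobs.CreateAsync(job, cancellationToken).ConfigureAwait(false);
}
catch (DuplicateJobException)
{
// The same idempotency key was already enqueued; treat the replay as a no-op.
}
}
public static async Task<bool> LeaseOneAsync(
IJobRepository jobs,
string tenantId,
string workerId,
Func<Job, CancellationToken, Task> doWorkAsync,
CancellationToken cancellationToken)
{
var leaseId = Guid.NewGuid();
var leaseUntil = DateTimeOffset.UtcNow.AddMinutes(5);
// Safe to call from many workers concurrently thanks to FOR UPDATE SKIP LOCKED.
var job = await jobs.LeaseNextAsync(tenantId, null, leaseId, workerId, leaseUntil, cancellationToken).ConfigureAwait(false);
if (job is null)
{
return false; // queue drained
}
try
{
await doWorkAsync(job, cancellationToken).ConfigureAwait(false);
await jobs.UpdateStatusAsync(
tenantId, job.JobId, JobStatus.Succeeded, job.Attempt,
null, null, null, null, job.ScheduledAt, job.LeasedAt,
DateTimeOffset.UtcNow, null, null, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex)
{
await jobs.UpdateStatusAsync(
tenantId, job.JobId, JobStatus.Failed, job.Attempt,
null, null, null, null, job.ScheduledAt, job.LeasedAt,
DateTimeOffset.UtcNow, null, ex.Message, cancellationToken).ConfigureAwait(false);
}
return true;
}
}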


@@ -0,0 +1,949 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of the ledger repository.
/// </summary>
public sealed class PostgresLedgerRepository : ILedgerRepository
{
private const string SelectLedgerColumns = """
ledger_id, tenant_id, run_id, source_id, run_type, final_status, total_jobs,
succeeded_jobs, failed_jobs, run_created_at, run_started_at, run_completed_at,
execution_duration_ms, initiated_by, input_digest, output_digest, artifact_manifest,
sequence_number, previous_entry_hash, content_hash, ledger_created_at, correlation_id, metadata
""";
private const string SelectByIdSql = $"""
SELECT {SelectLedgerColumns}
FROM run_ledger_entries
WHERE tenant_id = @tenant_id AND ledger_id = @ledger_id
""";
private const string SelectByRunIdSql = $"""
SELECT {SelectLedgerColumns}
FROM run_ledger_entries
WHERE tenant_id = @tenant_id AND run_id = @run_id
""";
private const string InsertEntrySql = """
INSERT INTO run_ledger_entries (
ledger_id, tenant_id, run_id, source_id, run_type, final_status, total_jobs,
succeeded_jobs, failed_jobs, run_created_at, run_started_at, run_completed_at,
execution_duration_ms, initiated_by, input_digest, output_digest, artifact_manifest,
sequence_number, previous_entry_hash, content_hash, ledger_created_at, correlation_id, metadata)
VALUES (
@ledger_id, @tenant_id, @run_id, @source_id, @run_type, @final_status, @total_jobs,
@succeeded_jobs, @failed_jobs, @run_created_at, @run_started_at, @run_completed_at,
@execution_duration_ms, @initiated_by, @input_digest, @output_digest, @artifact_manifest::jsonb,
@sequence_number, @previous_entry_hash, @content_hash, @ledger_created_at, @correlation_id, @metadata::jsonb)
""";
private const string SelectLatestSql = $"""
SELECT {SelectLedgerColumns}
FROM run_ledger_entries
WHERE tenant_id = @tenant_id
ORDER BY sequence_number DESC
LIMIT 1
""";
private const string GetSequenceSql = """
SELECT next_seq, prev_hash FROM next_ledger_sequence(@tenant_id)
""";
private const string UpdateSequenceHashSql = """
SELECT update_ledger_sequence_hash(@tenant_id, @content_hash)
""";
private const string VerifyChainSql = """
SELECT is_valid, invalid_ledger_id, invalid_sequence, error_message
FROM verify_ledger_chain(@tenant_id, @start_seq, @end_seq)
""";
private const string GetSummarySql = """
SELECT total_entries, entries_since, total_runs, successful_runs, failed_runs,
total_jobs, unique_sources, unique_run_types, earliest_entry, latest_entry
FROM get_ledger_summary(@tenant_id, @since)
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresLedgerRepository> _logger;
public PostgresLedgerRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresLedgerRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<RunLedgerEntry> AppendAsync(
Run run,
IReadOnlyList<Artifact> artifacts,
string inputDigest,
string? metadata = null,
CancellationToken cancellationToken = default)
{
if (run.CompletedAt is null)
{
throw new InvalidOperationException("Cannot create ledger entry from an incomplete run.");
}
await using var connection = await _dataSource.OpenConnectionAsync(run.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var transaction = await connection.BeginTransactionAsync(cancellationToken).ConfigureAwait(false);
try
{
// Get next sequence number and previous hash
long sequenceNumber;
string? previousEntryHash;
await using (var seqCommand = new NpgsqlCommand(GetSequenceSql, connection, transaction))
{
seqCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
seqCommand.Parameters.AddWithValue("tenant_id", run.TenantId);
await using var reader = await seqCommand.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
throw new InvalidOperationException("Failed to get next ledger sequence.");
}
sequenceNumber = reader.GetInt64(0);
previousEntryHash = reader.IsDBNull(1) ? null : reader.GetString(1);
}
// Create the ledger entry
var entry = RunLedgerEntry.FromCompletedRun(
run: run,
artifacts: artifacts,
inputDigest: inputDigest,
sequenceNumber: sequenceNumber,
previousEntryHash: previousEntryHash,
metadata: metadata);
// Insert the entry
await using (var insertCommand = new NpgsqlCommand(InsertEntrySql, connection, transaction))
{
insertCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddEntryParameters(insertCommand, entry);
await insertCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
// Update sequence hash
await using (var updateCommand = new NpgsqlCommand(UpdateSequenceHashSql, connection, transaction))
{
updateCommand.CommandTimeout = _dataSource.CommandTimeoutSeconds;
updateCommand.Parameters.AddWithValue("tenant_id", run.TenantId);
updateCommand.Parameters.AddWithValue("content_hash", entry.ContentHash);
await updateCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
await transaction.CommitAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.LedgerEntryCreated(run.TenantId, run.RunType, entry.FinalStatus.ToString());
_logger.LogDebug("Ledger entry {LedgerId} appended for run {RunId}, sequence {Sequence}",
entry.LedgerId, run.RunId, sequenceNumber);
return entry;
}
catch
{
await transaction.RollbackAsync(cancellationToken).ConfigureAwait(false);
throw;
}
}
public async Task<RunLedgerEntry?> GetByIdAsync(
string tenantId,
Guid ledgerId,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("ledger_id", ledgerId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<RunLedgerEntry?> GetByRunIdAsync(
string tenantId,
Guid runId,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByRunIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("run_id", runId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<IReadOnlyList<RunLedgerEntry>> ListAsync(
string tenantId,
string? runType = null,
Guid? sourceId = null,
RunStatus? finalStatus = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default)
{
var (sql, parameters) = BuildListQuery(tenantId, runType, sourceId, finalStatus, startTime, endTime, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<RunLedgerEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<IReadOnlyList<RunLedgerEntry>> GetBySequenceRangeAsync(
string tenantId,
long startSequence,
long endSequence,
CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectLedgerColumns}
FROM run_ledger_entries
WHERE tenant_id = @tenant_id
AND sequence_number >= @start_seq
AND sequence_number <= @end_seq
ORDER BY sequence_number ASC
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("start_seq", startSequence);
command.Parameters.AddWithValue("end_seq", endSequence);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<RunLedgerEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<RunLedgerEntry?> GetLatestAsync(
string tenantId,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectLatestSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapEntry(reader);
}
public async Task<IReadOnlyList<RunLedgerEntry>> GetBySourceAsync(
string tenantId,
Guid sourceId,
int limit = 100,
CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectLedgerColumns}
FROM run_ledger_entries
WHERE tenant_id = @tenant_id
AND source_id = @source_id
ORDER BY ledger_created_at DESC
LIMIT @limit
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var entries = new List<RunLedgerEntry>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
entries.Add(MapEntry(reader));
}
return entries;
}
public async Task<long> GetCountAsync(
string tenantId,
string? runType = null,
Guid? sourceId = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
CancellationToken cancellationToken = default)
{
var sb = new StringBuilder("SELECT COUNT(*) FROM run_ledger_entries WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (runType is not null)
{
sb.Append(" AND run_type = @run_type");
parameters.Add(("run_type", runType));
}
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (startTime.HasValue)
{
sb.Append(" AND ledger_created_at >= @start_time");
parameters.Add(("start_time", startTime.Value));
}
if (endTime.HasValue)
{
sb.Append(" AND ledger_created_at <= @end_time");
parameters.Add(("end_time", endTime.Value));
}
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sb.ToString(), connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt64(result);
}
public async Task<ChainVerificationResult> VerifyChainAsync(
string tenantId,
long? startSequence = null,
long? endSequence = null,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(VerifyChainSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("start_seq", (object?)startSequence ?? 1L);
command.Parameters.AddWithValue("end_seq", (object?)endSequence ?? DBNull.Value);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return new ChainVerificationResult(true, null, null, null);
}
return new ChainVerificationResult(
IsValid: reader.GetBoolean(0),
InvalidEntryId: reader.IsDBNull(1) ? null : reader.GetGuid(1),
InvalidSequence: reader.IsDBNull(2) ? null : reader.GetInt64(2),
ErrorMessage: reader.IsDBNull(3) ? null : reader.GetString(3));
}
public async Task<LedgerSummary> GetSummaryAsync(
string tenantId,
DateTimeOffset? since = null,
CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(GetSummarySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("since", (object?)since ?? DBNull.Value);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return new LedgerSummary(0, 0, 0, 0, 0, 0, 0, 0, null, null);
}
return new LedgerSummary(
TotalEntries: reader.GetInt64(0),
EntriesSince: reader.GetInt64(1),
TotalRuns: reader.GetInt64(2),
SuccessfulRuns: reader.GetInt64(3),
FailedRuns: reader.GetInt64(4),
TotalJobs: reader.GetInt64(5),
UniqueSources: reader.GetInt64(6),
UniqueRunTypes: reader.GetInt64(7),
EarliestEntry: reader.IsDBNull(8) ? null : reader.GetFieldValue<DateTimeOffset>(8),
LatestEntry: reader.IsDBNull(9) ? null : reader.GetFieldValue<DateTimeOffset>(9));
}
private static void AddEntryParameters(NpgsqlCommand command, RunLedgerEntry entry)
{
command.Parameters.AddWithValue("ledger_id", entry.LedgerId);
command.Parameters.AddWithValue("tenant_id", entry.TenantId);
command.Parameters.AddWithValue("run_id", entry.RunId);
command.Parameters.AddWithValue("source_id", entry.SourceId);
command.Parameters.AddWithValue("run_type", entry.RunType);
command.Parameters.AddWithValue("final_status", (int)entry.FinalStatus);
command.Parameters.AddWithValue("total_jobs", entry.TotalJobs);
command.Parameters.AddWithValue("succeeded_jobs", entry.SucceededJobs);
command.Parameters.AddWithValue("failed_jobs", entry.FailedJobs);
command.Parameters.AddWithValue("run_created_at", entry.RunCreatedAt);
command.Parameters.AddWithValue("run_started_at", (object?)entry.RunStartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("run_completed_at", entry.RunCompletedAt);
command.Parameters.AddWithValue("execution_duration_ms", (long)entry.ExecutionDuration.TotalMilliseconds);
command.Parameters.AddWithValue("initiated_by", entry.InitiatedBy);
command.Parameters.AddWithValue("input_digest", entry.InputDigest);
command.Parameters.AddWithValue("output_digest", entry.OutputDigest);
command.Parameters.AddWithValue("artifact_manifest", entry.ArtifactManifest);
command.Parameters.AddWithValue("sequence_number", entry.SequenceNumber);
command.Parameters.AddWithValue("previous_entry_hash", (object?)entry.PreviousEntryHash ?? DBNull.Value);
command.Parameters.AddWithValue("content_hash", entry.ContentHash);
command.Parameters.AddWithValue("ledger_created_at", entry.LedgerCreatedAt);
command.Parameters.AddWithValue("correlation_id", (object?)entry.CorrelationId ?? DBNull.Value);
command.Parameters.AddWithValue("metadata", (object?)entry.Metadata ?? DBNull.Value);
}
private static RunLedgerEntry MapEntry(NpgsqlDataReader reader)
{
return new RunLedgerEntry(
LedgerId: reader.GetGuid(0),
TenantId: reader.GetString(1),
RunId: reader.GetGuid(2),
SourceId: reader.GetGuid(3),
RunType: reader.GetString(4),
FinalStatus: (RunStatus)reader.GetInt32(5),
TotalJobs: reader.GetInt32(6),
SucceededJobs: reader.GetInt32(7),
FailedJobs: reader.GetInt32(8),
RunCreatedAt: reader.GetFieldValue<DateTimeOffset>(9),
RunStartedAt: reader.IsDBNull(10) ? null : reader.GetFieldValue<DateTimeOffset>(10),
RunCompletedAt: reader.GetFieldValue<DateTimeOffset>(11),
ExecutionDuration: TimeSpan.FromMilliseconds(reader.GetInt64(12)),
InitiatedBy: reader.GetString(13),
InputDigest: reader.GetString(14),
OutputDigest: reader.GetString(15),
ArtifactManifest: reader.GetString(16),
SequenceNumber: reader.GetInt64(17),
PreviousEntryHash: reader.IsDBNull(18) ? null : reader.GetString(18),
ContentHash: reader.GetString(19),
LedgerCreatedAt: reader.GetFieldValue<DateTimeOffset>(20),
CorrelationId: reader.IsDBNull(21) ? null : reader.GetString(21),
Metadata: reader.IsDBNull(22) ? null : reader.GetString(22));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
string? runType,
Guid? sourceId,
RunStatus? finalStatus,
DateTimeOffset? startTime,
DateTimeOffset? endTime,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectLedgerColumns} FROM run_ledger_entries WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (runType is not null)
{
sb.Append(" AND run_type = @run_type");
parameters.Add(("run_type", runType));
}
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (finalStatus.HasValue)
{
sb.Append(" AND final_status = @final_status");
parameters.Add(("final_status", (int)finalStatus.Value));
}
if (startTime.HasValue)
{
sb.Append(" AND ledger_created_at >= @start_time");
parameters.Add(("start_time", startTime.Value));
}
if (endTime.HasValue)
{
sb.Append(" AND ledger_created_at <= @end_time");
parameters.Add(("end_time", endTime.Value));
}
sb.Append(" ORDER BY ledger_created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
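// Illustrative sketch only: appending a completed run and then verifying the
// tamper-evident hash chain. The caller-supplied run, artifacts, and inputDigest
// are assumptions for the example, not part of this commit.
internal static class LedgerUsageExample
{
public static async Task<RunLedgerEntry> AppendAndVerifyAsync(
ILedgerRepository ledger,
Run completedRun,
IReadOnlyList<Artifact> artifacts,
string inputDigest,
CancellationToken cancellationToken)
{
var entry = await ledger.AppendAsync(completedRun, artifacts, inputDigest, null, cancellationToken).ConfigureAwait(false);
// Each content_hash covers the previous entry's hash, so a full-chain check
// detects any mutation or reordering of historical entries.
var result = await ledger.VerifyChainAsync(completedRun.TenantId, 1, entry.SequenceNumber, cancellationToken).ConfigureAwait(false);
if (!result.IsValid)
{
throw new InvalidOperationException(
$"Ledger chain broken at sequence {result.InvalidSequence}: {result.ErrorMessage}");
}
return entry;
}
}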
/// <summary>
/// PostgreSQL implementation of the ledger export repository.
/// </summary>
public sealed class PostgresLedgerExportRepository : ILedgerExportRepository
{
private const string SelectExportColumns = """
export_id, tenant_id, status, format, start_time, end_time, run_type_filter,
source_id_filter, entry_count, output_uri, output_digest, output_size_bytes,
requested_by, requested_at, started_at, completed_at, error_message
""";
private const string InsertExportSql = """
INSERT INTO ledger_exports (
export_id, tenant_id, status, format, start_time, end_time, run_type_filter,
source_id_filter, entry_count, output_uri, output_digest, output_size_bytes,
requested_by, requested_at, started_at, completed_at, error_message)
VALUES (
@export_id, @tenant_id, @status, @format, @start_time, @end_time, @run_type_filter,
@source_id_filter, @entry_count, @output_uri, @output_digest, @output_size_bytes,
@requested_by, @requested_at, @started_at, @completed_at, @error_message)
""";
private const string UpdateExportSql = """
UPDATE ledger_exports
SET status = @status,
entry_count = @entry_count,
output_uri = @output_uri,
output_digest = @output_digest,
output_size_bytes = @output_size_bytes,
started_at = @started_at,
completed_at = @completed_at,
error_message = @error_message
WHERE tenant_id = @tenant_id AND export_id = @export_id
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresLedgerExportRepository> _logger;
public PostgresLedgerExportRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresLedgerExportRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<LedgerExport> CreateAsync(LedgerExport export, CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(export.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertExportSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddExportParameters(command, export);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.LedgerExportRequested(export.TenantId, export.Format);
_logger.LogDebug("Ledger export {ExportId} created for tenant {TenantId}", export.ExportId, export.TenantId);
return export;
}
public async Task<LedgerExport?> GetByIdAsync(string tenantId, Guid exportId, CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectExportColumns}
FROM ledger_exports
WHERE tenant_id = @tenant_id AND export_id = @export_id
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("export_id", exportId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapExport(reader);
}
public async Task<IReadOnlyList<LedgerExport>> ListAsync(
string tenantId,
LedgerExportStatus? status = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default)
{
var sb = new StringBuilder($"SELECT {SelectExportColumns} FROM ledger_exports WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (status.HasValue)
{
sb.Append(" AND status = @status");
parameters.Add(("status", (int)status.Value));
}
sb.Append(" ORDER BY requested_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sb.ToString(), connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var exports = new List<LedgerExport>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
exports.Add(MapExport(reader));
}
return exports;
}
public async Task<LedgerExport> UpdateAsync(LedgerExport export, CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(export.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateExportSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("export_id", export.ExportId);
command.Parameters.AddWithValue("tenant_id", export.TenantId);
command.Parameters.AddWithValue("status", (int)export.Status);
command.Parameters.AddWithValue("entry_count", export.EntryCount);
command.Parameters.AddWithValue("output_uri", (object?)export.OutputUri ?? DBNull.Value);
command.Parameters.AddWithValue("output_digest", (object?)export.OutputDigest ?? DBNull.Value);
command.Parameters.AddWithValue("output_size_bytes", (object?)export.OutputSizeBytes ?? DBNull.Value);
command.Parameters.AddWithValue("started_at", (object?)export.StartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)export.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("error_message", (object?)export.ErrorMessage ?? DBNull.Value);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (export.Status == LedgerExportStatus.Completed)
{
OrchestratorMetrics.LedgerExportCompleted(export.TenantId, export.Format);
}
else if (export.Status == LedgerExportStatus.Failed)
{
OrchestratorMetrics.LedgerExportFailed(export.TenantId, export.Format);
}
return export;
}
public async Task<IReadOnlyList<LedgerExport>> GetPendingAsync(int limit = 10, CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectExportColumns}
FROM ledger_exports
WHERE status = @status
ORDER BY requested_at ASC
LIMIT @limit
""";
await using var connection = await _dataSource.OpenConnectionAsync("_system", "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("status", (int)LedgerExportStatus.Pending);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var exports = new List<LedgerExport>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
exports.Add(MapExport(reader));
}
return exports;
}
private static void AddExportParameters(NpgsqlCommand command, LedgerExport export)
{
command.Parameters.AddWithValue("export_id", export.ExportId);
command.Parameters.AddWithValue("tenant_id", export.TenantId);
command.Parameters.AddWithValue("status", (int)export.Status);
command.Parameters.AddWithValue("format", export.Format);
command.Parameters.AddWithValue("start_time", (object?)export.StartTime ?? DBNull.Value);
command.Parameters.AddWithValue("end_time", (object?)export.EndTime ?? DBNull.Value);
command.Parameters.AddWithValue("run_type_filter", (object?)export.RunTypeFilter ?? DBNull.Value);
command.Parameters.AddWithValue("source_id_filter", (object?)export.SourceIdFilter ?? DBNull.Value);
command.Parameters.AddWithValue("entry_count", export.EntryCount);
command.Parameters.AddWithValue("output_uri", (object?)export.OutputUri ?? DBNull.Value);
command.Parameters.AddWithValue("output_digest", (object?)export.OutputDigest ?? DBNull.Value);
command.Parameters.AddWithValue("output_size_bytes", (object?)export.OutputSizeBytes ?? DBNull.Value);
command.Parameters.AddWithValue("requested_by", export.RequestedBy);
command.Parameters.AddWithValue("requested_at", export.RequestedAt);
command.Parameters.AddWithValue("started_at", (object?)export.StartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)export.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("error_message", (object?)export.ErrorMessage ?? DBNull.Value);
}
private static LedgerExport MapExport(NpgsqlDataReader reader)
{
return new LedgerExport(
ExportId: reader.GetGuid(0),
TenantId: reader.GetString(1),
Status: (LedgerExportStatus)reader.GetInt32(2),
Format: reader.GetString(3),
StartTime: reader.IsDBNull(4) ? null : reader.GetFieldValue<DateTimeOffset>(4),
EndTime: reader.IsDBNull(5) ? null : reader.GetFieldValue<DateTimeOffset>(5),
RunTypeFilter: reader.IsDBNull(6) ? null : reader.GetString(6),
SourceIdFilter: reader.IsDBNull(7) ? null : reader.GetGuid(7),
EntryCount: reader.GetInt32(8),
OutputUri: reader.IsDBNull(9) ? null : reader.GetString(9),
OutputDigest: reader.IsDBNull(10) ? null : reader.GetString(10),
OutputSizeBytes: reader.IsDBNull(11) ? null : reader.GetInt64(11),
RequestedBy: reader.GetString(12),
RequestedAt: reader.GetFieldValue<DateTimeOffset>(13),
StartedAt: reader.IsDBNull(14) ? null : reader.GetFieldValue<DateTimeOffset>(14),
CompletedAt: reader.IsDBNull(15) ? null : reader.GetFieldValue<DateTimeOffset>(15),
ErrorMessage: reader.IsDBNull(16) ? null : reader.GetString(16));
}
}
/// <summary>
/// PostgreSQL implementation of the manifest repository.
/// </summary>
public sealed class PostgresManifestRepository : IManifestRepository
{
private const string SelectManifestColumns = """
manifest_id, schema_version, tenant_id, provenance_type, subject_id, statements,
artifacts, materials, build_info, payload_digest, signature_algorithm, signature,
key_id, created_at, expires_at, metadata
""";
private const string InsertManifestSql = """
INSERT INTO signed_manifests (
manifest_id, schema_version, tenant_id, provenance_type, subject_id, statements,
artifacts, materials, build_info, payload_digest, signature_algorithm, signature,
key_id, created_at, expires_at, metadata)
VALUES (
@manifest_id, @schema_version, @tenant_id, @provenance_type, @subject_id, @statements::jsonb,
@artifacts::jsonb, @materials::jsonb, @build_info::jsonb, @payload_digest, @signature_algorithm, @signature,
@key_id, @created_at, @expires_at, @metadata::jsonb)
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresManifestRepository> _logger;
public PostgresManifestRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresManifestRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<SignedManifest> CreateAsync(SignedManifest manifest, CancellationToken cancellationToken = default)
{
await using var connection = await _dataSource.OpenConnectionAsync(manifest.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertManifestSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
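// The statements/artifacts/materials/build_info/metadata values are JSON
// documents passed as text; the ::jsonb casts in InsertManifestSql convert
// them server-side (equivalent to binding with NpgsqlDbType.Jsonb as the run
// repository does).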
command.Parameters.AddWithValue("manifest_id", manifest.ManifestId);
command.Parameters.AddWithValue("schema_version", manifest.SchemaVersion);
command.Parameters.AddWithValue("tenant_id", manifest.TenantId);
command.Parameters.AddWithValue("provenance_type", (int)manifest.ProvenanceType);
command.Parameters.AddWithValue("subject_id", manifest.SubjectId);
command.Parameters.AddWithValue("statements", manifest.Statements);
command.Parameters.AddWithValue("artifacts", manifest.Artifacts);
command.Parameters.AddWithValue("materials", manifest.Materials);
command.Parameters.AddWithValue("build_info", (object?)manifest.BuildInfo ?? DBNull.Value);
command.Parameters.AddWithValue("payload_digest", manifest.PayloadDigest);
command.Parameters.AddWithValue("signature_algorithm", manifest.SignatureAlgorithm);
command.Parameters.AddWithValue("signature", manifest.Signature);
command.Parameters.AddWithValue("key_id", manifest.KeyId);
command.Parameters.AddWithValue("created_at", manifest.CreatedAt);
command.Parameters.AddWithValue("expires_at", (object?)manifest.ExpiresAt ?? DBNull.Value);
command.Parameters.AddWithValue("metadata", (object?)manifest.Metadata ?? DBNull.Value);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ManifestCreated(manifest.TenantId, manifest.ProvenanceType.ToString());
_logger.LogDebug("Manifest {ManifestId} created for subject {SubjectId}", manifest.ManifestId, manifest.SubjectId);
return manifest;
}
public async Task<SignedManifest?> GetByIdAsync(string tenantId, Guid manifestId, CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectManifestColumns}
FROM signed_manifests
WHERE tenant_id = @tenant_id AND manifest_id = @manifest_id
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("manifest_id", manifestId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapManifest(reader);
}
public async Task<SignedManifest?> GetBySubjectAsync(
string tenantId,
ProvenanceType provenanceType,
Guid subjectId,
CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectManifestColumns}
FROM signed_manifests
WHERE tenant_id = @tenant_id
AND provenance_type = @provenance_type
AND subject_id = @subject_id
ORDER BY created_at DESC
LIMIT 1
""";
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("provenance_type", (int)provenanceType);
command.Parameters.AddWithValue("subject_id", subjectId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapManifest(reader);
}
public async Task<IReadOnlyList<SignedManifest>> ListAsync(
string tenantId,
ProvenanceType? provenanceType = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default)
{
var sb = new StringBuilder($"SELECT {SelectManifestColumns} FROM signed_manifests WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (provenanceType.HasValue)
{
sb.Append(" AND provenance_type = @provenance_type");
parameters.Add(("provenance_type", (int)provenanceType.Value));
}
sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sb.ToString(), connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var manifests = new List<SignedManifest>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
manifests.Add(MapManifest(reader));
}
return manifests;
}
public async Task<SignedManifest?> GetByPayloadDigestAsync(
string tenantId,
string payloadDigest,
CancellationToken cancellationToken = default)
{
var sql = $"""
SELECT {SelectManifestColumns}
FROM signed_manifests
WHERE tenant_id = @tenant_id AND payload_digest = @payload_digest
ORDER BY created_at DESC
LIMIT 1
"""
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("payload_digest", payloadDigest);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapManifest(reader);
}
private static SignedManifest MapManifest(NpgsqlDataReader reader)
{
return new SignedManifest(
ManifestId: reader.GetGuid(0),
SchemaVersion: reader.GetString(1),
TenantId: reader.GetString(2),
ProvenanceType: (ProvenanceType)reader.GetInt32(3),
SubjectId: reader.GetGuid(4),
Statements: reader.GetString(5),
Artifacts: reader.GetString(6),
Materials: reader.GetString(7),
BuildInfo: reader.IsDBNull(8) ? null : reader.GetString(8),
PayloadDigest: reader.GetString(9),
SignatureAlgorithm: reader.GetString(10),
Signature: reader.GetString(11),
KeyId: reader.GetString(12),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(13),
ExpiresAt: reader.IsDBNull(14) ? null : reader.GetFieldValue<DateTimeOffset>(14),
Metadata: reader.IsDBNull(15) ? null : reader.GetString(15));
}
}

View File

@@ -0,0 +1,434 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of the quota repository.
/// </summary>
public sealed class PostgresQuotaRepository : IQuotaRepository
{
private const string SelectQuotaColumns = """
quota_id, tenant_id, job_type, max_active, max_per_hour, burst_capacity,
refill_rate, current_tokens, last_refill_at, current_active, current_hour_count,
current_hour_start, paused, pause_reason, quota_ticket, created_at, updated_at, updated_by
""";
private const string SelectByIdSql = $"""
SELECT {SelectQuotaColumns}
FROM quotas
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string SelectByTenantAndJobTypeSql = $"""
SELECT {SelectQuotaColumns}
FROM quotas
WHERE tenant_id = @tenant_id AND (job_type = @job_type OR (job_type IS NULL AND @job_type IS NULL))
""";
private const string InsertQuotaSql = """
INSERT INTO quotas (
quota_id, tenant_id, job_type, max_active, max_per_hour, burst_capacity,
refill_rate, current_tokens, last_refill_at, current_active, current_hour_count,
current_hour_start, paused, pause_reason, quota_ticket, created_at, updated_at, updated_by)
VALUES (
@quota_id, @tenant_id, @job_type, @max_active, @max_per_hour, @burst_capacity,
@refill_rate, @current_tokens, @last_refill_at, @current_active, @current_hour_count,
@current_hour_start, @paused, @pause_reason, @quota_ticket, @created_at, @updated_at, @updated_by)
""";
private const string UpdateQuotaSql = """
UPDATE quotas
SET job_type = @job_type,
max_active = @max_active,
max_per_hour = @max_per_hour,
burst_capacity = @burst_capacity,
refill_rate = @refill_rate,
current_tokens = @current_tokens,
last_refill_at = @last_refill_at,
current_active = @current_active,
current_hour_count = @current_hour_count,
current_hour_start = @current_hour_start,
paused = @paused,
pause_reason = @pause_reason,
quota_ticket = @quota_ticket,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string UpdateStateSql = """
UPDATE quotas
SET current_tokens = @current_tokens,
last_refill_at = @last_refill_at,
current_active = @current_active,
current_hour_count = @current_hour_count,
current_hour_start = @current_hour_start,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string PauseQuotaSql = """
UPDATE quotas
SET paused = TRUE,
pause_reason = @pause_reason,
quota_ticket = @quota_ticket,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string ResumeQuotaSql = """
UPDATE quotas
SET paused = FALSE,
pause_reason = NULL,
quota_ticket = NULL,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string IncrementActiveSql = """
UPDATE quotas
SET current_active = current_active + 1,
updated_at = @updated_at
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string DecrementActiveSql = """
UPDATE quotas
SET current_active = GREATEST(current_active - 1, 0),
updated_at = @updated_at
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private const string DeleteQuotaSql = """
DELETE FROM quotas
WHERE tenant_id = @tenant_id AND quota_id = @quota_id
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresQuotaRepository> _logger;
public PostgresQuotaRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresQuotaRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Quota?> GetByIdAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapQuota(reader);
}
public async Task<Quota?> GetByTenantAndJobTypeAsync(string tenantId, string? jobType, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByTenantAndJobTypeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_type", (object?)jobType ?? DBNull.Value);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapQuota(reader);
}
public async Task CreateAsync(Quota quota, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(quota.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertQuotaSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddQuotaParameters(command, quota);
try
{
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.QuotaCreated(quota.TenantId, quota.JobType);
}
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
{
_logger.LogWarning("Duplicate quota for tenant {TenantId} job type {JobType}", quota.TenantId, quota.JobType);
throw new DuplicateQuotaException(quota.TenantId, quota.JobType, ex);
}
}
public async Task UpdateAsync(Quota quota, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(quota.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateQuotaSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", quota.TenantId);
command.Parameters.AddWithValue("quota_id", quota.QuotaId);
command.Parameters.AddWithValue("job_type", (object?)quota.JobType ?? DBNull.Value);
command.Parameters.AddWithValue("max_active", quota.MaxActive);
command.Parameters.AddWithValue("max_per_hour", quota.MaxPerHour);
command.Parameters.AddWithValue("burst_capacity", quota.BurstCapacity);
command.Parameters.AddWithValue("refill_rate", quota.RefillRate);
command.Parameters.AddWithValue("current_tokens", quota.CurrentTokens);
command.Parameters.AddWithValue("last_refill_at", quota.LastRefillAt);
command.Parameters.AddWithValue("current_active", quota.CurrentActive);
command.Parameters.AddWithValue("current_hour_count", quota.CurrentHourCount);
command.Parameters.AddWithValue("current_hour_start", quota.CurrentHourStart);
command.Parameters.AddWithValue("paused", quota.Paused);
command.Parameters.AddWithValue("pause_reason", (object?)quota.PauseReason ?? DBNull.Value);
command.Parameters.AddWithValue("quota_ticket", (object?)quota.QuotaTicket ?? DBNull.Value);
command.Parameters.AddWithValue("updated_at", quota.UpdatedAt);
command.Parameters.AddWithValue("updated_by", quota.UpdatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows == 0)
{
_logger.LogWarning("Quota not found for update: {QuotaId}", quota.QuotaId);
}
}
public async Task UpdateStateAsync(
string tenantId,
Guid quotaId,
double currentTokens,
DateTimeOffset lastRefillAt,
int currentActive,
int currentHourCount,
DateTimeOffset currentHourStart,
string updatedBy,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateStateSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
command.Parameters.AddWithValue("current_tokens", currentTokens);
command.Parameters.AddWithValue("last_refill_at", lastRefillAt);
command.Parameters.AddWithValue("current_active", currentActive);
command.Parameters.AddWithValue("current_hour_count", currentHourCount);
command.Parameters.AddWithValue("current_hour_start", currentHourStart);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
command.Parameters.AddWithValue("updated_by", updatedBy);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
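// Sketch of how a caller might use UpdateStateAsync (an assumption: the
// actual token-bucket logic lives in the core scheduler, not in this
// repository). Refill would be computed from the persisted columns before an
// admission check, roughly:
//
//   var elapsed = (now - quota.LastRefillAt).TotalSeconds;
//   var tokens = Math.Min(quota.BurstCapacity,
//       quota.CurrentTokens + elapsed * quota.RefillRate);
//   var admit = !quota.Paused && tokens >= 1 && quota.CurrentActive < quota.MaxActive;
//   if (admit) tokens -= 1;
//   await quotaRepository.UpdateStateAsync(
//       quota.TenantId, quota.QuotaId, tokens, now,
//       quota.CurrentActive, quota.CurrentHourCount, quota.CurrentHourStart,
//       "scheduler", ct);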
public async Task PauseAsync(string tenantId, Guid quotaId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(PauseQuotaSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
command.Parameters.AddWithValue("pause_reason", reason);
command.Parameters.AddWithValue("quota_ticket", (object?)ticket ?? DBNull.Value);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
command.Parameters.AddWithValue("updated_by", updatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.QuotaPaused(tenantId);
}
}
public async Task ResumeAsync(string tenantId, Guid quotaId, string updatedBy, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(ResumeQuotaSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
command.Parameters.AddWithValue("updated_by", updatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.QuotaResumed(tenantId);
}
}
public async Task IncrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(IncrementActiveSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
public async Task DecrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(DecrementActiveSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
public async Task<IReadOnlyList<Quota>> ListAsync(
string tenantId,
string? jobType,
bool? paused,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, jobType, paused, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var quotas = new List<Quota>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
quotas.Add(MapQuota(reader));
}
return quotas;
}
public async Task<bool> DeleteAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(DeleteQuotaSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("quota_id", quotaId);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
return rows > 0;
}
private static void AddQuotaParameters(NpgsqlCommand command, Quota quota)
{
command.Parameters.AddWithValue("quota_id", quota.QuotaId);
command.Parameters.AddWithValue("tenant_id", quota.TenantId);
command.Parameters.AddWithValue("job_type", (object?)quota.JobType ?? DBNull.Value);
command.Parameters.AddWithValue("max_active", quota.MaxActive);
command.Parameters.AddWithValue("max_per_hour", quota.MaxPerHour);
command.Parameters.AddWithValue("burst_capacity", quota.BurstCapacity);
command.Parameters.AddWithValue("refill_rate", quota.RefillRate);
command.Parameters.AddWithValue("current_tokens", quota.CurrentTokens);
command.Parameters.AddWithValue("last_refill_at", quota.LastRefillAt);
command.Parameters.AddWithValue("current_active", quota.CurrentActive);
command.Parameters.AddWithValue("current_hour_count", quota.CurrentHourCount);
command.Parameters.AddWithValue("current_hour_start", quota.CurrentHourStart);
command.Parameters.AddWithValue("paused", quota.Paused);
command.Parameters.AddWithValue("pause_reason", (object?)quota.PauseReason ?? DBNull.Value);
command.Parameters.AddWithValue("quota_ticket", (object?)quota.QuotaTicket ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", quota.CreatedAt);
command.Parameters.AddWithValue("updated_at", quota.UpdatedAt);
command.Parameters.AddWithValue("updated_by", quota.UpdatedBy);
}
private static Quota MapQuota(NpgsqlDataReader reader)
{
return new Quota(
QuotaId: reader.GetGuid(0),
TenantId: reader.GetString(1),
JobType: reader.IsDBNull(2) ? null : reader.GetString(2),
MaxActive: reader.GetInt32(3),
MaxPerHour: reader.GetInt32(4),
BurstCapacity: reader.GetInt32(5),
RefillRate: reader.GetDouble(6),
CurrentTokens: reader.GetDouble(7),
LastRefillAt: reader.GetFieldValue<DateTimeOffset>(8),
CurrentActive: reader.GetInt32(9),
CurrentHourCount: reader.GetInt32(10),
CurrentHourStart: reader.GetFieldValue<DateTimeOffset>(11),
Paused: reader.GetBoolean(12),
PauseReason: reader.IsDBNull(13) ? null : reader.GetString(13),
QuotaTicket: reader.IsDBNull(14) ? null : reader.GetString(14),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(15),
UpdatedAt: reader.GetFieldValue<DateTimeOffset>(16),
UpdatedBy: reader.GetString(17));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
string? jobType,
bool? paused,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectQuotaColumns} FROM quotas WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (jobType is not null)
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}
if (paused.HasValue)
{
sb.Append(" AND paused = @paused");
parameters.Add(("paused", paused.Value));
}
sb.Append(" ORDER BY job_type NULLS FIRST LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
/// <summary>
/// Exception thrown when attempting to create a duplicate quota.
/// </summary>
public sealed class DuplicateQuotaException : Exception
{
public string TenantId { get; }
public string? JobType { get; }
public DuplicateQuotaException(string tenantId, string? jobType, Exception innerException)
: base($"Quota for tenant '{tenantId}' and job type '{jobType ?? "(all)"}' already exists.", innerException)
{
TenantId = tenantId;
JobType = jobType;
}
}
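// Hypothetical handling at the API layer (not part of this file):
//
//   try { await quotaRepository.CreateAsync(quota, ct); }
//   catch (DuplicateQuotaException ex)
//   {
//       // map to 409 Conflict using ex.TenantId / ex.JobType
//   }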

View File

@@ -0,0 +1,199 @@
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.DeadLetter;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of the replay audit repository.
/// </summary>
public sealed class PostgresReplayAuditRepository : IReplayAuditRepository
{
private const string SelectAuditColumns = """
audit_id, tenant_id, entry_id, attempt_number,
success, new_job_id, error_message,
triggered_by, triggered_at, completed_at, initiated_by
""";
private const string SelectByEntrySql = $"""
SELECT {SelectAuditColumns}
FROM dead_letter_replay_audit
WHERE tenant_id = @tenant_id AND entry_id = @entry_id
ORDER BY attempt_number ASC
""";
private const string SelectByIdSql = $"""
SELECT {SelectAuditColumns}
FROM dead_letter_replay_audit
WHERE tenant_id = @tenant_id AND audit_id = @audit_id
""";
private const string SelectByNewJobIdSql = $"""
SELECT {SelectAuditColumns}
FROM dead_letter_replay_audit
WHERE tenant_id = @tenant_id AND new_job_id = @new_job_id
""";
private const string InsertAuditSql = """
INSERT INTO dead_letter_replay_audit (
audit_id, tenant_id, entry_id, attempt_number,
success, new_job_id, error_message,
triggered_by, triggered_at, completed_at, initiated_by)
VALUES (
@audit_id, @tenant_id, @entry_id, @attempt_number,
@success, @new_job_id, @error_message,
@triggered_by, @triggered_at, @completed_at, @initiated_by)
""";
private const string UpdateAuditSql = """
UPDATE dead_letter_replay_audit
SET success = @success,
new_job_id = @new_job_id,
error_message = @error_message,
completed_at = @completed_at
WHERE tenant_id = @tenant_id AND audit_id = @audit_id
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresReplayAuditRepository> _logger;
public PostgresReplayAuditRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresReplayAuditRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<IReadOnlyList<ReplayAuditRecord>> GetByEntryAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByEntrySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("entry_id", entryId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var records = new List<ReplayAuditRecord>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
records.Add(MapRecord(reader));
}
return records;
}
public async Task<ReplayAuditRecord?> GetByIdAsync(
string tenantId,
Guid auditId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("audit_id", auditId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapRecord(reader);
}
public async Task<ReplayAuditRecord?> GetByNewJobIdAsync(
string tenantId,
Guid newJobId,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByNewJobIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("new_job_id", newJobId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapRecord(reader);
}
public async Task CreateAsync(
ReplayAuditRecord record,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(record.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertAuditSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddParameters(command, record);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.DeadLetterReplayAttempted(record.TenantId, record.TriggeredBy);
}
public async Task<bool> UpdateAsync(
ReplayAuditRecord record,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(record.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateAuditSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", record.TenantId);
command.Parameters.AddWithValue("audit_id", record.AuditId);
command.Parameters.AddWithValue("success", record.Success);
command.Parameters.AddWithValue("new_job_id", (object?)record.NewJobId ?? DBNull.Value);
command.Parameters.AddWithValue("error_message", (object?)record.ErrorMessage ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)record.CompletedAt ?? DBNull.Value);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
if (record.Success)
{
OrchestratorMetrics.DeadLetterReplaySucceeded(record.TenantId);
}
else
{
OrchestratorMetrics.DeadLetterReplayFailed(record.TenantId);
}
}
return rows > 0;
}
private static void AddParameters(NpgsqlCommand command, ReplayAuditRecord record)
{
command.Parameters.AddWithValue("audit_id", record.AuditId);
command.Parameters.AddWithValue("tenant_id", record.TenantId);
command.Parameters.AddWithValue("entry_id", record.EntryId);
command.Parameters.AddWithValue("attempt_number", record.AttemptNumber);
command.Parameters.AddWithValue("success", record.Success);
command.Parameters.AddWithValue("new_job_id", (object?)record.NewJobId ?? DBNull.Value);
command.Parameters.AddWithValue("error_message", (object?)record.ErrorMessage ?? DBNull.Value);
command.Parameters.AddWithValue("triggered_by", record.TriggeredBy);
command.Parameters.AddWithValue("triggered_at", record.TriggeredAt);
command.Parameters.AddWithValue("completed_at", (object?)record.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("initiated_by", record.InitiatedBy);
}
private static ReplayAuditRecord MapRecord(NpgsqlDataReader reader) =>
new(
AuditId: reader.GetGuid(0),
TenantId: reader.GetString(1),
EntryId: reader.GetGuid(2),
AttemptNumber: reader.GetInt32(3),
Success: reader.GetBoolean(4),
NewJobId: reader.IsDBNull(5) ? null : reader.GetGuid(5),
ErrorMessage: reader.IsDBNull(6) ? null : reader.GetString(6),
TriggeredBy: reader.GetString(7),
TriggeredAt: reader.GetFieldValue<DateTimeOffset>(8),
CompletedAt: reader.IsDBNull(9) ? null : reader.GetFieldValue<DateTimeOffset>(9),
InitiatedBy: reader.GetString(10));
}

View File

@@ -0,0 +1,388 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of the run repository.
/// </summary>
public sealed class PostgresRunRepository : IRunRepository
{
private const string SelectRunColumns = """
run_id, tenant_id, project_id, source_id, run_type, status, correlation_id,
total_jobs, completed_jobs, succeeded_jobs, failed_jobs, created_at,
started_at, completed_at, created_by, metadata
""";
private const string SelectByIdSql = $"""
SELECT {SelectRunColumns}
FROM runs
WHERE tenant_id = @tenant_id AND run_id = @run_id
""";
private const string InsertRunSql = """
INSERT INTO runs (
run_id, tenant_id, project_id, source_id, run_type, status, correlation_id,
total_jobs, completed_jobs, succeeded_jobs, failed_jobs, created_at,
started_at, completed_at, created_by, metadata)
VALUES (
@run_id, @tenant_id, @project_id, @source_id, @run_type, @status::run_status, @correlation_id,
@total_jobs, @completed_jobs, @succeeded_jobs, @failed_jobs, @created_at,
@started_at, @completed_at, @created_by, @metadata)
""";
private const string UpdateStatusSql = """
UPDATE runs
SET status = @status::run_status,
total_jobs = @total_jobs,
completed_jobs = @completed_jobs,
succeeded_jobs = @succeeded_jobs,
failed_jobs = @failed_jobs,
started_at = @started_at,
completed_at = @completed_at
WHERE tenant_id = @tenant_id AND run_id = @run_id
""";
private const string IncrementJobCountsSql = """
UPDATE runs
SET completed_jobs = completed_jobs + 1,
succeeded_jobs = CASE WHEN @succeeded THEN succeeded_jobs + 1 ELSE succeeded_jobs END,
failed_jobs = CASE WHEN NOT @succeeded THEN failed_jobs + 1 ELSE failed_jobs END,
started_at = COALESCE(started_at, @now),
status = CASE
WHEN completed_jobs + 1 >= total_jobs THEN
CASE
WHEN @succeeded AND failed_jobs = 0 THEN 'succeeded'::run_status
WHEN NOT @succeeded AND succeeded_jobs = 0 THEN 'failed'::run_status
ELSE 'partially_succeeded'::run_status
END
ELSE 'running'::run_status
END,
completed_at = CASE WHEN completed_jobs + 1 >= total_jobs THEN @now ELSE completed_at END
WHERE tenant_id = @tenant_id AND run_id = @run_id
RETURNING status
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresRunRepository> _logger;
public PostgresRunRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresRunRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Run?> GetByIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("run_id", runId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapRun(reader);
}
public async Task CreateAsync(Run run, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(run.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertRunSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddRunParameters(command, run);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.RunCreated(run.TenantId, run.RunType);
}
public async Task UpdateStatusAsync(
string tenantId,
Guid runId,
RunStatus status,
int totalJobs,
int completedJobs,
int succeededJobs,
int failedJobs,
DateTimeOffset? startedAt,
DateTimeOffset? completedAt,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateStatusSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("run_id", runId);
command.Parameters.AddWithValue("status", StatusToString(status));
command.Parameters.AddWithValue("total_jobs", totalJobs);
command.Parameters.AddWithValue("completed_jobs", completedJobs);
command.Parameters.AddWithValue("succeeded_jobs", succeededJobs);
command.Parameters.AddWithValue("failed_jobs", failedJobs);
command.Parameters.AddWithValue("started_at", (object?)startedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)completedAt ?? DBNull.Value);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
public async Task IncrementJobCountsAsync(
string tenantId,
Guid runId,
bool succeeded,
CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(IncrementJobCountsSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("run_id", runId);
command.Parameters.AddWithValue("succeeded", succeeded);
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
var newStatus = reader.GetString(0);
if (newStatus is "succeeded" or "failed" or "partially_succeeded")
{
// Run completed - get the full run for metrics
var run = await GetByIdAsync(tenantId, runId, cancellationToken).ConfigureAwait(false);
if (run is not null)
{
OrchestratorMetrics.RunCompleted(tenantId, run.RunType, newStatus);
}
}
}
}
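// Hypothetical call site: a job-completion handler reports each finished job
// and lets the SQL above decide whether the run just reached a terminal state:
//
//   await runRepository.IncrementJobCountsAsync(job.TenantId, job.RunId, succeeded: true, ct);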
public async Task<IReadOnlyList<Run>> ListAsync(
string tenantId,
Guid? sourceId,
string? runType,
RunStatus? status,
string? projectId,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, sourceId, runType, status, projectId, createdAfter, createdBefore, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var runs = new List<Run>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
runs.Add(MapRun(reader));
}
return runs;
}
public async Task<int> CountAsync(
string tenantId,
Guid? sourceId,
string? runType,
RunStatus? status,
string? projectId,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildCountQuery(tenantId, sourceId, runType, status, projectId);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
var result = await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
return Convert.ToInt32(result);
}
private static void AddRunParameters(NpgsqlCommand command, Run run)
{
command.Parameters.AddWithValue("run_id", run.RunId);
command.Parameters.AddWithValue("tenant_id", run.TenantId);
command.Parameters.AddWithValue("project_id", (object?)run.ProjectId ?? DBNull.Value);
command.Parameters.AddWithValue("source_id", run.SourceId);
command.Parameters.AddWithValue("run_type", run.RunType);
command.Parameters.AddWithValue("status", StatusToString(run.Status));
command.Parameters.AddWithValue("correlation_id", (object?)run.CorrelationId ?? DBNull.Value);
command.Parameters.AddWithValue("total_jobs", run.TotalJobs);
command.Parameters.AddWithValue("completed_jobs", run.CompletedJobs);
command.Parameters.AddWithValue("succeeded_jobs", run.SucceededJobs);
command.Parameters.AddWithValue("failed_jobs", run.FailedJobs);
command.Parameters.AddWithValue("created_at", run.CreatedAt);
command.Parameters.AddWithValue("started_at", (object?)run.StartedAt ?? DBNull.Value);
command.Parameters.AddWithValue("completed_at", (object?)run.CompletedAt ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", run.CreatedBy);
command.Parameters.Add(new NpgsqlParameter("metadata", NpgsqlDbType.Jsonb)
{
Value = (object?)run.Metadata ?? DBNull.Value
});
}
private static Run MapRun(NpgsqlDataReader reader)
{
return new Run(
RunId: reader.GetGuid(0),
TenantId: reader.GetString(1),
ProjectId: reader.IsDBNull(2) ? null : reader.GetString(2),
SourceId: reader.GetGuid(3),
RunType: reader.GetString(4),
Status: ParseStatus(reader.GetString(5)),
CorrelationId: reader.IsDBNull(6) ? null : reader.GetString(6),
TotalJobs: reader.GetInt32(7),
CompletedJobs: reader.GetInt32(8),
SucceededJobs: reader.GetInt32(9),
FailedJobs: reader.GetInt32(10),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(11),
StartedAt: reader.IsDBNull(12) ? null : reader.GetFieldValue<DateTimeOffset>(12),
CompletedAt: reader.IsDBNull(13) ? null : reader.GetFieldValue<DateTimeOffset>(13),
CreatedBy: reader.GetString(14),
Metadata: reader.IsDBNull(15) ? null : reader.GetString(15));
}
private static string StatusToString(RunStatus status) => status switch
{
RunStatus.Pending => "pending",
RunStatus.Running => "running",
RunStatus.Succeeded => "succeeded",
RunStatus.PartiallySucceeded => "partially_succeeded",
RunStatus.Failed => "failed",
RunStatus.Canceled => "canceled",
_ => throw new ArgumentOutOfRangeException(nameof(status), status, "Unknown RunStatus value.")
};
private static RunStatus ParseStatus(string status) => status switch
{
"pending" => RunStatus.Pending,
"running" => RunStatus.Running,
"succeeded" => RunStatus.Succeeded,
"partially_succeeded" => RunStatus.PartiallySucceeded,
"failed" => RunStatus.Failed,
"canceled" => RunStatus.Canceled,
_ => throw new ArgumentOutOfRangeException(nameof(status), status, "Unrecognized run_status value.")
};
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
Guid? sourceId,
string? runType,
RunStatus? status,
string? projectId,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectRunColumns} FROM runs WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (!string.IsNullOrEmpty(runType))
{
sb.Append(" AND run_type = @run_type");
parameters.Add(("run_type", runType));
}
if (status.HasValue)
{
sb.Append(" AND status = @status::run_status");
parameters.Add(("status", StatusToString(status.Value)));
}
if (!string.IsNullOrEmpty(projectId))
{
sb.Append(" AND project_id = @project_id");
parameters.Add(("project_id", projectId));
}
if (createdAfter.HasValue)
{
sb.Append(" AND created_at >= @created_after");
parameters.Add(("created_after", createdAfter.Value));
}
if (createdBefore.HasValue)
{
sb.Append(" AND created_at < @created_before");
parameters.Add(("created_before", createdBefore.Value));
}
sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
private static (string sql, List<(string name, object value)> parameters) BuildCountQuery(
string tenantId,
Guid? sourceId,
string? runType,
RunStatus? status,
string? projectId)
{
var sb = new StringBuilder();
sb.Append("SELECT COUNT(*) FROM runs WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (!string.IsNullOrEmpty(runType))
{
sb.Append(" AND run_type = @run_type");
parameters.Add(("run_type", runType));
}
if (status.HasValue)
{
sb.Append(" AND status = @status::run_status");
parameters.Add(("status", StatusToString(status.Value)));
}
if (!string.IsNullOrEmpty(projectId))
{
sb.Append(" AND project_id = @project_id");
parameters.Add(("project_id", projectId));
}
return (sb.ToString(), parameters);
}
}

View File

@@ -0,0 +1,314 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using NpgsqlTypes;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of the source repository.
/// </summary>
public sealed class PostgresSourceRepository : ISourceRepository
{
private const string SelectSourceColumns = """
source_id, tenant_id, name, source_type, enabled, paused, pause_reason,
pause_ticket, configuration, created_at, updated_at, updated_by
""";
private const string SelectByIdSql = $"""
SELECT {SelectSourceColumns}
FROM sources
WHERE tenant_id = @tenant_id AND source_id = @source_id
""";
private const string SelectByNameSql = $"""
SELECT {SelectSourceColumns}
FROM sources
WHERE tenant_id = @tenant_id AND name = @name
""";
private const string InsertSourceSql = """
INSERT INTO sources (
source_id, tenant_id, name, source_type, enabled, paused, pause_reason,
pause_ticket, configuration, created_at, updated_at, updated_by)
VALUES (
@source_id, @tenant_id, @name, @source_type, @enabled, @paused, @pause_reason,
@pause_ticket, @configuration, @created_at, @updated_at, @updated_by)
""";
private const string UpdateSourceSql = """
UPDATE sources
SET name = @name,
source_type = @source_type,
enabled = @enabled,
paused = @paused,
pause_reason = @pause_reason,
pause_ticket = @pause_ticket,
configuration = @configuration,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND source_id = @source_id
""";
private const string PauseSourceSql = """
UPDATE sources
SET paused = TRUE,
pause_reason = @pause_reason,
pause_ticket = @pause_ticket,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND source_id = @source_id
""";
private const string ResumeSourceSql = """
UPDATE sources
SET paused = FALSE,
pause_reason = NULL,
pause_ticket = NULL,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND source_id = @source_id
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresSourceRepository> _logger;
public PostgresSourceRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresSourceRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Source?> GetByIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapSource(reader);
}
public async Task<Source?> GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByNameSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("name", name);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapSource(reader);
}
public async Task CreateAsync(Source source, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(source.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertSourceSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddSourceParameters(command, source);
try
{
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.SourceCreated(source.TenantId, source.SourceType);
}
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
{
_logger.LogWarning("Duplicate source name: {Name}", source.Name);
throw new DuplicateSourceException(source.Name, ex);
}
}
public async Task UpdateAsync(Source source, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(source.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateSourceSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", source.TenantId);
command.Parameters.AddWithValue("source_id", source.SourceId);
command.Parameters.AddWithValue("name", source.Name);
command.Parameters.AddWithValue("source_type", source.SourceType);
command.Parameters.AddWithValue("enabled", source.Enabled);
command.Parameters.AddWithValue("paused", source.Paused);
command.Parameters.AddWithValue("pause_reason", (object?)source.PauseReason ?? DBNull.Value);
command.Parameters.AddWithValue("pause_ticket", (object?)source.PauseTicket ?? DBNull.Value);
command.Parameters.Add(new NpgsqlParameter("configuration", NpgsqlDbType.Jsonb)
{
Value = (object?)source.Configuration ?? DBNull.Value
});
command.Parameters.AddWithValue("updated_at", source.UpdatedAt);
command.Parameters.AddWithValue("updated_by", source.UpdatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows == 0)
{
_logger.LogWarning("Source not found for update: {SourceId}", source.SourceId);
}
}
public async Task PauseAsync(string tenantId, Guid sourceId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(PauseSourceSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
command.Parameters.AddWithValue("pause_reason", reason);
command.Parameters.AddWithValue("pause_ticket", (object?)ticket ?? DBNull.Value);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
command.Parameters.AddWithValue("updated_by", updatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.SourcePaused(tenantId);
}
}
public async Task ResumeAsync(string tenantId, Guid sourceId, string updatedBy, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(ResumeSourceSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
command.Parameters.AddWithValue("updated_at", DateTimeOffset.UtcNow);
command.Parameters.AddWithValue("updated_by", updatedBy);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.SourceResumed(tenantId);
}
}
public async Task<IReadOnlyList<Source>> ListAsync(
string tenantId,
string? sourceType,
bool? enabled,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, sourceType, enabled, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var sources = new List<Source>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
sources.Add(MapSource(reader));
}
return sources;
}
private static void AddSourceParameters(NpgsqlCommand command, Source source)
{
command.Parameters.AddWithValue("source_id", source.SourceId);
command.Parameters.AddWithValue("tenant_id", source.TenantId);
command.Parameters.AddWithValue("name", source.Name);
command.Parameters.AddWithValue("source_type", source.SourceType);
command.Parameters.AddWithValue("enabled", source.Enabled);
command.Parameters.AddWithValue("paused", source.Paused);
command.Parameters.AddWithValue("pause_reason", (object?)source.PauseReason ?? DBNull.Value);
command.Parameters.AddWithValue("pause_ticket", (object?)source.PauseTicket ?? DBNull.Value);
command.Parameters.Add(new NpgsqlParameter("configuration", NpgsqlDbType.Jsonb)
{
Value = (object?)source.Configuration ?? DBNull.Value
});
command.Parameters.AddWithValue("created_at", source.CreatedAt);
command.Parameters.AddWithValue("updated_at", source.UpdatedAt);
command.Parameters.AddWithValue("updated_by", source.UpdatedBy);
}
private static Source MapSource(NpgsqlDataReader reader)
{
return new Source(
SourceId: reader.GetGuid(0),
TenantId: reader.GetString(1),
Name: reader.GetString(2),
SourceType: reader.GetString(3),
Enabled: reader.GetBoolean(4),
Paused: reader.GetBoolean(5),
PauseReason: reader.IsDBNull(6) ? null : reader.GetString(6),
PauseTicket: reader.IsDBNull(7) ? null : reader.GetString(7),
Configuration: reader.IsDBNull(8) ? null : reader.GetString(8),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(9),
UpdatedAt: reader.GetFieldValue<DateTimeOffset>(10),
UpdatedBy: reader.GetString(11));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
string? sourceType,
bool? enabled,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectSourceColumns} FROM sources WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (!string.IsNullOrEmpty(sourceType))
{
sb.Append(" AND source_type = @source_type");
parameters.Add(("source_type", sourceType));
}
if (enabled.HasValue)
{
sb.Append(" AND enabled = @enabled");
parameters.Add(("enabled", enabled.Value));
}
sb.Append(" ORDER BY name LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
/// <summary>
/// Exception thrown when attempting to create a source with a duplicate name.
/// </summary>
public sealed class DuplicateSourceException : Exception
{
public string Name { get; }
public DuplicateSourceException(string name, Exception innerException)
: base($"Source with name '{name}' already exists.", innerException)
{
Name = name;
}
}
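
A minimal usage sketch, assuming the Source positional record shape implied by MapSource above; the SourceProvisioner name and the reuse-on-duplicate policy are illustrative, not part of this commit.

using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Postgres;
using StellaOps.Orchestrator.Infrastructure.Repositories;

public sealed class SourceProvisioner
{
    private readonly ISourceRepository _sources;

    public SourceProvisioner(ISourceRepository sources)
        => _sources = sources ?? throw new ArgumentNullException(nameof(sources));

    public async Task<Source> ProvisionAsync(
        string tenantId, string name, string sourceType, string actor, CancellationToken cancellationToken)
    {
        var now = DateTimeOffset.UtcNow;
        var source = new Source(
            SourceId: Guid.NewGuid(),
            TenantId: tenantId,
            Name: name,
            SourceType: sourceType,
            Enabled: true,
            Paused: false,
            PauseReason: null,
            PauseTicket: null,
            Configuration: null,
            CreatedAt: now,
            UpdatedAt: now,
            UpdatedBy: actor);

        try
        {
            await _sources.CreateAsync(source, cancellationToken);
            return source;
        }
        catch (DuplicateSourceException)
        {
            // Another caller won the insert race; reuse the existing row.
            var existing = await _sources.GetByNameAsync(tenantId, name, cancellationToken);
            if (existing is null)
            {
                throw; // raced with a concurrent delete; let the caller retry
            }

            return existing;
        }
    }
}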

@@ -0,0 +1,310 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of throttle repository.
/// </summary>
public sealed class PostgresThrottleRepository : IThrottleRepository
{
private const string SelectThrottleColumns = """
throttle_id, tenant_id, source_id, job_type, active, reason, ticket,
created_at, expires_at, created_by
""";
private const string SelectByIdSql = $"""
SELECT {SelectThrottleColumns}
FROM throttles
WHERE tenant_id = @tenant_id AND throttle_id = @throttle_id
""";
private const string SelectActiveBySourceSql = $"""
SELECT {SelectThrottleColumns}
FROM throttles
WHERE tenant_id = @tenant_id
AND source_id = @source_id
AND active = TRUE
AND (expires_at IS NULL OR expires_at > @now)
ORDER BY created_at DESC
""";
private const string SelectActiveByJobTypeSql = $"""
SELECT {SelectThrottleColumns}
FROM throttles
WHERE tenant_id = @tenant_id
AND job_type = @job_type
AND active = TRUE
AND (expires_at IS NULL OR expires_at > @now)
ORDER BY created_at DESC
""";
private const string InsertThrottleSql = """
INSERT INTO throttles (
throttle_id, tenant_id, source_id, job_type, active, reason, ticket,
created_at, expires_at, created_by)
VALUES (
@throttle_id, @tenant_id, @source_id, @job_type, @active, @reason, @ticket,
@created_at, @expires_at, @created_by)
""";
private const string DeactivateSql = """
UPDATE throttles
SET active = FALSE
WHERE tenant_id = @tenant_id AND throttle_id = @throttle_id
""";
private const string DeactivateBySourceSql = """
UPDATE throttles
SET active = FALSE
WHERE tenant_id = @tenant_id AND source_id = @source_id AND active = TRUE
""";
private const string DeactivateByJobTypeSql = """
UPDATE throttles
SET active = FALSE
WHERE tenant_id = @tenant_id AND job_type = @job_type AND active = TRUE
""";
private const string CleanupExpiredSql = """
UPDATE throttles
SET active = FALSE
WHERE active = TRUE AND expires_at IS NOT NULL AND expires_at <= @now
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresThrottleRepository> _logger;
public PostgresThrottleRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresThrottleRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Throttle?> GetByIdAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("throttle_id", throttleId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapThrottle(reader);
}
public async Task<IReadOnlyList<Throttle>> GetActiveBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectActiveBySourceSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var throttles = new List<Throttle>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
throttles.Add(MapThrottle(reader));
}
return throttles;
}
public async Task<IReadOnlyList<Throttle>> GetActiveByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectActiveByJobTypeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_type", jobType);
command.Parameters.AddWithValue("now", DateTimeOffset.UtcNow);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var throttles = new List<Throttle>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
throttles.Add(MapThrottle(reader));
}
return throttles;
}
public async Task CreateAsync(Throttle throttle, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(throttle.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertThrottleSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("throttle_id", throttle.ThrottleId);
command.Parameters.AddWithValue("tenant_id", throttle.TenantId);
command.Parameters.AddWithValue("source_id", (object?)throttle.SourceId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", (object?)throttle.JobType ?? DBNull.Value);
command.Parameters.AddWithValue("active", throttle.Active);
command.Parameters.AddWithValue("reason", throttle.Reason);
command.Parameters.AddWithValue("ticket", (object?)throttle.Ticket ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", throttle.CreatedAt);
command.Parameters.AddWithValue("expires_at", (object?)throttle.ExpiresAt ?? DBNull.Value);
command.Parameters.AddWithValue("created_by", throttle.CreatedBy);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.ThrottleCreated(throttle.TenantId, throttle.Reason);
}
public async Task DeactivateAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(DeactivateSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("throttle_id", throttleId);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.ThrottleDeactivated(tenantId);
}
}
public async Task DeactivateBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(DeactivateBySourceSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
_logger.LogInformation("Deactivated {Count} throttles for source {SourceId}", rows, sourceId);
}
}
public async Task DeactivateByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(DeactivateByJobTypeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_type", jobType);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
_logger.LogInformation("Deactivated {Count} throttles for job type {JobType}", rows, jobType);
}
}
public async Task<int> CleanupExpiredAsync(DateTimeOffset now, CancellationToken cancellationToken)
{
// Use system tenant for cross-tenant cleanup operations
// In production, this should use a dedicated admin connection or be run by a background service
await using var connection = await _dataSource.OpenConnectionAsync("system", "admin", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(CleanupExpiredSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("now", now);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
_logger.LogInformation("Cleaned up {Count} expired throttles", rows);
}
return rows;
}
public async Task<IReadOnlyList<Throttle>> ListAsync(
string tenantId,
bool? active,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, active, sourceId, jobType, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var throttles = new List<Throttle>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
throttles.Add(MapThrottle(reader));
}
return throttles;
}
private static Throttle MapThrottle(NpgsqlDataReader reader)
{
return new Throttle(
ThrottleId: reader.GetGuid(0),
TenantId: reader.GetString(1),
SourceId: reader.IsDBNull(2) ? null : reader.GetGuid(2),
JobType: reader.IsDBNull(3) ? null : reader.GetString(3),
Active: reader.GetBoolean(4),
Reason: reader.GetString(5),
Ticket: reader.IsDBNull(6) ? null : reader.GetString(6),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(7),
ExpiresAt: reader.IsDBNull(8) ? null : reader.GetFieldValue<DateTimeOffset>(8),
CreatedBy: reader.GetString(9));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
bool? active,
Guid? sourceId,
string? jobType,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectThrottleColumns} FROM throttles WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (active.HasValue)
{
sb.Append(" AND active = @active");
parameters.Add(("active", active.Value));
}
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (!string.IsNullOrEmpty(jobType))
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}
sb.Append(" ORDER BY created_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
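
The comment on CleanupExpiredAsync alludes to a background sweeper; here is a minimal sketch of one on top of the generic host. The ThrottleCleanupService name and the five-minute period are assumptions.

using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Orchestrator.Infrastructure.Repositories;

public sealed class ThrottleCleanupService : BackgroundService
{
    private static readonly TimeSpan Period = TimeSpan.FromMinutes(5);
    private readonly IThrottleRepository _throttles;
    private readonly ILogger<ThrottleCleanupService> _logger;

    public ThrottleCleanupService(IThrottleRepository throttles, ILogger<ThrottleCleanupService> logger)
    {
        _throttles = throttles;
        _logger = logger;
    }

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        using var timer = new PeriodicTimer(Period);
        while (await timer.WaitForNextTickAsync(stoppingToken))
        {
            try
            {
                // Deactivates all throttles whose expires_at has passed, across tenants.
                await _throttles.CleanupExpiredAsync(DateTimeOffset.UtcNow, stoppingToken);
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                _logger.LogError(ex, "Throttle cleanup sweep failed; will retry on the next tick");
            }
        }
    }
}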

@@ -0,0 +1,386 @@
using System.Text;
using Microsoft.Extensions.Logging;
using Npgsql;
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure.Postgres;
/// <summary>
/// PostgreSQL implementation of watermark repository.
/// </summary>
public sealed class PostgresWatermarkRepository : IWatermarkRepository
{
private const string SelectWatermarkColumns = """
watermark_id, tenant_id, source_id, job_type, scope_key,
high_watermark, low_watermark, sequence_number, processed_count,
last_batch_hash, created_at, updated_at, updated_by
""";
private const string SelectByScopeKeySql = $"""
SELECT {SelectWatermarkColumns}
FROM watermarks
WHERE tenant_id = @tenant_id AND scope_key = @scope_key
""";
private const string SelectBySourceIdSql = $"""
SELECT {SelectWatermarkColumns}
FROM watermarks
WHERE tenant_id = @tenant_id AND source_id = @source_id AND job_type IS NULL
""";
private const string SelectByJobTypeSql = $"""
SELECT {SelectWatermarkColumns}
FROM watermarks
WHERE tenant_id = @tenant_id AND job_type = @job_type AND source_id IS NULL
""";
private const string SelectBySourceAndJobTypeSql = $"""
SELECT {SelectWatermarkColumns}
FROM watermarks
WHERE tenant_id = @tenant_id AND source_id = @source_id AND job_type = @job_type
""";
private const string InsertWatermarkSql = """
INSERT INTO watermarks (
watermark_id, tenant_id, source_id, job_type, scope_key,
high_watermark, low_watermark, sequence_number, processed_count,
last_batch_hash, created_at, updated_at, updated_by)
VALUES (
@watermark_id, @tenant_id, @source_id, @job_type, @scope_key,
@high_watermark, @low_watermark, @sequence_number, @processed_count,
@last_batch_hash, @created_at, @updated_at, @updated_by)
""";
private const string UpdateWatermarkSql = """
UPDATE watermarks
SET high_watermark = @high_watermark,
low_watermark = @low_watermark,
sequence_number = @sequence_number,
processed_count = @processed_count,
last_batch_hash = @last_batch_hash,
updated_at = @updated_at,
updated_by = @updated_by
WHERE tenant_id = @tenant_id AND watermark_id = @watermark_id
AND sequence_number = @expected_sequence_number
""";
private const string UpsertWatermarkSql = """
INSERT INTO watermarks (
watermark_id, tenant_id, source_id, job_type, scope_key,
high_watermark, low_watermark, sequence_number, processed_count,
last_batch_hash, created_at, updated_at, updated_by)
VALUES (
@watermark_id, @tenant_id, @source_id, @job_type, @scope_key,
@high_watermark, @low_watermark, @sequence_number, @processed_count,
@last_batch_hash, @created_at, @updated_at, @updated_by)
ON CONFLICT (tenant_id, scope_key) DO UPDATE
SET high_watermark = EXCLUDED.high_watermark,
low_watermark = EXCLUDED.low_watermark,
sequence_number = EXCLUDED.sequence_number,
processed_count = EXCLUDED.processed_count,
last_batch_hash = EXCLUDED.last_batch_hash,
updated_at = EXCLUDED.updated_at,
updated_by = EXCLUDED.updated_by
""";
private const string DeleteWatermarkSql = """
DELETE FROM watermarks
WHERE tenant_id = @tenant_id AND scope_key = @scope_key
""";
private const string SelectLaggingSql = $"""
SELECT {SelectWatermarkColumns}
FROM watermarks
WHERE tenant_id = @tenant_id
AND high_watermark < @cutoff
ORDER BY high_watermark ASC
LIMIT @limit
""";
private readonly OrchestratorDataSource _dataSource;
private readonly ILogger<PostgresWatermarkRepository> _logger;
public PostgresWatermarkRepository(
OrchestratorDataSource dataSource,
ILogger<PostgresWatermarkRepository> logger)
{
_dataSource = dataSource ?? throw new ArgumentNullException(nameof(dataSource));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<Watermark?> GetByScopeKeyAsync(string tenantId, string scopeKey, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByScopeKeySql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapWatermark(reader);
}
public async Task<Watermark?> GetBySourceIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectBySourceIdSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapWatermark(reader);
}
public async Task<Watermark?> GetByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectByJobTypeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("job_type", jobType);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapWatermark(reader);
}
public async Task<Watermark?> GetBySourceAndJobTypeAsync(string tenantId, Guid sourceId, string jobType, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectBySourceAndJobTypeSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("source_id", sourceId);
command.Parameters.AddWithValue("job_type", jobType);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
if (!await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
return null;
}
return MapWatermark(reader);
}
public async Task CreateAsync(Watermark watermark, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(watermark.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(InsertWatermarkSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddWatermarkParameters(command, watermark);
try
{
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.WatermarkCreated(watermark.TenantId, watermark.ScopeKey);
}
catch (PostgresException ex) when (string.Equals(ex.SqlState, PostgresErrorCodes.UniqueViolation, StringComparison.Ordinal))
{
_logger.LogWarning("Duplicate watermark for tenant {TenantId} scope {ScopeKey}", watermark.TenantId, watermark.ScopeKey);
throw new DuplicateWatermarkException(watermark.TenantId, watermark.ScopeKey, ex);
}
}
public async Task<bool> UpdateAsync(Watermark watermark, long expectedSequenceNumber, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(watermark.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpdateWatermarkSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", watermark.TenantId);
command.Parameters.AddWithValue("watermark_id", watermark.WatermarkId);
command.Parameters.AddWithValue("high_watermark", watermark.HighWatermark);
command.Parameters.AddWithValue("low_watermark", (object?)watermark.LowWatermark ?? DBNull.Value);
command.Parameters.AddWithValue("sequence_number", watermark.SequenceNumber);
command.Parameters.AddWithValue("processed_count", watermark.ProcessedCount);
command.Parameters.AddWithValue("last_batch_hash", (object?)watermark.LastBatchHash ?? DBNull.Value);
command.Parameters.AddWithValue("updated_at", watermark.UpdatedAt);
command.Parameters.AddWithValue("updated_by", watermark.UpdatedBy);
command.Parameters.AddWithValue("expected_sequence_number", expectedSequenceNumber);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
if (rows > 0)
{
OrchestratorMetrics.WatermarkAdvanced(watermark.TenantId, watermark.ScopeKey);
}
return rows > 0;
}
public async Task UpsertAsync(Watermark watermark, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(watermark.TenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(UpsertWatermarkSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
AddWatermarkParameters(command, watermark);
await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
OrchestratorMetrics.WatermarkAdvanced(watermark.TenantId, watermark.ScopeKey);
}
public async Task<IReadOnlyList<Watermark>> ListAsync(
string tenantId,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken)
{
var (sql, parameters) = BuildListQuery(tenantId, sourceId, jobType, limit, offset);
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(sql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
foreach (var (name, value) in parameters)
{
command.Parameters.AddWithValue(name, value);
}
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var watermarks = new List<Watermark>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
watermarks.Add(MapWatermark(reader));
}
return watermarks;
}
public async Task<IReadOnlyList<Watermark>> GetLaggingAsync(
string tenantId,
TimeSpan lagThreshold,
int limit,
CancellationToken cancellationToken)
{
var cutoff = DateTimeOffset.UtcNow - lagThreshold; // rows whose high_watermark is older than this are lagging
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "reader", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(SelectLaggingSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("lag_threshold", thresholdTime);
command.Parameters.AddWithValue("limit", limit);
await using var reader = await command.ExecuteReaderAsync(cancellationToken).ConfigureAwait(false);
var watermarks = new List<Watermark>();
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
{
watermarks.Add(MapWatermark(reader));
}
return watermarks;
}
public async Task<bool> DeleteAsync(string tenantId, string scopeKey, CancellationToken cancellationToken)
{
await using var connection = await _dataSource.OpenConnectionAsync(tenantId, "writer", cancellationToken).ConfigureAwait(false);
await using var command = new NpgsqlCommand(DeleteWatermarkSql, connection);
command.CommandTimeout = _dataSource.CommandTimeoutSeconds;
command.Parameters.AddWithValue("tenant_id", tenantId);
command.Parameters.AddWithValue("scope_key", scopeKey);
var rows = await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
return rows > 0;
}
private static void AddWatermarkParameters(NpgsqlCommand command, Watermark watermark)
{
command.Parameters.AddWithValue("watermark_id", watermark.WatermarkId);
command.Parameters.AddWithValue("tenant_id", watermark.TenantId);
command.Parameters.AddWithValue("source_id", (object?)watermark.SourceId ?? DBNull.Value);
command.Parameters.AddWithValue("job_type", (object?)watermark.JobType ?? DBNull.Value);
command.Parameters.AddWithValue("scope_key", watermark.ScopeKey);
command.Parameters.AddWithValue("high_watermark", watermark.HighWatermark);
command.Parameters.AddWithValue("low_watermark", (object?)watermark.LowWatermark ?? DBNull.Value);
command.Parameters.AddWithValue("sequence_number", watermark.SequenceNumber);
command.Parameters.AddWithValue("processed_count", watermark.ProcessedCount);
command.Parameters.AddWithValue("last_batch_hash", (object?)watermark.LastBatchHash ?? DBNull.Value);
command.Parameters.AddWithValue("created_at", watermark.CreatedAt);
command.Parameters.AddWithValue("updated_at", watermark.UpdatedAt);
command.Parameters.AddWithValue("updated_by", watermark.UpdatedBy);
}
private static Watermark MapWatermark(NpgsqlDataReader reader)
{
return new Watermark(
WatermarkId: reader.GetGuid(0),
TenantId: reader.GetString(1),
SourceId: reader.IsDBNull(2) ? null : reader.GetGuid(2),
JobType: reader.IsDBNull(3) ? null : reader.GetString(3),
ScopeKey: reader.GetString(4),
HighWatermark: reader.GetFieldValue<DateTimeOffset>(5),
LowWatermark: reader.IsDBNull(6) ? null : reader.GetFieldValue<DateTimeOffset>(6),
SequenceNumber: reader.GetInt64(7),
ProcessedCount: reader.GetInt64(8),
LastBatchHash: reader.IsDBNull(9) ? null : reader.GetString(9),
CreatedAt: reader.GetFieldValue<DateTimeOffset>(10),
UpdatedAt: reader.GetFieldValue<DateTimeOffset>(11),
UpdatedBy: reader.GetString(12));
}
private static (string sql, List<(string name, object value)> parameters) BuildListQuery(
string tenantId,
Guid? sourceId,
string? jobType,
int limit,
int offset)
{
var sb = new StringBuilder();
sb.Append($"SELECT {SelectWatermarkColumns} FROM watermarks WHERE tenant_id = @tenant_id");
var parameters = new List<(string, object)> { ("tenant_id", tenantId) };
if (sourceId.HasValue)
{
sb.Append(" AND source_id = @source_id");
parameters.Add(("source_id", sourceId.Value));
}
if (!string.IsNullOrEmpty(jobType))
{
sb.Append(" AND job_type = @job_type");
parameters.Add(("job_type", jobType));
}
sb.Append(" ORDER BY updated_at DESC LIMIT @limit OFFSET @offset");
parameters.Add(("limit", limit));
parameters.Add(("offset", offset));
return (sb.ToString(), parameters);
}
}
/// <summary>
/// Exception thrown when attempting to create a duplicate watermark.
/// </summary>
public sealed class DuplicateWatermarkException : Exception
{
public string TenantId { get; }
public string ScopeKey { get; }
public DuplicateWatermarkException(string tenantId, string scopeKey, Exception innerException)
: base($"Watermark for tenant '{tenantId}' and scope '{scopeKey}' already exists.", innerException)
{
TenantId = tenantId;
ScopeKey = scopeKey;
}
}
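
A sketch of the compare-and-swap loop UpdateAsync is designed for: read the row, bump SequenceNumber, and retry if another writer advanced it first. The WatermarkAdvancer helper and retry count are assumptions; the `with` expression relies on Watermark being a positional record, as MapWatermark above implies.

using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;

public static class WatermarkAdvancer
{
    public static async Task<bool> AdvanceAsync(
        IWatermarkRepository watermarks,
        string tenantId,
        string scopeKey,
        DateTimeOffset newHighWatermark,
        long batchCount,
        string? batchHash,
        string actor,
        CancellationToken cancellationToken,
        int maxAttempts = 3)
    {
        for (var attempt = 0; attempt < maxAttempts; attempt++)
        {
            var current = await watermarks.GetByScopeKeyAsync(tenantId, scopeKey, cancellationToken);
            if (current is null || newHighWatermark <= current.HighWatermark)
            {
                return false; // scope unknown, or nothing to advance
            }

            var next = current with
            {
                HighWatermark = newHighWatermark,
                SequenceNumber = current.SequenceNumber + 1,
                ProcessedCount = current.ProcessedCount + batchCount,
                LastBatchHash = batchHash,
                UpdatedAt = DateTimeOffset.UtcNow,
                UpdatedBy = actor
            };

            // Succeeds only if nobody advanced the row since we read it.
            if (await watermarks.UpdateAsync(next, current.SequenceNumber, cancellationToken))
            {
                return true;
            }
        }

        return false; // lost the race maxAttempts times; caller decides what to do next
    }
}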

@@ -0,0 +1,61 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for artifact persistence operations.
/// </summary>
public interface IArtifactRepository
{
/// <summary>
/// Gets an artifact by ID.
/// </summary>
Task<Artifact?> GetByIdAsync(string tenantId, Guid artifactId, CancellationToken cancellationToken);
/// <summary>
/// Gets artifacts by job ID.
/// </summary>
Task<IReadOnlyList<Artifact>> GetByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
/// <summary>
/// Gets artifacts by run ID.
/// </summary>
Task<IReadOnlyList<Artifact>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);
/// <summary>
/// Gets an artifact by its content digest.
/// </summary>
Task<Artifact?> GetByDigestAsync(string tenantId, string digest, CancellationToken cancellationToken);
/// <summary>
/// Creates a new artifact.
/// </summary>
Task CreateAsync(Artifact artifact, CancellationToken cancellationToken);
/// <summary>
/// Creates multiple artifacts in a batch.
/// </summary>
Task CreateBatchAsync(IEnumerable<Artifact> artifacts, CancellationToken cancellationToken);
/// <summary>
/// Lists artifacts with pagination and filters.
/// </summary>
Task<IReadOnlyList<Artifact>> ListAsync(
string tenantId,
string? artifactType,
string? jobType,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset,
CancellationToken cancellationToken);
/// <summary>
/// Counts artifacts matching the filters.
/// </summary>
Task<int> CountAsync(
string tenantId,
string? artifactType,
string? jobType,
CancellationToken cancellationToken);
}
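
A sketch of content-addressed dedupe in front of CreateAsync, assuming Artifact exposes TenantId and Digest properties matching GetByDigestAsync; the ArtifactStore name is hypothetical.

using StellaOps.Orchestrator.Core.Domain;

public static class ArtifactStore
{
    public static async Task<Artifact> GetOrCreateAsync(
        IArtifactRepository artifacts, Artifact candidate, CancellationToken cancellationToken)
    {
        // Same digest means same content; reuse the existing record instead of inserting.
        var existing = await artifacts.GetByDigestAsync(candidate.TenantId, candidate.Digest, cancellationToken);
        if (existing is not null)
        {
            return existing;
        }

        await artifacts.CreateAsync(candidate, cancellationToken);
        return candidate;
    }
}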

@@ -0,0 +1,127 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository for audit log entries.
/// </summary>
public interface IAuditRepository
{
/// <summary>
/// Appends a new audit entry to the log.
/// </summary>
Task<AuditEntry> AppendAsync(
string tenantId,
AuditEventType eventType,
string resourceType,
Guid resourceId,
string actorId,
ActorType actorType,
string description,
string? oldState = null,
string? newState = null,
string? actorIp = null,
string? userAgent = null,
string? httpMethod = null,
string? requestPath = null,
string? correlationId = null,
string? metadata = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets an audit entry by ID.
/// </summary>
Task<AuditEntry?> GetByIdAsync(
string tenantId,
Guid entryId,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists audit entries with optional filters.
/// </summary>
Task<IReadOnlyList<AuditEntry>> ListAsync(
string tenantId,
AuditEventType? eventType = null,
string? resourceType = null,
Guid? resourceId = null,
string? actorId = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets audit entries by sequence range.
/// </summary>
Task<IReadOnlyList<AuditEntry>> GetBySequenceRangeAsync(
string tenantId,
long startSequence,
long endSequence,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the latest audit entry for a tenant.
/// </summary>
Task<AuditEntry?> GetLatestAsync(
string tenantId,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets audit entries for a specific resource.
/// </summary>
Task<IReadOnlyList<AuditEntry>> GetByResourceAsync(
string tenantId,
string resourceType,
Guid resourceId,
int limit = 100,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the count of audit entries.
/// </summary>
Task<long> GetCountAsync(
string tenantId,
AuditEventType? eventType = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Verifies the chain integrity for a range of entries.
/// </summary>
Task<ChainVerificationResult> VerifyChainAsync(
string tenantId,
long? startSequence = null,
long? endSequence = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets audit summary statistics.
/// </summary>
Task<AuditSummary> GetSummaryAsync(
string tenantId,
DateTimeOffset? since = null,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Result of chain verification.
/// </summary>
public sealed record ChainVerificationResult(
bool IsValid,
Guid? InvalidEntryId,
long? InvalidSequence,
string? ErrorMessage);
/// <summary>
/// Audit summary statistics.
/// </summary>
public sealed record AuditSummary(
long TotalEntries,
long EntriesSince,
long EventTypes,
long UniqueActors,
long UniqueResources,
DateTimeOffset? EarliestEntry,
DateTimeOffset? LatestEntry);

@@ -0,0 +1,200 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for backfill request persistence operations.
/// </summary>
public interface IBackfillRepository
{
/// <summary>
/// Gets a backfill request by ID.
/// </summary>
Task<BackfillRequest?> GetByIdAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);
/// <summary>
/// Creates a new backfill request.
/// </summary>
Task CreateAsync(BackfillRequest request, CancellationToken cancellationToken);
/// <summary>
/// Updates a backfill request.
/// </summary>
Task UpdateAsync(BackfillRequest request, CancellationToken cancellationToken);
/// <summary>
/// Lists backfill requests with filters.
/// </summary>
Task<IReadOnlyList<BackfillRequest>> ListAsync(
string tenantId,
BackfillStatus? status,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken);
/// <summary>
/// Checks for overlapping active backfills.
/// </summary>
Task<bool> HasOverlappingActiveAsync(
string tenantId,
string scopeKey,
DateTimeOffset windowStart,
DateTimeOffset windowEnd,
Guid? excludeBackfillId,
CancellationToken cancellationToken);
/// <summary>
/// Gets running backfills for a scope.
/// </summary>
Task<IReadOnlyList<BackfillRequest>> GetActiveByScopeAsync(
string tenantId,
string scopeKey,
CancellationToken cancellationToken);
/// <summary>
/// Counts backfill requests by status.
/// </summary>
Task<IDictionary<BackfillStatus, int>> CountByStatusAsync(
string tenantId,
CancellationToken cancellationToken);
/// <summary>
/// Gets the next backfill ready for processing.
/// </summary>
Task<BackfillRequest?> GetNextPendingAsync(string tenantId, CancellationToken cancellationToken);
}
/// <summary>
/// Repository interface for backfill checkpoint persistence.
/// </summary>
public interface IBackfillCheckpointRepository
{
/// <summary>
/// Gets the latest checkpoint for a backfill.
/// </summary>
Task<BackfillCheckpoint?> GetLatestAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);
/// <summary>
/// Gets all checkpoints for a backfill.
/// </summary>
Task<IReadOnlyList<BackfillCheckpoint>> GetAllAsync(string tenantId, Guid backfillId, CancellationToken cancellationToken);
/// <summary>
/// Creates a new checkpoint.
/// </summary>
Task CreateAsync(BackfillCheckpoint checkpoint, CancellationToken cancellationToken);
/// <summary>
/// Updates a checkpoint (e.g., mark complete).
/// </summary>
Task UpdateAsync(BackfillCheckpoint checkpoint, CancellationToken cancellationToken);
}
/// <summary>
/// Represents a backfill processing checkpoint.
/// </summary>
public sealed record BackfillCheckpoint(
/// <summary>Unique checkpoint identifier.</summary>
Guid CheckpointId,
/// <summary>Tenant this checkpoint belongs to.</summary>
string TenantId,
/// <summary>Parent backfill request ID.</summary>
Guid BackfillId,
/// <summary>Batch sequence number.</summary>
int BatchNumber,
/// <summary>Start of batch time window.</summary>
DateTimeOffset BatchStart,
/// <summary>End of batch time window.</summary>
DateTimeOffset BatchEnd,
/// <summary>Total events in batch.</summary>
int EventsInBatch,
/// <summary>Events processed in batch.</summary>
int EventsProcessed,
/// <summary>Events skipped as duplicates.</summary>
int EventsSkipped,
/// <summary>Events that failed processing.</summary>
int EventsFailed,
/// <summary>Hash of the batch for integrity verification.</summary>
string? BatchHash,
/// <summary>When batch processing started.</summary>
DateTimeOffset StartedAt,
/// <summary>When batch processing completed.</summary>
DateTimeOffset? CompletedAt,
/// <summary>Error message if batch failed.</summary>
string? ErrorMessage)
{
/// <summary>
/// Whether processing of this batch has finished. Note that <see cref="Fail"/> also
/// sets <see cref="CompletedAt"/>, so a failed checkpoint counts as complete.
/// </summary>
public bool IsComplete => CompletedAt.HasValue;
/// <summary>
/// Creates a new checkpoint for a batch.
/// </summary>
public static BackfillCheckpoint Create(
string tenantId,
Guid backfillId,
int batchNumber,
DateTimeOffset batchStart,
DateTimeOffset batchEnd,
int eventsInBatch)
{
return new BackfillCheckpoint(
CheckpointId: Guid.NewGuid(),
TenantId: tenantId,
BackfillId: backfillId,
BatchNumber: batchNumber,
BatchStart: batchStart,
BatchEnd: batchEnd,
EventsInBatch: eventsInBatch,
EventsProcessed: 0,
EventsSkipped: 0,
EventsFailed: 0,
BatchHash: null,
StartedAt: DateTimeOffset.UtcNow,
CompletedAt: null,
ErrorMessage: null);
}
/// <summary>
/// Marks the checkpoint as complete.
/// </summary>
public BackfillCheckpoint Complete(int processed, int skipped, int failed, string? batchHash)
{
return this with
{
EventsProcessed = processed,
EventsSkipped = skipped,
EventsFailed = failed,
BatchHash = batchHash,
CompletedAt = DateTimeOffset.UtcNow
};
}
/// <summary>
/// Marks the checkpoint as failed.
/// </summary>
public BackfillCheckpoint Fail(string error)
{
return this with
{
CompletedAt = DateTimeOffset.UtcNow,
ErrorMessage = error
};
}
}
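
One batch pass through the checkpoint lifecycle defined above (Create, then Complete or Fail), sketched against IBackfillCheckpointRepository. The BatchResult shape and the processing delegate are stand-ins for the real batch processor.

public sealed record BatchResult(int Processed, int Skipped, int Failed, string? BatchHash);

public static class BackfillBatchRunner
{
    public static async Task RunBatchAsync(
        IBackfillCheckpointRepository checkpoints,
        Func<CancellationToken, Task<BatchResult>> processBatchAsync,
        string tenantId,
        Guid backfillId,
        int batchNumber,
        DateTimeOffset batchStart,
        DateTimeOffset batchEnd,
        int eventsInBatch,
        CancellationToken cancellationToken)
    {
        // Record the batch before processing so a crash leaves a resumable marker.
        var checkpoint = BackfillCheckpoint.Create(
            tenantId, backfillId, batchNumber, batchStart, batchEnd, eventsInBatch);
        await checkpoints.CreateAsync(checkpoint, cancellationToken);

        try
        {
            var result = await processBatchAsync(cancellationToken);
            await checkpoints.UpdateAsync(
                checkpoint.Complete(result.Processed, result.Skipped, result.Failed, result.BatchHash),
                cancellationToken);
        }
        catch (Exception ex)
        {
            // Persist the failure so a resume can pick up from this batch.
            await checkpoints.UpdateAsync(checkpoint.Fail(ex.Message), cancellationToken);
            throw;
        }
    }
}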

@@ -0,0 +1,43 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for DAG edge persistence operations.
/// </summary>
public interface IDagEdgeRepository
{
/// <summary>
/// Creates a new DAG edge.
/// </summary>
Task CreateAsync(DagEdge edge, CancellationToken cancellationToken);
/// <summary>
/// Creates multiple DAG edges in a batch.
/// </summary>
Task CreateBatchAsync(IEnumerable<DagEdge> edges, CancellationToken cancellationToken);
/// <summary>
/// Gets all edges for a run.
/// </summary>
Task<IReadOnlyList<DagEdge>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);
/// <summary>
/// Gets parent edges (incoming) for a job.
/// </summary>
Task<IReadOnlyList<DagEdge>> GetParentEdgesAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
/// <summary>
/// Gets child edges (outgoing) for a job.
/// </summary>
Task<IReadOnlyList<DagEdge>> GetChildEdgesAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
/// <summary>
/// Checks if all parent dependencies are satisfied for a job.
/// </summary>
/// <param name="tenantId">Tenant ID.</param>
/// <param name="jobId">Job to check dependencies for.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>True if all dependencies are satisfied.</returns>
Task<bool> AreDependenciesSatisfiedAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
}
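
A sketch of the fan-out step a scheduler might run when a job completes: walk the outgoing edges and report which children are now unblocked. It assumes DagEdge exposes a ChildJobId property for the downstream end of an edge, which is not shown in this commit.

using StellaOps.Orchestrator.Core.Domain;

public static class DagFanOut
{
    public static async Task<IReadOnlyList<Guid>> GetReadyChildrenAsync(
        IDagEdgeRepository edges, string tenantId, Guid completedJobId, CancellationToken cancellationToken)
    {
        var ready = new List<Guid>();
        foreach (var edge in await edges.GetChildEdgesAsync(tenantId, completedJobId, cancellationToken))
        {
            // ChildJobId is an assumed property name on DagEdge.
            if (await edges.AreDependenciesSatisfiedAsync(tenantId, edge.ChildJobId, cancellationToken))
            {
                ready.Add(edge.ChildJobId);
            }
        }

        return ready;
    }
}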

@@ -0,0 +1,29 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for job history persistence operations.
/// </summary>
public interface IJobHistoryRepository
{
/// <summary>
/// Appends a history entry for a job state change.
/// </summary>
Task AppendAsync(JobHistory history, CancellationToken cancellationToken);
/// <summary>
/// Gets the history for a job.
/// </summary>
Task<IReadOnlyList<JobHistory>> GetByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
/// <summary>
/// Gets the latest history entry for a job.
/// </summary>
Task<JobHistory?> GetLatestByJobIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
/// <summary>
/// Gets the next sequence number for a job's history.
/// </summary>
Task<int> GetNextSequenceNoAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
}

@@ -0,0 +1,100 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for job persistence operations.
/// </summary>
public interface IJobRepository
{
/// <summary>
/// Gets a job by ID.
/// </summary>
Task<Job?> GetByIdAsync(string tenantId, Guid jobId, CancellationToken cancellationToken);
/// <summary>
/// Gets a job by idempotency key.
/// </summary>
Task<Job?> GetByIdempotencyKeyAsync(string tenantId, string idempotencyKey, CancellationToken cancellationToken);
/// <summary>
/// Creates a new job.
/// </summary>
Task CreateAsync(Job job, CancellationToken cancellationToken);
/// <summary>
/// Updates a job's status and related fields.
/// </summary>
Task UpdateStatusAsync(
string tenantId,
Guid jobId,
JobStatus status,
int attempt,
Guid? leaseId,
string? workerId,
string? taskRunnerId,
DateTimeOffset? leaseUntil,
DateTimeOffset? scheduledAt,
DateTimeOffset? leasedAt,
DateTimeOffset? completedAt,
DateTimeOffset? notBefore,
string? reason,
CancellationToken cancellationToken);
/// <summary>
/// Acquires a lease on a pending/scheduled job for worker execution.
/// </summary>
/// <returns>The leased job, or null if no jobs available.</returns>
Task<Job?> LeaseNextAsync(
string tenantId,
string? jobType,
Guid leaseId,
string workerId,
DateTimeOffset leaseUntil,
CancellationToken cancellationToken);
/// <summary>
/// Extends an existing lease.
/// </summary>
/// <returns>True if lease was extended, false if lease not found or expired.</returns>
Task<bool> ExtendLeaseAsync(
string tenantId,
Guid jobId,
Guid leaseId,
DateTimeOffset newLeaseUntil,
CancellationToken cancellationToken);
/// <summary>
/// Gets jobs by run ID.
/// </summary>
Task<IReadOnlyList<Job>> GetByRunIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);
/// <summary>
/// Gets jobs with expired leases.
/// </summary>
Task<IReadOnlyList<Job>> GetExpiredLeasesAsync(string tenantId, DateTimeOffset cutoff, int limit, CancellationToken cancellationToken);
/// <summary>
/// Lists jobs with pagination and filters.
/// </summary>
Task<IReadOnlyList<Job>> ListAsync(
string tenantId,
JobStatus? status,
string? jobType,
string? projectId,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset,
CancellationToken cancellationToken);
/// <summary>
/// Counts jobs matching the filters.
/// </summary>
Task<int> CountAsync(
string tenantId,
JobStatus? status,
string? jobType,
string? projectId,
CancellationToken cancellationToken);
}
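
A sketch of a worker poll-and-heartbeat loop over the lease API: take a lease with LeaseNextAsync, extend it on a timer while the job runs, and stop renewing once ExtendLeaseAsync reports the lease lost. The lease length, heartbeat cadence, the Job.JobId property name, and the execution delegate are assumptions.

using StellaOps.Orchestrator.Core.Domain;

public static class WorkerLoop
{
    private static readonly TimeSpan LeaseLength = TimeSpan.FromMinutes(2);
    private static readonly TimeSpan HeartbeatEvery = TimeSpan.FromSeconds(30);

    public static async Task PollOnceAsync(
        IJobRepository jobs,
        string tenantId,
        string workerId,
        Func<Job, CancellationToken, Task> executeJobAsync,
        CancellationToken cancellationToken)
    {
        var leaseId = Guid.NewGuid();
        var job = await jobs.LeaseNextAsync(
            tenantId, jobType: null, leaseId, workerId,
            DateTimeOffset.UtcNow + LeaseLength, cancellationToken);
        if (job is null)
        {
            return; // nothing runnable right now
        }

        using var heartbeatCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        var heartbeat = HeartbeatAsync(jobs, tenantId, job.JobId, leaseId, heartbeatCts.Token);
        try
        {
            await executeJobAsync(job, cancellationToken);
        }
        finally
        {
            heartbeatCts.Cancel();
            try { await heartbeat; } catch { /* heartbeat faults must not mask the job outcome */ }
        }
    }

    private static async Task HeartbeatAsync(
        IJobRepository jobs, string tenantId, Guid jobId, Guid leaseId, CancellationToken cancellationToken)
    {
        using var timer = new PeriodicTimer(HeartbeatEvery);
        while (await timer.WaitForNextTickAsync(cancellationToken))
        {
            // A false return means the lease was lost or expired; stop renewing.
            if (!await jobs.ExtendLeaseAsync(tenantId, jobId, leaseId,
                    DateTimeOffset.UtcNow + LeaseLength, cancellationToken))
            {
                return;
            }
        }
    }
}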

@@ -0,0 +1,210 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository for run ledger entries.
/// </summary>
public interface ILedgerRepository
{
/// <summary>
/// Appends a new ledger entry from a completed run.
/// </summary>
Task<RunLedgerEntry> AppendAsync(
Run run,
IReadOnlyList<Artifact> artifacts,
string inputDigest,
string? metadata = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a ledger entry by ID.
/// </summary>
Task<RunLedgerEntry?> GetByIdAsync(
string tenantId,
Guid ledgerId,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a ledger entry by run ID.
/// </summary>
Task<RunLedgerEntry?> GetByRunIdAsync(
string tenantId,
Guid runId,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists ledger entries with optional filters.
/// </summary>
Task<IReadOnlyList<RunLedgerEntry>> ListAsync(
string tenantId,
string? runType = null,
Guid? sourceId = null,
RunStatus? finalStatus = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets ledger entries by sequence range.
/// </summary>
Task<IReadOnlyList<RunLedgerEntry>> GetBySequenceRangeAsync(
string tenantId,
long startSequence,
long endSequence,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the latest ledger entry for a tenant.
/// </summary>
Task<RunLedgerEntry?> GetLatestAsync(
string tenantId,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets ledger entries for a specific source.
/// </summary>
Task<IReadOnlyList<RunLedgerEntry>> GetBySourceAsync(
string tenantId,
Guid sourceId,
int limit = 100,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the count of ledger entries.
/// </summary>
Task<long> GetCountAsync(
string tenantId,
string? runType = null,
Guid? sourceId = null,
DateTimeOffset? startTime = null,
DateTimeOffset? endTime = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Verifies the chain integrity for a range of entries.
/// </summary>
Task<ChainVerificationResult> VerifyChainAsync(
string tenantId,
long? startSequence = null,
long? endSequence = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets ledger summary statistics.
/// </summary>
Task<LedgerSummary> GetSummaryAsync(
string tenantId,
DateTimeOffset? since = null,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Ledger summary statistics.
/// </summary>
public sealed record LedgerSummary(
long TotalEntries,
long EntriesSince,
long TotalRuns,
long SuccessfulRuns,
long FailedRuns,
long TotalJobs,
long UniqueSources,
long UniqueRunTypes,
DateTimeOffset? EarliestEntry,
DateTimeOffset? LatestEntry);
/// <summary>
/// Repository for ledger exports.
/// </summary>
public interface ILedgerExportRepository
{
/// <summary>
/// Creates a new export request.
/// </summary>
Task<LedgerExport> CreateAsync(
LedgerExport export,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets an export by ID.
/// </summary>
Task<LedgerExport?> GetByIdAsync(
string tenantId,
Guid exportId,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists exports for a tenant.
/// </summary>
Task<IReadOnlyList<LedgerExport>> ListAsync(
string tenantId,
LedgerExportStatus? status = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default);
/// <summary>
/// Updates an export.
/// </summary>
Task<LedgerExport> UpdateAsync(
LedgerExport export,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets pending exports.
/// </summary>
Task<IReadOnlyList<LedgerExport>> GetPendingAsync(
int limit = 10,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Repository for signed manifests.
/// </summary>
public interface IManifestRepository
{
/// <summary>
/// Creates a new manifest.
/// </summary>
Task<SignedManifest> CreateAsync(
SignedManifest manifest,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a manifest by ID.
/// </summary>
Task<SignedManifest?> GetByIdAsync(
string tenantId,
Guid manifestId,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a manifest by subject.
/// </summary>
Task<SignedManifest?> GetBySubjectAsync(
string tenantId,
ProvenanceType provenanceType,
Guid subjectId,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists manifests for a tenant.
/// </summary>
Task<IReadOnlyList<SignedManifest>> ListAsync(
string tenantId,
ProvenanceType? provenanceType = null,
int limit = 100,
int offset = 0,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets a manifest by payload digest.
/// </summary>
Task<SignedManifest?> GetByPayloadDigestAsync(
string tenantId,
string payloadDigest,
CancellationToken cancellationToken = default);
}
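
A sketch of sealing a completed run into the ledger, using GetByRunIdAsync as the idempotency check before AppendAsync. The RunSealer name is an assumption and the input-digest computation is elided.

using StellaOps.Orchestrator.Core.Domain;

public sealed class RunSealer
{
    private readonly IRunRepository _runs;
    private readonly IArtifactRepository _artifacts;
    private readonly ILedgerRepository _ledger;

    public RunSealer(IRunRepository runs, IArtifactRepository artifacts, ILedgerRepository ledger)
    {
        _runs = runs;
        _artifacts = artifacts;
        _ledger = ledger;
    }

    public async Task<RunLedgerEntry?> SealAsync(
        string tenantId, Guid runId, string inputDigest, CancellationToken cancellationToken)
    {
        // Idempotency: a run may only appear once in the ledger.
        var existing = await _ledger.GetByRunIdAsync(tenantId, runId, cancellationToken);
        if (existing is not null)
        {
            return existing;
        }

        var run = await _runs.GetByIdAsync(tenantId, runId, cancellationToken);
        if (run is null)
        {
            return null;
        }

        var artifacts = await _artifacts.GetByRunIdAsync(tenantId, runId, cancellationToken);
        return await _ledger.AppendAsync(run, artifacts, inputDigest, metadata: null, cancellationToken);
    }
}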

@@ -0,0 +1,79 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for quota persistence operations.
/// </summary>
public interface IQuotaRepository
{
/// <summary>
/// Gets a quota by ID.
/// </summary>
Task<Quota?> GetByIdAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);
/// <summary>
/// Gets the quota for a tenant and optional job type.
/// </summary>
Task<Quota?> GetByTenantAndJobTypeAsync(string tenantId, string? jobType, CancellationToken cancellationToken);
/// <summary>
/// Creates a new quota.
/// </summary>
Task CreateAsync(Quota quota, CancellationToken cancellationToken);
/// <summary>
/// Updates a quota (including token/counter state).
/// </summary>
Task UpdateAsync(Quota quota, CancellationToken cancellationToken);
/// <summary>
/// Pauses a quota with reason.
/// </summary>
Task PauseAsync(string tenantId, Guid quotaId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken);
/// <summary>
/// Resumes a paused quota.
/// </summary>
Task ResumeAsync(string tenantId, Guid quotaId, string updatedBy, CancellationToken cancellationToken);
/// <summary>
/// Updates the rate limiter state (tokens, counters) without changing configuration.
/// </summary>
Task UpdateStateAsync(
string tenantId,
Guid quotaId,
double currentTokens,
DateTimeOffset lastRefillAt,
int currentActive,
int currentHourCount,
DateTimeOffset currentHourStart,
string updatedBy,
CancellationToken cancellationToken);
/// <summary>
/// Increments the current active count.
/// </summary>
Task IncrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);
/// <summary>
/// Decrements the current active count.
/// </summary>
Task DecrementActiveAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);
/// <summary>
/// Lists quotas for a tenant with pagination.
/// </summary>
Task<IReadOnlyList<Quota>> ListAsync(
string tenantId,
string? jobType,
bool? paused,
int limit,
int offset,
CancellationToken cancellationToken);
/// <summary>
/// Deletes a quota.
/// </summary>
Task<bool> DeleteAsync(string tenantId, Guid quotaId, CancellationToken cancellationToken);
}
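
A sketch of a token-bucket admission check written against UpdateStateAsync. The Quota property names (QuotaId, CurrentTokens, RefillRatePerSecond, BurstCapacity, LastRefillAt, CurrentActive, CurrentHourCount, CurrentHourStart) are assumptions inferred from the state parameters above, and this read-modify-write is not race-safe without an external lock or a conditional update.

using StellaOps.Orchestrator.Core.Domain;

public static class QuotaGate
{
    public static async Task<bool> TryTakeTokenAsync(
        IQuotaRepository quotas, string tenantId, string? jobType, string actor, CancellationToken cancellationToken)
    {
        var quota = await quotas.GetByTenantAndJobTypeAsync(tenantId, jobType, cancellationToken);
        if (quota is null)
        {
            return true; // no quota configured for this scope means no limit
        }

        // Refill the bucket for elapsed time, capped at burst capacity.
        // All quota property names below are assumed, not confirmed by this commit.
        var now = DateTimeOffset.UtcNow;
        var elapsedSeconds = (now - quota.LastRefillAt).TotalSeconds;
        var tokens = Math.Min(
            quota.BurstCapacity,
            quota.CurrentTokens + elapsedSeconds * quota.RefillRatePerSecond);
        if (tokens < 1)
        {
            return false; // bucket empty; caller should back off
        }

        // Hour-window rollover is omitted here for brevity.
        await quotas.UpdateStateAsync(
            tenantId,
            quota.QuotaId,
            currentTokens: tokens - 1,
            lastRefillAt: now,
            currentActive: quota.CurrentActive,
            currentHourCount: quota.CurrentHourCount + 1,
            currentHourStart: quota.CurrentHourStart,
            updatedBy: actor,
            cancellationToken: cancellationToken);
        return true;
    }
}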

@@ -0,0 +1,69 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for run persistence operations.
/// </summary>
public interface IRunRepository
{
/// <summary>
/// Gets a run by ID.
/// </summary>
Task<Run?> GetByIdAsync(string tenantId, Guid runId, CancellationToken cancellationToken);
/// <summary>
/// Creates a new run.
/// </summary>
Task CreateAsync(Run run, CancellationToken cancellationToken);
/// <summary>
/// Updates run status and job counts.
/// </summary>
Task UpdateStatusAsync(
string tenantId,
Guid runId,
RunStatus status,
int totalJobs,
int completedJobs,
int succeededJobs,
int failedJobs,
DateTimeOffset? startedAt,
DateTimeOffset? completedAt,
CancellationToken cancellationToken);
/// <summary>
/// Increments job counters when a job completes.
/// </summary>
Task IncrementJobCountsAsync(
string tenantId,
Guid runId,
bool succeeded,
CancellationToken cancellationToken);
/// <summary>
/// Lists runs with pagination and filters.
/// </summary>
Task<IReadOnlyList<Run>> ListAsync(
string tenantId,
Guid? sourceId,
string? runType,
RunStatus? status,
string? projectId,
DateTimeOffset? createdAfter,
DateTimeOffset? createdBefore,
int limit,
int offset,
CancellationToken cancellationToken);
/// <summary>
/// Counts runs matching the filters.
/// </summary>
Task<int> CountAsync(
string tenantId,
Guid? sourceId,
string? runType,
RunStatus? status,
string? projectId,
CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,50 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for source persistence operations.
/// </summary>
public interface ISourceRepository
{
/// <summary>
/// Gets a source by ID.
/// </summary>
Task<Source?> GetByIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);
/// <summary>
/// Gets a source by name.
/// </summary>
Task<Source?> GetByNameAsync(string tenantId, string name, CancellationToken cancellationToken);
/// <summary>
/// Creates a new source.
/// </summary>
Task CreateAsync(Source source, CancellationToken cancellationToken);
/// <summary>
/// Updates a source.
/// </summary>
Task UpdateAsync(Source source, CancellationToken cancellationToken);
/// <summary>
/// Pauses a source with reason.
/// </summary>
Task PauseAsync(string tenantId, Guid sourceId, string reason, string? ticket, string updatedBy, CancellationToken cancellationToken);
/// <summary>
/// Resumes a paused source.
/// </summary>
Task ResumeAsync(string tenantId, Guid sourceId, string updatedBy, CancellationToken cancellationToken);
/// <summary>
/// Lists sources with pagination.
/// </summary>
Task<IReadOnlyList<Source>> ListAsync(
string tenantId,
string? sourceType,
bool? enabled,
int limit,
int offset,
CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,62 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for throttle persistence operations.
/// </summary>
public interface IThrottleRepository
{
/// <summary>
/// Gets a throttle by ID.
/// </summary>
Task<Throttle?> GetByIdAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken);
/// <summary>
/// Gets active throttles for a source.
/// </summary>
Task<IReadOnlyList<Throttle>> GetActiveBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);
/// <summary>
/// Gets active throttles for a job type.
/// </summary>
Task<IReadOnlyList<Throttle>> GetActiveByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken);
/// <summary>
/// Creates a new throttle.
/// </summary>
Task CreateAsync(Throttle throttle, CancellationToken cancellationToken);
/// <summary>
/// Deactivates a throttle.
/// </summary>
Task DeactivateAsync(string tenantId, Guid throttleId, CancellationToken cancellationToken);
/// <summary>
/// Deactivates all throttles for a source.
/// </summary>
Task DeactivateBySourceAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);
/// <summary>
/// Deactivates all throttles for a job type.
/// </summary>
Task DeactivateByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken);
/// <summary>
/// Cleans up expired throttles.
/// </summary>
/// <returns>Number of throttles deactivated.</returns>
Task<int> CleanupExpiredAsync(DateTimeOffset now, CancellationToken cancellationToken);
/// <summary>
/// Lists throttles for a tenant with pagination.
/// </summary>
Task<IReadOnlyList<Throttle>> ListAsync(
string tenantId,
bool? active,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken);
}

View File

@@ -0,0 +1,70 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Infrastructure.Repositories;
/// <summary>
/// Repository interface for watermark persistence operations.
/// </summary>
public interface IWatermarkRepository
{
/// <summary>
/// Gets a watermark by scope key.
/// </summary>
Task<Watermark?> GetByScopeKeyAsync(string tenantId, string scopeKey, CancellationToken cancellationToken);
/// <summary>
/// Gets a watermark by source ID.
/// </summary>
Task<Watermark?> GetBySourceIdAsync(string tenantId, Guid sourceId, CancellationToken cancellationToken);
/// <summary>
/// Gets a watermark by job type.
/// </summary>
Task<Watermark?> GetByJobTypeAsync(string tenantId, string jobType, CancellationToken cancellationToken);
/// <summary>
/// Gets a watermark by source ID and job type.
/// </summary>
Task<Watermark?> GetBySourceAndJobTypeAsync(string tenantId, Guid sourceId, string jobType, CancellationToken cancellationToken);
/// <summary>
/// Creates a new watermark.
/// </summary>
Task CreateAsync(Watermark watermark, CancellationToken cancellationToken);
/// <summary>
/// Updates a watermark using optimistic concurrency.
/// </summary>
/// <returns>True if update succeeded, false if concurrent modification detected.</returns>
Task<bool> UpdateAsync(Watermark watermark, long expectedSequenceNumber, CancellationToken cancellationToken);
/// <summary>
/// Creates or updates a watermark (upsert).
/// </summary>
Task UpsertAsync(Watermark watermark, CancellationToken cancellationToken);
/// <summary>
/// Lists watermarks for a tenant.
/// </summary>
Task<IReadOnlyList<Watermark>> ListAsync(
string tenantId,
Guid? sourceId,
string? jobType,
int limit,
int offset,
CancellationToken cancellationToken);
/// <summary>
/// Gets watermarks with lag exceeding the threshold.
/// </summary>
Task<IReadOnlyList<Watermark>> GetLaggingAsync(
string tenantId,
TimeSpan lagThreshold,
int limit,
CancellationToken cancellationToken);
/// <summary>
/// Deletes a watermark by scope key.
/// </summary>
Task<bool> DeleteAsync(string tenantId, string scopeKey, CancellationToken cancellationToken);
}
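/// <summary>
/// Illustrative optimistic-concurrency advance loop, not part of the repository
/// contract. It assumes Watermark is a record exposing HighWatermark and
/// SequenceNumber (mirroring the watermarks table); the retry count is an
/// arbitrary example value.
/// </summary>
public static class WatermarkAdvanceSketch
{
    public static async Task<bool> TryAdvanceAsync(
        IWatermarkRepository watermarks,
        string tenantId,
        string scopeKey,
        DateTimeOffset newHighWatermark,
        CancellationToken cancellationToken)
    {
        for (var attempt = 0; attempt < 3; attempt++)
        {
            var current = await watermarks.GetByScopeKeyAsync(tenantId, scopeKey, cancellationToken);
            if (current is null)
            {
                return false;
            }
            var updated = current with
            {
                HighWatermark = newHighWatermark,
                SequenceNumber = current.SequenceNumber + 1,
            };
            // UpdateAsync returns false on concurrent modification; reload and retry.
            if (await watermarks.UpdateAsync(updated, current.SequenceNumber, cancellationToken))
            {
                return true;
            }
        }
        return false;
    }
}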

View File

@@ -0,0 +1,57 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Orchestrator.Core.Backfill;
using StellaOps.Orchestrator.Infrastructure.Ledger;
using StellaOps.Orchestrator.Infrastructure.Options;
using StellaOps.Orchestrator.Infrastructure.Postgres;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.Infrastructure;
/// <summary>
/// Extension methods for registering Orchestrator infrastructure services.
/// </summary>
public static class ServiceCollectionExtensions
{
/// <summary>
/// Adds Orchestrator infrastructure services to the service collection.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configuration">The configuration.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddOrchestratorInfrastructure(
this IServiceCollection services,
IConfiguration configuration)
{
// Register configuration options
services.Configure<OrchestratorServiceOptions>(
configuration.GetSection(OrchestratorServiceOptions.SectionName));
// Register data source
services.AddSingleton<OrchestratorDataSource>();
// Register repositories
services.AddScoped<IJobRepository, PostgresJobRepository>();
services.AddScoped<IArtifactRepository, PostgresArtifactRepository>();
services.AddScoped<ISourceRepository, PostgresSourceRepository>();
services.AddScoped<IRunRepository, PostgresRunRepository>();
services.AddScoped<IQuotaRepository, PostgresQuotaRepository>();
services.AddScoped<IThrottleRepository, PostgresThrottleRepository>();
services.AddScoped<IWatermarkRepository, PostgresWatermarkRepository>();
services.AddScoped<Infrastructure.Repositories.IBackfillRepository, PostgresBackfillRepository>();
// Register audit and ledger repositories
services.AddScoped<IAuditRepository, PostgresAuditRepository>();
services.AddScoped<ILedgerRepository, PostgresLedgerRepository>();
services.AddScoped<ILedgerExportRepository, PostgresLedgerExportRepository>();
services.AddScoped<IManifestRepository, PostgresManifestRepository>();
// Register ledger exporter service
services.AddScoped<ILedgerExporter, LedgerExporter>();
// Register duplicate suppression factory
services.AddSingleton<IDuplicateSuppressorFactory, PostgresDuplicateSuppressorFactory>();
return services;
}
}
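/// <summary>
/// Illustrative wiring sketch: a host calls the extension once at startup. The
/// bare ServiceCollection shown here is an assumption for the example; web
/// hosts would pass builder.Services and builder.Configuration instead.
/// </summary>
public static class OrchestratorWiringSketch
{
    public static IServiceCollection BuildExampleServices(IConfiguration configuration)
        => new ServiceCollection().AddOrchestratorInfrastructure(configuration);
}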

View File

@@ -1,28 +1,30 @@
<?xml version="1.0" ?>
<Project Sdk="Microsoft.NET.Sdk">
<ItemGroup>
<ProjectReference Include="..\StellaOps.Orchestrator.Core\StellaOps.Orchestrator.Core.csproj"/>
</ItemGroup>
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<None Include="migrations\**\*" Pack="false" CopyToOutputDirectory="Never" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="10.0.0-rc.2.25502.107" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="10.0.0-rc.2.25502.107" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="10.0.0-rc.2.25502.107" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="10.0.0-rc.2.25502.107" />
<PackageReference Include="Microsoft.Extensions.Options" Version="10.0.0-rc.2.25502.107" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" Version="10.0.0-rc.2.25502.107" />
<PackageReference Include="Npgsql" Version="7.0.7" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,323 @@
-- 001_initial.sql
-- Orchestrator bootstrap schema (ORCH-SVC-32-001)
-- Creates core tables for sources, runs, jobs, DAG edges, artifacts, quotas, schedules, and incidents.
BEGIN;
-- Enum types for job and run statuses
CREATE TYPE job_status AS ENUM (
'pending',
'scheduled',
'leased',
'succeeded',
'failed',
'canceled',
'timed_out'
);
CREATE TYPE run_status AS ENUM (
'pending',
'running',
'succeeded',
'partially_succeeded',
'failed',
'canceled'
);
CREATE TYPE incident_status AS ENUM (
'open',
'acknowledged',
'resolved'
);
CREATE TYPE dag_edge_type AS ENUM (
'success',
'always',
'failure'
);
-- Sources: Job producers (Concelier, Scanner, Export, etc.)
CREATE TABLE sources (
source_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
name TEXT NOT NULL,
source_type TEXT NOT NULL,
enabled BOOLEAN NOT NULL DEFAULT TRUE,
paused BOOLEAN NOT NULL DEFAULT FALSE,
pause_reason TEXT,
pause_ticket TEXT,
configuration JSONB,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_by TEXT NOT NULL,
CONSTRAINT pk_sources PRIMARY KEY (tenant_id, source_id),
CONSTRAINT uq_sources_name UNIQUE (tenant_id, name)
) PARTITION BY LIST (tenant_id);
CREATE TABLE sources_default PARTITION OF sources DEFAULT;
CREATE INDEX ix_sources_type ON sources (tenant_id, source_type);
CREATE INDEX ix_sources_enabled ON sources (tenant_id, enabled) WHERE enabled = TRUE;
-- Runs: Batch/workflow executions containing jobs
CREATE TABLE runs (
run_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
project_id TEXT,
source_id UUID NOT NULL,
run_type TEXT NOT NULL,
status run_status NOT NULL DEFAULT 'pending',
correlation_id TEXT,
total_jobs INTEGER NOT NULL DEFAULT 0,
completed_jobs INTEGER NOT NULL DEFAULT 0,
succeeded_jobs INTEGER NOT NULL DEFAULT 0,
failed_jobs INTEGER NOT NULL DEFAULT 0,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
created_by TEXT NOT NULL,
metadata JSONB,
CONSTRAINT pk_runs PRIMARY KEY (tenant_id, run_id),
CONSTRAINT fk_runs_source FOREIGN KEY (tenant_id, source_id) REFERENCES sources (tenant_id, source_id)
) PARTITION BY LIST (tenant_id);
CREATE TABLE runs_default PARTITION OF runs DEFAULT;
CREATE INDEX ix_runs_status ON runs (tenant_id, status, created_at DESC);
CREATE INDEX ix_runs_source ON runs (tenant_id, source_id, created_at DESC);
CREATE INDEX ix_runs_project ON runs (tenant_id, project_id, created_at DESC) WHERE project_id IS NOT NULL;
CREATE INDEX ix_runs_correlation ON runs (tenant_id, correlation_id) WHERE correlation_id IS NOT NULL;
-- Jobs: Individual units of work
CREATE TABLE jobs (
job_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
project_id TEXT,
run_id UUID,
job_type TEXT NOT NULL,
status job_status NOT NULL DEFAULT 'pending',
priority INTEGER NOT NULL DEFAULT 0,
attempt INTEGER NOT NULL DEFAULT 1,
max_attempts INTEGER NOT NULL DEFAULT 3,
payload_digest CHAR(64) NOT NULL,
payload JSONB NOT NULL,
idempotency_key TEXT NOT NULL,
correlation_id TEXT,
lease_id UUID,
worker_id TEXT,
task_runner_id TEXT,
lease_until TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
scheduled_at TIMESTAMPTZ,
leased_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
not_before TIMESTAMPTZ,
reason TEXT,
replay_of UUID,
created_by TEXT NOT NULL,
CONSTRAINT pk_jobs PRIMARY KEY (tenant_id, job_id),
CONSTRAINT uq_jobs_idempotency UNIQUE (tenant_id, idempotency_key),
CONSTRAINT ck_jobs_payload_digest_hex CHECK (payload_digest ~ '^[0-9a-f]{64}$'),
CONSTRAINT ck_jobs_attempt_positive CHECK (attempt >= 1),
CONSTRAINT ck_jobs_max_attempts_positive CHECK (max_attempts >= 1)
) PARTITION BY LIST (tenant_id);
CREATE TABLE jobs_default PARTITION OF jobs DEFAULT;
CREATE INDEX ix_jobs_status ON jobs (tenant_id, status, priority DESC, created_at);
CREATE INDEX ix_jobs_type_status ON jobs (tenant_id, job_type, status, created_at);
CREATE INDEX ix_jobs_run ON jobs (tenant_id, run_id) WHERE run_id IS NOT NULL;
CREATE INDEX ix_jobs_lease ON jobs (tenant_id, lease_id) WHERE lease_id IS NOT NULL;
CREATE INDEX ix_jobs_lease_expiry ON jobs (tenant_id, lease_until) WHERE status = 'leased' AND lease_until IS NOT NULL;
CREATE INDEX ix_jobs_not_before ON jobs (tenant_id, not_before) WHERE status = 'pending' AND not_before IS NOT NULL;
CREATE INDEX ix_jobs_scheduled ON jobs (tenant_id, job_type, status, scheduled_at) WHERE status = 'scheduled';
CREATE INDEX ix_jobs_replay ON jobs (tenant_id, replay_of) WHERE replay_of IS NOT NULL;
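-- Illustrative lease acquisition (commented out; not executed by this
-- migration). ix_jobs_status backs the pending scan, and FOR UPDATE SKIP LOCKED
-- lets concurrent workers lease without blocking one another. Tenant, worker,
-- and lease-duration literals are placeholders.
--
--   UPDATE jobs
--      SET status = 'leased', lease_id = gen_random_uuid(), worker_id = 'worker-1',
--          leased_at = NOW(), lease_until = NOW() + INTERVAL '5 minutes'
--    WHERE (tenant_id, job_id) = (
--        SELECT tenant_id, job_id FROM jobs
--         WHERE tenant_id = 'tenant-a' AND status = 'pending'
--           AND (not_before IS NULL OR not_before <= NOW())
--         ORDER BY priority DESC, created_at
--         LIMIT 1
--         FOR UPDATE SKIP LOCKED)
--   RETURNING job_id, lease_id;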
-- Job History: Immutable audit trail for job state changes
CREATE TABLE job_history (
history_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
job_id UUID NOT NULL,
sequence_no INTEGER NOT NULL,
from_status job_status,
to_status job_status NOT NULL,
attempt INTEGER NOT NULL,
lease_id UUID,
worker_id TEXT,
reason TEXT,
occurred_at TIMESTAMPTZ NOT NULL,
recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
actor_id TEXT NOT NULL,
actor_type TEXT NOT NULL,
CONSTRAINT pk_job_history PRIMARY KEY (tenant_id, job_id, sequence_no),
CONSTRAINT ck_job_history_actor_type CHECK (actor_type IN ('system', 'operator', 'worker'))
) PARTITION BY LIST (tenant_id);
CREATE TABLE job_history_default PARTITION OF job_history DEFAULT;
CREATE INDEX ix_job_history_occurred ON job_history (tenant_id, job_id, occurred_at DESC);
-- DAG Edges: Job dependencies within a run
CREATE TABLE dag_edges (
edge_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
run_id UUID NOT NULL,
parent_job_id UUID NOT NULL,
child_job_id UUID NOT NULL,
edge_type dag_edge_type NOT NULL DEFAULT 'success',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT pk_dag_edges PRIMARY KEY (tenant_id, edge_id),
CONSTRAINT uq_dag_edges_parent_child UNIQUE (tenant_id, run_id, parent_job_id, child_job_id),
CONSTRAINT fk_dag_edges_run FOREIGN KEY (tenant_id, run_id) REFERENCES runs (tenant_id, run_id),
CONSTRAINT fk_dag_edges_parent FOREIGN KEY (tenant_id, parent_job_id) REFERENCES jobs (tenant_id, job_id),
CONSTRAINT fk_dag_edges_child FOREIGN KEY (tenant_id, child_job_id) REFERENCES jobs (tenant_id, job_id),
CONSTRAINT ck_dag_edges_no_self_loop CHECK (parent_job_id <> child_job_id)
) PARTITION BY LIST (tenant_id);
CREATE TABLE dag_edges_default PARTITION OF dag_edges DEFAULT;
CREATE INDEX ix_dag_edges_run ON dag_edges (tenant_id, run_id);
CREATE INDEX ix_dag_edges_parent ON dag_edges (tenant_id, parent_job_id);
CREATE INDEX ix_dag_edges_child ON dag_edges (tenant_id, child_job_id);
-- Artifacts: Job outputs with provenance
CREATE TABLE artifacts (
artifact_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
job_id UUID NOT NULL,
run_id UUID,
artifact_type TEXT NOT NULL,
uri TEXT NOT NULL,
digest CHAR(64) NOT NULL,
mime_type TEXT,
size_bytes BIGINT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
metadata JSONB,
CONSTRAINT pk_artifacts PRIMARY KEY (tenant_id, artifact_id),
CONSTRAINT fk_artifacts_job FOREIGN KEY (tenant_id, job_id) REFERENCES jobs (tenant_id, job_id),
CONSTRAINT ck_artifacts_digest_hex CHECK (digest ~ '^[0-9a-f]{64}$')
) PARTITION BY LIST (tenant_id);
CREATE TABLE artifacts_default PARTITION OF artifacts DEFAULT;
CREATE INDEX ix_artifacts_job ON artifacts (tenant_id, job_id);
CREATE INDEX ix_artifacts_run ON artifacts (tenant_id, run_id) WHERE run_id IS NOT NULL;
CREATE INDEX ix_artifacts_type ON artifacts (tenant_id, artifact_type, created_at DESC);
CREATE INDEX ix_artifacts_digest ON artifacts (tenant_id, digest);
-- Quotas: Rate-limit and concurrency controls
CREATE TABLE quotas (
quota_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
job_type TEXT,
max_active INTEGER NOT NULL DEFAULT 10,
max_per_hour INTEGER NOT NULL DEFAULT 1000,
burst_capacity INTEGER NOT NULL DEFAULT 50,
refill_rate DOUBLE PRECISION NOT NULL DEFAULT 1.0,
current_tokens DOUBLE PRECISION NOT NULL DEFAULT 50.0,
last_refill_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
current_active INTEGER NOT NULL DEFAULT 0,
current_hour_count INTEGER NOT NULL DEFAULT 0,
current_hour_start TIMESTAMPTZ NOT NULL DEFAULT DATE_TRUNC('hour', NOW()),
paused BOOLEAN NOT NULL DEFAULT FALSE,
pause_reason TEXT,
quota_ticket TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_by TEXT NOT NULL,
CONSTRAINT pk_quotas PRIMARY KEY (tenant_id, quota_id),
CONSTRAINT uq_quotas_tenant_type UNIQUE (tenant_id, job_type),
CONSTRAINT ck_quotas_max_active_positive CHECK (max_active > 0),
CONSTRAINT ck_quotas_max_per_hour_positive CHECK (max_per_hour > 0),
CONSTRAINT ck_quotas_burst_positive CHECK (burst_capacity > 0),
CONSTRAINT ck_quotas_refill_positive CHECK (refill_rate > 0)
) PARTITION BY LIST (tenant_id);
CREATE TABLE quotas_default PARTITION OF quotas DEFAULT;
CREATE INDEX ix_quotas_type ON quotas (tenant_id, job_type);
CREATE INDEX ix_quotas_paused ON quotas (tenant_id, paused) WHERE paused = TRUE;
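-- Illustrative atomic refill-and-consume (commented out; not executed by this
-- migration). Doing the refill and the decrement in one statement keeps
-- concurrent dispatchers from overdrawing the bucket. Literals are placeholders.
--
--   UPDATE quotas
--      SET current_tokens = LEAST(burst_capacity,
--            current_tokens + EXTRACT(EPOCH FROM (NOW() - last_refill_at)) * refill_rate) - 1,
--          last_refill_at = NOW()
--    WHERE tenant_id = 'tenant-a' AND job_type = 'scan'
--      AND LEAST(burst_capacity,
--            current_tokens + EXTRACT(EPOCH FROM (NOW() - last_refill_at)) * refill_rate) >= 1
--   RETURNING current_tokens;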
-- Schedules: Cron-based job triggers
CREATE TABLE schedules (
schedule_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
project_id TEXT,
source_id UUID NOT NULL,
name TEXT NOT NULL,
job_type TEXT NOT NULL,
cron_expression TEXT NOT NULL,
timezone TEXT NOT NULL DEFAULT 'UTC',
enabled BOOLEAN NOT NULL DEFAULT TRUE,
payload_template JSONB NOT NULL,
priority INTEGER NOT NULL DEFAULT 0,
max_attempts INTEGER NOT NULL DEFAULT 3,
last_triggered_at TIMESTAMPTZ,
next_trigger_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_by TEXT NOT NULL,
updated_by TEXT NOT NULL,
CONSTRAINT pk_schedules PRIMARY KEY (tenant_id, schedule_id),
CONSTRAINT uq_schedules_name UNIQUE (tenant_id, name),
CONSTRAINT fk_schedules_source FOREIGN KEY (tenant_id, source_id) REFERENCES sources (tenant_id, source_id),
CONSTRAINT ck_schedules_max_attempts_positive CHECK (max_attempts >= 1)
) PARTITION BY LIST (tenant_id);
CREATE TABLE schedules_default PARTITION OF schedules DEFAULT;
CREATE INDEX ix_schedules_enabled ON schedules (tenant_id, enabled, next_trigger_at) WHERE enabled = TRUE;
CREATE INDEX ix_schedules_next_trigger ON schedules (tenant_id, next_trigger_at) WHERE enabled = TRUE AND next_trigger_at IS NOT NULL;
CREATE INDEX ix_schedules_source ON schedules (tenant_id, source_id);
-- Incidents: Operational alerts and escalations
CREATE TABLE incidents (
incident_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
incident_type TEXT NOT NULL,
severity TEXT NOT NULL,
job_type TEXT,
source_id UUID,
title TEXT NOT NULL,
description TEXT NOT NULL,
status incident_status NOT NULL DEFAULT 'open',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
acknowledged_at TIMESTAMPTZ,
acknowledged_by TEXT,
resolved_at TIMESTAMPTZ,
resolved_by TEXT,
resolution_notes TEXT,
metadata JSONB,
CONSTRAINT pk_incidents PRIMARY KEY (tenant_id, incident_id),
CONSTRAINT ck_incidents_severity CHECK (severity IN ('warning', 'critical'))
) PARTITION BY LIST (tenant_id);
CREATE TABLE incidents_default PARTITION OF incidents DEFAULT;
CREATE INDEX ix_incidents_status ON incidents (tenant_id, status, created_at DESC);
CREATE INDEX ix_incidents_type ON incidents (tenant_id, incident_type, status);
CREATE INDEX ix_incidents_open ON incidents (tenant_id, severity, created_at DESC) WHERE status = 'open';
-- Throttles: Dynamic rate-limit overrides (pause/resume per source or job type)
CREATE TABLE throttles (
throttle_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
source_id UUID,
job_type TEXT,
active BOOLEAN NOT NULL DEFAULT TRUE,
reason TEXT NOT NULL,
ticket TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
expires_at TIMESTAMPTZ,
created_by TEXT NOT NULL,
CONSTRAINT pk_throttles PRIMARY KEY (tenant_id, throttle_id),
CONSTRAINT ck_throttles_scope CHECK (source_id IS NOT NULL OR job_type IS NOT NULL)
) PARTITION BY LIST (tenant_id);
CREATE TABLE throttles_default PARTITION OF throttles DEFAULT;
CREATE INDEX ix_throttles_active ON throttles (tenant_id, active, expires_at) WHERE active = TRUE;
CREATE INDEX ix_throttles_source ON throttles (tenant_id, source_id) WHERE source_id IS NOT NULL;
CREATE INDEX ix_throttles_type ON throttles (tenant_id, job_type) WHERE job_type IS NOT NULL;
COMMIT;

View File

@@ -0,0 +1,154 @@
-- 002_backfill.sql
-- Backfill and watermark tables for event-time window tracking (ORCH-SVC-33-003)
-- Adds watermarks, backfill_requests, and processed_events for duplicate suppression.
BEGIN;
-- Backfill request status
CREATE TYPE backfill_status AS ENUM (
'pending',
'validating',
'running',
'paused',
'completed',
'failed',
'canceled'
);
-- Watermarks: Per-source/job-type event-time cursors
CREATE TABLE watermarks (
watermark_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
source_id UUID,
job_type TEXT,
scope_key TEXT NOT NULL, -- Normalized scope identifier
high_watermark TIMESTAMPTZ NOT NULL, -- Latest processed event time
low_watermark TIMESTAMPTZ, -- Earliest event time in current window
sequence_number BIGINT NOT NULL DEFAULT 0,
processed_count BIGINT NOT NULL DEFAULT 0,
last_batch_hash CHAR(64), -- SHA-256 of last processed batch for integrity
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_by TEXT NOT NULL,
CONSTRAINT pk_watermarks PRIMARY KEY (tenant_id, watermark_id),
CONSTRAINT uq_watermarks_scope UNIQUE (tenant_id, scope_key),
CONSTRAINT ck_watermarks_hash_hex CHECK (last_batch_hash IS NULL OR last_batch_hash ~ '^[0-9a-f]{64}$')
) PARTITION BY LIST (tenant_id);
CREATE TABLE watermarks_default PARTITION OF watermarks DEFAULT;
CREATE INDEX ix_watermarks_source ON watermarks (tenant_id, source_id) WHERE source_id IS NOT NULL;
CREATE INDEX ix_watermarks_job_type ON watermarks (tenant_id, job_type) WHERE job_type IS NOT NULL;
-- Backfill Requests: Batch reprocessing operations
CREATE TABLE backfill_requests (
backfill_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
source_id UUID,
job_type TEXT,
scope_key TEXT NOT NULL,
status backfill_status NOT NULL DEFAULT 'pending',
-- Time window for backfill
window_start TIMESTAMPTZ NOT NULL,
window_end TIMESTAMPTZ NOT NULL,
-- Progress tracking
current_position TIMESTAMPTZ,
total_events BIGINT,
processed_events BIGINT NOT NULL DEFAULT 0,
skipped_events BIGINT NOT NULL DEFAULT 0, -- Duplicates skipped
failed_events BIGINT NOT NULL DEFAULT 0,
-- Configuration
batch_size INTEGER NOT NULL DEFAULT 100,
dry_run BOOLEAN NOT NULL DEFAULT FALSE,
force_reprocess BOOLEAN NOT NULL DEFAULT FALSE, -- Ignore duplicate suppression
-- Safety validations
estimated_duration INTERVAL,
max_duration INTERVAL,
safety_checks JSONB, -- Validation results
-- Audit
reason TEXT NOT NULL,
ticket TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
created_by TEXT NOT NULL,
updated_by TEXT NOT NULL,
error_message TEXT,
CONSTRAINT pk_backfill_requests PRIMARY KEY (tenant_id, backfill_id),
CONSTRAINT ck_backfill_window_order CHECK (window_end > window_start),
CONSTRAINT ck_backfill_batch_size CHECK (batch_size > 0 AND batch_size <= 10000)
) PARTITION BY LIST (tenant_id);
CREATE TABLE backfill_requests_default PARTITION OF backfill_requests DEFAULT;
CREATE INDEX ix_backfill_status ON backfill_requests (tenant_id, status, created_at DESC);
CREATE INDEX ix_backfill_scope ON backfill_requests (tenant_id, scope_key, created_at DESC);
CREATE INDEX ix_backfill_running ON backfill_requests (tenant_id, source_id, job_type) WHERE status IN ('running', 'validating');
-- Processed Events: Duplicate suppression tracking (TTL-managed)
CREATE TABLE processed_events (
tenant_id TEXT NOT NULL,
scope_key TEXT NOT NULL,
event_key TEXT NOT NULL, -- Unique identifier for deduplication
event_time TIMESTAMPTZ NOT NULL,
processed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
batch_id UUID, -- Backfill batch or run ID
expires_at TIMESTAMPTZ NOT NULL, -- TTL for automatic cleanup
CONSTRAINT pk_processed_events PRIMARY KEY (tenant_id, scope_key, event_key)
) PARTITION BY LIST (tenant_id);
CREATE TABLE processed_events_default PARTITION OF processed_events DEFAULT;
-- Note: index predicates must be immutable, so the expiry sweep relies on a
-- plain btree on expires_at rather than a NOW()-based partial index.
CREATE INDEX ix_processed_events_expires ON processed_events (expires_at);
CREATE INDEX ix_processed_events_time ON processed_events (tenant_id, scope_key, event_time DESC);
CREATE INDEX ix_processed_events_batch ON processed_events (tenant_id, batch_id) WHERE batch_id IS NOT NULL;
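-- Illustrative duplicate-suppression write (commented out; not executed by this
-- migration). The composite primary key makes ON CONFLICT DO NOTHING report,
-- via the affected row count, whether the event was already processed.
-- Literals are placeholders.
--
--   INSERT INTO processed_events (tenant_id, scope_key, event_key, event_time, expires_at)
--   VALUES ('tenant-a', 'source:scan', 'evt-42', NOW(), NOW() + INTERVAL '30 days')
--   ON CONFLICT (tenant_id, scope_key, event_key) DO NOTHING;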
-- Backfill Checkpoints: Resumable batch processing state
CREATE TABLE backfill_checkpoints (
checkpoint_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
backfill_id UUID NOT NULL,
batch_number INTEGER NOT NULL,
batch_start TIMESTAMPTZ NOT NULL,
batch_end TIMESTAMPTZ NOT NULL,
events_in_batch INTEGER NOT NULL,
events_processed INTEGER NOT NULL DEFAULT 0,
events_skipped INTEGER NOT NULL DEFAULT 0,
events_failed INTEGER NOT NULL DEFAULT 0,
batch_hash CHAR(64),
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
completed_at TIMESTAMPTZ,
error_message TEXT,
CONSTRAINT pk_backfill_checkpoints PRIMARY KEY (tenant_id, checkpoint_id),
CONSTRAINT fk_backfill_checkpoints_request FOREIGN KEY (tenant_id, backfill_id)
REFERENCES backfill_requests (tenant_id, backfill_id) ON DELETE CASCADE,
CONSTRAINT uq_backfill_checkpoints_batch UNIQUE (tenant_id, backfill_id, batch_number),
CONSTRAINT ck_backfill_checkpoints_hash_hex CHECK (batch_hash IS NULL OR batch_hash ~ '^[0-9a-f]{64}$')
) PARTITION BY LIST (tenant_id);
CREATE TABLE backfill_checkpoints_default PARTITION OF backfill_checkpoints DEFAULT;
CREATE INDEX ix_backfill_checkpoints_request ON backfill_checkpoints (tenant_id, backfill_id, batch_number);
-- Function to clean up expired processed events (called by background job)
CREATE OR REPLACE FUNCTION cleanup_expired_processed_events(batch_limit INTEGER DEFAULT 10000)
RETURNS INTEGER AS $$
DECLARE
deleted_count INTEGER;
BEGIN
WITH deleted AS (
DELETE FROM processed_events
-- Match on the primary key: ctid is only unique within a single partition.
WHERE (tenant_id, scope_key, event_key) IN (
SELECT tenant_id, scope_key, event_key FROM processed_events
WHERE expires_at < NOW()
LIMIT batch_limit
)
RETURNING 1
)
SELECT COUNT(*) INTO deleted_count FROM deleted;
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
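-- Example invocation for a background sweeper (commented out so the migration
-- performs no deletes; the batch size is a placeholder):
--
--   SELECT cleanup_expired_processed_events(5000);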
COMMIT;

View File

@@ -0,0 +1,278 @@
-- 003_dead_letter.sql
-- Dead-letter store for failed jobs with error classification and replay (ORCH-SVC-33-004)
-- Adds dead_letter_entries, replay_audit, and notification_rules tables.
BEGIN;
-- Dead-letter entry status
CREATE TYPE dead_letter_status AS ENUM (
'pending', -- Awaiting operator action or auto-replay
'replaying', -- Currently being replayed
'replayed', -- Successfully replayed as new job
'resolved', -- Manually resolved without replay
'exhausted', -- All replay attempts exhausted
'expired' -- Expired and eligible for purge
);
-- Error classification category
CREATE TYPE error_category AS ENUM (
'unknown', -- Unclassified error
'transient', -- Transient infrastructure error
'not_found', -- Resource not found
'auth_failure', -- Authentication/authorization failure
'rate_limited', -- Rate limiting or quota exceeded
'validation_error', -- Invalid input or configuration
'upstream_error', -- External service error
'internal_error', -- Internal processing error
'conflict', -- Resource conflict
'canceled' -- Operation canceled
);
-- Dead-letter Entries: Failed jobs awaiting remediation
CREATE TABLE dead_letter_entries (
entry_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
-- Original job reference
original_job_id UUID NOT NULL,
run_id UUID,
source_id UUID,
job_type TEXT NOT NULL,
-- Payload preservation
payload JSONB NOT NULL,
payload_digest CHAR(64) NOT NULL, -- SHA-256 of payload
idempotency_key TEXT NOT NULL,
correlation_id TEXT,
-- Status and classification
status dead_letter_status NOT NULL DEFAULT 'pending',
error_code TEXT NOT NULL,
failure_reason TEXT NOT NULL,
remediation_hint TEXT,
category error_category NOT NULL DEFAULT 'unknown',
is_retryable BOOLEAN NOT NULL DEFAULT FALSE,
-- Attempt tracking
original_attempts INTEGER NOT NULL,
replay_attempts INTEGER NOT NULL DEFAULT 0,
max_replay_attempts INTEGER NOT NULL DEFAULT 3,
-- Timestamps
failed_at TIMESTAMPTZ NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
expires_at TIMESTAMPTZ NOT NULL,
resolved_at TIMESTAMPTZ,
-- Resolution
resolution_notes TEXT,
-- Audit
created_by TEXT NOT NULL,
updated_by TEXT NOT NULL,
CONSTRAINT pk_dead_letter_entries PRIMARY KEY (tenant_id, entry_id),
CONSTRAINT ck_dead_letter_payload_digest CHECK (payload_digest ~ '^[0-9a-f]{64}$'),
CONSTRAINT ck_dead_letter_attempts CHECK (replay_attempts >= 0 AND replay_attempts <= max_replay_attempts + 1)
) PARTITION BY LIST (tenant_id);
CREATE TABLE dead_letter_entries_default PARTITION OF dead_letter_entries DEFAULT;
-- Indexes for common query patterns
CREATE INDEX ix_dead_letter_status ON dead_letter_entries (tenant_id, status, created_at DESC);
CREATE INDEX ix_dead_letter_job ON dead_letter_entries (tenant_id, original_job_id);
CREATE INDEX ix_dead_letter_job_type ON dead_letter_entries (tenant_id, job_type, status, created_at DESC);
CREATE INDEX ix_dead_letter_category ON dead_letter_entries (tenant_id, category, status);
CREATE INDEX ix_dead_letter_error_code ON dead_letter_entries (tenant_id, error_code, status);
CREATE INDEX ix_dead_letter_expires ON dead_letter_entries (expires_at) WHERE status NOT IN ('replayed', 'resolved', 'exhausted');
CREATE INDEX ix_dead_letter_source ON dead_letter_entries (tenant_id, source_id, status) WHERE source_id IS NOT NULL;
CREATE INDEX ix_dead_letter_run ON dead_letter_entries (tenant_id, run_id, status) WHERE run_id IS NOT NULL;
CREATE INDEX ix_dead_letter_retryable ON dead_letter_entries (tenant_id, is_retryable, status) WHERE is_retryable = TRUE AND status = 'pending';
-- Replay Audit: Track replay attempts for auditing and debugging
CREATE TABLE dead_letter_replay_audit (
audit_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
entry_id UUID NOT NULL,
attempt_number INTEGER NOT NULL,
-- Outcome
success BOOLEAN NOT NULL,
new_job_id UUID, -- If successful, the new job ID
error_message TEXT, -- If failed, the reason
-- Context
triggered_by TEXT NOT NULL, -- 'auto', 'manual', 'batch'
triggered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
completed_at TIMESTAMPTZ,
-- Audit
initiated_by TEXT NOT NULL,
CONSTRAINT pk_dead_letter_replay_audit PRIMARY KEY (tenant_id, audit_id),
CONSTRAINT fk_dead_letter_replay_audit_entry FOREIGN KEY (tenant_id, entry_id)
REFERENCES dead_letter_entries (tenant_id, entry_id) ON DELETE CASCADE,
CONSTRAINT uq_dead_letter_replay_audit_attempt UNIQUE (tenant_id, entry_id, attempt_number)
) PARTITION BY LIST (tenant_id);
CREATE TABLE dead_letter_replay_audit_default PARTITION OF dead_letter_replay_audit DEFAULT;
CREATE INDEX ix_dead_letter_replay_audit_entry ON dead_letter_replay_audit (tenant_id, entry_id, attempt_number);
CREATE INDEX ix_dead_letter_replay_audit_job ON dead_letter_replay_audit (tenant_id, new_job_id) WHERE new_job_id IS NOT NULL;
-- Notification Rules: Configure alerting for dead-letter events
CREATE TABLE dead_letter_notification_rules (
rule_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
-- Filter criteria (all optional - match any if not specified)
job_type_pattern TEXT, -- Regex pattern for job types
error_code_pattern TEXT, -- Regex pattern for error codes
category error_category,
source_id UUID,
-- Notification settings
enabled BOOLEAN NOT NULL DEFAULT TRUE,
channel TEXT NOT NULL, -- 'email', 'slack', 'teams', 'webhook'
endpoint TEXT NOT NULL, -- Email address, webhook URL, etc.
-- Throttling
cooldown_minutes INTEGER NOT NULL DEFAULT 15,
max_per_hour INTEGER NOT NULL DEFAULT 10,
aggregate BOOLEAN NOT NULL DEFAULT TRUE, -- Aggregate notifications
-- State
last_notified_at TIMESTAMPTZ,
notifications_sent INTEGER NOT NULL DEFAULT 0,
-- Audit
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_by TEXT NOT NULL,
updated_by TEXT NOT NULL,
CONSTRAINT pk_dead_letter_notification_rules PRIMARY KEY (tenant_id, rule_id),
CONSTRAINT ck_dead_letter_notification_channel CHECK (channel IN ('email', 'slack', 'teams', 'webhook', 'pagerduty')),
CONSTRAINT ck_dead_letter_notification_cooldown CHECK (cooldown_minutes >= 0),
CONSTRAINT ck_dead_letter_notification_max_per_hour CHECK (max_per_hour > 0)
) PARTITION BY LIST (tenant_id);
CREATE TABLE dead_letter_notification_rules_default PARTITION OF dead_letter_notification_rules DEFAULT;
CREATE INDEX ix_dead_letter_notification_rules_enabled ON dead_letter_notification_rules (tenant_id, enabled) WHERE enabled = TRUE;
CREATE INDEX ix_dead_letter_notification_rules_source ON dead_letter_notification_rules (tenant_id, source_id) WHERE source_id IS NOT NULL;
CREATE INDEX ix_dead_letter_notification_rules_category ON dead_letter_notification_rules (tenant_id, category) WHERE category IS NOT NULL;
-- Notification Log: Track sent notifications for throttling and auditing
CREATE TABLE dead_letter_notification_log (
log_id UUID NOT NULL,
tenant_id TEXT NOT NULL,
rule_id UUID NOT NULL,
entry_ids UUID[] NOT NULL, -- Entries included in this notification
channel TEXT NOT NULL,
endpoint TEXT NOT NULL,
-- Outcome
success BOOLEAN NOT NULL,
error_message TEXT,
-- Context
subject TEXT,
entry_count INTEGER NOT NULL,
sent_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
CONSTRAINT pk_dead_letter_notification_log PRIMARY KEY (tenant_id, log_id),
CONSTRAINT fk_dead_letter_notification_log_rule FOREIGN KEY (tenant_id, rule_id)
REFERENCES dead_letter_notification_rules (tenant_id, rule_id) ON DELETE CASCADE
) PARTITION BY LIST (tenant_id);
CREATE TABLE dead_letter_notification_log_default PARTITION OF dead_letter_notification_log DEFAULT;
CREATE INDEX ix_dead_letter_notification_log_rule ON dead_letter_notification_log (tenant_id, rule_id, sent_at DESC);
CREATE INDEX ix_dead_letter_notification_log_sent ON dead_letter_notification_log (tenant_id, sent_at DESC);
-- Dead-letter statistics view
CREATE OR REPLACE VIEW dead_letter_stats AS
SELECT
tenant_id,
status,
category,
error_code,
job_type,
is_retryable,
COUNT(*) AS entry_count,
COUNT(*) FILTER (WHERE replay_attempts = 0) AS never_replayed,
AVG(replay_attempts)::NUMERIC(5,2) AS avg_replay_attempts,
MIN(created_at) AS oldest_entry,
MAX(created_at) AS newest_entry,
COUNT(*) FILTER (WHERE expires_at < NOW()) AS expired_count
FROM dead_letter_entries
GROUP BY tenant_id, status, category, error_code, job_type, is_retryable;
-- Function to mark expired entries
CREATE OR REPLACE FUNCTION mark_expired_dead_letter_entries(batch_limit INTEGER DEFAULT 1000)
RETURNS INTEGER AS $$
DECLARE
updated_count INTEGER;
BEGIN
WITH expired AS (
UPDATE dead_letter_entries
SET status = 'expired',
updated_at = NOW(),
updated_by = 'system'
WHERE (tenant_id, entry_id) IN (
SELECT tenant_id, entry_id FROM dead_letter_entries
WHERE status NOT IN ('replayed', 'resolved', 'exhausted', 'expired')
AND expires_at < NOW()
LIMIT batch_limit
)
RETURNING 1
)
SELECT COUNT(*) INTO updated_count FROM expired;
RETURN updated_count;
END;
$$ LANGUAGE plpgsql;
-- Function to purge old resolved/expired entries (retention cleanup)
CREATE OR REPLACE FUNCTION purge_dead_letter_entries(
retention_days INTEGER DEFAULT 90,
batch_limit INTEGER DEFAULT 1000
)
RETURNS INTEGER AS $$
DECLARE
deleted_count INTEGER;
cutoff_date TIMESTAMPTZ;
BEGIN
cutoff_date := NOW() - (retention_days || ' days')::INTERVAL;
WITH deleted AS (
DELETE FROM dead_letter_entries
WHERE (tenant_id, entry_id) IN (
SELECT tenant_id, entry_id FROM dead_letter_entries
WHERE status IN ('replayed', 'resolved', 'exhausted', 'expired')
AND updated_at < cutoff_date
LIMIT batch_limit
)
RETURNING 1
)
SELECT COUNT(*) INTO deleted_count FROM deleted;
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
-- Function to get actionable dead-letter entries (for dashboard)
CREATE OR REPLACE FUNCTION get_actionable_dead_letter_summary(
p_tenant_id TEXT,
p_limit INTEGER DEFAULT 10
)
RETURNS TABLE (
error_code TEXT,
category error_category,
entry_count BIGINT,
retryable_count BIGINT,
oldest_entry TIMESTAMPTZ,
sample_reason TEXT
) AS $$
BEGIN
RETURN QUERY
SELECT
dle.error_code,
dle.category,
COUNT(*)::BIGINT AS entry_count,
COUNT(*) FILTER (WHERE dle.is_retryable)::BIGINT AS retryable_count,
MIN(dle.created_at) AS oldest_entry,
(SELECT failure_reason FROM dead_letter_entries
WHERE tenant_id = p_tenant_id AND error_code = dle.error_code AND status = 'pending'
ORDER BY created_at DESC LIMIT 1) AS sample_reason
FROM dead_letter_entries dle
WHERE dle.tenant_id = p_tenant_id
AND dle.status = 'pending'
GROUP BY dle.error_code, dle.category
ORDER BY COUNT(*) DESC
LIMIT p_limit;
END;
$$ LANGUAGE plpgsql STABLE;
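-- Example dashboard invocation (commented out; tenant id and limit are placeholders):
--
--   SELECT * FROM get_actionable_dead_letter_summary('tenant-a', 5);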
COMMIT;

View File

@@ -0,0 +1,243 @@
-- Migration: 004_slo_quotas
-- Creates tables for SLO management and quota APIs
-- SLO definitions table
CREATE TABLE IF NOT EXISTS slos (
slo_id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
name TEXT NOT NULL,
description TEXT,
slo_type TEXT NOT NULL CHECK (slo_type IN ('availability', 'latency', 'throughput')),
job_type TEXT,
source_id UUID,
target DOUBLE PRECISION NOT NULL CHECK (target > 0 AND target <= 1),
-- "window" is quoted because WINDOW is a reserved word in PostgreSQL.
"window" TEXT NOT NULL CHECK ("window" IN ('one_hour', 'one_day', 'seven_days', 'thirty_days')),
latency_percentile DOUBLE PRECISION CHECK (latency_percentile IS NULL OR (latency_percentile >= 0 AND latency_percentile <= 1)),
latency_target_seconds DOUBLE PRECISION CHECK (latency_target_seconds IS NULL OR latency_target_seconds > 0),
throughput_minimum INTEGER CHECK (throughput_minimum IS NULL OR throughput_minimum > 0),
enabled BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_by TEXT NOT NULL,
updated_by TEXT NOT NULL,
UNIQUE (tenant_id, name)
);
-- Indexes for SLOs
CREATE INDEX IF NOT EXISTS idx_slos_tenant ON slos(tenant_id);
CREATE INDEX IF NOT EXISTS idx_slos_tenant_enabled ON slos(tenant_id, enabled) WHERE enabled = TRUE;
CREATE INDEX IF NOT EXISTS idx_slos_tenant_job_type ON slos(tenant_id, job_type);
CREATE INDEX IF NOT EXISTS idx_slos_tenant_source ON slos(tenant_id, source_id);
-- Alert budget thresholds table
CREATE TABLE IF NOT EXISTS alert_budget_thresholds (
threshold_id UUID PRIMARY KEY,
slo_id UUID NOT NULL REFERENCES slos(slo_id) ON DELETE CASCADE,
tenant_id TEXT NOT NULL,
budget_consumed_threshold DOUBLE PRECISION NOT NULL CHECK (budget_consumed_threshold >= 0 AND budget_consumed_threshold <= 1),
burn_rate_threshold DOUBLE PRECISION CHECK (burn_rate_threshold IS NULL OR burn_rate_threshold > 0),
severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical', 'emergency')),
enabled BOOLEAN NOT NULL DEFAULT TRUE,
notification_channel TEXT,
notification_endpoint TEXT,
cooldown_seconds INTEGER NOT NULL DEFAULT 3600,
last_triggered_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
created_by TEXT NOT NULL,
updated_by TEXT NOT NULL
);
-- Indexes for alert thresholds
CREATE INDEX IF NOT EXISTS idx_alert_thresholds_slo ON alert_budget_thresholds(slo_id);
CREATE INDEX IF NOT EXISTS idx_alert_thresholds_tenant ON alert_budget_thresholds(tenant_id);
CREATE INDEX IF NOT EXISTS idx_alert_thresholds_enabled ON alert_budget_thresholds(slo_id, enabled) WHERE enabled = TRUE;
-- SLO alerts table
CREATE TABLE IF NOT EXISTS slo_alerts (
alert_id UUID PRIMARY KEY,
slo_id UUID NOT NULL REFERENCES slos(slo_id) ON DELETE CASCADE,
threshold_id UUID NOT NULL REFERENCES alert_budget_thresholds(threshold_id) ON DELETE CASCADE,
tenant_id TEXT NOT NULL,
severity TEXT NOT NULL CHECK (severity IN ('info', 'warning', 'critical', 'emergency')),
message TEXT NOT NULL,
budget_consumed DOUBLE PRECISION NOT NULL,
burn_rate DOUBLE PRECISION NOT NULL,
current_sli DOUBLE PRECISION NOT NULL,
triggered_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
acknowledged_at TIMESTAMPTZ,
acknowledged_by TEXT,
resolved_at TIMESTAMPTZ,
resolution_notes TEXT
);
-- Indexes for SLO alerts
CREATE INDEX IF NOT EXISTS idx_slo_alerts_tenant ON slo_alerts(tenant_id);
CREATE INDEX IF NOT EXISTS idx_slo_alerts_slo ON slo_alerts(slo_id);
CREATE INDEX IF NOT EXISTS idx_slo_alerts_tenant_triggered ON slo_alerts(tenant_id, triggered_at DESC);
CREATE INDEX IF NOT EXISTS idx_slo_alerts_active ON slo_alerts(tenant_id, resolved_at) WHERE resolved_at IS NULL;
CREATE INDEX IF NOT EXISTS idx_slo_alerts_unacknowledged ON slo_alerts(tenant_id, acknowledged_at) WHERE acknowledged_at IS NULL;
-- SLO state snapshots for historical tracking
CREATE TABLE IF NOT EXISTS slo_state_snapshots (
snapshot_id UUID PRIMARY KEY,
slo_id UUID NOT NULL REFERENCES slos(slo_id) ON DELETE CASCADE,
tenant_id TEXT NOT NULL,
current_sli DOUBLE PRECISION NOT NULL,
total_events BIGINT NOT NULL,
good_events BIGINT NOT NULL,
bad_events BIGINT NOT NULL,
budget_consumed DOUBLE PRECISION NOT NULL,
budget_remaining DOUBLE PRECISION NOT NULL,
burn_rate DOUBLE PRECISION NOT NULL,
is_met BOOLEAN NOT NULL,
alert_severity TEXT NOT NULL,
computed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
window_start TIMESTAMPTZ NOT NULL,
window_end TIMESTAMPTZ NOT NULL
);
-- Indexes for state snapshots
CREATE INDEX IF NOT EXISTS idx_slo_snapshots_slo ON slo_state_snapshots(slo_id, computed_at DESC);
CREATE INDEX IF NOT EXISTS idx_slo_snapshots_tenant ON slo_state_snapshots(tenant_id, computed_at DESC);
CREATE INDEX IF NOT EXISTS idx_slo_snapshots_cleanup ON slo_state_snapshots(computed_at);
-- Quota audit log for tracking changes
CREATE TABLE IF NOT EXISTS quota_audit_log (
audit_id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
quota_id UUID NOT NULL,
action TEXT NOT NULL CHECK (action IN ('created', 'updated', 'paused', 'resumed', 'deleted')),
old_values JSONB,
new_values JSONB,
reason TEXT,
ticket TEXT,
performed_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
performed_by TEXT NOT NULL
);
-- Indexes for quota audit log
CREATE INDEX IF NOT EXISTS idx_quota_audit_tenant ON quota_audit_log(tenant_id);
CREATE INDEX IF NOT EXISTS idx_quota_audit_quota ON quota_audit_log(quota_id);
CREATE INDEX IF NOT EXISTS idx_quota_audit_time ON quota_audit_log(performed_at DESC);
-- Job metrics aggregation table for SLO computation
-- Stores pre-aggregated metrics per hour for efficient SLO queries
CREATE TABLE IF NOT EXISTS job_metrics_hourly (
metric_id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
job_type TEXT,
source_id UUID,
hour_start TIMESTAMPTZ NOT NULL,
total_jobs BIGINT NOT NULL DEFAULT 0,
successful_jobs BIGINT NOT NULL DEFAULT 0,
failed_jobs BIGINT NOT NULL DEFAULT 0,
latency_p50_seconds DOUBLE PRECISION,
latency_p95_seconds DOUBLE PRECISION,
latency_p99_seconds DOUBLE PRECISION,
avg_latency_seconds DOUBLE PRECISION,
min_latency_seconds DOUBLE PRECISION,
max_latency_seconds DOUBLE PRECISION,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE (tenant_id, job_type, source_id, hour_start)
);
-- Indexes for job metrics
CREATE INDEX IF NOT EXISTS idx_job_metrics_tenant ON job_metrics_hourly(tenant_id, hour_start DESC);
CREATE INDEX IF NOT EXISTS idx_job_metrics_tenant_type ON job_metrics_hourly(tenant_id, job_type, hour_start DESC);
CREATE INDEX IF NOT EXISTS idx_job_metrics_cleanup ON job_metrics_hourly(hour_start);
-- Function to aggregate job metrics for SLO computation
CREATE OR REPLACE FUNCTION get_slo_availability_counts(
p_tenant_id TEXT,
p_job_type TEXT,
p_source_id UUID,
p_window_start TIMESTAMPTZ,
p_window_end TIMESTAMPTZ
) RETURNS TABLE (
total_events BIGINT,
good_events BIGINT,
bad_events BIGINT
) AS $$
BEGIN
RETURN QUERY
SELECT
COALESCE(SUM(total_jobs), 0)::BIGINT AS total_events,
COALESCE(SUM(successful_jobs), 0)::BIGINT AS good_events,
COALESCE(SUM(failed_jobs), 0)::BIGINT AS bad_events
FROM job_metrics_hourly
WHERE tenant_id = p_tenant_id
AND hour_start >= p_window_start
AND hour_start < p_window_end
AND (p_job_type IS NULL OR job_type = p_job_type)
AND (p_source_id IS NULL OR source_id = p_source_id);
END;
$$ LANGUAGE plpgsql;
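-- Illustrative availability computation on top of the counts (commented out).
-- SLI = good/total; budget consumed = observed error rate / allowed error rate.
-- The 0.999 target, tenant, job type, and window are placeholders.
--
--   SELECT good_events::DOUBLE PRECISION / NULLIF(total_events, 0) AS sli,
--          (1 - good_events::DOUBLE PRECISION / NULLIF(total_events, 0)) / (1 - 0.999) AS budget_consumed
--     FROM get_slo_availability_counts('tenant-a', 'scan', NULL, NOW() - INTERVAL '30 days', NOW());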
-- Function to clean up old SLO state snapshots
CREATE OR REPLACE FUNCTION cleanup_slo_snapshots(
p_retention_days INTEGER DEFAULT 90,
p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
deleted_count INTEGER;
BEGIN
WITH deleted AS (
DELETE FROM slo_state_snapshots
WHERE snapshot_id IN (
SELECT snapshot_id FROM slo_state_snapshots
WHERE computed_at < NOW() - (p_retention_days || ' days')::INTERVAL
LIMIT p_batch_limit
)
RETURNING 1
)
SELECT COUNT(*) INTO deleted_count FROM deleted;
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
-- Function to clean up old quota audit logs
CREATE OR REPLACE FUNCTION cleanup_quota_audit_log(
p_retention_days INTEGER DEFAULT 365,
p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
deleted_count INTEGER;
BEGIN
WITH deleted AS (
DELETE FROM quota_audit_log
WHERE audit_id IN (
SELECT audit_id FROM quota_audit_log
WHERE performed_at < NOW() - (p_retention_days || ' days')::INTERVAL
LIMIT p_batch_limit
)
RETURNING 1
)
SELECT COUNT(*) INTO deleted_count FROM deleted;
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
-- Function to get SLO summary for a tenant
CREATE OR REPLACE FUNCTION get_slo_summary(
p_tenant_id TEXT
) RETURNS TABLE (
total_slos BIGINT,
enabled_slos BIGINT,
active_alerts BIGINT,
unacknowledged_alerts BIGINT,
critical_alerts BIGINT
) AS $$
BEGIN
RETURN QUERY
SELECT
(SELECT COUNT(*) FROM slos WHERE tenant_id = p_tenant_id)::BIGINT AS total_slos,
(SELECT COUNT(*) FROM slos WHERE tenant_id = p_tenant_id AND enabled = TRUE)::BIGINT AS enabled_slos,
(SELECT COUNT(*) FROM slo_alerts WHERE tenant_id = p_tenant_id AND resolved_at IS NULL)::BIGINT AS active_alerts,
(SELECT COUNT(*) FROM slo_alerts WHERE tenant_id = p_tenant_id AND acknowledged_at IS NULL AND resolved_at IS NULL)::BIGINT AS unacknowledged_alerts,
(SELECT COUNT(*) FROM slo_alerts WHERE tenant_id = p_tenant_id AND severity IN ('critical', 'emergency') AND resolved_at IS NULL)::BIGINT AS critical_alerts;
END;
$$ LANGUAGE plpgsql;
COMMENT ON TABLE slos IS 'Service Level Objective definitions for tenants';
COMMENT ON TABLE alert_budget_thresholds IS 'Alert thresholds for SLO error budget consumption';
COMMENT ON TABLE slo_alerts IS 'SLO alert events triggered by threshold violations';
COMMENT ON TABLE slo_state_snapshots IS 'Historical snapshots of SLO state for trend analysis';
COMMENT ON TABLE quota_audit_log IS 'Audit trail for quota configuration changes';
COMMENT ON TABLE job_metrics_hourly IS 'Pre-aggregated hourly job metrics for efficient SLO computation';

View File

@@ -0,0 +1,417 @@
-- Migration: 005_audit_ledger
-- Creates tables for audit logging and immutable run ledger
-- Audit log entries table (immutable append-only log)
CREATE TABLE IF NOT EXISTS audit_entries (
entry_id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
event_type INTEGER NOT NULL,
resource_type TEXT NOT NULL,
resource_id UUID NOT NULL,
actor_id TEXT NOT NULL,
actor_type INTEGER NOT NULL,
actor_ip TEXT,
user_agent TEXT,
http_method TEXT,
request_path TEXT,
old_state JSONB,
new_state JSONB,
description TEXT NOT NULL,
correlation_id TEXT,
previous_entry_hash TEXT,
content_hash TEXT NOT NULL,
sequence_number BIGINT NOT NULL,
occurred_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
metadata JSONB
);
-- Indexes for audit log
CREATE INDEX IF NOT EXISTS idx_audit_tenant ON audit_entries(tenant_id);
CREATE INDEX IF NOT EXISTS idx_audit_tenant_time ON audit_entries(tenant_id, occurred_at DESC);
CREATE INDEX IF NOT EXISTS idx_audit_tenant_seq ON audit_entries(tenant_id, sequence_number DESC);
CREATE INDEX IF NOT EXISTS idx_audit_resource ON audit_entries(tenant_id, resource_type, resource_id);
CREATE INDEX IF NOT EXISTS idx_audit_actor ON audit_entries(tenant_id, actor_id);
CREATE INDEX IF NOT EXISTS idx_audit_event_type ON audit_entries(tenant_id, event_type);
CREATE INDEX IF NOT EXISTS idx_audit_correlation ON audit_entries(correlation_id) WHERE correlation_id IS NOT NULL;
-- Run ledger entries table (immutable run execution records)
CREATE TABLE IF NOT EXISTS run_ledger_entries (
ledger_id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
run_id UUID NOT NULL,
source_id UUID NOT NULL,
run_type TEXT NOT NULL,
final_status INTEGER NOT NULL,
total_jobs INTEGER NOT NULL,
succeeded_jobs INTEGER NOT NULL,
failed_jobs INTEGER NOT NULL,
run_created_at TIMESTAMPTZ NOT NULL,
run_started_at TIMESTAMPTZ,
run_completed_at TIMESTAMPTZ NOT NULL,
execution_duration_ms BIGINT NOT NULL,
initiated_by TEXT NOT NULL,
input_digest TEXT NOT NULL,
output_digest TEXT NOT NULL,
artifact_manifest JSONB NOT NULL,
sequence_number BIGINT NOT NULL,
previous_entry_hash TEXT,
content_hash TEXT NOT NULL,
ledger_created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
correlation_id TEXT,
metadata JSONB
);
-- Indexes for run ledger
CREATE INDEX IF NOT EXISTS idx_ledger_tenant ON run_ledger_entries(tenant_id);
CREATE INDEX IF NOT EXISTS idx_ledger_tenant_time ON run_ledger_entries(tenant_id, ledger_created_at DESC);
CREATE INDEX IF NOT EXISTS idx_ledger_tenant_seq ON run_ledger_entries(tenant_id, sequence_number DESC);
CREATE INDEX IF NOT EXISTS idx_ledger_run ON run_ledger_entries(run_id);
CREATE INDEX IF NOT EXISTS idx_ledger_source ON run_ledger_entries(tenant_id, source_id);
CREATE INDEX IF NOT EXISTS idx_ledger_run_type ON run_ledger_entries(tenant_id, run_type);
CREATE INDEX IF NOT EXISTS idx_ledger_content_hash ON run_ledger_entries(content_hash);
CREATE UNIQUE INDEX IF NOT EXISTS idx_ledger_tenant_run ON run_ledger_entries(tenant_id, run_id);
-- Ledger exports table
CREATE TABLE IF NOT EXISTS ledger_exports (
export_id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
status INTEGER NOT NULL DEFAULT 0,
format TEXT NOT NULL CHECK (format IN ('json', 'ndjson', 'csv')),
start_time TIMESTAMPTZ,
end_time TIMESTAMPTZ,
run_type_filter TEXT,
source_id_filter UUID,
entry_count INTEGER NOT NULL DEFAULT 0,
output_uri TEXT,
output_digest TEXT,
output_size_bytes BIGINT,
requested_by TEXT NOT NULL,
requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
error_message TEXT
);
-- Indexes for ledger exports
CREATE INDEX IF NOT EXISTS idx_exports_tenant ON ledger_exports(tenant_id);
CREATE INDEX IF NOT EXISTS idx_exports_tenant_time ON ledger_exports(tenant_id, requested_at DESC);
CREATE INDEX IF NOT EXISTS idx_exports_status ON ledger_exports(tenant_id, status);
-- Signed manifests table
CREATE TABLE IF NOT EXISTS signed_manifests (
manifest_id UUID PRIMARY KEY,
schema_version TEXT NOT NULL,
tenant_id TEXT NOT NULL,
provenance_type INTEGER NOT NULL,
subject_id UUID NOT NULL,
statements JSONB NOT NULL,
artifacts JSONB NOT NULL,
materials JSONB NOT NULL,
build_info JSONB,
payload_digest TEXT NOT NULL,
signature_algorithm TEXT NOT NULL,
signature TEXT NOT NULL,
key_id TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
expires_at TIMESTAMPTZ,
metadata JSONB
);
-- Indexes for signed manifests
CREATE INDEX IF NOT EXISTS idx_manifests_tenant ON signed_manifests(tenant_id);
CREATE INDEX IF NOT EXISTS idx_manifests_subject ON signed_manifests(tenant_id, provenance_type, subject_id);
CREATE INDEX IF NOT EXISTS idx_manifests_payload ON signed_manifests(payload_digest);
CREATE INDEX IF NOT EXISTS idx_manifests_key ON signed_manifests(key_id);
CREATE INDEX IF NOT EXISTS idx_manifests_expiry ON signed_manifests(expires_at) WHERE expires_at IS NOT NULL;
-- Sequence tracking for audit entries per tenant
CREATE TABLE IF NOT EXISTS audit_sequences (
tenant_id TEXT PRIMARY KEY,
last_sequence_number BIGINT NOT NULL DEFAULT 0,
last_entry_hash TEXT,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Sequence tracking for ledger entries per tenant
CREATE TABLE IF NOT EXISTS ledger_sequences (
tenant_id TEXT PRIMARY KEY,
last_sequence_number BIGINT NOT NULL DEFAULT 0,
last_entry_hash TEXT,
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Function to get the next audit sequence number for a tenant
CREATE OR REPLACE FUNCTION next_audit_sequence(
p_tenant_id TEXT
) RETURNS TABLE (
next_seq BIGINT,
prev_hash TEXT
) AS $$
DECLARE
v_next_seq BIGINT;
v_prev_hash TEXT;
BEGIN
-- Lock and update the sequence
INSERT INTO audit_sequences (tenant_id, last_sequence_number, last_entry_hash, updated_at)
VALUES (p_tenant_id, 1, NULL, NOW())
ON CONFLICT (tenant_id)
DO UPDATE SET
last_sequence_number = audit_sequences.last_sequence_number + 1,
updated_at = NOW()
RETURNING audit_sequences.last_sequence_number, audit_sequences.last_entry_hash
INTO v_next_seq, v_prev_hash;
RETURN QUERY SELECT v_next_seq, v_prev_hash;
END;
$$ LANGUAGE plpgsql;
-- Function to update audit sequence with new hash after insertion
CREATE OR REPLACE FUNCTION update_audit_sequence_hash(
p_tenant_id TEXT,
p_content_hash TEXT
) RETURNS VOID AS $$
BEGIN
UPDATE audit_sequences
SET last_entry_hash = p_content_hash,
updated_at = NOW()
WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;
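-- Illustrative append flow (commented out): reserve the sequence and previous
-- hash, insert the entry with a content hash computed over the canonical
-- payload in application code, then record that hash for the next link.
-- Values are placeholders.
--
--   SELECT * FROM next_audit_sequence('tenant-a');   -- returns (next_seq, prev_hash)
--   -- INSERT INTO audit_entries (..., previous_entry_hash, content_hash, sequence_number) VALUES (...);
--   SELECT update_audit_sequence_hash('tenant-a', '<content-hash>');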
-- Function to get the next ledger sequence number for a tenant
CREATE OR REPLACE FUNCTION next_ledger_sequence(
p_tenant_id TEXT
) RETURNS TABLE (
next_seq BIGINT,
prev_hash TEXT
) AS $$
DECLARE
v_next_seq BIGINT;
v_prev_hash TEXT;
BEGIN
-- Lock and update the sequence
INSERT INTO ledger_sequences (tenant_id, last_sequence_number, last_entry_hash, updated_at)
VALUES (p_tenant_id, 1, NULL, NOW())
ON CONFLICT (tenant_id)
DO UPDATE SET
last_sequence_number = ledger_sequences.last_sequence_number + 1,
updated_at = NOW()
RETURNING ledger_sequences.last_sequence_number, ledger_sequences.last_entry_hash
INTO v_next_seq, v_prev_hash;
RETURN QUERY SELECT v_next_seq, v_prev_hash;
END;
$$ LANGUAGE plpgsql;
-- Function to update ledger sequence with new hash after insertion
CREATE OR REPLACE FUNCTION update_ledger_sequence_hash(
p_tenant_id TEXT,
p_content_hash TEXT
) RETURNS VOID AS $$
BEGIN
UPDATE ledger_sequences
SET last_entry_hash = p_content_hash,
updated_at = NOW()
WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;
-- Function to verify audit chain integrity
CREATE OR REPLACE FUNCTION verify_audit_chain(
p_tenant_id TEXT,
p_start_seq BIGINT DEFAULT 1,
p_end_seq BIGINT DEFAULT NULL
) RETURNS TABLE (
is_valid BOOLEAN,
invalid_entry_id UUID,
invalid_sequence BIGINT,
error_message TEXT
) AS $$
DECLARE
v_prev_hash TEXT;
v_entry RECORD;
BEGIN
FOR v_entry IN
SELECT entry_id, sequence_number, previous_entry_hash, content_hash
FROM audit_entries
WHERE tenant_id = p_tenant_id
AND sequence_number >= p_start_seq
AND (p_end_seq IS NULL OR sequence_number <= p_end_seq)
ORDER BY sequence_number ASC
LOOP
-- The first entry in the chain (sequence 1) must not carry a previous hash
IF v_entry.sequence_number = 1 AND v_entry.previous_entry_hash IS NOT NULL THEN
RETURN QUERY SELECT FALSE, v_entry.entry_id, v_entry.sequence_number,
'First entry should have null previous_entry_hash'::TEXT;
RETURN;
END IF;
-- Check chain link
IF v_prev_hash IS NOT NULL AND v_entry.previous_entry_hash != v_prev_hash THEN
RETURN QUERY SELECT FALSE, v_entry.entry_id, v_entry.sequence_number,
format('Chain break: expected %s, got %s', v_prev_hash, v_entry.previous_entry_hash);
RETURN;
END IF;
v_prev_hash := v_entry.content_hash;
END LOOP;
RETURN QUERY SELECT TRUE, NULL::UUID, NULL::BIGINT, NULL::TEXT;
END;
$$ LANGUAGE plpgsql;
-- Function to verify ledger chain integrity
CREATE OR REPLACE FUNCTION verify_ledger_chain(
p_tenant_id TEXT,
p_start_seq BIGINT DEFAULT 1,
p_end_seq BIGINT DEFAULT NULL
) RETURNS TABLE (
is_valid BOOLEAN,
invalid_ledger_id UUID,
invalid_sequence BIGINT,
error_message TEXT
) AS $$
DECLARE
v_prev_hash TEXT;
v_entry RECORD;
BEGIN
FOR v_entry IN
SELECT ledger_id, sequence_number, previous_entry_hash, content_hash
FROM run_ledger_entries
WHERE tenant_id = p_tenant_id
AND sequence_number >= p_start_seq
AND (p_end_seq IS NULL OR sequence_number <= p_end_seq)
ORDER BY sequence_number ASC
LOOP
-- The first entry in the chain (sequence 1) must not carry a previous hash
IF v_entry.sequence_number = 1 AND v_entry.previous_entry_hash IS NOT NULL THEN
RETURN QUERY SELECT FALSE, v_entry.ledger_id, v_entry.sequence_number,
'First entry should have null previous_entry_hash'::TEXT;
RETURN;
END IF;
-- Check chain link
IF v_prev_hash IS NOT NULL AND v_entry.previous_entry_hash != v_prev_hash THEN
RETURN QUERY SELECT FALSE, v_entry.ledger_id, v_entry.sequence_number,
format('Chain break: expected %s, got %s', v_prev_hash, v_entry.previous_entry_hash);
RETURN;
END IF;
v_prev_hash := v_entry.content_hash;
END LOOP;
RETURN QUERY SELECT TRUE, NULL::UUID, NULL::BIGINT, NULL::TEXT;
END;
$$ LANGUAGE plpgsql;
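-- Illustrative verification queries:
--   SELECT * FROM verify_audit_chain('tenant-a');             -- full chain
--   SELECT * FROM verify_ledger_chain('tenant-a', 100, 200);  -- bounded range
-- Note: when p_start_seq > 1 the first in-range row's previous_entry_hash is
-- taken on trust, because its predecessor lies outside the scanned range.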
-- Function to get audit summary statistics
CREATE OR REPLACE FUNCTION get_audit_summary(
p_tenant_id TEXT,
p_since TIMESTAMPTZ DEFAULT NULL
) RETURNS TABLE (
total_entries BIGINT,
entries_since BIGINT,
event_types BIGINT,
unique_actors BIGINT,
unique_resources BIGINT,
earliest_entry TIMESTAMPTZ,
latest_entry TIMESTAMPTZ
) AS $$
BEGIN
RETURN QUERY
SELECT
COUNT(*)::BIGINT AS total_entries,
COUNT(*) FILTER (WHERE p_since IS NULL OR occurred_at >= p_since)::BIGINT AS entries_since,
COUNT(DISTINCT event_type)::BIGINT AS event_types,
COUNT(DISTINCT actor_id)::BIGINT AS unique_actors,
COUNT(DISTINCT (resource_type, resource_id))::BIGINT AS unique_resources,
MIN(occurred_at) AS earliest_entry,
MAX(occurred_at) AS latest_entry
FROM audit_entries
WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;
-- Function to get ledger summary statistics
CREATE OR REPLACE FUNCTION get_ledger_summary(
p_tenant_id TEXT,
p_since TIMESTAMPTZ DEFAULT NULL
) RETURNS TABLE (
total_entries BIGINT,
entries_since BIGINT,
total_runs BIGINT,
successful_runs BIGINT,
failed_runs BIGINT,
total_jobs BIGINT,
unique_sources BIGINT,
unique_run_types BIGINT,
earliest_entry TIMESTAMPTZ,
latest_entry TIMESTAMPTZ
) AS $$
BEGIN
RETURN QUERY
SELECT
COUNT(*)::BIGINT AS total_entries,
COUNT(*) FILTER (WHERE p_since IS NULL OR ledger_created_at >= p_since)::BIGINT AS entries_since,
COUNT(*)::BIGINT AS total_runs,
COUNT(*) FILTER (WHERE final_status = 2)::BIGINT AS successful_runs, -- RunStatus.Succeeded = 2
COUNT(*) FILTER (WHERE final_status IN (3, 4))::BIGINT AS failed_runs, -- PartiallySucceeded = 3, Failed = 4
COALESCE(SUM(total_jobs), 0)::BIGINT AS total_jobs,
COUNT(DISTINCT source_id)::BIGINT AS unique_sources,
COUNT(DISTINCT run_type)::BIGINT AS unique_run_types,
MIN(ledger_created_at) AS earliest_entry,
MAX(ledger_created_at) AS latest_entry
FROM run_ledger_entries
WHERE tenant_id = p_tenant_id;
END;
$$ LANGUAGE plpgsql;
-- Function to cleanup old audit entries (respecting retention)
CREATE OR REPLACE FUNCTION cleanup_audit_entries(
p_retention_days INTEGER DEFAULT 365,
p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
deleted_count INTEGER;
BEGIN
-- PostgreSQL DELETE has no LIMIT clause; select a batch of ctids first
WITH candidates AS (
SELECT ctid
FROM audit_entries
WHERE occurred_at < NOW() - (p_retention_days || ' days')::INTERVAL
LIMIT p_batch_limit
),
deleted AS (
DELETE FROM audit_entries
WHERE ctid IN (SELECT ctid FROM candidates)
RETURNING 1
)
SELECT COUNT(*) INTO deleted_count FROM deleted;
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
-- Function to cleanup old ledger entries (respecting retention)
CREATE OR REPLACE FUNCTION cleanup_ledger_entries(
p_retention_days INTEGER DEFAULT 2555, -- ~7 years for compliance
p_batch_limit INTEGER DEFAULT 10000
) RETURNS INTEGER AS $$
DECLARE
deleted_count INTEGER;
BEGIN
-- PostgreSQL DELETE has no LIMIT clause; select a batch of ctids first
WITH candidates AS (
SELECT ctid
FROM run_ledger_entries
WHERE ledger_created_at < NOW() - (p_retention_days || ' days')::INTERVAL
LIMIT p_batch_limit
),
deleted AS (
DELETE FROM run_ledger_entries
WHERE ctid IN (SELECT ctid FROM candidates)
RETURNING 1
)
SELECT COUNT(*) INTO deleted_count FROM deleted;
RETURN deleted_count;
END;
$$ LANGUAGE plpgsql;
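-- Illustrative retention sweep: call repeatedly until a batch returns 0 so that
-- each transaction stays short. Arguments mirror the defaults declared above.
--   SELECT cleanup_audit_entries(365, 10000);
--   SELECT cleanup_ledger_entries(2555, 10000);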
-- Comments
COMMENT ON TABLE audit_entries IS 'Immutable audit log with hash chain for tamper evidence';
COMMENT ON TABLE run_ledger_entries IS 'Immutable run execution ledger with provenance tracking';
COMMENT ON TABLE ledger_exports IS 'Ledger export operations tracking';
COMMENT ON TABLE signed_manifests IS 'Signed provenance manifests for artifacts and exports';
COMMENT ON TABLE audit_sequences IS 'Sequence tracking for audit entry chain integrity';
COMMENT ON TABLE ledger_sequences IS 'Sequence tracking for ledger entry chain integrity';
COMMENT ON FUNCTION verify_audit_chain IS 'Verifies the hash chain integrity of audit entries';
COMMENT ON FUNCTION verify_ledger_chain IS 'Verifies the hash chain integrity of ledger entries';

View File

@@ -0,0 +1,321 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Tests.AuditLedger;
/// <summary>
/// Tests for AuditEntry domain model.
/// </summary>
public sealed class AuditEntryTests
{
[Fact]
public void Create_WithValidParameters_SetsAllProperties()
{
// Arrange
var tenantId = "test-tenant";
var resourceId = Guid.NewGuid();
// Act
var entry = AuditEntry.Create(
tenantId: tenantId,
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: resourceId,
actorId: "user@example.com",
actorType: ActorType.User,
description: "Job created",
oldState: null,
newState: """{"status":"pending"}""",
actorIp: "192.168.1.1",
userAgent: "TestClient/1.0",
httpMethod: "POST",
requestPath: "/api/v1/jobs",
correlationId: "corr-123",
previousEntryHash: null,
sequenceNumber: 1,
metadata: """{"extra":"data"}""");
// Assert
Assert.NotEqual(Guid.Empty, entry.EntryId);
Assert.Equal(tenantId, entry.TenantId);
Assert.Equal(AuditEventType.JobCreated, entry.EventType);
Assert.Equal("job", entry.ResourceType);
Assert.Equal(resourceId, entry.ResourceId);
Assert.Equal("user@example.com", entry.ActorId);
Assert.Equal(ActorType.User, entry.ActorType);
Assert.Equal("192.168.1.1", entry.ActorIp);
Assert.Equal("TestClient/1.0", entry.UserAgent);
Assert.Equal("POST", entry.HttpMethod);
Assert.Equal("/api/v1/jobs", entry.RequestPath);
Assert.Null(entry.OldState);
Assert.Equal("""{"status":"pending"}""", entry.NewState);
Assert.Equal("Job created", entry.Description);
Assert.Equal("corr-123", entry.CorrelationId);
Assert.Null(entry.PreviousEntryHash);
Assert.NotEmpty(entry.ContentHash);
Assert.Equal(1, entry.SequenceNumber);
Assert.Equal("""{"extra":"data"}""", entry.Metadata);
Assert.True(entry.OccurredAt > DateTimeOffset.MinValue);
}
[Fact]
public void Create_GeneratesValidContentHash()
{
// Arrange & Act
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.RunCreated,
resourceType: "run",
resourceId: Guid.NewGuid(),
actorId: "system",
actorType: ActorType.System,
description: "Run created",
sequenceNumber: 1);
// Assert
Assert.NotEmpty(entry.ContentHash);
Assert.Equal(64, entry.ContentHash.Length); // SHA-256 produces 64 hex chars
Assert.True(entry.ContentHash.All(c => char.IsAsciiHexDigit(c)));
}
[Fact]
public void VerifyIntegrity_WithValidEntry_ReturnsTrue()
{
// Arrange
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.SourceCreated,
resourceType: "source",
resourceId: Guid.NewGuid(),
actorId: "admin",
actorType: ActorType.User,
description: "Source created",
sequenceNumber: 5);
// Act
var isValid = entry.VerifyIntegrity();
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyIntegrity_WithTamperedEntry_ReturnsFalse()
{
// Arrange
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.QuotaCreated,
resourceType: "quota",
resourceId: Guid.NewGuid(),
actorId: "admin",
actorType: ActorType.User,
description: "Original description",
sequenceNumber: 1);
// Tamper with the entry by changing description but keeping original hash
var tamperedEntry = entry with { Description = "Tampered description" };
// Act
var isValid = tamperedEntry.VerifyIntegrity();
// Assert
Assert.False(isValid);
}
[Fact]
public void VerifyChainLink_WithNullPrevious_AndFirstEntry_ReturnsTrue()
{
// Arrange
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobScheduled,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "scheduler",
actorType: ActorType.System,
description: "Job scheduled",
previousEntryHash: null,
sequenceNumber: 1);
// Act
var isValid = entry.VerifyChainLink(null);
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyChainLink_WithValidPreviousEntry_ReturnsTrue()
{
// Arrange
var first = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "user",
actorType: ActorType.User,
description: "First entry",
previousEntryHash: null,
sequenceNumber: 1);
var second = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobLeased,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "worker",
actorType: ActorType.Worker,
description: "Second entry",
previousEntryHash: first.ContentHash,
sequenceNumber: 2);
// Act
var isValid = second.VerifyChainLink(first);
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyChainLink_WithInvalidPreviousHash_ReturnsFalse()
{
// Arrange
var first = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "user",
actorType: ActorType.User,
description: "First entry",
previousEntryHash: null,
sequenceNumber: 1);
var second = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCompleted,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "worker",
actorType: ActorType.Worker,
description: "Second entry with wrong hash",
previousEntryHash: "wrong_hash_value",
sequenceNumber: 2);
// Act
var isValid = second.VerifyChainLink(first);
// Assert
Assert.False(isValid);
}
[Theory]
[InlineData(AuditEventType.JobCreated, "job")]
[InlineData(AuditEventType.RunStarted, "run")]
[InlineData(AuditEventType.SourcePaused, "source")]
[InlineData(AuditEventType.QuotaUpdated, "quota")]
[InlineData(AuditEventType.SloAlertTriggered, "slo")]
[InlineData(AuditEventType.DeadLetterReplayed, "deadletter")]
[InlineData(AuditEventType.BackfillStarted, "backfill")]
[InlineData(AuditEventType.LedgerExportRequested, "export")]
[InlineData(AuditEventType.WorkerHeartbeat, "worker")]
[InlineData(AuditEventType.AuthorizationDenied, "security")]
public void Create_WithDifferentEventTypes_CreatesValidEntries(AuditEventType eventType, string resourceType)
{
// Act
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: eventType,
resourceType: resourceType,
resourceId: Guid.NewGuid(),
actorId: "test-actor",
actorType: ActorType.System,
description: $"Testing {eventType}",
sequenceNumber: 1);
// Assert
Assert.Equal(eventType, entry.EventType);
Assert.Equal(resourceType, entry.ResourceType);
Assert.True(entry.VerifyIntegrity());
}
[Theory]
[InlineData(ActorType.User)]
[InlineData(ActorType.System)]
[InlineData(ActorType.Worker)]
[InlineData(ActorType.ApiKey)]
[InlineData(ActorType.Service)]
[InlineData(ActorType.Unknown)]
public void Create_WithDifferentActorTypes_CreatesValidEntries(ActorType actorType)
{
// Act
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "test-actor",
actorType: actorType,
description: $"Testing actor type {actorType}",
sequenceNumber: 1);
// Assert
Assert.Equal(actorType, entry.ActorType);
Assert.True(entry.VerifyIntegrity());
}
[Fact]
public void Create_WithOldAndNewState_TracksChanges()
{
// Arrange
var oldState = """{"status":"pending","priority":0}""";
var newState = """{"status":"running","priority":1}""";
// Act
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobLeased,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "worker-1",
actorType: ActorType.Worker,
description: "Job leased",
oldState: oldState,
newState: newState,
sequenceNumber: 1);
// Assert
Assert.Equal(oldState, entry.OldState);
Assert.Equal(newState, entry.NewState);
}
[Fact]
public void Create_MultipleEntries_GeneratesDifferentHashes()
{
// Act
var entry1 = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "user1",
actorType: ActorType.User,
description: "First job",
sequenceNumber: 1);
var entry2 = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "user2",
actorType: ActorType.User,
description: "Second job",
sequenceNumber: 2);
// Assert
Assert.NotEqual(entry1.ContentHash, entry2.ContentHash);
Assert.NotEqual(entry1.EntryId, entry2.EntryId);
}
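// Illustrative sketch (an addition, not part of the original suite): walk a
// multi-entry chain with VerifyIntegrity + VerifyChainLink, mirroring the
// checks that the verify_audit_chain SQL function performs server-side.
[Fact]
public void ChainOfEntries_VerifiesEndToEnd()
{
AuditEntry? previous = null;
for (var seq = 1; seq <= 3; seq++)
{
var entry = AuditEntry.Create(
tenantId: "test-tenant",
eventType: AuditEventType.JobCreated,
resourceType: "job",
resourceId: Guid.NewGuid(),
actorId: "user",
actorType: ActorType.User,
description: $"Entry {seq}",
previousEntryHash: previous?.ContentHash,
sequenceNumber: seq);
// Each entry must hash cleanly and link to its predecessor
Assert.True(entry.VerifyIntegrity());
Assert.True(entry.VerifyChainLink(previous));
previous = entry;
}
}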
}

View File

@@ -0,0 +1,238 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Tests.AuditLedger;
/// <summary>
/// Tests for LedgerExport domain model.
/// </summary>
public sealed class LedgerExportTests
{
[Fact]
public void CreateRequest_WithValidParameters_CreatesExport()
{
// Act
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user@example.com",
startTime: DateTimeOffset.UtcNow.AddDays(-7),
endTime: DateTimeOffset.UtcNow,
runTypeFilter: "scan",
sourceIdFilter: Guid.NewGuid());
// Assert
Assert.NotEqual(Guid.Empty, export.ExportId);
Assert.Equal("test-tenant", export.TenantId);
Assert.Equal(LedgerExportStatus.Pending, export.Status);
Assert.Equal("json", export.Format);
Assert.NotNull(export.StartTime);
Assert.NotNull(export.EndTime);
Assert.Equal("scan", export.RunTypeFilter);
Assert.NotNull(export.SourceIdFilter);
Assert.Equal("user@example.com", export.RequestedBy);
Assert.True(export.RequestedAt > DateTimeOffset.MinValue);
Assert.Null(export.StartedAt);
Assert.Null(export.CompletedAt);
Assert.Equal(0, export.EntryCount);
}
[Theory]
[InlineData("json")]
[InlineData("ndjson")]
[InlineData("csv")]
[InlineData("JSON")]
[InlineData("NDJSON")]
[InlineData("CSV")]
public void CreateRequest_WithValidFormats_NormalizesToLowerCase(string format)
{
// Act
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: format,
requestedBy: "user");
// Assert
Assert.Equal(format.ToLowerInvariant(), export.Format);
}
[Theory]
[InlineData("xml")]
[InlineData("yaml")]
[InlineData("parquet")]
[InlineData("invalid")]
public void CreateRequest_WithInvalidFormat_ThrowsException(string format)
{
// Act & Assert
Assert.Throws<ArgumentException>(() =>
LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: format,
requestedBy: "user"));
}
[Fact]
public void CreateRequest_WithNullFormat_ThrowsException()
{
// Act & Assert
Assert.Throws<ArgumentException>(() =>
LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: null!,
requestedBy: "user"));
}
[Fact]
public void CreateRequest_WithEmptyFormat_ThrowsException()
{
// Act & Assert
Assert.Throws<ArgumentException>(() =>
LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "",
requestedBy: "user"));
}
[Fact]
public void Start_SetsStatusAndStartedAt()
{
// Arrange
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user");
// Act
var started = export.Start();
// Assert
Assert.Equal(LedgerExportStatus.Processing, started.Status);
Assert.NotNull(started.StartedAt);
Assert.True(started.StartedAt >= export.RequestedAt);
}
[Fact]
public void Complete_SetsAllProperties()
{
// Arrange
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user").Start();
// Act
var completed = export.Complete(
outputUri: "file:///exports/test.json",
outputDigest: "sha256:abc123",
outputSizeBytes: 1024,
entryCount: 100);
// Assert
Assert.Equal(LedgerExportStatus.Completed, completed.Status);
Assert.Equal("file:///exports/test.json", completed.OutputUri);
Assert.Equal("sha256:abc123", completed.OutputDigest);
Assert.Equal(1024, completed.OutputSizeBytes);
Assert.Equal(100, completed.EntryCount);
Assert.NotNull(completed.CompletedAt);
Assert.Null(completed.ErrorMessage);
}
[Fact]
public void Fail_SetsStatusAndErrorMessage()
{
// Arrange
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user").Start();
// Act
var failed = export.Fail("Database connection failed");
// Assert
Assert.Equal(LedgerExportStatus.Failed, failed.Status);
Assert.Equal("Database connection failed", failed.ErrorMessage);
Assert.NotNull(failed.CompletedAt);
Assert.Null(failed.OutputUri);
}
[Fact]
public void CreateRequest_WithMinimalParameters_CreatesExport()
{
// Act
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "ndjson",
requestedBy: "system");
// Assert
Assert.NotEqual(Guid.Empty, export.ExportId);
Assert.Null(export.StartTime);
Assert.Null(export.EndTime);
Assert.Null(export.RunTypeFilter);
Assert.Null(export.SourceIdFilter);
}
[Fact]
public void ExportLifecycle_FullFlow_TracksAllStates()
{
// Create
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "csv",
requestedBy: "user");
Assert.Equal(LedgerExportStatus.Pending, export.Status);
// Start
export = export.Start();
Assert.Equal(LedgerExportStatus.Processing, export.Status);
Assert.NotNull(export.StartedAt);
// Complete
export = export.Complete("file:///out.csv", "sha256:xyz", 2048, 50);
Assert.Equal(LedgerExportStatus.Completed, export.Status);
Assert.NotNull(export.CompletedAt);
}
[Fact]
public void ExportLifecycle_FailedFlow_TracksStates()
{
// Create
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user");
// Start
export = export.Start();
// Fail
export = export.Fail("Out of disk space");
Assert.Equal(LedgerExportStatus.Failed, export.Status);
Assert.Equal("Out of disk space", export.ErrorMessage);
}
[Fact]
public void Complete_PreservesOriginalProperties()
{
// Arrange
var sourceId = Guid.NewGuid();
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user",
startTime: DateTimeOffset.UtcNow.AddDays(-1),
endTime: DateTimeOffset.UtcNow,
runTypeFilter: "scan",
sourceIdFilter: sourceId).Start();
// Act
var completed = export.Complete("uri", "digest", 100, 10);
// Assert
Assert.Equal("test-tenant", completed.TenantId);
Assert.Equal("json", completed.Format);
Assert.Equal("scan", completed.RunTypeFilter);
Assert.Equal(sourceId, completed.SourceIdFilter);
Assert.Equal("user", completed.RequestedBy);
}
}

View File

@@ -0,0 +1,318 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Tests.AuditLedger;
/// <summary>
/// Tests for RunLedgerEntry domain model.
/// </summary>
public sealed class RunLedgerTests
{
[Fact]
public void FromCompletedRun_WithValidRun_CreatesLedgerEntry()
{
// Arrange
var run = CreateCompletedRun();
var artifacts = CreateArtifacts(run.RunId, 2);
// Act
var entry = RunLedgerEntry.FromCompletedRun(
run: run,
artifacts: artifacts,
inputDigest: "abc123",
sequenceNumber: 1,
previousEntryHash: null);
// Assert
Assert.NotEqual(Guid.Empty, entry.LedgerId);
Assert.Equal(run.TenantId, entry.TenantId);
Assert.Equal(run.RunId, entry.RunId);
Assert.Equal(run.SourceId, entry.SourceId);
Assert.Equal(run.RunType, entry.RunType);
Assert.Equal(run.Status, entry.FinalStatus);
Assert.Equal(run.TotalJobs, entry.TotalJobs);
Assert.Equal(run.SucceededJobs, entry.SucceededJobs);
Assert.Equal(run.FailedJobs, entry.FailedJobs);
Assert.Equal(run.CreatedAt, entry.RunCreatedAt);
Assert.Equal(run.CompletedAt, entry.RunCompletedAt);
Assert.Equal("abc123", entry.InputDigest);
Assert.NotEmpty(entry.OutputDigest);
Assert.NotEmpty(entry.ArtifactManifest);
Assert.Equal(1, entry.SequenceNumber);
Assert.Null(entry.PreviousEntryHash);
Assert.NotEmpty(entry.ContentHash);
}
[Fact]
public void FromCompletedRun_WithIncompleteRun_ThrowsException()
{
// Arrange
var run = new Run(
RunId: Guid.NewGuid(),
TenantId: "test-tenant",
ProjectId: null,
SourceId: Guid.NewGuid(),
RunType: "scan",
Status: RunStatus.Running,
CorrelationId: null,
TotalJobs: 5,
CompletedJobs: 2,
SucceededJobs: 2,
FailedJobs: 0,
CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
CompletedAt: null, // Not completed
CreatedBy: "user",
Metadata: null);
// Act & Assert
Assert.Throws<InvalidOperationException>(() =>
RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null));
}
[Fact]
public void VerifyIntegrity_WithValidEntry_ReturnsTrue()
{
// Arrange
var run = CreateCompletedRun();
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Act
var isValid = entry.VerifyIntegrity();
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyIntegrity_WithTamperedEntry_ReturnsFalse()
{
// Arrange
var run = CreateCompletedRun();
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Tamper with the entry
var tamperedEntry = entry with { TotalJobs = 999 };
// Act
var isValid = tamperedEntry.VerifyIntegrity();
// Assert
Assert.False(isValid);
}
[Fact]
public void VerifyChainLink_WithNullPrevious_AndFirstEntry_ReturnsTrue()
{
// Arrange
var run = CreateCompletedRun();
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Act
var isValid = entry.VerifyChainLink(null);
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyChainLink_WithValidPreviousEntry_ReturnsTrue()
{
// Arrange
var run1 = CreateCompletedRun();
var first = RunLedgerEntry.FromCompletedRun(run1, [], "input1", 1, null);
var run2 = CreateCompletedRun();
var second = RunLedgerEntry.FromCompletedRun(run2, [], "input2", 2, first.ContentHash);
// Act
var isValid = second.VerifyChainLink(first);
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyChainLink_WithInvalidPreviousHash_ReturnsFalse()
{
// Arrange
var run1 = CreateCompletedRun();
var first = RunLedgerEntry.FromCompletedRun(run1, [], "input1", 1, null);
var run2 = CreateCompletedRun();
var second = RunLedgerEntry.FromCompletedRun(run2, [], "input2", 2, "invalid_hash");
// Act
var isValid = second.VerifyChainLink(first);
// Assert
Assert.False(isValid);
}
[Fact]
public void FromCompletedRun_CalculatesExecutionDuration()
{
// Arrange
var startedAt = DateTimeOffset.UtcNow.AddMinutes(-5);
var completedAt = DateTimeOffset.UtcNow;
var run = new Run(
RunId: Guid.NewGuid(),
TenantId: "test-tenant",
ProjectId: null,
SourceId: Guid.NewGuid(),
RunType: "scan",
Status: RunStatus.Succeeded,
CorrelationId: null,
TotalJobs: 10,
CompletedJobs: 10,
SucceededJobs: 10,
FailedJobs: 0,
CreatedAt: startedAt.AddMinutes(-1),
StartedAt: startedAt,
CompletedAt: completedAt,
CreatedBy: "user",
Metadata: null);
// Act
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Assert
Assert.Equal(completedAt - startedAt, entry.ExecutionDuration);
Assert.True(entry.ExecutionDuration.TotalMinutes >= 4.9);
Assert.True(entry.ExecutionDuration.TotalMinutes <= 5.1);
}
[Fact]
public void FromCompletedRun_WithArtifacts_GeneratesManifestAndDigest()
{
// Arrange
var run = CreateCompletedRun();
var artifacts = CreateArtifacts(run.RunId, 3);
// Act
var entry = RunLedgerEntry.FromCompletedRun(run, artifacts, "input", 1, null);
// Assert
Assert.NotEmpty(entry.ArtifactManifest);
Assert.Contains("ArtifactId", entry.ArtifactManifest);
Assert.NotEmpty(entry.OutputDigest);
}
[Fact]
public void FromCompletedRun_WithNoArtifacts_GeneratesEmptyManifest()
{
// Arrange
var run = CreateCompletedRun();
// Act
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Assert
Assert.Equal("[]", entry.ArtifactManifest);
Assert.NotEmpty(entry.OutputDigest);
}
[Theory]
[InlineData(RunStatus.Succeeded)]
[InlineData(RunStatus.PartiallySucceeded)]
[InlineData(RunStatus.Failed)]
[InlineData(RunStatus.Canceled)]
public void FromCompletedRun_WithDifferentStatuses_CreatesValidEntries(RunStatus status)
{
// Arrange
var run = new Run(
RunId: Guid.NewGuid(),
TenantId: "test-tenant",
ProjectId: null,
SourceId: Guid.NewGuid(),
RunType: "scan",
Status: status,
CorrelationId: null,
TotalJobs: 10,
CompletedJobs: 10,
SucceededJobs: status == RunStatus.Succeeded ? 10 : 5,
FailedJobs: status == RunStatus.Failed ? 10 : (status == RunStatus.PartiallySucceeded ? 5 : 0),
CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
CompletedAt: DateTimeOffset.UtcNow,
CreatedBy: "user",
Metadata: null);
// Act
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Assert
Assert.Equal(status, entry.FinalStatus);
Assert.True(entry.VerifyIntegrity());
}
[Fact]
public void FromCompletedRun_WithMetadata_IncludesMetadata()
{
// Arrange
var run = CreateCompletedRun();
var metadata = """{"custom":"metadata","count":42}""";
// Act
var entry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null, metadata);
// Assert
Assert.Equal(metadata, entry.Metadata);
}
[Fact]
public void ContentHash_HasExpectedFormat()
{
// Arrange - exact determinism cannot be asserted here because LedgerId and
// LedgerCreatedAt differ per entry, so verify the SHA-256 hex format instead
var run = CreateCompletedRun();
var entry = RunLedgerEntry.FromCompletedRun(run, [], "same-input", 1, null);
// Assert
Assert.Equal(64, entry.ContentHash.Length);
Assert.True(entry.ContentHash.All(char.IsAsciiHexDigit));
}
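// Illustrative sketch (an addition, not part of the original suite): chain
// several ledger entries by feeding each ContentHash forward as the next
// entry's previousEntryHash, then verify the whole chain end to end.
[Fact]
public void ChainOfLedgerEntries_VerifiesEndToEnd()
{
RunLedgerEntry? previous = null;
for (var seq = 1; seq <= 3; seq++)
{
var run = CreateCompletedRun();
var entry = RunLedgerEntry.FromCompletedRun(run, [], $"input-{seq}", seq, previous?.ContentHash);
Assert.True(entry.VerifyIntegrity());
Assert.True(entry.VerifyChainLink(previous));
previous = entry;
}
}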
private static Run CreateCompletedRun(string runType = "scan") => new(
RunId: Guid.NewGuid(),
TenantId: "test-tenant",
ProjectId: null,
SourceId: Guid.NewGuid(),
RunType: runType,
Status: RunStatus.Succeeded,
CorrelationId: "corr-123",
TotalJobs: 10,
CompletedJobs: 10,
SucceededJobs: 8,
FailedJobs: 2,
CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
CompletedAt: DateTimeOffset.UtcNow,
CreatedBy: "test-user",
Metadata: null);
private static List<Artifact> CreateArtifacts(Guid runId, int count)
{
var artifacts = new List<Artifact>();
for (var i = 0; i < count; i++)
{
artifacts.Add(new Artifact(
ArtifactId: Guid.NewGuid(),
TenantId: "test-tenant",
JobId: Guid.NewGuid(),
RunId: runId,
ArtifactType: "sbom",
Uri: $"file:///artifacts/{Guid.NewGuid()}.json",
Digest: $"sha256:{Guid.NewGuid():N}",
MimeType: "application/json",
SizeBytes: 1024 * (i + 1),
CreatedAt: DateTimeOffset.UtcNow,
Metadata: null));
}
return artifacts;
}
}

View File

@@ -0,0 +1,398 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Tests.AuditLedger;
/// <summary>
/// Tests for SignedManifest domain model.
/// </summary>
public sealed class SignedManifestTests
{
[Fact]
public void CreateFromLedgerEntry_WithValidEntry_CreatesManifest()
{
// Arrange
var run = CreateCompletedRun();
var artifacts = CreateArtifacts(run.RunId, 2);
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, artifacts, "input-digest", 1, null);
// Act
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Assert
Assert.NotEqual(Guid.Empty, manifest.ManifestId);
Assert.Equal(SignedManifest.CurrentSchemaVersion, manifest.SchemaVersion);
Assert.Equal(ledgerEntry.TenantId, manifest.TenantId);
Assert.Equal(ProvenanceType.Run, manifest.ProvenanceType);
Assert.Equal(ledgerEntry.RunId, manifest.SubjectId);
Assert.NotEmpty(manifest.Statements);
Assert.NotEmpty(manifest.Artifacts);
Assert.NotEmpty(manifest.Materials);
Assert.NotEmpty(manifest.PayloadDigest);
Assert.Equal("none", manifest.SignatureAlgorithm);
Assert.Empty(manifest.Signature);
Assert.Empty(manifest.KeyId);
Assert.False(manifest.IsSigned);
Assert.False(manifest.IsExpired);
}
[Fact]
public void CreateFromExport_WithValidExport_CreatesManifest()
{
// Arrange
var export = CreateCompletedExport();
var entries = CreateLedgerEntries(3);
// Act
var manifest = SignedManifest.CreateFromExport(export, entries);
// Assert
Assert.NotEqual(Guid.Empty, manifest.ManifestId);
Assert.Equal(ProvenanceType.Export, manifest.ProvenanceType);
Assert.Equal(export.ExportId, manifest.SubjectId);
Assert.NotEmpty(manifest.Statements);
Assert.NotEmpty(manifest.Materials);
}
[Fact]
public void CreateFromExport_WithIncompleteExport_ThrowsException()
{
// Arrange
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user");
// Act & Assert
Assert.Throws<InvalidOperationException>(() =>
SignedManifest.CreateFromExport(export, []));
}
[Fact]
public void Sign_WithValidSignature_SetsSignatureProperties()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act
var signed = manifest.Sign(
signatureAlgorithm: "ES256",
signature: "base64-encoded-signature",
keyId: "key-001",
expiresAt: DateTimeOffset.UtcNow.AddDays(30));
// Assert
Assert.Equal("ES256", signed.SignatureAlgorithm);
Assert.Equal("base64-encoded-signature", signed.Signature);
Assert.Equal("key-001", signed.KeyId);
Assert.True(signed.IsSigned);
Assert.False(signed.IsExpired);
Assert.NotNull(signed.ExpiresAt);
}
[Fact]
public void Sign_WithEmptyAlgorithm_ThrowsException()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act & Assert
Assert.Throws<ArgumentException>(() =>
manifest.Sign("", "signature", "key-001"));
}
[Fact]
public void Sign_WithEmptySignature_ThrowsException()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act & Assert
Assert.Throws<ArgumentException>(() =>
manifest.Sign("ES256", "", "key-001"));
}
[Fact]
public void Sign_WithEmptyKeyId_ThrowsException()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act & Assert
Assert.Throws<ArgumentException>(() =>
manifest.Sign("ES256", "signature", ""));
}
[Fact]
public void IsSigned_WithUnsignedManifest_ReturnsFalse()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Assert
Assert.False(manifest.IsSigned);
}
[Fact]
public void IsExpired_WithNoExpiration_ReturnsFalse()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Assert
Assert.False(manifest.IsExpired);
}
[Fact]
public void IsExpired_WithFutureExpiration_ReturnsFalse()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry)
.Sign("ES256", "sig", "key", DateTimeOffset.UtcNow.AddDays(30));
// Assert
Assert.False(manifest.IsExpired);
}
[Fact]
public void IsExpired_WithPastExpiration_ReturnsTrue()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry)
.Sign("ES256", "sig", "key", DateTimeOffset.UtcNow.AddDays(-1));
// Assert
Assert.True(manifest.IsExpired);
}
[Fact]
public void VerifyPayloadIntegrity_WithValidManifest_ReturnsTrue()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act
var isValid = manifest.VerifyPayloadIntegrity();
// Assert
Assert.True(isValid);
}
[Fact]
public void VerifyPayloadIntegrity_WithTamperedManifest_ReturnsFalse()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Tamper with the manifest
var tampered = manifest with { Statements = "[]" };
// Act
var isValid = tampered.VerifyPayloadIntegrity();
// Assert
Assert.False(isValid);
}
[Fact]
public void GetArtifactReferences_ReturnsTypedObjects()
{
// Arrange
var run = CreateCompletedRun();
var artifacts = CreateArtifacts(run.RunId, 2);
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, artifacts, "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act
var references = manifest.GetArtifactReferences();
// Assert
Assert.Equal(2, references.Count);
Assert.All(references, r =>
{
Assert.NotEqual(Guid.Empty, r.ArtifactId);
Assert.NotEmpty(r.ArtifactType);
Assert.NotEmpty(r.Uri);
Assert.NotEmpty(r.Digest);
});
}
[Fact]
public void GetMaterialReferences_ReturnsTypedObjects()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input-digest", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act
var materials = manifest.GetMaterialReferences();
// Assert
Assert.Single(materials);
Assert.Contains("input:", materials[0].Uri);
Assert.Equal("input-digest", materials[0].Digest);
}
[Fact]
public void GetStatements_ReturnsTypedObjects()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Act
var statements = manifest.GetStatements();
// Assert
Assert.Equal(2, statements.Count);
Assert.Contains(statements, s => s.StatementType == "run_completed");
Assert.Contains(statements, s => s.StatementType == "chain_link");
}
[Theory]
[InlineData(ProvenanceType.Run)]
[InlineData(ProvenanceType.Export)]
public void CreateManifest_WithDifferentProvenanceTypes_CreatesValidManifests(ProvenanceType expectedType)
{
// Arrange & Act
SignedManifest manifest;
if (expectedType == ProvenanceType.Run)
{
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
}
else
{
var export = CreateCompletedExport();
manifest = SignedManifest.CreateFromExport(export, []);
}
// Assert
Assert.Equal(expectedType, manifest.ProvenanceType);
Assert.True(manifest.VerifyPayloadIntegrity());
}
[Fact]
public void CreateFromLedgerEntry_WithBuildInfo_IncludesBuildInfo()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
var buildInfo = """{"version":"1.0.0","builder":"test"}""";
// Act
var manifest = SignedManifest.CreateFromLedgerEntry(ledgerEntry, buildInfo);
// Assert
Assert.Equal(buildInfo, manifest.BuildInfo);
}
[Fact]
public void PayloadDigest_HasExpectedFormat()
{
// Arrange
var run = CreateCompletedRun();
var ledgerEntry = RunLedgerEntry.FromCompletedRun(run, [], "input", 1, null);
// Act
var manifest1 = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
var manifest2 = SignedManifest.CreateFromLedgerEntry(ledgerEntry);
// Assert - ManifestId and statement timestamps differ per manifest, so the
// digests cannot be compared for equality; verify the SHA-256 hex format instead
Assert.NotEmpty(manifest1.PayloadDigest);
Assert.NotEmpty(manifest2.PayloadDigest);
Assert.Equal(64, manifest1.PayloadDigest.Length);
}
private static Run CreateCompletedRun(string runType = "scan") => new(
RunId: Guid.NewGuid(),
TenantId: "test-tenant",
ProjectId: null,
SourceId: Guid.NewGuid(),
RunType: runType,
Status: RunStatus.Succeeded,
CorrelationId: "corr-123",
TotalJobs: 10,
CompletedJobs: 10,
SucceededJobs: 8,
FailedJobs: 2,
CreatedAt: DateTimeOffset.UtcNow.AddMinutes(-10),
StartedAt: DateTimeOffset.UtcNow.AddMinutes(-9),
CompletedAt: DateTimeOffset.UtcNow,
CreatedBy: "test-user",
Metadata: null);
private static LedgerExport CreateCompletedExport()
{
var export = LedgerExport.CreateRequest(
tenantId: "test-tenant",
format: "json",
requestedBy: "user");
return export
.Start()
.Complete("file:///exports/test.json", "sha256:abc123", 1024, 10);
}
private static List<Artifact> CreateArtifacts(Guid runId, int count)
{
var artifacts = new List<Artifact>();
for (var i = 0; i < count; i++)
{
artifacts.Add(new Artifact(
ArtifactId: Guid.NewGuid(),
TenantId: "test-tenant",
JobId: Guid.NewGuid(),
RunId: runId,
ArtifactType: "sbom",
Uri: $"file:///artifacts/{Guid.NewGuid()}.json",
Digest: $"sha256:{Guid.NewGuid():N}",
MimeType: "application/json",
SizeBytes: 1024 * (i + 1),
CreatedAt: DateTimeOffset.UtcNow,
Metadata: null));
}
return artifacts;
}
private static List<RunLedgerEntry> CreateLedgerEntries(int count)
{
var entries = new List<RunLedgerEntry>();
string? previousHash = null;
for (var i = 0; i < count; i++)
{
var run = CreateCompletedRun();
var entry = RunLedgerEntry.FromCompletedRun(run, [], $"input-{i}", i + 1, previousHash);
entries.Add(entry);
previousHash = entry.ContentHash;
}
return entries;
}
}

View File

@@ -0,0 +1,407 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Tests.Backfill;
public class BackfillRequestTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
private static readonly Guid SourceId = Guid.NewGuid();
private const string JobType = "scan";
[Fact]
public void Create_WithValidParameters_CreatesRequest()
{
var windowStart = BaseTime;
var windowEnd = BaseTime.AddDays(7);
var request = BackfillRequest.Create(
tenantId: TenantId,
sourceId: SourceId,
jobType: null,
windowStart: windowStart,
windowEnd: windowEnd,
reason: "Reprocess after bug fix",
createdBy: "admin");
Assert.NotEqual(Guid.Empty, request.BackfillId);
Assert.Equal(TenantId, request.TenantId);
Assert.Equal(SourceId, request.SourceId);
Assert.Null(request.JobType);
Assert.Equal(BackfillStatus.Pending, request.Status);
Assert.Equal(windowStart, request.WindowStart);
Assert.Equal(windowEnd, request.WindowEnd);
Assert.Null(request.CurrentPosition);
Assert.Null(request.TotalEvents);
Assert.Equal(0, request.ProcessedEvents);
Assert.Equal(0, request.SkippedEvents);
Assert.Equal(0, request.FailedEvents);
Assert.Equal(100, request.BatchSize);
Assert.False(request.DryRun);
Assert.False(request.ForceReprocess);
Assert.Equal("admin", request.CreatedBy);
Assert.Equal("admin", request.UpdatedBy);
}
[Fact]
public void Create_WithDryRunAndForceReprocess_SetsFlags()
{
var request = BackfillRequest.Create(
TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
"Test", "admin", dryRun: true, forceReprocess: true);
Assert.True(request.DryRun);
Assert.True(request.ForceReprocess);
}
[Fact]
public void Create_WithCustomBatchSize_SetsBatchSize()
{
var request = BackfillRequest.Create(
TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
"Test", "admin", batchSize: 500);
Assert.Equal(500, request.BatchSize);
}
[Fact]
public void Create_WithInvalidBatchSize_Throws()
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
BackfillRequest.Create(TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
"Test", "admin", batchSize: 0));
Assert.Throws<ArgumentOutOfRangeException>(() =>
BackfillRequest.Create(TenantId, SourceId, null, BaseTime, BaseTime.AddDays(1),
"Test", "admin", batchSize: 10001));
}
[Fact]
public void Create_WithInvalidWindow_Throws()
{
Assert.Throws<ArgumentException>(() =>
BackfillRequest.Create(TenantId, SourceId, null,
windowStart: BaseTime.AddDays(1),
windowEnd: BaseTime,
reason: "Test",
createdBy: "admin"));
}
[Fact]
public void Create_WithoutSourceOrJobType_Throws()
{
Assert.Throws<ArgumentException>(() =>
BackfillRequest.Create(TenantId, null, null, BaseTime, BaseTime.AddDays(1),
"Test", "admin"));
}
[Fact]
public void WindowDuration_ReturnsCorrectDuration()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(7), "Test", "admin");
Assert.Equal(TimeSpan.FromDays(7), request.WindowDuration);
}
[Fact]
public void StartValidation_TransitionsToValidating()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin");
var validating = request.StartValidation("validator");
Assert.Equal(BackfillStatus.Validating, validating.Status);
Assert.Equal("validator", validating.UpdatedBy);
}
[Fact]
public void StartValidation_FromNonPending_Throws()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin");
var validating = request.StartValidation("v");
Assert.Throws<InvalidOperationException>(() =>
validating.StartValidation("v"));
}
[Fact]
public void WithSafetyChecks_RecordsSafetyResults()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v");
var checks = BackfillSafetyChecks.AllPassed();
var result = request.WithSafetyChecks(checks, 1000, TimeSpan.FromMinutes(10), "v");
Assert.Equal(checks, result.SafetyChecks);
Assert.Equal(1000, result.TotalEvents);
Assert.Equal(TimeSpan.FromMinutes(10), result.EstimatedDuration);
}
[Fact]
public void Start_TransitionsToRunning()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v");
var running = request.Start("worker");
Assert.Equal(BackfillStatus.Running, running.Status);
Assert.NotNull(running.StartedAt);
Assert.Equal(request.WindowStart, running.CurrentPosition);
Assert.Equal("worker", running.UpdatedBy);
}
[Fact]
public void Start_WithBlockingIssues_Throws()
{
var checks = new BackfillSafetyChecks(
SourceExists: false,
HasOverlappingBackfill: false,
WithinRetention: true,
WithinEventLimit: true,
WithinDurationLimit: true,
QuotaAvailable: true,
Warnings: [],
Errors: ["Source not found"]);
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(checks, 1000, TimeSpan.FromMinutes(10), "v");
Assert.Throws<InvalidOperationException>(() => request.Start("worker"));
}
[Fact]
public void UpdateProgress_UpdatesCounters()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker");
var newPosition = BaseTime.AddHours(6);
var updated = request.UpdateProgress(newPosition, processed: 500, skipped: 50, failed: 5, "worker");
Assert.Equal(newPosition, updated.CurrentPosition);
Assert.Equal(500, updated.ProcessedEvents);
Assert.Equal(50, updated.SkippedEvents);
Assert.Equal(5, updated.FailedEvents);
}
[Fact]
public void UpdateProgress_AccumulatesCounts()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker");
var after1 = request.UpdateProgress(BaseTime.AddHours(1), 100, 10, 1, "w");
var after2 = after1.UpdateProgress(BaseTime.AddHours(2), 200, 20, 2, "w");
Assert.Equal(300, after2.ProcessedEvents);
Assert.Equal(30, after2.SkippedEvents);
Assert.Equal(3, after2.FailedEvents);
}
[Fact]
public void ProgressPercent_CalculatesCorrectly()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker")
.UpdateProgress(BaseTime.AddHours(12), 400, 50, 50, "w");
Assert.Equal(50.0, request.ProgressPercent);
}
[Fact]
public void Pause_TransitionsToPaused()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker");
var paused = request.Pause("admin");
Assert.Equal(BackfillStatus.Paused, paused.Status);
}
[Fact]
public void Resume_TransitionsToRunning()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker")
.Pause("admin");
var resumed = request.Resume("admin");
Assert.Equal(BackfillStatus.Running, resumed.Status);
}
[Fact]
public void Complete_TransitionsToCompleted()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker");
var completed = request.Complete("worker");
Assert.Equal(BackfillStatus.Completed, completed.Status);
Assert.NotNull(completed.CompletedAt);
Assert.Equal(request.WindowEnd, completed.CurrentPosition);
Assert.True(completed.IsTerminal);
}
[Fact]
public void Fail_TransitionsToFailed()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker");
var failed = request.Fail("Connection timeout", "worker");
Assert.Equal(BackfillStatus.Failed, failed.Status);
Assert.Equal("Connection timeout", failed.ErrorMessage);
Assert.NotNull(failed.CompletedAt);
Assert.True(failed.IsTerminal);
}
[Fact]
public void Cancel_TransitionsToCanceled()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker");
var canceled = request.Cancel("admin");
Assert.Equal(BackfillStatus.Canceled, canceled.Status);
Assert.NotNull(canceled.CompletedAt);
Assert.True(canceled.IsTerminal);
}
[Fact]
public void Cancel_FromTerminalState_Throws()
{
var request = BackfillRequest.Create(TenantId, SourceId, null,
BaseTime, BaseTime.AddDays(1), "Test", "admin")
.StartValidation("v")
.WithSafetyChecks(BackfillSafetyChecks.AllPassed(), 1000, TimeSpan.FromMinutes(10), "v")
.Start("worker")
.Complete("worker");
Assert.Throws<InvalidOperationException>(() => request.Cancel("admin"));
}
}
public class BackfillSafetyChecksTests
{
[Fact]
public void AllPassed_ReturnsValidChecks()
{
var checks = BackfillSafetyChecks.AllPassed();
Assert.True(checks.SourceExists);
Assert.False(checks.HasOverlappingBackfill);
Assert.True(checks.WithinRetention);
Assert.True(checks.WithinEventLimit);
Assert.True(checks.WithinDurationLimit);
Assert.True(checks.QuotaAvailable);
Assert.Empty(checks.Warnings);
Assert.Empty(checks.Errors);
Assert.True(checks.IsSafe);
Assert.False(checks.HasBlockingIssues);
}
[Fact]
public void HasBlockingIssues_WithMissingSource_ReturnsTrue()
{
var checks = new BackfillSafetyChecks(
SourceExists: false,
HasOverlappingBackfill: false,
WithinRetention: true,
WithinEventLimit: true,
WithinDurationLimit: true,
QuotaAvailable: true,
Warnings: [],
Errors: []);
Assert.True(checks.HasBlockingIssues);
Assert.False(checks.IsSafe);
}
[Fact]
public void HasBlockingIssues_WithOverlap_ReturnsTrue()
{
var checks = new BackfillSafetyChecks(
SourceExists: true,
HasOverlappingBackfill: true,
WithinRetention: true,
WithinEventLimit: true,
WithinDurationLimit: true,
QuotaAvailable: true,
Warnings: [],
Errors: []);
Assert.True(checks.HasBlockingIssues);
}
[Fact]
public void HasBlockingIssues_WithErrors_ReturnsTrue()
{
var checks = new BackfillSafetyChecks(
SourceExists: true,
HasOverlappingBackfill: false,
WithinRetention: true,
WithinEventLimit: true,
WithinDurationLimit: true,
QuotaAvailable: true,
Warnings: [],
Errors: ["Custom error"]);
Assert.True(checks.HasBlockingIssues);
}
[Fact]
public void IsSafe_WithOnlyWarnings_ReturnsTrue()
{
var checks = new BackfillSafetyChecks(
SourceExists: true,
HasOverlappingBackfill: false,
WithinRetention: true,
WithinEventLimit: true,
WithinDurationLimit: true,
QuotaAvailable: true,
Warnings: ["Large window may take time"],
Errors: []);
Assert.True(checks.IsSafe);
Assert.False(checks.HasBlockingIssues);
}
}

View File

@@ -0,0 +1,210 @@
using StellaOps.Orchestrator.Core.Backfill;
namespace StellaOps.Orchestrator.Tests.Backfill;
public class DuplicateSuppressorTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string ScopeKey = "source:test123";
private static readonly TimeSpan DefaultTtl = TimeSpan.FromDays(30);
[Fact]
public async Task HasProcessedAsync_NewEvent_ReturnsFalse()
{
var suppressor = new InMemoryDuplicateSuppressor();
var result = await suppressor.HasProcessedAsync(ScopeKey, "event-1", CancellationToken.None);
Assert.False(result);
}
[Fact]
public async Task HasProcessedAsync_MarkedEvent_ReturnsTrue()
{
var suppressor = new InMemoryDuplicateSuppressor();
await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
var result = await suppressor.HasProcessedAsync(ScopeKey, "event-1", CancellationToken.None);
Assert.True(result);
}
[Fact]
public async Task HasProcessedAsync_DifferentScope_ReturnsFalse()
{
var suppressor = new InMemoryDuplicateSuppressor();
await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
var result = await suppressor.HasProcessedAsync("other-scope", "event-1", CancellationToken.None);
Assert.False(result);
}
[Fact]
public async Task GetProcessedAsync_ReturnsOnlyProcessedKeys()
{
var suppressor = new InMemoryDuplicateSuppressor();
await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
await suppressor.MarkProcessedAsync(ScopeKey, "event-3", BaseTime, null, DefaultTtl, CancellationToken.None);
var eventKeys = new[] { "event-1", "event-2", "event-3", "event-4" };
var result = await suppressor.GetProcessedAsync(ScopeKey, eventKeys, CancellationToken.None);
Assert.Equal(2, result.Count);
Assert.Contains("event-1", result);
Assert.Contains("event-3", result);
Assert.DoesNotContain("event-2", result);
Assert.DoesNotContain("event-4", result);
}
[Fact]
public async Task GetProcessedAsync_EmptyInput_ReturnsEmptySet()
{
var suppressor = new InMemoryDuplicateSuppressor();
var result = await suppressor.GetProcessedAsync(ScopeKey, [], CancellationToken.None);
Assert.Empty(result);
}
[Fact]
public async Task MarkProcessedBatchAsync_MarksAllEvents()
{
var suppressor = new InMemoryDuplicateSuppressor();
var events = new[]
{
new ProcessedEvent("event-1", BaseTime),
new ProcessedEvent("event-2", BaseTime.AddMinutes(1)),
new ProcessedEvent("event-3", BaseTime.AddMinutes(2))
};
await suppressor.MarkProcessedBatchAsync(ScopeKey, events, Guid.NewGuid(), DefaultTtl, CancellationToken.None);
Assert.True(await suppressor.HasProcessedAsync(ScopeKey, "event-1", CancellationToken.None));
Assert.True(await suppressor.HasProcessedAsync(ScopeKey, "event-2", CancellationToken.None));
Assert.True(await suppressor.HasProcessedAsync(ScopeKey, "event-3", CancellationToken.None));
}
[Fact]
public async Task CountProcessedAsync_ReturnsCorrectCount()
{
var suppressor = new InMemoryDuplicateSuppressor();
var events = new[]
{
new ProcessedEvent("event-1", BaseTime.AddHours(1)),
new ProcessedEvent("event-2", BaseTime.AddHours(2)),
new ProcessedEvent("event-3", BaseTime.AddHours(3)),
new ProcessedEvent("event-4", BaseTime.AddHours(5)) // Outside range
};
await suppressor.MarkProcessedBatchAsync(ScopeKey, events, null, DefaultTtl, CancellationToken.None);
var count = await suppressor.CountProcessedAsync(
ScopeKey,
BaseTime,
BaseTime.AddHours(4),
CancellationToken.None);
Assert.Equal(3, count);
}
[Fact]
public async Task CountProcessedAsync_DifferentScope_ReturnsZero()
{
var suppressor = new InMemoryDuplicateSuppressor();
await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
var count = await suppressor.CountProcessedAsync(
"other-scope",
BaseTime.AddHours(-1),
BaseTime.AddHours(1),
CancellationToken.None);
Assert.Equal(0, count);
}
[Fact]
public async Task FilterAsync_SeparatesDuplicatesFromNew()
{
var suppressor = new InMemoryDuplicateSuppressor();
await suppressor.MarkProcessedAsync(ScopeKey, "event-1", BaseTime, null, DefaultTtl, CancellationToken.None);
await suppressor.MarkProcessedAsync(ScopeKey, "event-3", BaseTime, null, DefaultTtl, CancellationToken.None);
var events = new[] { "event-1", "event-2", "event-3", "event-4" };
var result = await suppressor.FilterAsync(
ScopeKey,
events,
e => e,
CancellationToken.None);
Assert.Equal(4, result.Total);
Assert.Equal(2, result.ProcessCount);
Assert.Equal(2, result.DuplicateCount);
Assert.Contains("event-2", result.ToProcess);
Assert.Contains("event-4", result.ToProcess);
Assert.Contains("event-1", result.Duplicates);
Assert.Contains("event-3", result.Duplicates);
}
[Fact]
public async Task FilterAsync_WithEmptyList_ReturnsEmptyResult()
{
var suppressor = new InMemoryDuplicateSuppressor();
var result = await suppressor.FilterAsync<string>(
ScopeKey,
[],
e => e,
CancellationToken.None);
Assert.Equal(0, result.Total);
Assert.Empty(result.ToProcess);
Assert.Empty(result.Duplicates);
}
[Fact]
public void DuplicateFilterResult_CalculatesDuplicatePercent()
{
var result = new DuplicateFilterResult<string>(
ToProcess: ["a", "b"],
Duplicates: ["c", "d", "e"],
Total: 5);
Assert.Equal(60.0, result.DuplicatePercent);
}
[Fact]
public void DuplicateFilterResult_WithZeroTotal_ReturnsZeroPercent()
{
var result = new DuplicateFilterResult<string>(
ToProcess: [],
Duplicates: [],
Total: 0);
Assert.Equal(0.0, result.DuplicatePercent);
}
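// Illustrative end-to-end sketch (an addition, not part of the original suite):
// filter a batch, mark only the new events, and confirm a replay of the same
// batch is fully suppressed -- the idempotency loop a backfill worker runs.
[Fact]
public async Task FilterThenMark_ReplayOfSameBatchIsFullySuppressed()
{
var suppressor = new InMemoryDuplicateSuppressor();
var events = new[] { "event-1", "event-2" };
var first = await suppressor.FilterAsync(ScopeKey, events, e => e, CancellationToken.None);
Assert.Equal(2, first.ProcessCount);
foreach (var key in first.ToProcess)
{
await suppressor.MarkProcessedAsync(ScopeKey, key, BaseTime, null, DefaultTtl, CancellationToken.None);
}
var second = await suppressor.FilterAsync(ScopeKey, events, e => e, CancellationToken.None);
Assert.Equal(0, second.ProcessCount);
Assert.Equal(2, second.DuplicateCount);
}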
}
public class ProcessedEventTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
[Fact]
public void ProcessedEvent_StoresProperties()
{
var evt = new ProcessedEvent("event-123", BaseTime);
Assert.Equal("event-123", evt.EventKey);
Assert.Equal(BaseTime, evt.EventTime);
}
[Fact]
public void ProcessedEvent_EqualsComparison()
{
var evt1 = new ProcessedEvent("event-123", BaseTime);
var evt2 = new ProcessedEvent("event-123", BaseTime);
var evt3 = new ProcessedEvent("event-456", BaseTime);
Assert.Equal(evt1, evt2);
Assert.NotEqual(evt1, evt3);
}
}

View File

@@ -0,0 +1,355 @@
using StellaOps.Orchestrator.Core.Backfill;
namespace StellaOps.Orchestrator.Tests.Backfill;
public class EventTimeWindowTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
[Fact]
public void Duration_ReturnsCorrectValue()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
Assert.Equal(TimeSpan.FromHours(2), window.Duration);
}
[Fact]
public void IsEmpty_WithEqualStartEnd_ReturnsTrue()
{
var window = new EventTimeWindow(BaseTime, BaseTime);
Assert.True(window.IsEmpty);
}
[Fact]
public void IsEmpty_WithEndBeforeStart_ReturnsTrue()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(-1));
Assert.True(window.IsEmpty);
}
[Fact]
public void IsEmpty_WithValidWindow_ReturnsFalse()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(1));
Assert.False(window.IsEmpty);
}
[Fact]
public void Contains_TimestampInWindow_ReturnsTrue()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
Assert.True(window.Contains(BaseTime));
Assert.True(window.Contains(BaseTime.AddHours(1)));
}
[Fact]
public void Contains_TimestampAtEnd_ReturnsFalse()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
Assert.False(window.Contains(BaseTime.AddHours(2)));
}
[Fact]
public void Contains_TimestampOutsideWindow_ReturnsFalse()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
Assert.False(window.Contains(BaseTime.AddHours(-1)));
Assert.False(window.Contains(BaseTime.AddHours(3)));
}
[Fact]
public void Overlaps_WithOverlappingWindow_ReturnsTrue()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(3));
Assert.True(window1.Overlaps(window2));
Assert.True(window2.Overlaps(window1));
}
[Fact]
public void Overlaps_WithContainedWindow_ReturnsTrue()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(4));
var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(2));
Assert.True(window1.Overlaps(window2));
Assert.True(window2.Overlaps(window1));
}
[Fact]
public void Overlaps_WithAdjacentWindow_ReturnsFalse()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
var window2 = new EventTimeWindow(BaseTime.AddHours(2), BaseTime.AddHours(4));
Assert.False(window1.Overlaps(window2));
Assert.False(window2.Overlaps(window1));
}
[Fact]
public void Overlaps_WithDisjointWindow_ReturnsFalse()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(1));
var window2 = new EventTimeWindow(BaseTime.AddHours(3), BaseTime.AddHours(4));
Assert.False(window1.Overlaps(window2));
Assert.False(window2.Overlaps(window1));
}
[Fact]
public void Intersect_WithOverlappingWindow_ReturnsIntersection()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(3));
var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(4));
var intersection = window1.Intersect(window2);
Assert.NotNull(intersection);
Assert.Equal(BaseTime.AddHours(1), intersection.Start);
Assert.Equal(BaseTime.AddHours(3), intersection.End);
}
[Fact]
public void Intersect_WithContainedWindow_ReturnsContained()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(4));
var window2 = new EventTimeWindow(BaseTime.AddHours(1), BaseTime.AddHours(2));
var intersection = window1.Intersect(window2);
Assert.NotNull(intersection);
Assert.Equal(window2, intersection);
}
[Fact]
public void Intersect_WithDisjointWindow_ReturnsNull()
{
var window1 = new EventTimeWindow(BaseTime, BaseTime.AddHours(1));
var window2 = new EventTimeWindow(BaseTime.AddHours(2), BaseTime.AddHours(3));
var intersection = window1.Intersect(window2);
Assert.Null(intersection);
}
[Fact]
public void Split_DividesIntoEqualBatches()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(6));
var batches = window.Split(TimeSpan.FromHours(2)).ToList();
Assert.Equal(3, batches.Count);
Assert.Equal(BaseTime, batches[0].Start);
Assert.Equal(BaseTime.AddHours(2), batches[0].End);
Assert.Equal(BaseTime.AddHours(2), batches[1].Start);
Assert.Equal(BaseTime.AddHours(4), batches[1].End);
Assert.Equal(BaseTime.AddHours(4), batches[2].Start);
Assert.Equal(BaseTime.AddHours(6), batches[2].End);
}
[Fact]
public void Split_WithRemainder_CreatesPartialFinalBatch()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(5));
var batches = window.Split(TimeSpan.FromHours(2)).ToList();
Assert.Equal(3, batches.Count);
Assert.Equal(BaseTime.AddHours(4), batches[2].Start);
Assert.Equal(BaseTime.AddHours(5), batches[2].End);
Assert.Equal(TimeSpan.FromHours(1), batches[2].Duration);
}
[Fact]
public void Split_WithZeroDuration_Throws()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
Assert.Throws<ArgumentOutOfRangeException>(() =>
window.Split(TimeSpan.Zero).ToList());
}
[Fact]
public void Split_WithNegativeDuration_Throws()
{
var window = new EventTimeWindow(BaseTime, BaseTime.AddHours(2));
Assert.Throws<ArgumentOutOfRangeException>(() =>
window.Split(TimeSpan.FromHours(-1)).ToList());
}
[Fact]
public void FromDuration_CreatesCorrectWindow()
{
var window = EventTimeWindow.FromDuration(BaseTime, TimeSpan.FromHours(3));
Assert.Equal(BaseTime.AddHours(-3), window.Start);
Assert.Equal(BaseTime, window.End);
}
[Fact]
public void LastHours_CreatesCorrectWindow()
{
var window = EventTimeWindow.LastHours(6, BaseTime);
Assert.Equal(BaseTime.AddHours(-6), window.Start);
Assert.Equal(BaseTime, window.End);
}
[Fact]
public void LastDays_CreatesCorrectWindow()
{
var window = EventTimeWindow.LastDays(7, BaseTime);
Assert.Equal(BaseTime.AddDays(-7), window.Start);
Assert.Equal(BaseTime, window.End);
}
}
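// A minimal sketch of the half-open [Start, End) contract the tests above pin
// down; names and signatures mirror the assertions, and the shipped record may
// add validation or members beyond these.
public sealed record EventTimeWindow(DateTimeOffset Start, DateTimeOffset End)
{
    public TimeSpan Duration => End - Start;
    public bool IsEmpty => End <= Start;
    // Half-open: Start is inside the window, End is not.
    public bool Contains(DateTimeOffset timestamp) => timestamp >= Start && timestamp < End;
    // Adjacent windows (this.End == other.Start) do not overlap.
    public bool Overlaps(EventTimeWindow other) => Start < other.End && other.Start < End;
    public EventTimeWindow? Intersect(EventTimeWindow other)
    {
        var start = Start > other.Start ? Start : other.Start;
        var end = End < other.End ? End : other.End;
        return start < end ? new EventTimeWindow(start, end) : null;
    }
    public IEnumerable<EventTimeWindow> Split(TimeSpan batchDuration)
    {
        if (batchDuration <= TimeSpan.Zero)
            throw new ArgumentOutOfRangeException(nameof(batchDuration));
        return Batches();

        IEnumerable<EventTimeWindow> Batches()
        {
            for (var cursor = Start; cursor < End; cursor += batchDuration)
            {
                // Final batch is truncated at End, producing a partial window.
                var batchEnd = cursor + batchDuration;
                yield return new EventTimeWindow(cursor, batchEnd < End ? batchEnd : End);
            }
        }
    }
    public static EventTimeWindow FromDuration(DateTimeOffset end, TimeSpan duration) => new(end - duration, end);
    public static EventTimeWindow LastHours(int hours, DateTimeOffset now) => FromDuration(now, TimeSpan.FromHours(hours));
    public static EventTimeWindow LastDays(int days, DateTimeOffset now) => FromDuration(now, TimeSpan.FromDays(days));
}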
public class EventTimeWindowPlannerTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private static readonly EventTimeWindowOptions TestOptions = new(
MinWindowSize: TimeSpan.FromMinutes(5),
MaxWindowSize: TimeSpan.FromHours(1),
OverlapDuration: TimeSpan.FromMinutes(5),
MaxLag: TimeSpan.FromHours(2),
InitialLookback: TimeSpan.FromDays(7));
[Fact]
public void GetNextWindow_WithNoWatermark_ReturnsInitialWindow()
{
var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, null, TestOptions);
Assert.NotNull(window);
Assert.Equal(BaseTime - TestOptions.InitialLookback, window.Start);
Assert.Equal(window.Start + TestOptions.MaxWindowSize, window.End);
}
[Fact]
public void GetNextWindow_WithWatermark_ReturnsIncrementalWindow()
{
var watermark = BaseTime.AddHours(-2);
var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, watermark, TestOptions);
Assert.NotNull(window);
Assert.Equal(watermark - TestOptions.OverlapDuration, window.Start);
}
[Fact]
public void GetNextWindow_WhenCaughtUp_ReturnsNull()
{
var watermark = BaseTime.AddMinutes(-3); // Less than MinWindowSize from now
var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, watermark, TestOptions);
Assert.Null(window);
}
[Fact]
public void GetNextWindow_CapsAtNow()
{
var watermark = BaseTime.AddMinutes(-30); // 30 minutes ago
var window = EventTimeWindowPlanner.GetNextWindow(BaseTime, watermark, TestOptions);
Assert.NotNull(window);
Assert.True(window.End <= BaseTime);
}
[Fact]
public void CalculateLag_ReturnsCorrectValue()
{
var watermark = BaseTime.AddHours(-2);
var lag = EventTimeWindowPlanner.CalculateLag(BaseTime, watermark);
Assert.Equal(TimeSpan.FromHours(2), lag);
}
[Fact]
public void IsLagging_WithinThreshold_ReturnsFalse()
{
var watermark = BaseTime.AddHours(-1);
var isLagging = EventTimeWindowPlanner.IsLagging(BaseTime, watermark, TestOptions);
Assert.False(isLagging);
}
[Fact]
public void IsLagging_ExceedsThreshold_ReturnsTrue()
{
var watermark = BaseTime.AddHours(-3);
var isLagging = EventTimeWindowPlanner.IsLagging(BaseTime, watermark, TestOptions);
Assert.True(isLagging);
}
[Fact]
public void EstimateWindowsToProcess_WithNoWatermark_ReturnsInitialCount()
{
var count = EventTimeWindowPlanner.EstimateWindowsToProcess(BaseTime, null, TestOptions);
// 7 days / 1 hour = 168 windows
Assert.Equal(168, count);
}
[Fact]
public void EstimateWindowsToProcess_WithWatermark_ReturnsLagCount()
{
var watermark = BaseTime.AddHours(-3);
var count = EventTimeWindowPlanner.EstimateWindowsToProcess(BaseTime, watermark, TestOptions);
Assert.Equal(3, count);
}
[Fact]
public void EstimateWindowsToProcess_WhenCaughtUp_ReturnsZero()
{
var watermark = BaseTime.AddMinutes(-3);
var count = EventTimeWindowPlanner.EstimateWindowsToProcess(BaseTime, watermark, TestOptions);
Assert.Equal(0, count);
}
}
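// Sketch of the planning rule the tests above encode, assuming this branch
// order; the shipped EventTimeWindowPlanner may clamp differently at the edges.
public static class EventTimeWindowPlannerSketch
{
    public static EventTimeWindow? GetNextWindow(
        DateTimeOffset now, DateTimeOffset? watermark, EventTimeWindowOptions options)
    {
        if (watermark is null)
        {
            // First run: start at the initial lookback, one max-size window.
            var start = now - options.InitialLookback;
            return new EventTimeWindow(start, start + options.MaxWindowSize);
        }
        // Caught up: no window until at least MinWindowSize of lag has accrued.
        if (now - watermark.Value < options.MinWindowSize)
            return null;
        // Incremental: re-read a small overlap behind the watermark, capped at now.
        var windowStart = watermark.Value - options.OverlapDuration;
        var windowEnd = windowStart + options.MaxWindowSize;
        return new EventTimeWindow(windowStart, windowEnd < now ? windowEnd : now);
    }
}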
public class EventTimeWindowOptionsTests
{
[Fact]
public void HourlyBatches_HasCorrectDefaults()
{
var options = EventTimeWindowOptions.HourlyBatches;
Assert.Equal(TimeSpan.FromMinutes(5), options.MinWindowSize);
Assert.Equal(TimeSpan.FromHours(1), options.MaxWindowSize);
Assert.Equal(TimeSpan.FromMinutes(5), options.OverlapDuration);
Assert.Equal(TimeSpan.FromHours(2), options.MaxLag);
Assert.Equal(TimeSpan.FromDays(7), options.InitialLookback);
}
[Fact]
public void DailyBatches_HasCorrectDefaults()
{
var options = EventTimeWindowOptions.DailyBatches;
Assert.Equal(TimeSpan.FromHours(1), options.MinWindowSize);
Assert.Equal(TimeSpan.FromDays(1), options.MaxWindowSize);
Assert.Equal(TimeSpan.FromHours(1), options.OverlapDuration);
Assert.Equal(TimeSpan.FromDays(1), options.MaxLag);
Assert.Equal(TimeSpan.FromDays(30), options.InitialLookback);
}
}

View File

@@ -0,0 +1,157 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;
namespace StellaOps.Orchestrator.Tests.Backfill;
public class WatermarkTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
private static readonly Guid SourceId = Guid.NewGuid();
private const string JobType = "scan";
[Fact]
public void CreateScopeKey_WithSourceId_ReturnsCorrectFormat()
{
var sourceId = Guid.Parse("12345678-1234-1234-1234-123456789abc");
var scopeKey = Watermark.CreateScopeKey(sourceId);
Assert.Equal("source:12345678123412341234123456789abc", scopeKey);
}
[Fact]
public void CreateScopeKey_WithJobType_ReturnsCorrectFormat()
{
var scopeKey = Watermark.CreateScopeKey("Scan");
Assert.Equal("job_type:scan", scopeKey);
}
[Fact]
public void CreateScopeKey_WithSourceIdAndJobType_ReturnsCorrectFormat()
{
var sourceId = Guid.Parse("12345678-1234-1234-1234-123456789abc");
var scopeKey = Watermark.CreateScopeKey(sourceId, "Scan");
Assert.Equal("source:12345678123412341234123456789abc:job_type:scan", scopeKey);
}
[Fact]
public void Create_WithSourceId_CreatesValidWatermark()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
Assert.NotEqual(Guid.Empty, watermark.WatermarkId);
Assert.Equal(TenantId, watermark.TenantId);
Assert.Equal(SourceId, watermark.SourceId);
Assert.Null(watermark.JobType);
Assert.Equal(BaseTime, watermark.HighWatermark);
Assert.Null(watermark.LowWatermark);
Assert.Equal(0, watermark.SequenceNumber);
Assert.Equal(0, watermark.ProcessedCount);
Assert.Null(watermark.LastBatchHash);
Assert.Equal("system", watermark.UpdatedBy);
}
[Fact]
public void Create_WithJobType_CreatesValidWatermark()
{
var watermark = Watermark.Create(TenantId, null, JobType, BaseTime, "system");
Assert.NotEqual(Guid.Empty, watermark.WatermarkId);
Assert.Equal(TenantId, watermark.TenantId);
Assert.Null(watermark.SourceId);
Assert.Equal(JobType, watermark.JobType);
Assert.Equal($"job_type:{JobType}", watermark.ScopeKey);
}
[Fact]
public void Create_WithBothSourceIdAndJobType_CreatesCombinedScopeKey()
{
var watermark = Watermark.Create(TenantId, SourceId, JobType, BaseTime, "system");
Assert.Equal(SourceId, watermark.SourceId);
Assert.Equal(JobType, watermark.JobType);
Assert.Contains("source:", watermark.ScopeKey);
Assert.Contains("job_type:", watermark.ScopeKey);
}
[Fact]
public void Create_WithoutSourceIdOrJobType_Throws()
{
Assert.Throws<ArgumentException>(() =>
Watermark.Create(TenantId, null, null, BaseTime, "system"));
}
[Fact]
public void Advance_IncreasesHighWatermarkAndSequence()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
var newTime = BaseTime.AddHours(1);
var batchHash = "abc123def456";
var advanced = watermark.Advance(newTime, 100, batchHash, "worker-1");
Assert.Equal(newTime, advanced.HighWatermark);
Assert.Equal(1, advanced.SequenceNumber);
Assert.Equal(100, advanced.ProcessedCount);
Assert.Equal(batchHash, advanced.LastBatchHash);
Assert.Equal("worker-1", advanced.UpdatedBy);
}
[Fact]
public void Advance_AccumulatesProcessedCount()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
var after1 = watermark.Advance(BaseTime.AddHours(1), 100, null, "worker");
var after2 = after1.Advance(BaseTime.AddHours(2), 150, null, "worker");
Assert.Equal(250, after2.ProcessedCount);
Assert.Equal(2, after2.SequenceNumber);
}
[Fact]
public void Advance_WithEarlierTime_Throws()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
var earlierTime = BaseTime.AddHours(-1);
Assert.Throws<ArgumentException>(() =>
watermark.Advance(earlierTime, 100, null, "worker"));
}
[Fact]
public void WithWindow_SetsWindowBounds()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
var lowWm = BaseTime.AddHours(-1);
var highWm = BaseTime.AddHours(1);
var windowed = watermark.WithWindow(lowWm, highWm);
Assert.Equal(lowWm, windowed.LowWatermark);
Assert.Equal(highWm, windowed.HighWatermark);
}
[Fact]
public void WithWindow_HighBeforeLow_Throws()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
Assert.Throws<ArgumentException>(() =>
watermark.WithWindow(BaseTime.AddHours(1), BaseTime.AddHours(-1)));
}
[Fact]
public void WatermarkSnapshot_CalculatesLag()
{
var watermark = Watermark.Create(TenantId, SourceId, null, BaseTime, "system");
var now = BaseTime.AddHours(2);
var snapshot = WatermarkSnapshot.FromWatermark(watermark, now);
Assert.Equal(watermark.ScopeKey, snapshot.ScopeKey);
Assert.Equal(watermark.HighWatermark, snapshot.HighWatermark);
Assert.Equal(TimeSpan.FromHours(2), snapshot.Lag);
}
}
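// Member sketches of the Watermark behaviours pinned above: scope keys are
// lowercase "source:{guid:N}" / "job_type:{type}" segments, and Advance is
// strictly append-only. The surrounding record declaration is omitted here.
public static string CreateScopeKey(Guid sourceId) => $"source:{sourceId:N}";
public static string CreateScopeKey(string jobType) => $"job_type:{jobType.ToLowerInvariant()}";
public static string CreateScopeKey(Guid sourceId, string jobType)
    => $"{CreateScopeKey(sourceId)}:{CreateScopeKey(jobType)}";

public Watermark Advance(DateTimeOffset newHighWatermark, long processedInBatch, string? batchHash, string updatedBy)
{
    if (newHighWatermark < HighWatermark)
        throw new ArgumentException("High watermark can only move forward.", nameof(newHighWatermark));
    return this with
    {
        HighWatermark = newHighWatermark,
        SequenceNumber = SequenceNumber + 1,
        ProcessedCount = ProcessedCount + processedInBatch,
        LastBatchHash = batchHash,
        UpdatedBy = updatedBy,
    };
}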

View File

@@ -0,0 +1,355 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;
namespace StellaOps.Orchestrator.Tests.ControlPlane;
/// <summary>
/// Control-plane validation tests for the Run domain and its lifecycle operations.
/// These tests validate the Run record, status transitions, and job counting.
/// </summary>
public sealed class RunTests
{
private const string TestTenantId = "test-tenant";
[Fact]
public void Run_Creation_WithValidData_Succeeds()
{
var runId = Guid.NewGuid();
var sourceId = Guid.NewGuid();
var now = DateTimeOffset.UtcNow;
var run = new Run(
RunId: runId,
TenantId: TestTenantId,
ProjectId: "project-1",
SourceId: sourceId,
RunType: "scan",
Status: RunStatus.Pending,
CorrelationId: "corr-123",
TotalJobs: 5,
CompletedJobs: 0,
SucceededJobs: 0,
FailedJobs: 0,
CreatedAt: now,
StartedAt: null,
CompletedAt: null,
CreatedBy: "system",
Metadata: """{"image":"alpine:3.18"}""");
Assert.Equal(runId, run.RunId);
Assert.Equal(TestTenantId, run.TenantId);
Assert.Equal("project-1", run.ProjectId);
Assert.Equal(sourceId, run.SourceId);
Assert.Equal("scan", run.RunType);
Assert.Equal(RunStatus.Pending, run.Status);
Assert.Equal(5, run.TotalJobs);
Assert.Equal(0, run.CompletedJobs);
Assert.Null(run.StartedAt);
Assert.Null(run.CompletedAt);
}
[Fact]
public void Run_StatusTransition_PendingToRunning()
{
var run = CreateRun(RunStatus.Pending);
var started = run with
{
Status = RunStatus.Running,
StartedAt = DateTimeOffset.UtcNow
};
Assert.Equal(RunStatus.Running, started.Status);
Assert.NotNull(started.StartedAt);
}
[Fact]
public void Run_StatusTransition_RunningToSucceeded()
{
var run = CreateRun(RunStatus.Running, totalJobs: 3, completedJobs: 2, succeededJobs: 2);
var completed = run with
{
Status = RunStatus.Succeeded,
CompletedJobs = 3,
SucceededJobs = 3,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(RunStatus.Succeeded, completed.Status);
Assert.Equal(3, completed.CompletedJobs);
Assert.Equal(3, completed.SucceededJobs);
Assert.Equal(0, completed.FailedJobs);
Assert.NotNull(completed.CompletedAt);
}
[Fact]
public void Run_StatusTransition_RunningToPartiallySucceeded()
{
var run = CreateRun(RunStatus.Running, totalJobs: 5, completedJobs: 4, succeededJobs: 3, failedJobs: 1);
var completed = run with
{
Status = RunStatus.PartiallySucceeded,
CompletedJobs = 5,
SucceededJobs = 4,
FailedJobs = 1,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(RunStatus.PartiallySucceeded, completed.Status);
Assert.Equal(5, completed.CompletedJobs);
Assert.Equal(4, completed.SucceededJobs);
Assert.Equal(1, completed.FailedJobs);
}
[Fact]
public void Run_StatusTransition_RunningToFailed()
{
var run = CreateRun(RunStatus.Running, totalJobs: 3, completedJobs: 2, failedJobs: 2);
var failed = run with
{
Status = RunStatus.Failed,
CompletedJobs = 3,
FailedJobs = 3,
SucceededJobs = 0,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(RunStatus.Failed, failed.Status);
Assert.Equal(0, failed.SucceededJobs);
Assert.Equal(3, failed.FailedJobs);
}
[Fact]
public void Run_StatusTransition_ToCanceled()
{
var run = CreateRun(RunStatus.Running, totalJobs: 5, completedJobs: 2);
var canceled = run with
{
Status = RunStatus.Canceled,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(RunStatus.Canceled, canceled.Status);
Assert.Equal(2, canceled.CompletedJobs); // Preserves completed count
Assert.NotNull(canceled.CompletedAt);
}
[Theory]
[InlineData(RunStatus.Pending)]
[InlineData(RunStatus.Running)]
[InlineData(RunStatus.Succeeded)]
[InlineData(RunStatus.PartiallySucceeded)]
[InlineData(RunStatus.Failed)]
[InlineData(RunStatus.Canceled)]
public void RunStatus_AllValues_AreValid(RunStatus status)
{
var run = CreateRun(status);
Assert.Equal(status, run.Status);
}
[Fact]
public void Run_JobCounting_IncrementSucceeded()
{
var run = CreateRun(RunStatus.Running, totalJobs: 3, completedJobs: 0);
var afterOne = run with
{
CompletedJobs = 1,
SucceededJobs = 1
};
var afterTwo = afterOne with
{
CompletedJobs = 2,
SucceededJobs = 2
};
var afterThree = afterTwo with
{
CompletedJobs = 3,
SucceededJobs = 3,
Status = RunStatus.Succeeded,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(1, afterOne.CompletedJobs);
Assert.Equal(2, afterTwo.CompletedJobs);
Assert.Equal(3, afterThree.CompletedJobs);
Assert.Equal(RunStatus.Succeeded, afterThree.Status);
}
[Fact]
public void Run_JobCounting_IncrementFailed()
{
var run = CreateRun(RunStatus.Running, totalJobs: 2, completedJobs: 0);
var afterOne = run with
{
CompletedJobs = 1,
FailedJobs = 1
};
var afterTwo = afterOne with
{
CompletedJobs = 2,
FailedJobs = 2,
Status = RunStatus.Failed,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(2, afterTwo.FailedJobs);
Assert.Equal(0, afterTwo.SucceededJobs);
Assert.Equal(RunStatus.Failed, afterTwo.Status);
}
[Fact]
public void Run_JobCounting_MixedResults_PartialSuccess()
{
var run = CreateRun(RunStatus.Running, totalJobs: 4);
var final = run with
{
CompletedJobs = 4,
SucceededJobs = 3,
FailedJobs = 1,
Status = RunStatus.PartiallySucceeded,
CompletedAt = DateTimeOffset.UtcNow
};
Assert.Equal(4, final.CompletedJobs);
Assert.Equal(3, final.SucceededJobs);
Assert.Equal(1, final.FailedJobs);
Assert.Equal(RunStatus.PartiallySucceeded, final.Status);
}
[Fact]
public void Run_JobCounting_Invariant_CompletedEqualsSucceededPlusFailed()
{
var run = CreateRun(
RunStatus.Running,
totalJobs: 10,
completedJobs: 7,
succeededJobs: 5,
failedJobs: 2);
Assert.Equal(run.SucceededJobs + run.FailedJobs, run.CompletedJobs);
}
[Fact]
public void Run_Duration_CanBeCalculated()
{
var startedAt = new DateTimeOffset(2025, 1, 1, 10, 0, 0, TimeSpan.Zero);
var completedAt = new DateTimeOffset(2025, 1, 1, 10, 5, 30, TimeSpan.Zero);
var run = new Run(
Guid.NewGuid(), TestTenantId, null, Guid.NewGuid(), "scan",
RunStatus.Succeeded, null, 5, 5, 5, 0,
startedAt.AddMinutes(-1), startedAt, completedAt, "system", null);
var duration = run.CompletedAt!.Value - run.StartedAt!.Value;
Assert.Equal(TimeSpan.FromMinutes(5.5), duration);
}
[Theory]
[InlineData("scan")]
[InlineData("advisory-sync")]
[InlineData("export")]
[InlineData("policy-evaluation")]
public void Run_RunType_AcceptsValidTypes(string runType)
{
var run = CreateRun(runType: runType);
Assert.Equal(runType, run.RunType);
}
[Fact]
public void Run_ProjectId_CanBeNull()
{
var run = CreateRun(projectId: null);
Assert.Null(run.ProjectId);
}
[Fact]
public void Run_CorrelationId_ForDistributedTracing()
{
var correlationId = "trace-" + Guid.NewGuid().ToString("N")[..8];
var run = CreateRun(correlationId: correlationId);
Assert.Equal(correlationId, run.CorrelationId);
}
[Fact]
public void Run_Metadata_CanContainJsonBlob()
{
var metadata = """
{
"image": "alpine:3.18",
"analyzers": ["syft", "grype", "trivy"],
"priority": "high"
}
""";
var run = CreateRun(metadata: metadata);
Assert.Contains("alpine:3.18", run.Metadata);
Assert.Contains("analyzers", run.Metadata);
}
[Fact]
public void Run_Equality_BasedOnRecordSemantics()
{
var runId = Guid.NewGuid();
var sourceId = Guid.NewGuid();
var now = DateTimeOffset.UtcNow;
var run1 = new Run(
runId, TestTenantId, null, sourceId, "scan",
RunStatus.Pending, null, 5, 0, 0, 0,
now, null, null, "system", null);
var run2 = new Run(
runId, TestTenantId, null, sourceId, "scan",
RunStatus.Pending, null, 5, 0, 0, 0,
now, null, null, "system", null);
Assert.Equal(run1, run2);
}
[Fact]
public void Run_ZeroTotalJobs_IsValid()
{
// Edge case: run with no jobs (perhaps all filtered out)
var run = CreateRun(totalJobs: 0);
Assert.Equal(0, run.TotalJobs);
Assert.Equal(0, run.CompletedJobs);
}
private static Run CreateRun(
RunStatus status = RunStatus.Pending,
int totalJobs = 5,
int completedJobs = 0,
int succeededJobs = 0,
int failedJobs = 0,
string runType = "test-run",
string? projectId = "test-project",
string? correlationId = null,
string? metadata = null)
{
var now = DateTimeOffset.UtcNow;
return new Run(
RunId: Guid.NewGuid(),
TenantId: TestTenantId,
ProjectId: projectId,
SourceId: Guid.NewGuid(),
RunType: runType,
Status: status,
CorrelationId: correlationId,
TotalJobs: totalJobs,
CompletedJobs: completedJobs,
SucceededJobs: succeededJobs,
FailedJobs: failedJobs,
CreatedAt: now,
StartedAt: status == RunStatus.Running ? now : null,
CompletedAt: null,
CreatedBy: "system",
Metadata: metadata);
}
}
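// Hypothetical helper, not part of the Run record shown by these tests: it
// makes explicit how a terminal status follows from the job counters, given
// the invariant CompletedJobs == SucceededJobs + FailedJobs.
public static class RunStatusSketch
{
    public static RunStatus ResolveTerminalStatus(int totalJobs, int succeededJobs, int failedJobs)
    {
        if (succeededJobs + failedJobs != totalJobs)
            throw new InvalidOperationException("Run still has jobs outstanding.");
        if (failedJobs == 0) return RunStatus.Succeeded;  // also covers the zero-job edge case
        if (succeededJobs == 0) return RunStatus.Failed;
        return RunStatus.PartiallySucceeded;
    }
}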

View File

@@ -0,0 +1,260 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;
namespace StellaOps.Orchestrator.Tests.ControlPlane;
/// <summary>
/// Control-plane validation tests for the Source domain and its operations.
/// These tests validate the Source record, its invariants, and business rules.
/// </summary>
public sealed class SourceTests
{
private const string TestTenantId = "test-tenant";
[Fact]
public void Source_Creation_WithValidData_Succeeds()
{
var sourceId = Guid.NewGuid();
var now = DateTimeOffset.UtcNow;
var source = new Source(
SourceId: sourceId,
TenantId: TestTenantId,
Name: "concelier-nvd",
SourceType: "advisory-ingest",
Enabled: true,
Paused: false,
PauseReason: null,
PauseTicket: null,
Configuration: """{"feed_url":"https://nvd.nist.gov"}""",
CreatedAt: now,
UpdatedAt: now,
UpdatedBy: "system");
Assert.Equal(sourceId, source.SourceId);
Assert.Equal(TestTenantId, source.TenantId);
Assert.Equal("concelier-nvd", source.Name);
Assert.Equal("advisory-ingest", source.SourceType);
Assert.True(source.Enabled);
Assert.False(source.Paused);
Assert.Null(source.PauseReason);
Assert.NotNull(source.Configuration);
}
[Fact]
public void Source_Creation_WithPausedState_HasReasonAndTicket()
{
var source = CreatePausedSource(
"Maintenance window",
"OPS-1234");
Assert.True(source.Paused);
Assert.Equal("Maintenance window", source.PauseReason);
Assert.Equal("OPS-1234", source.PauseTicket);
}
[Fact]
public void Source_Creation_DisabledSource_IsNotPaused()
{
var source = CreateSource(enabled: false, paused: false);
Assert.False(source.Enabled);
Assert.False(source.Paused);
}
[Fact]
public void Source_WithRecord_AllowsImmutableUpdates()
{
var original = CreateSource();
var updated = original with { Enabled = false, UpdatedAt = DateTimeOffset.UtcNow };
Assert.True(original.Enabled);
Assert.False(updated.Enabled);
Assert.Equal(original.SourceId, updated.SourceId);
Assert.Equal(original.Name, updated.Name);
}
[Fact]
public void Source_Pause_UpdatesStateCorrectly()
{
var original = CreateSource();
var now = DateTimeOffset.UtcNow;
var paused = original with
{
Paused = true,
PauseReason = "Rate limit exceeded",
PauseTicket = "INC-5678",
UpdatedAt = now,
UpdatedBy = "operator"
};
Assert.False(original.Paused);
Assert.True(paused.Paused);
Assert.Equal("Rate limit exceeded", paused.PauseReason);
Assert.Equal("INC-5678", paused.PauseTicket);
Assert.Equal("operator", paused.UpdatedBy);
}
[Fact]
public void Source_Resume_ClearsReasonAndTicket()
{
var paused = CreatePausedSource("Test reason", "TICKET-123");
var now = DateTimeOffset.UtcNow;
var resumed = paused with
{
Paused = false,
PauseReason = null,
PauseTicket = null,
UpdatedAt = now,
UpdatedBy = "operator"
};
Assert.False(resumed.Paused);
Assert.Null(resumed.PauseReason);
Assert.Null(resumed.PauseTicket);
}
[Theory]
[InlineData("advisory-ingest")]
[InlineData("scanner")]
[InlineData("export")]
[InlineData("scheduler")]
[InlineData("policy")]
public void Source_SourceType_AcceptsValidTypes(string sourceType)
{
var source = CreateSource(sourceType: sourceType);
Assert.Equal(sourceType, source.SourceType);
}
[Fact]
public void Source_Configuration_CanBeNull()
{
var source = CreateSource(configuration: null);
Assert.Null(source.Configuration);
}
[Fact]
public void Source_Configuration_CanContainJsonBlob()
{
var config = """
{
"feed_url": "https://nvd.nist.gov",
"poll_interval_seconds": 3600,
"retry_policy": {
"max_attempts": 3,
"backoff_multiplier": 2.0
}
}
""";
var source = CreateSource(configuration: config);
Assert.Contains("feed_url", source.Configuration);
Assert.Contains("retry_policy", source.Configuration);
}
[Fact]
public void Source_Equality_BasedOnRecordSemantics()
{
var sourceId = Guid.NewGuid();
var now = DateTimeOffset.UtcNow;
var source1 = new Source(
sourceId, TestTenantId, "test", "type", true, false,
null, null, null, now, now, "user");
var source2 = new Source(
sourceId, TestTenantId, "test", "type", true, false,
null, null, null, now, now, "user");
Assert.Equal(source1, source2);
Assert.Equal(source1.GetHashCode(), source2.GetHashCode());
}
[Fact]
public void Source_Inequality_WhenDifferentFields()
{
var source1 = CreateSource(name: "source-a");
var source2 = CreateSource(name: "source-b");
Assert.NotEqual(source1, source2);
}
[Fact]
public void Source_CanBeDisabledWhilePaused()
{
var source = CreateSource(enabled: false, paused: true)
with { PauseReason = "Permanently retired" };
Assert.False(source.Enabled);
Assert.True(source.Paused);
Assert.Equal("Permanently retired", source.PauseReason);
}
[Fact]
public void Source_UpdatedBy_TracksLastModifier()
{
var source = CreateSource(updatedBy: "system");
var modified = source with { UpdatedBy = "admin@example.com" };
Assert.Equal("system", source.UpdatedBy);
Assert.Equal("admin@example.com", modified.UpdatedBy);
}
[Fact]
public void Source_Timestamps_ArePreserved()
{
var createdAt = new DateTimeOffset(2025, 1, 1, 0, 0, 0, TimeSpan.Zero);
var updatedAt = new DateTimeOffset(2025, 6, 15, 12, 30, 0, TimeSpan.Zero);
var source = new Source(
Guid.NewGuid(), TestTenantId, "test", "type", true, false,
null, null, null, createdAt, updatedAt, "user");
Assert.Equal(createdAt, source.CreatedAt);
Assert.Equal(updatedAt, source.UpdatedAt);
Assert.True(source.UpdatedAt > source.CreatedAt);
}
private static Source CreateSource(
string name = "test-source",
string sourceType = "test-type",
bool enabled = true,
bool paused = false,
string? configuration = null,
string updatedBy = "system")
{
var now = DateTimeOffset.UtcNow;
return new Source(
SourceId: Guid.NewGuid(),
TenantId: TestTenantId,
Name: name,
SourceType: sourceType,
Enabled: enabled,
Paused: paused,
PauseReason: null,
PauseTicket: null,
Configuration: configuration,
CreatedAt: now,
UpdatedAt: now,
UpdatedBy: updatedBy);
}
private static Source CreatePausedSource(string reason, string? ticket = null)
{
var now = DateTimeOffset.UtcNow;
return new Source(
SourceId: Guid.NewGuid(),
TenantId: TestTenantId,
Name: "paused-source",
SourceType: "test-type",
Enabled: true,
Paused: true,
PauseReason: reason,
PauseTicket: ticket,
Configuration: null,
CreatedAt: now,
UpdatedAt: now,
UpdatedBy: "operator");
}
}
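// Hypothetical convenience extensions (not shown in the Source API itself)
// capturing the pause/resume conventions the tests above rely on: pausing
// records a reason and optional ticket, resuming clears both.
public static class SourceLifecycleExtensions
{
    public static Source Pause(this Source source, string reason, string? ticket, string actor, DateTimeOffset now)
        => source with { Paused = true, PauseReason = reason, PauseTicket = ticket, UpdatedAt = now, UpdatedBy = actor };

    public static Source Resume(this Source source, string actor, DateTimeOffset now)
        => source with { Paused = false, PauseReason = null, PauseTicket = null, UpdatedAt = now, UpdatedBy = actor };
}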

View File

@@ -0,0 +1,320 @@
using StellaOps.Orchestrator.Core.Domain;
using Xunit;
namespace StellaOps.Orchestrator.Tests.DeadLetter;
public class DeadLetterEntryTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
private static Job CreateTestJob() =>
new(
JobId: Guid.NewGuid(),
TenantId: TenantId,
ProjectId: null,
RunId: Guid.NewGuid(),
JobType: "scan.image",
Status: JobStatus.Failed,
Priority: 0,
Attempt: 3,
MaxAttempts: 3,
PayloadDigest: "abcd1234" + new string('0', 56),
Payload: """{"image":"test:latest"}""",
IdempotencyKey: "test-key-123",
CorrelationId: "trace-456",
LeaseId: null,
WorkerId: null,
TaskRunnerId: null,
LeaseUntil: null,
CreatedAt: BaseTime.AddHours(-1),
ScheduledAt: BaseTime.AddMinutes(-50),
LeasedAt: BaseTime.AddMinutes(-45),
CompletedAt: BaseTime,
NotBefore: null,
Reason: "Connection timeout",
ReplayOf: null,
CreatedBy: "test-user");
[Fact]
public void FromFailedJob_CreatesValidEntry()
{
var job = CreateTestJob();
var entry = DeadLetterEntry.FromFailedJob(
job,
errorCode: "ORCH-TRN-001",
failureReason: "Network timeout",
remediationHint: "Check connectivity",
category: ErrorCategory.Transient,
isRetryable: true,
now: BaseTime);
Assert.NotEqual(Guid.Empty, entry.EntryId);
Assert.Equal(TenantId, entry.TenantId);
Assert.Equal(job.JobId, entry.OriginalJobId);
Assert.Equal(job.RunId, entry.RunId);
Assert.Equal(job.JobType, entry.JobType);
Assert.Equal(job.Payload, entry.Payload);
Assert.Equal(job.PayloadDigest, entry.PayloadDigest);
Assert.Equal(job.IdempotencyKey, entry.IdempotencyKey);
Assert.Equal(job.CorrelationId, entry.CorrelationId);
Assert.Equal(DeadLetterStatus.Pending, entry.Status);
Assert.Equal("ORCH-TRN-001", entry.ErrorCode);
Assert.Equal("Network timeout", entry.FailureReason);
Assert.Equal("Check connectivity", entry.RemediationHint);
Assert.Equal(ErrorCategory.Transient, entry.Category);
Assert.True(entry.IsRetryable);
Assert.Equal(3, entry.OriginalAttempts);
Assert.Equal(0, entry.ReplayAttempts);
Assert.Equal(3, entry.MaxReplayAttempts);
Assert.Equal(BaseTime, entry.FailedAt);
Assert.Equal(BaseTime, entry.CreatedAt);
Assert.False(entry.IsTerminal);
Assert.True(entry.CanReplay);
}
[Fact]
public void FromFailedJob_WithCustomRetention_SetsExpiresAt()
{
var job = CreateTestJob();
var retention = TimeSpan.FromDays(60);
var entry = DeadLetterEntry.FromFailedJob(
job, "ERR", "Failed", null, ErrorCategory.Unknown, false, BaseTime,
retention: retention);
Assert.Equal(BaseTime.AddDays(60), entry.ExpiresAt);
}
[Fact]
public void FromFailedJob_WithCustomMaxReplays_SetsMaxReplayAttempts()
{
var job = CreateTestJob();
var entry = DeadLetterEntry.FromFailedJob(
job, "ERR", "Failed", null, ErrorCategory.Unknown, true, BaseTime,
maxReplayAttempts: 5);
Assert.Equal(5, entry.MaxReplayAttempts);
}
[Fact]
public void StartReplay_TransitionsToReplaying()
{
var entry = CreatePendingEntry();
var replaying = entry.StartReplay("operator", BaseTime.AddMinutes(5));
Assert.Equal(DeadLetterStatus.Replaying, replaying.Status);
Assert.Equal(1, replaying.ReplayAttempts);
Assert.Equal("operator", replaying.UpdatedBy);
Assert.False(replaying.IsTerminal);
}
[Fact]
public void StartReplay_IncreasesAttemptCount()
{
var entry = CreatePendingEntry() with { ReplayAttempts = 1 };
var replaying = entry.StartReplay("operator", BaseTime);
Assert.Equal(2, replaying.ReplayAttempts);
}
[Fact]
public void StartReplay_WhenNotRetryable_Throws()
{
var entry = CreatePendingEntry() with { IsRetryable = false };
Assert.Throws<InvalidOperationException>(() =>
entry.StartReplay("operator", BaseTime));
}
[Fact]
public void StartReplay_WhenExhausted_Throws()
{
var entry = CreatePendingEntry() with { ReplayAttempts = 3 };
Assert.Throws<InvalidOperationException>(() =>
entry.StartReplay("operator", BaseTime));
}
[Fact]
public void StartReplay_WhenTerminal_Throws()
{
var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Resolved };
Assert.Throws<InvalidOperationException>(() =>
entry.StartReplay("operator", BaseTime));
}
[Fact]
public void CompleteReplay_TransitionsToReplayed()
{
var entry = CreatePendingEntry().StartReplay("op", BaseTime);
var newJobId = Guid.NewGuid();
var completed = entry.CompleteReplay(newJobId, "op", BaseTime.AddMinutes(1));
Assert.Equal(DeadLetterStatus.Replayed, completed.Status);
Assert.Equal(BaseTime.AddMinutes(1), completed.ResolvedAt);
Assert.Contains(newJobId.ToString(), completed.ResolutionNotes);
Assert.True(completed.IsTerminal);
Assert.False(completed.CanReplay);
}
[Fact]
public void CompleteReplay_WhenNotReplaying_Throws()
{
var entry = CreatePendingEntry();
Assert.Throws<InvalidOperationException>(() =>
entry.CompleteReplay(Guid.NewGuid(), "op", BaseTime));
}
[Fact]
public void FailReplay_WithAttemptsRemaining_ReturnsToPending()
{
var entry = CreatePendingEntry().StartReplay("op", BaseTime);
var failed = entry.FailReplay("Timeout", "op", BaseTime.AddMinutes(1));
Assert.Equal(DeadLetterStatus.Pending, failed.Status);
Assert.Equal("Timeout", failed.FailureReason);
Assert.False(failed.IsTerminal);
Assert.True(failed.CanReplay); // Still has 2 more attempts
}
[Fact]
public void FailReplay_WithNoAttemptsRemaining_TransitionsToExhausted()
{
var entry = CreatePendingEntry() with { ReplayAttempts = 2 };
var replaying = entry.StartReplay("op", BaseTime); // Now at 3 attempts
var failed = replaying.FailReplay("Final failure", "op", BaseTime);
Assert.Equal(DeadLetterStatus.Exhausted, failed.Status);
Assert.True(failed.IsTerminal);
Assert.False(failed.CanReplay);
}
[Fact]
public void Resolve_TransitionsToResolved()
{
var entry = CreatePendingEntry();
var resolved = entry.Resolve("Manually verified as expected", "admin", BaseTime);
Assert.Equal(DeadLetterStatus.Resolved, resolved.Status);
Assert.Equal(BaseTime, resolved.ResolvedAt);
Assert.Equal("Manually verified as expected", resolved.ResolutionNotes);
Assert.Equal("admin", resolved.UpdatedBy);
Assert.True(resolved.IsTerminal);
}
[Fact]
public void Resolve_WhenTerminal_Throws()
{
var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Replayed };
Assert.Throws<InvalidOperationException>(() =>
entry.Resolve("Notes", "admin", BaseTime));
}
[Fact]
public void MarkExpired_TransitionsToExpired()
{
var entry = CreatePendingEntry();
var expired = entry.MarkExpired(BaseTime.AddDays(31));
Assert.Equal(DeadLetterStatus.Expired, expired.Status);
Assert.Equal("system", expired.UpdatedBy);
Assert.True(expired.IsTerminal);
}
[Fact]
public void MarkExpired_WhenTerminal_Throws()
{
var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Resolved };
Assert.Throws<InvalidOperationException>(() =>
entry.MarkExpired(BaseTime));
}
[Fact]
public void CanReplay_WhenRetryableAndNotTerminalAndAttemptsAvailable_ReturnsTrue()
{
var entry = CreatePendingEntry();
Assert.True(entry.CanReplay);
}
[Fact]
public void CanReplay_WhenNotRetryable_ReturnsFalse()
{
var entry = CreatePendingEntry() with { IsRetryable = false };
Assert.False(entry.CanReplay);
}
[Fact]
public void CanReplay_WhenTerminal_ReturnsFalse()
{
var entry = CreatePendingEntry() with { Status = DeadLetterStatus.Replayed };
Assert.False(entry.CanReplay);
}
[Fact]
public void CanReplay_WhenMaxAttemptsReached_ReturnsFalse()
{
var entry = CreatePendingEntry() with { ReplayAttempts = 3 };
Assert.False(entry.CanReplay);
}
[Theory]
[InlineData(DeadLetterStatus.Pending, false)]
[InlineData(DeadLetterStatus.Replaying, false)]
[InlineData(DeadLetterStatus.Replayed, true)]
[InlineData(DeadLetterStatus.Resolved, true)]
[InlineData(DeadLetterStatus.Exhausted, true)]
[InlineData(DeadLetterStatus.Expired, true)]
public void IsTerminal_ReturnsCorrectValue(DeadLetterStatus status, bool expectedTerminal)
{
var entry = CreatePendingEntry() with { Status = status };
Assert.Equal(expectedTerminal, entry.IsTerminal);
}
private static DeadLetterEntry CreatePendingEntry() =>
new(
EntryId: Guid.NewGuid(),
TenantId: TenantId,
OriginalJobId: Guid.NewGuid(),
RunId: Guid.NewGuid(),
SourceId: null,
JobType: "scan.image",
Payload: "{}",
PayloadDigest: new string('a', 64),
IdempotencyKey: "key-123",
CorrelationId: "trace-456",
Status: DeadLetterStatus.Pending,
ErrorCode: "ORCH-TRN-001",
FailureReason: "Network timeout",
RemediationHint: "Check connectivity",
Category: ErrorCategory.Transient,
IsRetryable: true,
OriginalAttempts: 3,
ReplayAttempts: 0,
MaxReplayAttempts: 3,
FailedAt: BaseTime,
CreatedAt: BaseTime,
UpdatedAt: BaseTime,
ExpiresAt: BaseTime.AddDays(30),
ResolvedAt: null,
ResolutionNotes: null,
CreatedBy: "test-user",
UpdatedBy: "system");
}
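// Member sketch of the derived flags the theory above pins down: Pending and
// Replaying are the only non-terminal states, and replay requires all three
// gates (retryable, non-terminal, attempts remaining) to pass.
public bool IsTerminal => Status is DeadLetterStatus.Replayed
    or DeadLetterStatus.Resolved
    or DeadLetterStatus.Exhausted
    or DeadLetterStatus.Expired;

public bool CanReplay => IsRetryable && !IsTerminal && ReplayAttempts < MaxReplayAttempts;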

View File

@@ -0,0 +1,265 @@
using StellaOps.Orchestrator.Core.DeadLetter;
using StellaOps.Orchestrator.Core.Domain;
using Xunit;
namespace StellaOps.Orchestrator.Tests.DeadLetter;
public class ErrorClassificationTests
{
private readonly DefaultErrorClassifier _classifier = new();
[Fact]
public void Classify_KnownErrorCode_ReturnsCorrectClassification()
{
var result = _classifier.Classify(DefaultErrorClassifier.ErrorCodes.NetworkTimeout, "test");
Assert.Equal(DefaultErrorClassifier.ErrorCodes.NetworkTimeout, result.ErrorCode);
Assert.Equal(ErrorCategory.Transient, result.Category);
Assert.True(result.IsRetryable);
Assert.NotNull(result.SuggestedRetryDelay);
}
[Fact]
public void Classify_UnknownErrorCode_InfersFromPrefix()
{
var result = _classifier.Classify("ORCH-TRN-999", "Custom transient error");
Assert.Equal("ORCH-TRN-999", result.ErrorCode);
Assert.Equal(ErrorCategory.Transient, result.Category);
Assert.True(result.IsRetryable);
}
[Fact]
public void Classify_UnknownPrefix_ReturnsUnknownCategory()
{
var result = _classifier.Classify("CUSTOM-ERR-001", "Unknown error");
Assert.Equal("CUSTOM-ERR-001", result.ErrorCode);
Assert.Equal(ErrorCategory.Unknown, result.Category);
Assert.False(result.IsRetryable);
}
[Theory]
[InlineData(DefaultErrorClassifier.ErrorCodes.NetworkTimeout, ErrorCategory.Transient, true)]
[InlineData(DefaultErrorClassifier.ErrorCodes.ImageNotFound, ErrorCategory.NotFound, false)]
[InlineData(DefaultErrorClassifier.ErrorCodes.InvalidCredentials, ErrorCategory.AuthFailure, false)]
[InlineData(DefaultErrorClassifier.ErrorCodes.RateLimited, ErrorCategory.RateLimited, true)]
[InlineData(DefaultErrorClassifier.ErrorCodes.InvalidPayload, ErrorCategory.ValidationError, false)]
[InlineData(DefaultErrorClassifier.ErrorCodes.RegistryError, ErrorCategory.UpstreamError, true)]
[InlineData(DefaultErrorClassifier.ErrorCodes.InternalError, ErrorCategory.InternalError, false)]
[InlineData(DefaultErrorClassifier.ErrorCodes.DuplicateJob, ErrorCategory.Conflict, false)]
[InlineData(DefaultErrorClassifier.ErrorCodes.UserCanceled, ErrorCategory.Canceled, false)]
public void Classify_ErrorCode_ReturnsExpectedCategory(string errorCode, ErrorCategory expectedCategory, bool expectedRetryable)
{
var result = _classifier.Classify(errorCode, "test");
Assert.Equal(expectedCategory, result.Category);
Assert.Equal(expectedRetryable, result.IsRetryable);
}
[Fact]
public void Classify_TimeoutException_ReturnsTransient()
{
var exception = new TimeoutException("Operation timed out");
var result = _classifier.Classify(exception);
Assert.Equal(ErrorCategory.Transient, result.Category);
Assert.True(result.IsRetryable);
}
[Fact]
public void Classify_OperationCanceledException_ReturnsCanceled()
{
var exception = new OperationCanceledException();
var result = _classifier.Classify(exception);
Assert.Equal(ErrorCategory.Canceled, result.Category);
Assert.False(result.IsRetryable);
}
[Fact]
public void Classify_ExceptionWithConnectionRefused_ReturnsTransient()
{
var exception = new Exception("connection refused by remote host");
var result = _classifier.Classify(exception);
Assert.Equal(DefaultErrorClassifier.ErrorCodes.ConnectionRefused, result.ErrorCode);
Assert.Equal(ErrorCategory.Transient, result.Category);
}
[Fact]
public void Classify_ExceptionWithDns_ReturnsTransient()
{
var exception = new Exception("DNS resolution failed");
var result = _classifier.Classify(exception);
Assert.Equal(DefaultErrorClassifier.ErrorCodes.DnsResolutionFailed, result.ErrorCode);
Assert.Equal(ErrorCategory.Transient, result.Category);
}
[Fact]
public void Classify_ExceptionWithCertificate_ReturnsAuthFailure()
{
var exception = new Exception("SSL certificate validation failed");
var result = _classifier.Classify(exception);
Assert.Equal(DefaultErrorClassifier.ErrorCodes.CertificateError, result.ErrorCode);
Assert.Equal(ErrorCategory.AuthFailure, result.Category);
}
[Fact]
public void Classify_GenericException_ReturnsUnexpectedError()
{
var exception = new Exception("Something unexpected happened");
var result = _classifier.Classify(exception);
Assert.Equal(DefaultErrorClassifier.ErrorCodes.UnexpectedError, result.ErrorCode);
Assert.Equal(ErrorCategory.InternalError, result.Category);
Assert.False(result.IsRetryable);
}
[Theory]
[InlineData(400, ErrorCategory.ValidationError)]
[InlineData(401, ErrorCategory.AuthFailure)]
[InlineData(403, ErrorCategory.AuthFailure)]
[InlineData(404, ErrorCategory.NotFound)]
[InlineData(408, ErrorCategory.Transient)]
[InlineData(409, ErrorCategory.Conflict)]
[InlineData(429, ErrorCategory.RateLimited)]
[InlineData(500, ErrorCategory.InternalError)]
[InlineData(502, ErrorCategory.UpstreamError)]
[InlineData(503, ErrorCategory.Transient)]
[InlineData(504, ErrorCategory.Transient)]
public void ClassifyHttpError_ReturnsExpectedCategory(int statusCode, ErrorCategory expectedCategory)
{
var result = _classifier.ClassifyHttpError(statusCode, "HTTP error");
Assert.Equal(expectedCategory, result.Category);
}
[Fact]
public void ClassifyHttpError_429_IsRetryable()
{
var result = _classifier.ClassifyHttpError(429, "Too many requests");
Assert.True(result.IsRetryable);
Assert.NotNull(result.SuggestedRetryDelay);
}
[Fact]
public void ClassifyHttpError_503_IsRetryable()
{
var result = _classifier.ClassifyHttpError(503, "Service unavailable");
Assert.True(result.IsRetryable);
Assert.NotNull(result.SuggestedRetryDelay);
}
[Fact]
public void ClassifyHttpError_400_IsNotRetryable()
{
var result = _classifier.ClassifyHttpError(400, "Bad request");
Assert.False(result.IsRetryable);
Assert.Null(result.SuggestedRetryDelay);
}
[Fact]
public void ClassifyHttpError_Unknown4xx_ReturnsValidationError()
{
var result = _classifier.ClassifyHttpError(418, "I'm a teapot");
Assert.Equal(ErrorCategory.ValidationError, result.Category);
Assert.Equal("HTTP-418", result.ErrorCode);
}
[Fact]
public void ClassifyHttpError_Unknown5xx_ReturnsUpstreamError()
{
var result = _classifier.ClassifyHttpError(599, "Custom server error");
Assert.Equal(ErrorCategory.UpstreamError, result.Category);
Assert.Equal("HTTP-599", result.ErrorCode);
Assert.True(result.IsRetryable);
}
[Fact]
public void AllKnownErrorCodes_HaveRemediationHints()
{
var errorCodes = new[]
{
DefaultErrorClassifier.ErrorCodes.NetworkTimeout,
DefaultErrorClassifier.ErrorCodes.ConnectionRefused,
DefaultErrorClassifier.ErrorCodes.ServiceUnavailable,
DefaultErrorClassifier.ErrorCodes.ImageNotFound,
DefaultErrorClassifier.ErrorCodes.InvalidCredentials,
DefaultErrorClassifier.ErrorCodes.RateLimited,
DefaultErrorClassifier.ErrorCodes.InvalidPayload,
DefaultErrorClassifier.ErrorCodes.InternalError
};
foreach (var code in errorCodes)
{
var result = _classifier.Classify(code, "test");
Assert.NotNull(result.RemediationHint);
Assert.NotEmpty(result.RemediationHint);
}
}
[Fact]
public void TransientErrors_HaveSuggestedRetryDelay()
{
var transientCodes = new[]
{
DefaultErrorClassifier.ErrorCodes.NetworkTimeout,
DefaultErrorClassifier.ErrorCodes.ConnectionRefused,
DefaultErrorClassifier.ErrorCodes.ServiceUnavailable,
DefaultErrorClassifier.ErrorCodes.GatewayTimeout
};
foreach (var code in transientCodes)
{
var result = _classifier.Classify(code, "test");
Assert.NotNull(result.SuggestedRetryDelay);
Assert.True(result.SuggestedRetryDelay.Value > TimeSpan.Zero);
}
}
}
public class ClassifiedErrorTests
{
[Fact]
public void ClassifiedError_StoresAllProperties()
{
var error = new ClassifiedError(
ErrorCode: "TEST-001",
Category: ErrorCategory.Transient,
Description: "Test error",
RemediationHint: "Try again",
IsRetryable: true,
SuggestedRetryDelay: TimeSpan.FromMinutes(5));
Assert.Equal("TEST-001", error.ErrorCode);
Assert.Equal(ErrorCategory.Transient, error.Category);
Assert.Equal("Test error", error.Description);
Assert.Equal("Try again", error.RemediationHint);
Assert.True(error.IsRetryable);
Assert.Equal(TimeSpan.FromMinutes(5), error.SuggestedRetryDelay);
}
[Fact]
public void ClassifiedError_EqualsComparison()
{
var error1 = new ClassifiedError("TEST", ErrorCategory.Unknown, "Desc", "Hint", false, null);
var error2 = new ClassifiedError("TEST", ErrorCategory.Unknown, "Desc", "Hint", false, null);
var error3 = new ClassifiedError("OTHER", ErrorCategory.Unknown, "Desc", "Hint", false, null);
Assert.Equal(error1, error2);
Assert.NotEqual(error1, error3);
}
}
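// Sketch of the status-code mapping the HTTP theory above encodes (method name
// illustrative); the shipped classifier additionally attaches error codes such
// as "HTTP-418", remediation hints, and suggested retry delays.
private static ErrorCategory MapHttpStatus(int statusCode) => statusCode switch
{
    400 => ErrorCategory.ValidationError,
    401 or 403 => ErrorCategory.AuthFailure,
    404 => ErrorCategory.NotFound,
    408 or 503 or 504 => ErrorCategory.Transient,
    409 => ErrorCategory.Conflict,
    429 => ErrorCategory.RateLimited,
    500 => ErrorCategory.InternalError,
    502 => ErrorCategory.UpstreamError,
    >= 400 and < 500 => ErrorCategory.ValidationError, // unknown 4xx, e.g. 418
    _ => ErrorCategory.UpstreamError                   // unknown 5xx, e.g. 599
};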

View File

@@ -0,0 +1,309 @@
using StellaOps.Orchestrator.Core.DeadLetter;
using StellaOps.Orchestrator.Core.Domain;
using Xunit;
namespace StellaOps.Orchestrator.Tests.DeadLetter;
public class NotificationRuleTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
[Fact]
public void Create_SetsDefaultValues()
{
var rule = NotificationRule.Create(
TenantId,
NotificationChannel.Slack,
"https://hooks.slack.com/test",
"admin");
Assert.NotEqual(Guid.Empty, rule.RuleId);
Assert.Equal(TenantId, rule.TenantId);
Assert.Equal(NotificationChannel.Slack, rule.Channel);
Assert.Equal("https://hooks.slack.com/test", rule.Endpoint);
Assert.True(rule.Enabled);
Assert.Equal(15, rule.CooldownMinutes);
Assert.Equal(10, rule.MaxPerHour);
Assert.True(rule.Aggregate);
Assert.Null(rule.LastNotifiedAt);
Assert.Equal(0, rule.NotificationsSent);
Assert.Equal("admin", rule.CreatedBy);
}
[Fact]
public void Create_WithFilters_SetsFilters()
{
var sourceId = Guid.NewGuid();
var rule = NotificationRule.Create(
TenantId,
NotificationChannel.Email,
"alerts@example.com",
"admin",
jobTypePattern: "scan\\.*",
errorCodePattern: "ORCH-TRN-.*",
category: ErrorCategory.Transient,
sourceId: sourceId);
Assert.Equal("scan\\.*", rule.JobTypePattern);
Assert.Equal("ORCH-TRN-.*", rule.ErrorCodePattern);
Assert.Equal(ErrorCategory.Transient, rule.Category);
Assert.Equal(sourceId, rule.SourceId);
}
[Fact]
public void Create_WithCustomRateLimits_SetsLimits()
{
var rule = NotificationRule.Create(
TenantId,
NotificationChannel.Webhook,
"https://webhook.example.com",
"admin",
cooldownMinutes: 30,
maxPerHour: 5,
aggregate: false);
Assert.Equal(30, rule.CooldownMinutes);
Assert.Equal(5, rule.MaxPerHour);
Assert.False(rule.Aggregate);
}
[Fact]
public void Matches_WithNoFilters_MatchesAll()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin");
var entry = CreateTestEntry();
Assert.True(rule.Matches(entry));
}
[Fact]
public void Matches_WhenDisabled_ReturnsFalse()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin")
with { Enabled = false };
var entry = CreateTestEntry();
Assert.False(rule.Matches(entry));
}
[Fact]
public void Matches_WithSourceIdFilter_MatchesOnlyMatchingSource()
{
var sourceId = Guid.NewGuid();
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
sourceId: sourceId);
var matchingEntry = CreateTestEntry() with { SourceId = sourceId };
var nonMatchingEntry = CreateTestEntry() with { SourceId = Guid.NewGuid() };
Assert.True(rule.Matches(matchingEntry));
Assert.False(rule.Matches(nonMatchingEntry));
}
[Fact]
public void Matches_WithCategoryFilter_MatchesOnlyMatchingCategory()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
category: ErrorCategory.Transient);
var matchingEntry = CreateTestEntry() with { Category = ErrorCategory.Transient };
var nonMatchingEntry = CreateTestEntry() with { Category = ErrorCategory.NotFound };
Assert.True(rule.Matches(matchingEntry));
Assert.False(rule.Matches(nonMatchingEntry));
}
[Fact]
public void Matches_WithJobTypePattern_MatchesRegex()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
jobTypePattern: @"scan\..*");
var matchingEntry1 = CreateTestEntry() with { JobType = "scan.image" };
var matchingEntry2 = CreateTestEntry() with { JobType = "scan.sbom" };
var nonMatchingEntry = CreateTestEntry() with { JobType = "export.report" };
Assert.True(rule.Matches(matchingEntry1));
Assert.True(rule.Matches(matchingEntry2));
Assert.False(rule.Matches(nonMatchingEntry));
}
[Fact]
public void Matches_WithErrorCodePattern_MatchesRegex()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
errorCodePattern: @"ORCH-TRN-\d+");
var matchingEntry = CreateTestEntry() with { ErrorCode = "ORCH-TRN-001" };
var nonMatchingEntry = CreateTestEntry() with { ErrorCode = "ORCH-NF-001" };
Assert.True(rule.Matches(matchingEntry));
Assert.False(rule.Matches(nonMatchingEntry));
}
[Fact]
public void CanNotify_WhenDisabled_ReturnsFalse()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin")
with { Enabled = false };
Assert.False(rule.CanNotify(BaseTime, 0));
}
[Fact]
public void CanNotify_WithinCooldown_ReturnsFalse()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
cooldownMinutes: 15) with { LastNotifiedAt = BaseTime };
Assert.False(rule.CanNotify(BaseTime.AddMinutes(10), 0));
}
[Fact]
public void CanNotify_AfterCooldown_ReturnsTrue()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
cooldownMinutes: 15) with { LastNotifiedAt = BaseTime };
Assert.True(rule.CanNotify(BaseTime.AddMinutes(20), 0));
}
[Fact]
public void CanNotify_AtMaxPerHour_ReturnsFalse()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
maxPerHour: 5);
Assert.False(rule.CanNotify(BaseTime, 5));
}
[Fact]
public void CanNotify_BelowMaxPerHour_ReturnsTrue()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin",
maxPerHour: 5);
Assert.True(rule.CanNotify(BaseTime, 4));
}
[Fact]
public void CanNotify_WithNoLastNotification_ReturnsTrue()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin");
Assert.True(rule.CanNotify(BaseTime, 0));
}
[Fact]
public void RecordNotification_UpdatesFields()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin");
var updated = rule.RecordNotification(BaseTime);
Assert.Equal(BaseTime, updated.LastNotifiedAt);
Assert.Equal(1, updated.NotificationsSent);
Assert.Equal(BaseTime, updated.UpdatedAt);
}
[Fact]
public void RecordNotification_IncrementsCount()
{
var rule = NotificationRule.Create(TenantId, NotificationChannel.Slack, "url", "admin")
with { NotificationsSent = 5 };
var updated = rule.RecordNotification(BaseTime);
Assert.Equal(6, updated.NotificationsSent);
}
private static DeadLetterEntry CreateTestEntry() =>
new(
EntryId: Guid.NewGuid(),
TenantId: TenantId,
OriginalJobId: Guid.NewGuid(),
RunId: null,
SourceId: null,
JobType: "scan.image",
Payload: "{}",
PayloadDigest: new string('a', 64),
IdempotencyKey: "key",
CorrelationId: null,
Status: DeadLetterStatus.Pending,
ErrorCode: "ORCH-TRN-001",
FailureReason: "Timeout",
RemediationHint: null,
Category: ErrorCategory.Transient,
IsRetryable: true,
OriginalAttempts: 3,
ReplayAttempts: 0,
MaxReplayAttempts: 3,
FailedAt: BaseTime,
CreatedAt: BaseTime,
UpdatedAt: BaseTime,
ExpiresAt: BaseTime.AddDays(30),
ResolvedAt: null,
ResolutionNotes: null,
CreatedBy: "test",
UpdatedBy: "system");
}
public class ReplayAuditRecordTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
[Fact]
public void Create_SetsInitialValues()
{
var entryId = Guid.NewGuid();
var record = ReplayAuditRecord.Create(
TenantId,
entryId,
attemptNumber: 1,
triggeredBy: "manual",
initiatedBy: "operator",
now: BaseTime);
Assert.NotEqual(Guid.Empty, record.AuditId);
Assert.Equal(TenantId, record.TenantId);
Assert.Equal(entryId, record.EntryId);
Assert.Equal(1, record.AttemptNumber);
Assert.False(record.Success);
Assert.Null(record.NewJobId);
Assert.Null(record.ErrorMessage);
Assert.Equal("manual", record.TriggeredBy);
Assert.Equal(BaseTime, record.TriggeredAt);
Assert.Null(record.CompletedAt);
Assert.Equal("operator", record.InitiatedBy);
}
[Fact]
public void Complete_SetsSuccessAndJobId()
{
var record = ReplayAuditRecord.Create(TenantId, Guid.NewGuid(), 1, "auto", "system", BaseTime);
var newJobId = Guid.NewGuid();
var completed = record.Complete(newJobId, BaseTime.AddMinutes(1));
Assert.True(completed.Success);
Assert.Equal(newJobId, completed.NewJobId);
Assert.Equal(BaseTime.AddMinutes(1), completed.CompletedAt);
Assert.Null(completed.ErrorMessage);
}
[Fact]
public void Fail_SetsErrorMessage()
{
var record = ReplayAuditRecord.Create(TenantId, Guid.NewGuid(), 1, "auto", "system", BaseTime);
var failed = record.Fail("Connection timeout", BaseTime.AddMinutes(1));
Assert.False(failed.Success);
Assert.Null(failed.NewJobId);
Assert.Equal("Connection timeout", failed.ErrorMessage);
Assert.Equal(BaseTime.AddMinutes(1), failed.CompletedAt);
}
}
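// Member sketch of the rate-limit gate the tests above pin down: disabled
// rules never fire, cooldown is measured from LastNotifiedAt, and the caller
// supplies how many notifications already went out in the current hour.
public bool CanNotify(DateTimeOffset now, int sentThisHour)
{
    if (!Enabled) return false;
    if (sentThisHour >= MaxPerHour) return false;
    if (LastNotifiedAt is { } last && now - last < TimeSpan.FromMinutes(CooldownMinutes))
        return false;
    return true;
}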

View File

@@ -0,0 +1,391 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Core.RateLimiting;
using Xunit;
namespace StellaOps.Orchestrator.Tests.RateLimiting;
public class AdaptiveRateLimiterTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private static Quota CreateDefaultQuota() => new(
QuotaId: Guid.NewGuid(),
TenantId: "tenant-1",
JobType: "scan",
MaxActive: 5,
MaxPerHour: 100,
BurstCapacity: 10,
RefillRate: 2.0,
CurrentTokens: 10,
LastRefillAt: BaseTime,
CurrentActive: 0,
CurrentHourCount: 0,
CurrentHourStart: BaseTime,
Paused: false,
PauseReason: null,
QuotaTicket: null,
CreatedAt: BaseTime,
UpdatedAt: BaseTime,
UpdatedBy: "system");
[Fact]
public void Constructor_FromQuota_InitializesCorrectly()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
Assert.Equal("tenant-1", limiter.TenantId);
Assert.Equal("scan", limiter.JobType);
Assert.Equal(100, limiter.MaxPerHour);
Assert.False(limiter.IsPaused);
}
[Fact]
public void Constructor_WithExplicitParameters_InitializesCorrectly()
{
var limiter = new AdaptiveRateLimiter(
tenantId: "tenant-2",
jobType: "analyze",
maxActive: 3,
maxPerHour: 50,
burstCapacity: 5,
refillRate: 1.0);
Assert.Equal("tenant-2", limiter.TenantId);
Assert.Equal("analyze", limiter.JobType);
Assert.Equal(50, limiter.MaxPerHour);
}
[Fact]
public void Constructor_WithNullQuota_Throws()
{
Assert.Throws<ArgumentNullException>(() =>
new AdaptiveRateLimiter(null!));
}
[Fact]
public void Constructor_WithNullTenantId_Throws()
{
Assert.Throws<ArgumentNullException>(() =>
new AdaptiveRateLimiter(
tenantId: null!,
jobType: "scan",
maxActive: 5,
maxPerHour: 100,
burstCapacity: 10,
refillRate: 2.0));
}
[Fact]
public void TryAcquire_WithCapacity_ReturnsAllowed()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.True(result.IsAllowed);
Assert.Null(result.DenialReason);
Assert.Null(result.DenialMessage);
Assert.Null(result.RetryAfter);
}
[Fact]
public void TryAcquire_WhenPaused_ReturnsDenied()
{
var quota = CreateDefaultQuota() with { Paused = true, PauseReason = "Manual pause" };
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.Paused, result.DenialReason);
Assert.Equal("Manual pause", result.DenialMessage);
}
[Fact]
public void TryAcquire_WhenConcurrencyExceeded_ReturnsDenied()
{
var quota = CreateDefaultQuota() with { MaxActive = 2, CurrentActive = 2 };
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.ConcurrencyLimitExceeded, result.DenialReason);
Assert.Contains("Concurrency limit of 2", result.DenialMessage);
}
[Fact]
public void TryAcquire_WhenTokensExhausted_ReturnsDenied()
{
var quota = CreateDefaultQuota() with { CurrentTokens = 0 };
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.TokensExhausted, result.DenialReason);
Assert.NotNull(result.RetryAfter);
}
[Fact]
public void TryAcquire_WhenHourlyLimitExceeded_ReturnsDenied()
{
var quota = CreateDefaultQuota() with { CurrentHourCount = 100 }; // MaxPerHour = 100
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.HourlyLimitExceeded, result.DenialReason);
Assert.Contains("Hourly limit of 100", result.DenialMessage);
Assert.NotNull(result.RetryAfter);
}
[Fact]
public void TryAcquire_InBackpressure_ReturnsDenied()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
// Record failure to trigger backpressure
limiter.RecordUpstreamFailure(429, TimeSpan.FromMinutes(1), BaseTime);
var result = limiter.TryAcquire(BaseTime.AddSeconds(10));
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.Backpressure, result.DenialReason);
Assert.NotNull(result.RetryAfter);
}
[Fact]
public void TryAcquire_ConsumesTokenAndConcurrency()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
limiter.TryAcquire(BaseTime);
var snapshot = limiter.GetSnapshot(BaseTime);
Assert.Equal(9, snapshot.TokenBucket.CurrentTokens);
Assert.Equal(1, snapshot.Concurrency.CurrentActive);
Assert.Equal(1, snapshot.HourlyCounter.CurrentCount);
}
[Fact]
public void Release_DecrementsConcurrency()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
limiter.TryAcquire(BaseTime);
limiter.Release();
var snapshot = limiter.GetSnapshot(BaseTime);
Assert.Equal(0, snapshot.Concurrency.CurrentActive);
}
[Fact]
public void RecordUpstreamFailure_TriggersBackpressure()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.RecordUpstreamFailure(429, TimeSpan.FromSeconds(30), BaseTime);
Assert.True(result.ShouldBackoff);
Assert.Equal(TimeSpan.FromSeconds(30), result.BackoffDuration);
Assert.Equal(429, result.StatusCode);
}
[Fact]
public void RecordUpstreamSuccess_ClearsBackpressure()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
limiter.RecordUpstreamFailure(429, TimeSpan.FromMinutes(1), BaseTime);
limiter.RecordUpstreamSuccess();
var snapshot = limiter.GetSnapshot(BaseTime.AddSeconds(10));
Assert.False(snapshot.Backpressure.IsInBackoff);
}
[Fact]
public void Pause_PausesLimiter()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
limiter.Pause("Maintenance");
Assert.True(limiter.IsPaused);
Assert.Equal("Maintenance", limiter.PauseReason);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.Paused, result.DenialReason);
}
[Fact]
public void Resume_ResumesLimiter()
{
var quota = CreateDefaultQuota() with { Paused = true, PauseReason = "Maintenance" };
var limiter = new AdaptiveRateLimiter(quota);
limiter.Resume();
Assert.False(limiter.IsPaused);
Assert.Null(limiter.PauseReason);
var result = limiter.TryAcquire(BaseTime);
Assert.True(result.IsAllowed);
}
[Fact]
public void GetSnapshot_ReturnsCompleteState()
{
var quota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(quota);
limiter.TryAcquire(BaseTime);
limiter.RecordUpstreamFailure(503, now: BaseTime);
var snapshot = limiter.GetSnapshot(BaseTime);
Assert.Equal("tenant-1", snapshot.TenantId);
Assert.Equal("scan", snapshot.JobType);
Assert.False(snapshot.IsPaused);
Assert.Equal(9, snapshot.TokenBucket.CurrentTokens);
Assert.Equal(1, snapshot.Concurrency.CurrentActive);
Assert.True(snapshot.Backpressure.IsInBackoff);
Assert.Equal(1, snapshot.HourlyCounter.CurrentCount);
}
[Fact]
public void ExportToQuota_PreservesState()
{
var originalQuota = CreateDefaultQuota();
var limiter = new AdaptiveRateLimiter(originalQuota);
limiter.TryAcquire(BaseTime);
limiter.TryAcquire(BaseTime);
limiter.Release();
limiter.Pause("Testing");
var exportedQuota = limiter.ExportToQuota(originalQuota.QuotaId, BaseTime.AddSeconds(10), "test-user");
Assert.Equal(originalQuota.QuotaId, exportedQuota.QuotaId);
Assert.Equal("tenant-1", exportedQuota.TenantId);
Assert.Equal("scan", exportedQuota.JobType);
Assert.Equal(1, exportedQuota.CurrentActive); // 2 acquired, 1 released
Assert.Equal(2, exportedQuota.CurrentHourCount);
Assert.True(exportedQuota.Paused);
Assert.Equal("Testing", exportedQuota.PauseReason);
Assert.Equal("test-user", exportedQuota.UpdatedBy);
}
[Fact]
public void MultipleAcquires_TrackCorrectly()
{
var quota = CreateDefaultQuota() with { MaxActive = 3, BurstCapacity = 5 };
var limiter = new AdaptiveRateLimiter(quota);
var result1 = limiter.TryAcquire(BaseTime);
var result2 = limiter.TryAcquire(BaseTime);
var result3 = limiter.TryAcquire(BaseTime);
var result4 = limiter.TryAcquire(BaseTime);
Assert.True(result1.IsAllowed);
Assert.True(result2.IsAllowed);
Assert.True(result3.IsAllowed);
Assert.False(result4.IsAllowed);
Assert.Equal(RateLimitDenialReason.ConcurrencyLimitExceeded, result4.DenialReason);
}
[Fact]
public void RollbackOnConcurrencyFailure_DoesNotAffectHourlyCounter()
{
var quota = CreateDefaultQuota() with { MaxActive = 1, CurrentActive = 1 };
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
var snapshot = limiter.GetSnapshot(BaseTime);
Assert.Equal(0, snapshot.HourlyCounter.CurrentCount); // Should be rolled back
}
[Fact]
public void RollbackOnTokenBucketFailure_DoesNotAffectOtherCounters()
{
var quota = CreateDefaultQuota() with { CurrentTokens = 0 };
var limiter = new AdaptiveRateLimiter(quota);
var result = limiter.TryAcquire(BaseTime);
Assert.False(result.IsAllowed);
var snapshot = limiter.GetSnapshot(BaseTime);
Assert.Equal(0, snapshot.Concurrency.CurrentActive); // Should be rolled back
Assert.Equal(0, snapshot.HourlyCounter.CurrentCount); // Should be rolled back
}
[Fact]
public void HourlyCounter_ResetsAfterHour()
{
var quota = CreateDefaultQuota() with { CurrentHourCount = 50 };
var limiter = new AdaptiveRateLimiter(quota);
// Try acquire after an hour has passed
var result = limiter.TryAcquire(BaseTime.AddHours(1).AddMinutes(1));
Assert.True(result.IsAllowed);
var snapshot = limiter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(1));
Assert.Equal(1, snapshot.HourlyCounter.CurrentCount); // Reset and then 1 new
}
[Fact]
public void ConcurrentAccess_IsThreadSafe()
{
var quota = CreateDefaultQuota() with { MaxActive = 50, MaxPerHour = 1000, BurstCapacity = 100 };
var limiter = new AdaptiveRateLimiter(quota);
var successes = 0;
Parallel.For(0, 100, _ =>
{
var result = limiter.TryAcquire(DateTimeOffset.UtcNow);
if (result.IsAllowed)
{
Interlocked.Increment(ref successes);
}
});
Assert.Equal(50, successes); // Limited by MaxActive
}
[Fact]
public void RateLimitResult_AllowedFactory_CreatesCorrectResult()
{
var result = RateLimitResult.Allowed();
Assert.True(result.IsAllowed);
Assert.Null(result.DenialReason);
Assert.Null(result.DenialMessage);
Assert.Null(result.RetryAfter);
}
[Fact]
public void RateLimitResult_DeniedFactory_CreatesCorrectResult()
{
var result = RateLimitResult.Denied(
RateLimitDenialReason.TokensExhausted,
"No tokens available",
TimeSpan.FromSeconds(5));
Assert.False(result.IsAllowed);
Assert.Equal(RateLimitDenialReason.TokensExhausted, result.DenialReason);
Assert.Equal("No tokens available", result.DenialMessage);
Assert.Equal(TimeSpan.FromSeconds(5), result.RetryAfter);
}
}
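
The suite above pins down the AdaptiveRateLimiter contract: TryAcquire consumes a token, a concurrency slot, and an hourly count atomically; Release returns the slot; RecordUpstreamFailure/RecordUpstreamSuccess drive backpressure. A minimal caller-side sketch using only those tested members — DispatchAsync and the sendAsync delegate are hypothetical illustration, not part of the orchestrator API:

// Hedged sketch: driving the limiter contract verified above from a dispatch loop.
// DispatchAsync/sendAsync are hypothetical; only the limiter calls come from the tests.
public static async Task<bool> DispatchAsync(AdaptiveRateLimiter limiter, Func<Task<int>> sendAsync)
{
    var result = limiter.TryAcquire(DateTimeOffset.UtcNow);
    if (!result.IsAllowed)
    {
        // Denied (tokens, concurrency, hourly limit, backpressure, or paused); honor RetryAfter when set.
        if (result.RetryAfter is { } wait)
        {
            await Task.Delay(wait);
        }
        return false;
    }
    try
    {
        var status = await sendAsync(); // returns an upstream HTTP status code
        if (status == 429 || status >= 500)
        {
            limiter.RecordUpstreamFailure(status, now: DateTimeOffset.UtcNow); // enters backpressure
            return false;
        }
        limiter.RecordUpstreamSuccess(); // clears backpressure
        return true;
    }
    finally
    {
        limiter.Release(); // always return the concurrency slot
    }
}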

View File

@@ -0,0 +1,313 @@
using StellaOps.Orchestrator.Core.RateLimiting;
namespace StellaOps.Orchestrator.Tests.RateLimiting;
public class BackpressureHandlerTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
[Fact]
public void Constructor_WithDefaults_SetsCorrectValues()
{
var handler = new BackpressureHandler();
Assert.Equal(TimeSpan.FromSeconds(1), handler.BaseDelay);
Assert.Equal(TimeSpan.FromMinutes(5), handler.MaxDelay);
Assert.Equal(1, handler.FailureThreshold);
Assert.Equal(0.2, handler.JitterFactor);
}
[Fact]
public void Constructor_WithCustomValues_SetsCorrectly()
{
var handler = new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(2),
maxDelay: TimeSpan.FromMinutes(10),
failureThreshold: 3,
jitterFactor: 0.5);
Assert.Equal(TimeSpan.FromSeconds(2), handler.BaseDelay);
Assert.Equal(TimeSpan.FromMinutes(10), handler.MaxDelay);
Assert.Equal(3, handler.FailureThreshold);
Assert.Equal(0.5, handler.JitterFactor);
}
[Fact]
public void Constructor_WithInvalidBaseDelay_Throws()
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new BackpressureHandler(baseDelay: TimeSpan.Zero));
}
[Fact]
public void Constructor_WithMaxDelayLessThanBase_Throws()
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(10),
maxDelay: TimeSpan.FromSeconds(5)));
}
[Fact]
public void Constructor_WithJitterOutOfRange_Clamps()
{
var handler1 = new BackpressureHandler(jitterFactor: -0.5);
var handler2 = new BackpressureHandler(jitterFactor: 1.5);
Assert.Equal(0.0, handler1.JitterFactor);
Assert.Equal(1.0, handler2.JitterFactor);
}
[Fact]
public void ShouldAllow_Initially_ReturnsTrue()
{
var handler = new BackpressureHandler();
Assert.True(handler.ShouldAllow(BaseTime));
Assert.False(handler.IsInBackoff);
}
[Fact]
public void RecordFailure_Returns429Reason()
{
var handler = new BackpressureHandler(jitterFactor: 0);
var result = handler.RecordFailure(429, now: BaseTime);
Assert.True(result.ShouldBackoff);
Assert.Equal("upstream_rate_limited", result.Reason);
Assert.Equal(429, result.StatusCode);
Assert.Equal(1, result.ConsecutiveFailures);
}
[Fact]
public void RecordFailure_Returns503Reason()
{
var handler = new BackpressureHandler(jitterFactor: 0);
var result = handler.RecordFailure(503, now: BaseTime);
Assert.Equal("upstream_unavailable", result.Reason);
}
[Theory]
[InlineData(502, "upstream_bad_gateway")]
[InlineData(504, "upstream_timeout")]
[InlineData(500, "upstream_server_error")]
[InlineData(501, "upstream_server_error")]
[InlineData(400, "upstream_client_error")]
[InlineData(404, "upstream_client_error")]
[InlineData(200, "upstream_error")]
public void RecordFailure_MapsStatusCodeToReason(int statusCode, string expectedReason)
{
var handler = new BackpressureHandler();
var result = handler.RecordFailure(statusCode, now: BaseTime);
Assert.Equal(expectedReason, result.Reason);
}
[Fact]
public void RecordFailure_WithRetryAfter_UsesProvidedDelay()
{
var handler = new BackpressureHandler(jitterFactor: 0);
var retryAfter = TimeSpan.FromSeconds(30);
var result = handler.RecordFailure(429, retryAfter: retryAfter, now: BaseTime);
Assert.Equal(retryAfter, result.BackoffDuration);
Assert.Equal(BaseTime.AddSeconds(30), result.BackoffUntil);
}
[Fact]
public void RecordFailure_WithRetryAfterExceedingMax_UsesCalculatedDelay()
{
var handler = new BackpressureHandler(
maxDelay: TimeSpan.FromMinutes(5),
jitterFactor: 0);
var retryAfter = TimeSpan.FromMinutes(10); // Exceeds max
var result = handler.RecordFailure(429, retryAfter: retryAfter, now: BaseTime);
Assert.True(result.BackoffDuration <= TimeSpan.FromMinutes(5));
}
[Fact]
public void RecordFailure_ExponentialBackoff_IncreasesDelay()
{
var handler = new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(1),
maxDelay: TimeSpan.FromMinutes(5),
jitterFactor: 0);
var result1 = handler.RecordFailure(429, now: BaseTime);
var result2 = handler.RecordFailure(429, now: BaseTime.AddSeconds(10));
var result3 = handler.RecordFailure(429, now: BaseTime.AddSeconds(20));
// base * 2^0 = 1s, base * 2^1 = 2s, base * 2^2 = 4s
Assert.Equal(TimeSpan.FromSeconds(1), result1.BackoffDuration);
Assert.Equal(TimeSpan.FromSeconds(2), result2.BackoffDuration);
Assert.Equal(TimeSpan.FromSeconds(4), result3.BackoffDuration);
}
[Fact]
public void RecordFailure_CapsAtMaxDelay()
{
var handler = new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(1),
maxDelay: TimeSpan.FromSeconds(10),
jitterFactor: 0);
// Record many failures to exceed max
for (var i = 0; i < 10; i++)
{
handler.RecordFailure(429, now: BaseTime.AddSeconds(i * 20));
}
var result = handler.RecordFailure(429, now: BaseTime.AddSeconds(200));
Assert.Equal(TimeSpan.FromSeconds(10), result.BackoffDuration);
}
[Fact]
public void ShouldAllow_InBackoff_ReturnsFalse()
{
var handler = new BackpressureHandler(jitterFactor: 0);
handler.RecordFailure(429, now: BaseTime);
Assert.False(handler.ShouldAllow(BaseTime.AddMilliseconds(500)));
}
[Fact]
public void ShouldAllow_AfterBackoffExpires_ReturnsTrue()
{
var handler = new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(1),
jitterFactor: 0);
handler.RecordFailure(429, now: BaseTime);
Assert.True(handler.ShouldAllow(BaseTime.AddSeconds(2)));
}
[Fact]
public void RecordSuccess_ResetsFailureCount()
{
var handler = new BackpressureHandler();
handler.RecordFailure(429, now: BaseTime);
handler.RecordFailure(429, now: BaseTime.AddSeconds(5));
Assert.Equal(2, handler.ConsecutiveFailures);
handler.RecordSuccess();
Assert.Equal(0, handler.ConsecutiveFailures);
Assert.True(handler.ShouldAllow(BaseTime.AddSeconds(10)));
}
[Fact]
public void Reset_ClearsAllState()
{
var handler = new BackpressureHandler();
handler.RecordFailure(429, now: BaseTime);
handler.RecordFailure(429, now: BaseTime.AddSeconds(5));
handler.Reset();
Assert.Equal(0, handler.ConsecutiveFailures);
Assert.False(handler.IsInBackoff);
Assert.Equal(TimeSpan.Zero, handler.TimeUntilReady);
}
[Fact]
public void TimeUntilReady_ReturnsCorrectValue()
{
var handler = new BackpressureHandler(
baseDelay: TimeSpan.FromSeconds(10),
jitterFactor: 0);
// Use current time so TimeUntilReady (which uses UtcNow internally) works correctly
var now = DateTimeOffset.UtcNow;
handler.RecordFailure(429, now: now);
var remaining = handler.TimeUntilReady;
// Should be positive and up to 10 seconds
Assert.True(remaining > TimeSpan.Zero, $"Expected > 0, got {remaining}");
Assert.True(remaining <= TimeSpan.FromSeconds(10), $"Expected <= 10s, got {remaining}");
}
[Fact]
public void GetSnapshot_ReturnsCorrectState()
{
var handler = new BackpressureHandler(jitterFactor: 0);
handler.RecordFailure(429, now: BaseTime);
handler.RecordFailure(503, now: BaseTime.AddSeconds(5));
var snapshot = handler.GetSnapshot(BaseTime.AddSeconds(5));
Assert.True(snapshot.IsInBackoff);
Assert.Equal(2, snapshot.ConsecutiveFailures);
Assert.NotNull(snapshot.BackoffUntil);
Assert.Equal("upstream_unavailable", snapshot.LastFailureReason);
Assert.True(snapshot.TimeRemaining > TimeSpan.Zero);
}
[Fact]
public void GetSnapshot_WhenNotInBackoff_ShowsNotInBackoff()
{
var handler = new BackpressureHandler();
var snapshot = handler.GetSnapshot(BaseTime);
Assert.False(snapshot.IsInBackoff);
Assert.Null(snapshot.BackoffUntil);
Assert.Equal(TimeSpan.Zero, snapshot.TimeRemaining);
}
[Fact]
public void FailureThreshold_DelaysBackoffUntilThreshold()
{
var handler = new BackpressureHandler(
failureThreshold: 3,
jitterFactor: 0);
var result1 = handler.RecordFailure(429, now: BaseTime);
var result2 = handler.RecordFailure(429, now: BaseTime.AddSeconds(1));
var result3 = handler.RecordFailure(429, now: BaseTime.AddSeconds(2));
Assert.False(result1.ShouldBackoff);
Assert.False(result2.ShouldBackoff);
Assert.True(result3.ShouldBackoff);
}
[Fact]
public void ConcurrentAccess_IsThreadSafe()
{
var handler = new BackpressureHandler(failureThreshold: 5);
var now = DateTimeOffset.UtcNow;
Parallel.For(0, 100, i =>
{
if (i % 3 == 0)
{
handler.RecordFailure(429, now: now.AddMilliseconds(i));
}
else if (i % 3 == 1)
{
handler.RecordSuccess();
}
else
{
handler.ShouldAllow(now.AddMilliseconds(i));
}
});
// Should complete without exceptions
var snapshot = handler.GetSnapshot(now.AddSeconds(100));
Assert.True(snapshot.ConsecutiveFailures >= 0);
}
}
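
The backoff shape these tests encode is the standard exponential curve with jitter: delay = min(maxDelay, baseDelay x 2^(failures - 1)), a Retry-After hint honored only when it does not exceed maxDelay, and a jitter factor clamped to [0, 1]. A self-contained sketch of that arithmetic, under those assumptions (this is the math the tests assert, not the handler's actual source):

// Hedged sketch of the delay math implied by the tests above; standalone, not BackpressureHandler itself.
public static TimeSpan ComputeBackoff(
    int consecutiveFailures,   // 1-based, as in result1/result2/result3 above
    TimeSpan baseDelay,
    TimeSpan maxDelay,
    double jitterFactor,       // clamped to [0, 1], per Constructor_WithJitterOutOfRange_Clamps
    TimeSpan? retryAfter = null,
    Random? rng = null)
{
    if (retryAfter is { } hint && hint <= maxDelay)
    {
        return hint; // per RecordFailure_WithRetryAfter_UsesProvidedDelay
    }
    // base * 2^0, base * 2^1, base * 2^2, ... capped at maxDelay.
    var seconds = baseDelay.TotalSeconds * Math.Pow(2, Math.Max(0, consecutiveFailures - 1));
    seconds = Math.Min(seconds, maxDelay.TotalSeconds);
    jitterFactor = Math.Clamp(jitterFactor, 0.0, 1.0);
    if (jitterFactor > 0)
    {
        rng ??= Random.Shared;
        // Spread across [1 - jitter, 1 + jitter] so synchronized clients do not retry in lockstep.
        seconds = Math.Min(seconds * (1.0 + jitterFactor * (rng.NextDouble() * 2.0 - 1.0)), maxDelay.TotalSeconds);
    }
    return TimeSpan.FromSeconds(seconds);
}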

View File

@@ -0,0 +1,279 @@
using StellaOps.Orchestrator.Core.RateLimiting;
namespace StellaOps.Orchestrator.Tests.RateLimiting;
public class ConcurrencyLimiterTests
{
[Fact]
public void Constructor_WithValidMaxActive_CreatesLimiter()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
Assert.Equal(10, limiter.MaxActive);
Assert.Equal(0, limiter.CurrentActive);
Assert.Equal(10, limiter.AvailableSlots);
}
[Fact]
public void Constructor_WithInitialActive_SetsCorrectly()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 3);
Assert.Equal(3, limiter.CurrentActive);
Assert.Equal(7, limiter.AvailableSlots);
}
[Theory]
[InlineData(0)]
[InlineData(-1)]
public void Constructor_WithInvalidMaxActive_Throws(int maxActive)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new ConcurrencyLimiter(maxActive: maxActive));
}
[Fact]
public void Constructor_WithNegativeCurrentActive_Throws()
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new ConcurrencyLimiter(maxActive: 10, currentActive: -1));
}
[Fact]
public void TryAcquire_WithCapacity_ReturnsTrue()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
var result = limiter.TryAcquire();
Assert.True(result);
Assert.Equal(1, limiter.CurrentActive);
}
[Fact]
public void TryAcquire_AtCapacity_ReturnsFalse()
{
var limiter = new ConcurrencyLimiter(maxActive: 2, currentActive: 2);
var result = limiter.TryAcquire();
Assert.False(result);
Assert.Equal(2, limiter.CurrentActive);
}
[Fact]
public void TryAcquire_MultipleSlots_WithCapacity_ReturnsTrue()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
var result = limiter.TryAcquire(count: 5);
Assert.True(result);
Assert.Equal(5, limiter.CurrentActive);
}
[Fact]
public void TryAcquire_MultipleSlots_WithoutCapacity_ReturnsFalse()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 8);
var result = limiter.TryAcquire(count: 5);
Assert.False(result);
Assert.Equal(8, limiter.CurrentActive); // Unchanged (no partial acquisition)
}
[Fact]
public void TryAcquire_ZeroSlots_Throws()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
Assert.Throws<ArgumentOutOfRangeException>(() =>
limiter.TryAcquire(count: 0));
}
[Fact]
public void Release_WithActiveSlots_ReturnsTrue()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);
var result = limiter.Release();
Assert.True(result);
Assert.Equal(4, limiter.CurrentActive);
}
[Fact]
public void Release_WithNoActiveSlots_ReturnsFalse()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 0);
var result = limiter.Release();
Assert.False(result);
Assert.Equal(0, limiter.CurrentActive);
}
[Fact]
public void Release_MultipleSlots_ReleasesCorrectAmount()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);
var released = limiter.Release(count: 3);
Assert.Equal(3, released);
Assert.Equal(2, limiter.CurrentActive);
}
[Fact]
public void Release_MultipleSlots_CapsAtCurrentActive()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 2);
var released = limiter.Release(count: 5);
Assert.Equal(2, released); // Only 2 were available to release
Assert.Equal(0, limiter.CurrentActive);
}
[Fact]
public void Release_ZeroSlots_Throws()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);
Assert.Throws<ArgumentOutOfRangeException>(() =>
limiter.Release(count: 0));
}
[Fact]
public void HasCapacity_WithAvailableSlots_ReturnsTrue()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);
Assert.True(limiter.HasCapacity());
Assert.True(limiter.HasCapacity(count: 5));
}
[Fact]
public void HasCapacity_WithoutAvailableSlots_ReturnsFalse()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 10);
Assert.False(limiter.HasCapacity());
}
[Fact]
public void HasCapacity_ForMultipleSlots_ChecksCorrectly()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 8);
Assert.True(limiter.HasCapacity(count: 2));
Assert.False(limiter.HasCapacity(count: 3));
}
[Fact]
public void Reset_SetsToZero()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 5);
var released = limiter.Reset();
Assert.Equal(5, released);
Assert.Equal(0, limiter.CurrentActive);
}
[Fact]
public void SetActive_SetsCorrectCount()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
limiter.SetActive(7);
Assert.Equal(7, limiter.CurrentActive);
}
[Fact]
public void SetActive_NegativeCount_Throws()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
Assert.Throws<ArgumentOutOfRangeException>(() =>
limiter.SetActive(-1));
}
[Fact]
public void GetSnapshot_ReturnsCorrectState()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 4);
var snapshot = limiter.GetSnapshot();
Assert.Equal(10, snapshot.MaxActive);
Assert.Equal(4, snapshot.CurrentActive);
Assert.Equal(6, snapshot.AvailableSlots);
Assert.Equal(0.4, snapshot.Utilization);
Assert.False(snapshot.IsAtCapacity);
Assert.False(snapshot.IsIdle);
}
[Fact]
public void GetSnapshot_AtCapacity_ShowsAtCapacity()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 10);
var snapshot = limiter.GetSnapshot();
Assert.True(snapshot.IsAtCapacity);
Assert.Equal(1.0, snapshot.Utilization);
}
[Fact]
public void GetSnapshot_WhenIdle_ShowsIdle()
{
var limiter = new ConcurrencyLimiter(maxActive: 10, currentActive: 0);
var snapshot = limiter.GetSnapshot();
Assert.True(snapshot.IsIdle);
Assert.Equal(0.0, snapshot.Utilization);
}
[Fact]
public void ConcurrentAccess_IsThreadSafe()
{
var limiter = new ConcurrencyLimiter(maxActive: 50);
var acquired = 0;
Parallel.For(0, 100, _ =>
{
if (limiter.TryAcquire())
{
Interlocked.Increment(ref acquired);
}
});
Assert.Equal(50, acquired);
Assert.Equal(50, limiter.CurrentActive);
}
[Fact]
public void ConcurrentAcquireAndRelease_MaintainsInvariants()
{
var limiter = new ConcurrencyLimiter(maxActive: 10);
var completed = 0;
Parallel.For(0, 100, _ =>
{
if (limiter.TryAcquire())
{
Interlocked.Increment(ref completed);
limiter.Release();
}
});
// All operations should complete without deadlock
Assert.True(completed > 0);
// After all parallel operations complete, should be back to 0
Assert.Equal(0, limiter.CurrentActive);
}
}
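
The all-or-nothing semantics tested above (no partial multi-slot acquisition, thread-safe under Parallel.For) are most naturally met with a compare-and-swap loop. A sketch of one such encoding, assuming nothing about the shipped ConcurrencyLimiter beyond the behavior the tests assert:

// Hedged sketch: a lock-free shape that satisfies the tested semantics; not necessarily the real implementation.
public sealed class CasConcurrencySlots
{
    private readonly int _maxActive;
    private int _currentActive;

    public CasConcurrencySlots(int maxActive)
    {
        if (maxActive <= 0) throw new ArgumentOutOfRangeException(nameof(maxActive));
        _maxActive = maxActive;
    }

    public bool TryAcquire(int count = 1)
    {
        if (count <= 0) throw new ArgumentOutOfRangeException(nameof(count));
        while (true)
        {
            var current = Volatile.Read(ref _currentActive);
            if (current + count > _maxActive)
            {
                return false; // no capacity: nothing is taken (no partial acquisition)
            }
            // Commit only if no other thread moved the count in the meantime.
            if (Interlocked.CompareExchange(ref _currentActive, current + count, current) == current)
            {
                return true;
            }
        }
    }

    public int Release(int count = 1)
    {
        if (count <= 0) throw new ArgumentOutOfRangeException(nameof(count));
        while (true)
        {
            var current = Volatile.Read(ref _currentActive);
            var toRelease = Math.Min(count, current); // cap at current, per Release_MultipleSlots_CapsAtCurrentActive
            if (Interlocked.CompareExchange(ref _currentActive, current - toRelease, current) == current)
            {
                return toRelease;
            }
        }
    }
}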

View File

@@ -0,0 +1,196 @@
using StellaOps.Orchestrator.Core.RateLimiting;
namespace StellaOps.Orchestrator.Tests.RateLimiting;
public class HourlyCounterTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
[Fact]
public void Constructor_WithValidMaxPerHour_CreatesCounter()
{
var counter = new HourlyCounter(maxPerHour: 100);
Assert.Equal(100, counter.MaxPerHour);
}
[Fact]
public void Constructor_WithInitialCount_SetsCorrectly()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(50, snapshot.CurrentCount);
Assert.Equal(50, snapshot.Remaining);
}
[Theory]
[InlineData(0)]
[InlineData(-1)]
public void Constructor_WithInvalidMaxPerHour_Throws(int maxPerHour)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new HourlyCounter(maxPerHour: maxPerHour));
}
[Fact]
public void TryIncrement_WithinLimit_ReturnsTrue()
{
var counter = new HourlyCounter(maxPerHour: 100);
var result = counter.TryIncrement(BaseTime);
Assert.True(result);
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(1, snapshot.CurrentCount);
}
[Fact]
public void TryIncrement_AtLimit_ReturnsFalse()
{
var counter = new HourlyCounter(maxPerHour: 2, currentCount: 2, hourStart: BaseTime);
var result = counter.TryIncrement(BaseTime);
Assert.False(result);
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(2, snapshot.CurrentCount); // Unchanged
}
[Fact]
public void TryIncrement_AfterHourReset_IncrementsFromZero()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);
var result = counter.TryIncrement(BaseTime.AddHours(1).AddMinutes(1));
Assert.True(result);
var snapshot = counter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(1));
Assert.Equal(1, snapshot.CurrentCount);
}
[Fact]
public void TryIncrement_AtLimitAfterHourReset_Succeeds()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 100, hourStart: BaseTime);
var result = counter.TryIncrement(BaseTime.AddHours(1).AddMinutes(1));
Assert.True(result);
}
[Fact]
public void Decrement_DecreasesCount()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 5, hourStart: BaseTime);
counter.Decrement();
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(4, snapshot.CurrentCount);
}
[Fact]
public void Decrement_AtZero_StaysAtZero()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 0, hourStart: BaseTime);
counter.Decrement();
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(0, snapshot.CurrentCount);
}
[Fact]
public void GetSnapshot_CalculatesRemainingCorrectly()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 30, hourStart: BaseTime);
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(70, snapshot.Remaining);
Assert.False(snapshot.IsExhausted);
}
[Fact]
public void GetSnapshot_AtLimit_ShowsExhausted()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 100, hourStart: BaseTime);
var snapshot = counter.GetSnapshot(BaseTime);
Assert.Equal(0, snapshot.Remaining);
Assert.True(snapshot.IsExhausted);
}
[Fact]
public void GetSnapshot_CalculatesTimeUntilReset()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 0, hourStart: BaseTime);
var snapshot = counter.GetSnapshot(BaseTime.AddMinutes(15));
Assert.Equal(TimeSpan.FromMinutes(45), snapshot.TimeUntilReset);
}
[Fact]
public void GetSnapshot_AfterHourBoundary_ResetsAndReturnsNewHour()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);
var snapshot = counter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(30));
Assert.Equal(0, snapshot.CurrentCount);
Assert.Equal(BaseTime.AddHours(1), snapshot.HourStart);
}
[Fact]
public void GetSnapshot_ResetsHourCorrectly()
{
var counter = new HourlyCounter(maxPerHour: 100, currentCount: 50, hourStart: BaseTime);
// Check at 12:30 - same hour
var snapshot1 = counter.GetSnapshot(BaseTime.AddMinutes(30));
Assert.Equal(50, snapshot1.CurrentCount);
Assert.Equal(BaseTime, snapshot1.HourStart);
// Check at 13:15 - new hour
var snapshot2 = counter.GetSnapshot(BaseTime.AddHours(1).AddMinutes(15));
Assert.Equal(0, snapshot2.CurrentCount);
Assert.Equal(BaseTime.AddHours(1), snapshot2.HourStart);
}
[Fact]
public void ConcurrentAccess_IsThreadSafe()
{
var counter = new HourlyCounter(maxPerHour: 50);
var successes = 0;
var now = DateTimeOffset.UtcNow;
Parallel.For(0, 100, _ =>
{
if (counter.TryIncrement(now))
{
Interlocked.Increment(ref successes);
}
});
Assert.Equal(50, successes);
var snapshot = counter.GetSnapshot(now);
Assert.Equal(50, snapshot.CurrentCount);
}
[Fact]
public void HourlyCounterSnapshot_Remaining_NeverNegative()
{
// Edge case: if CurrentCount somehow exceeds MaxPerHour
var snapshot = new HourlyCounterSnapshot(
MaxPerHour: 100,
CurrentCount: 150,
HourStart: BaseTime,
TimeUntilReset: TimeSpan.FromMinutes(30));
Assert.Equal(0, snapshot.Remaining);
Assert.True(snapshot.IsExhausted);
}
}
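
The reset assertions above imply the counter keeps an hourStart and, on any read or increment past the boundary, advances it by whole elapsed hours (so HourStart stays aligned with the original boundary) and zeroes the count. A sketch of just that rollover math, inferred from the snapshots rather than taken from the class:

// Hedged sketch of the window-rollover math the snapshots above imply.
public static (DateTimeOffset HourStart, int Count) RollWindow(
    DateTimeOffset hourStart, int count, DateTimeOffset now)
{
    var elapsed = now - hourStart;
    if (elapsed < TimeSpan.FromHours(1))
    {
        return (hourStart, count); // still inside the current window
    }
    // Advance by whole hours: a snapshot at 13:30 reports HourStart 13:00 with count 0.
    var wholeHours = (long)Math.Floor(elapsed.TotalHours);
    return (hourStart.AddHours(wholeHours), 0);
}

TimeUntilReset then falls out as hourStart.AddHours(1) - now, which is the 45 minutes asserted in GetSnapshot_CalculatesTimeUntilReset.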

View File

@@ -0,0 +1,258 @@
using StellaOps.Orchestrator.Core.RateLimiting;
namespace StellaOps.Orchestrator.Tests.RateLimiting;
public class TokenBucketTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
[Fact]
public void Constructor_WithValidParameters_CreatesBucket()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);
Assert.Equal(10, bucket.BurstCapacity);
Assert.Equal(2.0, bucket.RefillRate);
Assert.Equal(10, bucket.CurrentTokens);
}
[Fact]
public void Constructor_WithInitialTokens_SetsCorrectly()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);
Assert.Equal(5, bucket.CurrentTokens);
}
[Fact]
public void Constructor_WithInitialTokensExceedingCapacity_CapsAtCapacity()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 15);
Assert.Equal(10, bucket.CurrentTokens);
}
[Theory]
[InlineData(0)]
[InlineData(-1)]
public void Constructor_WithInvalidBurstCapacity_Throws(int burstCapacity)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new TokenBucket(burstCapacity: burstCapacity, refillRate: 2.0));
}
[Theory]
[InlineData(0)]
[InlineData(-1)]
public void Constructor_WithInvalidRefillRate_Throws(double refillRate)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
new TokenBucket(burstCapacity: 10, refillRate: refillRate));
}
[Fact]
public void TryConsume_WithAvailableTokens_ReturnsTrue()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);
var result = bucket.TryConsume(BaseTime);
Assert.True(result);
Assert.Equal(9, bucket.CurrentTokens);
}
[Fact]
public void TryConsume_WithMultipleTokens_ConsumesCorrectAmount()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);
var result = bucket.TryConsume(BaseTime, tokensRequired: 5);
Assert.True(result);
Assert.Equal(5, bucket.CurrentTokens);
}
[Fact]
public void TryConsume_WithInsufficientTokens_ReturnsFalse()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 2);
var result = bucket.TryConsume(BaseTime, tokensRequired: 5);
Assert.False(result);
Assert.Equal(2, bucket.CurrentTokens); // Unchanged
}
[Fact]
public void TryConsume_WithExactTokens_ConsumesAll()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);
var result = bucket.TryConsume(BaseTime, tokensRequired: 5);
Assert.True(result);
Assert.Equal(0, bucket.CurrentTokens);
}
[Fact]
public void TryConsume_WithZeroTokensRequired_Throws()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0);
Assert.Throws<ArgumentOutOfRangeException>(() =>
bucket.TryConsume(BaseTime, tokensRequired: 0));
}
[Fact]
public void Refill_AfterTimeElapsed_AddsTokens()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5, lastRefillAt: BaseTime);
bucket.Refill(BaseTime.AddSeconds(2));
Assert.Equal(9, bucket.CurrentTokens); // 5 + (2 * 2.0)
}
[Fact]
public void Refill_CapsAtBurstCapacity()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 8, lastRefillAt: BaseTime);
bucket.Refill(BaseTime.AddSeconds(10));
Assert.Equal(10, bucket.CurrentTokens); // Capped at burst capacity
}
[Fact]
public void Refill_WithPastTime_DoesNothing()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5, lastRefillAt: BaseTime);
bucket.Refill(BaseTime.AddSeconds(-1));
Assert.Equal(5, bucket.CurrentTokens);
}
[Fact]
public void TryConsume_RefillsBeforeConsuming()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 0, lastRefillAt: BaseTime);
// After 3 seconds, should have 6 tokens (3 * 2.0)
var result = bucket.TryConsume(BaseTime.AddSeconds(3), tokensRequired: 5);
Assert.True(result);
Assert.Equal(1, bucket.CurrentTokens); // 6 - 5
}
[Fact]
public void HasTokens_WithSufficientTokens_ReturnsTrue()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);
var result = bucket.HasTokens(BaseTime, tokensRequired: 3);
Assert.True(result);
Assert.Equal(5, bucket.CurrentTokens); // Unchanged
}
[Fact]
public void HasTokens_WithInsufficientTokens_ReturnsFalse()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 2);
var result = bucket.HasTokens(BaseTime, tokensRequired: 5);
Assert.False(result);
}
[Fact]
public void EstimatedWaitTime_WithAvailableTokens_ReturnsZero()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5);
var wait = bucket.EstimatedWaitTime(BaseTime, tokensRequired: 3);
Assert.Equal(TimeSpan.Zero, wait);
}
[Fact]
public void EstimatedWaitTime_WithInsufficientTokens_ReturnsCorrectTime()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 2, lastRefillAt: BaseTime);
// Need 5 tokens, have 2, need 3 more at rate 2.0 = 1.5 seconds
var wait = bucket.EstimatedWaitTime(BaseTime, tokensRequired: 5);
Assert.Equal(TimeSpan.FromSeconds(1.5), wait);
}
[Fact]
public void Reset_SetsToFullCapacity()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 3);
bucket.Reset(BaseTime);
Assert.Equal(10, bucket.CurrentTokens);
}
[Fact]
public void GetSnapshot_ReturnsCorrectState()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 5, lastRefillAt: BaseTime);
var snapshot = bucket.GetSnapshot(BaseTime);
Assert.Equal(10, snapshot.BurstCapacity);
Assert.Equal(2.0, snapshot.RefillRate);
Assert.Equal(5, snapshot.CurrentTokens);
Assert.Equal(BaseTime, snapshot.LastRefillAt);
Assert.Equal(0.5, snapshot.FillPercent);
Assert.False(snapshot.IsEmpty);
Assert.False(snapshot.IsFull);
}
[Fact]
public void GetSnapshot_WithEmptyBucket_ShowsEmpty()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 0, lastRefillAt: BaseTime);
var snapshot = bucket.GetSnapshot(BaseTime);
Assert.True(snapshot.IsEmpty);
Assert.False(snapshot.IsFull);
}
[Fact]
public void GetSnapshot_WithFullBucket_ShowsFull()
{
var bucket = new TokenBucket(burstCapacity: 10, refillRate: 2.0, initialTokens: 10);
var snapshot = bucket.GetSnapshot(BaseTime);
Assert.False(snapshot.IsEmpty);
Assert.True(snapshot.IsFull);
}
[Fact]
public void ConcurrentAccess_IsThreadSafe()
{
// Use a fixed time and a near-zero refill rate so no meaningful refill happens mid-test
var fixedTime = DateTimeOffset.UtcNow;
var bucket = new TokenBucket(burstCapacity: 100, refillRate: 0.001, initialTokens: 100, lastRefillAt: fixedTime);
var successes = 0;
Parallel.For(0, 100, _ =>
{
if (bucket.TryConsume(fixedTime))
{
Interlocked.Increment(ref successes);
}
});
Assert.Equal(100, successes);
// Thread timing can leave the token count fractionally off; just check that it is close to 0
Assert.True(bucket.CurrentTokens < 1, $"Expected < 1 tokens remaining, got {bucket.CurrentTokens}");
}
}
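
The refill and wait-time numbers asserted above reduce to two formulas: tokens' = min(capacity, tokens + elapsedSeconds x refillRate), and wait = max(0, (required - tokens) / refillRate). A standalone sketch of that arithmetic (the tested TokenBucket wraps the same math behind TryConsume/EstimatedWaitTime):

// Hedged sketch of the refill/wait arithmetic the tests above encode; not the TokenBucket class itself.
public static double Refill(double tokens, double burstCapacity, double refillRate,
    DateTimeOffset lastRefillAt, DateTimeOffset now)
{
    var elapsed = (now - lastRefillAt).TotalSeconds;
    if (elapsed <= 0) return tokens;                                // past times do nothing
    return Math.Min(burstCapacity, tokens + elapsed * refillRate);  // e.g. 5 + 2 * 2.0 = 9, capped at capacity
}

public static TimeSpan EstimatedWait(double tokens, double refillRate, double required)
{
    var deficit = required - tokens;
    return deficit <= 0
        ? TimeSpan.Zero
        : TimeSpan.FromSeconds(deficit / refillRate); // e.g. (5 - 2) / 2.0 = 1.5 s
}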

View File

@@ -0,0 +1,284 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Core.Scheduling;
namespace StellaOps.Orchestrator.Tests.Scheduling;
public sealed class DagPlannerTests
{
private static readonly string TenantId = "test-tenant";
private static readonly Guid RunId = Guid.NewGuid();
[Fact]
public void ValidateDag_EmptyEdges_ReturnsValid()
{
var result = DagPlanner.ValidateDag([]);
Assert.True(result.IsValid);
Assert.Empty(result.CycleNodes);
}
[Fact]
public void ValidateDag_LinearChain_ReturnsValid()
{
var jobA = Guid.NewGuid();
var jobB = Guid.NewGuid();
var jobC = Guid.NewGuid();
var edges = new[]
{
CreateEdge(jobA, jobB),
CreateEdge(jobB, jobC)
};
var result = DagPlanner.ValidateDag(edges);
Assert.True(result.IsValid);
}
[Fact]
public void ValidateDag_DiamondShape_ReturnsValid()
{
// A -> B -> D
// A -> C -> D
var jobA = Guid.NewGuid();
var jobB = Guid.NewGuid();
var jobC = Guid.NewGuid();
var jobD = Guid.NewGuid();
var edges = new[]
{
CreateEdge(jobA, jobB),
CreateEdge(jobA, jobC),
CreateEdge(jobB, jobD),
CreateEdge(jobC, jobD)
};
var result = DagPlanner.ValidateDag(edges);
Assert.True(result.IsValid);
}
[Fact]
public void ValidateDag_SimpleCycle_ReturnsCycleDetected()
{
var jobA = Guid.NewGuid();
var jobB = Guid.NewGuid();
var edges = new[]
{
CreateEdge(jobA, jobB),
CreateEdge(jobB, jobA) // Cycle!
};
var result = DagPlanner.ValidateDag(edges);
Assert.False(result.IsValid);
Assert.NotEmpty(result.CycleNodes);
}
[Fact]
public void ValidateDag_SelfLoop_ReturnsCycleDetected()
{
var jobA = Guid.NewGuid();
var edges = new[]
{
CreateEdge(jobA, jobA) // Self-loop!
};
var result = DagPlanner.ValidateDag(edges);
Assert.False(result.IsValid);
}
[Fact]
public void TopologicalSort_LinearChain_ReturnsCorrectOrder()
{
var jobA = Guid.NewGuid();
var jobB = Guid.NewGuid();
var jobC = Guid.NewGuid();
var jobs = new[] { jobC, jobA, jobB }; // Unordered
var edges = new[]
{
CreateEdge(jobA, jobB),
CreateEdge(jobB, jobC)
};
var sorted = DagPlanner.TopologicalSort(jobs, edges).ToList();
Assert.Equal(3, sorted.Count);
Assert.True(sorted.IndexOf(jobA) < sorted.IndexOf(jobB));
Assert.True(sorted.IndexOf(jobB) < sorted.IndexOf(jobC));
}
[Fact]
public void TopologicalSort_DiamondShape_ReturnsValidOrder()
{
var jobA = Guid.NewGuid();
var jobB = Guid.NewGuid();
var jobC = Guid.NewGuid();
var jobD = Guid.NewGuid();
var jobs = new[] { jobD, jobC, jobB, jobA }; // Reverse order
var edges = new[]
{
CreateEdge(jobA, jobB),
CreateEdge(jobA, jobC),
CreateEdge(jobB, jobD),
CreateEdge(jobC, jobD)
};
var sorted = DagPlanner.TopologicalSort(jobs, edges).ToList();
Assert.Equal(4, sorted.Count);
Assert.True(sorted.IndexOf(jobA) < sorted.IndexOf(jobB));
Assert.True(sorted.IndexOf(jobA) < sorted.IndexOf(jobC));
Assert.True(sorted.IndexOf(jobB) < sorted.IndexOf(jobD));
Assert.True(sorted.IndexOf(jobC) < sorted.IndexOf(jobD));
}
[Fact]
public void TopologicalSort_NoEdges_ReturnsAllJobs()
{
var jobA = Guid.NewGuid();
var jobB = Guid.NewGuid();
var jobs = new[] { jobA, jobB };
var sorted = DagPlanner.TopologicalSort(jobs, []);
Assert.Equal(2, sorted.Count);
Assert.Contains(jobA, sorted);
Assert.Contains(jobB, sorted);
}
[Fact]
public void GetReadyJobs_NoDependencies_ReturnsAllPendingJobs()
{
var job1 = CreateJob(JobStatus.Pending);
var job2 = CreateJob(JobStatus.Pending);
var job3 = CreateJob(JobStatus.Scheduled); // Not pending
var ready = DagPlanner.GetReadyJobs([job1, job2, job3], []);
Assert.Equal(2, ready.Count);
Assert.Contains(job1, ready);
Assert.Contains(job2, ready);
}
[Fact]
public void GetReadyJobs_WithUnsatisfiedDependency_FiltersBlockedJobs()
{
var job1 = CreateJob(JobStatus.Pending);
var job2 = CreateJob(JobStatus.Pending);
var edges = new[]
{
CreateEdge(job1.JobId, job2.JobId) // job2 depends on job1
};
var ready = DagPlanner.GetReadyJobs([job1, job2], edges);
Assert.Single(ready);
Assert.Contains(job1, ready);
}
[Fact]
public void GetReadyJobs_WithSatisfiedDependency_IncludesDependentJob()
{
var job1 = CreateJob(JobStatus.Succeeded); // Parent completed
var job2 = CreateJob(JobStatus.Pending); // Can now run
var edges = new[]
{
CreateEdge(job1.JobId, job2.JobId)
};
var ready = DagPlanner.GetReadyJobs([job1, job2], edges);
Assert.Single(ready);
Assert.Contains(job2, ready);
}
[Fact]
public void GetBlockedJobs_SingleFailure_ReturnsDirectAndTransitiveChildren()
{
var failed = Guid.NewGuid();
var child1 = Guid.NewGuid();
var child2 = Guid.NewGuid();
var grandchild = Guid.NewGuid();
var edges = new[]
{
CreateEdge(failed, child1),
CreateEdge(failed, child2),
CreateEdge(child1, grandchild)
};
var blocked = DagPlanner.GetBlockedJobs(failed, edges);
Assert.Equal(3, blocked.Count);
Assert.Contains(child1, blocked);
Assert.Contains(child2, blocked);
Assert.Contains(grandchild, blocked);
}
[Fact]
public void CalculateCriticalPath_LinearChain_ReturnsEntireChain()
{
var job1 = CreateJob(JobStatus.Pending);
var job2 = CreateJob(JobStatus.Pending);
var job3 = CreateJob(JobStatus.Pending);
var edges = new[]
{
CreateEdge(job1.JobId, job2.JobId),
CreateEdge(job2.JobId, job3.JobId)
};
var result = DagPlanner.CalculateCriticalPath(
[job1, job2, job3],
edges,
_ => TimeSpan.FromMinutes(10));
Assert.Equal(TimeSpan.FromMinutes(30), result.TotalDuration);
Assert.Equal(3, result.CriticalPathJobIds.Count);
}
private static DagEdge CreateEdge(Guid parent, Guid child, string edgeType = DagEdgeTypes.Success)
{
return new DagEdge(
EdgeId: Guid.NewGuid(),
TenantId: TenantId,
RunId: RunId,
ParentJobId: parent,
ChildJobId: child,
EdgeType: edgeType,
CreatedAt: DateTimeOffset.UtcNow);
}
private static Job CreateJob(JobStatus status, int priority = 0)
{
return new Job(
JobId: Guid.NewGuid(),
TenantId: TenantId,
ProjectId: null,
RunId: RunId,
JobType: "test.job",
Status: status,
Priority: priority,
Attempt: 1,
MaxAttempts: 3,
PayloadDigest: "0".PadLeft(64, '0'),
Payload: "{}",
IdempotencyKey: Guid.NewGuid().ToString(),
CorrelationId: null,
LeaseId: null,
WorkerId: null,
TaskRunnerId: null,
LeaseUntil: null,
CreatedAt: DateTimeOffset.UtcNow,
ScheduledAt: null,
LeasedAt: null,
CompletedAt: null,
NotBefore: null,
Reason: null,
ReplayOf: null,
CreatedBy: "test");
}
}
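
ValidateDag and TopologicalSort behave exactly like Kahn's algorithm: repeatedly emit zero-in-degree nodes; any node that never reaches in-degree zero (including a self-loop) sits on a cycle. A standalone sketch over plain (parent, child) pairs — a plausible shape for these methods, not necessarily DagPlanner's internals:

// Hedged sketch of Kahn's algorithm; illustrates the tested behavior, not DagPlanner's source.
public static (bool IsValid, List<Guid> Order) KahnSort(
    IEnumerable<Guid> nodes,
    IEnumerable<(Guid Parent, Guid Child)> edges)
{
    var inDegree = new Dictionary<Guid, int>();
    foreach (var n in nodes) inDegree[n] = 0;
    var children = new Dictionary<Guid, List<Guid>>();
    foreach (var (parent, child) in edges)
    {
        inDegree.TryAdd(parent, 0);
        inDegree.TryAdd(child, 0);
        inDegree[child]++;
        if (!children.TryGetValue(parent, out var list))
        {
            children[parent] = list = new List<Guid>();
        }
        list.Add(child);
    }
    var ready = new Queue<Guid>(inDegree.Where(kv => kv.Value == 0).Select(kv => kv.Key));
    var order = new List<Guid>();
    while (ready.Count > 0)
    {
        var node = ready.Dequeue();
        order.Add(node);
        if (!children.TryGetValue(node, out var next)) continue;
        foreach (var child in next)
        {
            if (--inDegree[child] == 0) ready.Enqueue(child); // child's last parent was emitted
        }
    }
    // Leftover nodes never reached in-degree zero: they form (or hang off) a cycle.
    return (order.Count == inDegree.Count, order);
}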

View File

@@ -0,0 +1,109 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Core.Scheduling;
namespace StellaOps.Orchestrator.Tests.Scheduling;
public sealed class JobStateMachineTests
{
[Theory]
[InlineData(JobStatus.Pending, JobStatus.Scheduled, true)]
[InlineData(JobStatus.Pending, JobStatus.Canceled, true)]
[InlineData(JobStatus.Pending, JobStatus.Leased, false)]
[InlineData(JobStatus.Scheduled, JobStatus.Leased, true)]
[InlineData(JobStatus.Scheduled, JobStatus.Canceled, true)]
[InlineData(JobStatus.Scheduled, JobStatus.Pending, true)]
[InlineData(JobStatus.Leased, JobStatus.Succeeded, true)]
[InlineData(JobStatus.Leased, JobStatus.Failed, true)]
[InlineData(JobStatus.Leased, JobStatus.Canceled, true)]
[InlineData(JobStatus.Leased, JobStatus.TimedOut, true)]
[InlineData(JobStatus.Leased, JobStatus.Pending, false)]
[InlineData(JobStatus.Failed, JobStatus.Pending, true)]
[InlineData(JobStatus.Failed, JobStatus.Scheduled, false)]
[InlineData(JobStatus.TimedOut, JobStatus.Pending, true)]
[InlineData(JobStatus.Succeeded, JobStatus.Pending, false)]
[InlineData(JobStatus.Canceled, JobStatus.Pending, false)]
public void IsValidTransition_ReturnsExpectedResult(JobStatus from, JobStatus to, bool expected)
{
var result = JobStateMachine.IsValidTransition(from, to);
Assert.Equal(expected, result);
}
[Theory]
[InlineData(JobStatus.Pending, JobStatus.Pending)]
[InlineData(JobStatus.Scheduled, JobStatus.Scheduled)]
[InlineData(JobStatus.Leased, JobStatus.Leased)]
[InlineData(JobStatus.Succeeded, JobStatus.Succeeded)]
public void IsValidTransition_SameStatus_ReturnsTrue(JobStatus status, JobStatus same)
{
Assert.True(JobStateMachine.IsValidTransition(status, same));
}
[Theory]
[InlineData(JobStatus.Succeeded, true)]
[InlineData(JobStatus.Failed, true)]
[InlineData(JobStatus.Canceled, true)]
[InlineData(JobStatus.TimedOut, true)]
[InlineData(JobStatus.Pending, false)]
[InlineData(JobStatus.Scheduled, false)]
[InlineData(JobStatus.Leased, false)]
public void IsTerminal_ReturnsExpectedResult(JobStatus status, bool expected)
{
Assert.Equal(expected, JobStateMachine.IsTerminal(status));
}
[Theory]
[InlineData(JobStatus.Failed, true)]
[InlineData(JobStatus.TimedOut, true)]
[InlineData(JobStatus.Succeeded, false)]
[InlineData(JobStatus.Canceled, false)]
[InlineData(JobStatus.Pending, false)]
public void IsRetryable_ReturnsExpectedResult(JobStatus status, bool expected)
{
Assert.Equal(expected, JobStateMachine.IsRetryable(status));
}
[Fact]
public void ValidateTransition_InvalidTransition_ThrowsException()
{
var ex = Assert.Throws<InvalidJobTransitionException>(
() => JobStateMachine.ValidateTransition(JobStatus.Pending, JobStatus.Succeeded));
Assert.Equal(JobStatus.Pending, ex.FromStatus);
Assert.Equal(JobStatus.Succeeded, ex.ToStatus);
}
[Fact]
public void ValidateTransition_ValidTransition_DoesNotThrow()
{
JobStateMachine.ValidateTransition(JobStatus.Pending, JobStatus.Scheduled);
}
[Fact]
public void GetValidTransitions_Pending_ReturnsScheduledAndCanceled()
{
var transitions = JobStateMachine.GetValidTransitions(JobStatus.Pending);
Assert.Contains(JobStatus.Scheduled, transitions);
Assert.Contains(JobStatus.Canceled, transitions);
Assert.Equal(2, transitions.Count);
}
[Fact]
public void GetValidTransitions_Leased_ReturnsFourOptions()
{
var transitions = JobStateMachine.GetValidTransitions(JobStatus.Leased);
Assert.Contains(JobStatus.Succeeded, transitions);
Assert.Contains(JobStatus.Failed, transitions);
Assert.Contains(JobStatus.Canceled, transitions);
Assert.Contains(JobStatus.TimedOut, transitions);
Assert.Equal(4, transitions.Count);
}
[Fact]
public void GetValidTransitions_Terminal_ReturnsEmpty()
{
Assert.Empty(JobStateMachine.GetValidTransitions(JobStatus.Succeeded));
Assert.Empty(JobStateMachine.GetValidTransitions(JobStatus.Canceled));
}
}
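
The InlineData rows above fully determine the transition table, so it can be written out as a lookup. A sketch of one obvious encoding (plus the same-status identity the second theory asserts); not necessarily how JobStateMachine stores it:

// Hedged sketch: the transition table the theories above pin down, as a plain lookup.
private static readonly Dictionary<JobStatus, JobStatus[]> Transitions = new()
{
    [JobStatus.Pending]   = new[] { JobStatus.Scheduled, JobStatus.Canceled },
    [JobStatus.Scheduled] = new[] { JobStatus.Leased, JobStatus.Canceled, JobStatus.Pending },
    [JobStatus.Leased]    = new[] { JobStatus.Succeeded, JobStatus.Failed, JobStatus.Canceled, JobStatus.TimedOut },
    [JobStatus.Failed]    = new[] { JobStatus.Pending },   // retryable
    [JobStatus.TimedOut]  = new[] { JobStatus.Pending },   // retryable
    [JobStatus.Succeeded] = Array.Empty<JobStatus>(),      // terminal
    [JobStatus.Canceled]  = Array.Empty<JobStatus>(),      // terminal
};

public static bool IsValid(JobStatus from, JobStatus to) =>
    from == to || (Transitions.TryGetValue(from, out var allowed) && allowed.Contains(to));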

View File

@@ -0,0 +1,143 @@
using StellaOps.Orchestrator.Core.Scheduling;
namespace StellaOps.Orchestrator.Tests.Scheduling;
public sealed class RetryPolicyTests
{
[Theory]
[InlineData(1, true)] // First attempt, can retry
[InlineData(2, true)] // Second attempt, can retry (3 max)
[InlineData(3, false)] // Third attempt, cannot retry (3 max)
[InlineData(4, false)] // Beyond max
public void ShouldRetry_DefaultPolicy_ReturnsExpected(int attempt, bool expected)
{
var policy = RetryPolicy.Default; // 3 max attempts
Assert.Equal(expected, policy.ShouldRetry(attempt));
}
[Fact]
public void ShouldRetry_NoRetryPolicy_NeverRetries()
{
var policy = RetryPolicy.NoRetry;
Assert.False(policy.ShouldRetry(1));
}
[Theory]
[InlineData(1, 5.0)] // First attempt: 5 * 2^0 = 5
[InlineData(2, 10.0)] // Second attempt: 5 * 2^1 = 10
[InlineData(3, 20.0)] // Third attempt: 5 * 2^2 = 20
public void CalculateBackoffSeconds_ExponentialGrowth_ReturnsExpected(int attempt, double expectedBase)
{
// Use a policy with no jitter for deterministic testing
var policy = new RetryPolicy(
MaxAttempts: 5,
InitialBackoffSeconds: 5.0,
MaxBackoffSeconds: 300.0,
BackoffMultiplier: 2.0,
JitterFactor: 0.0); // No jitter
var backoff = policy.CalculateBackoffSeconds(attempt);
Assert.Equal(expectedBase, backoff, precision: 1);
}
[Fact]
public void CalculateBackoffSeconds_CapsAtMaximum()
{
var policy = new RetryPolicy(
MaxAttempts: 10,
InitialBackoffSeconds: 100.0,
MaxBackoffSeconds: 200.0,
BackoffMultiplier: 2.0,
JitterFactor: 0.0);
// 100 * 2^5 = 3200, but capped at 200
var backoff = policy.CalculateBackoffSeconds(6);
Assert.Equal(200.0, backoff);
}
[Fact]
public void CalculateBackoffSeconds_WithJitter_VariesWithinRange()
{
var policy = new RetryPolicy(
MaxAttempts: 5,
InitialBackoffSeconds: 10.0,
MaxBackoffSeconds: 300.0,
BackoffMultiplier: 2.0,
JitterFactor: 0.2); // 20% jitter
// Run multiple times to verify jitter adds variance
var backoffs = Enumerable.Range(0, 100)
.Select(_ => policy.CalculateBackoffSeconds(1))
.ToList();
var minExpected = 10.0 * 0.8; // 10 - 20%
var maxExpected = 10.0 * 1.2; // 10 + 20%
Assert.True(backoffs.All(b => b >= minExpected && b <= maxExpected));
// Should have some variance (not all equal)
Assert.True(backoffs.Distinct().Count() > 1);
}
[Fact]
public void CalculateNextRetryTime_ReturnsCorrectTime()
{
var policy = new RetryPolicy(
MaxAttempts: 3,
InitialBackoffSeconds: 30.0,
MaxBackoffSeconds: 300.0,
BackoffMultiplier: 2.0,
JitterFactor: 0.0);
var now = DateTimeOffset.UtcNow;
var nextRetry = policy.CalculateNextRetryTime(1, now);
Assert.Equal(now.AddSeconds(30), nextRetry);
}
[Fact]
public void CalculateNextRetryTime_WhenExhausted_ThrowsException()
{
var policy = RetryPolicy.Default; // 3 max
Assert.Throws<InvalidOperationException>(
() => policy.CalculateNextRetryTime(3, DateTimeOffset.UtcNow));
}
[Fact]
public void RetryEvaluator_WhenShouldRetry_ReturnsRetryDecision()
{
var policy = RetryPolicy.Default;
var now = DateTimeOffset.UtcNow;
var decision = RetryEvaluator.Evaluate(1, policy, now);
Assert.True(decision.ShouldRetry);
Assert.Equal(2, decision.NextAttempt);
Assert.NotNull(decision.NotBefore);
Assert.True(decision.NotBefore > now);
}
[Fact]
public void RetryEvaluator_WhenExhausted_ReturnsExhaustedDecision()
{
var policy = RetryPolicy.Default; // 3 max
var now = DateTimeOffset.UtcNow;
var decision = RetryEvaluator.Evaluate(3, policy, now);
Assert.False(decision.ShouldRetry);
Assert.Null(decision.NotBefore);
Assert.Contains("exhausted", decision.Reason, StringComparison.OrdinalIgnoreCase);
}
[Fact]
public void DefaultPolicy_HasReasonableValues()
{
var policy = RetryPolicy.Default;
Assert.Equal(3, policy.MaxAttempts);
Assert.Equal(5.0, policy.InitialBackoffSeconds);
Assert.Equal(300.0, policy.MaxBackoffSeconds);
Assert.Equal(2.0, policy.BackoffMultiplier);
}
}
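
The policy's arithmetic, as these tests assert it: backoff = min(MaxBackoffSeconds, InitialBackoffSeconds x BackoffMultiplier^(attempt - 1)), then jittered by +/- JitterFactor, with next retry time = now + backoff. A sketch of that formula alone, not RetryPolicy's own code:

// Hedged sketch of the backoff formula the tests above encode.
public static double BackoffSeconds(
    int attempt, double initial, double max, double multiplier, double jitterFactor,
    Random? rng = null)
{
    var raw = initial * Math.Pow(multiplier, attempt - 1); // 5 * 2^0 = 5, 5 * 2^1 = 10, 5 * 2^2 = 20
    var capped = Math.Min(raw, max);                       // 100 * 2^5 = 3200 -> capped at 200
    if (jitterFactor <= 0) return capped;
    rng ??= Random.Shared;
    // +/- jitterFactor around the capped value, e.g. 10 s with 20% jitter lands in [8, 12].
    var jittered = capped * (1.0 + jitterFactor * (rng.NextDouble() * 2.0 - 1.0));
    return Math.Min(jittered, max);
}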

View File

@@ -0,0 +1,531 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.Tests.SloManagement;
public class SloTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
// =========================================================================
// Slo Creation Tests
// =========================================================================
[Fact]
public void CreateAvailability_SetsCorrectProperties()
{
var slo = Slo.CreateAvailability(
TenantId,
"API Availability",
target: 0.999,
window: SloWindow.ThirtyDays,
createdBy: "admin",
description: "99.9% uptime target");
Assert.NotEqual(Guid.Empty, slo.SloId);
Assert.Equal(TenantId, slo.TenantId);
Assert.Equal("API Availability", slo.Name);
Assert.Equal("99.9% uptime target", slo.Description);
Assert.Equal(SloType.Availability, slo.Type);
Assert.Equal(0.999, slo.Target);
Assert.Equal(SloWindow.ThirtyDays, slo.Window);
Assert.True(slo.Enabled);
Assert.Null(slo.JobType);
Assert.Null(slo.SourceId);
Assert.Equal("admin", slo.CreatedBy);
}
[Fact]
public void CreateAvailability_WithJobType_SetsJobType()
{
var slo = Slo.CreateAvailability(
TenantId,
"Scan Availability",
0.99,
SloWindow.SevenDays,
"admin",
jobType: "scan.image");
Assert.Equal("scan.image", slo.JobType);
}
[Fact]
public void CreateAvailability_WithSourceId_SetsSourceId()
{
var sourceId = Guid.NewGuid();
var slo = Slo.CreateAvailability(
TenantId,
"Source Availability",
0.995,
SloWindow.OneDay,
"admin",
sourceId: sourceId);
Assert.Equal(sourceId, slo.SourceId);
}
[Fact]
public void CreateLatency_SetsCorrectProperties()
{
var slo = Slo.CreateLatency(
TenantId,
"API Latency P95",
percentile: 0.95,
targetSeconds: 0.5,
target: 0.99,
window: SloWindow.OneDay,
createdBy: "admin");
Assert.Equal(SloType.Latency, slo.Type);
Assert.Equal(0.95, slo.LatencyPercentile);
Assert.Equal(0.5, slo.LatencyTargetSeconds);
Assert.Equal(0.99, slo.Target);
}
[Fact]
public void CreateThroughput_SetsCorrectProperties()
{
var slo = Slo.CreateThroughput(
TenantId,
"Scan Throughput",
minimum: 1000,
target: 0.95,
window: SloWindow.OneHour,
createdBy: "admin");
Assert.Equal(SloType.Throughput, slo.Type);
Assert.Equal(1000, slo.ThroughputMinimum);
Assert.Equal(0.95, slo.Target);
}
// =========================================================================
// Slo Validation Tests
// =========================================================================
[Theory]
[InlineData(0)]
[InlineData(-0.1)]
[InlineData(1.1)]
public void CreateAvailability_WithInvalidTarget_Throws(double target)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
Slo.CreateAvailability(TenantId, "Test", target, SloWindow.OneDay, "admin"));
}
[Theory]
[InlineData(-0.1)]
[InlineData(1.1)]
public void CreateLatency_WithInvalidPercentile_Throws(double percentile)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
Slo.CreateLatency(TenantId, "Test", percentile, 1.0, 0.99, SloWindow.OneDay, "admin"));
}
[Theory]
[InlineData(0)]
[InlineData(-1.0)]
public void CreateLatency_WithInvalidTargetSeconds_Throws(double targetSeconds)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
Slo.CreateLatency(TenantId, "Test", 0.95, targetSeconds, 0.99, SloWindow.OneDay, "admin"));
}
[Theory]
[InlineData(0)]
[InlineData(-1)]
public void CreateThroughput_WithInvalidMinimum_Throws(int minimum)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
Slo.CreateThroughput(TenantId, "Test", minimum, 0.99, SloWindow.OneDay, "admin"));
}
// =========================================================================
// Error Budget Tests
// =========================================================================
[Theory]
[InlineData(0.999, 0.001)]
[InlineData(0.99, 0.01)]
[InlineData(0.95, 0.05)]
[InlineData(0.9, 0.1)]
public void ErrorBudget_CalculatesCorrectly(double target, double expectedBudget)
{
var slo = Slo.CreateAvailability(TenantId, "Test", target, SloWindow.OneDay, "admin");
Assert.Equal(expectedBudget, slo.ErrorBudget, precision: 10);
}
// =========================================================================
// Window Duration Tests
// =========================================================================
[Theory]
[InlineData(SloWindow.OneHour, 1)]
[InlineData(SloWindow.OneDay, 24)]
[InlineData(SloWindow.SevenDays, 168)]
[InlineData(SloWindow.ThirtyDays, 720)]
public void GetWindowDuration_ReturnsCorrectHours(SloWindow window, int expectedHours)
{
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, window, "admin");
Assert.Equal(TimeSpan.FromHours(expectedHours), slo.GetWindowDuration());
}
// =========================================================================
// Update Tests
// =========================================================================
[Fact]
public void Update_UpdatesOnlySpecifiedFields()
{
var slo = Slo.CreateAvailability(TenantId, "Original", 0.99, SloWindow.OneDay, "admin");
var updated = slo.Update(name: "Updated", updatedBy: "operator");
Assert.Equal("Updated", updated.Name);
Assert.Equal(0.99, updated.Target); // Unchanged
Assert.True(updated.Enabled); // Unchanged
Assert.Equal("operator", updated.UpdatedBy);
}
[Fact]
public void Update_WithNewTarget_UpdatesTarget()
{
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin");
var updated = slo.Update(target: 0.999, updatedBy: "operator");
Assert.Equal(0.999, updated.Target);
}
[Fact]
public void Update_WithInvalidTarget_Throws()
{
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin");
Assert.Throws<ArgumentOutOfRangeException>(() =>
slo.Update(target: 1.5, updatedBy: "operator"));
}
// =========================================================================
// Enable/Disable Tests
// =========================================================================
[Fact]
public void Disable_SetsEnabledToFalse()
{
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin");
var disabled = slo.Disable("operator");
Assert.False(disabled.Enabled);
Assert.Equal("operator", disabled.UpdatedBy);
}
[Fact]
public void Enable_SetsEnabledToTrue()
{
var slo = Slo.CreateAvailability(TenantId, "Test", 0.99, SloWindow.OneDay, "admin")
.Disable("operator");
var enabled = slo.Enable("operator");
Assert.True(enabled.Enabled);
}
}
public class SloStateTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
[Fact]
public void NoData_CreatesCorrectState()
{
var sloId = Guid.NewGuid();
var state = SloState.NoData(sloId, TenantId, BaseTime, SloWindow.OneDay);
Assert.Equal(sloId, state.SloId);
Assert.Equal(TenantId, state.TenantId);
Assert.Equal(1.0, state.CurrentSli);
Assert.Equal(0, state.TotalEvents);
Assert.Equal(0, state.GoodEvents);
Assert.Equal(0, state.BadEvents);
Assert.Equal(0, state.BudgetConsumed);
Assert.Equal(1.0, state.BudgetRemaining);
Assert.Equal(0, state.BurnRate);
Assert.Null(state.TimeToExhaustion);
Assert.True(state.IsMet);
Assert.Equal(AlertSeverity.Info, state.AlertSeverity);
}
[Theory]
[InlineData(SloWindow.OneHour)]
[InlineData(SloWindow.OneDay)]
[InlineData(SloWindow.SevenDays)]
[InlineData(SloWindow.ThirtyDays)]
public void NoData_SetsCorrectWindowBounds(SloWindow window)
{
var state = SloState.NoData(Guid.NewGuid(), TenantId, BaseTime, window);
Assert.Equal(BaseTime, state.WindowEnd);
Assert.True(state.WindowStart < state.WindowEnd);
}
}
public class AlertBudgetThresholdTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
[Fact]
public void Create_SetsCorrectProperties()
{
var sloId = Guid.NewGuid();
var threshold = AlertBudgetThreshold.Create(
sloId,
TenantId,
budgetConsumedThreshold: 0.5,
severity: AlertSeverity.Warning,
createdBy: "admin");
Assert.NotEqual(Guid.Empty, threshold.ThresholdId);
Assert.Equal(sloId, threshold.SloId);
Assert.Equal(TenantId, threshold.TenantId);
Assert.Equal(0.5, threshold.BudgetConsumedThreshold);
Assert.Equal(AlertSeverity.Warning, threshold.Severity);
Assert.True(threshold.Enabled);
Assert.Null(threshold.BurnRateThreshold);
Assert.Equal(TimeSpan.FromHours(1), threshold.Cooldown);
}
[Fact]
public void Create_WithBurnRateThreshold_SetsBurnRate()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(),
TenantId,
0.8,
AlertSeverity.Critical,
"admin",
burnRateThreshold: 5.0);
Assert.Equal(5.0, threshold.BurnRateThreshold);
}
[Fact]
public void Create_WithCustomCooldown_SetsCooldown()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(),
TenantId,
0.5,
AlertSeverity.Warning,
"admin",
cooldown: TimeSpan.FromMinutes(30));
Assert.Equal(TimeSpan.FromMinutes(30), threshold.Cooldown);
}
[Theory]
[InlineData(-0.1)]
[InlineData(1.1)]
public void Create_WithInvalidThreshold_Throws(double threshold)
{
Assert.Throws<ArgumentOutOfRangeException>(() =>
AlertBudgetThreshold.Create(Guid.NewGuid(), TenantId, threshold, AlertSeverity.Warning, "admin"));
}
[Fact]
public void ShouldTrigger_WhenDisabled_ReturnsFalse()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin")
with { Enabled = false };
var state = CreateTestState(budgetConsumed: 0.6);
Assert.False(threshold.ShouldTrigger(state, BaseTime));
}
[Fact]
public void ShouldTrigger_WhenBudgetExceedsThreshold_ReturnsTrue()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin");
var state = CreateTestState(budgetConsumed: 0.6);
Assert.True(threshold.ShouldTrigger(state, BaseTime));
}
[Fact]
public void ShouldTrigger_WhenBudgetBelowThreshold_ReturnsFalse()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin");
var state = CreateTestState(budgetConsumed: 0.3);
Assert.False(threshold.ShouldTrigger(state, BaseTime));
}
[Fact]
public void ShouldTrigger_WhenBurnRateExceedsThreshold_ReturnsTrue()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.9, AlertSeverity.Critical, "admin",
burnRateThreshold: 3.0);
var state = CreateTestState(budgetConsumed: 0.3, burnRate: 4.0);
Assert.True(threshold.ShouldTrigger(state, BaseTime));
}
[Fact]
public void ShouldTrigger_WhenWithinCooldown_ReturnsFalse()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin")
with { LastTriggeredAt = BaseTime, Cooldown = TimeSpan.FromHours(1) };
var state = CreateTestState(budgetConsumed: 0.6);
Assert.False(threshold.ShouldTrigger(state, BaseTime.AddMinutes(30)));
}
[Fact]
public void ShouldTrigger_WhenCooldownExpired_ReturnsTrue()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin")
with { LastTriggeredAt = BaseTime, Cooldown = TimeSpan.FromHours(1) };
var state = CreateTestState(budgetConsumed: 0.6);
Assert.True(threshold.ShouldTrigger(state, BaseTime.AddMinutes(90)));
}
[Fact]
public void RecordTrigger_UpdatesLastTriggeredAt()
{
var threshold = AlertBudgetThreshold.Create(
Guid.NewGuid(), TenantId, 0.5, AlertSeverity.Warning, "admin");
var updated = threshold.RecordTrigger(BaseTime);
Assert.Equal(BaseTime, updated.LastTriggeredAt);
Assert.Equal(BaseTime, updated.UpdatedAt);
}
private static SloState CreateTestState(double budgetConsumed = 0.5, double burnRate = 1.0) =>
new(
SloId: Guid.NewGuid(),
TenantId: TenantId,
CurrentSli: 0.99,
TotalEvents: 1000,
GoodEvents: 990,
BadEvents: 10,
BudgetConsumed: budgetConsumed,
BudgetRemaining: 1 - budgetConsumed,
BurnRate: burnRate,
TimeToExhaustion: TimeSpan.FromHours(10),
IsMet: true,
AlertSeverity: AlertSeverity.Info,
ComputedAt: BaseTime,
WindowStart: BaseTime.AddDays(-1),
WindowEnd: BaseTime);
}
public class SloAlertTests
{
private static readonly DateTimeOffset BaseTime = new(2024, 1, 1, 12, 0, 0, TimeSpan.Zero);
private const string TenantId = "test-tenant";
[Fact]
public void Create_FromSloAndState_CreatesAlert()
{
var slo = Slo.CreateAvailability(TenantId, "API Availability", 0.999, SloWindow.ThirtyDays, "admin");
var state = CreateTestState(slo.SloId, budgetConsumed: 0.8);
var threshold = AlertBudgetThreshold.Create(slo.SloId, TenantId, 0.5, AlertSeverity.Warning, "admin");
var alert = SloAlert.Create(slo, state, threshold);
Assert.NotEqual(Guid.Empty, alert.AlertId);
Assert.Equal(slo.SloId, alert.SloId);
Assert.Equal(threshold.ThresholdId, alert.ThresholdId);
Assert.Equal(TenantId, alert.TenantId);
Assert.Equal(AlertSeverity.Warning, alert.Severity);
Assert.Contains("API Availability", alert.Message);
Assert.Equal(0.8, alert.BudgetConsumed);
Assert.False(alert.IsAcknowledged);
Assert.False(alert.IsResolved);
}
[Fact]
public void Create_WithBurnRateTrigger_IncludesBurnRateInMessage()
{
var slo = Slo.CreateAvailability(TenantId, "Test SLO", 0.99, SloWindow.OneDay, "admin");
var state = CreateTestState(slo.SloId, budgetConsumed: 0.3, burnRate: 6.0);
var threshold = AlertBudgetThreshold.Create(slo.SloId, TenantId, 0.9, AlertSeverity.Critical, "admin",
burnRateThreshold: 5.0);
var alert = SloAlert.Create(slo, state, threshold);
Assert.Contains("burn rate", alert.Message);
Assert.Contains("6.00", alert.Message);
}
[Fact]
public void Acknowledge_SetsAcknowledgedFields()
{
var alert = CreateTestAlert();
var acknowledged = alert.Acknowledge("operator", BaseTime.AddHours(1));
Assert.True(acknowledged.IsAcknowledged);
Assert.Equal(BaseTime.AddHours(1), acknowledged.AcknowledgedAt);
Assert.Equal("operator", acknowledged.AcknowledgedBy);
Assert.False(acknowledged.IsResolved);
}
[Fact]
public void Resolve_SetsResolvedFields()
{
var alert = CreateTestAlert();
var resolved = alert.Resolve("Fixed by scaling up", BaseTime.AddHours(2));
Assert.True(resolved.IsResolved);
Assert.Equal(BaseTime.AddHours(2), resolved.ResolvedAt);
Assert.Equal("Fixed by scaling up", resolved.ResolutionNotes);
}
private static SloAlert CreateTestAlert()
{
var slo = Slo.CreateAvailability(TenantId, "Test SLO", 0.99, SloWindow.OneDay, "admin");
var state = CreateTestState(slo.SloId, budgetConsumed: 0.6);
var threshold = AlertBudgetThreshold.Create(slo.SloId, TenantId, 0.5, AlertSeverity.Warning, "admin");
return SloAlert.Create(slo, state, threshold);
}
private static SloState CreateTestState(Guid sloId, double budgetConsumed = 0.5, double burnRate = 1.0) =>
new(
SloId: sloId,
TenantId: TenantId,
CurrentSli: 0.99,
TotalEvents: 1000,
GoodEvents: 990,
BadEvents: 10,
BudgetConsumed: budgetConsumed,
BudgetRemaining: 1 - budgetConsumed,
BurnRate: burnRate,
TimeToExhaustion: TimeSpan.FromHours(10),
IsMet: true,
AlertSeverity: AlertSeverity.Info,
ComputedAt: BaseTime,
WindowStart: BaseTime.AddDays(-1),
WindowEnd: BaseTime);
}
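// Illustrative sketch, not part of this commit: how a periodic evaluator might tie the
// pieces exercised above together. ThresholdEvaluationSketch and EvaluateThresholds are
// hypothetical names; ShouldTrigger, RecordTrigger, and SloAlert.Create are the APIs
// covered by the tests.
internal static class ThresholdEvaluationSketch
{
    public static (IReadOnlyList<SloAlert> Alerts, IReadOnlyList<AlertBudgetThreshold> Thresholds)
        EvaluateThresholds(Slo slo, SloState state, IEnumerable<AlertBudgetThreshold> thresholds, DateTimeOffset now)
    {
        var alerts = new List<SloAlert>();
        var updated = new List<AlertBudgetThreshold>();
        foreach (var threshold in thresholds)
        {
            if (threshold.ShouldTrigger(state, now))
            {
                // Raise the alert and stamp the trigger time so the cooldown applies next round.
                alerts.Add(SloAlert.Create(slo, state, threshold));
                updated.Add(threshold.RecordTrigger(now));
            }
            else
            {
                updated.Add(threshold);
            }
        }
        return (alerts, updated);
    }
}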

View File

@@ -0,0 +1,338 @@
using StellaOps.Orchestrator.Core.Domain;
using StellaOps.Orchestrator.Infrastructure.Repositories;
namespace StellaOps.Orchestrator.WebService.Contracts;
// ===== Audit Contracts =====
/// <summary>
/// Response for an audit entry.
/// </summary>
public sealed record AuditEntryResponse(
Guid EntryId,
string TenantId,
string EventType,
string ResourceType,
Guid ResourceId,
string ActorId,
string ActorType,
string? ActorIp,
string? UserAgent,
string? HttpMethod,
string? RequestPath,
string? OldState,
string? NewState,
string Description,
string? CorrelationId,
string? PreviousEntryHash,
string ContentHash,
long SequenceNumber,
DateTimeOffset OccurredAt,
string? Metadata)
{
public static AuditEntryResponse FromDomain(AuditEntry entry) => new(
EntryId: entry.EntryId,
TenantId: entry.TenantId,
EventType: entry.EventType.ToString(),
ResourceType: entry.ResourceType,
ResourceId: entry.ResourceId,
ActorId: entry.ActorId,
ActorType: entry.ActorType.ToString(),
ActorIp: entry.ActorIp,
UserAgent: entry.UserAgent,
HttpMethod: entry.HttpMethod,
RequestPath: entry.RequestPath,
OldState: entry.OldState,
NewState: entry.NewState,
Description: entry.Description,
CorrelationId: entry.CorrelationId,
PreviousEntryHash: entry.PreviousEntryHash,
ContentHash: entry.ContentHash,
SequenceNumber: entry.SequenceNumber,
OccurredAt: entry.OccurredAt,
Metadata: entry.Metadata);
}
/// <summary>
/// List response for audit entries.
/// </summary>
public sealed record AuditEntryListResponse(
IReadOnlyList<AuditEntryResponse> Entries,
string? NextCursor);
/// <summary>
/// Response for audit summary.
/// </summary>
public sealed record AuditSummaryResponse(
long TotalEntries,
long EntriesSince,
long EventTypes,
long UniqueActors,
long UniqueResources,
DateTimeOffset? EarliestEntry,
DateTimeOffset? LatestEntry)
{
public static AuditSummaryResponse FromDomain(AuditSummary summary) => new(
TotalEntries: summary.TotalEntries,
EntriesSince: summary.EntriesSince,
EventTypes: summary.EventTypes,
UniqueActors: summary.UniqueActors,
UniqueResources: summary.UniqueResources,
EarliestEntry: summary.EarliestEntry,
LatestEntry: summary.LatestEntry);
}
/// <summary>
/// Response for chain verification.
/// </summary>
public sealed record ChainVerificationResponse(
bool IsValid,
Guid? InvalidEntryId,
long? InvalidSequence,
string? ErrorMessage)
{
public static ChainVerificationResponse FromDomain(ChainVerificationResult result) => new(
IsValid: result.IsValid,
InvalidEntryId: result.InvalidEntryId,
InvalidSequence: result.InvalidSequence,
ErrorMessage: result.ErrorMessage);
}
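// Illustrative sketch, not part of this commit: the invariant that ChainVerificationResponse
// reports on. Each entry's PreviousEntryHash must equal its predecessor's ContentHash and
// sequence numbers must be contiguous; recomputing ContentHash itself is domain-layer work
// and is omitted here.
internal static class ChainVerificationSketch
{
    public static ChainVerificationResponse Verify(IReadOnlyList<AuditEntryResponse> entries)
    {
        string? previousHash = null;
        var expectedSequence = entries.Count > 0 ? entries[0].SequenceNumber : 0;
        foreach (var entry in entries)
        {
            if (entry.SequenceNumber != expectedSequence || entry.PreviousEntryHash != previousHash)
            {
                return new ChainVerificationResponse(
                    IsValid: false,
                    InvalidEntryId: entry.EntryId,
                    InvalidSequence: entry.SequenceNumber,
                    ErrorMessage: "Hash chain break or sequence gap.");
            }
            previousHash = entry.ContentHash;
            expectedSequence++;
        }
        return new ChainVerificationResponse(IsValid: true, InvalidEntryId: null, InvalidSequence: null, ErrorMessage: null);
    }
}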
// ===== Ledger Contracts =====
/// <summary>
/// Response for a ledger entry.
/// </summary>
public sealed record LedgerEntryResponse(
Guid LedgerId,
string TenantId,
Guid RunId,
Guid SourceId,
string RunType,
string FinalStatus,
int TotalJobs,
int SucceededJobs,
int FailedJobs,
DateTimeOffset RunCreatedAt,
DateTimeOffset? RunStartedAt,
DateTimeOffset RunCompletedAt,
long ExecutionDurationMs,
string InitiatedBy,
string InputDigest,
string OutputDigest,
long SequenceNumber,
string? PreviousEntryHash,
string ContentHash,
DateTimeOffset LedgerCreatedAt,
string? CorrelationId)
{
public static LedgerEntryResponse FromDomain(RunLedgerEntry entry) => new(
LedgerId: entry.LedgerId,
TenantId: entry.TenantId,
RunId: entry.RunId,
SourceId: entry.SourceId,
RunType: entry.RunType,
FinalStatus: entry.FinalStatus.ToString(),
TotalJobs: entry.TotalJobs,
SucceededJobs: entry.SucceededJobs,
FailedJobs: entry.FailedJobs,
RunCreatedAt: entry.RunCreatedAt,
RunStartedAt: entry.RunStartedAt,
RunCompletedAt: entry.RunCompletedAt,
ExecutionDurationMs: (long)entry.ExecutionDuration.TotalMilliseconds,
InitiatedBy: entry.InitiatedBy,
InputDigest: entry.InputDigest,
OutputDigest: entry.OutputDigest,
SequenceNumber: entry.SequenceNumber,
PreviousEntryHash: entry.PreviousEntryHash,
ContentHash: entry.ContentHash,
LedgerCreatedAt: entry.LedgerCreatedAt,
CorrelationId: entry.CorrelationId);
}
/// <summary>
/// List response for ledger entries.
/// </summary>
public sealed record LedgerEntryListResponse(
IReadOnlyList<LedgerEntryResponse> Entries,
string? NextCursor);
/// <summary>
/// Response for ledger summary.
/// </summary>
public sealed record LedgerSummaryResponse(
long TotalEntries,
long EntriesSince,
long TotalRuns,
long SuccessfulRuns,
long FailedRuns,
long TotalJobs,
long UniqueSources,
long UniqueRunTypes,
DateTimeOffset? EarliestEntry,
DateTimeOffset? LatestEntry)
{
public static LedgerSummaryResponse FromDomain(LedgerSummary summary) => new(
TotalEntries: summary.TotalEntries,
EntriesSince: summary.EntriesSince,
TotalRuns: summary.TotalRuns,
SuccessfulRuns: summary.SuccessfulRuns,
FailedRuns: summary.FailedRuns,
TotalJobs: summary.TotalJobs,
UniqueSources: summary.UniqueSources,
UniqueRunTypes: summary.UniqueRunTypes,
EarliestEntry: summary.EarliestEntry,
LatestEntry: summary.LatestEntry);
}
// ===== Export Contracts =====
/// <summary>
/// Request to create a ledger export.
/// </summary>
public sealed record CreateLedgerExportRequest(
string Format,
DateTimeOffset? StartTime,
DateTimeOffset? EndTime,
string? RunTypeFilter,
Guid? SourceIdFilter);
/// <summary>
/// Response for a ledger export.
/// </summary>
public sealed record LedgerExportResponse(
Guid ExportId,
string TenantId,
string Status,
string Format,
DateTimeOffset? StartTime,
DateTimeOffset? EndTime,
string? RunTypeFilter,
Guid? SourceIdFilter,
int EntryCount,
string? OutputUri,
string? OutputDigest,
long? OutputSizeBytes,
string RequestedBy,
DateTimeOffset RequestedAt,
DateTimeOffset? StartedAt,
DateTimeOffset? CompletedAt,
string? ErrorMessage)
{
public static LedgerExportResponse FromDomain(LedgerExport export) => new(
ExportId: export.ExportId,
TenantId: export.TenantId,
Status: export.Status.ToString(),
Format: export.Format,
StartTime: export.StartTime,
EndTime: export.EndTime,
RunTypeFilter: export.RunTypeFilter,
SourceIdFilter: export.SourceIdFilter,
EntryCount: export.EntryCount,
OutputUri: export.OutputUri,
OutputDigest: export.OutputDigest,
OutputSizeBytes: export.OutputSizeBytes,
RequestedBy: export.RequestedBy,
RequestedAt: export.RequestedAt,
StartedAt: export.StartedAt,
CompletedAt: export.CompletedAt,
ErrorMessage: export.ErrorMessage);
}
/// <summary>
/// List response for ledger exports.
/// </summary>
public sealed record LedgerExportListResponse(
IReadOnlyList<LedgerExportResponse> Exports,
string? NextCursor);
// ===== Manifest Contracts =====
/// <summary>
/// Response for a signed manifest.
/// </summary>
public sealed record ManifestResponse(
Guid ManifestId,
string SchemaVersion,
string TenantId,
string ProvenanceType,
Guid SubjectId,
string PayloadDigest,
string SignatureAlgorithm,
bool IsSigned,
bool IsExpired,
string KeyId,
DateTimeOffset CreatedAt,
DateTimeOffset? ExpiresAt)
{
public static ManifestResponse FromDomain(SignedManifest manifest) => new(
ManifestId: manifest.ManifestId,
SchemaVersion: manifest.SchemaVersion,
TenantId: manifest.TenantId,
ProvenanceType: manifest.ProvenanceType.ToString(),
SubjectId: manifest.SubjectId,
PayloadDigest: manifest.PayloadDigest,
SignatureAlgorithm: manifest.SignatureAlgorithm,
IsSigned: manifest.IsSigned,
IsExpired: manifest.IsExpired,
KeyId: manifest.KeyId,
CreatedAt: manifest.CreatedAt,
ExpiresAt: manifest.ExpiresAt);
}
/// <summary>
/// Response with full manifest details including statements and artifacts.
/// </summary>
public sealed record ManifestDetailResponse(
Guid ManifestId,
string SchemaVersion,
string TenantId,
string ProvenanceType,
Guid SubjectId,
string Statements,
string Artifacts,
string Materials,
string? BuildInfo,
string PayloadDigest,
string SignatureAlgorithm,
string Signature,
string KeyId,
DateTimeOffset CreatedAt,
DateTimeOffset? ExpiresAt,
string? Metadata)
{
public static ManifestDetailResponse FromDomain(SignedManifest manifest) => new(
ManifestId: manifest.ManifestId,
SchemaVersion: manifest.SchemaVersion,
TenantId: manifest.TenantId,
ProvenanceType: manifest.ProvenanceType.ToString(),
SubjectId: manifest.SubjectId,
Statements: manifest.Statements,
Artifacts: manifest.Artifacts,
Materials: manifest.Materials,
BuildInfo: manifest.BuildInfo,
PayloadDigest: manifest.PayloadDigest,
SignatureAlgorithm: manifest.SignatureAlgorithm,
Signature: manifest.Signature,
KeyId: manifest.KeyId,
CreatedAt: manifest.CreatedAt,
ExpiresAt: manifest.ExpiresAt,
Metadata: manifest.Metadata);
}
/// <summary>
/// List response for manifests.
/// </summary>
public sealed record ManifestListResponse(
IReadOnlyList<ManifestResponse> Manifests,
string? NextCursor);
/// <summary>
/// Response for manifest verification.
/// </summary>
public sealed record ManifestVerificationResponse(
Guid ManifestId,
bool PayloadIntegrityValid,
bool IsExpired,
bool IsSigned,
string? ValidationError);
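// Illustrative sketch, not part of this commit: assembling a ManifestVerificationResponse.
// How the canonical payload bytes are reconstructed is an assumption here (the caller
// supplies them), as is the digest format (a bare lower-case SHA-256 hex string); the
// authoritative check belongs to the signing service.
internal static class ManifestVerificationSketch
{
    public static ManifestVerificationResponse Verify(SignedManifest manifest, byte[] canonicalPayload)
    {
        var recomputed = Convert.ToHexString(
            System.Security.Cryptography.SHA256.HashData(canonicalPayload)).ToLowerInvariant();
        var payloadOk = string.Equals(recomputed, manifest.PayloadDigest, StringComparison.OrdinalIgnoreCase);
        return new ManifestVerificationResponse(
            ManifestId: manifest.ManifestId,
            PayloadIntegrityValid: payloadOk,
            IsExpired: manifest.IsExpired,
            IsSigned: manifest.IsSigned,
            ValidationError: payloadOk ? null : "Payload digest mismatch.");
    }
}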

View File

@@ -0,0 +1,46 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.WebService.Contracts;
/// <summary>
/// Response representing a DAG edge (job dependency).
/// </summary>
public sealed record DagEdgeResponse(
Guid EdgeId,
Guid RunId,
Guid ParentJobId,
Guid ChildJobId,
string EdgeType,
DateTimeOffset CreatedAt)
{
public static DagEdgeResponse FromDomain(DagEdge edge) => new(
edge.EdgeId,
edge.RunId,
edge.ParentJobId,
edge.ChildJobId,
edge.EdgeType,
edge.CreatedAt);
}
/// <summary>
/// Response containing the DAG structure for a run.
/// </summary>
public sealed record DagResponse(
Guid RunId,
IReadOnlyList<DagEdgeResponse> Edges,
IReadOnlyList<Guid> TopologicalOrder,
IReadOnlyList<Guid> CriticalPath,
TimeSpan? EstimatedDuration);
/// <summary>
/// Response containing a list of edges.
/// </summary>
public sealed record DagEdgeListResponse(
IReadOnlyList<DagEdgeResponse> Edges);
/// <summary>
/// Response for blocked jobs (transitively affected by a failure).
/// </summary>
public sealed record BlockedJobsResponse(
Guid FailedJobId,
IReadOnlyList<Guid> BlockedJobIds);
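// Illustrative sketch, not part of this commit: deriving the blocked set reported by
// BlockedJobsResponse with a breadth-first walk over parent->child edges starting at
// the failed job. Assumes implicit usings (System.Linq, System.Collections.Generic).
internal static class BlockedJobsSketch
{
    public static BlockedJobsResponse Compute(Guid failedJobId, IReadOnlyList<DagEdgeResponse> edges)
    {
        var childrenByParent = edges
            .GroupBy(e => e.ParentJobId)
            .ToDictionary(g => g.Key, g => g.Select(e => e.ChildJobId).ToList());
        var blocked = new HashSet<Guid>();
        var queue = new Queue<Guid>();
        queue.Enqueue(failedJobId);
        while (queue.Count > 0)
        {
            if (!childrenByParent.TryGetValue(queue.Dequeue(), out var children)) continue;
            foreach (var child in children)
            {
                if (blocked.Add(child)) queue.Enqueue(child); // each descendant is visited once
            }
        }
        return new BlockedJobsResponse(failedJobId, blocked.ToList());
    }
}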

View File

@@ -0,0 +1,121 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.WebService.Contracts;
/// <summary>
/// Response representing a job.
/// </summary>
public sealed record JobResponse(
Guid JobId,
Guid? RunId,
string JobType,
string Status,
int Priority,
int Attempt,
int MaxAttempts,
string? CorrelationId,
string? WorkerId,
string? TaskRunnerId,
DateTimeOffset CreatedAt,
DateTimeOffset? ScheduledAt,
DateTimeOffset? LeasedAt,
DateTimeOffset? CompletedAt,
DateTimeOffset? NotBefore,
string? Reason,
Guid? ReplayOf,
string CreatedBy)
{
public static JobResponse FromDomain(Job job) => new(
job.JobId,
job.RunId,
job.JobType,
job.Status.ToString().ToLowerInvariant(),
job.Priority,
job.Attempt,
job.MaxAttempts,
job.CorrelationId,
job.WorkerId,
job.TaskRunnerId,
job.CreatedAt,
job.ScheduledAt,
job.LeasedAt,
job.CompletedAt,
job.NotBefore,
job.Reason,
job.ReplayOf,
job.CreatedBy);
}
/// <summary>
/// Response representing a job with its full payload.
/// </summary>
public sealed record JobDetailResponse(
Guid JobId,
Guid? RunId,
string JobType,
string Status,
int Priority,
int Attempt,
int MaxAttempts,
string PayloadDigest,
string Payload,
string IdempotencyKey,
string? CorrelationId,
Guid? LeaseId,
string? WorkerId,
string? TaskRunnerId,
DateTimeOffset? LeaseUntil,
DateTimeOffset CreatedAt,
DateTimeOffset? ScheduledAt,
DateTimeOffset? LeasedAt,
DateTimeOffset? CompletedAt,
DateTimeOffset? NotBefore,
string? Reason,
Guid? ReplayOf,
string CreatedBy)
{
public static JobDetailResponse FromDomain(Job job) => new(
job.JobId,
job.RunId,
job.JobType,
job.Status.ToString().ToLowerInvariant(),
job.Priority,
job.Attempt,
job.MaxAttempts,
job.PayloadDigest,
job.Payload,
job.IdempotencyKey,
job.CorrelationId,
job.LeaseId,
job.WorkerId,
job.TaskRunnerId,
job.LeaseUntil,
job.CreatedAt,
job.ScheduledAt,
job.LeasedAt,
job.CompletedAt,
job.NotBefore,
job.Reason,
job.ReplayOf,
job.CreatedBy);
}
/// <summary>
/// Response containing a list of jobs.
/// </summary>
public sealed record JobListResponse(
IReadOnlyList<JobResponse> Jobs,
string? NextCursor);
/// <summary>
/// Summary statistics for jobs.
/// </summary>
public sealed record JobSummary(
int TotalJobs,
int PendingJobs,
int ScheduledJobs,
int LeasedJobs,
int SucceededJobs,
int FailedJobs,
int CanceledJobs,
int TimedOutJobs);
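// Illustrative sketch, not part of this commit: JobSummary is a plain projection over job
// statuses. The lower-cased status strings below mirror JobResponse.FromDomain but are an
// assumption about the JobStatus enum's member names.
internal static class JobSummarySketch
{
    public static JobSummary Summarize(IReadOnlyList<JobResponse> jobs) => new(
        TotalJobs: jobs.Count,
        PendingJobs: jobs.Count(j => j.Status == "pending"),
        ScheduledJobs: jobs.Count(j => j.Status == "scheduled"),
        LeasedJobs: jobs.Count(j => j.Status == "leased"),
        SucceededJobs: jobs.Count(j => j.Status == "succeeded"),
        FailedJobs: jobs.Count(j => j.Status == "failed"),
        CanceledJobs: jobs.Count(j => j.Status == "canceled"),
        TimedOutJobs: jobs.Count(j => j.Status == "timedout"));
}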

View File

@@ -0,0 +1,22 @@
namespace StellaOps.Orchestrator.WebService.Contracts;
/// <summary>
/// Common query options for pagination.
/// </summary>
public sealed record QueryOptions
{
/// <summary>Maximum number of results to return. Default 50.</summary>
public int Limit { get; init; } = 50;
/// <summary>Cursor for pagination (opaque token).</summary>
public string? Cursor { get; init; }
/// <summary>Sort order: "asc" or "desc". Default "desc".</summary>
public string? Sort { get; init; }
    /// <summary>Return only items created after this timestamp.</summary>
    public DateTimeOffset? CreatedAfter { get; init; }
    /// <summary>Return only items created before this timestamp.</summary>
    public DateTimeOffset? CreatedBefore { get; init; }
}
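// Illustrative sketch, not part of this commit: the cursor-pagination loop these options
// support — feed each page's NextCursor back until it returns null. The fetchPage delegate
// stands in for any list endpoint that returns (Items, NextCursor).
internal static class PaginationSketch
{
    public static async Task<List<T>> FetchAllAsync<T>(
        Func<QueryOptions, Task<(IReadOnlyList<T> Items, string? NextCursor)>> fetchPage)
    {
        var all = new List<T>();
        string? cursor = null;
        do
        {
            var page = await fetchPage(new QueryOptions { Cursor = cursor });
            all.AddRange(page.Items);
            cursor = page.NextCursor; // null marks the final page
        } while (cursor is not null);
        return all;
    }
}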

View File

@@ -0,0 +1,352 @@
using StellaOps.Orchestrator.Core.Domain;
namespace StellaOps.Orchestrator.WebService.Contracts;
// ============================================================================
// Quota Contracts
// ============================================================================
/// <summary>
/// Request to create a quota.
/// </summary>
public sealed record CreateQuotaRequest(
string? JobType,
int MaxActive,
int MaxPerHour,
int BurstCapacity,
double RefillRate);
/// <summary>
/// Request to update a quota.
/// </summary>
public sealed record UpdateQuotaRequest(
int? MaxActive,
int? MaxPerHour,
int? BurstCapacity,
double? RefillRate);
/// <summary>
/// Request to pause a quota.
/// </summary>
public sealed record PauseQuotaRequest(
string Reason,
string? Ticket);
/// <summary>
/// Response for a quota.
/// </summary>
public sealed record QuotaResponse(
Guid QuotaId,
string TenantId,
string? JobType,
int MaxActive,
int MaxPerHour,
int BurstCapacity,
double RefillRate,
double CurrentTokens,
int CurrentActive,
int CurrentHourCount,
bool Paused,
string? PauseReason,
string? QuotaTicket,
DateTimeOffset CreatedAt,
DateTimeOffset UpdatedAt,
string UpdatedBy)
{
public static QuotaResponse FromDomain(Quota quota) =>
new(
QuotaId: quota.QuotaId,
TenantId: quota.TenantId,
JobType: quota.JobType,
MaxActive: quota.MaxActive,
MaxPerHour: quota.MaxPerHour,
BurstCapacity: quota.BurstCapacity,
RefillRate: quota.RefillRate,
CurrentTokens: quota.CurrentTokens,
CurrentActive: quota.CurrentActive,
CurrentHourCount: quota.CurrentHourCount,
Paused: quota.Paused,
PauseReason: quota.PauseReason,
QuotaTicket: quota.QuotaTicket,
CreatedAt: quota.CreatedAt,
UpdatedAt: quota.UpdatedAt,
UpdatedBy: quota.UpdatedBy);
}
/// <summary>
/// Response for quota list.
/// </summary>
public sealed record QuotaListResponse(
IReadOnlyList<QuotaResponse> Items,
string? NextCursor);
// ============================================================================
// SLO Contracts
// ============================================================================
/// <summary>
/// Request to create an SLO.
/// </summary>
public sealed record CreateSloRequest(
string Name,
string? Description,
string Type,
string? JobType,
Guid? SourceId,
double Target,
string Window,
double? LatencyPercentile,
double? LatencyTargetSeconds,
int? ThroughputMinimum);
/// <summary>
/// Request to update an SLO.
/// </summary>
public sealed record UpdateSloRequest(
string? Name,
string? Description,
double? Target,
bool? Enabled);
/// <summary>
/// Response for an SLO.
/// </summary>
public sealed record SloResponse(
Guid SloId,
string TenantId,
string Name,
string? Description,
string Type,
string? JobType,
Guid? SourceId,
double Target,
string Window,
double ErrorBudget,
double? LatencyPercentile,
double? LatencyTargetSeconds,
int? ThroughputMinimum,
bool Enabled,
DateTimeOffset CreatedAt,
DateTimeOffset UpdatedAt)
{
public static SloResponse FromDomain(Slo slo) =>
new(
SloId: slo.SloId,
TenantId: slo.TenantId,
Name: slo.Name,
Description: slo.Description,
Type: slo.Type.ToString().ToLowerInvariant(),
JobType: slo.JobType,
SourceId: slo.SourceId,
Target: slo.Target,
Window: FormatWindow(slo.Window),
ErrorBudget: slo.ErrorBudget,
LatencyPercentile: slo.LatencyPercentile,
LatencyTargetSeconds: slo.LatencyTargetSeconds,
ThroughputMinimum: slo.ThroughputMinimum,
Enabled: slo.Enabled,
CreatedAt: slo.CreatedAt,
UpdatedAt: slo.UpdatedAt);
private static string FormatWindow(SloWindow window) => window switch
{
SloWindow.OneHour => "1h",
SloWindow.OneDay => "1d",
SloWindow.SevenDays => "7d",
SloWindow.ThirtyDays => "30d",
_ => window.ToString()
};
}
/// <summary>
/// Response for SLO list.
/// </summary>
public sealed record SloListResponse(
IReadOnlyList<SloResponse> Items,
string? NextCursor);
/// <summary>
/// Response for SLO state (current metrics).
/// </summary>
public sealed record SloStateResponse(
Guid SloId,
double CurrentSli,
long TotalEvents,
long GoodEvents,
long BadEvents,
double BudgetConsumed,
double BudgetRemaining,
double BurnRate,
double? TimeToExhaustionSeconds,
bool IsMet,
string AlertSeverity,
DateTimeOffset ComputedAt,
DateTimeOffset WindowStart,
DateTimeOffset WindowEnd)
{
public static SloStateResponse FromDomain(SloState state) =>
new(
SloId: state.SloId,
CurrentSli: state.CurrentSli,
TotalEvents: state.TotalEvents,
GoodEvents: state.GoodEvents,
BadEvents: state.BadEvents,
BudgetConsumed: state.BudgetConsumed,
BudgetRemaining: state.BudgetRemaining,
BurnRate: state.BurnRate,
TimeToExhaustionSeconds: state.TimeToExhaustion?.TotalSeconds,
IsMet: state.IsMet,
AlertSeverity: state.AlertSeverity.ToString().ToLowerInvariant(),
ComputedAt: state.ComputedAt,
WindowStart: state.WindowStart,
WindowEnd: state.WindowEnd);
}
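// Illustrative sketch, not part of this commit: the conventional error-budget arithmetic
// behind BurnRate and TimeToExhaustionSeconds. The authoritative formulas live in the
// domain layer; the definitions below are an assumption.
internal static class BudgetMathSketch
{
    public static (double BurnRate, TimeSpan? TimeToExhaustion) Compute(
        double currentSli, double target, double budgetRemaining, TimeSpan window)
    {
        var allowedErrorRate = 1 - target;      // e.g. 0.001 for a 99.9% target
        var observedErrorRate = 1 - currentSli;
        var burnRate = allowedErrorRate > 0 ? observedErrorRate / allowedErrorRate : 0;
        // At burn rate b the full budget lasts window / b; scale by the remaining fraction.
        var timeToExhaustion = burnRate > 0
            ? (TimeSpan?)TimeSpan.FromSeconds(window.TotalSeconds * budgetRemaining / burnRate)
            : null;
        return (burnRate, timeToExhaustion);
    }
}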
/// <summary>
/// Response with SLO and its current state.
/// </summary>
public sealed record SloWithStateResponse(
SloResponse Slo,
SloStateResponse State);
// ============================================================================
// Alert Threshold Contracts
// ============================================================================
/// <summary>
/// Request to create an alert threshold.
/// </summary>
public sealed record CreateAlertThresholdRequest(
double BudgetConsumedThreshold,
double? BurnRateThreshold,
string Severity,
string? NotificationChannel,
string? NotificationEndpoint,
int? CooldownMinutes);
/// <summary>
/// Response for an alert threshold.
/// </summary>
public sealed record AlertThresholdResponse(
Guid ThresholdId,
Guid SloId,
double BudgetConsumedThreshold,
double? BurnRateThreshold,
string Severity,
bool Enabled,
string? NotificationChannel,
string? NotificationEndpoint,
int CooldownMinutes,
DateTimeOffset? LastTriggeredAt,
DateTimeOffset CreatedAt,
DateTimeOffset UpdatedAt)
{
public static AlertThresholdResponse FromDomain(AlertBudgetThreshold threshold) =>
new(
ThresholdId: threshold.ThresholdId,
SloId: threshold.SloId,
BudgetConsumedThreshold: threshold.BudgetConsumedThreshold,
BurnRateThreshold: threshold.BurnRateThreshold,
Severity: threshold.Severity.ToString().ToLowerInvariant(),
Enabled: threshold.Enabled,
NotificationChannel: threshold.NotificationChannel,
NotificationEndpoint: threshold.NotificationEndpoint,
CooldownMinutes: (int)threshold.Cooldown.TotalMinutes,
LastTriggeredAt: threshold.LastTriggeredAt,
CreatedAt: threshold.CreatedAt,
UpdatedAt: threshold.UpdatedAt);
}
// ============================================================================
// Alert Contracts
// ============================================================================
/// <summary>
/// Response for an SLO alert.
/// </summary>
public sealed record SloAlertResponse(
Guid AlertId,
Guid SloId,
Guid ThresholdId,
string Severity,
string Message,
double BudgetConsumed,
double BurnRate,
double CurrentSli,
DateTimeOffset TriggeredAt,
DateTimeOffset? AcknowledgedAt,
string? AcknowledgedBy,
DateTimeOffset? ResolvedAt,
string? ResolutionNotes)
{
public static SloAlertResponse FromDomain(SloAlert alert) =>
new(
AlertId: alert.AlertId,
SloId: alert.SloId,
ThresholdId: alert.ThresholdId,
Severity: alert.Severity.ToString().ToLowerInvariant(),
Message: alert.Message,
BudgetConsumed: alert.BudgetConsumed,
BurnRate: alert.BurnRate,
CurrentSli: alert.CurrentSli,
TriggeredAt: alert.TriggeredAt,
AcknowledgedAt: alert.AcknowledgedAt,
AcknowledgedBy: alert.AcknowledgedBy,
ResolvedAt: alert.ResolvedAt,
ResolutionNotes: alert.ResolutionNotes);
}
/// <summary>
/// Response for alert list.
/// </summary>
public sealed record SloAlertListResponse(
IReadOnlyList<SloAlertResponse> Items,
string? NextCursor);
/// <summary>
/// Request to acknowledge an alert.
/// </summary>
public sealed record AcknowledgeAlertRequest(
string AcknowledgedBy);
/// <summary>
/// Request to resolve an alert.
/// </summary>
public sealed record ResolveAlertRequest(
string ResolutionNotes);
// ============================================================================
// Summary Contracts
// ============================================================================
/// <summary>
/// Summary response for SLO health.
/// </summary>
public sealed record SloSummaryResponse(
long TotalSlos,
long EnabledSlos,
long ActiveAlerts,
long UnacknowledgedAlerts,
long CriticalAlerts,
IReadOnlyList<SloWithStateResponse> SlosAtRisk);
/// <summary>
/// Summary response for quota usage.
/// </summary>
public sealed record QuotaSummaryResponse(
long TotalQuotas,
long PausedQuotas,
double AverageTokenUtilization,
double AverageConcurrencyUtilization,
IReadOnlyList<QuotaUtilizationResponse> Quotas);
/// <summary>
/// Quota utilization response.
/// </summary>
public sealed record QuotaUtilizationResponse(
Guid QuotaId,
string? JobType,
double TokenUtilization,
double ConcurrencyUtilization,
double HourlyUtilization,
bool Paused);
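// Illustrative sketch, not part of this commit: the token-bucket semantics implied by
// BurstCapacity, RefillRate, and CurrentTokens — tokens refill at RefillRate per second
// up to BurstCapacity and an admission spends one. The server-side refill may differ;
// this is the textbook algorithm, stated as an assumption.
internal static class TokenBucketSketch
{
    public static (double Tokens, bool Admitted) TryAdmit(
        double currentTokens, double refillRatePerSecond, int burstCapacity, TimeSpan sinceLastRefill)
    {
        var tokens = Math.Min(
            burstCapacity,
            currentTokens + refillRatePerSecond * sinceLastRefill.TotalSeconds);
        return tokens >= 1
            ? (tokens - 1, true)   // spend one token for this admission
            : (tokens, false);     // out of burst; the job should be deferred
    }
}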

Some files were not shown because too many files have changed in this diff.