using Microsoft.AspNetCore.Mvc;
using Npgsql;
using StellaOps.Auth.ServerIntegration.Tenancy;
using StellaOps.JobEngine.Core.DeadLetter;
using StellaOps.JobEngine.Core.Domain;
using StellaOps.JobEngine.WebService.Services;
using System;
using System.Globalization;
using System.Text;
using static StellaOps.Localization.T;

namespace StellaOps.JobEngine.WebService.Endpoints;

/// <summary>
/// REST API endpoints for dead-letter store.
/// </summary>
public static class DeadLetterEndpoints
{
    /// <summary>
    /// Maps dead-letter endpoints to the route builder.
    /// </summary>
    /// <param name="app">Route builder the dead-letter group is attached to.</param>
    /// <returns>
    /// The created route group. Every route requires the read policy and a tenant;
    /// replay and resolve routes additionally require the operate policy.
    /// </returns>
    public static RouteGroupBuilder MapDeadLetterEndpoints(this IEndpointRouteBuilder app)
    {
        var group = app.MapGroup("/api/v1/jobengine/deadletter")
            .WithTags("Orchestrator Dead-Letter")
            .RequireAuthorization(JobEnginePolicies.Read)
            .RequireTenant();

        // Entry management
        group.MapGet(string.Empty, ListEntries)
            .WithName("Orchestrator_ListDeadLetterEntries")
            .WithDescription(_t("orchestrator.dead_letter.list_description"));

        group.MapGet("{entryId:guid}", GetEntry)
            .WithName("Orchestrator_GetDeadLetterEntry")
            .WithDescription(_t("orchestrator.dead_letter.get_description"));

        group.MapGet("by-job/{jobId:guid}", GetEntryByJobId)
            .WithName("Orchestrator_GetDeadLetterEntryByJobId")
            .WithDescription(_t("orchestrator.dead_letter.get_by_job_description"));

        group.MapGet("stats", GetStats)
            .WithName("Orchestrator_GetDeadLetterStats")
            .WithDescription(_t("orchestrator.dead_letter.stats_description"));

        group.MapGet("export", ExportEntries)
            .WithName("Orchestrator_ExportDeadLetterEntries")
            .WithDescription(_t("orchestrator.dead_letter.export_description"));

        group.MapGet("summary", GetActionableSummary)
            .WithName("Orchestrator_GetDeadLetterSummary")
            .WithDescription(_t("orchestrator.dead_letter.summary_description"));

        // Replay operations
        group.MapPost("{entryId:guid}/replay", ReplayEntry)
            .WithName("Orchestrator_ReplayDeadLetterEntry")
            .WithDescription(_t("orchestrator.dead_letter.replay_description"))
            .RequireAuthorization(JobEnginePolicies.Operate);

        group.MapPost("replay/batch", ReplayBatch)
            .WithName("Orchestrator_ReplayDeadLetterBatch")
            .WithDescription(_t("orchestrator.dead_letter.replay_batch_description"))
            .RequireAuthorization(JobEnginePolicies.Operate);

        group.MapPost("replay/pending", ReplayPending)
            .WithName("Orchestrator_ReplayPendingDeadLetters")
            .WithDescription(_t("orchestrator.dead_letter.replay_pending_description"))
            .RequireAuthorization(JobEnginePolicies.Operate);

        // Resolution
        group.MapPost("{entryId:guid}/resolve", ResolveEntry)
            .WithName("Orchestrator_ResolveDeadLetterEntry")
            .WithDescription(_t("orchestrator.dead_letter.resolve_description"))
            .RequireAuthorization(JobEnginePolicies.Operate);

        group.MapPost("resolve/batch", ResolveBatch)
            .WithName("Orchestrator_ResolveDeadLetterBatch")
            .WithDescription(_t("orchestrator.dead_letter.resolve_batch_description"))
            .RequireAuthorization(JobEnginePolicies.Operate);

        // Error classification reference
        group.MapGet("error-codes", ListErrorCodes)
            .WithName("Orchestrator_ListDeadLetterErrorCodes")
            .WithDescription(_t("orchestrator.dead_letter.error_codes_description"));

        // Audit
        group.MapGet("{entryId:guid}/audit", GetReplayAudit)
            .WithName("Orchestrator_GetDeadLetterReplayAudit")
            .WithDescription(_t("orchestrator.dead_letter.replay_audit_description"));

        return group;
    }

    /// <summary>
    /// Lists dead-letter entries for the resolved tenant using the optional query filters.
    /// </summary>
    /// <returns>
    /// 200 with the page, total count and a next cursor (the last entry's creation time, emitted
    /// only when the page is full); 400 when an <see cref="InvalidOperationException"/> is raised;
    /// 200 with an empty page when the dead-letter table is reported missing.
    /// </returns>
    private static async Task<IResult> ListEntries(
        HttpContext context,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IDeadLetterRepository repository,
        [FromQuery] string? status = null,
        [FromQuery] string? category = null,
        [FromQuery] string? jobType = null,
        [FromQuery] string? errorCode = null,
        [FromQuery] Guid? sourceId = null,
        [FromQuery] Guid? runId = null,
        [FromQuery] bool? isRetryable = null,
        [FromQuery] string? createdAfter = null,
        [FromQuery] string? createdBefore = null,
        [FromQuery] int? limit = null,
        [FromQuery] string? cursor = null,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var effectiveLimit = EndpointHelpers.GetLimit(limit);

            var options = new DeadLetterListOptions(
                Status: TryParseDeadLetterStatus(status),
                Category: TryParseErrorCategory(category),
                JobType: jobType,
                ErrorCode: errorCode,
                SourceId: sourceId,
                RunId: runId,
                IsRetryable: isRetryable,
                CreatedAfter: EndpointHelpers.TryParseDateTimeOffset(createdAfter),
                CreatedBefore: EndpointHelpers.TryParseDateTimeOffset(createdBefore),
                Cursor: cursor,
                Limit: effectiveLimit);

            var entries = await repository.ListAsync(tenantId, options, cancellationToken)
                .ConfigureAwait(false);
            var totalCount = await repository.CountAsync(tenantId, options, cancellationToken)
                .ConfigureAwait(false);

            var responses = entries.Select(DeadLetterEntryResponse.FromDomain).ToList();

            // A full page suggests more data may follow; the cursor is the last entry's timestamp.
            var nextCursor = entries.Count >= effectiveLimit
                ? entries.Last().CreatedAt.ToString("O", CultureInfo.InvariantCulture)
                : null;

            return Results.Ok(new DeadLetterListResponse(responses, nextCursor, totalCount));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (PostgresException ex) when (IsMissingDeadLetterTable(ex))
        {
            return Results.Ok(new DeadLetterListResponse(new List<DeadLetterEntryResponse>(), null, 0));
        }
    }

    /// <summary>
    /// Gets a single dead-letter entry (detail view) by its identifier.
    /// </summary>
    /// <returns>
    /// 200 with the entry; 404 when no entry is found or the dead-letter table is reported missing;
    /// 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> GetEntry(
        HttpContext context,
        [FromRoute] Guid entryId,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IDeadLetterRepository repository,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var entry = await repository.GetByIdAsync(tenantId, entryId, cancellationToken)
                .ConfigureAwait(false);

            if (entry is null)
            {
                return Results.NotFound();
            }

            return Results.Ok(DeadLetterEntryDetailResponse.FromDomain(entry));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (PostgresException ex) when (IsMissingDeadLetterTable(ex))
        {
            return Results.NotFound();
        }
    }

    /// <summary>
    /// Gets the dead-letter entry (detail view) recorded for an original job identifier.
    /// </summary>
    /// <returns>
    /// 200 with the entry; 404 when no entry is found or the dead-letter table is reported missing;
    /// 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> GetEntryByJobId(
        HttpContext context,
        [FromRoute] Guid jobId,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IDeadLetterRepository repository,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var entry = await repository.GetByOriginalJobIdAsync(tenantId, jobId, cancellationToken)
                .ConfigureAwait(false);

            if (entry is null)
            {
                return Results.NotFound();
            }

            return Results.Ok(DeadLetterEntryDetailResponse.FromDomain(entry));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (PostgresException ex) when (IsMissingDeadLetterTable(ex))
        {
            return Results.NotFound();
        }
    }

    /// <summary>
    /// Gets dead-letter statistics for the resolved tenant.
    /// </summary>
    /// <returns>
    /// 200 with the statistics (all-zero statistics when the dead-letter table is reported missing);
    /// 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> GetStats(
        HttpContext context,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IDeadLetterRepository repository,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var stats = await repository.GetStatsAsync(tenantId, cancellationToken)
                .ConfigureAwait(false);

            return Results.Ok(DeadLetterStatsResponse.FromDomain(stats));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (PostgresException ex) when (IsMissingDeadLetterTable(ex))
        {
            return Results.Ok(DeadLetterStatsResponse.FromDomain(CreateEmptyStats()));
        }
    }

    /// <summary>
    /// Exports dead-letter entries matching the optional filters as a CSV file download.
    /// </summary>
    /// <returns>
    /// A <c>text/csv</c> file result (header row only when the dead-letter table is reported
    /// missing); 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> ExportEntries(
        HttpContext context,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IDeadLetterRepository repository,
        [FromQuery] string? status = null,
        [FromQuery] string? category = null,
        [FromQuery] string? jobType = null,
        [FromQuery] string? errorCode = null,
        [FromQuery] bool? isRetryable = null,
        [FromQuery] int? limit = null,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);

            // Default to 1000 rows and cap a single export at 10000.
            var effectiveLimit = Math.Clamp(limit ?? 1000, 1, 10000);

            var options = new DeadLetterListOptions(
                Status: TryParseDeadLetterStatus(status),
                Category: TryParseErrorCategory(category),
                JobType: jobType,
                ErrorCode: errorCode,
                IsRetryable: isRetryable,
                Limit: effectiveLimit);

            var entries = await repository.ListAsync(tenantId, options, cancellationToken)
                .ConfigureAwait(false);

            return CreateCsvExportResult(entries);
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (PostgresException ex) when (IsMissingDeadLetterTable(ex))
        {
            return CreateCsvExportResult(Array.Empty<DeadLetterEntry>());
        }
    }

    /// <summary>
    /// Gets a per-error-code summary of dead-letter entries for the resolved tenant.
    /// </summary>
    /// <returns>
    /// 200 with the summaries (empty when the dead-letter table is reported missing);
    /// 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> GetActionableSummary(
        HttpContext context,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IDeadLetterRepository repository,
        [FromQuery] int? limit = null,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);

            // Default to 10 summary rows and cap at 50.
            var effectiveLimit = Math.Clamp(limit ?? 10, 1, 50);

            var summaries = await repository.GetActionableSummaryAsync(tenantId, effectiveLimit, cancellationToken)
                .ConfigureAwait(false);

            return Results.Ok(new DeadLetterSummaryListResponse(
                summaries.Select(s => new DeadLetterSummaryResponse(
                    s.ErrorCode,
                    s.Category.ToString(),
                    s.EntryCount,
                    s.RetryableCount,
                    s.OldestEntry,
                    s.SampleReason)).ToList()));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (PostgresException ex) when (IsMissingDeadLetterTable(ex))
        {
            return Results.Ok(new DeadLetterSummaryListResponse(new List<DeadLetterSummaryResponse>()));
        }
    }

    /// <summary>
    /// Replays a single dead-letter entry on behalf of the current user.
    /// </summary>
    /// <returns>
    /// 200 with the replay result; 422 when the replay manager reports failure;
    /// 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> ReplayEntry(
        HttpContext context,
        [FromRoute] Guid entryId,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IReplayManager replayManager,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var user = GetCurrentUser(context);

            var result = await replayManager.ReplayAsync(tenantId, entryId, user, cancellationToken)
                .ConfigureAwait(false);

            if (!result.Success)
            {
                return Results.UnprocessableEntity(new { error = result.ErrorMessage });
            }

            // The batch handlers treat UpdatedEntry as optional; map it the same way here so a
            // successful result without an updated entry does not fail with a null dereference.
            return Results.Ok(new ReplayResultResponse(
                result.Success,
                result.NewJobId,
                result.ErrorMessage,
                ToEntryResponseOrNull(result.UpdatedEntry)));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
    }

    /// <summary>
    /// Replays the dead-letter entries listed in the request body on behalf of the current user.
    /// </summary>
    /// <returns>
    /// 200 with attempted/succeeded/failed counts and per-entry results; 400 when an
    /// <see cref="ArgumentException"/> or <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> ReplayBatch(
        HttpContext context,
        [FromBody] ReplayBatchRequest request,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IReplayManager replayManager,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var user = GetCurrentUser(context);

            var result = await replayManager.ReplayBatchAsync(tenantId, request.EntryIds, user, cancellationToken)
                .ConfigureAwait(false);

            return Results.Ok(new BatchReplayResultResponse(
                result.Attempted,
                result.Succeeded,
                result.Failed,
                result.Results.Select(r => new ReplayResultResponse(
                    r.Success,
                    r.NewJobId,
                    r.ErrorMessage,
                    ToEntryResponseOrNull(r.UpdatedEntry))).ToList()));
        }
        catch (ArgumentException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
    }

    /// <summary>
    /// Replays pending dead-letter entries, optionally narrowed by error code and category.
    /// At most <c>MaxCount</c> entries are requested (100 when the request omits it).
    /// </summary>
    /// <returns>
    /// 200 with attempted/succeeded/failed counts and per-entry results;
    /// 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> ReplayPending(
        HttpContext context,
        [FromBody] ReplayPendingRequest request,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IReplayManager replayManager,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var user = GetCurrentUser(context);

            var result = await replayManager.ReplayPendingAsync(
                tenantId,
                request.ErrorCode,
                TryParseErrorCategory(request.Category),
                request.MaxCount ?? 100,
                user,
                cancellationToken).ConfigureAwait(false);

            return Results.Ok(new BatchReplayResultResponse(
                result.Attempted,
                result.Succeeded,
                result.Failed,
                result.Results.Select(r => new ReplayResultResponse(
                    r.Success,
                    r.NewJobId,
                    r.ErrorMessage,
                    ToEntryResponseOrNull(r.UpdatedEntry))).ToList()));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
    }

    /// <summary>
    /// Marks a single dead-letter entry as resolved with the supplied notes.
    /// </summary>
    /// <returns>
    /// 200 with the updated entry; 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> ResolveEntry(
        HttpContext context,
        [FromRoute] Guid entryId,
        [FromBody] ResolveEntryRequest request,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IReplayManager replayManager,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var user = GetCurrentUser(context);

            var entry = await replayManager.ResolveAsync(tenantId, entryId, request.Notes, user, cancellationToken)
                .ConfigureAwait(false);

            return Results.Ok(DeadLetterEntryResponse.FromDomain(entry));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
    }

    /// <summary>
    /// Marks the dead-letter entries listed in the request body as resolved with the supplied notes.
    /// </summary>
    /// <returns>
    /// 200 with the number of resolved entries; 400 when an <see cref="ArgumentException"/> or
    /// <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> ResolveBatch(
        HttpContext context,
        [FromBody] ResolveBatchRequest request,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IReplayManager replayManager,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var user = GetCurrentUser(context);

            var count = await replayManager.ResolveBatchAsync(
                    tenantId,
                    request.EntryIds,
                    request.Notes,
                    user,
                    cancellationToken)
                .ConfigureAwait(false);

            return Results.Ok(new { resolvedCount = count });
        }
        catch (ArgumentException ex)
        {
            // Mirrors ReplayBatch: an invalid entry-id list is a client error, not a server fault.
            return Results.BadRequest(new { error = ex.Message });
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
    }

    /// <summary>
    /// Lists the known error codes together with the classification the classifier assigns to each.
    /// </summary>
    /// <returns>200 with one classification entry per known error code.</returns>
    private static Task<IResult> ListErrorCodes(
        [FromServices] IErrorClassifier classifier,
        CancellationToken cancellationToken = default)
    {
        // Return the known error codes with their classifications
        var errorCodes = new[]
        {
            // Transient errors
            DefaultErrorClassifier.ErrorCodes.NetworkTimeout,
            DefaultErrorClassifier.ErrorCodes.ConnectionRefused,
            DefaultErrorClassifier.ErrorCodes.DnsResolutionFailed,
            DefaultErrorClassifier.ErrorCodes.ServiceUnavailable,
            DefaultErrorClassifier.ErrorCodes.GatewayTimeout,

            // Not found errors
            DefaultErrorClassifier.ErrorCodes.ImageNotFound,
            DefaultErrorClassifier.ErrorCodes.SourceNotFound,
            DefaultErrorClassifier.ErrorCodes.RegistryNotFound,

            // Auth errors
            DefaultErrorClassifier.ErrorCodes.InvalidCredentials,
            DefaultErrorClassifier.ErrorCodes.TokenExpired,
            DefaultErrorClassifier.ErrorCodes.InsufficientPermissions,

            // Rate limit errors
            DefaultErrorClassifier.ErrorCodes.RateLimited,
            DefaultErrorClassifier.ErrorCodes.QuotaExceeded,

            // Validation errors
            DefaultErrorClassifier.ErrorCodes.InvalidPayload,
            DefaultErrorClassifier.ErrorCodes.InvalidConfiguration,

            // Upstream errors
            DefaultErrorClassifier.ErrorCodes.RegistryError,
            DefaultErrorClassifier.ErrorCodes.AdvisoryFeedError,

            // Internal errors
            DefaultErrorClassifier.ErrorCodes.InternalError,
            DefaultErrorClassifier.ErrorCodes.ProcessingError
        };

        var responses = errorCodes.Select(code =>
        {
            var classified = classifier.Classify(code, string.Empty);
            return new ErrorCodeResponse(
                classified.ErrorCode,
                classified.Category.ToString(),
                classified.Description,
                classified.RemediationHint,
                classified.IsRetryable,
                classified.SuggestedRetryDelay?.TotalSeconds);
        }).ToList();

        return Task.FromResult(Results.Ok(new ErrorCodeListResponse(responses)));
    }

    /// <summary>
    /// Gets the replay audit records stored for a dead-letter entry.
    /// </summary>
    /// <returns>
    /// 200 with the audit records; 400 when an <see cref="InvalidOperationException"/> is raised.
    /// </returns>
    private static async Task<IResult> GetReplayAudit(
        HttpContext context,
        [FromRoute] Guid entryId,
        [FromServices] TenantResolver tenantResolver,
        [FromServices] IReplayAuditRepository auditRepository,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var tenantId = tenantResolver.Resolve(context);
            var audits = await auditRepository.GetByEntryAsync(tenantId, entryId, cancellationToken)
                .ConfigureAwait(false);

            var responses = audits.Select(a => new ReplayAuditResponse(
                a.AuditId,
                a.EntryId,
                a.AttemptNumber,
                a.Success,
                a.NewJobId,
                a.ErrorMessage,
                a.TriggeredBy,
                a.TriggeredAt,
                a.CompletedAt,
                a.InitiatedBy)).ToList();

            return Results.Ok(new ReplayAuditListResponse(responses));
        }
        catch (InvalidOperationException ex)
        {
            return Results.BadRequest(new { error = ex.Message });
        }
    }

    /// <summary>
    /// Parses a status filter case-insensitively; blank or unparseable input yields <c>null</c>.
    /// </summary>
    private static DeadLetterStatus? TryParseDeadLetterStatus(string? value) =>
        string.IsNullOrWhiteSpace(value)
            ? null
            : Enum.TryParse<DeadLetterStatus>(value, ignoreCase: true, out var status)
                ? status
                : null;

    /// <summary>
    /// Parses a category filter case-insensitively; blank or unparseable input yields <c>null</c>.
    /// </summary>
    private static ErrorCategory? TryParseErrorCategory(string? value) =>
        string.IsNullOrWhiteSpace(value)
            ? null
            : Enum.TryParse<ErrorCategory>(value, ignoreCase: true, out var category)
                ? category
                : null;

    /// <summary>
    /// Returns the authenticated identity name, or <c>"anonymous"</c> when none is available.
    /// </summary>
    private static string GetCurrentUser(HttpContext context) =>
        context.User?.Identity?.Name ?? "anonymous";

    /// <summary>
    /// Returns <c>true</c> for the SQLSTATE values the read endpoints treat as "dead-letter table
    /// not available": 42P01 (undefined_table) and 25P02 (in_failed_sql_transaction).
    /// </summary>
    private static bool IsMissingDeadLetterTable(PostgresException exception) =>
        string.Equals(exception.SqlState, "42P01", StringComparison.Ordinal)
        || string.Equals(exception.SqlState, "25P02", StringComparison.Ordinal);

    /// <summary>
    /// Builds an all-zero statistics value with empty breakdown dictionaries.
    /// </summary>
    private static DeadLetterStats CreateEmptyStats() =>
        new(
            TotalEntries: 0,
            PendingEntries: 0,
            ReplayingEntries: 0,
            ReplayedEntries: 0,
            ResolvedEntries: 0,
            ExhaustedEntries: 0,
            ExpiredEntries: 0,
            RetryableEntries: 0,
            ByCategory: new Dictionary<ErrorCategory, long>(),
            TopErrorCodes: new Dictionary<string, long>(),
            TopJobTypes: new Dictionary<string, long>());

    /// <summary>
    /// Maps an optional domain entry to its response DTO, passing <c>null</c> through.
    /// </summary>
    private static DeadLetterEntryResponse? ToEntryResponseOrNull(DeadLetterEntry? entry) =>
        entry is null ? null : DeadLetterEntryResponse.FromDomain(entry);

    /// <summary>
    /// Wraps the CSV rendering of <paramref name="entries"/> in a <c>text/csv</c> file result whose
    /// name carries the current UTC timestamp.
    /// </summary>
    private static IResult CreateCsvExportResult(IReadOnlyList<DeadLetterEntry> entries)
    {
        var payload = Encoding.UTF8.GetBytes(BuildDeadLetterCsv(entries));

        // Invariant culture keeps the file name stable regardless of the server's culture/calendar.
        var timestamp = DateTime.UtcNow.ToString("yyyyMMdd-HHmmss", CultureInfo.InvariantCulture);
        var fileName = $"deadletter-export-{timestamp}.csv";

        return Results.File(payload, "text/csv", fileName);
    }

    /// <summary>
    /// Renders entries as CSV text: one header row followed by one row per entry.
    /// </summary>
    private static string BuildDeadLetterCsv(IReadOnlyList<DeadLetterEntry> entries)
    {
        var builder = new StringBuilder();
        builder.AppendLine("entryId,jobId,status,errorCode,category,retryable,replayAttempts,maxReplayAttempts,failedAt,createdAt,resolvedAt,reason");

        foreach (var entry in entries)
        {
            builder.Append(EscapeCsv(entry.EntryId.ToString())).Append(',');
            builder.Append(EscapeCsv(entry.OriginalJobId.ToString())).Append(',');
            builder.Append(EscapeCsv(entry.Status.ToString())).Append(',');
            builder.Append(EscapeCsv(entry.ErrorCode)).Append(',');
            builder.Append(EscapeCsv(entry.Category.ToString())).Append(',');
            builder.Append(EscapeCsv(entry.IsRetryable.ToString(CultureInfo.InvariantCulture))).Append(',');
            builder.Append(EscapeCsv(entry.ReplayAttempts.ToString(CultureInfo.InvariantCulture))).Append(',');
            builder.Append(EscapeCsv(entry.MaxReplayAttempts.ToString(CultureInfo.InvariantCulture))).Append(',');
            builder.Append(EscapeCsv(entry.FailedAt.ToString("O", CultureInfo.InvariantCulture))).Append(',');
            builder.Append(EscapeCsv(entry.CreatedAt.ToString("O", CultureInfo.InvariantCulture))).Append(',');
            builder.Append(EscapeCsv(entry.ResolvedAt?.ToString("O", CultureInfo.InvariantCulture))).Append(',');
            builder.Append(EscapeCsv(entry.FailureReason));
            builder.AppendLine();
        }

        return builder.ToString();
    }

    /// <summary>
    /// Quotes a CSV field, doubling embedded quotes; null or empty input yields an empty field.
    /// </summary>
    private static string EscapeCsv(string? value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        return "\"" + value.Replace("\"", "\"\"", StringComparison.Ordinal) + "\"";
    }
}

// Response DTOs

/// <summary>
/// Summary view of a dead-letter entry (no payload).
/// </summary>
public sealed record DeadLetterEntryResponse(
    Guid EntryId,
    Guid OriginalJobId,
    Guid? RunId,
    Guid? SourceId,
    string JobType,
    string Status,
    string ErrorCode,
    string FailureReason,
    string? RemediationHint,
    string Category,
    bool IsRetryable,
    int OriginalAttempts,
    int ReplayAttempts,
    int MaxReplayAttempts,
    bool CanReplay,
    DateTimeOffset FailedAt,
    DateTimeOffset CreatedAt,
    DateTimeOffset ExpiresAt,
    DateTimeOffset? ResolvedAt)
{
    /// <summary>
    /// Copies the summary fields from a domain entry; enum values are emitted as their names.
    /// </summary>
    public static DeadLetterEntryResponse FromDomain(DeadLetterEntry entry) =>
        new(
            entry.EntryId,
            entry.OriginalJobId,
            entry.RunId,
            entry.SourceId,
            entry.JobType,
            entry.Status.ToString(),
            entry.ErrorCode,
            entry.FailureReason,
            entry.RemediationHint,
            entry.Category.ToString(),
            entry.IsRetryable,
            entry.OriginalAttempts,
            entry.ReplayAttempts,
            entry.MaxReplayAttempts,
            entry.CanReplay,
            entry.FailedAt,
            entry.CreatedAt,
            entry.ExpiresAt,
            entry.ResolvedAt);
}

/// <summary>
/// Detail view of a dead-letter entry, including payload, resolution and audit fields.
/// </summary>
public sealed record DeadLetterEntryDetailResponse(
    Guid EntryId,
    Guid OriginalJobId,
    Guid? RunId,
    Guid? SourceId,
    string JobType,
    string Payload,
    string PayloadDigest,
    string IdempotencyKey,
    string? CorrelationId,
    string Status,
    string ErrorCode,
    string FailureReason,
    string? RemediationHint,
    string Category,
    bool IsRetryable,
    int OriginalAttempts,
    int ReplayAttempts,
    int MaxReplayAttempts,
    bool CanReplay,
    DateTimeOffset FailedAt,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt,
    DateTimeOffset ExpiresAt,
    DateTimeOffset? ResolvedAt,
    string? ResolutionNotes,
    string CreatedBy,
    string UpdatedBy)
{
    /// <summary>
    /// Copies all detail fields from a domain entry; enum values are emitted as their names.
    /// </summary>
    public static DeadLetterEntryDetailResponse FromDomain(DeadLetterEntry entry) =>
        new(
            entry.EntryId,
            entry.OriginalJobId,
            entry.RunId,
            entry.SourceId,
            entry.JobType,
            entry.Payload,
            entry.PayloadDigest,
            entry.IdempotencyKey,
            entry.CorrelationId,
            entry.Status.ToString(),
            entry.ErrorCode,
            entry.FailureReason,
            entry.RemediationHint,
            entry.Category.ToString(),
            entry.IsRetryable,
            entry.OriginalAttempts,
            entry.ReplayAttempts,
            entry.MaxReplayAttempts,
            entry.CanReplay,
            entry.FailedAt,
            entry.CreatedAt,
            entry.UpdatedAt,
            entry.ExpiresAt,
            entry.ResolvedAt,
            entry.ResolutionNotes,
            entry.CreatedBy,
            entry.UpdatedBy);
}

/// <summary>
/// One page of dead-letter entries with the cursor for the next page and the total match count.
/// </summary>
public sealed record DeadLetterListResponse(
    IReadOnlyList<DeadLetterEntryResponse> Entries,
    string? NextCursor,
    long TotalCount);

/// <summary>
/// Dead-letter counters and breakdowns keyed by category name, error code and job type.
/// </summary>
public sealed record DeadLetterStatsResponse(
    long TotalEntries,
    long PendingEntries,
    long ReplayingEntries,
    long ReplayedEntries,
    long ResolvedEntries,
    long ExhaustedEntries,
    long ExpiredEntries,
    long RetryableEntries,
    IDictionary<string, long> ByCategory,
    IDictionary<string, long> TopErrorCodes,
    IDictionary<string, long> TopJobTypes)
{
    /// <summary>
    /// Copies the domain statistics; category keys are converted to their string form and the
    /// other breakdowns are copied into new dictionaries.
    /// </summary>
    public static DeadLetterStatsResponse FromDomain(DeadLetterStats stats) =>
        new(
            stats.TotalEntries,
            stats.PendingEntries,
            stats.ReplayingEntries,
            stats.ReplayedEntries,
            stats.ResolvedEntries,
            stats.ExhaustedEntries,
            stats.ExpiredEntries,
            stats.RetryableEntries,
            stats.ByCategory.ToDictionary(kv => kv.Key.ToString(), kv => kv.Value),
            new Dictionary<string, long>(stats.TopErrorCodes),
            new Dictionary<string, long>(stats.TopJobTypes));
}

/// <summary>
/// Per-error-code summary row.
/// </summary>
public sealed record DeadLetterSummaryResponse(
    string ErrorCode,
    string Category,
    long EntryCount,
    long RetryableCount,
    DateTimeOffset OldestEntry,
    string? SampleReason);

/// <summary>
/// List wrapper for summary rows.
/// </summary>
public sealed record DeadLetterSummaryListResponse(
    IReadOnlyList<DeadLetterSummaryResponse> Summaries);

/// <summary>
/// Outcome of replaying one dead-letter entry.
/// </summary>
public sealed record ReplayResultResponse(
    bool Success,
    Guid? NewJobId,
    string? ErrorMessage,
    DeadLetterEntryResponse? UpdatedEntry);

/// <summary>
/// Aggregate outcome of a batch replay with per-entry results.
/// </summary>
public sealed record BatchReplayResultResponse(
    int Attempted,
    int Succeeded,
    int Failed,
    IReadOnlyList<ReplayResultResponse> Results);

/// <summary>
/// Request body for replaying an explicit list of entries.
/// </summary>
public sealed record ReplayBatchRequest(
    IReadOnlyList<Guid> EntryIds);

/// <summary>
/// Request body for replaying pending entries with optional filters.
/// </summary>
public sealed record ReplayPendingRequest(
    string? ErrorCode,
    string? Category,
    int? MaxCount);

/// <summary>
/// Request body for resolving a single entry.
/// </summary>
public sealed record ResolveEntryRequest(
    string Notes);

/// <summary>
/// Request body for resolving an explicit list of entries.
/// </summary>
public sealed record ResolveBatchRequest(
    IReadOnlyList<Guid> EntryIds,
    string Notes);

/// <summary>
/// Classification details for one error code.
/// </summary>
public sealed record ErrorCodeResponse(
    string ErrorCode,
    string Category,
    string Description,
    string RemediationHint,
    bool IsRetryable,
    double? SuggestedRetryDelaySeconds);

/// <summary>
/// List wrapper for error-code classifications.
/// </summary>
public sealed record ErrorCodeListResponse(
    IReadOnlyList<ErrorCodeResponse> ErrorCodes);

/// <summary>
/// One replay audit record for a dead-letter entry.
/// </summary>
public sealed record ReplayAuditResponse(
    Guid AuditId,
    Guid EntryId,
    int AttemptNumber,
    bool Success,
    Guid? NewJobId,
    string? ErrorMessage,
    string TriggeredBy,
    DateTimeOffset TriggeredAt,
    DateTimeOffset? CompletedAt,
    string InitiatedBy);

/// <summary>
/// List wrapper for replay audit records.
/// </summary>
public sealed record ReplayAuditListResponse(
    IReadOnlyList<ReplayAuditResponse> Audits);