760 lines
31 KiB
C#
760 lines
31 KiB
C#
using Microsoft.AspNetCore.Mvc;
|
|
using StellaOps.Auth.ServerIntegration.Tenancy;
|
|
using StellaOps.JobEngine.Core.Domain;
|
|
using StellaOps.JobEngine.Core.SloManagement;
|
|
using StellaOps.JobEngine.WebService.Contracts;
|
|
using StellaOps.JobEngine.WebService.Services;
|
|
|
|
namespace StellaOps.JobEngine.WebService.Endpoints;
|
|
|
|
/// <summary>
|
|
/// REST API endpoints for SLO management.
|
|
/// </summary>
|
|
public static class SloEndpoints
|
|
{
|
|
/// <summary>
/// Registers the SLO management routes on <paramref name="app"/> under
/// <c>/api/v1/jobengine/slos</c>.
/// </summary>
/// <remarks>
/// The group as a whole has <c>JobEnginePolicies.Read</c> and <c>RequireTenant()</c> applied;
/// every mutating route additionally requires <c>JobEnginePolicies.Operate</c>.
/// </remarks>
/// <param name="app">Route builder the group is attached to.</param>
/// <returns>The route group, so callers can chain further conventions.</returns>
public static RouteGroupBuilder MapSloEndpoints(this IEndpointRouteBuilder app)
{
    var group = app.MapGroup("/api/v1/jobengine/slos")
        .WithTags("Orchestrator SLOs")
        .RequireAuthorization(JobEnginePolicies.Read)
        .RequireTenant();

    // SLO CRUD operations
    group.MapGet(string.Empty, ListSlos)
        .WithName("Orchestrator_ListSlos")
        .WithDescription("Return a cursor-paginated list of Service Level Objectives defined for the calling tenant, optionally filtered by enabled state and job type. Each SLO record includes its target metric, threshold, evaluation window, and current enabled state.");

    group.MapGet("{sloId:guid}", GetSlo)
        .WithName("Orchestrator_GetSlo")
        .WithDescription("Return the full definition of the specified SLO including its target metric type (success rate, p95 latency, throughput), threshold value, evaluation window, job type scope, and enabled state. Returns 404 when the SLO does not exist in the tenant.");

    group.MapPost(string.Empty, CreateSlo)
        .WithName("Orchestrator_CreateSlo")
        .WithDescription("Create a new Service Level Objective for the calling tenant. The SLO is disabled by default and must be explicitly enabled. Specify the metric type, threshold, evaluation window, and the job type it governs.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    group.MapPut("{sloId:guid}", UpdateSlo)
        .WithName("Orchestrator_UpdateSlo")
        .WithDescription("Update the definition of the specified SLO including threshold, evaluation window, and description. The SLO must be disabled before structural changes can be applied. Returns 404 when the SLO does not exist in the tenant.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    group.MapDelete("{sloId:guid}", DeleteSlo)
        .WithName("Orchestrator_DeleteSlo")
        .WithDescription("Permanently remove the specified SLO definition and all associated alert thresholds. Active alerts linked to this SLO are automatically resolved. Returns 404 when the SLO does not exist in the tenant.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    // SLO state
    group.MapGet("{sloId:guid}/state", GetSloState)
        .WithName("Orchestrator_GetSloState")
        .WithDescription("Return the current evaluation state of the specified SLO including the measured metric value, the computed burn rate relative to the threshold, and whether the SLO is currently in breach. Updated on each evaluation cycle.");

    group.MapGet("states", GetAllSloStates)
        .WithName("Orchestrator_GetAllSloStates")
        .WithDescription("Return the current evaluation state for all enabled SLOs in the calling tenant in a single response. Useful for operations dashboards that need a snapshot of overall SLO health without polling each SLO individually.");

    // SLO control
    group.MapPost("{sloId:guid}/enable", EnableSlo)
        .WithName("Orchestrator_EnableSlo")
        .WithDescription("Activate the specified SLO so that it is included in evaluation cycles and can generate alerts when its threshold is breached. The SLO must be in a disabled state; enabling an already-active SLO is a no-op.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    group.MapPost("{sloId:guid}/disable", DisableSlo)
        .WithName("Orchestrator_DisableSlo")
        .WithDescription("Deactivate the specified SLO, pausing evaluation and suppressing new alerts. Any active alerts are automatically acknowledged. The SLO definition is retained and can be re-enabled without data loss.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    // Alert thresholds
    group.MapGet("{sloId:guid}/thresholds", ListThresholds)
        .WithName("Orchestrator_ListAlertThresholds")
        .WithDescription("Return all alert thresholds configured for the specified SLO including their severity level, burn rate multiplier trigger, and notification channel references. Thresholds define the graduated alerting behaviour as an SLO degrades.");

    group.MapPost("{sloId:guid}/thresholds", CreateThreshold)
        .WithName("Orchestrator_CreateAlertThreshold")
        .WithDescription("Add a new alert threshold to the specified SLO. Each threshold specifies a severity level and the burn rate or metric value at which the alert fires. Multiple thresholds at different severities implement graduated alerting.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    group.MapDelete("{sloId:guid}/thresholds/{thresholdId:guid}", DeleteThreshold)
        .WithName("Orchestrator_DeleteAlertThreshold")
        .WithDescription("Remove the specified alert threshold from its parent SLO. In-flight alerts generated by this threshold are not automatically resolved. Returns 404 when the threshold ID does not belong to the SLO in the calling tenant.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    // Alerts
    // The description lists only the filters the ListAlerts handler actually binds
    // (sloId, acknowledged, resolved); it previously advertised severity/status/time-window filters.
    group.MapGet("alerts", ListAlerts)
        .WithName("Orchestrator_ListSloAlerts")
        .WithDescription("Return a cursor-paginated list of SLO alerts for the calling tenant, optionally filtered by SLO ID, acknowledged state, and resolved state. Each alert record includes the SLO reference, breach value, and lifecycle timestamps.");

    group.MapGet("alerts/{alertId:guid}", GetAlert)
        .WithName("Orchestrator_GetSloAlert")
        .WithDescription("Return the full alert record for the specified ID including the SLO reference, fired-at timestamp, breach metric value, current status, and the acknowledge/resolve attribution if applicable. Returns 404 when the alert does not exist in the tenant.");

    group.MapPost("alerts/{alertId:guid}/acknowledge", AcknowledgeAlert)
        .WithName("Orchestrator_AcknowledgeAlert")
        .WithDescription("Acknowledge the specified SLO alert, recording the calling principal and timestamp. Acknowledgment suppresses repeat notifications for the breach period but does not resolve the alert; the SLO violation must be corrected for resolution.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    group.MapPost("alerts/{alertId:guid}/resolve", ResolveAlert)
        .WithName("Orchestrator_ResolveAlert")
        .WithDescription("Mark the specified SLO alert as resolved, attributing the resolution to the calling principal. Resolved alerts are archived and excluded from active alert counts. Use when the underlying SLO breach has been addressed and the system is within threshold.")
        .RequireAuthorization(JobEnginePolicies.Operate);

    // Summary
    // The description mirrors the fields the GetSloSummary handler puts in its response.
    group.MapGet("summary", GetSloSummary)
        .WithName("Orchestrator_GetSloSummary")
        .WithDescription("Return a tenant-wide SLO health summary including total SLO count, count of enabled SLOs, the active alert count, counts of unacknowledged alerts and of critical or emergency alerts, and up to ten enabled SLOs at risk (error budget at least 50% consumed or burn rate of at least 2x). Used for high-level service health dashboards.");

    return group;
}
|
|
|
|
/// <summary>
/// Lists the tenant's SLOs with optional enabled-state and job-type filters, paginated in memory.
/// </summary>
/// <param name="context">Current request; used to resolve the tenant.</param>
/// <param name="tenantResolver">Resolves the tenant for the request.</param>
/// <param name="repository">SLO store.</param>
/// <param name="enabled">
/// <c>true</c> returns only enabled SLOs, <c>false</c> only disabled SLOs, <c>null</c> returns both.
/// </param>
/// <param name="jobType">Optional job type passed through to the repository.</param>
/// <param name="limit">Requested page size; normalised by <c>EndpointHelpers.GetLimit</c>.</param>
/// <param name="cursor">Opaque cursor; decoded to an offset by <c>EndpointHelpers.ParseCursorOffset</c>.</param>
/// <param name="cancellationToken">Request cancellation token.</param>
/// <returns>
/// 200 with an <c>SloListResponse</c>, or 400 when an <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> ListSlos(
    HttpContext context,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromQuery] bool? enabled = null,
    [FromQuery] string? jobType = null,
    [FromQuery] int? limit = null,
    [FromQuery] string? cursor = null,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var effectiveLimit = EndpointHelpers.GetLimit(limit);
        var offset = EndpointHelpers.ParseCursorOffset(cursor);

        var slos = await repository.ListAsync(
            tenantId,
            enabledOnly: enabled ?? false,
            jobType: jobType,
            cancellationToken: cancellationToken).ConfigureAwait(false);

        // The repository call can only narrow to enabled SLOs. An explicit enabled=false
        // therefore has to be applied here; otherwise it behaves exactly like "no filter".
        var matching = slos.AsEnumerable();
        if (enabled is false)
        {
            matching = matching.Where(s => !s.Enabled);
        }

        // Apply pagination manually since ListAsync doesn't support it directly
        var responses = matching
            .Skip(offset)
            .Take(effectiveLimit)
            .Select(SloResponse.FromDomain)
            .ToList();
        var nextCursor = EndpointHelpers.CreateNextCursor(offset, effectiveLimit, responses.Count);

        return Results.Ok(new SloListResponse(responses, nextCursor));
    }
    catch (InvalidOperationException ex)
    {
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
|
|
|
/// <summary>
/// Fetches a single SLO by ID for the resolved tenant.
/// </summary>
/// <returns>
/// 200 with the SLO, 404 when the repository returns <c>null</c>, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> GetSlo(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    CancellationToken cancellationToken = default)
{
    try
    {
        var found = await repository
            .GetByIdAsync(tenantResolver.Resolve(context), sloId, cancellationToken)
            .ConfigureAwait(false);

        return found is null
            ? Results.NotFound()
            : Results.Ok(SloResponse.FromDomain(found));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Creates an SLO for the resolved tenant from <paramref name="request"/>.
/// </summary>
/// <remarks>
/// The type and window strings are validated first; the matching <c>Slo.Create*</c> factory is then
/// invoked. Missing latency/throughput parameters fall back to 0.95 / 1.0 seconds / 1 respectively.
/// The actor is the authenticated user name, or <c>"system"</c> when none is available.
/// </remarks>
/// <returns>
/// 201 with the created SLO, or 400 for invalid input or when an <see cref="ArgumentException"/> or
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> CreateSlo(
    HttpContext context,
    [FromBody] CreateSloRequest request,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var createdBy = context.User?.Identity?.Name ?? "system";
        var createdAt = timeProvider.GetUtcNow();

        // Reject unknown type / window strings before touching the domain factories.
        if (!TryParseSloType(request.Type, out var kind))
        {
            return Results.BadRequest(new { error = "Invalid SLO type. Must be 'availability', 'latency', or 'throughput'" });
        }

        if (!TryParseSloWindow(request.Window, out var evaluationWindow))
        {
            return Results.BadRequest(new { error = "Invalid window. Must be '1h', '1d', '7d', or '30d'" });
        }

        Slo created;
        switch (kind)
        {
            case SloType.Availability:
                created = Slo.CreateAvailability(
                    tenantId, request.Name, request.Target, evaluationWindow, createdBy, createdAt,
                    request.Description, request.JobType, request.SourceId);
                break;

            case SloType.Latency:
                created = Slo.CreateLatency(
                    tenantId, request.Name,
                    request.LatencyPercentile ?? 0.95,
                    request.LatencyTargetSeconds ?? 1.0,
                    request.Target, evaluationWindow, createdBy, createdAt,
                    request.Description, request.JobType, request.SourceId);
                break;

            case SloType.Throughput:
                created = Slo.CreateThroughput(
                    tenantId, request.Name,
                    request.ThroughputMinimum ?? 1,
                    request.Target, evaluationWindow, createdBy, createdAt,
                    request.Description, request.JobType, request.SourceId);
                break;

            default:
                throw new InvalidOperationException($"Unknown SLO type: {kind}");
        }

        await repository.CreateAsync(created, cancellationToken).ConfigureAwait(false);

        return Results.Created($"/api/v1/jobengine/slos/{created.SloId}", SloResponse.FromDomain(created));
    }
    catch (Exception failure) when (failure is ArgumentException or InvalidOperationException)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Applies <paramref name="request"/> (name, description, target, enabled) to an existing SLO and
/// persists the result.
/// </summary>
/// <returns>
/// 200 with the updated SLO, 404 when the SLO is not found for the tenant, or 400 when an
/// <see cref="ArgumentException"/> or <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> UpdateSlo(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromBody] UpdateSloRequest request,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var editor = context.User?.Identity?.Name ?? "system";
        var editedAt = timeProvider.GetUtcNow();

        var existing = await repository.GetByIdAsync(tenantId, sloId, cancellationToken).ConfigureAwait(false);
        if (existing is null)
        {
            return Results.NotFound();
        }

        // Update() returns the modified SLO; that instance is what gets stored and returned.
        var revised = existing.Update(
            updatedAt: editedAt,
            name: request.Name,
            description: request.Description,
            target: request.Target,
            enabled: request.Enabled,
            updatedBy: editor);

        await repository.UpdateAsync(revised, cancellationToken).ConfigureAwait(false);

        return Results.Ok(SloResponse.FromDomain(revised));
    }
    catch (Exception failure) when (failure is ArgumentException or InvalidOperationException)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Deletes the specified SLO for the resolved tenant.
/// </summary>
/// <returns>
/// 204 when the repository reports a deletion, 404 when it does not, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> DeleteSlo(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    CancellationToken cancellationToken = default)
{
    try
    {
        var removed = await repository
            .DeleteAsync(tenantResolver.Resolve(context), sloId, cancellationToken)
            .ConfigureAwait(false);

        return removed ? Results.NoContent() : Results.NotFound();
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Returns the specified SLO together with the state computed for it by the burn rate engine.
/// </summary>
/// <returns>
/// 200 with an <c>SloWithStateResponse</c>, 404 when the SLO is not found for the tenant, or 400
/// when an <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> GetSloState(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromServices] IBurnRateEngine burnRateEngine,
    CancellationToken cancellationToken = default)
{
    try
    {
        var target = await repository
            .GetByIdAsync(tenantResolver.Resolve(context), sloId, cancellationToken)
            .ConfigureAwait(false);
        if (target is null)
        {
            return Results.NotFound();
        }

        // The state is only computed once the SLO is known to exist for this tenant.
        var computed = await burnRateEngine.ComputeStateAsync(target, cancellationToken).ConfigureAwait(false);
        var payload = new SloWithStateResponse(
            Slo: SloResponse.FromDomain(target),
            State: SloStateResponse.FromDomain(computed));

        return Results.Ok(payload);
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Returns the computed state of every enabled SLO of the resolved tenant.
/// </summary>
/// <remarks>
/// States whose SLO ID is not among the tenant's enabled SLOs are dropped from the response.
/// </remarks>
/// <returns>
/// 200 with a list of <c>SloWithStateResponse</c>, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> GetAllSloStates(
    HttpContext context,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromServices] IBurnRateEngine burnRateEngine,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var computedStates = await burnRateEngine
            .ComputeAllStatesAsync(tenantId, cancellationToken)
            .ConfigureAwait(false);
        var enabledSlos = await repository
            .ListAsync(tenantId, enabledOnly: true, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        var slosById = enabledSlos.ToDictionary(s => s.SloId);

        // Pair each state with its SLO definition, skipping states with no enabled SLO.
        var payload = new List<SloWithStateResponse>();
        foreach (var state in computedStates)
        {
            if (slosById.TryGetValue(state.SloId, out var owner))
            {
                payload.Add(new SloWithStateResponse(
                    Slo: SloResponse.FromDomain(owner),
                    State: SloStateResponse.FromDomain(state)));
            }
        }

        return Results.Ok(payload);
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Enables the specified SLO on behalf of the calling user and persists the change.
/// </summary>
/// <remarks>The actor is the authenticated user name, or <c>"system"</c> when none is available.</remarks>
/// <returns>
/// 200 with the enabled SLO, 404 when the SLO is not found for the tenant, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> EnableSlo(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var actor = context.User?.Identity?.Name ?? "system";
        var timestamp = timeProvider.GetUtcNow();

        var current = await repository.GetByIdAsync(tenantId, sloId, cancellationToken).ConfigureAwait(false);
        if (current is null)
        {
            return Results.NotFound();
        }

        var activated = current.Enable(actor, timestamp);
        await repository.UpdateAsync(activated, cancellationToken).ConfigureAwait(false);

        return Results.Ok(SloResponse.FromDomain(activated));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Disables the specified SLO on behalf of the calling user and persists the change.
/// </summary>
/// <remarks>The actor is the authenticated user name, or <c>"system"</c> when none is available.</remarks>
/// <returns>
/// 200 with the disabled SLO, 404 when the SLO is not found for the tenant, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> DisableSlo(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var actor = context.User?.Identity?.Name ?? "system";
        var timestamp = timeProvider.GetUtcNow();

        var current = await repository.GetByIdAsync(tenantId, sloId, cancellationToken).ConfigureAwait(false);
        if (current is null)
        {
            return Results.NotFound();
        }

        var deactivated = current.Disable(actor, timestamp);
        await repository.UpdateAsync(deactivated, cancellationToken).ConfigureAwait(false);

        return Results.Ok(SloResponse.FromDomain(deactivated));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Lists the alert thresholds attached to the specified SLO.
/// </summary>
/// <remarks>
/// The SLO is first looked up under the resolved tenant; the threshold query itself is keyed by
/// SLO ID only.
/// </remarks>
/// <returns>
/// 200 with the thresholds, 404 when the SLO is not found for the tenant, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> ListThresholds(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository sloRepository,
    [FromServices] IAlertThresholdRepository repository,
    CancellationToken cancellationToken = default)
{
    try
    {
        var parent = await sloRepository
            .GetByIdAsync(tenantResolver.Resolve(context), sloId, cancellationToken)
            .ConfigureAwait(false);
        if (parent is null)
        {
            return Results.NotFound();
        }

        var stored = await repository.ListBySloAsync(sloId, cancellationToken).ConfigureAwait(false);

        return Results.Ok(stored.Select(AlertThresholdResponse.FromDomain).ToList());
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Adds an alert threshold to the specified SLO.
/// </summary>
/// <remarks>
/// The SLO must exist for the resolved tenant and the severity string must parse. An optional
/// cooldown in minutes is converted to a <see cref="TimeSpan"/>. The actor is the authenticated
/// user name, or <c>"system"</c> when none is available.
/// </remarks>
/// <returns>
/// 201 with the created threshold, 404 when the SLO is not found, or 400 for an invalid severity or
/// when an <see cref="ArgumentException"/> or <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> CreateThreshold(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromBody] CreateAlertThresholdRequest request,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository sloRepository,
    [FromServices] IAlertThresholdRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var author = context.User?.Identity?.Name ?? "system";
        var timestamp = timeProvider.GetUtcNow();

        var parent = await sloRepository.GetByIdAsync(tenantId, sloId, cancellationToken).ConfigureAwait(false);
        if (parent is null)
        {
            return Results.NotFound();
        }

        if (!TryParseAlertSeverity(request.Severity, out var level))
        {
            return Results.BadRequest(new { error = "Invalid severity. Must be 'info', 'warning', 'critical', or 'emergency'" });
        }

        var created = AlertBudgetThreshold.Create(
            sloId: sloId,
            tenantId: tenantId,
            budgetConsumedThreshold: request.BudgetConsumedThreshold,
            severity: level,
            createdBy: author,
            createdAt: timestamp,
            burnRateThreshold: request.BurnRateThreshold,
            notificationChannel: request.NotificationChannel,
            notificationEndpoint: request.NotificationEndpoint,
            // No cooldown is passed unless the request supplies one.
            cooldown: request.CooldownMinutes is { } minutes
                ? TimeSpan.FromMinutes(minutes)
                : null);

        await repository.CreateAsync(created, cancellationToken).ConfigureAwait(false);

        var location = $"/api/v1/jobengine/slos/{sloId}/thresholds/{created.ThresholdId}";
        return Results.Created(location, AlertThresholdResponse.FromDomain(created));
    }
    catch (Exception failure) when (failure is ArgumentException or InvalidOperationException)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Deletes an alert threshold, provided it belongs to the SLO named in the route.
/// </summary>
/// <param name="context">Current request; used to resolve the tenant.</param>
/// <param name="sloId">SLO that is expected to own the threshold.</param>
/// <param name="thresholdId">Threshold to delete.</param>
/// <param name="tenantResolver">Resolves the tenant for the request.</param>
/// <param name="sloRepository">SLO store, used to confirm the SLO exists for the tenant.</param>
/// <param name="repository">Threshold store.</param>
/// <param name="cancellationToken">Request cancellation token.</param>
/// <returns>
/// 204 on deletion; 404 when the SLO is not found for the tenant, the threshold is not one of that
/// SLO's thresholds, or the repository reports nothing deleted; 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> DeleteThreshold(
    HttpContext context,
    [FromRoute] Guid sloId,
    [FromRoute] Guid thresholdId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository sloRepository,
    [FromServices] IAlertThresholdRepository repository,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);

        var slo = await sloRepository.GetByIdAsync(tenantId, sloId, cancellationToken).ConfigureAwait(false);
        if (slo is null)
        {
            return Results.NotFound();
        }

        // DeleteAsync is keyed by tenant and threshold ID only, so without this check a caller could
        // delete a threshold of a different SLO through this SLO's route.
        var sloThresholds = await repository.ListBySloAsync(sloId, cancellationToken).ConfigureAwait(false);
        if (!sloThresholds.Any(t => t.ThresholdId == thresholdId))
        {
            return Results.NotFound();
        }

        var deleted = await repository.DeleteAsync(tenantId, thresholdId, cancellationToken).ConfigureAwait(false);
        if (!deleted)
        {
            return Results.NotFound();
        }

        return Results.NoContent();
    }
    catch (InvalidOperationException ex)
    {
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
|
|
|
/// <summary>
/// Lists SLO alerts for the resolved tenant, optionally filtered by SLO ID, acknowledged state and
/// resolved state. Limit and offset are passed to the repository.
/// </summary>
/// <returns>
/// 200 with an <c>SloAlertListResponse</c>, or 400 when an <see cref="InvalidOperationException"/>
/// is thrown.
/// </returns>
private static async Task<IResult> ListAlerts(
    HttpContext context,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloAlertRepository repository,
    [FromQuery] Guid? sloId = null,
    [FromQuery] bool? acknowledged = null,
    [FromQuery] bool? resolved = null,
    [FromQuery] int? limit = null,
    [FromQuery] string? cursor = null,
    CancellationToken cancellationToken = default)
{
    try
    {
        var tenantId = tenantResolver.Resolve(context);
        var pageSize = EndpointHelpers.GetLimit(limit);
        var skip = EndpointHelpers.ParseCursorOffset(cursor);

        var page = await repository
            .ListAsync(tenantId, sloId, acknowledged, resolved, pageSize, skip, cancellationToken)
            .ConfigureAwait(false);

        var items = page.Select(SloAlertResponse.FromDomain).ToList();
        var next = EndpointHelpers.CreateNextCursor(skip, pageSize, items.Count);

        return Results.Ok(new SloAlertListResponse(items, next));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Fetches a single SLO alert by ID for the resolved tenant.
/// </summary>
/// <returns>
/// 200 with the alert, 404 when the repository returns <c>null</c>, or 400 when an
/// <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> GetAlert(
    HttpContext context,
    [FromRoute] Guid alertId,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloAlertRepository repository,
    CancellationToken cancellationToken = default)
{
    try
    {
        var found = await repository
            .GetByIdAsync(tenantResolver.Resolve(context), alertId, cancellationToken)
            .ConfigureAwait(false);

        return found is null
            ? Results.NotFound()
            : Results.Ok(SloAlertResponse.FromDomain(found));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Acknowledges an SLO alert and persists the acknowledged alert.
/// </summary>
/// <remarks>
/// NOTE(review): the acknowledging party is taken from <c>request.AcknowledgedBy</c>, not from the
/// authenticated principal as the other mutating handlers do - confirm this is intentional.
/// </remarks>
/// <returns>
/// 200 with the acknowledged alert, 404 when the alert is not found for the tenant, or 400 when the
/// alert is already acknowledged or an <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> AcknowledgeAlert(
    HttpContext context,
    [FromRoute] Guid alertId,
    [FromBody] AcknowledgeAlertRequest request,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloAlertRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var current = await repository
            .GetByIdAsync(tenantResolver.Resolve(context), alertId, cancellationToken)
            .ConfigureAwait(false);
        if (current is null)
        {
            return Results.NotFound();
        }

        // Acknowledging twice is reported as a client error rather than silently accepted.
        if (current.IsAcknowledged)
        {
            return Results.BadRequest(new { error = "Alert is already acknowledged" });
        }

        var updated = current.Acknowledge(request.AcknowledgedBy, timeProvider.GetUtcNow());
        await repository.UpdateAsync(updated, cancellationToken).ConfigureAwait(false);

        return Results.Ok(SloAlertResponse.FromDomain(updated));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Resolves an SLO alert with the resolution notes from <paramref name="request"/> and persists the
/// resolved alert.
/// </summary>
/// <returns>
/// 200 with the resolved alert, 404 when the alert is not found for the tenant, or 400 when the
/// alert is already resolved or an <see cref="InvalidOperationException"/> is thrown.
/// </returns>
private static async Task<IResult> ResolveAlert(
    HttpContext context,
    [FromRoute] Guid alertId,
    [FromBody] ResolveAlertRequest request,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloAlertRepository repository,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken = default)
{
    try
    {
        var current = await repository
            .GetByIdAsync(tenantResolver.Resolve(context), alertId, cancellationToken)
            .ConfigureAwait(false);
        if (current is null)
        {
            return Results.NotFound();
        }

        // Resolving twice is reported as a client error rather than silently accepted.
        if (current.IsResolved)
        {
            return Results.BadRequest(new { error = "Alert is already resolved" });
        }

        var closed = current.Resolve(request.ResolutionNotes, timeProvider.GetUtcNow());
        await repository.UpdateAsync(closed, cancellationToken).ConfigureAwait(false);

        return Results.Ok(SloAlertResponse.FromDomain(closed));
    }
    catch (InvalidOperationException failure)
    {
        return Results.BadRequest(new { error = failure.Message });
    }
}
|
|
|
|
/// <summary>
/// Builds the tenant-wide SLO summary: SLO counts, alert counts, and the enabled SLOs most at risk.
/// </summary>
/// <remarks>
/// <para>
/// Unacknowledged and critical alert counts are accumulated over every page of unresolved alerts, so
/// they are not capped at one page. Critical alerts are counted whether or not they have been
/// acknowledged.
/// </para>
/// <para>
/// An SLO is "at risk" when its budget consumed is at least 50% or its burn rate is at least 2x;
/// the ten with the highest budget consumption are returned.
/// </para>
/// </remarks>
/// <param name="context">Current request; used to resolve the tenant.</param>
/// <param name="tenantResolver">Resolves the tenant for the request.</param>
/// <param name="sloRepository">SLO store.</param>
/// <param name="alertRepository">Alert store.</param>
/// <param name="burnRateEngine">Computes the current state of the tenant's SLOs.</param>
/// <param name="cancellationToken">Request cancellation token.</param>
/// <returns>
/// 200 with an <c>SloSummaryResponse</c>, or 400 when an <see cref="InvalidOperationException"/>
/// is thrown.
/// </returns>
private static async Task<IResult> GetSloSummary(
    HttpContext context,
    [FromServices] TenantResolver tenantResolver,
    [FromServices] ISloRepository sloRepository,
    [FromServices] ISloAlertRepository alertRepository,
    [FromServices] IBurnRateEngine burnRateEngine,
    CancellationToken cancellationToken = default)
{
    const int alertPageSize = 100;
    const int maxSlosAtRisk = 10;
    const double budgetConsumedRiskLevel = 0.5;
    const double burnRateRiskLevel = 2.0;

    try
    {
        var tenantId = tenantResolver.Resolve(context);

        var slos = await sloRepository.ListAsync(tenantId, enabledOnly: false, cancellationToken: cancellationToken)
            .ConfigureAwait(false);
        var enabledSlos = slos.Where(s => s.Enabled).ToList();
        var states = await burnRateEngine.ComputeAllStatesAsync(tenantId, cancellationToken).ConfigureAwait(false);

        var activeAlertCount = await alertRepository.GetActiveAlertCountAsync(tenantId, cancellationToken)
            .ConfigureAwait(false);

        // Walk all unresolved alerts page by page. The acknowledged filter is left open (null) so that
        // acknowledged-but-unresolved critical alerts still count as critical.
        var unacknowledgedAlerts = 0;
        var criticalAlerts = 0;
        for (var alertOffset = 0; ; alertOffset += alertPageSize)
        {
            var page = await alertRepository
                .ListAsync(tenantId, null, null, false, alertPageSize, alertOffset, cancellationToken)
                .ConfigureAwait(false);

            var pageCount = 0;
            foreach (var alert in page)
            {
                pageCount++;

                if (alert.IsResolved)
                {
                    continue;
                }

                if (!alert.IsAcknowledged)
                {
                    unacknowledgedAlerts++;
                }

                if (alert.Severity == AlertSeverity.Critical || alert.Severity == AlertSeverity.Emergency)
                {
                    criticalAlerts++;
                }
            }

            // A short page means the repository has no further alerts to return.
            if (pageCount < alertPageSize)
            {
                break;
            }
        }

        // Find SLOs at risk (budget consumed >= 50% or burn rate >= 2x)
        var sloMap = enabledSlos.ToDictionary(s => s.SloId);
        var slosAtRisk = states
            .Where(s => sloMap.ContainsKey(s.SloId)
                && (s.BudgetConsumed >= budgetConsumedRiskLevel || s.BurnRate >= burnRateRiskLevel))
            .OrderByDescending(s => s.BudgetConsumed)
            .Take(maxSlosAtRisk)
            .Select(s => new SloWithStateResponse(
                Slo: SloResponse.FromDomain(sloMap[s.SloId]),
                State: SloStateResponse.FromDomain(s)))
            .ToList();

        return Results.Ok(new SloSummaryResponse(
            TotalSlos: slos.Count,
            EnabledSlos: enabledSlos.Count,
            ActiveAlerts: activeAlertCount,
            UnacknowledgedAlerts: unacknowledgedAlerts,
            CriticalAlerts: criticalAlerts,
            SlosAtRisk: slosAtRisk));
    }
    catch (InvalidOperationException ex)
    {
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
|
|
|
/// <summary>
/// Parses an SLO type name ("availability", "latency" or "throughput"), ignoring case.
/// </summary>
/// <param name="value">Text to parse; <c>null</c> is treated as unrecognised rather than throwing.</param>
/// <param name="type">The parsed type, or the default value when parsing fails.</param>
/// <returns><c>true</c> when <paramref name="value"/> names a known SLO type; otherwise <c>false</c>.</returns>
private static bool TryParseSloType(string? value, out SloType type)
{
    // Lower-case once and decide the result and the success flag in the same branch.
    switch (value?.ToLowerInvariant())
    {
        case "availability":
            type = SloType.Availability;
            return true;
        case "latency":
            type = SloType.Latency;
            return true;
        case "throughput":
            type = SloType.Throughput;
            return true;
        default:
            type = default;
            return false;
    }
}
|
|
|
|
/// <summary>
/// Parses an SLO evaluation window, ignoring case. Accepts the short forms "1h", "1d", "7d", "30d"
/// and the long forms "one_hour", "one_day", "seven_days", "thirty_days".
/// </summary>
/// <param name="value">Text to parse; <c>null</c> is treated as unrecognised rather than throwing.</param>
/// <param name="window">The parsed window, or the default value when parsing fails.</param>
/// <returns><c>true</c> when <paramref name="value"/> names a known window; otherwise <c>false</c>.</returns>
private static bool TryParseSloWindow(string? value, out SloWindow window)
{
    // Lower-case once and decide the result and the success flag in the same branch.
    switch (value?.ToLowerInvariant())
    {
        case "1h":
        case "one_hour":
            window = SloWindow.OneHour;
            return true;
        case "1d":
        case "one_day":
            window = SloWindow.OneDay;
            return true;
        case "7d":
        case "seven_days":
            window = SloWindow.SevenDays;
            return true;
        case "30d":
        case "thirty_days":
            window = SloWindow.ThirtyDays;
            return true;
        default:
            window = default;
            return false;
    }
}
|
|
|
|
/// <summary>
/// Parses an alert severity name ("info", "warning", "critical" or "emergency"), ignoring case.
/// </summary>
/// <param name="value">Text to parse; <c>null</c> is treated as unrecognised rather than throwing.</param>
/// <param name="severity">The parsed severity, or the default value when parsing fails.</param>
/// <returns><c>true</c> when <paramref name="value"/> names a known severity; otherwise <c>false</c>.</returns>
private static bool TryParseAlertSeverity(string? value, out AlertSeverity severity)
{
    // Lower-case once and decide the result and the success flag in the same branch.
    switch (value?.ToLowerInvariant())
    {
        case "info":
            severity = AlertSeverity.Info;
            return true;
        case "warning":
            severity = AlertSeverity.Warning;
            return true;
        case "critical":
            severity = AlertSeverity.Critical;
            return true;
        case "emergency":
            severity = AlertSeverity.Emergency;
            return true;
        default:
            severity = default;
            return false;
    }
}
|
|
}
|