Add unit tests for SBOM ingestion and transformation
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled

- Implement `SbomIngestServiceCollectionExtensionsTests` to verify the SBOM ingestion pipeline exports snapshots correctly.
- Create `SbomIngestTransformerTests` to ensure the transformation produces expected nodes and edges, including deduplication of license nodes and normalization of timestamps.
- Add `SbomSnapshotExporterTests` to test the export functionality for manifest, adjacency, nodes, and edges.
- Introduce `VexOverlayTransformerTests` to validate the transformation of VEX nodes and edges.
- Set up project file for the test project with necessary dependencies and configurations.
- Include JSON fixture files for testing purposes.
This commit is contained in:
master
2025-11-04 07:49:39 +02:00
parent f72c5c513a
commit 2eb6852d34
491 changed files with 39445 additions and 3917 deletions

View File

@@ -4,9 +4,13 @@ namespace StellaOps.Scheduler.WebService.PolicyRuns;
internal interface IPolicyRunService
{
Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken);
Task<IReadOnlyList<PolicyRunStatus>> ListAsync(string tenantId, PolicyRunQueryOptions options, CancellationToken cancellationToken);
Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken);
}
Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken);
Task<IReadOnlyList<PolicyRunStatus>> ListAsync(string tenantId, PolicyRunQueryOptions options, CancellationToken cancellationToken);
Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken);
Task<PolicyRunStatus?> RequestCancellationAsync(string tenantId, string runId, string? reason, CancellationToken cancellationToken);
Task<PolicyRunStatus> RetryAsync(string tenantId, string runId, string? requestedBy, CancellationToken cancellationToken);
}

View File

@@ -11,7 +11,7 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
private readonly List<PolicyRunStatus> _orderedRuns = new();
private readonly object _gate = new();
public Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken)
public Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentNullException.ThrowIfNull(request);
@@ -23,27 +23,30 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
var queuedAt = request.QueuedAt ?? DateTimeOffset.UtcNow;
var status = new PolicyRunStatus(
runId,
tenantId,
request.PolicyId ?? throw new ValidationException("policyId must be provided."),
request.PolicyVersion ?? throw new ValidationException("policyVersion must be provided."),
request.Mode,
PolicyRunExecutionStatus.Queued,
request.Priority,
queuedAt,
PolicyRunStats.Empty,
request.Inputs ?? PolicyRunInputs.Empty,
null,
null,
null,
null,
null,
0,
null,
null,
request.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
SchedulerSchemaVersions.PolicyRunStatus);
var status = new PolicyRunStatus(
runId,
tenantId,
request.PolicyId ?? throw new ValidationException("policyId must be provided."),
request.PolicyVersion ?? throw new ValidationException("policyVersion must be provided."),
request.Mode,
PolicyRunExecutionStatus.Queued,
request.Priority,
queuedAt,
PolicyRunStats.Empty,
request.Inputs ?? PolicyRunInputs.Empty,
null,
null,
null,
null,
null,
0,
null,
null,
request.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
cancellationRequested: false,
cancellationRequestedAt: null,
cancellationReason: null,
SchedulerSchemaVersions.PolicyRunStatus);
lock (_gate)
{
@@ -110,7 +113,7 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
return Task.FromResult<IReadOnlyList<PolicyRunStatus>>(result);
}
public Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
public Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
@@ -126,13 +129,121 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
return Task.FromResult<PolicyRunStatus?>(null);
}
return Task.FromResult<PolicyRunStatus?>(run);
}
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
var suffix = Guid.NewGuid().ToString("N")[..8];
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
}
}
return Task.FromResult<PolicyRunStatus?>(run);
}
public Task<PolicyRunStatus?> RequestCancellationAsync(string tenantId, string runId, string? reason, CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
cancellationToken.ThrowIfCancellationRequested();
PolicyRunStatus? updated;
lock (_gate)
{
if (!_runs.TryGetValue(runId, out var existing) || !string.Equals(existing.TenantId, tenantId, StringComparison.Ordinal))
{
return Task.FromResult<PolicyRunStatus?>(null);
}
if (IsTerminal(existing.Status))
{
return Task.FromResult<PolicyRunStatus?>(existing);
}
var cancellationReason = NormalizeCancellationReason(reason);
var now = DateTimeOffset.UtcNow;
updated = existing with
{
Status = PolicyRunExecutionStatus.Cancelled,
FinishedAt = now,
CancellationRequested = true,
CancellationRequestedAt = now,
CancellationReason = cancellationReason
};
_runs[runId] = updated;
var index = _orderedRuns.FindIndex(status => string.Equals(status.RunId, runId, StringComparison.Ordinal));
if (index >= 0)
{
_orderedRuns[index] = updated;
}
}
return Task.FromResult<PolicyRunStatus?>(updated);
}
public async Task<PolicyRunStatus> RetryAsync(string tenantId, string runId, string? requestedBy, CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
cancellationToken.ThrowIfCancellationRequested();
PolicyRunStatus existing;
lock (_gate)
{
if (!_runs.TryGetValue(runId, out var status) || !string.Equals(status.TenantId, tenantId, StringComparison.Ordinal))
{
throw new KeyNotFoundException($"Policy simulation {runId} was not found for tenant {tenantId}.");
}
if (!IsTerminal(status.Status))
{
throw new InvalidOperationException("Simulation is still in progress and cannot be retried.");
}
existing = status;
}
var metadataBuilder = (existing.Metadata ?? ImmutableSortedDictionary<string, string>.Empty).ToBuilder();
metadataBuilder["retry-of"] = runId;
var request = new PolicyRunRequest(
tenantId,
existing.PolicyId,
PolicyRunMode.Simulate,
existing.Inputs,
existing.Priority,
runId: null,
policyVersion: existing.PolicyVersion,
requestedBy: NormalizeActor(requestedBy),
queuedAt: DateTimeOffset.UtcNow,
correlationId: null,
metadata: metadataBuilder.ToImmutable());
return await EnqueueAsync(tenantId, request, cancellationToken).ConfigureAwait(false);
}
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
var suffix = Guid.NewGuid().ToString("N")[..8];
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
}
private static bool IsTerminal(PolicyRunExecutionStatus status)
=> status is PolicyRunExecutionStatus.Succeeded or PolicyRunExecutionStatus.Failed or PolicyRunExecutionStatus.Cancelled;
private static string? NormalizeCancellationReason(string? value)
{
if (string.IsNullOrWhiteSpace(value))
{
return null;
}
var trimmed = value.Trim();
const int maxLength = 512;
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
}
private static string? NormalizeActor(string? actor)
{
if (string.IsNullOrWhiteSpace(actor))
{
return null;
}
var trimmed = actor.Trim();
const int maxLength = 256;
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
}
}

View File

@@ -17,13 +17,19 @@ internal sealed class PolicyRunQueryOptions
public string? PolicyId { get; private set; }
public PolicyRunMode? Mode { get; private set; }
public PolicyRunMode? Mode { get; private set; }
public PolicyRunExecutionStatus? Status { get; private set; }
public DateTimeOffset? QueuedAfter { get; private set; }
public int Limit { get; private set; } = DefaultLimit;
public int Limit { get; private set; } = DefaultLimit;
public PolicyRunQueryOptions ForceMode(PolicyRunMode mode)
{
Mode = mode;
return this;
}
public static PolicyRunQueryOptions FromRequest(HttpRequest request)
{

View File

@@ -47,7 +47,7 @@ internal sealed class PolicyRunService : IPolicyRunService
if (existing is not null)
{
_logger.LogDebug("Policy run job already exists for tenant {TenantId} and run {RunId}.", tenantId, runId);
return ToStatus(existing, now);
return PolicyRunStatusFactory.Create(existing, now);
}
}
@@ -94,7 +94,7 @@ internal sealed class PolicyRunService : IPolicyRunService
job.RunId,
job.Mode);
return ToStatus(job, now);
return PolicyRunStatusFactory.Create(job, now);
}
public async Task<IReadOnlyList<PolicyRunStatus>> ListAsync(
@@ -122,79 +122,139 @@ internal sealed class PolicyRunService : IPolicyRunService
.ConfigureAwait(false);
var now = _timeProvider.GetUtcNow();
return jobs
.Select(job => ToStatus(job, now))
.ToList();
return jobs
.Select(job => PolicyRunStatusFactory.Create(job, now))
.ToList();
}
public async Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
cancellationToken.ThrowIfCancellationRequested();
var job = await _repository
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
.ConfigureAwait(false);
if (job is null)
{
return null;
}
var now = _timeProvider.GetUtcNow();
return ToStatus(job, now);
}
private static PolicyRunStatus ToStatus(PolicyRunJob job, DateTimeOffset now)
{
var status = MapExecutionStatus(job.Status);
var queuedAt = job.QueuedAt ?? job.CreatedAt;
var startedAt = job.SubmittedAt;
var finishedAt = job.CompletedAt ?? job.CancelledAt;
var metadata = job.Metadata ?? ImmutableSortedDictionary<string, string>.Empty;
var inputs = job.Inputs ?? PolicyRunInputs.Empty;
var policyVersion = job.PolicyVersion
?? throw new InvalidOperationException($"Policy run job '{job.Id}' is missing policyVersion.");
return new PolicyRunStatus(
job.RunId ?? job.Id,
job.TenantId,
job.PolicyId,
policyVersion,
job.Mode,
status,
job.Priority,
queuedAt,
job.Status == PolicyRunJobStatus.Pending ? null : startedAt,
finishedAt,
PolicyRunStats.Empty,
inputs,
determinismHash: null,
errorCode: null,
error: job.Status == PolicyRunJobStatus.Failed ? job.LastError : null,
attempts: job.AttemptCount,
traceId: null,
explainUri: null,
metadata,
SchedulerSchemaVersions.PolicyRunStatus);
}
private static PolicyRunExecutionStatus MapExecutionStatus(PolicyRunJobStatus status)
=> status switch
{
PolicyRunJobStatus.Pending => PolicyRunExecutionStatus.Queued,
PolicyRunJobStatus.Dispatching => PolicyRunExecutionStatus.Running,
PolicyRunJobStatus.Submitted => PolicyRunExecutionStatus.Running,
PolicyRunJobStatus.Completed => PolicyRunExecutionStatus.Succeeded,
PolicyRunJobStatus.Failed => PolicyRunExecutionStatus.Failed,
PolicyRunJobStatus.Cancelled => PolicyRunExecutionStatus.Cancelled,
_ => PolicyRunExecutionStatus.Queued
};
private static IReadOnlyCollection<PolicyRunJobStatus>? MapExecutionStatus(PolicyRunExecutionStatus status)
=> status switch
{
public async Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
cancellationToken.ThrowIfCancellationRequested();
var job = await _repository
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
.ConfigureAwait(false);
if (job is null)
{
return null;
}
var now = _timeProvider.GetUtcNow();
return PolicyRunStatusFactory.Create(job, now);
}
public async Task<PolicyRunStatus?> RequestCancellationAsync(
string tenantId,
string runId,
string? reason,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
cancellationToken.ThrowIfCancellationRequested();
var job = await _repository
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
.ConfigureAwait(false);
if (job is null)
{
return null;
}
var now = _timeProvider.GetUtcNow();
if (IsTerminal(job.Status))
{
return PolicyRunStatusFactory.Create(job, now);
}
if (job.CancellationRequested && string.Equals(job.CancellationReason, reason, StringComparison.Ordinal))
{
return PolicyRunStatusFactory.Create(job, now);
}
var updated = job with
{
CancellationRequested = true,
CancellationRequestedAt = now,
CancellationReason = NormalizeCancellationReason(reason),
UpdatedAt = now,
AvailableAt = now
};
var replaced = await _repository
.ReplaceAsync(updated, expectedLeaseOwner: job.LeaseOwner, cancellationToken: cancellationToken)
.ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning(
"Failed to persist cancellation request for policy run job {JobId} (runId={RunId}).",
job.Id,
job.RunId ?? "(pending)");
return PolicyRunStatusFactory.Create(job, now);
}
_logger.LogInformation(
"Cancellation requested for policy run job {JobId} (runId={RunId}, reason={Reason}).",
updated.Id,
updated.RunId ?? "(pending)",
updated.CancellationReason ?? "none");
return PolicyRunStatusFactory.Create(updated, now);
}
public async Task<PolicyRunStatus> RetryAsync(
string tenantId,
string runId,
string? requestedBy,
CancellationToken cancellationToken)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
cancellationToken.ThrowIfCancellationRequested();
var job = await _repository
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
.ConfigureAwait(false)
?? throw new KeyNotFoundException($"Policy simulation {runId} was not found for tenant {tenantId}.");
if (job.Mode != PolicyRunMode.Simulate)
{
throw new InvalidOperationException("Only simulation runs can be retried through this endpoint.");
}
if (!IsTerminal(job.Status))
{
throw new InvalidOperationException("Simulation is still in progress and cannot be retried.");
}
var now = _timeProvider.GetUtcNow();
var metadataBuilder = (job.Metadata ?? ImmutableSortedDictionary<string, string>.Empty).ToBuilder();
metadataBuilder["retry-of"] = runId;
var request = new PolicyRunRequest(
tenantId,
job.PolicyId,
PolicyRunMode.Simulate,
job.Inputs ?? PolicyRunInputs.Empty,
job.Priority,
runId: null,
policyVersion: job.PolicyVersion,
requestedBy: NormalizeActor(requestedBy),
queuedAt: now,
correlationId: job.CorrelationId,
metadata: metadataBuilder.ToImmutable());
return await EnqueueAsync(tenantId, request, cancellationToken).ConfigureAwait(false);
}
private static IReadOnlyCollection<PolicyRunJobStatus>? MapExecutionStatus(PolicyRunExecutionStatus status)
=> status switch
{
PolicyRunExecutionStatus.Queued => new[] { PolicyRunJobStatus.Pending },
PolicyRunExecutionStatus.Running => new[] { PolicyRunJobStatus.Dispatching, PolicyRunJobStatus.Submitted },
PolicyRunExecutionStatus.Succeeded => new[] { PolicyRunJobStatus.Completed },
@@ -202,12 +262,39 @@ internal sealed class PolicyRunService : IPolicyRunService
PolicyRunExecutionStatus.Cancelled => new[] { PolicyRunJobStatus.Cancelled },
PolicyRunExecutionStatus.ReplayPending => Array.Empty<PolicyRunJobStatus>(),
_ => null
};
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
var suffix = Guid.NewGuid().ToString("N")[..8];
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
}
}
};
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
var suffix = Guid.NewGuid().ToString("N")[..8];
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
}
private static bool IsTerminal(PolicyRunJobStatus status)
=> status is PolicyRunJobStatus.Completed or PolicyRunJobStatus.Failed or PolicyRunJobStatus.Cancelled;
private static string? NormalizeCancellationReason(string? reason)
{
if (string.IsNullOrWhiteSpace(reason))
{
return null;
}
var trimmed = reason.Trim();
const int maxLength = 512;
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
}
private static string? NormalizeActor(string? actor)
{
if (string.IsNullOrWhiteSpace(actor))
{
return null;
}
var trimmed = actor.Trim();
const int maxLength = 256;
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
}
}

View File

@@ -0,0 +1,363 @@
using System.Collections.Immutable;
using System.ComponentModel.DataAnnotations;
using System.Text.Json.Serialization;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using StellaOps.Auth.Abstractions;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.WebService.Auth;
using StellaOps.Scheduler.WebService.PolicyRuns;
namespace StellaOps.Scheduler.WebService.PolicySimulations;
internal static class PolicySimulationEndpointExtensions
{
private const string Scope = StellaOpsScopes.PolicySimulate;
public static void MapPolicySimulationEndpoints(this IEndpointRouteBuilder builder)
{
var group = builder.MapGroup("/api/v1/scheduler/policies/simulations");
group.MapGet("/", ListSimulationsAsync);
group.MapGet("/{simulationId}", GetSimulationAsync);
group.MapGet("/{simulationId}/stream", StreamSimulationAsync);
group.MapGet("/metrics", GetMetricsAsync);
group.MapPost("/", CreateSimulationAsync);
group.MapPost("/{simulationId}/cancel", CancelSimulationAsync);
group.MapPost("/{simulationId}/retry", RetrySimulationAsync);
}
private static async Task<IResult> ListSimulationsAsync(
HttpContext httpContext,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicyRunService policyRunService,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
var options = PolicyRunQueryOptions
.FromRequest(httpContext.Request)
.ForceMode(PolicyRunMode.Simulate);
var simulations = await policyRunService
.ListAsync(tenant.TenantId, options, cancellationToken)
.ConfigureAwait(false);
return Results.Ok(new PolicySimulationCollectionResponse(simulations));
}
catch (UnauthorizedAccessException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
}
catch (InvalidOperationException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
}
catch (ValidationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> GetSimulationAsync(
HttpContext httpContext,
string simulationId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicyRunService policyRunService,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
var simulation = await policyRunService
.GetAsync(tenant.TenantId, simulationId, cancellationToken)
.ConfigureAwait(false);
return simulation is null
? Results.NotFound()
: Results.Ok(new PolicySimulationResponse(simulation));
}
catch (UnauthorizedAccessException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
}
catch (InvalidOperationException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
}
catch (ValidationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> GetMetricsAsync(
HttpContext httpContext,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicySimulationMetricsProvider? metricsProvider,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
if (metricsProvider is null)
{
return Results.StatusCode(StatusCodes.Status501NotImplemented);
}
var metrics = await metricsProvider
.CaptureAsync(tenant.TenantId, cancellationToken)
.ConfigureAwait(false);
return Results.Ok(metrics);
}
catch (UnauthorizedAccessException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
}
catch (InvalidOperationException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
}
catch (ValidationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> CreateSimulationAsync(
HttpContext httpContext,
PolicySimulationCreateRequest request,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicyRunService policyRunService,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
var actor = SchedulerEndpointHelpers.ResolveActorId(httpContext);
if (string.IsNullOrWhiteSpace(request.PolicyId))
{
throw new ValidationException("policyId must be provided.");
}
if (request.PolicyVersion is null || request.PolicyVersion <= 0)
{
throw new ValidationException("policyVersion must be provided and greater than zero.");
}
var normalizedMetadata = NormalizeMetadata(request.Metadata);
var inputs = request.Inputs ?? PolicyRunInputs.Empty;
var policyRequest = new PolicyRunRequest(
tenant.TenantId,
request.PolicyId,
PolicyRunMode.Simulate,
inputs,
request.Priority,
runId: null,
policyVersion: request.PolicyVersion,
requestedBy: actor,
queuedAt: null,
correlationId: request.CorrelationId,
metadata: normalizedMetadata);
var status = await policyRunService
.EnqueueAsync(tenant.TenantId, policyRequest, cancellationToken)
.ConfigureAwait(false);
return Results.Created(
$"/api/v1/scheduler/policies/simulations/{status.RunId}",
new PolicySimulationResponse(status));
}
catch (UnauthorizedAccessException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
}
catch (InvalidOperationException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
}
catch (ValidationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> CancelSimulationAsync(
HttpContext httpContext,
string simulationId,
PolicySimulationCancelRequest? request,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicyRunService policyRunService,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
var cancellation = await policyRunService
.RequestCancellationAsync(tenant.TenantId, simulationId, request?.Reason, cancellationToken)
.ConfigureAwait(false);
return cancellation is null
? Results.NotFound()
: Results.Ok(new PolicySimulationResponse(cancellation));
}
catch (UnauthorizedAccessException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
}
catch (InvalidOperationException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
}
catch (ValidationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> RetrySimulationAsync(
HttpContext httpContext,
string simulationId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicyRunService policyRunService,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
var actor = SchedulerEndpointHelpers.ResolveActorId(httpContext);
var status = await policyRunService
.RetryAsync(tenant.TenantId, simulationId, actor, cancellationToken)
.ConfigureAwait(false);
return Results.Created(
$"/api/v1/scheduler/policies/simulations/{status.RunId}",
new PolicySimulationResponse(status));
}
catch (KeyNotFoundException)
{
return Results.NotFound();
}
catch (UnauthorizedAccessException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
}
catch (InvalidOperationException ex)
{
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status409Conflict);
}
catch (ValidationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task StreamSimulationAsync(
HttpContext httpContext,
string simulationId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IPolicyRunService policyRunService,
[FromServices] IPolicySimulationStreamCoordinator streamCoordinator,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, Scope);
var tenant = tenantAccessor.GetTenant(httpContext);
var simulation = await policyRunService
.GetAsync(tenant.TenantId, simulationId, cancellationToken)
.ConfigureAwait(false);
if (simulation is null)
{
await Results.NotFound().ExecuteAsync(httpContext);
return;
}
await streamCoordinator
.StreamAsync(httpContext, tenant.TenantId, simulation, cancellationToken)
.ConfigureAwait(false);
}
catch (UnauthorizedAccessException ex)
{
await Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized)
.ExecuteAsync(httpContext);
}
catch (InvalidOperationException ex)
{
await Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden)
.ExecuteAsync(httpContext);
}
catch (ValidationException ex)
{
await Results.BadRequest(new { error = ex.Message }).ExecuteAsync(httpContext);
}
}
private static ImmutableSortedDictionary<string, string>? NormalizeMetadata(IReadOnlyDictionary<string, string>? metadata)
{
if (metadata is null || metadata.Count == 0)
{
return null;
}
var builder = ImmutableSortedDictionary.CreateBuilder<string, string>(StringComparer.Ordinal);
foreach (var (key, value) in metadata)
{
var normalizedKey = key?.Trim();
var normalizedValue = value?.Trim();
if (string.IsNullOrEmpty(normalizedKey) || string.IsNullOrEmpty(normalizedValue))
{
continue;
}
var lowerKey = normalizedKey.ToLowerInvariant();
if (!builder.ContainsKey(lowerKey))
{
builder[lowerKey] = normalizedValue;
}
}
return builder.Count == 0 ? null : builder.ToImmutable();
}
}
internal sealed record PolicySimulationCreateRequest(
[property: JsonPropertyName("policyId")] string PolicyId,
[property: JsonPropertyName("policyVersion")] int? PolicyVersion,
[property: JsonPropertyName("priority")] PolicyRunPriority Priority = PolicyRunPriority.Normal,
[property: JsonPropertyName("correlationId")] string? CorrelationId = null,
[property: JsonPropertyName("metadata")] IReadOnlyDictionary<string, string>? Metadata = null,
[property: JsonPropertyName("inputs")] PolicyRunInputs? Inputs = null);
internal sealed record PolicySimulationCancelRequest(
[property: JsonPropertyName("reason")] string? Reason);
internal sealed record PolicySimulationCollectionResponse(
[property: JsonPropertyName("simulations")] IReadOnlyList<PolicyRunStatus> Simulations);
internal sealed record PolicySimulationResponse(
[property: JsonPropertyName("simulation")] PolicyRunStatus Simulation);

View File

@@ -0,0 +1,234 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using System.Linq;
using System.Text.Json.Serialization;
using System.Threading;
using System.Threading.Tasks;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
namespace StellaOps.Scheduler.WebService.PolicySimulations;
internal interface IPolicySimulationMetricsProvider
{
Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken);
}
internal interface IPolicySimulationMetricsRecorder
{
void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt);
}
internal sealed class PolicySimulationMetricsProvider : IPolicySimulationMetricsProvider, IPolicySimulationMetricsRecorder, IDisposable
{
private static readonly PolicyRunJobStatus[] QueueStatuses =
{
PolicyRunJobStatus.Pending,
PolicyRunJobStatus.Dispatching,
PolicyRunJobStatus.Submitted,
};
private static readonly PolicyRunJobStatus[] TerminalStatuses =
{
PolicyRunJobStatus.Completed,
PolicyRunJobStatus.Failed,
PolicyRunJobStatus.Cancelled,
};
private readonly IPolicyRunJobRepository _repository;
private readonly TimeProvider _timeProvider;
private readonly Meter _meter;
private readonly ObservableGauge<long> _queueGauge;
private readonly Histogram<double> _latencyHistogram;
private readonly object _snapshotLock = new();
private IReadOnlyDictionary<string, long> _latestQueueSnapshot = new Dictionary<string, long>(StringComparer.Ordinal);
private bool _disposed;
public PolicySimulationMetricsProvider(IPolicyRunJobRepository repository, TimeProvider? timeProvider = null)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_timeProvider = timeProvider ?? TimeProvider.System;
_meter = new Meter("StellaOps.Scheduler.WebService.PolicySimulations");
_queueGauge = _meter.CreateObservableGauge<long>(
"policy_simulation_queue_depth",
ObserveQueueDepth,
unit: "runs",
description: "Queued policy simulation jobs grouped by status.");
_latencyHistogram = _meter.CreateHistogram<double>(
"policy_simulation_latency",
unit: "s",
description: "End-to-end policy simulation latency (seconds).");
}
public async Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken)
{
cancellationToken.ThrowIfCancellationRequested();
if (string.IsNullOrWhiteSpace(tenantId))
{
throw new ArgumentException("Tenant id must be provided.", nameof(tenantId));
}
var queueCounts = new Dictionary<string, long>(StringComparer.OrdinalIgnoreCase);
long totalQueueDepth = 0;
foreach (var status in QueueStatuses)
{
var count = await _repository.CountAsync(
tenantId,
PolicyRunMode.Simulate,
new[] { status },
cancellationToken).ConfigureAwait(false);
queueCounts[status.ToString().ToLowerInvariant()] = count;
totalQueueDepth += count;
}
lock (_snapshotLock)
{
_latestQueueSnapshot = queueCounts;
}
var sampleSize = 200;
var recentJobs = await _repository.ListAsync(
tenantId,
policyId: null,
mode: PolicyRunMode.Simulate,
statuses: TerminalStatuses,
queuedAfter: null,
limit: sampleSize,
cancellationToken: cancellationToken).ConfigureAwait(false);
var durations = recentJobs
.Select(job => CalculateLatencySeconds(job, _timeProvider.GetUtcNow()))
.Where(duration => duration >= 0)
.OrderBy(duration => duration)
.ToArray();
var latencyMetrics = new PolicySimulationLatencyMetrics(
durations.Length,
Percentile(durations, 0.50),
Percentile(durations, 0.90),
Percentile(durations, 0.95),
Percentile(durations, 0.99),
Average(durations));
return new PolicySimulationMetricsResponse(
new PolicySimulationQueueDepth(totalQueueDepth, queueCounts),
latencyMetrics);
}
public void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt)
{
if (status is null)
{
throw new ArgumentNullException(nameof(status));
}
var latencySeconds = CalculateLatencySeconds(status, observedAt);
if (latencySeconds >= 0)
{
_latencyHistogram.Record(latencySeconds);
}
}
private IEnumerable<Measurement<long>> ObserveQueueDepth()
{
IReadOnlyDictionary<string, long> snapshot;
lock (_snapshotLock)
{
snapshot = _latestQueueSnapshot;
}
foreach (var pair in snapshot)
{
yield return new Measurement<long>(
pair.Value,
new KeyValuePair<string, object?>("status", pair.Key));
}
}
private static double CalculateLatencySeconds(PolicyRunJob job, DateTimeOffset now)
{
var started = job.QueuedAt ?? job.CreatedAt;
var finished = job.CompletedAt ?? job.CancelledAt ?? job.UpdatedAt;
if (started == default)
{
return -1;
}
var duration = (finished - started).TotalSeconds;
return duration < 0 ? 0 : duration;
}
private static double CalculateLatencySeconds(PolicyRunStatus status, DateTimeOffset now)
{
var started = status.QueuedAt;
var finished = status.FinishedAt ?? now;
if (started == default)
{
return -1;
}
var duration = (finished - started).TotalSeconds;
return duration < 0 ? 0 : duration;
}
private static double? Percentile(IReadOnlyList<double> values, double percentile)
{
if (values.Count == 0)
{
return null;
}
var position = percentile * (values.Count - 1);
var lowerIndex = (int)Math.Floor(position);
var upperIndex = (int)Math.Ceiling(position);
if (lowerIndex == upperIndex)
{
return Math.Round(values[lowerIndex], 4);
}
var fraction = position - lowerIndex;
var interpolated = values[lowerIndex] + (values[upperIndex] - values[lowerIndex]) * fraction;
return Math.Round(interpolated, 4);
}
private static double? Average(IReadOnlyList<double> values)
{
if (values.Count == 0)
{
return null;
}
var sum = values.Sum();
return Math.Round(sum / values.Count, 4);
}
public void Dispose()
{
if (_disposed)
{
return;
}
_meter.Dispose();
_disposed = true;
}
}
internal sealed record PolicySimulationMetricsResponse(
[property: JsonPropertyName("policy_simulation_queue_depth")] PolicySimulationQueueDepth QueueDepth,
[property: JsonPropertyName("policy_simulation_latency")] PolicySimulationLatencyMetrics Latency);
internal sealed record PolicySimulationQueueDepth(
[property: JsonPropertyName("total")] long Total,
[property: JsonPropertyName("by_status")] IReadOnlyDictionary<string, long> ByStatus);
internal sealed record PolicySimulationLatencyMetrics(
[property: JsonPropertyName("samples")] int Samples,
[property: JsonPropertyName("p50_seconds")] double? P50,
[property: JsonPropertyName("p90_seconds")] double? P90,
[property: JsonPropertyName("p95_seconds")] double? P95,
[property: JsonPropertyName("p99_seconds")] double? P99,
[property: JsonPropertyName("mean_seconds")] double? Mean);

View File

@@ -0,0 +1,198 @@
using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.WebService.PolicyRuns;
using StellaOps.Scheduler.WebService.Runs;
namespace StellaOps.Scheduler.WebService.PolicySimulations;
internal interface IPolicySimulationStreamCoordinator
{
Task StreamAsync(HttpContext context, string tenantId, PolicyRunStatus initialStatus, CancellationToken cancellationToken);
}
internal sealed class PolicySimulationStreamCoordinator : IPolicySimulationStreamCoordinator
{
private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);
private readonly IPolicyRunService _policyRunService;
private readonly IQueueLagSummaryProvider _queueLagProvider;
private readonly TimeProvider _timeProvider;
private readonly RunStreamOptions _options;
private readonly IPolicySimulationMetricsRecorder? _metricsRecorder;
private readonly ILogger<PolicySimulationStreamCoordinator> _logger;
public PolicySimulationStreamCoordinator(
IPolicyRunService policyRunService,
IQueueLagSummaryProvider queueLagProvider,
IOptions<RunStreamOptions> options,
TimeProvider? timeProvider,
ILogger<PolicySimulationStreamCoordinator> logger,
IPolicySimulationMetricsRecorder? metricsRecorder = null)
{
_policyRunService = policyRunService ?? throw new ArgumentNullException(nameof(policyRunService));
_queueLagProvider = queueLagProvider ?? throw new ArgumentNullException(nameof(queueLagProvider));
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.Validate();
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_metricsRecorder = metricsRecorder;
}
public async Task StreamAsync(HttpContext context, string tenantId, PolicyRunStatus initialStatus, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(context);
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
ArgumentNullException.ThrowIfNull(initialStatus);
ConfigureSseHeaders(context.Response);
await SseWriter.WriteRetryAsync(context.Response, _options.ReconnectDelay, cancellationToken).ConfigureAwait(false);
var last = initialStatus;
await SseWriter.WriteEventAsync(context.Response, "initial", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken).ConfigureAwait(false);
await SseWriter.WriteEventAsync(context.Response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
await SseWriter.WriteEventAsync(context.Response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
if (IsTerminal(last.Status))
{
_metricsRecorder?.RecordLatency(last, _timeProvider.GetUtcNow());
await SseWriter.WriteEventAsync(context.Response, "completed", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken).ConfigureAwait(false);
return;
}
using var pollTimer = new PeriodicTimer(_options.PollInterval);
using var queueTimer = new PeriodicTimer(_options.QueueLagInterval);
using var heartbeatTimer = new PeriodicTimer(_options.HeartbeatInterval);
try
{
while (!cancellationToken.IsCancellationRequested)
{
var pollTask = pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();
var queueTask = queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();
var heartbeatTask = heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();
var completed = await Task.WhenAny(pollTask, queueTask, heartbeatTask).ConfigureAwait(false);
if (completed == pollTask && await pollTask.ConfigureAwait(false))
{
var current = await _policyRunService
.GetAsync(tenantId, last.RunId, cancellationToken)
.ConfigureAwait(false);
if (current is null)
{
_logger.LogWarning("Policy simulation {RunId} disappeared while streaming.", last.RunId);
await SseWriter.WriteEventAsync(
context.Response,
"notFound",
new PolicySimulationNotFoundPayload(last.RunId),
SerializerOptions,
cancellationToken)
.ConfigureAwait(false);
break;
}
if (HasMeaningfulChange(last, current))
{
await SseWriter.WriteEventAsync(context.Response, "status", PolicySimulationPayload.From(current), SerializerOptions, cancellationToken)
.ConfigureAwait(false);
}
last = current;
if (IsTerminal(last.Status))
{
_metricsRecorder?.RecordLatency(last, _timeProvider.GetUtcNow());
await SseWriter.WriteEventAsync(context.Response, "completed", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken)
.ConfigureAwait(false);
break;
}
}
else if (completed == queueTask && await queueTask.ConfigureAwait(false))
{
var summary = _queueLagProvider.Capture();
await SseWriter.WriteEventAsync(context.Response, "queueLag", summary, SerializerOptions, cancellationToken)
.ConfigureAwait(false);
}
else if (completed == heartbeatTask && await heartbeatTask.ConfigureAwait(false))
{
await SseWriter.WriteEventAsync(context.Response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken)
.ConfigureAwait(false);
}
}
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
_logger.LogDebug("Policy simulation stream cancelled for run {RunId}.", last.RunId);
}
}
private static void ConfigureSseHeaders(HttpResponse response)
{
response.StatusCode = StatusCodes.Status200OK;
response.Headers.CacheControl = "no-store";
response.Headers["X-Accel-Buffering"] = "no";
response.Headers["Connection"] = "keep-alive";
response.ContentType = "text/event-stream";
}
private static bool HasMeaningfulChange(PolicyRunStatus previous, PolicyRunStatus current)
{
if (!EqualityComparer<PolicyRunExecutionStatus>.Default.Equals(previous.Status, current.Status))
{
return true;
}
if (!Nullable.Equals(previous.StartedAt, current.StartedAt) || !Nullable.Equals(previous.FinishedAt, current.FinishedAt))
{
return true;
}
if (previous.Attempts != current.Attempts)
{
return true;
}
if (!string.Equals(previous.Error, current.Error, StringComparison.Ordinal) ||
!string.Equals(previous.ErrorCode, current.ErrorCode, StringComparison.Ordinal) ||
!string.Equals(previous.DeterminismHash, current.DeterminismHash, StringComparison.Ordinal))
{
return true;
}
if (previous.CancellationRequested != current.CancellationRequested ||
!Nullable.Equals(previous.CancellationRequestedAt, current.CancellationRequestedAt) ||
!string.Equals(previous.CancellationReason, current.CancellationReason, StringComparison.Ordinal))
{
return true;
}
if (!EqualityComparer<PolicyRunStats>.Default.Equals(previous.Stats, current.Stats))
{
return true;
}
return false;
}
private static bool IsTerminal(PolicyRunExecutionStatus status)
=> status is PolicyRunExecutionStatus.Succeeded or PolicyRunExecutionStatus.Failed or PolicyRunExecutionStatus.Cancelled;
private sealed record PolicySimulationPayload(
[property: JsonPropertyName("simulation")] PolicyRunStatus Simulation)
{
public static PolicySimulationPayload From(PolicyRunStatus status) => new(status);
}
private sealed record PolicySimulationNotFoundPayload(
[property: JsonPropertyName("runId")] string RunId);
private sealed record HeartbeatPayload(
[property: JsonPropertyName("ts")] DateTimeOffset Timestamp)
{
public static HeartbeatPayload Create(DateTimeOffset timestamp) => new(timestamp);
}
}

View File

@@ -18,8 +18,9 @@ using StellaOps.Scheduler.WebService.GraphJobs;
using StellaOps.Scheduler.WebService.GraphJobs.Events;
using StellaOps.Scheduler.WebService.Schedules;
using StellaOps.Scheduler.WebService.Options;
using StellaOps.Scheduler.WebService.Runs;
using StellaOps.Scheduler.WebService.PolicyRuns;
using StellaOps.Scheduler.WebService.PolicySimulations;
using StellaOps.Scheduler.WebService.Runs;
var builder = WebApplication.CreateBuilder(args);
@@ -84,6 +85,8 @@ if (storageSection.Exists())
builder.Services.AddSchedulerMongoStorage(storageSection);
builder.Services.AddSingleton<IGraphJobStore, MongoGraphJobStore>();
builder.Services.AddSingleton<IPolicyRunService, PolicyRunService>();
builder.Services.AddSingleton<IPolicySimulationMetricsProvider, PolicySimulationMetricsProvider>();
builder.Services.AddSingleton<IPolicySimulationMetricsRecorder>(static sp => (IPolicySimulationMetricsRecorder)sp.GetRequiredService<IPolicySimulationMetricsProvider>());
}
else
{
@@ -117,6 +120,12 @@ builder.Services.AddOptions<SchedulerOptions>()
.Bind(builder.Configuration.GetSection("Scheduler"))
.PostConfigure(options => options.Validate());
builder.Services.AddSingleton<IQueueLagSummaryProvider, QueueLagSummaryProvider>();
builder.Services.AddSingleton<IRunStreamCoordinator, RunStreamCoordinator>();
builder.Services.AddSingleton<IPolicySimulationStreamCoordinator, PolicySimulationStreamCoordinator>();
builder.Services.AddOptions<RunStreamOptions>()
.Bind(builder.Configuration.GetSection("Scheduler:RunStream"));
var pluginHostOptions = SchedulerPluginHostFactory.Build(schedulerOptions.Plugins, builder.Environment.ContentRootPath);
builder.Services.AddSingleton(pluginHostOptions);
builder.Services.RegisterPluginRoutines(builder.Configuration, pluginHostOptions);
@@ -196,6 +205,7 @@ app.MapGraphJobEndpoints();
app.MapScheduleEndpoints();
app.MapRunEndpoints();
app.MapPolicyRunEndpoints();
app.MapPolicySimulationEndpoints();
app.MapSchedulerEventWebhookEndpoints();
app.Run();

View File

@@ -0,0 +1,60 @@
using System;
using System.Collections.Immutable;
using System.Linq;
using StellaOps.Scheduler.Queue;
namespace StellaOps.Scheduler.WebService.Runs;
internal interface IQueueLagSummaryProvider
{
QueueLagSummaryResponse Capture();
}
internal sealed class QueueLagSummaryProvider : IQueueLagSummaryProvider
{
private readonly TimeProvider _timeProvider;
public QueueLagSummaryProvider(TimeProvider? timeProvider = null)
{
_timeProvider = timeProvider ?? TimeProvider.System;
}
public QueueLagSummaryResponse Capture()
{
var samples = SchedulerQueueMetrics.CaptureDepthSamples();
if (samples.Count == 0)
{
return new QueueLagSummaryResponse(
_timeProvider.GetUtcNow(),
0,
0,
ImmutableArray<QueueLagEntry>.Empty);
}
var ordered = samples
.OrderBy(static sample => sample.Transport, StringComparer.Ordinal)
.ThenBy(static sample => sample.Queue, StringComparer.Ordinal)
.ToArray();
var builder = ImmutableArray.CreateBuilder<QueueLagEntry>(ordered.Length);
long totalDepth = 0;
long maxDepth = 0;
foreach (var sample in ordered)
{
totalDepth += sample.Depth;
if (sample.Depth > maxDepth)
{
maxDepth = sample.Depth;
}
builder.Add(new QueueLagEntry(sample.Transport, sample.Queue, sample.Depth));
}
return new QueueLagSummaryResponse(
_timeProvider.GetUtcNow(),
totalDepth,
maxDepth,
builder.ToImmutable());
}
}

View File

@@ -10,8 +10,9 @@ internal sealed record RunCreateRequest(
[property: JsonPropertyName("reason")] RunReason? Reason = null,
[property: JsonPropertyName("correlationId")] string? CorrelationId = null);
internal sealed record RunCollectionResponse(
[property: JsonPropertyName("runs")] IReadOnlyList<Run> Runs);
internal sealed record RunCollectionResponse(
[property: JsonPropertyName("runs")] IReadOnlyList<Run> Runs,
[property: JsonPropertyName("nextCursor")] string? NextCursor = null);
internal sealed record RunResponse(
[property: JsonPropertyName("run")] Run Run);
@@ -31,10 +32,24 @@ internal sealed record ImpactPreviewResponse(
[property: JsonPropertyName("snapshotId")] string? SnapshotId,
[property: JsonPropertyName("sample")] ImmutableArray<ImpactPreviewSample> Sample);
internal sealed record ImpactPreviewSample(
[property: JsonPropertyName("imageDigest")] string ImageDigest,
[property: JsonPropertyName("registry")] string Registry,
[property: JsonPropertyName("repository")] string Repository,
[property: JsonPropertyName("namespaces")] ImmutableArray<string> Namespaces,
[property: JsonPropertyName("tags")] ImmutableArray<string> Tags,
[property: JsonPropertyName("usedByEntrypoint")] bool UsedByEntrypoint);
internal sealed record ImpactPreviewSample(
[property: JsonPropertyName("imageDigest")] string ImageDigest,
[property: JsonPropertyName("registry")] string Registry,
[property: JsonPropertyName("repository")] string Repository,
[property: JsonPropertyName("namespaces")] ImmutableArray<string> Namespaces,
[property: JsonPropertyName("tags")] ImmutableArray<string> Tags,
[property: JsonPropertyName("usedByEntrypoint")] bool UsedByEntrypoint);
internal sealed record RunDeltaCollectionResponse(
[property: JsonPropertyName("deltas")] ImmutableArray<DeltaSummary> Deltas);
internal sealed record QueueLagSummaryResponse(
[property: JsonPropertyName("capturedAt")] DateTimeOffset CapturedAt,
[property: JsonPropertyName("totalDepth")] long TotalDepth,
[property: JsonPropertyName("maxDepth")] long MaxDepth,
[property: JsonPropertyName("queues")] ImmutableArray<QueueLagEntry> Queues);
internal sealed record QueueLagEntry(
[property: JsonPropertyName("transport")] string Transport,
[property: JsonPropertyName("queue")] string Queue,
[property: JsonPropertyName("depth")] long Depth);

View File

@@ -3,7 +3,8 @@ using System.Collections.Generic;
using System.Collections.Immutable;
using System.ComponentModel.DataAnnotations;
using System.Linq;
using Microsoft.AspNetCore.Http;
using System.Threading;
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Routing;
using Microsoft.Extensions.Primitives;
@@ -15,31 +16,57 @@ using StellaOps.Scheduler.WebService.Auth;
namespace StellaOps.Scheduler.WebService.Runs;
internal static class RunEndpoints
{
private const string ReadScope = "scheduler.runs.read";
private const string WriteScope = "scheduler.runs.write";
private const string PreviewScope = "scheduler.runs.preview";
internal static class RunEndpoints
{
private const string ReadScope = "scheduler.runs.read";
private const string WriteScope = "scheduler.runs.write";
private const string PreviewScope = "scheduler.runs.preview";
private const string ManageScope = "scheduler.runs.manage";
private const int DefaultRunListLimit = 50;
public static IEndpointRouteBuilder MapRunEndpoints(this IEndpointRouteBuilder routes)
{
var group = routes.MapGroup("/api/v1/scheduler/runs");
group.MapGet("/", ListRunsAsync);
group.MapGet("/queue/lag", GetQueueLagAsync);
group.MapGet("/{runId}/deltas", GetRunDeltasAsync);
group.MapGet("/{runId}/stream", StreamRunAsync);
group.MapGet("/{runId}", GetRunAsync);
group.MapPost("/", CreateRunAsync);
group.MapPost("/{runId}/cancel", CancelRunAsync);
group.MapPost("/{runId}/retry", RetryRunAsync);
group.MapPost("/preview", PreviewImpactAsync);
return routes;
}
public static IEndpointRouteBuilder MapRunEndpoints(this IEndpointRouteBuilder routes)
{
var group = routes.MapGroup("/api/v1/scheduler/runs");
group.MapGet("/", ListRunsAsync);
group.MapGet("/{runId}", GetRunAsync);
group.MapPost("/", CreateRunAsync);
group.MapPost("/{runId}/cancel", CancelRunAsync);
group.MapPost("/preview", PreviewImpactAsync);
return routes;
}
private static async Task<IResult> ListRunsAsync(
HttpContext httpContext,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository repository,
CancellationToken cancellationToken)
private static IResult GetQueueLagAsync(
HttpContext httpContext,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IQueueLagSummaryProvider queueLagProvider)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
tenantAccessor.GetTenant(httpContext);
var summary = queueLagProvider.Capture();
return Results.Ok(summary);
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> ListRunsAsync(
HttpContext httpContext,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository repository,
CancellationToken cancellationToken)
{
try
{
@@ -50,24 +77,35 @@ internal static class RunEndpoints
? scheduleValues.ToString().Trim()
: null;
var states = ParseRunStates(httpContext.Request.Query.TryGetValue("state", out var stateValues) ? stateValues : StringValues.Empty);
var createdAfter = SchedulerEndpointHelpers.TryParseDateTimeOffset(httpContext.Request.Query.TryGetValue("createdAfter", out var createdAfterValues) ? createdAfterValues.ToString() : null);
var limit = SchedulerEndpointHelpers.TryParsePositiveInt(httpContext.Request.Query.TryGetValue("limit", out var limitValues) ? limitValues.ToString() : null);
var sortAscending = httpContext.Request.Query.TryGetValue("sort", out var sortValues) &&
sortValues.Any(value => string.Equals(value, "asc", StringComparison.OrdinalIgnoreCase));
var options = new RunQueryOptions
{
ScheduleId = string.IsNullOrWhiteSpace(scheduleId) ? null : scheduleId,
States = states,
CreatedAfter = createdAfter,
Limit = limit,
SortAscending = sortAscending,
};
var runs = await repository.ListAsync(tenant.TenantId, options, cancellationToken: cancellationToken).ConfigureAwait(false);
return Results.Ok(new RunCollectionResponse(runs));
var states = ParseRunStates(httpContext.Request.Query.TryGetValue("state", out var stateValues) ? stateValues : StringValues.Empty);
var createdAfter = SchedulerEndpointHelpers.TryParseDateTimeOffset(httpContext.Request.Query.TryGetValue("createdAfter", out var createdAfterValues) ? createdAfterValues.ToString() : null);
var limit = SchedulerEndpointHelpers.TryParsePositiveInt(httpContext.Request.Query.TryGetValue("limit", out var limitValues) ? limitValues.ToString() : null);
var cursor = SchedulerEndpointHelpers.TryParseRunCursor(httpContext.Request.Query.TryGetValue("cursor", out var cursorValues) ? cursorValues.ToString() : null);
var sortAscending = httpContext.Request.Query.TryGetValue("sort", out var sortValues) &&
sortValues.Any(value => string.Equals(value, "asc", StringComparison.OrdinalIgnoreCase));
var appliedLimit = limit ?? DefaultRunListLimit;
var options = new RunQueryOptions
{
ScheduleId = string.IsNullOrWhiteSpace(scheduleId) ? null : scheduleId,
States = states,
CreatedAfter = createdAfter,
Cursor = cursor,
Limit = appliedLimit,
SortAscending = sortAscending,
};
var runs = await repository.ListAsync(tenant.TenantId, options, cancellationToken: cancellationToken).ConfigureAwait(false);
string? nextCursor = null;
if (runs.Count == appliedLimit && runs.Count > 0)
{
var last = runs[^1];
nextCursor = SchedulerEndpointHelpers.CreateRunCursor(last);
}
return Results.Ok(new RunCollectionResponse(runs, nextCursor));
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
@@ -75,32 +113,59 @@ internal static class RunEndpoints
}
}
private static async Task<IResult> GetRunAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository repository,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
var tenant = tenantAccessor.GetTenant(httpContext);
var run = await repository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
if (run is null)
{
return Results.NotFound();
}
return Results.Ok(new RunResponse(run));
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> GetRunAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository repository,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
var tenant = tenantAccessor.GetTenant(httpContext);
var run = await repository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
if (run is null)
{
return Results.NotFound();
}
return Results.Ok(new RunResponse(run));
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> GetRunDeltasAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository repository,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
var tenant = tenantAccessor.GetTenant(httpContext);
var run = await repository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
if (run is null)
{
return Results.NotFound();
}
return Results.Ok(new RunDeltaCollectionResponse(run.Deltas));
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> CreateRunAsync(
HttpContext httpContext,
@@ -116,7 +181,7 @@ internal static class RunEndpoints
{
try
{
scopeAuthorizer.EnsureScope(httpContext, WriteScope);
scopeAuthorizer.EnsureScope(httpContext, ManageScope);
var tenant = tenantAccessor.GetTenant(httpContext);
if (string.IsNullOrWhiteSpace(request.ScheduleId))
@@ -184,11 +249,11 @@ internal static class RunEndpoints
}
}
private static async Task<IResult> CancelRunAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
private static async Task<IResult> CancelRunAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository repository,
[FromServices] IRunSummaryService runSummaryService,
[FromServices] ISchedulerAuditService auditService,
@@ -243,9 +308,145 @@ internal static class RunEndpoints
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
return Results.BadRequest(new { error = ex.Message });
}
}
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task<IResult> RetryRunAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IScheduleRepository scheduleRepository,
[FromServices] IRunRepository runRepository,
[FromServices] IRunSummaryService runSummaryService,
[FromServices] ISchedulerAuditService auditService,
[FromServices] TimeProvider timeProvider,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, ManageScope);
var tenant = tenantAccessor.GetTenant(httpContext);
var existing = await runRepository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
if (existing is null)
{
return Results.NotFound();
}
if (string.IsNullOrWhiteSpace(existing.ScheduleId))
{
return Results.BadRequest(new { error = "Run cannot be retried because it is not associated with a schedule." });
}
if (!RunStateMachine.IsTerminal(existing.State))
{
return Results.Conflict(new { error = "Run is not in a terminal state and cannot be retried." });
}
var schedule = await scheduleRepository.GetAsync(tenant.TenantId, existing.ScheduleId!, cancellationToken: cancellationToken).ConfigureAwait(false);
if (schedule is null)
{
return Results.BadRequest(new { error = "Associated schedule no longer exists." });
}
var now = timeProvider.GetUtcNow();
var newRunId = SchedulerEndpointHelpers.GenerateIdentifier("run");
var baselineReason = existing.Reason ?? RunReason.Empty;
var manualReason = string.IsNullOrWhiteSpace(baselineReason.ManualReason)
? $"retry-of:{existing.Id}"
: $"{baselineReason.ManualReason};retry-of:{existing.Id}";
var newReason = new RunReason(
manualReason,
baselineReason.ConselierExportId,
baselineReason.ExcitorExportId,
baselineReason.Cursor)
{
ImpactWindowFrom = baselineReason.ImpactWindowFrom,
ImpactWindowTo = baselineReason.ImpactWindowTo
};
var retryRun = new Run(
newRunId,
tenant.TenantId,
RunTrigger.Manual,
RunState.Planning,
RunStats.Empty,
now,
newReason,
existing.ScheduleId,
retryOf: existing.Id);
await runRepository.InsertAsync(retryRun, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!string.IsNullOrWhiteSpace(retryRun.ScheduleId))
{
await runSummaryService.ProjectAsync(retryRun, cancellationToken).ConfigureAwait(false);
}
await auditService.WriteAsync(
new SchedulerAuditEvent(
tenant.TenantId,
"scheduler.run",
"retry",
SchedulerEndpointHelpers.ResolveAuditActor(httpContext),
RunId: retryRun.Id,
ScheduleId: retryRun.ScheduleId,
Metadata: BuildMetadata(
("state", retryRun.State.ToString().ToLowerInvariant()),
("retryOf", existing.Id),
("trigger", retryRun.Trigger.ToString().ToLowerInvariant()))),
cancellationToken).ConfigureAwait(false);
return Results.Created($"/api/v1/scheduler/runs/{retryRun.Id}", new RunResponse(retryRun));
}
catch (InvalidOperationException ex)
{
return Results.BadRequest(new { error = ex.Message });
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
return Results.BadRequest(new { error = ex.Message });
}
}
private static async Task StreamRunAsync(
HttpContext httpContext,
string runId,
[FromServices] ITenantContextAccessor tenantAccessor,
[FromServices] IScopeAuthorizer scopeAuthorizer,
[FromServices] IRunRepository runRepository,
[FromServices] IRunStreamCoordinator runStreamCoordinator,
CancellationToken cancellationToken)
{
try
{
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
var tenant = tenantAccessor.GetTenant(httpContext);
var run = await runRepository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
if (run is null)
{
await Results.NotFound().ExecuteAsync(httpContext);
return;
}
await runStreamCoordinator.StreamAsync(httpContext, tenant.TenantId, run, cancellationToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
// Client disconnected; nothing to do.
}
catch (Exception ex) when (ex is ArgumentException or ValidationException)
{
if (!httpContext.Response.HasStarted)
{
await Results.BadRequest(new { error = ex.Message }).ExecuteAsync(httpContext);
}
}
}
private static async Task<IResult> PreviewImpactAsync(
HttpContext httpContext,

View File

@@ -0,0 +1,225 @@
using System;
using System.Collections.Immutable;
using System.Linq;
using System.Text.Json;
using System.Text.Json.Serialization;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
namespace StellaOps.Scheduler.WebService.Runs;
internal interface IRunStreamCoordinator
{
Task StreamAsync(HttpContext context, string tenantId, Run initialRun, CancellationToken cancellationToken);
}
internal sealed class RunStreamCoordinator : IRunStreamCoordinator
{
private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);
private readonly IRunRepository _runRepository;
private readonly IQueueLagSummaryProvider _queueLagProvider;
private readonly TimeProvider _timeProvider;
private readonly ILogger<RunStreamCoordinator> _logger;
private readonly RunStreamOptions _options;
public RunStreamCoordinator(
IRunRepository runRepository,
IQueueLagSummaryProvider queueLagProvider,
IOptions<RunStreamOptions> options,
TimeProvider? timeProvider,
ILogger<RunStreamCoordinator> logger)
{
_runRepository = runRepository ?? throw new ArgumentNullException(nameof(runRepository));
_queueLagProvider = queueLagProvider ?? throw new ArgumentNullException(nameof(queueLagProvider));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.Validate();
}
public async Task StreamAsync(HttpContext context, string tenantId, Run initialRun, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(context);
ArgumentNullException.ThrowIfNull(initialRun);
var response = context.Response;
ConfigureSseHeaders(response);
await SseWriter.WriteRetryAsync(response, _options.ReconnectDelay, cancellationToken).ConfigureAwait(false);
var lastRun = initialRun;
await SseWriter.WriteEventAsync(response, "initial", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
await SseWriter.WriteEventAsync(response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
await SseWriter.WriteEventAsync(response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
if (RunStateMachine.IsTerminal(lastRun.State))
{
await SseWriter.WriteEventAsync(response, "completed", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
return;
}
using var pollTimer = new PeriodicTimer(_options.PollInterval);
using var queueTimer = new PeriodicTimer(_options.QueueLagInterval);
using var heartbeatTimer = new PeriodicTimer(_options.HeartbeatInterval);
try
{
while (!cancellationToken.IsCancellationRequested)
{
var pollTask = pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();
var queueTask = queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();
var heartbeatTask = heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();
var completed = await Task.WhenAny(pollTask, queueTask, heartbeatTask).ConfigureAwait(false);
if (completed == pollTask && await pollTask.ConfigureAwait(false))
{
var current = await _runRepository.GetAsync(tenantId, lastRun.Id, cancellationToken: cancellationToken).ConfigureAwait(false);
if (current is null)
{
_logger.LogWarning("Run {RunId} disappeared while streaming; signalling notFound event.", lastRun.Id);
await SseWriter.WriteEventAsync(response, "notFound", new RunNotFoundPayload(lastRun.Id), SerializerOptions, cancellationToken).ConfigureAwait(false);
break;
}
await EmitRunDifferencesAsync(response, lastRun, current, cancellationToken).ConfigureAwait(false);
lastRun = current;
if (RunStateMachine.IsTerminal(lastRun.State))
{
await SseWriter.WriteEventAsync(response, "completed", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
break;
}
}
else if (completed == queueTask && await queueTask.ConfigureAwait(false))
{
await SseWriter.WriteEventAsync(response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
}
else if (completed == heartbeatTask && await heartbeatTask.ConfigureAwait(false))
{
await SseWriter.WriteEventAsync(response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
}
}
}
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
{
_logger.LogDebug("Run stream cancelled for run {RunId}.", lastRun.Id);
}
}
private static void ConfigureSseHeaders(HttpResponse response)
{
response.StatusCode = StatusCodes.Status200OK;
response.Headers.CacheControl = "no-store";
response.Headers["X-Accel-Buffering"] = "no";
response.Headers["Connection"] = "keep-alive";
response.ContentType = "text/event-stream";
}
private async Task EmitRunDifferencesAsync(HttpResponse response, Run previous, Run current, CancellationToken cancellationToken)
{
var stateChanged = current.State != previous.State || current.StartedAt != previous.StartedAt || current.FinishedAt != previous.FinishedAt || !string.Equals(current.Error, previous.Error, StringComparison.Ordinal);
if (stateChanged)
{
await SseWriter.WriteEventAsync(response, "stateChanged", RunStateChangedPayload.From(current), SerializerOptions, cancellationToken).ConfigureAwait(false);
}
if (!ReferenceEquals(current.Stats, previous.Stats) && current.Stats != previous.Stats)
{
await SseWriter.WriteEventAsync(response, "segmentProgress", RunStatsPayload.From(current), SerializerOptions, cancellationToken).ConfigureAwait(false);
}
if (!current.Deltas.SequenceEqual(previous.Deltas))
{
await SseWriter.WriteEventAsync(response, "deltaSummary", new RunDeltaPayload(current.Id, current.Deltas), SerializerOptions, cancellationToken).ConfigureAwait(false);
}
}
private sealed record RunSnapshotPayload(
[property: JsonPropertyName("run")] Run Run)
{
public static RunSnapshotPayload From(Run run)
=> new(run);
}
private sealed record RunStateChangedPayload(
[property: JsonPropertyName("runId")] string RunId,
[property: JsonPropertyName("state")] string State,
[property: JsonPropertyName("startedAt")] DateTimeOffset? StartedAt,
[property: JsonPropertyName("finishedAt")] DateTimeOffset? FinishedAt,
[property: JsonPropertyName("error")] string? Error)
{
public static RunStateChangedPayload From(Run run)
=> new(
run.Id,
run.State.ToString().ToLowerInvariant(),
run.StartedAt,
run.FinishedAt,
run.Error);
}
private sealed record RunStatsPayload(
[property: JsonPropertyName("runId")] string RunId,
[property: JsonPropertyName("stats")] RunStats Stats)
{
public static RunStatsPayload From(Run run)
=> new(run.Id, run.Stats);
}
private sealed record RunDeltaPayload(
[property: JsonPropertyName("runId")] string RunId,
[property: JsonPropertyName("deltas")] ImmutableArray<DeltaSummary> Deltas);
private sealed record HeartbeatPayload(
[property: JsonPropertyName("ts")] DateTimeOffset Timestamp)
{
public static HeartbeatPayload Create(DateTimeOffset timestamp)
=> new(timestamp);
}
private sealed record RunNotFoundPayload(
[property: JsonPropertyName("runId")] string RunId);
}
internal sealed class RunStreamOptions
{
private static readonly TimeSpan MinimumInterval = TimeSpan.FromMilliseconds(100);
private static readonly TimeSpan MinimumReconnectDelay = TimeSpan.FromMilliseconds(500);
public TimeSpan PollInterval { get; set; } = TimeSpan.FromSeconds(2);
public TimeSpan QueueLagInterval { get; set; } = TimeSpan.FromSeconds(10);
public TimeSpan HeartbeatInterval { get; set; } = TimeSpan.FromSeconds(5);
public TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
public RunStreamOptions Validate()
{
if (PollInterval < MinimumInterval)
{
throw new ArgumentOutOfRangeException(nameof(PollInterval), PollInterval, "Poll interval must be at least 100ms.");
}
if (QueueLagInterval < MinimumInterval)
{
throw new ArgumentOutOfRangeException(nameof(QueueLagInterval), QueueLagInterval, "Queue lag interval must be at least 100ms.");
}
if (HeartbeatInterval < MinimumInterval)
{
throw new ArgumentOutOfRangeException(nameof(HeartbeatInterval), HeartbeatInterval, "Heartbeat interval must be at least 100ms.");
}
if (ReconnectDelay < MinimumReconnectDelay)
{
throw new ArgumentOutOfRangeException(nameof(ReconnectDelay), ReconnectDelay, "Reconnect delay must be at least 500ms.");
}
return this;
}
}

View File

@@ -0,0 +1,45 @@
using System;
using System.IO;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;
namespace StellaOps.Scheduler.WebService.Runs;
internal static class SseWriter
{
public static async Task WriteRetryAsync(HttpResponse response, TimeSpan reconnectDelay, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(response);
var milliseconds = (int)Math.Clamp(reconnectDelay.TotalMilliseconds, 1, int.MaxValue);
await response.WriteAsync($"retry: {milliseconds}\r\n\r\n", cancellationToken).ConfigureAwait(false);
await response.Body.FlushAsync(cancellationToken).ConfigureAwait(false);
}
public static async Task WriteEventAsync(HttpResponse response, string eventName, object payload, JsonSerializerOptions serializerOptions, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(response);
ArgumentNullException.ThrowIfNull(payload);
ArgumentNullException.ThrowIfNull(serializerOptions);
if (string.IsNullOrWhiteSpace(eventName))
{
throw new ArgumentException("Event name must be provided.", nameof(eventName));
}
await response.WriteAsync($"event: {eventName}\r\n", cancellationToken).ConfigureAwait(false);
var json = JsonSerializer.Serialize(payload, serializerOptions);
using var reader = new StringReader(json);
string? line;
while ((line = reader.ReadLine()) is not null)
{
await response.WriteAsync($"data: {line}\r\n", cancellationToken).ConfigureAwait(false);
}
await response.WriteAsync("\r\n", cancellationToken).ConfigureAwait(false);
await response.Body.FlushAsync(cancellationToken).ConfigureAwait(false);
}
}

View File

@@ -1,7 +1,9 @@
using System.ComponentModel.DataAnnotations;
using System.Globalization;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Services;
using System.ComponentModel.DataAnnotations;
using System.Globalization;
using System.Text;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Storage.Mongo.Services;
namespace StellaOps.Scheduler.WebService;
@@ -91,11 +93,11 @@ internal static class SchedulerEndpointHelpers
return null;
}
public static DateTimeOffset? TryParseDateTimeOffset(string? value)
{
if (string.IsNullOrWhiteSpace(value))
{
return null;
public static DateTimeOffset? TryParseDateTimeOffset(string? value)
{
if (string.IsNullOrWhiteSpace(value))
{
return null;
}
if (DateTimeOffset.TryParse(value, CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out var parsed))
@@ -114,14 +116,62 @@ internal static class SchedulerEndpointHelpers
throw new ArgumentException("Tenant identifier must be provided.", nameof(tenantId));
}
return new Selector(
selection.Scope,
tenantId,
selection.Namespaces,
selection.Repositories,
selection.Digests,
selection.IncludeTags,
selection.Labels,
selection.ResolvesTags);
}
}
return new Selector(
selection.Scope,
tenantId,
selection.Namespaces,
selection.Repositories,
selection.Digests,
selection.IncludeTags,
selection.Labels,
selection.ResolvesTags);
}
public static string CreateRunCursor(Run run)
{
ArgumentNullException.ThrowIfNull(run);
var payload = $"{run.CreatedAt.ToUniversalTime():O}|{run.Id}";
return Convert.ToBase64String(Encoding.UTF8.GetBytes(payload));
}
public static RunListCursor? TryParseRunCursor(string? value)
{
if (string.IsNullOrWhiteSpace(value))
{
return null;
}
var trimmed = value.Trim();
if (trimmed.Length == 0)
{
return null;
}
try
{
var bytes = Convert.FromBase64String(trimmed);
var decoded = Encoding.UTF8.GetString(bytes);
var parts = decoded.Split('|', 2, StringSplitOptions.TrimEntries);
if (parts.Length != 2)
{
throw new ValidationException($"Cursor '{value}' is not valid.");
}
if (!DateTimeOffset.TryParse(parts[0], CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out var timestamp))
{
throw new ValidationException($"Cursor '{value}' is not valid.");
}
if (string.IsNullOrWhiteSpace(parts[1]))
{
throw new ValidationException($"Cursor '{value}' is not valid.");
}
return new RunListCursor(timestamp.ToUniversalTime(), parts[1]);
}
catch (FormatException ex)
{
throw new ValidationException($"Cursor '{value}' is not valid.", ex);
}
}
}

View File

@@ -9,6 +9,7 @@
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Models/StellaOps.Scheduler.Models.csproj" />
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Storage.Mongo/StellaOps.Scheduler.Storage.Mongo.csproj" />
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.ImpactIndex/StellaOps.Scheduler.ImpactIndex.csproj" />
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Queue/StellaOps.Scheduler.Queue.csproj" />
<ProjectReference Include="../../__Libraries/StellaOps.Plugin/StellaOps.Plugin.csproj" />
<ProjectReference Include="../../Authority/StellaOps.Authority/StellaOps.Auth.Abstractions/StellaOps.Auth.Abstractions.csproj" />
<ProjectReference Include="../../Authority/StellaOps.Authority/StellaOps.Auth.ServerIntegration/StellaOps.Auth.ServerIntegration.csproj" />

View File

@@ -22,13 +22,13 @@
## StellaOps Console (Sprint 23)
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|----|--------|----------|------------|-------------|---------------|
| SCHED-CONSOLE-23-001 | TODO | Scheduler WebService Guild, BE-Base Platform Guild | SCHED-WEB-16-103, SCHED-WEB-20-001 | Extend runs APIs with live progress SSE endpoints (`/console/runs/{id}/stream`), queue lag summaries, diff metadata fetch, retry/cancel hooks with RBAC enforcement, and deterministic pagination for history views consumed by Console. | SSE emits heartbeats/backoff headers, progress payload schema documented, unauthorized actions blocked in integration tests, metrics/logs expose queue lag + correlation IDs. |
| SCHED-CONSOLE-23-001 | DONE (2025-11-03) | Scheduler WebService Guild, BE-Base Platform Guild | SCHED-WEB-16-103, SCHED-WEB-20-001 | Extend runs APIs with live progress SSE endpoints (`/console/runs/{id}/stream`), queue lag summaries, diff metadata fetch, retry/cancel hooks with RBAC enforcement, and deterministic pagination for history views consumed by Console. | SSE emits heartbeats/backoff headers, progress payload schema documented, unauthorized actions blocked in integration tests, metrics/logs expose queue lag + correlation IDs. |
## Policy Studio (Sprint 27)
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|----|--------|----------|------------|-------------|---------------|
| SCHED-CONSOLE-27-001 | TODO | Scheduler WebService Guild, Policy Registry Guild | SCHED-WEB-16-103, REGISTRY-API-27-005 | Provide policy batch simulation orchestration endpoints (`/policies/simulations` POST/GET) exposing run creation, shard status, SSE progress, cancellation, and retries with RBAC enforcement. | API handles shard lifecycle with SSE heartbeats + retry headers; unauthorized requests rejected; integration tests cover submit/cancel/resume flows. |
| SCHED-CONSOLE-27-002 | TODO | Scheduler WebService Guild, Observability Guild | SCHED-CONSOLE-27-001 | Emit telemetry endpoints/metrics (`policy_simulation_queue_depth`, `policy_simulation_latency`) and webhook callbacks for completion/failure consumed by Registry. | Metrics exposed via gateway, dashboards seeded, webhook contract documented, integration tests validate metrics emission. |
| SCHED-CONSOLE-27-001 | DONE (2025-11-03) | Scheduler WebService Guild, Policy Registry Guild | SCHED-WEB-16-103, REGISTRY-API-27-005 | Provide policy batch simulation orchestration endpoints (`/policies/simulations` POST/GET) exposing run creation, shard status, SSE progress, cancellation, and retries with RBAC enforcement. | API handles shard lifecycle with SSE heartbeats + retry headers; unauthorized requests rejected; integration tests cover submit/cancel/resume flows. |
| SCHED-CONSOLE-27-002 | DOING (2025-11-03) | Scheduler WebService Guild, Observability Guild | SCHED-CONSOLE-27-001 | Emit telemetry endpoints/metrics (`policy_simulation_queue_depth`, `policy_simulation_latency`) and webhook callbacks for completion/failure consumed by Registry. | Metrics exposed via gateway, dashboards seeded, webhook contract documented, integration tests validate metrics emission. |
## Vulnerability Explorer (Sprint 29)
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |

View File

@@ -6,11 +6,21 @@
| Method | Path | Description | Scopes |
| ------ | ---- | ----------- | ------ |
| `GET` | `/api/v1/scheduler/runs` | List runs for the current tenant (filter by schedule, state, createdAfter). | `scheduler.runs.read` |
| `GET` | `/api/v1/scheduler/runs/{runId}` | Retrieve run details. | `scheduler.runs.read` |
| `POST` | `/api/v1/scheduler/runs` | Create an ad-hoc run bound to an existing schedule. | `scheduler.runs.write` |
| `POST` | `/api/v1/scheduler/runs/{runId}/cancel` | Transition a run to `cancelled` when still in a non-terminal state. | `scheduler.runs.write` |
| `POST` | `/api/v1/scheduler/runs/preview` | Resolve impacted images using the ImpactIndex without enqueuing work. | `scheduler.runs.preview` |
| `GET` | `/api/v1/scheduler/runs` | List runs for the current tenant (filter by schedule, state, createdAfter, cursor). | `scheduler.runs.read` |
| `GET` | `/api/v1/scheduler/runs/{runId}` | Retrieve run details. | `scheduler.runs.read` |
| `GET` | `/api/v1/scheduler/runs/{runId}/deltas` | Fetch deterministic delta metadata for the specified run. | `scheduler.runs.read` |
| `GET` | `/api/v1/scheduler/runs/queue/lag` | Snapshot queue depth per transport/queue for console dashboards. | `scheduler.runs.read` |
| `GET` | `/api/v1/scheduler/runs/{runId}/stream` | Server-sent events (SSE) stream for live progress, queue lag, and heartbeats. | `scheduler.runs.read` |
| `POST` | `/api/v1/scheduler/runs` | Create an ad-hoc run bound to an existing schedule. | `scheduler.runs.write` |
| `POST` | `/api/v1/scheduler/runs/{runId}/cancel` | Transition a run to `cancelled` when still in a non-terminal state. | `scheduler.runs.manage` |
| `POST` | `/api/v1/scheduler/runs/{runId}/retry` | Clone a terminal run into a new manual retry, preserving provenance. | `scheduler.runs.manage` |
| `POST` | `/api/v1/scheduler/runs/preview` | Resolve impacted images using the ImpactIndex without enqueuing work. | `scheduler.runs.preview` |
| `GET` | `/api/v1/scheduler/policies/simulations` | List policy simulations for the current tenant (filters: policyId, status, since, limit). | `policy:simulate` |
| `GET` | `/api/v1/scheduler/policies/simulations/{simulationId}` | Retrieve simulation status snapshot. | `policy:simulate` |
| `GET` | `/api/v1/scheduler/policies/simulations/{simulationId}/stream` | SSE stream emitting simulation status, queue lag, and heartbeats. | `policy:simulate` |
| `POST` | `/api/v1/scheduler/policies/simulations` | Enqueue a policy simulation (mode=`simulate`) with optional SBOM inputs and metadata. | `policy:simulate` |
| `POST` | `/api/v1/scheduler/policies/simulations/{simulationId}/cancel` | Request cancellation for an in-flight simulation. | `policy:simulate` |
| `POST` | `/api/v1/scheduler/policies/simulations/{simulationId}/retry` | Clone a terminal simulation into a new run preserving inputs/metadata. | `policy:simulate` |
All endpoints require a tenant context (`X-Tenant-Id`) and the appropriate scheduler scopes. Development mode allows header-based auth; production deployments must rely on Authority-issued tokens (OpTok + DPoP).
@@ -70,12 +80,12 @@ GET /api/v1/scheduler/runs?scheduleId=sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234&state
```
```json
{
"runs": [
{
"schemaVersion": "scheduler.run@1",
"id": "run_c7b4e9d2f6a04f8784a40476d8a2f771",
"tenantId": "tenant-alpha",
{
"runs": [
{
"schemaVersion": "scheduler.run@1",
"id": "run_c7b4e9d2f6a04f8784a40476d8a2f771",
"tenantId": "tenant-alpha",
"scheduleId": "sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234",
"trigger": "manual",
"state": "planning",
@@ -93,11 +103,13 @@ GET /api/v1/scheduler/runs?scheduleId=sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234&state
"reason": {
"manualReason": "Nightly backfill"
},
"createdAt": "2025-10-26T03:12:45Z"
}
]
}
```
"createdAt": "2025-10-26T03:12:45Z"
}
]
}
```
When additional pages are available the response includes `"nextCursor": "<base64>"`. Clients pass this cursor via `?cursor=` to fetch the next deterministic slice (ordering = `createdAt desc, id desc`).
## Cancel Run
@@ -136,7 +148,33 @@ POST /api/v1/scheduler/runs/run_c7b4e9d2f6a04f8784a40476d8a2f771/cancel
## Impact Preview
`/api/v1/scheduler/runs/preview` resolves impacted images via the ImpactIndex without mutating state. When `scheduleId` is provided the schedule selector is reused; callers may alternatively supply an explicit selector.
`/api/v1/scheduler/runs/preview` resolves impacted images via the ImpactIndex without mutating state. When `scheduleId` is provided the schedule selector is reused; callers may alternatively supply an explicit selector.
## Retry Run
`POST /api/v1/scheduler/runs/{runId}/retry` clones a terminal run into a new manual run with `retryOf` pointing to the original identifier. Retry is scope-gated with `scheduler.runs.manage`; the new runs `reason.manualReason` gains a `retry-of:<runId>` suffix for provenance.
## Run deltas
`GET /api/v1/scheduler/runs/{runId}/deltas` returns an immutable, deterministically sorted array of delta summaries (`[imageDigest, severity slices, KEV hits, attestations]`).
## Queue lag snapshot
`GET /api/v1/scheduler/runs/queue/lag` exposes queue depth summaries for planner/runner transports. The payload includes `capturedAt`, `totalDepth`, `maxDepth`, and ordered queue entries (transport + queue + depth). Console uses this for backlog dashboards and alert thresholds.
## Live stream (SSE)
`GET /api/v1/scheduler/runs/{runId}/stream` emits server-sent events for:
- `initial` — full run snapshot
- `stateChanged` — state/started/finished transitions
- `segmentProgress` — stats updates
- `deltaSummary` — deltas available
- `queueLag` — periodic queue snapshots
- `heartbeat` — uptime keep-alive (default 5s)
- `completed` — terminal summary
The stream is tolerant to clients reconnecting (idempotent payloads, deterministic ordering) and honours tenant scope plus cancellation tokens.
```http
POST /api/v1/scheduler/runs/preview
@@ -178,6 +216,106 @@ POST /api/v1/scheduler/runs/preview
### Integration notes
* Run creation and cancellation produce audit entries under category `scheduler.run` with correlation metadata when provided.
* The preview endpoint relies on the ImpactIndex stub in development. Production deployments must register the concrete index implementation before use.
* Planner/worker orchestration tasks will wire run creation to queueing in SCHED-WORKER-16-201/202.
* Run creation and cancellation produce audit entries under category `scheduler.run` with correlation metadata when provided.
* The preview endpoint relies on the ImpactIndex stub in development. Production deployments must register the concrete index implementation before use.
* Planner/worker orchestration tasks will wire run creation to queueing in SCHED-WORKER-16-201/202.
## Policy simulations
The policy simulation APIs mirror the run endpoints but operate on policy-mode jobs (`mode=simulate`) scoped by tenant and RBAC (`policy:simulate`).
### Create simulation
```http
POST /api/v1/scheduler/policies/simulations
X-Tenant-Id: tenant-alpha
Authorization: Bearer <OpTok>
```
```json
{
"policyId": "P-7",
"policyVersion": 4,
"priority": "normal",
"metadata": {
"source": "console.review"
},
"inputs": {
"sbomSet": ["sbom:S-318", "sbom:S-42"],
"captureExplain": true
}
}
```
```json
HTTP/1.1 201 Created
Location: /api/v1/scheduler/policies/simulations/run:P-7:20251103T153000Z:e4d1a9b2
{
"simulation": {
"schemaVersion": "scheduler.policy-run-status@1",
"runId": "run:P-7:20251103T153000Z:e4d1a9b2",
"tenantId": "tenant-alpha",
"policyId": "P-7",
"policyVersion": 4,
"mode": "simulate",
"status": "queued",
"priority": "normal",
"queuedAt": "2025-11-03T15:30:00Z",
"stats": {
"components": 0,
"rulesFired": 0,
"findingsWritten": 0,
"vexOverrides": 0
},
"inputs": {
"sbomSet": ["sbom:S-318", "sbom:S-42"],
"captureExplain": true
}
}
}
```
Canonical payload lives in `samples/api/scheduler/policy-simulation-status.json`.
### List and fetch simulations
- `GET /api/v1/scheduler/policies/simulations?policyId=P-7&status=queued&limit=25`
- `GET /api/v1/scheduler/policies/simulations/{simulationId}`
The response envelope mirrors `policy-run-status` but uses `simulations` / `simulation` wrappers. All metadata keys are lower-case; retries append `retry-of=<priorRunId>` for provenance.
### Cancel and retry
- `POST /api/v1/scheduler/policies/simulations/{simulationId}/cancel`
- Marks the job as `cancellationRequested` and surfaces the reason. Worker execution honours this flag before leasing.
- `POST /api/v1/scheduler/policies/simulations/{simulationId}/retry`
- Clones a terminal simulation, preserving inputs/metadata and adding `metadata.retry-of` pointing to the original run ID. Returns `409 Conflict` when the simulation is not terminal.
### Live stream (SSE)
`GET /api/v1/scheduler/policies/simulations/{simulationId}/stream` emits:
- `retry` — reconnection hint (milliseconds) emitted before events.
- `initial` — current simulation snapshot.
- `status` — status/attempt/stat updates.
- `queueLag` — periodic queue depth summary (shares payload with run streams).
- `heartbeat` — keep-alive ping (default 5s; configurable under `Scheduler:RunStream`).
- `completed` — terminal summary (`succeeded`, `failed`, or `cancelled`).
- `notFound` — emitted if the run record disappears while streaming.
Heartbeats, queue lag summaries, and the reconnection directive are sent immediately after connection so Console clients receive deterministic telemetry when loading a simulation workspace.
### Metrics
```
GET /api/v1/scheduler/policies/simulations/metrics
X-Tenant-Id: tenant-alpha
Authorization: Bearer <OpTok>
```
Returns queue depth and latency summaries tailored for simulation dashboards and alerting. Response properties align with the metric names exposed via OTEL (`policy_simulation_queue_depth`, `policy_simulation_latency`). Canonical payload lives at `samples/api/scheduler/policy-simulation-metrics.json`.
- `policy_simulation_queue_depth.total` — pending simulation jobs (aggregate of `pending`, `dispatching`, `submitted`).
- `policy_simulation_latency.*` — latency percentiles (seconds) computed from the most recent terminal simulations.
> **Note:** When Mongo storage is not configured the metrics provider is disabled and the endpoint responds with `501 Not Implemented`.

View File

@@ -0,0 +1,12 @@
## Policy simulations
`/api/v1/scheduler/policies/simulations` orchestrates Policy Engine runs in `simulate` mode without mutating persisted findings.
- **Create** — `POST /api/v1/scheduler/policies/simulations` (scope `policy:simulate`) enqueues a simulation for `policyId`/`policyVersion`, respecting optional `metadata` and structured `inputs` (`sbomSet`, `advisoryCursor`, `vexCursor`, `captureExplain`). Returns `201 Created` with `simulation.runId` and status `queued`.
- **List/Get** — `GET /api/v1/scheduler/policies/simulations` and `/.../{simulationId}` expose `PolicyRunStatus` documents filtered to `mode=simulate`, including attempt counts, stats, and cancellation markers.
- **Cancel** — `POST /.../{simulationId}/cancel` records `cancellationRequested=true` (optional reason, timestamp) and immediately reflects the updated status; workers honour the flag on the next lease cycle.
- **Retry** — `POST /.../{simulationId}/retry` clones a terminal simulation (cancelled/failed/succeeded) into a fresh job preserving inputs/metadata. Non-terminal runs yield `409 Conflict`.
- **Stream** — `GET /.../{simulationId}/stream` emits SSE events (`initial`, `status`, `queueLag`, `heartbeat`, `completed`) with the latest `PolicyRunStatus`, enabling Console to render shard progress and cancellation state in real time.
Simulation APIs share the same deterministic pagination/metadata contracts as policy runs and surface queue depth snapshots via the existing scheduler queue metrics.

View File

@@ -0,0 +1,78 @@
# SCHED-CONSOLE-27-002 · Policy Simulation Telemetry & Webhooks
> Owners: Scheduler WebService Guild, Observability Guild
> Scope: Policy simulation metrics endpoint and completion webhooks feeding Registry/Console integrations.
## 1. Metrics endpoint refresher
- `GET /api/v1/scheduler/policies/simulations/metrics` (scope: `policy:simulate`)
- Returns queue depth grouped by status plus latency percentiles derived from the most recent sample window (default 200 terminal runs).
- Surface area is unchanged from the implementation in Sprint 27 week 1; consumers should continue to rely on the contract in `samples/api/scheduler/policy-simulation-metrics.json`.
- When backing storage is not Mongo the endpoint responds `501 Not Implemented`.
## 2. Completion webhooks
Scheduler Worker now emits policy simulation webhooks whenever a simulation reaches a terminal state (`succeeded`, `failed`, `cancelled`). Payloads are aligned with the SSE `completed` event shape and include idempotency headers so downstream systems can safely de-duplicate.
### 2.1 Configuration
```jsonc
// scheduler-worker.appsettings.json
{
"Scheduler": {
"Worker": {
"Policy": {
"Webhook": {
"Enabled": true,
"Endpoint": "https://registry.internal/hooks/policy-simulation",
"ApiKeyHeader": "X-StellaOps-Webhook-Key",
"ApiKey": "replace-me",
"TimeoutSeconds": 10
}
}
}
}
}
```
- `Enabled`: feature flag; disabled by default to preserve air-gap behaviour.
- `Endpoint`: absolute HTTPS endpoint; requests use `POST`.
- `ApiKeyHeader`/`ApiKey`: optional bearer for Registry verification.
- `TimeoutSeconds`: per-request timeout (defaults to 10s).
### 2.2 Headers
| Header | Purpose |
|------------------------|---------------------------------------|
| `X-StellaOps-Tenant` | Tenant identifier for the simulation. |
| `X-StellaOps-Run-Id` | Stable run id (use as idempotency key). |
| `X-StellaOps-Webhook-Key` | Optional API key as configured. |
### 2.3 Payload
See `samples/api/scheduler/policy-simulation-webhook.json` for a canonical example.
```json
{
"tenantId": "tenant-alpha",
"simulation": { /* PolicyRunStatus document */ },
"result": "failed",
"observedAt": "2025-11-03T20:05:12Z",
"latencySeconds": 14.287,
"reason": "policy engine timeout"
}
```
- `result`: `succeeded`, `failed`, `cancelled`, `running`, or `queued`. Terminal webhooks are emitted only for the first three.
- `latencySeconds`: bounded to four decimal places; derived from `finishedAt - queuedAt` when timestamps exist, else falls back to observer timestamp.
- `reason`: surfaced for failures (`error`) and cancellations (`cancellationReason`); omitted otherwise.
### 2.4 Delivery semantics
- Best effort with no retry from the worker — Registry should use `X-StellaOps-Run-Id` for idempotency.
- Failures emit WARN logs (prefix `Policy run job {JobId}`).
- Disabled configuration short-circuits without network calls (debug log only).
## 3. SSE compatibility
No changes were required on the streaming endpoint (`GET /api/v1/scheduler/policies/simulations/{id}/stream`); Console continues to receive `completed` events containing the same `PolicyRunStatus` payload that the webhook publishes.