Add unit tests for SBOM ingestion and transformation
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
- Implement `SbomIngestServiceCollectionExtensionsTests` to verify the SBOM ingestion pipeline exports snapshots correctly. - Create `SbomIngestTransformerTests` to ensure the transformation produces expected nodes and edges, including deduplication of license nodes and normalization of timestamps. - Add `SbomSnapshotExporterTests` to test the export functionality for manifest, adjacency, nodes, and edges. - Introduce `VexOverlayTransformerTests` to validate the transformation of VEX nodes and edges. - Set up project file for the test project with necessary dependencies and configurations. - Include JSON fixture files for testing purposes.
This commit is contained in:
@@ -4,9 +4,13 @@ namespace StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
|
||||
internal interface IPolicyRunService
|
||||
{
|
||||
Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken);
|
||||
|
||||
Task<IReadOnlyList<PolicyRunStatus>> ListAsync(string tenantId, PolicyRunQueryOptions options, CancellationToken cancellationToken);
|
||||
|
||||
Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken);
|
||||
}
|
||||
Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken);
|
||||
|
||||
Task<IReadOnlyList<PolicyRunStatus>> ListAsync(string tenantId, PolicyRunQueryOptions options, CancellationToken cancellationToken);
|
||||
|
||||
Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken);
|
||||
|
||||
Task<PolicyRunStatus?> RequestCancellationAsync(string tenantId, string runId, string? reason, CancellationToken cancellationToken);
|
||||
|
||||
Task<PolicyRunStatus> RetryAsync(string tenantId, string runId, string? requestedBy, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
private readonly List<PolicyRunStatus> _orderedRuns = new();
|
||||
private readonly object _gate = new();
|
||||
|
||||
public Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken)
|
||||
public Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
@@ -23,27 +23,30 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
|
||||
var queuedAt = request.QueuedAt ?? DateTimeOffset.UtcNow;
|
||||
|
||||
var status = new PolicyRunStatus(
|
||||
runId,
|
||||
tenantId,
|
||||
request.PolicyId ?? throw new ValidationException("policyId must be provided."),
|
||||
request.PolicyVersion ?? throw new ValidationException("policyVersion must be provided."),
|
||||
request.Mode,
|
||||
PolicyRunExecutionStatus.Queued,
|
||||
request.Priority,
|
||||
queuedAt,
|
||||
PolicyRunStats.Empty,
|
||||
request.Inputs ?? PolicyRunInputs.Empty,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0,
|
||||
null,
|
||||
null,
|
||||
request.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
|
||||
SchedulerSchemaVersions.PolicyRunStatus);
|
||||
var status = new PolicyRunStatus(
|
||||
runId,
|
||||
tenantId,
|
||||
request.PolicyId ?? throw new ValidationException("policyId must be provided."),
|
||||
request.PolicyVersion ?? throw new ValidationException("policyVersion must be provided."),
|
||||
request.Mode,
|
||||
PolicyRunExecutionStatus.Queued,
|
||||
request.Priority,
|
||||
queuedAt,
|
||||
PolicyRunStats.Empty,
|
||||
request.Inputs ?? PolicyRunInputs.Empty,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0,
|
||||
null,
|
||||
null,
|
||||
request.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
|
||||
cancellationRequested: false,
|
||||
cancellationRequestedAt: null,
|
||||
cancellationReason: null,
|
||||
SchedulerSchemaVersions.PolicyRunStatus);
|
||||
|
||||
lock (_gate)
|
||||
{
|
||||
@@ -110,7 +113,7 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
return Task.FromResult<IReadOnlyList<PolicyRunStatus>>(result);
|
||||
}
|
||||
|
||||
public Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
|
||||
public Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
@@ -126,13 +129,121 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
return Task.FromResult<PolicyRunStatus?>(null);
|
||||
}
|
||||
|
||||
return Task.FromResult<PolicyRunStatus?>(run);
|
||||
}
|
||||
|
||||
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
|
||||
{
|
||||
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
|
||||
var suffix = Guid.NewGuid().ToString("N")[..8];
|
||||
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
|
||||
}
|
||||
}
|
||||
return Task.FromResult<PolicyRunStatus?>(run);
|
||||
}
|
||||
|
||||
public Task<PolicyRunStatus?> RequestCancellationAsync(string tenantId, string runId, string? reason, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
PolicyRunStatus? updated;
|
||||
lock (_gate)
|
||||
{
|
||||
if (!_runs.TryGetValue(runId, out var existing) || !string.Equals(existing.TenantId, tenantId, StringComparison.Ordinal))
|
||||
{
|
||||
return Task.FromResult<PolicyRunStatus?>(null);
|
||||
}
|
||||
|
||||
if (IsTerminal(existing.Status))
|
||||
{
|
||||
return Task.FromResult<PolicyRunStatus?>(existing);
|
||||
}
|
||||
|
||||
var cancellationReason = NormalizeCancellationReason(reason);
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
updated = existing with
|
||||
{
|
||||
Status = PolicyRunExecutionStatus.Cancelled,
|
||||
FinishedAt = now,
|
||||
CancellationRequested = true,
|
||||
CancellationRequestedAt = now,
|
||||
CancellationReason = cancellationReason
|
||||
};
|
||||
|
||||
_runs[runId] = updated;
|
||||
var index = _orderedRuns.FindIndex(status => string.Equals(status.RunId, runId, StringComparison.Ordinal));
|
||||
if (index >= 0)
|
||||
{
|
||||
_orderedRuns[index] = updated;
|
||||
}
|
||||
}
|
||||
|
||||
return Task.FromResult<PolicyRunStatus?>(updated);
|
||||
}
|
||||
|
||||
public async Task<PolicyRunStatus> RetryAsync(string tenantId, string runId, string? requestedBy, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
PolicyRunStatus existing;
|
||||
lock (_gate)
|
||||
{
|
||||
if (!_runs.TryGetValue(runId, out var status) || !string.Equals(status.TenantId, tenantId, StringComparison.Ordinal))
|
||||
{
|
||||
throw new KeyNotFoundException($"Policy simulation {runId} was not found for tenant {tenantId}.");
|
||||
}
|
||||
|
||||
if (!IsTerminal(status.Status))
|
||||
{
|
||||
throw new InvalidOperationException("Simulation is still in progress and cannot be retried.");
|
||||
}
|
||||
|
||||
existing = status;
|
||||
}
|
||||
|
||||
var metadataBuilder = (existing.Metadata ?? ImmutableSortedDictionary<string, string>.Empty).ToBuilder();
|
||||
metadataBuilder["retry-of"] = runId;
|
||||
var request = new PolicyRunRequest(
|
||||
tenantId,
|
||||
existing.PolicyId,
|
||||
PolicyRunMode.Simulate,
|
||||
existing.Inputs,
|
||||
existing.Priority,
|
||||
runId: null,
|
||||
policyVersion: existing.PolicyVersion,
|
||||
requestedBy: NormalizeActor(requestedBy),
|
||||
queuedAt: DateTimeOffset.UtcNow,
|
||||
correlationId: null,
|
||||
metadata: metadataBuilder.ToImmutable());
|
||||
|
||||
return await EnqueueAsync(tenantId, request, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
|
||||
{
|
||||
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
|
||||
var suffix = Guid.NewGuid().ToString("N")[..8];
|
||||
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
|
||||
}
|
||||
|
||||
private static bool IsTerminal(PolicyRunExecutionStatus status)
|
||||
=> status is PolicyRunExecutionStatus.Succeeded or PolicyRunExecutionStatus.Failed or PolicyRunExecutionStatus.Cancelled;
|
||||
|
||||
private static string? NormalizeCancellationReason(string? value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var trimmed = value.Trim();
|
||||
const int maxLength = 512;
|
||||
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
|
||||
}
|
||||
|
||||
private static string? NormalizeActor(string? actor)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(actor))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var trimmed = actor.Trim();
|
||||
const int maxLength = 256;
|
||||
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,13 +17,19 @@ internal sealed class PolicyRunQueryOptions
|
||||
|
||||
public string? PolicyId { get; private set; }
|
||||
|
||||
public PolicyRunMode? Mode { get; private set; }
|
||||
public PolicyRunMode? Mode { get; private set; }
|
||||
|
||||
public PolicyRunExecutionStatus? Status { get; private set; }
|
||||
|
||||
public DateTimeOffset? QueuedAfter { get; private set; }
|
||||
|
||||
public int Limit { get; private set; } = DefaultLimit;
|
||||
public int Limit { get; private set; } = DefaultLimit;
|
||||
|
||||
public PolicyRunQueryOptions ForceMode(PolicyRunMode mode)
|
||||
{
|
||||
Mode = mode;
|
||||
return this;
|
||||
}
|
||||
|
||||
public static PolicyRunQueryOptions FromRequest(HttpRequest request)
|
||||
{
|
||||
|
||||
@@ -47,7 +47,7 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
if (existing is not null)
|
||||
{
|
||||
_logger.LogDebug("Policy run job already exists for tenant {TenantId} and run {RunId}.", tenantId, runId);
|
||||
return ToStatus(existing, now);
|
||||
return PolicyRunStatusFactory.Create(existing, now);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
job.RunId,
|
||||
job.Mode);
|
||||
|
||||
return ToStatus(job, now);
|
||||
return PolicyRunStatusFactory.Create(job, now);
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<PolicyRunStatus>> ListAsync(
|
||||
@@ -122,79 +122,139 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
.ConfigureAwait(false);
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
return jobs
|
||||
.Select(job => ToStatus(job, now))
|
||||
.ToList();
|
||||
return jobs
|
||||
.Select(job => PolicyRunStatusFactory.Create(job, now))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
public async Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
var job = await _repository
|
||||
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (job is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
return ToStatus(job, now);
|
||||
}
|
||||
|
||||
private static PolicyRunStatus ToStatus(PolicyRunJob job, DateTimeOffset now)
|
||||
{
|
||||
var status = MapExecutionStatus(job.Status);
|
||||
var queuedAt = job.QueuedAt ?? job.CreatedAt;
|
||||
var startedAt = job.SubmittedAt;
|
||||
var finishedAt = job.CompletedAt ?? job.CancelledAt;
|
||||
var metadata = job.Metadata ?? ImmutableSortedDictionary<string, string>.Empty;
|
||||
var inputs = job.Inputs ?? PolicyRunInputs.Empty;
|
||||
var policyVersion = job.PolicyVersion
|
||||
?? throw new InvalidOperationException($"Policy run job '{job.Id}' is missing policyVersion.");
|
||||
|
||||
return new PolicyRunStatus(
|
||||
job.RunId ?? job.Id,
|
||||
job.TenantId,
|
||||
job.PolicyId,
|
||||
policyVersion,
|
||||
job.Mode,
|
||||
status,
|
||||
job.Priority,
|
||||
queuedAt,
|
||||
job.Status == PolicyRunJobStatus.Pending ? null : startedAt,
|
||||
finishedAt,
|
||||
PolicyRunStats.Empty,
|
||||
inputs,
|
||||
determinismHash: null,
|
||||
errorCode: null,
|
||||
error: job.Status == PolicyRunJobStatus.Failed ? job.LastError : null,
|
||||
attempts: job.AttemptCount,
|
||||
traceId: null,
|
||||
explainUri: null,
|
||||
metadata,
|
||||
SchedulerSchemaVersions.PolicyRunStatus);
|
||||
}
|
||||
|
||||
private static PolicyRunExecutionStatus MapExecutionStatus(PolicyRunJobStatus status)
|
||||
=> status switch
|
||||
{
|
||||
PolicyRunJobStatus.Pending => PolicyRunExecutionStatus.Queued,
|
||||
PolicyRunJobStatus.Dispatching => PolicyRunExecutionStatus.Running,
|
||||
PolicyRunJobStatus.Submitted => PolicyRunExecutionStatus.Running,
|
||||
PolicyRunJobStatus.Completed => PolicyRunExecutionStatus.Succeeded,
|
||||
PolicyRunJobStatus.Failed => PolicyRunExecutionStatus.Failed,
|
||||
PolicyRunJobStatus.Cancelled => PolicyRunExecutionStatus.Cancelled,
|
||||
_ => PolicyRunExecutionStatus.Queued
|
||||
};
|
||||
|
||||
private static IReadOnlyCollection<PolicyRunJobStatus>? MapExecutionStatus(PolicyRunExecutionStatus status)
|
||||
=> status switch
|
||||
{
|
||||
public async Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
var job = await _repository
|
||||
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (job is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
return PolicyRunStatusFactory.Create(job, now);
|
||||
}
|
||||
|
||||
public async Task<PolicyRunStatus?> RequestCancellationAsync(
|
||||
string tenantId,
|
||||
string runId,
|
||||
string? reason,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
var job = await _repository
|
||||
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (job is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
if (IsTerminal(job.Status))
|
||||
{
|
||||
return PolicyRunStatusFactory.Create(job, now);
|
||||
}
|
||||
|
||||
if (job.CancellationRequested && string.Equals(job.CancellationReason, reason, StringComparison.Ordinal))
|
||||
{
|
||||
return PolicyRunStatusFactory.Create(job, now);
|
||||
}
|
||||
|
||||
var updated = job with
|
||||
{
|
||||
CancellationRequested = true,
|
||||
CancellationRequestedAt = now,
|
||||
CancellationReason = NormalizeCancellationReason(reason),
|
||||
UpdatedAt = now,
|
||||
AvailableAt = now
|
||||
};
|
||||
|
||||
var replaced = await _repository
|
||||
.ReplaceAsync(updated, expectedLeaseOwner: job.LeaseOwner, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Failed to persist cancellation request for policy run job {JobId} (runId={RunId}).",
|
||||
job.Id,
|
||||
job.RunId ?? "(pending)");
|
||||
return PolicyRunStatusFactory.Create(job, now);
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Cancellation requested for policy run job {JobId} (runId={RunId}, reason={Reason}).",
|
||||
updated.Id,
|
||||
updated.RunId ?? "(pending)",
|
||||
updated.CancellationReason ?? "none");
|
||||
|
||||
return PolicyRunStatusFactory.Create(updated, now);
|
||||
}
|
||||
|
||||
public async Task<PolicyRunStatus> RetryAsync(
|
||||
string tenantId,
|
||||
string runId,
|
||||
string? requestedBy,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
var job = await _repository
|
||||
.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false)
|
||||
?? throw new KeyNotFoundException($"Policy simulation {runId} was not found for tenant {tenantId}.");
|
||||
|
||||
if (job.Mode != PolicyRunMode.Simulate)
|
||||
{
|
||||
throw new InvalidOperationException("Only simulation runs can be retried through this endpoint.");
|
||||
}
|
||||
|
||||
if (!IsTerminal(job.Status))
|
||||
{
|
||||
throw new InvalidOperationException("Simulation is still in progress and cannot be retried.");
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var metadataBuilder = (job.Metadata ?? ImmutableSortedDictionary<string, string>.Empty).ToBuilder();
|
||||
metadataBuilder["retry-of"] = runId;
|
||||
|
||||
var request = new PolicyRunRequest(
|
||||
tenantId,
|
||||
job.PolicyId,
|
||||
PolicyRunMode.Simulate,
|
||||
job.Inputs ?? PolicyRunInputs.Empty,
|
||||
job.Priority,
|
||||
runId: null,
|
||||
policyVersion: job.PolicyVersion,
|
||||
requestedBy: NormalizeActor(requestedBy),
|
||||
queuedAt: now,
|
||||
correlationId: job.CorrelationId,
|
||||
metadata: metadataBuilder.ToImmutable());
|
||||
|
||||
return await EnqueueAsync(tenantId, request, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private static IReadOnlyCollection<PolicyRunJobStatus>? MapExecutionStatus(PolicyRunExecutionStatus status)
|
||||
=> status switch
|
||||
{
|
||||
PolicyRunExecutionStatus.Queued => new[] { PolicyRunJobStatus.Pending },
|
||||
PolicyRunExecutionStatus.Running => new[] { PolicyRunJobStatus.Dispatching, PolicyRunJobStatus.Submitted },
|
||||
PolicyRunExecutionStatus.Succeeded => new[] { PolicyRunJobStatus.Completed },
|
||||
@@ -202,12 +262,39 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
PolicyRunExecutionStatus.Cancelled => new[] { PolicyRunJobStatus.Cancelled },
|
||||
PolicyRunExecutionStatus.ReplayPending => Array.Empty<PolicyRunJobStatus>(),
|
||||
_ => null
|
||||
};
|
||||
|
||||
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
|
||||
{
|
||||
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
|
||||
var suffix = Guid.NewGuid().ToString("N")[..8];
|
||||
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
|
||||
{
|
||||
var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
|
||||
var suffix = Guid.NewGuid().ToString("N")[..8];
|
||||
return $"run:{normalizedPolicyId}:{timestamp:yyyyMMddTHHmmssZ}:{suffix}";
|
||||
}
|
||||
|
||||
private static bool IsTerminal(PolicyRunJobStatus status)
|
||||
=> status is PolicyRunJobStatus.Completed or PolicyRunJobStatus.Failed or PolicyRunJobStatus.Cancelled;
|
||||
|
||||
private static string? NormalizeCancellationReason(string? reason)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(reason))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var trimmed = reason.Trim();
|
||||
const int maxLength = 512;
|
||||
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
|
||||
}
|
||||
|
||||
private static string? NormalizeActor(string? actor)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(actor))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var trimmed = actor.Trim();
|
||||
const int maxLength = 256;
|
||||
return trimmed.Length > maxLength ? trimmed[..maxLength] : trimmed;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,363 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using StellaOps.Auth.Abstractions;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.WebService.Auth;
|
||||
using StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
|
||||
internal static class PolicySimulationEndpointExtensions
|
||||
{
|
||||
private const string Scope = StellaOpsScopes.PolicySimulate;
|
||||
|
||||
public static void MapPolicySimulationEndpoints(this IEndpointRouteBuilder builder)
|
||||
{
|
||||
var group = builder.MapGroup("/api/v1/scheduler/policies/simulations");
|
||||
|
||||
group.MapGet("/", ListSimulationsAsync);
|
||||
group.MapGet("/{simulationId}", GetSimulationAsync);
|
||||
group.MapGet("/{simulationId}/stream", StreamSimulationAsync);
|
||||
group.MapGet("/metrics", GetMetricsAsync);
|
||||
group.MapPost("/", CreateSimulationAsync);
|
||||
group.MapPost("/{simulationId}/cancel", CancelSimulationAsync);
|
||||
group.MapPost("/{simulationId}/retry", RetrySimulationAsync);
|
||||
}
|
||||
|
||||
private static async Task<IResult> ListSimulationsAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicyRunService policyRunService,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
var options = PolicyRunQueryOptions
|
||||
.FromRequest(httpContext.Request)
|
||||
.ForceMode(PolicyRunMode.Simulate);
|
||||
|
||||
var simulations = await policyRunService
|
||||
.ListAsync(tenant.TenantId, options, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return Results.Ok(new PolicySimulationCollectionResponse(simulations));
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> GetSimulationAsync(
|
||||
HttpContext httpContext,
|
||||
string simulationId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicyRunService policyRunService,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var simulation = await policyRunService
|
||||
.GetAsync(tenant.TenantId, simulationId, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return simulation is null
|
||||
? Results.NotFound()
|
||||
: Results.Ok(new PolicySimulationResponse(simulation));
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> GetMetricsAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicySimulationMetricsProvider? metricsProvider,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
if (metricsProvider is null)
|
||||
{
|
||||
return Results.StatusCode(StatusCodes.Status501NotImplemented);
|
||||
}
|
||||
|
||||
var metrics = await metricsProvider
|
||||
.CaptureAsync(tenant.TenantId, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return Results.Ok(metrics);
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> CreateSimulationAsync(
|
||||
HttpContext httpContext,
|
||||
PolicySimulationCreateRequest request,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicyRunService policyRunService,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
var actor = SchedulerEndpointHelpers.ResolveActorId(httpContext);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(request.PolicyId))
|
||||
{
|
||||
throw new ValidationException("policyId must be provided.");
|
||||
}
|
||||
|
||||
if (request.PolicyVersion is null || request.PolicyVersion <= 0)
|
||||
{
|
||||
throw new ValidationException("policyVersion must be provided and greater than zero.");
|
||||
}
|
||||
|
||||
var normalizedMetadata = NormalizeMetadata(request.Metadata);
|
||||
var inputs = request.Inputs ?? PolicyRunInputs.Empty;
|
||||
|
||||
var policyRequest = new PolicyRunRequest(
|
||||
tenant.TenantId,
|
||||
request.PolicyId,
|
||||
PolicyRunMode.Simulate,
|
||||
inputs,
|
||||
request.Priority,
|
||||
runId: null,
|
||||
policyVersion: request.PolicyVersion,
|
||||
requestedBy: actor,
|
||||
queuedAt: null,
|
||||
correlationId: request.CorrelationId,
|
||||
metadata: normalizedMetadata);
|
||||
|
||||
var status = await policyRunService
|
||||
.EnqueueAsync(tenant.TenantId, policyRequest, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return Results.Created(
|
||||
$"/api/v1/scheduler/policies/simulations/{status.RunId}",
|
||||
new PolicySimulationResponse(status));
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> CancelSimulationAsync(
|
||||
HttpContext httpContext,
|
||||
string simulationId,
|
||||
PolicySimulationCancelRequest? request,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicyRunService policyRunService,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
var cancellation = await policyRunService
|
||||
.RequestCancellationAsync(tenant.TenantId, simulationId, request?.Reason, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return cancellation is null
|
||||
? Results.NotFound()
|
||||
: Results.Ok(new PolicySimulationResponse(cancellation));
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> RetrySimulationAsync(
|
||||
HttpContext httpContext,
|
||||
string simulationId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicyRunService policyRunService,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
var actor = SchedulerEndpointHelpers.ResolveActorId(httpContext);
|
||||
|
||||
var status = await policyRunService
|
||||
.RetryAsync(tenant.TenantId, simulationId, actor, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return Results.Created(
|
||||
$"/api/v1/scheduler/policies/simulations/{status.RunId}",
|
||||
new PolicySimulationResponse(status));
|
||||
}
|
||||
catch (KeyNotFoundException)
|
||||
{
|
||||
return Results.NotFound();
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status409Conflict);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task StreamSimulationAsync(
|
||||
HttpContext httpContext,
|
||||
string simulationId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IPolicyRunService policyRunService,
|
||||
[FromServices] IPolicySimulationStreamCoordinator streamCoordinator,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, Scope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var simulation = await policyRunService
|
||||
.GetAsync(tenant.TenantId, simulationId, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (simulation is null)
|
||||
{
|
||||
await Results.NotFound().ExecuteAsync(httpContext);
|
||||
return;
|
||||
}
|
||||
|
||||
await streamCoordinator
|
||||
.StreamAsync(httpContext, tenant.TenantId, simulation, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (UnauthorizedAccessException ex)
|
||||
{
|
||||
await Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized)
|
||||
.ExecuteAsync(httpContext);
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
await Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden)
|
||||
.ExecuteAsync(httpContext);
|
||||
}
|
||||
catch (ValidationException ex)
|
||||
{
|
||||
await Results.BadRequest(new { error = ex.Message }).ExecuteAsync(httpContext);
|
||||
}
|
||||
}
|
||||
|
||||
private static ImmutableSortedDictionary<string, string>? NormalizeMetadata(IReadOnlyDictionary<string, string>? metadata)
|
||||
{
|
||||
if (metadata is null || metadata.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var builder = ImmutableSortedDictionary.CreateBuilder<string, string>(StringComparer.Ordinal);
|
||||
foreach (var (key, value) in metadata)
|
||||
{
|
||||
var normalizedKey = key?.Trim();
|
||||
var normalizedValue = value?.Trim();
|
||||
if (string.IsNullOrEmpty(normalizedKey) || string.IsNullOrEmpty(normalizedValue))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var lowerKey = normalizedKey.ToLowerInvariant();
|
||||
if (!builder.ContainsKey(lowerKey))
|
||||
{
|
||||
builder[lowerKey] = normalizedValue;
|
||||
}
|
||||
}
|
||||
|
||||
return builder.Count == 0 ? null : builder.ToImmutable();
|
||||
}
|
||||
}
|
||||
|
||||
internal sealed record PolicySimulationCreateRequest(
|
||||
[property: JsonPropertyName("policyId")] string PolicyId,
|
||||
[property: JsonPropertyName("policyVersion")] int? PolicyVersion,
|
||||
[property: JsonPropertyName("priority")] PolicyRunPriority Priority = PolicyRunPriority.Normal,
|
||||
[property: JsonPropertyName("correlationId")] string? CorrelationId = null,
|
||||
[property: JsonPropertyName("metadata")] IReadOnlyDictionary<string, string>? Metadata = null,
|
||||
[property: JsonPropertyName("inputs")] PolicyRunInputs? Inputs = null);
|
||||
|
||||
internal sealed record PolicySimulationCancelRequest(
|
||||
[property: JsonPropertyName("reason")] string? Reason);
|
||||
|
||||
internal sealed record PolicySimulationCollectionResponse(
|
||||
[property: JsonPropertyName("simulations")] IReadOnlyList<PolicyRunStatus> Simulations);
|
||||
|
||||
internal sealed record PolicySimulationResponse(
|
||||
[property: JsonPropertyName("simulation")] PolicyRunStatus Simulation);
|
||||
@@ -0,0 +1,234 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.Metrics;
|
||||
using System.Linq;
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
|
||||
internal interface IPolicySimulationMetricsProvider
|
||||
{
|
||||
Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
internal interface IPolicySimulationMetricsRecorder
|
||||
{
|
||||
void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt);
|
||||
}
|
||||
|
||||
internal sealed class PolicySimulationMetricsProvider : IPolicySimulationMetricsProvider, IPolicySimulationMetricsRecorder, IDisposable
|
||||
{
|
||||
private static readonly PolicyRunJobStatus[] QueueStatuses =
|
||||
{
|
||||
PolicyRunJobStatus.Pending,
|
||||
PolicyRunJobStatus.Dispatching,
|
||||
PolicyRunJobStatus.Submitted,
|
||||
};
|
||||
|
||||
private static readonly PolicyRunJobStatus[] TerminalStatuses =
|
||||
{
|
||||
PolicyRunJobStatus.Completed,
|
||||
PolicyRunJobStatus.Failed,
|
||||
PolicyRunJobStatus.Cancelled,
|
||||
};
|
||||
|
||||
private readonly IPolicyRunJobRepository _repository;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly Meter _meter;
|
||||
private readonly ObservableGauge<long> _queueGauge;
|
||||
private readonly Histogram<double> _latencyHistogram;
|
||||
private readonly object _snapshotLock = new();
|
||||
private IReadOnlyDictionary<string, long> _latestQueueSnapshot = new Dictionary<string, long>(StringComparer.Ordinal);
|
||||
private bool _disposed;
|
||||
|
||||
public PolicySimulationMetricsProvider(IPolicyRunJobRepository repository, TimeProvider? timeProvider = null)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_meter = new Meter("StellaOps.Scheduler.WebService.PolicySimulations");
|
||||
_queueGauge = _meter.CreateObservableGauge<long>(
|
||||
"policy_simulation_queue_depth",
|
||||
ObserveQueueDepth,
|
||||
unit: "runs",
|
||||
description: "Queued policy simulation jobs grouped by status.");
|
||||
_latencyHistogram = _meter.CreateHistogram<double>(
|
||||
"policy_simulation_latency",
|
||||
unit: "s",
|
||||
description: "End-to-end policy simulation latency (seconds).");
|
||||
}
|
||||
|
||||
public async Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
if (string.IsNullOrWhiteSpace(tenantId))
|
||||
{
|
||||
throw new ArgumentException("Tenant id must be provided.", nameof(tenantId));
|
||||
}
|
||||
|
||||
var queueCounts = new Dictionary<string, long>(StringComparer.OrdinalIgnoreCase);
|
||||
long totalQueueDepth = 0;
|
||||
|
||||
foreach (var status in QueueStatuses)
|
||||
{
|
||||
var count = await _repository.CountAsync(
|
||||
tenantId,
|
||||
PolicyRunMode.Simulate,
|
||||
new[] { status },
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
queueCounts[status.ToString().ToLowerInvariant()] = count;
|
||||
totalQueueDepth += count;
|
||||
}
|
||||
|
||||
lock (_snapshotLock)
|
||||
{
|
||||
_latestQueueSnapshot = queueCounts;
|
||||
}
|
||||
|
||||
var sampleSize = 200;
|
||||
var recentJobs = await _repository.ListAsync(
|
||||
tenantId,
|
||||
policyId: null,
|
||||
mode: PolicyRunMode.Simulate,
|
||||
statuses: TerminalStatuses,
|
||||
queuedAfter: null,
|
||||
limit: sampleSize,
|
||||
cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
|
||||
var durations = recentJobs
|
||||
.Select(job => CalculateLatencySeconds(job, _timeProvider.GetUtcNow()))
|
||||
.Where(duration => duration >= 0)
|
||||
.OrderBy(duration => duration)
|
||||
.ToArray();
|
||||
|
||||
var latencyMetrics = new PolicySimulationLatencyMetrics(
|
||||
durations.Length,
|
||||
Percentile(durations, 0.50),
|
||||
Percentile(durations, 0.90),
|
||||
Percentile(durations, 0.95),
|
||||
Percentile(durations, 0.99),
|
||||
Average(durations));
|
||||
|
||||
return new PolicySimulationMetricsResponse(
|
||||
new PolicySimulationQueueDepth(totalQueueDepth, queueCounts),
|
||||
latencyMetrics);
|
||||
}
|
||||
|
||||
public void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt)
|
||||
{
|
||||
if (status is null)
|
||||
{
|
||||
throw new ArgumentNullException(nameof(status));
|
||||
}
|
||||
|
||||
var latencySeconds = CalculateLatencySeconds(status, observedAt);
|
||||
if (latencySeconds >= 0)
|
||||
{
|
||||
_latencyHistogram.Record(latencySeconds);
|
||||
}
|
||||
}
|
||||
|
||||
private IEnumerable<Measurement<long>> ObserveQueueDepth()
|
||||
{
|
||||
IReadOnlyDictionary<string, long> snapshot;
|
||||
lock (_snapshotLock)
|
||||
{
|
||||
snapshot = _latestQueueSnapshot;
|
||||
}
|
||||
|
||||
foreach (var pair in snapshot)
|
||||
{
|
||||
yield return new Measurement<long>(
|
||||
pair.Value,
|
||||
new KeyValuePair<string, object?>("status", pair.Key));
|
||||
}
|
||||
}
|
||||
|
||||
private static double CalculateLatencySeconds(PolicyRunJob job, DateTimeOffset now)
|
||||
{
|
||||
var started = job.QueuedAt ?? job.CreatedAt;
|
||||
var finished = job.CompletedAt ?? job.CancelledAt ?? job.UpdatedAt;
|
||||
if (started == default)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
var duration = (finished - started).TotalSeconds;
|
||||
return duration < 0 ? 0 : duration;
|
||||
}
|
||||
|
||||
private static double CalculateLatencySeconds(PolicyRunStatus status, DateTimeOffset now)
|
||||
{
|
||||
var started = status.QueuedAt;
|
||||
var finished = status.FinishedAt ?? now;
|
||||
if (started == default)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
var duration = (finished - started).TotalSeconds;
|
||||
return duration < 0 ? 0 : duration;
|
||||
}
|
||||
|
||||
private static double? Percentile(IReadOnlyList<double> values, double percentile)
|
||||
{
|
||||
if (values.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var position = percentile * (values.Count - 1);
|
||||
var lowerIndex = (int)Math.Floor(position);
|
||||
var upperIndex = (int)Math.Ceiling(position);
|
||||
|
||||
if (lowerIndex == upperIndex)
|
||||
{
|
||||
return Math.Round(values[lowerIndex], 4);
|
||||
}
|
||||
|
||||
var fraction = position - lowerIndex;
|
||||
var interpolated = values[lowerIndex] + (values[upperIndex] - values[lowerIndex]) * fraction;
|
||||
return Math.Round(interpolated, 4);
|
||||
}
|
||||
|
||||
private static double? Average(IReadOnlyList<double> values)
|
||||
{
|
||||
if (values.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var sum = values.Sum();
|
||||
return Math.Round(sum / values.Count, 4);
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
if (_disposed)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
_meter.Dispose();
|
||||
_disposed = true;
|
||||
}
|
||||
}
|
||||
|
||||
internal sealed record PolicySimulationMetricsResponse(
|
||||
[property: JsonPropertyName("policy_simulation_queue_depth")] PolicySimulationQueueDepth QueueDepth,
|
||||
[property: JsonPropertyName("policy_simulation_latency")] PolicySimulationLatencyMetrics Latency);
|
||||
|
||||
internal sealed record PolicySimulationQueueDepth(
|
||||
[property: JsonPropertyName("total")] long Total,
|
||||
[property: JsonPropertyName("by_status")] IReadOnlyDictionary<string, long> ByStatus);
|
||||
|
||||
internal sealed record PolicySimulationLatencyMetrics(
|
||||
[property: JsonPropertyName("samples")] int Samples,
|
||||
[property: JsonPropertyName("p50_seconds")] double? P50,
|
||||
[property: JsonPropertyName("p90_seconds")] double? P90,
|
||||
[property: JsonPropertyName("p95_seconds")] double? P95,
|
||||
[property: JsonPropertyName("p99_seconds")] double? P99,
|
||||
[property: JsonPropertyName("mean_seconds")] double? Mean);
|
||||
@@ -0,0 +1,198 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
|
||||
internal interface IPolicySimulationStreamCoordinator
|
||||
{
|
||||
Task StreamAsync(HttpContext context, string tenantId, PolicyRunStatus initialStatus, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
internal sealed class PolicySimulationStreamCoordinator : IPolicySimulationStreamCoordinator
|
||||
{
|
||||
private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);
|
||||
|
||||
private readonly IPolicyRunService _policyRunService;
|
||||
private readonly IQueueLagSummaryProvider _queueLagProvider;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly RunStreamOptions _options;
|
||||
private readonly IPolicySimulationMetricsRecorder? _metricsRecorder;
|
||||
private readonly ILogger<PolicySimulationStreamCoordinator> _logger;
|
||||
|
||||
public PolicySimulationStreamCoordinator(
|
||||
IPolicyRunService policyRunService,
|
||||
IQueueLagSummaryProvider queueLagProvider,
|
||||
IOptions<RunStreamOptions> options,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<PolicySimulationStreamCoordinator> logger,
|
||||
IPolicySimulationMetricsRecorder? metricsRecorder = null)
|
||||
{
|
||||
_policyRunService = policyRunService ?? throw new ArgumentNullException(nameof(policyRunService));
|
||||
_queueLagProvider = queueLagProvider ?? throw new ArgumentNullException(nameof(queueLagProvider));
|
||||
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.Validate();
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_metricsRecorder = metricsRecorder;
|
||||
}
|
||||
|
||||
public async Task StreamAsync(HttpContext context, string tenantId, PolicyRunStatus initialStatus, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentNullException.ThrowIfNull(initialStatus);
|
||||
|
||||
ConfigureSseHeaders(context.Response);
|
||||
await SseWriter.WriteRetryAsync(context.Response, _options.ReconnectDelay, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
var last = initialStatus;
|
||||
await SseWriter.WriteEventAsync(context.Response, "initial", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
await SseWriter.WriteEventAsync(context.Response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
await SseWriter.WriteEventAsync(context.Response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
if (IsTerminal(last.Status))
|
||||
{
|
||||
_metricsRecorder?.RecordLatency(last, _timeProvider.GetUtcNow());
|
||||
await SseWriter.WriteEventAsync(context.Response, "completed", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
return;
|
||||
}
|
||||
|
||||
using var pollTimer = new PeriodicTimer(_options.PollInterval);
|
||||
using var queueTimer = new PeriodicTimer(_options.QueueLagInterval);
|
||||
using var heartbeatTimer = new PeriodicTimer(_options.HeartbeatInterval);
|
||||
|
||||
try
|
||||
{
|
||||
while (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
var pollTask = pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();
|
||||
var queueTask = queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();
|
||||
var heartbeatTask = heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();
|
||||
|
||||
var completed = await Task.WhenAny(pollTask, queueTask, heartbeatTask).ConfigureAwait(false);
|
||||
|
||||
if (completed == pollTask && await pollTask.ConfigureAwait(false))
|
||||
{
|
||||
var current = await _policyRunService
|
||||
.GetAsync(tenantId, last.RunId, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (current is null)
|
||||
{
|
||||
_logger.LogWarning("Policy simulation {RunId} disappeared while streaming.", last.RunId);
|
||||
await SseWriter.WriteEventAsync(
|
||||
context.Response,
|
||||
"notFound",
|
||||
new PolicySimulationNotFoundPayload(last.RunId),
|
||||
SerializerOptions,
|
||||
cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
break;
|
||||
}
|
||||
|
||||
if (HasMeaningfulChange(last, current))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(context.Response, "status", PolicySimulationPayload.From(current), SerializerOptions, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
|
||||
last = current;
|
||||
|
||||
if (IsTerminal(last.Status))
|
||||
{
|
||||
_metricsRecorder?.RecordLatency(last, _timeProvider.GetUtcNow());
|
||||
await SseWriter.WriteEventAsync(context.Response, "completed", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (completed == queueTask && await queueTask.ConfigureAwait(false))
|
||||
{
|
||||
var summary = _queueLagProvider.Capture();
|
||||
await SseWriter.WriteEventAsync(context.Response, "queueLag", summary, SerializerOptions, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
else if (completed == heartbeatTask && await heartbeatTask.ConfigureAwait(false))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(context.Response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
_logger.LogDebug("Policy simulation stream cancelled for run {RunId}.", last.RunId);
|
||||
}
|
||||
}
|
||||
|
||||
private static void ConfigureSseHeaders(HttpResponse response)
|
||||
{
|
||||
response.StatusCode = StatusCodes.Status200OK;
|
||||
response.Headers.CacheControl = "no-store";
|
||||
response.Headers["X-Accel-Buffering"] = "no";
|
||||
response.Headers["Connection"] = "keep-alive";
|
||||
response.ContentType = "text/event-stream";
|
||||
}
|
||||
|
||||
private static bool HasMeaningfulChange(PolicyRunStatus previous, PolicyRunStatus current)
|
||||
{
|
||||
if (!EqualityComparer<PolicyRunExecutionStatus>.Default.Equals(previous.Status, current.Status))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!Nullable.Equals(previous.StartedAt, current.StartedAt) || !Nullable.Equals(previous.FinishedAt, current.FinishedAt))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (previous.Attempts != current.Attempts)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!string.Equals(previous.Error, current.Error, StringComparison.Ordinal) ||
|
||||
!string.Equals(previous.ErrorCode, current.ErrorCode, StringComparison.Ordinal) ||
|
||||
!string.Equals(previous.DeterminismHash, current.DeterminismHash, StringComparison.Ordinal))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (previous.CancellationRequested != current.CancellationRequested ||
|
||||
!Nullable.Equals(previous.CancellationRequestedAt, current.CancellationRequestedAt) ||
|
||||
!string.Equals(previous.CancellationReason, current.CancellationReason, StringComparison.Ordinal))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!EqualityComparer<PolicyRunStats>.Default.Equals(previous.Stats, current.Stats))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private static bool IsTerminal(PolicyRunExecutionStatus status)
|
||||
=> status is PolicyRunExecutionStatus.Succeeded or PolicyRunExecutionStatus.Failed or PolicyRunExecutionStatus.Cancelled;
|
||||
|
||||
private sealed record PolicySimulationPayload(
|
||||
[property: JsonPropertyName("simulation")] PolicyRunStatus Simulation)
|
||||
{
|
||||
public static PolicySimulationPayload From(PolicyRunStatus status) => new(status);
|
||||
}
|
||||
|
||||
private sealed record PolicySimulationNotFoundPayload(
|
||||
[property: JsonPropertyName("runId")] string RunId);
|
||||
|
||||
private sealed record HeartbeatPayload(
|
||||
[property: JsonPropertyName("ts")] DateTimeOffset Timestamp)
|
||||
{
|
||||
public static HeartbeatPayload Create(DateTimeOffset timestamp) => new(timestamp);
|
||||
}
|
||||
}
|
||||
@@ -18,8 +18,9 @@ using StellaOps.Scheduler.WebService.GraphJobs;
|
||||
using StellaOps.Scheduler.WebService.GraphJobs.Events;
|
||||
using StellaOps.Scheduler.WebService.Schedules;
|
||||
using StellaOps.Scheduler.WebService.Options;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
using StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
using StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);
|
||||
|
||||
@@ -84,6 +85,8 @@ if (storageSection.Exists())
|
||||
builder.Services.AddSchedulerMongoStorage(storageSection);
|
||||
builder.Services.AddSingleton<IGraphJobStore, MongoGraphJobStore>();
|
||||
builder.Services.AddSingleton<IPolicyRunService, PolicyRunService>();
|
||||
builder.Services.AddSingleton<IPolicySimulationMetricsProvider, PolicySimulationMetricsProvider>();
|
||||
builder.Services.AddSingleton<IPolicySimulationMetricsRecorder>(static sp => (IPolicySimulationMetricsRecorder)sp.GetRequiredService<IPolicySimulationMetricsProvider>());
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -117,6 +120,12 @@ builder.Services.AddOptions<SchedulerOptions>()
|
||||
.Bind(builder.Configuration.GetSection("Scheduler"))
|
||||
.PostConfigure(options => options.Validate());
|
||||
|
||||
builder.Services.AddSingleton<IQueueLagSummaryProvider, QueueLagSummaryProvider>();
|
||||
builder.Services.AddSingleton<IRunStreamCoordinator, RunStreamCoordinator>();
|
||||
builder.Services.AddSingleton<IPolicySimulationStreamCoordinator, PolicySimulationStreamCoordinator>();
|
||||
builder.Services.AddOptions<RunStreamOptions>()
|
||||
.Bind(builder.Configuration.GetSection("Scheduler:RunStream"));
|
||||
|
||||
var pluginHostOptions = SchedulerPluginHostFactory.Build(schedulerOptions.Plugins, builder.Environment.ContentRootPath);
|
||||
builder.Services.AddSingleton(pluginHostOptions);
|
||||
builder.Services.RegisterPluginRoutines(builder.Configuration, pluginHostOptions);
|
||||
@@ -196,6 +205,7 @@ app.MapGraphJobEndpoints();
|
||||
app.MapScheduleEndpoints();
|
||||
app.MapRunEndpoints();
|
||||
app.MapPolicyRunEndpoints();
|
||||
app.MapPolicySimulationEndpoints();
|
||||
app.MapSchedulerEventWebhookEndpoints();
|
||||
|
||||
app.Run();
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
using System;
|
||||
using System.Collections.Immutable;
|
||||
using System.Linq;
|
||||
using StellaOps.Scheduler.Queue;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
internal interface IQueueLagSummaryProvider
|
||||
{
|
||||
QueueLagSummaryResponse Capture();
|
||||
}
|
||||
|
||||
internal sealed class QueueLagSummaryProvider : IQueueLagSummaryProvider
|
||||
{
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
public QueueLagSummaryProvider(TimeProvider? timeProvider = null)
|
||||
{
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
}
|
||||
|
||||
public QueueLagSummaryResponse Capture()
|
||||
{
|
||||
var samples = SchedulerQueueMetrics.CaptureDepthSamples();
|
||||
if (samples.Count == 0)
|
||||
{
|
||||
return new QueueLagSummaryResponse(
|
||||
_timeProvider.GetUtcNow(),
|
||||
0,
|
||||
0,
|
||||
ImmutableArray<QueueLagEntry>.Empty);
|
||||
}
|
||||
|
||||
var ordered = samples
|
||||
.OrderBy(static sample => sample.Transport, StringComparer.Ordinal)
|
||||
.ThenBy(static sample => sample.Queue, StringComparer.Ordinal)
|
||||
.ToArray();
|
||||
|
||||
var builder = ImmutableArray.CreateBuilder<QueueLagEntry>(ordered.Length);
|
||||
long totalDepth = 0;
|
||||
long maxDepth = 0;
|
||||
|
||||
foreach (var sample in ordered)
|
||||
{
|
||||
totalDepth += sample.Depth;
|
||||
if (sample.Depth > maxDepth)
|
||||
{
|
||||
maxDepth = sample.Depth;
|
||||
}
|
||||
|
||||
builder.Add(new QueueLagEntry(sample.Transport, sample.Queue, sample.Depth));
|
||||
}
|
||||
|
||||
return new QueueLagSummaryResponse(
|
||||
_timeProvider.GetUtcNow(),
|
||||
totalDepth,
|
||||
maxDepth,
|
||||
builder.ToImmutable());
|
||||
}
|
||||
}
|
||||
@@ -10,8 +10,9 @@ internal sealed record RunCreateRequest(
|
||||
[property: JsonPropertyName("reason")] RunReason? Reason = null,
|
||||
[property: JsonPropertyName("correlationId")] string? CorrelationId = null);
|
||||
|
||||
internal sealed record RunCollectionResponse(
|
||||
[property: JsonPropertyName("runs")] IReadOnlyList<Run> Runs);
|
||||
internal sealed record RunCollectionResponse(
|
||||
[property: JsonPropertyName("runs")] IReadOnlyList<Run> Runs,
|
||||
[property: JsonPropertyName("nextCursor")] string? NextCursor = null);
|
||||
|
||||
internal sealed record RunResponse(
|
||||
[property: JsonPropertyName("run")] Run Run);
|
||||
@@ -31,10 +32,24 @@ internal sealed record ImpactPreviewResponse(
|
||||
[property: JsonPropertyName("snapshotId")] string? SnapshotId,
|
||||
[property: JsonPropertyName("sample")] ImmutableArray<ImpactPreviewSample> Sample);
|
||||
|
||||
internal sealed record ImpactPreviewSample(
|
||||
[property: JsonPropertyName("imageDigest")] string ImageDigest,
|
||||
[property: JsonPropertyName("registry")] string Registry,
|
||||
[property: JsonPropertyName("repository")] string Repository,
|
||||
[property: JsonPropertyName("namespaces")] ImmutableArray<string> Namespaces,
|
||||
[property: JsonPropertyName("tags")] ImmutableArray<string> Tags,
|
||||
[property: JsonPropertyName("usedByEntrypoint")] bool UsedByEntrypoint);
|
||||
internal sealed record ImpactPreviewSample(
|
||||
[property: JsonPropertyName("imageDigest")] string ImageDigest,
|
||||
[property: JsonPropertyName("registry")] string Registry,
|
||||
[property: JsonPropertyName("repository")] string Repository,
|
||||
[property: JsonPropertyName("namespaces")] ImmutableArray<string> Namespaces,
|
||||
[property: JsonPropertyName("tags")] ImmutableArray<string> Tags,
|
||||
[property: JsonPropertyName("usedByEntrypoint")] bool UsedByEntrypoint);
|
||||
|
||||
internal sealed record RunDeltaCollectionResponse(
|
||||
[property: JsonPropertyName("deltas")] ImmutableArray<DeltaSummary> Deltas);
|
||||
|
||||
internal sealed record QueueLagSummaryResponse(
|
||||
[property: JsonPropertyName("capturedAt")] DateTimeOffset CapturedAt,
|
||||
[property: JsonPropertyName("totalDepth")] long TotalDepth,
|
||||
[property: JsonPropertyName("maxDepth")] long MaxDepth,
|
||||
[property: JsonPropertyName("queues")] ImmutableArray<QueueLagEntry> Queues);
|
||||
|
||||
internal sealed record QueueLagEntry(
|
||||
[property: JsonPropertyName("transport")] string Transport,
|
||||
[property: JsonPropertyName("queue")] string Queue,
|
||||
[property: JsonPropertyName("depth")] long Depth);
|
||||
|
||||
@@ -3,7 +3,8 @@ using System.Collections.Generic;
|
||||
using System.Collections.Immutable;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Linq;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using System.Threading;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using Microsoft.AspNetCore.Routing;
|
||||
using Microsoft.Extensions.Primitives;
|
||||
@@ -15,31 +16,57 @@ using StellaOps.Scheduler.WebService.Auth;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
internal static class RunEndpoints
|
||||
{
|
||||
private const string ReadScope = "scheduler.runs.read";
|
||||
private const string WriteScope = "scheduler.runs.write";
|
||||
private const string PreviewScope = "scheduler.runs.preview";
|
||||
internal static class RunEndpoints
|
||||
{
|
||||
private const string ReadScope = "scheduler.runs.read";
|
||||
private const string WriteScope = "scheduler.runs.write";
|
||||
private const string PreviewScope = "scheduler.runs.preview";
|
||||
private const string ManageScope = "scheduler.runs.manage";
|
||||
private const int DefaultRunListLimit = 50;
|
||||
|
||||
public static IEndpointRouteBuilder MapRunEndpoints(this IEndpointRouteBuilder routes)
|
||||
{
|
||||
var group = routes.MapGroup("/api/v1/scheduler/runs");
|
||||
|
||||
group.MapGet("/", ListRunsAsync);
|
||||
group.MapGet("/queue/lag", GetQueueLagAsync);
|
||||
group.MapGet("/{runId}/deltas", GetRunDeltasAsync);
|
||||
group.MapGet("/{runId}/stream", StreamRunAsync);
|
||||
group.MapGet("/{runId}", GetRunAsync);
|
||||
group.MapPost("/", CreateRunAsync);
|
||||
group.MapPost("/{runId}/cancel", CancelRunAsync);
|
||||
group.MapPost("/{runId}/retry", RetryRunAsync);
|
||||
group.MapPost("/preview", PreviewImpactAsync);
|
||||
|
||||
return routes;
|
||||
}
|
||||
|
||||
public static IEndpointRouteBuilder MapRunEndpoints(this IEndpointRouteBuilder routes)
|
||||
{
|
||||
var group = routes.MapGroup("/api/v1/scheduler/runs");
|
||||
|
||||
group.MapGet("/", ListRunsAsync);
|
||||
group.MapGet("/{runId}", GetRunAsync);
|
||||
group.MapPost("/", CreateRunAsync);
|
||||
group.MapPost("/{runId}/cancel", CancelRunAsync);
|
||||
group.MapPost("/preview", PreviewImpactAsync);
|
||||
|
||||
return routes;
|
||||
}
|
||||
|
||||
private static async Task<IResult> ListRunsAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
private static IResult GetQueueLagAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IQueueLagSummaryProvider queueLagProvider)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
|
||||
tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var summary = queueLagProvider.Capture();
|
||||
return Results.Ok(summary);
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> ListRunsAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
@@ -50,24 +77,35 @@ internal static class RunEndpoints
|
||||
? scheduleValues.ToString().Trim()
|
||||
: null;
|
||||
|
||||
var states = ParseRunStates(httpContext.Request.Query.TryGetValue("state", out var stateValues) ? stateValues : StringValues.Empty);
|
||||
var createdAfter = SchedulerEndpointHelpers.TryParseDateTimeOffset(httpContext.Request.Query.TryGetValue("createdAfter", out var createdAfterValues) ? createdAfterValues.ToString() : null);
|
||||
var limit = SchedulerEndpointHelpers.TryParsePositiveInt(httpContext.Request.Query.TryGetValue("limit", out var limitValues) ? limitValues.ToString() : null);
|
||||
|
||||
var sortAscending = httpContext.Request.Query.TryGetValue("sort", out var sortValues) &&
|
||||
sortValues.Any(value => string.Equals(value, "asc", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var options = new RunQueryOptions
|
||||
{
|
||||
ScheduleId = string.IsNullOrWhiteSpace(scheduleId) ? null : scheduleId,
|
||||
States = states,
|
||||
CreatedAfter = createdAfter,
|
||||
Limit = limit,
|
||||
SortAscending = sortAscending,
|
||||
};
|
||||
|
||||
var runs = await repository.ListAsync(tenant.TenantId, options, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
return Results.Ok(new RunCollectionResponse(runs));
|
||||
var states = ParseRunStates(httpContext.Request.Query.TryGetValue("state", out var stateValues) ? stateValues : StringValues.Empty);
|
||||
var createdAfter = SchedulerEndpointHelpers.TryParseDateTimeOffset(httpContext.Request.Query.TryGetValue("createdAfter", out var createdAfterValues) ? createdAfterValues.ToString() : null);
|
||||
var limit = SchedulerEndpointHelpers.TryParsePositiveInt(httpContext.Request.Query.TryGetValue("limit", out var limitValues) ? limitValues.ToString() : null);
|
||||
var cursor = SchedulerEndpointHelpers.TryParseRunCursor(httpContext.Request.Query.TryGetValue("cursor", out var cursorValues) ? cursorValues.ToString() : null);
|
||||
|
||||
var sortAscending = httpContext.Request.Query.TryGetValue("sort", out var sortValues) &&
|
||||
sortValues.Any(value => string.Equals(value, "asc", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var appliedLimit = limit ?? DefaultRunListLimit;
|
||||
var options = new RunQueryOptions
|
||||
{
|
||||
ScheduleId = string.IsNullOrWhiteSpace(scheduleId) ? null : scheduleId,
|
||||
States = states,
|
||||
CreatedAfter = createdAfter,
|
||||
Cursor = cursor,
|
||||
Limit = appliedLimit,
|
||||
SortAscending = sortAscending,
|
||||
};
|
||||
|
||||
var runs = await repository.ListAsync(tenant.TenantId, options, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
|
||||
string? nextCursor = null;
|
||||
if (runs.Count == appliedLimit && runs.Count > 0)
|
||||
{
|
||||
var last = runs[^1];
|
||||
nextCursor = SchedulerEndpointHelpers.CreateRunCursor(last);
|
||||
}
|
||||
|
||||
return Results.Ok(new RunCollectionResponse(runs, nextCursor));
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
@@ -75,32 +113,59 @@ internal static class RunEndpoints
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> GetRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var run = await repository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (run is null)
|
||||
{
|
||||
return Results.NotFound();
|
||||
}
|
||||
|
||||
return Results.Ok(new RunResponse(run));
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
private static async Task<IResult> GetRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var run = await repository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (run is null)
|
||||
{
|
||||
return Results.NotFound();
|
||||
}
|
||||
|
||||
return Results.Ok(new RunResponse(run));
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> GetRunDeltasAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var run = await repository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (run is null)
|
||||
{
|
||||
return Results.NotFound();
|
||||
}
|
||||
|
||||
return Results.Ok(new RunDeltaCollectionResponse(run.Deltas));
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> CreateRunAsync(
|
||||
HttpContext httpContext,
|
||||
@@ -116,7 +181,7 @@ internal static class RunEndpoints
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, WriteScope);
|
||||
scopeAuthorizer.EnsureScope(httpContext, ManageScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(request.ScheduleId))
|
||||
@@ -184,11 +249,11 @@ internal static class RunEndpoints
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> CancelRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
private static async Task<IResult> CancelRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
[FromServices] IRunSummaryService runSummaryService,
|
||||
[FromServices] ISchedulerAuditService auditService,
|
||||
@@ -243,9 +308,145 @@ internal static class RunEndpoints
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> RetryRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IScheduleRepository scheduleRepository,
|
||||
[FromServices] IRunRepository runRepository,
|
||||
[FromServices] IRunSummaryService runSummaryService,
|
||||
[FromServices] ISchedulerAuditService auditService,
|
||||
[FromServices] TimeProvider timeProvider,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, ManageScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var existing = await runRepository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (existing is null)
|
||||
{
|
||||
return Results.NotFound();
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(existing.ScheduleId))
|
||||
{
|
||||
return Results.BadRequest(new { error = "Run cannot be retried because it is not associated with a schedule." });
|
||||
}
|
||||
|
||||
if (!RunStateMachine.IsTerminal(existing.State))
|
||||
{
|
||||
return Results.Conflict(new { error = "Run is not in a terminal state and cannot be retried." });
|
||||
}
|
||||
|
||||
var schedule = await scheduleRepository.GetAsync(tenant.TenantId, existing.ScheduleId!, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (schedule is null)
|
||||
{
|
||||
return Results.BadRequest(new { error = "Associated schedule no longer exists." });
|
||||
}
|
||||
|
||||
var now = timeProvider.GetUtcNow();
|
||||
var newRunId = SchedulerEndpointHelpers.GenerateIdentifier("run");
|
||||
var baselineReason = existing.Reason ?? RunReason.Empty;
|
||||
var manualReason = string.IsNullOrWhiteSpace(baselineReason.ManualReason)
|
||||
? $"retry-of:{existing.Id}"
|
||||
: $"{baselineReason.ManualReason};retry-of:{existing.Id}";
|
||||
|
||||
var newReason = new RunReason(
|
||||
manualReason,
|
||||
baselineReason.ConselierExportId,
|
||||
baselineReason.ExcitorExportId,
|
||||
baselineReason.Cursor)
|
||||
{
|
||||
ImpactWindowFrom = baselineReason.ImpactWindowFrom,
|
||||
ImpactWindowTo = baselineReason.ImpactWindowTo
|
||||
};
|
||||
|
||||
var retryRun = new Run(
|
||||
newRunId,
|
||||
tenant.TenantId,
|
||||
RunTrigger.Manual,
|
||||
RunState.Planning,
|
||||
RunStats.Empty,
|
||||
now,
|
||||
newReason,
|
||||
existing.ScheduleId,
|
||||
retryOf: existing.Id);
|
||||
|
||||
await runRepository.InsertAsync(retryRun, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(retryRun.ScheduleId))
|
||||
{
|
||||
await runSummaryService.ProjectAsync(retryRun, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
await auditService.WriteAsync(
|
||||
new SchedulerAuditEvent(
|
||||
tenant.TenantId,
|
||||
"scheduler.run",
|
||||
"retry",
|
||||
SchedulerEndpointHelpers.ResolveAuditActor(httpContext),
|
||||
RunId: retryRun.Id,
|
||||
ScheduleId: retryRun.ScheduleId,
|
||||
Metadata: BuildMetadata(
|
||||
("state", retryRun.State.ToString().ToLowerInvariant()),
|
||||
("retryOf", existing.Id),
|
||||
("trigger", retryRun.Trigger.ToString().ToLowerInvariant()))),
|
||||
cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return Results.Created($"/api/v1/scheduler/runs/{retryRun.Id}", new RunResponse(retryRun));
|
||||
}
|
||||
catch (InvalidOperationException ex)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task StreamRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository runRepository,
|
||||
[FromServices] IRunStreamCoordinator runStreamCoordinator,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, ReadScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
var run = await runRepository.GetAsync(tenant.TenantId, runId, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (run is null)
|
||||
{
|
||||
await Results.NotFound().ExecuteAsync(httpContext);
|
||||
return;
|
||||
}
|
||||
|
||||
await runStreamCoordinator.StreamAsync(httpContext, tenant.TenantId, run, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
// Client disconnected; nothing to do.
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
if (!httpContext.Response.HasStarted)
|
||||
{
|
||||
await Results.BadRequest(new { error = ex.Message }).ExecuteAsync(httpContext);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> PreviewImpactAsync(
|
||||
HttpContext httpContext,
|
||||
|
||||
@@ -0,0 +1,225 @@
|
||||
using System;
|
||||
using System.Collections.Immutable;
|
||||
using System.Linq;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
internal interface IRunStreamCoordinator
|
||||
{
|
||||
Task StreamAsync(HttpContext context, string tenantId, Run initialRun, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
internal sealed class RunStreamCoordinator : IRunStreamCoordinator
|
||||
{
|
||||
private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);
|
||||
|
||||
private readonly IRunRepository _runRepository;
|
||||
private readonly IQueueLagSummaryProvider _queueLagProvider;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<RunStreamCoordinator> _logger;
|
||||
private readonly RunStreamOptions _options;
|
||||
|
||||
public RunStreamCoordinator(
|
||||
IRunRepository runRepository,
|
||||
IQueueLagSummaryProvider queueLagProvider,
|
||||
IOptions<RunStreamOptions> options,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<RunStreamCoordinator> logger)
|
||||
{
|
||||
_runRepository = runRepository ?? throw new ArgumentNullException(nameof(runRepository));
|
||||
_queueLagProvider = queueLagProvider ?? throw new ArgumentNullException(nameof(queueLagProvider));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_options = (options ?? throw new ArgumentNullException(nameof(options))).Value.Validate();
|
||||
}
|
||||
|
||||
public async Task StreamAsync(HttpContext context, string tenantId, Run initialRun, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
ArgumentNullException.ThrowIfNull(initialRun);
|
||||
|
||||
var response = context.Response;
|
||||
ConfigureSseHeaders(response);
|
||||
await SseWriter.WriteRetryAsync(response, _options.ReconnectDelay, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
var lastRun = initialRun;
|
||||
await SseWriter.WriteEventAsync(response, "initial", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
await SseWriter.WriteEventAsync(response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
await SseWriter.WriteEventAsync(response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
if (RunStateMachine.IsTerminal(lastRun.State))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "completed", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
return;
|
||||
}
|
||||
|
||||
using var pollTimer = new PeriodicTimer(_options.PollInterval);
|
||||
using var queueTimer = new PeriodicTimer(_options.QueueLagInterval);
|
||||
using var heartbeatTimer = new PeriodicTimer(_options.HeartbeatInterval);
|
||||
|
||||
try
|
||||
{
|
||||
while (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
var pollTask = pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();
|
||||
var queueTask = queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();
|
||||
var heartbeatTask = heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();
|
||||
|
||||
var completed = await Task.WhenAny(pollTask, queueTask, heartbeatTask).ConfigureAwait(false);
|
||||
|
||||
if (completed == pollTask && await pollTask.ConfigureAwait(false))
|
||||
{
|
||||
var current = await _runRepository.GetAsync(tenantId, lastRun.Id, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (current is null)
|
||||
{
|
||||
_logger.LogWarning("Run {RunId} disappeared while streaming; signalling notFound event.", lastRun.Id);
|
||||
await SseWriter.WriteEventAsync(response, "notFound", new RunNotFoundPayload(lastRun.Id), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
break;
|
||||
}
|
||||
|
||||
await EmitRunDifferencesAsync(response, lastRun, current, cancellationToken).ConfigureAwait(false);
|
||||
lastRun = current;
|
||||
|
||||
if (RunStateMachine.IsTerminal(lastRun.State))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "completed", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (completed == queueTask && await queueTask.ConfigureAwait(false))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
else if (completed == heartbeatTask && await heartbeatTask.ConfigureAwait(false))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
_logger.LogDebug("Run stream cancelled for run {RunId}.", lastRun.Id);
|
||||
}
|
||||
}
|
||||
|
||||
private static void ConfigureSseHeaders(HttpResponse response)
|
||||
{
|
||||
response.StatusCode = StatusCodes.Status200OK;
|
||||
response.Headers.CacheControl = "no-store";
|
||||
response.Headers["X-Accel-Buffering"] = "no";
|
||||
response.Headers["Connection"] = "keep-alive";
|
||||
response.ContentType = "text/event-stream";
|
||||
}
|
||||
|
||||
private async Task EmitRunDifferencesAsync(HttpResponse response, Run previous, Run current, CancellationToken cancellationToken)
|
||||
{
|
||||
var stateChanged = current.State != previous.State || current.StartedAt != previous.StartedAt || current.FinishedAt != previous.FinishedAt || !string.Equals(current.Error, previous.Error, StringComparison.Ordinal);
|
||||
if (stateChanged)
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "stateChanged", RunStateChangedPayload.From(current), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
if (!ReferenceEquals(current.Stats, previous.Stats) && current.Stats != previous.Stats)
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "segmentProgress", RunStatsPayload.From(current), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
if (!current.Deltas.SequenceEqual(previous.Deltas))
|
||||
{
|
||||
await SseWriter.WriteEventAsync(response, "deltaSummary", new RunDeltaPayload(current.Id, current.Deltas), SerializerOptions, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
private sealed record RunSnapshotPayload(
|
||||
[property: JsonPropertyName("run")] Run Run)
|
||||
{
|
||||
public static RunSnapshotPayload From(Run run)
|
||||
=> new(run);
|
||||
}
|
||||
|
||||
private sealed record RunStateChangedPayload(
|
||||
[property: JsonPropertyName("runId")] string RunId,
|
||||
[property: JsonPropertyName("state")] string State,
|
||||
[property: JsonPropertyName("startedAt")] DateTimeOffset? StartedAt,
|
||||
[property: JsonPropertyName("finishedAt")] DateTimeOffset? FinishedAt,
|
||||
[property: JsonPropertyName("error")] string? Error)
|
||||
{
|
||||
public static RunStateChangedPayload From(Run run)
|
||||
=> new(
|
||||
run.Id,
|
||||
run.State.ToString().ToLowerInvariant(),
|
||||
run.StartedAt,
|
||||
run.FinishedAt,
|
||||
run.Error);
|
||||
}
|
||||
|
||||
private sealed record RunStatsPayload(
|
||||
[property: JsonPropertyName("runId")] string RunId,
|
||||
[property: JsonPropertyName("stats")] RunStats Stats)
|
||||
{
|
||||
public static RunStatsPayload From(Run run)
|
||||
=> new(run.Id, run.Stats);
|
||||
}
|
||||
|
||||
private sealed record RunDeltaPayload(
|
||||
[property: JsonPropertyName("runId")] string RunId,
|
||||
[property: JsonPropertyName("deltas")] ImmutableArray<DeltaSummary> Deltas);
|
||||
|
||||
private sealed record HeartbeatPayload(
|
||||
[property: JsonPropertyName("ts")] DateTimeOffset Timestamp)
|
||||
{
|
||||
public static HeartbeatPayload Create(DateTimeOffset timestamp)
|
||||
=> new(timestamp);
|
||||
}
|
||||
|
||||
private sealed record RunNotFoundPayload(
|
||||
[property: JsonPropertyName("runId")] string RunId);
|
||||
}
|
||||
|
||||
internal sealed class RunStreamOptions
|
||||
{
|
||||
private static readonly TimeSpan MinimumInterval = TimeSpan.FromMilliseconds(100);
|
||||
private static readonly TimeSpan MinimumReconnectDelay = TimeSpan.FromMilliseconds(500);
|
||||
|
||||
public TimeSpan PollInterval { get; set; } = TimeSpan.FromSeconds(2);
|
||||
|
||||
public TimeSpan QueueLagInterval { get; set; } = TimeSpan.FromSeconds(10);
|
||||
|
||||
public TimeSpan HeartbeatInterval { get; set; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
public TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);
|
||||
|
||||
public RunStreamOptions Validate()
|
||||
{
|
||||
if (PollInterval < MinimumInterval)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(PollInterval), PollInterval, "Poll interval must be at least 100ms.");
|
||||
}
|
||||
|
||||
if (QueueLagInterval < MinimumInterval)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(QueueLagInterval), QueueLagInterval, "Queue lag interval must be at least 100ms.");
|
||||
}
|
||||
|
||||
if (HeartbeatInterval < MinimumInterval)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(HeartbeatInterval), HeartbeatInterval, "Heartbeat interval must be at least 100ms.");
|
||||
}
|
||||
|
||||
if (ReconnectDelay < MinimumReconnectDelay)
|
||||
{
|
||||
throw new ArgumentOutOfRangeException(nameof(ReconnectDelay), ReconnectDelay, "Reconnect delay must be at least 500ms.");
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Text.Json;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
internal static class SseWriter
|
||||
{
|
||||
public static async Task WriteRetryAsync(HttpResponse response, TimeSpan reconnectDelay, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(response);
|
||||
|
||||
var milliseconds = (int)Math.Clamp(reconnectDelay.TotalMilliseconds, 1, int.MaxValue);
|
||||
await response.WriteAsync($"retry: {milliseconds}\r\n\r\n", cancellationToken).ConfigureAwait(false);
|
||||
await response.Body.FlushAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public static async Task WriteEventAsync(HttpResponse response, string eventName, object payload, JsonSerializerOptions serializerOptions, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(response);
|
||||
ArgumentNullException.ThrowIfNull(payload);
|
||||
ArgumentNullException.ThrowIfNull(serializerOptions);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(eventName))
|
||||
{
|
||||
throw new ArgumentException("Event name must be provided.", nameof(eventName));
|
||||
}
|
||||
|
||||
await response.WriteAsync($"event: {eventName}\r\n", cancellationToken).ConfigureAwait(false);
|
||||
|
||||
var json = JsonSerializer.Serialize(payload, serializerOptions);
|
||||
using var reader = new StringReader(json);
|
||||
string? line;
|
||||
while ((line = reader.ReadLine()) is not null)
|
||||
{
|
||||
await response.WriteAsync($"data: {line}\r\n", cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
await response.WriteAsync("\r\n", cancellationToken).ConfigureAwait(false);
|
||||
await response.Body.FlushAsync(cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,9 @@
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Globalization;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Services;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Services;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService;
|
||||
|
||||
@@ -91,11 +93,11 @@ internal static class SchedulerEndpointHelpers
|
||||
return null;
|
||||
}
|
||||
|
||||
public static DateTimeOffset? TryParseDateTimeOffset(string? value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value))
|
||||
{
|
||||
return null;
|
||||
public static DateTimeOffset? TryParseDateTimeOffset(string? value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
if (DateTimeOffset.TryParse(value, CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out var parsed))
|
||||
@@ -114,14 +116,62 @@ internal static class SchedulerEndpointHelpers
|
||||
throw new ArgumentException("Tenant identifier must be provided.", nameof(tenantId));
|
||||
}
|
||||
|
||||
return new Selector(
|
||||
selection.Scope,
|
||||
tenantId,
|
||||
selection.Namespaces,
|
||||
selection.Repositories,
|
||||
selection.Digests,
|
||||
selection.IncludeTags,
|
||||
selection.Labels,
|
||||
selection.ResolvesTags);
|
||||
}
|
||||
}
|
||||
return new Selector(
|
||||
selection.Scope,
|
||||
tenantId,
|
||||
selection.Namespaces,
|
||||
selection.Repositories,
|
||||
selection.Digests,
|
||||
selection.IncludeTags,
|
||||
selection.Labels,
|
||||
selection.ResolvesTags);
|
||||
}
|
||||
|
||||
public static string CreateRunCursor(Run run)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(run);
|
||||
var payload = $"{run.CreatedAt.ToUniversalTime():O}|{run.Id}";
|
||||
return Convert.ToBase64String(Encoding.UTF8.GetBytes(payload));
|
||||
}
|
||||
|
||||
public static RunListCursor? TryParseRunCursor(string? value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
var trimmed = value.Trim();
|
||||
if (trimmed.Length == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var bytes = Convert.FromBase64String(trimmed);
|
||||
var decoded = Encoding.UTF8.GetString(bytes);
|
||||
var parts = decoded.Split('|', 2, StringSplitOptions.TrimEntries);
|
||||
if (parts.Length != 2)
|
||||
{
|
||||
throw new ValidationException($"Cursor '{value}' is not valid.");
|
||||
}
|
||||
|
||||
if (!DateTimeOffset.TryParse(parts[0], CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out var timestamp))
|
||||
{
|
||||
throw new ValidationException($"Cursor '{value}' is not valid.");
|
||||
}
|
||||
|
||||
if (string.IsNullOrWhiteSpace(parts[1]))
|
||||
{
|
||||
throw new ValidationException($"Cursor '{value}' is not valid.");
|
||||
}
|
||||
|
||||
return new RunListCursor(timestamp.ToUniversalTime(), parts[1]);
|
||||
}
|
||||
catch (FormatException ex)
|
||||
{
|
||||
throw new ValidationException($"Cursor '{value}' is not valid.", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Models/StellaOps.Scheduler.Models.csproj" />
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Storage.Mongo/StellaOps.Scheduler.Storage.Mongo.csproj" />
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.ImpactIndex/StellaOps.Scheduler.ImpactIndex.csproj" />
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Queue/StellaOps.Scheduler.Queue.csproj" />
|
||||
<ProjectReference Include="../../__Libraries/StellaOps.Plugin/StellaOps.Plugin.csproj" />
|
||||
<ProjectReference Include="../../Authority/StellaOps.Authority/StellaOps.Auth.Abstractions/StellaOps.Auth.Abstractions.csproj" />
|
||||
<ProjectReference Include="../../Authority/StellaOps.Authority/StellaOps.Auth.ServerIntegration/StellaOps.Auth.ServerIntegration.csproj" />
|
||||
|
||||
@@ -22,13 +22,13 @@
|
||||
## StellaOps Console (Sprint 23)
|
||||
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|
||||
|----|--------|----------|------------|-------------|---------------|
|
||||
| SCHED-CONSOLE-23-001 | TODO | Scheduler WebService Guild, BE-Base Platform Guild | SCHED-WEB-16-103, SCHED-WEB-20-001 | Extend runs APIs with live progress SSE endpoints (`/console/runs/{id}/stream`), queue lag summaries, diff metadata fetch, retry/cancel hooks with RBAC enforcement, and deterministic pagination for history views consumed by Console. | SSE emits heartbeats/backoff headers, progress payload schema documented, unauthorized actions blocked in integration tests, metrics/logs expose queue lag + correlation IDs. |
|
||||
| SCHED-CONSOLE-23-001 | DONE (2025-11-03) | Scheduler WebService Guild, BE-Base Platform Guild | SCHED-WEB-16-103, SCHED-WEB-20-001 | Extend runs APIs with live progress SSE endpoints (`/console/runs/{id}/stream`), queue lag summaries, diff metadata fetch, retry/cancel hooks with RBAC enforcement, and deterministic pagination for history views consumed by Console. | SSE emits heartbeats/backoff headers, progress payload schema documented, unauthorized actions blocked in integration tests, metrics/logs expose queue lag + correlation IDs. |
|
||||
|
||||
## Policy Studio (Sprint 27)
|
||||
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|
||||
|----|--------|----------|------------|-------------|---------------|
|
||||
| SCHED-CONSOLE-27-001 | TODO | Scheduler WebService Guild, Policy Registry Guild | SCHED-WEB-16-103, REGISTRY-API-27-005 | Provide policy batch simulation orchestration endpoints (`/policies/simulations` POST/GET) exposing run creation, shard status, SSE progress, cancellation, and retries with RBAC enforcement. | API handles shard lifecycle with SSE heartbeats + retry headers; unauthorized requests rejected; integration tests cover submit/cancel/resume flows. |
|
||||
| SCHED-CONSOLE-27-002 | TODO | Scheduler WebService Guild, Observability Guild | SCHED-CONSOLE-27-001 | Emit telemetry endpoints/metrics (`policy_simulation_queue_depth`, `policy_simulation_latency`) and webhook callbacks for completion/failure consumed by Registry. | Metrics exposed via gateway, dashboards seeded, webhook contract documented, integration tests validate metrics emission. |
|
||||
| SCHED-CONSOLE-27-001 | DONE (2025-11-03) | Scheduler WebService Guild, Policy Registry Guild | SCHED-WEB-16-103, REGISTRY-API-27-005 | Provide policy batch simulation orchestration endpoints (`/policies/simulations` POST/GET) exposing run creation, shard status, SSE progress, cancellation, and retries with RBAC enforcement. | API handles shard lifecycle with SSE heartbeats + retry headers; unauthorized requests rejected; integration tests cover submit/cancel/resume flows. |
|
||||
| SCHED-CONSOLE-27-002 | DOING (2025-11-03) | Scheduler WebService Guild, Observability Guild | SCHED-CONSOLE-27-001 | Emit telemetry endpoints/metrics (`policy_simulation_queue_depth`, `policy_simulation_latency`) and webhook callbacks for completion/failure consumed by Registry. | Metrics exposed via gateway, dashboards seeded, webhook contract documented, integration tests validate metrics emission. |
|
||||
|
||||
## Vulnerability Explorer (Sprint 29)
|
||||
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|
||||
|
||||
@@ -6,11 +6,21 @@
|
||||
|
||||
| Method | Path | Description | Scopes |
|
||||
| ------ | ---- | ----------- | ------ |
|
||||
| `GET` | `/api/v1/scheduler/runs` | List runs for the current tenant (filter by schedule, state, createdAfter). | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}` | Retrieve run details. | `scheduler.runs.read` |
|
||||
| `POST` | `/api/v1/scheduler/runs` | Create an ad-hoc run bound to an existing schedule. | `scheduler.runs.write` |
|
||||
| `POST` | `/api/v1/scheduler/runs/{runId}/cancel` | Transition a run to `cancelled` when still in a non-terminal state. | `scheduler.runs.write` |
|
||||
| `POST` | `/api/v1/scheduler/runs/preview` | Resolve impacted images using the ImpactIndex without enqueuing work. | `scheduler.runs.preview` |
|
||||
| `GET` | `/api/v1/scheduler/runs` | List runs for the current tenant (filter by schedule, state, createdAfter, cursor). | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}` | Retrieve run details. | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}/deltas` | Fetch deterministic delta metadata for the specified run. | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/queue/lag` | Snapshot queue depth per transport/queue for console dashboards. | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}/stream` | Server-sent events (SSE) stream for live progress, queue lag, and heartbeats. | `scheduler.runs.read` |
|
||||
| `POST` | `/api/v1/scheduler/runs` | Create an ad-hoc run bound to an existing schedule. | `scheduler.runs.write` |
|
||||
| `POST` | `/api/v1/scheduler/runs/{runId}/cancel` | Transition a run to `cancelled` when still in a non-terminal state. | `scheduler.runs.manage` |
|
||||
| `POST` | `/api/v1/scheduler/runs/{runId}/retry` | Clone a terminal run into a new manual retry, preserving provenance. | `scheduler.runs.manage` |
|
||||
| `POST` | `/api/v1/scheduler/runs/preview` | Resolve impacted images using the ImpactIndex without enqueuing work. | `scheduler.runs.preview` |
|
||||
| `GET` | `/api/v1/scheduler/policies/simulations` | List policy simulations for the current tenant (filters: policyId, status, since, limit). | `policy:simulate` |
|
||||
| `GET` | `/api/v1/scheduler/policies/simulations/{simulationId}` | Retrieve simulation status snapshot. | `policy:simulate` |
|
||||
| `GET` | `/api/v1/scheduler/policies/simulations/{simulationId}/stream` | SSE stream emitting simulation status, queue lag, and heartbeats. | `policy:simulate` |
|
||||
| `POST` | `/api/v1/scheduler/policies/simulations` | Enqueue a policy simulation (mode=`simulate`) with optional SBOM inputs and metadata. | `policy:simulate` |
|
||||
| `POST` | `/api/v1/scheduler/policies/simulations/{simulationId}/cancel` | Request cancellation for an in-flight simulation. | `policy:simulate` |
|
||||
| `POST` | `/api/v1/scheduler/policies/simulations/{simulationId}/retry` | Clone a terminal simulation into a new run preserving inputs/metadata. | `policy:simulate` |
|
||||
|
||||
All endpoints require a tenant context (`X-Tenant-Id`) and the appropriate scheduler scopes. Development mode allows header-based auth; production deployments must rely on Authority-issued tokens (OpTok + DPoP).
|
||||
|
||||
@@ -70,12 +80,12 @@ GET /api/v1/scheduler/runs?scheduleId=sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234&state
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"runs": [
|
||||
{
|
||||
"schemaVersion": "scheduler.run@1",
|
||||
"id": "run_c7b4e9d2f6a04f8784a40476d8a2f771",
|
||||
"tenantId": "tenant-alpha",
|
||||
{
|
||||
"runs": [
|
||||
{
|
||||
"schemaVersion": "scheduler.run@1",
|
||||
"id": "run_c7b4e9d2f6a04f8784a40476d8a2f771",
|
||||
"tenantId": "tenant-alpha",
|
||||
"scheduleId": "sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234",
|
||||
"trigger": "manual",
|
||||
"state": "planning",
|
||||
@@ -93,11 +103,13 @@ GET /api/v1/scheduler/runs?scheduleId=sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234&state
|
||||
"reason": {
|
||||
"manualReason": "Nightly backfill"
|
||||
},
|
||||
"createdAt": "2025-10-26T03:12:45Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
"createdAt": "2025-10-26T03:12:45Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
When additional pages are available the response includes `"nextCursor": "<base64>"`. Clients pass this cursor via `?cursor=` to fetch the next deterministic slice (ordering = `createdAt desc, id desc`).
|
||||
|
||||
## Cancel Run
|
||||
|
||||
@@ -136,7 +148,33 @@ POST /api/v1/scheduler/runs/run_c7b4e9d2f6a04f8784a40476d8a2f771/cancel
|
||||
|
||||
## Impact Preview
|
||||
|
||||
`/api/v1/scheduler/runs/preview` resolves impacted images via the ImpactIndex without mutating state. When `scheduleId` is provided the schedule selector is reused; callers may alternatively supply an explicit selector.
|
||||
`/api/v1/scheduler/runs/preview` resolves impacted images via the ImpactIndex without mutating state. When `scheduleId` is provided the schedule selector is reused; callers may alternatively supply an explicit selector.
|
||||
|
||||
## Retry Run
|
||||
|
||||
`POST /api/v1/scheduler/runs/{runId}/retry` clones a terminal run into a new manual run with `retryOf` pointing to the original identifier. Retry is scope-gated with `scheduler.runs.manage`; the new run’s `reason.manualReason` gains a `retry-of:<runId>` suffix for provenance.
|
||||
|
||||
## Run deltas
|
||||
|
||||
`GET /api/v1/scheduler/runs/{runId}/deltas` returns an immutable, deterministically sorted array of delta summaries (`[imageDigest, severity slices, KEV hits, attestations]`).
|
||||
|
||||
## Queue lag snapshot
|
||||
|
||||
`GET /api/v1/scheduler/runs/queue/lag` exposes queue depth summaries for planner/runner transports. The payload includes `capturedAt`, `totalDepth`, `maxDepth`, and ordered queue entries (transport + queue + depth). Console uses this for backlog dashboards and alert thresholds.
|
||||
|
||||
## Live stream (SSE)
|
||||
|
||||
`GET /api/v1/scheduler/runs/{runId}/stream` emits server-sent events for:
|
||||
|
||||
- `initial` — full run snapshot
|
||||
- `stateChanged` — state/started/finished transitions
|
||||
- `segmentProgress` — stats updates
|
||||
- `deltaSummary` — deltas available
|
||||
- `queueLag` — periodic queue snapshots
|
||||
- `heartbeat` — uptime keep-alive (default 5s)
|
||||
- `completed` — terminal summary
|
||||
|
||||
The stream is tolerant to clients reconnecting (idempotent payloads, deterministic ordering) and honours tenant scope plus cancellation tokens.
|
||||
|
||||
```http
|
||||
POST /api/v1/scheduler/runs/preview
|
||||
@@ -178,6 +216,106 @@ POST /api/v1/scheduler/runs/preview
|
||||
|
||||
### Integration notes
|
||||
|
||||
* Run creation and cancellation produce audit entries under category `scheduler.run` with correlation metadata when provided.
|
||||
* The preview endpoint relies on the ImpactIndex stub in development. Production deployments must register the concrete index implementation before use.
|
||||
* Planner/worker orchestration tasks will wire run creation to queueing in SCHED-WORKER-16-201/202.
|
||||
* Run creation and cancellation produce audit entries under category `scheduler.run` with correlation metadata when provided.
|
||||
* The preview endpoint relies on the ImpactIndex stub in development. Production deployments must register the concrete index implementation before use.
|
||||
* Planner/worker orchestration tasks will wire run creation to queueing in SCHED-WORKER-16-201/202.
|
||||
|
||||
## Policy simulations
|
||||
|
||||
The policy simulation APIs mirror the run endpoints but operate on policy-mode jobs (`mode=simulate`) scoped by tenant and RBAC (`policy:simulate`).
|
||||
|
||||
### Create simulation
|
||||
|
||||
```http
|
||||
POST /api/v1/scheduler/policies/simulations
|
||||
X-Tenant-Id: tenant-alpha
|
||||
Authorization: Bearer <OpTok>
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"policyId": "P-7",
|
||||
"policyVersion": 4,
|
||||
"priority": "normal",
|
||||
"metadata": {
|
||||
"source": "console.review"
|
||||
},
|
||||
"inputs": {
|
||||
"sbomSet": ["sbom:S-318", "sbom:S-42"],
|
||||
"captureExplain": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
HTTP/1.1 201 Created
|
||||
Location: /api/v1/scheduler/policies/simulations/run:P-7:20251103T153000Z:e4d1a9b2
|
||||
{
|
||||
"simulation": {
|
||||
"schemaVersion": "scheduler.policy-run-status@1",
|
||||
"runId": "run:P-7:20251103T153000Z:e4d1a9b2",
|
||||
"tenantId": "tenant-alpha",
|
||||
"policyId": "P-7",
|
||||
"policyVersion": 4,
|
||||
"mode": "simulate",
|
||||
"status": "queued",
|
||||
"priority": "normal",
|
||||
"queuedAt": "2025-11-03T15:30:00Z",
|
||||
"stats": {
|
||||
"components": 0,
|
||||
"rulesFired": 0,
|
||||
"findingsWritten": 0,
|
||||
"vexOverrides": 0
|
||||
},
|
||||
"inputs": {
|
||||
"sbomSet": ["sbom:S-318", "sbom:S-42"],
|
||||
"captureExplain": true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Canonical payload lives in `samples/api/scheduler/policy-simulation-status.json`.
|
||||
|
||||
### List and fetch simulations
|
||||
|
||||
- `GET /api/v1/scheduler/policies/simulations?policyId=P-7&status=queued&limit=25`
|
||||
- `GET /api/v1/scheduler/policies/simulations/{simulationId}`
|
||||
|
||||
The response envelope mirrors `policy-run-status` but uses `simulations` / `simulation` wrappers. All metadata keys are lower-case; retries append `retry-of=<priorRunId>` for provenance.
|
||||
|
||||
### Cancel and retry
|
||||
|
||||
- `POST /api/v1/scheduler/policies/simulations/{simulationId}/cancel`
|
||||
- Marks the job as `cancellationRequested` and surfaces the reason. Worker execution honours this flag before leasing.
|
||||
- `POST /api/v1/scheduler/policies/simulations/{simulationId}/retry`
|
||||
- Clones a terminal simulation, preserving inputs/metadata and adding `metadata.retry-of` pointing to the original run ID. Returns `409 Conflict` when the simulation is not terminal.
|
||||
|
||||
### Live stream (SSE)
|
||||
|
||||
`GET /api/v1/scheduler/policies/simulations/{simulationId}/stream` emits:
|
||||
|
||||
- `retry` — reconnection hint (milliseconds) emitted before events.
|
||||
- `initial` — current simulation snapshot.
|
||||
- `status` — status/attempt/stat updates.
|
||||
- `queueLag` — periodic queue depth summary (shares payload with run streams).
|
||||
- `heartbeat` — keep-alive ping (default 5s; configurable under `Scheduler:RunStream`).
|
||||
- `completed` — terminal summary (`succeeded`, `failed`, or `cancelled`).
|
||||
- `notFound` — emitted if the run record disappears while streaming.
|
||||
|
||||
Heartbeats, queue lag summaries, and the reconnection directive are sent immediately after connection so Console clients receive deterministic telemetry when loading a simulation workspace.
|
||||
|
||||
### Metrics
|
||||
|
||||
```
|
||||
GET /api/v1/scheduler/policies/simulations/metrics
|
||||
X-Tenant-Id: tenant-alpha
|
||||
Authorization: Bearer <OpTok>
|
||||
```
|
||||
|
||||
Returns queue depth and latency summaries tailored for simulation dashboards and alerting. Response properties align with the metric names exposed via OTEL (`policy_simulation_queue_depth`, `policy_simulation_latency`). Canonical payload lives at `samples/api/scheduler/policy-simulation-metrics.json`.
|
||||
|
||||
- `policy_simulation_queue_depth.total` — pending simulation jobs (aggregate of `pending`, `dispatching`, `submitted`).
|
||||
- `policy_simulation_latency.*` — latency percentiles (seconds) computed from the most recent terminal simulations.
|
||||
|
||||
> **Note:** When Mongo storage is not configured the metrics provider is disabled and the endpoint responds with `501 Not Implemented`.
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
|
||||
## Policy simulations
|
||||
|
||||
`/api/v1/scheduler/policies/simulations` orchestrates Policy Engine runs in `simulate` mode without mutating persisted findings.
|
||||
|
||||
- **Create** — `POST /api/v1/scheduler/policies/simulations` (scope `policy:simulate`) enqueues a simulation for `policyId`/`policyVersion`, respecting optional `metadata` and structured `inputs` (`sbomSet`, `advisoryCursor`, `vexCursor`, `captureExplain`). Returns `201 Created` with `simulation.runId` and status `queued`.
|
||||
- **List/Get** — `GET /api/v1/scheduler/policies/simulations` and `/.../{simulationId}` expose `PolicyRunStatus` documents filtered to `mode=simulate`, including attempt counts, stats, and cancellation markers.
|
||||
- **Cancel** — `POST /.../{simulationId}/cancel` records `cancellationRequested=true` (optional reason, timestamp) and immediately reflects the updated status; workers honour the flag on the next lease cycle.
|
||||
- **Retry** — `POST /.../{simulationId}/retry` clones a terminal simulation (cancelled/failed/succeeded) into a fresh job preserving inputs/metadata. Non-terminal runs yield `409 Conflict`.
|
||||
- **Stream** — `GET /.../{simulationId}/stream` emits SSE events (`initial`, `status`, `queueLag`, `heartbeat`, `completed`) with the latest `PolicyRunStatus`, enabling Console to render shard progress and cancellation state in real time.
|
||||
|
||||
Simulation APIs share the same deterministic pagination/metadata contracts as policy runs and surface queue depth snapshots via the existing scheduler queue metrics.
|
||||
@@ -0,0 +1,78 @@
|
||||
# SCHED-CONSOLE-27-002 · Policy Simulation Telemetry & Webhooks
|
||||
|
||||
> Owners: Scheduler WebService Guild, Observability Guild
|
||||
> Scope: Policy simulation metrics endpoint and completion webhooks feeding Registry/Console integrations.
|
||||
|
||||
## 1. Metrics endpoint refresher
|
||||
|
||||
- `GET /api/v1/scheduler/policies/simulations/metrics` (scope: `policy:simulate`)
|
||||
- Returns queue depth grouped by status plus latency percentiles derived from the most recent sample window (default 200 terminal runs).
|
||||
- Surface area is unchanged from the implementation in Sprint 27 week 1; consumers should continue to rely on the contract in `samples/api/scheduler/policy-simulation-metrics.json`.
|
||||
- When backing storage is not Mongo the endpoint responds `501 Not Implemented`.
|
||||
|
||||
## 2. Completion webhooks
|
||||
|
||||
Scheduler Worker now emits policy simulation webhooks whenever a simulation reaches a terminal state (`succeeded`, `failed`, `cancelled`). Payloads are aligned with the SSE `completed` event shape and include idempotency headers so downstream systems can safely de-duplicate.
|
||||
|
||||
### 2.1 Configuration
|
||||
|
||||
```jsonc
|
||||
// scheduler-worker.appsettings.json
|
||||
{
|
||||
"Scheduler": {
|
||||
"Worker": {
|
||||
"Policy": {
|
||||
"Webhook": {
|
||||
"Enabled": true,
|
||||
"Endpoint": "https://registry.internal/hooks/policy-simulation",
|
||||
"ApiKeyHeader": "X-StellaOps-Webhook-Key",
|
||||
"ApiKey": "replace-me",
|
||||
"TimeoutSeconds": 10
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- `Enabled`: feature flag; disabled by default to preserve air-gap behaviour.
|
||||
- `Endpoint`: absolute HTTPS endpoint; requests use `POST`.
|
||||
- `ApiKeyHeader`/`ApiKey`: optional bearer for Registry verification.
|
||||
- `TimeoutSeconds`: per-request timeout (defaults to 10s).
|
||||
|
||||
### 2.2 Headers
|
||||
|
||||
| Header | Purpose |
|
||||
|------------------------|---------------------------------------|
|
||||
| `X-StellaOps-Tenant` | Tenant identifier for the simulation. |
|
||||
| `X-StellaOps-Run-Id` | Stable run id (use as idempotency key). |
|
||||
| `X-StellaOps-Webhook-Key` | Optional API key as configured. |
|
||||
|
||||
### 2.3 Payload
|
||||
|
||||
See `samples/api/scheduler/policy-simulation-webhook.json` for a canonical example.
|
||||
|
||||
```json
|
||||
{
|
||||
"tenantId": "tenant-alpha",
|
||||
"simulation": { /* PolicyRunStatus document */ },
|
||||
"result": "failed",
|
||||
"observedAt": "2025-11-03T20:05:12Z",
|
||||
"latencySeconds": 14.287,
|
||||
"reason": "policy engine timeout"
|
||||
}
|
||||
```
|
||||
|
||||
- `result`: `succeeded`, `failed`, `cancelled`, `running`, or `queued`. Terminal webhooks are emitted only for the first three.
|
||||
- `latencySeconds`: bounded to four decimal places; derived from `finishedAt - queuedAt` when timestamps exist, else falls back to observer timestamp.
|
||||
- `reason`: surfaced for failures (`error`) and cancellations (`cancellationReason`); omitted otherwise.
|
||||
|
||||
### 2.4 Delivery semantics
|
||||
|
||||
- Best effort with no retry from the worker — Registry should use `X-StellaOps-Run-Id` for idempotency.
|
||||
- Failures emit WARN logs (prefix `Policy run job {JobId}`).
|
||||
- Disabled configuration short-circuits without network calls (debug log only).
|
||||
|
||||
## 3. SSE compatibility
|
||||
|
||||
No changes were required on the streaming endpoint (`GET /api/v1/scheduler/policies/simulations/{id}/stream`); Console continues to receive `completed` events containing the same `PolicyRunStatus` payload that the webhook publishes.
|
||||
Reference in New Issue
Block a user