Add unit tests for SBOM ingestion and transformation
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
- Implement `SbomIngestServiceCollectionExtensionsTests` to verify the SBOM ingestion pipeline exports snapshots correctly. - Create `SbomIngestTransformerTests` to ensure the transformation produces expected nodes and edges, including deduplication of license nodes and normalization of timestamps. - Add `SbomSnapshotExporterTests` to test the export functionality for manifest, adjacency, nodes, and edges. - Introduce `VexOverlayTransformerTests` to validate the transformation of VEX nodes and edges. - Set up project file for the test project with necessary dependencies and configurations. - Include JSON fixture files for testing purposes.
This commit is contained in:
@@ -4,9 +4,13 @@ namespace StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
|
||||
/// <summary>
/// Tenant-scoped operations over policy runs: enqueue, list, fetch, cancel and retry.
/// </summary>
internal interface IPolicyRunService
{
    /// <summary>Queues a policy run for <paramref name="tenantId"/> and returns its status.</summary>
    Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken);

    /// <summary>Lists the tenant's runs that match <paramref name="options"/>.</summary>
    Task<IReadOnlyList<PolicyRunStatus>> ListAsync(string tenantId, PolicyRunQueryOptions options, CancellationToken cancellationToken);

    /// <summary>Fetches a single run; the implementations in this change return <c>null</c> when it is not found for the tenant.</summary>
    Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken);

    /// <summary>
    /// Requests cancellation of a run, optionally recording a <paramref name="reason"/>.
    /// The implementations in this change return <c>null</c> when the run is not found for the tenant.
    /// </summary>
    Task<PolicyRunStatus?> RequestCancellationAsync(string tenantId, string runId, string? reason, CancellationToken cancellationToken);

    /// <summary>
    /// Enqueues a new run based on an existing one and returns the status of the new run.
    /// The implementations in this change throw <see cref="KeyNotFoundException"/> when the run is unknown
    /// and <see cref="InvalidOperationException"/> when it cannot be retried.
    /// </summary>
    Task<PolicyRunStatus> RetryAsync(string tenantId, string runId, string? requestedBy, CancellationToken cancellationToken);
}
|
||||
|
||||
@@ -11,7 +11,7 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
private readonly List<PolicyRunStatus> _orderedRuns = new();
|
||||
private readonly object _gate = new();
|
||||
|
||||
public Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken)
|
||||
public Task<PolicyRunStatus> EnqueueAsync(string tenantId, PolicyRunRequest request, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
@@ -23,27 +23,30 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
|
||||
var queuedAt = request.QueuedAt ?? DateTimeOffset.UtcNow;
|
||||
|
||||
var status = new PolicyRunStatus(
|
||||
runId,
|
||||
tenantId,
|
||||
request.PolicyId ?? throw new ValidationException("policyId must be provided."),
|
||||
request.PolicyVersion ?? throw new ValidationException("policyVersion must be provided."),
|
||||
request.Mode,
|
||||
PolicyRunExecutionStatus.Queued,
|
||||
request.Priority,
|
||||
queuedAt,
|
||||
PolicyRunStats.Empty,
|
||||
request.Inputs ?? PolicyRunInputs.Empty,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0,
|
||||
null,
|
||||
null,
|
||||
request.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
|
||||
SchedulerSchemaVersions.PolicyRunStatus);
|
||||
var status = new PolicyRunStatus(
|
||||
runId,
|
||||
tenantId,
|
||||
request.PolicyId ?? throw new ValidationException("policyId must be provided."),
|
||||
request.PolicyVersion ?? throw new ValidationException("policyVersion must be provided."),
|
||||
request.Mode,
|
||||
PolicyRunExecutionStatus.Queued,
|
||||
request.Priority,
|
||||
queuedAt,
|
||||
PolicyRunStats.Empty,
|
||||
request.Inputs ?? PolicyRunInputs.Empty,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
0,
|
||||
null,
|
||||
null,
|
||||
request.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
|
||||
cancellationRequested: false,
|
||||
cancellationRequestedAt: null,
|
||||
cancellationReason: null,
|
||||
SchedulerSchemaVersions.PolicyRunStatus);
|
||||
|
||||
lock (_gate)
|
||||
{
|
||||
@@ -110,7 +113,7 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
return Task.FromResult<IReadOnlyList<PolicyRunStatus>>(result);
|
||||
}
|
||||
|
||||
public Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
|
||||
public Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(runId);
|
||||
@@ -126,13 +129,121 @@ internal sealed class InMemoryPolicyRunService : IPolicyRunService
|
||||
return Task.FromResult<PolicyRunStatus?>(null);
|
||||
}
|
||||
|
||||
return Task.FromResult<PolicyRunStatus?>(run);
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds a run identifier of the form <c>run:{policyId}:{yyyyMMddTHHmmssZ}:{8 hex chars}</c>.
/// </summary>
/// <param name="policyId">Policy identifier; trimmed, and replaced by <c>"policy"</c> when blank.</param>
/// <param name="timestamp">Instant embedded in the identifier; converted to UTC before formatting.</param>
/// <returns>The generated identifier; the suffix comes from a fresh GUID, so each call differs.</returns>
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
    var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
    var suffix = Guid.NewGuid().ToString("N")[..8];

    // The pattern ends in a literal 'Z', so the value must really be UTC; the invariant
    // culture keeps the digits Gregorian regardless of the thread's current culture.
    var stamp = timestamp
        .ToUniversalTime()
        .ToString("yyyyMMddTHHmmssZ", System.Globalization.CultureInfo.InvariantCulture);

    return $"run:{normalizedPolicyId}:{stamp}:{suffix}";
}
|
||||
}
|
||||
return Task.FromResult<PolicyRunStatus?>(run);
|
||||
}
|
||||
|
||||
/// <summary>
/// Cancels a run held in memory. Unlike a queued cancellation request, the run is moved
/// straight to <see cref="PolicyRunExecutionStatus.Cancelled"/> under the lock.
/// </summary>
/// <param name="tenantId">Tenant that must own the run.</param>
/// <param name="runId">Identifier of the run to cancel.</param>
/// <param name="reason">Optional free-text reason; normalized before being stored.</param>
/// <param name="cancellationToken">Checked once before any work is done.</param>
/// <returns>
/// <c>null</c> when the run is unknown or owned by another tenant; the unchanged status when the
/// run is already terminal; otherwise the updated, cancelled status.
/// </returns>
public Task<PolicyRunStatus?> RequestCancellationAsync(string tenantId, string runId, string? reason, CancellationToken cancellationToken)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
    ArgumentException.ThrowIfNullOrWhiteSpace(runId);
    cancellationToken.ThrowIfCancellationRequested();

    lock (_gate)
    {
        if (!_runs.TryGetValue(runId, out var current))
        {
            return Task.FromResult<PolicyRunStatus?>(null);
        }

        // A run that belongs to another tenant is reported exactly like a missing one.
        if (!string.Equals(current.TenantId, tenantId, StringComparison.Ordinal))
        {
            return Task.FromResult<PolicyRunStatus?>(null);
        }

        if (IsTerminal(current.Status))
        {
            return Task.FromResult<PolicyRunStatus?>(current);
        }

        var storedReason = NormalizeCancellationReason(reason);
        var timestamp = DateTimeOffset.UtcNow;
        var cancelled = current with
        {
            Status = PolicyRunExecutionStatus.Cancelled,
            FinishedAt = timestamp,
            CancellationRequested = true,
            CancellationRequestedAt = timestamp,
            CancellationReason = storedReason
        };

        // Keep the lookup table and the ordered list pointing at the same snapshot.
        _runs[runId] = cancelled;
        var position = _orderedRuns.FindIndex(candidate => string.Equals(candidate.RunId, runId, StringComparison.Ordinal));
        if (position >= 0)
        {
            _orderedRuns[position] = cancelled;
        }

        return Task.FromResult<PolicyRunStatus?>(cancelled);
    }
}
|
||||
|
||||
/// <summary>
/// Re-enqueues a finished simulation as a new run whose metadata carries <c>retry-of</c> = the original run id.
/// </summary>
/// <param name="tenantId">Tenant that must own the run.</param>
/// <param name="runId">Identifier of the run to retry.</param>
/// <param name="requestedBy">Optional actor recorded on the new request; normalized first.</param>
/// <param name="cancellationToken">Checked up front and forwarded to <c>EnqueueAsync</c>.</param>
/// <returns>The status of the newly enqueued run.</returns>
/// <exception cref="KeyNotFoundException">The run is unknown or owned by another tenant.</exception>
/// <exception cref="InvalidOperationException">The run is not a simulation, or has not reached a terminal state.</exception>
public async Task<PolicyRunStatus> RetryAsync(string tenantId, string runId, string? requestedBy, CancellationToken cancellationToken)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
    ArgumentException.ThrowIfNullOrWhiteSpace(runId);
    cancellationToken.ThrowIfCancellationRequested();

    PolicyRunStatus existing;
    lock (_gate)
    {
        if (!_runs.TryGetValue(runId, out var status) || !string.Equals(status.TenantId, tenantId, StringComparison.Ordinal))
        {
            throw new KeyNotFoundException($"Policy simulation {runId} was not found for tenant {tenantId}.");
        }

        // Same guard as PolicyRunService.RetryAsync: the request built below is always a
        // simulation, so retrying any other mode would silently change what gets executed.
        if (status.Mode != PolicyRunMode.Simulate)
        {
            throw new InvalidOperationException("Only simulation runs can be retried through this endpoint.");
        }

        if (!IsTerminal(status.Status))
        {
            throw new InvalidOperationException("Simulation is still in progress and cannot be retried.");
        }

        existing = status;
    }

    // Enqueue happens outside the lock; EnqueueAsync takes the lock itself.
    var metadataBuilder = (existing.Metadata ?? ImmutableSortedDictionary<string, string>.Empty).ToBuilder();
    metadataBuilder["retry-of"] = runId;

    var request = new PolicyRunRequest(
        tenantId,
        existing.PolicyId,
        PolicyRunMode.Simulate,
        existing.Inputs,
        existing.Priority,
        runId: null,
        policyVersion: existing.PolicyVersion,
        requestedBy: NormalizeActor(requestedBy),
        queuedAt: DateTimeOffset.UtcNow,
        correlationId: null,
        metadata: metadataBuilder.ToImmutable());

    return await EnqueueAsync(tenantId, request, cancellationToken).ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Builds a run identifier of the form <c>run:{policyId}:{yyyyMMddTHHmmssZ}:{8 hex chars}</c>.
/// </summary>
/// <param name="policyId">Policy identifier; trimmed, and replaced by <c>"policy"</c> when blank.</param>
/// <param name="timestamp">Instant embedded in the identifier; converted to UTC before formatting.</param>
/// <returns>The generated identifier; the suffix comes from a fresh GUID, so each call differs.</returns>
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
    var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
    var suffix = Guid.NewGuid().ToString("N")[..8];

    // The pattern ends in a literal 'Z', so the value must really be UTC; the invariant
    // culture keeps the digits Gregorian regardless of the thread's current culture.
    var stamp = timestamp
        .ToUniversalTime()
        .ToString("yyyyMMddTHHmmssZ", System.Globalization.CultureInfo.InvariantCulture);

    return $"run:{normalizedPolicyId}:{stamp}:{suffix}";
}
|
||||
|
||||
/// <summary>
/// Reports whether a run has finished: succeeded, failed or cancelled. Every other status counts as still active.
/// </summary>
private static bool IsTerminal(PolicyRunExecutionStatus status)
{
    switch (status)
    {
        case PolicyRunExecutionStatus.Succeeded:
        case PolicyRunExecutionStatus.Failed:
        case PolicyRunExecutionStatus.Cancelled:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
/// <summary>
/// Trims a cancellation reason and caps it at 512 UTF-16 code units.
/// </summary>
/// <param name="value">Raw reason supplied by the caller.</param>
/// <returns>
/// <c>null</c> for null/blank input; otherwise the trimmed text, shortened to at most 512 characters
/// without leaving half of a surrogate pair at the end.
/// </returns>
private static string? NormalizeCancellationReason(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    const int maxLength = 512;
    var trimmed = value.Trim();
    if (trimmed.Length <= maxLength)
    {
        return trimmed;
    }

    // Cutting right after a high surrogate would leave a lone surrogate, which is not
    // valid UTF-16 text; drop that code unit as well so the result stays well-formed.
    var cut = char.IsHighSurrogate(trimmed[maxLength - 1]) ? maxLength - 1 : maxLength;
    return trimmed[..cut];
}
|
||||
|
||||
/// <summary>
/// Trims an actor identifier and caps it at 256 UTF-16 code units.
/// </summary>
/// <param name="actor">Raw actor value supplied by the caller.</param>
/// <returns>
/// <c>null</c> for null/blank input; otherwise the trimmed text, shortened to at most 256 characters
/// without leaving half of a surrogate pair at the end.
/// </returns>
private static string? NormalizeActor(string? actor)
{
    if (string.IsNullOrWhiteSpace(actor))
    {
        return null;
    }

    const int maxLength = 256;
    var trimmed = actor.Trim();
    if (trimmed.Length <= maxLength)
    {
        return trimmed;
    }

    // Cutting right after a high surrogate would leave a lone surrogate, which is not
    // valid UTF-16 text; drop that code unit as well so the result stays well-formed.
    var cut = char.IsHighSurrogate(trimmed[maxLength - 1]) ? maxLength - 1 : maxLength;
    return trimmed[..cut];
}
|
||||
}
|
||||
|
||||
@@ -17,13 +17,19 @@ internal sealed class PolicyRunQueryOptions
|
||||
|
||||
public string? PolicyId { get; private set; }
|
||||
|
||||
public PolicyRunMode? Mode { get; private set; }
|
||||
public PolicyRunMode? Mode { get; private set; }
|
||||
|
||||
public PolicyRunExecutionStatus? Status { get; private set; }
|
||||
|
||||
public DateTimeOffset? QueuedAfter { get; private set; }
|
||||
|
||||
public int Limit { get; private set; } = DefaultLimit;
|
||||
public int Limit { get; private set; } = DefaultLimit;
|
||||
|
||||
/// <summary>
/// Overwrites the <see cref="Mode"/> filter with <paramref name="mode"/>, replacing any value set earlier.
/// </summary>
/// <param name="mode">Mode the query must be restricted to.</param>
/// <returns>This same instance, so the call can be chained.</returns>
public PolicyRunQueryOptions ForceMode(PolicyRunMode mode)
{
    this.Mode = mode;
    return this;
}
|
||||
|
||||
public static PolicyRunQueryOptions FromRequest(HttpRequest request)
|
||||
{
|
||||
|
||||
@@ -47,7 +47,7 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
if (existing is not null)
|
||||
{
|
||||
_logger.LogDebug("Policy run job already exists for tenant {TenantId} and run {RunId}.", tenantId, runId);
|
||||
return ToStatus(existing, now);
|
||||
return PolicyRunStatusFactory.Create(existing, now);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
job.RunId,
|
||||
job.Mode);
|
||||
|
||||
return ToStatus(job, now);
|
||||
return PolicyRunStatusFactory.Create(job, now);
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<PolicyRunStatus>> ListAsync(
|
||||
@@ -122,79 +122,139 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
.ConfigureAwait(false);
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
return jobs
|
||||
.Select(job => ToStatus(job, now))
|
||||
.ToList();
|
||||
return jobs
|
||||
.Select(job => PolicyRunStatusFactory.Create(job, now))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Loads a policy run job by run id for the given tenant and projects it to a status.
/// </summary>
/// <returns>The projected status, or <c>null</c> when the repository has no matching job.</returns>
public async Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
    ArgumentException.ThrowIfNullOrWhiteSpace(runId);
    cancellationToken.ThrowIfCancellationRequested();

    var lookup = _repository.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken);
    var job = await lookup.ConfigureAwait(false);

    // The clock is only read when there is a job to project.
    return job is null
        ? null
        : ToStatus(job, _timeProvider.GetUtcNow());
}
|
||||
|
||||
/// <summary>
/// Projects a stored <see cref="PolicyRunJob"/> onto the <see cref="PolicyRunStatus"/> contract.
/// </summary>
/// <param name="job">Job to project.</param>
/// <param name="now">Current time supplied by the caller; not used by this projection.</param>
/// <returns>A status whose stats are empty and whose hash, error code, trace id and explain URI are unset.</returns>
/// <exception cref="InvalidOperationException">The job has no policy version.</exception>
private static PolicyRunStatus ToStatus(PolicyRunJob job, DateTimeOffset now)
{
    var executionStatus = MapExecutionStatus(job.Status);
    var isPending = job.Status == PolicyRunJobStatus.Pending;
    var hasFailed = job.Status == PolicyRunJobStatus.Failed;

    var version = job.PolicyVersion
        ?? throw new InvalidOperationException($"Policy run job '{job.Id}' is missing policyVersion.");

    return new PolicyRunStatus(
        job.RunId ?? job.Id,
        job.TenantId,
        job.PolicyId,
        version,
        job.Mode,
        executionStatus,
        job.Priority,
        // Fall back to the creation time when the job carries no explicit queue time.
        job.QueuedAt ?? job.CreatedAt,
        // A pending job is reported without a start time.
        isPending ? null : job.SubmittedAt,
        // Finished means either completed or cancelled, whichever timestamp is present.
        job.CompletedAt ?? job.CancelledAt,
        PolicyRunStats.Empty,
        job.Inputs ?? PolicyRunInputs.Empty,
        determinismHash: null,
        errorCode: null,
        // The last error is only surfaced for failed jobs.
        error: hasFailed ? job.LastError : null,
        attempts: job.AttemptCount,
        traceId: null,
        explainUri: null,
        job.Metadata ?? ImmutableSortedDictionary<string, string>.Empty,
        SchedulerSchemaVersions.PolicyRunStatus);
}
|
||||
|
||||
/// <summary>
/// Translates the job's storage status into the execution status exposed to API callers.
/// Dispatching and submitted both read as running; pending and any unrecognised value read as queued.
/// </summary>
private static PolicyRunExecutionStatus MapExecutionStatus(PolicyRunJobStatus status)
{
    switch (status)
    {
        case PolicyRunJobStatus.Dispatching:
        case PolicyRunJobStatus.Submitted:
            return PolicyRunExecutionStatus.Running;
        case PolicyRunJobStatus.Completed:
            return PolicyRunExecutionStatus.Succeeded;
        case PolicyRunJobStatus.Failed:
            return PolicyRunExecutionStatus.Failed;
        case PolicyRunJobStatus.Cancelled:
            return PolicyRunExecutionStatus.Cancelled;
        default:
            // Covers PolicyRunJobStatus.Pending as well as values this mapping does not know.
            return PolicyRunExecutionStatus.Queued;
    }
}
|
||||
|
||||
private static IReadOnlyCollection<PolicyRunJobStatus>? MapExecutionStatus(PolicyRunExecutionStatus status)
|
||||
=> status switch
|
||||
{
|
||||
/// <summary>
/// Loads a policy run job by run id for the given tenant and projects it to a status.
/// </summary>
/// <returns>The projected status, or <c>null</c> when the repository has no matching job.</returns>
public async Task<PolicyRunStatus?> GetAsync(string tenantId, string runId, CancellationToken cancellationToken)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
    ArgumentException.ThrowIfNullOrWhiteSpace(runId);
    cancellationToken.ThrowIfCancellationRequested();

    var lookup = _repository.GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken);
    var job = await lookup.ConfigureAwait(false);

    // The clock is only read when there is a job to project.
    return job is null
        ? null
        : PolicyRunStatusFactory.Create(job, _timeProvider.GetUtcNow());
}
|
||||
|
||||
/// <summary>
/// Records a cancellation request on a policy run job.
/// </summary>
/// <param name="tenantId">Tenant that must own the run.</param>
/// <param name="runId">Identifier of the run to cancel.</param>
/// <param name="reason">Optional free-text reason; normalized before it is compared or stored.</param>
/// <param name="cancellationToken">Checked up front and forwarded to the repository.</param>
/// <returns>
/// <c>null</c> when no job matches. The current status, unchanged, when the job is already terminal,
/// when an identical request is already recorded, or when the repository rejects the update.
/// Otherwise the status of the updated job.
/// </returns>
public async Task<PolicyRunStatus?> RequestCancellationAsync(
    string tenantId,
    string runId,
    string? reason,
    CancellationToken cancellationToken)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
    ArgumentException.ThrowIfNullOrWhiteSpace(runId);
    cancellationToken.ThrowIfCancellationRequested();

    var job = await _repository
        .GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
        .ConfigureAwait(false);

    if (job is null)
    {
        return null;
    }

    var now = _timeProvider.GetUtcNow();
    if (IsTerminal(job.Status))
    {
        return PolicyRunStatusFactory.Create(job, now);
    }

    // Compare against the normalized form: that is what gets persisted below, so comparing
    // the raw input would miss repeats that differ only by padding or excess length.
    var normalizedReason = NormalizeCancellationReason(reason);
    if (job.CancellationRequested && string.Equals(job.CancellationReason, normalizedReason, StringComparison.Ordinal))
    {
        return PolicyRunStatusFactory.Create(job, now);
    }

    var updated = job with
    {
        CancellationRequested = true,
        CancellationRequestedAt = now,
        CancellationReason = normalizedReason,
        UpdatedAt = now,
        // NOTE(review): presumably makes the job immediately eligible for pickup — confirm against the dispatcher.
        AvailableAt = now
    };

    // The replace is conditional on the lease owner observed when the job was read.
    var replaced = await _repository
        .ReplaceAsync(updated, expectedLeaseOwner: job.LeaseOwner, cancellationToken: cancellationToken)
        .ConfigureAwait(false);

    if (!replaced)
    {
        _logger.LogWarning(
            "Failed to persist cancellation request for policy run job {JobId} (runId={RunId}).",
            job.Id,
            job.RunId ?? "(pending)");
        return PolicyRunStatusFactory.Create(job, now);
    }

    _logger.LogInformation(
        "Cancellation requested for policy run job {JobId} (runId={RunId}, reason={Reason}).",
        updated.Id,
        updated.RunId ?? "(pending)",
        updated.CancellationReason ?? "none");

    return PolicyRunStatusFactory.Create(updated, now);
}
|
||||
|
||||
/// <summary>
/// Re-enqueues a finished simulation as a new run whose metadata carries <c>retry-of</c> = the original run id.
/// </summary>
/// <param name="tenantId">Tenant that must own the run.</param>
/// <param name="runId">Identifier of the run to retry.</param>
/// <param name="requestedBy">Optional actor recorded on the new request; normalized first.</param>
/// <param name="cancellationToken">Checked up front and forwarded to the repository and to <c>EnqueueAsync</c>.</param>
/// <returns>The status of the newly enqueued run.</returns>
/// <exception cref="KeyNotFoundException">No job matches the tenant and run id.</exception>
/// <exception cref="InvalidOperationException">The job is not a simulation, or has not reached a terminal state.</exception>
public async Task<PolicyRunStatus> RetryAsync(
    string tenantId,
    string runId,
    string? requestedBy,
    CancellationToken cancellationToken)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
    ArgumentException.ThrowIfNullOrWhiteSpace(runId);
    cancellationToken.ThrowIfCancellationRequested();

    var source = await _repository
        .GetByRunIdAsync(tenantId, runId, cancellationToken: cancellationToken)
        .ConfigureAwait(false);

    if (source is null)
    {
        throw new KeyNotFoundException($"Policy simulation {runId} was not found for tenant {tenantId}.");
    }

    if (source.Mode != PolicyRunMode.Simulate)
    {
        throw new InvalidOperationException("Only simulation runs can be retried through this endpoint.");
    }

    if (!IsTerminal(source.Status))
    {
        throw new InvalidOperationException("Simulation is still in progress and cannot be retried.");
    }

    var queuedAt = _timeProvider.GetUtcNow();

    // Carry the original metadata forward and link the new run back to the one being retried.
    var retryMetadata = (source.Metadata ?? ImmutableSortedDictionary<string, string>.Empty).ToBuilder();
    retryMetadata["retry-of"] = runId;

    var retryRequest = new PolicyRunRequest(
        tenantId,
        source.PolicyId,
        PolicyRunMode.Simulate,
        source.Inputs ?? PolicyRunInputs.Empty,
        source.Priority,
        runId: null,
        policyVersion: source.PolicyVersion,
        requestedBy: NormalizeActor(requestedBy),
        queuedAt: queuedAt,
        correlationId: source.CorrelationId,
        metadata: retryMetadata.ToImmutable());

    return await EnqueueAsync(tenantId, retryRequest, cancellationToken).ConfigureAwait(false);
}
|
||||
|
||||
private static IReadOnlyCollection<PolicyRunJobStatus>? MapExecutionStatus(PolicyRunExecutionStatus status)
|
||||
=> status switch
|
||||
{
|
||||
PolicyRunExecutionStatus.Queued => new[] { PolicyRunJobStatus.Pending },
|
||||
PolicyRunExecutionStatus.Running => new[] { PolicyRunJobStatus.Dispatching, PolicyRunJobStatus.Submitted },
|
||||
PolicyRunExecutionStatus.Succeeded => new[] { PolicyRunJobStatus.Completed },
|
||||
@@ -202,12 +262,39 @@ internal sealed class PolicyRunService : IPolicyRunService
|
||||
PolicyRunExecutionStatus.Cancelled => new[] { PolicyRunJobStatus.Cancelled },
|
||||
PolicyRunExecutionStatus.ReplayPending => Array.Empty<PolicyRunJobStatus>(),
|
||||
_ => null
|
||||
};
|
||||
|
||||
/// <summary>
/// Builds a run identifier of the form <c>run:{policyId}:{yyyyMMddTHHmmssZ}:{8 hex chars}</c>.
/// </summary>
/// <param name="policyId">Policy identifier; trimmed, and replaced by <c>"policy"</c> when blank.</param>
/// <param name="timestamp">Instant embedded in the identifier; converted to UTC before formatting.</param>
/// <returns>The generated identifier; the suffix comes from a fresh GUID, so each call differs.</returns>
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
    var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
    var suffix = Guid.NewGuid().ToString("N")[..8];

    // The pattern ends in a literal 'Z', so the value must really be UTC; the invariant
    // culture keeps the digits Gregorian regardless of the thread's current culture.
    var stamp = timestamp
        .ToUniversalTime()
        .ToString("yyyyMMddTHHmmssZ", System.Globalization.CultureInfo.InvariantCulture);

    return $"run:{normalizedPolicyId}:{stamp}:{suffix}";
}
|
||||
}
|
||||
};
|
||||
|
||||
/// <summary>
/// Builds a run identifier of the form <c>run:{policyId}:{yyyyMMddTHHmmssZ}:{8 hex chars}</c>.
/// </summary>
/// <param name="policyId">Policy identifier; trimmed, and replaced by <c>"policy"</c> when blank.</param>
/// <param name="timestamp">Instant embedded in the identifier; converted to UTC before formatting.</param>
/// <returns>The generated identifier; the suffix comes from a fresh GUID, so each call differs.</returns>
private static string GenerateRunId(string policyId, DateTimeOffset timestamp)
{
    var normalizedPolicyId = string.IsNullOrWhiteSpace(policyId) ? "policy" : policyId.Trim();
    var suffix = Guid.NewGuid().ToString("N")[..8];

    // The pattern ends in a literal 'Z', so the value must really be UTC; the invariant
    // culture keeps the digits Gregorian regardless of the thread's current culture.
    var stamp = timestamp
        .ToUniversalTime()
        .ToString("yyyyMMddTHHmmssZ", System.Globalization.CultureInfo.InvariantCulture);

    return $"run:{normalizedPolicyId}:{stamp}:{suffix}";
}
|
||||
|
||||
/// <summary>
/// Reports whether a job has finished: completed, failed or cancelled. Every other status counts as still active.
/// </summary>
private static bool IsTerminal(PolicyRunJobStatus status)
{
    switch (status)
    {
        case PolicyRunJobStatus.Completed:
        case PolicyRunJobStatus.Failed:
        case PolicyRunJobStatus.Cancelled:
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
/// <summary>
/// Trims a cancellation reason and caps it at 512 UTF-16 code units.
/// </summary>
/// <param name="reason">Raw reason supplied by the caller.</param>
/// <returns>
/// <c>null</c> for null/blank input; otherwise the trimmed text, shortened to at most 512 characters
/// without leaving half of a surrogate pair at the end.
/// </returns>
private static string? NormalizeCancellationReason(string? reason)
{
    if (string.IsNullOrWhiteSpace(reason))
    {
        return null;
    }

    const int maxLength = 512;
    var trimmed = reason.Trim();
    if (trimmed.Length <= maxLength)
    {
        return trimmed;
    }

    // Cutting right after a high surrogate would leave a lone surrogate, which is not
    // valid UTF-16 text; drop that code unit as well so the result stays well-formed.
    var cut = char.IsHighSurrogate(trimmed[maxLength - 1]) ? maxLength - 1 : maxLength;
    return trimmed[..cut];
}
|
||||
|
||||
/// <summary>
/// Trims an actor identifier and caps it at 256 UTF-16 code units.
/// </summary>
/// <param name="actor">Raw actor value supplied by the caller.</param>
/// <returns>
/// <c>null</c> for null/blank input; otherwise the trimmed text, shortened to at most 256 characters
/// without leaving half of a surrogate pair at the end.
/// </returns>
private static string? NormalizeActor(string? actor)
{
    if (string.IsNullOrWhiteSpace(actor))
    {
        return null;
    }

    const int maxLength = 256;
    var trimmed = actor.Trim();
    if (trimmed.Length <= maxLength)
    {
        return trimmed;
    }

    // Cutting right after a high surrogate would leave a lone surrogate, which is not
    // valid UTF-16 text; drop that code unit as well so the result stays well-formed.
    var cut = char.IsHighSurrogate(trimmed[maxLength - 1]) ? maxLength - 1 : maxLength;
    return trimmed[..cut];
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,363 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using StellaOps.Auth.Abstractions;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.WebService.Auth;
|
||||
using StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
|
||||
internal static class PolicySimulationEndpointExtensions
|
||||
{
|
||||
private const string Scope = StellaOpsScopes.PolicySimulate;
|
||||
|
||||
/// <summary>
/// Registers the policy simulation routes under <c>/api/v1/scheduler/policies/simulations</c>:
/// list, get, stream, metrics, create, cancel and retry.
/// </summary>
/// <param name="builder">Route builder the group is attached to.</param>
public static void MapPolicySimulationEndpoints(this IEndpointRouteBuilder builder)
{
    var simulations = builder.MapGroup("/api/v1/scheduler/policies/simulations");

    // Read-only routes.
    simulations.MapGet("/", ListSimulationsAsync);
    simulations.MapGet("/{simulationId}", GetSimulationAsync);
    simulations.MapGet("/{simulationId}/stream", StreamSimulationAsync);
    simulations.MapGet("/metrics", GetMetricsAsync);

    // Mutating routes.
    simulations.MapPost("/", CreateSimulationAsync);
    simulations.MapPost("/{simulationId}/cancel", CancelSimulationAsync);
    simulations.MapPost("/{simulationId}/retry", RetrySimulationAsync);
}
|
||||
|
||||
/// <summary>
/// Lists the tenant's simulation runs. Query options are read from the request and the mode is
/// forced to <see cref="PolicyRunMode.Simulate"/>.
/// </summary>
/// <returns>
/// 200 with the collection; 401 for <see cref="UnauthorizedAccessException"/>, 403 for
/// <see cref="InvalidOperationException"/>, 400 for <see cref="ValidationException"/>.
/// </returns>
private static async Task<IResult> ListSimulationsAsync(
    HttpContext httpContext,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicyRunService policyRunService,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);

        var query = PolicyRunQueryOptions.FromRequest(httpContext.Request);
        query = query.ForceMode(PolicyRunMode.Simulate);

        var runs = await policyRunService
            .ListAsync(tenant.TenantId, query, cancellationToken)
            .ConfigureAwait(false);

        var payload = new PolicySimulationCollectionResponse(runs);
        return Results.Ok(payload);
    }
    catch (Exception ex) when (ex is UnauthorizedAccessException or InvalidOperationException or ValidationException)
    {
        // One handler, three mappings: the exception type picks the status code.
        return ex switch
        {
            UnauthorizedAccessException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized),
            InvalidOperationException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden),
            _ => Results.BadRequest(new { error = ex.Message })
        };
    }
}
|
||||
|
||||
/// <summary>
/// Returns a single simulation run for the current tenant.
/// </summary>
/// <returns>
/// 200 with the simulation, or 404 when the service returns <c>null</c>; 401 for
/// <see cref="UnauthorizedAccessException"/>, 403 for <see cref="InvalidOperationException"/>,
/// 400 for <see cref="ValidationException"/>.
/// </returns>
private static async Task<IResult> GetSimulationAsync(
    HttpContext httpContext,
    string simulationId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicyRunService policyRunService,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);

        var found = await policyRunService
            .GetAsync(tenant.TenantId, simulationId, cancellationToken)
            .ConfigureAwait(false);

        if (found is null)
        {
            return Results.NotFound();
        }

        return Results.Ok(new PolicySimulationResponse(found));
    }
    catch (Exception ex) when (ex is UnauthorizedAccessException or InvalidOperationException or ValidationException)
    {
        // One handler, three mappings: the exception type picks the status code.
        return ex switch
        {
            UnauthorizedAccessException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized),
            InvalidOperationException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden),
            _ => Results.BadRequest(new { error = ex.Message })
        };
    }
}
|
||||
|
||||
/// <summary>
/// Returns simulation metrics captured for the current tenant.
/// </summary>
/// <returns>
/// 200 with the metrics; 501 when no <see cref="IPolicySimulationMetricsProvider"/> is available;
/// 401 for <see cref="UnauthorizedAccessException"/>, 403 for <see cref="InvalidOperationException"/>,
/// 400 for <see cref="ValidationException"/>.
/// </returns>
private static async Task<IResult> GetMetricsAsync(
    HttpContext httpContext,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicySimulationMetricsProvider? metricsProvider,
    CancellationToken cancellationToken)
{
    try
    {
        // Scope and tenant are validated before the provider check, so an unauthorized
        // caller still gets 401/403 rather than 501.
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);

        if (metricsProvider is not null)
        {
            var snapshot = await metricsProvider
                .CaptureAsync(tenant.TenantId, cancellationToken)
                .ConfigureAwait(false);

            return Results.Ok(snapshot);
        }

        return Results.StatusCode(StatusCodes.Status501NotImplemented);
    }
    catch (Exception ex) when (ex is UnauthorizedAccessException or InvalidOperationException or ValidationException)
    {
        // One handler, three mappings: the exception type picks the status code.
        return ex switch
        {
            UnauthorizedAccessException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized),
            InvalidOperationException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden),
            _ => Results.BadRequest(new { error = ex.Message })
        };
    }
}
|
||||
|
||||
/// <summary>
/// Validates the request body and enqueues a new simulation run for the current tenant.
/// </summary>
/// <returns>
/// 201 with the created simulation and its location; 400 when <c>policyId</c> is blank or
/// <c>policyVersion</c> is missing or not positive (or any other <see cref="ValidationException"/>);
/// 401 for <see cref="UnauthorizedAccessException"/>; 403 for <see cref="InvalidOperationException"/>.
/// </returns>
private static async Task<IResult> CreateSimulationAsync(
    HttpContext httpContext,
    PolicySimulationCreateRequest request,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicyRunService policyRunService,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);
        var requestedBy = SchedulerEndpointHelpers.ResolveActorId(httpContext);

        if (string.IsNullOrWhiteSpace(request.PolicyId))
        {
            throw new ValidationException("policyId must be provided.");
        }

        // Null and non-positive versions are rejected alike.
        if (request.PolicyVersion is not > 0)
        {
            throw new ValidationException("policyVersion must be provided and greater than zero.");
        }

        var metadata = NormalizeMetadata(request.Metadata);
        var runInputs = request.Inputs ?? PolicyRunInputs.Empty;

        var runRequest = new PolicyRunRequest(
            tenant.TenantId,
            request.PolicyId,
            PolicyRunMode.Simulate,
            runInputs,
            request.Priority,
            runId: null,
            policyVersion: request.PolicyVersion,
            requestedBy: requestedBy,
            queuedAt: null,
            correlationId: request.CorrelationId,
            metadata: metadata);

        var created = await policyRunService
            .EnqueueAsync(tenant.TenantId, runRequest, cancellationToken)
            .ConfigureAwait(false);

        var location = $"/api/v1/scheduler/policies/simulations/{created.RunId}";
        return Results.Created(location, new PolicySimulationResponse(created));
    }
    catch (Exception ex) when (ex is UnauthorizedAccessException or InvalidOperationException or ValidationException)
    {
        // One handler, three mappings: the exception type picks the status code.
        return ex switch
        {
            UnauthorizedAccessException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized),
            InvalidOperationException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden),
            _ => Results.BadRequest(new { error = ex.Message })
        };
    }
}
|
||||
|
||||
/// <summary>
/// Requests cancellation of a simulation run; the body, and its reason, are optional.
/// </summary>
/// <returns>
/// 200 with the resulting simulation, or 404 when the service returns <c>null</c>; 401 for
/// <see cref="UnauthorizedAccessException"/>, 403 for <see cref="InvalidOperationException"/>,
/// 400 for <see cref="ValidationException"/>.
/// </returns>
private static async Task<IResult> CancelSimulationAsync(
    HttpContext httpContext,
    string simulationId,
    PolicySimulationCancelRequest? request,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicyRunService policyRunService,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);

        var outcome = await policyRunService
            .RequestCancellationAsync(tenant.TenantId, simulationId, request?.Reason, cancellationToken)
            .ConfigureAwait(false);

        if (outcome is null)
        {
            return Results.NotFound();
        }

        return Results.Ok(new PolicySimulationResponse(outcome));
    }
    catch (Exception ex) when (ex is UnauthorizedAccessException or InvalidOperationException or ValidationException)
    {
        // One handler, three mappings: the exception type picks the status code.
        return ex switch
        {
            UnauthorizedAccessException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized),
            InvalidOperationException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden),
            _ => Results.BadRequest(new { error = ex.Message })
        };
    }
}
|
||||
|
||||
/// <summary>
/// Retries a simulation run on behalf of the resolved actor.
/// </summary>
/// <returns>
/// 201 with the new simulation and its location; 404 for <see cref="KeyNotFoundException"/>;
/// 401 for <see cref="UnauthorizedAccessException"/>; 409 for <see cref="InvalidOperationException"/>
/// (note: the other handlers in this class map it to 403); 400 for <see cref="ValidationException"/>.
/// </returns>
private static async Task<IResult> RetrySimulationAsync(
    HttpContext httpContext,
    string simulationId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicyRunService policyRunService,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);
        var requestedBy = SchedulerEndpointHelpers.ResolveActorId(httpContext);

        var retried = await policyRunService
            .RetryAsync(tenant.TenantId, simulationId, requestedBy, cancellationToken)
            .ConfigureAwait(false);

        var location = $"/api/v1/scheduler/policies/simulations/{retried.RunId}";
        return Results.Created(location, new PolicySimulationResponse(retried));
    }
    catch (Exception ex) when (ex is KeyNotFoundException or UnauthorizedAccessException or InvalidOperationException or ValidationException)
    {
        // One handler, four mappings: the exception type picks the status code.
        return ex switch
        {
            KeyNotFoundException => Results.NotFound(),
            UnauthorizedAccessException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized),
            InvalidOperationException => Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status409Conflict),
            _ => Results.BadRequest(new { error = ex.Message })
        };
    }
}
|
||||
|
||||
/// <summary>
/// Streams updates for a simulation run by handing the response over to
/// <see cref="IPolicySimulationStreamCoordinator"/>.
/// </summary>
/// <remarks>
/// Writes 404 when the run is not found. The known exception types are turned into 401/403/400
/// JSON errors only while the response has not started; once streaming has begun the status line
/// is already sent, so the exception is left to propagate instead of being replaced by a
/// failure to write the error response.
/// </remarks>
private static async Task StreamSimulationAsync(
    HttpContext httpContext,
    string simulationId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IPolicyRunService policyRunService,
    [FromServices] IPolicySimulationStreamCoordinator streamCoordinator,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, Scope);
        var tenant = tenantAccessor.GetTenant(httpContext);

        var simulation = await policyRunService
            .GetAsync(tenant.TenantId, simulationId, cancellationToken)
            .ConfigureAwait(false);

        if (simulation is null)
        {
            await Results.NotFound().ExecuteAsync(httpContext);
            return;
        }

        await streamCoordinator
            .StreamAsync(httpContext, tenant.TenantId, simulation, cancellationToken)
            .ConfigureAwait(false);
    }
    catch (UnauthorizedAccessException ex) when (!httpContext.Response.HasStarted)
    {
        await Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status401Unauthorized)
            .ExecuteAsync(httpContext);
    }
    catch (InvalidOperationException ex) when (!httpContext.Response.HasStarted)
    {
        await Results.Json(new { error = ex.Message }, statusCode: StatusCodes.Status403Forbidden)
            .ExecuteAsync(httpContext);
    }
    catch (ValidationException ex) when (!httpContext.Response.HasStarted)
    {
        await Results.BadRequest(new { error = ex.Message }).ExecuteAsync(httpContext);
    }
}
|
||||
|
||||
/// <summary>
/// Produces a canonical, ordinally sorted copy of caller-supplied metadata.
/// </summary>
/// <remarks>
/// Keys and values are trimmed; entries whose key or value is empty after trimming are dropped.
/// Keys are lower-cased invariantly, and when several inputs collapse to the same key the first
/// one enumerated is kept.
/// </remarks>
/// <param name="metadata">Raw metadata, possibly <c>null</c> or empty.</param>
/// <returns>The normalized dictionary, or <c>null</c> when nothing usable remains.</returns>
private static ImmutableSortedDictionary<string, string>? NormalizeMetadata(IReadOnlyDictionary<string, string>? metadata)
{
    if (metadata is null || metadata.Count == 0)
    {
        return null;
    }

    var normalized = ImmutableSortedDictionary.CreateBuilder<string, string>(StringComparer.Ordinal);

    foreach (var entry in metadata)
    {
        var trimmedKey = entry.Key?.Trim();
        var trimmedValue = entry.Value?.Trim();

        if (string.IsNullOrEmpty(trimmedKey) || string.IsNullOrEmpty(trimmedValue))
        {
            continue;
        }

        var canonicalKey = trimmedKey.ToLowerInvariant();
        if (normalized.ContainsKey(canonicalKey))
        {
            // First occurrence wins; later duplicates are ignored.
            continue;
        }

        normalized.Add(canonicalKey, trimmedValue);
    }

    if (normalized.Count == 0)
    {
        return null;
    }

    return normalized.ToImmutable();
}
|
||||
}
|
||||
|
||||
/// <summary>
/// JSON contract for a request to create a policy simulation.
/// </summary>
/// <param name="PolicyId">Serialized as <c>policyId</c>.</param>
/// <param name="PolicyVersion">Serialized as <c>policyVersion</c>; optional.</param>
/// <param name="Priority">Serialized as <c>priority</c>; defaults to <see cref="PolicyRunPriority.Normal"/>.</param>
/// <param name="CorrelationId">Serialized as <c>correlationId</c>; optional.</param>
/// <param name="Metadata">Serialized as <c>metadata</c>; optional free-form string pairs.</param>
/// <param name="Inputs">Serialized as <c>inputs</c>; optional.</param>
internal sealed record PolicySimulationCreateRequest(
|
||||
[property: JsonPropertyName("policyId")] string PolicyId,
|
||||
[property: JsonPropertyName("policyVersion")] int? PolicyVersion,
|
||||
[property: JsonPropertyName("priority")] PolicyRunPriority Priority = PolicyRunPriority.Normal,
|
||||
[property: JsonPropertyName("correlationId")] string? CorrelationId = null,
|
||||
[property: JsonPropertyName("metadata")] IReadOnlyDictionary<string, string>? Metadata = null,
|
||||
[property: JsonPropertyName("inputs")] PolicyRunInputs? Inputs = null);
|
||||
|
||||
/// <summary>
/// JSON contract for a request to cancel a policy simulation.
/// </summary>
/// <param name="Reason">Serialized as <c>reason</c>; optional.</param>
internal sealed record PolicySimulationCancelRequest(
|
||||
[property: JsonPropertyName("reason")] string? Reason);
|
||||
|
||||
/// <summary>
/// JSON envelope wrapping a list of simulation statuses.
/// </summary>
/// <param name="Simulations">Serialized as <c>simulations</c>.</param>
internal sealed record PolicySimulationCollectionResponse(
|
||||
[property: JsonPropertyName("simulations")] IReadOnlyList<PolicyRunStatus> Simulations);
|
||||
|
||||
/// <summary>
/// JSON envelope wrapping a single simulation status.
/// </summary>
/// <param name="Simulation">Serialized as <c>simulation</c>.</param>
internal sealed record PolicySimulationResponse(
|
||||
[property: JsonPropertyName("simulation")] PolicyRunStatus Simulation);
|
||||
@@ -0,0 +1,234 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.Metrics;
|
||||
using System.Linq;
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
|
||||
/// <summary>
/// Produces a point-in-time metrics summary for policy simulations.
/// </summary>
internal interface IPolicySimulationMetricsProvider
|
||||
{
|
||||
/// <summary>Captures queue-depth and latency metrics for the given tenant.</summary>
/// <param name="tenantId">Tenant whose simulations are measured.</param>
/// <param name="cancellationToken">Cancels the capture.</param>
/// <returns>The captured metrics.</returns>
Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Records latency observations for individual policy simulations.
/// </summary>
internal interface IPolicySimulationMetricsRecorder
|
||||
{
|
||||
/// <summary>Records the latency of one simulation.</summary>
/// <param name="status">Status of the simulation being measured.</param>
/// <param name="observedAt">Time at which the observation is made.</param>
void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt);
|
||||
}
|
||||
|
||||
/// <summary>
/// Computes policy-simulation queue depth and latency metrics from the job repository and
/// publishes them through a <see cref="Meter"/>.
/// </summary>
/// <remarks>
/// Two instruments are created on the meter <c>StellaOps.Scheduler.WebService.PolicySimulations</c>:
/// the observable gauge <c>policy_simulation_queue_depth</c>, which reports the counts from the
/// most recent <see cref="CaptureAsync"/> call, and the histogram <c>policy_simulation_latency</c>,
/// which is fed by <see cref="RecordLatency"/>.
/// NOTE(review): the gauge snapshot is replaced on every capture regardless of tenant, so it
/// reflects whichever tenant was captured last — confirm this is intended.
/// </remarks>
internal sealed class PolicySimulationMetricsProvider : IPolicySimulationMetricsProvider, IPolicySimulationMetricsRecorder, IDisposable
{
    /// <summary>Maximum number of terminal jobs sampled when computing latency percentiles.</summary>
    private const int LatencySampleSize = 200;

    /// <summary>Job statuses that count towards queue depth.</summary>
    private static readonly PolicyRunJobStatus[] QueueStatuses =
    {
        PolicyRunJobStatus.Pending,
        PolicyRunJobStatus.Dispatching,
        PolicyRunJobStatus.Submitted,
    };

    /// <summary>Job statuses sampled for latency statistics.</summary>
    private static readonly PolicyRunJobStatus[] TerminalStatuses =
    {
        PolicyRunJobStatus.Completed,
        PolicyRunJobStatus.Failed,
        PolicyRunJobStatus.Cancelled,
    };

    private readonly IPolicyRunJobRepository _repository;
    private readonly TimeProvider _timeProvider;
    private readonly Meter _meter;
    private readonly ObservableGauge<long> _queueGauge;
    private readonly Histogram<double> _latencyHistogram;
    private readonly object _snapshotLock = new();
    private IReadOnlyDictionary<string, long> _latestQueueSnapshot = new Dictionary<string, long>(StringComparer.Ordinal);
    private bool _disposed;

    /// <summary>Creates the provider and registers its meter instruments.</summary>
    /// <param name="repository">Source of policy run jobs.</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="repository"/> is <c>null</c>.</exception>
    public PolicySimulationMetricsProvider(IPolicyRunJobRepository repository, TimeProvider? timeProvider = null)
    {
        _repository = repository ?? throw new ArgumentNullException(nameof(repository));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _meter = new Meter("StellaOps.Scheduler.WebService.PolicySimulations");
        _queueGauge = _meter.CreateObservableGauge<long>(
            "policy_simulation_queue_depth",
            ObserveQueueDepth,
            unit: "runs",
            description: "Queued policy simulation jobs grouped by status.");
        _latencyHistogram = _meter.CreateHistogram<double>(
            "policy_simulation_latency",
            unit: "s",
            description: "End-to-end policy simulation latency (seconds).");
    }

    /// <summary>
    /// Counts queued simulation jobs per status and derives latency statistics from the most
    /// recent terminal simulation jobs of the tenant.
    /// </summary>
    /// <param name="tenantId">Tenant to measure; must not be blank.</param>
    /// <param name="cancellationToken">Cancels the repository calls.</param>
    /// <returns>Queue depth (total and per lower-cased status name) plus latency percentiles and mean.</returns>
    /// <exception cref="ArgumentException"><paramref name="tenantId"/> is null or whitespace.</exception>
    /// <exception cref="OperationCanceledException">The token was already cancelled on entry.</exception>
    public async Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();
        if (string.IsNullOrWhiteSpace(tenantId))
        {
            throw new ArgumentException("Tenant id must be provided.", nameof(tenantId));
        }

        var queueCounts = new Dictionary<string, long>(StringComparer.OrdinalIgnoreCase);
        long totalQueueDepth = 0;

        foreach (var status in QueueStatuses)
        {
            var count = await _repository.CountAsync(
                tenantId,
                PolicyRunMode.Simulate,
                new[] { status },
                cancellationToken).ConfigureAwait(false);
            queueCounts[status.ToString().ToLowerInvariant()] = count;
            totalQueueDepth += count;
        }

        // Publish the fresh counts for the observable gauge callback.
        lock (_snapshotLock)
        {
            _latestQueueSnapshot = queueCounts;
        }

        var recentJobs = await _repository.ListAsync(
            tenantId,
            policyId: null,
            mode: PolicyRunMode.Simulate,
            statuses: TerminalStatuses,
            queuedAfter: null,
            limit: LatencySampleSize,
            cancellationToken: cancellationToken).ConfigureAwait(false);

        // Job latency is derived purely from the job's own timestamps, so no clock read is needed here.
        // Sorting ascending is required by Percentile.
        var durations = recentJobs
            .Select(static job => CalculateLatencySeconds(job))
            .Where(static duration => duration >= 0)
            .OrderBy(static duration => duration)
            .ToArray();

        var latencyMetrics = new PolicySimulationLatencyMetrics(
            durations.Length,
            Percentile(durations, 0.50),
            Percentile(durations, 0.90),
            Percentile(durations, 0.95),
            Percentile(durations, 0.99),
            Average(durations));

        return new PolicySimulationMetricsResponse(
            new PolicySimulationQueueDepth(totalQueueDepth, queueCounts),
            latencyMetrics);
    }

    /// <summary>
    /// Records the latency of <paramref name="status"/> in the latency histogram. Nothing is
    /// recorded when the status carries no queue timestamp.
    /// </summary>
    /// <param name="status">Simulation status to measure.</param>
    /// <param name="observedAt">Used as the end time when the status has no finish timestamp.</param>
    /// <exception cref="ArgumentNullException"><paramref name="status"/> is <c>null</c>.</exception>
    public void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt)
    {
        if (status is null)
        {
            throw new ArgumentNullException(nameof(status));
        }

        var latencySeconds = CalculateLatencySeconds(status, observedAt);
        if (latencySeconds >= 0)
        {
            _latencyHistogram.Record(latencySeconds);
        }
    }

    /// <summary>Gauge callback: emits one measurement per status from the latest captured snapshot.</summary>
    private IEnumerable<Measurement<long>> ObserveQueueDepth()
    {
        IReadOnlyDictionary<string, long> snapshot;
        lock (_snapshotLock)
        {
            snapshot = _latestQueueSnapshot;
        }

        foreach (var pair in snapshot)
        {
            yield return new Measurement<long>(
                pair.Value,
                new KeyValuePair<string, object?>("status", pair.Key));
        }
    }

    /// <summary>
    /// Latency of a job in seconds, measured from queue (or creation) time to completion,
    /// cancellation or last update. Returns -1 when no start time is available; negative
    /// spans are clamped to 0.
    /// </summary>
    private static double CalculateLatencySeconds(PolicyRunJob job)
    {
        var started = job.QueuedAt ?? job.CreatedAt;
        var finished = job.CompletedAt ?? job.CancelledAt ?? job.UpdatedAt;
        if (started == default)
        {
            return -1;
        }

        var duration = (finished - started).TotalSeconds;
        return duration < 0 ? 0 : duration;
    }

    /// <summary>
    /// Latency of a run status in seconds, measured from queue time to finish time, or to
    /// <paramref name="now"/> when the run has not finished. Returns -1 when no queue time is
    /// available; negative spans are clamped to 0.
    /// </summary>
    private static double CalculateLatencySeconds(PolicyRunStatus status, DateTimeOffset now)
    {
        var started = status.QueuedAt;
        var finished = status.FinishedAt ?? now;
        if (started == default)
        {
            return -1;
        }

        var duration = (finished - started).TotalSeconds;
        return duration < 0 ? 0 : duration;
    }

    /// <summary>
    /// Linearly interpolated percentile of an ascending-sorted list, rounded to four decimals.
    /// </summary>
    /// <param name="values">Samples sorted in ascending order.</param>
    /// <param name="percentile">Fraction in the range 0..1.</param>
    /// <returns>The percentile, or <c>null</c> when there are no samples.</returns>
    private static double? Percentile(IReadOnlyList<double> values, double percentile)
    {
        if (values.Count == 0)
        {
            return null;
        }

        var position = percentile * (values.Count - 1);
        var lowerIndex = (int)Math.Floor(position);
        var upperIndex = (int)Math.Ceiling(position);

        if (lowerIndex == upperIndex)
        {
            return Math.Round(values[lowerIndex], 4);
        }

        var fraction = position - lowerIndex;
        var interpolated = values[lowerIndex] + (values[upperIndex] - values[lowerIndex]) * fraction;
        return Math.Round(interpolated, 4);
    }

    /// <summary>Arithmetic mean rounded to four decimals, or <c>null</c> when there are no samples.</summary>
    private static double? Average(IReadOnlyList<double> values)
    {
        if (values.Count == 0)
        {
            return null;
        }

        var sum = values.Sum();
        return Math.Round(sum / values.Count, 4);
    }

    /// <summary>Disposes the underlying meter; subsequent calls are no-ops.</summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _meter.Dispose();
        _disposed = true;
    }
}
|
||||
|
||||
/// <summary>
/// JSON payload combining queue-depth and latency metrics for policy simulations.
/// </summary>
/// <param name="QueueDepth">Serialized as <c>policy_simulation_queue_depth</c>.</param>
/// <param name="Latency">Serialized as <c>policy_simulation_latency</c>.</param>
internal sealed record PolicySimulationMetricsResponse(
|
||||
[property: JsonPropertyName("policy_simulation_queue_depth")] PolicySimulationQueueDepth QueueDepth,
|
||||
[property: JsonPropertyName("policy_simulation_latency")] PolicySimulationLatencyMetrics Latency);
|
||||
|
||||
/// <summary>
/// Queue depth of policy simulations, in total and broken down by status.
/// </summary>
/// <param name="Total">Serialized as <c>total</c>.</param>
/// <param name="ByStatus">Serialized as <c>by_status</c>; maps a status name to its count.</param>
internal sealed record PolicySimulationQueueDepth(
|
||||
[property: JsonPropertyName("total")] long Total,
|
||||
[property: JsonPropertyName("by_status")] IReadOnlyDictionary<string, long> ByStatus);
|
||||
|
||||
/// <summary>
/// Latency statistics, in seconds, over a sample of policy simulations.
/// The statistic fields are nullable so that "no samples" can be represented.
/// </summary>
/// <param name="Samples">Serialized as <c>samples</c>; number of samples behind the statistics.</param>
/// <param name="P50">Serialized as <c>p50_seconds</c>.</param>
/// <param name="P90">Serialized as <c>p90_seconds</c>.</param>
/// <param name="P95">Serialized as <c>p95_seconds</c>.</param>
/// <param name="P99">Serialized as <c>p99_seconds</c>.</param>
/// <param name="Mean">Serialized as <c>mean_seconds</c>.</param>
internal sealed record PolicySimulationLatencyMetrics(
|
||||
[property: JsonPropertyName("samples")] int Samples,
|
||||
[property: JsonPropertyName("p50_seconds")] double? P50,
|
||||
[property: JsonPropertyName("p90_seconds")] double? P90,
|
||||
[property: JsonPropertyName("p95_seconds")] double? P95,
|
||||
[property: JsonPropertyName("p99_seconds")] double? P99,
|
||||
[property: JsonPropertyName("mean_seconds")] double? Mean);
|
||||
@@ -0,0 +1,198 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
|
||||
/// <summary>
/// Streams the progress of a policy simulation over an HTTP response.
/// </summary>
internal interface IPolicySimulationStreamCoordinator
|
||||
{
|
||||
/// <summary>Writes the stream for one simulation to the response of <paramref name="context"/>.</summary>
/// <param name="context">HTTP context whose response receives the stream.</param>
/// <param name="tenantId">Tenant that owns the simulation.</param>
/// <param name="initialStatus">Status of the simulation at the time streaming starts.</param>
/// <param name="cancellationToken">Cancels the stream.</param>
Task StreamAsync(HttpContext context, string tenantId, PolicyRunStatus initialStatus, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Streams the lifecycle of a policy simulation to an HTTP client as server-sent events.
/// </summary>
/// <remarks>
/// Events written: <c>initial</c>, <c>queueLag</c> and <c>heartbeat</c> once at the start, then
/// <c>status</c> whenever a polled status differs meaningfully from the previous one,
/// <c>queueLag</c> and <c>heartbeat</c> on their own intervals, <c>notFound</c> if the run
/// disappears, and <c>completed</c> when the run reaches a terminal state.
/// </remarks>
internal sealed class PolicySimulationStreamCoordinator : IPolicySimulationStreamCoordinator
{
    private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);

    private readonly IPolicyRunService _policyRunService;
    private readonly IQueueLagSummaryProvider _queueLagProvider;
    private readonly TimeProvider _timeProvider;
    private readonly RunStreamOptions _options;
    private readonly IPolicySimulationMetricsRecorder? _metricsRecorder;
    private readonly ILogger<PolicySimulationStreamCoordinator> _logger;

    /// <summary>Creates the coordinator.</summary>
    /// <param name="policyRunService">Used to poll the current simulation status.</param>
    /// <param name="queueLagProvider">Supplies queue-lag summaries for <c>queueLag</c> events.</param>
    /// <param name="options">Stream timing options; validated on construction.</param>
    /// <param name="timeProvider">Clock; falls back to <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <param name="logger">Logger for stream diagnostics.</param>
    /// <param name="metricsRecorder">Optional recorder notified when a simulation is seen in a terminal state.</param>
    /// <exception cref="ArgumentNullException">A required dependency is <c>null</c>.</exception>
    public PolicySimulationStreamCoordinator(
        IPolicyRunService policyRunService,
        IQueueLagSummaryProvider queueLagProvider,
        IOptions<RunStreamOptions> options,
        TimeProvider? timeProvider,
        ILogger<PolicySimulationStreamCoordinator> logger,
        IPolicySimulationMetricsRecorder? metricsRecorder = null)
    {
        _policyRunService = policyRunService ?? throw new ArgumentNullException(nameof(policyRunService));
        _queueLagProvider = queueLagProvider ?? throw new ArgumentNullException(nameof(queueLagProvider));
        _options = (options ?? throw new ArgumentNullException(nameof(options))).Value.Validate();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _metricsRecorder = metricsRecorder;
    }

    /// <summary>
    /// Writes the event stream for a simulation until it reaches a terminal state, disappears,
    /// or the request is cancelled.
    /// </summary>
    /// <param name="context">HTTP context whose response receives the events.</param>
    /// <param name="tenantId">Tenant that owns the simulation; must not be blank.</param>
    /// <param name="initialStatus">Status sent in the <c>initial</c> event and used as the comparison baseline.</param>
    /// <param name="cancellationToken">Cancels the stream; cancellation ends the method without throwing.</param>
    /// <exception cref="ArgumentNullException"><paramref name="context"/> or <paramref name="initialStatus"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException"><paramref name="tenantId"/> is null or whitespace.</exception>
    public async Task StreamAsync(HttpContext context, string tenantId, PolicyRunStatus initialStatus, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(context);
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        ArgumentNullException.ThrowIfNull(initialStatus);

        ConfigureSseHeaders(context.Response);
        await SseWriter.WriteRetryAsync(context.Response, _options.ReconnectDelay, cancellationToken).ConfigureAwait(false);

        var last = initialStatus;
        await SseWriter.WriteEventAsync(context.Response, "initial", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken).ConfigureAwait(false);
        await SseWriter.WriteEventAsync(context.Response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
        await SseWriter.WriteEventAsync(context.Response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);

        // Already finished: report completion immediately and skip polling.
        if (IsTerminal(last.Status))
        {
            _metricsRecorder?.RecordLatency(last, _timeProvider.GetUtcNow());
            await SseWriter.WriteEventAsync(context.Response, "completed", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken).ConfigureAwait(false);
            return;
        }

        using var pollTimer = new PeriodicTimer(_options.PollInterval);
        using var queueTimer = new PeriodicTimer(_options.QueueLagInterval);
        using var heartbeatTimer = new PeriodicTimer(_options.HeartbeatInterval);

        try
        {
            // A PeriodicTimer supports only one outstanding WaitForNextTickAsync call; issuing a
            // second one while the first is pending throws InvalidOperationException. Each timer
            // is therefore armed once here and re-armed only after its tick has been consumed.
            var pollTask = pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();
            var queueTask = queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();
            var heartbeatTask = heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();

            while (!cancellationToken.IsCancellationRequested)
            {
                var completed = await Task.WhenAny(pollTask, queueTask, heartbeatTask).ConfigureAwait(false);

                // Awaiting the winner surfaces cancellation; a false result means the timer was stopped.
                if (!await completed.ConfigureAwait(false))
                {
                    break;
                }

                if (completed == pollTask)
                {
                    pollTask = pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();

                    var current = await _policyRunService
                        .GetAsync(tenantId, last.RunId, cancellationToken)
                        .ConfigureAwait(false);

                    if (current is null)
                    {
                        _logger.LogWarning("Policy simulation {RunId} disappeared while streaming.", last.RunId);
                        await SseWriter.WriteEventAsync(
                                context.Response,
                                "notFound",
                                new PolicySimulationNotFoundPayload(last.RunId),
                                SerializerOptions,
                                cancellationToken)
                            .ConfigureAwait(false);
                        break;
                    }

                    if (HasMeaningfulChange(last, current))
                    {
                        await SseWriter.WriteEventAsync(context.Response, "status", PolicySimulationPayload.From(current), SerializerOptions, cancellationToken)
                            .ConfigureAwait(false);
                    }

                    last = current;

                    if (IsTerminal(last.Status))
                    {
                        _metricsRecorder?.RecordLatency(last, _timeProvider.GetUtcNow());
                        await SseWriter.WriteEventAsync(context.Response, "completed", PolicySimulationPayload.From(last), SerializerOptions, cancellationToken)
                            .ConfigureAwait(false);
                        break;
                    }
                }
                else if (completed == queueTask)
                {
                    queueTask = queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();

                    var summary = _queueLagProvider.Capture();
                    await SseWriter.WriteEventAsync(context.Response, "queueLag", summary, SerializerOptions, cancellationToken)
                        .ConfigureAwait(false);
                }
                else
                {
                    heartbeatTask = heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();

                    await SseWriter.WriteEventAsync(context.Response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken)
                        .ConfigureAwait(false);
                }
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Client disconnects are expected for long-lived streams; not an error.
            _logger.LogDebug("Policy simulation stream cancelled for run {RunId}.", last.RunId);
        }
    }

    /// <summary>Sets the status code and headers for a server-sent-events response.</summary>
    private static void ConfigureSseHeaders(HttpResponse response)
    {
        response.StatusCode = StatusCodes.Status200OK;
        response.Headers.CacheControl = "no-store";
        response.Headers["X-Accel-Buffering"] = "no";
        response.Headers["Connection"] = "keep-alive";
        response.ContentType = "text/event-stream";
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="current"/> differs from <paramref name="previous"/>
    /// in status, start/finish time, attempts, error details, determinism hash, cancellation
    /// details or stats.
    /// </summary>
    private static bool HasMeaningfulChange(PolicyRunStatus previous, PolicyRunStatus current)
    {
        if (!EqualityComparer<PolicyRunExecutionStatus>.Default.Equals(previous.Status, current.Status))
        {
            return true;
        }

        if (!Nullable.Equals(previous.StartedAt, current.StartedAt) || !Nullable.Equals(previous.FinishedAt, current.FinishedAt))
        {
            return true;
        }

        if (previous.Attempts != current.Attempts)
        {
            return true;
        }

        if (!string.Equals(previous.Error, current.Error, StringComparison.Ordinal) ||
            !string.Equals(previous.ErrorCode, current.ErrorCode, StringComparison.Ordinal) ||
            !string.Equals(previous.DeterminismHash, current.DeterminismHash, StringComparison.Ordinal))
        {
            return true;
        }

        if (previous.CancellationRequested != current.CancellationRequested ||
            !Nullable.Equals(previous.CancellationRequestedAt, current.CancellationRequestedAt) ||
            !string.Equals(previous.CancellationReason, current.CancellationReason, StringComparison.Ordinal))
        {
            return true;
        }

        if (!EqualityComparer<PolicyRunStats>.Default.Equals(previous.Stats, current.Stats))
        {
            return true;
        }

        return false;
    }

    /// <summary>A run is terminal once it has succeeded, failed or been cancelled.</summary>
    private static bool IsTerminal(PolicyRunExecutionStatus status)
        => status is PolicyRunExecutionStatus.Succeeded or PolicyRunExecutionStatus.Failed or PolicyRunExecutionStatus.Cancelled;

    /// <summary>Event payload wrapping a simulation status under the <c>simulation</c> key.</summary>
    private sealed record PolicySimulationPayload(
        [property: JsonPropertyName("simulation")] PolicyRunStatus Simulation)
    {
        public static PolicySimulationPayload From(PolicyRunStatus status) => new(status);
    }

    /// <summary>Payload of the <c>notFound</c> event.</summary>
    private sealed record PolicySimulationNotFoundPayload(
        [property: JsonPropertyName("runId")] string RunId);

    /// <summary>Payload of the <c>heartbeat</c> event; carries the emission time as <c>ts</c>.</summary>
    private sealed record HeartbeatPayload(
        [property: JsonPropertyName("ts")] DateTimeOffset Timestamp)
    {
        public static HeartbeatPayload Create(DateTimeOffset timestamp) => new(timestamp);
    }
}
|
||||
@@ -18,8 +18,9 @@ using StellaOps.Scheduler.WebService.GraphJobs;
|
||||
using StellaOps.Scheduler.WebService.GraphJobs.Events;
|
||||
using StellaOps.Scheduler.WebService.Schedules;
|
||||
using StellaOps.Scheduler.WebService.Options;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
using StellaOps.Scheduler.WebService.PolicyRuns;
|
||||
using StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
var builder = WebApplication.CreateBuilder(args);
|
||||
|
||||
@@ -84,6 +85,8 @@ if (storageSection.Exists())
|
||||
builder.Services.AddSchedulerMongoStorage(storageSection);
|
||||
builder.Services.AddSingleton<IGraphJobStore, MongoGraphJobStore>();
|
||||
builder.Services.AddSingleton<IPolicyRunService, PolicyRunService>();
|
||||
builder.Services.AddSingleton<IPolicySimulationMetricsProvider, PolicySimulationMetricsProvider>();
|
||||
builder.Services.AddSingleton<IPolicySimulationMetricsRecorder>(static sp => (IPolicySimulationMetricsRecorder)sp.GetRequiredService<IPolicySimulationMetricsProvider>());
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -117,6 +120,12 @@ builder.Services.AddOptions<SchedulerOptions>()
|
||||
.Bind(builder.Configuration.GetSection("Scheduler"))
|
||||
.PostConfigure(options => options.Validate());
|
||||
|
||||
builder.Services.AddSingleton<IQueueLagSummaryProvider, QueueLagSummaryProvider>();
|
||||
builder.Services.AddSingleton<IRunStreamCoordinator, RunStreamCoordinator>();
|
||||
builder.Services.AddSingleton<IPolicySimulationStreamCoordinator, PolicySimulationStreamCoordinator>();
|
||||
builder.Services.AddOptions<RunStreamOptions>()
|
||||
.Bind(builder.Configuration.GetSection("Scheduler:RunStream"));
|
||||
|
||||
var pluginHostOptions = SchedulerPluginHostFactory.Build(schedulerOptions.Plugins, builder.Environment.ContentRootPath);
|
||||
builder.Services.AddSingleton(pluginHostOptions);
|
||||
builder.Services.RegisterPluginRoutines(builder.Configuration, pluginHostOptions);
|
||||
@@ -196,6 +205,7 @@ app.MapGraphJobEndpoints();
|
||||
app.MapScheduleEndpoints();
|
||||
app.MapRunEndpoints();
|
||||
app.MapPolicyRunEndpoints();
|
||||
app.MapPolicySimulationEndpoints();
|
||||
app.MapSchedulerEventWebhookEndpoints();
|
||||
|
||||
app.Run();
|
||||
|
||||
@@ -0,0 +1,60 @@
|
||||
using System;
|
||||
using System.Collections.Immutable;
|
||||
using System.Linq;
|
||||
using StellaOps.Scheduler.Queue;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
/// <summary>
/// Produces a summary of current scheduler queue depths.
/// </summary>
internal interface IQueueLagSummaryProvider
|
||||
{
|
||||
/// <summary>Captures the current queue-lag summary.</summary>
QueueLagSummaryResponse Capture();
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds queue-lag summaries from the depth samples exposed by <see cref="SchedulerQueueMetrics"/>.
/// </summary>
internal sealed class QueueLagSummaryProvider : IQueueLagSummaryProvider
{
    private readonly TimeProvider _timeProvider;

    /// <summary>Creates the provider.</summary>
    /// <param name="timeProvider">Clock used for the capture timestamp; defaults to <see cref="TimeProvider.System"/>.</param>
    public QueueLagSummaryProvider(TimeProvider? timeProvider = null)
        => _timeProvider = timeProvider ?? TimeProvider.System;

    /// <summary>
    /// Captures the current depth samples and aggregates them.
    /// </summary>
    /// <returns>
    /// A summary stamped with the current time containing the total depth, the largest single
    /// depth, and one entry per sample ordered ordinally by transport and then by queue. With no
    /// samples both totals are zero and the entry list is empty.
    /// </returns>
    public QueueLagSummaryResponse Capture()
    {
        var depthSamples = SchedulerQueueMetrics.CaptureDepthSamples();
        if (depthSamples.Count == 0)
        {
            return new QueueLagSummaryResponse(_timeProvider.GetUtcNow(), 0, 0, ImmutableArray<QueueLagEntry>.Empty);
        }

        var sorted = depthSamples
            .OrderBy(static item => item.Transport, StringComparer.Ordinal)
            .ThenBy(static item => item.Queue, StringComparer.Ordinal)
            .ToArray();

        var entries = ImmutableArray.CreateBuilder<QueueLagEntry>(sorted.Length);
        long depthSum = 0;
        long deepest = 0;

        for (var index = 0; index < sorted.Length; index++)
        {
            var item = sorted[index];

            depthSum += item.Depth;
            if (deepest < item.Depth)
            {
                deepest = item.Depth;
            }

            entries.Add(new QueueLagEntry(item.Transport, item.Queue, item.Depth));
        }

        var capturedAt = _timeProvider.GetUtcNow();
        return new QueueLagSummaryResponse(capturedAt, depthSum, deepest, entries.ToImmutable());
    }
}
|
||||
@@ -10,8 +10,9 @@ internal sealed record RunCreateRequest(
|
||||
[property: JsonPropertyName("reason")] RunReason? Reason = null,
|
||||
[property: JsonPropertyName("correlationId")] string? CorrelationId = null);
|
||||
|
||||
internal sealed record RunCollectionResponse(
|
||||
[property: JsonPropertyName("runs")] IReadOnlyList<Run> Runs);
|
||||
/// <summary>
/// JSON envelope for a page of runs.
/// </summary>
/// <param name="Runs">Serialized as <c>runs</c>.</param>
/// <param name="NextCursor">Serialized as <c>nextCursor</c>; optional cursor for requesting the following page.</param>
internal sealed record RunCollectionResponse(
|
||||
[property: JsonPropertyName("runs")] IReadOnlyList<Run> Runs,
|
||||
[property: JsonPropertyName("nextCursor")] string? NextCursor = null);
|
||||
|
||||
/// <summary>
/// JSON envelope wrapping a single run.
/// </summary>
/// <param name="Run">Serialized as <c>run</c>.</param>
internal sealed record RunResponse(
|
||||
[property: JsonPropertyName("run")] Run Run);
|
||||
@@ -31,10 +32,24 @@ internal sealed record ImpactPreviewResponse(
|
||||
[property: JsonPropertyName("snapshotId")] string? SnapshotId,
|
||||
[property: JsonPropertyName("sample")] ImmutableArray<ImpactPreviewSample> Sample);
|
||||
|
||||
internal sealed record ImpactPreviewSample(
|
||||
[property: JsonPropertyName("imageDigest")] string ImageDigest,
|
||||
[property: JsonPropertyName("registry")] string Registry,
|
||||
[property: JsonPropertyName("repository")] string Repository,
|
||||
[property: JsonPropertyName("namespaces")] ImmutableArray<string> Namespaces,
|
||||
[property: JsonPropertyName("tags")] ImmutableArray<string> Tags,
|
||||
[property: JsonPropertyName("usedByEntrypoint")] bool UsedByEntrypoint);
|
||||
/// <summary>
/// One sampled image in an impact preview response.
/// </summary>
/// <param name="ImageDigest">Serialized as <c>imageDigest</c>.</param>
/// <param name="Registry">Serialized as <c>registry</c>.</param>
/// <param name="Repository">Serialized as <c>repository</c>.</param>
/// <param name="Namespaces">Serialized as <c>namespaces</c>.</param>
/// <param name="Tags">Serialized as <c>tags</c>.</param>
/// <param name="UsedByEntrypoint">Serialized as <c>usedByEntrypoint</c>.</param>
internal sealed record ImpactPreviewSample(
|
||||
[property: JsonPropertyName("imageDigest")] string ImageDigest,
|
||||
[property: JsonPropertyName("registry")] string Registry,
|
||||
[property: JsonPropertyName("repository")] string Repository,
|
||||
[property: JsonPropertyName("namespaces")] ImmutableArray<string> Namespaces,
|
||||
[property: JsonPropertyName("tags")] ImmutableArray<string> Tags,
|
||||
[property: JsonPropertyName("usedByEntrypoint")] bool UsedByEntrypoint);
|
||||
|
||||
/// <summary>
/// JSON envelope wrapping the delta summaries of a run.
/// </summary>
/// <param name="Deltas">Serialized as <c>deltas</c>.</param>
internal sealed record RunDeltaCollectionResponse(
|
||||
[property: JsonPropertyName("deltas")] ImmutableArray<DeltaSummary> Deltas);
|
||||
|
||||
/// <summary>
/// JSON payload summarizing scheduler queue depths at a point in time.
/// </summary>
/// <param name="CapturedAt">Serialized as <c>capturedAt</c>; time the summary was taken.</param>
/// <param name="TotalDepth">Serialized as <c>totalDepth</c>; sum of the depths of all listed queues.</param>
/// <param name="MaxDepth">Serialized as <c>maxDepth</c>; largest depth among the listed queues.</param>
/// <param name="Queues">Serialized as <c>queues</c>; one entry per queue.</param>
internal sealed record QueueLagSummaryResponse(
|
||||
[property: JsonPropertyName("capturedAt")] DateTimeOffset CapturedAt,
|
||||
[property: JsonPropertyName("totalDepth")] long TotalDepth,
|
||||
[property: JsonPropertyName("maxDepth")] long MaxDepth,
|
||||
[property: JsonPropertyName("queues")] ImmutableArray<QueueLagEntry> Queues);
|
||||
|
||||
/// <summary>
/// Depth of a single queue within a queue-lag summary.
/// </summary>
/// <param name="Transport">Serialized as <c>transport</c>.</param>
/// <param name="Queue">Serialized as <c>queue</c>.</param>
/// <param name="Depth">Serialized as <c>depth</c>.</param>
internal sealed record QueueLagEntry(
|
||||
[property: JsonPropertyName("transport")] string Transport,
|
||||
[property: JsonPropertyName("queue")] string Queue,
|
||||
[property: JsonPropertyName("depth")] long Depth);
|
||||
|
||||
@@ -3,7 +3,8 @@ using System.Collections.Generic;
|
||||
using System.Collections.Immutable;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Linq;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using System.Threading;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using Microsoft.AspNetCore.Routing;
|
||||
using Microsoft.Extensions.Primitives;
|
||||
@@ -15,31 +16,57 @@ using StellaOps.Scheduler.WebService.Auth;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
internal static class RunEndpoints
|
||||
{
|
||||
private const string ReadScope = "scheduler.runs.read";
|
||||
private const string WriteScope = "scheduler.runs.write";
|
||||
private const string PreviewScope = "scheduler.runs.preview";
|
||||
internal static class RunEndpoints
|
||||
{
|
||||
private const string ReadScope = "scheduler.runs.read";
|
||||
private const string WriteScope = "scheduler.runs.write";
|
||||
private const string PreviewScope = "scheduler.runs.preview";
|
||||
private const string ManageScope = "scheduler.runs.manage";
|
||||
private const int DefaultRunListLimit = 50;
|
||||
|
||||
/// <summary>
/// Registers the run endpoints beneath <c>/api/v1/scheduler/runs</c>.
/// </summary>
/// <param name="routes">Route builder to extend.</param>
/// <returns>The same <paramref name="routes"/> instance, for chaining.</returns>
public static IEndpointRouteBuilder MapRunEndpoints(this IEndpointRouteBuilder routes)
{
    var runRoutes = routes.MapGroup("/api/v1/scheduler/runs");

    // Queries.
    runRoutes.MapGet("/", ListRunsAsync);
    runRoutes.MapGet("/queue/lag", GetQueueLagAsync);
    runRoutes.MapGet("/{runId}/deltas", GetRunDeltasAsync);
    runRoutes.MapGet("/{runId}/stream", StreamRunAsync);
    runRoutes.MapGet("/{runId}", GetRunAsync);

    // Commands.
    runRoutes.MapPost("/", CreateRunAsync);
    runRoutes.MapPost("/{runId}/cancel", CancelRunAsync);
    runRoutes.MapPost("/{runId}/retry", RetryRunAsync);
    runRoutes.MapPost("/preview", PreviewImpactAsync);

    return routes;
}
|
||||
|
||||
public static IEndpointRouteBuilder MapRunEndpoints(this IEndpointRouteBuilder routes)
|
||||
{
|
||||
var group = routes.MapGroup("/api/v1/scheduler/runs");
|
||||
|
||||
group.MapGet("/", ListRunsAsync);
|
||||
group.MapGet("/{runId}", GetRunAsync);
|
||||
group.MapPost("/", CreateRunAsync);
|
||||
group.MapPost("/{runId}/cancel", CancelRunAsync);
|
||||
group.MapPost("/preview", PreviewImpactAsync);
|
||||
|
||||
return routes;
|
||||
}
|
||||
|
||||
private static async Task<IResult> ListRunsAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
/// <summary>
/// Returns the current queue-lag summary.
/// </summary>
/// <remarks>
/// The read scope is enforced and the tenant is resolved before the summary is captured; the
/// resolved tenant itself is not used further. Argument and validation failures are reported
/// as 400 responses carrying the exception message.
/// </remarks>
/// <param name="httpContext">Current HTTP context.</param>
/// <param name="tenantAccessor">Resolves the tenant for the request.</param>
/// <param name="scopeAuthorizer">Verifies that the caller holds the read scope.</param>
/// <param name="queueLagProvider">Captures the summary.</param>
/// <returns>200 with the summary, or 400 with an error body.</returns>
private static IResult GetQueueLagAsync(
    HttpContext httpContext,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IQueueLagSummaryProvider queueLagProvider)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, ReadScope);
        _ = tenantAccessor.GetTenant(httpContext);

        return Results.Ok(queueLagProvider.Capture());
    }
    catch (ArgumentException argumentError)
    {
        return Results.BadRequest(new { error = argumentError.Message });
    }
    catch (ValidationException validationError)
    {
        return Results.BadRequest(new { error = validationError.Message });
    }
}
|
||||
|
||||
private static async Task<IResult> ListRunsAsync(
|
||||
HttpContext httpContext,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
try
|
||||
{
|
||||
@@ -50,24 +77,35 @@ internal static class RunEndpoints
|
||||
? scheduleValues.ToString().Trim()
|
||||
: null;
|
||||
|
||||
var states = ParseRunStates(httpContext.Request.Query.TryGetValue("state", out var stateValues) ? stateValues : StringValues.Empty);
|
||||
var createdAfter = SchedulerEndpointHelpers.TryParseDateTimeOffset(httpContext.Request.Query.TryGetValue("createdAfter", out var createdAfterValues) ? createdAfterValues.ToString() : null);
|
||||
var limit = SchedulerEndpointHelpers.TryParsePositiveInt(httpContext.Request.Query.TryGetValue("limit", out var limitValues) ? limitValues.ToString() : null);
|
||||
|
||||
var sortAscending = httpContext.Request.Query.TryGetValue("sort", out var sortValues) &&
|
||||
sortValues.Any(value => string.Equals(value, "asc", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var options = new RunQueryOptions
|
||||
{
|
||||
ScheduleId = string.IsNullOrWhiteSpace(scheduleId) ? null : scheduleId,
|
||||
States = states,
|
||||
CreatedAfter = createdAfter,
|
||||
Limit = limit,
|
||||
SortAscending = sortAscending,
|
||||
};
|
||||
|
||||
var runs = await repository.ListAsync(tenant.TenantId, options, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
return Results.Ok(new RunCollectionResponse(runs));
|
||||
var states = ParseRunStates(httpContext.Request.Query.TryGetValue("state", out var stateValues) ? stateValues : StringValues.Empty);
|
||||
var createdAfter = SchedulerEndpointHelpers.TryParseDateTimeOffset(httpContext.Request.Query.TryGetValue("createdAfter", out var createdAfterValues) ? createdAfterValues.ToString() : null);
|
||||
var limit = SchedulerEndpointHelpers.TryParsePositiveInt(httpContext.Request.Query.TryGetValue("limit", out var limitValues) ? limitValues.ToString() : null);
|
||||
var cursor = SchedulerEndpointHelpers.TryParseRunCursor(httpContext.Request.Query.TryGetValue("cursor", out var cursorValues) ? cursorValues.ToString() : null);
|
||||
|
||||
var sortAscending = httpContext.Request.Query.TryGetValue("sort", out var sortValues) &&
|
||||
sortValues.Any(value => string.Equals(value, "asc", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var appliedLimit = limit ?? DefaultRunListLimit;
|
||||
var options = new RunQueryOptions
|
||||
{
|
||||
ScheduleId = string.IsNullOrWhiteSpace(scheduleId) ? null : scheduleId,
|
||||
States = states,
|
||||
CreatedAfter = createdAfter,
|
||||
Cursor = cursor,
|
||||
Limit = appliedLimit,
|
||||
SortAscending = sortAscending,
|
||||
};
|
||||
|
||||
var runs = await repository.ListAsync(tenant.TenantId, options, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
|
||||
string? nextCursor = null;
|
||||
if (runs.Count == appliedLimit && runs.Count > 0)
|
||||
{
|
||||
var last = runs[^1];
|
||||
nextCursor = SchedulerEndpointHelpers.CreateRunCursor(last);
|
||||
}
|
||||
|
||||
return Results.Ok(new RunCollectionResponse(runs, nextCursor));
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
@@ -75,32 +113,59 @@ internal static class RunEndpoints
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Returns a single run of the current tenant.
/// </summary>
/// <param name="httpContext">Current request; used for the scope check and tenant resolution.</param>
/// <param name="runId">Identifier of the run to look up.</param>
/// <param name="tenantAccessor">Resolves the tenant from the request.</param>
/// <param name="scopeAuthorizer">Enforces <c>ReadScope</c> on the request.</param>
/// <param name="repository">Run store queried by tenant id and run id.</param>
/// <param name="cancellationToken">Propagated to the repository call.</param>
/// <returns>
/// 200 with a <c>RunResponse</c> when the run exists, 404 when the repository returns null,
/// 400 when an <see cref="ArgumentException"/> or <see cref="ValidationException"/> is raised.
/// </returns>
private static async Task<IResult> GetRunAsync(
    HttpContext httpContext,
    string runId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IRunRepository repository,
    CancellationToken cancellationToken)
{
    try
    {
        // Authorize first, then resolve the tenant, exactly in that order.
        scopeAuthorizer.EnsureScope(httpContext, ReadScope);
        var tenantId = tenantAccessor.GetTenant(httpContext).TenantId;

        var found = await repository
            .GetAsync(tenantId, runId, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        return found is null
            ? Results.NotFound()
            : Results.Ok(new RunResponse(found));
    }
    catch (Exception ex) when (ex is ArgumentException or ValidationException)
    {
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
||||
/// <summary>
/// Returns a single run of the current tenant.
/// </summary>
/// <param name="httpContext">Current request; used for the scope check and tenant resolution.</param>
/// <param name="runId">Identifier of the run to look up.</param>
/// <param name="tenantAccessor">Resolves the tenant from the request.</param>
/// <param name="scopeAuthorizer">Enforces <c>ReadScope</c> on the request.</param>
/// <param name="repository">Run store queried by tenant id and run id.</param>
/// <param name="cancellationToken">Propagated to the repository call.</param>
/// <returns>
/// 200 with a <c>RunResponse</c> when the run exists, 404 when the repository returns null,
/// 400 when an <see cref="ArgumentException"/> or <see cref="ValidationException"/> is raised.
/// </returns>
private static async Task<IResult> GetRunAsync(
    HttpContext httpContext,
    string runId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IRunRepository repository,
    CancellationToken cancellationToken)
{
    try
    {
        // Authorize first, then resolve the tenant, exactly in that order.
        scopeAuthorizer.EnsureScope(httpContext, ReadScope);
        var tenantId = tenantAccessor.GetTenant(httpContext).TenantId;

        var found = await repository
            .GetAsync(tenantId, runId, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        return found is null
            ? Results.NotFound()
            : Results.Ok(new RunResponse(found));
    }
    catch (Exception ex) when (ex is ArgumentException or ValidationException)
    {
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
||||
|
||||
/// <summary>
/// Returns the delta collection recorded on a single run of the current tenant.
/// </summary>
/// <param name="httpContext">Current request; used for the scope check and tenant resolution.</param>
/// <param name="runId">Identifier of the run whose deltas are requested.</param>
/// <param name="tenantAccessor">Resolves the tenant from the request.</param>
/// <param name="scopeAuthorizer">Enforces <c>ReadScope</c> on the request.</param>
/// <param name="repository">Run store queried by tenant id and run id.</param>
/// <param name="cancellationToken">Propagated to the repository call.</param>
/// <returns>
/// 200 with a <c>RunDeltaCollectionResponse</c> built from <c>run.Deltas</c>, 404 when the run is
/// not found, 400 when an <see cref="ArgumentException"/> or <see cref="ValidationException"/> is raised.
/// </returns>
private static async Task<IResult> GetRunDeltasAsync(
    HttpContext httpContext,
    string runId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IRunRepository repository,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, ReadScope);
        var tenantId = tenantAccessor.GetTenant(httpContext).TenantId;

        var found = await repository
            .GetAsync(tenantId, runId, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        return found is null
            ? Results.NotFound()
            : Results.Ok(new RunDeltaCollectionResponse(found.Deltas));
    }
    catch (Exception ex) when (ex is ArgumentException or ValidationException)
    {
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
||||
|
||||
private static async Task<IResult> CreateRunAsync(
|
||||
HttpContext httpContext,
|
||||
@@ -116,7 +181,7 @@ internal static class RunEndpoints
|
||||
{
|
||||
try
|
||||
{
|
||||
scopeAuthorizer.EnsureScope(httpContext, WriteScope);
|
||||
scopeAuthorizer.EnsureScope(httpContext, ManageScope);
|
||||
var tenant = tenantAccessor.GetTenant(httpContext);
|
||||
|
||||
if (string.IsNullOrWhiteSpace(request.ScheduleId))
|
||||
@@ -184,11 +249,11 @@ internal static class RunEndpoints
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task<IResult> CancelRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
private static async Task<IResult> CancelRunAsync(
|
||||
HttpContext httpContext,
|
||||
string runId,
|
||||
[FromServices] ITenantContextAccessor tenantAccessor,
|
||||
[FromServices] IScopeAuthorizer scopeAuthorizer,
|
||||
[FromServices] IRunRepository repository,
|
||||
[FromServices] IRunSummaryService runSummaryService,
|
||||
[FromServices] ISchedulerAuditService auditService,
|
||||
@@ -243,9 +308,145 @@ internal static class RunEndpoints
|
||||
}
|
||||
catch (Exception ex) when (ex is ArgumentException or ValidationException)
|
||||
{
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
return Results.BadRequest(new { error = ex.Message });
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a new manual run that retries an existing run which has reached a terminal state.
/// </summary>
/// <remarks>
/// The new run starts in <c>RunState.Planning</c>, carries <c>retryOf</c> pointing at the original
/// run, and its manual reason is the original manual reason (if any) followed by
/// <c>retry-of:&lt;original id&gt;</c>. After the insert the run summary is projected and an audit
/// event with action <c>retry</c> is written.
/// </remarks>
/// <param name="httpContext">Current request; used for the scope check, tenant and audit actor.</param>
/// <param name="runId">Identifier of the run to retry.</param>
/// <param name="tenantAccessor">Resolves the tenant from the request.</param>
/// <param name="scopeAuthorizer">Enforces <c>ManageScope</c> on the request.</param>
/// <param name="scheduleRepository">Used to confirm the run's schedule still exists.</param>
/// <param name="runRepository">Used to load the original run and insert the retry.</param>
/// <param name="runSummaryService">Receives the new run for summary projection.</param>
/// <param name="auditService">Receives the retry audit event.</param>
/// <param name="timeProvider">Supplies the creation timestamp of the new run.</param>
/// <param name="cancellationToken">Propagated to every asynchronous call.</param>
/// <returns>
/// 201 with the new run; 404 when the original run is missing; 400 when it has no schedule, the
/// schedule no longer exists, or an <see cref="InvalidOperationException"/>,
/// <see cref="ArgumentException"/> or <see cref="ValidationException"/> is raised; 409 when the
/// original run is not terminal.
/// </returns>
private static async Task<IResult> RetryRunAsync(
    HttpContext httpContext,
    string runId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IScheduleRepository scheduleRepository,
    [FromServices] IRunRepository runRepository,
    [FromServices] IRunSummaryService runSummaryService,
    [FromServices] ISchedulerAuditService auditService,
    [FromServices] TimeProvider timeProvider,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, ManageScope);
        var tenantId = tenantAccessor.GetTenant(httpContext).TenantId;

        var original = await runRepository
            .GetAsync(tenantId, runId, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        // Preconditions, checked in the same order as before: exists, has a schedule, is terminal.
        if (original is null)
        {
            return Results.NotFound();
        }

        if (string.IsNullOrWhiteSpace(original.ScheduleId))
        {
            return Results.BadRequest(new { error = "Run cannot be retried because it is not associated with a schedule." });
        }

        if (!RunStateMachine.IsTerminal(original.State))
        {
            return Results.Conflict(new { error = "Run is not in a terminal state and cannot be retried." });
        }

        // The schedule is only loaded to prove it still exists.
        var owningSchedule = await scheduleRepository
            .GetAsync(tenantId, original.ScheduleId!, cancellationToken: cancellationToken)
            .ConfigureAwait(false);
        if (owningSchedule is null)
        {
            return Results.BadRequest(new { error = "Associated schedule no longer exists." });
        }

        var createdAt = timeProvider.GetUtcNow();
        var retryId = SchedulerEndpointHelpers.GenerateIdentifier("run");

        // Carry the original reason forward and append the provenance marker.
        var priorReason = original.Reason ?? RunReason.Empty;
        var provenance = $"retry-of:{original.Id}";
        var manualReason = string.IsNullOrWhiteSpace(priorReason.ManualReason)
            ? provenance
            : $"{priorReason.ManualReason};{provenance}";

        var retryReason = new RunReason(
            manualReason,
            priorReason.ConselierExportId,
            priorReason.ExcitorExportId,
            priorReason.Cursor)
        {
            ImpactWindowFrom = priorReason.ImpactWindowFrom,
            ImpactWindowTo = priorReason.ImpactWindowTo
        };

        var retryRun = new Run(
            retryId,
            tenantId,
            RunTrigger.Manual,
            RunState.Planning,
            RunStats.Empty,
            createdAt,
            retryReason,
            original.ScheduleId,
            retryOf: original.Id);

        await runRepository.InsertAsync(retryRun, cancellationToken: cancellationToken).ConfigureAwait(false);

        if (!string.IsNullOrWhiteSpace(retryRun.ScheduleId))
        {
            await runSummaryService.ProjectAsync(retryRun, cancellationToken).ConfigureAwait(false);
        }

        var auditMetadata = BuildMetadata(
            ("state", retryRun.State.ToString().ToLowerInvariant()),
            ("retryOf", original.Id),
            ("trigger", retryRun.Trigger.ToString().ToLowerInvariant()));

        var auditEvent = new SchedulerAuditEvent(
            tenantId,
            "scheduler.run",
            "retry",
            SchedulerEndpointHelpers.ResolveAuditActor(httpContext),
            RunId: retryRun.Id,
            ScheduleId: retryRun.ScheduleId,
            Metadata: auditMetadata);

        await auditService.WriteAsync(auditEvent, cancellationToken).ConfigureAwait(false);

        return Results.Created($"/api/v1/scheduler/runs/{retryRun.Id}", new RunResponse(retryRun));
    }
    catch (Exception ex) when (ex is InvalidOperationException or ArgumentException or ValidationException)
    {
        // All three exception types map to the same 400 payload.
        return Results.BadRequest(new { error = ex.Message });
    }
}
|
||||
|
||||
/// <summary>
/// Endpoint handler that hands a run over to <c>IRunStreamCoordinator</c> for streaming.
/// </summary>
/// <remarks>
/// Writes the response directly instead of returning an <c>IResult</c>: 404 when the run is not
/// found, 400 for <see cref="ArgumentException"/>/<see cref="ValidationException"/> if the response
/// has not started yet. Cancellation triggered by <paramref name="cancellationToken"/> is swallowed.
/// </remarks>
/// <param name="httpContext">Current request/response.</param>
/// <param name="runId">Identifier of the run to stream.</param>
/// <param name="tenantAccessor">Resolves the tenant from the request.</param>
/// <param name="scopeAuthorizer">Enforces <c>ReadScope</c> on the request.</param>
/// <param name="runRepository">Used to load the run before streaming starts.</param>
/// <param name="runStreamCoordinator">Performs the actual streaming.</param>
/// <param name="cancellationToken">Request cancellation token.</param>
private static async Task StreamRunAsync(
    HttpContext httpContext,
    string runId,
    [FromServices] ITenantContextAccessor tenantAccessor,
    [FromServices] IScopeAuthorizer scopeAuthorizer,
    [FromServices] IRunRepository runRepository,
    [FromServices] IRunStreamCoordinator runStreamCoordinator,
    CancellationToken cancellationToken)
{
    try
    {
        scopeAuthorizer.EnsureScope(httpContext, ReadScope);
        var tenantId = tenantAccessor.GetTenant(httpContext).TenantId;

        var target = await runRepository
            .GetAsync(tenantId, runId, cancellationToken: cancellationToken)
            .ConfigureAwait(false);

        if (target is not null)
        {
            await runStreamCoordinator
                .StreamAsync(httpContext, tenantId, target, cancellationToken)
                .ConfigureAwait(false);
            return;
        }

        await Results.NotFound().ExecuteAsync(httpContext);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Client disconnected; nothing to do.
    }
    catch (Exception ex) when (ex is ArgumentException or ValidationException)
    {
        // Once the stream has begun the status code can no longer be changed.
        if (httpContext.Response.HasStarted)
        {
            return;
        }

        await Results.BadRequest(new { error = ex.Message }).ExecuteAsync(httpContext);
    }
}
|
||||
|
||||
private static async Task<IResult> PreviewImpactAsync(
|
||||
HttpContext httpContext,
|
||||
|
||||
@@ -0,0 +1,225 @@
|
||||
using System;
|
||||
using System.Collections.Immutable;
|
||||
using System.Linq;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
/// <summary>
/// Streams the progress of a single run to the HTTP response of a request.
/// </summary>
internal interface IRunStreamCoordinator
|
||||
{
|
||||
/// <summary>
/// Streams updates for <paramref name="initialRun"/> to <paramref name="context"/>'s response.
/// </summary>
/// <param name="context">HTTP context whose response receives the stream.</param>
/// <param name="tenantId">Tenant that owns the run.</param>
/// <param name="initialRun">Run snapshot loaded by the caller before streaming starts.</param>
/// <param name="cancellationToken">Signals that streaming should stop.</param>
Task StreamAsync(HttpContext context, string tenantId, Run initialRun, CancellationToken cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
/// Streams a run to the client as server-sent events by polling the run repository.
/// </summary>
/// <remarks>
/// Events written: <c>initial</c>, <c>queueLag</c>, <c>heartbeat</c>, <c>stateChanged</c>,
/// <c>segmentProgress</c>, <c>deltaSummary</c>, <c>notFound</c> and <c>completed</c>.
/// </remarks>
internal sealed class RunStreamCoordinator : IRunStreamCoordinator
{
    private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web);

    private readonly IRunRepository _runRepository;
    private readonly IQueueLagSummaryProvider _queueLagProvider;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<RunStreamCoordinator> _logger;
    private readonly RunStreamOptions _options;

    /// <summary>
    /// Creates the coordinator. <paramref name="timeProvider"/> falls back to
    /// <see cref="TimeProvider.System"/> when null; the options are validated eagerly.
    /// </summary>
    /// <exception cref="ArgumentNullException">A required dependency is null.</exception>
    public RunStreamCoordinator(
        IRunRepository runRepository,
        IQueueLagSummaryProvider queueLagProvider,
        IOptions<RunStreamOptions> options,
        TimeProvider? timeProvider,
        ILogger<RunStreamCoordinator> logger)
    {
        _runRepository = runRepository ?? throw new ArgumentNullException(nameof(runRepository));
        _queueLagProvider = queueLagProvider ?? throw new ArgumentNullException(nameof(queueLagProvider));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _options = (options ?? throw new ArgumentNullException(nameof(options))).Value.Validate();
    }

    /// <summary>
    /// Writes the SSE preamble and initial snapshot, then polls the run until it reaches a terminal
    /// state, disappears, or <paramref name="cancellationToken"/> is cancelled.
    /// </summary>
    /// <param name="context">HTTP context whose response receives the event stream.</param>
    /// <param name="tenantId">Tenant used when re-reading the run.</param>
    /// <param name="initialRun">Snapshot sent as the <c>initial</c> event.</param>
    /// <param name="cancellationToken">Stops the stream; cancellation is logged and swallowed inside the loop.</param>
    /// <exception cref="ArgumentNullException"><paramref name="context"/> or <paramref name="initialRun"/> is null.</exception>
    public async Task StreamAsync(HttpContext context, string tenantId, Run initialRun, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(context);
        ArgumentNullException.ThrowIfNull(initialRun);

        var response = context.Response;
        ConfigureSseHeaders(response);
        await SseWriter.WriteRetryAsync(response, _options.ReconnectDelay, cancellationToken).ConfigureAwait(false);

        var lastRun = initialRun;
        await SseWriter.WriteEventAsync(response, "initial", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
        await SseWriter.WriteEventAsync(response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
        await SseWriter.WriteEventAsync(response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);

        // Nothing to poll for when the run has already finished.
        if (RunStateMachine.IsTerminal(lastRun.State))
        {
            await SseWriter.WriteEventAsync(response, "completed", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
            return;
        }

        using var pollTimer = new PeriodicTimer(_options.PollInterval);
        using var queueTimer = new PeriodicTimer(_options.QueueLagInterval);
        using var heartbeatTimer = new PeriodicTimer(_options.HeartbeatInterval);

        // PeriodicTimer permits only ONE outstanding WaitForNextTickAsync call per timer; a second
        // call while the first is pending throws InvalidOperationException. The pending waits are
        // therefore kept across iterations and only the timer that just fired is re-armed.
        Task<bool>? pollTask = null;
        Task<bool>? queueTask = null;
        Task<bool>? heartbeatTask = null;

        try
        {
            while (!cancellationToken.IsCancellationRequested)
            {
                pollTask ??= pollTimer.WaitForNextTickAsync(cancellationToken).AsTask();
                queueTask ??= queueTimer.WaitForNextTickAsync(cancellationToken).AsTask();
                heartbeatTask ??= heartbeatTimer.WaitForNextTickAsync(cancellationToken).AsTask();

                var completed = await Task.WhenAny(pollTask, queueTask, heartbeatTask).ConfigureAwait(false);

                if (completed == pollTask)
                {
                    pollTask = null; // consumed; re-armed at the top of the next iteration
                    if (!await completed.ConfigureAwait(false))
                    {
                        continue;
                    }

                    var current = await _runRepository.GetAsync(tenantId, lastRun.Id, cancellationToken: cancellationToken).ConfigureAwait(false);
                    if (current is null)
                    {
                        _logger.LogWarning("Run {RunId} disappeared while streaming; signalling notFound event.", lastRun.Id);
                        await SseWriter.WriteEventAsync(response, "notFound", new RunNotFoundPayload(lastRun.Id), SerializerOptions, cancellationToken).ConfigureAwait(false);
                        break;
                    }

                    await EmitRunDifferencesAsync(response, lastRun, current, cancellationToken).ConfigureAwait(false);
                    lastRun = current;

                    if (RunStateMachine.IsTerminal(lastRun.State))
                    {
                        await SseWriter.WriteEventAsync(response, "completed", RunSnapshotPayload.From(lastRun), SerializerOptions, cancellationToken).ConfigureAwait(false);
                        break;
                    }
                }
                else if (completed == queueTask)
                {
                    queueTask = null;
                    if (await completed.ConfigureAwait(false))
                    {
                        await SseWriter.WriteEventAsync(response, "queueLag", _queueLagProvider.Capture(), SerializerOptions, cancellationToken).ConfigureAwait(false);
                    }
                }
                else
                {
                    heartbeatTask = null;
                    if (await completed.ConfigureAwait(false))
                    {
                        await SseWriter.WriteEventAsync(response, "heartbeat", HeartbeatPayload.Create(_timeProvider.GetUtcNow()), SerializerOptions, cancellationToken).ConfigureAwait(false);
                    }
                }
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            _logger.LogDebug("Run stream cancelled for run {RunId}.", lastRun.Id);
        }
    }

    /// <summary>Sets the status code, caching, buffering and content-type headers for an SSE response.</summary>
    private static void ConfigureSseHeaders(HttpResponse response)
    {
        response.StatusCode = StatusCodes.Status200OK;
        response.Headers.CacheControl = "no-store";
        response.Headers["X-Accel-Buffering"] = "no";
        response.Headers["Connection"] = "keep-alive";
        response.ContentType = "text/event-stream";
    }

    /// <summary>
    /// Compares two snapshots of the same run and writes one event per kind of change:
    /// <c>stateChanged</c> (state, start/finish time or error), <c>segmentProgress</c> (stats) and
    /// <c>deltaSummary</c> (deltas).
    /// </summary>
    private async Task EmitRunDifferencesAsync(HttpResponse response, Run previous, Run current, CancellationToken cancellationToken)
    {
        var stateChanged = current.State != previous.State
            || current.StartedAt != previous.StartedAt
            || current.FinishedAt != previous.FinishedAt
            || !string.Equals(current.Error, previous.Error, StringComparison.Ordinal);
        if (stateChanged)
        {
            await SseWriter.WriteEventAsync(response, "stateChanged", RunStateChangedPayload.From(current), SerializerOptions, cancellationToken).ConfigureAwait(false);
        }

        if (!ReferenceEquals(current.Stats, previous.Stats) && current.Stats != previous.Stats)
        {
            await SseWriter.WriteEventAsync(response, "segmentProgress", RunStatsPayload.From(current), SerializerOptions, cancellationToken).ConfigureAwait(false);
        }

        if (!current.Deltas.SequenceEqual(previous.Deltas))
        {
            await SseWriter.WriteEventAsync(response, "deltaSummary", new RunDeltaPayload(current.Id, current.Deltas), SerializerOptions, cancellationToken).ConfigureAwait(false);
        }
    }

    /// <summary>Payload of the <c>initial</c> and <c>completed</c> events: the full run.</summary>
    private sealed record RunSnapshotPayload(
        [property: JsonPropertyName("run")] Run Run)
    {
        public static RunSnapshotPayload From(Run run)
            => new(run);
    }

    /// <summary>Payload of the <c>stateChanged</c> event; the state is emitted as a lower-case string.</summary>
    private sealed record RunStateChangedPayload(
        [property: JsonPropertyName("runId")] string RunId,
        [property: JsonPropertyName("state")] string State,
        [property: JsonPropertyName("startedAt")] DateTimeOffset? StartedAt,
        [property: JsonPropertyName("finishedAt")] DateTimeOffset? FinishedAt,
        [property: JsonPropertyName("error")] string? Error)
    {
        public static RunStateChangedPayload From(Run run)
            => new(
                run.Id,
                run.State.ToString().ToLowerInvariant(),
                run.StartedAt,
                run.FinishedAt,
                run.Error);
    }

    /// <summary>Payload of the <c>segmentProgress</c> event.</summary>
    private sealed record RunStatsPayload(
        [property: JsonPropertyName("runId")] string RunId,
        [property: JsonPropertyName("stats")] RunStats Stats)
    {
        public static RunStatsPayload From(Run run)
            => new(run.Id, run.Stats);
    }

    /// <summary>Payload of the <c>deltaSummary</c> event.</summary>
    private sealed record RunDeltaPayload(
        [property: JsonPropertyName("runId")] string RunId,
        [property: JsonPropertyName("deltas")] ImmutableArray<DeltaSummary> Deltas);

    /// <summary>Payload of the <c>heartbeat</c> event.</summary>
    private sealed record HeartbeatPayload(
        [property: JsonPropertyName("ts")] DateTimeOffset Timestamp)
    {
        public static HeartbeatPayload Create(DateTimeOffset timestamp)
            => new(timestamp);
    }

    /// <summary>Payload of the <c>notFound</c> event.</summary>
    private sealed record RunNotFoundPayload(
        [property: JsonPropertyName("runId")] string RunId);
}
|
||||
|
||||
/// <summary>
/// Timing settings for the run event stream: how often the run is polled, how often queue lag and
/// heartbeat events are sent, and the reconnect delay advertised to clients.
/// </summary>
internal sealed class RunStreamOptions
{
    private static readonly TimeSpan MinimumInterval = TimeSpan.FromMilliseconds(100);
    private static readonly TimeSpan MinimumReconnectDelay = TimeSpan.FromMilliseconds(500);

    /// <summary>Interval between run polls. Defaults to 2 seconds.</summary>
    public TimeSpan PollInterval { get; set; } = TimeSpan.FromSeconds(2);

    /// <summary>Interval between queue lag events. Defaults to 10 seconds.</summary>
    public TimeSpan QueueLagInterval { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>Interval between heartbeat events. Defaults to 5 seconds.</summary>
    public TimeSpan HeartbeatInterval { get; set; } = TimeSpan.FromSeconds(5);

    /// <summary>Reconnect delay sent to clients. Defaults to 5 seconds.</summary>
    public TimeSpan ReconnectDelay { get; set; } = TimeSpan.FromSeconds(5);

    /// <summary>
    /// Checks every setting against its minimum, in declaration order.
    /// </summary>
    /// <returns>This instance, so the call can be chained.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// The first setting found below its minimum (100ms for intervals, 500ms for the reconnect delay).
    /// </exception>
    public RunStreamOptions Validate()
    {
        RequireAtLeast(PollInterval, MinimumInterval, nameof(PollInterval), "Poll interval must be at least 100ms.");
        RequireAtLeast(QueueLagInterval, MinimumInterval, nameof(QueueLagInterval), "Queue lag interval must be at least 100ms.");
        RequireAtLeast(HeartbeatInterval, MinimumInterval, nameof(HeartbeatInterval), "Heartbeat interval must be at least 100ms.");
        RequireAtLeast(ReconnectDelay, MinimumReconnectDelay, nameof(ReconnectDelay), "Reconnect delay must be at least 500ms.");

        return this;
    }

    // Throws when `actual` is below `minimum`, reporting the offending property and value.
    private static void RequireAtLeast(TimeSpan actual, TimeSpan minimum, string propertyName, string message)
    {
        if (actual < minimum)
        {
            throw new ArgumentOutOfRangeException(propertyName, actual, message);
        }
    }
}
|
||||
@@ -0,0 +1,45 @@
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Text.Json;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.AspNetCore.Http;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
/// <summary>
/// Helpers that write server-sent-event fields to an <see cref="HttpResponse"/> and flush after
/// each complete message.
/// </summary>
internal static class SseWriter
{
    /// <summary>
    /// Writes a <c>retry:</c> field carrying the reconnect delay in whole milliseconds, clamped to
    /// the range 1..<see cref="int.MaxValue"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="response"/> is null.</exception>
    public static async Task WriteRetryAsync(HttpResponse response, TimeSpan reconnectDelay, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(response);

        var clampedDelay = Math.Clamp(reconnectDelay.TotalMilliseconds, 1, int.MaxValue);
        var retryField = $"retry: {(int)clampedDelay}\r\n\r\n";

        await response.WriteAsync(retryField, cancellationToken).ConfigureAwait(false);
        await response.Body.FlushAsync(cancellationToken).ConfigureAwait(false);
    }

    /// <summary>
    /// Writes one event: an <c>event:</c> line, the JSON-serialized payload as one <c>data:</c> line
    /// per line of JSON text, and a terminating blank line.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="response"/>, <paramref name="payload"/> or <paramref name="serializerOptions"/> is null.
    /// </exception>
    /// <exception cref="ArgumentException"><paramref name="eventName"/> is null, empty or whitespace.</exception>
    public static async Task WriteEventAsync(HttpResponse response, string eventName, object payload, JsonSerializerOptions serializerOptions, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(response);
        ArgumentNullException.ThrowIfNull(payload);
        ArgumentNullException.ThrowIfNull(serializerOptions);

        if (string.IsNullOrWhiteSpace(eventName))
        {
            throw new ArgumentException("Event name must be provided.", nameof(eventName));
        }

        await response.WriteAsync($"event: {eventName}\r\n", cancellationToken).ConfigureAwait(false);

        // Each line of the serialized payload needs its own "data:" prefix.
        using var jsonLines = new StringReader(JsonSerializer.Serialize(payload, serializerOptions));
        for (var current = jsonLines.ReadLine(); current is not null; current = jsonLines.ReadLine())
        {
            await response.WriteAsync($"data: {current}\r\n", cancellationToken).ConfigureAwait(false);
        }

        await response.WriteAsync("\r\n", cancellationToken).ConfigureAwait(false);
        await response.Body.FlushAsync(cancellationToken).ConfigureAwait(false);
    }
}
|
||||
@@ -1,7 +1,9 @@
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Globalization;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Services;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using System.Globalization;
|
||||
using System.Text;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Services;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService;
|
||||
|
||||
@@ -91,11 +93,11 @@ internal static class SchedulerEndpointHelpers
|
||||
return null;
|
||||
}
|
||||
|
||||
public static DateTimeOffset? TryParseDateTimeOffset(string? value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value))
|
||||
{
|
||||
return null;
|
||||
public static DateTimeOffset? TryParseDateTimeOffset(string? value)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(value))
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
if (DateTimeOffset.TryParse(value, CultureInfo.InvariantCulture, DateTimeStyles.AssumeUniversal, out var parsed))
|
||||
@@ -114,14 +116,62 @@ internal static class SchedulerEndpointHelpers
|
||||
throw new ArgumentException("Tenant identifier must be provided.", nameof(tenantId));
|
||||
}
|
||||
|
||||
return new Selector(
|
||||
selection.Scope,
|
||||
tenantId,
|
||||
selection.Namespaces,
|
||||
selection.Repositories,
|
||||
selection.Digests,
|
||||
selection.IncludeTags,
|
||||
selection.Labels,
|
||||
selection.ResolvesTags);
|
||||
}
|
||||
}
|
||||
return new Selector(
|
||||
selection.Scope,
|
||||
tenantId,
|
||||
selection.Namespaces,
|
||||
selection.Repositories,
|
||||
selection.Digests,
|
||||
selection.IncludeTags,
|
||||
selection.Labels,
|
||||
selection.ResolvesTags);
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds a paging cursor for <paramref name="run"/>: the Base64 encoding of the UTF-8 text
/// <c>{CreatedAt in UTC, round-trip "O" format}|{Id}</c>.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="run"/> is null.</exception>
public static string CreateRunCursor(Run run)
{
    ArgumentNullException.ThrowIfNull(run);

    var createdAtUtc = run.CreatedAt.ToUniversalTime();
    var rawBytes = Encoding.UTF8.GetBytes($"{createdAtUtc:O}|{run.Id}");
    return Convert.ToBase64String(rawBytes);
}
|
||||
|
||||
/// <summary>
/// Decodes a cursor in the format written by <c>CreateRunCursor</c>
/// (Base64 of <c>timestamp|runId</c>).
/// </summary>
/// <param name="value">Raw cursor text; null, empty or whitespace means "no cursor".</param>
/// <returns>The parsed cursor with its timestamp converted to UTC, or null when no cursor was supplied.</returns>
/// <exception cref="ValidationException">
/// The value is not valid Base64, does not split into two parts on <c>|</c>, has an unparsable
/// timestamp, or has a blank run id.
/// </exception>
public static RunListCursor? TryParseRunCursor(string? value)
{
    var candidate = value?.Trim();
    if (string.IsNullOrEmpty(candidate))
    {
        return null;
    }

    // Every malformed-cursor path reports the same message, echoing the untrimmed input.
    ValidationException Invalid(Exception? inner = null)
        => inner is null
            ? new ValidationException($"Cursor '{value}' is not valid.")
            : new ValidationException($"Cursor '{value}' is not valid.", inner);

    try
    {
        var text = Encoding.UTF8.GetString(Convert.FromBase64String(candidate));
        var segments = text.Split('|', 2, StringSplitOptions.TrimEntries);

        if (segments.Length != 2)
        {
            throw Invalid();
        }

        var timestampParsed = DateTimeOffset.TryParse(
            segments[0],
            CultureInfo.InvariantCulture,
            DateTimeStyles.AssumeUniversal,
            out var createdAt);
        if (!timestampParsed)
        {
            throw Invalid();
        }

        if (string.IsNullOrWhiteSpace(segments[1]))
        {
            throw Invalid();
        }

        return new RunListCursor(createdAt.ToUniversalTime(), segments[1]);
    }
    catch (FormatException ex)
    {
        // Raised by Convert.FromBase64String for input that is not Base64.
        throw Invalid(ex);
    }
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Models/StellaOps.Scheduler.Models.csproj" />
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Storage.Mongo/StellaOps.Scheduler.Storage.Mongo.csproj" />
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.ImpactIndex/StellaOps.Scheduler.ImpactIndex.csproj" />
|
||||
<ProjectReference Include="../__Libraries/StellaOps.Scheduler.Queue/StellaOps.Scheduler.Queue.csproj" />
|
||||
<ProjectReference Include="../../__Libraries/StellaOps.Plugin/StellaOps.Plugin.csproj" />
|
||||
<ProjectReference Include="../../Authority/StellaOps.Authority/StellaOps.Auth.Abstractions/StellaOps.Auth.Abstractions.csproj" />
|
||||
<ProjectReference Include="../../Authority/StellaOps.Authority/StellaOps.Auth.ServerIntegration/StellaOps.Auth.ServerIntegration.csproj" />
|
||||
|
||||
@@ -22,13 +22,13 @@
|
||||
## StellaOps Console (Sprint 23)
|
||||
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|
||||
|----|--------|----------|------------|-------------|---------------|
|
||||
| SCHED-CONSOLE-23-001 | TODO | Scheduler WebService Guild, BE-Base Platform Guild | SCHED-WEB-16-103, SCHED-WEB-20-001 | Extend runs APIs with live progress SSE endpoints (`/console/runs/{id}/stream`), queue lag summaries, diff metadata fetch, retry/cancel hooks with RBAC enforcement, and deterministic pagination for history views consumed by Console. | SSE emits heartbeats/backoff headers, progress payload schema documented, unauthorized actions blocked in integration tests, metrics/logs expose queue lag + correlation IDs. |
|
||||
| SCHED-CONSOLE-23-001 | DONE (2025-11-03) | Scheduler WebService Guild, BE-Base Platform Guild | SCHED-WEB-16-103, SCHED-WEB-20-001 | Extend runs APIs with live progress SSE endpoints (`/console/runs/{id}/stream`), queue lag summaries, diff metadata fetch, retry/cancel hooks with RBAC enforcement, and deterministic pagination for history views consumed by Console. | SSE emits heartbeats/backoff headers, progress payload schema documented, unauthorized actions blocked in integration tests, metrics/logs expose queue lag + correlation IDs. |
|
||||
|
||||
## Policy Studio (Sprint 27)
|
||||
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|
||||
|----|--------|----------|------------|-------------|---------------|
|
||||
| SCHED-CONSOLE-27-001 | TODO | Scheduler WebService Guild, Policy Registry Guild | SCHED-WEB-16-103, REGISTRY-API-27-005 | Provide policy batch simulation orchestration endpoints (`/policies/simulations` POST/GET) exposing run creation, shard status, SSE progress, cancellation, and retries with RBAC enforcement. | API handles shard lifecycle with SSE heartbeats + retry headers; unauthorized requests rejected; integration tests cover submit/cancel/resume flows. |
|
||||
| SCHED-CONSOLE-27-002 | TODO | Scheduler WebService Guild, Observability Guild | SCHED-CONSOLE-27-001 | Emit telemetry endpoints/metrics (`policy_simulation_queue_depth`, `policy_simulation_latency`) and webhook callbacks for completion/failure consumed by Registry. | Metrics exposed via gateway, dashboards seeded, webhook contract documented, integration tests validate metrics emission. |
|
||||
| SCHED-CONSOLE-27-001 | DONE (2025-11-03) | Scheduler WebService Guild, Policy Registry Guild | SCHED-WEB-16-103, REGISTRY-API-27-005 | Provide policy batch simulation orchestration endpoints (`/policies/simulations` POST/GET) exposing run creation, shard status, SSE progress, cancellation, and retries with RBAC enforcement. | API handles shard lifecycle with SSE heartbeats + retry headers; unauthorized requests rejected; integration tests cover submit/cancel/resume flows. |
|
||||
| SCHED-CONSOLE-27-002 | DOING (2025-11-03) | Scheduler WebService Guild, Observability Guild | SCHED-CONSOLE-27-001 | Emit telemetry endpoints/metrics (`policy_simulation_queue_depth`, `policy_simulation_latency`) and webhook callbacks for completion/failure consumed by Registry. | Metrics exposed via gateway, dashboards seeded, webhook contract documented, integration tests validate metrics emission. |
|
||||
|
||||
## Vulnerability Explorer (Sprint 29)
|
||||
| ID | Status | Owner(s) | Depends on | Description | Exit Criteria |
|
||||
|
||||
@@ -6,11 +6,21 @@
|
||||
|
||||
| Method | Path | Description | Scopes |
|
||||
| ------ | ---- | ----------- | ------ |
|
||||
| `GET` | `/api/v1/scheduler/runs` | List runs for the current tenant (filter by schedule, state, createdAfter). | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}` | Retrieve run details. | `scheduler.runs.read` |
|
||||
| `POST` | `/api/v1/scheduler/runs` | Create an ad-hoc run bound to an existing schedule. | `scheduler.runs.write` |
|
||||
| `POST` | `/api/v1/scheduler/runs/{runId}/cancel` | Transition a run to `cancelled` when still in a non-terminal state. | `scheduler.runs.write` |
|
||||
| `POST` | `/api/v1/scheduler/runs/preview` | Resolve impacted images using the ImpactIndex without enqueuing work. | `scheduler.runs.preview` |
|
||||
| `GET` | `/api/v1/scheduler/runs` | List runs for the current tenant (filter by `scheduleId`, `state`, `createdAfter`; page with `limit` and `cursor`; order with `sort`). | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}` | Retrieve run details. | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}/deltas` | Fetch deterministic delta metadata for the specified run. | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/queue/lag` | Snapshot queue depth per transport/queue for console dashboards. | `scheduler.runs.read` |
|
||||
| `GET` | `/api/v1/scheduler/runs/{runId}/stream` | Server-sent events (SSE) stream for live progress, queue lag, and heartbeats. | `scheduler.runs.read` |
|
||||
| `POST` | `/api/v1/scheduler/runs` | Create an ad-hoc run bound to an existing schedule. | `scheduler.runs.write` |
|
||||
| `POST` | `/api/v1/scheduler/runs/{runId}/cancel` | Transition a run to `cancelled` when still in a non-terminal state. | `scheduler.runs.manage` |
|
||||
| `POST` | `/api/v1/scheduler/runs/{runId}/retry` | Clone a terminal run into a new manual retry, preserving provenance. | `scheduler.runs.manage` |
|
||||
| `POST` | `/api/v1/scheduler/runs/preview` | Resolve impacted images using the ImpactIndex without enqueuing work. | `scheduler.runs.preview` |
|
||||
| `GET` | `/api/v1/scheduler/policies/simulations` | List policy simulations for the current tenant (filters: policyId, status, since, limit). | `policy:simulate` |
|
||||
| `GET` | `/api/v1/scheduler/policies/simulations/{simulationId}` | Retrieve simulation status snapshot. | `policy:simulate` |
|
||||
| `GET` | `/api/v1/scheduler/policies/simulations/{simulationId}/stream` | SSE stream emitting simulation status, queue lag, and heartbeats. | `policy:simulate` |
|
||||
| `POST` | `/api/v1/scheduler/policies/simulations` | Enqueue a policy simulation (mode=`simulate`) with optional SBOM inputs and metadata. | `policy:simulate` |
|
||||
| `POST` | `/api/v1/scheduler/policies/simulations/{simulationId}/cancel` | Request cancellation for an in-flight simulation. | `policy:simulate` |
|
||||
| `POST` | `/api/v1/scheduler/policies/simulations/{simulationId}/retry` | Clone a terminal simulation into a new run preserving inputs/metadata. | `policy:simulate` |
|
||||
|
||||
All endpoints require a tenant context (`X-Tenant-Id`) and the appropriate scheduler scopes. Development mode allows header-based auth; production deployments must rely on Authority-issued tokens (OpTok + DPoP).
|
||||
|
||||
@@ -70,12 +80,12 @@ GET /api/v1/scheduler/runs?scheduleId=sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234&state
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"runs": [
|
||||
{
|
||||
"schemaVersion": "scheduler.run@1",
|
||||
"id": "run_c7b4e9d2f6a04f8784a40476d8a2f771",
|
||||
"tenantId": "tenant-alpha",
|
||||
{
|
||||
"runs": [
|
||||
{
|
||||
"schemaVersion": "scheduler.run@1",
|
||||
"id": "run_c7b4e9d2f6a04f8784a40476d8a2f771",
|
||||
"tenantId": "tenant-alpha",
|
||||
"scheduleId": "sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234",
|
||||
"trigger": "manual",
|
||||
"state": "planning",
|
||||
@@ -93,11 +103,13 @@ GET /api/v1/scheduler/runs?scheduleId=sch_4f2c7d9e0a2b4c64a0e7b5f9d65c1234&state
|
||||
"reason": {
|
||||
"manualReason": "Nightly backfill"
|
||||
},
|
||||
"createdAt": "2025-10-26T03:12:45Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
"createdAt": "2025-10-26T03:12:45Z"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
When additional pages are available the response includes `"nextCursor": "<base64>"`. Clients pass this cursor via `?cursor=` to fetch the next deterministic slice (ordering = `createdAt desc, id desc`).
|
||||
|
||||
## Cancel Run
|
||||
|
||||
@@ -136,7 +148,33 @@ POST /api/v1/scheduler/runs/run_c7b4e9d2f6a04f8784a40476d8a2f771/cancel
|
||||
|
||||
## Impact Preview
|
||||
|
||||
`/api/v1/scheduler/runs/preview` resolves impacted images via the ImpactIndex without mutating state. When `scheduleId` is provided the schedule selector is reused; callers may alternatively supply an explicit selector.
|
||||
`/api/v1/scheduler/runs/preview` resolves impacted images via the ImpactIndex without mutating state. When `scheduleId` is provided the schedule selector is reused; callers may alternatively supply an explicit selector.
|
||||
|
||||
## Retry Run
|
||||
|
||||
`POST /api/v1/scheduler/runs/{runId}/retry` clones a terminal run into a new manual run with `retryOf` pointing to the original identifier. Retry is scope-gated with `scheduler.runs.manage`; the new run’s `reason.manualReason` gains a `retry-of:<runId>` suffix for provenance.
|
||||
|
||||
## Run deltas
|
||||
|
||||
`GET /api/v1/scheduler/runs/{runId}/deltas` returns an immutable, deterministically sorted array of delta summaries (`[imageDigest, severity slices, KEV hits, attestations]`).
|
||||
|
||||
## Queue lag snapshot
|
||||
|
||||
`GET /api/v1/scheduler/runs/queue/lag` exposes queue depth summaries for planner/runner transports. The payload includes `capturedAt`, `totalDepth`, `maxDepth`, and ordered queue entries (transport + queue + depth). Console uses this for backlog dashboards and alert thresholds.
|
||||
|
||||
## Live stream (SSE)
|
||||
|
||||
`GET /api/v1/scheduler/runs/{runId}/stream` emits server-sent events for:
|
||||
|
||||
- `initial` — full run snapshot
|
||||
- `stateChanged` — state/started/finished transitions
|
||||
- `segmentProgress` — stats updates
|
||||
- `deltaSummary` — deltas available
|
||||
- `queueLag` — periodic queue snapshots
|
||||
- `heartbeat` — uptime keep-alive (default 5s)
|
||||
- `completed` — terminal summary
|
||||
|
||||
The stream is tolerant to clients reconnecting (idempotent payloads, deterministic ordering) and honours tenant scope plus cancellation tokens.
|
||||
|
||||
```http
|
||||
POST /api/v1/scheduler/runs/preview
|
||||
@@ -178,6 +216,106 @@ POST /api/v1/scheduler/runs/preview
|
||||
|
||||
### Integration notes
|
||||
|
||||
* Run creation and cancellation produce audit entries under category `scheduler.run` with correlation metadata when provided.
|
||||
* The preview endpoint relies on the ImpactIndex stub in development. Production deployments must register the concrete index implementation before use.
|
||||
* Planner/worker orchestration tasks will wire run creation to queueing in SCHED-WORKER-16-201/202.
|
||||
* Run creation and cancellation produce audit entries under category `scheduler.run` with correlation metadata when provided.
|
||||
* The preview endpoint relies on the ImpactIndex stub in development. Production deployments must register the concrete index implementation before use.
|
||||
* Planner/worker orchestration tasks will wire run creation to queueing in SCHED-WORKER-16-201/202.
|
||||
|
||||
## Policy simulations
|
||||
|
||||
The policy simulation APIs mirror the run endpoints but operate on policy-mode jobs (`mode=simulate`) scoped by tenant and RBAC (`policy:simulate`).
|
||||
|
||||
### Create simulation
|
||||
|
||||
```http
|
||||
POST /api/v1/scheduler/policies/simulations
|
||||
X-Tenant-Id: tenant-alpha
|
||||
Authorization: Bearer <OpTok>
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"policyId": "P-7",
|
||||
"policyVersion": 4,
|
||||
"priority": "normal",
|
||||
"metadata": {
|
||||
"source": "console.review"
|
||||
},
|
||||
"inputs": {
|
||||
"sbomSet": ["sbom:S-318", "sbom:S-42"],
|
||||
"captureExplain": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
```json
|
||||
HTTP/1.1 201 Created
|
||||
Location: /api/v1/scheduler/policies/simulations/run:P-7:20251103T153000Z:e4d1a9b2
|
||||
{
|
||||
"simulation": {
|
||||
"schemaVersion": "scheduler.policy-run-status@1",
|
||||
"runId": "run:P-7:20251103T153000Z:e4d1a9b2",
|
||||
"tenantId": "tenant-alpha",
|
||||
"policyId": "P-7",
|
||||
"policyVersion": 4,
|
||||
"mode": "simulate",
|
||||
"status": "queued",
|
||||
"priority": "normal",
|
||||
"queuedAt": "2025-11-03T15:30:00Z",
|
||||
"stats": {
|
||||
"components": 0,
|
||||
"rulesFired": 0,
|
||||
"findingsWritten": 0,
|
||||
"vexOverrides": 0
|
||||
},
|
||||
"inputs": {
|
||||
"sbomSet": ["sbom:S-318", "sbom:S-42"],
|
||||
"captureExplain": true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Canonical payload lives in `samples/api/scheduler/policy-simulation-status.json`.
|
||||
|
||||
### List and fetch simulations
|
||||
|
||||
- `GET /api/v1/scheduler/policies/simulations?policyId=P-7&status=queued&limit=25`
|
||||
- `GET /api/v1/scheduler/policies/simulations/{simulationId}`
|
||||
|
||||
The response envelope mirrors `policy-run-status` but uses `simulations` / `simulation` wrappers. All metadata keys are lower-case; retries append `retry-of=<priorRunId>` for provenance.
|
||||
|
||||
### Cancel and retry
|
||||
|
||||
- `POST /api/v1/scheduler/policies/simulations/{simulationId}/cancel`
|
||||
- Marks the job as `cancellationRequested` and surfaces the reason. Worker execution honours this flag before leasing.
|
||||
- `POST /api/v1/scheduler/policies/simulations/{simulationId}/retry`
|
||||
- Clones a terminal simulation, preserving inputs/metadata and adding `metadata.retry-of` pointing to the original run ID. Returns `409 Conflict` when the simulation is not terminal.
|
||||
|
||||
### Live stream (SSE)
|
||||
|
||||
`GET /api/v1/scheduler/policies/simulations/{simulationId}/stream` emits:
|
||||
|
||||
- `retry` — reconnection hint (milliseconds) emitted before events.
|
||||
- `initial` — current simulation snapshot.
|
||||
- `status` — status/attempt/stat updates.
|
||||
- `queueLag` — periodic queue depth summary (shares payload with run streams).
|
||||
- `heartbeat` — keep-alive ping (default 5s; configurable under `Scheduler:RunStream`).
|
||||
- `completed` — terminal summary (`succeeded`, `failed`, or `cancelled`).
|
||||
- `notFound` — emitted if the run record disappears while streaming.
|
||||
|
||||
Heartbeats, queue lag summaries, and the reconnection directive are sent immediately after connection so Console clients receive deterministic telemetry when loading a simulation workspace.
|
||||
|
||||
### Metrics
|
||||
|
||||
```
|
||||
GET /api/v1/scheduler/policies/simulations/metrics
|
||||
X-Tenant-Id: tenant-alpha
|
||||
Authorization: Bearer <OpTok>
|
||||
```
|
||||
|
||||
Returns queue depth and latency summaries tailored for simulation dashboards and alerting. Response properties align with the metric names exposed via OTEL (`policy_simulation_queue_depth`, `policy_simulation_latency`). Canonical payload lives at `samples/api/scheduler/policy-simulation-metrics.json`.
|
||||
|
||||
- `policy_simulation_queue_depth.total` — pending simulation jobs (aggregate of `pending`, `dispatching`, `submitted`).
|
||||
- `policy_simulation_latency.*` — latency percentiles (seconds) computed from the most recent terminal simulations.
|
||||
|
||||
> **Note:** When Mongo storage is not configured the metrics provider is disabled and the endpoint responds with `501 Not Implemented`.
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
|
||||
## Policy simulations
|
||||
|
||||
`/api/v1/scheduler/policies/simulations` orchestrates Policy Engine runs in `simulate` mode without mutating persisted findings.
|
||||
|
||||
- **Create** — `POST /api/v1/scheduler/policies/simulations` (scope `policy:simulate`) enqueues a simulation for `policyId`/`policyVersion`, respecting optional `metadata` and structured `inputs` (`sbomSet`, `advisoryCursor`, `vexCursor`, `captureExplain`). Returns `201 Created` with `simulation.runId` and status `queued`.
|
||||
- **List/Get** — `GET /api/v1/scheduler/policies/simulations` and `/.../{simulationId}` expose `PolicyRunStatus` documents filtered to `mode=simulate`, including attempt counts, stats, and cancellation markers.
|
||||
- **Cancel** — `POST /.../{simulationId}/cancel` records `cancellationRequested=true` (optional reason, timestamp) and immediately reflects the updated status; workers honour the flag on the next lease cycle.
|
||||
- **Retry** — `POST /.../{simulationId}/retry` clones a terminal simulation (cancelled/failed/succeeded) into a fresh job preserving inputs/metadata. Non-terminal runs yield `409 Conflict`.
|
||||
- **Stream** — `GET /.../{simulationId}/stream` emits SSE events (`initial`, `status`, `queueLag`, `heartbeat`, `completed`) with the latest `PolicyRunStatus`, enabling Console to render shard progress and cancellation state in real time.
|
||||
|
||||
Simulation APIs share the same deterministic pagination/metadata contracts as policy runs and surface queue depth snapshots via the existing scheduler queue metrics.
|
||||
@@ -0,0 +1,78 @@
|
||||
# SCHED-CONSOLE-27-002 · Policy Simulation Telemetry & Webhooks
|
||||
|
||||
> Owners: Scheduler WebService Guild, Observability Guild
|
||||
> Scope: Policy simulation metrics endpoint and completion webhooks feeding Registry/Console integrations.
|
||||
|
||||
## 1. Metrics endpoint refresher
|
||||
|
||||
- `GET /api/v1/scheduler/policies/simulations/metrics` (scope: `policy:simulate`)
|
||||
- Returns queue depth grouped by status plus latency percentiles derived from the most recent sample window (default 200 terminal runs).
|
||||
- Surface area is unchanged from the implementation in Sprint 27 week 1; consumers should continue to rely on the contract in `samples/api/scheduler/policy-simulation-metrics.json`.
|
||||
- When backing storage is not Mongo the endpoint responds `501 Not Implemented`.
|
||||
|
||||
## 2. Completion webhooks
|
||||
|
||||
Scheduler Worker now emits policy simulation webhooks whenever a simulation reaches a terminal state (`succeeded`, `failed`, `cancelled`). Payloads are aligned with the SSE `completed` event shape and include idempotency headers so downstream systems can safely de-duplicate.
|
||||
|
||||
### 2.1 Configuration
|
||||
|
||||
```jsonc
|
||||
// scheduler-worker.appsettings.json
|
||||
{
|
||||
"Scheduler": {
|
||||
"Worker": {
|
||||
"Policy": {
|
||||
"Webhook": {
|
||||
"Enabled": true,
|
||||
"Endpoint": "https://registry.internal/hooks/policy-simulation",
|
||||
"ApiKeyHeader": "X-StellaOps-Webhook-Key",
|
||||
"ApiKey": "replace-me",
|
||||
"TimeoutSeconds": 10
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
- `Enabled`: feature flag; disabled by default to preserve air-gap behaviour.
|
||||
- `Endpoint`: absolute HTTPS endpoint; requests use `POST`.
|
||||
- `ApiKeyHeader`/`ApiKey`: optional header name and API key value sent with each request so Registry can verify the caller.
|
||||
- `TimeoutSeconds`: per-request timeout (defaults to 10s).
|
||||
|
||||
### 2.2 Headers
|
||||
|
||||
| Header | Purpose |
|
||||
|------------------------|---------------------------------------|
|
||||
| `X-StellaOps-Tenant` | Tenant identifier for the simulation. |
|
||||
| `X-StellaOps-Run-Id` | Stable run id (use as idempotency key). |
|
||||
| `X-StellaOps-Webhook-Key` | Optional API key as configured. |
|
||||
|
||||
### 2.3 Payload
|
||||
|
||||
See `samples/api/scheduler/policy-simulation-webhook.json` for a canonical example.
|
||||
|
||||
```json
|
||||
{
|
||||
"tenantId": "tenant-alpha",
|
||||
"simulation": { /* PolicyRunStatus document */ },
|
||||
"result": "failed",
|
||||
"observedAt": "2025-11-03T20:05:12Z",
|
||||
"latencySeconds": 14.287,
|
||||
"reason": "policy engine timeout"
|
||||
}
|
||||
```
|
||||
|
||||
- `result`: `succeeded`, `failed`, `cancelled`, `replay_pending`, `running`, or `queued`. Webhooks are emitted only for the three terminal values (`succeeded`, `failed`, `cancelled`).
|
||||
- `latencySeconds`: rounded to four decimal places; computed as `finishedAt - queuedAt`, substituting `observedAt` when `finishedAt` is absent. Negative durations are clamped to `0`, and the field is `null` when `queuedAt` is unset.
|
||||
- `reason`: surfaced for failures (`error`) and cancellations (`cancellationReason`); omitted otherwise.
|
||||
|
||||
### 2.4 Delivery semantics
|
||||
|
||||
- Best effort with no retry from the worker — Registry should use `X-StellaOps-Run-Id` for idempotency.
|
||||
- Failures emit WARN logs (prefix `Policy run job {JobId}`).
|
||||
- Disabled configuration short-circuits without network calls (debug log only).
|
||||
|
||||
## 3. SSE compatibility
|
||||
|
||||
No changes were required on the streaming endpoint (`GET /api/v1/scheduler/policies/simulations/{id}/stream`); Console continues to receive `completed` events containing the same `PolicyRunStatus` payload that the webhook publishes.
|
||||
@@ -265,13 +265,16 @@ public sealed record PolicyRunStatus
|
||||
int attempts = 0,
|
||||
string? traceId = null,
|
||||
string? explainUri = null,
|
||||
ImmutableSortedDictionary<string, string>? metadata = null,
|
||||
string? schemaVersion = null)
|
||||
: this(
|
||||
runId,
|
||||
tenantId,
|
||||
policyId,
|
||||
policyVersion,
|
||||
ImmutableSortedDictionary<string, string>? metadata = null,
|
||||
bool cancellationRequested = false,
|
||||
DateTimeOffset? cancellationRequestedAt = null,
|
||||
string? cancellationReason = null,
|
||||
string? schemaVersion = null)
|
||||
: this(
|
||||
runId,
|
||||
tenantId,
|
||||
policyId,
|
||||
policyVersion,
|
||||
mode,
|
||||
status,
|
||||
priority,
|
||||
@@ -282,16 +285,19 @@ public sealed record PolicyRunStatus
|
||||
inputs ?? PolicyRunInputs.Empty,
|
||||
determinismHash,
|
||||
Validation.TrimToNull(errorCode),
|
||||
Validation.TrimToNull(error),
|
||||
attempts,
|
||||
Validation.TrimToNull(traceId),
|
||||
Validation.TrimToNull(explainUri),
|
||||
metadata ?? ImmutableSortedDictionary<string, string>.Empty,
|
||||
schemaVersion)
|
||||
{
|
||||
}
|
||||
|
||||
[JsonConstructor]
|
||||
Validation.TrimToNull(error),
|
||||
attempts,
|
||||
Validation.TrimToNull(traceId),
|
||||
Validation.TrimToNull(explainUri),
|
||||
metadata ?? ImmutableSortedDictionary<string, string>.Empty,
|
||||
cancellationRequested,
|
||||
cancellationRequestedAt,
|
||||
cancellationReason,
|
||||
schemaVersion)
|
||||
{
|
||||
}
|
||||
|
||||
[JsonConstructor]
|
||||
public PolicyRunStatus(
|
||||
string runId,
|
||||
string tenantId,
|
||||
@@ -307,12 +313,15 @@ public sealed record PolicyRunStatus
|
||||
PolicyRunInputs inputs,
|
||||
string? determinismHash,
|
||||
string? errorCode,
|
||||
string? error,
|
||||
int attempts,
|
||||
string? traceId,
|
||||
string? explainUri,
|
||||
ImmutableSortedDictionary<string, string> metadata,
|
||||
string? schemaVersion = null)
|
||||
string? error,
|
||||
int attempts,
|
||||
string? traceId,
|
||||
string? explainUri,
|
||||
ImmutableSortedDictionary<string, string> metadata,
|
||||
bool cancellationRequested,
|
||||
DateTimeOffset? cancellationRequestedAt,
|
||||
string? cancellationReason,
|
||||
string? schemaVersion = null)
|
||||
{
|
||||
SchemaVersion = SchedulerSchemaVersions.EnsurePolicyRunStatus(schemaVersion);
|
||||
RunId = Validation.EnsureId(runId, nameof(runId));
|
||||
@@ -339,16 +348,19 @@ public sealed record PolicyRunStatus
|
||||
? throw new ArgumentOutOfRangeException(nameof(attempts), attempts, "Attempts must be non-negative.")
|
||||
: attempts;
|
||||
TraceId = Validation.TrimToNull(traceId);
|
||||
ExplainUri = Validation.TrimToNull(explainUri);
|
||||
Metadata = (metadata ?? ImmutableSortedDictionary<string, string>.Empty)
|
||||
.Select(static pair => new KeyValuePair<string, string>(
|
||||
Validation.TrimToNull(pair.Key)?.ToLowerInvariant() ?? string.Empty,
|
||||
Validation.TrimToNull(pair.Value) ?? string.Empty))
|
||||
.Where(static pair => !string.IsNullOrEmpty(pair.Key) && !string.IsNullOrEmpty(pair.Value))
|
||||
.DistinctBy(static pair => pair.Key, StringComparer.Ordinal)
|
||||
.OrderBy(static pair => pair.Key, StringComparer.Ordinal)
|
||||
.ToImmutableSortedDictionary(static pair => pair.Key, static pair => pair.Value, StringComparer.Ordinal);
|
||||
}
|
||||
ExplainUri = Validation.TrimToNull(explainUri);
|
||||
Metadata = (metadata ?? ImmutableSortedDictionary<string, string>.Empty)
|
||||
.Select(static pair => new KeyValuePair<string, string>(
|
||||
Validation.TrimToNull(pair.Key)?.ToLowerInvariant() ?? string.Empty,
|
||||
Validation.TrimToNull(pair.Value) ?? string.Empty))
|
||||
.Where(static pair => !string.IsNullOrEmpty(pair.Key) && !string.IsNullOrEmpty(pair.Value))
|
||||
.DistinctBy(static pair => pair.Key, StringComparer.Ordinal)
|
||||
.OrderBy(static pair => pair.Key, StringComparer.Ordinal)
|
||||
.ToImmutableSortedDictionary(static pair => pair.Key, static pair => pair.Value, StringComparer.Ordinal);
|
||||
CancellationRequested = cancellationRequested;
|
||||
CancellationRequestedAt = Validation.NormalizeTimestamp(cancellationRequestedAt);
|
||||
CancellationReason = Validation.TrimToNull(cancellationReason);
|
||||
}
|
||||
|
||||
public string SchemaVersion { get; }
|
||||
|
||||
@@ -392,13 +404,22 @@ public sealed record PolicyRunStatus
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
|
||||
public string? ExplainUri { get; init; }
|
||||
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
|
||||
public ImmutableSortedDictionary<string, string> Metadata { get; init; } = ImmutableSortedDictionary<string, string>.Empty;
|
||||
|
||||
public PolicyRunStats Stats { get; init; } = PolicyRunStats.Empty;
|
||||
|
||||
public PolicyRunInputs Inputs { get; init; } = PolicyRunInputs.Empty;
|
||||
}
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
|
||||
public ImmutableSortedDictionary<string, string> Metadata { get; init; } = ImmutableSortedDictionary<string, string>.Empty;
|
||||
|
||||
public PolicyRunStats Stats { get; init; } = PolicyRunStats.Empty;
|
||||
|
||||
public PolicyRunInputs Inputs { get; init; } = PolicyRunInputs.Empty;
|
||||
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
|
||||
public bool CancellationRequested { get; init; }
|
||||
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
|
||||
public DateTimeOffset? CancellationRequestedAt { get; init; }
|
||||
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
|
||||
public string? CancellationReason { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Aggregated metrics captured for a policy run.
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
using System;
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.Scheduler.Models;
|
||||
|
||||
/// <summary>
|
||||
/// Shared helper for translating persisted <see cref="PolicyRunJob"/> documents into
|
||||
/// API-facing <see cref="PolicyRunStatus"/> projections.
|
||||
/// </summary>
|
||||
public static class PolicyRunStatusFactory
{
    /// <summary>
    /// Projects a persisted policy run job onto the status document exposed by the API.
    /// </summary>
    /// <param name="job">Job to project; it must carry a policy version.</param>
    /// <param name="nowUtc">Caller-supplied timestamp; the current projection does not read it.</param>
    /// <returns>
    /// A status snapshot with empty stats and no determinism hash, error code, trace id or explain URI.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="job"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The job has no policy version.</exception>
    public static PolicyRunStatus Create(PolicyRunJob job, DateTimeOffset nowUtc)
    {
        ArgumentNullException.ThrowIfNull(job);

        var executionStatus = MapExecutionStatus(job.Status);
        var isPending = job.Status == PolicyRunJobStatus.Pending;
        var hasFailed = job.Status == PolicyRunJobStatus.Failed;

        // Fall back to the creation time when the job was never stamped as queued.
        var enqueuedAt = job.QueuedAt ?? job.CreatedAt;
        var submittedAt = job.SubmittedAt;

        // A cancelled job has no completion time, so the cancellation time closes the run instead.
        var endedAt = job.CompletedAt ?? job.CancelledAt;

        var runMetadata = job.Metadata ?? ImmutableSortedDictionary<string, string>.Empty;
        var runInputs = job.Inputs ?? PolicyRunInputs.Empty;

        if (job.PolicyVersion is not { } version)
        {
            throw new InvalidOperationException($"Policy run job '{job.Id}' is missing policyVersion.");
        }

        return new PolicyRunStatus(
            job.RunId ?? job.Id,
            job.TenantId,
            job.PolicyId,
            version,
            job.Mode,
            executionStatus,
            job.Priority,
            enqueuedAt,
            // Pending jobs never report a start time, even if one was recorded.
            isPending ? null : submittedAt,
            endedAt,
            PolicyRunStats.Empty,
            runInputs,
            determinismHash: null,
            errorCode: null,
            // The last error is only surfaced once the job has actually failed.
            error: hasFailed ? job.LastError : null,
            attempts: job.AttemptCount,
            traceId: null,
            explainUri: null,
            runMetadata,
            cancellationRequested: job.CancellationRequested,
            cancellationRequestedAt: job.CancellationRequestedAt,
            cancellationReason: job.CancellationReason,
            SchedulerSchemaVersions.PolicyRunStatus);
    }

    /// <summary>
    /// Translates the persisted job state into the execution status reported to API clients.
    /// Unrecognised states are reported as queued.
    /// </summary>
    private static PolicyRunExecutionStatus MapExecutionStatus(PolicyRunJobStatus status)
    {
        switch (status)
        {
            case PolicyRunJobStatus.Dispatching:
            case PolicyRunJobStatus.Submitted:
                return PolicyRunExecutionStatus.Running;

            case PolicyRunJobStatus.Completed:
                return PolicyRunExecutionStatus.Succeeded;

            case PolicyRunJobStatus.Failed:
                return PolicyRunExecutionStatus.Failed;

            case PolicyRunJobStatus.Cancelled:
                return PolicyRunExecutionStatus.Cancelled;

            case PolicyRunJobStatus.Pending:
            default:
                return PolicyRunExecutionStatus.Queued;
        }
    }
}
|
||||
@@ -0,0 +1,65 @@
|
||||
using System;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.Scheduler.Models;
|
||||
|
||||
/// <summary>
/// Body of the policy simulation webhook.
/// </summary>
/// <param name="TenantId">Tenant that owns the simulation; serialized as <c>tenantId</c>.</param>
/// <param name="Simulation">Status document of the simulation; serialized as <c>simulation</c>.</param>
/// <param name="Result">Lower-case result label; serialized as <c>result</c>.</param>
/// <param name="ObservedAt">Timestamp at which the status was observed; serialized as <c>observedAt</c>.</param>
/// <param name="LatencySeconds">Elapsed seconds for the simulation, or <c>null</c> when unavailable; serialized as <c>latencySeconds</c>.</param>
/// <param name="Reason">Optional failure or cancellation reason; serialized as <c>reason</c>.</param>
public sealed record PolicySimulationWebhookPayload(
    [property: JsonPropertyName("tenantId")] string TenantId,
    [property: JsonPropertyName("simulation")] PolicyRunStatus Simulation,
    [property: JsonPropertyName("result")] string Result,
    [property: JsonPropertyName("observedAt")] DateTimeOffset ObservedAt,
    [property: JsonPropertyName("latencySeconds")] double? LatencySeconds,
    [property: JsonPropertyName("reason")] string? Reason);
|
||||
|
||||
/// <summary>
/// Builds <see cref="PolicySimulationWebhookPayload"/> instances from policy run status documents.
/// </summary>
public static class PolicySimulationWebhookPayloadFactory
{
    /// <summary>
    /// Creates the webhook payload describing <paramref name="status"/> as observed at
    /// <paramref name="observedAt"/>.
    /// </summary>
    /// <param name="status">Simulation status to publish.</param>
    /// <param name="observedAt">Observation time; also closes the latency window when the run has no finish time.</param>
    /// <returns>The payload carrying the tenant, status, result label, latency and reason.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="status"/> is <c>null</c>.</exception>
    public static PolicySimulationWebhookPayload Create(PolicyRunStatus status, DateTimeOffset observedAt)
    {
        ArgumentNullException.ThrowIfNull(status);

        var resultLabel = DescribeResult(status.Status);
        var latency = CalculateLatencySeconds(status, observedAt);
        var reasonText = SelectReason(status);

        return new PolicySimulationWebhookPayload(
            status.TenantId,
            status,
            resultLabel,
            observedAt,
            latency,
            reasonText);
    }

    /// <summary>
    /// Maps an execution status to its wire label; anything unrecognised is reported as <c>queued</c>.
    /// </summary>
    private static string DescribeResult(PolicyRunExecutionStatus executionStatus)
    {
        switch (executionStatus)
        {
            case PolicyRunExecutionStatus.Succeeded:
                return "succeeded";
            case PolicyRunExecutionStatus.Failed:
                return "failed";
            case PolicyRunExecutionStatus.Cancelled:
                return "cancelled";
            case PolicyRunExecutionStatus.ReplayPending:
                return "replay_pending";
            case PolicyRunExecutionStatus.Running:
                return "running";
            default:
                return "queued";
        }
    }

    /// <summary>
    /// Picks the reason to publish: the error for failed runs, the cancellation reason for
    /// cancelled runs, and nothing for every other status.
    /// </summary>
    private static string? SelectReason(PolicyRunStatus status)
    {
        if (status.Status == PolicyRunExecutionStatus.Failed)
        {
            return status.Error;
        }

        if (status.Status == PolicyRunExecutionStatus.Cancelled)
        {
            return status.CancellationReason;
        }

        return null;
    }

    /// <summary>
    /// Computes the seconds between queueing and finishing, rounded to four decimals.
    /// Uses <paramref name="observedAt"/> when the run has no finish time, clamps negative
    /// durations to zero, and returns <c>null</c> when the queued timestamp is unset.
    /// </summary>
    private static double? CalculateLatencySeconds(PolicyRunStatus status, DateTimeOffset observedAt)
    {
        var windowStart = status.QueuedAt;
        if (windowStart == default)
        {
            return null;
        }

        var windowEnd = status.FinishedAt ?? observedAt;
        var elapsed = (windowEnd - windowStart).TotalSeconds;

        // Clock skew can put the end before the start; never report a negative latency.
        var nonNegative = elapsed < 0 ? 0 : elapsed;

        return Math.Round(nonNegative, 4);
    }
}
|
||||
@@ -21,6 +21,7 @@ public sealed record Run
|
||||
DateTimeOffset? finishedAt = null,
|
||||
string? error = null,
|
||||
IEnumerable<DeltaSummary>? deltas = null,
|
||||
string? retryOf = null,
|
||||
string? schemaVersion = null)
|
||||
: this(
|
||||
id,
|
||||
@@ -35,6 +36,7 @@ public sealed record Run
|
||||
Validation.NormalizeTimestamp(finishedAt),
|
||||
Validation.TrimToNull(error),
|
||||
NormalizeDeltas(deltas),
|
||||
Validation.TrimToNull(retryOf),
|
||||
schemaVersion)
|
||||
{
|
||||
}
|
||||
@@ -53,6 +55,7 @@ public sealed record Run
|
||||
DateTimeOffset? finishedAt,
|
||||
string? error,
|
||||
ImmutableArray<DeltaSummary> deltas,
|
||||
string? retryOf,
|
||||
string? schemaVersion = null)
|
||||
{
|
||||
Id = Validation.EnsureId(id, nameof(id));
|
||||
@@ -69,6 +72,7 @@ public sealed record Run
|
||||
Deltas = deltas.IsDefault
|
||||
? ImmutableArray<DeltaSummary>.Empty
|
||||
: deltas.OrderBy(static delta => delta.ImageDigest, StringComparer.Ordinal).ToImmutableArray();
|
||||
RetryOf = Validation.TrimToNull(retryOf);
|
||||
SchemaVersion = SchedulerSchemaVersions.EnsureRun(schemaVersion);
|
||||
}
|
||||
|
||||
@@ -103,6 +107,9 @@ public sealed record Run
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
|
||||
public ImmutableArray<DeltaSummary> Deltas { get; } = ImmutableArray<DeltaSummary>.Empty;
|
||||
|
||||
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
|
||||
public string? RetryOf { get; }
|
||||
|
||||
private static ImmutableArray<DeltaSummary> NormalizeDeltas(IEnumerable<DeltaSummary>? deltas)
|
||||
{
|
||||
if (deltas is null)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.Metrics;
|
||||
@@ -5,7 +6,7 @@ using System.Linq;
|
||||
|
||||
namespace StellaOps.Scheduler.Queue;
|
||||
|
||||
internal static class SchedulerQueueMetrics
|
||||
public static class SchedulerQueueMetrics
|
||||
{
|
||||
private const string TransportTagName = "transport";
|
||||
private const string QueueTagName = "queue";
|
||||
@@ -21,6 +22,25 @@ internal static class SchedulerQueueMetrics
|
||||
"scheduler_queue_depth",
|
||||
ObserveDepth);
|
||||
|
||||
/// <summary>
/// Captures the currently recorded queue depths as a read-only list ordered by
/// transport and then queue (ordinal comparison).
/// </summary>
/// <returns>An empty list when no depths have been recorded; otherwise the sorted samples.</returns>
public static IReadOnlyList<SchedulerQueueDepthSample> CaptureDepthSamples()
{
    // Copy first so the projection and sort below work on a stable snapshot.
    var entries = DepthSamples.ToArray();
    if (entries.Length == 0)
    {
        return Array.Empty<SchedulerQueueDepthSample>();
    }

    var ordered = Array.ConvertAll(
        entries,
        entry => new SchedulerQueueDepthSample(entry.Key.transport, entry.Key.queue, entry.Value));

    Array.Sort(ordered, SchedulerQueueDepthSampleComparer.Instance);

    return Array.AsReadOnly(ordered);
}
|
||||
|
||||
public static void RecordEnqueued(string transport, string queue)
|
||||
=> EnqueuedCounter.Add(1, BuildTags(transport, queue));
|
||||
|
||||
@@ -45,6 +65,22 @@ internal static class SchedulerQueueMetrics
|
||||
internal static IReadOnlyDictionary<(string transport, string queue), long> SnapshotDepths()
|
||||
=> DepthSamples.ToDictionary(pair => pair.Key, pair => pair.Value);
|
||||
|
||||
/// <summary>
/// Orders depth samples by transport, then by queue, using ordinal string comparison.
/// </summary>
private sealed class SchedulerQueueDepthSampleComparer : IComparer<SchedulerQueueDepthSample>
{
    /// <summary>Shared comparer instance.</summary>
    public static SchedulerQueueDepthSampleComparer Instance { get; } = new();

    /// <inheritdoc />
    public int Compare(SchedulerQueueDepthSample x, SchedulerQueueDepthSample y)
    {
        var byTransport = string.Compare(x.Transport, y.Transport, StringComparison.Ordinal);

        // The queue name only breaks ties between samples on the same transport.
        return byTransport != 0
            ? byTransport
            : string.Compare(x.Queue, y.Queue, StringComparison.Ordinal);
    }
}
|
||||
|
||||
private static KeyValuePair<string, object?>[] BuildTags(string transport, string queue)
|
||||
=> new[]
|
||||
{
|
||||
@@ -63,3 +99,5 @@ internal static class SchedulerQueueMetrics
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Queue depth reading for a single transport/queue pair.
/// </summary>
/// <param name="Transport">Transport the queue belongs to.</param>
/// <param name="Queue">Queue name within the transport.</param>
/// <param name="Depth">Recorded depth for the queue.</param>
public readonly record struct SchedulerQueueDepthSample(string Transport, string Queue, long Depth);
|
||||
|
||||
@@ -15,6 +15,8 @@ public sealed class SchedulerMongoOptions
|
||||
|
||||
public string RunsCollection { get; set; } = "runs";
|
||||
|
||||
public string PolicyJobsCollection { get; set; } = "policy_jobs";
|
||||
|
||||
public string ImpactSnapshotsCollection { get; set; } = "impact_snapshots";
|
||||
|
||||
public string AuditCollection { get; set; } = "audit";
|
||||
|
||||
@@ -36,13 +36,19 @@ public interface IPolicyRunJobRepository
|
||||
PolicyRunMode? mode = null,
|
||||
IReadOnlyCollection<PolicyRunJobStatus>? statuses = null,
|
||||
DateTimeOffset? queuedAfter = null,
|
||||
int limit = 50,
|
||||
IClientSessionHandle? session = null,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<bool> ReplaceAsync(
|
||||
PolicyRunJob job,
|
||||
string? expectedLeaseOwner = null,
|
||||
IClientSessionHandle? session = null,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
int limit = 50,
|
||||
IClientSessionHandle? session = null,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<bool> ReplaceAsync(
|
||||
PolicyRunJob job,
|
||||
string? expectedLeaseOwner = null,
|
||||
IClientSessionHandle? session = null,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<long> CountAsync(
|
||||
string tenantId,
|
||||
PolicyRunMode mode,
|
||||
IReadOnlyCollection<PolicyRunJobStatus> statuses,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using MongoDB.Bson;
|
||||
using MongoDB.Driver;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Internal;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Serialization;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using MongoDB.Bson;
|
||||
using MongoDB.Driver;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Internal;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Serialization;
|
||||
|
||||
namespace StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
@@ -206,16 +207,43 @@ internal sealed class PolicyRunJobRepository : IPolicyRunJobRepository
|
||||
.ToListAsync(cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
return documents
|
||||
.Select(PolicyRunJobDocumentMapper.FromBsonDocument)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
public async Task<bool> ReplaceAsync(
|
||||
PolicyRunJob job,
|
||||
string? expectedLeaseOwner = null,
|
||||
IClientSessionHandle? session = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
return documents
|
||||
.Select(PolicyRunJobDocumentMapper.FromBsonDocument)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Counts the policy run jobs of a tenant that match the given mode and, when any are
/// supplied, one of the given statuses.
/// </summary>
/// <param name="tenantId">Tenant whose jobs are counted; must not be blank.</param>
/// <param name="mode">Run mode to match (compared by its lower-case name).</param>
/// <param name="statuses">Statuses to include; a null or empty collection disables the status filter.</param>
/// <param name="cancellationToken">Token used to cancel the count operation.</param>
/// <returns>The number of matching job documents.</returns>
/// <exception cref="ArgumentException"><paramref name="tenantId"/> is null, empty or whitespace.</exception>
public async Task<long> CountAsync(
    string tenantId,
    PolicyRunMode mode,
    IReadOnlyCollection<PolicyRunJobStatus> statuses,
    CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(tenantId))
    {
        throw new ArgumentException("Tenant id must be provided.", nameof(tenantId));
    }

    var modeName = mode.ToString().ToLowerInvariant();
    var clauses = new List<FilterDefinition<BsonDocument>>
    {
        Filter.Eq("tenantId", tenantId),
        Filter.Eq("mode", modeName)
    };

    // The status clause is optional: without statuses every job of the mode is counted.
    if (statuses is not null && statuses.Count > 0)
    {
        var statusNames = statuses.Select(static status => status.ToString().ToLowerInvariant());
        clauses.Add(Filter.In("status", new BsonArray(statusNames)));
    }

    var total = await _collection
        .CountDocumentsAsync(Filter.And(clauses), cancellationToken: cancellationToken)
        .ConfigureAwait(false);

    return total;
}
|
||||
|
||||
public async Task<bool> ReplaceAsync(
|
||||
PolicyRunJob job,
|
||||
string? expectedLeaseOwner = null,
|
||||
IClientSessionHandle? session = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
|
||||
|
||||
@@ -0,0 +1,47 @@
|
||||
using System;
|
||||
|
||||
namespace StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
/// <summary>
/// Marks a position in a run listing: the creation time and identifier of the last run seen.
/// </summary>
public sealed record RunListCursor
{
    /// <summary>
    /// Creates a cursor from a timestamp and run id, normalising both.
    /// </summary>
    /// <param name="createdAt">Creation time of the last run observed; stored converted to UTC.</param>
    /// <param name="runId">Identifier of the last run observed; stored with surrounding whitespace removed.</param>
    /// <exception cref="ArgumentException">
    /// The run id is null or whitespace, or is longer than 256 characters once trimmed.
    /// </exception>
    public RunListCursor(DateTimeOffset createdAt, string runId)
    {
        CreatedAt = NormalizeTimestamp(createdAt);
        RunId = NormalizeRunId(runId);
    }

    /// <summary>
    /// Creation time of the last run observed, expressed in UTC.
    /// </summary>
    public DateTimeOffset CreatedAt { get; }

    /// <summary>
    /// Trimmed identifier of the last run observed.
    /// </summary>
    public string RunId { get; }

    // Converts to UTC and rebuilds the value from a UTC-kind DateTime, giving a zero offset.
    private static DateTimeOffset NormalizeTimestamp(DateTimeOffset value)
        => new(DateTime.SpecifyKind(value.ToUniversalTime().DateTime, DateTimeKind.Utc));

    // Rejects missing ids first, then trims, then enforces the length limit on the trimmed text.
    private static string NormalizeRunId(string value)
    {
        var candidate = string.IsNullOrWhiteSpace(value)
            ? throw new ArgumentException("Run id must be provided.", nameof(value))
            : value.Trim();

        return candidate.Length <= 256
            ? candidate
            : throw new ArgumentException("Run id exceeds 256 characters.", nameof(value));
    }
}
|
||||
@@ -19,16 +19,21 @@ public sealed class RunQueryOptions
|
||||
public ImmutableArray<RunState> States { get; init; } = ImmutableArray<RunState>.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// Optional lower bound for creation timestamp (UTC).
|
||||
/// </summary>
|
||||
public DateTimeOffset? CreatedAfter { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of runs to return (default 50 when unspecified).
|
||||
/// </summary>
|
||||
public int? Limit { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional lower bound for creation timestamp (UTC).
|
||||
/// </summary>
|
||||
public DateTimeOffset? CreatedAfter { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional cursor to resume iteration using deterministic ordering.
|
||||
/// </summary>
|
||||
public RunListCursor? Cursor { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of runs to return (default 50 when unspecified).
|
||||
/// </summary>
|
||||
public int? Limit { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Sort order flag. Defaults to descending by createdAt.
|
||||
/// </summary>
|
||||
public bool SortAscending { get; init; }
|
||||
|
||||
@@ -127,28 +127,53 @@ internal sealed class RunRepository : IRunRepository
|
||||
filters.Add(Filter.In("state", options.States.Select(state => state.ToString().ToLowerInvariant())));
|
||||
}
|
||||
|
||||
if (options.CreatedAfter is { } createdAfter)
|
||||
{
|
||||
filters.Add(Filter.Gt("createdAt", createdAfter.ToUniversalTime().UtcDateTime));
|
||||
}
|
||||
if (options.CreatedAfter is { } createdAfter)
|
||||
{
|
||||
filters.Add(Filter.Gt("createdAt", createdAfter.ToUniversalTime().UtcDateTime));
|
||||
}
|
||||
|
||||
if (options.Cursor is { } cursor)
|
||||
{
|
||||
var createdAtUtc = cursor.CreatedAt.ToUniversalTime().UtcDateTime;
|
||||
FilterDefinition<BsonDocument> cursorFilter;
|
||||
|
||||
if (options.SortAscending)
|
||||
{
|
||||
cursorFilter = Filter.Or(
|
||||
Filter.Gt("createdAt", createdAtUtc),
|
||||
Filter.And(
|
||||
Filter.Eq("createdAt", createdAtUtc),
|
||||
Filter.Gt("_id", cursor.RunId)));
|
||||
}
|
||||
else
|
||||
{
|
||||
cursorFilter = Filter.Or(
|
||||
Filter.Lt("createdAt", createdAtUtc),
|
||||
Filter.And(
|
||||
Filter.Eq("createdAt", createdAtUtc),
|
||||
Filter.Lt("_id", cursor.RunId)));
|
||||
}
|
||||
|
||||
filters.Add(cursorFilter);
|
||||
}
|
||||
|
||||
var combined = Filter.And(filters);
|
||||
|
||||
var find = session is null
|
||||
? _collection.Find(combined)
|
||||
: _collection.Find(session, combined);
|
||||
|
||||
var combined = Filter.And(filters);
|
||||
|
||||
var find = session is null
|
||||
? _collection.Find(combined)
|
||||
: _collection.Find(session, combined);
|
||||
|
||||
var limit = options.Limit is { } specified && specified > 0 ? specified : DefaultListLimit;
|
||||
find = find.Limit(limit);
|
||||
|
||||
var sortDefinition = options.SortAscending
|
||||
? Sort.Ascending("createdAt")
|
||||
: Sort.Descending("createdAt");
|
||||
|
||||
find = find.Sort(sortDefinition);
|
||||
|
||||
var documents = await find.ToListAsync(cancellationToken).ConfigureAwait(false);
|
||||
return documents.Select(RunDocumentMapper.FromBsonDocument).ToArray();
|
||||
var limit = options.Limit is { } specified && specified > 0 ? specified : DefaultListLimit;
|
||||
find = find.Limit(limit);
|
||||
|
||||
var sortDefinition = options.SortAscending
|
||||
? Sort.Combine(Sort.Ascending("createdAt"), Sort.Ascending("_id"))
|
||||
: Sort.Combine(Sort.Descending("createdAt"), Sort.Descending("_id"));
|
||||
|
||||
find = find.Sort(sortDefinition);
|
||||
|
||||
var documents = await find.ToListAsync(cancellationToken).ConfigureAwait(false);
|
||||
return documents.Select(RunDocumentMapper.FromBsonDocument).ToArray();
|
||||
}
|
||||
|
||||
public async Task<IReadOnlyList<Run>> ListByStateAsync(
|
||||
|
||||
@@ -57,8 +57,9 @@ public static class SchedulerWorkerServiceCollectionExtensions
|
||||
loggerFactory.CreateLogger<SchedulerEventPublisher>());
|
||||
});
|
||||
|
||||
services.AddHttpClient<IScannerReportClient, HttpScannerReportClient>();
|
||||
services.AddHttpClient<IPolicyRunClient, HttpPolicyRunClient>();
|
||||
services.AddHttpClient<IScannerReportClient, HttpScannerReportClient>();
|
||||
services.AddHttpClient<IPolicyRunClient, HttpPolicyRunClient>();
|
||||
services.AddHttpClient<IPolicySimulationWebhookClient, HttpPolicySimulationWebhookClient>();
|
||||
services.AddHttpClient<ICartographerBuildClient, HttpCartographerBuildClient>((sp, client) =>
|
||||
{
|
||||
var options = sp.GetRequiredService<IOptions<SchedulerWorkerOptions>>().Value.Graph;
|
||||
|
||||
@@ -4,10 +4,11 @@ using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
|
||||
@@ -4,10 +4,11 @@ using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
|
||||
@@ -1,236 +1,245 @@
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.Metrics;
|
||||
using StellaOps.Scheduler.Models;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Creates and records into the instruments published on the <see cref="MeterName"/> meter.
/// </summary>
public sealed class SchedulerWorkerMetrics : IDisposable
{
    /// <summary>Name of the meter that owns every instrument created by this class.</summary>
    public const string MeterName = "StellaOps.Scheduler.Worker";

    private readonly Meter _meter;
    private readonly Counter<long> _plannerRunsTotal;
    private readonly Histogram<double> _plannerLatencySeconds;
    private readonly Counter<long> _runnerSegmentsTotal;
    private readonly Counter<long> _runnerImagesTotal;
    private readonly Counter<long> _runnerDeltaCriticalTotal;
    private readonly Counter<long> _runnerDeltaHighTotal;
    private readonly Counter<long> _runnerDeltaFindingsTotal;
    private readonly Counter<long> _runnerKevHitsTotal;
    private readonly Histogram<double> _runDurationSeconds;
    private readonly UpDownCounter<long> _runsActive;
    private readonly Counter<long> _graphJobsTotal;
    private readonly Histogram<double> _graphJobDurationSeconds;

    // Latest backlog value per "mode|scheduleId" key; enumerated by the gauge callback.
    private readonly ConcurrentDictionary<string, long> _backlog = new(StringComparer.Ordinal);
    private readonly ObservableGauge<long> _backlogGauge;
    private bool _disposed;

    /// <summary>
    /// Creates the meter and registers every counter, histogram and gauge on it.
    /// </summary>
    public SchedulerWorkerMetrics()
    {
        _meter = new Meter(MeterName);

        _plannerRunsTotal = _meter.CreateCounter<long>("scheduler_planner_runs_total",
            unit: "count", description: "Planner runs grouped by status and mode.");
        _plannerLatencySeconds = _meter.CreateHistogram<double>("scheduler_planner_latency_seconds",
            unit: "s", description: "Latency between run creation and planner processing grouped by mode and status.");
        _runnerSegmentsTotal = _meter.CreateCounter<long>("scheduler_runner_segments_total",
            unit: "count", description: "Runner segments processed grouped by status and mode.");
        _runnerImagesTotal = _meter.CreateCounter<long>("scheduler_runner_images_total",
            unit: "count", description: "Images processed by runner grouped by mode and delta outcome.");
        _runnerDeltaCriticalTotal = _meter.CreateCounter<long>("scheduler_runner_delta_critical_total",
            unit: "count", description: "Critical findings observed by runner grouped by mode.");
        _runnerDeltaHighTotal = _meter.CreateCounter<long>("scheduler_runner_delta_high_total",
            unit: "count", description: "High findings observed by runner grouped by mode.");
        _runnerDeltaFindingsTotal = _meter.CreateCounter<long>("scheduler_runner_delta_total",
            unit: "count", description: "Total findings observed by runner grouped by mode.");
        _runnerKevHitsTotal = _meter.CreateCounter<long>("scheduler_runner_delta_kev_total",
            unit: "count", description: "KEV hits observed by runner grouped by mode.");
        _runDurationSeconds = _meter.CreateHistogram<double>("scheduler_run_duration_seconds",
            unit: "s", description: "End-to-end run durations grouped by mode and result.");
        _runsActive = _meter.CreateUpDownCounter<long>("scheduler_runs_active",
            unit: "count", description: "Active scheduler runs grouped by mode.");
        _graphJobsTotal = _meter.CreateCounter<long>("scheduler_graph_jobs_total",
            unit: "count", description: "Graph jobs processed by the worker grouped by type and result.");
        _graphJobDurationSeconds = _meter.CreateHistogram<double>("scheduler_graph_job_duration_seconds",
            unit: "s", description: "Graph job durations grouped by type and result.");
        _backlogGauge = _meter.CreateObservableGauge<long>("scheduler_runner_backlog", ObserveBacklog,
            unit: "images", description: "Remaining images queued for runner processing grouped by mode and schedule.");
    }

    /// <summary>
    /// Counts one graph job and, when a duration is supplied, records it (negative values are recorded as zero).
    /// </summary>
    public void RecordGraphJobResult(string type, string result, TimeSpan? duration = null)
    {
        KeyValuePair<string, object?>[] labels = { Tag("type", type), Tag("result", result) };

        _graphJobsTotal.Add(1, labels);

        if (duration.HasValue)
        {
            _graphJobDurationSeconds.Record(NonNegativeSeconds(duration.Value), labels);
        }
    }

    /// <summary>
    /// Counts one planner run and its latency; an "enqueued" run with at least one image also raises the active-run count.
    /// </summary>
    public void RecordPlannerResult(string mode, string status, TimeSpan latency, int imageCount)
    {
        KeyValuePair<string, object?>[] labels = { Tag("mode", mode), Tag("status", status) };

        _plannerRunsTotal.Add(1, labels);
        _plannerLatencySeconds.Record(NonNegativeSeconds(latency), labels);

        if (!status.Equals("enqueued", StringComparison.OrdinalIgnoreCase) || imageCount <= 0)
        {
            return;
        }

        _runsActive.Add(1, new[] { Tag("mode", mode) });
    }

    /// <summary>
    /// Counts one runner segment and adds its processed images, tagged by whether any image produced a delta.
    /// </summary>
    public void RecordRunnerSegment(string mode, string status, int processedImages, int deltaImages)
    {
        KeyValuePair<string, object?>[] segmentLabels = { Tag("mode", mode), Tag("status", status) };
        _runnerSegmentsTotal.Add(1, segmentLabels);

        var deltaLabel = deltaImages > 0 ? "true" : "false";
        KeyValuePair<string, object?>[] imageLabels = { Tag("mode", mode), Tag("delta", deltaLabel) };
        _runnerImagesTotal.Add(processedImages, imageLabels);
    }

    /// <summary>
    /// Adds the new critical, high and total finding counts and the KEV hit count of each summary to the delta counters.
    /// </summary>
    public void RecordDeltaSummaries(string mode, IReadOnlyList<DeltaSummary> deltas)
    {
        if (deltas.Count == 0)
        {
            return;
        }

        KeyValuePair<string, object?>[] labels = { Tag("mode", mode) };

        foreach (var summary in deltas)
        {
            // Only positive amounts are recorded.
            AddWhenPositive(_runnerDeltaCriticalTotal, summary.NewCriticals, labels);
            AddWhenPositive(_runnerDeltaHighTotal, summary.NewHigh, labels);
            AddWhenPositive(_runnerDeltaFindingsTotal, summary.NewFindings, labels);

            if (!summary.KevHits.IsDefaultOrEmpty)
            {
                _runnerKevHitsTotal.Add(summary.KevHits.Length, labels);
            }
        }
    }

    /// <summary>
    /// Records the run duration when one is supplied and, unless told otherwise, lowers the active-run count for the mode.
    /// </summary>
    public void RecordRunCompletion(string mode, string result, TimeSpan? duration, bool decrementActive = true)
    {
        if (duration.HasValue)
        {
            KeyValuePair<string, object?>[] labels = { Tag("mode", mode), Tag("result", result) };
            _runDurationSeconds.Record(NonNegativeSeconds(duration.Value), labels);
        }

        if (decrementActive)
        {
            _runsActive.Add(-1, new[] { Tag("mode", mode) });
        }
    }

    /// <summary>
    /// Stores the backlog for a mode/schedule pair; a value of zero or less removes the entry.
    /// </summary>
    public void UpdateBacklog(string mode, string? scheduleId, long backlog)
    {
        var key = BuildBacklogKey(mode, scheduleId);

        if (backlog > 0)
        {
            _backlog[key] = backlog;
            return;
        }

        _backlog.TryRemove(key, out _);
    }

    // Gauge callback: yields one measurement per stored backlog entry.
    private IEnumerable<Measurement<long>> ObserveBacklog()
    {
        foreach (var (key, remaining) in _backlog)
        {
            var (mode, scheduleId) = SplitBacklogKey(key);
            yield return new Measurement<long>(
                remaining,
                Tag("mode", mode),
                Tag("scheduleId", scheduleId ?? string.Empty));
        }
    }

    // Key layout is "<mode>|<scheduleId>", with an empty schedule part when there is no schedule.
    private static string BuildBacklogKey(string mode, string? scheduleId)
        => string.Concat(mode, "|", scheduleId ?? string.Empty);

    // Splits on the first '|' only; an empty schedule part is reported as null.
    private static (string Mode, string? ScheduleId) SplitBacklogKey(string key)
    {
        var separator = key.IndexOf('|');
        if (separator < 0)
        {
            return (key, null);
        }

        var schedule = key[(separator + 1)..];
        return (key[..separator], schedule.Length == 0 ? null : schedule);
    }

    private static KeyValuePair<string, object?> Tag(string name, object? value) => new(name, value);

    // Durations are clamped so a negative span is recorded as zero seconds.
    private static double NonNegativeSeconds(TimeSpan span) => Math.Max(span.TotalSeconds, 0d);

    private static void AddWhenPositive(Counter<long> counter, long amount, KeyValuePair<string, object?>[] labels)
    {
        if (amount > 0)
        {
            counter.Add(amount, labels);
        }
    }

    /// <summary>
    /// Disposes the meter once; later calls do nothing.
    /// </summary>
    public void Dispose()
    {
        if (!_disposed)
        {
            _meter.Dispose();
            _disposed = true;
        }
    }
}
|
||||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.Metrics;
|
||||
using StellaOps.Scheduler.Models;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Creates and records into the instruments published on the <see cref="MeterName"/> meter.
/// </summary>
public sealed class SchedulerWorkerMetrics : IDisposable
{
    /// <summary>Name of the meter that owns every instrument created by this class.</summary>
    public const string MeterName = "StellaOps.Scheduler.Worker";

    private readonly Meter _meter;
    private readonly Counter<long> _plannerRunsTotal;
    private readonly Histogram<double> _plannerLatencySeconds;
    private readonly Counter<long> _runnerSegmentsTotal;
    private readonly Counter<long> _runnerImagesTotal;
    private readonly Counter<long> _runnerDeltaCriticalTotal;
    private readonly Counter<long> _runnerDeltaHighTotal;
    private readonly Counter<long> _runnerDeltaFindingsTotal;
    private readonly Counter<long> _runnerKevHitsTotal;
    private readonly Histogram<double> _runDurationSeconds;
    private readonly UpDownCounter<long> _runsActive;
    private readonly Counter<long> _graphJobsTotal;
    private readonly Histogram<double> _graphJobDurationSeconds;

    // Latest backlog value per "mode|scheduleId" key; enumerated by the gauge callback.
    private readonly ConcurrentDictionary<string, long> _backlog = new(StringComparer.Ordinal);
    private readonly ObservableGauge<long> _backlogGauge;
    private bool _disposed;

    /// <summary>
    /// Creates the meter and registers every counter, histogram and gauge on it.
    /// </summary>
    public SchedulerWorkerMetrics()
    {
        _meter = new Meter(MeterName);

        _plannerRunsTotal = _meter.CreateCounter<long>("scheduler_planner_runs_total",
            unit: "count", description: "Planner runs grouped by status and mode.");
        _plannerLatencySeconds = _meter.CreateHistogram<double>("scheduler_planner_latency_seconds",
            unit: "s", description: "Latency between run creation and planner processing grouped by mode and status.");
        _runnerSegmentsTotal = _meter.CreateCounter<long>("scheduler_runner_segments_total",
            unit: "count", description: "Runner segments processed grouped by status and mode.");
        _runnerImagesTotal = _meter.CreateCounter<long>("scheduler_runner_images_total",
            unit: "count", description: "Images processed by runner grouped by mode and delta outcome.");
        _runnerDeltaCriticalTotal = _meter.CreateCounter<long>("scheduler_runner_delta_critical_total",
            unit: "count", description: "Critical findings observed by runner grouped by mode.");
        _runnerDeltaHighTotal = _meter.CreateCounter<long>("scheduler_runner_delta_high_total",
            unit: "count", description: "High findings observed by runner grouped by mode.");
        _runnerDeltaFindingsTotal = _meter.CreateCounter<long>("scheduler_runner_delta_total",
            unit: "count", description: "Total findings observed by runner grouped by mode.");
        _runnerKevHitsTotal = _meter.CreateCounter<long>("scheduler_runner_delta_kev_total",
            unit: "count", description: "KEV hits observed by runner grouped by mode.");
        _runDurationSeconds = _meter.CreateHistogram<double>("scheduler_run_duration_seconds",
            unit: "s", description: "End-to-end run durations grouped by mode and result.");
        _runsActive = _meter.CreateUpDownCounter<long>("scheduler_runs_active",
            unit: "count", description: "Active scheduler runs grouped by mode.");
        _graphJobsTotal = _meter.CreateCounter<long>("scheduler_graph_jobs_total",
            unit: "count", description: "Graph jobs processed by the worker grouped by type and result.");
        _graphJobDurationSeconds = _meter.CreateHistogram<double>("scheduler_graph_job_duration_seconds",
            unit: "s", description: "Graph job durations grouped by type and result.");
        _backlogGauge = _meter.CreateObservableGauge<long>("scheduler_runner_backlog", ObserveBacklog,
            unit: "images", description: "Remaining images queued for runner processing grouped by mode and schedule.");
    }

    /// <summary>
    /// Counts one graph job and, when a duration is supplied, records it (negative values are recorded as zero).
    /// </summary>
    public void RecordGraphJobResult(string type, string result, TimeSpan? duration = null)
    {
        KeyValuePair<string, object?>[] labels = { Tag("type", type), Tag("result", result) };

        _graphJobsTotal.Add(1, labels);

        if (duration.HasValue)
        {
            _graphJobDurationSeconds.Record(NonNegativeSeconds(duration.Value), labels);
        }
    }

    /// <summary>
    /// Counts one planner run and its latency; an "enqueued" run with at least one image also raises the active-run count.
    /// </summary>
    public void RecordPlannerResult(string mode, string status, TimeSpan latency, int imageCount)
    {
        KeyValuePair<string, object?>[] labels = { Tag("mode", mode), Tag("status", status) };

        _plannerRunsTotal.Add(1, labels);
        _plannerLatencySeconds.Record(NonNegativeSeconds(latency), labels);

        if (!status.Equals("enqueued", StringComparison.OrdinalIgnoreCase) || imageCount <= 0)
        {
            return;
        }

        _runsActive.Add(1, new[] { Tag("mode", mode) });
    }

    /// <summary>
    /// Counts one runner segment and adds its processed images, tagged by whether any image produced a delta.
    /// </summary>
    public void RecordRunnerSegment(string mode, string status, int processedImages, int deltaImages)
    {
        KeyValuePair<string, object?>[] segmentLabels = { Tag("mode", mode), Tag("status", status) };
        _runnerSegmentsTotal.Add(1, segmentLabels);

        var deltaLabel = deltaImages > 0 ? "true" : "false";
        KeyValuePair<string, object?>[] imageLabels = { Tag("mode", mode), Tag("delta", deltaLabel) };
        _runnerImagesTotal.Add(processedImages, imageLabels);
    }

    /// <summary>
    /// Adds the new critical, high and total finding counts and the KEV hit count of each summary to the delta counters.
    /// </summary>
    public void RecordDeltaSummaries(string mode, IReadOnlyList<DeltaSummary> deltas)
    {
        if (deltas.Count == 0)
        {
            return;
        }

        KeyValuePair<string, object?>[] labels = { Tag("mode", mode) };

        foreach (var summary in deltas)
        {
            // Only positive amounts are recorded.
            AddWhenPositive(_runnerDeltaCriticalTotal, summary.NewCriticals, labels);
            AddWhenPositive(_runnerDeltaHighTotal, summary.NewHigh, labels);
            AddWhenPositive(_runnerDeltaFindingsTotal, summary.NewFindings, labels);

            if (!summary.KevHits.IsDefaultOrEmpty)
            {
                _runnerKevHitsTotal.Add(summary.KevHits.Length, labels);
            }
        }
    }

    /// <summary>
    /// Records a policy run event through <see cref="RecordRunCompletion"/>, using the lower-cased mode name as the mode tag.
    /// The active-run count is left untouched for the "submitted" and "retry" statuses and lowered for every other status.
    /// </summary>
    /// <remarks>
    /// <paramref name="tenantId"/>, <paramref name="policyId"/> and <paramref name="reason"/> are accepted but not recorded.
    /// </remarks>
    public void RecordPolicyRunEvent(string tenantId, string policyId, PolicyRunMode mode, string status, TimeSpan? latency = null, string? reason = null)
    {
        // NOTE(review): nothing in this class raises scheduler_runs_active for policy run events,
        // so the statuses that fall through here only ever lower it; confirm that is intended.
        var stillInFlight = string.Equals(status, "submitted", StringComparison.OrdinalIgnoreCase)
            || string.Equals(status, "retry", StringComparison.OrdinalIgnoreCase);

        RecordRunCompletion(mode.ToString().ToLowerInvariant(), status, latency, decrementActive: !stillInFlight);
    }

    /// <summary>
    /// Records the run duration when one is supplied and, unless told otherwise, lowers the active-run count for the mode.
    /// </summary>
    public void RecordRunCompletion(string mode, string result, TimeSpan? duration, bool decrementActive = true)
    {
        if (duration.HasValue)
        {
            KeyValuePair<string, object?>[] labels = { Tag("mode", mode), Tag("result", result) };
            _runDurationSeconds.Record(NonNegativeSeconds(duration.Value), labels);
        }

        if (decrementActive)
        {
            _runsActive.Add(-1, new[] { Tag("mode", mode) });
        }
    }

    /// <summary>
    /// Stores the backlog for a mode/schedule pair; a value of zero or less removes the entry.
    /// </summary>
    public void UpdateBacklog(string mode, string? scheduleId, long backlog)
    {
        var key = BuildBacklogKey(mode, scheduleId);

        if (backlog > 0)
        {
            _backlog[key] = backlog;
            return;
        }

        _backlog.TryRemove(key, out _);
    }

    // Gauge callback: yields one measurement per stored backlog entry.
    private IEnumerable<Measurement<long>> ObserveBacklog()
    {
        foreach (var (key, remaining) in _backlog)
        {
            var (mode, scheduleId) = SplitBacklogKey(key);
            yield return new Measurement<long>(
                remaining,
                Tag("mode", mode),
                Tag("scheduleId", scheduleId ?? string.Empty));
        }
    }

    // Key layout is "<mode>|<scheduleId>", with an empty schedule part when there is no schedule.
    private static string BuildBacklogKey(string mode, string? scheduleId)
        => string.Concat(mode, "|", scheduleId ?? string.Empty);

    // Splits on the first '|' only; an empty schedule part is reported as null.
    private static (string Mode, string? ScheduleId) SplitBacklogKey(string key)
    {
        var separator = key.IndexOf('|');
        if (separator < 0)
        {
            return (key, null);
        }

        var schedule = key[(separator + 1)..];
        return (key[..separator], schedule.Length == 0 ? null : schedule);
    }

    private static KeyValuePair<string, object?> Tag(string name, object? value) => new(name, value);

    // Durations are clamped so a negative span is recorded as zero seconds.
    private static double NonNegativeSeconds(TimeSpan span) => Math.Max(span.TotalSeconds, 0d);

    private static void AddWhenPositive(Counter<long> counter, long amount, KeyValuePair<string, object?>[] labels)
    {
        if (amount > 0)
        {
            counter.Add(amount, labels);
        }
    }

    /// <summary>
    /// Disposes the meter once; later calls do nothing.
    /// </summary>
    public void Dispose()
    {
        if (!_disposed)
        {
            _meter.Dispose();
            _disposed = true;
        }
    }
}
|
||||
|
||||
@@ -280,18 +280,21 @@ public sealed class SchedulerWorkerOptions
|
||||
/// </summary>
|
||||
public bool Enabled { get; set; } = true;
|
||||
|
||||
public DispatchOptions Dispatch { get; set; } = new();
|
||||
|
||||
public ApiOptions Api { get; set; } = new();
|
||||
|
||||
public TargetingOptions Targeting { get; set; } = new();
|
||||
|
||||
/// <summary>
/// Validates the nested option groups by calling Validate on Dispatch, Api and Targeting, in that order.
/// </summary>
public void Validate()
|
||||
{
|
||||
Dispatch.Validate();
|
||||
Api.Validate();
|
||||
Targeting.Validate();
|
||||
}
|
||||
public DispatchOptions Dispatch { get; set; } = new();
|
||||
|
||||
public ApiOptions Api { get; set; } = new();
|
||||
|
||||
public TargetingOptions Targeting { get; set; } = new();
|
||||
|
||||
public WebhookOptions Webhook { get; set; } = new();
|
||||
|
||||
/// <summary>
/// Validates the nested option groups by calling Validate on Dispatch, Api, Targeting and Webhook, in that order.
/// </summary>
public void Validate()
|
||||
{
|
||||
Dispatch.Validate();
|
||||
Api.Validate();
|
||||
Targeting.Validate();
|
||||
Webhook.Validate();
|
||||
}
|
||||
|
||||
public sealed class DispatchOptions
|
||||
{
|
||||
@@ -430,11 +433,11 @@ public sealed class SchedulerWorkerOptions
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class TargetingOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// When disabled the worker skips policy delta targeting.
|
||||
/// </summary>
|
||||
public sealed class TargetingOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// When disabled the worker skips policy delta targeting.
|
||||
/// </summary>
|
||||
public bool Enabled { get; set; } = true;
|
||||
|
||||
/// <summary>
|
||||
@@ -454,8 +457,59 @@ public sealed class SchedulerWorkerOptions
|
||||
throw new InvalidOperationException("Policy targeting MaxSboms must be greater than zero.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Settings for the webhook callback emitted when policy simulations complete.
/// </summary>
public sealed class WebhookOptions
{
    /// <summary>
    /// Controls whether webhook callbacks are emitted when simulations complete.
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Absolute http or https endpoint to invoke for webhook callbacks.
    /// </summary>
    public string? Endpoint { get; set; }

    /// <summary>
    /// Optional header to carry an API key.
    /// </summary>
    public string? ApiKeyHeader { get; set; }

    /// <summary>
    /// Optional API key value aligned with <see cref="ApiKeyHeader"/>.
    /// </summary>
    public string? ApiKey { get; set; }

    /// <summary>
    /// Request timeout in seconds.
    /// </summary>
    public int TimeoutSeconds { get; set; } = 10;

    /// <summary>
    /// Checks the webhook settings. Nothing is checked while <see cref="Enabled"/> is false.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The webhook is enabled and the endpoint is missing, is not an absolute URI, does not use
    /// the http or https scheme, or the timeout is not greater than zero.
    /// </exception>
    public void Validate()
    {
        if (!Enabled)
        {
            return;
        }

        if (string.IsNullOrWhiteSpace(Endpoint))
        {
            throw new InvalidOperationException("Policy webhook endpoint must be configured when enabled.");
        }

        if (!Uri.TryCreate(Endpoint, UriKind.Absolute, out var endpointUri))
        {
            throw new InvalidOperationException("Policy webhook endpoint must be an absolute URI.");
        }

        // An absolute URI can still be ftp://, file:// and so on; reject those here so a
        // misconfigured endpoint fails validation instead of slipping through.
        var isHttp = string.Equals(endpointUri.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase)
            || string.Equals(endpointUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);
        if (!isHttp)
        {
            throw new InvalidOperationException("Policy webhook endpoint must use the http or https scheme.");
        }

        if (TimeoutSeconds <= 0)
        {
            throw new InvalidOperationException("Policy webhook timeout must be greater than zero.");
        }
    }
}
|
||||
}
|
||||
|
||||
public sealed class GraphOptions
|
||||
{
|
||||
|
||||
@@ -13,30 +13,33 @@ namespace StellaOps.Scheduler.Worker.Policy;
|
||||
internal sealed class PolicyRunExecutionService
|
||||
{
|
||||
private readonly IPolicyRunJobRepository _repository;
|
||||
private readonly IPolicyRunClient _client;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly SchedulerWorkerMetrics _metrics;
|
||||
private readonly IPolicyRunTargetingService _targetingService;
|
||||
private readonly ILogger<PolicyRunExecutionService> _logger;
|
||||
|
||||
/// <summary>
/// Stores the collaborators used to execute policy run jobs.
/// </summary>
/// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> is used when null.</param>
/// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
public PolicyRunExecutionService(
    IPolicyRunJobRepository repository,
    IPolicyRunClient client,
    IOptions<SchedulerWorkerOptions> options,
    TimeProvider? timeProvider,
    SchedulerWorkerMetrics metrics,
    IPolicyRunTargetingService targetingService,
    ILogger<PolicyRunExecutionService> logger)
{
    // Guards run in parameter order so the first missing dependency is the one reported.
    ArgumentNullException.ThrowIfNull(repository);
    ArgumentNullException.ThrowIfNull(client);
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(metrics);
    ArgumentNullException.ThrowIfNull(targetingService);
    ArgumentNullException.ThrowIfNull(logger);

    _repository = repository;
    _client = client;
    _options = options;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
    _metrics = metrics;
    _targetingService = targetingService;
    _logger = logger;
}
|
||||
private readonly IPolicyRunClient _client;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly SchedulerWorkerMetrics _metrics;
|
||||
private readonly IPolicyRunTargetingService _targetingService;
|
||||
private readonly IPolicySimulationWebhookClient _webhookClient;
|
||||
private readonly ILogger<PolicyRunExecutionService> _logger;
|
||||
|
||||
/// <summary>
/// Stores the collaborators used to execute policy run jobs, including the simulation webhook client.
/// </summary>
/// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> is used when null.</param>
/// <exception cref="ArgumentNullException">Any argument other than <paramref name="timeProvider"/> is null.</exception>
public PolicyRunExecutionService(
    IPolicyRunJobRepository repository,
    IPolicyRunClient client,
    IOptions<SchedulerWorkerOptions> options,
    TimeProvider? timeProvider,
    SchedulerWorkerMetrics metrics,
    IPolicyRunTargetingService targetingService,
    IPolicySimulationWebhookClient webhookClient,
    ILogger<PolicyRunExecutionService> logger)
{
    // Guards run in parameter order so the first missing dependency is the one reported.
    ArgumentNullException.ThrowIfNull(repository);
    ArgumentNullException.ThrowIfNull(client);
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(metrics);
    ArgumentNullException.ThrowIfNull(targetingService);
    ArgumentNullException.ThrowIfNull(webhookClient);
    ArgumentNullException.ThrowIfNull(logger);

    _repository = repository;
    _client = client;
    _options = options;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
    _metrics = metrics;
    _targetingService = targetingService;
    _webhookClient = webhookClient;
    _logger = logger;
}
|
||||
|
||||
public async Task<PolicyRunExecutionResult> ExecuteAsync(PolicyRunJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
@@ -62,20 +65,24 @@ internal sealed class PolicyRunExecutionService
|
||||
_logger.LogWarning("Failed to update cancelled policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
cancelled.TenantId,
|
||||
cancelled.PolicyId,
|
||||
cancelled.Mode,
|
||||
"cancelled",
|
||||
reason: cancelled.CancellationReason);
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} cancelled (tenant={TenantId}, policy={PolicyId}, runId={RunId}).",
|
||||
cancelled.Id,
|
||||
cancelled.TenantId,
|
||||
cancelled.PolicyId,
|
||||
cancelled.RunId ?? "(pending)");
|
||||
|
||||
return PolicyRunExecutionResult.Cancelled(cancelled);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
cancelled.TenantId,
|
||||
cancelled.PolicyId,
|
||||
cancelled.Mode,
|
||||
"cancelled",
|
||||
reason: cancelled.CancellationReason);
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} cancelled (tenant={TenantId}, policy={PolicyId}, runId={RunId}).",
|
||||
cancelled.Id,
|
||||
cancelled.TenantId,
|
||||
cancelled.PolicyId,
|
||||
cancelled.RunId ?? "(pending)");
|
||||
|
||||
var cancelledStatus = PolicyRunStatusFactory.Create(cancelled, cancelledAt);
|
||||
var cancelledPayload = PolicySimulationWebhookPayloadFactory.Create(cancelledStatus, cancelledAt);
|
||||
await _webhookClient.NotifyAsync(cancelledPayload, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return PolicyRunExecutionResult.Cancelled(cancelled);
|
||||
}
|
||||
|
||||
var targeting = await _targetingService
|
||||
@@ -108,19 +115,23 @@ internal sealed class PolicyRunExecutionService
|
||||
}
|
||||
|
||||
var latency = CalculateLatency(job, completionTime);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
completed.TenantId,
|
||||
completed.PolicyId,
|
||||
completed.Mode,
|
||||
"no_work",
|
||||
latency,
|
||||
targeting.Reason);
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} completed without submission (reason={Reason}).",
|
||||
completed.Id,
|
||||
targeting.Reason ?? "none");
|
||||
|
||||
return PolicyRunExecutionResult.NoOp(completed, targeting.Reason);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
completed.TenantId,
|
||||
completed.PolicyId,
|
||||
completed.Mode,
|
||||
"no_work",
|
||||
latency,
|
||||
targeting.Reason);
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} completed without submission (reason={Reason}).",
|
||||
completed.Id,
|
||||
targeting.Reason ?? "none");
|
||||
|
||||
var completedStatus = PolicyRunStatusFactory.Create(completed, completionTime);
|
||||
var completedPayload = PolicySimulationWebhookPayloadFactory.Create(completedStatus, completionTime);
|
||||
await _webhookClient.NotifyAsync(completedPayload, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return PolicyRunExecutionResult.NoOp(completed, targeting.Reason);
|
||||
}
|
||||
|
||||
job = targeting.Job;
|
||||
@@ -200,24 +211,28 @@ internal sealed class PolicyRunExecutionService
|
||||
|
||||
if (nextStatus == PolicyRunJobStatus.Failed)
|
||||
{
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.Mode,
|
||||
"failed",
|
||||
latencyForFailure,
|
||||
reason);
|
||||
|
||||
_logger.LogError(
|
||||
"Policy run job {JobId} failed after {Attempts} attempts (tenant={TenantId}, policy={PolicyId}, runId={RunId}). Error: {Error}",
|
||||
failedJob.Id,
|
||||
attemptCount,
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.RunId ?? "(pending)",
|
||||
submission.Error ?? "unknown");
|
||||
|
||||
return PolicyRunExecutionResult.Failed(failedJob, submission.Error);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.Mode,
|
||||
"failed",
|
||||
latencyForFailure,
|
||||
reason);
|
||||
|
||||
_logger.LogError(
|
||||
"Policy run job {JobId} failed after {Attempts} attempts (tenant={TenantId}, policy={PolicyId}, runId={RunId}). Error: {Error}",
|
||||
failedJob.Id,
|
||||
attemptCount,
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.RunId ?? "(pending)",
|
||||
submission.Error ?? "unknown");
|
||||
|
||||
var failedStatus = PolicyRunStatusFactory.Create(failedJob, now);
|
||||
var failedPayload = PolicySimulationWebhookPayloadFactory.Create(failedStatus, now);
|
||||
await _webhookClient.NotifyAsync(failedPayload, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return PolicyRunExecutionResult.Failed(failedJob, submission.Error);
|
||||
}
|
||||
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
using System;
|
||||
using System.Net.Http;
|
||||
using System.Net.Mime;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
/// <summary>
/// Abstraction over the channel used to notify a webhook about a policy simulation.
/// </summary>
internal interface IPolicySimulationWebhookClient
{
    /// <summary>
    /// Sends a notification for the given payload.
    /// </summary>
    /// <param name="payload">The webhook payload to deliver.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task NotifyAsync(PolicySimulationWebhookPayload payload, CancellationToken cancellationToken);
}
|
||||
|
||||
/// <summary>
/// <see cref="IPolicySimulationWebhookClient"/> that POSTs the payload as JSON to the endpoint
/// configured in <c>SchedulerWorkerOptions.Policy.Webhook</c>.
/// </summary>
/// <remarks>
/// Delivery is best effort: non-success responses and transport failures are logged and swallowed.
/// Only cancellation requested by the caller is propagated.
/// </remarks>
internal sealed class HttpPolicySimulationWebhookClient : IPolicySimulationWebhookClient
{
    /// <summary>Timeout applied when the configured value is zero or negative.</summary>
    private static readonly TimeSpan DefaultTimeout = TimeSpan.FromSeconds(10);

    private static readonly JsonSerializerOptions SerializerOptions = new(JsonSerializerDefaults.Web)
    {
        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull
    };

    private readonly HttpClient _httpClient;
    private readonly IOptionsMonitor<SchedulerWorkerOptions> _options;
    private readonly ILogger<HttpPolicySimulationWebhookClient> _logger;

    /// <summary>
    /// Creates the client.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public HttpPolicySimulationWebhookClient(
        HttpClient httpClient,
        IOptionsMonitor<SchedulerWorkerOptions> options,
        ILogger<HttpPolicySimulationWebhookClient> logger)
    {
        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Posts <paramref name="payload"/> to the configured webhook endpoint.
    /// </summary>
    /// <remarks>
    /// Returns without sending when the webhook is disabled, or when the endpoint is missing or is not
    /// an absolute URI. The options snapshot is re-read on every call.
    /// </remarks>
    /// <param name="payload">Payload to serialize and send.</param>
    /// <param name="cancellationToken">Caller cancellation; rethrown when it fires.</param>
    /// <exception cref="ArgumentNullException"><paramref name="payload"/> is null.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public async Task NotifyAsync(PolicySimulationWebhookPayload payload, CancellationToken cancellationToken)
    {
        ArgumentNullException.ThrowIfNull(payload);

        var snapshot = _options.CurrentValue.Policy.Webhook;
        if (!snapshot.Enabled)
        {
            _logger.LogDebug("Policy simulation webhook disabled; skip run {RunId}.", payload.Simulation.RunId);
            return;
        }

        if (string.IsNullOrWhiteSpace(snapshot.Endpoint))
        {
            _logger.LogWarning("Policy simulation webhook endpoint missing; run {RunId} not dispatched.", payload.Simulation.RunId);
            return;
        }

        if (!Uri.TryCreate(snapshot.Endpoint, UriKind.Absolute, out var endpoint))
        {
            _logger.LogError("Policy simulation webhook endpoint '{Endpoint}' invalid.", snapshot.Endpoint);
            return;
        }

        var timeout = snapshot.TimeoutSeconds <= 0 ? DefaultTimeout : TimeSpan.FromSeconds(snapshot.TimeoutSeconds);

        // Per-request timeout. HttpClient.Timeout must not be assigned here: HttpClient rejects
        // property changes once the instance has sent a request, so a reused client would throw
        // InvalidOperationException on the second notification. The client's own Timeout still
        // applies as an outer bound.
        using var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        timeoutSource.CancelAfter(timeout);

        using var request = new HttpRequestMessage(HttpMethod.Post, endpoint)
        {
            Content = new StringContent(JsonSerializer.Serialize(payload, SerializerOptions), Encoding.UTF8, MediaTypeNames.Application.Json)
        };

        request.Headers.TryAddWithoutValidation("X-StellaOps-Tenant", payload.TenantId);
        if (!string.IsNullOrWhiteSpace(payload.Simulation.RunId))
        {
            request.Headers.TryAddWithoutValidation("X-StellaOps-Run-Id", payload.Simulation.RunId);
        }

        if (!string.IsNullOrWhiteSpace(snapshot.ApiKey) && !string.IsNullOrWhiteSpace(snapshot.ApiKeyHeader))
        {
            request.Headers.TryAddWithoutValidation(snapshot.ApiKeyHeader!, snapshot.ApiKey);
        }

        try
        {
            using var response = await _httpClient.SendAsync(request, timeoutSource.Token).ConfigureAwait(false);
            if (!response.IsSuccessStatusCode)
            {
                var body = await response.Content.ReadAsStringAsync(timeoutSource.Token).ConfigureAwait(false);
                _logger.LogWarning(
                    "Policy simulation webhook responded {StatusCode} for run {RunId}: {Body}",
                    (int)response.StatusCode,
                    payload.Simulation.RunId,
                    body);
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller-initiated cancellation is propagated; a timeout is not, and falls through to
            // the general handler below.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Policy simulation webhook failed for run {RunId}.", payload.Simulation.RunId);
        }
    }
}
|
||||
@@ -26,8 +26,70 @@ public sealed class PolicyRunModelsTests
|
||||
Assert.Equal(JsonValueKind.True, inputs.Environment["sealed"].ValueKind);
|
||||
Assert.Equal("internet", inputs.Environment["exposure"].GetString());
|
||||
Assert.Equal("global", inputs.Environment["region"].GetString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
/// A completed job yields a payload whose result is "succeeded", with no reason and a latency value.
/// </summary>
[Fact]
public void PolicySimulationWebhookPayloadFactory_ComputesSucceeded()
{
    var now = DateTimeOffset.UtcNow;
    var job = CreateJob(PolicyRunJobStatus.Completed, now);
    var status = PolicyRunStatusFactory.Create(job, now);

    var payload = PolicySimulationWebhookPayloadFactory.Create(status, now);

    // The expected result is a string literal; the bare identifier did not compile.
    Assert.Equal("succeeded", payload.Result);
    Assert.Equal(status, payload.Simulation);
    Assert.Null(payload.Reason);
    Assert.NotNull(payload.LatencySeconds);
}
|
||||
|
||||
/// <summary>
/// A failed job yields a payload whose result is "failed" and whose reason is the job's last error.
/// </summary>
[Fact]
public void PolicySimulationWebhookPayloadFactory_ComputesFailureReason()
{
    var now = DateTimeOffset.UtcNow;

    // Override the last error so the assertion pins a value chosen by this test.
    var job = CreateJob(PolicyRunJobStatus.Failed, now) with { LastError = "timeout" };
    var status = PolicyRunStatusFactory.Create(job, now);

    var payload = PolicySimulationWebhookPayloadFactory.Create(status, now);

    Assert.Equal("failed", payload.Result);
    Assert.Equal("timeout", payload.Reason);
}
|
||||
|
||||
/// <summary>
/// Builds a simulate-mode <see cref="PolicyRunJob"/> in the requested status, using
/// <paramref name="timestamp"/> for every populated time field.
/// </summary>
/// <param name="status">
/// Job status. It also decides which status-specific fields are populated: <c>LastError</c> for
/// Failed, <c>CompletedAt</c> for Completed, and <c>CancellationRequested</c> / <c>CancelledAt</c>
/// for Cancelled.
/// </param>
/// <param name="timestamp">Value used for the queued/created/updated/available/submitted times.</param>
private static PolicyRunJob CreateJob(PolicyRunJobStatus status, DateTimeOffset timestamp)
{
    // Identifier-like fields are string literals; the bare identifiers did not compile.
    return new PolicyRunJob(
        SchemaVersion: SchedulerSchemaVersions.PolicyRunJob,
        Id: "job",
        TenantId: "tenant",
        PolicyId: "policy",
        PolicyVersion: 1,
        Mode: PolicyRunMode.Simulate,
        Priority: PolicyRunPriority.Normal,
        PriorityRank: 0,
        RunId: "run",
        RequestedBy: "tester",
        CorrelationId: "corr",
        Metadata: null,
        Inputs: PolicyRunInputs.Empty,
        QueuedAt: timestamp,
        Status: status,
        AttemptCount: 1,
        LastAttemptAt: timestamp,
        LastError: status == PolicyRunJobStatus.Failed ? "error" : null,
        CreatedAt: timestamp,
        UpdatedAt: timestamp,
        AvailableAt: timestamp,
        SubmittedAt: timestamp,
        CompletedAt: status == PolicyRunJobStatus.Completed ? timestamp : null,
        LeaseOwner: null,
        LeaseExpiresAt: null,
        CancellationRequested: status == PolicyRunJobStatus.Cancelled,
        CancellationRequestedAt: null,
        CancellationReason: null,
        CancelledAt: status == PolicyRunJobStatus.Cancelled ? timestamp : null);
}
|
||||
|
||||
[Fact]
|
||||
public void PolicyRunStatus_ThrowsOnNegativeAttempts()
|
||||
{
|
||||
|
||||
@@ -56,10 +56,11 @@ public sealed class GraphJobEventPublisherTests
|
||||
|
||||
await publisher.PublishAsync(notification, CancellationToken.None);
|
||||
|
||||
var message = Assert.Single(loggerProvider.Messages);
|
||||
Assert.Contains("\"kind\":\"scheduler.graph.job.completed\"", message);
|
||||
Assert.Contains("\"tenant\":\"tenant-alpha\"", message);
|
||||
Assert.Contains("\"resultUri\":\"oras://result\"", message);
|
||||
Assert.Contains(loggerProvider.Messages, message => message.Contains("unsupported driver", StringComparison.OrdinalIgnoreCase));
|
||||
var eventPayload = loggerProvider.Messages.FirstOrDefault(message => message.Contains("\"kind\":\"scheduler.graph.job.completed\"", StringComparison.Ordinal));
|
||||
Assert.NotNull(eventPayload);
|
||||
Assert.Contains("\"tenant\":\"tenant-alpha\"", eventPayload);
|
||||
Assert.Contains("\"resultUri\":\"oras://result\"", eventPayload);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
|
||||
@@ -0,0 +1,332 @@
|
||||
using System.Net;
|
||||
using System.Net.Http.Headers;
|
||||
using System.Text.Json;
|
||||
using Microsoft.AspNetCore.Mvc.Testing;
|
||||
using Mongo2Go;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Scheduler.WebService.PolicySimulations;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Tests;
|
||||
|
||||
/// <summary>
/// HTTP-level tests for the <c>/api/v1/scheduler/policies/simulations</c> endpoints, run against an
/// in-process host created by <see cref="WebApplicationFactory{TEntryPoint}"/>.
/// </summary>
/// <remarks>
/// Each test uses its own <c>X-Tenant-Id</c> value. Scopes are supplied through the
/// <c>X-Scopes</c> header.
/// </remarks>
public sealed class PolicySimulationEndpointTests : IClassFixture<WebApplicationFactory<Program>>
{
    private readonly WebApplicationFactory<Program> _factory;

    public PolicySimulationEndpointTests(WebApplicationFactory<Program> factory)
    {
        _factory = factory;
    }

    /// <summary>Creates a simulation, then finds it through both the list and get endpoints.</summary>
    [Fact]
    public async Task CreateListGetSimulation()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var createResponse = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-alpha",
            policyVersion = 3,
            metadata = new Dictionary<string, string> { ["requestedBy"] = "unit-test" },
            inputs = new
            {
                sbomSet = new[] { "sbom://alpha", "sbom://bravo" },
                captureExplain = true
            }
        });

        createResponse.EnsureSuccessStatusCode();
        Assert.Equal(HttpStatusCode.Created, createResponse.StatusCode);
        var created = await createResponse.Content.ReadFromJsonAsync<JsonElement>();
        var runId = created.GetProperty("simulation").GetProperty("runId").GetString();
        Assert.False(string.IsNullOrEmpty(runId));
        Assert.Equal("simulate", created.GetProperty("simulation").GetProperty("mode").GetString());

        var listResponse = await client.GetAsync("/api/v1/scheduler/policies/simulations?limit=5");
        listResponse.EnsureSuccessStatusCode();
        var list = await listResponse.Content.ReadFromJsonAsync<JsonElement>();
        Assert.True(list.GetProperty("simulations").EnumerateArray().Any());

        var getResponse = await client.GetAsync($"/api/v1/scheduler/policies/simulations/{runId}");
        getResponse.EnsureSuccessStatusCode();
        var simulation = await getResponse.Content.ReadFromJsonAsync<JsonElement>();
        Assert.Equal(runId, simulation.GetProperty("simulation").GetProperty("runId").GetString());
    }

    /// <summary>With the unmodified host the metrics endpoint is expected to answer 501.</summary>
    [Fact]
    public async Task MetricsEndpointWithoutProviderReturns501()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-metrics-missing");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var response = await client.GetAsync("/api/v1/scheduler/policies/simulations/metrics");
        Assert.Equal(HttpStatusCode.NotImplemented, response.StatusCode);
    }

    /// <summary>With a stub provider registered, the metrics endpoint serializes its response.</summary>
    [Fact]
    public async Task MetricsEndpointReturnsSummary()
    {
        var stub = new StubPolicySimulationMetricsProvider
        {
            Response = new PolicySimulationMetricsResponse(
                new PolicySimulationQueueDepth(
                    3,
                    new Dictionary<string, long>
                    {
                        ["pending"] = 2,
                        ["dispatching"] = 1
                    }),
                new PolicySimulationLatencyMetrics(
                    Samples: 2,
                    P50: 1.5,
                    P90: 2.5,
                    P95: 3.5,
                    P99: 4.0,
                    Mean: 2.0))
        };

        // The same stub instance backs both the provider and the recorder registration.
        await using var factory = _factory.WithWebHostBuilder(builder =>
        {
            builder.ConfigureServices(services =>
            {
                services.AddSingleton<IPolicySimulationMetricsProvider>(stub);
                services.AddSingleton<IPolicySimulationMetricsRecorder>(stub);
            });
        });

        using var client = factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-metrics");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var response = await client.GetAsync("/api/v1/scheduler/policies/simulations/metrics");
        response.EnsureSuccessStatusCode();

        var payload = await response.Content.ReadFromJsonAsync<JsonElement>();
        Assert.Equal(3, payload.GetProperty("policy_simulation_queue_depth").GetProperty("total").GetInt32());
        Assert.Equal(2, payload.GetProperty("policy_simulation_latency").GetProperty("samples").GetInt32());
        Assert.Equal(2.0, payload.GetProperty("policy_simulation_latency").GetProperty("mean_seconds").GetDouble());
    }

    /// <summary>Omitting the <c>X-Scopes</c> header is rejected with 401.</summary>
    [Fact]
    public async Task CreateSimulationRequiresScopeHeader()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-auth");

        var response = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-auth",
            policyVersion = 1
        });

        Assert.Equal(HttpStatusCode.Unauthorized, response.StatusCode);
    }

    /// <summary>A scope other than <c>policy:simulate</c> is rejected with 403.</summary>
    [Fact]
    public async Task CreateSimulationRequiresPolicySimulateScope()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-authz");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:run");

        var response = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-authz",
            policyVersion = 2
        });

        Assert.Equal(HttpStatusCode.Forbidden, response.StatusCode);
    }

    /// <summary>Cancelling a simulation reports <c>cancellationRequested</c> as true.</summary>
    [Fact]
    public async Task CancelSimulationMarksStatus()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-cancel");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var create = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-bravo",
            policyVersion = 2
        });
        create.EnsureSuccessStatusCode();
        var runId = (await create.Content.ReadFromJsonAsync<JsonElement>()).GetProperty("simulation").GetProperty("runId").GetString();

        var cancel = await client.PostAsJsonAsync($"/api/v1/scheduler/policies/simulations/{runId}/cancel", new
        {
            reason = "user-request"
        });

        cancel.EnsureSuccessStatusCode();
        var cancelled = await cancel.Content.ReadFromJsonAsync<JsonElement>();
        Assert.True(cancelled.GetProperty("simulation").GetProperty("cancellationRequested").GetBoolean());
    }

    /// <summary>Retrying a cancelled simulation creates a new run whose metadata points at the original.</summary>
    [Fact]
    public async Task RetrySimulationCreatesNewRun()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-retry");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var create = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-charlie",
            policyVersion = 5
        });
        create.EnsureSuccessStatusCode();
        var runId = (await create.Content.ReadFromJsonAsync<JsonElement>()).GetProperty("simulation").GetProperty("runId").GetString();

        // Mark as cancelled to allow retry. Check the response so a failed precondition is
        // reported here rather than as a confusing retry failure.
        var cancel = await client.PostAsJsonAsync($"/api/v1/scheduler/policies/simulations/{runId}/cancel", new { reason = "cleanup" });
        cancel.EnsureSuccessStatusCode();

        var retry = await client.PostAsync($"/api/v1/scheduler/policies/simulations/{runId}/retry", content: null);
        retry.EnsureSuccessStatusCode();
        Assert.Equal(HttpStatusCode.Created, retry.StatusCode);
        var retried = await retry.Content.ReadFromJsonAsync<JsonElement>();
        var newRunId = retried.GetProperty("simulation").GetProperty("runId").GetString();
        Assert.False(string.IsNullOrEmpty(newRunId));
        Assert.NotEqual(runId, newRunId);
        var metadata = retried.GetProperty("simulation").GetProperty("metadata");
        Assert.True(metadata.TryGetProperty("retry-of", out var retryOf));
        Assert.Equal(runId, retryOf.GetString());
    }

    /// <summary>
    /// The server-sent-event stream emits a <c>retry:</c> directive plus <c>initial</c>,
    /// <c>queueLag</c> and <c>heartbeat</c> events within the two-second observation window.
    /// </summary>
    [Fact]
    public async Task StreamSimulationEmitsCoreEvents()
    {
        using var client = _factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-stream");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var create = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-delta",
            policyVersion = 7
        });
        create.EnsureSuccessStatusCode();
        var runId = (await create.Content.ReadFromJsonAsync<JsonElement>()).GetProperty("simulation").GetProperty("runId").GetString();

        using var request = new HttpRequestMessage(HttpMethod.Get, $"/api/v1/scheduler/policies/simulations/{runId}/stream");
        request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/event-stream"));

        // ResponseHeadersRead returns as soon as headers arrive, so the body can be read as it streams.
        using var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
        response.EnsureSuccessStatusCode();

        await using var stream = await response.Content.ReadAsStreamAsync();
        using var reader = new StreamReader(stream);

        var seenRetry = false;
        var seenInitial = false;
        var seenQueueLag = false;
        var seenHeartbeat = false;

        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(2));

        // A StreamReader permits only one outstanding read. The pending read is kept across
        // iterations so a 200 ms poll timeout never starts a second, overlapping ReadLineAsync.
        Task<string?>? pendingRead = null;
        while (!cts.Token.IsCancellationRequested && !(seenRetry && seenInitial && seenQueueLag && seenHeartbeat))
        {
            pendingRead ??= reader.ReadLineAsync();
            var completed = await Task.WhenAny(pendingRead, Task.Delay(200, cts.Token));
            if (completed != pendingRead)
            {
                continue;
            }

            var line = await pendingRead;
            pendingRead = null;
            if (line is null)
            {
                // End of stream.
                break;
            }

            if (line.Length == 0)
            {
                continue;
            }

            if (line.StartsWith("retry:", StringComparison.Ordinal))
            {
                seenRetry = true;
            }
            else if (line.StartsWith("event: initial", StringComparison.Ordinal))
            {
                seenInitial = true;
            }
            else if (line.StartsWith("event: queueLag", StringComparison.Ordinal))
            {
                seenQueueLag = true;
            }
            else if (line.StartsWith("event: heartbeat", StringComparison.Ordinal))
            {
                seenHeartbeat = true;
            }
        }

        Assert.True(seenRetry, "Retry directive should be emitted before events.");
        Assert.True(seenInitial, "Initial event was not observed.");
        Assert.True(seenQueueLag, "Queue lag event was not observed.");
        Assert.True(seenHeartbeat, "Heartbeat event was not observed.");
    }

    /// <summary>
    /// Points the host at a throwaway Mongo2Go instance and checks that a created simulation can be
    /// fetched back.
    /// </summary>
    [Fact]
    public async Task MongoBackedCreateSimulationPersists()
    {
        using var runner = MongoDbRunner.Start(additionalMongodArguments: "--quiet");
        await using var factory = _factory.WithWebHostBuilder(builder =>
        {
            builder.ConfigureAppConfiguration((_, configuration) =>
            {
                configuration.AddInMemoryCollection(new[]
                {
                    new KeyValuePair<string, string?>("Scheduler:Storage:ConnectionString", runner.ConnectionString),
                    new KeyValuePair<string, string?>("Scheduler:Storage:Database", $"scheduler_web_tests_{Guid.NewGuid():N}")
                });
            });
        });

        using var client = factory.CreateClient();
        client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-sim-mongo");
        client.DefaultRequestHeaders.Add("X-Scopes", "policy:simulate");

        var createResponse = await client.PostAsJsonAsync("/api/v1/scheduler/policies/simulations", new
        {
            policyId = "policy-mongo",
            policyVersion = 11
        });
        createResponse.EnsureSuccessStatusCode();
        var runId = (await createResponse.Content.ReadFromJsonAsync<JsonElement>()).GetProperty("simulation").GetProperty("runId").GetString();
        Assert.False(string.IsNullOrEmpty(runId));

        var fetched = await client.GetAsync($"/api/v1/scheduler/policies/simulations/{runId}");
        fetched.EnsureSuccessStatusCode();
        var payload = await fetched.Content.ReadFromJsonAsync<JsonElement>();
        Assert.Equal(runId, payload.GetProperty("simulation").GetProperty("runId").GetString());
    }

    /// <summary>
    /// Test double that returns a canned <see cref="PolicySimulationMetricsResponse"/> and records the
    /// latencies passed to <see cref="RecordLatency"/>.
    /// </summary>
    private sealed class StubPolicySimulationMetricsProvider : IPolicySimulationMetricsProvider, IPolicySimulationMetricsRecorder
    {
        public PolicySimulationMetricsResponse Response { get; set; } = new(
            new PolicySimulationQueueDepth(0, new Dictionary<string, long>()),
            new PolicySimulationLatencyMetrics(0, null, null, null, null, null));

        public List<double> RecordedLatencies { get; } = new();

        public Task<PolicySimulationMetricsResponse> CaptureAsync(string tenantId, CancellationToken cancellationToken)
            => Task.FromResult(Response);

        public void RecordLatency(PolicyRunStatus status, DateTimeOffset observedAt)
        {
            // Use the observation time when the run has no finish time; negative latencies are dropped.
            var finishedAt = status.FinishedAt ?? observedAt;
            var latency = (finishedAt - status.QueuedAt).TotalSeconds;
            if (latency >= 0)
            {
                RecordedLatencies.Add(latency);
            }
        }
    }
}
|
||||
@@ -1,5 +1,16 @@
|
||||
using System.Linq;
|
||||
using System.Text.Json;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Collections.Immutable;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Queue;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Tests;
|
||||
|
||||
@@ -17,7 +28,7 @@ public sealed class RunEndpointTests : IClassFixture<WebApplicationFactory<Progr
|
||||
{
|
||||
using var client = _factory.CreateClient();
|
||||
client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-runs");
|
||||
client.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview");
|
||||
client.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview scheduler.runs.manage");
|
||||
|
||||
var scheduleResponse = await client.PostAsJsonAsync("/api/v1/scheduler/schedules", new
|
||||
{
|
||||
@@ -66,11 +77,11 @@ public sealed class RunEndpointTests : IClassFixture<WebApplicationFactory<Progr
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PreviewImpactForSchedule()
|
||||
{
|
||||
using var client = _factory.CreateClient();
|
||||
client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-preview");
|
||||
client.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview");
|
||||
public async Task PreviewImpactForSchedule()
|
||||
{
|
||||
using var client = _factory.CreateClient();
|
||||
client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-preview");
|
||||
client.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview scheduler.runs.manage");
|
||||
|
||||
var scheduleResponse = await client.PostAsJsonAsync("/api/v1/scheduler/schedules", new
|
||||
{
|
||||
@@ -96,9 +107,224 @@ public sealed class RunEndpointTests : IClassFixture<WebApplicationFactory<Progr
|
||||
sampleSize = 3
|
||||
});
|
||||
|
||||
previewResponse.EnsureSuccessStatusCode();
|
||||
var preview = await previewResponse.Content.ReadFromJsonAsync<JsonElement>();
|
||||
Assert.True(preview.GetProperty("total").GetInt32() >= 0);
|
||||
Assert.True(preview.GetProperty("sample").GetArrayLength() <= 3);
|
||||
}
|
||||
}
|
||||
previewResponse.EnsureSuccessStatusCode();
|
||||
var preview = await previewResponse.Content.ReadFromJsonAsync<JsonElement>();
|
||||
Assert.True(preview.GetProperty("total").GetInt32() >= 0);
|
||||
Assert.True(preview.GetProperty("sample").GetArrayLength() <= 3);
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a manual run, cancels it, retries it, and checks that the retry is a new run in the
/// planning state that references the original.
/// </summary>
[Fact]
public async Task RetryRunCreatesNewRun()
{
    using var http = _factory.CreateClient();
    http.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-retry");
    http.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview scheduler.runs.manage");

    var scheduleId = await CreateScheduleAsync(http, "RetrySchedule");

    // Arrange: a manual run for the new schedule.
    var runRequest = new
    {
        scheduleId,
        trigger = "manual"
    };
    var createdResponse = await http.PostAsJsonAsync("/api/v1/scheduler/runs", runRequest);
    createdResponse.EnsureSuccessStatusCode();

    var createdBody = await createdResponse.Content.ReadFromJsonAsync<JsonElement>();
    var runId = createdBody.GetProperty("run").GetProperty("id").GetString();
    Assert.False(string.IsNullOrEmpty(runId));

    // The run is cancelled before the retry is requested.
    var cancelled = await http.PostAsync($"/api/v1/scheduler/runs/{runId}/cancel", null);
    cancelled.EnsureSuccessStatusCode();

    // Act: retry the cancelled run.
    var retried = await http.PostAsync($"/api/v1/scheduler/runs/{runId}/retry", content: null);
    retried.EnsureSuccessStatusCode();
    Assert.Equal(System.Net.HttpStatusCode.Created, retried.StatusCode);

    // Assert: the new run links back to the original.
    var retriedBody = await retried.Content.ReadFromJsonAsync<JsonElement>();
    var newRun = retriedBody.GetProperty("run");
    Assert.Equal("planning", newRun.GetProperty("state").GetString());
    Assert.Equal(runId, newRun.GetProperty("retryOf").GetString());
    Assert.Equal("manual", newRun.GetProperty("trigger").GetString());

    var manualReason = newRun.GetProperty("reason").GetProperty("manualReason").GetString();
    Assert.Contains("retry-of:", manualReason);
}
|
||||
|
||||
/// <summary>
/// Attaches one delta summary to a stored run directly through <see cref="IRunRepository"/> and
/// checks that the deltas endpoint returns it.
/// </summary>
[Fact]
public async Task GetRunDeltasReturnsMetadata()
{
    using var http = _factory.CreateClient();
    http.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-deltas");
    http.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview scheduler.runs.manage");

    var scheduleId = await CreateScheduleAsync(http, "DeltaSchedule");

    var runRequest = new
    {
        scheduleId,
        trigger = "manual"
    };
    var createdResponse = await http.PostAsJsonAsync("/api/v1/scheduler/runs", runRequest);
    createdResponse.EnsureSuccessStatusCode();

    var createdBody = await createdResponse.Content.ReadFromJsonAsync<JsonElement>();
    var runId = createdBody.GetProperty("run").GetProperty("id").GetString()!;

    // The scope is disposed before the HTTP read below, as in a using block.
    using (var serviceScope = _factory.Services.CreateScope())
    {
        var runs = serviceScope.ServiceProvider.GetRequiredService<IRunRepository>();
        var stored = await runs.GetAsync("tenant-deltas", runId);
        Assert.NotNull(stored);

        var summary = new DeltaSummary(
            "sha256:" + new string('a', 64),
            newFindings: 2,
            newCriticals: 1,
            newHigh: 1,
            newMedium: 0,
            newLow: 0);
        var deltas = ImmutableArray.Create(summary);

        // Rebuild the run with every field copied from the stored one except the deltas.
        var withDeltas = new Run(
            stored!.Id,
            stored.TenantId,
            stored.Trigger,
            stored.State,
            stored.Stats,
            stored.CreatedAt,
            stored.Reason,
            stored.ScheduleId,
            stored.StartedAt,
            stored.FinishedAt,
            stored.Error,
            deltas,
            stored.RetryOf,
            stored.SchemaVersion);

        await runs.UpdateAsync(withDeltas);
    }

    var deltasResponse = await http.GetAsync($"/api/v1/scheduler/runs/{runId}/deltas");
    deltasResponse.EnsureSuccessStatusCode();

    var deltasBody = await deltasResponse.Content.ReadFromJsonAsync<JsonElement>();
    var returnedCount = deltasBody.GetProperty("deltas").GetArrayLength();
    Assert.Equal(1, returnedCount);
}
|
||||
|
||||
/// <summary>
/// Records a queue depth of 7 for the "redis"/"scheduler-runner" pair and verifies that the
/// queue lag endpoint reports a total depth of at least 7 and a non-empty queue list.
/// The recorded depth is removed again in the <c>finally</c> block.
/// </summary>
[Fact]
public async Task QueueLagSummaryReturnsDepth()
{
    const string transport = "redis";
    const string queue = "scheduler-runner";

    SchedulerQueueMetrics.RecordDepth(transport, queue, 7);

    try
    {
        using var http = _factory.CreateClient();
        http.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-queue");
        http.DefaultRequestHeaders.Add("X-Scopes", "scheduler.runs.read scheduler.runs.manage");

        var lagResponse = await http.GetAsync("/api/v1/scheduler/runs/queue/lag");
        lagResponse.EnsureSuccessStatusCode();

        var lagSummary = await lagResponse.Content.ReadFromJsonAsync<JsonElement>();

        var totalDepth = lagSummary.GetProperty("totalDepth").GetInt64();
        Assert.True(totalDepth >= 7);

        var hasQueues = lagSummary.GetProperty("queues").EnumerateArray().Any();
        Assert.True(hasQueues);
    }
    finally
    {
        SchedulerQueueMetrics.RemoveDepth(transport, queue);
    }
}
|
||||
|
||||
/// <summary>
/// Creates a manual run, opens its server-sent-events stream, and verifies that the
/// <c>retry:</c> directive plus the <c>initial</c>, <c>queueLag</c> and <c>heartbeat</c>
/// events are all observed before a two-second deadline expires.
/// </summary>
[Fact]
public async Task StreamRunEmitsInitialEvent()
{
    using var client = _factory.CreateClient();
    client.DefaultRequestHeaders.Add("X-Tenant-Id", "tenant-stream");
    client.DefaultRequestHeaders.Add("X-Scopes", "scheduler.schedules.write scheduler.schedules.read scheduler.runs.write scheduler.runs.read scheduler.runs.preview scheduler.runs.manage");

    var scheduleId = await CreateScheduleAsync(client, "StreamSchedule");

    var runResponse = await client.PostAsJsonAsync("/api/v1/scheduler/runs", new
    {
        scheduleId,
        trigger = "manual"
    });

    runResponse.EnsureSuccessStatusCode();
    var runJson = await runResponse.Content.ReadFromJsonAsync<JsonElement>();
    var runId = runJson.GetProperty("run").GetProperty("id").GetString();
    Assert.False(string.IsNullOrEmpty(runId));

    using var request = new HttpRequestMessage(HttpMethod.Get, $"/api/v1/scheduler/runs/{runId}/stream");
    request.Headers.Accept.ParseAdd("text/event-stream");

    // ResponseHeadersRead returns as soon as headers arrive so the body can be consumed incrementally.
    using var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
    response.EnsureSuccessStatusCode();

    await using var stream = await response.Content.ReadAsStreamAsync();
    using var reader = new StreamReader(stream);

    using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(2));
    var seenRetry = false;
    var seenInitial = false;
    var seenQueueLag = false;
    var seenHeartbeat = false;

    // A StreamReader allows only one outstanding read. The in-flight read is therefore kept
    // across poll timeouts; starting a second ReadLineAsync while the first is still pending
    // would throw InvalidOperationException.
    Task<string?>? pendingRead = null;

    while (!cts.IsCancellationRequested && !(seenRetry && seenInitial && seenQueueLag && seenHeartbeat))
    {
        pendingRead ??= reader.ReadLineAsync();

        var completed = await Task.WhenAny(pendingRead, Task.Delay(200, cts.Token));
        if (completed != pendingRead)
        {
            // Poll interval elapsed (or the deadline fired); re-check the loop condition and
            // keep waiting on the same read.
            continue;
        }

        var line = await pendingRead;
        pendingRead = null;

        if (line is null)
        {
            // End of stream.
            break;
        }

        if (line.Length == 0)
        {
            // Blank lines separate SSE events.
            continue;
        }

        if (line.StartsWith("retry:", StringComparison.Ordinal))
        {
            seenRetry = true;
        }
        else if (line.StartsWith("event: initial", StringComparison.Ordinal))
        {
            seenInitial = true;
        }
        else if (line.StartsWith("event: queueLag", StringComparison.Ordinal))
        {
            seenQueueLag = true;
        }
        else if (line.StartsWith("event: heartbeat", StringComparison.Ordinal))
        {
            seenHeartbeat = true;
        }
    }

    Assert.True(seenRetry, "Retry directive was not observed.");
    Assert.True(seenInitial, "Initial snapshot was not observed.");
    Assert.True(seenQueueLag, "Queue lag event was not observed.");
    Assert.True(seenHeartbeat, "Heartbeat event was not observed.");
}
|
||||
|
||||
/// <summary>
/// Posts a new schedule with the given name to the schedules endpoint and returns the id
/// found under <c>schedule.id</c> in the response body.
/// </summary>
/// <param name="client">HTTP client used for the request; the caller sets tenant and scope headers.</param>
/// <param name="name">Name of the schedule to create.</param>
/// <returns>The non-empty schedule id reported by the service.</returns>
private static async Task<string> CreateScheduleAsync(HttpClient client, string name)
{
    // Property order and names define the JSON body that is sent.
    var body = new
    {
        name,
        cronExpression = "0 1 * * *",
        timezone = "UTC",
        mode = "analysis-only",
        selection = new { scope = "all-images" }
    };

    var response = await client.PostAsJsonAsync("/api/v1/scheduler/schedules", body);
    response.EnsureSuccessStatusCode();

    var json = await response.Content.ReadFromJsonAsync<JsonElement>();
    var id = json.GetProperty("schedule").GetProperty("id").GetString();

    Assert.False(string.IsNullOrEmpty(id));
    return id!;
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Microsoft.AspNetCore.Hosting;
|
||||
using Microsoft.AspNetCore.Mvc.Testing;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Scheduler.WebService.Options;
|
||||
using StellaOps.Scheduler.WebService.Runs;
|
||||
|
||||
namespace StellaOps.Scheduler.WebService.Tests;
|
||||
|
||||
@@ -41,6 +43,13 @@ public sealed class SchedulerWebApplicationFactory : WebApplicationFactory<Progr
|
||||
options.Webhooks.Excitor.HmacSecret = "excitor-secret";
|
||||
options.Webhooks.Excitor.Enabled = true;
|
||||
});
|
||||
|
||||
services.PostConfigure<RunStreamOptions>(options =>
|
||||
{
|
||||
options.PollInterval = TimeSpan.FromMilliseconds(100);
|
||||
options.QueueLagInterval = TimeSpan.FromMilliseconds(200);
|
||||
options.HeartbeatInterval = TimeSpan.FromMilliseconds(150);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
<UseConcelierTestInfra>false</UseConcelierTestInfra>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Mongo2Go" Version="4.1.0" />
|
||||
<PackageReference Include="Microsoft.AspNetCore.Mvc.Testing" Version="10.0.0-rc.2.25502.107" />
|
||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.0" />
|
||||
<PackageReference Include="xunit" Version="2.9.2" />
|
||||
@@ -17,4 +18,4 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../../StellaOps.Scheduler.WebService/StellaOps.Scheduler.WebService.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
||||
@@ -4,12 +4,13 @@ using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Tests;
|
||||
|
||||
@@ -6,10 +6,11 @@ using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Graph;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Tests;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
using System;
|
||||
using System.Collections.Immutable;
|
||||
using System.Collections.Generic;
|
||||
using System.Collections.Immutable;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
@@ -46,11 +47,12 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
var options = Microsoft.Extensions.Options.Options.Create(CloneOptions());
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var webhook = new RecordingPolicySimulationWebhookClient();
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, webhook, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
|
||||
var job = CreateJob(status: PolicyRunJobStatus.Dispatching) with
|
||||
{
|
||||
@@ -63,26 +65,29 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
|
||||
Assert.Equal(PolicyRunExecutionResultType.Cancelled, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Cancelled, result.UpdatedJob.Status);
|
||||
Assert.True(repository.ReplaceCalled);
|
||||
Assert.Equal("test-dispatch", repository.ExpectedLeaseOwner);
|
||||
Assert.True(repository.ReplaceCalled);
|
||||
Assert.Equal("test-dispatch", repository.ExpectedLeaseOwner);
|
||||
Assert.Single(webhook.Payloads);
|
||||
Assert.Equal("cancelled", webhook.Payloads[0].Result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ExecuteAsync_SubmitsJob_OnSuccess()
|
||||
{
|
||||
var repository = new RecordingPolicyRunJobRepository();
|
||||
var client = new StubPolicyRunClient
|
||||
{
|
||||
Result = PolicyRunSubmissionResult.Succeeded("run:P-7:2025", DateTimeOffset.Parse("2025-10-28T10:01:00Z"))
|
||||
};
|
||||
var options = Microsoft.Extensions.Options.Options.Create(CloneOptions());
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
var client = new StubPolicyRunClient
|
||||
{
|
||||
Result = PolicyRunSubmissionResult.Succeeded("run:P-7:2025", DateTimeOffset.Parse("2025-10-28T10:01:00Z"))
|
||||
};
|
||||
var options = Microsoft.Extensions.Options.Options.Create(CloneOptions());
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var webhook = new RecordingPolicySimulationWebhookClient();
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, webhook, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
|
||||
var job = CreateJob(status: PolicyRunJobStatus.Dispatching) with
|
||||
{
|
||||
@@ -93,11 +98,12 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
var result = await service.ExecuteAsync(job, CancellationToken.None);
|
||||
|
||||
Assert.Equal(PolicyRunExecutionResultType.Submitted, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Submitted, result.UpdatedJob.Status);
|
||||
Assert.Equal("run:P-7:2025", result.UpdatedJob.RunId);
|
||||
Assert.Equal(job.AttemptCount + 1, result.UpdatedJob.AttemptCount);
|
||||
Assert.Null(result.UpdatedJob.LastError);
|
||||
Assert.True(repository.ReplaceCalled);
|
||||
Assert.Equal(PolicyRunJobStatus.Submitted, result.UpdatedJob.Status);
|
||||
Assert.Equal("run:P-7:2025", result.UpdatedJob.RunId);
|
||||
Assert.Equal(job.AttemptCount + 1, result.UpdatedJob.AttemptCount);
|
||||
Assert.Null(result.UpdatedJob.LastError);
|
||||
Assert.True(repository.ReplaceCalled);
|
||||
Assert.Empty(webhook.Payloads);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -109,13 +115,14 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
Result = PolicyRunSubmissionResult.Failed("timeout")
|
||||
};
|
||||
var options = Microsoft.Extensions.Options.Options.Create(CloneOptions());
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var webhook = new RecordingPolicySimulationWebhookClient();
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, webhook, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
|
||||
var job = CreateJob(status: PolicyRunJobStatus.Dispatching) with
|
||||
{
|
||||
@@ -127,9 +134,10 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
|
||||
Assert.Equal(PolicyRunExecutionResultType.Retrying, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Pending, result.UpdatedJob.Status);
|
||||
Assert.Equal(job.AttemptCount + 1, result.UpdatedJob.AttemptCount);
|
||||
Assert.Equal("timeout", result.UpdatedJob.LastError);
|
||||
Assert.True(result.UpdatedJob.AvailableAt > job.AvailableAt);
|
||||
Assert.Equal(job.AttemptCount + 1, result.UpdatedJob.AttemptCount);
|
||||
Assert.Equal("timeout", result.UpdatedJob.LastError);
|
||||
Assert.True(result.UpdatedJob.AvailableAt > job.AvailableAt);
|
||||
Assert.Empty(webhook.Payloads);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -144,12 +152,13 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
optionsValue.Policy.Dispatch.MaxAttempts = 1;
|
||||
var options = Microsoft.Extensions.Options.Options.Create(optionsValue);
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.Unchanged(job)
|
||||
};
|
||||
var webhook = new RecordingPolicySimulationWebhookClient();
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, webhook, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
|
||||
var job = CreateJob(status: PolicyRunJobStatus.Dispatching, attemptCount: 0) with
|
||||
{
|
||||
@@ -157,11 +166,13 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
LeaseExpiresAt = timeProvider.GetUtcNow().AddMinutes(1)
|
||||
};
|
||||
|
||||
var result = await service.ExecuteAsync(job, CancellationToken.None);
|
||||
|
||||
Assert.Equal(PolicyRunExecutionResultType.Failed, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Failed, result.UpdatedJob.Status);
|
||||
Assert.Equal("bad request", result.UpdatedJob.LastError);
|
||||
var result = await service.ExecuteAsync(job, CancellationToken.None);
|
||||
|
||||
Assert.Equal(PolicyRunExecutionResultType.Failed, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Failed, result.UpdatedJob.Status);
|
||||
Assert.Equal("bad request", result.UpdatedJob.LastError);
|
||||
Assert.Single(webhook.Payloads);
|
||||
Assert.Equal("failed", webhook.Payloads[0].Result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
@@ -172,11 +183,12 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
var options = Microsoft.Extensions.Options.Options.Create(CloneOptions());
|
||||
var timeProvider = new TestTimeProvider(DateTimeOffset.Parse("2025-10-28T10:00:00Z"));
|
||||
using var metrics = new SchedulerWorkerMetrics();
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.NoWork(job, "empty")
|
||||
};
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
var targeting = new StubPolicyRunTargetingService
|
||||
{
|
||||
OnEnsureTargets = job => PolicyRunTargetingResult.NoWork(job, "empty")
|
||||
};
|
||||
var webhook = new RecordingPolicySimulationWebhookClient();
|
||||
var service = new PolicyRunExecutionService(repository, client, options, timeProvider, metrics, targeting, webhook, NullLogger<PolicyRunExecutionService>.Instance);
|
||||
|
||||
var job = CreateJob(status: PolicyRunJobStatus.Dispatching, inputs: PolicyRunInputs.Empty) with
|
||||
{
|
||||
@@ -186,10 +198,12 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
|
||||
var result = await service.ExecuteAsync(job, CancellationToken.None);
|
||||
|
||||
Assert.Equal(PolicyRunExecutionResultType.NoOp, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Completed, result.UpdatedJob.Status);
|
||||
Assert.True(repository.ReplaceCalled);
|
||||
Assert.Equal("test-dispatch", repository.ExpectedLeaseOwner);
|
||||
Assert.Equal(PolicyRunExecutionResultType.NoOp, result.Type);
|
||||
Assert.Equal(PolicyRunJobStatus.Completed, result.UpdatedJob.Status);
|
||||
Assert.True(repository.ReplaceCalled);
|
||||
Assert.Equal("test-dispatch", repository.ExpectedLeaseOwner);
|
||||
Assert.Single(webhook.Payloads);
|
||||
Assert.Equal("succeeded", webhook.Payloads[0].Result);
|
||||
}
|
||||
|
||||
private static PolicyRunJob CreateJob(PolicyRunJobStatus status, int attemptCount = 0, PolicyRunInputs? inputs = null)
|
||||
@@ -253,15 +267,23 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
IdempotencyHeader = WorkerOptions.Policy.Api.IdempotencyHeader,
|
||||
RequestTimeout = WorkerOptions.Policy.Api.RequestTimeout
|
||||
},
|
||||
Targeting = new SchedulerWorkerOptions.PolicyOptions.TargetingOptions
|
||||
{
|
||||
Enabled = WorkerOptions.Policy.Targeting.Enabled,
|
||||
MaxSboms = WorkerOptions.Policy.Targeting.MaxSboms,
|
||||
DefaultUsageOnly = WorkerOptions.Policy.Targeting.DefaultUsageOnly
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
Targeting = new SchedulerWorkerOptions.PolicyOptions.TargetingOptions
|
||||
{
|
||||
Enabled = WorkerOptions.Policy.Targeting.Enabled,
|
||||
MaxSboms = WorkerOptions.Policy.Targeting.MaxSboms,
|
||||
DefaultUsageOnly = WorkerOptions.Policy.Targeting.DefaultUsageOnly
|
||||
},
|
||||
Webhook = new SchedulerWorkerOptions.PolicyOptions.WebhookOptions
|
||||
{
|
||||
Enabled = WorkerOptions.Policy.Webhook.Enabled,
|
||||
Endpoint = WorkerOptions.Policy.Webhook.Endpoint,
|
||||
ApiKeyHeader = WorkerOptions.Policy.Webhook.ApiKeyHeader,
|
||||
ApiKey = WorkerOptions.Policy.Webhook.ApiKey,
|
||||
TimeoutSeconds = WorkerOptions.Policy.Webhook.TimeoutSeconds
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private sealed class StubPolicyRunTargetingService : IPolicyRunTargetingService
|
||||
{
|
||||
@@ -271,8 +293,19 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
=> Task.FromResult(OnEnsureTargets?.Invoke(job) ?? PolicyRunTargetingResult.Unchanged(job));
|
||||
}
|
||||
|
||||
private sealed class RecordingPolicyRunJobRepository : IPolicyRunJobRepository
|
||||
{
|
||||
/// <summary>
/// Test double for <see cref="IPolicySimulationWebhookClient"/> that stores every payload it
/// is asked to send so tests can assert on them afterwards.
/// </summary>
private sealed class RecordingPolicySimulationWebhookClient : IPolicySimulationWebhookClient
{
    private readonly List<PolicySimulationWebhookPayload> _recorded = new List<PolicySimulationWebhookPayload>();

    /// <summary>Payloads passed to <see cref="NotifyAsync"/>, in call order.</summary>
    public List<PolicySimulationWebhookPayload> Payloads => _recorded;

    /// <summary>Records the payload and completes synchronously; the token is not observed.</summary>
    public Task NotifyAsync(PolicySimulationWebhookPayload payload, CancellationToken cancellationToken)
    {
        _recorded.Add(payload);
        return Task.CompletedTask;
    }
}
|
||||
|
||||
private sealed class RecordingPolicyRunJobRepository : IPolicyRunJobRepository
|
||||
{
|
||||
public bool ReplaceCalled { get; private set; }
|
||||
public string? ExpectedLeaseOwner { get; private set; }
|
||||
public PolicyRunJob? LastJob { get; private set; }
|
||||
@@ -280,17 +313,20 @@ public sealed class PolicyRunExecutionServiceTests
|
||||
public Task<PolicyRunJob?> GetAsync(string tenantId, string jobId, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
=> Task.FromResult<PolicyRunJob?>(null);
|
||||
|
||||
public Task<PolicyRunJob?> GetByRunIdAsync(string tenantId, string runId, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
=> Task.FromResult<PolicyRunJob?>(null);
|
||||
|
||||
public Task InsertAsync(PolicyRunJob job, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
{
|
||||
LastJob = job;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task<PolicyRunJob?> LeaseAsync(string leaseOwner, DateTimeOffset now, TimeSpan leaseDuration, int maxAttempts, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
=> Task.FromResult<PolicyRunJob?>(null);
|
||||
public Task<PolicyRunJob?> GetByRunIdAsync(string tenantId, string runId, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
=> Task.FromResult<PolicyRunJob?>(null);
|
||||
|
||||
public Task InsertAsync(PolicyRunJob job, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
{
|
||||
LastJob = job;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task<long> CountAsync(string tenantId, PolicyRunMode mode, IReadOnlyCollection<PolicyRunJobStatus> statuses, CancellationToken cancellationToken = default)
|
||||
=> Task.FromResult(0L);
|
||||
|
||||
public Task<PolicyRunJob?> LeaseAsync(string leaseOwner, DateTimeOffset now, TimeSpan leaseDuration, int maxAttempts, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
=> Task.FromResult<PolicyRunJob?>(null);
|
||||
|
||||
public Task<bool> ReplaceAsync(PolicyRunJob job, string? expectedLeaseOwner = null, IClientSessionHandle? session = null, CancellationToken cancellationToken = default)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
using System;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Net.Http;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Policy;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Tests;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="HttpPolicySimulationWebhookClient"/> that use an in-memory
/// <see cref="HttpMessageHandler"/>, so no network traffic is produced.
/// </summary>
public sealed class PolicySimulationWebhookClientTests
{
    /// <summary>
    /// With options left at their defaults, notifying must not reach the HTTP handler.
    /// </summary>
    [Fact]
    public async Task NotifyAsync_Disabled_DoesNotInvokeEndpoint()
    {
        var recorder = new RecordingHandler();
        using var http = new HttpClient(recorder);
        var monitor = CreateOptions();
        var sut = new HttpPolicySimulationWebhookClient(
            http,
            monitor,
            NullLogger<HttpPolicySimulationWebhookClient>.Instance);

        var notification = PolicySimulationWebhookPayloadFactory.Create(CreateStatus(), DateTimeOffset.UtcNow);
        await sut.NotifyAsync(notification, CancellationToken.None);

        Assert.False(recorder.WasInvoked);
    }

    /// <summary>
    /// With the webhook enabled and configured, notifying must hit the configured endpoint and
    /// carry both the configured API key header and the run id header.
    /// </summary>
    [Fact]
    public async Task NotifyAsync_SendsPayload_WhenEnabled()
    {
        var recorder = new RecordingHandler(new HttpResponseMessage(HttpStatusCode.Accepted));
        using var http = new HttpClient(recorder);

        var monitor = CreateOptions(workerOptions =>
        {
            var webhook = workerOptions.Policy.Webhook;
            webhook.Enabled = true;
            webhook.Endpoint = "https://example.org/webhooks/policy";
            webhook.ApiKeyHeader = "X-Test-Key";
            webhook.ApiKey = "secret";
            webhook.TimeoutSeconds = 5;
        });

        var sut = new HttpPolicySimulationWebhookClient(
            http,
            monitor,
            NullLogger<HttpPolicySimulationWebhookClient>.Instance);

        var observedAt = DateTimeOffset.UtcNow;
        var notification = PolicySimulationWebhookPayloadFactory.Create(CreateStatus(), observedAt);
        await sut.NotifyAsync(notification, CancellationToken.None);

        Assert.True(recorder.WasInvoked);

        var sent = recorder.LastRequest;
        Assert.NotNull(sent);
        Assert.Equal("https://example.org/webhooks/policy", sent!.RequestUri!.ToString());
        Assert.True(sent.Headers.Contains("X-Test-Key"));
        Assert.True(sent.Headers.Contains("X-StellaOps-Run-Id"));
        Assert.Equal("secret", sent.Headers.GetValues("X-Test-Key").Single());
    }

    /// <summary>
    /// Builds a status for a completed simulation job whose timestamps all share one instant.
    /// </summary>
    private static PolicyRunStatus CreateStatus()
    {
        var timestamp = DateTimeOffset.UtcNow;

        return PolicyRunStatusFactory.Create(
            new PolicyRunJob(
                SchemaVersion: SchedulerSchemaVersions.PolicyRunJob,
                Id: "job",
                TenantId: "tenant",
                PolicyId: "policy",
                PolicyVersion: 1,
                Mode: PolicyRunMode.Simulate,
                Priority: PolicyRunPriority.Normal,
                PriorityRank: 0,
                RunId: "run:policy:123",
                RequestedBy: "tester",
                CorrelationId: "corr",
                Metadata: null,
                Inputs: PolicyRunInputs.Empty,
                QueuedAt: timestamp,
                Status: PolicyRunJobStatus.Completed,
                AttemptCount: 1,
                LastAttemptAt: timestamp,
                LastError: null,
                CreatedAt: timestamp,
                UpdatedAt: timestamp,
                AvailableAt: timestamp,
                SubmittedAt: timestamp,
                CompletedAt: timestamp,
                LeaseOwner: null,
                LeaseExpiresAt: null,
                CancellationRequested: false,
                CancellationRequestedAt: null,
                CancellationReason: null,
                CancelledAt: null),
            timestamp);
    }

    /// <summary>
    /// Wraps a fresh <see cref="SchedulerWorkerOptions"/> in a fixed options monitor, applying
    /// the optional <paramref name="configure"/> callback first.
    /// </summary>
    private static IOptionsMonitor<SchedulerWorkerOptions> CreateOptions(Action<SchedulerWorkerOptions>? configure = null)
    {
        var workerOptions = new SchedulerWorkerOptions();
        if (configure is not null)
        {
            configure(workerOptions);
        }

        return new StaticOptionsMonitor<SchedulerWorkerOptions>(workerOptions);
    }

    /// <summary>
    /// Handler that notes whether it was called, remembers the last request, and always answers
    /// with the same response instance (200 OK unless one is supplied).
    /// </summary>
    private sealed class RecordingHandler : HttpMessageHandler
    {
        private readonly HttpResponseMessage _response;

        public RecordingHandler(HttpResponseMessage? response = null)
            => _response = response ?? new HttpResponseMessage(HttpStatusCode.OK);

        /// <summary>True once <see cref="SendAsync"/> has been called at least once.</summary>
        public bool WasInvoked { get; private set; }

        /// <summary>The most recent request passed to <see cref="SendAsync"/>, if any.</summary>
        public HttpRequestMessage? LastRequest { get; private set; }

        protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            LastRequest = request;
            WasInvoked = true;
            return Task.FromResult(_response);
        }
    }

    /// <summary>
    /// Options monitor that always returns the value it was constructed with and never raises
    /// change notifications.
    /// </summary>
    private sealed class StaticOptionsMonitor<T> : IOptionsMonitor<T>
    {
        public StaticOptionsMonitor(T value) => CurrentValue = value;

        public T CurrentValue { get; }

        // Named options are not distinguished; every name maps to the same value.
        public T Get(string? name) => CurrentValue;

        // The listener is never invoked; a shared do-nothing disposable is handed back.
        public IDisposable OnChange(Action<T, string?> listener) => NoopDisposable.Instance;

        private sealed class NoopDisposable : IDisposable
        {
            public static readonly IDisposable Instance = new NoopDisposable();

            public void Dispose()
            {
            }
        }
    }
}
|
||||
Reference in New Issue
Block a user