feat: Add Go module and workspace test fixtures

- Created expected JSON files for Go modules and workspaces.
- Added go.mod and go.sum files for example projects.
- Implemented private module structure with expected JSON output.
- Introduced vendored dependencies with corresponding expected JSON.
- Developed PostgresGraphJobStore for managing graph jobs.
- Established SQL migration scripts for graph jobs schema.
- Implemented GraphJobRepository for CRUD operations on graph jobs.
- Created IGraphJobRepository interface for repository abstraction.
- Added unit tests for GraphJobRepository to ensure functionality.
This commit is contained in:
StellaOps Bot
2025-12-06 20:04:03 +02:00
parent a6f1406509
commit 05597616d6
178 changed files with 12022 additions and 4545 deletions

View File

@@ -0,0 +1,34 @@
-- Scheduler graph jobs schema (Postgres)
DO $$ BEGIN
CREATE TYPE scheduler.graph_job_type AS ENUM ('build', 'overlay');
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
DO $$ BEGIN
CREATE TYPE scheduler.graph_job_status AS ENUM ('pending', 'running', 'completed', 'failed', 'canceled');
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
CREATE TABLE IF NOT EXISTS scheduler.graph_jobs (
id UUID PRIMARY KEY,
tenant_id TEXT NOT NULL,
type scheduler.graph_job_type NOT NULL,
status scheduler.graph_job_status NOT NULL,
payload JSONB NOT NULL,
correlation_id TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_graph_jobs_tenant_status ON scheduler.graph_jobs(tenant_id, status, created_at DESC);
CREATE INDEX IF NOT EXISTS idx_graph_jobs_tenant_type_status ON scheduler.graph_jobs(tenant_id, type, status, created_at DESC);
CREATE TABLE IF NOT EXISTS scheduler.graph_job_events (
id BIGSERIAL PRIMARY KEY,
job_id UUID NOT NULL REFERENCES scheduler.graph_jobs(id) ON DELETE CASCADE,
tenant_id TEXT NOT NULL,
status scheduler.graph_job_status NOT NULL,
payload JSONB NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_graph_job_events_job ON scheduler.graph_job_events(job_id, created_at DESC);

View File

@@ -0,0 +1,157 @@
using System.Collections.Generic;
using System.Text.Json;
using Dapper;
using Npgsql;
using StellaOps.Infrastructure.Postgres;
using StellaOps.Scheduler.Models;
namespace StellaOps.Scheduler.Storage.Postgres.Repositories;
public sealed class GraphJobRepository : IGraphJobRepository
{
private readonly SchedulerDataSource _dataSource;
private readonly JsonSerializerOptions _json;
public GraphJobRepository(SchedulerDataSource dataSource)
{
_dataSource = dataSource;
_json = CanonicalJsonSerializer.Options;
}
public async ValueTask InsertAsync(GraphBuildJob job, CancellationToken cancellationToken)
{
const string sql = @"INSERT INTO scheduler.graph_jobs
(id, tenant_id, type, status, payload, created_at, updated_at, correlation_id)
VALUES (@Id, @TenantId, @Type, @Status, @Payload, @CreatedAt, @UpdatedAt, @CorrelationId);";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
await conn.ExecuteAsync(sql, new
{
job.Id,
job.TenantId,
Type = (short)GraphJobQueryType.Build,
Status = (short)job.Status,
Payload = JsonSerializer.Serialize(job, _json),
job.CreatedAt,
UpdatedAt = job.UpdatedAt ?? job.CreatedAt,
job.CorrelationId
});
}
public async ValueTask InsertAsync(GraphOverlayJob job, CancellationToken cancellationToken)
{
const string sql = @"INSERT INTO scheduler.graph_jobs
(id, tenant_id, type, status, payload, created_at, updated_at, correlation_id)
VALUES (@Id, @TenantId, @Type, @Status, @Payload, @CreatedAt, @UpdatedAt, @CorrelationId);";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
await conn.ExecuteAsync(sql, new
{
job.Id,
job.TenantId,
Type = (short)GraphJobQueryType.Overlay,
Status = (short)job.Status,
Payload = JsonSerializer.Serialize(job, _json),
job.CreatedAt,
UpdatedAt = job.UpdatedAt ?? job.CreatedAt,
job.CorrelationId
});
}
public async ValueTask<GraphBuildJob?> GetBuildJobAsync(string tenantId, string jobId, CancellationToken cancellationToken)
{
const string sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND id=@Id AND type=@Type LIMIT 1";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var payload = await conn.ExecuteScalarAsync<string?>(sql, new { TenantId = tenantId, Id = jobId, Type = (short)GraphJobQueryType.Build });
return payload is null ? null : JsonSerializer.Deserialize<GraphBuildJob>(payload, _json);
}
public async ValueTask<GraphOverlayJob?> GetOverlayJobAsync(string tenantId, string jobId, CancellationToken cancellationToken)
{
const string sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND id=@Id AND type=@Type LIMIT 1";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var payload = await conn.ExecuteScalarAsync<string?>(sql, new { TenantId = tenantId, Id = jobId, Type = (short)GraphJobQueryType.Overlay });
return payload is null ? null : JsonSerializer.Deserialize<GraphOverlayJob>(payload, _json);
}
public async ValueTask<IReadOnlyCollection<GraphBuildJob>> ListBuildJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken)
{
var sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND type=@Type";
if (status is not null)
{
sql += " AND status=@Status";
}
sql += " ORDER BY created_at DESC LIMIT @Limit";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var rows = await conn.QueryAsync<string>(sql, new
{
TenantId = tenantId,
Type = (short)GraphJobQueryType.Build,
Status = status is null ? null : (short)status,
Limit = limit
});
return rows.Select(r => JsonSerializer.Deserialize<GraphBuildJob>(r, _json)!).ToArray();
}
public async ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken)
{
var sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND type=@Type";
if (status is not null)
{
sql += " AND status=@Status";
}
sql += " ORDER BY created_at DESC LIMIT @Limit";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var rows = await conn.QueryAsync<string>(sql, new
{
TenantId = tenantId,
Type = (short)GraphJobQueryType.Overlay,
Status = status is null ? null : (short)status,
Limit = limit
});
return rows.Select(r => JsonSerializer.Deserialize<GraphOverlayJob>(r, _json)!).ToArray();
}
public ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, CancellationToken cancellationToken)
=> ListOverlayJobsAsync(tenantId, status: null, limit: 50, cancellationToken);
public async ValueTask<bool> TryReplaceAsync(GraphBuildJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken)
{
const string sql = @"UPDATE scheduler.graph_jobs
SET status=@NewStatus, payload=@Payload, updated_at=NOW()
WHERE tenant_id=@TenantId AND id=@Id AND status=@ExpectedStatus AND type=@Type";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var rows = await conn.ExecuteAsync(sql, new
{
job.TenantId,
job.Id,
ExpectedStatus = (short)expectedStatus,
NewStatus = (short)job.Status,
Type = (short)GraphJobQueryType.Build,
Payload = JsonSerializer.Serialize(job, _json)
});
return rows == 1;
}
public async ValueTask<bool> TryReplaceOverlayAsync(GraphOverlayJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken)
{
const string sql = @"UPDATE scheduler.graph_jobs
SET status=@NewStatus, payload=@Payload, updated_at=NOW()
WHERE tenant_id=@TenantId AND id=@Id AND status=@ExpectedStatus AND type=@Type";
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var rows = await conn.ExecuteAsync(sql, new
{
job.TenantId,
job.Id,
ExpectedStatus = (short)expectedStatus,
NewStatus = (short)job.Status,
Type = (short)GraphJobQueryType.Overlay,
Payload = JsonSerializer.Serialize(job, _json)
});
return rows == 1;
}
}

View File

@@ -0,0 +1,22 @@
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using StellaOps.Scheduler.Models;
namespace StellaOps.Scheduler.Storage.Postgres.Repositories;
public interface IGraphJobRepository
{
ValueTask InsertAsync(GraphBuildJob job, CancellationToken cancellationToken);
ValueTask InsertAsync(GraphOverlayJob job, CancellationToken cancellationToken);
ValueTask<bool> TryReplaceAsync(GraphBuildJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken);
ValueTask<bool> TryReplaceOverlayAsync(GraphOverlayJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken);
ValueTask<GraphBuildJob?> GetBuildJobAsync(string tenantId, string jobId, CancellationToken cancellationToken);
ValueTask<GraphOverlayJob?> GetOverlayJobAsync(string tenantId, string jobId, CancellationToken cancellationToken);
ValueTask<IReadOnlyCollection<GraphBuildJob>> ListBuildJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken);
ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken);
ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, CancellationToken cancellationToken);
}

View File

@@ -33,6 +33,7 @@ public static class ServiceCollectionExtensions
services.AddScoped<IDistributedLockRepository, DistributedLockRepository>();
services.AddScoped<IJobHistoryRepository, JobHistoryRepository>();
services.AddScoped<IMetricsRepository, MetricsRepository>();
services.AddScoped<IGraphJobRepository, GraphJobRepository>();
return services;
}
@@ -57,6 +58,7 @@ public static class ServiceCollectionExtensions
services.AddScoped<IDistributedLockRepository, DistributedLockRepository>();
services.AddScoped<IJobHistoryRepository, JobHistoryRepository>();
services.AddScoped<IMetricsRepository, MetricsRepository>();
services.AddScoped<IGraphJobRepository, GraphJobRepository>();
return services;
}

View File

@@ -16,6 +16,11 @@
<ItemGroup>
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Infrastructure.Postgres\StellaOps.Infrastructure.Postgres.csproj" />
<ProjectReference Include="..\StellaOps.Scheduler.Models\StellaOps.Scheduler.Models.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Dapper" Version="2.1.24" />
</ItemGroup>
</Project>

View File

@@ -6,8 +6,8 @@ using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Queue;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Storage.Mongo.Services;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Services;
using StellaOps.Scheduler.Worker.Events;
using StellaOps.Scheduler.Worker.Observability;

View File

@@ -1,129 +1,129 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphBuildBackgroundService : BackgroundService
{
private readonly IGraphJobRepository _repository;
private readonly GraphBuildExecutionService _executionService;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly ILogger<GraphBuildBackgroundService> _logger;
public GraphBuildBackgroundService(
IGraphJobRepository repository,
GraphBuildExecutionService executionService,
IOptions<SchedulerWorkerOptions> options,
ILogger<GraphBuildBackgroundService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Graph build worker started.");
while (!stoppingToken.IsCancellationRequested)
{
try
{
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
var jobs = await _repository.ListBuildJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
if (jobs.Count == 0)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
foreach (var job in jobs)
{
try
{
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
LogResult(result);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Unhandled exception while processing graph build job {JobId}.", job.Id);
}
}
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Graph build worker encountered an error; backing off.");
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
}
}
_logger.LogInformation("Graph build worker stopping.");
}
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
if (delay <= TimeSpan.Zero)
{
return;
}
try
{
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
}
catch (TaskCanceledException)
{
}
}
private void LogResult(GraphBuildExecutionResult result)
{
switch (result.Type)
{
case GraphBuildExecutionResultType.Completed:
_logger.LogInformation(
"Graph build job {JobId} completed (tenant={TenantId}).",
result.Job.Id,
result.Job.TenantId);
break;
case GraphBuildExecutionResultType.Failed:
_logger.LogWarning(
"Graph build job {JobId} failed (tenant={TenantId}): {Reason}.",
result.Job.Id,
result.Job.TenantId,
result.Reason ?? "unknown error");
break;
case GraphBuildExecutionResultType.Skipped:
_logger.LogDebug(
"Graph build job {JobId} skipped: {Reason}.",
result.Job.Id,
result.Reason ?? "no reason");
break;
}
}
}
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphBuildBackgroundService : BackgroundService
{
private readonly IGraphJobRepository _repository;
private readonly GraphBuildExecutionService _executionService;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly ILogger<GraphBuildBackgroundService> _logger;
public GraphBuildBackgroundService(
IGraphJobRepository repository,
GraphBuildExecutionService executionService,
IOptions<SchedulerWorkerOptions> options,
ILogger<GraphBuildBackgroundService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Graph build worker started.");
while (!stoppingToken.IsCancellationRequested)
{
try
{
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
var jobs = await _repository.ListBuildJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
if (jobs.Count == 0)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
foreach (var job in jobs)
{
try
{
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
LogResult(result);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Unhandled exception while processing graph build job {JobId}.", job.Id);
}
}
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Graph build worker encountered an error; backing off.");
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
}
}
_logger.LogInformation("Graph build worker stopping.");
}
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
if (delay <= TimeSpan.Zero)
{
return;
}
try
{
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
}
catch (TaskCanceledException)
{
}
}
private void LogResult(GraphBuildExecutionResult result)
{
switch (result.Type)
{
case GraphBuildExecutionResultType.Completed:
_logger.LogInformation(
"Graph build job {JobId} completed (tenant={TenantId}).",
result.Job.Id,
result.Job.TenantId);
break;
case GraphBuildExecutionResultType.Failed:
_logger.LogWarning(
"Graph build job {JobId} failed (tenant={TenantId}): {Reason}.",
result.Job.Id,
result.Job.TenantId,
result.Reason ?? "unknown error");
break;
case GraphBuildExecutionResultType.Skipped:
_logger.LogDebug(
"Graph build job {JobId} skipped: {Reason}.",
result.Job.Id,
result.Reason ?? "no reason");
break;
}
}
}

View File

@@ -1,76 +1,76 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Graph.Cartographer;
using StellaOps.Scheduler.Worker.Graph.Scheduler;
using StellaOps.Scheduler.Worker.Options;
using StellaOps.Scheduler.Worker.Observability;
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphBuildExecutionService
{
private readonly IGraphJobRepository _repository;
private readonly ICartographerBuildClient _cartographerClient;
private readonly IGraphJobCompletionClient _completionClient;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly SchedulerWorkerMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly ILogger<GraphBuildExecutionService> _logger;
public GraphBuildExecutionService(
IGraphJobRepository repository,
ICartographerBuildClient cartographerClient,
IGraphJobCompletionClient completionClient,
IOptions<SchedulerWorkerOptions> options,
SchedulerWorkerMetrics metrics,
TimeProvider? timeProvider,
ILogger<GraphBuildExecutionService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_cartographerClient = cartographerClient ?? throw new ArgumentNullException(nameof(cartographerClient));
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
_options = options ?? throw new ArgumentNullException(nameof(options));
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<GraphBuildExecutionResult> ExecuteAsync(GraphBuildJob job, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(job);
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
_metrics.RecordGraphJobResult("build", "skipped");
return GraphBuildExecutionResult.Skipped(job, "graph_processing_disabled");
}
if (job.Status != GraphJobStatus.Pending)
{
_metrics.RecordGraphJobResult("build", "skipped");
return GraphBuildExecutionResult.Skipped(job, "status_not_pending");
}
var now = _timeProvider.GetUtcNow();
GraphBuildJob running;
try
{
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to transition graph job {JobId} to running state.", job.Id);
_metrics.RecordGraphJobResult("build", "skipped");
return GraphBuildExecutionResult.Skipped(job, "transition_invalid");
}
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphBuildExecutionService
{
private readonly IGraphJobRepository _repository;
private readonly ICartographerBuildClient _cartographerClient;
private readonly IGraphJobCompletionClient _completionClient;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly SchedulerWorkerMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly ILogger<GraphBuildExecutionService> _logger;
public GraphBuildExecutionService(
IGraphJobRepository repository,
ICartographerBuildClient cartographerClient,
IGraphJobCompletionClient completionClient,
IOptions<SchedulerWorkerOptions> options,
SchedulerWorkerMetrics metrics,
TimeProvider? timeProvider,
ILogger<GraphBuildExecutionService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_cartographerClient = cartographerClient ?? throw new ArgumentNullException(nameof(cartographerClient));
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
_options = options ?? throw new ArgumentNullException(nameof(options));
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<GraphBuildExecutionResult> ExecuteAsync(GraphBuildJob job, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(job);
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
_metrics.RecordGraphJobResult("build", "skipped");
return GraphBuildExecutionResult.Skipped(job, "graph_processing_disabled");
}
if (job.Status != GraphJobStatus.Pending)
{
_metrics.RecordGraphJobResult("build", "skipped");
return GraphBuildExecutionResult.Skipped(job, "status_not_pending");
}
var now = _timeProvider.GetUtcNow();
GraphBuildJob running;
try
{
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to transition graph job {JobId} to running state.", job.Id);
_metrics.RecordGraphJobResult("build", "skipped");
return GraphBuildExecutionResult.Skipped(job, "transition_invalid");
}
if (!await _repository.TryReplaceAsync(running, job.Status, cancellationToken).ConfigureAwait(false))
{
_metrics.RecordGraphJobResult("build", "skipped");
@@ -78,161 +78,161 @@ internal sealed class GraphBuildExecutionService
}
_metrics.RecordGraphJobStart("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId);
var attempt = 0;
CartographerBuildResult? lastResult = null;
Exception? lastException = null;
var backoff = graphOptions.RetryBackoff;
while (attempt < graphOptions.MaxAttempts)
{
cancellationToken.ThrowIfCancellationRequested();
attempt++;
try
{
var response = await _cartographerClient.StartBuildAsync(running, cancellationToken).ConfigureAwait(false);
lastResult = response;
if (!string.IsNullOrWhiteSpace(response.CartographerJobId) && response.CartographerJobId != running.CartographerJobId)
{
var updated = running with { CartographerJobId = response.CartographerJobId };
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
{
running = updated;
}
}
if (!string.IsNullOrWhiteSpace(response.GraphSnapshotId) && response.GraphSnapshotId != running.GraphSnapshotId)
{
var updated = running with { GraphSnapshotId = response.GraphSnapshotId };
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
{
running = updated;
}
}
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
var attempt = 0;
CartographerBuildResult? lastResult = null;
Exception? lastException = null;
var backoff = graphOptions.RetryBackoff;
while (attempt < graphOptions.MaxAttempts)
{
cancellationToken.ThrowIfCancellationRequested();
attempt++;
try
{
var response = await _cartographerClient.StartBuildAsync(running, cancellationToken).ConfigureAwait(false);
lastResult = response;
if (!string.IsNullOrWhiteSpace(response.CartographerJobId) && response.CartographerJobId != running.CartographerJobId)
{
var updated = running with { CartographerJobId = response.CartographerJobId };
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
{
running = updated;
}
}
if (!string.IsNullOrWhiteSpace(response.GraphSnapshotId) && response.GraphSnapshotId != running.GraphSnapshotId)
{
var updated = running with { GraphSnapshotId = response.GraphSnapshotId };
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
{
running = updated;
}
}
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
var duration = completionTime - running.CreatedAt;
_metrics.RecordGraphJobResult("build", "completed", duration);
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "completed", duration);
return GraphBuildExecutionResult.Completed(running, response.ResultUri);
}
if (response.Status == GraphJobStatus.Failed)
{
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
if (response.Status == GraphJobStatus.Failed)
{
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
var duration = completionTime - running.CreatedAt;
_metrics.RecordGraphJobResult("build", "failed", duration);
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "failed", duration);
return GraphBuildExecutionResult.Failed(running, response.Error);
}
_logger.LogWarning(
"Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
attempt,
job.Id,
backoff,
response.Error ?? "unknown");
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
continue;
}
// If Cartographer reports pending/queued we wait and retry.
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the build.", cancellationToken).ConfigureAwait(false);
_logger.LogWarning(
"Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
attempt,
job.Id,
backoff,
response.Error ?? "unknown");
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
continue;
}
// If Cartographer reports pending/queued we wait and retry.
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the build.", cancellationToken).ConfigureAwait(false);
var duration = completionTime - running.CreatedAt;
_metrics.RecordGraphJobResult("build", "failed", duration);
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "failed", duration);
return GraphBuildExecutionResult.Failed(running, response.Error);
}
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
lastException = ex;
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
_metrics.RecordGraphJobResult("build", "failed", completionTime - running.CreatedAt);
return GraphBuildExecutionResult.Failed(running, ex.Message);
}
_logger.LogWarning(ex, "Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
}
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer build failed";
var finalTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
lastException = ex;
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
_metrics.RecordGraphJobResult("build", "failed", completionTime - running.CreatedAt);
return GraphBuildExecutionResult.Failed(running, ex.Message);
}
_logger.LogWarning(ex, "Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
}
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer build failed";
var finalTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
var finalDuration = finalTime - running.CreatedAt;
_metrics.RecordGraphJobResult("build", "failed", finalDuration);
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "failed", finalDuration);
return GraphBuildExecutionResult.Failed(running, error);
}
private async Task NotifyCompletionAsync(
GraphBuildJob job,
GraphJobStatus status,
DateTimeOffset occurredAt,
string? graphSnapshotId,
string? resultUri,
string? error,
CancellationToken cancellationToken)
{
var dto = new GraphJobCompletionRequestDto(
job.Id,
"Build",
status,
occurredAt,
graphSnapshotId ?? job.GraphSnapshotId,
resultUri,
job.CorrelationId,
status == GraphJobStatus.Failed ? (error ?? "Cartographer build failed.") : null);
try
{
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
_logger.LogError(ex, "Failed notifying Scheduler completion for graph job {JobId}.", job.Id);
}
}
}
internal enum GraphBuildExecutionResultType
{
Completed,
Failed,
Skipped
}
internal readonly record struct GraphBuildExecutionResult(
GraphBuildExecutionResultType Type,
GraphBuildJob Job,
string? Reason = null,
string? ResultUri = null)
{
public static GraphBuildExecutionResult Completed(GraphBuildJob job, string? resultUri)
=> new(GraphBuildExecutionResultType.Completed, job, ResultUri: resultUri);
public static GraphBuildExecutionResult Failed(GraphBuildJob job, string? error)
=> new(GraphBuildExecutionResultType.Failed, job, error);
public static GraphBuildExecutionResult Skipped(GraphBuildJob job, string reason)
=> new(GraphBuildExecutionResultType.Skipped, job, reason);
}
private async Task NotifyCompletionAsync(
GraphBuildJob job,
GraphJobStatus status,
DateTimeOffset occurredAt,
string? graphSnapshotId,
string? resultUri,
string? error,
CancellationToken cancellationToken)
{
var dto = new GraphJobCompletionRequestDto(
job.Id,
"Build",
status,
occurredAt,
graphSnapshotId ?? job.GraphSnapshotId,
resultUri,
job.CorrelationId,
status == GraphJobStatus.Failed ? (error ?? "Cartographer build failed.") : null);
try
{
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
_logger.LogError(ex, "Failed notifying Scheduler completion for graph job {JobId}.", job.Id);
}
}
}
internal enum GraphBuildExecutionResultType
{
Completed,
Failed,
Skipped
}
internal readonly record struct GraphBuildExecutionResult(
GraphBuildExecutionResultType Type,
GraphBuildJob Job,
string? Reason = null,
string? ResultUri = null)
{
public static GraphBuildExecutionResult Completed(GraphBuildJob job, string? resultUri)
=> new(GraphBuildExecutionResultType.Completed, job, ResultUri: resultUri);
public static GraphBuildExecutionResult Failed(GraphBuildJob job, string? error)
=> new(GraphBuildExecutionResultType.Failed, job, error);
public static GraphBuildExecutionResult Skipped(GraphBuildJob job, string reason)
=> new(GraphBuildExecutionResultType.Skipped, job, reason);
}

View File

@@ -1,128 +1,128 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphOverlayBackgroundService : BackgroundService
{
private readonly IGraphJobRepository _repository;
private readonly GraphOverlayExecutionService _executionService;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly ILogger<GraphOverlayBackgroundService> _logger;
public GraphOverlayBackgroundService(
IGraphJobRepository repository,
GraphOverlayExecutionService executionService,
IOptions<SchedulerWorkerOptions> options,
ILogger<GraphOverlayBackgroundService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Graph overlay worker started.");
while (!stoppingToken.IsCancellationRequested)
{
try
{
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
var jobs = await _repository.ListOverlayJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
if (jobs.Count == 0)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
foreach (var job in jobs)
{
try
{
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
LogResult(result);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Unhandled exception while processing graph overlay job {JobId}.", job.Id);
}
}
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Graph overlay worker encountered an error; backing off.");
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
}
}
_logger.LogInformation("Graph overlay worker stopping.");
}
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
if (delay <= TimeSpan.Zero)
{
return;
}
try
{
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
}
catch (TaskCanceledException)
{
}
}
private void LogResult(GraphOverlayExecutionResult result)
{
switch (result.Type)
{
case GraphOverlayExecutionResultType.Completed:
_logger.LogInformation(
"Graph overlay job {JobId} completed (tenant={TenantId}).",
result.Job.Id,
result.Job.TenantId);
break;
case GraphOverlayExecutionResultType.Failed:
_logger.LogWarning(
"Graph overlay job {JobId} failed (tenant={TenantId}): {Reason}.",
result.Job.Id,
result.Job.TenantId,
result.Reason ?? "unknown error");
break;
case GraphOverlayExecutionResultType.Skipped:
_logger.LogDebug(
"Graph overlay job {JobId} skipped: {Reason}.",
result.Job.Id,
result.Reason ?? "no reason");
break;
}
}
}
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphOverlayBackgroundService : BackgroundService
{
private readonly IGraphJobRepository _repository;
private readonly GraphOverlayExecutionService _executionService;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly ILogger<GraphOverlayBackgroundService> _logger;
public GraphOverlayBackgroundService(
IGraphJobRepository repository,
GraphOverlayExecutionService executionService,
IOptions<SchedulerWorkerOptions> options,
ILogger<GraphOverlayBackgroundService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Graph overlay worker started.");
while (!stoppingToken.IsCancellationRequested)
{
try
{
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
var jobs = await _repository.ListOverlayJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
if (jobs.Count == 0)
{
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
foreach (var job in jobs)
{
try
{
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
LogResult(result);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Unhandled exception while processing graph overlay job {JobId}.", job.Id);
}
}
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Graph overlay worker encountered an error; backing off.");
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
}
}
_logger.LogInformation("Graph overlay worker stopping.");
}
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
if (delay <= TimeSpan.Zero)
{
return;
}
try
{
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
}
catch (TaskCanceledException)
{
}
}
private void LogResult(GraphOverlayExecutionResult result)
{
switch (result.Type)
{
case GraphOverlayExecutionResultType.Completed:
_logger.LogInformation(
"Graph overlay job {JobId} completed (tenant={TenantId}).",
result.Job.Id,
result.Job.TenantId);
break;
case GraphOverlayExecutionResultType.Failed:
_logger.LogWarning(
"Graph overlay job {JobId} failed (tenant={TenantId}): {Reason}.",
result.Job.Id,
result.Job.TenantId,
result.Reason ?? "unknown error");
break;
case GraphOverlayExecutionResultType.Skipped:
_logger.LogDebug(
"Graph overlay job {JobId} skipped: {Reason}.",
result.Job.Id,
result.Reason ?? "no reason");
break;
}
}
}

View File

@@ -1,76 +1,76 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Graph.Cartographer;
using StellaOps.Scheduler.Worker.Graph.Scheduler;
using StellaOps.Scheduler.Worker.Options;
using StellaOps.Scheduler.Worker.Observability;
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphOverlayExecutionService
{
private readonly IGraphJobRepository _repository;
private readonly ICartographerOverlayClient _overlayClient;
private readonly IGraphJobCompletionClient _completionClient;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly SchedulerWorkerMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly ILogger<GraphOverlayExecutionService> _logger;
public GraphOverlayExecutionService(
IGraphJobRepository repository,
ICartographerOverlayClient overlayClient,
IGraphJobCompletionClient completionClient,
IOptions<SchedulerWorkerOptions> options,
SchedulerWorkerMetrics metrics,
TimeProvider? timeProvider,
ILogger<GraphOverlayExecutionService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_overlayClient = overlayClient ?? throw new ArgumentNullException(nameof(overlayClient));
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
_options = options ?? throw new ArgumentNullException(nameof(options));
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<GraphOverlayExecutionResult> ExecuteAsync(GraphOverlayJob job, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(job);
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
_metrics.RecordGraphJobResult("overlay", "skipped");
return GraphOverlayExecutionResult.Skipped(job, "graph_processing_disabled");
}
if (job.Status != GraphJobStatus.Pending)
{
_metrics.RecordGraphJobResult("overlay", "skipped");
return GraphOverlayExecutionResult.Skipped(job, "status_not_pending");
}
var now = _timeProvider.GetUtcNow();
GraphOverlayJob running;
try
{
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to transition graph overlay job {JobId} to running state.", job.Id);
_metrics.RecordGraphJobResult("overlay", "skipped");
return GraphOverlayExecutionResult.Skipped(job, "transition_invalid");
}
namespace StellaOps.Scheduler.Worker.Graph;
internal sealed class GraphOverlayExecutionService
{
private readonly IGraphJobRepository _repository;
private readonly ICartographerOverlayClient _overlayClient;
private readonly IGraphJobCompletionClient _completionClient;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly SchedulerWorkerMetrics _metrics;
private readonly TimeProvider _timeProvider;
private readonly ILogger<GraphOverlayExecutionService> _logger;
public GraphOverlayExecutionService(
IGraphJobRepository repository,
ICartographerOverlayClient overlayClient,
IGraphJobCompletionClient completionClient,
IOptions<SchedulerWorkerOptions> options,
SchedulerWorkerMetrics metrics,
TimeProvider? timeProvider,
ILogger<GraphOverlayExecutionService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_overlayClient = overlayClient ?? throw new ArgumentNullException(nameof(overlayClient));
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
_options = options ?? throw new ArgumentNullException(nameof(options));
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<GraphOverlayExecutionResult> ExecuteAsync(GraphOverlayJob job, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(job);
var graphOptions = _options.Value.Graph;
if (!graphOptions.Enabled)
{
_metrics.RecordGraphJobResult("overlay", "skipped");
return GraphOverlayExecutionResult.Skipped(job, "graph_processing_disabled");
}
if (job.Status != GraphJobStatus.Pending)
{
_metrics.RecordGraphJobResult("overlay", "skipped");
return GraphOverlayExecutionResult.Skipped(job, "status_not_pending");
}
var now = _timeProvider.GetUtcNow();
GraphOverlayJob running;
try
{
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to transition graph overlay job {JobId} to running state.", job.Id);
_metrics.RecordGraphJobResult("overlay", "skipped");
return GraphOverlayExecutionResult.Skipped(job, "transition_invalid");
}
if (!await _repository.TryReplaceOverlayAsync(running, job.Status, cancellationToken).ConfigureAwait(false))
{
_metrics.RecordGraphJobResult("overlay", "skipped");
@@ -78,142 +78,142 @@ internal sealed class GraphOverlayExecutionService
}
_metrics.RecordGraphJobStart("overlay", running.TenantId, running.GraphSnapshotId);
var attempt = 0;
CartographerOverlayResult? lastResult = null;
Exception? lastException = null;
var backoff = graphOptions.RetryBackoff;
while (attempt < graphOptions.MaxAttempts)
{
cancellationToken.ThrowIfCancellationRequested();
attempt++;
try
{
var response = await _overlayClient.StartOverlayAsync(running, cancellationToken).ConfigureAwait(false);
lastResult = response;
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
var attempt = 0;
CartographerOverlayResult? lastResult = null;
Exception? lastException = null;
var backoff = graphOptions.RetryBackoff;
while (attempt < graphOptions.MaxAttempts)
{
cancellationToken.ThrowIfCancellationRequested();
attempt++;
try
{
var response = await _overlayClient.StartOverlayAsync(running, cancellationToken).ConfigureAwait(false);
lastResult = response;
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
var duration = completionTime - running.CreatedAt;
_metrics.RecordGraphJobResult("overlay", "completed", duration);
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "completed", duration);
return GraphOverlayExecutionResult.Completed(running, response.ResultUri);
}
if (response.Status == GraphJobStatus.Failed)
{
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
if (response.Status == GraphJobStatus.Failed)
{
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
var duration = completionTime - running.CreatedAt;
_metrics.RecordGraphJobResult("overlay", "failed", duration);
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "failed", duration);
return GraphOverlayExecutionResult.Failed(running, response.Error);
}
_logger.LogWarning(
"Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
attempt,
job.Id,
backoff,
response.Error ?? "unknown");
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
continue;
}
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the overlay.", cancellationToken).ConfigureAwait(false);
_logger.LogWarning(
"Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
attempt,
job.Id,
backoff,
response.Error ?? "unknown");
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
continue;
}
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the overlay.", cancellationToken).ConfigureAwait(false);
var duration = completionTime - running.CreatedAt;
_metrics.RecordGraphJobResult("overlay", "failed", duration);
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "failed", duration);
return GraphOverlayExecutionResult.Failed(running, response.Error);
}
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
lastException = ex;
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
_metrics.RecordGraphJobResult("overlay", "failed", completionTime - running.CreatedAt);
return GraphOverlayExecutionResult.Failed(running, ex.Message);
}
_logger.LogWarning(ex, "Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
}
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer overlay failed";
var finalTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
lastException = ex;
if (attempt >= graphOptions.MaxAttempts)
{
var completionTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
_metrics.RecordGraphJobResult("overlay", "failed", completionTime - running.CreatedAt);
return GraphOverlayExecutionResult.Failed(running, ex.Message);
}
_logger.LogWarning(ex, "Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
}
}
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer overlay failed";
var finalTime = _timeProvider.GetUtcNow();
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
var finalDuration = finalTime - running.CreatedAt;
_metrics.RecordGraphJobResult("overlay", "failed", finalDuration);
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "failed", finalDuration);
return GraphOverlayExecutionResult.Failed(running, error);
}
private async Task NotifyCompletionAsync(
GraphOverlayJob job,
GraphJobStatus status,
DateTimeOffset occurredAt,
string? graphSnapshotId,
string? resultUri,
string? error,
CancellationToken cancellationToken)
{
var dto = new GraphJobCompletionRequestDto(
job.Id,
"Overlay",
status,
occurredAt,
graphSnapshotId ?? job.GraphSnapshotId,
resultUri,
job.CorrelationId,
status == GraphJobStatus.Failed ? (error ?? "Cartographer overlay failed.") : null);
try
{
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
_logger.LogError(ex, "Failed notifying Scheduler completion for graph overlay job {JobId}.", job.Id);
}
}
}
internal enum GraphOverlayExecutionResultType
{
Completed,
Failed,
Skipped
}
internal readonly record struct GraphOverlayExecutionResult(
GraphOverlayExecutionResultType Type,
GraphOverlayJob Job,
string? Reason = null,
string? ResultUri = null)
{
public static GraphOverlayExecutionResult Completed(GraphOverlayJob job, string? resultUri)
=> new(GraphOverlayExecutionResultType.Completed, job, ResultUri: resultUri);
public static GraphOverlayExecutionResult Failed(GraphOverlayJob job, string? error)
=> new(GraphOverlayExecutionResultType.Failed, job, error);
public static GraphOverlayExecutionResult Skipped(GraphOverlayJob job, string reason)
=> new(GraphOverlayExecutionResultType.Skipped, job, reason);
}
private async Task NotifyCompletionAsync(
GraphOverlayJob job,
GraphJobStatus status,
DateTimeOffset occurredAt,
string? graphSnapshotId,
string? resultUri,
string? error,
CancellationToken cancellationToken)
{
var dto = new GraphJobCompletionRequestDto(
job.Id,
"Overlay",
status,
occurredAt,
graphSnapshotId ?? job.GraphSnapshotId,
resultUri,
job.CorrelationId,
status == GraphJobStatus.Failed ? (error ?? "Cartographer overlay failed.") : null);
try
{
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
}
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
{
_logger.LogError(ex, "Failed notifying Scheduler completion for graph overlay job {JobId}.", job.Id);
}
}
}
internal enum GraphOverlayExecutionResultType
{
Completed,
Failed,
Skipped
}
internal readonly record struct GraphOverlayExecutionResult(
GraphOverlayExecutionResultType Type,
GraphOverlayJob Job,
string? Reason = null,
string? ResultUri = null)
{
public static GraphOverlayExecutionResult Completed(GraphOverlayJob job, string? resultUri)
=> new(GraphOverlayExecutionResultType.Completed, job, ResultUri: resultUri);
public static GraphOverlayExecutionResult Failed(GraphOverlayJob job, string? error)
=> new(GraphOverlayExecutionResultType.Failed, job, error);
public static GraphOverlayExecutionResult Skipped(GraphOverlayJob job, string reason)
=> new(GraphOverlayExecutionResultType.Skipped, job, reason);
}

View File

@@ -1,7 +1,7 @@
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Planning;

View File

@@ -2,8 +2,8 @@ using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Queue;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Storage.Mongo.Services;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Services;
using StellaOps.Scheduler.Worker.Options;
using StellaOps.Scheduler.Worker.Observability;

View File

@@ -1,188 +1,188 @@
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Policy;
internal sealed class PolicyRunDispatchBackgroundService : BackgroundService
{
private readonly IPolicyRunJobRepository _repository;
private readonly PolicyRunExecutionService _executionService;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly TimeProvider _timeProvider;
private readonly ILogger<PolicyRunDispatchBackgroundService> _logger;
private readonly string _leaseOwner;
public PolicyRunDispatchBackgroundService(
IPolicyRunJobRepository repository,
PolicyRunExecutionService executionService,
IOptions<SchedulerWorkerOptions> options,
TimeProvider? timeProvider,
ILogger<PolicyRunDispatchBackgroundService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
_options = options ?? throw new ArgumentNullException(nameof(options));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_leaseOwner = options.Value.Policy.Dispatch.LeaseOwner;
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Policy run dispatcher loop started with lease owner {LeaseOwner}.", _leaseOwner);
while (!stoppingToken.IsCancellationRequested)
{
try
{
var policyOptions = _options.Value.Policy;
if (!policyOptions.Enabled)
{
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
var batch = await LeaseBatchAsync(policyOptions.Dispatch, stoppingToken).ConfigureAwait(false);
if (batch.Count == 0)
{
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
foreach (var job in batch)
{
try
{
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
LogResult(result);
}
catch (OperationCanceledException)
{
throw;
}
catch (Exception ex)
{
_logger.LogError(ex, "Unhandled exception while processing policy run job {JobId}.", job.Id);
}
}
}
catch (OperationCanceledException)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Policy run dispatcher encountered an error; backing off.");
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
}
}
_logger.LogInformation("Policy run dispatcher loop stopping.");
}
private async Task<IReadOnlyList<PolicyRunJob>> LeaseBatchAsync(
SchedulerWorkerOptions.PolicyOptions.DispatchOptions dispatchOptions,
CancellationToken cancellationToken)
{
var jobs = new List<PolicyRunJob>(dispatchOptions.BatchSize);
for (var i = 0; i < dispatchOptions.BatchSize; i++)
{
var now = _timeProvider.GetUtcNow();
PolicyRunJob? leased;
try
{
leased = await _repository
.LeaseAsync(_leaseOwner, now, dispatchOptions.LeaseDuration, dispatchOptions.MaxAttempts, cancellationToken: cancellationToken)
.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
throw;
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to lease policy run job on attempt {Attempt}.", i + 1);
break;
}
if (leased is null)
{
break;
}
jobs.Add(leased);
}
return jobs;
}
private void LogResult(PolicyRunExecutionResult result)
{
switch (result.Type)
{
case PolicyRunExecutionResultType.Submitted:
_logger.LogInformation(
"Policy run job {JobId} submitted for tenant {TenantId} policy {PolicyId} (runId={RunId}).",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.UpdatedJob.RunId);
break;
case PolicyRunExecutionResultType.Retrying:
_logger.LogWarning(
"Policy run job {JobId} will retry for tenant {TenantId} policy {PolicyId}: {Error}.",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.Error);
break;
case PolicyRunExecutionResultType.Failed:
_logger.LogError(
"Policy run job {JobId} failed permanently for tenant {TenantId} policy {PolicyId}: {Error}.",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.Error);
break;
case PolicyRunExecutionResultType.Cancelled:
_logger.LogInformation(
"Policy run job {JobId} cancelled for tenant {TenantId} policy {PolicyId}.",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId);
break;
case PolicyRunExecutionResultType.NoOp:
_logger.LogInformation(
"Policy run job {JobId} completed without submission for tenant {TenantId} policy {PolicyId} (reason={Reason}).",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.Error ?? "none");
break;
}
}
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
if (delay <= TimeSpan.Zero)
{
return;
}
try
{
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
}
catch (TaskCanceledException)
{
}
}
}
using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Policy;
internal sealed class PolicyRunDispatchBackgroundService : BackgroundService
{
private readonly IPolicyRunJobRepository _repository;
private readonly PolicyRunExecutionService _executionService;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly TimeProvider _timeProvider;
private readonly ILogger<PolicyRunDispatchBackgroundService> _logger;
private readonly string _leaseOwner;
public PolicyRunDispatchBackgroundService(
IPolicyRunJobRepository repository,
PolicyRunExecutionService executionService,
IOptions<SchedulerWorkerOptions> options,
TimeProvider? timeProvider,
ILogger<PolicyRunDispatchBackgroundService> logger)
{
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
_options = options ?? throw new ArgumentNullException(nameof(options));
_timeProvider = timeProvider ?? TimeProvider.System;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_leaseOwner = options.Value.Policy.Dispatch.LeaseOwner;
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation("Policy run dispatcher loop started with lease owner {LeaseOwner}.", _leaseOwner);
while (!stoppingToken.IsCancellationRequested)
{
try
{
var policyOptions = _options.Value.Policy;
if (!policyOptions.Enabled)
{
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
var batch = await LeaseBatchAsync(policyOptions.Dispatch, stoppingToken).ConfigureAwait(false);
if (batch.Count == 0)
{
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
continue;
}
foreach (var job in batch)
{
try
{
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
LogResult(result);
}
catch (OperationCanceledException)
{
throw;
}
catch (Exception ex)
{
_logger.LogError(ex, "Unhandled exception while processing policy run job {JobId}.", job.Id);
}
}
}
catch (OperationCanceledException)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Policy run dispatcher encountered an error; backing off.");
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
}
}
_logger.LogInformation("Policy run dispatcher loop stopping.");
}
private async Task<IReadOnlyList<PolicyRunJob>> LeaseBatchAsync(
SchedulerWorkerOptions.PolicyOptions.DispatchOptions dispatchOptions,
CancellationToken cancellationToken)
{
var jobs = new List<PolicyRunJob>(dispatchOptions.BatchSize);
for (var i = 0; i < dispatchOptions.BatchSize; i++)
{
var now = _timeProvider.GetUtcNow();
PolicyRunJob? leased;
try
{
leased = await _repository
.LeaseAsync(_leaseOwner, now, dispatchOptions.LeaseDuration, dispatchOptions.MaxAttempts, cancellationToken: cancellationToken)
.ConfigureAwait(false);
}
catch (OperationCanceledException)
{
throw;
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to lease policy run job on attempt {Attempt}.", i + 1);
break;
}
if (leased is null)
{
break;
}
jobs.Add(leased);
}
return jobs;
}
private void LogResult(PolicyRunExecutionResult result)
{
switch (result.Type)
{
case PolicyRunExecutionResultType.Submitted:
_logger.LogInformation(
"Policy run job {JobId} submitted for tenant {TenantId} policy {PolicyId} (runId={RunId}).",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.UpdatedJob.RunId);
break;
case PolicyRunExecutionResultType.Retrying:
_logger.LogWarning(
"Policy run job {JobId} will retry for tenant {TenantId} policy {PolicyId}: {Error}.",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.Error);
break;
case PolicyRunExecutionResultType.Failed:
_logger.LogError(
"Policy run job {JobId} failed permanently for tenant {TenantId} policy {PolicyId}: {Error}.",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.Error);
break;
case PolicyRunExecutionResultType.Cancelled:
_logger.LogInformation(
"Policy run job {JobId} cancelled for tenant {TenantId} policy {PolicyId}.",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId);
break;
case PolicyRunExecutionResultType.NoOp:
_logger.LogInformation(
"Policy run job {JobId} completed without submission for tenant {TenantId} policy {PolicyId} (reason={Reason}).",
result.UpdatedJob.Id,
result.UpdatedJob.TenantId,
result.UpdatedJob.PolicyId,
result.Error ?? "none");
break;
}
}
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
{
if (delay <= TimeSpan.Zero)
{
return;
}
try
{
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
}
catch (TaskCanceledException)
{
}
}
}

View File

@@ -1,18 +1,18 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Mongo.Repositories;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Policy;
internal sealed class PolicyRunExecutionService
{
private readonly IPolicyRunJobRepository _repository;
using System;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Scheduler.Models;
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
using StellaOps.Scheduler.Worker.Observability;
using StellaOps.Scheduler.Worker.Options;
namespace StellaOps.Scheduler.Worker.Policy;
internal sealed class PolicyRunExecutionService
{
private readonly IPolicyRunJobRepository _repository;
private readonly IPolicyRunClient _client;
private readonly IOptions<SchedulerWorkerOptions> _options;
private readonly TimeProvider _timeProvider;
@@ -40,31 +40,31 @@ internal sealed class PolicyRunExecutionService
_webhookClient = webhookClient ?? throw new ArgumentNullException(nameof(webhookClient));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
public async Task<PolicyRunExecutionResult> ExecuteAsync(PolicyRunJob job, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(job);
cancellationToken.ThrowIfCancellationRequested();
if (job.CancellationRequested)
{
var cancelledAt = _timeProvider.GetUtcNow();
var cancelled = job with
{
Status = PolicyRunJobStatus.Cancelled,
CancelledAt = cancelledAt,
UpdatedAt = cancelledAt,
LeaseOwner = null,
LeaseExpiresAt = null,
AvailableAt = cancelledAt
};
var replaced = await _repository.ReplaceAsync(cancelled, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning("Failed to update cancelled policy run job {JobId}.", job.Id);
}
public async Task<PolicyRunExecutionResult> ExecuteAsync(PolicyRunJob job, CancellationToken cancellationToken)
{
ArgumentNullException.ThrowIfNull(job);
cancellationToken.ThrowIfCancellationRequested();
if (job.CancellationRequested)
{
var cancelledAt = _timeProvider.GetUtcNow();
var cancelled = job with
{
Status = PolicyRunJobStatus.Cancelled,
CancelledAt = cancelledAt,
UpdatedAt = cancelledAt,
LeaseOwner = null,
LeaseExpiresAt = null,
AvailableAt = cancelledAt
};
var replaced = await _repository.ReplaceAsync(cancelled, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning("Failed to update cancelled policy run job {JobId}.", job.Id);
}
_metrics.RecordPolicyRunEvent(
cancelled.TenantId,
cancelled.PolicyId,
@@ -83,38 +83,38 @@ internal sealed class PolicyRunExecutionService
await _webhookClient.NotifyAsync(cancelledPayload, cancellationToken).ConfigureAwait(false);
return PolicyRunExecutionResult.Cancelled(cancelled);
}
var targeting = await _targetingService
.EnsureTargetsAsync(job, cancellationToken)
.ConfigureAwait(false);
if (targeting.Status == PolicyRunTargetingStatus.NoWork)
{
var completionTime = _timeProvider.GetUtcNow();
var completed = targeting.Job with
{
Status = PolicyRunJobStatus.Completed,
CompletedAt = completionTime,
UpdatedAt = completionTime,
LeaseOwner = null,
LeaseExpiresAt = null,
AvailableAt = completionTime,
LastError = null
};
var replaced = await _repository.ReplaceAsync(
completed,
job.LeaseOwner,
cancellationToken: cancellationToken)
.ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning("Failed to persist no-work completion for policy run job {JobId}.", job.Id);
}
var latency = CalculateLatency(job, completionTime);
}
var targeting = await _targetingService
.EnsureTargetsAsync(job, cancellationToken)
.ConfigureAwait(false);
if (targeting.Status == PolicyRunTargetingStatus.NoWork)
{
var completionTime = _timeProvider.GetUtcNow();
var completed = targeting.Job with
{
Status = PolicyRunJobStatus.Completed,
CompletedAt = completionTime,
UpdatedAt = completionTime,
LeaseOwner = null,
LeaseExpiresAt = null,
AvailableAt = completionTime,
LastError = null
};
var replaced = await _repository.ReplaceAsync(
completed,
job.LeaseOwner,
cancellationToken: cancellationToken)
.ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning("Failed to persist no-work completion for policy run job {JobId}.", job.Id);
}
var latency = CalculateLatency(job, completionTime);
_metrics.RecordPolicyRunEvent(
completed.TenantId,
completed.PolicyId,
@@ -132,85 +132,85 @@ internal sealed class PolicyRunExecutionService
await _webhookClient.NotifyAsync(completedPayload, cancellationToken).ConfigureAwait(false);
return PolicyRunExecutionResult.NoOp(completed, targeting.Reason);
}
job = targeting.Job;
var now = _timeProvider.GetUtcNow();
var request = job.ToPolicyRunRequest(now);
var submission = await _client.SubmitAsync(job, request, cancellationToken).ConfigureAwait(false);
var dispatchOptions = _options.Value.Policy.Dispatch;
var attemptCount = job.AttemptCount + 1;
if (submission.Success)
{
var updated = job with
{
Status = PolicyRunJobStatus.Submitted,
RunId = submission.RunId ?? job.RunId,
SubmittedAt = submission.QueuedAt ?? now,
UpdatedAt = now,
AttemptCount = attemptCount,
LastAttemptAt = now,
LastError = null,
LeaseOwner = null,
LeaseExpiresAt = null,
AvailableAt = now
};
var replaced = await _repository.ReplaceAsync(updated, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning("Failed to persist submitted policy run job {JobId}.", job.Id);
}
var latency = CalculateLatency(job, now);
_metrics.RecordPolicyRunEvent(
updated.TenantId,
updated.PolicyId,
updated.Mode,
"submitted",
latency);
_logger.LogInformation(
"Policy run job {JobId} submitted (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempts={Attempts}).",
updated.Id,
updated.TenantId,
updated.PolicyId,
updated.RunId ?? "(pending)",
attemptCount);
return PolicyRunExecutionResult.Submitted(updated);
}
var nextStatus = attemptCount >= dispatchOptions.MaxAttempts
? PolicyRunJobStatus.Failed
: PolicyRunJobStatus.Pending;
var nextAvailable = nextStatus == PolicyRunJobStatus.Pending
? now.Add(dispatchOptions.RetryBackoff)
: now;
var failedJob = job with
{
Status = nextStatus,
AttemptCount = attemptCount,
LastAttemptAt = now,
LastError = submission.Error,
LeaseOwner = null,
LeaseExpiresAt = null,
UpdatedAt = now,
AvailableAt = nextAvailable
};
var updateSuccess = await _repository.ReplaceAsync(failedJob, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!updateSuccess)
{
_logger.LogWarning("Failed to update policy run job {JobId} after submission failure.", job.Id);
}
var latencyForFailure = CalculateLatency(job, now);
var reason = string.IsNullOrWhiteSpace(submission.Error) ? null : submission.Error;
if (nextStatus == PolicyRunJobStatus.Failed)
{
}
job = targeting.Job;
var now = _timeProvider.GetUtcNow();
var request = job.ToPolicyRunRequest(now);
var submission = await _client.SubmitAsync(job, request, cancellationToken).ConfigureAwait(false);
var dispatchOptions = _options.Value.Policy.Dispatch;
var attemptCount = job.AttemptCount + 1;
if (submission.Success)
{
var updated = job with
{
Status = PolicyRunJobStatus.Submitted,
RunId = submission.RunId ?? job.RunId,
SubmittedAt = submission.QueuedAt ?? now,
UpdatedAt = now,
AttemptCount = attemptCount,
LastAttemptAt = now,
LastError = null,
LeaseOwner = null,
LeaseExpiresAt = null,
AvailableAt = now
};
var replaced = await _repository.ReplaceAsync(updated, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!replaced)
{
_logger.LogWarning("Failed to persist submitted policy run job {JobId}.", job.Id);
}
var latency = CalculateLatency(job, now);
_metrics.RecordPolicyRunEvent(
updated.TenantId,
updated.PolicyId,
updated.Mode,
"submitted",
latency);
_logger.LogInformation(
"Policy run job {JobId} submitted (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempts={Attempts}).",
updated.Id,
updated.TenantId,
updated.PolicyId,
updated.RunId ?? "(pending)",
attemptCount);
return PolicyRunExecutionResult.Submitted(updated);
}
var nextStatus = attemptCount >= dispatchOptions.MaxAttempts
? PolicyRunJobStatus.Failed
: PolicyRunJobStatus.Pending;
var nextAvailable = nextStatus == PolicyRunJobStatus.Pending
? now.Add(dispatchOptions.RetryBackoff)
: now;
var failedJob = job with
{
Status = nextStatus,
AttemptCount = attemptCount,
LastAttemptAt = now,
LastError = submission.Error,
LeaseOwner = null,
LeaseExpiresAt = null,
UpdatedAt = now,
AvailableAt = nextAvailable
};
var updateSuccess = await _repository.ReplaceAsync(failedJob, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
if (!updateSuccess)
{
_logger.LogWarning("Failed to update policy run job {JobId} after submission failure.", job.Id);
}
var latencyForFailure = CalculateLatency(job, now);
var reason = string.IsNullOrWhiteSpace(submission.Error) ? null : submission.Error;
if (nextStatus == PolicyRunJobStatus.Failed)
{
_metrics.RecordPolicyRunEvent(
failedJob.TenantId,
failedJob.PolicyId,
@@ -233,31 +233,31 @@ internal sealed class PolicyRunExecutionService
await _webhookClient.NotifyAsync(failedPayload, cancellationToken).ConfigureAwait(false);
return PolicyRunExecutionResult.Failed(failedJob, submission.Error);
}
_metrics.RecordPolicyRunEvent(
failedJob.TenantId,
failedJob.PolicyId,
failedJob.Mode,
"retry",
latencyForFailure,
reason);
_logger.LogWarning(
"Policy run job {JobId} retry scheduled (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempt={Attempt}). Error: {Error}",
failedJob.Id,
failedJob.TenantId,
failedJob.PolicyId,
failedJob.RunId ?? "(pending)",
attemptCount,
submission.Error ?? "unknown");
return PolicyRunExecutionResult.Retrying(failedJob, submission.Error);
}
private static TimeSpan CalculateLatency(PolicyRunJob job, DateTimeOffset now)
{
var origin = job.QueuedAt ?? job.CreatedAt;
var latency = now - origin;
return latency < TimeSpan.Zero ? TimeSpan.Zero : latency;
}
}
}
_metrics.RecordPolicyRunEvent(
failedJob.TenantId,
failedJob.PolicyId,
failedJob.Mode,
"retry",
latencyForFailure,
reason);
_logger.LogWarning(
"Policy run job {JobId} retry scheduled (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempt={Attempt}). Error: {Error}",
failedJob.Id,
failedJob.TenantId,
failedJob.PolicyId,
failedJob.RunId ?? "(pending)",
attemptCount,
submission.Error ?? "unknown");
return PolicyRunExecutionResult.Retrying(failedJob, submission.Error);
}
private static TimeSpan CalculateLatency(PolicyRunJob job, DateTimeOffset now)
{
var origin = job.QueuedAt ?? job.CreatedAt;
var latency = now - origin;
return latency < TimeSpan.Zero ? TimeSpan.Zero : latency;
}
}

View File

@@ -8,7 +8,7 @@
<ItemGroup>
<ProjectReference Include="../StellaOps.Scheduler.ImpactIndex/StellaOps.Scheduler.ImpactIndex.csproj" />
<ProjectReference Include="../StellaOps.Scheduler.Models/StellaOps.Scheduler.Models.csproj" />
<ProjectReference Include="../StellaOps.Scheduler.Storage.Mongo/StellaOps.Scheduler.Storage.Mongo.csproj" />
<ProjectReference Include="../StellaOps.Scheduler.Storage.Postgres/StellaOps.Scheduler.Storage.Postgres.csproj" />
<ProjectReference Include="../StellaOps.Scheduler.Queue/StellaOps.Scheduler.Queue.csproj" />
<ProjectReference Include="../../../Notify/__Libraries/StellaOps.Notify.Models/StellaOps.Notify.Models.csproj" />
<ProjectReference Include="../../../Notify/__Libraries/StellaOps.Notify.Queue/StellaOps.Notify.Queue.csproj" />