feat: Add Go module and workspace test fixtures
- Created expected JSON files for Go modules and workspaces. - Added go.mod and go.sum files for example projects. - Implemented private module structure with expected JSON output. - Introduced vendored dependencies with corresponding expected JSON. - Developed PostgresGraphJobStore for managing graph jobs. - Established SQL migration scripts for graph jobs schema. - Implemented GraphJobRepository for CRUD operations on graph jobs. - Created IGraphJobRepository interface for repository abstraction. - Added unit tests for GraphJobRepository to ensure functionality.
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
-- Scheduler graph jobs schema (Postgres)
|
||||
|
||||
DO $$ BEGIN
|
||||
CREATE TYPE scheduler.graph_job_type AS ENUM ('build', 'overlay');
|
||||
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
|
||||
|
||||
DO $$ BEGIN
|
||||
CREATE TYPE scheduler.graph_job_status AS ENUM ('pending', 'running', 'completed', 'failed', 'canceled');
|
||||
EXCEPTION WHEN duplicate_object THEN NULL; END $$;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS scheduler.graph_jobs (
|
||||
id UUID PRIMARY KEY,
|
||||
tenant_id TEXT NOT NULL,
|
||||
type scheduler.graph_job_type NOT NULL,
|
||||
status scheduler.graph_job_status NOT NULL,
|
||||
payload JSONB NOT NULL,
|
||||
correlation_id TEXT,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_graph_jobs_tenant_status ON scheduler.graph_jobs(tenant_id, status, created_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_graph_jobs_tenant_type_status ON scheduler.graph_jobs(tenant_id, type, status, created_at DESC);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS scheduler.graph_job_events (
|
||||
id BIGSERIAL PRIMARY KEY,
|
||||
job_id UUID NOT NULL REFERENCES scheduler.graph_jobs(id) ON DELETE CASCADE,
|
||||
tenant_id TEXT NOT NULL,
|
||||
status scheduler.graph_job_status NOT NULL,
|
||||
payload JSONB NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_graph_job_events_job ON scheduler.graph_job_events(job_id, created_at DESC);
|
||||
@@ -0,0 +1,157 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Text.Json;
|
||||
using Dapper;
|
||||
using Npgsql;
|
||||
using StellaOps.Infrastructure.Postgres;
|
||||
using StellaOps.Scheduler.Models;
|
||||
|
||||
namespace StellaOps.Scheduler.Storage.Postgres.Repositories;
|
||||
|
||||
public sealed class GraphJobRepository : IGraphJobRepository
|
||||
{
|
||||
private readonly SchedulerDataSource _dataSource;
|
||||
private readonly JsonSerializerOptions _json;
|
||||
|
||||
public GraphJobRepository(SchedulerDataSource dataSource)
|
||||
{
|
||||
_dataSource = dataSource;
|
||||
_json = CanonicalJsonSerializer.Options;
|
||||
}
|
||||
|
||||
public async ValueTask InsertAsync(GraphBuildJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
const string sql = @"INSERT INTO scheduler.graph_jobs
|
||||
(id, tenant_id, type, status, payload, created_at, updated_at, correlation_id)
|
||||
VALUES (@Id, @TenantId, @Type, @Status, @Payload, @CreatedAt, @UpdatedAt, @CorrelationId);";
|
||||
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
await conn.ExecuteAsync(sql, new
|
||||
{
|
||||
job.Id,
|
||||
job.TenantId,
|
||||
Type = (short)GraphJobQueryType.Build,
|
||||
Status = (short)job.Status,
|
||||
Payload = JsonSerializer.Serialize(job, _json),
|
||||
job.CreatedAt,
|
||||
UpdatedAt = job.UpdatedAt ?? job.CreatedAt,
|
||||
job.CorrelationId
|
||||
});
|
||||
}
|
||||
|
||||
public async ValueTask InsertAsync(GraphOverlayJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
const string sql = @"INSERT INTO scheduler.graph_jobs
|
||||
(id, tenant_id, type, status, payload, created_at, updated_at, correlation_id)
|
||||
VALUES (@Id, @TenantId, @Type, @Status, @Payload, @CreatedAt, @UpdatedAt, @CorrelationId);";
|
||||
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
await conn.ExecuteAsync(sql, new
|
||||
{
|
||||
job.Id,
|
||||
job.TenantId,
|
||||
Type = (short)GraphJobQueryType.Overlay,
|
||||
Status = (short)job.Status,
|
||||
Payload = JsonSerializer.Serialize(job, _json),
|
||||
job.CreatedAt,
|
||||
UpdatedAt = job.UpdatedAt ?? job.CreatedAt,
|
||||
job.CorrelationId
|
||||
});
|
||||
}
|
||||
|
||||
public async ValueTask<GraphBuildJob?> GetBuildJobAsync(string tenantId, string jobId, CancellationToken cancellationToken)
|
||||
{
|
||||
const string sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND id=@Id AND type=@Type LIMIT 1";
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var payload = await conn.ExecuteScalarAsync<string?>(sql, new { TenantId = tenantId, Id = jobId, Type = (short)GraphJobQueryType.Build });
|
||||
return payload is null ? null : JsonSerializer.Deserialize<GraphBuildJob>(payload, _json);
|
||||
}
|
||||
|
||||
public async ValueTask<GraphOverlayJob?> GetOverlayJobAsync(string tenantId, string jobId, CancellationToken cancellationToken)
|
||||
{
|
||||
const string sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND id=@Id AND type=@Type LIMIT 1";
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var payload = await conn.ExecuteScalarAsync<string?>(sql, new { TenantId = tenantId, Id = jobId, Type = (short)GraphJobQueryType.Overlay });
|
||||
return payload is null ? null : JsonSerializer.Deserialize<GraphOverlayJob>(payload, _json);
|
||||
}
|
||||
|
||||
public async ValueTask<IReadOnlyCollection<GraphBuildJob>> ListBuildJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken)
|
||||
{
|
||||
var sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND type=@Type";
|
||||
if (status is not null)
|
||||
{
|
||||
sql += " AND status=@Status";
|
||||
}
|
||||
sql += " ORDER BY created_at DESC LIMIT @Limit";
|
||||
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var rows = await conn.QueryAsync<string>(sql, new
|
||||
{
|
||||
TenantId = tenantId,
|
||||
Type = (short)GraphJobQueryType.Build,
|
||||
Status = status is null ? null : (short)status,
|
||||
Limit = limit
|
||||
});
|
||||
return rows.Select(r => JsonSerializer.Deserialize<GraphBuildJob>(r, _json)!).ToArray();
|
||||
}
|
||||
|
||||
public async ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken)
|
||||
{
|
||||
var sql = "SELECT payload FROM scheduler.graph_jobs WHERE tenant_id=@TenantId AND type=@Type";
|
||||
if (status is not null)
|
||||
{
|
||||
sql += " AND status=@Status";
|
||||
}
|
||||
sql += " ORDER BY created_at DESC LIMIT @Limit";
|
||||
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var rows = await conn.QueryAsync<string>(sql, new
|
||||
{
|
||||
TenantId = tenantId,
|
||||
Type = (short)GraphJobQueryType.Overlay,
|
||||
Status = status is null ? null : (short)status,
|
||||
Limit = limit
|
||||
});
|
||||
return rows.Select(r => JsonSerializer.Deserialize<GraphOverlayJob>(r, _json)!).ToArray();
|
||||
}
|
||||
|
||||
public ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, CancellationToken cancellationToken)
|
||||
=> ListOverlayJobsAsync(tenantId, status: null, limit: 50, cancellationToken);
|
||||
|
||||
public async ValueTask<bool> TryReplaceAsync(GraphBuildJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken)
|
||||
{
|
||||
const string sql = @"UPDATE scheduler.graph_jobs
|
||||
SET status=@NewStatus, payload=@Payload, updated_at=NOW()
|
||||
WHERE tenant_id=@TenantId AND id=@Id AND status=@ExpectedStatus AND type=@Type";
|
||||
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var rows = await conn.ExecuteAsync(sql, new
|
||||
{
|
||||
job.TenantId,
|
||||
job.Id,
|
||||
ExpectedStatus = (short)expectedStatus,
|
||||
NewStatus = (short)job.Status,
|
||||
Type = (short)GraphJobQueryType.Build,
|
||||
Payload = JsonSerializer.Serialize(job, _json)
|
||||
});
|
||||
return rows == 1;
|
||||
}
|
||||
|
||||
public async ValueTask<bool> TryReplaceOverlayAsync(GraphOverlayJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken)
|
||||
{
|
||||
const string sql = @"UPDATE scheduler.graph_jobs
|
||||
SET status=@NewStatus, payload=@Payload, updated_at=NOW()
|
||||
WHERE tenant_id=@TenantId AND id=@Id AND status=@ExpectedStatus AND type=@Type";
|
||||
|
||||
await using var conn = await _dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var rows = await conn.ExecuteAsync(sql, new
|
||||
{
|
||||
job.TenantId,
|
||||
job.Id,
|
||||
ExpectedStatus = (short)expectedStatus,
|
||||
NewStatus = (short)job.Status,
|
||||
Type = (short)GraphJobQueryType.Overlay,
|
||||
Payload = JsonSerializer.Serialize(job, _json)
|
||||
});
|
||||
return rows == 1;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using StellaOps.Scheduler.Models;
|
||||
|
||||
namespace StellaOps.Scheduler.Storage.Postgres.Repositories;
|
||||
|
||||
public interface IGraphJobRepository
|
||||
{
|
||||
ValueTask InsertAsync(GraphBuildJob job, CancellationToken cancellationToken);
|
||||
ValueTask InsertAsync(GraphOverlayJob job, CancellationToken cancellationToken);
|
||||
|
||||
ValueTask<bool> TryReplaceAsync(GraphBuildJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken);
|
||||
ValueTask<bool> TryReplaceOverlayAsync(GraphOverlayJob job, GraphJobStatus expectedStatus, CancellationToken cancellationToken);
|
||||
|
||||
ValueTask<GraphBuildJob?> GetBuildJobAsync(string tenantId, string jobId, CancellationToken cancellationToken);
|
||||
ValueTask<GraphOverlayJob?> GetOverlayJobAsync(string tenantId, string jobId, CancellationToken cancellationToken);
|
||||
|
||||
ValueTask<IReadOnlyCollection<GraphBuildJob>> ListBuildJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken);
|
||||
ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, GraphJobStatus? status, int limit, CancellationToken cancellationToken);
|
||||
ValueTask<IReadOnlyCollection<GraphOverlayJob>> ListOverlayJobsAsync(string tenantId, CancellationToken cancellationToken);
|
||||
}
|
||||
@@ -33,6 +33,7 @@ public static class ServiceCollectionExtensions
|
||||
services.AddScoped<IDistributedLockRepository, DistributedLockRepository>();
|
||||
services.AddScoped<IJobHistoryRepository, JobHistoryRepository>();
|
||||
services.AddScoped<IMetricsRepository, MetricsRepository>();
|
||||
services.AddScoped<IGraphJobRepository, GraphJobRepository>();
|
||||
|
||||
return services;
|
||||
}
|
||||
@@ -57,6 +58,7 @@ public static class ServiceCollectionExtensions
|
||||
services.AddScoped<IDistributedLockRepository, DistributedLockRepository>();
|
||||
services.AddScoped<IJobHistoryRepository, JobHistoryRepository>();
|
||||
services.AddScoped<IMetricsRepository, MetricsRepository>();
|
||||
services.AddScoped<IGraphJobRepository, GraphJobRepository>();
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
@@ -16,6 +16,11 @@
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Infrastructure.Postgres\StellaOps.Infrastructure.Postgres.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.Scheduler.Models\StellaOps.Scheduler.Models.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Dapper" Version="2.1.24" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
||||
@@ -6,8 +6,8 @@ using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Queue;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Services;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Services;
|
||||
using StellaOps.Scheduler.Worker.Events;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
|
||||
@@ -1,129 +1,129 @@
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphBuildBackgroundService : BackgroundService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly GraphBuildExecutionService _executionService;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly ILogger<GraphBuildBackgroundService> _logger;
|
||||
|
||||
public GraphBuildBackgroundService(
|
||||
IGraphJobRepository repository,
|
||||
GraphBuildExecutionService executionService,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
ILogger<GraphBuildBackgroundService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Graph build worker started.");
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
var jobs = await _repository.ListBuildJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
|
||||
|
||||
if (jobs.Count == 0)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var job in jobs)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
|
||||
LogResult(result);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception while processing graph build job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
|
||||
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Graph build worker encountered an error; backing off.");
|
||||
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Graph build worker stopping.");
|
||||
}
|
||||
|
||||
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
|
||||
{
|
||||
if (delay <= TimeSpan.Zero)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (TaskCanceledException)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
private void LogResult(GraphBuildExecutionResult result)
|
||||
{
|
||||
switch (result.Type)
|
||||
{
|
||||
case GraphBuildExecutionResultType.Completed:
|
||||
_logger.LogInformation(
|
||||
"Graph build job {JobId} completed (tenant={TenantId}).",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId);
|
||||
break;
|
||||
case GraphBuildExecutionResultType.Failed:
|
||||
_logger.LogWarning(
|
||||
"Graph build job {JobId} failed (tenant={TenantId}): {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId,
|
||||
result.Reason ?? "unknown error");
|
||||
break;
|
||||
case GraphBuildExecutionResultType.Skipped:
|
||||
_logger.LogDebug(
|
||||
"Graph build job {JobId} skipped: {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Reason ?? "no reason");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphBuildBackgroundService : BackgroundService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly GraphBuildExecutionService _executionService;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly ILogger<GraphBuildBackgroundService> _logger;
|
||||
|
||||
public GraphBuildBackgroundService(
|
||||
IGraphJobRepository repository,
|
||||
GraphBuildExecutionService executionService,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
ILogger<GraphBuildBackgroundService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Graph build worker started.");
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
var jobs = await _repository.ListBuildJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
|
||||
|
||||
if (jobs.Count == 0)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var job in jobs)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
|
||||
LogResult(result);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception while processing graph build job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
|
||||
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Graph build worker encountered an error; backing off.");
|
||||
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Graph build worker stopping.");
|
||||
}
|
||||
|
||||
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
|
||||
{
|
||||
if (delay <= TimeSpan.Zero)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (TaskCanceledException)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
private void LogResult(GraphBuildExecutionResult result)
|
||||
{
|
||||
switch (result.Type)
|
||||
{
|
||||
case GraphBuildExecutionResultType.Completed:
|
||||
_logger.LogInformation(
|
||||
"Graph build job {JobId} completed (tenant={TenantId}).",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId);
|
||||
break;
|
||||
case GraphBuildExecutionResultType.Failed:
|
||||
_logger.LogWarning(
|
||||
"Graph build job {JobId} failed (tenant={TenantId}): {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId,
|
||||
result.Reason ?? "unknown error");
|
||||
break;
|
||||
case GraphBuildExecutionResultType.Skipped:
|
||||
_logger.LogDebug(
|
||||
"Graph build job {JobId} skipped: {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Reason ?? "no reason");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,76 +1,76 @@
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphBuildExecutionService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly ICartographerBuildClient _cartographerClient;
|
||||
private readonly IGraphJobCompletionClient _completionClient;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly SchedulerWorkerMetrics _metrics;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<GraphBuildExecutionService> _logger;
|
||||
|
||||
public GraphBuildExecutionService(
|
||||
IGraphJobRepository repository,
|
||||
ICartographerBuildClient cartographerClient,
|
||||
IGraphJobCompletionClient completionClient,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
SchedulerWorkerMetrics metrics,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<GraphBuildExecutionService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_cartographerClient = cartographerClient ?? throw new ArgumentNullException(nameof(cartographerClient));
|
||||
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<GraphBuildExecutionResult> ExecuteAsync(GraphBuildJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
return GraphBuildExecutionResult.Skipped(job, "graph_processing_disabled");
|
||||
}
|
||||
|
||||
if (job.Status != GraphJobStatus.Pending)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
return GraphBuildExecutionResult.Skipped(job, "status_not_pending");
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
GraphBuildJob running;
|
||||
|
||||
try
|
||||
{
|
||||
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to transition graph job {JobId} to running state.", job.Id);
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
return GraphBuildExecutionResult.Skipped(job, "transition_invalid");
|
||||
}
|
||||
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphBuildExecutionService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly ICartographerBuildClient _cartographerClient;
|
||||
private readonly IGraphJobCompletionClient _completionClient;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly SchedulerWorkerMetrics _metrics;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<GraphBuildExecutionService> _logger;
|
||||
|
||||
public GraphBuildExecutionService(
|
||||
IGraphJobRepository repository,
|
||||
ICartographerBuildClient cartographerClient,
|
||||
IGraphJobCompletionClient completionClient,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
SchedulerWorkerMetrics metrics,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<GraphBuildExecutionService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_cartographerClient = cartographerClient ?? throw new ArgumentNullException(nameof(cartographerClient));
|
||||
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<GraphBuildExecutionResult> ExecuteAsync(GraphBuildJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
return GraphBuildExecutionResult.Skipped(job, "graph_processing_disabled");
|
||||
}
|
||||
|
||||
if (job.Status != GraphJobStatus.Pending)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
return GraphBuildExecutionResult.Skipped(job, "status_not_pending");
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
GraphBuildJob running;
|
||||
|
||||
try
|
||||
{
|
||||
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to transition graph job {JobId} to running state.", job.Id);
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
return GraphBuildExecutionResult.Skipped(job, "transition_invalid");
|
||||
}
|
||||
|
||||
if (!await _repository.TryReplaceAsync(running, job.Status, cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
_metrics.RecordGraphJobResult("build", "skipped");
|
||||
@@ -78,161 +78,161 @@ internal sealed class GraphBuildExecutionService
|
||||
}
|
||||
|
||||
_metrics.RecordGraphJobStart("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId);
|
||||
|
||||
var attempt = 0;
|
||||
CartographerBuildResult? lastResult = null;
|
||||
Exception? lastException = null;
|
||||
var backoff = graphOptions.RetryBackoff;
|
||||
|
||||
while (attempt < graphOptions.MaxAttempts)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
attempt++;
|
||||
|
||||
try
|
||||
{
|
||||
var response = await _cartographerClient.StartBuildAsync(running, cancellationToken).ConfigureAwait(false);
|
||||
lastResult = response;
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(response.CartographerJobId) && response.CartographerJobId != running.CartographerJobId)
|
||||
{
|
||||
var updated = running with { CartographerJobId = response.CartographerJobId };
|
||||
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
running = updated;
|
||||
}
|
||||
}
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(response.GraphSnapshotId) && response.GraphSnapshotId != running.GraphSnapshotId)
|
||||
{
|
||||
var updated = running with { GraphSnapshotId = response.GraphSnapshotId };
|
||||
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
running = updated;
|
||||
}
|
||||
}
|
||||
|
||||
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
var attempt = 0;
|
||||
CartographerBuildResult? lastResult = null;
|
||||
Exception? lastException = null;
|
||||
var backoff = graphOptions.RetryBackoff;
|
||||
|
||||
while (attempt < graphOptions.MaxAttempts)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
attempt++;
|
||||
|
||||
try
|
||||
{
|
||||
var response = await _cartographerClient.StartBuildAsync(running, cancellationToken).ConfigureAwait(false);
|
||||
lastResult = response;
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(response.CartographerJobId) && response.CartographerJobId != running.CartographerJobId)
|
||||
{
|
||||
var updated = running with { CartographerJobId = response.CartographerJobId };
|
||||
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
running = updated;
|
||||
}
|
||||
}
|
||||
|
||||
if (!string.IsNullOrWhiteSpace(response.GraphSnapshotId) && response.GraphSnapshotId != running.GraphSnapshotId)
|
||||
{
|
||||
var updated = running with { GraphSnapshotId = response.GraphSnapshotId };
|
||||
if (await _repository.TryReplaceAsync(updated, GraphJobStatus.Running, cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
running = updated;
|
||||
}
|
||||
}
|
||||
|
||||
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
var duration = completionTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("build", "completed", duration);
|
||||
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "completed", duration);
|
||||
return GraphBuildExecutionResult.Completed(running, response.ResultUri);
|
||||
}
|
||||
|
||||
if (response.Status == GraphJobStatus.Failed)
|
||||
{
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
if (response.Status == GraphJobStatus.Failed)
|
||||
{
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
var duration = completionTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("build", "failed", duration);
|
||||
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "failed", duration);
|
||||
return GraphBuildExecutionResult.Failed(running, response.Error);
|
||||
}
|
||||
|
||||
_logger.LogWarning(
|
||||
"Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
|
||||
attempt,
|
||||
job.Id,
|
||||
backoff,
|
||||
response.Error ?? "unknown");
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If Cartographer reports pending/queued we wait and retry.
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the build.", cancellationToken).ConfigureAwait(false);
|
||||
|
||||
_logger.LogWarning(
|
||||
"Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
|
||||
attempt,
|
||||
job.Id,
|
||||
backoff,
|
||||
response.Error ?? "unknown");
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If Cartographer reports pending/queued we wait and retry.
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the build.", cancellationToken).ConfigureAwait(false);
|
||||
var duration = completionTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("build", "failed", duration);
|
||||
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "failed", duration);
|
||||
return GraphBuildExecutionResult.Failed(running, response.Error);
|
||||
}
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
lastException = ex;
|
||||
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
|
||||
_metrics.RecordGraphJobResult("build", "failed", completionTime - running.CreatedAt);
|
||||
return GraphBuildExecutionResult.Failed(running, ex.Message);
|
||||
}
|
||||
|
||||
_logger.LogWarning(ex, "Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer build failed";
|
||||
var finalTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
lastException = ex;
|
||||
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
|
||||
_metrics.RecordGraphJobResult("build", "failed", completionTime - running.CreatedAt);
|
||||
return GraphBuildExecutionResult.Failed(running, ex.Message);
|
||||
}
|
||||
|
||||
_logger.LogWarning(ex, "Cartographer build attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer build failed";
|
||||
var finalTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
|
||||
var finalDuration = finalTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("build", "failed", finalDuration);
|
||||
_metrics.RecordGraphJobCompletion("build", running.TenantId, running.GraphSnapshotId ?? running.SbomId, "failed", finalDuration);
|
||||
return GraphBuildExecutionResult.Failed(running, error);
|
||||
}
|
||||
|
||||
private async Task NotifyCompletionAsync(
|
||||
GraphBuildJob job,
|
||||
GraphJobStatus status,
|
||||
DateTimeOffset occurredAt,
|
||||
string? graphSnapshotId,
|
||||
string? resultUri,
|
||||
string? error,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var dto = new GraphJobCompletionRequestDto(
|
||||
job.Id,
|
||||
"Build",
|
||||
status,
|
||||
occurredAt,
|
||||
graphSnapshotId ?? job.GraphSnapshotId,
|
||||
resultUri,
|
||||
job.CorrelationId,
|
||||
status == GraphJobStatus.Failed ? (error ?? "Cartographer build failed.") : null);
|
||||
|
||||
try
|
||||
{
|
||||
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
_logger.LogError(ex, "Failed notifying Scheduler completion for graph job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
internal enum GraphBuildExecutionResultType
|
||||
{
|
||||
Completed,
|
||||
Failed,
|
||||
Skipped
|
||||
}
|
||||
|
||||
internal readonly record struct GraphBuildExecutionResult(
|
||||
GraphBuildExecutionResultType Type,
|
||||
GraphBuildJob Job,
|
||||
string? Reason = null,
|
||||
string? ResultUri = null)
|
||||
{
|
||||
public static GraphBuildExecutionResult Completed(GraphBuildJob job, string? resultUri)
|
||||
=> new(GraphBuildExecutionResultType.Completed, job, ResultUri: resultUri);
|
||||
|
||||
public static GraphBuildExecutionResult Failed(GraphBuildJob job, string? error)
|
||||
=> new(GraphBuildExecutionResultType.Failed, job, error);
|
||||
|
||||
public static GraphBuildExecutionResult Skipped(GraphBuildJob job, string reason)
|
||||
=> new(GraphBuildExecutionResultType.Skipped, job, reason);
|
||||
}
|
||||
|
||||
private async Task NotifyCompletionAsync(
|
||||
GraphBuildJob job,
|
||||
GraphJobStatus status,
|
||||
DateTimeOffset occurredAt,
|
||||
string? graphSnapshotId,
|
||||
string? resultUri,
|
||||
string? error,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var dto = new GraphJobCompletionRequestDto(
|
||||
job.Id,
|
||||
"Build",
|
||||
status,
|
||||
occurredAt,
|
||||
graphSnapshotId ?? job.GraphSnapshotId,
|
||||
resultUri,
|
||||
job.CorrelationId,
|
||||
status == GraphJobStatus.Failed ? (error ?? "Cartographer build failed.") : null);
|
||||
|
||||
try
|
||||
{
|
||||
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
_logger.LogError(ex, "Failed notifying Scheduler completion for graph job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
internal enum GraphBuildExecutionResultType
|
||||
{
|
||||
Completed,
|
||||
Failed,
|
||||
Skipped
|
||||
}
|
||||
|
||||
internal readonly record struct GraphBuildExecutionResult(
|
||||
GraphBuildExecutionResultType Type,
|
||||
GraphBuildJob Job,
|
||||
string? Reason = null,
|
||||
string? ResultUri = null)
|
||||
{
|
||||
public static GraphBuildExecutionResult Completed(GraphBuildJob job, string? resultUri)
|
||||
=> new(GraphBuildExecutionResultType.Completed, job, ResultUri: resultUri);
|
||||
|
||||
public static GraphBuildExecutionResult Failed(GraphBuildJob job, string? error)
|
||||
=> new(GraphBuildExecutionResultType.Failed, job, error);
|
||||
|
||||
public static GraphBuildExecutionResult Skipped(GraphBuildJob job, string reason)
|
||||
=> new(GraphBuildExecutionResultType.Skipped, job, reason);
|
||||
}
|
||||
|
||||
@@ -1,128 +1,128 @@
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphOverlayBackgroundService : BackgroundService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly GraphOverlayExecutionService _executionService;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly ILogger<GraphOverlayBackgroundService> _logger;
|
||||
|
||||
public GraphOverlayBackgroundService(
|
||||
IGraphJobRepository repository,
|
||||
GraphOverlayExecutionService executionService,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
ILogger<GraphOverlayBackgroundService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Graph overlay worker started.");
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
var jobs = await _repository.ListOverlayJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
|
||||
if (jobs.Count == 0)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var job in jobs)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
|
||||
LogResult(result);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception while processing graph overlay job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
|
||||
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Graph overlay worker encountered an error; backing off.");
|
||||
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Graph overlay worker stopping.");
|
||||
}
|
||||
|
||||
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
|
||||
{
|
||||
if (delay <= TimeSpan.Zero)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (TaskCanceledException)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
private void LogResult(GraphOverlayExecutionResult result)
|
||||
{
|
||||
switch (result.Type)
|
||||
{
|
||||
case GraphOverlayExecutionResultType.Completed:
|
||||
_logger.LogInformation(
|
||||
"Graph overlay job {JobId} completed (tenant={TenantId}).",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId);
|
||||
break;
|
||||
case GraphOverlayExecutionResultType.Failed:
|
||||
_logger.LogWarning(
|
||||
"Graph overlay job {JobId} failed (tenant={TenantId}): {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId,
|
||||
result.Reason ?? "unknown error");
|
||||
break;
|
||||
case GraphOverlayExecutionResultType.Skipped:
|
||||
_logger.LogDebug(
|
||||
"Graph overlay job {JobId} skipped: {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Reason ?? "no reason");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphOverlayBackgroundService : BackgroundService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly GraphOverlayExecutionService _executionService;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly ILogger<GraphOverlayBackgroundService> _logger;
|
||||
|
||||
public GraphOverlayBackgroundService(
|
||||
IGraphJobRepository repository,
|
||||
GraphOverlayExecutionService executionService,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
ILogger<GraphOverlayBackgroundService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Graph overlay worker started.");
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
var jobs = await _repository.ListOverlayJobsAsync(GraphJobStatus.Pending, graphOptions.BatchSize, stoppingToken).ConfigureAwait(false);
|
||||
if (jobs.Count == 0)
|
||||
{
|
||||
await DelayAsync(graphOptions.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var job in jobs)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
|
||||
LogResult(result);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception while processing graph overlay job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
|
||||
await DelayAsync(graphOptions.PollInterval, stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Graph overlay worker encountered an error; backing off.");
|
||||
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Graph overlay worker stopping.");
|
||||
}
|
||||
|
||||
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
|
||||
{
|
||||
if (delay <= TimeSpan.Zero)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (TaskCanceledException)
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
private void LogResult(GraphOverlayExecutionResult result)
|
||||
{
|
||||
switch (result.Type)
|
||||
{
|
||||
case GraphOverlayExecutionResultType.Completed:
|
||||
_logger.LogInformation(
|
||||
"Graph overlay job {JobId} completed (tenant={TenantId}).",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId);
|
||||
break;
|
||||
case GraphOverlayExecutionResultType.Failed:
|
||||
_logger.LogWarning(
|
||||
"Graph overlay job {JobId} failed (tenant={TenantId}): {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Job.TenantId,
|
||||
result.Reason ?? "unknown error");
|
||||
break;
|
||||
case GraphOverlayExecutionResultType.Skipped:
|
||||
_logger.LogDebug(
|
||||
"Graph overlay job {JobId} skipped: {Reason}.",
|
||||
result.Job.Id,
|
||||
result.Reason ?? "no reason");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,76 +1,76 @@
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Graph.Cartographer;
|
||||
using StellaOps.Scheduler.Worker.Graph.Scheduler;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphOverlayExecutionService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly ICartographerOverlayClient _overlayClient;
|
||||
private readonly IGraphJobCompletionClient _completionClient;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly SchedulerWorkerMetrics _metrics;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<GraphOverlayExecutionService> _logger;
|
||||
|
||||
public GraphOverlayExecutionService(
|
||||
IGraphJobRepository repository,
|
||||
ICartographerOverlayClient overlayClient,
|
||||
IGraphJobCompletionClient completionClient,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
SchedulerWorkerMetrics metrics,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<GraphOverlayExecutionService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_overlayClient = overlayClient ?? throw new ArgumentNullException(nameof(overlayClient));
|
||||
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<GraphOverlayExecutionResult> ExecuteAsync(GraphOverlayJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
return GraphOverlayExecutionResult.Skipped(job, "graph_processing_disabled");
|
||||
}
|
||||
|
||||
if (job.Status != GraphJobStatus.Pending)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
return GraphOverlayExecutionResult.Skipped(job, "status_not_pending");
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
GraphOverlayJob running;
|
||||
|
||||
try
|
||||
{
|
||||
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to transition graph overlay job {JobId} to running state.", job.Id);
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
return GraphOverlayExecutionResult.Skipped(job, "transition_invalid");
|
||||
}
|
||||
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Graph;
|
||||
|
||||
internal sealed class GraphOverlayExecutionService
|
||||
{
|
||||
private readonly IGraphJobRepository _repository;
|
||||
private readonly ICartographerOverlayClient _overlayClient;
|
||||
private readonly IGraphJobCompletionClient _completionClient;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly SchedulerWorkerMetrics _metrics;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<GraphOverlayExecutionService> _logger;
|
||||
|
||||
public GraphOverlayExecutionService(
|
||||
IGraphJobRepository repository,
|
||||
ICartographerOverlayClient overlayClient,
|
||||
IGraphJobCompletionClient completionClient,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
SchedulerWorkerMetrics metrics,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<GraphOverlayExecutionService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_overlayClient = overlayClient ?? throw new ArgumentNullException(nameof(overlayClient));
|
||||
_completionClient = completionClient ?? throw new ArgumentNullException(nameof(completionClient));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<GraphOverlayExecutionResult> ExecuteAsync(GraphOverlayJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
|
||||
var graphOptions = _options.Value.Graph;
|
||||
if (!graphOptions.Enabled)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
return GraphOverlayExecutionResult.Skipped(job, "graph_processing_disabled");
|
||||
}
|
||||
|
||||
if (job.Status != GraphJobStatus.Pending)
|
||||
{
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
return GraphOverlayExecutionResult.Skipped(job, "status_not_pending");
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
GraphOverlayJob running;
|
||||
|
||||
try
|
||||
{
|
||||
running = GraphJobStateMachine.EnsureTransition(job, GraphJobStatus.Running, now, attempts: job.Attempts + 1);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to transition graph overlay job {JobId} to running state.", job.Id);
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
return GraphOverlayExecutionResult.Skipped(job, "transition_invalid");
|
||||
}
|
||||
|
||||
if (!await _repository.TryReplaceOverlayAsync(running, job.Status, cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
_metrics.RecordGraphJobResult("overlay", "skipped");
|
||||
@@ -78,142 +78,142 @@ internal sealed class GraphOverlayExecutionService
|
||||
}
|
||||
|
||||
_metrics.RecordGraphJobStart("overlay", running.TenantId, running.GraphSnapshotId);
|
||||
|
||||
var attempt = 0;
|
||||
CartographerOverlayResult? lastResult = null;
|
||||
Exception? lastException = null;
|
||||
var backoff = graphOptions.RetryBackoff;
|
||||
|
||||
while (attempt < graphOptions.MaxAttempts)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
attempt++;
|
||||
|
||||
try
|
||||
{
|
||||
var response = await _overlayClient.StartOverlayAsync(running, cancellationToken).ConfigureAwait(false);
|
||||
lastResult = response;
|
||||
|
||||
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
var attempt = 0;
|
||||
CartographerOverlayResult? lastResult = null;
|
||||
Exception? lastException = null;
|
||||
var backoff = graphOptions.RetryBackoff;
|
||||
|
||||
while (attempt < graphOptions.MaxAttempts)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
attempt++;
|
||||
|
||||
try
|
||||
{
|
||||
var response = await _overlayClient.StartOverlayAsync(running, cancellationToken).ConfigureAwait(false);
|
||||
lastResult = response;
|
||||
|
||||
if (response.Status == GraphJobStatus.Completed || response.Status == GraphJobStatus.Cancelled || response.Status == GraphJobStatus.Running)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Completed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
var duration = completionTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("overlay", "completed", duration);
|
||||
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "completed", duration);
|
||||
return GraphOverlayExecutionResult.Completed(running, response.ResultUri);
|
||||
}
|
||||
|
||||
if (response.Status == GraphJobStatus.Failed)
|
||||
{
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
if (response.Status == GraphJobStatus.Failed)
|
||||
{
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error, cancellationToken).ConfigureAwait(false);
|
||||
var duration = completionTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("overlay", "failed", duration);
|
||||
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "failed", duration);
|
||||
return GraphOverlayExecutionResult.Failed(running, response.Error);
|
||||
}
|
||||
|
||||
_logger.LogWarning(
|
||||
"Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
|
||||
attempt,
|
||||
job.Id,
|
||||
backoff,
|
||||
response.Error ?? "unknown");
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the overlay.", cancellationToken).ConfigureAwait(false);
|
||||
|
||||
_logger.LogWarning(
|
||||
"Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay} (reason: {Reason}).",
|
||||
attempt,
|
||||
job.Id,
|
||||
backoff,
|
||||
response.Error ?? "unknown");
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, response.GraphSnapshotId ?? running.GraphSnapshotId, response.ResultUri, response.Error ?? "Cartographer did not complete the overlay.", cancellationToken).ConfigureAwait(false);
|
||||
var duration = completionTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("overlay", "failed", duration);
|
||||
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "failed", duration);
|
||||
return GraphOverlayExecutionResult.Failed(running, response.Error);
|
||||
}
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
lastException = ex;
|
||||
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
|
||||
_metrics.RecordGraphJobResult("overlay", "failed", completionTime - running.CreatedAt);
|
||||
return GraphOverlayExecutionResult.Failed(running, ex.Message);
|
||||
}
|
||||
|
||||
_logger.LogWarning(ex, "Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer overlay failed";
|
||||
var finalTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
lastException = ex;
|
||||
|
||||
if (attempt >= graphOptions.MaxAttempts)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, completionTime, running.GraphSnapshotId, null, ex.Message, cancellationToken).ConfigureAwait(false);
|
||||
_metrics.RecordGraphJobResult("overlay", "failed", completionTime - running.CreatedAt);
|
||||
return GraphOverlayExecutionResult.Failed(running, ex.Message);
|
||||
}
|
||||
|
||||
_logger.LogWarning(ex, "Cartographer overlay attempt {Attempt} failed for job {JobId}; retrying in {Delay}.", attempt, job.Id, backoff);
|
||||
await Task.Delay(backoff, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
var error = lastResult?.Error ?? lastException?.Message ?? "Cartographer overlay failed";
|
||||
var finalTime = _timeProvider.GetUtcNow();
|
||||
await NotifyCompletionAsync(running, GraphJobStatus.Failed, finalTime, lastResult?.GraphSnapshotId ?? running.GraphSnapshotId, lastResult?.ResultUri, error, cancellationToken).ConfigureAwait(false);
|
||||
var finalDuration = finalTime - running.CreatedAt;
|
||||
_metrics.RecordGraphJobResult("overlay", "failed", finalDuration);
|
||||
_metrics.RecordGraphJobCompletion("overlay", running.TenantId, running.GraphSnapshotId, "failed", finalDuration);
|
||||
return GraphOverlayExecutionResult.Failed(running, error);
|
||||
}
|
||||
|
||||
private async Task NotifyCompletionAsync(
|
||||
GraphOverlayJob job,
|
||||
GraphJobStatus status,
|
||||
DateTimeOffset occurredAt,
|
||||
string? graphSnapshotId,
|
||||
string? resultUri,
|
||||
string? error,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var dto = new GraphJobCompletionRequestDto(
|
||||
job.Id,
|
||||
"Overlay",
|
||||
status,
|
||||
occurredAt,
|
||||
graphSnapshotId ?? job.GraphSnapshotId,
|
||||
resultUri,
|
||||
job.CorrelationId,
|
||||
status == GraphJobStatus.Failed ? (error ?? "Cartographer overlay failed.") : null);
|
||||
|
||||
try
|
||||
{
|
||||
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
_logger.LogError(ex, "Failed notifying Scheduler completion for graph overlay job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
internal enum GraphOverlayExecutionResultType
|
||||
{
|
||||
Completed,
|
||||
Failed,
|
||||
Skipped
|
||||
}
|
||||
|
||||
internal readonly record struct GraphOverlayExecutionResult(
|
||||
GraphOverlayExecutionResultType Type,
|
||||
GraphOverlayJob Job,
|
||||
string? Reason = null,
|
||||
string? ResultUri = null)
|
||||
{
|
||||
public static GraphOverlayExecutionResult Completed(GraphOverlayJob job, string? resultUri)
|
||||
=> new(GraphOverlayExecutionResultType.Completed, job, ResultUri: resultUri);
|
||||
|
||||
public static GraphOverlayExecutionResult Failed(GraphOverlayJob job, string? error)
|
||||
=> new(GraphOverlayExecutionResultType.Failed, job, error);
|
||||
|
||||
public static GraphOverlayExecutionResult Skipped(GraphOverlayJob job, string reason)
|
||||
=> new(GraphOverlayExecutionResultType.Skipped, job, reason);
|
||||
}
|
||||
|
||||
private async Task NotifyCompletionAsync(
|
||||
GraphOverlayJob job,
|
||||
GraphJobStatus status,
|
||||
DateTimeOffset occurredAt,
|
||||
string? graphSnapshotId,
|
||||
string? resultUri,
|
||||
string? error,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var dto = new GraphJobCompletionRequestDto(
|
||||
job.Id,
|
||||
"Overlay",
|
||||
status,
|
||||
occurredAt,
|
||||
graphSnapshotId ?? job.GraphSnapshotId,
|
||||
resultUri,
|
||||
job.CorrelationId,
|
||||
status == GraphJobStatus.Failed ? (error ?? "Cartographer overlay failed.") : null);
|
||||
|
||||
try
|
||||
{
|
||||
await _completionClient.NotifyAsync(dto, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
|
||||
{
|
||||
_logger.LogError(ex, "Failed notifying Scheduler completion for graph overlay job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
internal enum GraphOverlayExecutionResultType
|
||||
{
|
||||
Completed,
|
||||
Failed,
|
||||
Skipped
|
||||
}
|
||||
|
||||
internal readonly record struct GraphOverlayExecutionResult(
|
||||
GraphOverlayExecutionResultType Type,
|
||||
GraphOverlayJob Job,
|
||||
string? Reason = null,
|
||||
string? ResultUri = null)
|
||||
{
|
||||
public static GraphOverlayExecutionResult Completed(GraphOverlayJob job, string? resultUri)
|
||||
=> new(GraphOverlayExecutionResultType.Completed, job, ResultUri: resultUri);
|
||||
|
||||
public static GraphOverlayExecutionResult Failed(GraphOverlayJob job, string? error)
|
||||
=> new(GraphOverlayExecutionResultType.Failed, job, error);
|
||||
|
||||
public static GraphOverlayExecutionResult Skipped(GraphOverlayJob job, string reason)
|
||||
=> new(GraphOverlayExecutionResultType.Skipped, job, reason);
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Planning;
|
||||
|
||||
@@ -2,8 +2,8 @@ using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Queue;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Services;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Services;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
|
||||
|
||||
@@ -1,188 +1,188 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
internal sealed class PolicyRunDispatchBackgroundService : BackgroundService
|
||||
{
|
||||
private readonly IPolicyRunJobRepository _repository;
|
||||
private readonly PolicyRunExecutionService _executionService;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<PolicyRunDispatchBackgroundService> _logger;
|
||||
private readonly string _leaseOwner;
|
||||
|
||||
public PolicyRunDispatchBackgroundService(
|
||||
IPolicyRunJobRepository repository,
|
||||
PolicyRunExecutionService executionService,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<PolicyRunDispatchBackgroundService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_leaseOwner = options.Value.Policy.Dispatch.LeaseOwner;
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Policy run dispatcher loop started with lease owner {LeaseOwner}.", _leaseOwner);
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var policyOptions = _options.Value.Policy;
|
||||
if (!policyOptions.Enabled)
|
||||
{
|
||||
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
var batch = await LeaseBatchAsync(policyOptions.Dispatch, stoppingToken).ConfigureAwait(false);
|
||||
if (batch.Count == 0)
|
||||
{
|
||||
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var job in batch)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
|
||||
LogResult(result);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception while processing policy run job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Policy run dispatcher encountered an error; backing off.");
|
||||
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Policy run dispatcher loop stopping.");
|
||||
}
|
||||
|
||||
private async Task<IReadOnlyList<PolicyRunJob>> LeaseBatchAsync(
|
||||
SchedulerWorkerOptions.PolicyOptions.DispatchOptions dispatchOptions,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var jobs = new List<PolicyRunJob>(dispatchOptions.BatchSize);
|
||||
for (var i = 0; i < dispatchOptions.BatchSize; i++)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
PolicyRunJob? leased;
|
||||
try
|
||||
{
|
||||
leased = await _repository
|
||||
.LeaseAsync(_leaseOwner, now, dispatchOptions.LeaseDuration, dispatchOptions.MaxAttempts, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to lease policy run job on attempt {Attempt}.", i + 1);
|
||||
break;
|
||||
}
|
||||
|
||||
if (leased is null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
jobs.Add(leased);
|
||||
}
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
private void LogResult(PolicyRunExecutionResult result)
|
||||
{
|
||||
switch (result.Type)
|
||||
{
|
||||
case PolicyRunExecutionResultType.Submitted:
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} submitted for tenant {TenantId} policy {PolicyId} (runId={RunId}).",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.UpdatedJob.RunId);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.Retrying:
|
||||
_logger.LogWarning(
|
||||
"Policy run job {JobId} will retry for tenant {TenantId} policy {PolicyId}: {Error}.",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.Error);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.Failed:
|
||||
_logger.LogError(
|
||||
"Policy run job {JobId} failed permanently for tenant {TenantId} policy {PolicyId}: {Error}.",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.Error);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.Cancelled:
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} cancelled for tenant {TenantId} policy {PolicyId}.",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.NoOp:
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} completed without submission for tenant {TenantId} policy {PolicyId} (reason={Reason}).",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.Error ?? "none");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
|
||||
{
|
||||
if (delay <= TimeSpan.Zero)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (TaskCanceledException)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
internal sealed class PolicyRunDispatchBackgroundService : BackgroundService
|
||||
{
|
||||
private readonly IPolicyRunJobRepository _repository;
|
||||
private readonly PolicyRunExecutionService _executionService;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<PolicyRunDispatchBackgroundService> _logger;
|
||||
private readonly string _leaseOwner;
|
||||
|
||||
public PolicyRunDispatchBackgroundService(
|
||||
IPolicyRunJobRepository repository,
|
||||
PolicyRunExecutionService executionService,
|
||||
IOptions<SchedulerWorkerOptions> options,
|
||||
TimeProvider? timeProvider,
|
||||
ILogger<PolicyRunDispatchBackgroundService> logger)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_executionService = executionService ?? throw new ArgumentNullException(nameof(executionService));
|
||||
_options = options ?? throw new ArgumentNullException(nameof(options));
|
||||
_timeProvider = timeProvider ?? TimeProvider.System;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_leaseOwner = options.Value.Policy.Dispatch.LeaseOwner;
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Policy run dispatcher loop started with lease owner {LeaseOwner}.", _leaseOwner);
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var policyOptions = _options.Value.Policy;
|
||||
if (!policyOptions.Enabled)
|
||||
{
|
||||
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
var batch = await LeaseBatchAsync(policyOptions.Dispatch, stoppingToken).ConfigureAwait(false);
|
||||
if (batch.Count == 0)
|
||||
{
|
||||
await DelayAsync(policyOptions.Dispatch.IdleDelay, stoppingToken).ConfigureAwait(false);
|
||||
continue;
|
||||
}
|
||||
|
||||
foreach (var job in batch)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _executionService.ExecuteAsync(job, stoppingToken).ConfigureAwait(false);
|
||||
LogResult(result);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Unhandled exception while processing policy run job {JobId}.", job.Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Policy run dispatcher encountered an error; backing off.");
|
||||
await DelayAsync(TimeSpan.FromSeconds(5), stoppingToken).ConfigureAwait(false);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Policy run dispatcher loop stopping.");
|
||||
}
|
||||
|
||||
private async Task<IReadOnlyList<PolicyRunJob>> LeaseBatchAsync(
|
||||
SchedulerWorkerOptions.PolicyOptions.DispatchOptions dispatchOptions,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
var jobs = new List<PolicyRunJob>(dispatchOptions.BatchSize);
|
||||
for (var i = 0; i < dispatchOptions.BatchSize; i++)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
PolicyRunJob? leased;
|
||||
try
|
||||
{
|
||||
leased = await _repository
|
||||
.LeaseAsync(_leaseOwner, now, dispatchOptions.LeaseDuration, dispatchOptions.MaxAttempts, cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
throw;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to lease policy run job on attempt {Attempt}.", i + 1);
|
||||
break;
|
||||
}
|
||||
|
||||
if (leased is null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
jobs.Add(leased);
|
||||
}
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
private void LogResult(PolicyRunExecutionResult result)
|
||||
{
|
||||
switch (result.Type)
|
||||
{
|
||||
case PolicyRunExecutionResultType.Submitted:
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} submitted for tenant {TenantId} policy {PolicyId} (runId={RunId}).",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.UpdatedJob.RunId);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.Retrying:
|
||||
_logger.LogWarning(
|
||||
"Policy run job {JobId} will retry for tenant {TenantId} policy {PolicyId}: {Error}.",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.Error);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.Failed:
|
||||
_logger.LogError(
|
||||
"Policy run job {JobId} failed permanently for tenant {TenantId} policy {PolicyId}: {Error}.",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.Error);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.Cancelled:
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} cancelled for tenant {TenantId} policy {PolicyId}.",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId);
|
||||
break;
|
||||
case PolicyRunExecutionResultType.NoOp:
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} completed without submission for tenant {TenantId} policy {PolicyId} (reason={Reason}).",
|
||||
result.UpdatedJob.Id,
|
||||
result.UpdatedJob.TenantId,
|
||||
result.UpdatedJob.PolicyId,
|
||||
result.Error ?? "none");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
private async Task DelayAsync(TimeSpan delay, CancellationToken cancellationToken)
|
||||
{
|
||||
if (delay <= TimeSpan.Zero)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(delay, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
catch (TaskCanceledException)
|
||||
{
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Mongo.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
internal sealed class PolicyRunExecutionService
|
||||
{
|
||||
private readonly IPolicyRunJobRepository _repository;
|
||||
using System;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Scheduler.Models;
|
||||
using StellaOps.Scheduler.Storage.Postgres.Repositories.Repositories;
|
||||
using StellaOps.Scheduler.Worker.Observability;
|
||||
using StellaOps.Scheduler.Worker.Options;
|
||||
|
||||
namespace StellaOps.Scheduler.Worker.Policy;
|
||||
|
||||
internal sealed class PolicyRunExecutionService
|
||||
{
|
||||
private readonly IPolicyRunJobRepository _repository;
|
||||
private readonly IPolicyRunClient _client;
|
||||
private readonly IOptions<SchedulerWorkerOptions> _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
@@ -40,31 +40,31 @@ internal sealed class PolicyRunExecutionService
|
||||
_webhookClient = webhookClient ?? throw new ArgumentNullException(nameof(webhookClient));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
public async Task<PolicyRunExecutionResult> ExecuteAsync(PolicyRunJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
if (job.CancellationRequested)
|
||||
{
|
||||
var cancelledAt = _timeProvider.GetUtcNow();
|
||||
var cancelled = job with
|
||||
{
|
||||
Status = PolicyRunJobStatus.Cancelled,
|
||||
CancelledAt = cancelledAt,
|
||||
UpdatedAt = cancelledAt,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
AvailableAt = cancelledAt
|
||||
};
|
||||
|
||||
var replaced = await _repository.ReplaceAsync(cancelled, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning("Failed to update cancelled policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
|
||||
public async Task<PolicyRunExecutionResult> ExecuteAsync(PolicyRunJob job, CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(job);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
if (job.CancellationRequested)
|
||||
{
|
||||
var cancelledAt = _timeProvider.GetUtcNow();
|
||||
var cancelled = job with
|
||||
{
|
||||
Status = PolicyRunJobStatus.Cancelled,
|
||||
CancelledAt = cancelledAt,
|
||||
UpdatedAt = cancelledAt,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
AvailableAt = cancelledAt
|
||||
};
|
||||
|
||||
var replaced = await _repository.ReplaceAsync(cancelled, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning("Failed to update cancelled policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
cancelled.TenantId,
|
||||
cancelled.PolicyId,
|
||||
@@ -83,38 +83,38 @@ internal sealed class PolicyRunExecutionService
|
||||
await _webhookClient.NotifyAsync(cancelledPayload, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return PolicyRunExecutionResult.Cancelled(cancelled);
|
||||
}
|
||||
|
||||
var targeting = await _targetingService
|
||||
.EnsureTargetsAsync(job, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (targeting.Status == PolicyRunTargetingStatus.NoWork)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
var completed = targeting.Job with
|
||||
{
|
||||
Status = PolicyRunJobStatus.Completed,
|
||||
CompletedAt = completionTime,
|
||||
UpdatedAt = completionTime,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
AvailableAt = completionTime,
|
||||
LastError = null
|
||||
};
|
||||
|
||||
var replaced = await _repository.ReplaceAsync(
|
||||
completed,
|
||||
job.LeaseOwner,
|
||||
cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning("Failed to persist no-work completion for policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
var latency = CalculateLatency(job, completionTime);
|
||||
}
|
||||
|
||||
var targeting = await _targetingService
|
||||
.EnsureTargetsAsync(job, cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (targeting.Status == PolicyRunTargetingStatus.NoWork)
|
||||
{
|
||||
var completionTime = _timeProvider.GetUtcNow();
|
||||
var completed = targeting.Job with
|
||||
{
|
||||
Status = PolicyRunJobStatus.Completed,
|
||||
CompletedAt = completionTime,
|
||||
UpdatedAt = completionTime,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
AvailableAt = completionTime,
|
||||
LastError = null
|
||||
};
|
||||
|
||||
var replaced = await _repository.ReplaceAsync(
|
||||
completed,
|
||||
job.LeaseOwner,
|
||||
cancellationToken: cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning("Failed to persist no-work completion for policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
var latency = CalculateLatency(job, completionTime);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
completed.TenantId,
|
||||
completed.PolicyId,
|
||||
@@ -132,85 +132,85 @@ internal sealed class PolicyRunExecutionService
|
||||
await _webhookClient.NotifyAsync(completedPayload, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return PolicyRunExecutionResult.NoOp(completed, targeting.Reason);
|
||||
}
|
||||
|
||||
job = targeting.Job;
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var request = job.ToPolicyRunRequest(now);
|
||||
var submission = await _client.SubmitAsync(job, request, cancellationToken).ConfigureAwait(false);
|
||||
var dispatchOptions = _options.Value.Policy.Dispatch;
|
||||
var attemptCount = job.AttemptCount + 1;
|
||||
|
||||
if (submission.Success)
|
||||
{
|
||||
var updated = job with
|
||||
{
|
||||
Status = PolicyRunJobStatus.Submitted,
|
||||
RunId = submission.RunId ?? job.RunId,
|
||||
SubmittedAt = submission.QueuedAt ?? now,
|
||||
UpdatedAt = now,
|
||||
AttemptCount = attemptCount,
|
||||
LastAttemptAt = now,
|
||||
LastError = null,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
AvailableAt = now
|
||||
};
|
||||
|
||||
var replaced = await _repository.ReplaceAsync(updated, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning("Failed to persist submitted policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
var latency = CalculateLatency(job, now);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
updated.TenantId,
|
||||
updated.PolicyId,
|
||||
updated.Mode,
|
||||
"submitted",
|
||||
latency);
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} submitted (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempts={Attempts}).",
|
||||
updated.Id,
|
||||
updated.TenantId,
|
||||
updated.PolicyId,
|
||||
updated.RunId ?? "(pending)",
|
||||
attemptCount);
|
||||
|
||||
return PolicyRunExecutionResult.Submitted(updated);
|
||||
}
|
||||
|
||||
var nextStatus = attemptCount >= dispatchOptions.MaxAttempts
|
||||
? PolicyRunJobStatus.Failed
|
||||
: PolicyRunJobStatus.Pending;
|
||||
var nextAvailable = nextStatus == PolicyRunJobStatus.Pending
|
||||
? now.Add(dispatchOptions.RetryBackoff)
|
||||
: now;
|
||||
|
||||
var failedJob = job with
|
||||
{
|
||||
Status = nextStatus,
|
||||
AttemptCount = attemptCount,
|
||||
LastAttemptAt = now,
|
||||
LastError = submission.Error,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
UpdatedAt = now,
|
||||
AvailableAt = nextAvailable
|
||||
};
|
||||
|
||||
var updateSuccess = await _repository.ReplaceAsync(failedJob, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (!updateSuccess)
|
||||
{
|
||||
_logger.LogWarning("Failed to update policy run job {JobId} after submission failure.", job.Id);
|
||||
}
|
||||
|
||||
var latencyForFailure = CalculateLatency(job, now);
|
||||
var reason = string.IsNullOrWhiteSpace(submission.Error) ? null : submission.Error;
|
||||
|
||||
if (nextStatus == PolicyRunJobStatus.Failed)
|
||||
{
|
||||
}
|
||||
|
||||
job = targeting.Job;
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var request = job.ToPolicyRunRequest(now);
|
||||
var submission = await _client.SubmitAsync(job, request, cancellationToken).ConfigureAwait(false);
|
||||
var dispatchOptions = _options.Value.Policy.Dispatch;
|
||||
var attemptCount = job.AttemptCount + 1;
|
||||
|
||||
if (submission.Success)
|
||||
{
|
||||
var updated = job with
|
||||
{
|
||||
Status = PolicyRunJobStatus.Submitted,
|
||||
RunId = submission.RunId ?? job.RunId,
|
||||
SubmittedAt = submission.QueuedAt ?? now,
|
||||
UpdatedAt = now,
|
||||
AttemptCount = attemptCount,
|
||||
LastAttemptAt = now,
|
||||
LastError = null,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
AvailableAt = now
|
||||
};
|
||||
|
||||
var replaced = await _repository.ReplaceAsync(updated, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (!replaced)
|
||||
{
|
||||
_logger.LogWarning("Failed to persist submitted policy run job {JobId}.", job.Id);
|
||||
}
|
||||
|
||||
var latency = CalculateLatency(job, now);
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
updated.TenantId,
|
||||
updated.PolicyId,
|
||||
updated.Mode,
|
||||
"submitted",
|
||||
latency);
|
||||
_logger.LogInformation(
|
||||
"Policy run job {JobId} submitted (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempts={Attempts}).",
|
||||
updated.Id,
|
||||
updated.TenantId,
|
||||
updated.PolicyId,
|
||||
updated.RunId ?? "(pending)",
|
||||
attemptCount);
|
||||
|
||||
return PolicyRunExecutionResult.Submitted(updated);
|
||||
}
|
||||
|
||||
var nextStatus = attemptCount >= dispatchOptions.MaxAttempts
|
||||
? PolicyRunJobStatus.Failed
|
||||
: PolicyRunJobStatus.Pending;
|
||||
var nextAvailable = nextStatus == PolicyRunJobStatus.Pending
|
||||
? now.Add(dispatchOptions.RetryBackoff)
|
||||
: now;
|
||||
|
||||
var failedJob = job with
|
||||
{
|
||||
Status = nextStatus,
|
||||
AttemptCount = attemptCount,
|
||||
LastAttemptAt = now,
|
||||
LastError = submission.Error,
|
||||
LeaseOwner = null,
|
||||
LeaseExpiresAt = null,
|
||||
UpdatedAt = now,
|
||||
AvailableAt = nextAvailable
|
||||
};
|
||||
|
||||
var updateSuccess = await _repository.ReplaceAsync(failedJob, job.LeaseOwner, cancellationToken: cancellationToken).ConfigureAwait(false);
|
||||
if (!updateSuccess)
|
||||
{
|
||||
_logger.LogWarning("Failed to update policy run job {JobId} after submission failure.", job.Id);
|
||||
}
|
||||
|
||||
var latencyForFailure = CalculateLatency(job, now);
|
||||
var reason = string.IsNullOrWhiteSpace(submission.Error) ? null : submission.Error;
|
||||
|
||||
if (nextStatus == PolicyRunJobStatus.Failed)
|
||||
{
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
@@ -233,31 +233,31 @@ internal sealed class PolicyRunExecutionService
|
||||
await _webhookClient.NotifyAsync(failedPayload, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
return PolicyRunExecutionResult.Failed(failedJob, submission.Error);
|
||||
}
|
||||
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.Mode,
|
||||
"retry",
|
||||
latencyForFailure,
|
||||
reason);
|
||||
_logger.LogWarning(
|
||||
"Policy run job {JobId} retry scheduled (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempt={Attempt}). Error: {Error}",
|
||||
failedJob.Id,
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.RunId ?? "(pending)",
|
||||
attemptCount,
|
||||
submission.Error ?? "unknown");
|
||||
|
||||
return PolicyRunExecutionResult.Retrying(failedJob, submission.Error);
|
||||
}
|
||||
|
||||
private static TimeSpan CalculateLatency(PolicyRunJob job, DateTimeOffset now)
|
||||
{
|
||||
var origin = job.QueuedAt ?? job.CreatedAt;
|
||||
var latency = now - origin;
|
||||
return latency < TimeSpan.Zero ? TimeSpan.Zero : latency;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_metrics.RecordPolicyRunEvent(
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.Mode,
|
||||
"retry",
|
||||
latencyForFailure,
|
||||
reason);
|
||||
_logger.LogWarning(
|
||||
"Policy run job {JobId} retry scheduled (tenant={TenantId}, policy={PolicyId}, runId={RunId}, attempt={Attempt}). Error: {Error}",
|
||||
failedJob.Id,
|
||||
failedJob.TenantId,
|
||||
failedJob.PolicyId,
|
||||
failedJob.RunId ?? "(pending)",
|
||||
attemptCount,
|
||||
submission.Error ?? "unknown");
|
||||
|
||||
return PolicyRunExecutionResult.Retrying(failedJob, submission.Error);
|
||||
}
|
||||
|
||||
private static TimeSpan CalculateLatency(PolicyRunJob job, DateTimeOffset now)
|
||||
{
|
||||
var origin = job.QueuedAt ?? job.CreatedAt;
|
||||
var latency = now - origin;
|
||||
return latency < TimeSpan.Zero ? TimeSpan.Zero : latency;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../StellaOps.Scheduler.ImpactIndex/StellaOps.Scheduler.ImpactIndex.csproj" />
|
||||
<ProjectReference Include="../StellaOps.Scheduler.Models/StellaOps.Scheduler.Models.csproj" />
|
||||
<ProjectReference Include="../StellaOps.Scheduler.Storage.Mongo/StellaOps.Scheduler.Storage.Mongo.csproj" />
|
||||
<ProjectReference Include="../StellaOps.Scheduler.Storage.Postgres/StellaOps.Scheduler.Storage.Postgres.csproj" />
|
||||
<ProjectReference Include="../StellaOps.Scheduler.Queue/StellaOps.Scheduler.Queue.csproj" />
|
||||
<ProjectReference Include="../../../Notify/__Libraries/StellaOps.Notify.Models/StellaOps.Notify.Models.csproj" />
|
||||
<ProjectReference Include="../../../Notify/__Libraries/StellaOps.Notify.Queue/StellaOps.Notify.Queue.csproj" />
|
||||
|
||||
Reference in New Issue
Block a user