consolidation of some of the modules, localization fixes, product advisories work, qa work
This commit is contained in:
@@ -0,0 +1,234 @@
|
||||
using Microsoft.Extensions.Options;
|
||||
using OpenTelemetry.Metrics;
|
||||
using OpenTelemetry.Trace;
|
||||
using StellaOps.AirGap.Policy;
|
||||
using StellaOps.Infrastructure.Postgres.Options;
|
||||
using StellaOps.TaskRunner.Core.Configuration;
|
||||
using StellaOps.TaskRunner.Core.Execution;
|
||||
using StellaOps.TaskRunner.Core.Execution.Simulation;
|
||||
using StellaOps.TaskRunner.Infrastructure.Execution;
|
||||
using StellaOps.TaskRunner.Persistence.Postgres;
|
||||
using StellaOps.TaskRunner.Persistence.Postgres.Repositories;
|
||||
using StellaOps.TaskRunner.Worker.Services;
|
||||
using StellaOps.Telemetry.Core;
|
||||
using StellaOps.Worker.Health;
|
||||
|
||||
// Composition root for the TaskRunner worker host: binds options, registers the dispatcher,
// notification publisher, execution services, telemetry, state stores and artifact/provenance
// writers, then builds the app, maps the worker health endpoints and runs it.
var builder = WebApplication.CreateSlimBuilder(args);
var appConfiguration = builder.Configuration;
var serviceCollection = builder.Services;

// Egress policy, bound option sections, the named notification HTTP client and the system clock.
serviceCollection.AddAirGapEgressPolicy(appConfiguration, sectionName: "AirGap");
serviceCollection.Configure<PackRunWorkerOptions>(appConfiguration.GetSection("Worker"));
serviceCollection.Configure<NotificationOptions>(appConfiguration.GetSection("Notifications"));
serviceCollection.AddHttpClient("taskrunner-notifications");
serviceCollection.AddSingleton(TimeProvider.System);

// One filesystem dispatcher instance is exposed through both the dispatcher and scheduler interfaces.
serviceCollection.AddSingleton(provider =>
{
    var optionsAccessor = provider.GetRequiredService<IOptions<PackRunWorkerOptions>>();
    var policy = provider.GetRequiredService<IEgressPolicy>();
    var workerOptions = optionsAccessor.Value;
    return new FilesystemPackRunDispatcher(workerOptions.QueuePath, workerOptions.ArchivePath, policy);
});
serviceCollection.AddSingleton<IPackRunJobDispatcher>(provider => provider.GetRequiredService<FilesystemPackRunDispatcher>());
serviceCollection.AddSingleton<IPackRunJobScheduler>(provider => provider.GetRequiredService<FilesystemPackRunDispatcher>());

// The HTTP publisher is used when either notification endpoint is configured; otherwise the logging publisher.
serviceCollection.AddSingleton<IPackRunNotificationPublisher>(provider =>
{
    var notificationSettings = provider.GetRequiredService<IOptions<NotificationOptions>>().Value;
    var noEndpointConfigured = notificationSettings.ApprovalEndpoint is null && notificationSettings.PolicyEndpoint is null;
    if (noEndpointConfigured)
    {
        return new LoggingPackRunNotificationPublisher(
            provider.GetRequiredService<ILogger<LoggingPackRunNotificationPublisher>>());
    }

    return new HttpPackRunNotificationPublisher(
        provider.GetRequiredService<IHttpClientFactory>(),
        provider.GetRequiredService<IOptions<NotificationOptions>>(),
        provider.GetRequiredService<ILogger<HttpPackRunNotificationPublisher>>());
});

// Execution pipeline services.
serviceCollection.AddSingleton<IPackRunStepExecutor, BundleIngestionStepExecutor>();
serviceCollection.AddSingleton<PackRunExecutionGraphBuilder>();
serviceCollection.AddSingleton<PackRunSimulationEngine>();
serviceCollection.AddSingleton<PackRunProcessor>();

// Telemetry: HTTP client tracing plus runtime metrics and the TaskRunner meter.
serviceCollection.AddStellaOpsTelemetry(
    appConfiguration,
    serviceName: "StellaOps.TaskRunner.Worker",
    configureTracing: tracing => tracing.AddHttpClientInstrumentation(),
    configureMetrics: metrics => metrics
        .AddRuntimeInstrumentation()
        .AddMeter(TaskRunnerTelemetry.MeterName));

// Storage: choose the state-store driver and validate the object-store settings at startup.
var developmentMode = builder.Environment.IsDevelopment();
var selectedDriver = ResolveStorageDriver(appConfiguration, "TaskRunner");
RegisterStateStores(serviceCollection, appConfiguration, developmentMode, selectedDriver);
ValidateObjectStoreContract(appConfiguration, developmentMode, "TaskRunner");

// Artifact uploader and provenance writer are both rooted at the resolved seed-fs path.
serviceCollection.AddSingleton<IPackRunArtifactUploader>(provider =>
{
    var workerOptions = provider.GetRequiredService<IOptions<PackRunWorkerOptions>>().Value;
    var clock = provider.GetRequiredService<TimeProvider>();
    var uploaderLogger = provider.GetRequiredService<ILogger<FilesystemPackRunArtifactUploader>>();
    var resolvedConfiguration = provider.GetRequiredService<IConfiguration>();
    var rootPath = ResolveSeedFsRootPath(resolvedConfiguration, "TaskRunner", workerOptions.ArtifactsPath);
    return new FilesystemPackRunArtifactUploader(rootPath, clock, uploaderLogger);
});
serviceCollection.AddSingleton<IPackRunProvenanceWriter>(provider =>
{
    var workerOptions = provider.GetRequiredService<IOptions<PackRunWorkerOptions>>().Value;
    var clock = provider.GetRequiredService<TimeProvider>();
    var resolvedConfiguration = provider.GetRequiredService<IConfiguration>();
    var rootPath = ResolveSeedFsRootPath(resolvedConfiguration, "TaskRunner", workerOptions.ArtifactsPath);
    return new FilesystemPackRunProvenanceWriter(rootPath, clock);
});

serviceCollection.AddHostedService<PackRunWorkerService>();

serviceCollection.AddWorkerHealthChecks();

var app = builder.Build();
app.MapWorkerHealthEndpoints();
app.Run();
|
||||
|
||||
// Registers the run-state, approval and log stores for the selected storage driver.
// Supported drivers (case-insensitive): postgres, filesystem, inmemory.
// For postgres with no connection string: throws outside development, and falls back to the
// filesystem stores in development. Any other driver name throws InvalidOperationException.
static void RegisterStateStores(IServiceCollection services, IConfiguration configuration, bool isDevelopment, string storageDriver)
{
    bool DriverIs(string name) => string.Equals(storageDriver, name, StringComparison.OrdinalIgnoreCase);

    if (DriverIs("filesystem"))
    {
        RegisterFilesystemStateStores(services);
        return;
    }

    if (DriverIs("inmemory"))
    {
        services.AddSingleton<IPackRunStateStore, InMemoryPackRunStateStore>();
        services.AddSingleton<IPackRunApprovalStore, InMemoryPackRunApprovalStore>();
        services.AddSingleton<IPackRunLogStore, InMemoryPackRunLogStore>();
        return;
    }

    if (!DriverIs("postgres"))
    {
        throw new InvalidOperationException(
            $"Unsupported TaskRunner storage driver '{storageDriver}'. Allowed values: postgres, filesystem, inmemory.");
    }

    var postgresConnection = ResolvePostgresConnectionString(configuration, "TaskRunner");
    if (!string.IsNullOrWhiteSpace(postgresConnection))
    {
        services.Configure<PostgresOptions>(postgres =>
        {
            postgres.ConnectionString = postgresConnection;
            // The schema name is resolved when the options are configured, not at registration time.
            postgres.SchemaName = ResolveSchemaName(configuration, "TaskRunner") ?? TaskRunnerDataSource.DefaultSchemaName;
        });
        services.AddSingleton<TaskRunnerDataSource>();
        services.AddSingleton<IPackRunStateStore, PostgresPackRunStateStore>();
        services.AddSingleton<IPackRunApprovalStore, PostgresPackRunApprovalStore>();
        services.AddSingleton<IPackRunLogStore, PostgresPackRunLogStore>();
        return;
    }

    if (!isDevelopment)
    {
        throw new InvalidOperationException(
            "TaskRunner worker requires PostgreSQL connection settings in non-development mode. " +
            "Set ConnectionStrings:Default or TaskRunner:Storage:Postgres:ConnectionString.");
    }

    // Development only: no connection string was found, so use the filesystem stores.
    RegisterFilesystemStateStores(services);
}
|
||||
|
||||
// Registers file-backed approval, state and log stores, each rooted at its path from PackRunWorkerOptions.
static void RegisterFilesystemStateStores(IServiceCollection services)
{
    static PackRunWorkerOptions WorkerOptionsFrom(IServiceProvider provider)
        => provider.GetRequiredService<IOptions<PackRunWorkerOptions>>().Value;

    services.AddSingleton<IPackRunApprovalStore>(
        provider => new FilePackRunApprovalStore(WorkerOptionsFrom(provider).ApprovalStorePath));
    services.AddSingleton<IPackRunStateStore>(
        provider => new FilePackRunStateStore(WorkerOptionsFrom(provider).RunStatePath));
    services.AddSingleton<IPackRunLogStore>(
        provider => new FilePackRunLogStore(WorkerOptionsFrom(provider).LogsPath));
}
|
||||
|
||||
// Returns the configured storage driver name, defaulting to "postgres" when neither key is set.
// NOTE(review): the global "Storage:Driver" key is consulted before the service-scoped key here,
// whereas the other resolvers in this file prefer the service-scoped key — confirm this is intended.
static string ResolveStorageDriver(IConfiguration configuration, string serviceName)
    => FirstNonEmpty(
           configuration["Storage:Driver"],
           configuration[$"{serviceName}:Storage:Driver"])
       ?? "postgres";
|
||||
|
||||
// Returns the first non-blank PostgreSQL connection string among the supported keys, or null.
// Keys are tried in the order listed below: service-scoped keys first, "ConnectionStrings:Default" last.
static string? ResolvePostgresConnectionString(IConfiguration configuration, string serviceName)
    => FirstNonEmpty(
        configuration[$"{serviceName}:Storage:Postgres:ConnectionString"],
        configuration["Storage:Postgres:ConnectionString"],
        configuration[$"Postgres:{serviceName}:ConnectionString"],
        configuration[$"ConnectionStrings:{serviceName}"],
        configuration["ConnectionStrings:Default"]);
|
||||
|
||||
// Returns the first non-blank PostgreSQL schema name among the supported keys, or null when none is set.
static string? ResolveSchemaName(IConfiguration configuration, string serviceName)
    => FirstNonEmpty(
        configuration[$"{serviceName}:Storage:Postgres:Schema"],
        configuration["Storage:Postgres:Schema"],
        configuration[$"Postgres:{serviceName}:SchemaName"]);
|
||||
|
||||
// Validates the object-store settings at startup.
// Throws InvalidOperationException when the driver is neither "seed-fs" nor "rustfs"
// (case-insensitive), or when "rustfs" is selected outside development without a BaseUrl.
static void ValidateObjectStoreContract(IConfiguration configuration, bool isDevelopment, string serviceName)
{
    var driver = ResolveObjectStoreDriver(configuration, serviceName);
    var usesSeedFs = string.Equals(driver, "seed-fs", StringComparison.OrdinalIgnoreCase);
    var usesRustFs = string.Equals(driver, "rustfs", StringComparison.OrdinalIgnoreCase);

    if (!(usesSeedFs || usesRustFs))
    {
        throw new InvalidOperationException(
            $"Unsupported object store driver '{driver}' for {serviceName}. Allowed values: seed-fs, rustfs.");
    }

    // The BaseUrl requirement applies only to rustfs, and only outside development.
    if (!usesRustFs || isDevelopment)
    {
        return;
    }

    var baseUrl = FirstNonEmpty(
        configuration[$"{serviceName}:Storage:ObjectStore:RustFs:BaseUrl"],
        configuration["Storage:ObjectStore:RustFs:BaseUrl"]);

    if (string.IsNullOrWhiteSpace(baseUrl))
    {
        throw new InvalidOperationException(
            $"RustFS object store is configured for {serviceName}, but BaseUrl is missing.");
    }
}
|
||||
|
||||
// Returns the configured object-store driver (service-scoped key first, then the global key),
// defaulting to "seed-fs" when neither is set.
static string ResolveObjectStoreDriver(IConfiguration configuration, string serviceName)
    => FirstNonEmpty(
           configuration[$"{serviceName}:Storage:ObjectStore:Driver"],
           configuration["Storage:ObjectStore:Driver"])
       ?? "seed-fs";
|
||||
|
||||
// Returns the seed-fs root path: the first non-blank value among the configuration keys below,
// or fallbackPath when none of them is set.
static string ResolveSeedFsRootPath(IConfiguration configuration, string serviceName, string fallbackPath)
    => FirstNonEmpty(
           configuration[$"{serviceName}:Storage:ObjectStore:SeedFs:RootPath"],
           configuration["Storage:ObjectStore:SeedFs:RootPath"],
           configuration[$"{serviceName}:Worker:ArtifactsPath"])
       ?? fallbackPath;
|
||||
|
||||
// Returns the first value that is not null, empty or whitespace-only, unchanged (no trimming).
// Returns null when every value is blank or when no values are supplied.
static string? FirstNonEmpty(params string?[] values)
{
    for (var index = 0; index < values.Length; index++)
    {
        var candidate = values[index];
        if (string.IsNullOrWhiteSpace(candidate))
        {
            continue;
        }

        return candidate;
    }

    return null;
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"$schema": "https://json.schemastore.org/launchsettings.json",
|
||||
"profiles": {
|
||||
"StellaOps.TaskRunner.Worker": {
|
||||
"commandName": "Project",
|
||||
"dotnetRunMessages": true,
|
||||
"environmentVariables": {
|
||||
"DOTNET_ENVIRONMENT": "Development"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,657 @@
|
||||
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.TaskRunner.Core.Configuration;
|
||||
using StellaOps.TaskRunner.Core.Execution;
|
||||
using StellaOps.TaskRunner.Core.Execution.Simulation;
|
||||
using StellaOps.TaskRunner.Core.Planning;
|
||||
using StellaOps.TaskRunner.Infrastructure.Execution;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.ObjectModel;
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
using System.Globalization;
|
||||
using System.Text.Json.Nodes;
|
||||
|
||||
namespace StellaOps.TaskRunner.Worker.Services;
|
||||
|
||||
public sealed class PackRunWorkerService : BackgroundService
|
||||
{
|
||||
private const string ChildFailureReason = "child-failure";
|
||||
private const string AwaitingRetryReason = "awaiting-retry";
|
||||
|
||||
private readonly IPackRunJobDispatcher dispatcher;
|
||||
private readonly PackRunProcessor processor;
|
||||
private readonly PackRunWorkerOptions options;
|
||||
private readonly IPackRunStateStore stateStore;
|
||||
private readonly PackRunExecutionGraphBuilder graphBuilder;
|
||||
private readonly PackRunSimulationEngine simulationEngine;
|
||||
private readonly IPackRunStepExecutor executor;
|
||||
private readonly IPackRunArtifactUploader artifactUploader;
|
||||
private readonly IPackRunProvenanceWriter provenanceWriter;
|
||||
private readonly IPackRunLogStore logStore;
|
||||
private readonly TimeProvider timeProvider;
|
||||
private readonly ILogger<PackRunWorkerService> logger;
|
||||
private readonly UpDownCounter<long> runningSteps;
|
||||
|
||||
/// <summary>
/// Creates the worker service and captures its collaborators.
/// When the dispatcher is a <see cref="FilesystemPackRunDispatcher"/>, also registers the
/// <c>taskrunner.queue.depth</c> observable gauge, which reports the number of <c>*.json</c>
/// files directly inside the dispatcher's queue directory (0 when the directory does not exist).
/// </summary>
/// <exception cref="ArgumentNullException">
/// Thrown when any dependency other than <paramref name="timeProvider"/> is null, or when
/// <paramref name="options"/> has no value. A null <paramref name="timeProvider"/> falls back
/// to <see cref="TimeProvider.System"/>.
/// </exception>
public PackRunWorkerService(
    IPackRunJobDispatcher dispatcher,
    PackRunProcessor processor,
    IPackRunStateStore stateStore,
    PackRunExecutionGraphBuilder graphBuilder,
    PackRunSimulationEngine simulationEngine,
    IPackRunStepExecutor executor,
    IPackRunArtifactUploader artifactUploader,
    IPackRunProvenanceWriter provenanceWriter,
    IPackRunLogStore logStore,
    IOptions<PackRunWorkerOptions> options,
    TimeProvider timeProvider,
    ILogger<PackRunWorkerService> logger)
{
    // Guards run in the same order as the parameters are consumed below.
    ArgumentNullException.ThrowIfNull(dispatcher);
    ArgumentNullException.ThrowIfNull(processor);
    ArgumentNullException.ThrowIfNull(stateStore);
    ArgumentNullException.ThrowIfNull(graphBuilder);
    ArgumentNullException.ThrowIfNull(simulationEngine);
    ArgumentNullException.ThrowIfNull(executor);
    ArgumentNullException.ThrowIfNull(artifactUploader);
    ArgumentNullException.ThrowIfNull(provenanceWriter);
    ArgumentNullException.ThrowIfNull(logStore);
    var workerOptions = options?.Value ?? throw new ArgumentNullException(nameof(options));
    ArgumentNullException.ThrowIfNull(logger);

    this.dispatcher = dispatcher;
    this.processor = processor;
    this.stateStore = stateStore;
    this.graphBuilder = graphBuilder;
    this.simulationEngine = simulationEngine;
    this.executor = executor;
    this.artifactUploader = artifactUploader;
    this.provenanceWriter = provenanceWriter;
    this.logStore = logStore;
    this.options = workerOptions;
    this.timeProvider = timeProvider ?? TimeProvider.System;
    this.logger = logger;
    runningSteps = TaskRunnerTelemetry.RunningSteps;

    if (dispatcher is not FilesystemPackRunDispatcher fsDispatcher)
    {
        return;
    }

    TaskRunnerTelemetry.Meter.CreateObservableGauge<long>(
        "taskrunner.queue.depth",
        () =>
        {
            if (!Directory.Exists(fsDispatcher.QueuePath))
            {
                return new Measurement<long>(0L);
            }

            var queuedFiles = Directory.GetFiles(fsDispatcher.QueuePath, "*.json", SearchOption.TopDirectoryOnly);
            return new Measurement<long>(queuedFiles.LongLength);
        });
}
|
||||
|
||||
/// <summary>
/// Worker loop: dequeues pack runs and processes them one at a time until
/// <paramref name="stoppingToken"/> is cancelled. When nothing is queued, the loop waits for
/// <c>options.IdleDelay</c> before polling again.
/// </summary>
/// <remarks>
/// A failure while processing a run is logged and recorded as a <c>run.failed</c> log entry,
/// and the loop then moves on to the next run. Writing that entry is itself guarded, so a
/// log-store failure while reporting an error cannot escape this method and end the loop.
/// </remarks>
/// <param name="stoppingToken">Signals host shutdown.</param>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    while (!stoppingToken.IsCancellationRequested)
    {
        var context = await dispatcher.TryDequeueAsync(stoppingToken).ConfigureAwait(false);
        if (context is null)
        {
            await Task.Delay(options.IdleDelay, stoppingToken).ConfigureAwait(false);
            continue;
        }

        try
        {
            await ProcessRunAsync(context, stoppingToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            break;
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Unhandled exception while processing run {RunId}.", context.RunId);
            var metadata = new Dictionary<string, string>(StringComparer.Ordinal)
            {
                ["exceptionType"] = ex.GetType().FullName ?? ex.GetType().Name
            };

            try
            {
                await AppendLogAsync(
                    context.RunId,
                    "error",
                    "run.failed",
                    "Unhandled exception while processing run.",
                    stoppingToken,
                    metadata: metadata).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown was requested while recording the failure; leave the loop quietly.
                break;
            }
            catch (Exception logFailure)
            {
                // The original failure is already in the service log above; failing to persist
                // the run log entry must not take the whole worker down.
                logger.LogError(logFailure, "Failed to record failure log entry for run {RunId}.", context.RunId);
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Processes one dequeued run: records receipt, hands the run to the processor, loads the stored
/// state (or creates it when missing or when the plan hash differs), applies gate results,
/// executes the graph and persists the outcome.
/// </summary>
/// <remarks>
/// Returns early, without executing steps, when the processor result says not to resume
/// immediately or when the gate update reports a blocking failure. Artifacts are uploaded and
/// provenance is written only when every step ended as Succeeded or Skipped.
/// </remarks>
private async Task ProcessRunAsync(PackRunExecutionContext context, CancellationToken cancellationToken)
{
    var runId = context.RunId;

    // Shorthand for run-level log entries (no step id).
    Task RecordAsync(string level, string eventType, string message, IReadOnlyDictionary<string, string>? metadata = null)
        => AppendLogAsync(runId, level, eventType, message, cancellationToken, metadata: metadata);

    logger.LogInformation("Processing pack run {RunId}.", runId);

    var receivedMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
    {
        ["planHash"] = context.Plan.Hash
    };
    await RecordAsync("info", "run.received", "Run dequeued by worker.", receivedMetadata).ConfigureAwait(false);

    var processorResult = await processor.ProcessNewRunAsync(context, cancellationToken).ConfigureAwait(false);
    var graph = graphBuilder.Build(context.Plan);

    var state = await stateStore.GetAsync(runId, cancellationToken).ConfigureAwait(false);
    var planMatchesStoredState = state is not null && string.Equals(state.PlanHash, context.Plan.Hash, StringComparison.Ordinal);
    if (!planMatchesStoredState)
    {
        state = await CreateInitialStateAsync(context, graph, cancellationToken).ConfigureAwait(false);
    }

    if (!processorResult.ShouldResumeImmediately)
    {
        logger.LogInformation("Run {RunId} awaiting approvals or policy gates.", runId);
        await RecordAsync("info", "run.awaiting-approvals", "Run paused awaiting approvals or policy gates.").ConfigureAwait(false);
        return;
    }

    var gateUpdate = PackRunGateStateUpdater.Apply(state, graph, processorResult.ApprovalCoordinator, timeProvider.GetUtcNow());
    state = gateUpdate.State;

    if (gateUpdate.HasBlockingFailure)
    {
        await stateStore.SaveAsync(state, cancellationToken).ConfigureAwait(false);
        logger.LogWarning("Run {RunId} halted because a gate failed.", runId);
        await RecordAsync("warn", "run.gate-blocked", "Run halted because a gate failed.").ConfigureAwait(false);
        return;
    }

    var finalState = await ExecuteGraphAsync(context, graph, state, cancellationToken).ConfigureAwait(false);
    await stateStore.SaveAsync(finalState, cancellationToken).ConfigureAwait(false);

    var hasUnfinishedSteps = finalState.Steps.Values.Any(
        step => step.Status is not (PackRunStepExecutionStatus.Succeeded or PackRunStepExecutionStatus.Skipped));
    if (hasUnfinishedSteps)
    {
        logger.LogInformation("Run {RunId} paused with pending work.", runId);
        await RecordAsync("info", "run.paused", "Run paused with pending work.").ConfigureAwait(false);
        return;
    }

    logger.LogInformation("Run {RunId} finished successfully.", runId);
    await RecordAsync("info", "run.completed", "Run finished successfully.").ConfigureAwait(false);
    await artifactUploader.UploadAsync(context, finalState, context.Plan.Outputs, cancellationToken).ConfigureAwait(false);
    await provenanceWriter.WriteAsync(context, finalState, cancellationToken).ConfigureAwait(false);
}
|
||||
|
||||
/// <summary>
/// Builds the initial state for a run via <c>PackRunStateFactory</c>, stamped with the current
/// time, saves it to the state store and returns it.
/// </summary>
private async Task<PackRunState> CreateInitialStateAsync(
    PackRunExecutionContext context,
    PackRunExecutionGraph graph,
    CancellationToken cancellationToken)
{
    var initialState = PackRunStateFactory.CreateInitialState(
        context,
        graph,
        simulationEngine,
        timeProvider.GetUtcNow());

    await stateStore.SaveAsync(initialState, cancellationToken).ConfigureAwait(false);
    return initialState;
}
|
||||
|
||||
/// <summary>
/// Appends one log entry for <paramref name="runId"/> to the log store, timestamped with the
/// current time from the injected <see cref="TimeProvider"/>.
/// </summary>
/// <param name="runId">Run the entry belongs to.</param>
/// <param name="level">Severity label stored on the entry (callers pass "info", "warn" or "error").</param>
/// <param name="eventType">Event identifier such as <c>run.received</c> or <c>step.failed</c>.</param>
/// <param name="message">Human-readable message.</param>
/// <param name="cancellationToken">Cancellation token passed to the log store.</param>
/// <param name="stepId">Optional id of the step the entry relates to.</param>
/// <param name="metadata">Optional key/value details stored with the entry.</param>
/// <returns>The task returned by the log store's append operation.</returns>
private Task AppendLogAsync(
    string runId,
    string level,
    string eventType,
    string message,
    CancellationToken cancellationToken,
    string? stepId = null,
    IReadOnlyDictionary<string, string>? metadata = null)
    => logStore.AppendAsync(
        runId,
        new PackRunLogEntry(timeProvider.GetUtcNow(), level, eventType, message, stepId, metadata),
        cancellationToken);
|
||||
|
||||
/// <summary>
/// Executes the graph's top-level steps in order against a mutable copy of the step records,
/// stopping at the first step whose outcome is AbortRun or Defer.
/// </summary>
/// <returns>
/// A copy of <paramref name="state"/> carrying the updated step records and a fresh
/// <c>UpdatedAt</c> timestamp. This method does not save the state.
/// </returns>
private async Task<PackRunState> ExecuteGraphAsync(
    PackRunExecutionContext context,
    PackRunExecutionGraph graph,
    PackRunState state,
    CancellationToken cancellationToken)
{
    var stepRecords = new ConcurrentDictionary<string, PackRunStepStateRecord>(state.Steps, StringComparer.Ordinal);
    var runContext = new ExecutionContext(
        context.RunId,
        graph.FailurePolicy ?? PackRunExecutionGraph.DefaultFailurePolicy,
        stepRecords,
        cancellationToken);

    foreach (var topLevelStep in graph.Steps)
    {
        var stepOutcome = await ExecuteStepAsync(topLevelStep, runContext).ConfigureAwait(false);
        var haltsRun = stepOutcome is StepExecutionOutcome.AbortRun or StepExecutionOutcome.Defer;
        if (haltsRun)
        {
            break;
        }
    }

    // Read-only wrapper over the working dictionary; the records are not copied.
    var readOnlySteps = new ReadOnlyDictionary<string, PackRunStepStateRecord>(stepRecords);
    var completedAt = timeProvider.GetUtcNow();
    return state with
    {
        Steps = readOnlySteps,
        UpdatedAt = completedAt
    };
}
|
||||
|
||||
/// <summary>
/// Executes a single step according to its kind and returns how the run should proceed.
/// </summary>
/// <remarks>
/// Steps with no record, disabled steps and steps already Succeeded or Skipped return Continue
/// without doing anything. A step whose next attempt is still in the future is logged as
/// awaiting retry and returns Defer. Gate steps are marked Succeeded here; Parallel, Map and Run
/// steps are delegated to their handlers; any other kind is marked Skipped with the reason
/// <c>unsupported-kind</c>.
/// </remarks>
private async Task<StepExecutionOutcome> ExecuteStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    var token = executionContext.CancellationToken;
    token.ThrowIfCancellationRequested();

    // Nothing to do: unknown step, disabled step, or a step that already finished.
    if (!executionContext.Steps.TryGetValue(step.Id, out var record)
        || !record.Enabled
        || record.Status is PackRunStepExecutionStatus.Succeeded or PackRunStepExecutionStatus.Skipped)
    {
        return StepExecutionOutcome.Continue;
    }

    if (record.NextAttemptAt is { } scheduled && scheduled > timeProvider.GetUtcNow())
    {
        logger.LogInformation(
            "Run {RunId} step {StepId} waiting until {NextAttempt} for retry.",
            executionContext.RunId,
            record.StepId,
            scheduled);

        var retryWaitMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            ["nextAttemptAt"] = scheduled.ToUniversalTime().ToString("O", CultureInfo.InvariantCulture),
            ["attempts"] = record.Attempts.ToString(CultureInfo.InvariantCulture)
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.awaiting-retry",
            $"Step {record.StepId} waiting for retry.",
            token,
            record.StepId,
            retryWaitMetadata).ConfigureAwait(false);
        return StepExecutionOutcome.Defer;
    }

    switch (step.Kind)
    {
        case PackRunStepKind.Run:
            return await ExecuteRunStepAsync(step, executionContext).ConfigureAwait(false);

        case PackRunStepKind.Map:
            return await ExecuteMapStepAsync(step, executionContext).ConfigureAwait(false);

        case PackRunStepKind.Parallel:
            return await ExecuteParallelStepAsync(step, executionContext).ConfigureAwait(false);

        case PackRunStepKind.GateApproval or PackRunStepKind.GatePolicy:
        {
            var gateEvent = step.Kind == PackRunStepKind.GateApproval
                ? "step.approval-satisfied"
                : "step.policy-satisfied";

            executionContext.Steps[step.Id] = record with
            {
                Status = PackRunStepExecutionStatus.Succeeded,
                StatusReason = null,
                LastTransitionAt = timeProvider.GetUtcNow(),
                NextAttemptAt = null
            };
            await AppendLogAsync(
                executionContext.RunId,
                "info",
                gateEvent,
                $"Gate {step.Id} satisfied.",
                token,
                step.Id).ConfigureAwait(false);
            return StepExecutionOutcome.Continue;
        }

        default:
        {
            logger.LogWarning("Run {RunId} encountered unsupported step kind '{Kind}' for step {StepId}. Marking as skipped.",
                executionContext.RunId,
                step.Kind,
                step.Id);

            executionContext.Steps[step.Id] = record with
            {
                Status = PackRunStepExecutionStatus.Skipped,
                StatusReason = "unsupported-kind",
                LastTransitionAt = timeProvider.GetUtcNow()
            };

            var skippedMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
            {
                ["kind"] = step.Kind.ToString()
            };
            await AppendLogAsync(
                executionContext.RunId,
                "warn",
                "step.skipped",
                "Step skipped because the step kind is unsupported.",
                token,
                step.Id,
                skippedMetadata).ConfigureAwait(false);
            return StepExecutionOutcome.Continue;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Executes a single run step through the configured executor and records the resulting state
/// transition, log entries and telemetry.
/// </summary>
/// <remarks>
/// The running-steps counter is incremented around the executor call and is decremented even
/// when the executor throws or is cancelled, so the gauge cannot drift upward after failures.
/// The step duration is recorded only when the executor returns a result.
/// </remarks>
/// <param name="step">The step to execute; its record must already exist in <paramref name="executionContext"/>.</param>
/// <param name="executionContext">Per-run execution state: run id, failure policy, step records and cancellation token.</param>
/// <returns>
/// Continue when the step succeeds or when an aborting failure is tolerated by
/// <c>ContinueOnError</c>; Defer when the failure policy schedules a retry; AbortRun otherwise.
/// </returns>
private async Task<StepExecutionOutcome> ExecuteRunStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    // Keeps the running-steps counter balanced whatever happens inside the executor call.
    // Generic so the executor's result type does not have to be named here.
    async Task<TResult> TrackRunningStepAsync<TResult>(Func<Task<TResult>> execute)
    {
        runningSteps.Add(1);
        try
        {
            return await execute().ConfigureAwait(false);
        }
        finally
        {
            runningSteps.Add(-1);
        }
    }

    var record = executionContext.Steps[step.Id];
    var now = timeProvider.GetUtcNow();
    var currentState = new PackRunStepState(record.Status, record.Attempts, record.LastTransitionAt, record.NextAttemptAt);

    if (currentState.Status == PackRunStepExecutionStatus.Pending)
    {
        // First execution of this step: move it out of Pending and log the start.
        currentState = PackRunStepStateMachine.Start(currentState, now);
        record = record with
        {
            Status = currentState.Status,
            LastTransitionAt = currentState.LastTransitionAt,
            NextAttemptAt = currentState.NextAttemptAt,
            StatusReason = null
        };
        executionContext.Steps[step.Id] = record;
        var startMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            ["attempt"] = currentState.Attempts.ToString(CultureInfo.InvariantCulture)
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.started",
            $"Step {step.Id} started.",
            executionContext.CancellationToken,
            step.Id,
            startMetadata).ConfigureAwait(false);
    }

    var stopwatch = Stopwatch.StartNew();
    var result = await TrackRunningStepAsync(
        async () => await executor.ExecuteAsync(
            step,
            step.Parameters ?? PackRunExecutionStep.EmptyParameters,
            executionContext.CancellationToken).ConfigureAwait(false)).ConfigureAwait(false);
    stopwatch.Stop();
    TaskRunnerTelemetry.StepDurationMs.Record(
        stopwatch.Elapsed.TotalMilliseconds,
        new KeyValuePair<string, object?>("step_kind", step.Kind.ToString()));

    if (result.Succeeded)
    {
        currentState = PackRunStepStateMachine.CompleteSuccess(currentState, timeProvider.GetUtcNow());
        executionContext.Steps[step.Id] = record with
        {
            Status = currentState.Status,
            Attempts = currentState.Attempts,
            LastTransitionAt = currentState.LastTransitionAt,
            NextAttemptAt = currentState.NextAttemptAt,
            StatusReason = null
        };

        var successMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            ["attempt"] = currentState.Attempts.ToString(CultureInfo.InvariantCulture)
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.succeeded",
            $"Step {step.Id} succeeded.",
            executionContext.CancellationToken,
            step.Id,
            successMetadata).ConfigureAwait(false);

        return StepExecutionOutcome.Continue;
    }

    logger.LogWarning(
        "Run {RunId} step {StepId} failed: {Error}",
        executionContext.RunId,
        step.Id,
        result.Error ?? "unknown error");

    // The state machine decides between retry and abort based on the failure policy.
    var failure = PackRunStepStateMachine.RegisterFailure(currentState, timeProvider.GetUtcNow(), executionContext.FailurePolicy);
    var updatedRecord = record with
    {
        Status = failure.State.Status,
        Attempts = failure.State.Attempts,
        LastTransitionAt = failure.State.LastTransitionAt,
        NextAttemptAt = failure.State.NextAttemptAt,
        StatusReason = result.Error
    };

    executionContext.Steps[step.Id] = updatedRecord;

    var failureMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
    {
        ["attempt"] = failure.State.Attempts.ToString(CultureInfo.InvariantCulture)
    };
    if (!string.IsNullOrWhiteSpace(result.Error))
    {
        failureMetadata["error"] = result.Error;
    }
    if (failure.State.NextAttemptAt is { } retryAt)
    {
        failureMetadata["nextAttemptAt"] = retryAt.ToUniversalTime().ToString("O", CultureInfo.InvariantCulture);
    }

    // Only a failure that actually aborts the run is logged at error level.
    var failureLevel = failure.Outcome == PackRunStepFailureOutcome.Abort && !step.ContinueOnError
        ? "error"
        : "warn";

    await AppendLogAsync(
        executionContext.RunId,
        failureLevel,
        "step.failed",
        $"Step {step.Id} failed.",
        executionContext.CancellationToken,
        step.Id,
        failureMetadata).ConfigureAwait(false);

    if (failure.Outcome == PackRunStepFailureOutcome.Retry)
    {
        TaskRunnerTelemetry.StepRetryCount.Add(1, new KeyValuePair<string, object?>("step_kind", step.Kind.ToString()));
        var retryMetadata = new Dictionary<string, string>(failureMetadata, StringComparer.Ordinal)
        {
            ["outcome"] = "retry"
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.retry-scheduled",
            $"Step {step.Id} scheduled for retry.",
            executionContext.CancellationToken,
            step.Id,
            retryMetadata).ConfigureAwait(false);
    }

    return failure.Outcome switch
    {
        PackRunStepFailureOutcome.Retry => StepExecutionOutcome.Defer,
        PackRunStepFailureOutcome.Abort when step.ContinueOnError => StepExecutionOutcome.Continue,
        PackRunStepFailureOutcome.Abort => StepExecutionOutcome.AbortRun,
        _ => StepExecutionOutcome.AbortRun
    };
}
|
||||
|
||||
/// <summary>
/// Executes the children of a parallel container step, running at most <c>MaxParallel</c>
/// children at a time (all of them when <c>MaxParallel</c> is unset or not positive).
/// </summary>
/// <remarks>
/// A child returning Defer, or AbortRun when the container does not set
/// <c>ContinueOnError</c>, halts the container: no further children are started and the first
/// such outcome is returned. Children already in flight are awaited before returning, so no
/// child keeps mutating the shared step records after this method completes. The same applies
/// when a child throws; its exception is then rethrown. With <c>ContinueOnError</c>, aborting
/// children are tolerated: the remaining children still run, the container is marked failed,
/// and Continue is returned.
/// </remarks>
/// <returns>Continue, Defer or AbortRun as described in the remarks.</returns>
private async Task<StepExecutionOutcome> ExecuteParallelStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    var children = step.Children;
    if (children.Count == 0)
    {
        MarkContainerSucceeded(step, executionContext);
        return StepExecutionOutcome.Continue;
    }

    var maxParallel = step.MaxParallel is > 0 ? step.MaxParallel.Value : children.Count;
    var queue = new Queue<PackRunExecutionStep>(children);
    var running = new List<Task<StepExecutionOutcome>>(maxParallel);
    var outcome = StepExecutionOutcome.Continue;
    var childFailureDetected = false;

    // Stops scheduling and waits for every in-flight child to finish. Failures of those
    // children are logged rather than thrown so the outcome that triggered the halt wins.
    async Task SettleInFlightChildrenAsync()
    {
        queue.Clear();
        try
        {
            await Task.WhenAll(running).ConfigureAwait(false);
        }
        catch (Exception siblingFailure)
        {
            logger.LogWarning(
                siblingFailure,
                "Run {RunId} parallel step {StepId}: an in-flight child failed while the step was halting.",
                executionContext.RunId,
                step.Id);
        }
        finally
        {
            running.Clear();
        }
    }

    while (queue.Count > 0 || running.Count > 0)
    {
        while (queue.Count > 0 && running.Count < maxParallel)
        {
            running.Add(ExecuteStepAsync(queue.Dequeue(), executionContext));
        }

        var completed = await Task.WhenAny(running).ConfigureAwait(false);
        running.Remove(completed);

        StepExecutionOutcome childOutcome;
        try
        {
            childOutcome = await completed.ConfigureAwait(false);
        }
        catch
        {
            // A child threw: let its siblings finish before unwinding, then surface the failure.
            await SettleInFlightChildrenAsync().ConfigureAwait(false);
            throw;
        }

        if (childOutcome == StepExecutionOutcome.AbortRun && step.ContinueOnError)
        {
            // Tolerated failure: remember it and keep running the remaining children.
            childFailureDetected = true;
            continue;
        }

        if (childOutcome is StepExecutionOutcome.AbortRun or StepExecutionOutcome.Defer)
        {
            outcome = childOutcome;
            await SettleInFlightChildrenAsync().ConfigureAwait(false);
            break;
        }
    }

    switch (outcome)
    {
        case StepExecutionOutcome.Continue when childFailureDetected:
        case StepExecutionOutcome.AbortRun:
            MarkContainerFailure(step, executionContext, ChildFailureReason);
            break;

        case StepExecutionOutcome.Continue:
            MarkContainerSucceeded(step, executionContext);
            break;

        case StepExecutionOutcome.Defer:
            MarkContainerPending(step, executionContext, AwaitingRetryReason);
            break;
    }

    return outcome;
}
|
||||
|
||||
/// <summary>
/// Executes the children of a map step one at a time, in enumeration order, and records the
/// resulting container status on the map step itself.
/// </summary>
/// <param name="step">The map step whose <c>Children</c> are executed.</param>
/// <param name="executionContext">Run-scoped state that holds the per-step records.</param>
/// <returns>
/// <see cref="StepExecutionOutcome.Defer"/> as soon as a child defers; the failing child's outcome
/// when a child does not continue and <c>ContinueOnError</c> is not set; otherwise
/// <see cref="StepExecutionOutcome.Continue"/>.
/// </returns>
private async Task<StepExecutionOutcome> ExecuteMapStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    // Remembers that a child failed while ContinueOnError kept the loop going, so the
    // final status is not overwritten with Succeeded once the remaining children finish.
    var childFailureDetected = false;

    foreach (var child in step.Children)
    {
        var outcome = await ExecuteStepAsync(child, executionContext).ConfigureAwait(false);
        if (outcome == StepExecutionOutcome.Continue)
        {
            continue;
        }

        if (outcome == StepExecutionOutcome.Defer)
        {
            // A deferred child parks the whole container; remaining children are not started.
            MarkContainerPending(step, executionContext, AwaitingRetryReason);
            return outcome;
        }

        MarkContainerFailure(step, executionContext, ChildFailureReason);

        if (!step.ContinueOnError)
        {
            return outcome;
        }

        childFailureDetected = true;
    }

    // Same rule as the parallel container: a tolerated child failure still leaves the
    // container marked as failed, while the run itself continues.
    if (!childFailureDetected)
    {
        MarkContainerSucceeded(step, executionContext);
    }

    return StepExecutionOutcome.Continue;
}
|
||||
|
||||
/// <summary>
/// Flips the container step's record to <c>Succeeded</c>, clearing its status reason and
/// next-attempt time. Does nothing when the step has no record or is already succeeded.
/// </summary>
/// <param name="step">The container step whose record is updated.</param>
/// <param name="executionContext">Run-scoped state that holds the per-step records.</param>
private void MarkContainerSucceeded(PackRunExecutionStep step, ExecutionContext executionContext)
{
    if (executionContext.Steps.TryGetValue(step.Id, out var current)
        && current.Status != PackRunStepExecutionStatus.Succeeded)
    {
        var transitionedAt = timeProvider.GetUtcNow();

        executionContext.Steps[step.Id] = current with
        {
            Status = PackRunStepExecutionStatus.Succeeded,
            StatusReason = null,
            LastTransitionAt = transitionedAt,
            NextAttemptAt = null
        };
    }
}
|
||||
|
||||
/// <summary>
/// Flips the container step's record to <c>Failed</c> with the supplied reason and stamps the
/// transition time. Other fields of the record are carried over unchanged. Does nothing when
/// the step has no record.
/// </summary>
/// <param name="step">The container step whose record is updated.</param>
/// <param name="executionContext">Run-scoped state that holds the per-step records.</param>
/// <param name="reason">Text stored as the record's status reason.</param>
private void MarkContainerFailure(PackRunExecutionStep step, ExecutionContext executionContext, string reason)
{
    if (executionContext.Steps.TryGetValue(step.Id, out var current))
    {
        var transitionedAt = timeProvider.GetUtcNow();

        executionContext.Steps[step.Id] = current with
        {
            Status = PackRunStepExecutionStatus.Failed,
            StatusReason = reason,
            LastTransitionAt = transitionedAt
        };
    }
}
|
||||
|
||||
/// <summary>
/// Flips the container step's record back to <c>Pending</c> with the supplied reason and stamps
/// the transition time. Other fields of the record are carried over unchanged. Does nothing
/// when the step has no record.
/// </summary>
/// <param name="step">The container step whose record is updated.</param>
/// <param name="executionContext">Run-scoped state that holds the per-step records.</param>
/// <param name="reason">Text stored as the record's status reason.</param>
private void MarkContainerPending(PackRunExecutionStep step, ExecutionContext executionContext, string reason)
{
    if (executionContext.Steps.TryGetValue(step.Id, out var current))
    {
        var transitionedAt = timeProvider.GetUtcNow();

        executionContext.Steps[step.Id] = current with
        {
            Status = PackRunStepExecutionStatus.Pending,
            StatusReason = reason,
            LastTransitionAt = transitionedAt
        };
    }
}
|
||||
|
||||
/// <summary>
/// Immutable bundle of run-scoped state handed to every step-execution method.
/// </summary>
/// <param name="RunId">Identifier of the run being executed (presumably the pack run id; confirm against the caller).</param>
/// <param name="FailurePolicy">Failure policy of the task-pack plan.</param>
/// <param name="Steps">Per-step state records, looked up and replaced by step id.</param>
/// <param name="CancellationToken">Cancellation token carried alongside the run state.</param>
private sealed record ExecutionContext(
    string RunId,
    TaskPackPlanFailurePolicy FailurePolicy,
    ConcurrentDictionary<string, PackRunStepStateRecord> Steps,
    CancellationToken CancellationToken);
|
||||
|
||||
/// <summary>
/// Result of executing a single step, used by container steps to decide how to proceed.
/// </summary>
private enum StepExecutionOutcome
{
    /// <summary>Execution may proceed with the next step.</summary>
    Continue = 0,

    /// <summary>The step has to be retried later; containers mark themselves pending.</summary>
    Defer = 1,

    /// <summary>The step failed; containers mark themselves failed unless they continue on error.</summary>
    AbortRun = 2
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
<?xml version="1.0" ?>
|
||||
<Project Sdk="Microsoft.NET.Sdk.Web">
|
||||
|
||||
|
||||
|
||||
<PropertyGroup>
|
||||
|
||||
|
||||
<UserSecretsId>dotnet-StellaOps.TaskRunner.Worker-ce7b902e-94f1-41c2-861b-daa533850dc5</UserSecretsId>
|
||||
|
||||
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
</PropertyGroup>
|
||||
|
||||
|
||||
|
||||
<!-- FrameworkReference Microsoft.AspNetCore.App is provided by Sdk.Web -->
|
||||
|
||||
<ItemGroup>
|
||||
<!-- Microsoft.Extensions.Hosting is provided by the Microsoft.AspNetCore.App shared framework (Sdk.Web) -->
|
||||
<PackageReference Include="OpenTelemetry.Instrumentation.Http" />
|
||||
<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" />
|
||||
</ItemGroup>
|
||||
|
||||
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
|
||||
<ProjectReference Include="..\StellaOps.TaskRunner.Core\StellaOps.TaskRunner.Core.csproj"/>
|
||||
|
||||
|
||||
<ProjectReference Include="..\StellaOps.TaskRunner.Infrastructure\StellaOps.TaskRunner.Infrastructure.csproj"/>
|
||||
<ProjectReference Include="..\..\StellaOps.TaskRunner.__Libraries\StellaOps.TaskRunner.Persistence\StellaOps.TaskRunner.Persistence.csproj"/>
|
||||
|
||||
<ProjectReference Include="..\..\..\Telemetry\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core.csproj"/>
|
||||
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Worker.Health\StellaOps.Worker.Health.csproj"/>
|
||||
|
||||
</ItemGroup>
|
||||
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,9 @@
|
||||
# StellaOps.TaskRunner.Worker Task Board
|
||||
This board mirrors active sprint tasks for this module.
|
||||
Source of truth: `docs/implplan/SPRINT_20260130_002_Tools_csproj_remediation_solid_review.md`.
|
||||
|
||||
| Task ID | Status | Notes |
|
||||
| --- | --- | --- |
|
||||
| REMED-05 | TODO | Remediation checklist: docs/implplan/audits/csproj-standards/remediation/checklists/src/TaskRunner/StellaOps.TaskRunner/StellaOps.TaskRunner.Worker/StellaOps.TaskRunner.Worker.md. |
|
||||
| REMED-06 | DONE | SOLID review notes captured for SPRINT_20260130_002. |
|
||||
| SPRINT-312-004 | DONE | Worker storage wiring aligned to Postgres state/log/approval and seed-fs artifact/provenance object-store contract. |
|
||||
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"Logging": {
|
||||
"LogLevel": {
|
||||
"Default": "Information",
|
||||
"Microsoft.Hosting.Lifetime": "Information"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
{
|
||||
"Logging": {
|
||||
"LogLevel": {
|
||||
"Default": "Information",
|
||||
"Microsoft.Hosting.Lifetime": "Information"
|
||||
}
|
||||
},
|
||||
"Worker": {
|
||||
"IdleDelay": "00:00:01",
|
||||
"QueuePath": "queue",
|
||||
"ArchivePath": "queue/archive",
|
||||
"ApprovalStorePath": "state/approvals",
|
||||
"RunStatePath": "state/runs"
|
||||
},
|
||||
"Notifications": {
|
||||
"ApprovalEndpoint": null,
|
||||
"PolicyEndpoint": null
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user