consolidation of some of the modules, localization fixes, product advisories work, qa work

This commit is contained in:
master
2026-03-05 03:54:22 +02:00
parent 7bafcc3eef
commit 8e1cb9448d
3878 changed files with 72600 additions and 46861 deletions

View File

@@ -0,0 +1,234 @@
using Microsoft.Extensions.Options;
using OpenTelemetry.Metrics;
using OpenTelemetry.Trace;
using StellaOps.AirGap.Policy;
using StellaOps.Infrastructure.Postgres.Options;
using StellaOps.TaskRunner.Core.Configuration;
using StellaOps.TaskRunner.Core.Execution;
using StellaOps.TaskRunner.Core.Execution.Simulation;
using StellaOps.TaskRunner.Infrastructure.Execution;
using StellaOps.TaskRunner.Persistence.Postgres;
using StellaOps.TaskRunner.Persistence.Postgres.Repositories;
using StellaOps.TaskRunner.Worker.Services;
using StellaOps.Telemetry.Core;
using StellaOps.Worker.Health;
// Composition root for the TaskRunner worker host: binds options, registers services,
// telemetry, storage and the hosted worker, then maps health endpoints and runs the app.
var builder = WebApplication.CreateSlimBuilder(args);
// Registers the air-gap egress policy using the "AirGap" configuration section.
builder.Services.AddAirGapEgressPolicy(builder.Configuration, sectionName: "AirGap");
// Worker and notification options are bound from the root-level "Worker" and "Notifications" sections.
builder.Services.Configure<PackRunWorkerOptions>(builder.Configuration.GetSection("Worker"));
builder.Services.Configure<NotificationOptions>(builder.Configuration.GetSection("Notifications"));
builder.Services.AddHttpClient("taskrunner-notifications");
builder.Services.AddSingleton(TimeProvider.System);
// A single FilesystemPackRunDispatcher instance is registered and then exposed through both
// the dispatcher and the scheduler interfaces below.
builder.Services.AddSingleton(sp =>
{
var options = sp.GetRequiredService<IOptions<PackRunWorkerOptions>>();
var egressPolicy = sp.GetRequiredService<IEgressPolicy>();
return new FilesystemPackRunDispatcher(options.Value.QueuePath, options.Value.ArchivePath, egressPolicy);
});
builder.Services.AddSingleton<IPackRunJobDispatcher>(sp => sp.GetRequiredService<FilesystemPackRunDispatcher>());
builder.Services.AddSingleton<IPackRunJobScheduler>(sp => sp.GetRequiredService<FilesystemPackRunDispatcher>());
// Notification publisher: the HTTP publisher is used when at least one endpoint is configured,
// otherwise notifications go to the logging publisher.
builder.Services.AddSingleton<IPackRunNotificationPublisher>(sp =>
{
var options = sp.GetRequiredService<IOptions<NotificationOptions>>().Value;
if (options.ApprovalEndpoint is not null || options.PolicyEndpoint is not null)
{
return new HttpPackRunNotificationPublisher(
sp.GetRequiredService<IHttpClientFactory>(),
sp.GetRequiredService<IOptions<NotificationOptions>>(),
sp.GetRequiredService<ILogger<HttpPackRunNotificationPublisher>>());
}
return new LoggingPackRunNotificationPublisher(sp.GetRequiredService<ILogger<LoggingPackRunNotificationPublisher>>());
});
builder.Services.AddSingleton<IPackRunStepExecutor, BundleIngestionStepExecutor>();
builder.Services.AddSingleton<PackRunExecutionGraphBuilder>();
builder.Services.AddSingleton<PackRunSimulationEngine>();
builder.Services.AddSingleton<PackRunProcessor>();
// Telemetry: HTTP client tracing plus runtime metrics and the TaskRunner meter.
builder.Services.AddStellaOpsTelemetry(
builder.Configuration,
serviceName: "StellaOps.TaskRunner.Worker",
configureTracing: tracing => tracing.AddHttpClientInstrumentation(),
configureMetrics: metrics => metrics
.AddRuntimeInstrumentation()
.AddMeter(TaskRunnerTelemetry.MeterName));
// Storage wiring. RegisterStateStores and ValidateObjectStoreContract throw
// InvalidOperationException during startup when the configuration is unsupported.
var storageDriver = ResolveStorageDriver(builder.Configuration, "TaskRunner");
RegisterStateStores(builder.Services, builder.Configuration, builder.Environment.IsDevelopment(), storageDriver);
ValidateObjectStoreContract(builder.Configuration, builder.Environment.IsDevelopment(), "TaskRunner");
// NOTE(review): ValidateObjectStoreContract accepts the "rustfs" driver, but the artifact uploader
// and provenance writer registered below are always the filesystem implementations rooted at the
// seed-fs path. Confirm whether a rustfs-backed implementation is wired elsewhere.
builder.Services.AddSingleton<IPackRunArtifactUploader>(sp =>
{
var options = sp.GetRequiredService<IOptions<PackRunWorkerOptions>>().Value;
var timeProvider = sp.GetRequiredService<TimeProvider>();
var logger = sp.GetRequiredService<ILogger<FilesystemPackRunArtifactUploader>>();
var configuration = sp.GetRequiredService<IConfiguration>();
var artifactsRoot = ResolveSeedFsRootPath(configuration, "TaskRunner", options.ArtifactsPath);
return new FilesystemPackRunArtifactUploader(artifactsRoot, timeProvider, logger);
});
builder.Services.AddSingleton<IPackRunProvenanceWriter>(sp =>
{
var options = sp.GetRequiredService<IOptions<PackRunWorkerOptions>>().Value;
var timeProvider = sp.GetRequiredService<TimeProvider>();
var configuration = sp.GetRequiredService<IConfiguration>();
var artifactsRoot = ResolveSeedFsRootPath(configuration, "TaskRunner", options.ArtifactsPath);
return new FilesystemPackRunProvenanceWriter(artifactsRoot, timeProvider);
});
builder.Services.AddHostedService<PackRunWorkerService>();
builder.Services.AddWorkerHealthChecks();
var app = builder.Build();
app.MapWorkerHealthEndpoints();
app.Run();
// Registers the run state, approval and log stores that match the requested storage driver.
//   inmemory   -> in-memory stores
//   filesystem -> file-backed stores (see RegisterFilesystemStateStores)
//   postgres   -> PostgreSQL stores; when no connection string can be resolved the
//                 file-backed stores are used in development and startup fails otherwise
// Any other driver name throws InvalidOperationException. Driver names are matched
// case-insensitively (ordinal).
static void RegisterStateStores(IServiceCollection services, IConfiguration configuration, bool isDevelopment, string storageDriver)
{
    bool DriverIs(string candidate) =>
        string.Equals(storageDriver, candidate, StringComparison.OrdinalIgnoreCase);

    if (DriverIs("inmemory"))
    {
        services.AddSingleton<IPackRunStateStore, InMemoryPackRunStateStore>();
        services.AddSingleton<IPackRunApprovalStore, InMemoryPackRunApprovalStore>();
        services.AddSingleton<IPackRunLogStore, InMemoryPackRunLogStore>();
        return;
    }

    if (DriverIs("filesystem"))
    {
        RegisterFilesystemStateStores(services);
        return;
    }

    if (!DriverIs("postgres"))
    {
        throw new InvalidOperationException(
            $"Unsupported TaskRunner storage driver '{storageDriver}'. Allowed values: postgres, filesystem, inmemory.");
    }

    var postgresConnection = ResolvePostgresConnectionString(configuration, "TaskRunner");
    if (!string.IsNullOrWhiteSpace(postgresConnection))
    {
        services.Configure<PostgresOptions>(postgres =>
        {
            postgres.ConnectionString = postgresConnection;
            postgres.SchemaName = ResolveSchemaName(configuration, "TaskRunner") ?? TaskRunnerDataSource.DefaultSchemaName;
        });
        services.AddSingleton<TaskRunnerDataSource>();
        services.AddSingleton<IPackRunStateStore, PostgresPackRunStateStore>();
        services.AddSingleton<IPackRunApprovalStore, PostgresPackRunApprovalStore>();
        services.AddSingleton<IPackRunLogStore, PostgresPackRunLogStore>();
        return;
    }

    // No connection string: only development hosts may fall back to file-backed stores.
    if (isDevelopment)
    {
        RegisterFilesystemStateStores(services);
        return;
    }

    throw new InvalidOperationException(
        "TaskRunner worker requires PostgreSQL connection settings in non-development mode. " +
        "Set ConnectionStrings:Default or TaskRunner:Storage:Postgres:ConnectionString.");
}
// Registers the file-backed approval, state and log stores. Each store reads its directory
// from PackRunWorkerOptions at the time the singleton is first resolved.
static void RegisterFilesystemStateStores(IServiceCollection services)
{
    static PackRunWorkerOptions WorkerOptions(IServiceProvider provider) =>
        provider.GetRequiredService<IOptions<PackRunWorkerOptions>>().Value;

    services.AddSingleton<IPackRunApprovalStore>(
        provider => new FilePackRunApprovalStore(WorkerOptions(provider).ApprovalStorePath));
    services.AddSingleton<IPackRunStateStore>(
        provider => new FilePackRunStateStore(WorkerOptions(provider).RunStatePath));
    services.AddSingleton<IPackRunLogStore>(
        provider => new FilePackRunLogStore(WorkerOptions(provider).LogsPath));
}
// Resolves the state storage driver name, defaulting to "postgres" when nothing is configured.
// NOTE(review): the shared "Storage:Driver" key takes precedence over the service-scoped key here,
// whereas the other resolvers in this file check the service-scoped key first. Confirm this
// ordering is intentional.
static string ResolveStorageDriver(IConfiguration configuration, string serviceName)
{
    var sharedDriver = configuration["Storage:Driver"];
    var serviceDriver = configuration[$"{serviceName}:Storage:Driver"];
    var configuredDriver = FirstNonEmpty(sharedDriver, serviceDriver);
    return configuredDriver is null ? "postgres" : configuredDriver;
}
// Returns the first non-blank PostgreSQL connection string among the supported configuration
// keys (service-scoped keys first, "ConnectionStrings:Default" last), or null when none is set.
static string? ResolvePostgresConnectionString(IConfiguration configuration, string serviceName)
{
    var serviceStorageValue = configuration[$"{serviceName}:Storage:Postgres:ConnectionString"];
    var sharedStorageValue = configuration["Storage:Postgres:ConnectionString"];
    var postgresSectionValue = configuration[$"Postgres:{serviceName}:ConnectionString"];
    var namedConnectionString = configuration[$"ConnectionStrings:{serviceName}"];
    var defaultConnectionString = configuration["ConnectionStrings:Default"];

    return FirstNonEmpty(
        serviceStorageValue,
        sharedStorageValue,
        postgresSectionValue,
        namedConnectionString,
        defaultConnectionString);
}
// Returns the first non-blank PostgreSQL schema name among the supported configuration keys,
// or null when none is configured (the caller supplies the default).
static string? ResolveSchemaName(IConfiguration configuration, string serviceName) =>
    FirstNonEmpty(
        configuration[$"{serviceName}:Storage:Postgres:Schema"],
        configuration["Storage:Postgres:Schema"],
        configuration[$"Postgres:{serviceName}:SchemaName"]);
// Validates the object store configuration at startup.
// Throws InvalidOperationException when the driver is neither "seed-fs" nor "rustfs"
// (case-insensitive), or when "rustfs" is selected outside development without a BaseUrl.
static void ValidateObjectStoreContract(IConfiguration configuration, bool isDevelopment, string serviceName)
{
    var driver = ResolveObjectStoreDriver(configuration, serviceName);
    var usesSeedFs = string.Equals(driver, "seed-fs", StringComparison.OrdinalIgnoreCase);
    var usesRustFs = string.Equals(driver, "rustfs", StringComparison.OrdinalIgnoreCase);

    if (!(usesSeedFs || usesRustFs))
    {
        throw new InvalidOperationException(
            $"Unsupported object store driver '{driver}' for {serviceName}. Allowed values: seed-fs, rustfs.");
    }

    // The BaseUrl requirement only applies to rustfs on non-development hosts.
    if (!usesRustFs || isDevelopment)
    {
        return;
    }

    var baseUrl = FirstNonEmpty(
        configuration[$"{serviceName}:Storage:ObjectStore:RustFs:BaseUrl"],
        configuration["Storage:ObjectStore:RustFs:BaseUrl"]);
    if (string.IsNullOrWhiteSpace(baseUrl))
    {
        throw new InvalidOperationException(
            $"RustFS object store is configured for {serviceName}, but BaseUrl is missing.");
    }
}
// Resolves the object store driver name: service-scoped key first, then the shared key,
// defaulting to "seed-fs" when neither is set.
static string ResolveObjectStoreDriver(IConfiguration configuration, string serviceName) =>
    FirstNonEmpty(
        configuration[$"{serviceName}:Storage:ObjectStore:Driver"],
        configuration["Storage:ObjectStore:Driver"])
    ?? "seed-fs";
// Resolves the seed-fs root directory from configuration, returning fallbackPath when none of
// the supported keys holds a non-blank value.
static string ResolveSeedFsRootPath(IConfiguration configuration, string serviceName, string fallbackPath)
{
    var configuredRoot = FirstNonEmpty(
        configuration[$"{serviceName}:Storage:ObjectStore:SeedFs:RootPath"],
        configuration["Storage:ObjectStore:SeedFs:RootPath"],
        configuration[$"{serviceName}:Worker:ArtifactsPath"]);

    return configuredRoot is null ? fallbackPath : configuredRoot;
}
// Returns the first argument that is not null, empty or whitespace-only, in argument order;
// returns null when there is no such argument (including when called with no arguments).
static string? FirstNonEmpty(params string?[] values)
{
    for (var index = 0; index < values.Length; index++)
    {
        var candidate = values[index];
        if (string.IsNullOrWhiteSpace(candidate))
        {
            continue;
        }

        return candidate;
    }

    return null;
}

View File

@@ -0,0 +1,12 @@
{
"$schema": "https://json.schemastore.org/launchsettings.json",
"profiles": {
"StellaOps.TaskRunner.Worker": {
"commandName": "Project",
"dotnetRunMessages": true,
"environmentVariables": {
"DOTNET_ENVIRONMENT": "Development"
}
}
}
}

View File

@@ -0,0 +1,657 @@
using Microsoft.Extensions.Options;
using StellaOps.TaskRunner.Core.Configuration;
using StellaOps.TaskRunner.Core.Execution;
using StellaOps.TaskRunner.Core.Execution.Simulation;
using StellaOps.TaskRunner.Core.Planning;
using StellaOps.TaskRunner.Infrastructure.Execution;
using System.Collections.Concurrent;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Diagnostics.Metrics;
using System.Globalization;
using System.Text.Json.Nodes;
namespace StellaOps.TaskRunner.Worker.Services;
/// <summary>
/// Background service that dequeues pack runs from the dispatcher, executes their step graph,
/// persists the resulting state and appends log entries for each transition.
/// </summary>
public sealed class PackRunWorkerService : BackgroundService
{
// StatusReason values written on container (parallel/map) step records.
private const string ChildFailureReason = "child-failure";
private const string AwaitingRetryReason = "awaiting-retry";
private readonly IPackRunJobDispatcher dispatcher;
private readonly PackRunProcessor processor;
private readonly PackRunWorkerOptions options;
private readonly IPackRunStateStore stateStore;
private readonly PackRunExecutionGraphBuilder graphBuilder;
private readonly PackRunSimulationEngine simulationEngine;
private readonly IPackRunStepExecutor executor;
private readonly IPackRunArtifactUploader artifactUploader;
private readonly IPackRunProvenanceWriter provenanceWriter;
private readonly IPackRunLogStore logStore;
private readonly TimeProvider timeProvider;
private readonly ILogger<PackRunWorkerService> logger;
// Incremented before and decremented after each executor invocation for a run step.
private readonly UpDownCounter<long> runningSteps;
/// <summary>
/// Creates the worker service and, when the dispatcher is filesystem-backed, registers an
/// observable gauge reporting the number of <c>*.json</c> files in its queue directory.
/// </summary>
/// <exception cref="ArgumentNullException">
/// A required dependency is null, or <paramref name="options"/> (or its value) is null.
/// A null <paramref name="timeProvider"/> is tolerated and replaced with <see cref="TimeProvider.System"/>.
/// </exception>
public PackRunWorkerService(
    IPackRunJobDispatcher dispatcher,
    PackRunProcessor processor,
    IPackRunStateStore stateStore,
    PackRunExecutionGraphBuilder graphBuilder,
    PackRunSimulationEngine simulationEngine,
    IPackRunStepExecutor executor,
    IPackRunArtifactUploader artifactUploader,
    IPackRunProvenanceWriter provenanceWriter,
    IPackRunLogStore logStore,
    IOptions<PackRunWorkerOptions> options,
    TimeProvider timeProvider,
    ILogger<PackRunWorkerService> logger)
{
    // Validation runs in parameter order so the reported parameter name matches the first null argument.
    ArgumentNullException.ThrowIfNull(dispatcher);
    ArgumentNullException.ThrowIfNull(processor);
    ArgumentNullException.ThrowIfNull(stateStore);
    ArgumentNullException.ThrowIfNull(graphBuilder);
    ArgumentNullException.ThrowIfNull(simulationEngine);
    ArgumentNullException.ThrowIfNull(executor);
    ArgumentNullException.ThrowIfNull(artifactUploader);
    ArgumentNullException.ThrowIfNull(provenanceWriter);
    ArgumentNullException.ThrowIfNull(logStore);
    var workerOptions = options?.Value ?? throw new ArgumentNullException(nameof(options));
    ArgumentNullException.ThrowIfNull(logger);

    this.dispatcher = dispatcher;
    this.processor = processor;
    this.stateStore = stateStore;
    this.graphBuilder = graphBuilder;
    this.simulationEngine = simulationEngine;
    this.executor = executor;
    this.artifactUploader = artifactUploader;
    this.provenanceWriter = provenanceWriter;
    this.logStore = logStore;
    this.options = workerOptions;
    this.timeProvider = timeProvider ?? TimeProvider.System;
    this.logger = logger;
    runningSteps = TaskRunnerTelemetry.RunningSteps;

    if (dispatcher is not FilesystemPackRunDispatcher fileDispatcher)
    {
        return;
    }

    // Queue depth is sampled on demand: zero when the queue directory does not exist.
    TaskRunnerTelemetry.Meter.CreateObservableGauge<long>(
        "taskrunner.queue.depth",
        () =>
        {
            long depth = 0;
            if (Directory.Exists(fileDispatcher.QueuePath))
            {
                depth = Directory.GetFiles(fileDispatcher.QueuePath, "*.json", SearchOption.TopDirectoryOnly).LongLength;
            }

            return new Measurement<long>(depth);
        });
}
/// <summary>
/// Main worker loop: dequeues runs until <paramref name="stoppingToken"/> is cancelled, waiting
/// <c>options.IdleDelay</c> whenever the queue is empty, and processes each run in turn.
/// </summary>
/// <remarks>
/// A failure while processing a run is logged and recorded as a <c>run.failed</c> log entry, and the
/// loop continues with the next run. Recording that entry is itself guarded: without the guard, an
/// exception from the log store (or a cancellation during shutdown) would escape this method while
/// handling an already-handled failure and end the processing loop.
/// </remarks>
/// <param name="stoppingToken">Signals that the host is shutting down.</param>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    while (!stoppingToken.IsCancellationRequested)
    {
        var context = await dispatcher.TryDequeueAsync(stoppingToken).ConfigureAwait(false);
        if (context is null)
        {
            await Task.Delay(options.IdleDelay, stoppingToken).ConfigureAwait(false);
            continue;
        }

        try
        {
            await ProcessRunAsync(context, stoppingToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            break;
        }
        catch (Exception ex)
        {
            logger.LogError(ex, "Unhandled exception while processing run {RunId}.", context.RunId);
            var metadata = new Dictionary<string, string>(StringComparer.Ordinal)
            {
                ["exceptionType"] = ex.GetType().FullName ?? ex.GetType().Name
            };

            try
            {
                await AppendLogAsync(
                    context.RunId,
                    "error",
                    "run.failed",
                    "Unhandled exception while processing run.",
                    stoppingToken,
                    metadata: metadata).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown was requested while recording the failure; exit the loop quietly.
                break;
            }
            catch (Exception logException)
            {
                // The run failure is already in the application log; do not let a log-store
                // problem take the whole worker down.
                logger.LogError(logException, "Failed to record failure log entry for run {RunId}.", context.RunId);
            }
        }
    }
}
/// <summary>
/// Processes one dequeued run: records receipt, loads or creates its state, applies gate results,
/// executes the step graph, saves the updated state and, when every step succeeded or was skipped,
/// uploads artifacts and writes provenance.
/// </summary>
/// <remarks>
/// Returns early (after logging) when the processor reports the run must wait for approvals or
/// policy gates, or when a gate failure blocks the run. A persisted state whose plan hash differs
/// from the current plan is replaced with a fresh initial state.
/// </remarks>
private async Task ProcessRunAsync(PackRunExecutionContext context, CancellationToken cancellationToken)
{
    // Shorthand for run-level log entries (no step id).
    Task WriteRunLogAsync(string level, string eventType, string message, IReadOnlyDictionary<string, string>? metadata = null) =>
        AppendLogAsync(context.RunId, level, eventType, message, cancellationToken, metadata: metadata);

    logger.LogInformation("Processing pack run {RunId}.", context.RunId);
    var receivedMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
    {
        ["planHash"] = context.Plan.Hash
    };
    await WriteRunLogAsync("info", "run.received", "Run dequeued by worker.", receivedMetadata).ConfigureAwait(false);

    var processorResult = await processor.ProcessNewRunAsync(context, cancellationToken).ConfigureAwait(false);
    var graph = graphBuilder.Build(context.Plan);

    var persisted = await stateStore.GetAsync(context.RunId, cancellationToken).ConfigureAwait(false);
    var state = persisted is not null && string.Equals(persisted.PlanHash, context.Plan.Hash, StringComparison.Ordinal)
        ? persisted
        : await CreateInitialStateAsync(context, graph, cancellationToken).ConfigureAwait(false);

    if (!processorResult.ShouldResumeImmediately)
    {
        logger.LogInformation("Run {RunId} awaiting approvals or policy gates.", context.RunId);
        await WriteRunLogAsync("info", "run.awaiting-approvals", "Run paused awaiting approvals or policy gates.").ConfigureAwait(false);
        return;
    }

    var gateUpdate = PackRunGateStateUpdater.Apply(state, graph, processorResult.ApprovalCoordinator, timeProvider.GetUtcNow());
    state = gateUpdate.State;
    if (gateUpdate.HasBlockingFailure)
    {
        await stateStore.SaveAsync(state, cancellationToken).ConfigureAwait(false);
        logger.LogWarning("Run {RunId} halted because a gate failed.", context.RunId);
        await WriteRunLogAsync("warn", "run.gate-blocked", "Run halted because a gate failed.").ConfigureAwait(false);
        return;
    }

    var finalState = await ExecuteGraphAsync(context, graph, state, cancellationToken).ConfigureAwait(false);
    await stateStore.SaveAsync(finalState, cancellationToken).ConfigureAwait(false);

    var allStepsSettled = finalState.Steps.Values.All(
        step => step.Status is PackRunStepExecutionStatus.Succeeded or PackRunStepExecutionStatus.Skipped);
    if (!allStepsSettled)
    {
        logger.LogInformation("Run {RunId} paused with pending work.", context.RunId);
        await WriteRunLogAsync("info", "run.paused", "Run paused with pending work.").ConfigureAwait(false);
        return;
    }

    logger.LogInformation("Run {RunId} finished successfully.", context.RunId);
    await WriteRunLogAsync("info", "run.completed", "Run finished successfully.").ConfigureAwait(false);
    await artifactUploader.UploadAsync(context, finalState, context.Plan.Outputs, cancellationToken).ConfigureAwait(false);
    await provenanceWriter.WriteAsync(context, finalState, cancellationToken).ConfigureAwait(false);
}
/// <summary>
/// Builds the initial state for a run from its execution graph, stamped with the current time,
/// saves it to the state store and returns it.
/// </summary>
private async Task<PackRunState> CreateInitialStateAsync(
    PackRunExecutionContext context,
    PackRunExecutionGraph graph,
    CancellationToken cancellationToken)
{
    var createdAt = timeProvider.GetUtcNow();
    var initialState = PackRunStateFactory.CreateInitialState(context, graph, simulationEngine, createdAt);

    await stateStore.SaveAsync(initialState, cancellationToken).ConfigureAwait(false);
    return initialState;
}
/// <summary>
/// Appends a log entry, timestamped with the current time, to the log store for the given run.
/// </summary>
/// <param name="runId">Run the entry belongs to.</param>
/// <param name="level">Severity label stored with the entry (callers pass "info", "warn" or "error").</param>
/// <param name="eventType">Event identifier stored with the entry, e.g. "step.started".</param>
/// <param name="message">Human-readable message.</param>
/// <param name="cancellationToken">Token passed to the log store.</param>
/// <param name="stepId">Optional step the entry relates to.</param>
/// <param name="metadata">Optional key/value details stored with the entry.</param>
private Task AppendLogAsync(
    string runId,
    string level,
    string eventType,
    string message,
    CancellationToken cancellationToken,
    string? stepId = null,
    IReadOnlyDictionary<string, string>? metadata = null) =>
    logStore.AppendAsync(
        runId,
        new PackRunLogEntry(timeProvider.GetUtcNow(), level, eventType, message, stepId, metadata),
        cancellationToken);
/// <summary>
/// Executes the top-level steps of the graph in order, stopping at the first step that asks to
/// abort the run or defer, and returns the state with the resulting step records and a fresh
/// <c>UpdatedAt</c> timestamp.
/// </summary>
private async Task<PackRunState> ExecuteGraphAsync(
    PackRunExecutionContext context,
    PackRunExecutionGraph graph,
    PackRunState state,
    CancellationToken cancellationToken)
{
    // Working copy of the step records; step execution methods update it in place.
    var stepRecords = new ConcurrentDictionary<string, PackRunStepStateRecord>(state.Steps, StringComparer.Ordinal);
    var runContext = new ExecutionContext(
        context.RunId,
        graph.FailurePolicy ?? PackRunExecutionGraph.DefaultFailurePolicy,
        stepRecords,
        cancellationToken);

    foreach (var step in graph.Steps)
    {
        var stepOutcome = await ExecuteStepAsync(step, runContext).ConfigureAwait(false);
        var mustStop = stepOutcome == StepExecutionOutcome.AbortRun || stepOutcome == StepExecutionOutcome.Defer;
        if (mustStop)
        {
            break;
        }
    }

    var readOnlySteps = new ReadOnlyDictionary<string, PackRunStepStateRecord>(stepRecords);
    return state with
    {
        UpdatedAt = timeProvider.GetUtcNow(),
        Steps = readOnlySteps
    };
}
/// <summary>
/// Executes a single step according to its kind and returns how the caller should proceed.
/// </summary>
/// <remarks>
/// Steps without a state record, disabled steps and steps already succeeded or skipped are passed
/// over with <see cref="StepExecutionOutcome.Continue"/>. A step whose next attempt time is still in
/// the future is logged and returns <see cref="StepExecutionOutcome.Defer"/>. Gate steps are marked
/// succeeded; unsupported kinds are marked skipped.
/// </remarks>
/// <exception cref="OperationCanceledException">The run's cancellation token is already cancelled.</exception>
private async Task<StepExecutionOutcome> ExecuteStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    executionContext.CancellationToken.ThrowIfCancellationRequested();

    if (!executionContext.Steps.TryGetValue(step.Id, out var record)
        || !record.Enabled
        || record.Status is PackRunStepExecutionStatus.Succeeded or PackRunStepExecutionStatus.Skipped)
    {
        return StepExecutionOutcome.Continue;
    }

    if (record.NextAttemptAt is { } retryAt && retryAt > timeProvider.GetUtcNow())
    {
        logger.LogInformation(
            "Run {RunId} step {StepId} waiting until {NextAttempt} for retry.",
            executionContext.RunId,
            record.StepId,
            retryAt);
        var waitMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            ["nextAttemptAt"] = retryAt.ToUniversalTime().ToString("O", CultureInfo.InvariantCulture),
            ["attempts"] = record.Attempts.ToString(CultureInfo.InvariantCulture)
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.awaiting-retry",
            $"Step {record.StepId} waiting for retry.",
            executionContext.CancellationToken,
            record.StepId,
            waitMetadata).ConfigureAwait(false);
        return StepExecutionOutcome.Defer;
    }

    return step.Kind switch
    {
        PackRunStepKind.Run => await ExecuteRunStepAsync(step, executionContext).ConfigureAwait(false),
        PackRunStepKind.Map => await ExecuteMapStepAsync(step, executionContext).ConfigureAwait(false),
        PackRunStepKind.Parallel => await ExecuteParallelStepAsync(step, executionContext).ConfigureAwait(false),
        PackRunStepKind.GateApproval or PackRunStepKind.GatePolicy
            => await CompleteGateStepAsync(step, record, executionContext).ConfigureAwait(false),
        _ => await SkipUnsupportedStepAsync(step, record, executionContext).ConfigureAwait(false)
    };
}

/// <summary>Marks an approval or policy gate step as succeeded and logs that it was satisfied.</summary>
private async Task<StepExecutionOutcome> CompleteGateStepAsync(
    PackRunExecutionStep step,
    PackRunStepStateRecord record,
    ExecutionContext executionContext)
{
    executionContext.Steps[step.Id] = record with
    {
        Status = PackRunStepExecutionStatus.Succeeded,
        StatusReason = null,
        LastTransitionAt = timeProvider.GetUtcNow(),
        NextAttemptAt = null
    };

    var eventType = step.Kind == PackRunStepKind.GateApproval
        ? "step.approval-satisfied"
        : "step.policy-satisfied";
    await AppendLogAsync(
        executionContext.RunId,
        "info",
        eventType,
        $"Gate {step.Id} satisfied.",
        executionContext.CancellationToken,
        step.Id).ConfigureAwait(false);
    return StepExecutionOutcome.Continue;
}

/// <summary>Marks a step of an unsupported kind as skipped and logs a warning entry for it.</summary>
private async Task<StepExecutionOutcome> SkipUnsupportedStepAsync(
    PackRunExecutionStep step,
    PackRunStepStateRecord record,
    ExecutionContext executionContext)
{
    logger.LogWarning("Run {RunId} encountered unsupported step kind '{Kind}' for step {StepId}. Marking as skipped.",
        executionContext.RunId,
        step.Kind,
        step.Id);

    executionContext.Steps[step.Id] = record with
    {
        Status = PackRunStepExecutionStatus.Skipped,
        StatusReason = "unsupported-kind",
        LastTransitionAt = timeProvider.GetUtcNow()
    };

    var skipMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
    {
        ["kind"] = step.Kind.ToString()
    };
    await AppendLogAsync(
        executionContext.RunId,
        "warn",
        "step.skipped",
        "Step skipped because the step kind is unsupported.",
        executionContext.CancellationToken,
        step.Id,
        skipMetadata).ConfigureAwait(false);
    return StepExecutionOutcome.Continue;
}
/// <summary>
/// Executes a <c>Run</c> step through the step executor, updates its state record and logs the
/// start, success, failure and retry-scheduling events.
/// </summary>
/// <remarks>
/// The running-steps counter is decremented in a <c>finally</c> block so that an exception thrown
/// by the executor (including cancellation) cannot leave the counter permanently incremented.
/// The step duration is recorded only when the executor returns a result.
/// </remarks>
/// <returns>
/// <see cref="StepExecutionOutcome.Continue"/> on success or when a failed step has
/// <c>ContinueOnError</c> set; <see cref="StepExecutionOutcome.Defer"/> when a retry was scheduled;
/// <see cref="StepExecutionOutcome.AbortRun"/> otherwise.
/// </returns>
private async Task<StepExecutionOutcome> ExecuteRunStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    var record = executionContext.Steps[step.Id];
    var now = timeProvider.GetUtcNow();
    var currentState = new PackRunStepState(record.Status, record.Attempts, record.LastTransitionAt, record.NextAttemptAt);

    if (currentState.Status == PackRunStepExecutionStatus.Pending)
    {
        currentState = PackRunStepStateMachine.Start(currentState, now);
        record = record with
        {
            Status = currentState.Status,
            LastTransitionAt = currentState.LastTransitionAt,
            NextAttemptAt = currentState.NextAttemptAt,
            StatusReason = null
        };
        executionContext.Steps[step.Id] = record;

        var startMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            ["attempt"] = currentState.Attempts.ToString(CultureInfo.InvariantCulture)
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.started",
            $"Step {step.Id} started.",
            executionContext.CancellationToken,
            step.Id,
            startMetadata).ConfigureAwait(false);
    }

    runningSteps.Add(1);
    var stopwatch = Stopwatch.StartNew();
    PackRunStepExecutionResult result;
    try
    {
        result = await executor.ExecuteAsync(
            step,
            step.Parameters ?? PackRunExecutionStep.EmptyParameters,
            executionContext.CancellationToken).ConfigureAwait(false);
    }
    finally
    {
        // Always balance the increment above, even when the executor throws.
        runningSteps.Add(-1);
    }

    stopwatch.Stop();
    TaskRunnerTelemetry.StepDurationMs.Record(
        stopwatch.Elapsed.TotalMilliseconds,
        new KeyValuePair<string, object?>("step_kind", step.Kind.ToString()));

    if (result.Succeeded)
    {
        currentState = PackRunStepStateMachine.CompleteSuccess(currentState, timeProvider.GetUtcNow());
        executionContext.Steps[step.Id] = record with
        {
            Status = currentState.Status,
            Attempts = currentState.Attempts,
            LastTransitionAt = currentState.LastTransitionAt,
            NextAttemptAt = currentState.NextAttemptAt,
            StatusReason = null
        };

        var successMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
        {
            ["attempt"] = currentState.Attempts.ToString(CultureInfo.InvariantCulture)
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.succeeded",
            $"Step {step.Id} succeeded.",
            executionContext.CancellationToken,
            step.Id,
            successMetadata).ConfigureAwait(false);
        return StepExecutionOutcome.Continue;
    }

    logger.LogWarning(
        "Run {RunId} step {StepId} failed: {Error}",
        executionContext.RunId,
        step.Id,
        result.Error ?? "unknown error");

    // The state machine decides between retry and abort based on the run's failure policy.
    var failure = PackRunStepStateMachine.RegisterFailure(currentState, timeProvider.GetUtcNow(), executionContext.FailurePolicy);
    var updatedRecord = record with
    {
        Status = failure.State.Status,
        Attempts = failure.State.Attempts,
        LastTransitionAt = failure.State.LastTransitionAt,
        NextAttemptAt = failure.State.NextAttemptAt,
        StatusReason = result.Error
    };
    executionContext.Steps[step.Id] = updatedRecord;

    var failureMetadata = new Dictionary<string, string>(StringComparer.Ordinal)
    {
        ["attempt"] = failure.State.Attempts.ToString(CultureInfo.InvariantCulture)
    };
    if (!string.IsNullOrWhiteSpace(result.Error))
    {
        failureMetadata["error"] = result.Error;
    }
    if (failure.State.NextAttemptAt is { } retryAt)
    {
        failureMetadata["nextAttemptAt"] = retryAt.ToUniversalTime().ToString("O", CultureInfo.InvariantCulture);
    }

    // Only a failure that actually aborts the run is logged at error level.
    var failureLevel = failure.Outcome == PackRunStepFailureOutcome.Abort && !step.ContinueOnError
        ? "error"
        : "warn";
    await AppendLogAsync(
        executionContext.RunId,
        failureLevel,
        "step.failed",
        $"Step {step.Id} failed.",
        executionContext.CancellationToken,
        step.Id,
        failureMetadata).ConfigureAwait(false);

    if (failure.Outcome == PackRunStepFailureOutcome.Retry)
    {
        TaskRunnerTelemetry.StepRetryCount.Add(1, new KeyValuePair<string, object?>("step_kind", step.Kind.ToString()));
        var retryMetadata = new Dictionary<string, string>(failureMetadata, StringComparer.Ordinal)
        {
            ["outcome"] = "retry"
        };
        await AppendLogAsync(
            executionContext.RunId,
            "info",
            "step.retry-scheduled",
            $"Step {step.Id} scheduled for retry.",
            executionContext.CancellationToken,
            step.Id,
            retryMetadata).ConfigureAwait(false);
    }

    return failure.Outcome switch
    {
        PackRunStepFailureOutcome.Retry => StepExecutionOutcome.Defer,
        PackRunStepFailureOutcome.Abort when step.ContinueOnError => StepExecutionOutcome.Continue,
        PackRunStepFailureOutcome.Abort => StepExecutionOutcome.AbortRun,
        _ => StepExecutionOutcome.AbortRun
    };
}
/// <summary>
/// Executes the children of a parallel container step with at most <c>MaxParallel</c> children in
/// flight (all children when <c>MaxParallel</c> is unset or not positive), then records the
/// container's own status.
/// </summary>
/// <remarks>
/// <para>
/// When a child defers, or aborts without <c>ContinueOnError</c> on the container, no further
/// children are started. Children that are already running are awaited before this method returns:
/// they share the step record dictionary, so returning while they are still running would let them
/// modify state after the caller has saved it, and their exceptions would go unobserved.
/// </para>
/// <para>
/// If one of those remaining children aborts and the container does not continue on error, the
/// result is escalated to <see cref="StepExecutionOutcome.AbortRun"/>.
/// </para>
/// </remarks>
/// <returns>
/// <see cref="StepExecutionOutcome.Continue"/> when all children were processed (the container is
/// marked failed if a child aborted under <c>ContinueOnError</c>, succeeded otherwise);
/// <see cref="StepExecutionOutcome.Defer"/> or <see cref="StepExecutionOutcome.AbortRun"/> when a
/// child halted the container.
/// </returns>
private async Task<StepExecutionOutcome> ExecuteParallelStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    var children = step.Children;
    if (children.Count == 0)
    {
        MarkContainerSucceeded(step, executionContext);
        return StepExecutionOutcome.Continue;
    }

    var maxParallel = step.MaxParallel is > 0 ? step.MaxParallel.Value : children.Count;
    var queue = new Queue<PackRunExecutionStep>(children);
    var running = new List<Task<StepExecutionOutcome>>(maxParallel);
    var outcome = StepExecutionOutcome.Continue;
    var childFailureDetected = false;

    // Waits for children that are still in flight after another child has thrown, so that none of
    // them keeps running in the background. Their own failures are logged, not rethrown, because
    // the original exception is the one propagated to the caller.
    async Task SettleAfterFaultAsync()
    {
        if (running.Count == 0)
        {
            return;
        }

        try
        {
            await Task.WhenAll(running).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            // Cancellation of the remaining children is expected once the run is being torn down.
        }
        catch (Exception settleException)
        {
            logger.LogWarning(
                settleException,
                "Run {RunId} parallel step {StepId} had additional child failures while settling.",
                executionContext.RunId,
                step.Id);
        }
    }

    try
    {
        while (outcome == StepExecutionOutcome.Continue && (queue.Count > 0 || running.Count > 0))
        {
            while (queue.Count > 0 && running.Count < maxParallel)
            {
                running.Add(ExecuteStepAsync(queue.Dequeue(), executionContext));
            }

            var completed = await Task.WhenAny(running).ConfigureAwait(false);
            running.Remove(completed);

            switch (await completed.ConfigureAwait(false))
            {
                case StepExecutionOutcome.AbortRun when step.ContinueOnError:
                    childFailureDetected = true;
                    break;
                case StepExecutionOutcome.AbortRun:
                    outcome = StepExecutionOutcome.AbortRun;
                    break;
                case StepExecutionOutcome.Defer:
                    outcome = StepExecutionOutcome.Defer;
                    break;
                default:
                    break;
            }
        }
    }
    catch
    {
        await SettleAfterFaultAsync().ConfigureAwait(false);
        throw;
    }

    // The loop stopped early: queued children are dropped, but children already started must finish.
    if (running.Count > 0)
    {
        var settled = await Task.WhenAll(running).ConfigureAwait(false);
        running.Clear();
        if (!step.ContinueOnError && Array.Exists(settled, childOutcome => childOutcome == StepExecutionOutcome.AbortRun))
        {
            outcome = StepExecutionOutcome.AbortRun;
        }
    }

    if (outcome == StepExecutionOutcome.Continue)
    {
        if (childFailureDetected)
        {
            MarkContainerFailure(step, executionContext, ChildFailureReason);
        }
        else
        {
            MarkContainerSucceeded(step, executionContext);
        }
    }
    else if (outcome == StepExecutionOutcome.AbortRun)
    {
        MarkContainerFailure(step, executionContext, ChildFailureReason);
    }
    else if (outcome == StepExecutionOutcome.Defer)
    {
        MarkContainerPending(step, executionContext, AwaitingRetryReason);
    }

    return outcome;
}
/// <summary>
/// Executes the children of a map container step one after another and records the container's
/// own status.
/// </summary>
/// <remarks>
/// A deferred child marks the container pending and stops. An aborting child marks the container
/// failed; processing stops unless the container has <c>ContinueOnError</c> set. When children were
/// allowed to fail, the container stays failed after the loop instead of being overwritten with a
/// success status, matching how the parallel container reports child failures.
/// </remarks>
/// <returns>
/// The halting child's outcome when processing stopped early, otherwise
/// <see cref="StepExecutionOutcome.Continue"/>.
/// </returns>
private async Task<StepExecutionOutcome> ExecuteMapStepAsync(
    PackRunExecutionStep step,
    ExecutionContext executionContext)
{
    var childFailureDetected = false;

    foreach (var child in step.Children)
    {
        var outcome = await ExecuteStepAsync(child, executionContext).ConfigureAwait(false);
        if (outcome == StepExecutionOutcome.Continue)
        {
            continue;
        }

        if (outcome == StepExecutionOutcome.Defer)
        {
            MarkContainerPending(step, executionContext, AwaitingRetryReason);
            return outcome;
        }

        MarkContainerFailure(step, executionContext, ChildFailureReason);
        if (!step.ContinueOnError)
        {
            return outcome;
        }

        // Remember the failure so the container is not reported as succeeded below.
        childFailureDetected = true;
    }

    if (!childFailureDetected)
    {
        MarkContainerSucceeded(step, executionContext);
    }

    return StepExecutionOutcome.Continue;
}
/// <summary>
/// Marks a container step as succeeded, clearing its status reason and next attempt time.
/// Does nothing when the step has no record or is already succeeded.
/// </summary>
private void MarkContainerSucceeded(PackRunExecutionStep step, ExecutionContext executionContext)
{
    var steps = executionContext.Steps;
    if (steps.TryGetValue(step.Id, out var current) && current.Status != PackRunStepExecutionStatus.Succeeded)
    {
        steps[step.Id] = current with
        {
            Status = PackRunStepExecutionStatus.Succeeded,
            StatusReason = null,
            LastTransitionAt = timeProvider.GetUtcNow(),
            NextAttemptAt = null
        };
    }
}
/// <summary>
/// Marks a container step as failed with the given reason. Does nothing when the step has no record.
/// </summary>
private void MarkContainerFailure(PackRunExecutionStep step, ExecutionContext executionContext, string reason)
{
    var steps = executionContext.Steps;
    if (steps.TryGetValue(step.Id, out var current))
    {
        steps[step.Id] = current with
        {
            Status = PackRunStepExecutionStatus.Failed,
            StatusReason = reason,
            LastTransitionAt = timeProvider.GetUtcNow()
        };
    }
}
/// <summary>
/// Marks a container step as pending with the given reason. Does nothing when the step has no record.
/// </summary>
private void MarkContainerPending(PackRunExecutionStep step, ExecutionContext executionContext, string reason)
{
    var steps = executionContext.Steps;
    if (steps.TryGetValue(step.Id, out var current))
    {
        steps[step.Id] = current with
        {
            Status = PackRunStepExecutionStatus.Pending,
            StatusReason = reason,
            LastTransitionAt = timeProvider.GetUtcNow()
        };
    }
}
/// <summary>
/// State shared by the step execution methods while one run's graph is executed: the run id,
/// the failure policy applied to failed run steps, the mutable step records keyed by step id,
/// and the cancellation token for the run.
/// </summary>
private sealed record ExecutionContext(
string RunId,
TaskPackPlanFailurePolicy FailurePolicy,
ConcurrentDictionary<string, PackRunStepStateRecord> Steps,
CancellationToken CancellationToken);
/// <summary>Tells the caller of a step execution method how to proceed.</summary>
private enum StepExecutionOutcome
{
// Proceed with the next step.
Continue,
// Stop processing for now; returned when a step is waiting for a retry.
Defer,
// Stop processing because a step failed and the failure must halt the run.
AbortRun
}
}

View File

@@ -0,0 +1,46 @@
<?xml version="1.0" ?>
<Project Sdk="Microsoft.NET.Sdk.Web">
<PropertyGroup>
<UserSecretsId>dotnet-StellaOps.TaskRunner.Worker-ce7b902e-94f1-41c2-861b-daa533850dc5</UserSecretsId>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<!-- FrameworkReference Microsoft.AspNetCore.App is provided by Sdk.Web -->
<ItemGroup>
<!-- Microsoft.Extensions.Hosting is provided by Sdk.Web (via the Microsoft.AspNetCore.App shared framework) -->
<PackageReference Include="OpenTelemetry.Instrumentation.Http" />
<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\StellaOps.TaskRunner.Core\StellaOps.TaskRunner.Core.csproj"/>
<ProjectReference Include="..\StellaOps.TaskRunner.Infrastructure\StellaOps.TaskRunner.Infrastructure.csproj"/>
<ProjectReference Include="..\..\StellaOps.TaskRunner.__Libraries\StellaOps.TaskRunner.Persistence\StellaOps.TaskRunner.Persistence.csproj"/>
<ProjectReference Include="..\..\..\Telemetry\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core\StellaOps.Telemetry.Core.csproj"/>
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Worker.Health\StellaOps.Worker.Health.csproj"/>
</ItemGroup>
</Project>

View File

@@ -0,0 +1,9 @@
# StellaOps.TaskRunner.Worker Task Board
This board mirrors active sprint tasks for this module.
Source of truth: `docs/implplan/SPRINT_20260130_002_Tools_csproj_remediation_solid_review.md`.
| Task ID | Status | Notes |
| --- | --- | --- |
| REMED-05 | TODO | Remediation checklist: docs/implplan/audits/csproj-standards/remediation/checklists/src/TaskRunner/StellaOps.TaskRunner/StellaOps.TaskRunner.Worker/StellaOps.TaskRunner.Worker.md. |
| REMED-06 | DONE | SOLID review notes captured for SPRINT_20260130_002. |
| SPRINT-312-004 | DONE | Worker storage wiring aligned to Postgres state/log/approval and seed-fs artifact/provenance object-store contract. |

View File

@@ -0,0 +1,8 @@
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.Hosting.Lifetime": "Information"
}
}
}

View File

@@ -0,0 +1,19 @@
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft.Hosting.Lifetime": "Information"
}
},
"Worker": {
"IdleDelay": "00:00:01",
"QueuePath": "queue",
"ArchivePath": "queue/archive",
"ApprovalStorePath": "state/approvals",
"RunStatePath": "state/runs"
},
"Notifications": {
"ApprovalEndpoint": null,
"PolicyEndpoint": null
}
}