release orchestrator v1 draft and build fixes

This commit is contained in:
master
2026-01-12 12:24:17 +02:00
parent f3de858c59
commit 9873f80830
1598 changed files with 240385 additions and 5944 deletions

View File

@@ -0,0 +1,216 @@
using Amazon.CloudWatchLogs;
using Amazon.CloudWatchLogs.Model;
using Microsoft.Extensions.Logging;
using Task = System.Threading.Tasks.Task;
namespace StellaOps.Agent.Ecs;
/// <summary>
/// Polls a single CloudWatch Logs stream (for example the one an ECS task writes to) and
/// republishes every event it reads through <see cref="LogReceived"/>.
/// </summary>
public sealed class CloudWatchLogStreamer
{
    /// <summary>Pause between polls when CloudWatch hands back an unchanged forward token.</summary>
    private static readonly TimeSpan IdlePollDelay = TimeSpan.FromSeconds(2);

    private readonly IAmazonCloudWatchLogs _logsClient;
    private readonly ILogger<CloudWatchLogStreamer> _logger;

    /// <summary>
    /// Raised once per log event, on the thread that is running <see cref="StreamLogsAsync"/>.
    /// An exception thrown by a subscriber is caught by the streaming loop's general handler
    /// and ends the stream.
    /// </summary>
    public event EventHandler<LogMessageEventArgs>? LogReceived;

    /// <summary>
    /// Creates a new CloudWatch log streamer.
    /// </summary>
    /// <param name="logsClient">Client used to call <c>GetLogEvents</c>.</param>
    /// <param name="logger">Logger for diagnostics about the stream itself.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public CloudWatchLogStreamer(
        IAmazonCloudWatchLogs logsClient,
        ILogger<CloudWatchLogStreamer> logger)
    {
        _logsClient = logsClient ?? throw new ArgumentNullException(nameof(logsClient));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Reads the stream oldest-first and keeps polling for new events until
    /// <paramref name="ct"/> is cancelled or an error occurs. Cancellation, a missing stream
    /// and any other failure are logged and end the method normally; nothing is rethrown.
    /// </summary>
    /// <param name="logGroupName">The log group name.</param>
    /// <param name="logStreamName">The log stream name.</param>
    /// <param name="startTime">
    /// Earliest event time to return; <c>null</c> reads from the beginning of the stream.
    /// Only applied to the first request, later requests continue from the forward token.
    /// </param>
    /// <param name="ct">Cancellation token.</param>
    public async Task StreamLogsAsync(
        string logGroupName,
        string logStreamName,
        DateTimeOffset? startTime = null,
        CancellationToken ct = default)
    {
        string? nextToken = null;
        var startTimeUtc = startTime?.UtcDateTime;

        _logger.LogDebug(
            "Starting log stream from {LogGroup}/{LogStream}",
            logGroupName,
            logStreamName);

        try
        {
            while (!ct.IsCancellationRequested)
            {
                var request = new GetLogEventsRequest
                {
                    LogGroupName = logGroupName,
                    LogStreamName = logStreamName,
                    // GetLogEvents requires startFromHead=true whenever a previous
                    // nextForwardToken is supplied, and reading head-first also keeps
                    // events after startTime from being skipped on the first page.
                    StartFromHead = true,
                    NextToken = nextToken
                };

                if (startTimeUtc.HasValue && nextToken is null)
                {
                    request.StartTime = startTimeUtc.Value;
                }

                var response = await _logsClient.GetLogEventsAsync(request, ct);

                // The events collection may be null when a page carries no events.
                if (response.Events is { Count: > 0 } events)
                {
                    foreach (var logEvent in events)
                    {
                        var message = logEvent.Message ?? string.Empty;
                        OnLogReceived(new LogMessageEventArgs(
                            logGroupName,
                            logStreamName,
                            logEvent.Timestamp ?? DateTime.UtcNow,
                            DetectLogLevel(message),
                            message));
                    }
                }

                // If token hasn't changed, no new logs - wait before polling
                if (response.NextForwardToken == nextToken)
                {
                    await Task.Delay(IdlePollDelay, ct);
                }

                nextToken = response.NextForwardToken;
            }
        }
        catch (OperationCanceledException)
        {
            _logger.LogDebug("Log streaming cancelled");
        }
        catch (ResourceNotFoundException)
        {
            _logger.LogWarning(
                "Log stream {LogGroup}/{LogStream} not found",
                logGroupName,
                logStreamName);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(
                ex,
                "Error streaming logs from {LogGroup}/{LogStream}",
                logGroupName,
                logStreamName);
        }
    }

    /// <summary>
    /// Builds the log stream name for an ECS task as <c>prefix/container/taskId</c>.
    /// </summary>
    /// <param name="logStreamPrefix">The log stream prefix configured in the task definition.</param>
    /// <param name="containerName">The container name.</param>
    /// <param name="taskId">The task ID (last part of task ARN).</param>
    /// <returns>The full log stream name.</returns>
    public static string GetTaskLogStreamName(
        string logStreamPrefix,
        string containerName,
        string taskId)
    {
        return $"{logStreamPrefix}/{containerName}/{taskId}";
    }

    /// <summary>
    /// Returns everything after the last <c>/</c> of a task ARN, or the whole input when it
    /// contains no <c>/</c>.
    /// </summary>
    /// <param name="taskArn">The task ARN.</param>
    /// <returns>The task ID.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="taskArn"/> is <c>null</c>.</exception>
    public static string ExtractTaskId(string taskArn)
    {
        ArgumentNullException.ThrowIfNull(taskArn);

        // LastIndexOf returns -1 when there is no separator, which makes the slice start at 0.
        return taskArn[(taskArn.LastIndexOf('/') + 1)..];
    }

    private void OnLogReceived(LogMessageEventArgs e)
    {
        LogReceived?.Invoke(this, e);
    }

    /// <summary>
    /// Guesses a severity from keywords in the message text (case-insensitive substring match).
    /// Error keywords win over warning keywords, which win over debug keywords; anything
    /// else, including a null or empty message, is reported as Information.
    /// </summary>
    private static LogLevel DetectLogLevel(string? message)
    {
        if (string.IsNullOrEmpty(message))
        {
            return LogLevel.Information;
        }

        if (ContainsKeyword(message, "ERROR") || ContainsKeyword(message, "FATAL") ||
            ContainsKeyword(message, "EXCEPTION") || ContainsKeyword(message, "FAIL"))
        {
            return LogLevel.Error;
        }

        if (ContainsKeyword(message, "WARN"))
        {
            return LogLevel.Warning;
        }

        if (ContainsKeyword(message, "DEBUG") || ContainsKeyword(message, "TRACE"))
        {
            return LogLevel.Debug;
        }

        return LogLevel.Information;
    }

    /// <summary>Case-insensitive ordinal substring test that avoids copying the message.</summary>
    private static bool ContainsKeyword(string message, string keyword) =>
        message.Contains(keyword, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Data carried by <see cref="CloudWatchLogStreamer.LogReceived"/> for one log event.
/// </summary>
/// <param name="logGroup">The log group the event was read from.</param>
/// <param name="logStream">The log stream the event was read from.</param>
/// <param name="timestamp">The timestamp of the log event.</param>
/// <param name="level">The log level detected for the message.</param>
/// <param name="message">The log message text.</param>
public sealed class LogMessageEventArgs(
    string logGroup,
    string logStream,
    DateTime timestamp,
    LogLevel level,
    string message) : EventArgs
{
    /// <summary>
    /// The log group name.
    /// </summary>
    public string LogGroup { get; } = logGroup;

    /// <summary>
    /// The log stream name.
    /// </summary>
    public string LogStream { get; } = logStream;

    /// <summary>
    /// The timestamp of the log event.
    /// </summary>
    public DateTime Timestamp { get; } = timestamp;

    /// <summary>
    /// The detected log level.
    /// </summary>
    public LogLevel Level { get; } = level;

    /// <summary>
    /// The log message.
    /// </summary>
    public string Message { get; } = message;
}

View File

@@ -0,0 +1,222 @@
using System.Text.Json;
using Amazon.CloudWatchLogs;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Capability;
using StellaOps.Agent.Core.Models;
using StellaOps.Agent.Ecs.Tasks;
namespace StellaOps.Agent.Ecs;
/// <summary>
/// Agent capability for managing AWS ECS services and tasks. Each supported task type is
/// routed to a dedicated handler class that is created per execution.
/// </summary>
public sealed class EcsCapability : IAgentCapability, IAsyncDisposable
{
    private readonly IAmazonECS _ecsClient;
    private readonly IAmazonCloudWatchLogs _logsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILoggerFactory _loggerFactory;
    private readonly ILogger<EcsCapability> _logger;
    private readonly Dictionary<string, Func<AgentTaskInfo, CancellationToken, Task<AgentTaskResult>>> _taskHandlers;

    /// <summary>
    /// Gets the capability name.
    /// </summary>
    public string Name => "ecs";

    /// <summary>
    /// Gets the capability version.
    /// </summary>
    public string Version => "1.0.0";

    /// <summary>
    /// Gets the supported task types. Must list the same keys as the handler table built in
    /// the constructor.
    /// </summary>
    public IReadOnlyList<string> SupportedTaskTypes { get; } = new[]
    {
        "ecs.deploy",
        "ecs.run",
        "ecs.stop",
        "ecs.scale",
        "ecs.register",
        "ecs.health",
        "ecs.describe"
    };

    /// <summary>
    /// Creates a new ECS capability.
    /// </summary>
    /// <param name="ecsClient">The ECS client.</param>
    /// <param name="logsClient">The CloudWatch Logs client.</param>
    /// <param name="timeProvider">Time provider for timestamps.</param>
    /// <param name="loggerFactory">Logger factory.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public EcsCapability(
        IAmazonECS ecsClient,
        IAmazonCloudWatchLogs logsClient,
        TimeProvider timeProvider,
        ILoggerFactory loggerFactory)
    {
        _ecsClient = ecsClient ?? throw new ArgumentNullException(nameof(ecsClient));
        _logsClient = logsClient ?? throw new ArgumentNullException(nameof(logsClient));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
        _logger = loggerFactory.CreateLogger<EcsCapability>();
        _taskHandlers = new Dictionary<string, Func<AgentTaskInfo, CancellationToken, Task<AgentTaskResult>>>
        {
            ["ecs.deploy"] = ExecuteDeployAsync,
            ["ecs.run"] = ExecuteRunTaskAsync,
            ["ecs.stop"] = ExecuteStopTaskAsync,
            ["ecs.scale"] = ExecuteScaleAsync,
            ["ecs.register"] = ExecuteRegisterAsync,
            ["ecs.health"] = ExecuteHealthCheckAsync,
            ["ecs.describe"] = ExecuteDescribeAsync
        };
    }

    /// <inheritdoc />
    /// <remarks>
    /// Probes the AWS API with a one-item <c>ListClusters</c> call. Any exception is logged
    /// and reported as <c>false</c> rather than thrown.
    /// </remarks>
    public async Task<bool> InitializeAsync(CancellationToken ct = default)
    {
        try
        {
            // Verify AWS credentials and ECS access by listing clusters; the result is not needed.
            await _ecsClient.ListClustersAsync(new ListClustersRequest
            {
                MaxResults = 1
            }, ct);

            _logger.LogInformation(
                "ECS capability initialized, AWS API accessible");
            return true;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to initialize ECS capability - AWS API not accessible");
            return false;
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Unsupported task types and <see cref="InvalidEcsPayloadException"/> from a handler are
    /// thrown to the caller. Every other handler exception is logged and converted into a
    /// failed <see cref="AgentTaskResult"/>.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="task"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidEcsPayloadException">
    /// The task type has no handler, or the handler rejected the payload.
    /// </exception>
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(task);

        if (!_taskHandlers.TryGetValue(task.TaskType, out var handler))
        {
            throw new InvalidEcsPayloadException(task.TaskType, "Unsupported task type");
        }

        var startedAt = _timeProvider.GetUtcNow();
        try
        {
            var result = await handler(task, ct);
            return result with
            {
                Duration = _timeProvider.GetUtcNow() - startedAt
            };
        }
        catch (InvalidEcsPayloadException)
        {
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "ECS task {TaskType} failed", task.TaskType);

            // Read the clock once so CompletedAt and Duration describe the same instant.
            var completedAt = _timeProvider.GetUtcNow();
            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = ex.Message,
                CompletedAt = completedAt,
                Duration = completedAt - startedAt
            };
        }
    }

    /// <inheritdoc />
    /// <remarks>Uses the same one-item <c>ListClusters</c> probe as initialization.</remarks>
    public async Task<CapabilityHealthStatus> CheckHealthAsync(CancellationToken ct = default)
    {
        try
        {
            await _ecsClient.ListClustersAsync(new ListClustersRequest { MaxResults = 1 }, ct);
            return new CapabilityHealthStatus(true, "ECS capability ready");
        }
        catch (Exception ex)
        {
            return new CapabilityHealthStatus(false, $"ECS API not accessible: {ex.Message}");
        }
    }

    // Each wrapper below builds a fresh handler with a logger typed to that handler and
    // forwards the call. They are invoked inside ExecuteAsync's try block, so a failure
    // while constructing the handler is handled the same way as a failure while running it.

    private Task<AgentTaskResult> ExecuteDeployAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsDeployServiceTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsDeployServiceTask>()).ExecuteAsync(task, ct);

    private Task<AgentTaskResult> ExecuteRunTaskAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsRunTaskTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsRunTaskTask>()).ExecuteAsync(task, ct);

    private Task<AgentTaskResult> ExecuteStopTaskAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsStopTaskTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsStopTaskTask>()).ExecuteAsync(task, ct);

    private Task<AgentTaskResult> ExecuteScaleAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsScaleServiceTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsScaleServiceTask>()).ExecuteAsync(task, ct);

    private Task<AgentTaskResult> ExecuteRegisterAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsRegisterTaskDefinitionTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsRegisterTaskDefinitionTask>()).ExecuteAsync(task, ct);

    private Task<AgentTaskResult> ExecuteHealthCheckAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsHealthCheckTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsHealthCheckTask>()).ExecuteAsync(task, ct);

    private Task<AgentTaskResult> ExecuteDescribeAsync(AgentTaskInfo task, CancellationToken ct) =>
        new EcsDescribeServiceTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsDescribeServiceTask>()).ExecuteAsync(task, ct);

    /// <inheritdoc />
    /// <remarks>
    /// Disposes both AWS clients that were passed to the constructor.
    /// NOTE(review): this assumes the capability owns the injected clients - confirm they
    /// are not shared with other components.
    /// </remarks>
    public ValueTask DisposeAsync()
    {
        _ecsClient.Dispose();
        _logsClient.Dispose();
        return ValueTask.CompletedTask;
    }
}

View File

@@ -0,0 +1,86 @@
namespace StellaOps.Agent.Ecs;
/// <summary>
/// Root of the ECS agent exception hierarchy; the more specific ECS exceptions derive from it.
/// </summary>
public class EcsAgentException : Exception
{
    /// <summary>Creates the exception with a description of the failure.</summary>
    /// <param name="message">Human-readable description of the failure.</param>
    public EcsAgentException(string message)
        : base(message)
    {
    }

    /// <summary>Creates the exception with a description and the exception that caused it.</summary>
    /// <param name="message">Human-readable description of the failure.</param>
    /// <param name="innerException">The underlying cause.</param>
    public EcsAgentException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
/// <summary>
/// Signals that the payload of an ECS task could not be used (for example it is invalid
/// or a required field is missing).
/// </summary>
public class InvalidEcsPayloadException : EcsAgentException
{
    /// <summary>The task type whose payload was rejected.</summary>
    public string TaskType { get; }

    /// <summary>Creates the exception for the given task type.</summary>
    /// <param name="taskType">The task type whose payload was rejected.</param>
    /// <param name="details">Optional explanation appended to the message after a colon.</param>
    public InvalidEcsPayloadException(string taskType, string? details = null)
        : base(ComposeMessage(taskType, details))
    {
        TaskType = taskType;
    }

    /// <summary>Builds the message, adding the details suffix only when details were given.</summary>
    private static string ComposeMessage(string taskType, string? details)
    {
        var headline = $"Invalid payload for ECS task type '{taskType}'";
        return details is null ? headline : $"{headline}: {details}";
    }
}
/// <summary>
/// Signals that an operation against an ECS service or task failed.
/// </summary>
public class EcsOperationException : EcsAgentException
{
    /// <summary>Name of the operation that failed.</summary>
    public string Operation { get; }

    /// <summary>Cluster the operation targeted, when known.</summary>
    public string? Cluster { get; }

    /// <summary>Service, task or other resource the operation targeted, when known.</summary>
    public string? Resource { get; }

    /// <summary>Creates the exception without an underlying cause.</summary>
    /// <param name="operation">Name of the operation that failed.</param>
    /// <param name="cluster">Cluster the operation targeted, or <c>null</c>.</param>
    /// <param name="resource">Resource the operation targeted, or <c>null</c>.</param>
    /// <param name="message">Description of the failure, placed after the colon.</param>
    public EcsOperationException(string operation, string? cluster, string? resource, string message)
        : base(ComposeMessage(operation, cluster, resource, message))
    {
        (Operation, Cluster, Resource) = (operation, cluster, resource);
    }

    /// <summary>Creates the exception with the exception that caused it.</summary>
    /// <param name="operation">Name of the operation that failed.</param>
    /// <param name="cluster">Cluster the operation targeted, or <c>null</c>.</param>
    /// <param name="resource">Resource the operation targeted, or <c>null</c>.</param>
    /// <param name="message">Description of the failure, placed after the colon.</param>
    /// <param name="innerException">The underlying cause.</param>
    public EcsOperationException(string operation, string? cluster, string? resource, string message, Exception innerException)
        : base(ComposeMessage(operation, cluster, resource, message), innerException)
    {
        (Operation, Cluster, Resource) = (operation, cluster, resource);
    }

    /// <summary>
    /// Builds the shared message text; the cluster and resource fragments are left out
    /// when the corresponding value is <c>null</c>.
    /// </summary>
    private static string ComposeMessage(string operation, string? cluster, string? resource, string message)
    {
        var clusterPart = cluster is null ? string.Empty : $" in cluster '{cluster}'";
        var resourcePart = resource is null ? string.Empty : $" for '{resource}'";
        return $"ECS {operation} failed{clusterPart}{resourcePart}: {message}";
    }
}
/// <summary>
/// Signals that a deployment did not stabilize within the allowed time.
/// </summary>
public class EcsDeploymentTimeoutException : EcsAgentException
{
    /// <summary>Cluster that hosts the service.</summary>
    public string Cluster { get; }

    /// <summary>Service whose deployment timed out.</summary>
    public string ServiceName { get; }

    /// <summary>How long stabilization was waited for.</summary>
    public TimeSpan Timeout { get; }

    /// <summary>Creates the exception for the given service and timeout.</summary>
    /// <param name="cluster">Cluster that hosts the service.</param>
    /// <param name="serviceName">Service whose deployment timed out.</param>
    /// <param name="timeout">How long stabilization was waited for.</param>
    public EcsDeploymentTimeoutException(string cluster, string serviceName, TimeSpan timeout)
        : base(ComposeMessage(cluster, serviceName, timeout))
    {
        (Cluster, ServiceName, Timeout) = (cluster, serviceName, timeout);
    }

    /// <summary>Builds the exception message.</summary>
    private static string ComposeMessage(string cluster, string serviceName, TimeSpan timeout) =>
        $"ECS deployment timed out waiting for service '{serviceName}' in cluster '{cluster}' to stabilize after {timeout}";
}
/// <summary>
/// Signals that one or more ECS tasks did not complete successfully.
/// </summary>
public class EcsTaskFailedException : EcsAgentException
{
    /// <summary>Cluster the tasks ran in.</summary>
    public string Cluster { get; }

    /// <summary>ARNs of the tasks that failed.</summary>
    public IReadOnlyList<string> TaskArns { get; }

    /// <summary>Exit codes reported for the failure; they are listed in the message.</summary>
    public IReadOnlyList<int> ExitCodes { get; }

    /// <summary>Creates the exception for the given tasks.</summary>
    /// <param name="cluster">Cluster the tasks ran in.</param>
    /// <param name="taskArns">ARNs of the tasks that failed.</param>
    /// <param name="exitCodes">Exit codes to report in the message.</param>
    public EcsTaskFailedException(string cluster, IReadOnlyList<string> taskArns, IReadOnlyList<int> exitCodes)
        : base(ComposeMessage(cluster, exitCodes))
    {
        (Cluster, TaskArns, ExitCodes) = (cluster, taskArns, exitCodes);
    }

    /// <summary>Builds the message with the exit codes joined by a comma and a space.</summary>
    private static string ComposeMessage(string cluster, IReadOnlyList<int> exitCodes)
    {
        var codes = string.Join(", ", exitCodes);
        return $"ECS task(s) failed in cluster '{cluster}' with exit codes: [{codes}]";
    }
}

View File

@@ -0,0 +1,17 @@
using StellaOps.Agent.Core.Models;
namespace StellaOps.Agent.Ecs;
/// <summary>
/// Contract shared by the ECS task handlers (for example <c>EcsDeployServiceTask</c> and
/// <c>EcsDescribeServiceTask</c>): each handler executes one kind of agent task.
/// </summary>
public interface IEcsTask
{
/// <summary>
/// Executes the ECS task described by <paramref name="task"/>.
/// </summary>
/// <param name="task">The agent task to execute, including its payload.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>The result of the task execution.</returns>
Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default);
}

View File

@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<!-- All warnings fail the build, except the ones listed in NoWarn below. -->
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<RootNamespace>StellaOps.Agent.Ecs</RootNamespace>
<Description>Stella Agent ECS Capability - manages AWS ECS services and tasks</Description>
<!-- AWS SDK v4 nullable annotations cause false positives with value type boxing to Dictionary<string, object> -->
<!-- NOTE(review): this suppresses CS8600/CS8601/CS8620 for the whole project, not only for the boxing sites; confirm that scope is intended. -->
<NoWarn>$(NoWarn);CS8600;CS8601;CS8620</NoWarn>
</PropertyGroup>
<ItemGroup>
<!-- No Version attributes here; presumably versions are managed centrally (e.g. Directory.Packages.props) - verify. -->
<PackageReference Include="AWSSDK.ECS" />
<PackageReference Include="AWSSDK.CloudWatchLogs" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\StellaOps.Agent.Core\StellaOps.Agent.Core.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,470 @@
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
using Task = System.Threading.Tasks.Task;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Task handler for deploying ECS services.
/// </summary>
public sealed class EcsDeployServiceTask : IEcsTask
{
private readonly IAmazonECS _ecsClient;
private readonly TimeProvider _timeProvider;
private readonly ILogger<EcsDeployServiceTask> _logger;
/// <summary>
/// Payload for deploying an ECS service. It is deserialized from the agent task's payload;
/// the handler updates the service when it already exists and is not INACTIVE, and creates
/// it otherwise. Several fields below are only applied on the create path.
/// </summary>
public sealed record DeployServicePayload
{
/// <summary>
/// Name or ARN of the ECS cluster.
/// </summary>
public required string Cluster { get; init; }
/// <summary>
/// Name of the service to deploy.
/// </summary>
public required string ServiceName { get; init; }
/// <summary>
/// Task definition family:revision or ARN. Applied on both create and update.
/// </summary>
public required string TaskDefinition { get; init; }
/// <summary>
/// Desired number of tasks. Defaults to 1; applied on both create and update.
/// </summary>
public int DesiredCount { get; init; } = 1;
/// <summary>
/// Launch type (FARGATE or EC2). Only applied when creating the service, and only when
/// the value is not null or empty.
/// </summary>
public string? LaunchType { get; init; }
/// <summary>
/// Network configuration for awsvpc mode. Only applied when creating the service.
/// </summary>
public NetworkConfigurationPayload? NetworkConfiguration { get; init; }
/// <summary>
/// Load balancer configuration. Only applied when creating the service.
/// </summary>
public LoadBalancerPayload? LoadBalancer { get; init; }
/// <summary>
/// Deployment configuration. Applied on both create and update when present.
/// </summary>
public DeploymentConfigPayload? DeploymentConfiguration { get; init; }
/// <summary>
/// Whether to force a new deployment. Defaults to true; only used when updating an
/// existing service.
/// </summary>
public bool ForceNewDeployment { get; init; } = true;
/// <summary>
/// Timeout waiting for deployment to stabilize. Defaults to 10 minutes.
/// </summary>
public TimeSpan DeploymentTimeout { get; init; } = TimeSpan.FromMinutes(10);
/// <summary>
/// Tags to apply to the service. Only applied when creating the service.
/// </summary>
public IReadOnlyDictionary<string, string>? Tags { get; init; }
}
/// <summary>
/// Network configuration payload; mapped to the awsvpc configuration of the create-service
/// request.
/// </summary>
public sealed record NetworkConfigurationPayload
{
/// <summary>
/// Subnet IDs.
/// </summary>
public required IReadOnlyList<string> Subnets { get; init; }
/// <summary>
/// Security group IDs. Optional; passed through as null when omitted.
/// </summary>
public IReadOnlyList<string>? SecurityGroups { get; init; }
/// <summary>
/// Whether to assign a public IP. True maps to ENABLED, false (the default) to DISABLED.
/// </summary>
public bool AssignPublicIp { get; init; }
}
/// <summary>
/// Load balancer configuration payload; becomes the single load balancer entry of the
/// create-service request.
/// </summary>
public sealed record LoadBalancerPayload
{
/// <summary>
/// Target group ARN.
/// </summary>
public required string TargetGroupArn { get; init; }
/// <summary>
/// Container name for the target.
/// </summary>
public required string ContainerName { get; init; }
/// <summary>
/// Container port.
/// </summary>
public required int ContainerPort { get; init; }
}
/// <summary>
/// Deployment configuration payload; mapped to the ECS deployment configuration and its
/// deployment circuit breaker on both create and update.
/// </summary>
public sealed record DeploymentConfigPayload
{
/// <summary>
/// Maximum percent during deployment. Defaults to 200.
/// </summary>
public int MaximumPercent { get; init; } = 200;
/// <summary>
/// Minimum healthy percent. Defaults to 100.
/// </summary>
public int MinimumHealthyPercent { get; init; } = 100;
/// <summary>
/// Enable deployment circuit breaker. Defaults to true.
/// </summary>
public bool EnableCircuitBreaker { get; init; } = true;
/// <summary>
/// Enable rollback on failure (the circuit breaker's rollback flag). Defaults to true.
/// </summary>
public bool EnableRollback { get; init; } = true;
}
/// <summary>
/// Creates a new ECS deploy service task handler.
/// </summary>
/// <param name="ecsClient">Client used for all ECS calls.</param>
/// <param name="timeProvider">Clock used for result timestamps.</param>
/// <param name="logger">Logger for deployment progress and failures.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public EcsDeployServiceTask(
    IAmazonECS ecsClient,
    TimeProvider timeProvider,
    ILogger<EcsDeployServiceTask> logger)
{
    // Fail at construction instead of with a NullReferenceException in the middle of a deployment.
    _ecsClient = ecsClient ?? throw new ArgumentNullException(nameof(ecsClient));
    _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
/// <remarks>
/// Deserializes the payload, then updates the service when it already exists and is not
/// INACTIVE, or creates it otherwise. ECS API errors are logged and returned as a failed
/// result instead of being thrown.
/// </remarks>
public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
{
    var payload = JsonSerializer.Deserialize<DeployServicePayload>(task.Payload);
    if (payload is null)
    {
        throw new InvalidEcsPayloadException("ecs.deploy", "Failed to deserialize payload");
    }

    _logger.LogInformation(
        "Deploying ECS service {Service} to cluster {Cluster} with task definition {TaskDef}",
        payload.ServiceName,
        payload.Cluster,
        payload.TaskDefinition);

    try
    {
        // An INACTIVE service cannot be reused, so it is treated like a missing one.
        var current = await GetServiceAsync(payload.Cluster, payload.ServiceName, ct);
        var canUpdate = current is not null && current.Status != "INACTIVE";

        return canUpdate
            ? await UpdateServiceAsync(task.Id, payload, ct)
            : await CreateServiceAsync(task.Id, payload, ct);
    }
    catch (AmazonECSException ex)
    {
        _logger.LogError(ex, "Failed to deploy ECS service {Service}", payload.ServiceName);

        var failure = new AgentTaskResult
        {
            TaskId = task.Id,
            Success = false,
            Error = $"ECS deployment failed: {ex.Message}",
            CompletedAt = _timeProvider.GetUtcNow()
        };
        return failure;
    }
}
/// <summary>
/// Looks up a service by name. Returns <c>null</c> when the service is not in the response
/// or when the lookup fails; the caller then takes the create path, where a persistent
/// problem will surface again.
/// </summary>
/// <param name="cluster">Name or ARN of the cluster.</param>
/// <param name="serviceName">Name of the service to look up.</param>
/// <param name="ct">Cancellation token.</param>
/// <exception cref="OperationCanceledException">The lookup was cancelled.</exception>
private async Task<Service?> GetServiceAsync(string cluster, string serviceName, CancellationToken ct)
{
    try
    {
        var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
        {
            Cluster = cluster,
            Services = new List<string> { serviceName }
        }, ct);

        // The services collection may be null when nothing matched.
        return response.Services?.FirstOrDefault();
    }
    catch (OperationCanceledException)
    {
        // Cancellation is not "service missing"; let the caller stop.
        throw;
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract, but leave a trace of why the lookup failed.
        _logger.LogWarning(
            ex,
            "Could not look up ECS service {Service} in cluster {Cluster}; treating it as absent",
            serviceName,
            cluster);
        return null;
    }
}
/// <summary>
/// Creates the service described by <paramref name="payload"/>, waits for it to stabilize
/// and reports the outcome.
/// </summary>
/// <param name="taskId">Id of the agent task, copied into the result.</param>
/// <param name="payload">Deployment settings.</param>
/// <param name="ct">Cancellation token.</param>
private async Task<AgentTaskResult> CreateServiceAsync(
    Guid taskId,
    DeployServicePayload payload,
    CancellationToken ct)
{
    _logger.LogInformation("Creating new ECS service {Service}", payload.ServiceName);

    var request = new CreateServiceRequest
    {
        Cluster = payload.Cluster,
        ServiceName = payload.ServiceName,
        TaskDefinition = payload.TaskDefinition,
        DesiredCount = payload.DesiredCount
    };

    if (!string.IsNullOrEmpty(payload.LaunchType))
    {
        request.LaunchType = new LaunchType(payload.LaunchType);
    }

    if (payload.NetworkConfiguration is not null)
    {
        request.NetworkConfiguration = new NetworkConfiguration
        {
            AwsvpcConfiguration = new AwsVpcConfiguration
            {
                Subnets = payload.NetworkConfiguration.Subnets.ToList(),
                SecurityGroups = payload.NetworkConfiguration.SecurityGroups?.ToList(),
                AssignPublicIp = payload.NetworkConfiguration.AssignPublicIp
                    ? AssignPublicIp.ENABLED
                    : AssignPublicIp.DISABLED
            }
        };
    }

    if (payload.LoadBalancer is not null)
    {
        request.LoadBalancers = new List<LoadBalancer>
        {
            new()
            {
                TargetGroupArn = payload.LoadBalancer.TargetGroupArn,
                ContainerName = payload.LoadBalancer.ContainerName,
                ContainerPort = payload.LoadBalancer.ContainerPort
            }
        };
    }

    if (payload.DeploymentConfiguration is not null)
    {
        request.DeploymentConfiguration = BuildSdkDeploymentConfiguration(payload.DeploymentConfiguration);
    }

    if (payload.Tags is not null)
    {
        request.Tags = payload.Tags.Select(kv => new Tag { Key = kv.Key, Value = kv.Value }).ToList();
    }

    var createResponse = await _ecsClient.CreateServiceAsync(request, ct);
    if (createResponse.Service is not { } service)
    {
        return BuildFailureResult(taskId, "Service creation returned no service object");
    }

    _logger.LogInformation(
        "Created ECS service {Service} (ARN: {Arn})",
        payload.ServiceName,
        service.ServiceArn);

    // Wait for deployment to stabilize
    var stable = await WaitForServiceStableAsync(
        payload.Cluster,
        payload.ServiceName,
        payload.DeploymentTimeout,
        ct);

    return BuildDeploymentResult(taskId, service, stable, "create", deploymentId: null);
}

/// <summary>
/// Points an existing service at the payload's task definition and desired count, waits
/// for the deployment to stabilize and reports the outcome. Launch type, network
/// configuration, load balancer and tags from the payload are not sent on this path.
/// </summary>
/// <param name="taskId">Id of the agent task, copied into the result.</param>
/// <param name="payload">Deployment settings.</param>
/// <param name="ct">Cancellation token.</param>
private async Task<AgentTaskResult> UpdateServiceAsync(
    Guid taskId,
    DeployServicePayload payload,
    CancellationToken ct)
{
    _logger.LogInformation(
        "Updating existing ECS service {Service} to task definition {TaskDef}",
        payload.ServiceName,
        payload.TaskDefinition);

    var request = new UpdateServiceRequest
    {
        Cluster = payload.Cluster,
        Service = payload.ServiceName,
        TaskDefinition = payload.TaskDefinition,
        DesiredCount = payload.DesiredCount,
        ForceNewDeployment = payload.ForceNewDeployment
    };

    if (payload.DeploymentConfiguration is not null)
    {
        request.DeploymentConfiguration = BuildSdkDeploymentConfiguration(payload.DeploymentConfiguration);
    }

    var updateResponse = await _ecsClient.UpdateServiceAsync(request, ct);
    if (updateResponse.Service is not { } service)
    {
        return BuildFailureResult(taskId, "Service update returned no service object");
    }

    // The deployments collection may be null; the update has already succeeded at this
    // point, so a missing list must not turn it into a failure.
    var deploymentId = service.Deployments?.FirstOrDefault()?.Id;

    _logger.LogInformation(
        "Updated ECS service {Service}, deployment ID: {DeploymentId}",
        payload.ServiceName,
        deploymentId ?? "unknown");

    // Wait for deployment to stabilize
    var stable = await WaitForServiceStableAsync(
        payload.Cluster,
        payload.ServiceName,
        payload.DeploymentTimeout,
        ct);

    return BuildDeploymentResult(taskId, service, stable, "update", deploymentId ?? "");
}

/// <summary>Maps the payload's deployment settings to the ECS request model.</summary>
private static DeploymentConfiguration BuildSdkDeploymentConfiguration(DeploymentConfigPayload config)
{
    return new DeploymentConfiguration
    {
        MaximumPercent = config.MaximumPercent,
        MinimumHealthyPercent = config.MinimumHealthyPercent,
        DeploymentCircuitBreaker = new DeploymentCircuitBreaker
        {
            Enable = config.EnableCircuitBreaker,
            Rollback = config.EnableRollback
        }
    };
}

/// <summary>Builds a failed result that carries only an error message and a timestamp.</summary>
private AgentTaskResult BuildFailureResult(Guid taskId, string error)
{
    return new AgentTaskResult
    {
        TaskId = taskId,
        Success = false,
        Error = error,
        CompletedAt = _timeProvider.GetUtcNow()
    };
}

/// <summary>
/// Builds the result returned after waiting for stabilization. The "deploymentId" output
/// is only added when <paramref name="deploymentId"/> is not <c>null</c> (update path).
/// </summary>
private AgentTaskResult BuildDeploymentResult(
    Guid taskId,
    Service service,
    bool stable,
    string operation,
    string? deploymentId)
{
    // NOTE(review): runningCount/desiredCount come from the create/update response, which
    // was received before the stabilization wait, so they may not reflect the final state.
    var outputs = new Dictionary<string, object>
    {
        ["serviceArn"] = service.ServiceArn ?? "",
        ["serviceName"] = service.ServiceName ?? "",
        ["taskDefinition"] = service.TaskDefinition ?? "",
        ["runningCount"] = service.RunningCount,
        ["desiredCount"] = service.DesiredCount
    };

    if (deploymentId is not null)
    {
        outputs["deploymentId"] = deploymentId;
    }

    outputs["deploymentStatus"] = stable ? "COMPLETED" : "TIMED_OUT";
    outputs["operation"] = operation;

    return new AgentTaskResult
    {
        TaskId = taskId,
        Success = stable,
        Error = stable ? null : "Service did not stabilize within timeout",
        Outputs = outputs,
        CompletedAt = _timeProvider.GetUtcNow()
    };
}
/// <summary>
/// Polls the service every 10 seconds until its PRIMARY deployment is the only deployment
/// and runs as many tasks as desired.
/// </summary>
/// <param name="cluster">Name or ARN of the cluster.</param>
/// <param name="serviceName">Name of the service to watch.</param>
/// <param name="timeout">Maximum time to wait.</param>
/// <param name="ct">Cancellation token of the caller.</param>
/// <returns>
/// <c>true</c> when the service stabilized; <c>false</c> when the timeout elapsed or the
/// service disappeared from the response.
/// </returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="ct"/> was cancelled before the timeout elapsed.
/// </exception>
private async Task<bool> WaitForServiceStableAsync(
    string cluster,
    string serviceName,
    TimeSpan timeout,
    CancellationToken ct)
{
    _logger.LogInformation("Waiting for service {Service} to stabilize", serviceName);

    var pollInterval = TimeSpan.FromSeconds(10);
    using var timeoutCts = new CancellationTokenSource(timeout);
    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

    try
    {
        while (!linkedCts.IsCancellationRequested)
        {
            var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
            {
                Cluster = cluster,
                Services = new List<string> { serviceName }
            }, linkedCts.Token);

            // Both collections may be null, so guard before using LINQ on them.
            var service = response.Services?.FirstOrDefault();
            if (service is null)
            {
                _logger.LogWarning("Service {Service} not found during stabilization check", serviceName);
                return false;
            }

            var deployments = service.Deployments;
            var deploymentCount = deployments?.Count ?? 0;
            var primaryDeployment = deployments?.FirstOrDefault(d => d.Status == "PRIMARY");

            if (primaryDeployment is not null)
            {
                if (primaryDeployment.RunningCount == primaryDeployment.DesiredCount &&
                    deploymentCount == 1)
                {
                    _logger.LogInformation(
                        "Service {Service} stabilized with {Count} running tasks",
                        serviceName,
                        primaryDeployment.RunningCount);
                    return true;
                }

                _logger.LogDebug(
                    "Service {Service} not stable: running={Running}, desired={Desired}, deployments={Deployments}",
                    serviceName,
                    primaryDeployment.RunningCount,
                    primaryDeployment.DesiredCount,
                    deploymentCount);
            }

            await Task.Delay(pollInterval, linkedCts.Token);
        }

        // The loop ended because cancellation was seen at the loop check. Surface caller
        // cancellation the same way an in-flight call would; otherwise it was the timeout.
        ct.ThrowIfCancellationRequested();
        _logger.LogWarning("Service {Service} stabilization timed out after {Timeout}", serviceName, timeout);
    }
    catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
    {
        _logger.LogWarning("Service {Service} stabilization timed out after {Timeout}", serviceName, timeout);
    }

    return false;
}
}

View File

@@ -0,0 +1,173 @@
using System.Globalization;
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
using Task = System.Threading.Tasks.Task;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Task handler for describing ECS services.
/// </summary>
public sealed class EcsDescribeServiceTask : IEcsTask
{
private readonly IAmazonECS _ecsClient;
private readonly TimeProvider _timeProvider;
private readonly ILogger<EcsDescribeServiceTask> _logger;
/// <summary>
/// Payload for describing an ECS service; deserialized from the agent task's payload.
/// </summary>
public sealed record DescribeServicePayload
{
/// <summary>
/// Name or ARN of the ECS cluster.
/// </summary>
public required string Cluster { get; init; }
/// <summary>
/// Name of the service to describe.
/// </summary>
public required string ServiceName { get; init; }
/// <summary>
/// Whether to include task information. When true, the handler also lists and describes
/// the service's tasks and adds them to the result under the "tasks" output. Defaults to false.
/// </summary>
public bool IncludeTasks { get; init; } = false;
}
/// <summary>
/// Creates a new ECS describe service task handler.
/// </summary>
/// <param name="ecsClient">Client used for all ECS calls.</param>
/// <param name="timeProvider">Clock used for result timestamps.</param>
/// <param name="logger">Logger for progress and failures.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public EcsDescribeServiceTask(
    IAmazonECS ecsClient,
    TimeProvider timeProvider,
    ILogger<EcsDescribeServiceTask> logger)
{
    // Fail at construction instead of with a NullReferenceException on first use.
    _ecsClient = ecsClient ?? throw new ArgumentNullException(nameof(ecsClient));
    _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
/// <remarks>
/// Describes the service and returns its state as outputs; when the payload asks for it,
/// the service's tasks and their containers are included under the "tasks" output. A
/// missing service and ECS API errors are returned as failed results, not thrown.
/// </remarks>
public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
{
    var payload = JsonSerializer.Deserialize<DescribeServicePayload>(task.Payload)
        ?? throw new InvalidEcsPayloadException("ecs.describe", "Failed to deserialize payload");

    _logger.LogInformation(
        "Describing ECS service {Service} in cluster {Cluster}",
        payload.ServiceName,
        payload.Cluster);

    try
    {
        var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
        {
            Cluster = payload.Cluster,
            Services = new List<string> { payload.ServiceName }
        }, ct);

        // The services collection may be null when nothing matched.
        if (response.Services?.FirstOrDefault() is not { } service)
        {
            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Service '{payload.ServiceName}' not found",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }

        // Likewise the deployments list; treat a missing list as "no deployments".
        var deploymentCount = service.Deployments?.Count ?? 0;
        var deployments = service.Deployments?.Select(d => new Dictionary<string, object>
        {
            ["id"] = d.Id ?? "",
            ["status"] = d.Status ?? "",
            ["taskDefinition"] = d.TaskDefinition ?? "",
            ["desiredCount"] = d.DesiredCount,
            ["runningCount"] = d.RunningCount,
            ["pendingCount"] = d.PendingCount,
            ["createdAt"] = FormatTimestamp(d.CreatedAt)
        }).ToList() ?? new List<Dictionary<string, object>>();

        var outputs = new Dictionary<string, object>
        {
            ["serviceArn"] = service.ServiceArn ?? "",
            ["serviceName"] = service.ServiceName ?? "",
            ["clusterArn"] = service.ClusterArn ?? "",
            ["status"] = service.Status ?? "",
            ["taskDefinition"] = service.TaskDefinition ?? "",
            ["desiredCount"] = service.DesiredCount,
            ["runningCount"] = service.RunningCount,
            ["pendingCount"] = service.PendingCount,
            ["launchType"] = service.LaunchType?.Value ?? "unknown",
            ["deploymentCount"] = deploymentCount,
            ["createdAt"] = FormatTimestamp(service.CreatedAt),
            ["deployments"] = deployments
        };

        // Include tasks if requested
        if (payload.IncludeTasks)
        {
            var tasks = await DescribeServiceTasksAsync(payload, ct);
            if (tasks is not null)
            {
                outputs["tasks"] = tasks;
            }
        }

        _logger.LogInformation(
            "Described ECS service {Service}: {Running}/{Desired} running, {Deployments} deployments",
            payload.ServiceName,
            service.RunningCount,
            service.DesiredCount,
            deploymentCount);

        return new AgentTaskResult
        {
            TaskId = task.Id,
            Success = true,
            Outputs = outputs,
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }
    catch (AmazonECSException ex)
    {
        _logger.LogError(ex, "Failed to describe ECS service {Service}", payload.ServiceName);
        return new AgentTaskResult
        {
            TaskId = task.Id,
            Success = false,
            Error = $"Failed to describe service: {ex.Message}",
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }
}

/// <summary>
/// Lists the service's tasks and describes them. Returns <c>null</c> when the service has
/// no tasks, so the caller can leave the "tasks" output out.
/// </summary>
private async Task<List<Dictionary<string, object>>?> DescribeServiceTasksAsync(
    DescribeServicePayload payload,
    CancellationToken ct)
{
    // NOTE(review): only the first ListTasks page is read; services with more tasks than
    // one page holds would be reported partially - confirm whether paging is needed.
    var tasksResponse = await _ecsClient.ListTasksAsync(new ListTasksRequest
    {
        Cluster = payload.Cluster,
        ServiceName = payload.ServiceName
    }, ct);

    if (tasksResponse.TaskArns is not { Count: > 0 } taskArns)
    {
        return null;
    }

    var describeTasksResponse = await _ecsClient.DescribeTasksAsync(new DescribeTasksRequest
    {
        Cluster = payload.Cluster,
        Tasks = taskArns
    }, ct);

    return describeTasksResponse.Tasks?.Select(t => new Dictionary<string, object>
    {
        ["taskArn"] = t.TaskArn ?? "",
        ["taskDefinitionArn"] = t.TaskDefinitionArn ?? "",
        ["lastStatus"] = t.LastStatus ?? "",
        ["desiredStatus"] = t.DesiredStatus ?? "",
        ["healthStatus"] = t.HealthStatus?.Value ?? "unknown",
        ["createdAt"] = FormatTimestamp(t.CreatedAt),
        ["containers"] = t.Containers?.Select(c => new Dictionary<string, object>
        {
            ["name"] = c.Name ?? "",
            ["lastStatus"] = c.LastStatus ?? "",
            ["exitCode"] = c.ExitCode ?? -1,
            ["healthStatus"] = c.HealthStatus?.Value ?? "unknown"
        }).ToList() ?? new List<Dictionary<string, object>>()
    }).ToList() ?? new List<Dictionary<string, object>>();
}

/// <summary>
/// Formats a timestamp as a culture-invariant round-trip ("o") string in UTC; a missing
/// value is formatted as the default <see cref="DateTime"/>.
/// </summary>
private static string FormatTimestamp(DateTime? value) =>
    value.GetValueOrDefault().ToUniversalTime().ToString("o", CultureInfo.InvariantCulture);
}

View File

@@ -0,0 +1,233 @@
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
using Task = System.Threading.Tasks.Task;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Task handler for checking ECS service health.
/// </summary>
/// <remarks>
/// A service is treated as healthy when the integer percentage of running tasks relative to
/// the desired count reaches <see cref="HealthCheckPayload.MinHealthyPercent"/> and the
/// service lists exactly one deployment. A desired count of zero evaluates to 0% running.
/// </remarks>
public sealed class EcsHealthCheckTask : IEcsTask
{
    /// <summary>
    /// Pause between two consecutive polls while waiting for the service to become healthy.
    /// </summary>
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(10);

    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsHealthCheckTask> _logger;

    /// <summary>
    /// Payload for checking ECS service health.
    /// </summary>
    public sealed record HealthCheckPayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Name of the service to check.
        /// </summary>
        public required string ServiceName { get; init; }

        /// <summary>
        /// Minimum healthy percent to consider the service healthy.
        /// </summary>
        public int MinHealthyPercent { get; init; } = 100;

        /// <summary>
        /// Whether to wait for the service to become healthy.
        /// When <c>false</c>, the service is inspected exactly once.
        /// </summary>
        public bool WaitForHealthy { get; init; } = true;

        /// <summary>
        /// Timeout for waiting for health. Only used when <see cref="WaitForHealthy"/> is set.
        /// </summary>
        public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(5);
    }

    /// <summary>
    /// Creates a new ECS health check task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used to describe the service.</param>
    /// <param name="timeProvider">Clock used for result timestamps, the wait timeout and the poll delay.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsHealthCheckTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsHealthCheckTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<HealthCheckPayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.health", "Failed to deserialize payload");

        _logger.LogInformation(
            "Checking health of ECS service {Service} in cluster {Cluster}",
            payload.ServiceName,
            payload.Cluster);

        try
        {
            if (!payload.WaitForHealthy)
            {
                return await CheckHealthOnceAsync(task.Id, payload, ct);
            }

            return await WaitForHealthyAsync(task.Id, payload, ct);
        }
        catch (AmazonECSException ex)
        {
            // Only ECS API errors are converted into a failed result; cancellation propagates.
            _logger.LogError(ex, "Failed to check health of ECS service {Service}", payload.ServiceName);
            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Health check failed: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }

    /// <summary>
    /// Describes the service a single time and reports whether it is currently healthy.
    /// </summary>
    /// <param name="taskId">Identifier copied into the returned result.</param>
    /// <param name="payload">Health check parameters.</param>
    /// <param name="ct">Cancellation token passed to the ECS call.</param>
    /// <returns>A result whose <c>Success</c> flag mirrors the health verdict.</returns>
    private async Task<AgentTaskResult> CheckHealthOnceAsync(
        Guid taskId,
        HealthCheckPayload payload,
        CancellationToken ct)
    {
        var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
        {
            Cluster = payload.Cluster,
            Services = new List<string> { payload.ServiceName }
        }, ct);

        // The SDK may hand back a null collection instead of an empty one.
        var service = response.Services?.FirstOrDefault();
        if (service is null)
        {
            return ServiceNotFound(taskId, payload.ServiceName);
        }

        var healthyPercent = service.DesiredCount > 0
            ? (service.RunningCount * 100) / service.DesiredCount
            : 0;
        var deploymentCount = service.Deployments?.Count ?? 0;
        var hasEnoughTasks = healthyPercent >= payload.MinHealthyPercent;
        var isHealthy = hasEnoughTasks && deploymentCount == 1;

        string? error = null;
        if (!isHealthy)
        {
            // Name the condition that actually failed: with enough running tasks the only
            // remaining reason is the deployment count.
            error = hasEnoughTasks
                ? $"Service unhealthy: {deploymentCount} deployments present (expected exactly 1)"
                : $"Service unhealthy: {healthyPercent}% running (minimum: {payload.MinHealthyPercent}%)";
        }

        return new AgentTaskResult
        {
            TaskId = taskId,
            Success = isHealthy,
            Error = error,
            Outputs = new Dictionary<string, object>
            {
                ["serviceName"] = service.ServiceName ?? "",
                ["serviceArn"] = service.ServiceArn ?? "",
                ["runningCount"] = service.RunningCount,
                ["desiredCount"] = service.DesiredCount,
                ["healthyPercent"] = healthyPercent,
                ["status"] = service.Status ?? "",
                ["deploymentCount"] = deploymentCount,
                ["isHealthy"] = isHealthy
            },
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }

    /// <summary>
    /// Polls the service every <see cref="PollInterval"/> until it is healthy, it can no
    /// longer be found, or <see cref="HealthCheckPayload.Timeout"/> elapses.
    /// </summary>
    /// <param name="taskId">Identifier copied into the returned result.</param>
    /// <param name="payload">Health check parameters.</param>
    /// <param name="ct">Caller cancellation; propagates as <see cref="OperationCanceledException"/>.</param>
    /// <returns>A successful result once healthy, otherwise a failed result.</returns>
    private async Task<AgentTaskResult> WaitForHealthyAsync(
        Guid taskId,
        HealthCheckPayload payload,
        CancellationToken ct)
    {
        // Timeout and delay run on the injected clock so a fake TimeProvider controls them.
        using var timeoutCts = new CancellationTokenSource(payload.Timeout, _timeProvider);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            while (true)
            {
                // Throwing here (instead of leaving the loop) keeps caller cancellation from
                // being reported as a timeout.
                linkedCts.Token.ThrowIfCancellationRequested();

                var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
                {
                    Cluster = payload.Cluster,
                    Services = new List<string> { payload.ServiceName }
                }, linkedCts.Token);

                var service = response.Services?.FirstOrDefault();
                if (service is null)
                {
                    return ServiceNotFound(taskId, payload.ServiceName);
                }

                var healthyPercent = service.DesiredCount > 0
                    ? (service.RunningCount * 100) / service.DesiredCount
                    : 0;
                var deploymentCount = service.Deployments?.Count ?? 0;

                if (healthyPercent >= payload.MinHealthyPercent && deploymentCount == 1)
                {
                    _logger.LogInformation(
                        "Service {Service} is healthy: {Running}/{Desired} tasks running ({Percent}%)",
                        payload.ServiceName,
                        service.RunningCount,
                        service.DesiredCount,
                        healthyPercent);

                    return new AgentTaskResult
                    {
                        TaskId = taskId,
                        Success = true,
                        Outputs = new Dictionary<string, object>
                        {
                            ["serviceName"] = service.ServiceName ?? "",
                            ["serviceArn"] = service.ServiceArn ?? "",
                            ["runningCount"] = service.RunningCount,
                            ["desiredCount"] = service.DesiredCount,
                            ["healthyPercent"] = healthyPercent,
                            ["status"] = service.Status ?? "",
                            ["deploymentCount"] = deploymentCount,
                            ["isHealthy"] = true
                        },
                        CompletedAt = _timeProvider.GetUtcNow()
                    };
                }

                _logger.LogDebug(
                    "Service {Service} health check: {Running}/{Desired} ({Percent}%), waiting...",
                    payload.ServiceName,
                    service.RunningCount,
                    service.DesiredCount,
                    healthyPercent);

                await Task.Delay(PollInterval, _timeProvider, linkedCts.Token);
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            _logger.LogWarning(
                "Health check timed out after {Timeout} for service {Service}",
                payload.Timeout,
                payload.ServiceName);

            return new AgentTaskResult
            {
                TaskId = taskId,
                Success = false,
                Error = $"Health check timed out after {payload.Timeout}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }

    /// <summary>
    /// Builds the failed result returned when the described service is absent from the response.
    /// </summary>
    private AgentTaskResult ServiceNotFound(Guid taskId, string serviceName) => new()
    {
        TaskId = taskId,
        Success = false,
        Error = $"Service '{serviceName}' not found",
        CompletedAt = _timeProvider.GetUtcNow()
    };
}

View File

@@ -0,0 +1,282 @@
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Agent task that registers a new ECS task definition from a JSON payload.
/// </summary>
public sealed class EcsRegisterTaskDefinitionTask : IEcsTask
{
    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsRegisterTaskDefinitionTask> _logger;

    /// <summary>
    /// Input describing the task definition to register.
    /// </summary>
    public sealed record RegisterTaskDefinitionPayload
    {
        /// <summary>
        /// Task definition family name.
        /// </summary>
        public required string Family { get; init; }

        /// <summary>
        /// Containers that make up the task definition.
        /// </summary>
        public required IReadOnlyList<ContainerDefinitionPayload> ContainerDefinitions { get; init; }

        /// <summary>
        /// CPU value at task level; forwarded unchanged.
        /// </summary>
        public string? Cpu { get; init; }

        /// <summary>
        /// Memory value at task level; forwarded unchanged.
        /// </summary>
        public string? Memory { get; init; }

        /// <summary>
        /// Network mode; only applied when non-empty.
        /// </summary>
        public string? NetworkMode { get; init; }

        /// <summary>
        /// ARN of the task role.
        /// </summary>
        public string? TaskRoleArn { get; init; }

        /// <summary>
        /// ARN of the execution role.
        /// </summary>
        public string? ExecutionRoleArn { get; init; }

        /// <summary>
        /// Required capabilities (FARGATE, EC2); only applied when present.
        /// </summary>
        public IReadOnlyList<string>? RequiresCompatibilities { get; init; }

        /// <summary>
        /// Tags for the task definition; only applied when present.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Tags { get; init; }
    }

    /// <summary>
    /// Input describing a single container of the task definition.
    /// </summary>
    public sealed record ContainerDefinitionPayload
    {
        /// <summary>
        /// Name of the container.
        /// </summary>
        public required string Name { get; init; }

        /// <summary>
        /// Image the container runs.
        /// </summary>
        public required string Image { get; init; }

        /// <summary>
        /// CPU units for the container; 0 is sent when omitted.
        /// </summary>
        public int? Cpu { get; init; }

        /// <summary>
        /// Container memory in MB.
        /// </summary>
        public int? Memory { get; init; }

        /// <summary>
        /// Container memory reservation in MB.
        /// </summary>
        public int? MemoryReservation { get; init; }

        /// <summary>
        /// Ports exposed by the container.
        /// </summary>
        public IReadOnlyList<PortMappingPayload>? PortMappings { get; init; }

        /// <summary>
        /// Environment variables as name/value pairs.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Environment { get; init; }

        /// <summary>
        /// Marks the container as essential; defaults to <c>true</c>.
        /// </summary>
        public bool Essential { get; init; } = true;

        /// <summary>
        /// Entry point override.
        /// </summary>
        public IReadOnlyList<string>? EntryPoint { get; init; }

        /// <summary>
        /// Command override.
        /// </summary>
        public IReadOnlyList<string>? Command { get; init; }

        /// <summary>
        /// Logging setup for the container.
        /// </summary>
        public LogConfigurationPayload? LogConfiguration { get; init; }
    }

    /// <summary>
    /// Input describing one port mapping.
    /// </summary>
    public sealed record PortMappingPayload
    {
        /// <summary>
        /// Port inside the container.
        /// </summary>
        public required int ContainerPort { get; init; }

        /// <summary>
        /// Port on the host; the container port is used when omitted.
        /// </summary>
        public int? HostPort { get; init; }

        /// <summary>
        /// Protocol (tcp or udp); defaults to tcp.
        /// </summary>
        public string Protocol { get; init; } = "tcp";
    }

    /// <summary>
    /// Input describing a container's log configuration.
    /// </summary>
    public sealed record LogConfigurationPayload
    {
        /// <summary>
        /// Name of the log driver.
        /// </summary>
        public required string LogDriver { get; init; }

        /// <summary>
        /// Options passed to the log driver.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Options { get; init; }
    }

    /// <summary>
    /// Initializes the handler with its collaborators.
    /// </summary>
    /// <param name="ecsClient">ECS client that performs the registration.</param>
    /// <param name="timeProvider">Clock used for result timestamps.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsRegisterTaskDefinitionTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsRegisterTaskDefinitionTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<RegisterTaskDefinitionPayload>(task.Payload);
        if (payload is null)
        {
            throw new InvalidEcsPayloadException("ecs.register", "Failed to deserialize payload");
        }

        _logger.LogInformation(
            "Registering ECS task definition for family {Family}",
            payload.Family);

        try
        {
            var registration = await _ecsClient.RegisterTaskDefinitionAsync(BuildRequest(payload), ct);
            var registered = registration.TaskDefinition;

            _logger.LogInformation(
                "Registered ECS task definition {Family}:{Revision} (ARN: {Arn})",
                registered.Family,
                registered.Revision,
                registered.TaskDefinitionArn);

            var outputs = new Dictionary<string, object>
            {
                ["taskDefinitionArn"] = registered.TaskDefinitionArn ?? "",
                ["family"] = registered.Family ?? "",
                ["revision"] = registered.Revision,
                ["status"] = registered.Status?.Value ?? "",
                ["containerCount"] = registered.ContainerDefinitions.Count
            };

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = true,
                Outputs = outputs,
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            // ECS API errors become a failed result rather than an exception.
            _logger.LogError(ex, "Failed to register ECS task definition for family {Family}", payload.Family);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Failed to register task definition: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }

    /// <summary>
    /// Translates the payload into the SDK request; optional sections are set only when supplied.
    /// </summary>
    private static RegisterTaskDefinitionRequest BuildRequest(RegisterTaskDefinitionPayload payload)
    {
        var request = new RegisterTaskDefinitionRequest
        {
            Family = payload.Family,
            Cpu = payload.Cpu,
            Memory = payload.Memory,
            TaskRoleArn = payload.TaskRoleArn,
            ExecutionRoleArn = payload.ExecutionRoleArn,
            ContainerDefinitions = payload.ContainerDefinitions.Select(ToContainerDefinition).ToList()
        };

        if (payload.NetworkMode is { Length: > 0 } networkMode)
        {
            request.NetworkMode = new NetworkMode(networkMode);
        }

        if (payload.RequiresCompatibilities is { } compatibilities)
        {
            request.RequiresCompatibilities = compatibilities.ToList();
        }

        if (payload.Tags is { } tags)
        {
            request.Tags = tags.Select(pair => new Tag { Key = pair.Key, Value = pair.Value }).ToList();
        }

        return request;
    }

    /// <summary>
    /// Maps one container payload onto the SDK container definition.
    /// </summary>
    private static ContainerDefinition ToContainerDefinition(ContainerDefinitionPayload source)
    {
        var logging = source.LogConfiguration;

        return new ContainerDefinition
        {
            Name = source.Name,
            Image = source.Image,
            Cpu = source.Cpu ?? 0,
            Memory = source.Memory,
            MemoryReservation = source.MemoryReservation,
            Essential = source.Essential,
            EntryPoint = source.EntryPoint?.ToList(),
            Command = source.Command?.ToList(),
            PortMappings = source.PortMappings?.Select(ToPortMapping).ToList(),
            Environment = source.Environment?.Select(pair => new Amazon.ECS.Model.KeyValuePair
            {
                Name = pair.Key,
                Value = pair.Value
            }).ToList(),
            LogConfiguration = logging is null
                ? null
                : new LogConfiguration
                {
                    LogDriver = logging.LogDriver,
                    Options = logging.Options?.ToDictionary(pair => pair.Key, pair => pair.Value)
                }
        };
    }

    /// <summary>
    /// Maps one port mapping payload; the host port falls back to the container port.
    /// </summary>
    private static PortMapping ToPortMapping(PortMappingPayload source) => new()
    {
        ContainerPort = source.ContainerPort,
        HostPort = source.HostPort ?? source.ContainerPort,
        Protocol = source.Protocol
    };
}

View File

@@ -0,0 +1,331 @@
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
using Task = System.Threading.Tasks.Task;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Task handler for running ECS tasks.
/// </summary>
/// <remarks>
/// Starts one or more tasks via <c>RunTask</c> and, when requested, polls <c>DescribeTasks</c>
/// until every started task reports <c>STOPPED</c> or the completion timeout elapses.
/// </remarks>
public sealed class EcsRunTaskTask : IEcsTask
{
    /// <summary>
    /// Pause between two consecutive polls while waiting for the tasks to stop.
    /// </summary>
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(10);

    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsRunTaskTask> _logger;

    /// <summary>
    /// Payload for running an ECS task.
    /// </summary>
    public sealed record RunTaskPayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Task definition family:revision or ARN.
        /// </summary>
        public required string TaskDefinition { get; init; }

        /// <summary>
        /// Number of tasks to run.
        /// </summary>
        public int Count { get; init; } = 1;

        /// <summary>
        /// Launch type (FARGATE or EC2). Only applied when non-empty.
        /// </summary>
        public string? LaunchType { get; init; }

        /// <summary>
        /// Network configuration for awsvpc mode.
        /// </summary>
        public NetworkConfigurationPayload? NetworkConfiguration { get; init; }

        /// <summary>
        /// Container overrides.
        /// </summary>
        public IReadOnlyList<ContainerOverridePayload>? Overrides { get; init; }

        /// <summary>
        /// Task group.
        /// </summary>
        public string? Group { get; init; }

        /// <summary>
        /// Whether to wait for task completion.
        /// </summary>
        public bool WaitForCompletion { get; init; } = true;

        /// <summary>
        /// Timeout for waiting for completion.
        /// </summary>
        public TimeSpan CompletionTimeout { get; init; } = TimeSpan.FromMinutes(30);

        /// <summary>
        /// Tags to apply to the task.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Tags { get; init; }
    }

    /// <summary>
    /// Network configuration payload.
    /// </summary>
    public sealed record NetworkConfigurationPayload
    {
        /// <summary>
        /// Subnet IDs.
        /// </summary>
        public required IReadOnlyList<string> Subnets { get; init; }

        /// <summary>
        /// Security group IDs.
        /// </summary>
        public IReadOnlyList<string>? SecurityGroups { get; init; }

        /// <summary>
        /// Whether to assign a public IP.
        /// </summary>
        public bool AssignPublicIp { get; init; }
    }

    /// <summary>
    /// Container override payload.
    /// </summary>
    public sealed record ContainerOverridePayload
    {
        /// <summary>
        /// Container name.
        /// </summary>
        public required string Name { get; init; }

        /// <summary>
        /// Command override.
        /// </summary>
        public IReadOnlyList<string>? Command { get; init; }

        /// <summary>
        /// Environment variable overrides.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Environment { get; init; }

        /// <summary>
        /// CPU override.
        /// </summary>
        public int? Cpu { get; init; }

        /// <summary>
        /// Memory override.
        /// </summary>
        public int? Memory { get; init; }
    }

    /// <summary>
    /// Creates a new ECS run task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used to start and describe tasks.</param>
    /// <param name="timeProvider">Clock used for result timestamps, the wait timeout and the poll delay.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsRunTaskTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsRunTaskTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<RunTaskPayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.run", "Failed to deserialize payload");

        _logger.LogInformation(
            "Running ECS task from definition {TaskDef} on cluster {Cluster}",
            payload.TaskDefinition,
            payload.Cluster);

        try
        {
            var runResponse = await _ecsClient.RunTaskAsync(BuildRequest(payload), ct);

            if (runResponse.Failures is { Count: > 0 } failures)
            {
                var failure = failures[0];
                _logger.LogError(
                    "Failed to run ECS task: {Reason} (ARN: {Arn})",
                    failure.Reason,
                    failure.Arn);

                // The response can carry failures and started tasks at the same time; report
                // whatever was started so it is not silently left behind.
                var startedTaskArns = runResponse.Tasks?.Select(t => t.TaskArn).ToList() ?? new List<string>();

                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = false,
                    Error = $"Failed to run task: {failure.Reason}",
                    Outputs = new Dictionary<string, object>
                    {
                        ["startedTaskArns"] = startedTaskArns,
                        ["failureCount"] = failures.Count
                    },
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            if (runResponse.Tasks is not { Count: > 0 } ecsTasks)
            {
                // Nothing started and nothing reported as failed: there is nothing to wait for.
                return Failure(task.Id, "Failed to run task: RunTask returned neither tasks nor failures");
            }

            var taskArns = ecsTasks.Select(t => t.TaskArn).ToList();

            _logger.LogInformation(
                "Started {Count} ECS task(s): {TaskArns}",
                ecsTasks.Count,
                string.Join(", ", taskArns.Select(a => a.Split('/').Last())));

            if (!payload.WaitForCompletion)
            {
                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = true,
                    Outputs = new Dictionary<string, object>
                    {
                        ["taskArns"] = taskArns,
                        ["taskCount"] = ecsTasks.Count,
                        ["status"] = "RUNNING"
                    },
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            // Wait for tasks to complete
            var (completed, exitCodes) = await WaitForTasksAsync(
                payload.Cluster,
                taskArns,
                payload.CompletionTimeout,
                ct);

            var allSucceeded = completed && exitCodes.All(e => e == 0);

            // A timeout is not the same as a non-zero exit code; say which one happened.
            string? error = null;
            if (!completed)
            {
                error = $"Timed out after {payload.CompletionTimeout} waiting for task(s) to stop";
            }
            else if (!allSucceeded)
            {
                error = $"Task(s) failed with exit codes: [{string.Join(", ", exitCodes)}]";
            }

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = allSucceeded,
                Error = error,
                Outputs = new Dictionary<string, object>
                {
                    ["taskArns"] = taskArns,
                    ["taskCount"] = ecsTasks.Count,
                    ["exitCodes"] = exitCodes,
                    ["status"] = allSucceeded ? "SUCCEEDED" : "FAILED",
                    ["timedOut"] = !completed
                },
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            _logger.LogError(ex, "Failed to run ECS task from {TaskDef}", payload.TaskDefinition);
            return Failure(task.Id, $"Failed to run task: {ex.Message}");
        }
    }

    /// <summary>
    /// Translates the payload into the SDK request; optional sections are set only when supplied.
    /// </summary>
    private static RunTaskRequest BuildRequest(RunTaskPayload payload)
    {
        var request = new RunTaskRequest
        {
            Cluster = payload.Cluster,
            TaskDefinition = payload.TaskDefinition,
            Count = payload.Count,
            Group = payload.Group
        };

        if (!string.IsNullOrEmpty(payload.LaunchType))
        {
            request.LaunchType = new LaunchType(payload.LaunchType);
        }

        if (payload.NetworkConfiguration is { } network)
        {
            request.NetworkConfiguration = new NetworkConfiguration
            {
                AwsvpcConfiguration = new AwsVpcConfiguration
                {
                    Subnets = network.Subnets.ToList(),
                    SecurityGroups = network.SecurityGroups?.ToList(),
                    AssignPublicIp = network.AssignPublicIp
                        ? AssignPublicIp.ENABLED
                        : AssignPublicIp.DISABLED
                }
            };
        }

        if (payload.Overrides is { Count: > 0 } overrides)
        {
            request.Overrides = new TaskOverride
            {
                ContainerOverrides = overrides.Select(o => new ContainerOverride
                {
                    Name = o.Name,
                    Command = o.Command?.ToList(),
                    Environment = o.Environment?.Select(kv => new Amazon.ECS.Model.KeyValuePair
                    {
                        Name = kv.Key,
                        Value = kv.Value
                    }).ToList(),
                    Cpu = o.Cpu,
                    Memory = o.Memory
                }).ToList()
            };
        }

        if (payload.Tags is { } tags)
        {
            request.Tags = tags.Select(kv => new Tag { Key = kv.Key, Value = kv.Value }).ToList();
        }

        return request;
    }

    /// <summary>
    /// Polls the given tasks until all of them report <c>STOPPED</c> or the timeout elapses.
    /// </summary>
    /// <param name="cluster">Cluster the tasks run in.</param>
    /// <param name="taskArns">ARNs of the tasks to watch.</param>
    /// <param name="timeout">Maximum time to wait.</param>
    /// <param name="ct">Caller cancellation; propagates as <see cref="OperationCanceledException"/>.</param>
    /// <returns>
    /// <c>Completed</c> is <c>true</c> with one exit code per container (-1 when a container
    /// reports none) once every task stopped; <c>false</c> with an empty list on timeout.
    /// </returns>
    private async Task<(bool Completed, List<int> ExitCodes)> WaitForTasksAsync(
        string cluster,
        List<string> taskArns,
        TimeSpan timeout,
        CancellationToken ct)
    {
        // Timeout and delay run on the injected clock so a fake TimeProvider controls them.
        using var timeoutCts = new CancellationTokenSource(timeout, _timeProvider);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            while (true)
            {
                // Throwing here (instead of leaving the loop) keeps caller cancellation from
                // being reported as a timeout.
                linkedCts.Token.ThrowIfCancellationRequested();

                var response = await _ecsClient.DescribeTasksAsync(new DescribeTasksRequest
                {
                    Cluster = cluster,
                    Tasks = taskArns
                }, linkedCts.Token);

                // Require every requested task to be present: All() over an empty or partial
                // list would otherwise count as "all stopped".
                if (response.Tasks is { Count: > 0 } described
                    && described.Count >= taskArns.Count
                    && described.All(t => t.LastStatus == "STOPPED"))
                {
                    var exitCodes = described
                        .SelectMany(t => t.Containers.Select(c => c.ExitCode ?? -1))
                        .ToList();

                    _logger.LogInformation(
                        "ECS tasks completed with exit codes: [{ExitCodes}]",
                        string.Join(", ", exitCodes));

                    return (true, exitCodes);
                }

                await Task.Delay(PollInterval, _timeProvider, linkedCts.Token);
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            _logger.LogWarning("Task completion wait timed out after {Timeout}", timeout);
            return (false, new List<int>());
        }
    }

    /// <summary>
    /// Builds a failed result carrying only an error message.
    /// </summary>
    private AgentTaskResult Failure(Guid taskId, string error) => new()
    {
        TaskId = taskId,
        Success = false,
        Error = error,
        CompletedAt = _timeProvider.GetUtcNow()
    };
}

View File

@@ -0,0 +1,231 @@
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
using Task = System.Threading.Tasks.Task;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Task handler for scaling ECS services.
/// </summary>
/// <remarks>
/// Updates the service's desired count and, when requested, polls until the running count
/// equals the target and the service lists exactly one deployment.
/// </remarks>
public sealed class EcsScaleServiceTask : IEcsTask
{
    /// <summary>
    /// Pause between two consecutive polls while waiting for the service to stabilize.
    /// </summary>
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(5);

    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsScaleServiceTask> _logger;

    /// <summary>
    /// Payload for scaling an ECS service.
    /// </summary>
    public sealed record ScaleServicePayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Name of the service to scale.
        /// </summary>
        public required string ServiceName { get; init; }

        /// <summary>
        /// Desired number of tasks.
        /// </summary>
        public required int DesiredCount { get; init; }

        /// <summary>
        /// Whether to wait for the scaling operation to complete.
        /// </summary>
        public bool WaitForStable { get; init; } = true;

        /// <summary>
        /// Timeout waiting for stabilization.
        /// </summary>
        public TimeSpan StabilizeTimeout { get; init; } = TimeSpan.FromMinutes(5);
    }

    /// <summary>
    /// Creates a new ECS scale service task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used to describe and update the service.</param>
    /// <param name="timeProvider">Clock used for result timestamps, the wait timeout and the poll delay.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsScaleServiceTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsScaleServiceTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<ScaleServicePayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.scale", "Failed to deserialize payload");

        _logger.LogInformation(
            "Scaling ECS service {Service} in cluster {Cluster} to {DesiredCount} tasks",
            payload.ServiceName,
            payload.Cluster,
            payload.DesiredCount);

        try
        {
            // Get current service state so the previous desired count can be reported.
            var describeResponse = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
            {
                Cluster = payload.Cluster,
                Services = new List<string> { payload.ServiceName }
            }, ct);

            // The SDK may hand back a null collection instead of an empty one.
            var currentService = describeResponse.Services?.FirstOrDefault();
            if (currentService is null)
            {
                return Failure(
                    task.Id,
                    $"Service '{payload.ServiceName}' not found in cluster '{payload.Cluster}'");
            }

            var previousCount = currentService.DesiredCount;

            // Update desired count
            var updateResponse = await _ecsClient.UpdateServiceAsync(new UpdateServiceRequest
            {
                Cluster = payload.Cluster,
                Service = payload.ServiceName,
                DesiredCount = payload.DesiredCount
            }, ct);

            if (updateResponse.Service is not { } service)
            {
                return Failure(task.Id, "Service update returned no service object");
            }

            _logger.LogInformation(
                "Updated ECS service {Service} desired count from {Previous} to {New}",
                payload.ServiceName,
                previousCount,
                payload.DesiredCount);

            if (!payload.WaitForStable)
            {
                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = true,
                    Outputs = new Dictionary<string, object>
                    {
                        ["serviceArn"] = service.ServiceArn ?? "",
                        ["serviceName"] = service.ServiceName ?? "",
                        ["previousDesiredCount"] = previousCount,
                        ["newDesiredCount"] = payload.DesiredCount,
                        ["runningCount"] = service.RunningCount,
                        ["status"] = "SCALING"
                    },
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            // Wait for stable
            var stable = await WaitForServiceStableAsync(
                payload.Cluster,
                payload.ServiceName,
                payload.DesiredCount,
                payload.StabilizeTimeout,
                ct);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = stable,
                Error = stable ? null : "Service did not stabilize within timeout",
                Outputs = new Dictionary<string, object>
                {
                    ["serviceArn"] = service.ServiceArn ?? "",
                    ["serviceName"] = service.ServiceName ?? "",
                    ["previousDesiredCount"] = previousCount,
                    ["newDesiredCount"] = payload.DesiredCount,
                    ["status"] = stable ? "STABLE" : "UNSTABLE"
                },
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            // ECS API errors become a failed result; cancellation propagates to the caller.
            _logger.LogError(ex, "Failed to scale ECS service {Service}", payload.ServiceName);
            return Failure(task.Id, $"Failed to scale service: {ex.Message}");
        }
    }

    /// <summary>
    /// Polls the service until its running count equals <paramref name="targetCount"/> with a
    /// single deployment, the service disappears, or the timeout elapses.
    /// </summary>
    /// <param name="cluster">Cluster the service belongs to.</param>
    /// <param name="serviceName">Service to watch.</param>
    /// <param name="targetCount">Running count that marks the service as stable.</param>
    /// <param name="timeout">Maximum time to wait.</param>
    /// <param name="ct">Caller cancellation; propagates as <see cref="OperationCanceledException"/>.</param>
    /// <returns><c>true</c> once stable; <c>false</c> on timeout or when the service is not found.</returns>
    private async Task<bool> WaitForServiceStableAsync(
        string cluster,
        string serviceName,
        int targetCount,
        TimeSpan timeout,
        CancellationToken ct)
    {
        // Timeout and delay run on the injected clock so a fake TimeProvider controls them.
        using var timeoutCts = new CancellationTokenSource(timeout, _timeProvider);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            while (true)
            {
                // Throwing here (instead of leaving the loop) keeps caller cancellation from
                // being reported as a stabilization timeout.
                linkedCts.Token.ThrowIfCancellationRequested();

                var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
                {
                    Cluster = cluster,
                    Services = new List<string> { serviceName }
                }, linkedCts.Token);

                var service = response.Services?.FirstOrDefault();
                if (service is null)
                {
                    _logger.LogWarning(
                        "Service {Service} was not found while waiting for it to stabilize",
                        serviceName);
                    return false;
                }

                var deploymentCount = service.Deployments?.Count ?? 0;
                if (service.RunningCount == targetCount && deploymentCount == 1)
                {
                    _logger.LogInformation(
                        "Service {Service} scaled to {Count} running tasks",
                        serviceName,
                        targetCount);
                    return true;
                }

                _logger.LogDebug(
                    "Service {Service} scaling: running={Running}, desired={Desired}",
                    serviceName,
                    service.RunningCount,
                    targetCount);

                await Task.Delay(PollInterval, _timeProvider, linkedCts.Token);
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            _logger.LogWarning("Service scaling stabilization timed out after {Timeout}", timeout);
            return false;
        }
    }

    /// <summary>
    /// Builds a failed result carrying only an error message.
    /// </summary>
    private AgentTaskResult Failure(Guid taskId, string error) => new()
    {
        TaskId = taskId,
        Success = false,
        Error = error,
        CompletedAt = _timeProvider.GetUtcNow()
    };
}

View File

@@ -0,0 +1,107 @@
using System.Text.Json;
using Amazon.ECS;
using Amazon.ECS.Model;
using Microsoft.Extensions.Logging;
using StellaOps.Agent.Core.Models;
namespace StellaOps.Agent.Ecs.Tasks;
/// <summary>
/// Task handler for stopping ECS tasks.
/// </summary>
public sealed class EcsStopTaskTask : IEcsTask
{
    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsStopTaskTask> _logger;

    /// <summary>
    /// Payload for stopping an ECS task.
    /// </summary>
    public sealed record StopTaskPayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Task ARN or ID to stop.
        /// </summary>
        public required string TaskArn { get; init; }

        /// <summary>
        /// Reason for stopping the task. "Stopped by Stella Agent" is sent when omitted.
        /// </summary>
        public string? Reason { get; init; }
    }

    /// <summary>
    /// Creates a new ECS stop task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used to stop the task.</param>
    /// <param name="timeProvider">Clock used for result timestamps.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsStopTaskTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsStopTaskTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<StopTaskPayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.stop", "Failed to deserialize payload");

        _logger.LogInformation(
            "Stopping ECS task {TaskArn} in cluster {Cluster}",
            payload.TaskArn,
            payload.Cluster);

        try
        {
            var request = new StopTaskRequest
            {
                Cluster = payload.Cluster,
                Task = payload.TaskArn,
                Reason = payload.Reason ?? "Stopped by Stella Agent"
            };

            var response = await _ecsClient.StopTaskAsync(request, ct);

            if (response.Task is not { } stoppedTask)
            {
                // Without a task object there is nothing to report; fail instead of dereferencing null.
                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = false,
                    Error = "Failed to stop task: StopTask returned no task object",
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            _logger.LogInformation(
                "Stopped ECS task {TaskArn}, last status: {Status}",
                stoppedTask.TaskArn,
                stoppedTask.LastStatus);

            // StoppedAt is nullable. Formatting its default value would emit a year-0001
            // timestamp whose exact value depends on the machine's time zone, so an unset
            // value is reported as an empty string instead.
            var stoppedAt = stoppedTask.StoppedAt is { } at
                ? at.ToUniversalTime().ToString("o", System.Globalization.CultureInfo.InvariantCulture)
                : "";

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = true,
                Outputs = new Dictionary<string, object>
                {
                    ["taskArn"] = stoppedTask.TaskArn ?? "",
                    ["lastStatus"] = stoppedTask.LastStatus ?? "",
                    ["stoppedReason"] = stoppedTask.StoppedReason ?? payload.Reason ?? "Stopped by agent",
                    ["stoppedAt"] = stoppedAt
                },
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            // ECS API errors become a failed result rather than an exception.
            _logger.LogError(ex, "Failed to stop ECS task {TaskArn}", payload.TaskArn);
            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Failed to stop task: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }
}