release orchestrator v1 draft and build fixes
This commit is contained in:
@@ -0,0 +1,216 @@
|
||||
using Amazon.CloudWatchLogs;
|
||||
using Amazon.CloudWatchLogs.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Task = System.Threading.Tasks.Task;
|
||||
|
||||
namespace StellaOps.Agent.Ecs;
|
||||
|
||||
/// <summary>
/// Streams logs from CloudWatch Logs for ECS tasks.
/// </summary>
/// <remarks>
/// Polls <c>GetLogEvents</c> for a single log stream and raises <see cref="LogReceived"/>
/// once per returned event until the caller cancels or an error occurs.
/// </remarks>
public sealed class CloudWatchLogStreamer
{
    /// <summary>Pause between polls when the forward token did not advance (no new events).</summary>
    private static readonly TimeSpan IdlePollDelay = TimeSpan.FromSeconds(2);

    private readonly IAmazonCloudWatchLogs _logsClient;
    private readonly ILogger<CloudWatchLogStreamer> _logger;

    /// <summary>
    /// Event raised when a log message is received.
    /// </summary>
    public event EventHandler<LogMessageEventArgs>? LogReceived;

    /// <summary>
    /// Creates a new CloudWatch log streamer.
    /// </summary>
    /// <param name="logsClient">The CloudWatch Logs client used to read events.</param>
    /// <param name="logger">Logger for diagnostics.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public CloudWatchLogStreamer(
        IAmazonCloudWatchLogs logsClient,
        ILogger<CloudWatchLogStreamer> logger)
    {
        _logsClient = logsClient ?? throw new ArgumentNullException(nameof(logsClient));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Streams logs from a CloudWatch log group/stream.
    /// </summary>
    /// <remarks>
    /// The method returns (without throwing) when cancellation is requested, when the stream
    /// does not exist, or when any other error occurs; each case is logged.
    /// </remarks>
    /// <param name="logGroupName">The log group name.</param>
    /// <param name="logStreamName">The log stream name.</param>
    /// <param name="startTime">
    /// The start time for log retrieval; when <c>null</c> the stream is read from its beginning.
    /// </param>
    /// <param name="ct">Cancellation token.</param>
    public async Task StreamLogsAsync(
        string logGroupName,
        string logStreamName,
        DateTimeOffset? startTime = null,
        CancellationToken ct = default)
    {
        string? nextToken = null;
        var startTimeUtc = startTime?.UtcDateTime;

        _logger.LogDebug(
            "Starting log stream from {LogGroup}/{LogStream}",
            logGroupName,
            logStreamName);

        try
        {
            while (!ct.IsCancellationRequested)
            {
                var request = new GetLogEventsRequest
                {
                    LogGroupName = logGroupName,
                    LogStreamName = logStreamName,
                    // Always read oldest-first: GetLogEvents requires StartFromHead = true when
                    // paging with a previous NextForwardToken, and reading newest-first from a
                    // start time would skip every event older than the latest page.
                    StartFromHead = true,
                    NextToken = nextToken
                };

                // The start time only positions the first request; later requests follow the token.
                if (startTimeUtc.HasValue && nextToken is null)
                {
                    request.StartTime = startTimeUtc.Value;
                }

                var response = await _logsClient.GetLogEventsAsync(request, ct);

                // NOTE(review): the response collection may be null when no events are returned
                // (AWS SDK v4 does not pre-initialize collections) - treat that as an empty page.
                if (response.Events is { } events)
                {
                    foreach (var logEvent in events)
                    {
                        var message = logEvent.Message ?? string.Empty;
                        OnLogReceived(new LogMessageEventArgs(
                            logGroupName,
                            logStreamName,
                            logEvent.Timestamp ?? DateTime.UtcNow,
                            DetectLogLevel(message),
                            message));
                    }
                }

                // If token hasn't changed, no new logs - wait before polling
                if (response.NextForwardToken == nextToken)
                {
                    await Task.Delay(IdlePollDelay, ct);
                }

                nextToken = response.NextForwardToken;
            }
        }
        catch (OperationCanceledException)
        {
            _logger.LogDebug("Log streaming cancelled");
        }
        catch (ResourceNotFoundException)
        {
            _logger.LogWarning(
                "Log stream {LogGroup}/{LogStream} not found",
                logGroupName,
                logStreamName);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(
                ex,
                "Error streaming logs from {LogGroup}/{LogStream}",
                logGroupName,
                logStreamName);
        }
    }

    /// <summary>
    /// Gets the log stream name for an ECS task.
    /// </summary>
    /// <param name="logStreamPrefix">The log stream prefix configured in the task definition.</param>
    /// <param name="containerName">The container name.</param>
    /// <param name="taskId">The task ID (last part of task ARN).</param>
    /// <returns>The full log stream name, <c>prefix/container/taskId</c>.</returns>
    public static string GetTaskLogStreamName(
        string logStreamPrefix,
        string containerName,
        string taskId)
        => $"{logStreamPrefix}/{containerName}/{taskId}";

    /// <summary>
    /// Extracts the task ID from a task ARN.
    /// </summary>
    /// <param name="taskArn">The task ARN.</param>
    /// <returns>
    /// Everything after the last <c>/</c>; the input itself when it contains no <c>/</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="taskArn"/> is <c>null</c>.</exception>
    public static string ExtractTaskId(string taskArn)
    {
        ArgumentNullException.ThrowIfNull(taskArn);

        // LastIndexOf returns -1 when there is no '/', which makes the slice start at 0.
        return taskArn[(taskArn.LastIndexOf('/') + 1)..];
    }

    /// <summary>Raises <see cref="LogReceived"/> if there are subscribers.</summary>
    private void OnLogReceived(LogMessageEventArgs e) => LogReceived?.Invoke(this, e);

    /// <summary>
    /// Guesses a log level from keywords in the message text (case-insensitive).
    /// Error keywords win over warning, which wins over debug/trace; anything else,
    /// including a null or empty message, is <see cref="LogLevel.Information"/>.
    /// </summary>
    private static LogLevel DetectLogLevel(string? message)
    {
        if (string.IsNullOrEmpty(message))
        {
            return LogLevel.Information;
        }

        if (ContainsKeyword(message, "ERROR") || ContainsKeyword(message, "FATAL") ||
            ContainsKeyword(message, "EXCEPTION") || ContainsKeyword(message, "FAIL"))
        {
            return LogLevel.Error;
        }

        if (ContainsKeyword(message, "WARN"))
        {
            return LogLevel.Warning;
        }

        if (ContainsKeyword(message, "DEBUG") || ContainsKeyword(message, "TRACE"))
        {
            return LogLevel.Debug;
        }

        return LogLevel.Information;
    }

    /// <summary>Case-insensitive ordinal substring test that avoids allocating an upper-cased copy.</summary>
    private static bool ContainsKeyword(string message, string keyword)
        => message.Contains(keyword, StringComparison.OrdinalIgnoreCase);
}
|
||||
|
||||
/// <summary>
/// Event payload describing a single log message read from a log stream.
/// </summary>
public sealed class LogMessageEventArgs : EventArgs
{
    /// <summary>
    /// Initializes the event args with all of its values.
    /// </summary>
    /// <param name="logGroup">Name of the log group the message came from.</param>
    /// <param name="logStream">Name of the log stream the message came from.</param>
    /// <param name="timestamp">Timestamp of the log event.</param>
    /// <param name="level">Log level detected for the message.</param>
    /// <param name="message">The message text.</param>
    public LogMessageEventArgs(
        string logGroup,
        string logStream,
        DateTime timestamp,
        LogLevel level,
        string message)
        => (LogGroup, LogStream, Timestamp, Level, Message) =
            (logGroup, logStream, timestamp, level, message);

    /// <summary>Name of the log group.</summary>
    public string LogGroup { get; }

    /// <summary>Name of the log stream.</summary>
    public string LogStream { get; }

    /// <summary>Timestamp of the log event.</summary>
    public DateTime Timestamp { get; }

    /// <summary>Log level detected for the message.</summary>
    public LogLevel Level { get; }

    /// <summary>The log message text.</summary>
    public string Message { get; }
}
|
||||
@@ -0,0 +1,222 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.CloudWatchLogs;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Capability;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
using StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
namespace StellaOps.Agent.Ecs;
|
||||
|
||||
/// <summary>
/// Agent capability for managing AWS ECS services and tasks.
/// </summary>
/// <remarks>
/// Each supported task type is routed to a dedicated handler class from
/// <c>StellaOps.Agent.Ecs.Tasks</c>; a fresh handler instance is created per execution.
/// </remarks>
public sealed class EcsCapability : IAgentCapability, IAsyncDisposable
{
    private readonly IAmazonECS _ecsClient;
    private readonly IAmazonCloudWatchLogs _logsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILoggerFactory _loggerFactory;
    private readonly ILogger<EcsCapability> _logger;
    private readonly Dictionary<string, Func<AgentTaskInfo, CancellationToken, Task<AgentTaskResult>>> _taskHandlers;

    /// <summary>
    /// Gets the capability name.
    /// </summary>
    public string Name => "ecs";

    /// <summary>
    /// Gets the capability version.
    /// </summary>
    public string Version => "1.0.0";

    /// <summary>
    /// Gets the supported task types.
    /// </summary>
    public IReadOnlyList<string> SupportedTaskTypes { get; } = new[]
    {
        "ecs.deploy",
        "ecs.run",
        "ecs.stop",
        "ecs.scale",
        "ecs.register",
        "ecs.health",
        "ecs.describe"
    };

    /// <summary>
    /// Creates a new ECS capability.
    /// </summary>
    /// <param name="ecsClient">The ECS client.</param>
    /// <param name="logsClient">The CloudWatch Logs client.</param>
    /// <param name="timeProvider">Time provider for timestamps.</param>
    /// <param name="loggerFactory">Logger factory.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public EcsCapability(
        IAmazonECS ecsClient,
        IAmazonCloudWatchLogs logsClient,
        TimeProvider timeProvider,
        ILoggerFactory loggerFactory)
    {
        _ecsClient = ecsClient ?? throw new ArgumentNullException(nameof(ecsClient));
        _logsClient = logsClient ?? throw new ArgumentNullException(nameof(logsClient));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _loggerFactory = loggerFactory ?? throw new ArgumentNullException(nameof(loggerFactory));
        _logger = loggerFactory.CreateLogger<EcsCapability>();

        // Keys must stay in sync with SupportedTaskTypes.
        _taskHandlers = new Dictionary<string, Func<AgentTaskInfo, CancellationToken, Task<AgentTaskResult>>>
        {
            ["ecs.deploy"] = ExecuteDeployAsync,
            ["ecs.run"] = ExecuteRunTaskAsync,
            ["ecs.stop"] = ExecuteStopTaskAsync,
            ["ecs.scale"] = ExecuteScaleAsync,
            ["ecs.register"] = ExecuteRegisterAsync,
            ["ecs.health"] = ExecuteHealthCheckAsync,
            ["ecs.describe"] = ExecuteDescribeAsync
        };
    }

    /// <inheritdoc />
    /// <remarks>
    /// Probes the ECS API with a one-item <c>ListClusters</c> call; any exception is logged
    /// and reported as <c>false</c>.
    /// </remarks>
    public async Task<bool> InitializeAsync(CancellationToken ct = default)
    {
        try
        {
            // Verify AWS credentials and ECS access by listing clusters; the result is not needed.
            await _ecsClient.ListClustersAsync(new ListClustersRequest
            {
                MaxResults = 1
            }, ct);

            _logger.LogInformation(
                "ECS capability initialized, AWS API accessible");

            return true;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to initialize ECS capability - AWS API not accessible");
            return false;
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// <see cref="InvalidEcsPayloadException"/> propagates to the caller; every other exception
    /// is logged and converted into a failed <see cref="AgentTaskResult"/>.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="task"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidEcsPayloadException">
    /// The task type is not supported, or a handler rejected the payload.
    /// </exception>
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(task);

        if (!_taskHandlers.TryGetValue(task.TaskType, out var handler))
        {
            throw new InvalidEcsPayloadException(task.TaskType, "Unsupported task type");
        }

        var startTime = _timeProvider.GetUtcNow();

        try
        {
            var result = await handler(task, ct);
            return result with
            {
                Duration = _timeProvider.GetUtcNow() - startTime
            };
        }
        catch (Exception ex) when (ex is not InvalidEcsPayloadException)
        {
            _logger.LogError(ex, "ECS task {TaskType} failed", task.TaskType);

            // Read the clock once so CompletedAt and Duration describe the same instant.
            var completedAt = _timeProvider.GetUtcNow();
            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = ex.Message,
                CompletedAt = completedAt,
                Duration = completedAt - startTime
            };
        }
    }

    /// <inheritdoc />
    public async Task<CapabilityHealthStatus> CheckHealthAsync(CancellationToken ct = default)
    {
        try
        {
            await _ecsClient.ListClustersAsync(new ListClustersRequest { MaxResults = 1 }, ct);
            return new CapabilityHealthStatus(true, "ECS capability ready");
        }
        catch (Exception ex)
        {
            return new CapabilityHealthStatus(false, $"ECS API not accessible: {ex.Message}");
        }
    }

    /// <summary>Handles <c>ecs.deploy</c>.</summary>
    private async Task<AgentTaskResult> ExecuteDeployAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsDeployServiceTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsDeployServiceTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <summary>Handles <c>ecs.run</c>.</summary>
    private async Task<AgentTaskResult> ExecuteRunTaskAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsRunTaskTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsRunTaskTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <summary>Handles <c>ecs.stop</c>.</summary>
    private async Task<AgentTaskResult> ExecuteStopTaskAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsStopTaskTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsStopTaskTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <summary>Handles <c>ecs.scale</c>.</summary>
    private async Task<AgentTaskResult> ExecuteScaleAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsScaleServiceTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsScaleServiceTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <summary>Handles <c>ecs.register</c>.</summary>
    private async Task<AgentTaskResult> ExecuteRegisterAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsRegisterTaskDefinitionTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsRegisterTaskDefinitionTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <summary>Handles <c>ecs.health</c>.</summary>
    private async Task<AgentTaskResult> ExecuteHealthCheckAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsHealthCheckTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsHealthCheckTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <summary>Handles <c>ecs.describe</c>.</summary>
    private async Task<AgentTaskResult> ExecuteDescribeAsync(AgentTaskInfo task, CancellationToken ct)
    {
        var taskHandler = new EcsDescribeServiceTask(
            _ecsClient,
            _timeProvider,
            _loggerFactory.CreateLogger<EcsDescribeServiceTask>());
        return await taskHandler.ExecuteAsync(task, ct);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Disposes both AWS clients.
    /// NOTE(review): the clients are constructor-injected - confirm this capability is meant to own them.
    /// </remarks>
    public ValueTask DisposeAsync()
    {
        _ecsClient.Dispose();
        _logsClient.Dispose();
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,86 @@
|
||||
namespace StellaOps.Agent.Ecs;
|
||||
|
||||
/// <summary>
/// Base exception for ECS agent operations.
/// </summary>
public class EcsAgentException : Exception
{
    /// <summary>
    /// Creates the exception with the given message.
    /// </summary>
    /// <param name="message">Description of the failure.</param>
    public EcsAgentException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Creates the exception with the given message and underlying cause.
    /// </summary>
    /// <param name="message">Description of the failure.</param>
    /// <param name="innerException">The exception that caused this one.</param>
    public EcsAgentException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Thrown when an ECS task payload is invalid or missing required fields.
/// </summary>
public class InvalidEcsPayloadException : EcsAgentException
{
    /// <summary>
    /// Creates the exception for a task type, optionally with extra details.
    /// </summary>
    /// <param name="taskType">The task type whose payload was rejected.</param>
    /// <param name="details">Optional explanation appended to the message after a colon.</param>
    public InvalidEcsPayloadException(string taskType, string? details = null)
        : base(BuildMessage(taskType, details))
    {
        TaskType = taskType;
    }

    /// <summary>The task type whose payload was rejected.</summary>
    public string TaskType { get; }

    /// <summary>Composes the exception message; the details suffix is omitted when null.</summary>
    private static string BuildMessage(string taskType, string? details)
    {
        var suffix = details is null ? "" : $": {details}";
        return $"Invalid payload for ECS task type '{taskType}'{suffix}";
    }
}
|
||||
|
||||
/// <summary>
/// Thrown when an ECS service or task operation fails.
/// </summary>
public class EcsOperationException : EcsAgentException
{
    /// <summary>
    /// Creates the exception for a failed operation.
    /// </summary>
    /// <param name="operation">Name of the operation that failed.</param>
    /// <param name="cluster">Cluster involved, or <c>null</c> to leave it out of the message.</param>
    /// <param name="resource">Resource involved, or <c>null</c> to leave it out of the message.</param>
    /// <param name="message">Failure description appended after a colon.</param>
    public EcsOperationException(string operation, string? cluster, string? resource, string message)
        : base(FormatMessage(operation, cluster, resource, message))
    {
        Operation = operation;
        Cluster = cluster;
        Resource = resource;
    }

    /// <summary>
    /// Creates the exception for a failed operation, keeping the underlying cause.
    /// </summary>
    /// <param name="operation">Name of the operation that failed.</param>
    /// <param name="cluster">Cluster involved, or <c>null</c> to leave it out of the message.</param>
    /// <param name="resource">Resource involved, or <c>null</c> to leave it out of the message.</param>
    /// <param name="message">Failure description appended after a colon.</param>
    /// <param name="innerException">The exception that caused this one.</param>
    public EcsOperationException(string operation, string? cluster, string? resource, string message, Exception innerException)
        : base(FormatMessage(operation, cluster, resource, message), innerException)
    {
        Operation = operation;
        Cluster = cluster;
        Resource = resource;
    }

    /// <summary>Name of the operation that failed.</summary>
    public string Operation { get; }

    /// <summary>Cluster involved, if any.</summary>
    public string? Cluster { get; }

    /// <summary>Resource involved, if any.</summary>
    public string? Resource { get; }

    /// <summary>Composes the message shared by both constructors; null parts are omitted.</summary>
    private static string FormatMessage(string operation, string? cluster, string? resource, string message)
    {
        var clusterPart = cluster is null ? "" : $" in cluster '{cluster}'";
        var resourcePart = resource is null ? "" : $" for '{resource}'";
        return $"ECS {operation} failed{clusterPart}{resourcePart}: {message}";
    }
}
|
||||
|
||||
/// <summary>
/// Thrown when an ECS deployment times out waiting for stabilization.
/// </summary>
public class EcsDeploymentTimeoutException : EcsAgentException
{
    /// <summary>
    /// Creates the exception for a service that did not stabilize in time.
    /// </summary>
    /// <param name="cluster">Cluster the service belongs to.</param>
    /// <param name="serviceName">Name of the service.</param>
    /// <param name="timeout">How long the deployment was waited on.</param>
    public EcsDeploymentTimeoutException(string cluster, string serviceName, TimeSpan timeout)
        : base(DescribeTimeout(cluster, serviceName, timeout))
    {
        Cluster = cluster;
        ServiceName = serviceName;
        Timeout = timeout;
    }

    /// <summary>Cluster the service belongs to.</summary>
    public string Cluster { get; }

    /// <summary>Name of the service that did not stabilize.</summary>
    public string ServiceName { get; }

    /// <summary>How long the deployment was waited on.</summary>
    public TimeSpan Timeout { get; }

    /// <summary>Composes the exception message.</summary>
    private static string DescribeTimeout(string cluster, string serviceName, TimeSpan timeout)
        => $"ECS deployment timed out waiting for service '{serviceName}' in cluster '{cluster}' to stabilize after {timeout}";
}
|
||||
|
||||
/// <summary>
/// Thrown when an ECS task fails to complete successfully.
/// </summary>
public class EcsTaskFailedException : EcsAgentException
{
    /// <summary>
    /// Creates the exception for one or more failed tasks.
    /// </summary>
    /// <param name="cluster">Cluster the tasks ran in.</param>
    /// <param name="taskArns">ARNs of the failed tasks.</param>
    /// <param name="exitCodes">Exit codes, listed comma-separated in the message.</param>
    public EcsTaskFailedException(string cluster, IReadOnlyList<string> taskArns, IReadOnlyList<int> exitCodes)
        : base(DescribeFailure(cluster, exitCodes))
    {
        Cluster = cluster;
        TaskArns = taskArns;
        ExitCodes = exitCodes;
    }

    /// <summary>Cluster the tasks ran in.</summary>
    public string Cluster { get; }

    /// <summary>ARNs of the failed tasks.</summary>
    public IReadOnlyList<string> TaskArns { get; }

    /// <summary>Exit codes reported for the failed tasks.</summary>
    public IReadOnlyList<int> ExitCodes { get; }

    /// <summary>Composes the exception message.</summary>
    private static string DescribeFailure(string cluster, IReadOnlyList<int> exitCodes)
    {
        var codes = string.Join(", ", exitCodes);
        return $"ECS task(s) failed in cluster '{cluster}' with exit codes: [{codes}]";
    }
}
|
||||
@@ -0,0 +1,17 @@
|
||||
using StellaOps.Agent.Core.Models;
|
||||
|
||||
namespace StellaOps.Agent.Ecs;
|
||||
|
||||
/// <summary>
/// Contract implemented by every ECS task handler.
/// </summary>
public interface IEcsTask
{
    /// <summary>
    /// Runs the handler for the given agent task.
    /// </summary>
    /// <param name="task">The agent task to execute.</param>
    /// <param name="ct">Token used to cancel the execution.</param>
    /// <returns>The outcome of the execution.</returns>
    Task<AgentTaskResult> ExecuteAsync(
        AgentTaskInfo task,
        CancellationToken ct = default);
}
|
||||
@@ -0,0 +1,25 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <RootNamespace>StellaOps.Agent.Ecs</RootNamespace>
    <Description>Stella Agent ECS Capability - manages AWS ECS services and tasks</Description>
    <!-- AWS SDK v4 nullable annotations cause false positives with value type boxing to Dictionary<string, object> -->
    <NoWarn>$(NoWarn);CS8600;CS8601;CS8620</NoWarn>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="AWSSDK.ECS" />
    <PackageReference Include="AWSSDK.CloudWatchLogs" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\StellaOps.Agent.Core\StellaOps.Agent.Core.csproj" />
  </ItemGroup>

</Project>
|
||||
@@ -0,0 +1,470 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
using Task = System.Threading.Tasks.Task;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
|
||||
/// Task handler for deploying ECS services.
|
||||
/// </summary>
|
||||
public sealed class EcsDeployServiceTask : IEcsTask
|
||||
{
|
||||
private readonly IAmazonECS _ecsClient;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<EcsDeployServiceTask> _logger;
|
||||
|
||||
/// <summary>
/// Input payload of the ECS deploy task: identifies the service and describes the desired deployment.
/// </summary>
public sealed record DeployServicePayload
{
    /// <summary>ECS cluster, given by name or ARN.</summary>
    public required string Cluster { get; init; }

    /// <summary>Name of the service to deploy.</summary>
    public required string ServiceName { get; init; }

    /// <summary>Task definition as <c>family:revision</c> or ARN.</summary>
    public required string TaskDefinition { get; init; }

    /// <summary>Desired number of tasks; defaults to 1.</summary>
    public int DesiredCount { get; init; } = 1;

    /// <summary>Launch type (FARGATE or EC2); optional.</summary>
    public string? LaunchType { get; init; }

    /// <summary>Network configuration for awsvpc mode; optional.</summary>
    public NetworkConfigurationPayload? NetworkConfiguration { get; init; }

    /// <summary>Load balancer configuration; optional.</summary>
    public LoadBalancerPayload? LoadBalancer { get; init; }

    /// <summary>Deployment configuration; optional.</summary>
    public DeploymentConfigPayload? DeploymentConfiguration { get; init; }

    /// <summary>Whether to force a new deployment; defaults to <c>true</c>.</summary>
    public bool ForceNewDeployment { get; init; } = true;

    /// <summary>How long to wait for the deployment to stabilize; defaults to 10 minutes.</summary>
    public TimeSpan DeploymentTimeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>Tags to apply to the service; optional.</summary>
    public IReadOnlyDictionary<string, string>? Tags { get; init; }
}
|
||||
|
||||
/// <summary>
/// Network settings of a deploy payload.
/// </summary>
public sealed record NetworkConfigurationPayload
{
    /// <summary>Subnet IDs (required).</summary>
    public required IReadOnlyList<string> Subnets { get; init; }

    /// <summary>Security group IDs; optional.</summary>
    public IReadOnlyList<string>? SecurityGroups { get; init; }

    /// <summary>Whether to assign a public IP; defaults to <c>false</c>.</summary>
    public bool AssignPublicIp { get; init; }
}
|
||||
|
||||
/// <summary>
/// Load balancer target settings of a deploy payload.
/// </summary>
public sealed record LoadBalancerPayload
{
    /// <summary>ARN of the target group (required).</summary>
    public required string TargetGroupArn { get; init; }

    /// <summary>Name of the container to register as target (required).</summary>
    public required string ContainerName { get; init; }

    /// <summary>Port on the container (required).</summary>
    public required int ContainerPort { get; init; }
}
|
||||
|
||||
/// <summary>
/// Rollout settings of a deploy payload.
/// </summary>
public sealed record DeploymentConfigPayload
{
    /// <summary>Maximum percent during deployment; defaults to 200.</summary>
    public int MaximumPercent { get; init; } = 200;

    /// <summary>Minimum healthy percent; defaults to 100.</summary>
    public int MinimumHealthyPercent { get; init; } = 100;

    /// <summary>Whether the deployment circuit breaker is enabled; defaults to <c>true</c>.</summary>
    public bool EnableCircuitBreaker { get; init; } = true;

    /// <summary>Whether rollback on failure is enabled; defaults to <c>true</c>.</summary>
    public bool EnableRollback { get; init; } = true;
}
|
||||
|
||||
/// <summary>
/// Creates a new ECS deploy service task handler.
/// </summary>
/// <param name="ecsClient">The ECS client used for all service calls.</param>
/// <param name="timeProvider">Time provider for result timestamps.</param>
/// <param name="logger">Logger for diagnostics.</param>
/// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
public EcsDeployServiceTask(
    IAmazonECS ecsClient,
    TimeProvider timeProvider,
    ILogger<EcsDeployServiceTask> logger)
{
    // Fail at construction instead of with a NullReferenceException on first use.
    _ecsClient = ecsClient ?? throw new ArgumentNullException(nameof(ecsClient));
    _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Deserializes the payload, then updates the service when it already exists and is not
/// <c>INACTIVE</c>, or creates it otherwise. ECS API errors are logged and returned as a
/// failed result rather than thrown.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="task"/> is <c>null</c>.</exception>
/// <exception cref="InvalidEcsPayloadException">
/// The payload is JSON <c>null</c>, malformed, or missing required fields.
/// </exception>
public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(task);

    DeployServicePayload payload;
    try
    {
        payload = JsonSerializer.Deserialize<DeployServicePayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.deploy", "Failed to deserialize payload");
    }
    catch (JsonException ex)
    {
        // Malformed JSON and missing required members surface as JsonException; report them
        // as the documented payload exception. The original exception is logged here because
        // InvalidEcsPayloadException cannot carry an inner exception.
        _logger.LogWarning(ex, "Invalid payload for ECS deploy task {TaskId}", task.Id);
        throw new InvalidEcsPayloadException("ecs.deploy", ex.Message);
    }

    _logger.LogInformation(
        "Deploying ECS service {Service} to cluster {Cluster} with task definition {TaskDef}",
        payload.ServiceName,
        payload.Cluster,
        payload.TaskDefinition);

    try
    {
        // Check if service exists
        var existingService = await GetServiceAsync(payload.Cluster, payload.ServiceName, ct);

        // A missing or INACTIVE service has to be (re)created; anything else is updated in place.
        return existingService is { Status: not "INACTIVE" }
            ? await UpdateServiceAsync(task.Id, payload, ct)
            : await CreateServiceAsync(task.Id, payload, ct);
    }
    catch (AmazonECSException ex)
    {
        _logger.LogError(ex, "Failed to deploy ECS service {Service}", payload.ServiceName);

        return new AgentTaskResult
        {
            TaskId = task.Id,
            Success = false,
            Error = $"ECS deployment failed: {ex.Message}",
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }
}
|
||||
|
||||
/// <summary>
/// Looks up a service in a cluster.
/// </summary>
/// <param name="cluster">Cluster name or ARN.</param>
/// <param name="serviceName">Service name.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// The first service returned by <c>DescribeServices</c>, or <c>null</c> when none is returned
/// or the lookup fails. Failures are logged; cancellation is propagated.
/// </returns>
private async Task<Service?> GetServiceAsync(string cluster, string serviceName, CancellationToken ct)
{
    try
    {
        var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
        {
            Cluster = cluster,
            Services = new List<string> { serviceName }
        }, ct);

        // NOTE(review): the collection may be null when nothing matches (AWS SDK v4 does not
        // pre-initialize response collections), hence the null-conditional.
        return response.Services?.FirstOrDefault();
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Best-effort lookup: the caller treats null as "does not exist". Log the reason so a
        // failed lookup (for example missing permissions) is not silently mistaken for absence.
        _logger.LogWarning(
            ex,
            "Could not describe ECS service {Service} in cluster {Cluster}; treating it as not found",
            serviceName,
            cluster);
        return null;
    }
}
|
||||
|
||||
/// <summary>
/// Creates the service described by the payload and waits for it to stabilize.
/// </summary>
/// <param name="taskId">Identifier of the agent task, copied into the result.</param>
/// <param name="payload">The deploy payload.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// A result whose success reflects whether the service stabilized; the outputs describe the
/// service object returned by the create call.
/// </returns>
private async Task<AgentTaskResult> CreateServiceAsync(
    Guid taskId,
    DeployServicePayload payload,
    CancellationToken ct)
{
    _logger.LogInformation("Creating new ECS service {Service}", payload.ServiceName);

    var request = new CreateServiceRequest
    {
        Cluster = payload.Cluster,
        ServiceName = payload.ServiceName,
        TaskDefinition = payload.TaskDefinition,
        DesiredCount = payload.DesiredCount
    };

    // Optional sections are only set on the request when the payload provides them.
    if (!string.IsNullOrEmpty(payload.LaunchType))
    {
        request.LaunchType = new LaunchType(payload.LaunchType);
    }

    if (payload.NetworkConfiguration is { } network)
    {
        var vpc = new AwsVpcConfiguration
        {
            Subnets = network.Subnets.ToList(),
            SecurityGroups = network.SecurityGroups?.ToList(),
            AssignPublicIp = network.AssignPublicIp
                ? AssignPublicIp.ENABLED
                : AssignPublicIp.DISABLED
        };
        request.NetworkConfiguration = new NetworkConfiguration { AwsvpcConfiguration = vpc };
    }

    if (payload.LoadBalancer is { } balancer)
    {
        var target = new LoadBalancer
        {
            TargetGroupArn = balancer.TargetGroupArn,
            ContainerName = balancer.ContainerName,
            ContainerPort = balancer.ContainerPort
        };
        request.LoadBalancers = new List<LoadBalancer> { target };
    }

    if (payload.DeploymentConfiguration is { } rollout)
    {
        var breaker = new DeploymentCircuitBreaker
        {
            Enable = rollout.EnableCircuitBreaker,
            Rollback = rollout.EnableRollback
        };
        request.DeploymentConfiguration = new DeploymentConfiguration
        {
            MaximumPercent = rollout.MaximumPercent,
            MinimumHealthyPercent = rollout.MinimumHealthyPercent,
            DeploymentCircuitBreaker = breaker
        };
    }

    if (payload.Tags is { } tags)
    {
        request.Tags = tags.Select(kv => new Tag { Key = kv.Key, Value = kv.Value }).ToList();
    }

    var createResponse = await _ecsClient.CreateServiceAsync(request, ct);
    var service = createResponse.Service;

    if (service is null)
    {
        return new AgentTaskResult
        {
            TaskId = taskId,
            Success = false,
            Error = "Service creation returned no service object",
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }

    _logger.LogInformation(
        "Created ECS service {Service} (ARN: {Arn})",
        payload.ServiceName,
        service.ServiceArn);

    // Wait for deployment to stabilize
    var stable = await WaitForServiceStableAsync(
        payload.Cluster,
        payload.ServiceName,
        payload.DeploymentTimeout,
        ct);

    // Counts come from the create response, i.e. from before the stabilization wait.
    Dictionary<string, object> outputs = new()
    {
        ["serviceArn"] = service.ServiceArn ?? "",
        ["serviceName"] = service.ServiceName ?? "",
        ["taskDefinition"] = service.TaskDefinition ?? "",
        ["runningCount"] = service.RunningCount,
        ["desiredCount"] = service.DesiredCount,
        ["deploymentStatus"] = stable ? "COMPLETED" : "TIMED_OUT",
        ["operation"] = "create"
    };

    return new AgentTaskResult
    {
        TaskId = taskId,
        Success = stable,
        Error = stable ? null : "Service did not stabilize within timeout",
        Outputs = outputs,
        CompletedAt = _timeProvider.GetUtcNow()
    };
}
|
||||
|
||||
/// <summary>
/// Updates an existing service to the payload's task definition and waits for it to stabilize.
/// </summary>
/// <param name="taskId">Identifier of the agent task, copied into the result.</param>
/// <param name="payload">The deploy payload.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>
/// A result whose success reflects whether the service stabilized; the outputs describe the
/// service object returned by the update call.
/// </returns>
private async Task<AgentTaskResult> UpdateServiceAsync(
    Guid taskId,
    DeployServicePayload payload,
    CancellationToken ct)
{
    _logger.LogInformation(
        "Updating existing ECS service {Service} to task definition {TaskDef}",
        payload.ServiceName,
        payload.TaskDefinition);

    var request = new UpdateServiceRequest
    {
        Cluster = payload.Cluster,
        Service = payload.ServiceName,
        TaskDefinition = payload.TaskDefinition,
        DesiredCount = payload.DesiredCount,
        ForceNewDeployment = payload.ForceNewDeployment
    };

    if (payload.DeploymentConfiguration is not null)
    {
        request.DeploymentConfiguration = new DeploymentConfiguration
        {
            MaximumPercent = payload.DeploymentConfiguration.MaximumPercent,
            MinimumHealthyPercent = payload.DeploymentConfiguration.MinimumHealthyPercent,
            DeploymentCircuitBreaker = new DeploymentCircuitBreaker
            {
                Enable = payload.DeploymentConfiguration.EnableCircuitBreaker,
                Rollback = payload.DeploymentConfiguration.EnableRollback
            }
        };
    }

    var updateResponse = await _ecsClient.UpdateServiceAsync(request, ct);

    if (updateResponse.Service is not { } service)
    {
        return new AgentTaskResult
        {
            TaskId = taskId,
            Success = false,
            Error = "Service update returned no service object",
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }

    // Resolve the deployment id once and null-safely: a missing Deployments collection must
    // not throw here, because the update itself has already succeeded at this point.
    var deploymentId = service.Deployments?.FirstOrDefault()?.Id;

    _logger.LogInformation(
        "Updated ECS service {Service}, deployment ID: {DeploymentId}",
        payload.ServiceName,
        deploymentId ?? "unknown");

    // Wait for deployment to stabilize
    var stable = await WaitForServiceStableAsync(
        payload.Cluster,
        payload.ServiceName,
        payload.DeploymentTimeout,
        ct);

    return new AgentTaskResult
    {
        TaskId = taskId,
        Success = stable,
        Error = stable ? null : "Service did not stabilize within timeout",
        // Counts come from the update response, i.e. from before the stabilization wait.
        Outputs = new Dictionary<string, object>
        {
            ["serviceArn"] = service.ServiceArn ?? "",
            ["serviceName"] = service.ServiceName ?? "",
            ["taskDefinition"] = service.TaskDefinition ?? "",
            ["runningCount"] = service.RunningCount,
            ["desiredCount"] = service.DesiredCount,
            ["deploymentId"] = deploymentId ?? "",
            ["deploymentStatus"] = stable ? "COMPLETED" : "TIMED_OUT",
            ["operation"] = "update"
        },
        CompletedAt = _timeProvider.GetUtcNow()
    };
}
|
||||
|
||||
/// <summary>
/// Polls the service until it has exactly one deployment whose running count equals
/// its desired count, or until <paramref name="timeout"/> elapses.
/// </summary>
/// <param name="cluster">Cluster passed to DescribeServices.</param>
/// <param name="serviceName">Service to poll.</param>
/// <param name="timeout">Maximum time to keep polling.</param>
/// <param name="ct">Caller's cancellation token.</param>
/// <returns>
/// <c>true</c> once the service is stable; <c>false</c> when the service is not found
/// or the timeout elapses first.
/// </returns>
/// <exception cref="OperationCanceledException">
/// The caller cancelled <paramref name="ct"/> before the service stabilized.
/// </exception>
private async Task<bool> WaitForServiceStableAsync(
    string cluster,
    string serviceName,
    TimeSpan timeout,
    CancellationToken ct)
{
    _logger.LogInformation("Waiting for service {Service} to stabilize", serviceName);

    var pollInterval = TimeSpan.FromSeconds(10);

    using var timeoutCts = new CancellationTokenSource(timeout);
    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

    try
    {
        while (!linkedCts.IsCancellationRequested)
        {
            var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
            {
                Cluster = cluster,
                Services = new List<string> { serviceName }
            }, linkedCts.Token);

            var service = response.Services.FirstOrDefault();
            if (service is null)
            {
                _logger.LogWarning("Service {Service} not found during stabilization check", serviceName);
                return false;
            }

            var primaryDeployment = service.Deployments.FirstOrDefault(d => d.Status == "PRIMARY");
            if (primaryDeployment is not null)
            {
                // Stable means the PRIMARY deployment is the only one left and is fully running.
                if (primaryDeployment.RunningCount == primaryDeployment.DesiredCount &&
                    service.Deployments.Count == 1)
                {
                    _logger.LogInformation(
                        "Service {Service} stabilized with {Count} running tasks",
                        serviceName,
                        primaryDeployment.RunningCount);
                    return true;
                }

                _logger.LogDebug(
                    "Service {Service} not stable: running={Running}, desired={Desired}, deployments={Deployments}",
                    serviceName,
                    primaryDeployment.RunningCount,
                    primaryDeployment.DesiredCount,
                    service.Deployments.Count);
            }

            await Task.Delay(pollInterval, linkedCts.Token);
        }
    }
    catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
    {
        // The timeout fired while a call was in flight; reported below together with
        // the case where the loop condition observed the timeout.
    }

    if (!timeoutCts.IsCancellationRequested)
    {
        // The loop ended without a timeout, so the caller's token was cancelled between
        // polls. Surface that as cancellation instead of a misleading "timed out" result.
        ct.ThrowIfCancellationRequested();
    }

    _logger.LogWarning("Service {Service} stabilization timed out after {Timeout}", serviceName, timeout);
    return false;
}
|
||||
}
|
||||
@@ -0,0 +1,173 @@
|
||||
using System.Globalization;
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
using Task = System.Threading.Tasks.Task;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
/// Task handler for describing ECS services, optionally including the service's tasks.
/// </summary>
public sealed class EcsDescribeServiceTask : IEcsTask
{
    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsDescribeServiceTask> _logger;

    /// <summary>
    /// Payload for describing an ECS service.
    /// </summary>
    public sealed record DescribeServicePayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Name of the service to describe.
        /// </summary>
        public required string ServiceName { get; init; }

        /// <summary>
        /// Whether to include task information. Defaults to <c>false</c>.
        /// </summary>
        public bool IncludeTasks { get; init; } = false;
    }

    /// <summary>
    /// Creates a new ECS describe service task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used for all describe/list calls.</param>
    /// <param name="timeProvider">Clock used to stamp <c>CompletedAt</c> on results.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsDescribeServiceTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsDescribeServiceTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Returns a failed result when the service is not found or when ECS raises an
    /// <see cref="AmazonECSException"/>. Throws <c>InvalidEcsPayloadException</c> when
    /// the payload deserializes to null.
    /// </remarks>
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<DescribeServicePayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.describe", "Failed to deserialize payload");

        _logger.LogInformation(
            "Describing ECS service {Service} in cluster {Cluster}",
            payload.ServiceName,
            payload.Cluster);

        try
        {
            var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
            {
                Cluster = payload.Cluster,
                Services = new List<string> { payload.ServiceName }
            }, ct);

            if (response.Services.FirstOrDefault() is not { } service)
            {
                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = false,
                    Error = $"Service '{payload.ServiceName}' not found",
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            var outputs = new Dictionary<string, object>
            {
                ["serviceArn"] = service.ServiceArn ?? "",
                ["serviceName"] = service.ServiceName ?? "",
                ["clusterArn"] = service.ClusterArn ?? "",
                ["status"] = service.Status ?? "",
                ["taskDefinition"] = service.TaskDefinition ?? "",
                ["desiredCount"] = service.DesiredCount,
                ["runningCount"] = service.RunningCount,
                ["pendingCount"] = service.PendingCount,
                ["launchType"] = service.LaunchType?.Value ?? "unknown",
                ["deploymentCount"] = service.Deployments.Count,
                ["createdAt"] = service.CreatedAt.GetValueOrDefault().ToUniversalTime().ToString("o", CultureInfo.InvariantCulture),
                ["deployments"] = service.Deployments.Select(d => new Dictionary<string, object>
                {
                    ["id"] = d.Id ?? "",
                    ["status"] = d.Status ?? "",
                    ["taskDefinition"] = d.TaskDefinition ?? "",
                    ["desiredCount"] = d.DesiredCount,
                    ["runningCount"] = d.RunningCount,
                    ["pendingCount"] = d.PendingCount,
                    ["createdAt"] = d.CreatedAt.GetValueOrDefault().ToUniversalTime().ToString("o", CultureInfo.InvariantCulture)
                }).ToList()
            };

            // Include tasks if requested. The key is only added when the service has tasks.
            if (payload.IncludeTasks)
            {
                var tasks = await DescribeServiceTasksAsync(payload, ct);
                if (tasks.Count > 0)
                {
                    outputs["tasks"] = tasks;
                }
            }

            _logger.LogInformation(
                "Described ECS service {Service}: {Running}/{Desired} running, {Deployments} deployments",
                payload.ServiceName,
                service.RunningCount,
                service.DesiredCount,
                service.Deployments.Count);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = true,
                Outputs = outputs,
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            _logger.LogError(ex, "Failed to describe ECS service {Service}", payload.ServiceName);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Failed to describe service: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }

    /// <summary>
    /// Lists every task of the service, following <c>NextToken</c> across pages, and
    /// describes each page of task ARNs.
    /// </summary>
    /// <param name="payload">Supplies the cluster and service name.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>One output dictionary per described task; empty when the service has no tasks.</returns>
    private async Task<List<Dictionary<string, object>>> DescribeServiceTasksAsync(
        DescribeServicePayload payload,
        CancellationToken ct)
    {
        var tasks = new List<Dictionary<string, object>>();
        string? nextToken = null;

        do
        {
            var listResponse = await _ecsClient.ListTasksAsync(new ListTasksRequest
            {
                Cluster = payload.Cluster,
                ServiceName = payload.ServiceName,
                NextToken = nextToken
            }, ct);

            // Describe page by page: per the ECS API reference a ListTasks page and a
            // DescribeTasks request are both capped at 100 tasks, so one page fits one call.
            if (listResponse.TaskArns is { Count: > 0 } taskArns)
            {
                var describeResponse = await _ecsClient.DescribeTasksAsync(new DescribeTasksRequest
                {
                    Cluster = payload.Cluster,
                    Tasks = taskArns
                }, ct);

                tasks.AddRange(describeResponse.Tasks.Select(t => new Dictionary<string, object>
                {
                    ["taskArn"] = t.TaskArn ?? "",
                    ["taskDefinitionArn"] = t.TaskDefinitionArn ?? "",
                    ["lastStatus"] = t.LastStatus ?? "",
                    ["desiredStatus"] = t.DesiredStatus ?? "",
                    ["healthStatus"] = t.HealthStatus?.Value ?? "unknown",
                    ["createdAt"] = t.CreatedAt.GetValueOrDefault().ToUniversalTime().ToString("o", CultureInfo.InvariantCulture),
                    ["containers"] = t.Containers.Select(c => new Dictionary<string, object>
                    {
                        ["name"] = c.Name ?? "",
                        ["lastStatus"] = c.LastStatus ?? "",
                        ["exitCode"] = c.ExitCode ?? -1,
                        ["healthStatus"] = c.HealthStatus?.Value ?? "unknown"
                    }).ToList()
                }));
            }

            nextToken = listResponse.NextToken;
        }
        while (!string.IsNullOrEmpty(nextToken));

        return tasks;
    }
}
|
||||
@@ -0,0 +1,233 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
using Task = System.Threading.Tasks.Task;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
/// Task handler for checking ECS service health, either once or by polling until the
/// service is healthy or a timeout elapses.
/// </summary>
public sealed class EcsHealthCheckTask : IEcsTask
{
    /// <summary>
    /// Delay between successive polls while waiting for the service to become healthy.
    /// </summary>
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(10);

    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsHealthCheckTask> _logger;

    /// <summary>
    /// Payload for checking ECS service health.
    /// </summary>
    public sealed record HealthCheckPayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Name of the service to check.
        /// </summary>
        public required string ServiceName { get; init; }

        /// <summary>
        /// Minimum healthy percent to consider the service healthy. Defaults to 100.
        /// </summary>
        public int MinHealthyPercent { get; init; } = 100;

        /// <summary>
        /// Whether to wait for the service to become healthy. Defaults to <c>true</c>.
        /// </summary>
        public bool WaitForHealthy { get; init; } = true;

        /// <summary>
        /// Timeout for waiting for health. Defaults to five minutes.
        /// </summary>
        public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(5);
    }

    /// <summary>
    /// Creates a new ECS health check task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used to describe the service.</param>
    /// <param name="timeProvider">Clock used to stamp <c>CompletedAt</c> on results.</param>
    /// <param name="logger">Logger for progress and failures.</param>
    public EcsHealthCheckTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsHealthCheckTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Performs a single check when <c>WaitForHealthy</c> is false, otherwise polls.
    /// ECS exceptions are converted into a failed result; caller cancellation propagates.
    /// </remarks>
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<HealthCheckPayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.health", "Failed to deserialize payload");

        _logger.LogInformation(
            "Checking health of ECS service {Service} in cluster {Cluster}",
            payload.ServiceName,
            payload.Cluster);

        try
        {
            if (!payload.WaitForHealthy)
            {
                return await CheckHealthOnceAsync(task.Id, payload, ct);
            }

            return await WaitForHealthyAsync(task.Id, payload, ct);
        }
        catch (AmazonECSException ex)
        {
            _logger.LogError(ex, "Failed to check health of ECS service {Service}", payload.ServiceName);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Health check failed: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }

    /// <summary>
    /// Describes the service once and reports whether it is currently healthy.
    /// </summary>
    private async Task<AgentTaskResult> CheckHealthOnceAsync(
        Guid taskId,
        HealthCheckPayload payload,
        CancellationToken ct)
    {
        var service = await DescribeServiceAsync(payload, ct);
        if (service is null)
        {
            return ServiceNotFound(taskId, payload);
        }

        var (isHealthy, error, outputs) = EvaluateHealth(service, payload);

        return new AgentTaskResult
        {
            TaskId = taskId,
            Success = isHealthy,
            Error = error,
            Outputs = outputs,
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }

    /// <summary>
    /// Polls the service every <see cref="PollInterval"/> until it is healthy, it
    /// disappears, or <c>payload.Timeout</c> elapses.
    /// </summary>
    /// <exception cref="OperationCanceledException">
    /// The caller cancelled <paramref name="ct"/> before a result was reached.
    /// </exception>
    private async Task<AgentTaskResult> WaitForHealthyAsync(
        Guid taskId,
        HealthCheckPayload payload,
        CancellationToken ct)
    {
        using var timeoutCts = new CancellationTokenSource(payload.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            while (!linkedCts.IsCancellationRequested)
            {
                var service = await DescribeServiceAsync(payload, linkedCts.Token);
                if (service is null)
                {
                    return ServiceNotFound(taskId, payload);
                }

                var (isHealthy, _, outputs) = EvaluateHealth(service, payload);

                if (isHealthy)
                {
                    _logger.LogInformation(
                        "Service {Service} is healthy: {Running}/{Desired} tasks running ({Percent}%)",
                        payload.ServiceName,
                        service.RunningCount,
                        service.DesiredCount,
                        outputs["healthyPercent"]);

                    return new AgentTaskResult
                    {
                        TaskId = taskId,
                        Success = true,
                        Outputs = outputs,
                        CompletedAt = _timeProvider.GetUtcNow()
                    };
                }

                _logger.LogDebug(
                    "Service {Service} health check: {Running}/{Desired} ({Percent}%), waiting...",
                    payload.ServiceName,
                    service.RunningCount,
                    service.DesiredCount,
                    outputs["healthyPercent"]);

                await Task.Delay(PollInterval, linkedCts.Token);
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            // The timeout fired while a call was in flight; reported below together with
            // the case where the loop condition observed the timeout.
        }

        if (!timeoutCts.IsCancellationRequested)
        {
            // The loop ended without a timeout, so the caller cancelled between polls.
            // Propagate cancellation rather than reporting a timeout that never happened.
            ct.ThrowIfCancellationRequested();
        }

        _logger.LogWarning(
            "Health check timed out after {Timeout} for service {Service}",
            payload.Timeout,
            payload.ServiceName);

        return new AgentTaskResult
        {
            TaskId = taskId,
            Success = false,
            Error = $"Health check timed out after {payload.Timeout}",
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }

    /// <summary>
    /// Fetches the service named in the payload, or null when ECS returns no match.
    /// </summary>
    private async Task<Amazon.ECS.Model.Service?> DescribeServiceAsync(
        HealthCheckPayload payload,
        CancellationToken ct)
    {
        var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
        {
            Cluster = payload.Cluster,
            Services = new List<string> { payload.ServiceName }
        }, ct);

        return response.Services.FirstOrDefault();
    }

    /// <summary>
    /// Builds the failed result returned when the service does not exist.
    /// </summary>
    private AgentTaskResult ServiceNotFound(Guid taskId, HealthCheckPayload payload) => new()
    {
        TaskId = taskId,
        Success = false,
        Error = $"Service '{payload.ServiceName}' not found",
        CompletedAt = _timeProvider.GetUtcNow()
    };

    /// <summary>
    /// Applies the health rule shared by both check modes: the running/desired ratio
    /// must reach <c>MinHealthyPercent</c> and exactly one deployment must be active.
    /// </summary>
    /// <returns>
    /// The verdict, an error message (null when healthy) that names every failed
    /// condition, and the output dictionary describing the observed state.
    /// </returns>
    private static (bool IsHealthy, string? Error, Dictionary<string, object> Outputs) EvaluateHealth(
        Amazon.ECS.Model.Service service,
        HealthCheckPayload payload)
    {
        // A desired count of zero is reported as 0% (integer division otherwise).
        var healthyPercent = service.DesiredCount > 0
            ? (service.RunningCount * 100) / service.DesiredCount
            : 0;
        var deploymentCount = service.Deployments?.Count ?? 0;
        var isHealthy = healthyPercent >= payload.MinHealthyPercent && deploymentCount == 1;

        string? error = null;
        if (!isHealthy)
        {
            error = $"Service unhealthy: {healthyPercent}% running (minimum: {payload.MinHealthyPercent}%)";
            if (deploymentCount != 1)
            {
                // Without this the message can read "100% running (minimum: 100%)"
                // while the real cause is a deployment still in flight.
                error += $", {deploymentCount} active deployments (expected 1)";
            }
        }

        var outputs = new Dictionary<string, object>
        {
            ["serviceName"] = service.ServiceName ?? "",
            ["serviceArn"] = service.ServiceArn ?? "",
            ["runningCount"] = service.RunningCount,
            ["desiredCount"] = service.DesiredCount,
            ["healthyPercent"] = healthyPercent,
            ["status"] = service.Status ?? "",
            ["deploymentCount"] = deploymentCount,
            ["isHealthy"] = isHealthy
        };

        return (isHealthy, error, outputs);
    }
}
|
||||
@@ -0,0 +1,282 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
/// Handles the agent task that registers an ECS task definition from a JSON payload.
/// </summary>
public sealed class EcsRegisterTaskDefinitionTask : IEcsTask
{
    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsRegisterTaskDefinitionTask> _logger;

    /// <summary>
    /// Describes the task definition to register.
    /// </summary>
    public sealed record RegisterTaskDefinitionPayload
    {
        /// <summary>
        /// Task definition family name.
        /// </summary>
        public required string Family { get; init; }

        /// <summary>
        /// Containers that make up the task.
        /// </summary>
        public required IReadOnlyList<ContainerDefinitionPayload> ContainerDefinitions { get; init; }

        /// <summary>
        /// CPU at task level; forwarded unchanged when set.
        /// </summary>
        public string? Cpu { get; init; }

        /// <summary>
        /// Memory at task level; forwarded unchanged when set.
        /// </summary>
        public string? Memory { get; init; }

        /// <summary>
        /// Network mode; only applied to the request when non-empty.
        /// </summary>
        public string? NetworkMode { get; init; }

        /// <summary>
        /// ARN of the task role.
        /// </summary>
        public string? TaskRoleArn { get; init; }

        /// <summary>
        /// ARN of the execution role.
        /// </summary>
        public string? ExecutionRoleArn { get; init; }

        /// <summary>
        /// Required capabilities (FARGATE, EC2); only applied when provided.
        /// </summary>
        public IReadOnlyList<string>? RequiresCompatibilities { get; init; }

        /// <summary>
        /// Tags for the task definition; only applied when provided.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Tags { get; init; }
    }

    /// <summary>
    /// Describes a single container within the task definition.
    /// </summary>
    public sealed record ContainerDefinitionPayload
    {
        /// <summary>
        /// Name of the container.
        /// </summary>
        public required string Name { get; init; }

        /// <summary>
        /// Image the container runs.
        /// </summary>
        public required string Image { get; init; }

        /// <summary>
        /// CPU units for the container; sent as 0 when omitted.
        /// </summary>
        public int? Cpu { get; init; }

        /// <summary>
        /// Memory for the container, in MB.
        /// </summary>
        public int? Memory { get; init; }

        /// <summary>
        /// Memory reservation for the container, in MB.
        /// </summary>
        public int? MemoryReservation { get; init; }

        /// <summary>
        /// Ports exposed by the container.
        /// </summary>
        public IReadOnlyList<PortMappingPayload>? PortMappings { get; init; }

        /// <summary>
        /// Environment variables passed to the container.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Environment { get; init; }

        /// <summary>
        /// Whether the container is essential. Defaults to <c>true</c>.
        /// </summary>
        public bool Essential { get; init; } = true;

        /// <summary>
        /// Entry point override.
        /// </summary>
        public IReadOnlyList<string>? EntryPoint { get; init; }

        /// <summary>
        /// Command override.
        /// </summary>
        public IReadOnlyList<string>? Command { get; init; }

        /// <summary>
        /// Logging configuration for the container.
        /// </summary>
        public LogConfigurationPayload? LogConfiguration { get; init; }
    }

    /// <summary>
    /// Describes one container port mapping.
    /// </summary>
    public sealed record PortMappingPayload
    {
        /// <summary>
        /// Port inside the container.
        /// </summary>
        public required int ContainerPort { get; init; }

        /// <summary>
        /// Port on the host; the container port is used when omitted.
        /// </summary>
        public int? HostPort { get; init; }

        /// <summary>
        /// Protocol (tcp or udp). Defaults to "tcp".
        /// </summary>
        public string Protocol { get; init; } = "tcp";
    }

    /// <summary>
    /// Describes a container's log driver and its options.
    /// </summary>
    public sealed record LogConfigurationPayload
    {
        /// <summary>
        /// Name of the log driver.
        /// </summary>
        public required string LogDriver { get; init; }

        /// <summary>
        /// Options passed to the log driver.
        /// </summary>
        public IReadOnlyDictionary<string, string>? Options { get; init; }
    }

    /// <summary>
    /// Creates the handler with its ECS client, clock and logger.
    /// </summary>
    public EcsRegisterTaskDefinitionTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsRegisterTaskDefinitionTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<RegisterTaskDefinitionPayload>(task.Payload);
        if (payload is null)
        {
            throw new InvalidEcsPayloadException("ecs.register", "Failed to deserialize payload");
        }

        _logger.LogInformation(
            "Registering ECS task definition for family {Family}",
            payload.Family);

        try
        {
            var registration = await _ecsClient.RegisterTaskDefinitionAsync(BuildRequest(payload), ct);
            var registered = registration.TaskDefinition;

            _logger.LogInformation(
                "Registered ECS task definition {Family}:{Revision} (ARN: {Arn})",
                registered.Family,
                registered.Revision,
                registered.TaskDefinitionArn);

            var outputs = new Dictionary<string, object>
            {
                ["taskDefinitionArn"] = registered.TaskDefinitionArn ?? "",
                ["family"] = registered.Family ?? "",
                ["revision"] = registered.Revision,
                ["status"] = registered.Status?.Value ?? "",
                ["containerCount"] = registered.ContainerDefinitions.Count
            };

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = true,
                Outputs = outputs,
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            _logger.LogError(ex, "Failed to register ECS task definition for family {Family}", payload.Family);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Failed to register task definition: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }

    /// <summary>
    /// Translates the payload into the SDK request, applying optional settings only when present.
    /// </summary>
    private static RegisterTaskDefinitionRequest BuildRequest(RegisterTaskDefinitionPayload payload)
    {
        var request = new RegisterTaskDefinitionRequest
        {
            Family = payload.Family,
            Cpu = payload.Cpu,
            Memory = payload.Memory,
            TaskRoleArn = payload.TaskRoleArn,
            ExecutionRoleArn = payload.ExecutionRoleArn,
            ContainerDefinitions = payload.ContainerDefinitions.Select(ToContainerDefinition).ToList()
        };

        if (payload.NetworkMode is { Length: > 0 } networkMode)
        {
            request.NetworkMode = new NetworkMode(networkMode);
        }

        if (payload.RequiresCompatibilities is { } compatibilities)
        {
            request.RequiresCompatibilities = compatibilities.ToList();
        }

        if (payload.Tags is { } tags)
        {
            request.Tags = tags.Select(pair => new Tag { Key = pair.Key, Value = pair.Value }).ToList();
        }

        return request;
    }

    /// <summary>
    /// Maps one container payload onto the SDK container definition.
    /// </summary>
    private static ContainerDefinition ToContainerDefinition(ContainerDefinitionPayload source)
    {
        return new ContainerDefinition
        {
            Name = source.Name,
            Image = source.Image,
            Cpu = source.Cpu ?? 0,
            Memory = source.Memory,
            MemoryReservation = source.MemoryReservation,
            Essential = source.Essential,
            EntryPoint = source.EntryPoint?.ToList(),
            Command = source.Command?.ToList(),
            PortMappings = source.PortMappings?.Select(port => new PortMapping
            {
                ContainerPort = port.ContainerPort,
                HostPort = port.HostPort ?? port.ContainerPort,
                Protocol = port.Protocol
            }).ToList(),
            Environment = source.Environment?.Select(pair => new Amazon.ECS.Model.KeyValuePair
            {
                Name = pair.Key,
                Value = pair.Value
            }).ToList(),
            LogConfiguration = source.LogConfiguration is { } logging
                ? new LogConfiguration
                {
                    LogDriver = logging.LogDriver,
                    Options = logging.Options?.ToDictionary(pair => pair.Key, pair => pair.Value)
                }
                : null
        };
    }
}
|
||||
@@ -0,0 +1,331 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
using Task = System.Threading.Tasks.Task;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
|
||||
/// Task handler for running ECS tasks.
|
||||
/// </summary>
|
||||
public sealed class EcsRunTaskTask : IEcsTask
|
||||
{
|
||||
private readonly IAmazonECS _ecsClient;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<EcsRunTaskTask> _logger;
|
||||
|
||||
/// <summary>
/// Describes which task definition to run, where, and whether to wait for it.
/// </summary>
public sealed record RunTaskPayload
{
    /// <summary>
    /// ECS cluster, given by name or ARN.
    /// </summary>
    public required string Cluster { get; init; }

    /// <summary>
    /// Task definition, given as family:revision or ARN.
    /// </summary>
    public required string TaskDefinition { get; init; }

    /// <summary>
    /// How many tasks to start. Defaults to 1.
    /// </summary>
    public int Count { get; init; } = 1;

    /// <summary>
    /// Launch type (FARGATE or EC2); only applied when non-empty.
    /// </summary>
    public string? LaunchType { get; init; }

    /// <summary>
    /// Network settings for awsvpc mode.
    /// </summary>
    public NetworkConfigurationPayload? NetworkConfiguration { get; init; }

    /// <summary>
    /// Per-container overrides.
    /// </summary>
    public IReadOnlyList<ContainerOverridePayload>? Overrides { get; init; }

    /// <summary>
    /// Task group name.
    /// </summary>
    public string? Group { get; init; }

    /// <summary>
    /// Whether to wait until the tasks complete. Defaults to <c>true</c>.
    /// </summary>
    public bool WaitForCompletion { get; init; } = true;

    /// <summary>
    /// How long to wait for completion. Defaults to 30 minutes.
    /// </summary>
    public TimeSpan CompletionTimeout { get; init; } = TimeSpan.FromMinutes(30);

    /// <summary>
    /// Tags applied to the started task.
    /// </summary>
    public IReadOnlyDictionary<string, string>? Tags { get; init; }
}
|
||||
|
||||
/// <summary>
/// Network settings that are mapped onto the request's awsvpc configuration.
/// </summary>
public sealed record NetworkConfigurationPayload
{
    /// <summary>
    /// IDs of the subnets to use.
    /// </summary>
    public required IReadOnlyList<string> Subnets { get; init; }

    /// <summary>
    /// IDs of the security groups to use.
    /// </summary>
    public IReadOnlyList<string>? SecurityGroups { get; init; }

    /// <summary>
    /// Whether a public IP should be assigned. Defaults to <c>false</c>.
    /// </summary>
    public bool AssignPublicIp { get; init; }
}
|
||||
|
||||
/// <summary>
/// Overrides applied to one named container when the task is run.
/// </summary>
public sealed record ContainerOverridePayload
{
    /// <summary>
    /// Name of the container to override.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Replacement command.
    /// </summary>
    public IReadOnlyList<string>? Command { get; init; }

    /// <summary>
    /// Replacement environment variables.
    /// </summary>
    public IReadOnlyDictionary<string, string>? Environment { get; init; }

    /// <summary>
    /// Replacement CPU value.
    /// </summary>
    public int? Cpu { get; init; }

    /// <summary>
    /// Replacement memory value.
    /// </summary>
    public int? Memory { get; init; }
}
|
||||
|
||||
/// <summary>
/// Creates the run-task handler with its ECS client, clock and logger.
/// </summary>
public EcsRunTaskTask(
    IAmazonECS ecsClient,
    TimeProvider timeProvider,
    ILogger<EcsRunTaskTask> logger)
{
    (_ecsClient, _timeProvider, _logger) = (ecsClient, timeProvider, logger);
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Starts the task(s) described by the payload and, unless <c>WaitForCompletion</c> is
/// false, waits for them and succeeds only when the wait completed and every reported
/// exit code is zero. Any entry in the RunTask failure list, or an
/// <see cref="AmazonECSException"/>, yields a failed result.
/// </remarks>
public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
{
    var payload = JsonSerializer.Deserialize<RunTaskPayload>(task.Payload)
        ?? throw new InvalidEcsPayloadException("ecs.run", "Failed to deserialize payload");

    _logger.LogInformation(
        "Running ECS task from definition {TaskDef} on cluster {Cluster}",
        payload.TaskDefinition,
        payload.Cluster);

    try
    {
        var request = new RunTaskRequest
        {
            Cluster = payload.Cluster,
            TaskDefinition = payload.TaskDefinition,
            Count = payload.Count,
            Group = payload.Group
        };

        if (!string.IsNullOrEmpty(payload.LaunchType))
        {
            request.LaunchType = new LaunchType(payload.LaunchType);
        }

        if (payload.NetworkConfiguration is { } network)
        {
            request.NetworkConfiguration = new NetworkConfiguration
            {
                AwsvpcConfiguration = new AwsVpcConfiguration
                {
                    Subnets = network.Subnets.ToList(),
                    SecurityGroups = network.SecurityGroups?.ToList(),
                    AssignPublicIp = network.AssignPublicIp
                        ? AssignPublicIp.ENABLED
                        : AssignPublicIp.DISABLED
                }
            };
        }

        if (payload.Overrides is { Count: > 0 } overrides)
        {
            request.Overrides = new TaskOverride
            {
                ContainerOverrides = overrides.Select(o => new ContainerOverride
                {
                    Name = o.Name,
                    Command = o.Command?.ToList(),
                    Environment = o.Environment?.Select(kv => new Amazon.ECS.Model.KeyValuePair
                    {
                        Name = kv.Key,
                        Value = kv.Value
                    }).ToList(),
                    Cpu = o.Cpu,
                    Memory = o.Memory
                }).ToList()
            };
        }

        if (payload.Tags is not null)
        {
            request.Tags = payload.Tags.Select(kv => new Tag { Key = kv.Key, Value = kv.Value }).ToList();
        }

        var runResponse = await _ecsClient.RunTaskAsync(request, ct);

        // Any reported failure fails the whole request; only the first one is surfaced.
        if (runResponse.Failures.Count > 0)
        {
            var failure = runResponse.Failures.First();
            _logger.LogError(
                "Failed to run ECS task: {Reason} (ARN: {Arn})",
                failure.Reason,
                failure.Arn);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Failed to run task: {failure.Reason}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }

        var ecsTasks = runResponse.Tasks;
        var taskArns = ecsTasks.Select(t => t.TaskArn).ToList();

        // Log only the last ARN segment to keep the line short.
        _logger.LogInformation(
            "Started {Count} ECS task(s): {TaskArns}",
            ecsTasks.Count,
            string.Join(", ", taskArns.Select(a => a.Split('/').Last())));

        if (!payload.WaitForCompletion)
        {
            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = true,
                Outputs = new Dictionary<string, object>
                {
                    ["taskArns"] = taskArns,
                    ["taskCount"] = ecsTasks.Count,
                    ["status"] = "RUNNING"
                },
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }

        // Wait for tasks to complete
        var (completed, exitCodes) = await WaitForTasksAsync(
            payload.Cluster,
            taskArns,
            payload.CompletionTimeout,
            ct);

        var allSucceeded = completed && exitCodes.All(e => e == 0);

        // Distinguish "never finished" from "finished with a non-zero exit code"; the
        // former previously surfaced as "failed with exit codes: []".
        // NOTE(review): assumes Completed == false means the wait ran out of time —
        // confirm against WaitForTasksAsync.
        string? error = null;
        if (!completed)
        {
            error = $"Task(s) did not complete within {payload.CompletionTimeout}";
        }
        else if (!allSucceeded)
        {
            error = $"Task(s) failed with exit codes: [{string.Join(", ", exitCodes)}]";
        }

        return new AgentTaskResult
        {
            TaskId = task.Id,
            Success = allSucceeded,
            Error = error,
            Outputs = new Dictionary<string, object>
            {
                ["taskArns"] = taskArns,
                ["taskCount"] = ecsTasks.Count,
                ["exitCodes"] = exitCodes,
                ["status"] = allSucceeded ? "SUCCEEDED" : "FAILED"
            },
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }
    catch (AmazonECSException ex)
    {
        _logger.LogError(ex, "Failed to run ECS task from {TaskDef}", payload.TaskDefinition);

        return new AgentTaskResult
        {
            TaskId = task.Id,
            Success = false,
            Error = $"Failed to run task: {ex.Message}",
            CompletedAt = _timeProvider.GetUtcNow()
        };
    }
}
|
||||
|
||||
/// <summary>
/// Polls DescribeTasks every 10 seconds until every task in <paramref name="taskArns"/>
/// is reported with a last status of STOPPED, the timeout elapses, or the caller cancels.
/// </summary>
/// <param name="cluster">Cluster name or ARN passed to DescribeTasks.</param>
/// <param name="taskArns">ARNs of the tasks to wait for.</param>
/// <param name="timeout">Maximum time to keep polling.</param>
/// <param name="ct">Caller cancellation token; its cancellation propagates as an exception.</param>
/// <returns>
/// <c>Completed</c> is <c>true</c> only when all requested tasks were described and stopped;
/// <c>ExitCodes</c> then holds one entry per container (-1 when no exit code was reported).
/// On timeout the result is <c>(false, empty list)</c>.
/// </returns>
private async Task<(bool Completed, List<int> ExitCodes)> WaitForTasksAsync(
    string cluster,
    List<string> taskArns,
    TimeSpan timeout,
    CancellationToken ct)
{
    using var timeoutCts = new CancellationTokenSource(timeout);
    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

    var exitCodes = new List<int>();

    try
    {
        while (!linkedCts.IsCancellationRequested)
        {
            var response = await _ecsClient.DescribeTasksAsync(new DescribeTasksRequest
            {
                Cluster = cluster,
                Tasks = taskArns
            }, linkedCts.Token);

            // Enumerable.All is vacuously true for an empty sequence, so an empty or
            // partial response must not be mistaken for "everything stopped". Tasks that
            // are not (yet) described keep the loop polling until the timeout decides.
            var described = response.Tasks;
            var allStopped = described is { Count: > 0 }
                && described.Count >= taskArns.Count
                && described.All(t => t.LastStatus == "STOPPED");

            if (allStopped)
            {
                // A stopped task without container details contributes -1 so that the
                // caller's "all exit codes are zero" check cannot pass on missing data.
                exitCodes = described!
                    .SelectMany(t => t.Containers is { Count: > 0 } containers
                        ? containers.Select(c => c.ExitCode ?? -1)
                        : new[] { -1 })
                    .ToList();

                _logger.LogInformation(
                    "ECS tasks completed with exit codes: [{ExitCodes}]",
                    string.Join(", ", exitCodes));

                return (true, exitCodes);
            }

            await Task.Delay(TimeSpan.FromSeconds(10), linkedCts.Token);
        }
    }
    catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
    {
        // Only the internal timeout is swallowed; caller-initiated cancellation propagates.
        _logger.LogWarning("Task completion wait timed out after {Timeout}", timeout);
    }

    return (false, exitCodes);
}
|
||||
}
|
||||
@@ -0,0 +1,231 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
using Task = System.Threading.Tasks.Task;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
/// Task handler for scaling ECS services.
/// </summary>
/// <remarks>
/// Reads the service's current desired count, updates it to the requested value and,
/// when <see cref="ScaleServicePayload.WaitForStable"/> is set, polls until the running
/// count matches the target with exactly one deployment.
/// </remarks>
public sealed class EcsScaleServiceTask : IEcsTask
{
    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsScaleServiceTask> _logger;

    /// <summary>
    /// Payload for scaling an ECS service.
    /// </summary>
    public sealed record ScaleServicePayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Name of the service to scale.
        /// </summary>
        public required string ServiceName { get; init; }

        /// <summary>
        /// Desired number of tasks.
        /// </summary>
        public required int DesiredCount { get; init; }

        /// <summary>
        /// Whether to wait for the scaling operation to complete.
        /// </summary>
        public bool WaitForStable { get; init; } = true;

        /// <summary>
        /// Timeout waiting for stabilization.
        /// </summary>
        public TimeSpan StabilizeTimeout { get; init; } = TimeSpan.FromMinutes(5);
    }

    /// <summary>
    /// Creates a new ECS scale service task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used for DescribeServices and UpdateService calls.</param>
    /// <param name="timeProvider">Clock used to stamp result completion times.</param>
    /// <param name="logger">Logger for progress and failure messages.</param>
    public EcsScaleServiceTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsScaleServiceTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<ScaleServicePayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.scale", "Failed to deserialize payload");

        _logger.LogInformation(
            "Scaling ECS service {Service} in cluster {Cluster} to {DesiredCount} tasks",
            payload.ServiceName,
            payload.Cluster,
            payload.DesiredCount);

        try
        {
            // Get current service state
            var describeResponse = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
            {
                Cluster = payload.Cluster,
                Services = new List<string> { payload.ServiceName }
            }, ct);

            // The response collection may be null rather than empty when nothing matched.
            var currentService = describeResponse.Services?.FirstOrDefault();
            if (currentService is null)
            {
                return Failure(
                    task,
                    $"Service '{payload.ServiceName}' not found in cluster '{payload.Cluster}'");
            }

            var previousCount = currentService.DesiredCount;

            // Update desired count
            var updateResponse = await _ecsClient.UpdateServiceAsync(new UpdateServiceRequest
            {
                Cluster = payload.Cluster,
                Service = payload.ServiceName,
                DesiredCount = payload.DesiredCount
            }, ct);

            if (updateResponse.Service is not { } service)
            {
                return Failure(task, "Service update returned no service object");
            }

            _logger.LogInformation(
                "Updated ECS service {Service} desired count from {Previous} to {New}",
                payload.ServiceName,
                previousCount,
                payload.DesiredCount);

            if (!payload.WaitForStable)
            {
                // Fire-and-forget mode: report the state as returned by UpdateService.
                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = true,
                    Outputs = new Dictionary<string, object>
                    {
                        ["serviceArn"] = service.ServiceArn ?? "",
                        ["serviceName"] = service.ServiceName ?? "",
                        ["previousDesiredCount"] = previousCount,
                        ["newDesiredCount"] = payload.DesiredCount,
                        ["runningCount"] = service.RunningCount,
                        ["status"] = "SCALING"
                    },
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            // Wait for stable
            var stable = await WaitForServiceStableAsync(
                payload.Cluster,
                payload.ServiceName,
                payload.DesiredCount,
                payload.StabilizeTimeout,
                ct);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = stable,
                Error = stable ? null : "Service did not stabilize within timeout",
                Outputs = new Dictionary<string, object>
                {
                    ["serviceArn"] = service.ServiceArn ?? "",
                    ["serviceName"] = service.ServiceName ?? "",
                    ["previousDesiredCount"] = previousCount,
                    ["newDesiredCount"] = payload.DesiredCount,
                    ["status"] = stable ? "STABLE" : "UNSTABLE"
                },
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            _logger.LogError(ex, "Failed to scale ECS service {Service}", payload.ServiceName);

            return Failure(task, $"Failed to scale service: {ex.Message}");
        }
    }

    // Builds a failed result for the given task, stamped with the injected clock's current time.
    private AgentTaskResult Failure(AgentTaskInfo task, string error) => new AgentTaskResult
    {
        TaskId = task.Id,
        Success = false,
        Error = error,
        CompletedAt = _timeProvider.GetUtcNow()
    };

    /// <summary>
    /// Polls DescribeServices every 5 seconds until the service reports
    /// <paramref name="targetCount"/> running tasks with exactly one deployment.
    /// </summary>
    /// <param name="cluster">Cluster name or ARN.</param>
    /// <param name="serviceName">Service to observe.</param>
    /// <param name="targetCount">Running-task count that counts as stable.</param>
    /// <param name="timeout">Maximum time to keep polling.</param>
    /// <param name="ct">Caller cancellation token; its cancellation propagates as an exception.</param>
    /// <returns>
    /// <c>true</c> when the stable condition was observed; <c>false</c> on timeout or when
    /// the service is no longer returned by DescribeServices.
    /// </returns>
    private async Task<bool> WaitForServiceStableAsync(
        string cluster,
        string serviceName,
        int targetCount,
        TimeSpan timeout,
        CancellationToken ct)
    {
        using var timeoutCts = new CancellationTokenSource(timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            while (!linkedCts.IsCancellationRequested)
            {
                var response = await _ecsClient.DescribeServicesAsync(new DescribeServicesRequest
                {
                    Cluster = cluster,
                    Services = new List<string> { serviceName }
                }, linkedCts.Token);

                var service = response.Services?.FirstOrDefault();
                if (service is null)
                {
                    // Leave a trace: the caller can only report a generic failure for this case.
                    _logger.LogWarning(
                        "Service {Service} is no longer reported in cluster {Cluster}; aborting stabilization wait",
                        serviceName,
                        cluster);
                    return false;
                }

                // A missing deployments collection is treated as "not stable yet".
                if (service.RunningCount == targetCount && service.Deployments?.Count == 1)
                {
                    _logger.LogInformation(
                        "Service {Service} scaled to {Count} running tasks",
                        serviceName,
                        targetCount);
                    return true;
                }

                _logger.LogDebug(
                    "Service {Service} scaling: running={Running}, desired={Desired}",
                    serviceName,
                    service.RunningCount,
                    targetCount);

                await Task.Delay(TimeSpan.FromSeconds(5), linkedCts.Token);
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            // Only the internal timeout is swallowed; caller-initiated cancellation propagates.
            _logger.LogWarning("Service scaling stabilization timed out after {Timeout}", timeout);
        }

        return false;
    }
}
|
||||
@@ -0,0 +1,107 @@
|
||||
using System.Text.Json;
|
||||
using Amazon.ECS;
|
||||
using Amazon.ECS.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Agent.Core.Models;
|
||||
|
||||
namespace StellaOps.Agent.Ecs.Tasks;
|
||||
|
||||
/// <summary>
/// Task handler for stopping ECS tasks.
/// </summary>
public sealed class EcsStopTaskTask : IEcsTask
{
    private readonly IAmazonECS _ecsClient;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<EcsStopTaskTask> _logger;

    /// <summary>
    /// Payload for stopping an ECS task.
    /// </summary>
    public sealed record StopTaskPayload
    {
        /// <summary>
        /// Name or ARN of the ECS cluster.
        /// </summary>
        public required string Cluster { get; init; }

        /// <summary>
        /// Task ARN or ID to stop.
        /// </summary>
        public required string TaskArn { get; init; }

        /// <summary>
        /// Reason for stopping the task.
        /// </summary>
        public string? Reason { get; init; }
    }

    /// <summary>
    /// Creates a new ECS stop task handler.
    /// </summary>
    /// <param name="ecsClient">ECS client used for the StopTask call.</param>
    /// <param name="timeProvider">Clock used for completion times and as the stop-time fallback.</param>
    /// <param name="logger">Logger for progress and failure messages.</param>
    public EcsStopTaskTask(
        IAmazonECS ecsClient,
        TimeProvider timeProvider,
        ILogger<EcsStopTaskTask> logger)
    {
        _ecsClient = ecsClient;
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<AgentTaskResult> ExecuteAsync(AgentTaskInfo task, CancellationToken ct = default)
    {
        var payload = JsonSerializer.Deserialize<StopTaskPayload>(task.Payload)
            ?? throw new InvalidEcsPayloadException("ecs.stop", "Failed to deserialize payload");

        _logger.LogInformation(
            "Stopping ECS task {TaskArn} in cluster {Cluster}",
            payload.TaskArn,
            payload.Cluster);

        try
        {
            var request = new StopTaskRequest
            {
                Cluster = payload.Cluster,
                Task = payload.TaskArn,
                Reason = payload.Reason ?? "Stopped by Stella Agent"
            };

            var response = await _ecsClient.StopTaskAsync(request, ct);

            // Guard against a response without a task instead of failing on a null dereference.
            if (response.Task is not { } stoppedTask)
            {
                return new AgentTaskResult
                {
                    TaskId = task.Id,
                    Success = false,
                    Error = "Stop task returned no task object",
                    CompletedAt = _timeProvider.GetUtcNow()
                };
            }

            _logger.LogInformation(
                "Stopped ECS task {TaskArn}, last status: {Status}",
                stoppedTask.TaskArn,
                stoppedTask.LastStatus);

            // StoppedAt may be unset in the response; fall back to the current time
            // rather than emitting 0001-01-01 as the stop timestamp.
            var stoppedAtUtc = stoppedTask.StoppedAt?.ToUniversalTime()
                ?? _timeProvider.GetUtcNow().UtcDateTime;

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = true,
                Outputs = new Dictionary<string, object>
                {
                    ["taskArn"] = stoppedTask.TaskArn ?? "",
                    ["lastStatus"] = stoppedTask.LastStatus ?? "",
                    ["stoppedReason"] = stoppedTask.StoppedReason ?? payload.Reason ?? "Stopped by agent",
                    ["stoppedAt"] = stoppedAtUtc.ToString("o", System.Globalization.CultureInfo.InvariantCulture)
                },
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
        catch (AmazonECSException ex)
        {
            _logger.LogError(ex, "Failed to stop ECS task {TaskArn}", payload.TaskArn);

            return new AgentTaskResult
            {
                TaskId = task.Id,
                Success = false,
                Error = $"Failed to stop task: {ex.Message}",
                CompletedAt = _timeProvider.GetUtcNow()
            };
        }
    }
}
|
||||
Reference in New Issue
Block a user