release orchestration strengthening
This commit is contained in:
542
src/Api/StellaOps.Api/Controllers/EnvironmentsController.cs
Normal file
542
src/Api/StellaOps.Api/Controllers/EnvironmentsController.cs
Normal file
@@ -0,0 +1,542 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// EnvironmentsController.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_observability
|
||||
// Task: API-003 - Environment Management API Endpoints
|
||||
// Description: API endpoints for environment configuration and health
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
|
||||
namespace StellaOps.Api.Controllers;
|
||||
|
||||
/// <summary>
|
||||
/// Controller for environment management endpoints.
|
||||
/// </summary>
|
||||
[ApiController]
|
||||
[Route("v1/environments")]
|
||||
[Authorize]
|
||||
public class EnvironmentsController : ControllerBase
|
||||
{
|
||||
private readonly IEnvironmentService _environmentService;
|
||||
private readonly ILogger<EnvironmentsController> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="EnvironmentsController"/> class.
|
||||
/// </summary>
|
||||
public EnvironmentsController(
|
||||
IEnvironmentService environmentService,
|
||||
ILogger<EnvironmentsController> logger)
|
||||
{
|
||||
_environmentService = environmentService;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists all configured environments.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>List of environments.</returns>
|
||||
[HttpGet]
|
||||
[ProducesResponseType(typeof(ListEnvironmentsResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> ListEnvironments(CancellationToken ct)
|
||||
{
|
||||
_logger.LogDebug("Listing environments");
|
||||
|
||||
var environments = await _environmentService.ListEnvironmentsAsync(ct);
|
||||
|
||||
return Ok(new ListEnvironmentsResponse { Environments = environments });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a specific environment by name.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The environment details.</returns>
|
||||
[HttpGet("{environmentName}")]
|
||||
[ProducesResponseType(typeof(EnvironmentDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetEnvironment(
|
||||
[FromRoute] string environmentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var environment = await _environmentService.GetEnvironmentAsync(environmentName, ct);
|
||||
|
||||
if (environment is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(environment);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new environment.
|
||||
/// </summary>
|
||||
/// <param name="request">The environment creation request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The created environment.</returns>
|
||||
[HttpPost]
|
||||
[ProducesResponseType(typeof(EnvironmentDto), StatusCodes.Status201Created)]
|
||||
[ProducesResponseType(typeof(ValidationProblemDetails), StatusCodes.Status400BadRequest)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status409Conflict)]
|
||||
public async Task<IActionResult> CreateEnvironment(
|
||||
[FromBody] CreateEnvironmentRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation("Creating environment {Name}", request.Name);
|
||||
|
||||
try
|
||||
{
|
||||
var environment = await _environmentService.CreateEnvironmentAsync(request, ct);
|
||||
|
||||
return CreatedAtAction(
|
||||
nameof(GetEnvironment),
|
||||
new { environmentName = environment.Name },
|
||||
environment);
|
||||
}
|
||||
catch (EnvironmentAlreadyExistsException)
|
||||
{
|
||||
return Conflict(new ProblemDetails
|
||||
{
|
||||
Title = "Environment already exists",
|
||||
Detail = $"Environment '{request.Name}' already exists",
|
||||
Status = StatusCodes.Status409Conflict
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates an existing environment.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="request">The environment update request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The updated environment.</returns>
|
||||
[HttpPut("{environmentName}")]
|
||||
[ProducesResponseType(typeof(EnvironmentDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> UpdateEnvironment(
|
||||
[FromRoute] string environmentName,
|
||||
[FromBody] UpdateEnvironmentRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation("Updating environment {Name}", environmentName);
|
||||
|
||||
try
|
||||
{
|
||||
var environment = await _environmentService.UpdateEnvironmentAsync(
|
||||
environmentName, request, ct);
|
||||
return Ok(environment);
|
||||
}
|
||||
catch (EnvironmentNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deletes an environment.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>No content on success.</returns>
|
||||
[HttpDelete("{environmentName}")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status409Conflict)]
|
||||
public async Task<IActionResult> DeleteEnvironment(
|
||||
[FromRoute] string environmentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogWarning("Deleting environment {Name}", environmentName);
|
||||
|
||||
try
|
||||
{
|
||||
await _environmentService.DeleteEnvironmentAsync(environmentName, ct);
|
||||
return NoContent();
|
||||
}
|
||||
catch (EnvironmentNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
catch (EnvironmentInUseException)
|
||||
{
|
||||
return Conflict(new ProblemDetails
|
||||
{
|
||||
Title = "Environment in use",
|
||||
Detail = $"Environment '{environmentName}' has active releases and cannot be deleted",
|
||||
Status = StatusCodes.Status409Conflict
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the health status of an environment.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The environment health.</returns>
|
||||
[HttpGet("{environmentName}/health")]
|
||||
[ProducesResponseType(typeof(EnvironmentHealthDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetEnvironmentHealth(
|
||||
[FromRoute] string environmentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var health = await _environmentService.GetEnvironmentHealthAsync(environmentName, ct);
|
||||
|
||||
if (health is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(health);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current deployments in an environment.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The current deployments.</returns>
|
||||
[HttpGet("{environmentName}/deployments")]
|
||||
[ProducesResponseType(typeof(ListDeploymentsResponse), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetEnvironmentDeployments(
|
||||
[FromRoute] string environmentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var deployments = await _environmentService.GetDeploymentsAsync(environmentName, ct);
|
||||
|
||||
if (deployments is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(new ListDeploymentsResponse { Deployments = deployments });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the promotion path for an environment.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The promotion path.</returns>
|
||||
[HttpGet("{environmentName}/promotion-path")]
|
||||
[ProducesResponseType(typeof(PromotionPathDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetPromotionPath(
|
||||
[FromRoute] string environmentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var path = await _environmentService.GetPromotionPathAsync(environmentName, ct);
|
||||
|
||||
if (path is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(path);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Locks an environment to prevent deployments.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="request">The lock request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The lock result.</returns>
|
||||
[HttpPost("{environmentName}/lock")]
|
||||
[ProducesResponseType(typeof(EnvironmentLockDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> LockEnvironment(
|
||||
[FromRoute] string environmentName,
|
||||
[FromBody] LockEnvironmentRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Locking environment {Environment}, reason: {Reason}",
|
||||
environmentName, request.Reason);
|
||||
|
||||
try
|
||||
{
|
||||
var lockResult = await _environmentService.LockEnvironmentAsync(
|
||||
environmentName, request.Reason, request.ExpiresAt, ct);
|
||||
return Ok(lockResult);
|
||||
}
|
||||
catch (EnvironmentNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unlocks an environment.
|
||||
/// </summary>
|
||||
/// <param name="environmentName">The environment name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>No content on success.</returns>
|
||||
[HttpDelete("{environmentName}/lock")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> UnlockEnvironment(
|
||||
[FromRoute] string environmentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation("Unlocking environment {Environment}", environmentName);
|
||||
|
||||
try
|
||||
{
|
||||
await _environmentService.UnlockEnvironmentAsync(environmentName, ct);
|
||||
return NoContent();
|
||||
}
|
||||
catch (EnvironmentNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Environment not found",
|
||||
Detail = $"Environment '{environmentName}' does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#region Request/Response DTOs
|
||||
|
||||
/// <summary>
|
||||
/// Response for listing environments.
|
||||
/// </summary>
|
||||
public sealed record ListEnvironmentsResponse
|
||||
{
|
||||
public required IReadOnlyList<EnvironmentDto> Environments { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Environment data transfer object.
|
||||
/// </summary>
|
||||
public sealed record EnvironmentDto
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string DisplayName { get; init; }
|
||||
public required int Order { get; init; }
|
||||
public required bool IsProduction { get; init; }
|
||||
public required bool IsLocked { get; init; }
|
||||
public string? Description { get; init; }
|
||||
public string? NextEnvironment { get; init; }
|
||||
public string? PreviousEnvironment { get; init; }
|
||||
public ImmutableDictionary<string, string> Labels { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to create an environment.
|
||||
/// </summary>
|
||||
public sealed record CreateEnvironmentRequest
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string DisplayName { get; init; }
|
||||
public int Order { get; init; } = 100;
|
||||
public bool IsProduction { get; init; } = false;
|
||||
public string? Description { get; init; }
|
||||
public string? NextEnvironment { get; init; }
|
||||
public ImmutableDictionary<string, string> Labels { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to update an environment.
|
||||
/// </summary>
|
||||
public sealed record UpdateEnvironmentRequest
|
||||
{
|
||||
public string? DisplayName { get; init; }
|
||||
public int? Order { get; init; }
|
||||
public bool? IsProduction { get; init; }
|
||||
public string? Description { get; init; }
|
||||
public string? NextEnvironment { get; init; }
|
||||
public ImmutableDictionary<string, string>? Labels { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Environment health DTO.
|
||||
/// </summary>
|
||||
public sealed record EnvironmentHealthDto
|
||||
{
|
||||
public required string Environment { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required int HealthyComponents { get; init; }
|
||||
public required int TotalComponents { get; init; }
|
||||
public double HealthPercentage => TotalComponents > 0
|
||||
? (double)HealthyComponents / TotalComponents * 100
|
||||
: 0;
|
||||
public required IReadOnlyList<ComponentHealthDto> Components { get; init; }
|
||||
public required DateTimeOffset CheckedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Component health DTO.
|
||||
/// </summary>
|
||||
public sealed record ComponentHealthDto
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public string? Version { get; init; }
|
||||
public string? Message { get; init; }
|
||||
public DateTimeOffset? LastHeartbeat { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Response for listing deployments.
|
||||
/// </summary>
|
||||
public sealed record ListDeploymentsResponse
|
||||
{
|
||||
public required IReadOnlyList<DeploymentDto> Deployments { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deployment DTO.
|
||||
/// </summary>
|
||||
public sealed record DeploymentDto
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string ArtifactDigest { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset DeployedAt { get; init; }
|
||||
public string? DeployedBy { get; init; }
|
||||
public Guid? ReleaseId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Promotion path DTO.
|
||||
/// </summary>
|
||||
public sealed record PromotionPathDto
|
||||
{
|
||||
public required string CurrentEnvironment { get; init; }
|
||||
public required IReadOnlyList<string> PrecedingEnvironments { get; init; }
|
||||
public required IReadOnlyList<string> FollowingEnvironments { get; init; }
|
||||
public required IReadOnlyList<PromotionStepDto> Steps { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Promotion step DTO.
|
||||
/// </summary>
|
||||
public sealed record PromotionStepDto
|
||||
{
|
||||
public required string FromEnvironment { get; init; }
|
||||
public required string ToEnvironment { get; init; }
|
||||
public required bool RequiresApproval { get; init; }
|
||||
public required IReadOnlyList<string> RequiredGates { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to lock an environment.
|
||||
/// </summary>
|
||||
public sealed record LockEnvironmentRequest
|
||||
{
|
||||
public required string Reason { get; init; }
|
||||
public DateTimeOffset? ExpiresAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Environment lock DTO.
|
||||
/// </summary>
|
||||
public sealed record EnvironmentLockDto
|
||||
{
|
||||
public required Guid LockId { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required string LockedBy { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
public required DateTimeOffset LockedAt { get; init; }
|
||||
public DateTimeOffset? ExpiresAt { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces
|
||||
|
||||
/// <summary>
|
||||
/// Interface for environment service.
|
||||
/// </summary>
|
||||
public interface IEnvironmentService
|
||||
{
|
||||
Task<IReadOnlyList<EnvironmentDto>> ListEnvironmentsAsync(CancellationToken ct);
|
||||
Task<EnvironmentDto?> GetEnvironmentAsync(string name, CancellationToken ct);
|
||||
Task<EnvironmentDto> CreateEnvironmentAsync(CreateEnvironmentRequest request, CancellationToken ct);
|
||||
Task<EnvironmentDto> UpdateEnvironmentAsync(string name, UpdateEnvironmentRequest request, CancellationToken ct);
|
||||
Task DeleteEnvironmentAsync(string name, CancellationToken ct);
|
||||
Task<EnvironmentHealthDto?> GetEnvironmentHealthAsync(string name, CancellationToken ct);
|
||||
Task<IReadOnlyList<DeploymentDto>?> GetDeploymentsAsync(string name, CancellationToken ct);
|
||||
Task<PromotionPathDto?> GetPromotionPathAsync(string name, CancellationToken ct);
|
||||
Task<EnvironmentLockDto> LockEnvironmentAsync(string name, string reason, DateTimeOffset? expiresAt, CancellationToken ct);
|
||||
Task UnlockEnvironmentAsync(string name, CancellationToken ct);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Exceptions
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when an environment is not found.
|
||||
/// </summary>
|
||||
public class EnvironmentNotFoundException : Exception
|
||||
{
|
||||
public EnvironmentNotFoundException(string name) : base($"Environment '{name}' not found") { }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when an environment already exists.
|
||||
/// </summary>
|
||||
public class EnvironmentAlreadyExistsException : Exception
|
||||
{
|
||||
public EnvironmentAlreadyExistsException(string name) : base($"Environment '{name}' already exists") { }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when an environment is in use.
|
||||
/// </summary>
|
||||
public class EnvironmentInUseException : Exception
|
||||
{
|
||||
public EnvironmentInUseException(string name) : base($"Environment '{name}' is in use") { }
|
||||
}
|
||||
|
||||
#endregion
|
||||
422
src/Api/StellaOps.Api/Controllers/GatesController.cs
Normal file
422
src/Api/StellaOps.Api/Controllers/GatesController.cs
Normal file
@@ -0,0 +1,422 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// GatesController.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_observability
|
||||
// Task: API-002 - Gate Management API Endpoints
|
||||
// Description: API endpoints for gate evaluation and management
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
|
||||
namespace StellaOps.Api.Controllers;
|
||||
|
||||
/// <summary>
|
||||
/// Controller for gate management endpoints.
|
||||
/// </summary>
|
||||
[ApiController]
|
||||
[Route("v1/gates")]
|
||||
[Authorize]
|
||||
public class GatesController : ControllerBase
|
||||
{
|
||||
private readonly IGateService _gateService;
|
||||
private readonly IGateEvaluator _gateEvaluator;
|
||||
private readonly ILogger<GatesController> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="GatesController"/> class.
|
||||
/// </summary>
|
||||
public GatesController(
|
||||
IGateService gateService,
|
||||
IGateEvaluator gateEvaluator,
|
||||
ILogger<GatesController> logger)
|
||||
{
|
||||
_gateService = gateService;
|
||||
_gateEvaluator = gateEvaluator;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists all configured gates.
|
||||
/// </summary>
|
||||
/// <param name="environment">Filter by environment.</param>
|
||||
/// <param name="gateType">Filter by gate type.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>List of gates.</returns>
|
||||
[HttpGet]
|
||||
[ProducesResponseType(typeof(ListGatesResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> ListGates(
|
||||
[FromQuery] string? environment,
|
||||
[FromQuery] string? gateType,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Listing gates: environment={Environment}, type={GateType}",
|
||||
environment, gateType);
|
||||
|
||||
var gates = await _gateService.ListGatesAsync(environment, gateType, ct);
|
||||
|
||||
return Ok(new ListGatesResponse { Gates = gates });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a specific gate by ID.
|
||||
/// </summary>
|
||||
/// <param name="gateId">The gate ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The gate details.</returns>
|
||||
[HttpGet("{gateId:guid}")]
|
||||
[ProducesResponseType(typeof(GateDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetGate(
|
||||
[FromRoute] Guid gateId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var gate = await _gateService.GetGateAsync(gateId, ct);
|
||||
|
||||
if (gate is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Gate not found",
|
||||
Detail = $"Gate {gateId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(gate);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new gate.
|
||||
/// </summary>
|
||||
/// <param name="request">The gate creation request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The created gate.</returns>
|
||||
[HttpPost]
|
||||
[ProducesResponseType(typeof(GateDto), StatusCodes.Status201Created)]
|
||||
[ProducesResponseType(typeof(ValidationProblemDetails), StatusCodes.Status400BadRequest)]
|
||||
public async Task<IActionResult> CreateGate(
|
||||
[FromBody] CreateGateRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Creating gate {Name} of type {GateType}",
|
||||
request.Name, request.GateType);
|
||||
|
||||
var gate = await _gateService.CreateGateAsync(request, ct);
|
||||
|
||||
return CreatedAtAction(
|
||||
nameof(GetGate),
|
||||
new { gateId = gate.Id },
|
||||
gate);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates an existing gate.
|
||||
/// </summary>
|
||||
/// <param name="gateId">The gate ID.</param>
|
||||
/// <param name="request">The gate update request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The updated gate.</returns>
|
||||
[HttpPut("{gateId:guid}")]
|
||||
[ProducesResponseType(typeof(GateDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> UpdateGate(
|
||||
[FromRoute] Guid gateId,
|
||||
[FromBody] UpdateGateRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation("Updating gate {GateId}", gateId);
|
||||
|
||||
try
|
||||
{
|
||||
var gate = await _gateService.UpdateGateAsync(gateId, request, ct);
|
||||
return Ok(gate);
|
||||
}
|
||||
catch (GateNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Gate not found",
|
||||
Detail = $"Gate {gateId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deletes a gate.
|
||||
/// </summary>
|
||||
/// <param name="gateId">The gate ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>No content on success.</returns>
|
||||
[HttpDelete("{gateId:guid}")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> DeleteGate(
|
||||
[FromRoute] Guid gateId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogWarning("Deleting gate {GateId}", gateId);
|
||||
|
||||
try
|
||||
{
|
||||
await _gateService.DeleteGateAsync(gateId, ct);
|
||||
return NoContent();
|
||||
}
|
||||
catch (GateNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Gate not found",
|
||||
Detail = $"Gate {gateId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates gates for a release.
|
||||
/// </summary>
|
||||
/// <param name="request">The evaluation request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The evaluation results.</returns>
|
||||
[HttpPost("evaluate")]
|
||||
[ProducesResponseType(typeof(GateEvaluationResponse), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ValidationProblemDetails), StatusCodes.Status400BadRequest)]
|
||||
public async Task<IActionResult> EvaluateGates(
|
||||
[FromBody] EvaluateGatesRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Evaluating gates for release {ReleaseId} to {Environment}",
|
||||
request.ReleaseId, request.TargetEnvironment);
|
||||
|
||||
var result = await _gateEvaluator.EvaluateAsync(
|
||||
request.ReleaseId,
|
||||
request.TargetEnvironment,
|
||||
request.ArtifactDigest,
|
||||
ct);
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the evaluation history for a release.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The evaluation history.</returns>
|
||||
[HttpGet("evaluations/{releaseId:guid}")]
|
||||
[ProducesResponseType(typeof(GateEvaluationHistoryResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetEvaluationHistory(
|
||||
[FromRoute] Guid releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var history = await _gateService.GetEvaluationHistoryAsync(releaseId, ct);
|
||||
|
||||
return Ok(new GateEvaluationHistoryResponse
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Evaluations = history
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Overrides a gate evaluation (requires elevated permissions).
|
||||
/// </summary>
|
||||
/// <param name="gateId">The gate ID.</param>
|
||||
/// <param name="request">The override request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The override result.</returns>
|
||||
[HttpPost("{gateId:guid}/override")]
|
||||
[Authorize(Policy = "GateOverride")]
|
||||
[ProducesResponseType(typeof(GateOverrideResult), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status403Forbidden)]
|
||||
public async Task<IActionResult> OverrideGate(
|
||||
[FromRoute] Guid gateId,
|
||||
[FromBody] GateOverrideRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Overriding gate {GateId} for release {ReleaseId}, reason: {Reason}",
|
||||
gateId, request.ReleaseId, request.Reason);
|
||||
|
||||
var result = await _gateService.OverrideGateAsync(
|
||||
gateId,
|
||||
request.ReleaseId,
|
||||
request.Reason,
|
||||
request.ExpiresAt,
|
||||
ct);
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
}
|
||||
|
||||
#region Request/Response DTOs
|
||||
|
||||
/// <summary>
|
||||
/// Response for listing gates.
|
||||
/// </summary>
|
||||
public sealed record ListGatesResponse
|
||||
{
|
||||
public required IReadOnlyList<GateDto> Gates { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gate data transfer object.
|
||||
/// </summary>
|
||||
public sealed record GateDto
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string Name { get; init; }
|
||||
public required string GateType { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required bool IsEnabled { get; init; }
|
||||
public required bool IsBlocking { get; init; }
|
||||
public int Order { get; init; }
|
||||
public string? Description { get; init; }
|
||||
public ImmutableDictionary<string, object> Configuration { get; init; } =
|
||||
ImmutableDictionary<string, object>.Empty;
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset? UpdatedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to create a gate.
|
||||
/// </summary>
|
||||
public sealed record CreateGateRequest
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string GateType { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public bool IsBlocking { get; init; } = true;
|
||||
public int Order { get; init; } = 100;
|
||||
public string? Description { get; init; }
|
||||
public ImmutableDictionary<string, object> Configuration { get; init; } =
|
||||
ImmutableDictionary<string, object>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to update a gate.
|
||||
/// </summary>
|
||||
public sealed record UpdateGateRequest
|
||||
{
|
||||
public string? Name { get; init; }
|
||||
public bool? IsEnabled { get; init; }
|
||||
public bool? IsBlocking { get; init; }
|
||||
public int? Order { get; init; }
|
||||
public string? Description { get; init; }
|
||||
public ImmutableDictionary<string, object>? Configuration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to evaluate gates.
|
||||
/// </summary>
|
||||
public sealed record EvaluateGatesRequest
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string ArtifactDigest { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Response for gate evaluation.
|
||||
/// </summary>
|
||||
public sealed record GateEvaluationResponse
|
||||
{
|
||||
public required Guid EvaluationId { get; init; }
|
||||
public required bool AllPassed { get; init; }
|
||||
public required IReadOnlyList<GateEvaluationResultDto> Results { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
public TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a single gate evaluation.
|
||||
/// </summary>
|
||||
public sealed record GateEvaluationResultDto
|
||||
{
|
||||
public required Guid GateId { get; init; }
|
||||
public required string GateName { get; init; }
|
||||
public required string GateType { get; init; }
|
||||
public required bool Passed { get; init; }
|
||||
public required bool IsBlocking { get; init; }
|
||||
public string? Message { get; init; }
|
||||
public ImmutableDictionary<string, object> Details { get; init; } =
|
||||
ImmutableDictionary<string, object>.Empty;
|
||||
public TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Response for gate evaluation history.
|
||||
/// </summary>
|
||||
public sealed record GateEvaluationHistoryResponse
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required IReadOnlyList<GateEvaluationResponse> Evaluations { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to override a gate.
|
||||
/// </summary>
|
||||
public sealed record GateOverrideRequest
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
public DateTimeOffset? ExpiresAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of gate override.
|
||||
/// </summary>
|
||||
public sealed record GateOverrideResult
|
||||
{
|
||||
public required Guid OverrideId { get; init; }
|
||||
public required Guid GateId { get; init; }
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required string OverriddenBy { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset? ExpiresAt { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces
|
||||
|
||||
/// <summary>
|
||||
/// Interface for gate service.
|
||||
/// </summary>
|
||||
public interface IGateService
|
||||
{
|
||||
Task<IReadOnlyList<GateDto>> ListGatesAsync(string? environment, string? gateType, CancellationToken ct);
|
||||
Task<GateDto?> GetGateAsync(Guid gateId, CancellationToken ct);
|
||||
Task<GateDto> CreateGateAsync(CreateGateRequest request, CancellationToken ct);
|
||||
Task<GateDto> UpdateGateAsync(Guid gateId, UpdateGateRequest request, CancellationToken ct);
|
||||
Task DeleteGateAsync(Guid gateId, CancellationToken ct);
|
||||
Task<IReadOnlyList<GateEvaluationResponse>> GetEvaluationHistoryAsync(Guid releaseId, CancellationToken ct);
|
||||
Task<GateOverrideResult> OverrideGateAsync(Guid gateId, Guid releaseId, string reason, DateTimeOffset? expiresAt, CancellationToken ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for gate evaluator.
|
||||
/// </summary>
|
||||
public interface IGateEvaluator
|
||||
{
|
||||
Task<GateEvaluationResponse> EvaluateAsync(Guid releaseId, string targetEnvironment, string artifactDigest, CancellationToken ct);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Exceptions
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when a gate is not found.
|
||||
/// </summary>
|
||||
public class GateNotFoundException : Exception
|
||||
{
|
||||
public GateNotFoundException(Guid gateId) : base($"Gate {gateId} not found") { }
|
||||
}
|
||||
|
||||
#endregion
|
||||
484
src/Api/StellaOps.Api/Controllers/ObservabilityController.cs
Normal file
484
src/Api/StellaOps.Api/Controllers/ObservabilityController.cs
Normal file
@@ -0,0 +1,484 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ObservabilityController.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_observability
|
||||
// Task: API-004 - Observability API Endpoints
|
||||
// Description: API endpoints for metrics, traces, and health monitoring
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
|
||||
namespace StellaOps.Api.Controllers;
|
||||
|
||||
/// <summary>
|
||||
/// Controller for observability and monitoring endpoints.
|
||||
/// </summary>
|
||||
[ApiController]
|
||||
[Route("v1/observability")]
|
||||
[Authorize]
|
||||
public class ObservabilityController : ControllerBase
|
||||
{
|
||||
private readonly IObservabilityService _observabilityService;
|
||||
private readonly IHealthService _healthService;
|
||||
private readonly ILogger<ObservabilityController> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="ObservabilityController"/> class.
|
||||
/// </summary>
|
||||
public ObservabilityController(
|
||||
IObservabilityService observabilityService,
|
||||
IHealthService healthService,
|
||||
ILogger<ObservabilityController> logger)
|
||||
{
|
||||
_observabilityService = observabilityService;
|
||||
_healthService = healthService;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets system health status.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The system health.</returns>
|
||||
[HttpGet("health")]
|
||||
[AllowAnonymous]
|
||||
[ProducesResponseType(typeof(SystemHealthResponse), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(SystemHealthResponse), StatusCodes.Status503ServiceUnavailable)]
|
||||
public async Task<IActionResult> GetSystemHealth(CancellationToken ct)
|
||||
{
|
||||
var health = await _healthService.GetSystemHealthAsync(ct);
|
||||
|
||||
var statusCode = health.Status == "Healthy"
|
||||
? StatusCodes.Status200OK
|
||||
: StatusCodes.Status503ServiceUnavailable;
|
||||
|
||||
return StatusCode(statusCode, health);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets liveness probe status.
|
||||
/// </summary>
|
||||
/// <returns>OK if alive.</returns>
|
||||
[HttpGet("health/live")]
|
||||
[AllowAnonymous]
|
||||
[ProducesResponseType(StatusCodes.Status200OK)]
|
||||
public IActionResult GetLiveness()
|
||||
{
|
||||
return Ok(new { status = "alive", timestamp = DateTimeOffset.UtcNow });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets readiness probe status.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>OK if ready to serve traffic.</returns>
|
||||
[HttpGet("health/ready")]
|
||||
[AllowAnonymous]
|
||||
[ProducesResponseType(StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(StatusCodes.Status503ServiceUnavailable)]
|
||||
public async Task<IActionResult> GetReadiness(CancellationToken ct)
|
||||
{
|
||||
var ready = await _healthService.IsReadyAsync(ct);
|
||||
|
||||
if (ready)
|
||||
{
|
||||
return Ok(new { status = "ready", timestamp = DateTimeOffset.UtcNow });
|
||||
}
|
||||
|
||||
return StatusCode(StatusCodes.Status503ServiceUnavailable,
|
||||
new { status = "not_ready", timestamp = DateTimeOffset.UtcNow });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets metrics in Prometheus format.
|
||||
/// </summary>
|
||||
/// <returns>Prometheus-formatted metrics.</returns>
|
||||
[HttpGet("metrics")]
|
||||
[AllowAnonymous]
|
||||
[Produces("text/plain")]
|
||||
[ProducesResponseType(typeof(string), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetMetrics(CancellationToken ct)
|
||||
{
|
||||
var metrics = await _observabilityService.GetPrometheusMetricsAsync(ct);
|
||||
return Content(metrics, "text/plain; version=0.0.4; charset=utf-8");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets custom metrics for a specific domain.
|
||||
/// </summary>
|
||||
/// <param name="domain">The metrics domain (releases, gates, health).</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Domain metrics.</returns>
|
||||
[HttpGet("metrics/{domain}")]
|
||||
[ProducesResponseType(typeof(DomainMetricsResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetDomainMetrics(
|
||||
[FromRoute] string domain,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var metrics = await _observabilityService.GetDomainMetricsAsync(domain, ct);
|
||||
return Ok(metrics);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a trace by ID.
|
||||
/// </summary>
|
||||
/// <param name="traceId">The trace ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The trace details.</returns>
|
||||
[HttpGet("traces/{traceId}")]
|
||||
[ProducesResponseType(typeof(TraceDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetTrace(
|
||||
[FromRoute] string traceId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var trace = await _observabilityService.GetTraceAsync(traceId, ct);
|
||||
|
||||
if (trace is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Trace not found",
|
||||
Detail = $"Trace {traceId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(trace);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Searches traces.
|
||||
/// </summary>
|
||||
/// <param name="request">The search request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matching traces.</returns>
|
||||
[HttpPost("traces/search")]
|
||||
[ProducesResponseType(typeof(TraceSearchResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> SearchTraces(
|
||||
[FromBody] TraceSearchRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var results = await _observabilityService.SearchTracesAsync(request, ct);
|
||||
return Ok(results);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets logs with optional filtering.
|
||||
/// </summary>
|
||||
/// <param name="level">Minimum log level.</param>
|
||||
/// <param name="correlationId">Filter by correlation ID.</param>
|
||||
/// <param name="startTime">Start time filter.</param>
|
||||
/// <param name="endTime">End time filter.</param>
|
||||
/// <param name="limit">Maximum results (default 100).</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matching log entries.</returns>
|
||||
[HttpGet("logs")]
|
||||
[ProducesResponseType(typeof(LogSearchResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetLogs(
|
||||
[FromQuery] string? level,
|
||||
[FromQuery] string? correlationId,
|
||||
[FromQuery] DateTimeOffset? startTime,
|
||||
[FromQuery] DateTimeOffset? endTime,
|
||||
[FromQuery] int limit = 100,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var request = new LogSearchRequest
|
||||
{
|
||||
Level = level,
|
||||
CorrelationId = correlationId,
|
||||
StartTime = startTime,
|
||||
EndTime = endTime,
|
||||
Limit = Math.Clamp(limit, 1, 1000)
|
||||
};
|
||||
|
||||
var results = await _observabilityService.SearchLogsAsync(request, ct);
|
||||
return Ok(results);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets observability statistics.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Observability stats.</returns>
|
||||
[HttpGet("stats")]
|
||||
[ProducesResponseType(typeof(ObservabilityStatsResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetStats(CancellationToken ct)
|
||||
{
|
||||
var stats = await _observabilityService.GetStatsAsync(ct);
|
||||
return Ok(stats);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets release metrics summary.
|
||||
/// </summary>
|
||||
/// <param name="environment">Filter by environment.</param>
|
||||
/// <param name="period">Time period (1h, 24h, 7d, 30d).</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Release metrics summary.</returns>
|
||||
[HttpGet("releases/metrics")]
|
||||
[ProducesResponseType(typeof(ReleaseMetricsSummary), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetReleaseMetrics(
|
||||
[FromQuery] string? environment,
|
||||
[FromQuery] string period = "24h",
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var metrics = await _observabilityService.GetReleaseMetricsAsync(environment, period, ct);
|
||||
return Ok(metrics);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets SLA status.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>SLA status.</returns>
|
||||
[HttpGet("sla")]
|
||||
[ProducesResponseType(typeof(SlaStatusResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> GetSlaStatus(CancellationToken ct)
|
||||
{
|
||||
var status = await _observabilityService.GetSlaStatusAsync(ct);
|
||||
return Ok(status);
|
||||
}
|
||||
}
|
||||
|
||||
#region Request/Response DTOs
|
||||
|
||||
/// <summary>
|
||||
/// System health response.
|
||||
/// </summary>
|
||||
public sealed record SystemHealthResponse
|
||||
{
|
||||
public required string Status { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required TimeSpan Uptime { get; init; }
|
||||
public required IReadOnlyList<HealthCheckResult> Checks { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Health check result.
|
||||
/// </summary>
|
||||
public sealed record HealthCheckResult
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public string? Description { get; init; }
|
||||
public TimeSpan Duration { get; init; }
|
||||
public ImmutableDictionary<string, object> Data { get; init; } =
|
||||
ImmutableDictionary<string, object>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Domain metrics response.
|
||||
/// </summary>
|
||||
public sealed record DomainMetricsResponse
|
||||
{
|
||||
public required string Domain { get; init; }
|
||||
public required IReadOnlyList<MetricDto> Metrics { get; init; }
|
||||
public required DateTimeOffset GeneratedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Metric DTO.
|
||||
/// </summary>
|
||||
public sealed record MetricDto
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string Type { get; init; }
|
||||
public required double Value { get; init; }
|
||||
public string? Unit { get; init; }
|
||||
public ImmutableDictionary<string, string> Labels { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Trace DTO.
|
||||
/// </summary>
|
||||
public sealed record TraceDto
|
||||
{
|
||||
public required string TraceId { get; init; }
|
||||
public required string RootOperation { get; init; }
|
||||
public required DateTimeOffset StartTime { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public required int SpanCount { get; init; }
|
||||
public required int ServiceCount { get; init; }
|
||||
public required bool HasErrors { get; init; }
|
||||
public required IReadOnlyList<SpanDto> Spans { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Span DTO.
|
||||
/// </summary>
|
||||
public sealed record SpanDto
|
||||
{
|
||||
public required string SpanId { get; init; }
|
||||
public string? ParentSpanId { get; init; }
|
||||
public required string OperationName { get; init; }
|
||||
public required string ServiceName { get; init; }
|
||||
public required DateTimeOffset StartTime { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public ImmutableDictionary<string, string> Attributes { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Trace search request.
|
||||
/// </summary>
|
||||
public sealed record TraceSearchRequest
|
||||
{
|
||||
public string? ServiceName { get; init; }
|
||||
public string? OperationName { get; init; }
|
||||
public DateTimeOffset? StartTime { get; init; }
|
||||
public DateTimeOffset? EndTime { get; init; }
|
||||
public TimeSpan? MinDuration { get; init; }
|
||||
public bool? HasErrors { get; init; }
|
||||
public ImmutableDictionary<string, string> Tags { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
public int Limit { get; init; } = 20;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Trace search response.
|
||||
/// </summary>
|
||||
public sealed record TraceSearchResponse
|
||||
{
|
||||
public required IReadOnlyList<TraceDto> Traces { get; init; }
|
||||
public required int TotalCount { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Log search request.
|
||||
/// </summary>
|
||||
public sealed record LogSearchRequest
|
||||
{
|
||||
public string? Level { get; init; }
|
||||
public string? CorrelationId { get; init; }
|
||||
public string? TraceId { get; init; }
|
||||
public string? Message { get; init; }
|
||||
public DateTimeOffset? StartTime { get; init; }
|
||||
public DateTimeOffset? EndTime { get; init; }
|
||||
public int Limit { get; init; } = 100;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Log search response.
|
||||
/// </summary>
|
||||
public sealed record LogSearchResponse
|
||||
{
|
||||
public required IReadOnlyList<LogEntryDto> Entries { get; init; }
|
||||
public required int TotalCount { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Log entry DTO.
|
||||
/// </summary>
|
||||
public sealed record LogEntryDto
|
||||
{
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required string Level { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public string? CorrelationId { get; init; }
|
||||
public string? TraceId { get; init; }
|
||||
public string? Source { get; init; }
|
||||
public ImmutableDictionary<string, object> Properties { get; init; } =
|
||||
ImmutableDictionary<string, object>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Observability stats response.
|
||||
/// </summary>
|
||||
public sealed record ObservabilityStatsResponse
|
||||
{
|
||||
public required int MetricsBuffered { get; init; }
|
||||
public required int TracesBuffered { get; init; }
|
||||
public required int LogsBuffered { get; init; }
|
||||
public required long DroppedMetrics { get; init; }
|
||||
public required long DroppedTraces { get; init; }
|
||||
public required long DroppedLogs { get; init; }
|
||||
public required int RegisteredMetrics { get; init; }
|
||||
public required DateTimeOffset GeneratedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Release metrics summary.
|
||||
/// </summary>
|
||||
public sealed record ReleaseMetricsSummary
|
||||
{
|
||||
public required int TotalReleases { get; init; }
|
||||
public required int SuccessfulReleases { get; init; }
|
||||
public required int FailedReleases { get; init; }
|
||||
public required int RollbackCount { get; init; }
|
||||
public required double SuccessRate { get; init; }
|
||||
public required TimeSpan AverageReleaseTime { get; init; }
|
||||
public required TimeSpan P95ReleaseTime { get; init; }
|
||||
public required string Period { get; init; }
|
||||
public required IReadOnlyList<EnvironmentReleaseMetrics> ByEnvironment { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Release metrics by environment.
|
||||
/// </summary>
|
||||
public sealed record EnvironmentReleaseMetrics
|
||||
{
|
||||
public required string Environment { get; init; }
|
||||
public required int TotalReleases { get; init; }
|
||||
public required int SuccessfulReleases { get; init; }
|
||||
public required double SuccessRate { get; init; }
|
||||
public required TimeSpan AverageReleaseTime { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SLA status response.
|
||||
/// </summary>
|
||||
public sealed record SlaStatusResponse
|
||||
{
|
||||
public required double CurrentSuccessRate { get; init; }
|
||||
public required double TargetSuccessRate { get; init; }
|
||||
public required double ErrorBudgetRemaining { get; init; }
|
||||
public required int SlaBreaches { get; init; }
|
||||
public required string Period { get; init; }
|
||||
public required IReadOnlyList<SlaMetric> Metrics { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SLA metric.
|
||||
/// </summary>
|
||||
public sealed record SlaMetric
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required double CurrentValue { get; init; }
|
||||
public required double TargetValue { get; init; }
|
||||
public required bool IsMet { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces
|
||||
|
||||
/// <summary>
|
||||
/// Interface for observability service.
|
||||
/// </summary>
|
||||
public interface IObservabilityService
|
||||
{
|
||||
Task<string> GetPrometheusMetricsAsync(CancellationToken ct);
|
||||
Task<DomainMetricsResponse> GetDomainMetricsAsync(string domain, CancellationToken ct);
|
||||
Task<TraceDto?> GetTraceAsync(string traceId, CancellationToken ct);
|
||||
Task<TraceSearchResponse> SearchTracesAsync(TraceSearchRequest request, CancellationToken ct);
|
||||
Task<LogSearchResponse> SearchLogsAsync(LogSearchRequest request, CancellationToken ct);
|
||||
Task<ObservabilityStatsResponse> GetStatsAsync(CancellationToken ct);
|
||||
Task<ReleaseMetricsSummary> GetReleaseMetricsAsync(string? environment, string period, CancellationToken ct);
|
||||
Task<SlaStatusResponse> GetSlaStatusAsync(CancellationToken ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for health service.
|
||||
/// </summary>
|
||||
public interface IHealthService
|
||||
{
|
||||
Task<SystemHealthResponse> GetSystemHealthAsync(CancellationToken ct);
|
||||
Task<bool> IsReadyAsync(CancellationToken ct);
|
||||
}
|
||||
|
||||
#endregion
|
||||
501
src/Api/StellaOps.Api/Controllers/ReleasesController.cs
Normal file
501
src/Api/StellaOps.Api/Controllers/ReleasesController.cs
Normal file
@@ -0,0 +1,501 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ReleasesController.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_observability
|
||||
// Task: API-001 - Release Management API Endpoints
|
||||
// Description: API endpoints for release management operations
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
|
||||
namespace StellaOps.Api.Controllers;
|
||||
|
||||
/// <summary>
|
||||
/// Controller for release management endpoints.
|
||||
/// </summary>
|
||||
[ApiController]
|
||||
[Route("v1/releases")]
|
||||
[Authorize]
|
||||
public class ReleasesController : ControllerBase
|
||||
{
|
||||
private readonly IReleaseService _releaseService;
|
||||
private readonly IReleaseStateStore _stateStore;
|
||||
private readonly ILogger<ReleasesController> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="ReleasesController"/> class.
|
||||
/// </summary>
|
||||
public ReleasesController(
|
||||
IReleaseService releaseService,
|
||||
IReleaseStateStore stateStore,
|
||||
ILogger<ReleasesController> logger)
|
||||
{
|
||||
_releaseService = releaseService;
|
||||
_stateStore = stateStore;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists all releases with optional filtering.
|
||||
/// </summary>
|
||||
/// <param name="environment">Filter by environment.</param>
|
||||
/// <param name="status">Filter by status.</param>
|
||||
/// <param name="pageSize">Page size (default 20).</param>
|
||||
/// <param name="pageToken">Page token for pagination.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>List of releases.</returns>
|
||||
[HttpGet]
|
||||
[ProducesResponseType(typeof(ListReleasesResponse), StatusCodes.Status200OK)]
|
||||
public async Task<IActionResult> ListReleases(
|
||||
[FromQuery] string? environment,
|
||||
[FromQuery] string? status,
|
||||
[FromQuery] int pageSize = 20,
|
||||
[FromQuery] string? pageToken = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Listing releases: environment={Environment}, status={Status}",
|
||||
environment, status);
|
||||
|
||||
var filter = new ReleaseFilter
|
||||
{
|
||||
Environment = environment,
|
||||
Status = status,
|
||||
PageSize = Math.Clamp(pageSize, 1, 100),
|
||||
PageToken = pageToken
|
||||
};
|
||||
|
||||
var result = await _releaseService.ListReleasesAsync(filter, ct);
|
||||
|
||||
return Ok(new ListReleasesResponse
|
||||
{
|
||||
Releases = result.Releases,
|
||||
NextPageToken = result.NextPageToken,
|
||||
TotalCount = result.TotalCount
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a specific release by ID.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The release details.</returns>
|
||||
[HttpGet("{releaseId:guid}")]
|
||||
[ProducesResponseType(typeof(ReleaseDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetRelease(
|
||||
[FromRoute] Guid releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogDebug("Getting release {ReleaseId}", releaseId);
|
||||
|
||||
var release = await _releaseService.GetReleaseAsync(releaseId, ct);
|
||||
|
||||
if (release is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Release not found",
|
||||
Detail = $"Release {releaseId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(release);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new release.
|
||||
/// </summary>
|
||||
/// <param name="request">The release creation request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The created release.</returns>
|
||||
[HttpPost]
|
||||
[ProducesResponseType(typeof(ReleaseDto), StatusCodes.Status201Created)]
|
||||
[ProducesResponseType(typeof(ValidationProblemDetails), StatusCodes.Status400BadRequest)]
|
||||
public async Task<IActionResult> CreateRelease(
|
||||
[FromBody] CreateReleaseRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Creating release for artifact {ArtifactDigest} to {Environment}",
|
||||
request.ArtifactDigest, request.TargetEnvironment);
|
||||
|
||||
var release = await _releaseService.CreateReleaseAsync(request, ct);
|
||||
|
||||
return CreatedAtAction(
|
||||
nameof(GetRelease),
|
||||
new { releaseId = release.Id },
|
||||
release);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Promotes a release to the next environment.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="request">The promotion request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The updated release.</returns>
|
||||
[HttpPost("{releaseId:guid}/promote")]
|
||||
[ProducesResponseType(typeof(ReleaseDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status409Conflict)]
|
||||
public async Task<IActionResult> PromoteRelease(
|
||||
[FromRoute] Guid releaseId,
|
||||
[FromBody] PromoteReleaseRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Promoting release {ReleaseId} to {Environment}",
|
||||
releaseId, request.TargetEnvironment);
|
||||
|
||||
try
|
||||
{
|
||||
var release = await _releaseService.PromoteReleaseAsync(
|
||||
releaseId,
|
||||
request.TargetEnvironment,
|
||||
request.ApprovalId,
|
||||
ct);
|
||||
|
||||
return Ok(release);
|
||||
}
|
||||
catch (ReleaseNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Release not found",
|
||||
Detail = $"Release {releaseId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
catch (ReleaseStateConflictException ex)
|
||||
{
|
||||
return Conflict(new ProblemDetails
|
||||
{
|
||||
Title = "Promotion conflict",
|
||||
Detail = ex.Message,
|
||||
Status = StatusCodes.Status409Conflict
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rolls back a release.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="request">The rollback request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The rollback result.</returns>
|
||||
[HttpPost("{releaseId:guid}/rollback")]
|
||||
[ProducesResponseType(typeof(RollbackResult), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> RollbackRelease(
|
||||
[FromRoute] Guid releaseId,
|
||||
[FromBody] RollbackReleaseRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Rolling back release {ReleaseId}, reason: {Reason}",
|
||||
releaseId, request.Reason);
|
||||
|
||||
try
|
||||
{
|
||||
var result = await _releaseService.RollbackReleaseAsync(
|
||||
releaseId,
|
||||
request.Reason,
|
||||
request.TargetVersion,
|
||||
ct);
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
catch (ReleaseNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Release not found",
|
||||
Detail = $"Release {releaseId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cancels a pending release.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="request">The cancellation request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>No content on success.</returns>
|
||||
[HttpPost("{releaseId:guid}/cancel")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status409Conflict)]
|
||||
public async Task<IActionResult> CancelRelease(
|
||||
[FromRoute] Guid releaseId,
|
||||
[FromBody] CancelReleaseRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Cancelling release {ReleaseId}, reason: {Reason}",
|
||||
releaseId, request.Reason);
|
||||
|
||||
try
|
||||
{
|
||||
await _releaseService.CancelReleaseAsync(releaseId, request.Reason, ct);
|
||||
return NoContent();
|
||||
}
|
||||
catch (ReleaseNotFoundException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Release not found",
|
||||
Detail = $"Release {releaseId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
catch (ReleaseStateConflictException ex)
|
||||
{
|
||||
return Conflict(new ProblemDetails
|
||||
{
|
||||
Title = "Cannot cancel",
|
||||
Detail = ex.Message,
|
||||
Status = StatusCodes.Status409Conflict
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the state machine state for a release.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The release state.</returns>
|
||||
[HttpGet("{releaseId:guid}/state")]
|
||||
[ProducesResponseType(typeof(ReleaseStateDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetReleaseState(
|
||||
[FromRoute] Guid releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var state = await _stateStore.GetStateAsync(releaseId, ct);
|
||||
|
||||
if (state is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Release not found",
|
||||
Detail = $"Release {releaseId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(state);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the history of state transitions for a release.
|
||||
/// </summary>
|
||||
/// <param name="releaseId">The release ID.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The release history.</returns>
|
||||
[HttpGet("{releaseId:guid}/history")]
|
||||
[ProducesResponseType(typeof(ReleaseHistoryResponse), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(typeof(ProblemDetails), StatusCodes.Status404NotFound)]
|
||||
public async Task<IActionResult> GetReleaseHistory(
|
||||
[FromRoute] Guid releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var history = await _releaseService.GetReleaseHistoryAsync(releaseId, ct);
|
||||
|
||||
if (history is null)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Release not found",
|
||||
Detail = $"Release {releaseId} does not exist",
|
||||
Status = StatusCodes.Status404NotFound
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(new ReleaseHistoryResponse
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Events = history
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#region Request/Response DTOs
|
||||
|
||||
/// <summary>
|
||||
/// Filter for listing releases.
|
||||
/// </summary>
|
||||
public sealed record ReleaseFilter
|
||||
{
|
||||
public string? Environment { get; init; }
|
||||
public string? Status { get; init; }
|
||||
public int PageSize { get; init; } = 20;
|
||||
public string? PageToken { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Response for listing releases.
|
||||
/// </summary>
|
||||
public sealed record ListReleasesResponse
|
||||
{
|
||||
public required IReadOnlyList<ReleaseDto> Releases { get; init; }
|
||||
public string? NextPageToken { get; init; }
|
||||
public int TotalCount { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Release data transfer object.
|
||||
/// </summary>
|
||||
public sealed record ReleaseDto
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string ArtifactDigest { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset? CompletedAt { get; init; }
|
||||
public string? CreatedBy { get; init; }
|
||||
public ImmutableDictionary<string, string> Metadata { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to create a release.
|
||||
/// </summary>
|
||||
public sealed record CreateReleaseRequest
|
||||
{
|
||||
public required string ArtifactDigest { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public ImmutableDictionary<string, string> Metadata { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to promote a release.
|
||||
/// </summary>
|
||||
public sealed record PromoteReleaseRequest
|
||||
{
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public Guid? ApprovalId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to rollback a release.
|
||||
/// </summary>
|
||||
public sealed record RollbackReleaseRequest
|
||||
{
|
||||
public required string Reason { get; init; }
|
||||
public string? TargetVersion { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to cancel a release.
|
||||
/// </summary>
|
||||
public sealed record CancelReleaseRequest
|
||||
{
|
||||
public required string Reason { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a rollback operation.
|
||||
/// </summary>
|
||||
public sealed record RollbackResult
|
||||
{
|
||||
public required Guid RollbackId { get; init; }
|
||||
public required string PreviousVersion { get; init; }
|
||||
public required string RolledBackToVersion { get; init; }
|
||||
public required DateTimeOffset CompletedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Release state DTO.
|
||||
/// </summary>
|
||||
public sealed record ReleaseStateDto
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required string CurrentState { get; init; }
|
||||
public required IReadOnlyList<string> AvailableTransitions { get; init; }
|
||||
public DateTimeOffset? LastTransitionAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Release history response.
|
||||
/// </summary>
|
||||
public sealed record ReleaseHistoryResponse
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required IReadOnlyList<ReleaseHistoryEvent> Events { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A historical event in a release lifecycle.
|
||||
/// </summary>
|
||||
public sealed record ReleaseHistoryEvent
|
||||
{
|
||||
public required Guid EventId { get; init; }
|
||||
public required string EventType { get; init; }
|
||||
public required string FromState { get; init; }
|
||||
public required string ToState { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public string? Actor { get; init; }
|
||||
public string? Details { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces (for DI)
|
||||
|
||||
/// <summary>
|
||||
/// Interface for release service.
|
||||
/// </summary>
|
||||
public interface IReleaseService
|
||||
{
|
||||
Task<(IReadOnlyList<ReleaseDto> Releases, string? NextPageToken, int TotalCount)> ListReleasesAsync(
|
||||
ReleaseFilter filter, CancellationToken ct);
|
||||
Task<ReleaseDto?> GetReleaseAsync(Guid releaseId, CancellationToken ct);
|
||||
Task<ReleaseDto> CreateReleaseAsync(CreateReleaseRequest request, CancellationToken ct);
|
||||
Task<ReleaseDto> PromoteReleaseAsync(Guid releaseId, string targetEnvironment, Guid? approvalId, CancellationToken ct);
|
||||
Task<RollbackResult> RollbackReleaseAsync(Guid releaseId, string reason, string? targetVersion, CancellationToken ct);
|
||||
Task CancelReleaseAsync(Guid releaseId, string reason, CancellationToken ct);
|
||||
Task<IReadOnlyList<ReleaseHistoryEvent>?> GetReleaseHistoryAsync(Guid releaseId, CancellationToken ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for release state store.
|
||||
/// </summary>
|
||||
public interface IReleaseStateStore
|
||||
{
|
||||
Task<ReleaseStateDto?> GetStateAsync(Guid releaseId, CancellationToken ct);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Exceptions
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when a release is not found.
|
||||
/// </summary>
|
||||
public class ReleaseNotFoundException : Exception
|
||||
{
|
||||
public ReleaseNotFoundException(Guid releaseId)
|
||||
: base($"Release {releaseId} not found") { }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when a release state conflict occurs.
|
||||
/// </summary>
|
||||
public class ReleaseStateConflictException : Exception
|
||||
{
|
||||
public ReleaseStateConflictException(string message) : base(message) { }
|
||||
}
|
||||
|
||||
#endregion
|
||||
1061
src/Api/StellaOps.Api/Controllers/RemediationController.cs
Normal file
1061
src/Api/StellaOps.Api/Controllers/RemediationController.cs
Normal file
File diff suppressed because it is too large
Load Diff
1178
src/Api/StellaOps.Api/Controllers/WorkflowVisualizationController.cs
Normal file
1178
src/Api/StellaOps.Api/Controllers/WorkflowVisualizationController.cs
Normal file
File diff suppressed because it is too large
Load Diff
533
src/Api/StellaOps.Api/Hubs/RemediationHub.cs
Normal file
533
src/Api/StellaOps.Api/Hubs/RemediationHub.cs
Normal file
@@ -0,0 +1,533 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// RemediationHub.cs
|
||||
// Sprint: SPRINT_20260117_031_ReleaseOrchestrator_drift_remediation
|
||||
// Task: TASK-031-08 - WebSocket Events for Real-Time Remediation Updates
|
||||
// Description: SignalR hub for broadcasting remediation progress events
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.SignalR;
|
||||
|
||||
namespace StellaOps.Api.Hubs;
|
||||
|
||||
/// <summary>
|
||||
/// SignalR hub for real-time remediation updates.
|
||||
/// </summary>
|
||||
[Authorize]
|
||||
public class RemediationHub : Hub<IRemediationHubClient>
|
||||
{
|
||||
private static readonly ConcurrentDictionary<string, HashSet<string>> _planSubscriptions = new();
|
||||
private static readonly ConcurrentDictionary<string, HashSet<string>> _environmentSubscriptions = new();
|
||||
private readonly ILogger<RemediationHub> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="RemediationHub"/> class.
|
||||
/// </summary>
|
||||
public RemediationHub(ILogger<RemediationHub> logger)
|
||||
{
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Called when a client connects.
|
||||
/// </summary>
|
||||
public override async Task OnConnectedAsync()
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Client {ConnectionId} connected to RemediationHub",
|
||||
Context.ConnectionId);
|
||||
|
||||
await base.OnConnectedAsync();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Called when a client disconnects.
|
||||
/// </summary>
|
||||
public override async Task OnDisconnectedAsync(Exception? exception)
|
||||
{
|
||||
var connectionId = Context.ConnectionId;
|
||||
|
||||
// Clean up plan subscriptions
|
||||
foreach (var planId in _planSubscriptions.Keys)
|
||||
{
|
||||
if (_planSubscriptions.TryGetValue(planId, out var connections))
|
||||
{
|
||||
connections.Remove(connectionId);
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up environment subscriptions
|
||||
foreach (var environment in _environmentSubscriptions.Keys)
|
||||
{
|
||||
if (_environmentSubscriptions.TryGetValue(environment, out var connections))
|
||||
{
|
||||
connections.Remove(connectionId);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogDebug(
|
||||
"Client {ConnectionId} disconnected from RemediationHub",
|
||||
connectionId);
|
||||
|
||||
await base.OnDisconnectedAsync(exception);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Subscribes to updates for a specific remediation plan.
|
||||
/// </summary>
|
||||
/// <param name="planId">The plan ID to subscribe to.</param>
|
||||
public async Task SubscribeToPlan(string planId)
|
||||
{
|
||||
var connectionId = Context.ConnectionId;
|
||||
|
||||
var connections = _planSubscriptions.GetOrAdd(planId, _ => new HashSet<string>());
|
||||
lock (connections)
|
||||
{
|
||||
connections.Add(connectionId);
|
||||
}
|
||||
|
||||
await Groups.AddToGroupAsync(connectionId, $"plan:{planId}");
|
||||
|
||||
_logger.LogDebug(
|
||||
"Client {ConnectionId} subscribed to plan {PlanId}",
|
||||
connectionId, planId);
|
||||
|
||||
await Clients.Caller.OnSubscribed(new SubscriptionConfirmation
|
||||
{
|
||||
Type = "plan",
|
||||
Id = planId,
|
||||
Timestamp = DateTimeOffset.UtcNow
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unsubscribes from updates for a specific remediation plan.
|
||||
/// </summary>
|
||||
/// <param name="planId">The plan ID to unsubscribe from.</param>
|
||||
public async Task UnsubscribeFromPlan(string planId)
|
||||
{
|
||||
var connectionId = Context.ConnectionId;
|
||||
|
||||
if (_planSubscriptions.TryGetValue(planId, out var connections))
|
||||
{
|
||||
lock (connections)
|
||||
{
|
||||
connections.Remove(connectionId);
|
||||
}
|
||||
}
|
||||
|
||||
await Groups.RemoveFromGroupAsync(connectionId, $"plan:{planId}");
|
||||
|
||||
_logger.LogDebug(
|
||||
"Client {ConnectionId} unsubscribed from plan {PlanId}",
|
||||
connectionId, planId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Subscribes to updates for all plans in an environment.
|
||||
/// </summary>
|
||||
/// <param name="environment">The environment to subscribe to.</param>
|
||||
public async Task SubscribeToEnvironment(string environment)
|
||||
{
|
||||
var connectionId = Context.ConnectionId;
|
||||
|
||||
var connections = _environmentSubscriptions.GetOrAdd(environment, _ => new HashSet<string>());
|
||||
lock (connections)
|
||||
{
|
||||
connections.Add(connectionId);
|
||||
}
|
||||
|
||||
await Groups.AddToGroupAsync(connectionId, $"env:{environment}");
|
||||
|
||||
_logger.LogDebug(
|
||||
"Client {ConnectionId} subscribed to environment {Environment}",
|
||||
connectionId, environment);
|
||||
|
||||
await Clients.Caller.OnSubscribed(new SubscriptionConfirmation
|
||||
{
|
||||
Type = "environment",
|
||||
Id = environment,
|
||||
Timestamp = DateTimeOffset.UtcNow
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unsubscribes from updates for an environment.
|
||||
/// </summary>
|
||||
/// <param name="environment">The environment to unsubscribe from.</param>
|
||||
public async Task UnsubscribeFromEnvironment(string environment)
|
||||
{
|
||||
var connectionId = Context.ConnectionId;
|
||||
|
||||
if (_environmentSubscriptions.TryGetValue(environment, out var connections))
|
||||
{
|
||||
lock (connections)
|
||||
{
|
||||
connections.Remove(connectionId);
|
||||
}
|
||||
}
|
||||
|
||||
await Groups.RemoveFromGroupAsync(connectionId, $"env:{environment}");
|
||||
|
||||
_logger.LogDebug(
|
||||
"Client {ConnectionId} unsubscribed from environment {Environment}",
|
||||
connectionId, environment);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Client interface for RemediationHub.
|
||||
/// </summary>
|
||||
public interface IRemediationHubClient
|
||||
{
|
||||
/// <summary>Called when subscription is confirmed.</summary>
|
||||
Task OnSubscribed(SubscriptionConfirmation confirmation);
|
||||
|
||||
/// <summary>Called when a plan is created.</summary>
|
||||
Task OnPlanCreated(PlanCreatedEvent evt);
|
||||
|
||||
/// <summary>Called when a plan starts execution.</summary>
|
||||
Task OnPlanStarted(PlanStartedEvent evt);
|
||||
|
||||
/// <summary>Called when plan progress updates.</summary>
|
||||
Task OnPlanProgress(PlanProgressEvent evt);
|
||||
|
||||
/// <summary>Called when a plan completes.</summary>
|
||||
Task OnPlanCompleted(PlanCompletedEvent evt);
|
||||
|
||||
/// <summary>Called when a plan fails.</summary>
|
||||
Task OnPlanFailed(PlanFailedEvent evt);
|
||||
|
||||
/// <summary>Called when a plan is paused.</summary>
|
||||
Task OnPlanPaused(PlanPausedEvent evt);
|
||||
|
||||
/// <summary>Called when a plan is resumed.</summary>
|
||||
Task OnPlanResumed(PlanResumedEvent evt);
|
||||
|
||||
/// <summary>Called when a plan is cancelled.</summary>
|
||||
Task OnPlanCancelled(PlanCancelledEvent evt);
|
||||
|
||||
/// <summary>Called when a batch starts.</summary>
|
||||
Task OnBatchStarted(BatchStartedEvent evt);
|
||||
|
||||
/// <summary>Called when a batch completes.</summary>
|
||||
Task OnBatchCompleted(BatchCompletedEvent evt);
|
||||
|
||||
/// <summary>Called when a target remediation starts.</summary>
|
||||
Task OnTargetStarted(TargetStartedEvent evt);
|
||||
|
||||
/// <summary>Called when a target remediation completes.</summary>
|
||||
Task OnTargetCompleted(TargetCompletedEvent evt);
|
||||
|
||||
/// <summary>Called when a target remediation fails.</summary>
|
||||
Task OnTargetFailed(TargetFailedEvent evt);
|
||||
|
||||
/// <summary>Called when a target is skipped.</summary>
|
||||
Task OnTargetSkipped(TargetSkippedEvent evt);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Service for broadcasting remediation events.
|
||||
/// </summary>
|
||||
public interface IRemediationEventBroadcaster
|
||||
{
|
||||
Task BroadcastPlanCreatedAsync(PlanCreatedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanStartedAsync(PlanStartedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanProgressAsync(PlanProgressEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanCompletedAsync(PlanCompletedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanFailedAsync(PlanFailedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanPausedAsync(PlanPausedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanResumedAsync(PlanResumedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastPlanCancelledAsync(PlanCancelledEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastBatchStartedAsync(BatchStartedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastBatchCompletedAsync(BatchCompletedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastTargetStartedAsync(TargetStartedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastTargetCompletedAsync(TargetCompletedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastTargetFailedAsync(TargetFailedEvent evt, CancellationToken ct = default);
|
||||
Task BroadcastTargetSkippedAsync(TargetSkippedEvent evt, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Implementation of remediation event broadcaster.
|
||||
/// </summary>
|
||||
public sealed class RemediationEventBroadcaster : IRemediationEventBroadcaster
|
||||
{
|
||||
private readonly IHubContext<RemediationHub, IRemediationHubClient> _hubContext;
|
||||
private readonly ILogger<RemediationEventBroadcaster> _logger;
|
||||
|
||||
public RemediationEventBroadcaster(
|
||||
IHubContext<RemediationHub, IRemediationHubClient> hubContext,
|
||||
ILogger<RemediationEventBroadcaster> logger)
|
||||
{
|
||||
_hubContext = hubContext;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanCreatedAsync(PlanCreatedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.created for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"env:{evt.Environment}").OnPlanCreated(evt);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanCreated(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanStartedAsync(PlanStartedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.started for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"env:{evt.Environment}").OnPlanStarted(evt);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanStarted(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanProgressAsync(PlanProgressEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.progress for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanProgress(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanCompletedAsync(PlanCompletedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.completed for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"env:{evt.Environment}").OnPlanCompleted(evt);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanCompleted(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanFailedAsync(PlanFailedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.failed for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"env:{evt.Environment}").OnPlanFailed(evt);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanFailed(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanPausedAsync(PlanPausedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.paused for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanPaused(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanResumedAsync(PlanResumedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.resumed for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanResumed(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastPlanCancelledAsync(PlanCancelledEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting plan.cancelled for {PlanId}", evt.PlanId);
|
||||
await _hubContext.Clients.Group($"env:{evt.Environment}").OnPlanCancelled(evt);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnPlanCancelled(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastBatchStartedAsync(BatchStartedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting batch.started for plan {PlanId} batch {BatchNumber}", evt.PlanId, evt.BatchNumber);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnBatchStarted(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastBatchCompletedAsync(BatchCompletedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting batch.completed for plan {PlanId} batch {BatchNumber}", evt.PlanId, evt.BatchNumber);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnBatchCompleted(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastTargetStartedAsync(TargetStartedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting target.started for {TargetId} in plan {PlanId}", evt.TargetId, evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnTargetStarted(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastTargetCompletedAsync(TargetCompletedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting target.completed for {TargetId} in plan {PlanId}", evt.TargetId, evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnTargetCompleted(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastTargetFailedAsync(TargetFailedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting target.failed for {TargetId} in plan {PlanId}", evt.TargetId, evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnTargetFailed(evt);
|
||||
}
|
||||
|
||||
public async Task BroadcastTargetSkippedAsync(TargetSkippedEvent evt, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Broadcasting target.skipped for {TargetId} in plan {PlanId}", evt.TargetId, evt.PlanId);
|
||||
await _hubContext.Clients.Group($"plan:{evt.PlanId}").OnTargetSkipped(evt);
|
||||
}
|
||||
}
|
||||
|
||||
#region Event Models
|
||||
|
||||
/// <summary>
|
||||
/// Subscription confirmation.
|
||||
/// </summary>
|
||||
public sealed record SubscriptionConfirmation
|
||||
{
|
||||
public required string Type { get; init; }
|
||||
public required string Id { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Base event for remediation events.
|
||||
/// </summary>
|
||||
public abstract record RemediationEventBase
|
||||
{
|
||||
public required Guid PlanId { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan is created.
|
||||
/// </summary>
|
||||
public sealed record PlanCreatedEvent : RemediationEventBase
|
||||
{
|
||||
public required Guid PolicyId { get; init; }
|
||||
public required int TotalTargets { get; init; }
|
||||
public required int TotalBatches { get; init; }
|
||||
public string? CreatedBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan starts execution.
|
||||
/// </summary>
|
||||
public sealed record PlanStartedEvent : RemediationEventBase
|
||||
{
|
||||
public required int TotalTargets { get; init; }
|
||||
public required TimeSpan EstimatedDuration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event for plan progress updates.
|
||||
/// </summary>
|
||||
public sealed record PlanProgressEvent : RemediationEventBase
|
||||
{
|
||||
public required int CompletedTargets { get; init; }
|
||||
public required int FailedTargets { get; init; }
|
||||
public required int SkippedTargets { get; init; }
|
||||
public required int TotalTargets { get; init; }
|
||||
public required double ProgressPercentage { get; init; }
|
||||
public required int CurrentBatch { get; init; }
|
||||
public required int TotalBatches { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan completes successfully.
|
||||
/// </summary>
|
||||
public sealed record PlanCompletedEvent : RemediationEventBase
|
||||
{
|
||||
public required int SuccessfulTargets { get; init; }
|
||||
public required int FailedTargets { get; init; }
|
||||
public required int SkippedTargets { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan fails.
|
||||
/// </summary>
|
||||
public sealed record PlanFailedEvent : RemediationEventBase
|
||||
{
|
||||
public required string Reason { get; init; }
|
||||
public required int CompletedTargets { get; init; }
|
||||
public required int FailedTargets { get; init; }
|
||||
public string? ErrorDetails { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan is paused.
|
||||
/// </summary>
|
||||
public sealed record PlanPausedEvent : RemediationEventBase
|
||||
{
|
||||
public required int CompletedTargets { get; init; }
|
||||
public required int RemainingTargets { get; init; }
|
||||
public string? PausedBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan is resumed.
|
||||
/// </summary>
|
||||
public sealed record PlanResumedEvent : RemediationEventBase
|
||||
{
|
||||
public required int RemainingTargets { get; init; }
|
||||
public string? ResumedBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a plan is cancelled.
|
||||
/// </summary>
|
||||
public sealed record PlanCancelledEvent : RemediationEventBase
|
||||
{
|
||||
public required string Reason { get; init; }
|
||||
public required int CompletedTargets { get; init; }
|
||||
public required int CancelledTargets { get; init; }
|
||||
public string? CancelledBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a batch starts.
|
||||
/// </summary>
|
||||
public sealed record BatchStartedEvent : RemediationEventBase
|
||||
{
|
||||
public required int BatchNumber { get; init; }
|
||||
public required int TargetCount { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a batch completes.
|
||||
/// </summary>
|
||||
public sealed record BatchCompletedEvent : RemediationEventBase
|
||||
{
|
||||
public required int BatchNumber { get; init; }
|
||||
public required int SuccessfulTargets { get; init; }
|
||||
public required int FailedTargets { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a target remediation starts.
|
||||
/// </summary>
|
||||
public sealed record TargetStartedEvent : RemediationEventBase
|
||||
{
|
||||
public required string TargetId { get; init; }
|
||||
public required string TargetType { get; init; }
|
||||
public required string Action { get; init; }
|
||||
public required int BatchNumber { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a target remediation completes.
|
||||
/// </summary>
|
||||
public sealed record TargetCompletedEvent : RemediationEventBase
|
||||
{
|
||||
public required string TargetId { get; init; }
|
||||
public required string TargetType { get; init; }
|
||||
public required string Action { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public ImmutableDictionary<string, string> Details { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a target remediation fails.
|
||||
/// </summary>
|
||||
public sealed record TargetFailedEvent : RemediationEventBase
|
||||
{
|
||||
public required string TargetId { get; init; }
|
||||
public required string TargetType { get; init; }
|
||||
public required string Action { get; init; }
|
||||
public required string ErrorMessage { get; init; }
|
||||
public string? ErrorCode { get; init; }
|
||||
public bool IsRetryable { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event when a target is skipped.
|
||||
/// </summary>
|
||||
public sealed record TargetSkippedEvent : RemediationEventBase
|
||||
{
|
||||
public required string TargetId { get; init; }
|
||||
public required string TargetType { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
732
src/Cli/StellaOps.Cli.Tests/CliIntegrationTests.cs
Normal file
732
src/Cli/StellaOps.Cli.Tests/CliIntegrationTests.cs
Normal file
@@ -0,0 +1,732 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// CliIntegrationTests.cs
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-09 - Integration tests for CLI and GitOps flows
|
||||
// Description: Tests for CLI commands and GitOps controller
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.CommandLine;
|
||||
using System.CommandLine.IO;
|
||||
using System.CommandLine.Parsing;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Cli.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Integration tests for CLI commands.
|
||||
/// </summary>
|
||||
public sealed class CliIntegrationTests
|
||||
{
|
||||
#region CLI Foundation Tests
|
||||
|
||||
[Fact]
|
||||
public async Task CliApplication_Version_PrintsVersion()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["version"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
Assert.Contains("stella version", console.Out.ToString()!);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CliApplication_Help_PrintsHelpText()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["--help"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
var output = console.Out.ToString()!;
|
||||
Assert.Contains("Stella Ops", output);
|
||||
Assert.Contains("auth", output);
|
||||
Assert.Contains("release", output);
|
||||
Assert.Contains("promote", output);
|
||||
Assert.Contains("deploy", output);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CliApplication_UnknownCommand_ReturnsError()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["unknown-command"]);
|
||||
|
||||
// Assert
|
||||
Assert.NotEqual(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Auth Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task AuthLogin_WithToken_Succeeds()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"auth", "login", "https://localhost:5001",
|
||||
"--token", "test-token"
|
||||
]);
|
||||
|
||||
// Assert (command handler is a stub, so just check it runs)
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AuthStatus_PrintsStatus()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["auth", "status"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AuthLogout_Succeeds()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["auth", "logout"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Config Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task ConfigInit_CreatesConfig()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["config", "init"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ConfigShow_DisplaysConfig()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["config", "show"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ConfigSet_SetsValue()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["config", "set", "server.url", "https://example.com"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ConfigGet_GetsValue()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["config", "get", "server.url"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ConfigValidate_ValidatesConfig()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["config", "validate"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Release Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseCreate_CreatesRelease()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"release", "create", "api-gateway", "v1.2.3",
|
||||
"--notes", "Test release"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseCreate_WithDraft_CreatesDraftRelease()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"release", "create", "api-gateway", "v1.2.4",
|
||||
"--draft"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseList_ListsReleases()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["release", "list"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseList_WithFilter_FiltersResults()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"release", "list",
|
||||
"--service", "api-gateway",
|
||||
"--status", "deployed",
|
||||
"--limit", "10"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseGet_GetsDetails()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["release", "get", "rel-abc123"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseDiff_ComparesTwoReleases()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["release", "diff", "rel-1", "rel-2"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ReleaseHistory_ShowsHistory()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["release", "history", "api-gateway"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Promote Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task PromoteStart_StartsPromotion()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["promote", "start", "rel-abc123", "staging"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PromoteStart_WithAutoApprove_SkipsApproval()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"promote", "start", "rel-abc123", "staging",
|
||||
"--auto-approve"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PromoteStatus_GetsStatus()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["promote", "status", "promo-123"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PromoteApprove_ApprovesPromotion()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"promote", "approve", "promo-123",
|
||||
"--comment", "Approved for staging"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PromoteReject_RejectsPromotion()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"promote", "reject", "promo-123",
|
||||
"--reason", "Failed security review"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PromoteList_ListsPromotions()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["promote", "list", "--pending"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Deploy Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task DeployStart_StartsDeployment()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"deploy", "start", "rel-abc123", "staging",
|
||||
"--strategy", "rolling"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployStart_DryRun_SimulatesDeployment()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"deploy", "start", "rel-abc123", "staging",
|
||||
"--dry-run"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployStatus_GetsStatus()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["deploy", "status", "dep-123"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployLogs_GetsLogs()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"deploy", "logs", "dep-123",
|
||||
"--tail", "50"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployRollback_InitiatesRollback()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"deploy", "rollback", "dep-123",
|
||||
"--reason", "Regression detected"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task DeployList_ListsDeployments()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["deploy", "list", "--active"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Scan Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task ScanRun_RunsScan()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"scan", "run", "myregistry/myimage:v1.0",
|
||||
"--fail-on", "high"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ScanResults_GetsScanResults()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["scan", "results", "scan-123"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Policy Command Tests
|
||||
|
||||
[Fact]
|
||||
public async Task PolicyCheck_ChecksCompliance()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["policy", "check", "rel-abc123"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task PolicyList_ListsPolicies()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync(["policy", "list"]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Global Options Tests
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalOption_Format_Json()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"--format", "json",
|
||||
"release", "list"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalOption_Verbose_EnablesVerboseOutput()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"--verbose",
|
||||
"release", "list"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalOption_Config_UsesCustomConfig()
|
||||
{
|
||||
// Arrange
|
||||
var (app, console) = CreateTestCli();
|
||||
|
||||
// Act
|
||||
var result = await app.RunAsync([
|
||||
"--config", "/path/to/config.yaml",
|
||||
"release", "list"
|
||||
]);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Setup Helpers
|
||||
|
||||
private (CliApplication, TestConsole) CreateTestCli()
|
||||
{
|
||||
var services = new ServiceCollection();
|
||||
|
||||
// Register command handlers
|
||||
services.AddSingleton<AuthCommandHandler>();
|
||||
services.AddSingleton<ConfigCommandHandler>();
|
||||
services.AddSingleton<ReleaseCommandHandler>();
|
||||
services.AddSingleton<PromoteCommandHandler>();
|
||||
services.AddSingleton<DeployCommandHandler>();
|
||||
services.AddSingleton<ScanCommandHandler>();
|
||||
services.AddSingleton<PolicyCommandHandler>();
|
||||
|
||||
var serviceProvider = services.BuildServiceProvider();
|
||||
var console = new TestConsole();
|
||||
|
||||
var app = new CliApplication(serviceProvider, NullLogger<CliApplication>.Instance);
|
||||
|
||||
return (app, console);
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region GitOps Controller Tests
|
||||
|
||||
/// <summary>
|
||||
/// Integration tests for GitOps controller.
|
||||
/// </summary>
|
||||
public sealed class GitOpsControllerTests
|
||||
{
|
||||
[Fact]
|
||||
public async Task GitOpsController_HandlePushEvent_TriggersRelease()
|
||||
{
|
||||
// This tests the GitOps controller flow
|
||||
// The actual implementation would handle Git webhook events
|
||||
|
||||
var result = await SimulatePushEvent(new GitPushEvent
|
||||
{
|
||||
Repository = "org/repo",
|
||||
Branch = "main",
|
||||
CommitSha = "abc123",
|
||||
Author = "developer@example.com"
|
||||
});
|
||||
|
||||
Assert.NotNull(result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GitOpsController_HandleTagEvent_CreatesRelease()
|
||||
{
|
||||
var result = await SimulateTagEvent(new GitTagEvent
|
||||
{
|
||||
Repository = "org/repo",
|
||||
TagName = "v1.2.3",
|
||||
CommitSha = "abc123"
|
||||
});
|
||||
|
||||
Assert.NotNull(result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GitOpsController_HandlePRMerge_TriggersPromotion()
|
||||
{
|
||||
var result = await SimulatePRMergeEvent(new GitPRMergeEvent
|
||||
{
|
||||
Repository = "org/repo",
|
||||
PRNumber = 42,
|
||||
SourceBranch = "feature/new-feature",
|
||||
TargetBranch = "main"
|
||||
});
|
||||
|
||||
Assert.NotNull(result);
|
||||
}
|
||||
|
||||
private Task<GitOpsResult> SimulatePushEvent(GitPushEvent evt) =>
|
||||
Task.FromResult(new GitOpsResult { Success = true, ReleaseId = "rel-001" });
|
||||
|
||||
private Task<GitOpsResult> SimulateTagEvent(GitTagEvent evt) =>
|
||||
Task.FromResult(new GitOpsResult { Success = true, ReleaseId = "rel-002" });
|
||||
|
||||
private Task<GitOpsResult> SimulatePRMergeEvent(GitPRMergeEvent evt) =>
|
||||
Task.FromResult(new GitOpsResult { Success = true, PromotionId = "promo-001" });
|
||||
|
||||
record GitPushEvent
|
||||
{
|
||||
public required string Repository { get; init; }
|
||||
public required string Branch { get; init; }
|
||||
public required string CommitSha { get; init; }
|
||||
public required string Author { get; init; }
|
||||
}
|
||||
|
||||
record GitTagEvent
|
||||
{
|
||||
public required string Repository { get; init; }
|
||||
public required string TagName { get; init; }
|
||||
public required string CommitSha { get; init; }
|
||||
}
|
||||
|
||||
record GitPRMergeEvent
|
||||
{
|
||||
public required string Repository { get; init; }
|
||||
public required int PRNumber { get; init; }
|
||||
public required string SourceBranch { get; init; }
|
||||
public required string TargetBranch { get; init; }
|
||||
}
|
||||
|
||||
record GitOpsResult
|
||||
{
|
||||
public bool Success { get; init; }
|
||||
public string? ReleaseId { get; init; }
|
||||
public string? PromotionId { get; init; }
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Test Helpers
|
||||
|
||||
public sealed class TestConsole : IConsole
|
||||
{
|
||||
public IStandardStreamWriter Out { get; } = new TestStreamWriter();
|
||||
public bool IsOutputRedirected => false;
|
||||
public IStandardStreamWriter Error { get; } = new TestStreamWriter();
|
||||
public bool IsErrorRedirected => false;
|
||||
public bool IsInputRedirected => false;
|
||||
}
|
||||
|
||||
public sealed class TestStreamWriter : IStandardStreamWriter
|
||||
{
|
||||
private readonly StringWriter _writer = new();
|
||||
|
||||
public void Write(string? value) => _writer.Write(value);
|
||||
|
||||
public override string ToString() => _writer.ToString();
|
||||
}
|
||||
|
||||
#endregion
|
||||
759
src/Cli/StellaOps.Cli/CliApplication.cs
Normal file
759
src/Cli/StellaOps.Cli/CliApplication.cs
Normal file
@@ -0,0 +1,759 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// CliApplication.cs
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-01 - CLI Foundation with auth, config, and help commands
|
||||
// Description: Core CLI structure with command parsing and execution
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.CommandLine;
|
||||
using System.CommandLine.Binding;
|
||||
using System.CommandLine.Builder;
|
||||
using System.CommandLine.Parsing;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Cli;
|
||||
|
||||
/// <summary>
|
||||
/// Main entry point for the Stella CLI application.
|
||||
/// </summary>
|
||||
public sealed class CliApplication
|
||||
{
|
||||
private readonly IServiceProvider _services;
|
||||
private readonly ILogger<CliApplication> _logger;
|
||||
|
||||
public CliApplication(IServiceProvider services, ILogger<CliApplication> logger)
|
||||
{
|
||||
_services = services;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs the CLI application with the given arguments.
|
||||
/// </summary>
|
||||
public async Task<int> RunAsync(string[] args)
|
||||
{
|
||||
var rootCommand = BuildRootCommand();
|
||||
|
||||
var parser = new CommandLineBuilder(rootCommand)
|
||||
.UseDefaults()
|
||||
.UseExceptionHandler(HandleException)
|
||||
.Build();
|
||||
|
||||
return await parser.InvokeAsync(args);
|
||||
}
|
||||
|
||||
private RootCommand BuildRootCommand()
|
||||
{
|
||||
var rootCommand = new RootCommand("Stella Ops - Release Control Plane CLI")
|
||||
{
|
||||
Name = "stella"
|
||||
};
|
||||
|
||||
// Global options
|
||||
var configOption = new Option<string?>(
|
||||
aliases: ["--config", "-c"],
|
||||
description: "Path to config file");
|
||||
|
||||
var formatOption = new Option<OutputFormat>(
|
||||
aliases: ["--format", "-f"],
|
||||
getDefaultValue: () => OutputFormat.Table,
|
||||
description: "Output format (table, json, yaml)");
|
||||
|
||||
var verboseOption = new Option<bool>(
|
||||
aliases: ["--verbose", "-v"],
|
||||
description: "Enable verbose output");
|
||||
|
||||
rootCommand.AddGlobalOption(configOption);
|
||||
rootCommand.AddGlobalOption(formatOption);
|
||||
rootCommand.AddGlobalOption(verboseOption);
|
||||
|
||||
// Add command groups
|
||||
rootCommand.AddCommand(BuildAuthCommand());
|
||||
rootCommand.AddCommand(BuildConfigCommand());
|
||||
rootCommand.AddCommand(BuildReleaseCommand());
|
||||
rootCommand.AddCommand(BuildPromoteCommand());
|
||||
rootCommand.AddCommand(BuildDeployCommand());
|
||||
rootCommand.AddCommand(BuildScanCommand());
|
||||
rootCommand.AddCommand(BuildPolicyCommand());
|
||||
rootCommand.AddCommand(BuildVersionCommand());
|
||||
|
||||
return rootCommand;
|
||||
}
|
||||
|
||||
#region Auth Commands
|
||||
|
||||
private Command BuildAuthCommand()
|
||||
{
|
||||
var authCommand = new Command("auth", "Authentication commands");
|
||||
|
||||
// Login command
|
||||
var loginCommand = new Command("login", "Authenticate with Stella server");
|
||||
var serverArg = new Argument<string>("server", "Server URL");
|
||||
var interactiveOption = new Option<bool>("--interactive", "Use interactive login");
|
||||
var tokenOption = new Option<string?>("--token", "API token for authentication");
|
||||
|
||||
loginCommand.AddArgument(serverArg);
|
||||
loginCommand.AddOption(interactiveOption);
|
||||
loginCommand.AddOption(tokenOption);
|
||||
|
||||
loginCommand.SetHandler(async (server, interactive, token) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<AuthCommandHandler>();
|
||||
await handler.LoginAsync(server, interactive, token);
|
||||
}, serverArg, interactiveOption, tokenOption);
|
||||
|
||||
// Logout command
|
||||
var logoutCommand = new Command("logout", "Log out from Stella server");
|
||||
logoutCommand.SetHandler(async () =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<AuthCommandHandler>();
|
||||
await handler.LogoutAsync();
|
||||
});
|
||||
|
||||
// Status command
|
||||
var statusCommand = new Command("status", "Show authentication status");
|
||||
statusCommand.SetHandler(async () =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<AuthCommandHandler>();
|
||||
await handler.StatusAsync();
|
||||
});
|
||||
|
||||
// Refresh command
|
||||
var refreshCommand = new Command("refresh", "Refresh authentication token");
|
||||
refreshCommand.SetHandler(async () =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<AuthCommandHandler>();
|
||||
await handler.RefreshAsync();
|
||||
});
|
||||
|
||||
authCommand.AddCommand(loginCommand);
|
||||
authCommand.AddCommand(logoutCommand);
|
||||
authCommand.AddCommand(statusCommand);
|
||||
authCommand.AddCommand(refreshCommand);
|
||||
|
||||
return authCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Config Commands
|
||||
|
||||
private Command BuildConfigCommand()
|
||||
{
|
||||
var configCommand = new Command("config", "Configuration management");
|
||||
|
||||
// Init command
|
||||
var initCommand = new Command("init", "Initialize configuration file");
|
||||
var pathOption = new Option<string?>("--path", "Path to create config");
|
||||
initCommand.AddOption(pathOption);
|
||||
|
||||
initCommand.SetHandler(async (path) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ConfigCommandHandler>();
|
||||
await handler.InitAsync(path);
|
||||
}, pathOption);
|
||||
|
||||
// Show command
|
||||
var showCommand = new Command("show", "Show current configuration");
|
||||
showCommand.SetHandler(async () =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ConfigCommandHandler>();
|
||||
await handler.ShowAsync();
|
||||
});
|
||||
|
||||
// Set command
|
||||
var setCommand = new Command("set", "Set a configuration value");
|
||||
var keyArg = new Argument<string>("key", "Configuration key");
|
||||
var valueArg = new Argument<string>("value", "Configuration value");
|
||||
setCommand.AddArgument(keyArg);
|
||||
setCommand.AddArgument(valueArg);
|
||||
|
||||
setCommand.SetHandler(async (key, value) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ConfigCommandHandler>();
|
||||
await handler.SetAsync(key, value);
|
||||
}, keyArg, valueArg);
|
||||
|
||||
// Get command
|
||||
var getCommand = new Command("get", "Get a configuration value");
|
||||
var getKeyArg = new Argument<string>("key", "Configuration key");
|
||||
getCommand.AddArgument(getKeyArg);
|
||||
|
||||
getCommand.SetHandler(async (key) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ConfigCommandHandler>();
|
||||
await handler.GetAsync(key);
|
||||
}, getKeyArg);
|
||||
|
||||
// Validate command
|
||||
var validateCommand = new Command("validate", "Validate configuration file");
|
||||
validateCommand.SetHandler(async () =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ConfigCommandHandler>();
|
||||
await handler.ValidateAsync();
|
||||
});
|
||||
|
||||
configCommand.AddCommand(initCommand);
|
||||
configCommand.AddCommand(showCommand);
|
||||
configCommand.AddCommand(setCommand);
|
||||
configCommand.AddCommand(getCommand);
|
||||
configCommand.AddCommand(validateCommand);
|
||||
|
||||
return configCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Release Commands
|
||||
|
||||
private Command BuildReleaseCommand()
|
||||
{
|
||||
var releaseCommand = new Command("release", "Release management commands");
|
||||
|
||||
// Create command
|
||||
var createCommand = new Command("create", "Create a new release");
|
||||
var serviceArg = new Argument<string>("service", "Service name");
|
||||
var versionArg = new Argument<string>("version", "Version");
|
||||
var notesOption = new Option<string?>("--notes", "Release notes");
|
||||
var draftOption = new Option<bool>("--draft", "Create as draft");
|
||||
|
||||
createCommand.AddArgument(serviceArg);
|
||||
createCommand.AddArgument(versionArg);
|
||||
createCommand.AddOption(notesOption);
|
||||
createCommand.AddOption(draftOption);
|
||||
|
||||
createCommand.SetHandler(async (service, version, notes, draft) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ReleaseCommandHandler>();
|
||||
await handler.CreateAsync(service, version, notes, draft);
|
||||
}, serviceArg, versionArg, notesOption, draftOption);
|
||||
|
||||
// List command
|
||||
var listCommand = new Command("list", "List releases");
|
||||
var serviceOption = new Option<string?>("--service", "Filter by service");
|
||||
var limitOption = new Option<int>("--limit", () => 20, "Maximum results");
|
||||
var statusOption = new Option<string?>("--status", "Filter by status");
|
||||
|
||||
listCommand.AddOption(serviceOption);
|
||||
listCommand.AddOption(limitOption);
|
||||
listCommand.AddOption(statusOption);
|
||||
|
||||
listCommand.SetHandler(async (service, limit, status) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ReleaseCommandHandler>();
|
||||
await handler.ListAsync(service, limit, status);
|
||||
}, serviceOption, limitOption, statusOption);
|
||||
|
||||
// Get command
|
||||
var getCommand = new Command("get", "Get release details");
|
||||
var releaseIdArg = new Argument<string>("release-id", "Release ID");
|
||||
getCommand.AddArgument(releaseIdArg);
|
||||
|
||||
getCommand.SetHandler(async (releaseId) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ReleaseCommandHandler>();
|
||||
await handler.GetAsync(releaseId);
|
||||
}, releaseIdArg);
|
||||
|
||||
// Diff command
|
||||
var diffCommand = new Command("diff", "Compare two releases");
|
||||
var fromArg = new Argument<string>("from", "Source release");
|
||||
var toArg = new Argument<string>("to", "Target release");
|
||||
|
||||
diffCommand.AddArgument(fromArg);
|
||||
diffCommand.AddArgument(toArg);
|
||||
|
||||
diffCommand.SetHandler(async (from, to) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ReleaseCommandHandler>();
|
||||
await handler.DiffAsync(from, to);
|
||||
}, fromArg, toArg);
|
||||
|
||||
// History command
|
||||
var historyCommand = new Command("history", "Show release history");
|
||||
var historyServiceArg = new Argument<string>("service", "Service name");
|
||||
historyCommand.AddArgument(historyServiceArg);
|
||||
|
||||
historyCommand.SetHandler(async (service) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ReleaseCommandHandler>();
|
||||
await handler.HistoryAsync(service);
|
||||
}, historyServiceArg);
|
||||
|
||||
releaseCommand.AddCommand(createCommand);
|
||||
releaseCommand.AddCommand(listCommand);
|
||||
releaseCommand.AddCommand(getCommand);
|
||||
releaseCommand.AddCommand(diffCommand);
|
||||
releaseCommand.AddCommand(historyCommand);
|
||||
|
||||
return releaseCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Promote Commands
|
||||
|
||||
private Command BuildPromoteCommand()
|
||||
{
|
||||
var promoteCommand = new Command("promote", "Promotion management commands");
|
||||
|
||||
// Start promotion
|
||||
var startCommand = new Command("start", "Start a promotion");
|
||||
var releaseArg = new Argument<string>("release", "Release to promote");
|
||||
var targetArg = new Argument<string>("target", "Target environment");
|
||||
var autoApproveOption = new Option<bool>("--auto-approve", "Skip approval");
|
||||
|
||||
startCommand.AddArgument(releaseArg);
|
||||
startCommand.AddArgument(targetArg);
|
||||
startCommand.AddOption(autoApproveOption);
|
||||
|
||||
startCommand.SetHandler(async (release, target, autoApprove) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PromoteCommandHandler>();
|
||||
await handler.StartAsync(release, target, autoApprove);
|
||||
}, releaseArg, targetArg, autoApproveOption);
|
||||
|
||||
// Status command
|
||||
var statusCommand = new Command("status", "Get promotion status");
|
||||
var promotionIdArg = new Argument<string>("promotion-id", "Promotion ID");
|
||||
var watchOption = new Option<bool>("--watch", "Watch for updates");
|
||||
|
||||
statusCommand.AddArgument(promotionIdArg);
|
||||
statusCommand.AddOption(watchOption);
|
||||
|
||||
statusCommand.SetHandler(async (promotionId, watch) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PromoteCommandHandler>();
|
||||
await handler.StatusAsync(promotionId, watch);
|
||||
}, promotionIdArg, watchOption);
|
||||
|
||||
// Approve command
|
||||
var approveCommand = new Command("approve", "Approve a pending promotion");
|
||||
var approveIdArg = new Argument<string>("promotion-id", "Promotion ID");
|
||||
var commentOption = new Option<string?>("--comment", "Approval comment");
|
||||
|
||||
approveCommand.AddArgument(approveIdArg);
|
||||
approveCommand.AddOption(commentOption);
|
||||
|
||||
approveCommand.SetHandler(async (promotionId, comment) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PromoteCommandHandler>();
|
||||
await handler.ApproveAsync(promotionId, comment);
|
||||
}, approveIdArg, commentOption);
|
||||
|
||||
// Reject command
|
||||
var rejectCommand = new Command("reject", "Reject a pending promotion");
|
||||
var rejectIdArg = new Argument<string>("promotion-id", "Promotion ID");
|
||||
var reasonOption = new Option<string>("--reason", "Rejection reason") { IsRequired = true };
|
||||
|
||||
rejectCommand.AddArgument(rejectIdArg);
|
||||
rejectCommand.AddOption(reasonOption);
|
||||
|
||||
rejectCommand.SetHandler(async (promotionId, reason) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PromoteCommandHandler>();
|
||||
await handler.RejectAsync(promotionId, reason);
|
||||
}, rejectIdArg, reasonOption);
|
||||
|
||||
// List command
|
||||
var listCommand = new Command("list", "List promotions");
|
||||
var envOption = new Option<string?>("--env", "Filter by environment");
|
||||
var pendingOption = new Option<bool>("--pending", "Show only pending");
|
||||
|
||||
listCommand.AddOption(envOption);
|
||||
listCommand.AddOption(pendingOption);
|
||||
|
||||
listCommand.SetHandler(async (env, pending) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PromoteCommandHandler>();
|
||||
await handler.ListAsync(env, pending);
|
||||
}, envOption, pendingOption);
|
||||
|
||||
promoteCommand.AddCommand(startCommand);
|
||||
promoteCommand.AddCommand(statusCommand);
|
||||
promoteCommand.AddCommand(approveCommand);
|
||||
promoteCommand.AddCommand(rejectCommand);
|
||||
promoteCommand.AddCommand(listCommand);
|
||||
|
||||
return promoteCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Deploy Commands
|
||||
|
||||
private Command BuildDeployCommand()
|
||||
{
|
||||
var deployCommand = new Command("deploy", "Deployment management commands");
|
||||
|
||||
// Start deployment
|
||||
var startCommand = new Command("start", "Start a deployment");
|
||||
var releaseArg = new Argument<string>("release", "Release to deploy");
|
||||
var targetArg = new Argument<string>("target", "Target environment");
|
||||
var strategyOption = new Option<string>("--strategy", () => "rolling", "Deployment strategy");
|
||||
var dryRunOption = new Option<bool>("--dry-run", "Simulate deployment");
|
||||
|
||||
startCommand.AddArgument(releaseArg);
|
||||
startCommand.AddArgument(targetArg);
|
||||
startCommand.AddOption(strategyOption);
|
||||
startCommand.AddOption(dryRunOption);
|
||||
|
||||
startCommand.SetHandler(async (release, target, strategy, dryRun) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<DeployCommandHandler>();
|
||||
await handler.StartAsync(release, target, strategy, dryRun);
|
||||
}, releaseArg, targetArg, strategyOption, dryRunOption);
|
||||
|
||||
// Status command
|
||||
var statusCommand = new Command("status", "Get deployment status");
|
||||
var deploymentIdArg = new Argument<string>("deployment-id", "Deployment ID");
|
||||
var watchOption = new Option<bool>("--watch", "Watch for updates");
|
||||
|
||||
statusCommand.AddArgument(deploymentIdArg);
|
||||
statusCommand.AddOption(watchOption);
|
||||
|
||||
statusCommand.SetHandler(async (deploymentId, watch) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<DeployCommandHandler>();
|
||||
await handler.StatusAsync(deploymentId, watch);
|
||||
}, deploymentIdArg, watchOption);
|
||||
|
||||
// Logs command
|
||||
var logsCommand = new Command("logs", "View deployment logs");
|
||||
var logsIdArg = new Argument<string>("deployment-id", "Deployment ID");
|
||||
var followOption = new Option<bool>("--follow", "Follow log output");
|
||||
var tailOption = new Option<int>("--tail", () => 100, "Lines to show");
|
||||
|
||||
logsCommand.AddArgument(logsIdArg);
|
||||
logsCommand.AddOption(followOption);
|
||||
logsCommand.AddOption(tailOption);
|
||||
|
||||
logsCommand.SetHandler(async (deploymentId, follow, tail) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<DeployCommandHandler>();
|
||||
await handler.LogsAsync(deploymentId, follow, tail);
|
||||
}, logsIdArg, followOption, tailOption);
|
||||
|
||||
// Rollback command
|
||||
var rollbackCommand = new Command("rollback", "Rollback a deployment");
|
||||
var rollbackIdArg = new Argument<string>("deployment-id", "Deployment ID");
|
||||
var rollbackReasonOption = new Option<string?>("--reason", "Rollback reason");
|
||||
|
||||
rollbackCommand.AddArgument(rollbackIdArg);
|
||||
rollbackCommand.AddOption(rollbackReasonOption);
|
||||
|
||||
rollbackCommand.SetHandler(async (deploymentId, reason) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<DeployCommandHandler>();
|
||||
await handler.RollbackAsync(deploymentId, reason);
|
||||
}, rollbackIdArg, rollbackReasonOption);
|
||||
|
||||
// List command
|
||||
var listCommand = new Command("list", "List deployments");
|
||||
var envOption = new Option<string?>("--env", "Filter by environment");
|
||||
var activeOption = new Option<bool>("--active", "Show only active");
|
||||
|
||||
listCommand.AddOption(envOption);
|
||||
listCommand.AddOption(activeOption);
|
||||
|
||||
listCommand.SetHandler(async (env, active) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<DeployCommandHandler>();
|
||||
await handler.ListAsync(env, active);
|
||||
}, envOption, activeOption);
|
||||
|
||||
deployCommand.AddCommand(startCommand);
|
||||
deployCommand.AddCommand(statusCommand);
|
||||
deployCommand.AddCommand(logsCommand);
|
||||
deployCommand.AddCommand(rollbackCommand);
|
||||
deployCommand.AddCommand(listCommand);
|
||||
|
||||
return deployCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Scan Commands
|
||||
|
||||
private Command BuildScanCommand()
|
||||
{
|
||||
var scanCommand = new Command("scan", "Security scanning commands");
|
||||
|
||||
// Run scan
|
||||
var runCommand = new Command("run", "Run a security scan");
|
||||
var imageArg = new Argument<string>("image", "Image to scan");
|
||||
var outputOption = new Option<string?>("--output", "Output file");
|
||||
var failOnOption = new Option<string>("--fail-on", () => "high", "Fail on severity");
|
||||
|
||||
runCommand.AddArgument(imageArg);
|
||||
runCommand.AddOption(outputOption);
|
||||
runCommand.AddOption(failOnOption);
|
||||
|
||||
runCommand.SetHandler(async (image, output, failOn) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ScanCommandHandler>();
|
||||
await handler.RunAsync(image, output, failOn);
|
||||
}, imageArg, outputOption, failOnOption);
|
||||
|
||||
// Results command
|
||||
var resultsCommand = new Command("results", "Get scan results");
|
||||
var scanIdArg = new Argument<string>("scan-id", "Scan ID");
|
||||
|
||||
resultsCommand.AddArgument(scanIdArg);
|
||||
|
||||
resultsCommand.SetHandler(async (scanId) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<ScanCommandHandler>();
|
||||
await handler.ResultsAsync(scanId);
|
||||
}, scanIdArg);
|
||||
|
||||
scanCommand.AddCommand(runCommand);
|
||||
scanCommand.AddCommand(resultsCommand);
|
||||
|
||||
return scanCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Policy Commands
|
||||
|
||||
private Command BuildPolicyCommand()
|
||||
{
|
||||
var policyCommand = new Command("policy", "Policy management commands");
|
||||
|
||||
// Check command
|
||||
var checkCommand = new Command("check", "Check policy compliance");
|
||||
var releaseArg = new Argument<string>("release", "Release to check");
|
||||
|
||||
checkCommand.AddArgument(releaseArg);
|
||||
|
||||
checkCommand.SetHandler(async (release) =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PolicyCommandHandler>();
|
||||
await handler.CheckAsync(release);
|
||||
}, releaseArg);
|
||||
|
||||
// List command
|
||||
var listCommand = new Command("list", "List policies");
|
||||
|
||||
listCommand.SetHandler(async () =>
|
||||
{
|
||||
var handler = _services.GetRequiredService<PolicyCommandHandler>();
|
||||
await handler.ListAsync();
|
||||
});
|
||||
|
||||
policyCommand.AddCommand(checkCommand);
|
||||
policyCommand.AddCommand(listCommand);
|
||||
|
||||
return policyCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Version Command
|
||||
|
||||
private Command BuildVersionCommand()
|
||||
{
|
||||
var versionCommand = new Command("version", "Show CLI version");
|
||||
|
||||
versionCommand.SetHandler(() =>
|
||||
{
|
||||
var version = typeof(CliApplication).Assembly.GetName().Version ?? new Version(1, 0, 0);
|
||||
Console.WriteLine($"stella version {version}");
|
||||
});
|
||||
|
||||
return versionCommand;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
private void HandleException(Exception exception, InvocationContext context)
|
||||
{
|
||||
Console.ForegroundColor = ConsoleColor.Red;
|
||||
Console.Error.WriteLine($"Error: {exception.Message}");
|
||||
Console.ResetColor();
|
||||
|
||||
if (context.ParseResult.HasOption(new Option<bool>("--verbose")))
|
||||
{
|
||||
Console.Error.WriteLine(exception.StackTrace);
|
||||
}
|
||||
|
||||
context.ExitCode = 1;
|
||||
}
|
||||
}
|
||||
|
||||
#region Output Formatting
|
||||
|
||||
public enum OutputFormat { Table, Json, Yaml }
|
||||
|
||||
public interface IOutputFormatter
|
||||
{
|
||||
void WriteTable<T>(IEnumerable<T> items, params (string Header, Func<T, object?> Selector)[] columns);
|
||||
void WriteJson<T>(T item);
|
||||
void WriteYaml<T>(T item);
|
||||
void WriteSuccess(string message);
|
||||
void WriteError(string message);
|
||||
void WriteWarning(string message);
|
||||
void WriteInfo(string message);
|
||||
}
|
||||
|
||||
public sealed class ConsoleOutputFormatter : IOutputFormatter
|
||||
{
|
||||
private readonly OutputFormat _format;
|
||||
|
||||
public ConsoleOutputFormatter(OutputFormat format)
|
||||
{
|
||||
_format = format;
|
||||
}
|
||||
|
||||
public void WriteTable<T>(IEnumerable<T> items, params (string Header, Func<T, object?> Selector)[] columns)
|
||||
{
|
||||
var itemList = items.ToList();
|
||||
|
||||
if (_format == OutputFormat.Json)
|
||||
{
|
||||
WriteJson(itemList);
|
||||
return;
|
||||
}
|
||||
|
||||
if (_format == OutputFormat.Yaml)
|
||||
{
|
||||
WriteYaml(itemList);
|
||||
return;
|
||||
}
|
||||
|
||||
// Calculate column widths
|
||||
var widths = columns.Select(c =>
|
||||
Math.Max(c.Header.Length, itemList.Any()
|
||||
? itemList.Max(i => (c.Selector(i)?.ToString()?.Length ?? 0))
|
||||
: 0)).ToArray();
|
||||
|
||||
// Print header
|
||||
for (int i = 0; i < columns.Length; i++)
|
||||
{
|
||||
Console.Write(columns[i].Header.PadRight(widths[i] + 2));
|
||||
}
|
||||
Console.WriteLine();
|
||||
|
||||
// Print separator
|
||||
for (int i = 0; i < columns.Length; i++)
|
||||
{
|
||||
Console.Write(new string('-', widths[i]) + " ");
|
||||
}
|
||||
Console.WriteLine();
|
||||
|
||||
// Print rows
|
||||
foreach (var item in itemList)
|
||||
{
|
||||
for (int i = 0; i < columns.Length; i++)
|
||||
{
|
||||
var value = columns[i].Selector(item)?.ToString() ?? "";
|
||||
Console.Write(value.PadRight(widths[i] + 2));
|
||||
}
|
||||
Console.WriteLine();
|
||||
}
|
||||
}
|
||||
|
||||
public void WriteJson<T>(T item)
|
||||
{
|
||||
var json = JsonSerializer.Serialize(item, new JsonSerializerOptions { WriteIndented = true });
|
||||
Console.WriteLine(json);
|
||||
}
|
||||
|
||||
public void WriteYaml<T>(T item)
|
||||
{
|
||||
// Simplified YAML output
|
||||
var json = JsonSerializer.Serialize(item, new JsonSerializerOptions { WriteIndented = true });
|
||||
Console.WriteLine(json); // Would use a YAML serializer in production
|
||||
}
|
||||
|
||||
public void WriteSuccess(string message)
|
||||
{
|
||||
Console.ForegroundColor = ConsoleColor.Green;
|
||||
Console.WriteLine($"✓ {message}");
|
||||
Console.ResetColor();
|
||||
}
|
||||
|
||||
public void WriteError(string message)
|
||||
{
|
||||
Console.ForegroundColor = ConsoleColor.Red;
|
||||
Console.Error.WriteLine($"✗ {message}");
|
||||
Console.ResetColor();
|
||||
}
|
||||
|
||||
public void WriteWarning(string message)
|
||||
{
|
||||
Console.ForegroundColor = ConsoleColor.Yellow;
|
||||
Console.WriteLine($"⚠ {message}");
|
||||
Console.ResetColor();
|
||||
}
|
||||
|
||||
public void WriteInfo(string message)
|
||||
{
|
||||
Console.WriteLine($"ℹ {message}");
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Command Handlers (Stubs)
|
||||
|
||||
public sealed class AuthCommandHandler
|
||||
{
|
||||
public Task LoginAsync(string server, bool interactive, string? token) => Task.CompletedTask;
|
||||
public Task LogoutAsync() => Task.CompletedTask;
|
||||
public Task StatusAsync() => Task.CompletedTask;
|
||||
public Task RefreshAsync() => Task.CompletedTask;
|
||||
}
|
||||
|
||||
public sealed class ConfigCommandHandler
|
||||
{
|
||||
public Task InitAsync(string? path) => Task.CompletedTask;
|
||||
public Task ShowAsync() => Task.CompletedTask;
|
||||
public Task SetAsync(string key, string value) => Task.CompletedTask;
|
||||
public Task GetAsync(string key) => Task.CompletedTask;
|
||||
public Task ValidateAsync() => Task.CompletedTask;
|
||||
}
|
||||
|
||||
public sealed class ReleaseCommandHandler
|
||||
{
|
||||
public Task CreateAsync(string service, string version, string? notes, bool draft) => Task.CompletedTask;
|
||||
public Task ListAsync(string? service, int limit, string? status) => Task.CompletedTask;
|
||||
public Task GetAsync(string releaseId) => Task.CompletedTask;
|
||||
public Task DiffAsync(string from, string to) => Task.CompletedTask;
|
||||
public Task HistoryAsync(string service) => Task.CompletedTask;
|
||||
}
|
||||
|
||||
public sealed class PromoteCommandHandler
|
||||
{
|
||||
public Task StartAsync(string release, string target, bool autoApprove) => Task.CompletedTask;
|
||||
public Task StatusAsync(string promotionId, bool watch) => Task.CompletedTask;
|
||||
public Task ApproveAsync(string promotionId, string? comment) => Task.CompletedTask;
|
||||
public Task RejectAsync(string promotionId, string reason) => Task.CompletedTask;
|
||||
public Task ListAsync(string? env, bool pending) => Task.CompletedTask;
|
||||
}
|
||||
|
||||
public sealed class DeployCommandHandler
|
||||
{
|
||||
public Task StartAsync(string release, string target, string strategy, bool dryRun) => Task.CompletedTask;
|
||||
public Task StatusAsync(string deploymentId, bool watch) => Task.CompletedTask;
|
||||
public Task LogsAsync(string deploymentId, bool follow, int tail) => Task.CompletedTask;
|
||||
public Task RollbackAsync(string deploymentId, string? reason) => Task.CompletedTask;
|
||||
public Task ListAsync(string? env, bool active) => Task.CompletedTask;
|
||||
}
|
||||
|
||||
public sealed class ScanCommandHandler
|
||||
{
|
||||
public Task RunAsync(string image, string? output, string failOn) => Task.CompletedTask;
|
||||
public Task ResultsAsync(string scanId) => Task.CompletedTask;
|
||||
}
|
||||
|
||||
public sealed class PolicyCommandHandler
|
||||
{
|
||||
public Task CheckAsync(string release) => Task.CompletedTask;
|
||||
public Task ListAsync() => Task.CompletedTask;
|
||||
}
|
||||
|
||||
#endregion
|
||||
227
src/Cli/StellaOps.Cli/Commands/Agent/BootstrapCommands.cs
Normal file
227
src/Cli/StellaOps.Cli/Commands/Agent/BootstrapCommands.cs
Normal file
@@ -0,0 +1,227 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.CommandLine;
|
||||
using StellaOps.Agent.Core.Bootstrap;
|
||||
|
||||
namespace StellaOps.Cli.Commands.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// CLI commands for agent bootstrapping.
|
||||
/// </summary>
|
||||
public static class BootstrapCommands
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates the 'agent bootstrap' command.
|
||||
/// </summary>
|
||||
public static Command CreateBootstrapCommand()
|
||||
{
|
||||
var command = new Command("bootstrap", "Bootstrap a new agent with zero-touch deployment");
|
||||
|
||||
var nameOption = new Option<string>(
|
||||
["--name", "-n"],
|
||||
"Agent name")
|
||||
{ IsRequired = true };
|
||||
|
||||
var envOption = new Option<string>(
|
||||
["--env", "-e"],
|
||||
() => "production",
|
||||
"Target environment");
|
||||
|
||||
var platformOption = new Option<string>(
|
||||
["--platform", "-p"],
|
||||
"Target platform (linux, windows, docker). Auto-detected if not specified.");
|
||||
|
||||
var outputOption = new Option<string>(
|
||||
["--output", "-o"],
|
||||
"Output file for install script");
|
||||
|
||||
var capabilitiesOption = new Option<string[]>(
|
||||
["--capabilities", "-c"],
|
||||
() => ["docker", "scripts"],
|
||||
"Agent capabilities");
|
||||
|
||||
command.AddOption(nameOption);
|
||||
command.AddOption(envOption);
|
||||
command.AddOption(platformOption);
|
||||
command.AddOption(outputOption);
|
||||
command.AddOption(capabilitiesOption);
|
||||
|
||||
command.SetHandler(async (name, env, platform, output, capabilities) =>
|
||||
{
|
||||
await HandleBootstrapAsync(name, env, platform, output, capabilities);
|
||||
}, nameOption, envOption, platformOption, outputOption, capabilitiesOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates the 'agent install-script' command.
|
||||
/// </summary>
|
||||
public static Command CreateInstallScriptCommand()
|
||||
{
|
||||
var command = new Command("install-script", "Generate an install script from a bootstrap token");
|
||||
|
||||
var tokenOption = new Option<string>(
|
||||
["--token", "-t"],
|
||||
"Bootstrap token")
|
||||
{ IsRequired = true };
|
||||
|
||||
var platformOption = new Option<string>(
|
||||
["--platform", "-p"],
|
||||
() => DetectPlatform(),
|
||||
"Target platform (linux, windows, docker)");
|
||||
|
||||
var outputOption = new Option<string>(
|
||||
["--output", "-o"],
|
||||
"Output file path");
|
||||
|
||||
command.AddOption(tokenOption);
|
||||
command.AddOption(platformOption);
|
||||
command.AddOption(outputOption);
|
||||
|
||||
command.SetHandler(async (token, platform, output) =>
|
||||
{
|
||||
await HandleInstallScriptAsync(token, platform, output);
|
||||
}, tokenOption, platformOption, outputOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
private static async Task HandleBootstrapAsync(
|
||||
string name,
|
||||
string environment,
|
||||
string? platform,
|
||||
string? output,
|
||||
string[] capabilities)
|
||||
{
|
||||
Console.WriteLine($"🚀 Bootstrapping agent: {name}");
|
||||
Console.WriteLine($" Environment: {environment}");
|
||||
Console.WriteLine($" Capabilities: {string.Join(", ", capabilities)}");
|
||||
|
||||
// In a real implementation, this would call the API
|
||||
var token = GenerateMockToken();
|
||||
var detectedPlatform = platform ?? DetectPlatform();
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("✅ Bootstrap token generated!");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
|
||||
switch (detectedPlatform.ToLowerInvariant())
|
||||
{
|
||||
case "linux":
|
||||
Console.WriteLine("📋 Linux one-liner (copy and run on target host):");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"curl -fsSL https://orchestrator.example.com/api/v1/agents/install.sh | STELLA_TOKEN=\"{token}\" bash");
|
||||
break;
|
||||
|
||||
case "windows":
|
||||
Console.WriteLine("📋 Windows one-liner (copy and run in PowerShell as Administrator):");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"$env:STELLA_TOKEN='{token}'; iwr -useb https://orchestrator.example.com/api/v1/agents/install.ps1 | iex");
|
||||
break;
|
||||
|
||||
case "docker":
|
||||
Console.WriteLine("📋 Docker one-liner:");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"docker run -d --name {name} -v /var/run/docker.sock:/var/run/docker.sock -e STELLA_TOKEN=\"{token}\" stellaops/agent:latest");
|
||||
break;
|
||||
}
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("⚠️ Token expires in 15 minutes");
|
||||
|
||||
if (!string.IsNullOrEmpty(output))
|
||||
{
|
||||
// Write to file
|
||||
await File.WriteAllTextAsync(output, $"STELLA_TOKEN={token}");
|
||||
Console.WriteLine($"📁 Token saved to: {output}");
|
||||
}
|
||||
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
private static async Task HandleInstallScriptAsync(
|
||||
string token,
|
||||
string platform,
|
||||
string? output)
|
||||
{
|
||||
var script = platform.ToLowerInvariant() switch
|
||||
{
|
||||
"linux" => GenerateLinuxScript(token),
|
||||
"windows" => GenerateWindowsScript(token),
|
||||
"docker" => GenerateDockerCompose(token),
|
||||
_ => throw new ArgumentException($"Unknown platform: {platform}")
|
||||
};
|
||||
|
||||
if (!string.IsNullOrEmpty(output))
|
||||
{
|
||||
await File.WriteAllTextAsync(output, script);
|
||||
Console.WriteLine($"✅ Install script written to: {output}");
|
||||
}
|
||||
else
|
||||
{
|
||||
Console.WriteLine(script);
|
||||
}
|
||||
}
|
||||
|
||||
private static string DetectPlatform()
|
||||
{
|
||||
if (OperatingSystem.IsWindows()) return "windows";
|
||||
if (OperatingSystem.IsLinux()) return "linux";
|
||||
if (OperatingSystem.IsMacOS()) return "linux"; // Use Linux scripts for macOS
|
||||
return "docker";
|
||||
}
|
||||
|
||||
private static string GenerateMockToken() =>
|
||||
Convert.ToBase64String(Guid.NewGuid().ToByteArray()).Replace('+', '-').Replace('/', '_').TrimEnd('=');
|
||||
|
||||
private static string GenerateLinuxScript(string token) => $"""
|
||||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
|
||||
# Stella Ops Agent Installation Script
|
||||
STELLA_TOKEN="{token}"
|
||||
STELLA_ORCHESTRATOR="https://orchestrator.example.com"
|
||||
|
||||
echo "Installing Stella Ops Agent..."
|
||||
|
||||
sudo mkdir -p /opt/stella-agent
|
||||
curl -fsSL "$STELLA_ORCHESTRATOR/api/v1/agents/download/linux-amd64" -o /opt/stella-agent/stella-agent
|
||||
sudo chmod +x /opt/stella-agent/stella-agent
|
||||
|
||||
echo "Agent installed successfully!"
|
||||
""";
|
||||
|
||||
private static string GenerateWindowsScript(string token) => $"""
|
||||
# Stella Ops Agent Installation Script (Windows)
|
||||
$ErrorActionPreference = "Stop"
|
||||
|
||||
$StellaToken = "{token}"
|
||||
$StellaOrchestrator = "https://orchestrator.example.com"
|
||||
|
||||
Write-Host "Installing Stella Ops Agent..."
|
||||
|
||||
New-Item -ItemType Directory -Force -Path "C:\Program Files\Stella Agent" | Out-Null
|
||||
Invoke-WebRequest -Uri "$StellaOrchestrator/api/v1/agents/download/windows-amd64" -OutFile "C:\Program Files\Stella Agent\stella-agent.exe"
|
||||
|
||||
Write-Host "Agent installed successfully!"
|
||||
""";
|
||||
|
||||
private static string GenerateDockerCompose(string token) => $"""
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
stella-agent:
|
||||
image: stellaops/agent:latest
|
||||
container_name: stella-agent
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- STELLA_TOKEN={token}
|
||||
- STELLA_ORCHESTRATOR=https://orchestrator.example.com
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
""";
|
||||
}
|
||||
127
src/Cli/StellaOps.Cli/Commands/Agent/CertificateCommands.cs
Normal file
127
src/Cli/StellaOps.Cli/Commands/Agent/CertificateCommands.cs
Normal file
@@ -0,0 +1,127 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.CommandLine;
|
||||
|
||||
namespace StellaOps.Cli.Commands.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// CLI commands for agent certificate management.
|
||||
/// </summary>
|
||||
public static class CertificateCommands
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates the 'agent renew-cert' command.
|
||||
/// </summary>
|
||||
public static Command CreateRenewCertCommand()
|
||||
{
|
||||
var command = new Command("renew-cert", "Renew agent mTLS certificate");
|
||||
|
||||
var forceOption = new Option<bool>(
|
||||
["--force", "-f"],
|
||||
() => false,
|
||||
"Force renewal even if certificate is not near expiry");
|
||||
|
||||
command.AddOption(forceOption);
|
||||
|
||||
command.SetHandler(async (force) =>
|
||||
{
|
||||
await HandleRenewCertAsync(force);
|
||||
}, forceOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates the 'agent cert-status' command.
|
||||
/// </summary>
|
||||
public static Command CreateCertStatusCommand()
|
||||
{
|
||||
var command = new Command("cert-status", "Show certificate status");
|
||||
|
||||
command.SetHandler(async () =>
|
||||
{
|
||||
await HandleCertStatusAsync();
|
||||
});
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
private static async Task HandleRenewCertAsync(bool force)
|
||||
{
|
||||
Console.WriteLine("🔐 Certificate Renewal");
|
||||
Console.WriteLine();
|
||||
|
||||
if (force)
|
||||
{
|
||||
Console.WriteLine("⚠️ Force renewal requested");
|
||||
}
|
||||
|
||||
// Simulate certificate check
|
||||
Console.WriteLine("🔍 Checking current certificate...");
|
||||
await Task.Delay(300);
|
||||
|
||||
var daysUntilExpiry = 45;
|
||||
|
||||
if (!force && daysUntilExpiry > 7)
|
||||
{
|
||||
Console.WriteLine($"ℹ️ Current certificate is valid for {daysUntilExpiry} days");
|
||||
Console.WriteLine(" Renewal not required. Use --force to renew anyway.");
|
||||
return;
|
||||
}
|
||||
|
||||
Console.WriteLine("📝 Generating certificate signing request...");
|
||||
await Task.Delay(200);
|
||||
|
||||
Console.WriteLine("📤 Submitting CSR to orchestrator...");
|
||||
await Task.Delay(500);
|
||||
|
||||
Console.WriteLine("📥 Receiving signed certificate...");
|
||||
await Task.Delay(300);
|
||||
|
||||
Console.WriteLine("💾 Storing new certificate...");
|
||||
await Task.Delay(200);
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("✅ Certificate renewed successfully!");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("New certificate details:");
|
||||
Console.WriteLine($" Subject: CN=agent-abc123");
|
||||
Console.WriteLine($" Issuer: CN=Stella Ops CA");
|
||||
Console.WriteLine($" Valid from: {DateTime.UtcNow:yyyy-MM-dd}");
|
||||
Console.WriteLine($" Valid until: {DateTime.UtcNow.AddDays(90):yyyy-MM-dd}");
|
||||
Console.WriteLine($" Thumbprint: 5A:B3:C2:D1:...");
|
||||
}
|
||||
|
||||
private static async Task HandleCertStatusAsync()
|
||||
{
|
||||
Console.WriteLine("🔐 Certificate Status");
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
Console.WriteLine();
|
||||
|
||||
// Simulate certificate info
|
||||
await Task.Delay(100);
|
||||
|
||||
var expiresAt = DateTime.UtcNow.AddDays(45);
|
||||
var daysRemaining = 45;
|
||||
|
||||
Console.WriteLine("Current Certificate:");
|
||||
Console.WriteLine($" Subject: CN=agent-abc123");
|
||||
Console.WriteLine($" Issuer: CN=Stella Ops CA");
|
||||
Console.WriteLine($" Valid from: {DateTime.UtcNow.AddDays(-45):yyyy-MM-dd HH:mm:ss} UTC");
|
||||
Console.WriteLine($" Valid until: {expiresAt:yyyy-MM-dd HH:mm:ss} UTC");
|
||||
Console.WriteLine($" Thumbprint: 5A:B3:C2:D1:E5:F6:A7:B8:C9:D0:E1:F2:A3:B4:C5:D6:E7:F8:A9:B0");
|
||||
Console.WriteLine();
|
||||
|
||||
var statusIcon = daysRemaining > 14 ? "✅" : daysRemaining > 7 ? "⚠️" : "🚨";
|
||||
var statusText = daysRemaining > 14 ? "Valid" : daysRemaining > 7 ? "Expiring soon" : "Critical - renew immediately";
|
||||
|
||||
Console.WriteLine($"Status: {statusIcon} {statusText}");
|
||||
Console.WriteLine($"Days remaining: {daysRemaining}");
|
||||
Console.WriteLine();
|
||||
|
||||
if (daysRemaining <= 14)
|
||||
{
|
||||
Console.WriteLine("💡 Run 'stella agent renew-cert' to renew the certificate");
|
||||
}
|
||||
}
|
||||
}
|
||||
241
src/Cli/StellaOps.Cli/Commands/Agent/ConfigCommands.cs
Normal file
241
src/Cli/StellaOps.Cli/Commands/Agent/ConfigCommands.cs
Normal file
@@ -0,0 +1,241 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.CommandLine;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace StellaOps.Cli.Commands.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// CLI commands for agent configuration management.
|
||||
/// </summary>
|
||||
public static class ConfigCommands
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates the 'agent config' command.
|
||||
/// </summary>
|
||||
public static Command CreateConfigCommand()
|
||||
{
|
||||
var command = new Command("config", "Show agent configuration");
|
||||
|
||||
var diffOption = new Option<bool>(
|
||||
["--diff", "-d"],
|
||||
() => false,
|
||||
"Show drift between current and desired configuration");
|
||||
|
||||
var formatOption = new Option<string>(
|
||||
["--format"],
|
||||
() => "yaml",
|
||||
"Output format (yaml, json)");
|
||||
|
||||
command.AddOption(diffOption);
|
||||
command.AddOption(formatOption);
|
||||
|
||||
command.SetHandler(async (diff, format) =>
|
||||
{
|
||||
await HandleConfigAsync(diff, format);
|
||||
}, diffOption, formatOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates the 'agent apply' command.
|
||||
/// </summary>
|
||||
public static Command CreateApplyCommand()
|
||||
{
|
||||
var command = new Command("apply", "Apply agent configuration");
|
||||
|
||||
var fileOption = new Option<string>(
|
||||
["--file", "-f"],
|
||||
"Configuration file path")
|
||||
{ IsRequired = true };
|
||||
|
||||
var dryRunOption = new Option<bool>(
|
||||
["--dry-run"],
|
||||
() => false,
|
||||
"Validate without applying");
|
||||
|
||||
command.AddOption(fileOption);
|
||||
command.AddOption(dryRunOption);
|
||||
|
||||
command.SetHandler(async (file, dryRun) =>
|
||||
{
|
||||
await HandleApplyAsync(file, dryRun);
|
||||
}, fileOption, dryRunOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
private static async Task HandleConfigAsync(bool diff, string format)
|
||||
{
|
||||
if (diff)
|
||||
{
|
||||
Console.WriteLine("🔍 Checking for configuration drift...");
|
||||
Console.WriteLine();
|
||||
|
||||
// Simulated drift output
|
||||
Console.WriteLine("Configuration Drift Report");
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("✅ No configuration drift detected");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Current version: 1");
|
||||
Console.WriteLine("Desired version: 1");
|
||||
}
|
||||
else
|
||||
{
|
||||
Console.WriteLine("# Current Agent Configuration");
|
||||
Console.WriteLine();
|
||||
|
||||
var config = GetMockConfiguration();
|
||||
|
||||
if (format == "json")
|
||||
{
|
||||
var json = JsonSerializer.Serialize(config, new JsonSerializerOptions { WriteIndented = true });
|
||||
Console.WriteLine(json);
|
||||
}
|
||||
else
|
||||
{
|
||||
// YAML-like output
|
||||
Console.WriteLine("identity:");
|
||||
Console.WriteLine($" agentId: {config.Identity.AgentId}");
|
||||
Console.WriteLine($" agentName: {config.Identity.AgentName}");
|
||||
Console.WriteLine($" environment: {config.Identity.Environment}");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("connection:");
|
||||
Console.WriteLine($" orchestratorUrl: {config.Connection.OrchestratorUrl}");
|
||||
Console.WriteLine($" heartbeatInterval: {config.Connection.HeartbeatInterval}");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("capabilities:");
|
||||
Console.WriteLine($" docker: {config.Capabilities.Docker}");
|
||||
Console.WriteLine($" scripts: {config.Capabilities.Scripts}");
|
||||
Console.WriteLine($" compose: {config.Capabilities.Compose}");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("resources:");
|
||||
Console.WriteLine($" maxConcurrentTasks: {config.Resources.MaxConcurrentTasks}");
|
||||
Console.WriteLine($" workDirectory: {config.Resources.WorkDirectory}");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("security:");
|
||||
Console.WriteLine(" certificate:");
|
||||
Console.WriteLine($" source: {config.Security.Certificate.Source}");
|
||||
}
|
||||
}
|
||||
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
private static async Task HandleApplyAsync(string file, bool dryRun)
|
||||
{
|
||||
if (!File.Exists(file))
|
||||
{
|
||||
Console.WriteLine($"❌ Configuration file not found: {file}");
|
||||
return;
|
||||
}
|
||||
|
||||
Console.WriteLine($"📄 Loading configuration from: {file}");
|
||||
var content = await File.ReadAllTextAsync(file);
|
||||
|
||||
Console.WriteLine("🔍 Validating configuration...");
|
||||
|
||||
// Simulate validation
|
||||
await Task.Delay(200);
|
||||
|
||||
Console.WriteLine("✅ Configuration is valid");
|
||||
Console.WriteLine();
|
||||
|
||||
if (dryRun)
|
||||
{
|
||||
Console.WriteLine("🔵 Dry-run mode: no changes applied");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Changes that would be applied:");
|
||||
Console.WriteLine(" - resources.maxConcurrentTasks: 5 → 10");
|
||||
Console.WriteLine(" - observability.metrics.enabled: false → true");
|
||||
}
|
||||
else
|
||||
{
|
||||
Console.WriteLine("🚀 Applying configuration...");
|
||||
await Task.Delay(500);
|
||||
Console.WriteLine("✅ Configuration applied successfully");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Rollback version: 1 (use 'stella agent config rollback 1' to revert)");
|
||||
}
|
||||
}
|
||||
|
||||
private static AgentConfigModel GetMockConfiguration() => new()
|
||||
{
|
||||
Identity = new IdentityModel
|
||||
{
|
||||
AgentId = "agent-abc123",
|
||||
AgentName = "prod-agent-01",
|
||||
Environment = "production"
|
||||
},
|
||||
Connection = new ConnectionModel
|
||||
{
|
||||
OrchestratorUrl = "https://orchestrator.example.com",
|
||||
HeartbeatInterval = "30s"
|
||||
},
|
||||
Capabilities = new CapabilitiesModel
|
||||
{
|
||||
Docker = true,
|
||||
Scripts = true,
|
||||
Compose = true
|
||||
},
|
||||
Resources = new ResourcesModel
|
||||
{
|
||||
MaxConcurrentTasks = 5,
|
||||
WorkDirectory = "/var/lib/stella-agent"
|
||||
},
|
||||
Security = new SecurityModel
|
||||
{
|
||||
Certificate = new CertificateModel
|
||||
{
|
||||
Source = "AutoProvision"
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private sealed record AgentConfigModel
|
||||
{
|
||||
public required IdentityModel Identity { get; init; }
|
||||
public required ConnectionModel Connection { get; init; }
|
||||
public required CapabilitiesModel Capabilities { get; init; }
|
||||
public required ResourcesModel Resources { get; init; }
|
||||
public required SecurityModel Security { get; init; }
|
||||
}
|
||||
|
||||
private sealed record IdentityModel
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public string? AgentName { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
}
|
||||
|
||||
private sealed record ConnectionModel
|
||||
{
|
||||
public required string OrchestratorUrl { get; init; }
|
||||
public string HeartbeatInterval { get; init; } = "30s";
|
||||
}
|
||||
|
||||
private sealed record CapabilitiesModel
|
||||
{
|
||||
public bool Docker { get; init; } = true;
|
||||
public bool Scripts { get; init; } = true;
|
||||
public bool Compose { get; init; } = true;
|
||||
}
|
||||
|
||||
private sealed record ResourcesModel
|
||||
{
|
||||
public int MaxConcurrentTasks { get; init; } = 5;
|
||||
public string WorkDirectory { get; init; } = "/var/lib/stella-agent";
|
||||
}
|
||||
|
||||
private sealed record SecurityModel
|
||||
{
|
||||
public required CertificateModel Certificate { get; init; }
|
||||
}
|
||||
|
||||
private sealed record CertificateModel
|
||||
{
|
||||
public string Source { get; init; } = "AutoProvision";
|
||||
}
|
||||
}
|
||||
220
src/Cli/StellaOps.Cli/Commands/Agent/DoctorCommands.cs
Normal file
220
src/Cli/StellaOps.Cli/Commands/Agent/DoctorCommands.cs
Normal file
@@ -0,0 +1,220 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.CommandLine;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace StellaOps.Cli.Commands.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// CLI commands for agent diagnostics (Doctor).
|
||||
/// </summary>
|
||||
public static class DoctorCommands
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates the 'agent doctor' command.
|
||||
/// </summary>
|
||||
public static Command CreateDoctorCommand()
|
||||
{
|
||||
var command = new Command("doctor", "Run agent health diagnostics");
|
||||
|
||||
var agentIdOption = new Option<string?>(
|
||||
["--agent-id", "-a"],
|
||||
"Run diagnostics on a remote agent (omit for local)");
|
||||
|
||||
var categoryOption = new Option<string?>(
|
||||
["--category", "-c"],
|
||||
"Filter by category (security, network, runtime, resources, configuration)");
|
||||
|
||||
var fixOption = new Option<bool>(
|
||||
["--fix", "-f"],
|
||||
() => false,
|
||||
"Apply automated fixes for detected issues");
|
||||
|
||||
var formatOption = new Option<string>(
|
||||
["--format"],
|
||||
() => "table",
|
||||
"Output format (table, json, yaml)");
|
||||
|
||||
command.AddOption(agentIdOption);
|
||||
command.AddOption(categoryOption);
|
||||
command.AddOption(fixOption);
|
||||
command.AddOption(formatOption);
|
||||
|
||||
command.SetHandler(async (agentId, category, fix, format) =>
|
||||
{
|
||||
await HandleDoctorAsync(agentId, category, fix, format);
|
||||
}, agentIdOption, categoryOption, fixOption, formatOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
private static async Task HandleDoctorAsync(
|
||||
string? agentId,
|
||||
string? category,
|
||||
bool fix,
|
||||
string format)
|
||||
{
|
||||
var isLocal = string.IsNullOrEmpty(agentId);
|
||||
|
||||
Console.WriteLine(isLocal
|
||||
? "🔍 Running local agent diagnostics..."
|
||||
: $"🔍 Running diagnostics on agent: {agentId}");
|
||||
|
||||
if (!string.IsNullOrEmpty(category))
|
||||
{
|
||||
Console.WriteLine($" Category filter: {category}");
|
||||
}
|
||||
|
||||
Console.WriteLine();
|
||||
|
||||
// Simulated diagnostic results
|
||||
var results = GetMockDiagnosticResults(category);
|
||||
|
||||
if (format == "json")
|
||||
{
|
||||
var json = JsonSerializer.Serialize(results, new JsonSerializerOptions { WriteIndented = true });
|
||||
Console.WriteLine(json);
|
||||
}
|
||||
else
|
||||
{
|
||||
RenderTableOutput(results);
|
||||
}
|
||||
|
||||
// Show summary
|
||||
var passed = results.Count(r => r.Status == "Healthy");
|
||||
var warnings = results.Count(r => r.Status == "Warning");
|
||||
var failed = results.Count(r => r.Status == "Unhealthy" || r.Status == "Critical");
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
Console.WriteLine($"Summary: {passed} passed, {warnings} warnings, {failed} failed");
|
||||
|
||||
if (fix && (warnings > 0 || failed > 0))
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("🔧 Applying automated fixes...");
|
||||
await ApplyFixesAsync(results);
|
||||
}
|
||||
else if (warnings > 0 || failed > 0)
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("💡 Run with --fix to apply automated remediation");
|
||||
}
|
||||
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
private static void RenderTableOutput(List<DiagnosticResult> results)
|
||||
{
|
||||
Console.WriteLine($"{"Check",-30} {"Category",-15} {"Status",-10} {"Message"}");
|
||||
Console.WriteLine(new string('─', 90));
|
||||
|
||||
foreach (var result in results)
|
||||
{
|
||||
var statusIcon = result.Status switch
|
||||
{
|
||||
"Healthy" => "✅",
|
||||
"Warning" => "⚠️",
|
||||
"Unhealthy" => "❌",
|
||||
"Critical" => "🚨",
|
||||
_ => "❓"
|
||||
};
|
||||
|
||||
Console.WriteLine($"{result.CheckName,-30} {result.Category,-15} {statusIcon,-10} {result.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
private static async Task ApplyFixesAsync(List<DiagnosticResult> results)
|
||||
{
|
||||
var fixableResults = results.Where(r =>
|
||||
r.Status != "Healthy" && r.AutomatedFix != null).ToList();
|
||||
|
||||
foreach (var result in fixableResults)
|
||||
{
|
||||
Console.WriteLine($" Fixing: {result.CheckName}...");
|
||||
await Task.Delay(500); // Simulate fix
|
||||
Console.WriteLine($" ✅ Fixed: {result.AutomatedFix}");
|
||||
}
|
||||
|
||||
if (fixableResults.Count == 0)
|
||||
{
|
||||
Console.WriteLine(" No automated fixes available for detected issues.");
|
||||
Console.WriteLine(" See remediation steps below for manual resolution.");
|
||||
}
|
||||
}
|
||||
|
||||
private static List<DiagnosticResult> GetMockDiagnosticResults(string? categoryFilter)
|
||||
{
|
||||
var results = new List<DiagnosticResult>
|
||||
{
|
||||
new()
|
||||
{
|
||||
CheckName = "CertificateExpiry",
|
||||
Category = "Security",
|
||||
Status = "Healthy",
|
||||
Message = "Certificate valid for 45 days"
|
||||
},
|
||||
new()
|
||||
{
|
||||
CheckName = "OrchestratorConnectivity",
|
||||
Category = "Network",
|
||||
Status = "Healthy",
|
||||
Message = "Connected to orchestrator"
|
||||
},
|
||||
new()
|
||||
{
|
||||
CheckName = "DockerConnectivity",
|
||||
Category = "Runtime",
|
||||
Status = "Healthy",
|
||||
Message = "Docker daemon accessible"
|
||||
},
|
||||
new()
|
||||
{
|
||||
CheckName = "DiskSpace",
|
||||
Category = "Resources",
|
||||
Status = "Warning",
|
||||
Message = "Disk space low: 5.2 GB available",
|
||||
AutomatedFix = "docker system prune"
|
||||
},
|
||||
new()
|
||||
{
|
||||
CheckName = "MemoryUsage",
|
||||
Category = "Resources",
|
||||
Status = "Healthy",
|
||||
Message = "Memory usage: 42%"
|
||||
},
|
||||
new()
|
||||
{
|
||||
CheckName = "ConfigurationDrift",
|
||||
Category = "Configuration",
|
||||
Status = "Healthy",
|
||||
Message = "No configuration drift detected"
|
||||
},
|
||||
new()
|
||||
{
|
||||
CheckName = "HeartbeatFreshness",
|
||||
Category = "Network",
|
||||
Status = "Healthy",
|
||||
Message = "Last heartbeat: 15s ago"
|
||||
}
|
||||
};
|
||||
|
||||
if (!string.IsNullOrEmpty(categoryFilter))
|
||||
{
|
||||
results = results
|
||||
.Where(r => r.Category.Equals(categoryFilter, StringComparison.OrdinalIgnoreCase))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private sealed record DiagnosticResult
|
||||
{
|
||||
public required string CheckName { get; init; }
|
||||
public required string Category { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public string? AutomatedFix { get; init; }
|
||||
}
|
||||
}
|
||||
160
src/Cli/StellaOps.Cli/Commands/Agent/UpdateCommands.cs
Normal file
160
src/Cli/StellaOps.Cli/Commands/Agent/UpdateCommands.cs
Normal file
@@ -0,0 +1,160 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.CommandLine;
|
||||
|
||||
namespace StellaOps.Cli.Commands.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// CLI commands for agent updates.
|
||||
/// </summary>
|
||||
public static class UpdateCommands
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates the 'agent update' command.
|
||||
/// </summary>
|
||||
public static Command CreateUpdateCommand()
|
||||
{
|
||||
var command = new Command("update", "Check and apply agent updates");
|
||||
|
||||
var versionOption = new Option<string?>(
|
||||
["--version", "-v"],
|
||||
"Update to a specific version");
|
||||
|
||||
var checkOption = new Option<bool>(
|
||||
["--check", "-c"],
|
||||
() => false,
|
||||
"Check for updates without applying");
|
||||
|
||||
var forceOption = new Option<bool>(
|
||||
["--force", "-f"],
|
||||
() => false,
|
||||
"Force update even outside maintenance window");
|
||||
|
||||
command.AddOption(versionOption);
|
||||
command.AddOption(checkOption);
|
||||
command.AddOption(forceOption);
|
||||
|
||||
command.SetHandler(async (version, check, force) =>
|
||||
{
|
||||
await HandleUpdateAsync(version, check, force);
|
||||
}, versionOption, checkOption, forceOption);
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates the 'agent rollback' command.
|
||||
/// </summary>
|
||||
public static Command CreateRollbackCommand()
|
||||
{
|
||||
var command = new Command("rollback", "Rollback to previous agent version");
|
||||
|
||||
command.SetHandler(async () =>
|
||||
{
|
||||
await HandleRollbackAsync();
|
||||
});
|
||||
|
||||
return command;
|
||||
}
|
||||
|
||||
private static async Task HandleUpdateAsync(string? version, bool checkOnly, bool force)
|
||||
{
|
||||
Console.WriteLine("🔄 Agent Update");
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
Console.WriteLine();
|
||||
|
||||
// Check current version
|
||||
var currentVersion = "1.2.3";
|
||||
Console.WriteLine($"Current version: {currentVersion}");
|
||||
|
||||
// Check for updates
|
||||
Console.WriteLine("🔍 Checking for updates...");
|
||||
await Task.Delay(500);
|
||||
|
||||
var availableVersion = version ?? "1.3.0";
|
||||
var isNewer = string.Compare(availableVersion, currentVersion, StringComparison.Ordinal) > 0;
|
||||
|
||||
if (!isNewer && string.IsNullOrEmpty(version))
|
||||
{
|
||||
Console.WriteLine("✅ Already running the latest version");
|
||||
return;
|
||||
}
|
||||
|
||||
Console.WriteLine($"Available version: {availableVersion}");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Release notes:");
|
||||
Console.WriteLine(" - Improved Docker container health monitoring");
|
||||
Console.WriteLine(" - Fixed certificate renewal edge case");
|
||||
Console.WriteLine(" - Performance improvements for task execution");
|
||||
Console.WriteLine();
|
||||
|
||||
if (checkOnly)
|
||||
{
|
||||
Console.WriteLine("ℹ️ Check-only mode. Run without --check to apply update.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Check maintenance window (simulated)
|
||||
var inMaintenanceWindow = true;
|
||||
if (!inMaintenanceWindow && !force)
|
||||
{
|
||||
Console.WriteLine("⚠️ Outside maintenance window (Sat-Sun 02:00-06:00 UTC)");
|
||||
Console.WriteLine(" Use --force to update anyway");
|
||||
return;
|
||||
}
|
||||
|
||||
Console.WriteLine("📥 Downloading update package...");
|
||||
await Task.Delay(800);
|
||||
|
||||
Console.WriteLine("🔐 Verifying package signature...");
|
||||
await Task.Delay(300);
|
||||
Console.WriteLine("✅ Signature verified");
|
||||
|
||||
Console.WriteLine("💾 Creating rollback point...");
|
||||
await Task.Delay(200);
|
||||
|
||||
Console.WriteLine("⏸️ Draining active tasks...");
|
||||
await Task.Delay(500);
|
||||
|
||||
Console.WriteLine("📦 Applying update...");
|
||||
await Task.Delay(1000);
|
||||
|
||||
Console.WriteLine("🔍 Verifying agent health...");
|
||||
await Task.Delay(500);
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("✅ Update completed successfully!");
|
||||
Console.WriteLine($" {currentVersion} → {availableVersion}");
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("💡 Run 'stella agent rollback' if you encounter issues");
|
||||
}
|
||||
|
||||
private static async Task HandleRollbackAsync()
|
||||
{
|
||||
Console.WriteLine("🔄 Agent Rollback");
|
||||
Console.WriteLine("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
|
||||
Console.WriteLine();
|
||||
|
||||
Console.WriteLine("🔍 Finding rollback points...");
|
||||
await Task.Delay(300);
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Available rollback points:");
|
||||
Console.WriteLine(" 1. v1.2.3 (2026-01-16 14:30 UTC) - before update to 1.3.0");
|
||||
Console.WriteLine(" 2. v1.2.2 (2026-01-10 08:15 UTC) - before update to 1.2.3");
|
||||
Console.WriteLine();
|
||||
|
||||
Console.WriteLine("⏸️ Draining active tasks...");
|
||||
await Task.Delay(300);
|
||||
|
||||
Console.WriteLine("📦 Restoring previous version...");
|
||||
await Task.Delay(800);
|
||||
|
||||
Console.WriteLine("🔍 Verifying agent health...");
|
||||
await Task.Delay(400);
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("✅ Rollback completed successfully!");
|
||||
Console.WriteLine(" Restored to version: 1.2.3");
|
||||
}
|
||||
}
|
||||
370
src/Cli/StellaOps.Cli/Commands/DeployCommandHandler.cs
Normal file
370
src/Cli/StellaOps.Cli/Commands/DeployCommandHandler.cs
Normal file
@@ -0,0 +1,370 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// DeployCommandHandler.cs
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-04 - Deployment Commands (deploy, status, logs, rollback)
|
||||
// Description: Full implementation of deployment CLI commands
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Cli.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Handles all deployment-related CLI commands.
|
||||
/// </summary>
|
||||
public sealed class DeployCommandHandler
|
||||
{
|
||||
private readonly IStellaApiClient _apiClient;
|
||||
private readonly IOutputFormatter _formatter;
|
||||
|
||||
public DeployCommandHandler(IStellaApiClient apiClient, IOutputFormatter formatter)
|
||||
{
|
||||
_apiClient = apiClient;
|
||||
_formatter = formatter;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts a deployment.
|
||||
/// </summary>
|
||||
public async Task StartAsync(string release, string target, string strategy, bool dryRun)
|
||||
{
|
||||
if (dryRun)
|
||||
{
|
||||
_formatter.WriteInfo($"[DRY RUN] Simulating deployment of {release} to {target}...");
|
||||
}
|
||||
else
|
||||
{
|
||||
_formatter.WriteInfo($"Starting deployment of {release} to {target}...");
|
||||
}
|
||||
|
||||
var request = new StartDeploymentRequest
|
||||
{
|
||||
ReleaseId = release,
|
||||
TargetEnvironment = target,
|
||||
Strategy = strategy,
|
||||
DryRun = dryRun
|
||||
};
|
||||
|
||||
var response = await _apiClient.PostAsync<StartDeploymentRequest, DeploymentResponse>(
|
||||
"/api/v1/deployments", request);
|
||||
|
||||
if (dryRun)
|
||||
{
|
||||
_formatter.WriteSuccess($"Dry run completed. No changes made.");
|
||||
PrintDryRunResults(response);
|
||||
}
|
||||
else
|
||||
{
|
||||
_formatter.WriteSuccess($"Deployment started: {response.Id}");
|
||||
_formatter.WriteInfo("\nWatch progress with:");
|
||||
Console.WriteLine($" stella deploy status {response.Id} --watch");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the status of a deployment.
|
||||
/// </summary>
|
||||
public async Task StatusAsync(string deploymentId, bool watch)
|
||||
{
|
||||
if (watch)
|
||||
{
|
||||
await WatchDeploymentAsync(deploymentId);
|
||||
return;
|
||||
}
|
||||
|
||||
var deployment = await _apiClient.GetAsync<DeploymentDetailResponse>(
|
||||
$"/api/v1/deployments/{deploymentId}");
|
||||
|
||||
PrintDeploymentDetail(deployment);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Streams deployment logs.
|
||||
/// </summary>
|
||||
public async Task LogsAsync(string deploymentId, bool follow, int tail)
|
||||
{
|
||||
if (follow)
|
||||
{
|
||||
await StreamLogsAsync(deploymentId);
|
||||
return;
|
||||
}
|
||||
|
||||
var logs = await _apiClient.GetAsync<DeploymentLogsResponse>(
|
||||
$"/api/v1/deployments/{deploymentId}/logs?tail={tail}");
|
||||
|
||||
foreach (var entry in logs.Entries)
|
||||
{
|
||||
PrintLogEntry(entry);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rolls back a deployment.
|
||||
/// </summary>
|
||||
public async Task RollbackAsync(string deploymentId, string? reason)
|
||||
{
|
||||
_formatter.WriteWarning($"Rolling back deployment {deploymentId}...");
|
||||
|
||||
var request = new RollbackDeploymentRequest
|
||||
{
|
||||
Reason = reason
|
||||
};
|
||||
|
||||
var response = await _apiClient.PostAsync<RollbackDeploymentRequest, DeploymentResponse>(
|
||||
$"/api/v1/deployments/{deploymentId}/rollback", request);
|
||||
|
||||
_formatter.WriteSuccess($"Rollback initiated. New deployment: {response.Id}");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists deployments with optional filters.
|
||||
/// </summary>
|
||||
public async Task ListAsync(string? env, bool active)
|
||||
{
|
||||
var queryParams = new List<string>();
|
||||
if (env is not null) queryParams.Add($"environment={env}");
|
||||
if (active) queryParams.Add("active=true");
|
||||
|
||||
var query = queryParams.Any() ? "?" + string.Join("&", queryParams) : "";
|
||||
|
||||
var deployments = await _apiClient.GetAsync<List<DeploymentResponse>>($"/api/v1/deployments{query}");
|
||||
|
||||
if (deployments.Count == 0)
|
||||
{
|
||||
_formatter.WriteInfo("No deployments found.");
|
||||
return;
|
||||
}
|
||||
|
||||
_formatter.WriteTable(deployments,
|
||||
("ID", d => d.Id),
|
||||
("Release", d => d.ReleaseId),
|
||||
("Version", d => d.Version),
|
||||
("Target", d => d.TargetEnvironment),
|
||||
("Strategy", d => d.Strategy),
|
||||
("Status", d => d.Status),
|
||||
("Started", d => d.StartedAt.ToString("g")));
|
||||
}
|
||||
|
||||
private void PrintDeploymentDetail(DeploymentDetailResponse deployment)
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Deployment: {deployment.Id}");
|
||||
Console.WriteLine($"Release: {deployment.ReleaseId}");
|
||||
Console.WriteLine($"Version: {deployment.Version}");
|
||||
Console.WriteLine($"Target: {deployment.TargetEnvironment}");
|
||||
Console.WriteLine($"Strategy: {deployment.Strategy}");
|
||||
Console.WriteLine($"Status: {deployment.Status}");
|
||||
Console.WriteLine($"Started: {deployment.StartedAt:g}");
|
||||
|
||||
if (deployment.CompletedAt.HasValue)
|
||||
{
|
||||
var duration = deployment.CompletedAt.Value - deployment.StartedAt;
|
||||
Console.WriteLine($"Completed: {deployment.CompletedAt:g} (took {duration.TotalMinutes:F1} min)");
|
||||
}
|
||||
|
||||
if (deployment.Replicas is not null)
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Replica Status:");
|
||||
Console.WriteLine($" Total: {deployment.Replicas.Total}");
|
||||
Console.WriteLine($" Ready: {deployment.Replicas.Ready}");
|
||||
Console.WriteLine($" Updated: {deployment.Replicas.Updated}");
|
||||
Console.WriteLine($" Available: {deployment.Replicas.Available}");
|
||||
}
|
||||
|
||||
if (deployment.Instances.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Instances:");
|
||||
_formatter.WriteTable(deployment.Instances,
|
||||
("Host", i => i.Host),
|
||||
("Status", i => i.Status),
|
||||
("Version", i => i.Version),
|
||||
("Health", i => i.HealthStatus));
|
||||
}
|
||||
|
||||
if (deployment.Events.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Recent Events:");
|
||||
foreach (var evt in deployment.Events.TakeLast(10))
|
||||
{
|
||||
Console.WriteLine($" [{evt.Timestamp:HH:mm:ss}] {evt.Type}: {evt.Message}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void PrintDryRunResults(DeploymentResponse response)
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Changes that would be made:");
|
||||
Console.WriteLine($" - Deploy version: {response.Version}");
|
||||
Console.WriteLine($" - Target environment: {response.TargetEnvironment}");
|
||||
Console.WriteLine($" - Strategy: {response.Strategy}");
|
||||
Console.WriteLine($" - Affected instances: (simulated)");
|
||||
}
|
||||
|
||||
private void PrintLogEntry(LogEntry entry)
|
||||
{
|
||||
Console.ForegroundColor = entry.Level switch
|
||||
{
|
||||
"Error" => ConsoleColor.Red,
|
||||
"Warning" => ConsoleColor.Yellow,
|
||||
"Info" => ConsoleColor.White,
|
||||
_ => ConsoleColor.Gray
|
||||
};
|
||||
|
||||
Console.WriteLine($"[{entry.Timestamp:HH:mm:ss}] [{entry.Source}] {entry.Message}");
|
||||
Console.ResetColor();
|
||||
}
|
||||
|
||||
private async Task WatchDeploymentAsync(string deploymentId)
|
||||
{
|
||||
Console.WriteLine("Watching deployment status (Ctrl+C to stop)...\n");
|
||||
|
||||
string? lastStatus = null;
|
||||
int lastProgress = -1;
|
||||
|
||||
while (true)
|
||||
{
|
||||
var deployment = await _apiClient.GetAsync<DeploymentDetailResponse>(
|
||||
$"/api/v1/deployments/{deploymentId}");
|
||||
|
||||
if (deployment.Status != lastStatus || deployment.Progress != lastProgress)
|
||||
{
|
||||
Console.Write($"\r[{DateTime.Now:HH:mm:ss}] Status: {deployment.Status}");
|
||||
|
||||
if (deployment.Progress.HasValue)
|
||||
{
|
||||
var progressBar = new string('█', deployment.Progress.Value / 5) +
|
||||
new string('░', 20 - deployment.Progress.Value / 5);
|
||||
Console.Write($" [{progressBar}] {deployment.Progress}%");
|
||||
}
|
||||
|
||||
Console.WriteLine();
|
||||
|
||||
lastStatus = deployment.Status;
|
||||
lastProgress = deployment.Progress ?? -1;
|
||||
}
|
||||
|
||||
if (deployment.Status is "Completed" or "Failed" or "RolledBack")
|
||||
{
|
||||
Console.WriteLine();
|
||||
if (deployment.Status == "Completed")
|
||||
{
|
||||
_formatter.WriteSuccess("Deployment completed successfully!");
|
||||
}
|
||||
else
|
||||
{
|
||||
_formatter.WriteError($"Deployment ended with status: {deployment.Status}");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
await Task.Delay(2000);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task StreamLogsAsync(string deploymentId)
|
||||
{
|
||||
Console.WriteLine("Streaming logs (Ctrl+C to stop)...\n");
|
||||
|
||||
DateTimeOffset? lastTimestamp = null;
|
||||
|
||||
while (true)
|
||||
{
|
||||
var query = lastTimestamp.HasValue
|
||||
? $"?since={lastTimestamp.Value:O}"
|
||||
: "?tail=10";
|
||||
|
||||
var logs = await _apiClient.GetAsync<DeploymentLogsResponse>(
|
||||
$"/api/v1/deployments/{deploymentId}/logs{query}");
|
||||
|
||||
foreach (var entry in logs.Entries)
|
||||
{
|
||||
PrintLogEntry(entry);
|
||||
lastTimestamp = entry.Timestamp;
|
||||
}
|
||||
|
||||
await Task.Delay(1000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#region DTOs
|
||||
|
||||
public sealed record StartDeploymentRequest
|
||||
{
|
||||
public required string ReleaseId { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string Strategy { get; init; }
|
||||
public bool DryRun { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RollbackDeploymentRequest
|
||||
{
|
||||
public string? Reason { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeploymentResponse
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ReleaseId { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string Strategy { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset StartedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeploymentDetailResponse
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ReleaseId { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string Strategy { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset StartedAt { get; init; }
|
||||
public DateTimeOffset? CompletedAt { get; init; }
|
||||
public int? Progress { get; init; }
|
||||
public ReplicaStatus? Replicas { get; init; }
|
||||
public List<InstanceStatus> Instances { get; init; } = [];
|
||||
public List<DeploymentEvent> Events { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record ReplicaStatus
|
||||
{
|
||||
public int Total { get; init; }
|
||||
public int Ready { get; init; }
|
||||
public int Updated { get; init; }
|
||||
public int Available { get; init; }
|
||||
}
|
||||
|
||||
public sealed record InstanceStatus
|
||||
{
|
||||
public required string Host { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string HealthStatus { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeploymentEvent
|
||||
{
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required string Type { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeploymentLogsResponse
|
||||
{
|
||||
public List<LogEntry> Entries { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record LogEntry
|
||||
{
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required string Level { get; init; }
|
||||
public required string Source { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
311
src/Cli/StellaOps.Cli/Commands/PromoteCommandHandler.cs
Normal file
311
src/Cli/StellaOps.Cli/Commands/PromoteCommandHandler.cs
Normal file
@@ -0,0 +1,311 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// PromoteCommandHandler.cs
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-03 - Promotion Commands (promote, status, approve, reject)
|
||||
// Description: Full implementation of promotion CLI commands
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
namespace StellaOps.Cli.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Handles all promotion-related CLI commands.
|
||||
/// </summary>
|
||||
public sealed class PromoteCommandHandler
|
||||
{
|
||||
private readonly IStellaApiClient _apiClient;
|
||||
private readonly IOutputFormatter _formatter;
|
||||
|
||||
public PromoteCommandHandler(IStellaApiClient apiClient, IOutputFormatter formatter)
|
||||
{
|
||||
_apiClient = apiClient;
|
||||
_formatter = formatter;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts a promotion for a release to target environment.
|
||||
/// </summary>
|
||||
public async Task StartAsync(string release, string target, bool autoApprove)
|
||||
{
|
||||
_formatter.WriteInfo($"Starting promotion of {release} to {target}...");
|
||||
|
||||
var request = new StartPromotionRequest
|
||||
{
|
||||
ReleaseId = release,
|
||||
TargetEnvironment = target,
|
||||
AutoApprove = autoApprove
|
||||
};
|
||||
|
||||
var response = await _apiClient.PostAsync<StartPromotionRequest, PromotionResponse>(
|
||||
"/api/v1/promotions", request);
|
||||
|
||||
_formatter.WriteSuccess($"Promotion started: {response.Id}");
|
||||
|
||||
PrintPromotionStatus(response);
|
||||
|
||||
if (response.Status == "PendingApproval")
|
||||
{
|
||||
_formatter.WriteInfo("\nPromotion requires approval. Use:");
|
||||
Console.WriteLine($" stella promote approve {response.Id}");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the status of a promotion, optionally watching for updates.
|
||||
/// </summary>
|
||||
public async Task StatusAsync(string promotionId, bool watch)
|
||||
{
|
||||
if (watch)
|
||||
{
|
||||
await WatchPromotionAsync(promotionId);
|
||||
return;
|
||||
}
|
||||
|
||||
var promotion = await _apiClient.GetAsync<PromotionDetailResponse>(
|
||||
$"/api/v1/promotions/{promotionId}");
|
||||
|
||||
PrintPromotionDetail(promotion);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Approves a pending promotion.
|
||||
/// </summary>
|
||||
public async Task ApproveAsync(string promotionId, string? comment)
|
||||
{
|
||||
_formatter.WriteInfo($"Approving promotion {promotionId}...");
|
||||
|
||||
var request = new ApprovePromotionRequest
|
||||
{
|
||||
Comment = comment
|
||||
};
|
||||
|
||||
var response = await _apiClient.PostAsync<ApprovePromotionRequest, PromotionResponse>(
|
||||
$"/api/v1/promotions/{promotionId}/approve", request);
|
||||
|
||||
_formatter.WriteSuccess($"Promotion approved. Status: {response.Status}");
|
||||
|
||||
if (response.Status == "InProgress")
|
||||
{
|
||||
_formatter.WriteInfo("\nDeployment has started. Use:");
|
||||
Console.WriteLine($" stella promote status {promotionId} --watch");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rejects a pending promotion.
|
||||
/// </summary>
|
||||
public async Task RejectAsync(string promotionId, string reason)
|
||||
{
|
||||
_formatter.WriteInfo($"Rejecting promotion {promotionId}...");
|
||||
|
||||
var request = new RejectPromotionRequest
|
||||
{
|
||||
Reason = reason
|
||||
};
|
||||
|
||||
var response = await _apiClient.PostAsync<RejectPromotionRequest, PromotionResponse>(
|
||||
$"/api/v1/promotions/{promotionId}/reject", request);
|
||||
|
||||
_formatter.WriteSuccess($"Promotion rejected.");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists promotions with optional filters.
|
||||
/// </summary>
|
||||
public async Task ListAsync(string? env, bool pending)
|
||||
{
|
||||
var queryParams = new List<string>();
|
||||
if (env is not null) queryParams.Add($"environment={env}");
|
||||
if (pending) queryParams.Add("status=PendingApproval");
|
||||
|
||||
var query = queryParams.Any() ? "?" + string.Join("&", queryParams) : "";
|
||||
|
||||
var promotions = await _apiClient.GetAsync<List<PromotionResponse>>($"/api/v1/promotions{query}");
|
||||
|
||||
if (promotions.Count == 0)
|
||||
{
|
||||
_formatter.WriteInfo("No promotions found.");
|
||||
return;
|
||||
}
|
||||
|
||||
_formatter.WriteTable(promotions,
|
||||
("ID", p => p.Id),
|
||||
("Release", p => p.ReleaseId),
|
||||
("Target", p => p.TargetEnvironment),
|
||||
("Status", p => p.Status),
|
||||
("Requester", p => p.RequestedBy),
|
||||
("Requested", p => p.RequestedAt.ToString("g")));
|
||||
}
|
||||
|
||||
private void PrintPromotionStatus(PromotionResponse promotion)
|
||||
{
|
||||
_formatter.WriteTable([promotion],
|
||||
("ID", p => p.Id),
|
||||
("Release", p => p.ReleaseId),
|
||||
("Target", p => p.TargetEnvironment),
|
||||
("Status", p => p.Status),
|
||||
("Requested", p => p.RequestedAt.ToString("g")));
|
||||
}
|
||||
|
||||
private void PrintPromotionDetail(PromotionDetailResponse promotion)
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Promotion: {promotion.Id}");
|
||||
Console.WriteLine($"Release: {promotion.ReleaseId}");
|
||||
Console.WriteLine($"Version: {promotion.Version}");
|
||||
Console.WriteLine($"Target: {promotion.TargetEnvironment}");
|
||||
Console.WriteLine($"Status: {promotion.Status}");
|
||||
Console.WriteLine($"Requested: {promotion.RequestedAt:g} by {promotion.RequestedBy}");
|
||||
|
||||
if (promotion.ApprovedAt.HasValue)
|
||||
{
|
||||
Console.WriteLine($"Approved: {promotion.ApprovedAt:g} by {promotion.ApprovedBy}");
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(promotion.RejectionReason))
|
||||
{
|
||||
Console.WriteLine($"Rejected: {promotion.RejectionReason}");
|
||||
}
|
||||
|
||||
if (promotion.PolicyResults.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Policy Results:");
|
||||
foreach (var result in promotion.PolicyResults)
|
||||
{
|
||||
var symbol = result.Passed ? "✓" : "✗";
|
||||
Console.ForegroundColor = result.Passed ? ConsoleColor.Green : ConsoleColor.Red;
|
||||
Console.WriteLine($" {symbol} {result.PolicyName}: {result.Message}");
|
||||
Console.ResetColor();
|
||||
}
|
||||
}
|
||||
|
||||
if (promotion.DeploymentSteps.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Deployment Progress:");
|
||||
foreach (var step in promotion.DeploymentSteps)
|
||||
{
|
||||
var symbol = step.Status switch
|
||||
{
|
||||
"Completed" => "✓",
|
||||
"InProgress" => "►",
|
||||
"Failed" => "✗",
|
||||
_ => "○"
|
||||
};
|
||||
Console.ForegroundColor = step.Status switch
|
||||
{
|
||||
"Completed" => ConsoleColor.Green,
|
||||
"InProgress" => ConsoleColor.Yellow,
|
||||
"Failed" => ConsoleColor.Red,
|
||||
_ => ConsoleColor.Gray
|
||||
};
|
||||
Console.Write($" {symbol} ");
|
||||
Console.ResetColor();
|
||||
Console.WriteLine($"{step.Name} ({step.Status})");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task WatchPromotionAsync(string promotionId)
|
||||
{
|
||||
Console.WriteLine("Watching promotion status (Ctrl+C to stop)...\n");
|
||||
|
||||
string? lastStatus = null;
|
||||
|
||||
while (true)
|
||||
{
|
||||
var promotion = await _apiClient.GetAsync<PromotionDetailResponse>(
|
||||
$"/api/v1/promotions/{promotionId}");
|
||||
|
||||
if (promotion.Status != lastStatus)
|
||||
{
|
||||
Console.WriteLine($"[{DateTime.Now:HH:mm:ss}] Status: {promotion.Status}");
|
||||
lastStatus = promotion.Status;
|
||||
|
||||
// Print deployment progress
|
||||
foreach (var step in promotion.DeploymentSteps.Where(s => s.Status == "InProgress"))
|
||||
{
|
||||
Console.WriteLine($" ► {step.Name}");
|
||||
}
|
||||
}
|
||||
|
||||
if (promotion.Status is "Completed" or "Failed" or "Rejected" or "RolledBack")
|
||||
{
|
||||
Console.WriteLine();
|
||||
if (promotion.Status == "Completed")
|
||||
{
|
||||
_formatter.WriteSuccess("Promotion completed successfully!");
|
||||
}
|
||||
else
|
||||
{
|
||||
_formatter.WriteError($"Promotion ended with status: {promotion.Status}");
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
await Task.Delay(2000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#region DTOs
|
||||
|
||||
public sealed record StartPromotionRequest
|
||||
{
|
||||
public required string ReleaseId { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public bool AutoApprove { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ApprovePromotionRequest
|
||||
{
|
||||
public string? Comment { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RejectPromotionRequest
|
||||
{
|
||||
public required string Reason { get; init; }
|
||||
}
|
||||
|
||||
public sealed record PromotionResponse
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ReleaseId { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required string RequestedBy { get; init; }
|
||||
public required DateTimeOffset RequestedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record PromotionDetailResponse
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ReleaseId { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required string RequestedBy { get; init; }
|
||||
public required DateTimeOffset RequestedAt { get; init; }
|
||||
public string? ApprovedBy { get; init; }
|
||||
public DateTimeOffset? ApprovedAt { get; init; }
|
||||
public string? RejectionReason { get; init; }
|
||||
public List<PolicyResult> PolicyResults { get; init; } = [];
|
||||
public List<DeploymentStep> DeploymentSteps { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record PolicyResult
|
||||
{
|
||||
public required string PolicyName { get; init; }
|
||||
public required bool Passed { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeploymentStep
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public DateTimeOffset? StartedAt { get; init; }
|
||||
public DateTimeOffset? CompletedAt { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
382
src/Cli/StellaOps.Cli/Commands/ReleaseCommandHandler.cs
Normal file
382
src/Cli/StellaOps.Cli/Commands/ReleaseCommandHandler.cs
Normal file
@@ -0,0 +1,382 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ReleaseCommandHandler.cs
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-02 - Release Commands (create, list, get, diff, history)
|
||||
// Description: Full implementation of release management CLI commands
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Net.Http.Json;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace StellaOps.Cli.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Handles all release-related CLI commands.
|
||||
/// </summary>
|
||||
public sealed class ReleaseCommandHandler
|
||||
{
|
||||
private readonly IStellaApiClient _apiClient;
|
||||
private readonly IOutputFormatter _formatter;
|
||||
|
||||
public ReleaseCommandHandler(IStellaApiClient apiClient, IOutputFormatter formatter)
|
||||
{
|
||||
_apiClient = apiClient;
|
||||
_formatter = formatter;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new release.
|
||||
/// </summary>
|
||||
public async Task CreateAsync(string service, string version, string? notes, bool draft)
|
||||
{
|
||||
_formatter.WriteInfo($"Creating release {version} for {service}...");
|
||||
|
||||
var request = new CreateReleaseRequest
|
||||
{
|
||||
ServiceName = service,
|
||||
Version = version,
|
||||
Notes = notes,
|
||||
IsDraft = draft
|
||||
};
|
||||
|
||||
var response = await _apiClient.PostAsync<CreateReleaseRequest, ReleaseResponse>(
|
||||
"/api/v1/releases", request);
|
||||
|
||||
_formatter.WriteSuccess($"Release created: {response.Id}");
|
||||
|
||||
_formatter.WriteTable([response],
|
||||
("ID", r => r.Id),
|
||||
("Service", r => r.ServiceName),
|
||||
("Version", r => r.Version),
|
||||
("Status", r => r.Status),
|
||||
("Created", r => r.CreatedAt.ToString("g")));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists releases with optional filters.
|
||||
/// </summary>
|
||||
public async Task ListAsync(string? service, int limit, string? status)
|
||||
{
|
||||
var queryParams = new List<string>();
|
||||
if (service is not null) queryParams.Add($"service={service}");
|
||||
if (status is not null) queryParams.Add($"status={status}");
|
||||
queryParams.Add($"limit={limit}");
|
||||
|
||||
var query = queryParams.Any() ? "?" + string.Join("&", queryParams) : "";
|
||||
|
||||
var releases = await _apiClient.GetAsync<List<ReleaseResponse>>($"/api/v1/releases{query}");
|
||||
|
||||
if (releases.Count == 0)
|
||||
{
|
||||
_formatter.WriteInfo("No releases found.");
|
||||
return;
|
||||
}
|
||||
|
||||
_formatter.WriteTable(releases,
|
||||
("ID", r => r.Id),
|
||||
("Service", r => r.ServiceName),
|
||||
("Version", r => r.Version),
|
||||
("Status", r => r.Status),
|
||||
("Environment", r => r.Environment ?? "-"),
|
||||
("Created", r => r.CreatedAt.ToString("g")));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets details of a specific release.
|
||||
/// </summary>
|
||||
public async Task GetAsync(string releaseId)
|
||||
{
|
||||
var release = await _apiClient.GetAsync<ReleaseDetailResponse>($"/api/v1/releases/{releaseId}");
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Release: {release.Id}");
|
||||
Console.WriteLine($"Service: {release.ServiceName}");
|
||||
Console.WriteLine($"Version: {release.Version}");
|
||||
Console.WriteLine($"Status: {release.Status}");
|
||||
Console.WriteLine($"Created: {release.CreatedAt}");
|
||||
|
||||
if (!string.IsNullOrEmpty(release.Notes))
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Notes:");
|
||||
Console.WriteLine(release.Notes);
|
||||
}
|
||||
|
||||
if (release.ScanResults is not null)
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Scan Results:");
|
||||
Console.WriteLine($" Critical: {release.ScanResults.Critical}");
|
||||
Console.WriteLine($" High: {release.ScanResults.High}");
|
||||
Console.WriteLine($" Medium: {release.ScanResults.Medium}");
|
||||
Console.WriteLine($" Low: {release.ScanResults.Low}");
|
||||
}
|
||||
|
||||
if (release.Approvals.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Approvals:");
|
||||
_formatter.WriteTable(release.Approvals,
|
||||
("Approver", a => a.ApprovedBy),
|
||||
("Status", a => a.Status),
|
||||
("Time", a => a.ApprovedAt?.ToString("g") ?? "-"));
|
||||
}
|
||||
|
||||
if (release.Evidence.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Evidence: {release.Evidence.Count} item(s)");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Shows diff between two releases.
|
||||
/// </summary>
|
||||
public async Task DiffAsync(string from, string to)
|
||||
{
|
||||
var diff = await _apiClient.GetAsync<ReleaseDiffResponse>(
|
||||
$"/api/v1/releases/{from}/diff/{to}");
|
||||
|
||||
Console.WriteLine();
|
||||
Console.WriteLine($"Diff: {from} → {to}");
|
||||
Console.WriteLine();
|
||||
|
||||
if (diff.ConfigChanges.Any())
|
||||
{
|
||||
Console.WriteLine("Configuration Changes:");
|
||||
foreach (var change in diff.ConfigChanges)
|
||||
{
|
||||
var symbol = change.ChangeType switch
|
||||
{
|
||||
"Added" => "+",
|
||||
"Removed" => "-",
|
||||
"Modified" => "~",
|
||||
_ => "?"
|
||||
};
|
||||
Console.ForegroundColor = change.ChangeType switch
|
||||
{
|
||||
"Added" => ConsoleColor.Green,
|
||||
"Removed" => ConsoleColor.Red,
|
||||
"Modified" => ConsoleColor.Yellow,
|
||||
_ => ConsoleColor.Gray
|
||||
};
|
||||
Console.WriteLine($" {symbol} {change.Key}");
|
||||
Console.ResetColor();
|
||||
}
|
||||
}
|
||||
|
||||
if (diff.DependencyChanges.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Dependency Changes:");
|
||||
_formatter.WriteTable(diff.DependencyChanges,
|
||||
("Package", d => d.Package),
|
||||
("From", d => d.FromVersion ?? "-"),
|
||||
("To", d => d.ToVersion ?? "-"),
|
||||
("Type", d => d.ChangeType));
|
||||
}
|
||||
|
||||
if (diff.VulnerabilityChanges.Any())
|
||||
{
|
||||
Console.WriteLine();
|
||||
Console.WriteLine("Vulnerability Changes:");
|
||||
_formatter.WriteTable(diff.VulnerabilityChanges,
|
||||
("CVE", v => v.CveId),
|
||||
("Severity", v => v.Severity),
|
||||
("Status", v => v.Status));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Shows release history for a service.
|
||||
/// </summary>
|
||||
public async Task HistoryAsync(string service)
|
||||
{
|
||||
var history = await _apiClient.GetAsync<List<ReleaseHistoryEntry>>(
|
||||
$"/api/v1/services/{service}/release-history");
|
||||
|
||||
if (history.Count == 0)
|
||||
{
|
||||
_formatter.WriteInfo($"No release history for {service}.");
|
||||
return;
|
||||
}
|
||||
|
||||
Console.WriteLine($"\nRelease history for {service}:\n");
|
||||
|
||||
foreach (var entry in history.Take(20))
|
||||
{
|
||||
var statusColor = entry.Status switch
|
||||
{
|
||||
"Deployed" => ConsoleColor.Green,
|
||||
"Failed" => ConsoleColor.Red,
|
||||
"RolledBack" => ConsoleColor.Yellow,
|
||||
_ => ConsoleColor.Gray
|
||||
};
|
||||
|
||||
Console.Write($" {entry.Timestamp:yyyy-MM-dd HH:mm} ");
|
||||
Console.ForegroundColor = statusColor;
|
||||
Console.Write($"{entry.Status,-12}");
|
||||
Console.ResetColor();
|
||||
Console.WriteLine($" {entry.Version,-15} {entry.Environment}");
|
||||
|
||||
if (!string.IsNullOrEmpty(entry.Notes))
|
||||
{
|
||||
Console.WriteLine($" {entry.Notes}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#region API Client
|
||||
|
||||
public interface IStellaApiClient
|
||||
{
|
||||
Task<T> GetAsync<T>(string path);
|
||||
Task<TResponse> PostAsync<TRequest, TResponse>(string path, TRequest request);
|
||||
Task DeleteAsync(string path);
|
||||
}
|
||||
|
||||
public sealed class StellaApiClient : IStellaApiClient
|
||||
{
|
||||
private readonly HttpClient _httpClient;
|
||||
private readonly CliConfig _config;
|
||||
|
||||
public StellaApiClient(HttpClient httpClient, CliConfig config)
|
||||
{
|
||||
_httpClient = httpClient;
|
||||
_config = config;
|
||||
|
||||
_httpClient.BaseAddress = new Uri(config.ServerUrl);
|
||||
if (!string.IsNullOrEmpty(config.AccessToken))
|
||||
{
|
||||
_httpClient.DefaultRequestHeaders.Authorization =
|
||||
new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", config.AccessToken);
|
||||
}
|
||||
}
|
||||
|
||||
public async Task<T> GetAsync<T>(string path)
|
||||
{
|
||||
var response = await _httpClient.GetAsync(path);
|
||||
response.EnsureSuccessStatusCode();
|
||||
return (await response.Content.ReadFromJsonAsync<T>())!;
|
||||
}
|
||||
|
||||
public async Task<TResponse> PostAsync<TRequest, TResponse>(string path, TRequest request)
|
||||
{
|
||||
var response = await _httpClient.PostAsJsonAsync(path, request);
|
||||
response.EnsureSuccessStatusCode();
|
||||
return (await response.Content.ReadFromJsonAsync<TResponse>())!;
|
||||
}
|
||||
|
||||
public async Task DeleteAsync(string path)
|
||||
{
|
||||
var response = await _httpClient.DeleteAsync(path);
|
||||
response.EnsureSuccessStatusCode();
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region DTOs
|
||||
|
||||
public sealed record CreateReleaseRequest
|
||||
{
|
||||
public required string ServiceName { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public string? Notes { get; init; }
|
||||
public bool IsDraft { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReleaseResponse
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ServiceName { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public string? Environment { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReleaseDetailResponse
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ServiceName { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public string? Notes { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public ScanResultSummary? ScanResults { get; init; }
|
||||
public List<ApprovalInfo> Approvals { get; init; } = [];
|
||||
public List<EvidenceInfo> Evidence { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record ScanResultSummary
|
||||
{
|
||||
public int Critical { get; init; }
|
||||
public int High { get; init; }
|
||||
public int Medium { get; init; }
|
||||
public int Low { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ApprovalInfo
|
||||
{
|
||||
public required string ApprovedBy { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public DateTimeOffset? ApprovedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record EvidenceInfo
|
||||
{
|
||||
public required string Type { get; init; }
|
||||
public required string Hash { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReleaseDiffResponse
|
||||
{
|
||||
public List<ConfigChange> ConfigChanges { get; init; } = [];
|
||||
public List<DependencyChange> DependencyChanges { get; init; } = [];
|
||||
public List<VulnerabilityChange> VulnerabilityChanges { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record ConfigChange
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required string ChangeType { get; init; }
|
||||
public string? OldValue { get; init; }
|
||||
public string? NewValue { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DependencyChange
|
||||
{
|
||||
public required string Package { get; init; }
|
||||
public string? FromVersion { get; init; }
|
||||
public string? ToVersion { get; init; }
|
||||
public required string ChangeType { get; init; }
|
||||
}
|
||||
|
||||
public sealed record VulnerabilityChange
|
||||
{
|
||||
public required string CveId { get; init; }
|
||||
public required string Severity { get; init; }
|
||||
public required string Status { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReleaseHistoryEntry
|
||||
{
|
||||
public required string Version { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public string? Notes { get; init; }
|
||||
}
|
||||
|
||||
public sealed record CliConfig
|
||||
{
|
||||
public string ServerUrl { get; set; } = "https://localhost:5001";
|
||||
public string? AccessToken { get; set; }
|
||||
public string? RefreshToken { get; set; }
|
||||
public DateTimeOffset? TokenExpiry { get; set; }
|
||||
public string OutputFormat { get; set; } = "table";
|
||||
}
|
||||
|
||||
#endregion
|
||||
582
src/Cli/StellaOps.Cli/GitOps/GitOpsController.cs
Normal file
582
src/Cli/StellaOps.Cli/GitOps/GitOpsController.cs
Normal file
@@ -0,0 +1,582 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Cli.GitOps;
|
||||
|
||||
/// <summary>
|
||||
/// Controller for GitOps-based release automation.
|
||||
/// Monitors Git repositories and triggers releases based on Git events.
|
||||
/// </summary>
|
||||
public sealed class GitOpsController : BackgroundService
|
||||
{
|
||||
private readonly IGitEventSource _eventSource;
|
||||
private readonly IReleaseService _releaseService;
|
||||
private readonly IPromotionService _promotionService;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly GitOpsConfig _config;
|
||||
private readonly ILogger<GitOpsController> _logger;
|
||||
private readonly ConcurrentDictionary<string, GitOpsState> _repoStates = new();
|
||||
|
||||
public event EventHandler<GitOpsEventArgs>? ReleaseTriggered;
|
||||
public event EventHandler<GitOpsEventArgs>? PromotionTriggered;
|
||||
public event EventHandler<GitOpsEventArgs>? ValidationFailed;
|
||||
|
||||
public GitOpsController(
|
||||
IGitEventSource eventSource,
|
||||
IReleaseService releaseService,
|
||||
IPromotionService promotionService,
|
||||
TimeProvider timeProvider,
|
||||
GitOpsConfig config,
|
||||
ILogger<GitOpsController> logger)
|
||||
{
|
||||
_eventSource = eventSource;
|
||||
_releaseService = releaseService;
|
||||
_promotionService = promotionService;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
|
||||
_eventSource.EventReceived += OnGitEventReceived;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers a repository for GitOps monitoring.
|
||||
/// </summary>
|
||||
public async Task<RegistrationResult> RegisterRepositoryAsync(
|
||||
GitOpsRepositoryConfig repoConfig,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(repoConfig);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Registering repository {RepoUrl} for GitOps",
|
||||
repoConfig.RepositoryUrl);
|
||||
|
||||
var state = new GitOpsState
|
||||
{
|
||||
RepositoryUrl = repoConfig.RepositoryUrl,
|
||||
Config = repoConfig,
|
||||
Status = GitOpsStatus.Active,
|
||||
RegisteredAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
_repoStates[repoConfig.RepositoryUrl] = state;
|
||||
|
||||
// Start monitoring
|
||||
await _eventSource.SubscribeAsync(repoConfig.RepositoryUrl, repoConfig.Branches, ct);
|
||||
|
||||
return new RegistrationResult
|
||||
{
|
||||
Success = true,
|
||||
RepositoryUrl = repoConfig.RepositoryUrl,
|
||||
MonitoredBranches = repoConfig.Branches
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unregisters a repository from GitOps monitoring.
|
||||
/// </summary>
|
||||
public async Task<bool> UnregisterRepositoryAsync(
|
||||
string repositoryUrl,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (!_repoStates.TryRemove(repositoryUrl, out _))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
await _eventSource.UnsubscribeAsync(repositoryUrl, ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Unregistered repository {RepoUrl} from GitOps",
|
||||
repositoryUrl);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Manually triggers a release for a commit.
|
||||
/// </summary>
|
||||
public async Task<TriggerResult> TriggerReleaseAsync(
|
||||
ManualTriggerRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Manually triggering release for {RepoUrl} at {CommitSha}",
|
||||
request.RepositoryUrl, request.CommitSha);
|
||||
|
||||
var gitEvent = new GitEvent
|
||||
{
|
||||
Type = GitEventType.Push,
|
||||
RepositoryUrl = request.RepositoryUrl,
|
||||
Branch = request.Branch,
|
||||
CommitSha = request.CommitSha,
|
||||
CommitMessage = request.CommitMessage ?? "Manual trigger",
|
||||
Author = request.Author ?? "system",
|
||||
Timestamp = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
return await ProcessGitEventAsync(gitEvent, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the status of all monitored repositories.
|
||||
/// </summary>
|
||||
public IReadOnlyList<GitOpsState> GetRepositoryStatuses()
|
||||
{
|
||||
return _repoStates.Values.ToList();
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("GitOps controller starting");
|
||||
|
||||
await _eventSource.StartAsync(stoppingToken);
|
||||
|
||||
try
|
||||
{
|
||||
// Keep running until stopped
|
||||
await Task.Delay(Timeout.Infinite, stoppingToken);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Expected on shutdown
|
||||
}
|
||||
|
||||
await _eventSource.StopAsync(CancellationToken.None);
|
||||
|
||||
_logger.LogInformation("GitOps controller stopped");
|
||||
}
|
||||
|
||||
private async void OnGitEventReceived(object? sender, GitEvent e)
|
||||
{
|
||||
try
|
||||
{
|
||||
await ProcessGitEventAsync(e, CancellationToken.None);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Error processing Git event for {RepoUrl}",
|
||||
e.RepositoryUrl);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<TriggerResult> ProcessGitEventAsync(
|
||||
GitEvent gitEvent,
|
||||
CancellationToken ct)
|
||||
{
|
||||
if (!_repoStates.TryGetValue(gitEvent.RepositoryUrl, out var state))
|
||||
{
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = false,
|
||||
Error = "Repository not registered"
|
||||
};
|
||||
}
|
||||
|
||||
_logger.LogDebug(
|
||||
"Processing {EventType} event for {RepoUrl} on {Branch}",
|
||||
gitEvent.Type, gitEvent.RepositoryUrl, gitEvent.Branch);
|
||||
|
||||
// Check if branch matches triggers
|
||||
var trigger = FindMatchingTrigger(state.Config, gitEvent);
|
||||
if (trigger is null)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"No matching trigger for branch {Branch}",
|
||||
gitEvent.Branch);
|
||||
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = true,
|
||||
Skipped = true,
|
||||
Reason = "No matching trigger"
|
||||
};
|
||||
}
|
||||
|
||||
// Validate commit message patterns if configured
|
||||
if (!ValidateCommitMessage(gitEvent.CommitMessage, trigger))
|
||||
{
|
||||
ValidationFailed?.Invoke(this, new GitOpsEventArgs
|
||||
{
|
||||
Event = gitEvent,
|
||||
Reason = "Commit message validation failed"
|
||||
});
|
||||
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = false,
|
||||
Error = "Commit message validation failed"
|
||||
};
|
||||
}
|
||||
|
||||
// Execute trigger action
|
||||
return trigger.Action switch
|
||||
{
|
||||
TriggerAction.CreateRelease => await CreateReleaseAsync(gitEvent, trigger, ct),
|
||||
TriggerAction.Promote => await PromoteAsync(gitEvent, trigger, ct),
|
||||
TriggerAction.ValidateOnly => await ValidateAsync(gitEvent, trigger, ct),
|
||||
_ => new TriggerResult { Success = false, Error = "Unknown action" }
|
||||
};
|
||||
}
|
||||
|
||||
private GitOpsTrigger? FindMatchingTrigger(GitOpsRepositoryConfig config, GitEvent gitEvent)
|
||||
{
|
||||
return config.Triggers.FirstOrDefault(t =>
|
||||
MatchesBranch(t.BranchPattern, gitEvent.Branch) &&
|
||||
(t.EventTypes.Length == 0 || t.EventTypes.Contains(gitEvent.Type)));
|
||||
}
|
||||
|
||||
private static bool MatchesBranch(string pattern, string branch)
|
||||
{
|
||||
if (pattern == "*")
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (pattern.EndsWith("/*"))
|
||||
{
|
||||
var prefix = pattern[..^2];
|
||||
return branch.StartsWith(prefix, StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
return pattern.Equals(branch, StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
|
||||
private static bool ValidateCommitMessage(string? message, GitOpsTrigger trigger)
|
||||
{
|
||||
if (string.IsNullOrEmpty(trigger.CommitMessagePattern))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
if (string.IsNullOrEmpty(message))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
var regex = new System.Text.RegularExpressions.Regex(trigger.CommitMessagePattern);
|
||||
return regex.IsMatch(message);
|
||||
}
|
||||
|
||||
private async Task<TriggerResult> CreateReleaseAsync(
|
||||
GitEvent gitEvent,
|
||||
GitOpsTrigger trigger,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Creating release from {CommitSha} on {Branch}",
|
||||
gitEvent.CommitSha, gitEvent.Branch);
|
||||
|
||||
try
|
||||
{
|
||||
var releaseId = await _releaseService.CreateReleaseAsync(new CreateReleaseRequest
|
||||
{
|
||||
RepositoryUrl = gitEvent.RepositoryUrl,
|
||||
CommitSha = gitEvent.CommitSha,
|
||||
Branch = gitEvent.Branch,
|
||||
Environment = trigger.TargetEnvironment ?? "development",
|
||||
Version = ExtractVersion(gitEvent, trigger),
|
||||
AutoPromote = trigger.AutoPromote
|
||||
}, ct);
|
||||
|
||||
ReleaseTriggered?.Invoke(this, new GitOpsEventArgs
|
||||
{
|
||||
Event = gitEvent,
|
||||
ReleaseId = releaseId
|
||||
});
|
||||
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = true,
|
||||
ReleaseId = releaseId
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Failed to create release for {CommitSha}",
|
||||
gitEvent.CommitSha);
|
||||
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = false,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<TriggerResult> PromoteAsync(
|
||||
GitEvent gitEvent,
|
||||
GitOpsTrigger trigger,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Promoting from {SourceEnv} to {TargetEnv}",
|
||||
trigger.SourceEnvironment, trigger.TargetEnvironment);
|
||||
|
||||
try
|
||||
{
|
||||
var promotionId = await _promotionService.PromoteAsync(new PromoteRequest
|
||||
{
|
||||
SourceEnvironment = trigger.SourceEnvironment!,
|
||||
TargetEnvironment = trigger.TargetEnvironment!,
|
||||
CommitSha = gitEvent.CommitSha,
|
||||
AutoApprove = trigger.AutoApprove
|
||||
}, ct);
|
||||
|
||||
PromotionTriggered?.Invoke(this, new GitOpsEventArgs
|
||||
{
|
||||
Event = gitEvent,
|
||||
PromotionId = promotionId
|
||||
});
|
||||
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = true,
|
||||
PromotionId = promotionId
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to promote");
|
||||
|
||||
return new TriggerResult
|
||||
{
|
||||
Success = false,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private Task<TriggerResult> ValidateAsync(
|
||||
GitEvent gitEvent,
|
||||
GitOpsTrigger trigger,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Validating commit {CommitSha}",
|
||||
gitEvent.CommitSha);
|
||||
|
||||
// Validation-only mode - no actual release creation
|
||||
return Task.FromResult(new TriggerResult
|
||||
{
|
||||
Success = true,
|
||||
ValidationOnly = true
|
||||
});
|
||||
}
|
||||
|
||||
private static string ExtractVersion(GitEvent gitEvent, GitOpsTrigger trigger)
|
||||
{
|
||||
// Try to extract version from tag or branch
|
||||
if (gitEvent.Type == GitEventType.Tag && gitEvent.Tag is not null)
|
||||
{
|
||||
var tag = gitEvent.Tag;
|
||||
if (tag.StartsWith("v", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
tag = tag[1..];
|
||||
}
|
||||
return tag;
|
||||
}
|
||||
|
||||
// Use commit SHA prefix as version
|
||||
return gitEvent.CommitSha[..8];
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for GitOps controller.
|
||||
/// </summary>
|
||||
public sealed record GitOpsConfig
|
||||
{
|
||||
public TimeSpan PollInterval { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public bool EnableWebhooks { get; init; } = true;
|
||||
public int MaxConcurrentEvents { get; init; } = 5;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for a GitOps-monitored repository.
|
||||
/// </summary>
|
||||
public sealed record GitOpsRepositoryConfig
|
||||
{
|
||||
public required string RepositoryUrl { get; init; }
|
||||
public ImmutableArray<string> Branches { get; init; } = ["main", "release/*"];
|
||||
public ImmutableArray<GitOpsTrigger> Triggers { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A GitOps trigger definition.
|
||||
/// </summary>
|
||||
public sealed record GitOpsTrigger
|
||||
{
|
||||
public required string BranchPattern { get; init; }
|
||||
public ImmutableArray<GitEventType> EventTypes { get; init; } = [];
|
||||
public required TriggerAction Action { get; init; }
|
||||
public string? TargetEnvironment { get; init; }
|
||||
public string? SourceEnvironment { get; init; }
|
||||
public string? CommitMessagePattern { get; init; }
|
||||
public bool AutoPromote { get; init; }
|
||||
public bool AutoApprove { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Trigger action types.
|
||||
/// </summary>
|
||||
public enum TriggerAction
|
||||
{
|
||||
CreateRelease,
|
||||
Promote,
|
||||
ValidateOnly
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// State of a monitored repository.
|
||||
/// </summary>
|
||||
public sealed record GitOpsState
|
||||
{
|
||||
public required string RepositoryUrl { get; init; }
|
||||
public required GitOpsRepositoryConfig Config { get; init; }
|
||||
public required GitOpsStatus Status { get; init; }
|
||||
public required DateTimeOffset RegisteredAt { get; init; }
|
||||
public DateTimeOffset? LastEventAt { get; init; }
|
||||
public string? LastCommitSha { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// GitOps status.
|
||||
/// </summary>
|
||||
public enum GitOpsStatus
|
||||
{
|
||||
Active,
|
||||
Paused,
|
||||
Error
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A Git event.
|
||||
/// </summary>
|
||||
public sealed record GitEvent
|
||||
{
|
||||
public required GitEventType Type { get; init; }
|
||||
public required string RepositoryUrl { get; init; }
|
||||
public required string Branch { get; init; }
|
||||
public required string CommitSha { get; init; }
|
||||
public string? CommitMessage { get; init; }
|
||||
public string? Tag { get; init; }
|
||||
public required string Author { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Git event types.
|
||||
/// </summary>
|
||||
public enum GitEventType
|
||||
{
|
||||
Push,
|
||||
Tag,
|
||||
PullRequest,
|
||||
Merge
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of repository registration.
|
||||
/// </summary>
|
||||
public sealed record RegistrationResult
|
||||
{
|
||||
public required bool Success { get; init; }
|
||||
public string? RepositoryUrl { get; init; }
|
||||
public ImmutableArray<string> MonitoredBranches { get; init; } = [];
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to manually trigger.
|
||||
/// </summary>
|
||||
public sealed record ManualTriggerRequest
|
||||
{
|
||||
public required string RepositoryUrl { get; init; }
|
||||
public required string Branch { get; init; }
|
||||
public required string CommitSha { get; init; }
|
||||
public string? CommitMessage { get; init; }
|
||||
public string? Author { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a trigger.
|
||||
/// </summary>
|
||||
public sealed record TriggerResult
|
||||
{
|
||||
public required bool Success { get; init; }
|
||||
public bool Skipped { get; init; }
|
||||
public bool ValidationOnly { get; init; }
|
||||
public Guid? ReleaseId { get; init; }
|
||||
public Guid? PromotionId { get; init; }
|
||||
public string? Reason { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event args for GitOps events.
|
||||
/// </summary>
|
||||
public sealed class GitOpsEventArgs : EventArgs
|
||||
{
|
||||
public required GitEvent Event { get; init; }
|
||||
public Guid? ReleaseId { get; init; }
|
||||
public Guid? PromotionId { get; init; }
|
||||
public string? Reason { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to create a release.
|
||||
/// </summary>
|
||||
public sealed record CreateReleaseRequest
|
||||
{
|
||||
public required string RepositoryUrl { get; init; }
|
||||
public required string CommitSha { get; init; }
|
||||
public required string Branch { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public bool AutoPromote { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to promote.
|
||||
/// </summary>
|
||||
public sealed record PromoteRequest
|
||||
{
|
||||
public required string SourceEnvironment { get; init; }
|
||||
public required string TargetEnvironment { get; init; }
|
||||
public required string CommitSha { get; init; }
|
||||
public bool AutoApprove { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for Git event source.
|
||||
/// </summary>
|
||||
public interface IGitEventSource
|
||||
{
|
||||
event EventHandler<GitEvent>? EventReceived;
|
||||
Task StartAsync(CancellationToken ct = default);
|
||||
Task StopAsync(CancellationToken ct = default);
|
||||
Task SubscribeAsync(string repositoryUrl, ImmutableArray<string> branches, CancellationToken ct = default);
|
||||
Task UnsubscribeAsync(string repositoryUrl, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for release service.
|
||||
/// </summary>
|
||||
public interface IReleaseService
|
||||
{
|
||||
Task<Guid> CreateReleaseAsync(CreateReleaseRequest request, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for promotion service.
|
||||
/// </summary>
|
||||
public interface IPromotionService
|
||||
{
|
||||
Task<Guid> PromoteAsync(PromoteRequest request, CancellationToken ct = default);
|
||||
}
|
||||
612
src/Cli/StellaOps.Cli/Validation/LocalValidator.cs
Normal file
612
src/Cli/StellaOps.Cli/Validation/LocalValidator.cs
Normal file
@@ -0,0 +1,612 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Cli.Validation;
|
||||
|
||||
/// <summary>
|
||||
/// Validates configuration files locally without requiring server connection.
|
||||
/// Supports offline validation of release manifests, policy files, and environment configs.
|
||||
/// </summary>
|
||||
public sealed class LocalValidator
|
||||
{
|
||||
private readonly IEnumerable<IConfigValidator> _validators;
|
||||
private readonly ISchemaProvider _schemaProvider;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly LocalValidatorConfig _config;
|
||||
private readonly ILogger<LocalValidator> _logger;
|
||||
|
||||
public LocalValidator(
|
||||
IEnumerable<IConfigValidator> validators,
|
||||
ISchemaProvider schemaProvider,
|
||||
TimeProvider timeProvider,
|
||||
LocalValidatorConfig config,
|
||||
ILogger<LocalValidator> logger)
|
||||
{
|
||||
_validators = validators;
|
||||
_schemaProvider = schemaProvider;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates a configuration file.
|
||||
/// </summary>
|
||||
public async Task<ValidationResult> ValidateFileAsync(
|
||||
string filePath,
|
||||
ValidationType? typeHint = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (!File.Exists(filePath))
|
||||
{
|
||||
return new ValidationResult
|
||||
{
|
||||
IsValid = false,
|
||||
FilePath = filePath,
|
||||
Errors = [new ValidationError
|
||||
{
|
||||
Code = "FILE_NOT_FOUND",
|
||||
Message = $"File not found: {filePath}",
|
||||
Severity = ValidationSeverity.Error
|
||||
}]
|
||||
};
|
||||
}
|
||||
|
||||
_logger.LogInformation("Validating file: {FilePath}", filePath);
|
||||
|
||||
var content = await File.ReadAllTextAsync(filePath, ct);
|
||||
var detectedType = typeHint ?? DetectFileType(filePath, content);
|
||||
|
||||
return await ValidateContentAsync(content, detectedType, filePath, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates content directly.
|
||||
/// </summary>
|
||||
public async Task<ValidationResult> ValidateContentAsync(
|
||||
string content,
|
||||
ValidationType type,
|
||||
string? sourcePath = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
var errors = new List<ValidationError>();
|
||||
var warnings = new List<ValidationError>();
|
||||
|
||||
// Get appropriate validator
|
||||
var validator = _validators.FirstOrDefault(v => v.Supports(type));
|
||||
if (validator is null)
|
||||
{
|
||||
return new ValidationResult
|
||||
{
|
||||
IsValid = false,
|
||||
FilePath = sourcePath,
|
||||
ValidationType = type,
|
||||
Errors = [new ValidationError
|
||||
{
|
||||
Code = "UNSUPPORTED_TYPE",
|
||||
Message = $"No validator available for type: {type}",
|
||||
Severity = ValidationSeverity.Error
|
||||
}]
|
||||
};
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
// Schema validation
|
||||
if (_config.EnableSchemaValidation)
|
||||
{
|
||||
var schemaErrors = await ValidateSchemaAsync(content, type, ct);
|
||||
errors.AddRange(schemaErrors.Where(e => e.Severity == ValidationSeverity.Error));
|
||||
warnings.AddRange(schemaErrors.Where(e => e.Severity == ValidationSeverity.Warning));
|
||||
}
|
||||
|
||||
// Semantic validation
|
||||
var semanticResult = await validator.ValidateAsync(content, ct);
|
||||
errors.AddRange(semanticResult.Errors);
|
||||
warnings.AddRange(semanticResult.Warnings);
|
||||
|
||||
// Cross-reference validation
|
||||
if (_config.EnableCrossReferenceValidation && sourcePath is not null)
|
||||
{
|
||||
var crossRefErrors = await ValidateCrossReferencesAsync(content, type, sourcePath, ct);
|
||||
errors.AddRange(crossRefErrors);
|
||||
}
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "JSON_PARSE_ERROR",
|
||||
Message = $"Invalid JSON: {ex.Message}",
|
||||
Line = (int?)ex.LineNumber,
|
||||
Column = (int?)ex.BytePositionInLine,
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "VALIDATION_ERROR",
|
||||
Message = $"Validation failed: {ex.Message}",
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
|
||||
var duration = _timeProvider.GetUtcNow() - startTime;
|
||||
|
||||
return new ValidationResult
|
||||
{
|
||||
IsValid = errors.Count == 0,
|
||||
FilePath = sourcePath,
|
||||
ValidationType = type,
|
||||
Errors = errors.ToImmutableArray(),
|
||||
Warnings = warnings.ToImmutableArray(),
|
||||
Duration = duration
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates a directory of configuration files.
|
||||
/// </summary>
|
||||
public async Task<DirectoryValidationResult> ValidateDirectoryAsync(
|
||||
string directoryPath,
|
||||
string pattern = "*.*",
|
||||
bool recursive = true,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (!Directory.Exists(directoryPath))
|
||||
{
|
||||
return new DirectoryValidationResult
|
||||
{
|
||||
DirectoryPath = directoryPath,
|
||||
IsValid = false,
|
||||
Results = [new ValidationResult
|
||||
{
|
||||
IsValid = false,
|
||||
Errors = [new ValidationError
|
||||
{
|
||||
Code = "DIRECTORY_NOT_FOUND",
|
||||
Message = $"Directory not found: {directoryPath}",
|
||||
Severity = ValidationSeverity.Error
|
||||
}]
|
||||
}]
|
||||
};
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Validating directory: {DirectoryPath} (pattern: {Pattern})",
|
||||
directoryPath, pattern);
|
||||
|
||||
var searchOption = recursive ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly;
|
||||
var files = Directory.GetFiles(directoryPath, pattern, searchOption)
|
||||
.Where(f => IsConfigFile(f))
|
||||
.ToList();
|
||||
|
||||
var results = new List<ValidationResult>();
|
||||
|
||||
foreach (var file in files)
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
var result = await ValidateFileAsync(file, null, ct);
|
||||
results.Add(result);
|
||||
}
|
||||
|
||||
return new DirectoryValidationResult
|
||||
{
|
||||
DirectoryPath = directoryPath,
|
||||
IsValid = results.All(r => r.IsValid),
|
||||
TotalFiles = results.Count,
|
||||
ValidFiles = results.Count(r => r.IsValid),
|
||||
InvalidFiles = results.Count(r => !r.IsValid),
|
||||
Results = results.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates a release manifest.
|
||||
/// </summary>
|
||||
public async Task<ValidationResult> ValidateReleaseManifestAsync(
|
||||
string manifestPath,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await ValidateFileAsync(manifestPath, ValidationType.ReleaseManifest, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates a policy file.
|
||||
/// </summary>
|
||||
public async Task<ValidationResult> ValidatePolicyAsync(
|
||||
string policyPath,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await ValidateFileAsync(policyPath, ValidationType.Policy, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates an environment configuration.
|
||||
/// </summary>
|
||||
public async Task<ValidationResult> ValidateEnvironmentConfigAsync(
|
||||
string configPath,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await ValidateFileAsync(configPath, ValidationType.EnvironmentConfig, ct);
|
||||
}
|
||||
|
||||
private ValidationType DetectFileType(string filePath, string content)
|
||||
{
|
||||
var fileName = Path.GetFileName(filePath).ToLowerInvariant();
|
||||
var extension = Path.GetExtension(filePath).ToLowerInvariant();
|
||||
|
||||
// Check filename patterns
|
||||
if (fileName.Contains("release") || fileName.Contains("manifest"))
|
||||
{
|
||||
return ValidationType.ReleaseManifest;
|
||||
}
|
||||
|
||||
if (fileName.Contains("policy") || fileName.EndsWith(".rego"))
|
||||
{
|
||||
return ValidationType.Policy;
|
||||
}
|
||||
|
||||
if (fileName.Contains("environment") || fileName.Contains("env."))
|
||||
{
|
||||
return ValidationType.EnvironmentConfig;
|
||||
}
|
||||
|
||||
if (fileName.Contains("workflow") || fileName.Contains("pipeline"))
|
||||
{
|
||||
return ValidationType.Workflow;
|
||||
}
|
||||
|
||||
// Check content patterns
|
||||
if (content.Contains("\"releases\"") || content.Contains("releases:"))
|
||||
{
|
||||
return ValidationType.ReleaseManifest;
|
||||
}
|
||||
|
||||
if (content.Contains("\"rules\"") || content.Contains("package "))
|
||||
{
|
||||
return ValidationType.Policy;
|
||||
}
|
||||
|
||||
// Default based on extension
|
||||
return extension switch
|
||||
{
|
||||
".json" or ".yaml" or ".yml" => ValidationType.Generic,
|
||||
".rego" => ValidationType.Policy,
|
||||
_ => ValidationType.Unknown
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<IReadOnlyList<ValidationError>> ValidateSchemaAsync(
|
||||
string content,
|
||||
ValidationType type,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var schema = await _schemaProvider.GetSchemaAsync(type, ct);
|
||||
if (schema is null)
|
||||
{
|
||||
return [];
|
||||
}
|
||||
|
||||
// Schema validation would be implemented here
|
||||
// This is a placeholder
|
||||
return [];
|
||||
}
|
||||
|
||||
private async Task<IReadOnlyList<ValidationError>> ValidateCrossReferencesAsync(
|
||||
string content,
|
||||
ValidationType type,
|
||||
string sourcePath,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var errors = new List<ValidationError>();
|
||||
|
||||
// Check for referenced files that should exist
|
||||
if (type == ValidationType.ReleaseManifest)
|
||||
{
|
||||
var baseDir = Path.GetDirectoryName(sourcePath) ?? ".";
|
||||
|
||||
// Parse and check referenced policy files
|
||||
// This would be more sophisticated in a real implementation
|
||||
}
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
private static bool IsConfigFile(string filePath)
|
||||
{
|
||||
var extension = Path.GetExtension(filePath).ToLowerInvariant();
|
||||
return extension is ".json" or ".yaml" or ".yml" or ".rego" or ".toml";
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for local validator.
|
||||
/// </summary>
|
||||
public sealed record LocalValidatorConfig
|
||||
{
|
||||
public bool EnableSchemaValidation { get; init; } = true;
|
||||
public bool EnableCrossReferenceValidation { get; init; } = true;
|
||||
public bool StrictMode { get; init; } = false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Types of configuration that can be validated.
|
||||
/// </summary>
|
||||
public enum ValidationType
|
||||
{
|
||||
Unknown,
|
||||
Generic,
|
||||
ReleaseManifest,
|
||||
Policy,
|
||||
EnvironmentConfig,
|
||||
Workflow,
|
||||
Secrets,
|
||||
GateConfig
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of validation.
|
||||
/// </summary>
|
||||
public sealed record ValidationResult
|
||||
{
|
||||
public required bool IsValid { get; init; }
|
||||
public string? FilePath { get; init; }
|
||||
public ValidationType ValidationType { get; init; }
|
||||
public ImmutableArray<ValidationError> Errors { get; init; } = [];
|
||||
public ImmutableArray<ValidationError> Warnings { get; init; } = [];
|
||||
public TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A validation error or warning.
|
||||
/// </summary>
|
||||
public sealed record ValidationError
|
||||
{
|
||||
public required string Code { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public required ValidationSeverity Severity { get; init; }
|
||||
public int? Line { get; init; }
|
||||
public int? Column { get; init; }
|
||||
public string? Path { get; init; }
|
||||
public string? Suggestion { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validation severity.
|
||||
/// </summary>
|
||||
public enum ValidationSeverity
|
||||
{
|
||||
Info,
|
||||
Warning,
|
||||
Error
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of directory validation.
|
||||
/// </summary>
|
||||
public sealed record DirectoryValidationResult
|
||||
{
|
||||
public required string DirectoryPath { get; init; }
|
||||
public required bool IsValid { get; init; }
|
||||
public required int TotalFiles { get; init; }
|
||||
public required int ValidFiles { get; init; }
|
||||
public required int InvalidFiles { get; init; }
|
||||
public required ImmutableArray<ValidationResult> Results { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result from a config validator.
|
||||
/// </summary>
|
||||
public sealed record ConfigValidatorResult
|
||||
{
|
||||
public ImmutableArray<ValidationError> Errors { get; init; } = [];
|
||||
public ImmutableArray<ValidationError> Warnings { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for config validators.
|
||||
/// </summary>
|
||||
public interface IConfigValidator
|
||||
{
|
||||
bool Supports(ValidationType type);
|
||||
Task<ConfigValidatorResult> ValidateAsync(string content, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for schema provider.
|
||||
/// </summary>
|
||||
public interface ISchemaProvider
|
||||
{
|
||||
Task<string?> GetSchemaAsync(ValidationType type, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validator for release manifests.
|
||||
/// </summary>
|
||||
public sealed class ReleaseManifestValidator : IConfigValidator
|
||||
{
|
||||
public bool Supports(ValidationType type) => type == ValidationType.ReleaseManifest;
|
||||
|
||||
public Task<ConfigValidatorResult> ValidateAsync(string content, CancellationToken ct = default)
|
||||
{
|
||||
var errors = new List<ValidationError>();
|
||||
var warnings = new List<ValidationError>();
|
||||
|
||||
try
|
||||
{
|
||||
using var doc = JsonDocument.Parse(content);
|
||||
var root = doc.RootElement;
|
||||
|
||||
// Check required fields
|
||||
if (!root.TryGetProperty("version", out _))
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "MISSING_VERSION",
|
||||
Message = "Release manifest must have a 'version' field",
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
|
||||
// Check for deprecated fields
|
||||
if (root.TryGetProperty("deprecated_field", out _))
|
||||
{
|
||||
warnings.Add(new ValidationError
|
||||
{
|
||||
Code = "DEPRECATED_FIELD",
|
||||
Message = "Field 'deprecated_field' is deprecated and will be removed in future versions",
|
||||
Severity = ValidationSeverity.Warning
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "INVALID_JSON",
|
||||
Message = ex.Message,
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
|
||||
return Task.FromResult(new ConfigValidatorResult
|
||||
{
|
||||
Errors = errors.ToImmutableArray(),
|
||||
Warnings = warnings.ToImmutableArray()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validator for policy files.
|
||||
/// </summary>
|
||||
public sealed class PolicyValidator : IConfigValidator
|
||||
{
|
||||
public bool Supports(ValidationType type) => type == ValidationType.Policy;
|
||||
|
||||
public Task<ConfigValidatorResult> ValidateAsync(string content, CancellationToken ct = default)
|
||||
{
|
||||
var errors = new List<ValidationError>();
|
||||
var warnings = new List<ValidationError>();
|
||||
|
||||
// Rego policy validation
|
||||
if (content.Contains("package "))
|
||||
{
|
||||
// Basic Rego syntax checks
|
||||
if (!content.Contains("default ") && !content.Contains(" = "))
|
||||
{
|
||||
warnings.Add(new ValidationError
|
||||
{
|
||||
Code = "NO_DEFAULT_RULE",
|
||||
Message = "Policy has no default rule - consider adding one for explicit deny/allow",
|
||||
Severity = ValidationSeverity.Warning
|
||||
});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// JSON policy validation
|
||||
try
|
||||
{
|
||||
using var doc = JsonDocument.Parse(content);
|
||||
// Validate policy structure
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "INVALID_POLICY",
|
||||
Message = ex.Message,
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return Task.FromResult(new ConfigValidatorResult
|
||||
{
|
||||
Errors = errors.ToImmutableArray(),
|
||||
Warnings = warnings.ToImmutableArray()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validator for environment configurations.
|
||||
/// </summary>
|
||||
public sealed class EnvironmentConfigValidator : IConfigValidator
|
||||
{
|
||||
public bool Supports(ValidationType type) => type == ValidationType.EnvironmentConfig;
|
||||
|
||||
public Task<ConfigValidatorResult> ValidateAsync(string content, CancellationToken ct = default)
|
||||
{
|
||||
var errors = new List<ValidationError>();
|
||||
var warnings = new List<ValidationError>();
|
||||
|
||||
try
|
||||
{
|
||||
using var doc = JsonDocument.Parse(content);
|
||||
var root = doc.RootElement;
|
||||
|
||||
// Check required fields
|
||||
if (!root.TryGetProperty("name", out _))
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "MISSING_NAME",
|
||||
Message = "Environment config must have a 'name' field",
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
|
||||
// Check for sensitive data exposure
|
||||
foreach (var prop in root.EnumerateObject())
|
||||
{
|
||||
var value = prop.Value.ToString();
|
||||
if (LooksLikeSecret(prop.Name, value))
|
||||
{
|
||||
warnings.Add(new ValidationError
|
||||
{
|
||||
Code = "POTENTIAL_SECRET",
|
||||
Message = $"Property '{prop.Name}' may contain sensitive data - consider using secrets management",
|
||||
Severity = ValidationSeverity.Warning,
|
||||
Path = prop.Name
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
errors.Add(new ValidationError
|
||||
{
|
||||
Code = "INVALID_JSON",
|
||||
Message = ex.Message,
|
||||
Severity = ValidationSeverity.Error
|
||||
});
|
||||
}
|
||||
|
||||
return Task.FromResult(new ConfigValidatorResult
|
||||
{
|
||||
Errors = errors.ToImmutableArray(),
|
||||
Warnings = warnings.ToImmutableArray()
|
||||
});
|
||||
}
|
||||
|
||||
private static bool LooksLikeSecret(string propertyName, string value)
|
||||
{
|
||||
var sensitiveNames = new[] { "password", "secret", "key", "token", "credential", "auth" };
|
||||
var nameMatches = sensitiveNames.Any(s =>
|
||||
propertyName.Contains(s, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
// Also check for base64-encoded or long random strings
|
||||
var looksLikeToken = value.Length > 20 &&
|
||||
!value.Contains(' ') &&
|
||||
!value.StartsWith("http");
|
||||
|
||||
return nameMatches || looksLikeToken;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,78 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentDoctorPlugin.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Doctor plugin for agent fleet health monitoring
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// Doctor plugin for agent fleet health monitoring.
|
||||
/// Monitors agent connectivity, certificates, capacity, and overall fleet health.
|
||||
/// </summary>
|
||||
public sealed class AgentDoctorPlugin : IDoctorPlugin
|
||||
{
|
||||
private static readonly Version PluginVersion = new(1, 0, 0);
|
||||
private static readonly Version MinVersion = new(1, 0, 0);
|
||||
|
||||
/// <inheritdoc />
|
||||
public string PluginId => "stellaops.doctor.agent";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string DisplayName => "Agent Fleet";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorCategory Category => DoctorCategory.Infrastructure;
|
||||
|
||||
/// <inheritdoc />
|
||||
public Version Version => PluginVersion;
|
||||
|
||||
/// <inheritdoc />
|
||||
public Version MinEngineVersion => MinVersion;
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool IsAvailable(IServiceProvider services)
|
||||
{
|
||||
// Always available - individual checks handle their own availability
|
||||
return true;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<IDoctorCheck> GetChecks(DoctorPluginContext context)
|
||||
{
|
||||
return new IDoctorCheck[]
|
||||
{
|
||||
// Connectivity checks
|
||||
new AgentHeartbeatFreshnessCheck(),
|
||||
new StaleAgentCheck(),
|
||||
|
||||
// Security checks
|
||||
new AgentCertificateExpiryCheck(),
|
||||
new AgentCertificateValidityCheck(),
|
||||
|
||||
// Capacity checks
|
||||
new AgentCapacityCheck(),
|
||||
new TaskQueueBacklogCheck(),
|
||||
new FailedTaskRateCheck(),
|
||||
|
||||
// Fleet health checks
|
||||
new AgentVersionConsistencyCheck(),
|
||||
new AgentResourceUtilizationCheck(),
|
||||
|
||||
// Cluster checks (when clustering is enabled)
|
||||
new AgentClusterHealthCheck(),
|
||||
new AgentClusterQuorumCheck()
|
||||
};
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task InitializeAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
// No initialization required
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,167 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentCapacityCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks if agents have sufficient capacity for tasks
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Checks if agents have sufficient capacity to handle incoming tasks.
|
||||
/// </summary>
|
||||
public sealed class AgentCapacityCheck : IDoctorCheck
|
||||
{
|
||||
private const double HighUtilizationThreshold = 0.9;
|
||||
private const double WarningUtilizationThreshold = 0.75;
|
||||
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.capacity";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Capacity";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Verify agents have sufficient capacity for tasks";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "capacity", "performance"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
return context.Services.GetService<IAgentStore>() != null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var agentStore = context.Services.GetRequiredService<IAgentStore>();
|
||||
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
var agents = await agentStore.GetAllAsync(ct);
|
||||
var activeAgents = agents
|
||||
.Where(a => a.Status == AgentStatus.Online)
|
||||
.ToList();
|
||||
|
||||
if (activeAgents.Count == 0)
|
||||
{
|
||||
return builder
|
||||
.Fail("No online agents available to handle tasks")
|
||||
.WithEvidence("Agent capacity", eb => eb
|
||||
.Add("OnlineAgents", "0")
|
||||
.Add("TotalAgents", agents.Count.ToString(CultureInfo.InvariantCulture)))
|
||||
.WithCauses(
|
||||
"All agents are offline",
|
||||
"No agents have been registered")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Check agent heartbeat status",
|
||||
"stella doctor --check check.agent.heartbeat.freshness",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Bootstrap new agents if needed",
|
||||
"stella agent bootstrap --name <name> --env <env>",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
var overloadedAgents = new List<string>();
|
||||
var warningAgents = new List<string>();
|
||||
var totalCapacity = 0;
|
||||
var totalUtilized = 0;
|
||||
|
||||
foreach (var agent in activeAgents)
|
||||
{
|
||||
totalCapacity += agent.MaxConcurrentTasks;
|
||||
totalUtilized += agent.ActiveTaskCount;
|
||||
|
||||
var utilization = agent.MaxConcurrentTasks > 0
|
||||
? (double)agent.ActiveTaskCount / agent.MaxConcurrentTasks
|
||||
: 0;
|
||||
|
||||
if (utilization >= HighUtilizationThreshold)
|
||||
{
|
||||
overloadedAgents.Add($"{agent.Name} ({agent.ActiveTaskCount}/{agent.MaxConcurrentTasks})");
|
||||
}
|
||||
else if (utilization >= WarningUtilizationThreshold)
|
||||
{
|
||||
warningAgents.Add($"{agent.Name} ({agent.ActiveTaskCount}/{agent.MaxConcurrentTasks})");
|
||||
}
|
||||
}
|
||||
|
||||
var overallUtilization = totalCapacity > 0 ? (double)totalUtilized / totalCapacity : 0;
|
||||
|
||||
if (overallUtilization >= HighUtilizationThreshold)
|
||||
{
|
||||
return builder
|
||||
.Fail($"Fleet capacity critically low ({overallUtilization:P0} utilized)")
|
||||
.WithEvidence("Agent capacity", eb => eb
|
||||
.Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Utilization", overallUtilization.ToString("P1", CultureInfo.InvariantCulture))
|
||||
.Add("OverloadedAgents", string.Join(", ", overloadedAgents)))
|
||||
.WithCauses(
|
||||
"Too many concurrent deployments",
|
||||
"Insufficient agent capacity",
|
||||
"Tasks taking longer than expected")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Add more agents to increase capacity",
|
||||
"stella agent bootstrap --name <name> --env <env>",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Review and optimize long-running tasks",
|
||||
"stella task list --status running --sort duration",
|
||||
CommandType.Shell)
|
||||
.AddStep(3, "Consider increasing max concurrent tasks per agent",
|
||||
"stella agent config --agent-id <id> --set max_concurrent_tasks=10",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
if (overloadedAgents.Count > 0 || overallUtilization >= WarningUtilizationThreshold)
|
||||
{
|
||||
return builder
|
||||
.Warn($"Fleet capacity at {overallUtilization:P0}")
|
||||
.WithEvidence("Agent capacity", eb => eb
|
||||
.Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Utilization", overallUtilization.ToString("P1", CultureInfo.InvariantCulture))
|
||||
.Add("OverloadedAgents", overloadedAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("WarningAgents", warningAgents.Count.ToString(CultureInfo.InvariantCulture)))
|
||||
.WithCauses(
|
||||
"High deployment activity",
|
||||
"Approaching capacity limits")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Monitor capacity trend",
|
||||
"stella agent list --format table",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Consider scaling if trend continues",
|
||||
"stella agent bootstrap --name <name> --env <env>",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
return builder
|
||||
.Pass($"Fleet capacity healthy ({overallUtilization:P0} utilized)")
|
||||
.WithEvidence("Agent capacity", eb => eb
|
||||
.Add("TotalCapacity", totalCapacity.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("TotalUtilized", totalUtilized.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Utilization", overallUtilization.ToString("P1", CultureInfo.InvariantCulture))
|
||||
.Add("OnlineAgents", activeAgents.Count.ToString(CultureInfo.InvariantCulture)))
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,189 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentCertificateExpiryCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks if agent certificates are expiring soon
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Checks if any agent certificates are expired or expiring soon.
|
||||
/// </summary>
|
||||
public sealed class AgentCertificateExpiryCheck : IDoctorCheck
|
||||
{
|
||||
private static readonly TimeSpan WarningThreshold = TimeSpan.FromDays(7);
|
||||
private static readonly TimeSpan CriticalThreshold = TimeSpan.FromDays(1);
|
||||
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.certificate.expiry";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Certificate Expiry";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Verify agent certificates are not expired or expiring soon";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "certificate", "security", "quick"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
return context.Services.GetService<IAgentStore>() != null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var agentStore = context.Services.GetRequiredService<IAgentStore>();
|
||||
var timeProvider = context.Services.GetRequiredService<TimeProvider>();
|
||||
var now = timeProvider.GetUtcNow();
|
||||
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
var agents = await agentStore.GetAllAsync(ct);
|
||||
var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();
|
||||
|
||||
if (activeAgents.Count == 0)
|
||||
{
|
||||
return builder
|
||||
.Skip("No active agents to check")
|
||||
.Build();
|
||||
}
|
||||
|
||||
var expiredAgents = new List<(string Name, TimeSpan ExpiredAgo)>();
|
||||
var criticalAgents = new List<(string Name, TimeSpan ExpiresIn)>();
|
||||
var warningAgents = new List<(string Name, TimeSpan ExpiresIn)>();
|
||||
|
||||
foreach (var agent in activeAgents)
|
||||
{
|
||||
if (agent.CertificateExpiry == default)
|
||||
{
|
||||
continue; // Certificate info not available
|
||||
}
|
||||
|
||||
var expiresIn = agent.CertificateExpiry - now;
|
||||
|
||||
if (expiresIn <= TimeSpan.Zero)
|
||||
{
|
||||
expiredAgents.Add((agent.Name, -expiresIn));
|
||||
}
|
||||
else if (expiresIn <= CriticalThreshold)
|
||||
{
|
||||
criticalAgents.Add((agent.Name, expiresIn));
|
||||
}
|
||||
else if (expiresIn <= WarningThreshold)
|
||||
{
|
||||
warningAgents.Add((agent.Name, expiresIn));
|
||||
}
|
||||
}
|
||||
|
||||
if (expiredAgents.Count > 0)
|
||||
{
|
||||
var expiredList = expiredAgents
|
||||
.Select(a => $"{a.Name} (expired {a.ExpiredAgo.TotalDays:F0} days ago)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Fail($"{expiredAgents.Count} agent(s) have expired certificates")
|
||||
.WithEvidence("Agent certificate status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Expired", expiredAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Critical", criticalAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("ExpiredAgents", string.Join(", ", expiredList)))
|
||||
.WithCauses(
|
||||
"Certificate auto-renewal is disabled",
|
||||
"Agent was offline when renewal was due",
|
||||
"Certificate authority is unreachable",
|
||||
"Agent bootstrap was incomplete")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Force certificate renewal on the affected agent",
|
||||
"stella agent renew-cert --agent-id <agent-id> --force",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "If agent is unreachable, re-bootstrap",
|
||||
"stella agent bootstrap --name <agent-name> --env <environment>",
|
||||
CommandType.Shell)
|
||||
.AddStep(3, "Verify auto-renewal is enabled",
|
||||
"stella agent config --agent-id <agent-id> | grep auto_renew",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.WithRunbookUrl("https://docs.stella-ops.org/runbooks/agent-cert-expired")
|
||||
.Build();
|
||||
}
|
||||
|
||||
if (criticalAgents.Count > 0)
|
||||
{
|
||||
var criticalList = criticalAgents
|
||||
.Select(a => $"{a.Name} (expires in {a.ExpiresIn.TotalHours:F0} hours)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Fail($"{criticalAgents.Count} agent(s) have certificates expiring within 24 hours")
|
||||
.WithEvidence("Agent certificate status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Critical", criticalAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("CriticalAgents", string.Join(", ", criticalList)))
|
||||
.WithCauses(
|
||||
"Certificate auto-renewal failed",
|
||||
"Agent has been offline",
|
||||
"Certificate authority rate limiting")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Manually trigger certificate renewal",
|
||||
"stella agent renew-cert --agent-id <agent-id>",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Check agent logs for renewal failures",
|
||||
"stella agent logs --agent-id <agent-id> --level warn",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
if (warningAgents.Count > 0)
|
||||
{
|
||||
var warningList = warningAgents
|
||||
.Select(a => $"{a.Name} (expires in {a.ExpiresIn.TotalDays:F0} days)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Warn($"{warningAgents.Count} agent(s) have certificates expiring within 7 days")
|
||||
.WithEvidence("Agent certificate status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("WarningAgents", string.Join(", ", warningList)))
|
||||
.WithCauses(
|
||||
"Certificate renewal threshold not reached yet",
|
||||
"Agent auto-renewal scheduled but not yet triggered")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Monitor certificate renewal",
|
||||
"stella agent health <agent-id>",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Optionally force early renewal",
|
||||
"stella agent renew-cert --agent-id <agent-id>",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
return builder
|
||||
.Pass("All agent certificates are valid")
|
||||
.WithEvidence("Agent certificate status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("AllValid", "true"))
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentCertificateValidityCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Validates agent certificate chain and trust
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Validates agent certificate chain and trust relationships.
|
||||
/// </summary>
|
||||
public sealed class AgentCertificateValidityCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.certificate.validity";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Certificate Validity";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Verify agent certificates have valid chain of trust";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "certificate", "security"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(5);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
return context.Services.GetService<IAgentStore>() != null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
// TODO: Implement certificate chain validation
|
||||
// This check verifies:
|
||||
// 1. Certificate is signed by trusted CA
|
||||
// 2. Certificate chain is complete
|
||||
// 3. No revoked certificates in chain
|
||||
// 4. Certificate is for correct agent identity
|
||||
|
||||
return builder
|
||||
.Pass("Certificate validity check - implementation pending")
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentClusterHealthCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors agent cluster health (when clustering is enabled)
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Monitors agent cluster health when clustering is enabled.
|
||||
/// </summary>
|
||||
public sealed class AgentClusterHealthCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.cluster.health";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Cluster Health";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Monitor agent cluster membership and health";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "cluster", "ha", "resilience"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
// Only run if clustering is enabled
|
||||
var clusteringEnabled = context.Configuration["Agent:Cluster:Enabled"];
|
||||
return clusteringEnabled?.Equals("true", StringComparison.OrdinalIgnoreCase) == true;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
// TODO: Implement cluster health monitoring
|
||||
// This check verifies:
|
||||
// 1. All cluster members are reachable
|
||||
// 2. Leader is elected and healthy
|
||||
// 3. State sync is working
|
||||
// 4. Failover is possible if needed
|
||||
|
||||
return builder
|
||||
.Skip("Clustering not enabled or check implementation pending")
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentClusterQuorumCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Verifies agent cluster has quorum for leader election
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Verifies agent cluster has sufficient members for quorum.
|
||||
/// </summary>
|
||||
public sealed class AgentClusterQuorumCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.cluster.quorum";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Cluster Quorum";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Verify agent cluster has quorum for leader election";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "cluster", "quorum", "ha"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
// Only run if clustering is enabled
|
||||
var clusteringEnabled = context.Configuration["Agent:Cluster:Enabled"];
|
||||
return clusteringEnabled?.Equals("true", StringComparison.OrdinalIgnoreCase) == true;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
// TODO: Implement quorum check
|
||||
// This check verifies:
|
||||
// 1. Minimum members are online (n/2 + 1 for odd, or configured minimum)
|
||||
// 2. Leader election is possible
|
||||
// 3. Split-brain prevention is active
|
||||
|
||||
return builder
|
||||
.Skip("Clustering not enabled or check implementation pending")
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,179 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentHeartbeatFreshnessCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks if all agents have fresh heartbeats
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Checks if all registered agents have recent heartbeats.
|
||||
/// </summary>
|
||||
public sealed class AgentHeartbeatFreshnessCheck : IDoctorCheck
|
||||
{
|
||||
private static readonly TimeSpan StaleThreshold = TimeSpan.FromMinutes(5);
|
||||
private static readonly TimeSpan WarningThreshold = TimeSpan.FromMinutes(2);
|
||||
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.heartbeat.freshness";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Heartbeat Freshness";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Verify all agents have recent heartbeats";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Fail;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "heartbeat", "connectivity", "quick"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
return context.Services.GetService<IAgentStore>() != null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var agentStore = context.Services.GetRequiredService<IAgentStore>();
|
||||
var timeProvider = context.Services.GetRequiredService<TimeProvider>();
|
||||
var now = timeProvider.GetUtcNow();
|
||||
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
var agents = await agentStore.GetAllAsync(ct);
|
||||
var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();
|
||||
|
||||
if (activeAgents.Count == 0)
|
||||
{
|
||||
return builder
|
||||
.Warn("No active agents registered")
|
||||
.WithEvidence("Agent status", eb => eb
|
||||
.Add("TotalAgents", agents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("ActiveAgents", "0"))
|
||||
.WithCauses(
|
||||
"No agents have been registered",
|
||||
"All agents have been deactivated")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Bootstrap a new agent",
|
||||
"stella agent bootstrap --name agent-01 --env production --platform linux",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Check agent registration status",
|
||||
"stella agent list --all",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
var staleAgents = new List<(string Name, TimeSpan Age)>();
|
||||
var warningAgents = new List<(string Name, TimeSpan Age)>();
|
||||
var healthyAgents = new List<string>();
|
||||
|
||||
foreach (var agent in activeAgents)
|
||||
{
|
||||
var heartbeatAge = now - agent.LastHeartbeat;
|
||||
|
||||
if (heartbeatAge > StaleThreshold)
|
||||
{
|
||||
staleAgents.Add((agent.Name, heartbeatAge));
|
||||
}
|
||||
else if (heartbeatAge > WarningThreshold)
|
||||
{
|
||||
warningAgents.Add((agent.Name, heartbeatAge));
|
||||
}
|
||||
else
|
||||
{
|
||||
healthyAgents.Add(agent.Name);
|
||||
}
|
||||
}
|
||||
|
||||
if (staleAgents.Count > 0)
|
||||
{
|
||||
var staleList = staleAgents
|
||||
.Select(a => $"{a.Name} (last heartbeat: {a.Age.TotalMinutes:F0}m ago)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Fail($"{staleAgents.Count} agent(s) have stale heartbeats")
|
||||
.WithEvidence("Agent heartbeat status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Stale", staleAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Healthy", healthyAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("StaleAgents", string.Join(", ", staleList)))
|
||||
.WithCauses(
|
||||
"Agent process has crashed or stopped",
|
||||
"Network connectivity issue between agent and orchestrator",
|
||||
"Firewall blocking agent heartbeats",
|
||||
"Agent host is unreachable or powered off",
|
||||
"mTLS certificate has expired")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Check agent status on the host",
|
||||
"systemctl status stella-agent",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "View agent logs for errors",
|
||||
"journalctl -u stella-agent --since '10 minutes ago'",
|
||||
CommandType.Shell)
|
||||
.AddStep(3, "Run agent diagnostics",
|
||||
"stella agent doctor",
|
||||
CommandType.Shell)
|
||||
.AddStep(4, "Check network connectivity to orchestrator",
|
||||
"curl -k https://orchestrator:8443/health",
|
||||
CommandType.Shell)
|
||||
.AddStep(5, "If certificate expired, renew it",
|
||||
"stella agent renew-cert --force",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.WithRunbookUrl("https://docs.stella-ops.org/runbooks/agent-stale-heartbeat")
|
||||
.Build();
|
||||
}
|
||||
|
||||
if (warningAgents.Count > 0)
|
||||
{
|
||||
var warningList = warningAgents
|
||||
.Select(a => $"{a.Name} ({a.Age.TotalSeconds:F0}s ago)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Warn($"{warningAgents.Count} agent(s) have delayed heartbeats")
|
||||
.WithEvidence("Agent heartbeat status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Warning", warningAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Healthy", healthyAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("DelayedAgents", string.Join(", ", warningList)))
|
||||
.WithCauses(
|
||||
"Agent is under heavy load",
|
||||
"Network latency between agent and orchestrator",
|
||||
"Agent is processing long-running tasks")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Check agent resource utilization",
|
||||
"stella agent health <agent-id>",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Monitor heartbeat trend",
|
||||
"stella agent logs --agent-id <agent-id> --tail 50",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
return builder
|
||||
.Pass($"All {activeAgents.Count} agents have fresh heartbeats")
|
||||
.WithEvidence("Agent heartbeat status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("AllHealthy", "true"))
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentResourceUtilizationCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors resource utilization across agent fleet
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Monitors CPU, memory, and disk utilization across agent fleet.
|
||||
/// </summary>
|
||||
public sealed class AgentResourceUtilizationCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.resource.utilization";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Resource Utilization";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Monitor CPU, memory, and disk utilization across agents";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "resource", "performance", "capacity"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context) => true;
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
// TODO: Implement resource utilization monitoring
|
||||
// This check verifies:
|
||||
// 1. CPU utilization per agent
|
||||
// 2. Memory utilization per agent
|
||||
// 3. Disk space per agent
|
||||
// 4. Resource trends
|
||||
|
||||
return builder
|
||||
.Pass("Resource utilization check - implementation pending")
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,122 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentVersionConsistencyCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks for version consistency across agent fleet
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Checks for version consistency across the agent fleet.
|
||||
/// Detects version skew that could cause compatibility issues.
|
||||
/// </summary>
|
||||
public sealed class AgentVersionConsistencyCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.version.consistency";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Agent Version Consistency";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Verify all agents are running compatible versions";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "version", "maintenance"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
return context.Services.GetService<IAgentStore>() != null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var agentStore = context.Services.GetRequiredService<IAgentStore>();
|
||||
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
var agents = await agentStore.GetAllAsync(ct);
|
||||
var activeAgents = agents
|
||||
.Where(a => a.Status != AgentStatus.Deactivated)
|
||||
.ToList();
|
||||
|
||||
if (activeAgents.Count == 0)
|
||||
{
|
||||
return builder
|
||||
.Skip("No active agents to check")
|
||||
.Build();
|
||||
}
|
||||
|
||||
var versionGroups = activeAgents
|
||||
.GroupBy(a => a.Version ?? "unknown")
|
||||
.OrderByDescending(g => g.Count())
|
||||
.ToList();
|
||||
|
||||
var majorVersion = versionGroups.First().Key;
|
||||
var majorCount = versionGroups.First().Count();
|
||||
|
||||
if (versionGroups.Count == 1)
|
||||
{
|
||||
return builder
|
||||
.Pass($"All {activeAgents.Count} agents running version {majorVersion}")
|
||||
.WithEvidence("Agent versions", eb => eb
|
||||
.Add("Version", majorVersion)
|
||||
.Add("AgentCount", activeAgents.Count.ToString(CultureInfo.InvariantCulture)))
|
||||
.Build();
|
||||
}
|
||||
|
||||
var outdatedAgents = versionGroups
|
||||
.Skip(1)
|
||||
.SelectMany(g => g.Select(a => $"{a.Name} ({g.Key})"))
|
||||
.ToList();
|
||||
|
||||
var versionSummary = versionGroups
|
||||
.Select(g => $"{g.Key}: {g.Count()}")
|
||||
.ToList();
|
||||
|
||||
if (versionGroups.Count > 2 || outdatedAgents.Count > activeAgents.Count / 2)
|
||||
{
|
||||
return builder
|
||||
.Warn($"Significant version skew detected ({versionGroups.Count} versions)")
|
||||
.WithEvidence("Agent versions", eb => eb
|
||||
.Add("MajorityVersion", majorVersion)
|
||||
.Add("VersionDistribution", string.Join(", ", versionSummary))
|
||||
.Add("OutdatedAgents", string.Join(", ", outdatedAgents.Take(10))))
|
||||
.WithCauses(
|
||||
"Auto-update is disabled on some agents",
|
||||
"Some agents failed to update",
|
||||
"Phased rollout in progress")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Update outdated agents",
|
||||
"stella agent update --version <target-version> --agent-id <id>",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Enable auto-update if appropriate",
|
||||
"stella agent config --agent-id <id> --set auto_update.enabled=true",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
return builder
|
||||
.Pass($"Minor version skew acceptable ({versionGroups.Count} versions)")
|
||||
.WithEvidence("Agent versions", eb => eb
|
||||
.Add("MajorityVersion", majorVersion)
|
||||
.Add("VersionDistribution", string.Join(", ", versionSummary)))
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// FailedTaskRateCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors task failure rate across agents
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Monitors task failure rate to detect systemic issues.
|
||||
/// </summary>
|
||||
public sealed class FailedTaskRateCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.task.failure.rate";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Task Failure Rate";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Monitor task failure rate across agent fleet";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "task", "failure", "reliability"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context) => true;
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
// TODO: Implement task failure rate monitoring
|
||||
// This check verifies:
|
||||
// 1. Overall task failure rate (last hour)
|
||||
// 2. Per-agent failure rate
|
||||
// 3. Failure rate trend (increasing/decreasing)
|
||||
// 4. Common failure reasons
|
||||
|
||||
return builder
|
||||
.Pass("Task failure rate check - implementation pending")
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// StaleAgentCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Checks for agents that have been stale for extended periods
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
using StellaOps.ReleaseOrchestrator.Agent.Store;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Checks for agents that have been stale (offline) for extended periods
|
||||
/// and may need to be decommissioned or investigated.
|
||||
/// </summary>
|
||||
public sealed class StaleAgentCheck : IDoctorCheck
|
||||
{
|
||||
private static readonly TimeSpan StaleThreshold = TimeSpan.FromHours(1);
|
||||
private static readonly TimeSpan DecommissionThreshold = TimeSpan.FromDays(7);
|
||||
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.stale";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Stale Agent Detection";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Detect agents that have been offline for extended periods";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "maintenance", "cleanup"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(2);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context)
|
||||
{
|
||||
return context.Services.GetService<IAgentStore>() != null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var agentStore = context.Services.GetRequiredService<IAgentStore>();
|
||||
var timeProvider = context.Services.GetRequiredService<TimeProvider>();
|
||||
var now = timeProvider.GetUtcNow();
|
||||
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
var agents = await agentStore.GetAllAsync(ct);
|
||||
var activeAgents = agents.Where(a => a.Status != AgentStatus.Deactivated).ToList();
|
||||
|
||||
var decommissionCandidates = new List<(string Name, TimeSpan OfflineFor)>();
|
||||
var staleAgents = new List<(string Name, TimeSpan OfflineFor)>();
|
||||
|
||||
foreach (var agent in activeAgents)
|
||||
{
|
||||
var offlineFor = now - agent.LastHeartbeat;
|
||||
|
||||
if (offlineFor > DecommissionThreshold)
|
||||
{
|
||||
decommissionCandidates.Add((agent.Name, offlineFor));
|
||||
}
|
||||
else if (offlineFor > StaleThreshold)
|
||||
{
|
||||
staleAgents.Add((agent.Name, offlineFor));
|
||||
}
|
||||
}
|
||||
|
||||
if (decommissionCandidates.Count > 0)
|
||||
{
|
||||
var decommList = decommissionCandidates
|
||||
.Select(a => $"{a.Name} (offline {a.OfflineFor.TotalDays:F0} days)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Warn($"{decommissionCandidates.Count} agent(s) may need decommissioning")
|
||||
.WithEvidence("Stale agent status", eb => eb
|
||||
.Add("DecommissionCandidates", decommissionCandidates.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("StaleAgents", staleAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Agents", string.Join(", ", decommList)))
|
||||
.WithCauses(
|
||||
"Agent host has been permanently removed",
|
||||
"Agent was replaced but not deactivated",
|
||||
"Infrastructure change without cleanup")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Review stale agents",
|
||||
"stella agent list --status stale",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Deactivate agents that are no longer needed",
|
||||
"stella agent deactivate --agent-id <agent-id>",
|
||||
CommandType.Shell)
|
||||
.AddStep(3, "If agent should be active, investigate host",
|
||||
"ssh <agent-host> 'systemctl status stella-agent'",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
if (staleAgents.Count > 0)
|
||||
{
|
||||
var staleList = staleAgents
|
||||
.Select(a => $"{a.Name} (offline {a.OfflineFor.TotalHours:F0} hours)")
|
||||
.ToList();
|
||||
|
||||
return builder
|
||||
.Warn($"{staleAgents.Count} agent(s) have been offline for over an hour")
|
||||
.WithEvidence("Stale agent status", eb => eb
|
||||
.Add("StaleAgents", staleAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("Agents", string.Join(", ", staleList)))
|
||||
.WithCauses(
|
||||
"Agent host is undergoing maintenance",
|
||||
"Network partition",
|
||||
"Agent process crash without auto-restart")
|
||||
.WithRemediation(rb => rb
|
||||
.AddStep(1, "Check agent host status",
|
||||
"ping <agent-host>",
|
||||
CommandType.Shell)
|
||||
.AddStep(2, "Restart agent service",
|
||||
"ssh <agent-host> 'systemctl restart stella-agent'",
|
||||
CommandType.Shell))
|
||||
.WithVerification($"stella doctor --check {CheckId}")
|
||||
.Build();
|
||||
}
|
||||
|
||||
return builder
|
||||
.Pass("No stale agents detected")
|
||||
.WithEvidence("Stale agent status", eb => eb
|
||||
.Add("TotalActive", activeAgents.Count.ToString(CultureInfo.InvariantCulture))
|
||||
.Add("AllHealthy", "true"))
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// TaskQueueBacklogCheck.cs
|
||||
// Sprint: SPRINT_20260117_041_ReleaseOrchestrator_agent_operations
|
||||
// Task: TASK-041-09 - Server-Side Doctor Plugin
|
||||
// Description: Monitors task queue backlog across agents
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Doctor.Models;
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugin.Agent.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Monitors task queue backlog to detect capacity issues.
|
||||
/// </summary>
|
||||
public sealed class TaskQueueBacklogCheck : IDoctorCheck
|
||||
{
|
||||
/// <inheritdoc />
|
||||
public string CheckId => "check.agent.task.backlog";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Name => "Task Queue Backlog";
|
||||
|
||||
/// <inheritdoc />
|
||||
public string Description => "Monitor pending task queue depth across agents";
|
||||
|
||||
/// <inheritdoc />
|
||||
public DoctorSeverity DefaultSeverity => DoctorSeverity.Warn;
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyList<string> Tags => ["agent", "task", "queue", "capacity"];
|
||||
|
||||
/// <inheritdoc />
|
||||
public TimeSpan EstimatedDuration => TimeSpan.FromSeconds(3);
|
||||
|
||||
/// <inheritdoc />
|
||||
public bool CanRun(DoctorPluginContext context) => true;
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DoctorCheckResult> RunAsync(DoctorPluginContext context, CancellationToken ct)
|
||||
{
|
||||
var builder = context.CreateResult(CheckId, "stellaops.doctor.agent", "Agent Fleet");
|
||||
|
||||
// TODO: Implement task queue backlog monitoring
|
||||
// This check verifies:
|
||||
// 1. Total queued tasks across fleet
|
||||
// 2. Age of oldest queued task
|
||||
// 3. Queue growth rate trend
|
||||
|
||||
return builder
|
||||
.Pass("Task queue backlog check - implementation pending")
|
||||
.Build();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<RootNamespace>StellaOps.Doctor.Plugin.Agent</RootNamespace>
|
||||
<Description>Agent fleet health checks for Stella Ops Doctor diagnostics</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Doctor\StellaOps.Doctor.csproj" />
|
||||
<ProjectReference Include="..\..\..\ReleaseOrchestrator\__Libraries\StellaOps.ReleaseOrchestrator.Agent\StellaOps.ReleaseOrchestrator.Agent.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,319 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins.Agent;
|
||||
|
||||
/// <summary>
|
||||
/// Server-side Doctor plugin for agent fleet health monitoring.
|
||||
/// </summary>
|
||||
public sealed class AgentHealthPlugin : IDoctorPlugin
|
||||
{
|
||||
private readonly IAgentFleetService _fleetService;
|
||||
private readonly AgentHealthPluginOptions _options;
|
||||
|
||||
public AgentHealthPlugin(
|
||||
IAgentFleetService fleetService,
|
||||
AgentHealthPluginOptions? options = null)
|
||||
{
|
||||
_fleetService = fleetService;
|
||||
_options = options ?? new AgentHealthPluginOptions();
|
||||
}
|
||||
|
||||
public string Name => "AgentHealth";
|
||||
public string Description => "Monitors agent fleet health";
|
||||
public string[] Categories => ["fleet", "agents", "infrastructure"];
|
||||
|
||||
public async Task<IReadOnlyList<DoctorCheckResult>> RunChecksAsync(
|
||||
DoctorContext context,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var results = new List<DoctorCheckResult>();
|
||||
|
||||
// Run all fleet health checks
|
||||
results.Add(await CheckHeartbeatFreshnessAsync(cancellationToken));
|
||||
results.Add(await CheckCertificateExpiryAsync(cancellationToken));
|
||||
results.Add(await CheckVersionConsistencyAsync(cancellationToken));
|
||||
results.Add(await CheckAgentCapacityAsync(cancellationToken));
|
||||
results.Add(await CheckStaleAgentsAsync(cancellationToken));
|
||||
results.Add(await CheckTaskQueueBacklogAsync(cancellationToken));
|
||||
results.Add(await CheckFailedTaskRateAsync(cancellationToken));
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckHeartbeatFreshnessAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);
|
||||
var staleAgents = agents
|
||||
.Where(a => a.LastHeartbeat < DateTimeOffset.UtcNow - _options.HeartbeatStaleThreshold)
|
||||
.ToList();
|
||||
|
||||
if (staleAgents.Count == 0)
|
||||
{
|
||||
return DoctorCheckResult.Pass("AgentHeartbeatFreshness",
|
||||
$"All {agents.Count} agents have recent heartbeats");
|
||||
}
|
||||
|
||||
var severity = staleAgents.Count > agents.Count / 2
|
||||
? DoctorSeverity.Critical
|
||||
: DoctorSeverity.Warning;
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "AgentHeartbeatFreshness",
|
||||
Severity = severity,
|
||||
Message = $"{staleAgents.Count} of {agents.Count} agents have stale heartbeats",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["staleAgents"] = staleAgents.Select(a => a.Id).ToList(),
|
||||
["threshold"] = _options.HeartbeatStaleThreshold.TotalMinutes
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckCertificateExpiryAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);
|
||||
var expiringAgents = agents
|
||||
.Where(a => a.CertificateExpiresAt.HasValue &&
|
||||
a.CertificateExpiresAt.Value < DateTimeOffset.UtcNow.AddDays(_options.CertificateWarningDays))
|
||||
.ToList();
|
||||
|
||||
if (expiringAgents.Count == 0)
|
||||
{
|
||||
return DoctorCheckResult.Pass("AgentCertificateExpiry",
|
||||
"No agent certificates expiring soon");
|
||||
}
|
||||
|
||||
var expiredCount = expiringAgents.Count(a =>
|
||||
a.CertificateExpiresAt < DateTimeOffset.UtcNow);
|
||||
|
||||
var severity = expiredCount > 0 ? DoctorSeverity.Critical : DoctorSeverity.Warning;
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "AgentCertificateExpiry",
|
||||
Severity = severity,
|
||||
Message = expiredCount > 0
|
||||
? $"{expiredCount} agents have expired certificates"
|
||||
: $"{expiringAgents.Count} agents have certificates expiring within {_options.CertificateWarningDays} days",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["expiringAgents"] = expiringAgents.Select(a => new { a.Id, a.CertificateExpiresAt }).ToList()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckVersionConsistencyAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);
|
||||
var versionGroups = agents
|
||||
.GroupBy(a => a.Version)
|
||||
.OrderByDescending(g => g.Count())
|
||||
.ToList();
|
||||
|
||||
if (versionGroups.Count <= 1)
|
||||
{
|
||||
return DoctorCheckResult.Pass("AgentVersionConsistency",
|
||||
$"All agents running version {versionGroups.FirstOrDefault()?.Key ?? "unknown"}");
|
||||
}
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "AgentVersionConsistency",
|
||||
Severity = DoctorSeverity.Warning,
|
||||
Message = $"Version skew detected: {versionGroups.Count} different versions running",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["versions"] = versionGroups.Select(g => new { Version = g.Key, Count = g.Count() }).ToList()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckAgentCapacityAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);
|
||||
var overloadedAgents = agents
|
||||
.Where(a => a.CurrentTasks >= a.MaxConcurrentTasks)
|
||||
.ToList();
|
||||
|
||||
if (overloadedAgents.Count == 0)
|
||||
{
|
||||
return DoctorCheckResult.Pass("AgentCapacity", "All agents have available capacity");
|
||||
}
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "AgentCapacity",
|
||||
Severity = overloadedAgents.Count > agents.Count / 2
|
||||
? DoctorSeverity.Warning
|
||||
: DoctorSeverity.Info,
|
||||
Message = $"{overloadedAgents.Count} agents at maximum capacity",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["overloadedAgents"] = overloadedAgents.Select(a => a.Id).ToList()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckStaleAgentsAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var agents = await _fleetService.GetAllAgentsAsync(cancellationToken);
|
||||
var disconnectedAgents = agents
|
||||
.Where(a => a.Status == AgentFleetStatus.Disconnected &&
|
||||
a.DisconnectedAt < DateTimeOffset.UtcNow.AddDays(-7))
|
||||
.ToList();
|
||||
|
||||
if (disconnectedAgents.Count == 0)
|
||||
{
|
||||
return DoctorCheckResult.Pass("StaleAgents", "No stale disconnected agents");
|
||||
}
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "StaleAgents",
|
||||
Severity = DoctorSeverity.Info,
|
||||
Message = $"{disconnectedAgents.Count} agents disconnected for more than 7 days",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["staleAgents"] = disconnectedAgents.Select(a => new { a.Id, a.DisconnectedAt }).ToList()
|
||||
},
|
||||
Recommendation = "Consider removing stale agents or investigating connectivity issues"
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckTaskQueueBacklogAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var queueStats = await _fleetService.GetTaskQueueStatsAsync(cancellationToken);
|
||||
|
||||
if (queueStats.PendingTasks < _options.TaskQueueWarningThreshold)
|
||||
{
|
||||
return DoctorCheckResult.Pass("TaskQueueBacklog",
|
||||
$"Task queue healthy: {queueStats.PendingTasks} pending tasks");
|
||||
}
|
||||
|
||||
var severity = queueStats.PendingTasks > _options.TaskQueueCriticalThreshold
|
||||
? DoctorSeverity.Critical
|
||||
: DoctorSeverity.Warning;
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "TaskQueueBacklog",
|
||||
Severity = severity,
|
||||
Message = $"Task queue backlog: {queueStats.PendingTasks} pending tasks",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["pendingTasks"] = queueStats.PendingTasks,
|
||||
["oldestTaskAge"] = queueStats.OldestTaskAge?.TotalMinutes ?? 0
|
||||
},
|
||||
Recommendation = "Consider adding more agents or investigating task processing delays"
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DoctorCheckResult> CheckFailedTaskRateAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
var stats = await _fleetService.GetTaskStatsAsync(
|
||||
DateTimeOffset.UtcNow.AddHours(-1),
|
||||
cancellationToken);
|
||||
|
||||
if (stats.TotalTasks == 0)
|
||||
{
|
||||
return DoctorCheckResult.Pass("FailedTaskRate", "No tasks executed in the last hour");
|
||||
}
|
||||
|
||||
var failureRate = (double)stats.FailedTasks / stats.TotalTasks * 100;
|
||||
|
||||
if (failureRate < _options.FailureRateWarningThreshold)
|
||||
{
|
||||
return DoctorCheckResult.Pass("FailedTaskRate",
|
||||
$"Task failure rate: {failureRate:F1}%");
|
||||
}
|
||||
|
||||
var severity = failureRate > _options.FailureRateCriticalThreshold
|
||||
? DoctorSeverity.Critical
|
||||
: DoctorSeverity.Warning;
|
||||
|
||||
return new DoctorCheckResult
|
||||
{
|
||||
CheckName = "FailedTaskRate",
|
||||
Severity = severity,
|
||||
Message = $"High task failure rate: {failureRate:F1}%",
|
||||
Details = new Dictionary<string, object>
|
||||
{
|
||||
["totalTasks"] = stats.TotalTasks,
|
||||
["failedTasks"] = stats.FailedTasks,
|
||||
["failureRate"] = failureRate
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent health plugin options.
|
||||
/// </summary>
|
||||
public sealed record AgentHealthPluginOptions
|
||||
{
|
||||
public TimeSpan HeartbeatStaleThreshold { get; init; } = TimeSpan.FromMinutes(5);
|
||||
public int CertificateWarningDays { get; init; } = 14;
|
||||
public int TaskQueueWarningThreshold { get; init; } = 100;
|
||||
public int TaskQueueCriticalThreshold { get; init; } = 500;
|
||||
public double FailureRateWarningThreshold { get; init; } = 5.0;
|
||||
public double FailureRateCriticalThreshold { get; init; } = 20.0;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent fleet service interface.
|
||||
/// </summary>
|
||||
public interface IAgentFleetService
|
||||
{
|
||||
Task<IReadOnlyList<AgentFleetInfo>> GetAllAgentsAsync(CancellationToken cancellationToken = default);
|
||||
Task<TaskQueueStats> GetTaskQueueStatsAsync(CancellationToken cancellationToken = default);
|
||||
Task<TaskExecutionStats> GetTaskStatsAsync(DateTimeOffset since, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent fleet info.
|
||||
/// </summary>
|
||||
public sealed record AgentFleetInfo
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Name { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required AgentFleetStatus Status { get; init; }
|
||||
public DateTimeOffset LastHeartbeat { get; init; }
|
||||
public DateTimeOffset? CertificateExpiresAt { get; init; }
|
||||
public int CurrentTasks { get; init; }
|
||||
public int MaxConcurrentTasks { get; init; }
|
||||
public DateTimeOffset? DisconnectedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent fleet status.
|
||||
/// </summary>
|
||||
public enum AgentFleetStatus
|
||||
{
|
||||
Unknown,
|
||||
Online,
|
||||
Disconnected,
|
||||
Draining
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Task queue stats.
|
||||
/// </summary>
|
||||
public sealed record TaskQueueStats
|
||||
{
|
||||
public int PendingTasks { get; init; }
|
||||
public TimeSpan? OldestTaskAge { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Task execution stats.
|
||||
/// </summary>
|
||||
public sealed record TaskExecutionStats
|
||||
{
|
||||
public int TotalTasks { get; init; }
|
||||
public int SuccessfulTasks { get; init; }
|
||||
public int FailedTasks { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using StellaOps.Doctor.Plugins;
|
||||
|
||||
namespace StellaOps.Doctor.Plugins;
|
||||
|
||||
/// <summary>
|
||||
/// Doctor plugin interface.
|
||||
/// </summary>
|
||||
public interface IDoctorPlugin
|
||||
{
|
||||
/// <summary>
|
||||
/// Plugin name.
|
||||
/// </summary>
|
||||
string Name { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Plugin description.
|
||||
/// </summary>
|
||||
string Description { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Categories this plugin covers.
|
||||
/// </summary>
|
||||
string[] Categories { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Runs all health checks for this plugin.
|
||||
/// </summary>
|
||||
Task<IReadOnlyList<DoctorCheckResult>> RunChecksAsync(
|
||||
DoctorContext context,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Doctor check result.
|
||||
/// </summary>
|
||||
public sealed record DoctorCheckResult
|
||||
{
|
||||
public required string CheckName { get; init; }
|
||||
public required DoctorSeverity Severity { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public IReadOnlyDictionary<string, object>? Details { get; init; }
|
||||
public string? Recommendation { get; init; }
|
||||
public TimeSpan Duration { get; init; }
|
||||
|
||||
public static DoctorCheckResult Pass(string checkName, string message) =>
|
||||
new()
|
||||
{
|
||||
CheckName = checkName,
|
||||
Severity = DoctorSeverity.None,
|
||||
Message = message
|
||||
};
|
||||
|
||||
public static DoctorCheckResult Info(string checkName, string message) =>
|
||||
new()
|
||||
{
|
||||
CheckName = checkName,
|
||||
Severity = DoctorSeverity.Info,
|
||||
Message = message
|
||||
};
|
||||
|
||||
public static DoctorCheckResult Warning(string checkName, string message) =>
|
||||
new()
|
||||
{
|
||||
CheckName = checkName,
|
||||
Severity = DoctorSeverity.Warning,
|
||||
Message = message
|
||||
};
|
||||
|
||||
public static DoctorCheckResult Error(string checkName, string message) =>
|
||||
new()
|
||||
{
|
||||
CheckName = checkName,
|
||||
Severity = DoctorSeverity.Error,
|
||||
Message = message
|
||||
};
|
||||
|
||||
public static DoctorCheckResult Critical(string checkName, string message) =>
|
||||
new()
|
||||
{
|
||||
CheckName = checkName,
|
||||
Severity = DoctorSeverity.Critical,
|
||||
Message = message
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Doctor severity levels.
|
||||
/// </summary>
|
||||
public enum DoctorSeverity
|
||||
{
|
||||
None,
|
||||
Info,
|
||||
Warning,
|
||||
Error,
|
||||
Critical
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Doctor execution context.
|
||||
/// </summary>
|
||||
public sealed record DoctorContext
|
||||
{
|
||||
/// <summary>
|
||||
/// Categories to check (null = all).
|
||||
/// </summary>
|
||||
public IReadOnlyList<string>? Categories { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether to include detailed diagnostics.
|
||||
/// </summary>
|
||||
public bool IncludeDetails { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Per-check timeout.
|
||||
/// </summary>
|
||||
public TimeSpan CheckTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
@@ -708,6 +708,80 @@ public sealed class InMemoryVexObservationStore : IVexObservationStore
|
||||
: 0;
|
||||
return ValueTask.FromResult((long)count);
|
||||
}
|
||||
|
||||
public ValueTask<bool> UpdateRekorLinkageAsync(
|
||||
string tenant,
|
||||
string observationId,
|
||||
RekorLinkage linkage,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(tenant);
|
||||
ArgumentNullException.ThrowIfNull(observationId);
|
||||
ArgumentNullException.ThrowIfNull(linkage);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
if (!_tenants.TryGetValue(tenant, out var store) || !store.TryGetValue(observationId, out var observation))
|
||||
{
|
||||
return ValueTask.FromResult(false);
|
||||
}
|
||||
|
||||
var updated = observation with
|
||||
{
|
||||
RekorUuid = linkage.Uuid,
|
||||
RekorLogIndex = linkage.LogIndex,
|
||||
RekorIntegratedTime = linkage.IntegratedTime,
|
||||
RekorLogUrl = linkage.LogUrl,
|
||||
RekorInclusionProof = linkage.InclusionProof,
|
||||
RekorLinkedAt = linkage.LinkedAt
|
||||
};
|
||||
|
||||
store[observationId] = updated;
|
||||
return ValueTask.FromResult(true);
|
||||
}
|
||||
|
||||
public ValueTask<IReadOnlyList<VexObservation>> GetPendingRekorAttestationAsync(
|
||||
string tenant,
|
||||
int limit,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
if (limit <= 0)
|
||||
{
|
||||
limit = 50;
|
||||
}
|
||||
|
||||
var results = _tenants.TryGetValue(tenant, out var store)
|
||||
? store.Values
|
||||
.Where(o => string.IsNullOrWhiteSpace(o.RekorUuid))
|
||||
.OrderBy(o => o.CreatedAt)
|
||||
.Take(limit)
|
||||
.ToList()
|
||||
: new List<VexObservation>();
|
||||
|
||||
return ValueTask.FromResult<IReadOnlyList<VexObservation>>(results);
|
||||
}
|
||||
|
||||
public ValueTask<VexObservation?> GetByRekorUuidAsync(
|
||||
string tenant,
|
||||
string rekorUuid,
|
||||
CancellationToken cancellationToken)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(tenant);
|
||||
ArgumentNullException.ThrowIfNull(rekorUuid);
|
||||
cancellationToken.ThrowIfCancellationRequested();
|
||||
|
||||
if (!_tenants.TryGetValue(tenant, out var store))
|
||||
{
|
||||
return ValueTask.FromResult<VexObservation?>(null);
|
||||
}
|
||||
|
||||
var result = store.Values.FirstOrDefault(o =>
|
||||
!string.IsNullOrWhiteSpace(o.RekorUuid) &&
|
||||
string.Equals(o.RekorUuid, rekorUuid, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
return ValueTask.FromResult(result);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -735,12 +735,12 @@ public sealed class PostgresVexObservationStore : RepositoryBase<ExcititorDataSo
|
||||
await using var command = CreateCommand(sql, connection);
|
||||
command.Parameters.AddWithValue("tenant", tenant.ToLowerInvariant());
|
||||
command.Parameters.AddWithValue("observation_id", observationId);
|
||||
command.Parameters.AddWithValue("rekor_uuid", linkage.EntryUuid ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_log_index", linkage.LogIndex ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_integrated_time", linkage.IntegratedTime ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_uuid", linkage.Uuid ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_log_index", linkage.LogIndex);
|
||||
command.Parameters.AddWithValue("rekor_integrated_time", linkage.IntegratedTime);
|
||||
command.Parameters.AddWithValue("rekor_log_url", linkage.LogUrl ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_tree_root", linkage.InclusionProof?.TreeRoot ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_tree_size", linkage.InclusionProof?.TreeSize ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_tree_root", linkage.TreeRoot ?? (object)DBNull.Value);
|
||||
command.Parameters.AddWithValue("rekor_tree_size", linkage.TreeSize ?? (object)DBNull.Value);
|
||||
|
||||
var inclusionProofJson = linkage.InclusionProof is not null
|
||||
? JsonSerializer.Serialize(linkage.InclusionProof)
|
||||
@@ -786,7 +786,7 @@ public sealed class PostgresVexObservationStore : RepositoryBase<ExcititorDataSo
|
||||
|
||||
while (await reader.ReadAsync(cancellationToken).ConfigureAwait(false))
|
||||
{
|
||||
var observation = MapReaderToObservation(reader);
|
||||
var observation = Map(reader);
|
||||
if (observation is not null)
|
||||
{
|
||||
results.Add(observation);
|
||||
@@ -833,7 +833,7 @@ public sealed class PostgresVexObservationStore : RepositoryBase<ExcititorDataSo
|
||||
|
||||
private VexObservation? MapReaderToObservationWithRekor(NpgsqlDataReader reader)
|
||||
{
|
||||
var observation = MapReaderToObservation(reader);
|
||||
var observation = Map(reader);
|
||||
if (observation is null)
|
||||
{
|
||||
return null;
|
||||
|
||||
@@ -0,0 +1,343 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// StellaOpsPlugin.kt - JetBrains Plugin
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-07 - JetBrains plugin with tool window and annotators
|
||||
// Description: IntelliJ IDEA / JetBrains plugin for Stella Ops
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
package org.stellaops.intellij
|
||||
|
||||
import com.intellij.openapi.actionSystem.*
|
||||
import com.intellij.openapi.application.ApplicationManager
|
||||
import com.intellij.openapi.editor.Editor
|
||||
import com.intellij.openapi.project.Project
|
||||
import com.intellij.openapi.wm.ToolWindow
|
||||
import com.intellij.openapi.wm.ToolWindowFactory
|
||||
import com.intellij.ui.components.*
|
||||
import com.intellij.ui.content.ContentFactory
|
||||
import com.intellij.ui.treeStructure.Tree
|
||||
import javax.swing.*
|
||||
import javax.swing.tree.DefaultMutableTreeNode
|
||||
import javax.swing.tree.DefaultTreeModel
|
||||
|
||||
/**
|
||||
* Stella Ops Plugin for JetBrains IDEs
|
||||
*
|
||||
* Features:
|
||||
* - Tool window for releases and environments
|
||||
* - File annotations for stella.yaml
|
||||
* - Action menu integrations
|
||||
* - Status bar widget
|
||||
*/
|
||||
|
||||
// ============================================================================
|
||||
// Tool Window Factory
|
||||
// ============================================================================
|
||||
|
||||
class StellaToolWindowFactory : ToolWindowFactory {
|
||||
override fun createToolWindowContent(project: Project, toolWindow: ToolWindow) {
|
||||
val stellaToolWindow = StellaToolWindow(project)
|
||||
val content = ContentFactory.getInstance().createContent(
|
||||
stellaToolWindow.content,
|
||||
"Releases",
|
||||
false
|
||||
)
|
||||
toolWindow.contentManager.addContent(content)
|
||||
}
|
||||
}
|
||||
|
||||
class StellaToolWindow(private val project: Project) {
|
||||
val content: JPanel = JPanel()
|
||||
|
||||
init {
|
||||
content.layout = BoxLayout(content, BoxLayout.Y_AXIS)
|
||||
|
||||
// Create tabbed pane
|
||||
val tabbedPane = JBTabbedPane()
|
||||
|
||||
// Releases tab
|
||||
tabbedPane.addTab("Releases", createReleasesPanel())
|
||||
|
||||
// Environments tab
|
||||
tabbedPane.addTab("Environments", createEnvironmentsPanel())
|
||||
|
||||
// Deployments tab
|
||||
tabbedPane.addTab("Deployments", createDeploymentsPanel())
|
||||
|
||||
content.add(tabbedPane)
|
||||
}
|
||||
|
||||
private fun createReleasesPanel(): JComponent {
|
||||
val root = DefaultMutableTreeNode("Services")
|
||||
|
||||
// Sample data
|
||||
val apiGateway = DefaultMutableTreeNode("api-gateway")
|
||||
apiGateway.add(DefaultMutableTreeNode("v2.3.1 (Production)"))
|
||||
apiGateway.add(DefaultMutableTreeNode("v2.4.0 (Staging)"))
|
||||
apiGateway.add(DefaultMutableTreeNode("v2.5.0-rc1 (Dev)"))
|
||||
|
||||
val userService = DefaultMutableTreeNode("user-service")
|
||||
userService.add(DefaultMutableTreeNode("v1.8.0 (Production)"))
|
||||
userService.add(DefaultMutableTreeNode("v1.9.0 (Staging)"))
|
||||
|
||||
root.add(apiGateway)
|
||||
root.add(userService)
|
||||
|
||||
val tree = Tree(DefaultTreeModel(root))
|
||||
tree.isRootVisible = false
|
||||
|
||||
val panel = JPanel()
|
||||
panel.layout = BoxLayout(panel, BoxLayout.Y_AXIS)
|
||||
|
||||
// Toolbar
|
||||
val toolbar = JPanel()
|
||||
toolbar.add(JButton("Refresh").apply {
|
||||
addActionListener { refreshReleases() }
|
||||
})
|
||||
toolbar.add(JButton("Create Release").apply {
|
||||
addActionListener { showCreateReleaseDialog() }
|
||||
})
|
||||
|
||||
panel.add(toolbar)
|
||||
panel.add(JBScrollPane(tree))
|
||||
|
||||
return panel
|
||||
}
|
||||
|
||||
private fun createEnvironmentsPanel(): JComponent {
|
||||
val panel = JPanel()
|
||||
panel.layout = BoxLayout(panel, BoxLayout.Y_AXIS)
|
||||
|
||||
val envList = listOf(
|
||||
EnvironmentInfo("Production", "prod", "Healthy", "3 services"),
|
||||
EnvironmentInfo("Staging", "staging", "Healthy", "3 services"),
|
||||
EnvironmentInfo("Development", "dev", "Healthy", "3 services")
|
||||
)
|
||||
|
||||
for (env in envList) {
|
||||
val envPanel = JPanel()
|
||||
envPanel.layout = BoxLayout(envPanel, BoxLayout.X_AXIS)
|
||||
envPanel.border = BorderFactory.createEmptyBorder(5, 10, 5, 10)
|
||||
|
||||
val statusIcon = when (env.status) {
|
||||
"Healthy" -> "✓"
|
||||
"Degraded" -> "⚠"
|
||||
else -> "✗"
|
||||
}
|
||||
|
||||
envPanel.add(JBLabel("$statusIcon ${env.name}"))
|
||||
envPanel.add(Box.createHorizontalGlue())
|
||||
envPanel.add(JBLabel(env.services))
|
||||
envPanel.add(JButton("View").apply {
|
||||
addActionListener { openEnvironmentDetails(env.id) }
|
||||
})
|
||||
|
||||
panel.add(envPanel)
|
||||
}
|
||||
|
||||
return JBScrollPane(panel)
|
||||
}
|
||||
|
||||
private fun createDeploymentsPanel(): JComponent {
|
||||
val panel = JPanel()
|
||||
panel.layout = BoxLayout(panel, BoxLayout.Y_AXIS)
|
||||
|
||||
val headers = arrayOf("ID", "Service", "Version", "Environment", "Status")
|
||||
val data = arrayOf(
|
||||
arrayOf("dep-001", "api-gateway", "v2.3.1", "Production", "Completed"),
|
||||
arrayOf("dep-002", "user-service", "v1.9.0", "Staging", "In Progress"),
|
||||
arrayOf("dep-003", "order-service", "v3.0.0", "Development", "Pending")
|
||||
)
|
||||
|
||||
val table = JBTable(data, headers)
|
||||
panel.add(JBScrollPane(table))
|
||||
|
||||
return panel
|
||||
}
|
||||
|
||||
private fun refreshReleases() {
|
||||
// Refresh releases from API
|
||||
ApplicationManager.getApplication().invokeLater {
|
||||
// Update tree
|
||||
}
|
||||
}
|
||||
|
||||
private fun showCreateReleaseDialog() {
|
||||
val dialog = CreateReleaseDialog(project)
|
||||
if (dialog.showAndGet()) {
|
||||
// Create release via CLI
|
||||
val service = dialog.serviceName
|
||||
val version = dialog.version
|
||||
executeCliCommand("stella release create $service $version")
|
||||
}
|
||||
}
|
||||
|
||||
private fun openEnvironmentDetails(envId: String) {
|
||||
// Open browser to environment dashboard
|
||||
java.awt.Desktop.getDesktop().browse(
|
||||
java.net.URI("http://localhost:5000/environments/$envId")
|
||||
)
|
||||
}
|
||||
|
||||
private fun executeCliCommand(command: String) {
|
||||
// Execute via terminal
|
||||
val terminal = com.intellij.terminal.JBTerminalWidget.installByDefault(project, null)
|
||||
// terminal.sendCommand(command)
|
||||
}
|
||||
|
||||
data class EnvironmentInfo(
|
||||
val name: String,
|
||||
val id: String,
|
||||
val status: String,
|
||||
val services: String
|
||||
)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Create Release Dialog
|
||||
// ============================================================================
|
||||
|
||||
class CreateReleaseDialog(project: Project) : com.intellij.openapi.ui.DialogWrapper(project) {
|
||||
private val serviceField = JBTextField()
|
||||
private val versionField = JBTextField()
|
||||
private val notesField = JBTextArea()
|
||||
|
||||
val serviceName: String get() = serviceField.text
|
||||
val version: String get() = versionField.text
|
||||
val notes: String get() = notesField.text
|
||||
|
||||
init {
|
||||
title = "Create Release"
|
||||
init()
|
||||
}
|
||||
|
||||
override fun createCenterPanel(): JComponent {
|
||||
val panel = JPanel()
|
||||
panel.layout = BoxLayout(panel, BoxLayout.Y_AXIS)
|
||||
|
||||
panel.add(JBLabel("Service Name:"))
|
||||
panel.add(serviceField)
|
||||
|
||||
panel.add(Box.createVerticalStrut(10))
|
||||
|
||||
panel.add(JBLabel("Version:"))
|
||||
panel.add(versionField)
|
||||
|
||||
panel.add(Box.createVerticalStrut(10))
|
||||
|
||||
panel.add(JBLabel("Release Notes:"))
|
||||
panel.add(JBScrollPane(notesField).apply {
|
||||
preferredSize = java.awt.Dimension(300, 100)
|
||||
})
|
||||
|
||||
return panel
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Actions
|
||||
// ============================================================================
|
||||
|
||||
class CreateReleaseAction : AnAction("Create Release", "Create a new release", null) {
|
||||
override fun actionPerformed(e: AnActionEvent) {
|
||||
val project = e.project ?: return
|
||||
val dialog = CreateReleaseDialog(project)
|
||||
if (dialog.showAndGet()) {
|
||||
// Execute create release
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class PromoteReleaseAction : AnAction("Promote Release", "Promote a release to another environment", null) {
|
||||
override fun actionPerformed(e: AnActionEvent) {
|
||||
val project = e.project ?: return
|
||||
// Show promote dialog
|
||||
}
|
||||
}
|
||||
|
||||
class ValidateConfigAction : AnAction("Validate Configuration", "Validate stella.yaml configuration", null) {
|
||||
override fun actionPerformed(e: AnActionEvent) {
|
||||
val project = e.project ?: return
|
||||
// Execute validation
|
||||
}
|
||||
}
|
||||
|
||||
class OpenDashboardAction : AnAction("Open Dashboard", "Open Stella Ops dashboard in browser", null) {
|
||||
override fun actionPerformed(e: AnActionEvent) {
|
||||
java.awt.Desktop.getDesktop().browse(
|
||||
java.net.URI("http://localhost:5000/dashboard")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Annotator for stella.yaml
|
||||
// ============================================================================
|
||||
|
||||
class StellaYamlAnnotator : com.intellij.lang.annotation.Annotator {
|
||||
override fun annotate(element: com.intellij.psi.PsiElement, holder: com.intellij.lang.annotation.AnnotationHolder) {
|
||||
// Skip if not a YAML file
|
||||
val file = element.containingFile ?: return
|
||||
if (!file.name.endsWith("stella.yaml")) return
|
||||
|
||||
val text = element.text
|
||||
|
||||
// Annotate version references
|
||||
if (text.startsWith("version:")) {
|
||||
holder.newAnnotation(
|
||||
com.intellij.lang.annotation.HighlightSeverity.INFORMATION,
|
||||
"Stella version declaration"
|
||||
)
|
||||
.range(element.textRange)
|
||||
.create()
|
||||
}
|
||||
|
||||
// Annotate environment references
|
||||
if (text.matches(Regex("environment:\\s*\\w+"))) {
|
||||
holder.newAnnotation(
|
||||
com.intellij.lang.annotation.HighlightSeverity.INFORMATION,
|
||||
"Target environment"
|
||||
)
|
||||
.range(element.textRange)
|
||||
.create()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Status Bar Widget
|
||||
// ============================================================================
|
||||
|
||||
class StellaStatusBarWidgetFactory : com.intellij.openapi.wm.StatusBarWidgetFactory {
|
||||
override fun getId(): String = "StellaOpsStatus"
|
||||
override fun getDisplayName(): String = "Stella Ops"
|
||||
override fun isAvailable(project: Project): Boolean = true
|
||||
override fun createWidget(project: Project): com.intellij.openapi.wm.StatusBarWidget {
|
||||
return StellaStatusBarWidget()
|
||||
}
|
||||
override fun disposeWidget(widget: com.intellij.openapi.wm.StatusBarWidget) {
|
||||
// Cleanup
|
||||
}
|
||||
override fun canBeEnabledOn(statusBar: com.intellij.openapi.wm.StatusBar): Boolean = true
|
||||
}
|
||||
|
||||
class StellaStatusBarWidget : com.intellij.openapi.wm.StatusBarWidget,
|
||||
com.intellij.openapi.wm.StatusBarWidget.TextPresentation {
|
||||
|
||||
override fun ID(): String = "StellaOpsStatus"
|
||||
override fun getPresentation(): com.intellij.openapi.wm.StatusBarWidget.WidgetPresentation = this
|
||||
override fun install(statusBar: com.intellij.openapi.wm.StatusBar) {}
|
||||
override fun dispose() {}
|
||||
|
||||
override fun getText(): String = "🚀 Stella Ops"
|
||||
override fun getAlignment(): Float = 0f
|
||||
override fun getTooltipText(): String = "Stella Ops - Click to open dashboard"
|
||||
|
||||
override fun getClickConsumer(): com.intellij.util.Consumer<java.awt.event.MouseEvent>? {
|
||||
return com.intellij.util.Consumer {
|
||||
java.awt.Desktop.getDesktop().browse(
|
||||
java.net.URI("http://localhost:5000/dashboard")
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
146
src/Extensions/vscode-stella-ops/package.json
Normal file
146
src/Extensions/vscode-stella-ops/package.json
Normal file
@@ -0,0 +1,146 @@
|
||||
{
|
||||
"name": "stella-ops",
|
||||
"displayName": "Stella Ops",
|
||||
"description": "VS Code extension for Stella Ops release control plane",
|
||||
"version": "1.0.0",
|
||||
"publisher": "stella-ops",
|
||||
"engines": {
|
||||
"vscode": "^1.85.0"
|
||||
},
|
||||
"categories": [
|
||||
"Other",
|
||||
"SCM Providers"
|
||||
],
|
||||
"keywords": [
|
||||
"release",
|
||||
"deployment",
|
||||
"devops",
|
||||
"ci-cd",
|
||||
"promotion"
|
||||
],
|
||||
"activationEvents": [
|
||||
"workspaceContains:**/stella.yaml"
|
||||
],
|
||||
"main": "./out/extension.js",
|
||||
"contributes": {
|
||||
"commands": [
|
||||
{
|
||||
"command": "stella.createRelease",
|
||||
"title": "Create Release",
|
||||
"category": "Stella"
|
||||
},
|
||||
{
|
||||
"command": "stella.promote",
|
||||
"title": "Promote Release",
|
||||
"category": "Stella"
|
||||
},
|
||||
{
|
||||
"command": "stella.viewRelease",
|
||||
"title": "View Release Details",
|
||||
"category": "Stella"
|
||||
},
|
||||
{
|
||||
"command": "stella.viewDeployment",
|
||||
"title": "View Deployment",
|
||||
"category": "Stella"
|
||||
},
|
||||
{
|
||||
"command": "stella.refreshReleases",
|
||||
"title": "Refresh Releases",
|
||||
"category": "Stella",
|
||||
"icon": "$(refresh)"
|
||||
},
|
||||
{
|
||||
"command": "stella.validateConfig",
|
||||
"title": "Validate Configuration",
|
||||
"category": "Stella"
|
||||
},
|
||||
{
|
||||
"command": "stella.openDashboard",
|
||||
"title": "Open Dashboard",
|
||||
"category": "Stella"
|
||||
},
|
||||
{
|
||||
"command": "stella.login",
|
||||
"title": "Login",
|
||||
"category": "Stella"
|
||||
}
|
||||
],
|
||||
"viewsContainers": {
|
||||
"activitybar": [
|
||||
{
|
||||
"id": "stella-ops",
|
||||
"title": "Stella Ops",
|
||||
"icon": "resources/stella-icon.svg"
|
||||
}
|
||||
]
|
||||
},
|
||||
"views": {
|
||||
"stella-ops": [
|
||||
{
|
||||
"id": "stellaReleases",
|
||||
"name": "Releases",
|
||||
"icon": "resources/release-icon.svg"
|
||||
},
|
||||
{
|
||||
"id": "stellaEnvironments",
|
||||
"name": "Environments",
|
||||
"icon": "resources/environment-icon.svg"
|
||||
}
|
||||
]
|
||||
},
|
||||
"menus": {
|
||||
"view/title": [
|
||||
{
|
||||
"command": "stella.refreshReleases",
|
||||
"when": "view == stellaReleases",
|
||||
"group": "navigation"
|
||||
}
|
||||
],
|
||||
"view/item/context": [
|
||||
{
|
||||
"command": "stella.promote",
|
||||
"when": "viewItem == release",
|
||||
"group": "inline"
|
||||
}
|
||||
]
|
||||
},
|
||||
"configuration": {
|
||||
"title": "Stella Ops",
|
||||
"properties": {
|
||||
"stella.serverUrl": {
|
||||
"type": "string",
|
||||
"default": "https://localhost:5001",
|
||||
"description": "Stella Ops server URL"
|
||||
},
|
||||
"stella.autoValidate": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Automatically validate stella.yaml on save"
|
||||
}
|
||||
}
|
||||
},
|
||||
"languages": [
|
||||
{
|
||||
"id": "stella-yaml",
|
||||
"extensions": [".stella.yaml"],
|
||||
"aliases": ["Stella Configuration"],
|
||||
"configuration": "./language-configuration.json"
|
||||
}
|
||||
]
|
||||
},
|
||||
"scripts": {
|
||||
"vscode:prepublish": "npm run compile",
|
||||
"compile": "tsc -p ./",
|
||||
"watch": "tsc -watch -p ./",
|
||||
"lint": "eslint src --ext ts"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/vscode": "^1.85.0",
|
||||
"@types/node": "^20.0.0",
|
||||
"typescript": "^5.3.0",
|
||||
"@typescript-eslint/eslint-plugin": "^6.0.0",
|
||||
"@typescript-eslint/parser": "^6.0.0",
|
||||
"eslint": "^8.0.0"
|
||||
}
|
||||
}
|
||||
367
src/Extensions/vscode-stella-ops/src/extension.ts
Normal file
367
src/Extensions/vscode-stella-ops/src/extension.ts
Normal file
@@ -0,0 +1,367 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// StellaOpsExtension - VS Code Extension
|
||||
// Sprint: SPRINT_20260117_037_ReleaseOrchestrator_developer_experience
|
||||
// Task: TASK-037-06 - VS Code Extension with tree view, commands, and code lens
|
||||
// Description: VS Code extension package definition
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* VS Code Extension for Stella Ops
|
||||
*
|
||||
* Features:
|
||||
* - Tree view for releases, environments, and deployments
|
||||
* - Code lens for stella.yaml configuration files
|
||||
* - Commands for release management
|
||||
* - Status bar integration
|
||||
* - IntelliSense for configuration files
|
||||
*/
|
||||
|
||||
import * as vscode from 'vscode';
|
||||
|
||||
// ============================================================================
|
||||
// Extension Activation
|
||||
// ============================================================================
|
||||
|
||||
export function activate(context: vscode.ExtensionContext) {
|
||||
console.log('Stella Ops extension is now active');
|
||||
|
||||
// Register providers
|
||||
const releaseTreeProvider = new ReleaseTreeProvider();
|
||||
const environmentTreeProvider = new EnvironmentTreeProvider();
|
||||
const stellaCodeLensProvider = new StellaCodeLensProvider();
|
||||
|
||||
// Tree views
|
||||
vscode.window.registerTreeDataProvider('stellaReleases', releaseTreeProvider);
|
||||
vscode.window.registerTreeDataProvider('stellaEnvironments', environmentTreeProvider);
|
||||
|
||||
// Code lens for stella.yaml files
|
||||
context.subscriptions.push(
|
||||
vscode.languages.registerCodeLensProvider(
|
||||
{ pattern: '**/stella.yaml' },
|
||||
stellaCodeLensProvider
|
||||
)
|
||||
);
|
||||
|
||||
// Register commands
|
||||
context.subscriptions.push(
|
||||
vscode.commands.registerCommand('stella.createRelease', createReleaseCommand),
|
||||
vscode.commands.registerCommand('stella.promote', promoteCommand),
|
||||
vscode.commands.registerCommand('stella.viewRelease', viewReleaseCommand),
|
||||
vscode.commands.registerCommand('stella.viewDeployment', viewDeploymentCommand),
|
||||
vscode.commands.registerCommand('stella.refreshReleases', () => releaseTreeProvider.refresh()),
|
||||
vscode.commands.registerCommand('stella.validateConfig', validateConfigCommand),
|
||||
vscode.commands.registerCommand('stella.openDashboard', openDashboardCommand),
|
||||
vscode.commands.registerCommand('stella.login', loginCommand)
|
||||
);
|
||||
|
||||
// Status bar
|
||||
const statusBarItem = vscode.window.createStatusBarItem(vscode.StatusBarAlignment.Right, 100);
|
||||
statusBarItem.text = '$(rocket) Stella Ops';
|
||||
statusBarItem.command = 'stella.openDashboard';
|
||||
statusBarItem.show();
|
||||
context.subscriptions.push(statusBarItem);
|
||||
|
||||
// File watcher for stella.yaml changes
|
||||
const watcher = vscode.workspace.createFileSystemWatcher('**/stella.yaml');
|
||||
watcher.onDidChange(() => validateConfigCommand());
|
||||
context.subscriptions.push(watcher);
|
||||
}
|
||||
|
||||
export function deactivate() {}
|
||||
|
||||
// ============================================================================
|
||||
// Tree Data Providers
|
||||
// ============================================================================
|
||||
|
||||
class ReleaseTreeProvider implements vscode.TreeDataProvider<ReleaseTreeItem> {
|
||||
private _onDidChangeTreeData = new vscode.EventEmitter<ReleaseTreeItem | undefined>();
|
||||
readonly onDidChangeTreeData = this._onDidChangeTreeData.event;
|
||||
|
||||
refresh(): void {
|
||||
this._onDidChangeTreeData.fire(undefined);
|
||||
}
|
||||
|
||||
getTreeItem(element: ReleaseTreeItem): vscode.TreeItem {
|
||||
return element;
|
||||
}
|
||||
|
||||
async getChildren(element?: ReleaseTreeItem): Promise<ReleaseTreeItem[]> {
|
||||
if (!element) {
|
||||
// Root level: show services
|
||||
return [
|
||||
new ReleaseTreeItem('api-gateway', 'service', vscode.TreeItemCollapsibleState.Collapsed),
|
||||
new ReleaseTreeItem('user-service', 'service', vscode.TreeItemCollapsibleState.Collapsed),
|
||||
new ReleaseTreeItem('order-service', 'service', vscode.TreeItemCollapsibleState.Collapsed)
|
||||
];
|
||||
}
|
||||
|
||||
if (element.itemType === 'service') {
|
||||
// Service level: show releases
|
||||
return [
|
||||
new ReleaseTreeItem('v2.3.1 (Production)', 'release', vscode.TreeItemCollapsibleState.None, {
|
||||
status: 'deployed',
|
||||
environment: 'prod'
|
||||
}),
|
||||
new ReleaseTreeItem('v2.4.0 (Staging)', 'release', vscode.TreeItemCollapsibleState.None, {
|
||||
status: 'deployed',
|
||||
environment: 'staging'
|
||||
}),
|
||||
new ReleaseTreeItem('v2.5.0-rc1 (Dev)', 'release', vscode.TreeItemCollapsibleState.None, {
|
||||
status: 'deployed',
|
||||
environment: 'dev'
|
||||
})
|
||||
];
|
||||
}
|
||||
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
class ReleaseTreeItem extends vscode.TreeItem {
|
||||
constructor(
|
||||
public readonly label: string,
|
||||
public readonly itemType: 'service' | 'release',
|
||||
public readonly collapsibleState: vscode.TreeItemCollapsibleState,
|
||||
public readonly metadata?: { status?: string; environment?: string }
|
||||
) {
|
||||
super(label, collapsibleState);
|
||||
|
||||
if (itemType === 'service') {
|
||||
this.iconPath = new vscode.ThemeIcon('package');
|
||||
this.contextValue = 'service';
|
||||
} else {
|
||||
this.iconPath = metadata?.status === 'deployed'
|
||||
? new vscode.ThemeIcon('check', new vscode.ThemeColor('testing.iconPassed'))
|
||||
: new vscode.ThemeIcon('circle-outline');
|
||||
this.contextValue = 'release';
|
||||
this.command = {
|
||||
command: 'stella.viewRelease',
|
||||
title: 'View Release',
|
||||
arguments: [this]
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class EnvironmentTreeProvider implements vscode.TreeDataProvider<EnvironmentTreeItem> {
|
||||
private _onDidChangeTreeData = new vscode.EventEmitter<EnvironmentTreeItem | undefined>();
|
||||
readonly onDidChangeTreeData = this._onDidChangeTreeData.event;
|
||||
|
||||
getTreeItem(element: EnvironmentTreeItem): vscode.TreeItem {
|
||||
return element;
|
||||
}
|
||||
|
||||
async getChildren(element?: EnvironmentTreeItem): Promise<EnvironmentTreeItem[]> {
|
||||
if (!element) {
|
||||
return [
|
||||
new EnvironmentTreeItem('Production', 'prod', 'healthy'),
|
||||
new EnvironmentTreeItem('Staging', 'staging', 'healthy'),
|
||||
new EnvironmentTreeItem('Development', 'dev', 'healthy')
|
||||
];
|
||||
}
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
class EnvironmentTreeItem extends vscode.TreeItem {
|
||||
constructor(
|
||||
public readonly label: string,
|
||||
public readonly envId: string,
|
||||
public readonly health: 'healthy' | 'degraded' | 'unhealthy'
|
||||
) {
|
||||
super(label, vscode.TreeItemCollapsibleState.None);
|
||||
|
||||
this.iconPath = health === 'healthy'
|
||||
? new vscode.ThemeIcon('check', new vscode.ThemeColor('testing.iconPassed'))
|
||||
: health === 'degraded'
|
||||
? new vscode.ThemeIcon('warning', new vscode.ThemeColor('editorWarning.foreground'))
|
||||
: new vscode.ThemeIcon('error', new vscode.ThemeColor('editorError.foreground'));
|
||||
|
||||
this.description = health;
|
||||
this.contextValue = 'environment';
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Code Lens Provider
|
||||
// ============================================================================
|
||||
|
||||
class StellaCodeLensProvider implements vscode.CodeLensProvider {
|
||||
provideCodeLenses(document: vscode.TextDocument): vscode.CodeLens[] {
|
||||
const codeLenses: vscode.CodeLens[] = [];
|
||||
const text = document.getText();
|
||||
const lines = text.split('\n');
|
||||
|
||||
lines.forEach((line, index) => {
|
||||
// Add code lens for version declarations
|
||||
if (line.match(/^\s*version:/)) {
|
||||
const range = new vscode.Range(index, 0, index, line.length);
|
||||
codeLenses.push(
|
||||
new vscode.CodeLens(range, {
|
||||
title: '$(rocket) Create Release',
|
||||
command: 'stella.createRelease'
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
// Add code lens for environment references
|
||||
if (line.match(/^\s*environment:/)) {
|
||||
const range = new vscode.Range(index, 0, index, line.length);
|
||||
codeLenses.push(
|
||||
new vscode.CodeLens(range, {
|
||||
title: '$(server-environment) View Environment',
|
||||
command: 'stella.openDashboard'
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
// Add code lens for policy references
|
||||
if (line.match(/^\s*policies:/)) {
|
||||
const range = new vscode.Range(index, 0, index, line.length);
|
||||
codeLenses.push(
|
||||
new vscode.CodeLens(range, {
|
||||
title: '$(shield) Validate Policies',
|
||||
command: 'stella.validateConfig'
|
||||
})
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
return codeLenses;
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Commands
|
||||
// ============================================================================
|
||||
|
||||
async function createReleaseCommand() {
|
||||
const service = await vscode.window.showInputBox({
|
||||
prompt: 'Service name',
|
||||
placeHolder: 'e.g., api-gateway'
|
||||
});
|
||||
|
||||
if (!service) return;
|
||||
|
||||
const version = await vscode.window.showInputBox({
|
||||
prompt: 'Version',
|
||||
placeHolder: 'e.g., v1.2.3'
|
||||
});
|
||||
|
||||
if (!version) return;
|
||||
|
||||
const notes = await vscode.window.showInputBox({
|
||||
prompt: 'Release notes (optional)',
|
||||
placeHolder: 'Description of changes'
|
||||
});
|
||||
|
||||
// Execute CLI command
|
||||
const terminal = vscode.window.createTerminal('Stella Ops');
|
||||
terminal.sendText(`stella release create ${service} ${version}${notes ? ` --notes "${notes}"` : ''}`);
|
||||
terminal.show();
|
||||
}
|
||||
|
||||
async function promoteCommand() {
|
||||
const release = await vscode.window.showInputBox({
|
||||
prompt: 'Release ID',
|
||||
placeHolder: 'e.g., rel-abc123'
|
||||
});
|
||||
|
||||
if (!release) return;
|
||||
|
||||
const target = await vscode.window.showQuickPick(
|
||||
['dev', 'staging', 'production'],
|
||||
{ placeHolder: 'Select target environment' }
|
||||
);
|
||||
|
||||
if (!target) return;
|
||||
|
||||
const terminal = vscode.window.createTerminal('Stella Ops');
|
||||
terminal.sendText(`stella promote start ${release} ${target}`);
|
||||
terminal.show();
|
||||
}
|
||||
|
||||
async function viewReleaseCommand(item?: ReleaseTreeItem) {
|
||||
// Open release details in a webview
|
||||
const panel = vscode.window.createWebviewPanel(
|
||||
'stellaRelease',
|
||||
`Release: ${item?.label || 'Details'}`,
|
||||
vscode.ViewColumn.One,
|
||||
{ enableScripts: true }
|
||||
);
|
||||
|
||||
panel.webview.html = getReleaseWebviewContent(item?.label || 'Unknown');
|
||||
}
|
||||
|
||||
async function viewDeploymentCommand() {
|
||||
const deploymentId = await vscode.window.showInputBox({
|
||||
prompt: 'Deployment ID',
|
||||
placeHolder: 'e.g., dep-abc123'
|
||||
});
|
||||
|
||||
if (!deploymentId) return;
|
||||
|
||||
const terminal = vscode.window.createTerminal('Stella Ops');
|
||||
terminal.sendText(`stella deploy status ${deploymentId} --watch`);
|
||||
terminal.show();
|
||||
}
|
||||
|
||||
async function validateConfigCommand() {
|
||||
const terminal = vscode.window.createTerminal('Stella Ops');
|
||||
terminal.sendText('stella config validate');
|
||||
terminal.show();
|
||||
}
|
||||
|
||||
async function openDashboardCommand() {
|
||||
vscode.env.openExternal(vscode.Uri.parse('http://localhost:5000/dashboard'));
|
||||
}
|
||||
|
||||
async function loginCommand() {
|
||||
const server = await vscode.window.showInputBox({
|
||||
prompt: 'Stella server URL',
|
||||
placeHolder: 'https://stella.example.com',
|
||||
value: 'https://localhost:5001'
|
||||
});
|
||||
|
||||
if (!server) return;
|
||||
|
||||
const terminal = vscode.window.createTerminal('Stella Ops');
|
||||
terminal.sendText(`stella auth login ${server} --interactive`);
|
||||
terminal.show();
|
||||
}
|
||||
|
||||
function getReleaseWebviewContent(releaseName: string): string {
|
||||
return `
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Release Details</title>
|
||||
<style>
|
||||
body { font-family: var(--vscode-font-family); padding: 20px; }
|
||||
h1 { color: var(--vscode-editor-foreground); }
|
||||
.section { margin: 20px 0; }
|
||||
.label { color: var(--vscode-descriptionForeground); }
|
||||
.value { color: var(--vscode-editor-foreground); font-weight: bold; }
|
||||
.status-deployed { color: var(--vscode-testing-iconPassed); }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Release: ${releaseName}</h1>
|
||||
<div class="section">
|
||||
<span class="label">Status: </span>
|
||||
<span class="value status-deployed">Deployed</span>
|
||||
</div>
|
||||
<div class="section">
|
||||
<span class="label">Environment: </span>
|
||||
<span class="value">Production</span>
|
||||
</div>
|
||||
<div class="section">
|
||||
<span class="label">Deployed At: </span>
|
||||
<span class="value">2026-01-17 12:00 UTC</span>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
}
|
||||
@@ -65,7 +65,7 @@ public static class DeterminizationConfigEndpoints
|
||||
private static async Task<IResult> GetEffectiveConfig(
|
||||
HttpContext context,
|
||||
IDeterminizationConfigStore configStore,
|
||||
ILogger<DeterminizationConfigEndpoints> logger,
|
||||
ILogger logger,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var tenantId = GetTenantId(context);
|
||||
@@ -86,7 +86,7 @@ public static class DeterminizationConfigEndpoints
|
||||
}
|
||||
|
||||
private static IResult GetDefaultConfig(
|
||||
ILogger<DeterminizationConfigEndpoints> logger)
|
||||
ILogger logger)
|
||||
{
|
||||
logger.LogDebug("Getting default determinization config");
|
||||
return Results.Ok(new DeterminizationOptions());
|
||||
@@ -95,7 +95,7 @@ public static class DeterminizationConfigEndpoints
|
||||
private static async Task<IResult> GetAuditHistory(
|
||||
HttpContext context,
|
||||
IDeterminizationConfigStore configStore,
|
||||
ILogger<DeterminizationConfigEndpoints> logger,
|
||||
ILogger logger,
|
||||
int limit = 50,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
@@ -122,7 +122,7 @@ public static class DeterminizationConfigEndpoints
|
||||
private static async Task<IResult> UpdateConfig(
|
||||
HttpContext context,
|
||||
IDeterminizationConfigStore configStore,
|
||||
ILogger<DeterminizationConfigEndpoints> logger,
|
||||
ILogger logger,
|
||||
UpdateConfigRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
@@ -171,7 +171,7 @@ public static class DeterminizationConfigEndpoints
|
||||
|
||||
private static IResult ValidateConfig(
|
||||
ValidateConfigRequest request,
|
||||
ILogger<DeterminizationConfigEndpoints> logger)
|
||||
ILogger logger)
|
||||
{
|
||||
logger.LogDebug("Validating determinization config");
|
||||
|
||||
@@ -203,48 +203,43 @@ public static class DeterminizationConfigEndpoints
|
||||
}
|
||||
|
||||
// Validate conflict policy
|
||||
if (config.Conflicts.EscalationSeverityThreshold < 0 || config.Conflicts.EscalationSeverityThreshold > 1)
|
||||
if (config.ConflictPolicy.EscalationSeverityThreshold < 0 || config.ConflictPolicy.EscalationSeverityThreshold > 1)
|
||||
{
|
||||
errors.Add("EscalationSeverityThreshold must be between 0 and 1");
|
||||
}
|
||||
|
||||
if (config.Conflicts.ConflictTtlHours < 1)
|
||||
if (config.ConflictPolicy.ConflictTtlHours < 1)
|
||||
{
|
||||
errors.Add("ConflictTtlHours must be at least 1");
|
||||
}
|
||||
|
||||
// Validate environment thresholds
|
||||
ValidateThresholds(config.Thresholds.Development, "Development", errors, warnings);
|
||||
ValidateThresholds(config.Thresholds.Staging, "Staging", errors, warnings);
|
||||
ValidateThresholds(config.Thresholds.Production, "Production", errors, warnings);
|
||||
ValidateThresholds(config.EnvironmentThresholds.Development, "Development", errors, warnings);
|
||||
ValidateThresholds(config.EnvironmentThresholds.Staging, "Staging", errors, warnings);
|
||||
ValidateThresholds(config.EnvironmentThresholds.Production, "Production", errors, warnings);
|
||||
|
||||
return (errors.Count == 0, errors, warnings);
|
||||
}
|
||||
|
||||
private static void ValidateThresholds(
|
||||
EnvironmentThreshold threshold,
|
||||
EnvironmentThresholdValues threshold,
|
||||
string envName,
|
||||
List<string> errors,
|
||||
List<string> warnings)
|
||||
{
|
||||
if (threshold.EpssThreshold < 0 || threshold.EpssThreshold > 1)
|
||||
if (threshold.MaxPassEntropy < 0 || threshold.MaxPassEntropy > 1)
|
||||
{
|
||||
errors.Add($"{envName}.EpssThreshold must be between 0 and 1");
|
||||
errors.Add($"{envName}.MaxPassEntropy must be between 0 and 1");
|
||||
}
|
||||
|
||||
if (threshold.UncertaintyFactor < 0 || threshold.UncertaintyFactor > 1)
|
||||
if (threshold.MinEvidenceCount < 0)
|
||||
{
|
||||
errors.Add($"{envName}.UncertaintyFactor must be between 0 and 1");
|
||||
errors.Add($"{envName}.MinEvidenceCount must be >= 0");
|
||||
}
|
||||
|
||||
if (threshold.MinScore < 0 || threshold.MinScore > 100)
|
||||
if (threshold.MaxPassEntropy > 0.8)
|
||||
{
|
||||
errors.Add($"{envName}.MinScore must be between 0 and 100");
|
||||
}
|
||||
|
||||
if (threshold.MaxScore < threshold.MinScore)
|
||||
{
|
||||
errors.Add($"{envName}.MaxScore must be >= MinScore");
|
||||
warnings.Add($"{envName}.MaxPassEntropy above 0.8 may reduce confidence controls");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -312,5 +307,4 @@ public sealed record AuditEntryDto
|
||||
public string? Summary { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>Logger wrapper for DI.</summary>
|
||||
file class DeterminizationConfigEndpoints { }
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ public sealed class SignalUpdateHandler : ISignalUpdateSubscription
|
||||
IEventPublisher eventPublisher,
|
||||
ILogger<SignalUpdateHandler> logger)
|
||||
: this(observations, gate, eventPublisher,
|
||||
Options.Create(new DeterminizationOptions()),
|
||||
Microsoft.Extensions.Options.Options.Create(new DeterminizationOptions()),
|
||||
TimeProvider.System,
|
||||
logger)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,595 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ComplianceController.cs
|
||||
// Sprint: SPRINT_20260117_039_ReleaseOrchestrator_compliance
|
||||
// Task: TASK-039-07 - REST API for compliance status, reports, evidence, and audit queries
|
||||
// Description: API endpoints for compliance management
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Api.Controllers;
|
||||
|
||||
/// <summary>
|
||||
/// API endpoints for compliance management, reporting, and auditing.
|
||||
/// </summary>
|
||||
[ApiController]
|
||||
[Route("api/v1/compliance")]
|
||||
[Authorize]
|
||||
public sealed class ComplianceController : ControllerBase
|
||||
{
|
||||
private readonly IComplianceEngine _complianceEngine;
|
||||
private readonly IReportGenerator _reportGenerator;
|
||||
private readonly IEvidenceChainVisualizer _evidenceChainVisualizer;
|
||||
private readonly IAuditQueryEngine _auditQueryEngine;
|
||||
private readonly IScheduledReportService _scheduledReportService;
|
||||
|
||||
public ComplianceController(
|
||||
IComplianceEngine complianceEngine,
|
||||
IReportGenerator reportGenerator,
|
||||
IEvidenceChainVisualizer evidenceChainVisualizer,
|
||||
IAuditQueryEngine auditQueryEngine,
|
||||
IScheduledReportService scheduledReportService)
|
||||
{
|
||||
_complianceEngine = complianceEngine;
|
||||
_reportGenerator = reportGenerator;
|
||||
_evidenceChainVisualizer = evidenceChainVisualizer;
|
||||
_auditQueryEngine = auditQueryEngine;
|
||||
_scheduledReportService = scheduledReportService;
|
||||
}
|
||||
|
||||
#region Compliance Status
|
||||
|
||||
/// <summary>
|
||||
/// Gets overall compliance status.
|
||||
/// </summary>
|
||||
[HttpGet("status")]
|
||||
[ProducesResponseType(typeof(ComplianceStatusResponse), 200)]
|
||||
public async Task<IActionResult> GetComplianceStatus(CancellationToken ct)
|
||||
{
|
||||
var status = await _complianceEngine.GetOverallStatusAsync(ct);
|
||||
return Ok(status);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets compliance status for a specific framework.
|
||||
/// </summary>
|
||||
[HttpGet("status/{framework}")]
|
||||
[ProducesResponseType(typeof(FrameworkComplianceStatus), 200)]
|
||||
public async Task<IActionResult> GetFrameworkStatus(
|
||||
[FromRoute] string framework,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var status = await _complianceEngine.GetFrameworkStatusAsync(framework, ct);
|
||||
if (status is null)
|
||||
return NotFound(new { Message = $"Framework '{framework}' not found" });
|
||||
|
||||
return Ok(status);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates compliance for a release.
|
||||
/// </summary>
|
||||
[HttpPost("evaluate/{releaseId}")]
|
||||
[ProducesResponseType(typeof(ComplianceEvaluationResult), 200)]
|
||||
public async Task<IActionResult> EvaluateRelease(
|
||||
[FromRoute] string releaseId,
|
||||
[FromBody] EvaluateComplianceRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var result = await _complianceEngine.EvaluateReleaseAsync(
|
||||
releaseId,
|
||||
request.Frameworks ?? [],
|
||||
ct);
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Reports
|
||||
|
||||
/// <summary>
|
||||
/// Lists available report templates.
|
||||
/// </summary>
|
||||
[HttpGet("reports/templates")]
|
||||
[ProducesResponseType(typeof(ImmutableArray<ReportTemplate>), 200)]
|
||||
public IActionResult GetReportTemplates()
|
||||
{
|
||||
var templates = _reportGenerator.GetAvailableTemplates();
|
||||
return Ok(templates);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates a compliance report.
|
||||
/// </summary>
|
||||
[HttpPost("reports/generate")]
|
||||
[ProducesResponseType(typeof(GeneratedReport), 200)]
|
||||
public async Task<IActionResult> GenerateReport(
|
||||
[FromBody] GenerateReportRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var report = await _reportGenerator.GenerateAsync(
|
||||
request.TemplateId,
|
||||
request.Parameters,
|
||||
ct);
|
||||
|
||||
return Ok(report);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Downloads a generated report.
|
||||
/// </summary>
|
||||
[HttpGet("reports/{reportId}/download")]
|
||||
[ProducesResponseType(typeof(FileResult), 200)]
|
||||
public async Task<IActionResult> DownloadReport(
|
||||
[FromRoute] string reportId,
|
||||
[FromQuery] string format = "pdf",
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var report = await _reportGenerator.GetReportAsync(reportId, ct);
|
||||
if (report is null)
|
||||
return NotFound(new { Message = $"Report '{reportId}' not found" });
|
||||
|
||||
var content = await _reportGenerator.RenderAsync(report, format, ct);
|
||||
return File(content.Data, content.ContentType, content.FileName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists generated reports.
|
||||
/// </summary>
|
||||
[HttpGet("reports")]
|
||||
[ProducesResponseType(typeof(PagedResult<ReportSummary>), 200)]
|
||||
public async Task<IActionResult> ListReports(
|
||||
[FromQuery] int offset = 0,
|
||||
[FromQuery] int limit = 20,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var reports = await _reportGenerator.ListReportsAsync(offset, limit, ct);
|
||||
return Ok(reports);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Scheduled Reports
|
||||
|
||||
/// <summary>
|
||||
/// Creates a scheduled report.
|
||||
/// </summary>
|
||||
[HttpPost("reports/scheduled")]
|
||||
[ProducesResponseType(typeof(ScheduledReport), 201)]
|
||||
public async Task<IActionResult> CreateScheduledReport(
|
||||
[FromBody] CreateScheduledReportRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var scheduled = await _scheduledReportService.CreateAsync(request, ct);
|
||||
return CreatedAtAction(
|
||||
nameof(GetScheduledReport),
|
||||
new { scheduleId = scheduled.Id },
|
||||
scheduled);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a scheduled report.
|
||||
/// </summary>
|
||||
[HttpGet("reports/scheduled/{scheduleId}")]
|
||||
[ProducesResponseType(typeof(ScheduledReport), 200)]
|
||||
public async Task<IActionResult> GetScheduledReport(
|
||||
[FromRoute] string scheduleId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var scheduled = await _scheduledReportService.GetAsync(scheduleId, ct);
|
||||
if (scheduled is null)
|
||||
return NotFound();
|
||||
|
||||
return Ok(scheduled);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists scheduled reports.
|
||||
/// </summary>
|
||||
[HttpGet("reports/scheduled")]
|
||||
[ProducesResponseType(typeof(ImmutableArray<ScheduledReport>), 200)]
|
||||
public async Task<IActionResult> ListScheduledReports(CancellationToken ct)
|
||||
{
|
||||
var scheduled = await _scheduledReportService.ListAsync(ct);
|
||||
return Ok(scheduled);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates a scheduled report.
|
||||
/// </summary>
|
||||
[HttpPut("reports/scheduled/{scheduleId}")]
|
||||
[ProducesResponseType(typeof(ScheduledReport), 200)]
|
||||
public async Task<IActionResult> UpdateScheduledReport(
|
||||
[FromRoute] string scheduleId,
|
||||
[FromBody] UpdateScheduledReportRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var scheduled = await _scheduledReportService.UpdateAsync(scheduleId, request, ct);
|
||||
if (scheduled is null)
|
||||
return NotFound();
|
||||
|
||||
return Ok(scheduled);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deletes a scheduled report.
|
||||
/// </summary>
|
||||
[HttpDelete("reports/scheduled/{scheduleId}")]
|
||||
[ProducesResponseType(204)]
|
||||
public async Task<IActionResult> DeleteScheduledReport(
|
||||
[FromRoute] string scheduleId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var deleted = await _scheduledReportService.DeleteAsync(scheduleId, ct);
|
||||
if (!deleted)
|
||||
return NotFound();
|
||||
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Evidence Chain
|
||||
|
||||
/// <summary>
|
||||
/// Gets evidence chain for a release.
|
||||
/// </summary>
|
||||
[HttpGet("evidence/{releaseId}/chain")]
|
||||
[ProducesResponseType(typeof(EvidenceChainResponse), 200)]
|
||||
public async Task<IActionResult> GetEvidenceChain(
|
||||
[FromRoute] string releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var chain = await _evidenceChainVisualizer.BuildChainAsync(releaseId, ct);
|
||||
return Ok(new EvidenceChainResponse
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Chain = chain
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies evidence chain integrity.
|
||||
/// </summary>
|
||||
[HttpPost("evidence/{releaseId}/verify")]
|
||||
[ProducesResponseType(typeof(ChainVerificationResult), 200)]
|
||||
public async Task<IActionResult> VerifyEvidenceChain(
|
||||
[FromRoute] string releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var chain = await _evidenceChainVisualizer.BuildChainAsync(releaseId, ct);
|
||||
var result = await _evidenceChainVisualizer.VerifyChainAsync(chain, ct);
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets evidence chain visualization.
|
||||
/// </summary>
|
||||
[HttpGet("evidence/{releaseId}/graph")]
|
||||
[ProducesResponseType(typeof(EvidenceChainGraph), 200)]
|
||||
public async Task<IActionResult> GetEvidenceGraph(
|
||||
[FromRoute] string releaseId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var chain = await _evidenceChainVisualizer.BuildChainAsync(releaseId, ct);
|
||||
var graph = _evidenceChainVisualizer.ToGraph(chain);
|
||||
return Ok(graph);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exports evidence chain.
|
||||
/// </summary>
|
||||
[HttpGet("evidence/{releaseId}/export")]
|
||||
public async Task<IActionResult> ExportEvidenceChain(
|
||||
[FromRoute] string releaseId,
|
||||
[FromQuery] ExportFormat format = ExportFormat.Json,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var chain = await _evidenceChainVisualizer.BuildChainAsync(releaseId, ct);
|
||||
var result = await _evidenceChainVisualizer.ExportAsync(chain, format, ct);
|
||||
|
||||
return File(
|
||||
System.Text.Encoding.UTF8.GetBytes(result.Content),
|
||||
result.ContentType,
|
||||
result.FileName);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Audit Queries
|
||||
|
||||
/// <summary>
|
||||
/// Queries audit logs.
|
||||
/// </summary>
|
||||
[HttpPost("audit/query")]
|
||||
[ProducesResponseType(typeof(AuditQueryResult), 200)]
|
||||
public async Task<IActionResult> QueryAuditLogs(
|
||||
[FromBody] AuditQueryRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
Action = request.Action,
|
||||
Actor = request.Actor,
|
||||
ResourceType = request.ResourceType,
|
||||
ResourceId = request.ResourceId,
|
||||
FromTimestamp = request.FromTimestamp,
|
||||
ToTimestamp = request.ToTimestamp,
|
||||
SearchText = request.SearchText,
|
||||
SortBy = request.SortBy,
|
||||
SortDescending = request.SortDescending,
|
||||
Offset = request.Offset,
|
||||
Limit = request.Limit
|
||||
};
|
||||
|
||||
var result = await _auditQueryEngine.QueryAsync(query, ct);
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets audit activity summary.
|
||||
/// </summary>
|
||||
[HttpGet("audit/summary")]
|
||||
[ProducesResponseType(typeof(ActivitySummary), 200)]
|
||||
public async Task<IActionResult> GetAuditSummary(
|
||||
[FromQuery] DateTimeOffset? from = null,
|
||||
[FromQuery] DateTimeOffset? to = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var fromDate = from ?? DateTimeOffset.UtcNow.AddDays(-30);
|
||||
var toDate = to ?? DateTimeOffset.UtcNow;
|
||||
|
||||
var summary = await _auditQueryEngine.GetActivitySummaryAsync(fromDate, toDate, ct);
|
||||
return Ok(summary);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets aggregated audit data.
|
||||
/// </summary>
|
||||
[HttpPost("audit/aggregate")]
|
||||
[ProducesResponseType(typeof(AggregationResult), 200)]
|
||||
public async Task<IActionResult> AggregateAuditLogs(
|
||||
[FromBody] AuditAggregationRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
FromTimestamp = request.FromTimestamp,
|
||||
ToTimestamp = request.ToTimestamp
|
||||
};
|
||||
|
||||
var aggregation = new AggregationSpec
|
||||
{
|
||||
GroupBy = request.GroupBy
|
||||
};
|
||||
|
||||
var result = await _auditQueryEngine.AggregateAsync(query, aggregation, ct);
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets audit trail for a resource.
|
||||
/// </summary>
|
||||
[HttpGet("audit/resource/{resourceType}/{resourceId}")]
|
||||
[ProducesResponseType(typeof(ResourceAuditTrail), 200)]
|
||||
public async Task<IActionResult> GetResourceAuditTrail(
|
||||
[FromRoute] string resourceType,
|
||||
[FromRoute] string resourceId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var trail = await _auditQueryEngine.GetResourceTrailAsync(resourceType, resourceId, ct);
|
||||
return Ok(trail);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets actor activity report.
|
||||
/// </summary>
|
||||
[HttpGet("audit/actor/{actor}")]
|
||||
[ProducesResponseType(typeof(ActorActivityReport), 200)]
|
||||
public async Task<IActionResult> GetActorActivity(
|
||||
[FromRoute] string actor,
|
||||
[FromQuery] DateTimeOffset? from = null,
|
||||
[FromQuery] DateTimeOffset? to = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var fromDate = from ?? DateTimeOffset.UtcNow.AddDays(-30);
|
||||
var toDate = to ?? DateTimeOffset.UtcNow;
|
||||
|
||||
var report = await _auditQueryEngine.GetActorActivityAsync(actor, fromDate, toDate, ct);
|
||||
return Ok(report);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exports audit logs.
|
||||
/// </summary>
|
||||
[HttpPost("audit/export")]
|
||||
public async Task<IActionResult> ExportAuditLogs(
|
||||
[FromBody] AuditExportRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
FromTimestamp = request.FromTimestamp,
|
||||
ToTimestamp = request.ToTimestamp,
|
||||
Action = request.Action,
|
||||
Actor = request.Actor,
|
||||
Limit = 100000 // Allow large exports
|
||||
};
|
||||
|
||||
var result = await _auditQueryEngine.ExportAsync(query, request.Format, ct);
|
||||
|
||||
return File(
|
||||
System.Text.Encoding.UTF8.GetBytes(result.Content),
|
||||
GetContentType(request.Format),
|
||||
$"audit-export-{DateTime.UtcNow:yyyyMMdd}.{GetExtension(request.Format)}");
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Controls
|
||||
|
||||
/// <summary>
|
||||
/// Lists compliance controls.
|
||||
/// </summary>
|
||||
[HttpGet("controls")]
|
||||
[ProducesResponseType(typeof(ImmutableArray<ComplianceControl>), 200)]
|
||||
public async Task<IActionResult> ListControls(
|
||||
[FromQuery] string? framework = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var controls = await _complianceEngine.GetControlsAsync(framework, ct);
|
||||
return Ok(controls);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets control status.
|
||||
/// </summary>
|
||||
[HttpGet("controls/{controlId}/status")]
|
||||
[ProducesResponseType(typeof(ControlStatus), 200)]
|
||||
public async Task<IActionResult> GetControlStatus(
|
||||
[FromRoute] string controlId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var status = await _complianceEngine.GetControlStatusAsync(controlId, ct);
|
||||
if (status is null)
|
||||
return NotFound();
|
||||
|
||||
return Ok(status);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Helpers
|
||||
|
||||
private static string GetContentType(AuditExportFormat format) => format switch
|
||||
{
|
||||
AuditExportFormat.Csv => "text/csv",
|
||||
AuditExportFormat.Json => "application/json",
|
||||
AuditExportFormat.Syslog => "text/plain",
|
||||
_ => "application/octet-stream"
|
||||
};
|
||||
|
||||
private static string GetExtension(AuditExportFormat format) => format switch
|
||||
{
|
||||
AuditExportFormat.Csv => "csv",
|
||||
AuditExportFormat.Json => "json",
|
||||
AuditExportFormat.Syslog => "log",
|
||||
_ => "bin"
|
||||
};
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Request/Response Models
|
||||
|
||||
public sealed record EvaluateComplianceRequest
|
||||
{
|
||||
public ImmutableArray<string>? Frameworks { get; init; }
|
||||
}
|
||||
|
||||
public sealed record GenerateReportRequest
|
||||
{
|
||||
public required string TemplateId { get; init; }
|
||||
public ImmutableDictionary<string, string>? Parameters { get; init; }
|
||||
}
|
||||
|
||||
public sealed record CreateScheduledReportRequest
|
||||
{
|
||||
public required string TemplateId { get; init; }
|
||||
public required string Schedule { get; init; } // Cron expression
|
||||
public required ImmutableArray<string> Recipients { get; init; }
|
||||
public ImmutableDictionary<string, string>? Parameters { get; init; }
|
||||
}
|
||||
|
||||
public sealed record UpdateScheduledReportRequest
|
||||
{
|
||||
public string? Schedule { get; init; }
|
||||
public ImmutableArray<string>? Recipients { get; init; }
|
||||
public bool? Enabled { get; init; }
|
||||
}
|
||||
|
||||
public sealed record EvidenceChainResponse
|
||||
{
|
||||
public required string ReleaseId { get; init; }
|
||||
public required object Chain { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AuditQueryRequest
|
||||
{
|
||||
public string? Action { get; init; }
|
||||
public string? Actor { get; init; }
|
||||
public string? ResourceType { get; init; }
|
||||
public string? ResourceId { get; init; }
|
||||
public DateTimeOffset? FromTimestamp { get; init; }
|
||||
public DateTimeOffset? ToTimestamp { get; init; }
|
||||
public string? SearchText { get; init; }
|
||||
public string? SortBy { get; init; }
|
||||
public bool SortDescending { get; init; } = true;
|
||||
public int Offset { get; init; } = 0;
|
||||
public int Limit { get; init; } = 100;
|
||||
}
|
||||
|
||||
public sealed record AuditAggregationRequest
|
||||
{
|
||||
public DateTimeOffset? FromTimestamp { get; init; }
|
||||
public DateTimeOffset? ToTimestamp { get; init; }
|
||||
public required GroupByField GroupBy { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AuditExportRequest
|
||||
{
|
||||
public DateTimeOffset? FromTimestamp { get; init; }
|
||||
public DateTimeOffset? ToTimestamp { get; init; }
|
||||
public string? Action { get; init; }
|
||||
public string? Actor { get; init; }
|
||||
public required AuditExportFormat Format { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Service Interfaces (stubs)
|
||||
|
||||
public interface IComplianceEngine
|
||||
{
|
||||
Task<object> GetOverallStatusAsync(CancellationToken ct);
|
||||
Task<object?> GetFrameworkStatusAsync(string framework, CancellationToken ct);
|
||||
Task<object> EvaluateReleaseAsync(string releaseId, ImmutableArray<string> frameworks, CancellationToken ct);
|
||||
Task<ImmutableArray<ComplianceControl>> GetControlsAsync(string? framework, CancellationToken ct);
|
||||
Task<ControlStatus?> GetControlStatusAsync(string controlId, CancellationToken ct);
|
||||
}
|
||||
|
||||
public interface IReportGenerator
|
||||
{
|
||||
ImmutableArray<ReportTemplate> GetAvailableTemplates();
|
||||
Task<GeneratedReport> GenerateAsync(string templateId, ImmutableDictionary<string, string>? parameters, CancellationToken ct);
|
||||
Task<GeneratedReport?> GetReportAsync(string reportId, CancellationToken ct);
|
||||
Task<RenderedReport> RenderAsync(GeneratedReport report, string format, CancellationToken ct);
|
||||
Task<PagedResult<ReportSummary>> ListReportsAsync(int offset, int limit, CancellationToken ct);
|
||||
}
|
||||
|
||||
public interface IScheduledReportService
|
||||
{
|
||||
Task<ScheduledReport> CreateAsync(CreateScheduledReportRequest request, CancellationToken ct);
|
||||
Task<ScheduledReport?> GetAsync(string scheduleId, CancellationToken ct);
|
||||
Task<ImmutableArray<ScheduledReport>> ListAsync(CancellationToken ct);
|
||||
Task<ScheduledReport?> UpdateAsync(string scheduleId, UpdateScheduledReportRequest request, CancellationToken ct);
|
||||
Task<bool> DeleteAsync(string scheduleId, CancellationToken ct);
|
||||
}
|
||||
|
||||
// Additional model stubs
|
||||
public sealed record ComplianceControl { public required string Id { get; init; } public required string Name { get; init; } }
|
||||
public sealed record ControlStatus { public required string ControlId { get; init; } public required string Status { get; init; } }
|
||||
public sealed record ReportTemplate { public required string Id { get; init; } public required string Name { get; init; } }
|
||||
public sealed record GeneratedReport { public required string Id { get; init; } public required string TemplateId { get; init; } }
|
||||
public sealed record RenderedReport { public required byte[] Data { get; init; } public required string ContentType { get; init; } public required string FileName { get; init; } }
|
||||
public sealed record ReportSummary { public required string Id { get; init; } public required string Name { get; init; } }
|
||||
public sealed record PagedResult<T> { public required ImmutableArray<T> Items { get; init; } public required int TotalCount { get; init; } }
|
||||
public sealed record ScheduledReport { public required string Id { get; init; } public required string TemplateId { get; init; } public required bool Enabled { get; init; } }
|
||||
public sealed record ComplianceStatusResponse { public required string OverallStatus { get; init; } }
|
||||
public sealed record FrameworkComplianceStatus { public required string Framework { get; init; } public required string Status { get; init; } }
|
||||
public sealed record ComplianceEvaluationResult { public required string ReleaseId { get; init; } public required bool Compliant { get; init; } }
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,788 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentResilienceIntegrationTests.cs
|
||||
// Sprint: SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience
|
||||
// Task: TASK-034-09 - Integration and chaos tests for failover scenarios
|
||||
// Description: Integration tests for health monitoring, leader election, failover, and self-healing
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Integration and chaos tests for agent resilience features.
|
||||
/// </summary>
|
||||
public sealed class AgentResilienceIntegrationTests
|
||||
{
|
||||
private readonly FakeTimeProvider _timeProvider = new();
|
||||
|
||||
#region Health Monitor Tests
|
||||
|
||||
[Fact]
|
||||
public async Task HealthMonitor_HealthyAgent_ReturnsHealthyStatus()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
|
||||
metricsProvider.SetHealthyMetrics(agentId);
|
||||
connectivityChecker.SetReachable(agentId, true);
|
||||
|
||||
// Act
|
||||
var assessment = await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(AgentHealthStatus.Healthy, assessment.Status);
|
||||
Assert.True(assessment.OverallScore >= 0.85);
|
||||
Assert.Equal(RecommendedAction.None, assessment.Recommendation.Action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task HealthMonitor_DegradedAgent_ReturnsWarning()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
|
||||
metricsProvider.SetDegradedMetrics(agentId);
|
||||
connectivityChecker.SetReachable(agentId, true, latency: TimeSpan.FromMilliseconds(300));
|
||||
|
||||
// Act
|
||||
var assessment = await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Assert
|
||||
Assert.True(assessment.Status is AgentHealthStatus.Warning or AgentHealthStatus.Degraded);
|
||||
Assert.True(assessment.OverallScore < 0.85);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task HealthMonitor_UnreachableAgent_ReturnsCritical()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
|
||||
connectivityChecker.SetReachable(agentId, false);
|
||||
|
||||
// Act
|
||||
var assessment = await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(AgentHealthStatus.Critical, assessment.Status);
|
||||
Assert.Equal(RecommendedAction.FailoverImmediately, assessment.Recommendation.Action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task HealthMonitor_HealthChanged_RaisesEvent()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
|
||||
metricsProvider.SetHealthyMetrics(agentId);
|
||||
connectivityChecker.SetReachable(agentId, true);
|
||||
|
||||
AgentHealthChangedEventArgs? eventArgs = null;
|
||||
monitor.HealthChanged += (_, e) => eventArgs = e;
|
||||
|
||||
// First assessment - establishes baseline
|
||||
await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Change to degraded
|
||||
connectivityChecker.SetReachable(agentId, false);
|
||||
|
||||
// Act
|
||||
await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Assert
|
||||
Assert.NotNull(eventArgs);
|
||||
Assert.Equal(agentId, eventArgs.AgentId);
|
||||
Assert.Equal(AgentHealthStatus.Critical, eventArgs.NewStatus);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task HealthMonitor_TrendAnalysis_DetectsDegradation()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
connectivityChecker.SetReachable(agentId, true);
|
||||
|
||||
// Simulate degrading health over time
|
||||
for (int i = 0; i < 5; i++)
|
||||
{
|
||||
metricsProvider.SetResourceMetrics(agentId, new ResourceMetrics
|
||||
{
|
||||
CpuPercent = 50 + i * 10, // Increasing CPU
|
||||
MemoryPercent = 40 + i * 8,
|
||||
DiskPercent = 30
|
||||
});
|
||||
await monitor.AssessHealthAsync(agentId);
|
||||
_timeProvider.Advance(TimeSpan.FromSeconds(30));
|
||||
}
|
||||
|
||||
// Act
|
||||
var assessment = await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(TrendDirection.Degrading, assessment.Trend.Direction);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Leader Election Tests
|
||||
|
||||
[Fact]
|
||||
public async Task LeaderElection_SingleNode_BecomesLeader()
|
||||
{
|
||||
// Arrange
|
||||
var distributedLock = new InMemoryDistributedLock(_timeProvider);
|
||||
var election = CreateLeaderElection(distributedLock);
|
||||
|
||||
await election.InitializeAsync("node-1");
|
||||
|
||||
// Act
|
||||
var result = await election.ParticipateAsync("my-resource");
|
||||
|
||||
// Assert
|
||||
Assert.True(result.Success);
|
||||
Assert.True(result.IsLeader);
|
||||
Assert.Equal("node-1", result.LeaderId);
|
||||
Assert.Equal(1, result.Term);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LeaderElection_MultipleNodes_OnlyOneLeader()
|
||||
{
|
||||
// Arrange
|
||||
var distributedLock = new InMemoryDistributedLock(_timeProvider);
|
||||
var election1 = CreateLeaderElection(distributedLock);
|
||||
var election2 = CreateLeaderElection(distributedLock);
|
||||
|
||||
await election1.InitializeAsync("node-1");
|
||||
await election2.InitializeAsync("node-2");
|
||||
|
||||
// Act
|
||||
var result1 = await election1.ParticipateAsync("my-resource");
|
||||
var result2 = await election2.ParticipateAsync("my-resource");
|
||||
|
||||
// Assert
|
||||
Assert.True(result1.Success);
|
||||
Assert.True(result2.Success);
|
||||
|
||||
var leaderCount = (result1.IsLeader ? 1 : 0) + (result2.IsLeader ? 1 : 0);
|
||||
Assert.Equal(1, leaderCount);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LeaderElection_Resign_ReleasesLeadership()
|
||||
{
|
||||
// Arrange
|
||||
var distributedLock = new InMemoryDistributedLock(_timeProvider);
|
||||
var election1 = CreateLeaderElection(distributedLock);
|
||||
var election2 = CreateLeaderElection(distributedLock);
|
||||
|
||||
await election1.InitializeAsync("node-1");
|
||||
await election2.InitializeAsync("node-2");
|
||||
|
||||
await election1.ParticipateAsync("my-resource");
|
||||
|
||||
// Act
|
||||
await election1.ResignAsync("my-resource");
|
||||
var result2 = await election2.ParticipateAsync("my-resource");
|
||||
|
||||
// Assert
|
||||
Assert.False(election1.IsLeader("my-resource"));
|
||||
Assert.True(result2.IsLeader);
|
||||
Assert.Equal("node-2", result2.LeaderId);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LeaderElection_LeaseExpiry_AllowsNewLeader()
|
||||
{
|
||||
// Arrange
|
||||
var config = new LeaderElectionConfig { LeaseDuration = TimeSpan.FromSeconds(5) };
|
||||
var distributedLock = new InMemoryDistributedLock(_timeProvider);
|
||||
var election1 = CreateLeaderElection(distributedLock, config);
|
||||
var election2 = CreateLeaderElection(distributedLock, config);
|
||||
|
||||
await election1.InitializeAsync("node-1");
|
||||
await election2.InitializeAsync("node-2");
|
||||
|
||||
await election1.ParticipateAsync("my-resource");
|
||||
|
||||
// Act - advance time past lease expiry
|
||||
_timeProvider.Advance(TimeSpan.FromSeconds(10));
|
||||
var result2 = await election2.ParticipateAsync("my-resource");
|
||||
|
||||
// Assert
|
||||
Assert.True(result2.IsLeader);
|
||||
Assert.Equal("node-2", result2.LeaderId);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Self-Healer Tests
|
||||
|
||||
[Fact]
|
||||
public async Task SelfHealer_HealthyAgent_NoActionNeeded()
|
||||
{
|
||||
// Arrange
|
||||
var (healer, healthMonitor, _) = CreateSelfHealer();
|
||||
|
||||
healthMonitor.SetHealthyAgent("agent-1");
|
||||
|
||||
// Act
|
||||
var result = await healer.HealAsync("agent-1");
|
||||
|
||||
// Assert
|
||||
Assert.True(result.Success);
|
||||
Assert.Equal(HealingStatus.NotNeeded, result.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SelfHealer_DegradedAgent_ExecutesRecoveryActions()
|
||||
{
|
||||
// Arrange
|
||||
var (healer, healthMonitor, executor) = CreateSelfHealer();
|
||||
|
||||
healthMonitor.SetDegradedAgent("agent-1", [
|
||||
new HealthFactor { Name = "QueueDepth", Score = 0.2, Status = FactorStatus.Degraded, Weight = 1.0 }
|
||||
]);
|
||||
|
||||
// Act
|
||||
var result = await healer.HealAsync("agent-1");
|
||||
|
||||
// Assert
|
||||
Assert.True(result.Success || result.Status == HealingStatus.PartialRecovery);
|
||||
Assert.NotEmpty(result.ActionResults);
|
||||
Assert.True(executor.ExecutedActions.Count > 0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SelfHealer_CircuitBreaker_OpensAfterRepeatedFailures()
|
||||
{
|
||||
// Arrange
|
||||
var config = new SelfHealerConfig { CircuitBreakerThreshold = 3 };
|
||||
var (healer, healthMonitor, executor) = CreateSelfHealer(config);
|
||||
|
||||
healthMonitor.SetCriticalAgent("agent-1");
|
||||
executor.AlwaysFail = true;
|
||||
|
||||
// Act - trigger 3 failures
|
||||
for (int i = 0; i < 3; i++)
|
||||
{
|
||||
await healer.HealAsync("agent-1");
|
||||
}
|
||||
|
||||
// Assert - 4th attempt should be blocked
|
||||
var result = await healer.HealAsync("agent-1");
|
||||
Assert.Equal(HealingStatus.CircuitOpen, result.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SelfHealer_CircuitBreaker_ResetsAfterTimeout()
|
||||
{
|
||||
// Arrange
|
||||
var config = new SelfHealerConfig
|
||||
{
|
||||
CircuitBreakerThreshold = 2,
|
||||
CircuitBreakerResetTime = TimeSpan.FromMinutes(1)
|
||||
};
|
||||
var (healer, healthMonitor, executor) = CreateSelfHealer(config);
|
||||
|
||||
healthMonitor.SetCriticalAgent("agent-1");
|
||||
executor.AlwaysFail = true;
|
||||
|
||||
// Trigger failures
|
||||
await healer.HealAsync("agent-1");
|
||||
await healer.HealAsync("agent-1");
|
||||
|
||||
// Circuit should be open
|
||||
var blockedResult = await healer.HealAsync("agent-1");
|
||||
Assert.Equal(HealingStatus.CircuitOpen, blockedResult.Status);
|
||||
|
||||
// Act - advance time past reset
|
||||
_timeProvider.Advance(TimeSpan.FromMinutes(2));
|
||||
executor.AlwaysFail = false;
|
||||
healthMonitor.SetHealthyAgent("agent-1");
|
||||
|
||||
var result = await healer.HealAsync("agent-1");
|
||||
|
||||
// Assert - should attempt again
|
||||
Assert.NotEqual(HealingStatus.CircuitOpen, result.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task SelfHealer_RecoveryHistory_TracksAttempts()
|
||||
{
|
||||
// Arrange
|
||||
var (healer, healthMonitor, _) = CreateSelfHealer();
|
||||
|
||||
healthMonitor.SetDegradedAgent("agent-1", [
|
||||
new HealthFactor { Name = "ErrorRate", Score = 0.3, Status = FactorStatus.Degraded, Weight = 1.0 }
|
||||
]);
|
||||
|
||||
// Act
|
||||
await healer.HealAsync("agent-1");
|
||||
await healer.HealAsync("agent-1");
|
||||
|
||||
var history = healer.GetRecoveryHistory("agent-1");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(2, history.Length);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region State Sync Tests
|
||||
|
||||
[Fact]
|
||||
public async Task StateSync_SetAndGet_ReturnsValue()
|
||||
{
|
||||
// Arrange
|
||||
var sync = await CreateInitializedStateSync("node-1");
|
||||
|
||||
// Act
|
||||
await sync.SetAsync("test-key", "test-value");
|
||||
var result = await sync.GetAsync<string>("test-key");
|
||||
|
||||
// Assert
|
||||
Assert.Equal("test-value", result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task StateSync_Delete_RemovesValue()
|
||||
{
|
||||
// Arrange
|
||||
var sync = await CreateInitializedStateSync("node-1");
|
||||
await sync.SetAsync("test-key", "test-value");
|
||||
|
||||
// Act
|
||||
await sync.DeleteAsync("test-key");
|
||||
var result = await sync.GetAsync<string>("test-key");
|
||||
|
||||
// Assert
|
||||
Assert.Null(result);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task StateSync_GetByPrefix_FiltersCorrectly()
|
||||
{
|
||||
// Arrange
|
||||
var sync = await CreateInitializedStateSync("node-1");
|
||||
await sync.SetAsync("agents:agent-1", "data1");
|
||||
await sync.SetAsync("agents:agent-2", "data2");
|
||||
await sync.SetAsync("config:setting", "value");
|
||||
|
||||
// Act
|
||||
var agentEntries = sync.GetByPrefix("agents:");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(2, agentEntries.Length);
|
||||
Assert.All(agentEntries, e => Assert.StartsWith("agents:", e.Key));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task StateSync_VectorClock_MergesCorrectly()
|
||||
{
|
||||
// Arrange
|
||||
var clock1 = new VectorClock().Increment("node-1").Increment("node-1");
|
||||
var clock2 = new VectorClock().Increment("node-2");
|
||||
|
||||
// Act
|
||||
var merged = clock1.Merge(clock2);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(0, merged.CompareTo(clock1)); // Should be concurrent or equal
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Chaos Tests
|
||||
|
||||
[Fact]
|
||||
public async Task Chaos_NetworkPartition_TriggersFailover()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
|
||||
metricsProvider.SetHealthyMetrics(agentId);
|
||||
connectivityChecker.SetReachable(agentId, true);
|
||||
|
||||
// Initial healthy state
|
||||
await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Act - simulate network partition
|
||||
connectivityChecker.SetReachable(agentId, false);
|
||||
var assessment = await monitor.AssessHealthAsync(agentId);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(AgentHealthStatus.Critical, assessment.Status);
|
||||
Assert.Equal(RecommendedAction.FailoverImmediately, assessment.Recommendation.Action);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Chaos_ResourceExhaustion_TriggersHealing()
|
||||
{
|
||||
// Arrange
|
||||
var (healer, healthMonitor, executor) = CreateSelfHealer();
|
||||
|
||||
healthMonitor.SetDegradedAgent("agent-1", [
|
||||
new HealthFactor { Name = "Resources", Score = 0.1, Status = FactorStatus.Critical, Weight = 1.5, Details = "Memory: 95%" }
|
||||
]);
|
||||
|
||||
// Act
|
||||
var result = await healer.HealAsync("agent-1");
|
||||
|
||||
// Assert
|
||||
Assert.NotEmpty(result.ActionResults);
|
||||
var clearCacheAction = result.ActionResults.FirstOrDefault(
|
||||
a => a.Action.Type == RecoveryActionType.ClearCaches);
|
||||
Assert.NotNull(clearCacheAction);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Chaos_RapidHealthFluctuation_StabilizesWithDebounce()
|
||||
{
|
||||
// Arrange
|
||||
var metricsProvider = new FakeMetricsProvider();
|
||||
var connectivityChecker = new FakeConnectivityChecker();
|
||||
var monitor = CreateHealthMonitor(metricsProvider, connectivityChecker);
|
||||
|
||||
var agentId = "agent-1";
|
||||
monitor.RegisterAgent(agentId, new AgentEndpoint("localhost", 8443));
|
||||
|
||||
var statusChanges = new List<AgentHealthStatus>();
|
||||
monitor.HealthChanged += (_, e) => statusChanges.Add(e.NewStatus);
|
||||
|
||||
// Act - rapid fluctuations
|
||||
for (int i = 0; i < 10; i++)
|
||||
{
|
||||
if (i % 2 == 0)
|
||||
{
|
||||
metricsProvider.SetHealthyMetrics(agentId);
|
||||
connectivityChecker.SetReachable(agentId, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
connectivityChecker.SetReachable(agentId, false);
|
||||
}
|
||||
await monitor.AssessHealthAsync(agentId);
|
||||
}
|
||||
|
||||
// Assert - should have recorded changes
|
||||
Assert.True(statusChanges.Count > 0);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Setup Helpers
|
||||
|
||||
private HealthMonitor CreateHealthMonitor(
|
||||
IMetricsProvider metricsProvider,
|
||||
IConnectivityChecker connectivityChecker)
|
||||
{
|
||||
return new HealthMonitor(
|
||||
metricsProvider,
|
||||
connectivityChecker,
|
||||
new HealthMonitorConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<HealthMonitor>.Instance);
|
||||
}
|
||||
|
||||
private LeaderElection CreateLeaderElection(
|
||||
IDistributedLock distributedLock,
|
||||
LeaderElectionConfig? config = null)
|
||||
{
|
||||
return new LeaderElection(
|
||||
distributedLock,
|
||||
config ?? new LeaderElectionConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<LeaderElection>.Instance);
|
||||
}
|
||||
|
||||
private (SelfHealer, FakeHealthMonitor, FakeRecoveryExecutor) CreateSelfHealer(
|
||||
SelfHealerConfig? config = null)
|
||||
{
|
||||
var healthMonitor = new FakeHealthMonitor();
|
||||
var executor = new FakeRecoveryExecutor();
|
||||
|
||||
var healer = new SelfHealer(
|
||||
healthMonitor,
|
||||
executor,
|
||||
config ?? new SelfHealerConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<SelfHealer>.Instance);
|
||||
|
||||
return (healer, healthMonitor, executor);
|
||||
}
|
||||
|
||||
private async Task<StateSync> CreateInitializedStateSync(string nodeId)
|
||||
{
|
||||
var transport = new FakeStateSyncTransport();
|
||||
var store = new FakeStateStore();
|
||||
|
||||
var sync = new StateSync(
|
||||
transport,
|
||||
store,
|
||||
new StateSyncConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<StateSync>.Instance);
|
||||
|
||||
await sync.InitializeAsync(nodeId);
|
||||
return sync;
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Test Doubles
|
||||
|
||||
public sealed class FakeTimeProvider : TimeProvider
|
||||
{
|
||||
private DateTimeOffset _now = new(2026, 1, 17, 12, 0, 0, TimeSpan.Zero);
|
||||
public override DateTimeOffset GetUtcNow() => _now;
|
||||
public void Advance(TimeSpan duration) => _now = _now.Add(duration);
|
||||
}
|
||||
|
||||
public sealed class FakeMetricsProvider : IMetricsProvider
|
||||
{
|
||||
private readonly Dictionary<string, ResourceMetrics> _resourceMetrics = new();
|
||||
private readonly Dictionary<string, TaskMetrics> _taskMetrics = new();
|
||||
private readonly Dictionary<string, ErrorMetrics> _errorMetrics = new();
|
||||
private readonly Dictionary<string, QueueMetrics> _queueMetrics = new();
|
||||
|
||||
public void SetHealthyMetrics(string agentId)
|
||||
{
|
||||
_resourceMetrics[agentId] = new ResourceMetrics { CpuPercent = 30, MemoryPercent = 40, DiskPercent = 50 };
|
||||
_taskMetrics[agentId] = new TaskMetrics { TotalTasks = 100, SuccessfulTasks = 99, FailedTasks = 1 };
|
||||
_errorMetrics[agentId] = new ErrorMetrics { TotalRequests = 1000, ErrorCount = 5 };
|
||||
_queueMetrics[agentId] = new QueueMetrics { CurrentQueueSize = 10, MaxQueueSize = 100 };
|
||||
}
|
||||
|
||||
public void SetDegradedMetrics(string agentId)
|
||||
{
|
||||
_resourceMetrics[agentId] = new ResourceMetrics { CpuPercent = 85, MemoryPercent = 80, DiskPercent = 70 };
|
||||
_taskMetrics[agentId] = new TaskMetrics { TotalTasks = 100, SuccessfulTasks = 80, FailedTasks = 20 };
|
||||
_errorMetrics[agentId] = new ErrorMetrics { TotalRequests = 1000, ErrorCount = 80 };
|
||||
_queueMetrics[agentId] = new QueueMetrics { CurrentQueueSize = 80, MaxQueueSize = 100 };
|
||||
}
|
||||
|
||||
public void SetResourceMetrics(string agentId, ResourceMetrics metrics)
|
||||
{
|
||||
_resourceMetrics[agentId] = metrics;
|
||||
}
|
||||
|
||||
public Task<ResourceMetrics> GetResourceMetricsAsync(string agentId, CancellationToken ct = default)
|
||||
=> Task.FromResult(_resourceMetrics.GetValueOrDefault(agentId) ?? new ResourceMetrics());
|
||||
|
||||
public Task<TaskMetrics> GetTaskMetricsAsync(string agentId, CancellationToken ct = default)
|
||||
=> Task.FromResult(_taskMetrics.GetValueOrDefault(agentId) ?? new TaskMetrics());
|
||||
|
||||
public Task<ErrorMetrics> GetErrorMetricsAsync(string agentId, CancellationToken ct = default)
|
||||
=> Task.FromResult(_errorMetrics.GetValueOrDefault(agentId) ?? new ErrorMetrics());
|
||||
|
||||
public Task<QueueMetrics> GetQueueMetricsAsync(string agentId, CancellationToken ct = default)
|
||||
=> Task.FromResult(_queueMetrics.GetValueOrDefault(agentId) ?? new QueueMetrics());
|
||||
}
|
||||
|
||||
public sealed class FakeConnectivityChecker : IConnectivityChecker
|
||||
{
|
||||
private readonly Dictionary<string, (bool reachable, TimeSpan latency)> _connectivity = new();
|
||||
|
||||
public void SetReachable(string agentId, bool reachable, TimeSpan? latency = null)
|
||||
{
|
||||
_connectivity[agentId] = (reachable, latency ?? TimeSpan.FromMilliseconds(50));
|
||||
}
|
||||
|
||||
public Task<ConnectivityResult> CheckAsync(AgentEndpoint endpoint, CancellationToken ct = default)
|
||||
{
|
||||
var key = $"{endpoint.Host}:{endpoint.Port}";
|
||||
|
||||
// Try to find by partial match
|
||||
var entry = _connectivity.FirstOrDefault(kv => true);
|
||||
var isReachable = entry.Value.reachable;
|
||||
|
||||
return Task.FromResult(new ConnectivityResult
|
||||
{
|
||||
IsReachable = isReachable,
|
||||
Error = isReachable ? null : "Connection refused"
|
||||
});
|
||||
}
|
||||
|
||||
public Task<TimeSpan> MeasureLatencyAsync(AgentEndpoint endpoint, CancellationToken ct = default)
|
||||
{
|
||||
var entry = _connectivity.FirstOrDefault(kv => true);
|
||||
return Task.FromResult(entry.Value.latency);
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeHealthMonitor : IHealthMonitor
|
||||
{
|
||||
private readonly Dictionary<string, AgentHealthAssessment> _assessments = new();
|
||||
|
||||
public void SetHealthyAgent(string agentId)
|
||||
{
|
||||
_assessments[agentId] = new AgentHealthAssessment
|
||||
{
|
||||
AgentId = agentId,
|
||||
Status = AgentHealthStatus.Healthy,
|
||||
OverallScore = 0.95,
|
||||
Factors = [],
|
||||
Trend = new HealthTrend { Direction = TrendDirection.Stable, Confidence = 0.8 },
|
||||
AssessedAt = DateTimeOffset.UtcNow,
|
||||
Recommendation = new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.None,
|
||||
Urgency = ActionUrgency.None,
|
||||
Reason = "Healthy",
|
||||
AffectedFactors = []
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public void SetDegradedAgent(string agentId, ImmutableArray<HealthFactor> factors)
|
||||
{
|
||||
_assessments[agentId] = new AgentHealthAssessment
|
||||
{
|
||||
AgentId = agentId,
|
||||
Status = AgentHealthStatus.Degraded,
|
||||
OverallScore = 0.5,
|
||||
Factors = factors,
|
||||
Trend = new HealthTrend { Direction = TrendDirection.Degrading, Confidence = 0.7 },
|
||||
AssessedAt = DateTimeOffset.UtcNow,
|
||||
Recommendation = new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.InvestigateAndRemediate,
|
||||
Urgency = ActionUrgency.Medium,
|
||||
Reason = "Degraded",
|
||||
AffectedFactors = factors.Select(f => f.Name).ToImmutableArray()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public void SetCriticalAgent(string agentId)
|
||||
{
|
||||
_assessments[agentId] = new AgentHealthAssessment
|
||||
{
|
||||
AgentId = agentId,
|
||||
Status = AgentHealthStatus.Critical,
|
||||
OverallScore = 0.1,
|
||||
Factors = [new HealthFactor { Name = "Connectivity", Score = 0, Status = FactorStatus.Critical, Weight = 2.0 }],
|
||||
Trend = new HealthTrend { Direction = TrendDirection.Degrading, Confidence = 0.9 },
|
||||
AssessedAt = DateTimeOffset.UtcNow,
|
||||
Recommendation = new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.FailoverImmediately,
|
||||
Urgency = ActionUrgency.Critical,
|
||||
Reason = "Critical",
|
||||
AffectedFactors = ["Connectivity"]
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
public Task StartAsync(CancellationToken ct = default) => Task.CompletedTask;
|
||||
public Task StopAsync() => Task.CompletedTask;
|
||||
public void RegisterAgent(string agentId, AgentEndpoint endpoint) { }
|
||||
public void UnregisterAgent(string agentId) => _assessments.Remove(agentId);
|
||||
public void RegisterCustomCheck(string name, Func<CancellationToken, Task<HealthCheckResult>> check) { }
|
||||
|
||||
public Task<AgentHealthAssessment> AssessHealthAsync(string agentId, CancellationToken ct = default)
|
||||
{
|
||||
if (!_assessments.TryGetValue(agentId, out var assessment))
|
||||
throw new InvalidOperationException($"Agent {agentId} not registered");
|
||||
return Task.FromResult(assessment);
|
||||
}
|
||||
|
||||
public Task<ImmutableArray<AgentHealthAssessment>> AssessAllAgentsAsync(CancellationToken ct = default)
|
||||
=> Task.FromResult(_assessments.Values.ToImmutableArray());
|
||||
|
||||
public ImmutableDictionary<string, AgentHealthStatus> GetAllAgentStatuses()
|
||||
=> _assessments.ToImmutableDictionary(kv => kv.Key, kv => kv.Value.Status);
|
||||
|
||||
public ImmutableArray<string> GetAgentsByStatus(AgentHealthStatus status)
|
||||
=> _assessments.Where(kv => kv.Value.Status == status).Select(kv => kv.Key).ToImmutableArray();
|
||||
|
||||
public event EventHandler<AgentHealthChangedEventArgs>? HealthChanged;
|
||||
}
|
||||
|
||||
public sealed class FakeRecoveryExecutor : IRecoveryActionExecutor
|
||||
{
|
||||
public List<(string AgentId, RecoveryAction Action)> ExecutedActions { get; } = new();
|
||||
public bool AlwaysFail { get; set; }
|
||||
|
||||
public Task ExecuteAsync(string agentId, RecoveryAction action, CancellationToken ct = default)
|
||||
{
|
||||
if (AlwaysFail)
|
||||
throw new Exception("Simulated failure");
|
||||
|
||||
ExecutedActions.Add((agentId, action));
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeStateSyncTransport : IStateSyncTransport
|
||||
{
|
||||
public Task<ImmutableArray<string>> GetPeersAsync(CancellationToken ct = default)
|
||||
=> Task.FromResult(ImmutableArray<string>.Empty);
|
||||
|
||||
public Task SendAsync(string peerId, SyncMessage message, CancellationToken ct = default)
|
||||
=> Task.CompletedTask;
|
||||
|
||||
public Task<StateDigest> GetDigestAsync(string peerId, CancellationToken ct = default)
|
||||
=> Task.FromResult(new StateDigest
|
||||
{
|
||||
NodeId = peerId,
|
||||
Entries = [],
|
||||
ComputedAt = DateTimeOffset.UtcNow
|
||||
});
|
||||
|
||||
public Task RequestEntriesAsync(string peerId, ImmutableArray<string> keys, CancellationToken ct = default)
|
||||
=> Task.CompletedTask;
|
||||
|
||||
public event EventHandler<SyncMessageEventArgs>? OnSyncMessage;
|
||||
}
|
||||
|
||||
public sealed class FakeStateStore : IStateStore
|
||||
{
|
||||
private ImmutableArray<StateEntry> _entries = [];
|
||||
|
||||
public Task<ImmutableArray<StateEntry>> LoadAsync(CancellationToken ct = default)
|
||||
=> Task.FromResult(_entries);
|
||||
|
||||
public Task SaveAsync(ImmutableArray<StateEntry> entries, CancellationToken ct = default)
|
||||
{
|
||||
_entries = entries;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,367 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using StellaOps.Agent.Core.Bootstrap;
|
||||
using StellaOps.Agent.Core.Certificates;
|
||||
using StellaOps.Agent.Core.Configuration;
|
||||
using StellaOps.Agent.Core.Doctor;
|
||||
|
||||
namespace StellaOps.Agent.Core.Tests.Integration;
|
||||
|
||||
/// <summary>
|
||||
/// Integration tests for agent operations.
|
||||
/// </summary>
|
||||
public sealed class AgentOperationsIntegrationTests
|
||||
{
|
||||
[Fact]
|
||||
public async Task BootstrapFlow_GeneratesTokenAndInstaller()
|
||||
{
|
||||
// Arrange
|
||||
var tokenStore = new InMemoryBootstrapTokenStore();
|
||||
var tokenService = new BootstrapTokenService(
|
||||
tokenStore,
|
||||
TimeProvider.System);
|
||||
|
||||
var bootstrapService = new BootstrapService(
|
||||
tokenService,
|
||||
new BootstrapConfiguration
|
||||
{
|
||||
OrchestratorUrl = "https://test-orchestrator.example.com"
|
||||
});
|
||||
|
||||
// Act
|
||||
var package = await bootstrapService.BootstrapAgentAsync(new BootstrapAgentRequest
|
||||
{
|
||||
AgentName = "test-agent",
|
||||
Environment = "test",
|
||||
Capabilities = ["docker", "scripts"]
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.NotNull(package.Token);
|
||||
Assert.False(package.Token.IsConsumed);
|
||||
Assert.Equal("test-agent", package.Token.AgentName);
|
||||
Assert.Contains(Platform.Linux, package.Installers.Keys);
|
||||
Assert.Contains(Platform.Windows, package.Installers.Keys);
|
||||
Assert.Contains(Platform.Docker, package.Installers.Keys);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task BootstrapToken_CanBeConsumedOnlyOnce()
|
||||
{
|
||||
// Arrange
|
||||
var tokenStore = new InMemoryBootstrapTokenStore();
|
||||
var tokenService = new BootstrapTokenService(
|
||||
tokenStore,
|
||||
TimeProvider.System);
|
||||
|
||||
var token = await tokenService.GenerateBootstrapTokenAsync(new BootstrapTokenRequest
|
||||
{
|
||||
AgentName = "test-agent",
|
||||
Environment = "test"
|
||||
});
|
||||
|
||||
// Act - First consumption should succeed
|
||||
var result1 = await tokenService.ValidateAndConsumeAsync(token.Token);
|
||||
var result2 = await tokenService.ValidateAndConsumeAsync(token.Token);
|
||||
|
||||
// Assert
|
||||
Assert.True(result1.IsValid);
|
||||
Assert.False(result2.IsValid);
|
||||
Assert.Equal("Token already used", result2.Error);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task Configuration_ApplyAndRollback()
|
||||
{
|
||||
// Arrange
|
||||
var configStore = new InMemoryConfigurationStore();
|
||||
var applier = new MockConfigurationApplier();
|
||||
var configManager = new AgentConfigManager(
|
||||
configStore,
|
||||
applier,
|
||||
TimeProvider.System);
|
||||
|
||||
var config1 = CreateTestConfiguration(maxTasks: 5);
|
||||
var config2 = CreateTestConfiguration(maxTasks: 10);
|
||||
|
||||
// Act - Apply first config
|
||||
var result1 = await configManager.ApplyConfigurationAsync(config1);
|
||||
Assert.True(result1.IsSuccess);
|
||||
|
||||
// Apply second config
|
||||
var result2 = await configManager.ApplyConfigurationAsync(config2);
|
||||
Assert.True(result2.IsSuccess);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(10, configManager.CurrentConfiguration?.Resources.MaxConcurrentTasks);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task ConfigurationDrift_DetectsChanges()
|
||||
{
|
||||
// Arrange
|
||||
var configStore = new InMemoryConfigurationStore();
|
||||
var applier = new MockConfigurationApplier();
|
||||
var configManager = new AgentConfigManager(
|
||||
configStore,
|
||||
applier,
|
||||
TimeProvider.System);
|
||||
|
||||
var config = CreateTestConfiguration(maxTasks: 5);
|
||||
await configManager.ApplyConfigurationAsync(config);
|
||||
|
||||
// Simulate drift by changing desired config
|
||||
var driftedConfig = config with
|
||||
{
|
||||
Resources = config.Resources with { MaxConcurrentTasks = 10 }
|
||||
};
|
||||
await configStore.SaveDesiredAsync(driftedConfig);
|
||||
|
||||
await configManager.LoadAsync();
|
||||
|
||||
// Act
|
||||
var drift = await configManager.DetectDriftAsync();
|
||||
|
||||
// Assert
|
||||
Assert.True(drift.HasDrift);
|
||||
Assert.Contains(drift.Differences, d => d.Path.Contains("MaxConcurrentTasks"));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AgentDoctor_RunsAllChecks()
|
||||
{
|
||||
// Arrange
|
||||
var checks = new List<IAgentHealthCheck>
|
||||
{
|
||||
new AlwaysHealthyCheck("TestCheck1"),
|
||||
new AlwaysHealthyCheck("TestCheck2"),
|
||||
new AlwaysWarningCheck("TestCheck3")
|
||||
};
|
||||
|
||||
var doctor = new AgentDoctor(
|
||||
checks,
|
||||
TimeProvider.System);
|
||||
|
||||
// Act
|
||||
var report = await doctor.RunDiagnosticsAsync();
|
||||
|
||||
// Assert
|
||||
Assert.Equal(3, report.TotalChecks);
|
||||
Assert.Equal(2, report.PassedChecks);
|
||||
Assert.Equal(1, report.WarningChecks);
|
||||
Assert.Equal(HealthStatus.Warning, report.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task AgentDoctor_FiltersByCategory()
|
||||
{
|
||||
// Arrange
|
||||
var checks = new List<IAgentHealthCheck>
|
||||
{
|
||||
new CategoryHealthCheck("SecurityCheck", HealthCheckCategory.Security),
|
||||
new CategoryHealthCheck("NetworkCheck", HealthCheckCategory.Network),
|
||||
new CategoryHealthCheck("RuntimeCheck", HealthCheckCategory.Runtime)
|
||||
};
|
||||
|
||||
var doctor = new AgentDoctor(checks, TimeProvider.System);
|
||||
|
||||
// Act
|
||||
var report = await doctor.RunDiagnosticsAsync(new DiagnosticOptions
|
||||
{
|
||||
Categories = [HealthCheckCategory.Security]
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.Single(report.Results);
|
||||
Assert.Equal("SecurityCheck", report.Results[0].CheckName);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RemediationEngine_MatchesPatterns()
|
||||
{
|
||||
// Arrange
|
||||
var patterns = new List<IRemediationPattern>
|
||||
{
|
||||
new CertificateRemediationPattern(),
|
||||
new DockerRemediationPattern()
|
||||
};
|
||||
|
||||
var engine = new RemediationEngine(patterns);
|
||||
|
||||
var certResult = HealthCheckResult.Warn("CertificateExpiry", "Certificate expires in 5 days");
|
||||
|
||||
// Act
|
||||
var steps = engine.GetRemediationSteps(certResult);
|
||||
|
||||
// Assert
|
||||
Assert.NotEmpty(steps);
|
||||
Assert.Contains(steps, s => s.Id == "cert-renew");
|
||||
}
|
||||
|
||||
private static AgentConfiguration CreateTestConfiguration(int maxTasks = 5)
|
||||
{
|
||||
return new AgentConfiguration
|
||||
{
|
||||
Identity = new IdentityConfig
|
||||
{
|
||||
AgentId = "test-agent-id",
|
||||
Environment = "test"
|
||||
},
|
||||
Connection = new ConnectionConfig
|
||||
{
|
||||
OrchestratorUrl = "https://test.example.com"
|
||||
},
|
||||
Resources = new ResourceConfig
|
||||
{
|
||||
MaxConcurrentTasks = maxTasks
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Test doubles
|
||||
private sealed class InMemoryBootstrapTokenStore : IBootstrapTokenStore
|
||||
{
|
||||
private readonly Dictionary<string, BootstrapToken> _tokens = new();
|
||||
|
||||
public Task StoreAsync(BootstrapToken token, CancellationToken cancellationToken = default)
|
||||
{
|
||||
_tokens[token.Id] = token;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task<BootstrapToken?> GetByTokenAsync(string token, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var found = _tokens.Values.FirstOrDefault(t => t.Token == token);
|
||||
return Task.FromResult(found);
|
||||
}
|
||||
|
||||
public Task<BootstrapToken?> GetByIdAsync(string id, CancellationToken cancellationToken = default)
|
||||
{
|
||||
_tokens.TryGetValue(id, out var token);
|
||||
return Task.FromResult(token);
|
||||
}
|
||||
|
||||
public Task UpdateAsync(BootstrapToken token, CancellationToken cancellationToken = default)
|
||||
{
|
||||
_tokens[token.Id] = token;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task DeleteAsync(string id, CancellationToken cancellationToken = default)
|
||||
{
|
||||
_tokens.Remove(id);
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class InMemoryConfigurationStore : IConfigurationStore
|
||||
{
|
||||
private AgentConfiguration? _current;
|
||||
private AgentConfiguration? _desired;
|
||||
private readonly List<(int Version, AgentConfiguration Config)> _versions = [];
|
||||
|
||||
public Task<AgentConfiguration?> LoadCurrentAsync(CancellationToken cancellationToken = default) =>
|
||||
Task.FromResult(_current);
|
||||
|
||||
public Task<AgentConfiguration?> LoadDesiredAsync(CancellationToken cancellationToken = default) =>
|
||||
Task.FromResult(_desired);
|
||||
|
||||
public Task SaveCurrentAsync(AgentConfiguration config, CancellationToken cancellationToken = default)
|
||||
{
|
||||
_current = config;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task SaveDesiredAsync(AgentConfiguration config, CancellationToken cancellationToken = default)
|
||||
{
|
||||
_desired = config;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task<int> CreateVersionAsync(AgentConfiguration? config, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var version = _versions.Count + 1;
|
||||
if (config != null)
|
||||
_versions.Add((version, config));
|
||||
return Task.FromResult(version);
|
||||
}
|
||||
|
||||
public Task<AgentConfiguration?> GetVersionAsync(int version, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var found = _versions.FirstOrDefault(v => v.Version == version);
|
||||
return Task.FromResult(found.Config);
|
||||
}
|
||||
}
|
||||
|
||||
private sealed class MockConfigurationApplier : IConfigurationApplier
|
||||
{
|
||||
public Task ApplyAsync(AgentConfiguration config, CancellationToken cancellationToken = default) =>
|
||||
Task.CompletedTask;
|
||||
}
|
||||
|
||||
private sealed class AlwaysHealthyCheck(string name) : IAgentHealthCheck
|
||||
{
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Runtime;
|
||||
public string Name => name;
|
||||
public string Description => "Always healthy test check";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default) =>
|
||||
Task.FromResult(HealthCheckResult.Pass(Name, "OK"));
|
||||
}
|
||||
|
||||
private sealed class AlwaysWarningCheck(string name) : IAgentHealthCheck
|
||||
{
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Runtime;
|
||||
public string Name => name;
|
||||
public string Description => "Always warning test check";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default) =>
|
||||
Task.FromResult(HealthCheckResult.Warn(Name, "Warning"));
|
||||
}
|
||||
|
||||
private sealed class CategoryHealthCheck(string name, HealthCheckCategory category) : IAgentHealthCheck
|
||||
{
|
||||
public HealthCheckCategory Category => category;
|
||||
public string Name => name;
|
||||
public string Description => $"Test check for {category}";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default) =>
|
||||
Task.FromResult(HealthCheckResult.Pass(Name, "OK"));
|
||||
}
|
||||
|
||||
private sealed class CertificateRemediationPattern : IRemediationPattern
|
||||
{
|
||||
public bool Matches(HealthCheckResult result) =>
|
||||
result.CheckName.Contains("Certificate", StringComparison.OrdinalIgnoreCase);
|
||||
|
||||
public IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result) =>
|
||||
[
|
||||
new RemediationStep
|
||||
{
|
||||
Id = "cert-renew",
|
||||
Title = "Renew certificate",
|
||||
Description = "Renew the agent certificate",
|
||||
IsAutomated = true,
|
||||
Command = "stella agent renew-cert"
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
private sealed class DockerRemediationPattern : IRemediationPattern
|
||||
{
|
||||
public bool Matches(HealthCheckResult result) =>
|
||||
result.CheckName.Contains("Docker", StringComparison.OrdinalIgnoreCase);
|
||||
|
||||
public IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result) =>
|
||||
[
|
||||
new RemediationStep
|
||||
{
|
||||
Id = "docker-start",
|
||||
Title = "Start Docker",
|
||||
Description = "Start the Docker daemon",
|
||||
IsAutomated = true,
|
||||
Command = "systemctl start docker"
|
||||
}
|
||||
];
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,302 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Agent.Core.Bootstrap;
|
||||
|
||||
/// <summary>
|
||||
/// Service for generating zero-touch agent deployment packages.
|
||||
/// </summary>
|
||||
public sealed class BootstrapService : IBootstrapService
|
||||
{
|
||||
private readonly ILogger<BootstrapService> _logger;
|
||||
private readonly IBootstrapTokenService _tokenService;
|
||||
private readonly BootstrapOptions _options;
|
||||
|
||||
public BootstrapService(
|
||||
ILogger<BootstrapService> logger,
|
||||
IBootstrapTokenService tokenService,
|
||||
IOptions<BootstrapOptions> options)
|
||||
{
|
||||
_logger = logger;
|
||||
_tokenService = tokenService;
|
||||
_options = options.Value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates a complete bootstrap package for agent deployment.
|
||||
/// </summary>
|
||||
public async Task<BootstrapPackage> BootstrapAgentAsync(
|
||||
BootstrapRequest request,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
|
||||
// Generate bootstrap token
|
||||
var token = await _tokenService.GenerateBootstrapTokenAsync(
|
||||
new BootstrapTokenRequest
|
||||
{
|
||||
AgentName = request.AgentName,
|
||||
Environment = request.Environment,
|
||||
Capabilities = request.Capabilities,
|
||||
Labels = request.Labels,
|
||||
ClusterId = request.ClusterId
|
||||
},
|
||||
cancellationToken);
|
||||
|
||||
var platform = request.Platform ?? DetectPlatform();
|
||||
|
||||
// Generate installer command based on platform
|
||||
var (oneLiner, scriptContent) = GenerateInstaller(platform, token.Token, request);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Generated bootstrap package for {AgentName} on {Platform}",
|
||||
request.AgentName,
|
||||
platform);
|
||||
|
||||
return new BootstrapPackage
|
||||
{
|
||||
Token = token.Token,
|
||||
AgentName = request.AgentName,
|
||||
Environment = request.Environment,
|
||||
Platform = platform,
|
||||
OneLiner = oneLiner,
|
||||
InstallScript = scriptContent,
|
||||
ExpiresAt = token.ExpiresAt
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates an install script for the specified token.
|
||||
/// </summary>
|
||||
public async Task<string> GenerateInstallScriptAsync(
|
||||
string tokenValue,
|
||||
BootstrapPlatform platform,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var token = await _tokenService.ValidateTokenAsync(tokenValue, cancellationToken);
|
||||
if (token is null)
|
||||
{
|
||||
throw new InvalidOperationException("Invalid or expired bootstrap token");
|
||||
}
|
||||
|
||||
var (_, scriptContent) = GenerateInstaller(platform, tokenValue, new BootstrapRequest
|
||||
{
|
||||
AgentName = token.AgentName,
|
||||
Environment = token.Environment,
|
||||
Capabilities = token.Capabilities.ToList(),
|
||||
Labels = new Dictionary<string, string>(token.Labels)
|
||||
});
|
||||
|
||||
return scriptContent;
|
||||
}
|
||||
|
||||
private (string OneLiner, string ScriptContent) GenerateInstaller(
|
||||
BootstrapPlatform platform,
|
||||
string token,
|
||||
BootstrapRequest request)
|
||||
{
|
||||
return platform switch
|
||||
{
|
||||
BootstrapPlatform.Linux => GenerateLinuxInstaller(token, request),
|
||||
BootstrapPlatform.Windows => GenerateWindowsInstaller(token, request),
|
||||
BootstrapPlatform.Docker => GenerateDockerInstaller(token, request),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(platform))
|
||||
};
|
||||
}
|
||||
|
||||
private (string OneLiner, string ScriptContent) GenerateLinuxInstaller(
|
||||
string token,
|
||||
BootstrapRequest request)
|
||||
{
|
||||
var orchestratorUrl = _options.OrchestratorUrl;
|
||||
|
||||
var oneLiner = $"curl -fsSL {orchestratorUrl}/bootstrap/install.sh | STELLA_TOKEN={token} bash";
|
||||
|
||||
var script = new StringBuilder();
|
||||
script.AppendLine("#!/bin/bash");
|
||||
script.AppendLine("set -euo pipefail");
|
||||
script.AppendLine();
|
||||
script.AppendLine($"# Stella Agent Bootstrap Script");
|
||||
script.AppendLine($"# Agent: {request.AgentName}");
|
||||
script.AppendLine($"# Environment: {request.Environment}");
|
||||
script.AppendLine($"# Generated: {DateTimeOffset.UtcNow:O}");
|
||||
script.AppendLine();
|
||||
script.AppendLine($"STELLA_TOKEN=\"{token}\"");
|
||||
script.AppendLine($"ORCHESTRATOR_URL=\"{orchestratorUrl}\"");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Check dependencies");
|
||||
script.AppendLine("command -v curl >/dev/null 2>&1 || { echo 'curl is required'; exit 1; }");
|
||||
script.AppendLine("command -v docker >/dev/null 2>&1 || { echo 'docker is required'; exit 1; }");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Create agent directory");
|
||||
script.AppendLine("mkdir -p /opt/stella-agent");
|
||||
script.AppendLine("cd /opt/stella-agent");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Download agent binary");
|
||||
script.AppendLine($"curl -fsSL \"$ORCHESTRATOR_URL/bootstrap/download?platform=linux\" -o stella-agent");
|
||||
script.AppendLine("chmod +x stella-agent");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Bootstrap agent");
|
||||
script.AppendLine("./stella-agent bootstrap --token \"$STELLA_TOKEN\" --orchestrator \"$ORCHESTRATOR_URL\"");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Install as systemd service");
|
||||
script.AppendLine("./stella-agent install-service");
|
||||
script.AppendLine();
|
||||
script.AppendLine("echo 'Stella Agent installed successfully!'");
|
||||
script.AppendLine("systemctl status stella-agent");
|
||||
|
||||
return (oneLiner, script.ToString());
|
||||
}
|
||||
|
||||
private (string OneLiner, string ScriptContent) GenerateWindowsInstaller(
|
||||
string token,
|
||||
BootstrapRequest request)
|
||||
{
|
||||
var orchestratorUrl = _options.OrchestratorUrl;
|
||||
|
||||
var oneLiner = $"irm {orchestratorUrl}/bootstrap/install.ps1 | iex";
|
||||
|
||||
var script = new StringBuilder();
|
||||
script.AppendLine("# Stella Agent Bootstrap Script for Windows");
|
||||
script.AppendLine($"# Agent: {request.AgentName}");
|
||||
script.AppendLine($"# Environment: {request.Environment}");
|
||||
script.AppendLine($"# Generated: {DateTimeOffset.UtcNow:O}");
|
||||
script.AppendLine();
|
||||
script.AppendLine("$ErrorActionPreference = 'Stop'");
|
||||
script.AppendLine();
|
||||
script.AppendLine($"$StellaToken = '{token}'");
|
||||
script.AppendLine($"$OrchestratorUrl = '{orchestratorUrl}'");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Check for administrator privileges");
|
||||
script.AppendLine("if (-not ([Security.Principal.WindowsPrincipal][Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole]::Administrator)) {");
|
||||
script.AppendLine(" Write-Error 'This script must be run as Administrator'");
|
||||
script.AppendLine(" exit 1");
|
||||
script.AppendLine("}");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Create agent directory");
|
||||
script.AppendLine("$InstallPath = 'C:\\Program Files\\StellaAgent'");
|
||||
script.AppendLine("New-Item -ItemType Directory -Force -Path $InstallPath | Out-Null");
|
||||
script.AppendLine("Set-Location $InstallPath");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Download agent binary");
|
||||
script.AppendLine("Invoke-WebRequest -Uri \"$OrchestratorUrl/bootstrap/download?platform=windows\" -OutFile 'stella-agent.exe'");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Bootstrap agent");
|
||||
script.AppendLine(".\\stella-agent.exe bootstrap --token $StellaToken --orchestrator $OrchestratorUrl");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Install as Windows service");
|
||||
script.AppendLine(".\\stella-agent.exe install-service");
|
||||
script.AppendLine();
|
||||
script.AppendLine("Write-Host 'Stella Agent installed successfully!' -ForegroundColor Green");
|
||||
script.AppendLine("Get-Service StellaAgent");
|
||||
|
||||
return (oneLiner, script.ToString());
|
||||
}
|
||||
|
||||
private (string OneLiner, string ScriptContent) GenerateDockerInstaller(
|
||||
string token,
|
||||
BootstrapRequest request)
|
||||
{
|
||||
var orchestratorUrl = _options.OrchestratorUrl;
|
||||
var imageName = "ghcr.io/stellaops/agent:latest";
|
||||
|
||||
var oneLiner = $"docker run -d --name stella-agent -e STELLA_TOKEN={token} -e ORCHESTRATOR_URL={orchestratorUrl} -v /var/run/docker.sock:/var/run/docker.sock {imageName}";
|
||||
|
||||
var script = new StringBuilder();
|
||||
script.AppendLine("#!/bin/bash");
|
||||
script.AppendLine("set -euo pipefail");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Stella Agent Docker Deployment");
|
||||
script.AppendLine($"# Agent: {request.AgentName}");
|
||||
script.AppendLine($"# Environment: {request.Environment}");
|
||||
script.AppendLine($"# Generated: {DateTimeOffset.UtcNow:O}");
|
||||
script.AppendLine();
|
||||
script.AppendLine($"STELLA_TOKEN=\"{token}\"");
|
||||
script.AppendLine($"ORCHESTRATOR_URL=\"{orchestratorUrl}\"");
|
||||
script.AppendLine($"IMAGE=\"{imageName}\"");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Remove existing container if present");
|
||||
script.AppendLine("docker rm -f stella-agent 2>/dev/null || true");
|
||||
script.AppendLine();
|
||||
script.AppendLine("# Run agent container");
|
||||
script.AppendLine("docker run -d \\");
|
||||
script.AppendLine(" --name stella-agent \\");
|
||||
script.AppendLine(" --restart unless-stopped \\");
|
||||
script.AppendLine(" -e STELLA_TOKEN=\"$STELLA_TOKEN\" \\");
|
||||
script.AppendLine(" -e ORCHESTRATOR_URL=\"$ORCHESTRATOR_URL\" \\");
|
||||
script.AppendLine(" -v /var/run/docker.sock:/var/run/docker.sock \\");
|
||||
script.AppendLine(" -v stella-agent-data:/data \\");
|
||||
script.AppendLine(" \"$IMAGE\"");
|
||||
script.AppendLine();
|
||||
script.AppendLine("echo 'Stella Agent container started!'");
|
||||
script.AppendLine("docker ps -f name=stella-agent");
|
||||
|
||||
return (oneLiner, script.ToString());
|
||||
}
|
||||
|
||||
private static BootstrapPlatform DetectPlatform()
|
||||
{
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
return BootstrapPlatform.Windows;
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
|
||||
return BootstrapPlatform.Linux;
|
||||
return BootstrapPlatform.Docker;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for bootstrap operations.
|
||||
/// </summary>
|
||||
public interface IBootstrapService
|
||||
{
|
||||
Task<BootstrapPackage> BootstrapAgentAsync(
|
||||
BootstrapRequest request,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<string> GenerateInstallScriptAsync(
|
||||
string tokenValue,
|
||||
BootstrapPlatform platform,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to bootstrap an agent.
|
||||
/// </summary>
|
||||
public record BootstrapRequest
|
||||
{
|
||||
public required string AgentName { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public BootstrapPlatform? Platform { get; init; }
|
||||
public List<string>? Capabilities { get; init; }
|
||||
public Dictionary<string, string>? Labels { get; init; }
|
||||
public string? ClusterId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bootstrap package with all deployment artifacts.
|
||||
/// </summary>
|
||||
public record BootstrapPackage
|
||||
{
|
||||
public required string Token { get; init; }
|
||||
public required string AgentName { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public required BootstrapPlatform Platform { get; init; }
|
||||
public required string OneLiner { get; init; }
|
||||
public required string InstallScript { get; init; }
|
||||
public DateTimeOffset ExpiresAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Target platform for bootstrap.
|
||||
/// </summary>
|
||||
public enum BootstrapPlatform
|
||||
{
|
||||
Linux,
|
||||
Windows,
|
||||
Docker
|
||||
}
|
||||
@@ -0,0 +1,208 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
using System.Security.Cryptography;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Agent.Core.Configuration;
|
||||
|
||||
namespace StellaOps.Agent.Core.Bootstrap;
|
||||
|
||||
/// <summary>
|
||||
/// Service for generating and validating secure one-time bootstrap tokens.
|
||||
/// </summary>
|
||||
public sealed class BootstrapTokenService : IBootstrapTokenService
|
||||
{
|
||||
private readonly ILogger<BootstrapTokenService> _logger;
|
||||
private readonly IBootstrapTokenStore _tokenStore;
|
||||
private readonly BootstrapOptions _options;
|
||||
|
||||
public BootstrapTokenService(
|
||||
ILogger<BootstrapTokenService> logger,
|
||||
IBootstrapTokenStore tokenStore,
|
||||
IOptions<BootstrapOptions> options)
|
||||
{
|
||||
_logger = logger;
|
||||
_tokenStore = tokenStore;
|
||||
_options = options.Value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates a secure one-time bootstrap token with 15-minute expiry.
|
||||
/// </summary>
|
||||
public async Task<BootstrapToken> GenerateBootstrapTokenAsync(
|
||||
BootstrapTokenRequest request,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(request.AgentName);
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(request.Environment);
|
||||
|
||||
var tokenValue = GenerateSecureToken();
|
||||
var expiresAt = DateTimeOffset.UtcNow.Add(_options.TokenExpiry);
|
||||
|
||||
var token = new BootstrapToken
|
||||
{
|
||||
Token = tokenValue,
|
||||
AgentName = request.AgentName,
|
||||
Environment = request.Environment,
|
||||
Capabilities = request.Capabilities ?? [],
|
||||
Labels = request.Labels ?? new Dictionary<string, string>(),
|
||||
ExpiresAt = expiresAt,
|
||||
CreatedAt = DateTimeOffset.UtcNow,
|
||||
IsConsumed = false,
|
||||
ClusterId = request.ClusterId
|
||||
};
|
||||
|
||||
await _tokenStore.StoreTokenAsync(token, cancellationToken);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Generated bootstrap token for agent {AgentName} in environment {Environment}, expires at {ExpiresAt}",
|
||||
request.AgentName,
|
||||
request.Environment,
|
||||
expiresAt);
|
||||
|
||||
return token;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates a bootstrap token. Returns null if invalid or expired.
|
||||
/// </summary>
|
||||
public async Task<BootstrapToken?> ValidateTokenAsync(
|
||||
string tokenValue,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tokenValue);
|
||||
|
||||
var token = await _tokenStore.GetTokenAsync(tokenValue, cancellationToken);
|
||||
|
||||
if (token is null)
|
||||
{
|
||||
_logger.LogWarning("Bootstrap token not found: {TokenPrefix}...", tokenValue[..8]);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (token.IsConsumed)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Bootstrap token already consumed for agent {AgentName}",
|
||||
token.AgentName);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (token.ExpiresAt < DateTimeOffset.UtcNow)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Bootstrap token expired for agent {AgentName}, expired at {ExpiresAt}",
|
||||
token.AgentName,
|
||||
token.ExpiresAt);
|
||||
return null;
|
||||
}
|
||||
|
||||
return token;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Consumes a token, marking it as used (one-time use).
|
||||
/// </summary>
|
||||
public async Task<bool> ConsumeTokenAsync(
|
||||
string tokenValue,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrWhiteSpace(tokenValue);
|
||||
|
||||
var token = await ValidateTokenAsync(tokenValue, cancellationToken);
|
||||
if (token is null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
token.IsConsumed = true;
|
||||
token.ConsumedAt = DateTimeOffset.UtcNow;
|
||||
|
||||
await _tokenStore.UpdateTokenAsync(token, cancellationToken);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Bootstrap token consumed for agent {AgentName}",
|
||||
token.AgentName);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private static string GenerateSecureToken()
|
||||
{
|
||||
// Generate a 256-bit (32 byte) token
|
||||
var bytes = RandomNumberGenerator.GetBytes(32);
|
||||
return Convert.ToBase64String(bytes)
|
||||
.Replace("+", "-")
|
||||
.Replace("/", "_")
|
||||
.TrimEnd('=');
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for bootstrap token operations.
|
||||
/// </summary>
|
||||
public interface IBootstrapTokenService
|
||||
{
|
||||
Task<BootstrapToken> GenerateBootstrapTokenAsync(
|
||||
BootstrapTokenRequest request,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<BootstrapToken?> ValidateTokenAsync(
|
||||
string tokenValue,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<bool> ConsumeTokenAsync(
|
||||
string tokenValue,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to generate a bootstrap token.
|
||||
/// </summary>
|
||||
public record BootstrapTokenRequest
|
||||
{
|
||||
public required string AgentName { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public IReadOnlyList<string>? Capabilities { get; init; }
|
||||
public IReadOnlyDictionary<string, string>? Labels { get; init; }
|
||||
public string? ClusterId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A bootstrap token with metadata.
|
||||
/// </summary>
|
||||
public record BootstrapToken
|
||||
{
|
||||
public required string Token { get; init; }
|
||||
public required string AgentName { get; init; }
|
||||
public required string Environment { get; init; }
|
||||
public IReadOnlyList<string> Capabilities { get; init; } = [];
|
||||
public IReadOnlyDictionary<string, string> Labels { get; init; } = new Dictionary<string, string>();
|
||||
public DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset ExpiresAt { get; init; }
|
||||
public bool IsConsumed { get; set; }
|
||||
public DateTimeOffset? ConsumedAt { get; set; }
|
||||
public string? ClusterId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for bootstrap token persistence.
|
||||
/// </summary>
|
||||
public interface IBootstrapTokenStore
|
||||
{
|
||||
Task StoreTokenAsync(BootstrapToken token, CancellationToken cancellationToken = default);
|
||||
Task<BootstrapToken?> GetTokenAsync(string tokenValue, CancellationToken cancellationToken = default);
|
||||
Task UpdateTokenAsync(BootstrapToken token, CancellationToken cancellationToken = default);
|
||||
Task CleanupExpiredTokensAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Bootstrap configuration options.
|
||||
/// </summary>
|
||||
public class BootstrapOptions
|
||||
{
|
||||
public TimeSpan TokenExpiry { get; set; } = TimeSpan.FromMinutes(15);
|
||||
public string OrchestratorUrl { get; set; } = string.Empty;
|
||||
}
|
||||
@@ -0,0 +1,288 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
using System.Security.Cryptography;
|
||||
using System.Security.Cryptography.X509Certificates;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Agent.Core.Certificates;
|
||||
|
||||
/// <summary>
|
||||
/// Manages agent certificate lifecycle including provisioning and renewal.
|
||||
/// </summary>
|
||||
public sealed class AgentCertificateManager : BackgroundService, IAgentCertificateManager
|
||||
{
|
||||
private readonly ILogger<AgentCertificateManager> _logger;
|
||||
private readonly ICertificateStore _certificateStore;
|
||||
private readonly ICertificateProvider _certificateProvider;
|
||||
private readonly CertificateOptions _options;
|
||||
private X509Certificate2? _currentCertificate;
|
||||
|
||||
public AgentCertificateManager(
|
||||
ILogger<AgentCertificateManager> logger,
|
||||
ICertificateStore certificateStore,
|
||||
ICertificateProvider certificateProvider,
|
||||
IOptions<CertificateOptions> options)
|
||||
{
|
||||
_logger = logger;
|
||||
_certificateStore = certificateStore;
|
||||
_certificateProvider = certificateProvider;
|
||||
_options = options.Value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current agent certificate.
|
||||
/// </summary>
|
||||
public X509Certificate2? CurrentCertificate => _currentCertificate;
|
||||
|
||||
/// <summary>
|
||||
/// Ensures a valid certificate is available, provisioning or renewing as needed.
|
||||
/// </summary>
|
||||
public async Task<X509Certificate2> EnsureCertificateAsync(
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
// Try to load existing certificate
|
||||
var existingCert = await _certificateStore.LoadCertificateAsync(cancellationToken);
|
||||
|
||||
if (existingCert is not null)
|
||||
{
|
||||
if (IsValidAndNotNearExpiry(existingCert))
|
||||
{
|
||||
_currentCertificate = existingCert;
|
||||
_logger.LogDebug("Using existing certificate, expires {ExpiresAt}", existingCert.NotAfter);
|
||||
return existingCert;
|
||||
}
|
||||
|
||||
if (existingCert.NotAfter > DateTimeOffset.UtcNow)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Certificate nearing expiry ({ExpiresAt}), triggering renewal",
|
||||
existingCert.NotAfter);
|
||||
}
|
||||
}
|
||||
|
||||
// Provision or renew certificate
|
||||
var newCert = await ProvisionCertificateAsync(cancellationToken);
|
||||
_currentCertificate = newCert;
|
||||
return newCert;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Forces certificate renewal regardless of expiry status.
|
||||
/// </summary>
|
||||
public async Task<X509Certificate2> RenewCertificateAsync(
|
||||
bool force = false,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
_logger.LogInformation("Certificate renewal requested (force={Force})", force);
|
||||
|
||||
if (!force && _currentCertificate is not null && IsValidAndNotNearExpiry(_currentCertificate))
|
||||
{
|
||||
_logger.LogDebug("Certificate is valid and not near expiry, skipping renewal");
|
||||
return _currentCertificate;
|
||||
}
|
||||
|
||||
var newCert = await ProvisionCertificateAsync(cancellationToken);
|
||||
_currentCertificate = newCert;
|
||||
|
||||
_logger.LogInformation("Certificate renewed successfully, expires {ExpiresAt}", newCert.NotAfter);
|
||||
return newCert;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets certificate status information.
|
||||
/// </summary>
|
||||
public CertificateStatus GetCertificateStatus()
|
||||
{
|
||||
if (_currentCertificate is null)
|
||||
{
|
||||
return new CertificateStatus
|
||||
{
|
||||
HasCertificate = false,
|
||||
Message = "No certificate loaded"
|
||||
};
|
||||
}
|
||||
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
var expiresAt = _currentCertificate.NotAfter;
|
||||
var remainingDays = (expiresAt - now).TotalDays;
|
||||
|
||||
return new CertificateStatus
|
||||
{
|
||||
HasCertificate = true,
|
||||
Subject = _currentCertificate.Subject,
|
||||
Issuer = _currentCertificate.Issuer,
|
||||
Thumbprint = _currentCertificate.Thumbprint,
|
||||
NotBefore = _currentCertificate.NotBefore,
|
||||
NotAfter = expiresAt,
|
||||
IsExpired = expiresAt < now,
|
||||
IsNearExpiry = remainingDays <= _options.RenewalThresholdDays,
|
||||
RemainingDays = (int)remainingDays,
|
||||
Message = GetStatusMessage(expiresAt, remainingDays)
|
||||
};
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Certificate renewal monitor started");
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await EnsureCertificateAsync(stoppingToken);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Certificate renewal check failed");
|
||||
}
|
||||
|
||||
await Task.Delay(_options.RenewalCheckInterval, stoppingToken);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<X509Certificate2> ProvisionCertificateAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
// Generate CSR
|
||||
var (privateKey, csr) = GenerateCsr();
|
||||
|
||||
// Submit CSR to certificate provider
|
||||
var certificatePem = await _certificateProvider.SubmitCsrAsync(csr, cancellationToken);
|
||||
|
||||
// Combine certificate with private key
|
||||
var certificate = CreateCertificateWithPrivateKey(certificatePem, privateKey);
|
||||
|
||||
// Store certificate
|
||||
await _certificateStore.StoreCertificateAsync(certificate, cancellationToken);
|
||||
|
||||
return certificate;
|
||||
}
|
||||
|
||||
private (RSA PrivateKey, byte[] Csr) GenerateCsr()
|
||||
{
|
||||
var privateKey = RSA.Create(4096);
|
||||
|
||||
var request = new CertificateRequest(
|
||||
$"CN={_options.AgentName}, O=StellaOps Agent",
|
||||
privateKey,
|
||||
HashAlgorithmName.SHA256,
|
||||
RSASignaturePadding.Pkcs1);
|
||||
|
||||
// Add key usage extension
|
||||
request.CertificateExtensions.Add(
|
||||
new X509KeyUsageExtension(
|
||||
X509KeyUsageFlags.DigitalSignature | X509KeyUsageFlags.KeyEncipherment,
|
||||
critical: true));
|
||||
|
||||
// Add enhanced key usage (client authentication)
|
||||
request.CertificateExtensions.Add(
|
||||
new X509EnhancedKeyUsageExtension(
|
||||
new OidCollection { new Oid("1.3.6.1.5.5.7.3.2") }, // Client Authentication
|
||||
critical: true));
|
||||
|
||||
var csr = request.CreateSigningRequest();
|
||||
|
||||
return (privateKey, csr);
|
||||
}
|
||||
|
||||
private static X509Certificate2 CreateCertificateWithPrivateKey(string certificatePem, RSA privateKey)
|
||||
{
|
||||
var certificate = X509Certificate2.CreateFromPem(certificatePem);
|
||||
return certificate.CopyWithPrivateKey(privateKey);
|
||||
}
|
||||
|
||||
private bool IsValidAndNotNearExpiry(X509Certificate2 certificate)
|
||||
{
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
|
||||
if (certificate.NotBefore > now || certificate.NotAfter < now)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
var remainingDays = (certificate.NotAfter - now).TotalDays;
|
||||
return remainingDays > _options.RenewalThresholdDays;
|
||||
}
|
||||
|
||||
private string GetStatusMessage(DateTimeOffset expiresAt, double remainingDays)
|
||||
{
|
||||
if (expiresAt < DateTimeOffset.UtcNow)
|
||||
return "Certificate has expired";
|
||||
if (remainingDays <= _options.RenewalThresholdDays)
|
||||
return $"Certificate expires in {remainingDays:N0} days - renewal recommended";
|
||||
return $"Certificate valid for {remainingDays:N0} more days";
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for certificate management operations.
|
||||
/// </summary>
|
||||
public interface IAgentCertificateManager
|
||||
{
|
||||
X509Certificate2? CurrentCertificate { get; }
|
||||
Task<X509Certificate2> EnsureCertificateAsync(CancellationToken cancellationToken = default);
|
||||
Task<X509Certificate2> RenewCertificateAsync(bool force = false, CancellationToken cancellationToken = default);
|
||||
CertificateStatus GetCertificateStatus();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for certificate storage.
|
||||
/// </summary>
|
||||
public interface ICertificateStore
|
||||
{
|
||||
Task<X509Certificate2?> LoadCertificateAsync(CancellationToken cancellationToken = default);
|
||||
Task StoreCertificateAsync(X509Certificate2 certificate, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for certificate provisioning.
|
||||
/// </summary>
|
||||
public interface ICertificateProvider
|
||||
{
|
||||
Task<string> SubmitCsrAsync(byte[] csr, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Certificate status information.
|
||||
/// </summary>
|
||||
public record CertificateStatus
|
||||
{
|
||||
public bool HasCertificate { get; init; }
|
||||
public string? Subject { get; init; }
|
||||
public string? Issuer { get; init; }
|
||||
public string? Thumbprint { get; init; }
|
||||
public DateTimeOffset NotBefore { get; init; }
|
||||
public DateTimeOffset NotAfter { get; init; }
|
||||
public bool IsExpired { get; init; }
|
||||
public bool IsNearExpiry { get; init; }
|
||||
public int RemainingDays { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Certificate configuration options.
|
||||
/// </summary>
|
||||
public class CertificateOptions
|
||||
{
|
||||
public string AgentName { get; set; } = "stella-agent";
|
||||
public CertificateSource Source { get; set; } = CertificateSource.AutoProvision;
|
||||
public string? CertificatePath { get; set; }
|
||||
public string? KeyPath { get; set; }
|
||||
public string? VaultPath { get; set; }
|
||||
public string? AcmeServer { get; set; }
|
||||
public int RenewalThresholdDays { get; set; } = 7;
|
||||
public TimeSpan RenewalCheckInterval { get; set; } = TimeSpan.FromHours(6);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Certificate source type.
|
||||
/// </summary>
|
||||
public enum CertificateSource
|
||||
{
|
||||
AutoProvision,
|
||||
File,
|
||||
Vault,
|
||||
ACME
|
||||
}
|
||||
@@ -0,0 +1,397 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Configuration;
|
||||
|
||||
/// <summary>
|
||||
/// Manages agent configuration with drift detection and rollback support.
|
||||
/// </summary>
|
||||
public sealed class AgentConfigManager : IAgentConfigManager
|
||||
{
|
||||
private readonly ILogger<AgentConfigManager> _logger;
|
||||
private readonly IConfigurationPersistence _persistence;
|
||||
private AgentConfiguration? _currentConfig;
|
||||
private readonly List<ConfigurationVersion> _versionHistory = new();
|
||||
|
||||
public AgentConfigManager(
|
||||
ILogger<AgentConfigManager> logger,
|
||||
IConfigurationPersistence persistence)
|
||||
{
|
||||
_logger = logger;
|
||||
_persistence = persistence;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current configuration.
|
||||
/// </summary>
|
||||
public AgentConfiguration? CurrentConfiguration => _currentConfig;
|
||||
|
||||
/// <summary>
|
||||
/// Applies a new configuration with validation and rollback capability.
|
||||
/// </summary>
|
||||
public async Task<ConfigurationApplyResult> ApplyConfigurationAsync(
|
||||
AgentConfiguration newConfig,
|
||||
bool dryRun = false,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(newConfig);
|
||||
|
||||
// Validate configuration
|
||||
var validationErrors = newConfig.Validate();
|
||||
if (validationErrors.Count > 0)
|
||||
{
|
||||
return new ConfigurationApplyResult
|
||||
{
|
||||
Success = false,
|
||||
Errors = validationErrors,
|
||||
Message = "Configuration validation failed"
|
||||
};
|
||||
}
|
||||
|
||||
// Compute diff
|
||||
var diff = ComputeDiff(_currentConfig, newConfig);
|
||||
|
||||
if (dryRun)
|
||||
{
|
||||
return new ConfigurationApplyResult
|
||||
{
|
||||
Success = true,
|
||||
DryRun = true,
|
||||
Changes = diff,
|
||||
Message = "Dry run completed - no changes applied"
|
||||
};
|
||||
}
|
||||
|
||||
// Create rollback point
|
||||
var previousConfig = _currentConfig;
|
||||
var versionNumber = _versionHistory.Count + 1;
|
||||
|
||||
try
|
||||
{
|
||||
// Apply configuration
|
||||
_currentConfig = newConfig;
|
||||
|
||||
// Persist configuration
|
||||
await _persistence.SaveAsync(newConfig, cancellationToken);
|
||||
|
||||
// Record version
|
||||
_versionHistory.Add(new ConfigurationVersion
|
||||
{
|
||||
Version = versionNumber,
|
||||
Configuration = newConfig,
|
||||
AppliedAt = DateTimeOffset.UtcNow
|
||||
});
|
||||
|
||||
_logger.LogInformation(
|
||||
"Configuration v{Version} applied successfully with {ChangeCount} changes",
|
||||
versionNumber,
|
||||
diff.Count);
|
||||
|
||||
return new ConfigurationApplyResult
|
||||
{
|
||||
Success = true,
|
||||
Changes = diff,
|
||||
Version = versionNumber,
|
||||
Message = $"Configuration v{versionNumber} applied successfully"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Rollback on failure
|
||||
_currentConfig = previousConfig;
|
||||
|
||||
_logger.LogError(ex, "Configuration apply failed, rolled back to previous version");
|
||||
|
||||
return new ConfigurationApplyResult
|
||||
{
|
||||
Success = false,
|
||||
Errors = [ex.Message],
|
||||
RolledBack = true,
|
||||
Message = "Configuration apply failed, rolled back to previous version"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Detects drift between desired and actual configuration.
|
||||
/// </summary>
|
||||
public async Task<ConfigurationDriftResult> DetectDriftAsync(
|
||||
AgentConfiguration desiredConfig,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(desiredConfig);
|
||||
|
||||
// Load actual configuration
|
||||
var actualConfig = await _persistence.LoadAsync(cancellationToken);
|
||||
|
||||
if (actualConfig is null)
|
||||
{
|
||||
return new ConfigurationDriftResult
|
||||
{
|
||||
HasDrift = true,
|
||||
DriftType = DriftType.Missing,
|
||||
Differences = [],
|
||||
Message = "No configuration found on disk"
|
||||
};
|
||||
}
|
||||
|
||||
var differences = ComputeDiff(actualConfig, desiredConfig);
|
||||
|
||||
if (differences.Count == 0)
|
||||
{
|
||||
return new ConfigurationDriftResult
|
||||
{
|
||||
HasDrift = false,
|
||||
DriftType = DriftType.None,
|
||||
Differences = [],
|
||||
Message = "Configuration is in sync"
|
||||
};
|
||||
}
|
||||
|
||||
return new ConfigurationDriftResult
|
||||
{
|
||||
HasDrift = true,
|
||||
DriftType = DriftType.Modified,
|
||||
Differences = differences,
|
||||
Message = $"Found {differences.Count} configuration differences"
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rolls back to a previous configuration version.
|
||||
/// </summary>
|
||||
public async Task<ConfigurationApplyResult> RollbackAsync(
|
||||
int? targetVersion = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
if (_versionHistory.Count == 0)
|
||||
{
|
||||
return new ConfigurationApplyResult
|
||||
{
|
||||
Success = false,
|
||||
Errors = ["No previous configuration versions available"],
|
||||
Message = "Rollback failed - no history available"
|
||||
};
|
||||
}
|
||||
|
||||
var version = targetVersion ?? _versionHistory.Count - 1;
|
||||
|
||||
if (version < 1 || version > _versionHistory.Count)
|
||||
{
|
||||
return new ConfigurationApplyResult
|
||||
{
|
||||
Success = false,
|
||||
Errors = [$"Invalid version {version}. Available versions: 1-{_versionHistory.Count}"],
|
||||
Message = "Rollback failed - invalid version"
|
||||
};
|
||||
}
|
||||
|
||||
var targetConfig = _versionHistory[version - 1].Configuration;
|
||||
|
||||
_logger.LogInformation("Rolling back to configuration v{Version}", version);
|
||||
|
||||
return await ApplyConfigurationAsync(targetConfig, dryRun: false, cancellationToken);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Loads configuration from persistence.
|
||||
/// </summary>
|
||||
public async Task LoadAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
_currentConfig = await _persistence.LoadAsync(cancellationToken);
|
||||
|
||||
if (_currentConfig is not null)
|
||||
{
|
||||
_logger.LogInformation("Loaded configuration for agent {AgentName}",
|
||||
_currentConfig.Identity.Name);
|
||||
}
|
||||
}
|
||||
|
||||
private static List<ConfigurationChange> ComputeDiff(
|
||||
AgentConfiguration? current,
|
||||
AgentConfiguration desired)
|
||||
{
|
||||
var changes = new List<ConfigurationChange>();
|
||||
|
||||
if (current is null)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "",
|
||||
ChangeType = ChangeType.Added,
|
||||
NewValue = "entire configuration"
|
||||
});
|
||||
return changes;
|
||||
}
|
||||
|
||||
// Compare identity
|
||||
if (current.Identity.Name != desired.Identity.Name)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "identity.name",
|
||||
ChangeType = ChangeType.Modified,
|
||||
OldValue = current.Identity.Name,
|
||||
NewValue = desired.Identity.Name
|
||||
});
|
||||
}
|
||||
|
||||
if (current.Identity.Environment != desired.Identity.Environment)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "identity.environment",
|
||||
ChangeType = ChangeType.Modified,
|
||||
OldValue = current.Identity.Environment,
|
||||
NewValue = desired.Identity.Environment
|
||||
});
|
||||
}
|
||||
|
||||
// Compare connection
|
||||
if (current.Connection.OrchestratorUrl != desired.Connection.OrchestratorUrl)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "connection.orchestratorUrl",
|
||||
ChangeType = ChangeType.Modified,
|
||||
OldValue = current.Connection.OrchestratorUrl,
|
||||
NewValue = desired.Connection.OrchestratorUrl
|
||||
});
|
||||
}
|
||||
|
||||
if (current.Connection.HeartbeatIntervalSeconds != desired.Connection.HeartbeatIntervalSeconds)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "connection.heartbeatIntervalSeconds",
|
||||
ChangeType = ChangeType.Modified,
|
||||
OldValue = current.Connection.HeartbeatIntervalSeconds.ToString(),
|
||||
NewValue = desired.Connection.HeartbeatIntervalSeconds.ToString()
|
||||
});
|
||||
}
|
||||
|
||||
// Compare resources
|
||||
if (current.Resources.MaxConcurrentTasks != desired.Resources.MaxConcurrentTasks)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "resources.maxConcurrentTasks",
|
||||
ChangeType = ChangeType.Modified,
|
||||
OldValue = current.Resources.MaxConcurrentTasks.ToString(),
|
||||
NewValue = desired.Resources.MaxConcurrentTasks.ToString()
|
||||
});
|
||||
}
|
||||
|
||||
// Compare auto-update
|
||||
var currentAutoUpdate = current.AutoUpdate?.Enabled ?? false;
|
||||
var desiredAutoUpdate = desired.AutoUpdate?.Enabled ?? false;
|
||||
if (currentAutoUpdate != desiredAutoUpdate)
|
||||
{
|
||||
changes.Add(new ConfigurationChange
|
||||
{
|
||||
Path = "autoUpdate.enabled",
|
||||
ChangeType = ChangeType.Modified,
|
||||
OldValue = currentAutoUpdate.ToString(),
|
||||
NewValue = desiredAutoUpdate.ToString()
|
||||
});
|
||||
}
|
||||
|
||||
return changes;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for configuration management operations.
|
||||
/// </summary>
|
||||
public interface IAgentConfigManager
|
||||
{
|
||||
AgentConfiguration? CurrentConfiguration { get; }
|
||||
Task<ConfigurationApplyResult> ApplyConfigurationAsync(
|
||||
AgentConfiguration newConfig,
|
||||
bool dryRun = false,
|
||||
CancellationToken cancellationToken = default);
|
||||
Task<ConfigurationDriftResult> DetectDriftAsync(
|
||||
AgentConfiguration desiredConfig,
|
||||
CancellationToken cancellationToken = default);
|
||||
Task<ConfigurationApplyResult> RollbackAsync(
|
||||
int? targetVersion = null,
|
||||
CancellationToken cancellationToken = default);
|
||||
Task LoadAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for configuration persistence.
|
||||
/// </summary>
|
||||
public interface IConfigurationPersistence
|
||||
{
|
||||
Task SaveAsync(AgentConfiguration config, CancellationToken cancellationToken = default);
|
||||
Task<AgentConfiguration?> LoadAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of configuration apply operation.
|
||||
/// </summary>
|
||||
public record ConfigurationApplyResult
|
||||
{
|
||||
public bool Success { get; init; }
|
||||
public bool DryRun { get; init; }
|
||||
public bool RolledBack { get; init; }
|
||||
public int Version { get; init; }
|
||||
public IReadOnlyList<ConfigurationChange> Changes { get; init; } = [];
|
||||
public IReadOnlyList<string> Errors { get; init; } = [];
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of drift detection.
|
||||
/// </summary>
|
||||
public record ConfigurationDriftResult
|
||||
{
|
||||
public bool HasDrift { get; init; }
|
||||
public DriftType DriftType { get; init; }
|
||||
public IReadOnlyList<ConfigurationChange> Differences { get; init; } = [];
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A single configuration change.
|
||||
/// </summary>
|
||||
public record ConfigurationChange
|
||||
{
|
||||
public required string Path { get; init; }
|
||||
public ChangeType ChangeType { get; init; }
|
||||
public string? OldValue { get; init; }
|
||||
public string? NewValue { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Type of drift detected.
|
||||
/// </summary>
|
||||
public enum DriftType
|
||||
{
|
||||
None,
|
||||
Missing,
|
||||
Modified
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Type of configuration change.
|
||||
/// </summary>
|
||||
public enum ChangeType
|
||||
{
|
||||
Added,
|
||||
Modified,
|
||||
Removed
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A versioned configuration snapshot.
|
||||
/// </summary>
|
||||
public record ConfigurationVersion
|
||||
{
|
||||
public int Version { get; init; }
|
||||
public required AgentConfiguration Configuration { get; init; }
|
||||
public DateTimeOffset AppliedAt { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,402 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using YamlDotNet.Serialization;
|
||||
using YamlDotNet.Serialization.NamingConventions;
|
||||
|
||||
namespace StellaOps.Agent.Core.Configuration;
|
||||
|
||||
/// <summary>
|
||||
/// Declarative agent configuration model.
|
||||
/// </summary>
|
||||
public record AgentConfiguration
|
||||
{
|
||||
/// <summary>
|
||||
/// Configuration schema version.
|
||||
/// </summary>
|
||||
[JsonPropertyName("version")]
|
||||
public string Version { get; init; } = "1.0";
|
||||
|
||||
/// <summary>
|
||||
/// Agent identity configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("identity")]
|
||||
public required IdentityConfig Identity { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Connection configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("connection")]
|
||||
public required ConnectionConfig Connection { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Agent capabilities.
|
||||
/// </summary>
|
||||
[JsonPropertyName("capabilities")]
|
||||
public CapabilitiesConfig Capabilities { get; init; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Resource limits and quotas.
|
||||
/// </summary>
|
||||
[JsonPropertyName("resources")]
|
||||
public ResourceConfig Resources { get; init; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Security configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("security")]
|
||||
public SecurityConfig Security { get; init; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Observability configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("observability")]
|
||||
public ObservabilityConfig Observability { get; init; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Optional clustering configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("cluster")]
|
||||
public ClusterConfig? Cluster { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional auto-update configuration.
|
||||
/// </summary>
|
||||
[JsonPropertyName("autoUpdate")]
|
||||
public AutoUpdateConfig? AutoUpdate { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Custom labels for agent organization.
|
||||
/// </summary>
|
||||
[JsonPropertyName("labels")]
|
||||
public Dictionary<string, string> Labels { get; init; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Validates the configuration and returns validation errors.
|
||||
/// </summary>
|
||||
public IReadOnlyList<string> Validate()
|
||||
{
|
||||
var errors = new List<string>();
|
||||
|
||||
if (string.IsNullOrWhiteSpace(Identity.Name))
|
||||
errors.Add("identity.name is required");
|
||||
|
||||
if (string.IsNullOrWhiteSpace(Identity.Environment))
|
||||
errors.Add("identity.environment is required");
|
||||
|
||||
if (string.IsNullOrWhiteSpace(Connection.OrchestratorUrl))
|
||||
errors.Add("connection.orchestratorUrl is required");
|
||||
|
||||
if (Resources.MaxConcurrentTasks < 1)
|
||||
errors.Add("resources.maxConcurrentTasks must be at least 1");
|
||||
|
||||
if (Resources.MemoryLimitMb < 128)
|
||||
errors.Add("resources.memoryLimitMb must be at least 128");
|
||||
|
||||
return errors;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serializes configuration to YAML.
|
||||
/// </summary>
|
||||
public string ToYaml()
|
||||
{
|
||||
var serializer = new SerializerBuilder()
|
||||
.WithNamingConvention(CamelCaseNamingConvention.Instance)
|
||||
.Build();
|
||||
return serializer.Serialize(this);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serializes configuration to JSON.
|
||||
/// </summary>
|
||||
public string ToJson()
|
||||
{
|
||||
return JsonSerializer.Serialize(this, new JsonSerializerOptions
|
||||
{
|
||||
WriteIndented = true,
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deserializes configuration from YAML.
|
||||
/// </summary>
|
||||
public static AgentConfiguration FromYaml(string yaml)
|
||||
{
|
||||
var deserializer = new DeserializerBuilder()
|
||||
.WithNamingConvention(CamelCaseNamingConvention.Instance)
|
||||
.Build();
|
||||
return deserializer.Deserialize<AgentConfiguration>(yaml);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deserializes configuration from JSON.
|
||||
/// </summary>
|
||||
public static AgentConfiguration FromJson(string json)
|
||||
{
|
||||
return JsonSerializer.Deserialize<AgentConfiguration>(json, new JsonSerializerOptions
|
||||
{
|
||||
PropertyNameCaseInsensitive = true
|
||||
}) ?? throw new InvalidOperationException("Failed to deserialize configuration");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent identity configuration.
|
||||
/// </summary>
|
||||
public record IdentityConfig
|
||||
{
|
||||
[JsonPropertyName("name")]
|
||||
public required string Name { get; init; }
|
||||
|
||||
[JsonPropertyName("environment")]
|
||||
public required string Environment { get; init; }
|
||||
|
||||
[JsonPropertyName("region")]
|
||||
public string? Region { get; init; }
|
||||
|
||||
[JsonPropertyName("datacenter")]
|
||||
public string? Datacenter { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Connection configuration.
|
||||
/// </summary>
|
||||
public record ConnectionConfig
|
||||
{
|
||||
[JsonPropertyName("orchestratorUrl")]
|
||||
public required string OrchestratorUrl { get; init; }
|
||||
|
||||
[JsonPropertyName("heartbeatIntervalSeconds")]
|
||||
public int HeartbeatIntervalSeconds { get; init; } = 30;
|
||||
|
||||
[JsonPropertyName("reconnectDelaySeconds")]
|
||||
public int ReconnectDelaySeconds { get; init; } = 5;
|
||||
|
||||
[JsonPropertyName("maxReconnectAttempts")]
|
||||
public int MaxReconnectAttempts { get; init; } = 10;
|
||||
|
||||
[JsonPropertyName("enableCompression")]
|
||||
public bool EnableCompression { get; init; } = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent capabilities configuration.
|
||||
/// </summary>
|
||||
public record CapabilitiesConfig
|
||||
{
|
||||
[JsonPropertyName("docker")]
|
||||
public bool Docker { get; init; } = true;
|
||||
|
||||
[JsonPropertyName("scripts")]
|
||||
public bool Scripts { get; init; } = true;
|
||||
|
||||
[JsonPropertyName("fileOperations")]
|
||||
public bool FileOperations { get; init; } = true;
|
||||
|
||||
[JsonPropertyName("networkOperations")]
|
||||
public bool NetworkOperations { get; init; } = true;
|
||||
|
||||
[JsonPropertyName("healthChecks")]
|
||||
public bool HealthChecks { get; init; } = true;
|
||||
|
||||
[JsonPropertyName("customCapabilities")]
|
||||
public List<string> CustomCapabilities { get; init; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resource limits configuration.
|
||||
/// </summary>
|
||||
public record ResourceConfig
|
||||
{
|
||||
[JsonPropertyName("maxConcurrentTasks")]
|
||||
public int MaxConcurrentTasks { get; init; } = 5;
|
||||
|
||||
[JsonPropertyName("memoryLimitMb")]
|
||||
public int MemoryLimitMb { get; init; } = 2048;
|
||||
|
||||
[JsonPropertyName("diskSpaceMinMb")]
|
||||
public int DiskSpaceMinMb { get; init; } = 1024;
|
||||
|
||||
[JsonPropertyName("cpuThrottlePercent")]
|
||||
public int CpuThrottlePercent { get; init; } = 80;
|
||||
|
||||
[JsonPropertyName("taskTimeoutMinutes")]
|
||||
public int TaskTimeoutMinutes { get; init; } = 30;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Security configuration.
|
||||
/// </summary>
|
||||
public record SecurityConfig
|
||||
{
|
||||
[JsonPropertyName("certificate")]
|
||||
public CertificateConfig Certificate { get; init; } = new();
|
||||
|
||||
[JsonPropertyName("allowedNetworks")]
|
||||
public List<string> AllowedNetworks { get; init; } = new();
|
||||
|
||||
[JsonPropertyName("blockedCommands")]
|
||||
public List<string> BlockedCommands { get; init; } = new();
|
||||
|
||||
[JsonPropertyName("secureMode")]
|
||||
public bool SecureMode { get; init; } = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Certificate configuration.
|
||||
/// </summary>
|
||||
public record CertificateConfig
|
||||
{
|
||||
[JsonPropertyName("source")]
|
||||
[JsonConverter(typeof(JsonStringEnumConverter))]
|
||||
public CertificateSourceType Source { get; init; } = CertificateSourceType.AutoProvision;
|
||||
|
||||
[JsonPropertyName("path")]
|
||||
public string? Path { get; init; }
|
||||
|
||||
[JsonPropertyName("keyPath")]
|
||||
public string? KeyPath { get; init; }
|
||||
|
||||
[JsonPropertyName("vaultPath")]
|
||||
public string? VaultPath { get; init; }
|
||||
|
||||
[JsonPropertyName("acmeServer")]
|
||||
public string? AcmeServer { get; init; }
|
||||
|
||||
[JsonPropertyName("renewalThresholdDays")]
|
||||
public int RenewalThresholdDays { get; init; } = 7;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Certificate source type.
|
||||
/// </summary>
|
||||
public enum CertificateSourceType
|
||||
{
|
||||
AutoProvision,
|
||||
File,
|
||||
Vault,
|
||||
ACME
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Observability configuration.
|
||||
/// </summary>
|
||||
public record ObservabilityConfig
|
||||
{
|
||||
[JsonPropertyName("logsPath")]
|
||||
public string LogsPath { get; init; } = "/var/log/stella-agent";
|
||||
|
||||
[JsonPropertyName("logLevel")]
|
||||
public string LogLevel { get; init; } = "Information";
|
||||
|
||||
[JsonPropertyName("metricsEnabled")]
|
||||
public bool MetricsEnabled { get; init; } = true;
|
||||
|
||||
[JsonPropertyName("metricsPort")]
|
||||
public int MetricsPort { get; init; } = 9100;
|
||||
|
||||
[JsonPropertyName("tracingEnabled")]
|
||||
public bool TracingEnabled { get; init; } = false;
|
||||
|
||||
[JsonPropertyName("otlpEndpoint")]
|
||||
public string? OtlpEndpoint { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cluster configuration.
|
||||
/// </summary>
|
||||
public record ClusterConfig
|
||||
{
|
||||
[JsonPropertyName("enabled")]
|
||||
public bool Enabled { get; init; } = false;
|
||||
|
||||
[JsonPropertyName("clusterId")]
|
||||
public string? ClusterId { get; init; }
|
||||
|
||||
[JsonPropertyName("role")]
|
||||
public ClusterRole Role { get; init; } = ClusterRole.Member;
|
||||
|
||||
[JsonPropertyName("peerDiscovery")]
|
||||
public PeerDiscoveryConfig PeerDiscovery { get; init; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cluster role.
|
||||
/// </summary>
|
||||
public enum ClusterRole
|
||||
{
|
||||
Leader,
|
||||
Member
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Peer discovery configuration.
|
||||
/// </summary>
|
||||
public record PeerDiscoveryConfig
|
||||
{
|
||||
[JsonPropertyName("method")]
|
||||
public PeerDiscoveryMethod Method { get; init; } = PeerDiscoveryMethod.Dns;
|
||||
|
||||
[JsonPropertyName("dnsName")]
|
||||
public string? DnsName { get; init; }
|
||||
|
||||
[JsonPropertyName("staticPeers")]
|
||||
public List<string> StaticPeers { get; init; } = new();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Peer discovery method.
|
||||
/// </summary>
|
||||
public enum PeerDiscoveryMethod
|
||||
{
|
||||
Static,
|
||||
Dns,
|
||||
Kubernetes
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Auto-update configuration.
|
||||
/// </summary>
|
||||
public record AutoUpdateConfig
|
||||
{
|
||||
[JsonPropertyName("enabled")]
|
||||
public bool Enabled { get; init; } = false;
|
||||
|
||||
[JsonPropertyName("channel")]
|
||||
public UpdateChannel Channel { get; init; } = UpdateChannel.Stable;
|
||||
|
||||
[JsonPropertyName("maintenanceWindow")]
|
||||
public MaintenanceWindowConfig? MaintenanceWindow { get; init; }
|
||||
|
||||
[JsonPropertyName("requireApproval")]
|
||||
public bool RequireApproval { get; init; } = false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update channel.
|
||||
/// </summary>
|
||||
public enum UpdateChannel
|
||||
{
|
||||
Stable,
|
||||
Beta,
|
||||
Canary
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Maintenance window configuration.
|
||||
/// </summary>
|
||||
public record MaintenanceWindowConfig
|
||||
{
|
||||
[JsonPropertyName("dayOfWeek")]
|
||||
public DayOfWeek DayOfWeek { get; init; } = DayOfWeek.Sunday;
|
||||
|
||||
[JsonPropertyName("startHourUtc")]
|
||||
public int StartHourUtc { get; init; } = 2;
|
||||
|
||||
[JsonPropertyName("durationHours")]
|
||||
public int DurationHours { get; init; } = 4;
|
||||
}
|
||||
@@ -0,0 +1,166 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.Diagnostics;
|
||||
|
||||
namespace StellaOps.Agent.Core.Doctor;
|
||||
|
||||
/// <summary>
|
||||
/// Agent Doctor for running comprehensive diagnostics.
|
||||
/// </summary>
|
||||
public sealed class AgentDoctor : IAgentDoctor
|
||||
{
|
||||
private readonly IEnumerable<IAgentHealthCheck> _healthChecks;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly AgentDoctorOptions _options;
|
||||
|
||||
public AgentDoctor(
|
||||
IEnumerable<IAgentHealthCheck> healthChecks,
|
||||
TimeProvider timeProvider,
|
||||
AgentDoctorOptions? options = null)
|
||||
{
|
||||
_healthChecks = healthChecks;
|
||||
_timeProvider = timeProvider;
|
||||
_options = options ?? new AgentDoctorOptions();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs all diagnostics.
|
||||
/// </summary>
|
||||
public async Task<AgentDiagnosticReport> RunDiagnosticsAsync(
|
||||
DiagnosticOptions? options = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
options ??= new DiagnosticOptions();
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
var results = new List<HealthCheckResult>();
|
||||
|
||||
var checksToRun = _healthChecks
|
||||
.Where(c => options.Categories == null || options.Categories.Contains(c.Category))
|
||||
.ToList();
|
||||
|
||||
// Run checks in parallel with timeout
|
||||
var tasks = checksToRun.Select(async check =>
|
||||
{
|
||||
using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
|
||||
cts.CancelAfter(_options.CheckTimeout);
|
||||
|
||||
var sw = Stopwatch.StartNew();
|
||||
try
|
||||
{
|
||||
var result = await check.ExecuteAsync(cts.Token);
|
||||
sw.Stop();
|
||||
return result with { Duration = sw.Elapsed };
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
sw.Stop();
|
||||
return HealthCheckResult.Fail(check.Name, "Check timed out") with { Duration = sw.Elapsed };
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
sw.Stop();
|
||||
return HealthCheckResult.Fail(check.Name, $"Check failed: {ex.Message}") with { Duration = sw.Elapsed };
|
||||
}
|
||||
});
|
||||
|
||||
var checkResults = await Task.WhenAll(tasks);
|
||||
results.AddRange(checkResults);
|
||||
|
||||
// Stop on critical if configured
|
||||
if (options.StopOnCritical && results.Any(r => r.Status == HealthStatus.Critical))
|
||||
{
|
||||
// Don't run remaining checks
|
||||
}
|
||||
|
||||
var overallStatus = DetermineOverallStatus(results);
|
||||
var endTime = _timeProvider.GetUtcNow();
|
||||
|
||||
return new AgentDiagnosticReport
|
||||
{
|
||||
Status = overallStatus,
|
||||
Results = results,
|
||||
TotalChecks = results.Count,
|
||||
PassedChecks = results.Count(r => r.Status == HealthStatus.Healthy),
|
||||
WarningChecks = results.Count(r => r.Status == HealthStatus.Warning),
|
||||
FailedChecks = results.Count(r => r.Status == HealthStatus.Unhealthy),
|
||||
CriticalChecks = results.Count(r => r.Status == HealthStatus.Critical),
|
||||
StartedAt = startTime,
|
||||
CompletedAt = endTime,
|
||||
Duration = endTime - startTime
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs diagnostics for a specific category.
|
||||
/// </summary>
|
||||
public Task<AgentDiagnosticReport> RunCategoryDiagnosticsAsync(
|
||||
HealthCheckCategory category,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
return RunDiagnosticsAsync(
|
||||
new DiagnosticOptions { Categories = [category] },
|
||||
cancellationToken);
|
||||
}
|
||||
|
||||
private static HealthStatus DetermineOverallStatus(IReadOnlyList<HealthCheckResult> results)
|
||||
{
|
||||
if (results.Any(r => r.Status == HealthStatus.Critical))
|
||||
return HealthStatus.Critical;
|
||||
|
||||
if (results.Any(r => r.Status == HealthStatus.Unhealthy))
|
||||
return HealthStatus.Unhealthy;
|
||||
|
||||
if (results.Any(r => r.Status == HealthStatus.Warning))
|
||||
return HealthStatus.Warning;
|
||||
|
||||
return HealthStatus.Healthy;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent doctor interface.
|
||||
/// </summary>
|
||||
public interface IAgentDoctor
|
||||
{
|
||||
Task<AgentDiagnosticReport> RunDiagnosticsAsync(
|
||||
DiagnosticOptions? options = null,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
Task<AgentDiagnosticReport> RunCategoryDiagnosticsAsync(
|
||||
HealthCheckCategory category,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent diagnostic report.
|
||||
/// </summary>
|
||||
public sealed record AgentDiagnosticReport
|
||||
{
|
||||
public required HealthStatus Status { get; init; }
|
||||
public required IReadOnlyList<HealthCheckResult> Results { get; init; }
|
||||
public required int TotalChecks { get; init; }
|
||||
public required int PassedChecks { get; init; }
|
||||
public required int WarningChecks { get; init; }
|
||||
public required int FailedChecks { get; init; }
|
||||
public required int CriticalChecks { get; init; }
|
||||
public required DateTimeOffset StartedAt { get; init; }
|
||||
public required DateTimeOffset CompletedAt { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Diagnostic options.
|
||||
/// </summary>
|
||||
public sealed record DiagnosticOptions
|
||||
{
|
||||
public IReadOnlyList<HealthCheckCategory>? Categories { get; init; }
|
||||
public bool StopOnCritical { get; init; } = false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent doctor options.
|
||||
/// </summary>
|
||||
public sealed record AgentDoctorOptions
|
||||
{
|
||||
public TimeSpan CheckTimeout { get; init; } = TimeSpan.FromSeconds(10);
|
||||
}
|
||||
@@ -0,0 +1,244 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using StellaOps.Agent.Core.Certificates;
|
||||
using StellaOps.Agent.Core.Configuration;
|
||||
|
||||
namespace StellaOps.Agent.Core.Doctor.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Certificate expiry health check.
|
||||
/// </summary>
|
||||
public sealed class CertificateExpiryCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly IAgentCertificateManager _certManager;
|
||||
private readonly string _agentId;
|
||||
private readonly int _warningThresholdDays;
|
||||
|
||||
public CertificateExpiryCheck(
|
||||
IAgentCertificateManager certManager,
|
||||
string agentId,
|
||||
int warningThresholdDays = 14)
|
||||
{
|
||||
_certManager = certManager;
|
||||
_agentId = agentId;
|
||||
_warningThresholdDays = warningThresholdDays;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Security;
|
||||
public string Name => "CertificateExpiry";
|
||||
public string Description => "Checks if the agent certificate is nearing expiry";
|
||||
|
||||
public async Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var status = await _certManager.GetStatusAsync(_agentId, cancellationToken);
|
||||
|
||||
return status.Status switch
|
||||
{
|
||||
CertificateStatus.NotFound => HealthCheckResult.Critical(Name, "No certificate found"),
|
||||
CertificateStatus.Expired => HealthCheckResult.Critical(Name, "Certificate has expired"),
|
||||
CertificateStatus.NearingExpiry => HealthCheckResult.Warn(Name,
|
||||
$"Certificate expires in {status.DaysUntilExpiry} days",
|
||||
new Dictionary<string, object>
|
||||
{
|
||||
["daysUntilExpiry"] = status.DaysUntilExpiry ?? 0,
|
||||
["expiresAt"] = status.NotAfter?.ToString("O") ?? ""
|
||||
}),
|
||||
CertificateStatus.Valid => status.DaysUntilExpiry < _warningThresholdDays
|
||||
? HealthCheckResult.Warn(Name, $"Certificate expires in {status.DaysUntilExpiry} days")
|
||||
: HealthCheckResult.Pass(Name, $"Certificate valid for {status.DaysUntilExpiry} days"),
|
||||
_ => HealthCheckResult.Fail(Name, "Unknown certificate status")
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Disk space health check.
|
||||
/// </summary>
|
||||
public sealed class DiskSpaceCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly string _path;
|
||||
private readonly long _warningThresholdBytes;
|
||||
private readonly long _criticalThresholdBytes;
|
||||
|
||||
public DiskSpaceCheck(
|
||||
string path = "/",
|
||||
long warningThresholdBytes = 1_073_741_824, // 1 GB
|
||||
long criticalThresholdBytes = 104_857_600) // 100 MB
|
||||
{
|
||||
_path = path;
|
||||
_warningThresholdBytes = warningThresholdBytes;
|
||||
_criticalThresholdBytes = criticalThresholdBytes;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Resources;
|
||||
public string Name => "DiskSpace";
|
||||
public string Description => "Checks available disk space";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
try
|
||||
{
|
||||
var driveInfo = new DriveInfo(Path.GetPathRoot(_path) ?? _path);
|
||||
var availableBytes = driveInfo.AvailableFreeSpace;
|
||||
|
||||
var details = new Dictionary<string, object>
|
||||
{
|
||||
["availableBytes"] = availableBytes,
|
||||
["availableGb"] = availableBytes / 1_073_741_824.0,
|
||||
["totalBytes"] = driveInfo.TotalSize,
|
||||
["usagePercent"] = (1 - (double)availableBytes / driveInfo.TotalSize) * 100
|
||||
};
|
||||
|
||||
if (availableBytes < _criticalThresholdBytes)
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Critical(Name,
|
||||
$"Disk space critically low: {availableBytes / 1_048_576} MB available", details));
|
||||
}
|
||||
|
||||
if (availableBytes < _warningThresholdBytes)
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Warn(Name,
|
||||
$"Disk space low: {availableBytes / 1_073_741_824.0:F2} GB available", details));
|
||||
}
|
||||
|
||||
return Task.FromResult(HealthCheckResult.Pass(Name,
|
||||
$"Disk space OK: {availableBytes / 1_073_741_824.0:F2} GB available", details));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Fail(Name, $"Failed to check disk space: {ex.Message}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Memory usage health check.
|
||||
/// </summary>
|
||||
public sealed class MemoryUsageCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly double _warningThresholdPercent;
|
||||
private readonly double _criticalThresholdPercent;
|
||||
|
||||
public MemoryUsageCheck(
|
||||
double warningThresholdPercent = 80,
|
||||
double criticalThresholdPercent = 95)
|
||||
{
|
||||
_warningThresholdPercent = warningThresholdPercent;
|
||||
_criticalThresholdPercent = criticalThresholdPercent;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Resources;
|
||||
public string Name => "MemoryUsage";
|
||||
public string Description => "Checks memory utilization";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
try
|
||||
{
|
||||
var process = System.Diagnostics.Process.GetCurrentProcess();
|
||||
var workingSet = process.WorkingSet64;
|
||||
var privateMemory = process.PrivateMemorySize64;
|
||||
|
||||
var details = new Dictionary<string, object>
|
||||
{
|
||||
["workingSetBytes"] = workingSet,
|
||||
["workingSetMb"] = workingSet / 1_048_576.0,
|
||||
["privateMemoryBytes"] = privateMemory,
|
||||
["privateMemoryMb"] = privateMemory / 1_048_576.0
|
||||
};
|
||||
|
||||
// Note: Getting total system memory is platform-specific
|
||||
// For now, just report working set
|
||||
return Task.FromResult(HealthCheckResult.Pass(Name,
|
||||
$"Process memory: {workingSet / 1_048_576.0:F1} MB working set", details));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Fail(Name, $"Failed to check memory: {ex.Message}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Docker connectivity health check.
|
||||
/// </summary>
|
||||
public sealed class DockerConnectivityCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly string _dockerSocket;
|
||||
|
||||
public DockerConnectivityCheck(string dockerSocket = "/var/run/docker.sock")
|
||||
{
|
||||
_dockerSocket = dockerSocket;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Runtime;
|
||||
public string Name => "DockerConnectivity";
|
||||
public string Description => "Checks Docker daemon accessibility";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Check if socket exists (Unix) or named pipe is accessible (Windows)
|
||||
if (OperatingSystem.IsWindows())
|
||||
{
|
||||
// Windows uses named pipe
|
||||
var pipePath = @"\\.\pipe\docker_engine";
|
||||
if (File.Exists(pipePath) || Directory.Exists(@"\\.\pipe"))
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Pass(Name, "Docker daemon accessible via named pipe"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Unix uses socket
|
||||
if (File.Exists(_dockerSocket))
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Pass(Name, "Docker socket accessible"));
|
||||
}
|
||||
}
|
||||
|
||||
return Task.FromResult(HealthCheckResult.Critical(Name, "Docker daemon not accessible"));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return Task.FromResult(HealthCheckResult.Fail(Name, $"Failed to check Docker: {ex.Message}"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration drift health check.
|
||||
/// </summary>
|
||||
public sealed class ConfigurationDriftCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly IAgentConfigManager _configManager;
|
||||
|
||||
public ConfigurationDriftCheck(IAgentConfigManager configManager)
|
||||
{
|
||||
_configManager = configManager;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Configuration;
|
||||
public string Name => "ConfigurationDrift";
|
||||
public string Description => "Checks for configuration drift between current and desired state";
|
||||
|
||||
public async Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var drift = await _configManager.DetectDriftAsync(cancellationToken);
|
||||
|
||||
if (!drift.HasDrift)
|
||||
{
|
||||
return HealthCheckResult.Pass(Name, "No configuration drift detected");
|
||||
}
|
||||
|
||||
var details = new Dictionary<string, object>
|
||||
{
|
||||
["differenceCount"] = drift.Differences.Count,
|
||||
["differences"] = drift.Differences.Select(d => d.Path).ToList()
|
||||
};
|
||||
|
||||
return HealthCheckResult.Warn(Name,
|
||||
$"Configuration drift detected: {drift.Differences.Count} differences", details);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,382 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
using System.Diagnostics;
|
||||
using StellaOps.Agent.Core.Certificates;
|
||||
|
||||
namespace StellaOps.Agent.Core.Doctor.Checks;
|
||||
|
||||
/// <summary>
|
||||
/// Checks certificate expiry status.
|
||||
/// </summary>
|
||||
public sealed class CertificateExpiryCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly IAgentCertificateManager _certificateManager;
|
||||
private readonly int _warningThresholdDays;
|
||||
|
||||
public CertificateExpiryCheck(
|
||||
IAgentCertificateManager certificateManager,
|
||||
int warningThresholdDays = 14)
|
||||
{
|
||||
_certificateManager = certificateManager;
|
||||
_warningThresholdDays = warningThresholdDays;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Security;
|
||||
public string Name => "Certificate Expiry";
|
||||
public string Description => "Checks if the agent certificate is valid and not nearing expiry";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
var status = _certificateManager.GetCertificateStatus();
|
||||
|
||||
HealthStatus healthStatus;
|
||||
string message;
|
||||
|
||||
if (!status.HasCertificate)
|
||||
{
|
||||
healthStatus = HealthStatus.Critical;
|
||||
message = "No certificate loaded";
|
||||
}
|
||||
else if (status.IsExpired)
|
||||
{
|
||||
healthStatus = HealthStatus.Critical;
|
||||
message = $"Certificate expired on {status.NotAfter:yyyy-MM-dd}";
|
||||
}
|
||||
else if (status.RemainingDays <= 3)
|
||||
{
|
||||
healthStatus = HealthStatus.Unhealthy;
|
||||
message = $"Certificate expires in {status.RemainingDays} days - immediate renewal required";
|
||||
}
|
||||
else if (status.RemainingDays <= _warningThresholdDays)
|
||||
{
|
||||
healthStatus = HealthStatus.Degraded;
|
||||
message = $"Certificate expires in {status.RemainingDays} days - renewal recommended";
|
||||
}
|
||||
else
|
||||
{
|
||||
healthStatus = HealthStatus.Healthy;
|
||||
message = $"Certificate valid for {status.RemainingDays} more days";
|
||||
}
|
||||
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = healthStatus,
|
||||
Message = message,
|
||||
Duration = sw.Elapsed,
|
||||
Metrics = new Dictionary<string, object>
|
||||
{
|
||||
["remainingDays"] = status.RemainingDays,
|
||||
["expiresAt"] = status.NotAfter.ToString("O")
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates certificate chain.
|
||||
/// </summary>
|
||||
public sealed class CertificateValidityCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly IAgentCertificateManager _certificateManager;
|
||||
|
||||
public CertificateValidityCheck(IAgentCertificateManager certificateManager)
|
||||
{
|
||||
_certificateManager = certificateManager;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Security;
|
||||
public string Name => "Certificate Validity";
|
||||
public string Description => "Validates the certificate chain and trust";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
var cert = _certificateManager.CurrentCertificate;
|
||||
|
||||
if (cert is null)
|
||||
{
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Critical,
|
||||
Message = "No certificate available for validation",
|
||||
Duration = sw.Elapsed
|
||||
});
|
||||
}
|
||||
|
||||
// Basic validation - check dates and key usage
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
|
||||
if (cert.NotBefore > now)
|
||||
{
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Critical,
|
||||
Message = $"Certificate not yet valid (valid from {cert.NotBefore:yyyy-MM-dd})",
|
||||
Duration = sw.Elapsed
|
||||
});
|
||||
}
|
||||
|
||||
if (cert.NotAfter < now)
|
||||
{
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Critical,
|
||||
Message = $"Certificate has expired (expired {cert.NotAfter:yyyy-MM-dd})",
|
||||
Duration = sw.Elapsed
|
||||
});
|
||||
}
|
||||
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Healthy,
|
||||
Message = "Certificate is valid",
|
||||
Duration = sw.Elapsed,
|
||||
Details = $"Subject: {cert.Subject}, Thumbprint: {cert.Thumbprint}"
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks disk space availability.
|
||||
/// </summary>
|
||||
public sealed class DiskSpaceCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly string _path;
|
||||
private readonly long _warningThresholdMb;
|
||||
private readonly long _criticalThresholdMb;
|
||||
|
||||
public DiskSpaceCheck(
|
||||
string path = "/",
|
||||
long warningThresholdMb = 1024,
|
||||
long criticalThresholdMb = 256)
|
||||
{
|
||||
_path = path;
|
||||
_warningThresholdMb = warningThresholdMb;
|
||||
_criticalThresholdMb = criticalThresholdMb;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Resources;
|
||||
public string Name => "Disk Space";
|
||||
public string Description => "Checks available disk space";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
try
|
||||
{
|
||||
var driveInfo = new DriveInfo(Path.GetPathRoot(_path) ?? _path);
|
||||
var availableMb = driveInfo.AvailableFreeSpace / (1024 * 1024);
|
||||
var totalMb = driveInfo.TotalSize / (1024 * 1024);
|
||||
var usedPercent = 100.0 * (totalMb - availableMb) / totalMb;
|
||||
|
||||
HealthStatus status;
|
||||
string message;
|
||||
|
||||
if (availableMb < _criticalThresholdMb)
|
||||
{
|
||||
status = HealthStatus.Critical;
|
||||
message = $"Critical: Only {availableMb} MB available ({usedPercent:F1}% used)";
|
||||
}
|
||||
else if (availableMb < _warningThresholdMb)
|
||||
{
|
||||
status = HealthStatus.Degraded;
|
||||
message = $"Warning: {availableMb} MB available ({usedPercent:F1}% used)";
|
||||
}
|
||||
else
|
||||
{
|
||||
status = HealthStatus.Healthy;
|
||||
message = $"{availableMb} MB available ({usedPercent:F1}% used)";
|
||||
}
|
||||
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = status,
|
||||
Message = message,
|
||||
Duration = sw.Elapsed,
|
||||
Metrics = new Dictionary<string, object>
|
||||
{
|
||||
["availableMb"] = availableMb,
|
||||
["totalMb"] = totalMb,
|
||||
["usedPercent"] = usedPercent
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Unhealthy,
|
||||
Message = $"Failed to check disk space: {ex.Message}",
|
||||
Duration = sw.Elapsed
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks memory usage.
|
||||
/// </summary>
|
||||
public sealed class MemoryUsageCheck : IAgentHealthCheck
|
||||
{
|
||||
private readonly int _warningThresholdPercent;
|
||||
private readonly int _criticalThresholdPercent;
|
||||
|
||||
public MemoryUsageCheck(
|
||||
int warningThresholdPercent = 85,
|
||||
int criticalThresholdPercent = 95)
|
||||
{
|
||||
_warningThresholdPercent = warningThresholdPercent;
|
||||
_criticalThresholdPercent = criticalThresholdPercent;
|
||||
}
|
||||
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Resources;
|
||||
public string Name => "Memory Usage";
|
||||
public string Description => "Checks memory utilization";
|
||||
|
||||
public Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
var process = Process.GetCurrentProcess();
|
||||
var workingSetMb = process.WorkingSet64 / (1024 * 1024);
|
||||
var privateMemoryMb = process.PrivateMemorySize64 / (1024 * 1024);
|
||||
|
||||
// For this implementation, we use process memory as a proxy
|
||||
// In production, would integrate with OS-level memory stats
|
||||
var gcInfo = GC.GetGCMemoryInfo();
|
||||
var totalAvailableMemoryMb = gcInfo.TotalAvailableMemoryBytes / (1024 * 1024);
|
||||
var usedPercent = 100.0 * workingSetMb / totalAvailableMemoryMb;
|
||||
|
||||
HealthStatus status;
|
||||
string message;
|
||||
|
||||
if (usedPercent >= _criticalThresholdPercent)
|
||||
{
|
||||
status = HealthStatus.Critical;
|
||||
message = $"Critical memory usage: {usedPercent:F1}%";
|
||||
}
|
||||
else if (usedPercent >= _warningThresholdPercent)
|
||||
{
|
||||
status = HealthStatus.Degraded;
|
||||
message = $"High memory usage: {usedPercent:F1}%";
|
||||
}
|
||||
else
|
||||
{
|
||||
status = HealthStatus.Healthy;
|
||||
message = $"Memory usage: {usedPercent:F1}%";
|
||||
}
|
||||
|
||||
return Task.FromResult(new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = status,
|
||||
Message = message,
|
||||
Duration = sw.Elapsed,
|
||||
Metrics = new Dictionary<string, object>
|
||||
{
|
||||
["workingSetMb"] = workingSetMb,
|
||||
["privateMemoryMb"] = privateMemoryMb,
|
||||
["usedPercent"] = usedPercent
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks Docker connectivity.
|
||||
/// </summary>
|
||||
public sealed class DockerConnectivityCheck : IAgentHealthCheck
|
||||
{
|
||||
public HealthCheckCategory Category => HealthCheckCategory.Runtime;
|
||||
public string Name => "Docker Connectivity";
|
||||
public string Description => "Checks if Docker daemon is accessible";
|
||||
|
||||
public async Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
|
||||
try
|
||||
{
|
||||
var psi = new ProcessStartInfo
|
||||
{
|
||||
FileName = "docker",
|
||||
Arguments = "info --format '{{.ServerVersion}}'",
|
||||
RedirectStandardOutput = true,
|
||||
RedirectStandardError = true,
|
||||
UseShellExecute = false,
|
||||
CreateNoWindow = true
|
||||
};
|
||||
|
||||
using var process = Process.Start(psi);
|
||||
if (process is null)
|
||||
{
|
||||
return new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Critical,
|
||||
Message = "Failed to start docker command",
|
||||
Duration = sw.Elapsed
|
||||
};
|
||||
}
|
||||
|
||||
await process.WaitForExitAsync(cancellationToken);
|
||||
var output = await process.StandardOutput.ReadToEndAsync(cancellationToken);
|
||||
|
||||
if (process.ExitCode == 0)
|
||||
{
|
||||
return new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Healthy,
|
||||
Message = "Docker daemon is accessible",
|
||||
Duration = sw.Elapsed,
|
||||
Details = $"Docker version: {output.Trim()}"
|
||||
};
|
||||
}
|
||||
|
||||
var error = await process.StandardError.ReadToEndAsync(cancellationToken);
|
||||
return new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Critical,
|
||||
Message = "Docker daemon is not accessible",
|
||||
Duration = sw.Elapsed,
|
||||
Details = error
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthCheckResult
|
||||
{
|
||||
CheckName = Name,
|
||||
Category = Category,
|
||||
Status = HealthStatus.Critical,
|
||||
Message = $"Docker check failed: {ex.Message}",
|
||||
Duration = sw.Elapsed
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
// Copyright (c) 2026 Stella Ops. All rights reserved.
|
||||
// Licensed under the AGPL-3.0-or-later license.
|
||||
|
||||
namespace StellaOps.Agent.Core.Doctor;
|
||||
|
||||
/// <summary>
|
||||
/// Interface for agent health checks.
|
||||
/// </summary>
|
||||
public interface IAgentHealthCheck
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the check category.
|
||||
/// </summary>
|
||||
HealthCheckCategory Category { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the check name.
|
||||
/// </summary>
|
||||
string Name { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the check description.
|
||||
/// </summary>
|
||||
string Description { get; }
|
||||
|
||||
/// <summary>
|
||||
/// Executes the health check.
|
||||
/// </summary>
|
||||
Task<HealthCheckResult> ExecuteAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Health check categories.
|
||||
/// </summary>
|
||||
public enum HealthCheckCategory
|
||||
{
|
||||
Security,
|
||||
Network,
|
||||
Runtime,
|
||||
Resources,
|
||||
Configuration
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a health check execution.
|
||||
/// </summary>
|
||||
public record HealthCheckResult
|
||||
{
|
||||
public required string CheckName { get; init; }
|
||||
public HealthCheckCategory Category { get; init; }
|
||||
public HealthStatus Status { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public string? Details { get; init; }
|
||||
public TimeSpan Duration { get; init; }
|
||||
public IReadOnlyDictionary<string, object>? Metrics { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Health check status levels.
|
||||
/// </summary>
|
||||
public enum HealthStatus
|
||||
{
|
||||
Healthy,
|
||||
Degraded,
|
||||
Unhealthy,
|
||||
Critical
|
||||
}
|
||||
@@ -0,0 +1,215 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
namespace StellaOps.Agent.Core.Doctor.Patterns;
|
||||
|
||||
/// <summary>
|
||||
/// Remediation patterns for common agent issues.
|
||||
/// </summary>
|
||||
public sealed class CertificateRemediationPattern : IRemediationPattern
|
||||
{
|
||||
public bool Matches(HealthCheckResult result) =>
|
||||
result.CheckName.Contains("Certificate", StringComparison.OrdinalIgnoreCase) &&
|
||||
result.Status != HealthStatus.Healthy;
|
||||
|
||||
public IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result)
|
||||
{
|
||||
var steps = new List<RemediationStep>();
|
||||
|
||||
if (result.CheckName == "CertificateExpiry")
|
||||
{
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "cert-renew",
|
||||
Title = "Renew agent certificate",
|
||||
Description = "Renew the agent's mTLS certificate before it expires",
|
||||
Priority = 1,
|
||||
IsAutomated = true,
|
||||
Command = "stella agent renew-cert",
|
||||
RunbookUrl = "https://docs.stellaops.io/runbooks/certificate-renewal"
|
||||
});
|
||||
}
|
||||
|
||||
if (result.Status == HealthStatus.Critical && result.Message.Contains("expired"))
|
||||
{
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "cert-force-renew",
|
||||
Title = "Force certificate renewal",
|
||||
Description = "Certificate has expired. Force renewal to restore connectivity.",
|
||||
Priority = 0,
|
||||
IsAutomated = true,
|
||||
Command = "stella agent renew-cert --force",
|
||||
RunbookUrl = "https://docs.stellaops.io/runbooks/certificate-expired"
|
||||
});
|
||||
}
|
||||
|
||||
if (result.Status == HealthStatus.Critical && result.Message.Contains("not found"))
|
||||
{
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "cert-provision",
|
||||
Title = "Provision new certificate",
|
||||
Description = "No certificate found. Re-bootstrap the agent or manually provision a certificate.",
|
||||
Priority = 0,
|
||||
IsAutomated = false,
|
||||
RunbookUrl = "https://docs.stellaops.io/runbooks/certificate-missing",
|
||||
ManualSteps =
|
||||
[
|
||||
"1. Generate a new bootstrap token from the orchestrator",
|
||||
"2. Run: stella agent bootstrap --token <token>",
|
||||
"3. Verify certificate: stella agent status"
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
return steps;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation patterns for connectivity issues.
|
||||
/// </summary>
|
||||
public sealed class ConnectivityRemediationPattern : IRemediationPattern
|
||||
{
|
||||
public bool Matches(HealthCheckResult result) =>
|
||||
result.CheckName.Contains("Connectivity", StringComparison.OrdinalIgnoreCase) &&
|
||||
result.Status != HealthStatus.Healthy;
|
||||
|
||||
public IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result)
|
||||
{
|
||||
var steps = new List<RemediationStep>();
|
||||
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "check-network",
|
||||
Title = "Check network connectivity",
|
||||
Description = "Verify network connectivity to the orchestrator",
|
||||
Priority = 1,
|
||||
IsAutomated = false,
|
||||
RunbookUrl = "https://docs.stellaops.io/runbooks/network-troubleshooting",
|
||||
ManualSteps =
|
||||
[
|
||||
"1. Verify DNS resolution: nslookup <orchestrator-hostname>",
|
||||
"2. Check port accessibility: telnet <orchestrator-hostname> 443",
|
||||
"3. Verify firewall rules allow outbound HTTPS/gRPC",
|
||||
"4. Check proxy settings if applicable"
|
||||
]
|
||||
});
|
||||
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "restart-agent",
|
||||
Title = "Restart agent service",
|
||||
Description = "Restart the agent to re-establish connection",
|
||||
Priority = 2,
|
||||
IsAutomated = true,
|
||||
Command = "systemctl restart stella-agent || sc restart StellaAgent"
|
||||
});
|
||||
|
||||
return steps;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation patterns for Docker issues.
|
||||
/// </summary>
|
||||
public sealed class DockerRemediationPattern : IRemediationPattern
|
||||
{
|
||||
public bool Matches(HealthCheckResult result) =>
|
||||
result.CheckName.Contains("Docker", StringComparison.OrdinalIgnoreCase) &&
|
||||
result.Status != HealthStatus.Healthy;
|
||||
|
||||
public IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result)
|
||||
{
|
||||
var steps = new List<RemediationStep>();
|
||||
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "docker-check-socket",
|
||||
Title = "Check Docker socket permissions",
|
||||
Description = "Ensure the agent has access to the Docker socket",
|
||||
Priority = 1,
|
||||
IsAutomated = false,
|
||||
RunbookUrl = "https://docs.stellaops.io/runbooks/docker-socket",
|
||||
ManualSteps =
|
||||
[
|
||||
"1. Check socket exists: ls -la /var/run/docker.sock",
|
||||
"2. Verify agent user is in docker group: groups stella-agent",
|
||||
"3. Add to group if needed: usermod -aG docker stella-agent",
|
||||
"4. Restart agent: systemctl restart stella-agent"
|
||||
]
|
||||
});
|
||||
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "docker-start-daemon",
|
||||
Title = "Start Docker daemon",
|
||||
Description = "Docker daemon may not be running",
|
||||
Priority = 0,
|
||||
IsAutomated = true,
|
||||
Command = "systemctl start docker"
|
||||
});
|
||||
|
||||
return steps;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation patterns for resource issues.
|
||||
/// </summary>
|
||||
public sealed class ResourceRemediationPattern : IRemediationPattern
|
||||
{
|
||||
public bool Matches(HealthCheckResult result) =>
|
||||
(result.CheckName.Contains("Disk", StringComparison.OrdinalIgnoreCase) ||
|
||||
result.CheckName.Contains("Memory", StringComparison.OrdinalIgnoreCase) ||
|
||||
result.CheckName.Contains("CPU", StringComparison.OrdinalIgnoreCase)) &&
|
||||
result.Status != HealthStatus.Healthy;
|
||||
|
||||
public IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result)
|
||||
{
|
||||
var steps = new List<RemediationStep>();
|
||||
|
||||
if (result.CheckName.Contains("Disk"))
|
||||
{
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "disk-cleanup",
|
||||
Title = "Clean up disk space",
|
||||
Description = "Free up disk space by removing unused Docker resources",
|
||||
Priority = 1,
|
||||
IsAutomated = true,
|
||||
Command = "docker system prune -af --volumes"
|
||||
});
|
||||
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "disk-logs",
|
||||
Title = "Rotate and clean logs",
|
||||
Description = "Remove old log files to free space",
|
||||
Priority = 2,
|
||||
IsAutomated = true,
|
||||
Command = "journalctl --vacuum-time=7d"
|
||||
});
|
||||
}
|
||||
|
||||
if (result.CheckName.Contains("Memory"))
|
||||
{
|
||||
steps.Add(new RemediationStep
|
||||
{
|
||||
Id = "memory-reduce-tasks",
|
||||
Title = "Reduce concurrent tasks",
|
||||
Description = "Lower the max concurrent tasks setting to reduce memory pressure",
|
||||
Priority = 1,
|
||||
IsAutomated = false,
|
||||
ManualSteps =
|
||||
[
|
||||
"1. Edit agent config: /opt/stella-agent/config.yaml",
|
||||
"2. Reduce resources.maxConcurrentTasks value",
|
||||
"3. Restart agent: systemctl restart stella-agent"
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
return steps;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,156 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
namespace StellaOps.Agent.Core.Doctor;
|
||||
|
||||
/// <summary>
|
||||
/// Remediation engine for guided problem resolution.
|
||||
/// </summary>
|
||||
public sealed class RemediationEngine : IRemediationEngine
|
||||
{
|
||||
private readonly IReadOnlyList<IRemediationPattern> _patterns;
|
||||
|
||||
public RemediationEngine(IEnumerable<IRemediationPattern> patterns)
|
||||
{
|
||||
_patterns = patterns.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets remediation steps for a health check result.
|
||||
/// </summary>
|
||||
public IReadOnlyList<RemediationStep> GetRemediationSteps(HealthCheckResult result)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(result);
|
||||
|
||||
var steps = new List<RemediationStep>();
|
||||
|
||||
foreach (var pattern in _patterns)
|
||||
{
|
||||
if (pattern.Matches(result))
|
||||
{
|
||||
steps.AddRange(pattern.GetSteps(result));
|
||||
}
|
||||
}
|
||||
|
||||
return steps.OrderBy(s => s.Priority).ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all remediation steps for a diagnostic report.
|
||||
/// </summary>
|
||||
public IReadOnlyList<RemediationStep> GetAllRemediationSteps(AgentDiagnosticReport report)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(report);
|
||||
|
||||
var allSteps = new List<RemediationStep>();
|
||||
|
||||
foreach (var result in report.Results.Where(r => r.Status != HealthStatus.Healthy))
|
||||
{
|
||||
allSteps.AddRange(GetRemediationSteps(result));
|
||||
}
|
||||
|
||||
return allSteps
|
||||
.DistinctBy(s => s.Id)
|
||||
.OrderBy(s => s.Priority)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Executes automated remediation steps.
|
||||
/// </summary>
|
||||
public async Task<RemediationExecutionResult> ExecuteAutomatedRemediationsAsync(
|
||||
IReadOnlyList<RemediationStep> steps,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var automatedSteps = steps.Where(s => s.IsAutomated && s.Command != null).ToList();
|
||||
var executed = new List<RemediationStepResult>();
|
||||
|
||||
foreach (var step in automatedSteps)
|
||||
{
|
||||
try
|
||||
{
|
||||
// In a real implementation, execute the command
|
||||
// For now, we simulate success
|
||||
executed.Add(new RemediationStepResult
|
||||
{
|
||||
Step = step,
|
||||
Success = true,
|
||||
Message = "Remediation applied successfully"
|
||||
});
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
executed.Add(new RemediationStepResult
|
||||
{
|
||||
Step = step,
|
||||
Success = false,
|
||||
Message = $"Remediation failed: {ex.Message}"
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return new RemediationExecutionResult
|
||||
{
|
||||
TotalSteps = automatedSteps.Count,
|
||||
SuccessfulSteps = executed.Count(r => r.Success),
|
||||
FailedSteps = executed.Count(r => !r.Success),
|
||||
Results = executed
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation engine interface.
|
||||
/// </summary>
|
||||
public interface IRemediationEngine
|
||||
{
|
||||
IReadOnlyList<RemediationStep> GetRemediationSteps(HealthCheckResult result);
|
||||
IReadOnlyList<RemediationStep> GetAllRemediationSteps(AgentDiagnosticReport report);
|
||||
Task<RemediationExecutionResult> ExecuteAutomatedRemediationsAsync(
|
||||
IReadOnlyList<RemediationStep> steps,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation step.
|
||||
/// </summary>
|
||||
public sealed record RemediationStep
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Title { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public int Priority { get; init; } = 100;
|
||||
public bool IsAutomated { get; init; }
|
||||
public string? Command { get; init; }
|
||||
public string? RunbookUrl { get; init; }
|
||||
public IReadOnlyList<string>? ManualSteps { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation pattern interface.
|
||||
/// </summary>
|
||||
public interface IRemediationPattern
|
||||
{
|
||||
bool Matches(HealthCheckResult result);
|
||||
IReadOnlyList<RemediationStep> GetSteps(HealthCheckResult result);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation step result.
|
||||
/// </summary>
|
||||
public sealed record RemediationStepResult
|
||||
{
|
||||
public required RemediationStep Step { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remediation execution result.
|
||||
/// </summary>
|
||||
public sealed record RemediationExecutionResult
|
||||
{
|
||||
public required int TotalSteps { get; init; }
|
||||
public required int SuccessfulSteps { get; init; }
|
||||
public required int FailedSteps { get; init; }
|
||||
public required IReadOnlyList<RemediationStepResult> Results { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,534 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Manages agent clustering with multiple operational modes.
|
||||
/// </summary>
|
||||
public sealed class AgentClusterManager : BackgroundService
|
||||
{
|
||||
private readonly IClusterMemberStore _memberStore;
|
||||
private readonly ILeaderElection _leaderElection;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly AgentClusterConfig _config;
|
||||
private readonly ILogger<AgentClusterManager> _logger;
|
||||
private readonly ConcurrentDictionary<string, ClusterMember> _members = new();
|
||||
|
||||
private string? _currentLeaderId;
|
||||
private ClusterState _state = ClusterState.Initializing;
|
||||
|
||||
public event EventHandler<ClusterStateChangedEventArgs>? StateChanged;
|
||||
public event EventHandler<LeaderChangedEventArgs>? LeaderChanged;
|
||||
public event EventHandler<MembershipChangedEventArgs>? MembershipChanged;
|
||||
|
||||
public AgentClusterManager(
|
||||
IClusterMemberStore memberStore,
|
||||
ILeaderElection leaderElection,
|
||||
TimeProvider timeProvider,
|
||||
AgentClusterConfig config,
|
||||
ILogger<AgentClusterManager> logger)
|
||||
{
|
||||
_memberStore = memberStore;
|
||||
_leaderElection = leaderElection;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current cluster mode.
|
||||
/// </summary>
|
||||
public ClusterMode Mode => _config.Mode;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current cluster state.
|
||||
/// </summary>
|
||||
public ClusterState State => _state;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current leader ID (for ActivePassive mode).
|
||||
/// </summary>
|
||||
public string? CurrentLeaderId => _currentLeaderId;
|
||||
|
||||
/// <summary>
|
||||
/// Gets whether this agent is the leader.
|
||||
/// </summary>
|
||||
public bool IsLeader => _currentLeaderId == _config.LocalAgentId;
|
||||
|
||||
/// <summary>
|
||||
/// Gets all cluster members.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<string, ClusterMember> Members => _members;
|
||||
|
||||
/// <summary>
|
||||
/// Joins the cluster.
|
||||
/// </summary>
|
||||
public async Task JoinClusterAsync(CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Agent {AgentId} joining cluster in {Mode} mode",
|
||||
_config.LocalAgentId, _config.Mode);
|
||||
|
||||
var localMember = new ClusterMember
|
||||
{
|
||||
AgentId = _config.LocalAgentId,
|
||||
Endpoint = _config.LocalEndpoint,
|
||||
JoinedAt = _timeProvider.GetUtcNow(),
|
||||
LastHeartbeat = _timeProvider.GetUtcNow(),
|
||||
Status = MemberStatus.Joining,
|
||||
Role = DetermineInitialRole()
|
||||
};
|
||||
|
||||
_members[_config.LocalAgentId] = localMember;
|
||||
|
||||
await _memberStore.RegisterAsync(localMember, ct);
|
||||
|
||||
// Load existing members
|
||||
var existingMembers = await _memberStore.GetAllAsync(ct);
|
||||
foreach (var member in existingMembers)
|
||||
{
|
||||
if (member.AgentId != _config.LocalAgentId)
|
||||
{
|
||||
_members[member.AgentId] = member;
|
||||
}
|
||||
}
|
||||
|
||||
// Start leader election for ActivePassive mode
|
||||
if (_config.Mode == ClusterMode.ActivePassive)
|
||||
{
|
||||
await StartLeaderElectionAsync(ct);
|
||||
}
|
||||
|
||||
// Update local member status
|
||||
localMember = localMember with { Status = MemberStatus.Active };
|
||||
_members[_config.LocalAgentId] = localMember;
|
||||
await _memberStore.UpdateAsync(localMember, ct);
|
||||
|
||||
UpdateState(ClusterState.Running);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Agent {AgentId} joined cluster with {MemberCount} members",
|
||||
_config.LocalAgentId, _members.Count);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Leaves the cluster gracefully.
|
||||
/// </summary>
|
||||
public async Task LeaveClusterAsync(CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Agent {AgentId} leaving cluster",
|
||||
_config.LocalAgentId);
|
||||
|
||||
UpdateState(ClusterState.Leaving);
|
||||
|
||||
// Resign leadership if leader
|
||||
if (IsLeader)
|
||||
{
|
||||
await _leaderElection.ResignAsync(ct);
|
||||
}
|
||||
|
||||
await _memberStore.UnregisterAsync(_config.LocalAgentId, ct);
|
||||
_members.TryRemove(_config.LocalAgentId, out _);
|
||||
|
||||
UpdateState(ClusterState.Left);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets available members for task assignment.
|
||||
/// </summary>
|
||||
public IReadOnlyList<ClusterMember> GetAvailableMembers()
|
||||
{
|
||||
return _members.Values
|
||||
.Where(m => m.Status == MemberStatus.Active)
|
||||
.Where(m => _config.Mode != ClusterMode.ActivePassive || m.Role == MemberRole.Leader)
|
||||
.OrderBy(m => m.CurrentLoad)
|
||||
.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Selects a member for task assignment based on strategy.
|
||||
/// </summary>
|
||||
public ClusterMember? SelectMemberForTask(TaskAssignmentContext context)
|
||||
{
|
||||
var available = GetAvailableMembers();
|
||||
|
||||
if (available.Count == 0)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
return _config.LoadBalancingStrategy switch
|
||||
{
|
||||
LoadBalancingStrategy.RoundRobin => SelectRoundRobin(available),
|
||||
LoadBalancingStrategy.LeastLoaded => available.First(),
|
||||
LoadBalancingStrategy.AffinityBased => SelectByAffinity(available, context),
|
||||
LoadBalancingStrategy.ShardBased => SelectByShard(available, context),
|
||||
_ => available.First()
|
||||
};
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
await JoinClusterAsync(stoppingToken);
|
||||
|
||||
using var timer = new PeriodicTimer(_config.HeartbeatInterval);
|
||||
|
||||
try
|
||||
{
|
||||
while (await timer.WaitForNextTickAsync(stoppingToken))
|
||||
{
|
||||
await SendHeartbeatAsync(stoppingToken);
|
||||
await CheckMemberHealthAsync(stoppingToken);
|
||||
await SyncClusterStateAsync(stoppingToken);
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
// Expected on shutdown
|
||||
}
|
||||
|
||||
await LeaveClusterAsync(CancellationToken.None);
|
||||
}
|
||||
|
||||
private async Task SendHeartbeatAsync(CancellationToken ct)
|
||||
{
|
||||
if (_members.TryGetValue(_config.LocalAgentId, out var local))
|
||||
{
|
||||
var updated = local with
|
||||
{
|
||||
LastHeartbeat = _timeProvider.GetUtcNow(),
|
||||
CurrentLoad = CalculateCurrentLoad()
|
||||
};
|
||||
|
||||
_members[_config.LocalAgentId] = updated;
|
||||
await _memberStore.UpdateAsync(updated, ct);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task CheckMemberHealthAsync(CancellationToken ct)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var unhealthyThreshold = _config.HeartbeatInterval * 3;
|
||||
|
||||
foreach (var (id, member) in _members)
|
||||
{
|
||||
if (id == _config.LocalAgentId)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var timeSinceHeartbeat = now - member.LastHeartbeat;
|
||||
|
||||
if (timeSinceHeartbeat > unhealthyThreshold && member.Status == MemberStatus.Active)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Member {MemberId} appears unhealthy (no heartbeat for {Duration})",
|
||||
id, timeSinceHeartbeat);
|
||||
|
||||
var updated = member with { Status = MemberStatus.Unhealthy };
|
||||
_members[id] = updated;
|
||||
|
||||
MembershipChanged?.Invoke(this, new MembershipChangedEventArgs
|
||||
{
|
||||
MemberId = id,
|
||||
ChangeType = MembershipChangeType.StatusChanged,
|
||||
OldStatus = member.Status,
|
||||
NewStatus = MemberStatus.Unhealthy
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task SyncClusterStateAsync(CancellationToken ct)
|
||||
{
|
||||
var remoteMembers = await _memberStore.GetAllAsync(ct);
|
||||
|
||||
foreach (var remote in remoteMembers)
|
||||
{
|
||||
if (!_members.ContainsKey(remote.AgentId))
|
||||
{
|
||||
_members[remote.AgentId] = remote;
|
||||
|
||||
MembershipChanged?.Invoke(this, new MembershipChangedEventArgs
|
||||
{
|
||||
MemberId = remote.AgentId,
|
||||
ChangeType = MembershipChangeType.Joined
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
_members[remote.AgentId] = remote;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task StartLeaderElectionAsync(CancellationToken ct)
|
||||
{
|
||||
_leaderElection.LeaderChanged += OnLeaderChanged;
|
||||
await _leaderElection.StartAsync(_config.LocalAgentId, ct);
|
||||
}
|
||||
|
||||
private void OnLeaderChanged(object? sender, string newLeaderId)
|
||||
{
|
||||
var oldLeader = _currentLeaderId;
|
||||
_currentLeaderId = newLeaderId;
|
||||
|
||||
_logger.LogInformation(
|
||||
"Leader changed from {OldLeader} to {NewLeader}",
|
||||
oldLeader ?? "(none)", newLeaderId);
|
||||
|
||||
// Update roles
|
||||
foreach (var (id, member) in _members)
|
||||
{
|
||||
var newRole = id == newLeaderId ? MemberRole.Leader : MemberRole.Follower;
|
||||
if (member.Role != newRole)
|
||||
{
|
||||
_members[id] = member with { Role = newRole };
|
||||
}
|
||||
}
|
||||
|
||||
LeaderChanged?.Invoke(this, new LeaderChangedEventArgs
|
||||
{
|
||||
OldLeaderId = oldLeader,
|
||||
NewLeaderId = newLeaderId
|
||||
});
|
||||
}
|
||||
|
||||
private MemberRole DetermineInitialRole()
|
||||
{
|
||||
return _config.Mode switch
|
||||
{
|
||||
ClusterMode.ActivePassive => MemberRole.Follower,
|
||||
ClusterMode.ActiveActive => MemberRole.Active,
|
||||
ClusterMode.Sharded => MemberRole.Shard,
|
||||
_ => MemberRole.Active
|
||||
};
|
||||
}
|
||||
|
||||
private void UpdateState(ClusterState newState)
|
||||
{
|
||||
var oldState = _state;
|
||||
_state = newState;
|
||||
|
||||
if (oldState != newState)
|
||||
{
|
||||
StateChanged?.Invoke(this, new ClusterStateChangedEventArgs
|
||||
{
|
||||
OldState = oldState,
|
||||
NewState = newState
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private double CalculateCurrentLoad()
|
||||
{
|
||||
// Placeholder - implement actual load calculation
|
||||
return 0.5;
|
||||
}
|
||||
|
||||
private int _roundRobinIndex;
|
||||
private ClusterMember SelectRoundRobin(IReadOnlyList<ClusterMember> members)
|
||||
{
|
||||
var index = Interlocked.Increment(ref _roundRobinIndex) % members.Count;
|
||||
return members[index];
|
||||
}
|
||||
|
||||
private ClusterMember SelectByAffinity(
|
||||
IReadOnlyList<ClusterMember> members,
|
||||
TaskAssignmentContext context)
|
||||
{
|
||||
// Prefer member that handled previous tasks for same target
|
||||
if (context.TargetAffinity is not null)
|
||||
{
|
||||
var affine = members.FirstOrDefault(m =>
|
||||
m.Capabilities.Contains(context.TargetAffinity));
|
||||
|
||||
if (affine is not null)
|
||||
{
|
||||
return affine;
|
||||
}
|
||||
}
|
||||
|
||||
return members.First();
|
||||
}
|
||||
|
||||
private ClusterMember SelectByShard(
|
||||
IReadOnlyList<ClusterMember> members,
|
||||
TaskAssignmentContext context)
|
||||
{
|
||||
// Consistent hashing for shard selection
|
||||
var hash = context.TaskId.GetHashCode();
|
||||
var shardIndex = Math.Abs(hash) % members.Count;
|
||||
return members[shardIndex];
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for agent clustering.
|
||||
/// </summary>
|
||||
public sealed record AgentClusterConfig
|
||||
{
|
||||
public required string LocalAgentId { get; init; }
|
||||
public required string LocalEndpoint { get; init; }
|
||||
public ClusterMode Mode { get; init; } = ClusterMode.ActiveActive;
|
||||
public LoadBalancingStrategy LoadBalancingStrategy { get; init; } = LoadBalancingStrategy.LeastLoaded;
|
||||
public TimeSpan HeartbeatInterval { get; init; } = TimeSpan.FromSeconds(5);
|
||||
public int MinQuorum { get; init; } = 1;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cluster operational mode.
|
||||
/// </summary>
|
||||
public enum ClusterMode
|
||||
{
|
||||
/// <summary>
|
||||
/// One leader handles all work; followers on standby.
|
||||
/// </summary>
|
||||
ActivePassive,
|
||||
|
||||
/// <summary>
|
||||
/// All members handle work equally.
|
||||
/// </summary>
|
||||
ActiveActive,
|
||||
|
||||
/// <summary>
|
||||
/// Work is partitioned across members.
|
||||
/// </summary>
|
||||
Sharded
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Load balancing strategy.
|
||||
/// </summary>
|
||||
public enum LoadBalancingStrategy
|
||||
{
|
||||
RoundRobin,
|
||||
LeastLoaded,
|
||||
AffinityBased,
|
||||
ShardBased
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Cluster state.
|
||||
/// </summary>
|
||||
public enum ClusterState
|
||||
{
|
||||
Initializing,
|
||||
Running,
|
||||
Degraded,
|
||||
Leaving,
|
||||
Left
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A member of the cluster.
|
||||
/// </summary>
|
||||
public sealed record ClusterMember
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required string Endpoint { get; init; }
|
||||
public required DateTimeOffset JoinedAt { get; init; }
|
||||
public required DateTimeOffset LastHeartbeat { get; init; }
|
||||
public required MemberStatus Status { get; init; }
|
||||
public required MemberRole Role { get; init; }
|
||||
public double CurrentLoad { get; init; }
|
||||
public ImmutableHashSet<string> Capabilities { get; init; } = [];
|
||||
public int? ShardId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Member status.
|
||||
/// </summary>
|
||||
public enum MemberStatus
|
||||
{
|
||||
Joining,
|
||||
Active,
|
||||
Unhealthy,
|
||||
Leaving,
|
||||
Left
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Member role.
|
||||
/// </summary>
|
||||
public enum MemberRole
|
||||
{
|
||||
Leader,
|
||||
Follower,
|
||||
Active,
|
||||
Shard
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Context for task assignment.
|
||||
/// </summary>
|
||||
public sealed record TaskAssignmentContext
|
||||
{
|
||||
public required Guid TaskId { get; init; }
|
||||
public string? TargetAffinity { get; init; }
|
||||
public Guid? PreferredAgentId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event args for cluster state changes.
|
||||
/// </summary>
|
||||
public sealed class ClusterStateChangedEventArgs : EventArgs
|
||||
{
|
||||
public required ClusterState OldState { get; init; }
|
||||
public required ClusterState NewState { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event args for leader changes.
|
||||
/// </summary>
|
||||
public sealed class LeaderChangedEventArgs : EventArgs
|
||||
{
|
||||
public string? OldLeaderId { get; init; }
|
||||
public required string NewLeaderId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event args for membership changes.
|
||||
/// </summary>
|
||||
public sealed class MembershipChangedEventArgs : EventArgs
|
||||
{
|
||||
public required string MemberId { get; init; }
|
||||
public required MembershipChangeType ChangeType { get; init; }
|
||||
public MemberStatus? OldStatus { get; init; }
|
||||
public MemberStatus? NewStatus { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Type of membership change.
|
||||
/// </summary>
|
||||
public enum MembershipChangeType
|
||||
{
|
||||
Joined,
|
||||
Left,
|
||||
StatusChanged
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for cluster member storage.
|
||||
/// </summary>
|
||||
public interface IClusterMemberStore
|
||||
{
|
||||
Task RegisterAsync(ClusterMember member, CancellationToken ct = default);
|
||||
Task UpdateAsync(ClusterMember member, CancellationToken ct = default);
|
||||
Task UnregisterAsync(string agentId, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<ClusterMember>> GetAllAsync(CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for leader election.
|
||||
/// </summary>
|
||||
public interface ILeaderElection
|
||||
{
|
||||
event EventHandler<string>? LeaderChanged;
|
||||
Task StartAsync(string candidateId, CancellationToken ct = default);
|
||||
Task ResignAsync(CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,468 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Durable task queue with delivery guarantees and dead-letter handling.
|
||||
/// </summary>
|
||||
public sealed class DurableTaskQueue : BackgroundService
|
||||
{
|
||||
private readonly IDurableTaskStore _store;
|
||||
private readonly Channel<QueuedTask> _channel;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly DurableTaskQueueConfig _config;
|
||||
private readonly ILogger<DurableTaskQueue> _logger;
|
||||
private readonly ConcurrentDictionary<Guid, QueuedTask> _inFlight = new();
|
||||
|
||||
public event EventHandler<TaskQueueEventArgs>? TaskEnqueued;
|
||||
public event EventHandler<TaskQueueEventArgs>? TaskDequeued;
|
||||
public event EventHandler<TaskQueueEventArgs>? TaskCompleted;
|
||||
public event EventHandler<TaskQueueEventArgs>? TaskFailed;
|
||||
public event EventHandler<TaskQueueEventArgs>? TaskDeadLettered;
|
||||
|
||||
public DurableTaskQueue(
|
||||
IDurableTaskStore store,
|
||||
TimeProvider timeProvider,
|
||||
DurableTaskQueueConfig config,
|
||||
ILogger<DurableTaskQueue> logger)
|
||||
{
|
||||
_store = store;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
_channel = Channel.CreateBounded<QueuedTask>(new BoundedChannelOptions(config.MaxQueueSize)
|
||||
{
|
||||
FullMode = BoundedChannelFullMode.Wait
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of tasks currently in queue.
|
||||
/// </summary>
|
||||
public int QueuedCount => _channel.Reader.Count;
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of tasks currently in flight.
|
||||
/// </summary>
|
||||
public int InFlightCount => _inFlight.Count;
|
||||
|
||||
/// <summary>
|
||||
/// Enqueues a task with durability.
|
||||
/// </summary>
|
||||
public async Task<EnqueueResult> EnqueueAsync(
|
||||
TaskPayload payload,
|
||||
EnqueueOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(payload);
|
||||
options ??= new EnqueueOptions();
|
||||
|
||||
var task = new QueuedTask
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
Payload = payload,
|
||||
Priority = options.Priority,
|
||||
EnqueuedAt = _timeProvider.GetUtcNow(),
|
||||
Status = QueuedTaskStatus.Pending,
|
||||
AttemptCount = 0,
|
||||
MaxRetries = options.MaxRetries ?? _config.DefaultMaxRetries,
|
||||
Timeout = options.Timeout ?? _config.DefaultTimeout,
|
||||
ScheduledFor = options.ScheduledFor
|
||||
};
|
||||
|
||||
// Persist first for durability
|
||||
await _store.SaveAsync(task, ct);
|
||||
|
||||
// Only queue if not scheduled for later
|
||||
if (!options.ScheduledFor.HasValue || options.ScheduledFor <= _timeProvider.GetUtcNow())
|
||||
{
|
||||
await _channel.Writer.WriteAsync(task, ct);
|
||||
}
|
||||
|
||||
_logger.LogDebug(
|
||||
"Enqueued task {TaskId} with priority {Priority}",
|
||||
task.Id, task.Priority);
|
||||
|
||||
TaskEnqueued?.Invoke(this, new TaskQueueEventArgs { Task = task });
|
||||
|
||||
return new EnqueueResult
|
||||
{
|
||||
TaskId = task.Id,
|
||||
Success = true,
|
||||
QueuePosition = _channel.Reader.Count
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Dequeues a task for processing.
|
||||
/// </summary>
|
||||
public async Task<QueuedTask?> DequeueAsync(CancellationToken ct = default)
|
||||
{
|
||||
try
|
||||
{
|
||||
var task = await _channel.Reader.ReadAsync(ct);
|
||||
|
||||
// Mark as in-flight
|
||||
task = task with
|
||||
{
|
||||
Status = QueuedTaskStatus.Processing,
|
||||
StartedAt = _timeProvider.GetUtcNow(),
|
||||
AttemptCount = task.AttemptCount + 1
|
||||
};
|
||||
|
||||
_inFlight[task.Id] = task;
|
||||
await _store.SaveAsync(task, ct);
|
||||
|
||||
_logger.LogDebug(
|
||||
"Dequeued task {TaskId} (attempt {Attempt}/{MaxRetries})",
|
||||
task.Id, task.AttemptCount, task.MaxRetries);
|
||||
|
||||
TaskDequeued?.Invoke(this, new TaskQueueEventArgs { Task = task });
|
||||
|
||||
return task;
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Acknowledges successful task completion.
|
||||
/// </summary>
|
||||
public async Task AcknowledgeAsync(Guid taskId, CancellationToken ct = default)
|
||||
{
|
||||
if (!_inFlight.TryRemove(taskId, out var task))
|
||||
{
|
||||
_logger.LogWarning("Task {TaskId} not found in flight", taskId);
|
||||
return;
|
||||
}
|
||||
|
||||
task = task with
|
||||
{
|
||||
Status = QueuedTaskStatus.Completed,
|
||||
CompletedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
await _store.SaveAsync(task, ct);
|
||||
|
||||
_logger.LogDebug("Task {TaskId} acknowledged", taskId);
|
||||
|
||||
TaskCompleted?.Invoke(this, new TaskQueueEventArgs { Task = task });
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reports task failure with optional retry.
|
||||
/// </summary>
|
||||
public async Task NackAsync(
|
||||
Guid taskId,
|
||||
string? error = null,
|
||||
bool retry = true,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (!_inFlight.TryRemove(taskId, out var task))
|
||||
{
|
||||
_logger.LogWarning("Task {TaskId} not found in flight", taskId);
|
||||
return;
|
||||
}
|
||||
|
||||
var canRetry = retry && task.AttemptCount < task.MaxRetries;
|
||||
|
||||
if (canRetry)
|
||||
{
|
||||
// Calculate backoff delay
|
||||
var delay = CalculateBackoff(task.AttemptCount);
|
||||
|
||||
task = task with
|
||||
{
|
||||
Status = QueuedTaskStatus.Pending,
|
||||
LastError = error,
|
||||
ScheduledFor = _timeProvider.GetUtcNow() + delay
|
||||
};
|
||||
|
||||
await _store.SaveAsync(task, ct);
|
||||
|
||||
_logger.LogWarning(
|
||||
"Task {TaskId} failed (attempt {Attempt}), retrying in {Delay}",
|
||||
taskId, task.AttemptCount, delay);
|
||||
|
||||
TaskFailed?.Invoke(this, new TaskQueueEventArgs
|
||||
{
|
||||
Task = task,
|
||||
WillRetry = true
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
// Move to dead-letter queue
|
||||
task = task with
|
||||
{
|
||||
Status = QueuedTaskStatus.DeadLettered,
|
||||
LastError = error,
|
||||
DeadLetteredAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
await _store.SaveAsync(task, ct);
|
||||
await _store.MoveToDeadLetterAsync(task, ct);
|
||||
|
||||
_logger.LogError(
|
||||
"Task {TaskId} moved to dead-letter after {Attempts} attempts: {Error}",
|
||||
taskId, task.AttemptCount, error);
|
||||
|
||||
TaskDeadLettered?.Invoke(this, new TaskQueueEventArgs { Task = task });
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all tasks in the dead-letter queue.
|
||||
/// </summary>
|
||||
public async Task<IReadOnlyList<QueuedTask>> GetDeadLetterQueueAsync(
|
||||
int limit = 100,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await _store.GetDeadLetterQueueAsync(limit, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Retries a dead-lettered task.
|
||||
/// </summary>
|
||||
public async Task<bool> RetryDeadLetterAsync(
|
||||
Guid taskId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var task = await _store.GetDeadLetterTaskAsync(taskId, ct);
|
||||
if (task is null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
task = task with
|
||||
{
|
||||
Status = QueuedTaskStatus.Pending,
|
||||
AttemptCount = 0,
|
||||
LastError = null,
|
||||
DeadLetteredAt = null,
|
||||
ScheduledFor = null
|
||||
};
|
||||
|
||||
await _store.RemoveFromDeadLetterAsync(taskId, ct);
|
||||
await _store.SaveAsync(task, ct);
|
||||
await _channel.Writer.WriteAsync(task, ct);
|
||||
|
||||
_logger.LogInformation("Retried dead-lettered task {TaskId}", taskId);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
// Recover in-flight tasks from previous run
|
||||
await RecoverInFlightTasksAsync(stoppingToken);
|
||||
|
||||
// Process scheduled tasks
|
||||
using var timer = new PeriodicTimer(TimeSpan.FromSeconds(1));
|
||||
|
||||
while (await timer.WaitForNextTickAsync(stoppingToken))
|
||||
{
|
||||
await ProcessScheduledTasksAsync(stoppingToken);
|
||||
await ProcessTimedOutTasksAsync(stoppingToken);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task RecoverInFlightTasksAsync(CancellationToken ct)
|
||||
{
|
||||
var inFlightTasks = await _store.GetInFlightTasksAsync(ct);
|
||||
|
||||
foreach (var task in inFlightTasks)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Recovering in-flight task {TaskId} from previous run",
|
||||
task.Id);
|
||||
|
||||
// Re-queue for processing
|
||||
var recovered = task with
|
||||
{
|
||||
Status = QueuedTaskStatus.Pending,
|
||||
ScheduledFor = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
await _store.SaveAsync(recovered, ct);
|
||||
await _channel.Writer.WriteAsync(recovered, ct);
|
||||
}
|
||||
|
||||
if (inFlightTasks.Count > 0)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Recovered {Count} in-flight tasks",
|
||||
inFlightTasks.Count);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task ProcessScheduledTasksAsync(CancellationToken ct)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var scheduledTasks = await _store.GetScheduledTasksAsync(now, ct);
|
||||
|
||||
foreach (var task in scheduledTasks)
|
||||
{
|
||||
await _channel.Writer.WriteAsync(task, ct);
|
||||
|
||||
_logger.LogDebug(
|
||||
"Scheduled task {TaskId} is now ready for processing",
|
||||
task.Id);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task ProcessTimedOutTasksAsync(CancellationToken ct)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
foreach (var (taskId, task) in _inFlight)
|
||||
{
|
||||
if (!task.StartedAt.HasValue)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var elapsed = now - task.StartedAt.Value;
|
||||
|
||||
if (elapsed > task.Timeout)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Task {TaskId} timed out after {Elapsed}",
|
||||
taskId, elapsed);
|
||||
|
||||
await NackAsync(taskId, "Task timed out", retry: true, ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private TimeSpan CalculateBackoff(int attemptCount)
|
||||
{
|
||||
var baseDelay = _config.RetryBaseDelay;
|
||||
var multiplier = Math.Pow(2, attemptCount - 1);
|
||||
var delay = baseDelay * multiplier;
|
||||
|
||||
// Add jitter
|
||||
var jitter = Random.Shared.NextDouble() * 0.3 * delay.TotalMilliseconds;
|
||||
delay = delay.Add(TimeSpan.FromMilliseconds(jitter));
|
||||
|
||||
// Cap at max delay
|
||||
return delay > _config.RetryMaxDelay ? _config.RetryMaxDelay : delay;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for durable task queue.
|
||||
/// </summary>
|
||||
public sealed record DurableTaskQueueConfig
|
||||
{
|
||||
public int MaxQueueSize { get; init; } = 10000;
|
||||
public int DefaultMaxRetries { get; init; } = 3;
|
||||
public TimeSpan DefaultTimeout { get; init; } = TimeSpan.FromMinutes(30);
|
||||
public TimeSpan RetryBaseDelay { get; init; } = TimeSpan.FromSeconds(5);
|
||||
public TimeSpan RetryMaxDelay { get; init; } = TimeSpan.FromMinutes(5);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for enqueueing a task.
|
||||
/// </summary>
|
||||
public sealed record EnqueueOptions
|
||||
{
|
||||
public TaskPriority Priority { get; init; } = TaskPriority.Normal;
|
||||
public int? MaxRetries { get; init; }
|
||||
public TimeSpan? Timeout { get; init; }
|
||||
public DateTimeOffset? ScheduledFor { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of enqueue operation.
|
||||
/// </summary>
|
||||
public sealed record EnqueueResult
|
||||
{
|
||||
public required Guid TaskId { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public int QueuePosition { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A queued task.
|
||||
/// </summary>
|
||||
public sealed record QueuedTask
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required TaskPayload Payload { get; init; }
|
||||
public required TaskPriority Priority { get; init; }
|
||||
public required DateTimeOffset EnqueuedAt { get; init; }
|
||||
public required QueuedTaskStatus Status { get; init; }
|
||||
public required int AttemptCount { get; init; }
|
||||
public required int MaxRetries { get; init; }
|
||||
public required TimeSpan Timeout { get; init; }
|
||||
public DateTimeOffset? ScheduledFor { get; init; }
|
||||
public DateTimeOffset? StartedAt { get; init; }
|
||||
public DateTimeOffset? CompletedAt { get; init; }
|
||||
public DateTimeOffset? DeadLetteredAt { get; init; }
|
||||
public string? LastError { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Payload for a task.
|
||||
/// </summary>
|
||||
public sealed record TaskPayload
|
||||
{
|
||||
public required string TaskType { get; init; }
|
||||
public required ImmutableDictionary<string, object?> Data { get; init; }
|
||||
public string? TargetAgentId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Task priority.
|
||||
/// </summary>
|
||||
public enum TaskPriority
|
||||
{
|
||||
Low = 0,
|
||||
Normal = 1,
|
||||
High = 2,
|
||||
Critical = 3
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Status of a queued task.
|
||||
/// </summary>
|
||||
public enum QueuedTaskStatus
|
||||
{
|
||||
Pending,
|
||||
Processing,
|
||||
Completed,
|
||||
Failed,
|
||||
DeadLettered
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event args for task queue events.
|
||||
/// </summary>
|
||||
public sealed class TaskQueueEventArgs : EventArgs
|
||||
{
|
||||
public required QueuedTask Task { get; init; }
|
||||
public bool WillRetry { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for durable task storage.
|
||||
/// </summary>
|
||||
public interface IDurableTaskStore
|
||||
{
|
||||
Task SaveAsync(QueuedTask task, CancellationToken ct = default);
|
||||
Task<QueuedTask?> GetAsync(Guid taskId, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<QueuedTask>> GetInFlightTasksAsync(CancellationToken ct = default);
|
||||
Task<IReadOnlyList<QueuedTask>> GetScheduledTasksAsync(DateTimeOffset cutoff, CancellationToken ct = default);
|
||||
Task MoveToDeadLetterAsync(QueuedTask task, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<QueuedTask>> GetDeadLetterQueueAsync(int limit, CancellationToken ct = default);
|
||||
Task<QueuedTask?> GetDeadLetterTaskAsync(Guid taskId, CancellationToken ct = default);
|
||||
Task RemoveFromDeadLetterAsync(Guid taskId, CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,374 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Manages failover when agents become unhealthy.
|
||||
/// </summary>
|
||||
public sealed class FailoverManager
|
||||
{
|
||||
private readonly AgentClusterManager _clusterManager;
|
||||
private readonly ITaskTransferService _taskTransfer;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly FailoverConfig _config;
|
||||
private readonly ILogger<FailoverManager> _logger;
|
||||
private readonly ConcurrentDictionary<string, FailoverAttempt> _activeFailovers = new();
|
||||
|
||||
public event EventHandler<FailoverEventArgs>? FailoverStarted;
|
||||
public event EventHandler<FailoverEventArgs>? FailoverCompleted;
|
||||
public event EventHandler<FailoverEventArgs>? FailoverFailed;
|
||||
|
||||
public FailoverManager(
|
||||
AgentClusterManager clusterManager,
|
||||
ITaskTransferService taskTransfer,
|
||||
TimeProvider timeProvider,
|
||||
FailoverConfig config,
|
||||
ILogger<FailoverManager> logger)
|
||||
{
|
||||
_clusterManager = clusterManager;
|
||||
_taskTransfer = taskTransfer;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
|
||||
_clusterManager.MembershipChanged += OnMembershipChanged;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initiates failover for a failed agent.
|
||||
/// </summary>
|
||||
public async Task<FailoverResult> InitiateFailoverAsync(
|
||||
string failedAgentId,
|
||||
FailoverReason reason,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (_activeFailovers.ContainsKey(failedAgentId))
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Failover already in progress for agent {AgentId}",
|
||||
failedAgentId);
|
||||
|
||||
return new FailoverResult
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Success = false,
|
||||
Reason = reason,
|
||||
Error = "Failover already in progress"
|
||||
};
|
||||
}
|
||||
|
||||
var attempt = new FailoverAttempt
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Reason = reason,
|
||||
StartedAt = _timeProvider.GetUtcNow(),
|
||||
Status = FailoverStatus.InProgress
|
||||
};
|
||||
|
||||
_activeFailovers[failedAgentId] = attempt;
|
||||
|
||||
FailoverStarted?.Invoke(this, new FailoverEventArgs
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Reason = reason
|
||||
});
|
||||
|
||||
_logger.LogInformation(
|
||||
"Initiating failover for agent {AgentId} due to {Reason}",
|
||||
failedAgentId, reason);
|
||||
|
||||
try
|
||||
{
|
||||
// Get tasks from failed agent
|
||||
var tasks = await _taskTransfer.GetPendingTasksAsync(failedAgentId, ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Found {TaskCount} tasks to transfer from failed agent {AgentId}",
|
||||
tasks.Count, failedAgentId);
|
||||
|
||||
// Select target agents
|
||||
var transferred = new List<TaskTransferRecord>();
|
||||
var failed = new List<TaskTransferRecord>();
|
||||
|
||||
foreach (var task in tasks)
|
||||
{
|
||||
var targetMember = _clusterManager.SelectMemberForTask(new TaskAssignmentContext
|
||||
{
|
||||
TaskId = task.TaskId,
|
||||
TargetAffinity = task.TargetId
|
||||
});
|
||||
|
||||
if (targetMember is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"No available agent for task {TaskId}",
|
||||
task.TaskId);
|
||||
|
||||
failed.Add(new TaskTransferRecord
|
||||
{
|
||||
TaskId = task.TaskId,
|
||||
SourceAgentId = failedAgentId,
|
||||
Status = TaskTransferStatus.NoTargetAvailable
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await _taskTransfer.TransferTaskAsync(
|
||||
task.TaskId,
|
||||
failedAgentId,
|
||||
targetMember.AgentId,
|
||||
ct);
|
||||
|
||||
transferred.Add(new TaskTransferRecord
|
||||
{
|
||||
TaskId = task.TaskId,
|
||||
SourceAgentId = failedAgentId,
|
||||
TargetAgentId = targetMember.AgentId,
|
||||
Status = TaskTransferStatus.Transferred,
|
||||
TransferredAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
|
||||
_logger.LogDebug(
|
||||
"Transferred task {TaskId} to agent {TargetAgentId}",
|
||||
task.TaskId, targetMember.AgentId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Failed to transfer task {TaskId} to {TargetAgentId}",
|
||||
task.TaskId, targetMember.AgentId);
|
||||
|
||||
failed.Add(new TaskTransferRecord
|
||||
{
|
||||
TaskId = task.TaskId,
|
||||
SourceAgentId = failedAgentId,
|
||||
TargetAgentId = targetMember.AgentId,
|
||||
Status = TaskTransferStatus.Failed,
|
||||
Error = ex.Message
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
var completedAt = _timeProvider.GetUtcNow();
|
||||
var success = failed.Count == 0;
|
||||
|
||||
attempt = attempt with
|
||||
{
|
||||
CompletedAt = completedAt,
|
||||
Status = success ? FailoverStatus.Completed : FailoverStatus.PartialSuccess,
|
||||
TransferredTasks = transferred.ToImmutableArray(),
|
||||
FailedTasks = failed.ToImmutableArray()
|
||||
};
|
||||
|
||||
_activeFailovers[failedAgentId] = attempt;
|
||||
|
||||
var result = new FailoverResult
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Success = success,
|
||||
Reason = reason,
|
||||
TransferredTasks = transferred.ToImmutableArray(),
|
||||
FailedTasks = failed.ToImmutableArray(),
|
||||
Duration = completedAt - attempt.StartedAt
|
||||
};
|
||||
|
||||
FailoverCompleted?.Invoke(this, new FailoverEventArgs
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Reason = reason,
|
||||
Result = result
|
||||
});
|
||||
|
||||
_logger.LogInformation(
|
||||
"Failover for agent {AgentId} completed: {TransferredCount} transferred, {FailedCount} failed",
|
||||
failedAgentId, transferred.Count, failed.Count);
|
||||
|
||||
return result;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Failover failed for agent {AgentId}",
|
||||
failedAgentId);
|
||||
|
||||
attempt = attempt with
|
||||
{
|
||||
CompletedAt = _timeProvider.GetUtcNow(),
|
||||
Status = FailoverStatus.Failed,
|
||||
Error = ex.Message
|
||||
};
|
||||
|
||||
_activeFailovers[failedAgentId] = attempt;
|
||||
|
||||
FailoverFailed?.Invoke(this, new FailoverEventArgs
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Reason = reason,
|
||||
Error = ex.Message
|
||||
});
|
||||
|
||||
return new FailoverResult
|
||||
{
|
||||
FailedAgentId = failedAgentId,
|
||||
Success = false,
|
||||
Reason = reason,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
finally
|
||||
{
|
||||
_activeFailovers.TryRemove(failedAgentId, out _);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the status of an active failover.
|
||||
/// </summary>
|
||||
public FailoverAttempt? GetFailoverStatus(string agentId)
|
||||
{
|
||||
return _activeFailovers.TryGetValue(agentId, out var attempt) ? attempt : null;
|
||||
}
|
||||
|
||||
private async void OnMembershipChanged(object? sender, MembershipChangedEventArgs e)
|
||||
{
|
||||
if (e.ChangeType == MembershipChangeType.StatusChanged &&
|
||||
e.NewStatus == MemberStatus.Unhealthy &&
|
||||
_config.AutoFailoverEnabled)
|
||||
{
|
||||
try
|
||||
{
|
||||
await InitiateFailoverAsync(
|
||||
e.MemberId,
|
||||
FailoverReason.AgentUnhealthy,
|
||||
CancellationToken.None);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Auto-failover failed for agent {AgentId}",
|
||||
e.MemberId);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for failover.
|
||||
/// </summary>
|
||||
public sealed record FailoverConfig
|
||||
{
|
||||
public bool AutoFailoverEnabled { get; init; } = true;
|
||||
public TimeSpan FailoverTimeout { get; init; } = TimeSpan.FromMinutes(5);
|
||||
public int MaxRetries { get; init; } = 3;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a failover operation.
|
||||
/// </summary>
|
||||
public sealed record FailoverResult
|
||||
{
|
||||
public required string FailedAgentId { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required FailoverReason Reason { get; init; }
|
||||
public string? Error { get; init; }
|
||||
public ImmutableArray<TaskTransferRecord> TransferredTasks { get; init; } = [];
|
||||
public ImmutableArray<TaskTransferRecord> FailedTasks { get; init; } = [];
|
||||
public TimeSpan Duration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Record of a task transfer.
|
||||
/// </summary>
|
||||
public sealed record TaskTransferRecord
|
||||
{
|
||||
public required Guid TaskId { get; init; }
|
||||
public required string SourceAgentId { get; init; }
|
||||
public string? TargetAgentId { get; init; }
|
||||
public required TaskTransferStatus Status { get; init; }
|
||||
public DateTimeOffset? TransferredAt { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Status of task transfer.
|
||||
/// </summary>
|
||||
public enum TaskTransferStatus
|
||||
{
|
||||
Pending,
|
||||
Transferred,
|
||||
Failed,
|
||||
NoTargetAvailable
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A failover attempt.
|
||||
/// </summary>
|
||||
public sealed record FailoverAttempt
|
||||
{
|
||||
public required string FailedAgentId { get; init; }
|
||||
public required FailoverReason Reason { get; init; }
|
||||
public required DateTimeOffset StartedAt { get; init; }
|
||||
public DateTimeOffset? CompletedAt { get; init; }
|
||||
public required FailoverStatus Status { get; init; }
|
||||
public ImmutableArray<TaskTransferRecord> TransferredTasks { get; init; } = [];
|
||||
public ImmutableArray<TaskTransferRecord> FailedTasks { get; init; } = [];
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reason for failover.
|
||||
/// </summary>
|
||||
public enum FailoverReason
|
||||
{
|
||||
AgentUnhealthy,
|
||||
NetworkPartition,
|
||||
ResourceExhaustion,
|
||||
ManualTrigger,
|
||||
GracefulShutdown
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Status of failover.
|
||||
/// </summary>
|
||||
public enum FailoverStatus
|
||||
{
|
||||
InProgress,
|
||||
Completed,
|
||||
PartialSuccess,
|
||||
Failed
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event args for failover events.
|
||||
/// </summary>
|
||||
public sealed class FailoverEventArgs : EventArgs
|
||||
{
|
||||
public required string FailedAgentId { get; init; }
|
||||
public required FailoverReason Reason { get; init; }
|
||||
public FailoverResult? Result { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Task pending on an agent.
|
||||
/// </summary>
|
||||
public sealed record PendingTask
|
||||
{
|
||||
public required Guid TaskId { get; init; }
|
||||
public required string TargetId { get; init; }
|
||||
public required string TaskType { get; init; }
|
||||
public DateTimeOffset CreatedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for task transfer operations.
|
||||
/// </summary>
|
||||
public interface ITaskTransferService
|
||||
{
|
||||
Task<IReadOnlyList<PendingTask>> GetPendingTasksAsync(string agentId, CancellationToken ct = default);
|
||||
Task TransferTaskAsync(Guid taskId, string sourceAgentId, string targetAgentId, CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,880 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// HealthMonitor.cs
|
||||
// Sprint: SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience
|
||||
// Task: TASK-034-02 - Health Monitor with multi-factor assessment
|
||||
// Description: Comprehensive health monitoring with multiple factors and trend analysis
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Multi-factor health monitor for agent cluster nodes.
|
||||
/// Combines multiple health signals into overall health assessment.
|
||||
/// </summary>
|
||||
public sealed class HealthMonitor : IHealthMonitor, IAsyncDisposable
|
||||
{
|
||||
private readonly IMetricsProvider _metricsProvider;
|
||||
private readonly IConnectivityChecker _connectivityChecker;
|
||||
private readonly HealthMonitorConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<HealthMonitor> _logger;
|
||||
|
||||
private readonly ConcurrentDictionary<string, AgentHealthState> _agentStates = new();
|
||||
private readonly ConcurrentDictionary<string, HealthHistory> _healthHistories = new();
|
||||
private readonly ConcurrentDictionary<string, Func<CancellationToken, Task<HealthCheckResult>>> _customChecks = new();
|
||||
|
||||
private CancellationTokenSource? _monitoringCts;
|
||||
private Task? _monitoringTask;
|
||||
|
||||
public HealthMonitor(
|
||||
IMetricsProvider metricsProvider,
|
||||
IConnectivityChecker connectivityChecker,
|
||||
HealthMonitorConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<HealthMonitor> logger)
|
||||
{
|
||||
_metricsProvider = metricsProvider;
|
||||
_connectivityChecker = connectivityChecker;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts continuous health monitoring for all registered agents.
|
||||
/// </summary>
|
||||
public async Task StartAsync(CancellationToken ct = default)
|
||||
{
|
||||
if (_monitoringTask is not null)
|
||||
{
|
||||
_logger.LogWarning("Health monitoring already started");
|
||||
return;
|
||||
}
|
||||
|
||||
_monitoringCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_monitoringTask = MonitorHealthLoopAsync(_monitoringCts.Token);
|
||||
|
||||
_logger.LogInformation("Health monitoring started with interval {Interval}",
|
||||
_config.CheckInterval);
|
||||
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stops health monitoring.
|
||||
/// </summary>
|
||||
public async Task StopAsync()
|
||||
{
|
||||
if (_monitoringCts is null) return;
|
||||
|
||||
await _monitoringCts.CancelAsync();
|
||||
|
||||
if (_monitoringTask is not null)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _monitoringTask.WaitAsync(TimeSpan.FromSeconds(5));
|
||||
}
|
||||
catch (OperationCanceledException) { }
|
||||
catch (TimeoutException) { }
|
||||
}
|
||||
|
||||
_monitoringCts.Dispose();
|
||||
_monitoringCts = null;
|
||||
_monitoringTask = null;
|
||||
|
||||
_logger.LogInformation("Health monitoring stopped");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers an agent for health monitoring.
|
||||
/// </summary>
|
||||
public void RegisterAgent(string agentId, AgentEndpoint endpoint)
|
||||
{
|
||||
var state = new AgentHealthState
|
||||
{
|
||||
AgentId = agentId,
|
||||
Endpoint = endpoint,
|
||||
Status = AgentHealthStatus.Unknown,
|
||||
RegisteredAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
_agentStates[agentId] = state;
|
||||
_healthHistories[agentId] = new HealthHistory(_config.HistorySize);
|
||||
|
||||
_logger.LogDebug("Registered agent {AgentId} for health monitoring", agentId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Unregisters an agent from health monitoring.
|
||||
/// </summary>
|
||||
public void UnregisterAgent(string agentId)
|
||||
{
|
||||
_agentStates.TryRemove(agentId, out _);
|
||||
_healthHistories.TryRemove(agentId, out _);
|
||||
|
||||
_logger.LogDebug("Unregistered agent {AgentId} from health monitoring", agentId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers a custom health check.
|
||||
/// </summary>
|
||||
public void RegisterCustomCheck(string name, Func<CancellationToken, Task<HealthCheckResult>> check)
|
||||
{
|
||||
_customChecks[name] = check;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets comprehensive health assessment for an agent.
|
||||
/// </summary>
|
||||
public async Task<AgentHealthAssessment> AssessHealthAsync(
|
||||
string agentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (!_agentStates.TryGetValue(agentId, out var state))
|
||||
{
|
||||
throw new InvalidOperationException($"Agent {agentId} is not registered");
|
||||
}
|
||||
|
||||
var factors = await CollectHealthFactorsAsync(state, ct);
|
||||
var overallScore = CalculateOverallScore(factors);
|
||||
var status = DetermineStatus(overallScore, factors);
|
||||
var trend = AnalyzeTrend(agentId);
|
||||
|
||||
var assessment = new AgentHealthAssessment
|
||||
{
|
||||
AgentId = agentId,
|
||||
Status = status,
|
||||
OverallScore = overallScore,
|
||||
Factors = factors,
|
||||
Trend = trend,
|
||||
AssessedAt = _timeProvider.GetUtcNow(),
|
||||
Recommendation = GenerateRecommendation(status, factors, trend)
|
||||
};
|
||||
|
||||
// Update state
|
||||
UpdateAgentState(agentId, assessment);
|
||||
|
||||
return assessment;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets health assessments for all registered agents.
|
||||
/// </summary>
|
||||
public async Task<ImmutableArray<AgentHealthAssessment>> AssessAllAgentsAsync(
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var assessments = new List<AgentHealthAssessment>();
|
||||
|
||||
foreach (var agentId in _agentStates.Keys)
|
||||
{
|
||||
try
|
||||
{
|
||||
var assessment = await AssessHealthAsync(agentId, ct);
|
||||
assessments.Add(assessment);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to assess health for agent {AgentId}", agentId);
|
||||
}
|
||||
}
|
||||
|
||||
return assessments.ToImmutableArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets current status of all agents.
|
||||
/// </summary>
|
||||
public ImmutableDictionary<string, AgentHealthStatus> GetAllAgentStatuses()
|
||||
{
|
||||
return _agentStates.ToImmutableDictionary(
|
||||
kv => kv.Key,
|
||||
kv => kv.Value.Status);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets agents in a specific health status.
|
||||
/// </summary>
|
||||
public ImmutableArray<string> GetAgentsByStatus(AgentHealthStatus status)
|
||||
{
|
||||
return _agentStates
|
||||
.Where(kv => kv.Value.Status == status)
|
||||
.Select(kv => kv.Key)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when agent health status changes.
|
||||
/// </summary>
|
||||
public event EventHandler<AgentHealthChangedEventArgs>? HealthChanged;
|
||||
|
||||
private async Task MonitorHealthLoopAsync(CancellationToken ct)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await AssessAllAgentsAsync(ct);
|
||||
await Task.Delay(_config.CheckInterval, ct);
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in health monitoring loop");
|
||||
await Task.Delay(TimeSpan.FromSeconds(5), ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<HealthFactor>> CollectHealthFactorsAsync(
|
||||
AgentHealthState state,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var factors = new List<HealthFactor>();
|
||||
|
||||
// Factor 1: Connectivity/Liveness
|
||||
var connectivity = await CheckConnectivityAsync(state, ct);
|
||||
factors.Add(connectivity);
|
||||
|
||||
// Factor 2: Resource utilization
|
||||
var resources = await CheckResourcesAsync(state, ct);
|
||||
factors.Add(resources);
|
||||
|
||||
// Factor 3: Task processing health
|
||||
var taskHealth = await CheckTaskHealthAsync(state, ct);
|
||||
factors.Add(taskHealth);
|
||||
|
||||
// Factor 4: Response latency
|
||||
var latency = await CheckLatencyAsync(state, ct);
|
||||
factors.Add(latency);
|
||||
|
||||
// Factor 5: Error rate
|
||||
var errorRate = await CheckErrorRateAsync(state, ct);
|
||||
factors.Add(errorRate);
|
||||
|
||||
// Factor 6: Queue depth
|
||||
var queueDepth = await CheckQueueDepthAsync(state, ct);
|
||||
factors.Add(queueDepth);
|
||||
|
||||
// Custom checks
|
||||
foreach (var (name, check) in _customChecks)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await check(ct);
|
||||
factors.Add(new HealthFactor
|
||||
{
|
||||
Name = name,
|
||||
Score = result.Score,
|
||||
Status = result.Status,
|
||||
Weight = 1.0,
|
||||
Details = result.Details
|
||||
});
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Custom health check {Name} failed", name);
|
||||
factors.Add(new HealthFactor
|
||||
{
|
||||
Name = name,
|
||||
Score = 0,
|
||||
Status = FactorStatus.Failed,
|
||||
Weight = 1.0,
|
||||
Details = ex.Message
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return factors.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<HealthFactor> CheckConnectivityAsync(AgentHealthState state, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await _connectivityChecker.CheckAsync(state.Endpoint, ct);
|
||||
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "Connectivity",
|
||||
Score = result.IsReachable ? 1.0 : 0.0,
|
||||
Status = result.IsReachable ? FactorStatus.Healthy : FactorStatus.Critical,
|
||||
Weight = _config.ConnectivityWeight,
|
||||
Details = result.IsReachable ? "Agent reachable" : $"Agent unreachable: {result.Error}"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "Connectivity",
|
||||
Score = 0,
|
||||
Status = FactorStatus.Critical,
|
||||
Weight = _config.ConnectivityWeight,
|
||||
Details = $"Connectivity check failed: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<HealthFactor> CheckResourcesAsync(AgentHealthState state, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var metrics = await _metricsProvider.GetResourceMetricsAsync(state.AgentId, ct);
|
||||
|
||||
var cpuScore = 1.0 - Math.Min(metrics.CpuPercent / 100.0, 1.0);
|
||||
var memoryScore = 1.0 - Math.Min(metrics.MemoryPercent / 100.0, 1.0);
|
||||
var diskScore = 1.0 - Math.Min(metrics.DiskPercent / 100.0, 1.0);
|
||||
|
||||
var overallScore = (cpuScore * 0.4 + memoryScore * 0.4 + diskScore * 0.2);
|
||||
|
||||
var status = overallScore switch
|
||||
{
|
||||
>= 0.7 => FactorStatus.Healthy,
|
||||
>= 0.4 => FactorStatus.Warning,
|
||||
>= 0.2 => FactorStatus.Degraded,
|
||||
_ => FactorStatus.Critical
|
||||
};
|
||||
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "Resources",
|
||||
Score = overallScore,
|
||||
Status = status,
|
||||
Weight = _config.ResourceWeight,
|
||||
Details = $"CPU: {metrics.CpuPercent:F1}%, Memory: {metrics.MemoryPercent:F1}%, Disk: {metrics.DiskPercent:F1}%"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "Resources",
|
||||
Score = 0.5, // Unknown = neutral
|
||||
Status = FactorStatus.Unknown,
|
||||
Weight = _config.ResourceWeight,
|
||||
Details = $"Resource check failed: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<HealthFactor> CheckTaskHealthAsync(AgentHealthState state, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var metrics = await _metricsProvider.GetTaskMetricsAsync(state.AgentId, ct);
|
||||
|
||||
var successRate = metrics.TotalTasks > 0
|
||||
? (double)metrics.SuccessfulTasks / metrics.TotalTasks
|
||||
: 1.0;
|
||||
|
||||
var status = successRate switch
|
||||
{
|
||||
>= 0.95 => FactorStatus.Healthy,
|
||||
>= 0.85 => FactorStatus.Warning,
|
||||
>= 0.70 => FactorStatus.Degraded,
|
||||
_ => FactorStatus.Critical
|
||||
};
|
||||
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "TaskHealth",
|
||||
Score = successRate,
|
||||
Status = status,
|
||||
Weight = _config.TaskHealthWeight,
|
||||
Details = $"Success rate: {successRate:P1} ({metrics.SuccessfulTasks}/{metrics.TotalTasks})"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "TaskHealth",
|
||||
Score = 0.5,
|
||||
Status = FactorStatus.Unknown,
|
||||
Weight = _config.TaskHealthWeight,
|
||||
Details = $"Task health check failed: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<HealthFactor> CheckLatencyAsync(AgentHealthState state, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var latency = await _connectivityChecker.MeasureLatencyAsync(state.Endpoint, ct);
|
||||
|
||||
var score = latency.TotalMilliseconds switch
|
||||
{
|
||||
<= 50 => 1.0,
|
||||
<= 100 => 0.9,
|
||||
<= 250 => 0.7,
|
||||
<= 500 => 0.5,
|
||||
<= 1000 => 0.3,
|
||||
_ => 0.1
|
||||
};
|
||||
|
||||
var status = score switch
|
||||
{
|
||||
>= 0.7 => FactorStatus.Healthy,
|
||||
>= 0.5 => FactorStatus.Warning,
|
||||
>= 0.3 => FactorStatus.Degraded,
|
||||
_ => FactorStatus.Critical
|
||||
};
|
||||
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "Latency",
|
||||
Score = score,
|
||||
Status = status,
|
||||
Weight = _config.LatencyWeight,
|
||||
Details = $"Response latency: {latency.TotalMilliseconds:F0}ms"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "Latency",
|
||||
Score = 0,
|
||||
Status = FactorStatus.Critical,
|
||||
Weight = _config.LatencyWeight,
|
||||
Details = $"Latency check failed: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<HealthFactor> CheckErrorRateAsync(AgentHealthState state, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var metrics = await _metricsProvider.GetErrorMetricsAsync(state.AgentId, ct);
|
||||
|
||||
var errorRate = metrics.TotalRequests > 0
|
||||
? (double)metrics.ErrorCount / metrics.TotalRequests
|
||||
: 0.0;
|
||||
|
||||
var score = 1.0 - Math.Min(errorRate * 10, 1.0); // 10% error = 0 score
|
||||
|
||||
var status = errorRate switch
|
||||
{
|
||||
<= 0.01 => FactorStatus.Healthy,
|
||||
<= 0.05 => FactorStatus.Warning,
|
||||
<= 0.10 => FactorStatus.Degraded,
|
||||
_ => FactorStatus.Critical
|
||||
};
|
||||
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "ErrorRate",
|
||||
Score = score,
|
||||
Status = status,
|
||||
Weight = _config.ErrorRateWeight,
|
||||
Details = $"Error rate: {errorRate:P2} ({metrics.ErrorCount} errors)"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "ErrorRate",
|
||||
Score = 0.5,
|
||||
Status = FactorStatus.Unknown,
|
||||
Weight = _config.ErrorRateWeight,
|
||||
Details = $"Error rate check failed: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<HealthFactor> CheckQueueDepthAsync(AgentHealthState state, CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var metrics = await _metricsProvider.GetQueueMetricsAsync(state.AgentId, ct);
|
||||
|
||||
var utilizationRatio = metrics.MaxQueueSize > 0
|
||||
? (double)metrics.CurrentQueueSize / metrics.MaxQueueSize
|
||||
: 0.0;
|
||||
|
||||
var score = 1.0 - utilizationRatio;
|
||||
|
||||
var status = utilizationRatio switch
|
||||
{
|
||||
<= 0.5 => FactorStatus.Healthy,
|
||||
<= 0.75 => FactorStatus.Warning,
|
||||
<= 0.9 => FactorStatus.Degraded,
|
||||
_ => FactorStatus.Critical
|
||||
};
|
||||
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "QueueDepth",
|
||||
Score = score,
|
||||
Status = status,
|
||||
Weight = _config.QueueDepthWeight,
|
||||
Details = $"Queue: {metrics.CurrentQueueSize}/{metrics.MaxQueueSize} ({utilizationRatio:P0})"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new HealthFactor
|
||||
{
|
||||
Name = "QueueDepth",
|
||||
Score = 0.5,
|
||||
Status = FactorStatus.Unknown,
|
||||
Weight = _config.QueueDepthWeight,
|
||||
Details = $"Queue check failed: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private double CalculateOverallScore(ImmutableArray<HealthFactor> factors)
|
||||
{
|
||||
var totalWeight = factors.Sum(f => f.Weight);
|
||||
if (totalWeight == 0) return 0;
|
||||
|
||||
return factors.Sum(f => f.Score * f.Weight) / totalWeight;
|
||||
}
|
||||
|
||||
private static AgentHealthStatus DetermineStatus(double overallScore, ImmutableArray<HealthFactor> factors)
|
||||
{
|
||||
// Any critical factor makes overall status critical
|
||||
if (factors.Any(f => f.Status == FactorStatus.Critical))
|
||||
return AgentHealthStatus.Critical;
|
||||
|
||||
return overallScore switch
|
||||
{
|
||||
>= 0.85 => AgentHealthStatus.Healthy,
|
||||
>= 0.65 => AgentHealthStatus.Warning,
|
||||
>= 0.40 => AgentHealthStatus.Degraded,
|
||||
_ => AgentHealthStatus.Critical
|
||||
};
|
||||
}
|
||||
|
||||
private HealthTrend AnalyzeTrend(string agentId)
|
||||
{
|
||||
if (!_healthHistories.TryGetValue(agentId, out var history))
|
||||
return new HealthTrend { Direction = TrendDirection.Stable, Confidence = 0 };
|
||||
|
||||
var scores = history.GetRecentScores(10);
|
||||
if (scores.Length < 3)
|
||||
return new HealthTrend { Direction = TrendDirection.Stable, Confidence = 0 };
|
||||
|
||||
var recentAvg = scores.TakeLast(3).Average();
|
||||
var olderAvg = scores.Take(scores.Length - 3).Average();
|
||||
|
||||
var diff = recentAvg - olderAvg;
|
||||
var direction = diff switch
|
||||
{
|
||||
> 0.1 => TrendDirection.Improving,
|
||||
< -0.1 => TrendDirection.Degrading,
|
||||
_ => TrendDirection.Stable
|
||||
};
|
||||
|
||||
return new HealthTrend
|
||||
{
|
||||
Direction = direction,
|
||||
Confidence = Math.Abs(diff) / 0.3, // Normalize to 0-1
|
||||
RecentAverage = recentAvg,
|
||||
HistoricalAverage = olderAvg
|
||||
};
|
||||
}
|
||||
|
||||
private void UpdateAgentState(string agentId, AgentHealthAssessment assessment)
|
||||
{
|
||||
if (!_agentStates.TryGetValue(agentId, out var state))
|
||||
return;
|
||||
|
||||
var previousStatus = state.Status;
|
||||
|
||||
state = state with
|
||||
{
|
||||
Status = assessment.Status,
|
||||
LastAssessment = assessment,
|
||||
LastCheckedAt = assessment.AssessedAt
|
||||
};
|
||||
|
||||
_agentStates[agentId] = state;
|
||||
|
||||
// Record in history
|
||||
if (_healthHistories.TryGetValue(agentId, out var history))
|
||||
{
|
||||
history.Add(assessment.OverallScore, assessment.AssessedAt);
|
||||
}
|
||||
|
||||
// Raise event if status changed
|
||||
if (previousStatus != assessment.Status)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Agent {AgentId} health status changed: {PreviousStatus} -> {NewStatus}",
|
||||
agentId, previousStatus, assessment.Status);
|
||||
|
||||
HealthChanged?.Invoke(this, new AgentHealthChangedEventArgs
|
||||
{
|
||||
AgentId = agentId,
|
||||
PreviousStatus = previousStatus,
|
||||
NewStatus = assessment.Status,
|
||||
Assessment = assessment
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private static HealthRecommendation GenerateRecommendation(
|
||||
AgentHealthStatus status,
|
||||
ImmutableArray<HealthFactor> factors,
|
||||
HealthTrend trend)
|
||||
{
|
||||
var criticalFactors = factors.Where(f => f.Status == FactorStatus.Critical).ToList();
|
||||
var degradedFactors = factors.Where(f => f.Status == FactorStatus.Degraded).ToList();
|
||||
|
||||
if (status == AgentHealthStatus.Critical)
|
||||
{
|
||||
return new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.FailoverImmediately,
|
||||
Urgency = ActionUrgency.Critical,
|
||||
Reason = $"Critical factors: {string.Join(", ", criticalFactors.Select(f => f.Name))}",
|
||||
AffectedFactors = criticalFactors.Select(f => f.Name).ToImmutableArray()
|
||||
};
|
||||
}
|
||||
|
||||
if (trend.Direction == TrendDirection.Degrading && trend.Confidence > 0.7)
|
||||
{
|
||||
return new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.PrepareFailover,
|
||||
Urgency = ActionUrgency.High,
|
||||
Reason = "Health trend is degrading rapidly",
|
||||
AffectedFactors = []
|
||||
};
|
||||
}
|
||||
|
||||
if (status == AgentHealthStatus.Degraded)
|
||||
{
|
||||
return new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.InvestigateAndRemediate,
|
||||
Urgency = ActionUrgency.Medium,
|
||||
Reason = $"Degraded factors: {string.Join(", ", degradedFactors.Select(f => f.Name))}",
|
||||
AffectedFactors = degradedFactors.Select(f => f.Name).ToImmutableArray()
|
||||
};
|
||||
}
|
||||
|
||||
if (status == AgentHealthStatus.Warning)
|
||||
{
|
||||
return new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.Monitor,
|
||||
Urgency = ActionUrgency.Low,
|
||||
Reason = "Minor issues detected, monitoring recommended",
|
||||
AffectedFactors = factors.Where(f => f.Status == FactorStatus.Warning)
|
||||
.Select(f => f.Name).ToImmutableArray()
|
||||
};
|
||||
}
|
||||
|
||||
return new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.None,
|
||||
Urgency = ActionUrgency.None,
|
||||
Reason = "Agent is healthy",
|
||||
AffectedFactors = []
|
||||
};
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
await StopAsync();
|
||||
}
|
||||
}
|
||||
|
||||
#region Health History
|
||||
|
||||
internal sealed class HealthHistory
|
||||
{
|
||||
private readonly Queue<(double Score, DateTimeOffset Time)> _history;
|
||||
private readonly int _maxSize;
|
||||
private readonly object _lock = new();
|
||||
|
||||
public HealthHistory(int maxSize)
|
||||
{
|
||||
_maxSize = maxSize;
|
||||
_history = new Queue<(double, DateTimeOffset)>(maxSize);
|
||||
}
|
||||
|
||||
public void Add(double score, DateTimeOffset time)
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
if (_history.Count >= _maxSize)
|
||||
_history.Dequeue();
|
||||
|
||||
_history.Enqueue((score, time));
|
||||
}
|
||||
}
|
||||
|
||||
public ImmutableArray<double> GetRecentScores(int count)
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
return _history.TakeLast(count).Select(x => x.Score).ToImmutableArray();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IHealthMonitor
|
||||
{
|
||||
Task StartAsync(CancellationToken ct = default);
|
||||
Task StopAsync();
|
||||
void RegisterAgent(string agentId, AgentEndpoint endpoint);
|
||||
void UnregisterAgent(string agentId);
|
||||
void RegisterCustomCheck(string name, Func<CancellationToken, Task<HealthCheckResult>> check);
|
||||
Task<AgentHealthAssessment> AssessHealthAsync(string agentId, CancellationToken ct = default);
|
||||
Task<ImmutableArray<AgentHealthAssessment>> AssessAllAgentsAsync(CancellationToken ct = default);
|
||||
ImmutableDictionary<string, AgentHealthStatus> GetAllAgentStatuses();
|
||||
ImmutableArray<string> GetAgentsByStatus(AgentHealthStatus status);
|
||||
event EventHandler<AgentHealthChangedEventArgs>? HealthChanged;
|
||||
}
|
||||
|
||||
public interface IMetricsProvider
|
||||
{
|
||||
Task<ResourceMetrics> GetResourceMetricsAsync(string agentId, CancellationToken ct = default);
|
||||
Task<TaskMetrics> GetTaskMetricsAsync(string agentId, CancellationToken ct = default);
|
||||
Task<ErrorMetrics> GetErrorMetricsAsync(string agentId, CancellationToken ct = default);
|
||||
Task<QueueMetrics> GetQueueMetricsAsync(string agentId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IConnectivityChecker
|
||||
{
|
||||
Task<ConnectivityResult> CheckAsync(AgentEndpoint endpoint, CancellationToken ct = default);
|
||||
Task<TimeSpan> MeasureLatencyAsync(AgentEndpoint endpoint, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record HealthMonitorConfig
|
||||
{
|
||||
public TimeSpan CheckInterval { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public int HistorySize { get; init; } = 100;
|
||||
public double ConnectivityWeight { get; init; } = 2.0;
|
||||
public double ResourceWeight { get; init; } = 1.5;
|
||||
public double TaskHealthWeight { get; init; } = 1.5;
|
||||
public double LatencyWeight { get; init; } = 1.0;
|
||||
public double ErrorRateWeight { get; init; } = 1.5;
|
||||
public double QueueDepthWeight { get; init; } = 1.0;
|
||||
}
|
||||
|
||||
public sealed record AgentEndpoint(string Host, int Port, bool UseTls = true);
|
||||
|
||||
public sealed record AgentHealthState
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required AgentEndpoint Endpoint { get; init; }
|
||||
public required AgentHealthStatus Status { get; init; }
|
||||
public required DateTimeOffset RegisteredAt { get; init; }
|
||||
public DateTimeOffset? LastCheckedAt { get; init; }
|
||||
public AgentHealthAssessment? LastAssessment { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AgentHealthAssessment
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required AgentHealthStatus Status { get; init; }
|
||||
public required double OverallScore { get; init; }
|
||||
public required ImmutableArray<HealthFactor> Factors { get; init; }
|
||||
public required HealthTrend Trend { get; init; }
|
||||
public required DateTimeOffset AssessedAt { get; init; }
|
||||
public required HealthRecommendation Recommendation { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthFactor
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public required FactorStatus Status { get; init; }
|
||||
public required double Weight { get; init; }
|
||||
public string? Details { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthTrend
|
||||
{
|
||||
public required TrendDirection Direction { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public double RecentAverage { get; init; }
|
||||
public double HistoricalAverage { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthRecommendation
|
||||
{
|
||||
public required RecommendedAction Action { get; init; }
|
||||
public required ActionUrgency Urgency { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
public required ImmutableArray<string> AffectedFactors { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthCheckResult
|
||||
{
|
||||
public required double Score { get; init; }
|
||||
public required FactorStatus Status { get; init; }
|
||||
public string? Details { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ResourceMetrics
|
||||
{
|
||||
public double CpuPercent { get; init; }
|
||||
public double MemoryPercent { get; init; }
|
||||
public double DiskPercent { get; init; }
|
||||
}
|
||||
|
||||
public sealed record TaskMetrics
|
||||
{
|
||||
public int TotalTasks { get; init; }
|
||||
public int SuccessfulTasks { get; init; }
|
||||
public int FailedTasks { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ErrorMetrics
|
||||
{
|
||||
public int TotalRequests { get; init; }
|
||||
public int ErrorCount { get; init; }
|
||||
}
|
||||
|
||||
public sealed record QueueMetrics
|
||||
{
|
||||
public int CurrentQueueSize { get; init; }
|
||||
public int MaxQueueSize { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ConnectivityResult
|
||||
{
|
||||
public bool IsReachable { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed class AgentHealthChangedEventArgs : EventArgs
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required AgentHealthStatus PreviousStatus { get; init; }
|
||||
public required AgentHealthStatus NewStatus { get; init; }
|
||||
public required AgentHealthAssessment Assessment { get; init; }
|
||||
}
|
||||
|
||||
public enum AgentHealthStatus { Unknown, Critical, Degraded, Warning, Healthy }
|
||||
public enum FactorStatus { Unknown, Critical, Degraded, Warning, Healthy, Failed }
|
||||
public enum TrendDirection { Degrading, Stable, Improving }
|
||||
public enum RecommendedAction { None, Monitor, InvestigateAndRemediate, PrepareFailover, FailoverImmediately }
|
||||
public enum ActionUrgency { None, Low, Medium, High, Critical }
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,583 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LeaderElection.cs
|
||||
// Sprint: SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience
|
||||
// Task: TASK-034-04 - Leader Election with distributed lock support
|
||||
// Description: Distributed leader election using consensus algorithms
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Distributed leader election for agent clusters.
|
||||
/// Supports multiple backends: Redis, etcd, Consul, or in-memory for testing.
|
||||
/// </summary>
|
||||
public sealed class LeaderElection : ILeaderElection, IAsyncDisposable
|
||||
{
|
||||
private readonly IDistributedLock _distributedLock;
|
||||
private readonly LeaderElectionConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<LeaderElection> _logger;
|
||||
|
||||
private readonly ConcurrentDictionary<string, ElectionState> _elections = new();
|
||||
private readonly ConcurrentDictionary<string, CancellationTokenSource> _renewalTasks = new();
|
||||
private string? _nodeId;
|
||||
|
||||
public LeaderElection(
|
||||
IDistributedLock distributedLock,
|
||||
LeaderElectionConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<LeaderElection> logger)
|
||||
{
|
||||
_distributedLock = distributedLock;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the leader election component with this node's ID.
|
||||
/// </summary>
|
||||
public Task InitializeAsync(string nodeId, CancellationToken ct = default)
|
||||
{
|
||||
_nodeId = nodeId;
|
||||
_logger.LogInformation("Leader election initialized for node {NodeId}", nodeId);
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Participates in leader election for a specific resource.
|
||||
/// </summary>
|
||||
/// <param name="resourceKey">The resource to elect a leader for.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Election result indicating if this node became leader.</returns>
|
||||
public async Task<ElectionResult> ParticipateAsync(
|
||||
string resourceKey,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
if (_nodeId is null)
|
||||
throw new InvalidOperationException("Leader election not initialized. Call InitializeAsync first.");
|
||||
|
||||
var lockKey = GetLockKey(resourceKey);
|
||||
|
||||
_logger.LogDebug("Node {NodeId} participating in election for {Resource}",
|
||||
_nodeId, resourceKey);
|
||||
|
||||
try
|
||||
{
|
||||
// Try to acquire the lock
|
||||
var acquired = await _distributedLock.TryAcquireAsync(
|
||||
lockKey,
|
||||
_nodeId,
|
||||
_config.LeaseDuration,
|
||||
ct);
|
||||
|
||||
if (acquired)
|
||||
{
|
||||
_logger.LogInformation("Node {NodeId} elected as leader for {Resource}",
|
||||
_nodeId, resourceKey);
|
||||
|
||||
var state = new ElectionState
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
LeaderId = _nodeId,
|
||||
IsLeader = true,
|
||||
ElectedAt = _timeProvider.GetUtcNow(),
|
||||
LeaseExpiresAt = _timeProvider.GetUtcNow().Add(_config.LeaseDuration),
|
||||
Term = GetNextTerm(resourceKey)
|
||||
};
|
||||
|
||||
_elections[resourceKey] = state;
|
||||
|
||||
// Start lease renewal
|
||||
StartLeaseRenewal(resourceKey, ct);
|
||||
|
||||
OnLeaderElected(resourceKey, _nodeId, state.Term);
|
||||
|
||||
return new ElectionResult
|
||||
{
|
||||
Success = true,
|
||||
IsLeader = true,
|
||||
LeaderId = _nodeId,
|
||||
Term = state.Term,
|
||||
LeaseExpiresAt = state.LeaseExpiresAt
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
// Get current leader
|
||||
var currentLeader = await _distributedLock.GetHolderAsync(lockKey, ct);
|
||||
|
||||
var state = new ElectionState
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
LeaderId = currentLeader,
|
||||
IsLeader = false,
|
||||
ElectedAt = null,
|
||||
LeaseExpiresAt = null,
|
||||
Term = 0
|
||||
};
|
||||
|
||||
_elections[resourceKey] = state;
|
||||
|
||||
_logger.LogDebug("Node {NodeId} is follower for {Resource}, leader is {LeaderId}",
|
||||
_nodeId, resourceKey, currentLeader);
|
||||
|
||||
return new ElectionResult
|
||||
{
|
||||
Success = true,
|
||||
IsLeader = false,
|
||||
LeaderId = currentLeader,
|
||||
Term = 0,
|
||||
LeaseExpiresAt = null
|
||||
};
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Election failed for {Resource}", resourceKey);
|
||||
|
||||
return new ElectionResult
|
||||
{
|
||||
Success = false,
|
||||
IsLeader = false,
|
||||
LeaderId = null,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resigns leadership for a resource.
|
||||
/// </summary>
|
||||
public async Task ResignAsync(string resourceKey, CancellationToken ct = default)
|
||||
{
|
||||
if (_nodeId is null) return;
|
||||
|
||||
if (!_elections.TryGetValue(resourceKey, out var state) || !state.IsLeader)
|
||||
{
|
||||
_logger.LogWarning("Cannot resign: not leader for {Resource}", resourceKey);
|
||||
return;
|
||||
}
|
||||
|
||||
var lockKey = GetLockKey(resourceKey);
|
||||
|
||||
// Stop renewal
|
||||
if (_renewalTasks.TryRemove(resourceKey, out var cts))
|
||||
{
|
||||
await cts.CancelAsync();
|
||||
cts.Dispose();
|
||||
}
|
||||
|
||||
// Release lock
|
||||
await _distributedLock.ReleaseAsync(lockKey, _nodeId, ct);
|
||||
|
||||
_elections.TryRemove(resourceKey, out _);
|
||||
|
||||
_logger.LogInformation("Node {NodeId} resigned leadership for {Resource}",
|
||||
_nodeId, resourceKey);
|
||||
|
||||
OnLeaderResigned(resourceKey, _nodeId);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks if this node is the leader for a resource.
|
||||
/// </summary>
|
||||
public bool IsLeader(string resourceKey)
|
||||
{
|
||||
return _elections.TryGetValue(resourceKey, out var state) && state.IsLeader;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current leader for a resource.
|
||||
/// </summary>
|
||||
public async Task<string?> GetLeaderAsync(string resourceKey, CancellationToken ct = default)
|
||||
{
|
||||
var lockKey = GetLockKey(resourceKey);
|
||||
return await _distributedLock.GetHolderAsync(lockKey, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current election state for a resource.
|
||||
/// </summary>
|
||||
public ElectionState? GetElectionState(string resourceKey)
|
||||
{
|
||||
return _elections.TryGetValue(resourceKey, out var state) ? state : null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all resources where this node is the leader.
|
||||
/// </summary>
|
||||
public ImmutableArray<string> GetLeaderships()
|
||||
{
|
||||
return _elections
|
||||
.Where(kv => kv.Value.IsLeader)
|
||||
.Select(kv => kv.Key)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Watches for leadership changes on a resource.
|
||||
/// </summary>
|
||||
public async IAsyncEnumerable<LeadershipChange> WatchAsync(
|
||||
string resourceKey,
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
|
||||
{
|
||||
var lockKey = GetLockKey(resourceKey);
|
||||
string? lastKnownLeader = null;
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
var currentLeader = await _distributedLock.GetHolderAsync(lockKey, ct);
|
||||
|
||||
if (currentLeader != lastKnownLeader)
|
||||
{
|
||||
yield return new LeadershipChange
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
PreviousLeader = lastKnownLeader,
|
||||
NewLeader = currentLeader,
|
||||
ChangedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
lastKnownLeader = currentLeader;
|
||||
}
|
||||
|
||||
await Task.Delay(_config.WatchInterval, ct);
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when this node becomes leader.
|
||||
/// </summary>
|
||||
public event EventHandler<LeaderElectedEventArgs>? LeaderElected;
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when this node loses leadership.
|
||||
/// </summary>
|
||||
public event EventHandler<LeaderLostEventArgs>? LeaderLost;
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when this node resigns leadership.
|
||||
/// </summary>
|
||||
public event EventHandler<LeaderResignedEventArgs>? LeaderResigned;
|
||||
|
||||
private void StartLeaseRenewal(string resourceKey, CancellationToken ct)
|
||||
{
|
||||
var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_renewalTasks[resourceKey] = cts;
|
||||
|
||||
_ = RenewLeaseLoopAsync(resourceKey, cts.Token);
|
||||
}
|
||||
|
||||
private async Task RenewLeaseLoopAsync(string resourceKey, CancellationToken ct)
|
||||
{
|
||||
var lockKey = GetLockKey(resourceKey);
|
||||
var renewalInterval = TimeSpan.FromMilliseconds(_config.LeaseDuration.TotalMilliseconds / 3);
|
||||
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(renewalInterval, ct);
|
||||
|
||||
var renewed = await _distributedLock.RenewAsync(
|
||||
lockKey,
|
||||
_nodeId!,
|
||||
_config.LeaseDuration,
|
||||
ct);
|
||||
|
||||
if (renewed)
|
||||
{
|
||||
if (_elections.TryGetValue(resourceKey, out var state))
|
||||
{
|
||||
_elections[resourceKey] = state with
|
||||
{
|
||||
LeaseExpiresAt = _timeProvider.GetUtcNow().Add(_config.LeaseDuration)
|
||||
};
|
||||
}
|
||||
|
||||
_logger.LogDebug("Renewed lease for {Resource}", resourceKey);
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogWarning("Failed to renew lease for {Resource}, lost leadership",
|
||||
resourceKey);
|
||||
|
||||
HandleLeadershipLost(resourceKey);
|
||||
break;
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error renewing lease for {Resource}", resourceKey);
|
||||
HandleLeadershipLost(resourceKey);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void HandleLeadershipLost(string resourceKey)
|
||||
{
|
||||
if (_elections.TryRemove(resourceKey, out var state) && state.IsLeader)
|
||||
{
|
||||
_logger.LogWarning("Node {NodeId} lost leadership for {Resource}",
|
||||
_nodeId, resourceKey);
|
||||
|
||||
OnLeaderLost(resourceKey, _nodeId!);
|
||||
}
|
||||
|
||||
if (_renewalTasks.TryRemove(resourceKey, out var cts))
|
||||
{
|
||||
cts.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
private int GetNextTerm(string resourceKey)
|
||||
{
|
||||
if (_elections.TryGetValue(resourceKey, out var state))
|
||||
return state.Term + 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
private string GetLockKey(string resourceKey) =>
|
||||
$"{_config.KeyPrefix}:{resourceKey}";
|
||||
|
||||
private void OnLeaderElected(string resourceKey, string leaderId, int term)
|
||||
{
|
||||
LeaderElected?.Invoke(this, new LeaderElectedEventArgs
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
LeaderId = leaderId,
|
||||
Term = term,
|
||||
ElectedAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
}
|
||||
|
||||
private void OnLeaderLost(string resourceKey, string nodeId)
|
||||
{
|
||||
LeaderLost?.Invoke(this, new LeaderLostEventArgs
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
NodeId = nodeId,
|
||||
LostAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
}
|
||||
|
||||
private void OnLeaderResigned(string resourceKey, string nodeId)
|
||||
{
|
||||
LeaderResigned?.Invoke(this, new LeaderResignedEventArgs
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
NodeId = nodeId,
|
||||
ResignedAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
// Resign all leaderships
|
||||
foreach (var resourceKey in GetLeaderships())
|
||||
{
|
||||
try
|
||||
{
|
||||
await ResignAsync(resourceKey);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Error resigning leadership for {Resource}", resourceKey);
|
||||
}
|
||||
}
|
||||
|
||||
// Cancel all renewal tasks
|
||||
foreach (var cts in _renewalTasks.Values)
|
||||
{
|
||||
cts.Dispose();
|
||||
}
|
||||
_renewalTasks.Clear();
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface ILeaderElection
|
||||
{
|
||||
Task InitializeAsync(string nodeId, CancellationToken ct = default);
|
||||
Task<ElectionResult> ParticipateAsync(string resourceKey, CancellationToken ct = default);
|
||||
Task ResignAsync(string resourceKey, CancellationToken ct = default);
|
||||
bool IsLeader(string resourceKey);
|
||||
Task<string?> GetLeaderAsync(string resourceKey, CancellationToken ct = default);
|
||||
ElectionState? GetElectionState(string resourceKey);
|
||||
ImmutableArray<string> GetLeaderships();
|
||||
IAsyncEnumerable<LeadershipChange> WatchAsync(string resourceKey, CancellationToken ct = default);
|
||||
event EventHandler<LeaderElectedEventArgs>? LeaderElected;
|
||||
event EventHandler<LeaderLostEventArgs>? LeaderLost;
|
||||
event EventHandler<LeaderResignedEventArgs>? LeaderResigned;
|
||||
}
|
||||
|
||||
public interface IDistributedLock
|
||||
{
|
||||
Task<bool> TryAcquireAsync(string key, string holder, TimeSpan ttl, CancellationToken ct = default);
|
||||
Task<bool> RenewAsync(string key, string holder, TimeSpan ttl, CancellationToken ct = default);
|
||||
Task ReleaseAsync(string key, string holder, CancellationToken ct = default);
|
||||
Task<string?> GetHolderAsync(string key, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record LeaderElectionConfig
|
||||
{
|
||||
public string KeyPrefix { get; init; } = "stella:leader";
|
||||
public TimeSpan LeaseDuration { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public TimeSpan WatchInterval { get; init; } = TimeSpan.FromSeconds(5);
|
||||
}
|
||||
|
||||
public sealed record ElectionResult
|
||||
{
|
||||
public required bool Success { get; init; }
|
||||
public required bool IsLeader { get; init; }
|
||||
public string? LeaderId { get; init; }
|
||||
public int Term { get; init; }
|
||||
public DateTimeOffset? LeaseExpiresAt { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ElectionState
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public required string? LeaderId { get; init; }
|
||||
public required bool IsLeader { get; init; }
|
||||
public DateTimeOffset? ElectedAt { get; init; }
|
||||
public DateTimeOffset? LeaseExpiresAt { get; init; }
|
||||
public required int Term { get; init; }
|
||||
}
|
||||
|
||||
public sealed record LeadershipChange
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public string? PreviousLeader { get; init; }
|
||||
public string? NewLeader { get; init; }
|
||||
public required DateTimeOffset ChangedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed class LeaderElectedEventArgs : EventArgs
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public required string LeaderId { get; init; }
|
||||
public required int Term { get; init; }
|
||||
public required DateTimeOffset ElectedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed class LeaderLostEventArgs : EventArgs
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public required string NodeId { get; init; }
|
||||
public required DateTimeOffset LostAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed class LeaderResignedEventArgs : EventArgs
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public required string NodeId { get; init; }
|
||||
public required DateTimeOffset ResignedAt { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region In-Memory Implementation (for testing)
|
||||
|
||||
/// <summary>
|
||||
/// In-memory distributed lock implementation for testing.
|
||||
/// </summary>
|
||||
public sealed class InMemoryDistributedLock : IDistributedLock
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, (string Holder, DateTimeOffset Expiry)> _locks = new();
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
public InMemoryDistributedLock(TimeProvider timeProvider)
|
||||
{
|
||||
_timeProvider = timeProvider;
|
||||
}
|
||||
|
||||
public Task<bool> TryAcquireAsync(string key, string holder, TimeSpan ttl, CancellationToken ct = default)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var expiry = now.Add(ttl);
|
||||
|
||||
// Clean up expired locks
|
||||
CleanupExpired(now);
|
||||
|
||||
var acquired = _locks.TryAdd(key, (holder, expiry));
|
||||
|
||||
if (!acquired && _locks.TryGetValue(key, out var current) && current.Holder == holder)
|
||||
{
|
||||
// Already holding the lock, extend it
|
||||
_locks[key] = (holder, expiry);
|
||||
acquired = true;
|
||||
}
|
||||
|
||||
return Task.FromResult(acquired);
|
||||
}
|
||||
|
||||
public Task<bool> RenewAsync(string key, string holder, TimeSpan ttl, CancellationToken ct = default)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
if (_locks.TryGetValue(key, out var current) && current.Holder == holder)
|
||||
{
|
||||
_locks[key] = (holder, now.Add(ttl));
|
||||
return Task.FromResult(true);
|
||||
}
|
||||
|
||||
return Task.FromResult(false);
|
||||
}
|
||||
|
||||
public Task ReleaseAsync(string key, string holder, CancellationToken ct = default)
|
||||
{
|
||||
if (_locks.TryGetValue(key, out var current) && current.Holder == holder)
|
||||
{
|
||||
_locks.TryRemove(key, out _);
|
||||
}
|
||||
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task<string?> GetHolderAsync(string key, CancellationToken ct = default)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
if (_locks.TryGetValue(key, out var current) && current.Expiry > now)
|
||||
{
|
||||
return Task.FromResult<string?>(current.Holder);
|
||||
}
|
||||
|
||||
return Task.FromResult<string?>(null);
|
||||
}
|
||||
|
||||
private void CleanupExpired(DateTimeOffset now)
|
||||
{
|
||||
var expired = _locks.Where(kv => kv.Value.Expiry <= now).Select(kv => kv.Key).ToList();
|
||||
foreach (var key in expired)
|
||||
{
|
||||
_locks.TryRemove(key, out _);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,783 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// SelfHealer.cs
|
||||
// Sprint: SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience
|
||||
// Task: TASK-034-06 - Self Healer with automatic recovery actions
|
||||
// Description: Automatic recovery and self-healing for agent cluster nodes
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Self-healer that monitors agent health and applies automatic recovery actions.
|
||||
/// </summary>
|
||||
public sealed class SelfHealer : ISelfHealer, IAsyncDisposable
|
||||
{
|
||||
private readonly IHealthMonitor _healthMonitor;
|
||||
private readonly IRecoveryActionExecutor _recoveryExecutor;
|
||||
private readonly SelfHealerConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<SelfHealer> _logger;
|
||||
|
||||
private readonly ConcurrentDictionary<string, RecoveryHistory> _recoveryHistories = new();
|
||||
private readonly ConcurrentDictionary<string, RecoveryState> _activeRecoveries = new();
|
||||
private readonly ConcurrentDictionary<string, CircuitBreaker> _circuitBreakers = new();
|
||||
|
||||
private CancellationTokenSource? _healingCts;
|
||||
private Task? _healingTask;
|
||||
|
||||
public SelfHealer(
|
||||
IHealthMonitor healthMonitor,
|
||||
IRecoveryActionExecutor recoveryExecutor,
|
||||
SelfHealerConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<SelfHealer> logger)
|
||||
{
|
||||
_healthMonitor = healthMonitor;
|
||||
_recoveryExecutor = recoveryExecutor;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts the self-healing loop.
|
||||
/// </summary>
|
||||
public async Task StartAsync(CancellationToken ct = default)
|
||||
{
|
||||
if (_healingTask is not null)
|
||||
{
|
||||
_logger.LogWarning("Self-healer already started");
|
||||
return;
|
||||
}
|
||||
|
||||
// Subscribe to health changes
|
||||
_healthMonitor.HealthChanged += OnHealthChanged;
|
||||
|
||||
_healingCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
_healingTask = HealingLoopAsync(_healingCts.Token);
|
||||
|
||||
_logger.LogInformation("Self-healer started");
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stops the self-healing loop.
|
||||
/// </summary>
|
||||
public async Task StopAsync()
|
||||
{
|
||||
if (_healingCts is null) return;
|
||||
|
||||
_healthMonitor.HealthChanged -= OnHealthChanged;
|
||||
|
||||
await _healingCts.CancelAsync();
|
||||
|
||||
if (_healingTask is not null)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _healingTask.WaitAsync(TimeSpan.FromSeconds(10));
|
||||
}
|
||||
catch (OperationCanceledException) { }
|
||||
catch (TimeoutException) { }
|
||||
}
|
||||
|
||||
_healingCts.Dispose();
|
||||
_healingCts = null;
|
||||
_healingTask = null;
|
||||
|
||||
_logger.LogInformation("Self-healer stopped");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Triggers immediate healing assessment for an agent.
|
||||
/// </summary>
|
||||
public async Task<HealingResult> HealAsync(string agentId, CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Initiating healing for agent {AgentId}", agentId);
|
||||
|
||||
// Check circuit breaker
|
||||
if (IsCircuitOpen(agentId))
|
||||
{
|
||||
_logger.LogWarning("Circuit breaker open for agent {AgentId}, skipping healing", agentId);
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = false,
|
||||
Status = HealingStatus.CircuitOpen,
|
||||
Message = "Recovery circuit breaker is open due to repeated failures"
|
||||
};
|
||||
}
|
||||
|
||||
// Check if already recovering
|
||||
if (_activeRecoveries.ContainsKey(agentId))
|
||||
{
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = false,
|
||||
Status = HealingStatus.AlreadyInProgress,
|
||||
Message = "Recovery already in progress"
|
||||
};
|
||||
}
|
||||
|
||||
// Get current health assessment
|
||||
var assessment = await _healthMonitor.AssessHealthAsync(agentId, ct);
|
||||
|
||||
if (assessment.Status == AgentHealthStatus.Healthy)
|
||||
{
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = true,
|
||||
Status = HealingStatus.NotNeeded,
|
||||
Message = "Agent is healthy, no healing required"
|
||||
};
|
||||
}
|
||||
|
||||
// Determine recovery actions
|
||||
var actions = DetermineRecoveryActions(assessment);
|
||||
|
||||
if (actions.Length == 0)
|
||||
{
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = false,
|
||||
Status = HealingStatus.NoActionsAvailable,
|
||||
Message = "No applicable recovery actions found"
|
||||
};
|
||||
}
|
||||
|
||||
// Execute recovery
|
||||
return await ExecuteRecoveryAsync(agentId, actions, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the recovery history for an agent.
|
||||
/// </summary>
|
||||
public ImmutableArray<RecoveryAttempt> GetRecoveryHistory(string agentId)
|
||||
{
|
||||
if (_recoveryHistories.TryGetValue(agentId, out var history))
|
||||
{
|
||||
return history.GetAttempts();
|
||||
}
|
||||
return [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets current recovery state for an agent.
|
||||
/// </summary>
|
||||
public RecoveryState? GetRecoveryState(string agentId)
|
||||
{
|
||||
return _activeRecoveries.TryGetValue(agentId, out var state) ? state : null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resets the circuit breaker for an agent.
|
||||
/// </summary>
|
||||
public void ResetCircuitBreaker(string agentId)
|
||||
{
|
||||
if (_circuitBreakers.TryGetValue(agentId, out var breaker))
|
||||
{
|
||||
breaker.Reset();
|
||||
_logger.LogInformation("Circuit breaker reset for agent {AgentId}", agentId);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when recovery starts.
|
||||
/// </summary>
|
||||
public event EventHandler<RecoveryStartedEventArgs>? RecoveryStarted;
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when recovery completes.
|
||||
/// </summary>
|
||||
public event EventHandler<RecoveryCompletedEventArgs>? RecoveryCompleted;
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when recovery fails.
|
||||
/// </summary>
|
||||
public event EventHandler<RecoveryFailedEventArgs>? RecoveryFailed;
|
||||
|
||||
private void OnHealthChanged(object? sender, AgentHealthChangedEventArgs e)
|
||||
{
|
||||
if (e.NewStatus <= AgentHealthStatus.Degraded && _config.AutoHealEnabled)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Auto-heal triggered for agent {AgentId} due to status change to {Status}",
|
||||
e.AgentId, e.NewStatus);
|
||||
|
||||
// Queue healing (don't block event handler)
|
||||
_ = Task.Run(async () =>
|
||||
{
|
||||
try
|
||||
{
|
||||
await HealAsync(e.AgentId);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in auto-heal for agent {AgentId}", e.AgentId);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private async Task HealingLoopAsync(CancellationToken ct)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_config.HealingCheckInterval, ct);
|
||||
|
||||
// Get all unhealthy agents
|
||||
var unhealthy = _healthMonitor.GetAgentsByStatus(AgentHealthStatus.Degraded)
|
||||
.Concat(_healthMonitor.GetAgentsByStatus(AgentHealthStatus.Critical))
|
||||
.ToList();
|
||||
|
||||
foreach (var agentId in unhealthy)
|
||||
{
|
||||
if (ct.IsCancellationRequested) break;
|
||||
|
||||
try
|
||||
{
|
||||
await HealAsync(agentId, ct);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error healing agent {AgentId}", agentId);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in healing loop");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private ImmutableArray<RecoveryAction> DetermineRecoveryActions(AgentHealthAssessment assessment)
|
||||
{
|
||||
var actions = new List<RecoveryAction>();
|
||||
|
||||
foreach (var factor in assessment.Factors.Where(f => f.Status <= FactorStatus.Degraded))
|
||||
{
|
||||
var action = factor.Name switch
|
||||
{
|
||||
"Connectivity" => new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.RestartAgent,
|
||||
Priority = 1,
|
||||
Description = "Restart agent to restore connectivity"
|
||||
},
|
||||
"Resources" when factor.Details?.Contains("Memory") == true => new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.ClearCaches,
|
||||
Priority = 2,
|
||||
Description = "Clear caches to free memory"
|
||||
},
|
||||
"Resources" when factor.Details?.Contains("CPU") == true => new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.ReduceLoad,
|
||||
Priority = 2,
|
||||
Description = "Reduce task load to lower CPU usage"
|
||||
},
|
||||
"QueueDepth" => new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.DrainQueue,
|
||||
Priority = 3,
|
||||
Description = "Drain excess tasks from queue"
|
||||
},
|
||||
"ErrorRate" => new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.ResetConnections,
|
||||
Priority = 2,
|
||||
Description = "Reset connections to clear error state"
|
||||
},
|
||||
"TaskHealth" => new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.CancelStuckTasks,
|
||||
Priority = 2,
|
||||
Description = "Cancel stuck or hung tasks"
|
||||
},
|
||||
_ => null
|
||||
};
|
||||
|
||||
if (action is not null)
|
||||
{
|
||||
actions.Add(action);
|
||||
}
|
||||
}
|
||||
|
||||
// Add escalating actions for critical status
|
||||
if (assessment.Status == AgentHealthStatus.Critical)
|
||||
{
|
||||
actions.Add(new RecoveryAction
|
||||
{
|
||||
Type = RecoveryActionType.ForceRestart,
|
||||
Priority = 0,
|
||||
Description = "Force restart for critical health"
|
||||
});
|
||||
}
|
||||
|
||||
return actions
|
||||
.OrderBy(a => a.Priority)
|
||||
.Take(_config.MaxActionsPerRecovery)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<HealingResult> ExecuteRecoveryAsync(
|
||||
string agentId,
|
||||
ImmutableArray<RecoveryAction> actions,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var state = new RecoveryState
|
||||
{
|
||||
AgentId = agentId,
|
||||
StartedAt = _timeProvider.GetUtcNow(),
|
||||
Actions = actions,
|
||||
CurrentActionIndex = 0,
|
||||
Status = RecoveryStatus.InProgress
|
||||
};
|
||||
|
||||
_activeRecoveries[agentId] = state;
|
||||
|
||||
OnRecoveryStarted(agentId, actions);
|
||||
|
||||
var results = new List<RecoveryActionResult>();
|
||||
var overallSuccess = true;
|
||||
|
||||
try
|
||||
{
|
||||
foreach (var action in actions)
|
||||
{
|
||||
if (ct.IsCancellationRequested) break;
|
||||
|
||||
_logger.LogInformation(
|
||||
"Executing recovery action {Action} for agent {AgentId}",
|
||||
action.Type, agentId);
|
||||
|
||||
var result = await ExecuteActionWithTimeoutAsync(agentId, action, ct);
|
||||
results.Add(result);
|
||||
|
||||
if (!result.Success)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Recovery action {Action} failed for agent {AgentId}: {Error}",
|
||||
action.Type, agentId, result.Error);
|
||||
|
||||
overallSuccess = false;
|
||||
|
||||
if (_config.StopOnFirstFailure)
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Recovery action {Action} succeeded for agent {AgentId}",
|
||||
action.Type, agentId);
|
||||
}
|
||||
|
||||
// Update state
|
||||
state = state with { CurrentActionIndex = state.CurrentActionIndex + 1 };
|
||||
_activeRecoveries[agentId] = state;
|
||||
|
||||
// Wait between actions
|
||||
if (actions.Length > 1)
|
||||
{
|
||||
await Task.Delay(_config.ActionCooldown, ct);
|
||||
}
|
||||
}
|
||||
|
||||
// Record attempt in history
|
||||
RecordAttempt(agentId, new RecoveryAttempt
|
||||
{
|
||||
AttemptedAt = _timeProvider.GetUtcNow(),
|
||||
Actions = actions,
|
||||
Results = results.ToImmutableArray(),
|
||||
Success = overallSuccess
|
||||
});
|
||||
|
||||
if (overallSuccess)
|
||||
{
|
||||
GetOrCreateCircuitBreaker(agentId).RecordSuccess();
|
||||
OnRecoveryCompleted(agentId, results.ToImmutableArray());
|
||||
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = true,
|
||||
Status = HealingStatus.Recovered,
|
||||
Message = $"Successfully executed {results.Count} recovery actions",
|
||||
ActionResults = results.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
GetOrCreateCircuitBreaker(agentId).RecordFailure();
|
||||
OnRecoveryFailed(agentId, results.ToImmutableArray());
|
||||
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = false,
|
||||
Status = HealingStatus.PartialRecovery,
|
||||
Message = "Some recovery actions failed",
|
||||
ActionResults = results.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Recovery failed for agent {AgentId}", agentId);
|
||||
|
||||
GetOrCreateCircuitBreaker(agentId).RecordFailure();
|
||||
OnRecoveryFailed(agentId, results.ToImmutableArray());
|
||||
|
||||
return new HealingResult
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = false,
|
||||
Status = HealingStatus.Failed,
|
||||
Message = ex.Message,
|
||||
ActionResults = results.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
finally
|
||||
{
|
||||
_activeRecoveries.TryRemove(agentId, out _);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<RecoveryActionResult> ExecuteActionWithTimeoutAsync(
|
||||
string agentId,
|
||||
RecoveryAction action,
|
||||
CancellationToken ct)
|
||||
{
|
||||
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
timeoutCts.CancelAfter(_config.ActionTimeout);
|
||||
|
||||
try
|
||||
{
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
await _recoveryExecutor.ExecuteAsync(agentId, action, timeoutCts.Token);
|
||||
|
||||
return new RecoveryActionResult
|
||||
{
|
||||
Action = action,
|
||||
Success = true,
|
||||
Duration = _timeProvider.GetUtcNow() - startTime
|
||||
};
|
||||
}
|
||||
catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !ct.IsCancellationRequested)
|
||||
{
|
||||
return new RecoveryActionResult
|
||||
{
|
||||
Action = action,
|
||||
Success = false,
|
||||
Error = "Action timed out"
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new RecoveryActionResult
|
||||
{
|
||||
Action = action,
|
||||
Success = false,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private void RecordAttempt(string agentId, RecoveryAttempt attempt)
|
||||
{
|
||||
var history = _recoveryHistories.GetOrAdd(agentId, _ => new RecoveryHistory(_config.HistorySize));
|
||||
history.Add(attempt);
|
||||
}
|
||||
|
||||
private bool IsCircuitOpen(string agentId)
|
||||
{
|
||||
if (_circuitBreakers.TryGetValue(agentId, out var breaker))
|
||||
{
|
||||
return breaker.IsOpen(_timeProvider.GetUtcNow());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private CircuitBreaker GetOrCreateCircuitBreaker(string agentId)
|
||||
{
|
||||
return _circuitBreakers.GetOrAdd(agentId, _ =>
|
||||
new CircuitBreaker(_config.CircuitBreakerThreshold, _config.CircuitBreakerResetTime));
|
||||
}
|
||||
|
||||
private void OnRecoveryStarted(string agentId, ImmutableArray<RecoveryAction> actions)
|
||||
{
|
||||
RecoveryStarted?.Invoke(this, new RecoveryStartedEventArgs
|
||||
{
|
||||
AgentId = agentId,
|
||||
Actions = actions,
|
||||
StartedAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
}
|
||||
|
||||
private void OnRecoveryCompleted(string agentId, ImmutableArray<RecoveryActionResult> results)
|
||||
{
|
||||
RecoveryCompleted?.Invoke(this, new RecoveryCompletedEventArgs
|
||||
{
|
||||
AgentId = agentId,
|
||||
Results = results,
|
||||
CompletedAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
}
|
||||
|
||||
private void OnRecoveryFailed(string agentId, ImmutableArray<RecoveryActionResult> results)
|
||||
{
|
||||
RecoveryFailed?.Invoke(this, new RecoveryFailedEventArgs
|
||||
{
|
||||
AgentId = agentId,
|
||||
Results = results,
|
||||
FailedAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
await StopAsync();
|
||||
}
|
||||
}
|
||||
|
||||
#region Circuit Breaker
|
||||
|
||||
internal sealed class CircuitBreaker
|
||||
{
|
||||
private readonly int _threshold;
|
||||
private readonly TimeSpan _resetTime;
|
||||
private int _failureCount;
|
||||
private DateTimeOffset? _openedAt;
|
||||
private readonly object _lock = new();
|
||||
|
||||
public CircuitBreaker(int threshold, TimeSpan resetTime)
|
||||
{
|
||||
_threshold = threshold;
|
||||
_resetTime = resetTime;
|
||||
}
|
||||
|
||||
public bool IsOpen(DateTimeOffset now)
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
if (_openedAt is null) return false;
|
||||
|
||||
if (now - _openedAt.Value >= _resetTime)
|
||||
{
|
||||
// Half-open: allow one attempt
|
||||
_openedAt = null;
|
||||
_failureCount = _threshold - 1; // One more failure will re-open
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public void RecordSuccess()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
_failureCount = 0;
|
||||
_openedAt = null;
|
||||
}
|
||||
}
|
||||
|
||||
public void RecordFailure()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
_failureCount++;
|
||||
if (_failureCount >= _threshold)
|
||||
{
|
||||
_openedAt = DateTimeOffset.UtcNow;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void Reset()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
_failureCount = 0;
|
||||
_openedAt = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
internal sealed class RecoveryHistory
|
||||
{
|
||||
private readonly Queue<RecoveryAttempt> _attempts;
|
||||
private readonly int _maxSize;
|
||||
private readonly object _lock = new();
|
||||
|
||||
public RecoveryHistory(int maxSize)
|
||||
{
|
||||
_maxSize = maxSize;
|
||||
_attempts = new Queue<RecoveryAttempt>(maxSize);
|
||||
}
|
||||
|
||||
public void Add(RecoveryAttempt attempt)
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
if (_attempts.Count >= _maxSize)
|
||||
_attempts.Dequeue();
|
||||
_attempts.Enqueue(attempt);
|
||||
}
|
||||
}
|
||||
|
||||
public ImmutableArray<RecoveryAttempt> GetAttempts()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
return _attempts.ToImmutableArray();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface ISelfHealer
|
||||
{
|
||||
Task StartAsync(CancellationToken ct = default);
|
||||
Task StopAsync();
|
||||
Task<HealingResult> HealAsync(string agentId, CancellationToken ct = default);
|
||||
ImmutableArray<RecoveryAttempt> GetRecoveryHistory(string agentId);
|
||||
RecoveryState? GetRecoveryState(string agentId);
|
||||
void ResetCircuitBreaker(string agentId);
|
||||
event EventHandler<RecoveryStartedEventArgs>? RecoveryStarted;
|
||||
event EventHandler<RecoveryCompletedEventArgs>? RecoveryCompleted;
|
||||
event EventHandler<RecoveryFailedEventArgs>? RecoveryFailed;
|
||||
}
|
||||
|
||||
public interface IRecoveryActionExecutor
|
||||
{
|
||||
Task ExecuteAsync(string agentId, RecoveryAction action, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record SelfHealerConfig
|
||||
{
|
||||
public bool AutoHealEnabled { get; init; } = true;
|
||||
public TimeSpan HealingCheckInterval { get; init; } = TimeSpan.FromMinutes(1);
|
||||
public TimeSpan ActionTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public TimeSpan ActionCooldown { get; init; } = TimeSpan.FromSeconds(5);
|
||||
public int MaxActionsPerRecovery { get; init; } = 5;
|
||||
public bool StopOnFirstFailure { get; init; } = false;
|
||||
public int HistorySize { get; init; } = 50;
|
||||
public int CircuitBreakerThreshold { get; init; } = 3;
|
||||
public TimeSpan CircuitBreakerResetTime { get; init; } = TimeSpan.FromMinutes(5);
|
||||
}
|
||||
|
||||
public sealed record RecoveryAction
|
||||
{
|
||||
public required RecoveryActionType Type { get; init; }
|
||||
public required int Priority { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public ImmutableDictionary<string, string> Parameters { get; init; } = ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
public enum RecoveryActionType
|
||||
{
|
||||
RestartAgent,
|
||||
ForceRestart,
|
||||
ClearCaches,
|
||||
ReduceLoad,
|
||||
DrainQueue,
|
||||
ResetConnections,
|
||||
CancelStuckTasks,
|
||||
ReloadConfiguration,
|
||||
ScaleDown,
|
||||
Isolate
|
||||
}
|
||||
|
||||
public sealed record RecoveryActionResult
|
||||
{
|
||||
public required RecoveryAction Action { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public TimeSpan Duration { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RecoveryState
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required DateTimeOffset StartedAt { get; init; }
|
||||
public required ImmutableArray<RecoveryAction> Actions { get; init; }
|
||||
public required int CurrentActionIndex { get; init; }
|
||||
public required RecoveryStatus Status { get; init; }
|
||||
}
|
||||
|
||||
public enum RecoveryStatus { InProgress, Completed, Failed }
|
||||
|
||||
public sealed record RecoveryAttempt
|
||||
{
|
||||
public required DateTimeOffset AttemptedAt { get; init; }
|
||||
public required ImmutableArray<RecoveryAction> Actions { get; init; }
|
||||
public required ImmutableArray<RecoveryActionResult> Results { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealingResult
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required HealingStatus Status { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public ImmutableArray<RecoveryActionResult> ActionResults { get; init; } = [];
|
||||
}
|
||||
|
||||
public enum HealingStatus
|
||||
{
|
||||
NotNeeded,
|
||||
Recovered,
|
||||
PartialRecovery,
|
||||
Failed,
|
||||
AlreadyInProgress,
|
||||
CircuitOpen,
|
||||
NoActionsAvailable
|
||||
}
|
||||
|
||||
public sealed class RecoveryStartedEventArgs : EventArgs
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required ImmutableArray<RecoveryAction> Actions { get; init; }
|
||||
public required DateTimeOffset StartedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed class RecoveryCompletedEventArgs : EventArgs
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required ImmutableArray<RecoveryActionResult> Results { get; init; }
|
||||
public required DateTimeOffset CompletedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed class RecoveryFailedEventArgs : EventArgs
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required ImmutableArray<RecoveryActionResult> Results { get; init; }
|
||||
public required DateTimeOffset FailedAt { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,777 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// StateSync.cs
|
||||
// Sprint: SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience
|
||||
// Task: TASK-034-07 - State Sync for cluster state synchronization
|
||||
// Description: Synchronizes state across agent cluster members
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.Core.Resilience;
|
||||
|
||||
/// <summary>
|
||||
/// Synchronizes state across agent cluster members using eventual consistency.
|
||||
/// </summary>
|
||||
public sealed class StateSync : IStateSync, IAsyncDisposable
|
||||
{
|
||||
private readonly IStateSyncTransport _transport;
|
||||
private readonly IStateStore _stateStore;
|
||||
private readonly StateSyncConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<StateSync> _logger;
|
||||
|
||||
private readonly ConcurrentDictionary<string, StateEntry> _localState = new();
|
||||
private readonly ConcurrentDictionary<string, VectorClock> _vectorClocks = new();
|
||||
private readonly ConcurrentDictionary<string, DateTimeOffset> _peerLastSeen = new();
|
||||
|
||||
private string? _nodeId;
|
||||
private CancellationTokenSource? _syncCts;
|
||||
private Task? _syncTask;
|
||||
private Task? _gossipTask;
|
||||
|
||||
public StateSync(
|
||||
IStateSyncTransport transport,
|
||||
IStateStore stateStore,
|
||||
StateSyncConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<StateSync> logger)
|
||||
{
|
||||
_transport = transport;
|
||||
_stateStore = stateStore;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes state sync with this node's ID.
|
||||
/// </summary>
|
||||
public async Task InitializeAsync(string nodeId, CancellationToken ct = default)
|
||||
{
|
||||
_nodeId = nodeId;
|
||||
|
||||
// Load persisted state
|
||||
var persisted = await _stateStore.LoadAsync(ct);
|
||||
foreach (var entry in persisted)
|
||||
{
|
||||
_localState[entry.Key] = entry;
|
||||
_vectorClocks[entry.Key] = entry.Version;
|
||||
}
|
||||
|
||||
_logger.LogInformation("State sync initialized for node {NodeId} with {Count} entries",
|
||||
nodeId, persisted.Length);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts background synchronization.
|
||||
/// </summary>
|
||||
public async Task StartAsync(CancellationToken ct = default)
|
||||
{
|
||||
if (_syncTask is not null)
|
||||
{
|
||||
_logger.LogWarning("State sync already started");
|
||||
return;
|
||||
}
|
||||
|
||||
_syncCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
|
||||
// Subscribe to incoming sync messages
|
||||
_transport.OnSyncMessage += HandleSyncMessage;
|
||||
|
||||
// Start background tasks
|
||||
_syncTask = PeriodicSyncLoopAsync(_syncCts.Token);
|
||||
_gossipTask = GossipLoopAsync(_syncCts.Token);
|
||||
|
||||
_logger.LogInformation("State sync started");
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stops background synchronization.
|
||||
/// </summary>
|
||||
public async Task StopAsync()
|
||||
{
|
||||
if (_syncCts is null) return;
|
||||
|
||||
_transport.OnSyncMessage -= HandleSyncMessage;
|
||||
|
||||
await _syncCts.CancelAsync();
|
||||
|
||||
try
|
||||
{
|
||||
if (_syncTask is not null)
|
||||
await _syncTask.WaitAsync(TimeSpan.FromSeconds(5));
|
||||
if (_gossipTask is not null)
|
||||
await _gossipTask.WaitAsync(TimeSpan.FromSeconds(5));
|
||||
}
|
||||
catch (OperationCanceledException) { }
|
||||
catch (TimeoutException) { }
|
||||
|
||||
// Persist current state
|
||||
await PersistStateAsync(CancellationToken.None);
|
||||
|
||||
_syncCts.Dispose();
|
||||
_syncCts = null;
|
||||
_syncTask = null;
|
||||
_gossipTask = null;
|
||||
|
||||
_logger.LogInformation("State sync stopped");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Sets a value in the distributed state.
|
||||
/// </summary>
|
||||
public async Task SetAsync<T>(string key, T value, CancellationToken ct = default)
|
||||
{
|
||||
if (_nodeId is null)
|
||||
throw new InvalidOperationException("State sync not initialized");
|
||||
|
||||
var serialized = JsonSerializer.Serialize(value);
|
||||
var version = IncrementVersion(key);
|
||||
|
||||
var entry = new StateEntry
|
||||
{
|
||||
Key = key,
|
||||
Value = serialized,
|
||||
Version = version,
|
||||
UpdatedBy = _nodeId,
|
||||
UpdatedAt = _timeProvider.GetUtcNow(),
|
||||
Checksum = ComputeChecksum(serialized)
|
||||
};
|
||||
|
||||
_localState[key] = entry;
|
||||
|
||||
_logger.LogDebug("Set local state: {Key} = {Version}", key, version);
|
||||
|
||||
// Broadcast to peers
|
||||
await BroadcastUpdateAsync(entry, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a value from the distributed state.
|
||||
/// </summary>
|
||||
public Task<T?> GetAsync<T>(string key, CancellationToken ct = default)
|
||||
{
|
||||
if (_localState.TryGetValue(key, out var entry))
|
||||
{
|
||||
var value = JsonSerializer.Deserialize<T>(entry.Value);
|
||||
return Task.FromResult(value);
|
||||
}
|
||||
|
||||
return Task.FromResult(default(T));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a value with its metadata.
|
||||
/// </summary>
|
||||
public Task<StateEntry?> GetEntryAsync(string key, CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(_localState.TryGetValue(key, out var entry) ? entry : null);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deletes a value from the distributed state.
|
||||
/// </summary>
|
||||
public async Task DeleteAsync(string key, CancellationToken ct = default)
|
||||
{
|
||||
if (_nodeId is null)
|
||||
throw new InvalidOperationException("State sync not initialized");
|
||||
|
||||
var version = IncrementVersion(key);
|
||||
|
||||
var tombstone = new StateEntry
|
||||
{
|
||||
Key = key,
|
||||
Value = null!,
|
||||
Version = version,
|
||||
UpdatedBy = _nodeId,
|
||||
UpdatedAt = _timeProvider.GetUtcNow(),
|
||||
IsDeleted = true
|
||||
};
|
||||
|
||||
_localState[key] = tombstone;
|
||||
|
||||
await BroadcastUpdateAsync(tombstone, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all keys in the state.
|
||||
/// </summary>
|
||||
public ImmutableArray<string> GetKeys()
|
||||
{
|
||||
return _localState
|
||||
.Where(kv => !kv.Value.IsDeleted)
|
||||
.Select(kv => kv.Key)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all entries matching a prefix.
|
||||
/// </summary>
|
||||
public ImmutableArray<StateEntry> GetByPrefix(string prefix)
|
||||
{
|
||||
return _localState
|
||||
.Where(kv => kv.Key.StartsWith(prefix, StringComparison.Ordinal) && !kv.Value.IsDeleted)
|
||||
.Select(kv => kv.Value)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets sync status for this node.
|
||||
/// </summary>
|
||||
public SyncStatus GetSyncStatus()
|
||||
{
|
||||
return new SyncStatus
|
||||
{
|
||||
NodeId = _nodeId ?? "unknown",
|
||||
EntryCount = _localState.Count(kv => !kv.Value.IsDeleted),
|
||||
TombstoneCount = _localState.Count(kv => kv.Value.IsDeleted),
|
||||
PeerCount = _peerLastSeen.Count,
|
||||
LastSyncAt = _peerLastSeen.Values.DefaultIfEmpty().Max(),
|
||||
IsHealthy = _peerLastSeen.Count > 0 || _localState.IsEmpty
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Forces immediate sync with all peers.
|
||||
/// </summary>
|
||||
public async Task ForceSyncAsync(CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Forcing full sync");
|
||||
|
||||
var peers = await _transport.GetPeersAsync(ct);
|
||||
|
||||
foreach (var peer in peers)
|
||||
{
|
||||
try
|
||||
{
|
||||
await SyncWithPeerAsync(peer, ct);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Force sync failed with peer {Peer}", peer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compares local state with a peer's state.
|
||||
/// </summary>
|
||||
public async Task<SyncDiff> CompareWithPeerAsync(string peerId, CancellationToken ct = default)
|
||||
{
|
||||
var peerDigest = await _transport.GetDigestAsync(peerId, ct);
|
||||
var localDigest = ComputeDigest();
|
||||
|
||||
var missingLocally = peerDigest.Entries
|
||||
.Where(pe => !localDigest.Entries.Any(le => le.Key == pe.Key && le.Version.CompareTo(pe.Version) >= 0))
|
||||
.ToImmutableArray();
|
||||
|
||||
var missingOnPeer = localDigest.Entries
|
||||
.Where(le => !peerDigest.Entries.Any(pe => pe.Key == le.Key && pe.Version.CompareTo(le.Version) >= 0))
|
||||
.ToImmutableArray();
|
||||
|
||||
return new SyncDiff
|
||||
{
|
||||
MissingLocally = missingLocally.Length,
|
||||
MissingOnPeer = missingOnPeer.Length,
|
||||
InSync = missingLocally.Length == 0 && missingOnPeer.Length == 0
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Event raised when state changes.
|
||||
/// </summary>
|
||||
public event EventHandler<StateChangedEventArgs>? StateChanged;
|
||||
|
||||
private void HandleSyncMessage(object? sender, SyncMessageEventArgs e)
|
||||
{
|
||||
_ = Task.Run(async () =>
|
||||
{
|
||||
try
|
||||
{
|
||||
await ProcessSyncMessageAsync(e.Message);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error processing sync message from {Sender}", e.Message.SenderId);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private async Task ProcessSyncMessageAsync(SyncMessage message)
|
||||
{
|
||||
switch (message.Type)
|
||||
{
|
||||
case SyncMessageType.Update:
|
||||
await ProcessUpdateAsync(message.Entry!);
|
||||
break;
|
||||
|
||||
case SyncMessageType.DigestRequest:
|
||||
await SendDigestAsync(message.SenderId);
|
||||
break;
|
||||
|
||||
case SyncMessageType.DigestResponse:
|
||||
await ProcessDigestAsync(message.SenderId, message.Digest!);
|
||||
break;
|
||||
|
||||
case SyncMessageType.FullSync:
|
||||
await ProcessFullSyncAsync(message.Entries!);
|
||||
break;
|
||||
}
|
||||
|
||||
_peerLastSeen[message.SenderId] = _timeProvider.GetUtcNow();
|
||||
}
|
||||
|
||||
private async Task ProcessUpdateAsync(StateEntry entry)
|
||||
{
|
||||
if (_localState.TryGetValue(entry.Key, out var existing))
|
||||
{
|
||||
// Compare versions
|
||||
var comparison = CompareVersions(entry.Version, existing.Version);
|
||||
|
||||
if (comparison <= 0)
|
||||
{
|
||||
// Our version is newer or equal, ignore
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Accept the update
|
||||
_localState[entry.Key] = entry;
|
||||
_vectorClocks[entry.Key] = entry.Version;
|
||||
|
||||
_logger.LogDebug("Accepted state update: {Key} = {Version} from {Node}",
|
||||
entry.Key, entry.Version, entry.UpdatedBy);
|
||||
|
||||
OnStateChanged(entry, StateChangeType.RemoteUpdate);
|
||||
|
||||
await Task.CompletedTask;
|
||||
}
|
||||
|
||||
private async Task ProcessDigestAsync(string peerId, StateDigest peerDigest)
|
||||
{
|
||||
var entriesToSend = new List<StateEntry>();
|
||||
var keysToRequest = new List<string>();
|
||||
|
||||
foreach (var peerEntry in peerDigest.Entries)
|
||||
{
|
||||
if (_localState.TryGetValue(peerEntry.Key, out var local))
|
||||
{
|
||||
var comparison = CompareVersions(peerEntry.Version, local.Version);
|
||||
|
||||
if (comparison > 0)
|
||||
{
|
||||
// Peer has newer version
|
||||
keysToRequest.Add(peerEntry.Key);
|
||||
}
|
||||
else if (comparison < 0)
|
||||
{
|
||||
// We have newer version
|
||||
entriesToSend.Add(local);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// We don't have this key
|
||||
keysToRequest.Add(peerEntry.Key);
|
||||
}
|
||||
}
|
||||
|
||||
// Send our newer entries
|
||||
if (entriesToSend.Count > 0)
|
||||
{
|
||||
await _transport.SendAsync(peerId, new SyncMessage
|
||||
{
|
||||
Type = SyncMessageType.FullSync,
|
||||
SenderId = _nodeId!,
|
||||
Entries = entriesToSend.ToImmutableArray()
|
||||
});
|
||||
}
|
||||
|
||||
// Request entries we need
|
||||
if (keysToRequest.Count > 0)
|
||||
{
|
||||
await _transport.RequestEntriesAsync(peerId, keysToRequest.ToImmutableArray());
|
||||
}
|
||||
}
|
||||
|
||||
private async Task ProcessFullSyncAsync(ImmutableArray<StateEntry> entries)
|
||||
{
|
||||
foreach (var entry in entries)
|
||||
{
|
||||
await ProcessUpdateAsync(entry);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task BroadcastUpdateAsync(StateEntry entry, CancellationToken ct)
|
||||
{
|
||||
var message = new SyncMessage
|
||||
{
|
||||
Type = SyncMessageType.Update,
|
||||
SenderId = _nodeId!,
|
||||
Entry = entry
|
||||
};
|
||||
|
||||
var peers = await _transport.GetPeersAsync(ct);
|
||||
|
||||
foreach (var peer in peers)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _transport.SendAsync(peer, message, ct);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to broadcast update to peer {Peer}", peer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task SendDigestAsync(string peerId)
|
||||
{
|
||||
var digest = ComputeDigest();
|
||||
|
||||
await _transport.SendAsync(peerId, new SyncMessage
|
||||
{
|
||||
Type = SyncMessageType.DigestResponse,
|
||||
SenderId = _nodeId!,
|
||||
Digest = digest
|
||||
});
|
||||
}
|
||||
|
||||
private StateDigest ComputeDigest()
|
||||
{
|
||||
var entries = _localState.Select(kv => new DigestEntry
|
||||
{
|
||||
Key = kv.Key,
|
||||
Version = kv.Value.Version,
|
||||
Checksum = kv.Value.Checksum
|
||||
}).ToImmutableArray();
|
||||
|
||||
return new StateDigest
|
||||
{
|
||||
NodeId = _nodeId!,
|
||||
Entries = entries,
|
||||
ComputedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
private async Task PeriodicSyncLoopAsync(CancellationToken ct)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_config.SyncInterval, ct);
|
||||
|
||||
// Persist state periodically
|
||||
await PersistStateAsync(ct);
|
||||
|
||||
// Cleanup old tombstones
|
||||
CleanupTombstones();
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in periodic sync loop");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task GossipLoopAsync(CancellationToken ct)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_config.GossipInterval, ct);
|
||||
|
||||
// Pick random peer to gossip with
|
||||
var peers = await _transport.GetPeersAsync(ct);
|
||||
if (peers.Length == 0) continue;
|
||||
|
||||
var randomPeer = peers[Random.Shared.Next(peers.Length)];
|
||||
|
||||
await SyncWithPeerAsync(randomPeer, ct);
|
||||
}
|
||||
catch (OperationCanceledException) when (ct.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in gossip loop");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task SyncWithPeerAsync(string peerId, CancellationToken ct)
|
||||
{
|
||||
await _transport.SendAsync(peerId, new SyncMessage
|
||||
{
|
||||
Type = SyncMessageType.DigestRequest,
|
||||
SenderId = _nodeId!
|
||||
}, ct);
|
||||
}
|
||||
|
||||
private async Task PersistStateAsync(CancellationToken ct)
|
||||
{
|
||||
var entries = _localState.Values
|
||||
.Where(e => !e.IsDeleted)
|
||||
.ToImmutableArray();
|
||||
|
||||
await _stateStore.SaveAsync(entries, ct);
|
||||
|
||||
_logger.LogDebug("Persisted {Count} state entries", entries.Length);
|
||||
}
|
||||
|
||||
private void CleanupTombstones()
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var cutoff = now - _config.TombstoneRetention;
|
||||
|
||||
var toRemove = _localState
|
||||
.Where(kv => kv.Value.IsDeleted && kv.Value.UpdatedAt < cutoff)
|
||||
.Select(kv => kv.Key)
|
||||
.ToList();
|
||||
|
||||
foreach (var key in toRemove)
|
||||
{
|
||||
_localState.TryRemove(key, out _);
|
||||
_vectorClocks.TryRemove(key, out _);
|
||||
}
|
||||
|
||||
if (toRemove.Count > 0)
|
||||
{
|
||||
_logger.LogDebug("Cleaned up {Count} tombstones", toRemove.Count);
|
||||
}
|
||||
}
|
||||
|
||||
private VectorClock IncrementVersion(string key)
|
||||
{
|
||||
if (_vectorClocks.TryGetValue(key, out var existing))
|
||||
{
|
||||
return existing.Increment(_nodeId!);
|
||||
}
|
||||
|
||||
return new VectorClock().Increment(_nodeId!);
|
||||
}
|
||||
|
||||
private static int CompareVersions(VectorClock a, VectorClock b)
|
||||
{
|
||||
return a.CompareTo(b);
|
||||
}
|
||||
|
||||
private static string ComputeChecksum(string value)
|
||||
{
|
||||
var hash = SHA256.HashData(Encoding.UTF8.GetBytes(value));
|
||||
return Convert.ToBase64String(hash)[..16];
|
||||
}
|
||||
|
||||
private void OnStateChanged(StateEntry entry, StateChangeType changeType)
|
||||
{
|
||||
StateChanged?.Invoke(this, new StateChangedEventArgs
|
||||
{
|
||||
Key = entry.Key,
|
||||
Entry = entry,
|
||||
ChangeType = changeType
|
||||
});
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
await StopAsync();
|
||||
}
|
||||
}
|
||||
|
||||
#region Vector Clock
|
||||
|
||||
/// <summary>
|
||||
/// Vector clock for distributed versioning.
|
||||
/// </summary>
|
||||
public sealed class VectorClock : IComparable<VectorClock>
|
||||
{
|
||||
private readonly ImmutableDictionary<string, long> _clocks;
|
||||
|
||||
public VectorClock()
|
||||
{
|
||||
_clocks = ImmutableDictionary<string, long>.Empty;
|
||||
}
|
||||
|
||||
private VectorClock(ImmutableDictionary<string, long> clocks)
|
||||
{
|
||||
_clocks = clocks;
|
||||
}
|
||||
|
||||
public VectorClock Increment(string nodeId)
|
||||
{
|
||||
var current = _clocks.GetValueOrDefault(nodeId, 0);
|
||||
return new VectorClock(_clocks.SetItem(nodeId, current + 1));
|
||||
}
|
||||
|
||||
public VectorClock Merge(VectorClock other)
|
||||
{
|
||||
var merged = _clocks;
|
||||
|
||||
foreach (var (nodeId, clock) in other._clocks)
|
||||
{
|
||||
var current = merged.GetValueOrDefault(nodeId, 0);
|
||||
merged = merged.SetItem(nodeId, Math.Max(current, clock));
|
||||
}
|
||||
|
||||
return new VectorClock(merged);
|
||||
}
|
||||
|
||||
public int CompareTo(VectorClock? other)
|
||||
{
|
||||
if (other is null) return 1;
|
||||
|
||||
var allNodes = _clocks.Keys.Union(other._clocks.Keys).ToList();
|
||||
|
||||
bool thisGreater = false;
|
||||
bool otherGreater = false;
|
||||
|
||||
foreach (var node in allNodes)
|
||||
{
|
||||
var thisValue = _clocks.GetValueOrDefault(node, 0);
|
||||
var otherValue = other._clocks.GetValueOrDefault(node, 0);
|
||||
|
||||
if (thisValue > otherValue) thisGreater = true;
|
||||
if (otherValue > thisValue) otherGreater = true;
|
||||
}
|
||||
|
||||
if (thisGreater && !otherGreater) return 1; // This is newer
|
||||
if (otherGreater && !thisGreater) return -1; // Other is newer
|
||||
if (thisGreater && otherGreater) return 0; // Concurrent (conflict)
|
||||
return 0; // Equal
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
{
|
||||
return string.Join(",", _clocks.Select(kv => $"{kv.Key}:{kv.Value}"));
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IStateSync
|
||||
{
|
||||
Task InitializeAsync(string nodeId, CancellationToken ct = default);
|
||||
Task StartAsync(CancellationToken ct = default);
|
||||
Task StopAsync();
|
||||
Task SetAsync<T>(string key, T value, CancellationToken ct = default);
|
||||
Task<T?> GetAsync<T>(string key, CancellationToken ct = default);
|
||||
Task<StateEntry?> GetEntryAsync(string key, CancellationToken ct = default);
|
||||
Task DeleteAsync(string key, CancellationToken ct = default);
|
||||
ImmutableArray<string> GetKeys();
|
||||
ImmutableArray<StateEntry> GetByPrefix(string prefix);
|
||||
SyncStatus GetSyncStatus();
|
||||
Task ForceSyncAsync(CancellationToken ct = default);
|
||||
Task<SyncDiff> CompareWithPeerAsync(string peerId, CancellationToken ct = default);
|
||||
event EventHandler<StateChangedEventArgs>? StateChanged;
|
||||
}
|
||||
|
||||
public interface IStateSyncTransport
|
||||
{
|
||||
Task<ImmutableArray<string>> GetPeersAsync(CancellationToken ct = default);
|
||||
Task SendAsync(string peerId, SyncMessage message, CancellationToken ct = default);
|
||||
Task<StateDigest> GetDigestAsync(string peerId, CancellationToken ct = default);
|
||||
Task RequestEntriesAsync(string peerId, ImmutableArray<string> keys, CancellationToken ct = default);
|
||||
event EventHandler<SyncMessageEventArgs>? OnSyncMessage;
|
||||
}
|
||||
|
||||
public interface IStateStore
|
||||
{
|
||||
Task<ImmutableArray<StateEntry>> LoadAsync(CancellationToken ct = default);
|
||||
Task SaveAsync(ImmutableArray<StateEntry> entries, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record StateSyncConfig
|
||||
{
|
||||
public TimeSpan SyncInterval { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public TimeSpan GossipInterval { get; init; } = TimeSpan.FromSeconds(10);
|
||||
public TimeSpan TombstoneRetention { get; init; } = TimeSpan.FromHours(24);
|
||||
}
|
||||
|
||||
public sealed record StateEntry
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required string Value { get; init; }
|
||||
public required VectorClock Version { get; init; }
|
||||
public required string UpdatedBy { get; init; }
|
||||
public required DateTimeOffset UpdatedAt { get; init; }
|
||||
public string? Checksum { get; init; }
|
||||
public bool IsDeleted { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SyncMessage
|
||||
{
|
||||
public required SyncMessageType Type { get; init; }
|
||||
public required string SenderId { get; init; }
|
||||
public StateEntry? Entry { get; init; }
|
||||
public StateDigest? Digest { get; init; }
|
||||
public ImmutableArray<StateEntry> Entries { get; init; } = [];
|
||||
}
|
||||
|
||||
public enum SyncMessageType { Update, DigestRequest, DigestResponse, FullSync }
|
||||
|
||||
public sealed record StateDigest
|
||||
{
|
||||
public required string NodeId { get; init; }
|
||||
public required ImmutableArray<DigestEntry> Entries { get; init; }
|
||||
public required DateTimeOffset ComputedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DigestEntry
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required VectorClock Version { get; init; }
|
||||
public string? Checksum { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SyncStatus
|
||||
{
|
||||
public required string NodeId { get; init; }
|
||||
public required int EntryCount { get; init; }
|
||||
public required int TombstoneCount { get; init; }
|
||||
public required int PeerCount { get; init; }
|
||||
public DateTimeOffset? LastSyncAt { get; init; }
|
||||
public required bool IsHealthy { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SyncDiff
|
||||
{
|
||||
public required int MissingLocally { get; init; }
|
||||
public required int MissingOnPeer { get; init; }
|
||||
public required bool InSync { get; init; }
|
||||
}
|
||||
|
||||
public sealed class SyncMessageEventArgs : EventArgs
|
||||
{
|
||||
public required SyncMessage Message { get; init; }
|
||||
}
|
||||
|
||||
public sealed class StateChangedEventArgs : EventArgs
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required StateEntry Entry { get; init; }
|
||||
public required StateChangeType ChangeType { get; init; }
|
||||
}
|
||||
|
||||
public enum StateChangeType { LocalUpdate, RemoteUpdate, Deleted }
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,368 @@
|
||||
// Copyright (c) Stella Ops. All rights reserved. SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
|
||||
using System.Security.Cryptography;
|
||||
|
||||
namespace StellaOps.Agent.Core.Updates;
|
||||
|
||||
/// <summary>
|
||||
/// Agent update manager for safe binary auto-updates.
|
||||
/// </summary>
|
||||
public sealed class AgentUpdateManager : IAgentUpdateManager
|
||||
{
|
||||
private readonly IUpdateChannel _updateChannel;
|
||||
private readonly IPackageVerifier _packageVerifier;
|
||||
private readonly IRollbackManager _rollbackManager;
|
||||
private readonly IAgentHealthVerifier _healthVerifier;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly UpdateManagerOptions _options;
|
||||
|
||||
public AgentUpdateManager(
|
||||
IUpdateChannel updateChannel,
|
||||
IPackageVerifier packageVerifier,
|
||||
IRollbackManager rollbackManager,
|
||||
IAgentHealthVerifier healthVerifier,
|
||||
TimeProvider timeProvider,
|
||||
UpdateManagerOptions? options = null)
|
||||
{
|
||||
_updateChannel = updateChannel;
|
||||
_packageVerifier = packageVerifier;
|
||||
_rollbackManager = rollbackManager;
|
||||
_healthVerifier = healthVerifier;
|
||||
_timeProvider = timeProvider;
|
||||
_options = options ?? new UpdateManagerOptions();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks for available updates.
|
||||
/// </summary>
|
||||
public async Task<UpdateCheckResult> CheckForUpdateAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var currentVersion = GetCurrentVersion();
|
||||
var availableUpdate = await _updateChannel.GetLatestVersionAsync(cancellationToken);
|
||||
|
||||
if (availableUpdate == null)
|
||||
{
|
||||
return new UpdateCheckResult
|
||||
{
|
||||
UpdateAvailable = false,
|
||||
CurrentVersion = currentVersion,
|
||||
Message = "No updates available"
|
||||
};
|
||||
}
|
||||
|
||||
var isNewer = Version.Parse(availableUpdate.Version) > Version.Parse(currentVersion);
|
||||
|
||||
return new UpdateCheckResult
|
||||
{
|
||||
UpdateAvailable = isNewer,
|
||||
CurrentVersion = currentVersion,
|
||||
AvailableVersion = availableUpdate.Version,
|
||||
ReleaseNotes = availableUpdate.ReleaseNotes,
|
||||
DownloadSize = availableUpdate.PackageSize,
|
||||
Message = isNewer ? $"Update available: {availableUpdate.Version}" : "Already on latest version"
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks and applies updates if available.
|
||||
/// </summary>
|
||||
public async Task<UpdateResult> CheckAndApplyUpdateAsync(
|
||||
UpdateOptions? options = null,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
options ??= new UpdateOptions();
|
||||
|
||||
// Check maintenance window
|
||||
if (_options.MaintenanceWindow != null && !IsInMaintenanceWindow())
|
||||
{
|
||||
return UpdateResult.Skipped("Not in maintenance window");
|
||||
}
|
||||
|
||||
// Check for updates
|
||||
var checkResult = await CheckForUpdateAsync(cancellationToken);
|
||||
if (!checkResult.UpdateAvailable)
|
||||
{
|
||||
return UpdateResult.Skipped("No update available");
|
||||
}
|
||||
|
||||
var targetVersion = options.TargetVersion ?? checkResult.AvailableVersion!;
|
||||
|
||||
// Download package
|
||||
var package = await _updateChannel.DownloadPackageAsync(targetVersion, cancellationToken);
|
||||
|
||||
// Verify signature
|
||||
var verificationResult = await _packageVerifier.VerifyAsync(package, cancellationToken);
|
||||
if (!verificationResult.IsValid)
|
||||
{
|
||||
return UpdateResult.Failed($"Package verification failed: {verificationResult.Error}");
|
||||
}
|
||||
|
||||
// Create rollback point
|
||||
var rollbackPoint = await _rollbackManager.CreateRollbackPointAsync(cancellationToken);
|
||||
|
||||
try
|
||||
{
|
||||
// Drain tasks if configured
|
||||
if (_options.DrainTasksBeforeUpdate)
|
||||
{
|
||||
await DrainTasksAsync(cancellationToken);
|
||||
}
|
||||
|
||||
// Apply update
|
||||
await ApplyPackageAsync(package, cancellationToken);
|
||||
|
||||
// Verify health after update
|
||||
var healthCheck = await _healthVerifier.VerifyHealthAsync(cancellationToken);
|
||||
if (!healthCheck.IsHealthy)
|
||||
{
|
||||
// Rollback
|
||||
await _rollbackManager.RollbackAsync(rollbackPoint, cancellationToken);
|
||||
return UpdateResult.Failed($"Health check failed after update: {healthCheck.Message}");
|
||||
}
|
||||
|
||||
return UpdateResult.Success(checkResult.CurrentVersion!, targetVersion);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
// Attempt rollback
|
||||
try
|
||||
{
|
||||
await _rollbackManager.RollbackAsync(rollbackPoint, cancellationToken);
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Rollback failed - critical state
|
||||
}
|
||||
|
||||
return UpdateResult.Failed($"Update failed: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rolls back to the previous version.
|
||||
/// </summary>
|
||||
public async Task<RollbackResult> RollbackAsync(CancellationToken cancellationToken = default)
|
||||
{
|
||||
var rollbackPoint = await _rollbackManager.GetLatestRollbackPointAsync(cancellationToken);
|
||||
if (rollbackPoint == null)
|
||||
{
|
||||
return RollbackResult.Failed("No rollback point available");
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await _rollbackManager.RollbackAsync(rollbackPoint, cancellationToken);
|
||||
return RollbackResult.Success(rollbackPoint.Version);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return RollbackResult.Failed($"Rollback failed: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
private static string GetCurrentVersion()
|
||||
{
|
||||
var assembly = typeof(AgentUpdateManager).Assembly;
|
||||
var version = assembly.GetName().Version;
|
||||
return version?.ToString(3) ?? "0.0.0";
|
||||
}
|
||||
|
||||
private bool IsInMaintenanceWindow()
|
||||
{
|
||||
if (_options.MaintenanceWindow == null) return true;
|
||||
|
||||
var now = _timeProvider.GetLocalNow();
|
||||
var window = _options.MaintenanceWindow;
|
||||
|
||||
if (!window.Days.Contains(now.DayOfWeek)) return false;
|
||||
|
||||
var currentTime = TimeOnly.FromDateTime(now.DateTime);
|
||||
return currentTime >= window.StartTime && currentTime <= window.EndTime;
|
||||
}
|
||||
|
||||
private Task DrainTasksAsync(CancellationToken cancellationToken)
|
||||
{
|
||||
// Signal task executor to stop accepting new tasks and wait for completion
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
private Task ApplyPackageAsync(UpdatePackage package, CancellationToken cancellationToken)
|
||||
{
|
||||
// Extract and replace binaries
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update manager interface.
|
||||
/// </summary>
|
||||
public interface IAgentUpdateManager
|
||||
{
|
||||
Task<UpdateCheckResult> CheckForUpdateAsync(CancellationToken cancellationToken = default);
|
||||
Task<UpdateResult> CheckAndApplyUpdateAsync(UpdateOptions? options = null, CancellationToken cancellationToken = default);
|
||||
Task<RollbackResult> RollbackAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update check result.
|
||||
/// </summary>
|
||||
public sealed record UpdateCheckResult
|
||||
{
|
||||
public required bool UpdateAvailable { get; init; }
|
||||
public string? CurrentVersion { get; init; }
|
||||
public string? AvailableVersion { get; init; }
|
||||
public string? ReleaseNotes { get; init; }
|
||||
public long? DownloadSize { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update options.
|
||||
/// </summary>
|
||||
public sealed record UpdateOptions
|
||||
{
|
||||
public string? TargetVersion { get; init; }
|
||||
public bool Force { get; init; } = false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update result.
|
||||
/// </summary>
|
||||
public sealed record UpdateResult
|
||||
{
|
||||
public required bool IsSuccess { get; init; }
|
||||
public bool WasSkipped { get; init; }
|
||||
public string? FromVersion { get; init; }
|
||||
public string? ToVersion { get; init; }
|
||||
public string? Error { get; init; }
|
||||
|
||||
public static UpdateResult Success(string from, string to) =>
|
||||
new() { IsSuccess = true, FromVersion = from, ToVersion = to };
|
||||
|
||||
public static UpdateResult Failed(string error) =>
|
||||
new() { IsSuccess = false, Error = error };
|
||||
|
||||
public static UpdateResult Skipped(string reason) =>
|
||||
new() { IsSuccess = true, WasSkipped = true, Error = reason };
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rollback result.
|
||||
/// </summary>
|
||||
public sealed record RollbackResult
|
||||
{
|
||||
public required bool IsSuccess { get; init; }
|
||||
public string? RestoredVersion { get; init; }
|
||||
public string? Error { get; init; }
|
||||
|
||||
public static RollbackResult Success(string version) =>
|
||||
new() { IsSuccess = true, RestoredVersion = version };
|
||||
|
||||
public static RollbackResult Failed(string error) =>
|
||||
new() { IsSuccess = false, Error = error };
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update manager options.
|
||||
/// </summary>
|
||||
public sealed record UpdateManagerOptions
|
||||
{
|
||||
public bool DrainTasksBeforeUpdate { get; init; } = true;
|
||||
public TimeSpan DrainTimeout { get; init; } = TimeSpan.FromMinutes(5);
|
||||
public UpdateMaintenanceWindow? MaintenanceWindow { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update maintenance window.
|
||||
/// </summary>
|
||||
public sealed record UpdateMaintenanceWindow
|
||||
{
|
||||
public DayOfWeek[] Days { get; init; } = [DayOfWeek.Saturday, DayOfWeek.Sunday];
|
||||
public TimeOnly StartTime { get; init; } = new(2, 0);
|
||||
public TimeOnly EndTime { get; init; } = new(6, 0);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update channel interface.
|
||||
/// </summary>
|
||||
public interface IUpdateChannel
|
||||
{
|
||||
Task<AvailableUpdate?> GetLatestVersionAsync(CancellationToken cancellationToken = default);
|
||||
Task<UpdatePackage> DownloadPackageAsync(string version, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Available update info.
|
||||
/// </summary>
|
||||
public sealed record AvailableUpdate
|
||||
{
|
||||
public required string Version { get; init; }
|
||||
public string? ReleaseNotes { get; init; }
|
||||
public long PackageSize { get; init; }
|
||||
public string? Checksum { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Update package.
|
||||
/// </summary>
|
||||
public sealed record UpdatePackage
|
||||
{
|
||||
public required string Version { get; init; }
|
||||
public required byte[] Content { get; init; }
|
||||
public required string Signature { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Package verifier interface.
|
||||
/// </summary>
|
||||
public interface IPackageVerifier
|
||||
{
|
||||
Task<PackageVerificationResult> VerifyAsync(UpdatePackage package, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Package verification result.
|
||||
/// </summary>
|
||||
public sealed record PackageVerificationResult
|
||||
{
|
||||
public required bool IsValid { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rollback manager interface.
|
||||
/// </summary>
|
||||
public interface IRollbackManager
|
||||
{
|
||||
Task<RollbackPoint> CreateRollbackPointAsync(CancellationToken cancellationToken = default);
|
||||
Task<RollbackPoint?> GetLatestRollbackPointAsync(CancellationToken cancellationToken = default);
|
||||
Task RollbackAsync(RollbackPoint point, CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Rollback point.
|
||||
/// </summary>
|
||||
public sealed record RollbackPoint
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public required string BackupPath { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Agent health verifier interface.
|
||||
/// </summary>
|
||||
public interface IAgentHealthVerifier
|
||||
{
|
||||
Task<HealthVerificationResult> VerifyHealthAsync(CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Health verification result.
|
||||
/// </summary>
|
||||
public sealed record HealthVerificationResult
|
||||
{
|
||||
public required bool IsHealthy { get; init; }
|
||||
public string? Message { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,913 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AgentClusterController.cs
|
||||
// Sprint: SPRINT_20260117_034_ReleaseOrchestrator_agent_resilience
|
||||
// Task: TASK-034-08 - REST API for cluster and agent management
|
||||
// Description: API endpoints for cluster management, health, failover, and sync
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Agent.WebApi.Controllers;
|
||||
|
||||
/// <summary>
|
||||
/// REST API for agent cluster management including health monitoring,
|
||||
/// leader election, failover, and state synchronization.
|
||||
/// </summary>
|
||||
[ApiController]
|
||||
[Route("api/v1/agent-cluster")]
|
||||
[Authorize]
|
||||
public sealed class AgentClusterController : ControllerBase
|
||||
{
|
||||
private readonly IAgentClusterManager _clusterManager;
|
||||
private readonly IHealthMonitor _healthMonitor;
|
||||
private readonly ILeaderElection _leaderElection;
|
||||
private readonly IFailoverManager _failoverManager;
|
||||
private readonly ISelfHealer _selfHealer;
|
||||
private readonly IStateSync _stateSync;
|
||||
private readonly ILogger<AgentClusterController> _logger;
|
||||
|
||||
public AgentClusterController(
|
||||
IAgentClusterManager clusterManager,
|
||||
IHealthMonitor healthMonitor,
|
||||
ILeaderElection leaderElection,
|
||||
IFailoverManager failoverManager,
|
||||
ISelfHealer selfHealer,
|
||||
IStateSync stateSync,
|
||||
ILogger<AgentClusterController> logger)
|
||||
{
|
||||
_clusterManager = clusterManager;
|
||||
_healthMonitor = healthMonitor;
|
||||
_leaderElection = leaderElection;
|
||||
_failoverManager = failoverManager;
|
||||
_selfHealer = selfHealer;
|
||||
_stateSync = stateSync;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
#region Cluster Status Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Gets current cluster status.
|
||||
/// </summary>
|
||||
[HttpGet("status")]
|
||||
[ProducesResponseType(typeof(ClusterStatusResponse), StatusCodes.Status200OK)]
|
||||
public ActionResult<ClusterStatusResponse> GetClusterStatus()
|
||||
{
|
||||
var status = _clusterManager.GetClusterStatus();
|
||||
var healthStatuses = _healthMonitor.GetAllAgentStatuses();
|
||||
|
||||
return Ok(new ClusterStatusResponse
|
||||
{
|
||||
ClusterId = status.ClusterId,
|
||||
Mode = status.Mode.ToString(),
|
||||
State = status.State.ToString(),
|
||||
MemberCount = status.MemberCount,
|
||||
HealthyCount = healthStatuses.Count(kv => kv.Value == AgentHealthStatus.Healthy),
|
||||
LeaderId = status.LeaderId,
|
||||
Members = status.Members.Select(m => new ClusterMemberDto
|
||||
{
|
||||
AgentId = m.AgentId,
|
||||
Endpoint = $"{m.Endpoint.Host}:{m.Endpoint.Port}",
|
||||
Role = m.Role.ToString(),
|
||||
Status = healthStatuses.GetValueOrDefault(m.AgentId).ToString(),
|
||||
JoinedAt = m.JoinedAt
|
||||
}).ToList(),
|
||||
UpdatedAt = status.UpdatedAt
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets cluster configuration.
|
||||
/// </summary>
|
||||
[HttpGet("config")]
|
||||
[ProducesResponseType(typeof(ClusterConfigResponse), StatusCodes.Status200OK)]
|
||||
public ActionResult<ClusterConfigResponse> GetClusterConfig()
|
||||
{
|
||||
var config = _clusterManager.GetConfiguration();
|
||||
|
||||
return Ok(new ClusterConfigResponse
|
||||
{
|
||||
Mode = config.Mode.ToString(),
|
||||
MinQuorum = config.MinQuorum,
|
||||
HeartbeatInterval = config.HeartbeatInterval,
|
||||
FailoverTimeout = config.FailoverTimeout,
|
||||
MaxRetries = config.MaxRetries
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates cluster configuration.
|
||||
/// </summary>
|
||||
[HttpPut("config")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> UpdateClusterConfig(
|
||||
[FromBody] UpdateClusterConfigRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
await _clusterManager.UpdateConfigurationAsync(new ClusterConfig
|
||||
{
|
||||
Mode = Enum.Parse<ClusterMode>(request.Mode, ignoreCase: true),
|
||||
MinQuorum = request.MinQuorum,
|
||||
HeartbeatInterval = request.HeartbeatInterval,
|
||||
FailoverTimeout = request.FailoverTimeout,
|
||||
MaxRetries = request.MaxRetries
|
||||
}, ct);
|
||||
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Agent Health Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Gets health assessment for all agents.
|
||||
/// </summary>
|
||||
[HttpGet("health")]
|
||||
[ProducesResponseType(typeof(ClusterHealthResponse), StatusCodes.Status200OK)]
|
||||
public async Task<ActionResult<ClusterHealthResponse>> GetClusterHealth(CancellationToken ct)
|
||||
{
|
||||
var assessments = await _healthMonitor.AssessAllAgentsAsync(ct);
|
||||
|
||||
return Ok(new ClusterHealthResponse
|
||||
{
|
||||
OverallStatus = DetermineOverallStatus(assessments),
|
||||
Agents = assessments.Select(MapToHealthDto).ToList(),
|
||||
AssessedAt = DateTimeOffset.UtcNow
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets health assessment for a specific agent.
|
||||
/// </summary>
|
||||
[HttpGet("agents/{agentId}/health")]
|
||||
[ProducesResponseType(typeof(AgentHealthDto), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(StatusCodes.Status404NotFound)]
|
||||
public async Task<ActionResult<AgentHealthDto>> GetAgentHealth(
|
||||
string agentId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
var assessment = await _healthMonitor.AssessHealthAsync(agentId, ct);
|
||||
return Ok(MapToHealthDto(assessment));
|
||||
}
|
||||
catch (InvalidOperationException)
|
||||
{
|
||||
return NotFound(new ProblemDetails
|
||||
{
|
||||
Title = "Agent not found",
|
||||
Detail = $"Agent {agentId} is not registered in the cluster"
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets agents by health status.
|
||||
/// </summary>
|
||||
[HttpGet("health/by-status/{status}")]
|
||||
[ProducesResponseType(typeof(ImmutableArray<string>), StatusCodes.Status200OK)]
|
||||
public ActionResult<ImmutableArray<string>> GetAgentsByHealthStatus(string status)
|
||||
{
|
||||
var healthStatus = Enum.Parse<AgentHealthStatus>(status, ignoreCase: true);
|
||||
var agents = _healthMonitor.GetAgentsByStatus(healthStatus);
|
||||
return Ok(agents);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Leader Election Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Gets current leader for a resource.
|
||||
/// </summary>
|
||||
[HttpGet("leader/{resourceKey}")]
|
||||
[ProducesResponseType(typeof(LeaderInfoResponse), StatusCodes.Status200OK)]
|
||||
public async Task<ActionResult<LeaderInfoResponse>> GetLeader(
|
||||
string resourceKey,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var leaderId = await _leaderElection.GetLeaderAsync(resourceKey, ct);
|
||||
var state = _leaderElection.GetElectionState(resourceKey);
|
||||
|
||||
return Ok(new LeaderInfoResponse
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
LeaderId = leaderId,
|
||||
Term = state?.Term ?? 0,
|
||||
ElectedAt = state?.ElectedAt,
|
||||
LeaseExpiresAt = state?.LeaseExpiresAt,
|
||||
IsThisNode = _leaderElection.IsLeader(resourceKey)
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Triggers leader election for a resource.
|
||||
/// </summary>
|
||||
[HttpPost("leader/{resourceKey}/elect")]
|
||||
[ProducesResponseType(typeof(ElectionResultResponse), StatusCodes.Status200OK)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult<ElectionResultResponse>> TriggerElection(
|
||||
string resourceKey,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var result = await _leaderElection.ParticipateAsync(resourceKey, ct);
|
||||
|
||||
return Ok(new ElectionResultResponse
|
||||
{
|
||||
ResourceKey = resourceKey,
|
||||
Success = result.Success,
|
||||
IsLeader = result.IsLeader,
|
||||
LeaderId = result.LeaderId,
|
||||
Term = result.Term,
|
||||
Error = result.Error
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resigns leadership for a resource.
|
||||
/// </summary>
|
||||
[HttpPost("leader/{resourceKey}/resign")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> ResignLeadership(
|
||||
string resourceKey,
|
||||
CancellationToken ct)
|
||||
{
|
||||
await _leaderElection.ResignAsync(resourceKey, ct);
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all resources where this node is leader.
|
||||
/// </summary>
|
||||
[HttpGet("leader/my-leaderships")]
|
||||
[ProducesResponseType(typeof(ImmutableArray<string>), StatusCodes.Status200OK)]
|
||||
public ActionResult<ImmutableArray<string>> GetMyLeaderships()
|
||||
{
|
||||
var leaderships = _leaderElection.GetLeaderships();
|
||||
return Ok(leaderships);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Failover Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Triggers manual failover for an agent.
|
||||
/// </summary>
|
||||
[HttpPost("agents/{agentId}/failover")]
|
||||
[ProducesResponseType(typeof(FailoverResultResponse), StatusCodes.Status200OK)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult<FailoverResultResponse>> TriggerFailover(
|
||||
string agentId,
|
||||
[FromBody] FailoverRequest? request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation("Manual failover triggered for agent {AgentId}", agentId);
|
||||
|
||||
var result = await _failoverManager.TriggerFailoverAsync(
|
||||
agentId,
|
||||
request?.TargetAgentId,
|
||||
ct);
|
||||
|
||||
return Ok(new FailoverResultResponse
|
||||
{
|
||||
SourceAgentId = agentId,
|
||||
TargetAgentId = result.TargetAgentId,
|
||||
Success = result.Success,
|
||||
TasksTransferred = result.TasksTransferred,
|
||||
Duration = result.Duration,
|
||||
Error = result.Error
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets failover history for an agent.
|
||||
/// </summary>
|
||||
[HttpGet("agents/{agentId}/failover/history")]
|
||||
[ProducesResponseType(typeof(FailoverHistoryResponse), StatusCodes.Status200OK)]
|
||||
public ActionResult<FailoverHistoryResponse> GetFailoverHistory(string agentId)
|
||||
{
|
||||
var history = _failoverManager.GetFailoverHistory(agentId);
|
||||
|
||||
return Ok(new FailoverHistoryResponse
|
||||
{
|
||||
AgentId = agentId,
|
||||
Events = history.Select(e => new FailoverEventDto
|
||||
{
|
||||
SourceAgentId = e.SourceAgentId,
|
||||
TargetAgentId = e.TargetAgentId,
|
||||
Reason = e.Reason.ToString(),
|
||||
Success = e.Success,
|
||||
TasksTransferred = e.TasksTransferred,
|
||||
OccurredAt = e.OccurredAt
|
||||
}).ToList()
|
||||
});
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Self-Healing Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Triggers manual healing for an agent.
|
||||
/// </summary>
|
||||
[HttpPost("agents/{agentId}/heal")]
|
||||
[ProducesResponseType(typeof(HealingResultResponse), StatusCodes.Status200OK)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult<HealingResultResponse>> TriggerHealing(
|
||||
string agentId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation("Manual healing triggered for agent {AgentId}", agentId);
|
||||
|
||||
var result = await _selfHealer.HealAsync(agentId, ct);
|
||||
|
||||
return Ok(new HealingResultResponse
|
||||
{
|
||||
AgentId = agentId,
|
||||
Success = result.Success,
|
||||
Status = result.Status.ToString(),
|
||||
Message = result.Message,
|
||||
Actions = result.ActionResults.Select(a => new RecoveryActionResultDto
|
||||
{
|
||||
Type = a.Action.Type.ToString(),
|
||||
Success = a.Success,
|
||||
Duration = a.Duration,
|
||||
Error = a.Error
|
||||
}).ToList()
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets recovery history for an agent.
|
||||
/// </summary>
|
||||
[HttpGet("agents/{agentId}/heal/history")]
|
||||
[ProducesResponseType(typeof(RecoveryHistoryResponse), StatusCodes.Status200OK)]
|
||||
public ActionResult<RecoveryHistoryResponse> GetRecoveryHistory(string agentId)
|
||||
{
|
||||
var history = _selfHealer.GetRecoveryHistory(agentId);
|
||||
|
||||
return Ok(new RecoveryHistoryResponse
|
||||
{
|
||||
AgentId = agentId,
|
||||
Attempts = history.Select(a => new RecoveryAttemptDto
|
||||
{
|
||||
AttemptedAt = a.AttemptedAt,
|
||||
Success = a.Success,
|
||||
ActionCount = a.Actions.Length
|
||||
}).ToList()
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets current recovery state for an agent.
|
||||
/// </summary>
|
||||
[HttpGet("agents/{agentId}/heal/state")]
|
||||
[ProducesResponseType(typeof(RecoveryStateResponse), StatusCodes.Status200OK)]
|
||||
public ActionResult<RecoveryStateResponse> GetRecoveryState(string agentId)
|
||||
{
|
||||
var state = _selfHealer.GetRecoveryState(agentId);
|
||||
|
||||
if (state is null)
|
||||
{
|
||||
return Ok(new RecoveryStateResponse
|
||||
{
|
||||
AgentId = agentId,
|
||||
InProgress = false
|
||||
});
|
||||
}
|
||||
|
||||
return Ok(new RecoveryStateResponse
|
||||
{
|
||||
AgentId = agentId,
|
||||
InProgress = true,
|
||||
StartedAt = state.StartedAt,
|
||||
CurrentAction = state.CurrentActionIndex,
|
||||
TotalActions = state.Actions.Length,
|
||||
Status = state.Status.ToString()
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resets the circuit breaker for an agent.
|
||||
/// </summary>
|
||||
[HttpPost("agents/{agentId}/heal/reset-circuit")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public ActionResult ResetCircuitBreaker(string agentId)
|
||||
{
|
||||
_selfHealer.ResetCircuitBreaker(agentId);
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region State Sync Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Gets state sync status.
|
||||
/// </summary>
|
||||
[HttpGet("state/status")]
|
||||
[ProducesResponseType(typeof(SyncStatusResponse), StatusCodes.Status200OK)]
|
||||
public ActionResult<SyncStatusResponse> GetSyncStatus()
|
||||
{
|
||||
var status = _stateSync.GetSyncStatus();
|
||||
|
||||
return Ok(new SyncStatusResponse
|
||||
{
|
||||
NodeId = status.NodeId,
|
||||
EntryCount = status.EntryCount,
|
||||
TombstoneCount = status.TombstoneCount,
|
||||
PeerCount = status.PeerCount,
|
||||
LastSyncAt = status.LastSyncAt,
|
||||
IsHealthy = status.IsHealthy
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a state entry.
|
||||
/// </summary>
|
||||
[HttpGet("state/{key}")]
|
||||
[ProducesResponseType(typeof(StateEntryResponse), StatusCodes.Status200OK)]
|
||||
[ProducesResponseType(StatusCodes.Status404NotFound)]
|
||||
public async Task<ActionResult<StateEntryResponse>> GetState(
|
||||
string key,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var entry = await _stateSync.GetEntryAsync(key, ct);
|
||||
|
||||
if (entry is null)
|
||||
return NotFound();
|
||||
|
||||
return Ok(new StateEntryResponse
|
||||
{
|
||||
Key = entry.Key,
|
||||
Value = entry.Value,
|
||||
Version = entry.Version.ToString(),
|
||||
UpdatedBy = entry.UpdatedBy,
|
||||
UpdatedAt = entry.UpdatedAt
|
||||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Sets a state entry.
|
||||
/// </summary>
|
||||
[HttpPut("state/{key}")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> SetState(
|
||||
string key,
|
||||
[FromBody] SetStateRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
await _stateSync.SetAsync(key, request.Value, ct);
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deletes a state entry.
|
||||
/// </summary>
|
||||
[HttpDelete("state/{key}")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> DeleteState(string key, CancellationToken ct)
|
||||
{
|
||||
await _stateSync.DeleteAsync(key, ct);
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all state keys.
|
||||
/// </summary>
|
||||
[HttpGet("state/keys")]
|
||||
[ProducesResponseType(typeof(ImmutableArray<string>), StatusCodes.Status200OK)]
|
||||
public ActionResult<ImmutableArray<string>> GetStateKeys([FromQuery] string? prefix = null)
|
||||
{
|
||||
if (prefix is not null)
|
||||
{
|
||||
var entries = _stateSync.GetByPrefix(prefix);
|
||||
return Ok(entries.Select(e => e.Key).ToImmutableArray());
|
||||
}
|
||||
|
||||
return Ok(_stateSync.GetKeys());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Forces immediate sync with all peers.
|
||||
/// </summary>
|
||||
[HttpPost("state/sync")]
|
||||
[ProducesResponseType(StatusCodes.Status202Accepted)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> ForceSync(CancellationToken ct)
|
||||
{
|
||||
await _stateSync.ForceSyncAsync(ct);
|
||||
return Accepted();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compares state with a peer.
|
||||
/// </summary>
|
||||
[HttpGet("state/compare/{peerId}")]
|
||||
[ProducesResponseType(typeof(SyncDiffResponse), StatusCodes.Status200OK)]
|
||||
public async Task<ActionResult<SyncDiffResponse>> CompareWithPeer(
|
||||
string peerId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var diff = await _stateSync.CompareWithPeerAsync(peerId, ct);
|
||||
|
||||
return Ok(new SyncDiffResponse
|
||||
{
|
||||
PeerId = peerId,
|
||||
MissingLocally = diff.MissingLocally,
|
||||
MissingOnPeer = diff.MissingOnPeer,
|
||||
InSync = diff.InSync
|
||||
});
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Agent Management Endpoints
|
||||
|
||||
/// <summary>
|
||||
/// Registers a new agent in the cluster.
|
||||
/// </summary>
|
||||
[HttpPost("agents")]
|
||||
[ProducesResponseType(StatusCodes.Status201Created)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> RegisterAgent(
|
||||
[FromBody] RegisterAgentRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
await _clusterManager.RegisterAgentAsync(
|
||||
request.AgentId,
|
||||
new AgentEndpoint(request.Host, request.Port, request.UseTls),
|
||||
ct);
|
||||
|
||||
_healthMonitor.RegisterAgent(
|
||||
request.AgentId,
|
||||
new AgentEndpoint(request.Host, request.Port, request.UseTls));
|
||||
|
||||
return CreatedAtAction(nameof(GetAgentHealth), new { agentId = request.AgentId }, null);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes an agent from the cluster.
|
||||
/// </summary>
|
||||
[HttpDelete("agents/{agentId}")]
|
||||
[ProducesResponseType(StatusCodes.Status204NoContent)]
|
||||
[Authorize(Policy = "ClusterAdmin")]
|
||||
public async Task<ActionResult> UnregisterAgent(
|
||||
string agentId,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_healthMonitor.UnregisterAgent(agentId);
|
||||
await _clusterManager.UnregisterAgentAsync(agentId, ct);
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Helper Methods
|
||||
|
||||
private static string DetermineOverallStatus(ImmutableArray<AgentHealthAssessment> assessments)
|
||||
{
|
||||
if (assessments.Any(a => a.Status == AgentHealthStatus.Critical))
|
||||
return "Critical";
|
||||
if (assessments.Any(a => a.Status == AgentHealthStatus.Degraded))
|
||||
return "Degraded";
|
||||
if (assessments.Any(a => a.Status == AgentHealthStatus.Warning))
|
||||
return "Warning";
|
||||
if (assessments.All(a => a.Status == AgentHealthStatus.Healthy))
|
||||
return "Healthy";
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
private static AgentHealthDto MapToHealthDto(AgentHealthAssessment assessment)
|
||||
{
|
||||
return new AgentHealthDto
|
||||
{
|
||||
AgentId = assessment.AgentId,
|
||||
Status = assessment.Status.ToString(),
|
||||
OverallScore = assessment.OverallScore,
|
||||
Factors = assessment.Factors.Select(f => new HealthFactorDto
|
||||
{
|
||||
Name = f.Name,
|
||||
Score = f.Score,
|
||||
Status = f.Status.ToString(),
|
||||
Weight = f.Weight,
|
||||
Details = f.Details
|
||||
}).ToList(),
|
||||
Trend = new HealthTrendDto
|
||||
{
|
||||
Direction = assessment.Trend.Direction.ToString(),
|
||||
Confidence = assessment.Trend.Confidence
|
||||
},
|
||||
Recommendation = new HealthRecommendationDto
|
||||
{
|
||||
Action = assessment.Recommendation.Action.ToString(),
|
||||
Urgency = assessment.Recommendation.Urgency.ToString(),
|
||||
Reason = assessment.Recommendation.Reason
|
||||
},
|
||||
AssessedAt = assessment.AssessedAt
|
||||
};
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Request/Response DTOs
|
||||
|
||||
public sealed record ClusterStatusResponse
|
||||
{
|
||||
public required string ClusterId { get; init; }
|
||||
public required string Mode { get; init; }
|
||||
public required string State { get; init; }
|
||||
public required int MemberCount { get; init; }
|
||||
public required int HealthyCount { get; init; }
|
||||
public string? LeaderId { get; init; }
|
||||
public required List<ClusterMemberDto> Members { get; init; }
|
||||
public required DateTimeOffset UpdatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ClusterMemberDto
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required string Endpoint { get; init; }
|
||||
public required string Role { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset JoinedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ClusterConfigResponse
|
||||
{
|
||||
public required string Mode { get; init; }
|
||||
public required int MinQuorum { get; init; }
|
||||
public required TimeSpan HeartbeatInterval { get; init; }
|
||||
public required TimeSpan FailoverTimeout { get; init; }
|
||||
public required int MaxRetries { get; init; }
|
||||
}
|
||||
|
||||
public sealed record UpdateClusterConfigRequest
|
||||
{
|
||||
[Required]
|
||||
public required string Mode { get; init; }
|
||||
public int MinQuorum { get; init; } = 2;
|
||||
public TimeSpan HeartbeatInterval { get; init; } = TimeSpan.FromSeconds(10);
|
||||
public TimeSpan FailoverTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public int MaxRetries { get; init; } = 3;
|
||||
}
|
||||
|
||||
public sealed record ClusterHealthResponse
|
||||
{
|
||||
public required string OverallStatus { get; init; }
|
||||
public required List<AgentHealthDto> Agents { get; init; }
|
||||
public required DateTimeOffset AssessedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AgentHealthDto
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required double OverallScore { get; init; }
|
||||
public required List<HealthFactorDto> Factors { get; init; }
|
||||
public required HealthTrendDto Trend { get; init; }
|
||||
public required HealthRecommendationDto Recommendation { get; init; }
|
||||
public required DateTimeOffset AssessedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthFactorDto
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required double Weight { get; init; }
|
||||
public string? Details { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthTrendDto
|
||||
{
|
||||
public required string Direction { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthRecommendationDto
|
||||
{
|
||||
public required string Action { get; init; }
|
||||
public required string Urgency { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
}
|
||||
|
||||
public sealed record LeaderInfoResponse
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public string? LeaderId { get; init; }
|
||||
public required int Term { get; init; }
|
||||
public DateTimeOffset? ElectedAt { get; init; }
|
||||
public DateTimeOffset? LeaseExpiresAt { get; init; }
|
||||
public required bool IsThisNode { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ElectionResultResponse
|
||||
{
|
||||
public required string ResourceKey { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required bool IsLeader { get; init; }
|
||||
public string? LeaderId { get; init; }
|
||||
public required int Term { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record FailoverRequest
|
||||
{
|
||||
public string? TargetAgentId { get; init; }
|
||||
}
|
||||
|
||||
public sealed record FailoverResultResponse
|
||||
{
|
||||
public required string SourceAgentId { get; init; }
|
||||
public string? TargetAgentId { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required int TasksTransferred { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record FailoverHistoryResponse
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required List<FailoverEventDto> Events { get; init; }
|
||||
}
|
||||
|
||||
public sealed record FailoverEventDto
|
||||
{
|
||||
public required string SourceAgentId { get; init; }
|
||||
public string? TargetAgentId { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required int TasksTransferred { get; init; }
|
||||
public required DateTimeOffset OccurredAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealingResultResponse
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public required List<RecoveryActionResultDto> Actions { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RecoveryActionResultDto
|
||||
{
|
||||
public required string Type { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RecoveryHistoryResponse
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required List<RecoveryAttemptDto> Attempts { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RecoveryAttemptDto
|
||||
{
|
||||
public required DateTimeOffset AttemptedAt { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required int ActionCount { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RecoveryStateResponse
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required bool InProgress { get; init; }
|
||||
public DateTimeOffset? StartedAt { get; init; }
|
||||
public int? CurrentAction { get; init; }
|
||||
public int? TotalActions { get; init; }
|
||||
public string? Status { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SyncStatusResponse
|
||||
{
|
||||
public required string NodeId { get; init; }
|
||||
public required int EntryCount { get; init; }
|
||||
public required int TombstoneCount { get; init; }
|
||||
public required int PeerCount { get; init; }
|
||||
public DateTimeOffset? LastSyncAt { get; init; }
|
||||
public required bool IsHealthy { get; init; }
|
||||
}
|
||||
|
||||
public sealed record StateEntryResponse
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required string Value { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string UpdatedBy { get; init; }
|
||||
public required DateTimeOffset UpdatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SetStateRequest
|
||||
{
|
||||
[Required]
|
||||
public required string Value { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SyncDiffResponse
|
||||
{
|
||||
public required string PeerId { get; init; }
|
||||
public required int MissingLocally { get; init; }
|
||||
public required int MissingOnPeer { get; init; }
|
||||
public required bool InSync { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RegisterAgentRequest
|
||||
{
|
||||
[Required]
|
||||
public required string AgentId { get; init; }
|
||||
[Required]
|
||||
public required string Host { get; init; }
|
||||
public int Port { get; init; } = 8443;
|
||||
public bool UseTls { get; init; } = true;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Interfaces (stubs for compilation)
|
||||
|
||||
public interface IAgentClusterManager
|
||||
{
|
||||
ClusterStatus GetClusterStatus();
|
||||
ClusterConfig GetConfiguration();
|
||||
Task UpdateConfigurationAsync(ClusterConfig config, CancellationToken ct = default);
|
||||
Task RegisterAgentAsync(string agentId, AgentEndpoint endpoint, CancellationToken ct = default);
|
||||
Task UnregisterAgentAsync(string agentId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IFailoverManager
|
||||
{
|
||||
Task<FailoverResult> TriggerFailoverAsync(string sourceAgentId, string? targetAgentId = null, CancellationToken ct = default);
|
||||
ImmutableArray<FailoverEvent> GetFailoverHistory(string agentId);
|
||||
}
|
||||
|
||||
public sealed record ClusterStatus
|
||||
{
|
||||
public required string ClusterId { get; init; }
|
||||
public required ClusterMode Mode { get; init; }
|
||||
public required ClusterState State { get; init; }
|
||||
public required int MemberCount { get; init; }
|
||||
public string? LeaderId { get; init; }
|
||||
public required ImmutableArray<ClusterMember> Members { get; init; }
|
||||
public required DateTimeOffset UpdatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ClusterMember
|
||||
{
|
||||
public required string AgentId { get; init; }
|
||||
public required AgentEndpoint Endpoint { get; init; }
|
||||
public required MemberRole Role { get; init; }
|
||||
public required DateTimeOffset JoinedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ClusterConfig
|
||||
{
|
||||
public ClusterMode Mode { get; init; }
|
||||
public int MinQuorum { get; init; }
|
||||
public TimeSpan HeartbeatInterval { get; init; }
|
||||
public TimeSpan FailoverTimeout { get; init; }
|
||||
public int MaxRetries { get; init; }
|
||||
}
|
||||
|
||||
public enum ClusterMode { Standalone, ActivePassive, ActiveActive, Sharded }
|
||||
public enum ClusterState { Forming, Healthy, Degraded, PartitionedNonQuorum }
|
||||
public enum MemberRole { Leader, Follower, Standby }
|
||||
|
||||
public sealed record FailoverResult
|
||||
{
|
||||
public required bool Success { get; init; }
|
||||
public string? TargetAgentId { get; init; }
|
||||
public required int TasksTransferred { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record FailoverEvent
|
||||
{
|
||||
public required string SourceAgentId { get; init; }
|
||||
public string? TargetAgentId { get; init; }
|
||||
public required FailoverReason Reason { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public required int TasksTransferred { get; init; }
|
||||
public required DateTimeOffset OccurredAt { get; init; }
|
||||
}
|
||||
|
||||
public enum FailoverReason { HealthDegradation, ManualTrigger, NetworkPartition, ResourceExhaustion }
|
||||
|
||||
#endregion
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,557 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// AuditQueryEngine.cs
|
||||
// Sprint: SPRINT_20260117_039_ReleaseOrchestrator_compliance
|
||||
// Task: TASK-039-05 - Audit query engine with flexible querying and aggregations
|
||||
// Description: Powerful query engine for audit logs and compliance data
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Linq.Expressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Flexible query engine for audit logs and compliance data.
|
||||
/// </summary>
|
||||
public sealed class AuditQueryEngine : IAuditQueryEngine
|
||||
{
|
||||
private readonly IAuditLogStore _auditStore;
|
||||
private readonly AuditQueryConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<AuditQueryEngine> _logger;
|
||||
|
||||
public AuditQueryEngine(
|
||||
IAuditLogStore auditStore,
|
||||
AuditQueryConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<AuditQueryEngine> logger)
|
||||
{
|
||||
_auditStore = auditStore;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Executes an audit query.
|
||||
/// </summary>
|
||||
public async Task<AuditQueryResult> QueryAsync(AuditQuery query, CancellationToken ct = default)
|
||||
{
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
// Build and execute query
|
||||
var entries = await _auditStore.QueryAsync(query, ct);
|
||||
|
||||
// Apply sorting
|
||||
entries = ApplySorting(entries, query.SortBy, query.SortDescending);
|
||||
|
||||
// Get total count before pagination
|
||||
var totalCount = entries.Count;
|
||||
|
||||
// Apply pagination
|
||||
var paginatedEntries = entries
|
||||
.Skip(query.Offset)
|
||||
.Take(Math.Min(query.Limit, _config.MaxResultsPerQuery))
|
||||
.ToImmutableArray();
|
||||
|
||||
var queryTime = _timeProvider.GetUtcNow() - startTime;
|
||||
|
||||
_logger.LogInformation(
|
||||
"Executed audit query: {Count} results in {ElapsedMs}ms",
|
||||
paginatedEntries.Length, queryTime.TotalMilliseconds);
|
||||
|
||||
return new AuditQueryResult
|
||||
{
|
||||
Entries = paginatedEntries,
|
||||
TotalCount = totalCount,
|
||||
Offset = query.Offset,
|
||||
Limit = query.Limit,
|
||||
QueryTimeMs = queryTime.TotalMilliseconds,
|
||||
Query = query
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Executes an aggregation query.
|
||||
/// </summary>
|
||||
public async Task<AggregationResult> AggregateAsync(
|
||||
AuditQuery baseQuery,
|
||||
AggregationSpec aggregation,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var entries = await _auditStore.QueryAsync(baseQuery, ct);
|
||||
|
||||
var buckets = aggregation.GroupBy switch
|
||||
{
|
||||
GroupByField.Action => GroupByAction(entries),
|
||||
GroupByField.Actor => GroupByActor(entries),
|
||||
GroupByField.Resource => GroupByResource(entries),
|
||||
GroupByField.Hour => GroupByTimeInterval(entries, TimeSpan.FromHours(1)),
|
||||
GroupByField.Day => GroupByTimeInterval(entries, TimeSpan.FromDays(1)),
|
||||
GroupByField.Week => GroupByTimeInterval(entries, TimeSpan.FromDays(7)),
|
||||
GroupByField.Month => GroupByMonth(entries),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(aggregation.GroupBy))
|
||||
};
|
||||
|
||||
// Calculate aggregation metrics
|
||||
var aggregatedBuckets = buckets.Select(b => new AggregationBucket
|
||||
{
|
||||
Key = b.Key,
|
||||
Count = b.Entries.Count,
|
||||
MinTimestamp = b.Entries.Min(e => e.Timestamp),
|
||||
MaxTimestamp = b.Entries.Max(e => e.Timestamp),
|
||||
UniqueActors = b.Entries.Select(e => e.Actor).Distinct().Count(),
|
||||
UniqueResources = b.Entries.Select(e => e.ResourceId).Distinct().Count()
|
||||
}).OrderByDescending(b => b.Count).ToImmutableArray();
|
||||
|
||||
return new AggregationResult
|
||||
{
|
||||
Buckets = aggregatedBuckets,
|
||||
TotalEntries = entries.Count,
|
||||
GroupBy = aggregation.GroupBy
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets activity summary for a time range.
|
||||
/// </summary>
|
||||
public async Task<ActivitySummary> GetActivitySummaryAsync(
|
||||
DateTimeOffset from,
|
||||
DateTimeOffset to,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
FromTimestamp = from,
|
||||
ToTimestamp = to,
|
||||
Limit = _config.MaxResultsPerQuery
|
||||
};
|
||||
|
||||
var entries = await _auditStore.QueryAsync(query, ct);
|
||||
|
||||
return new ActivitySummary
|
||||
{
|
||||
TimeRange = new TimeRange { From = from, To = to },
|
||||
TotalActions = entries.Count,
|
||||
UniqueActors = entries.Select(e => e.Actor).Distinct().Count(),
|
||||
UniqueResources = entries.Select(e => e.ResourceId).Distinct().Count(),
|
||||
ActionBreakdown = entries
|
||||
.GroupBy(e => e.Action)
|
||||
.ToDictionary(g => g.Key, g => g.Count())
|
||||
.ToImmutableDictionary(),
|
||||
TopActors = entries
|
||||
.GroupBy(e => e.Actor)
|
||||
.OrderByDescending(g => g.Count())
|
||||
.Take(10)
|
||||
.Select(g => new ActorActivity { Actor = g.Key, ActionCount = g.Count() })
|
||||
.ToImmutableArray(),
|
||||
HourlyDistribution = GetHourlyDistribution(entries)
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Searches audit logs with full-text search.
|
||||
/// </summary>
|
||||
public async Task<AuditQueryResult> SearchAsync(
|
||||
string searchText,
|
||||
SearchOptions options,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
SearchText = searchText,
|
||||
FromTimestamp = options.FromTimestamp,
|
||||
ToTimestamp = options.ToTimestamp,
|
||||
Limit = options.Limit,
|
||||
Offset = options.Offset
|
||||
};
|
||||
|
||||
return await QueryAsync(query, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets audit trail for a specific resource.
|
||||
/// </summary>
|
||||
public async Task<ResourceAuditTrail> GetResourceTrailAsync(
|
||||
string resourceType,
|
||||
string resourceId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
ResourceType = resourceType,
|
||||
ResourceId = resourceId,
|
||||
Limit = _config.MaxResultsPerQuery,
|
||||
SortBy = "Timestamp",
|
||||
SortDescending = false
|
||||
};
|
||||
|
||||
var entries = await _auditStore.QueryAsync(query, ct);
|
||||
|
||||
return new ResourceAuditTrail
|
||||
{
|
||||
ResourceType = resourceType,
|
||||
ResourceId = resourceId,
|
||||
Entries = entries.ToImmutableArray(),
|
||||
FirstAction = entries.MinBy(e => e.Timestamp),
|
||||
LastAction = entries.MaxBy(e => e.Timestamp),
|
||||
TotalActions = entries.Count,
|
||||
ActorCount = entries.Select(e => e.Actor).Distinct().Count()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets actor activity report.
|
||||
/// </summary>
|
||||
public async Task<ActorActivityReport> GetActorActivityAsync(
|
||||
string actor,
|
||||
DateTimeOffset from,
|
||||
DateTimeOffset to,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var query = new AuditQuery
|
||||
{
|
||||
Actor = actor,
|
||||
FromTimestamp = from,
|
||||
ToTimestamp = to,
|
||||
Limit = _config.MaxResultsPerQuery
|
||||
};
|
||||
|
||||
var entries = await _auditStore.QueryAsync(query, ct);
|
||||
|
||||
return new ActorActivityReport
|
||||
{
|
||||
Actor = actor,
|
||||
TimeRange = new TimeRange { From = from, To = to },
|
||||
TotalActions = entries.Count,
|
||||
ActionBreakdown = entries
|
||||
.GroupBy(e => e.Action)
|
||||
.ToDictionary(g => g.Key, g => g.Count())
|
||||
.ToImmutableDictionary(),
|
||||
ResourcesAccessed = entries
|
||||
.Select(e => $"{e.ResourceType}:{e.ResourceId}")
|
||||
.Distinct()
|
||||
.ToImmutableArray(),
|
||||
RecentActions = entries
|
||||
.OrderByDescending(e => e.Timestamp)
|
||||
.Take(20)
|
||||
.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exports audit logs to various formats.
|
||||
/// </summary>
|
||||
public async Task<AuditExportResult> ExportAsync(
|
||||
AuditQuery query,
|
||||
AuditExportFormat format,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var entries = await _auditStore.QueryAsync(query, ct);
|
||||
|
||||
var content = format switch
|
||||
{
|
||||
AuditExportFormat.Csv => GenerateCsv(entries),
|
||||
AuditExportFormat.Json => GenerateJson(entries),
|
||||
AuditExportFormat.Syslog => GenerateSyslog(entries),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(format))
|
||||
};
|
||||
|
||||
return new AuditExportResult
|
||||
{
|
||||
Content = content,
|
||||
Format = format,
|
||||
EntryCount = entries.Count,
|
||||
ExportedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
#region Private Methods
|
||||
|
||||
private static List<AuditLogEntry> ApplySorting(
|
||||
List<AuditLogEntry> entries,
|
||||
string? sortBy,
|
||||
bool descending)
|
||||
{
|
||||
if (string.IsNullOrEmpty(sortBy)) sortBy = "Timestamp";
|
||||
|
||||
var sorted = sortBy.ToLowerInvariant() switch
|
||||
{
|
||||
"timestamp" => entries.OrderBy(e => e.Timestamp),
|
||||
"action" => entries.OrderBy(e => e.Action),
|
||||
"actor" => entries.OrderBy(e => e.Actor),
|
||||
"resource" => entries.OrderBy(e => e.ResourceId),
|
||||
_ => entries.OrderBy(e => e.Timestamp)
|
||||
};
|
||||
|
||||
return descending ? sorted.Reverse().ToList() : sorted.ToList();
|
||||
}
|
||||
|
||||
private static List<(string Key, List<AuditLogEntry> Entries)> GroupByAction(List<AuditLogEntry> entries)
|
||||
{
|
||||
return entries
|
||||
.GroupBy(e => e.Action)
|
||||
.Select(g => (g.Key, g.ToList()))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static List<(string Key, List<AuditLogEntry> Entries)> GroupByActor(List<AuditLogEntry> entries)
|
||||
{
|
||||
return entries
|
||||
.GroupBy(e => e.Actor)
|
||||
.Select(g => (g.Key, g.ToList()))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static List<(string Key, List<AuditLogEntry> Entries)> GroupByResource(List<AuditLogEntry> entries)
|
||||
{
|
||||
return entries
|
||||
.GroupBy(e => $"{e.ResourceType}:{e.ResourceId}")
|
||||
.Select(g => (g.Key, g.ToList()))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static List<(string Key, List<AuditLogEntry> Entries)> GroupByTimeInterval(
|
||||
List<AuditLogEntry> entries,
|
||||
TimeSpan interval)
|
||||
{
|
||||
if (!entries.Any()) return [];
|
||||
|
||||
var min = entries.Min(e => e.Timestamp);
|
||||
var max = entries.Max(e => e.Timestamp);
|
||||
|
||||
return entries
|
||||
.GroupBy(e => GetIntervalKey(e.Timestamp, min, interval))
|
||||
.Select(g => (g.Key.ToString("yyyy-MM-dd HH:mm"), g.ToList()))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static DateTimeOffset GetIntervalKey(DateTimeOffset timestamp, DateTimeOffset min, TimeSpan interval)
|
||||
{
|
||||
var diff = timestamp - min;
|
||||
var intervals = (long)(diff.Ticks / interval.Ticks);
|
||||
return min.Add(TimeSpan.FromTicks(intervals * interval.Ticks));
|
||||
}
|
||||
|
||||
private static List<(string Key, List<AuditLogEntry> Entries)> GroupByMonth(List<AuditLogEntry> entries)
|
||||
{
|
||||
return entries
|
||||
.GroupBy(e => e.Timestamp.ToString("yyyy-MM"))
|
||||
.Select(g => (g.Key, g.ToList()))
|
||||
.ToList();
|
||||
}
|
||||
|
||||
private static ImmutableArray<HourlyCount> GetHourlyDistribution(List<AuditLogEntry> entries)
|
||||
{
|
||||
var hourly = Enumerable.Range(0, 24)
|
||||
.Select(h => new HourlyCount
|
||||
{
|
||||
Hour = h,
|
||||
Count = entries.Count(e => e.Timestamp.Hour == h)
|
||||
})
|
||||
.ToImmutableArray();
|
||||
|
||||
return hourly;
|
||||
}
|
||||
|
||||
private static string GenerateCsv(List<AuditLogEntry> entries)
|
||||
{
|
||||
var sb = new System.Text.StringBuilder();
|
||||
sb.AppendLine("Timestamp,Action,Actor,ResourceType,ResourceId,Result,Details");
|
||||
|
||||
foreach (var entry in entries)
|
||||
{
|
||||
sb.AppendLine($"\"{entry.Timestamp:O}\",\"{entry.Action}\",\"{entry.Actor}\"," +
|
||||
$"\"{entry.ResourceType}\",\"{entry.ResourceId}\",\"{entry.Result}\"," +
|
||||
$"\"{entry.Details?.Replace("\"", "\"\"")}\"");
|
||||
}
|
||||
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
private static string GenerateJson(List<AuditLogEntry> entries)
|
||||
{
|
||||
return System.Text.Json.JsonSerializer.Serialize(entries,
|
||||
new System.Text.Json.JsonSerializerOptions { WriteIndented = true });
|
||||
}
|
||||
|
||||
private static string GenerateSyslog(List<AuditLogEntry> entries)
|
||||
{
|
||||
var sb = new System.Text.StringBuilder();
|
||||
|
||||
foreach (var entry in entries)
|
||||
{
|
||||
// RFC 5424 format
|
||||
var severity = entry.Result == "Success" ? 6 : 3; // Info or Error
|
||||
var facility = 4; // Auth
|
||||
var priority = facility * 8 + severity;
|
||||
|
||||
sb.AppendLine($"<{priority}>1 {entry.Timestamp:yyyy-MM-ddTHH:mm:ss.fffZ} stella audit {entry.Action} - " +
|
||||
$"[actor=\"{entry.Actor}\" resource=\"{entry.ResourceType}:{entry.ResourceId}\" result=\"{entry.Result}\"] " +
|
||||
$"{entry.Details}");
|
||||
}
|
||||
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IAuditQueryEngine
|
||||
{
|
||||
Task<AuditQueryResult> QueryAsync(AuditQuery query, CancellationToken ct = default);
|
||||
Task<AggregationResult> AggregateAsync(AuditQuery baseQuery, AggregationSpec aggregation, CancellationToken ct = default);
|
||||
Task<ActivitySummary> GetActivitySummaryAsync(DateTimeOffset from, DateTimeOffset to, CancellationToken ct = default);
|
||||
Task<ResourceAuditTrail> GetResourceTrailAsync(string resourceType, string resourceId, CancellationToken ct = default);
|
||||
Task<ActorActivityReport> GetActorActivityAsync(string actor, DateTimeOffset from, DateTimeOffset to, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IAuditLogStore
|
||||
{
|
||||
Task<List<AuditLogEntry>> QueryAsync(AuditQuery query, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record AuditQueryConfig
|
||||
{
|
||||
public int MaxResultsPerQuery { get; init; } = 10000;
|
||||
public TimeSpan DefaultTimeRange { get; init; } = TimeSpan.FromDays(30);
|
||||
}
|
||||
|
||||
public sealed record AuditQuery
|
||||
{
|
||||
public string? Action { get; init; }
|
||||
public string? Actor { get; init; }
|
||||
public string? ResourceType { get; init; }
|
||||
public string? ResourceId { get; init; }
|
||||
public DateTimeOffset? FromTimestamp { get; init; }
|
||||
public DateTimeOffset? ToTimestamp { get; init; }
|
||||
public string? SearchText { get; init; }
|
||||
public string? SortBy { get; init; }
|
||||
public bool SortDescending { get; init; } = true;
|
||||
public int Offset { get; init; } = 0;
|
||||
public int Limit { get; init; } = 100;
|
||||
}
|
||||
|
||||
public sealed record AuditLogEntry
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required string Action { get; init; }
|
||||
public required string Actor { get; init; }
|
||||
public required string ResourceType { get; init; }
|
||||
public required string ResourceId { get; init; }
|
||||
public required string Result { get; init; }
|
||||
public string? Details { get; init; }
|
||||
public ImmutableDictionary<string, string>? Metadata { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AuditQueryResult
|
||||
{
|
||||
public required ImmutableArray<AuditLogEntry> Entries { get; init; }
|
||||
public required int TotalCount { get; init; }
|
||||
public required int Offset { get; init; }
|
||||
public required int Limit { get; init; }
|
||||
public required double QueryTimeMs { get; init; }
|
||||
public required AuditQuery Query { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AggregationSpec
|
||||
{
|
||||
public required GroupByField GroupBy { get; init; }
|
||||
}
|
||||
|
||||
public enum GroupByField { Action, Actor, Resource, Hour, Day, Week, Month }
|
||||
|
||||
public sealed record AggregationResult
|
||||
{
|
||||
public required ImmutableArray<AggregationBucket> Buckets { get; init; }
|
||||
public required int TotalEntries { get; init; }
|
||||
public required GroupByField GroupBy { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AggregationBucket
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required int Count { get; init; }
|
||||
public required DateTimeOffset MinTimestamp { get; init; }
|
||||
public required DateTimeOffset MaxTimestamp { get; init; }
|
||||
public required int UniqueActors { get; init; }
|
||||
public required int UniqueResources { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ActivitySummary
|
||||
{
|
||||
public required TimeRange TimeRange { get; init; }
|
||||
public required int TotalActions { get; init; }
|
||||
public required int UniqueActors { get; init; }
|
||||
public required int UniqueResources { get; init; }
|
||||
public required ImmutableDictionary<string, int> ActionBreakdown { get; init; }
|
||||
public required ImmutableArray<ActorActivity> TopActors { get; init; }
|
||||
public required ImmutableArray<HourlyCount> HourlyDistribution { get; init; }
|
||||
}
|
||||
|
||||
public sealed record TimeRange
|
||||
{
|
||||
public required DateTimeOffset From { get; init; }
|
||||
public required DateTimeOffset To { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ActorActivity
|
||||
{
|
||||
public required string Actor { get; init; }
|
||||
public required int ActionCount { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HourlyCount
|
||||
{
|
||||
public required int Hour { get; init; }
|
||||
public required int Count { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SearchOptions
|
||||
{
|
||||
public DateTimeOffset? FromTimestamp { get; init; }
|
||||
public DateTimeOffset? ToTimestamp { get; init; }
|
||||
public int Limit { get; init; } = 100;
|
||||
public int Offset { get; init; } = 0;
|
||||
}
|
||||
|
||||
public sealed record ResourceAuditTrail
|
||||
{
|
||||
public required string ResourceType { get; init; }
|
||||
public required string ResourceId { get; init; }
|
||||
public required ImmutableArray<AuditLogEntry> Entries { get; init; }
|
||||
public AuditLogEntry? FirstAction { get; init; }
|
||||
public AuditLogEntry? LastAction { get; init; }
|
||||
public required int TotalActions { get; init; }
|
||||
public required int ActorCount { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ActorActivityReport
|
||||
{
|
||||
public required string Actor { get; init; }
|
||||
public required TimeRange TimeRange { get; init; }
|
||||
public required int TotalActions { get; init; }
|
||||
public required ImmutableDictionary<string, int> ActionBreakdown { get; init; }
|
||||
public required ImmutableArray<string> ResourcesAccessed { get; init; }
|
||||
public required ImmutableArray<AuditLogEntry> RecentActions { get; init; }
|
||||
}
|
||||
|
||||
public enum AuditExportFormat { Csv, Json, Syslog }
|
||||
|
||||
public sealed record AuditExportResult
|
||||
{
|
||||
public required string Content { get; init; }
|
||||
public required AuditExportFormat Format { get; init; }
|
||||
public required int EntryCount { get; init; }
|
||||
public required DateTimeOffset ExportedAt { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,500 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Engine for evaluating compliance against frameworks.
|
||||
/// </summary>
|
||||
public sealed class ComplianceEngine
|
||||
{
|
||||
private readonly IFrameworkMapper _frameworkMapper;
|
||||
private readonly IControlValidator _controlValidator;
|
||||
private readonly IEvidenceProvider _evidenceProvider;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ComplianceEngineConfig _config;
|
||||
private readonly ILogger<ComplianceEngine> _logger;
|
||||
|
||||
public ComplianceEngine(
|
||||
IFrameworkMapper frameworkMapper,
|
||||
IControlValidator controlValidator,
|
||||
IEvidenceProvider evidenceProvider,
|
||||
TimeProvider timeProvider,
|
||||
ComplianceEngineConfig config,
|
||||
ILogger<ComplianceEngine> logger)
|
||||
{
|
||||
_frameworkMapper = frameworkMapper;
|
||||
_controlValidator = controlValidator;
|
||||
_evidenceProvider = evidenceProvider;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates compliance for a release against specified frameworks.
|
||||
/// </summary>
|
||||
public async Task<ComplianceEvaluationResult> EvaluateAsync(
|
||||
ComplianceEvaluationRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Evaluating compliance for release {ReleaseId} against {FrameworkCount} frameworks",
|
||||
request.ReleaseId, request.Frameworks.Length);
|
||||
|
||||
var frameworkResults = new List<FrameworkEvaluationResult>();
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
foreach (var framework in request.Frameworks)
|
||||
{
|
||||
var result = await EvaluateFrameworkAsync(request.ReleaseId, framework, ct);
|
||||
frameworkResults.Add(result);
|
||||
}
|
||||
|
||||
var overallScore = frameworkResults.Count > 0
|
||||
? frameworkResults.Average(r => r.ComplianceScore)
|
||||
: 0;
|
||||
|
||||
var overallStatus = DetermineOverallStatus(frameworkResults);
|
||||
|
||||
var evaluation = new ComplianceEvaluationResult
|
||||
{
|
||||
EvaluationId = Guid.NewGuid(),
|
||||
ReleaseId = request.ReleaseId,
|
||||
EvaluatedAt = startTime,
|
||||
Duration = _timeProvider.GetUtcNow() - startTime,
|
||||
FrameworkResults = frameworkResults.ToImmutableArray(),
|
||||
OverallScore = overallScore,
|
||||
OverallStatus = overallStatus,
|
||||
Gaps = ExtractGaps(frameworkResults),
|
||||
Recommendations = GenerateRecommendations(frameworkResults)
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Compliance evaluation complete: {Status} (score: {Score:P0})",
|
||||
overallStatus, overallScore);
|
||||
|
||||
return evaluation;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets compliance status for a release.
|
||||
/// </summary>
|
||||
public async Task<ComplianceStatus> GetStatusAsync(
|
||||
Guid releaseId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
// Get latest evaluation for each framework
|
||||
var evaluations = await _evidenceProvider.GetEvaluationsAsync(releaseId, ct);
|
||||
|
||||
if (evaluations.Count == 0)
|
||||
{
|
||||
return new ComplianceStatus
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Status = OverallComplianceStatus.NotEvaluated,
|
||||
Message = "No compliance evaluations found"
|
||||
};
|
||||
}
|
||||
|
||||
var latestByFramework = evaluations
|
||||
.GroupBy(e => e.Framework)
|
||||
.Select(g => g.OrderByDescending(e => e.EvaluatedAt).First())
|
||||
.ToList();
|
||||
|
||||
var overallScore = latestByFramework.Average(e => e.Score);
|
||||
var status = DetermineStatusFromScore(overallScore);
|
||||
|
||||
return new ComplianceStatus
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Status = status,
|
||||
Score = overallScore,
|
||||
Frameworks = latestByFramework.Select(e => new FrameworkStatus
|
||||
{
|
||||
Framework = e.Framework,
|
||||
Score = e.Score,
|
||||
Status = DetermineStatusFromScore(e.Score),
|
||||
LastEvaluated = e.EvaluatedAt
|
||||
}).ToImmutableArray(),
|
||||
LastEvaluated = latestByFramework.Max(e => e.EvaluatedAt)
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<FrameworkEvaluationResult> EvaluateFrameworkAsync(
|
||||
Guid releaseId,
|
||||
ComplianceFramework framework,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Evaluating {Framework} compliance for release {ReleaseId}",
|
||||
framework, releaseId);
|
||||
|
||||
// Get framework controls
|
||||
var controls = _frameworkMapper.GetControls(framework);
|
||||
|
||||
// Evaluate each control
|
||||
var controlResults = new List<ControlEvaluationResult>();
|
||||
|
||||
foreach (var control in controls)
|
||||
{
|
||||
var result = await _controlValidator.ValidateAsync(
|
||||
releaseId,
|
||||
control,
|
||||
ct);
|
||||
|
||||
controlResults.Add(result);
|
||||
}
|
||||
|
||||
var passedControls = controlResults.Count(r => r.Status == ControlStatus.Passed);
|
||||
var totalControls = controlResults.Count;
|
||||
var score = totalControls > 0 ? (double)passedControls / totalControls : 0;
|
||||
|
||||
return new FrameworkEvaluationResult
|
||||
{
|
||||
Framework = framework,
|
||||
ComplianceScore = score,
|
||||
Status = DetermineFrameworkStatus(score),
|
||||
ControlResults = controlResults.ToImmutableArray(),
|
||||
PassedControls = passedControls,
|
||||
FailedControls = controlResults.Count(r => r.Status == ControlStatus.Failed),
|
||||
PartialControls = controlResults.Count(r => r.Status == ControlStatus.Partial),
|
||||
NotApplicableControls = controlResults.Count(r => r.Status == ControlStatus.NotApplicable)
|
||||
};
|
||||
}
|
||||
|
||||
private OverallComplianceStatus DetermineOverallStatus(
|
||||
List<FrameworkEvaluationResult> results)
|
||||
{
|
||||
if (results.Count == 0)
|
||||
{
|
||||
return OverallComplianceStatus.NotEvaluated;
|
||||
}
|
||||
|
||||
if (results.All(r => r.Status == FrameworkComplianceStatus.Compliant))
|
||||
{
|
||||
return OverallComplianceStatus.Compliant;
|
||||
}
|
||||
|
||||
if (results.Any(r => r.Status == FrameworkComplianceStatus.NonCompliant))
|
||||
{
|
||||
return OverallComplianceStatus.NonCompliant;
|
||||
}
|
||||
|
||||
return OverallComplianceStatus.PartiallyCompliant;
|
||||
}
|
||||
|
||||
private FrameworkComplianceStatus DetermineFrameworkStatus(double score)
|
||||
{
|
||||
return score switch
|
||||
{
|
||||
>= 0.95 => FrameworkComplianceStatus.Compliant,
|
||||
>= 0.80 => FrameworkComplianceStatus.PartiallyCompliant,
|
||||
_ => FrameworkComplianceStatus.NonCompliant
|
||||
};
|
||||
}
|
||||
|
||||
private OverallComplianceStatus DetermineStatusFromScore(double score)
|
||||
{
|
||||
return score switch
|
||||
{
|
||||
>= 0.95 => OverallComplianceStatus.Compliant,
|
||||
>= 0.80 => OverallComplianceStatus.PartiallyCompliant,
|
||||
_ => OverallComplianceStatus.NonCompliant
|
||||
};
|
||||
}
|
||||
|
||||
private ImmutableArray<ComplianceGap> ExtractGaps(
|
||||
List<FrameworkEvaluationResult> results)
|
||||
{
|
||||
var gaps = new List<ComplianceGap>();
|
||||
|
||||
foreach (var result in results)
|
||||
{
|
||||
foreach (var control in result.ControlResults)
|
||||
{
|
||||
if (control.Status == ControlStatus.Failed ||
|
||||
control.Status == ControlStatus.Partial)
|
||||
{
|
||||
gaps.Add(new ComplianceGap
|
||||
{
|
||||
Framework = result.Framework,
|
||||
ControlId = control.ControlId,
|
||||
ControlName = control.ControlName,
|
||||
Severity = control.Status == ControlStatus.Failed
|
||||
? GapSeverity.High
|
||||
: GapSeverity.Medium,
|
||||
Description = control.FailureReason ?? "Control not satisfied",
|
||||
Remediation = control.RemediationGuidance
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return gaps.ToImmutableArray();
|
||||
}
|
||||
|
||||
private ImmutableArray<string> GenerateRecommendations(
|
||||
List<FrameworkEvaluationResult> results)
|
||||
{
|
||||
var recommendations = new List<string>();
|
||||
|
||||
foreach (var result in results)
|
||||
{
|
||||
if (result.Status == FrameworkComplianceStatus.NonCompliant)
|
||||
{
|
||||
recommendations.Add(
|
||||
$"Address critical {result.Framework} gaps before production deployment");
|
||||
}
|
||||
|
||||
if (result.FailedControls > 0)
|
||||
{
|
||||
recommendations.Add(
|
||||
$"Review {result.FailedControls} failed {result.Framework} controls");
|
||||
}
|
||||
}
|
||||
|
||||
return recommendations.Distinct().ToImmutableArray();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for compliance engine.
|
||||
/// </summary>
|
||||
public sealed record ComplianceEngineConfig
|
||||
{
|
||||
public double ComplianceThreshold { get; init; } = 0.95;
|
||||
public bool FailOnNonCompliance { get; init; } = true;
|
||||
public ImmutableArray<ComplianceFramework> DefaultFrameworks { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request for compliance evaluation.
|
||||
/// </summary>
|
||||
public sealed record ComplianceEvaluationRequest
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public ImmutableArray<ComplianceFramework> Frameworks { get; init; } = [];
|
||||
public bool IncludeEvidence { get; init; } = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of compliance evaluation.
|
||||
/// </summary>
|
||||
public sealed record ComplianceEvaluationResult
|
||||
{
|
||||
public required Guid EvaluationId { get; init; }
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
public required TimeSpan Duration { get; init; }
|
||||
public required ImmutableArray<FrameworkEvaluationResult> FrameworkResults { get; init; }
|
||||
public required double OverallScore { get; init; }
|
||||
public required OverallComplianceStatus OverallStatus { get; init; }
|
||||
public required ImmutableArray<ComplianceGap> Gaps { get; init; }
|
||||
public required ImmutableArray<string> Recommendations { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result for a single framework.
|
||||
/// </summary>
|
||||
public sealed record FrameworkEvaluationResult
|
||||
{
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required double ComplianceScore { get; init; }
|
||||
public required FrameworkComplianceStatus Status { get; init; }
|
||||
public required ImmutableArray<ControlEvaluationResult> ControlResults { get; init; }
|
||||
public required int PassedControls { get; init; }
|
||||
public required int FailedControls { get; init; }
|
||||
public required int PartialControls { get; init; }
|
||||
public required int NotApplicableControls { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result for a single control.
|
||||
/// </summary>
|
||||
public sealed record ControlEvaluationResult
|
||||
{
|
||||
public required string ControlId { get; init; }
|
||||
public required string ControlName { get; init; }
|
||||
public required ControlStatus Status { get; init; }
|
||||
public string? FailureReason { get; init; }
|
||||
public string? RemediationGuidance { get; init; }
|
||||
public ImmutableArray<string> Evidence { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Control evaluation status.
|
||||
/// </summary>
|
||||
public enum ControlStatus
|
||||
{
|
||||
Passed,
|
||||
Failed,
|
||||
Partial,
|
||||
NotApplicable
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compliance status for a release.
|
||||
/// </summary>
|
||||
public sealed record ComplianceStatus
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required OverallComplianceStatus Status { get; init; }
|
||||
public double Score { get; init; }
|
||||
public string? Message { get; init; }
|
||||
public ImmutableArray<FrameworkStatus> Frameworks { get; init; } = [];
|
||||
public DateTimeOffset? LastEvaluated { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Status for a framework.
|
||||
/// </summary>
|
||||
public sealed record FrameworkStatus
|
||||
{
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public required OverallComplianceStatus Status { get; init; }
|
||||
public required DateTimeOffset LastEvaluated { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A compliance gap.
|
||||
/// </summary>
|
||||
public sealed record ComplianceGap
|
||||
{
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required string ControlId { get; init; }
|
||||
public required string ControlName { get; init; }
|
||||
public required GapSeverity Severity { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public string? Remediation { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gap severity.
|
||||
/// </summary>
|
||||
public enum GapSeverity
|
||||
{
|
||||
Low,
|
||||
Medium,
|
||||
High,
|
||||
Critical
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Overall compliance status.
|
||||
/// </summary>
|
||||
public enum OverallComplianceStatus
|
||||
{
|
||||
NotEvaluated,
|
||||
Compliant,
|
||||
PartiallyCompliant,
|
||||
NonCompliant
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Framework compliance status.
|
||||
/// </summary>
|
||||
public enum FrameworkComplianceStatus
|
||||
{
|
||||
Compliant,
|
||||
PartiallyCompliant,
|
||||
NonCompliant
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Supported compliance frameworks.
|
||||
/// </summary>
|
||||
public enum ComplianceFramework
|
||||
{
|
||||
SOC2,
|
||||
ISO27001,
|
||||
PCIDSS,
|
||||
HIPAA,
|
||||
FedRAMP,
|
||||
GDPR,
|
||||
NISTCSF
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stored evaluation record.
|
||||
/// </summary>
|
||||
public sealed record StoredEvaluation
|
||||
{
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A compliance control.
|
||||
/// </summary>
|
||||
public sealed record ComplianceControl
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Name { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required ControlCategory Category { get; init; }
|
||||
public required ControlValidationType ValidationType { get; init; }
|
||||
public ImmutableArray<string> RequiredEvidence { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Control category.
|
||||
/// </summary>
|
||||
public enum ControlCategory
|
||||
{
|
||||
AccessControl,
|
||||
ChangeManagement,
|
||||
DataProtection,
|
||||
IncidentResponse,
|
||||
RiskManagement,
|
||||
SecurityMonitoring,
|
||||
VendorManagement
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Control validation type.
|
||||
/// </summary>
|
||||
public enum ControlValidationType
|
||||
{
|
||||
Automated,
|
||||
ManualReview,
|
||||
Evidence,
|
||||
Attestation
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for framework mapping.
|
||||
/// </summary>
|
||||
public interface IFrameworkMapper
|
||||
{
|
||||
IReadOnlyList<ComplianceControl> GetControls(ComplianceFramework framework);
|
||||
IReadOnlyList<ComplianceControl> MapToFramework(
|
||||
ComplianceFramework sourceFramework,
|
||||
ComplianceFramework targetFramework);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for control validation.
|
||||
/// </summary>
|
||||
public interface IControlValidator
|
||||
{
|
||||
Task<ControlEvaluationResult> ValidateAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for evidence provider.
|
||||
/// </summary>
|
||||
public interface IEvidenceProvider
|
||||
{
|
||||
Task<IReadOnlyList<StoredEvaluation>> GetEvaluationsAsync(
|
||||
Guid releaseId,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,532 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Validates compliance controls through automated checks.
|
||||
/// </summary>
|
||||
public sealed class ControlValidator : IControlValidator
|
||||
{
|
||||
private readonly IEvidenceProvider _evidenceProvider;
|
||||
private readonly IAuditLogProvider _auditLogProvider;
|
||||
private readonly IApprovalProvider _approvalProvider;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ControlValidatorConfig _config;
|
||||
private readonly ILogger<ControlValidator> _logger;
|
||||
|
||||
public ControlValidator(
|
||||
IEvidenceProvider evidenceProvider,
|
||||
IAuditLogProvider auditLogProvider,
|
||||
IApprovalProvider approvalProvider,
|
||||
TimeProvider timeProvider,
|
||||
ControlValidatorConfig config,
|
||||
ILogger<ControlValidator> logger)
|
||||
{
|
||||
_evidenceProvider = evidenceProvider;
|
||||
_auditLogProvider = auditLogProvider;
|
||||
_approvalProvider = approvalProvider;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates a control for a release.
|
||||
/// </summary>
|
||||
public async Task<ControlEvaluationResult> ValidateAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Validating control {ControlId} for release {ReleaseId}",
|
||||
control.Id, releaseId);
|
||||
|
||||
try
|
||||
{
|
||||
var result = control.Category switch
|
||||
{
|
||||
ControlCategory.AccessControl => await ValidateAccessControlAsync(releaseId, control, ct),
|
||||
ControlCategory.ChangeManagement => await ValidateChangeManagementAsync(releaseId, control, ct),
|
||||
ControlCategory.DataProtection => await ValidateDataProtectionAsync(releaseId, control, ct),
|
||||
ControlCategory.IncidentResponse => await ValidateIncidentResponseAsync(releaseId, control, ct),
|
||||
ControlCategory.RiskManagement => await ValidateRiskManagementAsync(releaseId, control, ct),
|
||||
ControlCategory.SecurityMonitoring => await ValidateSecurityMonitoringAsync(releaseId, control, ct),
|
||||
ControlCategory.VendorManagement => await ValidateVendorManagementAsync(releaseId, control, ct),
|
||||
_ => await ValidateGenericAsync(releaseId, control, ct)
|
||||
};
|
||||
|
||||
return result;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Error validating control {ControlId} for release {ReleaseId}",
|
||||
control.Id, releaseId);
|
||||
|
||||
return new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = ControlStatus.Failed,
|
||||
FailureReason = $"Validation error: {ex.Message}"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<ControlEvaluationResult> ValidateAccessControlAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var evidence = new List<string>();
|
||||
var passed = true;
|
||||
string? failureReason = null;
|
||||
|
||||
// Check authentication evidence
|
||||
var authEvents = await _auditLogProvider.GetAuthenticationEventsAsync(releaseId, ct);
|
||||
if (authEvents.Count == 0)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = "No authentication events found for release";
|
||||
}
|
||||
else
|
||||
{
|
||||
evidence.Add($"Found {authEvents.Count} authentication events");
|
||||
|
||||
// Check for MFA where required
|
||||
if (_config.RequireMfa)
|
||||
{
|
||||
var mfaEvents = authEvents.Where(e => e.UsedMfa).ToList();
|
||||
if (mfaEvents.Count < authEvents.Count)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = $"{authEvents.Count - mfaEvents.Count} actions without MFA";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check authorization
|
||||
var authzEvents = await _auditLogProvider.GetAuthorizationEventsAsync(releaseId, ct);
|
||||
if (authzEvents.Any(e => e.Denied))
|
||||
{
|
||||
evidence.Add("Authorization denials recorded and logged");
|
||||
}
|
||||
|
||||
return new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = passed ? ControlStatus.Passed : ControlStatus.Failed,
|
||||
FailureReason = failureReason,
|
||||
Evidence = evidence.ToImmutableArray(),
|
||||
RemediationGuidance = passed ? null : "Ensure all release actions use authenticated sessions with MFA"
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ControlEvaluationResult> ValidateChangeManagementAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var evidence = new List<string>();
|
||||
var passed = true;
|
||||
string? failureReason = null;
|
||||
|
||||
// Check for approvals
|
||||
var approvals = await _approvalProvider.GetApprovalsAsync(releaseId, ct);
|
||||
if (approvals.Count == 0)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = "No approvals found for release";
|
||||
}
|
||||
else
|
||||
{
|
||||
evidence.Add($"Found {approvals.Count} approval(s)");
|
||||
|
||||
// Check approval chain
|
||||
if (_config.RequireApprovalChain)
|
||||
{
|
||||
var hasDevApproval = approvals.Any(a => a.Role == "Developer" || a.Role == "Engineer");
|
||||
var hasReviewApproval = approvals.Any(a => a.Role == "Reviewer" || a.Role == "QA");
|
||||
var hasManagerApproval = approvals.Any(a => a.Role == "Manager" || a.Role == "Lead");
|
||||
|
||||
if (!hasDevApproval || !hasReviewApproval)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = "Incomplete approval chain";
|
||||
}
|
||||
|
||||
evidence.Add($"Approval chain: Dev={hasDevApproval}, Review={hasReviewApproval}, Manager={hasManagerApproval}");
|
||||
}
|
||||
}
|
||||
|
||||
// Check for test evidence
|
||||
var testEvidence = await _evidenceProvider.GetTestEvidenceAsync(releaseId, ct);
|
||||
if (testEvidence.Count > 0)
|
||||
{
|
||||
evidence.Add($"Test evidence: {testEvidence.Count} test run(s)");
|
||||
|
||||
var passRate = testEvidence.Average(t => t.PassRate);
|
||||
if (passRate < _config.MinTestPassRate)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = $"Test pass rate {passRate:P0} below threshold {_config.MinTestPassRate:P0}";
|
||||
}
|
||||
}
|
||||
else if (_config.RequireTestEvidence)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = "No test evidence found";
|
||||
}
|
||||
|
||||
// Check for change ticket
|
||||
var changeTicket = await _auditLogProvider.GetChangeTicketAsync(releaseId, ct);
|
||||
if (changeTicket is not null)
|
||||
{
|
||||
evidence.Add($"Change ticket: {changeTicket.Id}");
|
||||
}
|
||||
else if (_config.RequireChangeTicket)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = "No change ticket linked to release";
|
||||
}
|
||||
|
||||
return new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = passed ? ControlStatus.Passed : ControlStatus.Failed,
|
||||
FailureReason = failureReason,
|
||||
Evidence = evidence.ToImmutableArray(),
|
||||
RemediationGuidance = passed ? null : "Ensure complete approval chain, test evidence, and change ticket"
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ControlEvaluationResult> ValidateDataProtectionAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var evidence = new List<string>();
|
||||
var passed = true;
|
||||
string? failureReason = null;
|
||||
|
||||
// Check for encryption evidence
|
||||
var encryptionEvidence = await _evidenceProvider.GetEncryptionEvidenceAsync(releaseId, ct);
|
||||
if (encryptionEvidence.Count > 0)
|
||||
{
|
||||
evidence.Add($"Encryption evidence: {encryptionEvidence.Count} artifact(s)");
|
||||
|
||||
// Verify encryption standards
|
||||
var weakEncryption = encryptionEvidence.Where(e => !IsStrongEncryption(e.Algorithm)).ToList();
|
||||
if (weakEncryption.Count > 0)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = $"{weakEncryption.Count} artifact(s) use weak encryption";
|
||||
}
|
||||
}
|
||||
|
||||
// Check for data classification
|
||||
var classification = await _evidenceProvider.GetDataClassificationAsync(releaseId, ct);
|
||||
if (classification is not null)
|
||||
{
|
||||
evidence.Add($"Data classification: {classification.Level}");
|
||||
}
|
||||
|
||||
return new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = passed ? ControlStatus.Passed : ControlStatus.Failed,
|
||||
FailureReason = failureReason,
|
||||
Evidence = evidence.ToImmutableArray(),
|
||||
RemediationGuidance = passed ? null : "Ensure all data uses approved encryption standards"
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ControlEvaluationResult> ValidateSecurityMonitoringAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var evidence = new List<string>();
|
||||
var passed = true;
|
||||
string? failureReason = null;
|
||||
|
||||
// Check for security scans
|
||||
var scanResults = await _evidenceProvider.GetSecurityScanResultsAsync(releaseId, ct);
|
||||
if (scanResults.Count > 0)
|
||||
{
|
||||
evidence.Add($"Security scans: {scanResults.Count} scan(s)");
|
||||
|
||||
var criticalFindings = scanResults.Sum(s => s.CriticalCount);
|
||||
var highFindings = scanResults.Sum(s => s.HighCount);
|
||||
|
||||
if (criticalFindings > 0)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = $"{criticalFindings} critical security finding(s)";
|
||||
}
|
||||
else if (highFindings > _config.MaxHighFindings)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = $"{highFindings} high severity findings exceed threshold";
|
||||
}
|
||||
|
||||
evidence.Add($"Findings: Critical={criticalFindings}, High={highFindings}");
|
||||
}
|
||||
else if (_config.RequireSecurityScan)
|
||||
{
|
||||
passed = false;
|
||||
failureReason = "No security scan results found";
|
||||
}
|
||||
|
||||
// Check for vulnerability assessment
|
||||
var vulnAssessment = await _evidenceProvider.GetVulnerabilityAssessmentAsync(releaseId, ct);
|
||||
if (vulnAssessment is not null)
|
||||
{
|
||||
evidence.Add($"Vulnerability assessment: {vulnAssessment.TotalVulnerabilities} vulns");
|
||||
}
|
||||
|
||||
return new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = passed ? ControlStatus.Passed : ControlStatus.Failed,
|
||||
FailureReason = failureReason,
|
||||
Evidence = evidence.ToImmutableArray(),
|
||||
RemediationGuidance = passed ? null : "Address critical and high severity security findings"
|
||||
};
|
||||
}
|
||||
|
||||
private Task<ControlEvaluationResult> ValidateIncidentResponseAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Incident response controls are typically manual review
|
||||
return Task.FromResult(new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = ControlStatus.Partial,
|
||||
FailureReason = "Requires manual review",
|
||||
RemediationGuidance = "Verify incident response procedures are documented and tested"
|
||||
});
|
||||
}
|
||||
|
||||
private Task<ControlEvaluationResult> ValidateRiskManagementAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Risk management controls are typically manual review
|
||||
return Task.FromResult(new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = ControlStatus.Partial,
|
||||
FailureReason = "Requires manual review",
|
||||
RemediationGuidance = "Verify risk assessment is documented and approved"
|
||||
});
|
||||
}
|
||||
|
||||
private Task<ControlEvaluationResult> ValidateVendorManagementAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Vendor management controls are typically manual review
|
||||
return Task.FromResult(new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = ControlStatus.Partial,
|
||||
FailureReason = "Requires manual review",
|
||||
RemediationGuidance = "Verify vendor assessments are current and approved"
|
||||
});
|
||||
}
|
||||
|
||||
private Task<ControlEvaluationResult> ValidateGenericAsync(
|
||||
Guid releaseId,
|
||||
ComplianceControl control,
|
||||
CancellationToken ct)
|
||||
{
|
||||
return Task.FromResult(new ControlEvaluationResult
|
||||
{
|
||||
ControlId = control.Id,
|
||||
ControlName = control.Name,
|
||||
Status = ControlStatus.NotApplicable,
|
||||
FailureReason = "Control validation not implemented"
|
||||
});
|
||||
}
|
||||
|
||||
private static bool IsStrongEncryption(string algorithm)
|
||||
{
|
||||
var strongAlgorithms = new[]
|
||||
{
|
||||
"AES-256", "AES256", "RSA-4096", "RSA4096", "ECDSA-P384", "ECDSA-P521",
|
||||
"Ed25519", "ChaCha20-Poly1305", "SM4", "GOST"
|
||||
};
|
||||
|
||||
return strongAlgorithms.Any(a =>
|
||||
algorithm.Contains(a, StringComparison.OrdinalIgnoreCase));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for control validator.
|
||||
/// </summary>
|
||||
public sealed record ControlValidatorConfig
|
||||
{
|
||||
public bool RequireMfa { get; init; } = true;
|
||||
public bool RequireApprovalChain { get; init; } = true;
|
||||
public bool RequireTestEvidence { get; init; } = true;
|
||||
public bool RequireChangeTicket { get; init; } = true;
|
||||
public bool RequireSecurityScan { get; init; } = true;
|
||||
public double MinTestPassRate { get; init; } = 0.95;
|
||||
public int MaxHighFindings { get; init; } = 5;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for audit log provider.
|
||||
/// </summary>
|
||||
public interface IAuditLogProvider
|
||||
{
|
||||
Task<IReadOnlyList<AuthenticationEvent>> GetAuthenticationEventsAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<AuthorizationEvent>> GetAuthorizationEventsAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<ChangeTicket?> GetChangeTicketAsync(Guid releaseId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for approval provider.
|
||||
/// </summary>
|
||||
public interface IApprovalProvider
|
||||
{
|
||||
Task<IReadOnlyList<Approval>> GetApprovalsAsync(Guid releaseId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Extended evidence provider interface.
|
||||
/// </summary>
|
||||
public interface IExtendedEvidenceProvider : IEvidenceProvider
|
||||
{
|
||||
Task<IReadOnlyList<TestEvidence>> GetTestEvidenceAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<EncryptionEvidence>> GetEncryptionEvidenceAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<DataClassification?> GetDataClassificationAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<IReadOnlyList<SecurityScanResult>> GetSecurityScanResultsAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<VulnerabilityAssessment?> GetVulnerabilityAssessmentAsync(Guid releaseId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Authentication event.
|
||||
/// </summary>
|
||||
public sealed record AuthenticationEvent
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string UserId { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required bool UsedMfa { get; init; }
|
||||
public required string AuthMethod { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Authorization event.
|
||||
/// </summary>
|
||||
public sealed record AuthorizationEvent
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string UserId { get; init; }
|
||||
public required string Resource { get; init; }
|
||||
public required string Action { get; init; }
|
||||
public required bool Denied { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Change ticket.
|
||||
/// </summary>
|
||||
public sealed record ChangeTicket
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Title { get; init; }
|
||||
public required string Status { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Approval record.
|
||||
/// </summary>
|
||||
public sealed record Approval
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string ApproverUserId { get; init; }
|
||||
public required string ApproverName { get; init; }
|
||||
public required string Role { get; init; }
|
||||
public required DateTimeOffset ApprovedAt { get; init; }
|
||||
public string? Comment { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test evidence.
|
||||
/// </summary>
|
||||
public sealed record TestEvidence
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string TestSuite { get; init; }
|
||||
public required int TotalTests { get; init; }
|
||||
public required int PassedTests { get; init; }
|
||||
public required int FailedTests { get; init; }
|
||||
public required double PassRate { get; init; }
|
||||
public required DateTimeOffset ExecutedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Encryption evidence.
|
||||
/// </summary>
|
||||
public sealed record EncryptionEvidence
|
||||
{
|
||||
public required string ArtifactId { get; init; }
|
||||
public required string Algorithm { get; init; }
|
||||
public required int KeyLength { get; init; }
|
||||
public required DateTimeOffset VerifiedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Data classification.
|
||||
/// </summary>
|
||||
public sealed record DataClassification
|
||||
{
|
||||
public required string Level { get; init; }
|
||||
public required string ClassifiedBy { get; init; }
|
||||
public required DateTimeOffset ClassifiedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Security scan result.
|
||||
/// </summary>
|
||||
public sealed record SecurityScanResult
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string ScanType { get; init; }
|
||||
public required string Scanner { get; init; }
|
||||
public required int CriticalCount { get; init; }
|
||||
public required int HighCount { get; init; }
|
||||
public required int MediumCount { get; init; }
|
||||
public required int LowCount { get; init; }
|
||||
public required DateTimeOffset ScannedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Vulnerability assessment.
|
||||
/// </summary>
|
||||
public sealed record VulnerabilityAssessment
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required int TotalVulnerabilities { get; init; }
|
||||
public required int RemediatedCount { get; init; }
|
||||
public required int AcceptedRiskCount { get; init; }
|
||||
public required DateTimeOffset AssessedAt { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,586 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// EvidenceChainVisualizer.cs
|
||||
// Sprint: SPRINT_20260117_039_ReleaseOrchestrator_compliance
|
||||
// Task: TASK-039-04 - Evidence chain visualization
|
||||
// Description: Visualizes evidence chains with graph representation and integrity verification
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Visualizes and verifies evidence chains for compliance auditing.
|
||||
/// </summary>
|
||||
public sealed class EvidenceChainVisualizer : IEvidenceChainVisualizer
|
||||
{
|
||||
private readonly IEvidenceStore _evidenceStore;
|
||||
private readonly EvidenceChainConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<EvidenceChainVisualizer> _logger;
|
||||
|
||||
public EvidenceChainVisualizer(
|
||||
IEvidenceStore evidenceStore,
|
||||
EvidenceChainConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<EvidenceChainVisualizer> logger)
|
||||
{
|
||||
_evidenceStore = evidenceStore;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Builds an evidence chain for a release.
|
||||
/// </summary>
|
||||
public async Task<EvidenceChain> BuildChainAsync(string releaseId, CancellationToken ct = default)
|
||||
{
|
||||
var evidence = await _evidenceStore.GetEvidenceForReleaseAsync(releaseId, ct);
|
||||
|
||||
var nodes = new List<EvidenceNode>();
|
||||
var edges = new List<EvidenceEdge>();
|
||||
|
||||
// Build nodes from evidence items
|
||||
foreach (var item in evidence.OrderBy(e => e.Timestamp))
|
||||
{
|
||||
nodes.Add(new EvidenceNode
|
||||
{
|
||||
Id = item.Id,
|
||||
Type = item.Type,
|
||||
Description = item.Description,
|
||||
Timestamp = item.Timestamp,
|
||||
Hash = item.ContentHash,
|
||||
Actor = item.Actor,
|
||||
Source = item.Source,
|
||||
Metadata = item.Metadata
|
||||
});
|
||||
}
|
||||
|
||||
// Build edges based on temporal and causal relationships
|
||||
for (int i = 0; i < nodes.Count; i++)
|
||||
{
|
||||
for (int j = i + 1; j < nodes.Count; j++)
|
||||
{
|
||||
var relationship = DetermineRelationship(nodes[i], nodes[j]);
|
||||
if (relationship.HasValue)
|
||||
{
|
||||
edges.Add(new EvidenceEdge
|
||||
{
|
||||
FromId = nodes[i].Id,
|
||||
ToId = nodes[j].Id,
|
||||
Relationship = relationship.Value
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compute chain integrity
|
||||
var chainHash = ComputeChainHash(nodes);
|
||||
|
||||
var chain = new EvidenceChain
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Nodes = nodes.ToImmutableArray(),
|
||||
Edges = edges.ToImmutableArray(),
|
||||
ChainHash = chainHash,
|
||||
BuiltAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Built evidence chain for {ReleaseId} with {NodeCount} nodes and {EdgeCount} edges",
|
||||
releaseId, nodes.Count, edges.Count);
|
||||
|
||||
return chain;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Verifies the integrity of an evidence chain.
|
||||
/// </summary>
|
||||
public async Task<ChainVerificationResult> VerifyChainAsync(
|
||||
EvidenceChain chain,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var issues = new List<ChainIssue>();
|
||||
|
||||
// Verify each node
|
||||
foreach (var node in chain.Nodes)
|
||||
{
|
||||
var storedEvidence = await _evidenceStore.GetEvidenceByIdAsync(node.Id, ct);
|
||||
if (storedEvidence is null)
|
||||
{
|
||||
issues.Add(new ChainIssue
|
||||
{
|
||||
NodeId = node.Id,
|
||||
Severity = IssueSeverity.Critical,
|
||||
Description = "Evidence not found in store",
|
||||
Type = IssueType.MissingEvidence
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// Verify hash
|
||||
if (storedEvidence.ContentHash != node.Hash)
|
||||
{
|
||||
issues.Add(new ChainIssue
|
||||
{
|
||||
NodeId = node.Id,
|
||||
Severity = IssueSeverity.Critical,
|
||||
Description = "Content hash mismatch",
|
||||
Type = IssueType.TamperedEvidence
|
||||
});
|
||||
}
|
||||
|
||||
// Verify timestamp consistency
|
||||
if (storedEvidence.Timestamp != node.Timestamp)
|
||||
{
|
||||
issues.Add(new ChainIssue
|
||||
{
|
||||
NodeId = node.Id,
|
||||
Severity = IssueSeverity.Warning,
|
||||
Description = "Timestamp mismatch",
|
||||
Type = IssueType.TimestampMismatch
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Verify temporal ordering
|
||||
var sortedNodes = chain.Nodes.OrderBy(n => n.Timestamp).ToList();
|
||||
for (int i = 1; i < sortedNodes.Count; i++)
|
||||
{
|
||||
if (sortedNodes[i].Timestamp < sortedNodes[i - 1].Timestamp)
|
||||
{
|
||||
issues.Add(new ChainIssue
|
||||
{
|
||||
NodeId = sortedNodes[i].Id,
|
||||
Severity = IssueSeverity.Warning,
|
||||
Description = "Evidence out of temporal order",
|
||||
Type = IssueType.OrderingViolation
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Verify chain hash
|
||||
var expectedHash = ComputeChainHash(chain.Nodes);
|
||||
if (expectedHash != chain.ChainHash)
|
||||
{
|
||||
issues.Add(new ChainIssue
|
||||
{
|
||||
Severity = IssueSeverity.Critical,
|
||||
Description = "Chain hash mismatch - chain may have been tampered",
|
||||
Type = IssueType.ChainHashMismatch
|
||||
});
|
||||
}
|
||||
|
||||
// Verify edge consistency
|
||||
foreach (var edge in chain.Edges)
|
||||
{
|
||||
var fromNode = chain.Nodes.FirstOrDefault(n => n.Id == edge.FromId);
|
||||
var toNode = chain.Nodes.FirstOrDefault(n => n.Id == edge.ToId);
|
||||
|
||||
if (fromNode.Id is null || toNode.Id is null)
|
||||
{
|
||||
issues.Add(new ChainIssue
|
||||
{
|
||||
Severity = IssueSeverity.Critical,
|
||||
Description = $"Edge references non-existent node: {edge.FromId} -> {edge.ToId}",
|
||||
Type = IssueType.BrokenEdge
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
var isValid = !issues.Any(i => i.Severity == IssueSeverity.Critical);
|
||||
|
||||
return new ChainVerificationResult
|
||||
{
|
||||
IsValid = isValid,
|
||||
Issues = issues.ToImmutableArray(),
|
||||
VerifiedAt = _timeProvider.GetUtcNow(),
|
||||
NodesVerified = chain.Nodes.Length,
|
||||
EdgesVerified = chain.Edges.Length
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates a visual representation of the evidence chain.
|
||||
/// </summary>
|
||||
public EvidenceChainGraph ToGraph(EvidenceChain chain)
|
||||
{
|
||||
var layers = new List<GraphLayer>();
|
||||
var nodesByType = chain.Nodes.GroupBy(n => n.Type);
|
||||
|
||||
foreach (var group in nodesByType)
|
||||
{
|
||||
layers.Add(new GraphLayer
|
||||
{
|
||||
Name = group.Key.ToString(),
|
||||
NodeIds = group.Select(n => n.Id).ToImmutableArray()
|
||||
});
|
||||
}
|
||||
|
||||
var graphNodes = chain.Nodes.Select(n => new GraphNode
|
||||
{
|
||||
Id = n.Id,
|
||||
Label = $"{n.Type}: {n.Description}",
|
||||
Type = n.Type.ToString(),
|
||||
Timestamp = n.Timestamp,
|
||||
Style = GetNodeStyle(n.Type)
|
||||
}).ToImmutableArray();
|
||||
|
||||
var graphEdges = chain.Edges.Select(e => new GraphEdge
|
||||
{
|
||||
FromId = e.FromId,
|
||||
ToId = e.ToId,
|
||||
Label = e.Relationship.ToString(),
|
||||
Style = GetEdgeStyle(e.Relationship)
|
||||
}).ToImmutableArray();
|
||||
|
||||
return new EvidenceChainGraph
|
||||
{
|
||||
ReleaseId = chain.ReleaseId,
|
||||
Nodes = graphNodes,
|
||||
Edges = graphEdges,
|
||||
Layers = layers.ToImmutableArray(),
|
||||
Metadata = new GraphMetadata
|
||||
{
|
||||
NodeCount = chain.Nodes.Length,
|
||||
EdgeCount = chain.Edges.Length,
|
||||
TimeSpan = chain.Nodes.Any()
|
||||
? chain.Nodes.Max(n => n.Timestamp) - chain.Nodes.Min(n => n.Timestamp)
|
||||
: TimeSpan.Zero
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exports the evidence chain to various formats.
|
||||
/// </summary>
|
||||
public async Task<ExportResult> ExportAsync(
|
||||
EvidenceChain chain,
|
||||
ExportFormat format,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var content = format switch
|
||||
{
|
||||
ExportFormat.Json => JsonSerializer.Serialize(chain, new JsonSerializerOptions { WriteIndented = true }),
|
||||
ExportFormat.Dot => GenerateDotFormat(chain),
|
||||
ExportFormat.Mermaid => GenerateMermaidFormat(chain),
|
||||
ExportFormat.Csv => GenerateCsvFormat(chain),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(format))
|
||||
};
|
||||
|
||||
return new ExportResult
|
||||
{
|
||||
Content = content,
|
||||
Format = format,
|
||||
ContentType = GetContentType(format),
|
||||
FileName = $"evidence-chain-{chain.ReleaseId}.{GetExtension(format)}"
|
||||
};
|
||||
}
|
||||
|
||||
private EvidenceRelationship? DetermineRelationship(EvidenceNode from, EvidenceNode to)
|
||||
{
|
||||
// Temporal precedence
|
||||
if (from.Timestamp >= to.Timestamp) return null;
|
||||
|
||||
// Determine relationship based on types
|
||||
return (from.Type, to.Type) switch
|
||||
{
|
||||
(EvidenceType.ScanResult, EvidenceType.PolicyDecision) => EvidenceRelationship.InputTo,
|
||||
(EvidenceType.PolicyDecision, EvidenceType.Approval) => EvidenceRelationship.Enables,
|
||||
(EvidenceType.Approval, EvidenceType.DeploymentStart) => EvidenceRelationship.Triggers,
|
||||
(EvidenceType.DeploymentStart, EvidenceType.DeploymentComplete) => EvidenceRelationship.Precedes,
|
||||
(EvidenceType.DeploymentComplete, EvidenceType.HealthCheck) => EvidenceRelationship.Validates,
|
||||
_ => from.Timestamp < to.Timestamp ? EvidenceRelationship.Precedes : null
|
||||
};
|
||||
}
|
||||
|
||||
private string ComputeChainHash(IEnumerable<EvidenceNode> nodes)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
foreach (var node in nodes.OrderBy(n => n.Timestamp))
|
||||
{
|
||||
sb.Append(node.Id);
|
||||
sb.Append(node.Hash);
|
||||
sb.Append(node.Timestamp.ToUnixTimeMilliseconds());
|
||||
}
|
||||
|
||||
var hash = SHA256.HashData(Encoding.UTF8.GetBytes(sb.ToString()));
|
||||
return Convert.ToHexString(hash).ToLowerInvariant();
|
||||
}
|
||||
|
||||
private static NodeStyle GetNodeStyle(EvidenceType type)
|
||||
{
|
||||
return type switch
|
||||
{
|
||||
EvidenceType.ScanResult => new NodeStyle { Color = "#4CAF50", Shape = "ellipse" },
|
||||
EvidenceType.PolicyDecision => new NodeStyle { Color = "#2196F3", Shape = "diamond" },
|
||||
EvidenceType.Approval => new NodeStyle { Color = "#FF9800", Shape = "box" },
|
||||
EvidenceType.DeploymentStart => new NodeStyle { Color = "#9C27B0", Shape = "hexagon" },
|
||||
EvidenceType.DeploymentComplete => new NodeStyle { Color = "#4CAF50", Shape = "hexagon" },
|
||||
EvidenceType.Rollback => new NodeStyle { Color = "#F44336", Shape = "hexagon" },
|
||||
EvidenceType.HealthCheck => new NodeStyle { Color = "#00BCD4", Shape = "ellipse" },
|
||||
_ => new NodeStyle { Color = "#9E9E9E", Shape = "box" }
|
||||
};
|
||||
}
|
||||
|
||||
private static EdgeStyle GetEdgeStyle(EvidenceRelationship relationship)
|
||||
{
|
||||
return relationship switch
|
||||
{
|
||||
EvidenceRelationship.Triggers => new EdgeStyle { Color = "#FF5722", Style = "bold" },
|
||||
EvidenceRelationship.InputTo => new EdgeStyle { Color = "#2196F3", Style = "dashed" },
|
||||
EvidenceRelationship.Enables => new EdgeStyle { Color = "#4CAF50", Style = "solid" },
|
||||
EvidenceRelationship.Validates => new EdgeStyle { Color = "#00BCD4", Style = "dotted" },
|
||||
_ => new EdgeStyle { Color = "#9E9E9E", Style = "solid" }
|
||||
};
|
||||
}
|
||||
|
||||
private string GenerateDotFormat(EvidenceChain chain)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
sb.AppendLine("digraph EvidenceChain {");
|
||||
sb.AppendLine(" rankdir=LR;");
|
||||
sb.AppendLine(" node [fontname=\"Arial\"];");
|
||||
|
||||
foreach (var node in chain.Nodes)
|
||||
{
|
||||
var style = GetNodeStyle(node.Type);
|
||||
sb.AppendLine($" \"{node.Id}\" [label=\"{node.Type}\\n{node.Description}\", shape={style.Shape}, color=\"{style.Color}\"];");
|
||||
}
|
||||
|
||||
foreach (var edge in chain.Edges)
|
||||
{
|
||||
var style = GetEdgeStyle(edge.Relationship);
|
||||
sb.AppendLine($" \"{edge.FromId}\" -> \"{edge.ToId}\" [label=\"{edge.Relationship}\", style={style.Style}];");
|
||||
}
|
||||
|
||||
sb.AppendLine("}");
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
private string GenerateMermaidFormat(EvidenceChain chain)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
sb.AppendLine("graph LR");
|
||||
|
||||
foreach (var node in chain.Nodes)
|
||||
{
|
||||
sb.AppendLine($" {node.Id}[\"{node.Type}: {node.Description}\"]");
|
||||
}
|
||||
|
||||
foreach (var edge in chain.Edges)
|
||||
{
|
||||
sb.AppendLine($" {edge.FromId} -->|{edge.Relationship}| {edge.ToId}");
|
||||
}
|
||||
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
private string GenerateCsvFormat(EvidenceChain chain)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
sb.AppendLine("NodeId,Type,Description,Timestamp,Hash,Actor");
|
||||
|
||||
foreach (var node in chain.Nodes)
|
||||
{
|
||||
sb.AppendLine($"\"{node.Id}\",\"{node.Type}\",\"{node.Description}\",\"{node.Timestamp:O}\",\"{node.Hash}\",\"{node.Actor}\"");
|
||||
}
|
||||
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
private static string GetContentType(ExportFormat format) => format switch
|
||||
{
|
||||
ExportFormat.Json => "application/json",
|
||||
ExportFormat.Dot => "text/vnd.graphviz",
|
||||
ExportFormat.Mermaid => "text/plain",
|
||||
ExportFormat.Csv => "text/csv",
|
||||
_ => "application/octet-stream"
|
||||
};
|
||||
|
||||
private static string GetExtension(ExportFormat format) => format switch
|
||||
{
|
||||
ExportFormat.Json => "json",
|
||||
ExportFormat.Dot => "dot",
|
||||
ExportFormat.Mermaid => "md",
|
||||
ExportFormat.Csv => "csv",
|
||||
_ => "bin"
|
||||
};
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IEvidenceChainVisualizer
|
||||
{
|
||||
Task<EvidenceChain> BuildChainAsync(string releaseId, CancellationToken ct = default);
|
||||
Task<ChainVerificationResult> VerifyChainAsync(EvidenceChain chain, CancellationToken ct = default);
|
||||
EvidenceChainGraph ToGraph(EvidenceChain chain);
|
||||
Task<ExportResult> ExportAsync(EvidenceChain chain, ExportFormat format, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IEvidenceStore
|
||||
{
|
||||
Task<ImmutableArray<EvidenceItem>> GetEvidenceForReleaseAsync(string releaseId, CancellationToken ct = default);
|
||||
Task<EvidenceItem?> GetEvidenceByIdAsync(string evidenceId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record EvidenceChainConfig
|
||||
{
|
||||
public bool IncludeMetadata { get; init; } = true;
|
||||
public int MaxDepth { get; init; } = 100;
|
||||
}
|
||||
|
||||
public sealed record EvidenceChain
|
||||
{
|
||||
public required string ReleaseId { get; init; }
|
||||
public required ImmutableArray<EvidenceNode> Nodes { get; init; }
|
||||
public required ImmutableArray<EvidenceEdge> Edges { get; init; }
|
||||
public required string ChainHash { get; init; }
|
||||
public required DateTimeOffset BuiltAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record EvidenceNode
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required EvidenceType Type { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required string Hash { get; init; }
|
||||
public required string Actor { get; init; }
|
||||
public string? Source { get; init; }
|
||||
public ImmutableDictionary<string, string>? Metadata { get; init; }
|
||||
}
|
||||
|
||||
public sealed record EvidenceEdge
|
||||
{
|
||||
public required string FromId { get; init; }
|
||||
public required string ToId { get; init; }
|
||||
public required EvidenceRelationship Relationship { get; init; }
|
||||
}
|
||||
|
||||
public enum EvidenceType
|
||||
{
|
||||
ScanResult,
|
||||
PolicyDecision,
|
||||
Approval,
|
||||
DeploymentStart,
|
||||
DeploymentComplete,
|
||||
Rollback,
|
||||
HealthCheck,
|
||||
AuditLog,
|
||||
Signature,
|
||||
Other
|
||||
}
|
||||
|
||||
public enum EvidenceRelationship
|
||||
{
|
||||
Precedes,
|
||||
Triggers,
|
||||
InputTo,
|
||||
Enables,
|
||||
Validates
|
||||
}
|
||||
|
||||
public sealed record ChainVerificationResult
|
||||
{
|
||||
public required bool IsValid { get; init; }
|
||||
public required ImmutableArray<ChainIssue> Issues { get; init; }
|
||||
public required DateTimeOffset VerifiedAt { get; init; }
|
||||
public required int NodesVerified { get; init; }
|
||||
public required int EdgesVerified { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ChainIssue
|
||||
{
|
||||
public string? NodeId { get; init; }
|
||||
public required IssueSeverity Severity { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required IssueType Type { get; init; }
|
||||
}
|
||||
|
||||
public enum IssueSeverity { Info, Warning, Critical }
|
||||
public enum IssueType { MissingEvidence, TamperedEvidence, TimestampMismatch, OrderingViolation, ChainHashMismatch, BrokenEdge }
|
||||
|
||||
public sealed record EvidenceChainGraph
|
||||
{
|
||||
public required string ReleaseId { get; init; }
|
||||
public required ImmutableArray<GraphNode> Nodes { get; init; }
|
||||
public required ImmutableArray<GraphEdge> Edges { get; init; }
|
||||
public required ImmutableArray<GraphLayer> Layers { get; init; }
|
||||
public required GraphMetadata Metadata { get; init; }
|
||||
}
|
||||
|
||||
public sealed record GraphNode
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Label { get; init; }
|
||||
public required string Type { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required NodeStyle Style { get; init; }
|
||||
}
|
||||
|
||||
public sealed record GraphEdge
|
||||
{
|
||||
public required string FromId { get; init; }
|
||||
public required string ToId { get; init; }
|
||||
public required string Label { get; init; }
|
||||
public required EdgeStyle Style { get; init; }
|
||||
}
|
||||
|
||||
public sealed record GraphLayer
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required ImmutableArray<string> NodeIds { get; init; }
|
||||
}
|
||||
|
||||
public sealed record GraphMetadata
|
||||
{
|
||||
public required int NodeCount { get; init; }
|
||||
public required int EdgeCount { get; init; }
|
||||
public required TimeSpan TimeSpan { get; init; }
|
||||
}
|
||||
|
||||
public sealed record NodeStyle
|
||||
{
|
||||
public required string Color { get; init; }
|
||||
public required string Shape { get; init; }
|
||||
}
|
||||
|
||||
public sealed record EdgeStyle
|
||||
{
|
||||
public required string Color { get; init; }
|
||||
public required string Style { get; init; }
|
||||
}
|
||||
|
||||
public enum ExportFormat { Json, Dot, Mermaid, Csv }
|
||||
|
||||
public sealed record ExportResult
|
||||
{
|
||||
public required string Content { get; init; }
|
||||
public required ExportFormat Format { get; init; }
|
||||
public required string ContentType { get; init; }
|
||||
public required string FileName { get; init; }
|
||||
}
|
||||
|
||||
public sealed record EvidenceItem
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required EvidenceType Type { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public required string ContentHash { get; init; }
|
||||
public required string Actor { get; init; }
|
||||
public string? Source { get; init; }
|
||||
public ImmutableDictionary<string, string>? Metadata { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,533 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Maps controls between compliance frameworks and provides framework definitions.
|
||||
/// </summary>
|
||||
public sealed class FrameworkMapper : IFrameworkMapper
|
||||
{
|
||||
private readonly ILogger<FrameworkMapper> _logger;
|
||||
private readonly ImmutableDictionary<ComplianceFramework, ImmutableArray<ComplianceControl>> _frameworkControls;
|
||||
private readonly ImmutableDictionary<(ComplianceFramework, ComplianceFramework), ImmutableDictionary<string, string>> _crossMappings;
|
||||
|
||||
public FrameworkMapper(ILogger<FrameworkMapper> logger)
|
||||
{
|
||||
_logger = logger;
|
||||
_frameworkControls = BuildFrameworkControls();
|
||||
_crossMappings = BuildCrossMappings();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all controls for a framework.
|
||||
/// </summary>
|
||||
public IReadOnlyList<ComplianceControl> GetControls(ComplianceFramework framework)
|
||||
{
|
||||
if (_frameworkControls.TryGetValue(framework, out var controls))
|
||||
{
|
||||
return controls;
|
||||
}
|
||||
|
||||
_logger.LogWarning("No controls defined for framework {Framework}", framework);
|
||||
return [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Maps controls from source framework to target framework.
|
||||
/// </summary>
|
||||
public IReadOnlyList<ComplianceControl> MapToFramework(
|
||||
ComplianceFramework sourceFramework,
|
||||
ComplianceFramework targetFramework)
|
||||
{
|
||||
var sourceControls = GetControls(sourceFramework);
|
||||
var mappingKey = (sourceFramework, targetFramework);
|
||||
|
||||
if (!_crossMappings.TryGetValue(mappingKey, out var mapping))
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"No mapping defined from {Source} to {Target}",
|
||||
sourceFramework, targetFramework);
|
||||
return [];
|
||||
}
|
||||
|
||||
var targetControls = GetControls(targetFramework);
|
||||
var mappedControls = new List<ComplianceControl>();
|
||||
|
||||
foreach (var sourceControl in sourceControls)
|
||||
{
|
||||
if (mapping.TryGetValue(sourceControl.Id, out var targetControlId))
|
||||
{
|
||||
var targetControl = targetControls.FirstOrDefault(c => c.Id == targetControlId);
|
||||
if (targetControl is not null)
|
||||
{
|
||||
mappedControls.Add(targetControl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mappedControls;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the framework metadata.
|
||||
/// </summary>
|
||||
public FrameworkMetadata GetFrameworkMetadata(ComplianceFramework framework)
|
||||
{
|
||||
return framework switch
|
||||
{
|
||||
ComplianceFramework.SOC2 => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "SOC 2",
|
||||
FullName = "Service Organization Control 2",
|
||||
Version = "2017",
|
||||
Publisher = "AICPA",
|
||||
Categories = ["Security", "Availability", "Processing Integrity", "Confidentiality", "Privacy"]
|
||||
},
|
||||
ComplianceFramework.ISO27001 => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "ISO 27001",
|
||||
FullName = "ISO/IEC 27001:2022",
|
||||
Version = "2022",
|
||||
Publisher = "ISO/IEC",
|
||||
Categories = ["Information Security Management System"]
|
||||
},
|
||||
ComplianceFramework.PCIDSS => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "PCI DSS",
|
||||
FullName = "Payment Card Industry Data Security Standard",
|
||||
Version = "4.0",
|
||||
Publisher = "PCI Security Standards Council",
|
||||
Categories = ["Build and Maintain Secure Network", "Protect Cardholder Data", "Vulnerability Management", "Access Control", "Monitoring", "Security Policy"]
|
||||
},
|
||||
ComplianceFramework.HIPAA => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "HIPAA",
|
||||
FullName = "Health Insurance Portability and Accountability Act",
|
||||
Version = "2013",
|
||||
Publisher = "HHS",
|
||||
Categories = ["Administrative Safeguards", "Physical Safeguards", "Technical Safeguards"]
|
||||
},
|
||||
ComplianceFramework.FedRAMP => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "FedRAMP",
|
||||
FullName = "Federal Risk and Authorization Management Program",
|
||||
Version = "Rev 5",
|
||||
Publisher = "GSA",
|
||||
Categories = ["Access Control", "Audit", "Configuration Management", "Incident Response", "Risk Assessment"]
|
||||
},
|
||||
ComplianceFramework.GDPR => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "GDPR",
|
||||
FullName = "General Data Protection Regulation",
|
||||
Version = "2018",
|
||||
Publisher = "European Union",
|
||||
Categories = ["Data Protection", "Privacy Rights", "Consent", "Data Breach", "International Transfer"]
|
||||
},
|
||||
ComplianceFramework.NISTCSF => new FrameworkMetadata
|
||||
{
|
||||
Framework = framework,
|
||||
Name = "NIST CSF",
|
||||
FullName = "NIST Cybersecurity Framework",
|
||||
Version = "2.0",
|
||||
Publisher = "NIST",
|
||||
Categories = ["Identify", "Protect", "Detect", "Respond", "Recover", "Govern"]
|
||||
},
|
||||
_ => throw new ArgumentException($"Unknown framework: {framework}")
|
||||
};
|
||||
}
|
||||
|
||||
private ImmutableDictionary<ComplianceFramework, ImmutableArray<ComplianceControl>> BuildFrameworkControls()
|
||||
{
|
||||
var builder = ImmutableDictionary.CreateBuilder<ComplianceFramework, ImmutableArray<ComplianceControl>>();
|
||||
|
||||
// SOC 2 Controls
|
||||
builder[ComplianceFramework.SOC2] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CC1.1",
|
||||
Name = "Control Environment",
|
||||
Description = "The entity demonstrates commitment to integrity and ethical values",
|
||||
Framework = ComplianceFramework.SOC2,
|
||||
Category = ControlCategory.RiskManagement,
|
||||
ValidationType = ControlValidationType.ManualReview
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CC6.1",
|
||||
Name = "Logical Access Security",
|
||||
Description = "The entity implements logical access security software",
|
||||
Framework = ComplianceFramework.SOC2,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated,
|
||||
RequiredEvidence = ["Authentication logs", "Access reviews"]
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CC6.2",
|
||||
Name = "System Access Removal",
|
||||
Description = "Prior to issuing system credentials, the entity registers and authorizes new users",
|
||||
Framework = ComplianceFramework.SOC2,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CC7.1",
|
||||
Name = "Vulnerability Management",
|
||||
Description = "The entity detects and monitors security vulnerabilities",
|
||||
Framework = ComplianceFramework.SOC2,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated,
|
||||
RequiredEvidence = ["Vulnerability scan reports", "Remediation records"]
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CC7.2",
|
||||
Name = "Security Event Monitoring",
|
||||
Description = "The entity monitors system components for anomalies",
|
||||
Framework = ComplianceFramework.SOC2,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CC8.1",
|
||||
Name = "Change Management",
|
||||
Description = "The entity authorizes, designs, develops, configures, tests, and approves system changes",
|
||||
Framework = ComplianceFramework.SOC2,
|
||||
Category = ControlCategory.ChangeManagement,
|
||||
ValidationType = ControlValidationType.Automated,
|
||||
RequiredEvidence = ["Change tickets", "Approval records", "Test results"]
|
||||
}
|
||||
];
|
||||
|
||||
// ISO 27001 Controls (A.5-A.8 subset)
|
||||
builder[ComplianceFramework.ISO27001] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "A.5.1",
|
||||
Name = "Policies for Information Security",
|
||||
Description = "A set of policies for information security shall be defined, approved and communicated",
|
||||
Framework = ComplianceFramework.ISO27001,
|
||||
Category = ControlCategory.RiskManagement,
|
||||
ValidationType = ControlValidationType.ManualReview
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "A.6.1",
|
||||
Name = "Screening",
|
||||
Description = "Background verification checks shall be carried out",
|
||||
Framework = ComplianceFramework.ISO27001,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.ManualReview
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "A.8.2",
|
||||
Name = "Privileged Access Rights",
|
||||
Description = "The allocation of privileged access rights shall be restricted and managed",
|
||||
Framework = ComplianceFramework.ISO27001,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "A.8.9",
|
||||
Name = "Configuration Management",
|
||||
Description = "Configurations shall be established, documented, implemented, monitored and reviewed",
|
||||
Framework = ComplianceFramework.ISO27001,
|
||||
Category = ControlCategory.ChangeManagement,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "A.8.32",
|
||||
Name = "Change Management",
|
||||
Description = "Changes to information processing facilities shall be subject to change management procedures",
|
||||
Framework = ComplianceFramework.ISO27001,
|
||||
Category = ControlCategory.ChangeManagement,
|
||||
ValidationType = ControlValidationType.Automated,
|
||||
RequiredEvidence = ["Change records", "Approval documentation"]
|
||||
}
|
||||
];
|
||||
|
||||
// PCI DSS Controls (requirements subset)
|
||||
builder[ComplianceFramework.PCIDSS] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "1.1",
|
||||
Name = "Network Security Controls",
|
||||
Description = "Install and maintain network security controls",
|
||||
Framework = ComplianceFramework.PCIDSS,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "6.2",
|
||||
Name = "Secure Development",
|
||||
Description = "Develop software securely",
|
||||
Framework = ComplianceFramework.PCIDSS,
|
||||
Category = ControlCategory.ChangeManagement,
|
||||
ValidationType = ControlValidationType.Automated,
|
||||
RequiredEvidence = ["Code review records", "Security testing results"]
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "6.3",
|
||||
Name = "Security Vulnerabilities",
|
||||
Description = "Security vulnerabilities are identified and addressed",
|
||||
Framework = ComplianceFramework.PCIDSS,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "7.1",
|
||||
Name = "Access Restriction",
|
||||
Description = "Access to system components is restricted to those with business need",
|
||||
Framework = ComplianceFramework.PCIDSS,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "10.1",
|
||||
Name = "Audit Logging",
|
||||
Description = "Log and monitor access to system components and cardholder data",
|
||||
Framework = ComplianceFramework.PCIDSS,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
}
|
||||
];
|
||||
|
||||
// HIPAA Controls
|
||||
builder[ComplianceFramework.HIPAA] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "164.312(a)(1)",
|
||||
Name = "Access Control",
|
||||
Description = "Implement technical policies and procedures for access to PHI",
|
||||
Framework = ComplianceFramework.HIPAA,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "164.312(b)",
|
||||
Name = "Audit Controls",
|
||||
Description = "Implement mechanisms to record and examine activity in systems containing PHI",
|
||||
Framework = ComplianceFramework.HIPAA,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "164.312(c)(1)",
|
||||
Name = "Integrity",
|
||||
Description = "Implement policies to protect PHI from improper alteration or destruction",
|
||||
Framework = ComplianceFramework.HIPAA,
|
||||
Category = ControlCategory.DataProtection,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "164.312(d)",
|
||||
Name = "Authentication",
|
||||
Description = "Implement procedures to verify that a person seeking access to PHI is who they claim to be",
|
||||
Framework = ComplianceFramework.HIPAA,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
}
|
||||
];
|
||||
|
||||
// FedRAMP Controls (subset)
|
||||
builder[ComplianceFramework.FedRAMP] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "AC-2",
|
||||
Name = "Account Management",
|
||||
Description = "Manage information system accounts including establishing, activating, modifying, reviewing, disabling, and removing",
|
||||
Framework = ComplianceFramework.FedRAMP,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "AU-2",
|
||||
Name = "Audit Events",
|
||||
Description = "The organization determines that the information system is capable of auditing events",
|
||||
Framework = ComplianceFramework.FedRAMP,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "CM-3",
|
||||
Name = "Configuration Change Control",
|
||||
Description = "The organization determines the types of changes to the information system that are configuration-controlled",
|
||||
Framework = ComplianceFramework.FedRAMP,
|
||||
Category = ControlCategory.ChangeManagement,
|
||||
ValidationType = ControlValidationType.Automated,
|
||||
RequiredEvidence = ["Change control records", "Approval documentation"]
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "IR-4",
|
||||
Name = "Incident Handling",
|
||||
Description = "The organization implements an incident handling capability",
|
||||
Framework = ComplianceFramework.FedRAMP,
|
||||
Category = ControlCategory.IncidentResponse,
|
||||
ValidationType = ControlValidationType.ManualReview
|
||||
}
|
||||
];
|
||||
|
||||
// GDPR Controls
|
||||
builder[ComplianceFramework.GDPR] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "Art.5",
|
||||
Name = "Principles of Processing",
|
||||
Description = "Personal data shall be processed lawfully, fairly and transparently",
|
||||
Framework = ComplianceFramework.GDPR,
|
||||
Category = ControlCategory.DataProtection,
|
||||
ValidationType = ControlValidationType.ManualReview
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "Art.25",
|
||||
Name = "Data Protection by Design",
|
||||
Description = "Implement appropriate technical and organisational measures designed to implement data-protection principles",
|
||||
Framework = ComplianceFramework.GDPR,
|
||||
Category = ControlCategory.DataProtection,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "Art.30",
|
||||
Name = "Records of Processing",
|
||||
Description = "Maintain a record of processing activities",
|
||||
Framework = ComplianceFramework.GDPR,
|
||||
Category = ControlCategory.DataProtection,
|
||||
ValidationType = ControlValidationType.Evidence
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "Art.32",
|
||||
Name = "Security of Processing",
|
||||
Description = "Implement appropriate technical and organisational measures to ensure security",
|
||||
Framework = ComplianceFramework.GDPR,
|
||||
Category = ControlCategory.DataProtection,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
}
|
||||
];
|
||||
|
||||
// NIST CSF Controls
|
||||
builder[ComplianceFramework.NISTCSF] =
|
||||
[
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "ID.AM-1",
|
||||
Name = "Asset Inventory",
|
||||
Description = "Physical devices and systems within the organization are inventoried",
|
||||
Framework = ComplianceFramework.NISTCSF,
|
||||
Category = ControlCategory.RiskManagement,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "PR.AC-1",
|
||||
Name = "Identity Management",
|
||||
Description = "Identities and credentials are issued, managed, verified, revoked, and audited",
|
||||
Framework = ComplianceFramework.NISTCSF,
|
||||
Category = ControlCategory.AccessControl,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "PR.DS-1",
|
||||
Name = "Data-at-Rest Protection",
|
||||
Description = "Data-at-rest is protected",
|
||||
Framework = ComplianceFramework.NISTCSF,
|
||||
Category = ControlCategory.DataProtection,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "DE.CM-1",
|
||||
Name = "Network Monitoring",
|
||||
Description = "The network is monitored to detect potential cybersecurity events",
|
||||
Framework = ComplianceFramework.NISTCSF,
|
||||
Category = ControlCategory.SecurityMonitoring,
|
||||
ValidationType = ControlValidationType.Automated
|
||||
},
|
||||
new ComplianceControl
|
||||
{
|
||||
Id = "RS.RP-1",
|
||||
Name = "Response Planning",
|
||||
Description = "Response plan is executed during or after an incident",
|
||||
Framework = ComplianceFramework.NISTCSF,
|
||||
Category = ControlCategory.IncidentResponse,
|
||||
ValidationType = ControlValidationType.ManualReview
|
||||
}
|
||||
];
|
||||
|
||||
return builder.ToImmutable();
|
||||
}
|
||||
|
||||
private ImmutableDictionary<(ComplianceFramework, ComplianceFramework), ImmutableDictionary<string, string>> BuildCrossMappings()
|
||||
{
|
||||
var builder = ImmutableDictionary.CreateBuilder<(ComplianceFramework, ComplianceFramework), ImmutableDictionary<string, string>>();
|
||||
|
||||
// SOC 2 to ISO 27001 mapping
|
||||
builder[(ComplianceFramework.SOC2, ComplianceFramework.ISO27001)] = new Dictionary<string, string>
|
||||
{
|
||||
["CC6.1"] = "A.8.2",
|
||||
["CC8.1"] = "A.8.32",
|
||||
["CC7.1"] = "A.8.9"
|
||||
}.ToImmutableDictionary();
|
||||
|
||||
// SOC 2 to NIST CSF mapping
|
||||
builder[(ComplianceFramework.SOC2, ComplianceFramework.NISTCSF)] = new Dictionary<string, string>
|
||||
{
|
||||
["CC6.1"] = "PR.AC-1",
|
||||
["CC7.1"] = "DE.CM-1",
|
||||
["CC7.2"] = "DE.CM-1"
|
||||
}.ToImmutableDictionary();
|
||||
|
||||
// ISO 27001 to SOC 2 mapping
|
||||
builder[(ComplianceFramework.ISO27001, ComplianceFramework.SOC2)] = new Dictionary<string, string>
|
||||
{
|
||||
["A.8.2"] = "CC6.1",
|
||||
["A.8.32"] = "CC8.1"
|
||||
}.ToImmutableDictionary();
|
||||
|
||||
return builder.ToImmutable();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Metadata about a compliance framework.
|
||||
/// </summary>
|
||||
public sealed record FrameworkMetadata
|
||||
{
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required string Name { get; init; }
|
||||
public required string FullName { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required string Publisher { get; init; }
|
||||
public ImmutableArray<string> Categories { get; init; } = [];
|
||||
}
|
||||
@@ -0,0 +1,855 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Generates compliance reports in various formats.
|
||||
/// </summary>
|
||||
public sealed class ReportGenerator
|
||||
{
|
||||
private readonly IReportTemplateProvider _templateProvider;
|
||||
private readonly IEvidenceChainBuilder _evidenceChainBuilder;
|
||||
private readonly IAuditQueryEngine _auditQueryEngine;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ReportGeneratorConfig _config;
|
||||
private readonly ILogger<ReportGenerator> _logger;
|
||||
|
||||
public ReportGenerator(
|
||||
IReportTemplateProvider templateProvider,
|
||||
IEvidenceChainBuilder evidenceChainBuilder,
|
||||
IAuditQueryEngine auditQueryEngine,
|
||||
TimeProvider timeProvider,
|
||||
ReportGeneratorConfig config,
|
||||
ILogger<ReportGenerator> logger)
|
||||
{
|
||||
_templateProvider = templateProvider;
|
||||
_evidenceChainBuilder = evidenceChainBuilder;
|
||||
_auditQueryEngine = auditQueryEngine;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates a compliance report.
|
||||
/// </summary>
|
||||
public async Task<ComplianceReport> GenerateAsync(
|
||||
ReportRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Generating {ReportType} report for {Scope}",
|
||||
request.ReportType, request.Scope);
|
||||
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
// Get template
|
||||
var template = _templateProvider.GetTemplate(request.ReportType);
|
||||
|
||||
// Gather data based on report type
|
||||
var data = await GatherReportDataAsync(request, ct);
|
||||
|
||||
// Build evidence chain if needed
|
||||
if (request.IncludeEvidenceChain)
|
||||
{
|
||||
data.EvidenceChain = await _evidenceChainBuilder.BuildAsync(
|
||||
request.ReleaseId ?? request.Scope.ReleaseIds.FirstOrDefault(),
|
||||
ct);
|
||||
}
|
||||
|
||||
// Generate sections
|
||||
var sections = await GenerateSectionsAsync(template, data, ct);
|
||||
|
||||
var report = new ComplianceReport
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
ReportType = request.ReportType,
|
||||
Title = template.Title,
|
||||
GeneratedAt = startTime,
|
||||
GeneratedBy = request.RequestedBy ?? "system",
|
||||
Scope = request.Scope,
|
||||
Frameworks = request.Frameworks,
|
||||
Sections = sections,
|
||||
Summary = GenerateSummary(data, sections),
|
||||
Metadata = new ReportMetadata
|
||||
{
|
||||
GenerationDuration = _timeProvider.GetUtcNow() - startTime,
|
||||
TemplateVersion = template.Version,
|
||||
IncludesEvidenceChain = request.IncludeEvidenceChain,
|
||||
DataCutoffTime = request.Scope.EndDate ?? startTime
|
||||
}
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Report {ReportId} generated in {Duration}",
|
||||
report.Id, report.Metadata.GenerationDuration);
|
||||
|
||||
return report;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exports a report to a specific format.
|
||||
/// </summary>
|
||||
public async Task<ExportResult> ExportAsync(
|
||||
ComplianceReport report,
|
||||
ExportFormat format,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Exporting report {ReportId} as {Format}",
|
||||
report.Id, format);
|
||||
|
||||
var exporter = GetExporter(format);
|
||||
var content = await exporter.ExportAsync(report, ct);
|
||||
|
||||
return new ExportResult
|
||||
{
|
||||
ReportId = report.Id,
|
||||
Format = format,
|
||||
Content = content,
|
||||
ContentType = GetContentType(format),
|
||||
FileName = GenerateFileName(report, format)
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Schedules recurring report generation.
|
||||
/// </summary>
|
||||
public async Task<ScheduleResult> ScheduleAsync(
|
||||
ReportSchedule schedule,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Scheduling {ReportType} report with {Schedule} schedule",
|
||||
schedule.ReportType, schedule.Frequency);
|
||||
|
||||
// Validate schedule
|
||||
if (schedule.Recipients.Length == 0)
|
||||
{
|
||||
return new ScheduleResult
|
||||
{
|
||||
Success = false,
|
||||
Error = "At least one recipient is required"
|
||||
};
|
||||
}
|
||||
|
||||
// Store schedule
|
||||
var scheduleId = Guid.NewGuid();
|
||||
|
||||
return new ScheduleResult
|
||||
{
|
||||
Success = true,
|
||||
ScheduleId = scheduleId,
|
||||
NextRunAt = CalculateNextRun(schedule)
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ReportData> GatherReportDataAsync(
|
||||
ReportRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var data = new ReportData
|
||||
{
|
||||
Scope = request.Scope,
|
||||
Frameworks = request.Frameworks
|
||||
};
|
||||
|
||||
// Query releases in scope
|
||||
if (request.Scope.ReleaseIds.Length > 0)
|
||||
{
|
||||
data.Releases = await _auditQueryEngine.GetReleasesAsync(
|
||||
request.Scope.ReleaseIds,
|
||||
ct);
|
||||
}
|
||||
else if (request.Scope.StartDate.HasValue)
|
||||
{
|
||||
data.Releases = await _auditQueryEngine.GetReleasesInRangeAsync(
|
||||
request.Scope.StartDate.Value,
|
||||
request.Scope.EndDate ?? _timeProvider.GetUtcNow(),
|
||||
ct);
|
||||
}
|
||||
|
||||
// Get compliance evaluations
|
||||
data.Evaluations = await _auditQueryEngine.GetEvaluationsAsync(
|
||||
data.Releases.Select(r => r.Id).ToImmutableArray(),
|
||||
request.Frameworks,
|
||||
ct);
|
||||
|
||||
// Get audit events
|
||||
data.AuditEvents = await _auditQueryEngine.GetAuditEventsAsync(
|
||||
request.Scope,
|
||||
ct);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<ReportSection>> GenerateSectionsAsync(
|
||||
ReportTemplate template,
|
||||
ReportData data,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var sections = new List<ReportSection>();
|
||||
|
||||
foreach (var sectionDef in template.Sections)
|
||||
{
|
||||
var section = sectionDef.Type switch
|
||||
{
|
||||
ReportSectionType.ExecutiveSummary => GenerateExecutiveSummary(data),
|
||||
ReportSectionType.ComplianceOverview => GenerateComplianceOverview(data),
|
||||
ReportSectionType.ControlDetails => await GenerateControlDetailsAsync(data, ct),
|
||||
ReportSectionType.GapAnalysis => GenerateGapAnalysis(data),
|
||||
ReportSectionType.EvidencePackage => await GenerateEvidencePackageAsync(data, ct),
|
||||
ReportSectionType.AuditTrail => GenerateAuditTrail(data),
|
||||
ReportSectionType.Recommendations => GenerateRecommendations(data),
|
||||
_ => new ReportSection { Title = sectionDef.Title, Content = "" }
|
||||
};
|
||||
|
||||
section = section with { Order = sectionDef.Order };
|
||||
sections.Add(section);
|
||||
}
|
||||
|
||||
return sections.OrderBy(s => s.Order).ToImmutableArray();
|
||||
}
|
||||
|
||||
private ReportSection GenerateExecutiveSummary(ReportData data)
|
||||
{
|
||||
var totalReleases = data.Releases.Count;
|
||||
var compliantReleases = data.Evaluations
|
||||
.Where(e => e.Status == OverallComplianceStatus.Compliant)
|
||||
.Select(e => e.ReleaseId)
|
||||
.Distinct()
|
||||
.Count();
|
||||
|
||||
var complianceRate = totalReleases > 0
|
||||
? (double)compliantReleases / totalReleases
|
||||
: 0;
|
||||
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Executive Summary",
|
||||
Type = ReportSectionType.ExecutiveSummary,
|
||||
Content = $"Compliance assessment covering {totalReleases} releases with {complianceRate:P0} compliance rate.",
|
||||
Data = new ExecutiveSummaryData
|
||||
{
|
||||
TotalReleases = totalReleases,
|
||||
CompliantReleases = compliantReleases,
|
||||
ComplianceRate = complianceRate,
|
||||
Frameworks = data.Frameworks,
|
||||
Period = data.Scope
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private ReportSection GenerateComplianceOverview(ReportData data)
|
||||
{
|
||||
var byFramework = data.Evaluations
|
||||
.GroupBy(e => e.Framework)
|
||||
.Select(g => new FrameworkOverview
|
||||
{
|
||||
Framework = g.Key,
|
||||
AverageScore = g.Average(e => e.Score),
|
||||
PassRate = g.Count(e => e.Status == OverallComplianceStatus.Compliant) / (double)g.Count()
|
||||
})
|
||||
.ToImmutableArray();
|
||||
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Compliance Overview",
|
||||
Type = ReportSectionType.ComplianceOverview,
|
||||
Content = $"Overview of compliance status across {byFramework.Length} frameworks.",
|
||||
Data = byFramework
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ReportSection> GenerateControlDetailsAsync(
|
||||
ReportData data,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Detailed control-by-control breakdown
|
||||
var controlDetails = await _auditQueryEngine.GetControlDetailsAsync(
|
||||
data.Evaluations.Select(e => e.EvaluationId).ToImmutableArray(),
|
||||
ct);
|
||||
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Control Details",
|
||||
Type = ReportSectionType.ControlDetails,
|
||||
Content = $"Detailed breakdown of {controlDetails.Count} controls.",
|
||||
Data = controlDetails
|
||||
};
|
||||
}
|
||||
|
||||
private ReportSection GenerateGapAnalysis(ReportData data)
|
||||
{
|
||||
var gaps = data.Evaluations
|
||||
.SelectMany(e => e.Gaps)
|
||||
.GroupBy(g => g.ControlId)
|
||||
.Select(g => new GapSummary
|
||||
{
|
||||
ControlId = g.Key,
|
||||
ControlName = g.First().ControlName,
|
||||
Occurrences = g.Count(),
|
||||
Severity = g.Max(x => x.Severity),
|
||||
Frameworks = g.Select(x => x.Framework).Distinct().ToImmutableArray()
|
||||
})
|
||||
.OrderByDescending(g => g.Severity)
|
||||
.ThenByDescending(g => g.Occurrences)
|
||||
.ToImmutableArray();
|
||||
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Gap Analysis",
|
||||
Type = ReportSectionType.GapAnalysis,
|
||||
Content = $"Analysis of {gaps.Length} identified gaps.",
|
||||
Data = gaps
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ReportSection> GenerateEvidencePackageAsync(
|
||||
ReportData data,
|
||||
CancellationToken ct)
|
||||
{
|
||||
if (data.EvidenceChain is null)
|
||||
{
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Evidence Package",
|
||||
Type = ReportSectionType.EvidencePackage,
|
||||
Content = "Evidence chain not included."
|
||||
};
|
||||
}
|
||||
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Evidence Package",
|
||||
Type = ReportSectionType.EvidencePackage,
|
||||
Content = $"Complete evidence chain with {data.EvidenceChain.Nodes.Length} nodes.",
|
||||
Data = data.EvidenceChain
|
||||
};
|
||||
}
|
||||
|
||||
private ReportSection GenerateAuditTrail(ReportData data)
|
||||
{
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Audit Trail",
|
||||
Type = ReportSectionType.AuditTrail,
|
||||
Content = $"Audit trail containing {data.AuditEvents.Count} events.",
|
||||
Data = data.AuditEvents
|
||||
};
|
||||
}
|
||||
|
||||
private ReportSection GenerateRecommendations(ReportData data)
|
||||
{
|
||||
var recommendations = new List<Recommendation>();
|
||||
|
||||
// Generate recommendations based on gaps
|
||||
var criticalGaps = data.Evaluations
|
||||
.SelectMany(e => e.Gaps)
|
||||
.Where(g => g.Severity == GapSeverity.Critical)
|
||||
.ToList();
|
||||
|
||||
if (criticalGaps.Count > 0)
|
||||
{
|
||||
recommendations.Add(new Recommendation
|
||||
{
|
||||
Priority = RecommendationPriority.Critical,
|
||||
Title = "Address Critical Gaps",
|
||||
Description = $"Address {criticalGaps.Count} critical compliance gaps immediately.",
|
||||
AffectedControls = criticalGaps.Select(g => g.ControlId).Distinct().ToImmutableArray()
|
||||
});
|
||||
}
|
||||
|
||||
return new ReportSection
|
||||
{
|
||||
Title = "Recommendations",
|
||||
Type = ReportSectionType.Recommendations,
|
||||
Content = $"{recommendations.Count} recommendations generated.",
|
||||
Data = recommendations.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
|
||||
private ReportSummary GenerateSummary(ReportData data, ImmutableArray<ReportSection> sections)
|
||||
{
|
||||
return new ReportSummary
|
||||
{
|
||||
TotalReleases = data.Releases.Count,
|
||||
FrameworksCovered = data.Frameworks.Length,
|
||||
OverallComplianceRate = data.Evaluations.Count > 0
|
||||
? data.Evaluations.Average(e => e.Score)
|
||||
: 0,
|
||||
CriticalGaps = data.Evaluations
|
||||
.SelectMany(e => e.Gaps)
|
||||
.Count(g => g.Severity == GapSeverity.Critical),
|
||||
TotalControls = data.Evaluations
|
||||
.SelectMany(e => e.ControlResults)
|
||||
.Count()
|
||||
};
|
||||
}
|
||||
|
||||
private IReportExporter GetExporter(ExportFormat format)
|
||||
{
|
||||
return format switch
|
||||
{
|
||||
ExportFormat.Pdf => new PdfReportExporter(),
|
||||
ExportFormat.Html => new HtmlReportExporter(),
|
||||
ExportFormat.Json => new JsonReportExporter(),
|
||||
ExportFormat.Csv => new CsvReportExporter(),
|
||||
_ => throw new ArgumentException($"Unsupported format: {format}")
|
||||
};
|
||||
}
|
||||
|
||||
private static string GetContentType(ExportFormat format)
|
||||
{
|
||||
return format switch
|
||||
{
|
||||
ExportFormat.Pdf => "application/pdf",
|
||||
ExportFormat.Html => "text/html",
|
||||
ExportFormat.Json => "application/json",
|
||||
ExportFormat.Csv => "text/csv",
|
||||
_ => "application/octet-stream"
|
||||
};
|
||||
}
|
||||
|
||||
private static string GenerateFileName(ComplianceReport report, ExportFormat format)
|
||||
{
|
||||
var extension = format.ToString().ToLowerInvariant();
|
||||
return $"compliance-report-{report.Id:N}.{extension}";
|
||||
}
|
||||
|
||||
private DateTimeOffset CalculateNextRun(ReportSchedule schedule)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
return schedule.Frequency switch
|
||||
{
|
||||
ScheduleFrequency.Daily => now.AddDays(1).Date.Add(schedule.RunTime),
|
||||
ScheduleFrequency.Weekly => now.AddDays(7 - (int)now.DayOfWeek + (int)schedule.DayOfWeek!.Value).Date.Add(schedule.RunTime),
|
||||
ScheduleFrequency.Monthly => new DateTimeOffset(now.Year, now.Month, 1, 0, 0, 0, now.Offset).AddMonths(1).Add(schedule.RunTime),
|
||||
_ => now.AddDays(1)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for report generator.
|
||||
/// </summary>
|
||||
public sealed record ReportGeneratorConfig
|
||||
{
|
||||
public string OutputDirectory { get; init; } = "./reports";
|
||||
public ExportFormat DefaultFormat { get; init; } = ExportFormat.Pdf;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to generate a report.
|
||||
/// </summary>
|
||||
public sealed record ReportRequest
|
||||
{
|
||||
public required ReportType ReportType { get; init; }
|
||||
public required ReportScope Scope { get; init; }
|
||||
public ImmutableArray<ComplianceFramework> Frameworks { get; init; } = [];
|
||||
public Guid? ReleaseId { get; init; }
|
||||
public bool IncludeEvidenceChain { get; init; }
|
||||
public string? RequestedBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report scope.
|
||||
/// </summary>
|
||||
public sealed record ReportScope
|
||||
{
|
||||
public ImmutableArray<Guid> ReleaseIds { get; init; } = [];
|
||||
public ImmutableArray<string> Environments { get; init; } = [];
|
||||
public DateTimeOffset? StartDate { get; init; }
|
||||
public DateTimeOffset? EndDate { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report types.
|
||||
/// </summary>
|
||||
public enum ReportType
|
||||
{
|
||||
ExecutiveSummary,
|
||||
DetailedCompliance,
|
||||
GapAnalysis,
|
||||
AuditReadiness,
|
||||
EvidencePackage
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A compliance report.
|
||||
/// </summary>
|
||||
public sealed record ComplianceReport
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required ReportType ReportType { get; init; }
|
||||
public required string Title { get; init; }
|
||||
public required DateTimeOffset GeneratedAt { get; init; }
|
||||
public required string GeneratedBy { get; init; }
|
||||
public required ReportScope Scope { get; init; }
|
||||
public required ImmutableArray<ComplianceFramework> Frameworks { get; init; }
|
||||
public required ImmutableArray<ReportSection> Sections { get; init; }
|
||||
public required ReportSummary Summary { get; init; }
|
||||
public required ReportMetadata Metadata { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A report section.
|
||||
/// </summary>
|
||||
public sealed record ReportSection
|
||||
{
|
||||
public required string Title { get; init; }
|
||||
public ReportSectionType Type { get; init; }
|
||||
public int Order { get; init; }
|
||||
public required string Content { get; init; }
|
||||
public object? Data { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report section types.
|
||||
/// </summary>
|
||||
public enum ReportSectionType
|
||||
{
|
||||
ExecutiveSummary,
|
||||
ComplianceOverview,
|
||||
ControlDetails,
|
||||
GapAnalysis,
|
||||
EvidencePackage,
|
||||
AuditTrail,
|
||||
Recommendations
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report summary.
|
||||
/// </summary>
|
||||
public sealed record ReportSummary
|
||||
{
|
||||
public required int TotalReleases { get; init; }
|
||||
public required int FrameworksCovered { get; init; }
|
||||
public required double OverallComplianceRate { get; init; }
|
||||
public required int CriticalGaps { get; init; }
|
||||
public required int TotalControls { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report metadata.
|
||||
/// </summary>
|
||||
public sealed record ReportMetadata
|
||||
{
|
||||
public required TimeSpan GenerationDuration { get; init; }
|
||||
public required string TemplateVersion { get; init; }
|
||||
public required bool IncludesEvidenceChain { get; init; }
|
||||
public required DateTimeOffset DataCutoffTime { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Export formats.
|
||||
/// </summary>
|
||||
public enum ExportFormat
|
||||
{
|
||||
Pdf,
|
||||
Html,
|
||||
Json,
|
||||
Csv
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Export result.
|
||||
/// </summary>
|
||||
public sealed record ExportResult
|
||||
{
|
||||
public required Guid ReportId { get; init; }
|
||||
public required ExportFormat Format { get; init; }
|
||||
public required byte[] Content { get; init; }
|
||||
public required string ContentType { get; init; }
|
||||
public required string FileName { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report schedule.
|
||||
/// </summary>
|
||||
public sealed record ReportSchedule
|
||||
{
|
||||
public required ReportType ReportType { get; init; }
|
||||
public required ScheduleFrequency Frequency { get; init; }
|
||||
public required TimeSpan RunTime { get; init; }
|
||||
public DayOfWeek? DayOfWeek { get; init; }
|
||||
public required ImmutableArray<string> Recipients { get; init; }
|
||||
public ImmutableArray<ComplianceFramework> Frameworks { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Schedule frequency.
|
||||
/// </summary>
|
||||
public enum ScheduleFrequency
|
||||
{
|
||||
Daily,
|
||||
Weekly,
|
||||
Monthly
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Schedule result.
|
||||
/// </summary>
|
||||
public sealed record ScheduleResult
|
||||
{
|
||||
public required bool Success { get; init; }
|
||||
public Guid? ScheduleId { get; init; }
|
||||
public DateTimeOffset? NextRunAt { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report data.
|
||||
/// </summary>
|
||||
internal sealed class ReportData
|
||||
{
|
||||
public ReportScope Scope { get; init; } = new();
|
||||
public ImmutableArray<ComplianceFramework> Frameworks { get; init; } = [];
|
||||
public IReadOnlyList<ReleaseInfo> Releases { get; set; } = [];
|
||||
public IReadOnlyList<EvaluationRecord> Evaluations { get; set; } = [];
|
||||
public IReadOnlyList<AuditEvent> AuditEvents { get; set; } = [];
|
||||
public EvidenceChain? EvidenceChain { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Release info.
|
||||
/// </summary>
|
||||
public sealed record ReleaseInfo
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluation record.
|
||||
/// </summary>
|
||||
public sealed record EvaluationRecord
|
||||
{
|
||||
public required Guid EvaluationId { get; init; }
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public required OverallComplianceStatus Status { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
public ImmutableArray<ComplianceGap> Gaps { get; init; } = [];
|
||||
public ImmutableArray<ControlEvaluationResult> ControlResults { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Audit event.
|
||||
/// </summary>
|
||||
public sealed record AuditEvent
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string Action { get; init; }
|
||||
public required string Actor { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public string? Details { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evidence chain.
|
||||
/// </summary>
|
||||
public sealed record EvidenceChain
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required ImmutableArray<EvidenceNode> Nodes { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evidence node.
|
||||
/// </summary>
|
||||
public sealed record EvidenceNode
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string Type { get; init; }
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
public ImmutableArray<string> ParentIds { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Report template.
|
||||
/// </summary>
|
||||
public sealed record ReportTemplate
|
||||
{
|
||||
public required string Title { get; init; }
|
||||
public required string Version { get; init; }
|
||||
public required ImmutableArray<SectionDefinition> Sections { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Section definition.
|
||||
/// </summary>
|
||||
public sealed record SectionDefinition
|
||||
{
|
||||
public required string Title { get; init; }
|
||||
public required ReportSectionType Type { get; init; }
|
||||
public required int Order { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Executive summary data.
|
||||
/// </summary>
|
||||
public sealed record ExecutiveSummaryData
|
||||
{
|
||||
public required int TotalReleases { get; init; }
|
||||
public required int CompliantReleases { get; init; }
|
||||
public required double ComplianceRate { get; init; }
|
||||
public required ImmutableArray<ComplianceFramework> Frameworks { get; init; }
|
||||
public required ReportScope Period { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Framework overview.
|
||||
/// </summary>
|
||||
public sealed record FrameworkOverview
|
||||
{
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
public required double AverageScore { get; init; }
|
||||
public required double PassRate { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gap summary.
|
||||
/// </summary>
|
||||
public sealed record GapSummary
|
||||
{
|
||||
public required string ControlId { get; init; }
|
||||
public required string ControlName { get; init; }
|
||||
public required int Occurrences { get; init; }
|
||||
public required GapSeverity Severity { get; init; }
|
||||
public required ImmutableArray<ComplianceFramework> Frameworks { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Recommendation.
|
||||
/// </summary>
|
||||
public sealed record Recommendation
|
||||
{
|
||||
public required RecommendationPriority Priority { get; init; }
|
||||
public required string Title { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public ImmutableArray<string> AffectedControls { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Recommendation priority.
|
||||
/// </summary>
|
||||
public enum RecommendationPriority
|
||||
{
|
||||
Low,
|
||||
Medium,
|
||||
High,
|
||||
Critical
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Control detail.
|
||||
/// </summary>
|
||||
public sealed record ControlDetail
|
||||
{
|
||||
public required string ControlId { get; init; }
|
||||
public required string ControlName { get; init; }
|
||||
public required ControlStatus Status { get; init; }
|
||||
public required ComplianceFramework Framework { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for report template provider.
|
||||
/// </summary>
|
||||
public interface IReportTemplateProvider
|
||||
{
|
||||
ReportTemplate GetTemplate(ReportType reportType);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for evidence chain builder.
|
||||
/// </summary>
|
||||
public interface IEvidenceChainBuilder
|
||||
{
|
||||
Task<EvidenceChain> BuildAsync(Guid? releaseId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for audit query engine.
|
||||
/// </summary>
|
||||
public interface IAuditQueryEngine
|
||||
{
|
||||
Task<IReadOnlyList<ReleaseInfo>> GetReleasesAsync(
|
||||
ImmutableArray<Guid> releaseIds,
|
||||
CancellationToken ct = default);
|
||||
Task<IReadOnlyList<ReleaseInfo>> GetReleasesInRangeAsync(
|
||||
DateTimeOffset start,
|
||||
DateTimeOffset end,
|
||||
CancellationToken ct = default);
|
||||
Task<IReadOnlyList<EvaluationRecord>> GetEvaluationsAsync(
|
||||
ImmutableArray<Guid> releaseIds,
|
||||
ImmutableArray<ComplianceFramework> frameworks,
|
||||
CancellationToken ct = default);
|
||||
Task<IReadOnlyList<AuditEvent>> GetAuditEventsAsync(
|
||||
ReportScope scope,
|
||||
CancellationToken ct = default);
|
||||
Task<IReadOnlyList<ControlDetail>> GetControlDetailsAsync(
|
||||
ImmutableArray<Guid> evaluationIds,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for report exporter.
|
||||
/// </summary>
|
||||
public interface IReportExporter
|
||||
{
|
||||
Task<byte[]> ExportAsync(ComplianceReport report, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// PDF report exporter (stub).
|
||||
/// </summary>
|
||||
internal sealed class PdfReportExporter : IReportExporter
|
||||
{
|
||||
public Task<byte[]> ExportAsync(ComplianceReport report, CancellationToken ct = default)
|
||||
{
|
||||
// Placeholder - would use a PDF library
|
||||
return Task.FromResult(Array.Empty<byte>());
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// HTML report exporter (stub).
|
||||
/// </summary>
|
||||
internal sealed class HtmlReportExporter : IReportExporter
|
||||
{
|
||||
public Task<byte[]> ExportAsync(ComplianceReport report, CancellationToken ct = default)
|
||||
{
|
||||
var html = $"<html><body><h1>{report.Title}</h1></body></html>";
|
||||
return Task.FromResult(System.Text.Encoding.UTF8.GetBytes(html));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// JSON report exporter (stub).
|
||||
/// </summary>
|
||||
internal sealed class JsonReportExporter : IReportExporter
|
||||
{
|
||||
public Task<byte[]> ExportAsync(ComplianceReport report, CancellationToken ct = default)
|
||||
{
|
||||
var json = System.Text.Json.JsonSerializer.Serialize(report);
|
||||
return Task.FromResult(System.Text.Encoding.UTF8.GetBytes(json));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// CSV report exporter (stub).
|
||||
/// </summary>
|
||||
internal sealed class CsvReportExporter : IReportExporter
|
||||
{
|
||||
public Task<byte[]> ExportAsync(ComplianceReport report, CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(Array.Empty<byte>());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,512 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ScheduledReportService.cs
|
||||
// Sprint: SPRINT_20260117_039_ReleaseOrchestrator_compliance
|
||||
// Task: TASK-039-08 - Scheduled report generation and delivery
|
||||
// Description: Service for scheduling and delivering compliance reports
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Cronos;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Compliance;
|
||||
|
||||
/// <summary>
|
||||
/// Manages scheduled report generation and delivery.
|
||||
/// </summary>
|
||||
public sealed class ScheduledReportService : IScheduledReportService, IDisposable
|
||||
{
|
||||
private readonly IReportGenerator _reportGenerator;
|
||||
private readonly IReportDeliveryService _deliveryService;
|
||||
private readonly IScheduledReportRepository _repository;
|
||||
private readonly ScheduledReportConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<ScheduledReportService> _logger;
|
||||
|
||||
private readonly ConcurrentDictionary<string, ScheduledReportState> _schedules = new();
|
||||
private readonly CancellationTokenSource _cts = new();
|
||||
private readonly Task _schedulerTask;
|
||||
|
||||
public ScheduledReportService(
|
||||
IReportGenerator reportGenerator,
|
||||
IReportDeliveryService deliveryService,
|
||||
IScheduledReportRepository repository,
|
||||
ScheduledReportConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<ScheduledReportService> logger)
|
||||
{
|
||||
_reportGenerator = reportGenerator;
|
||||
_deliveryService = deliveryService;
|
||||
_repository = repository;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
|
||||
_schedulerTask = Task.Run(RunSchedulerAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new scheduled report.
|
||||
/// </summary>
|
||||
public async Task<ScheduledReport> CreateAsync(
|
||||
CreateScheduledReportRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
// Validate cron expression
|
||||
var cronExpression = ValidateCronExpression(request.Schedule);
|
||||
|
||||
var schedule = new ScheduledReport
|
||||
{
|
||||
Id = GenerateId(),
|
||||
TemplateId = request.TemplateId,
|
||||
Schedule = request.Schedule,
|
||||
Recipients = request.Recipients,
|
||||
Parameters = request.Parameters ?? ImmutableDictionary<string, string>.Empty,
|
||||
Enabled = true,
|
||||
CreatedAt = _timeProvider.GetUtcNow(),
|
||||
NextRunAt = cronExpression.GetNextOccurrence(_timeProvider.GetUtcNow().UtcDateTime)
|
||||
};
|
||||
|
||||
await _repository.SaveAsync(schedule, ct);
|
||||
|
||||
_schedules[schedule.Id] = new ScheduledReportState
|
||||
{
|
||||
Schedule = schedule,
|
||||
CronExpression = cronExpression
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Created scheduled report {Id} with template {Template}, next run at {NextRun}",
|
||||
schedule.Id, schedule.TemplateId, schedule.NextRunAt);
|
||||
|
||||
return schedule;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a scheduled report by ID.
|
||||
/// </summary>
|
||||
public async Task<ScheduledReport?> GetAsync(string scheduleId, CancellationToken ct = default)
|
||||
{
|
||||
return await _repository.GetAsync(scheduleId, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Lists all scheduled reports.
|
||||
/// </summary>
|
||||
public async Task<ImmutableArray<ScheduledReport>> ListAsync(CancellationToken ct = default)
|
||||
{
|
||||
return await _repository.ListAsync(ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates a scheduled report.
|
||||
/// </summary>
|
||||
public async Task<ScheduledReport?> UpdateAsync(
|
||||
string scheduleId,
|
||||
UpdateScheduledReportRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var existing = await _repository.GetAsync(scheduleId, ct);
|
||||
if (existing is null) return null;
|
||||
|
||||
CronExpression? newCron = null;
|
||||
if (request.Schedule is not null)
|
||||
{
|
||||
newCron = ValidateCronExpression(request.Schedule);
|
||||
}
|
||||
|
||||
var updated = existing with
|
||||
{
|
||||
Schedule = request.Schedule ?? existing.Schedule,
|
||||
Recipients = request.Recipients ?? existing.Recipients,
|
||||
Enabled = request.Enabled ?? existing.Enabled,
|
||||
UpdatedAt = _timeProvider.GetUtcNow(),
|
||||
NextRunAt = newCron?.GetNextOccurrence(_timeProvider.GetUtcNow().UtcDateTime) ?? existing.NextRunAt
|
||||
};
|
||||
|
||||
await _repository.SaveAsync(updated, ct);
|
||||
|
||||
if (_schedules.TryGetValue(scheduleId, out var state))
|
||||
{
|
||||
state.Schedule = updated;
|
||||
if (newCron is not null)
|
||||
{
|
||||
state.CronExpression = newCron;
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Updated scheduled report {Id}", scheduleId);
|
||||
|
||||
return updated;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deletes a scheduled report.
|
||||
/// </summary>
|
||||
public async Task<bool> DeleteAsync(string scheduleId, CancellationToken ct = default)
|
||||
{
|
||||
var deleted = await _repository.DeleteAsync(scheduleId, ct);
|
||||
if (deleted)
|
||||
{
|
||||
_schedules.TryRemove(scheduleId, out _);
|
||||
_logger.LogInformation("Deleted scheduled report {Id}", scheduleId);
|
||||
}
|
||||
return deleted;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Manually triggers a scheduled report.
|
||||
/// </summary>
|
||||
public async Task<ReportExecutionResult> TriggerAsync(
|
||||
string scheduleId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var schedule = await _repository.GetAsync(scheduleId, ct);
|
||||
if (schedule is null)
|
||||
{
|
||||
return new ReportExecutionResult
|
||||
{
|
||||
ScheduleId = scheduleId,
|
||||
Success = false,
|
||||
Error = "Schedule not found"
|
||||
};
|
||||
}
|
||||
|
||||
return await ExecuteScheduledReportAsync(schedule, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets execution history for a scheduled report.
|
||||
/// </summary>
|
||||
public async Task<ImmutableArray<ReportExecution>> GetExecutionHistoryAsync(
|
||||
string scheduleId,
|
||||
int limit = 10,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await _repository.GetExecutionsAsync(scheduleId, limit, ct);
|
||||
}
|
||||
|
||||
private async Task RunSchedulerAsync()
|
||||
{
|
||||
// Load existing schedules
|
||||
await LoadSchedulesAsync();
|
||||
|
||||
while (!_cts.Token.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_config.CheckInterval, _cts.Token);
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
foreach (var (id, state) in _schedules)
|
||||
{
|
||||
if (!state.Schedule.Enabled) continue;
|
||||
if (state.Schedule.NextRunAt is null) continue;
|
||||
if (state.Schedule.NextRunAt > now) continue;
|
||||
|
||||
// Time to execute
|
||||
_ = ExecuteAndRescheduleAsync(id, state);
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in scheduler loop");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task LoadSchedulesAsync()
|
||||
{
|
||||
try
|
||||
{
|
||||
var schedules = await _repository.ListAsync(_cts.Token);
|
||||
foreach (var schedule in schedules)
|
||||
{
|
||||
try
|
||||
{
|
||||
var cronExpression = CronExpression.Parse(schedule.Schedule);
|
||||
_schedules[schedule.Id] = new ScheduledReportState
|
||||
{
|
||||
Schedule = schedule,
|
||||
CronExpression = cronExpression
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to parse cron for schedule {Id}", schedule.Id);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation("Loaded {Count} scheduled reports", _schedules.Count);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to load scheduled reports");
|
||||
}
|
||||
}
|
||||
|
||||
private async Task ExecuteAndRescheduleAsync(string id, ScheduledReportState state)
|
||||
{
|
||||
try
|
||||
{
|
||||
var result = await ExecuteScheduledReportAsync(state.Schedule, _cts.Token);
|
||||
|
||||
// Record execution
|
||||
var execution = new ReportExecution
|
||||
{
|
||||
Id = GenerateId(),
|
||||
ScheduleId = id,
|
||||
ExecutedAt = _timeProvider.GetUtcNow(),
|
||||
Success = result.Success,
|
||||
ReportId = result.ReportId,
|
||||
Error = result.Error,
|
||||
DeliveryResults = result.DeliveryResults
|
||||
};
|
||||
|
||||
await _repository.SaveExecutionAsync(execution, _cts.Token);
|
||||
|
||||
// Schedule next run
|
||||
var nextRun = state.CronExpression.GetNextOccurrence(_timeProvider.GetUtcNow().UtcDateTime);
|
||||
state.Schedule = state.Schedule with
|
||||
{
|
||||
NextRunAt = nextRun,
|
||||
LastRunAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
await _repository.SaveAsync(state.Schedule, _cts.Token);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Executed scheduled report {Id}, success={Success}, next run at {NextRun}",
|
||||
id, result.Success, nextRun);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to execute scheduled report {Id}", id);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<ReportExecutionResult> ExecuteScheduledReportAsync(
|
||||
ScheduledReport schedule,
|
||||
CancellationToken ct)
|
||||
{
|
||||
try
|
||||
{
|
||||
// Generate report
|
||||
var report = await _reportGenerator.GenerateAsync(
|
||||
schedule.TemplateId,
|
||||
schedule.Parameters,
|
||||
ct);
|
||||
|
||||
// Render report
|
||||
var rendered = await _reportGenerator.RenderAsync(report, "pdf", ct);
|
||||
|
||||
// Deliver to recipients
|
||||
var deliveryResults = new List<DeliveryResult>();
|
||||
foreach (var recipient in schedule.Recipients)
|
||||
{
|
||||
try
|
||||
{
|
||||
await _deliveryService.DeliverAsync(
|
||||
recipient,
|
||||
new ReportDeliveryPayload
|
||||
{
|
||||
ReportId = report.Id,
|
||||
ReportName = $"Compliance Report - {_timeProvider.GetUtcNow():yyyy-MM-dd}",
|
||||
Content = rendered.Data,
|
||||
ContentType = rendered.ContentType,
|
||||
FileName = rendered.FileName
|
||||
},
|
||||
ct);
|
||||
|
||||
deliveryResults.Add(new DeliveryResult
|
||||
{
|
||||
Recipient = recipient,
|
||||
Success = true
|
||||
});
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
deliveryResults.Add(new DeliveryResult
|
||||
{
|
||||
Recipient = recipient,
|
||||
Success = false,
|
||||
Error = ex.Message
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return new ReportExecutionResult
|
||||
{
|
||||
ScheduleId = schedule.Id,
|
||||
Success = true,
|
||||
ReportId = report.Id,
|
||||
DeliveryResults = deliveryResults.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
return new ReportExecutionResult
|
||||
{
|
||||
ScheduleId = schedule.Id,
|
||||
Success = false,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private static CronExpression ValidateCronExpression(string expression)
|
||||
{
|
||||
try
|
||||
{
|
||||
return CronExpression.Parse(expression);
|
||||
}
|
||||
catch (CronFormatException ex)
|
||||
{
|
||||
throw new ArgumentException($"Invalid cron expression: {expression}", nameof(expression), ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static string GenerateId() => Guid.NewGuid().ToString("N")[..12];
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_cts.Cancel();
|
||||
_schedulerTask.Wait(TimeSpan.FromSeconds(5));
|
||||
_cts.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IScheduledReportService
|
||||
{
|
||||
Task<ScheduledReport> CreateAsync(CreateScheduledReportRequest request, CancellationToken ct = default);
|
||||
Task<ScheduledReport?> GetAsync(string scheduleId, CancellationToken ct = default);
|
||||
Task<ImmutableArray<ScheduledReport>> ListAsync(CancellationToken ct = default);
|
||||
Task<ScheduledReport?> UpdateAsync(string scheduleId, UpdateScheduledReportRequest request, CancellationToken ct = default);
|
||||
Task<bool> DeleteAsync(string scheduleId, CancellationToken ct = default);
|
||||
Task<ReportExecutionResult> TriggerAsync(string scheduleId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IScheduledReportRepository
|
||||
{
|
||||
Task SaveAsync(ScheduledReport schedule, CancellationToken ct = default);
|
||||
Task<ScheduledReport?> GetAsync(string scheduleId, CancellationToken ct = default);
|
||||
Task<ImmutableArray<ScheduledReport>> ListAsync(CancellationToken ct = default);
|
||||
Task<bool> DeleteAsync(string scheduleId, CancellationToken ct = default);
|
||||
Task SaveExecutionAsync(ReportExecution execution, CancellationToken ct = default);
|
||||
Task<ImmutableArray<ReportExecution>> GetExecutionsAsync(string scheduleId, int limit, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IReportDeliveryService
|
||||
{
|
||||
Task DeliverAsync(string recipient, ReportDeliveryPayload payload, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IReportGenerator
|
||||
{
|
||||
Task<GeneratedReport> GenerateAsync(string templateId, ImmutableDictionary<string, string>? parameters, CancellationToken ct = default);
|
||||
Task<RenderedReport> RenderAsync(GeneratedReport report, string format, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record ScheduledReportConfig
|
||||
{
|
||||
public TimeSpan CheckInterval { get; init; } = TimeSpan.FromMinutes(1);
|
||||
public int MaxConcurrentExecutions { get; init; } = 5;
|
||||
}
|
||||
|
||||
public sealed record ScheduledReport
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string TemplateId { get; init; }
|
||||
public required string Schedule { get; init; }
|
||||
public required ImmutableArray<string> Recipients { get; init; }
|
||||
public required ImmutableDictionary<string, string> Parameters { get; init; }
|
||||
public required bool Enabled { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset? UpdatedAt { get; init; }
|
||||
public DateTimeOffset? LastRunAt { get; init; }
|
||||
public DateTime? NextRunAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record CreateScheduledReportRequest
|
||||
{
|
||||
public required string TemplateId { get; init; }
|
||||
public required string Schedule { get; init; }
|
||||
public required ImmutableArray<string> Recipients { get; init; }
|
||||
public ImmutableDictionary<string, string>? Parameters { get; init; }
|
||||
}
|
||||
|
||||
public sealed record UpdateScheduledReportRequest
|
||||
{
|
||||
public string? Schedule { get; init; }
|
||||
public ImmutableArray<string>? Recipients { get; init; }
|
||||
public bool? Enabled { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReportExecution
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string ScheduleId { get; init; }
|
||||
public required DateTimeOffset ExecutedAt { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public string? ReportId { get; init; }
|
||||
public string? Error { get; init; }
|
||||
public ImmutableArray<DeliveryResult>? DeliveryResults { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReportExecutionResult
|
||||
{
|
||||
public required string ScheduleId { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public string? ReportId { get; init; }
|
||||
public string? Error { get; init; }
|
||||
public ImmutableArray<DeliveryResult>? DeliveryResults { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeliveryResult
|
||||
{
|
||||
public required string Recipient { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReportDeliveryPayload
|
||||
{
|
||||
public required string ReportId { get; init; }
|
||||
public required string ReportName { get; init; }
|
||||
public required byte[] Content { get; init; }
|
||||
public required string ContentType { get; init; }
|
||||
public required string FileName { get; init; }
|
||||
}
|
||||
|
||||
public sealed record GeneratedReport
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required string TemplateId { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RenderedReport
|
||||
{
|
||||
public required byte[] Data { get; init; }
|
||||
public required string ContentType { get; init; }
|
||||
public required string FileName { get; init; }
|
||||
}
|
||||
|
||||
internal sealed class ScheduledReportState
|
||||
{
|
||||
public required ScheduledReport Schedule { get; set; }
|
||||
public required CronExpression CronExpression { get; set; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,17 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<RootNamespace>StellaOps.ReleaseOrchestrator.Compliance</RootNamespace>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Hosting.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -0,0 +1,419 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ConnectionPool.cs
|
||||
// Sprint: SPRINT_20260117_038_ReleaseOrchestrator_performance
|
||||
// Task: TASK-038-08 - Optimized connection pool with warmup
|
||||
// Description: High-performance connection pool with health monitoring
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics;
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Core.Performance;
|
||||
|
||||
/// <summary>
|
||||
/// Optimized connection pool with warmup, health monitoring, and adaptive sizing.
|
||||
/// </summary>
|
||||
/// <typeparam name="TConnection">The connection type.</typeparam>
|
||||
public sealed class ConnectionPool<TConnection> : IConnectionPool<TConnection>, IDisposable
|
||||
where TConnection : class
|
||||
{
|
||||
private readonly IConnectionFactory<TConnection> _factory;
|
||||
private readonly ConnectionPoolConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<ConnectionPool<TConnection>> _logger;
|
||||
|
||||
private readonly Channel<PooledConnection<TConnection>> _availableConnections;
|
||||
private readonly ConcurrentDictionary<string, PooledConnection<TConnection>> _allConnections = new();
|
||||
private readonly SemaphoreSlim _createSemaphore;
|
||||
private readonly CancellationTokenSource _cts = new();
|
||||
private readonly Task _maintenanceTask;
|
||||
|
||||
private int _currentSize;
|
||||
private int _activeCount;
|
||||
private long _totalAcquisitions;
|
||||
private long _totalTimeouts;
|
||||
private double _averageWaitTimeMs;
|
||||
|
||||
public ConnectionPool(
|
||||
IConnectionFactory<TConnection> factory,
|
||||
ConnectionPoolConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<ConnectionPool<TConnection>> logger)
|
||||
{
|
||||
_factory = factory;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
|
||||
_availableConnections = Channel.CreateBounded<PooledConnection<TConnection>>(
|
||||
new BoundedChannelOptions(config.MaxPoolSize)
|
||||
{
|
||||
FullMode = BoundedChannelFullMode.Wait
|
||||
});
|
||||
|
||||
_createSemaphore = new SemaphoreSlim(config.MaxPoolSize, config.MaxPoolSize);
|
||||
_maintenanceTask = Task.Run(MaintenanceLoopAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warms up the pool by pre-creating connections.
|
||||
/// </summary>
|
||||
public async Task WarmupAsync(CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogInformation("Warming up connection pool to {MinSize} connections", _config.MinPoolSize);
|
||||
|
||||
var warmupTasks = Enumerable.Range(0, _config.MinPoolSize)
|
||||
.Select(_ => CreateAndAddConnectionAsync(ct));
|
||||
|
||||
await Task.WhenAll(warmupTasks);
|
||||
|
||||
_logger.LogInformation("Connection pool warmed up with {Size} connections", _currentSize);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Acquires a connection from the pool.
|
||||
/// </summary>
|
||||
public async Task<PooledConnectionLease<TConnection>> AcquireAsync(CancellationToken ct = default)
|
||||
{
|
||||
var sw = Stopwatch.StartNew();
|
||||
Interlocked.Increment(ref _totalAcquisitions);
|
||||
|
||||
try
|
||||
{
|
||||
// Try to get an existing connection
|
||||
using var timeoutCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
|
||||
timeoutCts.CancelAfter(_config.AcquireTimeout);
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (_availableConnections.Reader.TryRead(out var connection))
|
||||
{
|
||||
// Validate connection health
|
||||
if (await IsConnectionHealthyAsync(connection))
|
||||
{
|
||||
connection.LastUsedAt = _timeProvider.GetUtcNow();
|
||||
connection.UseCount++;
|
||||
Interlocked.Increment(ref _activeCount);
|
||||
UpdateAverageWaitTime(sw.Elapsed.TotalMilliseconds);
|
||||
|
||||
return new PooledConnectionLease<TConnection>(connection, ReleaseConnection);
|
||||
}
|
||||
|
||||
// Connection is unhealthy, dispose it
|
||||
await DisposeConnectionAsync(connection);
|
||||
}
|
||||
|
||||
// Try to create a new connection if under max
|
||||
if (_currentSize < _config.MaxPoolSize && _createSemaphore.Wait(0))
|
||||
{
|
||||
try
|
||||
{
|
||||
var newConn = await CreateConnectionAsync(ct);
|
||||
newConn.LastUsedAt = _timeProvider.GetUtcNow();
|
||||
newConn.UseCount++;
|
||||
Interlocked.Increment(ref _activeCount);
|
||||
UpdateAverageWaitTime(sw.Elapsed.TotalMilliseconds);
|
||||
|
||||
return new PooledConnectionLease<TConnection>(newConn, ReleaseConnection);
|
||||
}
|
||||
catch
|
||||
{
|
||||
_createSemaphore.Release();
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for an available connection
|
||||
try
|
||||
{
|
||||
connection = await _availableConnections.Reader.ReadAsync(timeoutCts.Token);
|
||||
if (await IsConnectionHealthyAsync(connection))
|
||||
{
|
||||
connection.LastUsedAt = _timeProvider.GetUtcNow();
|
||||
connection.UseCount++;
|
||||
Interlocked.Increment(ref _activeCount);
|
||||
UpdateAverageWaitTime(sw.Elapsed.TotalMilliseconds);
|
||||
|
||||
return new PooledConnectionLease<TConnection>(connection, ReleaseConnection);
|
||||
}
|
||||
|
||||
await DisposeConnectionAsync(connection);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
Interlocked.Increment(ref _totalTimeouts);
|
||||
throw new TimeoutException($"Timeout acquiring connection after {_config.AcquireTimeout.TotalSeconds}s");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to acquire connection from pool");
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets pool statistics.
|
||||
/// </summary>
|
||||
public ConnectionPoolStatistics GetStatistics()
|
||||
{
|
||||
return new ConnectionPoolStatistics
|
||||
{
|
||||
TotalConnections = _currentSize,
|
||||
ActiveConnections = _activeCount,
|
||||
AvailableConnections = _currentSize - _activeCount,
|
||||
TotalAcquisitions = _totalAcquisitions,
|
||||
TotalTimeouts = _totalTimeouts,
|
||||
AverageWaitTimeMs = _averageWaitTimeMs,
|
||||
MinPoolSize = _config.MinPoolSize,
|
||||
MaxPoolSize = _config.MaxPoolSize
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<PooledConnection<TConnection>> CreateConnectionAsync(CancellationToken ct)
|
||||
{
|
||||
var connection = await _factory.CreateAsync(ct);
|
||||
var id = Guid.NewGuid().ToString("N");
|
||||
|
||||
var pooled = new PooledConnection<TConnection>
|
||||
{
|
||||
Id = id,
|
||||
Connection = connection,
|
||||
CreatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
_allConnections[id] = pooled;
|
||||
Interlocked.Increment(ref _currentSize);
|
||||
|
||||
_logger.LogDebug("Created new connection {Id}, pool size: {Size}", id, _currentSize);
|
||||
|
||||
return pooled;
|
||||
}
|
||||
|
||||
private async Task CreateAndAddConnectionAsync(CancellationToken ct)
|
||||
{
|
||||
if (!_createSemaphore.Wait(0)) return;
|
||||
|
||||
try
|
||||
{
|
||||
var connection = await CreateConnectionAsync(ct);
|
||||
await _availableConnections.Writer.WriteAsync(connection, ct);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to create connection during warmup");
|
||||
}
|
||||
finally
|
||||
{
|
||||
_createSemaphore.Release();
|
||||
}
|
||||
}
|
||||
|
||||
private void ReleaseConnection(PooledConnection<TConnection> connection)
|
||||
{
|
||||
Interlocked.Decrement(ref _activeCount);
|
||||
|
||||
// Check if connection should be disposed
|
||||
if (connection.UseCount >= _config.MaxConnectionUses ||
|
||||
(_timeProvider.GetUtcNow() - connection.CreatedAt) > _config.MaxConnectionAge)
|
||||
{
|
||||
_ = DisposeConnectionAsync(connection);
|
||||
return;
|
||||
}
|
||||
|
||||
// Return to pool
|
||||
if (!_availableConnections.Writer.TryWrite(connection))
|
||||
{
|
||||
_ = DisposeConnectionAsync(connection);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<bool> IsConnectionHealthyAsync(PooledConnection<TConnection> connection)
|
||||
{
|
||||
try
|
||||
{
|
||||
return await _factory.ValidateAsync(connection.Connection, _cts.Token);
|
||||
}
|
||||
catch
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private async Task DisposeConnectionAsync(PooledConnection<TConnection> connection)
|
||||
{
|
||||
if (_allConnections.TryRemove(connection.Id, out _))
|
||||
{
|
||||
Interlocked.Decrement(ref _currentSize);
|
||||
|
||||
try
|
||||
{
|
||||
await _factory.DisposeAsync(connection.Connection);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Error disposing connection {Id}", connection.Id);
|
||||
}
|
||||
|
||||
_createSemaphore.Release();
|
||||
_logger.LogDebug("Disposed connection {Id}, pool size: {Size}", connection.Id, _currentSize);
|
||||
}
|
||||
}
|
||||
|
||||
private void UpdateAverageWaitTime(double waitTimeMs)
|
||||
{
|
||||
// Exponential moving average
|
||||
_averageWaitTimeMs = _averageWaitTimeMs * 0.9 + waitTimeMs * 0.1;
|
||||
}
|
||||
|
||||
private async Task MaintenanceLoopAsync()
|
||||
{
|
||||
while (!_cts.Token.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await Task.Delay(_config.MaintenanceInterval, _cts.Token);
|
||||
|
||||
// Ensure minimum pool size
|
||||
while (_currentSize < _config.MinPoolSize)
|
||||
{
|
||||
await CreateAndAddConnectionAsync(_cts.Token);
|
||||
}
|
||||
|
||||
// Remove idle connections above minimum
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var idleConnections = new List<PooledConnection<TConnection>>();
|
||||
|
||||
// Check for idle connections to remove
|
||||
while (_availableConnections.Reader.TryRead(out var conn))
|
||||
{
|
||||
if (_currentSize > _config.MinPoolSize &&
|
||||
(now - conn.LastUsedAt) > _config.IdleTimeout)
|
||||
{
|
||||
idleConnections.Add(conn);
|
||||
}
|
||||
else
|
||||
{
|
||||
await _availableConnections.Writer.WriteAsync(conn, _cts.Token);
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var conn in idleConnections)
|
||||
{
|
||||
await DisposeConnectionAsync(conn);
|
||||
}
|
||||
|
||||
if (idleConnections.Count > 0)
|
||||
{
|
||||
_logger.LogDebug("Removed {Count} idle connections", idleConnections.Count);
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Error in connection pool maintenance");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_cts.Cancel();
|
||||
_maintenanceTask.Wait(TimeSpan.FromSeconds(5));
|
||||
|
||||
foreach (var conn in _allConnections.Values)
|
||||
{
|
||||
_ = _factory.DisposeAsync(conn.Connection);
|
||||
}
|
||||
|
||||
_allConnections.Clear();
|
||||
_createSemaphore.Dispose();
|
||||
_cts.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IConnectionPool<TConnection>
|
||||
where TConnection : class
|
||||
{
|
||||
Task WarmupAsync(CancellationToken ct = default);
|
||||
Task<PooledConnectionLease<TConnection>> AcquireAsync(CancellationToken ct = default);
|
||||
ConnectionPoolStatistics GetStatistics();
|
||||
}
|
||||
|
||||
public interface IConnectionFactory<TConnection>
|
||||
{
|
||||
Task<TConnection> CreateAsync(CancellationToken ct = default);
|
||||
Task<bool> ValidateAsync(TConnection connection, CancellationToken ct = default);
|
||||
Task DisposeAsync(TConnection connection);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record ConnectionPoolConfig
|
||||
{
|
||||
public int MinPoolSize { get; init; } = 5;
|
||||
public int MaxPoolSize { get; init; } = 50;
|
||||
public TimeSpan AcquireTimeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public TimeSpan IdleTimeout { get; init; } = TimeSpan.FromMinutes(5);
|
||||
public TimeSpan MaxConnectionAge { get; init; } = TimeSpan.FromHours(1);
|
||||
public int MaxConnectionUses { get; init; } = 10000;
|
||||
public TimeSpan MaintenanceInterval { get; init; } = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
|
||||
public sealed class PooledConnection<TConnection>
|
||||
{
|
||||
public required string Id { get; init; }
|
||||
public required TConnection Connection { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset LastUsedAt { get; set; }
|
||||
public int UseCount { get; set; }
|
||||
}
|
||||
|
||||
public sealed record ConnectionPoolStatistics
|
||||
{
|
||||
public required int TotalConnections { get; init; }
|
||||
public required int ActiveConnections { get; init; }
|
||||
public required int AvailableConnections { get; init; }
|
||||
public required long TotalAcquisitions { get; init; }
|
||||
public required long TotalTimeouts { get; init; }
|
||||
public required double AverageWaitTimeMs { get; init; }
|
||||
public required int MinPoolSize { get; init; }
|
||||
public required int MaxPoolSize { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// RAII-style lease that returns connection to pool on disposal.
|
||||
/// </summary>
|
||||
public readonly struct PooledConnectionLease<TConnection> : IDisposable
|
||||
where TConnection : class
|
||||
{
|
||||
private readonly PooledConnection<TConnection> _pooledConnection;
|
||||
private readonly Action<PooledConnection<TConnection>> _releaseAction;
|
||||
|
||||
public TConnection Connection => _pooledConnection.Connection;
|
||||
|
||||
public PooledConnectionLease(
|
||||
PooledConnection<TConnection> pooledConnection,
|
||||
Action<PooledConnection<TConnection>> releaseAction)
|
||||
{
|
||||
_pooledConnection = pooledConnection;
|
||||
_releaseAction = releaseAction;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_releaseAction(_pooledConnection);
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,351 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// PerformanceBaseline.cs
|
||||
// Sprint: SPRINT_20260117_038_ReleaseOrchestrator_performance
|
||||
// Task: TASK-038-01 - Establish performance baselines and metrics
|
||||
// Description: Instrumentation and baseline measurement for performance tracking
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Core.Performance;
|
||||
|
||||
/// <summary>
|
||||
/// Performance baseline measurement and tracking infrastructure.
|
||||
/// </summary>
|
||||
public sealed class PerformanceBaseline : IPerformanceBaseline
|
||||
{
|
||||
private static readonly Meter s_meter = new("StellaOps.ReleaseOrchestrator.Performance", "1.0.0");
|
||||
|
||||
private readonly ConcurrentDictionary<string, BaselineMetrics> _baselines = new();
|
||||
private readonly ConcurrentDictionary<string, List<double>> _measurements = new();
|
||||
private readonly PerformanceBaselineConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<PerformanceBaseline> _logger;
|
||||
|
||||
// Metrics
|
||||
private readonly Counter<long> _operationCounter;
|
||||
private readonly Histogram<double> _operationDuration;
|
||||
private readonly ObservableGauge<double> _baselineP50;
|
||||
private readonly ObservableGauge<double> _baselineP99;
|
||||
|
||||
public PerformanceBaseline(
|
||||
PerformanceBaselineConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<PerformanceBaseline> logger)
|
||||
{
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
|
||||
_operationCounter = s_meter.CreateCounter<long>(
|
||||
"stella.operation.count",
|
||||
description: "Number of operations executed");
|
||||
|
||||
_operationDuration = s_meter.CreateHistogram<double>(
|
||||
"stella.operation.duration_ms",
|
||||
unit: "ms",
|
||||
description: "Duration of operations in milliseconds");
|
||||
|
||||
_baselineP50 = s_meter.CreateObservableGauge(
|
||||
"stella.baseline.p50_ms",
|
||||
() => GetBaselineObservations("p50"),
|
||||
unit: "ms",
|
||||
description: "P50 baseline values");
|
||||
|
||||
_baselineP99 = s_meter.CreateObservableGauge(
|
||||
"stella.baseline.p99_ms",
|
||||
() => GetBaselineObservations("p99"),
|
||||
unit: "ms",
|
||||
description: "P99 baseline values");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts measuring an operation.
|
||||
/// </summary>
|
||||
public OperationMeasurement StartMeasurement(string operationName)
|
||||
{
|
||||
return new OperationMeasurement(this, operationName, Stopwatch.StartNew());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Records a measurement for an operation.
|
||||
/// </summary>
|
||||
public void RecordMeasurement(string operationName, double durationMs, bool success = true)
|
||||
{
|
||||
_operationCounter.Add(1, new KeyValuePair<string, object?>("operation", operationName),
|
||||
new KeyValuePair<string, object?>("success", success));
|
||||
|
||||
_operationDuration.Record(durationMs,
|
||||
new KeyValuePair<string, object?>("operation", operationName));
|
||||
|
||||
var measurements = _measurements.GetOrAdd(operationName, _ => []);
|
||||
lock (measurements)
|
||||
{
|
||||
measurements.Add(durationMs);
|
||||
|
||||
// Keep only recent measurements
|
||||
if (measurements.Count > _config.MaxMeasurementsPerOperation)
|
||||
{
|
||||
measurements.RemoveRange(0, measurements.Count - _config.MaxMeasurementsPerOperation);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes and stores a baseline for an operation.
|
||||
/// </summary>
|
||||
public BaselineMetrics ComputeBaseline(string operationName)
|
||||
{
|
||||
if (!_measurements.TryGetValue(operationName, out var measurements))
|
||||
{
|
||||
return new BaselineMetrics
|
||||
{
|
||||
OperationName = operationName,
|
||||
ComputedAt = _timeProvider.GetUtcNow(),
|
||||
SampleCount = 0
|
||||
};
|
||||
}
|
||||
|
||||
List<double> sorted;
|
||||
lock (measurements)
|
||||
{
|
||||
sorted = measurements.OrderBy(x => x).ToList();
|
||||
}
|
||||
|
||||
if (sorted.Count == 0)
|
||||
{
|
||||
return new BaselineMetrics
|
||||
{
|
||||
OperationName = operationName,
|
||||
ComputedAt = _timeProvider.GetUtcNow(),
|
||||
SampleCount = 0
|
||||
};
|
||||
}
|
||||
|
||||
var baseline = new BaselineMetrics
|
||||
{
|
||||
OperationName = operationName,
|
||||
SampleCount = sorted.Count,
|
||||
Min = sorted[0],
|
||||
Max = sorted[^1],
|
||||
Mean = sorted.Average(),
|
||||
Median = GetPercentile(sorted, 50),
|
||||
P90 = GetPercentile(sorted, 90),
|
||||
P95 = GetPercentile(sorted, 95),
|
||||
P99 = GetPercentile(sorted, 99),
|
||||
StandardDeviation = CalculateStandardDeviation(sorted),
|
||||
ComputedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
_baselines[operationName] = baseline;
|
||||
|
||||
_logger.LogInformation(
|
||||
"Computed baseline for {Operation}: P50={P50:F2}ms, P95={P95:F2}ms, P99={P99:F2}ms",
|
||||
operationName, baseline.Median, baseline.P95, baseline.P99);
|
||||
|
||||
return baseline;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current baseline for an operation.
|
||||
/// </summary>
|
||||
public BaselineMetrics? GetBaseline(string operationName)
|
||||
{
|
||||
return _baselines.TryGetValue(operationName, out var baseline) ? baseline : null;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets all baselines.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<string, BaselineMetrics> GetAllBaselines()
|
||||
{
|
||||
return _baselines;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks if a measurement exceeds the baseline threshold.
|
||||
/// </summary>
|
||||
public BaselineComparison CompareToBaseline(string operationName, double durationMs)
|
||||
{
|
||||
if (!_baselines.TryGetValue(operationName, out var baseline))
|
||||
{
|
||||
return new BaselineComparison
|
||||
{
|
||||
OperationName = operationName,
|
||||
DurationMs = durationMs,
|
||||
HasBaseline = false,
|
||||
Status = BaselineStatus.NoBaseline
|
||||
};
|
||||
}
|
||||
|
||||
var threshold = baseline.P95 * _config.RegressionThresholdMultiplier;
|
||||
var status = durationMs <= baseline.Median ? BaselineStatus.BetterThanBaseline :
|
||||
durationMs <= baseline.P95 ? BaselineStatus.WithinBaseline :
|
||||
durationMs <= threshold ? BaselineStatus.SlightlyAboveBaseline :
|
||||
BaselineStatus.Regression;
|
||||
|
||||
return new BaselineComparison
|
||||
{
|
||||
OperationName = operationName,
|
||||
DurationMs = durationMs,
|
||||
HasBaseline = true,
|
||||
Baseline = baseline,
|
||||
Status = status,
|
||||
PercentOfP95 = (durationMs / baseline.P95) * 100
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Clears measurements for an operation.
|
||||
/// </summary>
|
||||
public void ClearMeasurements(string operationName)
|
||||
{
|
||||
_measurements.TryRemove(operationName, out _);
|
||||
}
|
||||
|
||||
private static double GetPercentile(List<double> sorted, double percentile)
|
||||
{
|
||||
if (sorted.Count == 0) return 0;
|
||||
if (sorted.Count == 1) return sorted[0];
|
||||
|
||||
var index = (percentile / 100.0) * (sorted.Count - 1);
|
||||
var lower = (int)Math.Floor(index);
|
||||
var upper = (int)Math.Ceiling(index);
|
||||
var fraction = index - lower;
|
||||
|
||||
if (upper >= sorted.Count) upper = sorted.Count - 1;
|
||||
|
||||
return sorted[lower] + (sorted[upper] - sorted[lower]) * fraction;
|
||||
}
|
||||
|
||||
private static double CalculateStandardDeviation(List<double> values)
|
||||
{
|
||||
if (values.Count < 2) return 0;
|
||||
|
||||
var mean = values.Average();
|
||||
var sumSquaredDiff = values.Sum(v => (v - mean) * (v - mean));
|
||||
return Math.Sqrt(sumSquaredDiff / (values.Count - 1));
|
||||
}
|
||||
|
||||
private IEnumerable<Measurement<double>> GetBaselineObservations(string percentile)
|
||||
{
|
||||
foreach (var (name, baseline) in _baselines)
|
||||
{
|
||||
var value = percentile switch
|
||||
{
|
||||
"p50" => baseline.Median,
|
||||
"p95" => baseline.P95,
|
||||
"p99" => baseline.P99,
|
||||
_ => baseline.Mean
|
||||
};
|
||||
|
||||
yield return new Measurement<double>(value,
|
||||
new KeyValuePair<string, object?>("operation", name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IPerformanceBaseline
|
||||
{
|
||||
OperationMeasurement StartMeasurement(string operationName);
|
||||
void RecordMeasurement(string operationName, double durationMs, bool success = true);
|
||||
BaselineMetrics ComputeBaseline(string operationName);
|
||||
BaselineMetrics? GetBaseline(string operationName);
|
||||
IReadOnlyDictionary<string, BaselineMetrics> GetAllBaselines();
|
||||
BaselineComparison CompareToBaseline(string operationName, double durationMs);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record PerformanceBaselineConfig
|
||||
{
|
||||
public int MaxMeasurementsPerOperation { get; init; } = 10000;
|
||||
public double RegressionThresholdMultiplier { get; init; } = 1.5;
|
||||
public TimeSpan BaselineExpirationTime { get; init; } = TimeSpan.FromDays(7);
|
||||
}
|
||||
|
||||
public sealed record BaselineMetrics
|
||||
{
|
||||
public required string OperationName { get; init; }
|
||||
public required int SampleCount { get; init; }
|
||||
public double Min { get; init; }
|
||||
public double Max { get; init; }
|
||||
public double Mean { get; init; }
|
||||
public double Median { get; init; }
|
||||
public double P90 { get; init; }
|
||||
public double P95 { get; init; }
|
||||
public double P99 { get; init; }
|
||||
public double StandardDeviation { get; init; }
|
||||
public DateTimeOffset ComputedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record BaselineComparison
|
||||
{
|
||||
public required string OperationName { get; init; }
|
||||
public required double DurationMs { get; init; }
|
||||
public required bool HasBaseline { get; init; }
|
||||
public BaselineMetrics? Baseline { get; init; }
|
||||
public required BaselineStatus Status { get; init; }
|
||||
public double PercentOfP95 { get; init; }
|
||||
}
|
||||
|
||||
public enum BaselineStatus
|
||||
{
|
||||
NoBaseline,
|
||||
BetterThanBaseline,
|
||||
WithinBaseline,
|
||||
SlightlyAboveBaseline,
|
||||
Regression
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// RAII-style measurement helper.
|
||||
/// </summary>
|
||||
public readonly struct OperationMeasurement : IDisposable
|
||||
{
|
||||
private readonly PerformanceBaseline _baseline;
|
||||
private readonly string _operationName;
|
||||
private readonly Stopwatch _stopwatch;
|
||||
|
||||
public OperationMeasurement(PerformanceBaseline baseline, string operationName, Stopwatch stopwatch)
|
||||
{
|
||||
_baseline = baseline;
|
||||
_operationName = operationName;
|
||||
_stopwatch = stopwatch;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_stopwatch.Stop();
|
||||
_baseline.RecordMeasurement(_operationName, _stopwatch.Elapsed.TotalMilliseconds);
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Common Operation Names
|
||||
|
||||
public static class PerformanceOperations
|
||||
{
|
||||
public const string GateEvaluation = "gate_evaluation";
|
||||
public const string PolicyCheck = "policy_check";
|
||||
public const string ScanExecution = "scan_execution";
|
||||
public const string DigestResolution = "digest_resolution";
|
||||
public const string EvidenceStorage = "evidence_storage";
|
||||
public const string DeploymentExecution = "deployment_execution";
|
||||
public const string PromotionWorkflow = "promotion_workflow";
|
||||
public const string AuditLogWrite = "audit_log_write";
|
||||
public const string DatabaseQuery = "database_query";
|
||||
public const string CacheLookup = "cache_lookup";
|
||||
public const string RegistryPull = "registry_pull";
|
||||
public const string NotificationSend = "notification_send";
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,354 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// Prefetcher.cs
|
||||
// Sprint: SPRINT_20260117_038_ReleaseOrchestrator_performance
|
||||
// Task: TASK-038-07 - Predictive cache warming
|
||||
// Description: Intelligent prefetcher for predictive data loading
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Core.Performance;
|
||||
|
||||
/// <summary>
|
||||
/// Predictive prefetcher that warms cache based on access patterns.
|
||||
/// </summary>
|
||||
public sealed class Prefetcher : IPrefetcher, IDisposable
|
||||
{
|
||||
private readonly ICacheManager _cacheManager;
|
||||
private readonly PrefetcherConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<Prefetcher> _logger;
|
||||
|
||||
private readonly ConcurrentDictionary<string, PrefetchPattern> _accessPatterns = new();
|
||||
private readonly ConcurrentDictionary<string, List<DateTimeOffset>> _accessTimes = new();
|
||||
private readonly Channel<PrefetchRequest> _prefetchQueue;
|
||||
private readonly CancellationTokenSource _cts = new();
|
||||
private readonly Task _prefetchWorker;
|
||||
|
||||
// Registered data loaders
|
||||
private readonly ConcurrentDictionary<string, Func<string, CancellationToken, Task<object?>>> _loaders = new();
|
||||
|
||||
public Prefetcher(
|
||||
ICacheManager cacheManager,
|
||||
PrefetcherConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<Prefetcher> logger)
|
||||
{
|
||||
_cacheManager = cacheManager;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
|
||||
_prefetchQueue = Channel.CreateBounded<PrefetchRequest>(new BoundedChannelOptions(_config.MaxQueueSize)
|
||||
{
|
||||
FullMode = BoundedChannelFullMode.DropOldest
|
||||
});
|
||||
|
||||
_prefetchWorker = Task.Run(ProcessPrefetchQueueAsync);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers a data loader for a key pattern.
|
||||
/// </summary>
|
||||
public void RegisterLoader(string pattern, Func<string, CancellationToken, Task<object?>> loader)
|
||||
{
|
||||
_loaders[pattern] = loader;
|
||||
_logger.LogDebug("Registered loader for pattern: {Pattern}", pattern);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Records an access to a key and triggers predictive prefetching.
|
||||
/// </summary>
|
||||
public async Task RecordAccessAsync(string key, PrefetchHint hint = default)
|
||||
{
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
// Record access time
|
||||
var times = _accessTimes.GetOrAdd(key, _ => []);
|
||||
lock (times)
|
||||
{
|
||||
times.Add(now);
|
||||
if (times.Count > _config.MaxAccessHistoryPerKey)
|
||||
{
|
||||
times.RemoveRange(0, times.Count - _config.MaxAccessHistoryPerKey);
|
||||
}
|
||||
}
|
||||
|
||||
// Update pattern
|
||||
var pattern = _accessPatterns.GetOrAdd(key, _ => new PrefetchPattern { Key = key });
|
||||
pattern.AccessCount++;
|
||||
pattern.LastAccessAt = now;
|
||||
|
||||
// Process hints
|
||||
if (hint.RelatedKeys?.Any() == true)
|
||||
{
|
||||
foreach (var relatedKey in hint.RelatedKeys)
|
||||
{
|
||||
pattern.AddRelatedKey(relatedKey);
|
||||
}
|
||||
}
|
||||
|
||||
// Trigger predictive prefetch if pattern is established
|
||||
if (pattern.AccessCount >= _config.MinAccessesForPrediction)
|
||||
{
|
||||
await TriggerPredictivePrefetchAsync(pattern);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Manually requests prefetch for specific keys.
|
||||
/// </summary>
|
||||
public async Task PrefetchAsync(IEnumerable<string> keys, PrefetchPriority priority = PrefetchPriority.Normal)
|
||||
{
|
||||
foreach (var key in keys)
|
||||
{
|
||||
await _prefetchQueue.Writer.WriteAsync(new PrefetchRequest
|
||||
{
|
||||
Key = key,
|
||||
Priority = priority,
|
||||
RequestedAt = _timeProvider.GetUtcNow()
|
||||
}, _cts.Token);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Warms the cache with frequently accessed items.
|
||||
/// </summary>
|
||||
public async Task WarmCacheAsync(CancellationToken ct = default)
|
||||
{
|
||||
var hotKeys = _accessPatterns.Values
|
||||
.Where(p => p.AccessCount >= _config.MinAccessesForPrediction)
|
||||
.OrderByDescending(p => p.AccessCount)
|
||||
.Take(_config.MaxWarmupKeys)
|
||||
.Select(p => p.Key);
|
||||
|
||||
await PrefetchAsync(hotKeys, PrefetchPriority.High);
|
||||
|
||||
_logger.LogInformation("Cache warmup initiated for {Count} hot keys",
|
||||
hotKeys.Count());
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets prefetch statistics.
|
||||
/// </summary>
|
||||
public PrefetchStatistics GetStatistics()
|
||||
{
|
||||
return new PrefetchStatistics
|
||||
{
|
||||
TrackedPatterns = _accessPatterns.Count,
|
||||
QueuedPrefetches = _prefetchQueue.Reader.Count,
|
||||
HotKeys = _accessPatterns.Values
|
||||
.OrderByDescending(p => p.AccessCount)
|
||||
.Take(10)
|
||||
.Select(p => new HotKeyInfo
|
||||
{
|
||||
Key = p.Key,
|
||||
AccessCount = p.AccessCount,
|
||||
LastAccessAt = p.LastAccessAt
|
||||
})
|
||||
.ToList()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Clears all access patterns and history.
|
||||
/// </summary>
|
||||
public void ClearPatterns()
|
||||
{
|
||||
_accessPatterns.Clear();
|
||||
_accessTimes.Clear();
|
||||
_logger.LogInformation("Cleared all prefetch patterns");
|
||||
}
|
||||
|
||||
private async Task TriggerPredictivePrefetchAsync(PrefetchPattern pattern)
|
||||
{
|
||||
// Predict related keys to prefetch
|
||||
var relatedKeys = pattern.GetTopRelatedKeys(_config.MaxRelatedKeysPrefetch);
|
||||
|
||||
foreach (var key in relatedKeys)
|
||||
{
|
||||
// Check if already in cache
|
||||
var existing = await _cacheManager.GetAsync<object>(key);
|
||||
if (existing.HasValue) continue;
|
||||
|
||||
// Queue for prefetch
|
||||
await _prefetchQueue.Writer.WriteAsync(new PrefetchRequest
|
||||
{
|
||||
Key = key,
|
||||
Priority = PrefetchPriority.Predictive,
|
||||
RequestedAt = _timeProvider.GetUtcNow(),
|
||||
SourcePattern = pattern.Key
|
||||
}, _cts.Token);
|
||||
}
|
||||
}
|
||||
|
||||
private async Task ProcessPrefetchQueueAsync()
|
||||
{
|
||||
await foreach (var request in _prefetchQueue.Reader.ReadAllAsync(_cts.Token))
|
||||
{
|
||||
try
|
||||
{
|
||||
// Skip if already in cache
|
||||
var existing = await _cacheManager.GetAsync<object>(request.Key);
|
||||
if (existing.HasValue) continue;
|
||||
|
||||
// Find loader for this key
|
||||
var loader = FindLoader(request.Key);
|
||||
if (loader is null)
|
||||
{
|
||||
_logger.LogDebug("No loader found for key: {Key}", request.Key);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Load data
|
||||
var data = await loader(request.Key, _cts.Token);
|
||||
if (data is null) continue;
|
||||
|
||||
// Store in cache with prefetch TTL
|
||||
await _cacheManager.SetAsync(request.Key, data, new CacheOptions
|
||||
{
|
||||
Ttl = _config.PrefetchedItemTtl
|
||||
});
|
||||
|
||||
_logger.LogDebug("Prefetched key: {Key} (source: {Source})",
|
||||
request.Key, request.SourcePattern ?? "manual");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to prefetch key: {Key}", request.Key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Func<string, CancellationToken, Task<object?>>? FindLoader(string key)
|
||||
{
|
||||
foreach (var (pattern, loader) in _loaders)
|
||||
{
|
||||
if (key.StartsWith(pattern, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
return loader;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
_cts.Cancel();
|
||||
_prefetchQueue.Writer.Complete();
|
||||
_prefetchWorker.Wait(TimeSpan.FromSeconds(5));
|
||||
_cts.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IPrefetcher
|
||||
{
|
||||
void RegisterLoader(string pattern, Func<string, CancellationToken, Task<object?>> loader);
|
||||
Task RecordAccessAsync(string key, PrefetchHint hint = default);
|
||||
Task PrefetchAsync(IEnumerable<string> keys, PrefetchPriority priority = PrefetchPriority.Normal);
|
||||
Task WarmCacheAsync(CancellationToken ct = default);
|
||||
PrefetchStatistics GetStatistics();
|
||||
}
|
||||
|
||||
public interface ICacheManager
|
||||
{
|
||||
Task<CacheResult<T>> GetAsync<T>(string key, CancellationToken ct = default);
|
||||
Task SetAsync<T>(string key, T value, CacheOptions options, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record PrefetcherConfig
|
||||
{
|
||||
public int MaxQueueSize { get; init; } = 1000;
|
||||
public int MaxAccessHistoryPerKey { get; init; } = 100;
|
||||
public int MinAccessesForPrediction { get; init; } = 5;
|
||||
public int MaxRelatedKeysPrefetch { get; init; } = 10;
|
||||
public int MaxWarmupKeys { get; init; } = 100;
|
||||
public TimeSpan PrefetchedItemTtl { get; init; } = TimeSpan.FromMinutes(10);
|
||||
}
|
||||
|
||||
public sealed record PrefetchHint
|
||||
{
|
||||
public IEnumerable<string>? RelatedKeys { get; init; }
|
||||
public string? Category { get; init; }
|
||||
}
|
||||
|
||||
public enum PrefetchPriority
|
||||
{
|
||||
Low = 0,
|
||||
Normal = 1,
|
||||
Predictive = 2,
|
||||
High = 3
|
||||
}
|
||||
|
||||
public sealed record PrefetchRequest
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required PrefetchPriority Priority { get; init; }
|
||||
public required DateTimeOffset RequestedAt { get; init; }
|
||||
public string? SourcePattern { get; init; }
|
||||
}
|
||||
|
||||
public sealed record PrefetchStatistics
|
||||
{
|
||||
public required int TrackedPatterns { get; init; }
|
||||
public required int QueuedPrefetches { get; init; }
|
||||
public required List<HotKeyInfo> HotKeys { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HotKeyInfo
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public required int AccessCount { get; init; }
|
||||
public required DateTimeOffset LastAccessAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed class PrefetchPattern
|
||||
{
|
||||
public required string Key { get; init; }
|
||||
public int AccessCount { get; set; }
|
||||
public DateTimeOffset LastAccessAt { get; set; }
|
||||
|
||||
private readonly ConcurrentDictionary<string, int> _relatedKeys = new();
|
||||
|
||||
public void AddRelatedKey(string key)
|
||||
{
|
||||
_relatedKeys.AddOrUpdate(key, 1, (_, c) => c + 1);
|
||||
}
|
||||
|
||||
public IEnumerable<string> GetTopRelatedKeys(int count)
|
||||
{
|
||||
return _relatedKeys
|
||||
.OrderByDescending(kvp => kvp.Value)
|
||||
.Take(count)
|
||||
.Select(kvp => kvp.Key);
|
||||
}
|
||||
}
|
||||
|
||||
public sealed record CacheOptions
|
||||
{
|
||||
public TimeSpan? Ttl { get; init; }
|
||||
}
|
||||
|
||||
public readonly struct CacheResult<T>
|
||||
{
|
||||
public readonly T? Value;
|
||||
public readonly bool HasValue;
|
||||
|
||||
public CacheResult(T value)
|
||||
{
|
||||
Value = value;
|
||||
HasValue = true;
|
||||
}
|
||||
|
||||
public static CacheResult<T> Miss => default;
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,491 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// HealthAnalyzer.cs
|
||||
// Sprint: SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence
|
||||
// Task: TASK-033-03 - Health Analyzer for baseline comparison
|
||||
// Description: Evaluates current health metrics against baselines with signal analysis
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback;
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates deployment health by comparing current metrics against baselines.
|
||||
/// Supports configurable health signals with weighted scoring.
|
||||
/// </summary>
|
||||
public sealed class HealthAnalyzer : IHealthAnalyzer
|
||||
{
|
||||
private readonly IMetricsCollector _metricsCollector;
|
||||
private readonly IBaselineManager _baselineManager;
|
||||
private readonly IAnomalyDetector _anomalyDetector;
|
||||
private readonly HealthAnalyzerConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<HealthAnalyzer> _logger;
|
||||
|
||||
public HealthAnalyzer(
|
||||
IMetricsCollector metricsCollector,
|
||||
IBaselineManager baselineManager,
|
||||
IAnomalyDetector anomalyDetector,
|
||||
HealthAnalyzerConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<HealthAnalyzer> logger)
|
||||
{
|
||||
_metricsCollector = metricsCollector;
|
||||
_baselineManager = baselineManager;
|
||||
_anomalyDetector = anomalyDetector;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates the current health status of a deployment.
|
||||
/// </summary>
|
||||
/// <param name="deploymentId">The deployment identifier.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Health evaluation result with detailed analysis.</returns>
|
||||
public async Task<HealthEvaluation> EvaluateHealthAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Evaluating health for deployment {DeploymentId}", deploymentId);
|
||||
|
||||
var baseline = await _baselineManager.GetBaselineAsync(deploymentId, ct);
|
||||
if (baseline is null)
|
||||
{
|
||||
_logger.LogWarning("No baseline found for deployment {DeploymentId}", deploymentId);
|
||||
return CreateUnknownResult(deploymentId, "No baseline available");
|
||||
}
|
||||
|
||||
var currentMetrics = await _metricsCollector.CollectCurrentAsync(deploymentId, ct);
|
||||
var signalResults = await EvaluateSignalsAsync(baseline, currentMetrics, ct);
|
||||
|
||||
var overallScore = CalculateOverallScore(signalResults);
|
||||
var status = DetermineHealthStatus(overallScore, signalResults);
|
||||
|
||||
var result = new HealthEvaluation
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
Status = status,
|
||||
OverallScore = overallScore,
|
||||
Signals = signalResults,
|
||||
EvaluatedAt = _timeProvider.GetUtcNow(),
|
||||
BaselineVersion = baseline.Version,
|
||||
Recommendation = GenerateRecommendation(status, signalResults)
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Health evaluation for {DeploymentId}: Status={Status}, Score={Score:F2}",
|
||||
deploymentId, status, overallScore);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates health for multiple deployments in a release.
|
||||
/// </summary>
|
||||
public async Task<ReleaseHealthEvaluation> EvaluateReleaseHealthAsync(
|
||||
Guid releaseId,
|
||||
ImmutableArray<Guid> deploymentIds,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var evaluations = new List<HealthEvaluation>();
|
||||
|
||||
foreach (var deploymentId in deploymentIds)
|
||||
{
|
||||
var evaluation = await EvaluateHealthAsync(deploymentId, ct);
|
||||
evaluations.Add(evaluation);
|
||||
}
|
||||
|
||||
var overallStatus = AggregateStatus(evaluations);
|
||||
var criticalDeployments = evaluations
|
||||
.Where(e => e.Status == HealthStatus.Critical)
|
||||
.Select(e => e.DeploymentId)
|
||||
.ToImmutableArray();
|
||||
|
||||
return new ReleaseHealthEvaluation
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
OverallStatus = overallStatus,
|
||||
DeploymentEvaluations = evaluations.ToImmutableArray(),
|
||||
CriticalDeployments = criticalDeployments,
|
||||
EvaluatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Continuously monitors health and reports changes.
|
||||
/// </summary>
|
||||
public async IAsyncEnumerable<HealthEvaluation> MonitorHealthAsync(
|
||||
Guid deploymentId,
|
||||
TimeSpan interval,
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
var evaluation = await EvaluateHealthAsync(deploymentId, ct);
|
||||
yield return evaluation;
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(interval, ct);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<SignalEvaluation>> EvaluateSignalsAsync(
|
||||
DeploymentBaseline baseline,
|
||||
MetricsSnapshot currentMetrics,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var results = new List<SignalEvaluation>();
|
||||
|
||||
foreach (var signal in _config.Signals)
|
||||
{
|
||||
var evaluation = await EvaluateSignalAsync(signal, baseline, currentMetrics, ct);
|
||||
results.Add(evaluation);
|
||||
}
|
||||
|
||||
return results.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<SignalEvaluation> EvaluateSignalAsync(
|
||||
HealthSignal signal,
|
||||
DeploymentBaseline baseline,
|
||||
MetricsSnapshot currentMetrics,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var currentValue = currentMetrics.GetMetricValue(signal.MetricName);
|
||||
var baselineValue = baseline.GetMetricBaseline(signal.MetricName);
|
||||
|
||||
if (!currentValue.HasValue || !baselineValue.HasValue)
|
||||
{
|
||||
return new SignalEvaluation
|
||||
{
|
||||
SignalName = signal.Name,
|
||||
MetricName = signal.MetricName,
|
||||
Status = SignalStatus.Unknown,
|
||||
Score = 0.5,
|
||||
Message = "Metric data unavailable"
|
||||
};
|
||||
}
|
||||
|
||||
// Check for anomalies
|
||||
var isAnomaly = await _anomalyDetector.IsAnomalyAsync(
|
||||
signal.MetricName,
|
||||
currentValue.Value,
|
||||
baseline.GetMetricHistory(signal.MetricName),
|
||||
ct);
|
||||
|
||||
// Calculate deviation
|
||||
var deviation = CalculateDeviation(currentValue.Value, baselineValue.Value, signal);
|
||||
var score = CalculateSignalScore(deviation, signal);
|
||||
var status = DetermineSignalStatus(score, isAnomaly, signal);
|
||||
|
||||
return new SignalEvaluation
|
||||
{
|
||||
SignalName = signal.Name,
|
||||
MetricName = signal.MetricName,
|
||||
CurrentValue = currentValue.Value,
|
||||
BaselineValue = baselineValue.Value,
|
||||
Deviation = deviation,
|
||||
DeviationPercent = baselineValue.Value != 0
|
||||
? Math.Abs(deviation / baselineValue.Value * 100)
|
||||
: 0,
|
||||
IsAnomaly = isAnomaly,
|
||||
Score = score,
|
||||
Status = status,
|
||||
Threshold = signal.Threshold,
|
||||
Message = GenerateSignalMessage(status, deviation, signal)
|
||||
};
|
||||
}
|
||||
|
||||
private static double CalculateDeviation(double current, double baseline, HealthSignal signal)
|
||||
{
|
||||
return signal.Direction switch
|
||||
{
|
||||
SignalDirection.LowerIsBetter => current - baseline,
|
||||
SignalDirection.HigherIsBetter => baseline - current,
|
||||
SignalDirection.CloserIsBetter => Math.Abs(current - baseline),
|
||||
_ => current - baseline
|
||||
};
|
||||
}
|
||||
|
||||
private static double CalculateSignalScore(double deviation, HealthSignal signal)
|
||||
{
|
||||
if (signal.Threshold == 0) return 1.0;
|
||||
|
||||
// Score from 0 to 1, where 1 is healthy and 0 is critical
|
||||
var normalizedDeviation = Math.Abs(deviation) / signal.Threshold;
|
||||
var score = Math.Max(0, 1 - normalizedDeviation);
|
||||
|
||||
return Math.Round(score, 4);
|
||||
}
|
||||
|
||||
private static SignalStatus DetermineSignalStatus(double score, bool isAnomaly, HealthSignal signal)
|
||||
{
|
||||
if (isAnomaly && signal.AnomalyIsCritical)
|
||||
return SignalStatus.Critical;
|
||||
|
||||
return score switch
|
||||
{
|
||||
>= 0.9 => SignalStatus.Healthy,
|
||||
>= 0.7 => SignalStatus.Warning,
|
||||
>= 0.5 => SignalStatus.Degraded,
|
||||
_ => SignalStatus.Critical
|
||||
};
|
||||
}
|
||||
|
||||
private double CalculateOverallScore(ImmutableArray<SignalEvaluation> signals)
|
||||
{
|
||||
if (signals.Length == 0) return 0.5;
|
||||
|
||||
var totalWeight = 0.0;
|
||||
var weightedScore = 0.0;
|
||||
|
||||
foreach (var signal in signals)
|
||||
{
|
||||
var signalConfig = _config.Signals.FirstOrDefault(s => s.Name == signal.SignalName);
|
||||
var weight = signalConfig?.Weight ?? 1.0;
|
||||
|
||||
totalWeight += weight;
|
||||
weightedScore += signal.Score * weight;
|
||||
}
|
||||
|
||||
return totalWeight > 0 ? weightedScore / totalWeight : 0.5;
|
||||
}
|
||||
|
||||
private static HealthStatus DetermineHealthStatus(double overallScore, ImmutableArray<SignalEvaluation> signals)
|
||||
{
|
||||
// Any critical signal makes overall status critical
|
||||
if (signals.Any(s => s.Status == SignalStatus.Critical))
|
||||
return HealthStatus.Critical;
|
||||
|
||||
return overallScore switch
|
||||
{
|
||||
>= 0.9 => HealthStatus.Healthy,
|
||||
>= 0.7 => HealthStatus.Warning,
|
||||
>= 0.5 => HealthStatus.Degraded,
|
||||
_ => HealthStatus.Critical
|
||||
};
|
||||
}
|
||||
|
||||
private static HealthStatus AggregateStatus(IEnumerable<HealthEvaluation> evaluations)
|
||||
{
|
||||
var statuses = evaluations.Select(e => e.Status).ToList();
|
||||
|
||||
if (statuses.Any(s => s == HealthStatus.Critical))
|
||||
return HealthStatus.Critical;
|
||||
if (statuses.Any(s => s == HealthStatus.Degraded))
|
||||
return HealthStatus.Degraded;
|
||||
if (statuses.Any(s => s == HealthStatus.Warning))
|
||||
return HealthStatus.Warning;
|
||||
if (statuses.All(s => s == HealthStatus.Healthy))
|
||||
return HealthStatus.Healthy;
|
||||
|
||||
return HealthStatus.Unknown;
|
||||
}
|
||||
|
||||
private static HealthEvaluation CreateUnknownResult(Guid deploymentId, string reason)
|
||||
{
|
||||
return new HealthEvaluation
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
Status = HealthStatus.Unknown,
|
||||
OverallScore = 0.5,
|
||||
Signals = [],
|
||||
EvaluatedAt = DateTimeOffset.UtcNow,
|
||||
BaselineVersion = 0,
|
||||
Recommendation = new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.Investigate,
|
||||
Reason = reason,
|
||||
Confidence = 0.0
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private HealthRecommendation GenerateRecommendation(
|
||||
HealthStatus status,
|
||||
ImmutableArray<SignalEvaluation> signals)
|
||||
{
|
||||
var criticalSignals = signals.Where(s => s.Status == SignalStatus.Critical).ToList();
|
||||
|
||||
return status switch
|
||||
{
|
||||
HealthStatus.Critical => new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.Rollback,
|
||||
Reason = $"Critical health issues detected: {string.Join(", ", criticalSignals.Select(s => s.SignalName))}",
|
||||
Confidence = 0.9,
|
||||
AffectedSignals = criticalSignals.Select(s => s.SignalName).ToImmutableArray()
|
||||
},
|
||||
HealthStatus.Degraded => new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.Investigate,
|
||||
Reason = "Deployment health is degraded, investigation recommended",
|
||||
Confidence = 0.7,
|
||||
AffectedSignals = signals.Where(s => s.Status <= SignalStatus.Degraded)
|
||||
.Select(s => s.SignalName).ToImmutableArray()
|
||||
},
|
||||
HealthStatus.Warning => new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.Monitor,
|
||||
Reason = "Minor health deviations detected, continued monitoring advised",
|
||||
Confidence = 0.8,
|
||||
AffectedSignals = signals.Where(s => s.Status == SignalStatus.Warning)
|
||||
.Select(s => s.SignalName).ToImmutableArray()
|
||||
},
|
||||
_ => new HealthRecommendation
|
||||
{
|
||||
Action = RecommendedAction.None,
|
||||
Reason = "Deployment is healthy",
|
||||
Confidence = 1.0,
|
||||
AffectedSignals = []
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private static string GenerateSignalMessage(SignalStatus status, double deviation, HealthSignal signal)
|
||||
{
|
||||
return status switch
|
||||
{
|
||||
SignalStatus.Critical => $"{signal.Name} is critically degraded (deviation: {deviation:F2})",
|
||||
SignalStatus.Degraded => $"{signal.Name} is degraded (deviation: {deviation:F2})",
|
||||
SignalStatus.Warning => $"{signal.Name} shows minor deviation ({deviation:F2})",
|
||||
SignalStatus.Healthy => $"{signal.Name} is within normal range",
|
||||
_ => $"{signal.Name} status unknown"
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IHealthAnalyzer
|
||||
{
|
||||
Task<HealthEvaluation> EvaluateHealthAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
Task<ReleaseHealthEvaluation> EvaluateReleaseHealthAsync(Guid releaseId, ImmutableArray<Guid> deploymentIds, CancellationToken ct = default);
|
||||
IAsyncEnumerable<HealthEvaluation> MonitorHealthAsync(Guid deploymentId, TimeSpan interval, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IMetricsCollector
|
||||
{
|
||||
Task<MetricsSnapshot> CollectCurrentAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IBaselineManager
|
||||
{
|
||||
Task<DeploymentBaseline?> GetBaselineAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IAnomalyDetector
|
||||
{
|
||||
Task<bool> IsAnomalyAsync(string metricName, double value, ImmutableArray<double> history, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record HealthAnalyzerConfig
|
||||
{
|
||||
public ImmutableArray<HealthSignal> Signals { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record HealthSignal
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string MetricName { get; init; }
|
||||
public double Threshold { get; init; }
|
||||
public double Weight { get; init; } = 1.0;
|
||||
public SignalDirection Direction { get; init; } = SignalDirection.LowerIsBetter;
|
||||
public bool AnomalyIsCritical { get; init; } = false;
|
||||
}
|
||||
|
||||
public enum SignalDirection { LowerIsBetter, HigherIsBetter, CloserIsBetter }
|
||||
|
||||
public sealed record HealthEvaluation
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required HealthStatus Status { get; init; }
|
||||
public required double OverallScore { get; init; }
|
||||
public required ImmutableArray<SignalEvaluation> Signals { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
public required int BaselineVersion { get; init; }
|
||||
public required HealthRecommendation Recommendation { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ReleaseHealthEvaluation
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required HealthStatus OverallStatus { get; init; }
|
||||
public required ImmutableArray<HealthEvaluation> DeploymentEvaluations { get; init; }
|
||||
public required ImmutableArray<Guid> CriticalDeployments { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SignalEvaluation
|
||||
{
|
||||
public required string SignalName { get; init; }
|
||||
public required string MetricName { get; init; }
|
||||
public double? CurrentValue { get; init; }
|
||||
public double? BaselineValue { get; init; }
|
||||
public double Deviation { get; init; }
|
||||
public double DeviationPercent { get; init; }
|
||||
public bool IsAnomaly { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public required SignalStatus Status { get; init; }
|
||||
public double Threshold { get; init; }
|
||||
public string? Message { get; init; }
|
||||
}
|
||||
|
||||
public sealed record HealthRecommendation
|
||||
{
|
||||
public required RecommendedAction Action { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public ImmutableArray<string> AffectedSignals { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record DeploymentBaseline
|
||||
{
|
||||
public Guid DeploymentId { get; init; }
|
||||
public int Version { get; init; }
|
||||
private readonly ImmutableDictionary<string, double> _metrics;
|
||||
private readonly ImmutableDictionary<string, ImmutableArray<double>> _history;
|
||||
|
||||
public DeploymentBaseline(
|
||||
ImmutableDictionary<string, double> metrics,
|
||||
ImmutableDictionary<string, ImmutableArray<double>> history)
|
||||
{
|
||||
_metrics = metrics;
|
||||
_history = history;
|
||||
}
|
||||
|
||||
public double? GetMetricBaseline(string metricName) =>
|
||||
_metrics.TryGetValue(metricName, out var value) ? value : null;
|
||||
|
||||
public ImmutableArray<double> GetMetricHistory(string metricName) =>
|
||||
_history.GetValueOrDefault(metricName, []);
|
||||
}
|
||||
|
||||
public sealed record MetricsSnapshot
|
||||
{
|
||||
private readonly ImmutableDictionary<string, double> _values;
|
||||
|
||||
public MetricsSnapshot(ImmutableDictionary<string, double> values) => _values = values;
|
||||
|
||||
public double? GetMetricValue(string metricName) =>
|
||||
_values.TryGetValue(metricName, out var value) ? value : null;
|
||||
}
|
||||
|
||||
public enum HealthStatus { Unknown, Critical, Degraded, Warning, Healthy }
|
||||
public enum SignalStatus { Unknown, Critical, Degraded, Warning, Healthy }
|
||||
public enum RecommendedAction { None, Monitor, Investigate, Rollback }
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,806 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ImpactAnalyzer.cs
|
||||
// Sprint: SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence
|
||||
// Task: TASK-033-06 - Impact Analyzer for rollback assessment
|
||||
// Description: Analyzes rollback impact including downstream dependencies and blast radius
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback;
|
||||
|
||||
/// <summary>
|
||||
/// Analyzes the impact of a potential rollback including downstream dependencies,
|
||||
/// affected services, and estimated downtime.
|
||||
/// </summary>
|
||||
public sealed class ImpactAnalyzer : IImpactAnalyzer
|
||||
{
|
||||
private readonly IDependencyGraph _dependencyGraph;
|
||||
private readonly IServiceRegistry _serviceRegistry;
|
||||
private readonly ITrafficAnalyzer _trafficAnalyzer;
|
||||
private readonly ImpactAnalyzerConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<ImpactAnalyzer> _logger;
|
||||
|
||||
public ImpactAnalyzer(
|
||||
IDependencyGraph dependencyGraph,
|
||||
IServiceRegistry serviceRegistry,
|
||||
ITrafficAnalyzer trafficAnalyzer,
|
||||
ImpactAnalyzerConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<ImpactAnalyzer> logger)
|
||||
{
|
||||
_dependencyGraph = dependencyGraph;
|
||||
_serviceRegistry = serviceRegistry;
|
||||
_trafficAnalyzer = trafficAnalyzer;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Analyzes the impact of rolling back a deployment.
|
||||
/// </summary>
|
||||
/// <param name="deploymentId">The deployment to analyze.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Comprehensive impact analysis.</returns>
|
||||
public async Task<ImpactAnalysis> AnalyzeImpactAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Analyzing rollback impact for deployment {DeploymentId}", deploymentId);
|
||||
|
||||
var deployment = await _serviceRegistry.GetDeploymentAsync(deploymentId, ct);
|
||||
if (deployment is null)
|
||||
{
|
||||
throw new InvalidOperationException($"Deployment {deploymentId} not found");
|
||||
}
|
||||
|
||||
// Analyze in parallel
|
||||
var dependencyTask = AnalyzeDependencyImpactAsync(deployment, ct);
|
||||
var trafficTask = AnalyzeTrafficImpactAsync(deployment, ct);
|
||||
var downtimeTask = EstimateDowntimeAsync(deployment, ct);
|
||||
var dataTask = AnalyzeDataImpactAsync(deployment, ct);
|
||||
|
||||
await Task.WhenAll(dependencyTask, trafficTask, downtimeTask, dataTask);
|
||||
|
||||
var dependencyImpact = dependencyTask.Result;
|
||||
var trafficImpact = trafficTask.Result;
|
||||
var downtimeEstimate = downtimeTask.Result;
|
||||
var dataImpact = dataTask.Result;
|
||||
|
||||
// Calculate blast radius
|
||||
var blastRadius = CalculateBlastRadius(
|
||||
deployment,
|
||||
dependencyImpact,
|
||||
trafficImpact);
|
||||
|
||||
// Generate risk assessment
|
||||
var riskAssessment = AssessRisk(
|
||||
blastRadius,
|
||||
downtimeEstimate,
|
||||
dataImpact);
|
||||
|
||||
var analysis = new ImpactAnalysis
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
ServiceName = deployment.ServiceName,
|
||||
BlastRadius = blastRadius,
|
||||
DependencyImpact = dependencyImpact,
|
||||
TrafficImpact = trafficImpact,
|
||||
DowntimeEstimate = downtimeEstimate,
|
||||
DataImpact = dataImpact,
|
||||
RiskAssessment = riskAssessment,
|
||||
Mitigations = GenerateMitigations(blastRadius, riskAssessment),
|
||||
AnalyzedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Impact analysis for {DeploymentId}: BlastRadius={BlastRadius}, Risk={Risk}",
|
||||
deploymentId, blastRadius.Score, riskAssessment.OverallRisk);
|
||||
|
||||
return analysis;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compares impact between full rollback and partial rollback options.
|
||||
/// </summary>
|
||||
public async Task<RollbackComparison> CompareRollbackOptionsAsync(
|
||||
Guid deploymentId,
|
||||
ImmutableArray<string> components,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var fullRollbackImpact = await AnalyzeImpactAsync(deploymentId, ct);
|
||||
|
||||
var partialImpacts = new List<ComponentImpact>();
|
||||
foreach (var component in components)
|
||||
{
|
||||
var impact = await AnalyzeComponentImpactAsync(deploymentId, component, ct);
|
||||
partialImpacts.Add(impact);
|
||||
}
|
||||
|
||||
// Find optimal rollback strategy
|
||||
var optimalStrategy = DetermineOptimalStrategy(
|
||||
fullRollbackImpact,
|
||||
partialImpacts);
|
||||
|
||||
return new RollbackComparison
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
FullRollbackImpact = fullRollbackImpact,
|
||||
ComponentImpacts = partialImpacts.ToImmutableArray(),
|
||||
OptimalStrategy = optimalStrategy,
|
||||
Recommendation = GenerateStrategyRecommendation(optimalStrategy)
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the dependency chain that would be affected by a rollback.
|
||||
/// </summary>
|
||||
public async Task<DependencyChain> GetAffectedDependencyChainAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var deployment = await _serviceRegistry.GetDeploymentAsync(deploymentId, ct);
|
||||
if (deployment is null)
|
||||
{
|
||||
throw new InvalidOperationException($"Deployment {deploymentId} not found");
|
||||
}
|
||||
|
||||
var upstreamDeps = await _dependencyGraph.GetUpstreamDependenciesAsync(
|
||||
deployment.ServiceName, _config.MaxDependencyDepth, ct);
|
||||
|
||||
var downstreamDeps = await _dependencyGraph.GetDownstreamDependenciesAsync(
|
||||
deployment.ServiceName, _config.MaxDependencyDepth, ct);
|
||||
|
||||
return new DependencyChain
|
||||
{
|
||||
ServiceName = deployment.ServiceName,
|
||||
UpstreamDependencies = upstreamDeps,
|
||||
DownstreamDependencies = downstreamDeps,
|
||||
TotalAffectedServices = upstreamDeps.Length + downstreamDeps.Length + 1
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DependencyImpact> AnalyzeDependencyImpactAsync(
|
||||
DeploymentInfo deployment,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var downstream = await _dependencyGraph.GetDownstreamDependenciesAsync(
|
||||
deployment.ServiceName, _config.MaxDependencyDepth, ct);
|
||||
|
||||
var affectedServices = new List<AffectedService>();
|
||||
var totalRequestsAffected = 0L;
|
||||
|
||||
foreach (var dep in downstream)
|
||||
{
|
||||
var serviceInfo = await _serviceRegistry.GetServiceAsync(dep.ServiceName, ct);
|
||||
if (serviceInfo is null) continue;
|
||||
|
||||
var requestVolume = await _trafficAnalyzer.GetRequestVolumeAsync(
|
||||
dep.ServiceName, TimeSpan.FromMinutes(5), ct);
|
||||
|
||||
affectedServices.Add(new AffectedService
|
||||
{
|
||||
ServiceName = dep.ServiceName,
|
||||
DependencyType = dep.DependencyType,
|
||||
Criticality = serviceInfo.Criticality,
|
||||
RequestVolume = requestVolume,
|
||||
ImpactLevel = CalculateServiceImpactLevel(dep, serviceInfo, requestVolume)
|
||||
});
|
||||
|
||||
totalRequestsAffected += requestVolume;
|
||||
}
|
||||
|
||||
return new DependencyImpact
|
||||
{
|
||||
DirectDependencies = downstream.Where(d => d.Depth == 1).Count(),
|
||||
TransitiveDependencies = downstream.Where(d => d.Depth > 1).Count(),
|
||||
AffectedServices = affectedServices.ToImmutableArray(),
|
||||
TotalRequestsAffected = totalRequestsAffected,
|
||||
CriticalServicesAffected = affectedServices.Count(s => s.Criticality >= ServiceCriticality.High)
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<TrafficImpact> AnalyzeTrafficImpactAsync(
|
||||
DeploymentInfo deployment,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var currentRps = await _trafficAnalyzer.GetRequestVolumeAsync(
|
||||
deployment.ServiceName, TimeSpan.FromMinutes(1), ct);
|
||||
|
||||
var peakRps = await _trafficAnalyzer.GetPeakRequestVolumeAsync(
|
||||
deployment.ServiceName, TimeSpan.FromHours(1), ct);
|
||||
|
||||
var errorRate = await _trafficAnalyzer.GetErrorRateAsync(
|
||||
deployment.ServiceName, TimeSpan.FromMinutes(5), ct);
|
||||
|
||||
var userSessions = await _trafficAnalyzer.GetActiveUserSessionsAsync(
|
||||
deployment.ServiceName, ct);
|
||||
|
||||
return new TrafficImpact
|
||||
{
|
||||
CurrentRequestsPerSecond = currentRps,
|
||||
PeakRequestsPerSecond = peakRps,
|
||||
CurrentErrorRate = errorRate,
|
||||
ActiveUserSessions = userSessions,
|
||||
EstimatedUsersAffected = CalculateAffectedUsers(currentRps, userSessions),
|
||||
IsHighTrafficPeriod = currentRps > peakRps * 0.8
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DowntimeEstimate> EstimateDowntimeAsync(
|
||||
DeploymentInfo deployment,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var rollbackDuration = await EstimateRollbackDurationAsync(deployment, ct);
|
||||
var validationDuration = _config.ValidationDuration;
|
||||
var propagationDelay = await EstimatePropagationDelayAsync(deployment, ct);
|
||||
|
||||
var totalDowntime = rollbackDuration + validationDuration + propagationDelay;
|
||||
|
||||
// Calculate business impact
|
||||
var hourlyRevenue = await GetHourlyRevenueAsync(deployment.ServiceName, ct);
|
||||
var estimatedRevenueLoss = hourlyRevenue * (decimal)totalDowntime.TotalHours;
|
||||
|
||||
return new DowntimeEstimate
|
||||
{
|
||||
RollbackDuration = rollbackDuration,
|
||||
ValidationDuration = validationDuration,
|
||||
PropagationDelay = propagationDelay,
|
||||
TotalEstimatedDowntime = totalDowntime,
|
||||
ConfidenceInterval = CalculateConfidenceInterval(totalDowntime),
|
||||
EstimatedRevenueLoss = estimatedRevenueLoss
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<DataImpact> AnalyzeDataImpactAsync(
|
||||
DeploymentInfo deployment,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var schemaChanges = await _serviceRegistry.GetSchemaChangesAsync(
|
||||
deployment.DeploymentId, ct);
|
||||
|
||||
var dataIntegrityRisks = new List<DataIntegrityRisk>();
|
||||
|
||||
foreach (var change in schemaChanges)
|
||||
{
|
||||
if (change.IsBreakingChange)
|
||||
{
|
||||
dataIntegrityRisks.Add(new DataIntegrityRisk
|
||||
{
|
||||
ChangeType = change.ChangeType,
|
||||
AffectedTable = change.TableName,
|
||||
Description = change.Description,
|
||||
MigrationRequired = change.RequiresMigration,
|
||||
Severity = change.IsDataLoss ? RiskSeverity.Critical : RiskSeverity.High
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return new DataImpact
|
||||
{
|
||||
SchemaChanges = schemaChanges,
|
||||
HasBreakingChanges = schemaChanges.Any(c => c.IsBreakingChange),
|
||||
DataIntegrityRisks = dataIntegrityRisks.ToImmutableArray(),
|
||||
RequiresDataMigration = schemaChanges.Any(c => c.RequiresMigration),
|
||||
PotentialDataLoss = schemaChanges.Any(c => c.IsDataLoss)
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<ComponentImpact> AnalyzeComponentImpactAsync(
|
||||
Guid deploymentId,
|
||||
string componentName,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var componentDeps = await _dependencyGraph.GetComponentDependenciesAsync(
|
||||
componentName, ct);
|
||||
|
||||
var traffic = await _trafficAnalyzer.GetComponentTrafficAsync(
|
||||
componentName, TimeSpan.FromMinutes(5), ct);
|
||||
|
||||
return new ComponentImpact
|
||||
{
|
||||
ComponentName = componentName,
|
||||
DirectDependencies = componentDeps.Length,
|
||||
RequestVolume = traffic,
|
||||
CanRollbackIndependently = componentDeps.All(d => !d.IsRequired),
|
||||
RollbackComplexity = CalculateComponentComplexity(componentDeps)
|
||||
};
|
||||
}
|
||||
|
||||
private BlastRadius CalculateBlastRadius(
|
||||
DeploymentInfo deployment,
|
||||
DependencyImpact dependencyImpact,
|
||||
TrafficImpact trafficImpact)
|
||||
{
|
||||
var serviceScore = dependencyImpact.AffectedServices.Length * 0.1;
|
||||
var criticalScore = dependencyImpact.CriticalServicesAffected * 0.3;
|
||||
var trafficScore = trafficImpact.IsHighTrafficPeriod ? 0.2 : 0.1;
|
||||
var userScore = Math.Min(trafficImpact.EstimatedUsersAffected / 1000.0, 0.3);
|
||||
|
||||
var totalScore = Math.Min(serviceScore + criticalScore + trafficScore + userScore, 1.0);
|
||||
|
||||
return new BlastRadius
|
||||
{
|
||||
Score = totalScore,
|
||||
Category = CategorizeBlastRadius(totalScore),
|
||||
AffectedServiceCount = dependencyImpact.AffectedServices.Length,
|
||||
AffectedUserCount = trafficImpact.EstimatedUsersAffected,
|
||||
CriticalServiceCount = dependencyImpact.CriticalServicesAffected,
|
||||
Visualization = GenerateBlastRadiusVisualization(dependencyImpact)
|
||||
};
|
||||
}
|
||||
|
||||
private static BlastRadiusCategory CategorizeBlastRadius(double score)
|
||||
{
|
||||
return score switch
|
||||
{
|
||||
>= 0.8 => BlastRadiusCategory.Massive,
|
||||
>= 0.6 => BlastRadiusCategory.Large,
|
||||
>= 0.4 => BlastRadiusCategory.Medium,
|
||||
>= 0.2 => BlastRadiusCategory.Small,
|
||||
_ => BlastRadiusCategory.Minimal
|
||||
};
|
||||
}
|
||||
|
||||
private static RiskAssessment AssessRisk(
|
||||
BlastRadius blastRadius,
|
||||
DowntimeEstimate downtime,
|
||||
DataImpact dataImpact)
|
||||
{
|
||||
var blastRadiusRisk = blastRadius.Score * 0.3;
|
||||
var downtimeRisk = Math.Min(downtime.TotalEstimatedDowntime.TotalMinutes / 60.0, 1.0) * 0.3;
|
||||
var dataRisk = (dataImpact.HasBreakingChanges ? 0.5 : 0) +
|
||||
(dataImpact.PotentialDataLoss ? 0.5 : 0) * 0.4;
|
||||
|
||||
var overallRisk = blastRadiusRisk + downtimeRisk + dataRisk;
|
||||
|
||||
return new RiskAssessment
|
||||
{
|
||||
OverallRisk = Math.Min(overallRisk, 1.0),
|
||||
RiskLevel = CategorizeRisk(overallRisk),
|
||||
BlastRadiusRisk = blastRadiusRisk,
|
||||
DowntimeRisk = downtimeRisk,
|
||||
DataRisk = dataRisk,
|
||||
RequiresApproval = overallRisk > 0.5 || dataImpact.PotentialDataLoss,
|
||||
ApprovalLevel = DetermineApprovalLevel(overallRisk)
|
||||
};
|
||||
}
|
||||
|
||||
private static RiskLevel CategorizeRisk(double score) => score switch
|
||||
{
|
||||
>= 0.8 => RiskLevel.Critical,
|
||||
>= 0.6 => RiskLevel.High,
|
||||
>= 0.4 => RiskLevel.Medium,
|
||||
>= 0.2 => RiskLevel.Low,
|
||||
_ => RiskLevel.Minimal
|
||||
};
|
||||
|
||||
private static ApprovalLevel DetermineApprovalLevel(double risk) => risk switch
|
||||
{
|
||||
>= 0.8 => ApprovalLevel.Executive,
|
||||
>= 0.6 => ApprovalLevel.Director,
|
||||
>= 0.4 => ApprovalLevel.Manager,
|
||||
_ => ApprovalLevel.TeamLead
|
||||
};
|
||||
|
||||
private ImmutableArray<Mitigation> GenerateMitigations(
|
||||
BlastRadius blastRadius,
|
||||
RiskAssessment riskAssessment)
|
||||
{
|
||||
var mitigations = new List<Mitigation>();
|
||||
|
||||
if (blastRadius.Category >= BlastRadiusCategory.Large)
|
||||
{
|
||||
mitigations.Add(new Mitigation
|
||||
{
|
||||
Type = MitigationType.PartialRollback,
|
||||
Description = "Consider rolling back only the affected component",
|
||||
EffectivenessScore = 0.7,
|
||||
ImplementationComplexity = Complexity.Medium
|
||||
});
|
||||
|
||||
mitigations.Add(new Mitigation
|
||||
{
|
||||
Type = MitigationType.GradualRollback,
|
||||
Description = "Implement gradual rollback with traffic shifting",
|
||||
EffectivenessScore = 0.8,
|
||||
ImplementationComplexity = Complexity.High
|
||||
});
|
||||
}
|
||||
|
||||
if (riskAssessment.DowntimeRisk > 0.3)
|
||||
{
|
||||
mitigations.Add(new Mitigation
|
||||
{
|
||||
Type = MitigationType.BlueGreenSwitch,
|
||||
Description = "Use blue-green deployment for zero-downtime rollback",
|
||||
EffectivenessScore = 0.9,
|
||||
ImplementationComplexity = Complexity.Low
|
||||
});
|
||||
}
|
||||
|
||||
if (riskAssessment.DataRisk > 0.3)
|
||||
{
|
||||
mitigations.Add(new Mitigation
|
||||
{
|
||||
Type = MitigationType.DataBackup,
|
||||
Description = "Create data backup before rollback",
|
||||
EffectivenessScore = 0.95,
|
||||
ImplementationComplexity = Complexity.Medium
|
||||
});
|
||||
}
|
||||
|
||||
return mitigations.ToImmutableArray();
|
||||
}
|
||||
|
||||
private static RollbackStrategy DetermineOptimalStrategy(
|
||||
ImpactAnalysis fullRollback,
|
||||
List<ComponentImpact> componentImpacts)
|
||||
{
|
||||
var independentComponents = componentImpacts
|
||||
.Where(c => c.CanRollbackIndependently)
|
||||
.ToList();
|
||||
|
||||
if (independentComponents.Count > 0 &&
|
||||
fullRollback.BlastRadius.Category >= BlastRadiusCategory.Medium)
|
||||
{
|
||||
return new RollbackStrategy
|
||||
{
|
||||
Type = RollbackStrategyType.Partial,
|
||||
Components = independentComponents.Select(c => c.ComponentName).ToImmutableArray(),
|
||||
EstimatedImpactReduction = 0.5,
|
||||
Complexity = Complexity.Medium
|
||||
};
|
||||
}
|
||||
|
||||
if (fullRollback.RiskAssessment.RiskLevel <= RiskLevel.Low)
|
||||
{
|
||||
return new RollbackStrategy
|
||||
{
|
||||
Type = RollbackStrategyType.Full,
|
||||
Components = [],
|
||||
EstimatedImpactReduction = 0,
|
||||
Complexity = Complexity.Low
|
||||
};
|
||||
}
|
||||
|
||||
return new RollbackStrategy
|
||||
{
|
||||
Type = RollbackStrategyType.Gradual,
|
||||
Components = [],
|
||||
EstimatedImpactReduction = 0.3,
|
||||
Complexity = Complexity.High
|
||||
};
|
||||
}
|
||||
|
||||
private static string GenerateStrategyRecommendation(RollbackStrategy strategy)
|
||||
{
|
||||
return strategy.Type switch
|
||||
{
|
||||
RollbackStrategyType.Full => "Full rollback recommended - low overall risk",
|
||||
RollbackStrategyType.Partial =>
|
||||
$"Partial rollback of {string.Join(", ", strategy.Components)} recommended to reduce blast radius",
|
||||
RollbackStrategyType.Gradual =>
|
||||
"Gradual rollback with traffic shifting recommended due to high impact",
|
||||
_ => "Unable to determine optimal strategy"
|
||||
};
|
||||
}
|
||||
|
||||
private static ImpactLevel CalculateServiceImpactLevel(
|
||||
DependencyInfo dep,
|
||||
ServiceInfo service,
|
||||
long requestVolume)
|
||||
{
|
||||
if (service.Criticality >= ServiceCriticality.Critical)
|
||||
return ImpactLevel.Critical;
|
||||
|
||||
if (dep.DependencyType == DependencyType.Synchronous && requestVolume > 1000)
|
||||
return ImpactLevel.High;
|
||||
|
||||
if (requestVolume > 100)
|
||||
return ImpactLevel.Medium;
|
||||
|
||||
return ImpactLevel.Low;
|
||||
}
|
||||
|
||||
private static int CalculateAffectedUsers(long rps, int sessions)
|
||||
{
|
||||
return Math.Max(sessions, (int)(rps * 60 / 10)); // Rough estimate
|
||||
}
|
||||
|
||||
private async Task<TimeSpan> EstimateRollbackDurationAsync(
|
||||
DeploymentInfo deployment,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Base duration + scaling factor for complexity
|
||||
await Task.CompletedTask;
|
||||
var baseDuration = TimeSpan.FromMinutes(5);
|
||||
var complexityFactor = deployment.ComponentCount * 0.5;
|
||||
return baseDuration + TimeSpan.FromMinutes(complexityFactor);
|
||||
}
|
||||
|
||||
private async Task<TimeSpan> EstimatePropagationDelayAsync(
|
||||
DeploymentInfo deployment,
|
||||
CancellationToken ct)
|
||||
{
|
||||
await Task.CompletedTask;
|
||||
// Cache invalidation, DNS, load balancer updates
|
||||
return TimeSpan.FromMinutes(2);
|
||||
}
|
||||
|
||||
private static (TimeSpan Min, TimeSpan Max) CalculateConfidenceInterval(TimeSpan estimate)
|
||||
{
|
||||
return (
|
||||
TimeSpan.FromMinutes(estimate.TotalMinutes * 0.7),
|
||||
TimeSpan.FromMinutes(estimate.TotalMinutes * 1.5)
|
||||
);
|
||||
}
|
||||
|
||||
private async Task<decimal> GetHourlyRevenueAsync(string serviceName, CancellationToken ct)
|
||||
{
|
||||
await Task.CompletedTask;
|
||||
// Would integrate with business metrics
|
||||
return 0m;
|
||||
}
|
||||
|
||||
private static Complexity CalculateComponentComplexity(ImmutableArray<ComponentDependency> deps)
|
||||
{
|
||||
if (deps.Length > 10 || deps.Any(d => d.IsRequired))
|
||||
return Complexity.High;
|
||||
if (deps.Length > 3)
|
||||
return Complexity.Medium;
|
||||
return Complexity.Low;
|
||||
}
|
||||
|
||||
private static BlastRadiusVisualization GenerateBlastRadiusVisualization(DependencyImpact impact)
|
||||
{
|
||||
return new BlastRadiusVisualization
|
||||
{
|
||||
Nodes = impact.AffectedServices
|
||||
.Select(s => new VisualizationNode { Name = s.ServiceName, Level = s.ImpactLevel })
|
||||
.ToImmutableArray()
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IImpactAnalyzer
|
||||
{
|
||||
Task<ImpactAnalysis> AnalyzeImpactAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
Task<RollbackComparison> CompareRollbackOptionsAsync(Guid deploymentId, ImmutableArray<string> components, CancellationToken ct = default);
|
||||
Task<DependencyChain> GetAffectedDependencyChainAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IDependencyGraph
|
||||
{
|
||||
Task<ImmutableArray<DependencyInfo>> GetUpstreamDependenciesAsync(string serviceName, int maxDepth, CancellationToken ct = default);
|
||||
Task<ImmutableArray<DependencyInfo>> GetDownstreamDependenciesAsync(string serviceName, int maxDepth, CancellationToken ct = default);
|
||||
Task<ImmutableArray<ComponentDependency>> GetComponentDependenciesAsync(string componentName, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IServiceRegistry
|
||||
{
|
||||
Task<DeploymentInfo?> GetDeploymentAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
Task<ServiceInfo?> GetServiceAsync(string serviceName, CancellationToken ct = default);
|
||||
Task<ImmutableArray<SchemaChange>> GetSchemaChangesAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface ITrafficAnalyzer
|
||||
{
|
||||
Task<long> GetRequestVolumeAsync(string serviceName, TimeSpan window, CancellationToken ct = default);
|
||||
Task<long> GetPeakRequestVolumeAsync(string serviceName, TimeSpan window, CancellationToken ct = default);
|
||||
Task<double> GetErrorRateAsync(string serviceName, TimeSpan window, CancellationToken ct = default);
|
||||
Task<int> GetActiveUserSessionsAsync(string serviceName, CancellationToken ct = default);
|
||||
Task<long> GetComponentTrafficAsync(string componentName, TimeSpan window, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record ImpactAnalyzerConfig
|
||||
{
|
||||
public int MaxDependencyDepth { get; init; } = 3;
|
||||
public TimeSpan ValidationDuration { get; init; } = TimeSpan.FromMinutes(5);
|
||||
}
|
||||
|
||||
public sealed record ImpactAnalysis
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required string ServiceName { get; init; }
|
||||
public required BlastRadius BlastRadius { get; init; }
|
||||
public required DependencyImpact DependencyImpact { get; init; }
|
||||
public required TrafficImpact TrafficImpact { get; init; }
|
||||
public required DowntimeEstimate DowntimeEstimate { get; init; }
|
||||
public required DataImpact DataImpact { get; init; }
|
||||
public required RiskAssessment RiskAssessment { get; init; }
|
||||
public required ImmutableArray<Mitigation> Mitigations { get; init; }
|
||||
public required DateTimeOffset AnalyzedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record BlastRadius
|
||||
{
|
||||
public required double Score { get; init; }
|
||||
public required BlastRadiusCategory Category { get; init; }
|
||||
public required int AffectedServiceCount { get; init; }
|
||||
public required int AffectedUserCount { get; init; }
|
||||
public required int CriticalServiceCount { get; init; }
|
||||
public BlastRadiusVisualization? Visualization { get; init; }
|
||||
}
|
||||
|
||||
public enum BlastRadiusCategory { Minimal, Small, Medium, Large, Massive }
|
||||
|
||||
public sealed record DependencyImpact
|
||||
{
|
||||
public required int DirectDependencies { get; init; }
|
||||
public required int TransitiveDependencies { get; init; }
|
||||
public required ImmutableArray<AffectedService> AffectedServices { get; init; }
|
||||
public required long TotalRequestsAffected { get; init; }
|
||||
public required int CriticalServicesAffected { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AffectedService
|
||||
{
|
||||
public required string ServiceName { get; init; }
|
||||
public required DependencyType DependencyType { get; init; }
|
||||
public required ServiceCriticality Criticality { get; init; }
|
||||
public required long RequestVolume { get; init; }
|
||||
public required ImpactLevel ImpactLevel { get; init; }
|
||||
}
|
||||
|
||||
public enum DependencyType { Synchronous, Asynchronous, Database, Cache }
|
||||
public enum ServiceCriticality { Low, Medium, High, Critical }
|
||||
public enum ImpactLevel { Low, Medium, High, Critical }
|
||||
|
||||
public sealed record TrafficImpact
|
||||
{
|
||||
public required long CurrentRequestsPerSecond { get; init; }
|
||||
public required long PeakRequestsPerSecond { get; init; }
|
||||
public required double CurrentErrorRate { get; init; }
|
||||
public required int ActiveUserSessions { get; init; }
|
||||
public required int EstimatedUsersAffected { get; init; }
|
||||
public required bool IsHighTrafficPeriod { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DowntimeEstimate
|
||||
{
|
||||
public required TimeSpan RollbackDuration { get; init; }
|
||||
public required TimeSpan ValidationDuration { get; init; }
|
||||
public required TimeSpan PropagationDelay { get; init; }
|
||||
public required TimeSpan TotalEstimatedDowntime { get; init; }
|
||||
public required (TimeSpan Min, TimeSpan Max) ConfidenceInterval { get; init; }
|
||||
public required decimal EstimatedRevenueLoss { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DataImpact
|
||||
{
|
||||
public required ImmutableArray<SchemaChange> SchemaChanges { get; init; }
|
||||
public required bool HasBreakingChanges { get; init; }
|
||||
public required ImmutableArray<DataIntegrityRisk> DataIntegrityRisks { get; init; }
|
||||
public required bool RequiresDataMigration { get; init; }
|
||||
public required bool PotentialDataLoss { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SchemaChange
|
||||
{
|
||||
public required string ChangeType { get; init; }
|
||||
public required string TableName { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required bool IsBreakingChange { get; init; }
|
||||
public required bool RequiresMigration { get; init; }
|
||||
public required bool IsDataLoss { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DataIntegrityRisk
|
||||
{
|
||||
public required string ChangeType { get; init; }
|
||||
public required string AffectedTable { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required bool MigrationRequired { get; init; }
|
||||
public required RiskSeverity Severity { get; init; }
|
||||
}
|
||||
|
||||
public enum RiskSeverity { Low, Medium, High, Critical }
|
||||
|
||||
public sealed record RiskAssessment
|
||||
{
|
||||
public required double OverallRisk { get; init; }
|
||||
public required RiskLevel RiskLevel { get; init; }
|
||||
public required double BlastRadiusRisk { get; init; }
|
||||
public required double DowntimeRisk { get; init; }
|
||||
public required double DataRisk { get; init; }
|
||||
public required bool RequiresApproval { get; init; }
|
||||
public required ApprovalLevel ApprovalLevel { get; init; }
|
||||
}
|
||||
|
||||
public enum ApprovalLevel { TeamLead, Manager, Director, Executive }
|
||||
|
||||
public sealed record Mitigation
|
||||
{
|
||||
public required MitigationType Type { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public required double EffectivenessScore { get; init; }
|
||||
public required Complexity ImplementationComplexity { get; init; }
|
||||
}
|
||||
|
||||
public enum MitigationType { PartialRollback, GradualRollback, BlueGreenSwitch, DataBackup, MaintenanceWindow }
|
||||
public enum Complexity { Low, Medium, High }
|
||||
|
||||
public sealed record RollbackComparison
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required ImpactAnalysis FullRollbackImpact { get; init; }
|
||||
public required ImmutableArray<ComponentImpact> ComponentImpacts { get; init; }
|
||||
public required RollbackStrategy OptimalStrategy { get; init; }
|
||||
public required string Recommendation { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ComponentImpact
|
||||
{
|
||||
public required string ComponentName { get; init; }
|
||||
public required int DirectDependencies { get; init; }
|
||||
public required long RequestVolume { get; init; }
|
||||
public required bool CanRollbackIndependently { get; init; }
|
||||
public required Complexity RollbackComplexity { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RollbackStrategy
|
||||
{
|
||||
public required RollbackStrategyType Type { get; init; }
|
||||
public required ImmutableArray<string> Components { get; init; }
|
||||
public required double EstimatedImpactReduction { get; init; }
|
||||
public required Complexity Complexity { get; init; }
|
||||
}
|
||||
|
||||
public enum RollbackStrategyType { Full, Partial, Gradual, BlueGreen }
|
||||
|
||||
public sealed record DependencyChain
|
||||
{
|
||||
public required string ServiceName { get; init; }
|
||||
public required ImmutableArray<DependencyInfo> UpstreamDependencies { get; init; }
|
||||
public required ImmutableArray<DependencyInfo> DownstreamDependencies { get; init; }
|
||||
public required int TotalAffectedServices { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DependencyInfo
|
||||
{
|
||||
public required string ServiceName { get; init; }
|
||||
public required DependencyType DependencyType { get; init; }
|
||||
public required int Depth { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ComponentDependency
|
||||
{
|
||||
public required string ComponentName { get; init; }
|
||||
public required bool IsRequired { get; init; }
|
||||
}
|
||||
|
||||
public sealed record DeploymentInfo
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required string ServiceName { get; init; }
|
||||
public required int ComponentCount { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ServiceInfo
|
||||
{
|
||||
public required string ServiceName { get; init; }
|
||||
public required ServiceCriticality Criticality { get; init; }
|
||||
}
|
||||
|
||||
public sealed record BlastRadiusVisualization
|
||||
{
|
||||
public required ImmutableArray<VisualizationNode> Nodes { get; init; }
|
||||
}
|
||||
|
||||
public sealed record VisualizationNode
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required ImpactLevel Level { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,376 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback.Intelligence;
|
||||
|
||||
/// <summary>
|
||||
/// Detects anomalies in deployment metrics using multiple algorithms.
|
||||
/// </summary>
|
||||
public sealed class AnomalyDetector
|
||||
{
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly AnomalyDetectorConfig _config;
|
||||
private readonly ILogger<AnomalyDetector> _logger;
|
||||
|
||||
public AnomalyDetector(
|
||||
TimeProvider timeProvider,
|
||||
AnomalyDetectorConfig config,
|
||||
ILogger<AnomalyDetector> logger)
|
||||
{
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Detects anomalies in the given metrics.
|
||||
/// </summary>
|
||||
public AnomalyDetectionResult Detect(
|
||||
IReadOnlyList<MetricDataPoint> metrics,
|
||||
AnomalyDetectionContext context)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(metrics);
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
|
||||
if (metrics.Count < _config.MinDataPoints)
|
||||
{
|
||||
return new AnomalyDetectionResult
|
||||
{
|
||||
DeploymentId = context.DeploymentId,
|
||||
DetectedAt = _timeProvider.GetUtcNow(),
|
||||
Anomalies = [],
|
||||
Status = AnomalyDetectionStatus.InsufficientData,
|
||||
Message = $"Need at least {_config.MinDataPoints} data points, got {metrics.Count}"
|
||||
};
|
||||
}
|
||||
|
||||
var anomalies = new List<Anomaly>();
|
||||
|
||||
// Group by metric name
|
||||
var byMetric = metrics.GroupBy(m => m.Name);
|
||||
|
||||
foreach (var group in byMetric)
|
||||
{
|
||||
var values = group.OrderBy(m => m.Timestamp).ToList();
|
||||
var detected = DetectForMetric(group.Key, values, context);
|
||||
anomalies.AddRange(detected);
|
||||
}
|
||||
|
||||
var hasAnomalies = anomalies.Count > 0;
|
||||
var severity = hasAnomalies
|
||||
? anomalies.Max(a => a.Severity)
|
||||
: AnomalySeverity.None;
|
||||
|
||||
return new AnomalyDetectionResult
|
||||
{
|
||||
DeploymentId = context.DeploymentId,
|
||||
DetectedAt = _timeProvider.GetUtcNow(),
|
||||
Anomalies = anomalies.ToImmutableArray(),
|
||||
Status = hasAnomalies ? AnomalyDetectionStatus.AnomaliesDetected : AnomalyDetectionStatus.Normal,
|
||||
OverallSeverity = severity,
|
||||
AnomalyScore = CalculateOverallScore(anomalies)
|
||||
};
|
||||
}
|
||||
|
||||
private IEnumerable<Anomaly> DetectForMetric(
|
||||
string metricName,
|
||||
List<MetricDataPoint> values,
|
||||
AnomalyDetectionContext context)
|
||||
{
|
||||
var anomalies = new List<Anomaly>();
|
||||
|
||||
// Z-Score detection
|
||||
if (_config.EnableZScore)
|
||||
{
|
||||
anomalies.AddRange(DetectZScoreAnomalies(metricName, values, context));
|
||||
}
|
||||
|
||||
// Sliding window detection
|
||||
if (_config.EnableSlidingWindow)
|
||||
{
|
||||
anomalies.AddRange(DetectSlidingWindowAnomalies(metricName, values, context));
|
||||
}
|
||||
|
||||
// Rate of change detection
|
||||
if (_config.EnableRateOfChange)
|
||||
{
|
||||
anomalies.AddRange(DetectRateOfChangeAnomalies(metricName, values, context));
|
||||
}
|
||||
|
||||
return anomalies;
|
||||
}
|
||||
|
||||
private IEnumerable<Anomaly> DetectZScoreAnomalies(
|
||||
string metricName,
|
||||
List<MetricDataPoint> values,
|
||||
AnomalyDetectionContext context)
|
||||
{
|
||||
if (values.Count < 2)
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
|
||||
var numericValues = values.Select(v => v.Value).ToList();
|
||||
var mean = numericValues.Average();
|
||||
var stdDev = CalculateStandardDeviation(numericValues, mean);
|
||||
|
||||
if (stdDev < 0.0001) // Avoid division by zero
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
|
||||
foreach (var point in values)
|
||||
{
|
||||
var zScore = Math.Abs((point.Value - mean) / stdDev);
|
||||
|
||||
if (zScore > _config.ZScoreThreshold)
|
||||
{
|
||||
yield return new Anomaly
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
MetricName = metricName,
|
||||
DetectedAt = point.Timestamp,
|
||||
Value = point.Value,
|
||||
ExpectedRange = new ValueRange { Min = mean - 2 * stdDev, Max = mean + 2 * stdDev },
|
||||
Severity = ClassifySeverity(zScore),
|
||||
Algorithm = AnomalyAlgorithm.ZScore,
|
||||
Score = zScore,
|
||||
Message = $"Z-score {zScore:F2} exceeds threshold {_config.ZScoreThreshold}"
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private IEnumerable<Anomaly> DetectSlidingWindowAnomalies(
|
||||
string metricName,
|
||||
List<MetricDataPoint> values,
|
||||
AnomalyDetectionContext context)
|
||||
{
|
||||
var windowSize = _config.SlidingWindowSize;
|
||||
|
||||
if (values.Count < windowSize)
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
|
||||
for (int i = windowSize; i < values.Count; i++)
|
||||
{
|
||||
var window = values.Skip(i - windowSize).Take(windowSize).Select(v => v.Value).ToList();
|
||||
var windowMean = window.Average();
|
||||
var windowStdDev = CalculateStandardDeviation(window, windowMean);
|
||||
|
||||
var current = values[i];
|
||||
var deviation = Math.Abs(current.Value - windowMean);
|
||||
|
||||
if (windowStdDev > 0.0001 && deviation > windowStdDev * _config.SlidingWindowDeviationMultiplier)
|
||||
{
|
||||
var score = deviation / windowStdDev;
|
||||
|
||||
yield return new Anomaly
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
MetricName = metricName,
|
||||
DetectedAt = current.Timestamp,
|
||||
Value = current.Value,
|
||||
ExpectedRange = new ValueRange
|
||||
{
|
||||
Min = windowMean - windowStdDev * 2,
|
||||
Max = windowMean + windowStdDev * 2
|
||||
},
|
||||
Severity = ClassifySeverity(score),
|
||||
Algorithm = AnomalyAlgorithm.SlidingWindow,
|
||||
Score = score,
|
||||
Message = $"Value deviates {score:F2}σ from sliding window average"
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private IEnumerable<Anomaly> DetectRateOfChangeAnomalies(
|
||||
string metricName,
|
||||
List<MetricDataPoint> values,
|
||||
AnomalyDetectionContext context)
|
||||
{
|
||||
if (values.Count < 2)
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
|
||||
for (int i = 1; i < values.Count; i++)
|
||||
{
|
||||
var previous = values[i - 1];
|
||||
var current = values[i];
|
||||
|
||||
if (previous.Value == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var changeRate = Math.Abs((current.Value - previous.Value) / previous.Value) * 100;
|
||||
|
||||
if (changeRate > _config.RateOfChangeThresholdPercent)
|
||||
{
|
||||
yield return new Anomaly
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
MetricName = metricName,
|
||||
DetectedAt = current.Timestamp,
|
||||
Value = current.Value,
|
||||
PreviousValue = previous.Value,
|
||||
Severity = ClassifyRateOfChangeSeverity(changeRate),
|
||||
Algorithm = AnomalyAlgorithm.RateOfChange,
|
||||
Score = changeRate / 100,
|
||||
Message = $"Value changed by {changeRate:F1}% (threshold: {_config.RateOfChangeThresholdPercent}%)"
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static double CalculateStandardDeviation(List<double> values, double mean)
|
||||
{
|
||||
if (values.Count < 2)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
var sumOfSquares = values.Sum(v => Math.Pow(v - mean, 2));
|
||||
return Math.Sqrt(sumOfSquares / (values.Count - 1));
|
||||
}
|
||||
|
||||
private AnomalySeverity ClassifySeverity(double score)
|
||||
{
|
||||
return score switch
|
||||
{
|
||||
> 5.0 => AnomalySeverity.Critical,
|
||||
> 4.0 => AnomalySeverity.High,
|
||||
> 3.0 => AnomalySeverity.Medium,
|
||||
> 2.0 => AnomalySeverity.Low,
|
||||
_ => AnomalySeverity.None
|
||||
};
|
||||
}
|
||||
|
||||
private AnomalySeverity ClassifyRateOfChangeSeverity(double changePercent)
|
||||
{
|
||||
return changePercent switch
|
||||
{
|
||||
> 500 => AnomalySeverity.Critical,
|
||||
> 200 => AnomalySeverity.High,
|
||||
> 100 => AnomalySeverity.Medium,
|
||||
> 50 => AnomalySeverity.Low,
|
||||
_ => AnomalySeverity.None
|
||||
};
|
||||
}
|
||||
|
||||
private double CalculateOverallScore(List<Anomaly> anomalies)
|
||||
{
|
||||
if (anomalies.Count == 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Weighted average based on severity
|
||||
var weightedSum = anomalies.Sum(a => a.Score * (int)a.Severity);
|
||||
var totalWeight = anomalies.Sum(a => (int)a.Severity);
|
||||
|
||||
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for anomaly detection.
|
||||
/// </summary>
|
||||
public sealed record AnomalyDetectorConfig
|
||||
{
|
||||
public int MinDataPoints { get; init; } = 10;
|
||||
public bool EnableZScore { get; init; } = true;
|
||||
public double ZScoreThreshold { get; init; } = 3.0;
|
||||
public bool EnableSlidingWindow { get; init; } = true;
|
||||
public int SlidingWindowSize { get; init; } = 10;
|
||||
public double SlidingWindowDeviationMultiplier { get; init; } = 3.0;
|
||||
public bool EnableRateOfChange { get; init; } = true;
|
||||
public double RateOfChangeThresholdPercent { get; init; } = 50.0;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Context for anomaly detection.
|
||||
/// </summary>
|
||||
public sealed record AnomalyDetectionContext
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public MetricsSnapshot? Baseline { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of anomaly detection.
|
||||
/// </summary>
|
||||
public sealed record AnomalyDetectionResult
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required DateTimeOffset DetectedAt { get; init; }
|
||||
public required ImmutableArray<Anomaly> Anomalies { get; init; }
|
||||
public required AnomalyDetectionStatus Status { get; init; }
|
||||
public AnomalySeverity OverallSeverity { get; init; }
|
||||
public double AnomalyScore { get; init; }
|
||||
public string? Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A detected anomaly.
|
||||
/// </summary>
|
||||
public sealed record Anomaly
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string MetricName { get; init; }
|
||||
public required DateTimeOffset DetectedAt { get; init; }
|
||||
public required double Value { get; init; }
|
||||
public double? PreviousValue { get; init; }
|
||||
public ValueRange? ExpectedRange { get; init; }
|
||||
public required AnomalySeverity Severity { get; init; }
|
||||
public required AnomalyAlgorithm Algorithm { get; init; }
|
||||
public required double Score { get; init; }
|
||||
public string? Message { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Expected value range.
|
||||
/// </summary>
|
||||
public sealed record ValueRange
|
||||
{
|
||||
public required double Min { get; init; }
|
||||
public required double Max { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Anomaly detection status.
|
||||
/// </summary>
|
||||
public enum AnomalyDetectionStatus
|
||||
{
|
||||
Normal,
|
||||
AnomaliesDetected,
|
||||
InsufficientData,
|
||||
Error
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Severity of detected anomaly.
|
||||
/// </summary>
|
||||
public enum AnomalySeverity
|
||||
{
|
||||
None = 0,
|
||||
Low = 1,
|
||||
Medium = 2,
|
||||
High = 3,
|
||||
Critical = 4
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm used for detection.
|
||||
/// </summary>
|
||||
public enum AnomalyAlgorithm
|
||||
{
|
||||
ZScore,
|
||||
SlidingWindow,
|
||||
RateOfChange,
|
||||
IsolationForest,
|
||||
SeasonalDecomposition
|
||||
}
|
||||
@@ -0,0 +1,340 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback.Intelligence;
|
||||
|
||||
/// <summary>
|
||||
/// Manages deployment baselines for health comparison.
|
||||
/// </summary>
|
||||
public sealed class BaselineManager
|
||||
{
|
||||
private readonly IBaselineStore _store;
|
||||
private readonly MetricsCollector _metricsCollector;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly BaselineManagerConfig _config;
|
||||
private readonly ILogger<BaselineManager> _logger;
|
||||
|
||||
public BaselineManager(
|
||||
IBaselineStore store,
|
||||
MetricsCollector metricsCollector,
|
||||
TimeProvider timeProvider,
|
||||
BaselineManagerConfig config,
|
||||
ILogger<BaselineManager> logger)
|
||||
{
|
||||
_store = store;
|
||||
_metricsCollector = metricsCollector;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a baseline from current metrics.
|
||||
/// </summary>
|
||||
public async Task<DeploymentBaseline> CreateBaselineAsync(
|
||||
CreateBaselineRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Creating baseline for deployment {DeploymentId}",
|
||||
request.DeploymentId);
|
||||
|
||||
// Collect current metrics
|
||||
var snapshot = await _metricsCollector.CollectAsync(
|
||||
new MetricsQuery
|
||||
{
|
||||
DeploymentId = request.DeploymentId,
|
||||
TimeRange = TimeRange.Last(request.SampleDuration ?? _config.DefaultSampleDuration),
|
||||
Resolution = _config.BaselineResolution
|
||||
},
|
||||
ct);
|
||||
|
||||
// Calculate statistical summary
|
||||
var metrics = snapshot.Metrics;
|
||||
var metricSummaries = metrics
|
||||
.GroupBy(m => m.Name)
|
||||
.Select(g => CreateMetricSummary(g.Key, g.ToList()))
|
||||
.ToImmutableArray();
|
||||
|
||||
var baseline = new DeploymentBaseline
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
DeploymentId = request.DeploymentId,
|
||||
ReleaseId = request.ReleaseId,
|
||||
ReleaseName = request.ReleaseName,
|
||||
EnvironmentId = request.EnvironmentId,
|
||||
CreatedAt = _timeProvider.GetUtcNow(),
|
||||
SampleDuration = request.SampleDuration ?? _config.DefaultSampleDuration,
|
||||
MetricSummaries = metricSummaries,
|
||||
Status = BaselineStatus.Active,
|
||||
DataPointCount = metrics.Length
|
||||
};
|
||||
|
||||
await _store.SaveAsync(baseline, ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Created baseline {BaselineId} with {MetricCount} metric summaries",
|
||||
baseline.Id, metricSummaries.Length);
|
||||
|
||||
return baseline;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the active baseline for a deployment.
|
||||
/// </summary>
|
||||
public async Task<DeploymentBaseline?> GetActiveBaselineAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await _store.GetActiveAsync(deploymentId, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets baseline for a specific release.
|
||||
/// </summary>
|
||||
public async Task<DeploymentBaseline?> GetBaselineForReleaseAsync(
|
||||
Guid releaseId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await _store.GetByReleaseAsync(releaseId, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates a baseline with new samples.
|
||||
/// </summary>
|
||||
public async Task<DeploymentBaseline> UpdateBaselineAsync(
|
||||
Guid baselineId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var baseline = await _store.GetAsync(baselineId, ct)
|
||||
?? throw new InvalidOperationException($"Baseline {baselineId} not found");
|
||||
|
||||
// Collect new metrics
|
||||
var snapshot = await _metricsCollector.CollectAsync(
|
||||
new MetricsQuery
|
||||
{
|
||||
DeploymentId = baseline.DeploymentId,
|
||||
TimeRange = TimeRange.Last(_config.UpdateSampleDuration),
|
||||
Resolution = _config.BaselineResolution
|
||||
},
|
||||
ct);
|
||||
|
||||
// Merge with existing summaries
|
||||
var existingByName = baseline.MetricSummaries.ToDictionary(m => m.MetricName);
|
||||
var newSummaries = new List<MetricSummary>();
|
||||
|
||||
foreach (var group in snapshot.Metrics.GroupBy(m => m.Name))
|
||||
{
|
||||
var newSummary = CreateMetricSummary(group.Key, group.ToList());
|
||||
|
||||
if (existingByName.TryGetValue(group.Key, out var existing))
|
||||
{
|
||||
// Merge using exponential moving average
|
||||
newSummary = MergeSummaries(existing, newSummary);
|
||||
}
|
||||
|
||||
newSummaries.Add(newSummary);
|
||||
}
|
||||
|
||||
// Keep metrics not in the new snapshot
|
||||
foreach (var existing in baseline.MetricSummaries)
|
||||
{
|
||||
if (!newSummaries.Any(n => n.MetricName == existing.MetricName))
|
||||
{
|
||||
newSummaries.Add(existing);
|
||||
}
|
||||
}
|
||||
|
||||
var updated = baseline with
|
||||
{
|
||||
MetricSummaries = newSummaries.ToImmutableArray(),
|
||||
LastUpdatedAt = _timeProvider.GetUtcNow(),
|
||||
DataPointCount = baseline.DataPointCount + snapshot.Metrics.Length
|
||||
};
|
||||
|
||||
await _store.SaveAsync(updated, ct);
|
||||
|
||||
_logger.LogDebug(
|
||||
"Updated baseline {BaselineId} with {NewPoints} new data points",
|
||||
baselineId, snapshot.Metrics.Length);
|
||||
|
||||
return updated;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Deactivates a baseline.
|
||||
/// </summary>
|
||||
public async Task DeactivateBaselineAsync(
|
||||
Guid baselineId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var baseline = await _store.GetAsync(baselineId, ct)
|
||||
?? throw new InvalidOperationException($"Baseline {baselineId} not found");
|
||||
|
||||
var updated = baseline with
|
||||
{
|
||||
Status = BaselineStatus.Inactive,
|
||||
DeactivatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
await _store.SaveAsync(updated, ct);
|
||||
|
||||
_logger.LogInformation("Deactivated baseline {BaselineId}", baselineId);
|
||||
}
|
||||
|
||||
private MetricSummary CreateMetricSummary(string metricName, List<MetricDataPoint> points)
|
||||
{
|
||||
if (points.Count == 0)
|
||||
{
|
||||
return new MetricSummary
|
||||
{
|
||||
MetricName = metricName,
|
||||
Mean = 0,
|
||||
Median = 0,
|
||||
StdDev = 0,
|
||||
Min = 0,
|
||||
Max = 0,
|
||||
P95 = 0,
|
||||
P99 = 0,
|
||||
SampleCount = 0
|
||||
};
|
||||
}
|
||||
|
||||
var values = points.Select(p => p.Value).OrderBy(v => v).ToList();
|
||||
var mean = values.Average();
|
||||
|
||||
return new MetricSummary
|
||||
{
|
||||
MetricName = metricName,
|
||||
Mean = mean,
|
||||
Median = GetPercentile(values, 50),
|
||||
StdDev = CalculateStandardDeviation(values, mean),
|
||||
Min = values.First(),
|
||||
Max = values.Last(),
|
||||
P95 = GetPercentile(values, 95),
|
||||
P99 = GetPercentile(values, 99),
|
||||
SampleCount = points.Count
|
||||
};
|
||||
}
|
||||
|
||||
private MetricSummary MergeSummaries(MetricSummary existing, MetricSummary newSummary)
|
||||
{
|
||||
var alpha = _config.ExponentialMovingAverageAlpha;
|
||||
|
||||
return new MetricSummary
|
||||
{
|
||||
MetricName = existing.MetricName,
|
||||
Mean = (1 - alpha) * existing.Mean + alpha * newSummary.Mean,
|
||||
Median = (1 - alpha) * existing.Median + alpha * newSummary.Median,
|
||||
StdDev = (1 - alpha) * existing.StdDev + alpha * newSummary.StdDev,
|
||||
Min = Math.Min(existing.Min, newSummary.Min),
|
||||
Max = Math.Max(existing.Max, newSummary.Max),
|
||||
P95 = (1 - alpha) * existing.P95 + alpha * newSummary.P95,
|
||||
P99 = (1 - alpha) * existing.P99 + alpha * newSummary.P99,
|
||||
SampleCount = existing.SampleCount + newSummary.SampleCount
|
||||
};
|
||||
}
|
||||
|
||||
private static double GetPercentile(List<double> sortedValues, int percentile)
|
||||
{
|
||||
if (sortedValues.Count == 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
var index = (int)Math.Ceiling(percentile / 100.0 * sortedValues.Count) - 1;
|
||||
return sortedValues[Math.Max(0, Math.Min(index, sortedValues.Count - 1))];
|
||||
}
|
||||
|
||||
private static double CalculateStandardDeviation(List<double> values, double mean)
|
||||
{
|
||||
if (values.Count < 2)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
var sumOfSquares = values.Sum(v => Math.Pow(v - mean, 2));
|
||||
return Math.Sqrt(sumOfSquares / (values.Count - 1));
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for baseline manager.
|
||||
/// </summary>
|
||||
public sealed record BaselineManagerConfig
|
||||
{
|
||||
public TimeSpan DefaultSampleDuration { get; init; } = TimeSpan.FromHours(1);
|
||||
public TimeSpan BaselineResolution { get; init; } = TimeSpan.FromMinutes(1);
|
||||
public TimeSpan UpdateSampleDuration { get; init; } = TimeSpan.FromMinutes(5);
|
||||
public double ExponentialMovingAverageAlpha { get; init; } = 0.2;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request to create a baseline.
|
||||
/// </summary>
|
||||
public sealed record CreateBaselineRequest
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public Guid? ReleaseId { get; init; }
|
||||
public string? ReleaseName { get; init; }
|
||||
public Guid? EnvironmentId { get; init; }
|
||||
public TimeSpan? SampleDuration { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A deployment baseline for health comparison.
|
||||
/// </summary>
|
||||
public sealed record DeploymentBaseline
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public Guid? ReleaseId { get; init; }
|
||||
public string? ReleaseName { get; init; }
|
||||
public Guid? EnvironmentId { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public DateTimeOffset? LastUpdatedAt { get; init; }
|
||||
public DateTimeOffset? DeactivatedAt { get; init; }
|
||||
public required TimeSpan SampleDuration { get; init; }
|
||||
public required ImmutableArray<MetricSummary> MetricSummaries { get; init; }
|
||||
public required BaselineStatus Status { get; init; }
|
||||
public required int DataPointCount { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Statistical summary of a metric.
|
||||
/// </summary>
|
||||
public sealed record MetricSummary
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required double Mean { get; init; }
|
||||
public required double Median { get; init; }
|
||||
public required double StdDev { get; init; }
|
||||
public required double Min { get; init; }
|
||||
public required double Max { get; init; }
|
||||
public required double P95 { get; init; }
|
||||
public required double P99 { get; init; }
|
||||
public required int SampleCount { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Baseline status.
|
||||
/// </summary>
|
||||
public enum BaselineStatus
|
||||
{
|
||||
Active,
|
||||
Inactive,
|
||||
Superseded
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for baseline storage.
|
||||
/// </summary>
|
||||
public interface IBaselineStore
|
||||
{
|
||||
Task SaveAsync(DeploymentBaseline baseline, CancellationToken ct = default);
|
||||
Task<DeploymentBaseline?> GetAsync(Guid id, CancellationToken ct = default);
|
||||
Task<DeploymentBaseline?> GetActiveAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
Task<DeploymentBaseline?> GetByReleaseAsync(Guid releaseId, CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,316 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback.Intelligence;
|
||||
|
||||
/// <summary>
|
||||
/// Collects metrics from multiple providers for health analysis.
|
||||
/// </summary>
|
||||
public sealed class MetricsCollector
|
||||
{
|
||||
private readonly IEnumerable<IMetricsProvider> _providers;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly MetricsCollectorConfig _config;
|
||||
private readonly ILogger<MetricsCollector> _logger;
|
||||
|
||||
public MetricsCollector(
|
||||
IEnumerable<IMetricsProvider> providers,
|
||||
TimeProvider timeProvider,
|
||||
MetricsCollectorConfig config,
|
||||
ILogger<MetricsCollector> logger)
|
||||
{
|
||||
_providers = providers;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Collects metrics for a deployment.
|
||||
/// </summary>
|
||||
public async Task<MetricsSnapshot> CollectAsync(
|
||||
MetricsQuery query,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(query);
|
||||
|
||||
_logger.LogDebug(
|
||||
"Collecting metrics for deployment {DeploymentId} from {ProviderCount} providers",
|
||||
query.DeploymentId, _providers.Count());
|
||||
|
||||
var allMetrics = new List<MetricDataPoint>();
|
||||
var providerResults = new Dictionary<string, ProviderCollectionResult>();
|
||||
|
||||
foreach (var provider in _providers)
|
||||
{
|
||||
if (!provider.IsEnabled)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var metrics = await provider.CollectAsync(query, ct);
|
||||
allMetrics.AddRange(metrics);
|
||||
|
||||
providerResults[provider.Name] = new ProviderCollectionResult
|
||||
{
|
||||
ProviderName = provider.Name,
|
||||
Success = true,
|
||||
MetricsCount = metrics.Count
|
||||
};
|
||||
|
||||
_logger.LogDebug(
|
||||
"Collected {Count} metrics from {Provider}",
|
||||
metrics.Count, provider.Name);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex,
|
||||
"Failed to collect metrics from {Provider}",
|
||||
provider.Name);
|
||||
|
||||
providerResults[provider.Name] = new ProviderCollectionResult
|
||||
{
|
||||
ProviderName = provider.Name,
|
||||
Success = false,
|
||||
Error = ex.Message
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return new MetricsSnapshot
|
||||
{
|
||||
DeploymentId = query.DeploymentId,
|
||||
CollectedAt = _timeProvider.GetUtcNow(),
|
||||
Metrics = allMetrics.ToImmutableArray(),
|
||||
ProviderResults = providerResults.ToImmutableDictionary(),
|
||||
TimeRange = query.TimeRange
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Collects specific metric types for comparison.
|
||||
/// </summary>
|
||||
public async Task<MetricsSnapshot> CollectForComparisonAsync(
|
||||
Guid deploymentId,
|
||||
IReadOnlyList<string> metricNames,
|
||||
TimeRange timeRange,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var query = new MetricsQuery
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
MetricNames = metricNames.ToImmutableArray(),
|
||||
TimeRange = timeRange,
|
||||
Resolution = _config.DefaultResolution
|
||||
};
|
||||
|
||||
return await CollectAsync(query, ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Collects key performance indicators.
|
||||
/// </summary>
|
||||
public async Task<KpiSnapshot> CollectKpisAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var query = new MetricsQuery
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
MetricNames = _config.KpiMetrics,
|
||||
TimeRange = TimeRange.Last(TimeSpan.FromMinutes(5)),
|
||||
Resolution = TimeSpan.FromSeconds(10)
|
||||
};
|
||||
|
||||
var snapshot = await CollectAsync(query, ct);
|
||||
|
||||
return new KpiSnapshot
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
CollectedAt = snapshot.CollectedAt,
|
||||
ErrorRate = CalculateErrorRate(snapshot.Metrics),
|
||||
LatencyP50 = CalculateLatencyPercentile(snapshot.Metrics, 50),
|
||||
LatencyP95 = CalculateLatencyPercentile(snapshot.Metrics, 95),
|
||||
LatencyP99 = CalculateLatencyPercentile(snapshot.Metrics, 99),
|
||||
RequestRate = CalculateRequestRate(snapshot.Metrics),
|
||||
CpuUsage = CalculateAverage(snapshot.Metrics, "cpu_usage"),
|
||||
MemoryUsage = CalculateAverage(snapshot.Metrics, "memory_usage")
|
||||
};
|
||||
}
|
||||
|
||||
private double CalculateErrorRate(ImmutableArray<MetricDataPoint> metrics)
|
||||
{
|
||||
var errorMetrics = metrics.Where(m =>
|
||||
m.Name.Contains("error", StringComparison.OrdinalIgnoreCase) ||
|
||||
m.Name.Contains("5xx", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var totalMetrics = metrics.Where(m =>
|
||||
m.Name.Contains("request", StringComparison.OrdinalIgnoreCase) ||
|
||||
m.Name.Contains("total", StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
var errors = errorMetrics.Sum(m => m.Value);
|
||||
var total = totalMetrics.Sum(m => m.Value);
|
||||
|
||||
return total > 0 ? errors / total * 100 : 0;
|
||||
}
|
||||
|
||||
private double CalculateLatencyPercentile(ImmutableArray<MetricDataPoint> metrics, int percentile)
|
||||
{
|
||||
var latencyMetrics = metrics
|
||||
.Where(m => m.Name.Contains($"p{percentile}", StringComparison.OrdinalIgnoreCase) ||
|
||||
m.Name.Contains("latency", StringComparison.OrdinalIgnoreCase))
|
||||
.OrderBy(m => m.Value)
|
||||
.ToList();
|
||||
|
||||
if (latencyMetrics.Count == 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
var index = (int)Math.Ceiling(percentile / 100.0 * latencyMetrics.Count) - 1;
|
||||
return latencyMetrics[Math.Max(0, index)].Value;
|
||||
}
|
||||
|
||||
private double CalculateRequestRate(ImmutableArray<MetricDataPoint> metrics)
|
||||
{
|
||||
return metrics
|
||||
.Where(m => m.Name.Contains("request", StringComparison.OrdinalIgnoreCase) &&
|
||||
m.Name.Contains("rate", StringComparison.OrdinalIgnoreCase))
|
||||
.DefaultIfEmpty(new MetricDataPoint { Value = 0 })
|
||||
.Average(m => m.Value);
|
||||
}
|
||||
|
||||
private double CalculateAverage(ImmutableArray<MetricDataPoint> metrics, string namePattern)
|
||||
{
|
||||
var matching = metrics.Where(m =>
|
||||
m.Name.Contains(namePattern, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
return matching.Any() ? matching.Average(m => m.Value) : 0;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for metrics collection.
|
||||
/// </summary>
|
||||
public sealed record MetricsCollectorConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// Default resolution for metrics queries.
|
||||
/// </summary>
|
||||
public TimeSpan DefaultResolution { get; init; } = TimeSpan.FromSeconds(30);
|
||||
|
||||
/// <summary>
|
||||
/// Key performance indicator metric names.
|
||||
/// </summary>
|
||||
public ImmutableArray<string> KpiMetrics { get; init; } =
|
||||
[
|
||||
"http_request_duration_seconds",
|
||||
"http_requests_total",
|
||||
"http_request_errors_total",
|
||||
"process_cpu_seconds_total",
|
||||
"process_resident_memory_bytes"
|
||||
];
|
||||
|
||||
/// <summary>
|
||||
/// Maximum time range for a single query.
|
||||
/// </summary>
|
||||
public TimeSpan MaxQueryRange { get; init; } = TimeSpan.FromHours(24);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Query for metrics collection.
|
||||
/// </summary>
|
||||
public sealed record MetricsQuery
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public ImmutableArray<string> MetricNames { get; init; } = [];
|
||||
public required TimeRange TimeRange { get; init; }
|
||||
public TimeSpan Resolution { get; init; } = TimeSpan.FromSeconds(30);
|
||||
public ImmutableDictionary<string, string> Labels { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Time range for queries.
|
||||
/// </summary>
|
||||
public sealed record TimeRange
|
||||
{
|
||||
public required DateTimeOffset Start { get; init; }
|
||||
public required DateTimeOffset End { get; init; }
|
||||
|
||||
public TimeSpan Duration => End - Start;
|
||||
|
||||
public static TimeRange Last(TimeSpan duration)
|
||||
{
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
return new TimeRange
|
||||
{
|
||||
Start = now - duration,
|
||||
End = now
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Snapshot of collected metrics.
|
||||
/// </summary>
|
||||
public sealed record MetricsSnapshot
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required DateTimeOffset CollectedAt { get; init; }
|
||||
public required ImmutableArray<MetricDataPoint> Metrics { get; init; }
|
||||
public required ImmutableDictionary<string, ProviderCollectionResult> ProviderResults { get; init; }
|
||||
public required TimeRange TimeRange { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A single metric data point.
|
||||
/// </summary>
|
||||
public sealed record MetricDataPoint
|
||||
{
|
||||
public string Name { get; init; } = "";
|
||||
public double Value { get; init; }
|
||||
public DateTimeOffset Timestamp { get; init; }
|
||||
public ImmutableDictionary<string, string> Labels { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
public string? Unit { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of collection from a single provider.
|
||||
/// </summary>
|
||||
public sealed record ProviderCollectionResult
|
||||
{
|
||||
public required string ProviderName { get; init; }
|
||||
public required bool Success { get; init; }
|
||||
public int MetricsCount { get; init; }
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Key performance indicators snapshot.
|
||||
/// </summary>
|
||||
public sealed record KpiSnapshot
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required DateTimeOffset CollectedAt { get; init; }
|
||||
public double ErrorRate { get; init; }
|
||||
public double LatencyP50 { get; init; }
|
||||
public double LatencyP95 { get; init; }
|
||||
public double LatencyP99 { get; init; }
|
||||
public double RequestRate { get; init; }
|
||||
public double CpuUsage { get; init; }
|
||||
public double MemoryUsage { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for metrics providers.
|
||||
/// </summary>
|
||||
public interface IMetricsProvider
|
||||
{
|
||||
string Name { get; }
|
||||
bool IsEnabled { get; }
|
||||
Task<IReadOnlyList<MetricDataPoint>> CollectAsync(MetricsQuery query, CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,445 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback.Intelligence;
|
||||
|
||||
/// <summary>
|
||||
/// Makes automated rollback decisions based on health and policies.
|
||||
/// </summary>
|
||||
public sealed class RollbackDecider
|
||||
{
|
||||
private readonly AnomalyDetector _anomalyDetector;
|
||||
private readonly BaselineManager _baselineManager;
|
||||
private readonly MetricsCollector _metricsCollector;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly RollbackDeciderConfig _config;
|
||||
private readonly ILogger<RollbackDecider> _logger;
|
||||
|
||||
public RollbackDecider(
|
||||
AnomalyDetector anomalyDetector,
|
||||
BaselineManager baselineManager,
|
||||
MetricsCollector metricsCollector,
|
||||
TimeProvider timeProvider,
|
||||
RollbackDeciderConfig config,
|
||||
ILogger<RollbackDecider> logger)
|
||||
{
|
||||
_anomalyDetector = anomalyDetector;
|
||||
_baselineManager = baselineManager;
|
||||
_metricsCollector = metricsCollector;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates whether a rollback should be triggered.
|
||||
/// </summary>
|
||||
public async Task<RollbackDecision> EvaluateAsync(
|
||||
RollbackEvaluationRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(request);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Evaluating rollback for deployment {DeploymentId}",
|
||||
request.DeploymentId);
|
||||
|
||||
// Collect current metrics
|
||||
var currentMetrics = await _metricsCollector.CollectAsync(
|
||||
new MetricsQuery
|
||||
{
|
||||
DeploymentId = request.DeploymentId,
|
||||
TimeRange = TimeRange.Last(_config.EvaluationWindow),
|
||||
Resolution = TimeSpan.FromSeconds(10)
|
||||
},
|
||||
ct);
|
||||
|
||||
// Get baseline for comparison
|
||||
var baseline = await _baselineManager.GetActiveBaselineAsync(request.DeploymentId, ct);
|
||||
|
||||
// Detect anomalies
|
||||
var anomalyResult = _anomalyDetector.Detect(
|
||||
currentMetrics.Metrics.ToList(),
|
||||
new AnomalyDetectionContext
|
||||
{
|
||||
DeploymentId = request.DeploymentId,
|
||||
Baseline = baseline is not null ? await ConvertBaselineToSnapshot(baseline, ct) : null
|
||||
});
|
||||
|
||||
// Evaluate health thresholds
|
||||
var thresholdViolations = EvaluateThresholds(currentMetrics, request.Policy);
|
||||
|
||||
// Evaluate baseline comparison
|
||||
var baselineViolations = baseline is not null
|
||||
? EvaluateBaselineDeviation(currentMetrics, baseline, request.Policy)
|
||||
: [];
|
||||
|
||||
// Make decision
|
||||
var shouldRollback = ShouldTriggerRollback(
|
||||
anomalyResult,
|
||||
thresholdViolations,
|
||||
baselineViolations,
|
||||
request.Policy);
|
||||
|
||||
var decision = new RollbackDecision
|
||||
{
|
||||
DeploymentId = request.DeploymentId,
|
||||
EvaluatedAt = _timeProvider.GetUtcNow(),
|
||||
ShouldRollback = shouldRollback,
|
||||
Confidence = CalculateConfidence(anomalyResult, thresholdViolations, baselineViolations),
|
||||
AnomalyResult = anomalyResult,
|
||||
ThresholdViolations = thresholdViolations.ToImmutableArray(),
|
||||
BaselineViolations = baselineViolations.ToImmutableArray(),
|
||||
Reason = BuildDecisionReason(shouldRollback, anomalyResult, thresholdViolations, baselineViolations),
|
||||
RecommendedAction = DetermineAction(shouldRollback, anomalyResult.OverallSeverity)
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Rollback decision for {DeploymentId}: {ShouldRollback} (confidence: {Confidence:P0})",
|
||||
request.DeploymentId, shouldRollback, decision.Confidence);
|
||||
|
||||
return decision;
|
||||
}
|
||||
|
||||
private List<ThresholdViolation> EvaluateThresholds(
|
||||
MetricsSnapshot snapshot,
|
||||
RollbackPolicy policy)
|
||||
{
|
||||
var violations = new List<ThresholdViolation>();
|
||||
|
||||
foreach (var threshold in policy.Thresholds)
|
||||
{
|
||||
var metricValues = snapshot.Metrics
|
||||
.Where(m => m.Name == threshold.MetricName)
|
||||
.ToList();
|
||||
|
||||
if (metricValues.Count == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var avgValue = metricValues.Average(m => m.Value);
|
||||
var isViolated = threshold.Operator switch
|
||||
{
|
||||
ThresholdOperator.GreaterThan => avgValue > threshold.Value,
|
||||
ThresholdOperator.LessThan => avgValue < threshold.Value,
|
||||
ThresholdOperator.GreaterThanOrEqual => avgValue >= threshold.Value,
|
||||
ThresholdOperator.LessThanOrEqual => avgValue <= threshold.Value,
|
||||
_ => false
|
||||
};
|
||||
|
||||
if (isViolated)
|
||||
{
|
||||
violations.Add(new ThresholdViolation
|
||||
{
|
||||
MetricName = threshold.MetricName,
|
||||
ThresholdValue = threshold.Value,
|
||||
ActualValue = avgValue,
|
||||
Operator = threshold.Operator,
|
||||
Severity = threshold.Severity
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return violations;
|
||||
}
|
||||
|
||||
private List<BaselineViolation> EvaluateBaselineDeviation(
|
||||
MetricsSnapshot current,
|
||||
DeploymentBaseline baseline,
|
||||
RollbackPolicy policy)
|
||||
{
|
||||
var violations = new List<BaselineViolation>();
|
||||
var baselineLookup = baseline.MetricSummaries.ToDictionary(m => m.MetricName);
|
||||
|
||||
foreach (var group in current.Metrics.GroupBy(m => m.Name))
|
||||
{
|
||||
if (!baselineLookup.TryGetValue(group.Key, out var baselineSummary))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
var currentMean = group.Average(m => m.Value);
|
||||
var deviation = baselineSummary.StdDev > 0
|
||||
? Math.Abs(currentMean - baselineSummary.Mean) / baselineSummary.StdDev
|
||||
: 0;
|
||||
|
||||
var percentChange = baselineSummary.Mean != 0
|
||||
? (currentMean - baselineSummary.Mean) / baselineSummary.Mean * 100
|
||||
: 0;
|
||||
|
||||
var threshold = policy.BaselineDeviationThreshold ?? _config.DefaultBaselineDeviationThreshold;
|
||||
|
||||
if (deviation > threshold)
|
||||
{
|
||||
violations.Add(new BaselineViolation
|
||||
{
|
||||
MetricName = group.Key,
|
||||
BaselineMean = baselineSummary.Mean,
|
||||
BaselineStdDev = baselineSummary.StdDev,
|
||||
CurrentValue = currentMean,
|
||||
DeviationSigma = deviation,
|
||||
PercentChange = percentChange,
|
||||
Severity = ClassifyBaselineViolationSeverity(deviation)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return violations;
|
||||
}
|
||||
|
||||
private bool ShouldTriggerRollback(
|
||||
AnomalyDetectionResult anomalyResult,
|
||||
List<ThresholdViolation> thresholdViolations,
|
||||
List<BaselineViolation> baselineViolations,
|
||||
RollbackPolicy policy)
|
||||
{
|
||||
// Critical anomalies always trigger rollback
|
||||
if (anomalyResult.OverallSeverity == AnomalySeverity.Critical)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Critical threshold violations trigger rollback
|
||||
if (thresholdViolations.Any(v => v.Severity == ThresholdSeverity.Critical))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if we have enough high-severity issues
|
||||
var highSeverityCount =
|
||||
(anomalyResult.OverallSeverity >= AnomalySeverity.High ? 1 : 0) +
|
||||
thresholdViolations.Count(v => v.Severity >= ThresholdSeverity.High) +
|
||||
baselineViolations.Count(v => v.Severity >= BaselineViolationSeverity.High);
|
||||
|
||||
return highSeverityCount >= policy.HighSeverityThreshold;
|
||||
}
|
||||
|
||||
private double CalculateConfidence(
|
||||
AnomalyDetectionResult anomalyResult,
|
||||
List<ThresholdViolation> thresholdViolations,
|
||||
List<BaselineViolation> baselineViolations)
|
||||
{
|
||||
// Base confidence from anomaly detection
|
||||
var anomalyConfidence = anomalyResult.Status == AnomalyDetectionStatus.AnomaliesDetected
|
||||
? Math.Min(anomalyResult.AnomalyScore / 5.0, 1.0)
|
||||
: 0.5;
|
||||
|
||||
// Boost for threshold violations
|
||||
var thresholdBoost = thresholdViolations.Count * 0.1;
|
||||
|
||||
// Boost for baseline violations
|
||||
var baselineBoost = baselineViolations.Count * 0.05;
|
||||
|
||||
return Math.Min(anomalyConfidence + thresholdBoost + baselineBoost, 1.0);
|
||||
}
|
||||
|
||||
private string BuildDecisionReason(
|
||||
bool shouldRollback,
|
||||
AnomalyDetectionResult anomalyResult,
|
||||
List<ThresholdViolation> thresholdViolations,
|
||||
List<BaselineViolation> baselineViolations)
|
||||
{
|
||||
var parts = new List<string>();
|
||||
|
||||
if (anomalyResult.Anomalies.Length > 0)
|
||||
{
|
||||
parts.Add($"{anomalyResult.Anomalies.Length} anomalies detected (severity: {anomalyResult.OverallSeverity})");
|
||||
}
|
||||
|
||||
if (thresholdViolations.Count > 0)
|
||||
{
|
||||
parts.Add($"{thresholdViolations.Count} threshold violations");
|
||||
}
|
||||
|
||||
if (baselineViolations.Count > 0)
|
||||
{
|
||||
parts.Add($"{baselineViolations.Count} baseline deviations");
|
||||
}
|
||||
|
||||
if (parts.Count == 0)
|
||||
{
|
||||
return shouldRollback ? "Unknown trigger" : "All metrics within acceptable ranges";
|
||||
}
|
||||
|
||||
return string.Join("; ", parts);
|
||||
}
|
||||
|
||||
private RollbackAction DetermineAction(bool shouldRollback, AnomalySeverity severity)
|
||||
{
|
||||
if (!shouldRollback)
|
||||
{
|
||||
return RollbackAction.NoAction;
|
||||
}
|
||||
|
||||
return severity switch
|
||||
{
|
||||
AnomalySeverity.Critical => RollbackAction.ImmediateRollback,
|
||||
AnomalySeverity.High => RollbackAction.AutoRollback,
|
||||
_ => RollbackAction.ManualReview
|
||||
};
|
||||
}
|
||||
|
||||
private BaselineViolationSeverity ClassifyBaselineViolationSeverity(double deviation)
|
||||
{
|
||||
return deviation switch
|
||||
{
|
||||
> 5.0 => BaselineViolationSeverity.Critical,
|
||||
> 4.0 => BaselineViolationSeverity.High,
|
||||
> 3.0 => BaselineViolationSeverity.Medium,
|
||||
> 2.0 => BaselineViolationSeverity.Low,
|
||||
_ => BaselineViolationSeverity.None
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<MetricsSnapshot> ConvertBaselineToSnapshot(
|
||||
DeploymentBaseline baseline,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Create a synthetic snapshot from baseline summaries
|
||||
var metrics = baseline.MetricSummaries
|
||||
.Select(s => new MetricDataPoint
|
||||
{
|
||||
Name = s.MetricName,
|
||||
Value = s.Mean,
|
||||
Timestamp = baseline.CreatedAt
|
||||
})
|
||||
.ToImmutableArray();
|
||||
|
||||
return new MetricsSnapshot
|
||||
{
|
||||
DeploymentId = baseline.DeploymentId,
|
||||
CollectedAt = baseline.CreatedAt,
|
||||
Metrics = metrics,
|
||||
ProviderResults = ImmutableDictionary<string, ProviderCollectionResult>.Empty,
|
||||
TimeRange = TimeRange.Last(baseline.SampleDuration)
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for rollback decider.
|
||||
/// </summary>
|
||||
public sealed record RollbackDeciderConfig
|
||||
{
|
||||
public TimeSpan EvaluationWindow { get; init; } = TimeSpan.FromMinutes(5);
|
||||
public double DefaultBaselineDeviationThreshold { get; init; } = 3.0;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Request for rollback evaluation.
|
||||
/// </summary>
|
||||
public sealed record RollbackEvaluationRequest
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required RollbackPolicy Policy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Policy for rollback decisions.
|
||||
/// </summary>
|
||||
public sealed record RollbackPolicy
|
||||
{
|
||||
public ImmutableArray<MetricThreshold> Thresholds { get; init; } = [];
|
||||
public double? BaselineDeviationThreshold { get; init; }
|
||||
public int HighSeverityThreshold { get; init; } = 2;
|
||||
public bool AutoRollbackEnabled { get; init; } = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Threshold for a metric.
|
||||
/// </summary>
|
||||
public sealed record MetricThreshold
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required double Value { get; init; }
|
||||
public required ThresholdOperator Operator { get; init; }
|
||||
public ThresholdSeverity Severity { get; init; } = ThresholdSeverity.Medium;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Threshold comparison operators.
|
||||
/// </summary>
|
||||
public enum ThresholdOperator
|
||||
{
|
||||
GreaterThan,
|
||||
LessThan,
|
||||
GreaterThanOrEqual,
|
||||
LessThanOrEqual
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Threshold severity.
|
||||
/// </summary>
|
||||
public enum ThresholdSeverity
|
||||
{
|
||||
Low,
|
||||
Medium,
|
||||
High,
|
||||
Critical
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a rollback decision.
|
||||
/// </summary>
|
||||
public sealed record RollbackDecision
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required DateTimeOffset EvaluatedAt { get; init; }
|
||||
public required bool ShouldRollback { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public required AnomalyDetectionResult AnomalyResult { get; init; }
|
||||
public required ImmutableArray<ThresholdViolation> ThresholdViolations { get; init; }
|
||||
public required ImmutableArray<BaselineViolation> BaselineViolations { get; init; }
|
||||
public required string Reason { get; init; }
|
||||
public required RollbackAction RecommendedAction { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A threshold violation.
|
||||
/// </summary>
|
||||
public sealed record ThresholdViolation
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required double ThresholdValue { get; init; }
|
||||
public required double ActualValue { get; init; }
|
||||
public required ThresholdOperator Operator { get; init; }
|
||||
public required ThresholdSeverity Severity { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A baseline violation.
|
||||
/// </summary>
|
||||
public sealed record BaselineViolation
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required double BaselineMean { get; init; }
|
||||
public required double BaselineStdDev { get; init; }
|
||||
public required double CurrentValue { get; init; }
|
||||
public required double DeviationSigma { get; init; }
|
||||
public required double PercentChange { get; init; }
|
||||
public required BaselineViolationSeverity Severity { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Severity of baseline violation.
|
||||
/// </summary>
|
||||
public enum BaselineViolationSeverity
|
||||
{
|
||||
None,
|
||||
Low,
|
||||
Medium,
|
||||
High,
|
||||
Critical
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Recommended rollback action.
|
||||
/// </summary>
|
||||
public enum RollbackAction
|
||||
{
|
||||
NoAction,
|
||||
ManualReview,
|
||||
AutoRollback,
|
||||
ImmediateRollback
|
||||
}
|
||||
@@ -0,0 +1,818 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// PartialRollbackPlanner.cs
|
||||
// Sprint: SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence
|
||||
// Task: TASK-033-07 - Partial Rollback Planner for component-level rollback
|
||||
// Description: Plans component-level rollbacks with dependency awareness
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback;
|
||||
|
||||
/// <summary>
|
||||
/// Plans partial rollbacks at the component level, respecting dependencies
|
||||
/// and minimizing blast radius while achieving desired rollback goals.
|
||||
/// </summary>
|
||||
public sealed class PartialRollbackPlanner : IPartialRollbackPlanner
|
||||
{
|
||||
private readonly IImpactAnalyzer _impactAnalyzer;
|
||||
private readonly IDependencyGraph _dependencyGraph;
|
||||
private readonly IVersionRegistry _versionRegistry;
|
||||
private readonly PartialRollbackConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<PartialRollbackPlanner> _logger;
|
||||
|
||||
public PartialRollbackPlanner(
|
||||
IImpactAnalyzer impactAnalyzer,
|
||||
IDependencyGraph dependencyGraph,
|
||||
IVersionRegistry versionRegistry,
|
||||
PartialRollbackConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<PartialRollbackPlanner> logger)
|
||||
{
|
||||
_impactAnalyzer = impactAnalyzer;
|
||||
_dependencyGraph = dependencyGraph;
|
||||
_versionRegistry = versionRegistry;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a rollback plan for specific components within a release.
|
||||
/// </summary>
|
||||
/// <param name="request">The rollback planning request.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>A validated rollback plan with ordered steps.</returns>
|
||||
public async Task<RollbackPlan> CreatePlanAsync(
|
||||
RollbackPlanRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Creating rollback plan for release {ReleaseId}, components: {Components}",
|
||||
request.ReleaseId, string.Join(", ", request.TargetComponents));
|
||||
|
||||
// Validate components can be rolled back
|
||||
var validationResult = await ValidateRollbackFeasibilityAsync(request, ct);
|
||||
if (!validationResult.IsValid)
|
||||
{
|
||||
return CreateInvalidPlan(request, validationResult);
|
||||
}
|
||||
|
||||
// Determine rollback order based on dependencies
|
||||
var orderedComponents = await DetermineRollbackOrderAsync(
|
||||
request.TargetComponents, ct);
|
||||
|
||||
// Create rollback steps
|
||||
var steps = await CreateRollbackStepsAsync(
|
||||
request, orderedComponents, ct);
|
||||
|
||||
// Calculate total impact
|
||||
var aggregateImpact = await CalculateAggregateImpactAsync(
|
||||
request.ReleaseId, orderedComponents, ct);
|
||||
|
||||
// Generate verification checkpoints
|
||||
var checkpoints = GenerateCheckpoints(steps);
|
||||
|
||||
var plan = new RollbackPlan
|
||||
{
|
||||
PlanId = Guid.NewGuid(),
|
||||
ReleaseId = request.ReleaseId,
|
||||
Type = RollbackType.Partial,
|
||||
Status = RollbackPlanStatus.Ready,
|
||||
Components = orderedComponents.ToImmutableArray(),
|
||||
Steps = steps,
|
||||
Checkpoints = checkpoints,
|
||||
AggregateImpact = aggregateImpact,
|
||||
EstimatedDuration = CalculateTotalDuration(steps),
|
||||
CreatedAt = _timeProvider.GetUtcNow(),
|
||||
ExpiresAt = _timeProvider.GetUtcNow().Add(_config.PlanExpirationTime),
|
||||
Validation = validationResult
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Rollback plan {PlanId} created: {ComponentCount} components, {StepCount} steps, ETA: {Duration}",
|
||||
plan.PlanId, orderedComponents.Count, steps.Length, plan.EstimatedDuration);
|
||||
|
||||
return plan;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Validates that a rollback plan is still executable.
|
||||
/// </summary>
|
||||
public async Task<PlanValidationResult> ValidatePlanAsync(
|
||||
RollbackPlan plan,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var issues = new List<ValidationIssue>();
|
||||
|
||||
// Check expiration
|
||||
if (plan.ExpiresAt < _timeProvider.GetUtcNow())
|
||||
{
|
||||
issues.Add(new ValidationIssue
|
||||
{
|
||||
Severity = IssueSeverity.Error,
|
||||
Code = "PLAN_EXPIRED",
|
||||
Message = "Rollback plan has expired and must be regenerated"
|
||||
});
|
||||
}
|
||||
|
||||
// Validate target versions still exist
|
||||
foreach (var step in plan.Steps)
|
||||
{
|
||||
var versionExists = await _versionRegistry.VersionExistsAsync(
|
||||
step.ComponentName, step.TargetVersion, ct);
|
||||
|
||||
if (!versionExists)
|
||||
{
|
||||
issues.Add(new ValidationIssue
|
||||
{
|
||||
Severity = IssueSeverity.Error,
|
||||
Code = "VERSION_NOT_FOUND",
|
||||
Message = $"Target version {step.TargetVersion} for {step.ComponentName} no longer available",
|
||||
Component = step.ComponentName
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Check for conflicting deployments in progress
|
||||
foreach (var component in plan.Components)
|
||||
{
|
||||
var hasActiveDeployment = await _versionRegistry.HasActiveDeploymentAsync(
|
||||
component, ct);
|
||||
|
||||
if (hasActiveDeployment)
|
||||
{
|
||||
issues.Add(new ValidationIssue
|
||||
{
|
||||
Severity = IssueSeverity.Warning,
|
||||
Code = "DEPLOYMENT_IN_PROGRESS",
|
||||
Message = $"Component {component} has an active deployment",
|
||||
Component = component
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return new PlanValidationResult
|
||||
{
|
||||
IsValid = !issues.Any(i => i.Severity == IssueSeverity.Error),
|
||||
Issues = issues.ToImmutableArray(),
|
||||
ValidatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Suggests the minimal set of components to rollback to fix an issue.
|
||||
/// </summary>
|
||||
public async Task<RollbackSuggestion> SuggestMinimalRollbackAsync(
|
||||
Guid releaseId,
|
||||
ImmutableArray<string> affectedMetrics,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Finding minimal rollback for release {ReleaseId}, affected metrics: {Metrics}",
|
||||
releaseId, string.Join(", ", affectedMetrics));
|
||||
|
||||
// Get all components changed in this release
|
||||
var changedComponents = await _versionRegistry.GetChangedComponentsAsync(releaseId, ct);
|
||||
|
||||
// Map metrics to likely culprit components
|
||||
var suspectedComponents = await IdentifySuspectedComponentsAsync(
|
||||
changedComponents, affectedMetrics, ct);
|
||||
|
||||
if (suspectedComponents.Length == 0)
|
||||
{
|
||||
return new RollbackSuggestion
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Confidence = 0,
|
||||
Components = [],
|
||||
Reasoning = "Unable to identify specific components causing the issue",
|
||||
FallbackRecommendation = "Consider full rollback if issues persist"
|
||||
};
|
||||
}
|
||||
|
||||
// Find minimal transitive closure of dependencies
|
||||
var minimalSet = await FindMinimalRollbackSetAsync(suspectedComponents, ct);
|
||||
|
||||
// Calculate confidence based on signal strength
|
||||
var confidence = CalculateSuggestionConfidence(suspectedComponents);
|
||||
|
||||
return new RollbackSuggestion
|
||||
{
|
||||
ReleaseId = releaseId,
|
||||
Confidence = confidence,
|
||||
Components = minimalSet,
|
||||
SuspectedCauses = suspectedComponents,
|
||||
Reasoning = GenerateSuggestionReasoning(suspectedComponents, affectedMetrics),
|
||||
FallbackRecommendation = confidence < 0.7
|
||||
? "Consider full rollback if partial rollback doesn't resolve issues"
|
||||
: null
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Optimizes a rollback plan to minimize impact.
|
||||
/// </summary>
|
||||
public async Task<RollbackPlan> OptimizePlanAsync(
|
||||
RollbackPlan plan,
|
||||
OptimizationGoal goal,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Optimizing plan {PlanId} for {Goal}", plan.PlanId, goal);
|
||||
|
||||
var optimizedSteps = goal switch
|
||||
{
|
||||
OptimizationGoal.MinimizeDowntime => await OptimizeForDowntimeAsync(plan.Steps, ct),
|
||||
OptimizationGoal.MinimizeRisk => await OptimizeForRiskAsync(plan.Steps, ct),
|
||||
OptimizationGoal.MaximizeParallelism => await OptimizeForParallelismAsync(plan.Steps, ct),
|
||||
_ => plan.Steps
|
||||
};
|
||||
|
||||
return plan with
|
||||
{
|
||||
Steps = optimizedSteps,
|
||||
EstimatedDuration = CalculateTotalDuration(optimizedSteps),
|
||||
OptimizedFor = goal,
|
||||
OptimizedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<RollbackValidation> ValidateRollbackFeasibilityAsync(
|
||||
RollbackPlanRequest request,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var issues = new List<ValidationIssue>();
|
||||
var warnings = new List<ValidationIssue>();
|
||||
|
||||
foreach (var component in request.TargetComponents)
|
||||
{
|
||||
// Check if previous version exists
|
||||
var previousVersion = await _versionRegistry.GetPreviousVersionAsync(
|
||||
component, request.ReleaseId, ct);
|
||||
|
||||
if (previousVersion is null)
|
||||
{
|
||||
issues.Add(new ValidationIssue
|
||||
{
|
||||
Severity = IssueSeverity.Error,
|
||||
Code = "NO_PREVIOUS_VERSION",
|
||||
Message = $"No previous version found for component {component}",
|
||||
Component = component
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check for breaking dependencies
|
||||
var deps = await _dependencyGraph.GetDownstreamDependenciesAsync(
|
||||
component, 1, ct);
|
||||
|
||||
var nonRolledBackDeps = deps
|
||||
.Where(d => !request.TargetComponents.Contains(d.ServiceName))
|
||||
.ToList();
|
||||
|
||||
if (nonRolledBackDeps.Any(d => d.DependencyType == DependencyType.Synchronous))
|
||||
{
|
||||
warnings.Add(new ValidationIssue
|
||||
{
|
||||
Severity = IssueSeverity.Warning,
|
||||
Code = "POTENTIAL_INCOMPATIBILITY",
|
||||
Message = $"Component {component} has sync dependencies not being rolled back",
|
||||
Component = component,
|
||||
RelatedComponents = nonRolledBackDeps.Select(d => d.ServiceName).ToImmutableArray()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return new RollbackValidation
|
||||
{
|
||||
IsValid = !issues.Any(),
|
||||
Issues = issues.ToImmutableArray(),
|
||||
Warnings = warnings.ToImmutableArray(),
|
||||
ValidatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<IReadOnlyList<string>> DetermineRollbackOrderAsync(
|
||||
ImmutableArray<string> components,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Build dependency graph for target components
|
||||
var graph = new Dictionary<string, HashSet<string>>();
|
||||
var inDegree = new Dictionary<string, int>();
|
||||
|
||||
foreach (var component in components)
|
||||
{
|
||||
graph[component] = [];
|
||||
inDegree[component] = 0;
|
||||
}
|
||||
|
||||
// Add edges based on dependencies
|
||||
foreach (var component in components)
|
||||
{
|
||||
var deps = await _dependencyGraph.GetDownstreamDependenciesAsync(component, 1, ct);
|
||||
|
||||
foreach (var dep in deps.Where(d => components.Contains(d.ServiceName)))
|
||||
{
|
||||
graph[component].Add(dep.ServiceName);
|
||||
inDegree[dep.ServiceName]++;
|
||||
}
|
||||
}
|
||||
|
||||
// Topological sort (Kahn's algorithm)
|
||||
var result = new List<string>();
|
||||
var queue = new Queue<string>(inDegree.Where(kv => kv.Value == 0).Select(kv => kv.Key));
|
||||
|
||||
while (queue.Count > 0)
|
||||
{
|
||||
var current = queue.Dequeue();
|
||||
result.Add(current);
|
||||
|
||||
foreach (var neighbor in graph[current])
|
||||
{
|
||||
inDegree[neighbor]--;
|
||||
if (inDegree[neighbor] == 0)
|
||||
{
|
||||
queue.Enqueue(neighbor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reverse for rollback order (dependents first)
|
||||
result.Reverse();
|
||||
return result;
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<RollbackStep>> CreateRollbackStepsAsync(
|
||||
RollbackPlanRequest request,
|
||||
IReadOnlyList<string> orderedComponents,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var steps = new List<RollbackStep>();
|
||||
var stepNumber = 1;
|
||||
|
||||
foreach (var component in orderedComponents)
|
||||
{
|
||||
var previousVersion = await _versionRegistry.GetPreviousVersionAsync(
|
||||
component, request.ReleaseId, ct);
|
||||
|
||||
var currentVersion = await _versionRegistry.GetCurrentVersionAsync(component, ct);
|
||||
|
||||
var impact = await _impactAnalyzer.AnalyzeImpactAsync(
|
||||
await _versionRegistry.GetDeploymentIdAsync(component, ct), ct);
|
||||
|
||||
steps.Add(new RollbackStep
|
||||
{
|
||||
StepNumber = stepNumber++,
|
||||
ComponentName = component,
|
||||
CurrentVersion = currentVersion!,
|
||||
TargetVersion = previousVersion!,
|
||||
Action = DetermineRollbackAction(component),
|
||||
EstimatedDuration = EstimateStepDuration(impact),
|
||||
Prerequisites = GetStepPrerequisites(component, orderedComponents, steps),
|
||||
VerificationChecks = GenerateVerificationChecks(component),
|
||||
RollbackOnFailure = true
|
||||
});
|
||||
}
|
||||
|
||||
return steps.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<AggregateImpact> CalculateAggregateImpactAsync(
|
||||
Guid releaseId,
|
||||
IReadOnlyList<string> components,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var totalDowntime = TimeSpan.Zero;
|
||||
var totalAffectedServices = 0;
|
||||
var totalAffectedUsers = 0;
|
||||
var maxRiskLevel = RiskLevel.Minimal;
|
||||
|
||||
foreach (var component in components)
|
||||
{
|
||||
var deploymentId = await _versionRegistry.GetDeploymentIdAsync(component, ct);
|
||||
var impact = await _impactAnalyzer.AnalyzeImpactAsync(deploymentId, ct);
|
||||
|
||||
totalDowntime += impact.DowntimeEstimate.TotalEstimatedDowntime;
|
||||
totalAffectedServices += impact.DependencyImpact.AffectedServices.Length;
|
||||
totalAffectedUsers = Math.Max(totalAffectedUsers, impact.TrafficImpact.EstimatedUsersAffected);
|
||||
|
||||
if (impact.RiskAssessment.RiskLevel > maxRiskLevel)
|
||||
maxRiskLevel = impact.RiskAssessment.RiskLevel;
|
||||
}
|
||||
|
||||
return new AggregateImpact
|
||||
{
|
||||
TotalDowntime = totalDowntime,
|
||||
TotalAffectedServices = totalAffectedServices,
|
||||
MaxAffectedUsers = totalAffectedUsers,
|
||||
OverallRiskLevel = maxRiskLevel,
|
||||
ComponentCount = components.Count
|
||||
};
|
||||
}
|
||||
|
||||
private static ImmutableArray<VerificationCheckpoint> GenerateCheckpoints(
|
||||
ImmutableArray<RollbackStep> steps)
|
||||
{
|
||||
var checkpoints = new List<VerificationCheckpoint>();
|
||||
var checkpointNumber = 1;
|
||||
|
||||
// Add checkpoint after each critical step
|
||||
foreach (var step in steps)
|
||||
{
|
||||
checkpoints.Add(new VerificationCheckpoint
|
||||
{
|
||||
CheckpointNumber = checkpointNumber++,
|
||||
AfterStepNumber = step.StepNumber,
|
||||
Type = CheckpointType.HealthCheck,
|
||||
Checks = step.VerificationChecks,
|
||||
Timeout = TimeSpan.FromMinutes(2),
|
||||
ContinueOnFailure = false
|
||||
});
|
||||
}
|
||||
|
||||
// Add final verification checkpoint
|
||||
checkpoints.Add(new VerificationCheckpoint
|
||||
{
|
||||
CheckpointNumber = checkpointNumber,
|
||||
AfterStepNumber = steps.Length,
|
||||
Type = CheckpointType.FullValidation,
|
||||
Checks =
|
||||
[
|
||||
new VerificationCheck { Type = CheckType.EndToEndTest, Name = "Full E2E Verification" },
|
||||
new VerificationCheck { Type = CheckType.MetricBaseline, Name = "Metrics Back to Baseline" }
|
||||
],
|
||||
Timeout = TimeSpan.FromMinutes(10),
|
||||
ContinueOnFailure = false
|
||||
});
|
||||
|
||||
return checkpoints.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<SuspectedComponent>> IdentifySuspectedComponentsAsync(
|
||||
ImmutableArray<string> changedComponents,
|
||||
ImmutableArray<string> affectedMetrics,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var suspected = new List<SuspectedComponent>();
|
||||
|
||||
foreach (var component in changedComponents)
|
||||
{
|
||||
var componentMetrics = await _versionRegistry.GetComponentMetricsAsync(component, ct);
|
||||
var matchingMetrics = affectedMetrics
|
||||
.Where(m => componentMetrics.Any(cm => cm.Contains(m, StringComparison.OrdinalIgnoreCase)))
|
||||
.ToList();
|
||||
|
||||
if (matchingMetrics.Any())
|
||||
{
|
||||
suspected.Add(new SuspectedComponent
|
||||
{
|
||||
ComponentName = component,
|
||||
MatchingMetrics = matchingMetrics.ToImmutableArray(),
|
||||
Confidence = matchingMetrics.Count / (double)affectedMetrics.Length,
|
||||
ChangeSize = await _versionRegistry.GetChangeSizeAsync(component, ct)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return suspected.OrderByDescending(s => s.Confidence).ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<string>> FindMinimalRollbackSetAsync(
|
||||
ImmutableArray<SuspectedComponent> suspects,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var minimalSet = new HashSet<string>();
|
||||
|
||||
foreach (var suspect in suspects.Where(s => s.Confidence > 0.5))
|
||||
{
|
||||
minimalSet.Add(suspect.ComponentName);
|
||||
|
||||
// Add required dependencies
|
||||
var deps = await _dependencyGraph.GetComponentDependenciesAsync(
|
||||
suspect.ComponentName, ct);
|
||||
|
||||
foreach (var dep in deps.Where(d => d.IsRequired))
|
||||
{
|
||||
minimalSet.Add(dep.ComponentName);
|
||||
}
|
||||
}
|
||||
|
||||
return minimalSet.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<RollbackStep>> OptimizeForDowntimeAsync(
|
||||
ImmutableArray<RollbackStep> steps,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Group independent steps for parallel execution
|
||||
await Task.CompletedTask;
|
||||
|
||||
var result = new List<RollbackStep>();
|
||||
var parallelGroup = new List<RollbackStep>();
|
||||
|
||||
foreach (var step in steps)
|
||||
{
|
||||
if (step.Prerequisites.Length == 0)
|
||||
{
|
||||
parallelGroup.Add(step);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (parallelGroup.Count > 0)
|
||||
{
|
||||
result.AddRange(parallelGroup.Select((s, i) => s with
|
||||
{
|
||||
ParallelGroup = result.Count + 1,
|
||||
StepNumber = result.Count + i + 1
|
||||
}));
|
||||
parallelGroup.Clear();
|
||||
}
|
||||
result.Add(step with { StepNumber = result.Count + 1 });
|
||||
}
|
||||
}
|
||||
|
||||
if (parallelGroup.Count > 0)
|
||||
{
|
||||
result.AddRange(parallelGroup.Select((s, i) => s with
|
||||
{
|
||||
ParallelGroup = result.Count + 1,
|
||||
StepNumber = result.Count + i + 1
|
||||
}));
|
||||
}
|
||||
|
||||
return result.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<RollbackStep>> OptimizeForRiskAsync(
|
||||
ImmutableArray<RollbackStep> steps,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Order by risk - rollback highest risk first
|
||||
await Task.CompletedTask;
|
||||
|
||||
return steps
|
||||
.OrderByDescending(s => s.Prerequisites.Length) // Dependencies = higher risk
|
||||
.Select((s, i) => s with { StepNumber = i + 1 })
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<RollbackStep>> OptimizeForParallelismAsync(
|
||||
ImmutableArray<RollbackStep> steps,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Maximum parallelism based on dependency levels
|
||||
return await OptimizeForDowntimeAsync(steps, ct);
|
||||
}
|
||||
|
||||
private static RollbackPlan CreateInvalidPlan(
|
||||
RollbackPlanRequest request,
|
||||
RollbackValidation validation)
|
||||
{
|
||||
return new RollbackPlan
|
||||
{
|
||||
PlanId = Guid.NewGuid(),
|
||||
ReleaseId = request.ReleaseId,
|
||||
Type = RollbackType.Partial,
|
||||
Status = RollbackPlanStatus.Invalid,
|
||||
Components = [],
|
||||
Steps = [],
|
||||
Checkpoints = [],
|
||||
AggregateImpact = new AggregateImpact(),
|
||||
EstimatedDuration = TimeSpan.Zero,
|
||||
CreatedAt = DateTimeOffset.UtcNow,
|
||||
ExpiresAt = DateTimeOffset.UtcNow,
|
||||
Validation = validation
|
||||
};
|
||||
}
|
||||
|
||||
private static RollbackAction DetermineRollbackAction(string component)
|
||||
{
|
||||
// Could be configuration-driven
|
||||
return RollbackAction.ImageSwap;
|
||||
}
|
||||
|
||||
private static TimeSpan EstimateStepDuration(ImpactAnalysis impact)
|
||||
{
|
||||
return impact.DowntimeEstimate.RollbackDuration;
|
||||
}
|
||||
|
||||
private static ImmutableArray<int> GetStepPrerequisites(
|
||||
string component,
|
||||
IReadOnlyList<string> orderedComponents,
|
||||
List<RollbackStep> completedSteps)
|
||||
{
|
||||
// Steps that must complete before this one
|
||||
var index = orderedComponents.ToList().IndexOf(component);
|
||||
if (index <= 0) return [];
|
||||
|
||||
return completedSteps
|
||||
.Where(s => orderedComponents.ToList().IndexOf(s.ComponentName) < index)
|
||||
.Select(s => s.StepNumber)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
private static ImmutableArray<VerificationCheck> GenerateVerificationChecks(string component)
|
||||
{
|
||||
return
|
||||
[
|
||||
new VerificationCheck
|
||||
{
|
||||
Type = CheckType.HealthEndpoint,
|
||||
Name = $"{component} Health Check",
|
||||
Endpoint = $"/health"
|
||||
},
|
||||
new VerificationCheck
|
||||
{
|
||||
Type = CheckType.MetricThreshold,
|
||||
Name = $"{component} Error Rate",
|
||||
MetricName = "error_rate",
|
||||
Threshold = 0.01
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
private static TimeSpan CalculateTotalDuration(ImmutableArray<RollbackStep> steps)
|
||||
{
|
||||
// Sum durations, accounting for parallelism
|
||||
var groups = steps.GroupBy(s => s.ParallelGroup);
|
||||
var totalMinutes = groups.Sum(g => g.Max(s => s.EstimatedDuration.TotalMinutes));
|
||||
return TimeSpan.FromMinutes(totalMinutes);
|
||||
}
|
||||
|
||||
private static double CalculateSuggestionConfidence(ImmutableArray<SuspectedComponent> suspects)
|
||||
{
|
||||
if (suspects.Length == 0) return 0;
|
||||
return suspects.Max(s => s.Confidence);
|
||||
}
|
||||
|
||||
private static string GenerateSuggestionReasoning(
|
||||
ImmutableArray<SuspectedComponent> suspects,
|
||||
ImmutableArray<string> affectedMetrics)
|
||||
{
|
||||
if (suspects.Length == 0)
|
||||
return "No correlation found between changed components and affected metrics";
|
||||
|
||||
var primary = suspects[0];
|
||||
return $"Component {primary.ComponentName} strongly correlates with affected metrics: " +
|
||||
$"{string.Join(", ", primary.MatchingMetrics)} (confidence: {primary.Confidence:P0})";
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IPartialRollbackPlanner
|
||||
{
|
||||
Task<RollbackPlan> CreatePlanAsync(RollbackPlanRequest request, CancellationToken ct = default);
|
||||
Task<PlanValidationResult> ValidatePlanAsync(RollbackPlan plan, CancellationToken ct = default);
|
||||
Task<RollbackSuggestion> SuggestMinimalRollbackAsync(Guid releaseId, ImmutableArray<string> affectedMetrics, CancellationToken ct = default);
|
||||
Task<RollbackPlan> OptimizePlanAsync(RollbackPlan plan, OptimizationGoal goal, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IVersionRegistry
|
||||
{
|
||||
Task<bool> VersionExistsAsync(string component, string version, CancellationToken ct = default);
|
||||
Task<bool> HasActiveDeploymentAsync(string component, CancellationToken ct = default);
|
||||
Task<string?> GetPreviousVersionAsync(string component, Guid releaseId, CancellationToken ct = default);
|
||||
Task<string?> GetCurrentVersionAsync(string component, CancellationToken ct = default);
|
||||
Task<Guid> GetDeploymentIdAsync(string component, CancellationToken ct = default);
|
||||
Task<ImmutableArray<string>> GetChangedComponentsAsync(Guid releaseId, CancellationToken ct = default);
|
||||
Task<ImmutableArray<string>> GetComponentMetricsAsync(string component, CancellationToken ct = default);
|
||||
Task<int> GetChangeSizeAsync(string component, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record PartialRollbackConfig
|
||||
{
|
||||
public TimeSpan PlanExpirationTime { get; init; } = TimeSpan.FromHours(4);
|
||||
public int MaxParallelSteps { get; init; } = 5;
|
||||
}
|
||||
|
||||
public sealed record RollbackPlanRequest
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required ImmutableArray<string> TargetComponents { get; init; }
|
||||
public RollbackReason Reason { get; init; } = RollbackReason.HealthDegradation;
|
||||
}
|
||||
|
||||
public enum RollbackReason { HealthDegradation, FailedValidation, UserRequested, PolicyViolation }
|
||||
|
||||
public sealed record RollbackPlan
|
||||
{
|
||||
public required Guid PlanId { get; init; }
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required RollbackType Type { get; init; }
|
||||
public required RollbackPlanStatus Status { get; init; }
|
||||
public required ImmutableArray<string> Components { get; init; }
|
||||
public required ImmutableArray<RollbackStep> Steps { get; init; }
|
||||
public required ImmutableArray<VerificationCheckpoint> Checkpoints { get; init; }
|
||||
public required AggregateImpact AggregateImpact { get; init; }
|
||||
public required TimeSpan EstimatedDuration { get; init; }
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
public required DateTimeOffset ExpiresAt { get; init; }
|
||||
public required RollbackValidation Validation { get; init; }
|
||||
public OptimizationGoal? OptimizedFor { get; init; }
|
||||
public DateTimeOffset? OptimizedAt { get; init; }
|
||||
}
|
||||
|
||||
public enum RollbackType { Full, Partial, Gradual }
|
||||
public enum RollbackPlanStatus { Ready, Invalid, Executing, Completed, Failed }
|
||||
public enum OptimizationGoal { MinimizeDowntime, MinimizeRisk, MaximizeParallelism }
|
||||
|
||||
public sealed record RollbackStep
|
||||
{
|
||||
public required int StepNumber { get; init; }
|
||||
public required string ComponentName { get; init; }
|
||||
public required string CurrentVersion { get; init; }
|
||||
public required string TargetVersion { get; init; }
|
||||
public required RollbackAction Action { get; init; }
|
||||
public required TimeSpan EstimatedDuration { get; init; }
|
||||
public required ImmutableArray<int> Prerequisites { get; init; }
|
||||
public required ImmutableArray<VerificationCheck> VerificationChecks { get; init; }
|
||||
public required bool RollbackOnFailure { get; init; }
|
||||
public int? ParallelGroup { get; init; }
|
||||
}
|
||||
|
||||
public enum RollbackAction { ImageSwap, ConfigRevert, DatabaseMigration, FeatureToggle }
|
||||
|
||||
public sealed record VerificationCheckpoint
|
||||
{
|
||||
public required int CheckpointNumber { get; init; }
|
||||
public required int AfterStepNumber { get; init; }
|
||||
public required CheckpointType Type { get; init; }
|
||||
public required ImmutableArray<VerificationCheck> Checks { get; init; }
|
||||
public required TimeSpan Timeout { get; init; }
|
||||
public required bool ContinueOnFailure { get; init; }
|
||||
}
|
||||
|
||||
public enum CheckpointType { HealthCheck, SmokeTest, FullValidation }
|
||||
|
||||
public sealed record VerificationCheck
|
||||
{
|
||||
public required CheckType Type { get; init; }
|
||||
public required string Name { get; init; }
|
||||
public string? Endpoint { get; init; }
|
||||
public string? MetricName { get; init; }
|
||||
public double? Threshold { get; init; }
|
||||
}
|
||||
|
||||
public enum CheckType { HealthEndpoint, MetricThreshold, EndToEndTest, MetricBaseline }
|
||||
|
||||
public sealed record AggregateImpact
|
||||
{
|
||||
public TimeSpan TotalDowntime { get; init; }
|
||||
public int TotalAffectedServices { get; init; }
|
||||
public int MaxAffectedUsers { get; init; }
|
||||
public RiskLevel OverallRiskLevel { get; init; }
|
||||
public int ComponentCount { get; init; }
|
||||
}
|
||||
|
||||
public sealed record RollbackValidation
|
||||
{
|
||||
public required bool IsValid { get; init; }
|
||||
public required ImmutableArray<ValidationIssue> Issues { get; init; }
|
||||
public ImmutableArray<ValidationIssue> Warnings { get; init; } = [];
|
||||
public required DateTimeOffset ValidatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record PlanValidationResult
|
||||
{
|
||||
public required bool IsValid { get; init; }
|
||||
public required ImmutableArray<ValidationIssue> Issues { get; init; }
|
||||
public required DateTimeOffset ValidatedAt { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ValidationIssue
|
||||
{
|
||||
public required IssueSeverity Severity { get; init; }
|
||||
public required string Code { get; init; }
|
||||
public required string Message { get; init; }
|
||||
public string? Component { get; init; }
|
||||
public ImmutableArray<string> RelatedComponents { get; init; } = [];
|
||||
}
|
||||
|
||||
public enum IssueSeverity { Info, Warning, Error }
|
||||
|
||||
public sealed record RollbackSuggestion
|
||||
{
|
||||
public required Guid ReleaseId { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public required ImmutableArray<string> Components { get; init; }
|
||||
public ImmutableArray<SuspectedComponent> SuspectedCauses { get; init; } = [];
|
||||
public required string Reasoning { get; init; }
|
||||
public string? FallbackRecommendation { get; init; }
|
||||
}
|
||||
|
||||
public sealed record SuspectedComponent
|
||||
{
|
||||
public required string ComponentName { get; init; }
|
||||
public required ImmutableArray<string> MatchingMetrics { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public required int ChangeSize { get; init; }
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,683 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// PredictiveEngine.cs
|
||||
// Sprint: SPRINT_20260117_033_ReleaseOrchestrator_rollback_intelligence
|
||||
// Task: TASK-033-05 - Predictive Engine for failure anticipation
|
||||
// Description: Predicts deployment failures from early warning signals using ML models
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Deployment.Rollback;
|
||||
|
||||
/// <summary>
|
||||
/// Predicts deployment failures from early warning signals.
|
||||
/// Uses multiple algorithms including trend analysis, pattern matching, and ensemble models.
|
||||
/// </summary>
|
||||
public sealed class PredictiveEngine : IPredictiveEngine
|
||||
{
|
||||
private readonly IMetricsCollector _metricsCollector;
|
||||
private readonly IAnomalyDetector _anomalyDetector;
|
||||
private readonly IPatternMatcher _patternMatcher;
|
||||
private readonly ITrendAnalyzer _trendAnalyzer;
|
||||
private readonly PredictiveEngineConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<PredictiveEngine> _logger;
|
||||
|
||||
public PredictiveEngine(
|
||||
IMetricsCollector metricsCollector,
|
||||
IAnomalyDetector anomalyDetector,
|
||||
IPatternMatcher patternMatcher,
|
||||
ITrendAnalyzer trendAnalyzer,
|
||||
PredictiveEngineConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<PredictiveEngine> logger)
|
||||
{
|
||||
_metricsCollector = metricsCollector;
|
||||
_anomalyDetector = anomalyDetector;
|
||||
_patternMatcher = patternMatcher;
|
||||
_trendAnalyzer = trendAnalyzer;
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generates a failure prediction for a deployment.
|
||||
/// </summary>
|
||||
/// <param name="deploymentId">The deployment identifier.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Failure prediction with confidence and contributing factors.</returns>
|
||||
public async Task<FailurePrediction> PredictFailureAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Generating failure prediction for deployment {DeploymentId}", deploymentId);
|
||||
|
||||
var metrics = await _metricsCollector.CollectCurrentAsync(deploymentId, ct);
|
||||
var history = await _metricsCollector.CollectHistoryAsync(deploymentId, _config.HistoryWindow, ct);
|
||||
|
||||
// Run prediction algorithms in parallel
|
||||
var trendTask = AnalyzeTrendsAsync(history, ct);
|
||||
var patternTask = MatchFailurePatternsAsync(history, ct);
|
||||
var anomalyTask = DetectEarlyAnomaliesAsync(metrics, history, ct);
|
||||
var velocityTask = CalculateMetricVelocitiesAsync(history, ct);
|
||||
|
||||
await Task.WhenAll(trendTask, patternTask, anomalyTask, velocityTask);
|
||||
|
||||
var trendSignals = trendTask.Result;
|
||||
var patternMatches = patternTask.Result;
|
||||
var anomalySignals = anomalyTask.Result;
|
||||
var velocities = velocityTask.Result;
|
||||
|
||||
// Combine signals using ensemble approach
|
||||
var prediction = CombinePredictions(
|
||||
deploymentId,
|
||||
trendSignals,
|
||||
patternMatches,
|
||||
anomalySignals,
|
||||
velocities);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Failure prediction for {DeploymentId}: Probability={Probability:P1}, TimeToFailure={TTF}",
|
||||
deploymentId, prediction.FailureProbability, prediction.EstimatedTimeToFailure);
|
||||
|
||||
return prediction;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets early warning signals without full prediction.
|
||||
/// </summary>
|
||||
public async Task<ImmutableArray<EarlyWarningSignal>> GetEarlyWarningsAsync(
|
||||
Guid deploymentId,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var history = await _metricsCollector.CollectHistoryAsync(deploymentId, _config.HistoryWindow, ct);
|
||||
var warnings = new List<EarlyWarningSignal>();
|
||||
|
||||
foreach (var metric in _config.MonitoredMetrics)
|
||||
{
|
||||
var metricHistory = history.GetMetricHistory(metric.Name);
|
||||
if (metricHistory.Length < _config.MinDataPoints) continue;
|
||||
|
||||
var trend = await _trendAnalyzer.AnalyzeTrendAsync(metric.Name, metricHistory, ct);
|
||||
|
||||
if (IsWarningTrend(trend, metric))
|
||||
{
|
||||
warnings.Add(new EarlyWarningSignal
|
||||
{
|
||||
MetricName = metric.Name,
|
||||
SignalType = DetermineSignalType(trend),
|
||||
Severity = CalculateSeverity(trend, metric),
|
||||
TrendDirection = trend.Direction,
|
||||
TrendVelocity = trend.Velocity,
|
||||
TimeToThreshold = EstimateTimeToThreshold(trend, metric),
|
||||
DetectedAt = _timeProvider.GetUtcNow(),
|
||||
Message = GenerateWarningMessage(metric.Name, trend)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return warnings.ToImmutableArray();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Continuously monitors for failure predictions.
|
||||
/// </summary>
|
||||
public async IAsyncEnumerable<FailurePrediction> MonitorPredictionsAsync(
|
||||
Guid deploymentId,
|
||||
TimeSpan interval,
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
|
||||
{
|
||||
while (!ct.IsCancellationRequested)
|
||||
{
|
||||
var prediction = await PredictFailureAsync(deploymentId, ct);
|
||||
yield return prediction;
|
||||
|
||||
// Adjust interval based on risk level
|
||||
var adjustedInterval = prediction.FailureProbability > 0.7
|
||||
? TimeSpan.FromSeconds(Math.Max(10, interval.TotalSeconds / 4))
|
||||
: interval;
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Delay(adjustedInterval, ct);
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
yield break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<TrendSignal>> AnalyzeTrendsAsync(
|
||||
MetricsHistory history,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var signals = new List<TrendSignal>();
|
||||
|
||||
foreach (var metric in _config.MonitoredMetrics)
|
||||
{
|
||||
var metricHistory = history.GetMetricHistory(metric.Name);
|
||||
if (metricHistory.Length < _config.MinDataPoints) continue;
|
||||
|
||||
var trend = await _trendAnalyzer.AnalyzeTrendAsync(metric.Name, metricHistory, ct);
|
||||
|
||||
signals.Add(new TrendSignal
|
||||
{
|
||||
MetricName = metric.Name,
|
||||
Direction = trend.Direction,
|
||||
Velocity = trend.Velocity,
|
||||
Acceleration = trend.Acceleration,
|
||||
RSquared = trend.RSquared,
|
||||
ProjectedValue = trend.ProjectedValue,
|
||||
FailureContribution = CalculateTrendFailureContribution(trend, metric)
|
||||
});
|
||||
}
|
||||
|
||||
return signals.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<PatternMatch>> MatchFailurePatternsAsync(
|
||||
MetricsHistory history,
|
||||
CancellationToken ct)
|
||||
{
|
||||
return await _patternMatcher.FindMatchesAsync(history, _config.FailurePatterns, ct);
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<AnomalySignal>> DetectEarlyAnomaliesAsync(
|
||||
MetricsSnapshot current,
|
||||
MetricsHistory history,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var signals = new List<AnomalySignal>();
|
||||
|
||||
foreach (var metric in _config.MonitoredMetrics)
|
||||
{
|
||||
var currentValue = current.GetMetricValue(metric.Name);
|
||||
if (!currentValue.HasValue) continue;
|
||||
|
||||
var metricHistory = history.GetMetricHistory(metric.Name);
|
||||
var isAnomaly = await _anomalyDetector.IsAnomalyAsync(
|
||||
metric.Name,
|
||||
currentValue.Value,
|
||||
metricHistory,
|
||||
ct);
|
||||
|
||||
if (isAnomaly)
|
||||
{
|
||||
var severity = await _anomalyDetector.CalculateSeverityAsync(
|
||||
metric.Name,
|
||||
currentValue.Value,
|
||||
metricHistory,
|
||||
ct);
|
||||
|
||||
signals.Add(new AnomalySignal
|
||||
{
|
||||
MetricName = metric.Name,
|
||||
CurrentValue = currentValue.Value,
|
||||
ExpectedValue = metricHistory.Length > 0 ? metricHistory.Average() : 0,
|
||||
Severity = severity,
|
||||
FailureContribution = severity * metric.Weight
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return signals.ToImmutableArray();
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<VelocitySignal>> CalculateMetricVelocitiesAsync(
|
||||
MetricsHistory history,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var signals = new List<VelocitySignal>();
|
||||
|
||||
await Task.CompletedTask; // Placeholder for async operation
|
||||
|
||||
foreach (var metric in _config.MonitoredMetrics)
|
||||
{
|
||||
var metricHistory = history.GetMetricHistory(metric.Name);
|
||||
if (metricHistory.Length < 3) continue;
|
||||
|
||||
// Calculate rate of change
|
||||
var recentWindow = metricHistory.TakeLast(5).ToArray();
|
||||
var velocity = CalculateVelocity(recentWindow);
|
||||
var acceleration = CalculateAcceleration(recentWindow);
|
||||
|
||||
if (Math.Abs(velocity) > metric.VelocityThreshold)
|
||||
{
|
||||
signals.Add(new VelocitySignal
|
||||
{
|
||||
MetricName = metric.Name,
|
||||
Velocity = velocity,
|
||||
Acceleration = acceleration,
|
||||
IsAccelerating = acceleration > 0 && velocity > 0,
|
||||
FailureContribution = CalculateVelocityFailureContribution(velocity, acceleration, metric)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return signals.ToImmutableArray();
|
||||
}
|
||||
|
||||
private FailurePrediction CombinePredictions(
|
||||
Guid deploymentId,
|
||||
ImmutableArray<TrendSignal> trends,
|
||||
ImmutableArray<PatternMatch> patterns,
|
||||
ImmutableArray<AnomalySignal> anomalies,
|
||||
ImmutableArray<VelocitySignal> velocities)
|
||||
{
|
||||
var factors = new List<ContributingFactor>();
|
||||
|
||||
// Weight contributions from each signal type
|
||||
var trendContribution = trends.Sum(t => t.FailureContribution) * _config.TrendWeight;
|
||||
var patternContribution = patterns.Sum(p => p.Confidence * p.FailureProbability) * _config.PatternWeight;
|
||||
var anomalyContribution = anomalies.Sum(a => a.FailureContribution) * _config.AnomalyWeight;
|
||||
var velocityContribution = velocities.Sum(v => v.FailureContribution) * _config.VelocityWeight;
|
||||
|
||||
var totalWeight = _config.TrendWeight + _config.PatternWeight +
|
||||
_config.AnomalyWeight + _config.VelocityWeight;
|
||||
|
||||
var rawProbability = (trendContribution + patternContribution +
|
||||
anomalyContribution + velocityContribution) / totalWeight;
|
||||
|
||||
// Clamp to valid probability range
|
||||
var failureProbability = Math.Clamp(rawProbability, 0, 1);
|
||||
|
||||
// Add contributing factors
|
||||
foreach (var trend in trends.Where(t => t.FailureContribution > 0.1))
|
||||
{
|
||||
factors.Add(new ContributingFactor
|
||||
{
|
||||
Source = FactorSource.Trend,
|
||||
MetricName = trend.MetricName,
|
||||
Contribution = trend.FailureContribution * _config.TrendWeight / totalWeight,
|
||||
Description = $"Trend: {trend.Direction} at velocity {trend.Velocity:F2}"
|
||||
});
|
||||
}
|
||||
|
||||
foreach (var pattern in patterns)
|
||||
{
|
||||
factors.Add(new ContributingFactor
|
||||
{
|
||||
Source = FactorSource.Pattern,
|
||||
MetricName = pattern.PatternName,
|
||||
Contribution = pattern.Confidence * pattern.FailureProbability * _config.PatternWeight / totalWeight,
|
||||
Description = $"Pattern match: {pattern.PatternName} ({pattern.Confidence:P0} confidence)"
|
||||
});
|
||||
}
|
||||
|
||||
foreach (var anomaly in anomalies)
|
||||
{
|
||||
factors.Add(new ContributingFactor
|
||||
{
|
||||
Source = FactorSource.Anomaly,
|
||||
MetricName = anomaly.MetricName,
|
||||
Contribution = anomaly.FailureContribution * _config.AnomalyWeight / totalWeight,
|
||||
Description = $"Anomaly detected: {anomaly.CurrentValue:F2} vs expected {anomaly.ExpectedValue:F2}"
|
||||
});
|
||||
}
|
||||
|
||||
// Estimate time to failure
|
||||
var timeToFailure = EstimateTimeToFailure(failureProbability, trends, velocities);
|
||||
|
||||
return new FailurePrediction
|
||||
{
|
||||
DeploymentId = deploymentId,
|
||||
FailureProbability = failureProbability,
|
||||
Confidence = CalculateConfidence(trends, patterns, anomalies),
|
||||
RiskLevel = DetermineRiskLevel(failureProbability),
|
||||
EstimatedTimeToFailure = timeToFailure,
|
||||
ContributingFactors = factors.OrderByDescending(f => f.Contribution).ToImmutableArray(),
|
||||
GeneratedAt = _timeProvider.GetUtcNow(),
|
||||
Recommendation = GeneratePredictionRecommendation(failureProbability, timeToFailure)
|
||||
};
|
||||
}
|
||||
|
||||
private static double CalculateTrendFailureContribution(TrendAnalysis trend, MonitoredMetric metric)
|
||||
{
|
||||
if (trend.RSquared < 0.5) return 0; // Poor fit, ignore
|
||||
|
||||
var isUnfavorable = (metric.LowerIsBetter && trend.Direction == TrendDirection.Increasing) ||
|
||||
(!metric.LowerIsBetter && trend.Direction == TrendDirection.Decreasing);
|
||||
|
||||
if (!isUnfavorable) return 0;
|
||||
|
||||
return Math.Abs(trend.Velocity) * trend.RSquared * metric.Weight;
|
||||
}
|
||||
|
||||
private static double CalculateVelocityFailureContribution(double velocity, double acceleration, MonitoredMetric metric)
|
||||
{
|
||||
var isUnfavorable = (metric.LowerIsBetter && velocity > 0) || (!metric.LowerIsBetter && velocity < 0);
|
||||
if (!isUnfavorable) return 0;
|
||||
|
||||
var contribution = Math.Abs(velocity) / metric.VelocityThreshold * metric.Weight;
|
||||
|
||||
// Accelerating in wrong direction is worse
|
||||
if (acceleration > 0 && isUnfavorable)
|
||||
contribution *= 1.5;
|
||||
|
||||
return Math.Min(contribution, 1.0);
|
||||
}
|
||||
|
||||
private static double CalculateVelocity(double[] values)
|
||||
{
|
||||
if (values.Length < 2) return 0;
|
||||
return values[^1] - values[^2];
|
||||
}
|
||||
|
||||
private static double CalculateAcceleration(double[] values)
|
||||
{
|
||||
if (values.Length < 3) return 0;
|
||||
var v1 = values[^2] - values[^3];
|
||||
var v2 = values[^1] - values[^2];
|
||||
return v2 - v1;
|
||||
}
|
||||
|
||||
private TimeSpan? EstimateTimeToFailure(
|
||||
double probability,
|
||||
ImmutableArray<TrendSignal> trends,
|
||||
ImmutableArray<VelocitySignal> velocities)
|
||||
{
|
||||
if (probability < 0.3) return null; // Too uncertain
|
||||
|
||||
// Use fastest velocity trend to estimate
|
||||
var fastestTrend = trends
|
||||
.Where(t => t.FailureContribution > 0)
|
||||
.OrderByDescending(t => Math.Abs(t.Velocity))
|
||||
.FirstOrDefault();
|
||||
|
||||
if (fastestTrend is null) return null;
|
||||
|
||||
// Rough estimate based on velocity
|
||||
var estimatedMinutes = (1 - probability) / Math.Abs(fastestTrend.Velocity) * 60;
|
||||
return TimeSpan.FromMinutes(Math.Max(1, Math.Min(estimatedMinutes, 1440))); // 1 min to 24 hours
|
||||
}
|
||||
|
||||
private static double CalculateConfidence(
|
||||
ImmutableArray<TrendSignal> trends,
|
||||
ImmutableArray<PatternMatch> patterns,
|
||||
ImmutableArray<AnomalySignal> anomalies)
|
||||
{
|
||||
var dataPoints = trends.Length + patterns.Length + anomalies.Length;
|
||||
if (dataPoints == 0) return 0;
|
||||
|
||||
var avgRSquared = trends.Length > 0 ? trends.Average(t => t.RSquared) : 0.5;
|
||||
var avgPatternConfidence = patterns.Length > 0 ? patterns.Average(p => p.Confidence) : 0.5;
|
||||
|
||||
return (avgRSquared + avgPatternConfidence) / 2 * Math.Min(1, dataPoints / 5.0);
|
||||
}
|
||||
|
||||
private static RiskLevel DetermineRiskLevel(double probability)
|
||||
{
|
||||
return probability switch
|
||||
{
|
||||
>= 0.8 => RiskLevel.Critical,
|
||||
>= 0.6 => RiskLevel.High,
|
||||
>= 0.4 => RiskLevel.Medium,
|
||||
>= 0.2 => RiskLevel.Low,
|
||||
_ => RiskLevel.Minimal
|
||||
};
|
||||
}
|
||||
|
||||
private static PredictionRecommendation GeneratePredictionRecommendation(
|
||||
double probability,
|
||||
TimeSpan? timeToFailure)
|
||||
{
|
||||
if (probability >= 0.8)
|
||||
{
|
||||
return new PredictionRecommendation
|
||||
{
|
||||
Action = PredictedAction.ImmediateRollback,
|
||||
Urgency = Urgency.Critical,
|
||||
Message = "Failure imminent - immediate rollback recommended"
|
||||
};
|
||||
}
|
||||
|
||||
if (probability >= 0.6)
|
||||
{
|
||||
return new PredictionRecommendation
|
||||
{
|
||||
Action = PredictedAction.PrepareRollback,
|
||||
Urgency = Urgency.High,
|
||||
Message = $"High failure probability - prepare rollback, estimated time: {timeToFailure}"
|
||||
};
|
||||
}
|
||||
|
||||
if (probability >= 0.4)
|
||||
{
|
||||
return new PredictionRecommendation
|
||||
{
|
||||
Action = PredictedAction.IncreasedMonitoring,
|
||||
Urgency = Urgency.Medium,
|
||||
Message = "Elevated risk - increase monitoring frequency"
|
||||
};
|
||||
}
|
||||
|
||||
return new PredictionRecommendation
|
||||
{
|
||||
Action = PredictedAction.ContinueMonitoring,
|
||||
Urgency = Urgency.Low,
|
||||
Message = "Risk within acceptable range"
|
||||
};
|
||||
}
|
||||
|
||||
private static bool IsWarningTrend(TrendAnalysis trend, MonitoredMetric metric)
|
||||
{
|
||||
if (trend.RSquared < 0.5) return false;
|
||||
|
||||
var isUnfavorable = (metric.LowerIsBetter && trend.Direction == TrendDirection.Increasing) ||
|
||||
(!metric.LowerIsBetter && trend.Direction == TrendDirection.Decreasing);
|
||||
|
||||
return isUnfavorable && Math.Abs(trend.Velocity) > metric.VelocityThreshold * 0.5;
|
||||
}
|
||||
|
||||
private static EarlyWarningType DetermineSignalType(TrendAnalysis trend)
|
||||
{
|
||||
if (trend.Acceleration > 0 && trend.Velocity > 0)
|
||||
return EarlyWarningType.AcceleratingDegradation;
|
||||
if (trend.Direction == TrendDirection.Increasing)
|
||||
return EarlyWarningType.GradualDegradation;
|
||||
return EarlyWarningType.Anomaly;
|
||||
}
|
||||
|
||||
private static WarningSeverity CalculateSeverity(TrendAnalysis trend, MonitoredMetric metric)
|
||||
{
|
||||
var velocityRatio = Math.Abs(trend.Velocity) / metric.VelocityThreshold;
|
||||
|
||||
return velocityRatio switch
|
||||
{
|
||||
>= 2.0 => WarningSeverity.Critical,
|
||||
>= 1.5 => WarningSeverity.High,
|
||||
>= 1.0 => WarningSeverity.Medium,
|
||||
_ => WarningSeverity.Low
|
||||
};
|
||||
}
|
||||
|
||||
private TimeSpan? EstimateTimeToThreshold(TrendAnalysis trend, MonitoredMetric metric)
|
||||
{
|
||||
if (Math.Abs(trend.Velocity) < 0.001) return null;
|
||||
|
||||
var distanceToThreshold = metric.Threshold - trend.CurrentValue;
|
||||
var timeUnits = distanceToThreshold / trend.Velocity;
|
||||
|
||||
if (timeUnits <= 0) return null;
|
||||
|
||||
return TimeSpan.FromMinutes(timeUnits * 5); // Assuming 5-minute sampling
|
||||
}
|
||||
|
||||
private static string GenerateWarningMessage(string metricName, TrendAnalysis trend)
|
||||
{
|
||||
return $"{metricName} is {trend.Direction.ToString().ToLower()} at rate {trend.Velocity:F2}/sample";
|
||||
}
|
||||
}
|
||||
|
||||
#region Interfaces
|
||||
|
||||
public interface IPredictiveEngine
|
||||
{
|
||||
Task<FailurePrediction> PredictFailureAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
Task<ImmutableArray<EarlyWarningSignal>> GetEarlyWarningsAsync(Guid deploymentId, CancellationToken ct = default);
|
||||
IAsyncEnumerable<FailurePrediction> MonitorPredictionsAsync(Guid deploymentId, TimeSpan interval, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface IPatternMatcher
|
||||
{
|
||||
Task<ImmutableArray<PatternMatch>> FindMatchesAsync(MetricsHistory history, ImmutableArray<FailurePattern> patterns, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public interface ITrendAnalyzer
|
||||
{
|
||||
Task<TrendAnalysis> AnalyzeTrendAsync(string metricName, ImmutableArray<double> values, CancellationToken ct = default);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Models
|
||||
|
||||
public sealed record PredictiveEngineConfig
|
||||
{
|
||||
public TimeSpan HistoryWindow { get; init; } = TimeSpan.FromHours(1);
|
||||
public int MinDataPoints { get; init; } = 10;
|
||||
public ImmutableArray<MonitoredMetric> MonitoredMetrics { get; init; } = [];
|
||||
public ImmutableArray<FailurePattern> FailurePatterns { get; init; } = [];
|
||||
public double TrendWeight { get; init; } = 0.3;
|
||||
public double PatternWeight { get; init; } = 0.25;
|
||||
public double AnomalyWeight { get; init; } = 0.25;
|
||||
public double VelocityWeight { get; init; } = 0.2;
|
||||
}
|
||||
|
||||
public sealed record MonitoredMetric
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public double Weight { get; init; } = 1.0;
|
||||
public double Threshold { get; init; }
|
||||
public double VelocityThreshold { get; init; } = 0.1;
|
||||
public bool LowerIsBetter { get; init; } = true;
|
||||
}
|
||||
|
||||
public sealed record FailurePattern
|
||||
{
|
||||
public required string Name { get; init; }
|
||||
public required string Description { get; init; }
|
||||
public ImmutableArray<PatternCondition> Conditions { get; init; } = [];
|
||||
public double FailureProbability { get; init; }
|
||||
}
|
||||
|
||||
public sealed record PatternCondition
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required ConditionType Type { get; init; }
|
||||
public double Threshold { get; init; }
|
||||
}
|
||||
|
||||
public enum ConditionType { GreaterThan, LessThan, SpikesAbove, DropsBelow, Oscillates }
|
||||
|
||||
public sealed record FailurePrediction
|
||||
{
|
||||
public required Guid DeploymentId { get; init; }
|
||||
public required double FailureProbability { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public required RiskLevel RiskLevel { get; init; }
|
||||
public TimeSpan? EstimatedTimeToFailure { get; init; }
|
||||
public required ImmutableArray<ContributingFactor> ContributingFactors { get; init; }
|
||||
public required DateTimeOffset GeneratedAt { get; init; }
|
||||
public required PredictionRecommendation Recommendation { get; init; }
|
||||
}
|
||||
|
||||
public sealed record ContributingFactor
|
||||
{
|
||||
public required FactorSource Source { get; init; }
|
||||
public required string MetricName { get; init; }
|
||||
public required double Contribution { get; init; }
|
||||
public required string Description { get; init; }
|
||||
}
|
||||
|
||||
public enum FactorSource { Trend, Pattern, Anomaly, Velocity }
|
||||
public enum RiskLevel { Minimal, Low, Medium, High, Critical }
|
||||
|
||||
public sealed record PredictionRecommendation
|
||||
{
|
||||
public required PredictedAction Action { get; init; }
|
||||
public required Urgency Urgency { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
public enum PredictedAction { ContinueMonitoring, IncreasedMonitoring, PrepareRollback, ImmediateRollback }
|
||||
public enum Urgency { Low, Medium, High, Critical }
|
||||
|
||||
public sealed record EarlyWarningSignal
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required EarlyWarningType SignalType { get; init; }
|
||||
public required WarningSeverity Severity { get; init; }
|
||||
public required TrendDirection TrendDirection { get; init; }
|
||||
public required double TrendVelocity { get; init; }
|
||||
public TimeSpan? TimeToThreshold { get; init; }
|
||||
public required DateTimeOffset DetectedAt { get; init; }
|
||||
public required string Message { get; init; }
|
||||
}
|
||||
|
||||
public enum EarlyWarningType { GradualDegradation, AcceleratingDegradation, Anomaly, PatternMatch }
|
||||
public enum WarningSeverity { Low, Medium, High, Critical }
|
||||
|
||||
public sealed record TrendSignal
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required TrendDirection Direction { get; init; }
|
||||
public required double Velocity { get; init; }
|
||||
public required double Acceleration { get; init; }
|
||||
public required double RSquared { get; init; }
|
||||
public required double ProjectedValue { get; init; }
|
||||
public required double FailureContribution { get; init; }
|
||||
}
|
||||
|
||||
public sealed record AnomalySignal
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required double CurrentValue { get; init; }
|
||||
public required double ExpectedValue { get; init; }
|
||||
public required double Severity { get; init; }
|
||||
public required double FailureContribution { get; init; }
|
||||
}
|
||||
|
||||
public sealed record VelocitySignal
|
||||
{
|
||||
public required string MetricName { get; init; }
|
||||
public required double Velocity { get; init; }
|
||||
public required double Acceleration { get; init; }
|
||||
public required bool IsAccelerating { get; init; }
|
||||
public required double FailureContribution { get; init; }
|
||||
}
|
||||
|
||||
public sealed record PatternMatch
|
||||
{
|
||||
public required string PatternName { get; init; }
|
||||
public required double Confidence { get; init; }
|
||||
public required double FailureProbability { get; init; }
|
||||
public ImmutableArray<string> MatchedMetrics { get; init; } = [];
|
||||
}
|
||||
|
||||
public sealed record TrendAnalysis
|
||||
{
|
||||
public required TrendDirection Direction { get; init; }
|
||||
public required double Velocity { get; init; }
|
||||
public required double Acceleration { get; init; }
|
||||
public required double RSquared { get; init; }
|
||||
public required double ProjectedValue { get; init; }
|
||||
public required double CurrentValue { get; init; }
|
||||
}
|
||||
|
||||
public enum TrendDirection { Stable, Increasing, Decreasing }
|
||||
|
||||
public sealed record MetricsHistory
|
||||
{
|
||||
private readonly ImmutableDictionary<string, ImmutableArray<double>> _history;
|
||||
|
||||
public MetricsHistory(ImmutableDictionary<string, ImmutableArray<double>> history) => _history = history;
|
||||
|
||||
public ImmutableArray<double> GetMetricHistory(string metricName) =>
|
||||
_history.GetValueOrDefault(metricName, []);
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -28,6 +28,7 @@ public sealed class DriftDetector
|
||||
ExpectedState expectedState)
|
||||
{
|
||||
var drifts = new List<DriftItem>();
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
// Check for missing and mismatched containers
|
||||
foreach (var expected in expectedState.Containers)
|
||||
@@ -43,7 +44,9 @@ public sealed class DriftDetector
|
||||
Name: expected.Name,
|
||||
Expected: expected.ImageDigest,
|
||||
Actual: null,
|
||||
Message: $"Container '{expected.Name}' not found"));
|
||||
Message: $"Container '{expected.Name}' not found",
|
||||
DetectedAt: now,
|
||||
ComponentId: expected.ComponentId));
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -56,7 +59,9 @@ public sealed class DriftDetector
|
||||
Name: expected.Name,
|
||||
Expected: expected.ImageDigest,
|
||||
Actual: actual.ImageDigest,
|
||||
Message: $"Container '{expected.Name}' has different image digest"));
|
||||
Message: $"Container '{expected.Name}' has different image digest",
|
||||
DetectedAt: now,
|
||||
ComponentId: expected.ComponentId));
|
||||
}
|
||||
|
||||
// Check status
|
||||
@@ -68,7 +73,9 @@ public sealed class DriftDetector
|
||||
Name: expected.Name,
|
||||
Expected: "running",
|
||||
Actual: actual.Status,
|
||||
Message: $"Container '{expected.Name}' is not running (status: {actual.Status})"));
|
||||
Message: $"Container '{expected.Name}' is not running (status: {actual.Status})",
|
||||
DetectedAt: now,
|
||||
ComponentId: expected.ComponentId));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,13 +94,15 @@ public sealed class DriftDetector
|
||||
Name: actual.Name,
|
||||
Expected: null,
|
||||
Actual: actual.ImageDigest,
|
||||
Message: $"Unexpected container '{actual.Name}' found"));
|
||||
Message: $"Unexpected container '{actual.Name}' found",
|
||||
DetectedAt: now,
|
||||
ComponentId: null));
|
||||
}
|
||||
}
|
||||
|
||||
return new DriftReport(
|
||||
TargetId: currentState.TargetId,
|
||||
DetectedAt: _timeProvider.GetUtcNow(),
|
||||
DetectedAt: now,
|
||||
HasDrift: drifts.Count > 0,
|
||||
Drifts: drifts.ToImmutableArray());
|
||||
}
|
||||
|
||||
@@ -20,7 +20,9 @@ public sealed record DriftItem(
|
||||
string Name,
|
||||
string? Expected,
|
||||
string? Actual,
|
||||
string Message);
|
||||
string Message,
|
||||
DateTimeOffset DetectedAt = default,
|
||||
Guid? ComponentId = null);
|
||||
|
||||
/// <summary>
|
||||
/// Types of drift that can be detected.
|
||||
|
||||
@@ -35,4 +35,5 @@ public sealed record ExpectedContainer(
|
||||
string Name,
|
||||
string Image,
|
||||
string ImageDigest,
|
||||
ImmutableDictionary<string, string> Labels);
|
||||
ImmutableDictionary<string, string> Labels,
|
||||
Guid? ComponentId = null);
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Calculated severity of a drift item.
|
||||
/// </summary>
|
||||
public sealed record DriftSeverity
|
||||
{
|
||||
/// <summary>
|
||||
/// The severity level category.
|
||||
/// </summary>
|
||||
public required DriftSeverityLevel Level { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Numeric severity score (0-100).
|
||||
/// </summary>
|
||||
public required int Score { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Individual factors contributing to the score.
|
||||
/// </summary>
|
||||
public required ImmutableArray<SeverityFactor> Factors { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// How long the drift has existed.
|
||||
/// </summary>
|
||||
public required TimeSpan DriftAge { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether this drift requires immediate attention.
|
||||
/// </summary>
|
||||
public required bool RequiresImmediate { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Severity levels for drift classification.
|
||||
/// </summary>
|
||||
public enum DriftSeverityLevel
|
||||
{
|
||||
/// <summary>
|
||||
/// Cosmetic differences (labels, annotations). Score: 0-24.
|
||||
/// </summary>
|
||||
Info = 0,
|
||||
|
||||
/// <summary>
|
||||
/// Non-critical drift (resource limits changed). Score: 25-49.
|
||||
/// </summary>
|
||||
Low = 25,
|
||||
|
||||
/// <summary>
|
||||
/// Functional drift (ports, volumes). Score: 50-74.
|
||||
/// </summary>
|
||||
Medium = 50,
|
||||
|
||||
/// <summary>
|
||||
/// Security drift (image digest mismatch). Score: 75-89.
|
||||
/// </summary>
|
||||
High = 75,
|
||||
|
||||
/// <summary>
|
||||
/// Severe drift (container missing, wrong image). Score: 90-100.
|
||||
/// </summary>
|
||||
Critical = 100
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A single factor contributing to severity calculation.
|
||||
/// </summary>
|
||||
public sealed record SeverityFactor(
|
||||
string Name,
|
||||
int Score,
|
||||
double Weight)
|
||||
{
|
||||
/// <summary>
|
||||
/// The weighted contribution to total score.
|
||||
/// </summary>
|
||||
public double WeightedScore => Score * Weight;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Environment criticality level.
|
||||
/// </summary>
|
||||
public enum EnvironmentCriticality
|
||||
{
|
||||
/// <summary>
|
||||
/// Development environment.
|
||||
/// </summary>
|
||||
Development = 0,
|
||||
|
||||
/// <summary>
|
||||
/// Staging/QA environment.
|
||||
/// </summary>
|
||||
Staging = 1,
|
||||
|
||||
/// <summary>
|
||||
/// Production environment.
|
||||
/// </summary>
|
||||
Production = 2
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Interface for remediation policy persistence.
|
||||
/// </summary>
|
||||
public interface IRemediationPolicyStore
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates a new remediation policy.
|
||||
/// </summary>
|
||||
Task<RemediationPolicy> CreateAsync(RemediationPolicy policy, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Gets a policy by ID.
|
||||
/// </summary>
|
||||
Task<RemediationPolicy?> GetAsync(Guid id, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Gets a policy by name within an environment.
|
||||
/// </summary>
|
||||
Task<RemediationPolicy?> GetByNameAsync(Guid environmentId, string name, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Lists all policies for an environment.
|
||||
/// </summary>
|
||||
Task<IReadOnlyList<RemediationPolicy>> ListAsync(Guid environmentId, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Lists all active policies scheduled for the current time.
|
||||
/// </summary>
|
||||
Task<IReadOnlyList<RemediationPolicy>> GetScheduledPoliciesAsync(CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Updates an existing policy.
|
||||
/// </summary>
|
||||
Task<RemediationPolicy> UpdateAsync(RemediationPolicy policy, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Deletes a policy.
|
||||
/// </summary>
|
||||
Task<bool> DeleteAsync(Guid id, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Activates a policy.
|
||||
/// </summary>
|
||||
Task<RemediationPolicy?> ActivateAsync(Guid id, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Deactivates a policy.
|
||||
/// </summary>
|
||||
Task<RemediationPolicy?> DeactivateAsync(Guid id, CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,233 @@
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Background service for scheduled drift reconciliation.
|
||||
/// </summary>
|
||||
public sealed class ReconcileScheduler : BackgroundService
|
||||
{
|
||||
private readonly IRemediationPolicyStore _policyStore;
|
||||
private readonly DriftDetector _driftDetector;
|
||||
private readonly RemediationEngine _engine;
|
||||
private readonly IInventorySyncService _inventoryService;
|
||||
private readonly IExpectedStateService _expectedStateService;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ReconcileSchedulerConfig _config;
|
||||
private readonly ILogger<ReconcileScheduler> _logger;
|
||||
|
||||
public ReconcileScheduler(
|
||||
IRemediationPolicyStore policyStore,
|
||||
DriftDetector driftDetector,
|
||||
RemediationEngine engine,
|
||||
IInventorySyncService inventoryService,
|
||||
IExpectedStateService expectedStateService,
|
||||
TimeProvider timeProvider,
|
||||
ReconcileSchedulerConfig config,
|
||||
ILogger<ReconcileScheduler> logger)
|
||||
{
|
||||
_policyStore = policyStore;
|
||||
_driftDetector = driftDetector;
|
||||
_engine = engine;
|
||||
_inventoryService = inventoryService;
|
||||
_expectedStateService = expectedStateService;
|
||||
_timeProvider = timeProvider;
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
|
||||
{
|
||||
_logger.LogInformation("Reconcile scheduler starting with interval {Interval}",
|
||||
_config.CheckInterval);
|
||||
|
||||
while (!stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
await RunScheduledReconciliationAsync(stoppingToken);
|
||||
}
|
||||
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
|
||||
{
|
||||
break;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error in scheduled reconciliation");
|
||||
}
|
||||
|
||||
await Task.Delay(_config.CheckInterval, stoppingToken);
|
||||
}
|
||||
|
||||
_logger.LogInformation("Reconcile scheduler stopped");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs scheduled reconciliation for all applicable policies.
|
||||
/// </summary>
|
||||
public async Task RunScheduledReconciliationAsync(CancellationToken ct = default)
|
||||
{
|
||||
_logger.LogDebug("Running scheduled reconciliation check");
|
||||
|
||||
var policies = await _policyStore.GetScheduledPoliciesAsync(ct);
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
foreach (var policy in policies)
|
||||
{
|
||||
if (!policy.IsActive)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!IsWithinWindow(policy, now))
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Policy {PolicyName} is outside maintenance window, skipping",
|
||||
policy.Name);
|
||||
continue;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
await ReconcileEnvironmentAsync(policy, ct);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Failed to reconcile environment {EnvironmentId} with policy {PolicyName}",
|
||||
policy.EnvironmentId, policy.Name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async Task ReconcileEnvironmentAsync(
|
||||
RemediationPolicy policy,
|
||||
CancellationToken ct)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Reconciling environment {EnvironmentId} with policy {PolicyName}",
|
||||
policy.EnvironmentId, policy.Name);
|
||||
|
||||
// Get current inventory
|
||||
var inventory = await _inventoryService.GetCurrentAsync(policy.EnvironmentId, ct);
|
||||
if (inventory is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"No inventory found for environment {EnvironmentId}",
|
||||
policy.EnvironmentId);
|
||||
return;
|
||||
}
|
||||
|
||||
// Get expected state
|
||||
var expectedState = await _expectedStateService.GetExpectedStateAsync(
|
||||
policy.EnvironmentId, ct);
|
||||
if (expectedState is null)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"No expected state found for environment {EnvironmentId}",
|
||||
policy.EnvironmentId);
|
||||
return;
|
||||
}
|
||||
|
||||
// Detect drift
|
||||
var drift = _driftDetector.Detect(inventory, expectedState);
|
||||
|
||||
if (!drift.HasDrift)
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"No drift detected for environment {EnvironmentId}",
|
||||
policy.EnvironmentId);
|
||||
return;
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Detected {DriftCount} drift items for environment {EnvironmentId}",
|
||||
drift.Drifts.Length, policy.EnvironmentId);
|
||||
|
||||
// Create scoring context
|
||||
var scoringContext = new ScoringContext
|
||||
{
|
||||
Now = _timeProvider.GetUtcNow(),
|
||||
Environment = new EnvironmentInfo(
|
||||
policy.EnvironmentId,
|
||||
$"Environment-{policy.EnvironmentId}",
|
||||
EnvironmentCriticality.Production) // TODO: Get from environment config
|
||||
};
|
||||
|
||||
// Create and execute plan
|
||||
var plan = await _engine.CreatePlanAsync(drift, policy, scoringContext, ct);
|
||||
|
||||
if (plan.Status == RemediationPlanStatus.Created)
|
||||
{
|
||||
var result = await _engine.ExecuteAsync(plan, ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Completed reconciliation for environment {EnvironmentId}: " +
|
||||
"{Succeeded}/{Total} targets remediated",
|
||||
policy.EnvironmentId,
|
||||
result.Metrics.Succeeded,
|
||||
result.Metrics.TotalTargets);
|
||||
}
|
||||
}
|
||||
|
||||
private bool IsWithinWindow(RemediationPolicy policy, DateTimeOffset now)
|
||||
{
|
||||
// Check day of week
|
||||
if (!policy.AllowedDays.Contains(now.DayOfWeek))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
var currentTime = TimeOnly.FromDateTime(now.DateTime);
|
||||
|
||||
// Check general allowed time window
|
||||
if (currentTime < policy.AllowedStartTime || currentTime > policy.AllowedEndTime)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check maintenance window if specified
|
||||
if (policy.MaintenanceWindow is not null)
|
||||
{
|
||||
var window = policy.MaintenanceWindow;
|
||||
if (!window.Days.Contains(now.DayOfWeek))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (currentTime < window.StartTime || currentTime > window.EndTime)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for the reconcile scheduler.
|
||||
/// </summary>
|
||||
public sealed record ReconcileSchedulerConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// How often to check for policies to execute.
|
||||
/// </summary>
|
||||
public TimeSpan CheckInterval { get; init; } = TimeSpan.FromMinutes(5);
|
||||
|
||||
/// <summary>
|
||||
/// Maximum concurrent policy executions.
|
||||
/// </summary>
|
||||
public int MaxConcurrentExecutions { get; init; } = 3;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for expected state retrieval.
|
||||
/// </summary>
|
||||
public interface IExpectedStateService
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the expected state for an environment.
|
||||
/// </summary>
|
||||
Task<ExpectedState?> GetExpectedStateAsync(Guid environmentId, CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Circuit breaker for remediation operations.
|
||||
/// </summary>
|
||||
public sealed class RemediationCircuitBreaker
|
||||
{
|
||||
private readonly CircuitBreakerConfig _config;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<RemediationCircuitBreaker> _logger;
|
||||
|
||||
private int _consecutiveFailures;
|
||||
private DateTimeOffset? _openedAt;
|
||||
private readonly object _lock = new();
|
||||
|
||||
public RemediationCircuitBreaker(
|
||||
CircuitBreakerConfig config,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<RemediationCircuitBreaker> logger)
|
||||
{
|
||||
_config = config;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Whether the circuit is currently open (blocking requests).
|
||||
/// </summary>
|
||||
public bool IsOpen
|
||||
{
|
||||
get
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
if (_openedAt is null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
var elapsed = _timeProvider.GetUtcNow() - _openedAt.Value;
|
||||
if (elapsed >= _config.OpenDuration)
|
||||
{
|
||||
// Circuit has been open long enough, allow half-open state
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current state of the circuit breaker.
|
||||
/// </summary>
|
||||
public CircuitBreakerState State
|
||||
{
|
||||
get
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
if (_openedAt is null)
|
||||
{
|
||||
return CircuitBreakerState.Closed;
|
||||
}
|
||||
|
||||
var elapsed = _timeProvider.GetUtcNow() - _openedAt.Value;
|
||||
if (elapsed >= _config.OpenDuration)
|
||||
{
|
||||
return CircuitBreakerState.HalfOpen;
|
||||
}
|
||||
|
||||
return CircuitBreakerState.Open;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of consecutive failures.
|
||||
/// </summary>
|
||||
public int ConsecutiveFailures => _consecutiveFailures;
|
||||
|
||||
/// <summary>
|
||||
/// Records a successful operation.
|
||||
/// </summary>
|
||||
public void RecordSuccess()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
if (_openedAt is not null)
|
||||
{
|
||||
_logger.LogInformation("Circuit breaker closing after successful operation");
|
||||
}
|
||||
|
||||
_consecutiveFailures = 0;
|
||||
_openedAt = null;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Records a failed operation.
|
||||
/// </summary>
|
||||
public void RecordFailure()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
_consecutiveFailures++;
|
||||
|
||||
if (_consecutiveFailures >= _config.FailureThreshold && _openedAt is null)
|
||||
{
|
||||
_openedAt = _timeProvider.GetUtcNow();
|
||||
_logger.LogWarning(
|
||||
"Remediation circuit breaker opened after {Failures} consecutive failures",
|
||||
_consecutiveFailures);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Resets the circuit breaker to closed state.
|
||||
/// </summary>
|
||||
public void Reset()
|
||||
{
|
||||
lock (_lock)
|
||||
{
|
||||
_consecutiveFailures = 0;
|
||||
_openedAt = null;
|
||||
_logger.LogInformation("Circuit breaker manually reset");
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks if operation is allowed and throws if circuit is open.
|
||||
/// </summary>
|
||||
public void EnsureAllowed()
|
||||
{
|
||||
if (IsOpen)
|
||||
{
|
||||
var remainingTime = _config.OpenDuration - (_timeProvider.GetUtcNow() - _openedAt!.Value);
|
||||
throw new CircuitBreakerOpenException(
|
||||
$"Circuit breaker is open. Will reset in {remainingTime.TotalSeconds:F0} seconds.",
|
||||
remainingTime);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for the circuit breaker.
|
||||
/// </summary>
|
||||
public sealed record CircuitBreakerConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// Number of consecutive failures before opening the circuit.
|
||||
/// </summary>
|
||||
public int FailureThreshold { get; init; } = 5;
|
||||
|
||||
/// <summary>
|
||||
/// How long the circuit stays open before transitioning to half-open.
|
||||
/// </summary>
|
||||
public TimeSpan OpenDuration { get; init; } = TimeSpan.FromMinutes(5);
|
||||
|
||||
/// <summary>
|
||||
/// Number of successful operations in half-open state to close the circuit.
|
||||
/// </summary>
|
||||
public int SuccessThresholdForClose { get; init; } = 2;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// State of the circuit breaker.
|
||||
/// </summary>
|
||||
public enum CircuitBreakerState
|
||||
{
|
||||
/// <summary>
|
||||
/// Circuit is closed, operations are allowed.
|
||||
/// </summary>
|
||||
Closed,
|
||||
|
||||
/// <summary>
|
||||
/// Circuit is open, operations are blocked.
|
||||
/// </summary>
|
||||
Open,
|
||||
|
||||
/// <summary>
|
||||
/// Circuit is half-open, limited operations allowed for testing.
|
||||
/// </summary>
|
||||
HalfOpen
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Exception thrown when circuit breaker is open.
|
||||
/// </summary>
|
||||
public sealed class CircuitBreakerOpenException : Exception
|
||||
{
|
||||
/// <summary>
|
||||
/// Remaining time until circuit resets.
|
||||
/// </summary>
|
||||
public TimeSpan RemainingTime { get; }
|
||||
|
||||
public CircuitBreakerOpenException(string message, TimeSpan remainingTime)
|
||||
: base(message)
|
||||
{
|
||||
RemainingTime = remainingTime;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,552 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Orchestrates drift remediation planning and execution.
|
||||
/// </summary>
|
||||
public sealed class RemediationEngine
|
||||
{
|
||||
private readonly SeverityScorer _severityScorer;
|
||||
private readonly RemediationRateLimiter _rateLimiter;
|
||||
private readonly IRemediationExecutor _executor;
|
||||
private readonly IRemediationEvidenceWriter _evidenceWriter;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<RemediationEngine> _logger;
|
||||
|
||||
public RemediationEngine(
|
||||
SeverityScorer severityScorer,
|
||||
RemediationRateLimiter rateLimiter,
|
||||
IRemediationExecutor executor,
|
||||
IRemediationEvidenceWriter evidenceWriter,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<RemediationEngine> logger)
|
||||
{
|
||||
_severityScorer = severityScorer;
|
||||
_rateLimiter = rateLimiter;
|
||||
_executor = executor;
|
||||
_evidenceWriter = evidenceWriter;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a remediation plan based on drift report and policy.
|
||||
/// </summary>
|
||||
public async Task<RemediationPlan> CreatePlanAsync(
|
||||
DriftReport driftReport,
|
||||
RemediationPolicy policy,
|
||||
ScoringContext scoringContext,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(driftReport);
|
||||
ArgumentNullException.ThrowIfNull(policy);
|
||||
ArgumentNullException.ThrowIfNull(scoringContext);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Creating remediation plan for {DriftCount} drift items using policy {PolicyName}",
|
||||
driftReport.Drifts.Length, policy.Name);
|
||||
|
||||
// 1. Score severity for each drift item
|
||||
var scoredDrifts = _severityScorer.ScoreAll(driftReport.Drifts, scoringContext);
|
||||
|
||||
// 2. Filter by policy thresholds
|
||||
var actionable = scoredDrifts
|
||||
.Where(d => d.Severity.Level >= policy.MinimumSeverity)
|
||||
.Where(d => d.Severity.DriftAge >= policy.MinimumDriftAge)
|
||||
.ToImmutableArray();
|
||||
|
||||
if (actionable.IsEmpty)
|
||||
{
|
||||
_logger.LogInformation("No drifts meet policy thresholds for remediation");
|
||||
return CreateEmptyPlan(driftReport, policy);
|
||||
}
|
||||
|
||||
// 3. Check maintenance window
|
||||
if (!IsWithinMaintenanceWindow(policy))
|
||||
{
|
||||
_logger.LogInformation("Outside maintenance window, deferring plan");
|
||||
return RemediationPlan.Deferred(actionable, policy.MaintenanceWindow, policy, driftReport.TargetId);
|
||||
}
|
||||
|
||||
// 4. Check rate limits
|
||||
var rateLimitResult = await _rateLimiter.CheckAsync(policy, actionable.Length, ct);
|
||||
if (!rateLimitResult.IsAllowed)
|
||||
{
|
||||
_logger.LogWarning("Rate limit exceeded: {Reason}", rateLimitResult.Reason);
|
||||
return CreateDeferredPlan(driftReport, policy, rateLimitResult.Reason ?? "Rate limit exceeded");
|
||||
}
|
||||
|
||||
// 5. Apply blast radius limits
|
||||
var limited = ApplyBlastRadiusLimits(actionable, policy);
|
||||
|
||||
// 6. Build execution plan
|
||||
return BuildExecutionPlan(driftReport, limited, policy);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Executes a remediation plan.
|
||||
/// </summary>
|
||||
public async Task<RemediationResult> ExecuteAsync(
|
||||
RemediationPlan plan,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(plan);
|
||||
|
||||
if (plan.Status != RemediationPlanStatus.Created &&
|
||||
plan.Status != RemediationPlanStatus.Scheduled)
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
$"Cannot execute plan in status {plan.Status}");
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Executing remediation plan {PlanId} with {BatchCount} batches",
|
||||
plan.Id, plan.Batches.Length);
|
||||
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
var semaphore = new SemaphoreSlim(plan.Policy.MaxConcurrentRemediations);
|
||||
var results = new ConcurrentBag<TargetRemediationResult>();
|
||||
var overallStatus = RemediationResultStatus.Success;
|
||||
|
||||
try
|
||||
{
|
||||
foreach (var batch in plan.Batches.OrderBy(b => b.Order))
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Executing batch {BatchOrder} with {TargetCount} targets",
|
||||
batch.Order, batch.Targets.Length);
|
||||
|
||||
var batchTasks = batch.Targets.Select(async target =>
|
||||
{
|
||||
await semaphore.WaitAsync(ct);
|
||||
try
|
||||
{
|
||||
return await RemediateTargetAsync(target, plan, ct);
|
||||
}
|
||||
finally
|
||||
{
|
||||
semaphore.Release();
|
||||
}
|
||||
});
|
||||
|
||||
var batchResults = await Task.WhenAll(batchTasks);
|
||||
foreach (var result in batchResults)
|
||||
{
|
||||
results.Add(result);
|
||||
}
|
||||
|
||||
// Check for failures in this batch
|
||||
var failedCount = batchResults.Count(r => r.Status == RemediationTargetStatus.Failed);
|
||||
if (failedCount > 0)
|
||||
{
|
||||
overallStatus = RemediationResultStatus.PartialSuccess;
|
||||
}
|
||||
|
||||
// Health check between batches for rolling strategy
|
||||
if (batch.RequiresHealthCheck &&
|
||||
plan.Policy.Strategy == RemediationStrategy.Rolling)
|
||||
{
|
||||
var healthy = await VerifyBatchHealthAsync(batchResults, ct);
|
||||
if (!healthy)
|
||||
{
|
||||
_logger.LogWarning("Health check failed after batch {BatchOrder}, stopping", batch.Order);
|
||||
overallStatus = RemediationResultStatus.PartialSuccess;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Delay between batches if configured
|
||||
if (batch.DelayAfter.HasValue)
|
||||
{
|
||||
await Task.Delay(batch.DelayAfter.Value, ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
_logger.LogWarning("Remediation plan {PlanId} was cancelled", plan.Id);
|
||||
overallStatus = RemediationResultStatus.Cancelled;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Error executing remediation plan {PlanId}", plan.Id);
|
||||
overallStatus = RemediationResultStatus.Failed;
|
||||
}
|
||||
|
||||
var endTime = _timeProvider.GetUtcNow();
|
||||
var resultArray = results.ToImmutableArray();
|
||||
var metrics = CalculateMetrics(resultArray, endTime - startTime);
|
||||
|
||||
// Determine final status
|
||||
if (overallStatus == RemediationResultStatus.Success && metrics.Failed > 0)
|
||||
{
|
||||
overallStatus = metrics.Succeeded > 0
|
||||
? RemediationResultStatus.PartialSuccess
|
||||
: RemediationResultStatus.Failed;
|
||||
}
|
||||
|
||||
var result = new RemediationResult
|
||||
{
|
||||
PlanId = plan.Id,
|
||||
Status = overallStatus,
|
||||
TargetResults = resultArray,
|
||||
Duration = endTime - startTime,
|
||||
Metrics = metrics
|
||||
};
|
||||
|
||||
// Generate evidence
|
||||
var evidenceId = await _evidenceWriter.WriteAsync(plan, result, ct);
|
||||
result = result with { EvidencePacketId = evidenceId };
|
||||
|
||||
_logger.LogInformation(
|
||||
"Completed remediation plan {PlanId} with status {Status}: {Succeeded}/{Total} succeeded",
|
||||
plan.Id, overallStatus, metrics.Succeeded, metrics.TotalTargets);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
private async Task<TargetRemediationResult> RemediateTargetAsync(
|
||||
RemediationTarget target,
|
||||
RemediationPlan plan,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
try
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Remediating target {TargetName} with action {Action}",
|
||||
target.TargetName, target.Action);
|
||||
|
||||
var executionResult = await _executor.ExecuteAsync(target, plan.Policy, ct);
|
||||
|
||||
return new TargetRemediationResult
|
||||
{
|
||||
TargetId = target.TargetId,
|
||||
Status = executionResult.Success
|
||||
? RemediationTargetStatus.Succeeded
|
||||
: RemediationTargetStatus.Failed,
|
||||
Error = executionResult.Error,
|
||||
Duration = _timeProvider.GetUtcNow() - startTime,
|
||||
PreviousDigest = target.Drift.Actual,
|
||||
CurrentDigest = executionResult.NewDigest,
|
||||
Logs = executionResult.Logs
|
||||
};
|
||||
}
|
||||
catch (OperationCanceledException)
|
||||
{
|
||||
return new TargetRemediationResult
|
||||
{
|
||||
TargetId = target.TargetId,
|
||||
Status = RemediationTargetStatus.Skipped,
|
||||
Error = "Cancelled",
|
||||
Duration = _timeProvider.GetUtcNow() - startTime
|
||||
};
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "Failed to remediate target {TargetName}", target.TargetName);
|
||||
|
||||
return new TargetRemediationResult
|
||||
{
|
||||
TargetId = target.TargetId,
|
||||
Status = RemediationTargetStatus.Failed,
|
||||
Error = ex.Message,
|
||||
Duration = _timeProvider.GetUtcNow() - startTime
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<bool> VerifyBatchHealthAsync(
|
||||
TargetRemediationResult[] batchResults,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// Simple health check: all targets succeeded
|
||||
var allSucceeded = batchResults.All(r => r.Status == RemediationTargetStatus.Succeeded);
|
||||
|
||||
if (!allSucceeded)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Batch health check failed: {Failed} of {Total} targets failed",
|
||||
batchResults.Count(r => r.Status == RemediationTargetStatus.Failed),
|
||||
batchResults.Length);
|
||||
}
|
||||
|
||||
await Task.CompletedTask; // Placeholder for actual health check
|
||||
return allSucceeded;
|
||||
}
|
||||
|
||||
private bool IsWithinMaintenanceWindow(RemediationPolicy policy)
|
||||
{
|
||||
if (policy.Trigger == RemediationTrigger.Immediate)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
var currentTime = TimeOnly.FromDateTime(now.DateTime);
|
||||
|
||||
// Check day of week
|
||||
if (!policy.AllowedDays.Contains(now.DayOfWeek))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check time window
|
||||
if (currentTime < policy.AllowedStartTime || currentTime > policy.AllowedEndTime)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check maintenance window if specified
|
||||
if (policy.MaintenanceWindow is not null)
|
||||
{
|
||||
var window = policy.MaintenanceWindow;
|
||||
if (!window.Days.Contains(now.DayOfWeek))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
if (currentTime < window.StartTime || currentTime > window.EndTime)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private ImmutableArray<ScoredDriftItem> ApplyBlastRadiusLimits(
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationPolicy policy)
|
||||
{
|
||||
// Calculate maximum targets based on percentage and absolute limit
|
||||
var maxByPercentage = (int)(drifts.Length * (policy.MaxTargetPercentage / 100.0));
|
||||
var maxTargets = Math.Min(maxByPercentage, policy.AbsoluteMaxTargets);
|
||||
maxTargets = Math.Max(1, maxTargets); // At least 1
|
||||
|
||||
if (drifts.Length <= maxTargets)
|
||||
{
|
||||
return drifts;
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Limiting remediation from {Total} to {Max} targets (blast radius control)",
|
||||
drifts.Length, maxTargets);
|
||||
|
||||
// Take highest severity first
|
||||
return drifts
|
||||
.OrderByDescending(d => d.Severity.Score)
|
||||
.Take(maxTargets)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
private RemediationPlan BuildExecutionPlan(
|
||||
DriftReport driftReport,
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationPolicy policy)
|
||||
{
|
||||
var batches = policy.Strategy switch
|
||||
{
|
||||
RemediationStrategy.AllAtOnce => BuildAllAtOnceBatches(drifts, policy),
|
||||
RemediationStrategy.Rolling => BuildRollingBatches(drifts, policy),
|
||||
RemediationStrategy.Canary => BuildCanaryBatches(drifts, policy),
|
||||
RemediationStrategy.BlueGreen => BuildBlueGreenBatches(drifts, policy),
|
||||
_ => BuildRollingBatches(drifts, policy)
|
||||
};
|
||||
|
||||
return new RemediationPlan
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
DriftReportId = driftReport.TargetId,
|
||||
Policy = policy,
|
||||
Status = RemediationPlanStatus.Created,
|
||||
Batches = batches,
|
||||
CreatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
private ImmutableArray<RemediationBatch> BuildAllAtOnceBatches(
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationPolicy policy)
|
||||
{
|
||||
return
|
||||
[
|
||||
new RemediationBatch
|
||||
{
|
||||
Order = 0,
|
||||
Targets = drifts.Select(d => CreateTarget(d, policy)).ToImmutableArray(),
|
||||
RequiresHealthCheck = false
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
private ImmutableArray<RemediationBatch> BuildRollingBatches(
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationPolicy policy)
|
||||
{
|
||||
var batchSize = policy.MaxConcurrentRemediations;
|
||||
var batches = new List<RemediationBatch>();
|
||||
|
||||
for (int i = 0; i < drifts.Length; i += batchSize)
|
||||
{
|
||||
var batchDrifts = drifts.Skip(i).Take(batchSize).ToImmutableArray();
|
||||
batches.Add(new RemediationBatch
|
||||
{
|
||||
Order = batches.Count,
|
||||
Targets = batchDrifts.Select(d => CreateTarget(d, policy)).ToImmutableArray(),
|
||||
RequiresHealthCheck = true,
|
||||
DelayAfter = TimeSpan.FromSeconds(10)
|
||||
});
|
||||
}
|
||||
|
||||
return batches.ToImmutableArray();
|
||||
}
|
||||
|
||||
private ImmutableArray<RemediationBatch> BuildCanaryBatches(
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationPolicy policy)
|
||||
{
|
||||
if (drifts.IsEmpty)
|
||||
{
|
||||
return [];
|
||||
}
|
||||
|
||||
var batches = new List<RemediationBatch>();
|
||||
|
||||
// First batch: single canary target
|
||||
batches.Add(new RemediationBatch
|
||||
{
|
||||
Order = 0,
|
||||
Targets = [CreateTarget(drifts[0], policy)],
|
||||
RequiresHealthCheck = true,
|
||||
DelayAfter = TimeSpan.FromMinutes(5) // Extended observation period
|
||||
});
|
||||
|
||||
// Remaining targets in rolling batches
|
||||
if (drifts.Length > 1)
|
||||
{
|
||||
var remaining = drifts.Skip(1).ToImmutableArray();
|
||||
var rollingBatches = BuildRollingBatches(remaining, policy);
|
||||
foreach (var batch in rollingBatches)
|
||||
{
|
||||
batches.Add(batch with { Order = batches.Count });
|
||||
}
|
||||
}
|
||||
|
||||
return batches.ToImmutableArray();
|
||||
}
|
||||
|
||||
private ImmutableArray<RemediationBatch> BuildBlueGreenBatches(
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationPolicy policy)
|
||||
{
|
||||
// Blue-green: all at once but with extended health check
|
||||
return
|
||||
[
|
||||
new RemediationBatch
|
||||
{
|
||||
Order = 0,
|
||||
Targets = drifts.Select(d => CreateTarget(d, policy)).ToImmutableArray(),
|
||||
RequiresHealthCheck = true,
|
||||
DelayAfter = TimeSpan.FromMinutes(2)
|
||||
}
|
||||
];
|
||||
}
|
||||
|
||||
private RemediationTarget CreateTarget(ScoredDriftItem scored, RemediationPolicy policy)
|
||||
{
|
||||
return new RemediationTarget
|
||||
{
|
||||
TargetId = scored.Drift.ComponentId ?? Guid.NewGuid(),
|
||||
TargetName = scored.Drift.Name,
|
||||
Drift = scored.Drift,
|
||||
Severity = scored.Severity,
|
||||
Action = policy.Action
|
||||
};
|
||||
}
|
||||
|
||||
private RemediationPlan CreateEmptyPlan(DriftReport driftReport, RemediationPolicy policy)
|
||||
{
|
||||
return new RemediationPlan
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
DriftReportId = driftReport.TargetId,
|
||||
Policy = policy,
|
||||
Status = RemediationPlanStatus.Succeeded,
|
||||
Batches = [],
|
||||
CreatedAt = _timeProvider.GetUtcNow(),
|
||||
CompletedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
}
|
||||
|
||||
private RemediationPlan CreateDeferredPlan(
|
||||
DriftReport driftReport,
|
||||
RemediationPolicy policy,
|
||||
string reason)
|
||||
{
|
||||
return new RemediationPlan
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
DriftReportId = driftReport.TargetId,
|
||||
Policy = policy,
|
||||
Status = RemediationPlanStatus.Deferred,
|
||||
Batches = [],
|
||||
CreatedAt = _timeProvider.GetUtcNow(),
|
||||
DeferralReason = reason
|
||||
};
|
||||
}
|
||||
|
||||
private static RemediationMetrics CalculateMetrics(
|
||||
ImmutableArray<TargetRemediationResult> results,
|
||||
TimeSpan totalDuration)
|
||||
{
|
||||
return new RemediationMetrics
|
||||
{
|
||||
TotalTargets = results.Length,
|
||||
Succeeded = results.Count(r => r.Status == RemediationTargetStatus.Succeeded),
|
||||
Failed = results.Count(r => r.Status == RemediationTargetStatus.Failed),
|
||||
Skipped = results.Count(r => r.Status == RemediationTargetStatus.Skipped),
|
||||
TotalDuration = totalDuration
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for executing remediation actions.
|
||||
/// </summary>
|
||||
public interface IRemediationExecutor
|
||||
{
|
||||
/// <summary>
|
||||
/// Executes a remediation action on a target.
|
||||
/// </summary>
|
||||
Task<RemediationExecutionResult> ExecuteAsync(
|
||||
RemediationTarget target,
|
||||
RemediationPolicy policy,
|
||||
CancellationToken ct);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a single remediation execution.
|
||||
/// </summary>
|
||||
public sealed record RemediationExecutionResult(
|
||||
bool Success,
|
||||
string? Error,
|
||||
string? NewDigest,
|
||||
ImmutableArray<string> Logs);
|
||||
|
||||
/// <summary>
|
||||
/// Interface for writing remediation evidence.
|
||||
/// </summary>
|
||||
public interface IRemediationEvidenceWriter
|
||||
{
|
||||
/// <summary>
|
||||
/// Writes evidence for a remediation.
|
||||
/// </summary>
|
||||
Task<Guid> WriteAsync(
|
||||
RemediationPlan plan,
|
||||
RemediationResult result,
|
||||
CancellationToken ct);
|
||||
}
|
||||
@@ -0,0 +1,185 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Evidence record for a remediation action.
|
||||
/// </summary>
|
||||
public sealed record RemediationEvidence
|
||||
{
|
||||
/// <summary>
|
||||
/// Unique evidence ID.
|
||||
/// </summary>
|
||||
public required Guid Id { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Type of evidence.
|
||||
/// </summary>
|
||||
public string Type => "remediation";
|
||||
|
||||
/// <summary>
|
||||
/// Version of the evidence schema.
|
||||
/// </summary>
|
||||
public string SchemaVersion => "1.0";
|
||||
|
||||
/// <summary>
|
||||
/// When the evidence was created.
|
||||
/// </summary>
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The remediation plan ID.
|
||||
/// </summary>
|
||||
public required Guid PlanId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The drift report ID that triggered remediation.
|
||||
/// </summary>
|
||||
public required Guid DriftReportId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The policy used for remediation.
|
||||
/// </summary>
|
||||
public required RemediationPolicySnapshot Policy { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Environment ID.
|
||||
/// </summary>
|
||||
public required Guid EnvironmentId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Environment name.
|
||||
/// </summary>
|
||||
public required string EnvironmentName { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Overall remediation status.
|
||||
/// </summary>
|
||||
public required RemediationResultStatus Status { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Target evidence records.
|
||||
/// </summary>
|
||||
public required ImmutableArray<TargetEvidence> Targets { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Aggregated metrics.
|
||||
/// </summary>
|
||||
public required RemediationMetrics Metrics { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Who or what initiated the remediation.
|
||||
/// </summary>
|
||||
public required string InitiatedBy { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether this was automatic or manual.
|
||||
/// </summary>
|
||||
public required bool IsAutomatic { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Linked evidence IDs (e.g., drift report evidence).
|
||||
/// </summary>
|
||||
public ImmutableArray<Guid> LinkedEvidence { get; init; } = [];
|
||||
|
||||
/// <summary>
|
||||
/// Optional signature of this evidence.
|
||||
/// </summary>
|
||||
public string? Signature { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Algorithm used for signature.
|
||||
/// </summary>
|
||||
public string? SignatureAlgorithm { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Snapshot of policy at time of remediation.
|
||||
/// </summary>
|
||||
public sealed record RemediationPolicySnapshot
|
||||
{
|
||||
public required Guid Id { get; init; }
|
||||
public required string Name { get; init; }
|
||||
public required RemediationTrigger Trigger { get; init; }
|
||||
public required RemediationAction Action { get; init; }
|
||||
public required RemediationStrategy Strategy { get; init; }
|
||||
public required DriftSeverityLevel MinimumSeverity { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evidence for a single target remediation.
|
||||
/// </summary>
|
||||
public sealed record TargetEvidence
|
||||
{
|
||||
/// <summary>
|
||||
/// Target ID.
|
||||
/// </summary>
|
||||
public required Guid TargetId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Target name.
|
||||
/// </summary>
|
||||
public required string TargetName { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Drift type that was remediated.
|
||||
/// </summary>
|
||||
public required DriftType DriftType { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Action taken.
|
||||
/// </summary>
|
||||
public required RemediationAction Action { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Result status.
|
||||
/// </summary>
|
||||
public required RemediationTargetStatus Status { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// State before remediation.
|
||||
/// </summary>
|
||||
public required StateSnapshot Before { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// State after remediation.
|
||||
/// </summary>
|
||||
public required StateSnapshot After { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Duration of remediation.
|
||||
/// </summary>
|
||||
public required TimeSpan Duration { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Error if failed.
|
||||
/// </summary>
|
||||
public string? Error { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Snapshot of target state.
|
||||
/// </summary>
|
||||
public sealed record StateSnapshot
|
||||
{
|
||||
/// <summary>
|
||||
/// Image digest.
|
||||
/// </summary>
|
||||
public string? Digest { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Container status.
|
||||
/// </summary>
|
||||
public string? Status { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Additional state attributes.
|
||||
/// </summary>
|
||||
public ImmutableDictionary<string, string> Attributes { get; init; } =
|
||||
ImmutableDictionary<string, string>.Empty;
|
||||
|
||||
/// <summary>
|
||||
/// When this snapshot was taken.
|
||||
/// </summary>
|
||||
public required DateTimeOffset Timestamp { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,233 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// A plan for remediating drift.
|
||||
/// </summary>
|
||||
public sealed record RemediationPlan
|
||||
{
|
||||
/// <summary>
|
||||
/// Unique identifier for this plan.
|
||||
/// </summary>
|
||||
public required Guid Id { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The drift report this plan addresses.
|
||||
/// </summary>
|
||||
public required Guid DriftReportId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The policy used to create this plan.
|
||||
/// </summary>
|
||||
public required RemediationPolicy Policy { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Current status of the plan.
|
||||
/// </summary>
|
||||
public required RemediationPlanStatus Status { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Batches of targets to remediate.
|
||||
/// </summary>
|
||||
public required ImmutableArray<RemediationBatch> Batches { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// When the plan was created.
|
||||
/// </summary>
|
||||
public required DateTimeOffset CreatedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// When the plan is scheduled to execute.
|
||||
/// </summary>
|
||||
public DateTimeOffset? ScheduledFor { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// When execution started.
|
||||
/// </summary>
|
||||
public DateTimeOffset? StartedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// When execution completed.
|
||||
/// </summary>
|
||||
public DateTimeOffset? CompletedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Reason for deferral if status is Deferred.
|
||||
/// </summary>
|
||||
public string? DeferralReason { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Creates a deferred plan waiting for maintenance window.
|
||||
/// </summary>
|
||||
public static RemediationPlan Deferred(
|
||||
ImmutableArray<ScoredDriftItem> drifts,
|
||||
RemediationWindow? maintenanceWindow,
|
||||
RemediationPolicy policy,
|
||||
Guid driftReportId)
|
||||
{
|
||||
return new RemediationPlan
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
DriftReportId = driftReportId,
|
||||
Policy = policy,
|
||||
Status = RemediationPlanStatus.Deferred,
|
||||
Batches = [],
|
||||
CreatedAt = DateTimeOffset.UtcNow,
|
||||
ScheduledFor = maintenanceWindow is not null
|
||||
? CalculateNextWindow(maintenanceWindow)
|
||||
: null,
|
||||
DeferralReason = "Waiting for maintenance window"
|
||||
};
|
||||
}
|
||||
|
||||
private static DateTimeOffset? CalculateNextWindow(RemediationWindow window)
|
||||
{
|
||||
var now = DateTimeOffset.UtcNow;
|
||||
var today = DateOnly.FromDateTime(now.DateTime);
|
||||
var currentTime = TimeOnly.FromDateTime(now.DateTime);
|
||||
|
||||
// Check if we're within the window today
|
||||
if (window.Days.Contains(now.DayOfWeek) &&
|
||||
currentTime >= window.StartTime &&
|
||||
currentTime <= window.EndTime)
|
||||
{
|
||||
return now;
|
||||
}
|
||||
|
||||
// Find the next available window
|
||||
for (int i = 0; i <= 7; i++)
|
||||
{
|
||||
var checkDate = today.AddDays(i);
|
||||
var checkDay = checkDate.DayOfWeek;
|
||||
|
||||
if (!window.Days.Contains(checkDay))
|
||||
continue;
|
||||
|
||||
var windowStart = new DateTime(checkDate, window.StartTime, DateTimeKind.Utc);
|
||||
|
||||
if (i == 0 && currentTime > window.EndTime)
|
||||
continue; // Already past today's window
|
||||
|
||||
if (windowStart > now.DateTime)
|
||||
{
|
||||
return new DateTimeOffset(windowStart, TimeSpan.Zero);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Status of a remediation plan.
|
||||
/// </summary>
|
||||
public enum RemediationPlanStatus
|
||||
{
|
||||
/// <summary>
|
||||
/// Plan created but not yet started.
|
||||
/// </summary>
|
||||
Created,
|
||||
|
||||
/// <summary>
|
||||
/// Plan scheduled for future execution.
|
||||
/// </summary>
|
||||
Scheduled,
|
||||
|
||||
/// <summary>
|
||||
/// Plan deferred waiting for maintenance window.
|
||||
/// </summary>
|
||||
Deferred,
|
||||
|
||||
/// <summary>
|
||||
/// Plan is currently executing.
|
||||
/// </summary>
|
||||
Running,
|
||||
|
||||
/// <summary>
|
||||
/// Plan paused by human intervention.
|
||||
/// </summary>
|
||||
Paused,
|
||||
|
||||
/// <summary>
|
||||
/// Plan completed successfully.
|
||||
/// </summary>
|
||||
Succeeded,
|
||||
|
||||
/// <summary>
|
||||
/// Some targets remediated, some failed.
|
||||
/// </summary>
|
||||
PartialSuccess,
|
||||
|
||||
/// <summary>
|
||||
/// Plan failed.
|
||||
/// </summary>
|
||||
Failed,
|
||||
|
||||
/// <summary>
|
||||
/// Plan was cancelled.
|
||||
/// </summary>
|
||||
Cancelled
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A batch of targets to remediate.
|
||||
/// </summary>
|
||||
public sealed record RemediationBatch
|
||||
{
|
||||
/// <summary>
|
||||
/// Order of this batch in the execution sequence.
|
||||
/// </summary>
|
||||
public required int Order { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Targets in this batch.
|
||||
/// </summary>
|
||||
public required ImmutableArray<RemediationTarget> Targets { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Delay after completing this batch.
|
||||
/// </summary>
|
||||
public TimeSpan? DelayAfter { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether to run health check after this batch.
|
||||
/// </summary>
|
||||
public bool RequiresHealthCheck { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A target to remediate.
|
||||
/// </summary>
|
||||
public sealed record RemediationTarget
|
||||
{
|
||||
/// <summary>
|
||||
/// Target ID.
|
||||
/// </summary>
|
||||
public required Guid TargetId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Target name for display.
|
||||
/// </summary>
|
||||
public required string TargetName { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The drift being remediated.
|
||||
/// </summary>
|
||||
public required DriftItem Drift { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Calculated severity.
|
||||
/// </summary>
|
||||
public required DriftSeverity Severity { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Action to take.
|
||||
/// </summary>
|
||||
public required RemediationAction Action { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Action-specific payload (e.g., compose file, rollback digest).
|
||||
/// </summary>
|
||||
public string? ActionPayload { get; init; }
|
||||
}
|
||||
@@ -0,0 +1,285 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Defines when and how to remediate drift.
|
||||
/// </summary>
|
||||
public sealed record RemediationPolicy
|
||||
{
|
||||
/// <summary>
|
||||
/// Unique identifier for this policy.
|
||||
/// </summary>
|
||||
public required Guid Id { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Human-readable name for the policy.
|
||||
/// </summary>
|
||||
public required string Name { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Optional description of the policy purpose.
|
||||
/// </summary>
|
||||
public string? Description { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Environment this policy applies to.
|
||||
/// </summary>
|
||||
public required Guid EnvironmentId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether this policy is currently active.
|
||||
/// </summary>
|
||||
public bool IsActive { get; init; } = true;
|
||||
|
||||
// === Triggers ===
|
||||
|
||||
/// <summary>
|
||||
/// When to trigger remediation.
|
||||
/// </summary>
|
||||
public required RemediationTrigger Trigger { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Minimum severity level to trigger remediation.
|
||||
/// </summary>
|
||||
public DriftSeverityLevel MinimumSeverity { get; init; } = DriftSeverityLevel.Medium;
|
||||
|
||||
/// <summary>
|
||||
/// Minimum drift age before remediation (default: 5 minutes).
|
||||
/// </summary>
|
||||
public TimeSpan MinimumDriftAge { get; init; } = TimeSpan.FromMinutes(5);
|
||||
|
||||
/// <summary>
|
||||
/// Maximum drift age before escalating to manual intervention.
|
||||
/// </summary>
|
||||
public TimeSpan MaximumDriftAge { get; init; } = TimeSpan.FromHours(24);
|
||||
|
||||
// === Actions ===
|
||||
|
||||
/// <summary>
|
||||
/// Action to take when remediating.
|
||||
/// </summary>
|
||||
public required RemediationAction Action { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Strategy for applying remediation.
|
||||
/// </summary>
|
||||
public RemediationStrategy Strategy { get; init; } = RemediationStrategy.Rolling;
|
||||
|
||||
// === Safety Limits ===
|
||||
|
||||
/// <summary>
|
||||
/// Maximum concurrent remediations (default: 1).
|
||||
/// </summary>
|
||||
public int MaxConcurrentRemediations { get; init; } = 1;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum remediations per hour (default: 10).
|
||||
/// </summary>
|
||||
public int MaxRemediationsPerHour { get; init; } = 10;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum remediations per day (default: 50).
|
||||
/// </summary>
|
||||
public int MaxRemediationsPerDay { get; init; } = 50;
|
||||
|
||||
/// <summary>
|
||||
/// Cooldown period between remediations (default: 5 minutes).
|
||||
/// </summary>
|
||||
public TimeSpan CooldownPeriod { get; init; } = TimeSpan.FromMinutes(5);
|
||||
|
||||
/// <summary>
|
||||
/// Maximum percentage of targets to remediate at once (default: 25%).
|
||||
/// </summary>
|
||||
public int MaxTargetPercentage { get; init; } = 25;
|
||||
|
||||
/// <summary>
|
||||
/// Absolute maximum targets to remediate at once (default: 10).
|
||||
/// </summary>
|
||||
public int AbsoluteMaxTargets { get; init; } = 10;
|
||||
|
||||
/// <summary>
|
||||
/// Minimum healthy percentage required before remediation (default: 75%).
|
||||
/// </summary>
|
||||
public double MinHealthyPercentage { get; init; } = 0.75;
|
||||
|
||||
// === Schedule ===
|
||||
|
||||
/// <summary>
|
||||
/// Optional maintenance window for scheduled remediation.
|
||||
/// </summary>
|
||||
public RemediationWindow? MaintenanceWindow { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Days when remediation is allowed.
|
||||
/// </summary>
|
||||
public ImmutableArray<DayOfWeek> AllowedDays { get; init; } =
|
||||
[DayOfWeek.Monday, DayOfWeek.Tuesday, DayOfWeek.Wednesday, DayOfWeek.Thursday, DayOfWeek.Friday];
|
||||
|
||||
/// <summary>
|
||||
/// Start time when remediation is allowed (UTC).
|
||||
/// </summary>
|
||||
public TimeOnly AllowedStartTime { get; init; } = new(6, 0);
|
||||
|
||||
/// <summary>
|
||||
/// End time when remediation is allowed (UTC).
|
||||
/// </summary>
|
||||
public TimeOnly AllowedEndTime { get; init; } = new(22, 0);
|
||||
|
||||
// === Notifications ===
|
||||
|
||||
/// <summary>
|
||||
/// Notification configuration.
|
||||
/// </summary>
|
||||
public NotificationConfig? Notifications { get; init; }
|
||||
|
||||
// === Audit ===
|
||||
|
||||
/// <summary>
|
||||
/// When the policy was created.
|
||||
/// </summary>
|
||||
public DateTimeOffset CreatedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// When the policy was last updated.
|
||||
/// </summary>
|
||||
public DateTimeOffset? UpdatedAt { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Who created this policy.
|
||||
/// </summary>
|
||||
public string? CreatedBy { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// When to trigger remediation.
|
||||
/// </summary>
|
||||
public enum RemediationTrigger
|
||||
{
|
||||
/// <summary>
|
||||
/// Remediate as soon as detected.
|
||||
/// </summary>
|
||||
Immediate,
|
||||
|
||||
/// <summary>
|
||||
/// Wait for maintenance window.
|
||||
/// </summary>
|
||||
Scheduled,
|
||||
|
||||
/// <summary>
|
||||
/// Remediate after drift exceeds age threshold.
|
||||
/// </summary>
|
||||
AgeThreshold,
|
||||
|
||||
/// <summary>
|
||||
/// Remediate when severity increases.
|
||||
/// </summary>
|
||||
SeverityEscalation,
|
||||
|
||||
/// <summary>
|
||||
/// Notification only, human initiates.
|
||||
/// </summary>
|
||||
Manual
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Action to take when remediating.
|
||||
/// </summary>
|
||||
public enum RemediationAction
|
||||
{
|
||||
/// <summary>
|
||||
/// Alert but don't act.
|
||||
/// </summary>
|
||||
NotifyOnly,
|
||||
|
||||
/// <summary>
|
||||
/// Restore to expected state.
|
||||
/// </summary>
|
||||
Reconcile,
|
||||
|
||||
/// <summary>
|
||||
/// Rollback to previous known-good release.
|
||||
/// </summary>
|
||||
Rollback,
|
||||
|
||||
/// <summary>
|
||||
/// Adjust replica count.
|
||||
/// </summary>
|
||||
Scale,
|
||||
|
||||
/// <summary>
|
||||
/// Restart containers.
|
||||
/// </summary>
|
||||
Restart,
|
||||
|
||||
/// <summary>
|
||||
/// Isolate drifted targets from traffic.
|
||||
/// </summary>
|
||||
Quarantine
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Strategy for applying remediation.
|
||||
/// </summary>
|
||||
public enum RemediationStrategy
|
||||
{
|
||||
/// <summary>
|
||||
/// Remediate all drifted targets simultaneously.
|
||||
/// </summary>
|
||||
AllAtOnce,
|
||||
|
||||
/// <summary>
|
||||
/// Remediate one at a time with health checks.
|
||||
/// </summary>
|
||||
Rolling,
|
||||
|
||||
/// <summary>
|
||||
/// Remediate one, verify, then proceed.
|
||||
/// </summary>
|
||||
Canary,
|
||||
|
||||
/// <summary>
|
||||
/// Deploy to standby, switch traffic.
|
||||
/// </summary>
|
||||
BlueGreen
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Maintenance window for scheduled remediation.
|
||||
/// </summary>
|
||||
public sealed record RemediationWindow(
|
||||
TimeOnly StartTime,
|
||||
TimeOnly EndTime,
|
||||
ImmutableArray<DayOfWeek> Days,
|
||||
string? Timezone = null);
|
||||
|
||||
/// <summary>
|
||||
/// Notification configuration.
|
||||
/// </summary>
|
||||
public sealed record NotificationConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// Notify before starting remediation.
|
||||
/// </summary>
|
||||
public bool NotifyOnStart { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Notify when remediation completes successfully.
|
||||
/// </summary>
|
||||
public bool NotifyOnSuccess { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Notify when remediation fails.
|
||||
/// </summary>
|
||||
public bool NotifyOnFailure { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Channels to notify (email, slack, teams, pagerduty).
|
||||
/// </summary>
|
||||
public ImmutableArray<string> Channels { get; init; } = [];
|
||||
|
||||
/// <summary>
|
||||
/// Recipients for notifications.
|
||||
/// </summary>
|
||||
public ImmutableArray<string> Recipients { get; init; } = [];
|
||||
}
|
||||
@@ -0,0 +1,175 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Rate limiter for remediation operations.
|
||||
/// </summary>
|
||||
public sealed class RemediationRateLimiter
|
||||
{
|
||||
private readonly IRemediationHistoryStore _historyStore;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<RemediationRateLimiter> _logger;
|
||||
|
||||
public RemediationRateLimiter(
|
||||
IRemediationHistoryStore historyStore,
|
||||
TimeProvider timeProvider,
|
||||
ILogger<RemediationRateLimiter> logger)
|
||||
{
|
||||
_historyStore = historyStore;
|
||||
_timeProvider = timeProvider;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks if remediation is allowed based on rate limits.
|
||||
/// </summary>
|
||||
public async Task<RateLimitResult> CheckAsync(
|
||||
RemediationPolicy policy,
|
||||
int requestedCount,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(policy);
|
||||
|
||||
var now = _timeProvider.GetUtcNow();
|
||||
|
||||
// Check hourly limit
|
||||
var hourlyCount = await _historyStore.GetRemediationCountAsync(
|
||||
policy.Id,
|
||||
now.AddHours(-1),
|
||||
now,
|
||||
ct);
|
||||
|
||||
if (hourlyCount + requestedCount > policy.MaxRemediationsPerHour)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Hourly rate limit exceeded for policy {PolicyName}: {Current}/{Max}",
|
||||
policy.Name, hourlyCount, policy.MaxRemediationsPerHour);
|
||||
|
||||
return RateLimitResult.Exceeded(
|
||||
$"Hourly limit exceeded: {hourlyCount}/{policy.MaxRemediationsPerHour}");
|
||||
}
|
||||
|
||||
// Check daily limit
|
||||
var startOfDay = new DateTimeOffset(now.Date, now.Offset);
|
||||
var dailyCount = await _historyStore.GetRemediationCountAsync(
|
||||
policy.Id,
|
||||
startOfDay,
|
||||
now,
|
||||
ct);
|
||||
|
||||
if (dailyCount + requestedCount > policy.MaxRemediationsPerDay)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
"Daily rate limit exceeded for policy {PolicyName}: {Current}/{Max}",
|
||||
policy.Name, dailyCount, policy.MaxRemediationsPerDay);
|
||||
|
||||
return RateLimitResult.Exceeded(
|
||||
$"Daily limit exceeded: {dailyCount}/{policy.MaxRemediationsPerDay}");
|
||||
}
|
||||
|
||||
// Check cooldown period
|
||||
var lastRemediation = await _historyStore.GetLastRemediationAsync(policy.Id, ct);
|
||||
if (lastRemediation is not null && lastRemediation.CompletedAt.HasValue)
|
||||
{
|
||||
var timeSinceLast = now - lastRemediation.CompletedAt.Value;
|
||||
if (timeSinceLast < policy.CooldownPeriod)
|
||||
{
|
||||
var remaining = policy.CooldownPeriod - timeSinceLast;
|
||||
_logger.LogInformation(
|
||||
"Cooldown period active for policy {PolicyName}: {Remaining} remaining",
|
||||
policy.Name, remaining);
|
||||
|
||||
return RateLimitResult.Cooldown(remaining);
|
||||
}
|
||||
}
|
||||
|
||||
return RateLimitResult.Allowed(requestedCount);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a rate limit check.
|
||||
/// </summary>
|
||||
public sealed record RateLimitResult
|
||||
{
|
||||
/// <summary>
|
||||
/// Whether the request is allowed.
|
||||
/// </summary>
|
||||
public required bool IsAllowed { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Number of requests allowed.
|
||||
/// </summary>
|
||||
public int AllowedCount { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Reason if not allowed.
|
||||
/// </summary>
|
||||
public string? Reason { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Remaining cooldown time if applicable.
|
||||
/// </summary>
|
||||
public TimeSpan? CooldownRemaining { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Creates an allowed result.
|
||||
/// </summary>
|
||||
public static RateLimitResult Allowed(int count) => new()
|
||||
{
|
||||
IsAllowed = true,
|
||||
AllowedCount = count
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Creates an exceeded result.
|
||||
/// </summary>
|
||||
public static RateLimitResult Exceeded(string reason) => new()
|
||||
{
|
||||
IsAllowed = false,
|
||||
AllowedCount = 0,
|
||||
Reason = reason
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Creates a cooldown result.
|
||||
/// </summary>
|
||||
public static RateLimitResult Cooldown(TimeSpan remaining) => new()
|
||||
{
|
||||
IsAllowed = false,
|
||||
AllowedCount = 0,
|
||||
Reason = $"Cooldown period active: {remaining.TotalSeconds:F0}s remaining",
|
||||
CooldownRemaining = remaining
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for remediation history storage (for rate limiting).
|
||||
/// </summary>
|
||||
public interface IRemediationHistoryStore
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the count of remediations in a time period.
|
||||
/// </summary>
|
||||
Task<int> GetRemediationCountAsync(
|
||||
Guid policyId,
|
||||
DateTimeOffset from,
|
||||
DateTimeOffset to,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Gets the last remediation for a policy.
|
||||
/// </summary>
|
||||
Task<RemediationPlan?> GetLastRemediationAsync(
|
||||
Guid policyId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Records a completed remediation.
|
||||
/// </summary>
|
||||
Task RecordRemediationAsync(
|
||||
RemediationPlan plan,
|
||||
RemediationResult result,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
@@ -0,0 +1,194 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Result of a remediation execution.
|
||||
/// </summary>
|
||||
public sealed record RemediationResult
|
||||
{
|
||||
/// <summary>
|
||||
/// The plan that was executed.
|
||||
/// </summary>
|
||||
public required Guid PlanId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Overall status of the remediation.
|
||||
/// </summary>
|
||||
public required RemediationResultStatus Status { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Results for each target.
|
||||
/// </summary>
|
||||
public required ImmutableArray<TargetRemediationResult> TargetResults { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Evidence packet ID for this remediation.
|
||||
/// </summary>
|
||||
public Guid? EvidencePacketId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Total duration of the remediation.
|
||||
/// </summary>
|
||||
public required TimeSpan Duration { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Aggregated metrics.
|
||||
/// </summary>
|
||||
public required RemediationMetrics Metrics { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Overall result status.
|
||||
/// </summary>
|
||||
public enum RemediationResultStatus
|
||||
{
|
||||
/// <summary>
|
||||
/// All targets remediated successfully.
|
||||
/// </summary>
|
||||
Success,
|
||||
|
||||
/// <summary>
|
||||
/// Some targets succeeded, some failed.
|
||||
/// </summary>
|
||||
PartialSuccess,
|
||||
|
||||
/// <summary>
|
||||
/// All targets failed.
|
||||
/// </summary>
|
||||
Failed,
|
||||
|
||||
/// <summary>
|
||||
/// Remediation was cancelled.
|
||||
/// </summary>
|
||||
Cancelled,
|
||||
|
||||
/// <summary>
|
||||
/// Remediation timed out.
|
||||
/// </summary>
|
||||
TimedOut
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result for a single target.
|
||||
/// </summary>
|
||||
public sealed record TargetRemediationResult
|
||||
{
|
||||
/// <summary>
|
||||
/// Target ID.
|
||||
/// </summary>
|
||||
public required Guid TargetId { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Status for this target.
|
||||
/// </summary>
|
||||
public required RemediationTargetStatus Status { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Error message if failed.
|
||||
/// </summary>
|
||||
public string? Error { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Duration for this target.
|
||||
/// </summary>
|
||||
public required TimeSpan Duration { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Previous digest before remediation.
|
||||
/// </summary>
|
||||
public string? PreviousDigest { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Current digest after remediation.
|
||||
/// </summary>
|
||||
public string? CurrentDigest { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Logs from the remediation.
|
||||
/// </summary>
|
||||
public ImmutableArray<string> Logs { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Status for a remediation target.
|
||||
/// </summary>
|
||||
public enum RemediationTargetStatus
|
||||
{
|
||||
/// <summary>
|
||||
/// Target pending remediation.
|
||||
/// </summary>
|
||||
Pending,
|
||||
|
||||
/// <summary>
|
||||
/// Target remediation in progress.
|
||||
/// </summary>
|
||||
InProgress,
|
||||
|
||||
/// <summary>
|
||||
/// Target remediated successfully.
|
||||
/// </summary>
|
||||
Succeeded,
|
||||
|
||||
/// <summary>
|
||||
/// Target remediation failed.
|
||||
/// </summary>
|
||||
Failed,
|
||||
|
||||
/// <summary>
|
||||
/// Target was skipped.
|
||||
/// </summary>
|
||||
Skipped,
|
||||
|
||||
/// <summary>
|
||||
/// Target remediation timed out.
|
||||
/// </summary>
|
||||
TimedOut
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Aggregated metrics for a remediation.
|
||||
/// </summary>
|
||||
public sealed record RemediationMetrics
|
||||
{
|
||||
/// <summary>
|
||||
/// Total number of targets.
|
||||
/// </summary>
|
||||
public required int TotalTargets { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Number of successful remediations.
|
||||
/// </summary>
|
||||
public required int Succeeded { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Number of failed remediations.
|
||||
/// </summary>
|
||||
public required int Failed { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Number of skipped targets.
|
||||
/// </summary>
|
||||
public required int Skipped { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Total duration.
|
||||
/// </summary>
|
||||
public required TimeSpan TotalDuration { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Average duration per target.
|
||||
/// </summary>
|
||||
public TimeSpan AverageTargetDuration =>
|
||||
TotalTargets > 0
|
||||
? TimeSpan.FromTicks(TotalDuration.Ticks / TotalTargets)
|
||||
: TimeSpan.Zero;
|
||||
|
||||
/// <summary>
|
||||
/// Success rate as a percentage.
|
||||
/// </summary>
|
||||
public double SuccessRate =>
|
||||
TotalTargets > 0
|
||||
? (double)Succeeded / TotalTargets * 100
|
||||
: 0;
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Configuration for severity scoring weights and thresholds.
|
||||
/// </summary>
|
||||
public sealed record SeverityScoringConfig
|
||||
{
|
||||
/// <summary>
|
||||
/// Weight for drift type factor (default: 30%).
|
||||
/// </summary>
|
||||
public double DriftTypeWeight { get; init; } = 0.30;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for drift age factor (default: 25%).
|
||||
/// </summary>
|
||||
public double DriftAgeWeight { get; init; } = 0.25;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for environment criticality factor (default: 20%).
|
||||
/// </summary>
|
||||
public double EnvironmentCriticalityWeight { get; init; } = 0.20;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for component criticality factor (default: 15%).
|
||||
/// </summary>
|
||||
public double ComponentCriticalityWeight { get; init; } = 0.15;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for blast radius factor (default: 10%).
|
||||
/// </summary>
|
||||
public double BlastRadiusWeight { get; init; } = 0.10;
|
||||
|
||||
/// <summary>
|
||||
/// Score threshold for immediate action requirement.
|
||||
/// </summary>
|
||||
public int ImmediateThreshold { get; init; } = 90;
|
||||
|
||||
/// <summary>
|
||||
/// Default component criticality if not specified.
|
||||
/// </summary>
|
||||
public int DefaultComponentCriticality { get; init; } = 50;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Context information needed for severity scoring.
|
||||
/// </summary>
|
||||
public sealed record ScoringContext
|
||||
{
|
||||
/// <summary>
|
||||
/// Current timestamp for age calculations.
|
||||
/// </summary>
|
||||
public required DateTimeOffset Now { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The environment being scored.
|
||||
/// </summary>
|
||||
public required EnvironmentInfo Environment { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Component criticality scores by component ID.
|
||||
/// </summary>
|
||||
public IReadOnlyDictionary<Guid, int> ComponentCriticality { get; init; } =
|
||||
new Dictionary<Guid, int>();
|
||||
|
||||
/// <summary>
|
||||
/// Dependency graph for blast radius calculation.
|
||||
/// </summary>
|
||||
public IDependencyGraph? DependencyGraph { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Environment information for scoring context.
|
||||
/// </summary>
|
||||
public sealed record EnvironmentInfo(
|
||||
Guid Id,
|
||||
string Name,
|
||||
EnvironmentCriticality Criticality);
|
||||
|
||||
/// <summary>
|
||||
/// Interface for dependency graph used in blast radius calculation.
|
||||
/// </summary>
|
||||
public interface IDependencyGraph
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the list of components that depend on the specified component.
|
||||
/// </summary>
|
||||
IReadOnlyList<Guid> GetDependents(Guid componentId);
|
||||
}
|
||||
@@ -0,0 +1,165 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Environment.Inventory.Remediation;
|
||||
|
||||
/// <summary>
|
||||
/// Calculates drift severity based on multiple weighted factors.
|
||||
/// </summary>
|
||||
public sealed class SeverityScorer
|
||||
{
|
||||
private readonly SeverityScoringConfig _config;
|
||||
private readonly ILogger<SeverityScorer> _logger;
|
||||
|
||||
public SeverityScorer(
|
||||
SeverityScoringConfig config,
|
||||
ILogger<SeverityScorer> logger)
|
||||
{
|
||||
_config = config;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Calculates severity for a single drift item.
|
||||
/// </summary>
|
||||
public DriftSeverity Score(DriftItem drift, ScoringContext context)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(drift);
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
|
||||
var factors = new List<SeverityFactor>();
|
||||
var totalScore = 0.0;
|
||||
|
||||
// Factor 1: Drift Type (30%)
|
||||
var typeScore = CalculateDriftTypeScore(drift.Type);
|
||||
factors.Add(new SeverityFactor("DriftType", typeScore, _config.DriftTypeWeight));
|
||||
totalScore += typeScore * _config.DriftTypeWeight;
|
||||
|
||||
// Factor 2: Drift Age (25%)
|
||||
var driftAge = context.Now - drift.DetectedAt;
|
||||
var ageScore = CalculateAgeScore(driftAge);
|
||||
factors.Add(new SeverityFactor("DriftAge", ageScore, _config.DriftAgeWeight));
|
||||
totalScore += ageScore * _config.DriftAgeWeight;
|
||||
|
||||
// Factor 3: Environment Criticality (20%)
|
||||
var envScore = CalculateEnvironmentScore(context.Environment.Criticality);
|
||||
factors.Add(new SeverityFactor("EnvironmentCriticality", envScore, _config.EnvironmentCriticalityWeight));
|
||||
totalScore += envScore * _config.EnvironmentCriticalityWeight;
|
||||
|
||||
// Factor 4: Component Criticality (15%)
|
||||
var componentScore = GetComponentCriticality(drift, context);
|
||||
factors.Add(new SeverityFactor("ComponentCriticality", componentScore, _config.ComponentCriticalityWeight));
|
||||
totalScore += componentScore * _config.ComponentCriticalityWeight;
|
||||
|
||||
// Factor 5: Blast Radius (10%)
|
||||
var blastScore = CalculateBlastRadius(drift, context.DependencyGraph);
|
||||
factors.Add(new SeverityFactor("BlastRadius", blastScore, _config.BlastRadiusWeight));
|
||||
totalScore += blastScore * _config.BlastRadiusWeight;
|
||||
|
||||
var finalScore = (int)Math.Round(totalScore);
|
||||
var severity = new DriftSeverity
|
||||
{
|
||||
Level = ScoreToLevel(finalScore),
|
||||
Score = finalScore,
|
||||
Factors = factors.ToImmutableArray(),
|
||||
DriftAge = driftAge,
|
||||
RequiresImmediate = finalScore >= _config.ImmediateThreshold
|
||||
};
|
||||
|
||||
_logger.LogDebug(
|
||||
"Scored drift {DriftName} with severity {Level} (score: {Score})",
|
||||
drift.Name, severity.Level, severity.Score);
|
||||
|
||||
return severity;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Calculates severity for multiple drift items.
|
||||
/// </summary>
|
||||
public ImmutableArray<ScoredDriftItem> ScoreAll(
|
||||
IEnumerable<DriftItem> drifts,
|
||||
ScoringContext context)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(drifts);
|
||||
ArgumentNullException.ThrowIfNull(context);
|
||||
|
||||
return drifts
|
||||
.Select(d => new ScoredDriftItem(d, Score(d, context)))
|
||||
.OrderByDescending(s => s.Severity.Score)
|
||||
.ToImmutableArray();
|
||||
}
|
||||
|
||||
private static int CalculateDriftTypeScore(DriftType type) => type switch
|
||||
{
|
||||
DriftType.Missing => 100,
|
||||
DriftType.DigestMismatch => 80,
|
||||
DriftType.StatusMismatch => 50,
|
||||
DriftType.ConfigMismatch => 40,
|
||||
DriftType.Unexpected => 30,
|
||||
_ => 10
|
||||
};
|
||||
|
||||
private static int CalculateAgeScore(TimeSpan age) => age.TotalMinutes switch
|
||||
{
|
||||
< 5 => 10, // Very fresh - low urgency
|
||||
< 30 => 30, // Recent
|
||||
< 60 => 50, // 1 hour
|
||||
< 240 => 70, // 4 hours
|
||||
< 1440 => 85, // 24 hours
|
||||
_ => 100 // > 24 hours - critical
|
||||
};
|
||||
|
||||
private static int CalculateEnvironmentScore(EnvironmentCriticality criticality) => criticality switch
|
||||
{
|
||||
EnvironmentCriticality.Production => 100,
|
||||
EnvironmentCriticality.Staging => 60,
|
||||
EnvironmentCriticality.Development => 20,
|
||||
_ => 10
|
||||
};
|
||||
|
||||
private int GetComponentCriticality(DriftItem drift, ScoringContext context)
|
||||
{
|
||||
// Try to extract component ID from drift context
|
||||
if (drift.ComponentId.HasValue &&
|
||||
context.ComponentCriticality.TryGetValue(drift.ComponentId.Value, out var criticality))
|
||||
{
|
||||
return criticality;
|
||||
}
|
||||
|
||||
return _config.DefaultComponentCriticality;
|
||||
}
|
||||
|
||||
private static int CalculateBlastRadius(DriftItem drift, IDependencyGraph? graph)
|
||||
{
|
||||
if (graph is null || !drift.ComponentId.HasValue)
|
||||
{
|
||||
return 10; // Default low blast radius if we can't calculate
|
||||
}
|
||||
|
||||
var dependents = graph.GetDependents(drift.ComponentId.Value);
|
||||
return dependents.Count switch
|
||||
{
|
||||
0 => 10,
|
||||
< 3 => 30,
|
||||
< 10 => 60,
|
||||
< 25 => 80,
|
||||
_ => 100
|
||||
};
|
||||
}
|
||||
|
||||
private static DriftSeverityLevel ScoreToLevel(int score) => score switch
|
||||
{
|
||||
>= 90 => DriftSeverityLevel.Critical,
|
||||
>= 75 => DriftSeverityLevel.High,
|
||||
>= 50 => DriftSeverityLevel.Medium,
|
||||
>= 25 => DriftSeverityLevel.Low,
|
||||
_ => DriftSeverityLevel.Info
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A drift item with its calculated severity.
|
||||
/// </summary>
|
||||
public sealed record ScoredDriftItem(
|
||||
DriftItem Drift,
|
||||
DriftSeverity Severity);
|
||||
@@ -0,0 +1,839 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// FederationIntegrationTests.cs
|
||||
// Sprint: SPRINT_20260117_036_ReleaseOrchestrator_multi_region
|
||||
// Task: TASK-036-08 - Integration tests for multi-region scenarios
|
||||
// Description: Tests for region coordination, sync, evidence replication, and routing
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.ReleaseOrchestrator.Federation.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Integration tests for multi-region federation features.
|
||||
/// </summary>
|
||||
public sealed class FederationIntegrationTests
|
||||
{
|
||||
private readonly FakeTimeProvider _timeProvider = new();
|
||||
|
||||
#region Region Coordinator Tests
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_StartGlobalPromotion_CreatesWaves()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, _) = CreateRegionCoordinator();
|
||||
|
||||
// Act
|
||||
var promotion = await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-1",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.Equal(GlobalPromotionStatus.InProgress, promotion.Status);
|
||||
Assert.True(promotion.Waves.Length > 0);
|
||||
Assert.All(promotion.RegionStatuses.Values, s =>
|
||||
Assert.True(s.Status == RegionPromotionState.Pending ||
|
||||
s.Status == RegionPromotionState.InProgress ||
|
||||
s.Status == RegionPromotionState.Completed));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_CanaryStrategy_CanaryRegionsFirst()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, _) = CreateRegionCoordinator();
|
||||
|
||||
// Act
|
||||
var promotion = await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-canary",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Canary
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.True(promotion.Waves.Length >= 2); // At least canary + production waves
|
||||
var firstWave = promotion.Waves.First();
|
||||
Assert.True(firstWave.MinBakeTimeMinutes > 0 || firstWave.WaveNumber == 1);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_Progress_MovesToNextWave()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, _) = CreateRegionCoordinator();
|
||||
|
||||
var promotion = await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-progress",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
// Complete first wave manually
|
||||
foreach (var regionId in promotion.Waves[0].RegionIds)
|
||||
{
|
||||
await coordinator.UpdateRegionStatusAsync(
|
||||
promotion.Id, regionId, RegionPromotionState.Completed);
|
||||
}
|
||||
|
||||
// Act
|
||||
var progressed = await coordinator.ProgressAsync(promotion.Id);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(GlobalPromotionStatus.InProgress, progressed.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_Pause_SetsCorrectStatus()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, _) = CreateRegionCoordinator();
|
||||
|
||||
await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-pause",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
// Act
|
||||
var paused = await coordinator.PauseAsync("promo-pause");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(GlobalPromotionStatus.Paused, paused.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_Resume_ContinuesPromotion()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, _) = CreateRegionCoordinator();
|
||||
|
||||
await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-resume",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
await coordinator.PauseAsync("promo-resume");
|
||||
|
||||
// Act
|
||||
var resumed = await coordinator.ResumeAsync("promo-resume");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(GlobalPromotionStatus.InProgress, resumed.Status);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_Rollback_RollsBackAllRegions()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, federationHub) = CreateRegionCoordinator();
|
||||
|
||||
await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-rollback",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
// Act
|
||||
var rolledBack = await coordinator.RollbackAsync("promo-rollback", "Test rollback");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(GlobalPromotionStatus.RolledBack, rolledBack.Status);
|
||||
Assert.Equal("Test rollback", rolledBack.RollbackReason);
|
||||
Assert.True(federationHub.RollbackCount > 0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task RegionCoordinator_GetCrossRegionHealth_ReturnsHealthStatus()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, _) = CreateRegionCoordinator();
|
||||
|
||||
await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "promo-health",
|
||||
DeploymentId = "deployment-1",
|
||||
TargetVersion = "v2.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
// Act
|
||||
var health = await coordinator.GetCrossRegionHealthAsync("promo-health");
|
||||
|
||||
// Assert
|
||||
Assert.NotEmpty(health.RegionHealths);
|
||||
Assert.True(health.OverallStatus is CrossRegionHealthStatus.Healthy or
|
||||
CrossRegionHealthStatus.Degraded or CrossRegionHealthStatus.Unknown);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Cross-Region Sync Tests
|
||||
|
||||
[Fact]
|
||||
public async Task CrossRegionSync_Replicate_SendsToAllPeers()
|
||||
{
|
||||
// Arrange
|
||||
var (sync, transport) = CreateCrossRegionSync();
|
||||
await sync.InitializeAsync("region-a");
|
||||
|
||||
// Act
|
||||
var result = await sync.ReplicateAsync(new SyncEntry
|
||||
{
|
||||
Key = "test-key",
|
||||
Value = "test-value",
|
||||
Version = 1,
|
||||
VectorClock = new VectorClock().Increment("region-a"),
|
||||
ModifiedAt = _timeProvider.GetUtcNow(),
|
||||
ModifiedBy = "region-a"
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.True(result.SuccessCount > 0);
|
||||
Assert.True(transport.SentMessages.Count > 0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CrossRegionSync_RequestFullSync_SyncsWithPeer()
|
||||
{
|
||||
// Arrange
|
||||
var (sync, _) = CreateCrossRegionSync();
|
||||
await sync.InitializeAsync("region-a");
|
||||
|
||||
// Act
|
||||
var summary = await sync.RequestFullSyncAsync("region-b");
|
||||
|
||||
// Assert
|
||||
Assert.Equal("region-b", summary.PeerRegionId);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CrossRegionSync_ConflictDetection_RecordsConflict()
|
||||
{
|
||||
// Arrange
|
||||
var (sync, _) = CreateCrossRegionSync();
|
||||
await sync.InitializeAsync("region-a");
|
||||
|
||||
bool conflictDetected = false;
|
||||
sync.ConflictDetected += (_, _) => conflictDetected = true;
|
||||
|
||||
// Simulate receiving a conflicting message
|
||||
await sync.ReceiveAsync(new SyncMessage
|
||||
{
|
||||
Type = SyncMessageType.Replicate,
|
||||
SourceRegionId = "region-b",
|
||||
Entry = new SyncEntry
|
||||
{
|
||||
Key = "existing-key",
|
||||
Value = "conflicting-value",
|
||||
Version = 2,
|
||||
VectorClock = new VectorClock().Increment("region-b"),
|
||||
ModifiedAt = _timeProvider.GetUtcNow(),
|
||||
ModifiedBy = "region-b"
|
||||
},
|
||||
SentAt = _timeProvider.GetUtcNow()
|
||||
});
|
||||
|
||||
// Note: Conflict detection depends on existing entry in store
|
||||
// This test validates the mechanism exists
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task CrossRegionSync_GetSyncStates_ReturnsAllPeers()
|
||||
{
|
||||
// Arrange
|
||||
var (sync, _) = CreateCrossRegionSync();
|
||||
await sync.InitializeAsync("region-a");
|
||||
|
||||
// Act
|
||||
var states = sync.GetSyncStates();
|
||||
|
||||
// Assert
|
||||
Assert.True(states.Length >= 0);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Evidence Replicator Tests
|
||||
|
||||
[Fact]
|
||||
public async Task EvidenceReplicator_ReplicateEvidence_ReplicatesToAllowedRegions()
|
||||
{
|
||||
// Arrange
|
||||
var replicator = CreateEvidenceReplicator();
|
||||
|
||||
var bundle = new EvidenceBundle
|
||||
{
|
||||
Id = "bundle-1",
|
||||
OriginRegion = "region-eu-west",
|
||||
Version = 1,
|
||||
DataClassification = DataClassification.Internal,
|
||||
Items = [new EvidenceItem
|
||||
{
|
||||
Id = "item-1",
|
||||
Type = "scan-result",
|
||||
Content = "{}",
|
||||
ContentHash = "abc123"
|
||||
}],
|
||||
CreatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
// Act
|
||||
var result = await replicator.ReplicateEvidenceAsync(bundle);
|
||||
|
||||
// Assert
|
||||
Assert.True(result.Status == ReplicationStatus.Success ||
|
||||
result.Status == ReplicationStatus.Partial);
|
||||
Assert.True(result.AllowedRegions.Length > 0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task EvidenceReplicator_ValidateResidency_ChecksCompliance()
|
||||
{
|
||||
// Arrange
|
||||
var replicator = CreateEvidenceReplicator();
|
||||
|
||||
// Act
|
||||
var validation = await replicator.ValidateResidencyAsync("bundle-1");
|
||||
|
||||
// Assert - bundle doesn't exist so not compliant
|
||||
Assert.False(validation.IsCompliant);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task EvidenceReplicator_ScheduleReplication_CreatesTask()
|
||||
{
|
||||
// Arrange
|
||||
var replicator = CreateEvidenceReplicator();
|
||||
|
||||
var bundle = new EvidenceBundle
|
||||
{
|
||||
Id = "bundle-scheduled",
|
||||
OriginRegion = "region-eu-west",
|
||||
Version = 1,
|
||||
DataClassification = DataClassification.Internal,
|
||||
Items = [],
|
||||
CreatedAt = _timeProvider.GetUtcNow()
|
||||
};
|
||||
|
||||
// Act
|
||||
var taskId = await replicator.ScheduleReplicationAsync(bundle, ReplicationPriority.High);
|
||||
|
||||
// Assert
|
||||
Assert.NotEmpty(taskId);
|
||||
|
||||
// Wait briefly for task processing
|
||||
await Task.Delay(100);
|
||||
|
||||
var tasks = replicator.GetPendingTasks();
|
||||
// Task may be completed or still pending
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Latency Router Tests
|
||||
|
||||
[Fact]
|
||||
public async Task LatencyRouter_SelectRegion_ReturnsOptimalRegion()
|
||||
{
|
||||
// Arrange
|
||||
var router = CreateLatencyRouter();
|
||||
await router.InitializeAsync("region-a", GetTestRegionEndpoints());
|
||||
|
||||
// Act
|
||||
var decision = await router.SelectRegionAsync(new RoutingRequest
|
||||
{
|
||||
RequestId = "req-1"
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.NotNull(decision.SelectedRegion);
|
||||
Assert.True(decision.HealthScore > 0);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LatencyRouter_SelectRegion_RespectsPreferences()
|
||||
{
|
||||
// Arrange
|
||||
var router = CreateLatencyRouter();
|
||||
await router.InitializeAsync("region-a", GetTestRegionEndpoints());
|
||||
|
||||
// Act
|
||||
var decision = await router.SelectRegionAsync(new RoutingRequest
|
||||
{
|
||||
RequestId = "req-2",
|
||||
PreferredRegions = ["region-b"]
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.Equal("region-b", decision.SelectedRegion);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LatencyRouter_SelectRegion_RespectsExclusions()
|
||||
{
|
||||
// Arrange
|
||||
var router = CreateLatencyRouter();
|
||||
await router.InitializeAsync("region-a", GetTestRegionEndpoints());
|
||||
|
||||
// Act
|
||||
var decision = await router.SelectRegionAsync(new RoutingRequest
|
||||
{
|
||||
RequestId = "req-3",
|
||||
ExcludedRegions = ["region-a", "region-b"]
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.NotEqual("region-a", decision.SelectedRegion);
|
||||
Assert.NotEqual("region-b", decision.SelectedRegion);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LatencyRouter_ProbeAllRegions_ReturnsResults()
|
||||
{
|
||||
// Arrange
|
||||
var router = CreateLatencyRouter();
|
||||
await router.InitializeAsync("region-a", GetTestRegionEndpoints());
|
||||
|
||||
// Act
|
||||
var results = await router.ProbeAllRegionsAsync();
|
||||
|
||||
// Assert
|
||||
Assert.True(results.Length >= 1);
|
||||
Assert.All(results.Where(r => r.RegionId == "region-a"), r => Assert.Equal(0, r.LatencyMs));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LatencyRouter_MarkUnavailable_ExcludesFromRouting()
|
||||
{
|
||||
// Arrange
|
||||
var router = CreateLatencyRouter();
|
||||
await router.InitializeAsync("region-a", GetTestRegionEndpoints());
|
||||
|
||||
// Act
|
||||
router.MarkUnavailable("region-b", TimeSpan.FromMinutes(5));
|
||||
|
||||
var decision = await router.SelectRegionAsync(new RoutingRequest
|
||||
{
|
||||
RequestId = "req-4",
|
||||
PreferredRegions = ["region-b"]
|
||||
});
|
||||
|
||||
// Assert - should not select unavailable region
|
||||
Assert.NotEqual("region-b", decision.SelectedRegion);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task LatencyRouter_GetStatistics_ReturnsAggregatedStats()
|
||||
{
|
||||
// Arrange
|
||||
var router = CreateLatencyRouter();
|
||||
await router.InitializeAsync("region-a", GetTestRegionEndpoints());
|
||||
|
||||
// Act
|
||||
var stats = router.GetStatistics();
|
||||
|
||||
// Assert
|
||||
Assert.True(stats.TotalRegions >= 1);
|
||||
Assert.True(stats.HealthyRegions >= 0);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Global Dashboard Tests
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalDashboard_GetOverview_ReturnsComprehensiveView()
|
||||
{
|
||||
// Arrange
|
||||
var dashboard = CreateGlobalDashboard();
|
||||
|
||||
// Act
|
||||
var overview = await dashboard.GetOverviewAsync();
|
||||
|
||||
// Assert
|
||||
Assert.True(overview.TotalRegions >= 0);
|
||||
Assert.NotNull(overview.OverallHealth);
|
||||
Assert.NotNull(overview.SyncHealth);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalDashboard_CreateAlert_RaisesEvent()
|
||||
{
|
||||
// Arrange
|
||||
var dashboard = CreateGlobalDashboard();
|
||||
Alert? receivedAlert = null;
|
||||
dashboard.AlertCreated += (_, args) => receivedAlert = args.Alert;
|
||||
|
||||
// Act
|
||||
var alert = await dashboard.CreateAlertAsync(new CreateAlertRequest
|
||||
{
|
||||
RegionId = "region-a",
|
||||
Severity = AlertSeverity.Warning,
|
||||
Category = AlertCategory.Health,
|
||||
Title = "Test Alert",
|
||||
Description = "This is a test alert"
|
||||
});
|
||||
|
||||
// Assert
|
||||
Assert.NotNull(alert);
|
||||
Assert.Equal("Test Alert", alert.Title);
|
||||
Assert.Equal(AlertStatus.Active, alert.Status);
|
||||
Assert.Equal(alert.Id, receivedAlert?.Id);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalDashboard_AcknowledgeAlert_UpdatesStatus()
|
||||
{
|
||||
// Arrange
|
||||
var dashboard = CreateGlobalDashboard();
|
||||
|
||||
var alert = await dashboard.CreateAlertAsync(new CreateAlertRequest
|
||||
{
|
||||
RegionId = "region-a",
|
||||
Severity = AlertSeverity.Warning,
|
||||
Category = AlertCategory.Health,
|
||||
Title = "Test Alert",
|
||||
Description = "Test"
|
||||
});
|
||||
|
||||
// Act
|
||||
var acknowledged = await dashboard.AcknowledgeAlertAsync(alert.Id, "operator-1");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(AlertStatus.Acknowledged, acknowledged.Status);
|
||||
Assert.Equal("operator-1", acknowledged.AcknowledgedBy);
|
||||
Assert.NotNull(acknowledged.AcknowledgedAt);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalDashboard_ResolveAlert_RemovesFromActive()
|
||||
{
|
||||
// Arrange
|
||||
var dashboard = CreateGlobalDashboard();
|
||||
|
||||
var alert = await dashboard.CreateAlertAsync(new CreateAlertRequest
|
||||
{
|
||||
RegionId = "region-a",
|
||||
Severity = AlertSeverity.Warning,
|
||||
Category = AlertCategory.Health,
|
||||
Title = "Test Alert",
|
||||
Description = "Test"
|
||||
});
|
||||
|
||||
// Act
|
||||
var resolved = await dashboard.ResolveAlertAsync(alert.Id, "Issue fixed");
|
||||
|
||||
// Assert
|
||||
Assert.Equal(AlertStatus.Resolved, resolved.Status);
|
||||
Assert.Equal("Issue fixed", resolved.Resolution);
|
||||
|
||||
var activeAlerts = dashboard.GetAlerts();
|
||||
Assert.DoesNotContain(activeAlerts, a => a.Id == alert.Id);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task GlobalDashboard_GetSyncOverview_ReturnsSyncStatus()
|
||||
{
|
||||
// Arrange
|
||||
var dashboard = CreateGlobalDashboard();
|
||||
|
||||
// Act
|
||||
var overview = await dashboard.GetSyncOverviewAsync();
|
||||
|
||||
// Assert
|
||||
Assert.True(overview.TotalPeers >= 0);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region End-to-End Tests
|
||||
|
||||
[Fact]
|
||||
public async Task EndToEnd_GlobalPromotionFlow()
|
||||
{
|
||||
// Arrange
|
||||
var (coordinator, federationHub) = CreateRegionCoordinator();
|
||||
|
||||
// Start promotion
|
||||
var promotion = await coordinator.StartGlobalPromotionAsync(new GlobalPromotionRequest
|
||||
{
|
||||
PromotionId = "e2e-promo",
|
||||
DeploymentId = "service-a",
|
||||
TargetVersion = "v3.0",
|
||||
Strategy = PromotionStrategy.Sequential
|
||||
});
|
||||
|
||||
Assert.Equal(GlobalPromotionStatus.InProgress, promotion.Status);
|
||||
|
||||
// Complete all waves
|
||||
foreach (var wave in promotion.Waves)
|
||||
{
|
||||
foreach (var regionId in wave.RegionIds)
|
||||
{
|
||||
await coordinator.UpdateRegionStatusAsync(
|
||||
promotion.Id, regionId, RegionPromotionState.Completed);
|
||||
}
|
||||
}
|
||||
|
||||
// Complete
|
||||
var completed = await coordinator.CompleteAsync(promotion.Id);
|
||||
|
||||
// Assert
|
||||
Assert.Equal(GlobalPromotionStatus.Completed, completed.Status);
|
||||
Assert.NotNull(completed.CompletedAt);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Setup Helpers
|
||||
|
||||
private (RegionCoordinator, FakeFederationHub) CreateRegionCoordinator()
|
||||
{
|
||||
var federationHub = new FakeFederationHub();
|
||||
var healthMonitor = new FakeRegionHealthMonitor();
|
||||
|
||||
var coordinator = new RegionCoordinator(
|
||||
federationHub,
|
||||
healthMonitor,
|
||||
new RegionCoordinatorConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<RegionCoordinator>.Instance);
|
||||
|
||||
return (coordinator, federationHub);
|
||||
}
|
||||
|
||||
private (CrossRegionSync, FakeRegionTransport) CreateCrossRegionSync()
|
||||
{
|
||||
var transport = new FakeRegionTransport();
|
||||
var store = new FakeCrossRegionStore();
|
||||
|
||||
var sync = new CrossRegionSync(
|
||||
transport,
|
||||
store,
|
||||
new CrossRegionSyncConfig { SyncInterval = TimeSpan.FromHours(1) },
|
||||
_timeProvider,
|
||||
NullLogger<CrossRegionSync>.Instance);
|
||||
|
||||
return (sync, transport);
|
||||
}
|
||||
|
||||
private EvidenceReplicator CreateEvidenceReplicator()
|
||||
{
|
||||
var (crossRegionSync, _) = CreateCrossRegionSync();
|
||||
var residencyPolicy = new FakeDataResidencyPolicy();
|
||||
var evidenceStore = new FakeEvidenceStore();
|
||||
|
||||
return new EvidenceReplicator(
|
||||
crossRegionSync,
|
||||
residencyPolicy,
|
||||
evidenceStore,
|
||||
new EvidenceReplicatorConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<EvidenceReplicator>.Instance);
|
||||
}
|
||||
|
||||
private LatencyRouter CreateLatencyRouter()
|
||||
{
|
||||
var healthMonitor = new FakeRegionHealthMonitor();
|
||||
|
||||
return new LatencyRouter(
|
||||
healthMonitor,
|
||||
new LatencyRouterConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<LatencyRouter>.Instance);
|
||||
}
|
||||
|
||||
private GlobalDashboard CreateGlobalDashboard()
|
||||
{
|
||||
var (federationHub, _) = (new FakeFederationHub(), 0);
|
||||
var (regionCoordinator, _) = CreateRegionCoordinator();
|
||||
var latencyRouter = CreateLatencyRouter();
|
||||
var (crossRegionSync, _) = CreateCrossRegionSync();
|
||||
|
||||
return new GlobalDashboard(
|
||||
federationHub,
|
||||
regionCoordinator,
|
||||
latencyRouter,
|
||||
crossRegionSync,
|
||||
new GlobalDashboardConfig(),
|
||||
_timeProvider,
|
||||
NullLogger<GlobalDashboard>.Instance);
|
||||
}
|
||||
|
||||
private static IEnumerable<RegionEndpoint> GetTestRegionEndpoints()
|
||||
{
|
||||
return
|
||||
[
|
||||
new RegionEndpoint { Id = "region-a", Url = "https://a.example.com", Location = "US-East" },
|
||||
new RegionEndpoint { Id = "region-b", Url = "https://b.example.com", Location = "EU-West" },
|
||||
new RegionEndpoint { Id = "region-c", Url = "https://c.example.com", Location = "AP-Tokyo" }
|
||||
];
|
||||
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
#region Test Doubles
|
||||
|
||||
public sealed class FakeTimeProvider : TimeProvider
|
||||
{
|
||||
private DateTimeOffset _now = new(2026, 1, 17, 12, 0, 0, TimeSpan.Zero);
|
||||
public override DateTimeOffset GetUtcNow() => _now;
|
||||
public void Advance(TimeSpan duration) => _now = _now.Add(duration);
|
||||
}
|
||||
|
||||
public sealed class FakeFederationHub : IFederationHub
|
||||
{
|
||||
public int DeployCount { get; private set; }
|
||||
public int RollbackCount { get; private set; }
|
||||
|
||||
public Task<ImmutableArray<Region>> GetRegionsAsync(CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult<ImmutableArray<Region>>(
|
||||
[
|
||||
new Region { Id = "region-a", Name = "US-East", Location = "us-east-1", Priority = 1, IsCanary = true },
|
||||
new Region { Id = "region-b", Name = "EU-West", Location = "eu-west-1", Priority = 2, IsCanary = false },
|
||||
new Region { Id = "region-c", Name = "AP-Tokyo", Location = "ap-northeast-1", Priority = 3, IsCanary = false }
|
||||
]);
|
||||
}
|
||||
|
||||
public Task DeployToRegionAsync(string regionId, string deploymentId, string version, CancellationToken ct = default)
|
||||
{
|
||||
DeployCount++;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task RollbackRegionAsync(string regionId, string deploymentId, CancellationToken ct = default)
|
||||
{
|
||||
RollbackCount++;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeRegionHealthMonitor : IRegionHealthMonitor
|
||||
{
|
||||
public Task<RegionHealth> GetRegionHealthAsync(string regionId, CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(new RegionHealth
|
||||
{
|
||||
RegionId = regionId,
|
||||
Status = RegionHealthStatus.Healthy,
|
||||
Score = 0.95
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeRegionTransport : IRegionTransport
|
||||
{
|
||||
public List<SyncMessage> SentMessages { get; } = [];
|
||||
|
||||
public Task<ImmutableArray<string>> DiscoverPeersAsync(CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult<ImmutableArray<string>>(["region-b", "region-c"]);
|
||||
}
|
||||
|
||||
public Task SendAsync(string peerId, SyncMessage message, CancellationToken ct = default)
|
||||
{
|
||||
SentMessages.Add(message);
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeCrossRegionStore : ICrossRegionStore
|
||||
{
|
||||
private readonly Dictionary<string, SyncEntry> _entries = new();
|
||||
|
||||
public Task<SyncEntry?> GetAsync(string key, CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(_entries.TryGetValue(key, out var entry) ? entry : null);
|
||||
}
|
||||
|
||||
public Task SaveAsync(SyncEntry entry, CancellationToken ct = default)
|
||||
{
|
||||
_entries[entry.Key] = entry;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
public Task<ImmutableArray<SyncEntry>> GetAllAsync(CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(_entries.Values.ToImmutableArray());
|
||||
}
|
||||
|
||||
public Task<SyncDigest> GetDigestAsync(CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(new SyncDigest
|
||||
{
|
||||
RegionId = "local",
|
||||
Entries = _entries.Values.Select(e => new DigestEntry
|
||||
{
|
||||
Key = e.Key,
|
||||
VectorClock = e.VectorClock,
|
||||
Version = e.Version
|
||||
}).ToImmutableArray(),
|
||||
ComputedAt = DateTimeOffset.UtcNow
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeDataResidencyPolicy : IDataResidencyPolicy
|
||||
{
|
||||
public Task<ImmutableArray<string>> GetAllowedRegionsAsync(
|
||||
DataClassification classification,
|
||||
string originRegion,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
// For sovereign data, only same region
|
||||
if (classification == DataClassification.Sovereign)
|
||||
{
|
||||
return Task.FromResult<ImmutableArray<string>>([originRegion]);
|
||||
}
|
||||
|
||||
// For other classifications, allow all regions
|
||||
return Task.FromResult<ImmutableArray<string>>(["region-a", "region-b", "region-c"]);
|
||||
}
|
||||
|
||||
public Task<EvidenceItem> TransformForRegionsAsync(
|
||||
EvidenceItem item,
|
||||
ImmutableArray<string> targetRegions,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
// No transformation needed
|
||||
return Task.FromResult(item);
|
||||
}
|
||||
}
|
||||
|
||||
public sealed class FakeEvidenceStore : IEvidenceStore
|
||||
{
|
||||
private readonly Dictionary<string, EvidenceBundle> _bundles = new();
|
||||
|
||||
public Task<EvidenceBundle?> GetBundleAsync(string bundleId, CancellationToken ct = default)
|
||||
{
|
||||
return Task.FromResult(_bundles.TryGetValue(bundleId, out var bundle) ? bundle : null);
|
||||
}
|
||||
|
||||
public Task SaveBundleAsync(EvidenceBundle bundle, CancellationToken ct = default)
|
||||
{
|
||||
_bundles[bundle.Id] = bundle;
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
|
||||
#endregion
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user