192 lines
6.7 KiB
C#
192 lines
6.7 KiB
C#
using Microsoft.AspNetCore.Mvc;
|
|
using StellaOps.JobEngine.Infrastructure.Postgres;
|
|
using static StellaOps.Localization.T;
|
|
|
|
namespace StellaOps.JobEngine.WebService.Endpoints;
|
|
|
|
/// <summary>
/// Health and readiness probe endpoints.
/// </summary>
/// <remarks>
/// Every route registered here is mapped with <c>AllowAnonymous()</c>.
/// </remarks>
public static class HealthEndpoints
{
    /// <summary>Divisor used to convert a byte count into megabytes.</summary>
    private const double BytesPerMegabyte = 1024.0 * 1024.0;

    /// <summary>Fraction of the available memory at or above which the memory check reports "degraded".</summary>
    private const double MemoryDegradedRatio = 0.9;

    /// <summary>Fraction of the maximum worker threads that must be exceeded by the available count for the thread pool check to report "healthy".</summary>
    private const double MinAvailableWorkerRatio = 0.1;

    /// <summary>Key under which the database dependency is reported in responses.</summary>
    private const string DatabaseKey = "database";

    /// <summary>
    /// Maps health endpoints to the route builder.
    /// </summary>
    /// <param name="app">Route builder that receives the <c>/healthz</c>, <c>/readyz</c>, <c>/livez</c> and <c>/health/details</c> GET routes.</param>
    /// <returns>The same <paramref name="app"/> instance, to allow call chaining.</returns>
    public static IEndpointRouteBuilder MapHealthEndpoints(this IEndpointRouteBuilder app)
    {
        // NOTE(review): /healthz uses the same description key as /livez — confirm that is intended.
        app.MapGet("/healthz", GetHealth)
            .WithName("Orchestrator_Health")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/readyz", GetReadiness)
            .WithName("Orchestrator_Readiness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.readiness_description"))
            .AllowAnonymous();

        app.MapGet("/livez", GetLiveness)
            .WithName("Orchestrator_Liveness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/health/details", GetHealthDetails)
            .WithName("Orchestrator_HealthDetails")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.deep_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Handles <c>/healthz</c>: always answers 200 with status "ok" and the current UTC time.
    /// </summary>
    /// <param name="timeProvider">Clock used for the response timestamp.</param>
    private static IResult GetHealth([FromServices] TimeProvider timeProvider)
    {
        return Results.Ok(new HealthResponse("ok", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/readyz</c>: answers 200 "ready" when the database probe succeeds,
    /// otherwise 503 "not_ready".
    /// </summary>
    /// <param name="dataSource">Data source used to probe database connectivity.</param>
    /// <param name="timeProvider">Clock used for the response timestamp.</param>
    /// <param name="cancellationToken">Token forwarded to the database probe.</param>
    private static async Task<IResult> GetReadiness(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        try
        {
            // Check database connectivity
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);

            if (!dbHealthy)
            {
                return NotReady(timeProvider, "unhealthy");
            }

            return Results.Ok(new ReadinessResponse("ready", timeProvider.GetUtcNow(), new Dictionary<string, string>
            {
                [DatabaseKey] = "healthy"
            }));
        }
        catch (Exception ex)
        {
            // Safety net only: CheckDatabaseAsync already converts every probe failure into false.
            // NOTE(review): the raw exception message is returned on an anonymous endpoint.
            return NotReady(timeProvider, $"error: {ex.Message}");
        }
    }

    /// <summary>
    /// Builds the 503 "not_ready" readiness payload with the given database status text.
    /// </summary>
    /// <param name="timeProvider">Clock used for the response timestamp.</param>
    /// <param name="databaseStatus">Value reported under the "database" dependency key.</param>
    private static IResult NotReady(TimeProvider timeProvider, string databaseStatus)
    {
        return Results.Json(
            new ReadinessResponse("not_ready", timeProvider.GetUtcNow(), new Dictionary<string, string>
            {
                [DatabaseKey] = databaseStatus
            }),
            statusCode: StatusCodes.Status503ServiceUnavailable);
    }

    /// <summary>
    /// Handles <c>/livez</c>: always answers 200 with status "alive" and the current UTC time.
    /// </summary>
    /// <param name="timeProvider">Clock used for the response timestamp.</param>
    private static IResult GetLiveness([FromServices] TimeProvider timeProvider)
    {
        // Liveness just checks the process is alive
        return Results.Ok(new HealthResponse("alive", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/health/details</c>: runs the database, memory and thread pool checks and
    /// reports each one individually.
    /// </summary>
    /// <remarks>
    /// Only the database check affects the overall status and the HTTP status code
    /// (200 when healthy, 503 otherwise). Memory and thread pool checks can report
    /// "degraded" but never flip the overall result.
    /// </remarks>
    /// <param name="dataSource">Data source used to probe database connectivity.</param>
    /// <param name="timeProvider">Clock used for all timestamps in the response.</param>
    /// <param name="cancellationToken">Token forwarded to the database probe.</param>
    private static async Task<IResult> GetHealthDetails(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        var checks = new Dictionary<string, HealthCheckResult>();
        var overallHealthy = true;

        // Database check
        try
        {
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);
            checks[DatabaseKey] = new HealthCheckResult(
                dbHealthy ? "healthy" : "unhealthy",
                dbHealthy ? null : "Connection test failed",
                timeProvider.GetUtcNow());
            overallHealthy &= dbHealthy;
        }
        catch (Exception ex)
        {
            // Safety net only: CheckDatabaseAsync already converts every probe failure into false.
            checks[DatabaseKey] = new HealthCheckResult("unhealthy", ex.Message, timeProvider.GetUtcNow());
            overallHealthy = false;
        }

        // Memory check: managed heap usage compared against the memory the GC reports as available.
        var memoryInfo = GC.GetGCMemoryInfo();
        var memoryUsedMb = GC.GetTotalMemory(false) / BytesPerMegabyte;
        var memoryLimitMb = memoryInfo.TotalAvailableMemoryBytes / BytesPerMegabyte;
        var memoryHealthy = memoryUsedMb < memoryLimitMb * MemoryDegradedRatio;

        checks["memory"] = new HealthCheckResult(
            memoryHealthy ? "healthy" : "degraded",
            // Invariant formatting keeps the decimal separator stable regardless of the thread culture.
            FormattableString.Invariant($"Used: {memoryUsedMb:F2} MB"),
            timeProvider.GetUtcNow());

        // Thread pool check: only worker threads are evaluated, so completion-port counts are discarded.
        ThreadPool.GetAvailableThreads(out var workerThreads, out _);
        ThreadPool.GetMaxThreads(out var maxWorkerThreads, out _);
        var threadPoolHealthy = workerThreads > maxWorkerThreads * MinAvailableWorkerRatio;

        checks["threadPool"] = new HealthCheckResult(
            threadPoolHealthy ? "healthy" : "degraded",
            FormattableString.Invariant($"Worker threads available: {workerThreads}/{maxWorkerThreads}"),
            timeProvider.GetUtcNow());

        var response = new HealthDetailsResponse(
            overallHealthy ? "healthy" : "unhealthy",
            timeProvider.GetUtcNow(),
            checks);

        return overallHealthy
            ? Results.Ok(response)
            : Results.Json(response, statusCode: StatusCodes.Status503ServiceUnavailable);
    }

    /// <summary>
    /// Probes the database by opening a connection and executing <c>SELECT 1</c>.
    /// </summary>
    /// <param name="dataSource">Data source the connection is opened from.</param>
    /// <param name="cancellationToken">Token passed to the open and execute calls.</param>
    /// <returns>
    /// <c>true</c> when the query completes; <c>false</c> when anything throws,
    /// including cancellation. This method itself never throws.
    /// </returns>
    private static async Task<bool> CheckDatabaseAsync(JobEngineDataSource dataSource, CancellationToken cancellationToken)
    {
        try
        {
            // Use a system tenant for health checks
            // NOTE(review): presumably "_system" is the tenant and "health" the role/purpose — confirm against JobEngineDataSource.
            await using var connection = await dataSource.OpenConnectionAsync("_system", "health", cancellationToken).ConfigureAwait(false);
            await using var command = connection.CreateCommand();
            command.CommandText = "SELECT 1";
            await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
            return true;
        }
        catch
        {
            // Deliberate best-effort: any failure is reported to callers as "not healthy".
            return false;
        }
    }
}
|
|
|
|
/// <summary>
/// Minimal payload returned by the basic health and liveness probes.
/// </summary>
/// <param name="Status">Short status text, e.g. "ok" or "alive".</param>
/// <param name="Timestamp">Moment at which the response was produced.</param>
public sealed record HealthResponse(
    string Status,
    DateTimeOffset Timestamp);
|
|
|
|
/// <summary>
/// Payload of the readiness probe, including the state of each dependency.
/// </summary>
/// <param name="Status">Overall readiness text, e.g. "ready" or "not_ready".</param>
/// <param name="Timestamp">Moment at which the response was produced.</param>
/// <param name="Dependencies">Status text per dependency, keyed by dependency name.</param>
public sealed record ReadinessResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, string> Dependencies);
|
|
|
|
/// <summary>
/// Outcome of a single health check.
/// </summary>
/// <param name="Status">Status text of the check, e.g. "healthy", "unhealthy" or "degraded".</param>
/// <param name="Details">Optional human-readable detail; may be <c>null</c>.</param>
/// <param name="CheckedAt">Moment at which the check result was recorded.</param>
public sealed record HealthCheckResult(
    string Status,
    string? Details,
    DateTimeOffset CheckedAt);
|
|
|
|
/// <summary>
/// Payload of the detailed health endpoint, aggregating every individual check.
/// </summary>
/// <param name="Status">Overall status text, e.g. "healthy" or "unhealthy".</param>
/// <param name="Timestamp">Moment at which the response was produced.</param>
/// <param name="Checks">Individual check results, keyed by check name.</param>
public sealed record HealthDetailsResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, HealthCheckResult> Checks);
|