using Microsoft.AspNetCore.Mvc;
using StellaOps.JobEngine.Infrastructure.Postgres;
using static StellaOps.Localization.T;
namespace StellaOps.JobEngine.WebService.Endpoints;
/// <summary>
/// Health and readiness probe endpoints.
/// </summary>
public static class HealthEndpoints
{
    /// <summary>Divisor used to convert a byte count into megabytes.</summary>
    private const double BytesPerMegabyte = 1024.0 * 1024.0;

    /// <summary>Memory is reported as healthy while usage stays below this fraction of the GC's available memory.</summary>
    private const double MemoryUsageThreshold = 0.9;

    /// <summary>The thread pool is reported as healthy while more than this fraction of worker threads is available.</summary>
    private const double MinAvailableWorkerFraction = 0.1;

    /// <summary>
    /// Maps health endpoints to the route builder.
    /// </summary>
    /// <param name="app">Route builder that receives the <c>/healthz</c>, <c>/readyz</c>, <c>/livez</c> and <c>/health/details</c> routes.</param>
    /// <returns>The same <paramref name="app"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="app"/> is <see langword="null"/>.</exception>
    public static IEndpointRouteBuilder MapHealthEndpoints(this IEndpointRouteBuilder app)
    {
        ArgumentNullException.ThrowIfNull(app);

        // Every probe is anonymous so it can be called without credentials.
        // Note: /healthz and /livez intentionally share the same description key.
        app.MapGet("/healthz", GetHealth)
            .WithName("Orchestrator_Health")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/readyz", GetReadiness)
            .WithName("Orchestrator_Readiness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.readiness_description"))
            .AllowAnonymous();

        app.MapGet("/livez", GetLiveness)
            .WithName("Orchestrator_Liveness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/health/details", GetHealthDetails)
            .WithName("Orchestrator_HealthDetails")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.deep_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Handles <c>/healthz</c>: always answers 200 with status <c>"ok"</c> and the current UTC time.
    /// </summary>
    private static IResult GetHealth([FromServices] TimeProvider timeProvider)
    {
        return Results.Ok(new HealthResponse("ok", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/readyz</c>: answers 200 when the database probe succeeds and 503 otherwise.
    /// </summary>
    /// <param name="dataSource">Data source used to open the probe connection.</param>
    /// <param name="timeProvider">Clock used to stamp the response.</param>
    /// <param name="cancellationToken">Token forwarded to the database probe.</param>
    /// <returns>A <see cref="ReadinessResponse"/> whose <c>database</c> entry describes the probe outcome.</returns>
    private static async Task<IResult> GetReadiness(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        try
        {
            // Check database connectivity
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);
            if (!dbHealthy)
            {
                return Results.Json(
                    new ReadinessResponse("not_ready", timeProvider.GetUtcNow(), new Dictionary<string, string>
                    {
                        ["database"] = "unhealthy"
                    }),
                    statusCode: StatusCodes.Status503ServiceUnavailable);
            }

            return Results.Ok(new ReadinessResponse("ready", timeProvider.GetUtcNow(), new Dictionary<string, string>
            {
                ["database"] = "healthy"
            }));
        }
        catch (Exception ex)
        {
            // CheckDatabaseAsync already converts its own failures into "false",
            // so this is a safety net for anything else thrown while building the response.
            return Results.Json(
                new ReadinessResponse("not_ready", timeProvider.GetUtcNow(), new Dictionary<string, string>
                {
                    ["database"] = $"error: {ex.Message}"
                }),
                statusCode: StatusCodes.Status503ServiceUnavailable);
        }
    }

    /// <summary>
    /// Handles <c>/livez</c>: always answers 200 with status <c>"alive"</c> and the current UTC time.
    /// </summary>
    private static IResult GetLiveness([FromServices] TimeProvider timeProvider)
    {
        // Liveness just checks the process is alive
        return Results.Ok(new HealthResponse("alive", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/health/details</c>: runs the database, memory and thread pool checks and reports each one.
    /// </summary>
    /// <remarks>
    /// Only the database check influences the overall status and the HTTP status code.
    /// The memory and thread pool checks are informational and report <c>"degraded"</c> at worst.
    /// </remarks>
    /// <param name="dataSource">Data source used to open the probe connection.</param>
    /// <param name="timeProvider">Clock used to stamp the response and each check.</param>
    /// <param name="cancellationToken">Token forwarded to the database probe.</param>
    /// <returns>200 with a <see cref="HealthDetailsResponse"/> when the database is healthy; 503 with the same payload otherwise.</returns>
    private static async Task<IResult> GetHealthDetails(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        var checks = new Dictionary<string, HealthCheckResult>();
        var overallHealthy = true;

        // Database check
        try
        {
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);
            checks["database"] = new HealthCheckResult(
                dbHealthy ? "healthy" : "unhealthy",
                dbHealthy ? null : "Connection test failed",
                timeProvider.GetUtcNow());
            overallHealthy &= dbHealthy;
        }
        catch (Exception ex)
        {
            checks["database"] = new HealthCheckResult("unhealthy", ex.Message, timeProvider.GetUtcNow());
            overallHealthy = false;
        }

        // Memory check: managed heap usage compared against the memory available to the GC.
        var memoryInfo = GC.GetGCMemoryInfo();
        var memoryUsedMb = GC.GetTotalMemory(false) / BytesPerMegabyte;
        var memoryLimitMb = memoryInfo.TotalAvailableMemoryBytes / BytesPerMegabyte;
        var memoryHealthy = memoryUsedMb < memoryLimitMb * MemoryUsageThreshold; // < 90% threshold
        checks["memory"] = new HealthCheckResult(
            memoryHealthy ? "healthy" : "degraded",
            $"Used: {memoryUsedMb:F2} MB",
            timeProvider.GetUtcNow());

        // Thread pool check: only worker threads are evaluated, so the I/O completion counts are discarded.
        ThreadPool.GetAvailableThreads(out var workerThreads, out _);
        ThreadPool.GetMaxThreads(out var maxWorkerThreads, out _);
        var threadPoolHealthy = workerThreads > maxWorkerThreads * MinAvailableWorkerFraction; // > 10% available
        checks["threadPool"] = new HealthCheckResult(
            threadPoolHealthy ? "healthy" : "degraded",
            $"Worker threads available: {workerThreads}/{maxWorkerThreads}",
            timeProvider.GetUtcNow());

        var response = new HealthDetailsResponse(
            overallHealthy ? "healthy" : "unhealthy",
            timeProvider.GetUtcNow(),
            checks);

        return overallHealthy
            ? Results.Ok(response)
            : Results.Json(response, statusCode: StatusCodes.Status503ServiceUnavailable);
    }

    /// <summary>
    /// Opens a connection and runs <c>SELECT 1</c> to verify that the database is reachable.
    /// </summary>
    /// <param name="dataSource">Data source used to open the probe connection.</param>
    /// <param name="cancellationToken">Token passed to the open and execute calls.</param>
    /// <returns>
    /// <see langword="true"/> when the query completes; <see langword="false"/> when any exception
    /// is thrown while opening the connection or executing the query.
    /// </returns>
    private static async Task<bool> CheckDatabaseAsync(JobEngineDataSource dataSource, CancellationToken cancellationToken)
    {
        try
        {
            // Use a system tenant for health checks
            await using var connection = await dataSource.OpenConnectionAsync("_system", "health", cancellationToken).ConfigureAwait(false);
            await using var command = connection.CreateCommand();
            command.CommandText = "SELECT 1";
            await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
            return true;
        }
        catch
        {
            // Deliberate best-effort: a probe failure of any kind is reported as "not healthy"
            // to the caller rather than surfaced as an exception.
            return false;
        }
    }
}
/// <summary>
/// Minimal probe payload: a status label plus the time it was produced.
/// </summary>
/// <param name="Status">Short status label, e.g. <c>"ok"</c> or <c>"alive"</c>.</param>
/// <param name="Timestamp">Moment at which the response was created.</param>
public sealed record HealthResponse(
    string Status,
    DateTimeOffset Timestamp);
/// <summary>
/// Readiness response with dependency status.
/// </summary>
/// <param name="Status">Overall readiness label, e.g. <c>"ready"</c> or <c>"not_ready"</c>.</param>
/// <param name="Timestamp">Moment at which the response was created.</param>
/// <param name="Dependencies">Status text per dependency, keyed by dependency name (e.g. <c>"database"</c>).</param>
public sealed record ReadinessResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, string> Dependencies);
/// <summary>
/// Outcome of a single health check.
/// </summary>
/// <param name="Status">Status label for the check, e.g. <c>"healthy"</c>, <c>"unhealthy"</c> or <c>"degraded"</c>.</param>
/// <param name="Details">Optional free-text detail about the check; <see langword="null"/> when there is nothing to add.</param>
/// <param name="CheckedAt">Moment at which the check result was recorded.</param>
public sealed record HealthCheckResult(string Status, string? Details, DateTimeOffset CheckedAt);
/// <summary>
/// Detailed health response with all checks.
/// </summary>
/// <param name="Status">Overall status label, e.g. <c>"healthy"</c> or <c>"unhealthy"</c>.</param>
/// <param name="Timestamp">Moment at which the response was created.</param>
/// <param name="Checks">Individual check results keyed by check name (e.g. <c>"database"</c>, <c>"memory"</c>, <c>"threadPool"</c>).</param>
public sealed record HealthDetailsResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, HealthCheckResult> Checks);