using Microsoft.AspNetCore.Mvc;
using StellaOps.JobEngine.Infrastructure.Postgres;
using static StellaOps.Localization.T;

namespace StellaOps.JobEngine.WebService.Endpoints;

/// <summary>
/// Health and readiness probe endpoints.
/// </summary>
public static class HealthEndpoints
{
    /// <summary>Divisor used to convert a byte count to megabytes.</summary>
    private const double BytesPerMegabyte = 1024.0 * 1024.0;

    /// <summary>Memory is reported as "degraded" once usage reaches this fraction of the limit.</summary>
    private const double MemoryDegradedRatio = 0.9;

    /// <summary>The thread pool is reported as "degraded" unless more than this fraction of workers is available.</summary>
    private const double MinAvailableWorkerRatio = 0.1;

    /// <summary>
    /// Maps health endpoints to the route builder.
    /// </summary>
    /// <param name="app">The route builder the probe routes are added to.</param>
    /// <returns>The same <paramref name="app"/> instance, so calls can be chained.</returns>
    public static IEndpointRouteBuilder MapHealthEndpoints(this IEndpointRouteBuilder app)
    {
        app.MapGet("/healthz", GetHealth)
            .WithName("Orchestrator_Health")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/readyz", GetReadiness)
            .WithName("Orchestrator_Readiness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.readiness_description"))
            .AllowAnonymous();

        app.MapGet("/livez", GetLiveness)
            .WithName("Orchestrator_Liveness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/health/details", GetHealthDetails)
            .WithName("Orchestrator_HealthDetails")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.deep_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Handles <c>/healthz</c>: always answers 200 with status "ok" and the current time.
    /// </summary>
    private static IResult GetHealth([FromServices] TimeProvider timeProvider)
    {
        return Results.Ok(new HealthResponse("ok", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/readyz</c>: answers 200 when the database probe succeeds and 503 otherwise.
    /// </summary>
    private static async Task<IResult> GetReadiness(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        try
        {
            // Check database connectivity
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);

            if (!dbHealthy)
            {
                return Results.Json(
                    new ReadinessResponse(
                        "not_ready",
                        timeProvider.GetUtcNow(),
                        new Dictionary<string, string> { ["database"] = "unhealthy" }),
                    statusCode: StatusCodes.Status503ServiceUnavailable);
            }

            return Results.Ok(
                new ReadinessResponse(
                    "ready",
                    timeProvider.GetUtcNow(),
                    new Dictionary<string, string> { ["database"] = "healthy" }));
        }
        catch (Exception ex)
        {
            // Defensive: CheckDatabaseAsync currently converts every failure into `false`,
            // so this path only triggers if that helper starts propagating exceptions.
            return Results.Json(
                new ReadinessResponse(
                    "not_ready",
                    timeProvider.GetUtcNow(),
                    new Dictionary<string, string> { ["database"] = $"error: {ex.Message}" }),
                statusCode: StatusCodes.Status503ServiceUnavailable);
        }
    }

    /// <summary>
    /// Handles <c>/livez</c>: always answers 200 with status "alive" and the current time.
    /// </summary>
    private static IResult GetLiveness([FromServices] TimeProvider timeProvider)
    {
        // Liveness just checks the process is alive
        return Results.Ok(new HealthResponse("alive", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/health/details</c>: runs the database, memory and thread-pool checks and
    /// reports each one. Only the database check influences the overall status and the
    /// HTTP status code (200 vs. 503); memory and thread pool are reported as
    /// "healthy"/"degraded" for information only.
    /// </summary>
    private static async Task<IResult> GetHealthDetails(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        var checks = new Dictionary<string, HealthCheckResult>();
        var overallHealthy = true;

        // Database check
        try
        {
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);
            checks["database"] = new HealthCheckResult(
                dbHealthy ? "healthy" : "unhealthy",
                dbHealthy ? null : "Connection test failed",
                timeProvider.GetUtcNow());
            overallHealthy &= dbHealthy;
        }
        catch (Exception ex)
        {
            checks["database"] = new HealthCheckResult("unhealthy", ex.Message, timeProvider.GetUtcNow());
            overallHealthy = false;
        }

        // Memory check
        var memoryInfo = GC.GetGCMemoryInfo();
        var memoryUsedMb = GC.GetTotalMemory(false) / BytesPerMegabyte;
        var memoryLimitMb = memoryInfo.TotalAvailableMemoryBytes / BytesPerMegabyte;
        var memoryHealthy = memoryUsedMb < memoryLimitMb * MemoryDegradedRatio; // < 90% threshold

        checks["memory"] = new HealthCheckResult(
            memoryHealthy ? "healthy" : "degraded",
            $"Used: {memoryUsedMb:F2} MB",
            timeProvider.GetUtcNow());

        // Thread pool check (only worker threads are evaluated; completion-port counts are ignored)
        ThreadPool.GetAvailableThreads(out var workerThreads, out _);
        ThreadPool.GetMaxThreads(out var maxWorkerThreads, out _);
        var threadPoolHealthy = workerThreads > maxWorkerThreads * MinAvailableWorkerRatio; // > 10% available

        checks["threadPool"] = new HealthCheckResult(
            threadPoolHealthy ? "healthy" : "degraded",
            $"Worker threads available: {workerThreads}/{maxWorkerThreads}",
            timeProvider.GetUtcNow());

        var response = new HealthDetailsResponse(
            overallHealthy ? "healthy" : "unhealthy",
            timeProvider.GetUtcNow(),
            checks);

        return overallHealthy
            ? Results.Ok(response)
            : Results.Json(response, statusCode: StatusCodes.Status503ServiceUnavailable);
    }

    /// <summary>
    /// Opens a connection and executes <c>SELECT 1</c>.
    /// </summary>
    /// <returns>
    /// <see langword="true"/> when the query completes; <see langword="false"/> when anything
    /// throws, including cancellation.
    /// </returns>
    private static async Task<bool> CheckDatabaseAsync(
        JobEngineDataSource dataSource,
        CancellationToken cancellationToken)
    {
        try
        {
            // Use a system tenant for health checks
            await using var connection = await dataSource
                .OpenConnectionAsync("_system", "health", cancellationToken)
                .ConfigureAwait(false);
            await using var command = connection.CreateCommand();
            command.CommandText = "SELECT 1";
            await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
            return true;
        }
        catch
        {
            // Deliberate best-effort: a probe must report "unhealthy" rather than fail the request.
            return false;
        }
    }
}

/// <summary>
/// Basic health response.
/// </summary>
public sealed record HealthResponse(string Status, DateTimeOffset Timestamp);

/// <summary>
/// Readiness response with dependency status.
/// </summary>
public sealed record ReadinessResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, string> Dependencies);

/// <summary>
/// Individual health check result.
/// </summary>
public sealed record HealthCheckResult(
    string Status,
    string? Details,
    DateTimeOffset CheckedAt);

/// <summary>
/// Detailed health response with all checks.
/// </summary>
public sealed record HealthDetailsResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, HealthCheckResult> Checks);