consolidation of some of the modules, localization fixes, product advisories work, qa work
This commit is contained in:
@@ -0,0 +1,191 @@
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using StellaOps.JobEngine.Infrastructure.Postgres;
|
||||
using static StellaOps.Localization.T;
|
||||
|
||||
namespace StellaOps.JobEngine.WebService.Endpoints;
|
||||
|
||||
/// <summary>
/// Health and readiness probe endpoints.
/// </summary>
/// <remarks>
/// Every route registered here is mapped with <c>AllowAnonymous()</c>, so the payloads
/// produced by the handlers below are visible to unauthenticated callers.
/// </remarks>
public static class HealthEndpoints
{
    /// <summary>Divisor used to convert a byte count into megabytes.</summary>
    private const double BytesPerMegabyte = 1024.0 * 1024.0;

    /// <summary>
    /// Managed memory usage must stay below this fraction of the GC's total available
    /// memory for the memory check to report "healthy".
    /// </summary>
    private const double MemoryHealthyRatio = 0.9;

    /// <summary>
    /// More than this fraction of the maximum worker threads must be available for the
    /// thread pool check to report "healthy".
    /// </summary>
    private const double MinAvailableWorkerRatio = 0.1;

    /// <summary>
    /// Maps health endpoints to the route builder.
    /// </summary>
    /// <param name="app">Route builder that receives the four GET routes.</param>
    /// <returns>The same <paramref name="app"/> instance, to allow call chaining.</returns>
    public static IEndpointRouteBuilder MapHealthEndpoints(this IEndpointRouteBuilder app)
    {
        app.MapGet("/healthz", GetHealth)
            .WithName("Orchestrator_Health")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/readyz", GetReadiness)
            .WithName("Orchestrator_Readiness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.readiness_description"))
            .AllowAnonymous();

        app.MapGet("/livez", GetLiveness)
            .WithName("Orchestrator_Liveness")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.liveness_description"))
            .AllowAnonymous();

        app.MapGet("/health/details", GetHealthDetails)
            .WithName("Orchestrator_HealthDetails")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.health.deep_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Handles <c>/healthz</c>: always answers 200 with status "ok" and the current time.
    /// </summary>
    private static IResult GetHealth([FromServices] TimeProvider timeProvider)
    {
        return Results.Ok(new HealthResponse("ok", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/readyz</c>: answers 200 "ready" when the database probe succeeds and
    /// 503 "not_ready" otherwise.
    /// </summary>
    private static async Task<IResult> GetReadiness(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        try
        {
            // Check database connectivity
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);

            return dbHealthy
                ? Results.Ok(BuildReadiness("ready", "healthy", timeProvider))
                : NotReady("unhealthy", timeProvider);
        }
        catch (Exception ex)
        {
            // CheckDatabaseAsync already turns its own failures into `false`, so this is a
            // defensive fallback for anything else that throws while building the response.
            // NOTE(review): the exception message is returned on an anonymous route - confirm
            // that is acceptable.
            return NotReady($"error: {ex.Message}", timeProvider);
        }
    }

    /// <summary>
    /// Handles <c>/livez</c>: always answers 200 with status "alive" and the current time.
    /// </summary>
    private static IResult GetLiveness([FromServices] TimeProvider timeProvider)
    {
        // Liveness just checks the process is alive
        return Results.Ok(new HealthResponse("alive", timeProvider.GetUtcNow()));
    }

    /// <summary>
    /// Handles <c>/health/details</c>: runs the database, memory and thread pool checks and
    /// returns them together. Only the database check influences the overall status and the
    /// HTTP status code; memory and thread pool are reported as "healthy" or "degraded"
    /// without failing the response.
    /// </summary>
    private static async Task<IResult> GetHealthDetails(
        [FromServices] JobEngineDataSource dataSource,
        [FromServices] TimeProvider timeProvider,
        CancellationToken cancellationToken)
    {
        var checks = new Dictionary<string, HealthCheckResult>();
        var overallHealthy = true;

        // Database check
        try
        {
            var dbHealthy = await CheckDatabaseAsync(dataSource, cancellationToken).ConfigureAwait(false);
            checks["database"] = new HealthCheckResult(
                dbHealthy ? "healthy" : "unhealthy",
                dbHealthy ? null : "Connection test failed",
                timeProvider.GetUtcNow());
            overallHealthy &= dbHealthy;
        }
        catch (Exception ex)
        {
            // Defensive fallback; see the matching note in GetReadiness.
            checks["database"] = new HealthCheckResult("unhealthy", ex.Message, timeProvider.GetUtcNow());
            overallHealthy = false;
        }

        checks["memory"] = CheckMemory(timeProvider);
        checks["threadPool"] = CheckThreadPool(timeProvider);

        var response = new HealthDetailsResponse(
            overallHealthy ? "healthy" : "unhealthy",
            timeProvider.GetUtcNow(),
            checks);

        return overallHealthy
            ? Results.Ok(response)
            : Results.Json(response, statusCode: StatusCodes.Status503ServiceUnavailable);
    }

    /// <summary>
    /// Builds a readiness payload whose only dependency entry is "database".
    /// </summary>
    private static ReadinessResponse BuildReadiness(string status, string databaseState, TimeProvider timeProvider)
    {
        return new ReadinessResponse(status, timeProvider.GetUtcNow(), new Dictionary<string, string>
        {
            ["database"] = databaseState
        });
    }

    /// <summary>
    /// Builds the 503 "not_ready" result carrying the given database state.
    /// </summary>
    private static IResult NotReady(string databaseState, TimeProvider timeProvider)
    {
        return Results.Json(
            BuildReadiness("not_ready", databaseState, timeProvider),
            statusCode: StatusCodes.Status503ServiceUnavailable);
    }

    /// <summary>
    /// Compares managed heap usage with the GC's total available memory.
    /// </summary>
    private static HealthCheckResult CheckMemory(TimeProvider timeProvider)
    {
        var memoryInfo = GC.GetGCMemoryInfo();
        var memoryUsedMb = GC.GetTotalMemory(false) / BytesPerMegabyte;
        var memoryLimitMb = memoryInfo.TotalAvailableMemoryBytes / BytesPerMegabyte;
        var memoryHealthy = memoryUsedMb < memoryLimitMb * MemoryHealthyRatio;

        // Invariant formatting keeps the decimal separator stable no matter which culture
        // the current thread is running under.
        return new HealthCheckResult(
            memoryHealthy ? "healthy" : "degraded",
            FormattableString.Invariant($"Used: {memoryUsedMb:F2} MB"),
            timeProvider.GetUtcNow());
    }

    /// <summary>
    /// Compares the available worker threads with the configured maximum.
    /// </summary>
    private static HealthCheckResult CheckThreadPool(TimeProvider timeProvider)
    {
        // Only worker threads are evaluated; completion port counts are discarded.
        ThreadPool.GetAvailableThreads(out var workerThreads, out _);
        ThreadPool.GetMaxThreads(out var maxWorkerThreads, out _);
        var threadPoolHealthy = workerThreads > maxWorkerThreads * MinAvailableWorkerRatio;

        return new HealthCheckResult(
            threadPoolHealthy ? "healthy" : "degraded",
            FormattableString.Invariant($"Worker threads available: {workerThreads}/{maxWorkerThreads}"),
            timeProvider.GetUtcNow());
    }

    /// <summary>
    /// Opens a connection and runs <c>SELECT 1</c>.
    /// </summary>
    /// <returns>
    /// <c>true</c> when the query completes; <c>false</c> when anything throws, including
    /// cancellation. The probe is deliberately best-effort: callers receive a boolean and
    /// never the failure reason.
    /// </returns>
    private static async Task<bool> CheckDatabaseAsync(JobEngineDataSource dataSource, CancellationToken cancellationToken)
    {
        try
        {
            // Use a system tenant for health checks
            await using var connection = await dataSource.OpenConnectionAsync("_system", "health", cancellationToken).ConfigureAwait(false);
            await using var command = connection.CreateCommand();
            command.CommandText = "SELECT 1";
            await command.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
            return true;
        }
        catch
        {
            return false;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Minimal probe payload: a status token plus the time it was produced.
/// </summary>
/// <param name="Status">Short status token chosen by the endpoint that builds the response.</param>
/// <param name="Timestamp">Moment at which the response was generated.</param>
public sealed record HealthResponse(
    string Status,
    DateTimeOffset Timestamp);
|
||||
|
||||
/// <summary>
/// Readiness payload: overall status, generation time and a per-dependency state map.
/// </summary>
/// <param name="Status">Overall readiness token.</param>
/// <param name="Timestamp">Moment at which the response was generated.</param>
/// <param name="Dependencies">State text for each dependency, keyed by dependency name.</param>
public sealed record ReadinessResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, string> Dependencies);
|
||||
|
||||
/// <summary>
/// Outcome of a single health check.
/// </summary>
/// <param name="Status">Status token for this check.</param>
/// <param name="Details">Optional free-text detail; may be <c>null</c>.</param>
/// <param name="CheckedAt">Moment at which the check result was recorded.</param>
public sealed record HealthCheckResult(string Status, string? Details, DateTimeOffset CheckedAt);
|
||||
|
||||
/// <summary>
/// Detailed health payload: overall status, generation time and every individual check.
/// </summary>
/// <param name="Status">Overall health token.</param>
/// <param name="Timestamp">Moment at which the response was generated.</param>
/// <param name="Checks">Individual check results, keyed by check name.</param>
public sealed record HealthDetailsResponse(
    string Status,
    DateTimeOffset Timestamp,
    IReadOnlyDictionary<string, HealthCheckResult> Checks);
|
||||
Reference in New Issue
Block a user