using Microsoft.AspNetCore.Mvc;
using StellaOps.JobEngine.Core.Scale;
using static StellaOps.Localization.T;
namespace StellaOps.JobEngine.WebService.Endpoints;
///
/// Endpoints for autoscaling metrics and load shedding status.
///
public static class ScaleEndpoints
{
///
/// Maps scale endpoints to the route builder.
///
public static IEndpointRouteBuilder MapScaleEndpoints(this IEndpointRouteBuilder app)
{
var group = app.MapGroup("/scale")
.WithTags("Scaling")
.AllowAnonymous();
// Autoscaling metrics for KEDA/HPA
group.MapGet("/metrics", GetAutoscaleMetrics)
.WithName("Orchestrator_AutoscaleMetrics")
.WithDescription(_t("orchestrator.scale.metrics_description"));
// Prometheus-compatible metrics endpoint
group.MapGet("/metrics/prometheus", GetPrometheusMetrics)
.WithName("Orchestrator_PrometheusScaleMetrics")
.WithDescription(_t("orchestrator.scale.prometheus_description"));
// Load shedding status
group.MapGet("/load", GetLoadStatus)
.WithName("Orchestrator_LoadStatus")
.WithDescription(_t("orchestrator.scale.load_description"));
// Scale snapshot for debugging
group.MapGet("/snapshot", GetScaleSnapshot)
.WithName("Orchestrator_ScaleSnapshot")
.WithDescription(_t("orchestrator.scale.snapshot_description"));
// Startup probe (slower to pass, includes warmup check)
app.MapGet("/startupz", GetStartupStatus)
.WithName("Orchestrator_StartupProbe")
.WithTags("Health")
.WithDescription(_t("orchestrator.scale.startupz_description"))
.AllowAnonymous();
return app;
}
private static IResult GetAutoscaleMetrics(
[FromServices] ScaleMetrics scaleMetrics)
{
var metrics = scaleMetrics.GetAutoscaleMetrics();
return Results.Ok(metrics);
}
private static IResult GetPrometheusMetrics(
[FromServices] ScaleMetrics scaleMetrics,
[FromServices] LoadShedder loadShedder)
{
var metrics = scaleMetrics.GetAutoscaleMetrics();
var loadStatus = loadShedder.GetStatus();
// Format as Prometheus text exposition
var lines = new List
{
"# HELP orchestrator_queue_depth Current number of pending jobs",
"# TYPE orchestrator_queue_depth gauge",
$"orchestrator_queue_depth {metrics.QueueDepth}",
"",
"# HELP orchestrator_active_jobs Current number of active jobs",
"# TYPE orchestrator_active_jobs gauge",
$"orchestrator_active_jobs {metrics.ActiveJobs}",
"",
"# HELP orchestrator_dispatch_latency_p95_ms P95 dispatch latency in milliseconds",
"# TYPE orchestrator_dispatch_latency_p95_ms gauge",
$"orchestrator_dispatch_latency_p95_ms {metrics.DispatchLatencyP95Ms:F2}",
"",
"# HELP orchestrator_dispatch_latency_p99_ms P99 dispatch latency in milliseconds",
"# TYPE orchestrator_dispatch_latency_p99_ms gauge",
$"orchestrator_dispatch_latency_p99_ms {metrics.DispatchLatencyP99Ms:F2}",
"",
"# HELP orchestrator_recommended_replicas Recommended replica count for autoscaling",
"# TYPE orchestrator_recommended_replicas gauge",
$"orchestrator_recommended_replicas {metrics.RecommendedReplicas}",
"",
"# HELP orchestrator_under_pressure Whether the system is under pressure (1=yes, 0=no)",
"# TYPE orchestrator_under_pressure gauge",
$"orchestrator_under_pressure {(metrics.IsUnderPressure ? 1 : 0)}",
"",
"# HELP orchestrator_load_factor Current load factor (1.0 = at target)",
"# TYPE orchestrator_load_factor gauge",
$"orchestrator_load_factor {loadStatus.LoadFactor:F3}",
"",
"# HELP orchestrator_load_shedding_state Current load shedding state (0=normal, 1=warning, 2=critical, 3=emergency)",
"# TYPE orchestrator_load_shedding_state gauge",
$"orchestrator_load_shedding_state {(int)loadStatus.State}",
"",
"# HELP orchestrator_scale_samples Number of latency samples in measurement window",
"# TYPE orchestrator_scale_samples gauge",
$"orchestrator_scale_samples {metrics.SamplesInWindow}"
};
return Results.Text(string.Join("\n", lines), "text/plain");
}
private static IResult GetLoadStatus(
[FromServices] LoadShedder loadShedder)
{
var status = loadShedder.GetStatus();
return Results.Ok(status);
}
private static IResult GetScaleSnapshot(
[FromServices] ScaleMetrics scaleMetrics,
[FromServices] LoadShedder loadShedder)
{
var snapshot = scaleMetrics.GetSnapshot();
var loadStatus = loadShedder.GetStatus();
return Results.Ok(new
{
snapshot.Timestamp,
snapshot.TotalQueueDepth,
snapshot.TotalActiveJobs,
DispatchLatency = new
{
snapshot.DispatchLatency.Count,
snapshot.DispatchLatency.Min,
snapshot.DispatchLatency.Max,
snapshot.DispatchLatency.Avg,
snapshot.DispatchLatency.P50,
snapshot.DispatchLatency.P95,
snapshot.DispatchLatency.P99
},
LoadShedding = new
{
loadStatus.State,
loadStatus.LoadFactor,
loadStatus.IsSheddingLoad,
loadStatus.AcceptingPriority,
loadStatus.RecommendedDelayMs
},
QueueDepthByKey = snapshot.QueueDepthByKey,
ActiveJobsByKey = snapshot.ActiveJobsByKey
});
}
private static IResult GetStartupStatus(
[FromServices] ScaleMetrics scaleMetrics,
[FromServices] StartupProbe startupProbe)
{
if (!startupProbe.IsReady)
{
return Results.Json(new StartupResponse(
Status: "starting",
Ready: false,
UptimeSeconds: startupProbe.UptimeSeconds,
WarmupComplete: startupProbe.WarmupComplete,
Message: startupProbe.StatusMessage),
statusCode: StatusCodes.Status503ServiceUnavailable);
}
return Results.Ok(new StartupResponse(
Status: "started",
Ready: true,
UptimeSeconds: startupProbe.UptimeSeconds,
WarmupComplete: startupProbe.WarmupComplete,
Message: "Service is ready"));
}
}
///
/// Startup probe response.
///
public sealed record StartupResponse(
string Status,
bool Ready,
double UptimeSeconds,
bool WarmupComplete,
string Message);
///
/// Startup probe service that tracks warmup status.
///
public sealed class StartupProbe
{
private readonly DateTimeOffset _startTime = DateTimeOffset.UtcNow;
private readonly TimeSpan _minWarmupTime;
private volatile bool _warmupComplete;
private string _statusMessage = "Starting up";
public StartupProbe(TimeSpan? minWarmupTime = null)
{
_minWarmupTime = minWarmupTime ?? TimeSpan.FromSeconds(5);
}
///
/// Gets whether the service is ready.
///
public bool IsReady => WarmupComplete;
///
/// Gets whether warmup has completed.
///
public bool WarmupComplete
{
get
{
if (_warmupComplete) return true;
// Auto-complete warmup after minimum time
if (UptimeSeconds >= _minWarmupTime.TotalSeconds)
{
_warmupComplete = true;
_statusMessage = "Warmup complete";
}
return _warmupComplete;
}
}
///
/// Gets the uptime in seconds.
///
public double UptimeSeconds => (DateTimeOffset.UtcNow - _startTime).TotalSeconds;
///
/// Gets the current status message.
///
public string StatusMessage => _statusMessage;
///
/// Marks warmup as complete.
///
public void MarkWarmupComplete()
{
_warmupComplete = true;
_statusMessage = "Warmup complete";
}
///
/// Updates the status message.
///
public void SetStatus(string message)
{
_statusMessage = message;
}
}