251 lines
8.6 KiB
C#
251 lines
8.6 KiB
C#
using Microsoft.AspNetCore.Mvc;
|
|
using StellaOps.JobEngine.Core.Scale;
|
|
using static StellaOps.Localization.T;
|
|
|
|
namespace StellaOps.JobEngine.WebService.Endpoints;
|
|
|
|
/// <summary>
/// Endpoints for autoscaling metrics and load shedding status.
/// </summary>
/// <remarks>
/// Registers four read-only routes under <c>/scale</c> plus a root-level
/// <c>/startupz</c> probe. All of them are mapped with <c>AllowAnonymous()</c>.
/// </remarks>
public static class ScaleEndpoints
{
    /// <summary>
    /// Maps scale endpoints to the route builder.
    /// </summary>
    /// <param name="app">Route builder that receives the <c>/scale</c> group and the <c>/startupz</c> route.</param>
    /// <returns>The same <paramref name="app"/> instance, so calls can be chained.</returns>
    public static IEndpointRouteBuilder MapScaleEndpoints(this IEndpointRouteBuilder app)
    {
        var group = app.MapGroup("/scale")
            .WithTags("Scaling")
            .AllowAnonymous();

        // Autoscaling metrics for KEDA/HPA
        group.MapGet("/metrics", GetAutoscaleMetrics)
            .WithName("Orchestrator_AutoscaleMetrics")
            .WithDescription(_t("orchestrator.scale.metrics_description"));

        // Prometheus-compatible metrics endpoint
        group.MapGet("/metrics/prometheus", GetPrometheusMetrics)
            .WithName("Orchestrator_PrometheusScaleMetrics")
            .WithDescription(_t("orchestrator.scale.prometheus_description"));

        // Load shedding status
        group.MapGet("/load", GetLoadStatus)
            .WithName("Orchestrator_LoadStatus")
            .WithDescription(_t("orchestrator.scale.load_description"));

        // Scale snapshot for debugging
        group.MapGet("/snapshot", GetScaleSnapshot)
            .WithName("Orchestrator_ScaleSnapshot")
            .WithDescription(_t("orchestrator.scale.snapshot_description"));

        // Startup probe (slower to pass, includes warmup check).
        // Mapped on the root builder, so the route is /startupz rather than /scale/startupz.
        app.MapGet("/startupz", GetStartupStatus)
            .WithName("Orchestrator_StartupProbe")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.scale.startupz_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Returns the current autoscale metrics as a 200 response.
    /// </summary>
    private static IResult GetAutoscaleMetrics(
        [FromServices] ScaleMetrics scaleMetrics)
    {
        return Results.Ok(scaleMetrics.GetAutoscaleMetrics());
    }

    /// <summary>
    /// Renders the autoscale metrics and load shedding status as Prometheus text exposition.
    /// </summary>
    /// <remarks>
    /// Every sample line is formatted with the invariant culture so that the decimal
    /// separator is always <c>.</c> regardless of the server's current culture, and the
    /// payload ends with a line feed as the text exposition format requires.
    /// </remarks>
    private static IResult GetPrometheusMetrics(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] LoadShedder loadShedder)
    {
        var metrics = scaleMetrics.GetAutoscaleMetrics();
        var loadStatus = loadShedder.GetStatus();

        // Format as Prometheus text exposition
        var lines = new List<string>
        {
            "# HELP orchestrator_queue_depth Current number of pending jobs",
            "# TYPE orchestrator_queue_depth gauge",
            Sample($"orchestrator_queue_depth {metrics.QueueDepth}"),
            "",
            "# HELP orchestrator_active_jobs Current number of active jobs",
            "# TYPE orchestrator_active_jobs gauge",
            Sample($"orchestrator_active_jobs {metrics.ActiveJobs}"),
            "",
            "# HELP orchestrator_dispatch_latency_p95_ms P95 dispatch latency in milliseconds",
            "# TYPE orchestrator_dispatch_latency_p95_ms gauge",
            Sample($"orchestrator_dispatch_latency_p95_ms {metrics.DispatchLatencyP95Ms:F2}"),
            "",
            "# HELP orchestrator_dispatch_latency_p99_ms P99 dispatch latency in milliseconds",
            "# TYPE orchestrator_dispatch_latency_p99_ms gauge",
            Sample($"orchestrator_dispatch_latency_p99_ms {metrics.DispatchLatencyP99Ms:F2}"),
            "",
            "# HELP orchestrator_recommended_replicas Recommended replica count for autoscaling",
            "# TYPE orchestrator_recommended_replicas gauge",
            Sample($"orchestrator_recommended_replicas {metrics.RecommendedReplicas}"),
            "",
            "# HELP orchestrator_under_pressure Whether the system is under pressure (1=yes, 0=no)",
            "# TYPE orchestrator_under_pressure gauge",
            Sample($"orchestrator_under_pressure {(metrics.IsUnderPressure ? 1 : 0)}"),
            "",
            "# HELP orchestrator_load_factor Current load factor (1.0 = at target)",
            "# TYPE orchestrator_load_factor gauge",
            Sample($"orchestrator_load_factor {loadStatus.LoadFactor:F3}"),
            "",
            "# HELP orchestrator_load_shedding_state Current load shedding state (0=normal, 1=warning, 2=critical, 3=emergency)",
            "# TYPE orchestrator_load_shedding_state gauge",
            Sample($"orchestrator_load_shedding_state {(int)loadStatus.State}"),
            "",
            "# HELP orchestrator_scale_samples Number of latency samples in measurement window",
            "# TYPE orchestrator_scale_samples gauge",
            Sample($"orchestrator_scale_samples {metrics.SamplesInWindow}")
        };

        // The exposition format requires the last line to be terminated by a line feed.
        return Results.Text(string.Join("\n", lines) + "\n", "text/plain");
    }

    /// <summary>
    /// Formats a single exposition line with the invariant culture.
    /// </summary>
    private static string Sample(System.FormattableString line) =>
        System.FormattableString.Invariant(line);

    /// <summary>
    /// Returns the current load shedding status as a 200 response.
    /// </summary>
    private static IResult GetLoadStatus(
        [FromServices] LoadShedder loadShedder)
    {
        return Results.Ok(loadShedder.GetStatus());
    }

    /// <summary>
    /// Returns a combined view of the scale snapshot and load shedding status for debugging.
    /// </summary>
    private static IResult GetScaleSnapshot(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] LoadShedder loadShedder)
    {
        var snapshot = scaleMetrics.GetSnapshot();
        var loadStatus = loadShedder.GetStatus();
        var latency = snapshot.DispatchLatency;

        // Property names of the anonymous objects define the JSON shape; keep them stable.
        return Results.Ok(new
        {
            snapshot.Timestamp,
            snapshot.TotalQueueDepth,
            snapshot.TotalActiveJobs,
            DispatchLatency = new
            {
                latency.Count,
                latency.Min,
                latency.Max,
                latency.Avg,
                latency.P50,
                latency.P95,
                latency.P99
            },
            LoadShedding = new
            {
                loadStatus.State,
                loadStatus.LoadFactor,
                loadStatus.IsSheddingLoad,
                loadStatus.AcceptingPriority,
                loadStatus.RecommendedDelayMs
            },
            snapshot.QueueDepthByKey,
            snapshot.ActiveJobsByKey
        });
    }

    /// <summary>
    /// Reports startup state: 503 with status "starting" until the probe is ready,
    /// then 200 with status "started".
    /// </summary>
    /// <remarks>
    /// NOTE(review): <paramref name="scaleMetrics"/> is not used by the handler body.
    /// It may be kept deliberately so the probe depends on that service being
    /// resolvable; confirm before removing it.
    /// </remarks>
    private static IResult GetStartupStatus(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] StartupProbe startupProbe)
    {
        var ready = startupProbe.IsReady;

        var response = new StartupResponse(
            Status: ready ? "started" : "starting",
            Ready: ready,
            UptimeSeconds: startupProbe.UptimeSeconds,
            WarmupComplete: startupProbe.WarmupComplete,
            Message: ready ? "Service is ready" : startupProbe.StatusMessage);

        return ready
            ? Results.Ok(response)
            : Results.Json(response, statusCode: StatusCodes.Status503ServiceUnavailable);
    }
}
|
|
|
|
/// <summary>
/// Payload returned by the startup probe endpoint.
/// </summary>
/// <param name="Status">Textual state; the probe handler emits "starting" or "started".</param>
/// <param name="Ready">Whether the service reported itself ready.</param>
/// <param name="UptimeSeconds">Seconds elapsed since the probe was created.</param>
/// <param name="WarmupComplete">Whether warmup has finished.</param>
/// <param name="Message">Human-readable status message.</param>
public sealed record StartupResponse(string Status, bool Ready, double UptimeSeconds, bool WarmupComplete, string Message);
|
|
|
|
/// <summary>
/// Startup probe service that tracks warmup status.
/// </summary>
/// <remarks>
/// Warmup completes either explicitly through <see cref="MarkWarmupComplete"/> or
/// automatically the first time <see cref="WarmupComplete"/> is read after the
/// minimum warmup time has elapsed. Uptime is measured with a monotonic timestamp
/// so that wall-clock adjustments cannot make it negative or jump.
/// </remarks>
public sealed class StartupProbe
{
    private const string WarmupCompleteMessage = "Warmup complete";

    // Monotonic start marker; Stopwatch.GetTimestamp is unaffected by system clock changes.
    private readonly long _startTimestamp = System.Diagnostics.Stopwatch.GetTimestamp();
    private readonly TimeSpan _minWarmupTime;
    private volatile bool _warmupComplete;
    private volatile string _statusMessage = "Starting up";

    /// <summary>
    /// Creates the probe and starts measuring uptime.
    /// </summary>
    /// <param name="minWarmupTime">
    /// Time after which warmup is treated as complete automatically; defaults to 5 seconds.
    /// </param>
    public StartupProbe(TimeSpan? minWarmupTime = null)
    {
        _minWarmupTime = minWarmupTime ?? TimeSpan.FromSeconds(5);
    }

    /// <summary>
    /// Gets whether the service is ready.
    /// </summary>
    public bool IsReady => WarmupComplete;

    /// <summary>
    /// Gets whether warmup has completed.
    /// </summary>
    /// <remarks>
    /// Reading this property has a side effect: once the minimum warmup time has
    /// elapsed it marks warmup complete and sets the status message accordingly.
    /// </remarks>
    public bool WarmupComplete
    {
        get
        {
            if (_warmupComplete)
            {
                return true;
            }

            // Auto-complete warmup after minimum time
            if (UptimeSeconds >= _minWarmupTime.TotalSeconds)
            {
                MarkWarmupComplete();
            }

            return _warmupComplete;
        }
    }

    /// <summary>
    /// Gets the uptime in seconds.
    /// </summary>
    public double UptimeSeconds =>
        (System.Diagnostics.Stopwatch.GetTimestamp() - _startTimestamp)
        / (double)System.Diagnostics.Stopwatch.Frequency;

    /// <summary>
    /// Gets the current status message.
    /// </summary>
    public string StatusMessage => _statusMessage;

    /// <summary>
    /// Marks warmup as complete.
    /// </summary>
    public void MarkWarmupComplete()
    {
        // Publish the message before the flag so a reader that observes the flag
        // as true also observes the matching message.
        _statusMessage = WarmupCompleteMessage;
        _warmupComplete = true;
    }

    /// <summary>
    /// Updates the status message.
    /// </summary>
    /// <param name="message">New status text returned by <see cref="StatusMessage"/>.</param>
    public void SetStatus(string message)
    {
        _statusMessage = message;
    }
}
|