consolidation of some of the modules, localization fixes, product advisories work, qa work
This commit is contained in:
@@ -0,0 +1,250 @@
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using StellaOps.JobEngine.Core.Scale;
|
||||
using static StellaOps.Localization.T;
|
||||
|
||||
namespace StellaOps.JobEngine.WebService.Endpoints;
|
||||
|
||||
/// <summary>
/// Minimal-API endpoints that expose autoscaling metrics, load shedding status,
/// a diagnostic scale snapshot and the startup probe.
/// </summary>
public static class ScaleEndpoints
{
    /// <summary>
    /// Registers the <c>/scale/*</c> endpoints and the <c>/startupz</c> probe.
    /// </summary>
    /// <param name="app">Route builder the endpoints are added to.</param>
    /// <returns>The same <paramref name="app"/> instance, so calls can be chained.</returns>
    public static IEndpointRouteBuilder MapScaleEndpoints(this IEndpointRouteBuilder app)
    {
        // Every route under /scale is tagged "Scaling" and marked AllowAnonymous.
        var group = app.MapGroup("/scale")
            .WithTags("Scaling")
            .AllowAnonymous();

        // Autoscaling metrics for KEDA/HPA
        group.MapGet("/metrics", GetAutoscaleMetrics)
            .WithName("Orchestrator_AutoscaleMetrics")
            .WithDescription(_t("orchestrator.scale.metrics_description"));

        // Prometheus-compatible metrics endpoint
        group.MapGet("/metrics/prometheus", GetPrometheusMetrics)
            .WithName("Orchestrator_PrometheusScaleMetrics")
            .WithDescription(_t("orchestrator.scale.prometheus_description"));

        // Load shedding status
        group.MapGet("/load", GetLoadStatus)
            .WithName("Orchestrator_LoadStatus")
            .WithDescription(_t("orchestrator.scale.load_description"));

        // Scale snapshot for debugging
        group.MapGet("/snapshot", GetScaleSnapshot)
            .WithName("Orchestrator_ScaleSnapshot")
            .WithDescription(_t("orchestrator.scale.snapshot_description"));

        // Startup probe (slower to pass, includes warmup check).
        // Mapped on the root builder, not on the /scale group, so its path is /startupz.
        app.MapGet("/startupz", GetStartupStatus)
            .WithName("Orchestrator_StartupProbe")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.scale.startupz_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Returns the current autoscale metrics as a 200 OK result.
    /// </summary>
    private static IResult GetAutoscaleMetrics(
        [FromServices] ScaleMetrics scaleMetrics)
    {
        return Results.Ok(scaleMetrics.GetAutoscaleMetrics());
    }

    /// <summary>
    /// Renders the autoscale metrics and load shedding status as gauges in the
    /// Prometheus text exposition format.
    /// </summary>
    /// <remarks>
    /// Sample values are formatted with the invariant culture so the decimal
    /// separator is always <c>.</c> regardless of the current culture, and the
    /// payload ends with a line feed as the exposition format requires.
    /// </remarks>
    private static IResult GetPrometheusMetrics(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] LoadShedder loadShedder)
    {
        var metrics = scaleMetrics.GetAutoscaleMetrics();
        var loadStatus = loadShedder.GetStatus();

        // Plain $"..." interpolation formats numbers with the current culture; a
        // culture with a decimal comma would emit "12,34" and break scrapers.
        static string Sample(FormattableString line) => FormattableString.Invariant(line);

        // Format as Prometheus text exposition
        var lines = new List<string>
        {
            "# HELP orchestrator_queue_depth Current number of pending jobs",
            "# TYPE orchestrator_queue_depth gauge",
            Sample($"orchestrator_queue_depth {metrics.QueueDepth}"),
            "",
            "# HELP orchestrator_active_jobs Current number of active jobs",
            "# TYPE orchestrator_active_jobs gauge",
            Sample($"orchestrator_active_jobs {metrics.ActiveJobs}"),
            "",
            "# HELP orchestrator_dispatch_latency_p95_ms P95 dispatch latency in milliseconds",
            "# TYPE orchestrator_dispatch_latency_p95_ms gauge",
            Sample($"orchestrator_dispatch_latency_p95_ms {metrics.DispatchLatencyP95Ms:F2}"),
            "",
            "# HELP orchestrator_dispatch_latency_p99_ms P99 dispatch latency in milliseconds",
            "# TYPE orchestrator_dispatch_latency_p99_ms gauge",
            Sample($"orchestrator_dispatch_latency_p99_ms {metrics.DispatchLatencyP99Ms:F2}"),
            "",
            "# HELP orchestrator_recommended_replicas Recommended replica count for autoscaling",
            "# TYPE orchestrator_recommended_replicas gauge",
            Sample($"orchestrator_recommended_replicas {metrics.RecommendedReplicas}"),
            "",
            "# HELP orchestrator_under_pressure Whether the system is under pressure (1=yes, 0=no)",
            "# TYPE orchestrator_under_pressure gauge",
            Sample($"orchestrator_under_pressure {(metrics.IsUnderPressure ? 1 : 0)}"),
            "",
            "# HELP orchestrator_load_factor Current load factor (1.0 = at target)",
            "# TYPE orchestrator_load_factor gauge",
            Sample($"orchestrator_load_factor {loadStatus.LoadFactor:F3}"),
            "",
            "# HELP orchestrator_load_shedding_state Current load shedding state (0=normal, 1=warning, 2=critical, 3=emergency)",
            "# TYPE orchestrator_load_shedding_state gauge",
            Sample($"orchestrator_load_shedding_state {(int)loadStatus.State}"),
            "",
            "# HELP orchestrator_scale_samples Number of latency samples in measurement window",
            "# TYPE orchestrator_scale_samples gauge",
            Sample($"orchestrator_scale_samples {metrics.SamplesInWindow}")
        };

        // The exposition format requires the last line to end with a line feed.
        return Results.Text(string.Join("\n", lines) + "\n", "text/plain");
    }

    /// <summary>
    /// Returns the current load shedding status as a 200 OK result.
    /// </summary>
    private static IResult GetLoadStatus(
        [FromServices] LoadShedder loadShedder)
    {
        return Results.Ok(loadShedder.GetStatus());
    }

    /// <summary>
    /// Returns a combined view of the scale snapshot and the load shedding
    /// status as a 200 OK result.
    /// </summary>
    private static IResult GetScaleSnapshot(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] LoadShedder loadShedder)
    {
        var snapshot = scaleMetrics.GetSnapshot();
        var loadStatus = loadShedder.GetStatus();

        // Anonymous-type property names are taken from the projected members,
        // so the declaration order below is the response shape.
        return Results.Ok(new
        {
            snapshot.Timestamp,
            snapshot.TotalQueueDepth,
            snapshot.TotalActiveJobs,
            DispatchLatency = new
            {
                snapshot.DispatchLatency.Count,
                snapshot.DispatchLatency.Min,
                snapshot.DispatchLatency.Max,
                snapshot.DispatchLatency.Avg,
                snapshot.DispatchLatency.P50,
                snapshot.DispatchLatency.P95,
                snapshot.DispatchLatency.P99
            },
            LoadShedding = new
            {
                loadStatus.State,
                loadStatus.LoadFactor,
                loadStatus.IsSheddingLoad,
                loadStatus.AcceptingPriority,
                loadStatus.RecommendedDelayMs
            },
            QueueDepthByKey = snapshot.QueueDepthByKey,
            ActiveJobsByKey = snapshot.ActiveJobsByKey
        });
    }

    /// <summary>
    /// Startup probe handler: 503 with a "starting" payload while the probe is
    /// not ready, 200 with a "started" payload once it is.
    /// </summary>
    /// <remarks>
    /// <paramref name="scaleMetrics"/> is not read by this handler; it is kept so
    /// the handler's service requirements stay as they were.
    /// </remarks>
    private static IResult GetStartupStatus(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] StartupProbe startupProbe)
    {
        if (!startupProbe.IsReady)
        {
            return Results.Json(new StartupResponse(
                Status: "starting",
                Ready: false,
                UptimeSeconds: startupProbe.UptimeSeconds,
                WarmupComplete: startupProbe.WarmupComplete,
                Message: startupProbe.StatusMessage),
                statusCode: StatusCodes.Status503ServiceUnavailable);
        }

        return Results.Ok(new StartupResponse(
            Status: "started",
            Ready: true,
            UptimeSeconds: startupProbe.UptimeSeconds,
            WarmupComplete: startupProbe.WarmupComplete,
            Message: "Service is ready"));
    }
}
|
||||
|
||||
/// <summary>
/// Payload returned by the startup probe endpoint.
/// </summary>
/// <param name="Status">Short state label for the probe result.</param>
/// <param name="Ready">Whether the service reports itself as ready.</param>
/// <param name="UptimeSeconds">Uptime of the service, in seconds.</param>
/// <param name="WarmupComplete">Whether warmup has completed.</param>
/// <param name="Message">Human-readable status text.</param>
public sealed record StartupResponse(
    string Status,
    bool Ready,
    double UptimeSeconds,
    bool WarmupComplete,
    string Message);
|
||||
|
||||
/// <summary>
/// Startup probe service that tracks warmup status.
/// </summary>
/// <remarks>
/// Warmup is considered complete either when <see cref="MarkWarmupComplete"/> is
/// called or, lazily on the next read of <see cref="WarmupComplete"/>, once the
/// minimum warmup time has elapsed since construction.
/// </remarks>
public sealed class StartupProbe
{
    private const string WarmupCompleteMessage = "Warmup complete";

    // Monotonic timestamp taken at construction. A wall-clock source would make
    // uptime jump (or go negative) whenever the system clock is adjusted.
    private readonly long _startTimestamp = System.Diagnostics.Stopwatch.GetTimestamp();
    private readonly TimeSpan _minWarmupTime;
    private volatile bool _warmupComplete;

    // Volatile like _warmupComplete: both fields are written and read without a lock.
    private volatile string _statusMessage = "Starting up";

    /// <summary>
    /// Creates the probe and starts measuring uptime.
    /// </summary>
    /// <param name="minWarmupTime">
    /// Time after which warmup is treated as complete automatically;
    /// 5 seconds when <see langword="null"/>.
    /// </param>
    public StartupProbe(TimeSpan? minWarmupTime = null)
    {
        _minWarmupTime = minWarmupTime ?? TimeSpan.FromSeconds(5);
    }

    /// <summary>
    /// Gets whether the service is ready. Currently identical to <see cref="WarmupComplete"/>.
    /// </summary>
    public bool IsReady => WarmupComplete;

    /// <summary>
    /// Gets whether warmup has completed.
    /// </summary>
    /// <remarks>
    /// Reading this property has a side effect: if the minimum warmup time has
    /// elapsed it marks warmup complete and sets the status message.
    /// </remarks>
    public bool WarmupComplete
    {
        get
        {
            // Auto-complete warmup after minimum time
            if (!_warmupComplete && UptimeSeconds >= _minWarmupTime.TotalSeconds)
            {
                MarkWarmupComplete();
            }

            return _warmupComplete;
        }
    }

    /// <summary>
    /// Gets the time elapsed since construction, in seconds (monotonic clock).
    /// </summary>
    public double UptimeSeconds =>
        (System.Diagnostics.Stopwatch.GetTimestamp() - _startTimestamp)
        / (double)System.Diagnostics.Stopwatch.Frequency;

    /// <summary>
    /// Gets the current status message.
    /// </summary>
    public string StatusMessage => _statusMessage;

    /// <summary>
    /// Marks warmup as complete and sets the status message to "Warmup complete".
    /// </summary>
    public void MarkWarmupComplete()
    {
        // Publish the message before the flag so a reader that observes the flag
        // also observes the matching message.
        _statusMessage = WarmupCompleteMessage;
        _warmupComplete = true;
    }

    /// <summary>
    /// Replaces the status message.
    /// </summary>
    /// <param name="message">New status text.</param>
    public void SetStatus(string message)
    {
        _statusMessage = message;
    }
}
|
||||
Reference in New Issue
Block a user