using Microsoft.AspNetCore.Mvc;
using StellaOps.JobEngine.Core.Scale;
using static StellaOps.Localization.T;

namespace StellaOps.JobEngine.WebService.Endpoints;

/// <summary>
/// Endpoints for autoscaling metrics and load shedding status.
/// </summary>
public static class ScaleEndpoints
{
    /// <summary>
    /// Maps scale endpoints to the route builder.
    /// </summary>
    /// <param name="app">Route builder the endpoints are registered on.</param>
    /// <returns>The same <paramref name="app"/> instance, so calls can be chained.</returns>
    public static IEndpointRouteBuilder MapScaleEndpoints(this IEndpointRouteBuilder app)
    {
        // Everything under /scale is registered with AllowAnonymous.
        var group = app.MapGroup("/scale")
            .WithTags("Scaling")
            .AllowAnonymous();

        // Autoscaling metrics for KEDA/HPA
        group.MapGet("/metrics", GetAutoscaleMetrics)
            .WithName("Orchestrator_AutoscaleMetrics")
            .WithDescription(_t("orchestrator.scale.metrics_description"));

        // Prometheus-compatible metrics endpoint
        group.MapGet("/metrics/prometheus", GetPrometheusMetrics)
            .WithName("Orchestrator_PrometheusScaleMetrics")
            .WithDescription(_t("orchestrator.scale.prometheus_description"));

        // Load shedding status
        group.MapGet("/load", GetLoadStatus)
            .WithName("Orchestrator_LoadStatus")
            .WithDescription(_t("orchestrator.scale.load_description"));

        // Scale snapshot for debugging
        group.MapGet("/snapshot", GetScaleSnapshot)
            .WithName("Orchestrator_ScaleSnapshot")
            .WithDescription(_t("orchestrator.scale.snapshot_description"));

        // Startup probe (slower to pass, includes warmup check).
        // Mapped on the root builder, not the /scale group, so the path is /startupz.
        app.MapGet("/startupz", GetStartupStatus)
            .WithName("Orchestrator_StartupProbe")
            .WithTags("Health")
            .WithDescription(_t("orchestrator.scale.startupz_description"))
            .AllowAnonymous();

        return app;
    }

    /// <summary>
    /// Returns the current autoscale metrics as the body of a 200 response.
    /// </summary>
    private static IResult GetAutoscaleMetrics(
        [FromServices] ScaleMetrics scaleMetrics)
    {
        var metrics = scaleMetrics.GetAutoscaleMetrics();
        return Results.Ok(metrics);
    }

    /// <summary>
    /// Renders the autoscale metrics and load shedding status in the Prometheus
    /// text exposition format.
    /// </summary>
    private static IResult GetPrometheusMetrics(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] LoadShedder loadShedder)
    {
        var metrics = scaleMetrics.GetAutoscaleMetrics();
        var loadStatus = loadShedder.GetStatus();

        // Format as Prometheus text exposition. Sample lines go through Inv() so
        // numbers are always rendered with '.' as the decimal separator,
        // regardless of the culture of the thread serving the request.
        var lines = new List<string>
        {
            "# HELP orchestrator_queue_depth Current number of pending jobs",
            "# TYPE orchestrator_queue_depth gauge",
            Inv($"orchestrator_queue_depth {metrics.QueueDepth}"),
            "",
            "# HELP orchestrator_active_jobs Current number of active jobs",
            "# TYPE orchestrator_active_jobs gauge",
            Inv($"orchestrator_active_jobs {metrics.ActiveJobs}"),
            "",
            "# HELP orchestrator_dispatch_latency_p95_ms P95 dispatch latency in milliseconds",
            "# TYPE orchestrator_dispatch_latency_p95_ms gauge",
            Inv($"orchestrator_dispatch_latency_p95_ms {metrics.DispatchLatencyP95Ms:F2}"),
            "",
            "# HELP orchestrator_dispatch_latency_p99_ms P99 dispatch latency in milliseconds",
            "# TYPE orchestrator_dispatch_latency_p99_ms gauge",
            Inv($"orchestrator_dispatch_latency_p99_ms {metrics.DispatchLatencyP99Ms:F2}"),
            "",
            "# HELP orchestrator_recommended_replicas Recommended replica count for autoscaling",
            "# TYPE orchestrator_recommended_replicas gauge",
            Inv($"orchestrator_recommended_replicas {metrics.RecommendedReplicas}"),
            "",
            "# HELP orchestrator_under_pressure Whether the system is under pressure (1=yes, 0=no)",
            "# TYPE orchestrator_under_pressure gauge",
            Inv($"orchestrator_under_pressure {(metrics.IsUnderPressure ? 1 : 0)}"),
            "",
            "# HELP orchestrator_load_factor Current load factor (1.0 = at target)",
            "# TYPE orchestrator_load_factor gauge",
            Inv($"orchestrator_load_factor {loadStatus.LoadFactor:F3}"),
            "",
            "# HELP orchestrator_load_shedding_state Current load shedding state (0=normal, 1=warning, 2=critical, 3=emergency)",
            "# TYPE orchestrator_load_shedding_state gauge",
            Inv($"orchestrator_load_shedding_state {(int)loadStatus.State}"),
            "",
            "# HELP orchestrator_scale_samples Number of latency samples in measurement window",
            "# TYPE orchestrator_scale_samples gauge",
            Inv($"orchestrator_scale_samples {metrics.SamplesInWindow}")
        };

        // The exposition format requires the last line to end with a line feed,
        // so terminate the payload explicitly rather than only joining lines.
        return Results.Text(string.Join("\n", lines) + "\n", "text/plain");
    }

    /// <summary>
    /// Formats an interpolated string using the invariant culture.
    /// </summary>
    private static string Inv(FormattableString value) => FormattableString.Invariant(value);

    /// <summary>
    /// Returns the current load shedding status as the body of a 200 response.
    /// </summary>
    private static IResult GetLoadStatus(
        [FromServices] LoadShedder loadShedder)
    {
        var status = loadShedder.GetStatus();
        return Results.Ok(status);
    }

    /// <summary>
    /// Returns a combined view of the scale snapshot and load shedding status.
    /// </summary>
    private static IResult GetScaleSnapshot(
        [FromServices] ScaleMetrics scaleMetrics,
        [FromServices] LoadShedder loadShedder)
    {
        var snapshot = scaleMetrics.GetSnapshot();
        var loadStatus = loadShedder.GetStatus();

        return Results.Ok(new
        {
            snapshot.Timestamp,
            snapshot.TotalQueueDepth,
            snapshot.TotalActiveJobs,
            DispatchLatency = new
            {
                snapshot.DispatchLatency.Count,
                snapshot.DispatchLatency.Min,
                snapshot.DispatchLatency.Max,
                snapshot.DispatchLatency.Avg,
                snapshot.DispatchLatency.P50,
                snapshot.DispatchLatency.P95,
                snapshot.DispatchLatency.P99
            },
            LoadShedding = new
            {
                loadStatus.State,
                loadStatus.LoadFactor,
                loadStatus.IsSheddingLoad,
                loadStatus.AcceptingPriority,
                loadStatus.RecommendedDelayMs
            },
            QueueDepthByKey = snapshot.QueueDepthByKey,
            ActiveJobsByKey = snapshot.ActiveJobsByKey
        });
    }

    /// <summary>
    /// Startup probe handler: responds 503 with a "starting" payload until the
    /// probe reports ready, then 200 with a "started" payload.
    /// </summary>
    private static IResult GetStartupStatus(
        [FromServices] ScaleMetrics scaleMetrics, // NOTE(review): not used by the handler; kept so service resolution is unchanged.
        [FromServices] StartupProbe startupProbe)
    {
        // Read readiness exactly once. StartupProbe.IsReady is defined as
        // WarmupComplete, and that getter can flip to true (and rewrite the
        // status message) on any read; reading it a second time while building
        // the response could yield a "starting" payload that claims warmup is
        // complete.
        var isReady = startupProbe.IsReady;
        var uptimeSeconds = startupProbe.UptimeSeconds;

        if (!isReady)
        {
            return Results.Json(
                new StartupResponse(
                    Status: "starting",
                    Ready: false,
                    UptimeSeconds: uptimeSeconds,
                    WarmupComplete: isReady,
                    Message: startupProbe.StatusMessage),
                statusCode: StatusCodes.Status503ServiceUnavailable);
        }

        return Results.Ok(new StartupResponse(
            Status: "started",
            Ready: true,
            UptimeSeconds: uptimeSeconds,
            WarmupComplete: isReady,
            Message: "Service is ready"));
    }
}

/// <summary>
/// Startup probe response.
/// </summary>
public sealed record StartupResponse(
    string Status,
    bool Ready,
    double UptimeSeconds,
    bool WarmupComplete,
    string Message);

/// <summary>
/// Startup probe service that tracks warmup status.
/// </summary>
public sealed class StartupProbe
{
    private readonly DateTimeOffset _startTime = DateTimeOffset.UtcNow;
    private readonly TimeSpan _minWarmupTime;
    private volatile bool _warmupComplete;

    // Volatile like _warmupComplete: the message is written by whoever drives
    // warmup and read by the probe handler.
    private volatile string _statusMessage = "Starting up";

    /// <summary>
    /// Creates a probe whose warmup auto-completes after <paramref name="minWarmupTime"/>.
    /// </summary>
    /// <param name="minWarmupTime">Minimum warmup time; defaults to 5 seconds when null.</param>
    public StartupProbe(TimeSpan? minWarmupTime = null)
    {
        _minWarmupTime = minWarmupTime ?? TimeSpan.FromSeconds(5);
    }

    /// <summary>
    /// Gets whether the service is ready.
    /// </summary>
    public bool IsReady => WarmupComplete;

    /// <summary>
    /// Gets whether warmup has completed. Reading this property marks warmup
    /// complete once the uptime reaches the minimum warmup time.
    /// </summary>
    public bool WarmupComplete
    {
        get
        {
            if (_warmupComplete)
            {
                return true;
            }

            // Auto-complete warmup after minimum time
            if (UptimeSeconds >= _minWarmupTime.TotalSeconds)
            {
                // Publish the message before the flag so a reader that sees the
                // flag set also sees the matching message.
                _statusMessage = "Warmup complete";
                _warmupComplete = true;
            }

            return _warmupComplete;
        }
    }

    /// <summary>
    /// Gets the uptime in seconds.
    /// </summary>
    public double UptimeSeconds => (DateTimeOffset.UtcNow - _startTime).TotalSeconds;

    /// <summary>
    /// Gets the current status message.
    /// </summary>
    public string StatusMessage => _statusMessage;

    /// <summary>
    /// Marks warmup as complete.
    /// </summary>
    public void MarkWarmupComplete()
    {
        // Message first, flag second (see WarmupComplete).
        _statusMessage = "Warmup complete";
        _warmupComplete = true;
    }

    /// <summary>
    /// Updates the status message.
    /// </summary>
    /// <param name="message">New status message.</param>
    public void SetStatus(string message)
    {
        _statusMessage = message;
    }
}