Files
git.stella-ops.org/src/Gateway/StellaOps.Gateway.WebService/Services/GatewayHealthMonitorService.cs

107 lines
3.6 KiB
C#

using Microsoft.Extensions.Options;
using StellaOps.Router.Common.Abstractions;
using StellaOps.Router.Common.Enums;
using StellaOps.Router.Gateway.Configuration;
namespace StellaOps.Gateway.WebService.Services;
public sealed class GatewayHealthMonitorService : BackgroundService
{
private readonly IGlobalRoutingState _routingState;
private readonly IOptions<HealthOptions> _options;
private readonly ILogger<GatewayHealthMonitorService> _logger;
public GatewayHealthMonitorService(
IGlobalRoutingState routingState,
IOptions<HealthOptions> options,
ILogger<GatewayHealthMonitorService> logger)
{
_routingState = routingState;
_options = options;
_logger = logger;
}
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
_logger.LogInformation(
"Health monitor started. Stale threshold: {StaleThreshold}, Check interval: {CheckInterval}",
_options.Value.StaleThreshold,
_options.Value.CheckInterval);
while (!stoppingToken.IsCancellationRequested)
{
try
{
await Task.Delay(_options.Value.CheckInterval, stoppingToken);
CheckStaleConnections();
}
catch (OperationCanceledException)
{
break;
}
catch (Exception ex)
{
_logger.LogError(ex, "Error in health monitor loop");
}
}
_logger.LogInformation("Health monitor stopped");
}
private void CheckStaleConnections()
{
var staleThreshold = _options.Value.StaleThreshold;
var degradedThreshold = _options.Value.DegradedThreshold;
var now = DateTime.UtcNow;
var staleCount = 0;
var degradedCount = 0;
foreach (var connection in _routingState.GetAllConnections())
{
if (connection.Status == InstanceHealthStatus.Draining)
{
continue;
}
var age = now - connection.LastHeartbeatUtc;
if (age > staleThreshold && connection.Status != InstanceHealthStatus.Unhealthy)
{
_routingState.UpdateConnection(connection.ConnectionId, c =>
c.Status = InstanceHealthStatus.Unhealthy);
_logger.LogWarning(
"Instance {InstanceId} ({ServiceName}/{Version}) marked Unhealthy: no heartbeat for {Age:g}",
connection.Instance.InstanceId,
connection.Instance.ServiceName,
connection.Instance.Version,
age);
staleCount++;
}
else if (age > degradedThreshold && connection.Status == InstanceHealthStatus.Healthy)
{
_routingState.UpdateConnection(connection.ConnectionId, c =>
c.Status = InstanceHealthStatus.Degraded);
_logger.LogWarning(
"Instance {InstanceId} ({ServiceName}/{Version}) marked Degraded: delayed heartbeat ({Age:g})",
connection.Instance.InstanceId,
connection.Instance.ServiceName,
connection.Instance.Version,
age);
degradedCount++;
}
}
if (staleCount > 0 || degradedCount > 0)
{
_logger.LogDebug(
"Health check completed: {StaleCount} stale, {DegradedCount} degraded",
staleCount,
degradedCount);
}
}
}