107 lines
3.6 KiB
C#
107 lines
3.6 KiB
C#
using Microsoft.Extensions.Options;
|
|
using StellaOps.Router.Common.Abstractions;
|
|
using StellaOps.Router.Common.Enums;
|
|
using StellaOps.Router.Gateway.Configuration;
|
|
|
|
namespace StellaOps.Gateway.WebService.Services;
|
|
|
|
/// <summary>
/// Background service that periodically scans every connection exposed by the global routing
/// state and downgrades the health status of instances whose heartbeats are overdue.
/// </summary>
/// <remarks>
/// On each pass a connection whose last heartbeat is older than <c>HealthOptions.StaleThreshold</c>
/// is marked <c>Unhealthy</c>, and a <c>Healthy</c> connection whose last heartbeat is older than
/// <c>HealthOptions.DegradedThreshold</c> is marked <c>Degraded</c>. Connections that are
/// <c>Draining</c> are skipped. This service only ever downgrades a status; it never promotes a
/// connection back to a healthier one.
/// </remarks>
public sealed class GatewayHealthMonitorService : BackgroundService
{
    /// <summary>
    /// Delay used between checks when the configured check interval is zero or negative.
    /// </summary>
    private static readonly TimeSpan FallbackCheckInterval = TimeSpan.FromSeconds(1);

    private readonly IGlobalRoutingState _routingState;
    private readonly IOptions<HealthOptions> _options;
    private readonly ILogger<GatewayHealthMonitorService> _logger;

    // Ensures the "invalid interval" warning is written once rather than on every iteration.
    // Only touched from the ExecuteAsync loop.
    private bool _invalidIntervalReported;

    /// <summary>
    /// Creates the health monitor.
    /// </summary>
    /// <param name="routingState">Routing state whose connections are inspected and updated.</param>
    /// <param name="options">Health options supplying the thresholds and the check interval.</param>
    /// <param name="logger">Logger used for lifecycle messages and status transitions.</param>
    public GatewayHealthMonitorService(
        IGlobalRoutingState routingState,
        IOptions<HealthOptions> options,
        ILogger<GatewayHealthMonitorService> logger)
    {
        _routingState = routingState;
        _options = options;
        _logger = logger;
    }

    /// <summary>
    /// Runs the monitor loop: wait for the check interval, run one stale-connection check, repeat
    /// until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    /// <param name="stoppingToken">Token that signals the service should stop.</param>
    /// <returns>A task that completes when the loop has exited.</returns>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogInformation(
            "Health monitor started. Stale threshold: {StaleThreshold}, Check interval: {CheckInterval}",
            _options.Value.StaleThreshold,
            _options.Value.CheckInterval);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await Task.Delay(ResolveCheckInterval(), stoppingToken);
                CheckStaleConnections();
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown was requested while waiting or checking: leave the loop quietly.
                // A cancellation that did NOT come from the stopping token falls through to the
                // general handler below so it cannot silently end monitoring.
                break;
            }
            catch (Exception ex)
            {
                // A single failed pass must not stop monitoring; log it and try again next interval.
                _logger.LogError(ex, "Error in health monitor loop");
            }
        }

        _logger.LogInformation("Health monitor stopped");
    }

    /// <summary>
    /// Returns the delay to wait before the next check.
    /// </summary>
    /// <remarks>
    /// The configured interval is used whenever it is positive, or equal to
    /// <see cref="Timeout.InfiniteTimeSpan"/> (which <c>Task.Delay</c> accepts). A zero interval
    /// would make the loop spin without ever yielding, and any other negative interval makes
    /// <c>Task.Delay</c> throw on every iteration, so both are replaced by
    /// <see cref="FallbackCheckInterval"/> and reported once.
    /// </remarks>
    private TimeSpan ResolveCheckInterval()
    {
        TimeSpan configured = _options.Value.CheckInterval;

        if (configured > TimeSpan.Zero || configured == Timeout.InfiniteTimeSpan)
        {
            return configured;
        }

        if (!_invalidIntervalReported)
        {
            _invalidIntervalReported = true;
            _logger.LogWarning(
                "Configured health check interval {CheckInterval} is not positive; using {FallbackInterval} instead",
                configured,
                FallbackCheckInterval);
        }

        return FallbackCheckInterval;
    }

    /// <summary>
    /// Performs one pass over all connections and downgrades those with overdue heartbeats.
    /// </summary>
    private void CheckStaleConnections()
    {
        var staleThreshold = _options.Value.StaleThreshold;
        var degradedThreshold = _options.Value.DegradedThreshold;

        // One timestamp for the whole pass so every connection is judged against the same instant.
        var now = DateTime.UtcNow;

        var staleCount = 0;
        var degradedCount = 0;

        foreach (var connection in _routingState.GetAllConnections())
        {
            // Draining connections are never downgraded by this monitor.
            if (connection.Status == InstanceHealthStatus.Draining)
            {
                continue;
            }

            var age = now - connection.LastHeartbeatUtc;

            if (age > staleThreshold && connection.Status != InstanceHealthStatus.Unhealthy)
            {
                // Already-Unhealthy connections are excluded so the transition is logged once.
                _routingState.UpdateConnection(connection.ConnectionId, c =>
                    c.Status = InstanceHealthStatus.Unhealthy);

                _logger.LogWarning(
                    "Instance {InstanceId} ({ServiceName}/{Version}) marked Unhealthy: no heartbeat for {Age:g}",
                    connection.Instance.InstanceId,
                    connection.Instance.ServiceName,
                    connection.Instance.Version,
                    age);

                staleCount++;
            }
            else if (age > degradedThreshold && connection.Status == InstanceHealthStatus.Healthy)
            {
                // Only Healthy connections are moved to Degraded; other statuses are left as they are.
                _routingState.UpdateConnection(connection.ConnectionId, c =>
                    c.Status = InstanceHealthStatus.Degraded);

                _logger.LogWarning(
                    "Instance {InstanceId} ({ServiceName}/{Version}) marked Degraded: delayed heartbeat ({Age:g})",
                    connection.Instance.InstanceId,
                    connection.Instance.ServiceName,
                    connection.Instance.Version,
                    age);

                degradedCount++;
            }
        }

        // Summarize only when this pass actually changed something.
        if (staleCount > 0 || degradedCount > 0)
        {
            _logger.LogDebug(
                "Health check completed: {StaleCount} stale, {DegradedCount} degraded",
                staleCount,
                degradedCount);
        }
    }
}
|