Files
git.stella-ops.org/docs/router/23-Step.md
master 75f6942769
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Add integration tests for migration categories and execution
- Implemented MigrationCategoryTests to validate migration categorization for startup, release, seed, and data migrations.
- Added tests for edge cases, including null, empty, and whitespace migration names.
- Created StartupMigrationHostTests to verify the behavior of the migration host with real PostgreSQL instances using Testcontainers.
- Included tests for migration execution, schema creation, and handling of pending release migrations.
- Added SQL migration files for testing: creating a test table, adding a column, a release migration, and seeding data.
2025-12-04 19:10:54 +02:00

21 KiB

Step 23: Metrics & Health Checks

Phase 6: Observability & Resilience · Estimated Complexity: Medium · Dependencies: Step 22 (Logging & Tracing)


Overview

Metrics and health checks provide operational visibility into the router and microservices. Prometheus-compatible metrics expose request rates, latencies, error rates, and connection pool status. Health checks enable load balancers and orchestrators to route traffic appropriately.


Goals

  1. Expose Prometheus-compatible metrics
  2. Track request/response metrics per endpoint
  3. Monitor transport layer health
  4. Provide liveness and readiness probes
  5. Support custom health check integrations

Metrics Configuration

namespace StellaOps.Router.Common;

public class MetricsConfig
{
    /// <summary>
    /// Master switch for metrics collection. Defaults to <c>true</c>.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Route on which the metrics endpoint is exposed. Defaults to <c>/metrics</c>.
    /// </summary>
    public string Path { get; set; } = "/metrics";

    /// <summary>
    /// Bucket boundaries for the request-duration histogram.
    /// </summary>
    public double[] DurationBuckets { get; set; } = new double[]
    {
        0.001, 0.005, 0.01, 0.025, 0.05, 0.075,
        0.1, 0.25, 0.5, 0.75,
        1.0, 2.5, 5.0, 10.0
    };

    /// <summary>
    /// Names of the labels that metrics should carry.
    /// </summary>
    public HashSet<string> IncludeLabels { get; set; } = new HashSet<string>
    {
        "method",
        "path",
        "status_code",
        "service"
    };

    /// <summary>
    /// Opt-in flag for attaching the request path as a label. Off by default
    /// because per-path labels can cause high cardinality.
    /// </summary>
    public bool IncludePathLabel { get; set; } = false;

    /// <summary>
    /// Upper bound on the number of unique path labels before paths are aggregated.
    /// Defaults to 100.
    /// </summary>
    public int MaxPathCardinality { get; set; } = 100;
}

Core Metrics

namespace StellaOps.Router.Common;

/// <summary>
/// Central metrics registry for Stella Router.
/// </summary>
/// <remarks>
/// All instruments are created from a single <see cref="System.Diagnostics.Metrics.Meter"/>
/// named <c>StellaOps.Router</c> (version <c>1.0.0</c>).
/// </remarks>
public sealed class StellaMetrics
{
    // IMPORTANT: static field initializers run in textual order. The meter must be
    // declared before every instrument that is created from it; otherwise the
    // instrument initializers dereference a null meter and the type initializer throws.
    private static readonly Meter Meter = new("StellaOps.Router", "1.0.0");

    // Request metrics

    /// <summary>Counter of processed requests.</summary>
    public static readonly Counter<long> RequestsTotal = Meter.CreateCounter<long>(
        "stella_requests_total",
        description: "Total number of requests processed");

    /// <summary>Histogram of request processing durations, in seconds.</summary>
    public static readonly Histogram<double> RequestDuration = Meter.CreateHistogram<double>(
        "stella_request_duration_seconds",
        unit: "s",
        description: "Request processing duration in seconds");

    /// <summary>Counter of request errors.</summary>
    public static readonly Counter<long> RequestErrors = Meter.CreateCounter<long>(
        "stella_request_errors_total",
        description: "Total number of request errors");

    // Transport metrics

    /// <summary>Up/down counter of currently active transport connections.</summary>
    public static readonly UpDownCounter<int> ActiveConnections = Meter.CreateUpDownCounter<int>(
        "stella_active_connections",
        description: "Number of active transport connections");

    /// <summary>Counter of transport connections opened.</summary>
    public static readonly Counter<long> ConnectionsTotal = Meter.CreateCounter<long>(
        "stella_connections_total",
        description: "Total number of transport connections");

    /// <summary>Counter of frames sent.</summary>
    public static readonly Counter<long> FramesSent = Meter.CreateCounter<long>(
        "stella_frames_sent_total",
        description: "Total number of frames sent");

    /// <summary>Counter of frames received.</summary>
    public static readonly Counter<long> FramesReceived = Meter.CreateCounter<long>(
        "stella_frames_received_total",
        description: "Total number of frames received");

    /// <summary>Counter of bytes sent.</summary>
    public static readonly Counter<long> BytesSent = Meter.CreateCounter<long>(
        "stella_bytes_sent_total",
        unit: "By",
        description: "Total bytes sent");

    /// <summary>Counter of bytes received.</summary>
    public static readonly Counter<long> BytesReceived = Meter.CreateCounter<long>(
        "stella_bytes_received_total",
        unit: "By",
        description: "Total bytes received");

    // Rate limiting metrics

    /// <summary>Counter of requests that hit a rate limit.</summary>
    public static readonly Counter<long> RateLimitHits = Meter.CreateCounter<long>(
        "stella_rate_limit_hits_total",
        description: "Number of requests that hit rate limits");

    /// <summary>Gauge of active rate limit buckets.</summary>
    public static readonly Gauge<int> RateLimitBuckets = Meter.CreateGauge<int>(
        "stella_rate_limit_buckets",
        description: "Number of active rate limit buckets");

    // Auth metrics

    /// <summary>Counter of successful authentications.</summary>
    public static readonly Counter<long> AuthSuccesses = Meter.CreateCounter<long>(
        "stella_auth_success_total",
        description: "Number of successful authentications");

    /// <summary>Counter of failed authentications.</summary>
    public static readonly Counter<long> AuthFailures = Meter.CreateCounter<long>(
        "stella_auth_failures_total",
        description: "Number of failed authentications");

    // Circuit breaker metrics

    /// <summary>Gauge of circuit breaker state (0=closed, 1=half-open, 2=open).</summary>
    public static readonly Gauge<int> CircuitBreakerState = Meter.CreateGauge<int>(
        "stella_circuit_breaker_state",
        description: "Circuit breaker state (0=closed, 1=half-open, 2=open)");
}

Request Metrics Middleware

namespace StellaOps.Router.Gateway;

/// <summary>
/// Middleware that records request count, duration and error metrics for every
/// request passing through the pipeline.
/// </summary>
/// <remarks>
/// Each measurement is tagged with <c>method</c> and <c>status_code</c>, plus
/// <c>path</c> when <see cref="MetricsConfig.IncludePathLabel"/> is enabled.
/// </remarks>
public sealed class MetricsMiddleware
{
    // Status recorded when the downstream pipeline throws before a response has started.
    private const int UnhandledExceptionStatusCode = 500;

    private readonly RequestDelegate _next;
    private readonly MetricsConfig _config;
    private readonly PathNormalizer _pathNormalizer;

    /// <summary>
    /// Creates the middleware.
    /// </summary>
    /// <param name="next">Next delegate in the request pipeline.</param>
    /// <param name="config">Metrics options; the value is read once at construction.</param>
    public MetricsMiddleware(
        RequestDelegate next,
        IOptions<MetricsConfig> config)
    {
        _next = next;
        _config = config.Value;
        _pathNormalizer = new PathNormalizer(_config.MaxPathCardinality);
    }

    /// <summary>
    /// Invokes the rest of the pipeline and records metrics for the request.
    /// </summary>
    /// <param name="context">Current HTTP context.</param>
    /// <returns>A task that completes when the downstream pipeline has completed.</returns>
    public async Task InvokeAsync(HttpContext context)
    {
        if (!_config.Enabled)
        {
            await _next(context);
            return;
        }

        var sw = Stopwatch.StartNew();
        var method = context.Request.Method;

        // Only normalize the path when it will actually be emitted as a tag.
        var path = _config.IncludePathLabel
            ? _pathNormalizer.Normalize(context.Request.Path)
            : null;

        var faulted = false;

        try
        {
            await _next(context);
        }
        catch
        {
            // Remember the failure so the finally block does not report the
            // untouched default status (200) for a request that blew up.
            faulted = true;
            throw;
        }
        finally
        {
            sw.Stop();

            // If the response already started, its status has been sent to the
            // client and remains the accurate value even though an exception followed.
            var statusCode = faulted && !context.Response.HasStarted
                ? UnhandledExceptionStatusCode
                : context.Response.StatusCode;

            var tags = new TagList
            {
                { "method", method },
                { "status_code", statusCode.ToString() }
            };

            if (path is not null)
            {
                tags.Add("path", path);
            }

            StellaMetrics.RequestsTotal.Add(1, tags);
            StellaMetrics.RequestDuration.Record(sw.Elapsed.TotalSeconds, tags);

            if (statusCode >= 400)
            {
                StellaMetrics.RequestErrors.Add(1, tags);
            }
        }
    }
}

/// <summary>
/// Normalizes request paths into a bounded set of templates to prevent
/// high-cardinality metric labels.
/// </summary>
/// <remarks>
/// Segments that look like identifiers (GUIDs, integers, or anything longer than
/// 20 characters) are replaced with <c>{id}</c>. The cardinality limit applies to
/// distinct <em>normalized</em> templates, so <c>/users/1</c> and <c>/users/2</c>
/// share a single slot. Once the limit is reached, templates that were not seen
/// before are reported as <c>other</c>; templates already admitted keep resolving
/// to themselves.
/// </remarks>
internal sealed class PathNormalizer
{
    private const string IdPlaceholder = "{id}";
    private const string OverflowLabel = "other";
    private const int MaxLiteralSegmentLength = 20;

    private readonly int _maxCardinality;

    // Set of admitted templates; the value is unused. Reads are lock-free.
    private readonly ConcurrentDictionary<string, byte> _templates = new(StringComparer.Ordinal);

    // Serializes admission of new templates so the limit cannot be overshot.
    private readonly object _gate = new();

    // Number of admitted templates; written only while holding _gate.
    private int _templateCount;

    /// <summary>
    /// Creates a normalizer.
    /// </summary>
    /// <param name="maxCardinality">
    /// Maximum number of distinct normalized templates to emit. Zero or a negative
    /// value causes every path to be reported as <c>other</c>.
    /// </param>
    public PathNormalizer(int maxCardinality)
    {
        _maxCardinality = maxCardinality;
    }

    /// <summary>
    /// Returns the label to use for <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Raw request path; <c>null</c> is treated as an empty path.</param>
    /// <returns>
    /// The normalized template, or <c>other</c> when the template is new and the
    /// cardinality limit has already been reached.
    /// </returns>
    public string Normalize(string path)
    {
        var template = ToTemplate(path ?? string.Empty);

        // Fast path: template already admitted.
        if (_templates.ContainsKey(template))
        {
            return template;
        }

        // Fast rejection once saturated, without taking the lock.
        if (Volatile.Read(ref _templateCount) >= _maxCardinality)
        {
            return OverflowLabel;
        }

        lock (_gate)
        {
            // Re-check under the lock: another thread may have admitted it meanwhile.
            if (_templates.ContainsKey(template))
            {
                return template;
            }

            if (_templateCount >= _maxCardinality)
            {
                return OverflowLabel;
            }

            _templates[template] = 0;
            Volatile.Write(ref _templateCount, _templateCount + 1);
            return template;
        }
    }

    // Replaces identifier-like segments with the placeholder and re-joins the path.
    private static string ToTemplate(string path)
    {
        var segments = path.Split('/');
        for (var i = 0; i < segments.Length; i++)
        {
            if (LooksLikeIdentifier(segments[i]))
            {
                segments[i] = IdPlaceholder;
            }
        }

        return string.Join("/", segments);
    }

    // A segment is identifier-like when it is overly long, a GUID, or an integer.
    // Parsing is culture-invariant and uses long so ids beyond the int range are covered.
    private static bool LooksLikeIdentifier(string segment)
    {
        return segment.Length > MaxLiteralSegmentLength
            || Guid.TryParse(segment, out _)
            || long.TryParse(
                segment,
                System.Globalization.NumberStyles.Integer,
                System.Globalization.CultureInfo.InvariantCulture,
                out _);
    }
}

Transport Metrics

namespace StellaOps.Router.Transport;

/// <summary>
/// Records transport-layer events (connections and frames) on the shared
/// <c>StellaMetrics</c> instruments.
/// </summary>
public sealed class TransportMetricsCollector
{
    /// <summary>
    /// Counts a newly opened connection and raises the active-connection counter.
    /// </summary>
    /// <param name="transport">Value for the <c>transport</c> tag.</param>
    /// <param name="serviceName">Value for the <c>service</c> tag.</param>
    public void RecordConnectionOpened(string transport, string serviceName)
    {
        var connectionTags = BuildConnectionTags(transport, serviceName);

        StellaMetrics.ConnectionsTotal.Add(1, connectionTags);
        StellaMetrics.ActiveConnections.Add(1, connectionTags);
    }

    /// <summary>
    /// Lowers the active-connection counter for a closed connection.
    /// </summary>
    /// <param name="transport">Value for the <c>transport</c> tag.</param>
    /// <param name="serviceName">Value for the <c>service</c> tag.</param>
    public void RecordConnectionClosed(string transport, string serviceName)
    {
        StellaMetrics.ActiveConnections.Add(-1, BuildConnectionTags(transport, serviceName));
    }

    /// <summary>
    /// Counts one sent frame and adds its size to the sent-bytes counter.
    /// </summary>
    /// <param name="transport">Value for the <c>transport</c> tag.</param>
    /// <param name="type">Frame type; its string form becomes the <c>frame_type</c> tag.</param>
    /// <param name="bytes">Number of bytes to add to the sent-bytes counter.</param>
    public void RecordFrameSent(string transport, FrameType type, int bytes)
    {
        var frameTags = BuildFrameTags(transport, type);

        StellaMetrics.FramesSent.Add(1, frameTags);
        // Byte counters are tagged by transport only, not by frame type.
        StellaMetrics.BytesSent.Add(bytes, BuildTransportTags(transport));
    }

    /// <summary>
    /// Counts one received frame and adds its size to the received-bytes counter.
    /// </summary>
    /// <param name="transport">Value for the <c>transport</c> tag.</param>
    /// <param name="type">Frame type; its string form becomes the <c>frame_type</c> tag.</param>
    /// <param name="bytes">Number of bytes to add to the received-bytes counter.</param>
    public void RecordFrameReceived(string transport, FrameType type, int bytes)
    {
        var frameTags = BuildFrameTags(transport, type);

        StellaMetrics.FramesReceived.Add(1, frameTags);
        // Byte counters are tagged by transport only, not by frame type.
        StellaMetrics.BytesReceived.Add(bytes, BuildTransportTags(transport));
    }

    // Tags shared by the connection instruments: transport + service.
    private static TagList BuildConnectionTags(string transport, string serviceName)
    {
        var tags = new TagList();
        tags.Add("transport", transport);
        tags.Add("service", serviceName);
        return tags;
    }

    // Tags shared by the frame counters: transport + frame_type.
    private static TagList BuildFrameTags(string transport, FrameType type)
    {
        var tags = new TagList();
        tags.Add("transport", transport);
        tags.Add("frame_type", type.ToString());
        return tags;
    }

    // Tags for the byte counters: transport only.
    private static TagList BuildTransportTags(string transport)
    {
        var tags = new TagList();
        tags.Add("transport", transport);
        return tags;
    }
}

Health Check System

namespace StellaOps.Router.Common;

/// <summary>
/// Outcome of a single health check.
/// </summary>
/// <remarks>
/// Declared as a record so that callers can produce adjusted copies with a
/// <c>with</c> expression (for example, to stamp the measured duration).
/// </remarks>
public sealed record HealthCheckResult
{
    /// <summary>Status reported by the check.</summary>
    public HealthStatus Status { get; init; }

    /// <summary>Optional human-readable description of the outcome.</summary>
    public string? Description { get; init; }

    /// <summary>Time taken by the check.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Optional additional key/value data attached by the check.</summary>
    public IReadOnlyDictionary<string, object>? Data { get; init; }

    /// <summary>Exception associated with the outcome, if any.</summary>
    public Exception? Exception { get; init; }
}

/// <summary>
/// Health states, ordered from best to worst. The numeric order is significant:
/// a larger value denotes a worse state, which allows statuses to be compared
/// with relational operators when aggregating.
/// </summary>
public enum HealthStatus
{
    /// <summary>Best state.</summary>
    Healthy = 0,

    /// <summary>Intermediate state, worse than <see cref="Healthy"/>.</summary>
    Degraded = 1,

    /// <summary>Worst state.</summary>
    Unhealthy = 2
}

/// <summary>
/// Contract for a single named health check.
/// </summary>
public interface IHealthCheck
{
    /// <summary>Name that identifies this check.</summary>
    string Name { get; }

    /// <summary>
    /// Runs the check.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the check.</param>
    /// <returns>A task producing the result of the check.</returns>
    Task<HealthCheckResult> CheckAsync(CancellationToken cancellationToken);
}

/// <summary>
/// Runs all registered <see cref="IHealthCheck"/> instances and aggregates their
/// results into a single <see cref="HealthReport"/>.
/// </summary>
public sealed class HealthCheckService
{
    private readonly IEnumerable<IHealthCheck> _checks;
    private readonly ILogger<HealthCheckService> _logger;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="checks">Health checks to run.</param>
    /// <param name="logger">Logger used to report checks that throw.</param>
    public HealthCheckService(
        IEnumerable<IHealthCheck> checks,
        ILogger<HealthCheckService> logger)
    {
        _checks = checks;
        _logger = logger;
    }

    /// <summary>
    /// Runs every check in sequence and builds the aggregated report.
    /// </summary>
    /// <param name="cancellationToken">Token passed to each check.</param>
    /// <returns>
    /// A report whose status is the worst status of any check, whose entries are
    /// keyed by check name, and whose total duration is the sum of the individual
    /// check durations in milliseconds.
    /// </returns>
    /// <remarks>
    /// A check that throws is logged as a warning and recorded as
    /// <see cref="HealthStatus.Unhealthy"/>; the exception does not escape.
    /// </remarks>
    public async Task<HealthReport> CheckHealthAsync(CancellationToken cancellationToken)
    {
        var results = new Dictionary<string, HealthCheckResult>();
        var overallStatus = HealthStatus.Healthy;

        foreach (var check in _checks)
        {
            var sw = Stopwatch.StartNew();

            try
            {
                var outcome = await check.CheckAsync(cancellationToken);

                // Copy explicitly instead of using a `with` expression so this works
                // whether HealthCheckResult is a class or a record. The duration is
                // always the one measured here, not whatever the check reported.
                var timed = new HealthCheckResult
                {
                    Status = outcome.Status,
                    Description = outcome.Description,
                    Duration = sw.Elapsed,
                    Data = outcome.Data,
                    Exception = outcome.Exception
                };
                results[check.Name] = timed;

                // HealthStatus is ordered best-to-worst, so "greater" means "worse".
                if (timed.Status > overallStatus)
                {
                    overallStatus = timed.Status;
                }
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Health check {Name} failed", check.Name);
                results[check.Name] = new HealthCheckResult
                {
                    Status = HealthStatus.Unhealthy,
                    Description = ex.Message,
                    Duration = sw.Elapsed,
                    Exception = ex
                };
                overallStatus = HealthStatus.Unhealthy;
            }
        }

        return new HealthReport
        {
            Status = overallStatus,
            Checks = results,
            TotalDuration = results.Values.Sum(r => r.Duration.TotalMilliseconds)
        };
    }
}

/// <summary>
/// Aggregated result of running a set of health checks.
/// </summary>
public sealed class HealthReport
{
    /// <summary>Overall status of the report.</summary>
    public HealthStatus Status { get; init; }

    /// <summary>Individual check results keyed by check name; empty by default.</summary>
    public IReadOnlyDictionary<string, HealthCheckResult> Checks { get; init; } = new Dictionary<string, HealthCheckResult>();

    /// <summary>Total duration in milliseconds (HealthCheckService sets it to the sum of the per-check durations).</summary>
    public double TotalDuration { get; init; }
}

Built-in Health Checks

namespace StellaOps.Router.Gateway;

/// <summary>
/// Health check that requires at least one transport connection in the
/// <c>Connected</c> state.
/// </summary>
public sealed class TransportHealthCheck : IHealthCheck
{
    private readonly IGlobalRoutingState _routingState;

    /// <summary>
    /// Creates the check.
    /// </summary>
    /// <param name="routingState">Routing state that supplies the connection list.</param>
    public TransportHealthCheck(IGlobalRoutingState routingState)
    {
        _routingState = routingState;
    }

    /// <inheritdoc />
    public string Name => "transport";

    /// <summary>
    /// Counts connected transports and reports unhealthy when there are none.
    /// </summary>
    /// <param name="cancellationToken">Not used; the check completes synchronously.</param>
    /// <returns>
    /// A completed task with a healthy result when at least one connection is
    /// connected, otherwise an unhealthy result. The count is exposed under the
    /// <c>connections</c> data key.
    /// </returns>
    public Task<HealthCheckResult> CheckAsync(CancellationToken cancellationToken)
    {
        var connected = 0;
        foreach (var connection in _routingState.GetAllConnections())
        {
            if (connection.State == ConnectionState.Connected)
            {
                connected++;
            }
        }

        var data = new Dictionary<string, object> { ["connections"] = connected };

        var result = connected == 0
            ? new HealthCheckResult
            {
                Status = HealthStatus.Unhealthy,
                Description = "No active transport connections",
                Data = data
            }
            : new HealthCheckResult
            {
                Status = HealthStatus.Healthy,
                Description = $"{connected} active connections",
                Data = data
            };

        return Task.FromResult(result);
    }
}

/// <summary>
/// Health check that probes the Authority service, bounded by a configured timeout.
/// </summary>
public sealed class AuthorityHealthCheck : IHealthCheck
{
    private readonly IAuthorityClient _authority;
    private readonly TimeSpan _timeout;

    /// <summary>
    /// Creates the check.
    /// </summary>
    /// <param name="authority">Client used to probe the Authority service.</param>
    /// <param name="config">Options supplying <c>HealthCheckTimeout</c>.</param>
    public AuthorityHealthCheck(
        IAuthorityClient authority,
        IOptions<AuthorityConfig> config)
    {
        _authority = authority;
        _timeout = config.Value.HealthCheckTimeout;
    }

    /// <inheritdoc />
    public string Name => "authority";

    /// <summary>
    /// Probes the Authority service with a token that is cancelled either by the
    /// caller or once the configured timeout elapses.
    /// </summary>
    /// <param name="cancellationToken">Caller's cancellation token.</param>
    /// <returns>
    /// Healthy when the probe reports healthy; degraded when it reports unhealthy
    /// or when any exception occurs during the probe.
    /// </returns>
    public async Task<HealthCheckResult> CheckAsync(CancellationToken cancellationToken)
    {
        bool responsive;

        try
        {
            using var timeoutSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            timeoutSource.CancelAfter(_timeout);

            responsive = await _authority.CheckHealthAsync(timeoutSource.Token);
        }
        catch (Exception failure)
        {
            // Degraded rather than unhealthy: the gateway can still work without Authority.
            return new HealthCheckResult
            {
                Status = HealthStatus.Degraded,
                Description = $"Authority unreachable: {failure.Message}",
                Exception = failure
            };
        }

        if (responsive)
        {
            return new HealthCheckResult
            {
                Status = HealthStatus.Healthy,
                Description = "Authority is responsive"
            };
        }

        return new HealthCheckResult
        {
            Status = HealthStatus.Degraded,
            Description = "Authority returned unhealthy"
        };
    }
}

/// <summary>
/// Health check that verifies the rate limiter responds to a probe request.
/// </summary>
public sealed class RateLimiterHealthCheck : IHealthCheck
{
    private readonly IRateLimiter _rateLimiter;

    /// <summary>
    /// Creates the check.
    /// </summary>
    /// <param name="rateLimiter">Rate limiter to probe.</param>
    public RateLimiterHealthCheck(IRateLimiter rateLimiter)
    {
        _rateLimiter = rateLimiter;
    }

    /// <inheritdoc />
    public string Name => "rate_limiter";

    /// <summary>
    /// Issues a limit check for a dedicated probe key and reports whether the call
    /// completed without throwing. The outcome of the limit check itself is ignored.
    /// </summary>
    /// <param name="cancellationToken">Token passed to the rate limiter call.</param>
    /// <returns>
    /// Healthy when the call completes; degraded when it throws.
    /// </returns>
    public async Task<HealthCheckResult> CheckAsync(CancellationToken cancellationToken)
    {
        try
        {
            // A simple operation is enough; only completion matters.
            var probe = new RateLimitContext
            {
                Key = "__health_check__",
                Tier = RateLimitTier.Free
            };

            await _rateLimiter.CheckLimitAsync(probe, cancellationToken);
        }
        catch (Exception failure)
        {
            return new HealthCheckResult
            {
                Status = HealthStatus.Degraded,
                Description = $"Rate limiter error: {failure.Message}",
                Exception = failure
            };
        }

        return new HealthCheckResult
        {
            Status = HealthStatus.Healthy,
            Description = "Rate limiter is responsive"
        };
    }
}

Health Endpoints

namespace StellaOps.Router.Gateway;

/// <summary>
/// Maps the liveness, readiness and detailed health endpoints.
/// </summary>
public static class HealthEndpoints
{
    /// <summary>
    /// Registers <c>{basePath}/live</c>, <c>{basePath}/ready</c> and
    /// <c>{basePath}</c> as GET endpoints.
    /// </summary>
    /// <param name="endpoints">Route builder to add the endpoints to.</param>
    /// <param name="basePath">Base path of the health endpoints; defaults to <c>/health</c>.</param>
    /// <returns>The same <paramref name="endpoints"/> instance, for chaining.</returns>
    public static IEndpointRouteBuilder MapHealthEndpoints(
        this IEndpointRouteBuilder endpoints,
        string basePath = "/health")
    {
        endpoints.MapGet(basePath + "/live", LivenessCheck);
        endpoints.MapGet(basePath + "/ready", ReadinessCheck);
        endpoints.MapGet(basePath, DetailedHealthCheck);

        return endpoints;
    }

    /// <summary>
    /// Liveness probe - is the process running? Runs no health checks.
    /// </summary>
    private static IResult LivenessCheck()
    {
        return Results.Ok(new { status = "alive" });
    }

    /// <summary>
    /// Readiness probe - can the service accept traffic?
    /// Responds 503 with per-check statuses only when the overall status is
    /// unhealthy; healthy and degraded both respond 200.
    /// </summary>
    private static async Task<IResult> ReadinessCheck(
        HealthCheckService healthService,
        CancellationToken cancellationToken)
    {
        var report = await healthService.CheckHealthAsync(cancellationToken);

        return report.Status == HealthStatus.Unhealthy
            ? Results.Json(new
            {
                status = "not_ready",
                checks = report.Checks.ToDictionary(c => c.Key, c => c.Value.Status.ToString())
            }, statusCode: 503)
            : Results.Ok(new { status = "ready" });
    }

    /// <summary>
    /// Detailed health report with status, description, duration and data per check.
    /// </summary>
    private static async Task<IResult> DetailedHealthCheck(
        HealthCheckService healthService,
        CancellationToken cancellationToken)
    {
        var report = await healthService.CheckHealthAsync(cancellationToken);

        var response = new
        {
            status = ToLabel(report.Status),
            totalDuration = FormatMilliseconds(report.TotalDuration),
            checks = report.Checks.ToDictionary(c => c.Key, c => new
            {
                status = ToLabel(c.Value.Status),
                description = c.Value.Description,
                duration = FormatMilliseconds(c.Value.Duration.TotalMilliseconds),
                data = c.Value.Data
            })
        };

        var statusCode = report.Status switch
        {
            HealthStatus.Healthy => 200,
            HealthStatus.Degraded => 200, // Still return 200 for degraded
            HealthStatus.Unhealthy => 503,
            _ => 200
        };

        return Results.Json(response, statusCode: statusCode);
    }

    // Lower-cases the status name without consulting the current culture, so the
    // payload is identical regardless of the server's locale.
    private static string ToLabel(HealthStatus status)
    {
        return status.ToString().ToLowerInvariant();
    }

    // Formats a millisecond value with two decimals and a '.' decimal separator
    // regardless of the server's locale (the payload is machine-readable).
    private static string FormatMilliseconds(double milliseconds)
    {
        return milliseconds.ToString("F2", System.Globalization.CultureInfo.InvariantCulture) + "ms";
    }
}

Prometheus Metrics Endpoint

namespace StellaOps.Router.Gateway;

/// <summary>
/// Maps the endpoint that serves metrics in the Prometheus text format.
/// </summary>
public sealed class PrometheusMetricsEndpoint
{
    /// <summary>
    /// Registers a GET endpoint that writes the exporter's output to the response.
    /// </summary>
    /// <param name="endpoints">Route builder to add the endpoint to.</param>
    /// <param name="path">Route of the metrics endpoint; defaults to <c>/metrics</c>.</param>
    public static void Map(IEndpointRouteBuilder endpoints, string path = "/metrics")
    {
        endpoints.MapGet(path, async (HttpContext httpContext) =>
        {
            // Resolve the exporter per request from the request's service provider.
            var payload = await httpContext.RequestServices
                .GetRequiredService<PrometheusExporter>()
                .ExportAsync();

            var response = httpContext.Response;
            response.ContentType = "text/plain; version=0.0.4";
            await response.WriteAsync(payload);
        });
    }
}

/// <summary>
/// Simplified placeholder exporter. As written it produces an empty payload;
/// a real implementation would delegate to OpenTelemetry.Exporter.Prometheus.
/// </summary>
public sealed class PrometheusExporter
{
    private readonly MeterProvider _meterProvider;

    /// <summary>
    /// Creates the exporter.
    /// </summary>
    /// <param name="meterProvider">Meter provider; stored but not yet read by this placeholder.</param>
    public PrometheusExporter(MeterProvider meterProvider) => _meterProvider = meterProvider;

    /// <summary>
    /// Produces the metrics payload.
    /// </summary>
    /// <returns>A completed task holding the payload, which is currently an empty string.</returns>
    public Task<string> ExportAsync()
    {
        // Simplified example: the real implementation uses OpenTelemetry's
        // Prometheus exporter and would iterate over all registered metrics,
        // appending their text representation to this buffer.
        var output = new StringBuilder();

        var payload = output.ToString();
        return Task.FromResult(payload);
    }
}

Service Registration

namespace StellaOps.Router.Gateway;

/// <summary>
/// Dependency-injection helpers for metrics and health checks.
/// </summary>
public static class MetricsExtensions
{
    /// <summary>
    /// Binds <see cref="MetricsConfig"/> to the <c>Metrics</c> configuration section
    /// and registers OpenTelemetry metrics for the <c>StellaOps.Router</c> meter,
    /// ASP.NET Core instrumentation and the Prometheus exporter.
    /// </summary>
    /// <param name="services">Service collection to configure.</param>
    /// <param name="configuration">Configuration root containing the <c>Metrics</c> section.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStellaMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var metricsSection = configuration.GetSection("Metrics");
        services.Configure<MetricsConfig>(metricsSection);

        services
            .AddOpenTelemetry()
            .WithMetrics(metrics => metrics
                .AddMeter("StellaOps.Router")
                .AddAspNetCoreInstrumentation()
                .AddPrometheusExporter());

        return services;
    }

    /// <summary>
    /// Registers <see cref="HealthCheckService"/> and the built-in transport,
    /// authority and rate limiter health checks as singletons.
    /// </summary>
    /// <param name="services">Service collection to configure.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStellaHealthChecks(
        this IServiceCollection services)
    {
        return services
            .AddSingleton<HealthCheckService>()
            .AddSingleton<IHealthCheck, TransportHealthCheck>()
            .AddSingleton<IHealthCheck, AuthorityHealthCheck>()
            .AddSingleton<IHealthCheck, RateLimiterHealthCheck>();
    }
}

YAML Configuration

# Bound to MetricsConfig via configuration.GetSection("Metrics").
Metrics:
  Enabled: true
  Path: "/metrics"
  # Off by default: per-path labels may cause high cardinality.
  IncludePathLabel: false
  # Maximum unique path labels before aggregating.
  MaxPathCardinality: 100
  # Histogram buckets for request duration; the duration histogram is recorded in seconds.
  # NOTE(review): this list omits 0.001, 0.075 and 0.75 from the MetricsConfig defaults.
  # MetricsConfig.DurationBuckets has a non-empty default array; confirm how the
  # configuration binder combines the defaults with the values listed here.
  DurationBuckets:
    - 0.005
    - 0.01
    - 0.025
    - 0.05
    - 0.1
    - 0.25
    - 0.5
    - 1
    - 2.5
    - 5
    - 10

# NOTE(review): no options class or binding for this section appears in this
# document (MapHealthEndpoints takes its base path as a parameter) - confirm
# where Enabled, Path and CacheDuration are consumed.
HealthChecks:
  Enabled: true
  Path: "/health"
  CacheDuration: "00:00:05"

Deliverables

  1. StellaOps.Router.Common/StellaMetrics.cs
  2. StellaOps.Router.Gateway/MetricsMiddleware.cs
  3. StellaOps.Router.Transport/TransportMetricsCollector.cs
  4. StellaOps.Router.Common/HealthCheckService.cs
  5. StellaOps.Router.Gateway/TransportHealthCheck.cs
  6. StellaOps.Router.Gateway/AuthorityHealthCheck.cs
  7. StellaOps.Router.Gateway/HealthEndpoints.cs
  8. StellaOps.Router.Gateway/PrometheusMetricsEndpoint.cs
  9. Metrics collection tests
  10. Health check tests

Next Step

Proceed to Step 24: Circuit Breaker & Retry Policies to implement resilience patterns.