feat(rate-limiting): Implement core rate limiting functionality with configuration, decision-making, metrics, middleware, and service registration

- Add RateLimitConfig for configuration management with YAML binding support.
- Introduce RateLimitDecision to encapsulate the result of rate limit checks.
- Implement RateLimitMetrics for OpenTelemetry metrics tracking.
- Create RateLimitMiddleware for enforcing rate limits on incoming requests.
- Develop RateLimitService to orchestrate instance and environment rate limit checks.
- Add RateLimitServiceCollectionExtensions for dependency injection registration.
This commit is contained in:
master
2025-12-17 18:02:37 +02:00
parent 394b57f6bf
commit 8bbfe4d2d2
211 changed files with 47179 additions and 1590 deletions

View File

@@ -0,0 +1,441 @@
// ───────────────────────────────────────────────────────────────────────────
// StellaOps Attestor — Distributed Verification Provider (Resilient, Multi-Node)
// SPDX-License-Identifier: AGPL-3.0-or-later
// ───────────────────────────────────────────────────────────────────────────
using System.Collections.Concurrent;
using System.Net.Http.Json;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Polly;
using Polly.CircuitBreaker;
using Polly.Retry;
using Polly.Timeout;
using StellaOps.Attestor.Verify.Configuration;
using StellaOps.Attestor.Verify.Models;
namespace StellaOps.Attestor.Verify.Providers;
/// <summary>
/// Verification provider that forwards each request to one of several remote verification nodes.
/// Nodes are tried in the order produced by a consistent-hash ring keyed on the request content,
/// each attempt runs through a timeout + retry pipeline, and nodes that keep failing are skipped
/// until a cooldown has elapsed.
/// </summary>
public class DistributedVerificationProvider : IVerificationProvider
{
    private readonly ILogger<DistributedVerificationProvider> _logger;
    private readonly DistributedVerificationOptions _options;
    private readonly HttpClient _httpClient;

    // Per-node failure bookkeeping, keyed by node id. A missing entry means "never failed".
    // The CircuitBreakerState instances are mutated only while holding a lock on the instance.
    private readonly ConcurrentDictionary<string, CircuitBreakerState> _circuitStates = new();
    private readonly ConsistentHashRing _hashRing;
    private readonly ResiliencePipeline<VerificationResult> _resiliencePipeline;

    /// <summary>
    /// Creates the provider, builds the hash ring over the configured nodes and the shared resilience pipeline.
    /// </summary>
    /// <param name="logger">Logger for routing, retry and circuit events.</param>
    /// <param name="options">Node list and resilience settings.</param>
    /// <param name="httpClient">Client used for all node calls.</param>
    /// <exception cref="ArgumentNullException">Any argument (or the options value) is null.</exception>
    /// <exception cref="ArgumentException">No verification node is configured.</exception>
    public DistributedVerificationProvider(
        ILogger<DistributedVerificationProvider> logger,
        IOptions<DistributedVerificationOptions> options,
        HttpClient httpClient)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));

        if (_options.Nodes == null || _options.Nodes.Count == 0)
        {
            throw new ArgumentException("At least one verification node must be configured");
        }

        _hashRing = new ConsistentHashRing(_options.Nodes, _options.VirtualNodeMultiplier);
        _resiliencePipeline = BuildResiliencePipeline();

        _logger.LogInformation("Initialized distributed verification provider with {NodeCount} nodes", _options.Nodes.Count);
    }

    /// <inheritdoc/>
    /// <remarks>
    /// Node-level failures (HTTP errors, timeouts, broken circuits) cause failover to the next node;
    /// when every node fails or is skipped, a result with <c>VerificationStatus.Error</c> is returned
    /// instead of throwing. Cancellation requested through <paramref name="cancellationToken"/> is
    /// propagated as <see cref="OperationCanceledException"/> and is not counted against any node.
    /// </remarks>
    public async Task<VerificationResult> VerifyAsync(
        VerificationRequest request,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        // Same content => same routing key => same node preference order.
        var routingKey = ComputeRoutingKey(request);
        var orderedNodes = _hashRing.GetOrderedNodes(routingKey);

        _logger.LogDebug(
            "Routing verification request {RequestId} with key {RoutingKey} through {NodeCount} nodes",
            request.RequestId,
            routingKey,
            orderedNodes.Count);

        // Try nodes in ring order until one succeeds.
        List<Exception> exceptions = [];
        foreach (var node in orderedNodes)
        {
            if (!IsNodeHealthy(node))
            {
                _logger.LogDebug("Skipping unhealthy node {NodeId}", node.Id);
                continue;
            }

            try
            {
                var result = await _resiliencePipeline.ExecuteAsync(
                    async ct => await ExecuteVerificationAsync(node, request, ct),
                    cancellationToken);

                // A success ends the node's failure streak so that only consecutive
                // failures can open its circuit.
                MarkNodeHealthy(node);

                _logger.LogInformation(
                    "Verification request {RequestId} completed on node {NodeId} with result {Status}",
                    request.RequestId,
                    node.Id,
                    result.Status);
                return result;
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // The caller gave up: this is not the node's fault and there is no point
                // in failing over to further nodes.
                throw;
            }
            catch (Exception ex) when (ex is HttpRequestException
                                           or TaskCanceledException
                                           or BrokenCircuitException
                                           or TimeoutRejectedException)
            {
                // TimeoutRejectedException is what the pipeline's timeout strategy raises;
                // without it in the filter a slow node would abort the whole request
                // instead of triggering failover.
                _logger.LogWarning(ex, "Node {NodeId} failed for request {RequestId}", node.Id, request.RequestId);
                exceptions.Add(ex);
                MarkNodeUnhealthy(node);
            }
        }

        // All nodes failed (or were skipped as unhealthy).
        _logger.LogError(
            "All {NodeCount} nodes failed for verification request {RequestId}",
            orderedNodes.Count,
            request.RequestId);

        return new VerificationResult
        {
            RequestId = request.RequestId,
            Status = VerificationStatus.Error,
            ErrorMessage = $"All verification nodes failed. {exceptions.Count} errors occurred.",
            Timestamp = DateTimeOffset.UtcNow,
        };
    }

    /// <inheritdoc/>
    /// <remarks>
    /// Probes <c>health</c> on every configured node in parallel with a 5 second budget per probe.
    /// A probe that throws for any reason is reported as unhealthy rather than failing the check.
    /// </remarks>
    public async Task<HealthCheckResult> CheckHealthAsync(CancellationToken cancellationToken = default)
    {
        var results = new ConcurrentDictionary<string, bool>();

        var tasks = _options.Nodes.Select(async node =>
        {
            try
            {
                using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                cts.CancelAfter(TimeSpan.FromSeconds(5));

                // Dispose the response so the underlying connection is released promptly.
                using var response = await _httpClient.GetAsync(
                    new Uri(node.Endpoint, "health"),
                    cts.Token);

                results[node.Id] = response.IsSuccessStatusCode;
            }
            catch (Exception ex)
            {
                // Best effort: any probe failure simply means "not healthy".
                _logger.LogDebug(ex, "Health probe failed for node {NodeId}", node.Id);
                results[node.Id] = false;
            }
        });

        await Task.WhenAll(tasks);

        var healthyCount = results.Count(r => r.Value);
        var totalCount = results.Count;

        return new HealthCheckResult
        {
            IsHealthy = healthyCount >= _options.MinHealthyNodes,
            HealthyNodeCount = healthyCount,
            TotalNodeCount = totalCount,
            NodeStatuses = results.ToDictionary(r => r.Key, r => r.Value),
            Timestamp = DateTimeOffset.UtcNow,
        };
    }

    /// <summary>
    /// Gets the current distribution statistics for monitoring.
    /// </summary>
    /// <returns>
    /// Node counts based on the local failure bookkeeping (no network calls are made) and a
    /// state label per node that has recorded at least one failure.
    /// </returns>
    public DistributionStats GetDistributionStats()
    {
        var totalNodes = _options.Nodes.Count;

        // IsNodeHealthy also clears state whose cooldown has elapsed, so evaluate it
        // before the state labels are rendered below.
        var healthyNodes = _options.Nodes.Count(IsNodeHealthy);

        return new DistributionStats
        {
            TotalNodes = totalNodes,
            HealthyNodes = healthyNodes,
            UnhealthyNodes = totalNodes - healthyNodes,
            VirtualNodesPerNode = _options.VirtualNodeMultiplier,
            CircuitBreakerStates = _circuitStates.ToDictionary(
                kvp => kvp.Key,
                kvp => DescribeCircuit(kvp.Value)),
        };
    }

    /// <summary>
    /// Posts the request to <c>api/v1/verify</c> on the given node and deserializes the result.
    /// </summary>
    /// <exception cref="HttpRequestException">The node answered with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">The node answered with a JSON <c>null</c> body.</exception>
    private async Task<VerificationResult> ExecuteVerificationAsync(
        VerificationNode node,
        VerificationRequest request,
        CancellationToken cancellationToken)
    {
        // NOTE(review): relative resolution against a base URI that has a path but no trailing
        // slash replaces the last path segment - confirm configured endpoints end with '/'.
        var endpoint = new Uri(node.Endpoint, "api/v1/verify");

        _logger.LogDebug(
            "Sending verification request {RequestId} to node {NodeId} at {Endpoint}",
            request.RequestId,
            node.Id,
            endpoint);

        using var response = await _httpClient.PostAsJsonAsync(endpoint, request, cancellationToken);
        response.EnsureSuccessStatusCode();

        var result = await response.Content.ReadFromJsonAsync<VerificationResult>(cancellationToken);
        return result ?? throw new InvalidOperationException("Received null response from verification node");
    }

    /// <summary>
    /// Builds the pipeline applied to every node call: a timeout of <c>RequestTimeout</c> added
    /// first, then exponential-backoff retries for HTTP and cancellation failures.
    /// </summary>
    private ResiliencePipeline<VerificationResult> BuildResiliencePipeline()
    {
        return new ResiliencePipelineBuilder<VerificationResult>()
            .AddTimeout(new TimeoutStrategyOptions
            {
                Timeout = _options.RequestTimeout,
                OnTimeout = args =>
                {
                    _logger.LogWarning("Request timed out after {Timeout}", args.Timeout);
                    return default;
                },
            })
            .AddRetry(new RetryStrategyOptions<VerificationResult>
            {
                MaxRetryAttempts = _options.MaxRetries,
                Delay = _options.RetryDelay,
                BackoffType = DelayBackoffType.Exponential,
                ShouldHandle = new PredicateBuilder<VerificationResult>()
                    .Handle<HttpRequestException>()
                    .Handle<TaskCanceledException>(),
                OnRetry = args =>
                {
                    _logger.LogWarning(
                        args.Outcome.Exception,
                        "Retry attempt {AttemptNumber} after delay {Delay}",
                        args.AttemptNumber,
                        args.RetryDelay);
                    return default;
                },
            })
            .Build();
    }

    /// <summary>
    /// Derives the routing key (upper-case hex SHA-256) from digest algorithm, digest and artifact URI,
    /// so identical content always yields the same node ordering.
    /// </summary>
    private static string ComputeRoutingKey(VerificationRequest request)
    {
        var keyMaterial = $"{request.DigestAlgorithm}:{request.Digest}:{request.ArtifactUri}";
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(keyMaterial));
        return Convert.ToHexString(hashBytes);
    }

    /// <summary>
    /// Returns whether the node may be tried. A node with no recorded failures is healthy; a node
    /// whose last failure is older than the cooldown is reset and treated as healthy again.
    /// </summary>
    private bool IsNodeHealthy(VerificationNode node)
    {
        if (!_circuitStates.TryGetValue(node.Id, out var state))
        {
            return true; // No circuit breaker state means healthy
        }

        // The state object is private to this class, so locking on it is safe and keeps
        // the count and timestamp consistent under concurrent requests.
        lock (state)
        {
            // Allow recovery after cooldown period
            if (state.LastFailure is { } lastFailure &&
                DateTimeOffset.UtcNow - lastFailure > _options.CircuitBreakerCooldown)
            {
                state.FailureCount = 0;
                state.LastFailure = null;
                return true;
            }

            return state.FailureCount < _options.CircuitBreakerThreshold;
        }
    }

    /// <summary>
    /// Clears the failure streak of a node after a successful call.
    /// </summary>
    private void MarkNodeHealthy(VerificationNode node)
    {
        if (!_circuitStates.TryGetValue(node.Id, out var state))
        {
            return;
        }

        lock (state)
        {
            state.FailureCount = 0;
            state.LastFailure = null;
        }
    }

    /// <summary>
    /// Records one more failure for the node and logs when the threshold is reached.
    /// </summary>
    private void MarkNodeUnhealthy(VerificationNode node)
    {
        var state = _circuitStates.GetOrAdd(node.Id, _ => new CircuitBreakerState());

        int failureCount;
        lock (state)
        {
            failureCount = ++state.FailureCount;
            state.LastFailure = DateTimeOffset.UtcNow;
        }

        if (failureCount >= _options.CircuitBreakerThreshold)
        {
            _logger.LogWarning(
                "Node {NodeId} circuit breaker opened after {FailureCount} failures",
                node.Id,
                failureCount);
        }
    }

    /// <summary>
    /// Renders the state label reported by <see cref="GetDistributionStats"/>, using the configured
    /// threshold rather than a fixed number.
    /// </summary>
    private string DescribeCircuit(CircuitBreakerState state)
    {
        int failureCount;
        lock (state)
        {
            failureCount = state.FailureCount;
        }

        return failureCount >= _options.CircuitBreakerThreshold
            ? "Open"
            : failureCount > 0 ? "HalfOpen" : "Closed";
    }

    /// <summary>
    /// Mutable failure bookkeeping for one node. Callers must hold a lock on the instance
    /// while reading or writing it.
    /// </summary>
    private sealed class CircuitBreakerState
    {
        /// <summary>Failures recorded since the last reset.</summary>
        public int FailureCount { get; set; }

        /// <summary>Time of the most recent failure, or null after a reset.</summary>
        public DateTimeOffset? LastFailure { get; set; }
    }
}
/// <summary>
/// Consistent-hash ring that turns a routing key into a failover-ordered list of nodes.
/// Every node is placed on the ring once per virtual node (position = hash of <c>"{Id}:{i}"</c>);
/// a lookup walks the ring clockwise from the key's position and returns each node the first
/// time it is encountered.
/// </summary>
internal sealed class ConsistentHashRing
{
    // Ring positions in ascending order; _sortedNodes[i] is the node at _sortedHashes[i].
    private readonly int[] _sortedHashes;
    private readonly VerificationNode[] _sortedNodes;

    // Number of distinct node ids on the ring. Lets a lookup stop as soon as every
    // physical node has been collected instead of scanning all virtual positions.
    private readonly int _distinctNodeCount;

    /// <summary>
    /// Builds the ring.
    /// </summary>
    /// <param name="nodes">Physical nodes to place on the ring.</param>
    /// <param name="virtualNodeMultiplier">Number of ring positions per physical node; must be at least 1.</param>
    /// <exception cref="ArgumentNullException"><paramref name="nodes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="virtualNodeMultiplier"/> is less than 1, which would produce an empty ring
    /// and silently route every request nowhere.
    /// </exception>
    public ConsistentHashRing(IReadOnlyList<VerificationNode> nodes, int virtualNodeMultiplier)
    {
        ArgumentNullException.ThrowIfNull(nodes);
        if (virtualNodeMultiplier < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(virtualNodeMultiplier),
                virtualNodeMultiplier,
                "At least one virtual node per physical node is required");
        }

        var ring = new SortedDictionary<int, VerificationNode>();
        foreach (var node in nodes)
        {
            for (var i = 0; i < virtualNodeMultiplier; i++)
            {
                // If two virtual keys land on the same position the later one wins.
                ring[ComputeHash($"{node.Id}:{i}")] = node;
            }
        }

        _sortedHashes = [.. ring.Keys];
        _sortedNodes = [.. ring.Values];

        var distinctIds = new HashSet<string>();
        foreach (var node in _sortedNodes)
        {
            distinctIds.Add(node.Id);
        }

        _distinctNodeCount = distinctIds.Count;
    }

    /// <summary>
    /// Gets nodes ordered by proximity to the routing key for failover.
    /// </summary>
    /// <param name="routingKey">Key to locate on the ring.</param>
    /// <returns>
    /// Each node id at most once, starting with the owner of the first ring position at or after
    /// the key's hash and wrapping around. Empty when the ring has no nodes.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="routingKey"/> is null.</exception>
    public List<VerificationNode> GetOrderedNodes(string routingKey)
    {
        ArgumentNullException.ThrowIfNull(routingKey);

        var orderedNodes = new List<VerificationNode>(_distinctNodeCount);
        if (_sortedHashes.Length == 0)
        {
            return orderedNodes;
        }

        // Binary search for the first position >= hash; a negative result is the
        // complement of the insertion point (which may equal Length => wraps to 0 below).
        var start = Array.BinarySearch(_sortedHashes, ComputeHash(routingKey));
        if (start < 0)
        {
            start = ~start;
        }

        var seen = new HashSet<string>();
        for (var offset = 0;
             offset < _sortedHashes.Length && orderedNodes.Count < _distinctNodeCount;
             offset++)
        {
            var node = _sortedNodes[(start + offset) % _sortedHashes.Length];
            if (seen.Add(node.Id))
            {
                orderedNodes.Add(node);
            }
        }

        return orderedNodes;
    }

    /// <summary>
    /// Maps a key to a ring position: the first four bytes of its SHA-256 digest read as a
    /// little-endian signed integer, so positions are identical on every CPU architecture.
    /// </summary>
    private static int ComputeHash(string key)
    {
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(key));
        return System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(hashBytes);
    }
}
/// <summary>
/// Settings for the distributed verification provider: the node set plus the
/// hashing, retry, timeout and circuit-breaker tuning knobs.
/// </summary>
public class DistributedVerificationOptions
{
    /// <summary>
    /// Verification nodes that requests can be routed to. Starts out empty.
    /// </summary>
    public List<VerificationNode> Nodes { get; set; } = new();

    /// <summary>
    /// Healthy-node count at or above which the provider's health check reports healthy. Default: 1.
    /// </summary>
    public int MinHealthyNodes { get; set; } = 1;

    /// <summary>
    /// How many positions each physical node occupies on the consistent-hash ring. Default: 100.
    /// </summary>
    public int VirtualNodeMultiplier { get; set; } = 100;

    /// <summary>
    /// Upper bound on retry attempts against a single node. Default: 3.
    /// </summary>
    public int MaxRetries { get; set; } = 3;

    /// <summary>
    /// Base delay used between retry attempts. Default: 500 ms.
    /// </summary>
    public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMilliseconds(500);

    /// <summary>
    /// Timeout applied to the call made to a node. Default: 30 s.
    /// </summary>
    public TimeSpan RequestTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Failures recorded against a node at which it starts being skipped. Default: 3.
    /// </summary>
    public int CircuitBreakerThreshold { get; set; } = 3;

    /// <summary>
    /// Time after a node's last failure before it is given another chance. Default: 1 minute.
    /// </summary>
    public TimeSpan CircuitBreakerCooldown { get; set; } = TimeSpan.FromMinutes(1);
}
/// <summary>
/// Describes one verification node that the distributed provider can route requests to.
/// All properties are set at initialization time only.
/// </summary>
public class VerificationNode
{
/// <summary>
/// Unique identifier for this node. Used as the key for hash-ring placement,
/// per-node failure tracking and health-check results.
/// </summary>
public required string Id { get; init; }
/// <summary>
/// Base URI for the node's API; the provider resolves the relative paths
/// <c>health</c> and <c>api/v1/verify</c> against it.
/// NOTE(review): relative resolution against a base that has a path but no trailing
/// slash replaces the last path segment - confirm configured values end with '/'.
/// </summary>
public required Uri Endpoint { get; init; }
/// <summary>
/// Node priority (lower = higher priority). Defaults to 100.
/// NOTE(review): nothing in this file reads this value - confirm where it is consumed.
/// </summary>
public int Priority { get; init; } = 100;
/// <summary>
/// Node region for locality-aware routing; null when not specified.
/// NOTE(review): nothing in this file reads this value - confirm where it is consumed.
/// </summary>
public string? Region { get; init; }
}
/// <summary>
/// Outcome of probing the verification nodes: an overall verdict plus per-node detail.
/// </summary>
public class HealthCheckResult
{
    /// <summary>Overall verdict for the provider.</summary>
    public bool IsHealthy { get; init; }

    /// <summary>Number of nodes whose probe succeeded.</summary>
    public int HealthyNodeCount { get; init; }

    /// <summary>Number of nodes that were probed.</summary>
    public int TotalNodeCount { get; init; }

    /// <summary>Probe outcome per node id (true = healthy). Empty unless set.</summary>
    public Dictionary<string, bool> NodeStatuses { get; init; } = new();

    /// <summary>When the result was produced.</summary>
    public DateTimeOffset Timestamp { get; init; }
}
/// <summary>
/// Snapshot of how the provider currently sees its nodes, intended for monitoring.
/// </summary>
public class DistributionStats
{
    /// <summary>Number of configured nodes.</summary>
    public int TotalNodes { get; init; }

    /// <summary>Number of nodes currently considered healthy.</summary>
    public int HealthyNodes { get; init; }

    /// <summary>Number of nodes currently not considered healthy.</summary>
    public int UnhealthyNodes { get; init; }

    /// <summary>Configured number of virtual nodes per physical node.</summary>
    public int VirtualNodesPerNode { get; init; }

    /// <summary>State label per node id. Empty unless set.</summary>
    public Dictionary<string, string> CircuitBreakerStates { get; init; } = new();
}