feat(rate-limiting): Implement core rate limiting functionality with configuration, decision-making, metrics, middleware, and service registration
- Add RateLimitConfig for configuration management with YAML binding support.
- Introduce RateLimitDecision to encapsulate the result of rate limit checks.
- Implement RateLimitMetrics for OpenTelemetry metrics tracking.
- Create RateLimitMiddleware for enforcing rate limits on incoming requests.
- Develop RateLimitService to orchestrate instance and environment rate limit checks.
- Add RateLimitServiceCollectionExtensions for dependency injection registration.
This commit is contained in:
@@ -0,0 +1,441 @@
|
||||
// ───────────────────────────────────────────────────────────────────────────
|
||||
// StellaOps Attestor — Distributed Verification Provider (Resilient, Multi-Node)
|
||||
// SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
// ───────────────────────────────────────────────────────────────────────────
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Net.Http.Json;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using Polly;
|
||||
using Polly.CircuitBreaker;
|
||||
using Polly.Retry;
|
||||
using Polly.Timeout;
|
||||
using StellaOps.Attestor.Verify.Configuration;
|
||||
using StellaOps.Attestor.Verify.Models;
|
||||
|
||||
namespace StellaOps.Attestor.Verify.Providers;
|
||||
|
||||
/// <summary>
/// Verification provider that forwards each request to one of several remote verification nodes.
/// </summary>
/// <remarks>
/// <para>
/// Nodes are tried in the order produced by a consistent-hash ring keyed on the request's digest
/// algorithm, digest and artifact URI, so identical content prefers the same node.
/// </para>
/// <para>
/// Each node call runs through a Polly pipeline (overall timeout wrapping exponential-backoff retries).
/// A lightweight per-node failure counter acts as a circuit breaker: a node is skipped once it has
/// accumulated <see cref="DistributedVerificationOptions.CircuitBreakerThreshold"/> consecutive failed
/// requests, and becomes eligible again after
/// <see cref="DistributedVerificationOptions.CircuitBreakerCooldown"/> has elapsed since its last failure.
/// </para>
/// </remarks>
public class DistributedVerificationProvider : IVerificationProvider
{
    /// <summary>Upper bound for a single node health probe.</summary>
    private static readonly TimeSpan HealthProbeTimeout = TimeSpan.FromSeconds(5);

    private readonly ILogger<DistributedVerificationProvider> _logger;
    private readonly DistributedVerificationOptions _options;
    private readonly HttpClient _httpClient;
    private readonly ConcurrentDictionary<string, CircuitBreakerState> _circuitStates = new();
    private readonly ConsistentHashRing _hashRing;
    private readonly ResiliencePipeline<VerificationResult> _resiliencePipeline;

    /// <summary>
    /// Creates the provider, builds the hash ring from the configured nodes and prepares the resilience pipeline.
    /// </summary>
    /// <param name="logger">Logger used for routing, retry and circuit-breaker diagnostics.</param>
    /// <param name="options">Distributed verification settings; must contain at least one node.</param>
    /// <param name="httpClient">HTTP client used for all node calls.</param>
    /// <exception cref="ArgumentNullException">Any argument (or <c>options.Value</c>) is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">No verification nodes are configured.</exception>
    public DistributedVerificationProvider(
        ILogger<DistributedVerificationProvider> logger,
        IOptions<DistributedVerificationOptions> options,
        HttpClient httpClient)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _httpClient = httpClient ?? throw new ArgumentNullException(nameof(httpClient));

        if (_options.Nodes is null || _options.Nodes.Count == 0)
        {
            throw new ArgumentException("At least one verification node must be configured", nameof(options));
        }

        _hashRing = new ConsistentHashRing(_options.Nodes, _options.VirtualNodeMultiplier);
        _resiliencePipeline = BuildResiliencePipeline();

        _logger.LogInformation("Initialized distributed verification provider with {NodeCount} nodes", _options.Nodes.Count);
    }

    /// <summary>
    /// Verifies the request on the first node (in consistent-hash order) that answers successfully.
    /// </summary>
    /// <param name="request">The verification request to forward.</param>
    /// <param name="cancellationToken">Token that aborts the whole operation.</param>
    /// <returns>
    /// The node's result, or a result with <see cref="VerificationStatus.Error"/> when every node was
    /// skipped or failed.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    /// <remarks>
    /// Only transport-level failures (HTTP errors, cancellations caused by timeouts, pipeline timeouts,
    /// broken circuits) trigger failover. Other exceptions, such as a null response body, propagate to the caller.
    /// </remarks>
    public async Task<VerificationResult> VerifyAsync(
        VerificationRequest request,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        // Deterministic routing: the same content always yields the same node ordering.
        var routingKey = ComputeRoutingKey(request);
        var orderedNodes = _hashRing.GetOrderedNodes(routingKey);

        _logger.LogDebug(
            "Routing verification request {RequestId} with key {RoutingKey} through {NodeCount} nodes",
            request.RequestId,
            routingKey,
            orderedNodes.Count);

        var errorCount = 0;
        foreach (var node in orderedNodes)
        {
            // A cancelled caller must not burn through (and penalise) the remaining nodes.
            cancellationToken.ThrowIfCancellationRequested();

            if (!IsNodeHealthy(node))
            {
                _logger.LogDebug("Skipping unhealthy node {NodeId}", node.Id);
                continue;
            }

            try
            {
                var result = await _resiliencePipeline.ExecuteAsync(
                    async ct => await ExecuteVerificationAsync(node, request, ct),
                    cancellationToken);

                // A success ends the failure streak so the threshold really counts consecutive failures.
                MarkNodeHealthy(node);

                _logger.LogInformation(
                    "Verification request {RequestId} completed on node {NodeId} with result {Status}",
                    request.RequestId,
                    node.Id,
                    result.Status);

                return result;
            }
            catch (Exception ex) when (IsNodeFailure(ex, cancellationToken))
            {
                _logger.LogWarning(ex, "Node {NodeId} failed for request {RequestId}", node.Id, request.RequestId);
                errorCount++;
                MarkNodeUnhealthy(node);
            }
        }

        _logger.LogError(
            "All {NodeCount} nodes failed for verification request {RequestId}",
            orderedNodes.Count,
            request.RequestId);

        return new VerificationResult
        {
            RequestId = request.RequestId,
            Status = VerificationStatus.Error,
            ErrorMessage = $"All verification nodes failed. {errorCount} errors occurred.",
            Timestamp = DateTimeOffset.UtcNow,
        };
    }

    /// <summary>
    /// Probes the <c>health</c> endpoint of every configured node in parallel.
    /// </summary>
    /// <param name="cancellationToken">Token linked into every probe.</param>
    /// <returns>
    /// Per-node probe outcomes; the provider is reported healthy when at least
    /// <see cref="DistributedVerificationOptions.MinHealthyNodes"/> nodes returned a success status code.
    /// </returns>
    /// <remarks>
    /// Probing is best-effort: any exception raised by a probe (including cancellation) marks that node
    /// as unhealthy rather than failing the whole check. Results are keyed by node id.
    /// </remarks>
    public async Task<HealthCheckResult> CheckHealthAsync(CancellationToken cancellationToken = default)
    {
        var results = new ConcurrentDictionary<string, bool>();

        var probes = _options.Nodes.Select(async node =>
        {
            try
            {
                using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
                cts.CancelAfter(HealthProbeTimeout);

                // Dispose the response so the underlying connection is released promptly.
                using var response = await _httpClient.GetAsync(
                    new Uri(node.Endpoint, "health"),
                    cts.Token);

                results[node.Id] = response.IsSuccessStatusCode;
            }
            catch (Exception ex)
            {
                _logger.LogDebug(ex, "Health probe failed for node {NodeId}", node.Id);
                results[node.Id] = false;
            }
        });

        await Task.WhenAll(probes);

        var healthyCount = results.Count(r => r.Value);

        return new HealthCheckResult
        {
            IsHealthy = healthyCount >= _options.MinHealthyNodes,
            HealthyNodeCount = healthyCount,
            TotalNodeCount = results.Count,
            NodeStatuses = results.ToDictionary(r => r.Key, r => r.Value),
            Timestamp = DateTimeOffset.UtcNow,
        };
    }

    /// <summary>
    /// Gets the current distribution statistics for monitoring.
    /// </summary>
    /// <returns>
    /// Node counts plus a textual circuit state for every node that has recorded a failure.
    /// States are derived from the configured <see cref="DistributedVerificationOptions.CircuitBreakerThreshold"/>.
    /// </returns>
    public DistributionStats GetDistributionStats()
    {
        var totalNodes = _options.Nodes.Count;
        var healthyNodes = _options.Nodes.Count(IsNodeHealthy);
        var threshold = _options.CircuitBreakerThreshold;

        return new DistributionStats
        {
            TotalNodes = totalNodes,
            HealthyNodes = healthyNodes,
            UnhealthyNodes = totalNodes - healthyNodes,
            VirtualNodesPerNode = _options.VirtualNodeMultiplier,
            CircuitBreakerStates = _circuitStates.ToDictionary(
                kvp => kvp.Key,
                kvp => kvp.Value.Describe(threshold)),
        };
    }

    /// <summary>
    /// Posts the request as JSON to the node's <c>api/v1/verify</c> endpoint and deserialises the reply.
    /// </summary>
    /// <exception cref="HttpRequestException">The node returned a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">The node returned a JSON <c>null</c> body.</exception>
    private async Task<VerificationResult> ExecuteVerificationAsync(
        VerificationNode node,
        VerificationRequest request,
        CancellationToken cancellationToken)
    {
        var endpoint = new Uri(node.Endpoint, "api/v1/verify");

        _logger.LogDebug(
            "Sending verification request {RequestId} to node {NodeId} at {Endpoint}",
            request.RequestId,
            node.Id,
            endpoint);

        using var response = await _httpClient.PostAsJsonAsync(endpoint, request, cancellationToken);
        response.EnsureSuccessStatusCode();

        var result = await response.Content.ReadFromJsonAsync<VerificationResult>(cancellationToken);
        return result ?? throw new InvalidOperationException("Received null response from verification node");
    }

    /// <summary>
    /// Builds the per-node pipeline: a timeout added first (so it bounds the whole retry sequence),
    /// then exponential-backoff retries for HTTP failures and task cancellations.
    /// </summary>
    private ResiliencePipeline<VerificationResult> BuildResiliencePipeline()
    {
        return new ResiliencePipelineBuilder<VerificationResult>()
            .AddTimeout(new TimeoutStrategyOptions
            {
                Timeout = _options.RequestTimeout,
                OnTimeout = args =>
                {
                    _logger.LogWarning("Request timed out after {Timeout}", args.Timeout);
                    return default;
                },
            })
            .AddRetry(new RetryStrategyOptions<VerificationResult>
            {
                MaxRetryAttempts = _options.MaxRetries,
                Delay = _options.RetryDelay,
                BackoffType = DelayBackoffType.Exponential,
                ShouldHandle = new PredicateBuilder<VerificationResult>()
                    .Handle<HttpRequestException>()
                    .Handle<TaskCanceledException>(),
                OnRetry = args =>
                {
                    _logger.LogWarning(
                        args.Outcome.Exception,
                        "Retry attempt {AttemptNumber} after delay {Delay}",
                        args.AttemptNumber,
                        args.RetryDelay);
                    return default;
                },
            })
            .Build();
    }

    /// <summary>
    /// Derives the routing key: the upper-case hex SHA-256 of <c>algorithm:digest:artifactUri</c>.
    /// </summary>
    private static string ComputeRoutingKey(VerificationRequest request)
    {
        var keyMaterial = $"{request.DigestAlgorithm}:{request.Digest}:{request.ArtifactUri}";
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(keyMaterial));
        return Convert.ToHexString(hashBytes);
    }

    /// <summary>
    /// Decides whether an exception should count against the node and trigger failover.
    /// Nothing is blamed on the node once the caller has cancelled; the exception propagates instead.
    /// </summary>
    private static bool IsNodeFailure(Exception ex, CancellationToken callerToken) =>
        !callerToken.IsCancellationRequested
        && ex is HttpRequestException
            or TaskCanceledException
            or TimeoutRejectedException
            or BrokenCircuitException;

    /// <summary>
    /// Returns <c>true</c> when the node has no recorded failures, is below the failure threshold,
    /// or its cooldown has elapsed (which also clears its failure history).
    /// </summary>
    private bool IsNodeHealthy(VerificationNode node) =>
        !_circuitStates.TryGetValue(node.Id, out var state)
        || state.AllowsRequests(
            _options.CircuitBreakerThreshold,
            _options.CircuitBreakerCooldown,
            DateTimeOffset.UtcNow);

    /// <summary>Clears the failure streak of a node that has just served a request successfully.</summary>
    private void MarkNodeHealthy(VerificationNode node)
    {
        if (_circuitStates.TryGetValue(node.Id, out var state))
        {
            state.Reset();
        }
    }

    /// <summary>Records one failed request for the node and logs when the threshold is reached.</summary>
    private void MarkNodeUnhealthy(VerificationNode node)
    {
        var state = _circuitStates.GetOrAdd(node.Id, static _ => new CircuitBreakerState());
        var failures = state.RecordFailure(DateTimeOffset.UtcNow);

        if (failures >= _options.CircuitBreakerThreshold)
        {
            _logger.LogWarning(
                "Node {NodeId} circuit breaker opened after {FailureCount} failures",
                node.Id,
                failures);
        }
    }

    /// <summary>
    /// Failure bookkeeping for one node. All access goes through a lock because the provider
    /// can serve concurrent requests that touch the same node.
    /// </summary>
    private sealed class CircuitBreakerState
    {
        private readonly object _gate = new();
        private int _failureCount;
        private DateTimeOffset? _lastFailure;

        /// <summary>Registers a failure at <paramref name="now"/> and returns the updated streak length.</summary>
        public int RecordFailure(DateTimeOffset now)
        {
            lock (_gate)
            {
                _lastFailure = now;
                return ++_failureCount;
            }
        }

        /// <summary>Forgets all recorded failures.</summary>
        public void Reset()
        {
            lock (_gate)
            {
                _failureCount = 0;
                _lastFailure = null;
            }
        }

        /// <summary>
        /// Returns whether requests may be sent. Once more than <paramref name="cooldown"/> has passed since
        /// the last failure, the history is cleared and the node is allowed again.
        /// </summary>
        public bool AllowsRequests(int threshold, TimeSpan cooldown, DateTimeOffset now)
        {
            lock (_gate)
            {
                if (_lastFailure is { } lastFailure && now - lastFailure > cooldown)
                {
                    _failureCount = 0;
                    _lastFailure = null;
                    return true;
                }

                return _failureCount < threshold;
            }
        }

        /// <summary>
        /// Textual state for monitoring: "Open" at or above <paramref name="threshold"/>,
        /// "HalfOpen" with some failures below it, otherwise "Closed".
        /// </summary>
        public string Describe(int threshold)
        {
            lock (_gate)
            {
                if (_failureCount >= threshold)
                {
                    return "Open";
                }

                return _failureCount > 0 ? "HalfOpen" : "Closed";
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Consistent-hash ring used to pick a deterministic node ordering for a routing key.
/// </summary>
/// <remarks>
/// Every physical node is placed on the ring several times (once per virtual node) at positions
/// derived from the SHA-256 of <c>"{node.Id}:{index}"</c>. If two virtual nodes hash to the same
/// position, the one inserted later replaces the earlier one. The ring is immutable after construction.
/// </remarks>
internal sealed class ConsistentHashRing
{
    // Parallel arrays: _sortedNodes[i] owns ring position _sortedHashes[i] (ascending order).
    private readonly int[] _sortedHashes;
    private readonly VerificationNode[] _sortedNodes;

    // Number of distinct node ids on the ring; lets lookups stop once each has been seen.
    private readonly int _distinctNodeCount;

    /// <summary>
    /// Builds the ring from the given nodes.
    /// </summary>
    /// <param name="nodes">Physical nodes to place on the ring.</param>
    /// <param name="virtualNodeMultiplier">Number of ring positions per physical node; must be at least 1.</param>
    /// <exception cref="ArgumentNullException"><paramref name="nodes"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="virtualNodeMultiplier"/> is less than 1.</exception>
    public ConsistentHashRing(IReadOnlyList<VerificationNode> nodes, int virtualNodeMultiplier)
    {
        ArgumentNullException.ThrowIfNull(nodes);

        // A non-positive multiplier would produce an empty ring and silently route nothing.
        if (virtualNodeMultiplier < 1)
        {
            throw new ArgumentOutOfRangeException(
                nameof(virtualNodeMultiplier),
                virtualNodeMultiplier,
                "At least one virtual node per physical node is required");
        }

        var ring = new SortedDictionary<int, VerificationNode>();
        foreach (var node in nodes)
        {
            for (var i = 0; i < virtualNodeMultiplier; i++)
            {
                ring[ComputeHash($"{node.Id}:{i}")] = node;
            }
        }

        _sortedHashes = [.. ring.Keys];
        _sortedNodes = [.. ring.Values];

        var distinctIds = new HashSet<string>();
        foreach (var node in _sortedNodes)
        {
            distinctIds.Add(node.Id);
        }

        _distinctNodeCount = distinctIds.Count;
    }

    /// <summary>
    /// Gets nodes ordered by proximity to the routing key for failover.
    /// </summary>
    /// <param name="routingKey">Key whose hash determines the starting position on the ring.</param>
    /// <returns>
    /// Each distinct node (by id) exactly once, in the order encountered when walking the ring
    /// clockwise from the first position at or after the key's hash. Empty when the ring is empty.
    /// </returns>
    public List<VerificationNode> GetOrderedNodes(string routingKey)
    {
        var keyHash = ComputeHash(routingKey);

        // BinarySearch returns the bitwise complement of the insertion point when there is no exact match.
        var start = Array.BinarySearch(_sortedHashes, keyHash);
        if (start < 0)
        {
            start = ~start;
        }

        var ordered = new List<VerificationNode>(_distinctNodeCount);
        var seen = new HashSet<string>(_distinctNodeCount);

        // Walk the ring with wrap-around and stop as soon as every distinct node has been collected.
        for (var offset = 0; offset < _sortedHashes.Length && ordered.Count < _distinctNodeCount; offset++)
        {
            var node = _sortedNodes[(start + offset) % _sortedHashes.Length];
            if (seen.Add(node.Id))
            {
                ordered.Add(node);
            }
        }

        return ordered;
    }

    /// <summary>
    /// Maps a key to a ring position: the first four bytes of its SHA-256, read as a little-endian
    /// 32-bit integer so the position does not depend on the host's byte order.
    /// </summary>
    private static int ComputeHash(string key)
    {
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(key));
        return System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(hashBytes);
    }
}
|
||||
|
||||
/// <summary>
/// Settings that control how <c>DistributedVerificationProvider</c> routes, retries and fails over.
/// </summary>
public class DistributedVerificationOptions
{
    /// <summary>Verification nodes requests can be routed to. Empty by default.</summary>
    public List<VerificationNode> Nodes { get; set; } = new();

    /// <summary>Number of nodes that must pass a health probe for the provider to report healthy. Defaults to 1.</summary>
    public int MinHealthyNodes { get; set; } = 1;

    /// <summary>How many virtual nodes each physical node contributes to the consistent-hash ring. Defaults to 100.</summary>
    public int VirtualNodeMultiplier { get; set; } = 100;

    /// <summary>Maximum number of retry attempts against a single node. Defaults to 3.</summary>
    public int MaxRetries { get; set; } = 3;

    /// <summary>Base delay between retries (used with exponential backoff). Defaults to 500 ms.</summary>
    public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMilliseconds(500);

    /// <summary>Timeout applied to the work done against one node. Defaults to 30 seconds.</summary>
    public TimeSpan RequestTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>Failure count at which a node's circuit breaker opens. Defaults to 3.</summary>
    public int CircuitBreakerThreshold { get; set; } = 3;

    /// <summary>Time that must pass after a node's last failure before it is tried again. Defaults to 1 minute.</summary>
    public TimeSpan CircuitBreakerCooldown { get; set; } = TimeSpan.FromMinutes(1);
}
|
||||
|
||||
/// <summary>
/// Describes one member of the distributed verification cluster.
/// </summary>
public class VerificationNode
{
    /// <summary>Identifier that uniquely names this node; must be supplied at initialisation.</summary>
    public required string Id { get; init; }

    /// <summary>Base address of the node's API; must be supplied at initialisation.</summary>
    public required Uri Endpoint { get; init; }

    /// <summary>Priority of the node, where a lower number means a higher priority. Defaults to 100.</summary>
    public int Priority { get; init; } = 100;

    /// <summary>Optional region label intended for locality-aware routing.</summary>
    public string? Region { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of a health probe across the nodes of the distributed provider.
/// </summary>
public class HealthCheckResult
{
    /// <summary>Overall verdict for the provider.</summary>
    public bool IsHealthy { get; init; }

    /// <summary>Number of nodes that were found healthy.</summary>
    public int HealthyNodeCount { get; init; }

    /// <summary>Number of nodes included in the check.</summary>
    public int TotalNodeCount { get; init; }

    /// <summary>Per-node health flag, keyed by node identifier. Empty by default.</summary>
    public Dictionary<string, bool> NodeStatuses { get; init; } = new();

    /// <summary>Moment the result was produced.</summary>
    public DateTimeOffset Timestamp { get; init; }
}
|
||||
|
||||
/// <summary>
/// Snapshot of node distribution figures exposed for monitoring.
/// </summary>
public class DistributionStats
{
    /// <summary>Number of configured nodes.</summary>
    public int TotalNodes { get; init; }

    /// <summary>Number of nodes currently considered healthy.</summary>
    public int HealthyNodes { get; init; }

    /// <summary>Number of nodes currently considered unhealthy.</summary>
    public int UnhealthyNodes { get; init; }

    /// <summary>Virtual nodes placed on the hash ring per physical node.</summary>
    public int VirtualNodesPerNode { get; init; }

    /// <summary>Textual circuit-breaker state per node identifier. Empty by default.</summary>
    public Dictionary<string, string> CircuitBreakerStates { get; init; } = new();
}
|
||||
Reference in New Issue
Block a user