- Add RateLimitConfig for configuration management with YAML binding support. - Introduce RateLimitDecision to encapsulate the result of rate limit checks. - Implement RateLimitMetrics for OpenTelemetry metrics tracking. - Create RateLimitMiddleware for enforcing rate limits on incoming requests. - Develop RateLimitService to orchestrate instance and environment rate limit checks. - Add RateLimitServiceCollectionExtensions for dependency injection registration.
442 lines
15 KiB
C#
442 lines
15 KiB
C#
// ───────────────────────────────────────────────────────────────────────────
|
|
// StellaOps Attestor — Distributed Verification Provider (Resilient, Multi-Node)
|
|
// SPDX-License-Identifier: AGPL-3.0-or-later
|
|
// ───────────────────────────────────────────────────────────────────────────
|
|
|
|
using System.Collections.Concurrent;
|
|
using System.Net.Http.Json;
|
|
using System.Security.Cryptography;
|
|
using System.Text;
|
|
using System.Text.Json;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using Polly;
|
|
using Polly.CircuitBreaker;
|
|
using Polly.Retry;
|
|
using Polly.Timeout;
|
|
using StellaOps.Attestor.Verify.Configuration;
|
|
using StellaOps.Attestor.Verify.Models;
|
|
|
|
namespace StellaOps.Attestor.Verify.Providers;
|
|
|
|
/// <summary>
|
|
/// Provides distributed verification by distributing work across multiple verification nodes.
|
|
/// Implements circuit breaker, retry policies, and consistent hashing for deterministic routing.
|
|
/// </summary>
|
|
public class DistributedVerificationProvider : IVerificationProvider
|
|
{
|
|
private readonly ILogger<DistributedVerificationProvider> _logger;
|
|
private readonly DistributedVerificationOptions _options;
|
|
private readonly HttpClient _httpClient;
|
|
private readonly ConcurrentDictionary<string, CircuitBreakerState> _circuitStates = new();
|
|
private readonly ConsistentHashRing _hashRing;
|
|
private readonly ResiliencePipeline<VerificationResult> _resiliencePipeline;
|
|
|
|
/// <summary>
/// Creates the provider: validates its dependencies, builds the consistent-hash ring
/// over the configured nodes, and prepares the shared resilience pipeline.
/// </summary>
/// <param name="logger">Logger used for routing and failure diagnostics.</param>
/// <param name="options">Options wrapper; its value must list at least one node.</param>
/// <param name="httpClient">HTTP client used for every call to a verification node.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="logger"/>, <paramref name="options"/> (or its value), or
/// <paramref name="httpClient"/> is null.
/// </exception>
/// <exception cref="ArgumentException">No verification node is configured.</exception>
public DistributedVerificationProvider(
    ILogger<DistributedVerificationProvider> logger,
    IOptions<DistributedVerificationOptions> options,
    HttpClient httpClient)
{
    // Validation order (logger, options, httpClient, node list) matches the order
    // in which callers would previously have seen the exceptions.
    ArgumentNullException.ThrowIfNull(logger);

    var resolvedOptions = options?.Value;
    if (resolvedOptions is null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    ArgumentNullException.ThrowIfNull(httpClient);

    if (resolvedOptions.Nodes is not { Count: > 0 })
    {
        throw new ArgumentException("At least one verification node must be configured");
    }

    _logger = logger;
    _options = resolvedOptions;
    _httpClient = httpClient;
    _hashRing = new ConsistentHashRing(resolvedOptions.Nodes, resolvedOptions.VirtualNodeMultiplier);
    _resiliencePipeline = BuildResiliencePipeline();

    _logger.LogInformation("Initialized distributed verification provider with {NodeCount} nodes", resolvedOptions.Nodes.Count);
}
|
|
|
|
/// <inheritdoc/>
/// <remarks>
/// Nodes are tried in consistent-hash order for the request's routing key. A node that
/// fails with a transport error, a pipeline timeout, or a broken circuit is recorded as a
/// failure and the next node is tried. If no node produces a result, a
/// <see cref="VerificationResult"/> with <see cref="VerificationStatus.Error"/> is returned
/// rather than an exception. Cancellation requested through
/// <paramref name="cancellationToken"/> is propagated to the caller and is not counted
/// against any node.
/// </remarks>
public async Task<VerificationResult> VerifyAsync(
    VerificationRequest request,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(request);

    // Compute deterministic hash for routing
    var routingKey = ComputeRoutingKey(request);
    var orderedNodes = _hashRing.GetOrderedNodes(routingKey);

    _logger.LogDebug(
        "Routing verification request {RequestId} with key {RoutingKey} through {NodeCount} nodes",
        request.RequestId,
        routingKey,
        orderedNodes.Count);

    // Try nodes in order until one succeeds
    List<Exception> exceptions = [];
    foreach (var node in orderedNodes)
    {
        if (!IsNodeHealthy(node))
        {
            _logger.LogDebug("Skipping unhealthy node {NodeId}", node.Id);
            continue;
        }

        try
        {
            var result = await _resiliencePipeline.ExecuteAsync(
                async ct => await ExecuteVerificationAsync(node, request, ct),
                cancellationToken);

            // A success ends any failure streak, so that only consecutive failures
            // can accumulate towards the circuit-breaker threshold.
            _circuitStates.TryRemove(node.Id, out _);

            _logger.LogInformation(
                "Verification request {RequestId} completed on node {NodeId} with result {Status}",
                request.RequestId,
                node.Id,
                result.Status);

            return result;
        }
        catch (Exception ex) when (
            // TimeoutRejectedException is what the pipeline's timeout strategy raises;
            // without it a timed-out node would abort the whole call instead of failing over.
            ex is HttpRequestException or TimeoutRejectedException or BrokenCircuitException
            // A TaskCanceledException only counts as a node failure when the caller did
            // not cancel; caller cancellation must propagate and must not poison the node.
            || (ex is TaskCanceledException && !cancellationToken.IsCancellationRequested))
        {
            _logger.LogWarning(ex, "Node {NodeId} failed for request {RequestId}", node.Id, request.RequestId);
            exceptions.Add(ex);
            MarkNodeUnhealthy(node);
        }
    }

    // All nodes failed (or were skipped as unhealthy)
    _logger.LogError(
        "All {NodeCount} nodes failed for verification request {RequestId}",
        orderedNodes.Count,
        request.RequestId);

    return new VerificationResult
    {
        RequestId = request.RequestId,
        Status = VerificationStatus.Error,
        ErrorMessage = $"All verification nodes failed. {exceptions.Count} errors occurred.",
        Timestamp = DateTimeOffset.UtcNow,
    };
}
|
|
|
|
/// <inheritdoc/>
/// <remarks>
/// Probes the <c>health</c> path of every configured node concurrently, each with its own
/// five-second limit linked to <paramref name="cancellationToken"/>. A probe that throws or
/// returns a non-success status marks that node as unhealthy; probing is best-effort and
/// does not throw for individual node failures.
/// </remarks>
public async Task<HealthCheckResult> CheckHealthAsync(CancellationToken cancellationToken = default)
{
    const int ProbeTimeoutSeconds = 5;

    var results = new ConcurrentDictionary<string, bool>();
    var probes = _options.Nodes.Select(async node =>
    {
        try
        {
            using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
            cts.CancelAfter(TimeSpan.FromSeconds(ProbeTimeoutSeconds));

            // Dispose the response so its content and connection are released promptly.
            using var response = await _httpClient.GetAsync(
                new Uri(node.Endpoint, "health"),
                cts.Token);

            results[node.Id] = response.IsSuccessStatusCode;
        }
        catch (Exception ex)
        {
            // Best-effort probe: any failure means "unhealthy", but keep the cause visible.
            _logger.LogDebug(ex, "Health probe failed for node {NodeId}", node.Id);
            results[node.Id] = false;
        }
    });

    await Task.WhenAll(probes);

    var healthyCount = results.Count(r => r.Value);
    var totalCount = results.Count;

    return new HealthCheckResult
    {
        IsHealthy = healthyCount >= _options.MinHealthyNodes,
        HealthyNodeCount = healthyCount,
        TotalNodeCount = totalCount,
        NodeStatuses = results.ToDictionary(r => r.Key, r => r.Value),
        Timestamp = DateTimeOffset.UtcNow,
    };
}
|
|
|
|
/// <summary>
/// Builds a point-in-time snapshot of node health and circuit-breaker state for monitoring.
/// </summary>
/// <returns>
/// Counts of total, healthy and unhealthy nodes, the configured virtual-node multiplier,
/// and the textual circuit state of every node that has a recorded state.
/// </returns>
public DistributionStats GetDistributionStats()
{
    // Classify each configured node once; IsNodeHealthy may reset an expired cooldown.
    var healthy = new List<VerificationNode>();
    foreach (var candidate in _options.Nodes)
    {
        if (IsNodeHealthy(candidate))
        {
            healthy.Add(candidate);
        }
    }

    var unhealthy = _options.Nodes.Except(healthy).ToList();
    var totalNodes = _options.Nodes.Count;
    var virtualNodes = _options.VirtualNodeMultiplier;

    var breakerStates = new Dictionary<string, string>();
    foreach (var entry in _circuitStates)
    {
        breakerStates.Add(entry.Key, entry.Value.ToString());
    }

    return new DistributionStats
    {
        TotalNodes = totalNodes,
        HealthyNodes = healthy.Count,
        UnhealthyNodes = unhealthy.Count,
        VirtualNodesPerNode = virtualNodes,
        CircuitBreakerStates = breakerStates,
    };
}
|
|
|
|
/// <summary>
/// Posts the request as JSON to the node's <c>api/v1/verify</c> path and reads back the result.
/// </summary>
/// <param name="node">Node whose endpoint receives the request.</param>
/// <param name="request">Verification request to send.</param>
/// <param name="cancellationToken">Token passed to the HTTP call and the response read.</param>
/// <returns>The verification result deserialized from the response body.</returns>
/// <exception cref="HttpRequestException">The node answered with a non-success status code.</exception>
/// <exception cref="InvalidOperationException">The response body deserialized to null.</exception>
private async Task<VerificationResult> ExecuteVerificationAsync(
    VerificationNode node,
    VerificationRequest request,
    CancellationToken cancellationToken)
{
    Uri verifyUri = new(node.Endpoint, "api/v1/verify");

    _logger.LogDebug(
        "Sending verification request {RequestId} to node {NodeId} at {Endpoint}",
        request.RequestId,
        node.Id,
        verifyUri);

    using HttpResponseMessage response =
        await _httpClient.PostAsJsonAsync(verifyUri, request, cancellationToken);
    response.EnsureSuccessStatusCode();

    var payload = await response.Content.ReadFromJsonAsync<VerificationResult>(cancellationToken);
    if (payload is null)
    {
        throw new InvalidOperationException("Received null response from verification node");
    }

    return payload;
}
|
|
|
|
/// <summary>
/// Assembles the resilience pipeline shared by all node calls: a timeout strategy
/// followed by an exponential-backoff retry strategy, both configured from the options.
/// </summary>
/// <returns>The built pipeline for <see cref="VerificationResult"/> operations.</returns>
private ResiliencePipeline<VerificationResult> BuildResiliencePipeline()
{
    var timeoutOptions = new TimeoutStrategyOptions
    {
        Timeout = _options.RequestTimeout,
        OnTimeout = timeoutArgs =>
        {
            _logger.LogWarning("Request timed out after {Timeout}", timeoutArgs.Timeout);
            return ValueTask.CompletedTask;
        },
    };

    var retryOptions = new RetryStrategyOptions<VerificationResult>
    {
        MaxRetryAttempts = _options.MaxRetries,
        Delay = _options.RetryDelay,
        BackoffType = DelayBackoffType.Exponential,
        // Only transport failures and cancelled/timed-out HTTP calls are retried.
        ShouldHandle = new PredicateBuilder<VerificationResult>()
            .Handle<HttpRequestException>()
            .Handle<TaskCanceledException>(),
        OnRetry = retryArgs =>
        {
            _logger.LogWarning(
                retryArgs.Outcome.Exception,
                "Retry attempt {AttemptNumber} after delay {Delay}",
                retryArgs.AttemptNumber,
                retryArgs.RetryDelay);
            return ValueTask.CompletedTask;
        },
    };

    // Registration order is preserved: timeout first, then retry.
    return new ResiliencePipelineBuilder<VerificationResult>()
        .AddTimeout(timeoutOptions)
        .AddRetry(retryOptions)
        .Build();
}
|
|
|
|
/// <summary>
/// Derives the routing key for a request: the upper-case hex SHA-256 of
/// "<c>DigestAlgorithm:Digest:ArtifactUri</c>" encoded as UTF-8.
/// </summary>
/// <param name="request">Request whose digest algorithm, digest and artifact URI form the key.</param>
/// <returns>A 64-character hexadecimal string; identical inputs always give the same key.</returns>
private static string ComputeRoutingKey(VerificationRequest request)
{
    // Same content => same key => same primary node on the hash ring.
    byte[] material = Encoding.UTF8.GetBytes(
        $"{request.DigestAlgorithm}:{request.Digest}:{request.ArtifactUri}");

    // SHA-256 always yields 32 bytes.
    Span<byte> digest = stackalloc byte[32];
    SHA256.HashData(material, digest);

    return Convert.ToHexString(digest);
}
|
|
|
|
/// <summary>
/// Decides whether a node may currently receive requests.
/// </summary>
/// <param name="node">Node to evaluate.</param>
/// <returns>
/// <c>true</c> when the node has no recorded failures, when its last failure is older than
/// the configured cooldown (the failure history is then cleared), or when its failure count
/// is below the configured threshold; otherwise <c>false</c>.
/// </returns>
private bool IsNodeHealthy(VerificationNode node)
{
    if (!_circuitStates.TryGetValue(node.Id, out var state))
    {
        return true; // No circuit breaker state means healthy
    }

    // The state object is shared between concurrent requests; guard the
    // check-then-reset so it cannot interleave with a failure being recorded.
    lock (state)
    {
        // Allow recovery after cooldown period
        if (state.LastFailure is { } lastFailure &&
            DateTimeOffset.UtcNow - lastFailure > _options.CircuitBreakerCooldown)
        {
            state.FailureCount = 0;
            state.LastFailure = null;
            return true;
        }

        return state.FailureCount < _options.CircuitBreakerThreshold;
    }
}

/// <summary>
/// Records one failure for a node and logs a warning once the failure count has
/// reached the configured circuit-breaker threshold.
/// </summary>
/// <param name="node">Node that just failed.</param>
private void MarkNodeUnhealthy(VerificationNode node)
{
    var threshold = _options.CircuitBreakerThreshold;
    var state = _circuitStates.GetOrAdd(node.Id, _ => new CircuitBreakerState(threshold));

    int failures;
    lock (state)
    {
        // Increment and timestamp together so readers never see one without the other.
        failures = ++state.FailureCount;
        state.LastFailure = DateTimeOffset.UtcNow;
    }

    if (failures >= threshold)
    {
        _logger.LogWarning(
            "Node {NodeId} circuit breaker opened after {FailureCount} failures",
            node.Id,
            failures);
    }
}

/// <summary>
/// Mutable failure bookkeeping for a single node. Callers that modify an instance
/// do so while holding a lock on it.
/// </summary>
private sealed class CircuitBreakerState
{
    private readonly int _openThreshold;

    /// <param name="openThreshold">Failure count at which the state reports "Open".</param>
    public CircuitBreakerState(int openThreshold)
    {
        _openThreshold = openThreshold;
    }

    /// <summary>Number of failures recorded since the last reset.</summary>
    public int FailureCount { get; set; }

    /// <summary>Time of the most recent failure, or null when none is recorded.</summary>
    public DateTimeOffset? LastFailure { get; set; }

    /// <summary>
    /// Describes the state as "Open" (failure count at or above the threshold),
    /// "HalfOpen" (some failures, below the threshold) or "Closed" (no failures).
    /// </summary>
    public override string ToString()
    {
        // Read once so both comparisons see the same value.
        var failures = FailureCount;
        return failures >= _openThreshold ? "Open" : failures > 0 ? "HalfOpen" : "Closed";
    }
}
|
|
}
|
|
|
|
/// <summary>
/// Implements consistent hashing for deterministic node selection.
/// Each node is placed on the ring at several positions ("virtual nodes"), and a routing
/// key is served by the nodes encountered when walking the ring clockwise from the key's hash.
/// </summary>
internal sealed class ConsistentHashRing
{
    // Parallel arrays: _sortedHashes[i] is the ring position owned by _sortedNodes[i].
    private readonly int[] _sortedHashes;
    private readonly VerificationNode[] _sortedNodes;

    // Number of distinct node ids on the ring; lets lookups stop as soon as every node is found.
    private readonly int _distinctNodeCount;

    /// <summary>
    /// Builds the ring by hashing "<c>{node.Id}:{i}</c>" for each node and each virtual index.
    /// </summary>
    /// <param name="nodes">Nodes to place on the ring.</param>
    /// <param name="virtualNodeMultiplier">Number of ring positions per node; must be positive.</param>
    /// <exception cref="ArgumentNullException"><paramref name="nodes"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="virtualNodeMultiplier"/> is zero or negative, which would leave the ring empty.
    /// </exception>
    public ConsistentHashRing(IReadOnlyList<VerificationNode> nodes, int virtualNodeMultiplier)
    {
        ArgumentNullException.ThrowIfNull(nodes);
        if (virtualNodeMultiplier <= 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(virtualNodeMultiplier),
                virtualNodeMultiplier,
                "At least one virtual node per physical node is required.");
        }

        // Only needed while building; the sorted arrays are what lookups use.
        var ring = new SortedDictionary<int, VerificationNode>();
        foreach (var node in nodes)
        {
            for (var i = 0; i < virtualNodeMultiplier; i++)
            {
                // On a (rare) hash collision the later entry replaces the earlier one.
                ring[ComputeHash($"{node.Id}:{i}")] = node;
            }
        }

        _sortedHashes = [.. ring.Keys];
        _sortedNodes = [.. ring.Values];

        var distinctIds = new HashSet<string>();
        foreach (var node in _sortedNodes)
        {
            distinctIds.Add(node.Id);
        }

        _distinctNodeCount = distinctIds.Count;
    }

    /// <summary>
    /// Gets nodes ordered by proximity to the routing key for failover.
    /// </summary>
    /// <param name="routingKey">Key whose hash selects the starting ring position.</param>
    /// <returns>
    /// Each distinct node exactly once, starting with the owner of the first ring position
    /// at or after the key's hash and continuing around the ring. Empty when the ring is empty.
    /// </returns>
    public List<VerificationNode> GetOrderedNodes(string routingKey)
    {
        var keyHash = ComputeHash(routingKey);

        // Binary search for the first position >= hash. A negative result is the bitwise
        // complement of the insertion point, which may equal Length; the modulo below wraps it.
        var start = Array.BinarySearch(_sortedHashes, keyHash);
        if (start < 0)
        {
            start = ~start;
        }

        var orderedNodes = new List<VerificationNode>(_distinctNodeCount);
        var seen = new HashSet<string>(_distinctNodeCount);

        // Stop as soon as every distinct node has been collected instead of
        // walking all virtual positions on every lookup.
        for (var offset = 0;
             offset < _sortedHashes.Length && orderedNodes.Count < _distinctNodeCount;
             offset++)
        {
            var node = _sortedNodes[(start + offset) % _sortedHashes.Length];
            if (seen.Add(node.Id))
            {
                orderedNodes.Add(node);
            }
        }

        return orderedNodes;
    }

    /// <summary>
    /// Maps a key to a ring position: the first four bytes of its SHA-256, read as a
    /// little-endian 32-bit integer so the position is the same on every platform.
    /// </summary>
    private static int ComputeHash(string key)
    {
        var hashBytes = SHA256.HashData(Encoding.UTF8.GetBytes(key));
        return System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(hashBytes);
    }
}
|
|
|
|
/// <summary>
/// Settings that control node membership, routing, retries, timeouts and
/// circuit breaking for distributed verification.
/// </summary>
public class DistributedVerificationOptions
{
    /// <summary>
    /// Verification nodes that requests can be routed to. Empty by default.
    /// </summary>
    public List<VerificationNode> Nodes { get; set; } = new();

    /// <summary>
    /// Smallest number of healthy nodes for the provider to be considered healthy. Defaults to 1.
    /// </summary>
    public int MinHealthyNodes { get; set; } = 1;

    /// <summary>
    /// How many positions each physical node occupies on the consistent-hash ring. Defaults to 100.
    /// </summary>
    public int VirtualNodeMultiplier { get; set; } = 100;

    /// <summary>
    /// Upper bound on retry attempts made against a single node. Defaults to 3.
    /// </summary>
    public int MaxRetries { get; set; } = 3;

    /// <summary>
    /// Delay used between retries. Defaults to 500 milliseconds.
    /// </summary>
    public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMilliseconds(500);

    /// <summary>
    /// Time limit applied to a request sent to a node. Defaults to 30 seconds.
    /// </summary>
    public TimeSpan RequestTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Failure count at which a node's circuit breaker opens. Defaults to 3.
    /// </summary>
    public int CircuitBreakerThreshold { get; set; } = 3;

    /// <summary>
    /// How long after its last failure a node becomes eligible again. Defaults to 1 minute.
    /// </summary>
    public TimeSpan CircuitBreakerCooldown { get; set; } = TimeSpan.FromMinutes(1);
}
|
|
|
|
/// <summary>
/// Describes one member of the distributed verification cluster.
/// </summary>
public class VerificationNode
{
    /// <summary>
    /// Identifier that distinguishes this node from every other node. Required.
    /// </summary>
    public required string Id { get; init; }

    /// <summary>
    /// Base address of the node's API; request paths are resolved relative to it. Required.
    /// </summary>
    public required Uri Endpoint { get; init; }

    /// <summary>
    /// Relative priority of the node, where a lower number means a higher priority. Defaults to 100.
    /// </summary>
    public int Priority { get; init; } = 100;

    /// <summary>
    /// Region the node is located in, for locality-aware routing; null when unspecified.
    /// </summary>
    public string? Region { get; init; }
}
|
|
|
|
/// <summary>
/// Outcome of probing the nodes behind the distributed provider.
/// </summary>
public class HealthCheckResult
{
    /// <summary>Overall verdict for the provider.</summary>
    public bool IsHealthy { get; init; }

    /// <summary>Number of nodes that were found healthy.</summary>
    public int HealthyNodeCount { get; init; }

    /// <summary>Number of nodes that were checked.</summary>
    public int TotalNodeCount { get; init; }

    /// <summary>Health flag per node, keyed by node id. Empty by default.</summary>
    public Dictionary<string, bool> NodeStatuses { get; init; } = new();

    /// <summary>When the result was produced.</summary>
    public DateTimeOffset Timestamp { get; init; }
}
|
|
|
|
/// <summary>
/// Snapshot of how the cluster is distributed, intended for monitoring.
/// </summary>
public class DistributionStats
{
    /// <summary>Number of configured nodes.</summary>
    public int TotalNodes { get; init; }

    /// <summary>Number of nodes counted as healthy.</summary>
    public int HealthyNodes { get; init; }

    /// <summary>Number of nodes counted as unhealthy.</summary>
    public int UnhealthyNodes { get; init; }

    /// <summary>Virtual nodes per physical node on the hash ring.</summary>
    public int VirtualNodesPerNode { get; init; }

    /// <summary>Circuit-breaker state text, keyed by node id. Empty by default.</summary>
    public Dictionary<string, string> CircuitBreakerStates { get; init; } = new();
}
|