Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
- Implemented MigrationCategoryTests to validate migration categorization for startup, release, seed, and data migrations. - Added tests for edge cases, including null, empty, and whitespace migration names. - Created StartupMigrationHostTests to verify the behavior of the migration host with real PostgreSQL instances using Testcontainers. - Included tests for migration execution, schema creation, and handling of pending release migrations. - Added SQL migration files for testing: creating a test table, adding a column, a release migration, and seeding data.
857 lines
22 KiB
Markdown
857 lines
22 KiB
Markdown
# Step 24: Circuit Breaker & Retry Policies

**Phase 6: Observability & Resilience**
**Estimated Complexity:** High
**Dependencies:** Step 23 (Metrics & Health Checks)

---
|
|
|
|
## Overview

Circuit breakers and retry policies protect the system from cascading failures and transient errors. The circuit breaker prevents requests to failing services, while retry policies automatically retry failed requests with exponential backoff.

---
|
|
|
|
## Goals

1. Implement circuit breaker pattern for service protection
2. Support configurable retry policies
3. Enable per-service and per-endpoint policies
4. Integrate with metrics for observability
5. Provide graceful degradation strategies

---
|
|
|
|
## Circuit Breaker Configuration
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// Settings that control when a circuit breaker trips and how long it stays open.
/// </summary>
public class CircuitBreakerConfig
{
    /// <summary>Failure count at which the circuit opens. Defaults to 5.</summary>
    public int FailureThreshold { get; set; } = 5;

    /// <summary>Length of the window over which failures are counted. Defaults to 30 seconds.</summary>
    public TimeSpan SamplingDuration { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>Time the circuit stays open before it is tested again. Defaults to 30 seconds.</summary>
    public TimeSpan BreakDuration { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>Minimum number of requests in the window before the circuit may trip. Defaults to 10.</summary>
    public int MinimumThroughput { get; set; } = 10;

    /// <summary>Failure ratio (0.0 to 1.0) at which the circuit trips. Defaults to 0.5.</summary>
    public double FailureRatioThreshold { get; set; } = 0.5;

    /// <summary>HTTP status codes that count as failures.</summary>
    public HashSet<int> FailureStatusCodes { get; set; } = new HashSet<int> { 500, 502, 503, 504 };

    /// <summary>Exception types that count as failures.</summary>
    public HashSet<Type> FailureExceptions { get; set; } = new HashSet<Type>
    {
        typeof(TimeoutException),
        typeof(TaskCanceledException),
        typeof(HttpRequestException),
    };
}
|
|
```
|
|
|
|
---
|
|
|
|
## Circuit Breaker Implementation
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// States a circuit breaker can be in. The numeric values are assigned explicitly
/// because the breaker records the state as an integer metric value.
/// </summary>
public enum CircuitState
{
    /// <summary>Normal operation.</summary>
    Closed = 0,

    /// <summary>Testing with limited requests.</summary>
    HalfOpen = 1,

    /// <summary>Blocking requests.</summary>
    Open = 2,
}
|
|
|
|
/// <summary>
/// Circuit breaker for a single service or endpoint.
/// </summary>
/// <remarks>
/// <para>
/// All state (<see cref="State"/>, the time the circuit opened, and the half-open probe flag)
/// is read and written under one private lock, so concurrent callers always observe a
/// consistent state machine.
/// </para>
/// <para>
/// While half-open, exactly one probe request is admitted. The probe permit is a plain flag
/// rather than a semaphore: completing a request that never held the permit (for example one
/// admitted while the circuit was still closed) can therefore not over-release it.
/// </para>
/// <para>
/// Any result recorded while half-open settles the probe: a success closes the circuit and a
/// failure re-opens it. Results are not correlated with the request that was admitted.
/// </para>
/// </remarks>
public sealed class CircuitBreaker
{
    private readonly CircuitBreakerConfig _config;
    private readonly ILogger<CircuitBreaker> _logger;
    private readonly SlidingWindow _window;

    // Guards every mutable field below, plus the window operations that must be
    // atomic with a state transition.
    private readonly object _gate = new();

    private CircuitState _state = CircuitState.Closed;
    private DateTimeOffset _openedAt;
    private DateTimeOffset _lastStateChange;
    private bool _probeInFlight;

    /// <summary>Name of the circuit; used in log messages and as the "circuit" metric tag.</summary>
    public string Name { get; }

    /// <summary>Current state of the circuit.</summary>
    public CircuitState State
    {
        get
        {
            lock (_gate)
            {
                return _state;
            }
        }
    }

    /// <summary>UTC time of the most recent state transition (construction time initially).</summary>
    public DateTimeOffset LastStateChange
    {
        get
        {
            lock (_gate)
            {
                return _lastStateChange;
            }
        }
    }

    /// <summary>Creates a closed circuit breaker.</summary>
    /// <param name="name">Name used in logs and metric tags.</param>
    /// <param name="config">Thresholds and durations for this breaker.</param>
    /// <param name="logger">Logger for state transitions.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> or <paramref name="logger"/> is null.</exception>
    public CircuitBreaker(
        string name,
        CircuitBreakerConfig config,
        ILogger<CircuitBreaker> logger)
    {
        ArgumentNullException.ThrowIfNull(config);
        ArgumentNullException.ThrowIfNull(logger);

        Name = name;
        _config = config;
        _logger = logger;
        _window = new SlidingWindow(config.SamplingDuration);
        _lastStateChange = DateTimeOffset.UtcNow;
    }

    /// <summary>
    /// Checks if request is allowed through the circuit.
    /// </summary>
    /// <param name="cancellationToken">When already cancelled, the returned task is cancelled.</param>
    /// <returns>
    /// <c>true</c> when the circuit is closed, or when the caller has been chosen as the single
    /// half-open probe; <c>false</c> otherwise. A caller that receives <c>true</c> must later call
    /// <see cref="RecordSuccess"/> or <see cref="RecordFailure"/>, otherwise a half-open probe
    /// permit is never returned.
    /// </returns>
    public Task<bool> AllowRequestAsync(CancellationToken cancellationToken)
    {
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled<bool>(cancellationToken);
        }

        // The decision never waits, so it completes synchronously.
        return Task.FromResult(TryAdmit());
    }

    /// <summary>
    /// Records a successful request. While half-open this closes the circuit.
    /// </summary>
    public void RecordSuccess()
    {
        lock (_gate)
        {
            _window.RecordSuccess();

            if (_state == CircuitState.HalfOpen)
            {
                _probeInFlight = false;
                TransitionToClosed();
            }
        }
    }

    /// <summary>
    /// Records a failed request. While half-open this re-opens the circuit; while closed it
    /// opens the circuit once the configured thresholds are reached.
    /// </summary>
    public void RecordFailure()
    {
        lock (_gate)
        {
            _window.RecordFailure();

            if (_state == CircuitState.HalfOpen)
            {
                _probeInFlight = false;
                TransitionToOpen(_window.GetStats());
            }
            else if (_state == CircuitState.Closed)
            {
                var stats = _window.GetStats();
                if (ShouldTrip(stats))
                {
                    TransitionToOpen(stats);
                }
            }
        }
    }

    // Decides admission and performs the Open -> HalfOpen transition atomically.
    private bool TryAdmit()
    {
        lock (_gate)
        {
            switch (_state)
            {
                case CircuitState.Closed:
                    return true;

                case CircuitState.Open:
                    if (DateTimeOffset.UtcNow - _openedAt < _config.BreakDuration)
                    {
                        return false;
                    }

                    // The caller that performs the transition becomes the probe.
                    TransitionToHalfOpen();
                    _probeInFlight = true;
                    return true;

                case CircuitState.HalfOpen:
                    // Only allow one request at a time in half-open.
                    if (_probeInFlight)
                    {
                        return false;
                    }

                    _probeInFlight = true;
                    return true;

                default:
                    return false;
            }
        }
    }

    // True when the window has enough traffic and either the failure ratio or the
    // absolute failure count has reached its threshold.
    private bool ShouldTrip(WindowStats stats)
    {
        if (stats.TotalRequests == 0 || stats.TotalRequests < _config.MinimumThroughput)
        {
            return false;
        }

        var failureRatio = (double)stats.Failures / stats.TotalRequests;

        return failureRatio >= _config.FailureRatioThreshold
            || stats.Failures >= _config.FailureThreshold;
    }

    // Caller must hold _gate.
    private void TransitionToOpen(WindowStats stats)
    {
        _state = CircuitState.Open;
        _openedAt = DateTimeOffset.UtcNow;
        _lastStateChange = _openedAt;

        _logger.LogWarning(
            "Circuit {Name} opened. Failures: {Failures}, Ratio: {Ratio:P2}",
            Name,
            stats.Failures,
            (double)stats.Failures / Math.Max(1, stats.TotalRequests));

        StellaMetrics.CircuitBreakerState.Record((int)CircuitState.Open,
            new TagList { { "circuit", Name } });
    }

    // Caller must hold _gate.
    private void TransitionToHalfOpen()
    {
        _state = CircuitState.HalfOpen;
        _lastStateChange = DateTimeOffset.UtcNow;
        _window.Reset();

        _logger.LogInformation("Circuit {Name} transitioning to half-open", Name);

        StellaMetrics.CircuitBreakerState.Record((int)CircuitState.HalfOpen,
            new TagList { { "circuit", Name } });
    }

    // Caller must hold _gate.
    private void TransitionToClosed()
    {
        _state = CircuitState.Closed;
        _lastStateChange = DateTimeOffset.UtcNow;
        _window.Reset();

        _logger.LogInformation("Circuit {Name} closed", Name);

        StellaMetrics.CircuitBreakerState.Record((int)CircuitState.Closed,
            new TagList { { "circuit", Name } });
    }
}
|
|
|
|
/// <summary>
/// Sliding window for tracking success/failure counts.
/// </summary>
/// <remarks>
/// Events older than the configured duration are evicted whenever the window is written or
/// read. All operations take a private lock so that eviction (inspect the oldest event, then
/// remove it) is atomic; without it, two concurrent evictions could remove an event that is
/// still inside the window.
/// </remarks>
internal sealed class SlidingWindow
{
    private readonly TimeSpan _duration;
    private readonly object _sync = new();
    private readonly Queue<(DateTimeOffset Time, bool Success)> _events = new();

    /// <summary>Creates an empty window.</summary>
    /// <param name="duration">How long an event stays in the window.</param>
    public SlidingWindow(TimeSpan duration)
    {
        _duration = duration;
    }

    /// <summary>Adds a success event stamped with the current UTC time.</summary>
    public void RecordSuccess() => Record(success: true);

    /// <summary>Adds a failure event stamped with the current UTC time.</summary>
    public void RecordFailure() => Record(success: false);

    /// <summary>Counts the successes and failures currently inside the window.</summary>
    public WindowStats GetStats()
    {
        lock (_sync)
        {
            EvictExpired(DateTimeOffset.UtcNow);

            var successes = 0;
            var failures = 0;

            foreach (var (_, success) in _events)
            {
                if (success)
                {
                    successes++;
                }
                else
                {
                    failures++;
                }
            }

            return new WindowStats(successes, failures);
        }
    }

    /// <summary>Removes every recorded event.</summary>
    public void Reset()
    {
        lock (_sync)
        {
            _events.Clear();
        }
    }

    private void Record(bool success)
    {
        lock (_sync)
        {
            // Timestamp inside the lock so the queue stays ordered by time.
            var now = DateTimeOffset.UtcNow;
            _events.Enqueue((now, success));
            EvictExpired(now);
        }
    }

    // Caller must hold _sync.
    private void EvictExpired(DateTimeOffset now)
    {
        var cutoff = now - _duration;

        while (_events.Count > 0 && _events.Peek().Time < cutoff)
        {
            _events.Dequeue();
        }
    }
}

/// <summary>Success and failure counts observed in a <see cref="SlidingWindow"/>.</summary>
/// <param name="Successes">Number of success events.</param>
/// <param name="Failures">Number of failure events.</param>
internal readonly record struct WindowStats(int Successes, int Failures)
{
    /// <summary>Sum of <see cref="Successes"/> and <see cref="Failures"/>.</summary>
    public int TotalRequests => Successes + Failures;
}
|
|
```
|
|
|
|
---
|
|
|
|
## Retry Policy Configuration
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// Settings that control how many times, and with what delays, an operation is retried.
/// </summary>
public class RetryPolicyConfig
{
    /// <summary>Maximum number of retries. Defaults to 3.</summary>
    public int MaxRetries { get; set; } = 3;

    /// <summary>Delay before the first retry. Defaults to 100 ms.</summary>
    public TimeSpan InitialDelay { get; set; } = TimeSpan.FromMilliseconds(100);

    /// <summary>Upper bound for the delay between retries. Defaults to 10 seconds.</summary>
    public TimeSpan MaxDelay { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>Multiplier applied per attempt for exponential backoff. Defaults to 2.0.</summary>
    public double BackoffMultiplier { get; set; } = 2.0;

    /// <summary>Whether jitter is added to delays. Defaults to <c>true</c>.</summary>
    public bool UseJitter { get; set; } = true;

    /// <summary>Maximum jitter to add, as a fraction of the delay. Defaults to 0.25.</summary>
    public double MaxJitterPercent { get; set; } = 0.25;

    /// <summary>HTTP status codes that trigger a retry.</summary>
    public HashSet<int> RetryableStatusCodes { get; set; } = new HashSet<int>
    {
        408, 429, 500, 502, 503, 504,
    };

    /// <summary>Exception types that trigger a retry.</summary>
    public HashSet<Type> RetryableExceptions { get; set; } = new HashSet<Type>
    {
        typeof(TimeoutException),
        typeof(HttpRequestException),
        typeof(IOException),
    };
}
|
|
```
|
|
|
|
---
|
|
|
|
## Retry Policy Implementation
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// Executes operations with retry logic.
/// </summary>
/// <remarks>
/// An operation is attempted once and then retried up to <c>MaxRetries</c> times, waiting an
/// exponentially growing (optionally jittered) delay between attempts.
/// </remarks>
public sealed class RetryPolicy
{
    private readonly RetryPolicyConfig _config;
    private readonly ILogger<RetryPolicy> _logger;

    /// <summary>Creates a retry policy.</summary>
    /// <param name="config">Retry limits, delays and retryable conditions.</param>
    /// <param name="logger">Logger for retry diagnostics.</param>
    public RetryPolicy(RetryPolicyConfig config, ILogger<RetryPolicy> logger)
    {
        _config = config;
        _logger = logger;
    }

    /// <summary>
    /// Executes an operation with retry logic.
    /// </summary>
    /// <typeparam name="T">Result type of the operation.</typeparam>
    /// <param name="operation">The operation to run; invoked once per attempt.</param>
    /// <param name="shouldRetry">Returns <c>true</c> when a result should be retried.</param>
    /// <param name="cancellationToken">Passed to the operation and observed while waiting between attempts.</param>
    /// <returns>
    /// The first result that <paramref name="shouldRetry"/> accepts, or the last result once
    /// the retries are exhausted.
    /// </returns>
    /// <remarks>
    /// Exceptions that are not retryable, or that occur after the retries are exhausted or
    /// after cancellation was requested, propagate to the caller.
    /// </remarks>
    public async Task<T> ExecuteAsync<T>(
        Func<CancellationToken, Task<T>> operation,
        Func<T, bool> shouldRetry,
        CancellationToken cancellationToken)
    {
        var attempt = 0;
        var totalDelay = TimeSpan.Zero;

        while (true)
        {
            attempt++;
            TimeSpan delay;

            try
            {
                var result = await operation(cancellationToken);
                var retryable = shouldRetry(result);

                if (!retryable || attempt > _config.MaxRetries)
                {
                    if (retryable)
                    {
                        // Retries are used up and the result is still one we would retry:
                        // hand it back, but do not report it as a success.
                        _logger.LogWarning(
                            "Retries exhausted after {Attempts} attempts, total delay: {TotalDelay}ms; returning last result",
                            attempt, totalDelay.TotalMilliseconds);
                    }
                    else if (attempt > 1)
                    {
                        _logger.LogDebug(
                            "Operation succeeded after {Attempts} attempts, total delay: {TotalDelay}ms",
                            attempt, totalDelay.TotalMilliseconds);
                    }

                    return result;
                }

                delay = CalculateDelay(attempt);

                _logger.LogDebug(
                    "Retrying operation (attempt {Attempt}/{MaxRetries}) after {Delay}ms",
                    attempt, _config.MaxRetries, delay.TotalMilliseconds);
            }
            catch (Exception ex) when (
                !cancellationToken.IsCancellationRequested
                && ShouldRetry(ex)
                && attempt <= _config.MaxRetries)
            {
                delay = CalculateDelay(attempt);

                _logger.LogWarning(
                    ex,
                    "Operation failed (attempt {Attempt}/{MaxRetries}), retrying after {Delay}ms",
                    attempt, _config.MaxRetries, delay.TotalMilliseconds);
            }

            // Wait outside the try block so that a cancellation thrown by the delay is
            // never mistaken for a retryable failure of the operation.
            totalDelay += delay;
            await Task.Delay(delay, cancellationToken);
        }
    }

    /// <summary>
    /// Executes an operation with retry logic (response payload variant).
    /// </summary>
    /// <param name="operation">The operation to run; invoked once per attempt.</param>
    /// <param name="cancellationToken">Passed to the operation and observed while waiting between attempts.</param>
    /// <returns>The response of the last attempt; retried while its status code is in <c>RetryableStatusCodes</c>.</returns>
    public Task<ResponsePayload> ExecuteAsync(
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken)
    {
        return ExecuteAsync(
            operation,
            response => _config.RetryableStatusCodes.Contains(response.StatusCode),
            cancellationToken);
    }

    // True when the exception is, or derives from, one of the configured retryable types.
    private bool ShouldRetry(Exception ex)
    {
        var exType = ex.GetType();
        return _config.RetryableExceptions.Any(t => t.IsAssignableFrom(exType));
    }

    // Delay before the retry that follows the given 1-based attempt.
    private TimeSpan CalculateDelay(int attempt)
    {
        var maxMs = _config.MaxDelay.TotalMilliseconds;

        // Exponential backoff, computed and capped as a double: converting the uncapped
        // value to a TimeSpan first overflows once the exponent gets large.
        var delayMs = _config.InitialDelay.TotalMilliseconds
            * Math.Pow(_config.BackoffMultiplier, attempt - 1);

        delayMs = double.IsNaN(delayMs) ? maxMs : Math.Min(delayMs, maxMs);

        // Jitter is added on top of the capped delay.
        if (_config.UseJitter)
        {
            delayMs += delayMs * _config.MaxJitterPercent * Random.Shared.NextDouble();
        }

        // Task.Delay rejects negative delays, which misconfiguration could otherwise produce.
        return TimeSpan.FromMilliseconds(Math.Max(0, delayMs));
    }
}
|
|
```
|
|
|
|
---
|
|
|
|
## Resilience Policy Executor
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// A policy that combines circuit breaker and retry handling around a service call.
/// </summary>
public interface IResiliencePolicy
{
    /// <summary>Runs <paramref name="operation"/> under the policy for the named service.</summary>
    /// <param name="serviceName">Name of the target service.</param>
    /// <param name="operation">The call to protect.</param>
    /// <param name="cancellationToken">Token used to cancel the call.</param>
    /// <returns>The response payload produced for the call.</returns>
    Task<ResponsePayload> ExecuteAsync(
        string serviceName,
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken);
}
|
|
|
|
/// <summary>
/// Default <see cref="IResiliencePolicy"/>: consults the service's circuit breaker, runs the
/// operation through the retry policy, and reports the outcome back to the breaker.
/// </summary>
public sealed class ResiliencePolicy : IResiliencePolicy
{
    private readonly ICircuitBreakerRegistry _circuitBreakers;
    private readonly RetryPolicy _retryPolicy;
    private readonly ResilienceConfig _config;
    private readonly ILogger<ResiliencePolicy> _logger;

    /// <summary>Creates the policy.</summary>
    /// <param name="circuitBreakers">Source of per-service circuit breakers.</param>
    /// <param name="retryPolicy">Retry policy applied to admitted requests.</param>
    /// <param name="config">Resilience settings, including the optional fallback response.</param>
    /// <param name="logger">Logger for rejected requests.</param>
    public ResiliencePolicy(
        ICircuitBreakerRegistry circuitBreakers,
        RetryPolicy retryPolicy,
        IOptions<ResilienceConfig> config,
        ILogger<ResiliencePolicy> logger)
    {
        _circuitBreakers = circuitBreakers;
        _retryPolicy = retryPolicy;
        _config = config.Value;
        _logger = logger;
    }

    /// <summary>
    /// Runs <paramref name="operation"/> for <paramref name="serviceName"/> under the
    /// circuit breaker and retry policy.
    /// </summary>
    /// <param name="serviceName">Name of the target service; selects the circuit breaker.</param>
    /// <param name="operation">The call to protect.</param>
    /// <param name="cancellationToken">Token used to cancel the call.</param>
    /// <returns>
    /// The operation's response, or, when the breaker rejects the request, the configured
    /// fallback response (a generated 503 response if none is configured).
    /// </returns>
    /// <remarks>
    /// Exceptions thrown by the operation are recorded as failures and rethrown.
    /// </remarks>
    public async Task<ResponsePayload> ExecuteAsync(
        string serviceName,
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken)
    {
        var circuitBreaker = _circuitBreakers.GetOrCreate(serviceName);

        // Check circuit breaker
        if (!await circuitBreaker.AllowRequestAsync(cancellationToken))
        {
            _logger.LogWarning("Circuit breaker {Name} is open, rejecting request", serviceName);

            return _config.FallbackResponse ?? CreateRejectionResponse(serviceName);
        }

        ResponsePayload response;

        try
        {
            // Execute with retry
            response = await _retryPolicy.ExecuteAsync(operation, cancellationToken);
        }
        catch (Exception)
        {
            circuitBreaker.RecordFailure();
            throw;
        }

        // Every admitted request must report exactly one outcome: a half-open probe that
        // reports nothing would leave the breaker rejecting requests indefinitely.
        // Responses outside the configured failure codes (4xx included) show the service
        // is answering, so they are recorded as successes.
        if (IsFailure(response))
        {
            circuitBreaker.RecordFailure();
        }
        else
        {
            circuitBreaker.RecordSuccess();
        }

        return response;
    }

    // Builds the default 503 response returned while the breaker rejects requests.
    private ResponsePayload CreateRejectionResponse(string serviceName)
    {
        // Advertise the configured break duration (at least one second) instead of a
        // hard-coded value; with the default configuration this is still "30".
        var retryAfterSeconds = Math.Max(
            1,
            (int)Math.Ceiling(_config.CircuitBreaker.BreakDuration.TotalSeconds));

        return new ResponsePayload
        {
            StatusCode = 503,
            Headers = new Dictionary<string, string>
            {
                ["X-Circuit-Breaker"] = "open",
                ["Retry-After"] = retryAfterSeconds.ToString(System.Globalization.CultureInfo.InvariantCulture)
            },
            Body = Encoding.UTF8.GetBytes(JsonSerializer.Serialize(new
            {
                error = "Service temporarily unavailable",
                service = serviceName
            })),
            IsFinalChunk = true
        };
    }

    // True when the status code is one of the configured circuit-breaker failure codes.
    private bool IsFailure(ResponsePayload response)
    {
        return _config.CircuitBreaker.FailureStatusCodes.Contains(response.StatusCode);
    }
}
|
|
|
|
/// <summary>
/// Root resilience settings: circuit breaker, retry, and the optional fallback response.
/// </summary>
public class ResilienceConfig
{
    /// <summary>Circuit breaker settings; a default instance unless replaced.</summary>
    public CircuitBreakerConfig CircuitBreaker { get; set; } = new CircuitBreakerConfig();

    /// <summary>Retry settings; a default instance unless replaced.</summary>
    public RetryPolicyConfig Retry { get; set; } = new RetryPolicyConfig();

    /// <summary>Response returned when a request is rejected; <c>null</c> when none is configured.</summary>
    public ResponsePayload? FallbackResponse { get; set; }
}
|
|
```
|
|
|
|
---
|
|
|
|
## Circuit Breaker Registry
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// Registry of circuit breakers, keyed by service name.
/// </summary>
public interface ICircuitBreakerRegistry
{
    /// <summary>Returns the breaker for <paramref name="name"/>, creating it when none exists yet.</summary>
    CircuitBreaker GetOrCreate(string name);

    /// <summary>Returns all registered breakers keyed by name.</summary>
    IReadOnlyDictionary<string, CircuitBreaker> GetAll();

    /// <summary>Resets the breaker registered under <paramref name="name"/>.</summary>
    void Reset(string name);

    /// <summary>Resets every registered breaker.</summary>
    void ResetAll();
}
|
|
|
|
/// <summary>
/// <see cref="ICircuitBreakerRegistry"/> backed by a concurrent dictionary. Every breaker
/// is created from the same <see cref="CircuitBreakerConfig"/>.
/// </summary>
public sealed class CircuitBreakerRegistry : ICircuitBreakerRegistry
{
    private readonly ConcurrentDictionary<string, CircuitBreaker> _breakers = new();
    private readonly CircuitBreakerConfig _config;
    private readonly ILoggerFactory _loggerFactory;

    /// <summary>Creates an empty registry.</summary>
    /// <param name="config">Settings applied to every breaker the registry creates.</param>
    /// <param name="loggerFactory">Factory for the breakers' loggers.</param>
    public CircuitBreakerRegistry(
        IOptions<CircuitBreakerConfig> config,
        ILoggerFactory loggerFactory)
    {
        _config = config.Value;
        _loggerFactory = loggerFactory;
    }

    /// <inheritdoc />
    public CircuitBreaker GetOrCreate(string name) => _breakers.GetOrAdd(name, CreateBreaker);

    /// <inheritdoc />
    /// <remarks>Returns the registry's own dictionary, not a copy.</remarks>
    public IReadOnlyDictionary<string, CircuitBreaker> GetAll() => _breakers;

    /// <inheritdoc />
    /// <remarks>
    /// The breaker is removed; it will be recreated fresh on the next request for that name.
    /// </remarks>
    public void Reset(string name) => _breakers.TryRemove(name, out _);

    /// <inheritdoc />
    public void ResetAll() => _breakers.Clear();

    // Builds a breaker for the given name with the shared configuration.
    private CircuitBreaker CreateBreaker(string breakerName) =>
        new CircuitBreaker(breakerName, _config, _loggerFactory.CreateLogger<CircuitBreaker>());
}
|
|
```
|
|
|
|
---
|
|
|
|
## Bulkhead Pattern
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// Bulkhead pattern - limits concurrent requests to a service.
/// </summary>
/// <remarks>
/// At most <c>MaxConcurrency</c> leases are outstanding at a time. A caller that cannot get
/// a slot immediately waits up to <c>QueueTimeout</c>, provided no more than
/// <c>MaxQueueSize</c> callers are waiting.
/// </remarks>
public sealed class Bulkhead
{
    private readonly SemaphoreSlim _semaphore;
    private readonly BulkheadConfig _config;
    private readonly string _name;

    // Captured at construction: the semaphore's capacity is fixed, so later changes to
    // the config object must not skew ActiveRequests.
    private readonly int _maxConcurrency;

    private int _queuedRequests;

    /// <summary>Name given to this bulkhead.</summary>
    public string Name => _name;

    /// <summary>Number of slots currently held.</summary>
    public int ActiveRequests => _maxConcurrency - _semaphore.CurrentCount;

    /// <summary>Number of callers currently waiting for a slot.</summary>
    public int QueuedRequests => Volatile.Read(ref _queuedRequests);

    /// <summary>Creates a bulkhead with all slots free.</summary>
    /// <param name="name">Name of the bulkhead.</param>
    /// <param name="config">Concurrency and queueing limits.</param>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    public Bulkhead(string name, BulkheadConfig config)
    {
        ArgumentNullException.ThrowIfNull(config);

        _name = name;
        _config = config;
        _maxConcurrency = config.MaxConcurrency;
        _semaphore = new SemaphoreSlim(_maxConcurrency, _maxConcurrency);
    }

    /// <summary>
    /// Acquires a slot in the bulkhead.
    /// </summary>
    /// <param name="cancellationToken">Cancels the acquisition, including the wait for a slot.</param>
    /// <returns>
    /// A lease that frees the slot when disposed, or <c>null</c> when the request is rejected
    /// because the queue is full or no slot became free within the queue timeout.
    /// </returns>
    /// <exception cref="OperationCanceledException">The token was cancelled.</exception>
    public async Task<IDisposable?> AcquireAsync(CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        // Fast path: a caller that gets a free slot never waits, so it must not be
        // counted against the queue limit (otherwise MaxQueueSize = 0 rejects everything).
        if (_semaphore.Wait(0))
        {
            return new BulkheadLease(_semaphore);
        }

        if (Interlocked.Increment(ref _queuedRequests) > _config.MaxQueueSize)
        {
            Interlocked.Decrement(ref _queuedRequests);
            return null; // Reject immediately
        }

        try
        {
            var acquired = await _semaphore.WaitAsync(_config.QueueTimeout, cancellationToken);
            return acquired ? new BulkheadLease(_semaphore) : null;
        }
        finally
        {
            // Single exit point for the queue counter, whether the wait succeeded,
            // timed out, or threw.
            Interlocked.Decrement(ref _queuedRequests);
        }
    }

    /// <summary>Holds one bulkhead slot and returns it on the first <see cref="Dispose"/> call.</summary>
    private sealed class BulkheadLease : IDisposable
    {
        private SemaphoreSlim? _semaphore;

        public BulkheadLease(SemaphoreSlim semaphore)
        {
            _semaphore = semaphore;
        }

        public void Dispose()
        {
            // The exchange makes disposal idempotent even when called concurrently, so a
            // slot can never be released twice.
            Interlocked.Exchange(ref _semaphore, null)?.Release();
        }
    }
}

/// <summary>Limits applied by a <see cref="Bulkhead"/>.</summary>
public class BulkheadConfig
{
    /// <summary>Maximum number of concurrently held slots. Defaults to 100.</summary>
    public int MaxConcurrency { get; set; } = 100;

    /// <summary>Maximum number of callers allowed to wait for a slot. Defaults to 50.</summary>
    public int MaxQueueSize { get; set; } = 50;

    /// <summary>Longest time a caller waits for a slot. Defaults to 10 seconds.</summary>
    public TimeSpan QueueTimeout { get; set; } = TimeSpan.FromSeconds(10);
}
|
|
```
|
|
|
|
---
|
|
|
|
## Resilience Middleware
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Gateway;
|
|
|
|
/// <summary>
/// Middleware that applies resilience policies to requests.
/// </summary>
/// <remarks>
/// For requests that carry a "service" route value, transient exceptions thrown by the rest
/// of the pipeline are converted into a 503 response with a <c>Retry-After</c> header.
/// NOTE(review): the injected <see cref="IResiliencePolicy"/> is stored but not used by
/// <see cref="InvokeAsync"/> - confirm whether the pipeline call was meant to run through it.
/// </remarks>
public sealed class ResilienceMiddleware
{
    // Seconds advertised to clients in both the Retry-After header and the JSON body.
    private const int RetryAfterSeconds = 30;

    private readonly RequestDelegate _next;
    private readonly IResiliencePolicy _policy;

    /// <summary>Creates the middleware.</summary>
    /// <param name="next">Next delegate in the request pipeline.</param>
    /// <param name="policy">Resilience policy (see the review note on the class).</param>
    public ResilienceMiddleware(RequestDelegate next, IResiliencePolicy policy)
    {
        _next = next;
        _policy = policy;
    }

    /// <summary>Invokes the rest of the pipeline, translating transient failures into 503.</summary>
    /// <param name="context">The current HTTP context.</param>
    public async Task InvokeAsync(HttpContext context)
    {
        // Get target service from route data
        var serviceName = context.GetRouteValue("service")?.ToString();

        if (string.IsNullOrEmpty(serviceName))
        {
            await _next(context);
            return;
        }

        try
        {
            await _next(context);
        }
        catch (Exception ex) when (CanConvertToUnavailable(context, ex))
        {
            // Convert to 503 with retry information
            context.Response.StatusCode = 503;
            context.Response.Headers["Retry-After"] =
                RetryAfterSeconds.ToString(System.Globalization.CultureInfo.InvariantCulture);
            await context.Response.WriteAsJsonAsync(new
            {
                error = "Service temporarily unavailable",
                retryAfter = RetryAfterSeconds
            });
        }
    }

    // A 503 can only be produced while the response has not started (status and headers are
    // read-only afterwards) and is pointless once the client has aborted the request; in
    // both cases the original exception is left to propagate.
    private bool CanConvertToUnavailable(HttpContext context, Exception ex)
    {
        return !context.Response.HasStarted
            && !context.RequestAborted.IsCancellationRequested
            && IsTransientException(ex);
    }

    private bool IsTransientException(Exception ex)
    {
        return ex is TimeoutException or
               HttpRequestException or
               TaskCanceledException;
    }
}
|
|
```
|
|
|
|
---
|
|
|
|
## Service Registration
|
|
|
|
```csharp
|
|
namespace StellaOps.Router.Resilience;
|
|
|
|
/// <summary>
/// Dependency-injection registration for the resilience components.
/// </summary>
public static class ResilienceExtensions
{
    /// <summary>
    /// Binds the resilience option types from the "Resilience" configuration section and
    /// registers the circuit breaker registry, retry policy and resilience policy as singletons.
    /// </summary>
    /// <param name="services">The service collection to add to.</param>
    /// <param name="configuration">Configuration root that contains the "Resilience" section.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddStellaResilience(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        services.Configure<ResilienceConfig>(configuration.GetSection("Resilience"));
        services.Configure<CircuitBreakerConfig>(configuration.GetSection("Resilience:CircuitBreaker"));
        services.Configure<RetryPolicyConfig>(configuration.GetSection("Resilience:Retry"));
        services.Configure<BulkheadConfig>(configuration.GetSection("Resilience:Bulkhead"));

        services.AddSingleton<ICircuitBreakerRegistry, CircuitBreakerRegistry>();

        // RetryPolicy's constructor takes RetryPolicyConfig directly, but Configure<T> only
        // registers IOptions<RetryPolicyConfig>. Without this factory the container cannot
        // activate RetryPolicy (and therefore ResiliencePolicy either).
        services.AddSingleton(provider => new RetryPolicy(
            provider.GetRequiredService<Microsoft.Extensions.Options.IOptions<RetryPolicyConfig>>().Value,
            provider.GetRequiredService<Microsoft.Extensions.Logging.ILogger<RetryPolicy>>()));

        services.AddSingleton<IResiliencePolicy, ResiliencePolicy>();

        return services;
    }
}
|
|
```
|
|
|
|
---
|
|
|
|
## YAML Configuration
|
|
|
|
```yaml
|
|
# Sample settings; the values mirror the defaults declared on the C# option classes.
Resilience:
  CircuitBreaker:
    FailureThreshold: 5
    SamplingDuration: "00:00:30"
    BreakDuration: "00:00:30"
    MinimumThroughput: 10
    FailureRatioThreshold: 0.5
    FailureStatusCodes:
      - 500
      - 502
      - 503
      - 504

  Retry:
    MaxRetries: 3
    InitialDelay: "00:00:00.100"
    MaxDelay: "00:00:10"
    BackoffMultiplier: 2.0
    UseJitter: true
    MaxJitterPercent: 0.25
    RetryableStatusCodes:
      - 408
      - 429
      - 500   # included so the sample matches the RetryPolicyConfig default set
      - 502
      - 503
      - 504

  Bulkhead:
    MaxConcurrency: 100
    MaxQueueSize: 50
    QueueTimeout: "00:00:10"
|
|
```
|
|
|
|
---
|
|
|
|
## Deliverables

1. `StellaOps.Router.Resilience/CircuitBreaker.cs`
2. `StellaOps.Router.Resilience/CircuitBreakerConfig.cs`
3. `StellaOps.Router.Resilience/ICircuitBreakerRegistry.cs`
4. `StellaOps.Router.Resilience/CircuitBreakerRegistry.cs`
5. `StellaOps.Router.Resilience/RetryPolicy.cs`
6. `StellaOps.Router.Resilience/RetryPolicyConfig.cs`
7. `StellaOps.Router.Resilience/IResiliencePolicy.cs`
8. `StellaOps.Router.Resilience/ResiliencePolicy.cs`
9. `StellaOps.Router.Resilience/Bulkhead.cs`
10. `StellaOps.Router.Gateway/ResilienceMiddleware.cs`
11. Circuit breaker state transition tests
12. Retry policy tests
13. Bulkhead tests

---
|
|
|
|
## Next Step

Proceed to [Step 25: Configuration Hot-Reload](25-Step.md) to implement dynamic configuration updates.