# Step 24: Circuit Breaker & Retry Policies

**Phase 6: Observability & Resilience**
**Estimated Complexity:** High
**Dependencies:** Step 23 (Metrics & Health Checks)

---

## Overview

Circuit breakers and retry policies protect the system from cascading failures and transient errors. The circuit breaker stops sending requests to an upstream that is persistently failing, while retry policies automatically retry failed requests with exponential backoff.

---

## Goals

1. Implement the circuit breaker pattern for service protection
2. Support configurable retry policies
3. Enable per-service and per-endpoint policies
4. Integrate with metrics for observability
5. Provide graceful degradation strategies

---

## Circuit Breaker Configuration

```csharp
namespace StellaOps.Router.Resilience;

public class CircuitBreakerConfig
{
    /// <summary>Number of failures before opening the circuit.</summary>
    public int FailureThreshold { get; set; } = 5;

    /// <summary>Time window for counting failures.</summary>
    public TimeSpan SamplingDuration { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>How long to stay open before testing.</summary>
    public TimeSpan BreakDuration { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>Minimum throughput before the circuit can trip.</summary>
    public int MinimumThroughput { get; set; } = 10;

    /// <summary>Failure ratio that trips the circuit (0.0 to 1.0).</summary>
    public double FailureRatioThreshold { get; set; } = 0.5;

    /// <summary>HTTP status codes considered failures.</summary>
    public HashSet<int> FailureStatusCodes { get; set; } = new() { 500, 502, 503, 504 };

    /// <summary>Exception types considered failures.</summary>
    public HashSet<Type> FailureExceptions { get; set; } = new()
    {
        typeof(TimeoutException),
        typeof(TaskCanceledException),
        typeof(HttpRequestException)
    };
}
```
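To make the interaction of the thresholds concrete, the sketch below tunes a stricter breaker for a hypothetical latency-sensitive upstream. The values are illustrative only, not recommended defaults:

```csharp
// Illustrative tuning for a hypothetical latency-sensitive upstream.
// With these values the circuit opens once at least 20 requests were observed
// in the 10-second window and either 30% of them failed or 5 absolute
// failures occurred; it then stays open for 15 seconds before probing.
var checkoutBreakerConfig = new CircuitBreakerConfig
{
    FailureThreshold = 5,
    SamplingDuration = TimeSpan.FromSeconds(10),
    BreakDuration = TimeSpan.FromSeconds(15),
    MinimumThroughput = 20,
    FailureRatioThreshold = 0.3
};
```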
---

## Circuit Breaker Implementation

```csharp
namespace StellaOps.Router.Resilience;

public enum CircuitState
{
    Closed = 0,   // Normal operation
    Open = 2,     // Blocking requests
    HalfOpen = 1  // Testing with limited requests
}

/// <summary>
/// Circuit breaker for a single service or endpoint.
/// </summary>
public sealed class CircuitBreaker
{
    private readonly CircuitBreakerConfig _config;
    private readonly ILogger _logger;
    private readonly SlidingWindow _window;
    private CircuitState _state = CircuitState.Closed;
    private DateTimeOffset _openedAt;
    private readonly SemaphoreSlim _halfOpenLock = new(1, 1);

    public string Name { get; }
    public CircuitState State => _state;
    public DateTimeOffset LastStateChange { get; private set; }

    public CircuitBreaker(
        string name,
        CircuitBreakerConfig config,
        ILogger logger)
    {
        Name = name;
        _config = config;
        _logger = logger;
        _window = new SlidingWindow(config.SamplingDuration);
        LastStateChange = DateTimeOffset.UtcNow;
    }

    /// <summary>
    /// Checks whether a request is allowed through the circuit.
    /// </summary>
    public async Task<bool> AllowRequestAsync(CancellationToken cancellationToken)
    {
        switch (_state)
        {
            case CircuitState.Closed:
                return true;

            case CircuitState.Open:
                if (DateTimeOffset.UtcNow - _openedAt >= _config.BreakDuration)
                {
                    await TryTransitionToHalfOpenAsync();
                }
                return _state == CircuitState.HalfOpen;

            case CircuitState.HalfOpen:
                // Only allow one request at a time in half-open
                return await _halfOpenLock.WaitAsync(0, cancellationToken);

            default:
                return false;
        }
    }

    /// <summary>
    /// Records a successful request.
    /// </summary>
    public void RecordSuccess()
    {
        _window.RecordSuccess();

        if (_state == CircuitState.HalfOpen)
        {
            TransitionToClosed();
            _halfOpenLock.Release();
        }
    }

    /// <summary>
    /// Records a failed request.
    /// </summary>
    public void RecordFailure()
    {
        _window.RecordFailure();

        if (_state == CircuitState.HalfOpen)
        {
            TransitionToOpen();
            _halfOpenLock.Release();
        }
        else if (_state == CircuitState.Closed)
        {
            CheckThreshold();
        }
    }

    private void CheckThreshold()
    {
        var stats = _window.GetStats();
        if (stats.TotalRequests < _config.MinimumThroughput)
            return;

        var failureRatio = (double)stats.Failures / stats.TotalRequests;
        if (failureRatio >= _config.FailureRatioThreshold ||
            stats.Failures >= _config.FailureThreshold)
        {
            TransitionToOpen();
        }
    }

    private void TransitionToOpen()
    {
        _state = CircuitState.Open;
        _openedAt = DateTimeOffset.UtcNow;
        LastStateChange = _openedAt;

        var stats = _window.GetStats();
        _logger.LogWarning(
            "Circuit {Name} opened. Failures: {Failures}, Ratio: {Ratio:P2}",
            Name,
            stats.Failures,
            (double)stats.Failures / Math.Max(1, stats.TotalRequests));

        StellaMetrics.CircuitBreakerState.Record((int)CircuitState.Open,
            new TagList { { "circuit", Name } });
    }

    private async Task TryTransitionToHalfOpenAsync()
    {
        if (_state != CircuitState.Open)
            return;

        if (await _halfOpenLock.WaitAsync(0))
        {
            _state = CircuitState.HalfOpen;
            LastStateChange = DateTimeOffset.UtcNow;
            _window.Reset();

            _logger.LogInformation("Circuit {Name} transitioning to half-open", Name);

            StellaMetrics.CircuitBreakerState.Record((int)CircuitState.HalfOpen,
                new TagList { { "circuit", Name } });
        }
    }

    private void TransitionToClosed()
    {
        _state = CircuitState.Closed;
        LastStateChange = DateTimeOffset.UtcNow;
        _window.Reset();

        _logger.LogInformation("Circuit {Name} closed", Name);

        StellaMetrics.CircuitBreakerState.Record((int)CircuitState.Closed,
            new TagList { { "circuit", Name } });
    }
}

/// <summary>
/// Sliding window for tracking success/failure counts.
/// </summary>
internal sealed class SlidingWindow
{
    private readonly TimeSpan _duration;
    private readonly ConcurrentQueue<(DateTimeOffset Time, bool Success)> _events = new();

    public SlidingWindow(TimeSpan duration)
    {
        _duration = duration;
    }

    public void RecordSuccess()
    {
        _events.Enqueue((DateTimeOffset.UtcNow, true));
        Cleanup();
    }

    public void RecordFailure()
    {
        _events.Enqueue((DateTimeOffset.UtcNow, false));
        Cleanup();
    }

    public WindowStats GetStats()
    {
        Cleanup();

        var successes = 0;
        var failures = 0;
        foreach (var evt in _events)
        {
            if (evt.Success) successes++;
            else failures++;
        }

        return new WindowStats(successes, failures);
    }

    public void Reset()
    {
        _events.Clear();
    }

    private void Cleanup()
    {
        var cutoff = DateTimeOffset.UtcNow - _duration;
        while (_events.TryPeek(out var evt) && evt.Time < cutoff)
        {
            _events.TryDequeue(out _);
        }
    }
}

internal readonly record struct WindowStats(int Successes, int Failures)
{
    public int TotalRequests => Successes + Failures;
}
```
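The expected call pattern around a proxied request is sketched below. `callUpstream` is a stand-in for whatever transport call the router actually makes, and the status-code classification is simplified to "5xx counts as failure"; every allowed request must be paired with exactly one `RecordSuccess`/`RecordFailure` call, otherwise a half-open permit would never be released:

```csharp
// Sketch: guarding a single upstream call with the CircuitBreaker above.
// callUpstream is a hypothetical delegate returning an HTTP status code.
static async Task<int> CallWithBreakerAsync(
    CircuitBreaker breaker,
    Func<CancellationToken, Task<int>> callUpstream,
    CancellationToken ct)
{
    if (!await breaker.AllowRequestAsync(ct))
    {
        return 503; // Fast-fail while the circuit is open (or the half-open probe is taken)
    }

    try
    {
        var statusCode = await callUpstream(ct);

        // Pair the allowed request with exactly one Record* call.
        if (statusCode >= 500) breaker.RecordFailure();
        else breaker.RecordSuccess();

        return statusCode;
    }
    catch (Exception)
    {
        breaker.RecordFailure();
        throw;
    }
}
```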
---

## Retry Policy Configuration

```csharp
namespace StellaOps.Router.Resilience;

public class RetryPolicyConfig
{
    /// <summary>Maximum number of retries.</summary>
    public int MaxRetries { get; set; } = 3;

    /// <summary>Initial delay before the first retry.</summary>
    public TimeSpan InitialDelay { get; set; } = TimeSpan.FromMilliseconds(100);

    /// <summary>Maximum delay between retries.</summary>
    public TimeSpan MaxDelay { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>Backoff multiplier for exponential delay.</summary>
    public double BackoffMultiplier { get; set; } = 2.0;

    /// <summary>Whether to add jitter to delays.</summary>
    public bool UseJitter { get; set; } = true;

    /// <summary>Maximum jitter to add (as a fraction of the delay).</summary>
    public double MaxJitterPercent { get; set; } = 0.25;

    /// <summary>HTTP status codes that trigger a retry.</summary>
    public HashSet<int> RetryableStatusCodes { get; set; } = new() { 408, 429, 500, 502, 503, 504 };

    /// <summary>Exception types that trigger a retry.</summary>
    public HashSet<Type> RetryableExceptions { get; set; } = new()
    {
        typeof(TimeoutException),
        typeof(HttpRequestException),
        typeof(IOException)
    };
}
```

---

## Retry Policy Implementation

```csharp
namespace StellaOps.Router.Resilience;

/// <summary>
/// Executes operations with retry logic.
/// </summary>
public sealed class RetryPolicy
{
    private readonly RetryPolicyConfig _config;
    private readonly ILogger _logger;

    public RetryPolicy(RetryPolicyConfig config, ILogger logger)
    {
        _config = config;
        _logger = logger;
    }

    /// <summary>
    /// Executes an operation with retry logic.
    /// </summary>
    public async Task<T> ExecuteAsync<T>(
        Func<CancellationToken, Task<T>> operation,
        Func<T, bool> shouldRetry,
        CancellationToken cancellationToken)
    {
        var attempt = 0;
        var totalDelay = TimeSpan.Zero;

        while (true)
        {
            try
            {
                attempt++;
                var result = await operation(cancellationToken);

                if (shouldRetry(result) && attempt <= _config.MaxRetries)
                {
                    var delay = CalculateDelay(attempt);
                    totalDelay += delay;

                    _logger.LogDebug(
                        "Retrying operation (attempt {Attempt}/{MaxRetries}) after {Delay}ms",
                        attempt, _config.MaxRetries, delay.TotalMilliseconds);

                    await Task.Delay(delay, cancellationToken);
                    continue;
                }

                if (attempt > 1)
                {
                    _logger.LogDebug(
                        "Operation succeeded after {Attempts} attempts, total delay: {TotalDelay}ms",
                        attempt, totalDelay.TotalMilliseconds);
                }

                return result;
            }
            catch (Exception ex) when (ShouldRetry(ex) && attempt <= _config.MaxRetries)
            {
                var delay = CalculateDelay(attempt);
                totalDelay += delay;

                _logger.LogWarning(
                    ex,
                    "Operation failed (attempt {Attempt}/{MaxRetries}), retrying after {Delay}ms",
                    attempt, _config.MaxRetries, delay.TotalMilliseconds);

                await Task.Delay(delay, cancellationToken);
            }
        }
    }

    /// <summary>
    /// Executes an operation with retry logic (response payload variant).
    /// </summary>
    public Task<ResponsePayload> ExecuteAsync(
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken)
    {
        return ExecuteAsync(
            operation,
            response => _config.RetryableStatusCodes.Contains(response.StatusCode),
            cancellationToken);
    }

    private bool ShouldRetry(Exception ex)
    {
        var exType = ex.GetType();
        return _config.RetryableExceptions.Any(t => t.IsAssignableFrom(exType));
    }

    private TimeSpan CalculateDelay(int attempt)
    {
        // Exponential backoff
        var delay = TimeSpan.FromMilliseconds(
            _config.InitialDelay.TotalMilliseconds *
            Math.Pow(_config.BackoffMultiplier, attempt - 1));

        // Cap at max delay
        if (delay > _config.MaxDelay)
        {
            delay = _config.MaxDelay;
        }

        // Add jitter
        if (_config.UseJitter)
        {
            var jitter = delay.TotalMilliseconds *
                _config.MaxJitterPercent *
                Random.Shared.NextDouble();
            delay = TimeSpan.FromMilliseconds(delay.TotalMilliseconds + jitter);
        }

        return delay;
    }
}
```
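A minimal usage sketch of the generic overload, wrapping an idempotent upstream GET. The URL is a placeholder and `NullLogger` is used only to keep the snippet self-contained; with the default config the base delays are 100 ms, 200 ms and 400 ms (before jitter) for retries 1 through 3:

```csharp
using Microsoft.Extensions.Logging.Abstractions;

// Sketch: retrying an idempotent health probe with the RetryPolicy above.
// "https://upstream.example/health" is a placeholder URL.
var policy = new RetryPolicy(new RetryPolicyConfig(), NullLogger.Instance);
using var http = new HttpClient();

var status = await policy.ExecuteAsync(
    async ct =>
    {
        using var response = await http.GetAsync("https://upstream.example/health", ct);
        return (int)response.StatusCode;
    },
    code => code is 429 or >= 500,   // retry on throttling and server errors
    CancellationToken.None);
```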
---

## Resilience Policy Executor

```csharp
namespace StellaOps.Router.Resilience;

/// <summary>
/// Combines circuit breaker and retry policies.
/// </summary>
public interface IResiliencePolicy
{
    Task<ResponsePayload> ExecuteAsync(
        string serviceName,
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken);
}

public sealed class ResiliencePolicy : IResiliencePolicy
{
    private readonly ICircuitBreakerRegistry _circuitBreakers;
    private readonly RetryPolicy _retryPolicy;
    private readonly ResilienceConfig _config;
    private readonly ILogger _logger;

    public ResiliencePolicy(
        ICircuitBreakerRegistry circuitBreakers,
        RetryPolicy retryPolicy,
        IOptions<ResilienceConfig> config,
        ILogger<ResiliencePolicy> logger)
    {
        _circuitBreakers = circuitBreakers;
        _retryPolicy = retryPolicy;
        _config = config.Value;
        _logger = logger;
    }

    public async Task<ResponsePayload> ExecuteAsync(
        string serviceName,
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken)
    {
        var circuitBreaker = _circuitBreakers.GetOrCreate(serviceName);

        // Check circuit breaker
        if (!await circuitBreaker.AllowRequestAsync(cancellationToken))
        {
            _logger.LogWarning("Circuit breaker {Name} is open, rejecting request", serviceName);

            return _config.FallbackResponse ?? new ResponsePayload
            {
                StatusCode = 503,
                Headers = new Dictionary<string, string>
                {
                    ["X-Circuit-Breaker"] = "open",
                    ["Retry-After"] = "30"
                },
                Body = Encoding.UTF8.GetBytes(JsonSerializer.Serialize(new
                {
                    error = "Service temporarily unavailable",
                    service = serviceName
                })),
                IsFinalChunk = true
            };
        }

        try
        {
            // Execute with retry
            var response = await _retryPolicy.ExecuteAsync(operation, cancellationToken);

            // Record result
            if (IsSuccess(response))
            {
                circuitBreaker.RecordSuccess();
            }
            else if (IsFailure(response))
            {
                circuitBreaker.RecordFailure();
            }

            return response;
        }
        catch (Exception)
        {
            circuitBreaker.RecordFailure();
            throw;
        }
    }

    private bool IsSuccess(ResponsePayload response)
    {
        return response.StatusCode >= 200 && response.StatusCode < 400;
    }

    private bool IsFailure(ResponsePayload response)
    {
        return _config.CircuitBreaker.FailureStatusCodes.Contains(response.StatusCode);
    }
}

public class ResilienceConfig
{
    public CircuitBreakerConfig CircuitBreaker { get; set; } = new();
    public RetryPolicyConfig Retry { get; set; } = new();
    public ResponsePayload? FallbackResponse { get; set; }
}
```

---

## Circuit Breaker Registry

```csharp
namespace StellaOps.Router.Resilience;

/// <summary>
/// Registry of circuit breakers per service.
/// </summary>
public interface ICircuitBreakerRegistry
{
    CircuitBreaker GetOrCreate(string name);
    IReadOnlyDictionary<string, CircuitBreaker> GetAll();
    void Reset(string name);
    void ResetAll();
}

public sealed class CircuitBreakerRegistry : ICircuitBreakerRegistry
{
    private readonly ConcurrentDictionary<string, CircuitBreaker> _breakers = new();
    private readonly CircuitBreakerConfig _config;
    private readonly ILoggerFactory _loggerFactory;

    public CircuitBreakerRegistry(
        IOptions<CircuitBreakerConfig> config,
        ILoggerFactory loggerFactory)
    {
        _config = config.Value;
        _loggerFactory = loggerFactory;
    }

    public CircuitBreaker GetOrCreate(string name)
    {
        return _breakers.GetOrAdd(name, n =>
        {
            var logger = _loggerFactory.CreateLogger<CircuitBreaker>();
            return new CircuitBreaker(n, _config, logger);
        });
    }

    public IReadOnlyDictionary<string, CircuitBreaker> GetAll()
    {
        return _breakers;
    }

    public void Reset(string name)
    {
        if (_breakers.TryRemove(name, out _))
        {
            // Will be recreated fresh (closed) on the next request
        }
    }

    public void ResetAll()
    {
        _breakers.Clear();
    }
}
```
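Because the registry exposes `GetAll()`, circuit state can be surfaced for operators alongside the Step 23 health checks. The sketch below assumes an ASP.NET Core minimal-API host (`app` is a `WebApplication`), and the `/internal/circuit-breakers` route is an assumption, not part of this step's contract:

```csharp
// Sketch: read-only diagnostics endpoint over the circuit breaker registry.
app.MapGet("/internal/circuit-breakers", (ICircuitBreakerRegistry registry) =>
    Results.Ok(registry.GetAll().Select(kvp => new
    {
        circuit = kvp.Key,
        state = kvp.Value.State.ToString(),
        lastStateChange = kvp.Value.LastStateChange
    })));
```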
---

## Bulkhead Pattern

```csharp
namespace StellaOps.Router.Resilience;

/// <summary>
/// Bulkhead pattern - limits concurrent requests to a service.
/// </summary>
public sealed class Bulkhead
{
    private readonly SemaphoreSlim _semaphore;
    private readonly BulkheadConfig _config;
    private readonly string _name;
    private int _queuedRequests;

    public string Name => _name;
    public int ActiveRequests => _config.MaxConcurrency - _semaphore.CurrentCount;
    public int QueuedRequests => _queuedRequests;

    public Bulkhead(string name, BulkheadConfig config)
    {
        _name = name;
        _config = config;
        _semaphore = new SemaphoreSlim(config.MaxConcurrency, config.MaxConcurrency);
    }

    /// <summary>
    /// Acquires a slot in the bulkhead. Returns null if the request is rejected.
    /// </summary>
    public async Task<IDisposable?> AcquireAsync(CancellationToken cancellationToken)
    {
        var queued = Interlocked.Increment(ref _queuedRequests);
        if (queued > _config.MaxQueueSize)
        {
            Interlocked.Decrement(ref _queuedRequests);
            return null; // Reject immediately
        }

        try
        {
            var acquired = await _semaphore.WaitAsync(_config.QueueTimeout, cancellationToken);
            Interlocked.Decrement(ref _queuedRequests);

            if (!acquired)
            {
                return null;
            }

            return new BulkheadLease(_semaphore);
        }
        catch
        {
            Interlocked.Decrement(ref _queuedRequests);
            throw;
        }
    }

    private sealed class BulkheadLease : IDisposable
    {
        private readonly SemaphoreSlim _semaphore;
        private bool _disposed;

        public BulkheadLease(SemaphoreSlim semaphore)
        {
            _semaphore = semaphore;
        }

        public void Dispose()
        {
            if (!_disposed)
            {
                _semaphore.Release();
                _disposed = true;
            }
        }
    }
}

public class BulkheadConfig
{
    public int MaxConcurrency { get; set; } = 100;
    public int MaxQueueSize { get; set; } = 50;
    public TimeSpan QueueTimeout { get; set; } = TimeSpan.FromSeconds(10);
}
```

---

## Resilience Middleware

```csharp
namespace StellaOps.Router.Gateway;

/// <summary>
/// Middleware that applies resilience policies to requests.
/// </summary>
public sealed class ResilienceMiddleware
{
    private readonly RequestDelegate _next;
    private readonly IResiliencePolicy _policy;

    public ResilienceMiddleware(RequestDelegate next, IResiliencePolicy policy)
    {
        _next = next;
        _policy = policy;
    }

    public async Task InvokeAsync(HttpContext context)
    {
        // Get target service from route data
        var serviceName = context.GetRouteValue("service")?.ToString();
        if (string.IsNullOrEmpty(serviceName))
        {
            await _next(context);
            return;
        }

        try
        {
            await _next(context);
        }
        catch (Exception ex) when (IsTransientException(ex))
        {
            // Convert to 503 with retry information
            context.Response.StatusCode = 503;
            context.Response.Headers["Retry-After"] = "30";
            await context.Response.WriteAsJsonAsync(new
            {
                error = "Service temporarily unavailable",
                retryAfter = 30
            });
        }
    }

    private bool IsTransientException(Exception ex)
    {
        return ex is TimeoutException or HttpRequestException or TaskCanceledException;
    }
}
```

---

## Service Registration

```csharp
namespace StellaOps.Router.Resilience;

public static class ResilienceExtensions
{
    public static IServiceCollection AddStellaResilience(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        services.Configure<ResilienceConfig>(configuration.GetSection("Resilience"));
        services.Configure<CircuitBreakerConfig>(configuration.GetSection("Resilience:CircuitBreaker"));
        services.Configure<RetryPolicyConfig>(configuration.GetSection("Resilience:Retry"));
        services.Configure<BulkheadConfig>(configuration.GetSection("Resilience:Bulkhead"));

        services.AddSingleton<ICircuitBreakerRegistry, CircuitBreakerRegistry>();

        // RetryPolicy takes the bound config value directly, so resolve it via IOptions.
        services.AddSingleton(sp => new RetryPolicy(
            sp.GetRequiredService<IOptions<RetryPolicyConfig>>().Value,
            sp.GetRequiredService<ILogger<RetryPolicy>>()));

        services.AddSingleton<IResiliencePolicy, ResiliencePolicy>();

        return services;
    }
}
```
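Tying registration and middleware together, gateway host wiring might look like the sketch below. The host setup is an assumption (only the resilience-related lines are meaningful here); `AddStellaResilience` and `ResilienceMiddleware` are the types defined above:

```csharp
// Sketch of gateway host wiring (Program.cs); unrelated setup is omitted.
var builder = WebApplication.CreateBuilder(args);

builder.Services.AddStellaResilience(builder.Configuration);

var app = builder.Build();

// Resilience should run before the proxy/forwarding middleware so transient
// upstream failures are converted into 503 responses with Retry-After.
app.UseMiddleware<ResilienceMiddleware>();

app.Run();
```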
---

## YAML Configuration

```yaml
Resilience:
  CircuitBreaker:
    FailureThreshold: 5
    SamplingDuration: "00:00:30"
    BreakDuration: "00:00:30"
    MinimumThroughput: 10
    FailureRatioThreshold: 0.5
    FailureStatusCodes:
      - 500
      - 502
      - 503
      - 504
  Retry:
    MaxRetries: 3
    InitialDelay: "00:00:00.100"
    MaxDelay: "00:00:10"
    BackoffMultiplier: 2.0
    UseJitter: true
    MaxJitterPercent: 0.25
    RetryableStatusCodes:
      - 408
      - 429
      - 502
      - 503
      - 504
  Bulkhead:
    MaxConcurrency: 100
    MaxQueueSize: 50
    QueueTimeout: "00:00:10"
```

---

## Deliverables

1. `StellaOps.Router.Resilience/CircuitBreaker.cs`
2. `StellaOps.Router.Resilience/CircuitBreakerConfig.cs`
3. `StellaOps.Router.Resilience/ICircuitBreakerRegistry.cs`
4. `StellaOps.Router.Resilience/CircuitBreakerRegistry.cs`
5. `StellaOps.Router.Resilience/RetryPolicy.cs`
6. `StellaOps.Router.Resilience/RetryPolicyConfig.cs`
7. `StellaOps.Router.Resilience/IResiliencePolicy.cs`
8. `StellaOps.Router.Resilience/ResiliencePolicy.cs`
9. `StellaOps.Router.Resilience/Bulkhead.cs`
10. `StellaOps.Router.Gateway/ResilienceMiddleware.cs`
11. Circuit breaker state transition tests
12. Retry policy tests
13. Bulkhead tests

---

## Next Step

Proceed to [Step 25: Configuration Hot-Reload](25-Step.md) to implement dynamic configuration updates.