# Step 24: Circuit Breaker & Retry Policies
**Phase 6: Observability & Resilience**
**Estimated Complexity:** High
**Dependencies:** Step 23 (Metrics & Health Checks)
---
## Overview
Circuit breakers and retry policies protect the system from cascading failures and transient errors. The circuit breaker stops sending traffic to a service that is persistently failing, while retry policies transparently re-issue requests that fail with transient errors, using exponential backoff between attempts.
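As a sketch of the intended call pattern (hypothetical `CatalogClient`, route, and payload; `IResiliencePolicy` is defined later in this step and `ResponsePayload` comes from earlier steps):
```csharp
// Hypothetical caller-side sketch: the policy wraps a raw HTTP call with
// retries and circuit breaking keyed by service name.
public sealed class CatalogClient
{
    private readonly IResiliencePolicy _resilience;
    private readonly HttpClient _http;

    public CatalogClient(IResiliencePolicy resilience, HttpClient http)
    {
        _resilience = resilience;
        _http = http;
    }

    public Task<ResponsePayload> GetCatalogAsync(CancellationToken ct) =>
        _resilience.ExecuteAsync("catalog", async token =>
        {
            using var response = await _http.GetAsync("/api/catalog", token);
            return new ResponsePayload
            {
                StatusCode = (int)response.StatusCode,
                Body = await response.Content.ReadAsByteArrayAsync(token),
                IsFinalChunk = true
            };
        }, ct);
}
```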
---
## Goals
1. Implement circuit breaker pattern for service protection
2. Support configurable retry policies
3. Enable per-service and per-endpoint policies
4. Integrate with metrics for observability
5. Provide graceful degradation strategies
---
## Circuit Breaker Configuration
```csharp
namespace StellaOps.Router.Resilience;
public class CircuitBreakerConfig
{
    /// <summary>Number of failures before opening the circuit.</summary>
    public int FailureThreshold { get; set; } = 5;

    /// <summary>Time window for counting failures.</summary>
    public TimeSpan SamplingDuration { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>How long to stay open before testing.</summary>
    public TimeSpan BreakDuration { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>Minimum throughput before the circuit can trip.</summary>
    public int MinimumThroughput { get; set; } = 10;

    /// <summary>Failure ratio that trips the circuit (0.0 to 1.0).</summary>
    public double FailureRatioThreshold { get; set; } = 0.5;

    /// <summary>HTTP status codes considered failures.</summary>
    public HashSet<int> FailureStatusCodes { get; set; } = new()
    {
        500, 502, 503, 504
    };

    /// <summary>Exception types considered failures.</summary>
    public HashSet<Type> FailureExceptions { get; set; } = new()
    {
        typeof(TimeoutException),
        typeof(TaskCanceledException),
        typeof(HttpRequestException)
    };
}
```
---
## Circuit Breaker Implementation
```csharp
namespace StellaOps.Router.Resilience;
public enum CircuitState
{
Closed = 0, // Normal operation
Open = 2, // Blocking requests
HalfOpen = 1 // Testing with limited requests
}
/// <summary>
/// Circuit breaker for a single service or endpoint.
/// </summary>
public sealed class CircuitBreaker
{
private readonly CircuitBreakerConfig _config;
private readonly ILogger _logger;
private readonly SlidingWindow _window;
private CircuitState _state = CircuitState.Closed;
private DateTimeOffset _openedAt;
private readonly SemaphoreSlim _halfOpenLock = new(1, 1);
public string Name { get; }
public CircuitState State => _state;
public DateTimeOffset LastStateChange { get; private set; }
public CircuitBreaker(
string name,
CircuitBreakerConfig config,
ILogger logger)
{
Name = name;
_config = config;
_logger = logger;
_window = new SlidingWindow(config.SamplingDuration);
LastStateChange = DateTimeOffset.UtcNow;
}
    /// <summary>
    /// Checks if a request is allowed through the circuit.
    /// </summary>
    public async Task<bool> AllowRequestAsync(CancellationToken cancellationToken)
{
switch (_state)
{
case CircuitState.Closed:
return true;
case CircuitState.Open:
if (DateTimeOffset.UtcNow - _openedAt >= _config.BreakDuration)
{
await TryTransitionToHalfOpenAsync();
}
return _state == CircuitState.HalfOpen;
case CircuitState.HalfOpen:
// Only allow one request at a time in half-open
return await _halfOpenLock.WaitAsync(0, cancellationToken);
default:
return false;
}
}
    /// <summary>
    /// Records a successful request.
    /// </summary>
public void RecordSuccess()
{
_window.RecordSuccess();
if (_state == CircuitState.HalfOpen)
{
TransitionToClosed();
_halfOpenLock.Release();
}
}
    /// <summary>
    /// Records a failed request.
    /// </summary>
public void RecordFailure()
{
_window.RecordFailure();
if (_state == CircuitState.HalfOpen)
{
TransitionToOpen();
_halfOpenLock.Release();
}
else if (_state == CircuitState.Closed)
{
CheckThreshold();
}
}
private void CheckThreshold()
{
var stats = _window.GetStats();
if (stats.TotalRequests < _config.MinimumThroughput)
return;
var failureRatio = (double)stats.Failures / stats.TotalRequests;
if (failureRatio >= _config.FailureRatioThreshold ||
stats.Failures >= _config.FailureThreshold)
{
TransitionToOpen();
}
}
private void TransitionToOpen()
{
_state = CircuitState.Open;
_openedAt = DateTimeOffset.UtcNow;
LastStateChange = _openedAt;
        var stats = _window.GetStats();
        _logger.LogWarning(
            "Circuit {Name} opened. Failures: {Failures}, Ratio: {Ratio:P2}",
            Name, stats.Failures,
            (double)stats.Failures / Math.Max(1, stats.TotalRequests));
StellaMetrics.CircuitBreakerState.Record((int)CircuitState.Open,
new TagList { { "circuit", Name } });
}
private async Task TryTransitionToHalfOpenAsync()
{
if (_state != CircuitState.Open)
return;
if (await _halfOpenLock.WaitAsync(0))
{
_state = CircuitState.HalfOpen;
LastStateChange = DateTimeOffset.UtcNow;
_window.Reset();
_logger.LogInformation("Circuit {Name} transitioning to half-open", Name);
StellaMetrics.CircuitBreakerState.Record((int)CircuitState.HalfOpen,
new TagList { { "circuit", Name } });
}
}
private void TransitionToClosed()
{
_state = CircuitState.Closed;
LastStateChange = DateTimeOffset.UtcNow;
_window.Reset();
_logger.LogInformation("Circuit {Name} closed", Name);
StellaMetrics.CircuitBreakerState.Record((int)CircuitState.Closed,
new TagList { { "circuit", Name } });
}
}
/// <summary>
/// Sliding window for tracking success/failure counts.
/// </summary>
internal sealed class SlidingWindow
{
private readonly TimeSpan _duration;
private readonly ConcurrentQueue<(DateTimeOffset Time, bool Success)> _events = new();
public SlidingWindow(TimeSpan duration)
{
_duration = duration;
}
public void RecordSuccess()
{
_events.Enqueue((DateTimeOffset.UtcNow, true));
Cleanup();
}
public void RecordFailure()
{
_events.Enqueue((DateTimeOffset.UtcNow, false));
Cleanup();
}
public WindowStats GetStats()
{
Cleanup();
var successes = 0;
var failures = 0;
foreach (var evt in _events)
{
if (evt.Success)
successes++;
else
failures++;
}
return new WindowStats(successes, failures);
}
public void Reset()
{
_events.Clear();
}
private void Cleanup()
{
var cutoff = DateTimeOffset.UtcNow - _duration;
while (_events.TryPeek(out var evt) && evt.Time < cutoff)
{
_events.TryDequeue(out _);
}
}
}
internal readonly record struct WindowStats(int Successes, int Failures)
{
public int TotalRequests => Successes + Failures;
}
```
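A hedged test sketch for the state transitions above, assuming xUnit, `NullLogger.Instance` from `Microsoft.Extensions.Logging.Abstractions`, and the Step 23 `StellaMetrics` instrumentation being available; thresholds mirror the configuration defaults:
```csharp
public class CircuitBreakerStateTests
{
    [Fact]
    public async Task Opens_after_failures_then_recovers_after_successful_probe()
    {
        var config = new CircuitBreakerConfig
        {
            FailureThreshold = 5,
            MinimumThroughput = 10,
            FailureRatioThreshold = 0.5,
            BreakDuration = TimeSpan.FromMilliseconds(50)
        };
        var breaker = new CircuitBreaker("test", config, NullLogger.Instance);

        // Closed: record enough traffic to cross MinimumThroughput with a 50% failure ratio.
        for (var i = 0; i < 5; i++) breaker.RecordSuccess();
        for (var i = 0; i < 5; i++) breaker.RecordFailure();
        Assert.Equal(CircuitState.Open, breaker.State);
        Assert.False(await breaker.AllowRequestAsync(CancellationToken.None));

        // After BreakDuration a single probe request is allowed through (half-open).
        await Task.Delay(config.BreakDuration + TimeSpan.FromMilliseconds(25));
        Assert.True(await breaker.AllowRequestAsync(CancellationToken.None));
        Assert.Equal(CircuitState.HalfOpen, breaker.State);

        // A successful probe closes the circuit again.
        breaker.RecordSuccess();
        Assert.Equal(CircuitState.Closed, breaker.State);
    }
}
```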
---
## Retry Policy Configuration
```csharp
namespace StellaOps.Router.Resilience;
public class RetryPolicyConfig
{
    /// <summary>Maximum number of retries.</summary>
    public int MaxRetries { get; set; } = 3;

    /// <summary>Initial delay before the first retry.</summary>
    public TimeSpan InitialDelay { get; set; } = TimeSpan.FromMilliseconds(100);

    /// <summary>Maximum delay between retries.</summary>
    public TimeSpan MaxDelay { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>Backoff multiplier for exponential delay.</summary>
    public double BackoffMultiplier { get; set; } = 2.0;

    /// <summary>Whether to add jitter to delays.</summary>
    public bool UseJitter { get; set; } = true;

    /// <summary>Maximum jitter to add (as a fraction of the delay).</summary>
    public double MaxJitterPercent { get; set; } = 0.25;

    /// <summary>HTTP status codes that trigger a retry.</summary>
    public HashSet<int> RetryableStatusCodes { get; set; } = new()
    {
        408, 429, 500, 502, 503, 504
    };

    /// <summary>Exception types that trigger a retry.</summary>
    public HashSet<Type> RetryableExceptions { get; set; } = new()
    {
        typeof(TimeoutException),
        typeof(HttpRequestException),
        typeof(IOException)
    };
}
```
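For intuition, the defaults above yield a nominal (jitter-free) backoff schedule of 100 ms, 200 ms, 400 ms across the three retries, always capped at `MaxDelay`. A quick way to print the schedule:
```csharp
// Prints the nominal backoff schedule implied by a RetryPolicyConfig
// (jitter disabled so the output is deterministic).
var config = new RetryPolicyConfig { UseJitter = false };
for (var attempt = 1; attempt <= config.MaxRetries; attempt++)
{
    var delayMs = Math.Min(
        config.InitialDelay.TotalMilliseconds * Math.Pow(config.BackoffMultiplier, attempt - 1),
        config.MaxDelay.TotalMilliseconds);
    Console.WriteLine($"retry {attempt}: {delayMs} ms"); // 100, 200, 400
}
```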
---
## Retry Policy Implementation
```csharp
namespace StellaOps.Router.Resilience;
/// <summary>
/// Executes operations with retry logic.
/// </summary>
public sealed class RetryPolicy
{
private readonly RetryPolicyConfig _config;
private readonly ILogger _logger;
public RetryPolicy(RetryPolicyConfig config, ILogger logger)
{
_config = config;
_logger = logger;
}
    /// <summary>
    /// Executes an operation with retry logic.
    /// </summary>
    public async Task<T> ExecuteAsync<T>(
        Func<CancellationToken, Task<T>> operation,
        Func<T, bool> shouldRetry,
        CancellationToken cancellationToken)
{
var attempt = 0;
var totalDelay = TimeSpan.Zero;
while (true)
{
try
{
attempt++;
var result = await operation(cancellationToken);
if (shouldRetry(result) && attempt <= _config.MaxRetries)
{
var delay = CalculateDelay(attempt);
totalDelay += delay;
_logger.LogDebug(
"Retrying operation (attempt {Attempt}/{MaxRetries}) after {Delay}ms",
attempt, _config.MaxRetries, delay.TotalMilliseconds);
await Task.Delay(delay, cancellationToken);
continue;
}
if (attempt > 1)
{
_logger.LogDebug(
"Operation succeeded after {Attempts} attempts, total delay: {TotalDelay}ms",
attempt, totalDelay.TotalMilliseconds);
}
return result;
}
catch (Exception ex) when (ShouldRetry(ex) && attempt <= _config.MaxRetries)
{
var delay = CalculateDelay(attempt);
totalDelay += delay;
_logger.LogWarning(
ex,
"Operation failed (attempt {Attempt}/{MaxRetries}), retrying after {Delay}ms",
attempt, _config.MaxRetries, delay.TotalMilliseconds);
await Task.Delay(delay, cancellationToken);
}
}
}
    /// <summary>
    /// Executes an operation with retry logic (response payload variant).
    /// </summary>
    public Task<ResponsePayload> ExecuteAsync(
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken)
{
return ExecuteAsync(
operation,
response => _config.RetryableStatusCodes.Contains(response.StatusCode),
cancellationToken);
}
private bool ShouldRetry(Exception ex)
{
var exType = ex.GetType();
return _config.RetryableExceptions.Any(t => t.IsAssignableFrom(exType));
}
private TimeSpan CalculateDelay(int attempt)
{
// Exponential backoff
var delay = TimeSpan.FromMilliseconds(
_config.InitialDelay.TotalMilliseconds * Math.Pow(_config.BackoffMultiplier, attempt - 1));
// Cap at max delay
if (delay > _config.MaxDelay)
{
delay = _config.MaxDelay;
}
// Add jitter
if (_config.UseJitter)
{
var jitter = delay.TotalMilliseconds * _config.MaxJitterPercent * Random.Shared.NextDouble();
delay = TimeSpan.FromMilliseconds(delay.TotalMilliseconds + jitter);
}
return delay;
}
}
```
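A hedged usage sketch of the generic overload with a custom `shouldRetry` predicate; `logger`, `cancellationToken`, and `FetchManifestAsync` are assumed to exist in the caller's scope:
```csharp
// Retries a manifest fetch while the payload comes back empty; retries on
// retryable exceptions are handled by the policy itself.
var retry = new RetryPolicy(new RetryPolicyConfig { MaxRetries = 2 }, logger);

var manifest = await retry.ExecuteAsync<ResponsePayload>(
    operation: token => FetchManifestAsync(token),            // hypothetical downstream call
    shouldRetry: r => r.Body is null || r.Body.Length == 0,   // also retry empty payloads
    cancellationToken: cancellationToken);
```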
---
## Resilience Policy Executor
```csharp
namespace StellaOps.Router.Resilience;
/// <summary>
/// Combines circuit breaker and retry policies.
/// </summary>
public interface IResiliencePolicy
{
    Task<ResponsePayload> ExecuteAsync(
        string serviceName,
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken);
}
public sealed class ResiliencePolicy : IResiliencePolicy
{
private readonly ICircuitBreakerRegistry _circuitBreakers;
private readonly RetryPolicy _retryPolicy;
private readonly ResilienceConfig _config;
private readonly ILogger _logger;
public ResiliencePolicy(
ICircuitBreakerRegistry circuitBreakers,
RetryPolicy retryPolicy,
        IOptions<ResilienceConfig> config,
        ILogger<ResiliencePolicy> logger)
{
_circuitBreakers = circuitBreakers;
_retryPolicy = retryPolicy;
_config = config.Value;
_logger = logger;
}
    public async Task<ResponsePayload> ExecuteAsync(
        string serviceName,
        Func<CancellationToken, Task<ResponsePayload>> operation,
        CancellationToken cancellationToken)
{
var circuitBreaker = _circuitBreakers.GetOrCreate(serviceName);
// Check circuit breaker
if (!await circuitBreaker.AllowRequestAsync(cancellationToken))
{
_logger.LogWarning("Circuit breaker {Name} is open, rejecting request", serviceName);
return _config.FallbackResponse ?? new ResponsePayload
{
StatusCode = 503,
                Headers = new Dictionary<string, string>
{
["X-Circuit-Breaker"] = "open",
["Retry-After"] = "30"
},
Body = Encoding.UTF8.GetBytes(JsonSerializer.Serialize(new
{
error = "Service temporarily unavailable",
service = serviceName
})),
IsFinalChunk = true
};
}
try
{
// Execute with retry
var response = await _retryPolicy.ExecuteAsync(operation, cancellationToken);
// Record result
if (IsSuccess(response))
{
circuitBreaker.RecordSuccess();
}
else if (IsFailure(response))
{
circuitBreaker.RecordFailure();
}
return response;
}
catch (Exception)
{
circuitBreaker.RecordFailure();
throw;
}
}
private bool IsSuccess(ResponsePayload response)
{
return response.StatusCode >= 200 && response.StatusCode < 400;
}
private bool IsFailure(ResponsePayload response)
{
return _config.CircuitBreaker.FailureStatusCodes.Contains(response.StatusCode);
}
}
public class ResilienceConfig
{
public CircuitBreakerConfig CircuitBreaker { get; set; } = new();
public RetryPolicyConfig Retry { get; set; } = new();
public ResponsePayload? FallbackResponse { get; set; }
}
```
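Graceful degradation (goal 5) can be exercised by supplying a static `FallbackResponse`, which is returned whenever a circuit breaker rejects a request; a hedged example with an illustrative cached-empty payload and a hypothetical `X-Stella-Degraded` header:
```csharp
// Served instead of the default 503 while a circuit is open.
var resilience = new ResilienceConfig
{
    FallbackResponse = new ResponsePayload
    {
        StatusCode = 200,
        Headers = new Dictionary<string, string>
        {
            ["Content-Type"] = "application/json",
            ["X-Stella-Degraded"] = "true"   // hypothetical header for observability
        },
        Body = Encoding.UTF8.GetBytes("{\"items\":[],\"degraded\":true}"),
        IsFinalChunk = true
    }
};
```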
---
## Circuit Breaker Registry
```csharp
namespace StellaOps.Router.Resilience;
/// <summary>
/// Registry of circuit breakers per service.
/// </summary>
public interface ICircuitBreakerRegistry
{
CircuitBreaker GetOrCreate(string name);
    IReadOnlyDictionary<string, CircuitBreaker> GetAll();
void Reset(string name);
void ResetAll();
}
public sealed class CircuitBreakerRegistry : ICircuitBreakerRegistry
{
    private readonly ConcurrentDictionary<string, CircuitBreaker> _breakers = new();
private readonly CircuitBreakerConfig _config;
private readonly ILoggerFactory _loggerFactory;
public CircuitBreakerRegistry(
        IOptions<CircuitBreakerConfig> config,
ILoggerFactory loggerFactory)
{
_config = config.Value;
_loggerFactory = loggerFactory;
}
public CircuitBreaker GetOrCreate(string name)
{
return _breakers.GetOrAdd(name, n =>
{
            var logger = _loggerFactory.CreateLogger<CircuitBreaker>();
return new CircuitBreaker(n, _config, logger);
});
}
    public IReadOnlyDictionary<string, CircuitBreaker> GetAll()
{
return _breakers;
}
    public void Reset(string name)
    {
        // Removing the breaker lets a fresh one be created on the next request.
        _breakers.TryRemove(name, out _);
    }
public void ResetAll()
{
_breakers.Clear();
}
}
```
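The registry also makes it straightforward to surface breaker state next to the Step 23 health endpoints; a minimal-API sketch (the `/admin/circuit-breakers` route is an assumption):
```csharp
// Hypothetical diagnostics endpoint exposing the current state of every breaker.
app.MapGet("/admin/circuit-breakers", (ICircuitBreakerRegistry registry) =>
    Results.Ok(registry.GetAll().Select(kvp => new
    {
        circuit = kvp.Key,
        state = kvp.Value.State.ToString(),
        lastStateChange = kvp.Value.LastStateChange
    })));
```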
---
## Bulkhead Pattern
```csharp
namespace StellaOps.Router.Resilience;
/// <summary>
/// Bulkhead pattern: limits concurrent requests to a service.
/// </summary>
public sealed class Bulkhead
{
private readonly SemaphoreSlim _semaphore;
private readonly BulkheadConfig _config;
private readonly string _name;
private int _queuedRequests;
public string Name => _name;
public int ActiveRequests => _config.MaxConcurrency - _semaphore.CurrentCount;
public int QueuedRequests => _queuedRequests;
public Bulkhead(string name, BulkheadConfig config)
{
_name = name;
_config = config;
_semaphore = new SemaphoreSlim(config.MaxConcurrency, config.MaxConcurrency);
}
    /// <summary>
    /// Acquires a slot in the bulkhead; returns null when the request is rejected.
    /// </summary>
    public async Task<IDisposable?> AcquireAsync(CancellationToken cancellationToken)
{
var queued = Interlocked.Increment(ref _queuedRequests);
if (queued > _config.MaxQueueSize)
{
Interlocked.Decrement(ref _queuedRequests);
return null; // Reject immediately
}
try
{
var acquired = await _semaphore.WaitAsync(_config.QueueTimeout, cancellationToken);
Interlocked.Decrement(ref _queuedRequests);
if (!acquired)
{
return null;
}
return new BulkheadLease(_semaphore);
}
catch
{
Interlocked.Decrement(ref _queuedRequests);
throw;
}
}
private sealed class BulkheadLease : IDisposable
{
private readonly SemaphoreSlim _semaphore;
private bool _disposed;
public BulkheadLease(SemaphoreSlim semaphore)
{
_semaphore = semaphore;
}
public void Dispose()
{
if (!_disposed)
{
_semaphore.Release();
_disposed = true;
}
}
}
}
public class BulkheadConfig
{
public int MaxConcurrency { get; set; } = 100;
public int MaxQueueSize { get; set; } = 50;
public TimeSpan QueueTimeout { get; set; } = TimeSpan.FromSeconds(10);
}
```
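A hedged usage fragment showing the intended acquire/reject/release flow inside a hypothetical handler (`bulkhead`, `context`, and `ForwardToBackendAsync` are assumed to exist):
```csharp
// Hypothetical handler fragment guarding a downstream call with a bulkhead.
using var lease = await bulkhead.AcquireAsync(context.RequestAborted);
if (lease is null)
{
    // Queue full or QueueTimeout elapsed: shed load instead of piling up requests.
    context.Response.StatusCode = 503;
    context.Response.Headers["Retry-After"] = "5";
    return;
}

// The slot is held for the duration of the downstream call and released on dispose.
await ForwardToBackendAsync(context, context.RequestAborted);
```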
---
## Resilience Middleware
```csharp
namespace StellaOps.Router.Gateway;
/// <summary>
/// Middleware that applies resilience policies to requests.
/// </summary>
public sealed class ResilienceMiddleware
{
private readonly RequestDelegate _next;
private readonly IResiliencePolicy _policy;
public ResilienceMiddleware(RequestDelegate next, IResiliencePolicy policy)
{
_next = next;
_policy = policy;
}
public async Task InvokeAsync(HttpContext context)
{
// Get target service from route data
var serviceName = context.GetRouteValue("service")?.ToString();
if (string.IsNullOrEmpty(serviceName))
{
await _next(context);
return;
}
try
{
await _next(context);
}
catch (Exception ex) when (IsTransientException(ex))
{
// Convert to 503 with retry information
context.Response.StatusCode = 503;
context.Response.Headers["Retry-After"] = "30";
await context.Response.WriteAsJsonAsync(new
{
error = "Service temporarily unavailable",
retryAfter = 30
});
}
}
private bool IsTransientException(Exception ex)
{
return ex is TimeoutException or
HttpRequestException or
TaskCanceledException;
}
}
```
---
## Service Registration
```csharp
namespace StellaOps.Router.Resilience;
public static class ResilienceExtensions
{
public static IServiceCollection AddStellaResilience(
this IServiceCollection services,
IConfiguration configuration)
{
        services.Configure<ResilienceConfig>(configuration.GetSection("Resilience"));
        services.Configure<CircuitBreakerConfig>(configuration.GetSection("Resilience:CircuitBreaker"));
        services.Configure<RetryPolicyConfig>(configuration.GetSection("Resilience:Retry"));
        services.Configure<BulkheadConfig>(configuration.GetSection("Resilience:Bulkhead"));
        services.AddSingleton<ICircuitBreakerRegistry, CircuitBreakerRegistry>();
        services.AddSingleton(sp => new RetryPolicy(
            sp.GetRequiredService<IOptions<RetryPolicyConfig>>().Value,
            sp.GetRequiredService<ILogger<RetryPolicy>>()));
        services.AddSingleton<IResiliencePolicy, ResiliencePolicy>();
return services;
}
}
```
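A hedged `Program.cs` wiring sketch combining registration with the middleware above; the exact position relative to the proxy middleware from earlier steps is an assumption:
```csharp
var builder = WebApplication.CreateBuilder(args);
builder.Services.AddStellaResilience(builder.Configuration);

var app = builder.Build();

app.UseRouting();
// Apply resilience before the request is forwarded downstream.
app.UseMiddleware<ResilienceMiddleware>();
// ... proxy/forwarding middleware from earlier steps runs after this point ...

app.Run();
```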
---
## YAML Configuration
```yaml
Resilience:
  CircuitBreaker:
    FailureThreshold: 5
    SamplingDuration: "00:00:30"
    BreakDuration: "00:00:30"
    MinimumThroughput: 10
    FailureRatioThreshold: 0.5
    FailureStatusCodes:
      - 500
      - 502
      - 503
      - 504
  Retry:
    MaxRetries: 3
    InitialDelay: "00:00:00.100"
    MaxDelay: "00:00:10"
    BackoffMultiplier: 2.0
    UseJitter: true
    MaxJitterPercent: 0.25
    RetryableStatusCodes:
      - 408
      - 429
      - 500
      - 502
      - 503
      - 504
  Bulkhead:
    MaxConcurrency: 100
    MaxQueueSize: 50
    QueueTimeout: "00:00:10"
---
## Deliverables
1. `StellaOps.Router.Resilience/CircuitBreaker.cs`
2. `StellaOps.Router.Resilience/CircuitBreakerConfig.cs`
3. `StellaOps.Router.Resilience/ICircuitBreakerRegistry.cs`
4. `StellaOps.Router.Resilience/CircuitBreakerRegistry.cs`
5. `StellaOps.Router.Resilience/RetryPolicy.cs`
6. `StellaOps.Router.Resilience/RetryPolicyConfig.cs`
7. `StellaOps.Router.Resilience/IResiliencePolicy.cs`
8. `StellaOps.Router.Resilience/ResiliencePolicy.cs`
9. `StellaOps.Router.Resilience/Bulkhead.cs`
10. `StellaOps.Router.Gateway/ResilienceMiddleware.cs`
11. Circuit breaker state transition tests
12. Retry policy tests
13. Bulkhead tests
---
## Next Step
Proceed to [Step 25: Configuration Hot-Reload](25-Step.md) to implement dynamic configuration updates.