// -----------------------------------------------------------------------------
// RecoveryTests.cs
// Sprint: SPRINT_5100_0005_0001_router_chaos_suite
// Task: T3 - Recovery and Resilience Tests
// Description: Test router recovery after load spikes.
// -----------------------------------------------------------------------------
using System.Collections.Concurrent;
using System.Diagnostics;
using System.Net;
using FluentAssertions;
using StellaOps.Chaos.Router.Tests.Fixtures;

namespace StellaOps.Chaos.Router.Tests;

/// <summary>
/// Chaos tests that verify the router's resilience characteristics:
/// recovery after a load spike, no data loss under throttling, graceful
/// degradation, bounded latency, and bounded queue growth.
/// </summary>
[Trait("Category", "Chaos")]
[Trait("Category", "Router")]
[Trait("Category", "Recovery")]
public class RecoveryTests : IClassFixture<RouterTestFixture>
{
    private readonly RouterTestFixture _fixture;

    public RecoveryTests(RouterTestFixture fixture)
    {
        _fixture = fixture;
    }

    /// <summary>
    /// After a 5-second / 500-request spike, the router should start
    /// accepting requests again within 30 seconds.
    /// </summary>
    [Fact]
    public async Task Router_AfterSpike_RecoveryWithin30Seconds()
    {
        // Arrange
        var client = _fixture.CreateClient();

        // Phase 1: Verify normal operation. Throttling (429) still counts as
        // "working" — the router is alive, just shedding load.
        var normalResponse = await client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest());
        var normalWorking = normalResponse.IsSuccessStatusCode ||
                            normalResponse.StatusCode == HttpStatusCode.TooManyRequests;

        // Phase 2: Create load spike
        await CreateLoadSpikeAsync(client, requestCount: 500, durationSeconds: 5);

        // Phase 3: Measure recovery. Poll for up to 60s so the test can report
        // an actual recovery time even when it exceeds the 30s assertion bound.
        var recovered = false;
        var recoveryStart = Stopwatch.StartNew();

        while (recoveryStart.Elapsed < TimeSpan.FromSeconds(60))
        {
            var response = await client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest());
            if (response.IsSuccessStatusCode)
            {
                recovered = true;
                break;
            }

            // If throttled, honor Retry-After (capped); otherwise back off briefly.
            if (response.StatusCode == HttpStatusCode.TooManyRequests)
            {
                await Task.Delay(GetRetryDelay(response, defaultSeconds: 1));
            }
            else
            {
                await Task.Delay(1000);
            }
        }

        recoveryStart.Stop();

        // Assert — only meaningful if the router was working before the spike.
        if (normalWorking)
        {
            recovered.Should().BeTrue("Router should recover after spike");
            recoveryStart.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(30),
                "Recovery should happen within 30 seconds");
        }

        Console.WriteLine($"Recovery time: {recoveryStart.Elapsed.TotalSeconds:F2}s");
    }

    /// <summary>
    /// Submits 100 tracked requests concurrently, retrying on 429 up to 10
    /// times each; with proper retry handling at least 90% must succeed.
    /// </summary>
    [Fact]
    public async Task Router_NoDataLoss_DuringThrottling()
    {
        // Arrange
        var client = _fixture.CreateClient();
        var submittedIds = new ConcurrentBag<string>();
        var successfulIds = new ConcurrentBag<string>();
        var maxRetries = 10;

        // Act - Submit requests with tracking and retry on throttle
        var tasks = Enumerable.Range(0, 100).Select(async _ =>
        {
            var scanId = Guid.NewGuid().ToString();
            submittedIds.Add(scanId);

            var retryCount = 0;
            HttpResponseMessage? response = null;

            while (retryCount < maxRetries)
            {
                response = await client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest(scanId));

                if (response.StatusCode == HttpStatusCode.TooManyRequests)
                {
                    await Task.Delay(GetRetryDelay(response, defaultSeconds: 2));
                    retryCount++;
                }
                else
                {
                    break;
                }
            }

            if (response is not null && response.IsSuccessStatusCode)
            {
                successfulIds.Add(scanId);
            }

            return response;
        });

        await Task.WhenAll(tasks);

        // Assert
        var successRate = (double)successfulIds.Count / submittedIds.Count;
        Console.WriteLine($"Success rate with retries: {successRate:P2} ({successfulIds.Count}/{submittedIds.Count})");

        // All submitted requests should eventually succeed with proper retry logic
        successRate.Should().BeGreaterOrEqualTo(0.9,
            "At least 90% of requests should succeed with retry logic");
    }

    /// <summary>
    /// Under continuous background load, some foreground requests must still
    /// get through — the router degrades gracefully rather than failing hard.
    /// </summary>
    [Fact]
    public async Task Router_GracefulDegradation_MaintainsPartialService()
    {
        // Arrange
        var client = _fixture.CreateClient();
        using var cts = new CancellationTokenSource();

        // Start continuous background load
        var backgroundTask = CreateContinuousLoadAsync(client, cts.Token);

        // Allow load to stabilize
        await Task.Delay(3000);

        // Check that some requests are still succeeding
        var successCount = 0;
        var totalChecks = 10;

        for (var i = 0; i < totalChecks; i++)
        {
            var response = await client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest());
            if (response.IsSuccessStatusCode || response.StatusCode == HttpStatusCode.Accepted)
            {
                successCount++;
            }
            await Task.Delay(100);
        }

        cts.Cancel();
        try { await backgroundTask; } catch (OperationCanceledException) { }

        // Assert
        successCount.Should().BeGreaterThan(0, "Router should maintain partial service under load");
        Console.WriteLine($"Partial service check: {successCount}/{totalChecks} successful");
    }

    /// <summary>
    /// Samples 20 request latencies while background load runs; the 95th
    /// percentile must stay under 10 seconds (throttle waits included).
    /// </summary>
    [Fact]
    public async Task Router_LatencyBounded_DuringSpike()
    {
        // Arrange
        var client = _fixture.CreateClient();
        var latencies = new ConcurrentBag<long>();

        // Create background load
        using var cts = new CancellationTokenSource();
        var loadTask = CreateContinuousLoadAsync(client, cts.Token);

        // Measure latencies during load
        for (var i = 0; i < 20; i++)
        {
            var sw = Stopwatch.StartNew();
            var response = await client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest());
            sw.Stop();

            latencies.Add(sw.ElapsedMilliseconds);
            await Task.Delay(100);
        }

        cts.Cancel();
        try { await loadTask; } catch (OperationCanceledException) { }

        // Assert
        var avgLatency = latencies.Average();
        // Clamp the P95 index so a different sample count can never index past
        // the end of the sorted list.
        var p95Index = Math.Min((int)(latencies.Count * 0.95), latencies.Count - 1);
        var p95Latency = latencies.OrderBy(l => l).ElementAt(p95Index);

        Console.WriteLine($"Latency during load: Avg={avgLatency:F0}ms, P95={p95Latency}ms");

        // P95 latency should be bounded (allowing for throttle wait times)
        p95Latency.Should().BeLessThan(10000,
            "95th percentile latency should be bounded under load");
    }

    /// <summary>
    /// Fires a 200-request burst and then inspects /metrics; the test passing
    /// at all (no hang/timeout) indicates the queue is not growing unbounded.
    /// </summary>
    [Fact]
    public async Task Router_QueueDepth_DoesNotGrowUnbounded()
    {
        // Arrange
        var client = _fixture.CreateClient();

        // Create significant load
        var tasks = Enumerable.Range(0, 200)
            .Select(_ => client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest()));

        await Task.WhenAll(tasks);

        // Check metrics for queue depth
        var metricsResponse = await client.GetAsync("/metrics");
        if (metricsResponse.IsSuccessStatusCode)
        {
            var metrics = await metricsResponse.Content.ReadAsStringAsync();

            // Look for queue depth metric
            if (metrics.Contains("queue_depth") || metrics.Contains("pending_requests"))
            {
                // Queue depth should be reasonable after burst
                Console.WriteLine("Queue metrics found in /metrics endpoint");
            }
        }

        // If we got here without timeout, queue is not growing unbounded
    }

    /// <summary>
    /// Reads the Retry-After header (delta-seconds form) from a throttled
    /// response. Returns the advertised wait capped at 5 seconds, or
    /// <paramref name="defaultSeconds"/> when the header is absent or not an
    /// integer. Uses TryGetValues because GetValues throws
    /// InvalidOperationException when the header is missing.
    /// </summary>
    private static TimeSpan GetRetryDelay(HttpResponseMessage response, int defaultSeconds)
    {
        if (response.Headers.TryGetValues("Retry-After", out var values) &&
            int.TryParse(values.FirstOrDefault(), out var waitSeconds))
        {
            return TimeSpan.FromSeconds(Math.Min(waitSeconds, 5));
        }

        return TimeSpan.FromSeconds(defaultSeconds);
    }

    /// <summary>
    /// Fires up to <paramref name="requestCount"/> POSTs at the scan endpoint,
    /// lightly paced (10ms pause every 50 requests), bounded by
    /// <paramref name="durationSeconds"/>.
    /// </summary>
    private static async Task CreateLoadSpikeAsync(HttpClient client, int requestCount, int durationSeconds)
    {
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(durationSeconds));
        var tasks = new List<Task<HttpResponseMessage>>();

        try
        {
            for (var i = 0; i < requestCount && !cts.Token.IsCancellationRequested; i++)
            {
                tasks.Add(client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest(), cts.Token));

                // Small delay to spread requests
                if (i % 50 == 0)
                {
                    await Task.Delay(10, cts.Token);
                }
            }

            // Skip canceled tasks so WhenAll does not surface their cancellation.
            await Task.WhenAll(tasks.Where(t => !t.IsCanceled));
        }
        catch (OperationCanceledException)
        {
            // Expected when duration expires
        }
    }

    /// <summary>
    /// Generates steady fire-and-forget load (one request every ~50ms) until
    /// the token is canceled. Request errors are intentionally ignored — this
    /// is pure load generation, not verification.
    /// </summary>
    private static async Task CreateContinuousLoadAsync(HttpClient client, CancellationToken ct)
    {
        while (!ct.IsCancellationRequested)
        {
            try
            {
                // Fire-and-forget requests
                _ = client.PostAsync("/api/v1/scan", RouterTestFixture.CreateScanRequest(), ct);
                await Task.Delay(50, ct);
            }
            catch (OperationCanceledException)
            {
                break;
            }
            catch
            {
                // Ignore errors during load generation
            }
        }
    }
}