// -----------------------------------------------------------------------------
// ValkeyFailureTests.cs
// Sprint: SPRINT_5100_0005_0001_router_chaos_suite
// Task: T4 - Valkey Failure Injection
// Description: Test router behavior when Valkey cache fails.
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Net;
using FluentAssertions;
using StellaOps.Chaos.Router.Tests.Fixtures;

namespace StellaOps.Chaos.Router.Tests;

/// <summary>
/// Chaos tests that send scan requests to the router while the Valkey cache is
/// stopped, restarted, slowed down, flapping, or hit with a burst of parallel requests.
/// </summary>
[Trait("Category", "Chaos")]
[Trait("Category", "Valkey")]
[Collection("ValkeyTests")]
public class ValkeyFailureTests : IClassFixture<RouterWithValkeyFixture>, IAsyncLifetime
{
    /// <summary>Endpoint every test posts scan requests to.</summary>
    private const string ScanEndpoint = "/api/v1/scan";

    /// <summary>
    /// Status codes the tests accept as a controlled outcome: the request was
    /// served, throttled, or rejected with a deliberate 503.
    /// </summary>
    private static readonly HttpStatusCode[] ControlledStatuses =
    {
        HttpStatusCode.OK,
        HttpStatusCode.Accepted,
        HttpStatusCode.TooManyRequests,
        HttpStatusCode.ServiceUnavailable
    };

    private readonly RouterWithValkeyFixture _fixture;

    /// <summary>Stores the shared router + Valkey fixture supplied by xUnit.</summary>
    /// <param name="fixture">Fixture used to create HTTP clients and to control Valkey.</param>
    public ValkeyFailureTests(RouterWithValkeyFixture fixture)
    {
        _fixture = fixture;
    }

    /// <summary>Starts Valkey through the fixture before the test body runs.</summary>
    public async Task InitializeAsync()
    {
        await _fixture.StartValkeyAsync();
    }

    /// <summary>No per-test teardown is performed.</summary>
    public Task DisposeAsync()
    {
        return Task.CompletedTask;
    }

    /// <summary>
    /// Stops Valkey and checks that a scan request still ends in one of the
    /// <see cref="ControlledStatuses"/>. Valkey is restarted afterwards even if
    /// the assertion fails.
    /// </summary>
    [Fact]
    public async Task Router_ValkeyDown_FallsBackToLocal()
    {
        // Arrange
        var client = _fixture.CreateClient();

        // Verify normal operation with Valkey
        using var baseline = await PostScanAsync(client);
        IsServedOrThrottled(baseline).Should().BeTrue(
            "Router should serve or throttle requests normally while Valkey is up");

        try
        {
            // Kill Valkey
            await _fixture.StopValkeyAsync();

            // Wait for router to detect Valkey is down
            await Task.Delay(2000);

            // Act - Router should degrade gracefully
            using var degraded = await PostScanAsync(client);

            // Assert - Should still work with local rate limiter or return controlled error
            degraded.StatusCode.Should().BeOneOf(ControlledStatuses,
                "Router should fall back to local rate limiting when Valkey is down");
        }
        finally
        {
            // Restore Valkey for other tests, including when the assertion above throws.
            await _fixture.StartValkeyAsync();
        }
    }

    /// <summary>
    /// Stops and restarts Valkey, then sends ten scan requests and requires that
    /// at least one is served or throttled.
    /// </summary>
    [Fact]
    public async Task Router_ValkeyReconnect_ResumesDistributedLimiting()
    {
        // Arrange
        var client = _fixture.CreateClient();

        // Kill and restart Valkey
        await _fixture.StopValkeyAsync();
        await Task.Delay(3000);
        await _fixture.StartValkeyAsync();
        await Task.Delay(2000); // Allow reconnection

        // Act - Send some requests after Valkey restart
        var successCount = 0;
        for (var i = 0; i < 10; i++)
        {
            using var response = await PostScanAsync(client);
            if (IsServedOrThrottled(response))
            {
                successCount++;
            }

            await Task.Delay(100);
        }

        // Assert - Requests should be processed
        successCount.Should().BeGreaterThan(0,
            "Router should resume processing after Valkey reconnect");

        // Optional: Check metrics for distributed limiting active
        using var metricsResponse = await client.GetAsync("/metrics");
        if (metricsResponse.IsSuccessStatusCode)
        {
            // The body is read but not inspected; nothing here checks which backend is active.
            _ = await metricsResponse.Content.ReadAsStringAsync();
            Console.WriteLine("Metrics available after Valkey reconnect");
        }
    }

    /// <summary>
    /// Configures two seconds of Valkey latency and checks that a scan request
    /// completes in under five seconds with one of the <see cref="ControlledStatuses"/>.
    /// </summary>
    [Fact]
    public async Task Router_ValkeyLatency_DoesNotBlock()
    {
        // Arrange
        // NOTE(review): the injected latency is never reset in this file — confirm the
        // fixture clears it (for example on StartValkeyAsync) so later tests are not slowed.
        await _fixture.ConfigureValkeyLatencyAsync(TimeSpan.FromSeconds(2));
        var client = _fixture.CreateClient();
        var stopwatch = Stopwatch.StartNew();

        // Act
        using var response = await PostScanAsync(client);
        stopwatch.Stop();

        // Assert - Request should complete without waiting for slow Valkey
        // The router should have a timeout for cache operations
        stopwatch.Elapsed.Should().BeLessThan(TimeSpan.FromSeconds(5),
            "Slow Valkey should not significantly block request processing");

        // Request should still be valid
        response.StatusCode.Should().BeOneOf(ControlledStatuses);

        Console.WriteLine($"Request completed in {stopwatch.ElapsedMilliseconds}ms with slow Valkey");
    }

    /// <summary>
    /// Runs three up/down cycles of Valkey, sending five scan requests in each
    /// phase, and requires that at least half of all requests count as successful.
    /// Valkey is restarted afterwards because every cycle ends with it stopped.
    /// </summary>
    [Fact]
    public async Task Router_ValkeyFlap_HandlesGracefully()
    {
        // Arrange
        var client = _fixture.CreateClient();
        var successCount = 0;
        var errorCount = 0;

        try
        {
            // Act - Simulate Valkey flapping
            for (var cycle = 0; cycle < 3; cycle++)
            {
                // Valkey up
                await _fixture.StartValkeyAsync();
                await Task.Delay(1000);

                for (var i = 0; i < 5; i++)
                {
                    using var response = await PostScanAsync(client);

                    // NOTE(review): while Valkey is up a 429 counts as an error, but while it
                    // is down a 429 counts as success — confirm this asymmetry is intended.
                    if (response.IsSuccessStatusCode)
                    {
                        successCount++;
                    }
                    else
                    {
                        errorCount++;
                    }
                }

                // Valkey down
                await _fixture.StopValkeyAsync();
                await Task.Delay(1000);

                for (var i = 0; i < 5; i++)
                {
                    using var response = await PostScanAsync(client);

                    // Throttled is acceptable while Valkey is down.
                    if (IsServedOrThrottled(response))
                    {
                        successCount++;
                    }
                    else
                    {
                        errorCount++;
                    }
                }
            }
        }
        finally
        {
            // Leave Valkey running for whatever runs next.
            await _fixture.StartValkeyAsync();
        }

        // Assert
        var totalRequests = successCount + errorCount;
        var successRate = (double)successCount / totalRequests;

        Console.WriteLine($"Valkey flap test: {successCount}/{totalRequests} successful ({successRate:P2})");
        successRate.Should().BeGreaterOrEqualTo(0.5,
            "Router should handle at least 50% of requests during Valkey flapping");
    }

    /// <summary>
    /// Fires 500 scan requests in parallel, requires every response to carry one
    /// of the <see cref="ControlledStatuses"/>, then calls the health endpoint.
    /// </summary>
    [Fact]
    public async Task Router_ValkeyConnectionExhaustion_DoesNotCrash()
    {
        // Arrange
        var client = _fixture.CreateClient();

        // Create many parallel requests that might exhaust Valkey connections
        var tasks = Enumerable.Range(0, 500)
            .Select(_ => PostScanAsync(client));

        // Act
        var responses = await Task.WhenAll(tasks);

        // Capture the counts first so the responses can be released before asserting.
        var totalResponses = responses.Length;
        var validResponses = responses.Count(r => ControlledStatuses.Contains(r.StatusCode));
        foreach (var response in responses)
        {
            response.Dispose();
        }

        // Assert - Router should not crash
        validResponses.Should().Be(totalResponses,
            "All responses should be valid HTTP responses");

        // Verify router is still responsive after burst.
        // The status is only logged; an exception from GetAsync is what would fail the test.
        using var healthCheck = await client.GetAsync("/health");
        Console.WriteLine($"Health check after burst: {healthCheck.StatusCode}");
    }

    /// <summary>Posts a freshly created scan request to <see cref="ScanEndpoint"/>.</summary>
    /// <param name="client">HTTP client obtained from the fixture.</param>
    /// <returns>The response returned for the scan request.</returns>
    private static Task<HttpResponseMessage> PostScanAsync(HttpClient client)
    {
        return client.PostAsync(ScanEndpoint, RouterTestFixture.CreateScanRequest());
    }

    /// <summary>
    /// Returns <c>true</c> when the response has a success status code or is
    /// 429 Too Many Requests.
    /// </summary>
    /// <param name="response">Response to classify.</param>
    private static bool IsServedOrThrottled(HttpResponseMessage response)
    {
        return response.IsSuccessStatusCode
            || response.StatusCode == HttpStatusCode.TooManyRequests;
    }
}