sln build fix (again), tests fixes, audit work and doctors work
This commit is contained in:
@@ -0,0 +1,381 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ControlPlaneOutageTests.cs
|
||||
// Sprint: Testing Enhancement Advisory - Phase 3.3
|
||||
// Description: Tests for control-plane behavior during full outage scenarios
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using FluentAssertions;
|
||||
using StellaOps.Chaos.ControlPlane.Tests.Fixtures;
|
||||
using StellaOps.TestKit;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Chaos.ControlPlane.Tests;
|
||||
|
||||
/// <summary>
|
||||
/// Tests for control-plane behavior during full outage scenarios.
|
||||
/// Validates graceful degradation, data durability, and recovery.
|
||||
/// </summary>
|
||||
[Trait("Category", TestCategories.Chaos)]
|
||||
[Trait("Category", "ControlPlane")]
|
||||
public class ControlPlaneOutageTests : IClassFixture<ControlPlaneClusterFixture>
|
||||
{
|
||||
// Simulated cluster shared by every test in this class (xUnit class fixture).
private readonly ControlPlaneClusterFixture _fixture;

/// <summary>
/// Runs before each test: asks the failure injector to recover all services
/// and empties the fixture's event log. Queued jobs, cached tokens and
/// persisted data are NOT touched here.
/// </summary>
/// <param name="fixture">Shared cluster fixture supplied by xUnit.</param>
public ControlPlaneOutageTests(ControlPlaneClusterFixture fixture)
{
    fixture.FailureInjector.RecoverAll();
    fixture.ClearEventLog();

    _fixture = fixture;
}
|
||||
|
||||
#region Authority Outage Tests
|
||||
|
||||
/// <summary>
/// Tokens issued while Authority was healthy must still validate from the
/// local cache, with a degraded-mode warning, once Authority is fully down.
/// </summary>
[Fact]
public async Task Authority_Outage_CachedTokens_AllowTemporaryAccess()
{
    // Arrange - Issue tokens while Authority is healthy
    var token1 = await _fixture.IssueTokenAsync("user-1", TimeSpan.FromHours(1));
    var token2 = await _fixture.IssueTokenAsync("user-2", TimeSpan.FromHours(1));

    token1.Success.Should().BeTrue();
    token2.Success.Should().BeTrue();

    // Act - Authority goes down, then both tokens are validated
    _fixture.FailureInjector.InjectFullOutage("authority");

    var validation1 = await _fixture.ValidateTokenAsync(token1.Token!.TokenId);
    var validation2 = await _fixture.ValidateTokenAsync(token2.Token!.TokenId);

    // Assert - every cached token gets the same checks; previously only the
    // first one had its warning verified.
    foreach (var validation in new[] { validation1, validation2 })
    {
        validation.Success.Should().BeTrue();
        validation.IsValid.Should().BeTrue();
        validation.Source.Should().Be(ValidationSource.Cache);
        validation.Warning.Should().Contain("Authority unavailable");
    }
}
|
||||
|
||||
/// <summary>
/// While Authority is down, issuing a token must come back as a failed
/// result carrying an error and no token.
/// </summary>
[Fact]
public async Task Authority_Outage_NewTokens_CannotBeIssued()
{
    // Arrange: take Authority offline.
    var injector = _fixture.FailureInjector;
    injector.InjectFullOutage("authority");

    // Act: request a brand-new token.
    var lifetime = TimeSpan.FromHours(1);
    var issuance = await _fixture.IssueTokenAsync("new-user", lifetime);

    // Assert: failure is reported through the result object.
    issuance.Success.Should().BeFalse();
    issuance.Token.Should().BeNull();
    issuance.Error.Should().NotBeNullOrEmpty();
}
|
||||
|
||||
/// <summary>
/// Validating a token id that was never issued, while Authority is down,
/// must yield a failed result with no validation source.
/// </summary>
[Fact]
public async Task Authority_Outage_UncachedTokens_FailValidation()
{
    // Arrange: Authority is unreachable from the start of the test.
    var injector = _fixture.FailureInjector;
    injector.InjectFullOutage("authority");

    // Act: this id was never issued, so it cannot be in the cache.
    const string unknownTokenId = "nonexistent-token";
    var outcome = await _fixture.ValidateTokenAsync(unknownTokenId);

    // Assert: graceful failure.
    outcome.Success.Should().BeFalse();
    outcome.IsValid.Should().BeFalse();
    outcome.Source.Should().Be(ValidationSource.None);
    outcome.Error.Should().Contain("not in cache");
}
|
||||
|
||||
/// <summary>
/// A token validated from cache during an Authority outage must be
/// validated by Authority again, without a warning, after recovery.
/// </summary>
[Fact]
public async Task Authority_Recovery_TokenValidation_UsesAuthorityAgain()
{
    var injector = _fixture.FailureInjector;

    // Arrange: issue a token while everything is healthy.
    var issued = await _fixture.IssueTokenAsync("user-1", TimeSpan.FromHours(1));
    issued.Success.Should().BeTrue();

    // Outage: validation falls back to the cache.
    injector.InjectFullOutage("authority");
    var cachedCheck = await _fixture.ValidateTokenAsync(issued.Token!.TokenId);
    cachedCheck.Source.Should().Be(ValidationSource.Cache);

    // Act: bring Authority back and validate once more.
    injector.RecoverService("authority");
    var liveCheck = await _fixture.ValidateTokenAsync(issued.Token!.TokenId);

    // Assert: Authority answers again and no degraded-mode warning remains.
    liveCheck.Success.Should().BeTrue();
    liveCheck.IsValid.Should().BeTrue();
    liveCheck.Source.Should().Be(ValidationSource.Authority);
    liveCheck.Warning.Should().BeNull();
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Scheduler Outage Tests
|
||||
|
||||
/// <summary>
/// Jobs enqueued before and during a Scheduler outage must all remain
/// pending in the local queue.
/// </summary>
[Fact]
public async Task Scheduler_Outage_PendingJobs_NotLost()
{
    // The fixture is shared by every test in this class and its job queue is
    // never reset, so counts are measured relative to what is already queued.
    // An absolute expectation would depend on test execution order.
    var pendingAtStart = _fixture.GetPendingJobCount();

    // Arrange - Enqueue jobs while healthy
    var job1 = await _fixture.EnqueueJobAsync("scan", "image:tag1");
    var job2 = await _fixture.EnqueueJobAsync("scan", "image:tag2");

    job1.Success.Should().BeTrue();
    job2.Success.Should().BeTrue();

    // Scheduler goes down
    _fixture.FailureInjector.InjectFullOutage("scheduler");

    // Act - Enqueue more jobs during outage
    var job3 = await _fixture.EnqueueJobAsync("scan", "image:tag3");
    var job4 = await _fixture.EnqueueJobAsync("policy", "check-1");

    // Assert - Jobs are persisted locally even during outage
    job3.Success.Should().BeTrue();
    job3.Warning.Should().Contain("scheduler notification failed");

    job4.Success.Should().BeTrue();
    job4.Warning.Should().Contain("scheduler notification failed");

    // Each of the four jobs from this test is still pending...
    foreach (var result in new[] { job1, job2, job3, job4 })
    {
        result.Job.Should().NotBeNull();
        result.Job!.Status.Should().Be(JobStatus.Pending);
    }

    // ...and the queue grew by exactly four.
    _fixture.GetPendingJobCount().Should().Be(pendingAtStart + 4);
}
|
||||
|
||||
/// <summary>
/// Jobs queued while the Scheduler is down must all be processed, in
/// enqueue order, once the Scheduler recovers.
/// </summary>
[Fact]
public async Task Scheduler_Recovery_PendingJobs_ProcessedInOrder()
{
    // The class fixture's queue may already hold pending jobs from tests that
    // ran earlier, so all counts are relative to this baseline.
    var pendingAtStart = _fixture.GetPendingJobCount();

    // Arrange - Scheduler down, enqueue jobs
    _fixture.FailureInjector.InjectFullOutage("scheduler");

    var enqueued = new[]
    {
        await _fixture.EnqueueJobAsync("scan", "image:tag1"),
        await _fixture.EnqueueJobAsync("scan", "image:tag2"),
        await _fixture.EnqueueJobAsync("scan", "image:tag3"),
    };

    foreach (var result in enqueued)
    {
        result.Success.Should().BeTrue();
        result.Job.Should().NotBeNull();
    }

    _fixture.GetPendingJobCount().Should().Be(pendingAtStart + 3);

    // Act - Scheduler recovers
    _fixture.FailureInjector.RecoverService("scheduler");
    var processedCount = await _fixture.ProcessPendingJobsAsync();

    // Assert - everything that was pending got processed
    processedCount.Should().Be(pendingAtStart + 3);
    _fixture.GetPendingJobCount().Should().Be(0);

    foreach (var result in enqueued)
    {
        result.Job!.Status.Should().Be(JobStatus.Processing);
    }

    // Assert - order. The event log was cleared in the constructor, and the
    // fixture logs one "JobProcessed" event per job with details
    // "JobId: <id>". This test's jobs were enqueued last, so they must be the
    // last three processed, in enqueue order.
    var expectedTail = enqueued.Select(r => $"JobId: {r.Job!.JobId}").ToArray();
    var processedDetails = _fixture.EventLog
        .Where(e => e.EventType == "JobProcessed")
        .Select(e => e.Details)
        .ToArray();

    processedDetails.TakeLast(expectedTail.Length).Should().Equal(expectedTail);
}
|
||||
|
||||
/// <summary>
/// Enqueueing across a healthy / down / recovered Scheduler cycle must not
/// leave two queued jobs sharing a job id.
/// </summary>
[Fact]
public async Task Scheduler_IntermittentOutage_NoJobDuplication()
{
    var injector = _fixture.FailureInjector;

    // Healthy: enqueue succeeds with no warning.
    var beforeOutage = await _fixture.EnqueueJobAsync("scan", "image:tag1");
    beforeOutage.Success.Should().BeTrue();
    beforeOutage.Warning.Should().BeNull();

    // Outage: enqueue carries a warning.
    injector.InjectFullOutage("scheduler");
    var duringOutage = await _fixture.EnqueueJobAsync("scan", "image:tag2");
    duringOutage.Warning.Should().NotBeNull();

    // Recovered: the warning is gone again.
    injector.RecoverService("scheduler");
    var afterRecovery = await _fixture.EnqueueJobAsync("scan", "image:tag3");
    afterRecovery.Warning.Should().BeNull();

    // Assert: as many distinct ids as there are queued jobs.
    var queued = _fixture.GetAllJobs();
    var distinctIdCount = queued.GroupBy(job => job.JobId).Count();
    distinctIdCount.Should().Be(queued.Count);
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Full Control-Plane Outage Tests
|
||||
|
||||
/// <summary>
/// Data written before a full control-plane outage must be readable, with
/// the same values, after the control plane recovers.
/// </summary>
[Fact]
public async Task FullControlPlane_Outage_DataPersistence_Verified()
{
    var expected = new[]
    {
        (Key: "config-1", Value: "value-1"),
        (Key: "config-2", Value: "value-2"),
        (Key: "config-3", Value: "value-3"),
    };

    // Arrange - Persist data while healthy
    foreach (var (key, value) in expected)
    {
        var persisted = await _fixture.PersistDataAsync(key, value);
        persisted.Success.Should().BeTrue();
    }

    // Act - Full control-plane outage, then recovery
    _fixture.InjectFullControlPlaneOutage();
    _fixture.RecoverControlPlane();

    // Assert - All data should be intact
    foreach (var (key, value) in expected)
    {
        var read = await _fixture.ReadDataAsync(key);

        read.Success.Should().BeTrue();
        // A missing key is reported as Success = true with Data = null, so
        // assert non-null for every key; otherwise a lost record would show
        // up as a NullReferenceException instead of a clear assertion failure.
        read.Data.Should().NotBeNull();
        read.Data!.Value.Should().Be(value);
    }
}
|
||||
|
||||
/// <summary>
/// With every service down, token issuance, persistence and reads must each
/// return a failed result that carries an error message.
/// </summary>
[Fact]
public async Task FullControlPlane_Outage_AllOperations_FailGracefully()
{
    // Arrange: everything is down.
    _fixture.InjectFullControlPlaneOutage();

    // Act: one call per kind of operation.
    var tokenResult = await _fixture.IssueTokenAsync("user", TimeSpan.FromHours(1));
    var persistResult = await _fixture.PersistDataAsync("key", "value");
    var readResult = await _fixture.ReadDataAsync("any-key");

    // Assert: each result reports failure and an error, checked in call order.
    var outcomes = new (bool Success, string? Error)[]
    {
        (tokenResult.Success, tokenResult.Error),
        (persistResult.Success, persistResult.Error),
        (readResult.Success, readResult.Error),
    };

    foreach (var (success, error) in outcomes)
    {
        success.Should().BeFalse();
        error.Should().NotBeNullOrEmpty();
    }
}
|
||||
|
||||
/// <summary>
/// After a full outage and recovery, new operations succeed again and
/// state created before the outage (token, data) is still usable.
/// </summary>
[Fact]
public async Task FullControlPlane_Recovery_SystemResumes_NormalOperation()
{
    // Arrange - Operations before outage. Their results are verified so that
    // a broken arrange step fails here rather than in a later assertion.
    var tokenBefore = await _fixture.IssueTokenAsync("user-before", TimeSpan.FromHours(1));
    tokenBefore.Success.Should().BeTrue();

    var dataBefore = await _fixture.PersistDataAsync("data-before", "value-before");
    dataBefore.Success.Should().BeTrue();

    // Full outage
    _fixture.InjectFullControlPlaneOutage();

    // Verify outage
    var duringOutage = await _fixture.IssueTokenAsync("user-during", TimeSpan.FromHours(1));
    duringOutage.Success.Should().BeFalse();

    // Act - Recovery
    _fixture.RecoverControlPlane();

    // Assert - Normal operations resume
    var tokenAfter = await _fixture.IssueTokenAsync("user-after", TimeSpan.FromHours(1));
    tokenAfter.Success.Should().BeTrue();

    var dataAfter = await _fixture.PersistDataAsync("data-after", "value-after");
    dataAfter.Success.Should().BeTrue();

    // The pre-outage token is validated by Authority again.
    var tokenBeforeCheck = await _fixture.ValidateTokenAsync(tokenBefore.Token!.TokenId);
    tokenBeforeCheck.IsValid.Should().BeTrue();
    tokenBeforeCheck.Source.Should().Be(ValidationSource.Authority);

    // Reads: assert non-null before dereferencing, because a missing key is
    // reported as Success = true with Data = null.
    var readBefore = await _fixture.ReadDataAsync("data-before");
    readBefore.Success.Should().BeTrue();
    readBefore.Data.Should().NotBeNull();
    readBefore.Data!.Value.Should().Be("value-before");

    var readAfter = await _fixture.ReadDataAsync("data-after");
    readAfter.Success.Should().BeTrue();
    readAfter.Data.Should().NotBeNull();
    readAfter.Data!.Value.Should().Be("value-after");
}
|
||||
|
||||
/// <summary>
/// During a full outage, every failed operation must leave its own failure
/// entry in the fixture's event log.
/// </summary>
[Fact]
public async Task FullControlPlane_Outage_EventLogCaptures_AllFailures()
{
    // Arrange
    _fixture.ClearEventLog();
    _fixture.InjectFullControlPlaneOutage();

    // Act - Generate one failure per operation kind
    var tokenResult = await _fixture.IssueTokenAsync("user", TimeSpan.FromHours(1));
    var persistResult = await _fixture.PersistDataAsync("key", "value");

    tokenResult.Success.Should().BeFalse();
    persistResult.Success.Should().BeFalse();

    // Assert - Event log contains a failure entry for each failed operation
    var events = _fixture.EventLog;
    events.Should().NotBeEmpty();

    var failureTypes = events
        .Where(e => e.EventType.Contains("Failed", StringComparison.OrdinalIgnoreCase))
        .Select(e => e.EventType)
        .ToList();

    // Previously only "at least one failure event" was checked, which would
    // still pass if one of the two operations stopped logging its failure.
    failureTypes.Should().Contain("TokenIssueFailed");
    failureTypes.Should().Contain("PersistFailed");
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Data Integrity Tests
|
||||
|
||||
/// <summary>
/// With only the database down, both a write and a read must return a
/// failed result.
/// </summary>
[Fact]
public async Task Database_Outage_WritesFail_ReadsFail()
{
    const string key = "test-key";

    // Arrange: only the database is taken down.
    var injector = _fixture.FailureInjector;
    injector.InjectFullOutage("database");

    // Act
    var write = await _fixture.PersistDataAsync(key, "test-value");
    var read = await _fixture.ReadDataAsync(key);

    // Assert
    write.Success.Should().BeFalse();
    read.Success.Should().BeFalse();
}
|
||||
|
||||
/// <summary>
/// A write that fails during a database outage must not consume a version
/// number: the first write after recovery continues from the last
/// successful version.
/// </summary>
[Fact]
public async Task Database_Recovery_DataVersioning_Correct()
{
    const string key = "versioned-key";
    var injector = _fixture.FailureInjector;

    // Arrange: two successful writes produce versions 1 and 2.
    var first = await _fixture.PersistDataAsync(key, "value-v1");
    first.Success.Should().BeTrue();
    first.Data!.Version.Should().Be(1);

    var second = await _fixture.PersistDataAsync(key, "value-v2");
    second.Success.Should().BeTrue();
    second.Data!.Version.Should().Be(2);

    // A write attempted while the database is down fails.
    injector.InjectFullOutage("database");
    var rejected = await _fixture.PersistDataAsync(key, "value-v3");
    rejected.Success.Should().BeFalse();

    // Act: recover, then write again.
    injector.RecoverService("database");
    var third = await _fixture.PersistDataAsync(key, "value-v3");

    // Assert: version 3, not 4.
    third.Success.Should().BeTrue();
    third.Data!.Version.Should().Be(3);
    third.Data.Value.Should().Be("value-v3");
}
|
||||
|
||||
/// <summary>
/// With Authority and the database down but the Scheduler healthy, job
/// enqueueing still succeeds while token issuance and persistence fail.
/// </summary>
[Fact]
public async Task MixedOutage_SomeServices_OthersHealthy()
{
    // Arrange: two of the services go down; the Scheduler is left alone.
    var injector = _fixture.FailureInjector;
    injector.InjectFullOutage("authority");
    injector.InjectFullOutage("database");

    // Act + Assert: the Scheduler path keeps working.
    var enqueue = await _fixture.EnqueueJobAsync("scan", "test-image");
    enqueue.Success.Should().BeTrue();
    _fixture.GetPendingJobCount().Should().BeGreaterThanOrEqualTo(1);

    // Authority-backed operation fails.
    var issuance = await _fixture.IssueTokenAsync("user", TimeSpan.FromHours(1));
    issuance.Success.Should().BeFalse();

    // Database-backed operation fails.
    var write = await _fixture.PersistDataAsync("key", "value");
    write.Success.Should().BeFalse();
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
@@ -0,0 +1,525 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ControlPlaneClusterFixture.cs
|
||||
// Sprint: Testing Enhancement Advisory - Phase 3.3
|
||||
// Description: Test fixture for simulating control-plane cluster with outage scenarios
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Chaos.ControlPlane.Tests.Fixtures;
|
||||
|
||||
/// <summary>
|
||||
/// Test fixture that simulates a control-plane cluster with multiple services.
|
||||
/// Enables chaos testing of outage scenarios.
|
||||
/// </summary>
|
||||
public sealed class ControlPlaneClusterFixture : IAsyncLifetime
|
||||
{
|
||||
// Failure source consulted by every simulated request in this fixture.
private readonly ServiceFailureInjector _failureInjector = new ServiceFailureInjector();

// Registered services, keyed by service id.
private readonly ConcurrentDictionary<string, MockServiceState> _serviceStates =
    new ConcurrentDictionary<string, MockServiceState>();

// Append-only record of what the fixture did, in call order.
private readonly ConcurrentQueue<ClusterEvent> _eventLog = new ConcurrentQueue<ClusterEvent>();

// Issued tokens, keyed by token id; also used as the validation cache.
private readonly ConcurrentDictionary<string, CachedToken> _tokenCache =
    new ConcurrentDictionary<string, CachedToken>();

// Every job ever enqueued; jobs are never removed, only their status changes.
private readonly ConcurrentQueue<PendingJob> _pendingJobs = new ConcurrentQueue<PendingJob>();

// Simulated database contents, keyed by data key.
private readonly ConcurrentDictionary<string, PersistedData> _dataStore =
    new ConcurrentDictionary<string, PersistedData>();

// Counter behind ClusterEvent.Sequence; incremented with Interlocked.
private long _eventSequence;

/// <summary>
/// Gets the failure injector for this cluster.
/// </summary>
public ServiceFailureInjector FailureInjector
{
    get { return _failureInjector; }
}

/// <summary>
/// Gets a snapshot copy of all cluster events logged so far.
/// </summary>
public IReadOnlyCollection<ClusterEvent> EventLog
{
    get { return _eventLog.ToImmutableArray(); }
}

/// <summary>
/// Gets a snapshot copy of all cached tokens, keyed by token id.
/// </summary>
public IReadOnlyDictionary<string, CachedToken> TokenCache
{
    get { return _tokenCache.ToImmutableDictionary(); }
}

/// <summary>
/// Gets a snapshot copy of the job queue (all jobs, whatever their status).
/// </summary>
public IReadOnlyCollection<PendingJob> PendingJobs
{
    get { return _pendingJobs.ToImmutableArray(); }
}

/// <summary>
/// Gets a snapshot copy of all persisted data, keyed by data key.
/// </summary>
public IReadOnlyDictionary<string, PersistedData> DataStore
{
    get { return _dataStore.ToImmutableDictionary(); }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Registers the five default control-plane services: authority, scheduler,
/// gateway, backend and database. Completes synchronously.
/// </remarks>
public ValueTask InitializeAsync()
{
    (string Id, ServiceType Type)[] defaultServices =
    {
        ("authority", ServiceType.Authority),
        ("scheduler", ServiceType.Scheduler),
        ("gateway", ServiceType.Gateway),
        ("backend", ServiceType.Backend),
        ("database", ServiceType.Database),
    };

    foreach (var (id, type) in defaultServices)
    {
        RegisterService(id, type);
    }

    return ValueTask.CompletedTask;
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Asks the failure injector to recover all services and forgets the
/// registered service states. Completes synchronously.
/// </remarks>
public ValueTask DisposeAsync()
{
    _failureInjector.RecoverAll();
    _serviceStates.Clear();

    var completed = ValueTask.CompletedTask;
    return completed;
}
|
||||
|
||||
/// <summary>
/// Adds a service to the cluster: registers it with the failure injector
/// and records a healthy state for it, replacing any state previously
/// stored under the same id.
/// </summary>
/// <param name="serviceId">Identifier used for later failure injection and requests.</param>
/// <param name="serviceType">Kind of service being registered.</param>
public void RegisterService(string serviceId, ServiceType serviceType)
{
    _failureInjector.RegisterService(serviceId);

    var initialState = new MockServiceState
    {
        ServiceId = serviceId,
        ServiceType = serviceType,
        IsHealthy = true,
        StartedAt = DateTimeOffset.UtcNow
    };

    _serviceStates[serviceId] = initialState;
}
|
||||
|
||||
/// <summary>
/// Simulates the Authority service issuing a token.
/// </summary>
/// <param name="userId">User the token is issued for.</param>
/// <param name="validity">Lifetime of the token, measured from the issue time.</param>
/// <param name="ct">Cancellation token passed to the simulated Authority request.</param>
/// <returns>
/// A successful result carrying the new token, or a failed result carrying
/// the simulated request's error when the Authority request does not succeed.
/// </returns>
public async Task<TokenResult> IssueTokenAsync(string userId, TimeSpan validity, CancellationToken ct = default)
{
    var requestResult = await _failureInjector.SimulateRequestAsync("authority", ct);

    if (!requestResult.Success)
    {
        LogEvent("authority", "TokenIssueFailed", $"User: {userId}, Error: {requestResult.Error}");
        return new TokenResult
        {
            Success = false,
            Error = requestResult.Error
        };
    }

    // Read the clock once so that ExpiresAt - IssuedAt is exactly `validity`;
    // two separate reads would let the two timestamps drift apart.
    var issuedAt = DateTimeOffset.UtcNow;

    var token = new CachedToken
    {
        TokenId = Guid.NewGuid().ToString("N"),
        UserId = userId,
        IssuedAt = issuedAt,
        ExpiresAt = issuedAt.Add(validity),
        IsValid = true
    };

    // The same dictionary is what validation later consults.
    _tokenCache[token.TokenId] = token;
    LogEvent("authority", "TokenIssued", $"TokenId: {token.TokenId}, User: {userId}");

    return new TokenResult
    {
        Success = true,
        Token = token
    };
}
|
||||
|
||||
/// <summary>
/// Validates a token. When the simulated Authority request succeeds the
/// result is attributed to Authority; otherwise the local token cache is
/// used and the result carries a warning.
/// </summary>
/// <param name="tokenId">Identifier of the token to validate.</param>
/// <param name="ct">Cancellation token passed to the simulated Authority request.</param>
/// <returns>
/// Success is false only when Authority is unavailable and the token is not
/// cached. An unknown token with Authority available yields Success = true,
/// IsValid = false.
/// </returns>
public async Task<ValidationResult> ValidateTokenAsync(string tokenId, CancellationToken ct = default)
{
    var authorityReply = await _failureInjector.SimulateRequestAsync("authority", ct);
    var authorityUp = authorityReply.Success;

    if (_tokenCache.TryGetValue(tokenId, out var entry))
    {
        // Same validity rule on both paths: not revoked and not yet expired.
        var stillValid = entry.IsValid && entry.ExpiresAt > DateTimeOffset.UtcNow;

        if (authorityUp)
        {
            LogEvent("authority", "TokenValidated", $"TokenId: {tokenId}, Valid: {stillValid}");
            return new ValidationResult
            {
                Success = true,
                IsValid = stillValid,
                Source = ValidationSource.Authority
            };
        }

        LogEvent("authority", "TokenValidatedFromCache", $"TokenId: {tokenId}, Valid: {stillValid}");
        return new ValidationResult
        {
            Success = true,
            IsValid = stillValid,
            Source = ValidationSource.Cache,
            Warning = "Authority unavailable, used cached token"
        };
    }

    if (authorityUp)
    {
        // Authority answered but does not know the token (no event is logged).
        return new ValidationResult
        {
            Success = true,
            IsValid = false,
            Source = ValidationSource.Authority,
            Error = "Token not found"
        };
    }

    LogEvent("authority", "TokenValidationFailed", $"TokenId: {tokenId}, Authority unavailable, no cache");
    return new ValidationResult
    {
        Success = false,
        IsValid = false,
        Source = ValidationSource.None,
        Error = "Authority unavailable and token not in cache"
    };
}
|
||||
|
||||
/// <summary>
/// Enqueues a job. The job is always added to the local queue first; the
/// Scheduler is then notified, and a failed notification only adds a
/// warning to the (still successful) result.
/// </summary>
/// <param name="jobType">Type label stored on the job.</param>
/// <param name="payload">Payload stored on the job.</param>
/// <param name="ct">Cancellation token passed to the simulated Scheduler request.</param>
/// <returns>A successful result carrying the queued job, with a warning if the Scheduler could not be notified.</returns>
public async Task<JobResult> EnqueueJobAsync(string jobType, string payload, CancellationToken ct = default)
{
    var queuedJob = new PendingJob
    {
        JobId = Guid.NewGuid().ToString("N"),
        JobType = jobType,
        Payload = payload,
        EnqueuedAt = DateTimeOffset.UtcNow,
        Status = JobStatus.Pending
    };

    // Durability first: the job is queued before the Scheduler is contacted.
    _pendingJobs.Enqueue(queuedJob);
    LogEvent("scheduler", "JobEnqueued", $"JobId: {queuedJob.JobId}, Type: {jobType}");

    var notification = await _failureInjector.SimulateRequestAsync("scheduler", ct);

    if (notification.Success)
    {
        return new JobResult
        {
            Success = true,
            Job = queuedJob
        };
    }

    LogEvent("scheduler", "SchedulerNotifyFailed", $"JobId: {queuedJob.JobId}, Error: {notification.Error}");
    return new JobResult
    {
        Success = true,
        Job = queuedJob,
        Warning = "Job persisted but scheduler notification failed"
    };
}
|
||||
|
||||
/// <summary>
/// Marks every pending job in the queue as processing, provided the
/// simulated Scheduler request succeeds.
/// </summary>
/// <param name="ct">Cancellation token passed to the simulated Scheduler request.</param>
/// <returns>Number of jobs moved from pending to processing; 0 when the Scheduler request fails.</returns>
public async Task<int> ProcessPendingJobsAsync(CancellationToken ct = default)
{
    var schedulerReply = await _failureInjector.SimulateRequestAsync("scheduler", ct);

    if (!schedulerReply.Success)
    {
        LogEvent("scheduler", "ProcessingFailed", $"Error: {schedulerReply.Error}");
        return 0;
    }

    var handled = 0;

    // Iterate over a snapshot; jobs stay in the queue and only change status.
    foreach (var job in _pendingJobs.ToArray())
    {
        if (job.Status != JobStatus.Pending)
        {
            continue;
        }

        job.Status = JobStatus.Processing;
        job.ProcessedAt = DateTimeOffset.UtcNow;
        handled++;
        LogEvent("scheduler", "JobProcessed", $"JobId: {job.JobId}");
    }

    return handled;
}
|
||||
|
||||
/// <summary>
/// Persists a value under a key in the simulated data store, assigning
/// version 1 to a new key and incrementing the version of an existing one.
/// </summary>
/// <param name="key">Key to write.</param>
/// <param name="value">Value to store.</param>
/// <param name="ct">Cancellation token passed to the simulated database request.</param>
/// <returns>
/// A successful result carrying the stored record, or a failed result
/// carrying the simulated request's error; a failed write leaves the store
/// and the version untouched.
/// </returns>
public async Task<PersistResult> PersistDataAsync(string key, string value, CancellationToken ct = default)
{
    // Timestamp is taken before the database round-trip, as it always was.
    var persistedAt = DateTimeOffset.UtcNow;

    // Check if database is available
    var dbResult = await _failureInjector.SimulateRequestAsync("database", ct);

    if (!dbResult.Success)
    {
        LogEvent("database", "PersistFailed", $"Key: {key}, Error: {dbResult.Error}");
        return new PersistResult
        {
            Success = false,
            Error = dbResult.Error
        };
    }

    // AddOrUpdate performs the read-increment-write as one atomic swap. The
    // previous TryGetValue + indexer write let two concurrent writers read the
    // same version and both store version + 1.
    var stored = _dataStore.AddOrUpdate(
        key,
        k => new PersistedData
        {
            Key = k,
            Value = value,
            PersistedAt = persistedAt,
            Version = 1
        },
        (k, existing) => new PersistedData
        {
            Key = k,
            Value = value,
            PersistedAt = persistedAt,
            Version = existing.Version + 1
        });

    LogEvent("database", "DataPersisted", $"Key: {key}, Version: {stored.Version}");

    return new PersistResult
    {
        Success = true,
        Data = stored
    };
}
|
||||
|
||||
/// <summary>
/// Reads the record stored under a key from the simulated data store.
/// </summary>
/// <param name="key">Key to look up.</param>
/// <param name="ct">Cancellation token passed to the simulated database request.</param>
/// <returns>
/// A failed result when the database request fails. Otherwise a successful
/// result whose Data is the stored record, or null when the key is unknown.
/// </returns>
public async Task<ReadResult> ReadDataAsync(string key, CancellationToken ct = default)
{
    var dbReply = await _failureInjector.SimulateRequestAsync("database", ct);

    if (!dbReply.Success)
    {
        LogEvent("database", "ReadFailed", $"Key: {key}, Error: {dbReply.Error}");
        return new ReadResult
        {
            Success = false,
            Error = dbReply.Error
        };
    }

    // Only a hit is logged; on a miss `record` stays null and is returned as such.
    if (_dataStore.TryGetValue(key, out var record))
    {
        LogEvent("database", "DataRead", $"Key: {key}, Version: {record.Version}");
    }

    return new ReadResult
    {
        Success = true,
        Data = record
    };
}
|
||||
|
||||
/// <summary>
/// Injects a full outage into every service registered with this fixture
/// and logs a single cluster-level event.
/// </summary>
public void InjectFullControlPlaneOutage()
{
    ICollection<string> registeredIds = _serviceStates.Keys;

    foreach (var id in registeredIds)
    {
        _failureInjector.InjectFullOutage(id);
    }

    LogEvent("cluster", "FullOutageInjected", "All services down");
}
|
||||
|
||||
/// <summary>
/// Asks the failure injector to recover all services and logs a single
/// cluster-level event.
/// </summary>
public void RecoverControlPlane()
{
    var injector = _failureInjector;
    injector.RecoverAll();

    LogEvent("cluster", "ControlPlaneRecovered", "All services recovered");
}
|
||||
|
||||
/// <summary>
/// Counts the queued jobs whose status is still <see cref="JobStatus.Pending"/>.
/// </summary>
/// <returns>Number of pending jobs in the queue.</returns>
public int GetPendingJobCount()
{
    var pending = 0;

    foreach (var job in _pendingJobs)
    {
        if (job.Status == JobStatus.Pending)
        {
            pending++;
        }
    }

    return pending;
}
|
||||
|
||||
/// <summary>
/// Returns a snapshot of every queued job, regardless of status, in queue order.
/// </summary>
/// <returns>Immutable copy of the job queue.</returns>
public IReadOnlyList<PendingJob> GetAllJobs()
{
    var snapshot = _pendingJobs.ToImmutableArray();
    return snapshot;
}
|
||||
|
||||
/// <summary>
/// Checks that every expected key is present in the data store. Only key
/// presence is checked, not the stored values.
/// </summary>
/// <param name="expectedKeys">Keys that must exist.</param>
/// <returns>True when no expected key is missing (also true for an empty sequence).</returns>
public bool VerifyDataIntegrity(IEnumerable<string> expectedKeys)
{
    var anyMissing = expectedKeys.Any(key => !_dataStore.ContainsKey(key));
    return !anyMissing;
}
|
||||
|
||||
/// <summary>
/// Empties the event log by dequeuing until nothing is left. The event
/// sequence counter is not reset.
/// </summary>
public void ClearEventLog()
{
    for (;;)
    {
        if (!_eventLog.TryDequeue(out _))
        {
            break;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Appends one entry to the event log, stamped with the next sequence
/// number and the current UTC time.
/// </summary>
/// <param name="service">Service the event is attributed to.</param>
/// <param name="eventType">Short event type name.</param>
/// <param name="details">Free-form details text.</param>
private void LogEvent(string service, string eventType, string details)
{
    var entry = new ClusterEvent
    {
        // Interlocked keeps sequence numbers unique across concurrent callers.
        Sequence = Interlocked.Increment(ref _eventSequence),
        Timestamp = DateTimeOffset.UtcNow,
        Service = service,
        EventType = eventType,
        Details = details
    };

    _eventLog.Enqueue(entry);
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Bookkeeping record the cluster fixture keeps for each registered service.
/// </summary>
public sealed class MockServiceState
{
    /// <summary>Identifier the service was registered under.</summary>
    public required string ServiceId { get; init; }

    /// <summary>Kind of control-plane service.</summary>
    public required ServiceType ServiceType { get; init; }

    /// <summary>Health flag; the fixture sets it to true at registration.</summary>
    public bool IsHealthy { get; set; }

    /// <summary>Time the state was created; the fixture uses the UTC registration time.</summary>
    public DateTimeOffset StartedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of services that can be registered in the simulated control plane.
/// </summary>
public enum ServiceType
{
    /// <summary>Registered by default under the id "authority".</summary>
    Authority,

    /// <summary>Registered by default under the id "scheduler".</summary>
    Scheduler,

    /// <summary>Registered by default under the id "gateway".</summary>
    Gateway,

    /// <summary>Registered by default under the id "backend".</summary>
    Backend,

    /// <summary>Registered by default under the id "database".</summary>
    Database
}
|
||||
|
||||
/// <summary>
/// An authentication token as stored in the fixture's token cache.
/// </summary>
public sealed class CachedToken
{
    /// <summary>Unique token identifier; also the cache key.</summary>
    public required string TokenId { get; init; }

    /// <summary>User the token was issued to.</summary>
    public required string UserId { get; init; }

    /// <summary>Time the token was issued.</summary>
    public required DateTimeOffset IssuedAt { get; init; }

    /// <summary>Time after which validation treats the token as invalid.</summary>
    public required DateTimeOffset ExpiresAt { get; init; }

    /// <summary>Settable validity flag; validation requires it to be true.</summary>
    public required bool IsValid { get; set; }
}
|
||||
|
||||
/// <summary>
/// Outcome of a token issuance attempt.
/// </summary>
public sealed record TokenResult
{
    /// <summary>True when a token was issued.</summary>
    public required bool Success { get; init; }

    /// <summary>The issued token; left null on failure.</summary>
    public CachedToken? Token { get; init; }

    /// <summary>Error reported by the failed Authority request; null on success.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of a token validation attempt.
/// </summary>
public sealed record ValidationResult
{
    /// <summary>True when a verdict could be produced (by Authority or from cache).</summary>
    public required bool Success { get; init; }

    /// <summary>Whether the token was judged valid.</summary>
    public required bool IsValid { get; init; }

    /// <summary>Where the verdict came from.</summary>
    public required ValidationSource Source { get; init; }

    /// <summary>Reason for an unknown token or a failed validation; otherwise null.</summary>
    public string? Error { get; init; }

    /// <summary>Set when the verdict came from the cache because Authority was unavailable.</summary>
    public string? Warning { get; init; }
}
|
||||
|
||||
/// <summary>
/// Identifies what produced a token validation verdict.
/// </summary>
public enum ValidationSource
{
    /// <summary>No verdict: Authority was unavailable and the token was not cached.</summary>
    None,

    /// <summary>The simulated Authority request succeeded.</summary>
    Authority,

    /// <summary>Authority was unavailable; the local token cache was used.</summary>
    Cache
}
|
||||
|
||||
/// <summary>
/// A job held in the fixture's scheduler queue. Status and ProcessedAt are
/// mutated in place when the job is processed.
/// </summary>
public sealed class PendingJob
{
    /// <summary>Unique job identifier.</summary>
    public required string JobId { get; init; }

    /// <summary>Type label supplied by the caller (for example "scan").</summary>
    public required string JobType { get; init; }

    /// <summary>Payload supplied by the caller.</summary>
    public required string Payload { get; init; }

    /// <summary>Time the job was added to the queue.</summary>
    public required DateTimeOffset EnqueuedAt { get; init; }

    /// <summary>Current lifecycle state.</summary>
    public JobStatus Status { get; set; }

    /// <summary>Time the job was picked up for processing; null until then.</summary>
    public DateTimeOffset? ProcessedAt { get; set; }
}
|
||||
|
||||
/// <summary>
/// Lifecycle states of a queued job.
/// </summary>
public enum JobStatus
{
    /// <summary>Queued and not yet picked up; the state a job is enqueued with.</summary>
    Pending,

    /// <summary>Picked up by job processing.</summary>
    Processing,

    /// <summary>Finished successfully.</summary>
    Completed,

    /// <summary>Finished unsuccessfully.</summary>
    Failed
}
|
||||
|
||||
/// <summary>
/// Outcome of a job enqueue attempt.
/// </summary>
public sealed record JobResult
{
    /// <summary>True when the job was added to the local queue.</summary>
    public required bool Success { get; init; }

    /// <summary>The queued job.</summary>
    public PendingJob? Job { get; init; }

    /// <summary>Error text, if any.</summary>
    public string? Error { get; init; }

    /// <summary>Set when the job was queued but the Scheduler could not be notified.</summary>
    public string? Warning { get; init; }
}
|
||||
|
||||
/// <summary>
/// A versioned key/value record in the fixture's simulated data store.
/// </summary>
public sealed class PersistedData
{
    /// <summary>Key the record is stored under.</summary>
    public required string Key { get; init; }

    /// <summary>Stored value.</summary>
    public required string Value { get; init; }

    /// <summary>Timestamp recorded for the write.</summary>
    public required DateTimeOffset PersistedAt { get; init; }

    /// <summary>Write counter for the key: 1 for the first write, then incremented per successful write.</summary>
    public required int Version { get; set; }
}
|
||||
|
||||
/// <summary>
/// Outcome of a data persistence attempt.
/// </summary>
public sealed record PersistResult
{
    /// <summary>True when the record was written.</summary>
    public required bool Success { get; init; }

    /// <summary>The record as stored, including its version; left null on failure.</summary>
    public PersistedData? Data { get; init; }

    /// <summary>Error reported by the failed database request; null on success.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of a data read attempt.
/// </summary>
public sealed record ReadResult
{
    /// <summary>True when the database could be reached, even if the key was not found.</summary>
    public required bool Success { get; init; }

    /// <summary>The stored record; null when the key is unknown or the read failed.</summary>
    public PersistedData? Data { get; init; }

    /// <summary>Error reported by the failed database request; null on success.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// One entry in the cluster fixture's event log.
/// </summary>
public sealed record ClusterEvent
{
    /// <summary>Increasing number assigned when the event is logged.</summary>
    public required long Sequence { get; init; }

    /// <summary>Time the event was logged.</summary>
    public required DateTimeOffset Timestamp { get; init; }

    /// <summary>Service the event is attributed to (for example "authority" or "cluster").</summary>
    public required string Service { get; init; }

    /// <summary>Short event type name (for example "TokenIssued" or "PersistFailed").</summary>
    public required string EventType { get; init; }

    /// <summary>Free-form details text.</summary>
    public required string Details { get; init; }
}
|
||||
@@ -0,0 +1,273 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// ServiceFailureInjector.cs
|
||||
// Sprint: Testing Enhancement Advisory - Phase 3.3
|
||||
// Description: Service failure injection for control-plane chaos testing
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
|
||||
namespace StellaOps.Chaos.ControlPlane.Tests.Fixtures;
|
||||
|
||||
/// <summary>
/// Injects failures into control-plane services for chaos testing.
/// Supports three failure modes: full outage, partial (random) failures, and latency injection.
/// </summary>
/// <remarks>
/// Each service has a <see cref="ServiceState"/> (its reported status) and, while a failure is
/// active, a <see cref="FailureConfig"/>; both are keyed by service id. Injecting a failure
/// replaces any failure previously configured for the same service.
/// </remarks>
public sealed class ServiceFailureInjector
{
    private readonly ConcurrentDictionary<string, ServiceState> _serviceStates = new();
    private readonly ConcurrentDictionary<string, FailureConfig> _failureConfigs = new();

    // Fixed seed so that a given sequence of draws is reproducible between runs.
    private readonly Random _random = new(42);

    // System.Random instances are not thread-safe; every draw is taken under this lock.
    private readonly object _randomGate = new();

    /// <summary>
    /// Gets the current state of a service, registering it as healthy if it is not yet known.
    /// </summary>
    /// <param name="serviceId">Identifier of the service.</param>
    /// <returns>The live (mutable) state object tracked for <paramref name="serviceId"/>.</returns>
    public ServiceState GetServiceState(string serviceId)
    {
        return _serviceStates.GetOrAdd(serviceId, id => CreateHealthyState(id));
    }

    /// <summary>
    /// Registers a service for failure injection. Does nothing if the service is already registered.
    /// </summary>
    /// <param name="serviceId">Identifier of the service.</param>
    public void RegisterService(string serviceId)
    {
        _serviceStates.TryAdd(serviceId, CreateHealthyState(serviceId));
    }

    /// <summary>
    /// Causes a complete service outage: the service is marked <see cref="ServiceStatus.Down"/>
    /// and every simulated request fails until the service is recovered.
    /// </summary>
    /// <param name="serviceId">Identifier of the service; registered on the fly if unknown.</param>
    public void InjectFullOutage(string serviceId)
    {
        // One clock read so LastUpdated and OutageStarted describe the same instant.
        var now = DateTimeOffset.UtcNow;

        // GetServiceState registers unknown services, so the reported status can never
        // disagree with the failure configuration stored below.
        var state = GetServiceState(serviceId);
        state.Status = ServiceStatus.Down;
        state.LastUpdated = now;
        state.OutageStarted = now;

        _failureConfigs[serviceId] = new FailureConfig
        {
            ServiceId = serviceId,
            FailureMode = FailureMode.FullOutage,
            FailureRate = 1.0
        };
    }

    /// <summary>
    /// Causes partial failures (random request failures) and marks the service
    /// <see cref="ServiceStatus.Degraded"/>.
    /// </summary>
    /// <param name="serviceId">Identifier of the service; registered on the fly if unknown.</param>
    /// <param name="failureRate">
    /// Probability that a simulated request fails; clamped to the range 0.0 to 1.0.
    /// </param>
    public void InjectPartialFailure(string serviceId, double failureRate = 0.5)
    {
        var state = GetServiceState(serviceId);
        state.Status = ServiceStatus.Degraded;
        state.LastUpdated = DateTimeOffset.UtcNow;

        _failureConfigs[serviceId] = new FailureConfig
        {
            ServiceId = serviceId,
            FailureMode = FailureMode.PartialFailure,
            FailureRate = Math.Clamp(failureRate, 0.0, 1.0)
        };
    }

    /// <summary>
    /// Injects latency into service responses and marks the service <see cref="ServiceStatus.Slow"/>.
    /// </summary>
    /// <param name="serviceId">Identifier of the service; registered on the fly if unknown.</param>
    /// <param name="baseLatency">Latency reported for every simulated request.</param>
    /// <param name="jitter">
    /// Upper bound of the random extra latency added per request; defaults to no jitter.
    /// </param>
    public void InjectLatency(string serviceId, TimeSpan baseLatency, TimeSpan jitter = default)
    {
        var state = GetServiceState(serviceId);
        state.Status = ServiceStatus.Slow;
        state.LastUpdated = DateTimeOffset.UtcNow;

        _failureConfigs[serviceId] = new FailureConfig
        {
            ServiceId = serviceId,
            FailureMode = FailureMode.LatencyInjection,
            BaseLatency = baseLatency,
            LatencyJitter = jitter
        };
    }

    /// <summary>
    /// Recovers a service from failure: a registered service is marked healthy again and
    /// any failure configured for it is removed.
    /// </summary>
    /// <param name="serviceId">Identifier of the service.</param>
    public void RecoverService(string serviceId)
    {
        if (_serviceStates.TryGetValue(serviceId, out var state))
        {
            state.Status = ServiceStatus.Healthy;
            state.LastUpdated = DateTimeOffset.UtcNow;
            state.OutageStarted = null;
        }

        _failureConfigs.TryRemove(serviceId, out _);
    }

    /// <summary>
    /// Recovers all services and removes every configured failure.
    /// </summary>
    public void RecoverAll()
    {
        foreach (var serviceId in _serviceStates.Keys)
        {
            RecoverService(serviceId);
        }

        // Safety net: drop any configuration that has no matching registered state,
        // so no injected failure can outlive a full recovery.
        _failureConfigs.Clear();
    }

    /// <summary>
    /// Simulates a request to a service, applying any configured failure.
    /// </summary>
    /// <remarks>
    /// No real waiting takes place: injected latency is only reported through
    /// <see cref="ServiceRequestResult.Latency"/>, so the returned task is already completed.
    /// </remarks>
    /// <param name="serviceId">Identifier of the target service; registered on the fly if unknown.</param>
    /// <param name="ct">
    /// Cancellation token. If it is already cancelled, a cancelled task is returned and nothing is simulated.
    /// </param>
    /// <returns>
    /// A task whose result describes the outcome: <see cref="ServiceRequestResult.Success"/> is
    /// <c>false</c> when the request failed because of an injected failure, <c>true</c> otherwise.
    /// </returns>
    public Task<ServiceRequestResult> SimulateRequestAsync(
        string serviceId,
        CancellationToken ct = default)
    {
        if (ct.IsCancellationRequested)
        {
            return Task.FromCanceled<ServiceRequestResult>(ct);
        }

        // Called for its side effect: makes sure the target service is registered.
        _ = GetServiceState(serviceId);

        if (!_failureConfigs.TryGetValue(serviceId, out var config))
        {
            // No failure configured, request succeeds.
            return Task.FromResult(ImmediateSuccess(serviceId));
        }

        var result = config.FailureMode switch
        {
            FailureMode.FullOutage => new ServiceRequestResult
            {
                ServiceId = serviceId,
                Success = false,
                Error = $"Service {serviceId} is down (full outage)",
                Latency = TimeSpan.Zero
            },
            FailureMode.PartialFailure => SimulatePartialFailure(serviceId, config),
            FailureMode.LatencyInjection => SimulateLatency(serviceId, config),
            _ => ImmediateSuccess(serviceId)
        };

        return Task.FromResult(result);
    }

    /// <summary>
    /// Gets all services currently in outage (status <see cref="ServiceStatus.Down"/>).
    /// </summary>
    /// <returns>Identifiers of the matching services, as a snapshot.</returns>
    public IReadOnlyList<string> GetServicesInOutage()
    {
        return _serviceStates
            .Where(kvp => kvp.Value.Status == ServiceStatus.Down)
            .Select(kvp => kvp.Key)
            .ToList();
    }

    /// <summary>
    /// Gets all services currently degraded (status <see cref="ServiceStatus.Degraded"/>
    /// or <see cref="ServiceStatus.Slow"/>).
    /// </summary>
    /// <returns>Identifiers of the matching services, as a snapshot.</returns>
    public IReadOnlyList<string> GetDegradedServices()
    {
        return _serviceStates
            .Where(kvp => kvp.Value.Status is ServiceStatus.Degraded or ServiceStatus.Slow)
            .Select(kvp => kvp.Key)
            .ToList();
    }

    // Builds the initial (healthy) state for a newly seen service.
    private static ServiceState CreateHealthyState(string serviceId) => new()
    {
        ServiceId = serviceId,
        Status = ServiceStatus.Healthy,
        LastUpdated = DateTimeOffset.UtcNow
    };

    // Result for a request that is not affected by any injected failure.
    private static ServiceRequestResult ImmediateSuccess(string serviceId) => new()
    {
        ServiceId = serviceId,
        Success = true,
        Latency = TimeSpan.Zero
    };

    // Fails the request with probability config.FailureRate (one random draw per request).
    private ServiceRequestResult SimulatePartialFailure(string serviceId, FailureConfig config)
    {
        var shouldFail = NextRandomDouble() < config.FailureRate;

        return new ServiceRequestResult
        {
            ServiceId = serviceId,
            Success = !shouldFail,
            Error = shouldFail ? $"Service {serviceId} request failed (partial failure)" : null,
            Latency = TimeSpan.Zero
        };
    }

    // Reports base latency plus a random share of the jitter (one random draw per request).
    private ServiceRequestResult SimulateLatency(string serviceId, FailureConfig config)
    {
        var jitterMs = NextRandomDouble() * config.LatencyJitter.TotalMilliseconds;

        return new ServiceRequestResult
        {
            ServiceId = serviceId,
            Success = true,
            Latency = config.BaseLatency + TimeSpan.FromMilliseconds(jitterMs)
        };
    }

    // Single point of access to the shared generator; serializes concurrent draws.
    private double NextRandomDouble()
    {
        lock (_randomGate)
        {
            return _random.NextDouble();
        }
    }
}
|
||||
|
||||
/// <summary>
/// Mutable status record tracked for one service during chaos testing.
/// </summary>
public sealed class ServiceState
{
    /// <summary>Identifier of the service this state belongs to; fixed at creation.</summary>
    public required string ServiceId { get; init; }

    /// <summary>Current status of the service.</summary>
    public ServiceStatus Status { get; set; }

    /// <summary>Time of the most recent update to this state.</summary>
    public DateTimeOffset LastUpdated { get; set; }

    /// <summary>Time an outage began, or <c>null</c> when no outage start is recorded.</summary>
    public DateTimeOffset? OutageStarted { get; set; }
}
|
||||
|
||||
/// <summary>
/// Status levels a service can be in.
/// </summary>
public enum ServiceStatus
{
    /// <summary>Default status; no failure is in effect.</summary>
    Healthy = 0,

    /// <summary>Status assigned when partial failures are injected.</summary>
    Degraded = 1,

    /// <summary>Status assigned when latency is injected.</summary>
    Slow = 2,

    /// <summary>Status assigned when a full outage is injected.</summary>
    Down = 3
}
|
||||
|
||||
/// <summary>
/// Immutable description of the failure currently injected into one service.
/// </summary>
public sealed class FailureConfig
{
    /// <summary>Identifier of the service the failure applies to.</summary>
    public required string ServiceId { get; init; }

    /// <summary>Which kind of failure is injected.</summary>
    public FailureMode FailureMode { get; init; }

    /// <summary>Failure probability per request, used by the partial-failure mode.</summary>
    public double FailureRate { get; init; }

    /// <summary>Fixed part of the injected latency, used by the latency mode.</summary>
    public TimeSpan BaseLatency { get; init; }

    /// <summary>Upper bound of the random part of the injected latency, used by the latency mode.</summary>
    public TimeSpan LatencyJitter { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of failure that can be injected into a service.
/// </summary>
public enum FailureMode
{
    /// <summary>Default value; no failure.</summary>
    None = 0,

    /// <summary>Every request fails.</summary>
    FullOutage = 1,

    /// <summary>Requests fail at random.</summary>
    PartialFailure = 2,

    /// <summary>Requests succeed but report added latency.</summary>
    LatencyInjection = 3
}
|
||||
|
||||
/// <summary>
/// Outcome of one simulated request to a service.
/// </summary>
public sealed record ServiceRequestResult
{
    /// <summary>Identifier of the service the request was sent to.</summary>
    public required string ServiceId { get; init; }

    /// <summary>Whether the simulated request succeeded.</summary>
    public required bool Success { get; init; }

    /// <summary>Error description, when one is supplied; otherwise <c>null</c>.</summary>
    public string? Error { get; init; }

    /// <summary>Latency reported for the request; zero unless set.</summary>
    public TimeSpan Latency { get; init; }
}
|
||||
@@ -0,0 +1,406 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// PartialOutageTests.cs
|
||||
// Sprint: Testing Enhancement Advisory - Phase 3.3
|
||||
// Description: Tests for control-plane behavior during partial outages
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using FluentAssertions;
|
||||
using StellaOps.Chaos.ControlPlane.Tests.Fixtures;
|
||||
using StellaOps.TestKit;
|
||||
using Xunit;
|
||||
|
||||
namespace StellaOps.Chaos.ControlPlane.Tests;
|
||||
|
||||
/// <summary>
/// Tests for control-plane behavior during partial outage scenarios.
/// Validates graceful degradation under latency injection and partial failures.
/// </summary>
/// <remarks>
/// The cluster fixture is shared by every test in this class. The constructor recovers all
/// services and clears the event log before each test; other fixture state (for example
/// previously enqueued jobs) may carry over, so assertions on such state are made relative
/// to a baseline captured inside the test.
/// </remarks>
[Trait("Category", TestCategories.Chaos)]
[Trait("Category", "ControlPlane")]
[Trait("Category", "PartialOutage")]
public class PartialOutageTests : IClassFixture<ControlPlaneClusterFixture>
{
    private readonly ControlPlaneClusterFixture _fixture;

    /// <summary>
    /// Runs before each test: recovers every service and clears the fixture's event log.
    /// </summary>
    public PartialOutageTests(ControlPlaneClusterFixture fixture)
    {
        _fixture = fixture;
        _fixture.FailureInjector.RecoverAll();
        _fixture.ClearEventLog();
    }

    #region Partial Failure Rate Tests

    /// <summary>With a 50% failure rate, token issuance shows both successes and failures.</summary>
    [Fact]
    public async Task Authority_50PercentFailure_SomeTokensIssued()
    {
        // Arrange - 50% failure rate
        _fixture.FailureInjector.InjectPartialFailure("authority", 0.5);

        // Act - Try to issue multiple tokens
        var results = new List<TokenResult>();
        for (var i = 0; i < 20; i++)
        {
            results.Add(await _fixture.IssueTokenAsync($"user-{i}", TimeSpan.FromHours(1)));
        }

        // Assert - Roughly half should succeed (with some variance due to randomness)
        var successCount = results.Count(r => r.Success);
        var failureCount = results.Count(r => !r.Success);

        // Deliberately loose bounds: at least 4 of the 20 attempts (20%) on each side.
        successCount.Should().BeGreaterThan(3);
        failureCount.Should().BeGreaterThan(3);
    }

    /// <summary>With a 25% failure rate, the majority of writes still succeed.</summary>
    [Fact]
    public async Task Database_25PercentFailure_MostWritesSucceed()
    {
        // Arrange - 25% failure rate
        _fixture.FailureInjector.InjectPartialFailure("database", 0.25);

        // Act - Try multiple writes
        var results = new List<PersistResult>();
        for (var i = 0; i < 20; i++)
        {
            results.Add(await _fixture.PersistDataAsync($"key-{i}", $"value-{i}"));
        }

        // Assert - Most should succeed
        var successCount = results.Count(r => r.Success);
        successCount.Should().BeGreaterThan(10); // More than half of the 20 writes
    }

    /// <summary>Jobs are accepted and queued even when the scheduler fails most requests.</summary>
    [Fact]
    public async Task Scheduler_HighFailureRate_JobsStillPersisted()
    {
        // Arrange - 80% failure rate
        _fixture.FailureInjector.InjectPartialFailure("scheduler", 0.8);

        // The fixture is shared across the tests of this class, so jobs enqueued by other
        // tests may already be pending; measure against the count seen before this test acts.
        var pendingBefore = _fixture.GetPendingJobCount();

        // Act - Enqueue jobs
        var results = new List<JobResult>();
        for (var i = 0; i < 10; i++)
        {
            results.Add(await _fixture.EnqueueJobAsync("scan", $"image-{i}"));
        }

        // Assert - All jobs should be persisted locally (just notification may fail)
        results.Should().AllSatisfy(r => r.Success.Should().BeTrue());

        // All ten new jobs should be in the pending queue
        _fixture.GetPendingJobCount().Should().Be(pendingBefore + 10);
    }

    /// <summary>Retrying against a 50% failure rate succeeds within ten attempts.</summary>
    [Fact]
    public async Task PartialFailure_RetrySucceeds_Eventually()
    {
        // Arrange - 50% failure rate
        _fixture.FailureInjector.InjectPartialFailure("authority", 0.5);

        // Act - Keep trying until success (max 10 attempts)
        TokenResult? successResult = null;
        for (var attempt = 0; attempt < 10; attempt++)
        {
            var result = await _fixture.IssueTokenAsync("retry-user", TimeSpan.FromHours(1));
            if (result.Success)
            {
                successResult = result;
                break;
            }
        }

        // Assert - Should eventually succeed
        successResult.Should().NotBeNull();
        successResult!.Success.Should().BeTrue();
        successResult.Token.Should().NotBeNull();
    }

    #endregion

    #region Latency Injection Tests

    /// <summary>Token issuance still completes while latency is injected into Authority.</summary>
    [Fact]
    public async Task Authority_HighLatency_OperationsComplete()
    {
        // Arrange - 500ms base latency
        _fixture.FailureInjector.InjectLatency(
            "authority",
            TimeSpan.FromMilliseconds(500),
            TimeSpan.FromMilliseconds(100));

        // Act - Issue token
        var result = await _fixture.IssueTokenAsync("latency-user", TimeSpan.FromHours(1));

        // Assert - Should still complete
        result.Success.Should().BeTrue();
        result.Token.Should().NotBeNull();

        // The service state should show "Slow" status
        var serviceState = _fixture.FailureInjector.GetServiceState("authority");
        serviceState.Status.Should().Be(ServiceStatus.Slow);
    }

    /// <summary>Writes issued under variable latency all succeed and read back unchanged.</summary>
    [Fact]
    public async Task Database_VariableLatency_NoDataCorruption()
    {
        // Arrange - Variable latency with jitter
        _fixture.FailureInjector.InjectLatency(
            "database",
            TimeSpan.FromMilliseconds(200),
            TimeSpan.FromMilliseconds(300));

        // Act - Multiple concurrent-like writes
        var tasks = new List<Task<PersistResult>>();
        for (var i = 0; i < 10; i++)
        {
            tasks.Add(_fixture.PersistDataAsync($"latency-key-{i}", $"value-{i}"));
        }

        var results = await Task.WhenAll(tasks);

        // Assert - All should succeed
        results.Should().AllSatisfy(r => r.Success.Should().BeTrue());

        // Verify data integrity
        for (var i = 0; i < 10; i++)
        {
            var read = await _fixture.ReadDataAsync($"latency-key-{i}");
            read.Success.Should().BeTrue();
            read.Data!.Value.Should().Be($"value-{i}");
        }
    }

    /// <summary>Jobs enqueued sequentially under latency keep their enqueue order.</summary>
    [Fact]
    public async Task Scheduler_Latency_JobOrdering_Preserved()
    {
        // Arrange - Latency injection
        _fixture.FailureInjector.InjectLatency(
            "scheduler",
            TimeSpan.FromMilliseconds(100),
            TimeSpan.FromMilliseconds(50));

        // The fixture is shared across the tests of this class, so it may already hold jobs
        // from other tests; only the jobs appended by this test are inspected below.
        var existingJobCount = _fixture.GetAllJobs().Count();

        // Act - Enqueue jobs in sequence
        for (var i = 0; i < 5; i++)
        {
            await _fixture.EnqueueJobAsync("scan", $"ordered-{i}");
        }

        // Assert - Jobs should be in order
        var jobs = _fixture.GetAllJobs().Skip(existingJobCount).ToList();
        jobs.Should().HaveCount(5);

        for (var i = 0; i < 5; i++)
        {
            jobs[i].Payload.Should().Be($"ordered-{i}");
        }
    }

    #endregion

    #region Degraded Service Tests

    /// <summary>Tokens issued while healthy still validate when Authority is degraded.</summary>
    [Fact]
    public async Task DegradedAuthority_CacheHitRate_Improves()
    {
        // Arrange - Issue tokens while healthy
        var tokens = new List<string>();
        for (var i = 0; i < 5; i++)
        {
            var result = await _fixture.IssueTokenAsync($"user-{i}", TimeSpan.FromHours(1));
            if (result.Success)
            {
                tokens.Add(result.Token!.TokenId);
            }
        }

        // Authority becomes degraded (partial failure)
        _fixture.FailureInjector.InjectPartialFailure("authority", 0.7);

        // Act - Validate cached tokens
        var validations = new List<ValidationResult>();
        foreach (var tokenId in tokens)
        {
            validations.Add(await _fixture.ValidateTokenAsync(tokenId));
        }

        // Assert - All should succeed (either from Authority or cache)
        validations.Should().AllSatisfy(v =>
        {
            v.Success.Should().BeTrue();
            v.IsValid.Should().BeTrue();
        });

        // Some should come from cache
        var cacheHits = validations.Count(v => v.Source == ValidationSource.Cache);
        cacheHits.Should().BeGreaterThan(0);
    }

    /// <summary>The system stays usable while several services are degraded at once.</summary>
    [Fact]
    public async Task MultipleDegraded_Services_GracefulDegradation()
    {
        // Arrange - Multiple services degraded
        _fixture.FailureInjector.InjectPartialFailure("authority", 0.3);
        _fixture.FailureInjector.InjectLatency("database", TimeSpan.FromMilliseconds(200));
        _fixture.FailureInjector.InjectPartialFailure("scheduler", 0.2);

        // Act - Perform various operations
        var tokenResults = new List<bool>();
        var persistResults = new List<bool>();
        var jobResults = new List<bool>();

        for (var i = 0; i < 10; i++)
        {
            var tokenResult = await _fixture.IssueTokenAsync($"user-{i}", TimeSpan.FromHours(1));
            tokenResults.Add(tokenResult.Success);

            var persistResult = await _fixture.PersistDataAsync($"key-{i}", $"value-{i}");
            persistResults.Add(persistResult.Success);

            var jobResult = await _fixture.EnqueueJobAsync("scan", $"image-{i}");
            jobResults.Add(jobResult.Success);
        }

        // Assert - System remains functional despite degradation
        tokenResults.Count(r => r).Should().BeGreaterThan(5);
        persistResults.Count(r => r).Should().BeGreaterThan(5);
        jobResults.Should().AllSatisfy(r => r.Should().BeTrue()); // Jobs always persist locally
    }

    #endregion

    #region Recovery from Partial Outage Tests

    /// <summary>After recovering from a partial outage, every token request succeeds again.</summary>
    [Fact]
    public async Task PartialOutage_Recovery_FullFunctionality_Restored()
    {
        // Arrange - Start with partial failure
        _fixture.FailureInjector.InjectPartialFailure("authority", 0.5);

        // Some operations may fail during the partial outage; the outcomes are collected
        // only to exercise the degraded path and are intentionally not asserted on.
        var duringOutage = new List<TokenResult>();
        for (var i = 0; i < 5; i++)
        {
            duringOutage.Add(await _fixture.IssueTokenAsync($"user-{i}", TimeSpan.FromHours(1)));
        }

        // Act - Recover
        _fixture.FailureInjector.RecoverService("authority");

        // All operations should succeed now
        var afterRecovery = new List<TokenResult>();
        for (var i = 5; i < 10; i++)
        {
            afterRecovery.Add(await _fixture.IssueTokenAsync($"user-{i}", TimeSpan.FromHours(1)));
        }

        // Assert
        afterRecovery.Should().AllSatisfy(r => r.Success.Should().BeTrue());

        var serviceState = _fixture.FailureInjector.GetServiceState("authority");
        serviceState.Status.Should().Be(ServiceStatus.Healthy);
    }

    /// <summary>A service marked slow by latency injection reports healthy again after recovery.</summary>
    [Fact]
    public Task LatencyRecovery_PerformanceReturns_ToNormal()
    {
        // Arrange - High latency
        _fixture.FailureInjector.InjectLatency(
            "database",
            TimeSpan.FromSeconds(1),
            TimeSpan.FromMilliseconds(500));

        // Note: In real scenario, we'd measure actual latency
        // Here we just verify state changes
        var slowState = _fixture.FailureInjector.GetServiceState("database");
        slowState.Status.Should().Be(ServiceStatus.Slow);

        // Act - Recover
        _fixture.FailureInjector.RecoverService("database");

        // Assert - Back to healthy
        var healthyState = _fixture.FailureInjector.GetServiceState("database");
        healthyState.Status.Should().Be(ServiceStatus.Healthy);

        // Nothing here is asynchronous; the Task return type is kept for a stable signature.
        return Task.CompletedTask;
    }

    #endregion

    #region Service Isolation Tests

    /// <summary>Degrading Authority leaves database and scheduler operations unaffected.</summary>
    [Fact]
    public async Task SingleService_Degraded_OthersUnaffected()
    {
        // Arrange - Only Authority degraded
        _fixture.FailureInjector.InjectPartialFailure("authority", 0.9);

        // Act - Database and Scheduler operations
        var dbResult = await _fixture.PersistDataAsync("isolated-key", "isolated-value");
        var jobResult = await _fixture.EnqueueJobAsync("scan", "isolated-image");

        // Assert - Unaffected services work normally
        dbResult.Success.Should().BeTrue();
        jobResult.Success.Should().BeTrue();

        // But Authority is degraded
        var authState = _fixture.FailureInjector.GetServiceState("authority");
        authState.Status.Should().Be(ServiceStatus.Degraded);

        var dbState = _fixture.FailureInjector.GetServiceState("database");
        dbState.Status.Should().Be(ServiceStatus.Healthy);
    }

    /// <summary>Database degradation does not spill over into Authority or the scheduler.</summary>
    [Fact]
    public async Task CascadingDegradation_DoesNotOccur()
    {
        // Arrange - Database degraded
        _fixture.FailureInjector.InjectPartialFailure("database", 0.5);

        // Act - Authority should work independently
        var tokenResult = await _fixture.IssueTokenAsync("cascade-user", TimeSpan.FromHours(1));

        // Assert - Authority unaffected by database degradation
        tokenResult.Success.Should().BeTrue();

        // Scheduler also unaffected
        var jobResult = await _fixture.EnqueueJobAsync("cascade-scan", "image");
        jobResult.Success.Should().BeTrue();
    }

    #endregion

    #region Flapping Service Tests

    /// <summary>A service alternating between degraded and healthy still serves most requests.</summary>
    [Fact]
    public async Task FlappingService_SystemRemains_Stable()
    {
        // Simulate a flapping service (alternating between healthy and degraded)
        var allResults = new List<bool>();

        for (var cycle = 0; cycle < 5; cycle++)
        {
            // Service degrades
            _fixture.FailureInjector.InjectPartialFailure("authority", 0.5);

            for (var i = 0; i < 3; i++)
            {
                var result = await _fixture.IssueTokenAsync($"flap-user-{cycle}-{i}", TimeSpan.FromHours(1));
                allResults.Add(result.Success);
            }

            // Service recovers
            _fixture.FailureInjector.RecoverService("authority");

            for (var i = 0; i < 3; i++)
            {
                var result = await _fixture.IssueTokenAsync($"stable-user-{cycle}-{i}", TimeSpan.FromHours(1));
                allResults.Add(result.Success);
            }
        }

        // Assert - System handled the flapping without crashing
        // Most operations during stable periods should succeed
        allResults.Should().NotBeEmpty();
        allResults.Count(r => r).Should().BeGreaterThan(allResults.Count / 2);
    }

    #endregion
}
|
||||
@@ -0,0 +1,27 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
<Project Sdk="Microsoft.NET.Sdk">
  <!-- Sprint: Testing Enhancement Advisory - Phase 3.3 -->
  <!-- Description: Control-plane outage E2E chaos tests -->

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <LangVersion>preview</LangVersion>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <IsPackable>false</IsPackable>
    <IsTestProject>true</IsTestProject>
    <!-- Every compiler warning fails the build for this project. -->
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <RootNamespace>StellaOps.Chaos.ControlPlane.Tests</RootNamespace>
  </PropertyGroup>

  <!-- No Version attributes here; versions are presumably managed centrally (verify against the repository's package props). -->
  <ItemGroup>
    <PackageReference Include="FluentAssertions" />
    <PackageReference Include="Testcontainers" />
    <PackageReference Include="Testcontainers.PostgreSql" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\..\..\__Libraries\StellaOps.TestKit\StellaOps.TestKit.csproj" />
  </ItemGroup>
</Project>
|
||||
@@ -1,7 +1,7 @@
|
||||
# Chaos Router Tests Task Board
|
||||
|
||||
This board mirrors active sprint tasks for this module.
|
||||
Source of truth: `docs/implplan/SPRINT_20251229_049_BE_csproj_audit_maint_tests.md`.
|
||||
Source of truth: `docs-archived/implplan/2025-12-29-csproj-audit/SPRINT_20251229_049_BE_csproj_audit_maint_tests.md`.
|
||||
|
||||
| Task ID | Status | Notes |
|
||||
| --- | --- | --- |
|
||||
|
||||
Reference in New Issue
Block a user