sln build fix (again), tests fixes, audit work and doctors work

This commit is contained in:
master
2026-01-12 22:15:51 +02:00
parent 9873f80830
commit 9330c64349
812 changed files with 48051 additions and 3891 deletions

View File

@@ -0,0 +1,381 @@
// -----------------------------------------------------------------------------
// ControlPlaneOutageTests.cs
// Sprint: Testing Enhancement Advisory - Phase 3.3
// Description: Tests for control-plane behavior during full outage scenarios
// -----------------------------------------------------------------------------
using FluentAssertions;
using StellaOps.Chaos.ControlPlane.Tests.Fixtures;
using StellaOps.TestKit;
using Xunit;
namespace StellaOps.Chaos.ControlPlane.Tests;
/// <summary>
/// Tests for control-plane behavior during full outage scenarios.
/// Validates graceful degradation, data durability, and recovery.
/// </summary>
[Trait("Category", TestCategories.Chaos)]
[Trait("Category", "ControlPlane")]
public class ControlPlaneOutageTests : IClassFixture<ControlPlaneClusterFixture>
{
    // NOTE: the fixture instance is shared by every test in this class. The constructor
    // only resets failure injection and the event log; tokens, jobs and persisted data
    // written by earlier tests remain visible. Tests therefore use unique keys and
    // assert job counts relative to a baseline instead of absolute values.
    private readonly ControlPlaneClusterFixture _fixture;

    public ControlPlaneOutageTests(ControlPlaneClusterFixture fixture)
    {
        _fixture = fixture;
        _fixture.FailureInjector.RecoverAll();
        _fixture.ClearEventLog();
    }

    #region Authority Outage Tests

    [Fact]
    public async Task Authority_Outage_CachedTokens_AllowTemporaryAccess()
    {
        // Arrange - Issue tokens while Authority is healthy
        var token1 = await _fixture.IssueTokenAsync("user-1", TimeSpan.FromHours(1));
        var token2 = await _fixture.IssueTokenAsync("user-2", TimeSpan.FromHours(1));
        token1.Success.Should().BeTrue();
        token2.Success.Should().BeTrue();

        // Act - Authority goes down
        _fixture.FailureInjector.InjectFullOutage("authority");

        // Try to validate tokens
        var validation1 = await _fixture.ValidateTokenAsync(token1.Token!.TokenId);
        var validation2 = await _fixture.ValidateTokenAsync(token2.Token!.TokenId);

        // Assert - Cached tokens should still validate
        validation1.Success.Should().BeTrue();
        validation1.IsValid.Should().BeTrue();
        validation1.Source.Should().Be(ValidationSource.Cache);
        validation1.Warning.Should().Contain("Authority unavailable");
        validation2.Success.Should().BeTrue();
        validation2.IsValid.Should().BeTrue();
        validation2.Source.Should().Be(ValidationSource.Cache);
    }

    [Fact]
    public async Task Authority_Outage_NewTokens_CannotBeIssued()
    {
        // Arrange - Authority goes down
        _fixture.FailureInjector.InjectFullOutage("authority");

        // Act - Try to issue a new token
        var result = await _fixture.IssueTokenAsync("new-user", TimeSpan.FromHours(1));

        // Assert - Should fail
        result.Success.Should().BeFalse();
        result.Token.Should().BeNull();
        result.Error.Should().NotBeNullOrEmpty();
    }

    [Fact]
    public async Task Authority_Outage_UncachedTokens_FailValidation()
    {
        // Arrange - Authority goes down immediately
        _fixture.FailureInjector.InjectFullOutage("authority");

        // Act - Try to validate a token that was never cached
        var result = await _fixture.ValidateTokenAsync("nonexistent-token");

        // Assert - Should fail gracefully
        result.Success.Should().BeFalse();
        result.IsValid.Should().BeFalse();
        result.Source.Should().Be(ValidationSource.None);
        result.Error.Should().Contain("not in cache");
    }

    [Fact]
    public async Task Authority_Recovery_TokenValidation_UsesAuthorityAgain()
    {
        // Arrange - Issue token while healthy
        var token = await _fixture.IssueTokenAsync("user-1", TimeSpan.FromHours(1));
        token.Success.Should().BeTrue();

        // Authority goes down
        _fixture.FailureInjector.InjectFullOutage("authority");
        var duringOutage = await _fixture.ValidateTokenAsync(token.Token!.TokenId);
        duringOutage.Source.Should().Be(ValidationSource.Cache);

        // Act - Authority recovers
        _fixture.FailureInjector.RecoverService("authority");
        var afterRecovery = await _fixture.ValidateTokenAsync(token.Token!.TokenId);

        // Assert - Should use Authority again
        afterRecovery.Success.Should().BeTrue();
        afterRecovery.IsValid.Should().BeTrue();
        afterRecovery.Source.Should().Be(ValidationSource.Authority);
        afterRecovery.Warning.Should().BeNull();
    }

    #endregion

    #region Scheduler Outage Tests

    [Fact]
    public async Task Scheduler_Outage_PendingJobs_NotLost()
    {
        // Arrange - The job queue is shared with other tests in this class, so
        // remember how many jobs were already pending before this test started.
        var pendingBaseline = _fixture.GetPendingJobCount();

        // Enqueue jobs while healthy
        var job1 = await _fixture.EnqueueJobAsync("scan", "image:tag1");
        var job2 = await _fixture.EnqueueJobAsync("scan", "image:tag2");
        job1.Success.Should().BeTrue();
        job2.Success.Should().BeTrue();

        // Scheduler goes down
        _fixture.FailureInjector.InjectFullOutage("scheduler");

        // Act - Enqueue more jobs during outage
        var job3 = await _fixture.EnqueueJobAsync("scan", "image:tag3");
        var job4 = await _fixture.EnqueueJobAsync("policy", "check-1");

        // Assert - Jobs are persisted locally even during outage
        job3.Success.Should().BeTrue();
        job3.Warning.Should().Contain("scheduler notification failed");
        job4.Success.Should().BeTrue();
        job4.Warning.Should().Contain("scheduler notification failed");

        // All four jobs from this test should have been added to the queue
        _fixture.GetPendingJobCount().Should().Be(pendingBaseline + 4);
    }

    [Fact]
    public async Task Scheduler_Recovery_PendingJobs_ProcessedInOrder()
    {
        // Arrange - Jobs left pending by earlier tests are processed too, so the
        // expected counts are computed relative to the starting queue depth.
        var pendingBaseline = _fixture.GetPendingJobCount();

        // Scheduler down, enqueue jobs
        _fixture.FailureInjector.InjectFullOutage("scheduler");
        await _fixture.EnqueueJobAsync("scan", "image:tag1");
        await _fixture.EnqueueJobAsync("scan", "image:tag2");
        await _fixture.EnqueueJobAsync("scan", "image:tag3");
        var pendingBefore = _fixture.GetPendingJobCount();
        pendingBefore.Should().Be(pendingBaseline + 3);

        // Act - Scheduler recovers
        _fixture.FailureInjector.RecoverService("scheduler");
        var processedCount = await _fixture.ProcessPendingJobsAsync();

        // Assert - Every pending job was processed
        processedCount.Should().Be(pendingBefore);
        _fixture.GetPendingJobCount().Should().Be(0);
        var allJobs = _fixture.GetAllJobs();
        allJobs.Should().AllSatisfy(j => j.Status.Should().Be(JobStatus.Processing));
    }

    [Fact]
    public async Task Scheduler_IntermittentOutage_NoJobDuplication()
    {
        // Arrange - Normal operation
        var job1 = await _fixture.EnqueueJobAsync("scan", "image:tag1");
        job1.Success.Should().BeTrue();
        job1.Warning.Should().BeNull();

        // Outage
        _fixture.FailureInjector.InjectFullOutage("scheduler");
        var job2 = await _fixture.EnqueueJobAsync("scan", "image:tag2");
        job2.Warning.Should().NotBeNull();

        // Recovery
        _fixture.FailureInjector.RecoverService("scheduler");
        var job3 = await _fixture.EnqueueJobAsync("scan", "image:tag3");
        job3.Warning.Should().BeNull();

        // Assert - No duplicates
        var allJobs = _fixture.GetAllJobs();
        var uniqueJobIds = allJobs.Select(j => j.JobId).Distinct().Count();
        uniqueJobIds.Should().Be(allJobs.Count);
    }

    #endregion

    #region Full Control-Plane Outage Tests

    [Fact]
    public async Task FullControlPlane_Outage_DataPersistence_Verified()
    {
        // Arrange - Persist data while healthy
        var data1 = await _fixture.PersistDataAsync("config-1", "value-1");
        var data2 = await _fixture.PersistDataAsync("config-2", "value-2");
        var data3 = await _fixture.PersistDataAsync("config-3", "value-3");
        data1.Success.Should().BeTrue();
        data2.Success.Should().BeTrue();
        data3.Success.Should().BeTrue();

        // Act - Full control-plane outage
        _fixture.InjectFullControlPlaneOutage();

        // Recovery
        _fixture.RecoverControlPlane();

        // Assert - All data should be intact
        var read1 = await _fixture.ReadDataAsync("config-1");
        var read2 = await _fixture.ReadDataAsync("config-2");
        var read3 = await _fixture.ReadDataAsync("config-3");
        read1.Success.Should().BeTrue();
        read1.Data.Should().NotBeNull();
        read1.Data!.Value.Should().Be("value-1");
        read2.Success.Should().BeTrue();
        read2.Data!.Value.Should().Be("value-2");
        read3.Success.Should().BeTrue();
        read3.Data!.Value.Should().Be("value-3");
    }

    [Fact]
    public async Task FullControlPlane_Outage_AllOperations_FailGracefully()
    {
        // Arrange - Full outage
        _fixture.InjectFullControlPlaneOutage();

        // Act - Try various operations
        var tokenResult = await _fixture.IssueTokenAsync("user", TimeSpan.FromHours(1));
        var persistResult = await _fixture.PersistDataAsync("key", "value");
        var readResult = await _fixture.ReadDataAsync("any-key");

        // Assert - All fail gracefully without exceptions
        tokenResult.Success.Should().BeFalse();
        tokenResult.Error.Should().NotBeNullOrEmpty();
        persistResult.Success.Should().BeFalse();
        persistResult.Error.Should().NotBeNullOrEmpty();
        readResult.Success.Should().BeFalse();
        readResult.Error.Should().NotBeNullOrEmpty();
    }

    [Fact]
    public async Task FullControlPlane_Recovery_SystemResumes_NormalOperation()
    {
        // Arrange - Operations before outage
        var tokenBefore = await _fixture.IssueTokenAsync("user-before", TimeSpan.FromHours(1));
        await _fixture.PersistDataAsync("data-before", "value-before");

        // Full outage
        _fixture.InjectFullControlPlaneOutage();

        // Verify outage
        var duringOutage = await _fixture.IssueTokenAsync("user-during", TimeSpan.FromHours(1));
        duringOutage.Success.Should().BeFalse();

        // Act - Recovery
        _fixture.RecoverControlPlane();

        // Assert - Normal operations resume
        var tokenAfter = await _fixture.IssueTokenAsync("user-after", TimeSpan.FromHours(1));
        tokenAfter.Success.Should().BeTrue();
        var dataAfter = await _fixture.PersistDataAsync("data-after", "value-after");
        dataAfter.Success.Should().BeTrue();
        var readBefore = await _fixture.ReadDataAsync("data-before");
        readBefore.Success.Should().BeTrue();
        readBefore.Data!.Value.Should().Be("value-before");
        var readAfter = await _fixture.ReadDataAsync("data-after");
        readAfter.Success.Should().BeTrue();
        readAfter.Data!.Value.Should().Be("value-after");
    }

    [Fact]
    public async Task FullControlPlane_Outage_EventLogCaptures_AllFailures()
    {
        // Arrange
        _fixture.ClearEventLog();
        _fixture.InjectFullControlPlaneOutage();

        // Act - Generate failures
        await _fixture.IssueTokenAsync("user", TimeSpan.FromHours(1));
        await _fixture.PersistDataAsync("key", "value");

        // Assert - Event log contains failure entries
        var events = _fixture.EventLog;
        events.Should().NotBeEmpty();
        var failureEvents = events.Where(e =>
            e.EventType.Contains("Failed", StringComparison.OrdinalIgnoreCase));
        failureEvents.Should().NotBeEmpty();
    }

    #endregion

    #region Data Integrity Tests

    [Fact]
    public async Task Database_Outage_WritesFail_ReadsFail()
    {
        // Arrange - Database outage only
        _fixture.FailureInjector.InjectFullOutage("database");

        // Act
        var writeResult = await _fixture.PersistDataAsync("test-key", "test-value");
        var readResult = await _fixture.ReadDataAsync("test-key");

        // Assert
        writeResult.Success.Should().BeFalse();
        readResult.Success.Should().BeFalse();
    }

    [Fact]
    public async Task Database_Recovery_DataVersioning_Correct()
    {
        // Arrange - Write initial version ("versioned-key" is used by this test only,
        // so the version counter is guaranteed to start at 1)
        var v1 = await _fixture.PersistDataAsync("versioned-key", "value-v1");
        v1.Success.Should().BeTrue();
        v1.Data!.Version.Should().Be(1);

        // Update
        var v2 = await _fixture.PersistDataAsync("versioned-key", "value-v2");
        v2.Success.Should().BeTrue();
        v2.Data!.Version.Should().Be(2);

        // Database outage
        _fixture.FailureInjector.InjectFullOutage("database");
        var failedWrite = await _fixture.PersistDataAsync("versioned-key", "value-v3");
        failedWrite.Success.Should().BeFalse();

        // Recovery
        _fixture.FailureInjector.RecoverService("database");

        // Act - Write after recovery
        var v3 = await _fixture.PersistDataAsync("versioned-key", "value-v3");

        // Assert - Version continues from last successful write
        v3.Success.Should().BeTrue();
        v3.Data!.Version.Should().Be(3);
        v3.Data.Value.Should().Be("value-v3");
    }

    [Fact]
    public async Task MixedOutage_SomeServices_OthersHealthy()
    {
        // Arrange - Only Authority and Database down, Scheduler healthy
        _fixture.FailureInjector.InjectFullOutage("authority");
        _fixture.FailureInjector.InjectFullOutage("database");

        // Act - Scheduler operations should work
        var jobResult = await _fixture.EnqueueJobAsync("scan", "test-image");

        // Assert
        jobResult.Success.Should().BeTrue();
        _fixture.GetPendingJobCount().Should().BeGreaterThanOrEqualTo(1);

        // But Authority/Database operations fail
        var tokenResult = await _fixture.IssueTokenAsync("user", TimeSpan.FromHours(1));
        tokenResult.Success.Should().BeFalse();
        var persistResult = await _fixture.PersistDataAsync("key", "value");
        persistResult.Success.Should().BeFalse();
    }

    #endregion
}

View File

@@ -0,0 +1,525 @@
// -----------------------------------------------------------------------------
// ControlPlaneClusterFixture.cs
// Sprint: Testing Enhancement Advisory - Phase 3.3
// Description: Test fixture for simulating control-plane cluster with outage scenarios
// -----------------------------------------------------------------------------
using System.Collections.Concurrent;
using System.Collections.Immutable;
using Xunit;
namespace StellaOps.Chaos.ControlPlane.Tests.Fixtures;
/// <summary>
/// Test fixture that simulates a control-plane cluster with multiple services.
/// Enables chaos testing of outage scenarios.
/// </summary>
public sealed class ControlPlaneClusterFixture : IAsyncLifetime
{
    private readonly ServiceFailureInjector _failureInjector = new();
    private readonly ConcurrentDictionary<string, MockServiceState> _serviceStates = new();
    private readonly ConcurrentQueue<ClusterEvent> _eventLog = new();
    private readonly ConcurrentDictionary<string, CachedToken> _tokenCache = new();
    private readonly ConcurrentQueue<PendingJob> _pendingJobs = new();
    private readonly ConcurrentDictionary<string, PersistedData> _dataStore = new();
    private long _eventSequence;

    /// <summary>
    /// Gets the failure injector for this cluster.
    /// </summary>
    public ServiceFailureInjector FailureInjector => _failureInjector;

    /// <summary>
    /// Gets a snapshot of all cluster events recorded so far.
    /// </summary>
    public IReadOnlyCollection<ClusterEvent> EventLog => _eventLog.ToImmutableArray();

    /// <summary>
    /// Gets a snapshot of all cached tokens, keyed by token id.
    /// </summary>
    public IReadOnlyDictionary<string, CachedToken> TokenCache => _tokenCache.ToImmutableDictionary();

    /// <summary>
    /// Gets a snapshot of every job in the local queue (regardless of status).
    /// </summary>
    public IReadOnlyCollection<PendingJob> PendingJobs => _pendingJobs.ToImmutableArray();

    /// <summary>
    /// Gets a snapshot of all persisted data, keyed by data key.
    /// </summary>
    public IReadOnlyDictionary<string, PersistedData> DataStore => _dataStore.ToImmutableDictionary();

    /// <inheritdoc />
    public ValueTask InitializeAsync()
    {
        // Register default control-plane services
        RegisterService("authority", ServiceType.Authority);
        RegisterService("scheduler", ServiceType.Scheduler);
        RegisterService("gateway", ServiceType.Gateway);
        RegisterService("backend", ServiceType.Backend);
        RegisterService("database", ServiceType.Database);
        return ValueTask.CompletedTask;
    }

    /// <inheritdoc />
    public ValueTask DisposeAsync()
    {
        _failureInjector.RecoverAll();
        _serviceStates.Clear();
        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Registers a service in the cluster and with the failure injector.
    /// </summary>
    /// <param name="serviceId">Identifier used to address the service in failure injection.</param>
    /// <param name="serviceType">Kind of control-plane service being registered.</param>
    public void RegisterService(string serviceId, ServiceType serviceType)
    {
        _failureInjector.RegisterService(serviceId);
        _serviceStates[serviceId] = new MockServiceState
        {
            ServiceId = serviceId,
            ServiceType = serviceType,
            IsHealthy = true,
            StartedAt = DateTimeOffset.UtcNow
        };
    }

    /// <summary>
    /// Simulates Authority service issuing a token.
    /// </summary>
    /// <param name="userId">User the token is issued for.</param>
    /// <param name="validity">Lifetime of the token, measured from the issue time.</param>
    /// <param name="ct">Cancellation token forwarded to the simulated request.</param>
    /// <returns>
    /// A successful result carrying the new token, or a failed result carrying the
    /// injected error when the "authority" service request fails.
    /// </returns>
    public async Task<TokenResult> IssueTokenAsync(string userId, TimeSpan validity, CancellationToken ct = default)
    {
        var requestResult = await _failureInjector.SimulateRequestAsync("authority", ct);
        if (!requestResult.Success)
        {
            LogEvent("authority", "TokenIssueFailed", $"User: {userId}, Error: {requestResult.Error}");
            return new TokenResult
            {
                Success = false,
                Error = requestResult.Error
            };
        }

        // Read the clock once so that ExpiresAt - IssuedAt is exactly the requested validity.
        var issuedAt = DateTimeOffset.UtcNow;
        var token = new CachedToken
        {
            TokenId = Guid.NewGuid().ToString("N"),
            UserId = userId,
            IssuedAt = issuedAt,
            ExpiresAt = issuedAt.Add(validity),
            IsValid = true
        };
        _tokenCache[token.TokenId] = token;
        LogEvent("authority", "TokenIssued", $"TokenId: {token.TokenId}, User: {userId}");
        return new TokenResult
        {
            Success = true,
            Token = token
        };
    }

    /// <summary>
    /// Validates a token, using cache if Authority is unavailable.
    /// </summary>
    /// <param name="tokenId">Identifier of the token to validate.</param>
    /// <param name="ct">Cancellation token forwarded to the simulated request.</param>
    /// <returns>
    /// A result whose <see cref="ValidationResult.Source"/> tells whether the answer came from
    /// the Authority, from the local cache (with a warning), or from neither (failure).
    /// </returns>
    public async Task<ValidationResult> ValidateTokenAsync(string tokenId, CancellationToken ct = default)
    {
        // Try to reach Authority
        var requestResult = await _failureInjector.SimulateRequestAsync("authority", ct);
        if (requestResult.Success)
        {
            // Authority available - validate directly
            if (_tokenCache.TryGetValue(tokenId, out var token))
            {
                var isValid = token.IsValid && token.ExpiresAt > DateTimeOffset.UtcNow;
                LogEvent("authority", "TokenValidated", $"TokenId: {tokenId}, Valid: {isValid}");
                return new ValidationResult
                {
                    Success = true,
                    IsValid = isValid,
                    Source = ValidationSource.Authority
                };
            }

            return new ValidationResult
            {
                Success = true,
                IsValid = false,
                Source = ValidationSource.Authority,
                Error = "Token not found"
            };
        }

        // Authority unavailable - check local cache
        if (_tokenCache.TryGetValue(tokenId, out var cachedToken))
        {
            var isValid = cachedToken.IsValid && cachedToken.ExpiresAt > DateTimeOffset.UtcNow;
            LogEvent("authority", "TokenValidatedFromCache", $"TokenId: {tokenId}, Valid: {isValid}");
            return new ValidationResult
            {
                Success = true,
                IsValid = isValid,
                Source = ValidationSource.Cache,
                Warning = "Authority unavailable, used cached token"
            };
        }

        LogEvent("authority", "TokenValidationFailed", $"TokenId: {tokenId}, Authority unavailable, no cache");
        return new ValidationResult
        {
            Success = false,
            IsValid = false,
            Source = ValidationSource.None,
            Error = "Authority unavailable and token not in cache"
        };
    }

    /// <summary>
    /// Enqueues a job with the Scheduler.
    /// </summary>
    /// <param name="jobType">Type label of the job.</param>
    /// <param name="payload">Opaque job payload.</param>
    /// <param name="ct">Cancellation token forwarded to the simulated request.</param>
    /// <returns>
    /// Always a successful result: the job is stored locally before the scheduler is contacted.
    /// A warning is attached when the scheduler notification fails.
    /// </returns>
    public async Task<JobResult> EnqueueJobAsync(string jobType, string payload, CancellationToken ct = default)
    {
        var job = new PendingJob
        {
            JobId = Guid.NewGuid().ToString("N"),
            JobType = jobType,
            Payload = payload,
            EnqueuedAt = DateTimeOffset.UtcNow,
            Status = JobStatus.Pending
        };

        // Always persist to local queue first (durability)
        _pendingJobs.Enqueue(job);
        LogEvent("scheduler", "JobEnqueued", $"JobId: {job.JobId}, Type: {jobType}");

        // Try to notify scheduler
        var requestResult = await _failureInjector.SimulateRequestAsync("scheduler", ct);
        if (!requestResult.Success)
        {
            LogEvent("scheduler", "SchedulerNotifyFailed", $"JobId: {job.JobId}, Error: {requestResult.Error}");
            return new JobResult
            {
                Success = true,
                Job = job,
                Warning = "Job persisted but scheduler notification failed"
            };
        }

        return new JobResult
        {
            Success = true,
            Job = job
        };
    }

    /// <summary>
    /// Processes pending jobs when scheduler recovers.
    /// </summary>
    /// <param name="ct">Cancellation token forwarded to the simulated request.</param>
    /// <returns>
    /// Number of jobs moved from <see cref="JobStatus.Pending"/> to <see cref="JobStatus.Processing"/>;
    /// zero when the scheduler request fails.
    /// </returns>
    public async Task<int> ProcessPendingJobsAsync(CancellationToken ct = default)
    {
        var requestResult = await _failureInjector.SimulateRequestAsync("scheduler", ct);
        if (!requestResult.Success)
        {
            LogEvent("scheduler", "ProcessingFailed", $"Error: {requestResult.Error}");
            return 0;
        }

        var processedCount = 0;

        // Jobs stay in the queue; only their status changes, so work on a snapshot.
        var jobsSnapshot = _pendingJobs.ToArray();
        foreach (var job in jobsSnapshot.Where(j => j.Status == JobStatus.Pending))
        {
            job.Status = JobStatus.Processing;
            job.ProcessedAt = DateTimeOffset.UtcNow;
            processedCount++;
            LogEvent("scheduler", "JobProcessed", $"JobId: {job.JobId}");
        }

        return processedCount;
    }

    /// <summary>
    /// Persists data to the data store.
    /// </summary>
    /// <param name="key">Key to store the value under.</param>
    /// <param name="value">Value to store.</param>
    /// <param name="ct">Cancellation token forwarded to the simulated request.</param>
    /// <returns>
    /// A successful result carrying the stored record (version 1 for a new key, previous
    /// version + 1 for an existing key), or a failed result when the "database" request fails.
    /// </returns>
    public async Task<PersistResult> PersistDataAsync(string key, string value, CancellationToken ct = default)
    {
        var persistedAt = DateTimeOffset.UtcNow;

        // Check if database is available
        var dbResult = await _failureInjector.SimulateRequestAsync("database", ct);
        if (!dbResult.Success)
        {
            LogEvent("database", "PersistFailed", $"Key: {key}, Error: {dbResult.Error}");
            return new PersistResult
            {
                Success = false,
                Error = dbResult.Error
            };
        }

        // AddOrUpdate makes the read-increment-write of the version atomic per key,
        // so concurrent writers cannot produce the same version number twice.
        var data = _dataStore.AddOrUpdate(
            key,
            newKey => new PersistedData
            {
                Key = newKey,
                Value = value,
                PersistedAt = persistedAt,
                Version = 1
            },
            (existingKey, existing) => new PersistedData
            {
                Key = existingKey,
                Value = value,
                PersistedAt = persistedAt,
                Version = existing.Version + 1
            });

        LogEvent("database", "DataPersisted", $"Key: {key}, Version: {data.Version}");
        return new PersistResult
        {
            Success = true,
            Data = data
        };
    }

    /// <summary>
    /// Reads data from the data store.
    /// </summary>
    /// <param name="key">Key to look up.</param>
    /// <param name="ct">Cancellation token forwarded to the simulated request.</param>
    /// <returns>
    /// A failed result when the "database" request fails; otherwise a successful result whose
    /// <see cref="ReadResult.Data"/> is <c>null</c> when the key does not exist.
    /// </returns>
    public async Task<ReadResult> ReadDataAsync(string key, CancellationToken ct = default)
    {
        var dbResult = await _failureInjector.SimulateRequestAsync("database", ct);
        if (!dbResult.Success)
        {
            LogEvent("database", "ReadFailed", $"Key: {key}, Error: {dbResult.Error}");
            return new ReadResult
            {
                Success = false,
                Error = dbResult.Error
            };
        }

        if (_dataStore.TryGetValue(key, out var data))
        {
            LogEvent("database", "DataRead", $"Key: {key}, Version: {data.Version}");
            return new ReadResult
            {
                Success = true,
                Data = data
            };
        }

        // A missing key is not an error: the database answered, there is just no record.
        return new ReadResult
        {
            Success = true,
            Data = null
        };
    }

    /// <summary>
    /// Simulates a full control-plane outage (all registered services down).
    /// </summary>
    public void InjectFullControlPlaneOutage()
    {
        foreach (var serviceId in _serviceStates.Keys)
        {
            _failureInjector.InjectFullOutage(serviceId);
        }

        LogEvent("cluster", "FullOutageInjected", "All services down");
    }

    /// <summary>
    /// Recovers all control-plane services.
    /// </summary>
    public void RecoverControlPlane()
    {
        _failureInjector.RecoverAll();
        LogEvent("cluster", "ControlPlaneRecovered", "All services recovered");
    }

    /// <summary>
    /// Gets the number of queued jobs whose status is still <see cref="JobStatus.Pending"/>.
    /// </summary>
    public int GetPendingJobCount() => _pendingJobs.Count(j => j.Status == JobStatus.Pending);

    /// <summary>
    /// Gets a snapshot of all jobs in the queue, in enqueue order (for verification).
    /// </summary>
    public IReadOnlyList<PendingJob> GetAllJobs() => _pendingJobs.ToImmutableArray();

    /// <summary>
    /// Verifies that every expected key is present in the data store.
    /// </summary>
    /// <param name="expectedKeys">Keys that must exist.</param>
    /// <returns><c>true</c> when all keys exist; otherwise <c>false</c>.</returns>
    public bool VerifyDataIntegrity(IEnumerable<string> expectedKeys)
    {
        return expectedKeys.All(key => _dataStore.ContainsKey(key));
    }

    /// <summary>
    /// Clears the event log. The event sequence counter is not reset.
    /// </summary>
    public void ClearEventLog()
    {
        while (_eventLog.TryDequeue(out _)) { }
    }

    // Appends an event with a monotonically increasing sequence number.
    private void LogEvent(string service, string eventType, string details)
    {
        var seq = Interlocked.Increment(ref _eventSequence);
        _eventLog.Enqueue(new ClusterEvent
        {
            Sequence = seq,
            Timestamp = DateTimeOffset.UtcNow,
            Service = service,
            EventType = eventType,
            Details = details
        });
    }
}
/// <summary>
/// Mock state for a service in the cluster.
/// </summary>
public sealed class MockServiceState
{
/// <summary>Identifier the service was registered under.</summary>
public required string ServiceId { get; init; }
/// <summary>Kind of control-plane service.</summary>
public required ServiceType ServiceType { get; init; }
/// <summary>
/// Health flag; set to <c>true</c> at registration.
/// NOTE(review): nothing in the visible fixture code updates this flag when an outage is injected.
/// </summary>
public bool IsHealthy { get; set; }
/// <summary>UTC time at which the service was registered.</summary>
public DateTimeOffset StartedAt { get; init; }
}
/// <summary>
/// Types of services in the control-plane.
/// </summary>
public enum ServiceType
{
/// <summary>Token issuing/validation service (registered as "authority").</summary>
Authority,
/// <summary>Job scheduling service (registered as "scheduler").</summary>
Scheduler,
/// <summary>Gateway service (registered as "gateway").</summary>
Gateway,
/// <summary>Backend service (registered as "backend").</summary>
Backend,
/// <summary>Data store service (registered as "database").</summary>
Database
}
/// <summary>
/// Represents a cached authentication token.
/// </summary>
public sealed class CachedToken
{
/// <summary>Unique token identifier; also the key in the fixture's token cache.</summary>
public required string TokenId { get; init; }
/// <summary>User the token was issued for.</summary>
public required string UserId { get; init; }
/// <summary>UTC time the token was issued.</summary>
public required DateTimeOffset IssuedAt { get; init; }
/// <summary>UTC time after which validation reports the token as invalid.</summary>
public required DateTimeOffset ExpiresAt { get; init; }
/// <summary>Validity flag; validation requires this to be <c>true</c> and the token to be unexpired.</summary>
public required bool IsValid { get; set; }
}
/// <summary>
/// Result of token issuance.
/// </summary>
public sealed record TokenResult
{
/// <summary><c>true</c> when a token was issued.</summary>
public required bool Success { get; init; }
/// <summary>The issued token; <c>null</c> when issuance failed.</summary>
public CachedToken? Token { get; init; }
/// <summary>Error reported by the simulated Authority request when issuance failed.</summary>
public string? Error { get; init; }
}
/// <summary>
/// Result of token validation.
/// </summary>
public sealed record ValidationResult
{
/// <summary>
/// <c>true</c> when a validation decision could be made (by the Authority or from cache);
/// <c>false</c> when the Authority was unavailable and the token was not cached.
/// </summary>
public required bool Success { get; init; }
/// <summary><c>true</c> when the token was found, flagged valid, and not expired.</summary>
public required bool IsValid { get; init; }
/// <summary>Where the validation decision came from.</summary>
public required ValidationSource Source { get; init; }
/// <summary>Reason the token could not be found or validated, if any.</summary>
public string? Error { get; init; }
/// <summary>Set when the decision was served from the cache because the Authority was unavailable.</summary>
public string? Warning { get; init; }
}
/// <summary>
/// Source of token validation.
/// </summary>
public enum ValidationSource
{
/// <summary>No source could answer (Authority unavailable and token not cached).</summary>
None,
/// <summary>The Authority request succeeded and produced the decision.</summary>
Authority,
/// <summary>The Authority request failed; the decision came from the local token cache.</summary>
Cache
}
/// <summary>
/// Represents a pending job in the scheduler queue.
/// </summary>
public sealed class PendingJob
{
/// <summary>Unique job identifier.</summary>
public required string JobId { get; init; }
/// <summary>Type label supplied by the caller when enqueuing.</summary>
public required string JobType { get; init; }
/// <summary>Opaque payload supplied by the caller when enqueuing.</summary>
public required string Payload { get; init; }
/// <summary>UTC time the job was added to the local queue.</summary>
public required DateTimeOffset EnqueuedAt { get; init; }
/// <summary>Current status; jobs are enqueued as <see cref="JobStatus.Pending"/>.</summary>
public JobStatus Status { get; set; }
/// <summary>UTC time the job was picked up for processing; <c>null</c> until then.</summary>
public DateTimeOffset? ProcessedAt { get; set; }
}
/// <summary>
/// Job status.
/// </summary>
public enum JobStatus
{
/// <summary>Enqueued and waiting to be processed (default value).</summary>
Pending,
/// <summary>Picked up by the fixture's pending-job processing.</summary>
Processing,
/// <summary>Finished successfully. NOTE(review): not assigned anywhere in the visible fixture code.</summary>
Completed,
/// <summary>Finished with an error. NOTE(review): not assigned anywhere in the visible fixture code.</summary>
Failed
}
/// <summary>
/// Result of job enqueue operation.
/// </summary>
public sealed record JobResult
{
/// <summary><c>true</c> when the job was stored in the local queue.</summary>
public required bool Success { get; init; }
/// <summary>The enqueued job.</summary>
public PendingJob? Job { get; init; }
/// <summary>Error description. NOTE(review): the visible enqueue path never populates this.</summary>
public string? Error { get; init; }
/// <summary>Set when the job was stored but the scheduler notification failed.</summary>
public string? Warning { get; init; }
}
/// <summary>
/// Represents persisted data.
/// </summary>
public sealed class PersistedData
{
/// <summary>Key the value is stored under.</summary>
public required string Key { get; init; }
/// <summary>Stored value.</summary>
public required string Value { get; init; }
/// <summary>UTC time captured when the write was requested.</summary>
public required DateTimeOffset PersistedAt { get; init; }
/// <summary>Version of the record: 1 for the first write of a key, incremented on each overwrite.</summary>
public required int Version { get; set; }
}
/// <summary>
/// Result of data persistence operation.
/// </summary>
public sealed record PersistResult
{
/// <summary><c>true</c> when the record was written to the data store.</summary>
public required bool Success { get; init; }
/// <summary>The stored record; not set when the write failed.</summary>
public PersistedData? Data { get; init; }
/// <summary>Error reported by the simulated database request when the write failed.</summary>
public string? Error { get; init; }
}
/// <summary>
/// Result of data read operation.
/// </summary>
public sealed record ReadResult
{
/// <summary><c>true</c> when the database request succeeded, even if the key was not found.</summary>
public required bool Success { get; init; }
/// <summary>The stored record; <c>null</c> when the key does not exist or the read failed.</summary>
public PersistedData? Data { get; init; }
/// <summary>Error reported by the simulated database request when the read failed.</summary>
public string? Error { get; init; }
}
/// <summary>
/// Represents an event in the cluster.
/// </summary>
public sealed record ClusterEvent
{
/// <summary>Sequence number assigned from an atomically incremented counter when the event is logged.</summary>
public required long Sequence { get; init; }
/// <summary>UTC time the event was logged.</summary>
public required DateTimeOffset Timestamp { get; init; }
/// <summary>Service the event relates to (e.g. "authority", "scheduler", "database", "cluster").</summary>
public required string Service { get; init; }
/// <summary>Event name such as "TokenIssued" or "PersistFailed"; failure events contain "Failed".</summary>
public required string EventType { get; init; }
/// <summary>Free-form details (ids, keys, error text).</summary>
public required string Details { get; init; }
}

View File

@@ -0,0 +1,273 @@
// -----------------------------------------------------------------------------
// ServiceFailureInjector.cs
// Sprint: Testing Enhancement Advisory - Phase 3.3
// Description: Service failure injection for control-plane chaos testing
// -----------------------------------------------------------------------------
using System.Collections.Concurrent;
namespace StellaOps.Chaos.ControlPlane.Tests.Fixtures;
/// <summary>
/// Injects failures into control-plane services for chaos testing.
/// Supports various failure modes: full outage, partial failures, latency injection.
/// </summary>
public sealed class ServiceFailureInjector
{
    private readonly ConcurrentDictionary<string, ServiceState> _serviceStates = new();
    private readonly ConcurrentDictionary<string, FailureConfig> _failureConfigs = new();
    private readonly Random _random = new(42); // Deterministic for reproducibility

    // System.Random instances are not thread-safe; all draws go through this gate.
    private readonly object _randomGate = new();

    /// <summary>
    /// Gets the current state of a service, registering it as healthy if it is unknown.
    /// </summary>
    /// <param name="serviceId">Service identifier.</param>
    public ServiceState GetServiceState(string serviceId)
    {
        return _serviceStates.GetOrAdd(serviceId, _ => new ServiceState
        {
            ServiceId = serviceId,
            Status = ServiceStatus.Healthy,
            LastUpdated = DateTimeOffset.UtcNow
        });
    }

    /// <summary>
    /// Registers a service for failure injection. Has no effect if already registered.
    /// </summary>
    /// <param name="serviceId">Service identifier.</param>
    public void RegisterService(string serviceId)
    {
        _serviceStates.TryAdd(serviceId, new ServiceState
        {
            ServiceId = serviceId,
            Status = ServiceStatus.Healthy,
            LastUpdated = DateTimeOffset.UtcNow
        });
    }

    /// <summary>
    /// Causes a complete service outage: every simulated request fails.
    /// </summary>
    /// <param name="serviceId">Service identifier; registered on the fly if unknown.</param>
    public void InjectFullOutage(string serviceId)
    {
        // GetServiceState registers unknown services so that the reported status always
        // matches the request behaviour and RecoverAll can find the service later.
        var now = DateTimeOffset.UtcNow;
        var state = GetServiceState(serviceId);
        state.Status = ServiceStatus.Down;
        state.LastUpdated = now;
        state.OutageStarted = now;

        _failureConfigs[serviceId] = new FailureConfig
        {
            ServiceId = serviceId,
            FailureMode = FailureMode.FullOutage,
            FailureRate = 1.0
        };
    }

    /// <summary>
    /// Causes partial failures (random request failures).
    /// </summary>
    /// <param name="serviceId">Service identifier; registered on the fly if unknown.</param>
    /// <param name="failureRate">Probability that a request fails; clamped to [0, 1].</param>
    public void InjectPartialFailure(string serviceId, double failureRate = 0.5)
    {
        var state = GetServiceState(serviceId);
        state.Status = ServiceStatus.Degraded;
        state.LastUpdated = DateTimeOffset.UtcNow;

        _failureConfigs[serviceId] = new FailureConfig
        {
            ServiceId = serviceId,
            FailureMode = FailureMode.PartialFailure,
            FailureRate = Math.Clamp(failureRate, 0.0, 1.0)
        };
    }

    /// <summary>
    /// Injects latency into service responses. The latency is reported on the request
    /// result; requests are not actually delayed.
    /// </summary>
    /// <param name="serviceId">Service identifier; registered on the fly if unknown.</param>
    /// <param name="baseLatency">Latency added to every request.</param>
    /// <param name="jitter">Upper bound of the random extra latency added per request.</param>
    public void InjectLatency(string serviceId, TimeSpan baseLatency, TimeSpan jitter = default)
    {
        var state = GetServiceState(serviceId);
        state.Status = ServiceStatus.Slow;
        state.LastUpdated = DateTimeOffset.UtcNow;

        _failureConfigs[serviceId] = new FailureConfig
        {
            ServiceId = serviceId,
            FailureMode = FailureMode.LatencyInjection,
            BaseLatency = baseLatency,
            LatencyJitter = jitter
        };
    }

    /// <summary>
    /// Recovers a service from failure and removes its failure configuration.
    /// </summary>
    /// <param name="serviceId">Service identifier.</param>
    public void RecoverService(string serviceId)
    {
        if (_serviceStates.TryGetValue(serviceId, out var state))
        {
            state.Status = ServiceStatus.Healthy;
            state.LastUpdated = DateTimeOffset.UtcNow;
            state.OutageStarted = null;
        }

        _failureConfigs.TryRemove(serviceId, out _);
    }

    /// <summary>
    /// Recovers all known services.
    /// </summary>
    public void RecoverAll()
    {
        foreach (var serviceId in _serviceStates.Keys)
        {
            RecoverService(serviceId);
        }
    }

    /// <summary>
    /// Simulates a request to a service, applying any configured failures.
    /// </summary>
    /// <param name="serviceId">Service identifier; registered as healthy if unknown.</param>
    /// <param name="ct">When already cancelled, a cancelled task is returned.</param>
    /// <returns>
    /// A result whose <see cref="ServiceRequestResult.Success"/> is <c>false</c> when the
    /// request fails due to an injected failure, together with the simulated latency.
    /// </returns>
    public Task<ServiceRequestResult> SimulateRequestAsync(
        string serviceId,
        CancellationToken ct = default)
    {
        if (ct.IsCancellationRequested)
        {
            return Task.FromCanceled<ServiceRequestResult>(ct);
        }

        // Ensure the service has a state entry, as requests have always done.
        _ = GetServiceState(serviceId);

        return Task.FromResult(EvaluateRequest(serviceId));
    }

    /// <summary>
    /// Gets all services currently in outage.
    /// </summary>
    public IReadOnlyList<string> GetServicesInOutage()
    {
        return _serviceStates
            .Where(kvp => kvp.Value.Status == ServiceStatus.Down)
            .Select(kvp => kvp.Key)
            .ToList();
    }

    /// <summary>
    /// Gets all services currently degraded (partial failure or latency injection).
    /// </summary>
    public IReadOnlyList<string> GetDegradedServices()
    {
        return _serviceStates
            .Where(kvp => kvp.Value.Status is ServiceStatus.Degraded or ServiceStatus.Slow)
            .Select(kvp => kvp.Key)
            .ToList();
    }

    // Computes the outcome of one request from the service's failure configuration.
    private ServiceRequestResult EvaluateRequest(string serviceId)
    {
        if (!_failureConfigs.TryGetValue(serviceId, out var config))
        {
            // No failure configured, request succeeds
            return Succeeded(serviceId, TimeSpan.Zero);
        }

        switch (config.FailureMode)
        {
            case FailureMode.FullOutage:
                return new ServiceRequestResult
                {
                    ServiceId = serviceId,
                    Success = false,
                    Error = $"Service {serviceId} is down (full outage)",
                    Latency = TimeSpan.Zero
                };

            case FailureMode.PartialFailure:
                var shouldFail = NextRandomDouble() < config.FailureRate;
                return new ServiceRequestResult
                {
                    ServiceId = serviceId,
                    Success = !shouldFail,
                    Error = shouldFail ? $"Service {serviceId} request failed (partial failure)" : null,
                    Latency = TimeSpan.Zero
                };

            case FailureMode.LatencyInjection:
                var jitterMs = NextRandomDouble() * config.LatencyJitter.TotalMilliseconds;
                var totalLatency = config.BaseLatency + TimeSpan.FromMilliseconds(jitterMs);
                // The latency is only reported, not waited for, to keep tests fast.
                return Succeeded(serviceId, totalLatency);

            default:
                return Succeeded(serviceId, TimeSpan.Zero);
        }
    }

    // Builds a successful result with the given simulated latency.
    private static ServiceRequestResult Succeeded(string serviceId, TimeSpan latency)
    {
        return new ServiceRequestResult
        {
            ServiceId = serviceId,
            Success = true,
            Latency = latency
        };
    }

    // Draws the next value from the seeded generator under a lock.
    private double NextRandomDouble()
    {
        lock (_randomGate)
        {
            return _random.NextDouble();
        }
    }
}
/// <summary>
/// State of a service for chaos testing.
/// </summary>
public sealed class ServiceState
{
/// <summary>Identifier of the service this state belongs to.</summary>
public required string ServiceId { get; init; }
/// <summary>Current status; changed by the injector's inject/recover methods.</summary>
public ServiceStatus Status { get; set; }
/// <summary>UTC time of the last status change (or of creation).</summary>
public DateTimeOffset LastUpdated { get; set; }
/// <summary>UTC time a full outage was injected; reset to <c>null</c> on recovery.</summary>
public DateTimeOffset? OutageStarted { get; set; }
}
/// <summary>
/// Service status levels.
/// </summary>
public enum ServiceStatus
{
/// <summary>No failure injected (initial state and state after recovery).</summary>
Healthy,
/// <summary>Partial failure injected: a fraction of requests fail.</summary>
Degraded,
/// <summary>Latency injection active.</summary>
Slow,
/// <summary>Full outage injected: every request fails.</summary>
Down
}
/// <summary>
/// Configuration for failure injection.
/// </summary>
public sealed class FailureConfig
{
/// <summary>Service this configuration applies to.</summary>
public required string ServiceId { get; init; }
/// <summary>Which failure behaviour simulated requests should exhibit.</summary>
public FailureMode FailureMode { get; init; }
/// <summary>
/// Probability in [0, 1] that a request fails. Request simulation reads it only in
/// <see cref="FailureMode.PartialFailure"/> mode; full outages set it to 1.0.
/// </summary>
public double FailureRate { get; init; }
/// <summary>Latency added to every request in latency-injection mode.</summary>
public TimeSpan BaseLatency { get; init; }
/// <summary>Upper bound of the random extra latency in latency-injection mode.</summary>
public TimeSpan LatencyJitter { get; init; }
}
/// <summary>
/// Types of failure modes.
/// </summary>
public enum FailureMode
{
/// <summary>No failure (default value); request simulation treats it as success.</summary>
None,
/// <summary>Every request fails.</summary>
FullOutage,
/// <summary>Requests fail randomly according to the configured failure rate.</summary>
PartialFailure,
/// <summary>Requests succeed but report base latency plus random jitter.</summary>
LatencyInjection
}
/// <summary>
/// Result of a simulated service request.
/// </summary>
public sealed record ServiceRequestResult
{
/// <summary>Service the request was addressed to.</summary>
public required string ServiceId { get; init; }
/// <summary><c>false</c> when the request failed because of an injected failure.</summary>
public required bool Success { get; init; }
/// <summary>Description of the injected failure; <c>null</c> on success.</summary>
public string? Error { get; init; }
/// <summary>Simulated latency of the request; reported only, the simulation does not wait.</summary>
public TimeSpan Latency { get; init; }
}

View File

@@ -0,0 +1,406 @@
// -----------------------------------------------------------------------------
// PartialOutageTests.cs
// Sprint: Testing Enhancement Advisory - Phase 3.3
// Description: Tests for control-plane behavior during partial outages
// -----------------------------------------------------------------------------
using FluentAssertions;
using StellaOps.Chaos.ControlPlane.Tests.Fixtures;
using StellaOps.TestKit;
using Xunit;
namespace StellaOps.Chaos.ControlPlane.Tests;
/// <summary>
/// Tests for control-plane behavior during partial outage scenarios.
/// Validates graceful degradation under latency injection and partial failures.
/// </summary>
[Trait("Category", TestCategories.Chaos)]
[Trait("Category", "ControlPlane")]
[Trait("Category", "PartialOutage")]
public class PartialOutageTests : IClassFixture<ControlPlaneClusterFixture>
{
private readonly ControlPlaneClusterFixture _fixture;
/// <summary>
/// Stores the shared cluster fixture and resets its failure state before each test.
/// </summary>
/// <param name="fixture">Cluster fixture supplied by xUnit through the class-fixture mechanism.</param>
public PartialOutageTests(ControlPlaneClusterFixture fixture)
{
    _fixture = fixture;

    // xUnit creates a new instance of this class for every test but reuses the
    // fixture, so injected failures and logged events from an earlier test must
    // be cleared here.
    fixture.FailureInjector.RecoverAll();
    fixture.ClearEventLog();
}
#region Partial Failure Rate Tests
[Fact]
public async Task Authority_50PercentFailure_SomeTokensIssued()
{
    // Arrange - Authority fails roughly every second request
    const int attempts = 20;
    _fixture.FailureInjector.InjectPartialFailure("authority", 0.5);

    // Act - Request one token per simulated user
    var outcomes = new List<TokenResult>(attempts);
    for (var userIndex = 0; userIndex < attempts; userIndex++)
    {
        var outcome = await _fixture.IssueTokenAsync($"user-{userIndex}", TimeSpan.FromHours(1));
        outcomes.Add(outcome);
    }

    // Assert - Both outcomes must be observed. The bounds are deliberately loose:
    // at least 4 of the 20 attempts on each side (i.e. 20%-80% success) so that
    // ordinary random variance around the 50% rate does not fail the test.
    var succeeded = outcomes.Count(o => o.Success);
    var failed = outcomes.Count - succeeded;

    succeeded.Should().BeGreaterThan(3);
    failed.Should().BeGreaterThan(3);
}
[Fact]
public async Task Database_25PercentFailure_MostWritesSucceed()
{
    // Arrange - One write in four is expected to fail
    const int writes = 20;
    _fixture.FailureInjector.InjectPartialFailure("database", 0.25);

    // Act - Perform the writes and tally the successful ones
    var succeeded = 0;
    for (var n = 0; n < writes; n++)
    {
        var write = await _fixture.PersistDataAsync($"key-{n}", $"value-{n}");
        if (write.Success)
        {
            succeeded++;
        }
    }

    // Assert - Strictly more than half of the writes (at least 11 of 20) must succeed.
    // NOTE(review): if the injector fails requests independently with an unseeded RNG,
    // this bound is missed roughly 1% of the time; confirm the injector is deterministic.
    succeeded.Should().BeGreaterThan(10);
}
[Fact]
public async Task Scheduler_HighFailureRate_JobsStillPersisted()
{
    // Arrange - 80% failure rate
    const int jobCount = 10;
    _fixture.FailureInjector.InjectPartialFailure("scheduler", 0.8);

    // The fixture is shared by every test in this class and the constructor only
    // recovers services and clears the event log, so jobs enqueued by other tests
    // may still be pending. Measure against a baseline instead of an absolute count.
    var pendingBefore = _fixture.GetPendingJobCount();

    // Act - Enqueue jobs
    var results = new List<JobResult>(jobCount);
    for (var i = 0; i < jobCount; i++)
    {
        results.Add(await _fixture.EnqueueJobAsync("scan", $"image-{i}"));
    }

    // Assert - All jobs should be persisted locally (just notification may fail)
    results.Should().AllSatisfy(r => r.Success.Should().BeTrue());

    // Every job enqueued by this test must have been added to the pending queue
    _fixture.GetPendingJobCount().Should().Be(pendingBefore + jobCount);
}
[Fact]
public async Task PartialFailure_RetrySucceeds_Eventually()
{
    // Arrange - Authority fails roughly every second request
    const int maxAttempts = 10;
    _fixture.FailureInjector.InjectPartialFailure("authority", 0.5);

    // Act - Retry until a token is issued or the attempt budget is used up
    TokenResult? issued = null;
    for (var attempt = 0; attempt < maxAttempts && issued is null; attempt++)
    {
        var candidate = await _fixture.IssueTokenAsync("retry-user", TimeSpan.FromHours(1));
        if (candidate.Success)
        {
            issued = candidate;
        }
    }

    // Assert - One of the attempts must have produced a usable token
    issued.Should().NotBeNull();
    issued!.Success.Should().BeTrue();
    issued.Token.Should().NotBeNull();
}
#endregion
#region Latency Injection Tests
[Fact]
public async Task Authority_HighLatency_OperationsComplete()
{
    // Arrange - 500ms base latency with 100ms jitter on Authority
    var baseLatency = TimeSpan.FromMilliseconds(500);
    var jitter = TimeSpan.FromMilliseconds(100);
    _fixture.FailureInjector.InjectLatency("authority", baseLatency, jitter);

    // Act - Issue a token through the slowed-down service
    var issued = await _fixture.IssueTokenAsync("latency-user", TimeSpan.FromHours(1));

    // Assert - The request still completes and yields a token
    issued.Success.Should().BeTrue();
    issued.Token.Should().NotBeNull();

    // Latency injection must be reflected as the "Slow" status
    _fixture.FailureInjector.GetServiceState("authority").Status.Should().Be(ServiceStatus.Slow);
}
[Fact]
public async Task Database_VariableLatency_NoDataCorruption()
{
    // Arrange - 200ms base latency with up to 300ms of jitter on the database
    const int writeCount = 10;
    _fixture.FailureInjector.InjectLatency(
        "database",
        TimeSpan.FromMilliseconds(200),
        TimeSpan.FromMilliseconds(300));

    // Act - Start all writes before awaiting any of them, then wait for the batch.
    // ToList() forces the writes to start here, in index order.
    var pendingWrites = Enumerable.Range(0, writeCount)
        .Select(i => _fixture.PersistDataAsync($"latency-key-{i}", $"value-{i}"))
        .ToList();
    var writeResults = await Task.WhenAll(pendingWrites);

    // Assert - Every write reports success
    writeResults.Should().AllSatisfy(r => r.Success.Should().BeTrue());

    // Each key must read back exactly the value written for it
    foreach (var i in Enumerable.Range(0, writeCount))
    {
        var stored = await _fixture.ReadDataAsync($"latency-key-{i}");
        stored.Success.Should().BeTrue();
        stored.Data!.Value.Should().Be($"value-{i}");
    }
}
[Fact]
public async Task Scheduler_Latency_JobOrdering_Preserved()
{
    // Arrange - Latency injection
    const int jobCount = 5;
    _fixture.FailureInjector.InjectLatency(
        "scheduler",
        TimeSpan.FromMilliseconds(100),
        TimeSpan.FromMilliseconds(50));

    // The fixture is shared by every test in this class and the constructor only
    // recovers services and clears the event log, so jobs enqueued by other tests
    // may already be present. Remember how many there are and skip them below.
    // NOTE(review): assumes GetAllJobs() lists jobs in enqueue order, which the
    // ordering assertion at the end of this test already relies on.
    var alreadyQueued = _fixture.GetAllJobs().Count();

    // Act - Enqueue jobs in sequence
    for (var i = 0; i < jobCount; i++)
    {
        await _fixture.EnqueueJobAsync("scan", $"ordered-{i}");
    }

    // Assert - The jobs added by this test should appear in enqueue order
    var jobs = _fixture.GetAllJobs().Skip(alreadyQueued).ToList();
    jobs.Should().HaveCount(jobCount);
    for (var i = 0; i < jobCount; i++)
    {
        jobs[i].Payload.Should().Be($"ordered-{i}");
    }
}
#endregion
#region Degraded Service Tests
[Fact]
public async Task DegradedAuthority_CacheHitRate_Improves()
{
    // Arrange - Collect the ids of tokens issued before any failure is injected
    var issuedTokenIds = new List<string>();
    for (var userIndex = 0; userIndex < 5; userIndex++)
    {
        var issued = await _fixture.IssueTokenAsync($"user-{userIndex}", TimeSpan.FromHours(1));
        if (!issued.Success)
        {
            continue;
        }

        issuedTokenIds.Add(issued.Token!.TokenId);
    }

    // Authority now fails 70% of requests
    _fixture.FailureInjector.InjectPartialFailure("authority", 0.7);

    // Act - Validate every previously issued token
    var outcomes = new List<ValidationResult>(issuedTokenIds.Count);
    foreach (var id in issuedTokenIds)
    {
        var outcome = await _fixture.ValidateTokenAsync(id);
        outcomes.Add(outcome);
    }

    // Assert - Every validation must succeed and report a valid token,
    // regardless of whether Authority or the cache answered
    outcomes.Should().AllSatisfy(o =>
    {
        o.Success.Should().BeTrue();
        o.IsValid.Should().BeTrue();
    });

    // At least one answer must have been served from the cache.
    // NOTE(review): this only checks that the cache is used at all; it does not
    // measure an improvement in hit rate as the test name suggests.
    var servedFromCache = outcomes.Count(o => o.Source == ValidationSource.Cache);
    servedFromCache.Should().BeGreaterThan(0);
}
[Fact]
public async Task MultipleDegraded_Services_GracefulDegradation()
{
    // Arrange - Degrade all three services at once
    const int rounds = 10;
    var injector = _fixture.FailureInjector;
    injector.InjectPartialFailure("authority", 0.3);
    injector.InjectLatency("database", TimeSpan.FromMilliseconds(200));
    injector.InjectPartialFailure("scheduler", 0.2);

    // Act - In each round issue a token, persist a value and enqueue a job,
    // tallying how many of each succeed
    var tokensIssued = 0;
    var writesPersisted = 0;
    var jobsEnqueued = 0;
    for (var round = 0; round < rounds; round++)
    {
        var token = await _fixture.IssueTokenAsync($"user-{round}", TimeSpan.FromHours(1));
        if (token.Success)
        {
            tokensIssued++;
        }

        var write = await _fixture.PersistDataAsync($"key-{round}", $"value-{round}");
        if (write.Success)
        {
            writesPersisted++;
        }

        var job = await _fixture.EnqueueJobAsync("scan", $"image-{round}");
        if (job.Success)
        {
            jobsEnqueued++;
        }
    }

    // Assert - More than half of the token and write requests succeed.
    // NOTE(review): if Authority failures are independent and unseeded, getting 5 or
    // fewer successes out of 10 at a 30% failure rate happens about 15% of the time;
    // confirm the injector is deterministic or relax this bound.
    tokensIssued.Should().BeGreaterThan(5);
    writesPersisted.Should().BeGreaterThan(5);

    // Jobs always persist locally, so every enqueue must report success
    jobsEnqueued.Should().Be(rounds);
}
#endregion
#region Recovery from Partial Outage Tests
[Fact]
public async Task PartialOutage_Recovery_FullFunctionality_Restored()
{
    // Arrange - Authority fails roughly every second request
    var injector = _fixture.FailureInjector;
    injector.InjectPartialFailure("authority", 0.5);

    // Exercise the service while it is degraded. These results are intentionally
    // not asserted on: individual requests may succeed or fail here.
    for (var userIndex = 0; userIndex < 5; userIndex++)
    {
        _ = await _fixture.IssueTokenAsync($"user-{userIndex}", TimeSpan.FromHours(1));
    }

    // Act - Recover Authority and issue tokens for a second group of users
    injector.RecoverService("authority");

    var postRecovery = new List<TokenResult>();
    for (var userIndex = 5; userIndex < 10; userIndex++)
    {
        var issued = await _fixture.IssueTokenAsync($"user-{userIndex}", TimeSpan.FromHours(1));
        postRecovery.Add(issued);
    }

    // Assert - Every request after recovery succeeds and the status is Healthy again
    postRecovery.Should().AllSatisfy(r => r.Success.Should().BeTrue());
    injector.GetServiceState("authority").Status.Should().Be(ServiceStatus.Healthy);
}
/// <summary>
/// Verifies that a service marked Slow by latency injection reports Healthy again
/// after it is recovered.
/// </summary>
/// <remarks>
/// Nothing in this test is awaited, so the method is deliberately not <c>async</c>:
/// an <c>async</c> method without <c>await</c> raises compiler warning CS1998, which
/// is a build error because the project treats warnings as errors. The
/// <see cref="Task"/> return type is kept so the signature is unchanged.
/// </remarks>
/// <returns>An already-completed task.</returns>
[Fact]
public Task LatencyRecovery_PerformanceReturns_ToNormal()
{
    // Arrange - High latency
    _fixture.FailureInjector.InjectLatency(
        "database",
        TimeSpan.FromSeconds(1),
        TimeSpan.FromMilliseconds(500));

    // Note: In real scenario, we'd measure actual latency
    // Here we just verify state changes
    var slowState = _fixture.FailureInjector.GetServiceState("database");
    slowState.Status.Should().Be(ServiceStatus.Slow);

    // Act - Recover
    _fixture.FailureInjector.RecoverService("database");

    // Assert - Back to healthy
    var healthyState = _fixture.FailureInjector.GetServiceState("database");
    healthyState.Status.Should().Be(ServiceStatus.Healthy);

    return Task.CompletedTask;
}
#endregion
#region Service Isolation Tests
[Fact]
public async Task SingleService_Degraded_OthersUnaffected()
{
    // Arrange - Authority is the only service with an injected failure (90%)
    var injector = _fixture.FailureInjector;
    injector.InjectPartialFailure("authority", 0.9);

    // Act - Use the two services that were left alone
    var write = await _fixture.PersistDataAsync("isolated-key", "isolated-value");
    var job = await _fixture.EnqueueJobAsync("scan", "isolated-image");

    // Assert - Database and Scheduler operations succeed
    write.Success.Should().BeTrue();
    job.Success.Should().BeTrue();

    // Authority reports Degraded while the database still reports Healthy
    injector.GetServiceState("authority").Status.Should().Be(ServiceStatus.Degraded);
    injector.GetServiceState("database").Status.Should().Be(ServiceStatus.Healthy);
}
[Fact]
public async Task CascadingDegradation_DoesNotOccur()
{
    // Arrange - Only the database is degraded (50% failure rate)
    _fixture.FailureInjector.InjectPartialFailure("database", 0.5);

    // Act + Assert - Issuing a token through Authority still succeeds
    var issued = await _fixture.IssueTokenAsync("cascade-user", TimeSpan.FromHours(1));
    issued.Success.Should().BeTrue();

    // Act + Assert - Enqueueing a job through the Scheduler still succeeds
    var enqueued = await _fixture.EnqueueJobAsync("cascade-scan", "image");
    enqueued.Success.Should().BeTrue();
}
#endregion
#region Flapping Service Tests
[Fact]
public async Task FlappingService_SystemRemains_Stable()
{
    // Authority alternates between degraded (50% failures) and recovered for five
    // cycles; three token requests are made in each phase.
    const int cycles = 5;
    const int requestsPerPhase = 3;
    var injector = _fixture.FailureInjector;
    var outcomes = new List<bool>();

    for (var cycle = 0; cycle < cycles; cycle++)
    {
        // Degraded phase
        injector.InjectPartialFailure("authority", 0.5);
        for (var i = 0; i < requestsPerPhase; i++)
        {
            var degraded = await _fixture.IssueTokenAsync($"flap-user-{cycle}-{i}", TimeSpan.FromHours(1));
            outcomes.Add(degraded.Success);
        }

        // Recovered phase
        injector.RecoverService("authority");
        for (var i = 0; i < requestsPerPhase; i++)
        {
            var stable = await _fixture.IssueTokenAsync($"stable-user-{cycle}-{i}", TimeSpan.FromHours(1));
            outcomes.Add(stable.Success);
        }
    }

    // Assert - Requests were made, and strictly more than half of them succeeded
    outcomes.Should().NotBeEmpty();
    var succeeded = outcomes.Count(ok => ok);
    succeeded.Should().BeGreaterThan(outcomes.Count / 2);
}
#endregion
}

View File

@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Test project for the control-plane outage chaos tests (root namespace StellaOps.Chaos.ControlPlane.Tests). -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<LangVersion>preview</LangVersion>
<!-- Implicit usings are on, so the test sources rely on System, System.Linq, System.Threading.Tasks, etc. without explicit using directives. -->
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<IsTestProject>true</IsTestProject>
<!-- Every compiler warning fails the build, so warnings raised by the test sources must be fixed rather than ignored. -->
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<RootNamespace>StellaOps.Chaos.ControlPlane.Tests</RootNamespace>
</PropertyGroup>
<!-- Sprint: Testing Enhancement Advisory - Phase 3.3 -->
<!-- Description: Control-plane outage E2E chaos tests -->
<ItemGroup>
<!-- NOTE(review): no Version attributes here; presumably versions are supplied centrally (for example Directory.Packages.props). Confirm. -->
<!-- NOTE(review): xunit is not referenced directly although the tests use it; presumably it arrives through shared build props or the TestKit reference. Confirm. -->
<PackageReference Include="FluentAssertions" />
<PackageReference Include="Testcontainers" />
<PackageReference Include="Testcontainers.PostgreSql" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\__Libraries\StellaOps.TestKit\StellaOps.TestKit.csproj" />
</ItemGroup>
</Project>

View File

@@ -1,7 +1,7 @@
# Chaos Router Tests Task Board
This board mirrors active sprint tasks for this module.
Source of truth: `docs/implplan/SPRINT_20251229_049_BE_csproj_audit_maint_tests.md`.
Source of truth: `docs-archived/implplan/2025-12-29-csproj-audit/SPRINT_20251229_049_BE_csproj_audit_maint_tests.md`.
| Task ID | Status | Notes |
| --- | --- | --- |