using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Chaos testing engine for simulating channel outages and failures.
/// </summary>
public interface IChaosEngine
{
    /// <summary>
    /// Injects a fault for a channel type.
    /// </summary>
    /// <param name="request">Description of the fault to inject.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The fault injection that was registered.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes a fault injection.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns><c>true</c> when a fault with the given id was removed; otherwise <c>false</c>.</returns>
    Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets all active faults.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A snapshot of the currently registered faults.</returns>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Checks if a channel operation should fail due to chaos.
    /// </summary>
    /// <param name="channelType">Channel type of the operation being attempted.</param>
    /// <param name="tenantId">Tenant performing the operation, if any.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The chaos decision for this operation.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>
    /// Runs a chaos test scenario.
    /// </summary>
    /// <param name="scenario">Scenario whose steps are executed in order.</param>
    /// <param name="cancellationToken">Token used to cancel the run.</param>
    /// <returns>The outcome of the scenario, including per-step results.</returns>
    Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets chaos test history.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>Previously recorded scenario results.</returns>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);

    /// <summary>
    /// Clears all active faults.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Request to inject a fault.
/// </summary>
public sealed record ChaosFaultRequest
{
/// <summary>Channel type the fault applies to. The engine's matching also treats the value "*" as matching every channel type.</summary>
public required string ChannelType { get; init; }
/// <summary>Tenant the fault is restricted to; when <c>null</c> the fault matches any tenant.</summary>
public string? TenantId { get; init; }
/// <summary>Kind of fault to simulate.</summary>
public required ChaosFaultType FaultType { get; init; }
/// <summary>Probability that a matching operation fails. Defaults to 1.0; the engine clamps it to [0.0, 1.0] when injecting.</summary>
public double FailureProbability { get; init; } = 1.0;
/// <summary>Requested lifetime of the fault. When <c>null</c>, or longer than <see cref="ChaosEngineOptions.MaxFaultDuration"/>, the engine uses that maximum.</summary>
public TimeSpan? Duration { get; init; }
/// <summary>Optional latency copied onto the fault and reported as <see cref="ChaosFaultResult.InjectedLatency"/> when the fault triggers.</summary>
public TimeSpan? LatencyInjection { get; init; }
/// <summary>Optional error code copied onto the injected fault.</summary>
public int? ErrorCode { get; init; }
/// <summary>Optional message for the simulated exception; the engine falls back to a fault-type-specific default when <c>null</c>.</summary>
public string? ErrorMessage { get; init; }
/// <summary>Optional free-text description copied onto the injected fault.</summary>
public string? Description { get; init; }
}
/// <summary>
/// Type of chaos fault.
/// </summary>
/// <remarks>
/// <see cref="DefaultChaosEngine"/> maps the fault type to the simulated exception it reports:
/// <see cref="Outage"/> and <see cref="RateLimit"/> produce an <see cref="InvalidOperationException"/>,
/// <see cref="AuthFailure"/> an <see cref="UnauthorizedAccessException"/>, <see cref="Timeout"/> a
/// <see cref="TimeoutException"/>, and every other member a plain <see cref="Exception"/>.
/// </remarks>
public enum ChaosFaultType
{
// Simulated channel outage.
Outage,
// Latency fault; no dedicated exception mapping in the engine.
Latency,
// Simulated rate limiting.
RateLimit,
// Simulated authentication failure.
AuthFailure,
// Simulated timeout.
Timeout,
// No dedicated exception mapping in the engine.
PartialFailure,
// No dedicated exception mapping in the engine.
Intermittent
}
/// <summary>
/// Active fault injection.
/// </summary>
public sealed record ChaosFaultInjection
{
/// <summary>Identifier of the fault; the engine generates a 16-character value starting with "chaos-".</summary>
public required string FaultId { get; init; }
/// <summary>Channel type the fault applies to; "*" is treated as matching every channel type.</summary>
public required string ChannelType { get; init; }
/// <summary>Tenant the fault is restricted to; <c>null</c> matches any tenant.</summary>
public string? TenantId { get; init; }
/// <summary>Kind of fault being simulated.</summary>
public required ChaosFaultType FaultType { get; init; }
/// <summary>Probability that a matching operation fails (the engine stores a value clamped to [0.0, 1.0]).</summary>
public double FailureProbability { get; init; }
/// <summary>Optional latency reported alongside a triggered fault.</summary>
public TimeSpan? LatencyInjection { get; init; }
/// <summary>Optional error code carried over from the request.</summary>
public int? ErrorCode { get; init; }
/// <summary>Optional message used for the simulated exception.</summary>
public string? ErrorMessage { get; init; }
/// <summary>Optional free-text description carried over from the request.</summary>
public string? Description { get; init; }
/// <summary>Time at which the fault was injected.</summary>
public DateTimeOffset CreatedAt { get; init; }
/// <summary>Time after which the engine discards the fault; <c>null</c> means the fault is never expired by the cleanup pass.</summary>
public DateTimeOffset? ExpiresAt { get; init; }
/// <summary>Number of times the fault has caused a simulated failure.</summary>
public int TriggerCount { get; init; }
/// <summary>Whether the fault is considered during matching; defaults to <c>true</c>.</summary>
public bool IsActive { get; init; } = true;
}
/// <summary>
/// Result of checking for chaos fault.
/// </summary>
public sealed record ChaosFaultResult
{
/// <summary><c>true</c> when the caller should simulate a failure for the operation.</summary>
public bool ShouldFail { get; init; }
/// <summary>The fault that matched the operation, if any. It can be set even when <see cref="ShouldFail"/> is <c>false</c> (the fault matched but its probability roll did not trigger).</summary>
public ChaosFaultInjection? ActiveFault { get; init; }
/// <summary>Latency configured on the triggered fault; the engine only populates it when <see cref="ShouldFail"/> is <c>true</c>.</summary>
public TimeSpan? InjectedLatency { get; init; }
/// <summary>Exception instance representing the simulated failure; populated when <see cref="ShouldFail"/> is <c>true</c>.</summary>
public Exception? SimulatedException { get; init; }
}
/// <summary>
/// A chaos test scenario.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario; copied onto the resulting <see cref="ChaosTestResult"/>.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario.</summary>
    public required string Name { get; init; }

    /// <summary>Optional free-text description.</summary>
    public string? Description { get; init; }

    /// <summary>Steps of the scenario, executed in list order.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Maximum time the whole scenario may run; defaults to 10 minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>When <c>true</c>, the run stops after the first failed step instead of continuing with the remaining steps.</summary>
    public bool StopOnFirstFailure { get; init; }
}
/// <summary>
/// A step in a chaos scenario.
/// </summary>
public sealed record ChaosScenarioStep
{
/// <summary>Identifier of the step; copied onto the resulting <see cref="ChaosStepResult"/>.</summary>
public required string StepId { get; init; }
/// <summary>Display name of the step; used in step results and failure messages.</summary>
public required string Name { get; init; }
/// <summary>What the step does; determines which of the optional properties below is read.</summary>
public required ChaosStepAction Action { get; init; }
/// <summary>Fault to inject; read for <see cref="ChaosStepAction.InjectFault"/> steps.</summary>
public ChaosFaultRequest? FaultToInject { get; init; }
/// <summary>Identifier of the fault to remove; read for <see cref="ChaosStepAction.RemoveFault"/> steps.</summary>
public string? FaultIdToRemove { get; init; }
/// <summary>How long to pause; read for <see cref="ChaosStepAction.Wait"/> steps.</summary>
public TimeSpan? WaitDuration { get; init; }
/// <summary>Assertion to evaluate; read for <see cref="ChaosStepAction.Assert"/> steps.</summary>
public ChaosAssertion? Assertion { get; init; }
}
/// <summary>
/// Action type for a chaos step.
/// </summary>
public enum ChaosStepAction
{
// Injects the fault described by ChaosScenarioStep.FaultToInject.
InjectFault,
// Removes the fault identified by ChaosScenarioStep.FaultIdToRemove.
RemoveFault,
// Pauses for ChaosScenarioStep.WaitDuration.
Wait,
// Evaluates ChaosScenarioStep.Assertion.
Assert,
// NOTE(review): DefaultChaosEngine has no handling for this action; such steps complete without doing any work.
SendTestDelivery,
// NOTE(review): DefaultChaosEngine has no handling for this action; such steps complete without doing any work.
CheckMetrics
}
/// <summary>
/// Assertion for chaos testing.
/// </summary>
/// <remarks>
/// NOTE(review): the assertion evaluation in <see cref="DefaultChaosEngine"/> is a simplified placeholder that
/// does not read <see cref="MetricName"/>, <see cref="ExpectedValue"/>, <see cref="Tolerance"/> or
/// <see cref="ExpectedStatus"/>; the per-property descriptions below reflect their names only.
/// </remarks>
public sealed record ChaosAssertion
{
/// <summary>Kind of check to perform.</summary>
public required ChaosAssertionType Type { get; init; }
/// <summary>Name of the metric the assertion refers to (presumably for the Metric* assertion types).</summary>
public string? MetricName { get; init; }
/// <summary>Value the assertion compares against.</summary>
public double? ExpectedValue { get; init; }
/// <summary>Allowed deviation from <see cref="ExpectedValue"/> (presumably for equality checks — confirm with the evaluator once implemented).</summary>
public double? Tolerance { get; init; }
/// <summary>Expected status text.</summary>
public string? ExpectedStatus { get; init; }
}
/// <summary>
/// Type of chaos assertion.
/// </summary>
/// <remarks>
/// NOTE(review): <see cref="DefaultChaosEngine"/> currently reports every assertion type as passed;
/// the distinctions below are not yet enforced there.
/// </remarks>
public enum ChaosAssertionType
{
// A metric equals the expected value.
MetricEquals,
// A metric is greater than the expected value.
MetricGreaterThan,
// A metric is less than the expected value.
MetricLessThan,
// The dead-letter count equals the expected value.
DeadLetterCountEquals,
// A fallback was triggered.
FallbackTriggered,
// An alert was fired.
AlertFired
}
/// <summary>
/// Result of a chaos test.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier of this test run; the engine generates a 16-character value starting with "test-".</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary><c>true</c> when every executed step succeeded and the run neither timed out nor threw.</summary>
    public bool Success { get; init; }

    /// <summary>Time at which the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>Time at which the run finished.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>Elapsed time between <see cref="StartedAt"/> and <see cref="CompletedAt"/>.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the steps that were executed, in execution order; empty by default.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Scenario-level error message (for example a timeout), or <c>null</c> when none was recorded.</summary>
    public string? Error { get; init; }
}
/// <summary>
/// Result of a chaos step.
/// </summary>
public sealed record ChaosStepResult
{
/// <summary>Identifier of the step that produced this result.</summary>
public required string StepId { get; init; }
/// <summary>Name of the step that produced this result.</summary>
public required string StepName { get; init; }
/// <summary><c>true</c> when the step completed without error and, for assertion steps, the assertion passed.</summary>
public bool Success { get; init; }
/// <summary>Time at which the step started executing.</summary>
public DateTimeOffset ExecutedAt { get; init; }
/// <summary>Time the step took to execute.</summary>
public TimeSpan Duration { get; init; }
/// <summary>Failure message, or <c>null</c> when the step succeeded.</summary>
public string? Error { get; init; }
/// <summary>Optional payload produced by the step; the engine stores the injected <see cref="ChaosFaultInjection"/> here for inject-fault steps.</summary>
public object? Data { get; init; }
}
/// <summary>
/// Options for chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Configuration section these options are bound from.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Master switch; while <c>false</c> the engine refuses to inject faults or run scenarios and never reports a simulated failure.</summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Whether chaos testing may be used in production.
    /// NOTE(review): not consulted by <see cref="DefaultChaosEngine"/>; enforcement, if any, lives elsewhere.
    /// </summary>
    public bool AllowInProduction { get; set; }

    /// <summary>Upper bound (and default) for the lifetime of an injected fault; defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of faults that may be registered at the same time; defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>
    /// Channel types that chaos faults may target.
    /// NOTE(review): not consulted by <see cref="DefaultChaosEngine"/>; enforcement, if any, lives elsewhere.
    /// </summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } = ["webhook", "email", "slack", "teams", "pagerduty", "opsgenie"];
}
/// <summary>
/// Default implementation of chaos engine.
/// </summary>
/// <remarks>
/// Faults and test history are held in memory only. Expired faults are purged lazily whenever faults
/// are injected, listed or evaluated. At most <see cref="MaxHistoryEntries"/> scenario results are retained.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    /// <summary>Maximum number of scenario results kept in the in-memory history.</summary>
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly object _historyGate = new();
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; defaults are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; the system clock is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault built from <paramref name="request"/>.
    /// </summary>
    /// <param name="request">Fault description. Its probability is clamped to [0, 1] and its duration is capped at <see cref="ChaosEngineOptions.MaxFaultDuration"/>.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    /// <returns>The registered fault.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The engine is disabled, or the concurrent fault limit has been reached.</exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        // Purge first so that faults which have already expired do not count against the limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);
        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given id.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    /// <returns><c>true</c> when a fault was removed; <c>false</c> when no fault with that id exists.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after purging expired ones.
    /// </summary>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation on <paramref name="channelType"/> should fail because of an injected fault.
    /// </summary>
    /// <param name="channelType">Channel type of the operation; faults registered for "*" match every channel type.</param>
    /// <param name="tenantId">Tenant of the operation; faults without a tenant match every tenant.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    /// <returns>
    /// A result with <see cref="ChaosFaultResult.ShouldFail"/> set when a matching fault triggered. When a fault
    /// matched but its probability roll did not trigger, the fault is still reported via
    /// <see cref="ChaosFaultResult.ActiveFault"/>.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values.FirstOrDefault(f =>
            f.IsActive
            && (f.ChannelType == channelType || f.ChannelType == "*")
            && (f.TenantId is null || f.TenantId == tenantId));

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is thread-safe; a privately shared Random instance is not and can
        // degrade to returning zeros when used from several threads at once.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        RecordTrigger(matchingFault);

        Exception exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
            _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Runs the steps of <paramref name="scenario"/> in order and records the outcome in the history.
    /// </summary>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Token that aborts the run; an aborted run is returned as a failed result rather than thrown.</param>
    /// <returns>The scenario outcome, including the results of every step that was executed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(scenario);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Do not start further steps once the scenario has timed out or was cancelled.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);
                if (stepResult.Success)
                {
                    continue;
                }

                success = false;

                // ExecuteStepAsync reports an interrupted step as an ordinary failure; surface the
                // timeout/cancellation at scenario level instead of carrying on with later steps.
                linkedCts.Token.ThrowIfCancellationRequested();

                if (scenario.StopOnFirstFailure)
                {
                    error = $"Step '{step.Name}' failed: {stepResult.Error}";
                    break;
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (Exception ex)
        {
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_historyGate)
        {
            _testHistory.Add(result);

            // Drop the oldest entries so the history stays bounded.
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> of the most recent scenario results, newest first.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_historyGate)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    /// <param name="cancellationToken">Unused; the operation completes synchronously.</param>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    /// <summary>
    /// Executes a single scenario step and converts any exception into a failed step result.
    /// </summary>
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        ChaosStepResult Completed(object? data = null) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = true,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Data = data
        };

        ChaosStepResult Failed(string? reason) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = false,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Error = reason
        };

        try
        {
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault:
                    if (step.FaultToInject is null)
                    {
                        return Failed("InjectFault step requires FaultToInject");
                    }

                    var fault = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    return Completed(fault);

                case ChaosStepAction.RemoveFault:
                    if (step.FaultIdToRemove is null)
                    {
                        return Failed("RemoveFault step requires FaultIdToRemove");
                    }

                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    return Completed();

                case ChaosStepAction.Wait:
                    if (!step.WaitDuration.HasValue)
                    {
                        return Failed("Wait step requires WaitDuration");
                    }

                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    return Completed();

                case ChaosStepAction.Assert:
                    if (step.Assertion is null)
                    {
                        return Failed("Assert step requires Assertion");
                    }

                    var (passed, assertionError) = EvaluateAssertion(step.Assertion);
                    return passed ? Completed() : Failed(assertionError);

                default:
                    // SendTestDelivery and CheckMetrics have no implementation here; they complete as no-ops.
                    return Completed();
            }
        }
        catch (Exception ex)
        {
            return Failed(ex.Message);
        }
    }

    /// <summary>
    /// Evaluates an assertion. Placeholder: every assertion type currently passes.
    /// </summary>
    private static (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    /// <summary>
    /// Increments the trigger count of <paramref name="fault"/> without overwriting concurrent changes.
    /// </summary>
    private void RecordTrigger(ChaosFaultInjection fault)
    {
        var current = fault;
        while (true)
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };

            // Compare-and-swap: only replaces the entry if nobody changed it since we read it.
            if (_activeFaults.TryUpdate(current.FaultId, updated, current))
            {
                return;
            }

            // The fault was removed in the meantime; writing it back would resurrect it.
            if (!_activeFaults.TryGetValue(current.FaultId, out var latest))
            {
                return;
            }

            current = latest;
        }
    }

    /// <summary>
    /// Removes every fault whose expiry time lies before the current time.
    /// </summary>
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expiredIds = _activeFaults
            .Where(entry => entry.Value.ExpiresAt.HasValue && entry.Value.ExpiresAt < now)
            .Select(entry => entry.Key)
            .ToList();

        foreach (var id in expiredIds)
        {
            // Another caller may have purged the same fault already; only log when this call removed it.
            if (_activeFaults.TryRemove(id, out _))
            {
                _logger.LogDebug("Expired chaos fault {FaultId}", id);
            }
        }
    }
}