using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.Notifier.Worker.Observability;

/// <summary>
/// Chaos testing engine for simulating channel outages and failures.
/// </summary>
public interface IChaosEngine
{
    /// <summary>
    /// Injects a fault for a channel type.
    /// </summary>
    /// <param name="request">Describes the channel, fault type, probability and lifetime of the fault.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The fault injection that was registered.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes a fault injection.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns><c>true</c> when a fault with that identifier was removed; otherwise <c>false</c>.</returns>
    Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets all active faults.
    /// </summary>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>A snapshot of the currently registered faults.</returns>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Checks if a channel operation should fail due to chaos.
    /// </summary>
    /// <param name="channelType">Channel type of the operation being attempted.</param>
    /// <param name="tenantId">Tenant performing the operation, if any.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The decision, including the matching fault and the exception to simulate when failing.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>
    /// Runs a chaos test scenario.
    /// </summary>
    /// <param name="scenario">The scenario whose steps are executed in order.</param>
    /// <param name="cancellationToken">Token for cancelling the run.</param>
    /// <returns>The overall result together with one result per executed step.</returns>
    Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets chaos test history.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>Recorded scenario results (<see cref="DefaultChaosEngine"/> returns the newest first).</returns>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);

    /// <summary>
    /// Clears all active faults.
    /// </summary>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}

/// <summary>
/// Request to inject a fault.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>Channel type to affect; <see cref="DefaultChaosEngine"/> treats <c>"*"</c> as matching every channel.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant to restrict the fault to; <c>null</c> applies the fault to all tenants.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure to simulate.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails. Defaults to 1.0; <see cref="DefaultChaosEngine"/> clamps it to [0, 1].</summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>Requested lifetime of the fault; <see cref="DefaultChaosEngine"/> falls back to, and caps at, <see cref="ChaosEngineOptions.MaxFaultDuration"/>.</summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Latency reported back to callers when the fault triggers.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried on the injected fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message used for the simulated exception.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Free-form description of why the fault was injected.</summary>
    public string? Description { get; init; }
}

/// <summary>
/// Type of chaos fault.
/// </summary>
public enum ChaosFaultType
{
    Outage,
    Latency,
    RateLimit,
    AuthFailure,
    Timeout,
    PartialFailure,
    Intermittent
}

/// <summary>
/// Active fault injection.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier of the fault (<see cref="DefaultChaosEngine"/> generates a 16-character id starting with <c>chaos-</c>).</summary>
    public required string FaultId { get; init; }

    /// <summary>Channel type the fault applies to; <c>"*"</c> matches every channel in <see cref="DefaultChaosEngine"/>.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is restricted to; <c>null</c> means all tenants.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure being simulated.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Latency reported to callers when the fault triggers.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code copied from the request.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message used for the simulated exception.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Free-form description copied from the request.</summary>
    public string? Description { get; init; }

    /// <summary>When the fault was injected.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>When the fault expires; <c>null</c> means it never expires on its own.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Number of times the fault caused a simulated failure.</summary>
    public int TriggerCount { get; init; }

    /// <summary>Whether the fault is considered when matching operations. Defaults to <c>true</c>.</summary>
    public bool IsActive { get; init; } = true;
}

/// <summary>
/// Result of checking for chaos fault.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary><c>true</c> when the operation should fail.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>The fault that matched the operation, if any (may be set even when <see cref="ShouldFail"/> is <c>false</c>).</summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency the caller should simulate, if the fault defines one.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception the caller should surface to simulate the failure.</summary>
    public Exception? SimulatedException { get; init; }
}

/// <summary>
/// A chaos test scenario.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario.</summary>
    public required string Name { get; init; }

    /// <summary>Optional description of what the scenario exercises.</summary>
    public string? Description { get; init; }

    /// <summary>Steps executed in order.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Upper bound for the whole run. Defaults to 10 minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>When <c>true</c>, the run stops at the first failing step.</summary>
    public bool StopOnFirstFailure { get; init; }
}

/// <summary>
/// A step in a chaos scenario.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Display name of the step.</summary>
    public required string Name { get; init; }

    /// <summary>What the step does; determines which of the optional properties is read.</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Fault to inject for <see cref="ChaosStepAction.InjectFault"/> steps.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Fault to remove for <see cref="ChaosStepAction.RemoveFault"/> steps.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>Delay for <see cref="ChaosStepAction.Wait"/> steps.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Assertion to evaluate for <see cref="ChaosStepAction.Assert"/> steps.</summary>
    public ChaosAssertion? Assertion { get; init; }
}

/// <summary>
/// Action type for a chaos step.
/// </summary>
/// <remarks>
/// <see cref="DefaultChaosEngine"/> acts on <see cref="InjectFault"/>, <see cref="RemoveFault"/>,
/// <see cref="Wait"/> and <see cref="Assert"/>; the remaining actions currently complete without doing any work.
/// </remarks>
public enum ChaosStepAction
{
    InjectFault,
    RemoveFault,
    Wait,
    Assert,
    SendTestDelivery,
    CheckMetrics
}

/// <summary>
/// Assertion for chaos testing.
/// </summary>
public sealed record ChaosAssertion
{
    /// <summary>Kind of check to perform.</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Name of the metric the assertion refers to, for metric-based assertion types.</summary>
    public string? MetricName { get; init; }

    /// <summary>Value the assertion compares against.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Allowed deviation from <see cref="ExpectedValue"/>.</summary>
    public double? Tolerance { get; init; }

    /// <summary>Status the assertion expects.</summary>
    public string? ExpectedStatus { get; init; }
}

/// <summary>
/// Type of chaos assertion.
/// </summary>
public enum ChaosAssertionType
{
    MetricEquals,
    MetricGreaterThan,
    MetricLessThan,
    DeadLetterCountEquals,
    FallbackTriggered,
    AlertFired
}

/// <summary>
/// Result of a chaos test.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier of this test run.</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary><c>true</c> when the run completed without a failing step, timeout or error.</summary>
    public bool Success { get; init; }

    /// <summary>When the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>When the run finished.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>Elapsed time between <see cref="StartedAt"/> and <see cref="CompletedAt"/>.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the steps that were executed, in execution order. Defaults to an empty list.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Description of why the run failed, when available.</summary>
    public string? Error { get; init; }
}

/// <summary>
/// Result of a chaos step.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the step.</summary>
    public required string StepName { get; init; }

    /// <summary><c>true</c> when the step completed without error.</summary>
    public bool Success { get; init; }

    /// <summary>When the step started executing.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>How long the step took.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Failure message, when the step failed.</summary>
    public string? Error { get; init; }

    /// <summary>Optional payload produced by the step (for example the injected fault).</summary>
    public object? Data { get; init; }
}

/// <summary>
/// Options for chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Configuration section the options are bound from.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Master switch; chaos features are off unless this is <c>true</c>.</summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos testing may run in production.</summary>
    /// <remarks>NOTE(review): not read by <see cref="DefaultChaosEngine"/> in this file - confirm where it is enforced.</remarks>
    public bool AllowInProduction { get; set; }

    /// <summary>Longest lifetime a single fault may have. Defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of faults that may be registered at once. Defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types that faults may target.</summary>
    /// <remarks>NOTE(review): not read by <see cref="DefaultChaosEngine"/> in this file - confirm where it is enforced.</remarks>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } = ["webhook", "email", "slack", "teams", "pagerduty", "opsgenie"];
}

/// <summary>
/// Default implementation of chaos engine.
/// </summary>
public sealed class DefaultChaosEngine : IChaosEngine
{
    // Number of scenario results kept in memory before the oldest are dropped.
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; default options are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; the system clock is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink.</param>
    /// <param name="logger">Logger for fault and scenario events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault for the requested channel.
    /// </summary>
    /// <param name="request">Fault description. The duration is capped at the configured maximum and the probability is clamped to [0, 1].</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The registered fault.</returns>
    /// <exception cref="InvalidOperationException">The engine is disabled, or the maximum number of concurrent faults is reached.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        ArgumentNullException.ThrowIfNull(request);

        // Expired faults no longer affect traffic, so they must not count against the cap.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;

        _logger.LogWarning(
            "Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}",
            fault.FaultId,
            fault.ChannelType,
            fault.FaultType);

        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given identifier.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns><c>true</c> when a fault was removed; <c>false</c> when no fault had that identifier.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after dropping expired ones.
    /// </summary>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation on <paramref name="channelType"/> should fail.
    /// </summary>
    /// <param name="channelType">Channel type of the operation; faults registered for <c>"*"</c> match any channel.</param>
    /// <param name="tenantId">Tenant of the operation; faults without a tenant match any tenant.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>
    /// A result with <c>ShouldFail = false</c> when the engine is disabled, no fault matches, or the
    /// probability roll passes; otherwise a failing result carrying the fault, its latency and a simulated exception.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values.FirstOrDefault(f =>
            f.IsActive
            && (f.ChannelType == channelType || f.ChannelType == "*")
            && (f.TenantId is null || f.TenantId == tenantId));

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is safe for concurrent callers; a private Random instance is not.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        RecordTrigger(matchingFault);

        Exception exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
            _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Executes the steps of <paramref name="scenario"/> in order and records the outcome in the history.
    /// </summary>
    /// <param name="scenario">Scenario to run; its timeout bounds the whole run.</param>
    /// <param name="cancellationToken">Cancels the run; cancellation is reported as a failed result rather than thrown.</param>
    /// <returns>The overall result including one entry per executed step.</returns>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <c>null</c>.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        ArgumentNullException.ThrowIfNull(scenario);

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Steps such as fault injection never look at the token, so check it here to
                // stop the scenario once the timeout elapsed or the caller cancelled.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (!stepResult.Success)
                {
                    success = false;
                    if (scenario.StopOnFirstFailure)
                    {
                        error = $"Step '{step.Name}' failed: {stepResult.Error}";
                        break;
                    }
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (Exception ex)
        {
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();

        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");

        return result;
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> of the most recent scenario results, newest first.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    // Increments the fault's trigger count with compare-and-swap. Unlike an indexer write this
    // cannot re-add a fault that was removed concurrently, and concurrent triggers are not lost.
    private void RecordTrigger(ChaosFaultInjection fault)
    {
        var snapshot = fault;
        while (!_activeFaults.TryUpdate(
                   snapshot.FaultId,
                   snapshot with { TriggerCount = snapshot.TriggerCount + 1 },
                   snapshot))
        {
            if (!_activeFaults.TryGetValue(snapshot.FaultId, out var latest))
            {
                // Removed or expired in the meantime; nothing left to count against.
                return;
            }

            snapshot = latest;
        }
    }

    // Runs a single step and converts ordinary failures into a failed step result.
    // Cancellation is deliberately not converted so the scenario loop can stop and report it.
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        try
        {
            object? data = null;

            // A step whose action-specific payload is missing, or whose action has no
            // handling here, falls through and is reported as successful.
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                    data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    break;

                case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    break;

                case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    break;

                case ChaosStepAction.Assert when step.Assertion is not null:
                    var assertResult = EvaluateAssertion(step.Assertion);
                    if (!assertResult.passed)
                    {
                        return new ChaosStepResult
                        {
                            StepId = step.StepId,
                            StepName = step.Name,
                            Success = false,
                            ExecutedAt = executedAt,
                            Duration = _timeProvider.GetUtcNow() - executedAt,
                            Error = assertResult.error
                        };
                    }

                    break;
            }

            return new ChaosStepResult
            {
                StepId = step.StepId,
                StepName = step.Name,
                Success = true,
                ExecutedAt = executedAt,
                Duration = _timeProvider.GetUtcNow() - executedAt,
                Data = data
            };
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            return new ChaosStepResult
            {
                StepId = step.StepId,
                StepName = step.Name,
                Success = false,
                ExecutedAt = executedAt,
                Duration = _timeProvider.GetUtcNow() - executedAt,
                Error = ex.Message
            };
        }
    }

    // Placeholder evaluation: every assertion type currently passes.
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    // Drops faults whose expiry time has passed.
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expired = _activeFaults
            .Where(f => f.Value.ExpiresAt.HasValue && f.Value.ExpiresAt < now)
            .Select(f => f.Key)
            .ToList();

        foreach (var id in expired)
        {
            // Log only when this call removed the entry, so concurrent cleanups do not
            // report the same expiry more than once.
            if (_activeFaults.TryRemove(id, out _))
            {
                _logger.LogDebug("Expired chaos fault {FaultId}", id);
            }
        }
    }
}