up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
This commit is contained in:
@@ -0,0 +1,471 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for a chaos testing engine that simulates channel outages and failures.
/// </summary>
public interface IChaosEngine
{
    /// <summary>
    /// Registers a new fault for the channel type named in <paramref name="request"/>.
    /// </summary>
    /// <param name="request">Describes the fault to inject.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The fault injection that was created.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(
        ChaosFaultRequest request,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes a previously injected fault.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>A flag indicating whether a fault was removed.</returns>
    Task<bool> RemoveFaultAsync(
        string faultId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists the faults that are currently active.
    /// </summary>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The active fault injections.</returns>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Decides whether an operation on a channel should fail because of an injected fault.
    /// </summary>
    /// <param name="channelType">Channel type the operation targets.</param>
    /// <param name="tenantId">Tenant performing the operation, if any.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The decision, together with details of the fault that applies.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(
        string channelType,
        string? tenantId = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Executes the steps of a chaos test scenario.
    /// </summary>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Token for cancelling the run.</param>
    /// <returns>The outcome of the scenario, including per-step results.</returns>
    Task<ChaosTestResult> RunScenarioAsync(
        ChaosScenario scenario,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Returns results of previously executed scenarios.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>Recorded scenario results.</returns>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(
        int limit = 50,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes every active fault.
    /// </summary>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Describes a fault that a caller wants the chaos engine to inject.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>Channel type the fault applies to; the default engine treats <c>"*"</c> as matching every channel type.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is limited to; the default engine matches any tenant when this is <c>null</c>.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure to simulate.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Chance that a matching operation fails; the default engine clamps it to the range 0.0 to 1.0. Defaults to 1.0.</summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>
    /// Requested lifetime of the fault. The default engine substitutes and caps this with
    /// <see cref="ChaosEngineOptions.MaxFaultDuration"/>.
    /// </summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Latency value carried onto the injection and reported back when the fault triggers.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried onto the injection.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message used for the simulated exception instead of the built-in text.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Free-form description of the fault.</summary>
    public string? Description { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of failure the chaos engine can simulate.
/// </summary>
public enum ChaosFaultType
{
    /// <summary>Channel outage; the default engine simulates it with an <see cref="InvalidOperationException"/>.</summary>
    Outage = 0,

    /// <summary>Added latency; the default engine has no dedicated exception for it and uses a plain <see cref="Exception"/>.</summary>
    Latency = 1,

    /// <summary>Rate limiting; the default engine simulates it with an <see cref="InvalidOperationException"/>.</summary>
    RateLimit = 2,

    /// <summary>Authentication failure; the default engine simulates it with an <see cref="UnauthorizedAccessException"/>.</summary>
    AuthFailure = 3,

    /// <summary>Timeout; the default engine simulates it with a <see cref="TimeoutException"/>.</summary>
    Timeout = 4,

    /// <summary>Partial failure; the default engine has no dedicated exception for it and uses a plain <see cref="Exception"/>.</summary>
    PartialFailure = 5,

    /// <summary>Intermittent failure; the default engine has no dedicated exception for it and uses a plain <see cref="Exception"/>.</summary>
    Intermittent = 6
}
|
||||
|
||||
/// <summary>
/// A fault that has been registered with the chaos engine.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier assigned to the fault by the engine.</summary>
    public required string FaultId { get; init; }

    /// <summary>Channel type the fault applies to; the default engine treats <c>"*"</c> as matching every channel type.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is limited to; the default engine matches any tenant when this is <c>null</c>.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure being simulated.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Chance that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Latency value reported back when the fault triggers.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code copied from the request.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message used for the simulated exception.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Free-form description copied from the request.</summary>
    public string? Description { get; init; }

    /// <summary>Time at which the fault was injected.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>Time after which the default engine discards the fault; <c>null</c> means it is never discarded by expiry.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Number of times the fault has caused a simulated failure.</summary>
    public int TriggerCount { get; init; }

    /// <summary>Whether the fault is considered when matching operations. Defaults to <c>true</c>.</summary>
    public bool IsActive { get; init; } = true;
}
|
||||
|
||||
/// <summary>
/// Outcome of asking the chaos engine whether an operation should fail.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary><c>true</c> when the operation should be failed.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>The fault that matched the operation, if any. The default engine sets it even when the probability roll spared the operation.</summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency value taken from the matched fault; set by the default engine only when <see cref="ShouldFail"/> is <c>true</c>.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception representing the simulated failure; set by the default engine only when <see cref="ShouldFail"/> is <c>true</c>.</summary>
    public Exception? SimulatedException { get; init; }
}
|
||||
|
||||
/// <summary>
/// An ordered sequence of steps that makes up one chaos test.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario.</summary>
    public required string Name { get; init; }

    /// <summary>Free-form description of the scenario.</summary>
    public string? Description { get; init; }

    /// <summary>Steps to execute, in order.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Upper bound on the whole run. Defaults to ten minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>When <c>true</c>, the run ends at the first step that fails.</summary>
    public bool StopOnFirstFailure { get; init; }
}
|
||||
|
||||
/// <summary>
/// One step of a <see cref="ChaosScenario"/>.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Display name of the step.</summary>
    public required string Name { get; init; }

    /// <summary>What the step does; determines which of the optional properties is consulted.</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Fault to inject; used by <see cref="ChaosStepAction.InjectFault"/> steps.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Identifier of the fault to remove; used by <see cref="ChaosStepAction.RemoveFault"/> steps.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>How long to pause; used by <see cref="ChaosStepAction.Wait"/> steps.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Condition to evaluate; used by <see cref="ChaosStepAction.Assert"/> steps.</summary>
    public ChaosAssertion? Assertion { get; init; }
}
|
||||
|
||||
/// <summary>
/// Actions a chaos scenario step can perform.
/// </summary>
public enum ChaosStepAction
{
    /// <summary>Inject the fault described by the step.</summary>
    InjectFault = 0,

    /// <summary>Remove the fault whose identifier the step carries.</summary>
    RemoveFault = 1,

    /// <summary>Pause for the duration the step carries.</summary>
    Wait = 2,

    /// <summary>Evaluate the assertion the step carries.</summary>
    Assert = 3,

    /// <summary>Send a test delivery. NOTE(review): the default engine in this file has no handling for this action.</summary>
    SendTestDelivery = 4,

    /// <summary>Check metrics. NOTE(review): the default engine in this file has no handling for this action.</summary>
    CheckMetrics = 5
}
|
||||
|
||||
/// <summary>
/// A condition checked by an assert step of a chaos scenario.
/// </summary>
/// <remarks>
/// NOTE(review): the default engine in this file only looks at <see cref="Type"/>; the remaining
/// properties are not read there, so their exact semantics should be confirmed against the real evaluator.
/// </remarks>
public sealed record ChaosAssertion
{
    /// <summary>Kind of check to perform.</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Name of the metric the check refers to, if any.</summary>
    public string? MetricName { get; init; }

    /// <summary>Value the check compares against, if any.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Tolerance for the comparison, if any (presumably the allowed deviation from <see cref="ExpectedValue"/>; confirm).</summary>
    public double? Tolerance { get; init; }

    /// <summary>Status the check expects, if any.</summary>
    public string? ExpectedStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of check a <see cref="ChaosAssertion"/> can express.
/// </summary>
/// <remarks>
/// NOTE(review): the default engine in this file treats every kind as passing, so the member
/// descriptions below follow the names only and should be confirmed against the real evaluator.
/// </remarks>
public enum ChaosAssertionType
{
    /// <summary>A metric equals an expected value.</summary>
    MetricEquals = 0,

    /// <summary>A metric is greater than an expected value.</summary>
    MetricGreaterThan = 1,

    /// <summary>A metric is less than an expected value.</summary>
    MetricLessThan = 2,

    /// <summary>The dead-letter count equals an expected value.</summary>
    DeadLetterCountEquals = 3,

    /// <summary>A fallback was triggered.</summary>
    FallbackTriggered = 4,

    /// <summary>An alert was fired.</summary>
    AlertFired = 5
}
|
||||
|
||||
/// <summary>
/// Outcome of running one chaos scenario.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier assigned to this run by the engine.</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary><c>true</c> when the run completed without a failed step or run-level error.</summary>
    public bool Success { get; init; }

    /// <summary>Time at which the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>Time at which the run finished.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>Elapsed time between <see cref="StartedAt"/> and <see cref="CompletedAt"/>.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the steps that were executed, in execution order. Defaults to an empty list.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Run-level error text, if any.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of executing one step of a chaos scenario.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the step that was executed.</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the step that was executed.</summary>
    public required string StepName { get; init; }

    /// <summary><c>true</c> when the step succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>Time at which the step started executing.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>Time the step took to execute.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Error text when the step failed.</summary>
    public string? Error { get; init; }

    /// <summary>Optional payload produced by the step; the default engine stores the created fault here for inject steps.</summary>
    public object? Data { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration settings for the chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Name of the configuration section these options are associated with.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Master switch; the default engine refuses to inject faults or run scenarios while this is <c>false</c>.</summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos is permitted in production. NOTE(review): not consulted by the default engine in this file.</summary>
    public bool AllowInProduction { get; set; }

    /// <summary>Longest lifetime a fault may have; also used when a request names no duration. Defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Largest number of faults that may be registered at once. Defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types that may be targeted. NOTE(review): not consulted by the default engine in this file.</summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } =
    [
        "webhook",
        "email",
        "slack",
        "teams",
        "pagerduty",
        "opsgenie"
    ];
}
|
||||
|
||||
/// <summary>
/// Default, in-memory implementation of <see cref="IChaosEngine"/>.
/// </summary>
/// <remarks>
/// Active faults live in a concurrent dictionary keyed by fault id, and scenario results are kept in a
/// lock-guarded list capped at <see cref="MaxHistoryEntries"/> entries. Nothing is persisted.
/// NOTE(review): <see cref="ChaosEngineOptions.AllowInProduction"/> and
/// <see cref="ChaosEngineOptions.AllowedChannelTypes"/> are not enforced by this class.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    /// <summary>Maximum number of scenario results retained in memory.</summary>
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; defaults are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; the system clock is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink (stored but not used by the code in this class).</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault. The requested duration is capped at
    /// <see cref="ChaosEngineOptions.MaxFaultDuration"/> and the probability is clamped to 0.0-1.0.
    /// </summary>
    /// <param name="request">Fault to inject.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The created fault injection.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The engine is disabled or the concurrent fault limit is reached.</exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        // Purge expired entries first so stale faults do not count against the limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            // "chaos-" followed by the first ten hex digits of a fresh GUID.
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);

        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given id.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns><c>true</c> when a fault with that id existed and was removed.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after discarding expired ones.
    /// </summary>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation should fail. A fault matches when it is active, its channel type
    /// equals <paramref name="channelType"/> or is <c>"*"</c>, and its tenant is <c>null</c> or equals
    /// <paramref name="tenantId"/>. The first matching fault is then rolled against its probability.
    /// </summary>
    /// <param name="channelType">Channel type of the operation.</param>
    /// <param name="tenantId">Tenant performing the operation, if any.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>
    /// A result with <see cref="ChaosFaultResult.ShouldFail"/> set to <c>false</c> when the engine is
    /// disabled, no fault matches, or the roll spares the operation; otherwise a failing result carrying
    /// the matched fault, its latency value and a simulated exception.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values
            .Where(f => f.IsActive)
            .Where(f => f.ChannelType == channelType || f.ChannelType == "*")
            .Where(f => f.TenantId is null || f.TenantId == tenantId)
            .FirstOrDefault();

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is safe for concurrent callers; a private Random instance is not.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        RecordTrigger(matchingFault.FaultId);

        Exception exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
            _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Runs the steps of a scenario in order and records the outcome in the in-memory history.
    /// The run stops early when the scenario timeout elapses, when the caller's token is cancelled,
    /// or at the first failed step if <see cref="ChaosScenario.StopOnFirstFailure"/> is set.
    /// </summary>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Token that aborts the run.</param>
    /// <returns>The scenario result; failures inside the run are reported in it rather than thrown.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(scenario);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Do not start another step (e.g. inject a fault) once the run has been cancelled.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (!stepResult.Success)
                {
                    success = false;

                    // ExecuteStepAsync converts cancellation into a failed step result; surface it here
                    // so a timeout is reported as such and the remaining steps are skipped.
                    linkedCts.Token.ThrowIfCancellationRequested();

                    if (scenario.StopOnFirstFailure)
                    {
                        error = $"Step '{step.Name}' failed: {stepResult.Error}";
                        break;
                    }
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (Exception ex)
        {
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);

            // Drop the oldest entries once the cap is exceeded.
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Executes a single step and converts any exception into a failed step result.
    /// </summary>
    /// <remarks>
    /// NOTE(review): a step whose action is not handled below, or whose required payload is missing,
    /// performs no work and is still reported as successful.
    /// </remarks>
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        // Builds the result for this step, measuring the duration at the moment of completion.
        ChaosStepResult Finish(bool succeeded, string? failure = null, object? payload = null) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = succeeded,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Error = failure,
            Data = payload
        };

        try
        {
            object? data = null;

            switch (step.Action)
            {
                case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                    data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    break;

                case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    break;

                case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    break;

                case ChaosStepAction.Assert when step.Assertion is not null:
                    var (passed, assertionError) = EvaluateAssertion(step.Assertion);
                    if (!passed)
                    {
                        return Finish(succeeded: false, failure: assertionError);
                    }

                    break;
            }

            return Finish(succeeded: true, payload: data);
        }
        catch (Exception ex)
        {
            return Finish(succeeded: false, failure: ex.Message);
        }
    }

    /// <summary>
    /// Evaluates an assertion. This is a placeholder: every assertion type currently passes.
    /// </summary>
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> of the most recent scenario results, newest first.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    /// <summary>
    /// Atomically increments the trigger count of a fault that is still registered.
    /// </summary>
    /// <remarks>
    /// A compare-and-swap loop is used instead of an indexer write so that concurrent triggers are not
    /// lost and a fault removed in the meantime is not re-added.
    /// </remarks>
    private void RecordTrigger(string faultId)
    {
        while (_activeFaults.TryGetValue(faultId, out var current))
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };
            if (_activeFaults.TryUpdate(faultId, updated, current))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Removes faults whose expiry time lies before the current time.
    /// </summary>
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expired = _activeFaults
            .Where(f => f.Value.ExpiresAt.HasValue && f.Value.ExpiresAt < now)
            .Select(f => f.Key)
            .ToList();

        foreach (var id in expired)
        {
            _activeFaults.TryRemove(id, out _);
            _logger.LogDebug("Expired chaos fault {FaultId}", id);
        }
    }
}
|
||||
Reference in New Issue
Block a user