Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
472 lines
17 KiB
C#
472 lines
17 KiB
C#
using System.Collections.Concurrent;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
|
|
namespace StellaOps.Notifier.Worker.Observability;
|
|
|
|
/// <summary>
/// Contract for a chaos-testing engine that simulates notification channel
/// outages and failures, and runs scripted chaos scenarios.
/// </summary>
public interface IChaosEngine
{
    /// <summary>
    /// Registers a new fault for a channel type.
    /// </summary>
    /// <param name="request">Describes the channel, tenant scope and failure characteristics of the fault.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The fault as it was registered, including its generated identifier.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes a previously injected fault.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns><see langword="true"/> when a fault with that identifier was found and removed; otherwise <see langword="false"/>.</returns>
    Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists the faults that are currently registered.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A snapshot of the active faults.</returns>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Decides whether an operation on the given channel should fail because of an injected fault.
    /// </summary>
    /// <param name="channelType">Channel type the operation is about to use.</param>
    /// <param name="tenantId">Tenant performing the operation, if known.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The decision, together with the matching fault and simulated failure details when applicable.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>
    /// Executes the steps of a chaos test scenario.
    /// </summary>
    /// <param name="scenario">The scenario to run.</param>
    /// <param name="cancellationToken">Token used to cancel the run.</param>
    /// <returns>The overall outcome together with one result per executed step.</returns>
    Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);

    /// <summary>
    /// Returns results of previously executed scenarios.
    /// </summary>
    /// <param name="limit">Maximum number of results to return. Defaults to 50.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>Up to <paramref name="limit"/> recorded test results.</returns>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A task that completes once all faults have been cleared.</returns>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
|
|
|
|
/// <summary>
/// Describes a fault that a caller wants the chaos engine to inject.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>
    /// Channel type the fault targets. <c>DefaultChaosEngine</c> compares it by exact
    /// string equality and treats <c>"*"</c> as matching every channel type.
    /// </summary>
    public required string ChannelType { get; init; }

    /// <summary>
    /// Tenant the fault is limited to; <see langword="null"/> makes
    /// <c>DefaultChaosEngine</c> apply it regardless of tenant.
    /// </summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure to simulate.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>
    /// Probability that a matching operation fails. Defaults to <c>1.0</c>;
    /// <c>DefaultChaosEngine</c> clamps the value to the range 0.0 to 1.0.
    /// </summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>
    /// Requested lifetime of the fault. When <see langword="null"/>, or longer than
    /// <see cref="ChaosEngineOptions.MaxFaultDuration"/>, <c>DefaultChaosEngine</c>
    /// uses the configured maximum instead.
    /// </summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Latency to report alongside a triggered fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried on the injected fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>
    /// Optional message for the simulated exception; <c>DefaultChaosEngine</c> falls
    /// back to a built-in message per fault type when this is <see langword="null"/>.
    /// </summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-form description of the fault.</summary>
    public string? Description { get; init; }
}
|
|
|
|
/// <summary>
/// Kinds of fault the chaos engine can simulate. The numeric values are the
/// ones the compiler assigns implicitly, spelled out so that reordering members
/// cannot silently change them.
/// </summary>
public enum ChaosFaultType
{
    /// <summary>Channel is unavailable. <c>DefaultChaosEngine</c> simulates it with an <see cref="InvalidOperationException"/>.</summary>
    Outage = 0,

    /// <summary>Added latency. <c>DefaultChaosEngine</c> has no dedicated exception for it and uses a plain <see cref="Exception"/>.</summary>
    Latency = 1,

    /// <summary>Channel rate limiting. <c>DefaultChaosEngine</c> simulates it with an <see cref="InvalidOperationException"/>.</summary>
    RateLimit = 2,

    /// <summary>Authentication failure. <c>DefaultChaosEngine</c> simulates it with an <see cref="UnauthorizedAccessException"/>.</summary>
    AuthFailure = 3,

    /// <summary>Operation timeout. <c>DefaultChaosEngine</c> simulates it with a <see cref="TimeoutException"/>.</summary>
    Timeout = 4,

    /// <summary>Partial failure. <c>DefaultChaosEngine</c> has no dedicated exception for it and uses a plain <see cref="Exception"/>.</summary>
    PartialFailure = 5,

    /// <summary>Intermittent failure. <c>DefaultChaosEngine</c> has no dedicated exception for it and uses a plain <see cref="Exception"/>.</summary>
    Intermittent = 6
}
|
|
|
|
/// <summary>
/// A fault that has been registered with the chaos engine.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier of the fault, used to remove it later.</summary>
    public required string FaultId { get; init; }

    /// <summary>
    /// Channel type the fault targets. <c>DefaultChaosEngine</c> treats <c>"*"</c>
    /// as matching every channel type.
    /// </summary>
    public required string ChannelType { get; init; }

    /// <summary>
    /// Tenant the fault is limited to; <see langword="null"/> makes
    /// <c>DefaultChaosEngine</c> apply it regardless of tenant.
    /// </summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure being simulated.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Latency reported alongside a triggered fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code associated with the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message used for the simulated exception.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-form description of the fault.</summary>
    public string? Description { get; init; }

    /// <summary>Time at which the fault was registered.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Time after which the fault no longer applies. <c>DefaultChaosEngine</c>
    /// discards faults whose expiry lies before the current time.
    /// </summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Number of times the fault has caused a simulated failure.</summary>
    public int TriggerCount { get; init; }

    /// <summary>
    /// Whether the fault is considered when matching operations. Defaults to
    /// <see langword="true"/>.
    /// </summary>
    public bool IsActive { get; init; } = true;
}
|
|
|
|
/// <summary>
/// Outcome of asking the chaos engine whether an operation should fail.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary><see langword="true"/> when the caller should simulate a failure.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>
    /// The fault that matched the operation, if any. <c>DefaultChaosEngine</c> also
    /// sets it when a fault matched but the probability check decided against failing.
    /// </summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency associated with the triggered fault, when one was configured.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception representing the simulated failure.</summary>
    public Exception? SimulatedException { get; init; }
}
|
|
|
|
/// <summary>
/// A scripted chaos test made up of ordered steps.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Human-readable scenario name.</summary>
    public required string Name { get; init; }

    /// <summary>Optional free-form description.</summary>
    public string? Description { get; init; }

    /// <summary>Steps of the scenario; <c>DefaultChaosEngine</c> runs them in list order.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Upper bound on the run time of the whole scenario. Defaults to ten minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>
    /// When <see langword="true"/>, the run stops at the first failing step instead
    /// of continuing with the remaining ones.
    /// </summary>
    public bool StopOnFirstFailure { get; init; }
}
|
|
|
|
/// <summary>
/// One step of a <see cref="ChaosScenario"/>. Which of the optional payload
/// properties is consulted depends on <see cref="Action"/>.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step, copied onto its result.</summary>
    public required string StepId { get; init; }

    /// <summary>Human-readable step name, copied onto its result.</summary>
    public required string Name { get; init; }

    /// <summary>What the step does.</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Fault to register; used by <see cref="ChaosStepAction.InjectFault"/> steps.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Identifier of the fault to remove; used by <see cref="ChaosStepAction.RemoveFault"/> steps.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>How long to pause; used by <see cref="ChaosStepAction.Wait"/> steps.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Condition to check; used by <see cref="ChaosStepAction.Assert"/> steps.</summary>
    public ChaosAssertion? Assertion { get; init; }
}
|
|
|
|
/// <summary>
/// Actions a chaos scenario step can perform. The numeric values are the ones
/// the compiler assigns implicitly, spelled out so that reordering members
/// cannot silently change them.
/// </summary>
public enum ChaosStepAction
{
    /// <summary>Register the fault described by <c>ChaosScenarioStep.FaultToInject</c>.</summary>
    InjectFault = 0,

    /// <summary>Remove the fault identified by <c>ChaosScenarioStep.FaultIdToRemove</c>.</summary>
    RemoveFault = 1,

    /// <summary>Pause for <c>ChaosScenarioStep.WaitDuration</c>.</summary>
    Wait = 2,

    /// <summary>Evaluate <c>ChaosScenarioStep.Assertion</c>.</summary>
    Assert = 3,

    /// <summary>Send a test delivery. <c>DefaultChaosEngine</c> performs no work for this action.</summary>
    SendTestDelivery = 4,

    /// <summary>Check metrics. <c>DefaultChaosEngine</c> performs no work for this action.</summary>
    CheckMetrics = 5
}
|
|
|
|
/// <summary>
/// A condition that a chaos scenario step can check.
/// </summary>
public sealed record ChaosAssertion
{
    /// <summary>Kind of check to perform.</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Name of the metric the assertion refers to, if any.</summary>
    public string? MetricName { get; init; }

    /// <summary>Value the assertion compares against, if any.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Allowed deviation from <see cref="ExpectedValue"/>, if any.</summary>
    public double? Tolerance { get; init; }

    /// <summary>Status the assertion expects, if any.</summary>
    public string? ExpectedStatus { get; init; }
}
|
|
|
|
/// <summary>
/// Kinds of assertion a chaos scenario can make. The numeric values are the
/// ones the compiler assigns implicitly, spelled out so that reordering members
/// cannot silently change them.
/// </summary>
/// <remarks>
/// The assertion evaluation in <c>DefaultChaosEngine</c> is a simplified
/// placeholder that reports every assertion type as passed.
/// </remarks>
public enum ChaosAssertionType
{
    /// <summary>A metric equals an expected value.</summary>
    MetricEquals = 0,

    /// <summary>A metric is greater than an expected value.</summary>
    MetricGreaterThan = 1,

    /// <summary>A metric is less than an expected value.</summary>
    MetricLessThan = 2,

    /// <summary>The dead-letter count equals an expected value.</summary>
    DeadLetterCountEquals = 3,

    /// <summary>A fallback was triggered.</summary>
    FallbackTriggered = 4,

    /// <summary>An alert fired.</summary>
    AlertFired = 5
}
|
|
|
|
/// <summary>
/// Outcome of one run of a chaos scenario.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier of this particular run.</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary><see langword="true"/> when the run finished without a failing step or error.</summary>
    public bool Success { get; init; }

    /// <summary>Time at which the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>Time at which the run finished.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>Elapsed time of the run.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the steps that were executed. Defaults to an empty list.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Description of what went wrong, when available.</summary>
    public string? Error { get; init; }
}
|
|
|
|
/// <summary>
/// Outcome of a single chaos scenario step.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the step this result belongs to.</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the step this result belongs to.</summary>
    public required string StepName { get; init; }

    /// <summary><see langword="true"/> when the step completed successfully.</summary>
    public bool Success { get; init; }

    /// <summary>Time at which the step started executing.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>Elapsed time of the step.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Description of the failure, when the step did not succeed.</summary>
    public string? Error { get; init; }

    /// <summary>Optional payload produced by the step, such as the fault that was injected.</summary>
    public object? Data { get; init; }
}
|
|
|
|
/// <summary>
/// Configuration for the chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Name of the configuration section these options are read from.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>
    /// Master switch. While <see langword="false"/> (the default),
    /// <c>DefaultChaosEngine</c> refuses to inject faults or run scenarios and
    /// never reports that an operation should fail.
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos testing may be used in production. Defaults to <see langword="false"/>.</summary>
    public bool AllowInProduction { get; set; }

    /// <summary>
    /// Longest lifetime a fault may have. Defaults to one hour.
    /// <c>DefaultChaosEngine</c> also uses it when a request names no duration.
    /// </summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of faults that may be registered at the same time. Defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types for which chaos faults are permitted.</summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } = ["webhook", "email", "slack", "teams", "pagerduty", "opsgenie"];
}
|
|
|
|
/// <summary>
/// Default in-memory implementation of <see cref="IChaosEngine"/>.
/// </summary>
/// <remarks>
/// Active faults live in a <see cref="ConcurrentDictionary{TKey, TValue}"/> keyed by fault id and
/// scenario results in a bounded in-memory list, so nothing is persisted.
/// NOTE(review): <see cref="ChaosEngineOptions.AllowedChannelTypes"/> and
/// <see cref="ChaosEngineOptions.AllowInProduction"/> are not consulted anywhere in this class;
/// confirm whether they are enforced by a caller.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    /// <summary>Maximum number of scenario results kept in the history.</summary>
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;

    // Accepted and stored, but not read by any member of this class.
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; default options are used when the wrapper or its value is <see langword="null"/>.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; the system clock is used when <see langword="null"/>.</param>
    /// <param name="metrics">Optional metrics sink.</param>
    /// <param name="logger">Logger for fault and scenario events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <see langword="null"/>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a fault. The failure probability is clamped to the range 0.0 to 1.0 and the
    /// lifetime is capped at <see cref="ChaosEngineOptions.MaxFaultDuration"/>.
    /// </summary>
    /// <param name="request">Description of the fault to inject.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The registered fault, including its generated identifier and expiry.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The engine is disabled, or <see cref="ChaosEngineOptions.MaxConcurrentFaults"/> unexpired faults are already registered.
    /// </exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        // Drop expired entries first so stale faults do not count against the limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            // "chaos-" followed by the first ten hex digits of a new GUID.
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);

        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given identifier.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns><see langword="true"/> when the fault existed and was removed.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after discarding expired ones.
    /// </summary>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The faults currently registered.</returns>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation on <paramref name="channelType"/> should fail.
    /// </summary>
    /// <remarks>
    /// A fault matches when it is active, its channel type equals <paramref name="channelType"/>
    /// or is <c>"*"</c>, and its tenant is <see langword="null"/> or equals <paramref name="tenantId"/>.
    /// Only the first matching fault is considered; its failure probability is then rolled.
    /// </remarks>
    /// <param name="channelType">Channel type of the operation.</param>
    /// <param name="tenantId">Tenant of the operation, if any.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>
    /// A result with <see cref="ChaosFaultResult.ShouldFail"/> set to <see langword="false"/> when the
    /// engine is disabled, nothing matches, or the probability roll spares the operation; otherwise
    /// a result carrying the matched fault, its latency and a simulated exception.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values.FirstOrDefault(f =>
            f.IsActive
            && (f.ChannelType == channelType || f.ChannelType == "*")
            && (f.TenantId is null || f.TenantId == tenantId));

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is safe for concurrent callers; a private Random instance is not.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        RecordTrigger(matchingFault.FaultId);

        var message = matchingFault.ErrorMessage;
        Exception exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(message ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(message ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(message ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(message ?? "Rate limited (chaos)"),
            _ => new Exception(message ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Runs the steps of <paramref name="scenario"/> in order and records the outcome in the history.
    /// </summary>
    /// <remarks>
    /// Failures, timeouts and cancellation are reported through the returned result rather than
    /// thrown. Once the scenario timeout elapses or <paramref name="cancellationToken"/> is
    /// cancelled, no further steps are started.
    /// </remarks>
    /// <param name="scenario">The scenario to run.</param>
    /// <param name="cancellationToken">Token used to cancel the run.</param>
    /// <returns>The overall result with one entry per executed step.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <see langword="null"/>.</exception>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(scenario);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Steps that never observe the token (inject/remove/assert) must not
                // keep running after the scenario has timed out or been cancelled.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (stepResult.Success)
                {
                    continue;
                }

                success = false;
                if (scenario.StopOnFirstFailure)
                {
                    error = $"Step '{step.Name}' failed: {stepResult.Error}";
                    break;
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (OperationCanceledException)
        {
            success = false;
            error = "Scenario cancelled";
        }
        catch (Exception ex)
        {
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);

            // Keep only the most recent entries by dropping the oldest.
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Returns the most recent scenario results, newest first.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>Up to <paramref name="limit"/> results from the in-memory history.</returns>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>A completed task.</returns>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    /// <summary>
    /// Executes one scenario step and converts its outcome, including any exception other than
    /// an observed cancellation, into a <see cref="ChaosStepResult"/>.
    /// </summary>
    /// <param name="step">The step to execute.</param>
    /// <param name="cancellationToken">Token covering both caller cancellation and the scenario timeout.</param>
    /// <returns>The result of the step.</returns>
    /// <exception cref="OperationCanceledException">
    /// The step was interrupted because <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        ChaosStepResult Finish(bool succeeded, string? error = null, object? data = null) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = succeeded,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Error = error,
            Data = data
        };

        try
        {
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault:
                {
                    if (step.FaultToInject is null)
                    {
                        return Finish(false, "InjectFault step requires FaultToInject");
                    }

                    var fault = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    return Finish(true, data: fault);
                }

                case ChaosStepAction.RemoveFault:
                {
                    if (step.FaultIdToRemove is null)
                    {
                        return Finish(false, "RemoveFault step requires FaultIdToRemove");
                    }

                    // Removing an unknown fault is not treated as a failure; the outcome is exposed via Data.
                    var removed = await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    return Finish(true, data: removed);
                }

                case ChaosStepAction.Wait:
                {
                    if (!step.WaitDuration.HasValue)
                    {
                        return Finish(false, "Wait step requires WaitDuration");
                    }

                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    return Finish(true);
                }

                case ChaosStepAction.Assert:
                {
                    if (step.Assertion is null)
                    {
                        return Finish(false, "Assert step requires Assertion");
                    }

                    var (passed, assertionError) = EvaluateAssertion(step.Assertion);
                    return passed ? Finish(true) : Finish(false, assertionError);
                }

                default:
                    // SendTestDelivery and CheckMetrics have no implementation here and complete as no-ops.
                    return Finish(true);
            }
        }
        catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
        {
            // Cancellation is deliberately let through so the scenario loop can stop and report it.
            return Finish(false, ex.Message);
        }
    }

    /// <summary>
    /// Evaluates a scenario assertion.
    /// </summary>
    /// <param name="assertion">The assertion to evaluate.</param>
    /// <returns>Whether the assertion passed and, if not, why. Currently always reports success.</returns>
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    /// <summary>
    /// Increments the trigger count of a fault, provided the fault is still registered.
    /// </summary>
    /// <param name="faultId">Identifier of the fault that was triggered.</param>
    private void RecordTrigger(string faultId)
    {
        // Compare-and-swap loop: a plain indexer write could lose a concurrent
        // increment or re-insert a fault that was removed in the meantime.
        while (_activeFaults.TryGetValue(faultId, out var current))
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };
            if (_activeFaults.TryUpdate(faultId, updated, current))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Removes every fault whose expiry lies before the current time.
    /// </summary>
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expiredIds = _activeFaults
            .Where(entry => entry.Value.ExpiresAt.HasValue && entry.Value.ExpiresAt < now)
            .Select(entry => entry.Key)
            .ToList();

        foreach (var id in expiredIds)
        {
            // Log only when this call removed the entry; another caller may have got there first.
            if (_activeFaults.TryRemove(id, out _))
            {
                _logger.LogDebug("Expired chaos fault {FaultId}", id);
            }
        }
    }
}
|