1123 lines
34 KiB
C#
1123 lines
34 KiB
C#
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using System.Collections.Concurrent;
|
|
|
|
namespace StellaOps.Notifier.Worker.Observability;
|
|
|
|
/// <summary>
/// Contract for the chaos testing service: it manages fault-injection experiments and tells
/// notification channels whether an operation should be disrupted, so that delivery
/// resilience can be exercised under simulated outages and failures.
/// </summary>
public interface IChaosTestRunner
{
    /// <summary>
    /// Starts a chaos experiment from the supplied configuration.
    /// </summary>
    /// <param name="config">Describes the targets, fault type and duration of the experiment.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The experiment that was created.</returns>
    Task<ChaosExperiment> StartExperimentAsync(ChaosExperimentConfig config, CancellationToken ct = default);

    /// <summary>
    /// Stops a running chaos experiment.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to stop.</param>
    /// <param name="ct">Cancellation token.</param>
    Task StopExperimentAsync(string experimentId, CancellationToken ct = default);

    /// <summary>
    /// Gets the current status of an experiment.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to look up.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The experiment; the nullable return type allows for an unknown identifier.</returns>
    Task<ChaosExperiment?> GetExperimentAsync(string experimentId, CancellationToken ct = default);

    /// <summary>
    /// Lists experiments, optionally restricted to a single status.
    /// </summary>
    /// <param name="status">Status to filter by, or <c>null</c> for every status.</param>
    /// <param name="limit">Maximum number of experiments to return.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The matching experiments.</returns>
    Task<IReadOnlyList<ChaosExperiment>> ListExperimentsAsync(ChaosExperimentStatus? status = null, int limit = 100, CancellationToken ct = default);

    /// <summary>
    /// Checks whether an operation on a channel should fail based on active chaos experiments.
    /// </summary>
    /// <param name="tenantId">Tenant performing the operation.</param>
    /// <param name="channelType">Type of the channel being used.</param>
    /// <param name="channelId">Specific channel being used, when known.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The decision describing the fault to inject, if any.</returns>
    Task<ChaosDecision> ShouldFailAsync(string tenantId, string channelType, string? channelId = null, CancellationToken ct = default);

    /// <summary>
    /// Records the outcome of a chaos-affected operation against an experiment.
    /// </summary>
    /// <param name="experimentId">Experiment the outcome belongs to.</param>
    /// <param name="outcome">What happened to the operation.</param>
    /// <param name="ct">Cancellation token.</param>
    Task RecordOutcomeAsync(string experimentId, ChaosOutcome outcome, CancellationToken ct = default);

    /// <summary>
    /// Gets the results and statistics of an experiment.
    /// </summary>
    /// <param name="experimentId">Experiment to summarise.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Aggregated results for the experiment.</returns>
    Task<ChaosExperimentResults> GetResultsAsync(string experimentId, CancellationToken ct = default);

    /// <summary>
    /// Cleans up completed experiments older than the specified age.
    /// </summary>
    /// <param name="olderThan">Minimum age an experiment must have to be removed.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The number of experiments removed.</returns>
    Task<int> CleanupAsync(TimeSpan olderThan, CancellationToken ct = default);
}
|
|
|
|
/// <summary>
/// Immutable description of a chaos experiment: what to target, which fault to inject,
/// and for how long.
/// </summary>
public sealed record ChaosExperimentConfig
{
    /// <summary>
    /// Human-readable experiment name.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Optional explanation of what the experiment is meant to test.
    /// </summary>
    public string? Description { get; init; }

    /// <summary>
    /// Tenant the experiment is restricted to; <c>null</c> targets all tenants.
    /// </summary>
    public string? TenantId { get; init; }

    /// <summary>
    /// Channel types the experiment affects.
    /// </summary>
    public IReadOnlyList<string> TargetChannelTypes { get; init; } = [];

    /// <summary>
    /// Channel IDs the experiment affects; empty means every channel of the targeted types.
    /// </summary>
    public IReadOnlyList<string> TargetChannelIds { get; init; } = [];

    /// <summary>
    /// Kind of fault to inject.
    /// </summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>
    /// Parameters that tune the injected fault.
    /// </summary>
    public ChaosFaultConfig FaultConfig { get; init; } = new ChaosFaultConfig();

    /// <summary>
    /// How long the experiment runs. Defaults to five minutes.
    /// </summary>
    public TimeSpan Duration { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>
    /// Upper bound on the number of operations to affect (0 = unlimited).
    /// </summary>
    public int MaxAffectedOperations { get; init; }

    /// <summary>
    /// Free-form key/value tags used to categorise experiments.
    /// </summary>
    public IReadOnlyDictionary<string, string> Tags { get; init; } = new Dictionary<string, string>();

    /// <summary>
    /// Identity of whoever initiated the experiment.
    /// </summary>
    public required string InitiatedBy { get; init; }
}
|
|
|
|
/// <summary>
/// Tunable parameters that shape how an injected fault behaves.
/// </summary>
public sealed record ChaosFaultConfig
{
    /// <summary>
    /// Probability (0.0 to 1.0) of failing an operation for partial/intermittent faults.
    /// Defaults to 1.0.
    /// </summary>
    public double FailureRate { get; init; } = 1.0;

    /// <summary>
    /// Lower bound of the injected latency. Defaults to one second.
    /// </summary>
    public TimeSpan MinLatency { get; init; } = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Upper bound of the injected latency. Defaults to five seconds.
    /// </summary>
    public TimeSpan MaxLatency { get; init; } = TimeSpan.FromSeconds(5);

    /// <summary>
    /// HTTP status code reported for error responses. Defaults to 500.
    /// </summary>
    public int ErrorStatusCode { get; init; } = 500;

    /// <summary>
    /// Optional error message to report with the fault.
    /// </summary>
    public string? ErrorMessage { get; init; }

    /// <summary>
    /// Allowed requests per minute for the RateLimit fault type. Defaults to 10.
    /// </summary>
    public int RateLimitPerMinute { get; init; } = 10;

    /// <summary>
    /// Timeout length for the Timeout fault type. Defaults to thirty seconds.
    /// </summary>
    public TimeSpan TimeoutDuration { get; init; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Optional random seed that makes an experiment reproducible.
    /// </summary>
    public int? Seed { get; init; }
}
|
|
|
|
/// <summary>
/// Lifecycle states of a chaos experiment.
/// </summary>
public enum ChaosExperimentStatus
{
    /// <summary>
    /// The experiment has been scheduled but has not started yet.
    /// </summary>
    Scheduled,

    /// <summary>
    /// The experiment is running right now.
    /// </summary>
    Running,

    /// <summary>
    /// The experiment finished normally.
    /// </summary>
    Completed,

    /// <summary>
    /// The experiment was stopped before it finished.
    /// </summary>
    Stopped,

    /// <summary>
    /// The experiment could not be run.
    /// </summary>
    Failed
}
|
|
|
|
/// <summary>
/// Snapshot of a chaos experiment, whether still active or already finished.
/// </summary>
public sealed record ChaosExperiment
{
    /// <summary>
    /// Unique identifier of the experiment.
    /// </summary>
    public required string Id { get; init; }

    /// <summary>
    /// The configuration the experiment was started with.
    /// </summary>
    public required ChaosExperimentConfig Config { get; init; }

    /// <summary>
    /// Lifecycle status at the time of the snapshot.
    /// </summary>
    public required ChaosExperimentStatus Status { get; init; }

    /// <summary>
    /// Creation time of the experiment.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Time the experiment started running, if it has started.
    /// </summary>
    public DateTimeOffset? StartedAt { get; init; }

    /// <summary>
    /// Time the experiment ended, if it has ended.
    /// </summary>
    public DateTimeOffset? EndedAt { get; init; }

    /// <summary>
    /// Time at which the experiment is scheduled to end.
    /// </summary>
    public DateTimeOffset? ScheduledEndAt { get; init; }

    /// <summary>
    /// Count of operations affected so far.
    /// </summary>
    public int AffectedOperations { get; init; }

    /// <summary>
    /// Error message, populated when the status is Failed.
    /// </summary>
    public string? ErrorMessage { get; init; }
}
|
|
|
|
/// <summary>
/// The chaos system's verdict on whether, and how, to inject a fault into an operation.
/// </summary>
public sealed record ChaosDecision
{
    /// <summary>
    /// <c>true</c> when a fault should be injected.
    /// </summary>
    public required bool ShouldFail { get; init; }

    /// <summary>
    /// Identifier of the experiment responsible for the fault, if any.
    /// </summary>
    public string? ExperimentId { get; init; }

    /// <summary>
    /// Kind of fault to inject.
    /// </summary>
    public ChaosFaultType? FaultType { get; init; }

    /// <summary>
    /// Fault parameters taken from the experiment.
    /// </summary>
    public ChaosFaultConfig? FaultConfig { get; init; }

    /// <summary>
    /// Delay to inject, when applicable.
    /// </summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>
    /// Error text to surface, when applicable.
    /// </summary>
    public string? InjectedError { get; init; }

    /// <summary>
    /// HTTP status code to surface, when applicable.
    /// </summary>
    public int? InjectedStatusCode { get; init; }

    /// <summary>
    /// Human-readable explanation of the decision.
    /// </summary>
    public string? Reason { get; init; }

    /// <summary>
    /// Builds the decision used when no fault is to be injected.
    /// </summary>
    /// <returns>A decision with <see cref="ShouldFail"/> set to <c>false</c>.</returns>
    public static ChaosDecision NoFault()
    {
        return new ChaosDecision
        {
            ShouldFail = false,
            Reason = "No active chaos experiment"
        };
    }
}
|
|
|
|
/// <summary>
/// What happened to a single operation that was subject to a chaos experiment.
/// </summary>
public sealed record ChaosOutcome
{
    /// <summary>
    /// Kind of outcome observed.
    /// </summary>
    public required ChaosOutcomeType Type { get; init; }

    /// <summary>
    /// Type of the channel that was affected.
    /// </summary>
    public required string ChannelType { get; init; }

    /// <summary>
    /// Identifier of the channel that was affected, if known.
    /// </summary>
    public string? ChannelId { get; init; }

    /// <summary>
    /// Identifier of the tenant that was affected, if known.
    /// </summary>
    public string? TenantId { get; init; }

    /// <summary>
    /// How long the operation took.
    /// </summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>
    /// <c>true</c> when the operation triggered a fallback.
    /// </summary>
    public bool FallbackTriggered { get; init; }

    /// <summary>
    /// <c>true</c> when the operation triggered a retry.
    /// </summary>
    public bool RetryTriggered { get; init; }

    /// <summary>
    /// Error message, when the operation failed.
    /// </summary>
    public string? ErrorMessage { get; init; }

    /// <summary>
    /// Time the outcome was recorded. Defaults to the current UTC time at construction.
    /// </summary>
    public DateTimeOffset Timestamp { get; init; } = DateTimeOffset.UtcNow;
}
|
|
|
|
/// <summary>
/// Kinds of outcome an operation can have under a chaos experiment.
/// </summary>
public enum ChaosOutcomeType
{
    /// <summary>
    /// A fault was injected and the operation failed.
    /// </summary>
    FaultInjected,

    /// <summary>
    /// A fault was injected but the operation recovered.
    /// </summary>
    RecoveredFromFault,

    /// <summary>
    /// The operation was delayed by injected latency.
    /// </summary>
    LatencyInjected,

    /// <summary>
    /// The operation was rate limited.
    /// </summary>
    RateLimited,

    /// <summary>
    /// The operation was let through because of experiment limits.
    /// </summary>
    Bypassed
}
|
|
|
|
/// <summary>
/// Aggregated results and statistics for one chaos experiment.
/// </summary>
public sealed record ChaosExperimentResults
{
    /// <summary>
    /// Identifier of the experiment these results belong to.
    /// </summary>
    public required string ExperimentId { get; init; }

    /// <summary>
    /// Total number of affected operations.
    /// </summary>
    public required int TotalAffected { get; init; }

    /// <summary>
    /// Number of operations that failed because a fault was injected.
    /// </summary>
    public required int FailedOperations { get; init; }

    /// <summary>
    /// Number of operations that recovered from an injected fault.
    /// </summary>
    public required int RecoveredOperations { get; init; }

    /// <summary>
    /// Number of operations that triggered a fallback.
    /// </summary>
    public required int FallbackTriggered { get; init; }

    /// <summary>
    /// Number of operations that triggered a retry.
    /// </summary>
    public required int RetryTriggered { get; init; }

    /// <summary>
    /// Mean injected latency, when any latency was recorded.
    /// </summary>
    public TimeSpan? AverageInjectedLatency { get; init; }

    /// <summary>
    /// Statistics broken down by channel type.
    /// </summary>
    public IReadOnlyDictionary<string, ChaosChannelStats> ByChannelType { get; init; } = new Dictionary<string, ChaosChannelStats>();

    /// <summary>
    /// The individual outcomes recorded for the experiment.
    /// </summary>
    public IReadOnlyList<ChaosOutcome> Outcomes { get; init; } = [];
}
|
|
|
|
/// <summary>
/// Per-channel-type statistics for a chaos experiment.
/// </summary>
public sealed record ChaosChannelStats
{
    /// <summary>
    /// The channel type these figures describe.
    /// </summary>
    public required string ChannelType { get; init; }

    /// <summary>
    /// Number of affected operations.
    /// </summary>
    public required int TotalAffected { get; init; }

    /// <summary>
    /// Number of failed operations.
    /// </summary>
    public required int Failed { get; init; }

    /// <summary>
    /// Number of recovered operations.
    /// </summary>
    public required int Recovered { get; init; }

    /// <summary>
    /// Number of operations that triggered a fallback.
    /// </summary>
    public required int Fallbacks { get; init; }
}
|
|
|
|
/// <summary>
/// Configurable options that govern chaos testing.
/// </summary>
public sealed class ChaosTestOptions
{
    /// <summary>
    /// Name of the configuration section these options are associated with.
    /// </summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>
    /// Master switch for chaos testing. Off unless explicitly set.
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Maximum number of experiments that may run at the same time. Defaults to 5.
    /// </summary>
    public int MaxConcurrentExperiments { get; set; } = 5;

    /// <summary>
    /// Longest duration any single experiment may have. Defaults to one hour.
    /// </summary>
    public TimeSpan MaxExperimentDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// How long completed experiments are retained. Defaults to seven days.
    /// </summary>
    public TimeSpan ExperimentRetention { get; set; } = TimeSpan.FromDays(7);

    /// <summary>
    /// Whether an experiment must name an explicit tenant. Defaults to <c>true</c>.
    /// </summary>
    public bool RequireTenantTarget { get; set; } = true;

    /// <summary>
    /// Initiators permitted to run experiments (empty = all allowed).
    /// </summary>
    public IReadOnlyList<string> AllowedInitiators { get; set; } = [];
}
|
|
|
|
/// <summary>
/// In-memory implementation of <see cref="IChaosTestRunner"/>. Experiments are kept in a
/// process-local dictionary; nothing is persisted.
/// </summary>
/// <remarks>
/// Each experiment's mutable state (status snapshot, affected-operation counter, random
/// generator and outcome list) is guarded by a per-experiment lock so the runner can be
/// called concurrently without losing updates or corrupting the outcome list.
/// </remarks>
public sealed class InMemoryChaosTestRunner : IChaosTestRunner
{
    private readonly ConcurrentDictionary<string, ChaosExperimentState> _experiments = new();
    private readonly ChaosTestOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<InMemoryChaosTestRunner> _logger;

    // Serializes "count running experiments, then register a new one" so two concurrent
    // starts cannot both slip under MaxConcurrentExperiments.
    private readonly object _startGate = new();

    /// <summary>
    /// Creates the runner.
    /// </summary>
    /// <param name="options">Chaos options; default options are used when null.</param>
    /// <param name="timeProvider">Clock to use; the system clock is used when null.</param>
    /// <param name="logger">Logger for experiment lifecycle events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public InMemoryChaosTestRunner(
        IOptions<ChaosTestOptions> options,
        TimeProvider timeProvider,
        ILogger<InMemoryChaosTestRunner> logger)
    {
        _options = options?.Value ?? new ChaosTestOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Validates the configuration against the options and registers a new running experiment.
    /// </summary>
    /// <param name="config">Experiment configuration.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    /// <returns>The newly started experiment.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is null.</exception>
    /// <exception cref="InvalidOperationException">
    /// Chaos testing is disabled, tenant targeting is required but missing, the duration exceeds
    /// the configured maximum, or the concurrent experiment limit has been reached.
    /// </exception>
    /// <exception cref="UnauthorizedAccessException">The initiator is not in the allow-list.</exception>
    public Task<ChaosExperiment> StartExperimentAsync(ChaosExperimentConfig config, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(config);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos testing is not enabled");
        }

        // Validate initiator
        if (_options.AllowedInitiators.Count > 0 && !_options.AllowedInitiators.Contains(config.InitiatedBy))
        {
            throw new UnauthorizedAccessException($"Initiator '{config.InitiatedBy}' is not allowed to run chaos experiments");
        }

        // Validate tenant targeting
        if (_options.RequireTenantTarget && string.IsNullOrEmpty(config.TenantId))
        {
            throw new InvalidOperationException("Tenant targeting is required for chaos experiments");
        }

        // Validate duration
        if (config.Duration > _options.MaxExperimentDuration)
        {
            throw new InvalidOperationException($"Experiment duration exceeds maximum of {_options.MaxExperimentDuration}");
        }

        ChaosExperiment experiment;

        lock (_startGate)
        {
            // Refresh each experiment before counting: one that has already expired but was
            // never queried is still flagged Running and must not consume a concurrency slot.
            var runningCount = 0;
            foreach (var existing in _experiments.Values)
            {
                if (CheckAndUpdateExperimentStatus(existing).Status == ChaosExperimentStatus.Running)
                {
                    runningCount++;
                }
            }

            if (runningCount >= _options.MaxConcurrentExperiments)
            {
                throw new InvalidOperationException($"Maximum concurrent experiments ({_options.MaxConcurrentExperiments}) reached");
            }

            var now = _timeProvider.GetUtcNow();

            experiment = new ChaosExperiment
            {
                Id = $"chaos-{Guid.NewGuid():N}",
                Config = config,
                Status = ChaosExperimentStatus.Running,
                CreatedAt = now,
                StartedAt = now,
                ScheduledEndAt = now.Add(config.Duration),
                AffectedOperations = 0
            };

            _experiments[experiment.Id] = new ChaosExperimentState
            {
                Experiment = experiment,
                // A configured seed makes the random fault pattern reproducible.
                Random = config.FaultConfig.Seed.HasValue
                    ? new Random(config.FaultConfig.Seed.Value)
                    : new Random(),
                Outcomes = [],
                RateLimitBucket = new RateLimitBucket(_timeProvider)
            };
        }

        _logger.LogInformation(
            "Started chaos experiment {ExperimentId}: {Name} ({FaultType}) targeting {ChannelTypes}",
            experiment.Id,
            config.Name,
            config.FaultType,
            string.Join(", ", config.TargetChannelTypes));

        return Task.FromResult(experiment);
    }

    /// <summary>
    /// Marks a running experiment as stopped. Unknown or already finished experiments are ignored.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to stop.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    public Task StopExperimentAsync(string experimentId, CancellationToken ct = default)
    {
        if (!_experiments.TryGetValue(experimentId, out var state))
        {
            return Task.CompletedTask;
        }

        ChaosExperiment? stopped = null;

        lock (state.Gate)
        {
            if (state.Experiment.Status == ChaosExperimentStatus.Running)
            {
                stopped = state.Experiment with
                {
                    Status = ChaosExperimentStatus.Stopped,
                    EndedAt = _timeProvider.GetUtcNow()
                };
                state.Experiment = stopped;
            }
        }

        if (stopped is not null)
        {
            _logger.LogInformation(
                "Stopped chaos experiment {ExperimentId} after {AffectedOps} affected operations",
                experimentId,
                stopped.AffectedOperations);
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Returns the experiment after refreshing its status, or null when the id is unknown.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to look up.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    public Task<ChaosExperiment?> GetExperimentAsync(string experimentId, CancellationToken ct = default)
    {
        return _experiments.TryGetValue(experimentId, out var state)
            ? Task.FromResult<ChaosExperiment?>(CheckAndUpdateExperimentStatus(state))
            : Task.FromResult<ChaosExperiment?>(null);
    }

    /// <summary>
    /// Lists experiments, newest first, after refreshing their statuses.
    /// </summary>
    /// <param name="status">Status to filter by, or null for every status.</param>
    /// <param name="limit">Maximum number of experiments to return.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    public Task<IReadOnlyList<ChaosExperiment>> ListExperimentsAsync(
        ChaosExperimentStatus? status = null,
        int limit = 100,
        CancellationToken ct = default)
    {
        // Materialize refreshed snapshots once so filtering and ordering see a stable view.
        IEnumerable<ChaosExperiment> query = _experiments.Values
            .Select(s => CheckAndUpdateExperimentStatus(s))
            .ToList();

        if (status.HasValue)
        {
            query = query.Where(e => e.Status == status.Value);
        }

        var result = query
            .OrderByDescending(e => e.CreatedAt)
            .Take(limit)
            .ToList();

        return Task.FromResult<IReadOnlyList<ChaosExperiment>>(result);
    }

    /// <summary>
    /// Evaluates the running experiments against the operation and returns the first decision
    /// that injects a failure or latency; otherwise returns a "no fault" decision.
    /// </summary>
    /// <param name="tenantId">Tenant performing the operation.</param>
    /// <param name="channelType">Type of the channel being used.</param>
    /// <param name="channelId">Specific channel being used, when known.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    public Task<ChaosDecision> ShouldFailAsync(
        string tenantId,
        string channelType,
        string? channelId = null,
        CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(ChaosDecision.NoFault());
        }

        foreach (var state in _experiments.Values)
        {
            // Refresh, evaluate and count under one lock so the affected-operation counter
            // cannot lose updates, overshoot MaxAffectedOperations or undo a concurrent stop.
            lock (state.Gate)
            {
                // The refresh also completes experiments that expired or reached their
                // MaxAffectedOperations cap, so no separate cap check is needed here.
                var experiment = CheckAndUpdateExperimentStatus(state);

                if (experiment.Status != ChaosExperimentStatus.Running)
                {
                    continue;
                }

                var config = experiment.Config;

                if (!IsTargeted(config, tenantId, channelType, channelId))
                {
                    continue;
                }

                // Determine if fault should be injected based on fault type
                var decision = EvaluateFault(state, config);
                if (decision.ShouldFail || decision.InjectedLatency.HasValue)
                {
                    state.Experiment = experiment with
                    {
                        AffectedOperations = experiment.AffectedOperations + 1
                    };

                    return Task.FromResult(decision);
                }
            }
        }

        return Task.FromResult(ChaosDecision.NoFault());
    }

    /// <summary>
    /// Tells whether an operation falls inside an experiment's tenant/channel targeting.
    /// </summary>
    private static bool IsTargeted(ChaosExperimentConfig config, string tenantId, string channelType, string? channelId)
    {
        // An empty tenant on the config means "all tenants".
        if (!string.IsNullOrEmpty(config.TenantId) && config.TenantId != tenantId)
        {
            return false;
        }

        // An empty type list means "all channel types".
        if (config.TargetChannelTypes.Count > 0 && !config.TargetChannelTypes.Contains(channelType))
        {
            return false;
        }

        // The channel ID filter only applies when the caller supplies a channel ID.
        if (config.TargetChannelIds.Count > 0 && !string.IsNullOrEmpty(channelId) && !config.TargetChannelIds.Contains(channelId))
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Builds the decision for the experiment's fault type. Must be called while holding the
    /// experiment's lock because some fault types draw from the experiment's random generator.
    /// </summary>
    private ChaosDecision EvaluateFault(ChaosExperimentState state, ChaosExperimentConfig config)
    {
        var faultConfig = config.FaultConfig;

        return config.FaultType switch
        {
            ChaosFaultType.Outage => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.Outage,
                FaultConfig = faultConfig,
                InjectedError = faultConfig.ErrorMessage ?? "Chaos: Simulated outage",
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                Reason = "Complete outage simulation"
            },

            ChaosFaultType.PartialFailure => EvaluatePartialFailure(state, faultConfig),

            ChaosFaultType.Latency => EvaluateLatency(state, faultConfig),

            ChaosFaultType.Intermittent => EvaluateIntermittent(state, faultConfig),

            ChaosFaultType.RateLimit => EvaluateRateLimit(state, faultConfig),

            ChaosFaultType.Timeout => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.Timeout,
                FaultConfig = faultConfig,
                InjectedLatency = faultConfig.TimeoutDuration,
                InjectedError = "Chaos: Request timeout",
                Reason = "Timeout simulation"
            },

            ChaosFaultType.ErrorResponse => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.ErrorResponse,
                FaultConfig = faultConfig,
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                InjectedError = faultConfig.ErrorMessage ?? $"Chaos: HTTP {faultConfig.ErrorStatusCode}",
                Reason = "Error response simulation"
            },

            ChaosFaultType.CorruptResponse => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.CorruptResponse,
                FaultConfig = faultConfig,
                InjectedError = "Chaos: Corrupted response data",
                Reason = "Corrupt response simulation"
            },

            _ => ChaosDecision.NoFault()
        };
    }

    /// <summary>
    /// Fails the operation with probability <see cref="ChaosFaultConfig.FailureRate"/>.
    /// </summary>
    private ChaosDecision EvaluatePartialFailure(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        if (state.Random.NextDouble() < faultConfig.FailureRate)
        {
            return new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.PartialFailure,
                FaultConfig = faultConfig,
                InjectedError = faultConfig.ErrorMessage ?? "Chaos: Partial failure",
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                Reason = $"Partial failure ({faultConfig.FailureRate:P0} rate)"
            };
        }

        return ChaosDecision.NoFault();
    }

    /// <summary>
    /// Picks a random delay between the configured minimum and maximum latency.
    /// </summary>
    private ChaosDecision EvaluateLatency(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        // Guard against misconfiguration: a negative or inverted range would otherwise yield
        // a negative delay, which Task.Delay rejects (or treats as infinite at exactly -1 ms).
        var minLatency = faultConfig.MinLatency < TimeSpan.Zero ? TimeSpan.Zero : faultConfig.MinLatency;
        var maxLatency = faultConfig.MaxLatency < minLatency ? minLatency : faultConfig.MaxLatency;

        var latencyRange = maxLatency - minLatency;
        var randomLatency = minLatency + TimeSpan.FromMilliseconds(
            state.Random.NextDouble() * latencyRange.TotalMilliseconds);

        return new ChaosDecision
        {
            ShouldFail = false, // Latency doesn't cause failure
            ExperimentId = state.Experiment.Id,
            FaultType = ChaosFaultType.Latency,
            FaultConfig = faultConfig,
            InjectedLatency = randomLatency,
            Reason = $"Latency injection ({randomLatency.TotalMilliseconds:F0}ms)"
        };
    }

    /// <summary>
    /// Fails the operation only when both the failure-rate draw and an extra one-in-three draw hit.
    /// </summary>
    private ChaosDecision EvaluateIntermittent(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        // More random pattern than partial failure. The second draw is only made when the
        // first one succeeds (short-circuit), which matters for seeded reproducibility.
        var shouldFail = state.Random.NextDouble() < faultConfig.FailureRate &&
                         state.Random.Next(3) == 0; // Additional randomness

        if (shouldFail)
        {
            return new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.Intermittent,
                FaultConfig = faultConfig,
                InjectedError = faultConfig.ErrorMessage ?? "Chaos: Intermittent failure",
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                Reason = "Intermittent failure simulation"
            };
        }

        return ChaosDecision.NoFault();
    }

    /// <summary>
    /// Lets operations through until the per-minute budget is used up, then fails with 429.
    /// </summary>
    private ChaosDecision EvaluateRateLimit(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        if (state.RateLimitBucket.TryConsume(faultConfig.RateLimitPerMinute))
        {
            return ChaosDecision.NoFault();
        }

        return new ChaosDecision
        {
            ShouldFail = true,
            ExperimentId = state.Experiment.Id,
            FaultType = ChaosFaultType.RateLimit,
            FaultConfig = faultConfig,
            InjectedStatusCode = 429,
            InjectedError = "Chaos: Rate limit exceeded",
            Reason = $"Rate limited ({faultConfig.RateLimitPerMinute}/min)"
        };
    }

    /// <summary>
    /// Appends an outcome to the experiment's history. Unknown experiments are ignored.
    /// </summary>
    /// <param name="experimentId">Experiment the outcome belongs to.</param>
    /// <param name="outcome">Outcome to record.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    /// <exception cref="ArgumentNullException"><paramref name="outcome"/> is null.</exception>
    public Task RecordOutcomeAsync(string experimentId, ChaosOutcome outcome, CancellationToken ct = default)
    {
        // A null entry would only surface later as a NullReferenceException in GetResultsAsync.
        ArgumentNullException.ThrowIfNull(outcome);

        if (_experiments.TryGetValue(experimentId, out var state))
        {
            lock (state.Gate)
            {
                state.Outcomes.Add(outcome);
            }
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Aggregates the recorded outcomes of an experiment. An unknown experiment yields
    /// all-zero results.
    /// </summary>
    /// <param name="experimentId">Experiment to summarise.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    public Task<ChaosExperimentResults> GetResultsAsync(string experimentId, CancellationToken ct = default)
    {
        if (!_experiments.TryGetValue(experimentId, out var state))
        {
            return Task.FromResult(new ChaosExperimentResults
            {
                ExperimentId = experimentId,
                TotalAffected = 0,
                FailedOperations = 0,
                RecoveredOperations = 0,
                FallbackTriggered = 0,
                RetryTriggered = 0
            });
        }

        // Copy under the lock: List<T> must not be enumerated while another thread appends.
        List<ChaosOutcome> outcomes;
        lock (state.Gate)
        {
            outcomes = state.Outcomes.ToList();
        }

        var byChannel = outcomes
            .GroupBy(o => o.ChannelType)
            .ToDictionary(
                g => g.Key,
                g => new ChaosChannelStats
                {
                    ChannelType = g.Key,
                    TotalAffected = g.Count(),
                    Failed = g.Count(o => o.Type == ChaosOutcomeType.FaultInjected),
                    Recovered = g.Count(o => o.Type == ChaosOutcomeType.RecoveredFromFault),
                    Fallbacks = g.Count(o => o.FallbackTriggered)
                });

        // Only latency-injection outcomes that carry a duration feed the average.
        var latencies = outcomes
            .Where(o => o.Duration.HasValue && o.Type == ChaosOutcomeType.LatencyInjected)
            .Select(o => o.Duration!.Value)
            .ToList();

        return Task.FromResult(new ChaosExperimentResults
        {
            ExperimentId = experimentId,
            TotalAffected = outcomes.Count,
            FailedOperations = outcomes.Count(o => o.Type == ChaosOutcomeType.FaultInjected),
            RecoveredOperations = outcomes.Count(o => o.Type == ChaosOutcomeType.RecoveredFromFault),
            FallbackTriggered = outcomes.Count(o => o.FallbackTriggered),
            RetryTriggered = outcomes.Count(o => o.RetryTriggered),
            AverageInjectedLatency = latencies.Count > 0
                ? TimeSpan.FromMilliseconds(latencies.Average(l => l.TotalMilliseconds))
                : null,
            ByChannelType = byChannel,
            Outcomes = outcomes
        });
    }

    /// <summary>
    /// Removes completed, stopped or failed experiments that ended before the cutoff.
    /// </summary>
    /// <param name="olderThan">Minimum time since the experiment ended.</param>
    /// <param name="ct">Unused; the operation completes synchronously.</param>
    /// <returns>The number of experiments removed.</returns>
    public Task<int> CleanupAsync(TimeSpan olderThan, CancellationToken ct = default)
    {
        var cutoff = _timeProvider.GetUtcNow() - olderThan;
        var removed = 0;

        foreach (var (id, state) in _experiments)
        {
            // Refresh first so experiments that expired without ever being queried are seen
            // as Completed and become eligible for removal.
            var experiment = CheckAndUpdateExperimentStatus(state);

            var finished = experiment.Status is ChaosExperimentStatus.Completed
                or ChaosExperimentStatus.Stopped
                or ChaosExperimentStatus.Failed;

            if (finished &&
                experiment.EndedAt is { } endedAt &&
                endedAt < cutoff &&
                _experiments.TryRemove(id, out _))
            {
                removed++;
            }
        }

        if (removed > 0)
        {
            _logger.LogInformation("Cleaned up {Count} completed chaos experiments", removed);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Transitions a running experiment to Completed when it has passed its scheduled end or
    /// reached its operation cap, and returns the resulting snapshot.
    /// </summary>
    private ChaosExperiment CheckAndUpdateExperimentStatus(ChaosExperimentState state)
    {
        lock (state.Gate)
        {
            var experiment = state.Experiment;
            if (experiment.Status != ChaosExperimentStatus.Running)
            {
                return experiment;
            }

            var now = _timeProvider.GetUtcNow();

            // Check if expired
            if (experiment.ScheduledEndAt is { } scheduledEnd && now >= scheduledEnd)
            {
                state.Experiment = experiment with
                {
                    Status = ChaosExperimentStatus.Completed,
                    EndedAt = scheduledEnd
                };

                _logger.LogInformation(
                    "Chaos experiment {ExperimentId} completed after {AffectedOps} affected operations",
                    experiment.Id,
                    experiment.AffectedOperations);

                // Return here so the cap check below cannot complete the experiment a second
                // time, overwriting EndedAt and logging a duplicate completion.
                return state.Experiment;
            }

            // Check if max operations reached
            var maxOperations = experiment.Config.MaxAffectedOperations;
            if (maxOperations > 0 && experiment.AffectedOperations >= maxOperations)
            {
                state.Experiment = experiment with
                {
                    Status = ChaosExperimentStatus.Completed,
                    EndedAt = now
                };

                _logger.LogInformation(
                    "Chaos experiment {ExperimentId} completed after reaching max operations ({Max})",
                    experiment.Id,
                    maxOperations);
            }

            return state.Experiment;
        }
    }

    /// <summary>
    /// Mutable per-experiment bookkeeping. All members are accessed under <see cref="Gate"/>.
    /// </summary>
    private sealed class ChaosExperimentState
    {
        /// <summary>Lock guarding every other member of this instance.</summary>
        public object Gate { get; } = new();

        /// <summary>Latest immutable snapshot of the experiment.</summary>
        public required ChaosExperiment Experiment { get; set; }

        /// <summary>Random source for probabilistic faults; System.Random is not thread-safe.</summary>
        public required Random Random { get; init; }

        /// <summary>Outcomes recorded for the experiment, in arrival order.</summary>
        public required List<ChaosOutcome> Outcomes { get; init; }

        /// <summary>Per-minute budget used by the RateLimit fault type.</summary>
        public required RateLimitBucket RateLimitBucket { get; init; }
    }

    /// <summary>
    /// Fixed-window counter: allows up to a given number of calls per one-minute window.
    /// </summary>
    private sealed class RateLimitBucket
    {
        private readonly TimeProvider _timeProvider;
        private DateTimeOffset _windowStart;
        private int _count;
        private readonly object _lock = new();

        public RateLimitBucket(TimeProvider timeProvider)
        {
            _timeProvider = timeProvider;
            _windowStart = timeProvider.GetUtcNow();
            _count = 0;
        }

        /// <summary>
        /// Consumes one slot from the current window if fewer than <paramref name="limit"/>
        /// have been used.
        /// </summary>
        /// <returns><c>true</c> when a slot was consumed; <c>false</c> when the window is full.</returns>
        public bool TryConsume(int limit)
        {
            lock (_lock)
            {
                var now = _timeProvider.GetUtcNow();

                // Start a fresh window once a full minute has elapsed.
                if ((now - _windowStart).TotalMinutes >= 1)
                {
                    _windowStart = now;
                    _count = 0;
                }

                if (_count < limit)
                {
                    _count++;
                    return true;
                }

                return false;
            }
        }
    }
}
|
|
|
|
/// <summary>
/// Helpers for applying chaos decisions and building common experiment configurations.
/// </summary>
public static class ChaosTestExtensions
{
    /// <summary>
    /// Carries out a chaos decision: waits for any injected latency, then throws when the
    /// decision calls for a failure.
    /// </summary>
    /// <param name="decision">The decision to apply.</param>
    /// <param name="ct">Token that can cancel the injected delay.</param>
    /// <exception cref="ChaosInjectedException">The decision has ShouldFail set.</exception>
    public static async Task ApplyChaosAsync(this ChaosDecision decision, CancellationToken ct = default)
    {
        var mustFail = decision.ShouldFail;

        // The delay applies both to latency-only decisions and to failures that carry latency.
        if (decision.InjectedLatency is { } delay)
        {
            await Task.Delay(delay, ct);
        }

        if (mustFail)
        {
            throw new ChaosInjectedException(decision);
        }
    }

    /// <summary>
    /// Builds the configuration for a simple outage experiment.
    /// </summary>
    /// <param name="name">Experiment name.</param>
    /// <param name="initiatedBy">Who is starting the experiment.</param>
    /// <param name="channelTypes">Channel types to target.</param>
    /// <param name="tenantId">Tenant to target, or null.</param>
    /// <param name="duration">Experiment duration; five minutes when omitted.</param>
    /// <returns>A configuration with the Outage fault type.</returns>
    public static ChaosExperimentConfig CreateOutageExperiment(
        string name,
        string initiatedBy,
        IReadOnlyList<string> channelTypes,
        string? tenantId = null,
        TimeSpan? duration = null) =>
        new()
        {
            Name = name,
            InitiatedBy = initiatedBy,
            TenantId = tenantId,
            TargetChannelTypes = channelTypes,
            FaultType = ChaosFaultType.Outage,
            Duration = duration ?? TimeSpan.FromMinutes(5)
        };

    /// <summary>
    /// Builds the configuration for a latency injection experiment.
    /// </summary>
    /// <param name="name">Experiment name.</param>
    /// <param name="initiatedBy">Who is starting the experiment.</param>
    /// <param name="channelTypes">Channel types to target.</param>
    /// <param name="minLatency">Lower bound of the injected latency.</param>
    /// <param name="maxLatency">Upper bound of the injected latency.</param>
    /// <param name="tenantId">Tenant to target, or null.</param>
    /// <param name="duration">Experiment duration; five minutes when omitted.</param>
    /// <returns>A configuration with the Latency fault type.</returns>
    public static ChaosExperimentConfig CreateLatencyExperiment(
        string name,
        string initiatedBy,
        IReadOnlyList<string> channelTypes,
        TimeSpan minLatency,
        TimeSpan maxLatency,
        string? tenantId = null,
        TimeSpan? duration = null)
    {
        var latencyBounds = new ChaosFaultConfig
        {
            MinLatency = minLatency,
            MaxLatency = maxLatency
        };

        return new ChaosExperimentConfig
        {
            Name = name,
            InitiatedBy = initiatedBy,
            TenantId = tenantId,
            TargetChannelTypes = channelTypes,
            FaultType = ChaosFaultType.Latency,
            FaultConfig = latencyBounds,
            Duration = duration ?? TimeSpan.FromMinutes(5)
        };
    }
}
|
|
|
|
/// <summary>
/// Exception raised when a chaos decision injects a failure. Its message is the decision's
/// injected error text, or "Chaos fault injected" when none is set.
/// </summary>
/// <param name="decision">The decision that caused the failure.</param>
public sealed class ChaosInjectedException(ChaosDecision decision)
    : Exception(decision.InjectedError ?? "Chaos fault injected")
{
    /// <summary>
    /// The decision that caused this exception.
    /// </summary>
    public ChaosDecision Decision { get; } = decision;
}
|