Files
git.stella-ops.org/src/Notifier/StellaOps.Notifier/StellaOps.Notifier.Worker/Observability/IChaosTestRunner.cs
2026-02-01 21:37:40 +02:00

1123 lines
34 KiB
C#

using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using System.Collections.Concurrent;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Contract for the chaos-testing service used to simulate channel outages and failures.
/// It manages fault-injection experiments and answers, per operation, whether a fault
/// should be injected so that the resilience of notification delivery can be exercised.
/// </summary>
public interface IChaosTestRunner
{
    /// <summary>
    /// Creates and starts a chaos experiment from the supplied configuration.
    /// </summary>
    /// <param name="config">Targets, fault type and limits of the experiment.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The experiment that was started.</returns>
    Task<ChaosExperiment> StartExperimentAsync(ChaosExperimentConfig config, CancellationToken ct = default);

    /// <summary>
    /// Stops an experiment that is currently running.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to stop.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    Task StopExperimentAsync(string experimentId, CancellationToken ct = default);

    /// <summary>
    /// Looks up an experiment and returns its current state.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to look up.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The experiment, or <c>null</c> when no experiment has that identifier.</returns>
    Task<ChaosExperiment?> GetExperimentAsync(string experimentId, CancellationToken ct = default);

    /// <summary>
    /// Lists experiments, optionally restricted to a single status.
    /// </summary>
    /// <param name="status">Status to filter by; <c>null</c> returns experiments in any status.</param>
    /// <param name="limit">Maximum number of experiments to return.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The matching experiments.</returns>
    Task<IReadOnlyList<ChaosExperiment>> ListExperimentsAsync(
        ChaosExperimentStatus? status = null,
        int limit = 100,
        CancellationToken ct = default);

    /// <summary>
    /// Decides, based on the active chaos experiments, whether an operation on a channel should fail.
    /// </summary>
    /// <param name="tenantId">Tenant performing the operation.</param>
    /// <param name="channelType">Type of the channel being used.</param>
    /// <param name="channelId">Specific channel being used, when known.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The decision describing the fault to inject, if any.</returns>
    Task<ChaosDecision> ShouldFailAsync(string tenantId, string channelType, string? channelId = null, CancellationToken ct = default);

    /// <summary>
    /// Records what happened to an operation that was affected by an experiment.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment that affected the operation.</param>
    /// <param name="outcome">The observed outcome.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    Task RecordOutcomeAsync(string experimentId, ChaosOutcome outcome, CancellationToken ct = default);

    /// <summary>
    /// Produces the results and statistics gathered for an experiment.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The aggregated results.</returns>
    Task<ChaosExperimentResults> GetResultsAsync(string experimentId, CancellationToken ct = default);

    /// <summary>
    /// Removes finished experiments that are older than the given age.
    /// </summary>
    /// <param name="olderThan">Minimum age of an experiment for it to be removed.</param>
    /// <param name="ct">Token used to cancel the operation.</param>
    /// <returns>The number of experiments removed.</returns>
    Task<int> CleanupAsync(TimeSpan olderThan, CancellationToken ct = default);
}
/// <summary>
/// Immutable description of a chaos experiment: what to target, which fault to inject and for how long.
/// </summary>
public sealed record ChaosExperimentConfig
{
    /// <summary>
    /// Display name of the experiment.
    /// </summary>
    public required string Name { get; init; }

    /// <summary>
    /// Optional free-text explanation of what the experiment is meant to test.
    /// </summary>
    public string? Description { get; init; }

    /// <summary>
    /// Tenant the experiment is restricted to; <c>null</c> targets every tenant.
    /// </summary>
    public string? TenantId { get; init; }

    /// <summary>
    /// Channel types the experiment affects.
    /// </summary>
    public IReadOnlyList<string> TargetChannelTypes { get; init; } = [];

    /// <summary>
    /// Channel IDs the experiment affects; an empty list means every channel of the targeted types.
    /// </summary>
    public IReadOnlyList<string> TargetChannelIds { get; init; } = [];

    /// <summary>
    /// Kind of fault that is injected.
    /// </summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>
    /// Parameters that tune the injected fault.
    /// </summary>
    public ChaosFaultConfig FaultConfig { get; init; } = new();

    /// <summary>
    /// How long the experiment runs; defaults to five minutes.
    /// </summary>
    public TimeSpan Duration { get; init; } = TimeSpan.FromMinutes(5);

    /// <summary>
    /// Upper bound on the number of operations the experiment may affect; 0 means unlimited.
    /// </summary>
    public int MaxAffectedOperations { get; init; }

    /// <summary>
    /// Key/value tags used to categorize the experiment.
    /// </summary>
    public IReadOnlyDictionary<string, string> Tags { get; init; } = new Dictionary<string, string>();

    /// <summary>
    /// Identity of whoever initiated the experiment.
    /// </summary>
    public required string InitiatedBy { get; init; }
}
/// <summary>
/// Tunable parameters that control how an injected fault behaves.
/// </summary>
public sealed record ChaosFaultConfig
{
    /// <summary>
    /// Probability (0.0 to 1.0) of failing an operation for partial/intermittent failures; defaults to 1.0.
    /// </summary>
    public double FailureRate { get; init; } = 1.0;

    /// <summary>
    /// Lower bound of the injected latency; defaults to one second.
    /// </summary>
    public TimeSpan MinLatency { get; init; } = TimeSpan.FromSeconds(1);

    /// <summary>
    /// Upper bound of the injected latency; defaults to five seconds.
    /// </summary>
    public TimeSpan MaxLatency { get; init; } = TimeSpan.FromSeconds(5);

    /// <summary>
    /// HTTP status code reported for error responses; defaults to 500.
    /// </summary>
    public int ErrorStatusCode { get; init; } = 500;

    /// <summary>
    /// Optional error message to attach to the injected fault.
    /// </summary>
    public string? ErrorMessage { get; init; }

    /// <summary>
    /// Requests allowed per minute for the RateLimit fault type; defaults to 10.
    /// </summary>
    public int RateLimitPerMinute { get; init; } = 10;

    /// <summary>
    /// Timeout applied by the Timeout fault type; defaults to thirty seconds.
    /// </summary>
    public TimeSpan TimeoutDuration { get; init; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Optional random seed that makes an experiment reproducible.
    /// </summary>
    public int? Seed { get; init; }
}
/// <summary>
/// Lifecycle states of a chaos experiment.
/// </summary>
public enum ChaosExperimentStatus
{
    /// <summary>
    /// The experiment has been scheduled but has not started yet.
    /// </summary>
    Scheduled,

    /// <summary>
    /// The experiment is running right now.
    /// </summary>
    Running,

    /// <summary>
    /// The experiment ran to its normal end.
    /// </summary>
    Completed,

    /// <summary>
    /// The experiment was stopped before its normal end.
    /// </summary>
    Stopped,

    /// <summary>
    /// The experiment could not be run.
    /// </summary>
    Failed
}
/// <summary>
/// Snapshot of a chaos experiment, whether still active or already finished.
/// </summary>
public sealed record ChaosExperiment
{
    /// <summary>
    /// Unique identifier of the experiment.
    /// </summary>
    public required string Id { get; init; }

    /// <summary>
    /// Configuration the experiment was started with.
    /// </summary>
    public required ChaosExperimentConfig Config { get; init; }

    /// <summary>
    /// Status of the experiment at the time of the snapshot.
    /// </summary>
    public required ChaosExperimentStatus Status { get; init; }

    /// <summary>
    /// Time at which the experiment was created.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Time at which the experiment began running, if it has.
    /// </summary>
    public DateTimeOffset? StartedAt { get; init; }

    /// <summary>
    /// Time at which the experiment ended, if it has.
    /// </summary>
    public DateTimeOffset? EndedAt { get; init; }

    /// <summary>
    /// Time at which the experiment is scheduled to end.
    /// </summary>
    public DateTimeOffset? ScheduledEndAt { get; init; }

    /// <summary>
    /// Count of operations the experiment has affected so far.
    /// </summary>
    public int AffectedOperations { get; init; }

    /// <summary>
    /// Error description, populated when the status is <see cref="ChaosExperimentStatus.Failed"/>.
    /// </summary>
    public string? ErrorMessage { get; init; }
}
/// <summary>
/// Answer from the chaos system to the question "should a fault be injected into this operation?".
/// </summary>
public sealed record ChaosDecision
{
    /// <summary>
    /// <c>true</c> when the operation should be made to fail.
    /// </summary>
    public required bool ShouldFail { get; init; }

    /// <summary>
    /// Identifier of the experiment responsible for the fault, if any.
    /// </summary>
    public string? ExperimentId { get; init; }

    /// <summary>
    /// Kind of fault to inject.
    /// </summary>
    public ChaosFaultType? FaultType { get; init; }

    /// <summary>
    /// Fault parameters of the responsible experiment.
    /// </summary>
    public ChaosFaultConfig? FaultConfig { get; init; }

    /// <summary>
    /// Delay to inject, when applicable.
    /// </summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>
    /// Error text to surface, when applicable.
    /// </summary>
    public string? InjectedError { get; init; }

    /// <summary>
    /// HTTP status code to surface, when applicable.
    /// </summary>
    public int? InjectedStatusCode { get; init; }

    /// <summary>
    /// Human-readable explanation of the decision.
    /// </summary>
    public string? Reason { get; init; }

    /// <summary>
    /// Builds the decision that means "do not inject anything".
    /// </summary>
    /// <returns>A decision with <see cref="ShouldFail"/> set to <c>false</c> and a fixed reason text.</returns>
    public static ChaosDecision NoFault()
    {
        return new ChaosDecision
        {
            ShouldFail = false,
            Reason = "No active chaos experiment"
        };
    }
}
/// <summary>
/// Describes what happened to a single operation that was affected by a chaos experiment.
/// </summary>
public sealed record ChaosOutcome
{
    /// <summary>
    /// Kind of outcome that was observed.
    /// </summary>
    public required ChaosOutcomeType Type { get; init; }

    /// <summary>
    /// Type of the channel that was affected.
    /// </summary>
    public required string ChannelType { get; init; }

    /// <summary>
    /// Identifier of the channel that was affected, when known.
    /// </summary>
    public string? ChannelId { get; init; }

    /// <summary>
    /// Identifier of the tenant that was affected, when known.
    /// </summary>
    public string? TenantId { get; init; }

    /// <summary>
    /// How long the operation took, when measured.
    /// </summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>
    /// <c>true</c> when the operation triggered a fallback.
    /// </summary>
    public bool FallbackTriggered { get; init; }

    /// <summary>
    /// <c>true</c> when the operation triggered a retry.
    /// </summary>
    public bool RetryTriggered { get; init; }

    /// <summary>
    /// Error text, populated when the operation failed.
    /// </summary>
    public string? ErrorMessage { get; init; }

    /// <summary>
    /// Time the outcome was recorded; defaults to the UTC system clock at construction.
    /// </summary>
    public DateTimeOffset Timestamp { get; init; } = DateTimeOffset.UtcNow;
}
/// <summary>
/// Kinds of outcome that can be recorded for a chaos-affected operation.
/// </summary>
public enum ChaosOutcomeType
{
    /// <summary>
    /// A fault was injected and the operation failed.
    /// </summary>
    FaultInjected,

    /// <summary>
    /// A fault was injected, but the operation recovered from it.
    /// </summary>
    RecoveredFromFault,

    /// <summary>
    /// Latency injection delayed the operation.
    /// </summary>
    LatencyInjected,

    /// <summary>
    /// The operation was rate limited.
    /// </summary>
    RateLimited,

    /// <summary>
    /// The operation was let through because of experiment limits.
    /// </summary>
    Bypassed
}
/// <summary>
/// Aggregated results and statistics for one chaos experiment.
/// </summary>
public sealed record ChaosExperimentResults
{
    /// <summary>
    /// Identifier of the experiment the results belong to.
    /// </summary>
    public required string ExperimentId { get; init; }

    /// <summary>
    /// Total number of affected operations.
    /// </summary>
    public required int TotalAffected { get; init; }

    /// <summary>
    /// Number of operations that failed because a fault was injected.
    /// </summary>
    public required int FailedOperations { get; init; }

    /// <summary>
    /// Number of operations that recovered from an injected fault.
    /// </summary>
    public required int RecoveredOperations { get; init; }

    /// <summary>
    /// Number of operations that triggered a fallback.
    /// </summary>
    public required int FallbackTriggered { get; init; }

    /// <summary>
    /// Number of operations that triggered a retry.
    /// </summary>
    public required int RetryTriggered { get; init; }

    /// <summary>
    /// Mean injected latency, when any latency was recorded.
    /// </summary>
    public TimeSpan? AverageInjectedLatency { get; init; }

    /// <summary>
    /// Statistics keyed by channel type.
    /// </summary>
    public IReadOnlyDictionary<string, ChaosChannelStats> ByChannelType { get; init; } = new Dictionary<string, ChaosChannelStats>();

    /// <summary>
    /// The individual outcomes that were recorded.
    /// </summary>
    public IReadOnlyList<ChaosOutcome> Outcomes { get; init; } = [];
}
/// <summary>
/// Per-channel-type statistics of a chaos experiment.
/// </summary>
public sealed record ChaosChannelStats
{
    /// <summary>
    /// Channel type the numbers refer to.
    /// </summary>
    public required string ChannelType { get; init; }

    /// <summary>
    /// Number of affected operations on this channel type.
    /// </summary>
    public required int TotalAffected { get; init; }

    /// <summary>
    /// Number of operations that failed.
    /// </summary>
    public required int Failed { get; init; }

    /// <summary>
    /// Number of operations that recovered.
    /// </summary>
    public required int Recovered { get; init; }

    /// <summary>
    /// Number of times a fallback was triggered.
    /// </summary>
    public required int Fallbacks { get; init; }
}
/// <summary>
/// Settings that govern chaos testing.
/// </summary>
public sealed class ChaosTestOptions
{
    /// <summary>
    /// Configuration section name for these options.
    /// </summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>
    /// Master switch for chaos testing; off unless explicitly set.
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Limit on the number of experiments running at the same time; defaults to 5.
    /// </summary>
    public int MaxConcurrentExperiments { get; set; } = 5;

    /// <summary>
    /// Longest duration any single experiment may have; defaults to one hour.
    /// </summary>
    public TimeSpan MaxExperimentDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// How long completed experiments are retained; defaults to seven days.
    /// </summary>
    public TimeSpan ExperimentRetention { get; set; } = TimeSpan.FromDays(7);

    /// <summary>
    /// When <c>true</c> (the default), experiments must target an explicit tenant.
    /// </summary>
    public bool RequireTenantTarget { get; set; } = true;

    /// <summary>
    /// Initiators permitted to run experiments; an empty list allows everyone.
    /// </summary>
    public IReadOnlyList<string> AllowedInitiators { get; set; } = [];
}
/// <summary>
/// In-memory implementation of <see cref="IChaosTestRunner"/>.
/// </summary>
/// <remarks>
/// Experiments are kept in a dictionary owned by this instance. All mutable state of a single
/// experiment (its <see cref="ChaosExperiment"/> snapshot, its <see cref="Random"/> and its
/// outcome list) is read and written only while holding that experiment's gate, because
/// neither <see cref="Random"/> nor <see cref="List{T}"/> is safe for concurrent use and the
/// snapshot is updated with read-modify-write <c>with</c> expressions.
/// </remarks>
public sealed class InMemoryChaosTestRunner : IChaosTestRunner
{
    private readonly ConcurrentDictionary<string, ChaosExperimentState> _experiments = new();

    // Serializes "count running experiments, then insert" so the concurrency limit cannot be overshot.
    private readonly object _startGate = new();

    private readonly ChaosTestOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<InMemoryChaosTestRunner> _logger;

    /// <summary>
    /// Creates the runner.
    /// </summary>
    /// <param name="options">Chaos options; default options are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for all timestamps; the system clock is used when <c>null</c>.</param>
    /// <param name="logger">Logger for experiment lifecycle messages.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public InMemoryChaosTestRunner(
        IOptions<ChaosTestOptions> options,
        TimeProvider timeProvider,
        ILogger<InMemoryChaosTestRunner> logger)
    {
        _options = options?.Value ?? new ChaosTestOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Validates the configuration against the options and, when accepted, registers a new
    /// experiment that is immediately in the <see cref="ChaosExperimentStatus.Running"/> state.
    /// </summary>
    /// <param name="config">Experiment configuration.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <returns>The started experiment.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="config"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// Chaos testing is disabled, a tenant target is required but missing, the duration exceeds
    /// the configured maximum, or the concurrent-experiment limit has been reached.
    /// </exception>
    /// <exception cref="UnauthorizedAccessException">The initiator is not in the allow list.</exception>
    public Task<ChaosExperiment> StartExperimentAsync(ChaosExperimentConfig config, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(config);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos testing is not enabled");
        }

        // Validate initiator
        if (_options.AllowedInitiators.Count > 0 && !_options.AllowedInitiators.Contains(config.InitiatedBy))
        {
            throw new UnauthorizedAccessException($"Initiator '{config.InitiatedBy}' is not allowed to run chaos experiments");
        }

        // Validate tenant targeting
        if (_options.RequireTenantTarget && string.IsNullOrEmpty(config.TenantId))
        {
            throw new InvalidOperationException("Tenant targeting is required for chaos experiments");
        }

        // Validate duration
        if (config.Duration > _options.MaxExperimentDuration)
        {
            throw new InvalidOperationException($"Experiment duration exceeds maximum of {_options.MaxExperimentDuration}");
        }

        ChaosExperiment experiment;
        lock (_startGate)
        {
            // Refresh each experiment first so that ones whose scheduled end has passed
            // no longer count against the concurrency limit.
            var runningCount = _experiments.Values.Count(
                s => RefreshAndSnapshot(s).Status == ChaosExperimentStatus.Running);
            if (runningCount >= _options.MaxConcurrentExperiments)
            {
                throw new InvalidOperationException($"Maximum concurrent experiments ({_options.MaxConcurrentExperiments}) reached");
            }

            var now = _timeProvider.GetUtcNow();
            var experimentId = $"chaos-{Guid.NewGuid():N}";

            experiment = new ChaosExperiment
            {
                Id = experimentId,
                Config = config,
                Status = ChaosExperimentStatus.Running,
                CreatedAt = now,
                StartedAt = now,
                ScheduledEndAt = now.Add(config.Duration),
                AffectedOperations = 0
            };

            _experiments[experimentId] = new ChaosExperimentState
            {
                Experiment = experiment,
                // A configured seed makes the sequence of random decisions reproducible.
                Random = config.FaultConfig.Seed.HasValue
                    ? new Random(config.FaultConfig.Seed.Value)
                    : new Random(),
                Outcomes = [],
                RateLimitBucket = new RateLimitBucket(_timeProvider)
            };
        }

        _logger.LogInformation(
            "Started chaos experiment {ExperimentId}: {Name} ({FaultType}) targeting {ChannelTypes}",
            experiment.Id,
            config.Name,
            config.FaultType,
            string.Join(", ", config.TargetChannelTypes));

        return Task.FromResult(experiment);
    }

    /// <summary>
    /// Marks a running experiment as <see cref="ChaosExperimentStatus.Stopped"/>. Unknown IDs and
    /// experiments that are no longer running (including ones that have just expired) are left untouched.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment to stop.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    public Task StopExperimentAsync(string experimentId, CancellationToken ct = default)
    {
        if (!_experiments.TryGetValue(experimentId, out var state))
        {
            return Task.CompletedTask;
        }

        lock (state.Gate)
        {
            // An experiment whose scheduled end has already passed is Completed, not Stopped.
            CheckAndUpdateExperimentStatus(state);

            if (state.Experiment.Status == ChaosExperimentStatus.Running)
            {
                var stopped = state.Experiment with
                {
                    Status = ChaosExperimentStatus.Stopped,
                    EndedAt = _timeProvider.GetUtcNow()
                };
                state.Experiment = stopped;

                _logger.LogInformation(
                    "Stopped chaos experiment {ExperimentId} after {AffectedOps} affected operations",
                    experimentId,
                    stopped.AffectedOperations);
            }
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Returns the up-to-date snapshot of an experiment.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <returns>The experiment, or <c>null</c> when the ID is unknown.</returns>
    public Task<ChaosExperiment?> GetExperimentAsync(string experimentId, CancellationToken ct = default)
    {
        ChaosExperiment? found = _experiments.TryGetValue(experimentId, out var state)
            ? RefreshAndSnapshot(state)
            : null;

        return Task.FromResult(found);
    }

    /// <summary>
    /// Lists experiments, newest first, after refreshing their statuses.
    /// </summary>
    /// <param name="status">Status to filter by; <c>null</c> returns every status.</param>
    /// <param name="limit">Maximum number of experiments to return.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <returns>Up to <paramref name="limit"/> experiments ordered by creation time, descending.</returns>
    public Task<IReadOnlyList<ChaosExperiment>> ListExperimentsAsync(
        ChaosExperimentStatus? status = null,
        int limit = 100,
        CancellationToken ct = default)
    {
        // Materialize once so filtering and ordering work on one consistent set of snapshots.
        IEnumerable<ChaosExperiment> query = _experiments.Values
            .Select(RefreshAndSnapshot)
            .ToList();

        if (status.HasValue)
        {
            query = query.Where(e => e.Status == status.Value);
        }

        var result = query
            .OrderByDescending(e => e.CreatedAt)
            .Take(limit)
            .ToList();

        return Task.FromResult<IReadOnlyList<ChaosExperiment>>(result);
    }

    /// <summary>
    /// Evaluates the running experiments against an operation and returns the first decision
    /// that injects a failure or latency. The matching experiment's affected-operation counter
    /// is incremented as part of the same critical section.
    /// </summary>
    /// <param name="tenantId">Tenant performing the operation.</param>
    /// <param name="channelType">Type of the channel being used.</param>
    /// <param name="channelId">
    /// Specific channel being used. When it is <c>null</c> or empty, an experiment's channel-ID
    /// filter is not applied.
    /// </param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <returns>
    /// The injected-fault decision, or <see cref="ChaosDecision.NoFault"/> when chaos testing is
    /// disabled or no running experiment injects anything.
    /// </returns>
    public Task<ChaosDecision> ShouldFailAsync(
        string tenantId,
        string channelType,
        string? channelId = null,
        CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(ChaosDecision.NoFault());
        }

        foreach (var state in _experiments.Values)
        {
            lock (state.Gate)
            {
                // The refresh also completes experiments that have reached MaxAffectedOperations,
                // so a Running status below implies the cap has not been hit yet.
                CheckAndUpdateExperimentStatus(state);

                var experiment = state.Experiment;
                if (experiment.Status != ChaosExperimentStatus.Running)
                {
                    continue;
                }

                var config = experiment.Config;
                if (!IsTargeted(config, tenantId, channelType, channelId))
                {
                    continue;
                }

                var decision = EvaluateFault(state, config);
                if (!decision.ShouldFail && !decision.InjectedLatency.HasValue)
                {
                    continue;
                }

                // Increment under the gate so concurrent callers cannot lose updates
                // or overwrite a status change made by another thread.
                state.Experiment = experiment with
                {
                    AffectedOperations = experiment.AffectedOperations + 1
                };

                return Task.FromResult(decision);
            }
        }

        return Task.FromResult(ChaosDecision.NoFault());
    }

    /// <summary>
    /// Appends an outcome to the experiment's outcome list. Unknown experiment IDs are ignored.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment.</param>
    /// <param name="outcome">Outcome to record.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <exception cref="ArgumentNullException"><paramref name="outcome"/> is <c>null</c>.</exception>
    public Task RecordOutcomeAsync(string experimentId, ChaosOutcome outcome, CancellationToken ct = default)
    {
        // A null entry would only surface later as a NullReferenceException in GetResultsAsync.
        ArgumentNullException.ThrowIfNull(outcome);

        if (_experiments.TryGetValue(experimentId, out var state))
        {
            lock (state.Gate)
            {
                state.Outcomes.Add(outcome);
            }
        }

        return Task.CompletedTask;
    }

    /// <summary>
    /// Aggregates the recorded outcomes of an experiment.
    /// </summary>
    /// <param name="experimentId">Identifier of the experiment.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <returns>
    /// The aggregated results; all-zero results when the ID is unknown. The average latency is
    /// computed from <see cref="ChaosOutcomeType.LatencyInjected"/> outcomes that carry a duration.
    /// </returns>
    public Task<ChaosExperimentResults> GetResultsAsync(string experimentId, CancellationToken ct = default)
    {
        if (!_experiments.TryGetValue(experimentId, out var state))
        {
            return Task.FromResult(new ChaosExperimentResults
            {
                ExperimentId = experimentId,
                TotalAffected = 0,
                FailedOperations = 0,
                RecoveredOperations = 0,
                FallbackTriggered = 0,
                RetryTriggered = 0
            });
        }

        // Copy under the gate; the statistics are then computed on the private copy.
        List<ChaosOutcome> outcomes;
        lock (state.Gate)
        {
            outcomes = new List<ChaosOutcome>(state.Outcomes);
        }

        var byChannel = outcomes
            .GroupBy(o => o.ChannelType)
            .ToDictionary(
                g => g.Key,
                g => new ChaosChannelStats
                {
                    ChannelType = g.Key,
                    TotalAffected = g.Count(),
                    Failed = g.Count(o => o.Type == ChaosOutcomeType.FaultInjected),
                    Recovered = g.Count(o => o.Type == ChaosOutcomeType.RecoveredFromFault),
                    Fallbacks = g.Count(o => o.FallbackTriggered)
                });

        var latencies = outcomes
            .Where(o => o.Duration.HasValue && o.Type == ChaosOutcomeType.LatencyInjected)
            .Select(o => o.Duration!.Value)
            .ToList();

        return Task.FromResult(new ChaosExperimentResults
        {
            ExperimentId = experimentId,
            TotalAffected = outcomes.Count,
            FailedOperations = outcomes.Count(o => o.Type == ChaosOutcomeType.FaultInjected),
            RecoveredOperations = outcomes.Count(o => o.Type == ChaosOutcomeType.RecoveredFromFault),
            FallbackTriggered = outcomes.Count(o => o.FallbackTriggered),
            RetryTriggered = outcomes.Count(o => o.RetryTriggered),
            AverageInjectedLatency = latencies.Count > 0
                ? TimeSpan.FromMilliseconds(latencies.Average(l => l.TotalMilliseconds))
                : null,
            ByChannelType = byChannel,
            Outcomes = outcomes
        });
    }

    /// <summary>
    /// Removes experiments that are completed, stopped or failed and whose end time is older
    /// than <paramref name="olderThan"/>. Statuses are refreshed first, so experiments whose
    /// scheduled end has passed are eligible even if nothing has queried them since.
    /// </summary>
    /// <param name="olderThan">Minimum time since the experiment ended.</param>
    /// <param name="ct">Not observed by this implementation.</param>
    /// <returns>The number of experiments removed.</returns>
    public Task<int> CleanupAsync(TimeSpan olderThan, CancellationToken ct = default)
    {
        var cutoff = _timeProvider.GetUtcNow() - olderThan;
        var removed = 0;

        foreach (var (id, state) in _experiments)
        {
            var experiment = RefreshAndSnapshot(state);

            var finished = experiment.Status
                is ChaosExperimentStatus.Completed
                or ChaosExperimentStatus.Stopped
                or ChaosExperimentStatus.Failed;

            if (finished
                && experiment.EndedAt is { } endedAt
                && endedAt < cutoff
                && _experiments.TryRemove(id, out _))
            {
                removed++;
            }
        }

        if (removed > 0)
        {
            _logger.LogInformation("Cleaned up {Count} completed chaos experiments", removed);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Tells whether an experiment's tenant, channel-type and channel-ID filters accept the operation.
    /// </summary>
    private static bool IsTargeted(
        ChaosExperimentConfig config,
        string tenantId,
        string channelType,
        string? channelId)
    {
        // An empty tenant on the experiment means "all tenants".
        if (!string.IsNullOrEmpty(config.TenantId) && config.TenantId != tenantId)
        {
            return false;
        }

        // An empty type list means "all channel types".
        if (config.TargetChannelTypes.Count > 0 && !config.TargetChannelTypes.Contains(channelType))
        {
            return false;
        }

        // The ID filter only rejects when the caller actually supplied a channel ID.
        if (config.TargetChannelIds.Count > 0
            && !string.IsNullOrEmpty(channelId)
            && !config.TargetChannelIds.Contains(channelId))
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Builds the decision for one experiment according to its fault type.
    /// The caller must hold <c>state.Gate</c>.
    /// </summary>
    private ChaosDecision EvaluateFault(ChaosExperimentState state, ChaosExperimentConfig config)
    {
        var faultConfig = config.FaultConfig;

        return config.FaultType switch
        {
            ChaosFaultType.Outage => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.Outage,
                FaultConfig = faultConfig,
                InjectedError = faultConfig.ErrorMessage ?? "Chaos: Simulated outage",
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                Reason = "Complete outage simulation"
            },
            ChaosFaultType.PartialFailure => EvaluatePartialFailure(state, faultConfig),
            ChaosFaultType.Latency => EvaluateLatency(state, faultConfig),
            ChaosFaultType.Intermittent => EvaluateIntermittent(state, faultConfig),
            ChaosFaultType.RateLimit => EvaluateRateLimit(state, faultConfig),
            ChaosFaultType.Timeout => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.Timeout,
                FaultConfig = faultConfig,
                InjectedLatency = faultConfig.TimeoutDuration,
                InjectedError = "Chaos: Request timeout",
                Reason = "Timeout simulation"
            },
            ChaosFaultType.ErrorResponse => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.ErrorResponse,
                FaultConfig = faultConfig,
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                InjectedError = faultConfig.ErrorMessage ?? $"Chaos: HTTP {faultConfig.ErrorStatusCode}",
                Reason = "Error response simulation"
            },
            ChaosFaultType.CorruptResponse => new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.CorruptResponse,
                FaultConfig = faultConfig,
                InjectedError = "Chaos: Corrupted response data",
                Reason = "Corrupt response simulation"
            },
            _ => ChaosDecision.NoFault()
        };
    }

    /// <summary>
    /// Fails the operation with probability <see cref="ChaosFaultConfig.FailureRate"/>.
    /// </summary>
    private ChaosDecision EvaluatePartialFailure(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        if (state.Random.NextDouble() < faultConfig.FailureRate)
        {
            return new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.PartialFailure,
                FaultConfig = faultConfig,
                InjectedError = faultConfig.ErrorMessage ?? "Chaos: Partial failure",
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                Reason = $"Partial failure ({faultConfig.FailureRate:P0} rate)"
            };
        }

        return ChaosDecision.NoFault();
    }

    /// <summary>
    /// Picks a random latency between the configured minimum and maximum. The decision never
    /// marks the operation as failed.
    /// </summary>
    private ChaosDecision EvaluateLatency(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        // Guard against misconfiguration: a negative minimum is treated as zero and an
        // inverted range collapses to the minimum, so the result is never negative
        // (a negative delay would make Task.Delay throw, or wait forever at -1 ms).
        var lower = faultConfig.MinLatency < TimeSpan.Zero ? TimeSpan.Zero : faultConfig.MinLatency;
        var upper = faultConfig.MaxLatency < lower ? lower : faultConfig.MaxLatency;

        // Exactly one random draw per call, which keeps seeded runs reproducible.
        var randomLatency = lower + TimeSpan.FromMilliseconds(
            state.Random.NextDouble() * (upper - lower).TotalMilliseconds);

        return new ChaosDecision
        {
            ShouldFail = false, // Latency doesn't cause failure
            ExperimentId = state.Experiment.Id,
            FaultType = ChaosFaultType.Latency,
            FaultConfig = faultConfig,
            InjectedLatency = randomLatency,
            Reason = $"Latency injection ({randomLatency.TotalMilliseconds:F0}ms)"
        };
    }

    /// <summary>
    /// Fails the operation when it passes the failure-rate draw and an additional one-in-three
    /// draw. The second draw is only taken when the first one passes.
    /// </summary>
    private ChaosDecision EvaluateIntermittent(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        // More random pattern than partial failure
        var shouldFail = state.Random.NextDouble() < faultConfig.FailureRate &&
            state.Random.Next(3) == 0; // Additional randomness

        if (shouldFail)
        {
            return new ChaosDecision
            {
                ShouldFail = true,
                ExperimentId = state.Experiment.Id,
                FaultType = ChaosFaultType.Intermittent,
                FaultConfig = faultConfig,
                InjectedError = faultConfig.ErrorMessage ?? "Chaos: Intermittent failure",
                InjectedStatusCode = faultConfig.ErrorStatusCode,
                Reason = "Intermittent failure simulation"
            };
        }

        return ChaosDecision.NoFault();
    }

    /// <summary>
    /// Lets operations through while the per-minute budget lasts and fails the rest with status 429.
    /// </summary>
    private ChaosDecision EvaluateRateLimit(ChaosExperimentState state, ChaosFaultConfig faultConfig)
    {
        if (state.RateLimitBucket.TryConsume(faultConfig.RateLimitPerMinute))
        {
            return ChaosDecision.NoFault();
        }

        return new ChaosDecision
        {
            ShouldFail = true,
            ExperimentId = state.Experiment.Id,
            FaultType = ChaosFaultType.RateLimit,
            FaultConfig = faultConfig,
            InjectedStatusCode = 429,
            InjectedError = "Chaos: Rate limit exceeded",
            Reason = $"Rate limited ({faultConfig.RateLimitPerMinute}/min)"
        };
    }

    /// <summary>
    /// Refreshes an experiment's status under its gate and returns the resulting snapshot.
    /// </summary>
    private ChaosExperiment RefreshAndSnapshot(ChaosExperimentState state)
    {
        lock (state.Gate)
        {
            CheckAndUpdateExperimentStatus(state);
            return state.Experiment;
        }
    }

    /// <summary>
    /// Moves a running experiment to <see cref="ChaosExperimentStatus.Completed"/> when its
    /// scheduled end has passed or its operation cap has been reached. Exactly one transition
    /// (and one log entry) is made per experiment. The caller must hold <c>state.Gate</c>.
    /// </summary>
    private void CheckAndUpdateExperimentStatus(ChaosExperimentState state)
    {
        var experiment = state.Experiment;
        if (experiment.Status != ChaosExperimentStatus.Running)
        {
            return;
        }

        var now = _timeProvider.GetUtcNow();

        // Check if expired
        if (experiment.ScheduledEndAt is { } scheduledEnd && now >= scheduledEnd)
        {
            state.Experiment = experiment with
            {
                Status = ChaosExperimentStatus.Completed,
                EndedAt = scheduledEnd
            };

            _logger.LogInformation(
                "Chaos experiment {ExperimentId} completed after {AffectedOps} affected operations",
                experiment.Id,
                experiment.AffectedOperations);

            // Already completed: do not fall through and overwrite EndedAt or log a second time.
            return;
        }

        // Check if max operations reached
        var maxOperations = experiment.Config.MaxAffectedOperations;
        if (maxOperations > 0 && experiment.AffectedOperations >= maxOperations)
        {
            state.Experiment = experiment with
            {
                Status = ChaosExperimentStatus.Completed,
                EndedAt = now
            };

            _logger.LogInformation(
                "Chaos experiment {ExperimentId} completed after reaching max operations ({Max})",
                experiment.Id,
                maxOperations);
        }
    }

    /// <summary>
    /// Mutable per-experiment state. Everything except <see cref="RateLimitBucket"/> (which
    /// has its own lock) must only be touched while holding <see cref="Gate"/>.
    /// </summary>
    private sealed class ChaosExperimentState
    {
        /// <summary>Lock object guarding this experiment's mutable state.</summary>
        public object Gate { get; } = new();

        /// <summary>Latest snapshot of the experiment.</summary>
        public required ChaosExperiment Experiment { get; set; }

        /// <summary>Random source for probabilistic fault decisions.</summary>
        public required Random Random { get; init; }

        /// <summary>Outcomes recorded for the experiment.</summary>
        public required List<ChaosOutcome> Outcomes { get; init; }

        /// <summary>Counter used by the rate-limit fault.</summary>
        public required RateLimitBucket RateLimitBucket { get; init; }
    }

    /// <summary>
    /// Fixed-window counter: allows up to a given number of calls per one-minute window.
    /// </summary>
    private sealed class RateLimitBucket
    {
        private readonly TimeProvider _timeProvider;
        private readonly object _lock = new();
        private DateTimeOffset _windowStart;
        private int _count;

        public RateLimitBucket(TimeProvider timeProvider)
        {
            _timeProvider = timeProvider;
            _windowStart = timeProvider.GetUtcNow();
            _count = 0;
        }

        /// <summary>
        /// Tries to take one slot from the current window.
        /// </summary>
        /// <param name="limit">Number of calls allowed per window.</param>
        /// <returns><c>true</c> when a slot was available; otherwise <c>false</c>.</returns>
        public bool TryConsume(int limit)
        {
            lock (_lock)
            {
                var now = _timeProvider.GetUtcNow();

                // Start a fresh window once a full minute has elapsed.
                if ((now - _windowStart).TotalMinutes >= 1)
                {
                    _windowStart = now;
                    _count = 0;
                }

                if (_count < limit)
                {
                    _count++;
                    return true;
                }

                return false;
            }
        }
    }
}
/// <summary>
/// Extension and factory helpers for chaos testing.
/// </summary>
public static class ChaosTestExtensions
{
    /// <summary>
    /// Applies a chaos decision to the current operation: waits for the injected latency
    /// (if any) and then throws when the decision says the operation should fail.
    /// </summary>
    /// <param name="decision">Decision to apply.</param>
    /// <param name="ct">Token that cancels the injected delay.</param>
    /// <exception cref="ArgumentNullException"><paramref name="decision"/> is <c>null</c>.</exception>
    /// <exception cref="ChaosInjectedException">The decision has <c>ShouldFail</c> set.</exception>
    public static async Task ApplyChaosAsync(this ChaosDecision decision, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(decision);

        // Only strictly positive delays are awaited: a negative TimeSpan would make
        // Task.Delay throw (or wait forever at -1 ms), and a zero delay is a no-op anyway.
        if (decision.InjectedLatency is { } latency && latency > TimeSpan.Zero)
        {
            await Task.Delay(latency, ct);
        }

        if (decision.ShouldFail)
        {
            throw new ChaosInjectedException(decision);
        }
    }

    /// <summary>
    /// Creates a simple outage experiment config.
    /// </summary>
    /// <param name="name">Experiment name.</param>
    /// <param name="initiatedBy">Identity of the initiator.</param>
    /// <param name="channelTypes">Channel types to affect.</param>
    /// <param name="tenantId">Tenant to target; <c>null</c> leaves the tenant unset.</param>
    /// <param name="duration">Experiment duration; five minutes when <c>null</c>.</param>
    /// <returns>A configuration with the <c>Outage</c> fault type.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="channelTypes"/> is <c>null</c>.</exception>
    public static ChaosExperimentConfig CreateOutageExperiment(
        string name,
        string initiatedBy,
        IReadOnlyList<string> channelTypes,
        string? tenantId = null,
        TimeSpan? duration = null)
    {
        // Fail fast: a null list would otherwise only surface when the experiment is started or evaluated.
        ArgumentNullException.ThrowIfNull(channelTypes);

        return new ChaosExperimentConfig
        {
            Name = name,
            InitiatedBy = initiatedBy,
            TenantId = tenantId,
            TargetChannelTypes = channelTypes,
            FaultType = ChaosFaultType.Outage,
            Duration = duration ?? TimeSpan.FromMinutes(5)
        };
    }

    /// <summary>
    /// Creates a latency injection experiment config.
    /// </summary>
    /// <param name="name">Experiment name.</param>
    /// <param name="initiatedBy">Identity of the initiator.</param>
    /// <param name="channelTypes">Channel types to affect.</param>
    /// <param name="minLatency">Lower bound of the injected latency.</param>
    /// <param name="maxLatency">Upper bound of the injected latency.</param>
    /// <param name="tenantId">Tenant to target; <c>null</c> leaves the tenant unset.</param>
    /// <param name="duration">Experiment duration; five minutes when <c>null</c>.</param>
    /// <returns>A configuration with the <c>Latency</c> fault type.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="channelTypes"/> is <c>null</c>.</exception>
    public static ChaosExperimentConfig CreateLatencyExperiment(
        string name,
        string initiatedBy,
        IReadOnlyList<string> channelTypes,
        TimeSpan minLatency,
        TimeSpan maxLatency,
        string? tenantId = null,
        TimeSpan? duration = null)
    {
        // Fail fast: a null list would otherwise only surface when the experiment is started or evaluated.
        ArgumentNullException.ThrowIfNull(channelTypes);

        return new ChaosExperimentConfig
        {
            Name = name,
            InitiatedBy = initiatedBy,
            TenantId = tenantId,
            TargetChannelTypes = channelTypes,
            FaultType = ChaosFaultType.Latency,
            FaultConfig = new ChaosFaultConfig
            {
                MinLatency = minLatency,
                MaxLatency = maxLatency
            },
            Duration = duration ?? TimeSpan.FromMinutes(5)
        };
    }
}
/// <summary>
/// Exception thrown when a chaos fault is injected into an operation.
/// The message is the decision's injected error, or a generic text when none is set.
/// </summary>
public sealed class ChaosInjectedException : Exception
{
    /// <summary>
    /// The decision that caused the fault.
    /// </summary>
    public ChaosDecision Decision { get; }

    /// <summary>
    /// Creates the exception for the given decision.
    /// </summary>
    /// <param name="decision">Decision that caused the fault.</param>
    /// <exception cref="ArgumentNullException"><paramref name="decision"/> is <c>null</c>.</exception>
    public ChaosInjectedException(ChaosDecision decision)
        : base(MessageFor(decision))
    {
        Decision = decision;
    }

    // Validates before the base constructor runs so a null decision reports
    // ArgumentNullException instead of a NullReferenceException.
    private static string MessageFor(ChaosDecision decision)
    {
        ArgumentNullException.ThrowIfNull(decision);
        return decision.InjectedError ?? "Chaos fault injected";
    }
}