Files
git.stella-ops.org/src/Policy/StellaOps.Policy.Engine/Telemetry/IncidentMode.cs

213 lines
6.9 KiB
C#

using System.Diagnostics;
using System.Globalization;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using OpenTelemetry.Trace;
namespace StellaOps.Policy.Engine.Telemetry;
/// <summary>
/// Service for managing incident mode, which forces 100% trace sampling
/// while it is active.
/// </summary>
/// <remarks>
/// <para>
/// The current state is held as an immutable <see cref="IncidentModeState"/> instance that is
/// replaced atomically, so readers always observe a consistent snapshot.
/// </para>
/// <para>
/// Expiration is only applied when <see cref="CheckExpiration"/> is invoked;
/// <see cref="IsActive"/> and <see cref="State"/> do not compare against the clock themselves.
/// </para>
/// </remarks>
public sealed class IncidentModeService
{
    /// <summary>Shared snapshot used whenever incident mode is off.</summary>
    private static readonly IncidentModeState InactiveState = new(false, null, null, null);

    private readonly ILogger<IncidentModeService> _logger;
    private readonly TimeProvider _timeProvider;
    private readonly IOptionsMonitor<PolicyEngineTelemetryOptions> _optionsMonitor;
    private volatile IncidentModeState _state = InactiveState;

    /// <summary>
    /// Creates the service and, when the current options have <c>IncidentMode</c> set,
    /// starts in the active state with reason <c>"configuration"</c> and no expiry.
    /// </summary>
    /// <param name="logger">Logger used for state-transition messages.</param>
    /// <param name="timeProvider">Clock used for activation and expiry timestamps.</param>
    /// <param name="optionsMonitor">Source of the telemetry options.</param>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public IncidentModeService(
        ILogger<IncidentModeService> logger,
        TimeProvider timeProvider,
        IOptionsMonitor<PolicyEngineTelemetryOptions> optionsMonitor)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _optionsMonitor = optionsMonitor ?? throw new ArgumentNullException(nameof(optionsMonitor));

        // Initialize from configuration
        if (_optionsMonitor.CurrentValue.IncidentMode)
        {
            _state = new IncidentModeState(
                true,
                _timeProvider.GetUtcNow(),
                null,
                "configuration");
        }
    }

    /// <summary>
    /// Gets the current incident mode state.
    /// </summary>
    public IncidentModeState State => _state;

    /// <summary>
    /// Gets whether incident mode is currently active.
    /// </summary>
    public bool IsActive => _state.IsActive;

    /// <summary>
    /// Enables incident mode, replacing any previous state (including a previous expiry).
    /// </summary>
    /// <param name="reason">Reason for enabling incident mode.</param>
    /// <param name="duration">
    /// Optional duration after which incident mode auto-disables; when omitted the mode
    /// stays active until <see cref="Disable"/> is called.
    /// </param>
    public void Enable(string reason, TimeSpan? duration = null)
    {
        var now = _timeProvider.GetUtcNow();
        var expiresAt = duration.HasValue ? now.Add(duration.Value) : (DateTimeOffset?)null;

        _state = new IncidentModeState(true, now, expiresAt, reason);

        _logger.LogWarning(
            "Incident mode ENABLED. Reason: {Reason}, ExpiresAt: {ExpiresAt}",
            reason,
            expiresAt?.ToString("O", CultureInfo.InvariantCulture) ?? "never");

        // Signal the activation through the engine's telemetry helper.
        PolicyEngineTelemetry.RecordError("incident_mode_enabled", null);
    }

    /// <summary>
    /// Disables incident mode. The transition is logged only if the mode was active.
    /// </summary>
    /// <param name="reason">Reason for disabling incident mode.</param>
    public void Disable(string reason)
    {
        // Swap atomically so that, with concurrent callers, exactly one of them
        // observes the active -> inactive transition and logs it.
        var previous = Interlocked.Exchange(ref _state, InactiveState);
        if (previous.IsActive)
        {
            LogDisabled(reason);
        }
    }

    /// <summary>
    /// Checks if incident mode should be auto-disabled due to expiration.
    /// </summary>
    /// <remarks>
    /// The state is cleared only if it is still the same snapshot that was found to be
    /// expired; an <see cref="Enable"/> call that lands concurrently is never overwritten.
    /// </remarks>
    public void CheckExpiration()
    {
        var snapshot = _state;
        if (!snapshot.IsActive || !snapshot.ExpiresAt.HasValue)
        {
            return;
        }

        if (_timeProvider.GetUtcNow() < snapshot.ExpiresAt.Value)
        {
            return;
        }

        // Compare-and-swap on the snapshot reference: if another thread re-enabled
        // (or already disabled) incident mode in the meantime, leave its state alone.
        var observed = Interlocked.CompareExchange(ref _state, InactiveState, snapshot);
        if (ReferenceEquals(observed, snapshot))
        {
            LogDisabled("auto-expired");
        }
    }

    /// <summary>
    /// Gets the effective sampling ratio, considering incident mode.
    /// </summary>
    /// <returns>
    /// <c>1.0</c> while incident mode is active; otherwise the configured
    /// <c>TraceSamplingRatio</c> from the current options.
    /// </returns>
    public double GetEffectiveSamplingRatio()
    {
        if (_state.IsActive)
        {
            return 1.0; // 100% sampling during incident mode
        }

        return _optionsMonitor.CurrentValue.TraceSamplingRatio;
    }

    /// <summary>Writes the single log entry emitted when incident mode turns off.</summary>
    private void LogDisabled(string reason)
        => _logger.LogInformation("Incident mode DISABLED. Reason: {Reason}", reason);
}
/// <summary>
/// Represents the current state of incident mode.
/// </summary>
/// <param name="IsActive">Whether incident mode is currently on.</param>
/// <param name="ActivatedAt">UTC time at which incident mode was turned on; <see langword="null"/> when inactive.</param>
/// <param name="ExpiresAt">Time at which incident mode should auto-disable; <see langword="null"/> when no expiry was requested or when inactive.</param>
/// <param name="Reason">Reason supplied when incident mode was turned on; <see langword="null"/> when inactive.</param>
public sealed record IncidentModeState(
bool IsActive,
DateTimeOffset? ActivatedAt,
DateTimeOffset? ExpiresAt,
string? Reason);
/// <summary>
/// Trace sampler that records and samples every span while incident mode is active
/// and otherwise defers to a trace-id ratio sampler.
/// </summary>
public sealed class IncidentModeSampler : Sampler
{
    private readonly IncidentModeService _incidentMode;
    private readonly Sampler _fallback;

    /// <summary>
    /// Creates the sampler.
    /// </summary>
    /// <param name="incidentModeService">Service consulted on every sampling decision.</param>
    /// <param name="baseSamplingRatio">Ratio handed to the fallback <see cref="TraceIdRatioBasedSampler"/>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="incidentModeService"/> is <see langword="null"/>.</exception>
    public IncidentModeSampler(IncidentModeService incidentModeService, double baseSamplingRatio)
    {
        ArgumentNullException.ThrowIfNull(incidentModeService);

        _incidentMode = incidentModeService;
        _fallback = new TraceIdRatioBasedSampler(baseSamplingRatio);
    }

    /// <inheritdoc />
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
    {
        // Incident mode off: the ratio-based sampler decides.
        if (!_incidentMode.IsActive)
        {
            return _fallback.ShouldSample(in samplingParameters);
        }

        // Incident mode on: keep everything.
        return new SamplingResult(SamplingDecision.RecordAndSample);
    }
}
/// <summary>
/// Extension methods for wiring incident mode into tracing.
/// </summary>
public static class IncidentModeExtensions
{
    /// <summary>
    /// Installs an <see cref="IncidentModeSampler"/> as the sampler of the tracer provider.
    /// </summary>
    /// <param name="builder">Tracer provider builder to configure.</param>
    /// <param name="incidentModeService">Service the sampler consults for the incident-mode flag.</param>
    /// <param name="baseSamplingRatio">Sampling ratio used while incident mode is not active.</param>
    /// <returns>The builder returned by <c>SetSampler</c>, for chaining.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="builder"/> or <paramref name="incidentModeService"/> is <see langword="null"/>.
    /// </exception>
    public static TracerProviderBuilder SetIncidentModeSampler(
        this TracerProviderBuilder builder,
        IncidentModeService incidentModeService,
        double baseSamplingRatio)
    {
        if (builder is null)
        {
            throw new ArgumentNullException(nameof(builder));
        }

        if (incidentModeService is null)
        {
            throw new ArgumentNullException(nameof(incidentModeService));
        }

        var sampler = new IncidentModeSampler(incidentModeService, baseSamplingRatio);
        return builder.SetSampler(sampler);
    }
}
/// <summary>
/// Background service that periodically checks incident mode expiration.
/// </summary>
public sealed class IncidentModeExpirationWorker : BackgroundService
{
    private readonly IncidentModeService _incidentModeService;
    private readonly ILogger<IncidentModeExpirationWorker> _logger;

    /// <summary>Pause between two successful expiration checks.</summary>
    private readonly TimeSpan _checkInterval = TimeSpan.FromMinutes(1);

    /// <summary>Pause before retrying after an expiration check threw.</summary>
    private readonly TimeSpan _errorRetryDelay = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="incidentModeService">Service whose expiration is checked.</param>
    /// <param name="logger">Logger for lifecycle and error messages.</param>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public IncidentModeExpirationWorker(
        IncidentModeService incidentModeService,
        ILogger<IncidentModeExpirationWorker> logger)
    {
        _incidentModeService = incidentModeService ?? throw new ArgumentNullException(nameof(incidentModeService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the check loop until <paramref name="stoppingToken"/> is cancelled.
    /// Cancellation during either the regular wait or the error back-off ends the
    /// loop normally instead of surfacing as an exception.
    /// </summary>
    /// <param name="stoppingToken">Token signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogDebug("Incident mode expiration worker started.");

        try
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                var delay = _checkInterval;

                try
                {
                    _incidentModeService.CheckExpiration();
                }
                catch (Exception ex)
                {
                    // Keep the worker alive on unexpected failures and retry sooner.
                    _logger.LogError(ex, "Error checking incident mode expiration.");
                    delay = _errorRetryDelay;
                }

                // Awaited outside the inner catch so a shutdown during the back-off
                // is handled by the cancellation handler below.
                await Task.Delay(delay, stoppingToken);
            }
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            // Host shutdown: fall through to the "stopped" log entry.
        }

        _logger.LogDebug("Incident mode expiration worker stopped.");
    }
}