using System.Diagnostics;
using System.Globalization;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using OpenTelemetry.Trace;

namespace StellaOps.Policy.Engine.Telemetry;

/// <summary>
/// Service for managing incident mode, which enables 100% trace sampling
/// and extended retention during critical periods.
/// </summary>
/// <remarks>
/// The current state is held as a single immutable <see cref="IncidentModeState"/>
/// reference that is swapped atomically, so readers always observe a consistent
/// snapshot. Only the sampling side is implemented in this file; retention
/// handling is not visible here.
/// </remarks>
public sealed class IncidentModeService
{
    // Shared "off" snapshot. The record is immutable, so one instance can be reused.
    private static readonly IncidentModeState Inactive = new(false, null, null, null);

    // NOTE(review): ILogger / IOptionsMonitor appear without type arguments in this
    // file; confirm the intended generic arguments (the options type must expose
    // IncidentMode and TraceSamplingRatio).
    private readonly ILogger _logger;
    private readonly TimeProvider _timeProvider;
    private readonly IOptionsMonitor _optionsMonitor;

    private volatile IncidentModeState _state = Inactive;

    /// <summary>
    /// Creates the service and, when the current options have <c>IncidentMode</c> set,
    /// starts in incident mode with reason <c>"configuration"</c> and no expiration.
    /// </summary>
    /// <param name="logger">Logger used for enable/disable notifications.</param>
    /// <param name="timeProvider">Clock used for activation and expiration timestamps.</param>
    /// <param name="optionsMonitor">Source of the telemetry options.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public IncidentModeService(
        ILogger logger,
        TimeProvider timeProvider,
        IOptionsMonitor optionsMonitor)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _optionsMonitor = optionsMonitor ?? throw new ArgumentNullException(nameof(optionsMonitor));

        // Initialize from configuration
        if (_optionsMonitor.CurrentValue.IncidentMode)
        {
            _state = new IncidentModeState(
                true,
                _timeProvider.GetUtcNow(),
                null,
                "configuration");
        }
    }

    /// <summary>
    /// Gets the current incident mode state.
    /// </summary>
    public IncidentModeState State => _state;

    /// <summary>
    /// Gets whether incident mode is currently active.
    /// </summary>
    /// <remarks>
    /// Expiration is only applied when <see cref="CheckExpiration"/> runs, so this can
    /// remain <c>true</c> for a short time after <see cref="IncidentModeState.ExpiresAt"/>.
    /// </remarks>
    public bool IsActive => _state.IsActive;

    /// <summary>
    /// Enables incident mode, replacing any state that is currently set.
    /// </summary>
    /// <param name="reason">Reason for enabling incident mode.</param>
    /// <param name="duration">Optional duration after which incident mode auto-disables.</param>
    public void Enable(string reason, TimeSpan? duration = null)
    {
        var now = _timeProvider.GetUtcNow();
        var expiresAt = duration.HasValue ? now.Add(duration.Value) : (DateTimeOffset?)null;

        _state = new IncidentModeState(true, now, expiresAt, reason);

        _logger.LogWarning(
            "Incident mode ENABLED. Reason: {Reason}, ExpiresAt: {ExpiresAt}",
            reason,
            expiresAt?.ToString("O", CultureInfo.InvariantCulture) ?? "never");

        PolicyEngineTelemetry.RecordError("incident_mode_enabled", null);
    }

    /// <summary>
    /// Disables incident mode.
    /// </summary>
    /// <param name="reason">Reason for disabling incident mode.</param>
    public void Disable(string reason)
    {
        // Swap atomically so the log decision is based on the state that was
        // actually replaced, not on a separate (possibly stale) read.
        var previous = Interlocked.Exchange(ref _state, Inactive);

        if (previous.IsActive)
        {
            _logger.LogInformation("Incident mode DISABLED. Reason: {Reason}", reason);
        }
    }

    /// <summary>
    /// Checks if incident mode should be auto-disabled due to expiration.
    /// </summary>
    /// <remarks>
    /// Only the exact state snapshot that was found to be expired is cleared. If
    /// <see cref="Enable"/> or <see cref="Disable"/> replaces the state concurrently,
    /// this call leaves the newer state untouched.
    /// </remarks>
    public void CheckExpiration()
    {
        var snapshot = _state;

        if (!snapshot.IsActive || snapshot.ExpiresAt is not { } expiresAt)
        {
            return;
        }

        if (_timeProvider.GetUtcNow() < expiresAt)
        {
            return;
        }

        // Compare-and-swap against the expired snapshot: a fresh Enable() that raced
        // with this check must not be wiped out by the expiration of the old state.
        var replaced = Interlocked.CompareExchange(ref _state, Inactive, snapshot);

        if (ReferenceEquals(replaced, snapshot))
        {
            _logger.LogInformation("Incident mode DISABLED. Reason: {Reason}", "auto-expired");
        }
    }

    /// <summary>
    /// Gets the effective sampling ratio, considering incident mode.
    /// </summary>
    /// <returns>
    /// <c>1.0</c> while incident mode is active; otherwise the configured
    /// <c>TraceSamplingRatio</c> from the current options.
    /// </returns>
    public double GetEffectiveSamplingRatio()
    {
        if (_state.IsActive)
        {
            return 1.0; // 100% sampling during incident mode
        }

        return _optionsMonitor.CurrentValue.TraceSamplingRatio;
    }
}

/// <summary>
/// Represents the current state of incident mode.
/// </summary>
/// <param name="IsActive">Whether incident mode is on.</param>
/// <param name="ActivatedAt">When incident mode was enabled; <c>null</c> when inactive.</param>
/// <param name="ExpiresAt">When incident mode auto-disables; <c>null</c> for no expiration.</param>
/// <param name="Reason">Reason supplied when incident mode was enabled; <c>null</c> when inactive.</param>
public sealed record IncidentModeState(
    bool IsActive,
    DateTimeOffset? ActivatedAt,
    DateTimeOffset? ExpiresAt,
    string? Reason);

/// <summary>
/// A trace sampler that respects incident mode settings.
/// </summary>
public sealed class IncidentModeSampler : Sampler
{
    private readonly IncidentModeService _incidentModeService;
    private readonly Sampler _baseSampler;

    /// <summary>
    /// Creates a sampler that defers to a <see cref="TraceIdRatioBasedSampler"/>
    /// whenever incident mode is not active.
    /// </summary>
    /// <param name="incidentModeService">Service queried for the incident mode flag.</param>
    /// <param name="baseSamplingRatio">
    /// Ratio passed to the base sampler. It is captured once at construction and is
    /// not refreshed afterwards.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="incidentModeService"/> is <c>null</c>.</exception>
    public IncidentModeSampler(IncidentModeService incidentModeService, double baseSamplingRatio)
    {
        _incidentModeService = incidentModeService ?? throw new ArgumentNullException(nameof(incidentModeService));
        _baseSampler = new TraceIdRatioBasedSampler(baseSamplingRatio);
    }

    /// <summary>
    /// Samples every trace while incident mode is active; otherwise delegates to the base sampler.
    /// </summary>
    /// <param name="samplingParameters">Parameters supplied by the tracing SDK.</param>
    /// <returns>The sampling decision for the span.</returns>
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
    {
        // During incident mode, always sample
        if (_incidentModeService.IsActive)
        {
            return new SamplingResult(SamplingDecision.RecordAndSample);
        }

        // Otherwise, use the base sampler
        return _baseSampler.ShouldSample(samplingParameters);
    }
}

/// <summary>
/// Extension methods for configuring incident mode.
/// </summary>
public static class IncidentModeExtensions
{
    /// <summary>
    /// Adds the incident mode sampler to the tracer provider.
    /// </summary>
    /// <param name="builder">Tracer provider builder to configure.</param>
    /// <param name="incidentModeService">Service queried for the incident mode flag.</param>
    /// <param name="baseSamplingRatio">Sampling ratio used while incident mode is inactive.</param>
    /// <returns>The builder returned by <c>SetSampler</c>, for chaining.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="builder"/> or <paramref name="incidentModeService"/> is <c>null</c>.
    /// </exception>
    public static TracerProviderBuilder SetIncidentModeSampler(
        this TracerProviderBuilder builder,
        IncidentModeService incidentModeService,
        double baseSamplingRatio)
    {
        ArgumentNullException.ThrowIfNull(builder);
        ArgumentNullException.ThrowIfNull(incidentModeService);

        return builder.SetSampler(new IncidentModeSampler(incidentModeService, baseSamplingRatio));
    }
}

/// <summary>
/// Background service that periodically checks incident mode expiration.
/// </summary>
public sealed class IncidentModeExpirationWorker : BackgroundService
{
    // Back-off applied after a failed expiration check.
    private static readonly TimeSpan ErrorRetryDelay = TimeSpan.FromSeconds(30);

    private readonly IncidentModeService _incidentModeService;
    private readonly ILogger _logger;
    private readonly TimeSpan _checkInterval = TimeSpan.FromMinutes(1);

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="incidentModeService">Service whose expiration is checked.</param>
    /// <param name="logger">Logger for lifecycle and error messages.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public IncidentModeExpirationWorker(
        IncidentModeService incidentModeService,
        ILogger logger)
    {
        _incidentModeService = incidentModeService ?? throw new ArgumentNullException(nameof(incidentModeService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Calls <see cref="IncidentModeService.CheckExpiration"/> once per check interval
    /// until <paramref name="stoppingToken"/> is cancelled. A failed check is logged and
    /// retried after a shorter back-off.
    /// </summary>
    /// <param name="stoppingToken">Signals that the worker should stop.</param>
    /// <returns>A task that completes when the worker has stopped.</returns>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogDebug("Incident mode expiration worker started.");

        try
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                var delay = _checkInterval;

                try
                {
                    _incidentModeService.CheckExpiration();
                }
                catch (Exception ex) when (ex is not OperationCanceledException || !stoppingToken.IsCancellationRequested)
                {
                    _logger.LogError(ex, "Error checking incident mode expiration.");
                    delay = ErrorRetryDelay;
                }

                // Both the regular interval and the error back-off are awaited here so
                // that cancellation during either wait is handled by the catch below.
                await Task.Delay(delay, stoppingToken);
            }
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            // Shutdown requested; fall through to the stop log.
        }

        _logger.LogDebug("Incident mode expiration worker stopped.");
    }
}