213 lines
6.9 KiB
C#
213 lines
6.9 KiB
C#
using System.Diagnostics;
|
|
using System.Globalization;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using OpenTelemetry.Trace;
|
|
|
|
namespace StellaOps.Policy.Engine.Telemetry;
|
|
|
|
/// <summary>
/// Service for managing incident mode, which enables 100% trace sampling
/// and extended retention during critical periods.
/// </summary>
public sealed class IncidentModeService
{
    /// <summary>Shared immutable snapshot meaning "incident mode is off".</summary>
    private static readonly IncidentModeState Inactive = new(false, null, null, null);

    private readonly ILogger<IncidentModeService> _logger;
    private readonly TimeProvider _timeProvider;
    private readonly IOptionsMonitor<PolicyEngineTelemetryOptions> _optionsMonitor;

    // The state is an immutable record that is swapped wholesale on every transition.
    // Readers take a single volatile read; writers that depend on the previous value
    // go through Interlocked so that concurrent Enable/Disable/CheckExpiration calls
    // cannot overwrite each other's result. (Passing a volatile field by ref to an
    // Interlocked API is the documented exception to warning CS0420.)
    private volatile IncidentModeState _state = Inactive;

    /// <summary>
    /// Creates the service and, when the current options have <c>IncidentMode</c> set,
    /// starts in the active state with reason <c>"configuration"</c> and no expiry.
    /// </summary>
    /// <param name="logger">Logger used for enable/disable notifications.</param>
    /// <param name="timeProvider">Clock used for activation and expiry timestamps.</param>
    /// <param name="optionsMonitor">Source of the telemetry options.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public IncidentModeService(
        ILogger<IncidentModeService> logger,
        TimeProvider timeProvider,
        IOptionsMonitor<PolicyEngineTelemetryOptions> optionsMonitor)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _optionsMonitor = optionsMonitor ?? throw new ArgumentNullException(nameof(optionsMonitor));

        // Initialize from configuration. Options are read once here; later option
        // changes do not toggle incident mode.
        if (_optionsMonitor.CurrentValue.IncidentMode)
        {
            _state = new IncidentModeState(
                true,
                _timeProvider.GetUtcNow(),
                null,
                "configuration");
        }
    }

    /// <summary>
    /// Gets the current incident mode state.
    /// </summary>
    public IncidentModeState State => _state;

    /// <summary>
    /// Gets whether incident mode is currently active.
    /// </summary>
    /// <remarks>
    /// Expiry is only applied when <see cref="CheckExpiration"/> runs, so this can
    /// stay <c>true</c> for a while after <see cref="IncidentModeState.ExpiresAt"/>
    /// has passed.
    /// </remarks>
    public bool IsActive => _state.IsActive;

    /// <summary>
    /// Enables incident mode. Any existing state is replaced, so calling this while
    /// already active restarts the activation time and expiry.
    /// </summary>
    /// <param name="reason">Reason for enabling incident mode.</param>
    /// <param name="duration">Optional duration after which incident mode auto-disables.</param>
    public void Enable(string reason, TimeSpan? duration = null)
    {
        var now = _timeProvider.GetUtcNow();
        var expiresAt = duration.HasValue ? now.Add(duration.Value) : (DateTimeOffset?)null;

        _state = new IncidentModeState(true, now, expiresAt, reason);

        _logger.LogWarning(
            "Incident mode ENABLED. Reason: {Reason}, ExpiresAt: {ExpiresAt}",
            reason,
            expiresAt?.ToString("O", CultureInfo.InvariantCulture) ?? "never");

        PolicyEngineTelemetry.RecordError("incident_mode_enabled", null);
    }

    /// <summary>
    /// Disables incident mode. Logs only when incident mode was actually active.
    /// </summary>
    /// <param name="reason">Reason for disabling incident mode.</param>
    public void Disable(string reason)
    {
        // Exchange returns the state this call actually replaced, so the log decision
        // is based on that value rather than on a separate, possibly stale, read.
        var previous = Interlocked.Exchange(ref _state, Inactive);

        if (previous.IsActive)
        {
            LogDisabled(reason);
        }
    }

    /// <summary>
    /// Checks if incident mode should be auto-disabled due to expiration.
    /// </summary>
    public void CheckExpiration()
    {
        var observed = _state;
        if (!observed.IsActive || !observed.ExpiresAt.HasValue)
        {
            return;
        }

        if (_timeProvider.GetUtcNow() < observed.ExpiresAt.Value)
        {
            return;
        }

        // Clear the state only if it is still the exact snapshot that was judged
        // expired. If Enable() or Disable() ran in between, their state wins and
        // this expiry check becomes a no-op instead of cancelling a fresh incident.
        var replaced = Interlocked.CompareExchange(ref _state, Inactive, observed);
        if (ReferenceEquals(replaced, observed))
        {
            LogDisabled("auto-expired");
        }
    }

    /// <summary>
    /// Gets the effective sampling ratio, considering incident mode.
    /// </summary>
    /// <returns>
    /// <c>1.0</c> while incident mode is active; otherwise the currently configured
    /// <c>TraceSamplingRatio</c>.
    /// </returns>
    public double GetEffectiveSamplingRatio()
    {
        if (_state.IsActive)
        {
            return 1.0; // 100% sampling during incident mode
        }

        return _optionsMonitor.CurrentValue.TraceSamplingRatio;
    }

    /// <summary>Writes the single "disabled" log entry shared by manual and automatic disable.</summary>
    private void LogDisabled(string reason)
    {
        _logger.LogInformation("Incident mode DISABLED. Reason: {Reason}", reason);
    }
}
|
|
|
|
/// <summary>
/// Represents the current state of incident mode.
/// </summary>
public sealed record IncidentModeState(
    // True while incident mode is switched on.
    bool IsActive,

    // When incident mode was switched on; null in the inactive state.
    DateTimeOffset? ActivatedAt,

    // When incident mode should auto-disable; null when no duration was given.
    DateTimeOffset? ExpiresAt,

    // Why incident mode was switched on; null in the inactive state.
    string? Reason);
|
|
|
|
/// <summary>
/// A trace sampler that respects incident mode settings.
/// </summary>
public sealed class IncidentModeSampler : Sampler
{
    private readonly IncidentModeService _incidentModeService;

    // Ratio-based sampler consulted whenever incident mode is not active.
    private readonly Sampler _ratioSampler;

    /// <summary>
    /// Creates a sampler that defers to a <see cref="TraceIdRatioBasedSampler"/>
    /// outside of incident mode.
    /// </summary>
    /// <param name="incidentModeService">Service queried for the incident mode flag on every decision.</param>
    /// <param name="baseSamplingRatio">Ratio handed to the ratio-based sampler.</param>
    /// <exception cref="ArgumentNullException"><paramref name="incidentModeService"/> is <c>null</c>.</exception>
    public IncidentModeSampler(IncidentModeService incidentModeService, double baseSamplingRatio)
    {
        ArgumentNullException.ThrowIfNull(incidentModeService);

        _incidentModeService = incidentModeService;
        _ratioSampler = new TraceIdRatioBasedSampler(baseSamplingRatio);
    }

    /// <summary>
    /// Records and samples every span while incident mode is active; otherwise
    /// returns whatever the ratio-based sampler decides.
    /// </summary>
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
        => _incidentModeService.IsActive
            ? new SamplingResult(SamplingDecision.RecordAndSample)
            : _ratioSampler.ShouldSample(samplingParameters);
}
|
|
|
|
/// <summary>
/// Extension methods for configuring incident mode.
/// </summary>
public static class IncidentModeExtensions
{
    /// <summary>
    /// Installs an <see cref="IncidentModeSampler"/> as the sampler of the tracer provider.
    /// </summary>
    /// <param name="builder">Tracer provider builder to configure.</param>
    /// <param name="incidentModeService">Service the sampler queries for the incident mode flag.</param>
    /// <param name="baseSamplingRatio">Sampling ratio passed to the sampler for use outside incident mode.</param>
    /// <returns>The value returned by <c>SetSampler</c> on <paramref name="builder"/>.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="builder"/> or <paramref name="incidentModeService"/> is <c>null</c>.
    /// </exception>
    public static TracerProviderBuilder SetIncidentModeSampler(
        this TracerProviderBuilder builder,
        IncidentModeService incidentModeService,
        double baseSamplingRatio)
    {
        ArgumentNullException.ThrowIfNull(builder);
        ArgumentNullException.ThrowIfNull(incidentModeService);

        var sampler = new IncidentModeSampler(incidentModeService, baseSamplingRatio);
        return builder.SetSampler(sampler);
    }
}
|
|
|
|
/// <summary>
/// Background service that periodically checks incident mode expiration.
/// </summary>
public sealed class IncidentModeExpirationWorker : BackgroundService
{
    private readonly IncidentModeService _incidentModeService;
    private readonly ILogger<IncidentModeExpirationWorker> _logger;

    // Normal pause between two expiration checks.
    private readonly TimeSpan _checkInterval = TimeSpan.FromMinutes(1);

    // Shorter pause used after a failed check so the next attempt comes sooner.
    private readonly TimeSpan _errorRetryDelay = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <param name="incidentModeService">Service whose expiration is checked on every iteration.</param>
    /// <param name="logger">Logger for lifecycle and error messages.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public IncidentModeExpirationWorker(
        IncidentModeService incidentModeService,
        ILogger<IncidentModeExpirationWorker> logger)
    {
        _incidentModeService = incidentModeService ?? throw new ArgumentNullException(nameof(incidentModeService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Calls <see cref="IncidentModeService.CheckExpiration"/> in a loop until
    /// <paramref name="stoppingToken"/> is cancelled, waiting one minute after a
    /// successful check and thirty seconds after a failed one.
    /// </summary>
    /// <param name="stoppingToken">Signals that the host is shutting the worker down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogDebug("Incident mode expiration worker started.");

        try
        {
            while (!stoppingToken.IsCancellationRequested)
            {
                var delay = _checkInterval;

                try
                {
                    _incidentModeService.CheckExpiration();
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Error checking incident mode expiration.");
                    delay = _errorRetryDelay;
                }

                // The wait happens outside the catch block on purpose: an exception
                // thrown inside a catch handler is not seen by sibling handlers, so
                // a cancellation during the retry delay used to escape ExecuteAsync.
                await Task.Delay(delay, stoppingToken);
            }
        }
        catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
        {
            // Expected during shutdown: fall through to the "stopped" log below.
        }

        _logger.LogDebug("Incident mode expiration worker stopped.");
    }
}
|