up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled
This commit is contained in:
@@ -1,211 +1,211 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using OpenTelemetry.Trace;
|
||||
|
||||
namespace StellaOps.Policy.Engine.Telemetry;
|
||||
|
||||
/// <summary>
/// Service for managing incident mode, which forces 100% trace sampling
/// during critical periods.
/// </summary>
/// <remarks>
/// <para>
/// The state is a single immutable <see cref="IncidentModeState"/> reference that is
/// replaced atomically, so readers always observe a consistent snapshot.
/// </para>
/// <para>
/// Expiration is only enforced when <see cref="CheckExpiration"/> is called;
/// <see cref="IsActive"/> does not look at the clock.
/// </para>
/// <para>
/// NOTE(review): the original description also mentioned "extended retention";
/// nothing in this type implements it — confirm where that is applied.
/// </para>
/// </remarks>
public sealed class IncidentModeService
{
    // Shared "off" snapshot; the record is immutable so one instance can be reused.
    private static readonly IncidentModeState InactiveState = new(false, null, null, null);

    private readonly ILogger<IncidentModeService> _logger;
    private readonly TimeProvider _timeProvider;
    private readonly IOptionsMonitor<PolicyEngineTelemetryOptions> _optionsMonitor;

    // Accessed only through Volatile/Interlocked after construction.
    private IncidentModeState _state = InactiveState;

    /// <summary>
    /// Creates the service. When the current options have <c>IncidentMode</c> set,
    /// the service starts active with reason <c>"configuration"</c> and no expiry.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public IncidentModeService(
        ILogger<IncidentModeService> logger,
        TimeProvider timeProvider,
        IOptionsMonitor<PolicyEngineTelemetryOptions> optionsMonitor)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _optionsMonitor = optionsMonitor ?? throw new ArgumentNullException(nameof(optionsMonitor));

        // Initialize from configuration
        if (_optionsMonitor.CurrentValue.IncidentMode)
        {
            _state = new IncidentModeState(
                true,
                _timeProvider.GetUtcNow(),
                null,
                "configuration");
        }
    }

    /// <summary>
    /// Gets the current incident mode state.
    /// </summary>
    public IncidentModeState State => Volatile.Read(ref _state);

    /// <summary>
    /// Gets whether incident mode is currently active.
    /// </summary>
    public bool IsActive => Volatile.Read(ref _state).IsActive;

    /// <summary>
    /// Enables incident mode, replacing any previous activation.
    /// </summary>
    /// <param name="reason">Reason for enabling incident mode.</param>
    /// <param name="duration">
    /// Optional duration after which incident mode auto-disables. A zero or negative
    /// value produces an activation that the next <see cref="CheckExpiration"/> removes.
    /// </param>
    public void Enable(string reason, TimeSpan? duration = null)
    {
        var now = _timeProvider.GetUtcNow();
        DateTimeOffset? expiresAt = duration is { } lifetime ? now.Add(lifetime) : null;

        Volatile.Write(ref _state, new IncidentModeState(true, now, expiresAt, reason));

        _logger.LogWarning(
            "Incident mode ENABLED. Reason: {Reason}, ExpiresAt: {ExpiresAt}",
            reason,
            expiresAt?.ToString("O") ?? "never");

        PolicyEngineTelemetry.RecordError("incident_mode_enabled", null);
    }

    /// <summary>
    /// Disables incident mode. Logs only when the call actually turned it off.
    /// </summary>
    /// <param name="reason">Reason for disabling incident mode.</param>
    public void Disable(string reason)
    {
        // Atomic swap: with concurrent callers, exactly one sees the active snapshot
        // and therefore exactly one logs the transition.
        var previous = Interlocked.Exchange(ref _state, InactiveState);
        if (previous.IsActive)
        {
            LogDisabled(reason);
        }
    }

    /// <summary>
    /// Checks if incident mode should be auto-disabled due to expiration.
    /// </summary>
    public void CheckExpiration()
    {
        var snapshot = Volatile.Read(ref _state);
        if (!snapshot.IsActive || snapshot.ExpiresAt is not { } expiresAt)
        {
            return;
        }

        if (_timeProvider.GetUtcNow() < expiresAt)
        {
            return;
        }

        // Only clear the exact activation that expired. If Enable() ran after the
        // snapshot was taken, the compare-exchange fails and the newer activation
        // is left untouched.
        var witnessed = Interlocked.CompareExchange(ref _state, InactiveState, snapshot);
        if (ReferenceEquals(witnessed, snapshot))
        {
            LogDisabled("auto-expired");
        }
    }

    /// <summary>
    /// Gets the effective sampling ratio, considering incident mode.
    /// </summary>
    /// <returns>
    /// <c>1.0</c> while incident mode is active; otherwise the configured
    /// <c>TraceSamplingRatio</c> from the current options.
    /// </returns>
    public double GetEffectiveSamplingRatio()
    {
        if (IsActive)
        {
            return 1.0; // 100% sampling during incident mode
        }

        return _optionsMonitor.CurrentValue.TraceSamplingRatio;
    }

    // Single place for the "disabled" log line, shared by Disable and CheckExpiration.
    private void LogDisabled(string reason)
        => _logger.LogInformation("Incident mode DISABLED. Reason: {Reason}", reason);
}
|
||||
|
||||
/// <summary>
/// Immutable snapshot of incident mode.
/// </summary>
/// <param name="IsActive">Whether incident mode is on.</param>
/// <param name="ActivatedAt">When incident mode was switched on; <see langword="null"/> in the inactive snapshot.</param>
/// <param name="ExpiresAt">When the activation should auto-disable; <see langword="null"/> when no expiry was requested.</param>
/// <param name="Reason">Reason supplied when incident mode was enabled; <see langword="null"/> in the inactive snapshot.</param>
public sealed record IncidentModeState(bool IsActive, DateTimeOffset? ActivatedAt, DateTimeOffset? ExpiresAt, string? Reason);
|
||||
|
||||
/// <summary>
/// Trace sampler that samples everything while incident mode is active and
/// otherwise defers to a fixed-ratio sampler.
/// </summary>
public sealed class IncidentModeSampler : Sampler
{
    private readonly IncidentModeService _incidentModeService;
    private readonly Sampler _baseSampler;

    /// <summary>
    /// Creates the sampler.
    /// </summary>
    /// <param name="incidentModeService">Source of the incident mode flag.</param>
    /// <param name="baseSamplingRatio">Ratio handed to the <see cref="TraceIdRatioBasedSampler"/> used outside incident mode.</param>
    /// <exception cref="ArgumentNullException"><paramref name="incidentModeService"/> is <see langword="null"/>.</exception>
    public IncidentModeSampler(IncidentModeService incidentModeService, double baseSamplingRatio)
    {
        ArgumentNullException.ThrowIfNull(incidentModeService);

        _incidentModeService = incidentModeService;
        _baseSampler = new TraceIdRatioBasedSampler(baseSamplingRatio);
    }

    /// <inheritdoc />
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
        // Incident mode wins; the ratio sampler is consulted only when it is off.
        => _incidentModeService.IsActive
            ? new SamplingResult(SamplingDecision.RecordAndSample)
            : _baseSampler.ShouldSample(samplingParameters);
}
|
||||
|
||||
/// <summary>
/// Tracer-provider wiring helpers for incident mode.
/// </summary>
public static class IncidentModeExtensions
{
    /// <summary>
    /// Installs an <see cref="IncidentModeSampler"/> as the sampler of the tracer provider.
    /// </summary>
    /// <param name="builder">Tracer provider builder to configure.</param>
    /// <param name="incidentModeService">Service the sampler consults for the incident mode flag.</param>
    /// <param name="baseSamplingRatio">Sampling ratio used while incident mode is off.</param>
    /// <returns>The value returned by <c>builder.SetSampler</c>.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="builder"/> or <paramref name="incidentModeService"/> is <see langword="null"/>.
    /// </exception>
    public static TracerProviderBuilder SetIncidentModeSampler(
        this TracerProviderBuilder builder,
        IncidentModeService incidentModeService,
        double baseSamplingRatio)
    {
        if (builder is null)
        {
            throw new ArgumentNullException(nameof(builder));
        }

        if (incidentModeService is null)
        {
            throw new ArgumentNullException(nameof(incidentModeService));
        }

        var sampler = new IncidentModeSampler(incidentModeService, baseSamplingRatio);
        return builder.SetSampler(sampler);
    }
}
|
||||
|
||||
/// <summary>
/// Background service that periodically checks incident mode expiration.
/// </summary>
/// <remarks>
/// Calls <see cref="IncidentModeService.CheckExpiration"/> once per minute. After a
/// failed check it waits 30 seconds instead before trying again.
/// </remarks>
public sealed class IncidentModeExpirationWorker : BackgroundService
{
    // Shorter wait used after a failed check.
    private static readonly TimeSpan RetryInterval = TimeSpan.FromSeconds(30);

    private readonly IncidentModeService _incidentModeService;
    private readonly ILogger<IncidentModeExpirationWorker> _logger;
    private readonly TimeSpan _checkInterval = TimeSpan.FromMinutes(1);

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public IncidentModeExpirationWorker(
        IncidentModeService incidentModeService,
        ILogger<IncidentModeExpirationWorker> logger)
    {
        _incidentModeService = incidentModeService ?? throw new ArgumentNullException(nameof(incidentModeService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the check loop until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogDebug("Incident mode expiration worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            var delay = _checkInterval;

            try
            {
                _incidentModeService.CheckExpiration();
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error checking incident mode expiration.");
                delay = RetryInterval;
            }

            // One cancellation-guarded wait for both the normal and the back-off path.
            // Previously the back-off delay ran inside the catch block, so a shutdown
            // during back-off threw out of ExecuteAsync and skipped the "stopped" log.
            try
            {
                await Task.Delay(delay, stoppingToken);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
        }

        _logger.LogDebug("Incident mode expiration worker stopped.");
    }
}
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using OpenTelemetry.Trace;
|
||||
|
||||
namespace StellaOps.Policy.Engine.Telemetry;
|
||||
|
||||
/// <summary>
/// Service for managing incident mode, which forces 100% trace sampling
/// during critical periods.
/// </summary>
/// <remarks>
/// <para>
/// The state is a single immutable <see cref="IncidentModeState"/> reference that is
/// replaced atomically, so readers always observe a consistent snapshot.
/// </para>
/// <para>
/// Expiration is only enforced when <see cref="CheckExpiration"/> is called;
/// <see cref="IsActive"/> does not look at the clock.
/// </para>
/// <para>
/// NOTE(review): the original description also mentioned "extended retention";
/// nothing in this type implements it — confirm where that is applied.
/// </para>
/// </remarks>
public sealed class IncidentModeService
{
    // Shared "off" snapshot; the record is immutable so one instance can be reused.
    private static readonly IncidentModeState InactiveState = new(false, null, null, null);

    private readonly ILogger<IncidentModeService> _logger;
    private readonly TimeProvider _timeProvider;
    private readonly IOptionsMonitor<PolicyEngineTelemetryOptions> _optionsMonitor;

    // Accessed only through Volatile/Interlocked after construction.
    private IncidentModeState _state = InactiveState;

    /// <summary>
    /// Creates the service. When the current options have <c>IncidentMode</c> set,
    /// the service starts active with reason <c>"configuration"</c> and no expiry.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public IncidentModeService(
        ILogger<IncidentModeService> logger,
        TimeProvider timeProvider,
        IOptionsMonitor<PolicyEngineTelemetryOptions> optionsMonitor)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _optionsMonitor = optionsMonitor ?? throw new ArgumentNullException(nameof(optionsMonitor));

        // Initialize from configuration
        if (_optionsMonitor.CurrentValue.IncidentMode)
        {
            _state = new IncidentModeState(
                true,
                _timeProvider.GetUtcNow(),
                null,
                "configuration");
        }
    }

    /// <summary>
    /// Gets the current incident mode state.
    /// </summary>
    public IncidentModeState State => Volatile.Read(ref _state);

    /// <summary>
    /// Gets whether incident mode is currently active.
    /// </summary>
    public bool IsActive => Volatile.Read(ref _state).IsActive;

    /// <summary>
    /// Enables incident mode, replacing any previous activation.
    /// </summary>
    /// <param name="reason">Reason for enabling incident mode.</param>
    /// <param name="duration">
    /// Optional duration after which incident mode auto-disables. A zero or negative
    /// value produces an activation that the next <see cref="CheckExpiration"/> removes.
    /// </param>
    public void Enable(string reason, TimeSpan? duration = null)
    {
        var now = _timeProvider.GetUtcNow();
        DateTimeOffset? expiresAt = duration is { } lifetime ? now.Add(lifetime) : null;

        Volatile.Write(ref _state, new IncidentModeState(true, now, expiresAt, reason));

        _logger.LogWarning(
            "Incident mode ENABLED. Reason: {Reason}, ExpiresAt: {ExpiresAt}",
            reason,
            expiresAt?.ToString("O") ?? "never");

        PolicyEngineTelemetry.RecordError("incident_mode_enabled", null);
    }

    /// <summary>
    /// Disables incident mode. Logs only when the call actually turned it off.
    /// </summary>
    /// <param name="reason">Reason for disabling incident mode.</param>
    public void Disable(string reason)
    {
        // Atomic swap: with concurrent callers, exactly one sees the active snapshot
        // and therefore exactly one logs the transition.
        var previous = Interlocked.Exchange(ref _state, InactiveState);
        if (previous.IsActive)
        {
            LogDisabled(reason);
        }
    }

    /// <summary>
    /// Checks if incident mode should be auto-disabled due to expiration.
    /// </summary>
    public void CheckExpiration()
    {
        var snapshot = Volatile.Read(ref _state);
        if (!snapshot.IsActive || snapshot.ExpiresAt is not { } expiresAt)
        {
            return;
        }

        if (_timeProvider.GetUtcNow() < expiresAt)
        {
            return;
        }

        // Only clear the exact activation that expired. If Enable() ran after the
        // snapshot was taken, the compare-exchange fails and the newer activation
        // is left untouched.
        var witnessed = Interlocked.CompareExchange(ref _state, InactiveState, snapshot);
        if (ReferenceEquals(witnessed, snapshot))
        {
            LogDisabled("auto-expired");
        }
    }

    /// <summary>
    /// Gets the effective sampling ratio, considering incident mode.
    /// </summary>
    /// <returns>
    /// <c>1.0</c> while incident mode is active; otherwise the configured
    /// <c>TraceSamplingRatio</c> from the current options.
    /// </returns>
    public double GetEffectiveSamplingRatio()
    {
        if (IsActive)
        {
            return 1.0; // 100% sampling during incident mode
        }

        return _optionsMonitor.CurrentValue.TraceSamplingRatio;
    }

    // Single place for the "disabled" log line, shared by Disable and CheckExpiration.
    private void LogDisabled(string reason)
        => _logger.LogInformation("Incident mode DISABLED. Reason: {Reason}", reason);
}
|
||||
|
||||
/// <summary>
/// Immutable snapshot of incident mode.
/// </summary>
/// <param name="IsActive">Whether incident mode is on.</param>
/// <param name="ActivatedAt">When incident mode was switched on; <see langword="null"/> in the inactive snapshot.</param>
/// <param name="ExpiresAt">When the activation should auto-disable; <see langword="null"/> when no expiry was requested.</param>
/// <param name="Reason">Reason supplied when incident mode was enabled; <see langword="null"/> in the inactive snapshot.</param>
public sealed record IncidentModeState(bool IsActive, DateTimeOffset? ActivatedAt, DateTimeOffset? ExpiresAt, string? Reason);
|
||||
|
||||
/// <summary>
/// Trace sampler that samples everything while incident mode is active and
/// otherwise defers to a fixed-ratio sampler.
/// </summary>
public sealed class IncidentModeSampler : Sampler
{
    private readonly IncidentModeService _incidentModeService;
    private readonly Sampler _baseSampler;

    /// <summary>
    /// Creates the sampler.
    /// </summary>
    /// <param name="incidentModeService">Source of the incident mode flag.</param>
    /// <param name="baseSamplingRatio">Ratio handed to the <see cref="TraceIdRatioBasedSampler"/> used outside incident mode.</param>
    /// <exception cref="ArgumentNullException"><paramref name="incidentModeService"/> is <see langword="null"/>.</exception>
    public IncidentModeSampler(IncidentModeService incidentModeService, double baseSamplingRatio)
    {
        ArgumentNullException.ThrowIfNull(incidentModeService);

        _incidentModeService = incidentModeService;
        _baseSampler = new TraceIdRatioBasedSampler(baseSamplingRatio);
    }

    /// <inheritdoc />
    public override SamplingResult ShouldSample(in SamplingParameters samplingParameters)
        // Incident mode wins; the ratio sampler is consulted only when it is off.
        => _incidentModeService.IsActive
            ? new SamplingResult(SamplingDecision.RecordAndSample)
            : _baseSampler.ShouldSample(samplingParameters);
}
|
||||
|
||||
/// <summary>
/// Tracer-provider wiring helpers for incident mode.
/// </summary>
public static class IncidentModeExtensions
{
    /// <summary>
    /// Installs an <see cref="IncidentModeSampler"/> as the sampler of the tracer provider.
    /// </summary>
    /// <param name="builder">Tracer provider builder to configure.</param>
    /// <param name="incidentModeService">Service the sampler consults for the incident mode flag.</param>
    /// <param name="baseSamplingRatio">Sampling ratio used while incident mode is off.</param>
    /// <returns>The value returned by <c>builder.SetSampler</c>.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="builder"/> or <paramref name="incidentModeService"/> is <see langword="null"/>.
    /// </exception>
    public static TracerProviderBuilder SetIncidentModeSampler(
        this TracerProviderBuilder builder,
        IncidentModeService incidentModeService,
        double baseSamplingRatio)
    {
        if (builder is null)
        {
            throw new ArgumentNullException(nameof(builder));
        }

        if (incidentModeService is null)
        {
            throw new ArgumentNullException(nameof(incidentModeService));
        }

        var sampler = new IncidentModeSampler(incidentModeService, baseSamplingRatio);
        return builder.SetSampler(sampler);
    }
}
|
||||
|
||||
/// <summary>
/// Background service that periodically checks incident mode expiration.
/// </summary>
/// <remarks>
/// Calls <see cref="IncidentModeService.CheckExpiration"/> once per minute. After a
/// failed check it waits 30 seconds instead before trying again.
/// </remarks>
public sealed class IncidentModeExpirationWorker : BackgroundService
{
    // Shorter wait used after a failed check.
    private static readonly TimeSpan RetryInterval = TimeSpan.FromSeconds(30);

    private readonly IncidentModeService _incidentModeService;
    private readonly ILogger<IncidentModeExpirationWorker> _logger;
    private readonly TimeSpan _checkInterval = TimeSpan.FromMinutes(1);

    /// <summary>
    /// Creates the worker.
    /// </summary>
    /// <exception cref="ArgumentNullException">Any argument is <see langword="null"/>.</exception>
    public IncidentModeExpirationWorker(
        IncidentModeService incidentModeService,
        ILogger<IncidentModeExpirationWorker> logger)
    {
        _incidentModeService = incidentModeService ?? throw new ArgumentNullException(nameof(incidentModeService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the check loop until <paramref name="stoppingToken"/> is cancelled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        _logger.LogDebug("Incident mode expiration worker started.");

        while (!stoppingToken.IsCancellationRequested)
        {
            var delay = _checkInterval;

            try
            {
                _incidentModeService.CheckExpiration();
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error checking incident mode expiration.");
                delay = RetryInterval;
            }

            // One cancellation-guarded wait for both the normal and the back-off path.
            // Previously the back-off delay ran inside the catch block, so a shutdown
            // during back-off threw out of ExecuteAsync and skipped the "stopped" log.
            try
            {
                await Task.Delay(delay, stoppingToken);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
        }

        _logger.LogDebug("Incident mode expiration worker stopped.");
    }
}
|
||||
|
||||
Reference in New Issue
Block a user