Implement incident mode management service and models
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
- Added the IPackRunIncidentModeService interface for managing incident mode activation, deactivation, and status retrieval.
- Created the PackRunIncidentModeService class, which implements the service interface with methods for activating, deactivating, and escalating incident mode.
- Introduced the incident mode status model (PackRunIncidentModeStatus) and related enums for escalation levels and activation sources.
- Developed retention policy, telemetry settings, and debug capture settings models to manage incident mode configuration.
- Implemented SLO breach notification handling to activate incident mode based on severity.
- Added an in-memory store (InMemoryPackRunIncidentModeStore) for testing purposes.
- Created comprehensive unit tests for the incident mode service, covering activation, deactivation, status retrieval, and SLO breach handling.
This commit is contained in:
@@ -328,6 +328,18 @@ public static class PackRunEventTypes
|
||||
/// <summary>Attestation was revoked.</summary>
|
||||
public const string AttestationRevoked = "pack.attestation.revoked";
|
||||
|
||||
/// <summary>Incident mode activated (per TASKRUN-OBS-55-001).</summary>
|
||||
public const string IncidentModeActivated = "pack.incident.activated";
|
||||
|
||||
/// <summary>Incident mode deactivated.</summary>
|
||||
public const string IncidentModeDeactivated = "pack.incident.deactivated";
|
||||
|
||||
/// <summary>Incident mode escalated to higher level.</summary>
|
||||
public const string IncidentModeEscalated = "pack.incident.escalated";
|
||||
|
||||
/// <summary>SLO breach detected triggering incident mode.</summary>
|
||||
public const string SloBreachDetected = "pack.incident.slo_breach";
|
||||
|
||||
/// <summary>
/// Determines whether <paramref name="eventType"/> begins with <c>Prefix</c>,
/// using an ordinal (case-sensitive, culture-independent) comparison.
/// </summary>
/// <param name="eventType">Event type string to inspect.</param>
/// <returns><c>true</c> when the event type starts with <c>Prefix</c>; otherwise <c>false</c>.</returns>
public static bool IsPackRunEvent(string eventType)
{
    return eventType.StartsWith(Prefix, StringComparison.Ordinal);
}
|
||||
|
||||
@@ -0,0 +1,534 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.TaskRunner.Core.Events;
|
||||
|
||||
namespace StellaOps.TaskRunner.Core.IncidentMode;
|
||||
|
||||
/// <summary>
/// Contract for activating, deactivating, escalating and querying incident mode on pack runs.
/// Per TASKRUN-OBS-55-001.
/// </summary>
public interface IPackRunIncidentModeService
{
    /// <summary>Turns incident mode on for the run named in <paramref name="request"/>.</summary>
    /// <param name="request">Run, tenant, level, source, reason and optional duration of the activation.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of the activation together with the resulting status.</returns>
    Task<IncidentModeActivationResult> ActivateAsync(IncidentModeActivationRequest request, CancellationToken cancellationToken = default);

    /// <summary>Turns incident mode off for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="reason">Optional free-text reason for the deactivation.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of the deactivation together with the resulting status.</returns>
    Task<IncidentModeActivationResult> DeactivateAsync(string runId, string? reason = null, CancellationToken cancellationToken = default);

    /// <summary>Reads the incident mode status currently associated with a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The status for the run.</returns>
    Task<PackRunIncidentModeStatus> GetStatusAsync(string runId, CancellationToken cancellationToken = default);

    /// <summary>Processes an SLO breach notification.</summary>
    /// <param name="notification">The breach payload.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of handling the breach together with the resulting status.</returns>
    Task<IncidentModeActivationResult> HandleSloBreachAsync(SloBreachNotification notification, CancellationToken cancellationToken = default);

    /// <summary>Raises incident mode for a run to a higher escalation level.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="newLevel">Level to escalate to.</param>
    /// <param name="reason">Optional free-text reason for the escalation.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of the escalation together with the resulting status.</returns>
    Task<IncidentModeActivationResult> EscalateAsync(string runId, IncidentEscalationLevel newLevel, string? reason = null, CancellationToken cancellationToken = default);

    /// <summary>Returns the settings bundle that applies at the given escalation level.</summary>
    /// <param name="level">Escalation level to look up.</param>
    /// <returns>Retention, telemetry and debug-capture settings for <paramref name="level"/>.</returns>
    IncidentModeSettings GetSettingsForLevel(IncidentEscalationLevel level);
}
|
||||
|
||||
/// <summary>
/// Persistence abstraction for per-run incident mode state, keyed by run ID.
/// </summary>
public interface IPackRunIncidentModeStore
{
    /// <summary>Saves the incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="status">Status to persist.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task StoreAsync(string runId, PackRunIncidentModeStatus status, CancellationToken cancellationToken = default);

    /// <summary>Loads the incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The stored status, or <c>null</c> when there is none.</returns>
    Task<PackRunIncidentModeStatus?> GetAsync(string runId, CancellationToken cancellationToken = default);

    /// <summary>Enumerates the IDs of all runs that are in incident mode.</summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>Run IDs currently in incident mode.</returns>
    Task<IReadOnlyList<string>> ListActiveRunsAsync(CancellationToken cancellationToken = default);

    /// <summary>Deletes the incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task RemoveAsync(string runId, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Settings that apply at a single incident mode escalation level.
/// </summary>
/// <param name="Level">Escalation level.</param>
/// <param name="RetentionPolicy">Retention policy.</param>
/// <param name="TelemetrySettings">Telemetry settings.</param>
/// <param name="DebugCaptureSettings">Debug capture settings.</param>
public sealed record IncidentModeSettings(
    IncidentEscalationLevel Level,
    IncidentRetentionPolicy RetentionPolicy,
    IncidentTelemetrySettings TelemetrySettings,
    IncidentDebugCaptureSettings DebugCaptureSettings);
|
||||
|
||||
/// <summary>
/// Default implementation of <see cref="IPackRunIncidentModeService"/>, backed by an
/// <see cref="IPackRunIncidentModeStore"/>.
/// </summary>
/// <remarks>
/// <para>
/// State changes are written to the store first. Timeline events are emitted afterwards on a
/// best-effort basis (only when an emitter was supplied): an emitter failure is logged and does
/// not turn an already-persisted state change into a reported failure.
/// </para>
/// <para>
/// Store failures are logged and surfaced through <see cref="IncidentModeActivationResult.Error"/>.
/// <see cref="OperationCanceledException"/> is never converted into a result; it propagates to the caller.
/// </para>
/// </remarks>
public sealed class PackRunIncidentModeService : IPackRunIncidentModeService
{
    // The stored status does not carry a tenant, so events raised without a request
    // (deactivation, escalation) and breaches without a tenant fall back to this value.
    private const string FallbackTenantId = "default";

    private const string DefaultDeactivationReason = "Manual deactivation";
    private const string DefaultEscalationReason = "Manual escalation";

    // Incident mode activated from an SLO breach auto-expires after this many minutes.
    private const int SloBreachDurationMinutes = 60;

    private readonly IPackRunIncidentModeStore _store;
    private readonly IPackRunTimelineEventEmitter? _timelineEmitter;
    private readonly ILogger<PackRunIncidentModeService> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>Creates the service.</summary>
    /// <param name="store">Store that persists incident mode status per run.</param>
    /// <param name="logger">Logger for state changes and failures.</param>
    /// <param name="timeProvider">Clock; defaults to <see cref="TimeProvider.System"/> when <c>null</c>.</param>
    /// <param name="timelineEmitter">Optional timeline emitter; when <c>null</c>, no timeline events are emitted.</param>
    /// <exception cref="ArgumentNullException"><paramref name="store"/> or <paramref name="logger"/> is <c>null</c>.</exception>
    public PackRunIncidentModeService(
        IPackRunIncidentModeStore store,
        ILogger<PackRunIncidentModeService> logger,
        TimeProvider? timeProvider = null,
        IPackRunTimelineEventEmitter? timelineEmitter = null)
    {
        _store = store ?? throw new ArgumentNullException(nameof(store));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _timelineEmitter = timelineEmitter;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Overwrites any status already stored for the run. When
    /// <see cref="IncidentModeActivationRequest.DurationMinutes"/> is set, the status expires that many
    /// minutes after the current time; otherwise it has no expiry.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
    public async Task<IncidentModeActivationResult> ActivateAsync(
        IncidentModeActivationRequest request,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        try
        {
            var now = _timeProvider.GetUtcNow();
            var settings = GetSettingsForLevel(request.Level);

            DateTimeOffset? expiresAt = request.DurationMinutes.HasValue
                ? now.AddMinutes(request.DurationMinutes.Value)
                : null;

            var status = new PackRunIncidentModeStatus(
                Active: true,
                Level: request.Level,
                ActivatedAt: now,
                ActivationReason: request.Reason,
                Source: request.Source,
                ExpiresAt: expiresAt,
                RetentionPolicy: settings.RetentionPolicy,
                TelemetrySettings: settings.TelemetrySettings,
                DebugCaptureSettings: settings.DebugCaptureSettings);

            await _store.StoreAsync(request.RunId, status, cancellationToken);

            await EmitTimelineEventAsync(
                request.TenantId,
                request.RunId,
                PackRunIncidentEventTypes.IncidentModeActivated,
                new Dictionary<string, string>
                {
                    ["level"] = request.Level.ToString(),
                    ["source"] = request.Source.ToString(),
                    ["reason"] = request.Reason,
                    ["requestedBy"] = request.RequestedBy ?? "system"
                },
                cancellationToken);

            _logger.LogWarning(
                "Incident mode activated for run {RunId} at level {Level} due to: {Reason}",
                request.RunId,
                request.Level,
                request.Reason);

            return new IncidentModeActivationResult(Success: true, Status: status, Error: null);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            // Cancellation is deliberately excluded: it must reach the caller, not become a result.
            _logger.LogError(ex, "Failed to activate incident mode for run {RunId}", request.RunId);
            return Failure(ex.Message);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Succeeds without doing anything when the run has no stored status or the stored status is
    /// not active. Otherwise the stored status is removed.
    /// </remarks>
    /// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
    public async Task<IncidentModeActivationResult> DeactivateAsync(
        string runId,
        string? reason = null,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var current = await _store.GetAsync(runId, cancellationToken);
            if (current is null || !current.Active)
            {
                return new IncidentModeActivationResult(
                    Success: true,
                    Status: PackRunIncidentModeStatus.Inactive(),
                    Error: null);
            }

            await _store.RemoveAsync(runId, cancellationToken);
            var inactive = PackRunIncidentModeStatus.Inactive();
            var effectiveReason = reason ?? DefaultDeactivationReason;

            await EmitTimelineEventAsync(
                FallbackTenantId,
                runId,
                PackRunIncidentEventTypes.IncidentModeDeactivated,
                new Dictionary<string, string>
                {
                    ["previousLevel"] = current.Level.ToString(),
                    ["reason"] = effectiveReason,
                    ["activeDuration"] = current.ActivatedAt.HasValue
                        ? (_timeProvider.GetUtcNow() - current.ActivatedAt.Value).ToString()
                        : "unknown"
                },
                cancellationToken);

            _logger.LogInformation(
                "Incident mode deactivated for run {RunId}. Reason: {Reason}",
                runId,
                effectiveReason);

            return new IncidentModeActivationResult(Success: true, Status: inactive, Error: null);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Failed to deactivate incident mode for run {RunId}", runId);
            return Failure(ex.Message);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Returns an inactive status when nothing is stored for the run. A stored status whose
    /// <see cref="PackRunIncidentModeStatus.ExpiresAt"/> is at or before the current time is removed
    /// from the store and reported as inactive. Store exceptions propagate to the caller.
    /// </remarks>
    public async Task<PackRunIncidentModeStatus> GetStatusAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        var status = await _store.GetAsync(runId, cancellationToken);
        if (status is null)
        {
            return PackRunIncidentModeStatus.Inactive();
        }

        if (status.ExpiresAt.HasValue && status.ExpiresAt.Value <= _timeProvider.GetUtcNow())
        {
            await _store.RemoveAsync(runId, cancellationToken);
            return PackRunIncidentModeStatus.Inactive();
        }

        return status;
    }

    /// <inheritdoc />
    /// <remarks>
    /// <para>
    /// The notification's <see cref="SloBreachNotification.ResourceId"/> is used as the run ID; a
    /// notification without one is rejected. Severity (case-insensitive) maps to the escalation
    /// level of the same name, with unknown or missing severities treated as Medium.
    /// </para>
    /// <para>
    /// An automatic breach never downgrades an incident: when the run is already in incident mode at
    /// a strictly higher level, the existing status is kept and returned as a success. Otherwise
    /// incident mode is (re)activated with a 60-minute expiry.
    /// </para>
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="notification"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
    public async Task<IncidentModeActivationResult> HandleSloBreachAsync(
        SloBreachNotification notification,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(notification);

        if (string.IsNullOrWhiteSpace(notification.ResourceId))
        {
            _logger.LogWarning(
                "Received SLO breach notification {BreachId} without resource ID, skipping incident activation",
                notification.BreachId);

            return Failure("No resource ID in SLO breach notification");
        }

        var level = notification.Severity?.ToUpperInvariant() switch
        {
            "CRITICAL" => IncidentEscalationLevel.Critical,
            "HIGH" => IncidentEscalationLevel.High,
            "MEDIUM" => IncidentEscalationLevel.Medium,
            "LOW" => IncidentEscalationLevel.Low,
            _ => IncidentEscalationLevel.Medium
        };

        _logger.LogWarning(
            "Processing SLO breach {BreachId} for {SloName} on resource {ResourceId}",
            notification.BreachId,
            notification.SloName,
            notification.ResourceId);

        try
        {
            // Expiry-aware read: an expired incident counts as inactive and may be replaced.
            var existing = await GetStatusAsync(notification.ResourceId, cancellationToken);
            if (existing.Active && existing.Level > level)
            {
                _logger.LogInformation(
                    "Run {RunId} is already in incident mode at level {ExistingLevel}; SLO breach {BreachId} at level {Level} does not downgrade it",
                    notification.ResourceId,
                    existing.Level,
                    notification.BreachId,
                    level);

                return new IncidentModeActivationResult(Success: true, Status: existing, Error: null);
            }
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(
                ex,
                "Failed to read incident mode status for run {RunId} while handling SLO breach {BreachId}",
                notification.ResourceId,
                notification.BreachId);

            return Failure(ex.Message);
        }

        // Invariant formatting keeps the persisted/emitted reason identical regardless of host culture.
        var reason = FormattableString.Invariant(
            $"SLO breach: {notification.SloName} ({notification.CurrentValue:F2} vs threshold {notification.Threshold:F2})");

        var request = new IncidentModeActivationRequest(
            RunId: notification.ResourceId,
            TenantId: notification.TenantId ?? FallbackTenantId,
            Level: level,
            Source: IncidentModeSource.SloBreach,
            Reason: reason,
            DurationMinutes: SloBreachDurationMinutes,
            RequestedBy: "slo-monitor");

        return await ActivateAsync(request, cancellationToken);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Fails when the run is not in incident mode (including when its incident mode has expired) or
    /// when <paramref name="newLevel"/> is not strictly higher than the current level. On success the
    /// level and the level-dependent settings are replaced; activation time, source and expiry are kept.
    /// </remarks>
    /// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
    public async Task<IncidentModeActivationResult> EscalateAsync(
        string runId,
        IncidentEscalationLevel newLevel,
        string? reason = null,
        CancellationToken cancellationToken = default)
    {
        PackRunIncidentModeStatus? current = null;

        try
        {
            // GetStatusAsync (rather than the raw store) so an expired incident cannot be escalated.
            current = await GetStatusAsync(runId, cancellationToken);
            if (!current.Active)
            {
                return Failure("Incident mode is not active for this run");
            }

            if (newLevel <= current.Level)
            {
                return Failure($"Cannot escalate to {newLevel} - current level is {current.Level}", current);
            }

            var settings = GetSettingsForLevel(newLevel);
            var effectiveReason = reason ?? DefaultEscalationReason;

            var escalated = current with
            {
                Level = newLevel,
                ActivationReason = $"{current.ActivationReason} [Escalated: {effectiveReason}]",
                RetentionPolicy = settings.RetentionPolicy,
                TelemetrySettings = settings.TelemetrySettings,
                DebugCaptureSettings = settings.DebugCaptureSettings
            };

            await _store.StoreAsync(runId, escalated, cancellationToken);

            await EmitTimelineEventAsync(
                FallbackTenantId,
                runId,
                PackRunIncidentEventTypes.IncidentModeEscalated,
                new Dictionary<string, string>
                {
                    ["previousLevel"] = current.Level.ToString(),
                    ["newLevel"] = newLevel.ToString(),
                    ["reason"] = effectiveReason
                },
                cancellationToken);

            _logger.LogWarning(
                "Incident mode escalated for run {RunId} from {OldLevel} to {NewLevel}. Reason: {Reason}",
                runId,
                current.Level,
                newLevel,
                effectiveReason);

            return new IncidentModeActivationResult(Success: true, Status: escalated, Error: null);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogError(ex, "Failed to escalate incident mode for run {RunId}", runId);

            // Report the last status we managed to read, if any, instead of claiming "inactive".
            return Failure(ex.Message, current);
        }
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="level"/> is not a defined escalation level.</exception>
    public IncidentModeSettings GetSettingsForLevel(IncidentEscalationLevel level) => level switch
    {
        IncidentEscalationLevel.None => new IncidentModeSettings(
            level,
            IncidentRetentionPolicy.Default(),
            IncidentTelemetrySettings.Default(),
            IncidentDebugCaptureSettings.Default()),

        IncidentEscalationLevel.Low => new IncidentModeSettings(
            level,
            IncidentRetentionPolicy.Default() with { LogRetentionDays = 30 },
            IncidentTelemetrySettings.Default() with
            {
                EnhancedTelemetryActive = true,
                LogVerbosity = IncidentLogVerbosity.Verbose,
                TraceSamplingRate = 0.5
            },
            IncidentDebugCaptureSettings.Default()),

        IncidentEscalationLevel.Medium => new IncidentModeSettings(
            level,
            IncidentRetentionPolicy.Extended(),
            IncidentTelemetrySettings.Enhanced(),
            IncidentDebugCaptureSettings.Basic()),

        IncidentEscalationLevel.High => new IncidentModeSettings(
            level,
            IncidentRetentionPolicy.Extended() with { LogRetentionDays = 180, ArtifactRetentionDays = 365 },
            IncidentTelemetrySettings.Enhanced() with { LogVerbosity = IncidentLogVerbosity.Debug },
            IncidentDebugCaptureSettings.Full()),

        IncidentEscalationLevel.Critical => new IncidentModeSettings(
            level,
            IncidentRetentionPolicy.Maximum(),
            IncidentTelemetrySettings.Maximum(),
            IncidentDebugCaptureSettings.Full() with { MaxCaptureSizeMb = 1000 }),

        _ => throw new ArgumentOutOfRangeException(nameof(level), level, "Unknown incident escalation level.")
    };

    /// <summary>Builds a failed result, defaulting the status to inactive when none is supplied.</summary>
    private static IncidentModeActivationResult Failure(string error, PackRunIncidentModeStatus? status = null) =>
        new(Success: false, Status: status ?? PackRunIncidentModeStatus.Inactive(), Error: error);

    /// <summary>
    /// Emits a warning-severity timeline event when an emitter is configured. Emitter failures are
    /// logged and swallowed so they cannot mask a state change that was already persisted;
    /// cancellation still propagates.
    /// </summary>
    private async Task EmitTimelineEventAsync(
        string tenantId,
        string runId,
        string eventType,
        IReadOnlyDictionary<string, string> attributes,
        CancellationToken cancellationToken)
    {
        if (_timelineEmitter is null)
        {
            return;
        }

        try
        {
            await _timelineEmitter.EmitAsync(
                PackRunTimelineEvent.Create(
                    tenantId: tenantId,
                    eventType: eventType,
                    source: "taskrunner-incident-mode",
                    occurredAt: _timeProvider.GetUtcNow(),
                    runId: runId,
                    severity: PackRunEventSeverity.Warning,
                    attributes: attributes),
                cancellationToken);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            _logger.LogWarning(
                ex,
                "Failed to emit timeline event {EventType} for run {RunId}",
                eventType,
                runId);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Timeline event type names raised by incident mode handling. All names share the
/// <c>pack.incident.</c> prefix.
/// </summary>
public static class PackRunIncidentEventTypes
{
    private const string IncidentPrefix = "pack.incident.";

    /// <summary>Raised when incident mode is activated.</summary>
    public const string IncidentModeActivated = IncidentPrefix + "activated";

    /// <summary>Raised when incident mode is deactivated.</summary>
    public const string IncidentModeDeactivated = IncidentPrefix + "deactivated";

    /// <summary>Raised when incident mode is escalated.</summary>
    public const string IncidentModeEscalated = IncidentPrefix + "escalated";

    /// <summary>Raised when an SLO breach is detected.</summary>
    public const string SloBreachDetected = IncidentPrefix + "slo_breach";
}
|
||||
|
||||
/// <summary>
/// Dictionary-backed <see cref="IPackRunIncidentModeStore"/> intended for tests.
/// Every access to the dictionary happens under a single private lock; all operations
/// complete synchronously and ignore the cancellation token.
/// </summary>
public sealed class InMemoryPackRunIncidentModeStore : IPackRunIncidentModeStore
{
    private readonly object _gate = new();
    private readonly Dictionary<string, PackRunIncidentModeStatus> _byRunId = new();

    /// <summary>Number of statuses currently held.</summary>
    public int Count
    {
        get
        {
            lock (_gate)
            {
                return _byRunId.Count;
            }
        }
    }

    /// <inheritdoc />
    public Task StoreAsync(
        string runId,
        PackRunIncidentModeStatus status,
        CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            // Indexer assignment: adds a new entry or replaces the existing one.
            _byRunId[runId] = status;
        }

        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task<PackRunIncidentModeStatus?> GetAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        PackRunIncidentModeStatus? found;
        lock (_gate)
        {
            // Leaves 'found' null when the run has no entry.
            _byRunId.TryGetValue(runId, out found);
        }

        return Task.FromResult(found);
    }

    /// <inheritdoc />
    public Task<IReadOnlyList<string>> ListActiveRunsAsync(
        CancellationToken cancellationToken = default)
    {
        var activeRunIds = new List<string>();
        lock (_gate)
        {
            foreach (var entry in _byRunId)
            {
                if (entry.Value.Active)
                {
                    activeRunIds.Add(entry.Key);
                }
            }
        }

        return Task.FromResult<IReadOnlyList<string>>(activeRunIds);
    }

    /// <inheritdoc />
    public Task RemoveAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            _byRunId.Remove(runId);
        }

        return Task.CompletedTask;
    }

    /// <summary>Removes every stored status.</summary>
    public void Clear()
    {
        lock (_gate)
        {
            _byRunId.Clear();
        }
    }
}
|
||||
@@ -0,0 +1,363 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.TaskRunner.Core.IncidentMode;
|
||||
|
||||
/// <summary>
/// Incident mode status for a pack run.
/// Per TASKRUN-OBS-55-001.
/// </summary>
/// <param name="Active">Whether incident mode is active.</param>
/// <param name="Level">Current escalation level.</param>
/// <param name="ActivatedAt">When incident mode was activated.</param>
/// <param name="ActivationReason">Reason for activation.</param>
/// <param name="Source">Source of activation (SLO breach, manual, etc.).</param>
/// <param name="ExpiresAt">When incident mode will auto-deactivate (if set).</param>
/// <param name="RetentionPolicy">Current retention policy in effect.</param>
/// <param name="TelemetrySettings">Active telemetry escalation settings.</param>
/// <param name="DebugCaptureSettings">Debug artifact capture settings.</param>
public sealed record PackRunIncidentModeStatus(
    bool Active,
    IncidentEscalationLevel Level,
    DateTimeOffset? ActivatedAt,
    string? ActivationReason,
    IncidentModeSource Source,
    DateTimeOffset? ExpiresAt,
    IncidentRetentionPolicy RetentionPolicy,
    IncidentTelemetrySettings TelemetrySettings,
    IncidentDebugCaptureSettings DebugCaptureSettings)
{
    // Shared serializer options: camelCase property names, compact (non-indented) output.
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        WriteIndented = false
    };

    /// <summary>
    /// Creates a default inactive status: level <see cref="IncidentEscalationLevel.None"/>, source
    /// <see cref="IncidentModeSource.None"/>, no timestamps or reason, and the default retention,
    /// telemetry and debug-capture settings.
    /// </summary>
    public static PackRunIncidentModeStatus Inactive() => new(
        Active: false,
        Level: IncidentEscalationLevel.None,
        ActivatedAt: null,
        ActivationReason: null,
        Source: IncidentModeSource.None,
        ExpiresAt: null,
        RetentionPolicy: IncidentRetentionPolicy.Default(),
        TelemetrySettings: IncidentTelemetrySettings.Default(),
        DebugCaptureSettings: IncidentDebugCaptureSettings.Default());

    /// <summary>
    /// Serializes this status to compact JSON with camelCase property names.
    /// </summary>
    public string ToJson() => JsonSerializer.Serialize(this, JsonOptions);
}
|
||||
|
||||
/// <summary>
/// Severity tiers for incident mode, numbered in increasing order of severity
/// (<see cref="None"/> = 0 through <see cref="Critical"/> = 4).
/// </summary>
public enum IncidentEscalationLevel
{
    /// <summary>Incident mode is not in effect.</summary>
    None = 0,

    /// <summary>Low severity: enhanced logging.</summary>
    Low = 1,

    /// <summary>Medium severity: debug capture enabled.</summary>
    Medium = 2,

    /// <summary>High severity: full debug plus extended retention.</summary>
    High = 3,

    /// <summary>Critical: maximum telemetry plus indefinite retention.</summary>
    Critical = 4,
}
|
||||
|
||||
/// <summary>
/// Identifies what triggered incident mode.
/// </summary>
public enum IncidentModeSource
{
    /// <summary>Incident mode is not in effect.</summary>
    None = 0,

    /// <summary>An operator activated it manually.</summary>
    Manual = 1,

    /// <summary>An SLO breach webhook activated it.</summary>
    SloBreach = 2,

    /// <summary>An error rate threshold activated it.</summary>
    ErrorRate = 3,

    /// <summary>A policy evaluation activated it.</summary>
    PolicyTrigger = 4,

    /// <summary>An external system activated it.</summary>
    External = 5,
}
|
||||
|
||||
/// <summary>
/// Retention policy during incident mode.
/// </summary>
/// <param name="ExtendedRetentionActive">Whether extended retention is active.</param>
/// <param name="LogRetentionDays">Log retention in days.</param>
/// <param name="ArtifactRetentionDays">Artifact retention in days.</param>
/// <param name="DebugCaptureRetentionDays">Debug capture retention in days.</param>
/// <param name="TraceRetentionDays">Trace retention in days.</param>
public sealed record IncidentRetentionPolicy(
    bool ExtendedRetentionActive,
    int LogRetentionDays,
    int ArtifactRetentionDays,
    int DebugCaptureRetentionDays,
    int TraceRetentionDays)
{
    /// <summary>
    /// Default retention policy: extended retention off; logs 7 days, artifacts 30 days,
    /// debug captures 3 days, traces 7 days.
    /// </summary>
    public static IncidentRetentionPolicy Default() => new(
        ExtendedRetentionActive: false,
        LogRetentionDays: 7,
        ArtifactRetentionDays: 30,
        DebugCaptureRetentionDays: 3,
        TraceRetentionDays: 7);

    /// <summary>
    /// Extended retention for incident mode: logs 90 days, artifacts 180 days,
    /// debug captures 30 days, traces 90 days.
    /// </summary>
    public static IncidentRetentionPolicy Extended() => new(
        ExtendedRetentionActive: true,
        LogRetentionDays: 90,
        ArtifactRetentionDays: 180,
        DebugCaptureRetentionDays: 30,
        TraceRetentionDays: 90);

    /// <summary>
    /// Maximum retention for critical incidents: logs 365 days, artifacts 365 days,
    /// debug captures 90 days, traces 365 days.
    /// </summary>
    public static IncidentRetentionPolicy Maximum() => new(
        ExtendedRetentionActive: true,
        LogRetentionDays: 365,
        ArtifactRetentionDays: 365,
        DebugCaptureRetentionDays: 90,
        TraceRetentionDays: 365);
}
|
||||
|
||||
/// <summary>
/// Telemetry settings during incident mode.
/// </summary>
/// <param name="EnhancedTelemetryActive">Whether enhanced telemetry is active.</param>
/// <param name="LogVerbosity">Log verbosity level.</param>
/// <param name="TraceSamplingRate">Trace sampling rate (0.0 to 1.0).</param>
/// <param name="CaptureEnvironment">Whether to capture environment variables.</param>
/// <param name="CaptureStepIo">Whether to capture step inputs/outputs.</param>
/// <param name="CaptureNetworkCalls">Whether to capture network calls.</param>
/// <param name="MaxTraceSpansPerStep">Maximum trace spans per step.</param>
public sealed record IncidentTelemetrySettings(
    bool EnhancedTelemetryActive,
    IncidentLogVerbosity LogVerbosity,
    double TraceSamplingRate,
    bool CaptureEnvironment,
    bool CaptureStepIo,
    bool CaptureNetworkCalls,
    int MaxTraceSpansPerStep)
{
    /// <summary>
    /// Default telemetry settings: enhanced telemetry off, normal verbosity, 10% trace sampling,
    /// no environment/step I/O/network capture, at most 100 spans per step.
    /// </summary>
    public static IncidentTelemetrySettings Default() => new(
        EnhancedTelemetryActive: false,
        LogVerbosity: IncidentLogVerbosity.Normal,
        TraceSamplingRate: 0.1,
        CaptureEnvironment: false,
        CaptureStepIo: false,
        CaptureNetworkCalls: false,
        MaxTraceSpansPerStep: 100);

    /// <summary>
    /// Enhanced telemetry for incident mode: verbose logging, 100% trace sampling,
    /// all captures on, at most 1000 spans per step.
    /// </summary>
    public static IncidentTelemetrySettings Enhanced() => new(
        EnhancedTelemetryActive: true,
        LogVerbosity: IncidentLogVerbosity.Verbose,
        TraceSamplingRate: 1.0,
        CaptureEnvironment: true,
        CaptureStepIo: true,
        CaptureNetworkCalls: true,
        MaxTraceSpansPerStep: 1000);

    /// <summary>
    /// Maximum telemetry for debugging: debug logging, 100% trace sampling,
    /// all captures on, at most 10000 spans per step.
    /// </summary>
    public static IncidentTelemetrySettings Maximum() => new(
        EnhancedTelemetryActive: true,
        LogVerbosity: IncidentLogVerbosity.Debug,
        TraceSamplingRate: 1.0,
        CaptureEnvironment: true,
        CaptureStepIo: true,
        CaptureNetworkCalls: true,
        MaxTraceSpansPerStep: 10000);
}
|
||||
|
||||
/// <summary>
/// How much detail is logged while incident mode settings are in effect,
/// listed from least to most detailed.
/// </summary>
public enum IncidentLogVerbosity
{
    /// <summary>Errors only.</summary>
    Minimal = 0,

    /// <summary>Regular logging.</summary>
    Normal = 1,

    /// <summary>Verbose logging.</summary>
    Verbose = 2,

    /// <summary>Debug logging, the most detailed setting.</summary>
    Debug = 3,
}
|
||||
|
||||
/// <summary>
/// Debug artifact capture settings.
/// </summary>
/// <param name="CaptureActive">Whether debug capture is active.</param>
/// <param name="CaptureHeapDumps">Whether to capture heap dumps.</param>
/// <param name="CaptureThreadDumps">Whether to capture thread dumps.</param>
/// <param name="CaptureProfilingData">Whether to capture profiling data.</param>
/// <param name="CaptureSystemMetrics">Whether to capture system metrics.</param>
/// <param name="MaxCaptureSizeMb">Maximum capture size in MB.</param>
/// <param name="CaptureIntervalSeconds">Capture interval in seconds.</param>
public sealed record IncidentDebugCaptureSettings(
    bool CaptureActive,
    bool CaptureHeapDumps,
    bool CaptureThreadDumps,
    bool CaptureProfilingData,
    bool CaptureSystemMetrics,
    int MaxCaptureSizeMb,
    int CaptureIntervalSeconds)
{
    /// <summary>Default capture settings (disabled): every capture off, size and interval 0.</summary>
    public static IncidentDebugCaptureSettings Default() => new(
        CaptureActive: false,
        CaptureHeapDumps: false,
        CaptureThreadDumps: false,
        CaptureProfilingData: false,
        CaptureSystemMetrics: false,
        MaxCaptureSizeMb: 0,
        CaptureIntervalSeconds: 0);

    /// <summary>
    /// Basic debug capture: thread dumps and system metrics only, up to 100 MB, every 60 seconds.
    /// </summary>
    public static IncidentDebugCaptureSettings Basic() => new(
        CaptureActive: true,
        CaptureHeapDumps: false,
        CaptureThreadDumps: true,
        CaptureProfilingData: false,
        CaptureSystemMetrics: true,
        MaxCaptureSizeMb: 100,
        CaptureIntervalSeconds: 60);

    /// <summary>
    /// Full debug capture: heap dumps, thread dumps, profiling data and system metrics,
    /// up to 500 MB, every 30 seconds.
    /// </summary>
    public static IncidentDebugCaptureSettings Full() => new(
        CaptureActive: true,
        CaptureHeapDumps: true,
        CaptureThreadDumps: true,
        CaptureProfilingData: true,
        CaptureSystemMetrics: true,
        MaxCaptureSizeMb: 500,
        CaptureIntervalSeconds: 30);
}
|
||||
|
||||
/// <summary>
/// SLO breach notification payload. Each property is bound to the camelCase JSON name given by
/// its <see cref="JsonPropertyNameAttribute"/>.
/// </summary>
/// <param name="BreachId">Breach identifier.</param>
/// <param name="SloName">SLO that was breached.</param>
/// <param name="Severity">Breach severity.</param>
/// <param name="OccurredAt">When the breach occurred.</param>
/// <param name="CurrentValue">Current metric value.</param>
/// <param name="Threshold">Threshold that was breached.</param>
/// <param name="Target">Target metric value.</param>
/// <param name="ResourceId">Affected resource (run ID, step ID, etc.).</param>
/// <param name="TenantId">Affected tenant.</param>
/// <param name="Context">Additional context.</param>
public sealed record SloBreachNotification(
    [property: JsonPropertyName("breachId")]
    string BreachId,

    [property: JsonPropertyName("sloName")]
    string SloName,

    [property: JsonPropertyName("severity")]
    string Severity,

    [property: JsonPropertyName("occurredAt")]
    DateTimeOffset OccurredAt,

    [property: JsonPropertyName("currentValue")]
    double CurrentValue,

    [property: JsonPropertyName("threshold")]
    double Threshold,

    [property: JsonPropertyName("target")]
    double Target,

    [property: JsonPropertyName("resourceId")]
    string? ResourceId,

    [property: JsonPropertyName("tenantId")]
    string? TenantId,

    [property: JsonPropertyName("context")]
    IReadOnlyDictionary<string, string>? Context);
|
||||
|
||||
/// <summary>
/// Request to activate incident mode.
/// </summary>
/// <param name="RunId">Run ID to activate incident mode for.</param>
/// <param name="TenantId">Tenant ID.</param>
/// <param name="Level">Escalation level to activate.</param>
/// <param name="Source">Activation source.</param>
/// <param name="Reason">Reason for activation.</param>
/// <param name="DurationMinutes">Duration in minutes (null for indefinite).</param>
/// <param name="RequestedBy">Operator or system that requested activation.</param>
public sealed record IncidentModeActivationRequest(
    string RunId,
    string TenantId,
    IncidentEscalationLevel Level,
    IncidentModeSource Source,
    string Reason,
    int? DurationMinutes,
    string? RequestedBy);
|
||||
|
||||
/// <summary>
/// Result of incident mode activation.
/// </summary>
/// <param name="Success">Whether activation succeeded.</param>
/// <param name="Status">Current incident mode status.</param>
/// <param name="Error">Error message if activation failed.</param>
public sealed record IncidentModeActivationResult(
    bool Success,
    PackRunIncidentModeStatus Status,
    string? Error);
|
||||
Reference in New Issue
Block a user