Implement incident mode management service and models
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled

- Added IPackRunIncidentModeService interface for managing incident mode activation, deactivation, and status retrieval.
- Created PackRunIncidentModeService class implementing the service interface with methods for activating, deactivating, and escalating incident modes.
- Introduced incident mode status model (PackRunIncidentModeStatus) and related enums for escalation levels and activation sources.
- Developed retention policy, telemetry settings, and debug capture settings models to manage incident mode configurations.
- Implemented SLO breach notification handling to activate incident mode based on severity.
- Added in-memory store (InMemoryPackRunIncidentModeStore) for testing purposes.
- Created comprehensive unit tests for incident mode service, covering activation, deactivation, status retrieval, and SLO breach handling.
This commit is contained in:
StellaOps Bot
2025-12-06 22:33:00 +02:00
parent 4042fc2184
commit 9bd6a73926
23 changed files with 7779 additions and 12 deletions

View File

@@ -328,6 +328,18 @@ public static class PackRunEventTypes
/// <summary>Attestation was revoked.</summary>
public const string AttestationRevoked = "pack.attestation.revoked";
// Incident-mode event types.
// NOTE(review): the same four string values are declared again in PackRunIncidentEventTypes,
// which is what PackRunIncidentModeService passes to the timeline emitter; keep both sets in sync.
/// <summary>Incident mode activated (per TASKRUN-OBS-55-001).</summary>
public const string IncidentModeActivated = "pack.incident.activated";
/// <summary>Incident mode deactivated.</summary>
public const string IncidentModeDeactivated = "pack.incident.deactivated";
/// <summary>Incident mode escalated to a higher level.</summary>
public const string IncidentModeEscalated = "pack.incident.escalated";
/// <summary>SLO breach detected, triggering incident mode.</summary>
public const string SloBreachDetected = "pack.incident.slo_breach";
/// <summary>
/// Determines whether <paramref name="eventType"/> names a pack run event.
/// </summary>
/// <param name="eventType">Event type string to test.</param>
/// <returns>
/// <c>true</c> when the string starts with <c>Prefix</c>, compared ordinally
/// (case-sensitive, culture-independent); otherwise <c>false</c>.
/// </returns>
public static bool IsPackRunEvent(string eventType)
{
    return eventType.StartsWith(Prefix, StringComparison.Ordinal);
}

View File

@@ -0,0 +1,534 @@
using Microsoft.Extensions.Logging;
using StellaOps.TaskRunner.Core.Events;
namespace StellaOps.TaskRunner.Core.IncidentMode;
/// <summary>
/// Service for managing pack run incident mode: activation, deactivation,
/// escalation, status lookup and SLO breach handling.
/// Per TASKRUN-OBS-55-001.
/// </summary>
public interface IPackRunIncidentModeService
{
/// <summary>
/// Activates incident mode for a run.
/// </summary>
/// <param name="request">Run, tenant, level, source, reason and optional duration of the activation.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>A result carrying the success flag, the resulting status and an error message on failure.</returns>
Task<IncidentModeActivationResult> ActivateAsync(
IncidentModeActivationRequest request,
CancellationToken cancellationToken = default);
/// <summary>
/// Deactivates incident mode for a run.
/// </summary>
/// <param name="runId">Identifier of the run to deactivate incident mode for.</param>
/// <param name="reason">Optional human-readable reason for the deactivation.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>A result carrying the success flag, the resulting status and an error message on failure.</returns>
Task<IncidentModeActivationResult> DeactivateAsync(
string runId,
string? reason = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the current incident mode status for a run.
/// </summary>
/// <param name="runId">Identifier of the run to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The status for the run; the return type is non-nullable, so a status is always produced.</returns>
Task<PackRunIncidentModeStatus> GetStatusAsync(
string runId,
CancellationToken cancellationToken = default);
/// <summary>
/// Handles an SLO breach notification.
/// </summary>
/// <param name="notification">The breach payload (SLO name, severity, values, affected resource and tenant).</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>A result describing whether incident mode was activated in response to the breach.</returns>
Task<IncidentModeActivationResult> HandleSloBreachAsync(
SloBreachNotification notification,
CancellationToken cancellationToken = default);
/// <summary>
/// Escalates incident mode to a higher level.
/// </summary>
/// <param name="runId">Identifier of the run whose incident mode is escalated.</param>
/// <param name="newLevel">The level to escalate to.</param>
/// <param name="reason">Optional human-readable reason for the escalation.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>A result carrying the success flag, the resulting status and an error message on failure.</returns>
Task<IncidentModeActivationResult> EscalateAsync(
string runId,
IncidentEscalationLevel newLevel,
string? reason = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the settings (retention, telemetry, debug capture) for the given incident mode level.
/// </summary>
/// <param name="level">The escalation level to get settings for.</param>
/// <returns>The settings bundle associated with <paramref name="level"/>.</returns>
IncidentModeSettings GetSettingsForLevel(IncidentEscalationLevel level);
}
/// <summary>
/// Store for incident mode state, keyed by run ID.
/// </summary>
public interface IPackRunIncidentModeStore
{
/// <summary>
/// Stores incident mode status.
/// </summary>
/// <param name="runId">Identifier of the run the status belongs to.</param>
/// <param name="status">The status to store for the run.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task StoreAsync(
string runId,
PackRunIncidentModeStatus status,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets incident mode status.
/// </summary>
/// <param name="runId">Identifier of the run to look up.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The stored status, or <c>null</c> when none is stored for the run.</returns>
Task<PackRunIncidentModeStatus?> GetAsync(
string runId,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists all runs in incident mode.
/// </summary>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
/// <returns>The run IDs that currently have an active incident mode status.</returns>
Task<IReadOnlyList<string>> ListActiveRunsAsync(
CancellationToken cancellationToken = default);
/// <summary>
/// Removes incident mode status.
/// </summary>
/// <param name="runId">Identifier of the run whose status is removed.</param>
/// <param name="cancellationToken">Token used to cancel the operation.</param>
Task RemoveAsync(
string runId,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Settings bundle for one incident mode escalation level.
/// </summary>
/// <param name="Level">Escalation level.</param>
/// <param name="RetentionPolicy">Retention policy.</param>
/// <param name="TelemetrySettings">Telemetry settings.</param>
/// <param name="DebugCaptureSettings">Debug capture settings.</param>
public sealed record IncidentModeSettings(
IncidentEscalationLevel Level,
IncidentRetentionPolicy RetentionPolicy,
IncidentTelemetrySettings TelemetrySettings,
IncidentDebugCaptureSettings DebugCaptureSettings);
/// <summary>
/// Default implementation of pack run incident mode service.
/// </summary>
public sealed class PackRunIncidentModeService : IPackRunIncidentModeService
{
private readonly IPackRunIncidentModeStore _store;
private readonly IPackRunTimelineEventEmitter? _timelineEmitter;
private readonly ILogger<PackRunIncidentModeService> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Creates the incident mode service.
/// </summary>
/// <param name="store">Backing store for per-run incident status. Required.</param>
/// <param name="logger">Logger for incident mode transitions. Required.</param>
/// <param name="timeProvider">Clock abstraction; <see cref="TimeProvider.System"/> is used when omitted.</param>
/// <param name="timelineEmitter">Optional timeline emitter; when omitted no timeline events are emitted.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="store"/> or <paramref name="logger"/> is <c>null</c>.
/// </exception>
public PackRunIncidentModeService(
    IPackRunIncidentModeStore store,
    ILogger<PackRunIncidentModeService> logger,
    TimeProvider? timeProvider = null,
    IPackRunTimelineEventEmitter? timelineEmitter = null)
{
    // Validate the required collaborators first, in declaration order.
    ArgumentNullException.ThrowIfNull(store);
    ArgumentNullException.ThrowIfNull(logger);

    _store = store;
    _logger = logger;
    _timelineEmitter = timelineEmitter;
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
}
/// <inheritdoc />
/// <remarks>
/// Builds an active status from the settings of the requested level, stores it,
/// emits an activation timeline event and logs a warning. Any exception raised
/// while doing so is logged and reported as a failed result carrying an inactive status.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
public async Task<IncidentModeActivationResult> ActivateAsync(
    IncidentModeActivationRequest request,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(request);

    try
    {
        var activatedAt = _timeProvider.GetUtcNow();
        var levelSettings = GetSettingsForLevel(request.Level);

        // A missing duration means the incident never expires on its own.
        DateTimeOffset? expiry = null;
        if (request.DurationMinutes is int minutes)
        {
            expiry = activatedAt.AddMinutes(minutes);
        }

        var activeStatus = new PackRunIncidentModeStatus(
            Active: true,
            Level: request.Level,
            ActivatedAt: activatedAt,
            ActivationReason: request.Reason,
            Source: request.Source,
            ExpiresAt: expiry,
            RetentionPolicy: levelSettings.RetentionPolicy,
            TelemetrySettings: levelSettings.TelemetrySettings,
            DebugCaptureSettings: levelSettings.DebugCaptureSettings);

        await _store.StoreAsync(request.RunId, activeStatus, cancellationToken);

        // Timeline event describing who activated what, and why.
        var eventAttributes = new Dictionary<string, string>
        {
            ["level"] = request.Level.ToString(),
            ["source"] = request.Source.ToString(),
            ["reason"] = request.Reason,
            ["requestedBy"] = request.RequestedBy ?? "system"
        };
        await EmitTimelineEventAsync(
            request.TenantId,
            request.RunId,
            PackRunIncidentEventTypes.IncidentModeActivated,
            eventAttributes,
            cancellationToken);

        _logger.LogWarning(
            "Incident mode activated for run {RunId} at level {Level} due to: {Reason}",
            request.RunId,
            request.Level,
            request.Reason);

        return new IncidentModeActivationResult(Success: true, Status: activeStatus, Error: null);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to activate incident mode for run {RunId}", request.RunId);

        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: ex.Message);
    }
}
/// <inheritdoc />
/// <remarks>
/// Deactivating a run that has no stored status, or whose stored status is not active,
/// succeeds without touching the store. Otherwise the status is removed, a deactivation
/// timeline event is emitted and the change is logged. Exceptions are logged and
/// reported as a failed result.
/// </remarks>
public async Task<IncidentModeActivationResult> DeactivateAsync(
    string runId,
    string? reason = null,
    CancellationToken cancellationToken = default)
{
    try
    {
        var existing = await _store.GetAsync(runId, cancellationToken);
        if (existing is not { Active: true })
        {
            // Nothing to deactivate; report success with an inactive status.
            return new IncidentModeActivationResult(
                Success: true,
                Status: PackRunIncidentModeStatus.Inactive(),
                Error: null);
        }

        await _store.RemoveAsync(runId, cancellationToken);

        var effectiveReason = reason ?? "Manual deactivation";

        // How long the incident was active, when the activation time is known.
        var activeFor = "unknown";
        if (existing.ActivatedAt is { } activatedAt)
        {
            activeFor = (_timeProvider.GetUtcNow() - activatedAt).ToString();
        }

        var eventAttributes = new Dictionary<string, string>
        {
            ["previousLevel"] = existing.Level.ToString(),
            ["reason"] = effectiveReason,
            ["activeDuration"] = activeFor
        };

        // The stored status does not carry a tenant, so the event is emitted under "default".
        await EmitTimelineEventAsync(
            "default",
            runId,
            PackRunIncidentEventTypes.IncidentModeDeactivated,
            eventAttributes,
            cancellationToken);

        _logger.LogInformation(
            "Incident mode deactivated for run {RunId}. Reason: {Reason}",
            runId,
            effectiveReason);

        return new IncidentModeActivationResult(
            Success: true,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: null);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Failed to deactivate incident mode for run {RunId}", runId);

        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: ex.Message);
    }
}
/// <inheritdoc />
/// <remarks>
/// Returns an inactive status when nothing is stored for the run. A stored status whose
/// <c>ExpiresAt</c> is at or before the current time is removed from the store and
/// reported as inactive.
/// </remarks>
public async Task<PackRunIncidentModeStatus> GetStatusAsync(
    string runId,
    CancellationToken cancellationToken = default)
{
    var stored = await _store.GetAsync(runId, cancellationToken);
    if (stored is null)
    {
        return PackRunIncidentModeStatus.Inactive();
    }

    // The clock is only consulted when the status actually has an expiry.
    var hasExpired = stored.ExpiresAt is { } deadline && deadline <= _timeProvider.GetUtcNow();
    if (!hasExpired)
    {
        return stored;
    }

    // Lazily purge the expired entry on read.
    await _store.RemoveAsync(runId, cancellationToken);
    return PackRunIncidentModeStatus.Inactive();
}
/// <inheritdoc />
/// <remarks>
/// A notification without a resource ID is rejected with a failed result. Otherwise the
/// breach severity is mapped (case-insensitively) to an escalation level — unknown or
/// missing severities map to <see cref="IncidentEscalationLevel.Medium"/> — and incident
/// mode is activated for the resource for 60 minutes on behalf of "slo-monitor".
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="notification"/> is <c>null</c>.</exception>
public async Task<IncidentModeActivationResult> HandleSloBreachAsync(
    SloBreachNotification notification,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(notification);

    if (string.IsNullOrWhiteSpace(notification.ResourceId))
    {
        _logger.LogWarning(
            "Received SLO breach notification {BreachId} without resource ID, skipping incident activation",
            notification.BreachId);

        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: "No resource ID in SLO breach notification");
    }

    // Map severity to escalation level.
    var level = notification.Severity?.ToUpperInvariant() switch
    {
        "CRITICAL" => IncidentEscalationLevel.Critical,
        "HIGH" => IncidentEscalationLevel.High,
        "MEDIUM" => IncidentEscalationLevel.Medium,
        "LOW" => IncidentEscalationLevel.Low,
        _ => IncidentEscalationLevel.Medium
    };

    // The reason is stored on the status and copied into timeline events, so format the
    // numbers with the invariant culture; otherwise the decimal separator would depend on
    // the locale of whichever host processed the breach.
    var reason = FormattableString.Invariant(
        $"SLO breach: {notification.SloName} ({notification.CurrentValue:F2} vs threshold {notification.Threshold:F2})");

    var request = new IncidentModeActivationRequest(
        RunId: notification.ResourceId,
        TenantId: notification.TenantId ?? "default",
        Level: level,
        Source: IncidentModeSource.SloBreach,
        Reason: reason,
        DurationMinutes: 60, // Auto-expire after 1 hour
        RequestedBy: "slo-monitor");

    _logger.LogWarning(
        "Processing SLO breach {BreachId} for {SloName} on resource {ResourceId}",
        notification.BreachId,
        notification.SloName,
        notification.ResourceId);

    return await ActivateAsync(request, cancellationToken);
}
/// <inheritdoc />
/// <remarks>
/// Escalation is only possible while incident mode is active and unexpired, and only to a
/// level strictly above the current one. On success the level and the retention, telemetry
/// and debug-capture settings are replaced with those of <paramref name="newLevel"/>, the
/// escalation reason is appended to the activation reason, and an escalation timeline
/// event is emitted. Activation time, source and expiry are carried over unchanged.
/// </remarks>
public async Task<IncidentModeActivationResult> EscalateAsync(
    string runId,
    IncidentEscalationLevel newLevel,
    string? reason = null,
    CancellationToken cancellationToken = default)
{
    // Resolve through GetStatusAsync rather than the raw store so that an incident whose
    // ExpiresAt has already passed is treated as inactive (and purged) instead of being
    // escalated and re-stored with an expiry in the past.
    var current = await GetStatusAsync(runId, cancellationToken);
    if (!current.Active)
    {
        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: "Incident mode is not active for this run");
    }

    // Relies on the numeric ordering of IncidentEscalationLevel (None < Low < ... < Critical).
    if (newLevel <= current.Level)
    {
        return new IncidentModeActivationResult(
            Success: false,
            Status: current,
            Error: $"Cannot escalate to {newLevel} - current level is {current.Level}");
    }

    var settings = GetSettingsForLevel(newLevel);
    var escalationReason = reason ?? "Manual escalation";

    var escalated = current with
    {
        Level = newLevel,
        ActivationReason = $"{current.ActivationReason} [Escalated: {escalationReason}]",
        RetentionPolicy = settings.RetentionPolicy,
        TelemetrySettings = settings.TelemetrySettings,
        DebugCaptureSettings = settings.DebugCaptureSettings
    };

    await _store.StoreAsync(runId, escalated, cancellationToken);

    // The stored status does not carry a tenant, so the event is emitted under "default".
    await EmitTimelineEventAsync(
        "default",
        runId,
        PackRunIncidentEventTypes.IncidentModeEscalated,
        new Dictionary<string, string>
        {
            ["previousLevel"] = current.Level.ToString(),
            ["newLevel"] = newLevel.ToString(),
            ["reason"] = escalationReason
        },
        cancellationToken);

    _logger.LogWarning(
        "Incident mode escalated for run {RunId} from {OldLevel} to {NewLevel}. Reason: {Reason}",
        runId,
        current.Level,
        newLevel,
        escalationReason);

    return new IncidentModeActivationResult(
        Success: true,
        Status: escalated,
        Error: null);
}
/// <inheritdoc />
/// <remarks>
/// Each level starts from one of the presets on the settings records and, for some levels,
/// overrides individual values with a <c>with</c> expression.
/// </remarks>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="level"/> is not a defined <see cref="IncidentEscalationLevel"/> member.
/// </exception>
public IncidentModeSettings GetSettingsForLevel(IncidentEscalationLevel level)
{
    switch (level)
    {
        case IncidentEscalationLevel.None:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Default(),
                IncidentTelemetrySettings.Default(),
                IncidentDebugCaptureSettings.Default());

        case IncidentEscalationLevel.Low:
        {
            // Default presets with longer log retention and more verbose, denser telemetry.
            var retention = IncidentRetentionPolicy.Default() with { LogRetentionDays = 30 };
            var telemetry = IncidentTelemetrySettings.Default() with
            {
                EnhancedTelemetryActive = true,
                LogVerbosity = IncidentLogVerbosity.Verbose,
                TraceSamplingRate = 0.5
            };
            return new IncidentModeSettings(
                level,
                retention,
                telemetry,
                IncidentDebugCaptureSettings.Default());
        }

        case IncidentEscalationLevel.Medium:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Extended(),
                IncidentTelemetrySettings.Enhanced(),
                IncidentDebugCaptureSettings.Basic());

        case IncidentEscalationLevel.High:
        {
            var retention = IncidentRetentionPolicy.Extended() with
            {
                LogRetentionDays = 180,
                ArtifactRetentionDays = 365
            };
            var telemetry = IncidentTelemetrySettings.Enhanced() with
            {
                LogVerbosity = IncidentLogVerbosity.Debug
            };
            return new IncidentModeSettings(
                level,
                retention,
                telemetry,
                IncidentDebugCaptureSettings.Full());
        }

        case IncidentEscalationLevel.Critical:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Maximum(),
                IncidentTelemetrySettings.Maximum(),
                IncidentDebugCaptureSettings.Full() with { MaxCaptureSizeMb = 1000 });

        default:
            throw new ArgumentOutOfRangeException(nameof(level));
    }
}
/// <summary>
/// Emits an incident-mode timeline event when a timeline emitter is configured.
/// </summary>
/// <remarks>
/// Emission is best-effort. Callers invoke this after the incident state has already been
/// persisted, so a failing emitter must not turn a completed state change into a reported
/// failure; such failures are logged as warnings instead. Cancellation is not swallowed.
/// </remarks>
/// <param name="tenantId">Tenant the event is attributed to.</param>
/// <param name="runId">Run the event belongs to.</param>
/// <param name="eventType">Event type string, e.g. one of <c>PackRunIncidentEventTypes</c>.</param>
/// <param name="attributes">Key/value attributes attached to the event.</param>
/// <param name="cancellationToken">Token used to cancel the emission.</param>
private async Task EmitTimelineEventAsync(
    string tenantId,
    string runId,
    string eventType,
    IReadOnlyDictionary<string, string> attributes,
    CancellationToken cancellationToken)
{
    if (_timelineEmitter is null)
    {
        return;
    }

    try
    {
        await _timelineEmitter.EmitAsync(
            PackRunTimelineEvent.Create(
                tenantId: tenantId,
                eventType: eventType,
                source: "taskrunner-incident-mode",
                occurredAt: _timeProvider.GetUtcNow(),
                runId: runId,
                severity: PackRunEventSeverity.Warning,
                attributes: attributes),
            cancellationToken);
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        _logger.LogWarning(
            ex,
            "Failed to emit timeline event {EventType} for run {RunId}",
            eventType,
            runId);
    }
}
}
/// <summary>
/// Incident mode timeline event types, as passed to the timeline emitter by
/// PackRunIncidentModeService.
/// </summary>
// NOTE(review): PackRunEventTypes declares constants with the same names and string values;
// consider aliasing one set to the other so the two cannot drift apart.
public static class PackRunIncidentEventTypes
{
/// <summary>Incident mode activated.</summary>
public const string IncidentModeActivated = "pack.incident.activated";
/// <summary>Incident mode deactivated.</summary>
public const string IncidentModeDeactivated = "pack.incident.deactivated";
/// <summary>Incident mode escalated.</summary>
public const string IncidentModeEscalated = "pack.incident.escalated";
/// <summary>SLO breach detected.</summary>
// NOTE(review): not referenced by any code visible in this file.
public const string SloBreachDetected = "pack.incident.slo_breach";
}
/// <summary>
/// Dictionary-backed incident mode store for testing.
/// Every access to the dictionary happens while holding a private lock.
/// </summary>
public sealed class InMemoryPackRunIncidentModeStore : IPackRunIncidentModeStore
{
    private readonly object _gate = new();
    private readonly Dictionary<string, PackRunIncidentModeStatus> _byRunId = new();

    /// <summary>Gets count of stored statuses.</summary>
    public int Count
    {
        get
        {
            lock (_gate)
            {
                return _byRunId.Count;
            }
        }
    }

    /// <inheritdoc />
    public Task StoreAsync(
        string runId,
        PackRunIncidentModeStatus status,
        CancellationToken cancellationToken = default)
    {
        // Insert or overwrite the entry for this run.
        lock (_gate)
        {
            _byRunId[runId] = status;
        }

        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task<PackRunIncidentModeStatus?> GetAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        PackRunIncidentModeStatus? found;
        lock (_gate)
        {
            found = _byRunId.TryGetValue(runId, out var existing) ? existing : null;
        }

        return Task.FromResult(found);
    }

    /// <inheritdoc />
    public Task<IReadOnlyList<string>> ListActiveRunsAsync(
        CancellationToken cancellationToken = default)
    {
        // Snapshot the matching run IDs under the lock so callers get a stable list.
        var activeRunIds = new List<string>();
        lock (_gate)
        {
            foreach (var entry in _byRunId)
            {
                if (entry.Value.Active)
                {
                    activeRunIds.Add(entry.Key);
                }
            }
        }

        return Task.FromResult<IReadOnlyList<string>>(activeRunIds);
    }

    /// <inheritdoc />
    public Task RemoveAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            _byRunId.Remove(runId);
        }

        return Task.CompletedTask;
    }

    /// <summary>Clears all statuses.</summary>
    public void Clear()
    {
        lock (_gate)
        {
            _byRunId.Clear();
        }
    }
}

View File

@@ -0,0 +1,363 @@
using System.Text.Json;
using System.Text.Json.Serialization;
namespace StellaOps.TaskRunner.Core.IncidentMode;
/// <summary>
/// Incident mode status for a pack run.
/// Per TASKRUN-OBS-55-001.
/// </summary>
/// <param name="Active">Whether incident mode is active.</param>
/// <param name="Level">Current escalation level.</param>
/// <param name="ActivatedAt">When incident mode was activated.</param>
/// <param name="ActivationReason">Reason for activation.</param>
/// <param name="Source">Source of activation (SLO breach, manual, etc.).</param>
/// <param name="ExpiresAt">When incident mode will auto-deactivate (if set).</param>
/// <param name="RetentionPolicy">Current retention policy in effect.</param>
/// <param name="TelemetrySettings">Active telemetry escalation settings.</param>
/// <param name="DebugCaptureSettings">Debug artifact capture settings.</param>
public sealed record PackRunIncidentModeStatus(
    bool Active,
    IncidentEscalationLevel Level,
    DateTimeOffset? ActivatedAt,
    string? ActivationReason,
    IncidentModeSource Source,
    DateTimeOffset? ExpiresAt,
    IncidentRetentionPolicy RetentionPolicy,
    IncidentTelemetrySettings TelemetrySettings,
    IncidentDebugCaptureSettings DebugCaptureSettings)
{
    // Shared serializer configuration: camelCase property names, compact output.
    private static readonly JsonSerializerOptions SerializerOptions = new JsonSerializerOptions
    {
        WriteIndented = false,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    /// <summary>
    /// Creates a default inactive status: level and source <c>None</c>, no timestamps or
    /// reason, and the default retention, telemetry and debug-capture presets.
    /// </summary>
    /// <returns>A new inactive status instance.</returns>
    public static PackRunIncidentModeStatus Inactive()
    {
        return new PackRunIncidentModeStatus(
            Active: false,
            Level: IncidentEscalationLevel.None,
            ActivatedAt: null,
            ActivationReason: null,
            Source: IncidentModeSource.None,
            ExpiresAt: null,
            RetentionPolicy: IncidentRetentionPolicy.Default(),
            TelemetrySettings: IncidentTelemetrySettings.Default(),
            DebugCaptureSettings: IncidentDebugCaptureSettings.Default());
    }

    /// <summary>
    /// Serializes to JSON.
    /// </summary>
    /// <returns>A compact JSON string with camelCase property names.</returns>
    public string ToJson()
    {
        return JsonSerializer.Serialize(this, SerializerOptions);
    }
}
/// <summary>
/// Incident escalation levels.
/// </summary>
/// <remarks>
/// The numeric values are ordered by severity. PackRunIncidentModeService.EscalateAsync
/// compares levels numerically, so keep the ordering intact when adding members.
/// </remarks>
public enum IncidentEscalationLevel
{
/// <summary>No incident mode.</summary>
None = 0,
/// <summary>Low severity - enhanced logging.</summary>
Low = 1,
/// <summary>Medium severity - debug capture enabled.</summary>
Medium = 2,
/// <summary>High severity - full debug + extended retention.</summary>
High = 3,
/// <summary>Critical - maximum telemetry + maximum retention (the Maximum() retention preset is 365 days for logs, artifacts and traces; it is not indefinite).</summary>
Critical = 4
}
/// <summary>
/// Source of incident mode activation.
/// </summary>
/// <remarks>
/// Members have no explicit values, so they are numbered from 0 in declaration order.
/// </remarks>
public enum IncidentModeSource
{
/// <summary>No incident mode; this is the source reported by PackRunIncidentModeStatus.Inactive().</summary>
None,
/// <summary>Activated manually by operator.</summary>
Manual,
/// <summary>Activated by SLO breach webhook.</summary>
SloBreach,
/// <summary>Activated by error rate threshold.</summary>
ErrorRate,
/// <summary>Activated by policy evaluation.</summary>
PolicyTrigger,
/// <summary>Activated by external system.</summary>
External
}
/// <summary>
/// Retention policy during incident mode.
/// </summary>
/// <param name="ExtendedRetentionActive">Whether extended retention is active.</param>
/// <param name="LogRetentionDays">Log retention in days.</param>
/// <param name="ArtifactRetentionDays">Artifact retention in days.</param>
/// <param name="DebugCaptureRetentionDays">Debug capture retention in days.</param>
/// <param name="TraceRetentionDays">Trace retention in days.</param>
public sealed record IncidentRetentionPolicy(
    bool ExtendedRetentionActive,
    int LogRetentionDays,
    int ArtifactRetentionDays,
    int DebugCaptureRetentionDays,
    int TraceRetentionDays)
{
    /// <summary>Default retention policy.</summary>
    /// <returns>Extended retention off; logs 7, artifacts 30, debug captures 3, traces 7 days.</returns>
    public static IncidentRetentionPolicy Default()
    {
        return new IncidentRetentionPolicy(
            ExtendedRetentionActive: false,
            LogRetentionDays: 7,
            ArtifactRetentionDays: 30,
            DebugCaptureRetentionDays: 3,
            TraceRetentionDays: 7);
    }

    /// <summary>Extended retention for incident mode.</summary>
    /// <returns>Extended retention on; logs 90, artifacts 180, debug captures 30, traces 90 days.</returns>
    public static IncidentRetentionPolicy Extended()
    {
        return new IncidentRetentionPolicy(
            ExtendedRetentionActive: true,
            LogRetentionDays: 90,
            ArtifactRetentionDays: 180,
            DebugCaptureRetentionDays: 30,
            TraceRetentionDays: 90);
    }

    /// <summary>Maximum retention for critical incidents.</summary>
    /// <returns>Extended retention on; logs 365, artifacts 365, debug captures 90, traces 365 days.</returns>
    public static IncidentRetentionPolicy Maximum()
    {
        return new IncidentRetentionPolicy(
            ExtendedRetentionActive: true,
            LogRetentionDays: 365,
            ArtifactRetentionDays: 365,
            DebugCaptureRetentionDays: 90,
            TraceRetentionDays: 365);
    }
}
/// <summary>
/// Telemetry settings during incident mode.
/// </summary>
/// <param name="EnhancedTelemetryActive">Whether enhanced telemetry is active.</param>
/// <param name="LogVerbosity">Log verbosity level.</param>
/// <param name="TraceSamplingRate">Trace sampling rate (0.0 to 1.0).</param>
/// <param name="CaptureEnvironment">Whether to capture environment variables.</param>
/// <param name="CaptureStepIo">Whether to capture step inputs/outputs.</param>
/// <param name="CaptureNetworkCalls">Whether to capture network calls.</param>
/// <param name="MaxTraceSpansPerStep">Maximum trace spans per step.</param>
public sealed record IncidentTelemetrySettings(
    bool EnhancedTelemetryActive,
    IncidentLogVerbosity LogVerbosity,
    double TraceSamplingRate,
    bool CaptureEnvironment,
    bool CaptureStepIo,
    bool CaptureNetworkCalls,
    int MaxTraceSpansPerStep)
{
    /// <summary>Default telemetry settings.</summary>
    /// <returns>Normal verbosity, 10% trace sampling, no extra capture, 100 spans per step.</returns>
    public static IncidentTelemetrySettings Default()
    {
        return new IncidentTelemetrySettings(
            EnhancedTelemetryActive: false,
            LogVerbosity: IncidentLogVerbosity.Normal,
            TraceSamplingRate: 0.1,
            CaptureEnvironment: false,
            CaptureStepIo: false,
            CaptureNetworkCalls: false,
            MaxTraceSpansPerStep: 100);
    }

    /// <summary>Enhanced telemetry for incident mode.</summary>
    /// <returns>Verbose logging, full trace sampling, all capture flags on, 1000 spans per step.</returns>
    public static IncidentTelemetrySettings Enhanced()
    {
        return new IncidentTelemetrySettings(
            EnhancedTelemetryActive: true,
            LogVerbosity: IncidentLogVerbosity.Verbose,
            TraceSamplingRate: 1.0,
            CaptureEnvironment: true,
            CaptureStepIo: true,
            CaptureNetworkCalls: true,
            MaxTraceSpansPerStep: 1000);
    }

    /// <summary>Maximum telemetry for debugging.</summary>
    /// <returns>Debug logging, full trace sampling, all capture flags on, 10000 spans per step.</returns>
    public static IncidentTelemetrySettings Maximum()
    {
        return new IncidentTelemetrySettings(
            EnhancedTelemetryActive: true,
            LogVerbosity: IncidentLogVerbosity.Debug,
            TraceSamplingRate: 1.0,
            CaptureEnvironment: true,
            CaptureStepIo: true,
            CaptureNetworkCalls: true,
            MaxTraceSpansPerStep: 10000);
    }
}
/// <summary>
/// Log verbosity levels for incident mode.
/// </summary>
/// <remarks>
/// Members are declared from least to most detailed. The IncidentTelemetrySettings presets
/// use Normal (Default), Verbose (Enhanced) and Debug (Maximum).
/// </remarks>
public enum IncidentLogVerbosity
{
/// <summary>Minimal logging (errors only).</summary>
Minimal,
/// <summary>Normal logging.</summary>
Normal,
/// <summary>Verbose logging.</summary>
Verbose,
/// <summary>Debug logging (maximum detail).</summary>
Debug
}
/// <summary>
/// Debug artifact capture settings.
/// </summary>
/// <param name="CaptureActive">Whether debug capture is active.</param>
/// <param name="CaptureHeapDumps">Whether to capture heap dumps.</param>
/// <param name="CaptureThreadDumps">Whether to capture thread dumps.</param>
/// <param name="CaptureProfilingData">Whether to capture profiling data.</param>
/// <param name="CaptureSystemMetrics">Whether to capture system metrics.</param>
/// <param name="MaxCaptureSizeMb">Maximum capture size in MB.</param>
/// <param name="CaptureIntervalSeconds">Capture interval in seconds.</param>
public sealed record IncidentDebugCaptureSettings(
    bool CaptureActive,
    bool CaptureHeapDumps,
    bool CaptureThreadDumps,
    bool CaptureProfilingData,
    bool CaptureSystemMetrics,
    int MaxCaptureSizeMb,
    int CaptureIntervalSeconds)
{
    /// <summary>Default capture settings (disabled).</summary>
    /// <returns>Every capture flag off, with size and interval set to 0.</returns>
    public static IncidentDebugCaptureSettings Default()
    {
        return new IncidentDebugCaptureSettings(
            CaptureActive: false,
            CaptureHeapDumps: false,
            CaptureThreadDumps: false,
            CaptureProfilingData: false,
            CaptureSystemMetrics: false,
            MaxCaptureSizeMb: 0,
            CaptureIntervalSeconds: 0);
    }

    /// <summary>Basic debug capture.</summary>
    /// <returns>Thread dumps and system metrics only, up to 100 MB, every 60 seconds.</returns>
    public static IncidentDebugCaptureSettings Basic()
    {
        return new IncidentDebugCaptureSettings(
            CaptureActive: true,
            CaptureHeapDumps: false,
            CaptureThreadDumps: true,
            CaptureProfilingData: false,
            CaptureSystemMetrics: true,
            MaxCaptureSizeMb: 100,
            CaptureIntervalSeconds: 60);
    }

    /// <summary>Full debug capture.</summary>
    /// <returns>Every capture flag on, up to 500 MB, every 30 seconds.</returns>
    public static IncidentDebugCaptureSettings Full()
    {
        return new IncidentDebugCaptureSettings(
            CaptureActive: true,
            CaptureHeapDumps: true,
            CaptureThreadDumps: true,
            CaptureProfilingData: true,
            CaptureSystemMetrics: true,
            MaxCaptureSizeMb: 500,
            CaptureIntervalSeconds: 30);
    }
}
/// <summary>
/// SLO breach notification payload. Each property carries an explicit JSON property name.
/// </summary>
/// <param name="BreachId">Breach identifier.</param>
/// <param name="SloName">SLO that was breached.</param>
/// <param name="Severity">Breach severity.</param>
/// <param name="OccurredAt">When the breach occurred.</param>
/// <param name="CurrentValue">Current metric value.</param>
/// <param name="Threshold">Threshold that was breached.</param>
/// <param name="Target">Target metric value.</param>
/// <param name="ResourceId">Affected resource (run ID, step ID, etc.).</param>
/// <param name="TenantId">Affected tenant.</param>
/// <param name="Context">Additional context.</param>
public sealed record SloBreachNotification(
[property: JsonPropertyName("breachId")]
string BreachId,
[property: JsonPropertyName("sloName")]
string SloName,
[property: JsonPropertyName("severity")]
string Severity,
[property: JsonPropertyName("occurredAt")]
DateTimeOffset OccurredAt,
[property: JsonPropertyName("currentValue")]
double CurrentValue,
[property: JsonPropertyName("threshold")]
double Threshold,
[property: JsonPropertyName("target")]
double Target,
[property: JsonPropertyName("resourceId")]
string? ResourceId,
[property: JsonPropertyName("tenantId")]
string? TenantId,
[property: JsonPropertyName("context")]
IReadOnlyDictionary<string, string>? Context);
/// <summary>
/// Request to activate incident mode.
/// </summary>
/// <param name="RunId">Run ID to activate incident mode for.</param>
/// <param name="TenantId">Tenant ID.</param>
/// <param name="Level">Escalation level to activate.</param>
/// <param name="Source">Activation source.</param>
/// <param name="Reason">Reason for activation.</param>
/// <param name="DurationMinutes">Duration in minutes (null for indefinite).</param>
/// <param name="RequestedBy">Operator or system that requested activation.</param>
public sealed record IncidentModeActivationRequest(
string RunId,
string TenantId,
IncidentEscalationLevel Level,
IncidentModeSource Source,
string Reason,
int? DurationMinutes,
string? RequestedBy);
/// <summary>
/// Result of an incident mode operation; despite the name, PackRunIncidentModeService also
/// returns it from deactivation and escalation.
/// </summary>
/// <param name="Success">Whether the operation succeeded.</param>
/// <param name="Status">Incident mode status accompanying the result. It is never null, including on failure.</param>
/// <param name="Error">Error message if the operation failed; null on success.</param>
public sealed record IncidentModeActivationResult(
bool Success,
PackRunIncidentModeStatus Status,
string? Error);