Implement incident mode management service and models
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
- Added the IPackRunIncidentModeService interface for managing incident mode activation, deactivation, and status retrieval.
- Created the PackRunIncidentModeService class, which implements the service interface with methods for activating, deactivating, and escalating incident mode.
- Introduced the incident mode status model (PackRunIncidentModeStatus) and related enums for escalation levels and activation sources.
- Developed retention policy, telemetry settings, and debug capture settings models to manage incident mode configuration.
- Implemented SLO breach notification handling that activates incident mode based on breach severity.
- Added an in-memory store (InMemoryPackRunIncidentModeStore) for testing purposes.
- Created comprehensive unit tests for the incident mode service, covering activation, deactivation, status retrieval, and SLO breach handling.
This commit is contained in:
@@ -328,6 +328,18 @@ public static class PackRunEventTypes
|
||||
/// <summary>Attestation was revoked.</summary>
|
||||
public const string AttestationRevoked = "pack.attestation.revoked";
|
||||
|
||||
/// <summary>Event type for incident mode being activated on a run (per TASKRUN-OBS-55-001).</summary>
public const string IncidentModeActivated = "pack.incident.activated";

/// <summary>Event type for incident mode being deactivated on a run.</summary>
public const string IncidentModeDeactivated = "pack.incident.deactivated";

/// <summary>Event type for incident mode being escalated to a higher level.</summary>
public const string IncidentModeEscalated = "pack.incident.escalated";

/// <summary>Event type for a detected SLO breach that triggers incident mode.</summary>
public const string SloBreachDetected = "pack.incident.slo_breach";
|
||||
|
||||
/// <summary>
/// Determines whether <paramref name="eventType"/> names a pack run event.
/// </summary>
/// <param name="eventType">Event type string to inspect.</param>
/// <returns>
/// <c>true</c> when the value starts with <c>Prefix</c> under ordinal comparison;
/// otherwise <c>false</c>.
/// </returns>
public static bool IsPackRunEvent(string eventType)
{
    return eventType.StartsWith(Prefix, StringComparison.Ordinal);
}
|
||||
|
||||
@@ -0,0 +1,534 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.TaskRunner.Core.Events;
|
||||
|
||||
namespace StellaOps.TaskRunner.Core.IncidentMode;
|
||||
|
||||
/// <summary>
/// Contract for switching pack runs into and out of incident mode and for
/// querying the resulting state. Per TASKRUN-OBS-55-001.
/// </summary>
public interface IPackRunIncidentModeService
{
    /// <summary>Activates incident mode for the run named in <paramref name="request"/>.</summary>
    /// <param name="request">Run, tenant, level, source, reason and optional duration of the activation.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of the activation together with the resulting status.</returns>
    Task<IncidentModeActivationResult> ActivateAsync(
        IncidentModeActivationRequest request,
        CancellationToken cancellationToken = default);

    /// <summary>Deactivates incident mode for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="reason">Optional human-readable reason for the deactivation.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of the deactivation together with the resulting status.</returns>
    Task<IncidentModeActivationResult> DeactivateAsync(
        string runId,
        string? reason = null,
        CancellationToken cancellationToken = default);

    /// <summary>Gets the current incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The status currently in effect for the run.</returns>
    Task<PackRunIncidentModeStatus> GetStatusAsync(
        string runId,
        CancellationToken cancellationToken = default);

    /// <summary>Handles an SLO breach notification.</summary>
    /// <param name="notification">Breach payload received from the SLO monitor.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of any activation performed in response to the breach.</returns>
    Task<IncidentModeActivationResult> HandleSloBreachAsync(
        SloBreachNotification notification,
        CancellationToken cancellationToken = default);

    /// <summary>Escalates incident mode for a run to a higher level.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="newLevel">Level to escalate to.</param>
    /// <param name="reason">Optional human-readable reason for the escalation.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The outcome of the escalation together with the resulting status.</returns>
    Task<IncidentModeActivationResult> EscalateAsync(
        string runId,
        IncidentEscalationLevel newLevel,
        string? reason = null,
        CancellationToken cancellationToken = default);

    /// <summary>Gets the retention, telemetry and debug capture settings that apply at a level.</summary>
    /// <param name="level">Escalation level to look up.</param>
    /// <returns>The settings bundle for <paramref name="level"/>.</returns>
    IncidentModeSettings GetSettingsForLevel(IncidentEscalationLevel level);
}
|
||||
|
||||
/// <summary>
/// Persistence abstraction for per-run incident mode state.
/// </summary>
public interface IPackRunIncidentModeStore
{
    /// <summary>Stores the incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run the status belongs to.</param>
    /// <param name="status">Status to store.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task StoreAsync(
        string runId,
        PackRunIncidentModeStatus status,
        CancellationToken cancellationToken = default);

    /// <summary>Gets the stored incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The stored status, or <c>null</c> (the return type is nullable).</returns>
    Task<PackRunIncidentModeStatus?> GetAsync(
        string runId,
        CancellationToken cancellationToken = default);

    /// <summary>Lists the identifiers of all runs currently in incident mode.</summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>Run identifiers of the active incidents.</returns>
    Task<IReadOnlyList<string>> ListActiveRunsAsync(
        CancellationToken cancellationToken = default);

    /// <summary>Removes the stored incident mode status for a run.</summary>
    /// <param name="runId">Identifier of the run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task RemoveAsync(
        string runId,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Bundle of the settings that apply at one incident escalation level.
/// </summary>
/// <param name="Level">Escalation level the settings belong to.</param>
/// <param name="RetentionPolicy">Retention policy for the level.</param>
/// <param name="TelemetrySettings">Telemetry settings for the level.</param>
/// <param name="DebugCaptureSettings">Debug capture settings for the level.</param>
public sealed record IncidentModeSettings(
    IncidentEscalationLevel Level,
    IncidentRetentionPolicy RetentionPolicy,
    IncidentTelemetrySettings TelemetrySettings,
    IncidentDebugCaptureSettings DebugCaptureSettings);
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of pack run incident mode service.
|
||||
/// </summary>
|
||||
public sealed class PackRunIncidentModeService : IPackRunIncidentModeService
|
||||
{
|
||||
private readonly IPackRunIncidentModeStore _store;
private readonly IPackRunTimelineEventEmitter? _timelineEmitter;
private readonly ILogger<PackRunIncidentModeService> _logger;
private readonly TimeProvider _timeProvider;

/// <summary>
/// Creates the service.
/// </summary>
/// <param name="store">Store holding per-run incident mode state; required.</param>
/// <param name="logger">Logger for incident mode transitions; required.</param>
/// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> when omitted.</param>
/// <param name="timelineEmitter">Optional emitter; when omitted no timeline events are emitted.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="store"/> or <paramref name="logger"/> is <c>null</c>.
/// </exception>
public PackRunIncidentModeService(
    IPackRunIncidentModeStore store,
    ILogger<PackRunIncidentModeService> logger,
    TimeProvider? timeProvider = null,
    IPackRunTimelineEventEmitter? timelineEmitter = null)
{
    ArgumentNullException.ThrowIfNull(store);
    ArgumentNullException.ThrowIfNull(logger);

    _store = store;
    _logger = logger;
    _timelineEmitter = timelineEmitter;
    _timeProvider = timeProvider ?? TimeProvider.System;
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Builds an active status from the settings of the requested level, stores it under the
/// request's run id, emits an activation timeline event and logs a warning. Failures are
/// logged and reported through a result with <c>Success = false</c>; cancellation is not
/// treated as a failure and propagates to the caller as <see cref="OperationCanceledException"/>.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
/// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
public async Task<IncidentModeActivationResult> ActivateAsync(
    IncidentModeActivationRequest request,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(request);

    try
    {
        var now = _timeProvider.GetUtcNow();
        var settings = GetSettingsForLevel(request.Level);

        // A null duration means the incident has no expiry time.
        var expiresAt = request.DurationMinutes.HasValue
            ? now.AddMinutes(request.DurationMinutes.Value)
            : (DateTimeOffset?)null;

        var status = new PackRunIncidentModeStatus(
            Active: true,
            Level: request.Level,
            ActivatedAt: now,
            ActivationReason: request.Reason,
            Source: request.Source,
            ExpiresAt: expiresAt,
            RetentionPolicy: settings.RetentionPolicy,
            TelemetrySettings: settings.TelemetrySettings,
            DebugCaptureSettings: settings.DebugCaptureSettings);

        await _store.StoreAsync(request.RunId, status, cancellationToken);

        // NOTE(review): if emission throws, the status is already stored but the caller
        // receives a failed result carrying an inactive status - confirm this is intended.
        await EmitTimelineEventAsync(
            request.TenantId,
            request.RunId,
            PackRunIncidentEventTypes.IncidentModeActivated,
            new Dictionary<string, string>
            {
                ["level"] = request.Level.ToString(),
                ["source"] = request.Source.ToString(),
                ["reason"] = request.Reason,
                ["requestedBy"] = request.RequestedBy ?? "system"
            },
            cancellationToken);

        _logger.LogWarning(
            "Incident mode activated for run {RunId} at level {Level} due to: {Reason}",
            request.RunId,
            request.Level,
            request.Reason);

        return new IncidentModeActivationResult(
            Success: true,
            Status: status,
            Error: null);
    }
    // Cancellation is a caller decision, not an activation failure: let it propagate
    // instead of logging an error and returning a misleading failed result.
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        _logger.LogError(ex, "Failed to activate incident mode for run {RunId}", request.RunId);

        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: ex.Message);
    }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// When the store holds no active status for the run, this is a no-op that returns a
/// successful result with an inactive status. Otherwise the stored status is removed, a
/// deactivation timeline event is emitted and the transition is logged. Failures are
/// logged and reported through a result with <c>Success = false</c>; cancellation
/// propagates to the caller as <see cref="OperationCanceledException"/>.
/// </remarks>
/// <exception cref="OperationCanceledException">The operation was cancelled.</exception>
public async Task<IncidentModeActivationResult> DeactivateAsync(
    string runId,
    string? reason = null,
    CancellationToken cancellationToken = default)
{
    try
    {
        var current = await _store.GetAsync(runId, cancellationToken);
        if (current is null || !current.Active)
        {
            return new IncidentModeActivationResult(
                Success: true,
                Status: PackRunIncidentModeStatus.Inactive(),
                Error: null);
        }

        await _store.RemoveAsync(runId, cancellationToken);
        var inactive = PackRunIncidentModeStatus.Inactive();

        // Resolve the fallback once so the timeline event and the log entry always agree.
        var effectiveReason = reason ?? "Manual deactivation";

        // The stored status does not carry a tenant id, so the event is emitted
        // under the "default" tenant.
        await EmitTimelineEventAsync(
            "default",
            runId,
            PackRunIncidentEventTypes.IncidentModeDeactivated,
            new Dictionary<string, string>
            {
                ["previousLevel"] = current.Level.ToString(),
                ["reason"] = effectiveReason,
                ["activeDuration"] = current.ActivatedAt.HasValue
                    ? (_timeProvider.GetUtcNow() - current.ActivatedAt.Value).ToString()
                    : "unknown"
            },
            cancellationToken);

        _logger.LogInformation(
            "Incident mode deactivated for run {RunId}. Reason: {Reason}",
            runId,
            effectiveReason);

        return new IncidentModeActivationResult(
            Success: true,
            Status: inactive,
            Error: null);
    }
    // Cancellation is a caller decision, not a deactivation failure: let it propagate
    // instead of logging an error and returning a misleading failed result.
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        _logger.LogError(ex, "Failed to deactivate incident mode for run {RunId}", runId);

        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: ex.Message);
    }
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Expiry is enforced lazily: a stored status whose <c>ExpiresAt</c> is at or before the
/// current time is removed from the store and reported as inactive.
/// </remarks>
public async Task<PackRunIncidentModeStatus> GetStatusAsync(
    string runId,
    CancellationToken cancellationToken = default)
{
    var stored = await _store.GetAsync(runId, cancellationToken);
    if (stored is null)
    {
        return PackRunIncidentModeStatus.Inactive();
    }

    // The clock is only consulted when the status actually has an expiry time.
    var hasExpired = stored.ExpiresAt is { } deadline
        && deadline <= _timeProvider.GetUtcNow();

    if (!hasExpired)
    {
        return stored;
    }

    await _store.RemoveAsync(runId, cancellationToken);
    return PackRunIncidentModeStatus.Inactive();
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// A notification without a resource id is rejected with a failed result. Otherwise the
/// severity string is mapped case-insensitively to an escalation level (unknown or missing
/// severities map to <see cref="IncidentEscalationLevel.Medium"/>) and incident mode is
/// activated for the resource with a 60-minute duration. The numeric values in the
/// activation reason are formatted with the invariant culture so that the stored reason
/// does not depend on the host's locale.
/// </remarks>
/// <exception cref="ArgumentNullException"><paramref name="notification"/> is <c>null</c>.</exception>
public async Task<IncidentModeActivationResult> HandleSloBreachAsync(
    SloBreachNotification notification,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(notification);

    if (string.IsNullOrWhiteSpace(notification.ResourceId))
    {
        _logger.LogWarning(
            "Received SLO breach notification {BreachId} without resource ID, skipping incident activation",
            notification.BreachId);

        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: "No resource ID in SLO breach notification");
    }

    // Map severity to escalation level; anything unrecognised is treated as Medium.
    var level = notification.Severity?.ToUpperInvariant() switch
    {
        "CRITICAL" => IncidentEscalationLevel.Critical,
        "HIGH" => IncidentEscalationLevel.High,
        "MEDIUM" => IncidentEscalationLevel.Medium,
        "LOW" => IncidentEscalationLevel.Low,
        _ => IncidentEscalationLevel.Medium
    };

    // Invariant formatting: a plain interpolated string would render ":F2" with the
    // current culture (e.g. "0,95" instead of "0.95"), making persisted reasons and
    // timeline attributes vary between hosts.
    var reason = FormattableString.Invariant(
        $"SLO breach: {notification.SloName} ({notification.CurrentValue:F2} vs threshold {notification.Threshold:F2})");

    var request = new IncidentModeActivationRequest(
        RunId: notification.ResourceId,
        TenantId: notification.TenantId ?? "default",
        Level: level,
        Source: IncidentModeSource.SloBreach,
        Reason: reason,
        DurationMinutes: 60, // Auto-expire after 1 hour
        RequestedBy: "slo-monitor");

    _logger.LogWarning(
        "Processing SLO breach {BreachId} for {SloName} on resource {ResourceId}",
        notification.BreachId,
        notification.SloName,
        notification.ResourceId);

    return await ActivateAsync(request, cancellationToken);
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Escalation is only possible while incident mode is active and not expired, and only to
/// a level strictly higher than the current one. On success the status keeps its
/// activation time, source and expiry, takes the settings of the new level, and has the
/// escalation reason appended to its activation reason.
/// </remarks>
public async Task<IncidentModeActivationResult> EscalateAsync(
    string runId,
    IncidentEscalationLevel newLevel,
    string? reason = null,
    CancellationToken cancellationToken = default)
{
    // Go through GetStatusAsync rather than the raw store so that an incident whose
    // expiry has passed is treated as inactive here as well, not escalated.
    var current = await GetStatusAsync(runId, cancellationToken);

    if (!current.Active)
    {
        return new IncidentModeActivationResult(
            Success: false,
            Status: PackRunIncidentModeStatus.Inactive(),
            Error: "Incident mode is not active for this run");
    }

    if (newLevel <= current.Level)
    {
        return new IncidentModeActivationResult(
            Success: false,
            Status: current,
            Error: $"Cannot escalate to {newLevel} - current level is {current.Level}");
    }

    var settings = GetSettingsForLevel(newLevel);
    var effectiveReason = reason ?? "Manual escalation";

    var escalated = current with
    {
        Level = newLevel,
        ActivationReason = $"{current.ActivationReason} [Escalated: {effectiveReason}]",
        RetentionPolicy = settings.RetentionPolicy,
        TelemetrySettings = settings.TelemetrySettings,
        DebugCaptureSettings = settings.DebugCaptureSettings
    };

    await _store.StoreAsync(runId, escalated, cancellationToken);

    // The stored status does not carry a tenant id, so the event is emitted
    // under the "default" tenant.
    await EmitTimelineEventAsync(
        "default",
        runId,
        PackRunIncidentEventTypes.IncidentModeEscalated,
        new Dictionary<string, string>
        {
            ["previousLevel"] = current.Level.ToString(),
            ["newLevel"] = newLevel.ToString(),
            ["reason"] = effectiveReason
        },
        cancellationToken);

    _logger.LogWarning(
        "Incident mode escalated for run {RunId} from {OldLevel} to {NewLevel}. Reason: {Reason}",
        runId,
        current.Level,
        newLevel,
        effectiveReason);

    return new IncidentModeActivationResult(
        Success: true,
        Status: escalated,
        Error: null);
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// None uses the default presets throughout. Low keeps the defaults but raises log
/// retention to 30 days and turns on verbose, 50%-sampled telemetry. Medium uses the
/// extended retention, enhanced telemetry and basic debug capture presets. High extends
/// retention further, switches logging to debug verbosity and enables full debug capture.
/// Critical uses the maximum retention and telemetry presets with a 1000 MB capture limit.
/// </remarks>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="level"/> is not a defined <see cref="IncidentEscalationLevel"/> value.
/// </exception>
public IncidentModeSettings GetSettingsForLevel(IncidentEscalationLevel level)
{
    switch (level)
    {
        case IncidentEscalationLevel.None:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Default(),
                IncidentTelemetrySettings.Default(),
                IncidentDebugCaptureSettings.Default());

        case IncidentEscalationLevel.Low:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Default() with { LogRetentionDays = 30 },
                IncidentTelemetrySettings.Default() with
                {
                    EnhancedTelemetryActive = true,
                    LogVerbosity = IncidentLogVerbosity.Verbose,
                    TraceSamplingRate = 0.5
                },
                IncidentDebugCaptureSettings.Default());

        case IncidentEscalationLevel.Medium:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Extended(),
                IncidentTelemetrySettings.Enhanced(),
                IncidentDebugCaptureSettings.Basic());

        case IncidentEscalationLevel.High:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Extended() with { LogRetentionDays = 180, ArtifactRetentionDays = 365 },
                IncidentTelemetrySettings.Enhanced() with { LogVerbosity = IncidentLogVerbosity.Debug },
                IncidentDebugCaptureSettings.Full());

        case IncidentEscalationLevel.Critical:
            return new IncidentModeSettings(
                level,
                IncidentRetentionPolicy.Maximum(),
                IncidentTelemetrySettings.Maximum(),
                IncidentDebugCaptureSettings.Full() with { MaxCaptureSizeMb = 1000 });

        default:
            throw new ArgumentOutOfRangeException(nameof(level));
    }
}
|
||||
|
||||
/// <summary>
/// Emits an incident mode timeline event, or does nothing when no emitter was supplied.
/// </summary>
/// <param name="tenantId">Tenant the event is recorded under.</param>
/// <param name="runId">Run the event refers to.</param>
/// <param name="eventType">Event type identifier.</param>
/// <param name="attributes">Key/value details attached to the event.</param>
/// <param name="cancellationToken">Token used to cancel the emission.</param>
private async Task EmitTimelineEventAsync(
    string tenantId,
    string runId,
    string eventType,
    IReadOnlyDictionary<string, string> attributes,
    CancellationToken cancellationToken)
{
    var emitter = _timelineEmitter;
    if (emitter is null)
    {
        return;
    }

    // Every incident mode event is reported from the same source at Warning severity.
    var timelineEvent = PackRunTimelineEvent.Create(
        tenantId: tenantId,
        eventType: eventType,
        source: "taskrunner-incident-mode",
        occurredAt: _timeProvider.GetUtcNow(),
        runId: runId,
        severity: PackRunEventSeverity.Warning,
        attributes: attributes);

    await emitter.EmitAsync(timelineEvent, cancellationToken);
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Event type identifiers used for incident mode timeline events.
/// </summary>
// NOTE(review): the same four string values are also declared on PackRunEventTypes;
// consider keeping a single source of truth.
public static class PackRunIncidentEventTypes
{
    /// <summary>Event type for incident mode being activated.</summary>
    public const string IncidentModeActivated = "pack.incident.activated";

    /// <summary>Event type for incident mode being deactivated.</summary>
    public const string IncidentModeDeactivated = "pack.incident.deactivated";

    /// <summary>Event type for incident mode being escalated.</summary>
    public const string IncidentModeEscalated = "pack.incident.escalated";

    /// <summary>Event type for a detected SLO breach.</summary>
    public const string SloBreachDetected = "pack.incident.slo_breach";
}
|
||||
|
||||
/// <summary>
/// Dictionary-backed <see cref="IPackRunIncidentModeStore"/> intended for tests.
/// Every access to the dictionary happens under a single private lock.
/// </summary>
public sealed class InMemoryPackRunIncidentModeStore : IPackRunIncidentModeStore
{
    private readonly object _gate = new();
    private readonly Dictionary<string, PackRunIncidentModeStatus> _byRunId = new();

    /// <summary>Gets the number of stored statuses.</summary>
    public int Count
    {
        get
        {
            lock (_gate)
            {
                return _byRunId.Count;
            }
        }
    }

    /// <inheritdoc />
    public Task StoreAsync(
        string runId,
        PackRunIncidentModeStatus status,
        CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            // Indexer assignment: a later store for the same run replaces the earlier one.
            _byRunId[runId] = status;
        }

        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task<PackRunIncidentModeStatus?> GetAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        PackRunIncidentModeStatus? found;
        lock (_gate)
        {
            _byRunId.TryGetValue(runId, out found);
        }

        return Task.FromResult(found);
    }

    /// <inheritdoc />
    public Task<IReadOnlyList<string>> ListActiveRunsAsync(
        CancellationToken cancellationToken = default)
    {
        var activeRunIds = new List<string>();
        lock (_gate)
        {
            // Build the snapshot while holding the lock so callers never observe a live view.
            foreach (var entry in _byRunId)
            {
                if (entry.Value.Active)
                {
                    activeRunIds.Add(entry.Key);
                }
            }
        }

        return Task.FromResult<IReadOnlyList<string>>(activeRunIds);
    }

    /// <inheritdoc />
    public Task RemoveAsync(
        string runId,
        CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            _byRunId.Remove(runId);
        }

        return Task.CompletedTask;
    }

    /// <summary>Removes every stored status.</summary>
    public void Clear()
    {
        lock (_gate)
        {
            _byRunId.Clear();
        }
    }
}
|
||||
@@ -0,0 +1,363 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.TaskRunner.Core.IncidentMode;
|
||||
|
||||
/// <summary>
/// Incident mode state of a single pack run. Per TASKRUN-OBS-55-001.
/// </summary>
/// <param name="Active">Whether incident mode is active.</param>
/// <param name="Level">Current escalation level.</param>
/// <param name="ActivatedAt">When incident mode was activated, if it was.</param>
/// <param name="ActivationReason">Reason given for the activation.</param>
/// <param name="Source">What triggered the activation (SLO breach, manual, etc.).</param>
/// <param name="ExpiresAt">When incident mode auto-deactivates; <c>null</c> when no expiry is set.</param>
/// <param name="RetentionPolicy">Retention policy currently in effect.</param>
/// <param name="TelemetrySettings">Telemetry escalation settings currently in effect.</param>
/// <param name="DebugCaptureSettings">Debug artifact capture settings currently in effect.</param>
public sealed record PackRunIncidentModeStatus(
    bool Active,
    IncidentEscalationLevel Level,
    DateTimeOffset? ActivatedAt,
    string? ActivationReason,
    IncidentModeSource Source,
    DateTimeOffset? ExpiresAt,
    IncidentRetentionPolicy RetentionPolicy,
    IncidentTelemetrySettings TelemetrySettings,
    IncidentDebugCaptureSettings DebugCaptureSettings)
{
    // Shared serializer options: camelCase property names, compact (non-indented) output.
    private static readonly JsonSerializerOptions SerializerOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        WriteIndented = false
    };

    /// <summary>
    /// Creates the status that represents "no incident mode": inactive, level and source
    /// <c>None</c>, no timestamps or reason, and the default retention, telemetry and
    /// debug capture settings.
    /// </summary>
    /// <returns>A new inactive status.</returns>
    public static PackRunIncidentModeStatus Inactive()
    {
        return new PackRunIncidentModeStatus(
            Active: false,
            Level: IncidentEscalationLevel.None,
            ActivatedAt: null,
            ActivationReason: null,
            Source: IncidentModeSource.None,
            ExpiresAt: null,
            RetentionPolicy: IncidentRetentionPolicy.Default(),
            TelemetrySettings: IncidentTelemetrySettings.Default(),
            DebugCaptureSettings: IncidentDebugCaptureSettings.Default());
    }

    /// <summary>Serializes this status to a compact camelCase JSON string.</summary>
    /// <returns>The JSON representation.</returns>
    public string ToJson()
    {
        return JsonSerializer.Serialize(this, SerializerOptions);
    }
}
|
||||
|
||||
/// <summary>
/// Escalation levels of incident mode, ordered by increasing numeric value.
/// </summary>
public enum IncidentEscalationLevel
{
    /// <summary>Incident mode is not in effect.</summary>
    None = 0,

    /// <summary>Low severity: enhanced logging.</summary>
    Low = 1,

    /// <summary>Medium severity: debug capture enabled.</summary>
    Medium = 2,

    /// <summary>High severity: full debug capture plus extended retention.</summary>
    High = 3,

    /// <summary>Critical: maximum telemetry plus indefinite retention.</summary>
    Critical = 4
}
|
||||
|
||||
/// <summary>
/// Identifies what triggered an incident mode activation.
/// </summary>
public enum IncidentModeSource
{
    /// <summary>Incident mode is not in effect.</summary>
    None = 0,

    /// <summary>An operator activated incident mode by hand.</summary>
    Manual = 1,

    /// <summary>An SLO breach webhook activated incident mode.</summary>
    SloBreach = 2,

    /// <summary>An error rate threshold activated incident mode.</summary>
    ErrorRate = 3,

    /// <summary>A policy evaluation activated incident mode.</summary>
    PolicyTrigger = 4,

    /// <summary>An external system activated incident mode.</summary>
    External = 5
}
|
||||
|
||||
/// <summary>
/// Retention periods that apply while incident mode is (or is not) in effect.
/// </summary>
/// <param name="ExtendedRetentionActive">Whether extended retention is active.</param>
/// <param name="LogRetentionDays">Log retention in days.</param>
/// <param name="ArtifactRetentionDays">Artifact retention in days.</param>
/// <param name="DebugCaptureRetentionDays">Debug capture retention in days.</param>
/// <param name="TraceRetentionDays">Trace retention in days.</param>
public sealed record IncidentRetentionPolicy(
    bool ExtendedRetentionActive,
    int LogRetentionDays,
    int ArtifactRetentionDays,
    int DebugCaptureRetentionDays,
    int TraceRetentionDays)
{
    /// <summary>Default policy: extended retention off; 7/30/3/7 days for logs/artifacts/debug captures/traces.</summary>
    /// <returns>A new policy instance.</returns>
    public static IncidentRetentionPolicy Default()
    {
        return new IncidentRetentionPolicy(
            ExtendedRetentionActive: false,
            LogRetentionDays: 7,
            ArtifactRetentionDays: 30,
            DebugCaptureRetentionDays: 3,
            TraceRetentionDays: 7);
    }

    /// <summary>Extended policy: 90/180/30/90 days for logs/artifacts/debug captures/traces.</summary>
    /// <returns>A new policy instance.</returns>
    public static IncidentRetentionPolicy Extended()
    {
        return new IncidentRetentionPolicy(
            ExtendedRetentionActive: true,
            LogRetentionDays: 90,
            ArtifactRetentionDays: 180,
            DebugCaptureRetentionDays: 30,
            TraceRetentionDays: 90);
    }

    /// <summary>Maximum policy for critical incidents: 365/365/90/365 days for logs/artifacts/debug captures/traces.</summary>
    /// <returns>A new policy instance.</returns>
    public static IncidentRetentionPolicy Maximum()
    {
        return new IncidentRetentionPolicy(
            ExtendedRetentionActive: true,
            LogRetentionDays: 365,
            ArtifactRetentionDays: 365,
            DebugCaptureRetentionDays: 90,
            TraceRetentionDays: 365);
    }
}
|
||||
|
||||
/// <summary>
/// Telemetry configuration that applies while incident mode is (or is not) in effect.
/// </summary>
/// <param name="EnhancedTelemetryActive">Whether enhanced telemetry is active.</param>
/// <param name="LogVerbosity">Log verbosity level.</param>
/// <param name="TraceSamplingRate">Trace sampling rate (0.0 to 1.0).</param>
/// <param name="CaptureEnvironment">Whether to capture environment variables.</param>
/// <param name="CaptureStepIo">Whether to capture step inputs/outputs.</param>
/// <param name="CaptureNetworkCalls">Whether to capture network calls.</param>
/// <param name="MaxTraceSpansPerStep">Maximum trace spans per step.</param>
public sealed record IncidentTelemetrySettings(
    bool EnhancedTelemetryActive,
    IncidentLogVerbosity LogVerbosity,
    double TraceSamplingRate,
    bool CaptureEnvironment,
    bool CaptureStepIo,
    bool CaptureNetworkCalls,
    int MaxTraceSpansPerStep)
{
    /// <summary>Default settings: normal verbosity, 10% trace sampling, no extra capture, 100 spans per step.</summary>
    /// <returns>A new settings instance.</returns>
    public static IncidentTelemetrySettings Default()
    {
        return new IncidentTelemetrySettings(
            EnhancedTelemetryActive: false,
            LogVerbosity: IncidentLogVerbosity.Normal,
            TraceSamplingRate: 0.1,
            CaptureEnvironment: false,
            CaptureStepIo: false,
            CaptureNetworkCalls: false,
            MaxTraceSpansPerStep: 100);
    }

    /// <summary>Enhanced settings: verbose logging, full trace sampling, all capture flags on, 1000 spans per step.</summary>
    /// <returns>A new settings instance.</returns>
    public static IncidentTelemetrySettings Enhanced()
    {
        return new IncidentTelemetrySettings(
            EnhancedTelemetryActive: true,
            LogVerbosity: IncidentLogVerbosity.Verbose,
            TraceSamplingRate: 1.0,
            CaptureEnvironment: true,
            CaptureStepIo: true,
            CaptureNetworkCalls: true,
            MaxTraceSpansPerStep: 1000);
    }

    /// <summary>Maximum settings: debug logging, full trace sampling, all capture flags on, 10000 spans per step.</summary>
    /// <returns>A new settings instance.</returns>
    public static IncidentTelemetrySettings Maximum()
    {
        return new IncidentTelemetrySettings(
            EnhancedTelemetryActive: true,
            LogVerbosity: IncidentLogVerbosity.Debug,
            TraceSamplingRate: 1.0,
            CaptureEnvironment: true,
            CaptureStepIo: true,
            CaptureNetworkCalls: true,
            MaxTraceSpansPerStep: 10000);
    }
}
|
||||
|
||||
/// <summary>
/// Log verbosity levels selectable through incident mode telemetry settings.
/// </summary>
public enum IncidentLogVerbosity
{
    /// <summary>Minimal logging (errors only).</summary>
    Minimal = 0,

    /// <summary>Normal logging.</summary>
    Normal = 1,

    /// <summary>Verbose logging.</summary>
    Verbose = 2,

    /// <summary>Debug logging (maximum detail).</summary>
    Debug = 3
}
|
||||
|
||||
/// <summary>
/// Configuration of debug artifact capture.
/// </summary>
/// <param name="CaptureActive">Whether debug capture is active.</param>
/// <param name="CaptureHeapDumps">Whether to capture heap dumps.</param>
/// <param name="CaptureThreadDumps">Whether to capture thread dumps.</param>
/// <param name="CaptureProfilingData">Whether to capture profiling data.</param>
/// <param name="CaptureSystemMetrics">Whether to capture system metrics.</param>
/// <param name="MaxCaptureSizeMb">Maximum capture size in MB.</param>
/// <param name="CaptureIntervalSeconds">Capture interval in seconds.</param>
public sealed record IncidentDebugCaptureSettings(
    bool CaptureActive,
    bool CaptureHeapDumps,
    bool CaptureThreadDumps,
    bool CaptureProfilingData,
    bool CaptureSystemMetrics,
    int MaxCaptureSizeMb,
    int CaptureIntervalSeconds)
{
    /// <summary>Default settings: capture disabled, every flag off, size and interval zero.</summary>
    /// <returns>A new settings instance.</returns>
    public static IncidentDebugCaptureSettings Default()
    {
        return new IncidentDebugCaptureSettings(
            CaptureActive: false,
            CaptureHeapDumps: false,
            CaptureThreadDumps: false,
            CaptureProfilingData: false,
            CaptureSystemMetrics: false,
            MaxCaptureSizeMb: 0,
            CaptureIntervalSeconds: 0);
    }

    /// <summary>Basic capture: thread dumps and system metrics only, up to 100 MB, every 60 seconds.</summary>
    /// <returns>A new settings instance.</returns>
    public static IncidentDebugCaptureSettings Basic()
    {
        return new IncidentDebugCaptureSettings(
            CaptureActive: true,
            CaptureHeapDumps: false,
            CaptureThreadDumps: true,
            CaptureProfilingData: false,
            CaptureSystemMetrics: true,
            MaxCaptureSizeMb: 100,
            CaptureIntervalSeconds: 60);
    }

    /// <summary>Full capture: every capture flag on, up to 500 MB, every 30 seconds.</summary>
    /// <returns>A new settings instance.</returns>
    public static IncidentDebugCaptureSettings Full()
    {
        return new IncidentDebugCaptureSettings(
            CaptureActive: true,
            CaptureHeapDumps: true,
            CaptureThreadDumps: true,
            CaptureProfilingData: true,
            CaptureSystemMetrics: true,
            MaxCaptureSizeMb: 500,
            CaptureIntervalSeconds: 30);
    }
}
|
||||
|
||||
/// <summary>
/// Payload of an SLO breach notification. Each property carries an explicit JSON name.
/// </summary>
/// <param name="BreachId">Breach identifier (JSON: <c>breachId</c>).</param>
/// <param name="SloName">Name of the SLO that was breached (JSON: <c>sloName</c>).</param>
/// <param name="Severity">Breach severity (JSON: <c>severity</c>).</param>
/// <param name="OccurredAt">When the breach occurred (JSON: <c>occurredAt</c>).</param>
/// <param name="CurrentValue">Current metric value (JSON: <c>currentValue</c>).</param>
/// <param name="Threshold">Threshold that was breached (JSON: <c>threshold</c>).</param>
/// <param name="Target">Target metric value (JSON: <c>target</c>).</param>
/// <param name="ResourceId">Affected resource such as a run ID or step ID (JSON: <c>resourceId</c>).</param>
/// <param name="TenantId">Affected tenant (JSON: <c>tenantId</c>).</param>
/// <param name="Context">Additional context (JSON: <c>context</c>).</param>
public sealed record SloBreachNotification(
    [property: JsonPropertyName("breachId")] string BreachId,
    [property: JsonPropertyName("sloName")] string SloName,
    [property: JsonPropertyName("severity")] string Severity,
    [property: JsonPropertyName("occurredAt")] DateTimeOffset OccurredAt,
    [property: JsonPropertyName("currentValue")] double CurrentValue,
    [property: JsonPropertyName("threshold")] double Threshold,
    [property: JsonPropertyName("target")] double Target,
    [property: JsonPropertyName("resourceId")] string? ResourceId,
    [property: JsonPropertyName("tenantId")] string? TenantId,
    [property: JsonPropertyName("context")] IReadOnlyDictionary<string, string>? Context);
|
||||
|
||||
/// <summary>
/// Describes a request to put a run into incident mode.
/// </summary>
/// <param name="RunId">Run to activate incident mode for.</param>
/// <param name="TenantId">Tenant the run belongs to.</param>
/// <param name="Level">Escalation level to activate.</param>
/// <param name="Source">What triggered the activation.</param>
/// <param name="Reason">Reason for the activation.</param>
/// <param name="DurationMinutes">Duration in minutes; <c>null</c> for indefinite.</param>
/// <param name="RequestedBy">Operator or system that requested the activation.</param>
public sealed record IncidentModeActivationRequest(
    string RunId,
    string TenantId,
    IncidentEscalationLevel Level,
    IncidentModeSource Source,
    string Reason,
    int? DurationMinutes,
    string? RequestedBy);
|
||||
|
||||
/// <summary>
/// Outcome of an incident mode operation.
/// </summary>
/// <param name="Success">Whether the operation succeeded.</param>
/// <param name="Status">Incident mode status reported with the outcome.</param>
/// <param name="Error">Error message when the operation failed; otherwise <c>null</c>.</param>
public sealed record IncidentModeActivationResult(
    bool Success,
    PackRunIncidentModeStatus Status,
    string? Error);
|
||||
@@ -0,0 +1,396 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Time.Testing;
|
||||
using StellaOps.TaskRunner.Core.Events;
|
||||
using StellaOps.TaskRunner.Core.IncidentMode;
|
||||
|
||||
namespace StellaOps.TaskRunner.Tests;
|
||||
|
||||
/// <summary>
/// Unit tests for <see cref="PackRunIncidentModeService"/> and the incident mode model factories
/// (activation, deactivation, status lookup, SLO breach handling, escalation, per-level settings).
/// </summary>
public sealed class PackRunIncidentModeTests
{
    /// <summary>Fixed clock origin so the expiry test does not depend on the wall clock.</summary>
    private static readonly DateTimeOffset FixedNow = new(2025, 1, 1, 0, 0, 0, TimeSpan.Zero);

    /// <summary>
    /// Creates a service backed by a fresh in-memory store and a null logger.
    /// </summary>
    /// <param name="timeProvider">Optional clock; <c>null</c> lets the service use its default.</param>
    private static PackRunIncidentModeService CreateService(TimeProvider? timeProvider = null) =>
        new(
            new InMemoryPackRunIncidentModeStore(),
            NullLogger<PackRunIncidentModeService>.Instance,
            timeProvider);

    /// <summary>
    /// Builds a manual activation request for tenant <c>tenant-1</c>.
    /// </summary>
    private static IncidentModeActivationRequest CreateRequest(
        string runId,
        IncidentEscalationLevel level,
        int? durationMinutes = null,
        string reason = "Test",
        string? requestedBy = null) =>
        new(
            RunId: runId,
            TenantId: "tenant-1",
            Level: level,
            Source: IncidentModeSource.Manual,
            Reason: reason,
            DurationMinutes: durationMinutes,
            RequestedBy: requestedBy);

    /// <summary>
    /// Builds an SLO breach notification for tenant <c>tenant-1</c> with threshold 5.0 and target 1.0.
    /// </summary>
    private static SloBreachNotification CreateBreach(
        string breachId,
        string severity,
        string? resourceId,
        string sloName = "test_slo",
        double currentValue = 10.0,
        IReadOnlyDictionary<string, string>? context = null) =>
        new(
            BreachId: breachId,
            SloName: sloName,
            Severity: severity,
            OccurredAt: DateTimeOffset.UtcNow,
            CurrentValue: currentValue,
            Threshold: 5.0,
            Target: 1.0,
            ResourceId: resourceId,
            TenantId: "tenant-1",
            Context: context);

    [Fact]
    public async Task ActivateAsync_ActivatesIncidentModeSuccessfully()
    {
        var service = CreateService();
        var request = CreateRequest(
            "run-001",
            IncidentEscalationLevel.Medium,
            durationMinutes: 60,
            reason: "Debugging production issue",
            requestedBy: "admin@example.com");

        var result = await service.ActivateAsync(request, TestContext.Current.CancellationToken);

        Assert.True(result.Success);
        Assert.True(result.Status.Active);
        Assert.Equal(IncidentEscalationLevel.Medium, result.Status.Level);
        Assert.Equal(IncidentModeSource.Manual, result.Status.Source);
        Assert.NotNull(result.Status.ActivatedAt);
        Assert.NotNull(result.Status.ExpiresAt);
    }

    [Fact]
    public async Task ActivateAsync_WithoutDuration_CreatesIndefiniteIncidentMode()
    {
        var service = CreateService();
        var request = CreateRequest(
            "run-002",
            IncidentEscalationLevel.High,
            durationMinutes: null,
            reason: "Critical investigation");

        var result = await service.ActivateAsync(request, TestContext.Current.CancellationToken);

        Assert.True(result.Success);
        Assert.Null(result.Status.ExpiresAt);
    }

    [Fact]
    public async Task ActivateAsync_EmitsTimelineEvent()
    {
        var store = new InMemoryPackRunIncidentModeStore();
        var timelineSink = new InMemoryPackRunTimelineEventSink();
        var emitter = new PackRunTimelineEventEmitter(
            timelineSink,
            TimeProvider.System,
            NullLogger<PackRunTimelineEventEmitter>.Instance);
        var service = new PackRunIncidentModeService(
            store,
            NullLogger<PackRunIncidentModeService>.Instance,
            null,
            emitter);
        var request = CreateRequest("run-003", IncidentEscalationLevel.Low, durationMinutes: 30);

        await service.ActivateAsync(request, TestContext.Current.CancellationToken);

        Assert.Equal(1, timelineSink.Count);
        var evt = timelineSink.GetEvents()[0];
        Assert.Equal(PackRunIncidentEventTypes.IncidentModeActivated, evt.EventType);
    }

    [Fact]
    public async Task DeactivateAsync_DeactivatesIncidentMode()
    {
        var service = CreateService();
        var activateRequest = CreateRequest("run-004", IncidentEscalationLevel.Medium);
        await service.ActivateAsync(activateRequest, TestContext.Current.CancellationToken);

        var result = await service.DeactivateAsync("run-004", "Issue resolved", TestContext.Current.CancellationToken);

        Assert.True(result.Success);
        Assert.False(result.Status.Active);

        // The deactivation must also be visible through a subsequent status lookup.
        var status = await service.GetStatusAsync("run-004", TestContext.Current.CancellationToken);
        Assert.False(status.Active);
    }

    [Fact]
    public async Task GetStatusAsync_ReturnsInactiveForUnknownRun()
    {
        var service = CreateService();

        var status = await service.GetStatusAsync("unknown-run", TestContext.Current.CancellationToken);

        Assert.False(status.Active);
        Assert.Equal(IncidentEscalationLevel.None, status.Level);
    }

    [Fact]
    public async Task GetStatusAsync_AutoDeactivatesExpiredIncidentMode()
    {
        var fakeTime = new FakeTimeProvider(FixedNow);
        var service = CreateService(fakeTime);
        var request = CreateRequest("run-005", IncidentEscalationLevel.Medium, durationMinutes: 30);
        await service.ActivateAsync(request, TestContext.Current.CancellationToken);

        // Advance time past expiration
        fakeTime.Advance(TimeSpan.FromMinutes(31));
        var status = await service.GetStatusAsync("run-005", TestContext.Current.CancellationToken);

        Assert.False(status.Active);
    }

    [Fact]
    public async Task HandleSloBreachAsync_ActivatesIncidentModeFromBreach()
    {
        var service = CreateService();
        var breach = CreateBreach(
            "breach-001",
            "HIGH",
            "run-006",
            sloName: "error_rate_5m",
            currentValue: 15.5,
            context: new Dictionary<string, string> { ["step"] = "scan" });

        var result = await service.HandleSloBreachAsync(breach, TestContext.Current.CancellationToken);

        Assert.True(result.Success);
        Assert.True(result.Status.Active);
        Assert.Equal(IncidentEscalationLevel.High, result.Status.Level);
        Assert.Equal(IncidentModeSource.SloBreach, result.Status.Source);
        Assert.Contains("error_rate_5m", result.Status.ActivationReason!);
    }

    [Theory]
    [InlineData("CRITICAL", IncidentEscalationLevel.Critical)]
    [InlineData("HIGH", IncidentEscalationLevel.High)]
    [InlineData("MEDIUM", IncidentEscalationLevel.Medium)]
    [InlineData("LOW", IncidentEscalationLevel.Low)]
    public async Task HandleSloBreachAsync_MapsSeverityToLevel(string severity, IncidentEscalationLevel expectedLevel)
    {
        var service = CreateService();
        var breach = CreateBreach($"breach-{severity}", severity, $"run-severity-{severity}");

        var result = await service.HandleSloBreachAsync(breach, TestContext.Current.CancellationToken);

        Assert.True(result.Success);
        Assert.Equal(expectedLevel, result.Status.Level);
    }

    [Fact]
    public async Task HandleSloBreachAsync_ReturnsErrorForMissingResourceId()
    {
        var service = CreateService();
        var breach = CreateBreach("breach-no-resource", "HIGH", resourceId: null);

        var result = await service.HandleSloBreachAsync(breach, TestContext.Current.CancellationToken);

        Assert.False(result.Success);
        Assert.Contains("No resource ID", result.Error);
    }

    [Fact]
    public async Task EscalateAsync_IncreasesEscalationLevel()
    {
        var service = CreateService();

        // First activate at Low level
        var activateRequest = CreateRequest(
            "run-escalate",
            IncidentEscalationLevel.Low,
            reason: "Initial activation");
        await service.ActivateAsync(activateRequest, TestContext.Current.CancellationToken);

        // Escalate to High
        var result = await service.EscalateAsync(
            "run-escalate",
            IncidentEscalationLevel.High,
            "Issue is more severe than expected",
            TestContext.Current.CancellationToken);

        Assert.True(result.Success);
        Assert.Equal(IncidentEscalationLevel.High, result.Status.Level);
        Assert.Contains("Escalated", result.Status.ActivationReason);
    }

    [Fact]
    public async Task EscalateAsync_FailsWhenNotInIncidentMode()
    {
        var service = CreateService();

        var result = await service.EscalateAsync(
            "unknown-run",
            IncidentEscalationLevel.High,
            null,
            TestContext.Current.CancellationToken);

        Assert.False(result.Success);
        Assert.Contains("not active", result.Error);
    }

    [Fact]
    public async Task EscalateAsync_FailsWhenNewLevelIsLowerOrEqual()
    {
        var service = CreateService();
        var activateRequest = CreateRequest("run-no-deescalate", IncidentEscalationLevel.High);
        await service.ActivateAsync(activateRequest, TestContext.Current.CancellationToken);

        var result = await service.EscalateAsync(
            "run-no-deescalate",
            IncidentEscalationLevel.Medium, // Lower than High
            null,
            TestContext.Current.CancellationToken);

        Assert.False(result.Success);
        Assert.Contains("Cannot escalate", result.Error);
    }

    [Fact]
    public void GetSettingsForLevel_ReturnsCorrectSettings()
    {
        var service = CreateService();

        // Test None level
        var noneSettings = service.GetSettingsForLevel(IncidentEscalationLevel.None);
        Assert.False(noneSettings.TelemetrySettings.EnhancedTelemetryActive);
        Assert.False(noneSettings.DebugCaptureSettings.CaptureActive);

        // Test Critical level
        var criticalSettings = service.GetSettingsForLevel(IncidentEscalationLevel.Critical);
        Assert.True(criticalSettings.TelemetrySettings.EnhancedTelemetryActive);
        Assert.Equal(IncidentLogVerbosity.Debug, criticalSettings.TelemetrySettings.LogVerbosity);
        Assert.Equal(1.0, criticalSettings.TelemetrySettings.TraceSamplingRate);
        Assert.True(criticalSettings.DebugCaptureSettings.CaptureActive);
        Assert.True(criticalSettings.DebugCaptureSettings.CaptureHeapDumps);
        Assert.Equal(365, criticalSettings.RetentionPolicy.LogRetentionDays);
    }

    [Fact]
    public void PackRunIncidentModeStatus_Inactive_ReturnsDefaultValues()
    {
        var inactive = PackRunIncidentModeStatus.Inactive();

        Assert.False(inactive.Active);
        Assert.Equal(IncidentEscalationLevel.None, inactive.Level);
        Assert.Null(inactive.ActivatedAt);
        Assert.Null(inactive.ActivationReason);
        Assert.Equal(IncidentModeSource.None, inactive.Source);
        Assert.False(inactive.RetentionPolicy.ExtendedRetentionActive);
        Assert.False(inactive.TelemetrySettings.EnhancedTelemetryActive);
        Assert.False(inactive.DebugCaptureSettings.CaptureActive);
    }

    [Fact]
    public void IncidentRetentionPolicy_Extended_HasLongerRetention()
    {
        var defaultPolicy = IncidentRetentionPolicy.Default();
        var extendedPolicy = IncidentRetentionPolicy.Extended();

        Assert.True(extendedPolicy.ExtendedRetentionActive);
        Assert.True(extendedPolicy.LogRetentionDays > defaultPolicy.LogRetentionDays);
        Assert.True(extendedPolicy.ArtifactRetentionDays > defaultPolicy.ArtifactRetentionDays);
    }

    [Fact]
    public void IncidentTelemetrySettings_Enhanced_HasHigherSampling()
    {
        var defaultSettings = IncidentTelemetrySettings.Default();
        var enhancedSettings = IncidentTelemetrySettings.Enhanced();

        Assert.True(enhancedSettings.EnhancedTelemetryActive);
        Assert.True(enhancedSettings.TraceSamplingRate > defaultSettings.TraceSamplingRate);
        Assert.True(enhancedSettings.CaptureEnvironment);
        Assert.True(enhancedSettings.CaptureStepIo);
    }
}
|
||||
@@ -16,6 +16,7 @@ using StellaOps.AirGap.Policy;
|
||||
using StellaOps.TaskRunner.Core.AirGap;
|
||||
using StellaOps.TaskRunner.Core.Attestation;
|
||||
using StellaOps.TaskRunner.Core.Configuration;
|
||||
using StellaOps.TaskRunner.Core.IncidentMode;
|
||||
using StellaOps.TaskRunner.Core.Events;
|
||||
using StellaOps.TaskRunner.Core.Execution;
|
||||
using StellaOps.TaskRunner.Core.Execution.Simulation;
|
||||
@@ -127,6 +128,10 @@ builder.Services.AddSingleton<IPackRunAttestationStore, InMemoryPackRunAttestati
|
||||
builder.Services.AddSingleton<IPackRunAttestationSigner, StubPackRunAttestationSigner>();
|
||||
builder.Services.AddSingleton<IPackRunAttestationService, PackRunAttestationService>();
|
||||
|
||||
// Pack run incident mode (TASKRUN-OBS-55-001): in-memory store plus the service that uses it,
// both registered as singletons.
builder.Services
    .AddSingleton<IPackRunIncidentModeStore, InMemoryPackRunIncidentModeStore>()
    .AddSingleton<IPackRunIncidentModeService, PackRunIncidentModeService>();
|
||||
|
||||
builder.Services.AddOpenApi();
|
||||
|
||||
var app = builder.Build();
|
||||
@@ -230,6 +235,22 @@ app.MapGet("/api/attestations/{attestationId}/envelope", HandleGetAttestationEnv
|
||||
app.MapPost("/v1/task-runner/attestations/{attestationId}/verify", HandleVerifyAttestation).WithName("VerifyAttestation");
|
||||
app.MapPost("/api/attestations/{attestationId}/verify", HandleVerifyAttestation).WithName("VerifyAttestationApi");
|
||||
|
||||
// Incident mode endpoints (TASKRUN-OBS-55-001).
// Each operation is mapped twice: under /v1/task-runner and under the /api alias.

// Status lookup
app.MapGet("/v1/task-runner/runs/{runId}/incident-mode", HandleGetIncidentModeStatus)
    .WithName("GetIncidentModeStatus");
app.MapGet("/api/runs/{runId}/incident-mode", HandleGetIncidentModeStatus)
    .WithName("GetIncidentModeStatusApi");

// Activation
app.MapPost("/v1/task-runner/runs/{runId}/incident-mode/activate", HandleActivateIncidentMode)
    .WithName("ActivateIncidentMode");
app.MapPost("/api/runs/{runId}/incident-mode/activate", HandleActivateIncidentMode)
    .WithName("ActivateIncidentModeApi");

// Deactivation
app.MapPost("/v1/task-runner/runs/{runId}/incident-mode/deactivate", HandleDeactivateIncidentMode)
    .WithName("DeactivateIncidentMode");
app.MapPost("/api/runs/{runId}/incident-mode/deactivate", HandleDeactivateIncidentMode)
    .WithName("DeactivateIncidentModeApi");

// Escalation
app.MapPost("/v1/task-runner/runs/{runId}/incident-mode/escalate", HandleEscalateIncidentMode)
    .WithName("EscalateIncidentMode");
app.MapPost("/api/runs/{runId}/incident-mode/escalate", HandleEscalateIncidentMode)
    .WithName("EscalateIncidentModeApi");

// SLO breach webhook
app.MapPost("/v1/task-runner/webhooks/slo-breach", HandleSloBreachWebhook)
    .WithName("SloBreachWebhook");
app.MapPost("/api/webhooks/slo-breach", HandleSloBreachWebhook)
    .WithName("SloBreachWebhookApi");
|
||||
|
||||
app.MapGet("/.well-known/openapi", (HttpResponse response) =>
|
||||
{
|
||||
var metadata = OpenApiMetadataFactory.Create("/openapi");
|
||||
@@ -681,6 +702,175 @@ async Task<IResult> HandleVerifyAttestation(
|
||||
}, statusCode: statusCode);
|
||||
}
|
||||
|
||||
// Incident mode handlers (TASKRUN-OBS-55-001)
|
||||
// Returns the incident mode status for a run as a JSON document.
// Responds 400 when runId is blank; otherwise 200 with the status, with enum values
// lower-cased and timestamps formatted as round-trip ("O") strings.
async Task<IResult> HandleGetIncidentModeStatus(
    string runId,
    IPackRunIncidentModeService incidentModeService,
    CancellationToken cancellationToken)
{
    if (string.IsNullOrWhiteSpace(runId))
    {
        return Results.BadRequest(new { error = "runId is required." });
    }

    var status = await incidentModeService.GetStatusAsync(runId, cancellationToken).ConfigureAwait(false);

    var retention = status.RetentionPolicy;
    var telemetry = status.TelemetrySettings;
    var debugCapture = status.DebugCaptureSettings;

    var payload = new
    {
        runId,
        active = status.Active,
        level = status.Level.ToString().ToLowerInvariant(),
        activatedAt = status.ActivatedAt?.ToString("O"),
        activationReason = status.ActivationReason,
        source = status.Source.ToString().ToLowerInvariant(),
        expiresAt = status.ExpiresAt?.ToString("O"),
        retentionPolicy = new
        {
            extendedRetentionActive = retention.ExtendedRetentionActive,
            logRetentionDays = retention.LogRetentionDays,
            artifactRetentionDays = retention.ArtifactRetentionDays
        },
        telemetrySettings = new
        {
            enhancedTelemetryActive = telemetry.EnhancedTelemetryActive,
            logVerbosity = telemetry.LogVerbosity.ToString().ToLowerInvariant(),
            traceSamplingRate = telemetry.TraceSamplingRate
        },
        debugCaptureSettings = new
        {
            captureActive = debugCapture.CaptureActive,
            captureHeapDumps = debugCapture.CaptureHeapDumps,
            captureThreadDumps = debugCapture.CaptureThreadDumps
        }
    };

    return Results.Ok(payload);
}
|
||||
|
||||
// Manually activates incident mode for a run.
// - runId: route value; blank yields 400.
// - request: optional body; a missing body activates at Medium with a default reason.
// - tenantId: taken from the X-Tenant-ID header, falling back to "default".
// Responds 400 for an invalid level, a non-positive duration, or a failed activation;
// otherwise 200 with the resulting level and timestamps.
async Task<IResult> HandleActivateIncidentMode(
    string runId,
    [FromBody] ActivateIncidentModeRequest? request,
    [FromHeader(Name = "X-Tenant-ID")] string? tenantId,
    IPackRunIncidentModeService incidentModeService,
    CancellationToken cancellationToken)
{
    if (string.IsNullOrWhiteSpace(runId))
    {
        return Results.BadRequest(new { error = "runId is required." });
    }

    // Level is optional: an absent or blank value falls back to Medium. A value that was
    // supplied but does not name a defined level is rejected instead of being silently
    // replaced by Medium. Enum.TryParse also succeeds for arbitrary numeric strings,
    // hence the IsDefined check.
    var level = IncidentEscalationLevel.Medium;
    if (request?.Level is { } requestedLevel && !string.IsNullOrWhiteSpace(requestedLevel))
    {
        if (!Enum.TryParse<IncidentEscalationLevel>(requestedLevel, ignoreCase: true, out level)
            || !Enum.IsDefined(level))
        {
            return Results.BadRequest(new { error = $"Invalid escalation level: {requestedLevel}" });
        }
    }

    // A zero or negative duration cannot describe a meaningful activation window;
    // null still means "indefinite".
    if (request?.DurationMinutes is <= 0)
    {
        return Results.BadRequest(new { error = "durationMinutes must be a positive number of minutes." });
    }

    var activationRequest = new IncidentModeActivationRequest(
        RunId: runId,
        TenantId: tenantId ?? "default",
        Level: level,
        Source: StellaOps.TaskRunner.Core.IncidentMode.IncidentModeSource.Manual,
        Reason: request?.Reason ?? "Manual activation via API",
        DurationMinutes: request?.DurationMinutes,
        RequestedBy: request?.RequestedBy);

    var result = await incidentModeService.ActivateAsync(activationRequest, cancellationToken).ConfigureAwait(false);

    if (!result.Success)
    {
        return Results.BadRequest(new { error = result.Error });
    }

    return Results.Ok(new
    {
        success = result.Success,
        active = result.Status.Active,
        level = result.Status.Level.ToString().ToLowerInvariant(),
        activatedAt = result.Status.ActivatedAt?.ToString("O"),
        expiresAt = result.Status.ExpiresAt?.ToString("O")
    });
}
|
||||
|
||||
// Deactivates incident mode for a run.
// - runId: route value; blank yields 400.
// - request: optional body carrying the deactivation reason.
// Responds 400 when the service reports failure (matching the activate/escalate handlers),
// otherwise 200 with the resulting active flag.
async Task<IResult> HandleDeactivateIncidentMode(
    string runId,
    [FromBody] DeactivateIncidentModeRequest? request,
    IPackRunIncidentModeService incidentModeService,
    CancellationToken cancellationToken)
{
    if (string.IsNullOrWhiteSpace(runId))
    {
        return Results.BadRequest(new { error = "runId is required." });
    }

    var result = await incidentModeService.DeactivateAsync(runId, request?.Reason, cancellationToken)
        .ConfigureAwait(false);

    // Do not report a failed deactivation as 200 OK.
    if (!result.Success)
    {
        return Results.BadRequest(new { error = result.Error });
    }

    return Results.Ok(new
    {
        success = result.Success,
        active = result.Status.Active
    });
}
|
||||
|
||||
// Escalates an active incident mode to a higher level.
// - runId: route value; blank yields 400.
// - request: body with the target level (required) and an optional reason.
// Responds 400 for a missing or invalid level, or when the service rejects the escalation;
// otherwise 200 with the new level.
async Task<IResult> HandleEscalateIncidentMode(
    string runId,
    [FromBody] EscalateIncidentModeRequest? request,
    IPackRunIncidentModeService incidentModeService,
    CancellationToken cancellationToken)
{
    if (string.IsNullOrWhiteSpace(runId))
    {
        return Results.BadRequest(new { error = "runId is required." });
    }

    if (request is null || string.IsNullOrWhiteSpace(request.Level))
    {
        return Results.BadRequest(new { error = "Level is required for escalation." });
    }

    // Enum.TryParse succeeds for any numeric string (e.g. "99"), yielding a value that is not
    // a declared level; IsDefined rejects those so they never reach the service.
    if (!Enum.TryParse<IncidentEscalationLevel>(request.Level, ignoreCase: true, out var newLevel)
        || !Enum.IsDefined(newLevel))
    {
        return Results.BadRequest(new { error = $"Invalid escalation level: {request.Level}" });
    }

    var result = await incidentModeService.EscalateAsync(runId, newLevel, request.Reason, cancellationToken)
        .ConfigureAwait(false);

    if (!result.Success)
    {
        return Results.BadRequest(new { error = result.Error });
    }

    return Results.Ok(new
    {
        success = result.Success,
        level = result.Status.Level.ToString().ToLowerInvariant()
    });
}
|
||||
|
||||
// Webhook receiver for SLO breach notifications: forwards the notification to the incident
// mode service and reports the outcome.
// Responds 400 when the body is missing or the service reports failure; otherwise 200 with
// the affected run (the notification's ResourceId), the resulting level and activation time.
async Task<IResult> HandleSloBreachWebhook(
    [FromBody] SloBreachNotification notification,
    IPackRunIncidentModeService incidentModeService,
    CancellationToken cancellationToken)
{
    if (notification is null)
    {
        return Results.BadRequest(new { error = "Notification body is required." });
    }

    var outcome = await incidentModeService
        .HandleSloBreachAsync(notification, cancellationToken)
        .ConfigureAwait(false);

    if (outcome.Success)
    {
        var incidentStatus = outcome.Status;
        return Results.Ok(new
        {
            success = outcome.Success,
            runId = notification.ResourceId,
            level = incidentStatus.Level.ToString().ToLowerInvariant(),
            activatedAt = incidentStatus.ActivatedAt?.ToString("O")
        });
    }

    return Results.BadRequest(new { error = outcome.Error });
}
|
||||
|
||||
app.Run();
|
||||
|
||||
static IDictionary<string, JsonNode?>? ConvertInputs(JsonObject? node)
|
||||
@@ -712,6 +902,17 @@ internal sealed record VerifyAttestationRequest(
|
||||
|
||||
internal sealed record VerifyAttestationSubject(string Name, IReadOnlyDictionary<string, string>? Digest);
|
||||
|
||||
// Incident mode API request models (TASKRUN-OBS-55-001)

/// <summary>Body of the "activate incident mode" endpoints; every field is optional.</summary>
/// <param name="Level">Escalation level name, parsed case-insensitively by the handler.</param>
/// <param name="Reason">Reason for activation.</param>
/// <param name="DurationMinutes">Duration in minutes (null for indefinite).</param>
/// <param name="RequestedBy">Operator or system that requested activation.</param>
internal sealed record ActivateIncidentModeRequest(
    string? Level,
    string? Reason,
    int? DurationMinutes,
    string? RequestedBy);

/// <summary>Body of the "deactivate incident mode" endpoints.</summary>
/// <param name="Reason">Optional reason for deactivation.</param>
internal sealed record DeactivateIncidentModeRequest(string? Reason);

/// <summary>Body of the "escalate incident mode" endpoints.</summary>
/// <param name="Level">
/// Target escalation level name. Required by the handler, but declared nullable because a
/// JSON body may omit it; the handler answers 400 in that case.
/// </param>
/// <param name="Reason">Optional reason for escalation.</param>
internal sealed record EscalateIncidentModeRequest(string? Level, string? Reason);
|
||||
|
||||
internal sealed record SimulationResponse(
|
||||
string PlanHash,
|
||||
FailurePolicyResponse FailurePolicy,
|
||||
|
||||
Reference in New Issue
Block a user