Implement incident mode management service and models
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
- Added IPackRunIncidentModeService interface for managing incident mode activation, deactivation, and status retrieval.
- Created PackRunIncidentModeService class implementing the service interface with methods for activating, deactivating, and escalating incident modes.
- Introduced incident mode status model (PackRunIncidentModeStatus) and related enums for escalation levels and activation sources.
- Developed retention policy, telemetry settings, and debug capture settings models to manage incident mode configurations.
- Implemented SLO breach notification handling to activate incident mode based on severity.
- Added in-memory store (InMemoryPackRunIncidentModeStore) for testing purposes.
- Created comprehensive unit tests for incident mode service, covering activation, deactivation, status retrieval, and SLO breach handling.
This commit is contained in:
@@ -0,0 +1,396 @@
|
||||
using Microsoft.Extensions.Logging.Abstractions;
|
||||
using Microsoft.Extensions.Time.Testing;
|
||||
using StellaOps.TaskRunner.Core.Events;
|
||||
using StellaOps.TaskRunner.Core.IncidentMode;
|
||||
|
||||
namespace StellaOps.TaskRunner.Tests;
|
||||
|
||||
public sealed class PackRunIncidentModeTests
|
||||
{
|
||||
/// <summary>
/// Activates incident mode with a 60-minute duration and checks that the returned
/// status is active, echoes the requested level and source, and carries both an
/// activation timestamp and an expiry timestamp.
/// </summary>
[Fact]
public async Task ActivateAsync_ActivatesIncidentModeSuccessfully()
{
    // Arrange
    var cancellationToken = TestContext.Current.CancellationToken;
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);
    var activation = new IncidentModeActivationRequest(
        RunId: "run-001",
        TenantId: "tenant-1",
        Level: IncidentEscalationLevel.Medium,
        Source: IncidentModeSource.Manual,
        Reason: "Debugging production issue",
        DurationMinutes: 60,
        RequestedBy: "admin@example.com");

    // Act
    var outcome = await sut.ActivateAsync(activation, cancellationToken);

    // Assert
    Assert.True(outcome.Success);

    var status = outcome.Status;
    Assert.True(status.Active);
    Assert.Equal(IncidentEscalationLevel.Medium, status.Level);
    Assert.Equal(IncidentModeSource.Manual, status.Source);
    Assert.NotNull(status.ActivatedAt);
    Assert.NotNull(status.ExpiresAt);
}
|
||||
|
||||
/// <summary>
/// Activates incident mode with no duration (<c>DurationMinutes: null</c>) and checks
/// that the resulting status has no expiry timestamp.
/// </summary>
[Fact]
public async Task ActivateAsync_WithoutDuration_CreatesIndefiniteIncidentMode()
{
    // Arrange
    var cancellationToken = TestContext.Current.CancellationToken;
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);
    var openEndedRequest = new IncidentModeActivationRequest(
        RunId: "run-002",
        TenantId: "tenant-1",
        Level: IncidentEscalationLevel.High,
        Source: IncidentModeSource.Manual,
        Reason: "Critical investigation",
        DurationMinutes: null,
        RequestedBy: null);

    // Act
    var outcome = await sut.ActivateAsync(openEndedRequest, cancellationToken);

    // Assert
    Assert.True(outcome.Success);
    Assert.Null(outcome.Status.ExpiresAt);
}
|
||||
|
||||
/// <summary>
/// Wires the service to a timeline event emitter backed by an in-memory sink and
/// checks that a single activation records exactly one event of type
/// <c>PackRunIncidentEventTypes.IncidentModeActivated</c>.
/// </summary>
[Fact]
public async Task ActivateAsync_EmitsTimelineEvent()
{
    // Arrange: the sink captures whatever the emitter publishes.
    var cancellationToken = TestContext.Current.CancellationToken;
    var sink = new InMemoryPackRunTimelineEventSink();
    var timelineEmitter = new PackRunTimelineEventEmitter(
        sink,
        TimeProvider.System,
        NullLogger<PackRunTimelineEventEmitter>.Instance);
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance,
        null,
        timelineEmitter);
    var activation = new IncidentModeActivationRequest(
        RunId: "run-003",
        TenantId: "tenant-1",
        Level: IncidentEscalationLevel.Low,
        Source: IncidentModeSource.Manual,
        Reason: "Test",
        DurationMinutes: 30,
        RequestedBy: null);

    // Act
    await sut.ActivateAsync(activation, cancellationToken);

    // Assert
    Assert.Equal(1, sink.Count);

    var recorded = sink.GetEvents()[0];
    Assert.Equal(PackRunIncidentEventTypes.IncidentModeActivated, recorded.EventType);
}
|
||||
|
||||
/// <summary>
/// Activates incident mode for a run, deactivates it, and checks that both the
/// deactivation result and a subsequent status lookup report the run as inactive.
/// </summary>
[Fact]
public async Task DeactivateAsync_DeactivatesIncidentMode()
{
    // Arrange: the run must be in incident mode before it can be deactivated.
    const string runId = "run-004";
    var cancellationToken = TestContext.Current.CancellationToken;
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    await sut.ActivateAsync(
        new IncidentModeActivationRequest(
            RunId: runId,
            TenantId: "tenant-1",
            Level: IncidentEscalationLevel.Medium,
            Source: IncidentModeSource.Manual,
            Reason: "Test",
            DurationMinutes: null,
            RequestedBy: null),
        cancellationToken);

    // Act
    var deactivation = await sut.DeactivateAsync(runId, "Issue resolved", cancellationToken);

    // Assert: the immediate result and the stored status must agree.
    Assert.True(deactivation.Success);
    Assert.False(deactivation.Status.Active);

    var storedStatus = await sut.GetStatusAsync(runId, cancellationToken);
    Assert.False(storedStatus.Active);
}
|
||||
|
||||
/// <summary>
/// Looks up the status of a run id that was never activated and checks that the
/// status is inactive with escalation level <c>None</c>.
/// </summary>
[Fact]
public async Task GetStatusAsync_ReturnsInactiveForUnknownRun()
{
    // Arrange: a fresh store with nothing recorded.
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    // Act
    var lookup = await sut.GetStatusAsync("unknown-run", TestContext.Current.CancellationToken);

    // Assert
    Assert.False(lookup.Active);
    Assert.Equal(IncidentEscalationLevel.None, lookup.Level);
}
|
||||
|
||||
/// <summary>
/// Activates incident mode with a 30-minute duration against a fake clock, advances
/// the clock by 31 minutes, and checks that the status lookup then reports the run
/// as inactive.
/// </summary>
[Fact]
public async Task GetStatusAsync_AutoDeactivatesExpiredIncidentMode()
{
    // Arrange
    var cancellationToken = TestContext.Current.CancellationToken;
    var store = new InMemoryPackRunIncidentModeStore();

    // Seed the fake clock with a fixed instant so the test never depends on the wall clock.
    var fakeTime = new FakeTimeProvider(new DateTimeOffset(2025, 1, 1, 0, 0, 0, TimeSpan.Zero));
    var service = new PackRunIncidentModeService(
        store,
        NullLogger<PackRunIncidentModeService>.Instance,
        fakeTime);

    var request = new IncidentModeActivationRequest(
        RunId: "run-005",
        TenantId: "tenant-1",
        Level: IncidentEscalationLevel.Medium,
        Source: IncidentModeSource.Manual,
        Reason: "Test",
        DurationMinutes: 30,
        RequestedBy: null);

    var activation = await service.ActivateAsync(request, cancellationToken);

    // Precondition: the run really was active before time moved. Without this the
    // final assertion would pass vacuously, because a run that was never activated
    // is also reported as inactive (see GetStatusAsync_ReturnsInactiveForUnknownRun).
    Assert.True(activation.Success);
    Assert.True(activation.Status.Active);

    // Act: advance time past the 30-minute expiration.
    fakeTime.Advance(TimeSpan.FromMinutes(31));
    var status = await service.GetStatusAsync("run-005", cancellationToken);

    // Assert
    Assert.False(status.Active);
}
|
||||
|
||||
/// <summary>
/// Feeds a "HIGH" severity SLO breach for a run into the service and checks that
/// incident mode becomes active at level <c>High</c>, with source <c>SloBreach</c>
/// and an activation reason that mentions the SLO name.
/// </summary>
[Fact]
public async Task HandleSloBreachAsync_ActivatesIncidentModeFromBreach()
{
    // Arrange
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);
    var breachContext = new Dictionary<string, string> { ["step"] = "scan" };
    var notification = new SloBreachNotification(
        BreachId: "breach-001",
        SloName: "error_rate_5m",
        Severity: "HIGH",
        OccurredAt: DateTimeOffset.UtcNow,
        CurrentValue: 15.5,
        Threshold: 5.0,
        Target: 1.0,
        ResourceId: "run-006",
        TenantId: "tenant-1",
        Context: breachContext);

    // Act
    var outcome = await sut.HandleSloBreachAsync(notification, TestContext.Current.CancellationToken);

    // Assert
    Assert.True(outcome.Success);

    var status = outcome.Status;
    Assert.True(status.Active);
    Assert.Equal(IncidentEscalationLevel.High, status.Level);
    Assert.Equal(IncidentModeSource.SloBreach, status.Source);
    Assert.Contains("error_rate_5m", status.ActivationReason!);
}
|
||||
|
||||
/// <summary>
/// Sends one SLO breach per severity string ("CRITICAL", "HIGH", "MEDIUM", "LOW"),
/// each against its own run id, and checks that the resulting incident level is the
/// matching <c>IncidentEscalationLevel</c> member.
/// </summary>
[Fact]
public async Task HandleSloBreachAsync_MapsSeverityToLevel()
{
    // Arrange
    var cancellationToken = TestContext.Current.CancellationToken;
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    var cases = new[]
    {
        (Severity: "CRITICAL", Expected: IncidentEscalationLevel.Critical),
        (Severity: "HIGH", Expected: IncidentEscalationLevel.High),
        (Severity: "MEDIUM", Expected: IncidentEscalationLevel.Medium),
        (Severity: "LOW", Expected: IncidentEscalationLevel.Low),
    };

    for (var index = 0; index < cases.Length; index++)
    {
        var (severity, expectedLevel) = cases[index];

        // The case index keeps breach and run ids unique, so every severity is
        // handled against its own run.
        var notification = new SloBreachNotification(
            BreachId: $"breach-{index}",
            SloName: "test_slo",
            Severity: severity,
            OccurredAt: DateTimeOffset.UtcNow,
            CurrentValue: 10.0,
            Threshold: 5.0,
            Target: 1.0,
            ResourceId: $"run-severity-{index}",
            TenantId: "tenant-1",
            Context: null);

        // Act
        var outcome = await sut.HandleSloBreachAsync(notification, cancellationToken);

        // Assert
        Assert.True(outcome.Success);
        Assert.Equal(expectedLevel, outcome.Status.Level);
    }
}
|
||||
|
||||
/// <summary>
/// Sends an SLO breach whose <c>ResourceId</c> is null and checks that handling
/// fails with an error message containing "No resource ID".
/// </summary>
[Fact]
public async Task HandleSloBreachAsync_ReturnsErrorForMissingResourceId()
{
    // Arrange
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);
    var orphanBreach = new SloBreachNotification(
        BreachId: "breach-no-resource",
        SloName: "test_slo",
        Severity: "HIGH",
        OccurredAt: DateTimeOffset.UtcNow,
        CurrentValue: 10.0,
        Threshold: 5.0,
        Target: 1.0,
        ResourceId: null,
        TenantId: "tenant-1",
        Context: null);

    // Act
    var outcome = await sut.HandleSloBreachAsync(orphanBreach, TestContext.Current.CancellationToken);

    // Assert
    Assert.False(outcome.Success);
    Assert.Contains("No resource ID", outcome.Error);
}
|
||||
|
||||
/// <summary>
/// Activates incident mode at level <c>Low</c>, escalates it to <c>High</c>, and
/// checks that the escalation succeeds, the status reports <c>High</c>, and the
/// activation reason contains "Escalated".
/// </summary>
[Fact]
public async Task EscalateAsync_IncreasesEscalationLevel()
{
    // Arrange: start from the lowest non-None level used in these tests.
    const string runId = "run-escalate";
    var cancellationToken = TestContext.Current.CancellationToken;
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    await sut.ActivateAsync(
        new IncidentModeActivationRequest(
            RunId: runId,
            TenantId: "tenant-1",
            Level: IncidentEscalationLevel.Low,
            Source: IncidentModeSource.Manual,
            Reason: "Initial activation",
            DurationMinutes: null,
            RequestedBy: null),
        cancellationToken);

    // Act
    var escalation = await sut.EscalateAsync(
        runId,
        IncidentEscalationLevel.High,
        "Issue is more severe than expected",
        cancellationToken);

    // Assert
    Assert.True(escalation.Success);
    Assert.Equal(IncidentEscalationLevel.High, escalation.Status.Level);
    Assert.Contains("Escalated", escalation.Status.ActivationReason);
}
|
||||
|
||||
/// <summary>
/// Attempts to escalate a run that was never put into incident mode and checks that
/// the call fails with an error message containing "not active".
/// </summary>
[Fact]
public async Task EscalateAsync_FailsWhenNotInIncidentMode()
{
    // Arrange: nothing is activated beforehand.
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    // Act
    var escalation = await sut.EscalateAsync(
        "unknown-run",
        IncidentEscalationLevel.High,
        null,
        TestContext.Current.CancellationToken);

    // Assert
    Assert.False(escalation.Success);
    Assert.Contains("not active", escalation.Error);
}
|
||||
|
||||
/// <summary>
/// Activates incident mode at level <c>High</c>, then requests "escalation" to
/// <c>Medium</c>, and checks that the call fails with an error message containing
/// "Cannot escalate".
/// </summary>
/// <remarks>
/// NOTE(review): only the lower-level case is exercised here; the equal-level case
/// implied by the test name is not covered by this test.
/// </remarks>
[Fact]
public async Task EscalateAsync_FailsWhenNewLevelIsLowerOrEqual()
{
    // Arrange
    const string runId = "run-no-deescalate";
    var cancellationToken = TestContext.Current.CancellationToken;
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    await sut.ActivateAsync(
        new IncidentModeActivationRequest(
            RunId: runId,
            TenantId: "tenant-1",
            Level: IncidentEscalationLevel.High,
            Source: IncidentModeSource.Manual,
            Reason: "Test",
            DurationMinutes: null,
            RequestedBy: null),
        cancellationToken);

    // Act: Medium is lower than the current High level.
    var escalation = await sut.EscalateAsync(
        runId,
        IncidentEscalationLevel.Medium,
        null,
        cancellationToken);

    // Assert
    Assert.False(escalation.Success);
    Assert.Contains("Cannot escalate", escalation.Error);
}
|
||||
|
||||
/// <summary>
/// Checks the settings returned for the two extreme escalation levels: <c>None</c>
/// has enhanced telemetry and debug capture switched off, while <c>Critical</c>
/// enables both, uses Debug log verbosity, a trace sampling rate of 1.0, heap dump
/// capture, and 365 days of log retention.
/// </summary>
[Fact]
public void GetSettingsForLevel_ReturnsCorrectSettings()
{
    // Arrange
    var sut = new PackRunIncidentModeService(
        new InMemoryPackRunIncidentModeStore(),
        NullLogger<PackRunIncidentModeService>.Instance);

    // Act + Assert: None level
    var baseline = sut.GetSettingsForLevel(IncidentEscalationLevel.None);
    Assert.False(baseline.TelemetrySettings.EnhancedTelemetryActive);
    Assert.False(baseline.DebugCaptureSettings.CaptureActive);

    // Act + Assert: Critical level
    var critical = sut.GetSettingsForLevel(IncidentEscalationLevel.Critical);

    var telemetry = critical.TelemetrySettings;
    Assert.True(telemetry.EnhancedTelemetryActive);
    Assert.Equal(IncidentLogVerbosity.Debug, telemetry.LogVerbosity);
    Assert.Equal(1.0, telemetry.TraceSamplingRate);

    var debugCapture = critical.DebugCaptureSettings;
    Assert.True(debugCapture.CaptureActive);
    Assert.True(debugCapture.CaptureHeapDumps);

    Assert.Equal(365, critical.RetentionPolicy.LogRetentionDays);
}
|
||||
|
||||
/// <summary>
/// Checks that <c>PackRunIncidentModeStatus.Inactive()</c> yields a status that is
/// not active, has level and source <c>None</c>, has no activation timestamp or
/// reason, and has extended retention, enhanced telemetry, and debug capture all off.
/// </summary>
[Fact]
public void PackRunIncidentModeStatus_Inactive_ReturnsDefaultValues()
{
    // Act
    var status = PackRunIncidentModeStatus.Inactive();

    // Assert: core state
    Assert.False(status.Active);
    Assert.Equal(IncidentEscalationLevel.None, status.Level);
    Assert.Null(status.ActivatedAt);
    Assert.Null(status.ActivationReason);
    Assert.Equal(IncidentModeSource.None, status.Source);

    // Assert: every nested settings object is in its "off" state
    Assert.False(status.RetentionPolicy.ExtendedRetentionActive);
    Assert.False(status.TelemetrySettings.EnhancedTelemetryActive);
    Assert.False(status.DebugCaptureSettings.CaptureActive);
}
|
||||
|
||||
/// <summary>
/// Compares <c>IncidentRetentionPolicy.Extended()</c> with
/// <c>IncidentRetentionPolicy.Default()</c>: the extended policy must be flagged as
/// active and keep both logs and artifacts for strictly more days.
/// </summary>
[Fact]
public void IncidentRetentionPolicy_Extended_HasLongerRetention()
{
    // Act
    var baseline = IncidentRetentionPolicy.Default();
    var extended = IncidentRetentionPolicy.Extended();

    // Assert
    Assert.True(extended.ExtendedRetentionActive);
    Assert.True(baseline.LogRetentionDays < extended.LogRetentionDays);
    Assert.True(baseline.ArtifactRetentionDays < extended.ArtifactRetentionDays);
}
|
||||
|
||||
/// <summary>
/// Compares <c>IncidentTelemetrySettings.Enhanced()</c> with
/// <c>IncidentTelemetrySettings.Default()</c>: the enhanced settings must be flagged
/// as active, sample traces at a strictly higher rate, and capture both the
/// environment and step I/O.
/// </summary>
[Fact]
public void IncidentTelemetrySettings_Enhanced_HasHigherSampling()
{
    // Act
    var baseline = IncidentTelemetrySettings.Default();
    var enhanced = IncidentTelemetrySettings.Enhanced();

    // Assert
    Assert.True(enhanced.EnhancedTelemetryActive);
    Assert.True(baseline.TraceSamplingRate < enhanced.TraceSamplingRate);
    Assert.True(enhanced.CaptureEnvironment);
    Assert.True(enhanced.CaptureStepIo);
}
|
||||
}
|
||||
Reference in New Issue
Block a user