Partially implemented and unimplemented features are now implemented
This commit is contained in:
@@ -0,0 +1,327 @@
|
||||
using StellaOps.Telemetry.Core;
|
||||
using StellaOps.TestKit;
|
||||
|
||||
namespace StellaOps.Telemetry.Core.Tests;
|
||||
|
||||
public sealed class DoraMetricsServiceTests : IDisposable
|
||||
{
|
||||
// Metrics sink handed to the service under test; disposed in Dispose().
private readonly DoraMetrics _metrics;

// In-memory service exercised by every test in this class.
private readonly InMemoryDoraMetricsService _service;

/// <summary>
/// Builds a default <see cref="DoraMetrics"/> and an in-memory service that wraps it.
/// </summary>
public DoraMetricsServiceTests()
{
    var metrics = new DoraMetrics();

    _metrics = metrics;
    _service = new InMemoryDoraMetricsService(metrics);
}

/// <summary>
/// Releases the metrics instance created by the constructor.
/// </summary>
public void Dispose()
{
    _metrics.Dispose();
}
|
||||
|
||||
/// <summary>
/// Records a single successful deployment and verifies that a query for tenant "acme"
/// over a window of one day either side of now returns exactly that deployment.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task RecordDeploymentAsync_StoresDeployment()
{
    // Arrange
    const string deploymentId = "deploy-001";
    DoraDeploymentEvent recorded = CreateDeployment(deploymentId, DoraDeploymentOutcome.Success);

    // Act
    await _service.RecordDeploymentAsync(recorded);

    var windowStart = DateTimeOffset.UtcNow.AddDays(-1);
    var windowEnd = DateTimeOffset.UtcNow.AddDays(1);
    var stored = await _service
        .GetDeploymentsAsync("acme", null, windowStart, windowEnd)
        .ToListAsync();

    // Assert
    var only = Assert.Single(stored);
    Assert.Equal(deploymentId, only.DeploymentId);
}
|
||||
|
||||
/// <summary>
/// Records an unresolved incident and verifies it is returned, still open, for tenant "acme".
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task RecordIncidentAsync_StoresIncident()
{
    // Arrange
    const string incidentId = "inc-001";
    DoraIncidentEvent opened = CreateIncident(incidentId, isResolved: false);

    // Act
    await _service.RecordIncidentAsync(opened);

    var windowStart = DateTimeOffset.UtcNow.AddDays(-1);
    var windowEnd = DateTimeOffset.UtcNow.AddDays(1);
    var stored = await _service
        .GetIncidentsAsync("acme", null, windowStart, windowEnd)
        .ToListAsync();

    // Assert
    var only = Assert.Single(stored);
    Assert.Equal(incidentId, only.IncidentId);
    Assert.True(only.IsOpen);
}
|
||||
|
||||
/// <summary>
/// Records an open incident, resolves it, and verifies the stored incident is closed
/// and carries the resolution timestamp that was supplied.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task ResolveIncidentAsync_UpdatesIncident()
{
    // Arrange
    const string incidentId = "inc-002";
    await _service.RecordIncidentAsync(CreateIncident(incidentId, isResolved: false));

    // Act
    var resolvedAt = DateTimeOffset.UtcNow;
    await _service.ResolveIncidentAsync("acme", incidentId, resolvedAt);

    var windowStart = DateTimeOffset.UtcNow.AddDays(-1);
    var windowEnd = DateTimeOffset.UtcNow.AddDays(1);
    var stored = await _service
        .GetIncidentsAsync("acme", null, windowStart, windowEnd)
        .ToListAsync();

    // Assert
    var only = Assert.Single(stored);
    Assert.False(only.IsOpen);
    Assert.Equal(resolvedAt, only.ResolvedAt);
}
|
||||
|
||||
/// <summary>
/// Seeds one successful deployment per day for the ten days before now and verifies the
/// summary reports ten deployments at a frequency of one per day.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetSummaryAsync_CalculatesDeploymentFrequency()
{
    // Arrange: deployments at now-10d, now-9d, ..., now-1d, each committed one hour earlier.
    var now = DateTimeOffset.UtcNow;
    var dayIndex = 0;
    while (dayIndex < 10)
    {
        var deployedAt = now.AddDays(-10 + dayIndex);

        await _service.RecordDeploymentAsync(new DoraDeploymentEvent(
            DeploymentId: $"deploy-{dayIndex:000}",
            TenantId: "acme",
            Environment: "production",
            CommitSha: $"sha{dayIndex}",
            CommitTimestamp: deployedAt.AddHours(-1),
            DeploymentTimestamp: deployedAt,
            Outcome: DoraDeploymentOutcome.Success,
            DurationMs: 60000));

        dayIndex++;
    }

    // Act
    var summary = await _service.GetSummaryAsync("acme", null, now.AddDays(-10), now);

    // Assert
    Assert.Equal(10, summary.DeploymentCount);
    Assert.Equal(1.0, summary.DeploymentFrequencyPerDay, precision: 1);
}
|
||||
|
||||
/// <summary>
/// Seeds seven successful deployments and three rollbacks and verifies the summary
/// reports a 30% change failure rate.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetSummaryAsync_CalculatesChangeFailureRate()
{
    // Arrange: the window is anchored before seeding so every deployment falls inside it.
    var now = DateTimeOffset.UtcNow;

    for (var ok = 0; ok < 7; ok++)
    {
        var succeeded = CreateDeployment($"success-{ok}", DoraDeploymentOutcome.Success);
        await _service.RecordDeploymentAsync(succeeded);
    }

    for (var bad = 0; bad < 3; bad++)
    {
        var rolledBack = CreateDeployment($"rollback-{bad}", DoraDeploymentOutcome.Rollback);
        await _service.RecordDeploymentAsync(rolledBack);
    }

    // Act
    var windowStart = now.AddDays(-1);
    var windowEnd = now.AddDays(1);
    var summary = await _service.GetSummaryAsync("acme", null, windowStart, windowEnd);

    // Assert: 3 failures out of 10 deployments.
    Assert.Equal(10, summary.DeploymentCount);
    Assert.Equal(7, summary.SuccessfulDeployments);
    Assert.Equal(3, summary.FailedDeployments);
    Assert.Equal(30.0, summary.ChangeFailureRatePercent, precision: 1);
}
|
||||
|
||||
/// <summary>
/// Seeds five deployments with lead times of 1h through 5h and verifies the summary
/// reports a median lead time of 3h.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetSummaryAsync_CalculatesMedianLeadTime()
{
    // Arrange: every deployment lands at the same instant; only the commit time varies.
    var deployedAt = DateTimeOffset.UtcNow;

    for (var leadHours = 1; leadHours <= 5; leadHours++)
    {
        await _service.RecordDeploymentAsync(new DoraDeploymentEvent(
            DeploymentId: $"deploy-{leadHours}",
            TenantId: "acme",
            Environment: "production",
            CommitSha: $"sha{leadHours}",
            CommitTimestamp: deployedAt.AddHours(-leadHours),
            DeploymentTimestamp: deployedAt,
            Outcome: DoraDeploymentOutcome.Success,
            DurationMs: 30000));
    }

    // Act
    var windowStart = deployedAt.AddDays(-1);
    var windowEnd = deployedAt.AddDays(1);
    var summary = await _service.GetSummaryAsync("acme", null, windowStart, windowEnd);

    // Assert
    Assert.Equal(3.0, summary.MedianLeadTimeHours, precision: 1);
}
|
||||
|
||||
/// <summary>
/// Seeds three resolved incidents with recovery times of 1h, 2h and 3h and verifies the
/// summary reports a mean time to recovery of 2h.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetSummaryAsync_CalculatesMTTR()
{
    // Arrange: all incidents are resolved at the same instant, one hour before now.
    var now = DateTimeOffset.UtcNow;
    var resolvedAt = now.AddHours(-1);

    for (var recoveryHours = 1; recoveryHours <= 3; recoveryHours++)
    {
        await _service.RecordIncidentAsync(new DoraIncidentEvent(
            IncidentId: $"inc-{recoveryHours}",
            TenantId: "acme",
            Environment: "production",
            Severity: DoraIncidentSeverity.High,
            StartedAt: now.AddHours(-recoveryHours - 1),
            ResolvedAt: resolvedAt));
    }

    // Act
    var windowStart = now.AddDays(-1);
    var windowEnd = now.AddDays(1);
    var summary = await _service.GetSummaryAsync("acme", null, windowStart, windowEnd);

    // Assert
    Assert.Equal(2.0, summary.MeanTimeToRecoveryHours, precision: 1);
}
|
||||
|
||||
/// <summary>
/// Seeds thirty daily successful deployments with a 30-minute lead time plus one incident
/// resolved after 30 minutes, and verifies the summary is classified as Elite.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetSummaryAsync_ClassifiesPerformanceLevel()
{
    // Arrange: daily deployments, short lead time, no failed deployments.
    var now = DateTimeOffset.UtcNow;

    for (var dayIndex = 0; dayIndex < 30; dayIndex++)
    {
        var deployedAt = now.AddDays(-30 + dayIndex);

        await _service.RecordDeploymentAsync(new DoraDeploymentEvent(
            DeploymentId: $"deploy-{dayIndex:000}",
            TenantId: "acme",
            Environment: "production",
            CommitSha: $"sha{dayIndex}",
            CommitTimestamp: deployedAt.AddMinutes(-30), // 30 min lead time
            DeploymentTimestamp: deployedAt,
            Outcome: DoraDeploymentOutcome.Success,
            DurationMs: 30000));
    }

    // A single incident that was resolved 30 minutes after it started.
    await _service.RecordIncidentAsync(new DoraIncidentEvent(
        IncidentId: "inc-1",
        TenantId: "acme",
        Environment: "production",
        Severity: DoraIncidentSeverity.High,
        StartedAt: now.AddMinutes(-30),
        ResolvedAt: now));

    // Act
    var summary = await _service.GetSummaryAsync("acme", null, now.AddDays(-30), now);

    // Assert
    Assert.Equal(DoraPerformanceLevel.Elite, summary.PerformanceLevel);
}
|
||||
|
||||
/// <summary>
/// Seeds two production deployments and one staging deployment and verifies that
/// querying with the "production" environment returns only the production ones.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetDeploymentsAsync_FiltersbyEnvironment()
{
    // Arrange
    var seeded = new[]
    {
        (Id: "prod-1", Environment: "production"),
        (Id: "stage-1", Environment: "staging"),
        (Id: "prod-2", Environment: "production"),
    };

    foreach (var (id, environment) in seeded)
    {
        await _service.RecordDeploymentAsync(
            CreateDeployment(id, DoraDeploymentOutcome.Success, environment));
    }

    // Act
    var windowStart = DateTimeOffset.UtcNow.AddDays(-1);
    var windowEnd = DateTimeOffset.UtcNow.AddDays(1);
    var prodDeployments = await _service
        .GetDeploymentsAsync("acme", "production", windowStart, windowEnd)
        .ToListAsync();

    // Assert
    Assert.Equal(2, prodDeployments.Count);
    Assert.All(prodDeployments, d => Assert.Equal("production", d.Environment));
}
|
||||
|
||||
/// <summary>
/// Seeds two open incidents and one resolved incident and verifies that querying with
/// <c>includeOpen: false</c> returns only the resolved one.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetIncidentsAsync_ExcludesOpenWhenRequested()
{
    // Arrange
    var seeded = new[]
    {
        (Id: "open-1", Resolved: false),
        (Id: "resolved-1", Resolved: true),
        (Id: "open-2", Resolved: false),
    };

    foreach (var (id, resolved) in seeded)
    {
        await _service.RecordIncidentAsync(CreateIncident(id, isResolved: resolved));
    }

    // Act
    var windowStart = DateTimeOffset.UtcNow.AddDays(-1);
    var windowEnd = DateTimeOffset.UtcNow.AddDays(1);
    var resolvedOnly = await _service
        .GetIncidentsAsync("acme", null, windowStart, windowEnd, includeOpen: false)
        .ToListAsync();

    // Assert
    var only = Assert.Single(resolvedOnly);
    Assert.False(only.IsOpen);
}
|
||||
|
||||
/// <summary>
/// Records one deployment for each of two tenants and verifies that querying "tenant1"
/// returns only that tenant's deployment.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task TenantIsolation_DeploymentsIsolatedByTenant()
{
    // Arrange
    const string firstTenantDeploymentId = "tenant1-deploy";

    var forTenant1 = CreateDeployment(firstTenantDeploymentId, DoraDeploymentOutcome.Success, tenant: "tenant1");
    var forTenant2 = CreateDeployment("tenant2-deploy", DoraDeploymentOutcome.Success, tenant: "tenant2");

    await _service.RecordDeploymentAsync(forTenant1);
    await _service.RecordDeploymentAsync(forTenant2);

    // Act
    var windowStart = DateTimeOffset.UtcNow.AddDays(-1);
    var windowEnd = DateTimeOffset.UtcNow.AddDays(1);
    var tenant1Deployments = await _service
        .GetDeploymentsAsync("tenant1", null, windowStart, windowEnd)
        .ToListAsync();

    // Assert
    var only = Assert.Single(tenant1Deployments);
    Assert.Equal(firstTenantDeploymentId, only.DeploymentId);
}
|
||||
|
||||
/// <summary>
/// Builds a deployment event deployed "now" with a commit timestamp one hour earlier.
/// </summary>
/// <param name="id">Deployment identifier; also used to derive the commit SHA.</param>
/// <param name="outcome">Outcome to assign to the deployment.</param>
/// <param name="environment">Target environment; defaults to "production".</param>
/// <param name="tenant">Owning tenant; defaults to "acme".</param>
/// <returns>A new <see cref="DoraDeploymentEvent"/> with a 60000 ms duration.</returns>
private static DoraDeploymentEvent CreateDeployment(
    string id,
    DoraDeploymentOutcome outcome,
    string environment = "production",
    string tenant = "acme") =>
    new(
        DeploymentId: id,
        TenantId: tenant,
        Environment: environment,
        CommitSha: $"sha-{id}",
        CommitTimestamp: DateTimeOffset.UtcNow.AddHours(-1),
        DeploymentTimestamp: DateTimeOffset.UtcNow,
        Outcome: outcome,
        DurationMs: 60000);
|
||||
|
||||
/// <summary>
/// Builds a high-severity production incident that started two hours ago.
/// </summary>
/// <param name="id">Incident identifier.</param>
/// <param name="isResolved">
/// When <c>true</c> the incident is resolved "now"; otherwise it has no resolution time.
/// </param>
/// <param name="tenant">Owning tenant; defaults to "acme".</param>
/// <returns>A new <see cref="DoraIncidentEvent"/>.</returns>
private static DoraIncidentEvent CreateIncident(
    string id,
    bool isResolved,
    string tenant = "acme")
{
    var startedAt = DateTimeOffset.UtcNow.AddHours(-2);

    DateTimeOffset? resolvedAt = null;
    if (isResolved)
    {
        resolvedAt = DateTimeOffset.UtcNow;
    }

    return new DoraIncidentEvent(
        IncidentId: id,
        TenantId: tenant,
        Environment: "production",
        Severity: DoraIncidentSeverity.High,
        StartedAt: startedAt,
        ResolvedAt: resolvedAt);
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Helpers for consuming <see cref="IAsyncEnumerable{T}"/> sequences in tests.
/// </summary>
internal static class AsyncEnumerableExtensions
{
    /// <summary>
    /// Enumerates <paramref name="source"/> to completion and returns its items, in
    /// enumeration order, as a new list.
    /// </summary>
    /// <typeparam name="T">Element type of the sequence.</typeparam>
    /// <param name="source">Sequence to drain.</param>
    /// <param name="cancellationToken">
    /// Token passed to the sequence's enumerator; defaults to a token that is never cancelled.
    /// </param>
    /// <returns>A list holding every item produced by <paramref name="source"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is <c>null</c>.</exception>
    public static async Task<List<T>> ToListAsync<T>(
        this IAsyncEnumerable<T> source,
        System.Threading.CancellationToken cancellationToken = default)
    {
        // Fail with a named argument instead of a NullReferenceException from the enumerator.
        ArgumentNullException.ThrowIfNull(source);

        var items = new List<T>();

        await foreach (var item in source.WithCancellation(cancellationToken))
        {
            items.Add(item);
        }

        return items;
    }
}
|
||||
@@ -0,0 +1,266 @@
|
||||
using System.Diagnostics.Metrics;
|
||||
using StellaOps.Telemetry.Core;
|
||||
using StellaOps.TestKit;
|
||||
|
||||
namespace StellaOps.Telemetry.Core.Tests;
|
||||
|
||||
public sealed class DoraMetricsTests : IDisposable
|
||||
{
|
||||
// Listener that captures every measurement published on the DORA meter.
private readonly MeterListener _listener;

// The listener matches instruments by meter name only, so it also receives measurements from
// any other DoraMetrics instance alive in the process, and callbacks run on whichever thread
// records the measurement. A concurrent queue keeps both appends and the enumeration done by
// the assertions safe if that happens while a test is running.
// NOTE(review): this removes the data race, not the cross-talk itself — measurements recorded
// by other instances can still appear here.
private readonly System.Collections.Concurrent.ConcurrentQueue<RecordedMeasurement> _measurements = new();

/// <summary>
/// Starts a <see cref="MeterListener"/> that records all <c>double</c> and <c>long</c>
/// measurements from instruments whose meter is named <see cref="DoraMetrics.MeterName"/>.
/// </summary>
public DoraMetricsTests()
{
    _listener = new MeterListener
    {
        InstrumentPublished = (instrument, listener) =>
        {
            if (instrument.Meter.Name == DoraMetrics.MeterName)
            {
                listener.EnableMeasurementEvents(instrument);
            }
        },
    };

    // Histograms report double values and counters report long values; both are captured alike.
    _listener.SetMeasurementEventCallback<double>(Capture<double>);
    _listener.SetMeasurementEventCallback<long>(Capture<long>);
    _listener.Start();
}

/// <summary>
/// Stops listening and releases the <see cref="MeterListener"/>.
/// </summary>
public void Dispose() => _listener.Dispose();

/// <summary>
/// Measurement callback: stores the instrument name, the boxed value and a copy of the tags.
/// </summary>
private void Capture<T>(
    Instrument instrument,
    T measurement,
    ReadOnlySpan<KeyValuePair<string, object?>> tags,
    object? state)
    where T : struct
{
    // The tag span is only valid during the callback, so it is copied to an array.
    _measurements.Enqueue(new RecordedMeasurement(instrument.Name, measurement, tags.ToArray()));
}
|
||||
|
||||
/// <summary>
/// Verifies that recording a successful deployment emits the total and success counters
/// (each incremented by one) plus the duration and lead-time histograms.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void RecordDeployment_WithSuccessfulDeployment_RecordsMetrics()
{
    // Arrange
    using var sut = new DoraMetrics();

    // Act
    sut.RecordDeployment(new DoraDeploymentEvent(
        DeploymentId: "deploy-001",
        TenantId: "acme",
        Environment: "production",
        CommitSha: "abc123",
        CommitTimestamp: DateTimeOffset.UtcNow.AddHours(-2),
        DeploymentTimestamp: DateTimeOffset.UtcNow,
        Outcome: DoraDeploymentOutcome.Success,
        DurationMs: 120_000));

    // Assert: counters must have been incremented by exactly one.
    Assert.Contains(_measurements, m => m.Name == "dora_deployments_total" && m.Value is 1L);
    Assert.Contains(_measurements, m => m.Name == "dora_deployment_success_total" && m.Value is 1L);

    // Assert: histograms only need to have produced a sample.
    Assert.Contains(_measurements, m => m.Name == "dora_deployment_duration_seconds");
    Assert.Contains(_measurements, m => m.Name == "dora_lead_time_hours");
}
|
||||
|
||||
/// <summary>
/// Verifies that recording a rolled-back deployment increments both the total and the
/// failure counters by one.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void RecordDeployment_WithRollback_RecordsFailureMetrics()
{
    // Arrange
    using var sut = new DoraMetrics();

    // Act
    sut.RecordDeployment(new DoraDeploymentEvent(
        DeploymentId: "deploy-002",
        TenantId: "acme",
        Environment: "production",
        CommitSha: "def456",
        CommitTimestamp: DateTimeOffset.UtcNow.AddDays(-1),
        DeploymentTimestamp: DateTimeOffset.UtcNow,
        Outcome: DoraDeploymentOutcome.Rollback,
        DurationMs: 60_000));

    // Assert
    Assert.Contains(_measurements, m => m.Name == "dora_deployments_total" && m.Value is 1L);
    Assert.Contains(_measurements, m => m.Name == "dora_deployment_failure_total" && m.Value is 1L);
}
|
||||
|
||||
/// <summary>
/// Verifies that a deployment whose lead time (about 48 hours) exceeds a 1-hour lead-time
/// SLO increments the SLO breach counter by one.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void RecordDeployment_ExceedsLeadTimeSlo_RecordsSloBreak()
{
    // Arrange: tighten the lead-time SLO to a single hour.
    using var sut = new DoraMetrics(new DoraMetricsOptions { LeadTimeSloHours = 1.0 });

    // Act
    sut.RecordDeployment(new DoraDeploymentEvent(
        DeploymentId: "deploy-003",
        TenantId: "acme",
        Environment: "production",
        CommitSha: "ghi789",
        CommitTimestamp: DateTimeOffset.UtcNow.AddDays(-2), // committed 48 hours ago
        DeploymentTimestamp: DateTimeOffset.UtcNow,
        Outcome: DoraDeploymentOutcome.Success,
        DurationMs: 30_000));

    // Assert
    Assert.Contains(_measurements, m => m.Name == "dora_slo_breach_total" && m.Value is 1L);
}
|
||||
|
||||
/// <summary>
/// Verifies that recording the start of an incident increments the incident counter by one.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void RecordIncidentStarted_TracksIncidentCount()
{
    // Arrange
    using var sut = new DoraMetrics();

    // Act
    sut.RecordIncidentStarted(new DoraIncidentEvent(
        IncidentId: "inc-001",
        TenantId: "acme",
        Environment: "production",
        Severity: DoraIncidentSeverity.High,
        StartedAt: DateTimeOffset.UtcNow,
        ResolvedAt: null));

    // Assert
    Assert.Contains(_measurements, m => m.Name == "dora_incidents_total" && m.Value is 1L);
}
|
||||
|
||||
/// <summary>
/// Verifies that recording a resolved incident increments the resolved counter by one and
/// produces a time-to-recovery histogram sample.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void RecordIncidentResolved_TracksTimeToRecovery()
{
    // Arrange
    using var sut = new DoraMetrics();

    // Act
    sut.RecordIncidentResolved(new DoraIncidentEvent(
        IncidentId: "inc-002",
        TenantId: "acme",
        Environment: "production",
        Severity: DoraIncidentSeverity.Critical,
        StartedAt: DateTimeOffset.UtcNow.AddHours(-2),
        ResolvedAt: DateTimeOffset.UtcNow));

    // Assert
    Assert.Contains(_measurements, m => m.Name == "dora_incidents_resolved_total" && m.Value is 1L);
    Assert.Contains(_measurements, m => m.Name == "dora_time_to_recovery_hours");
}
|
||||
|
||||
/// <summary>
/// Verifies that an incident resolved after about two hours, against a 0.5-hour MTTR SLO,
/// increments the SLO breach counter by one.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void RecordIncidentResolved_ExceedsMttrSlo_RecordsSloBreak()
{
    // Arrange: tighten the MTTR SLO to half an hour.
    using var sut = new DoraMetrics(new DoraMetricsOptions { MttrSloHours = 0.5 });

    // Act
    sut.RecordIncidentResolved(new DoraIncidentEvent(
        IncidentId: "inc-003",
        TenantId: "acme",
        Environment: "production",
        Severity: DoraIncidentSeverity.High,
        StartedAt: DateTimeOffset.UtcNow.AddHours(-2),
        ResolvedAt: DateTimeOffset.UtcNow));

    // Assert
    Assert.Contains(_measurements, m => m.Name == "dora_slo_breach_total" && m.Value is 1L);
}
|
||||
|
||||
/// <summary>
/// Verifies <see cref="DoraMetrics.ClassifyPerformance"/> against one representative input
/// per expected performance level, including all-zero input mapping to Unknown.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Theory]
[InlineData(2.0, 12.0, 10.0, 0.5, DoraPerformanceLevel.Elite)]
[InlineData(0.2, 100.0, 20.0, 20.0, DoraPerformanceLevel.High)]
[InlineData(0.05, 2000.0, 40.0, 100.0, DoraPerformanceLevel.Medium)]
[InlineData(0.01, 5000.0, 60.0, 200.0, DoraPerformanceLevel.Low)]
[InlineData(0.0, 0.0, 0.0, 0.0, DoraPerformanceLevel.Unknown)]
public void ClassifyPerformance_ReturnsCorrectLevel(
    double deploymentFrequency,
    double leadTimeHours,
    double cfrPercent,
    double mttrHours,
    DoraPerformanceLevel expectedLevel)
{
    DoraPerformanceLevel classified =
        DoraMetrics.ClassifyPerformance(deploymentFrequency, leadTimeHours, cfrPercent, mttrHours);

    Assert.Equal(expectedLevel, classified);
}
|
||||
|
||||
/// <summary>
/// Verifies that <c>LeadTime</c> is the span between commit and deployment timestamps
/// (10:00 to 14:30 UTC gives 4.5 hours).
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void DoraDeploymentEvent_LeadTime_CalculatesCorrectly()
{
    // Arrange: fixed timestamps keep the expected span exact.
    var committedAt = new DateTimeOffset(2025, 1, 15, 10, 0, 0, TimeSpan.Zero);
    var deployedAt = new DateTimeOffset(2025, 1, 15, 14, 30, 0, TimeSpan.Zero);

    var sut = new DoraDeploymentEvent(
        DeploymentId: "test",
        TenantId: "acme",
        Environment: "prod",
        CommitSha: "abc",
        CommitTimestamp: committedAt,
        DeploymentTimestamp: deployedAt,
        Outcome: DoraDeploymentOutcome.Success,
        DurationMs: 1000);

    // Act
    var leadTime = sut.LeadTime;

    // Assert
    Assert.Equal(TimeSpan.FromHours(4.5), leadTime);
}
|
||||
|
||||
/// <summary>
/// Verifies <c>IsFailure</c> for each deployment outcome: Rollback, Hotfix and Failed are
/// failures; Success and Cancelled are not.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Theory]
[InlineData(DoraDeploymentOutcome.Success, false)]
[InlineData(DoraDeploymentOutcome.Rollback, true)]
[InlineData(DoraDeploymentOutcome.Hotfix, true)]
[InlineData(DoraDeploymentOutcome.Failed, true)]
[InlineData(DoraDeploymentOutcome.Cancelled, false)]
public void DoraDeploymentEvent_IsFailure_ReturnsCorrectValue(
    DoraDeploymentOutcome outcome,
    bool expectedIsFailure)
{
    // Arrange: only the outcome matters here; the other fields are placeholders.
    var sut = new DoraDeploymentEvent(
        DeploymentId: "test",
        TenantId: "acme",
        Environment: "prod",
        CommitSha: "abc",
        CommitTimestamp: DateTimeOffset.UtcNow,
        DeploymentTimestamp: DateTimeOffset.UtcNow,
        Outcome: outcome,
        DurationMs: 1000);

    // Act
    bool isFailure = sut.IsFailure;

    // Assert
    Assert.Equal(expectedIsFailure, isFailure);
}
|
||||
|
||||
/// <summary>
/// Verifies that an incident without a resolution time reports no time to recovery and
/// is considered open.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void DoraIncidentEvent_TimeToRecovery_ReturnsNullWhenOpen()
{
    // Arrange
    var sut = new DoraIncidentEvent(
        IncidentId: "test",
        TenantId: "acme",
        Environment: "prod",
        Severity: DoraIncidentSeverity.High,
        StartedAt: DateTimeOffset.UtcNow,
        ResolvedAt: null);

    // Act
    var timeToRecovery = sut.TimeToRecovery;
    var isOpen = sut.IsOpen;

    // Assert
    Assert.Null(timeToRecovery);
    Assert.True(isOpen);
}
|
||||
|
||||
/// <summary>
/// Verifies that a resolved incident reports the span between start and resolution
/// (10:00 to 11:30 UTC gives 1.5 hours) and is no longer open.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void DoraIncidentEvent_TimeToRecovery_CalculatesWhenResolved()
{
    // Arrange: fixed timestamps keep the expected span exact.
    var startedAt = new DateTimeOffset(2025, 1, 15, 10, 0, 0, TimeSpan.Zero);
    var resolvedAt = new DateTimeOffset(2025, 1, 15, 11, 30, 0, TimeSpan.Zero);

    var sut = new DoraIncidentEvent(
        IncidentId: "test",
        TenantId: "acme",
        Environment: "prod",
        Severity: DoraIncidentSeverity.High,
        StartedAt: startedAt,
        ResolvedAt: resolvedAt);

    // Act
    var timeToRecovery = sut.TimeToRecovery;
    var isOpen = sut.IsOpen;

    // Assert
    Assert.Equal(TimeSpan.FromHours(1.5), timeToRecovery);
    Assert.False(isOpen);
}
|
||||
|
||||
/// <summary>
/// One captured measurement: the instrument name, the boxed measurement value and a copy
/// of the tags supplied with it.
/// </summary>
private sealed record RecordedMeasurement(
    string Name,
    object Value,
    KeyValuePair<string, object?>[] Tags);
|
||||
}
|
||||
@@ -0,0 +1,237 @@
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Telemetry.Core;
|
||||
|
||||
namespace StellaOps.Telemetry.Core.Tests;
|
||||
|
||||
public sealed class OutcomeAnalyticsServiceTests : IDisposable
|
||||
{
|
||||
// Fixed reference instant (2026-02-01 00:00 UTC) that all seeded telemetry is offset from.
private static readonly DateTimeOffset BaseTime = new(2026, 2, 1, 0, 0, 0, TimeSpan.Zero);

// Metrics sink handed to the in-memory DORA service; disposed in Dispose().
private readonly DoraMetrics _metrics;

// Backing store that the analytics service reads from.
private readonly InMemoryDoraMetricsService _doraMetricsService;

// Service under test.
private readonly DoraOutcomeAnalyticsService _outcomeAnalyticsService;

/// <summary>
/// Wires a default <see cref="DoraMetrics"/> into an in-memory DORA service and builds the
/// analytics service on top of it.
/// </summary>
public OutcomeAnalyticsServiceTests()
{
    var metrics = new DoraMetrics();
    var doraService = new InMemoryDoraMetricsService(metrics);

    _metrics = metrics;
    _doraMetricsService = doraService;
    _outcomeAnalyticsService = new DoraOutcomeAnalyticsService(doraService);
}

/// <summary>
/// Releases the metrics instance created by the constructor.
/// </summary>
public void Dispose()
{
    _metrics.Dispose();
}
|
||||
|
||||
/// <summary>
/// Seeds the deterministic data set and verifies the executive report for "acme"/"production"
/// over <c>BaseTime</c> to <c>BaseTime + 4 days</c>: headline totals, per-pipeline deployment
/// attribution, per-severity incident attribution, and the daily cohort range.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetExecutiveReportAsync_ComputesAttributionAndCohorts()
{
    // Arrange
    await SeedDeterministicTelemetryAsync();
    var periodEnd = BaseTime.AddDays(4);

    // Act
    var report = await _outcomeAnalyticsService.GetExecutiveReportAsync(
        tenantId: "acme",
        environment: "production",
        periodStart: BaseTime,
        periodEnd: periodEnd);

    // Assert: headline totals.
    Assert.Equal(4, report.TotalDeployments);
    Assert.Equal(2, report.FailedDeployments);
    Assert.Equal(3, report.TotalIncidents);
    Assert.Equal(2, report.ResolvedIncidents);
    Assert.Equal(2, report.AcknowledgedIncidents);
    Assert.Equal(0.38, report.MeanTimeToAcknowledgeHours);
    Assert.Equal(2.5, report.MeanTimeToRecoveryHours);

    // Assert: deployment attribution, one entry per pipeline in the order returned.
    Assert.Collection(
        report.DeploymentAttribution,
        pipelineA =>
        {
            Assert.Equal("pipeline-a", pipelineA.PipelineId);
            Assert.Equal(2, pipelineA.DeploymentCount);
            Assert.Equal(1, pipelineA.FailedDeploymentCount);
            Assert.Equal(50.0, pipelineA.ChangeFailureRatePercent);
            Assert.Equal(2.5, pipelineA.MedianLeadTimeHours);
        },
        pipelineB =>
        {
            Assert.Equal("pipeline-b", pipelineB.PipelineId);
            Assert.Equal(1, pipelineB.DeploymentCount);
            Assert.Equal(0, pipelineB.FailedDeploymentCount);
            Assert.Equal(0.0, pipelineB.ChangeFailureRatePercent);
            Assert.Equal(6.0, pipelineB.MedianLeadTimeHours);
        },
        unattributed =>
        {
            // The deployment seeded without a pipeline id is reported as "unknown".
            Assert.Equal("unknown", unattributed.PipelineId);
            Assert.Equal(1, unattributed.DeploymentCount);
            Assert.Equal(1, unattributed.FailedDeploymentCount);
            Assert.Equal(100.0, unattributed.ChangeFailureRatePercent);
            Assert.Equal(6.0, unattributed.MedianLeadTimeHours);
        });

    // Assert: incident attribution, one entry per severity in the order returned.
    Assert.Collection(
        report.IncidentAttribution,
        criticalBucket =>
        {
            Assert.Equal(DoraIncidentSeverity.Critical, criticalBucket.Severity);
            Assert.Equal(1, criticalBucket.IncidentCount);
            Assert.Equal(1, criticalBucket.ResolvedIncidentCount);
            Assert.Equal(0, criticalBucket.AcknowledgedIncidentCount);
            Assert.Equal(0.0, criticalBucket.MeanTimeToAcknowledgeHours);
            Assert.Equal(4.0, criticalBucket.MeanTimeToRecoveryHours);
        },
        highBucket =>
        {
            Assert.Equal(DoraIncidentSeverity.High, highBucket.Severity);
            Assert.Equal(1, highBucket.IncidentCount);
            Assert.Equal(1, highBucket.ResolvedIncidentCount);
            Assert.Equal(1, highBucket.AcknowledgedIncidentCount);
            Assert.Equal(0.25, highBucket.MeanTimeToAcknowledgeHours);
            Assert.Equal(1.0, highBucket.MeanTimeToRecoveryHours);
        },
        mediumBucket =>
        {
            Assert.Equal(DoraIncidentSeverity.Medium, mediumBucket.Severity);
            Assert.Equal(1, mediumBucket.IncidentCount);
            Assert.Equal(0, mediumBucket.ResolvedIncidentCount);
            Assert.Equal(1, mediumBucket.AcknowledgedIncidentCount);
            Assert.Equal(0.5, mediumBucket.MeanTimeToAcknowledgeHours);
            Assert.Equal(0.0, mediumBucket.MeanTimeToRecoveryHours);
        });

    // Assert: one cohort per calendar day from 1 Feb to 5 Feb; nothing was deployed on day four.
    var cohorts = report.DailyCohorts;
    Assert.Equal(5, cohorts.Count);
    Assert.Equal(new DateOnly(2026, 2, 1), cohorts[0].Day);
    Assert.Equal(new DateOnly(2026, 2, 5), cohorts[4].Day);
    Assert.Equal(0, cohorts[3].DeploymentCount);
}
|
||||
|
||||
/// <summary>
/// Requests the same executive report twice over identical seeded data and verifies that
/// every reported field and collection is equal between the two results.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task GetExecutiveReportAsync_IsDeterministicAcrossRepeatedCalls()
{
    // Arrange
    await SeedDeterministicTelemetryAsync();

    // Both calls go through the same delegate so their arguments cannot drift apart.
    var requestReport = () => _outcomeAnalyticsService.GetExecutiveReportAsync(
        tenantId: "acme",
        environment: "production",
        periodStart: BaseTime,
        periodEnd: BaseTime.AddDays(4));

    // Act
    var first = await requestReport();
    var second = await requestReport();

    // Assert: request echo.
    Assert.Equal(first.TenantId, second.TenantId);
    Assert.Equal(first.Environment, second.Environment);
    Assert.Equal(first.PeriodStart, second.PeriodStart);
    Assert.Equal(first.PeriodEnd, second.PeriodEnd);

    // Assert: headline totals.
    Assert.Equal(first.TotalDeployments, second.TotalDeployments);
    Assert.Equal(first.FailedDeployments, second.FailedDeployments);
    Assert.Equal(first.TotalIncidents, second.TotalIncidents);
    Assert.Equal(first.ResolvedIncidents, second.ResolvedIncidents);
    Assert.Equal(first.AcknowledgedIncidents, second.AcknowledgedIncidents);
    Assert.Equal(first.MeanTimeToAcknowledgeHours, second.MeanTimeToAcknowledgeHours);
    Assert.Equal(first.MeanTimeToRecoveryHours, second.MeanTimeToRecoveryHours);

    // Assert: breakdown collections.
    Assert.Equal(first.DeploymentAttribution, second.DeploymentAttribution);
    Assert.Equal(first.IncidentAttribution, second.IncidentAttribution);
    Assert.Equal(first.DailyCohorts, second.DailyCohorts);
}
|
||||
|
||||
/// <summary>
/// Verifies that <c>AddDoraMetrics</c> makes <see cref="IOutcomeAnalyticsService"/>
/// resolvable from the built service provider.
/// </summary>
[Trait("Category", TestCategories.Unit)]
[Fact]
public void AddDoraMetrics_RegistersOutcomeAnalyticsService()
{
    // Arrange
    var registrations = new ServiceCollection();
    registrations.AddDoraMetrics();

    // Act
    using var provider = registrations.BuildServiceProvider();
    var resolved = provider.GetService<IOutcomeAnalyticsService>();

    // Assert
    Assert.NotNull(resolved);
}
|
||||
|
||||
/// <summary>
/// Records a fixed data set, all relative to <c>BaseTime</c>, for tenant "acme" in
/// "production": four deployments (two on pipeline-a, one on pipeline-b, one with no
/// pipeline id) followed by three incidents (High, Critical and an unresolved Medium).
/// </summary>
private async Task SeedDeterministicTelemetryAsync()
{
    // Midnight of the days following BaseTime; every timestamp below is an hour offset from one.
    var day1 = BaseTime.AddDays(1);
    var day2 = BaseTime.AddDays(2);
    var day3 = BaseTime.AddDays(3);

    DoraDeploymentEvent[] deployments =
    [
        // Day 0, pipeline-a: success.
        new(
            DeploymentId: "deploy-001",
            TenantId: "acme",
            Environment: "production",
            CommitSha: "sha-001",
            CommitTimestamp: BaseTime.AddHours(-1),
            DeploymentTimestamp: BaseTime.AddHours(1),
            Outcome: DoraDeploymentOutcome.Success,
            DurationMs: 30_000,
            PipelineId: "pipeline-a"),

        // Day 1, pipeline-a: rollback.
        new(
            DeploymentId: "deploy-002",
            TenantId: "acme",
            Environment: "production",
            CommitSha: "sha-002",
            CommitTimestamp: day1.AddHours(-2),
            DeploymentTimestamp: day1.AddHours(1),
            Outcome: DoraDeploymentOutcome.Rollback,
            DurationMs: 45_000,
            PipelineId: "pipeline-a"),

        // Day 1, pipeline-b: success.
        new(
            DeploymentId: "deploy-003",
            TenantId: "acme",
            Environment: "production",
            CommitSha: "sha-003",
            CommitTimestamp: day1.AddHours(-4),
            DeploymentTimestamp: day1.AddHours(2),
            Outcome: DoraDeploymentOutcome.Success,
            DurationMs: 32_000,
            PipelineId: "pipeline-b"),

        // Day 2, no pipeline id: failed.
        new(
            DeploymentId: "deploy-004",
            TenantId: "acme",
            Environment: "production",
            CommitSha: "sha-004",
            CommitTimestamp: day2.AddHours(-3),
            DeploymentTimestamp: day2.AddHours(3),
            Outcome: DoraDeploymentOutcome.Failed,
            DurationMs: 52_000,
            PipelineId: null),
    ];

    for (var index = 0; index < deployments.Length; index++)
    {
        await _doraMetricsService.RecordDeploymentAsync(deployments[index]);
    }

    DoraIncidentEvent[] incidents =
    [
        // High severity: acknowledged after 15 minutes, resolved after one hour.
        new(
            IncidentId: "inc-001",
            TenantId: "acme",
            Environment: "production",
            Severity: DoraIncidentSeverity.High,
            StartedAt: day1.AddHours(10),
            ResolvedAt: day1.AddHours(11),
            AcknowledgedAt: day1.AddHours(10.25),
            DeploymentId: "deploy-002"),

        // Critical severity: no acknowledgement supplied, resolved after four hours.
        new(
            IncidentId: "inc-002",
            TenantId: "acme",
            Environment: "production",
            Severity: DoraIncidentSeverity.Critical,
            StartedAt: day2.AddHours(8),
            ResolvedAt: day2.AddHours(12),
            DeploymentId: "deploy-004"),

        // Medium severity: acknowledged after 30 minutes, never resolved.
        new(
            IncidentId: "inc-003",
            TenantId: "acme",
            Environment: "production",
            Severity: DoraIncidentSeverity.Medium,
            StartedAt: day3.AddHours(9),
            ResolvedAt: null,
            AcknowledgedAt: day3.AddHours(9.5),
            DeploymentId: "deploy-004"),
    ];

    for (var index = 0; index < incidents.Length; index++)
    {
        await _doraMetricsService.RecordIncidentAsync(incidents[index]);
    }
}
|
||||
}
|
||||
@@ -0,0 +1,304 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Publishes DORA (DevOps Research and Assessment) measurements through a
/// <see cref="Meter"/> named <see cref="MeterName"/>.
/// Covers the four key metrics (Deployment Frequency, Lead Time for Changes,
/// Change Failure Rate, Mean Time to Recovery) plus a counter for SLO breaches.
/// </summary>
/// <remarks>
/// This class reads <see cref="DoraMetricsOptions.Version"/>,
/// <see cref="DoraMetricsOptions.LeadTimeSloHours"/> and
/// <see cref="DoraMetricsOptions.MttrSloHours"/>. It does not consult
/// <see cref="DoraMetricsOptions.Enabled"/>; callers that want to switch
/// collection off must avoid calling the record methods.
/// </remarks>
public sealed class DoraMetrics : IDisposable
{
    /// <summary>
    /// Default meter name for DORA metrics.
    /// </summary>
    public const string MeterName = "StellaOps.DORA";

    private readonly Meter _meter;
    private readonly DoraMetricsOptions _options;
    private bool _disposed;

    // Deployment Frequency metrics
    private readonly Counter<long> _deploymentCounter;
    private readonly Histogram<double> _deploymentDurationHistogram;

    // Lead Time for Changes metrics
    private readonly Histogram<double> _leadTimeHistogram;

    // Change Failure Rate metrics
    private readonly Counter<long> _deploymentSuccessCounter;
    private readonly Counter<long> _deploymentFailureCounter;

    // MTTR metrics
    private readonly Counter<long> _incidentCounter;
    private readonly Counter<long> _incidentResolvedCounter;
    private readonly Histogram<double> _timeToRecoveryHistogram;

    // SLO breach tracking
    private readonly Counter<long> _sloBreachCounter;

    /// <summary>
    /// Initializes a new instance of <see cref="DoraMetrics"/> and creates all instruments.
    /// </summary>
    /// <param name="options">
    /// Meter version and SLO thresholds; a default <see cref="DoraMetricsOptions"/> is used when null.
    /// </param>
    public DoraMetrics(DoraMetricsOptions? options = null)
    {
        _options = options ?? new DoraMetricsOptions();
        _meter = new Meter(MeterName, _options.Version);

        // Deployment Frequency
        _deploymentCounter = _meter.CreateCounter<long>(
            name: "dora_deployments_total",
            unit: "{deployment}",
            description: "Total number of deployments.");

        _deploymentDurationHistogram = _meter.CreateHistogram<double>(
            name: "dora_deployment_duration_seconds",
            unit: "s",
            description: "Duration of deployments in seconds.");

        // Lead Time for Changes
        _leadTimeHistogram = _meter.CreateHistogram<double>(
            name: "dora_lead_time_hours",
            unit: "h",
            description: "Lead time from commit to deployment in hours.");

        // Change Failure Rate
        _deploymentSuccessCounter = _meter.CreateCounter<long>(
            name: "dora_deployment_success_total",
            unit: "{deployment}",
            description: "Total number of successful deployments.");

        _deploymentFailureCounter = _meter.CreateCounter<long>(
            name: "dora_deployment_failure_total",
            unit: "{deployment}",
            description: "Total number of failed deployments (rollbacks, hotfixes, failures).");

        // MTTR
        _incidentCounter = _meter.CreateCounter<long>(
            name: "dora_incidents_total",
            unit: "{incident}",
            description: "Total number of incidents.");

        _incidentResolvedCounter = _meter.CreateCounter<long>(
            name: "dora_incidents_resolved_total",
            unit: "{incident}",
            description: "Total number of resolved incidents.");

        _timeToRecoveryHistogram = _meter.CreateHistogram<double>(
            name: "dora_time_to_recovery_hours",
            unit: "h",
            description: "Time to recovery from incidents in hours.");

        // SLO tracking
        _sloBreachCounter = _meter.CreateCounter<long>(
            name: "dora_slo_breach_total",
            unit: "{breach}",
            description: "Total number of DORA SLO breaches.");
    }

    /// <summary>
    /// Records a deployment event: total count, duration, lead time, the
    /// success/failure counters, and a lead-time SLO breach when applicable.
    /// </summary>
    /// <param name="deployment">The deployment to record.</param>
    /// <exception cref="ArgumentNullException"><paramref name="deployment"/> is null.</exception>
    public void RecordDeployment(DoraDeploymentEvent deployment)
    {
        ArgumentNullException.ThrowIfNull(deployment);

        var tags = CreateDeploymentTags(deployment);
        AddPipelineTag(ref tags, deployment);

        // Record deployment count
        _deploymentCounter.Add(1, tags);

        // Record deployment duration (milliseconds -> seconds)
        _deploymentDurationHistogram.Record(deployment.DurationMs / 1000.0, tags);

        // Record lead time
        _leadTimeHistogram.Record(deployment.LeadTime.TotalHours, tags);

        // Track success/failure for CFR. Outcomes that are neither a failure nor
        // Success (i.e. Cancelled) increment neither counter.
        if (deployment.IsFailure)
        {
            _deploymentFailureCounter.Add(1, tags);
        }
        else if (deployment.Outcome == DoraDeploymentOutcome.Success)
        {
            _deploymentSuccessCounter.Add(1, tags);
        }

        // Check SLO breaches
        CheckDeploymentSlos(deployment);
    }

    /// <summary>
    /// Records an incident start by incrementing the incident counter.
    /// </summary>
    /// <param name="incident">The incident that started.</param>
    /// <exception cref="ArgumentNullException"><paramref name="incident"/> is null.</exception>
    public void RecordIncidentStarted(DoraIncidentEvent incident)
    {
        ArgumentNullException.ThrowIfNull(incident);

        _incidentCounter.Add(1, CreateIncidentTags(incident));
    }

    /// <summary>
    /// Records an incident resolution: resolved count, time to recovery, and an
    /// MTTR SLO breach when recovery took longer than <see cref="DoraMetricsOptions.MttrSloHours"/>.
    /// Does nothing when the incident has no resolution time.
    /// </summary>
    /// <param name="incident">The resolved incident.</param>
    /// <exception cref="ArgumentNullException"><paramref name="incident"/> is null.</exception>
    public void RecordIncidentResolved(DoraIncidentEvent incident)
    {
        ArgumentNullException.ThrowIfNull(incident);

        if (!incident.ResolvedAt.HasValue || !incident.TimeToRecovery.HasValue)
        {
            return;
        }

        var tags = CreateIncidentTags(incident);
        _incidentResolvedCounter.Add(1, tags);

        var mttrHours = incident.TimeToRecovery.Value.TotalHours;
        _timeToRecoveryHistogram.Record(mttrHours, tags);

        // Check MTTR SLO
        if (mttrHours > _options.MttrSloHours)
        {
            var sloTags = CreateIncidentTags(incident);
            sloTags.Add("metric", "mttr");
            _sloBreachCounter.Add(1, sloTags);
        }
    }

    /// <summary>
    /// Emits an SLO breach when the deployment's lead time exceeds
    /// <see cref="DoraMetricsOptions.LeadTimeSloHours"/>.
    /// </summary>
    private void CheckDeploymentSlos(DoraDeploymentEvent deployment)
    {
        // Lead time SLO check
        if (deployment.LeadTime.TotalHours > _options.LeadTimeSloHours)
        {
            var sloTags = CreateDeploymentTags(deployment);
            sloTags.Add("metric", "lead_time");
            AddPipelineTag(ref sloTags, deployment);
            _sloBreachCounter.Add(1, sloTags);
        }
    }

    /// <summary>
    /// Records a deployment frequency SLO breach (typically calculated in batches).
    /// </summary>
    /// <param name="tenantId">Tenant the breach belongs to.</param>
    /// <param name="environment">Environment the breach belongs to.</param>
    /// <param name="actualFrequency">Observed frequency; emitted as a tag with two decimals.</param>
    public void RecordDeploymentFrequencySloBreak(string tenantId, string environment, double actualFrequency)
    {
        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "environment", environment },
            { "metric", "deployment_frequency" },
            { "actual_frequency", FormatInvariant(actualFrequency) }
        };

        _sloBreachCounter.Add(1, tags);
    }

    /// <summary>
    /// Records a change failure rate SLO breach (typically calculated in batches).
    /// </summary>
    /// <param name="tenantId">Tenant the breach belongs to.</param>
    /// <param name="environment">Environment the breach belongs to.</param>
    /// <param name="actualRate">Observed rate; emitted as a tag with two decimals.</param>
    public void RecordChangeFailureRateSloBreak(string tenantId, string environment, double actualRate)
    {
        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "environment", environment },
            { "metric", "change_failure_rate" },
            { "actual_rate", FormatInvariant(actualRate) }
        };

        _sloBreachCounter.Add(1, tags);
    }

    /// <summary>
    /// Classifies the DORA performance level based on the four key metrics.
    /// Tiers are evaluated from best to worst and the first tier whose four
    /// conditions all hold is returned.
    /// </summary>
    /// <param name="deploymentFrequencyPerDay">Average deployments per day.</param>
    /// <param name="leadTimeHours">Lead time for changes in hours.</param>
    /// <param name="changeFailureRatePercent">Change failure rate as a percentage (0-100).</param>
    /// <param name="mttrHours">Mean time to recovery in hours.</param>
    /// <returns>
    /// The matching tier; <see cref="DoraPerformanceLevel.Low"/> when no tier matches but the
    /// frequency is positive; otherwise <see cref="DoraPerformanceLevel.Unknown"/>.
    /// </returns>
    public static DoraPerformanceLevel ClassifyPerformance(
        double deploymentFrequencyPerDay,
        double leadTimeHours,
        double changeFailureRatePercent,
        double mttrHours)
    {
        // Elite: at least one deployment per day, lead time under 1 day,
        // CFR under 15%, MTTR under 1 hour.
        if (deploymentFrequencyPerDay >= 1.0 &&
            leadTimeHours < 24 &&
            changeFailureRatePercent < 15 &&
            mttrHours < 1)
        {
            return DoraPerformanceLevel.Elite;
        }

        // High: at least ~one deployment per week, lead time under 1 week,
        // CFR up to and including 30%, MTTR under 1 day.
        if (deploymentFrequencyPerDay >= 0.14 && // ~1/week
            leadTimeHours < 168 &&               // 1 week
            changeFailureRatePercent <= 30 &&
            mttrHours < 24)
        {
            return DoraPerformanceLevel.High;
        }

        // Medium: at least ~one deployment per month, lead time under ~6 months,
        // CFR up to and including 45%, MTTR under 1 week.
        if (deploymentFrequencyPerDay >= 0.033 && // ~1/month
            leadTimeHours < 4320 &&               // ~6 months
            changeFailureRatePercent <= 45 &&
            mttrHours < 168)                      // 1 week
        {
            return DoraPerformanceLevel.Medium;
        }

        // Low: everything else with some activity
        if (deploymentFrequencyPerDay > 0)
        {
            return DoraPerformanceLevel.Low;
        }

        return DoraPerformanceLevel.Unknown;
    }

    /// <inheritdoc/>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _meter.Dispose();
        _disposed = true;
    }

    /// <summary>
    /// Builds the base tag set shared by all deployment measurements
    /// (tenant, environment, lower-cased outcome), in that order.
    /// </summary>
    private static TagList CreateDeploymentTags(DoraDeploymentEvent deployment) => new()
    {
        { "tenant_id", deployment.TenantId },
        { "environment", deployment.Environment },
        { "outcome", deployment.Outcome.ToString().ToLowerInvariant() }
    };

    /// <summary>
    /// Appends the <c>pipeline_id</c> tag when the deployment carries a non-empty pipeline id.
    /// </summary>
    private static void AddPipelineTag(ref TagList tags, DoraDeploymentEvent deployment)
    {
        if (!string.IsNullOrEmpty(deployment.PipelineId))
        {
            tags.Add("pipeline_id", deployment.PipelineId);
        }
    }

    /// <summary>
    /// Builds the base tag set shared by all incident measurements
    /// (tenant, environment, lower-cased severity), in that order.
    /// </summary>
    private static TagList CreateIncidentTags(DoraIncidentEvent incident) => new()
    {
        { "tenant_id", incident.TenantId },
        { "environment", incident.Environment },
        { "severity", incident.Severity.ToString().ToLowerInvariant() }
    };

    /// <summary>
    /// Formats a value with two decimals using the invariant culture so the emitted tag
    /// value does not depend on the current thread culture (e.g. "0.50" rather than "0,50").
    /// </summary>
    private static string FormatInvariant(double value) =>
        value.ToString("F2", System.Globalization.CultureInfo.InvariantCulture);
}
|
||||
@@ -0,0 +1,245 @@
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Configuration for DORA metrics: meter version, an enable flag, SLO targets,
/// and rolling-window sizes.
/// </summary>
public sealed class DoraMetricsOptions
{
    /// <summary>
    /// Version string passed to the meter. Defaults to "1.0.0".
    /// </summary>
    public string Version { get; set; } = "1.0.0";

    /// <summary>
    /// Whether DORA metrics collection is enabled. Defaults to <c>true</c>.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Lead Time for Changes SLO target, in hours. Defaults to 24 (Elite tier).
    /// </summary>
    public double LeadTimeSloHours { get; set; } = 24.0;

    /// <summary>
    /// Deployment Frequency SLO target, in deployments per day. Defaults to 1 (Elite tier).
    /// </summary>
    public double DeploymentFrequencySloPerDay { get; set; } = 1.0;

    /// <summary>
    /// Change Failure Rate SLO target, as a percentage. Defaults to 15 (Elite tier).
    /// </summary>
    public double ChangeFailureRateSloPercent { get; set; } = 15.0;

    /// <summary>
    /// Mean Time to Recovery SLO target, in hours. Defaults to 1 (Elite tier).
    /// </summary>
    public double MttrSloHours { get; set; } = 1.0;

    /// <summary>
    /// Rolling window, in days, for calculating deployment frequency. Defaults to 30.
    /// </summary>
    public int FrequencyWindowDays { get; set; } = 30;

    /// <summary>
    /// Rolling window, in days, for calculating change failure rate. Defaults to 30.
    /// </summary>
    public int FailureRateWindowDays { get; set; } = 30;
}
|
||||
|
||||
/// <summary>
/// DORA performance tier derived from the four key metrics.
/// The thresholds quoted on each member are the ones applied by
/// <c>DoraMetrics.ClassifyPerformance</c>; higher numeric values are better tiers.
/// </summary>
public enum DoraPerformanceLevel
{
    /// <summary>
    /// No deployment activity, or not enough data to classify.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// Some deployment activity that does not meet the Medium thresholds.
    /// </summary>
    Low = 1,

    /// <summary>
    /// At least roughly one deployment per month, lead time under about six months,
    /// change failure rate of at most 45%, and MTTR under one week.
    /// </summary>
    Medium = 2,

    /// <summary>
    /// At least roughly one deployment per week, lead time under one week,
    /// change failure rate of at most 30%, and MTTR under one day.
    /// </summary>
    High = 3,

    /// <summary>
    /// At least one deployment per day, lead time under one day,
    /// change failure rate under 15%, and MTTR under one hour.
    /// </summary>
    Elite = 4
}
|
||||
|
||||
/// <summary>
/// Outcome of a deployment as tracked for DORA metrics.
/// </summary>
public enum DoraDeploymentOutcome
{
    /// <summary>
    /// The deployment succeeded and needed neither a rollback nor a hotfix.
    /// </summary>
    Success = 0,

    /// <summary>
    /// The deployment had to be rolled back.
    /// </summary>
    Rollback = 1,

    /// <summary>
    /// The deployment had to be followed by a hotfix.
    /// </summary>
    Hotfix = 2,

    /// <summary>
    /// The deployment failed while executing.
    /// </summary>
    Failed = 3,

    /// <summary>
    /// The deployment was cancelled before it completed.
    /// </summary>
    Cancelled = 4
}
|
||||
|
||||
/// <summary>
/// Severity of an incident tracked for MTTR. Lower numeric values are more severe.
/// </summary>
public enum DoraIncidentSeverity
{
    /// <summary>
    /// Affects all users or services.
    /// </summary>
    Critical = 1,

    /// <summary>
    /// Affects major functionality.
    /// </summary>
    High = 2,

    /// <summary>
    /// Affects some users.
    /// </summary>
    Medium = 3,

    /// <summary>
    /// Minimal impact.
    /// </summary>
    Low = 4
}
|
||||
|
||||
/// <summary>
/// Immutable record of one deployment, used as input for DORA metrics.
/// </summary>
/// <param name="DeploymentId">Unique identifier of the deployment.</param>
/// <param name="TenantId">Tenant the deployment belongs to.</param>
/// <param name="Environment">Target environment (for example production or staging).</param>
/// <param name="CommitSha">SHA of the commit that was deployed.</param>
/// <param name="CommitTimestamp">When the commit was created.</param>
/// <param name="DeploymentTimestamp">When the deployment completed.</param>
/// <param name="Outcome">How the deployment ended.</param>
/// <param name="DurationMs">Deployment duration in milliseconds.</param>
/// <param name="ArtifactDigest">Digest of the deployed artifact, if known.</param>
/// <param name="PipelineId">CI/CD pipeline that ran the deployment, if known.</param>
public sealed record DoraDeploymentEvent(
    string DeploymentId,
    string TenantId,
    string Environment,
    string CommitSha,
    DateTimeOffset CommitTimestamp,
    DateTimeOffset DeploymentTimestamp,
    DoraDeploymentOutcome Outcome,
    long DurationMs,
    string? ArtifactDigest = null,
    string? PipelineId = null)
{
    /// <summary>
    /// Lead time for this deployment: <see cref="DeploymentTimestamp"/> minus
    /// <see cref="CommitTimestamp"/>.
    /// </summary>
    public TimeSpan LeadTime => DeploymentTimestamp.Subtract(CommitTimestamp);

    /// <summary>
    /// True when the outcome counts as a failure for change-failure-rate purposes:
    /// rollback, hotfix, or failed. Success and cancelled do not count.
    /// </summary>
    public bool IsFailure => Outcome switch
    {
        DoraDeploymentOutcome.Rollback => true,
        DoraDeploymentOutcome.Hotfix => true,
        DoraDeploymentOutcome.Failed => true,
        _ => false,
    };
}
|
||||
|
||||
/// <summary>
/// Immutable record of one incident, used as input for MTTR tracking.
/// </summary>
/// <param name="IncidentId">Unique identifier of the incident.</param>
/// <param name="TenantId">Tenant the incident belongs to.</param>
/// <param name="Environment">Environment in which the incident occurred.</param>
/// <param name="Severity">Severity of the incident.</param>
/// <param name="StartedAt">When the incident was detected.</param>
/// <param name="ResolvedAt">When the incident was resolved; null while it is still open.</param>
/// <param name="AcknowledgedAt">When the incident was acknowledged; null if not yet acknowledged.</param>
/// <param name="DeploymentId">Deployment that caused the incident, if known.</param>
/// <param name="Description">Short description of the incident.</param>
public sealed record DoraIncidentEvent(
    string IncidentId,
    string TenantId,
    string Environment,
    DoraIncidentSeverity Severity,
    DateTimeOffset StartedAt,
    DateTimeOffset? ResolvedAt,
    DateTimeOffset? AcknowledgedAt = null,
    string? DeploymentId = null,
    string? Description = null)
{
    /// <summary>
    /// Time from <see cref="StartedAt"/> to <see cref="AcknowledgedAt"/>, or null when the
    /// incident has not been acknowledged.
    /// </summary>
    public TimeSpan? TimeToAcknowledge =>
        AcknowledgedAt is { } acknowledged ? acknowledged - StartedAt : null;

    /// <summary>
    /// Time from <see cref="StartedAt"/> to <see cref="ResolvedAt"/>, or null while the
    /// incident is still open.
    /// </summary>
    public TimeSpan? TimeToRecovery =>
        ResolvedAt is { } resolved ? resolved - StartedAt : null;

    /// <summary>
    /// True while the incident has no resolution time.
    /// </summary>
    public bool IsOpen => ResolvedAt is null;
}
|
||||
|
||||
/// <summary>
/// Aggregated DORA metrics for one tenant (and optionally one environment) over a period.
/// </summary>
/// <param name="TenantId">Tenant the summary is for.</param>
/// <param name="Environment">Environment the summary is for; null means all environments.</param>
/// <param name="PeriodStart">Start of the measurement period.</param>
/// <param name="PeriodEnd">End of the measurement period.</param>
/// <param name="DeploymentCount">Total number of deployments in the period.</param>
/// <param name="SuccessfulDeployments">Number of successful deployments.</param>
/// <param name="FailedDeployments">Number of failed deployments (change-failure-rate numerator).</param>
/// <param name="DeploymentFrequencyPerDay">Average number of deployments per day.</param>
/// <param name="MedianLeadTimeHours">Median lead time for changes, in hours.</param>
/// <param name="ChangeFailureRatePercent">Change failure rate, as a percentage.</param>
/// <param name="MeanTimeToRecoveryHours">Mean time to recovery, in hours.</param>
/// <param name="PerformanceLevel">DORA performance tier calculated from the metrics.</param>
public sealed record DoraSummary(
    string TenantId,
    string? Environment,
    DateTimeOffset PeriodStart,
    DateTimeOffset PeriodEnd,
    int DeploymentCount,
    int SuccessfulDeployments,
    int FailedDeployments,
    double DeploymentFrequencyPerDay,
    double MedianLeadTimeHours,
    double ChangeFailureRatePercent,
    double MeanTimeToRecoveryHours,
    DoraPerformanceLevel PerformanceLevel);
|
||||
@@ -0,0 +1,214 @@
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Deterministic outcome analytics service backed by <see cref="IDoraMetricsService"/>.
/// Produces an executive report with per-pipeline deployment attribution, per-severity
/// incident attribution, and one cohort slice per UTC day of the requested period.
/// </summary>
public sealed class DoraOutcomeAnalyticsService : IOutcomeAnalyticsService
{
    // Bucket used for deployments that carry no (or a blank) pipeline id.
    private const string UnknownPipelineId = "unknown";

    private readonly IDoraMetricsService _doraMetricsService;

    /// <summary>
    /// Initializes a new instance of <see cref="DoraOutcomeAnalyticsService"/>.
    /// </summary>
    /// <param name="doraMetricsService">Source of deployment and incident events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="doraMetricsService"/> is null.</exception>
    public DoraOutcomeAnalyticsService(IDoraMetricsService doraMetricsService)
    {
        _doraMetricsService = doraMetricsService ?? throw new ArgumentNullException(nameof(doraMetricsService));
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentException">
    /// <paramref name="tenantId"/> is null or whitespace, or <paramref name="periodEnd"/> is
    /// earlier than <paramref name="periodStart"/>.
    /// </exception>
    public async Task<OutcomeExecutiveReport> GetExecutiveReportAsync(
        string tenantId,
        string? environment,
        DateTimeOffset periodStart,
        DateTimeOffset periodEnd,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
        if (periodEnd < periodStart)
        {
            throw new ArgumentException("Period end must be greater than or equal to period start.", nameof(periodEnd));
        }

        var deployments = await ToListAsync(
            _doraMetricsService.GetDeploymentsAsync(tenantId, environment, periodStart, periodEnd, cancellationToken),
            cancellationToken).ConfigureAwait(false);

        // Open incidents are requested too: they count toward totals and acknowledgements.
        var incidents = await ToListAsync(
            _doraMetricsService.GetIncidentsAsync(tenantId, environment, periodStart, periodEnd, includeOpen: true, cancellationToken),
            cancellationToken).ConfigureAwait(false);

        var resolvedIncidents = incidents.Where(static i => !i.IsOpen).ToList();
        var acknowledgedIncidents = incidents.Where(static i => i.TimeToAcknowledge.HasValue).ToList();

        return new OutcomeExecutiveReport(
            TenantId: tenantId,
            Environment: environment,
            PeriodStart: periodStart,
            PeriodEnd: periodEnd,
            TotalDeployments: deployments.Count,
            FailedDeployments: deployments.Count(static d => d.IsFailure),
            TotalIncidents: incidents.Count,
            ResolvedIncidents: resolvedIncidents.Count,
            AcknowledgedIncidents: acknowledgedIncidents.Count,
            MeanTimeToAcknowledgeHours: MeanTimeToAcknowledgeHours(acknowledgedIncidents),
            MeanTimeToRecoveryHours: MeanTimeToRecoveryHours(resolvedIncidents),
            DeploymentAttribution: BuildDeploymentAttribution(deployments),
            IncidentAttribution: BuildIncidentAttribution(incidents),
            DailyCohorts: BuildDailyCohorts(periodStart, periodEnd, deployments, resolvedIncidents));
    }

    /// <summary>
    /// Groups deployments by normalized pipeline id (ordinal order) and computes, per
    /// pipeline, the deployment count, failure count, failure rate, and median lead time.
    /// Rates and medians are rounded to two decimals.
    /// </summary>
    private static IReadOnlyList<DeploymentAttributionSlice> BuildDeploymentAttribution(
        IReadOnlyList<DoraDeploymentEvent> deployments)
    {
        return deployments
            .GroupBy(static d => NormalizePipelineId(d.PipelineId), StringComparer.Ordinal)
            .OrderBy(static g => g.Key, StringComparer.Ordinal)
            .Select(static group =>
            {
                var events = group.OrderBy(static d => d.DeploymentTimestamp).ToList();
                var deploymentCount = events.Count;
                var failedDeploymentCount = events.Count(static d => d.IsFailure);
                var failureRate = deploymentCount == 0
                    ? 0
                    : Math.Round((failedDeploymentCount * 100.0) / deploymentCount, 2);
                var medianLeadTimeHours = Math.Round(
                    CalculateMedianHours(events.Select(static d => d.LeadTime.TotalHours)), 2);

                return new DeploymentAttributionSlice(
                    PipelineId: group.Key,
                    DeploymentCount: deploymentCount,
                    FailedDeploymentCount: failedDeploymentCount,
                    ChangeFailureRatePercent: failureRate,
                    MedianLeadTimeHours: medianLeadTimeHours);
            })
            .ToList();
    }

    /// <summary>
    /// Groups incidents by severity (ascending enum order) and computes, per severity,
    /// the incident, resolved, and acknowledged counts plus mean acknowledge/recovery times.
    /// </summary>
    private static IReadOnlyList<IncidentAttributionSlice> BuildIncidentAttribution(
        IReadOnlyList<DoraIncidentEvent> incidents)
    {
        return incidents
            .GroupBy(static i => i.Severity)
            .OrderBy(static g => g.Key)
            .Select(static group =>
            {
                var events = group.OrderBy(static i => i.StartedAt).ToList();
                var resolved = events.Where(static i => !i.IsOpen).ToList();
                var acknowledged = events.Where(static i => i.TimeToAcknowledge.HasValue).ToList();

                return new IncidentAttributionSlice(
                    Severity: group.Key,
                    IncidentCount: events.Count,
                    ResolvedIncidentCount: resolved.Count,
                    AcknowledgedIncidentCount: acknowledged.Count,
                    MeanTimeToAcknowledgeHours: MeanTimeToAcknowledgeHours(acknowledged),
                    MeanTimeToRecoveryHours: MeanTimeToRecoveryHours(resolved));
            })
            .ToList();
    }

    /// <summary>
    /// Produces one slice per UTC calendar day from the day of <paramref name="periodStart"/>
    /// through the day of <paramref name="periodEnd"/> (both inclusive). Days with no
    /// events yield zero counts. Deployments are bucketed by deployment day, incidents by
    /// resolution day.
    /// </summary>
    private static IReadOnlyList<OutcomeCohortSlice> BuildDailyCohorts(
        DateTimeOffset periodStart,
        DateTimeOffset periodEnd,
        IReadOnlyList<DoraDeploymentEvent> deployments,
        IReadOnlyList<DoraIncidentEvent> resolvedIncidents)
    {
        var deploymentByDay = deployments
            .GroupBy(static d => DateOnly.FromDateTime(d.DeploymentTimestamp.UtcDateTime))
            .ToDictionary(
                static g => g.Key,
                static g => (Deployments: g.Count(), FailedDeployments: g.Count(static d => d.IsFailure)));

        var resolvedByDay = resolvedIncidents
            .GroupBy(static i => DateOnly.FromDateTime(i.ResolvedAt!.Value.UtcDateTime))
            .ToDictionary(static g => g.Key, static g => g.Count());

        var day = DateOnly.FromDateTime(periodStart.UtcDateTime);
        var endDay = DateOnly.FromDateTime(periodEnd.UtcDateTime);
        var cohorts = new List<OutcomeCohortSlice>();

        while (day <= endDay)
        {
            // A missing key leaves the out value at its default, i.e. zero counts.
            deploymentByDay.TryGetValue(day, out var deploymentStats);
            resolvedByDay.TryGetValue(day, out var resolvedCount);

            cohorts.Add(new OutcomeCohortSlice(
                Day: day,
                DeploymentCount: deploymentStats.Deployments,
                FailedDeploymentCount: deploymentStats.FailedDeployments,
                ResolvedIncidentCount: resolvedCount));

            // Stop before stepping past the last day: AddDays would throw when
            // endDay is DateOnly.MaxValue.
            if (day == endDay)
            {
                break;
            }

            day = day.AddDays(1);
        }

        return cohorts;
    }

    /// <summary>
    /// Maps null or blank pipeline ids to <see cref="UnknownPipelineId"/>; otherwise trims and
    /// lower-cases the id so that differently cased ids share one bucket.
    /// </summary>
    private static string NormalizePipelineId(string? pipelineId) =>
        string.IsNullOrWhiteSpace(pipelineId)
            ? UnknownPipelineId
            : pipelineId.Trim().ToLowerInvariant();

    /// <summary>
    /// Mean time-to-acknowledge, in hours, over the incidents that have been acknowledged.
    /// </summary>
    private static double MeanTimeToAcknowledgeHours(IEnumerable<DoraIncidentEvent> incidents) =>
        CalculateMeanHours(incidents
            .Select(static i => i.TimeToAcknowledge)
            .Where(static t => t.HasValue)
            .Select(static t => t!.Value));

    /// <summary>
    /// Mean time-to-recovery, in hours, over the incidents that have been resolved.
    /// </summary>
    private static double MeanTimeToRecoveryHours(IEnumerable<DoraIncidentEvent> incidents) =>
        CalculateMeanHours(incidents
            .Select(static i => i.TimeToRecovery)
            .Where(static t => t.HasValue)
            .Select(static t => t!.Value));

    /// <summary>
    /// Mean of the non-negative spans in hours, rounded to two decimals.
    /// Negative spans are ignored; returns 0 when nothing remains.
    /// </summary>
    private static double CalculateMeanHours(IEnumerable<TimeSpan> values)
    {
        var hours = values
            .Where(static span => span >= TimeSpan.Zero)
            .Select(static span => span.TotalHours)
            .ToList();

        return hours.Count == 0 ? 0 : Math.Round(hours.Average(), 2);
    }

    /// <summary>
    /// Median of the values (mean of the two middle values for an even count);
    /// returns 0 for an empty sequence. The result is not rounded.
    /// </summary>
    private static double CalculateMedianHours(IEnumerable<double> values)
    {
        var sorted = values.OrderBy(static value => value).ToList();
        if (sorted.Count == 0)
        {
            return 0;
        }

        var mid = sorted.Count / 2;
        return sorted.Count % 2 == 0
            ? (sorted[mid - 1] + sorted[mid]) / 2.0
            : sorted[mid];
    }

    /// <summary>
    /// Materializes an async sequence into a list. The token is handed to the enumerator
    /// and additionally checked per item, so cancellation is observed even when the
    /// source ignores the enumerator token.
    /// </summary>
    private static async Task<List<T>> ToListAsync<T>(IAsyncEnumerable<T> source, CancellationToken cancellationToken)
    {
        var list = new List<T>();
        await foreach (var item in source.WithCancellation(cancellationToken).ConfigureAwait(false))
        {
            cancellationToken.ThrowIfCancellationRequested();
            list.Add(item);
        }

        return list;
    }
}
|
||||
@@ -0,0 +1,80 @@
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Records deployment and incident events and answers DORA metric queries over them.
/// </summary>
public interface IDoraMetricsService
{
    /// <summary>
    /// Records a deployment event.
    /// </summary>
    /// <param name="deployment">Deployment to record.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task RecordDeploymentAsync(DoraDeploymentEvent deployment, CancellationToken cancellationToken = default);

    /// <summary>
    /// Records an incident so that it can be included in MTTR tracking.
    /// </summary>
    /// <param name="incident">Incident to record.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task RecordIncidentAsync(DoraIncidentEvent incident, CancellationToken cancellationToken = default);

    /// <summary>
    /// Marks an open incident as resolved.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the incident.</param>
    /// <param name="incidentId">Identifier of the incident to resolve.</param>
    /// <param name="resolvedAt">Time at which the incident was resolved.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task ResolveIncidentAsync(string tenantId, string incidentId, DateTimeOffset resolvedAt, CancellationToken cancellationToken = default);

    /// <summary>
    /// Computes a DORA summary for a tenant, optionally restricted to one environment.
    /// </summary>
    /// <param name="tenantId">Tenant to summarize.</param>
    /// <param name="environment">Environment filter; null for all environments.</param>
    /// <param name="periodStart">Start of the period to analyze.</param>
    /// <param name="periodEnd">End of the period to analyze.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The DORA metrics summary for the period.</returns>
    Task<DoraSummary> GetSummaryAsync(
        string tenantId,
        string? environment,
        DateTimeOffset periodStart,
        DateTimeOffset periodEnd,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Streams a tenant's deployment events within a time range.
    /// </summary>
    /// <param name="tenantId">Tenant whose deployments are requested.</param>
    /// <param name="environment">Environment filter; null for all environments.</param>
    /// <param name="from">Start of the time range.</param>
    /// <param name="to">End of the time range.</param>
    /// <param name="cancellationToken">Token used to cancel the enumeration.</param>
    /// <returns>The deployment events in the time range.</returns>
    IAsyncEnumerable<DoraDeploymentEvent> GetDeploymentsAsync(
        string tenantId,
        string? environment,
        DateTimeOffset from,
        DateTimeOffset to,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Streams a tenant's incident events within a time range.
    /// </summary>
    /// <param name="tenantId">Tenant whose incidents are requested.</param>
    /// <param name="environment">Environment filter; null for all environments.</param>
    /// <param name="from">Start of the time range.</param>
    /// <param name="to">End of the time range.</param>
    /// <param name="includeOpen">Whether incidents that are still open are included.</param>
    /// <param name="cancellationToken">Token used to cancel the enumeration.</param>
    /// <returns>The incident events in the time range.</returns>
    IAsyncEnumerable<DoraIncidentEvent> GetIncidentsAsync(
        string tenantId,
        string? environment,
        DateTimeOffset from,
        DateTimeOffset to,
        bool includeOpen = true,
        CancellationToken cancellationToken = default);
}
|
||||
@@ -0,0 +1,23 @@
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Produces deterministic outcome attribution and executive reports.
/// </summary>
public interface IOutcomeAnalyticsService
{
    /// <summary>
    /// Builds the executive outcome report for a tenant over a fixed period,
    /// optionally restricted to a single environment.
    /// </summary>
    /// <param name="tenantId">Tenant the report is for.</param>
    /// <param name="environment">Environment filter; null for all environments.</param>
    /// <param name="periodStart">Start of the reporting period.</param>
    /// <param name="periodEnd">End of the reporting period.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A deterministic outcome report with attribution and cohort slices.</returns>
    Task<OutcomeExecutiveReport> GetExecutiveReportAsync(
        string tenantId,
        string? environment,
        DateTimeOffset periodStart,
        DateTimeOffset periodEnd,
        CancellationToken cancellationToken = default);
}
|
||||
@@ -0,0 +1,281 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Runtime.CompilerServices;
|
||||
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// In-memory implementation of <see cref="IDoraMetricsService"/> for development and testing.
/// Production deployments should use a persistent storage implementation.
/// </summary>
/// <remarks>
/// Events are held in one list per tenant. The dictionary key is the tenant identifier lower-cased
/// with the invariant culture, so tenant lookups are case-insensitive. Every read or write of a
/// tenant's list happens while holding a lock on that list.
/// </remarks>
public sealed class InMemoryDoraMetricsService : IDoraMetricsService
{
    private readonly ConcurrentDictionary<string, List<DoraDeploymentEvent>> _deployments = new();
    private readonly ConcurrentDictionary<string, List<DoraIncidentEvent>> _incidents = new();
    private readonly DoraMetrics _metrics;
    private readonly DoraMetricsOptions _options;

    /// <summary>
    /// Initializes a new instance of <see cref="InMemoryDoraMetricsService"/>.
    /// </summary>
    /// <param name="metrics">Metrics sink that receives every recorded event and SLO breach.</param>
    /// <param name="options">SLO options; a default <see cref="DoraMetricsOptions"/> is used when <c>null</c>.</param>
    /// <exception cref="ArgumentNullException"><paramref name="metrics"/> is <c>null</c>.</exception>
    public InMemoryDoraMetricsService(DoraMetrics metrics, DoraMetricsOptions? options = null)
    {
        _metrics = metrics ?? throw new ArgumentNullException(nameof(metrics));
        _options = options ?? new DoraMetricsOptions();
    }

    /// <inheritdoc/>
    public Task RecordDeploymentAsync(DoraDeploymentEvent deployment, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(deployment);

        // Honor cancellation before mutating any state.
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled(cancellationToken);
        }

        var key = GetTenantKey(deployment.TenantId);
        var list = _deployments.GetOrAdd(key, _ => new List<DoraDeploymentEvent>());

        lock (list)
        {
            list.Add(deployment);
        }

        _metrics.RecordDeployment(deployment);
        return Task.CompletedTask;
    }

    /// <inheritdoc/>
    public Task RecordIncidentAsync(DoraIncidentEvent incident, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(incident);

        // Honor cancellation before mutating any state.
        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled(cancellationToken);
        }

        var key = GetTenantKey(incident.TenantId);
        var list = _incidents.GetOrAdd(key, _ => new List<DoraIncidentEvent>());

        lock (list)
        {
            list.Add(incident);
        }

        _metrics.RecordIncidentStarted(incident);
        return Task.CompletedTask;
    }

    /// <inheritdoc/>
    public Task ResolveIncidentAsync(string tenantId, string incidentId, DateTimeOffset resolvedAt, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(tenantId);

        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled(cancellationToken);
        }

        // Unknown tenant: nothing to resolve, treated as a no-op.
        if (!_incidents.TryGetValue(GetTenantKey(tenantId), out var list))
        {
            return Task.CompletedTask;
        }

        DoraIncidentEvent? resolved = null;
        lock (list)
        {
            // Only the first still-open incident with this id is resolved.
            var index = list.FindIndex(i => i.IncidentId == incidentId && i.IsOpen);
            if (index >= 0)
            {
                resolved = list[index] with { ResolvedAt = resolvedAt };
                list[index] = resolved;
            }
        }

        // The metric is emitted outside the lock so the list is not held during the call.
        if (resolved is not null)
        {
            _metrics.RecordIncidentResolved(resolved);
        }

        return Task.CompletedTask;
    }

    /// <inheritdoc/>
    public Task<DoraSummary> GetSummaryAsync(
        string tenantId,
        string? environment,
        DateTimeOffset periodStart,
        DateTimeOffset periodEnd,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(tenantId);

        if (cancellationToken.IsCancellationRequested)
        {
            return Task.FromCanceled<DoraSummary>(cancellationToken);
        }

        var deployments = GetDeploymentsInRange(tenantId, environment, periodStart, periodEnd);

        // Only resolved incidents that started inside the period contribute to MTTR.
        var incidents = GetIncidentsInRange(tenantId, environment, periodStart, periodEnd, resolvedOnly: true);

        // An empty or inverted period is treated as one day to avoid dividing by zero or a negative span.
        var periodDays = (periodEnd - periodStart).TotalDays;
        if (periodDays <= 0)
        {
            periodDays = 1;
        }

        // Deployment Frequency
        var totalDeployments = deployments.Count;
        var deploymentFrequency = totalDeployments / periodDays;

        // Change Failure Rate
        var failedDeployments = deployments.Count(d => d.IsFailure);
        var successfulDeployments = totalDeployments - failedDeployments;
        var changeFailureRate = totalDeployments > 0
            ? (failedDeployments * 100.0) / totalDeployments
            : 0.0;

        // Lead Time for Changes (median, in hours)
        var leadTimes = deployments
            .Select(d => d.LeadTime.TotalHours)
            .OrderBy(t => t)
            .ToList();
        var medianLeadTime = CalculateMedian(leadTimes);

        // Mean Time to Recovery (in hours)
        var recoveryTimes = incidents
            .Where(i => i.TimeToRecovery.HasValue)
            .Select(i => i.TimeToRecovery!.Value.TotalHours)
            .ToList();
        var mttr = recoveryTimes.Count > 0
            ? recoveryTimes.Average()
            : 0.0;

        // Classify performance
        var performanceLevel = DoraMetrics.ClassifyPerformance(
            deploymentFrequency,
            medianLeadTime,
            changeFailureRate,
            mttr);

        // Check and record SLO breaches; periods without deployments never count as a breach.
        if (totalDeployments > 0)
        {
            var environmentLabel = environment ?? "all";

            if (deploymentFrequency < _options.DeploymentFrequencySloPerDay)
            {
                _metrics.RecordDeploymentFrequencySloBreak(tenantId, environmentLabel, deploymentFrequency);
            }

            if (changeFailureRate > _options.ChangeFailureRateSloPercent)
            {
                _metrics.RecordChangeFailureRateSloBreak(tenantId, environmentLabel, changeFailureRate);
            }
        }

        var summary = new DoraSummary(
            TenantId: tenantId,
            Environment: environment,
            PeriodStart: periodStart,
            PeriodEnd: periodEnd,
            DeploymentCount: totalDeployments,
            SuccessfulDeployments: successfulDeployments,
            FailedDeployments: failedDeployments,
            DeploymentFrequencyPerDay: Math.Round(deploymentFrequency, 4),
            MedianLeadTimeHours: Math.Round(medianLeadTime, 2),
            ChangeFailureRatePercent: Math.Round(changeFailureRate, 2),
            MeanTimeToRecoveryHours: Math.Round(mttr, 2),
            PerformanceLevel: performanceLevel);

        return Task.FromResult(summary);
    }

    /// <inheritdoc/>
    public async IAsyncEnumerable<DoraDeploymentEvent> GetDeploymentsAsync(
        string tenantId,
        string? environment,
        DateTimeOffset from,
        DateTimeOffset to,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        // The snapshot is taken on first enumeration; a null tenantId surfaces there as ArgumentNullException.
        var deployments = GetDeploymentsInRange(tenantId, environment, from, to);
        foreach (var deployment in deployments)
        {
            cancellationToken.ThrowIfCancellationRequested();
            yield return deployment;
        }

        await Task.CompletedTask; // Async enumerable pattern
    }

    /// <inheritdoc/>
    public async IAsyncEnumerable<DoraIncidentEvent> GetIncidentsAsync(
        string tenantId,
        string? environment,
        DateTimeOffset from,
        DateTimeOffset to,
        bool includeOpen = true,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        // The snapshot is taken on first enumeration; a null tenantId surfaces there as ArgumentNullException.
        var incidents = GetIncidentsInRange(tenantId, environment, from, to, resolvedOnly: !includeOpen);
        foreach (var incident in incidents)
        {
            cancellationToken.ThrowIfCancellationRequested();
            yield return incident;
        }

        await Task.CompletedTask; // Async enumerable pattern
    }

    /// <summary>
    /// Returns a snapshot of the tenant's deployments whose timestamp lies in [from, to] (both inclusive),
    /// optionally filtered by environment (ordinal, case-insensitive), ordered by deployment timestamp.
    /// </summary>
    private List<DoraDeploymentEvent> GetDeploymentsInRange(
        string tenantId,
        string? environment,
        DateTimeOffset from,
        DateTimeOffset to)
    {
        if (!_deployments.TryGetValue(GetTenantKey(tenantId), out var list))
        {
            return new List<DoraDeploymentEvent>();
        }

        lock (list)
        {
            var query = list.Where(d =>
                d.DeploymentTimestamp >= from &&
                d.DeploymentTimestamp <= to);

            if (!string.IsNullOrEmpty(environment))
            {
                // Static string.Equals tolerates a stored event whose Environment is null.
                query = query.Where(d => string.Equals(d.Environment, environment, StringComparison.OrdinalIgnoreCase));
            }

            // Materialize inside the lock so the returned list is an independent snapshot.
            return query.OrderBy(d => d.DeploymentTimestamp).ToList();
        }
    }

    /// <summary>
    /// Returns a snapshot of the tenant's incidents whose start time lies in [from, to] (both inclusive),
    /// optionally filtered by environment (ordinal, case-insensitive) and by resolution state,
    /// ordered by start time.
    /// </summary>
    private List<DoraIncidentEvent> GetIncidentsInRange(
        string tenantId,
        string? environment,
        DateTimeOffset from,
        DateTimeOffset to,
        bool resolvedOnly)
    {
        if (!_incidents.TryGetValue(GetTenantKey(tenantId), out var list))
        {
            return new List<DoraIncidentEvent>();
        }

        lock (list)
        {
            var query = list.Where(i =>
                i.StartedAt >= from &&
                i.StartedAt <= to);

            if (!string.IsNullOrEmpty(environment))
            {
                // Static string.Equals tolerates a stored event whose Environment is null.
                query = query.Where(i => string.Equals(i.Environment, environment, StringComparison.OrdinalIgnoreCase));
            }

            if (resolvedOnly)
            {
                query = query.Where(i => !i.IsOpen);
            }

            // Materialize inside the lock so the returned list is an independent snapshot.
            return query.OrderBy(i => i.StartedAt).ToList();
        }
    }

    /// <summary>
    /// Computes the median of an already ascending-sorted list; returns 0 for an empty list.
    /// </summary>
    private static double CalculateMedian(List<double> sortedValues)
    {
        if (sortedValues.Count == 0)
        {
            return 0;
        }

        var mid = sortedValues.Count / 2;

        // Even count: average the two middle values; odd count: take the middle value.
        return sortedValues.Count % 2 == 0
            ? (sortedValues[mid - 1] + sortedValues[mid]) / 2.0
            : sortedValues[mid];
    }

    /// <summary>
    /// Normalizes a tenant identifier into the dictionary key (invariant lower-case).
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="tenantId"/> is <c>null</c>.</exception>
    private static string GetTenantKey(string tenantId)
    {
        ArgumentNullException.ThrowIfNull(tenantId);
        return tenantId.ToLowerInvariant();
    }
}
|
||||
@@ -0,0 +1,50 @@
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Executive-level outcome analytics report derived from deployment and incident telemetry events.
/// </summary>
/// <param name="TenantId">Tenant the report covers.</param>
/// <param name="Environment">Environment the report is restricted to; may be <c>null</c>.</param>
/// <param name="PeriodStart">Start of the reporting period.</param>
/// <param name="PeriodEnd">End of the reporting period.</param>
/// <param name="TotalDeployments">Total number of deployments.</param>
/// <param name="FailedDeployments">Number of failed deployments.</param>
/// <param name="TotalIncidents">Total number of incidents.</param>
/// <param name="ResolvedIncidents">Number of resolved incidents.</param>
/// <param name="AcknowledgedIncidents">Number of acknowledged incidents.</param>
/// <param name="MeanTimeToAcknowledgeHours">Mean time to acknowledge, in hours.</param>
/// <param name="MeanTimeToRecoveryHours">Mean time to recovery, in hours.</param>
/// <param name="DeploymentAttribution">Deployment attribution slices.</param>
/// <param name="IncidentAttribution">Incident attribution slices.</param>
/// <param name="DailyCohorts">Daily cohort slices.</param>
public sealed record OutcomeExecutiveReport(
    string TenantId,
    string? Environment,
    DateTimeOffset PeriodStart,
    DateTimeOffset PeriodEnd,
    int TotalDeployments,
    int FailedDeployments,
    int TotalIncidents,
    int ResolvedIncidents,
    int AcknowledgedIncidents,
    double MeanTimeToAcknowledgeHours,
    double MeanTimeToRecoveryHours,
    IReadOnlyList<DeploymentAttributionSlice> DeploymentAttribution,
    IReadOnlyList<IncidentAttributionSlice> IncidentAttribution,
    IReadOnlyList<OutcomeCohortSlice> DailyCohorts);
|
||||
|
||||
/// <summary>
/// Deployment-outcome attribution slice; each slice groups deployments by pipeline.
/// </summary>
/// <param name="PipelineId">Identifier of the pipeline the slice is grouped by.</param>
/// <param name="DeploymentCount">Number of deployments in the slice.</param>
/// <param name="FailedDeploymentCount">Number of failed deployments in the slice.</param>
/// <param name="ChangeFailureRatePercent">Change failure rate for the slice, as a percentage.</param>
/// <param name="MedianLeadTimeHours">Median lead time for the slice, in hours.</param>
public sealed record DeploymentAttributionSlice(
    string PipelineId,
    int DeploymentCount,
    int FailedDeploymentCount,
    double ChangeFailureRatePercent,
    double MedianLeadTimeHours);
|
||||
|
||||
/// <summary>
/// Incident attribution slice; each slice groups incidents by severity.
/// </summary>
/// <param name="Severity">Severity the slice is grouped by.</param>
/// <param name="IncidentCount">Number of incidents in the slice.</param>
/// <param name="ResolvedIncidentCount">Number of resolved incidents in the slice.</param>
/// <param name="AcknowledgedIncidentCount">Number of acknowledged incidents in the slice.</param>
/// <param name="MeanTimeToAcknowledgeHours">Mean time to acknowledge for the slice, in hours.</param>
/// <param name="MeanTimeToRecoveryHours">Mean time to recovery for the slice, in hours.</param>
public sealed record IncidentAttributionSlice(
    DoraIncidentSeverity Severity,
    int IncidentCount,
    int ResolvedIncidentCount,
    int AcknowledgedIncidentCount,
    double MeanTimeToAcknowledgeHours,
    double MeanTimeToRecoveryHours);
|
||||
|
||||
/// <summary>
/// Per-day cohort view used for trend reporting.
/// </summary>
/// <param name="Day">Calendar day the cohort describes.</param>
/// <param name="DeploymentCount">Number of deployments counted for the day.</param>
/// <param name="FailedDeploymentCount">Number of failed deployments counted for the day.</param>
/// <param name="ResolvedIncidentCount">Number of resolved incidents counted for the day.</param>
public sealed record OutcomeCohortSlice(
    DateOnly Day,
    int DeploymentCount,
    int FailedDeploymentCount,
    int ResolvedIncidentCount);
|
||||
@@ -134,6 +134,44 @@ public static class TelemetryServiceCollectionExtensions
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
/// Adds DORA (DevOps Research and Assessment) metrics, used to measure software delivery performance,
/// to the service collection. The four key metrics are Deployment Frequency, Lead Time for Changes,
/// Change Failure Rate, and MTTR.
/// </summary>
/// <remarks>
/// Registers <see cref="DoraMetrics"/>, <see cref="IDoraMetricsService"/> (in-memory implementation) and
/// <see cref="IOutcomeAnalyticsService"/> as singletons via <c>TryAddSingleton</c>, so registrations that
/// already exist for those service types are left untouched.
/// </remarks>
/// <param name="services">Service collection to mutate.</param>
/// <param name="configureOptions">Optional callback that configures the options, including SLO targets.</param>
/// <returns>The same service collection, to allow call chaining.</returns>
/// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
public static IServiceCollection AddDoraMetrics(
    this IServiceCollection services,
    Action<DoraMetricsOptions>? configureOptions = null)
{
    ArgumentNullException.ThrowIfNull(services);

    // The configure step is always registered; it simply does nothing when no callback was supplied.
    var optionsBuilder = services.AddOptions<DoraMetricsOptions>();
    optionsBuilder.Configure(doraOptions =>
    {
        if (configureOptions is not null)
        {
            configureOptions(doraOptions);
        }
    });

    services.TryAddSingleton<DoraMetrics>(
        provider => new DoraMetrics(provider.GetRequiredService<IOptions<DoraMetricsOptions>>().Value));

    services.TryAddSingleton<IDoraMetricsService>(
        provider => new InMemoryDoraMetricsService(
            provider.GetRequiredService<DoraMetrics>(),
            provider.GetRequiredService<IOptions<DoraMetricsOptions>>().Value));

    services.TryAddSingleton<IOutcomeAnalyticsService>(
        provider => new DoraOutcomeAnalyticsService(provider.GetRequiredService<IDoraMetricsService>()));

    return services;
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers incident mode services for toggling enhanced telemetry during incidents.
|
||||
/// </summary>
|
||||
|
||||
Reference in New Issue
Block a user