Merge branch 'main' of https://git.stella-ops.org/stella-ops.org/git.stella-ops.org
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
sdk-generator-smoke / sdk-smoke (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-11-27 21:45:32 +02:00
510 changed files with 138401 additions and 51276 deletions

View File

@@ -1,233 +1,233 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// All instruments are created on a static meter named "StellaOps.Notifier" when an instance is
/// constructed. Queue depths passed to <see cref="RecordQueueDepth"/> are cached per
/// (tenant, channel type) pair and published through an observable gauge.
/// NOTE(review): the meter is static but instruments are created per instance; presumably this
/// type is registered as a singleton — confirm.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (using ObservableGauge pattern).
    // Keyed by a (tenant, channel type) tuple rather than a "tenant:channel" string so that
    // identifiers containing ':' are still reported instead of being silently dropped.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates the counters, histograms and the queue-depth observable gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");
        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");
        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");
        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");
        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");
        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");
        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");
        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");
        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    /// <param name="status">Status tag value.</param>
    /// <param name="duration">Duration of the attempt.</param>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };
        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged with tenant, level and outcome.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="level">Escalation level; emitted as a string tag.</param>
    /// <param name="outcome">Outcome tag value.</param>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant culture keeps the tag value stable regardless of the host's current culture.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };
        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged with tenant, reason and channel type.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="reason">Reason tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };
        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="ruleId">Rule id tag value.</param>
    /// <param name="matched">Emitted as the tag value "true" or "false".</param>
    /// <param name="duration">Duration of the evaluation.</param>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, matched ? "true" : "false" }
        };
        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="templateKey">Template key tag value.</param>
    /// <param name="success">Emitted as the tag value "true" or "false".</param>
    /// <param name="duration">Duration of the render.</param>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, success ? "true" : "false" }
        };
        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm detection event tagged with tenant, storm key and decision.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="stormKey">Storm key tag value.</param>
    /// <param name="decision">Decision tag value.</param>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };
        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention cleanup counter.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="entityType">Entity type tag value.</param>
    /// <param name="deletedCount">Amount added to the counter.</param>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };

        // NOTE(review): the counter is incremented by the deleted count, while the instrument
        // description speaks of "cleanup operations" — confirm which is intended.
        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for a tenant/channel pair; the value is published the next
    /// time the observable gauge is collected.
    /// </summary>
    /// <param name="tenantId">Tenant the depth belongs to.</param>
    /// <param name="channelType">Channel type the depth belongs to.</param>
    /// <param name="depth">Current depth; overwrites any previous value for the pair.</param>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a "notify.delivery" activity tagged with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }

        return activity;
    }

    /// <summary>
    /// Starts a "notify.escalation" activity tagged with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }

        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per cached (tenant, channel type) pair.
    /// </summary>
    /// <remarks>
    /// The measurements are materialized into a list while the lock is held and the list is
    /// returned afterwards, so the lock is never held while the caller enumerates the result.
    /// </remarks>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var entry in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    entry.Value,
                    new TagList
                    {
                        { NotifyMetricTags.TenantId, entry.Key.TenantId },
                        { NotifyMetricTags.ChannelType, entry.Key.ChannelType }
                    }));
            }

            return snapshot;
        }
    }
}
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// All instruments are created on a static meter named "StellaOps.Notifier" when an instance is
/// constructed. Queue depths passed to <see cref="RecordQueueDepth"/> are cached per
/// (tenant, channel type) pair and published through an observable gauge.
/// NOTE(review): the meter is static but instruments are created per instance; presumably this
/// type is registered as a singleton — confirm.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (using ObservableGauge pattern).
    // Keyed by a (tenant, channel type) tuple rather than a "tenant:channel" string so that
    // identifiers containing ':' are still reported instead of being silently dropped.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates the counters, histograms and the queue-depth observable gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");
        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");
        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");
        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");
        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");
        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");
        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");
        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");
        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    /// <param name="status">Status tag value.</param>
    /// <param name="duration">Duration of the attempt.</param>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };
        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged with tenant, level and outcome.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="level">Escalation level; emitted as a string tag.</param>
    /// <param name="outcome">Outcome tag value.</param>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant culture keeps the tag value stable regardless of the host's current culture.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };
        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged with tenant, reason and channel type.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="reason">Reason tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };
        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="ruleId">Rule id tag value.</param>
    /// <param name="matched">Emitted as the tag value "true" or "false".</param>
    /// <param name="duration">Duration of the evaluation.</param>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, matched ? "true" : "false" }
        };
        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="templateKey">Template key tag value.</param>
    /// <param name="success">Emitted as the tag value "true" or "false".</param>
    /// <param name="duration">Duration of the render.</param>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, success ? "true" : "false" }
        };
        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm detection event tagged with tenant, storm key and decision.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="stormKey">Storm key tag value.</param>
    /// <param name="decision">Decision tag value.</param>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };
        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention cleanup counter.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="entityType">Entity type tag value.</param>
    /// <param name="deletedCount">Amount added to the counter.</param>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };

        // NOTE(review): the counter is incremented by the deleted count, while the instrument
        // description speaks of "cleanup operations" — confirm which is intended.
        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for a tenant/channel pair; the value is published the next
    /// time the observable gauge is collected.
    /// </summary>
    /// <param name="tenantId">Tenant the depth belongs to.</param>
    /// <param name="channelType">Channel type the depth belongs to.</param>
    /// <param name="depth">Current depth; overwrites any previous value for the pair.</param>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a "notify.delivery" activity tagged with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }

        return activity;
    }

    /// <summary>
    /// Starts a "notify.escalation" activity tagged with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }

        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per cached (tenant, channel type) pair.
    /// </summary>
    /// <remarks>
    /// The measurements are materialized into a list while the lock is held and the list is
    /// returned afterwards, so the lock is never held while the caller enumerates the result.
    /// </remarks>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var entry in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    entry.Value,
                    new TagList
                    {
                        { NotifyMetricTags.TenantId, entry.Key.TenantId },
                        { NotifyMetricTags.ChannelType, entry.Key.ChannelType }
                    }));
            }

            return snapshot;
        }
    }
}

View File

@@ -0,0 +1,471 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Contract of the chaos testing engine used to simulate channel outages and failures.
/// </summary>
public interface IChaosEngine
{
    /// <summary>
    /// Injects a fault for a channel type.
    /// </summary>
    /// <param name="request">Description of the fault to inject.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The fault injection that was created.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(
        ChaosFaultRequest request,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Removes a fault injection.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A boolean outcome of the removal.</returns>
    Task<bool> RemoveFaultAsync(
        string faultId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets all active faults.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Checks if a channel operation should fail due to chaos.
    /// </summary>
    /// <param name="channelType">Channel type of the operation being checked.</param>
    /// <param name="tenantId">Optional tenant of the operation; defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<ChaosFaultResult> ShouldFailAsync(
        string channelType,
        string? tenantId = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Runs a chaos test scenario.
    /// </summary>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<ChaosTestResult> RunScenarioAsync(
        ChaosScenario scenario,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets chaos test history.
    /// </summary>
    /// <param name="limit">Maximum number of results to return; defaults to 50.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(
        int limit = 50,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Clears all active faults.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Describes a fault that a caller wants injected.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>Channel type the fault applies to (required).</summary>
    public required string ChannelType { get; init; }

    /// <summary>Optional tenant the fault is limited to.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of fault to simulate (required).</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails; defaults to 1.0.</summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>Optional lifetime of the fault.</summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Optional latency to attach to the fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code to attach to the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional error message to attach to the fault.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-form description.</summary>
    public string? Description { get; init; }
}
/// <summary>
/// Kinds of fault the chaos engine can simulate.
/// </summary>
/// <remarks>
/// Numeric values are spelled out explicitly; they equal the implicit declaration-order values.
/// </remarks>
public enum ChaosFaultType
{
    Outage = 0,
    Latency = 1,
    RateLimit = 2,
    AuthFailure = 3,
    Timeout = 4,
    PartialFailure = 5,
    Intermittent = 6
}
/// <summary>
/// A fault that has been injected, together with its bookkeeping data.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier of this fault (required).</summary>
    public required string FaultId { get; init; }

    /// <summary>Channel type the fault applies to (required).</summary>
    public required string ChannelType { get; init; }

    /// <summary>Optional tenant the fault is limited to.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of fault being simulated (required).</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Optional latency attached to the fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code attached to the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional error message attached to the fault.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-form description.</summary>
    public string? Description { get; init; }

    /// <summary>When the fault was created.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>When the fault expires, if it does.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Number of times the fault has triggered.</summary>
    public int TriggerCount { get; init; }

    /// <summary>Whether the fault is active; defaults to <c>true</c>.</summary>
    public bool IsActive { get; init; } = true;
}
/// <summary>
/// Outcome of asking the chaos engine whether an operation should fail.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary>Whether the operation should fail.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>The fault that matched the operation, if any.</summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency to inject, if any.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception representing the simulated failure, if any.</summary>
    public Exception? SimulatedException { get; init; }
}
/// <summary>
/// A named, ordered list of steps that make up one chaos test.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario (required).</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario (required).</summary>
    public required string Name { get; init; }

    /// <summary>Optional free-form description.</summary>
    public string? Description { get; init; }

    /// <summary>Steps of the scenario (required).</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Overall time budget for the scenario; defaults to 10 minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>Whether the scenario stops at the first failing step; defaults to <c>false</c>.</summary>
    public bool StopOnFirstFailure { get; init; }
}
/// <summary>
/// One step of a chaos scenario: an action plus the optional data that action needs.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step (required).</summary>
    public required string StepId { get; init; }

    /// <summary>Display name of the step (required).</summary>
    public required string Name { get; init; }

    /// <summary>Action the step performs (required).</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Fault request for an inject-fault step.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Fault identifier for a remove-fault step.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>Delay for a wait step.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Assertion for an assert step.</summary>
    public ChaosAssertion? Assertion { get; init; }
}
/// <summary>
/// Actions a chaos scenario step can request.
/// </summary>
/// <remarks>
/// Numeric values are spelled out explicitly; they equal the implicit declaration-order values.
/// </remarks>
public enum ChaosStepAction
{
    InjectFault = 0,
    RemoveFault = 1,
    Wait = 2,
    Assert = 3,
    SendTestDelivery = 4,
    CheckMetrics = 5
}
/// <summary>
/// An expectation checked by an assert step of a chaos scenario.
/// </summary>
public sealed record ChaosAssertion
{
    /// <summary>Kind of assertion (required).</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Optional metric name the assertion refers to.</summary>
    public string? MetricName { get; init; }

    /// <summary>Optional expected numeric value.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Optional numeric tolerance.</summary>
    public double? Tolerance { get; init; }

    /// <summary>Optional expected status text.</summary>
    public string? ExpectedStatus { get; init; }
}
/// <summary>
/// Kinds of assertion a chaos scenario can make.
/// </summary>
/// <remarks>
/// Numeric values are spelled out explicitly; they equal the implicit declaration-order values.
/// </remarks>
public enum ChaosAssertionType
{
    MetricEquals = 0,
    MetricGreaterThan = 1,
    MetricLessThan = 2,
    DeadLetterCountEquals = 3,
    FallbackTriggered = 4,
    AlertFired = 5
}
/// <summary>
/// Outcome of running one chaos scenario.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier of this test run (required).</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run (required).</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run (required).</summary>
    public required string ScenarioName { get; init; }

    /// <summary>Whether the run succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>When the run completed.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>How long the run took.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Per-step results; defaults to an empty list.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Error text, if any.</summary>
    public string? Error { get; init; }
}
/// <summary>
/// Outcome of executing one step of a chaos scenario.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the step (required).</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the step (required).</summary>
    public required string StepName { get; init; }

    /// <summary>Whether the step succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the step was executed.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>How long the step took.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Error text, if any.</summary>
    public string? Error { get; init; }

    /// <summary>Optional data produced by the step.</summary>
    public object? Data { get; init; }
}
/// <summary>
/// Configuration options for the chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Configuration section name for these options.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Whether the chaos engine is enabled; defaults to <c>false</c>.</summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos is allowed in production; defaults to <c>false</c>.</summary>
    public bool AllowInProduction { get; set; }

    /// <summary>Longest allowed fault duration; defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of simultaneously active faults; defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types that may be targeted; defaults to six built-in names.</summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } =
    [
        "webhook",
        "email",
        "slack",
        "teams",
        "pagerduty",
        "opsgenie"
    ];
}
/// <summary>
/// Default implementation of chaos engine.
/// </summary>
/// <remarks>
/// Active faults are kept in memory in a concurrent dictionary keyed by fault id, and the most
/// recent scenario results are kept in an in-memory list guarded by a lock.
/// NOTE(review): <see cref="ChaosEngineOptions.AllowInProduction"/> and
/// <see cref="ChaosEngineOptions.AllowedChannelTypes"/> are not consulted anywhere in this class —
/// confirm whether they are enforced by a caller.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    // Maximum number of scenario results retained in the history list.
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; default options are used when missing.</param>
    /// <param name="timeProvider">Clock; the system clock is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink (stored, not used by this class).</param>
    /// <param name="logger">Logger (required).</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault. The duration is capped at the configured maximum and the failure
    /// probability is clamped to the range 0.0–1.0.
    /// </summary>
    /// <param name="request">Fault to inject.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>The created fault injection.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// The engine is disabled, or the maximum number of concurrent faults is reached.
    /// </exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(request);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        // Faults that have already expired must not count against the concurrency limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);
        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given id.
    /// </summary>
    /// <returns><c>true</c> when a fault was removed; otherwise <c>false</c>.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Drops expired faults and returns a snapshot of the remaining ones.
    /// </summary>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation on <paramref name="channelType"/> should fail.
    /// </summary>
    /// <remarks>
    /// A fault matches when it is active, its channel type equals <paramref name="channelType"/> or
    /// is "*", and its tenant is unset or equals <paramref name="tenantId"/>. The first match found
    /// is rolled against its failure probability.
    /// </remarks>
    /// <param name="channelType">Channel type of the operation.</param>
    /// <param name="tenantId">Optional tenant of the operation.</param>
    /// <param name="cancellationToken">Not observed by this implementation.</param>
    /// <returns>
    /// A result with <c>ShouldFail = false</c> when the engine is disabled, nothing matches, or the
    /// roll passes; otherwise a result carrying the matched fault, its latency and a simulated exception.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values.FirstOrDefault(f =>
            f.IsActive
            && (f.ChannelType == channelType || f.ChannelType == "*")
            && (f.TenantId is null || f.TenantId == tenantId));

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is safe for concurrent callers; a shared `new Random()` instance is not.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        IncrementTriggerCount(matchingFault.FaultId);

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            // The fault as it was when it matched, i.e. before this trigger was counted.
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = CreateSimulatedException(matchingFault)
        });
    }

    /// <summary>
    /// Runs the steps of <paramref name="scenario"/> in order and stores the result in the history.
    /// </summary>
    /// <remarks>
    /// Timeout and cancellation are checked before every step and are also propagated out of steps,
    /// so a timed-out or cancelled scenario stops instead of running its remaining steps.
    /// </remarks>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Cancels the scenario; the result then reports failure.</param>
    /// <returns>The result of the run, including the results of all steps that completed.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(scenario);

        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Steps that complete synchronously never observe the token themselves.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (stepResult.Success)
                {
                    continue;
                }

                success = false;
                if (scenario.StopOnFirstFailure)
                {
                    error = $"Step '{step.Name}' failed: {stepResult.Error}";
                    break;
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (OperationCanceledException)
        {
            success = false;
            error = "Scenario cancelled";
        }
        catch (Exception ex)
        {
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> of the most recent scenario results, newest first.
    /// </summary>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every fault, expired or not.
    /// </summary>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    // Maps a fault type to the exception handed back to the caller as the simulated failure.
    private static Exception CreateSimulatedException(ChaosFaultInjection fault) => fault.FaultType switch
    {
        ChaosFaultType.Outage => new InvalidOperationException(fault.ErrorMessage ?? "Channel outage (chaos)"),
        ChaosFaultType.AuthFailure => new UnauthorizedAccessException(fault.ErrorMessage ?? "Auth failure (chaos)"),
        ChaosFaultType.Timeout => new TimeoutException(fault.ErrorMessage ?? "Timeout (chaos)"),
        ChaosFaultType.RateLimit => new InvalidOperationException(fault.ErrorMessage ?? "Rate limited (chaos)"),
        _ => new Exception(fault.ErrorMessage ?? "Chaos fault")
    };

    // Atomically bumps the trigger count of a fault. The compare-and-swap loop avoids lost
    // updates between concurrent callers and never re-adds a fault that was removed meanwhile.
    private void IncrementTriggerCount(string faultId)
    {
        while (_activeFaults.TryGetValue(faultId, out var current))
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };
            if (_activeFaults.TryUpdate(faultId, updated, current))
            {
                return;
            }
        }
    }

    // Executes a single step. Failures are reported in the returned result; cancellation is
    // allowed to propagate so the scenario loop can stop and report a timeout or cancellation.
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        ChaosStepResult BuildResult(bool succeeded, string? errorText = null, object? payload = null) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = succeeded,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Error = errorText,
            Data = payload
        };

        try
        {
            object? data = null;

            // NOTE(review): steps whose payload is missing, and the SendTestDelivery / CheckMetrics
            // actions, match no case below and are reported as successful no-ops — confirm intended.
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                    data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    break;

                case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    break;

                case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    break;

                case ChaosStepAction.Assert when step.Assertion is not null:
                    var (passed, assertionError) = EvaluateAssertion(step.Assertion);
                    if (!passed)
                    {
                        return BuildResult(false, assertionError);
                    }

                    break;
            }

            return BuildResult(true, payload: data);
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            return BuildResult(false, ex.Message);
        }
    }

    // Placeholder evaluation: every assertion type currently passes.
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    // Removes every fault whose expiry time lies before the current time.
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expired = _activeFaults
            .Where(f => f.Value.ExpiresAt.HasValue && f.Value.ExpiresAt < now)
            .Select(f => f.Key)
            .ToList();

        foreach (var id in expired)
        {
            _activeFaults.TryRemove(id, out _);
            _logger.LogDebug("Expired chaos fault {FaultId}", id);
        }
    }
}

View File

@@ -0,0 +1,349 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Contract for handling notifications that have failed permanently after all retries.
/// </summary>
public interface IDeadLetterHandler
{
    /// <summary>
    /// Moves a delivery to the dead-letter queue.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the delivery.</param>
    /// <param name="deliveryId">Identifier of the failed delivery.</param>
    /// <param name="reason">Why the delivery is being dead-lettered.</param>
    /// <param name="channelType">Channel type of the delivery.</param>
    /// <param name="payload">Optional original payload; defaults to <c>null</c>.</param>
    /// <param name="exception">Optional exception; defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<DeadLetteredDelivery> DeadLetterAsync(
        string tenantId,
        string deliveryId,
        DeadLetterReason reason,
        string channelType,
        object? payload = null,
        Exception? exception = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets dead-lettered deliveries for a tenant.
    /// </summary>
    /// <param name="tenantId">Tenant whose entries are requested.</param>
    /// <param name="query">Optional filter; defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Retries a dead-lettered delivery.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<DeadLetterRetryResult> RetryAsync(
        string tenantId,
        string deadLetterId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Retries all matching dead-lettered deliveries.
    /// </summary>
    /// <param name="tenantId">Tenant whose entries are retried.</param>
    /// <param name="query">Optional filter; defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<DeadLetterBulkRetryResult> RetryBulkAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Discards a dead-lettered delivery.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="reason">Optional discard reason; defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<bool> DiscardAsync(
        string tenantId,
        string deadLetterId,
        string? reason = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets statistics about dead-lettered deliveries.
    /// </summary>
    /// <param name="tenantId">Optional tenant; defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<DeadLetterStats> GetStatsAsync(
        string? tenantId = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Purges old dead-lettered deliveries.
    /// </summary>
    /// <param name="tenantId">Tenant to purge; may be <c>null</c>.</param>
    /// <param name="olderThan">Age threshold for the purge.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<int> PurgeAsync(
        string? tenantId,
        TimeSpan olderThan,
        CancellationToken cancellationToken = default);
}
/// <summary>
/// Reasons a delivery can be dead-lettered.
/// </summary>
/// <remarks>
/// Numeric values are spelled out explicitly; they equal the implicit declaration-order values.
/// </remarks>
public enum DeadLetterReason
{
    MaxRetriesExceeded = 0,
    InvalidPayload = 1,
    ChannelUnavailable = 2,
    AuthenticationFailed = 3,
    RateLimited = 4,
    TemplateRenderFailed = 5,
    ConfigurationError = 6,
    UnknownError = 7
}
/// <summary>
/// A delivery that has been moved to the dead-letter queue.
/// </summary>
public sealed record DeadLetteredDelivery
{
    /// <summary>Identifier of the dead-letter entry (required).</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Tenant that owns the delivery (required).</summary>
    public required string TenantId { get; init; }

    /// <summary>Identifier of the original delivery (required).</summary>
    public required string DeliveryId { get; init; }

    /// <summary>Channel type of the delivery (required).</summary>
    public required string ChannelType { get; init; }

    /// <summary>Why the delivery was dead-lettered (required).</summary>
    public required DeadLetterReason Reason { get; init; }

    /// <summary>Optional detail text for the reason.</summary>
    public string? ReasonDetails { get; init; }

    /// <summary>Optional original payload.</summary>
    public object? OriginalPayload { get; init; }

    /// <summary>Optional exception type name.</summary>
    public string? ExceptionType { get; init; }

    /// <summary>Optional exception message.</summary>
    public string? ExceptionMessage { get; init; }

    /// <summary>Number of attempts.</summary>
    public int AttemptCount { get; init; }

    /// <summary>Time of the first attempt.</summary>
    public DateTimeOffset FirstAttemptAt { get; init; }

    /// <summary>Time the delivery was dead-lettered.</summary>
    public DateTimeOffset DeadLetteredAt { get; init; }

    /// <summary>Time of the last retry, if any.</summary>
    public DateTimeOffset? LastRetryAt { get; init; }

    /// <summary>Number of retries.</summary>
    public int RetryCount { get; init; }

    /// <summary>Current status; defaults to <see cref="DeadLetterStatus.Pending"/>.</summary>
    public DeadLetterStatus Status { get; init; } = DeadLetterStatus.Pending;

    /// <summary>Optional reason the entry was discarded.</summary>
    public string? DiscardReason { get; init; }
}
/// <summary>
/// Lifecycle status of a dead-lettered delivery.
/// </summary>
/// <remarks>
/// Numeric values are spelled out explicitly; they equal the implicit declaration-order values.
/// </remarks>
public enum DeadLetterStatus
{
    Pending = 0,
    Retrying = 1,
    Retried = 2,
    Discarded = 3
}
/// <summary>
/// Filter and paging parameters for listing dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterQuery
{
    /// <summary>Optional reason filter.</summary>
    public DeadLetterReason? Reason { get; init; }

    /// <summary>Optional channel type filter.</summary>
    public string? ChannelType { get; init; }

    /// <summary>Optional status filter.</summary>
    public DeadLetterStatus? Status { get; init; }

    /// <summary>Optional lower time bound.</summary>
    public DateTimeOffset? After { get; init; }

    /// <summary>Optional upper time bound.</summary>
    public DateTimeOffset? Before { get; init; }

    /// <summary>Maximum number of entries; defaults to 100.</summary>
    public int Limit { get; init; } = 100;

    /// <summary>Number of entries to skip; defaults to 0.</summary>
    public int Offset { get; init; }
}
/// <summary>
/// Outcome of retrying a single dead-lettered delivery.
/// </summary>
public sealed record DeadLetterRetryResult
{
    /// <summary>Identifier of the dead-letter entry (required).</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Whether the retry succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>Error text, if any.</summary>
    public string? Error { get; init; }

    /// <summary>Status of the entry after the retry.</summary>
    public DeadLetterStatus NewStatus { get; init; }
}
/// <summary>
/// Result of a bulk retry operation.
/// </summary>
/// <remarks>
/// As produced by the in-memory handler: <c>Total</c> is the number of retries that were
/// attempted (only <c>Pending</c> entries matching the query are attempted),
/// <c>Succeeded</c> and <c>Failed</c> partition that total, and <c>Results</c> holds one
/// item per attempted entry.
/// </remarks>
public sealed record DeadLetterBulkRetryResult
{
public int Total { get; init; }
public int Succeeded { get; init; }
public int Failed { get; init; }
public IReadOnlyList<DeadLetterRetryResult> Results { get; init; } = [];
}
/// <summary>
/// Statistics about dead-lettered deliveries.
/// </summary>
/// <remarks>
/// The in-memory handler computes these over a single tenant when a tenant id is given and
/// over all tenants otherwise; <c>TenantId</c> echoes that argument and is null for the
/// all-tenant view.
/// </remarks>
public sealed record DeadLetterStats
{
// Time at which the statistics were computed.
public DateTimeOffset Timestamp { get; init; }
public string? TenantId { get; init; }
public int TotalCount { get; init; }
// Per-status counts; together they partition TotalCount.
public int PendingCount { get; init; }
public int RetryingCount { get; init; }
public int RetriedCount { get; init; }
public int DiscardedCount { get; init; }
// Entry counts grouped by reason and by channel type; groups with no entries are absent.
public IReadOnlyDictionary<DeadLetterReason, int> ByReason { get; init; } = new Dictionary<DeadLetterReason, int>();
public IReadOnlyDictionary<string, int> ByChannel { get; init; } = new Dictionary<string, int>();
// Earliest and latest DeadLetteredAt among the counted entries; null when there are none.
public DateTimeOffset? OldestDeadLetterAt { get; init; }
public DateTimeOffset? NewestDeadLetterAt { get; init; }
}
/// <summary>
/// Options for dead-letter handling.
/// </summary>
/// <remarks>
/// NOTE(review): the in-memory handler in this file stores these options but does not read
/// any of them; whether another component enforces retry limits, retention or auto-purge
/// cannot be determined from here.
/// </remarks>
public sealed class DeadLetterOptions
{
// Configuration section name for these options.
public const string SectionName = "Notifier:Observability:DeadLetter";
public bool Enabled { get; set; } = true;
// Default: 3.
public int MaxRetryAttempts { get; set; } = 3;
// Default: 5 minutes.
public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMinutes(5);
// Default: 30 days.
public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromDays(30);
public bool AutoPurge { get; set; } = true;
// Default: 24 hours.
public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);
// Default: 100. Presumably a dead-letter count that triggers alerting; confirm against the consumer.
public int AlertThreshold { get; set; } = 100;
}
/// <summary>
/// In-memory implementation of dead-letter handler.
/// </summary>
/// <remarks>
/// Entries are kept per tenant in a <see cref="List{T}"/> stored in a
/// <see cref="ConcurrentDictionary{TKey,TValue}"/>. Every read or write of a tenant list
/// happens while holding a lock on that list instance, so operations on one tenant are
/// serialized while different tenants do not contend. Entries are immutable records and
/// are updated by replacing the list slot with a <c>with</c>-copy.
/// All methods complete synchronously; the cancellation tokens are accepted for interface
/// compatibility and are not observed.
/// </remarks>
public sealed class InMemoryDeadLetterHandler : IDeadLetterHandler
{
    private readonly ConcurrentDictionary<string, List<DeadLetteredDelivery>> _deadLetters = new();
    private readonly DeadLetterOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<InMemoryDeadLetterHandler> _logger;

    /// <summary>
    /// Creates the handler.
    /// </summary>
    /// <param name="options">Dead-letter options; defaults are used when null.</param>
    /// <param name="timeProvider">Clock source; <see cref="TimeProvider.System"/> is used when null.</param>
    /// <param name="metrics">Optional metrics sink notified on every dead-letter.</param>
    /// <param name="logger">Logger (required).</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public InMemoryDeadLetterHandler(
        IOptions<DeadLetterOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<InMemoryDeadLetterHandler> logger)
    {
        _options = options?.Value ?? new DeadLetterOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Records a delivery as dead-lettered for the given tenant and returns the stored entry.
    /// </summary>
    /// <remarks>
    /// The exception message is stored both as reason details and as exception message.
    /// First-attempt and dead-lettered timestamps are both set to the current time.
    /// </remarks>
    public Task<DeadLetteredDelivery> DeadLetterAsync(
        string tenantId,
        string deliveryId,
        DeadLetterReason reason,
        string channelType,
        object? payload = null,
        Exception? exception = null,
        CancellationToken cancellationToken = default)
    {
        var now = _timeProvider.GetUtcNow();
        var deadLetter = new DeadLetteredDelivery
        {
            // "dl-" plus the first 13 hex digits of a fresh GUID (16 characters in total).
            DeadLetterId = $"dl-{Guid.NewGuid():N}"[..16],
            TenantId = tenantId,
            DeliveryId = deliveryId,
            ChannelType = channelType,
            Reason = reason,
            ReasonDetails = exception?.Message,
            OriginalPayload = payload,
            ExceptionType = exception?.GetType().FullName,
            ExceptionMessage = exception?.Message,
            DeadLetteredAt = now,
            FirstAttemptAt = now,
            Status = DeadLetterStatus.Pending
        };

        var list = _deadLetters.GetOrAdd(tenantId, _ => []);
        lock (list)
        {
            list.Add(deadLetter);
        }

        _metrics?.RecordDeadLetter(tenantId, reason.ToString(), channelType);
        _logger.LogWarning("Dead-lettered delivery {DeliveryId} for tenant {TenantId}: {Reason}", deliveryId, tenantId, reason);
        return Task.FromResult(deadLetter);
    }

    /// <summary>
    /// Returns the tenant's dead-lettered deliveries, newest first, after applying the
    /// optional filters and paging of <paramref name="query"/>.
    /// </summary>
    /// <returns>An empty list when the tenant has no entries.</returns>
    public Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>([]);
        }

        // Copy under the lock so filtering and sorting run on a stable snapshot.
        IEnumerable<DeadLetteredDelivery> filtered;
        lock (list)
        {
            filtered = list.ToList();
        }

        if (query is not null)
        {
            if (query.Reason.HasValue) filtered = filtered.Where(d => d.Reason == query.Reason.Value);
            if (!string.IsNullOrEmpty(query.ChannelType)) filtered = filtered.Where(d => d.ChannelType == query.ChannelType);
            if (query.Status.HasValue) filtered = filtered.Where(d => d.Status == query.Status.Value);
            if (query.After.HasValue) filtered = filtered.Where(d => d.DeadLetteredAt > query.After.Value);
            if (query.Before.HasValue) filtered = filtered.Where(d => d.DeadLetteredAt < query.Before.Value);
        }

        var result = filtered
            .OrderByDescending(d => d.DeadLetteredAt)
            .Skip(query?.Offset ?? 0)
            .Take(query?.Limit ?? 100)
            .ToList();
        return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>(result);
    }

    /// <summary>
    /// Marks a dead-lettered delivery as retried, stamping the retry time and incrementing
    /// its retry count.
    /// </summary>
    /// <returns>
    /// A failed result with error "Not found" when the tenant or entry does not exist,
    /// otherwise a successful result with status <see cref="DeadLetterStatus.Retried"/>.
    /// </returns>
    public Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(NotFound(deadLetterId));
        }

        // Look up and replace inside ONE critical section. Reading the entry under one lock
        // and writing it back under another would let a concurrent discard be overwritten by
        // the stale copy, and would report success for an entry purged in between.
        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(NotFound(deadLetterId));
            }

            var current = list[index];
            list[index] = current with
            {
                Status = DeadLetterStatus.Retried,
                LastRetryAt = _timeProvider.GetUtcNow(),
                RetryCount = current.RetryCount + 1
            };
        }

        _logger.LogInformation("Retrying dead-lettered delivery {DeadLetterId} for tenant {TenantId}", deadLetterId, tenantId);
        return Task.FromResult(new DeadLetterRetryResult { DeadLetterId = deadLetterId, Success = true, NewStatus = DeadLetterStatus.Retried });
    }

    /// <summary>
    /// Retries every <see cref="DeadLetterStatus.Pending"/> entry returned by
    /// <see cref="GetAsync"/> for the given query (paging included) and summarizes the outcome.
    /// </summary>
    public async Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default)
    {
        var deadLetters = await GetAsync(tenantId, query, cancellationToken);
        var results = new List<DeadLetterRetryResult>();
        foreach (var dl in deadLetters.Where(d => d.Status == DeadLetterStatus.Pending))
        {
            results.Add(await RetryAsync(tenantId, dl.DeadLetterId, cancellationToken));
        }

        return new DeadLetterBulkRetryResult
        {
            Total = results.Count,
            Succeeded = results.Count(r => r.Success),
            Failed = results.Count(r => !r.Success),
            Results = results
        };
    }

    /// <summary>
    /// Marks a dead-lettered delivery as discarded and records the optional reason.
    /// </summary>
    /// <returns><c>false</c> when the tenant or entry does not exist; otherwise <c>true</c>.</returns>
    public Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(false);
        }

        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(false);
            }

            list[index] = list[index] with { Status = DeadLetterStatus.Discarded, DiscardReason = reason };
        }

        _logger.LogInformation("Discarded dead-lettered delivery {DeadLetterId} for tenant {TenantId}: {Reason}", deadLetterId, tenantId, reason ?? "No reason");
        return Task.FromResult(true);
    }

    /// <summary>
    /// Computes statistics for one tenant, or across all tenants when
    /// <paramref name="tenantId"/> is null.
    /// </summary>
    public Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        // Snapshot each tenant list under its lock; enumerating a List<T> while another
        // thread adds to or purges it throws InvalidOperationException.
        var all = new List<DeadLetteredDelivery>();
        if (tenantId is not null)
        {
            if (_deadLetters.TryGetValue(tenantId, out var single))
            {
                lock (single)
                {
                    all.AddRange(single);
                }
            }
        }
        else
        {
            foreach (var perTenant in _deadLetters.Values)
            {
                lock (perTenant)
                {
                    all.AddRange(perTenant);
                }
            }
        }

        return Task.FromResult(new DeadLetterStats
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalCount = all.Count,
            PendingCount = all.Count(d => d.Status == DeadLetterStatus.Pending),
            RetryingCount = all.Count(d => d.Status == DeadLetterStatus.Retrying),
            RetriedCount = all.Count(d => d.Status == DeadLetterStatus.Retried),
            DiscardedCount = all.Count(d => d.Status == DeadLetterStatus.Discarded),
            ByReason = all.GroupBy(d => d.Reason).ToDictionary(g => g.Key, g => g.Count()),
            ByChannel = all.GroupBy(d => d.ChannelType).ToDictionary(g => g.Key, g => g.Count()),
            OldestDeadLetterAt = all.MinBy(d => d.DeadLetteredAt)?.DeadLetteredAt,
            NewestDeadLetterAt = all.MaxBy(d => d.DeadLetteredAt)?.DeadLetteredAt
        });
    }

    /// <summary>
    /// Removes entries dead-lettered earlier than now minus <paramref name="olderThan"/>,
    /// for one tenant or for all tenants when <paramref name="tenantId"/> is null.
    /// </summary>
    /// <returns>The number of removed entries.</returns>
    public Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default)
    {
        var cutoff = _timeProvider.GetUtcNow() - olderThan;
        var purged = 0;
        var tenants = tenantId is not null ? [tenantId] : _deadLetters.Keys.ToList();
        foreach (var t in tenants)
        {
            if (!_deadLetters.TryGetValue(t, out var list))
            {
                continue;
            }

            lock (list)
            {
                purged += list.RemoveAll(d => d.DeadLetteredAt < cutoff);
            }
        }

        _logger.LogInformation("Purged {Count} dead-lettered deliveries older than {OlderThan}", purged, olderThan);
        return Task.FromResult(purged);
    }

    // Builds the "entry does not exist" retry result shared by both lookup failures.
    private static DeadLetterRetryResult NotFound(string deadLetterId) =>
        new() { DeadLetterId = deadLetterId, Success = false, Error = "Not found", NewStatus = DeadLetterStatus.Pending };
}

View File

@@ -0,0 +1,802 @@
using System.Collections.Concurrent;
using System.Diagnostics;
using System.Diagnostics.Metrics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Metrics service for the Notifier module.
/// Provides counters, histograms, and gauges for observability.
/// </summary>
/// <remarks>
/// <c>Record*</c> members report a single occurrence, <c>Set*</c> members overwrite the
/// current gauge value for a tenant, and <c>GetSnapshot</c> returns aggregated values.
/// </remarks>
public interface INotifierMetrics
{
/// <summary>
/// Records a notification delivery attempt.
/// </summary>
/// <param name="tenantId">Tenant the delivery belongs to.</param>
/// <param name="channelType">Channel the delivery was attempted on.</param>
/// <param name="success">Whether the attempt succeeded.</param>
/// <param name="duration">How long the attempt took.</param>
void RecordDeliveryAttempt(string tenantId, string channelType, bool success, TimeSpan duration);
/// <summary>
/// Records an escalation event.
/// </summary>
void RecordEscalation(string tenantId, string policyId, int level, EscalationEventType eventType);
/// <summary>
/// Records escalation acknowledgment latency.
/// </summary>
void RecordAckLatency(string tenantId, string policyId, TimeSpan latency);
/// <summary>
/// Records a storm detection event.
/// </summary>
/// <param name="tenantId">Tenant the storm belongs to.</param>
/// <param name="eventKind">Kind of event the storm consists of.</param>
/// <param name="eventType">Storm lifecycle stage being reported.</param>
/// <param name="suppressedCount">
/// Number of suppressed notifications to add; the default implementation adds it to the
/// suppressed total for every <paramref name="eventType"/>.
/// </param>
void RecordStormEvent(string tenantId, string eventKind, StormEventType eventType, int suppressedCount);
/// <summary>
/// Records a fallback attempt.
/// </summary>
void RecordFallback(string tenantId, string fromChannel, string toChannel, bool success);
/// <summary>
/// Records a dead-letter event.
/// </summary>
/// <param name="tenantId">Tenant the delivery belongs to.</param>
/// <param name="reason">Reason label; the in-memory dead-letter handler passes the <c>DeadLetterReason</c> member name.</param>
/// <param name="channelType">Channel the delivery was targeted at.</param>
void RecordDeadLetter(string tenantId, string reason, string channelType);
/// <summary>
/// Records digest generation.
/// </summary>
void RecordDigestGeneration(string tenantId, string scheduleId, TimeSpan duration, int incidentCount);
/// <summary>
/// Records quiet hours/throttle suppression.
/// </summary>
void RecordSuppression(string tenantId, SuppressionType type, string eventKind);
/// <summary>
/// Records template rendering.
/// </summary>
void RecordTemplateRender(string tenantId, string templateId, TimeSpan duration, bool success);
/// <summary>
/// Records incident lifecycle event.
/// </summary>
void RecordIncidentEvent(string tenantId, IncidentEventType eventType);
/// <summary>
/// Updates active escalations gauge.
/// </summary>
void SetActiveEscalations(string tenantId, int count);
/// <summary>
/// Updates active storms gauge.
/// </summary>
void SetActiveStorms(string tenantId, int count);
/// <summary>
/// Updates pending deliveries gauge.
/// </summary>
void SetPendingDeliveries(string tenantId, int count);
/// <summary>
/// Gets current metrics snapshot.
/// </summary>
/// <param name="tenantId">Tenant to report on; <c>null</c> requests the aggregate over all tenants.</param>
/// <returns>A point-in-time snapshot of the collected metrics.</returns>
NotifierMetricsSnapshot GetSnapshot(string? tenantId = null);
}
/// <summary>
/// Type of escalation event.
/// </summary>
/// <remarks>
/// The default metrics implementation emits the lower-cased member name as the
/// <c>event_type</c> tag. In its snapshot aggregation every event increments the total
/// escalation count, and only <c>Acknowledged</c> and <c>Exhausted</c> have dedicated counters.
/// </remarks>
public enum EscalationEventType
{
Started,
LevelAdvanced,
Acknowledged,
Resolved,
Exhausted,
Timeout
}
/// <summary>
/// Type of storm event.
/// </summary>
/// <remarks>
/// The default metrics implementation emits the lower-cased member name as the
/// <c>event_type</c> tag; only <c>Detected</c> increments the snapshot's storm count.
/// </remarks>
public enum StormEventType
{
Detected,
Suppressed,
SummarySent,
Ended
}
/// <summary>
/// Type of suppression.
/// </summary>
/// <remarks>
/// The default metrics implementation emits the lower-cased member name as the
/// <c>suppression_type</c> tag. Only <c>QuietHours</c> and <c>Throttle</c> have snapshot
/// counters; <c>Maintenance</c> and <c>Override</c> are visible through the tagged counter only.
/// </remarks>
public enum SuppressionType
{
QuietHours,
Throttle,
Maintenance,
Override
}
/// <summary>
/// Type of incident event.
/// </summary>
/// <remarks>
/// The default metrics implementation emits the lower-cased member name as the
/// <c>event_type</c> tag; only <c>Created</c> and <c>Resolved</c> have snapshot counters.
/// </remarks>
public enum IncidentEventType
{
Created,
Updated,
Acknowledged,
Resolved,
Reopened
}
/// <summary>
/// Snapshot of current metrics.
/// </summary>
/// <remarks>
/// As built by the default metrics implementation: <c>TenantId</c> is set for a
/// single-tenant snapshot and left null for the all-tenant aggregate; latency values are in
/// milliseconds; percentiles use the nearest-rank method over the recorded samples; and the
/// "active"/"pending" values come from the most recent <c>Set*</c> gauge calls.
/// </remarks>
public sealed record NotifierMetricsSnapshot
{
// Time at which the snapshot was taken.
public required DateTimeOffset Timestamp { get; init; }
public string? TenantId { get; init; }
// Delivery metrics
public long TotalDeliveryAttempts { get; init; }
public long SuccessfulDeliveries { get; init; }
public long FailedDeliveries { get; init; }
public double AverageDeliveryLatencyMs { get; init; }
public double P95DeliveryLatencyMs { get; init; }
public double P99DeliveryLatencyMs { get; init; }
// Escalation metrics
public long TotalEscalations { get; init; }
public long EscalationsAcknowledged { get; init; }
public long EscalationsExhausted { get; init; }
public double AverageAckLatencyMs { get; init; }
public int ActiveEscalations { get; init; }
// Storm metrics
public long StormsDetected { get; init; }
public long NotificationsSuppressed { get; init; }
public int ActiveStorms { get; init; }
// Fallback metrics
public long FallbackAttempts { get; init; }
public long FallbackSuccesses { get; init; }
// Dead-letter metrics
public long DeadLetterCount { get; init; }
// Incident metrics
public long IncidentsCreated { get; init; }
public long IncidentsResolved { get; init; }
// Suppression metrics
public long QuietHoursSuppressions { get; init; }
public long ThrottleSuppressions { get; init; }
// Queue metrics
public int PendingDeliveries { get; init; }
// Channel breakdown (keyed by channel type)
public IReadOnlyDictionary<string, ChannelMetrics> ChannelMetrics { get; init; } = new Dictionary<string, ChannelMetrics>();
}
/// <summary>
/// Delivery counters and latency for one notification channel.
/// </summary>
public sealed record ChannelMetrics
{
    /// <summary>Channel these numbers belong to.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Number of delivery attempts.</summary>
    public long Attempts { get; init; }

    /// <summary>Number of successful attempts.</summary>
    public long Successes { get; init; }

    /// <summary>Number of failed attempts.</summary>
    public long Failures { get; init; }

    /// <summary>Mean delivery latency in milliseconds.</summary>
    public double AverageLatencyMs { get; init; }

    /// <summary>
    /// Share of successful attempts as a percentage (0-100); 0 when there are no attempts.
    /// </summary>
    public double SuccessRate
    {
        get
        {
            // Guard first so the division below never sees a zero (or negative) denominator.
            if (Attempts <= 0)
            {
                return 0;
            }

            return (double)Successes / Attempts * 100;
        }
    }
}
/// <summary>
/// Options for metrics service.
/// </summary>
/// <remarks>
/// NOTE(review): within this file the default metrics implementation reads only
/// <c>Enabled</c> and <c>IncludeTenantBreakdown</c>. <c>LatencyBuckets</c>,
/// <c>RetentionPeriod</c> and <c>MaxTrackedTenants</c> are not consulted there; confirm
/// whether another component applies them.
/// </remarks>
public sealed class NotifierMetricsOptions
{
// Configuration section name for these options.
public const string SectionName = "Notifier:Observability:Metrics";
/// <summary>
/// Whether metrics collection is enabled.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
/// Histogram bucket boundaries for latency (in milliseconds).
/// </summary>
public double[] LatencyBuckets { get; set; } = [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000];
/// <summary>
/// How long to retain detailed metrics.
/// </summary>
public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromHours(24);
/// <summary>
/// Whether to include tenant-level breakdown.
/// </summary>
/// <remarks>
/// When false, the default implementation skips its in-memory per-tenant aggregation, so
/// snapshot counters stay at zero while the instrument counters are still emitted.
/// </remarks>
public bool IncludeTenantBreakdown { get; set; } = true;
/// <summary>
/// Maximum number of tenants to track individually.
/// </summary>
public int MaxTrackedTenants { get; set; } = 1000;
}
/// <summary>
/// Default implementation of notifier metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Each <c>Record*</c> call does two things: it emits to a <see cref="Meter"/> instrument
/// (tagged with the tenant and operation details) and, when
/// <see cref="NotifierMetricsOptions.IncludeTenantBreakdown"/> is set, it updates an
/// in-memory per-tenant aggregate that backs <see cref="GetSnapshot"/>.
/// Per-tenant aggregates are mutated and read only while holding a lock on the aggregate
/// instance. Latency samples are appended and never trimmed by this class, so memory use
/// grows with the number of recorded deliveries and acknowledgments.
/// The <c>Set*</c> gauge methods are not gated by <see cref="NotifierMetricsOptions.Enabled"/>.
/// </remarks>
public sealed class DefaultNotifierMetrics : INotifierMetrics, IDisposable
{
    private readonly Meter _meter;
    private readonly NotifierMetricsOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<DefaultNotifierMetrics> _logger;

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _deliverySuccesses;
    private readonly Counter<long> _deliveryFailures;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _fallbackAttempts;
    private readonly Counter<long> _deadLetters;
    private readonly Counter<long> _suppressions;
    private readonly Counter<long> _incidentEvents;
    private readonly Counter<long> _templateRenders;

    // Histograms
    private readonly Histogram<double> _deliveryLatency;
    private readonly Histogram<double> _ackLatency;
    private readonly Histogram<double> _digestDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (tracked via observable gauges), keyed by tenant id
    private readonly ConcurrentDictionary<string, int> _activeEscalations = new();
    private readonly ConcurrentDictionary<string, int> _activeStorms = new();
    private readonly ConcurrentDictionary<string, int> _pendingDeliveries = new();

    // In-memory aggregation for snapshots, keyed by tenant id
    private readonly ConcurrentDictionary<string, TenantMetricsData> _tenantMetrics = new();

    /// <summary>
    /// Creates the metrics service and registers all instruments on a new meter named
    /// "StellaOps.Notifier".
    /// </summary>
    /// <param name="options">Metrics options; defaults are used when null.</param>
    /// <param name="timeProvider">Clock source; <see cref="TimeProvider.System"/> is used when null.</param>
    /// <param name="logger">Logger (required).</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public DefaultNotifierMetrics(
        IOptions<NotifierMetricsOptions> options,
        TimeProvider timeProvider,
        ILogger<DefaultNotifierMetrics> logger)
    {
        _options = options?.Value ?? new NotifierMetricsOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _meter = new Meter("StellaOps.Notifier", "1.0.0");

        // Initialize counters
        _deliveryAttempts = _meter.CreateCounter<long>(
            "notifier.delivery.attempts",
            "attempts",
            "Total number of delivery attempts");
        _deliverySuccesses = _meter.CreateCounter<long>(
            "notifier.delivery.successes",
            "deliveries",
            "Number of successful deliveries");
        _deliveryFailures = _meter.CreateCounter<long>(
            "notifier.delivery.failures",
            "deliveries",
            "Number of failed deliveries");
        _escalationEvents = _meter.CreateCounter<long>(
            "notifier.escalation.events",
            "events",
            "Number of escalation events");
        _stormEvents = _meter.CreateCounter<long>(
            "notifier.storm.events",
            "events",
            "Number of storm-related events");
        _fallbackAttempts = _meter.CreateCounter<long>(
            "notifier.fallback.attempts",
            "attempts",
            "Number of fallback attempts");
        _deadLetters = _meter.CreateCounter<long>(
            "notifier.deadletter.count",
            "messages",
            "Number of dead-lettered messages");
        _suppressions = _meter.CreateCounter<long>(
            "notifier.suppression.count",
            "suppressions",
            "Number of suppressed notifications");
        _incidentEvents = _meter.CreateCounter<long>(
            "notifier.incident.events",
            "events",
            "Number of incident lifecycle events");
        _templateRenders = _meter.CreateCounter<long>(
            "notifier.template.renders",
            "renders",
            "Number of template render operations");

        // Initialize histograms
        _deliveryLatency = _meter.CreateHistogram<double>(
            "notifier.delivery.latency",
            "ms",
            "Delivery latency in milliseconds");
        _ackLatency = _meter.CreateHistogram<double>(
            "notifier.escalation.ack_latency",
            "ms",
            "Acknowledgment latency in milliseconds");
        _digestDuration = _meter.CreateHistogram<double>(
            "notifier.digest.duration",
            "ms",
            "Digest generation duration in milliseconds");
        _templateRenderDuration = _meter.CreateHistogram<double>(
            "notifier.template.render_duration",
            "ms",
            "Template render duration in milliseconds");

        // Initialize observable gauges; each reports one measurement per tenant.
        _meter.CreateObservableGauge(
            "notifier.escalations.active",
            () => GetObservableGaugeValues(_activeEscalations),
            "escalations",
            "Number of active escalations");
        _meter.CreateObservableGauge(
            "notifier.storms.active",
            () => GetObservableGaugeValues(_activeStorms),
            "storms",
            "Number of active notification storms");
        _meter.CreateObservableGauge(
            "notifier.deliveries.pending",
            () => GetObservableGaugeValues(_pendingDeliveries),
            "deliveries",
            "Number of pending deliveries");
    }

    /// <summary>
    /// Records one delivery attempt: attempt counter, latency histogram, success or failure
    /// counter, and the per-tenant / per-channel snapshot aggregates.
    /// </summary>
    public void RecordDeliveryAttempt(string tenantId, string channelType, bool success, TimeSpan duration)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "channel_type", channelType },
            { "success", success.ToString().ToLowerInvariant() }
        };
        _deliveryAttempts.Add(1, tags);
        _deliveryLatency.Record(duration.TotalMilliseconds, tags);
        if (success)
        {
            _deliverySuccesses.Add(1, tags);
        }
        else
        {
            _deliveryFailures.Add(1, tags);
        }

        // Update in-memory aggregation
        UpdateTenantMetrics(tenantId, m =>
        {
            m.TotalDeliveryAttempts++;
            if (success) m.SuccessfulDeliveries++;
            else m.FailedDeliveries++;
            m.DeliveryLatencies.Add(duration.TotalMilliseconds);

            if (!m.ChannelMetrics.TryGetValue(channelType, out var cm))
            {
                cm = new MutableChannelMetrics { ChannelType = channelType };
                m.ChannelMetrics[channelType] = cm;
            }

            cm.Attempts++;
            if (success) cm.Successes++;
            else cm.Failures++;
            cm.Latencies.Add(duration.TotalMilliseconds);
        });
    }

    /// <summary>
    /// Records an escalation event. Every event increments the tenant's escalation total;
    /// acknowledged and exhausted events additionally increment their own counters.
    /// </summary>
    public void RecordEscalation(string tenantId, string policyId, int level, EscalationEventType eventType)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "policy_id", policyId },
            { "level", level.ToString() },
            { "event_type", eventType.ToString().ToLowerInvariant() }
        };
        _escalationEvents.Add(1, tags);

        UpdateTenantMetrics(tenantId, m =>
        {
            m.TotalEscalations++;
            if (eventType == EscalationEventType.Acknowledged)
                m.EscalationsAcknowledged++;
            else if (eventType == EscalationEventType.Exhausted)
                m.EscalationsExhausted++;
        });
    }

    /// <summary>
    /// Records how long an escalation took to be acknowledged.
    /// </summary>
    public void RecordAckLatency(string tenantId, string policyId, TimeSpan latency)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "policy_id", policyId }
        };
        _ackLatency.Record(latency.TotalMilliseconds, tags);

        UpdateTenantMetrics(tenantId, m =>
        {
            m.AckLatencies.Add(latency.TotalMilliseconds);
        });
    }

    /// <summary>
    /// Records a storm event. <paramref name="suppressedCount"/> is added to the tenant's
    /// suppressed total for every event type; only detections bump the storm count.
    /// </summary>
    public void RecordStormEvent(string tenantId, string eventKind, StormEventType eventType, int suppressedCount)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "event_kind", eventKind },
            { "event_type", eventType.ToString().ToLowerInvariant() }
        };
        _stormEvents.Add(1, tags);

        UpdateTenantMetrics(tenantId, m =>
        {
            if (eventType == StormEventType.Detected)
                m.StormsDetected++;
            m.NotificationsSuppressed += suppressedCount;
        });
    }

    /// <summary>
    /// Records a channel fallback attempt and whether it succeeded.
    /// </summary>
    public void RecordFallback(string tenantId, string fromChannel, string toChannel, bool success)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "from_channel", fromChannel },
            { "to_channel", toChannel },
            { "success", success.ToString().ToLowerInvariant() }
        };
        _fallbackAttempts.Add(1, tags);

        UpdateTenantMetrics(tenantId, m =>
        {
            m.FallbackAttempts++;
            if (success) m.FallbackSuccesses++;
        });
    }

    /// <summary>
    /// Records a dead-lettered message.
    /// </summary>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "reason", reason },
            { "channel_type", channelType }
        };
        _deadLetters.Add(1, tags);

        UpdateTenantMetrics(tenantId, m => m.DeadLetterCount++);
    }

    /// <summary>
    /// Records digest generation duration. <paramref name="incidentCount"/> is currently not
    /// recorded anywhere, and digests are not part of the snapshot.
    /// </summary>
    public void RecordDigestGeneration(string tenantId, string scheduleId, TimeSpan duration, int incidentCount)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "schedule_id", scheduleId }
        };
        _digestDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Records a suppressed notification. Only quiet-hours and throttle suppressions are
    /// tracked in the snapshot; other types are visible through the tagged counter only.
    /// </summary>
    public void RecordSuppression(string tenantId, SuppressionType type, string eventKind)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "suppression_type", type.ToString().ToLowerInvariant() },
            { "event_kind", eventKind }
        };
        _suppressions.Add(1, tags);

        UpdateTenantMetrics(tenantId, m =>
        {
            switch (type)
            {
                case SuppressionType.QuietHours:
                    m.QuietHoursSuppressions++;
                    break;
                case SuppressionType.Throttle:
                    m.ThrottleSuppressions++;
                    break;
            }
        });
    }

    /// <summary>
    /// Records a template render (count and duration). Not part of the snapshot.
    /// </summary>
    public void RecordTemplateRender(string tenantId, string templateId, TimeSpan duration, bool success)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "template_id", templateId },
            { "success", success.ToString().ToLowerInvariant() }
        };
        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Records an incident lifecycle event. Only created and resolved events are tracked in
    /// the snapshot.
    /// </summary>
    public void RecordIncidentEvent(string tenantId, IncidentEventType eventType)
    {
        if (!_options.Enabled) return;

        var tags = new TagList
        {
            { "tenant_id", tenantId },
            { "event_type", eventType.ToString().ToLowerInvariant() }
        };
        _incidentEvents.Add(1, tags);

        UpdateTenantMetrics(tenantId, m =>
        {
            switch (eventType)
            {
                case IncidentEventType.Created:
                    m.IncidentsCreated++;
                    break;
                case IncidentEventType.Resolved:
                    m.IncidentsResolved++;
                    break;
            }
        });
    }

    /// <summary>Overwrites the active-escalations gauge value for a tenant.</summary>
    public void SetActiveEscalations(string tenantId, int count)
    {
        _activeEscalations[tenantId] = count;
    }

    /// <summary>Overwrites the active-storms gauge value for a tenant.</summary>
    public void SetActiveStorms(string tenantId, int count)
    {
        _activeStorms[tenantId] = count;
    }

    /// <summary>Overwrites the pending-deliveries gauge value for a tenant.</summary>
    public void SetPendingDeliveries(string tenantId, int count)
    {
        _pendingDeliveries[tenantId] = count;
    }

    /// <summary>
    /// Returns a snapshot for one tenant, or the aggregate over all tracked tenants when
    /// <paramref name="tenantId"/> is null.
    /// </summary>
    public NotifierMetricsSnapshot GetSnapshot(string? tenantId = null)
    {
        if (tenantId is not null)
        {
            return GetTenantSnapshot(tenantId);
        }

        // Aggregate all tenants. Work on copies taken under each tenant's lock so the
        // latency lists and channel dictionaries cannot change while they are enumerated.
        var allMetrics = _tenantMetrics.Values.Select(CopyUnderLock).ToList();
        var deliveryLatencies = allMetrics.SelectMany(m => m.DeliveryLatencies).ToList();
        var ackLatencies = allMetrics.SelectMany(m => m.AckLatencies).ToList();

        return new NotifierMetricsSnapshot
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TotalDeliveryAttempts = allMetrics.Sum(m => m.TotalDeliveryAttempts),
            SuccessfulDeliveries = allMetrics.Sum(m => m.SuccessfulDeliveries),
            FailedDeliveries = allMetrics.Sum(m => m.FailedDeliveries),
            AverageDeliveryLatencyMs = deliveryLatencies.Count > 0 ? deliveryLatencies.Average() : 0,
            P95DeliveryLatencyMs = CalculatePercentile(deliveryLatencies, 95),
            P99DeliveryLatencyMs = CalculatePercentile(deliveryLatencies, 99),
            TotalEscalations = allMetrics.Sum(m => m.TotalEscalations),
            EscalationsAcknowledged = allMetrics.Sum(m => m.EscalationsAcknowledged),
            EscalationsExhausted = allMetrics.Sum(m => m.EscalationsExhausted),
            AverageAckLatencyMs = ackLatencies.Count > 0 ? ackLatencies.Average() : 0,
            ActiveEscalations = _activeEscalations.Values.Sum(),
            StormsDetected = allMetrics.Sum(m => m.StormsDetected),
            NotificationsSuppressed = allMetrics.Sum(m => m.NotificationsSuppressed),
            ActiveStorms = _activeStorms.Values.Sum(),
            FallbackAttempts = allMetrics.Sum(m => m.FallbackAttempts),
            FallbackSuccesses = allMetrics.Sum(m => m.FallbackSuccesses),
            DeadLetterCount = allMetrics.Sum(m => m.DeadLetterCount),
            IncidentsCreated = allMetrics.Sum(m => m.IncidentsCreated),
            IncidentsResolved = allMetrics.Sum(m => m.IncidentsResolved),
            QuietHoursSuppressions = allMetrics.Sum(m => m.QuietHoursSuppressions),
            ThrottleSuppressions = allMetrics.Sum(m => m.ThrottleSuppressions),
            PendingDeliveries = _pendingDeliveries.Values.Sum(),
            ChannelMetrics = AggregateChannels(allMetrics)
        };
    }

    // Builds the snapshot for a single tenant; returns an otherwise-empty snapshot when the
    // tenant has no recorded aggregate.
    private NotifierMetricsSnapshot GetTenantSnapshot(string tenantId)
    {
        if (!_tenantMetrics.TryGetValue(tenantId, out var live))
        {
            return new NotifierMetricsSnapshot
            {
                Timestamp = _timeProvider.GetUtcNow(),
                TenantId = tenantId
            };
        }

        var metrics = CopyUnderLock(live);
        var channelMetrics = metrics.ChannelMetrics.ToDictionary(
            kvp => kvp.Key,
            kvp => new ChannelMetrics
            {
                ChannelType = kvp.Key,
                Attempts = kvp.Value.Attempts,
                Successes = kvp.Value.Successes,
                Failures = kvp.Value.Failures,
                AverageLatencyMs = kvp.Value.Latencies.Count > 0 ? kvp.Value.Latencies.Average() : 0
            });

        return new NotifierMetricsSnapshot
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalDeliveryAttempts = metrics.TotalDeliveryAttempts,
            SuccessfulDeliveries = metrics.SuccessfulDeliveries,
            FailedDeliveries = metrics.FailedDeliveries,
            AverageDeliveryLatencyMs = metrics.DeliveryLatencies.Count > 0 ? metrics.DeliveryLatencies.Average() : 0,
            P95DeliveryLatencyMs = CalculatePercentile(metrics.DeliveryLatencies, 95),
            P99DeliveryLatencyMs = CalculatePercentile(metrics.DeliveryLatencies, 99),
            TotalEscalations = metrics.TotalEscalations,
            EscalationsAcknowledged = metrics.EscalationsAcknowledged,
            EscalationsExhausted = metrics.EscalationsExhausted,
            AverageAckLatencyMs = metrics.AckLatencies.Count > 0 ? metrics.AckLatencies.Average() : 0,
            ActiveEscalations = _activeEscalations.GetValueOrDefault(tenantId, 0),
            StormsDetected = metrics.StormsDetected,
            NotificationsSuppressed = metrics.NotificationsSuppressed,
            ActiveStorms = _activeStorms.GetValueOrDefault(tenantId, 0),
            FallbackAttempts = metrics.FallbackAttempts,
            FallbackSuccesses = metrics.FallbackSuccesses,
            DeadLetterCount = metrics.DeadLetterCount,
            IncidentsCreated = metrics.IncidentsCreated,
            IncidentsResolved = metrics.IncidentsResolved,
            QuietHoursSuppressions = metrics.QuietHoursSuppressions,
            ThrottleSuppressions = metrics.ThrottleSuppressions,
            PendingDeliveries = _pendingDeliveries.GetValueOrDefault(tenantId, 0),
            ChannelMetrics = channelMetrics
        };
    }

    // Merges per-tenant channel aggregates into one entry per channel. The average latency
    // is computed from the summed samples of ALL tenants (sum / count), not from the last
    // tenant seen, so it stays consistent with the summed attempt counters.
    private static Dictionary<string, ChannelMetrics> AggregateChannels(IEnumerable<TenantMetricsData> tenants)
    {
        var totals = new Dictionary<string, (long Attempts, long Successes, long Failures, double LatencySum, long LatencyCount)>();
        foreach (var tenant in tenants)
        {
            foreach (var (channel, cm) in tenant.ChannelMetrics)
            {
                // A missing key leaves 'running' at all zeros, which is the correct seed.
                totals.TryGetValue(channel, out var running);
                totals[channel] = (
                    running.Attempts + cm.Attempts,
                    running.Successes + cm.Successes,
                    running.Failures + cm.Failures,
                    running.LatencySum + cm.Latencies.Sum(),
                    running.LatencyCount + cm.Latencies.Count);
            }
        }

        return totals.ToDictionary(
            kvp => kvp.Key,
            kvp => new ChannelMetrics
            {
                ChannelType = kvp.Key,
                Attempts = kvp.Value.Attempts,
                Successes = kvp.Value.Successes,
                Failures = kvp.Value.Failures,
                AverageLatencyMs = kvp.Value.LatencyCount > 0 ? kvp.Value.LatencySum / kvp.Value.LatencyCount : 0
            });
    }

    // Applies an update to the tenant's aggregate while holding that aggregate's lock.
    // Skipped entirely when tenant breakdown is disabled.
    private void UpdateTenantMetrics(string tenantId, Action<TenantMetricsData> update)
    {
        if (!_options.IncludeTenantBreakdown) return;

        var metrics = _tenantMetrics.GetOrAdd(tenantId, _ => new TenantMetricsData());
        lock (metrics)
        {
            update(metrics);
        }
    }

    // Returns a deep copy of a tenant aggregate, taken under the same lock writers use.
    private static TenantMetricsData CopyUnderLock(TenantMetricsData source)
    {
        lock (source)
        {
            return source.Clone();
        }
    }

    // Produces one gauge measurement per tenant, tagged with the tenant id.
    private static IEnumerable<Measurement<int>> GetObservableGaugeValues(ConcurrentDictionary<string, int> values)
    {
        foreach (var (tenantId, count) in values)
        {
            yield return new Measurement<int>(count, new TagList { { "tenant_id", tenantId } });
        }
    }

    // Nearest-rank percentile: the value at index ceil(p/100 * n) - 1 of the sorted samples,
    // clamped to the valid range. Returns 0 for an empty sample set.
    private static double CalculatePercentile(List<double> values, int percentile)
    {
        if (values.Count == 0) return 0;

        var sorted = values.OrderBy(v => v).ToList();
        var index = (int)Math.Ceiling(percentile / 100.0 * sorted.Count) - 1;
        return sorted[Math.Max(0, Math.Min(index, sorted.Count - 1))];
    }

    /// <summary>Disposes the underlying meter and its instruments.</summary>
    public void Dispose()
    {
        _meter.Dispose();
    }

    // Mutable per-tenant aggregate. All access must hold a lock on the instance.
    private sealed class TenantMetricsData
    {
        public long TotalDeliveryAttempts;
        public long SuccessfulDeliveries;
        public long FailedDeliveries;
        public List<double> DeliveryLatencies = [];
        public long TotalEscalations;
        public long EscalationsAcknowledged;
        public long EscalationsExhausted;
        public List<double> AckLatencies = [];
        public long StormsDetected;
        public long NotificationsSuppressed;
        public long FallbackAttempts;
        public long FallbackSuccesses;
        public long DeadLetterCount;
        public long IncidentsCreated;
        public long IncidentsResolved;
        public long QuietHoursSuppressions;
        public long ThrottleSuppressions;
        public Dictionary<string, MutableChannelMetrics> ChannelMetrics = [];

        // Deep copy (lists and channel entries are duplicated); call while holding the lock.
        public TenantMetricsData Clone() => new()
        {
            TotalDeliveryAttempts = TotalDeliveryAttempts,
            SuccessfulDeliveries = SuccessfulDeliveries,
            FailedDeliveries = FailedDeliveries,
            DeliveryLatencies = new List<double>(DeliveryLatencies),
            TotalEscalations = TotalEscalations,
            EscalationsAcknowledged = EscalationsAcknowledged,
            EscalationsExhausted = EscalationsExhausted,
            AckLatencies = new List<double>(AckLatencies),
            StormsDetected = StormsDetected,
            NotificationsSuppressed = NotificationsSuppressed,
            FallbackAttempts = FallbackAttempts,
            FallbackSuccesses = FallbackSuccesses,
            DeadLetterCount = DeadLetterCount,
            IncidentsCreated = IncidentsCreated,
            IncidentsResolved = IncidentsResolved,
            QuietHoursSuppressions = QuietHoursSuppressions,
            ThrottleSuppressions = ThrottleSuppressions,
            ChannelMetrics = ChannelMetrics.ToDictionary(kvp => kvp.Key, kvp => kvp.Value.Clone())
        };
    }

    // Mutable per-channel aggregate owned by a TenantMetricsData instance.
    private sealed class MutableChannelMetrics
    {
        public required string ChannelType { get; init; }
        public long Attempts;
        public long Successes;
        public long Failures;
        public List<double> Latencies = [];

        // Deep copy; call while holding the owning tenant aggregate's lock.
        public MutableChannelMetrics Clone() => new()
        {
            ChannelType = ChannelType,
            Attempts = Attempts,
            Successes = Successes,
            Failures = Failures,
            Latencies = new List<double>(Latencies)
        };
    }
}

View File

@@ -0,0 +1,395 @@
using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Tracing service for the Notifier module.
/// Provides distributed tracing capabilities using OpenTelemetry-compatible Activity API.
/// </summary>
/// <remarks>
/// Every <c>Start*Span</c> member may return <c>null</c>; the default implementation does so
/// when tracing is disabled or when the activity source yields no activity. The helper
/// members accept a nullable activity so callers can pass the result straight through.
/// </remarks>
public interface INotifierTracing
{
/// <summary>
/// Starts a delivery span.
/// </summary>
/// <returns>The started activity, or <c>null</c> when no span was created.</returns>
Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Starts an escalation span.
/// </summary>
/// <returns>The started activity, or <c>null</c> when no span was created.</returns>
Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId);
/// <summary>
/// Starts a digest generation span.
/// </summary>
Activity? StartDigestSpan(string tenantId, string scheduleId);
/// <summary>
/// Starts a template render span.
/// </summary>
Activity? StartTemplateRenderSpan(string tenantId, string templateId);
/// <summary>
/// Starts a correlation span.
/// </summary>
Activity? StartCorrelationSpan(string tenantId, string eventKind);
/// <summary>
/// Starts a webhook validation span.
/// </summary>
Activity? StartWebhookValidationSpan(string tenantId, string channelId);
/// <summary>
/// Adds an event to the current span.
/// </summary>
/// <param name="activity">Span to annotate; may be <c>null</c>.</param>
/// <param name="name">Event name.</param>
/// <param name="attributes">Optional event attributes.</param>
void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null);
/// <summary>
/// Sets span status to error.
/// </summary>
/// <param name="activity">Span to mark; may be <c>null</c>.</param>
/// <param name="exception">Optional exception associated with the failure.</param>
/// <param name="description">Optional status description.</param>
void SetError(Activity? activity, Exception? exception = null, string? description = null);
/// <summary>
/// Sets span status to ok.
/// </summary>
void SetOk(Activity? activity);
/// <summary>
/// Adds custom tags to a span.
/// </summary>
void AddTags(Activity? activity, IDictionary<string, object?> tags);
/// <summary>
/// Creates a linked span (for batch operations).
/// </summary>
/// <param name="operationName">Name of the new span.</param>
/// <param name="parentContext">Context of the span the new one relates to.</param>
/// <param name="tags">Optional tags applied to the new span.</param>
Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null);
}
/// <summary>
/// Options for tracing service.
/// </summary>
public sealed class NotifierTracingOptions
{
// Configuration section name for these options.
public const string SectionName = "Notifier:Observability:Tracing";
/// <summary>
/// Whether tracing is enabled.
/// </summary>
/// <remarks>
/// When false, the default tracing implementation returns <c>null</c> from its
/// span-starting methods instead of creating an activity.
/// </remarks>
public bool Enabled { get; set; } = true;
/// <summary>
/// Activity source name.
/// </summary>
/// <remarks>
/// Passed to the <c>ActivitySource</c> constructor by the default tracing implementation.
/// </remarks>
public string SourceName { get; set; } = "StellaOps.Notifier";
/// <summary>
/// Whether to include sensitive data in traces.
/// </summary>
public bool IncludeSensitiveData { get; set; }
/// <summary>
/// Sampling ratio (0.0 to 1.0).
/// </summary>
// NOTE(review): no range validation is performed in this class; confirm where the ratio is enforced.
public double SamplingRatio { get; set; } = 1.0;
/// <summary>
/// Maximum number of attributes per span.
/// </summary>
public int MaxAttributesPerSpan { get; set; } = 128;
/// <summary>
/// Maximum number of events per span.
/// </summary>
public int MaxEventsPerSpan { get; set; } = 128;
}
/// <summary>
/// Default implementation of notifier tracing, backed by a dedicated <see cref="ActivitySource"/>.
/// </summary>
/// <remarks>
/// All <c>Start*</c> methods return <c>null</c> when <see cref="NotifierTracingOptions.Enabled"/> is
/// <c>false</c> or when the activity source does not create an activity.
/// The annotation helpers are no-ops when handed a <c>null</c> activity.
/// </remarks>
public sealed class DefaultNotifierTracing : INotifierTracing, IDisposable
{
    private readonly ActivitySource _activitySource;
    private readonly NotifierTracingOptions _options;
    private readonly ILogger<DefaultNotifierTracing> _logger;

    /// <summary>
    /// Creates the tracing service and its activity source.
    /// </summary>
    /// <param name="options">Tracing options; default options are used when <c>null</c>.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultNotifierTracing(
        IOptions<NotifierTracingOptions> options,
        ILogger<DefaultNotifierTracing> logger)
    {
        _options = options?.Value ?? new NotifierTracingOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _activitySource = new ActivitySource(_options.SourceName, "1.0.0");
    }

    /// <inheritdoc />
    public Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType) =>
        StartSpan(
            "notifier.delivery",
            ("tenant.id", tenantId),
            ("delivery.id", deliveryId),
            ("channel.type", channelType));

    /// <inheritdoc />
    public Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId) =>
        StartSpan(
            "notifier.escalation",
            ("tenant.id", tenantId),
            ("incident.id", incidentId),
            ("policy.id", policyId));

    /// <inheritdoc />
    public Activity? StartDigestSpan(string tenantId, string scheduleId) =>
        StartSpan(
            "notifier.digest",
            ("tenant.id", tenantId),
            ("schedule.id", scheduleId));

    /// <inheritdoc />
    public Activity? StartTemplateRenderSpan(string tenantId, string templateId) =>
        StartSpan(
            "notifier.template.render",
            ("tenant.id", tenantId),
            ("template.id", templateId));

    /// <inheritdoc />
    public Activity? StartCorrelationSpan(string tenantId, string eventKind) =>
        StartSpan(
            "notifier.correlation",
            ("tenant.id", tenantId),
            ("event.kind", eventKind));

    /// <inheritdoc />
    public Activity? StartWebhookValidationSpan(string tenantId, string channelId) =>
        StartSpan(
            "notifier.webhook.validation",
            ("tenant.id", tenantId),
            ("channel.id", channelId));

    /// <inheritdoc />
    public void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null)
    {
        if (activity is null) return;

        var tags = new ActivityTagsCollection();
        if (attributes is not null)
        {
            foreach (var (key, value) in attributes)
            {
                // Null-valued attributes are dropped rather than recorded as empty tags.
                if (value is not null)
                {
                    tags.Add(key, value);
                }
            }
        }

        activity.AddEvent(new ActivityEvent(name, tags: tags));
    }

    /// <inheritdoc />
    public void SetError(Activity? activity, Exception? exception = null, string? description = null)
    {
        if (activity is null) return;

        // An explicit description wins; otherwise fall back to the exception message.
        activity.SetStatus(ActivityStatusCode.Error, description ?? exception?.Message);

        if (exception is null) return;

        activity.SetTag("exception.type", exception.GetType().FullName);
        activity.SetTag("exception.message", exception.Message);

        // The stack trace is only attached when sensitive data is allowed in traces.
        if (_options.IncludeSensitiveData)
        {
            activity.SetTag("exception.stacktrace", exception.StackTrace);
        }
    }

    /// <inheritdoc />
    public void SetOk(Activity? activity)
    {
        activity?.SetStatus(ActivityStatusCode.Ok);
    }

    /// <inheritdoc />
    public void AddTags(Activity? activity, IDictionary<string, object?> tags)
    {
        // A null tag set is tolerated, matching how AddEvent treats null attributes.
        if (activity is null || tags is null) return;

        foreach (var (key, value) in tags)
        {
            if (value is not null)
            {
                activity.SetTag(key, value);
            }
        }
    }

    /// <inheritdoc />
    public Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null)
    {
        if (!_options.Enabled) return null;

        // The supplied context is attached as a link; it is not passed as the parent context.
        var links = new[] { new ActivityLink(parentContext) };
        var activity = _activitySource.StartActivity(
            operationName,
            ActivityKind.Internal,
            parentContext: default,
            links: links);

        if (activity is not null && tags is not null)
        {
            AddTags(activity, tags);
        }

        return activity;
    }

    /// <summary>
    /// Disposes the underlying activity source.
    /// </summary>
    public void Dispose()
    {
        _activitySource.Dispose();
    }

    /// <summary>
    /// Starts an internal span named <paramref name="operationName"/> and sets the given tags on it.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when tracing is disabled or no activity was created.</returns>
    private Activity? StartSpan(string operationName, params (string Key, string Value)[] tags)
    {
        if (!_options.Enabled) return null;

        var activity = _activitySource.StartActivity(operationName, ActivityKind.Internal);
        if (activity is null) return null;

        foreach (var (key, value) in tags)
        {
            activity.SetTag(key, value);
        }

        return activity;
    }
}
/// <summary>
/// Extension methods that record Notifier-specific outcomes on an <see cref="Activity"/>.
/// Every method is a no-op when the activity is <c>null</c>.
/// </summary>
public static class ActivityExtensions
{
    /// <summary>
    /// Records a delivery result on the activity.
    /// </summary>
    /// <param name="activity">Activity to annotate; may be <c>null</c>.</param>
    /// <param name="success">Written to the <c>delivery.success</c> tag.</param>
    /// <param name="httpStatusCode">Written to <c>http.status_code</c> when provided.</param>
    /// <param name="error">Written to <c>delivery.error</c> when non-empty.</param>
    public static void RecordDeliveryResult(this Activity? activity, bool success, int? httpStatusCode = null, string? error = null)
    {
        if (activity is null) return;

        activity.SetTag("delivery.success", success);

        if (httpStatusCode.HasValue)
        {
            activity.SetTag("http.status_code", httpStatusCode.Value);
        }

        if (!string.IsNullOrEmpty(error))
        {
            activity.SetTag("delivery.error", error);
        }
    }

    /// <summary>
    /// Records an escalation level change as tags plus an <c>escalation.level.changed</c> event.
    /// </summary>
    /// <param name="activity">Activity to annotate; may be <c>null</c>.</param>
    /// <param name="level">New escalation level.</param>
    /// <param name="target">
    /// Optional escalation target. When null or empty it is left out of both the span tags
    /// and the event tags, so the event never carries a null-valued <c>target</c> entry.
    /// </param>
    public static void RecordEscalationLevel(this Activity? activity, int level, string? target = null)
    {
        if (activity is null) return;

        var eventTags = new ActivityTagsCollection { { "level", level } };

        activity.SetTag("escalation.level", level);

        if (!string.IsNullOrEmpty(target))
        {
            activity.SetTag("escalation.target", target);
            eventTags.Add("target", target);
        }

        activity.AddEvent(new ActivityEvent("escalation.level.changed", tags: eventTags));
    }

    /// <summary>
    /// Records storm detection as a <c>storm.detected</c> event.
    /// </summary>
    public static void RecordStormDetected(this Activity? activity, string eventKind, int eventCount)
    {
        if (activity is null) return;

        activity.AddEvent(new ActivityEvent("storm.detected", tags: new ActivityTagsCollection
        {
            { "event_kind", eventKind },
            { "event_count", eventCount }
        }));
    }

    /// <summary>
    /// Records a fallback attempt as a <c>fallback.attempted</c> event.
    /// </summary>
    public static void RecordFallback(this Activity? activity, string fromChannel, string toChannel)
    {
        if (activity is null) return;

        activity.AddEvent(new ActivityEvent("fallback.attempted", tags: new ActivityTagsCollection
        {
            { "from_channel", fromChannel },
            { "to_channel", toChannel }
        }));
    }

    /// <summary>
    /// Records template render details as tags.
    /// </summary>
    public static void RecordTemplateRender(this Activity? activity, string format, int outputLength)
    {
        if (activity is null) return;

        activity.SetTag("template.format", format);
        activity.SetTag("template.output_length", outputLength);
    }

    /// <summary>
    /// Records a correlation result as tags.
    /// </summary>
    public static void RecordCorrelationResult(this Activity? activity, string correlationKey, bool isNewIncident)
    {
        if (activity is null) return;

        activity.SetTag("correlation.key", correlationKey);
        activity.SetTag("correlation.new_incident", isNewIncident);
    }
}

View File

@@ -1,98 +1,98 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Interface for notification system metrics and tracing.
/// </summary>
public interface INotifyMetrics
{
/// <summary>
/// Records a notification delivery attempt together with its status and duration.
/// </summary>
void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration);
/// <summary>
/// Records an escalation event at the given level with its outcome.
/// </summary>
void RecordEscalation(string tenantId, int level, string outcome);
/// <summary>
/// Records a dead-letter entry with the reason and channel type.
/// </summary>
void RecordDeadLetter(string tenantId, string reason, string channelType);
/// <summary>
/// Records a rule evaluation, whether it matched, and how long it took.
/// </summary>
void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration);
/// <summary>
/// Records a template rendering, whether it succeeded, and how long it took.
/// </summary>
void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration);
/// <summary>
/// Records storm detection event.
/// </summary>
void RecordStormEvent(string tenantId, string stormKey, string decision);
/// <summary>
/// Records a retention cleanup and the number of deleted entities.
/// </summary>
void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);
/// <summary>
/// Records the current queue depth for a channel (the method returns nothing; it does not read the depth back).
/// </summary>
void RecordQueueDepth(string tenantId, string channelType, int depth);
/// <summary>
/// Creates an activity for distributed tracing of a delivery.
/// </summary>
/// <returns>The created activity, or <c>null</c> when none was created.</returns>
Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Creates an activity for escalation tracing.
/// </summary>
/// <returns>The created activity, or <c>null</c> when none was created.</returns>
Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
/// <summary>
/// Metric tag names for consistency.
/// All values are snake_case string constants.
/// </summary>
public static class NotifyMetricTags
{
public const string TenantId = "tenant_id";
public const string ChannelType = "channel_type";
public const string Status = "status";
public const string Outcome = "outcome";
public const string Level = "level";
public const string Reason = "reason";
public const string RuleId = "rule_id";
public const string Matched = "matched";
public const string TemplateKey = "template_key";
public const string Success = "success";
public const string StormKey = "storm_key";
public const string Decision = "decision";
public const string EntityType = "entity_type";
}
/// <summary>
/// Metric names for the notification system.
/// Every name is dot-separated and starts with the <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
public const string DeliveryAttempts = "notify.delivery.attempts";
public const string DeliveryDuration = "notify.delivery.duration";
public const string EscalationEvents = "notify.escalation.events";
public const string DeadLetterEntries = "notify.deadletter.entries";
public const string RuleEvaluations = "notify.rule.evaluations";
public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";
public const string TemplateRenders = "notify.template.renders";
public const string TemplateRenderDuration = "notify.template.render.duration";
public const string StormEvents = "notify.storm.events";
public const string RetentionCleanups = "notify.retention.cleanups";
public const string QueueDepth = "notify.queue.depth";
}
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Interface for notification system metrics and tracing.
/// </summary>
public interface INotifyMetrics
{
/// <summary>
/// Records a notification delivery attempt together with its status and duration.
/// </summary>
void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration);
/// <summary>
/// Records an escalation event at the given level with its outcome.
/// </summary>
void RecordEscalation(string tenantId, int level, string outcome);
/// <summary>
/// Records a dead-letter entry with the reason and channel type.
/// </summary>
void RecordDeadLetter(string tenantId, string reason, string channelType);
/// <summary>
/// Records a rule evaluation, whether it matched, and how long it took.
/// </summary>
void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration);
/// <summary>
/// Records a template rendering, whether it succeeded, and how long it took.
/// </summary>
void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration);
/// <summary>
/// Records storm detection event.
/// </summary>
void RecordStormEvent(string tenantId, string stormKey, string decision);
/// <summary>
/// Records a retention cleanup and the number of deleted entities.
/// </summary>
void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);
/// <summary>
/// Records the current queue depth for a channel (the method returns nothing; it does not read the depth back).
/// </summary>
void RecordQueueDepth(string tenantId, string channelType, int depth);
/// <summary>
/// Creates an activity for distributed tracing of a delivery.
/// </summary>
/// <returns>The created activity, or <c>null</c> when none was created.</returns>
Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Creates an activity for escalation tracing.
/// </summary>
/// <returns>The created activity, or <c>null</c> when none was created.</returns>
Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
/// <summary>
/// Metric tag names for consistency.
/// All values are snake_case string constants.
/// </summary>
public static class NotifyMetricTags
{
public const string TenantId = "tenant_id";
public const string ChannelType = "channel_type";
public const string Status = "status";
public const string Outcome = "outcome";
public const string Level = "level";
public const string Reason = "reason";
public const string RuleId = "rule_id";
public const string Matched = "matched";
public const string TemplateKey = "template_key";
public const string Success = "success";
public const string StormKey = "storm_key";
public const string Decision = "decision";
public const string EntityType = "entity_type";
}
/// <summary>
/// Metric names for the notification system.
/// Every name is dot-separated and starts with the <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
public const string DeliveryAttempts = "notify.delivery.attempts";
public const string DeliveryDuration = "notify.delivery.duration";
public const string EscalationEvents = "notify.escalation.events";
public const string DeadLetterEntries = "notify.deadletter.entries";
public const string RuleEvaluations = "notify.rule.evaluations";
public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";
public const string TemplateRenders = "notify.template.renders";
public const string TemplateRenderDuration = "notify.template.render.duration";
public const string StormEvents = "notify.storm.events";
public const string RetentionCleanups = "notify.retention.cleanups";
public const string QueueDepth = "notify.queue.depth";
}

View File

@@ -0,0 +1,456 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Manages data retention policies for notifications and related data.
/// </summary>
public interface IRetentionPolicyService
{
/// <summary>
/// Gets all retention policies for a tenant.
/// </summary>
/// <returns>The tenant's policies.</returns>
Task<IReadOnlyList<RetentionPolicy>> GetPoliciesAsync(string tenantId, CancellationToken cancellationToken = default);
/// <summary>
/// Gets a specific retention policy.
/// </summary>
/// <returns>The matching policy, or <c>null</c> when none is found.</returns>
Task<RetentionPolicy?> GetPolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default);
/// <summary>
/// Creates or updates a retention policy.
/// </summary>
/// <returns>The policy as stored.</returns>
Task<RetentionPolicy> UpsertPolicyAsync(RetentionPolicy policy, CancellationToken cancellationToken = default);
/// <summary>
/// Deletes a retention policy.
/// </summary>
/// <returns><c>true</c> when a policy was removed; otherwise <c>false</c>.</returns>
Task<bool> DeletePolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default);
/// <summary>
/// Applies retention policies and purges old data.
/// </summary>
/// <param name="tenantId">Tenant to process, or <c>null</c> for no tenant restriction.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Aggregate and per-policy outcome of the run.</returns>
Task<RetentionResult> ApplyAsync(string? tenantId = null, CancellationToken cancellationToken = default);
/// <summary>
/// Gets retention statistics.
/// </summary>
/// <param name="tenantId">Tenant to report on, or <c>null</c> for no tenant restriction.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task<RetentionStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default);
/// <summary>
/// Previews what would be deleted by retention policies.
/// </summary>
Task<RetentionPreview> PreviewAsync(string tenantId, CancellationToken cancellationToken = default);
}
/// <summary>
/// A data retention policy.
/// </summary>
public sealed record RetentionPolicy
{
// Identifier of the policy; the in-memory service matches on it within a tenant's policy list.
public required string PolicyId { get; init; }
// Tenant that owns the policy.
public required string TenantId { get; init; }
// Display name; copied into per-policy results and preview items.
public required string Name { get; init; }
public string? Description { get; init; }
// Kind of data the policy governs.
public required RetentionDataType DataType { get; init; }
// How long data is retained.
public required TimeSpan RetentionPeriod { get; init; }
// Action taken when the retention period expires; defaults to Delete.
public RetentionAction Action { get; init; } = RetentionAction.Delete;
// NOTE(review): presumably the target used with RetentionAction.Archive; not read by the code in this file.
public string? ArchiveDestination { get; init; }
// Only enabled policies are applied and previewed by the in-memory service; defaults to true.
public bool Enabled { get; init; } = true;
// NOTE(review): ChannelTypes, EventKinds and MinimumCount look like optional filters; not read by the code in this file.
public IReadOnlyList<string>? ChannelTypes { get; init; }
public IReadOnlyList<string>? EventKinds { get; init; }
public int? MinimumCount { get; init; }
// CreatedAt and UpdatedAt are overwritten by the in-memory service on upsert.
public DateTimeOffset CreatedAt { get; init; }
public DateTimeOffset UpdatedAt { get; init; }
// Set by the in-memory service each time the policy is applied; null until then.
public DateTimeOffset? LastAppliedAt { get; init; }
}
/// <summary>
/// Type of data subject to retention.
/// Each <see cref="RetentionPolicy"/> targets exactly one of these values.
/// </summary>
public enum RetentionDataType
{
Deliveries,
DeadLetters,
Incidents,
AuditLogs,
Metrics,
Templates,
EscalationHistory,
DigestHistory,
InboxNotifications
}
/// <summary>
/// Action to take when retention period expires.
/// <see cref="Delete"/> is the first member and therefore the default value of the enum.
/// </summary>
public enum RetentionAction
{
Delete,
Archive,
Anonymize
}
/// <summary>
/// Result of applying retention policies.
/// </summary>
public sealed record RetentionResult
{
// When the result was produced.
public DateTimeOffset Timestamp { get; init; }
// Tenant the run was restricted to; null when no tenant was specified.
public string? TenantId { get; init; }
// Number of policies applied.
public int PoliciesApplied { get; init; }
// Affected-item totals, split by the action taken.
public int TotalDeleted { get; init; }
public int TotalArchived { get; init; }
public int TotalAnonymized { get; init; }
// Elapsed time of the run.
public TimeSpan Duration { get; init; }
// One entry per processed policy; defaults to an empty list.
public IReadOnlyList<RetentionPolicyResult> PolicyResults { get; init; } = [];
// Error messages collected during the run; defaults to an empty list.
public IReadOnlyList<string> Errors { get; init; } = [];
}
/// <summary>
/// Result of applying a single retention policy.
/// </summary>
public sealed record RetentionPolicyResult
{
public required string PolicyId { get; init; }
public required string PolicyName { get; init; }
public required RetentionDataType DataType { get; init; }
// Number of items the policy affected.
public int AffectedCount { get; init; }
public RetentionAction ActionTaken { get; init; }
// Whether the policy was applied without error.
public bool Success { get; init; }
// Error message when the policy failed; otherwise null.
public string? Error { get; init; }
}
/// <summary>
/// Statistics about retention.
/// </summary>
public sealed record RetentionStats
{
// When the statistics were produced.
public DateTimeOffset Timestamp { get; init; }
// Tenant the statistics are restricted to; null when no tenant was specified.
public string? TenantId { get; init; }
public int TotalPolicies { get; init; }
public int EnabledPolicies { get; init; }
public int DisabledPolicies { get; init; }
// NOTE(review): the all-time totals and NextScheduledRun are never populated by the in-memory service in this file.
public long TotalDeletedAllTime { get; init; }
public long TotalArchivedAllTime { get; init; }
public DateTimeOffset? LastRunAt { get; init; }
public DateTimeOffset? NextScheduledRun { get; init; }
// Per-data-type breakdown; defaults to an empty dictionary.
public IReadOnlyDictionary<RetentionDataType, DataTypeStats> ByDataType { get; init; } = new Dictionary<RetentionDataType, DataTypeStats>();
}
/// <summary>
/// Statistics for a specific data type.
/// </summary>
public sealed record DataTypeStats
{
// Data type these figures describe.
public required RetentionDataType DataType { get; init; }
public long CurrentCount { get; init; }
// Timestamp of the oldest record, when known.
public DateTimeOffset? OldestRecord { get; init; }
public long DeletedCount { get; init; }
public long ArchivedCount { get; init; }
}
/// <summary>
/// Preview of what retention would delete.
/// </summary>
public sealed record RetentionPreview
{
// When the preview was produced.
public DateTimeOffset Timestamp { get; init; }
public string? TenantId { get; init; }
// Affected-item totals, split by the action each policy would take.
public int TotalToDelete { get; init; }
public int TotalToArchive { get; init; }
public int TotalToAnonymize { get; init; }
// One entry per previewed policy; defaults to an empty list.
public IReadOnlyList<RetentionPreviewItem> Items { get; init; } = [];
}
/// <summary>
/// Preview item for a single policy.
/// </summary>
public sealed record RetentionPreviewItem
{
public required string PolicyId { get; init; }
public required string PolicyName { get; init; }
public required RetentionDataType DataType { get; init; }
// Number of items the policy would affect.
public int AffectedCount { get; init; }
// Action the policy would take.
public RetentionAction Action { get; init; }
// NOTE(review): OldestAffected and NewestAffected are not populated by the in-memory service in this file.
public DateTimeOffset? OldestAffected { get; init; }
public DateTimeOffset? NewestAffected { get; init; }
}
/// <summary>
/// Options for retention service.
/// </summary>
public sealed class RetentionOptions
{
// Configuration section name these options are read from.
public const string SectionName = "Notifier:Observability:Retention";
// Master switch; the background runner exits immediately when false. Defaults to true.
public bool Enabled { get; set; } = true;
// NOTE(review): the three retention-period bounds below are not read by the code in this file - confirm where they are enforced.
public TimeSpan DefaultRetentionPeriod { get; set; } = TimeSpan.FromDays(90);
public TimeSpan MinimumRetentionPeriod { get; set; } = TimeSpan.FromDays(1);
public TimeSpan MaximumRetentionPeriod { get; set; } = TimeSpan.FromDays(365 * 7);
// Whether the background runner applies policies on a schedule; it exits immediately when false. Defaults to true.
public bool AutoRun { get; set; } = true;
// Upper bound on how long the runner waits before a run. Defaults to 24 hours.
public TimeSpan RunInterval { get; set; } = TimeSpan.FromHours(24);
// Offset from midnight that the runner adds to the current date to find the next run. Defaults to 03:00.
public TimeSpan RunTime { get; set; } = TimeSpan.FromHours(3);
// NOTE(review): BatchSize and DryRunByDefault are not read by the code in this file.
public int BatchSize { get; set; } = 1000;
public bool DryRunByDefault { get; set; }
}
/// <summary>
/// In-memory implementation of retention policy service.
/// </summary>
/// <remarks>
/// Policies are stored per tenant in a <see cref="List{T}"/>. Every read and write of such a list
/// is done while holding a lock on the list, and read paths work on snapshots, so an enumeration
/// is never invalidated by an update made concurrently or from inside the loop itself.
/// No data is actually purged: affected counts come from <c>SimulateRetention</c>.
/// </remarks>
public sealed class InMemoryRetentionPolicyService : IRetentionPolicyService
{
    private readonly ConcurrentDictionary<string, List<RetentionPolicy>> _policies = new();
    private readonly ConcurrentDictionary<string, RetentionStats> _stats = new();
    private readonly RetentionOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<InMemoryRetentionPolicyService> _logger;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="options">Retention options; defaults are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public InMemoryRetentionPolicyService(
        IOptions<RetentionOptions> options,
        TimeProvider timeProvider,
        ILogger<InMemoryRetentionPolicyService> logger)
    {
        _options = options?.Value ?? new RetentionOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public Task<IReadOnlyList<RetentionPolicy>> GetPoliciesAsync(string tenantId, CancellationToken cancellationToken = default)
    {
        return Task.FromResult<IReadOnlyList<RetentionPolicy>>(Snapshot(tenantId));
    }

    /// <inheritdoc />
    public Task<RetentionPolicy?> GetPolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default)
    {
        if (!_policies.TryGetValue(tenantId, out var policies))
        {
            return Task.FromResult<RetentionPolicy?>(null);
        }

        lock (policies)
        {
            return Task.FromResult(policies.FirstOrDefault(p => p.PolicyId == policyId));
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// <c>UpdatedAt</c> is always set to the current time. <c>CreatedAt</c> is set to the current
    /// time for a new policy and preserved from the stored entry for an existing one.
    /// </remarks>
    public Task<RetentionPolicy> UpsertPolicyAsync(RetentionPolicy policy, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(policy);

        var now = _timeProvider.GetUtcNow();
        var list = _policies.GetOrAdd(policy.TenantId, _ => []);

        lock (list)
        {
            var index = list.FindIndex(p => p.PolicyId == policy.PolicyId);
            var updated = policy with { UpdatedAt = now, CreatedAt = index < 0 ? now : list[index].CreatedAt };

            if (index >= 0)
            {
                list[index] = updated;
            }
            else
            {
                list.Add(updated);
            }

            _logger.LogInformation("Upserted retention policy {PolicyId} for tenant {TenantId}", policy.PolicyId, policy.TenantId);
            return Task.FromResult(updated);
        }
    }

    /// <inheritdoc />
    public Task<bool> DeletePolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default)
    {
        if (!_policies.TryGetValue(tenantId, out var policies))
        {
            return Task.FromResult(false);
        }

        lock (policies)
        {
            var removed = policies.RemoveAll(p => p.PolicyId == policyId) > 0;
            if (removed)
            {
                _logger.LogInformation("Deleted retention policy {PolicyId} for tenant {TenantId}", policyId, tenantId);
            }

            return Task.FromResult(removed);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// When <paramref name="tenantId"/> is <c>null</c>, every tenant that has policies is processed.
    /// Only enabled policies are applied. A failure in one policy is recorded in the result and
    /// does not stop the remaining policies.
    /// </remarks>
    public Task<RetentionResult> ApplyAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        var startTime = _timeProvider.GetUtcNow();
        var policyResults = new List<RetentionPolicyResult>();
        var errors = new List<string>();
        var totalDeleted = 0;
        var totalArchived = 0;
        var totalAnonymized = 0;

        List<string> tenantsToProcess = tenantId is not null ? [tenantId] : _policies.Keys.ToList();

        foreach (var tenant in tenantsToProcess)
        {
            if (!_policies.TryGetValue(tenant, out var policies)) continue;

            // Iterate a snapshot: LastAppliedAt is written back into the live list below, and
            // writing to a List<T> while it is being enumerated invalidates the enumerator.
            List<RetentionPolicy> enabledPolicies;
            lock (policies)
            {
                enabledPolicies = policies.Where(p => p.Enabled).ToList();
            }

            foreach (var policy in enabledPolicies)
            {
                try
                {
                    var affectedCount = SimulateRetention(policy);

                    policyResults.Add(new RetentionPolicyResult
                    {
                        PolicyId = policy.PolicyId,
                        PolicyName = policy.Name,
                        DataType = policy.DataType,
                        AffectedCount = affectedCount,
                        ActionTaken = policy.Action,
                        Success = true
                    });

                    switch (policy.Action)
                    {
                        case RetentionAction.Delete: totalDeleted += affectedCount; break;
                        case RetentionAction.Archive: totalArchived += affectedCount; break;
                        case RetentionAction.Anonymize: totalAnonymized += affectedCount; break;
                    }

                    // Stamp the entry currently stored (not the snapshot copy) so an upsert
                    // that landed in the meantime is not overwritten.
                    lock (policies)
                    {
                        var idx = policies.FindIndex(p => p.PolicyId == policy.PolicyId);
                        if (idx >= 0)
                        {
                            policies[idx] = policies[idx] with { LastAppliedAt = _timeProvider.GetUtcNow() };
                        }
                    }
                }
                catch (Exception ex)
                {
                    errors.Add($"Policy {policy.PolicyId}: {ex.Message}");
                    policyResults.Add(new RetentionPolicyResult
                    {
                        PolicyId = policy.PolicyId,
                        PolicyName = policy.Name,
                        DataType = policy.DataType,
                        Success = false,
                        Error = ex.Message
                    });
                }
            }
        }

        var endTime = _timeProvider.GetUtcNow();
        _logger.LogInformation("Applied retention policies: {Deleted} deleted, {Archived} archived, {Anonymized} anonymized", totalDeleted, totalArchived, totalAnonymized);

        return Task.FromResult(new RetentionResult
        {
            Timestamp = endTime,
            TenantId = tenantId,
            PoliciesApplied = policyResults.Count(r => r.Success),
            TotalDeleted = totalDeleted,
            TotalArchived = totalArchived,
            TotalAnonymized = totalAnonymized,
            Duration = endTime - startTime,
            PolicyResults = policyResults,
            Errors = errors
        });
    }

    /// <inheritdoc />
    /// <remarks>
    /// When <paramref name="tenantId"/> is <c>null</c>, policies of all tenants are counted.
    /// The per-data-type entries are always zero in this implementation.
    /// </remarks>
    public Task<RetentionStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        var allPolicies = tenantId is not null
            ? Snapshot(tenantId)
            : _policies.Values.SelectMany(list => CopyUnderLock(list)).ToList();

        var byDataType = Enum.GetValues<RetentionDataType>()
            .ToDictionary(dt => dt, dt => new DataTypeStats { DataType = dt, CurrentCount = 0, DeletedCount = 0, ArchivedCount = 0 });

        return Task.FromResult(new RetentionStats
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalPolicies = allPolicies.Count,
            EnabledPolicies = allPolicies.Count(policy => policy.Enabled),
            DisabledPolicies = allPolicies.Count(policy => !policy.Enabled),
            // Max over a nullable selector yields null when there are no policies or none was ever applied.
            LastRunAt = allPolicies.Max(policy => policy.LastAppliedAt),
            ByDataType = byDataType
        });
    }

    /// <inheritdoc />
    public Task<RetentionPreview> PreviewAsync(string tenantId, CancellationToken cancellationToken = default)
    {
        if (!_policies.TryGetValue(tenantId, out var policies))
        {
            return Task.FromResult(new RetentionPreview { Timestamp = _timeProvider.GetUtcNow(), TenantId = tenantId });
        }

        var items = CopyUnderLock(policies)
            .Where(p => p.Enabled)
            .Select(p => new RetentionPreviewItem
            {
                PolicyId = p.PolicyId,
                PolicyName = p.Name,
                DataType = p.DataType,
                AffectedCount = SimulateRetention(p),
                Action = p.Action
            })
            .ToList();

        return Task.FromResult(new RetentionPreview
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalToDelete = items.Where(i => i.Action == RetentionAction.Delete).Sum(i => i.AffectedCount),
            TotalToArchive = items.Where(i => i.Action == RetentionAction.Archive).Sum(i => i.AffectedCount),
            TotalToAnonymize = items.Where(i => i.Action == RetentionAction.Anonymize).Sum(i => i.AffectedCount),
            Items = items
        });
    }

    /// <summary>
    /// Returns a copy of the tenant's policy list, or an empty list when the tenant has none.
    /// </summary>
    private List<RetentionPolicy> Snapshot(string tenantId)
    {
        return _policies.TryGetValue(tenantId, out var policies) ? CopyUnderLock(policies) : [];
    }

    /// <summary>
    /// Copies a policy list while holding the same lock the writers take.
    /// </summary>
    private static List<RetentionPolicy> CopyUnderLock(List<RetentionPolicy> policies)
    {
        lock (policies)
        {
            return policies.ToList();
        }
    }

    /// <summary>
    /// Produces a stand-in affected count for a policy.
    /// </summary>
    /// <remarks>
    /// In production, this would query actual data stores. The placeholder is deterministic:
    /// 100 minus the retention period in whole days, floored at zero.
    /// </remarks>
    private static int SimulateRetention(RetentionPolicy policy)
    {
        var daysFactor = (int)policy.RetentionPeriod.TotalDays;
        return Math.Max(0, 100 - daysFactor);
    }
}
/// <summary>
/// Background service that runs retention policies on schedule.
/// </summary>
/// <remarks>
/// The next run is the upcoming occurrence of <see cref="RetentionOptions.RunTime"/> measured from
/// midnight UTC, with the wait capped at <see cref="RetentionOptions.RunInterval"/>.
/// All clock reads and delays go through the injected <see cref="TimeProvider"/>.
/// </remarks>
public sealed class RetentionPolicyRunner : BackgroundService
{
    // Back-off applied after a failed run before the loop retries.
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromMinutes(5);

    private readonly IRetentionPolicyService _retentionService;
    private readonly RetentionOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<RetentionPolicyRunner> _logger;

    /// <summary>
    /// Creates the runner.
    /// </summary>
    /// <param name="retentionService">Service whose policies are applied; required.</param>
    /// <param name="options">Retention options; defaults are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="retentionService"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    public RetentionPolicyRunner(
        IRetentionPolicyService retentionService,
        IOptions<RetentionOptions> options,
        TimeProvider timeProvider,
        ILogger<RetentionPolicyRunner> logger)
    {
        _retentionService = retentionService ?? throw new ArgumentNullException(nameof(retentionService));
        _options = options?.Value ?? new RetentionOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Waits for the next scheduled time, applies all retention policies, and repeats until
    /// <paramref name="stoppingToken"/> is cancelled. Returns immediately when retention or
    /// auto-run is disabled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Enabled || !_options.AutoRun)
        {
            _logger.LogInformation("Retention policy runner is disabled");
            return;
        }

        _logger.LogInformation("Retention policy runner started with interval {Interval}", _options.RunInterval);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var delay = ComputeDelay(_timeProvider.GetUtcNow());
                await Task.Delay(delay, _timeProvider, stoppingToken);

                _logger.LogInformation("Running scheduled retention policy application");
                var result = await _retentionService.ApplyAsync(cancellationToken: stoppingToken);
                _logger.LogInformation("Retention completed: {Deleted} deleted, {Archived} archived in {Duration}ms",
                    result.TotalDeleted, result.TotalArchived, result.Duration.TotalMilliseconds);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error running retention policies");

                try
                {
                    await Task.Delay(ErrorBackoff, _timeProvider, stoppingToken);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown requested during the back-off: leave the loop instead of faulting the service.
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Computes how long to wait from <paramref name="now"/> until the next run.
    /// </summary>
    private TimeSpan ComputeDelay(DateTimeOffset now)
    {
        var utcNow = now.ToUniversalTime();

        // Build the candidate with an explicit zero offset. A bare DateTime from .Date has
        // Unspecified kind and would be converted using the host's local time zone.
        var nextRun = new DateTimeOffset(utcNow.Date, TimeSpan.Zero).Add(_options.RunTime);
        if (nextRun <= utcNow)
        {
            nextRun = nextRun.AddDays(1);
        }

        var delay = nextRun - utcNow;

        // Cap only with a positive interval; a zero interval would make the loop spin with no wait.
        if (_options.RunInterval > TimeSpan.Zero && delay > _options.RunInterval)
        {
            delay = _options.RunInterval;
        }

        return delay;
    }
}

View File

@@ -0,0 +1,244 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Extension methods for registering observability services.
/// </summary>
/// <remarks>
/// Each configuration-based method binds its options type to that type's <c>SectionName</c>
/// section and registers the default implementation as a singleton.
/// </remarks>
public static class ObservabilityServiceExtensions
{
    /// <summary>
    /// Adds all observability services (metrics, tracing, dead-letter, chaos, retention).
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration root the option sections are read from.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static IServiceCollection AddNotifierObservability(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        return services
            .AddNotifierMetrics(configuration)
            .AddNotifierTracing(configuration)
            .AddDeadLetterHandling(configuration)
            .AddChaosEngine(configuration)
            .AddRetentionPolicies(configuration);
    }

    /// <summary>
    /// Adds notifier metrics services.
    /// </summary>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static IServiceCollection AddNotifierMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<NotifierMetricsOptions>(
            configuration.GetSection(NotifierMetricsOptions.SectionName));
        services.AddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        return services;
    }

    /// <summary>
    /// Adds notifier tracing services.
    /// </summary>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static IServiceCollection AddNotifierTracing(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<NotifierTracingOptions>(
            configuration.GetSection(NotifierTracingOptions.SectionName));
        services.AddSingleton<INotifierTracing, DefaultNotifierTracing>();
        return services;
    }

    /// <summary>
    /// Adds dead-letter handling services.
    /// </summary>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static IServiceCollection AddDeadLetterHandling(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<DeadLetterOptions>(
            configuration.GetSection(DeadLetterOptions.SectionName));
        services.AddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        return services;
    }

    /// <summary>
    /// Adds chaos engine services.
    /// </summary>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static IServiceCollection AddChaosEngine(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<ChaosEngineOptions>(
            configuration.GetSection(ChaosEngineOptions.SectionName));
        services.AddSingleton<IChaosEngine, DefaultChaosEngine>();
        return services;
    }

    /// <summary>
    /// Adds retention policy services, including the hosted <see cref="RetentionPolicyRunner"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <see cref="InMemoryRetentionPolicyService"/> and <see cref="RetentionPolicyRunner"/>
    /// take a <see cref="TimeProvider"/> constructor argument and this method does not register one;
    /// confirm the host registers it elsewhere.
    /// </remarks>
    /// <exception cref="ArgumentNullException">An argument is <c>null</c>.</exception>
    public static IServiceCollection AddRetentionPolicies(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<RetentionOptions>(
            configuration.GetSection(RetentionOptions.SectionName));
        services.AddSingleton<IRetentionPolicyService, InMemoryRetentionPolicyService>();
        services.AddHostedService<RetentionPolicyRunner>();
        return services;
    }

    /// <summary>
    /// Builder for customizing observability services.
    /// </summary>
    /// <remarks>
    /// This overload only wraps <paramref name="services"/> in a builder; it performs no registrations itself.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
    public static ObservabilityServiceBuilder AddNotifierObservability(this IServiceCollection services)
    {
        return new ObservabilityServiceBuilder(services);
    }
}
/// <summary>
/// Fluent builder over an <see cref="IServiceCollection"/> for the notifier observability
/// services. Options can be adjusted, implementations can be added for each service
/// interface, and <see cref="Build"/> registers a default for every interface that has
/// no registration yet.
/// </summary>
public sealed class ObservabilityServiceBuilder
{
    private readonly IServiceCollection _services;

    /// <summary>
    /// Creates a builder that registers into <paramref name="services"/>.
    /// </summary>
    /// <param name="services">The target service collection.</param>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
    public ObservabilityServiceBuilder(IServiceCollection services)
    {
        if (services is null)
        {
            throw new ArgumentNullException(nameof(services));
        }

        _services = services;
    }

    /// <summary>
    /// Registers a configuration delegate for <see cref="NotifierMetricsOptions"/>.
    /// </summary>
    /// <param name="configure">Delegate applied to the metrics options.</param>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder ConfigureMetrics(Action<NotifierMetricsOptions> configure)
        => ApplyOptions(configure);

    /// <summary>
    /// Registers a configuration delegate for <see cref="NotifierTracingOptions"/>.
    /// </summary>
    /// <param name="configure">Delegate applied to the tracing options.</param>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder ConfigureTracing(Action<NotifierTracingOptions> configure)
        => ApplyOptions(configure);

    /// <summary>
    /// Registers a configuration delegate for <see cref="DeadLetterOptions"/>.
    /// </summary>
    /// <param name="configure">Delegate applied to the dead-letter options.</param>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder ConfigureDeadLetter(Action<DeadLetterOptions> configure)
        => ApplyOptions(configure);

    /// <summary>
    /// Registers a configuration delegate for <see cref="ChaosEngineOptions"/>.
    /// </summary>
    /// <param name="configure">Delegate applied to the chaos engine options.</param>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder ConfigureChaos(Action<ChaosEngineOptions> configure)
        => ApplyOptions(configure);

    /// <summary>
    /// Registers a configuration delegate for <see cref="RetentionOptions"/>.
    /// </summary>
    /// <param name="configure">Delegate applied to the retention options.</param>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder ConfigureRetention(Action<RetentionOptions> configure)
        => ApplyOptions(configure);

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="INotifierMetrics"/>.
    /// </summary>
    /// <typeparam name="T">The metrics implementation type.</typeparam>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder UseCustomMetrics<T>() where T : class, INotifierMetrics
        => AddImplementation<INotifierMetrics, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="INotifierTracing"/>.
    /// </summary>
    /// <typeparam name="T">The tracing implementation type.</typeparam>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder UseCustomTracing<T>() where T : class, INotifierTracing
        => AddImplementation<INotifierTracing, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="IDeadLetterHandler"/>.
    /// </summary>
    /// <typeparam name="T">The dead-letter handler implementation type.</typeparam>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder UseCustomDeadLetterHandler<T>() where T : class, IDeadLetterHandler
        => AddImplementation<IDeadLetterHandler, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="IChaosEngine"/>.
    /// </summary>
    /// <typeparam name="T">The chaos engine implementation type.</typeparam>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder UseCustomChaosEngine<T>() where T : class, IChaosEngine
        => AddImplementation<IChaosEngine, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="IRetentionPolicyService"/>.
    /// </summary>
    /// <typeparam name="T">The retention policy service implementation type.</typeparam>
    /// <returns>This builder, for chaining.</returns>
    public ObservabilityServiceBuilder UseCustomRetentionService<T>() where T : class, IRetentionPolicyService
        => AddImplementation<IRetentionPolicyService, T>();

    /// <summary>
    /// Completes the builder: registers the default implementation for each observability
    /// service that has not been registered yet, and adds
    /// <see cref="RetentionPolicyRunner"/> as a hosted service.
    /// </summary>
    /// <returns>The underlying service collection.</returns>
    public IServiceCollection Build()
    {
        // Defaults only fill the gaps; service types that already have a registration are left alone.
        _services.TryAddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        _services.TryAddSingleton<INotifierTracing, DefaultNotifierTracing>();
        _services.TryAddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        _services.TryAddSingleton<IChaosEngine, DefaultChaosEngine>();
        _services.TryAddSingleton<IRetentionPolicyService, InMemoryRetentionPolicyService>();

        _services.AddHostedService<RetentionPolicyRunner>();

        return _services;
    }

    // Shared body of the Configure* methods: registers the options delegate and returns the builder.
    private ObservabilityServiceBuilder ApplyOptions<TOptions>(Action<TOptions> configure)
        where TOptions : class
    {
        _services.Configure(configure);
        return this;
    }

    // Shared body of the UseCustom* methods: adds a singleton mapping and returns the builder.
    private ObservabilityServiceBuilder AddImplementation<TService, TImplementation>()
        where TService : class
        where TImplementation : class, TService
    {
        _services.AddSingleton<TService, TImplementation>();
        return this;
    }
}
/// <summary>
/// File-local helpers for conditional singleton registration on an
/// <see cref="IServiceCollection"/>.
/// </summary>
file static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds <typeparamref name="TImplementation"/> as the singleton for
    /// <typeparamref name="TService"/> unless the collection already contains a descriptor
    /// whose service type is <typeparamref name="TService"/>.
    /// </summary>
    /// <typeparam name="TService">The service type to look for and register.</typeparam>
    /// <typeparam name="TImplementation">The implementation registered when none exists.</typeparam>
    /// <param name="services">The service collection to inspect and update.</param>
    public static void TryAddSingleton<TService, TImplementation>(this IServiceCollection services)
        where TService : class
        where TImplementation : class, TService
    {
        var alreadyRegistered = services.Any(descriptor => descriptor.ServiceType == typeof(TService));
        if (alreadyRegistered)
        {
            return;
        }

        services.AddSingleton<TService, TImplementation>();
    }
}