up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-12-13 00:20:26 +02:00
parent e1f1bef4c1
commit 564df71bfb
2376 changed files with 334389 additions and 328032 deletions

View File

@@ -1,233 +1,233 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Counters and histograms are created on a static meter named "StellaOps.Notifier". Queue depths
/// are kept in memory and reported through an observable gauge that emits the last value recorded
/// for every tenant/channel pair.
/// NOTE(review): every instance registers another observable gauge on the shared static meter —
/// presumably the type is registered as a singleton; confirm.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (using ObservableGauge pattern).
    // Keyed by a (tenant, channel) tuple rather than a "tenant:channel" string so that
    // identifiers containing ':' are still reported instead of being silently dropped.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates every counter, histogram and the queue-depth gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");
        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");
        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");
        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");
        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");
        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");
        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");
        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");
        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration in milliseconds,
    /// both tagged with tenant, channel type and status.
    /// </summary>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };
        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged with tenant, escalation level and outcome.
    /// </summary>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant formatting keeps the tag value stable regardless of the thread culture.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };
        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged with tenant, reason and channel type.
    /// </summary>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };
        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration in milliseconds;
    /// the match flag is emitted as the lowercase string "true" or "false".
    /// </summary>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, matched.ToString().ToLowerInvariant() }
        };
        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration in milliseconds;
    /// the success flag is emitted as the lowercase string "true" or "false".
    /// </summary>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, success.ToString().ToLowerInvariant() }
        };
        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm detection event tagged with tenant, storm key and decision.
    /// </summary>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };
        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention cleanup counter,
    /// tagged with tenant and entity type.
    /// </summary>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };
        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for a tenant/channel pair; the value is reported
    /// the next time the queue-depth gauge is observed.
    /// </summary>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a "notify.delivery" activity tagged with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }
        return activity;
    }

    /// <summary>
    /// Starts a "notify.escalation" activity tagged with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }
        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per recorded tenant/channel pair.
    /// </summary>
    /// <remarks>
    /// The measurements are copied into a list while the lock is held and the list is returned,
    /// so the lock is never held while the caller enumerates (a lazy iterator with
    /// <c>yield return</c> inside the lock would keep it held across enumeration).
    /// </remarks>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var entry in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    entry.Value,
                    new TagList
                    {
                        { NotifyMetricTags.TenantId, entry.Key.TenantId },
                        { NotifyMetricTags.ChannelType, entry.Key.ChannelType }
                    }));
            }
            return snapshot;
        }
    }
}
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Counters and histograms are created on a static meter named "StellaOps.Notifier". Queue depths
/// are kept in memory and reported through an observable gauge that emits the last value recorded
/// for every tenant/channel pair.
/// NOTE(review): every instance registers another observable gauge on the shared static meter —
/// presumably the type is registered as a singleton; confirm.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (using ObservableGauge pattern).
    // Keyed by a (tenant, channel) tuple rather than a "tenant:channel" string so that
    // identifiers containing ':' are still reported instead of being silently dropped.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates every counter, histogram and the queue-depth gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");
        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");
        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");
        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");
        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");
        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");
        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");
        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");
        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration in milliseconds,
    /// both tagged with tenant, channel type and status.
    /// </summary>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };
        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged with tenant, escalation level and outcome.
    /// </summary>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant formatting keeps the tag value stable regardless of the thread culture.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };
        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged with tenant, reason and channel type.
    /// </summary>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };
        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration in milliseconds;
    /// the match flag is emitted as the lowercase string "true" or "false".
    /// </summary>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, matched.ToString().ToLowerInvariant() }
        };
        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration in milliseconds;
    /// the success flag is emitted as the lowercase string "true" or "false".
    /// </summary>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, success.ToString().ToLowerInvariant() }
        };
        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm detection event tagged with tenant, storm key and decision.
    /// </summary>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };
        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention cleanup counter,
    /// tagged with tenant and entity type.
    /// </summary>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };
        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for a tenant/channel pair; the value is reported
    /// the next time the queue-depth gauge is observed.
    /// </summary>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a "notify.delivery" activity tagged with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }
        return activity;
    }

    /// <summary>
    /// Starts a "notify.escalation" activity tagged with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }
        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per recorded tenant/channel pair.
    /// </summary>
    /// <remarks>
    /// The measurements are copied into a list while the lock is held and the list is returned,
    /// so the lock is never held while the caller enumerates (a lazy iterator with
    /// <c>yield return</c> inside the lock would keep it held across enumeration).
    /// </remarks>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var entry in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    entry.Value,
                    new TagList
                    {
                        { NotifyMetricTags.TenantId, entry.Key.TenantId },
                        { NotifyMetricTags.ChannelType, entry.Key.ChannelType }
                    }));
            }
            return snapshot;
        }
    }
}

View File

@@ -1,473 +1,473 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Chaos testing engine for simulating channel outages and failures.
/// </summary>
/// <remarks>
/// Callers register faults per channel type (optionally per tenant), then ask
/// <see cref="ShouldFailAsync"/> before a channel operation to learn whether a failure
/// should be simulated. Scenarios script a sequence of such operations.
/// </remarks>
public interface IChaosEngine
{
/// <summary>
/// Injects a fault for a channel type.
/// </summary>
/// <param name="request">Describes the channel, optional tenant, fault type and parameters.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns>The registered fault, including its generated identifier.</returns>
Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);
/// <summary>
/// Removes a fault injection.
/// </summary>
/// <param name="faultId">Identifier returned when the fault was injected.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns><c>true</c> when a fault with that identifier was found and removed.</returns>
Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);
/// <summary>
/// Gets all active faults.
/// </summary>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);
/// <summary>
/// Checks if a channel operation should fail due to chaos.
/// </summary>
/// <param name="channelType">Channel type of the operation about to run.</param>
/// <param name="tenantId">Tenant performing the operation, if known.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns>The decision plus, when a fault applies, the fault and the failure to simulate.</returns>
Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);
/// <summary>
/// Runs a chaos test scenario.
/// </summary>
/// <param name="scenario">The ordered steps to execute and the scenario-level settings.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns>The overall outcome together with one result per executed step.</returns>
Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);
/// <summary>
/// Gets chaos test history.
/// </summary>
/// <param name="limit">Maximum number of results to return.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);
/// <summary>
/// Clears all active faults.
/// </summary>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Request to inject a fault.
/// </summary>
public sealed record ChaosFaultRequest
{
/// <summary>Channel type the fault applies to; DefaultChaosEngine treats "*" as matching every channel type.</summary>
public required string ChannelType { get; init; }
/// <summary>Tenant the fault is limited to; <c>null</c> makes DefaultChaosEngine apply it to every tenant.</summary>
public string? TenantId { get; init; }
/// <summary>Kind of failure to simulate.</summary>
public required ChaosFaultType FaultType { get; init; }
/// <summary>Chance that a matching operation fails; DefaultChaosEngine clamps it to the range 0.0 to 1.0. Defaults to 1.0.</summary>
public double FailureProbability { get; init; } = 1.0;
/// <summary>Requested lifetime of the fault; DefaultChaosEngine falls back to, and caps at, <see cref="ChaosEngineOptions.MaxFaultDuration"/>.</summary>
public TimeSpan? Duration { get; init; }
/// <summary>Latency to report through <see cref="ChaosFaultResult.InjectedLatency"/> when the fault triggers.</summary>
public TimeSpan? LatencyInjection { get; init; }
/// <summary>Error code carried on the fault; DefaultChaosEngine copies it but does not otherwise read it.</summary>
public int? ErrorCode { get; init; }
/// <summary>Message for the simulated exception; DefaultChaosEngine substitutes a per-fault-type default when <c>null</c>.</summary>
public string? ErrorMessage { get; init; }
/// <summary>Free-form description carried on the fault.</summary>
public string? Description { get; init; }
}
/// <summary>
/// Type of chaos fault.
/// </summary>
/// <remarks>
/// DefaultChaosEngine maps <see cref="Outage"/> and <see cref="RateLimit"/> to
/// InvalidOperationException, <see cref="AuthFailure"/> to UnauthorizedAccessException and
/// <see cref="Timeout"/> to TimeoutException; every other member yields a plain Exception.
/// </remarks>
public enum ChaosFaultType
{
Outage,
Latency,
RateLimit,
AuthFailure,
Timeout,
PartialFailure,
Intermittent,
ErrorResponse,
CorruptResponse
}
/// <summary>
/// Active fault injection.
/// </summary>
public sealed record ChaosFaultInjection
{
/// <summary>Identifier of the fault, used to remove it later.</summary>
public required string FaultId { get; init; }
/// <summary>Channel type the fault applies to; DefaultChaosEngine treats "*" as matching every channel type.</summary>
public required string ChannelType { get; init; }
/// <summary>Tenant the fault is limited to; <c>null</c> makes DefaultChaosEngine apply it to every tenant.</summary>
public string? TenantId { get; init; }
/// <summary>Kind of failure being simulated.</summary>
public required ChaosFaultType FaultType { get; init; }
/// <summary>Chance that a matching operation fails; DefaultChaosEngine stores a value between 0.0 and 1.0.</summary>
public double FailureProbability { get; init; }
/// <summary>Latency reported through <see cref="ChaosFaultResult.InjectedLatency"/> when the fault triggers.</summary>
public TimeSpan? LatencyInjection { get; init; }
/// <summary>Error code copied from the request.</summary>
public int? ErrorCode { get; init; }
/// <summary>Message used for the simulated exception, when provided.</summary>
public string? ErrorMessage { get; init; }
/// <summary>Free-form description copied from the request.</summary>
public string? Description { get; init; }
/// <summary>Time the fault was injected.</summary>
public DateTimeOffset CreatedAt { get; init; }
/// <summary>Time after which DefaultChaosEngine discards the fault during its expiry sweep.</summary>
public DateTimeOffset? ExpiresAt { get; init; }
/// <summary>Number of times the fault caused a simulated failure; DefaultChaosEngine stores an incremented copy on each trigger.</summary>
public int TriggerCount { get; init; }
/// <summary>Whether the fault is considered when matching operations. Defaults to <c>true</c>.</summary>
public bool IsActive { get; init; } = true;
}
/// <summary>
/// Result of checking for chaos fault.
/// </summary>
public sealed record ChaosFaultResult
{
/// <summary><c>true</c> when the caller should simulate a failure for this operation.</summary>
public bool ShouldFail { get; init; }
/// <summary>The fault that matched the operation; DefaultChaosEngine also sets it when the fault matched but the probability roll did not trigger.</summary>
public ChaosFaultInjection? ActiveFault { get; init; }
/// <summary>Latency configured on the triggered fault; DefaultChaosEngine sets it only when <see cref="ShouldFail"/> is <c>true</c>.</summary>
public TimeSpan? InjectedLatency { get; init; }
/// <summary>Exception instance representing the simulated failure; it is returned here, not thrown by the engine.</summary>
public Exception? SimulatedException { get; init; }
}
/// <summary>
/// A chaos test scenario.
/// </summary>
public sealed record ChaosScenario
{
/// <summary>Identifier of the scenario, echoed on the test result.</summary>
public required string ScenarioId { get; init; }
/// <summary>Display name of the scenario, echoed on the test result.</summary>
public required string Name { get; init; }
/// <summary>Free-form description of the scenario.</summary>
public string? Description { get; init; }
/// <summary>Steps executed in list order.</summary>
public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }
/// <summary>Time budget for the whole scenario. Defaults to 10 minutes.</summary>
public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);
/// <summary>When <c>true</c>, DefaultChaosEngine stops after the first failed step; otherwise it runs the remaining steps.</summary>
public bool StopOnFirstFailure { get; init; }
}
/// <summary>
/// A step in a chaos scenario.
/// </summary>
/// <remarks>
/// Which optional property is read depends on <see cref="Action"/>. DefaultChaosEngine performs
/// no work, and still reports success, when the property matching the action is <c>null</c>.
/// </remarks>
public sealed record ChaosScenarioStep
{
/// <summary>Identifier of the step, echoed on the step result.</summary>
public required string StepId { get; init; }
/// <summary>Display name of the step, echoed on the step result.</summary>
public required string Name { get; init; }
/// <summary>Operation the step performs.</summary>
public required ChaosStepAction Action { get; init; }
/// <summary>Fault to register; read for <see cref="ChaosStepAction.InjectFault"/>.</summary>
public ChaosFaultRequest? FaultToInject { get; init; }
/// <summary>Identifier of the fault to remove; read for <see cref="ChaosStepAction.RemoveFault"/>.</summary>
public string? FaultIdToRemove { get; init; }
/// <summary>How long to pause; read for <see cref="ChaosStepAction.Wait"/>.</summary>
public TimeSpan? WaitDuration { get; init; }
/// <summary>Condition to check; read for <see cref="ChaosStepAction.Assert"/>.</summary>
public ChaosAssertion? Assertion { get; init; }
}
/// <summary>
/// Action type for a chaos step.
/// </summary>
/// <remarks>
/// DefaultChaosEngine has handlers for <see cref="InjectFault"/>, <see cref="RemoveFault"/>,
/// <see cref="Wait"/> and <see cref="Assert"/> only; <see cref="SendTestDelivery"/> and
/// <see cref="CheckMetrics"/> steps do nothing there and are reported as successful.
/// </remarks>
public enum ChaosStepAction
{
InjectFault,
RemoveFault,
Wait,
Assert,
SendTestDelivery,
CheckMetrics
}
/// <summary>
/// Assertion for chaos testing.
/// </summary>
/// <remarks>
/// DefaultChaosEngine's assertion evaluation is a placeholder that passes every assertion,
/// so none of the comparison properties below are read by it.
/// </remarks>
public sealed record ChaosAssertion
{
/// <summary>Kind of check to perform.</summary>
public required ChaosAssertionType Type { get; init; }
/// <summary>Name of the metric to inspect — presumably for the Metric* assertion types; confirm against the evaluator.</summary>
public string? MetricName { get; init; }
/// <summary>Value the assertion compares against.</summary>
public double? ExpectedValue { get; init; }
/// <summary>Allowed deviation from <see cref="ExpectedValue"/> — presumably absolute; confirm against the evaluator.</summary>
public double? Tolerance { get; init; }
/// <summary>Status string the assertion expects.</summary>
public string? ExpectedStatus { get; init; }
}
/// <summary>
/// Type of chaos assertion.
/// </summary>
/// <remarks>
/// DefaultChaosEngine currently reports every assertion type as passed without evaluating it.
/// </remarks>
public enum ChaosAssertionType
{
MetricEquals,
MetricGreaterThan,
MetricLessThan,
DeadLetterCountEquals,
FallbackTriggered,
AlertFired
}
/// <summary>
/// Result of a chaos test.
/// </summary>
public sealed record ChaosTestResult
{
/// <summary>Identifier generated for this run.</summary>
public required string TestId { get; init; }
/// <summary>Identifier of the scenario that was run.</summary>
public required string ScenarioId { get; init; }
/// <summary>Name of the scenario that was run.</summary>
public required string ScenarioName { get; init; }
/// <summary><c>true</c> when no step failed and no error was recorded.</summary>
public bool Success { get; init; }
/// <summary>Time the run started.</summary>
public DateTimeOffset StartedAt { get; init; }
/// <summary>Time the run finished.</summary>
public DateTimeOffset CompletedAt { get; init; }
/// <summary>Elapsed time; DefaultChaosEngine sets it to <see cref="CompletedAt"/> minus <see cref="StartedAt"/>.</summary>
public TimeSpan Duration { get; init; }
/// <summary>One entry per executed step, in execution order. Defaults to empty.</summary>
public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];
/// <summary>Failure description, when one was recorded.</summary>
public string? Error { get; init; }
}
/// <summary>
/// Result of a chaos step.
/// </summary>
public sealed record ChaosStepResult
{
/// <summary>Identifier of the step that was executed.</summary>
public required string StepId { get; init; }
/// <summary>Name of the step that was executed.</summary>
public required string StepName { get; init; }
/// <summary><c>true</c> when the step completed without a failed assertion or exception.</summary>
public bool Success { get; init; }
/// <summary>Time the step started executing.</summary>
public DateTimeOffset ExecutedAt { get; init; }
/// <summary>Elapsed time of the step.</summary>
public TimeSpan Duration { get; init; }
/// <summary>Failure description: the failed assertion's error or the exception message.</summary>
public string? Error { get; init; }
/// <summary>Optional payload produced by the step; DefaultChaosEngine stores the injected fault here for inject-fault steps.</summary>
public object? Data { get; init; }
}
/// <summary>
/// Options for chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
/// <summary>Configuration section name for these options.</summary>
public const string SectionName = "Notifier:Observability:Chaos";
/// <summary>Master switch; while <c>false</c>, DefaultChaosEngine rejects fault injection and scenario runs and never reports a simulated failure.</summary>
public bool Enabled { get; set; }
/// <summary>NOTE(review): not read by DefaultChaosEngine in this file — presumably enforced elsewhere; confirm.</summary>
public bool AllowInProduction { get; set; }
/// <summary>Upper bound, and default, for the lifetime of an injected fault. Defaults to 1 hour.</summary>
public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);
/// <summary>Maximum number of faults that may be registered at once. Defaults to 10.</summary>
public int MaxConcurrentFaults { get; set; } = 10;
/// <summary>NOTE(review): not read by DefaultChaosEngine in this file — presumably validated elsewhere; confirm.</summary>
public IReadOnlyList<string> AllowedChannelTypes { get; set; } = ["webhook", "email", "slack", "teams", "pagerduty", "opsgenie"];
}
/// <summary>
/// Default implementation of chaos engine.
/// </summary>
/// <remarks>
/// Active faults and scenario history are held in memory only. Expired faults are removed
/// lazily whenever faults are injected, listed or matched.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    // Upper bound on retained scenario results; the oldest entries are dropped first.
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; default options are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; the system clock is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink (stored but not currently used by this class).</param>
    /// <param name="logger">Logger for fault and scenario events.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault. The failure probability is clamped to the range 0.0 to 1.0 and
    /// the lifetime is capped at the configured maximum fault duration.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The engine is disabled, or the maximum number of concurrent faults is already registered.
    /// </exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        // Drop expired faults first so they do not count toward the concurrency limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            // "chaos-" followed by the first ten hex digits of a GUID (16 characters in total).
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);
        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given identifier.
    /// </summary>
    /// <returns><c>true</c> when a fault was removed; <c>false</c> when the identifier was unknown.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after discarding expired ones.
    /// </summary>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation on <paramref name="channelType"/> should fail.
    /// </summary>
    /// <remarks>
    /// A fault matches when it is active, its channel type equals <paramref name="channelType"/>
    /// or is "*", and its tenant is <c>null</c> or equals <paramref name="tenantId"/>. The first
    /// matching fault is rolled against its failure probability.
    /// </remarks>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values
            .Where(f => f.IsActive)
            .Where(f => f.ChannelType == channelType || f.ChannelType == "*")
            .Where(f => f.TenantId is null || f.TenantId == tenantId)
            .FirstOrDefault();

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is thread-safe; a single Random instance shared by concurrent callers is not.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        IncrementTriggerCount(matchingFault.FaultId);

        var exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
            _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Executes the scenario's steps in order and appends the outcome to the in-memory history.
    /// </summary>
    /// <remarks>
    /// Execution stops when the scenario timeout elapses or the caller cancels; both are
    /// recorded on the result instead of being thrown.
    /// </remarks>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Steps that never observe the token must not run once the budget is exhausted.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (!stepResult.Success)
                {
                    success = false;
                    if (scenario.StopOnFirstFailure)
                    {
                        error = $"Step '{step.Name}' failed: {stepResult.Error}";
                        break;
                    }
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (Exception ex)
        {
            // Includes caller-requested cancellation, which is recorded rather than rethrown.
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> of the most recent scenario results, newest first.
    /// </summary>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    // Runs one scenario step. Failures are converted into a failed step result; cancellation
    // is allowed to propagate so the scenario loop can stop and report it.
    // NOTE(review): an action whose payload is missing, and the SendTestDelivery / CheckMetrics
    // actions, match no case and are reported as successful no-ops — confirm this is intended.
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();
        try
        {
            object? data = null;
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                    data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    break;
                case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    break;
                case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    break;
                case ChaosStepAction.Assert when step.Assertion is not null:
                    var assertResult = EvaluateAssertion(step.Assertion);
                    if (!assertResult.passed)
                    {
                        return new ChaosStepResult { StepId = step.StepId, StepName = step.Name, Success = false, ExecutedAt = executedAt, Duration = _timeProvider.GetUtcNow() - executedAt, Error = assertResult.error };
                    }

                    break;
            }

            return new ChaosStepResult { StepId = step.StepId, StepName = step.Name, Success = true, ExecutedAt = executedAt, Duration = _timeProvider.GetUtcNow() - executedAt, Data = data };
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            return new ChaosStepResult { StepId = step.StepId, StepName = step.Name, Success = false, ExecutedAt = executedAt, Duration = _timeProvider.GetUtcNow() - executedAt, Error = ex.Message };
        }
    }

    // Placeholder evaluation: every assertion type currently passes.
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    // Atomically bumps the trigger count of a fault. Compare-and-swap is used instead of the
    // indexer so a concurrently removed fault is not re-added and concurrent increments are kept.
    private void IncrementTriggerCount(string faultId)
    {
        while (_activeFaults.TryGetValue(faultId, out var current))
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };
            if (_activeFaults.TryUpdate(faultId, updated, current))
            {
                return;
            }
        }
    }

    // Removes every fault whose expiry time is earlier than the current time.
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expired = _activeFaults.Where(f => f.Value.ExpiresAt.HasValue && f.Value.ExpiresAt < now).Select(f => f.Key).ToList();
        foreach (var id in expired)
        {
            _activeFaults.TryRemove(id, out _);
            _logger.LogDebug("Expired chaos fault {FaultId}", id);
        }
    }
}
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Chaos testing engine for simulating channel outages and failures.
/// </summary>
/// <remarks>
/// Callers register faults per channel type (optionally per tenant), then ask
/// <see cref="ShouldFailAsync"/> before a channel operation to learn whether a failure
/// should be simulated. Scenarios script a sequence of such operations.
/// </remarks>
public interface IChaosEngine
{
/// <summary>
/// Injects a fault for a channel type.
/// </summary>
/// <param name="request">Describes the channel, optional tenant, fault type and parameters.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns>The registered fault, including its generated identifier.</returns>
Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);
/// <summary>
/// Removes a fault injection.
/// </summary>
/// <param name="faultId">Identifier returned when the fault was injected.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns><c>true</c> when a fault with that identifier was found and removed.</returns>
Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);
/// <summary>
/// Gets all active faults.
/// </summary>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);
/// <summary>
/// Checks if a channel operation should fail due to chaos.
/// </summary>
/// <param name="channelType">Channel type of the operation about to run.</param>
/// <param name="tenantId">Tenant performing the operation, if known.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns>The decision plus, when a fault applies, the fault and the failure to simulate.</returns>
Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);
/// <summary>
/// Runs a chaos test scenario.
/// </summary>
/// <param name="scenario">The ordered steps to execute and the scenario-level settings.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
/// <returns>The overall outcome together with one result per executed step.</returns>
Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);
/// <summary>
/// Gets chaos test history.
/// </summary>
/// <param name="limit">Maximum number of results to return.</param>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);
/// <summary>
/// Clears all active faults.
/// </summary>
/// <param name="cancellationToken">Token to cancel the operation.</param>
Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Request to inject a fault.
/// </summary>
public sealed record ChaosFaultRequest
{
/// <summary>Channel type the fault applies to; DefaultChaosEngine treats "*" as matching every channel type.</summary>
public required string ChannelType { get; init; }
/// <summary>Tenant the fault is limited to; <c>null</c> makes DefaultChaosEngine apply it to every tenant.</summary>
public string? TenantId { get; init; }
/// <summary>Kind of failure to simulate.</summary>
public required ChaosFaultType FaultType { get; init; }
/// <summary>Chance that a matching operation fails; DefaultChaosEngine clamps it to the range 0.0 to 1.0. Defaults to 1.0.</summary>
public double FailureProbability { get; init; } = 1.0;
/// <summary>Requested lifetime of the fault; DefaultChaosEngine falls back to, and caps at, <see cref="ChaosEngineOptions.MaxFaultDuration"/>.</summary>
public TimeSpan? Duration { get; init; }
/// <summary>Latency to report through <see cref="ChaosFaultResult.InjectedLatency"/> when the fault triggers.</summary>
public TimeSpan? LatencyInjection { get; init; }
/// <summary>Error code carried on the fault; DefaultChaosEngine copies it but does not otherwise read it.</summary>
public int? ErrorCode { get; init; }
/// <summary>Message for the simulated exception; DefaultChaosEngine substitutes a per-fault-type default when <c>null</c>.</summary>
public string? ErrorMessage { get; init; }
/// <summary>Free-form description carried on the fault.</summary>
public string? Description { get; init; }
}
/// <summary>
/// Type of chaos fault.
/// </summary>
/// <remarks>
/// DefaultChaosEngine maps <see cref="Outage"/> and <see cref="RateLimit"/> to
/// InvalidOperationException, <see cref="AuthFailure"/> to UnauthorizedAccessException and
/// <see cref="Timeout"/> to TimeoutException; every other member yields a plain Exception.
/// </remarks>
public enum ChaosFaultType
{
Outage,
Latency,
RateLimit,
AuthFailure,
Timeout,
PartialFailure,
Intermittent,
ErrorResponse,
CorruptResponse
}
/// <summary>
/// Active fault injection.
/// </summary>
public sealed record ChaosFaultInjection
{
/// <summary>Identifier of the fault, used to remove it later.</summary>
public required string FaultId { get; init; }
/// <summary>Channel type the fault applies to; DefaultChaosEngine treats "*" as matching every channel type.</summary>
public required string ChannelType { get; init; }
/// <summary>Tenant the fault is limited to; <c>null</c> makes DefaultChaosEngine apply it to every tenant.</summary>
public string? TenantId { get; init; }
/// <summary>Kind of failure being simulated.</summary>
public required ChaosFaultType FaultType { get; init; }
/// <summary>Chance that a matching operation fails; DefaultChaosEngine stores a value between 0.0 and 1.0.</summary>
public double FailureProbability { get; init; }
/// <summary>Latency reported through <see cref="ChaosFaultResult.InjectedLatency"/> when the fault triggers.</summary>
public TimeSpan? LatencyInjection { get; init; }
/// <summary>Error code copied from the request.</summary>
public int? ErrorCode { get; init; }
/// <summary>Message used for the simulated exception, when provided.</summary>
public string? ErrorMessage { get; init; }
/// <summary>Free-form description copied from the request.</summary>
public string? Description { get; init; }
/// <summary>Time the fault was injected.</summary>
public DateTimeOffset CreatedAt { get; init; }
/// <summary>Time after which DefaultChaosEngine discards the fault during its expiry sweep.</summary>
public DateTimeOffset? ExpiresAt { get; init; }
/// <summary>Number of times the fault caused a simulated failure; DefaultChaosEngine stores an incremented copy on each trigger.</summary>
public int TriggerCount { get; init; }
/// <summary>Whether the fault is considered when matching operations. Defaults to <c>true</c>.</summary>
public bool IsActive { get; init; } = true;
}
/// <summary>
/// Result of checking for chaos fault.
/// </summary>
public sealed record ChaosFaultResult
{
/// <summary><c>true</c> when the caller should simulate a failure for this operation.</summary>
public bool ShouldFail { get; init; }
/// <summary>The fault that matched the operation; DefaultChaosEngine also sets it when the fault matched but the probability roll did not trigger.</summary>
public ChaosFaultInjection? ActiveFault { get; init; }
/// <summary>Latency configured on the triggered fault; DefaultChaosEngine sets it only when <see cref="ShouldFail"/> is <c>true</c>.</summary>
public TimeSpan? InjectedLatency { get; init; }
/// <summary>Exception instance representing the simulated failure; it is returned here, not thrown by the engine.</summary>
public Exception? SimulatedException { get; init; }
}
/// <summary>
/// A chaos test scenario.
/// </summary>
public sealed record ChaosScenario
{
/// <summary>Identifier of the scenario; copied onto the <see cref="ChaosTestResult"/>.</summary>
public required string ScenarioId { get; init; }
/// <summary>Display name of the scenario; copied onto the <see cref="ChaosTestResult"/> and logged.</summary>
public required string Name { get; init; }
/// <summary>Optional free-text description.</summary>
public string? Description { get; init; }
/// <summary>Steps executed sequentially, in list order.</summary>
public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }
/// <summary>Overall time budget for the run; defaults to 10 minutes. The default engine cancels the run when it elapses.</summary>
public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);
/// <summary>When <c>true</c>, the default engine stops at the first failed step and records its error on the result.</summary>
public bool StopOnFirstFailure { get; init; }
}
/// <summary>
/// A step in a chaos scenario.
/// </summary>
/// <remarks>
/// Which optional property is read depends on <see cref="Action"/>: <see cref="FaultToInject"/> for
/// <see cref="ChaosStepAction.InjectFault"/>, <see cref="FaultIdToRemove"/> for
/// <see cref="ChaosStepAction.RemoveFault"/>, <see cref="WaitDuration"/> for
/// <see cref="ChaosStepAction.Wait"/> and <see cref="Assertion"/> for <see cref="ChaosStepAction.Assert"/>.
/// </remarks>
public sealed record ChaosScenarioStep
{
/// <summary>Identifier of the step; copied onto the <see cref="ChaosStepResult"/>.</summary>
public required string StepId { get; init; }
/// <summary>Display name of the step; copied onto the <see cref="ChaosStepResult"/>.</summary>
public required string Name { get; init; }
/// <summary>What the step does.</summary>
public required ChaosStepAction Action { get; init; }
/// <summary>Fault request used by an <see cref="ChaosStepAction.InjectFault"/> step.</summary>
public ChaosFaultRequest? FaultToInject { get; init; }
/// <summary>Identifier of the fault removed by a <see cref="ChaosStepAction.RemoveFault"/> step.</summary>
public string? FaultIdToRemove { get; init; }
/// <summary>Delay applied by a <see cref="ChaosStepAction.Wait"/> step.</summary>
public TimeSpan? WaitDuration { get; init; }
/// <summary>Assertion evaluated by an <see cref="ChaosStepAction.Assert"/> step.</summary>
public ChaosAssertion? Assertion { get; init; }
}
/// <summary>
/// Action type for a chaos step.
/// </summary>
/// <remarks>
/// The default engine in this file has handling for <see cref="InjectFault"/>, <see cref="RemoveFault"/>,
/// <see cref="Wait"/> and <see cref="Assert"/>. <see cref="SendTestDelivery"/> and
/// <see cref="CheckMetrics"/> have no case there and complete as successful no-ops.
/// </remarks>
public enum ChaosStepAction
{
InjectFault,
RemoveFault,
Wait,
Assert,
SendTestDelivery,
CheckMetrics
}
/// <summary>
/// Assertion for chaos testing.
/// </summary>
/// <remarks>
/// NOTE(review): the default engine's assertion evaluation in this file is a stub that reports every
/// assertion as passed and reads none of the value properties below; their intended semantics are
/// inferred from their names and should be confirmed when real evaluation is implemented.
/// </remarks>
public sealed record ChaosAssertion
{
/// <summary>Kind of check to perform.</summary>
public required ChaosAssertionType Type { get; init; }
/// <summary>Name of the metric the assertion refers to (presumably for the metric-based assertion types).</summary>
public string? MetricName { get; init; }
/// <summary>Value the assertion compares against.</summary>
public double? ExpectedValue { get; init; }
/// <summary>Allowed deviation from <see cref="ExpectedValue"/> (presumably for equality-style checks).</summary>
public double? Tolerance { get; init; }
/// <summary>Status string the assertion expects.</summary>
public string? ExpectedStatus { get; init; }
}
/// <summary>
/// Type of chaos assertion.
/// </summary>
/// <remarks>
/// The default engine's evaluation in this file currently reports every type as passed; members are
/// not yet differentiated at runtime.
/// </remarks>
public enum ChaosAssertionType
{
MetricEquals,
MetricGreaterThan,
MetricLessThan,
DeadLetterCountEquals,
FallbackTriggered,
AlertFired
}
/// <summary>
/// Result of a chaos test.
/// </summary>
public sealed record ChaosTestResult
{
/// <summary>Identifier of this run. The default engine generates <c>test-</c> followed by GUID hex digits, truncated to 16 characters in total.</summary>
public required string TestId { get; init; }
/// <summary>Identifier of the scenario that was run.</summary>
public required string ScenarioId { get; init; }
/// <summary>Name of the scenario that was run.</summary>
public required string ScenarioName { get; init; }
/// <summary><c>true</c> when no step failed and the run neither timed out nor threw.</summary>
public bool Success { get; init; }
/// <summary>Time the run started.</summary>
public DateTimeOffset StartedAt { get; init; }
/// <summary>Time the run finished.</summary>
public DateTimeOffset CompletedAt { get; init; }
/// <summary>Elapsed time; the default engine sets it to <see cref="CompletedAt"/> minus <see cref="StartedAt"/>.</summary>
public TimeSpan Duration { get; init; }
/// <summary>Results of the steps that were executed, in execution order.</summary>
public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];
/// <summary>Scenario-level error: the timeout message, an exception message, or the failing step when the scenario stops on first failure; otherwise <c>null</c>.</summary>
public string? Error { get; init; }
}
/// <summary>
/// Result of a chaos step.
/// </summary>
public sealed record ChaosStepResult
{
/// <summary>Identifier of the executed step.</summary>
public required string StepId { get; init; }
/// <summary>Name of the executed step.</summary>
public required string StepName { get; init; }
/// <summary><c>true</c> when the step completed without error.</summary>
public bool Success { get; init; }
/// <summary>Time the step started executing.</summary>
public DateTimeOffset ExecutedAt { get; init; }
/// <summary>Time spent executing the step.</summary>
public TimeSpan Duration { get; init; }
/// <summary>Failure description (exception message or assertion error); <c>null</c> on success.</summary>
public string? Error { get; init; }
/// <summary>Step output. The default engine sets it to the injected <see cref="ChaosFaultInjection"/> for inject-fault steps and leaves it <c>null</c> otherwise.</summary>
public object? Data { get; init; }
}
/// <summary>
/// Options for chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
/// <summary>Configuration section name for these options.</summary>
public const string SectionName = "Notifier:Observability:Chaos";
/// <summary>Master switch; defaults to <c>false</c>. When disabled the default engine refuses to inject faults or run scenarios and reports no fault on checks.</summary>
public bool Enabled { get; set; }
/// <summary>Production opt-in flag. NOTE(review): not read by the engine code visible in this file — confirm where it is enforced.</summary>
public bool AllowInProduction { get; set; }
/// <summary>Upper bound (and default) for a fault's lifetime; defaults to 1 hour.</summary>
public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);
/// <summary>Maximum number of faults that may be registered at once; defaults to 10.</summary>
public int MaxConcurrentFaults { get; set; } = 10;
/// <summary>Channel types faults may target. NOTE(review): not read by the engine code visible in this file — confirm where it is enforced.</summary>
public IReadOnlyList<string> AllowedChannelTypes { get; set; } = ["webhook", "email", "slack", "teams", "pagerduty", "opsgenie"];
}
/// <summary>
/// Default implementation of chaos engine.
/// </summary>
/// <remarks>
/// State is held in memory only: active faults live in a <see cref="ConcurrentDictionary{TKey,TValue}"/>
/// keyed by fault id, and the most recent scenario results are kept in a list guarded by a lock.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    /// <summary>Maximum number of scenario results retained in the history.</summary>
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; default options are used when the wrapper or its value is <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; falls back to <see cref="TimeProvider.System"/>.</param>
    /// <param name="metrics">Optional metrics sink (currently stored but not used by this class).</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault built from <paramref name="request"/>.
    /// </summary>
    /// <param name="request">Fault description. The probability is clamped to 0.0-1.0 and the duration is capped at <see cref="ChaosEngineOptions.MaxFaultDuration"/>.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>The registered fault.</returns>
    /// <exception cref="InvalidOperationException">The engine is disabled or the concurrent-fault limit is reached.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        ArgumentNullException.ThrowIfNull(request);

        // Expired faults are only removed lazily; drop them first so they do not
        // count against the concurrency limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);
        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes the fault with the given id.
    /// </summary>
    /// <returns><c>true</c> when a fault was removed; <c>false</c> when no such fault exists.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after dropping expired ones.
    /// </summary>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether a delivery on <paramref name="channelType"/> should be failed artificially.
    /// </summary>
    /// <param name="channelType">Channel being used; faults registered for this type or for <c>"*"</c> match.</param>
    /// <param name="tenantId">Tenant of the delivery; faults with no tenant match every tenant.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>
    /// A result with <see cref="ChaosFaultResult.ShouldFail"/> set when a matching fault triggered. When a
    /// fault matched but the probability roll did not trigger it, the fault is still reported in
    /// <see cref="ChaosFaultResult.ActiveFault"/>.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values
            .Where(f => f.IsActive)
            .Where(f => f.ChannelType == channelType || f.ChannelType == "*")
            .Where(f => f.TenantId is null || f.TenantId == tenantId)
            .FirstOrDefault();

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is safe for concurrent callers; a private Random instance is not.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        IncrementTriggerCount(matchingFault.FaultId);

        var exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
            _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Runs the steps of <paramref name="scenario"/> in order and records the outcome in the history.
    /// </summary>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Cancels the run; cancellation is reported on the result rather than thrown.</param>
    /// <returns>The result, including one entry per executed step.</returns>
    /// <exception cref="InvalidOperationException">The engine is disabled.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <c>null</c>.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        ArgumentNullException.ThrowIfNull(scenario);

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Steps that complete synchronously never observe the token themselves,
                // so enforce the timeout between steps as well.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (!stepResult.Success)
                {
                    success = false;
                    if (scenario.StopOnFirstFailure)
                    {
                        error = $"Step '{step.Name}' failed: {stepResult.Error}";
                        break;
                    }
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (Exception ex)
        {
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Returns up to <paramref name="limit"/> of the most recent scenario results, newest first.
    /// </summary>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault.
    /// </summary>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    /// <summary>
    /// Executes a single scenario step and converts its outcome into a <see cref="ChaosStepResult"/>.
    /// Cancellation is allowed to propagate so the caller can report a timeout for the whole scenario.
    /// </summary>
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        ChaosStepResult Complete(bool success, string? error = null, object? data = null) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = success,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Error = error,
            Data = data
        };

        try
        {
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault:
                {
                    if (step.FaultToInject is null)
                    {
                        return Complete(false, "InjectFault step requires FaultToInject");
                    }

                    var fault = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    return Complete(true, data: fault);
                }

                case ChaosStepAction.RemoveFault:
                {
                    if (step.FaultIdToRemove is null)
                    {
                        return Complete(false, "RemoveFault step requires FaultIdToRemove");
                    }

                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    return Complete(true);
                }

                case ChaosStepAction.Wait:
                {
                    if (!step.WaitDuration.HasValue)
                    {
                        return Complete(false, "Wait step requires WaitDuration");
                    }

                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    return Complete(true);
                }

                case ChaosStepAction.Assert:
                {
                    if (step.Assertion is null)
                    {
                        return Complete(false, "Assert step requires Assertion");
                    }

                    var (passed, assertionError) = EvaluateAssertion(step.Assertion);
                    return passed ? Complete(true) : Complete(false, assertionError);
                }

                default:
                    // SendTestDelivery and CheckMetrics have no implementation in this engine;
                    // they complete as successful no-ops.
                    return Complete(true);
            }
        }
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            return Complete(false, ex.Message);
        }
    }

    /// <summary>
    /// Evaluates an assertion. Currently a stub: every assertion type is reported as passed.
    /// </summary>
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    /// <summary>
    /// Increments the trigger count of a fault with a compare-and-swap loop, so concurrent
    /// increments are not lost and a fault removed in the meantime is not re-added.
    /// </summary>
    private void IncrementTriggerCount(string faultId)
    {
        while (_activeFaults.TryGetValue(faultId, out var current))
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };
            if (_activeFaults.TryUpdate(faultId, updated, current))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Removes faults whose expiry time is earlier than the current time.
    /// </summary>
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expired = _activeFaults
            .Where(f => f.Value.ExpiresAt.HasValue && f.Value.ExpiresAt < now)
            .Select(f => f.Key)
            .ToList();

        foreach (var id in expired)
        {
            _activeFaults.TryRemove(id, out _);
            _logger.LogDebug("Expired chaos fault {FaultId}", id);
        }
    }
}

View File

@@ -1,351 +1,351 @@
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Handles notifications that have failed permanently after all retries.
/// </summary>
public interface IDeadLetterHandler
{
/// <summary>
/// Moves a delivery to the dead-letter queue.
/// </summary>
/// <param name="tenantId">Tenant that owns the delivery.</param>
/// <param name="deliveryId">Identifier of the failed delivery.</param>
/// <param name="reason">Why the delivery is being dead-lettered.</param>
/// <param name="channelType">Channel the delivery was attempted on.</param>
/// <param name="payload">Optional original payload to keep with the entry.</param>
/// <param name="exception">Optional exception that caused the failure.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The created dead-letter entry.</returns>
Task<DeadLetteredDelivery> DeadLetterAsync(
string tenantId,
string deliveryId,
DeadLetterReason reason,
string channelType,
object? payload = null,
Exception? exception = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets dead-lettered deliveries for a tenant.
/// </summary>
/// <param name="tenantId">Tenant whose entries are returned.</param>
/// <param name="query">Optional filter and paging; <c>null</c> applies no filter.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The matching entries; empty when there are none.</returns>
Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
string tenantId,
DeadLetterQuery? query = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Retries a dead-lettered delivery.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The outcome of the retry, including the entry's new status.</returns>
Task<DeadLetterRetryResult> RetryAsync(
string tenantId,
string deadLetterId,
CancellationToken cancellationToken = default);
/// <summary>
/// Retries all matching dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant whose entries are retried.</param>
/// <param name="query">Optional filter selecting the entries to retry.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Aggregate counts plus the individual retry results.</returns>
Task<DeadLetterBulkRetryResult> RetryBulkAsync(
string tenantId,
DeadLetterQuery? query = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Discards a dead-lettered delivery.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
/// <param name="reason">Optional explanation recorded with the entry.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns><c>true</c> when the entry was found and discarded; otherwise <c>false</c>.</returns>
Task<bool> DiscardAsync(
string tenantId,
string deadLetterId,
string? reason = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets statistics about dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant to report on, or <c>null</c> to aggregate across all tenants.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Counts by status, reason and channel.</returns>
Task<DeadLetterStats> GetStatsAsync(
string? tenantId = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Purges old dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant to purge, or <c>null</c> to purge across all tenants.</param>
/// <param name="olderThan">Minimum age of the entries to remove.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The number of entries removed.</returns>
Task<int> PurgeAsync(
string? tenantId,
TimeSpan olderThan,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Reason for dead-lettering.
/// </summary>
/// <remarks>
/// The in-memory handler passes the member name (via <c>ToString()</c>) to the metrics sink, so
/// renaming a member changes the recorded metric value. Members carry no explicit values, so their
/// numeric values follow declaration order.
/// </remarks>
public enum DeadLetterReason
{
MaxRetriesExceeded,
InvalidPayload,
ChannelUnavailable,
AuthenticationFailed,
RateLimited,
TemplateRenderFailed,
ConfigurationError,
UnknownError
}
/// <summary>
/// A dead-lettered delivery.
/// </summary>
/// <remarks>
/// Immutable snapshot; the in-memory handler replaces the stored instance (via <c>with</c>) when an
/// entry is retried or discarded.
/// </remarks>
public sealed record DeadLetteredDelivery
{
/// <summary>Identifier of the entry. The in-memory handler generates <c>dl-</c> followed by GUID hex digits, truncated to 16 characters in total.</summary>
public required string DeadLetterId { get; init; }
/// <summary>Tenant that owns the delivery.</summary>
public required string TenantId { get; init; }
/// <summary>Identifier of the original delivery.</summary>
public required string DeliveryId { get; init; }
/// <summary>Channel the delivery was attempted on.</summary>
public required string ChannelType { get; init; }
/// <summary>Why the delivery was dead-lettered.</summary>
public required DeadLetterReason Reason { get; init; }
/// <summary>Additional detail; the in-memory handler sets it to the causing exception's message.</summary>
public string? ReasonDetails { get; init; }
/// <summary>Original payload supplied when the delivery was dead-lettered, if any.</summary>
public object? OriginalPayload { get; init; }
/// <summary>Full type name of the causing exception, if one was supplied.</summary>
public string? ExceptionType { get; init; }
/// <summary>Message of the causing exception, if one was supplied.</summary>
public string? ExceptionMessage { get; init; }
/// <summary>Number of delivery attempts. NOTE(review): the in-memory handler never sets it, so it stays 0 there.</summary>
public int AttemptCount { get; init; }
/// <summary>Time of the first attempt. NOTE(review): the in-memory handler sets it to the dead-letter time, not the real first attempt.</summary>
public DateTimeOffset FirstAttemptAt { get; init; }
/// <summary>Time the delivery was dead-lettered; used for ordering, filtering and purging.</summary>
public DateTimeOffset DeadLetteredAt { get; init; }
/// <summary>Time of the most recent retry, or <c>null</c> if never retried.</summary>
public DateTimeOffset? LastRetryAt { get; init; }
/// <summary>Number of retries performed on this entry.</summary>
public int RetryCount { get; init; }
/// <summary>Current status; defaults to <see cref="DeadLetterStatus.Pending"/>.</summary>
public DeadLetterStatus Status { get; init; } = DeadLetterStatus.Pending;
/// <summary>Reason supplied when the entry was discarded, if any.</summary>
public string? DiscardReason { get; init; }
}
/// <summary>
/// Status of a dead-lettered delivery.
/// </summary>
/// <remarks>
/// In the in-memory handler, entries start as <see cref="Pending"/>, become <see cref="Retried"/> on
/// retry and <see cref="Discarded"/> on discard. <see cref="Retrying"/> is counted in statistics but is
/// not assigned by the handler code in this file.
/// </remarks>
public enum DeadLetterStatus
{
Pending,
Retrying,
Retried,
Discarded
}
/// <summary>
/// Query for dead-lettered deliveries.
/// </summary>
/// <remarks>
/// Every filter is optional; unset filters are ignored. The in-memory handler orders results by
/// dead-letter time, newest first, before applying <see cref="Offset"/> and <see cref="Limit"/>.
/// </remarks>
public sealed record DeadLetterQuery
{
/// <summary>Exact dead-letter id to match; ignored when null or whitespace.</summary>
public string? Id { get; init; }
/// <summary>Reason to match.</summary>
public DeadLetterReason? Reason { get; init; }
/// <summary>Exact channel type to match; ignored when null or empty.</summary>
public string? ChannelType { get; init; }
/// <summary>Status to match.</summary>
public DeadLetterStatus? Status { get; init; }
/// <summary>Only entries dead-lettered strictly after this time.</summary>
public DateTimeOffset? After { get; init; }
/// <summary>Only entries dead-lettered strictly before this time.</summary>
public DateTimeOffset? Before { get; init; }
/// <summary>Maximum number of entries to return; defaults to 100.</summary>
public int Limit { get; init; } = 100;
/// <summary>Number of entries to skip before returning results; defaults to 0.</summary>
public int Offset { get; init; }
}
/// <summary>
/// Result of a retry attempt.
/// </summary>
public sealed record DeadLetterRetryResult
{
/// <summary>Identifier of the dead-letter entry the retry was requested for.</summary>
public required string DeadLetterId { get; init; }
/// <summary><c>true</c> when the retry was accepted.</summary>
public bool Success { get; init; }
/// <summary>Failure description (for example <c>"Not found"</c> from the in-memory handler); <c>null</c> on success.</summary>
public string? Error { get; init; }
/// <summary>Status reported for the entry after the attempt.</summary>
public DeadLetterStatus NewStatus { get; init; }
}
/// <summary>
/// Result of a bulk retry operation.
/// </summary>
public sealed record DeadLetterBulkRetryResult
{
/// <summary>Number of retries attempted.</summary>
public int Total { get; init; }
/// <summary>Number of retries that succeeded.</summary>
public int Succeeded { get; init; }
/// <summary>Number of retries that failed.</summary>
public int Failed { get; init; }
/// <summary>Individual retry results; empty by default.</summary>
public IReadOnlyList<DeadLetterRetryResult> Results { get; init; } = [];
}
/// <summary>
/// Statistics about dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterStats
{
/// <summary>Time the statistics were computed.</summary>
public DateTimeOffset Timestamp { get; init; }
/// <summary>Tenant the statistics cover, or <c>null</c> when aggregated across all tenants.</summary>
public string? TenantId { get; init; }
/// <summary>Total number of entries.</summary>
public int TotalCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Pending"/>.</summary>
public int PendingCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Retrying"/>.</summary>
public int RetryingCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Retried"/>.</summary>
public int RetriedCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Discarded"/>.</summary>
public int DiscardedCount { get; init; }
/// <summary>Entry counts grouped by reason; reasons with no entries are absent.</summary>
public IReadOnlyDictionary<DeadLetterReason, int> ByReason { get; init; } = new Dictionary<DeadLetterReason, int>();
/// <summary>Entry counts grouped by channel type; channels with no entries are absent.</summary>
public IReadOnlyDictionary<string, int> ByChannel { get; init; } = new Dictionary<string, int>();
/// <summary>Dead-letter time of the oldest entry, or <c>null</c> when there are no entries.</summary>
public DateTimeOffset? OldestDeadLetterAt { get; init; }
/// <summary>Dead-letter time of the newest entry, or <c>null</c> when there are no entries.</summary>
public DateTimeOffset? NewestDeadLetterAt { get; init; }
}
/// <summary>
/// Options for dead-letter handling.
/// </summary>
/// <remarks>
/// NOTE(review): the in-memory handler in this file stores these options but does not read any of
/// them; the per-property descriptions below are taken from names and defaults and should be
/// confirmed against whatever component consumes them.
/// </remarks>
public sealed class DeadLetterOptions
{
/// <summary>Configuration section name for these options.</summary>
public const string SectionName = "Notifier:Observability:DeadLetter";
/// <summary>Whether dead-letter handling is enabled; defaults to <c>true</c>.</summary>
public bool Enabled { get; set; } = true;
/// <summary>Maximum retry attempts; defaults to 3.</summary>
public int MaxRetryAttempts { get; set; } = 3;
/// <summary>Delay between retries; defaults to 5 minutes.</summary>
public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMinutes(5);
/// <summary>Retention period for entries; defaults to 30 days.</summary>
public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromDays(30);
/// <summary>Whether automatic purging is enabled; defaults to <c>true</c>.</summary>
public bool AutoPurge { get; set; } = true;
/// <summary>Interval between purges; defaults to 24 hours.</summary>
public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);
/// <summary>Alerting threshold; defaults to 100.</summary>
public int AlertThreshold { get; set; } = 100;
}
/// <summary>
/// In-memory implementation of dead-letter handler.
/// </summary>
/// <remarks>
/// Entries are kept per tenant in a <see cref="List{T}"/>; each list is used as its own lock object
/// and every read or write of a list happens while holding that lock. Nothing is persisted.
/// </remarks>
public sealed class InMemoryDeadLetterHandler : IDeadLetterHandler
{
    private readonly ConcurrentDictionary<string, List<DeadLetteredDelivery>> _deadLetters = new();
    private readonly DeadLetterOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<InMemoryDeadLetterHandler> _logger;

    /// <summary>
    /// Creates the handler.
    /// </summary>
    /// <param name="options">Dead-letter options; default options are used when the wrapper or its value is <c>null</c>.</param>
    /// <param name="timeProvider">Clock used for timestamps; falls back to <see cref="TimeProvider.System"/>.</param>
    /// <param name="metrics">Optional metrics sink notified for each dead-lettered delivery.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public InMemoryDeadLetterHandler(
        IOptions<DeadLetterOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<InMemoryDeadLetterHandler> logger)
    {
        _options = options?.Value ?? new DeadLetterOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Records a new pending dead-letter entry for the tenant and reports it to metrics.
    /// </summary>
    /// <returns>The created entry.</returns>
    public Task<DeadLetteredDelivery> DeadLetterAsync(
        string tenantId,
        string deliveryId,
        DeadLetterReason reason,
        string channelType,
        object? payload = null,
        Exception? exception = null,
        CancellationToken cancellationToken = default)
    {
        var now = _timeProvider.GetUtcNow();
        var deadLetter = new DeadLetteredDelivery
        {
            DeadLetterId = $"dl-{Guid.NewGuid():N}"[..16],
            TenantId = tenantId,
            DeliveryId = deliveryId,
            ChannelType = channelType,
            Reason = reason,
            ReasonDetails = exception?.Message,
            OriginalPayload = payload,
            ExceptionType = exception?.GetType().FullName,
            ExceptionMessage = exception?.Message,
            DeadLetteredAt = now,
            FirstAttemptAt = now,
            Status = DeadLetterStatus.Pending
        };

        var list = _deadLetters.GetOrAdd(tenantId, _ => []);
        lock (list)
        {
            list.Add(deadLetter);
        }

        _metrics?.RecordDeadLetter(tenantId, reason.ToString(), channelType);
        _logger.LogWarning("Dead-lettered delivery {DeliveryId} for tenant {TenantId}: {Reason}", deliveryId, tenantId, reason);
        return Task.FromResult(deadLetter);
    }

    /// <summary>
    /// Returns the tenant's entries matching <paramref name="query"/>, newest first.
    /// Without a query, at most 100 entries are returned.
    /// </summary>
    public Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>([]);
        }

        IEnumerable<DeadLetteredDelivery> filtered = Snapshot(list);

        if (query is not null)
        {
            if (!string.IsNullOrWhiteSpace(query.Id)) filtered = filtered.Where(d => d.DeadLetterId == query.Id);
            if (query.Reason.HasValue) filtered = filtered.Where(d => d.Reason == query.Reason.Value);
            if (!string.IsNullOrEmpty(query.ChannelType)) filtered = filtered.Where(d => d.ChannelType == query.ChannelType);
            if (query.Status.HasValue) filtered = filtered.Where(d => d.Status == query.Status.Value);
            if (query.After.HasValue) filtered = filtered.Where(d => d.DeadLetteredAt > query.After.Value);
            if (query.Before.HasValue) filtered = filtered.Where(d => d.DeadLetteredAt < query.Before.Value);
        }

        var result = filtered
            .OrderByDescending(d => d.DeadLetteredAt)
            .Skip(query?.Offset ?? 0)
            .Take(query?.Limit ?? 100)
            .ToList();

        return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>(result);
    }

    /// <summary>
    /// Marks an entry as retried, stamping the retry time and incrementing its retry count.
    /// This implementation only updates bookkeeping; it does not re-send the delivery.
    /// </summary>
    /// <returns>A successful result, or a failed result with error <c>"Not found"</c> when the entry does not exist.</returns>
    public Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(NotFound(deadLetterId));
        }

        // Look up and replace under a single lock so the update is based on the current
        // entry; a stale snapshot could otherwise overwrite a concurrent discard, or report
        // success for an entry that was purged in between.
        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(NotFound(deadLetterId));
            }

            var current = list[index];
            list[index] = current with
            {
                Status = DeadLetterStatus.Retried,
                LastRetryAt = _timeProvider.GetUtcNow(),
                RetryCount = current.RetryCount + 1
            };
        }

        _logger.LogInformation("Retrying dead-lettered delivery {DeadLetterId} for tenant {TenantId}", deadLetterId, tenantId);
        return Task.FromResult(new DeadLetterRetryResult { DeadLetterId = deadLetterId, Success = true, NewStatus = DeadLetterStatus.Retried });
    }

    /// <summary>
    /// Retries every pending entry returned by <see cref="GetAsync"/> for the given query.
    /// </summary>
    public async Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default)
    {
        var deadLetters = await GetAsync(tenantId, query, cancellationToken);
        var results = new List<DeadLetterRetryResult>();

        foreach (var dl in deadLetters.Where(d => d.Status == DeadLetterStatus.Pending))
        {
            results.Add(await RetryAsync(tenantId, dl.DeadLetterId, cancellationToken));
        }

        return new DeadLetterBulkRetryResult
        {
            Total = results.Count,
            Succeeded = results.Count(r => r.Success),
            Failed = results.Count(r => !r.Success),
            Results = results
        };
    }

    /// <summary>
    /// Marks an entry as discarded and records the optional reason.
    /// </summary>
    /// <returns><c>true</c> when the entry was found; otherwise <c>false</c>.</returns>
    public Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(false);
        }

        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(false);
            }

            list[index] = list[index] with { Status = DeadLetterStatus.Discarded, DiscardReason = reason };
        }

        _logger.LogInformation("Discarded dead-lettered delivery {DeadLetterId} for tenant {TenantId}: {Reason}", deadLetterId, tenantId, reason ?? "No reason");
        return Task.FromResult(true);
    }

    /// <summary>
    /// Computes counts by status, reason and channel for one tenant, or across all tenants when
    /// <paramref name="tenantId"/> is <c>null</c>.
    /// </summary>
    public Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        // Each list is copied under its own lock; enumerating a live list while another
        // thread adds, replaces or purges entries is not safe.
        List<DeadLetteredDelivery> all;
        if (tenantId is not null)
        {
            all = _deadLetters.TryGetValue(tenantId, out var tenantList) ? Snapshot(tenantList) : [];
        }
        else
        {
            all = _deadLetters.Values.SelectMany(v => Snapshot(v)).ToList();
        }

        return Task.FromResult(new DeadLetterStats
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalCount = all.Count,
            PendingCount = all.Count(d => d.Status == DeadLetterStatus.Pending),
            RetryingCount = all.Count(d => d.Status == DeadLetterStatus.Retrying),
            RetriedCount = all.Count(d => d.Status == DeadLetterStatus.Retried),
            DiscardedCount = all.Count(d => d.Status == DeadLetterStatus.Discarded),
            ByReason = all.GroupBy(d => d.Reason).ToDictionary(g => g.Key, g => g.Count()),
            ByChannel = all.GroupBy(d => d.ChannelType).ToDictionary(g => g.Key, g => g.Count()),
            OldestDeadLetterAt = all.MinBy(d => d.DeadLetteredAt)?.DeadLetteredAt,
            NewestDeadLetterAt = all.MaxBy(d => d.DeadLetteredAt)?.DeadLetteredAt
        });
    }

    /// <summary>
    /// Removes entries dead-lettered earlier than the current time minus <paramref name="olderThan"/>,
    /// for one tenant or for all tenants when <paramref name="tenantId"/> is <c>null</c>.
    /// </summary>
    /// <returns>The number of entries removed.</returns>
    public Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default)
    {
        var cutoff = _timeProvider.GetUtcNow() - olderThan;
        var purged = 0;
        List<string> tenants = tenantId is not null ? [tenantId] : _deadLetters.Keys.ToList();

        foreach (var t in tenants)
        {
            if (!_deadLetters.TryGetValue(t, out var list))
            {
                continue;
            }

            lock (list)
            {
                purged += list.RemoveAll(d => d.DeadLetteredAt < cutoff);
            }
        }

        _logger.LogInformation("Purged {Count} dead-lettered deliveries older than {OlderThan}", purged, olderThan);
        return Task.FromResult(purged);
    }

    /// <summary>Copies a tenant list while holding its lock.</summary>
    private static List<DeadLetteredDelivery> Snapshot(List<DeadLetteredDelivery> list)
    {
        lock (list)
        {
            return list.ToList();
        }
    }

    /// <summary>Builds the result returned when the requested entry does not exist.</summary>
    private static DeadLetterRetryResult NotFound(string deadLetterId) => new()
    {
        DeadLetterId = deadLetterId,
        Success = false,
        Error = "Not found",
        NewStatus = DeadLetterStatus.Pending
    };
}
using System.Collections.Concurrent;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Handles notifications that have failed permanently after all retries.
/// </summary>
public interface IDeadLetterHandler
{
/// <summary>
/// Moves a delivery to the dead-letter queue.
/// </summary>
/// <param name="tenantId">Tenant that owns the delivery.</param>
/// <param name="deliveryId">Identifier of the failed delivery.</param>
/// <param name="reason">Why the delivery is being dead-lettered.</param>
/// <param name="channelType">Channel the delivery was attempted on.</param>
/// <param name="payload">Optional original payload to keep with the entry.</param>
/// <param name="exception">Optional exception that caused the failure.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The created dead-letter entry.</returns>
Task<DeadLetteredDelivery> DeadLetterAsync(
string tenantId,
string deliveryId,
DeadLetterReason reason,
string channelType,
object? payload = null,
Exception? exception = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets dead-lettered deliveries for a tenant.
/// </summary>
/// <param name="tenantId">Tenant whose entries are returned.</param>
/// <param name="query">Optional filter and paging; <c>null</c> applies no filter.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The matching entries; empty when there are none.</returns>
Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
string tenantId,
DeadLetterQuery? query = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Retries a dead-lettered delivery.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The outcome of the retry, including the entry's new status.</returns>
Task<DeadLetterRetryResult> RetryAsync(
string tenantId,
string deadLetterId,
CancellationToken cancellationToken = default);
/// <summary>
/// Retries all matching dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant whose entries are retried.</param>
/// <param name="query">Optional filter selecting the entries to retry.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Aggregate counts plus the individual retry results.</returns>
Task<DeadLetterBulkRetryResult> RetryBulkAsync(
string tenantId,
DeadLetterQuery? query = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Discards a dead-lettered delivery.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
/// <param name="reason">Optional explanation recorded with the entry.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns><c>true</c> when the entry was found and discarded; otherwise <c>false</c>.</returns>
Task<bool> DiscardAsync(
string tenantId,
string deadLetterId,
string? reason = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets statistics about dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant to report on, or <c>null</c> to aggregate across all tenants.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Counts by status, reason and channel.</returns>
Task<DeadLetterStats> GetStatsAsync(
string? tenantId = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Purges old dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant to purge, or <c>null</c> to purge across all tenants.</param>
/// <param name="olderThan">Minimum age of the entries to remove.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The number of entries removed.</returns>
Task<int> PurgeAsync(
string? tenantId,
TimeSpan olderThan,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Reason for dead-lettering.
/// </summary>
/// <remarks>
/// The in-memory handler passes the member name (via <c>ToString()</c>) to the metrics sink, so
/// renaming a member changes the recorded metric value. Members carry no explicit values, so their
/// numeric values follow declaration order.
/// </remarks>
public enum DeadLetterReason
{
MaxRetriesExceeded,
InvalidPayload,
ChannelUnavailable,
AuthenticationFailed,
RateLimited,
TemplateRenderFailed,
ConfigurationError,
UnknownError
}
/// <summary>
/// A dead-lettered delivery.
/// </summary>
/// <remarks>
/// Immutable snapshot; the in-memory handler replaces the stored instance (via <c>with</c>) when an
/// entry is retried or discarded.
/// </remarks>
public sealed record DeadLetteredDelivery
{
/// <summary>Identifier of the entry. The in-memory handler generates <c>dl-</c> followed by GUID hex digits, truncated to 16 characters in total.</summary>
public required string DeadLetterId { get; init; }
/// <summary>Tenant that owns the delivery.</summary>
public required string TenantId { get; init; }
/// <summary>Identifier of the original delivery.</summary>
public required string DeliveryId { get; init; }
/// <summary>Channel the delivery was attempted on.</summary>
public required string ChannelType { get; init; }
/// <summary>Why the delivery was dead-lettered.</summary>
public required DeadLetterReason Reason { get; init; }
/// <summary>Additional detail; the in-memory handler sets it to the causing exception's message.</summary>
public string? ReasonDetails { get; init; }
/// <summary>Original payload supplied when the delivery was dead-lettered, if any.</summary>
public object? OriginalPayload { get; init; }
/// <summary>Full type name of the causing exception, if one was supplied.</summary>
public string? ExceptionType { get; init; }
/// <summary>Message of the causing exception, if one was supplied.</summary>
public string? ExceptionMessage { get; init; }
/// <summary>Number of delivery attempts. NOTE(review): the in-memory handler never sets it, so it stays 0 there.</summary>
public int AttemptCount { get; init; }
/// <summary>Time of the first attempt. NOTE(review): the in-memory handler sets it to the dead-letter time, not the real first attempt.</summary>
public DateTimeOffset FirstAttemptAt { get; init; }
/// <summary>Time the delivery was dead-lettered; used for ordering and filtering.</summary>
public DateTimeOffset DeadLetteredAt { get; init; }
/// <summary>Time of the most recent retry, or <c>null</c> if never retried.</summary>
public DateTimeOffset? LastRetryAt { get; init; }
/// <summary>Number of retries performed on this entry.</summary>
public int RetryCount { get; init; }
/// <summary>Current status; defaults to <see cref="DeadLetterStatus.Pending"/>.</summary>
public DeadLetterStatus Status { get; init; } = DeadLetterStatus.Pending;
/// <summary>Reason supplied when the entry was discarded, if any.</summary>
public string? DiscardReason { get; init; }
}
/// <summary>
/// Status of a dead-lettered delivery.
/// </summary>
/// <remarks>
/// In the in-memory handler, entries start as <see cref="Pending"/> and become <see cref="Retried"/>
/// on retry. NOTE(review): <see cref="Retrying"/> is not assigned by the handler code visible here.
/// </remarks>
public enum DeadLetterStatus
{
Pending,
Retrying,
Retried,
Discarded
}
/// <summary>
/// Query for dead-lettered deliveries.
/// </summary>
/// <remarks>
/// Every filter is optional; unset filters are ignored. The in-memory handler orders results by
/// dead-letter time, newest first, before applying <see cref="Offset"/> and <see cref="Limit"/>.
/// </remarks>
public sealed record DeadLetterQuery
{
/// <summary>Exact dead-letter id to match; ignored when null or whitespace.</summary>
public string? Id { get; init; }
/// <summary>Reason to match.</summary>
public DeadLetterReason? Reason { get; init; }
/// <summary>Exact channel type to match; ignored when null or empty.</summary>
public string? ChannelType { get; init; }
/// <summary>Status to match.</summary>
public DeadLetterStatus? Status { get; init; }
/// <summary>Only entries dead-lettered strictly after this time.</summary>
public DateTimeOffset? After { get; init; }
/// <summary>Only entries dead-lettered strictly before this time.</summary>
public DateTimeOffset? Before { get; init; }
/// <summary>Maximum number of entries to return; defaults to 100.</summary>
public int Limit { get; init; } = 100;
/// <summary>Number of entries to skip before returning results; defaults to 0.</summary>
public int Offset { get; init; }
}
/// <summary>
/// Result of a retry attempt.
/// </summary>
public sealed record DeadLetterRetryResult
{
/// <summary>Identifier of the dead-letter entry the retry was requested for.</summary>
public required string DeadLetterId { get; init; }
/// <summary><c>true</c> when the retry was accepted.</summary>
public bool Success { get; init; }
/// <summary>Failure description (for example <c>"Not found"</c> from the in-memory handler); <c>null</c> on success.</summary>
public string? Error { get; init; }
/// <summary>Status reported for the entry after the attempt.</summary>
public DeadLetterStatus NewStatus { get; init; }
}
/// <summary>
/// Result of a bulk retry operation.
/// </summary>
public sealed record DeadLetterBulkRetryResult
{
/// <summary>Number of retries attempted.</summary>
public int Total { get; init; }
/// <summary>Number of retries that succeeded.</summary>
public int Succeeded { get; init; }
/// <summary>Number of retries that failed.</summary>
public int Failed { get; init; }
/// <summary>Individual retry results; empty by default.</summary>
public IReadOnlyList<DeadLetterRetryResult> Results { get; init; } = [];
}
/// <summary>
/// Statistics about dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterStats
{
/// <summary>Time the statistics were computed.</summary>
public DateTimeOffset Timestamp { get; init; }
/// <summary>Tenant the statistics cover, or <c>null</c> when aggregated across all tenants.</summary>
public string? TenantId { get; init; }
/// <summary>Total number of entries.</summary>
public int TotalCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Pending"/>.</summary>
public int PendingCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Retrying"/>.</summary>
public int RetryingCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Retried"/>.</summary>
public int RetriedCount { get; init; }
/// <summary>Entries with status <see cref="DeadLetterStatus.Discarded"/>.</summary>
public int DiscardedCount { get; init; }
/// <summary>Entry counts grouped by reason; empty by default.</summary>
public IReadOnlyDictionary<DeadLetterReason, int> ByReason { get; init; } = new Dictionary<DeadLetterReason, int>();
/// <summary>Entry counts grouped by channel type; empty by default.</summary>
public IReadOnlyDictionary<string, int> ByChannel { get; init; } = new Dictionary<string, int>();
/// <summary>Dead-letter time of the oldest entry, or <c>null</c> when there are no entries.</summary>
public DateTimeOffset? OldestDeadLetterAt { get; init; }
/// <summary>Dead-letter time of the newest entry, or <c>null</c> when there are no entries.</summary>
public DateTimeOffset? NewestDeadLetterAt { get; init; }
}
/// <summary>
/// Options for dead-letter handling.
/// </summary>
/// <remarks>
/// NOTE(review): the in-memory handler code visible in this file stores these options but does not
/// read any of them; the per-property descriptions below are taken from names and defaults and
/// should be confirmed against whatever component consumes them.
/// </remarks>
public sealed class DeadLetterOptions
{
/// <summary>Configuration section name for these options.</summary>
public const string SectionName = "Notifier:Observability:DeadLetter";
/// <summary>Whether dead-letter handling is enabled; defaults to <c>true</c>.</summary>
public bool Enabled { get; set; } = true;
/// <summary>Maximum retry attempts; defaults to 3.</summary>
public int MaxRetryAttempts { get; set; } = 3;
/// <summary>Delay between retries; defaults to 5 minutes.</summary>
public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMinutes(5);
/// <summary>Retention period for entries; defaults to 30 days.</summary>
public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromDays(30);
/// <summary>Whether automatic purging is enabled; defaults to <c>true</c>.</summary>
public bool AutoPurge { get; set; } = true;
/// <summary>Interval between purges; defaults to 24 hours.</summary>
public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);
/// <summary>Alerting threshold; defaults to 100.</summary>
public int AlertThreshold { get; set; } = 100;
}
/// <summary>
/// In-memory implementation of dead-letter handler.
/// </summary>
public sealed class InMemoryDeadLetterHandler : IDeadLetterHandler
{
private readonly ConcurrentDictionary<string, List<DeadLetteredDelivery>> _deadLetters = new();
private readonly DeadLetterOptions _options;
private readonly TimeProvider _timeProvider;
private readonly INotifierMetrics? _metrics;
private readonly ILogger<InMemoryDeadLetterHandler> _logger;
/// <summary>
/// Creates the handler, substituting defaults for missing options and clock.
/// </summary>
/// <param name="options">Dead-letter options; default options are used when the wrapper or its value is <c>null</c>.</param>
/// <param name="timeProvider">Clock used for timestamps; falls back to <see cref="TimeProvider.System"/>.</param>
/// <param name="metrics">Optional metrics sink.</param>
/// <param name="logger">Logger; required.</param>
/// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
public InMemoryDeadLetterHandler(
    IOptions<DeadLetterOptions> options,
    TimeProvider timeProvider,
    INotifierMetrics? metrics,
    ILogger<InMemoryDeadLetterHandler> logger)
{
    DeadLetterOptions? configured = null;
    if (options is not null)
    {
        configured = options.Value;
    }

    _options = configured ?? new DeadLetterOptions();
    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
    _metrics = metrics;

    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _logger = logger;
}
/// <summary>
/// Records a new pending dead-letter entry for the tenant, reports it to the metrics sink
/// (when one is configured) and logs a warning.
/// </summary>
/// <returns>The created entry.</returns>
public Task<DeadLetteredDelivery> DeadLetterAsync(
    string tenantId,
    string deliveryId,
    DeadLetterReason reason,
    string channelType,
    object? payload = null,
    Exception? exception = null,
    CancellationToken cancellationToken = default)
{
    var timestamp = _timeProvider.GetUtcNow();
    var generatedId = $"dl-{Guid.NewGuid():N}"[..16];
    var failureMessage = exception?.Message;

    var entry = new DeadLetteredDelivery
    {
        DeadLetterId = generatedId,
        TenantId = tenantId,
        DeliveryId = deliveryId,
        ChannelType = channelType,
        Reason = reason,
        ReasonDetails = failureMessage,
        OriginalPayload = payload,
        ExceptionType = exception?.GetType().FullName,
        ExceptionMessage = failureMessage,
        DeadLetteredAt = timestamp,
        FirstAttemptAt = timestamp,
        Status = DeadLetterStatus.Pending
    };

    // One list per tenant; the list instance doubles as its own lock.
    var tenantEntries = _deadLetters.GetOrAdd(tenantId, _ => []);
    lock (tenantEntries)
    {
        tenantEntries.Add(entry);
    }

    if (_metrics is not null)
    {
        _metrics.RecordDeadLetter(tenantId, reason.ToString(), channelType);
    }

    _logger.LogWarning("Dead-lettered delivery {DeliveryId} for tenant {TenantId}: {Reason}", deliveryId, tenantId, reason);
    return Task.FromResult(entry);
}
/// <summary>
/// Returns the tenant's entries that satisfy <paramref name="query"/>, newest first.
/// Without a query no filter is applied and at most 100 entries are returned.
/// </summary>
public Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
    string tenantId,
    DeadLetterQuery? query = null,
    CancellationToken cancellationToken = default)
{
    if (!_deadLetters.TryGetValue(tenantId, out var tenantEntries))
    {
        return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>([]);
    }

    List<DeadLetteredDelivery> snapshot;
    lock (tenantEntries)
    {
        snapshot = tenantEntries.ToList();
    }

    // A filter only applies when its query value is set.
    bool Matches(DeadLetteredDelivery d)
    {
        if (query is null)
        {
            return true;
        }

        return (string.IsNullOrWhiteSpace(query.Id) || d.DeadLetterId == query.Id)
            && (!query.Reason.HasValue || d.Reason == query.Reason.Value)
            && (string.IsNullOrEmpty(query.ChannelType) || d.ChannelType == query.ChannelType)
            && (!query.Status.HasValue || d.Status == query.Status.Value)
            && (!query.After.HasValue || d.DeadLetteredAt > query.After.Value)
            && (!query.Before.HasValue || d.DeadLetteredAt < query.Before.Value);
    }

    var skip = query?.Offset ?? 0;
    var take = query?.Limit ?? 100;

    var page = snapshot
        .Where(Matches)
        .OrderByDescending(d => d.DeadLetteredAt)
        .Skip(skip)
        .Take(take)
        .ToList();

    return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>(page);
}
/// <summary>
/// Marks a dead-letter entry as retried and bumps its retry counter.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the entry to retry.</param>
/// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
/// <returns>
/// A completed task with <c>Success = true</c> and status <c>Retried</c> when the entry was
/// found and updated; otherwise <c>Success = false</c> with error "Not found".
/// </returns>
public Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default)
{
    var updated = false;

    if (_deadLetters.TryGetValue(tenantId, out var list))
    {
        // Look up and replace inside ONE critical section. Reading the entry under one
        // lock and writing it back under another would overwrite concurrent changes
        // (e.g. a discard) with a stale copy, and would report success even if the
        // entry had been purged in between.
        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index >= 0)
            {
                var current = list[index];
                list[index] = current with
                {
                    Status = DeadLetterStatus.Retried,
                    LastRetryAt = _timeProvider.GetUtcNow(),
                    RetryCount = current.RetryCount + 1
                };
                updated = true;
            }
        }
    }

    if (!updated)
    {
        return Task.FromResult(new DeadLetterRetryResult
        {
            DeadLetterId = deadLetterId,
            Success = false,
            Error = "Not found",
            NewStatus = DeadLetterStatus.Pending
        });
    }

    _logger.LogInformation("Retrying dead-lettered delivery {DeadLetterId} for tenant {TenantId}", deadLetterId, tenantId);

    return Task.FromResult(new DeadLetterRetryResult
    {
        DeadLetterId = deadLetterId,
        Success = true,
        NewStatus = DeadLetterStatus.Retried
    });
}
/// <summary>
/// Retries every pending dead-letter entry selected by <paramref name="query"/>.
/// </summary>
/// <param name="tenantId">Tenant whose entries are retried.</param>
/// <param name="query">Optional filter/paging passed to <c>GetAsync</c>; the pending filter is applied to that page.</param>
/// <param name="cancellationToken">Forwarded to <c>GetAsync</c> and <c>RetryAsync</c>.</param>
/// <returns>Totals and the individual result of each retry, in the order the entries were returned.</returns>
public async Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default)
{
    var candidates = await GetAsync(tenantId, query, cancellationToken);
    var outcomes = new List<DeadLetterRetryResult>();

    // Entries are retried one at a time.
    foreach (var candidate in candidates)
    {
        if (candidate.Status != DeadLetterStatus.Pending)
        {
            continue;
        }

        var outcome = await RetryAsync(tenantId, candidate.DeadLetterId, cancellationToken);
        outcomes.Add(outcome);
    }

    var succeeded = outcomes.Count(outcome => outcome.Success);

    return new DeadLetterBulkRetryResult
    {
        Total = outcomes.Count,
        Succeeded = succeeded,
        Failed = outcomes.Count - succeeded,
        Results = outcomes
    };
}
/// <summary>
/// Marks a dead-letter entry as discarded and records the optional reason on it.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the entry to discard.</param>
/// <param name="reason">Optional reason stored on the entry; logged as "No reason" when null.</param>
/// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
/// <returns>A completed task: <c>true</c> when the entry was found and updated, otherwise <c>false</c>.</returns>
public Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default)
{
    if (!_deadLetters.TryGetValue(tenantId, out var bucket))
    {
        return Task.FromResult(false);
    }

    bool discarded;
    lock (bucket)
    {
        var position = bucket.FindIndex(entry => entry.DeadLetterId == deadLetterId);
        discarded = position >= 0;
        if (discarded)
        {
            bucket[position] = bucket[position] with { Status = DeadLetterStatus.Discarded, DiscardReason = reason };
        }
    }

    if (!discarded)
    {
        return Task.FromResult(false);
    }

    _logger.LogInformation("Discarded dead-lettered delivery {DeadLetterId} for tenant {TenantId}: {Reason}", deadLetterId, tenantId, reason ?? "No reason");
    return Task.FromResult(true);
}
/// <summary>
/// Computes aggregate dead-letter statistics for one tenant or, when
/// <paramref name="tenantId"/> is null, across all tenants.
/// </summary>
/// <param name="tenantId">Tenant to report on, or null for every tenant.</param>
/// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
/// <returns>A completed task with counts per status, reason and channel plus oldest/newest timestamps.</returns>
public Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
{
    // Snapshot each per-tenant list under the same lock its writers take. Enumerating
    // a list while another thread adds to or purges it can throw
    // "Collection was modified".
    var all = new List<DeadLetteredDelivery>();
    if (tenantId is not null)
    {
        if (_deadLetters.TryGetValue(tenantId, out var single))
        {
            lock (single)
            {
                all.AddRange(single);
            }
        }
    }
    else
    {
        foreach (var bucket in _deadLetters.Values)
        {
            lock (bucket)
            {
                all.AddRange(bucket);
            }
        }
    }

    return Task.FromResult(new DeadLetterStats
    {
        Timestamp = _timeProvider.GetUtcNow(),
        TenantId = tenantId,
        TotalCount = all.Count,
        PendingCount = all.Count(d => d.Status == DeadLetterStatus.Pending),
        RetryingCount = all.Count(d => d.Status == DeadLetterStatus.Retrying),
        RetriedCount = all.Count(d => d.Status == DeadLetterStatus.Retried),
        DiscardedCount = all.Count(d => d.Status == DeadLetterStatus.Discarded),
        ByReason = all.GroupBy(d => d.Reason).ToDictionary(g => g.Key, g => g.Count()),
        ByChannel = all.GroupBy(d => d.ChannelType).ToDictionary(g => g.Key, g => g.Count()),
        OldestDeadLetterAt = all.MinBy(d => d.DeadLetteredAt)?.DeadLetteredAt,
        NewestDeadLetterAt = all.MaxBy(d => d.DeadLetteredAt)?.DeadLetteredAt
    });
}
/// <summary>
/// Removes dead-letter entries whose dead-letter time is older than the given age.
/// </summary>
/// <param name="tenantId">Tenant to purge, or null to purge every tenant.</param>
/// <param name="olderThan">Age threshold; entries dead-lettered before <c>now - olderThan</c> are removed.</param>
/// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
/// <returns>A completed task with the number of entries removed.</returns>
public Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default)
{
    var threshold = _timeProvider.GetUtcNow() - olderThan;

    List<string> tenantKeys;
    if (tenantId is null)
    {
        tenantKeys = _deadLetters.Keys.ToList();
    }
    else
    {
        tenantKeys = [tenantId];
    }

    var removedTotal = 0;
    foreach (var key in tenantKeys)
    {
        if (_deadLetters.TryGetValue(key, out var bucket))
        {
            lock (bucket)
            {
                removedTotal += bucket.RemoveAll(entry => entry.DeadLetteredAt < threshold);
            }
        }
    }

    _logger.LogInformation("Purged {Count} dead-lettered deliveries older than {OlderThan}", removedTotal, olderThan);
    return Task.FromResult(removedTotal);
}
}

View File

@@ -1,395 +1,395 @@
using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Tracing service for the Notifier module.
/// Provides distributed tracing capabilities using OpenTelemetry-compatible Activity API.
/// Every span-starting method may return <c>null</c> (the default implementation does so
/// when tracing is disabled or no activity is created), so callers must treat the
/// returned activity as optional.
/// </summary>
public interface INotifierTracing
{
/// <summary>
/// Starts a delivery span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="deliveryId">Delivery identifier; tagged as <c>delivery.id</c>.</param>
/// <param name="channelType">Channel type; tagged as <c>channel.type</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Starts an escalation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="incidentId">Incident identifier; tagged as <c>incident.id</c>.</param>
/// <param name="policyId">Policy identifier; tagged as <c>policy.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId);
/// <summary>
/// Starts a digest generation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="scheduleId">Schedule identifier; tagged as <c>schedule.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartDigestSpan(string tenantId, string scheduleId);
/// <summary>
/// Starts a template render span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="templateId">Template identifier; tagged as <c>template.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartTemplateRenderSpan(string tenantId, string templateId);
/// <summary>
/// Starts a correlation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="eventKind">Event kind; tagged as <c>event.kind</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartCorrelationSpan(string tenantId, string eventKind);
/// <summary>
/// Starts a webhook validation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="channelId">Channel identifier; tagged as <c>channel.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartWebhookValidationSpan(string tenantId, string channelId);
/// <summary>
/// Adds an event to the given span; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to receive the event.</param>
/// <param name="name">Event name.</param>
/// <param name="attributes">Optional event attributes; the default implementation skips entries whose value is <c>null</c>.</param>
void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null);
/// <summary>
/// Sets span status to error; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to mark as failed.</param>
/// <param name="exception">Optional failure; the default implementation records its type and message as tags.</param>
/// <param name="description">Optional status description; the default implementation falls back to the exception message.</param>
void SetError(Activity? activity, Exception? exception = null, string? description = null);
/// <summary>
/// Sets span status to ok; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to mark as successful.</param>
void SetOk(Activity? activity);
/// <summary>
/// Adds custom tags to a span; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to tag.</param>
/// <param name="tags">Tags to set; the default implementation skips entries whose value is <c>null</c>.</param>
void AddTags(Activity? activity, IDictionary<string, object?> tags);
/// <summary>
/// Creates a linked span (for batch operations).
/// </summary>
/// <param name="operationName">Name of the new span.</param>
/// <param name="parentContext">Context to reference; the default implementation attaches it as an activity link rather than as the parent.</param>
/// <param name="tags">Optional tags applied to the new span.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null);
}
/// <summary>
/// Options for tracing service.
/// </summary>
public sealed class NotifierTracingOptions
{
/// <summary>
/// Configuration section path these options are read from.
/// </summary>
public const string SectionName = "Notifier:Observability:Tracing";
/// <summary>
/// Whether tracing is enabled. Defaults to <c>true</c>.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
/// Activity source name. Defaults to <c>StellaOps.Notifier</c>.
/// </summary>
public string SourceName { get; set; } = "StellaOps.Notifier";
/// <summary>
/// Whether to include sensitive data in traces. Defaults to <c>false</c>.
/// DefaultNotifierTracing uses it to decide whether exception stack traces are tagged.
/// </summary>
public bool IncludeSensitiveData { get; set; }
/// <summary>
/// Sampling ratio (0.0 to 1.0). Defaults to 1.0.
/// NOTE(review): DefaultNotifierTracing in this file never reads this value —
/// presumably it is applied where the trace pipeline is configured; confirm.
/// </summary>
public double SamplingRatio { get; set; } = 1.0;
/// <summary>
/// Maximum number of attributes per span. Defaults to 128.
/// NOTE(review): not enforced by DefaultNotifierTracing in this file — confirm where it is consumed.
/// </summary>
public int MaxAttributesPerSpan { get; set; } = 128;
/// <summary>
/// Maximum number of events per span. Defaults to 128.
/// NOTE(review): not enforced by DefaultNotifierTracing in this file — confirm where it is consumed.
/// </summary>
public int MaxEventsPerSpan { get; set; } = 128;
}
/// <summary>
/// <see cref="INotifierTracing"/> implementation that emits spans through its own
/// <see cref="ActivitySource"/>, named after <see cref="NotifierTracingOptions.SourceName"/>.
/// </summary>
public sealed class DefaultNotifierTracing : INotifierTracing, IDisposable
{
    private readonly ActivitySource _activitySource;
    private readonly NotifierTracingOptions _options;
    private readonly ILogger<DefaultNotifierTracing> _logger;

    /// <summary>
    /// Creates the tracer. Missing options fall back to defaults; a missing logger is rejected.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public DefaultNotifierTracing(
        IOptions<NotifierTracingOptions> options,
        ILogger<DefaultNotifierTracing> logger)
    {
        _options = options?.Value ?? new NotifierTracingOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _activitySource = new ActivitySource(_options.SourceName, "1.0.0");
    }

    /// <inheritdoc />
    public Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType)
        => StartSpan(
            "notifier.delivery",
            ("tenant.id", tenantId),
            ("delivery.id", deliveryId),
            ("channel.type", channelType));

    /// <inheritdoc />
    public Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId)
        => StartSpan(
            "notifier.escalation",
            ("tenant.id", tenantId),
            ("incident.id", incidentId),
            ("policy.id", policyId));

    /// <inheritdoc />
    public Activity? StartDigestSpan(string tenantId, string scheduleId)
        => StartSpan(
            "notifier.digest",
            ("tenant.id", tenantId),
            ("schedule.id", scheduleId));

    /// <inheritdoc />
    public Activity? StartTemplateRenderSpan(string tenantId, string templateId)
        => StartSpan(
            "notifier.template.render",
            ("tenant.id", tenantId),
            ("template.id", templateId));

    /// <inheritdoc />
    public Activity? StartCorrelationSpan(string tenantId, string eventKind)
        => StartSpan(
            "notifier.correlation",
            ("tenant.id", tenantId),
            ("event.kind", eventKind));

    /// <inheritdoc />
    public Activity? StartWebhookValidationSpan(string tenantId, string channelId)
        => StartSpan(
            "notifier.webhook.validation",
            ("tenant.id", tenantId),
            ("channel.id", channelId));

    /// <inheritdoc />
    public void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null)
    {
        if (activity is null)
        {
            return;
        }

        var eventTags = new ActivityTagsCollection();
        if (attributes is not null)
        {
            foreach (var attribute in attributes)
            {
                // Attributes without a value are not forwarded.
                if (attribute.Value is null)
                {
                    continue;
                }

                eventTags.Add(attribute.Key, attribute.Value);
            }
        }

        activity.AddEvent(new ActivityEvent(name, tags: eventTags));
    }

    /// <inheritdoc />
    public void SetError(Activity? activity, Exception? exception = null, string? description = null)
    {
        if (activity is null)
        {
            return;
        }

        // An explicit description wins; otherwise the exception message is used.
        activity.SetStatus(ActivityStatusCode.Error, description ?? exception?.Message);

        if (exception is null)
        {
            return;
        }

        activity.SetTag("exception.type", exception.GetType().FullName);
        activity.SetTag("exception.message", exception.Message);

        // Stack traces are only attached when sensitive data is allowed.
        if (_options.IncludeSensitiveData)
        {
            activity.SetTag("exception.stacktrace", exception.StackTrace);
        }
    }

    /// <inheritdoc />
    public void SetOk(Activity? activity)
    {
        if (activity is not null)
        {
            activity.SetStatus(ActivityStatusCode.Ok);
        }
    }

    /// <inheritdoc />
    public void AddTags(Activity? activity, IDictionary<string, object?> tags)
    {
        if (activity is null)
        {
            return;
        }

        foreach (var tag in tags)
        {
            if (tag.Value is null)
            {
                continue;
            }

            activity.SetTag(tag.Key, tag.Value);
        }
    }

    /// <inheritdoc />
    public Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        // The supplied context is attached as a link; no explicit parent is passed.
        ActivityLink[] links = { new ActivityLink(parentContext) };
        var span = _activitySource.StartActivity(
            operationName,
            ActivityKind.Internal,
            parentContext: default,
            links: links);

        if (span is null || tags is null)
        {
            return span;
        }

        AddTags(span, tags);
        return span;
    }

    /// <summary>
    /// Disposes the underlying activity source.
    /// </summary>
    public void Dispose()
    {
        _activitySource.Dispose();
    }

    // Shared body of the Start*Span methods: honours the Enabled switch, starts an
    // internal activity and applies the identifying tags in the order given.
    private Activity? StartSpan(string operationName, params (string Key, string Value)[] tags)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        var span = _activitySource.StartActivity(operationName, ActivityKind.Internal);
        if (span is null)
        {
            return null;
        }

        foreach (var (key, value) in tags)
        {
            span.SetTag(key, value);
        }

        return span;
    }
}
/// <summary>
/// Extension methods for Activity-based tracing.
/// Every helper is a no-op when invoked on a <c>null</c> activity.
/// </summary>
public static class ActivityExtensions
{
    /// <summary>
    /// Records a delivery result on the activity.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="success">Stored as the <c>delivery.success</c> tag.</param>
    /// <param name="httpStatusCode">Stored as <c>http.status_code</c> when provided.</param>
    /// <param name="error">Stored as <c>delivery.error</c> when non-empty.</param>
    public static void RecordDeliveryResult(this Activity? activity, bool success, int? httpStatusCode = null, string? error = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("delivery.success", success);

        if (httpStatusCode.HasValue)
        {
            activity.SetTag("http.status_code", httpStatusCode.Value);
        }

        if (!string.IsNullOrEmpty(error))
        {
            activity.SetTag("delivery.error", error);
        }
    }

    /// <summary>
    /// Records an escalation level change as span tags and an
    /// <c>escalation.level.changed</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="level">New escalation level.</param>
    /// <param name="target">Optional escalation target; omitted from both the span tag and the event when null or empty.</param>
    public static void RecordEscalationLevel(this Activity? activity, int level, string? target = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("escalation.level", level);

        var eventTags = new ActivityTagsCollection
        {
            { "level", level }
        };

        // A missing target is left out of the event as well as the span tag, so no
        // null-valued attribute is emitted.
        if (!string.IsNullOrEmpty(target))
        {
            activity.SetTag("escalation.target", target);
            eventTags.Add("target", target);
        }

        activity.AddEvent(new ActivityEvent("escalation.level.changed", tags: eventTags));
    }

    /// <summary>
    /// Records storm detection as a <c>storm.detected</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="eventKind">Stored as the <c>event_kind</c> event tag.</param>
    /// <param name="eventCount">Stored as the <c>event_count</c> event tag.</param>
    public static void RecordStormDetected(this Activity? activity, string eventKind, int eventCount)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("storm.detected", tags: new ActivityTagsCollection
        {
            { "event_kind", eventKind },
            { "event_count", eventCount }
        }));
    }

    /// <summary>
    /// Records fallback attempt as a <c>fallback.attempted</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="fromChannel">Stored as the <c>from_channel</c> event tag.</param>
    /// <param name="toChannel">Stored as the <c>to_channel</c> event tag.</param>
    public static void RecordFallback(this Activity? activity, string fromChannel, string toChannel)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("fallback.attempted", tags: new ActivityTagsCollection
        {
            { "from_channel", fromChannel },
            { "to_channel", toChannel }
        }));
    }

    /// <summary>
    /// Records template render details.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="format">Stored as the <c>template.format</c> tag.</param>
    /// <param name="outputLength">Stored as the <c>template.output_length</c> tag.</param>
    public static void RecordTemplateRender(this Activity? activity, string format, int outputLength)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("template.format", format);
        activity.SetTag("template.output_length", outputLength);
    }

    /// <summary>
    /// Records correlation result.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="correlationKey">Stored as the <c>correlation.key</c> tag.</param>
    /// <param name="isNewIncident">Stored as the <c>correlation.new_incident</c> tag.</param>
    public static void RecordCorrelationResult(this Activity? activity, string correlationKey, bool isNewIncident)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("correlation.key", correlationKey);
        activity.SetTag("correlation.new_incident", isNewIncident);
    }
}
using System.Diagnostics;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Tracing service for the Notifier module.
/// Provides distributed tracing capabilities using OpenTelemetry-compatible Activity API.
/// Every span-starting method may return <c>null</c> (the default implementation does so
/// when tracing is disabled or no activity is created), so callers must treat the
/// returned activity as optional.
/// </summary>
public interface INotifierTracing
{
/// <summary>
/// Starts a delivery span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="deliveryId">Delivery identifier; tagged as <c>delivery.id</c>.</param>
/// <param name="channelType">Channel type; tagged as <c>channel.type</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Starts an escalation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="incidentId">Incident identifier; tagged as <c>incident.id</c>.</param>
/// <param name="policyId">Policy identifier; tagged as <c>policy.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId);
/// <summary>
/// Starts a digest generation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="scheduleId">Schedule identifier; tagged as <c>schedule.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartDigestSpan(string tenantId, string scheduleId);
/// <summary>
/// Starts a template render span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="templateId">Template identifier; tagged as <c>template.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartTemplateRenderSpan(string tenantId, string templateId);
/// <summary>
/// Starts a correlation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="eventKind">Event kind; tagged as <c>event.kind</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartCorrelationSpan(string tenantId, string eventKind);
/// <summary>
/// Starts a webhook validation span.
/// </summary>
/// <param name="tenantId">Tenant identifier; the default implementation tags it as <c>tenant.id</c>.</param>
/// <param name="channelId">Channel identifier; tagged as <c>channel.id</c>.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartWebhookValidationSpan(string tenantId, string channelId);
/// <summary>
/// Adds an event to the given span; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to receive the event.</param>
/// <param name="name">Event name.</param>
/// <param name="attributes">Optional event attributes; the default implementation skips entries whose value is <c>null</c>.</param>
void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null);
/// <summary>
/// Sets span status to error; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to mark as failed.</param>
/// <param name="exception">Optional failure; the default implementation records its type and message as tags.</param>
/// <param name="description">Optional status description; the default implementation falls back to the exception message.</param>
void SetError(Activity? activity, Exception? exception = null, string? description = null);
/// <summary>
/// Sets span status to ok; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to mark as successful.</param>
void SetOk(Activity? activity);
/// <summary>
/// Adds custom tags to a span; a <c>null</c> activity is ignored.
/// </summary>
/// <param name="activity">Span to tag.</param>
/// <param name="tags">Tags to set; the default implementation skips entries whose value is <c>null</c>.</param>
void AddTags(Activity? activity, IDictionary<string, object?> tags);
/// <summary>
/// Creates a linked span (for batch operations).
/// </summary>
/// <param name="operationName">Name of the new span.</param>
/// <param name="parentContext">Context to reference; the default implementation attaches it as an activity link rather than as the parent.</param>
/// <param name="tags">Optional tags applied to the new span.</param>
/// <returns>The started activity, or <c>null</c> when none was created.</returns>
Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null);
}
/// <summary>
/// Options for tracing service.
/// </summary>
public sealed class NotifierTracingOptions
{
/// <summary>
/// Configuration section path these options are read from.
/// </summary>
public const string SectionName = "Notifier:Observability:Tracing";
/// <summary>
/// Whether tracing is enabled. Defaults to <c>true</c>.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
/// Activity source name. Defaults to <c>StellaOps.Notifier</c>.
/// </summary>
public string SourceName { get; set; } = "StellaOps.Notifier";
/// <summary>
/// Whether to include sensitive data in traces. Defaults to <c>false</c>.
/// DefaultNotifierTracing uses it to decide whether exception stack traces are tagged.
/// </summary>
public bool IncludeSensitiveData { get; set; }
/// <summary>
/// Sampling ratio (0.0 to 1.0). Defaults to 1.0.
/// NOTE(review): DefaultNotifierTracing in this file never reads this value —
/// presumably it is applied where the trace pipeline is configured; confirm.
/// </summary>
public double SamplingRatio { get; set; } = 1.0;
/// <summary>
/// Maximum number of attributes per span. Defaults to 128.
/// NOTE(review): not enforced by DefaultNotifierTracing in this file — confirm where it is consumed.
/// </summary>
public int MaxAttributesPerSpan { get; set; } = 128;
/// <summary>
/// Maximum number of events per span. Defaults to 128.
/// NOTE(review): not enforced by DefaultNotifierTracing in this file — confirm where it is consumed.
/// </summary>
public int MaxEventsPerSpan { get; set; } = 128;
}
/// <summary>
/// <see cref="INotifierTracing"/> implementation that emits spans through its own
/// <see cref="ActivitySource"/>, named after <see cref="NotifierTracingOptions.SourceName"/>.
/// </summary>
public sealed class DefaultNotifierTracing : INotifierTracing, IDisposable
{
    private readonly ActivitySource _activitySource;
    private readonly NotifierTracingOptions _options;
    private readonly ILogger<DefaultNotifierTracing> _logger;

    /// <summary>
    /// Creates the tracer. Missing options fall back to defaults; a missing logger is rejected.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public DefaultNotifierTracing(
        IOptions<NotifierTracingOptions> options,
        ILogger<DefaultNotifierTracing> logger)
    {
        _options = options?.Value ?? new NotifierTracingOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _activitySource = new ActivitySource(_options.SourceName, "1.0.0");
    }

    /// <inheritdoc />
    public Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType)
        => StartSpan(
            "notifier.delivery",
            ("tenant.id", tenantId),
            ("delivery.id", deliveryId),
            ("channel.type", channelType));

    /// <inheritdoc />
    public Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId)
        => StartSpan(
            "notifier.escalation",
            ("tenant.id", tenantId),
            ("incident.id", incidentId),
            ("policy.id", policyId));

    /// <inheritdoc />
    public Activity? StartDigestSpan(string tenantId, string scheduleId)
        => StartSpan(
            "notifier.digest",
            ("tenant.id", tenantId),
            ("schedule.id", scheduleId));

    /// <inheritdoc />
    public Activity? StartTemplateRenderSpan(string tenantId, string templateId)
        => StartSpan(
            "notifier.template.render",
            ("tenant.id", tenantId),
            ("template.id", templateId));

    /// <inheritdoc />
    public Activity? StartCorrelationSpan(string tenantId, string eventKind)
        => StartSpan(
            "notifier.correlation",
            ("tenant.id", tenantId),
            ("event.kind", eventKind));

    /// <inheritdoc />
    public Activity? StartWebhookValidationSpan(string tenantId, string channelId)
        => StartSpan(
            "notifier.webhook.validation",
            ("tenant.id", tenantId),
            ("channel.id", channelId));

    /// <inheritdoc />
    public void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null)
    {
        if (activity is null)
        {
            return;
        }

        var eventTags = new ActivityTagsCollection();
        if (attributes is not null)
        {
            foreach (var attribute in attributes)
            {
                // Attributes without a value are not forwarded.
                if (attribute.Value is null)
                {
                    continue;
                }

                eventTags.Add(attribute.Key, attribute.Value);
            }
        }

        activity.AddEvent(new ActivityEvent(name, tags: eventTags));
    }

    /// <inheritdoc />
    public void SetError(Activity? activity, Exception? exception = null, string? description = null)
    {
        if (activity is null)
        {
            return;
        }

        // An explicit description wins; otherwise the exception message is used.
        activity.SetStatus(ActivityStatusCode.Error, description ?? exception?.Message);

        if (exception is null)
        {
            return;
        }

        activity.SetTag("exception.type", exception.GetType().FullName);
        activity.SetTag("exception.message", exception.Message);

        // Stack traces are only attached when sensitive data is allowed.
        if (_options.IncludeSensitiveData)
        {
            activity.SetTag("exception.stacktrace", exception.StackTrace);
        }
    }

    /// <inheritdoc />
    public void SetOk(Activity? activity)
    {
        if (activity is not null)
        {
            activity.SetStatus(ActivityStatusCode.Ok);
        }
    }

    /// <inheritdoc />
    public void AddTags(Activity? activity, IDictionary<string, object?> tags)
    {
        if (activity is null)
        {
            return;
        }

        foreach (var tag in tags)
        {
            if (tag.Value is null)
            {
                continue;
            }

            activity.SetTag(tag.Key, tag.Value);
        }
    }

    /// <inheritdoc />
    public Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        // The supplied context is attached as a link; no explicit parent is passed.
        ActivityLink[] links = { new ActivityLink(parentContext) };
        var span = _activitySource.StartActivity(
            operationName,
            ActivityKind.Internal,
            parentContext: default,
            links: links);

        if (span is null || tags is null)
        {
            return span;
        }

        AddTags(span, tags);
        return span;
    }

    /// <summary>
    /// Disposes the underlying activity source.
    /// </summary>
    public void Dispose()
    {
        _activitySource.Dispose();
    }

    // Shared body of the Start*Span methods: honours the Enabled switch, starts an
    // internal activity and applies the identifying tags in the order given.
    private Activity? StartSpan(string operationName, params (string Key, string Value)[] tags)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        var span = _activitySource.StartActivity(operationName, ActivityKind.Internal);
        if (span is null)
        {
            return null;
        }

        foreach (var (key, value) in tags)
        {
            span.SetTag(key, value);
        }

        return span;
    }
}
/// <summary>
/// Extension methods for Activity-based tracing.
/// Every helper is a no-op when invoked on a <c>null</c> activity.
/// </summary>
public static class ActivityExtensions
{
    /// <summary>
    /// Records a delivery result on the activity.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="success">Stored as the <c>delivery.success</c> tag.</param>
    /// <param name="httpStatusCode">Stored as <c>http.status_code</c> when provided.</param>
    /// <param name="error">Stored as <c>delivery.error</c> when non-empty.</param>
    public static void RecordDeliveryResult(this Activity? activity, bool success, int? httpStatusCode = null, string? error = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("delivery.success", success);

        if (httpStatusCode.HasValue)
        {
            activity.SetTag("http.status_code", httpStatusCode.Value);
        }

        if (!string.IsNullOrEmpty(error))
        {
            activity.SetTag("delivery.error", error);
        }
    }

    /// <summary>
    /// Records an escalation level change as span tags and an
    /// <c>escalation.level.changed</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="level">New escalation level.</param>
    /// <param name="target">Optional escalation target; omitted from both the span tag and the event when null or empty.</param>
    public static void RecordEscalationLevel(this Activity? activity, int level, string? target = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("escalation.level", level);

        var eventTags = new ActivityTagsCollection
        {
            { "level", level }
        };

        // A missing target is left out of the event as well as the span tag, so no
        // null-valued attribute is emitted.
        if (!string.IsNullOrEmpty(target))
        {
            activity.SetTag("escalation.target", target);
            eventTags.Add("target", target);
        }

        activity.AddEvent(new ActivityEvent("escalation.level.changed", tags: eventTags));
    }

    /// <summary>
    /// Records storm detection as a <c>storm.detected</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="eventKind">Stored as the <c>event_kind</c> event tag.</param>
    /// <param name="eventCount">Stored as the <c>event_count</c> event tag.</param>
    public static void RecordStormDetected(this Activity? activity, string eventKind, int eventCount)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("storm.detected", tags: new ActivityTagsCollection
        {
            { "event_kind", eventKind },
            { "event_count", eventCount }
        }));
    }

    /// <summary>
    /// Records fallback attempt as a <c>fallback.attempted</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="fromChannel">Stored as the <c>from_channel</c> event tag.</param>
    /// <param name="toChannel">Stored as the <c>to_channel</c> event tag.</param>
    public static void RecordFallback(this Activity? activity, string fromChannel, string toChannel)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("fallback.attempted", tags: new ActivityTagsCollection
        {
            { "from_channel", fromChannel },
            { "to_channel", toChannel }
        }));
    }

    /// <summary>
    /// Records template render details.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="format">Stored as the <c>template.format</c> tag.</param>
    /// <param name="outputLength">Stored as the <c>template.output_length</c> tag.</param>
    public static void RecordTemplateRender(this Activity? activity, string format, int outputLength)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("template.format", format);
        activity.SetTag("template.output_length", outputLength);
    }

    /// <summary>
    /// Records correlation result.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="correlationKey">Stored as the <c>correlation.key</c> tag.</param>
    /// <param name="isNewIncident">Stored as the <c>correlation.new_incident</c> tag.</param>
    public static void RecordCorrelationResult(this Activity? activity, string correlationKey, bool isNewIncident)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("correlation.key", correlationKey);
        activity.SetTag("correlation.new_incident", isNewIncident);
    }
}

View File

@@ -1,98 +1,98 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Interface for notification system metrics and tracing.
/// The <c>Record*</c> members report measurements; the <c>Start*Activity</c> members
/// return an optional <see cref="Activity"/> for distributed tracing.
/// </summary>
public interface INotifyMetrics
{
/// <summary>
/// Records a notification delivery attempt.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="channelType">Channel type of the attempted delivery.</param>
/// <param name="status">Status label for the attempt.</param>
/// <param name="duration">Duration reported for the attempt.</param>
void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration);
/// <summary>
/// Records an escalation event.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="level">Escalation level.</param>
/// <param name="outcome">Outcome label for the escalation.</param>
void RecordEscalation(string tenantId, int level, string outcome);
/// <summary>
/// Records a dead-letter entry.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="reason">Reason label for the dead-lettered delivery.</param>
/// <param name="channelType">Channel type of the dead-lettered delivery.</param>
void RecordDeadLetter(string tenantId, string reason, string channelType);
/// <summary>
/// Records rule evaluation.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="ruleId">Identifier of the evaluated rule.</param>
/// <param name="matched">Whether the rule matched.</param>
/// <param name="duration">Duration reported for the evaluation.</param>
void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration);
/// <summary>
/// Records template rendering.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="templateKey">Key of the rendered template.</param>
/// <param name="success">Whether rendering succeeded.</param>
/// <param name="duration">Duration reported for the render.</param>
void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration);
/// <summary>
/// Records storm detection event.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="stormKey">Key identifying the storm.</param>
/// <param name="decision">Decision label for the event.</param>
void RecordStormEvent(string tenantId, string stormKey, string decision);
/// <summary>
/// Records retention cleanup.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="entityType">Type of entity that was cleaned up.</param>
/// <param name="deletedCount">Number of deleted items.</param>
void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);
/// <summary>
/// Records the current queue depth for a channel.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="channelType">Channel type whose queue is measured.</param>
/// <param name="depth">Queue depth to report.</param>
void RecordQueueDepth(string tenantId, string channelType, int depth);
/// <summary>
/// Creates an activity for distributed tracing.
/// </summary>
/// <param name="tenantId">Tenant identifier.</param>
/// <param name="deliveryId">Delivery identifier.</param>
/// <param name="channelType">Channel type of the delivery.</param>
/// <returns>The activity, or <c>null</c> when none was created.</returns>
Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Creates an activity for escalation tracing.
/// </summary>
/// <param name="tenantId">Tenant identifier.</param>
/// <param name="incidentId">Incident identifier.</param>
/// <param name="level">Escalation level.</param>
/// <returns>The activity, or <c>null</c> when none was created.</returns>
Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
/// <summary>
/// Metric tag names for consistency.
/// Each constant is the snake_case key of one metric tag; use these constants instead of
/// repeating the string literals.
/// </summary>
public static class NotifyMetricTags
{
public const string TenantId = "tenant_id";
public const string ChannelType = "channel_type";
public const string Status = "status";
public const string Outcome = "outcome";
public const string Level = "level";
public const string Reason = "reason";
public const string RuleId = "rule_id";
public const string Matched = "matched";
public const string TemplateKey = "template_key";
public const string Success = "success";
public const string StormKey = "storm_key";
public const string Decision = "decision";
public const string EntityType = "entity_type";
}
/// <summary>
/// Metric names for the notification system.
/// All names are dot-separated and share the <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
public const string DeliveryAttempts = "notify.delivery.attempts";
public const string DeliveryDuration = "notify.delivery.duration";
public const string EscalationEvents = "notify.escalation.events";
public const string DeadLetterEntries = "notify.deadletter.entries";
public const string RuleEvaluations = "notify.rule.evaluations";
public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";
public const string TemplateRenders = "notify.template.renders";
public const string TemplateRenderDuration = "notify.template.render.duration";
public const string StormEvents = "notify.storm.events";
public const string RetentionCleanups = "notify.retention.cleanups";
public const string QueueDepth = "notify.queue.depth";
}
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Interface for notification system metrics and tracing.
/// The <c>Record*</c> members report measurements; the <c>Start*Activity</c> members
/// return an optional <see cref="Activity"/> for distributed tracing.
/// </summary>
public interface INotifyMetrics
{
/// <summary>
/// Records a notification delivery attempt.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="channelType">Channel type of the attempted delivery.</param>
/// <param name="status">Status label for the attempt.</param>
/// <param name="duration">Duration reported for the attempt.</param>
void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration);
/// <summary>
/// Records an escalation event.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="level">Escalation level.</param>
/// <param name="outcome">Outcome label for the escalation.</param>
void RecordEscalation(string tenantId, int level, string outcome);
/// <summary>
/// Records a dead-letter entry.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="reason">Reason label for the dead-lettered delivery.</param>
/// <param name="channelType">Channel type of the dead-lettered delivery.</param>
void RecordDeadLetter(string tenantId, string reason, string channelType);
/// <summary>
/// Records rule evaluation.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="ruleId">Identifier of the evaluated rule.</param>
/// <param name="matched">Whether the rule matched.</param>
/// <param name="duration">Duration reported for the evaluation.</param>
void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration);
/// <summary>
/// Records template rendering.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="templateKey">Key of the rendered template.</param>
/// <param name="success">Whether rendering succeeded.</param>
/// <param name="duration">Duration reported for the render.</param>
void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration);
/// <summary>
/// Records storm detection event.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="stormKey">Key identifying the storm.</param>
/// <param name="decision">Decision label for the event.</param>
void RecordStormEvent(string tenantId, string stormKey, string decision);
/// <summary>
/// Records retention cleanup.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="entityType">Type of entity that was cleaned up.</param>
/// <param name="deletedCount">Number of deleted items.</param>
void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);
/// <summary>
/// Records the current queue depth for a channel.
/// </summary>
/// <param name="tenantId">Tenant the measurement is attributed to.</param>
/// <param name="channelType">Channel type whose queue is measured.</param>
/// <param name="depth">Queue depth to report.</param>
void RecordQueueDepth(string tenantId, string channelType, int depth);
/// <summary>
/// Creates an activity for distributed tracing.
/// </summary>
/// <param name="tenantId">Tenant identifier.</param>
/// <param name="deliveryId">Delivery identifier.</param>
/// <param name="channelType">Channel type of the delivery.</param>
/// <returns>The activity, or <c>null</c> when none was created.</returns>
Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);
/// <summary>
/// Creates an activity for escalation tracing.
/// </summary>
/// <param name="tenantId">Tenant identifier.</param>
/// <param name="incidentId">Incident identifier.</param>
/// <param name="level">Escalation level.</param>
/// <returns>The activity, or <c>null</c> when none was created.</returns>
Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
/// <summary>
/// Metric tag names for consistency.
/// Each constant is the snake_case key of one metric tag; use these constants instead of
/// repeating the string literals.
/// </summary>
public static class NotifyMetricTags
{
public const string TenantId = "tenant_id";
public const string ChannelType = "channel_type";
public const string Status = "status";
public const string Outcome = "outcome";
public const string Level = "level";
public const string Reason = "reason";
public const string RuleId = "rule_id";
public const string Matched = "matched";
public const string TemplateKey = "template_key";
public const string Success = "success";
public const string StormKey = "storm_key";
public const string Decision = "decision";
public const string EntityType = "entity_type";
}
/// <summary>
/// Metric names for the notification system.
/// All names are dot-separated and share the <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
public const string DeliveryAttempts = "notify.delivery.attempts";
public const string DeliveryDuration = "notify.delivery.duration";
public const string EscalationEvents = "notify.escalation.events";
public const string DeadLetterEntries = "notify.deadletter.entries";
public const string RuleEvaluations = "notify.rule.evaluations";
public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";
public const string TemplateRenders = "notify.template.renders";
public const string TemplateRenderDuration = "notify.template.render.duration";
public const string StormEvents = "notify.storm.events";
public const string RetentionCleanups = "notify.retention.cleanups";
public const string QueueDepth = "notify.queue.depth";
}

View File

@@ -1,243 +1,243 @@
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Notifier.Worker.Retention;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Dependency-injection helpers that register the Notifier observability services.
/// </summary>
public static class ObservabilityServiceExtensions
{
    /// <summary>
    /// Registers every observability component in one call: metrics, tracing,
    /// dead-letter handling, chaos engine and retention policies, in that order.
    /// </summary>
    /// <param name="services">Service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration root the option sections are read from.</param>
    /// <returns>The same service collection, for chaining.</returns>
    public static IServiceCollection AddNotifierObservability(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        services.AddNotifierMetrics(configuration);
        services.AddNotifierTracing(configuration);
        services.AddDeadLetterHandling(configuration);
        services.AddChaosEngine(configuration);
        services.AddRetentionPolicies(configuration);
        return services;
    }

    /// <summary>
    /// Binds <see cref="NotifierMetricsOptions"/> and registers the default metrics service as a singleton.
    /// </summary>
    public static IServiceCollection AddNotifierMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(NotifierMetricsOptions.SectionName);
        services.Configure<NotifierMetricsOptions>(section);
        services.AddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        return services;
    }

    /// <summary>
    /// Binds <see cref="NotifierTracingOptions"/> and registers the default tracing service as a singleton.
    /// </summary>
    public static IServiceCollection AddNotifierTracing(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(NotifierTracingOptions.SectionName);
        services.Configure<NotifierTracingOptions>(section);
        services.AddSingleton<INotifierTracing, DefaultNotifierTracing>();
        return services;
    }

    /// <summary>
    /// Binds <see cref="DeadLetterOptions"/> and registers the in-memory dead-letter handler as a singleton.
    /// </summary>
    public static IServiceCollection AddDeadLetterHandling(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(DeadLetterOptions.SectionName);
        services.Configure<DeadLetterOptions>(section);
        services.AddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        return services;
    }

    /// <summary>
    /// Binds <see cref="ChaosEngineOptions"/> and registers the default chaos engine as a singleton.
    /// </summary>
    public static IServiceCollection AddChaosEngine(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(ChaosEngineOptions.SectionName);
        services.Configure<ChaosEngineOptions>(section);
        services.AddSingleton<IChaosEngine, DefaultChaosEngine>();
        return services;
    }

    /// <summary>
    /// Binds <see cref="RetentionOptions"/> and registers the default retention policy service as a singleton.
    /// </summary>
    public static IServiceCollection AddRetentionPolicies(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(RetentionOptions.SectionName);
        services.Configure<RetentionOptions>(section);
        services.AddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();
        return services;
    }

    /// <summary>
    /// Returns a builder for customizing observability services. This overload registers
    /// nothing by itself.
    /// </summary>
    /// <param name="services">Service collection the builder will operate on.</param>
    public static ObservabilityServiceBuilder AddNotifierObservability(this IServiceCollection services)
        => new ObservabilityServiceBuilder(services);
}
/// <summary>
/// Fluent builder that applies option callbacks and implementation overrides for the
/// observability services to a wrapped service collection.
/// </summary>
public sealed class ObservabilityServiceBuilder
{
    private readonly IServiceCollection _services;

    /// <summary>
    /// Creates a builder that registers into <paramref name="services"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <see langword="null"/>.</exception>
    public ObservabilityServiceBuilder(IServiceCollection services)
    {
        _services = services ?? throw new ArgumentNullException(nameof(services));
    }

    /// <summary>
    /// Registers a configuration callback for the metrics options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureMetrics(Action<NotifierMetricsOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the tracing options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureTracing(Action<NotifierTracingOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the dead-letter options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureDeadLetter(Action<DeadLetterOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the chaos engine options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureChaos(Action<ChaosEngineOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the retention options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureRetention(Action<RetentionOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton metrics implementation.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomMetrics<T>() where T : class, INotifierMetrics
        => WithSingleton<INotifierMetrics, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton tracing implementation.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomTracing<T>() where T : class, INotifierTracing
        => WithSingleton<INotifierTracing, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton dead-letter handler.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomDeadLetterHandler<T>() where T : class, IDeadLetterHandler
        => WithSingleton<IDeadLetterHandler, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton chaos engine.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomChaosEngine<T>() where T : class, IChaosEngine
        => WithSingleton<IChaosEngine, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton retention policy service.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomRetentionService<T>() where T : class, IRetentionPolicyService
        => WithSingleton<IRetentionPolicyService, T>();

    /// <summary>
    /// Completes the registration by adding the default implementation for every observability
    /// service type that has not been registered yet.
    /// </summary>
    /// <returns>The wrapped service collection.</returns>
    public IServiceCollection Build()
    {
        // Each call is a no-op when the service type already has a registration,
        // so anything added through the UseCustom* methods is left in place.
        _services.TryAddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        _services.TryAddSingleton<INotifierTracing, DefaultNotifierTracing>();
        _services.TryAddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        _services.TryAddSingleton<IChaosEngine, DefaultChaosEngine>();
        _services.TryAddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();

        return _services;
    }

    // Shared body of the Configure* methods: register the options callback, keep the chain going.
    private ObservabilityServiceBuilder WithOptions<TOptions>(Action<TOptions> configure)
        where TOptions : class
    {
        _services.Configure(configure);
        return this;
    }

    // Shared body of the UseCustom* methods: add the singleton mapping, keep the chain going.
    private ObservabilityServiceBuilder WithSingleton<TService, TImplementation>()
        where TService : class
        where TImplementation : class, TService
    {
        _services.AddSingleton<TService, TImplementation>();
        return this;
    }
}
/// <summary>
/// File-local helper for registering a singleton only when its service type is not registered yet.
/// </summary>
file static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds <typeparamref name="TImplementation"/> as the singleton for <typeparamref name="TService"/>
    /// unless the collection already holds a descriptor whose service type is <typeparamref name="TService"/>.
    /// </summary>
    /// <remarks>
    /// Only the service type is compared, so an existing registration of any lifetime suppresses the add.
    /// </remarks>
    public static void TryAddSingleton<TService, TImplementation>(this IServiceCollection services)
        where TService : class
        where TImplementation : class, TService
    {
        var alreadyRegistered = services.Any(descriptor => descriptor.ServiceType == typeof(TService));
        if (alreadyRegistered)
        {
            return;
        }

        services.AddSingleton<TService, TImplementation>();
    }
}
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using StellaOps.Notifier.Worker.Retention;
namespace StellaOps.Notifier.Worker.Observability;
/// <summary>
/// Extension methods for registering observability services.
/// </summary>
public static class ObservabilityServiceExtensions
{
    /// <summary>
    /// Adds all observability services (metrics, tracing, dead-letter, chaos, retention).
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration from which each component binds its own options section.</param>
    /// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configuration"/> is <see langword="null"/>.
    /// </exception>
    public static IServiceCollection AddNotifierObservability(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        // Fail fast with a named argument instead of a NullReferenceException further down the chain.
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        return services
            .AddNotifierMetrics(configuration)
            .AddNotifierTracing(configuration)
            .AddDeadLetterHandling(configuration)
            .AddChaosEngine(configuration)
            .AddRetentionPolicies(configuration);
    }

    /// <summary>
    /// Adds notifier metrics services: binds <see cref="NotifierMetricsOptions"/> to the section named
    /// <see cref="NotifierMetricsOptions.SectionName"/> and registers <see cref="DefaultNotifierMetrics"/>
    /// as the singleton <see cref="INotifierMetrics"/>.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration that contains the metrics options section.</param>
    /// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configuration"/> is <see langword="null"/>.
    /// </exception>
    public static IServiceCollection AddNotifierMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<NotifierMetricsOptions>(
            configuration.GetSection(NotifierMetricsOptions.SectionName));
        services.AddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        return services;
    }

    /// <summary>
    /// Adds notifier tracing services: binds <see cref="NotifierTracingOptions"/> to the section named
    /// <see cref="NotifierTracingOptions.SectionName"/> and registers <see cref="DefaultNotifierTracing"/>
    /// as the singleton <see cref="INotifierTracing"/>.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration that contains the tracing options section.</param>
    /// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configuration"/> is <see langword="null"/>.
    /// </exception>
    public static IServiceCollection AddNotifierTracing(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<NotifierTracingOptions>(
            configuration.GetSection(NotifierTracingOptions.SectionName));
        services.AddSingleton<INotifierTracing, DefaultNotifierTracing>();
        return services;
    }

    /// <summary>
    /// Adds dead-letter handling services: binds <see cref="DeadLetterOptions"/> to the section named
    /// <see cref="DeadLetterOptions.SectionName"/> and registers <see cref="InMemoryDeadLetterHandler"/>
    /// as the singleton <see cref="IDeadLetterHandler"/>.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration that contains the dead-letter options section.</param>
    /// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configuration"/> is <see langword="null"/>.
    /// </exception>
    public static IServiceCollection AddDeadLetterHandling(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<DeadLetterOptions>(
            configuration.GetSection(DeadLetterOptions.SectionName));
        services.AddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        return services;
    }

    /// <summary>
    /// Adds chaos engine services: binds <see cref="ChaosEngineOptions"/> to the section named
    /// <see cref="ChaosEngineOptions.SectionName"/> and registers <see cref="DefaultChaosEngine"/>
    /// as the singleton <see cref="IChaosEngine"/>.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration that contains the chaos engine options section.</param>
    /// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configuration"/> is <see langword="null"/>.
    /// </exception>
    public static IServiceCollection AddChaosEngine(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<ChaosEngineOptions>(
            configuration.GetSection(ChaosEngineOptions.SectionName));
        services.AddSingleton<IChaosEngine, DefaultChaosEngine>();
        return services;
    }

    /// <summary>
    /// Adds retention policy services: binds <see cref="RetentionOptions"/> to the section named
    /// <see cref="RetentionOptions.SectionName"/> and registers <see cref="DefaultRetentionPolicyService"/>
    /// as the singleton <see cref="IRetentionPolicyService"/>.
    /// </summary>
    /// <param name="services">The service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration that contains the retention options section.</param>
    /// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> or <paramref name="configuration"/> is <see langword="null"/>.
    /// </exception>
    public static IServiceCollection AddRetentionPolicies(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        services.Configure<RetentionOptions>(
            configuration.GetSection(RetentionOptions.SectionName));
        services.AddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();
        return services;
    }

    /// <summary>
    /// Builder for customizing observability services.
    /// </summary>
    /// <remarks>
    /// This overload registers nothing by itself; it only wraps <paramref name="services"/> in an
    /// <see cref="ObservabilityServiceBuilder"/>.
    /// </remarks>
    /// <param name="services">The service collection the builder will register into.</param>
    /// <returns>A new builder bound to <paramref name="services"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <see langword="null"/>.</exception>
    public static ObservabilityServiceBuilder AddNotifierObservability(this IServiceCollection services)
    {
        // The builder constructor performs the null check on services.
        return new ObservabilityServiceBuilder(services);
    }
}
/// <summary>
/// Fluent builder that applies option callbacks and implementation overrides for the
/// observability services to a wrapped service collection.
/// </summary>
public sealed class ObservabilityServiceBuilder
{
    private readonly IServiceCollection _services;

    /// <summary>
    /// Creates a builder that registers into <paramref name="services"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <see langword="null"/>.</exception>
    public ObservabilityServiceBuilder(IServiceCollection services)
    {
        _services = services ?? throw new ArgumentNullException(nameof(services));
    }

    /// <summary>
    /// Registers a configuration callback for the metrics options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureMetrics(Action<NotifierMetricsOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the tracing options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureTracing(Action<NotifierTracingOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the dead-letter options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureDeadLetter(Action<DeadLetterOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the chaos engine options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureChaos(Action<ChaosEngineOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a configuration callback for the retention options.
    /// </summary>
    public ObservabilityServiceBuilder ConfigureRetention(Action<RetentionOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton metrics implementation.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomMetrics<T>() where T : class, INotifierMetrics
        => WithSingleton<INotifierMetrics, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton tracing implementation.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomTracing<T>() where T : class, INotifierTracing
        => WithSingleton<INotifierTracing, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton dead-letter handler.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomDeadLetterHandler<T>() where T : class, IDeadLetterHandler
        => WithSingleton<IDeadLetterHandler, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton chaos engine.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomChaosEngine<T>() where T : class, IChaosEngine
        => WithSingleton<IChaosEngine, T>();

    /// <summary>
    /// Registers <typeparamref name="T"/> as the singleton retention policy service.
    /// </summary>
    public ObservabilityServiceBuilder UseCustomRetentionService<T>() where T : class, IRetentionPolicyService
        => WithSingleton<IRetentionPolicyService, T>();

    /// <summary>
    /// Completes the registration by adding the default implementation for every observability
    /// service type that has not been registered yet.
    /// </summary>
    /// <returns>The wrapped service collection.</returns>
    public IServiceCollection Build()
    {
        // Each call is a no-op when the service type already has a registration,
        // so anything added through the UseCustom* methods is left in place.
        _services.TryAddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        _services.TryAddSingleton<INotifierTracing, DefaultNotifierTracing>();
        _services.TryAddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        _services.TryAddSingleton<IChaosEngine, DefaultChaosEngine>();
        _services.TryAddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();

        return _services;
    }

    // Shared body of the Configure* methods: register the options callback, keep the chain going.
    private ObservabilityServiceBuilder WithOptions<TOptions>(Action<TOptions> configure)
        where TOptions : class
    {
        _services.Configure(configure);
        return this;
    }

    // Shared body of the UseCustom* methods: add the singleton mapping, keep the chain going.
    private ObservabilityServiceBuilder WithSingleton<TService, TImplementation>()
        where TService : class
        where TImplementation : class, TService
    {
        _services.AddSingleton<TService, TImplementation>();
        return this;
    }
}
/// <summary>
/// File-local helper for registering a singleton only when its service type is not registered yet.
/// </summary>
file static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds <typeparamref name="TImplementation"/> as the singleton for <typeparamref name="TService"/>
    /// unless the collection already holds a descriptor whose service type is <typeparamref name="TService"/>.
    /// </summary>
    /// <remarks>
    /// Only the service type is compared, so an existing registration of any lifetime suppresses the add.
    /// </remarks>
    public static void TryAddSingleton<TService, TImplementation>(this IServiceCollection services)
        where TService : class
        where TImplementation : class, TService
    {
        var alreadyRegistered = services.Any(descriptor => descriptor.ServiceType == typeof(TService));
        if (alreadyRegistered)
        {
            return;
        }

        services.AddSingleton<TService, TImplementation>();
    }
}