up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled
This commit is contained in:
@@ -1,233 +1,233 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Instruments are created on a static meter from the instance constructor, so every instance
/// registers its own set of instruments (including the queue-depth gauge callback).
/// NOTE(review): this presumably expects a single long-lived instance — confirm against the DI registration.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Latest reported queue depth per (tenant, channel type), published through an observable gauge.
    // A tuple key is used rather than a "tenant:channel" string so that identifiers containing ':'
    // are reported instead of being silently dropped when the key is taken apart again.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates all counters, histograms and the queue-depth observable gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");

        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");

        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");

        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");

        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");

        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");

        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");

        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");

        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    /// <param name="status">Status tag value.</param>
    /// <param name="duration">Elapsed time of the attempt.</param>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };

        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged with tenant, level and outcome.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="level">Escalation level; emitted as an invariant-culture string.</param>
    /// <param name="outcome">Outcome tag value.</param>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant culture keeps the tag value stable regardless of the host's regional settings.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };

        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged with tenant, reason and channel type.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="reason">Reason tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };

        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="ruleId">Rule identifier tag value.</param>
    /// <param name="matched">Whether the rule matched; emitted as "true" or "false".</param>
    /// <param name="duration">Elapsed time of the evaluation.</param>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, matched ? "true" : "false" }
        };

        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="templateKey">Template key tag value.</param>
    /// <param name="success">Whether the render succeeded; emitted as "true" or "false".</param>
    /// <param name="duration">Elapsed time of the render.</param>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, success ? "true" : "false" }
        };

        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm detection event tagged with tenant, storm key and decision.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="stormKey">Storm key tag value.</param>
    /// <param name="decision">Decision tag value.</param>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };

        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention cleanup counter.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="entityType">Entity type tag value.</param>
    /// <param name="deletedCount">Number of deleted entities; negative values are ignored.</param>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        // Counters are monotonic; a negative delta would corrupt the running total.
        if (deletedCount < 0)
        {
            return;
        }

        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };

        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for a tenant/channel pair; the value is published
    /// the next time the queue-depth gauge is observed.
    /// </summary>
    /// <param name="tenantId">Tenant the queue belongs to.</param>
    /// <param name="channelType">Channel type of the queue.</param>
    /// <param name="depth">Current depth; overwrites any previously stored value for the pair.</param>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a "notify.delivery" activity and tags it with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }

        return activity;
    }

    /// <summary>
    /// Starts a "notify.escalation" activity and tags it with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }

        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per recorded tenant/channel pair.
    /// </summary>
    /// <remarks>
    /// The measurements are materialized while the lock is held and returned as a finished list,
    /// so the lock is never held while the caller enumerates the result.
    /// </remarks>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var (key, depth) in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    depth,
                    new KeyValuePair<string, object?>(NotifyMetricTags.TenantId, key.TenantId),
                    new KeyValuePair<string, object?>(NotifyMetricTags.ChannelType, key.ChannelType)));
            }

            return snapshot;
        }
    }
}
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Instruments are created on a static meter from the instance constructor, so every instance
/// registers its own set of instruments (including the queue-depth gauge callback).
/// NOTE(review): this presumably expects a single long-lived instance — confirm against the DI registration.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Latest reported queue depth per (tenant, channel type), published through an observable gauge.
    // A tuple key is used rather than a "tenant:channel" string so that identifiers containing ':'
    // are reported instead of being silently dropped when the key is taken apart again.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates all counters, histograms and the queue-depth observable gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");

        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");

        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");

        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");

        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");

        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");

        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");

        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");

        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    /// <param name="status">Status tag value.</param>
    /// <param name="duration">Elapsed time of the attempt.</param>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };

        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged with tenant, level and outcome.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="level">Escalation level; emitted as an invariant-culture string.</param>
    /// <param name="outcome">Outcome tag value.</param>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant culture keeps the tag value stable regardless of the host's regional settings.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };

        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged with tenant, reason and channel type.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="reason">Reason tag value.</param>
    /// <param name="channelType">Channel type tag value.</param>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };

        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="ruleId">Rule identifier tag value.</param>
    /// <param name="matched">Whether the rule matched; emitted as "true" or "false".</param>
    /// <param name="duration">Elapsed time of the evaluation.</param>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, matched ? "true" : "false" }
        };

        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration in milliseconds.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="templateKey">Template key tag value.</param>
    /// <param name="success">Whether the render succeeded; emitted as "true" or "false".</param>
    /// <param name="duration">Elapsed time of the render.</param>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, success ? "true" : "false" }
        };

        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm detection event tagged with tenant, storm key and decision.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="stormKey">Storm key tag value.</param>
    /// <param name="decision">Decision tag value.</param>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };

        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention cleanup counter.
    /// </summary>
    /// <param name="tenantId">Tenant tag value.</param>
    /// <param name="entityType">Entity type tag value.</param>
    /// <param name="deletedCount">Number of deleted entities; negative values are ignored.</param>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        // Counters are monotonic; a negative delta would corrupt the running total.
        if (deletedCount < 0)
        {
            return;
        }

        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };

        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for a tenant/channel pair; the value is published
    /// the next time the queue-depth gauge is observed.
    /// </summary>
    /// <param name="tenantId">Tenant the queue belongs to.</param>
    /// <param name="channelType">Channel type of the queue.</param>
    /// <param name="depth">Current depth; overwrites any previously stored value for the pair.</param>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a "notify.delivery" activity and tags it with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }

        return activity;
    }

    /// <summary>
    /// Starts a "notify.escalation" activity and tags it with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }

        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per recorded tenant/channel pair.
    /// </summary>
    /// <remarks>
    /// The measurements are materialized while the lock is held and returned as a finished list,
    /// so the lock is never held while the caller enumerates the result.
    /// </remarks>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var (key, depth) in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    depth,
                    new KeyValuePair<string, object?>(NotifyMetricTags.TenantId, key.TenantId),
                    new KeyValuePair<string, object?>(NotifyMetricTags.ChannelType, key.ChannelType)));
            }

            return snapshot;
        }
    }
}
|
||||
|
||||
@@ -1,473 +1,473 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for the chaos testing engine: manages injected channel faults,
/// answers whether an operation should fail, and runs scripted scenarios.
/// </summary>
public interface IChaosEngine
{
    /// <summary>Registers a new fault described by <paramref name="request"/>.</summary>
    /// <param name="request">Channel, tenant scope, fault type and tuning of the fault.</param>
    /// <param name="cancellationToken">Token to cancel the operation.</param>
    /// <returns>The fault injection that was created.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);

    /// <summary>Removes a previously injected fault.</summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token to cancel the operation.</param>
    /// <returns><c>true</c> when a fault with that id was removed; otherwise <c>false</c>.</returns>
    Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);

    /// <summary>Lists the faults that are currently active.</summary>
    /// <param name="cancellationToken">Token to cancel the operation.</param>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);

    /// <summary>Decides whether an operation on a channel should fail because of an injected fault.</summary>
    /// <param name="channelType">Channel type the operation targets.</param>
    /// <param name="tenantId">Tenant performing the operation, if known.</param>
    /// <param name="cancellationToken">Token to cancel the operation.</param>
    /// <returns>The decision, plus the matching fault and simulated failure details when applicable.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>Executes the steps of a chaos scenario and reports the outcome.</summary>
    /// <param name="scenario">Scenario to execute.</param>
    /// <param name="cancellationToken">Token to cancel the run.</param>
    /// <returns>The overall result including per-step results.</returns>
    Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);

    /// <summary>Returns results of previously executed scenarios.</summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Token to cancel the operation.</param>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);

    /// <summary>Removes every active fault.</summary>
    /// <param name="cancellationToken">Token to cancel the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Describes a fault that a caller wants the chaos engine to inject.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>Channel type the fault targets.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is limited to, or <c>null</c> for no tenant restriction.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of fault to simulate.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails; defaults to 1.0.</summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>Requested lifetime of the fault, if any.</summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Latency to report alongside a triggered fault, if any.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried on the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message for the simulated failure.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-text description of the fault.</summary>
    public string? Description { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of fault the chaos engine can simulate.
/// </summary>
public enum ChaosFaultType
{
    /// <summary>Channel outage.</summary>
    Outage = 0,

    /// <summary>Added latency.</summary>
    Latency = 1,

    /// <summary>Rate limiting.</summary>
    RateLimit = 2,

    /// <summary>Authentication failure.</summary>
    AuthFailure = 3,

    /// <summary>Timeout.</summary>
    Timeout = 4,

    /// <summary>Partial failure.</summary>
    PartialFailure = 5,

    /// <summary>Intermittent failure.</summary>
    Intermittent = 6,

    /// <summary>Error response.</summary>
    ErrorResponse = 7,

    /// <summary>Corrupt response.</summary>
    CorruptResponse = 8
}
|
||||
|
||||
/// <summary>
/// A fault that has been injected into the chaos engine.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier of this fault.</summary>
    public required string FaultId { get; init; }

    /// <summary>Channel type the fault applies to.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is limited to, or <c>null</c> for no tenant restriction.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of fault being simulated.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Latency to report alongside a triggered fault, if any.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried on the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message for the simulated failure.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-text description of the fault.</summary>
    public string? Description { get; init; }

    /// <summary>When the fault was created.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>When the fault expires, if it has an expiry.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Number of times the fault has triggered a failure.</summary>
    public int TriggerCount { get; init; }

    /// <summary>Whether the fault is active; defaults to <c>true</c>.</summary>
    public bool IsActive { get; init; } = true;
}
|
||||
|
||||
/// <summary>
/// Outcome of asking the chaos engine whether an operation should fail.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary><c>true</c> when the operation should be failed.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>The fault that matched the operation, if any.</summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency associated with the triggered fault, if any.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception representing the simulated failure, if any.</summary>
    public Exception? SimulatedException { get; init; }
}
|
||||
|
||||
/// <summary>
/// A scripted chaos test made of ordered steps.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario.</summary>
    public required string Name { get; init; }

    /// <summary>Optional free-text description.</summary>
    public string? Description { get; init; }

    /// <summary>Steps to execute, in order.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Maximum run time of the scenario; defaults to 10 minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>When <c>true</c>, the run stops at the first failing step.</summary>
    public bool StopOnFirstFailure { get; init; }
}
|
||||
|
||||
/// <summary>
/// One step of a <see cref="ChaosScenario"/>.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Display name of the step.</summary>
    public required string Name { get; init; }

    /// <summary>What the step does.</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Fault request used by an <see cref="ChaosStepAction.InjectFault"/> step.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Identifier of the fault to remove, when the step removes one.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>How long to wait, when the step waits.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Assertion to evaluate, when the step asserts.</summary>
    public ChaosAssertion? Assertion { get; init; }
}
|
||||
|
||||
/// <summary>
/// Actions a chaos scenario step can perform.
/// </summary>
public enum ChaosStepAction
{
    /// <summary>Inject a fault.</summary>
    InjectFault = 0,

    /// <summary>Remove a fault.</summary>
    RemoveFault = 1,

    /// <summary>Wait.</summary>
    Wait = 2,

    /// <summary>Evaluate an assertion.</summary>
    Assert = 3,

    /// <summary>Send a test delivery.</summary>
    SendTestDelivery = 4,

    /// <summary>Check metrics.</summary>
    CheckMetrics = 5
}
|
||||
|
||||
/// <summary>
/// An expectation checked during a chaos scenario.
/// </summary>
public sealed record ChaosAssertion
{
    /// <summary>Kind of check to perform.</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Name of the metric the assertion refers to, if any.</summary>
    public string? MetricName { get; init; }

    /// <summary>Value the assertion compares against, if any.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Allowed deviation from <see cref="ExpectedValue"/>, if any.</summary>
    public double? Tolerance { get; init; }

    /// <summary>Status the assertion expects, if any.</summary>
    public string? ExpectedStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of assertion a chaos scenario can make.
/// </summary>
public enum ChaosAssertionType
{
    /// <summary>A metric equals the expected value.</summary>
    MetricEquals = 0,

    /// <summary>A metric is greater than the expected value.</summary>
    MetricGreaterThan = 1,

    /// <summary>A metric is less than the expected value.</summary>
    MetricLessThan = 2,

    /// <summary>The dead-letter count equals the expected value.</summary>
    DeadLetterCountEquals = 3,

    /// <summary>A fallback was triggered.</summary>
    FallbackTriggered = 4,

    /// <summary>An alert fired.</summary>
    AlertFired = 5
}
|
||||
|
||||
/// <summary>
/// Outcome of one chaos scenario run.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier of this run.</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary>Whether the run succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>When the run completed.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>Elapsed time of the run.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the executed steps; empty by default.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Error description, if one was recorded.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of one executed scenario step.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the step.</summary>
    public required string StepName { get; init; }

    /// <summary>Whether the step succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the step was executed.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>Elapsed time of the step.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Error description, if the step recorded one.</summary>
    public string? Error { get; init; }

    /// <summary>Optional payload produced by the step.</summary>
    public object? Data { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration settings for the chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Configuration section name for these options.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Whether the chaos engine is enabled; defaults to <c>false</c>.</summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos is allowed in production; defaults to <c>false</c>.</summary>
    public bool AllowInProduction { get; set; }

    /// <summary>Upper bound for the lifetime of a fault; defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of simultaneously registered faults; defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types that may be targeted by faults.</summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } =
    [
        "webhook",
        "email",
        "slack",
        "teams",
        "pagerduty",
        "opsgenie"
    ];
}
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of chaos engine.
|
||||
/// </summary>
|
||||
public sealed class DefaultChaosEngine : IChaosEngine
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
|
||||
private readonly List<ChaosTestResult> _testHistory = [];
|
||||
private readonly ChaosEngineOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly INotifierMetrics? _metrics;
|
||||
private readonly ILogger<DefaultChaosEngine> _logger;
|
||||
private readonly Random _random = new();
|
||||
|
||||
/// <summary>
/// Creates the engine, substituting defaults for missing options and time provider.
/// </summary>
/// <param name="options">Engine options; default options are used when null or when it has no value.</param>
/// <param name="timeProvider">Clock source; the system clock is used when null.</param>
/// <param name="metrics">Optional metrics sink.</param>
/// <param name="logger">Logger; required.</param>
/// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
public DefaultChaosEngine(
    IOptions<ChaosEngineOptions> options,
    TimeProvider timeProvider,
    INotifierMetrics? metrics,
    ILogger<DefaultChaosEngine> logger)
{
    var configured = options?.Value;
    _options = configured is null ? new ChaosEngineOptions() : configured;

    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
    _metrics = metrics;

    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Registers a new fault for the requested channel and returns it.
/// </summary>
/// <param name="request">Fault description. The probability is clamped to [0, 1] and the
/// duration is capped at (and defaults to) <c>MaxFaultDuration</c>.</param>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns>A completed task holding the registered fault.</returns>
/// <exception cref="InvalidOperationException">The engine is disabled, or the number of
/// registered faults has reached <c>MaxConcurrentFaults</c>.</exception>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
{
    if (!_options.Enabled)
    {
        throw new InvalidOperationException("Chaos engine is disabled");
    }

    ArgumentNullException.ThrowIfNull(request);

    // Purge expired faults first so stale entries do not count against the concurrency limit.
    CleanupExpiredFaults();

    if (_activeFaults.Count >= _options.MaxConcurrentFaults)
    {
        throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
    }

    // NOTE(review): AllowedChannelTypes and AllowInProduction are not consulted here — confirm
    // whether they are enforced by a caller.

    var now = _timeProvider.GetUtcNow();

    var duration = request.Duration ?? _options.MaxFaultDuration;
    if (duration > _options.MaxFaultDuration)
    {
        duration = _options.MaxFaultDuration;
    }

    var fault = new ChaosFaultInjection
    {
        // "chaos-" plus the first ten hex digits of a fresh GUID.
        FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
        ChannelType = request.ChannelType,
        TenantId = request.TenantId,
        FaultType = request.FaultType,
        FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
        LatencyInjection = request.LatencyInjection,
        ErrorCode = request.ErrorCode,
        ErrorMessage = request.ErrorMessage,
        Description = request.Description,
        CreatedAt = now,
        ExpiresAt = now + duration,
        IsActive = true
    };

    _activeFaults[fault.FaultId] = fault;
    _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);

    return Task.FromResult(fault);
}
|
||||
|
||||
/// <summary>
/// Removes the fault with the given id, logging when one was actually removed.
/// </summary>
/// <param name="faultId">Identifier of the fault to remove.</param>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns>A completed task holding <c>true</c> when a fault was removed.</returns>
public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
{
    if (!_activeFaults.TryRemove(faultId, out _))
    {
        return Task.FromResult(false);
    }

    _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
    return Task.FromResult(true);
}
|
||||
|
||||
/// <summary>
/// Purges expired faults and returns a snapshot of the remaining ones.
/// </summary>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns>A completed task holding a copy of the currently registered faults.</returns>
public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
{
    CleanupExpiredFaults();

    IReadOnlyList<ChaosFaultInjection> snapshot = _activeFaults.Values.ToList();
    return Task.FromResult(snapshot);
}
|
||||
|
||||
/// <summary>
/// Decides whether an operation on <paramref name="channelType"/> should fail because of an injected fault.
/// </summary>
/// <param name="channelType">Channel type of the operation; a fault registered for "*" matches any channel.</param>
/// <param name="tenantId">Tenant of the operation; faults without a tenant match every tenant.</param>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns>
/// A completed task. <c>ShouldFail</c> is <c>false</c> when the engine is disabled, no fault matches,
/// or the probability roll does not hit; otherwise the result carries the matching fault, its
/// latency and a simulated exception chosen by fault type.
/// </returns>
public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
{
    if (!_options.Enabled)
    {
        return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
    }

    CleanupExpiredFaults();

    var matchingFault = _activeFaults.Values
        .Where(f => f.IsActive)
        .Where(f => f.ChannelType == channelType || f.ChannelType == "*")
        .Where(f => f.TenantId is null || f.TenantId == tenantId)
        .FirstOrDefault();

    if (matchingFault is null)
    {
        return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
    }

    // Random.Shared is safe for concurrent callers; a shared Random instance is not.
    var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
    if (!shouldFail)
    {
        return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
    }

    // Update trigger count with compare-and-swap so concurrent triggers are not lost and a fault
    // removed in the meantime is not re-added (a plain indexer assignment would do both).
    while (_activeFaults.TryGetValue(matchingFault.FaultId, out var current))
    {
        var incremented = current with { TriggerCount = current.TriggerCount + 1 };
        if (_activeFaults.TryUpdate(current.FaultId, incremented, current))
        {
            break;
        }
    }

    var exception = matchingFault.FaultType switch
    {
        ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
        ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
        ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
        ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
        _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
    };

    return Task.FromResult(new ChaosFaultResult
    {
        ShouldFail = true,
        // The snapshot taken before the trigger-count update is returned, as before.
        ActiveFault = matchingFault,
        InjectedLatency = matchingFault.LatencyInjection,
        SimulatedException = exception
    });
}
|
||||
|
||||
/// <summary>
/// Executes the steps of <paramref name="scenario"/> in order and records the outcome in the test history.
/// </summary>
/// <param name="scenario">Scenario to run; its <c>Timeout</c> bounds the whole run.</param>
/// <param name="cancellationToken">Token linked with the scenario timeout and passed to each step.</param>
/// <returns>
/// The run result. Exceptions raised while executing steps (including timeout and cancellation)
/// are captured in <c>Error</c> with <c>Success</c> set to <c>false</c> rather than rethrown.
/// </returns>
/// <exception cref="InvalidOperationException">The engine is disabled.</exception>
/// <exception cref="ArgumentNullException"><paramref name="scenario"/> is null.</exception>
public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
{
    if (!_options.Enabled)
    {
        throw new InvalidOperationException("Chaos engine is disabled");
    }

    ArgumentNullException.ThrowIfNull(scenario);

    var testId = $"test-{Guid.NewGuid():N}"[..16];
    var startedAt = _timeProvider.GetUtcNow();
    var stepResults = new List<ChaosStepResult>();
    string? error = null;
    var success = true;

    _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

    using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

    try
    {
        foreach (var step in scenario.Steps)
        {
            var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
            stepResults.Add(stepResult);

            if (!stepResult.Success)
            {
                success = false;
                if (scenario.StopOnFirstFailure)
                {
                    error = $"Step '{step.Name}' failed: {stepResult.Error}";
                    break;
                }
            }
        }
    }
    catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
    {
        // Only the scenario's own timeout is reported as a timeout.
        success = false;
        error = "Scenario timed out";
    }
    catch (Exception ex)
    {
        success = false;
        error = ex.Message;
    }

    var completedAt = _timeProvider.GetUtcNow();
    var result = new ChaosTestResult
    {
        TestId = testId,
        ScenarioId = scenario.ScenarioId,
        ScenarioName = scenario.Name,
        Success = success,
        StartedAt = startedAt,
        CompletedAt = completedAt,
        Duration = completedAt - startedAt,
        StepResults = stepResults,
        Error = error
    };

    lock (_testHistory)
    {
        _testHistory.Add(result);

        // Keep only the 100 most recent results.
        while (_testHistory.Count > 100)
        {
            _testHistory.RemoveAt(0);
        }
    }

    _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");

    return result;
}
|
||||
|
||||
/// <summary>
/// Executes a single scenario step and converts its outcome into a <see cref="ChaosStepResult"/>.
/// </summary>
/// <param name="step">Step to execute.</param>
/// <param name="cancellationToken">Token observed by the step's asynchronous work.</param>
/// <returns>A result describing success or failure; ordinary exceptions become failed results.</returns>
/// <exception cref="OperationCanceledException">
/// The step was interrupted because <paramref name="cancellationToken"/> was cancelled.
/// </exception>
private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
{
    var executedAt = _timeProvider.GetUtcNow();

    ChaosStepResult Result(bool success, string? error = null, object? data = null) => new()
    {
        StepId = step.StepId,
        StepName = step.Name,
        Success = success,
        ExecutedAt = executedAt,
        Duration = _timeProvider.GetUtcNow() - executedAt,
        Error = error,
        Data = data
    };

    try
    {
        object? data = null;

        // NOTE(review): a step whose action has no matching case below (missing payload, or
        // SendTestDelivery / CheckMetrics) falls through and is reported as a successful no-op.
        switch (step.Action)
        {
            case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                break;

            case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                break;

            case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                await Task.Delay(step.WaitDuration.Value, cancellationToken);
                break;

            case ChaosStepAction.Assert when step.Assertion is not null:
                var assertResult = EvaluateAssertion(step.Assertion);
                if (!assertResult.passed)
                {
                    return Result(success: false, error: assertResult.error);
                }

                break;
        }

        return Result(success: true, data: data);
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // Cancellation is not a step failure: let the caller tell a timeout from a failed step.
        throw;
    }
    catch (Exception ex)
    {
        return Result(success: false, error: ex.Message);
    }
}
|
||||
|
||||
/// <summary>
/// Evaluates a scenario assertion. Placeholder implementation: every assertion type currently passes.
/// </summary>
/// <param name="assertion">Assertion to evaluate.</param>
/// <returns>A tuple with the pass flag and an optional error text (always <c>(true, null)</c> today).</returns>
private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
{
    // Simplified assertion evaluation - in production would query actual metrics
    switch (assertion.Type)
    {
        case ChaosAssertionType.FallbackTriggered:
        case ChaosAssertionType.AlertFired:
        default:
            return (true, null);
    }
}
|
||||
|
||||
/// <summary>
/// Returns the most recent scenario results, newest first.
/// </summary>
/// <param name="limit">Maximum number of results to return.</param>
/// <param name="cancellationToken">Not used; the call completes synchronously.</param>
/// <returns>A snapshot copy of the newest history entries.</returns>
public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
{
    List<ChaosTestResult> newestFirst;

    lock (_testHistory)
    {
        // Copy under the lock so the caller never sees the list while it is being mutated.
        newestFirst = _testHistory.TakeLast(limit).ToList();
    }

    newestFirst.Reverse();
    return Task.FromResult<IReadOnlyList<ChaosTestResult>>(newestFirst);
}
|
||||
|
||||
/// <summary>
/// Removes every registered fault, expired or not, and logs the reset.
/// </summary>
/// <param name="cancellationToken">Not used; the call completes synchronously.</param>
/// <returns>An already-completed task.</returns>
public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
{
    _activeFaults.Clear();

    _logger.LogInformation("Cleared all chaos faults");

    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Removes faults whose <c>ExpiresAt</c> lies before the current time of the injected time provider.
/// </summary>
private void CleanupExpiredFaults()
{
    var now = _timeProvider.GetUtcNow();

    // Snapshot the expired keys first, then remove them one by one.
    var expiredIds = _activeFaults
        .Where(entry => entry.Value.ExpiresAt.HasValue && entry.Value.ExpiresAt < now)
        .Select(entry => entry.Key)
        .ToList();

    foreach (var faultId in expiredIds)
    {
        // Log only when this call actually removed the entry; another caller may have won the race.
        if (_activeFaults.TryRemove(faultId, out _))
        {
            _logger.LogDebug("Expired chaos fault {FaultId}", faultId);
        }
    }
}
|
||||
}
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Chaos testing engine for simulating channel outages and failures.
/// </summary>
public interface IChaosEngine
{
    /// <summary>Injects a fault for a channel type.</summary>
    /// <param name="request">Describes the fault to register.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The registered fault, including its generated identifier.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default);

    /// <summary>Removes a fault injection.</summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns><c>true</c> when a fault with that identifier was removed.</returns>
    Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default);

    /// <summary>Gets all active faults.</summary>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default);

    /// <summary>Checks if a channel operation should fail due to chaos.</summary>
    /// <param name="channelType">Channel the operation is about to use.</param>
    /// <param name="tenantId">Tenant performing the operation, if known.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The decision, plus the matching fault and simulated error when one applies.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>Runs a chaos test scenario.</summary>
    /// <param name="scenario">Scenario whose steps are executed in order.</param>
    /// <param name="cancellationToken">Token for cancelling the run.</param>
    Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default);

    /// <summary>Gets chaos test history.</summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default);

    /// <summary>Clears all active faults.</summary>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Request to inject a fault.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>Channel type the fault applies to; the engine treats <c>"*"</c> as matching every channel.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant to restrict the fault to; <c>null</c> applies it to all tenants.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure to simulate.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails; defaults to always failing.</summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>How long the fault stays registered; <c>null</c> lets the engine choose.</summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Latency reported alongside a triggered fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried on the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message for the simulated exception.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Free-form description of the fault.</summary>
    public string? Description { get; init; }
}
|
||||
|
||||
/// <summary>
/// Type of chaos fault.
/// </summary>
public enum ChaosFaultType
{
    Outage = 0,
    Latency = 1,
    RateLimit = 2,
    AuthFailure = 3,
    Timeout = 4,
    PartialFailure = 5,
    Intermittent = 6,
    ErrorResponse = 7,
    CorruptResponse = 8
}
|
||||
|
||||
/// <summary>
/// Active fault injection.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier of the registered fault.</summary>
    public required string FaultId { get; init; }

    /// <summary>Channel type the fault applies to.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is restricted to, or <c>null</c> for all tenants.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure being simulated.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Probability that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Latency reported alongside a triggered fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code carried on the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional message for the simulated exception.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Free-form description of the fault.</summary>
    public string? Description { get; init; }

    /// <summary>When the fault was registered.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>When the fault stops applying; <c>null</c> means it never expires.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>Number of times the fault has caused a simulated failure.</summary>
    public int TriggerCount { get; init; }

    /// <summary>Whether the fault is considered when matching operations.</summary>
    public bool IsActive { get; init; } = true;
}
|
||||
|
||||
/// <summary>
/// Result of checking for chaos fault.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary>Whether the caller should simulate a failure.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>Fault that matched the operation, if any (may be set even when <see cref="ShouldFail"/> is false).</summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency to inject when the fault triggers.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception instance representing the simulated failure.</summary>
    public Exception? SimulatedException { get; init; }
}
|
||||
|
||||
/// <summary>
/// A chaos test scenario.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario.</summary>
    public required string Name { get; init; }

    /// <summary>Optional description.</summary>
    public string? Description { get; init; }

    /// <summary>Steps executed in list order.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Upper bound for the whole run; defaults to ten minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>When set, the run stops at the first failed step.</summary>
    public bool StopOnFirstFailure { get; init; }
}
|
||||
|
||||
/// <summary>
/// A step in a chaos scenario.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Display name of the step.</summary>
    public required string Name { get; init; }

    /// <summary>What the step does; selects which of the optional payloads below is read.</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Payload for <see cref="ChaosStepAction.InjectFault"/>.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Payload for <see cref="ChaosStepAction.RemoveFault"/>.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>Payload for <see cref="ChaosStepAction.Wait"/>.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Payload for <see cref="ChaosStepAction.Assert"/>.</summary>
    public ChaosAssertion? Assertion { get; init; }
}
|
||||
|
||||
/// <summary>
/// Action type for a chaos step.
/// </summary>
public enum ChaosStepAction
{
    InjectFault = 0,
    RemoveFault = 1,
    Wait = 2,
    Assert = 3,
    SendTestDelivery = 4,
    CheckMetrics = 5
}
|
||||
|
||||
/// <summary>
/// Assertion for chaos testing.
/// </summary>
public sealed record ChaosAssertion
{
    /// <summary>Kind of check to perform.</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Name of the metric the assertion refers to, when applicable.</summary>
    public string? MetricName { get; init; }

    /// <summary>Value the assertion compares against, when applicable.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Allowed deviation from <see cref="ExpectedValue"/>, when applicable.</summary>
    public double? Tolerance { get; init; }

    /// <summary>Status the assertion expects, when applicable.</summary>
    public string? ExpectedStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Type of chaos assertion.
/// </summary>
public enum ChaosAssertionType
{
    MetricEquals = 0,
    MetricGreaterThan = 1,
    MetricLessThan = 2,
    DeadLetterCountEquals = 3,
    FallbackTriggered = 4,
    AlertFired = 5
}
|
||||
|
||||
/// <summary>
/// Result of a chaos test.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier generated for this run.</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary>Whether the run completed without a failed step, timeout or error.</summary>
    public bool Success { get; init; }

    /// <summary>When the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>When the run finished.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>Elapsed time between start and completion.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the steps that were executed, in execution order.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Failure description, when one was recorded.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Result of a chaos step.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the executed step.</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the executed step.</summary>
    public required string StepName { get; init; }

    /// <summary>Whether the step succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the step started executing.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>How long the step took.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Failure description for an unsuccessful step.</summary>
    public string? Error { get; init; }

    /// <summary>Optional payload produced by the step.</summary>
    public object? Data { get; init; }
}
|
||||
|
||||
/// <summary>
/// Options for chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Configuration section these options are bound from.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Master switch; off by default.</summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos testing may run in production. NOTE(review): enforcement is not visible in this file.</summary>
    public bool AllowInProduction { get; set; }

    /// <summary>Longest lifetime a single fault may have; defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of faults registered at the same time; defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types chaos testing is meant for. NOTE(review): enforcement is not visible in this file.</summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } = ["webhook", "email", "slack", "teams", "pagerduty", "opsgenie"];
}
|
||||
|
||||
/// <summary>
/// Default implementation of chaos engine.
/// </summary>
/// <remarks>
/// Faults are held in memory and removed lazily once they expire. Scenario results are kept in a
/// bounded in-memory history.
/// </remarks>
public sealed class DefaultChaosEngine : IChaosEngine
{
    /// <summary>Maximum number of scenario results retained in the history.</summary>
    private const int MaxHistoryEntries = 100;

    private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
    private readonly List<ChaosTestResult> _testHistory = [];
    private readonly ChaosEngineOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<DefaultChaosEngine> _logger;

    /// <summary>
    /// Creates the engine.
    /// </summary>
    /// <param name="options">Engine options; default options are used when missing.</param>
    /// <param name="timeProvider">Clock used for timestamps and expiry; falls back to the system clock.</param>
    /// <param name="metrics">Optional metrics sink.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
    public DefaultChaosEngine(
        IOptions<ChaosEngineOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<DefaultChaosEngine> logger)
    {
        _options = options?.Value ?? new ChaosEngineOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Registers a new fault. The failure probability is clamped to [0, 1] and the lifetime is
    /// capped at <see cref="ChaosEngineOptions.MaxFaultDuration"/>.
    /// </summary>
    /// <param name="request">Fault to register.</param>
    /// <param name="cancellationToken">Not used; the call completes synchronously.</param>
    /// <returns>The registered fault.</returns>
    /// <exception cref="InvalidOperationException">
    /// The engine is disabled, or the maximum number of concurrent faults is already registered.
    /// </exception>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
    public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        ArgumentNullException.ThrowIfNull(request);

        // Expired faults must not count against the concurrency limit.
        CleanupExpiredFaults();

        if (_activeFaults.Count >= _options.MaxConcurrentFaults)
        {
            throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
        }

        var now = _timeProvider.GetUtcNow();
        var duration = request.Duration ?? _options.MaxFaultDuration;
        if (duration > _options.MaxFaultDuration)
        {
            duration = _options.MaxFaultDuration;
        }

        var fault = new ChaosFaultInjection
        {
            FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
            ChannelType = request.ChannelType,
            TenantId = request.TenantId,
            FaultType = request.FaultType,
            FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
            LatencyInjection = request.LatencyInjection,
            ErrorCode = request.ErrorCode,
            ErrorMessage = request.ErrorMessage,
            Description = request.Description,
            CreatedAt = now,
            ExpiresAt = now + duration,
            IsActive = true
        };

        _activeFaults[fault.FaultId] = fault;
        _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);

        return Task.FromResult(fault);
    }

    /// <summary>
    /// Removes a registered fault.
    /// </summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Not used; the call completes synchronously.</param>
    /// <returns><c>true</c> when the fault existed and was removed.</returns>
    public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
    {
        var removed = _activeFaults.TryRemove(faultId, out _);
        if (removed)
        {
            _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Returns a snapshot of the registered faults after dropping expired ones.
    /// </summary>
    /// <param name="cancellationToken">Not used; the call completes synchronously.</param>
    public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
    {
        CleanupExpiredFaults();
        return Task.FromResult<IReadOnlyList<ChaosFaultInjection>>(_activeFaults.Values.ToList());
    }

    /// <summary>
    /// Decides whether an operation on <paramref name="channelType"/> should fail.
    /// </summary>
    /// <param name="channelType">Channel being used; faults registered for <c>"*"</c> match every channel.</param>
    /// <param name="tenantId">Tenant performing the operation; faults without a tenant match every tenant.</param>
    /// <param name="cancellationToken">Not used; the call completes synchronously.</param>
    /// <returns>
    /// A result with <c>ShouldFail = false</c> when the engine is disabled, no fault matches, or the
    /// probability roll passes; otherwise the matching fault, its latency and a simulated exception.
    /// </returns>
    public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        CleanupExpiredFaults();

        var matchingFault = _activeFaults.Values.FirstOrDefault(f =>
            f.IsActive
            && (f.ChannelType == channelType || f.ChannelType == "*")
            && (f.TenantId is null || f.TenantId == tenantId));

        if (matchingFault is null)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
        }

        // Random.Shared is safe for concurrent callers, unlike a shared Random instance.
        var shouldFail = Random.Shared.NextDouble() < matchingFault.FailureProbability;
        if (!shouldFail)
        {
            return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
        }

        RecordTrigger(matchingFault.FaultId);

        var message = matchingFault.ErrorMessage;
        Exception exception = matchingFault.FaultType switch
        {
            ChaosFaultType.Outage => new InvalidOperationException(message ?? "Channel outage (chaos)"),
            ChaosFaultType.AuthFailure => new UnauthorizedAccessException(message ?? "Auth failure (chaos)"),
            ChaosFaultType.Timeout => new TimeoutException(message ?? "Timeout (chaos)"),
            ChaosFaultType.RateLimit => new InvalidOperationException(message ?? "Rate limited (chaos)"),
            _ => new Exception(message ?? "Chaos fault")
        };

        return Task.FromResult(new ChaosFaultResult
        {
            ShouldFail = true,
            ActiveFault = matchingFault,
            InjectedLatency = matchingFault.LatencyInjection,
            SimulatedException = exception
        });
    }

    /// <summary>
    /// Runs the steps of a chaos scenario in order and appends the outcome to the test history.
    /// </summary>
    /// <param name="scenario">Scenario to run; its <c>Timeout</c> bounds the whole run.</param>
    /// <param name="cancellationToken">Caller token, linked with the scenario timeout.</param>
    /// <returns>
    /// The recorded result. Step failures, timeouts and exceptions raised while running steps are
    /// reported through <c>Success</c> and <c>Error</c> instead of being thrown.
    /// </returns>
    /// <exception cref="InvalidOperationException">The chaos engine is disabled.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="scenario"/> is null.</exception>
    public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
    {
        if (!_options.Enabled)
        {
            throw new InvalidOperationException("Chaos engine is disabled");
        }

        ArgumentNullException.ThrowIfNull(scenario);

        var testId = $"test-{Guid.NewGuid():N}"[..16];
        var startedAt = _timeProvider.GetUtcNow();
        var stepResults = new List<ChaosStepResult>();
        string? error = null;
        var success = true;

        _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

        using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

        try
        {
            foreach (var step in scenario.Steps)
            {
                // Inject/remove/assert steps never observe the token, so check it before each
                // step; otherwise a timed-out scenario would keep executing its remaining steps.
                linkedCts.Token.ThrowIfCancellationRequested();

                var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
                stepResults.Add(stepResult);

                if (stepResult.Success)
                {
                    continue;
                }

                success = false;
                if (scenario.StopOnFirstFailure)
                {
                    error = $"Step '{step.Name}' failed: {stepResult.Error}";
                    break;
                }
            }
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            success = false;
            error = "Scenario timed out";
        }
        catch (Exception ex)
        {
            // Includes caller-initiated cancellation: the run is recorded as failed, not rethrown.
            success = false;
            error = ex.Message;
        }

        var completedAt = _timeProvider.GetUtcNow();
        var result = new ChaosTestResult
        {
            TestId = testId,
            ScenarioId = scenario.ScenarioId,
            ScenarioName = scenario.Name,
            Success = success,
            StartedAt = startedAt,
            CompletedAt = completedAt,
            Duration = completedAt - startedAt,
            StepResults = stepResults,
            Error = error
        };

        lock (_testHistory)
        {
            _testHistory.Add(result);
            while (_testHistory.Count > MaxHistoryEntries)
            {
                _testHistory.RemoveAt(0);
            }
        }

        _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
        return result;
    }

    /// <summary>
    /// Returns the most recent scenario results, newest first.
    /// </summary>
    /// <param name="limit">Maximum number of results to return.</param>
    /// <param name="cancellationToken">Not used; the call completes synchronously.</param>
    public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
    {
        lock (_testHistory)
        {
            return Task.FromResult<IReadOnlyList<ChaosTestResult>>(_testHistory.TakeLast(limit).Reverse().ToList());
        }
    }

    /// <summary>
    /// Removes every registered fault, expired or not.
    /// </summary>
    /// <param name="cancellationToken">Not used; the call completes synchronously.</param>
    public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
    {
        _activeFaults.Clear();
        _logger.LogInformation("Cleared all chaos faults");
        return Task.CompletedTask;
    }

    /// <summary>
    /// Executes a single scenario step and converts its outcome into a <see cref="ChaosStepResult"/>.
    /// Ordinary exceptions become failed results; cancellation of the token is rethrown.
    /// </summary>
    private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
    {
        var executedAt = _timeProvider.GetUtcNow();

        ChaosStepResult Result(bool success, string? error = null, object? data = null) => new()
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = success,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Error = error,
            Data = data
        };

        try
        {
            object? data = null;

            // NOTE(review): a step whose action has no matching case below (missing payload, or
            // SendTestDelivery / CheckMetrics) falls through and is reported as a successful no-op.
            switch (step.Action)
            {
                case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                    data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                    break;

                case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                    await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                    break;

                case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                    await Task.Delay(step.WaitDuration.Value, cancellationToken);
                    break;

                case ChaosStepAction.Assert when step.Assertion is not null:
                    var assertResult = EvaluateAssertion(step.Assertion);
                    if (!assertResult.passed)
                    {
                        return Result(success: false, error: assertResult.error);
                    }

                    break;
            }

            return Result(success: true, data: data);
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Cancellation is not a step failure: let RunScenarioAsync report timeout/cancellation.
            throw;
        }
        catch (Exception ex)
        {
            return Result(success: false, error: ex.Message);
        }
    }

    /// <summary>
    /// Evaluates a scenario assertion. Placeholder implementation: every assertion type currently passes.
    /// </summary>
    private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
    {
        // Simplified assertion evaluation - in production would query actual metrics
        return assertion.Type switch
        {
            ChaosAssertionType.FallbackTriggered => (true, null),
            ChaosAssertionType.AlertFired => (true, null),
            _ => (true, null)
        };
    }

    /// <summary>
    /// Increments the trigger count of a fault with a compare-and-swap loop, so that a fault
    /// removed concurrently is not re-added and concurrent increments are not lost.
    /// </summary>
    private void RecordTrigger(string faultId)
    {
        while (_activeFaults.TryGetValue(faultId, out var current))
        {
            var updated = current with { TriggerCount = current.TriggerCount + 1 };
            if (_activeFaults.TryUpdate(faultId, updated, current))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Removes faults whose <c>ExpiresAt</c> lies before the current time of the injected time provider.
    /// </summary>
    private void CleanupExpiredFaults()
    {
        var now = _timeProvider.GetUtcNow();
        var expired = _activeFaults
            .Where(f => f.Value.ExpiresAt.HasValue && f.Value.ExpiresAt < now)
            .Select(f => f.Key)
            .ToList();

        foreach (var id in expired)
        {
            // Log only when this call actually removed the entry; another caller may have won the race.
            if (_activeFaults.TryRemove(id, out _))
            {
                _logger.LogDebug("Expired chaos fault {FaultId}", id);
            }
        }
    }
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,351 +1,351 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Handles notifications that have failed permanently after all retries.
/// </summary>
public interface IDeadLetterHandler
{
    /// <summary>Moves a delivery to the dead-letter queue.</summary>
    /// <param name="tenantId">Tenant that owns the delivery.</param>
    /// <param name="deliveryId">Identifier of the failed delivery.</param>
    /// <param name="reason">Why the delivery is being dead-lettered.</param>
    /// <param name="channelType">Channel the delivery was sent through.</param>
    /// <param name="payload">Original payload, if available.</param>
    /// <param name="exception">Exception that caused the failure, if any.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The stored dead-letter entry.</returns>
    Task<DeadLetteredDelivery> DeadLetterAsync(
        string tenantId,
        string deliveryId,
        DeadLetterReason reason,
        string channelType,
        object? payload = null,
        Exception? exception = null,
        CancellationToken cancellationToken = default);

    /// <summary>Gets dead-lettered deliveries for a tenant.</summary>
    /// <param name="tenantId">Tenant to query.</param>
    /// <param name="query">Optional filter and paging settings.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default);

    /// <summary>Retries a dead-lettered delivery.</summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<DeadLetterRetryResult> RetryAsync(
        string tenantId,
        string deadLetterId,
        CancellationToken cancellationToken = default);

    /// <summary>Retries all matching dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant that owns the entries.</param>
    /// <param name="query">Optional filter selecting the entries to retry.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<DeadLetterBulkRetryResult> RetryBulkAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default);

    /// <summary>Discards a dead-lettered delivery.</summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="reason">Optional explanation for discarding.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<bool> DiscardAsync(
        string tenantId,
        string deadLetterId,
        string? reason = null,
        CancellationToken cancellationToken = default);

    /// <summary>Gets statistics about dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant to report on, or <c>null</c>.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<DeadLetterStats> GetStatsAsync(
        string? tenantId = null,
        CancellationToken cancellationToken = default);

    /// <summary>Purges old dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant to purge, or <c>null</c>.</param>
    /// <param name="olderThan">Age threshold for purging.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>A count reported by the implementation (presumably the number of purged entries).</returns>
    Task<int> PurgeAsync(
        string? tenantId,
        TimeSpan olderThan,
        CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Reason for dead-lettering.
/// </summary>
public enum DeadLetterReason
{
    MaxRetriesExceeded = 0,
    InvalidPayload = 1,
    ChannelUnavailable = 2,
    AuthenticationFailed = 3,
    RateLimited = 4,
    TemplateRenderFailed = 5,
    ConfigurationError = 6,
    UnknownError = 7
}
|
||||
|
||||
/// <summary>
/// A dead-lettered delivery.
/// </summary>
public sealed record DeadLetteredDelivery
{
    /// <summary>Identifier of the dead-letter entry.</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Tenant that owns the delivery.</summary>
    public required string TenantId { get; init; }

    /// <summary>Identifier of the original delivery.</summary>
    public required string DeliveryId { get; init; }

    /// <summary>Channel the delivery was sent through.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Why the delivery was dead-lettered.</summary>
    public required DeadLetterReason Reason { get; init; }

    /// <summary>Additional detail about the reason.</summary>
    public string? ReasonDetails { get; init; }

    /// <summary>Payload of the original delivery, if captured.</summary>
    public object? OriginalPayload { get; init; }

    /// <summary>Type name of the exception that caused the failure, if any.</summary>
    public string? ExceptionType { get; init; }

    /// <summary>Message of the exception that caused the failure, if any.</summary>
    public string? ExceptionMessage { get; init; }

    /// <summary>Number of delivery attempts made.</summary>
    public int AttemptCount { get; init; }

    /// <summary>Timestamp of the first delivery attempt.</summary>
    public DateTimeOffset FirstAttemptAt { get; init; }

    /// <summary>When the delivery was dead-lettered.</summary>
    public DateTimeOffset DeadLetteredAt { get; init; }

    /// <summary>When the entry was last retried, if ever.</summary>
    public DateTimeOffset? LastRetryAt { get; init; }

    /// <summary>Number of retries performed on the entry.</summary>
    public int RetryCount { get; init; }

    /// <summary>Current status; new entries start as <see cref="DeadLetterStatus.Pending"/>.</summary>
    public DeadLetterStatus Status { get; init; } = DeadLetterStatus.Pending;

    /// <summary>Explanation recorded when the entry was discarded.</summary>
    public string? DiscardReason { get; init; }
}
|
||||
|
||||
/// <summary>
/// Status of a dead-lettered delivery.
/// </summary>
public enum DeadLetterStatus
{
    Pending = 0,
    Retrying = 1,
    Retried = 2,
    Discarded = 3
}
|
||||
|
||||
/// <summary>
/// Query for dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterQuery
{
    /// <summary>Restricts results to the entry with this dead-letter identifier.</summary>
    public string? Id { get; init; }

    /// <summary>Restricts results to this reason.</summary>
    public DeadLetterReason? Reason { get; init; }

    /// <summary>Restricts results to this channel type.</summary>
    public string? ChannelType { get; init; }

    /// <summary>Restricts results to this status.</summary>
    public DeadLetterStatus? Status { get; init; }

    /// <summary>Only entries dead-lettered after this instant.</summary>
    public DateTimeOffset? After { get; init; }

    /// <summary>Only entries dead-lettered before this instant.</summary>
    public DateTimeOffset? Before { get; init; }

    /// <summary>Maximum number of entries to return; defaults to 100.</summary>
    public int Limit { get; init; } = 100;

    /// <summary>Number of matching entries to skip.</summary>
    public int Offset { get; init; }
}
|
||||
|
||||
/// <summary>
/// Result of a retry attempt.
/// </summary>
public sealed record DeadLetterRetryResult
{
    /// <summary>Identifier of the entry the retry targeted.</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Whether the retry was accepted.</summary>
    public bool Success { get; init; }

    /// <summary>Failure description when <see cref="Success"/> is false.</summary>
    public string? Error { get; init; }

    /// <summary>Status reported for the entry after the attempt.</summary>
    public DeadLetterStatus NewStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Result of a bulk retry operation.
/// </summary>
public sealed record DeadLetterBulkRetryResult
{
    /// <summary>Number of entries the operation attempted.</summary>
    public int Total { get; init; }

    /// <summary>Number of successful retries.</summary>
    public int Succeeded { get; init; }

    /// <summary>Number of failed retries.</summary>
    public int Failed { get; init; }

    /// <summary>Per-entry results; empty by default.</summary>
    public IReadOnlyList<DeadLetterRetryResult> Results { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Statistics about dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterStats
{
    /// <summary>When the statistics were computed.</summary>
    public DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the statistics cover, if scoped to one.</summary>
    public string? TenantId { get; init; }

    /// <summary>Total number of entries.</summary>
    public int TotalCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Pending"/>.</summary>
    public int PendingCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Retrying"/>.</summary>
    public int RetryingCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Retried"/>.</summary>
    public int RetriedCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Discarded"/>.</summary>
    public int DiscardedCount { get; init; }

    /// <summary>Entry counts keyed by reason; empty by default.</summary>
    public IReadOnlyDictionary<DeadLetterReason, int> ByReason { get; init; } = new Dictionary<DeadLetterReason, int>();

    /// <summary>Entry counts keyed by channel; empty by default.</summary>
    public IReadOnlyDictionary<string, int> ByChannel { get; init; } = new Dictionary<string, int>();

    /// <summary>Timestamp of the oldest entry, if any.</summary>
    public DateTimeOffset? OldestDeadLetterAt { get; init; }

    /// <summary>Timestamp of the newest entry, if any.</summary>
    public DateTimeOffset? NewestDeadLetterAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Options for dead-letter handling.
/// </summary>
public sealed class DeadLetterOptions
{
    /// <summary>Configuration section these options are bound from.</summary>
    public const string SectionName = "Notifier:Observability:DeadLetter";

    /// <summary>Whether dead-letter handling is enabled; on by default.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Maximum number of retry attempts; defaults to 3.</summary>
    public int MaxRetryAttempts { get; set; } = 3;

    /// <summary>Delay between retries; defaults to five minutes.</summary>
    public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMinutes(5);

    /// <summary>How long entries are retained; defaults to 30 days.</summary>
    public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromDays(30);

    /// <summary>Whether old entries are purged automatically; on by default.</summary>
    public bool AutoPurge { get; set; } = true;

    /// <summary>Interval between automatic purges; defaults to 24 hours.</summary>
    public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);

    /// <summary>Entry-count threshold for alerting; defaults to 100.</summary>
    public int AlertThreshold { get; set; } = 100;
}
|
||||
|
||||
/// <summary>
|
||||
/// In-memory implementation of dead-letter handler.
|
||||
/// </summary>
|
||||
public sealed class InMemoryDeadLetterHandler : IDeadLetterHandler
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, List<DeadLetteredDelivery>> _deadLetters = new();
|
||||
private readonly DeadLetterOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly INotifierMetrics? _metrics;
|
||||
private readonly ILogger<InMemoryDeadLetterHandler> _logger;
|
||||
|
||||
/// <summary>
/// Creates the handler.
/// </summary>
/// <param name="options">Handler options; default options are used when missing.</param>
/// <param name="timeProvider">Clock used for timestamps; falls back to the system clock.</param>
/// <param name="metrics">Optional metrics sink.</param>
/// <param name="logger">Logger; required.</param>
/// <exception cref="ArgumentNullException"><paramref name="logger"/> is null.</exception>
public InMemoryDeadLetterHandler(
    IOptions<DeadLetterOptions> options,
    TimeProvider timeProvider,
    INotifierMetrics? metrics,
    ILogger<InMemoryDeadLetterHandler> logger)
{
    _options = options?.Value ?? new DeadLetterOptions();
    _timeProvider = timeProvider ?? TimeProvider.System;
    _metrics = metrics;

    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Stores a failed delivery in the tenant's in-memory dead-letter list, records a metric and logs a warning.
/// </summary>
/// <param name="tenantId">Tenant that owns the delivery.</param>
/// <param name="deliveryId">Identifier of the failed delivery.</param>
/// <param name="reason">Why the delivery is being dead-lettered.</param>
/// <param name="channelType">Channel the delivery was sent through.</param>
/// <param name="payload">Original payload, stored as-is.</param>
/// <param name="exception">Exception whose type name and message are copied onto the entry.</param>
/// <param name="cancellationToken">Not used; the call completes synchronously.</param>
/// <returns>The stored entry, with a generated identifier and <see cref="DeadLetterStatus.Pending"/> status.</returns>
public Task<DeadLetteredDelivery> DeadLetterAsync(
    string tenantId,
    string deliveryId,
    DeadLetterReason reason,
    string channelType,
    object? payload = null,
    Exception? exception = null,
    CancellationToken cancellationToken = default)
{
    var timestamp = _timeProvider.GetUtcNow();
    var exceptionMessage = exception?.Message;

    var entry = new DeadLetteredDelivery
    {
        DeadLetterId = $"dl-{Guid.NewGuid():N}"[..16],
        TenantId = tenantId,
        DeliveryId = deliveryId,
        ChannelType = channelType,
        Reason = reason,
        ReasonDetails = exceptionMessage,
        OriginalPayload = payload,
        ExceptionType = exception?.GetType().FullName,
        ExceptionMessage = exceptionMessage,
        // Both timestamps are set to the moment of dead-lettering.
        DeadLetteredAt = timestamp,
        FirstAttemptAt = timestamp,
        Status = DeadLetterStatus.Pending
    };

    var tenantEntries = _deadLetters.GetOrAdd(tenantId, _ => []);
    lock (tenantEntries)
    {
        tenantEntries.Add(entry);
    }

    _metrics?.RecordDeadLetter(tenantId, reason.ToString(), channelType);
    _logger.LogWarning("Dead-lettered delivery {DeliveryId} for tenant {TenantId}: {Reason}", deliveryId, tenantId, reason);

    return Task.FromResult(entry);
}
|
||||
|
||||
/// <summary>
/// Returns a tenant's dead-lettered deliveries, newest first, after applying the optional filter and paging.
/// </summary>
/// <param name="tenantId">Tenant to query.</param>
/// <param name="query">Optional filter; without it the first 100 entries are returned.</param>
/// <param name="cancellationToken">Not used; the call completes synchronously.</param>
/// <returns>The matching entries, or an empty list when the tenant has none.</returns>
public Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
    string tenantId,
    DeadLetterQuery? query = null,
    CancellationToken cancellationToken = default)
{
    if (!_deadLetters.TryGetValue(tenantId, out var tenantEntries))
    {
        return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>([]);
    }

    // Work on a copy taken under the lock so filtering never races with writers.
    List<DeadLetteredDelivery> snapshot;
    lock (tenantEntries)
    {
        snapshot = tenantEntries.ToList();
    }

    IEnumerable<DeadLetteredDelivery> matches = snapshot;
    if (query is not null)
    {
        matches = matches.Where(d =>
            (string.IsNullOrWhiteSpace(query.Id) || d.DeadLetterId == query.Id)
            && (!query.Reason.HasValue || d.Reason == query.Reason.Value)
            && (string.IsNullOrEmpty(query.ChannelType) || d.ChannelType == query.ChannelType)
            && (!query.Status.HasValue || d.Status == query.Status.Value)
            && (!query.After.HasValue || d.DeadLetteredAt > query.After.Value)
            && (!query.Before.HasValue || d.DeadLetteredAt < query.Before.Value));
    }

    var offset = query?.Offset ?? 0;
    var limit = query?.Limit ?? 100;

    var page = matches
        .OrderByDescending(d => d.DeadLetteredAt)
        .Skip(offset)
        .Take(limit)
        .ToList();

    return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>(page);
}
|
||||
|
||||
/// <summary>
/// Marks a dead-lettered delivery as retried.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
/// <param name="cancellationToken">Not observed; the update completes synchronously.</param>
/// <returns>
/// A successful result with <see cref="DeadLetterStatus.Retried"/> when the entry exists; otherwise a
/// failed result with error "Not found" and status <see cref="DeadLetterStatus.Pending"/>.
/// </returns>
public Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default)
{
    var updated = false;

    if (_deadLetters.TryGetValue(tenantId, out var list))
    {
        // Look up and replace inside a single lock. Reading the entry in one critical section and
        // writing it back in another would overwrite a concurrent change with a stale copy, and would
        // report success even if the entry had been purged in between.
        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index >= 0)
            {
                var current = list[index];
                list[index] = current with
                {
                    Status = DeadLetterStatus.Retried,
                    LastRetryAt = _timeProvider.GetUtcNow(),
                    RetryCount = current.RetryCount + 1
                };
                updated = true;
            }
        }
    }

    if (!updated)
    {
        return Task.FromResult(new DeadLetterRetryResult
        {
            DeadLetterId = deadLetterId,
            Success = false,
            Error = "Not found",
            NewStatus = DeadLetterStatus.Pending
        });
    }

    _logger.LogInformation("Retrying dead-lettered delivery {DeadLetterId} for tenant {TenantId}", deadLetterId, tenantId);
    return Task.FromResult(new DeadLetterRetryResult
    {
        DeadLetterId = deadLetterId,
        Success = true,
        NewStatus = DeadLetterStatus.Retried
    });
}
|
||||
|
||||
/// <summary>
/// Retries the pending dead-lettered deliveries that match a query.
/// </summary>
/// <param name="tenantId">Tenant whose entries are retried.</param>
/// <param name="query">
/// Optional filters and paging. Only pending entries are ever retried, so a query that asks for any
/// other status matches nothing.
/// </param>
/// <param name="cancellationToken">Passed through to the lookup and to each retry.</param>
/// <returns>Totals and per-entry results for the retries that were attempted.</returns>
public async Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default)
{
    if (query?.Status is { } requestedStatus && requestedStatus != DeadLetterStatus.Pending)
    {
        return new DeadLetterBulkRetryResult();
    }

    // Filter on Pending inside the query so that Offset/Limit page over pending entries. Filtering
    // after paging would skip pending entries that fall behind a page of already-handled ones.
    var pendingQuery = (query ?? new DeadLetterQuery()) with { Status = DeadLetterStatus.Pending };
    var candidates = await GetAsync(tenantId, pendingQuery, cancellationToken);

    var results = new List<DeadLetterRetryResult>(candidates.Count);
    var succeeded = 0;
    foreach (var candidate in candidates)
    {
        var outcome = await RetryAsync(tenantId, candidate.DeadLetterId, cancellationToken);
        if (outcome.Success)
        {
            succeeded++;
        }

        results.Add(outcome);
    }

    return new DeadLetterBulkRetryResult
    {
        Total = results.Count,
        Succeeded = succeeded,
        Failed = results.Count - succeeded,
        Results = results
    };
}
|
||||
|
||||
/// <summary>
/// Marks a dead-lettered delivery as discarded and records the optional reason on it.
/// </summary>
/// <param name="tenantId">Tenant that owns the entry.</param>
/// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
/// <param name="reason">Optional text stored as the entry's discard reason.</param>
/// <param name="cancellationToken">Not observed; the update completes synchronously.</param>
/// <returns><c>true</c> when the entry was found and updated; otherwise <c>false</c>.</returns>
public Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default)
{
    var discarded = false;

    if (_deadLetters.TryGetValue(tenantId, out var entries))
    {
        lock (entries)
        {
            // Only the first entry with a matching id is updated.
            for (var i = 0; i < entries.Count; i++)
            {
                if (entries[i].DeadLetterId != deadLetterId)
                {
                    continue;
                }

                entries[i] = entries[i] with { Status = DeadLetterStatus.Discarded, DiscardReason = reason };
                discarded = true;
                break;
            }
        }
    }

    if (discarded)
    {
        _logger.LogInformation("Discarded dead-lettered delivery {DeadLetterId} for tenant {TenantId}: {Reason}", deadLetterId, tenantId, reason ?? "No reason");
    }

    return Task.FromResult(discarded);
}
|
||||
|
||||
/// <summary>
/// Computes counts and time bounds over dead-lettered deliveries.
/// </summary>
/// <param name="tenantId">Tenant to report on, or <c>null</c> to aggregate over every tenant.</param>
/// <param name="cancellationToken">Not observed; the computation completes synchronously.</param>
/// <returns>A completed task with the statistics; all counts are zero when there are no entries.</returns>
public Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
{
    // Copy each tenant list under its own lock. The lists are mutated under that lock elsewhere, and
    // enumerating a List<T> during a concurrent change throws InvalidOperationException.
    var all = new List<DeadLetteredDelivery>();
    if (tenantId is null)
    {
        foreach (var tenantList in _deadLetters.Values)
        {
            lock (tenantList)
            {
                all.AddRange(tenantList);
            }
        }
    }
    else if (_deadLetters.TryGetValue(tenantId, out var singleList))
    {
        lock (singleList)
        {
            all.AddRange(singleList);
        }
    }

    return Task.FromResult(new DeadLetterStats
    {
        Timestamp = _timeProvider.GetUtcNow(),
        TenantId = tenantId,
        TotalCount = all.Count,
        PendingCount = all.Count(d => d.Status == DeadLetterStatus.Pending),
        RetryingCount = all.Count(d => d.Status == DeadLetterStatus.Retrying),
        RetriedCount = all.Count(d => d.Status == DeadLetterStatus.Retried),
        DiscardedCount = all.Count(d => d.Status == DeadLetterStatus.Discarded),
        ByReason = all.GroupBy(d => d.Reason).ToDictionary(g => g.Key, g => g.Count()),
        ByChannel = all.GroupBy(d => d.ChannelType).ToDictionary(g => g.Key, g => g.Count()),
        OldestDeadLetterAt = all.MinBy(d => d.DeadLetteredAt)?.DeadLetteredAt,
        NewestDeadLetterAt = all.MaxBy(d => d.DeadLetteredAt)?.DeadLetteredAt
    });
}
|
||||
|
||||
/// <summary>
/// Removes dead-lettered deliveries that were dead-lettered before "now minus <paramref name="olderThan"/>".
/// </summary>
/// <param name="tenantId">Tenant to purge, or <c>null</c> to purge every tenant.</param>
/// <param name="olderThan">Age beyond which entries are removed.</param>
/// <param name="cancellationToken">Not observed; the purge completes synchronously.</param>
/// <returns>A completed task with the number of entries removed.</returns>
public Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default)
{
    var threshold = _timeProvider.GetUtcNow() - olderThan;

    List<string> targetTenants = tenantId is null
        ? _deadLetters.Keys.ToList()
        : new List<string> { tenantId };

    var removedTotal = 0;
    foreach (var key in targetTenants)
    {
        if (_deadLetters.TryGetValue(key, out var entries))
        {
            lock (entries)
            {
                removedTotal += entries.RemoveAll(entry => entry.DeadLetteredAt < threshold);
            }
        }
    }

    _logger.LogInformation("Purged {Count} dead-lettered deliveries older than {OlderThan}", removedTotal, olderThan);
    return Task.FromResult(removedTotal);
}
|
||||
}
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Dead-letter queue contract for notification deliveries that failed permanently after all retries.
/// </summary>
public interface IDeadLetterHandler
{
    /// <summary>Moves a delivery to the dead-letter queue.</summary>
    /// <param name="tenantId">Tenant that owns the delivery.</param>
    /// <param name="deliveryId">Identifier of the failed delivery.</param>
    /// <param name="reason">Why the delivery is being dead-lettered.</param>
    /// <param name="channelType">Channel type the delivery was addressed to.</param>
    /// <param name="payload">Optional original payload to keep with the entry.</param>
    /// <param name="exception">Optional exception that caused the failure.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The dead-letter entry that was created.</returns>
    Task<DeadLetteredDelivery> DeadLetterAsync(string tenantId, string deliveryId, DeadLetterReason reason, string channelType, object? payload = null, Exception? exception = null, CancellationToken cancellationToken = default);

    /// <summary>Gets dead-lettered deliveries for a tenant.</summary>
    /// <param name="tenantId">Tenant whose entries are read.</param>
    /// <param name="query">Optional filters and paging.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The matching entries.</returns>
    Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default);

    /// <summary>Retries a dead-lettered delivery.</summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The outcome of the retry.</returns>
    Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default);

    /// <summary>Retries all matching dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant whose entries are retried.</param>
    /// <param name="query">Optional filters and paging that select the entries.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>Totals and per-entry outcomes.</returns>
    Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default);

    /// <summary>Discards a dead-lettered delivery.</summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="reason">Optional explanation for the discard.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns><c>true</c> when the entry was discarded; otherwise <c>false</c>.</returns>
    Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default);

    /// <summary>Gets statistics about dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant to report on; optional.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The computed statistics.</returns>
    Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>Purges old dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant to purge; may be <c>null</c>.</param>
    /// <param name="olderThan">Age beyond which entries are purged.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The number of entries purged.</returns>
    Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Why a delivery was moved to the dead-letter queue.
/// </summary>
/// <remarks>Numeric values are spelled out and equal the implicit declaration-order values.</remarks>
public enum DeadLetterReason
{
    MaxRetriesExceeded = 0,
    InvalidPayload = 1,
    ChannelUnavailable = 2,
    AuthenticationFailed = 3,
    RateLimited = 4,
    TemplateRenderFailed = 5,
    ConfigurationError = 6,
    UnknownError = 7
}
|
||||
|
||||
/// <summary>
/// One entry in the dead-letter queue: the failed delivery plus its retry and discard bookkeeping.
/// </summary>
public sealed record DeadLetteredDelivery
{
    // Identity

    /// <summary>Identifier of this dead-letter entry.</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Tenant that owns the delivery.</summary>
    public required string TenantId { get; init; }

    /// <summary>Identifier of the delivery that failed.</summary>
    public required string DeliveryId { get; init; }

    /// <summary>Channel type the delivery was addressed to.</summary>
    public required string ChannelType { get; init; }

    // Failure details

    /// <summary>Why the delivery was dead-lettered.</summary>
    public required DeadLetterReason Reason { get; init; }

    /// <summary>Optional free-text detail accompanying <see cref="Reason"/>.</summary>
    public string? ReasonDetails { get; init; }

    /// <summary>Optional original payload kept with the entry.</summary>
    public object? OriginalPayload { get; init; }

    /// <summary>Optional type name of the exception that caused the failure.</summary>
    public string? ExceptionType { get; init; }

    /// <summary>Optional message of the exception that caused the failure.</summary>
    public string? ExceptionMessage { get; init; }

    // Attempt and retry bookkeeping

    /// <summary>Attempt count for the delivery; defaults to 0.</summary>
    public int AttemptCount { get; init; }

    /// <summary>Timestamp of the first attempt.</summary>
    public DateTimeOffset FirstAttemptAt { get; init; }

    /// <summary>Timestamp at which the delivery was dead-lettered.</summary>
    public DateTimeOffset DeadLetteredAt { get; init; }

    /// <summary>Timestamp of the most recent retry, or <c>null</c> when none is recorded.</summary>
    public DateTimeOffset? LastRetryAt { get; init; }

    /// <summary>Retry count for the entry; defaults to 0.</summary>
    public int RetryCount { get; init; }

    /// <summary>Current status; defaults to <see cref="DeadLetterStatus.Pending"/>.</summary>
    public DeadLetterStatus Status { get; init; } = DeadLetterStatus.Pending;

    /// <summary>Optional explanation recorded with a discard.</summary>
    public string? DiscardReason { get; init; }
}
|
||||
|
||||
/// <summary>
/// Lifecycle status of a dead-lettered delivery.
/// </summary>
/// <remarks>Numeric values are spelled out and equal the implicit declaration-order values.</remarks>
public enum DeadLetterStatus
{
    Pending = 0,
    Retrying = 1,
    Retried = 2,
    Discarded = 3
}
|
||||
|
||||
/// <summary>
/// Filters and paging for listing dead-lettered deliveries. Every filter is optional.
/// </summary>
public sealed record DeadLetterQuery
{
    // Filters

    /// <summary>Dead-letter identifier to match.</summary>
    public string? Id { get; init; }

    /// <summary>Dead-letter reason to match.</summary>
    public DeadLetterReason? Reason { get; init; }

    /// <summary>Channel type to match.</summary>
    public string? ChannelType { get; init; }

    /// <summary>Status to match.</summary>
    public DeadLetterStatus? Status { get; init; }

    /// <summary>Lower time bound for the dead-lettered timestamp.</summary>
    public DateTimeOffset? After { get; init; }

    /// <summary>Upper time bound for the dead-lettered timestamp.</summary>
    public DateTimeOffset? Before { get; init; }

    // Paging

    /// <summary>Maximum number of entries to return; defaults to 100.</summary>
    public int Limit { get; init; } = 100;

    /// <summary>Number of entries to skip; defaults to 0.</summary>
    public int Offset { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of retrying one dead-lettered delivery.
/// </summary>
public sealed record DeadLetterRetryResult
{
    /// <summary>Identifier of the entry the retry was requested for.</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Whether the retry succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>Optional error text.</summary>
    public string? Error { get; init; }

    /// <summary>Status reported for the entry after the retry.</summary>
    public DeadLetterStatus NewStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Aggregated outcome of a bulk retry.
/// </summary>
public sealed record DeadLetterBulkRetryResult
{
    /// <summary>Number of retries attempted.</summary>
    public int Total { get; init; }

    /// <summary>Number of retries that succeeded.</summary>
    public int Succeeded { get; init; }

    /// <summary>Number of retries that failed.</summary>
    public int Failed { get; init; }

    /// <summary>Per-entry outcomes; empty by default.</summary>
    public IReadOnlyList<DeadLetterRetryResult> Results { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Point-in-time statistics about dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterStats
{
    /// <summary>Timestamp associated with the statistics.</summary>
    public DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the statistics refer to; may be <c>null</c>.</summary>
    public string? TenantId { get; init; }

    // Counts

    /// <summary>Total number of entries.</summary>
    public int TotalCount { get; init; }

    /// <summary>Number of entries in <see cref="DeadLetterStatus.Pending"/>.</summary>
    public int PendingCount { get; init; }

    /// <summary>Number of entries in <see cref="DeadLetterStatus.Retrying"/>.</summary>
    public int RetryingCount { get; init; }

    /// <summary>Number of entries in <see cref="DeadLetterStatus.Retried"/>.</summary>
    public int RetriedCount { get; init; }

    /// <summary>Number of entries in <see cref="DeadLetterStatus.Discarded"/>.</summary>
    public int DiscardedCount { get; init; }

    // Breakdowns

    /// <summary>Entry counts keyed by dead-letter reason; empty by default.</summary>
    public IReadOnlyDictionary<DeadLetterReason, int> ByReason { get; init; } = new Dictionary<DeadLetterReason, int>();

    /// <summary>Entry counts keyed by channel type; empty by default.</summary>
    public IReadOnlyDictionary<string, int> ByChannel { get; init; } = new Dictionary<string, int>();

    // Time bounds

    /// <summary>Earliest dead-lettered timestamp, or <c>null</c> when unset.</summary>
    public DateTimeOffset? OldestDeadLetterAt { get; init; }

    /// <summary>Latest dead-lettered timestamp, or <c>null</c> when unset.</summary>
    public DateTimeOffset? NewestDeadLetterAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration options for dead-letter handling.
/// </summary>
public sealed class DeadLetterOptions
{
    /// <summary>Configuration section name for these options.</summary>
    public const string SectionName = "Notifier:Observability:DeadLetter";

    /// <summary>Enabled flag; defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Maximum retry attempts; defaults to 3.</summary>
    public int MaxRetryAttempts { get; set; } = 3;

    /// <summary>Retry delay; defaults to 5 minutes.</summary>
    public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMinutes(5);

    /// <summary>Retention period; defaults to 30 days.</summary>
    public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromDays(30);

    /// <summary>Automatic purge flag; defaults to <c>true</c>.</summary>
    public bool AutoPurge { get; set; } = true;

    /// <summary>Purge interval; defaults to 24 hours.</summary>
    public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);

    /// <summary>Alert threshold; defaults to 100.</summary>
    public int AlertThreshold { get; set; } = 100;
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of dead-letter handler.
/// </summary>
/// <remarks>
/// Entries are held per tenant in a <see cref="List{T}"/> stored in a concurrent dictionary. Every read
/// or write of a tenant's list is done while holding a lock on that list. Nothing is persisted.
/// </remarks>
public sealed class InMemoryDeadLetterHandler : IDeadLetterHandler
{
    private readonly ConcurrentDictionary<string, List<DeadLetteredDelivery>> _deadLetters = new();
    private readonly DeadLetterOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<InMemoryDeadLetterHandler> _logger;

    /// <summary>
    /// Creates the handler.
    /// </summary>
    /// <param name="options">Dead-letter options; defaults are used when no value is supplied.</param>
    /// <param name="timeProvider">Clock used for timestamps; the system clock is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public InMemoryDeadLetterHandler(
        IOptions<DeadLetterOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<InMemoryDeadLetterHandler> logger)
    {
        _options = options?.Value ?? new DeadLetterOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Creates a pending dead-letter entry for a delivery and stores it under the tenant.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the delivery.</param>
    /// <param name="deliveryId">Identifier of the failed delivery.</param>
    /// <param name="reason">Why the delivery is being dead-lettered.</param>
    /// <param name="channelType">Channel type the delivery was addressed to.</param>
    /// <param name="payload">Optional original payload kept with the entry.</param>
    /// <param name="exception">Optional exception; its type name and message are copied onto the entry.</param>
    /// <param name="cancellationToken">Not observed; the operation completes synchronously.</param>
    /// <returns>A completed task with the stored entry.</returns>
    public Task<DeadLetteredDelivery> DeadLetterAsync(
        string tenantId,
        string deliveryId,
        DeadLetterReason reason,
        string channelType,
        object? payload = null,
        Exception? exception = null,
        CancellationToken cancellationToken = default)
    {
        var now = _timeProvider.GetUtcNow();
        var deadLetter = new DeadLetteredDelivery
        {
            // "dl-" followed by the first 13 hex characters of a new GUID (16 characters in total).
            DeadLetterId = $"dl-{Guid.NewGuid():N}"[..16],
            TenantId = tenantId,
            DeliveryId = deliveryId,
            ChannelType = channelType,
            Reason = reason,
            ReasonDetails = exception?.Message,
            OriginalPayload = payload,
            ExceptionType = exception?.GetType().FullName,
            ExceptionMessage = exception?.Message,
            DeadLetteredAt = now,
            FirstAttemptAt = now,
            Status = DeadLetterStatus.Pending
        };

        var list = _deadLetters.GetOrAdd(tenantId, _ => []);
        lock (list)
        {
            list.Add(deadLetter);
        }

        _metrics?.RecordDeadLetter(tenantId, reason.ToString(), channelType);
        _logger.LogWarning("Dead-lettered delivery {DeliveryId} for tenant {TenantId}: {Reason}", deliveryId, tenantId, reason);

        return Task.FromResult(deadLetter);
    }

    /// <summary>
    /// Returns one page of a tenant's dead-lettered deliveries, newest first.
    /// </summary>
    /// <param name="tenantId">Tenant whose entries are read.</param>
    /// <param name="query">Optional filters and paging. When omitted, no filter is applied and at most 100 entries are returned.</param>
    /// <param name="cancellationToken">Not observed; the lookup completes synchronously.</param>
    /// <returns>A completed task with the matching entries, or an empty list when the tenant has no entries.</returns>
    public Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>([]);
        }

        IEnumerable<DeadLetteredDelivery> filtered = Snapshot(list);

        if (query is not null)
        {
            if (!string.IsNullOrWhiteSpace(query.Id))
            {
                filtered = filtered.Where(d => d.DeadLetterId == query.Id);
            }

            if (query.Reason.HasValue)
            {
                filtered = filtered.Where(d => d.Reason == query.Reason.Value);
            }

            if (!string.IsNullOrEmpty(query.ChannelType))
            {
                filtered = filtered.Where(d => d.ChannelType == query.ChannelType);
            }

            if (query.Status.HasValue)
            {
                filtered = filtered.Where(d => d.Status == query.Status.Value);
            }

            // After and Before are exclusive bounds.
            if (query.After.HasValue)
            {
                filtered = filtered.Where(d => d.DeadLetteredAt > query.After.Value);
            }

            if (query.Before.HasValue)
            {
                filtered = filtered.Where(d => d.DeadLetteredAt < query.Before.Value);
            }
        }

        var result = filtered
            .OrderByDescending(d => d.DeadLetteredAt)
            .Skip(query?.Offset ?? 0)
            .Take(query?.Limit ?? 100)
            .ToList();

        return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>(result);
    }

    /// <summary>
    /// Marks a dead-lettered delivery as retried.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="cancellationToken">Not observed; the update completes synchronously.</param>
    /// <returns>
    /// A successful result with <see cref="DeadLetterStatus.Retried"/> when the entry exists; otherwise a
    /// failed result with error "Not found" and status <see cref="DeadLetterStatus.Pending"/>.
    /// </returns>
    public Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default)
    {
        var updated = false;

        if (_deadLetters.TryGetValue(tenantId, out var list))
        {
            // Look up and replace inside a single lock. Reading the entry in one critical section and
            // writing it back in another would overwrite a concurrent change with a stale copy, and
            // would report success even if the entry had been purged in between.
            lock (list)
            {
                var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
                if (index >= 0)
                {
                    var current = list[index];
                    list[index] = current with
                    {
                        Status = DeadLetterStatus.Retried,
                        LastRetryAt = _timeProvider.GetUtcNow(),
                        RetryCount = current.RetryCount + 1
                    };
                    updated = true;
                }
            }
        }

        if (!updated)
        {
            return Task.FromResult(new DeadLetterRetryResult
            {
                DeadLetterId = deadLetterId,
                Success = false,
                Error = "Not found",
                NewStatus = DeadLetterStatus.Pending
            });
        }

        _logger.LogInformation("Retrying dead-lettered delivery {DeadLetterId} for tenant {TenantId}", deadLetterId, tenantId);
        return Task.FromResult(new DeadLetterRetryResult
        {
            DeadLetterId = deadLetterId,
            Success = true,
            NewStatus = DeadLetterStatus.Retried
        });
    }

    /// <summary>
    /// Retries the pending dead-lettered deliveries that match a query.
    /// </summary>
    /// <param name="tenantId">Tenant whose entries are retried.</param>
    /// <param name="query">
    /// Optional filters and paging. Only pending entries are ever retried, so a query that asks for any
    /// other status matches nothing.
    /// </param>
    /// <param name="cancellationToken">Passed through to the lookup and to each retry.</param>
    /// <returns>Totals and per-entry results for the retries that were attempted.</returns>
    public async Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default)
    {
        if (query?.Status is { } requestedStatus && requestedStatus != DeadLetterStatus.Pending)
        {
            return new DeadLetterBulkRetryResult();
        }

        // Filter on Pending inside the query so that Offset/Limit page over pending entries. Filtering
        // after paging would skip pending entries that fall behind a page of already-handled ones.
        var pendingQuery = (query ?? new DeadLetterQuery()) with { Status = DeadLetterStatus.Pending };
        var candidates = await GetAsync(tenantId, pendingQuery, cancellationToken);

        var results = new List<DeadLetterRetryResult>(candidates.Count);
        var succeeded = 0;
        foreach (var candidate in candidates)
        {
            var outcome = await RetryAsync(tenantId, candidate.DeadLetterId, cancellationToken);
            if (outcome.Success)
            {
                succeeded++;
            }

            results.Add(outcome);
        }

        return new DeadLetterBulkRetryResult
        {
            Total = results.Count,
            Succeeded = succeeded,
            Failed = results.Count - succeeded,
            Results = results
        };
    }

    /// <summary>
    /// Marks a dead-lettered delivery as discarded and records the optional reason on it.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="reason">Optional text stored as the entry's discard reason.</param>
    /// <param name="cancellationToken">Not observed; the update completes synchronously.</param>
    /// <returns><c>true</c> when the entry was found and updated; otherwise <c>false</c>.</returns>
    public Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(false);
        }

        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(false);
            }

            list[index] = list[index] with { Status = DeadLetterStatus.Discarded, DiscardReason = reason };
        }

        _logger.LogInformation("Discarded dead-lettered delivery {DeadLetterId} for tenant {TenantId}: {Reason}", deadLetterId, tenantId, reason ?? "No reason");
        return Task.FromResult(true);
    }

    /// <summary>
    /// Computes counts and time bounds over dead-lettered deliveries.
    /// </summary>
    /// <param name="tenantId">Tenant to report on, or <c>null</c> to aggregate over every tenant.</param>
    /// <param name="cancellationToken">Not observed; the computation completes synchronously.</param>
    /// <returns>A completed task with the statistics; all counts are zero when there are no entries.</returns>
    public Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        // Each tenant list is copied under its own lock; enumerating a List<T> while another thread
        // adds to or purges it throws InvalidOperationException.
        var all = new List<DeadLetteredDelivery>();
        if (tenantId is null)
        {
            foreach (var tenantList in _deadLetters.Values)
            {
                all.AddRange(Snapshot(tenantList));
            }
        }
        else if (_deadLetters.TryGetValue(tenantId, out var singleList))
        {
            all.AddRange(Snapshot(singleList));
        }

        return Task.FromResult(new DeadLetterStats
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalCount = all.Count,
            PendingCount = all.Count(d => d.Status == DeadLetterStatus.Pending),
            RetryingCount = all.Count(d => d.Status == DeadLetterStatus.Retrying),
            RetriedCount = all.Count(d => d.Status == DeadLetterStatus.Retried),
            DiscardedCount = all.Count(d => d.Status == DeadLetterStatus.Discarded),
            ByReason = all.GroupBy(d => d.Reason).ToDictionary(g => g.Key, g => g.Count()),
            ByChannel = all.GroupBy(d => d.ChannelType).ToDictionary(g => g.Key, g => g.Count()),
            OldestDeadLetterAt = all.MinBy(d => d.DeadLetteredAt)?.DeadLetteredAt,
            NewestDeadLetterAt = all.MaxBy(d => d.DeadLetteredAt)?.DeadLetteredAt
        });
    }

    /// <summary>
    /// Removes dead-lettered deliveries that were dead-lettered before "now minus <paramref name="olderThan"/>".
    /// </summary>
    /// <param name="tenantId">Tenant to purge, or <c>null</c> to purge every tenant.</param>
    /// <param name="olderThan">Age beyond which entries are removed.</param>
    /// <param name="cancellationToken">Not observed; the purge completes synchronously.</param>
    /// <returns>A completed task with the number of entries removed.</returns>
    public Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default)
    {
        var cutoff = _timeProvider.GetUtcNow() - olderThan;
        var purged = 0;
        List<string> tenants = tenantId is not null ? [tenantId] : _deadLetters.Keys.ToList();

        foreach (var tenant in tenants)
        {
            if (!_deadLetters.TryGetValue(tenant, out var list))
            {
                continue;
            }

            lock (list)
            {
                purged += list.RemoveAll(d => d.DeadLetteredAt < cutoff);
            }
        }

        _logger.LogInformation("Purged {Count} dead-lettered deliveries older than {OlderThan}", purged, olderThan);
        return Task.FromResult(purged);
    }

    /// <summary>Copies a tenant list while holding its lock.</summary>
    private static List<DeadLetteredDelivery> Snapshot(List<DeadLetteredDelivery> list)
    {
        lock (list)
        {
            return new List<DeadLetteredDelivery>(list);
        }
    }
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,395 +1,395 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Tracing service for the Notifier module, built on the OpenTelemetry-compatible
/// <see cref="Activity"/> API.
/// </summary>
public interface INotifierTracing
{
    /// <summary>Starts a delivery span.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Channel type of the delivery.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType);

    /// <summary>Starts an escalation span.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident.</param>
    /// <param name="policyId">Identifier of the policy.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId);

    /// <summary>Starts a digest generation span.</summary>
    /// <param name="tenantId">Tenant the digest belongs to.</param>
    /// <param name="scheduleId">Identifier of the schedule.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartDigestSpan(string tenantId, string scheduleId);

    /// <summary>Starts a template render span.</summary>
    /// <param name="tenantId">Tenant the render belongs to.</param>
    /// <param name="templateId">Identifier of the template.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartTemplateRenderSpan(string tenantId, string templateId);

    /// <summary>Starts a correlation span.</summary>
    /// <param name="tenantId">Tenant the event belongs to.</param>
    /// <param name="eventKind">Kind of event being correlated.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartCorrelationSpan(string tenantId, string eventKind);

    /// <summary>Starts a webhook validation span.</summary>
    /// <param name="tenantId">Tenant the webhook belongs to.</param>
    /// <param name="channelId">Identifier of the channel.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartWebhookValidationSpan(string tenantId, string channelId);

    /// <summary>Adds an event to the given span.</summary>
    /// <param name="activity">Span to add the event to; may be <c>null</c>.</param>
    /// <param name="name">Event name.</param>
    /// <param name="attributes">Optional event attributes.</param>
    void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null);

    /// <summary>Sets span status to error.</summary>
    /// <param name="activity">Span to update; may be <c>null</c>.</param>
    /// <param name="exception">Optional exception describing the failure.</param>
    /// <param name="description">Optional status description.</param>
    void SetError(Activity? activity, Exception? exception = null, string? description = null);

    /// <summary>Sets span status to ok.</summary>
    /// <param name="activity">Span to update; may be <c>null</c>.</param>
    void SetOk(Activity? activity);

    /// <summary>Adds custom tags to a span.</summary>
    /// <param name="activity">Span to tag; may be <c>null</c>.</param>
    /// <param name="tags">Tags to add.</param>
    void AddTags(Activity? activity, IDictionary<string, object?> tags);

    /// <summary>Creates a linked span (for batch operations).</summary>
    /// <param name="operationName">Name of the new span.</param>
    /// <param name="parentContext">Context the new span is linked to.</param>
    /// <param name="tags">Optional tags for the new span.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null);
}
|
||||
|
||||
/// <summary>
/// Configuration options for the tracing service.
/// </summary>
public sealed class NotifierTracingOptions
{
    /// <summary>Configuration section name for these options.</summary>
    public const string SectionName = "Notifier:Observability:Tracing";

    /// <summary>Whether tracing is enabled; defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Activity source name; defaults to "StellaOps.Notifier".</summary>
    public string SourceName { get; set; } = "StellaOps.Notifier";

    /// <summary>Whether to include sensitive data in traces; defaults to <c>false</c>.</summary>
    public bool IncludeSensitiveData { get; set; }

    /// <summary>Sampling ratio (0.0 to 1.0); defaults to 1.0.</summary>
    public double SamplingRatio { get; set; } = 1.0;

    /// <summary>Maximum number of attributes per span; defaults to 128.</summary>
    public int MaxAttributesPerSpan { get; set; } = 128;

    /// <summary>Maximum number of events per span; defaults to 128.</summary>
    public int MaxEventsPerSpan { get; set; } = 128;
}
|
||||
|
||||
/// <summary>
/// Default <see cref="INotifierTracing"/> implementation backed by a single <see cref="ActivitySource"/>.
/// </summary>
/// <remarks>
/// Span-starting members return <c>null</c> when tracing is disabled in the options or when the
/// activity source does not create an activity.
/// </remarks>
public sealed class DefaultNotifierTracing : INotifierTracing, IDisposable
{
    private readonly ActivitySource _activitySource;
    private readonly NotifierTracingOptions _options;
    private readonly ILogger<DefaultNotifierTracing> _logger;

    /// <summary>
    /// Creates the tracing service and its activity source (version "1.0.0").
    /// </summary>
    /// <param name="options">Tracing options; defaults are used when no value is supplied.</param>
    /// <param name="logger">Logger; required.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultNotifierTracing(
        IOptions<NotifierTracingOptions> options,
        ILogger<DefaultNotifierTracing> logger)
    {
        _options = options?.Value ?? new NotifierTracingOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _activitySource = new ActivitySource(_options.SourceName, "1.0.0");
    }

    /// <summary>Starts a "notifier.delivery" span tagged with tenant, delivery and channel type.</summary>
    public Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType) =>
        StartInternalSpan("notifier.delivery")
            ?.SetTag("tenant.id", tenantId)
            .SetTag("delivery.id", deliveryId)
            .SetTag("channel.type", channelType);

    /// <summary>Starts a "notifier.escalation" span tagged with tenant, incident and policy.</summary>
    public Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId) =>
        StartInternalSpan("notifier.escalation")
            ?.SetTag("tenant.id", tenantId)
            .SetTag("incident.id", incidentId)
            .SetTag("policy.id", policyId);

    /// <summary>Starts a "notifier.digest" span tagged with tenant and schedule.</summary>
    public Activity? StartDigestSpan(string tenantId, string scheduleId) =>
        StartInternalSpan("notifier.digest")
            ?.SetTag("tenant.id", tenantId)
            .SetTag("schedule.id", scheduleId);

    /// <summary>Starts a "notifier.template.render" span tagged with tenant and template.</summary>
    public Activity? StartTemplateRenderSpan(string tenantId, string templateId) =>
        StartInternalSpan("notifier.template.render")
            ?.SetTag("tenant.id", tenantId)
            .SetTag("template.id", templateId);

    /// <summary>Starts a "notifier.correlation" span tagged with tenant and event kind.</summary>
    public Activity? StartCorrelationSpan(string tenantId, string eventKind) =>
        StartInternalSpan("notifier.correlation")
            ?.SetTag("tenant.id", tenantId)
            .SetTag("event.kind", eventKind);

    /// <summary>Starts a "notifier.webhook.validation" span tagged with tenant and channel.</summary>
    public Activity? StartWebhookValidationSpan(string tenantId, string channelId) =>
        StartInternalSpan("notifier.webhook.validation")
            ?.SetTag("tenant.id", tenantId)
            .SetTag("channel.id", channelId);

    /// <summary>
    /// Adds a named event to the span, carrying every attribute whose value is not <c>null</c>.
    /// </summary>
    /// <param name="activity">Span to add the event to; nothing happens when <c>null</c>.</param>
    /// <param name="name">Event name.</param>
    /// <param name="attributes">Optional attributes; entries with <c>null</c> values are skipped.</param>
    public void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null)
    {
        if (activity is null)
        {
            return;
        }

        var eventTags = new ActivityTagsCollection();
        if (attributes is not null)
        {
            foreach (var attribute in attributes)
            {
                if (attribute.Value is null)
                {
                    continue;
                }

                eventTags.Add(attribute.Key, attribute.Value);
            }
        }

        activity.AddEvent(new ActivityEvent(name, tags: eventTags));
    }

    /// <summary>
    /// Sets the span status to error and, when an exception is given, tags its type and message.
    /// </summary>
    /// <param name="activity">Span to update; nothing happens when <c>null</c>.</param>
    /// <param name="exception">Optional exception; its stack trace is tagged only when sensitive data is enabled.</param>
    /// <param name="description">Status description; the exception message is used when omitted.</param>
    public void SetError(Activity? activity, Exception? exception = null, string? description = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetStatus(ActivityStatusCode.Error, description ?? exception?.Message);

        if (exception is null)
        {
            return;
        }

        activity.SetTag("exception.type", exception.GetType().FullName);
        activity.SetTag("exception.message", exception.Message);

        if (_options.IncludeSensitiveData)
        {
            activity.SetTag("exception.stacktrace", exception.StackTrace);
        }
    }

    /// <summary>Sets the span status to ok.</summary>
    /// <param name="activity">Span to update; nothing happens when <c>null</c>.</param>
    public void SetOk(Activity? activity)
    {
        if (activity is not null)
        {
            activity.SetStatus(ActivityStatusCode.Ok);
        }
    }

    /// <summary>Sets every tag whose value is not <c>null</c> on the span.</summary>
    /// <param name="activity">Span to tag; nothing happens when <c>null</c>.</param>
    /// <param name="tags">Tags to set; entries with <c>null</c> values are skipped.</param>
    public void AddTags(Activity? activity, IDictionary<string, object?> tags)
    {
        if (activity is null)
        {
            return;
        }

        foreach (var tag in tags)
        {
            if (tag.Value is null)
            {
                continue;
            }

            activity.SetTag(tag.Key, tag.Value);
        }
    }

    /// <summary>
    /// Starts an internal span that carries an <see cref="ActivityLink"/> to
    /// <paramref name="parentContext"/>; the context is passed as a link, not as the parent.
    /// </summary>
    /// <param name="operationName">Name of the new span.</param>
    /// <param name="parentContext">Context the new span is linked to.</param>
    /// <param name="tags">Optional tags applied to the new span.</param>
    /// <returns>The started activity, or <c>null</c> when tracing is disabled or no activity was created.</returns>
    public Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        var linked = _activitySource.StartActivity(
            operationName,
            ActivityKind.Internal,
            parentContext: default,
            links: new[] { new ActivityLink(parentContext) });

        if (linked is null || tags is null)
        {
            return linked;
        }

        AddTags(linked, tags);
        return linked;
    }

    /// <summary>Disposes the underlying activity source.</summary>
    public void Dispose()
    {
        _activitySource.Dispose();
    }

    /// <summary>Starts an internal-kind activity with the given name, or returns <c>null</c> when tracing is disabled.</summary>
    private Activity? StartInternalSpan(string spanName) =>
        _options.Enabled
            ? _activitySource.StartActivity(spanName, ActivityKind.Internal)
            : null;
}
|
||||
|
||||
/// <summary>
/// Extension methods for Activity-based tracing. Every method is a no-op on a <c>null</c> activity.
/// </summary>
public static class ActivityExtensions
{
    /// <summary>
    /// Records a delivery result on the activity.
    /// </summary>
    /// <param name="activity">Span to tag.</param>
    /// <param name="success">Value of the "delivery.success" tag.</param>
    /// <param name="httpStatusCode">Optional value of the "http.status_code" tag.</param>
    /// <param name="error">Optional value of the "delivery.error" tag; skipped when null or empty.</param>
    public static void RecordDeliveryResult(this Activity? activity, bool success, int? httpStatusCode = null, string? error = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("delivery.success", success);

        if (httpStatusCode.HasValue)
        {
            activity.SetTag("http.status_code", httpStatusCode.Value);
        }

        if (!string.IsNullOrEmpty(error))
        {
            activity.SetTag("delivery.error", error);
        }
    }

    /// <summary>
    /// Records an escalation level change as span tags plus an "escalation.level.changed" event.
    /// </summary>
    /// <param name="activity">Span to update.</param>
    /// <param name="level">New escalation level.</param>
    /// <param name="target">Optional escalation target; omitted from both the tags and the event when null or empty.</param>
    public static void RecordEscalationLevel(this Activity? activity, int level, string? target = null)
    {
        if (activity is null)
        {
            return;
        }

        var hasTarget = !string.IsNullOrEmpty(target);

        activity.SetTag("escalation.level", level);
        if (hasTarget)
        {
            activity.SetTag("escalation.target", target);
        }

        // ActivityTagsCollection.Add stores null values as-is, so the target is added only when one
        // was supplied. This keeps the event consistent with the span tag guarded above.
        var eventTags = new ActivityTagsCollection { { "level", level } };
        if (hasTarget)
        {
            eventTags.Add("target", target);
        }

        activity.AddEvent(new ActivityEvent("escalation.level.changed", tags: eventTags));
    }

    /// <summary>
    /// Records storm detection as a "storm.detected" event.
    /// </summary>
    /// <param name="activity">Span to add the event to.</param>
    /// <param name="eventKind">Value of the "event_kind" event tag.</param>
    /// <param name="eventCount">Value of the "event_count" event tag.</param>
    public static void RecordStormDetected(this Activity? activity, string eventKind, int eventCount)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("storm.detected", tags: new ActivityTagsCollection
        {
            { "event_kind", eventKind },
            { "event_count", eventCount }
        }));
    }

    /// <summary>
    /// Records a fallback attempt as a "fallback.attempted" event.
    /// </summary>
    /// <param name="activity">Span to add the event to.</param>
    /// <param name="fromChannel">Value of the "from_channel" event tag.</param>
    /// <param name="toChannel">Value of the "to_channel" event tag.</param>
    public static void RecordFallback(this Activity? activity, string fromChannel, string toChannel)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("fallback.attempted", tags: new ActivityTagsCollection
        {
            { "from_channel", fromChannel },
            { "to_channel", toChannel }
        }));
    }

    /// <summary>
    /// Records template render details as span tags.
    /// </summary>
    /// <param name="activity">Span to tag.</param>
    /// <param name="format">Value of the "template.format" tag.</param>
    /// <param name="outputLength">Value of the "template.output_length" tag.</param>
    public static void RecordTemplateRender(this Activity? activity, string format, int outputLength)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("template.format", format);
        activity.SetTag("template.output_length", outputLength);
    }

    /// <summary>
    /// Records a correlation result as span tags.
    /// </summary>
    /// <param name="activity">Span to tag.</param>
    /// <param name="correlationKey">Value of the "correlation.key" tag.</param>
    /// <param name="isNewIncident">Value of the "correlation.new_incident" tag.</param>
    public static void RecordCorrelationResult(this Activity? activity, string correlationKey, bool isNewIncident)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("correlation.key", correlationKey);
        activity.SetTag("correlation.new_incident", isNewIncident);
    }
}
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Tracing contract for the Notifier module, expressed in terms of the
/// <see cref="System.Diagnostics.Activity"/> API.
/// </summary>
/// <remarks>
/// Every span-starting member returns a nullable <see cref="Activity"/>; callers must be
/// prepared for <c>null</c>. The helper members accept a nullable activity as well.
/// </remarks>
public interface INotifierTracing
{
    /// <summary>Starts a span for a notification delivery.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Type of the channel used for the delivery.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType);

    /// <summary>Starts a span for an escalation.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident being escalated.</param>
    /// <param name="policyId">Identifier of the escalation policy.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId);

    /// <summary>Starts a span for digest generation.</summary>
    /// <param name="tenantId">Tenant the digest belongs to.</param>
    /// <param name="scheduleId">Identifier of the digest schedule.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartDigestSpan(string tenantId, string scheduleId);

    /// <summary>Starts a span for rendering a template.</summary>
    /// <param name="tenantId">Tenant the template belongs to.</param>
    /// <param name="templateId">Identifier of the template.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartTemplateRenderSpan(string tenantId, string templateId);

    /// <summary>Starts a span for event correlation.</summary>
    /// <param name="tenantId">Tenant the event belongs to.</param>
    /// <param name="eventKind">Kind of the event being correlated.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartCorrelationSpan(string tenantId, string eventKind);

    /// <summary>Starts a span for webhook validation.</summary>
    /// <param name="tenantId">Tenant the channel belongs to.</param>
    /// <param name="channelId">Identifier of the channel being validated.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartWebhookValidationSpan(string tenantId, string channelId);

    /// <summary>Adds a named event to the given span.</summary>
    /// <param name="activity">Span that receives the event; may be <c>null</c>.</param>
    /// <param name="name">Event name.</param>
    /// <param name="attributes">Optional attributes to attach to the event.</param>
    void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null);

    /// <summary>Marks the span as failed.</summary>
    /// <param name="activity">Span to update; may be <c>null</c>.</param>
    /// <param name="exception">Optional exception that caused the failure.</param>
    /// <param name="description">Optional status description.</param>
    void SetError(Activity? activity, Exception? exception = null, string? description = null);

    /// <summary>Marks the span as successful.</summary>
    /// <param name="activity">Span to update; may be <c>null</c>.</param>
    void SetOk(Activity? activity);

    /// <summary>Adds custom tags to the span.</summary>
    /// <param name="activity">Span to update; may be <c>null</c>.</param>
    /// <param name="tags">Tags to add.</param>
    void AddTags(Activity? activity, IDictionary<string, object?> tags);

    /// <summary>Starts a span that is linked to another context (for batch operations).</summary>
    /// <param name="operationName">Name of the new span.</param>
    /// <param name="parentContext">Context the new span is linked to.</param>
    /// <param name="tags">Optional tags to set on the new span.</param>
    /// <returns>The started activity, or <c>null</c> when no span was created.</returns>
    Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null);
}
|
||||
|
||||
/// <summary>
/// Settings for the notifier tracing service, bound from the <see cref="SectionName"/>
/// configuration section.
/// </summary>
public sealed class NotifierTracingOptions
{
    /// <summary>Configuration section these options are read from.</summary>
    public const string SectionName = "Notifier:Observability:Tracing";

    // Default for both per-span limits below.
    private const int DefaultPerSpanLimit = 128;

    /// <summary>Whether tracing is enabled. Defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Name of the activity source. Defaults to <c>StellaOps.Notifier</c>.</summary>
    public string SourceName { get; set; } = "StellaOps.Notifier";

    /// <summary>Whether sensitive data may be included in traces. Defaults to <c>false</c>.</summary>
    public bool IncludeSensitiveData { get; set; }

    /// <summary>Sampling ratio, from 0.0 to 1.0. Defaults to <c>1.0</c>.</summary>
    public double SamplingRatio { get; set; } = 1.0;

    /// <summary>Maximum number of attributes per span. Defaults to <c>128</c>.</summary>
    public int MaxAttributesPerSpan { get; set; } = DefaultPerSpanLimit;

    /// <summary>Maximum number of events per span. Defaults to <c>128</c>.</summary>
    public int MaxEventsPerSpan { get; set; } = DefaultPerSpanLimit;
}
|
||||
|
||||
/// <summary>
/// Default <see cref="INotifierTracing"/> implementation, backed by an <see cref="ActivitySource"/>
/// named after <see cref="NotifierTracingOptions.SourceName"/> (version <c>1.0.0</c>).
/// </summary>
/// <remarks>
/// Span-starting methods return <c>null</c> when <see cref="NotifierTracingOptions.Enabled"/> is
/// <c>false</c> or when the activity source does not hand back an activity.
/// </remarks>
public sealed class DefaultNotifierTracing : INotifierTracing, IDisposable
{
    private readonly ActivitySource _activitySource;
    private readonly NotifierTracingOptions _options;

    // Required by the constructor, but no member of this class writes to it.
    private readonly ILogger<DefaultNotifierTracing> _logger;

    /// <summary>Creates the tracing service and its activity source.</summary>
    /// <param name="options">Tracing options; default options are used when this (or its value) is <c>null</c>.</param>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultNotifierTracing(
        IOptions<NotifierTracingOptions> options,
        ILogger<DefaultNotifierTracing> logger)
    {
        _options = options?.Value ?? new NotifierTracingOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _activitySource = new ActivitySource(_options.SourceName, "1.0.0");
    }

    /// <inheritdoc />
    public Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType) =>
        BeginSpan(
            "notifier.delivery",
            ("tenant.id", tenantId),
            ("delivery.id", deliveryId),
            ("channel.type", channelType));

    /// <inheritdoc />
    public Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId) =>
        BeginSpan(
            "notifier.escalation",
            ("tenant.id", tenantId),
            ("incident.id", incidentId),
            ("policy.id", policyId));

    /// <inheritdoc />
    public Activity? StartDigestSpan(string tenantId, string scheduleId) =>
        BeginSpan(
            "notifier.digest",
            ("tenant.id", tenantId),
            ("schedule.id", scheduleId));

    /// <inheritdoc />
    public Activity? StartTemplateRenderSpan(string tenantId, string templateId) =>
        BeginSpan(
            "notifier.template.render",
            ("tenant.id", tenantId),
            ("template.id", templateId));

    /// <inheritdoc />
    public Activity? StartCorrelationSpan(string tenantId, string eventKind) =>
        BeginSpan(
            "notifier.correlation",
            ("tenant.id", tenantId),
            ("event.kind", eventKind));

    /// <inheritdoc />
    public Activity? StartWebhookValidationSpan(string tenantId, string channelId) =>
        BeginSpan(
            "notifier.webhook.validation",
            ("tenant.id", tenantId),
            ("channel.id", channelId));

    /// <summary>
    /// Adds an event named <paramref name="name"/> to the activity. Attributes whose value is
    /// <c>null</c> are left out; the event is still added when no attributes are supplied.
    /// </summary>
    public void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null)
    {
        if (activity is null)
        {
            return;
        }

        var eventTags = new ActivityTagsCollection();
        if (attributes is not null)
        {
            foreach (var entry in attributes)
            {
                if (entry.Value is null)
                {
                    continue;
                }

                eventTags.Add(entry.Key, entry.Value);
            }
        }

        activity.AddEvent(new ActivityEvent(name, tags: eventTags));
    }

    /// <summary>
    /// Sets the activity status to <see cref="ActivityStatusCode.Error"/>. The status description is
    /// <paramref name="description"/>, falling back to the exception message. When an exception is
    /// given, its type and message are tagged; the stack trace is tagged only when
    /// <see cref="NotifierTracingOptions.IncludeSensitiveData"/> is set.
    /// </summary>
    public void SetError(Activity? activity, Exception? exception = null, string? description = null)
    {
        if (activity is null)
        {
            return;
        }

        var statusText = description ?? exception?.Message;
        activity.SetStatus(ActivityStatusCode.Error, statusText);

        if (exception is null)
        {
            return;
        }

        activity.SetTag("exception.type", exception.GetType().FullName);
        activity.SetTag("exception.message", exception.Message);

        if (_options.IncludeSensitiveData)
        {
            activity.SetTag("exception.stacktrace", exception.StackTrace);
        }
    }

    /// <summary>Sets the activity status to <see cref="ActivityStatusCode.Ok"/>; no-op for a <c>null</c> activity.</summary>
    public void SetOk(Activity? activity)
    {
        if (activity is not null)
        {
            activity.SetStatus(ActivityStatusCode.Ok);
        }
    }

    /// <summary>Sets every non-null entry of <paramref name="tags"/> as a tag on the activity.</summary>
    public void AddTags(Activity? activity, IDictionary<string, object?> tags)
    {
        if (activity is null)
        {
            return;
        }

        foreach (var entry in tags)
        {
            if (entry.Value is { } tagValue)
            {
                activity.SetTag(entry.Key, tagValue);
            }
        }
    }

    /// <summary>
    /// Starts an internal span carrying an <see cref="ActivityLink"/> to <paramref name="parentContext"/>.
    /// The context is passed as a link only; the parent-context argument of the start call is left at
    /// its default. Optional <paramref name="tags"/> are applied through <see cref="AddTags"/>.
    /// </summary>
    public Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        ActivityLink[] spanLinks = { new ActivityLink(parentContext) };
        var span = _activitySource.StartActivity(
            operationName,
            ActivityKind.Internal,
            parentContext: default,
            links: spanLinks);

        if (span is null || tags is null)
        {
            return span;
        }

        AddTags(span, tags);
        return span;
    }

    /// <summary>Disposes the underlying activity source.</summary>
    public void Dispose() => _activitySource.Dispose();

    /// <summary>
    /// Shared body of the span starters: returns <c>null</c> when tracing is disabled or no activity
    /// is produced, otherwise starts an internal activity and sets the given tags in order.
    /// </summary>
    private Activity? BeginSpan(string spanName, params (string Key, object? Value)[] spanTags)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        var span = _activitySource.StartActivity(spanName, ActivityKind.Internal);
        if (span is null)
        {
            return null;
        }

        foreach (var (tagKey, tagValue) in spanTags)
        {
            span.SetTag(tagKey, tagValue);
        }

        return span;
    }
}
|
||||
|
||||
/// <summary>
/// Extension methods that record notifier-specific tags and events on an <see cref="Activity"/>.
/// Every method is a no-op when the activity is <c>null</c>.
/// </summary>
public static class ActivityExtensions
{
    /// <summary>
    /// Records the outcome of a delivery as tags on the activity.
    /// </summary>
    /// <param name="activity">Activity to tag; may be <c>null</c>.</param>
    /// <param name="success">Stored under <c>delivery.success</c>.</param>
    /// <param name="httpStatusCode">Stored under <c>http.status_code</c> when it has a value.</param>
    /// <param name="error">Stored under <c>delivery.error</c> when it is neither <c>null</c> nor empty.</param>
    public static void RecordDeliveryResult(this Activity? activity, bool success, int? httpStatusCode = null, string? error = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("delivery.success", success);

        if (httpStatusCode.HasValue)
        {
            activity.SetTag("http.status_code", httpStatusCode.Value);
        }

        if (!string.IsNullOrEmpty(error))
        {
            activity.SetTag("delivery.error", error);
        }
    }

    /// <summary>
    /// Records an escalation level change: tags the activity with <c>escalation.level</c> (and
    /// <c>escalation.target</c> when a target is given) and appends an
    /// <c>escalation.level.changed</c> event carrying the same values.
    /// </summary>
    /// <param name="activity">Activity to update; may be <c>null</c>.</param>
    /// <param name="level">New escalation level.</param>
    /// <param name="target">Optional escalation target; omitted from both the tag and the event when <c>null</c> or empty.</param>
    public static void RecordEscalationLevel(this Activity? activity, int level, string? target = null)
    {
        if (activity is null)
        {
            return;
        }

        var hasTarget = !string.IsNullOrEmpty(target);

        activity.SetTag("escalation.level", level);
        if (hasTarget)
        {
            activity.SetTag("escalation.target", target);
        }

        var changeTags = new ActivityTagsCollection
        {
            { "level", level }
        };

        // ActivityTagsCollection.Add stores null values as-is, so the event tag is guarded the
        // same way as the span tag above; otherwise the event would carry a null/empty "target".
        if (hasTarget)
        {
            changeTags.Add("target", target);
        }

        activity.AddEvent(new ActivityEvent("escalation.level.changed", tags: changeTags));
    }

    /// <summary>
    /// Appends a <c>storm.detected</c> event tagged with <c>event_kind</c> and <c>event_count</c>.
    /// </summary>
    /// <param name="activity">Activity to update; may be <c>null</c>.</param>
    /// <param name="eventKind">Kind of the event that triggered the storm detection.</param>
    /// <param name="eventCount">Number of events recorded with the detection.</param>
    public static void RecordStormDetected(this Activity? activity, string eventKind, int eventCount)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("storm.detected", tags: new ActivityTagsCollection
        {
            { "event_kind", eventKind },
            { "event_count", eventCount }
        }));
    }

    /// <summary>
    /// Appends a <c>fallback.attempted</c> event tagged with <c>from_channel</c> and <c>to_channel</c>.
    /// </summary>
    /// <param name="activity">Activity to update; may be <c>null</c>.</param>
    /// <param name="fromChannel">Channel the fallback moves away from.</param>
    /// <param name="toChannel">Channel the fallback moves to.</param>
    public static void RecordFallback(this Activity? activity, string fromChannel, string toChannel)
    {
        if (activity is null)
        {
            return;
        }

        activity.AddEvent(new ActivityEvent("fallback.attempted", tags: new ActivityTagsCollection
        {
            { "from_channel", fromChannel },
            { "to_channel", toChannel }
        }));
    }

    /// <summary>
    /// Tags the activity with <c>template.format</c> and <c>template.output_length</c>.
    /// </summary>
    /// <param name="activity">Activity to tag; may be <c>null</c>.</param>
    /// <param name="format">Format of the rendered template.</param>
    /// <param name="outputLength">Length of the rendered output.</param>
    public static void RecordTemplateRender(this Activity? activity, string format, int outputLength)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("template.format", format);
        activity.SetTag("template.output_length", outputLength);
    }

    /// <summary>
    /// Tags the activity with <c>correlation.key</c> and <c>correlation.new_incident</c>.
    /// </summary>
    /// <param name="activity">Activity to tag; may be <c>null</c>.</param>
    /// <param name="correlationKey">Key the event was correlated under.</param>
    /// <param name="isNewIncident">Whether the correlation produced a new incident.</param>
    public static void RecordCorrelationResult(this Activity? activity, string correlationKey, bool isNewIncident)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetTag("correlation.key", correlationKey);
        activity.SetTag("correlation.new_incident", isNewIncident);
    }
}
|
||||
|
||||
@@ -1,98 +1,98 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for recording notification-system metrics and starting tracing activities.
/// </summary>
public interface INotifyMetrics
{
    /// <summary>Records one notification delivery attempt.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="channelType">Type of the channel used.</param>
    /// <param name="status">Status of the attempt.</param>
    /// <param name="duration">How long the attempt took.</param>
    void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration);

    /// <summary>Records an escalation event.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="level">Escalation level.</param>
    /// <param name="outcome">Outcome of the escalation.</param>
    void RecordEscalation(string tenantId, int level, string outcome);

    /// <summary>Records a dead-letter entry.</summary>
    /// <param name="tenantId">Tenant the entry belongs to.</param>
    /// <param name="reason">Reason the entry was dead-lettered.</param>
    /// <param name="channelType">Type of the channel involved.</param>
    void RecordDeadLetter(string tenantId, string reason, string channelType);

    /// <summary>Records a rule evaluation.</summary>
    /// <param name="tenantId">Tenant the rule belongs to.</param>
    /// <param name="ruleId">Identifier of the evaluated rule.</param>
    /// <param name="matched">Whether the rule matched.</param>
    /// <param name="duration">How long the evaluation took.</param>
    void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration);

    /// <summary>Records a template rendering.</summary>
    /// <param name="tenantId">Tenant the template belongs to.</param>
    /// <param name="templateKey">Key of the rendered template.</param>
    /// <param name="success">Whether rendering succeeded.</param>
    /// <param name="duration">How long rendering took.</param>
    void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration);

    /// <summary>Records a storm detection event.</summary>
    /// <param name="tenantId">Tenant the storm belongs to.</param>
    /// <param name="stormKey">Key identifying the storm.</param>
    /// <param name="decision">Decision taken for the storm.</param>
    void RecordStormEvent(string tenantId, string stormKey, string decision);

    /// <summary>Records a retention cleanup.</summary>
    /// <param name="tenantId">Tenant the cleanup ran for.</param>
    /// <param name="entityType">Type of the entities that were cleaned up.</param>
    /// <param name="deletedCount">Number of deleted entities.</param>
    void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);

    /// <summary>Records the current queue depth for a channel.</summary>
    /// <param name="tenantId">Tenant the queue belongs to.</param>
    /// <param name="channelType">Type of the channel.</param>
    /// <param name="depth">Observed queue depth.</param>
    void RecordQueueDepth(string tenantId, string channelType, int depth);

    /// <summary>Creates an activity for tracing a delivery.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Type of the channel used.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);

    /// <summary>Creates an activity for tracing an escalation.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident.</param>
    /// <param name="level">Escalation level.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
|
||||
|
||||
/// <summary>
/// Shared tag (dimension) names for notification metrics, kept in one place so every
/// metric uses the same spelling.
/// </summary>
public static class NotifyMetricTags
{
    // Tenant and channel.
    public const string TenantId = "tenant_id";
    public const string ChannelType = "channel_type";

    // Status, outcome, level and reason.
    public const string Status = "status";
    public const string Outcome = "outcome";
    public const string Level = "level";
    public const string Reason = "reason";

    // Rule evaluation.
    public const string RuleId = "rule_id";
    public const string Matched = "matched";

    // Template rendering.
    public const string TemplateKey = "template_key";
    public const string Success = "success";

    // Storm detection.
    public const string StormKey = "storm_key";
    public const string Decision = "decision";

    // Retention cleanup.
    public const string EntityType = "entity_type";
}
|
||||
|
||||
/// <summary>
/// Instrument names for the notification system; all of them share the <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
    // Delivery.
    public const string DeliveryAttempts = "notify.delivery.attempts";
    public const string DeliveryDuration = "notify.delivery.duration";

    // Escalation and dead-letter.
    public const string EscalationEvents = "notify.escalation.events";
    public const string DeadLetterEntries = "notify.deadletter.entries";

    // Rule evaluation.
    public const string RuleEvaluations = "notify.rule.evaluations";
    public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";

    // Template rendering.
    public const string TemplateRenders = "notify.template.renders";
    public const string TemplateRenderDuration = "notify.template.render.duration";

    // Storm detection, retention and queue depth.
    public const string StormEvents = "notify.storm.events";
    public const string RetentionCleanups = "notify.retention.cleanups";
    public const string QueueDepth = "notify.queue.depth";
}
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for recording notification-system metrics and starting tracing activities.
/// </summary>
public interface INotifyMetrics
{
    /// <summary>Records one notification delivery attempt.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="channelType">Type of the channel used.</param>
    /// <param name="status">Status of the attempt.</param>
    /// <param name="duration">How long the attempt took.</param>
    void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration);

    /// <summary>Records an escalation event.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="level">Escalation level.</param>
    /// <param name="outcome">Outcome of the escalation.</param>
    void RecordEscalation(string tenantId, int level, string outcome);

    /// <summary>Records a dead-letter entry.</summary>
    /// <param name="tenantId">Tenant the entry belongs to.</param>
    /// <param name="reason">Reason the entry was dead-lettered.</param>
    /// <param name="channelType">Type of the channel involved.</param>
    void RecordDeadLetter(string tenantId, string reason, string channelType);

    /// <summary>Records a rule evaluation.</summary>
    /// <param name="tenantId">Tenant the rule belongs to.</param>
    /// <param name="ruleId">Identifier of the evaluated rule.</param>
    /// <param name="matched">Whether the rule matched.</param>
    /// <param name="duration">How long the evaluation took.</param>
    void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration);

    /// <summary>Records a template rendering.</summary>
    /// <param name="tenantId">Tenant the template belongs to.</param>
    /// <param name="templateKey">Key of the rendered template.</param>
    /// <param name="success">Whether rendering succeeded.</param>
    /// <param name="duration">How long rendering took.</param>
    void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration);

    /// <summary>Records a storm detection event.</summary>
    /// <param name="tenantId">Tenant the storm belongs to.</param>
    /// <param name="stormKey">Key identifying the storm.</param>
    /// <param name="decision">Decision taken for the storm.</param>
    void RecordStormEvent(string tenantId, string stormKey, string decision);

    /// <summary>Records a retention cleanup.</summary>
    /// <param name="tenantId">Tenant the cleanup ran for.</param>
    /// <param name="entityType">Type of the entities that were cleaned up.</param>
    /// <param name="deletedCount">Number of deleted entities.</param>
    void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);

    /// <summary>Records the current queue depth for a channel.</summary>
    /// <param name="tenantId">Tenant the queue belongs to.</param>
    /// <param name="channelType">Type of the channel.</param>
    /// <param name="depth">Observed queue depth.</param>
    void RecordQueueDepth(string tenantId, string channelType, int depth);

    /// <summary>Creates an activity for tracing a delivery.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Type of the channel used.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);

    /// <summary>Creates an activity for tracing an escalation.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident.</param>
    /// <param name="level">Escalation level.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
|
||||
|
||||
/// <summary>
/// Shared tag (dimension) names for notification metrics, kept in one place so every
/// metric uses the same spelling.
/// </summary>
public static class NotifyMetricTags
{
    // Tenant and channel.
    public const string TenantId = "tenant_id";
    public const string ChannelType = "channel_type";

    // Status, outcome, level and reason.
    public const string Status = "status";
    public const string Outcome = "outcome";
    public const string Level = "level";
    public const string Reason = "reason";

    // Rule evaluation.
    public const string RuleId = "rule_id";
    public const string Matched = "matched";

    // Template rendering.
    public const string TemplateKey = "template_key";
    public const string Success = "success";

    // Storm detection.
    public const string StormKey = "storm_key";
    public const string Decision = "decision";

    // Retention cleanup.
    public const string EntityType = "entity_type";
}
|
||||
|
||||
/// <summary>
/// Instrument names for the notification system; all of them share the <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
    // Delivery.
    public const string DeliveryAttempts = "notify.delivery.attempts";
    public const string DeliveryDuration = "notify.delivery.duration";

    // Escalation and dead-letter.
    public const string EscalationEvents = "notify.escalation.events";
    public const string DeadLetterEntries = "notify.deadletter.entries";

    // Rule evaluation.
    public const string RuleEvaluations = "notify.rule.evaluations";
    public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";

    // Template rendering.
    public const string TemplateRenders = "notify.template.renders";
    public const string TemplateRenderDuration = "notify.template.render.duration";

    // Storm detection, retention and queue depth.
    public const string StormEvents = "notify.storm.events";
    public const string RetentionCleanups = "notify.retention.cleanups";
    public const string QueueDepth = "notify.queue.depth";
}
|
||||
|
||||
@@ -1,243 +1,243 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Notifier.Worker.Retention;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Dependency-injection helpers that register the notifier's observability services.
/// </summary>
public static class ObservabilityServiceExtensions
{
    /// <summary>
    /// Registers every observability service in one call: metrics, tracing, dead-letter handling,
    /// chaos engine and retention policies, in that order.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option sections are read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddNotifierObservability(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        services.AddNotifierMetrics(configuration);
        services.AddNotifierTracing(configuration);
        services.AddDeadLetterHandling(configuration);
        services.AddChaosEngine(configuration);
        services.AddRetentionPolicies(configuration);

        return services;
    }

    /// <summary>
    /// Binds <see cref="NotifierMetricsOptions"/> to its configuration section and registers
    /// <see cref="DefaultNotifierMetrics"/> as the singleton <see cref="INotifierMetrics"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddNotifierMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var metricsSection = configuration.GetSection(NotifierMetricsOptions.SectionName);
        services.Configure<NotifierMetricsOptions>(metricsSection);
        services.AddSingleton<INotifierMetrics, DefaultNotifierMetrics>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="NotifierTracingOptions"/> to its configuration section and registers
    /// <see cref="DefaultNotifierTracing"/> as the singleton <see cref="INotifierTracing"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddNotifierTracing(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var tracingSection = configuration.GetSection(NotifierTracingOptions.SectionName);
        services.Configure<NotifierTracingOptions>(tracingSection);
        services.AddSingleton<INotifierTracing, DefaultNotifierTracing>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="DeadLetterOptions"/> to its configuration section and registers
    /// <see cref="InMemoryDeadLetterHandler"/> as the singleton <see cref="IDeadLetterHandler"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddDeadLetterHandling(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var deadLetterSection = configuration.GetSection(DeadLetterOptions.SectionName);
        services.Configure<DeadLetterOptions>(deadLetterSection);
        services.AddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="ChaosEngineOptions"/> to its configuration section and registers
    /// <see cref="DefaultChaosEngine"/> as the singleton <see cref="IChaosEngine"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddChaosEngine(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var chaosSection = configuration.GetSection(ChaosEngineOptions.SectionName);
        services.Configure<ChaosEngineOptions>(chaosSection);
        services.AddSingleton<IChaosEngine, DefaultChaosEngine>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="RetentionOptions"/> to its configuration section and registers
    /// <see cref="DefaultRetentionPolicyService"/> as the singleton <see cref="IRetentionPolicyService"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddRetentionPolicies(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var retentionSection = configuration.GetSection(RetentionOptions.SectionName);
        services.Configure<RetentionOptions>(retentionSection);
        services.AddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();

        return services;
    }

    /// <summary>
    /// Returns a builder for customising the observability services. This overload registers
    /// nothing itself; it only wraps <paramref name="services"/> in an
    /// <see cref="ObservabilityServiceBuilder"/>.
    /// </summary>
    /// <param name="services">Service collection the builder operates on.</param>
    /// <returns>A new builder bound to <paramref name="services"/>.</returns>
    public static ObservabilityServiceBuilder AddNotifierObservability(this IServiceCollection services)
        => new ObservabilityServiceBuilder(services);
}
|
||||
|
||||
/// <summary>
/// Fluent builder for customising observability services: option callbacks, custom
/// implementations, and a final <see cref="Build"/> that fills in defaults.
/// </summary>
public sealed class ObservabilityServiceBuilder
{
    private readonly IServiceCollection _services;

    /// <summary>Creates a builder over the given service collection.</summary>
    /// <param name="services">Service collection the builder registers into.</param>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
    public ObservabilityServiceBuilder(IServiceCollection services)
    {
        _services = services ?? throw new ArgumentNullException(nameof(services));
    }

    /// <summary>Registers a configuration callback for the metrics options.</summary>
    public ObservabilityServiceBuilder ConfigureMetrics(Action<NotifierMetricsOptions> configure)
        => ApplyOptions(configure);

    /// <summary>Registers a configuration callback for the tracing options.</summary>
    public ObservabilityServiceBuilder ConfigureTracing(Action<NotifierTracingOptions> configure)
        => ApplyOptions(configure);

    /// <summary>Registers a configuration callback for the dead-letter options.</summary>
    public ObservabilityServiceBuilder ConfigureDeadLetter(Action<DeadLetterOptions> configure)
        => ApplyOptions(configure);

    /// <summary>Registers a configuration callback for the chaos engine options.</summary>
    public ObservabilityServiceBuilder ConfigureChaos(Action<ChaosEngineOptions> configure)
        => ApplyOptions(configure);

    /// <summary>Registers a configuration callback for the retention options.</summary>
    public ObservabilityServiceBuilder ConfigureRetention(Action<RetentionOptions> configure)
        => ApplyOptions(configure);

    /// <summary>Registers <typeparamref name="T"/> as the singleton metrics implementation.</summary>
    public ObservabilityServiceBuilder UseCustomMetrics<T>() where T : class, INotifierMetrics
        => Register<INotifierMetrics, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton tracing implementation.</summary>
    public ObservabilityServiceBuilder UseCustomTracing<T>() where T : class, INotifierTracing
        => Register<INotifierTracing, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton dead-letter handler.</summary>
    public ObservabilityServiceBuilder UseCustomDeadLetterHandler<T>() where T : class, IDeadLetterHandler
        => Register<IDeadLetterHandler, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton chaos engine.</summary>
    public ObservabilityServiceBuilder UseCustomChaosEngine<T>() where T : class, IChaosEngine
        => Register<IChaosEngine, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton retention policy service.</summary>
    public ObservabilityServiceBuilder UseCustomRetentionService<T>() where T : class, IRetentionPolicyService
        => Register<IRetentionPolicyService, T>();

    /// <summary>
    /// Completes the builder: each observability service type that has no registration yet gets
    /// its default implementation; existing registrations are left as they are.
    /// </summary>
    /// <returns>The underlying service collection.</returns>
    public IServiceCollection Build()
    {
        // TryAddSingleton only adds when the service type is not registered yet,
        // so custom implementations registered earlier stay in place.
        _services.TryAddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        _services.TryAddSingleton<INotifierTracing, DefaultNotifierTracing>();
        _services.TryAddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        _services.TryAddSingleton<IChaosEngine, DefaultChaosEngine>();
        _services.TryAddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();

        return _services;
    }

    /// <summary>Shared body of the Configure* methods: registers the options callback and returns the builder.</summary>
    private ObservabilityServiceBuilder ApplyOptions<TOptions>(Action<TOptions> configure)
        where TOptions : class
    {
        _services.Configure(configure);
        return this;
    }

    /// <summary>Shared body of the UseCustom* methods: adds a singleton registration and returns the builder.</summary>
    private ObservabilityServiceBuilder Register<TService, TImplementation>()
        where TService : class
        where TImplementation : class, TService
    {
        _services.AddSingleton<TService, TImplementation>();
        return this;
    }
}
|
||||
|
||||
/// <summary>
/// File-local helper for conditional singleton registration.
/// </summary>
file static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers <typeparamref name="TImplementation"/> as a singleton for
    /// <typeparamref name="TService"/> unless the collection already holds a descriptor
    /// whose service type is <typeparamref name="TService"/>.
    /// </summary>
    /// <param name="services">Service collection to inspect and, if needed, add to.</param>
    public static void TryAddSingleton<TService, TImplementation>(this IServiceCollection services)
        where TService : class
        where TImplementation : class, TService
    {
        var wantedType = typeof(TService);

        foreach (var descriptor in services)
        {
            if (descriptor.ServiceType == wantedType)
            {
                // Already registered: leave the existing registration untouched.
                return;
            }
        }

        services.AddSingleton<TService, TImplementation>();
    }
}
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.Notifier.Worker.Retention;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Dependency-injection helpers that register the notifier's observability services.
/// </summary>
public static class ObservabilityServiceExtensions
{
    /// <summary>
    /// Registers every observability service in one call: metrics, tracing, dead-letter handling,
    /// chaos engine and retention policies, in that order.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option sections are read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddNotifierObservability(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        services.AddNotifierMetrics(configuration);
        services.AddNotifierTracing(configuration);
        services.AddDeadLetterHandling(configuration);
        services.AddChaosEngine(configuration);
        services.AddRetentionPolicies(configuration);

        return services;
    }

    /// <summary>
    /// Binds <see cref="NotifierMetricsOptions"/> to its configuration section and registers
    /// <see cref="DefaultNotifierMetrics"/> as the singleton <see cref="INotifierMetrics"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddNotifierMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var metricsSection = configuration.GetSection(NotifierMetricsOptions.SectionName);
        services.Configure<NotifierMetricsOptions>(metricsSection);
        services.AddSingleton<INotifierMetrics, DefaultNotifierMetrics>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="NotifierTracingOptions"/> to its configuration section and registers
    /// <see cref="DefaultNotifierTracing"/> as the singleton <see cref="INotifierTracing"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddNotifierTracing(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var tracingSection = configuration.GetSection(NotifierTracingOptions.SectionName);
        services.Configure<NotifierTracingOptions>(tracingSection);
        services.AddSingleton<INotifierTracing, DefaultNotifierTracing>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="DeadLetterOptions"/> to its configuration section and registers
    /// <see cref="InMemoryDeadLetterHandler"/> as the singleton <see cref="IDeadLetterHandler"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddDeadLetterHandling(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var deadLetterSection = configuration.GetSection(DeadLetterOptions.SectionName);
        services.Configure<DeadLetterOptions>(deadLetterSection);
        services.AddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="ChaosEngineOptions"/> to its configuration section and registers
    /// <see cref="DefaultChaosEngine"/> as the singleton <see cref="IChaosEngine"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddChaosEngine(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var chaosSection = configuration.GetSection(ChaosEngineOptions.SectionName);
        services.Configure<ChaosEngineOptions>(chaosSection);
        services.AddSingleton<IChaosEngine, DefaultChaosEngine>();

        return services;
    }

    /// <summary>
    /// Binds <see cref="RetentionOptions"/> to its configuration section and registers
    /// <see cref="DefaultRetentionPolicyService"/> as the singleton <see cref="IRetentionPolicyService"/>.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The same <paramref name="services"/> instance.</returns>
    public static IServiceCollection AddRetentionPolicies(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var retentionSection = configuration.GetSection(RetentionOptions.SectionName);
        services.Configure<RetentionOptions>(retentionSection);
        services.AddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();

        return services;
    }

    /// <summary>
    /// Returns a builder for customising the observability services. This overload registers
    /// nothing itself; it only wraps <paramref name="services"/> in an
    /// <see cref="ObservabilityServiceBuilder"/>.
    /// </summary>
    /// <param name="services">Service collection the builder operates on.</param>
    /// <returns>A new builder bound to <paramref name="services"/>.</returns>
    public static ObservabilityServiceBuilder AddNotifierObservability(this IServiceCollection services)
        => new ObservabilityServiceBuilder(services);
}
|
||||
|
||||
/// <summary>
/// Fluent builder that customizes the notifier observability registrations
/// held in an <see cref="IServiceCollection"/>.
/// </summary>
/// <remarks>
/// The <c>Configure*</c> members register an options callback, the <c>UseCustom*</c>
/// members add a singleton registration for a caller-supplied implementation, and
/// <see cref="Build"/> adds the default implementations and returns the collection.
/// Every member except <see cref="Build"/> returns the builder itself for chaining.
/// </remarks>
public sealed class ObservabilityServiceBuilder
{
    private readonly IServiceCollection _services;

    /// <summary>
    /// Creates a builder over the given service collection.
    /// </summary>
    /// <param name="services">The collection that receives every registration.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="services"/> is <see langword="null"/>.
    /// </exception>
    public ObservabilityServiceBuilder(IServiceCollection services)
    {
        if (services is null)
        {
            throw new ArgumentNullException(nameof(services));
        }

        _services = services;
    }

    /// <summary>
    /// Registers a callback that configures <see cref="NotifierMetricsOptions"/>.
    /// </summary>
    /// <param name="configure">The callback applied to the metrics options.</param>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureMetrics(Action<NotifierMetricsOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a callback that configures <see cref="NotifierTracingOptions"/>.
    /// </summary>
    /// <param name="configure">The callback applied to the tracing options.</param>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureTracing(Action<NotifierTracingOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a callback that configures <see cref="DeadLetterOptions"/>.
    /// </summary>
    /// <param name="configure">The callback applied to the dead-letter options.</param>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureDeadLetter(Action<DeadLetterOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a callback that configures <see cref="ChaosEngineOptions"/>.
    /// </summary>
    /// <param name="configure">The callback applied to the chaos engine options.</param>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureChaos(Action<ChaosEngineOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Registers a callback that configures <see cref="RetentionOptions"/>.
    /// </summary>
    /// <param name="configure">The callback applied to the retention options.</param>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureRetention(Action<RetentionOptions> configure)
        => WithOptions(configure);

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="INotifierMetrics"/>.
    /// </summary>
    /// <typeparam name="T">The custom metrics implementation.</typeparam>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomMetrics<T>() where T : class, INotifierMetrics
        => WithSingleton<INotifierMetrics, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="INotifierTracing"/>.
    /// </summary>
    /// <typeparam name="T">The custom tracing implementation.</typeparam>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomTracing<T>() where T : class, INotifierTracing
        => WithSingleton<INotifierTracing, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="IDeadLetterHandler"/>.
    /// </summary>
    /// <typeparam name="T">The custom dead-letter handler.</typeparam>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomDeadLetterHandler<T>() where T : class, IDeadLetterHandler
        => WithSingleton<IDeadLetterHandler, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="IChaosEngine"/>.
    /// </summary>
    /// <typeparam name="T">The custom chaos engine.</typeparam>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomChaosEngine<T>() where T : class, IChaosEngine
        => WithSingleton<IChaosEngine, T>();

    /// <summary>
    /// Adds <typeparamref name="T"/> as a singleton <see cref="IRetentionPolicyService"/>.
    /// </summary>
    /// <typeparam name="T">The custom retention policy service.</typeparam>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomRetentionService<T>() where T : class, IRetentionPolicyService
        => WithSingleton<IRetentionPolicyService, T>();

    /// <summary>
    /// Completes the builder by adding the default implementations.
    /// </summary>
    /// <returns>The underlying service collection.</returns>
    public IServiceCollection Build()
    {
        // Defaults are only meant to fill the gaps: service types that already
        // have a registration keep it.
        _services.TryAddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        _services.TryAddSingleton<INotifierTracing, DefaultNotifierTracing>();
        _services.TryAddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        _services.TryAddSingleton<IChaosEngine, DefaultChaosEngine>();
        _services.TryAddSingleton<IRetentionPolicyService, DefaultRetentionPolicyService>();

        return _services;
    }

    // Registers an options callback on the wrapped collection and hands the builder back.
    private ObservabilityServiceBuilder WithOptions<TOptions>(Action<TOptions> configure)
        where TOptions : class
    {
        _services.Configure(configure);
        return this;
    }

    // Adds a TService -> TImplementation singleton to the wrapped collection and hands the builder back.
    private ObservabilityServiceBuilder WithSingleton<TService, TImplementation>()
        where TService : class
        where TImplementation : class, TService
    {
        _services.AddSingleton<TService, TImplementation>();
        return this;
    }
}
|
||||
|
||||
/// <summary>
/// File-local registration helpers for <see cref="IServiceCollection"/>.
/// </summary>
file static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers <typeparamref name="TImplementation"/> as the singleton for
    /// <typeparamref name="TService"/> unless the collection already holds a
    /// descriptor whose service type is <typeparamref name="TService"/>.
    /// </summary>
    /// <typeparam name="TService">The service type to look up and register.</typeparam>
    /// <typeparam name="TImplementation">The implementation used when nothing is registered yet.</typeparam>
    /// <param name="services">The collection to inspect and, if needed, add to.</param>
    public static void TryAddSingleton<TService, TImplementation>(this IServiceCollection services)
        where TService : class
        where TImplementation : class, TService
    {
        var serviceType = typeof(TService);
        var alreadyRegistered = services.Any(descriptor => descriptor.ServiceType == serviceType);

        if (alreadyRegistered)
        {
            // An existing descriptor for this service type wins; leave it untouched.
            return;
        }

        services.AddSingleton<TService, TImplementation>();
    }
}
|
||||
|
||||
Reference in New Issue
Block a user