Merge branch 'main' of https://git.stella-ops.org/stella-ops.org/git.stella-ops.org
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
sdk-generator-smoke / sdk-smoke (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
sdk-generator-smoke / sdk-smoke (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
This commit is contained in:
@@ -1,233 +1,233 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Counters and histograms are created on a shared static meter named <c>StellaOps.Notifier</c>;
/// tracing activities are started from an activity source with the same name. Queue depths are
/// kept in an in-memory map guarded by a lock and exposed through an observable gauge.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (using ObservableGauge pattern).
    // Keyed by a (tenant, channel) tuple rather than a "tenant:channel" string so that
    // identifiers containing ':' are still reported instead of being dropped on re-split.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates every counter, histogram and the queue-depth gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");

        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");

        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");

        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");

        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");

        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");

        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");

        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");

        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration (milliseconds), tagged by tenant,
    /// channel type and status.
    /// </summary>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };

        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged by tenant, escalation level and outcome.
    /// </summary>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant formatting keeps the tag value stable regardless of the thread culture.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };

        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged by tenant, reason and channel type.
    /// </summary>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };

        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration (milliseconds), tagged by tenant,
    /// rule id and whether the rule matched ("true"/"false").
    /// </summary>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, BoolTag(matched) }
        };

        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration (milliseconds), tagged by tenant,
    /// template key and success flag ("true"/"false").
    /// </summary>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, BoolTag(success) }
        };

        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm-detection event tagged by tenant, storm key and decision.
    /// </summary>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };

        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention-cleanup counter, tagged by tenant
    /// and entity type.
    /// </summary>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };

        // Note: the counter advances by the number of deleted items, not by one per cleanup run.
        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for the given tenant/channel pair; the value is reported
    /// the next time the queue-depth gauge is observed.
    /// </summary>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a <c>notify.delivery</c> activity tagged with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }

        return activity;
    }

    /// <summary>
    /// Starts a <c>notify.escalation</c> activity tagged with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }

        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per recorded tenant/channel pair.
    /// </summary>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        // Build a snapshot while holding the lock and return it, instead of yielding from inside
        // the lock: a lazy iterator would keep the monitor held for as long as the metrics
        // listener takes to enumerate, blocking RecordQueueDepth callers in the meantime.
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var (key, depth) in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    depth,
                    new TagList
                    {
                        { NotifyMetricTags.TenantId, key.TenantId },
                        { NotifyMetricTags.ChannelType, key.ChannelType }
                    }));
            }

            return snapshot;
        }
    }

    // Lower-case boolean tag value ("true"/"false").
    private static string BoolTag(bool value) => value ? "true" : "false";
}
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Default implementation of notification metrics using System.Diagnostics.Metrics.
/// </summary>
/// <remarks>
/// Counters and histograms are created on a shared static meter named <c>StellaOps.Notifier</c>;
/// tracing activities are started from an activity source with the same name. Queue depths are
/// kept in an in-memory map guarded by a lock and exposed through an observable gauge.
/// </remarks>
public sealed class DefaultNotifyMetrics : INotifyMetrics
{
    private static readonly ActivitySource ActivitySource = new("StellaOps.Notifier", "1.0.0");
    private static readonly Meter Meter = new("StellaOps.Notifier", "1.0.0");

    // Counters
    private readonly Counter<long> _deliveryAttempts;
    private readonly Counter<long> _escalationEvents;
    private readonly Counter<long> _deadLetterEntries;
    private readonly Counter<long> _ruleEvaluations;
    private readonly Counter<long> _templateRenders;
    private readonly Counter<long> _stormEvents;
    private readonly Counter<long> _retentionCleanups;

    // Histograms
    private readonly Histogram<double> _deliveryDuration;
    private readonly Histogram<double> _ruleEvaluationDuration;
    private readonly Histogram<double> _templateRenderDuration;

    // Gauges (using ObservableGauge pattern).
    // Keyed by a (tenant, channel) tuple rather than a "tenant:channel" string so that
    // identifiers containing ':' are still reported instead of being dropped on re-split.
    private readonly Dictionary<(string TenantId, string ChannelType), int> _queueDepths = new();
    private readonly object _queueDepthLock = new();

    /// <summary>
    /// Creates every counter, histogram and the queue-depth gauge on the shared meter.
    /// </summary>
    public DefaultNotifyMetrics()
    {
        // Initialize counters
        _deliveryAttempts = Meter.CreateCounter<long>(
            NotifyMetricNames.DeliveryAttempts,
            unit: "{attempts}",
            description: "Total number of notification delivery attempts");

        _escalationEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.EscalationEvents,
            unit: "{events}",
            description: "Total number of escalation events");

        _deadLetterEntries = Meter.CreateCounter<long>(
            NotifyMetricNames.DeadLetterEntries,
            unit: "{entries}",
            description: "Total number of dead-letter entries");

        _ruleEvaluations = Meter.CreateCounter<long>(
            NotifyMetricNames.RuleEvaluations,
            unit: "{evaluations}",
            description: "Total number of rule evaluations");

        _templateRenders = Meter.CreateCounter<long>(
            NotifyMetricNames.TemplateRenders,
            unit: "{renders}",
            description: "Total number of template render operations");

        _stormEvents = Meter.CreateCounter<long>(
            NotifyMetricNames.StormEvents,
            unit: "{events}",
            description: "Total number of storm detection events");

        _retentionCleanups = Meter.CreateCounter<long>(
            NotifyMetricNames.RetentionCleanups,
            unit: "{cleanups}",
            description: "Total number of retention cleanup operations");

        // Initialize histograms
        _deliveryDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.DeliveryDuration,
            unit: "ms",
            description: "Duration of delivery attempts in milliseconds");

        _ruleEvaluationDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.RuleEvaluationDuration,
            unit: "ms",
            description: "Duration of rule evaluations in milliseconds");

        _templateRenderDuration = Meter.CreateHistogram<double>(
            NotifyMetricNames.TemplateRenderDuration,
            unit: "ms",
            description: "Duration of template renders in milliseconds");

        // Initialize observable gauge for queue depths
        Meter.CreateObservableGauge(
            NotifyMetricNames.QueueDepth,
            observeValues: ObserveQueueDepths,
            unit: "{messages}",
            description: "Current queue depth per channel");
    }

    /// <summary>
    /// Counts one delivery attempt and records its duration (milliseconds), tagged by tenant,
    /// channel type and status.
    /// </summary>
    public void RecordDeliveryAttempt(string tenantId, string channelType, string status, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.ChannelType, channelType },
            { NotifyMetricTags.Status, status }
        };

        _deliveryAttempts.Add(1, tags);
        _deliveryDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one escalation event tagged by tenant, escalation level and outcome.
    /// </summary>
    public void RecordEscalation(string tenantId, int level, string outcome)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            // Invariant formatting keeps the tag value stable regardless of the thread culture.
            { NotifyMetricTags.Level, level.ToString(System.Globalization.CultureInfo.InvariantCulture) },
            { NotifyMetricTags.Outcome, outcome }
        };

        _escalationEvents.Add(1, tags);
    }

    /// <summary>
    /// Counts one dead-letter entry tagged by tenant, reason and channel type.
    /// </summary>
    public void RecordDeadLetter(string tenantId, string reason, string channelType)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.Reason, reason },
            { NotifyMetricTags.ChannelType, channelType }
        };

        _deadLetterEntries.Add(1, tags);
    }

    /// <summary>
    /// Counts one rule evaluation and records its duration (milliseconds), tagged by tenant,
    /// rule id and whether the rule matched ("true"/"false").
    /// </summary>
    public void RecordRuleEvaluation(string tenantId, string ruleId, bool matched, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.RuleId, ruleId },
            { NotifyMetricTags.Matched, BoolTag(matched) }
        };

        _ruleEvaluations.Add(1, tags);
        _ruleEvaluationDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one template render and records its duration (milliseconds), tagged by tenant,
    /// template key and success flag ("true"/"false").
    /// </summary>
    public void RecordTemplateRender(string tenantId, string templateKey, bool success, TimeSpan duration)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.TemplateKey, templateKey },
            { NotifyMetricTags.Success, BoolTag(success) }
        };

        _templateRenders.Add(1, tags);
        _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
    }

    /// <summary>
    /// Counts one storm-detection event tagged by tenant, storm key and decision.
    /// </summary>
    public void RecordStormEvent(string tenantId, string stormKey, string decision)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.StormKey, stormKey },
            { NotifyMetricTags.Decision, decision }
        };

        _stormEvents.Add(1, tags);
    }

    /// <summary>
    /// Adds <paramref name="deletedCount"/> to the retention-cleanup counter, tagged by tenant
    /// and entity type.
    /// </summary>
    public void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount)
    {
        var tags = new TagList
        {
            { NotifyMetricTags.TenantId, tenantId },
            { NotifyMetricTags.EntityType, entityType }
        };

        // Note: the counter advances by the number of deleted items, not by one per cleanup run.
        _retentionCleanups.Add(deletedCount, tags);
    }

    /// <summary>
    /// Stores the latest queue depth for the given tenant/channel pair; the value is reported
    /// the next time the queue-depth gauge is observed.
    /// </summary>
    public void RecordQueueDepth(string tenantId, string channelType, int depth)
    {
        lock (_queueDepthLock)
        {
            _queueDepths[(tenantId, channelType)] = depth;
        }
    }

    /// <summary>
    /// Starts a <c>notify.delivery</c> activity tagged with tenant, delivery id and channel type.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType)
    {
        var activity = ActivitySource.StartActivity("notify.delivery", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("delivery_id", deliveryId);
            activity.SetTag(NotifyMetricTags.ChannelType, channelType);
        }

        return activity;
    }

    /// <summary>
    /// Starts a <c>notify.escalation</c> activity tagged with tenant, incident id and level.
    /// </summary>
    /// <returns>The started activity, or <c>null</c> when the activity source returns none.</returns>
    public Activity? StartEscalationActivity(string tenantId, string incidentId, int level)
    {
        var activity = ActivitySource.StartActivity("notify.escalation", ActivityKind.Internal);
        if (activity is not null)
        {
            activity.SetTag(NotifyMetricTags.TenantId, tenantId);
            activity.SetTag("incident_id", incidentId);
            activity.SetTag(NotifyMetricTags.Level, level);
        }

        return activity;
    }

    /// <summary>
    /// Gauge callback: returns one measurement per recorded tenant/channel pair.
    /// </summary>
    private IEnumerable<Measurement<int>> ObserveQueueDepths()
    {
        // Build a snapshot while holding the lock and return it, instead of yielding from inside
        // the lock: a lazy iterator would keep the monitor held for as long as the metrics
        // listener takes to enumerate, blocking RecordQueueDepth callers in the meantime.
        lock (_queueDepthLock)
        {
            var snapshot = new List<Measurement<int>>(_queueDepths.Count);
            foreach (var (key, depth) in _queueDepths)
            {
                snapshot.Add(new Measurement<int>(
                    depth,
                    new TagList
                    {
                        { NotifyMetricTags.TenantId, key.TenantId },
                        { NotifyMetricTags.ChannelType, key.ChannelType }
                    }));
            }

            return snapshot;
        }
    }

    // Lower-case boolean tag value ("true"/"false").
    private static string BoolTag(bool value) => value ? "true" : "false";
}
|
||||
|
||||
@@ -0,0 +1,471 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for the chaos testing engine, which simulates channel outages and failures
/// by registering faults and answering whether a channel operation should fail.
/// </summary>
public interface IChaosEngine
{
    /// <summary>Registers a new fault described by <paramref name="request"/>.</summary>
    /// <param name="request">Channel, fault type and optional tuning values for the fault.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The fault as it was registered.</returns>
    Task<ChaosFaultInjection> InjectFaultAsync(
        ChaosFaultRequest request,
        CancellationToken cancellationToken = default);

    /// <summary>Removes a previously injected fault.</summary>
    /// <param name="faultId">Identifier of the fault to remove.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns><c>true</c> when a fault with that id was removed; otherwise <c>false</c>.</returns>
    Task<bool> RemoveFaultAsync(
        string faultId,
        CancellationToken cancellationToken = default);

    /// <summary>Lists the faults that are currently active.</summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(
        CancellationToken cancellationToken = default);

    /// <summary>Decides whether an operation on a channel should fail because of an active fault.</summary>
    /// <param name="channelType">Channel type the operation targets.</param>
    /// <param name="tenantId">Optional tenant the operation belongs to.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The decision, together with the matching fault when there is one.</returns>
    Task<ChaosFaultResult> ShouldFailAsync(
        string channelType,
        string? tenantId = null,
        CancellationToken cancellationToken = default);

    /// <summary>Executes the steps of a chaos test scenario.</summary>
    /// <param name="scenario">Scenario to run.</param>
    /// <param name="cancellationToken">Token used to cancel the run.</param>
    /// <returns>The overall outcome, including per-step results.</returns>
    Task<ChaosTestResult> RunScenarioAsync(
        ChaosScenario scenario,
        CancellationToken cancellationToken = default);

    /// <summary>Returns results of previously executed chaos tests.</summary>
    /// <param name="limit">Maximum number of entries to return (defaults to 50).</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(
        int limit = 50,
        CancellationToken cancellationToken = default);

    /// <summary>Removes every active fault.</summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task ClearAllFaultsAsync(CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Describes a fault that a caller wants the chaos engine to inject.
/// </summary>
public sealed record ChaosFaultRequest
{
    /// <summary>Channel type the fault applies to.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is limited to; <c>null</c> leaves it unrestricted.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure to simulate.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Chance that a matching operation fails; defaults to 1.0.</summary>
    public double FailureProbability { get; init; } = 1.0;

    /// <summary>Requested lifetime of the fault; <c>null</c> when not specified.</summary>
    public TimeSpan? Duration { get; init; }

    /// <summary>Optional latency to report alongside a triggered fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code to attach to the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional error message to attach to the fault.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-text description of the fault.</summary>
    public string? Description { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of failure the chaos engine can simulate.
/// </summary>
public enum ChaosFaultType
{
    /// <summary>Channel outage.</summary>
    Outage,

    /// <summary>Added latency.</summary>
    Latency,

    /// <summary>Rate limiting.</summary>
    RateLimit,

    /// <summary>Authentication failure.</summary>
    AuthFailure,

    /// <summary>Operation timeout.</summary>
    Timeout,

    /// <summary>Partial failure.</summary>
    PartialFailure,

    /// <summary>Intermittent failure.</summary>
    Intermittent
}
|
||||
|
||||
/// <summary>
/// A fault that has been registered with the chaos engine.
/// </summary>
public sealed record ChaosFaultInjection
{
    /// <summary>Identifier assigned to the fault.</summary>
    public required string FaultId { get; init; }

    /// <summary>Channel type the fault applies to.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Tenant the fault is limited to; <c>null</c> leaves it unrestricted.</summary>
    public string? TenantId { get; init; }

    /// <summary>Kind of failure being simulated.</summary>
    public required ChaosFaultType FaultType { get; init; }

    /// <summary>Chance that a matching operation fails.</summary>
    public double FailureProbability { get; init; }

    /// <summary>Optional latency reported alongside a triggered fault.</summary>
    public TimeSpan? LatencyInjection { get; init; }

    /// <summary>Optional error code attached to the fault.</summary>
    public int? ErrorCode { get; init; }

    /// <summary>Optional error message attached to the fault.</summary>
    public string? ErrorMessage { get; init; }

    /// <summary>Optional free-text description of the fault.</summary>
    public string? Description { get; init; }

    /// <summary>When the fault was created.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>When the fault expires; <c>null</c> when no expiry is set.</summary>
    public DateTimeOffset? ExpiresAt { get; init; }

    /// <summary>How many times the fault has been triggered.</summary>
    public int TriggerCount { get; init; }

    /// <summary>Whether the fault is active; defaults to <c>true</c>.</summary>
    public bool IsActive { get; init; } = true;
}
|
||||
|
||||
/// <summary>
/// Outcome of asking the chaos engine whether an operation should fail.
/// </summary>
public sealed record ChaosFaultResult
{
    /// <summary><c>true</c> when the operation should be failed.</summary>
    public bool ShouldFail { get; init; }

    /// <summary>The fault that matched the operation, if any.</summary>
    public ChaosFaultInjection? ActiveFault { get; init; }

    /// <summary>Latency associated with the triggered fault, if any.</summary>
    public TimeSpan? InjectedLatency { get; init; }

    /// <summary>Exception instance representing the simulated failure, if any.</summary>
    public Exception? SimulatedException { get; init; }
}
|
||||
|
||||
/// <summary>
/// A named, ordered list of steps that make up a chaos test.
/// </summary>
public sealed record ChaosScenario
{
    /// <summary>Identifier of the scenario.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Display name of the scenario.</summary>
    public required string Name { get; init; }

    /// <summary>Optional free-text description.</summary>
    public string? Description { get; init; }

    /// <summary>Steps of the scenario.</summary>
    public required IReadOnlyList<ChaosScenarioStep> Steps { get; init; }

    /// <summary>Time budget for the whole scenario; defaults to 10 minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(10);

    /// <summary>When <c>true</c>, the run stops at the first failing step.</summary>
    public bool StopOnFirstFailure { get; init; }
}
|
||||
|
||||
/// <summary>
/// One step of a chaos scenario: an action plus the payload that action needs.
/// </summary>
public sealed record ChaosScenarioStep
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Display name of the step.</summary>
    public required string Name { get; init; }

    /// <summary>What the step does.</summary>
    public required ChaosStepAction Action { get; init; }

    /// <summary>Fault to inject; used with <see cref="ChaosStepAction.InjectFault"/>.</summary>
    public ChaosFaultRequest? FaultToInject { get; init; }

    /// <summary>Fault id to remove; used with <see cref="ChaosStepAction.RemoveFault"/>.</summary>
    public string? FaultIdToRemove { get; init; }

    /// <summary>How long to wait; used with <see cref="ChaosStepAction.Wait"/>.</summary>
    public TimeSpan? WaitDuration { get; init; }

    /// <summary>Assertion to evaluate; used with <see cref="ChaosStepAction.Assert"/>.</summary>
    public ChaosAssertion? Assertion { get; init; }
}
|
||||
|
||||
/// <summary>
/// Actions a chaos scenario step can perform.
/// </summary>
public enum ChaosStepAction
{
    /// <summary>Inject a fault.</summary>
    InjectFault,

    /// <summary>Remove a fault.</summary>
    RemoveFault,

    /// <summary>Wait for a duration.</summary>
    Wait,

    /// <summary>Evaluate an assertion.</summary>
    Assert,

    /// <summary>Send a test delivery.</summary>
    SendTestDelivery,

    /// <summary>Check metrics.</summary>
    CheckMetrics
}
|
||||
|
||||
/// <summary>
/// An expectation checked by an assert step of a chaos scenario.
/// </summary>
public sealed record ChaosAssertion
{
    /// <summary>Kind of check to perform.</summary>
    public required ChaosAssertionType Type { get; init; }

    /// <summary>Name of the metric the check refers to, if any.</summary>
    public string? MetricName { get; init; }

    /// <summary>Value the check compares against, if any.</summary>
    public double? ExpectedValue { get; init; }

    /// <summary>Allowed deviation from <see cref="ExpectedValue"/>, if any.</summary>
    public double? Tolerance { get; init; }

    /// <summary>Status the check expects, if any.</summary>
    public string? ExpectedStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of check a chaos assertion can express.
/// </summary>
public enum ChaosAssertionType
{
    /// <summary>A metric equals a value.</summary>
    MetricEquals,

    /// <summary>A metric is greater than a value.</summary>
    MetricGreaterThan,

    /// <summary>A metric is less than a value.</summary>
    MetricLessThan,

    /// <summary>The dead-letter count equals a value.</summary>
    DeadLetterCountEquals,

    /// <summary>A fallback was triggered.</summary>
    FallbackTriggered,

    /// <summary>An alert fired.</summary>
    AlertFired
}
|
||||
|
||||
/// <summary>
/// Outcome of running a chaos scenario.
/// </summary>
public sealed record ChaosTestResult
{
    /// <summary>Identifier of this test run.</summary>
    public required string TestId { get; init; }

    /// <summary>Identifier of the scenario that was run.</summary>
    public required string ScenarioId { get; init; }

    /// <summary>Name of the scenario that was run.</summary>
    public required string ScenarioName { get; init; }

    /// <summary><c>true</c> when the run succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the run started.</summary>
    public DateTimeOffset StartedAt { get; init; }

    /// <summary>When the run completed.</summary>
    public DateTimeOffset CompletedAt { get; init; }

    /// <summary>How long the run took.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Results of the executed steps; empty by default.</summary>
    public IReadOnlyList<ChaosStepResult> StepResults { get; init; } = [];

    /// <summary>Error text for a failed run, if any.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of a single chaos scenario step.
/// </summary>
public sealed record ChaosStepResult
{
    /// <summary>Identifier of the step.</summary>
    public required string StepId { get; init; }

    /// <summary>Name of the step.</summary>
    public required string StepName { get; init; }

    /// <summary><c>true</c> when the step succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>When the step was executed.</summary>
    public DateTimeOffset ExecutedAt { get; init; }

    /// <summary>How long the step took.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Error text for a failed step, if any.</summary>
    public string? Error { get; init; }

    /// <summary>Optional payload produced by the step.</summary>
    public object? Data { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration for the chaos engine.
/// </summary>
public sealed class ChaosEngineOptions
{
    /// <summary>Configuration section name these options are read from.</summary>
    public const string SectionName = "Notifier:Observability:Chaos";

    /// <summary>Master switch for the chaos engine; off by default.</summary>
    public bool Enabled { get; set; }

    /// <summary>Whether chaos testing is allowed in production; off by default.</summary>
    public bool AllowInProduction { get; set; }

    /// <summary>Upper bound for the lifetime of a fault; defaults to one hour.</summary>
    public TimeSpan MaxFaultDuration { get; set; } = TimeSpan.FromHours(1);

    /// <summary>Maximum number of faults that may exist at once; defaults to 10.</summary>
    public int MaxConcurrentFaults { get; set; } = 10;

    /// <summary>Channel types listed as allowed for chaos testing.</summary>
    public IReadOnlyList<string> AllowedChannelTypes { get; set; } =
    [
        "webhook",
        "email",
        "slack",
        "teams",
        "pagerduty",
        "opsgenie"
    ];
}
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of chaos engine.
|
||||
/// </summary>
|
||||
public sealed class DefaultChaosEngine : IChaosEngine
|
||||
{
|
||||
private readonly ConcurrentDictionary<string, ChaosFaultInjection> _activeFaults = new();
|
||||
private readonly List<ChaosTestResult> _testHistory = [];
|
||||
private readonly ChaosEngineOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly INotifierMetrics? _metrics;
|
||||
private readonly ILogger<DefaultChaosEngine> _logger;
|
||||
private readonly Random _random = new();
|
||||
|
||||
/// <summary>
/// Creates the engine.
/// </summary>
/// <param name="options">Engine options; default options are used when this or its value is <c>null</c>.</param>
/// <param name="timeProvider">Clock to use; <see cref="TimeProvider.System"/> when <c>null</c>.</param>
/// <param name="metrics">Optional metrics sink; may be <c>null</c>.</param>
/// <param name="logger">Logger; required.</param>
/// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
public DefaultChaosEngine(
    IOptions<ChaosEngineOptions> options,
    TimeProvider timeProvider,
    INotifierMetrics? metrics,
    ILogger<DefaultChaosEngine> logger)
{
    // Same evaluation order as the field list: options, clock, metrics, then the logger check.
    var configured = options?.Value;
    _options = configured is null ? new ChaosEngineOptions() : configured;

    _timeProvider = timeProvider is null ? TimeProvider.System : timeProvider;
    _metrics = metrics;

    if (logger is null)
    {
        throw new ArgumentNullException(nameof(logger));
    }

    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Registers a new fault for a channel type.
/// </summary>
/// <param name="request">Fault description. Its duration is capped at the configured maximum
/// (and defaults to that maximum); its failure probability is clamped to [0, 1].</param>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns>The registered fault, including its generated id and expiry time.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The engine is disabled, or the maximum number
/// of concurrent faults has been reached.</exception>
public Task<ChaosFaultInjection> InjectFaultAsync(ChaosFaultRequest request, CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(request);

    if (!_options.Enabled)
    {
        throw new InvalidOperationException("Chaos engine is disabled");
    }

    // Run the same expiry purge the read paths use before applying the cap; otherwise faults
    // that have already expired keep occupying slots until something else triggers a purge.
    CleanupExpiredFaults();

    if (_activeFaults.Count >= _options.MaxConcurrentFaults)
    {
        throw new InvalidOperationException($"Maximum concurrent faults ({_options.MaxConcurrentFaults}) reached");
    }

    var now = _timeProvider.GetUtcNow();

    // No requested duration means "as long as allowed"; longer requests are capped.
    var duration = request.Duration ?? _options.MaxFaultDuration;
    if (duration > _options.MaxFaultDuration)
    {
        duration = _options.MaxFaultDuration;
    }

    var fault = new ChaosFaultInjection
    {
        // "chaos-" plus the first 10 hex characters of a new GUID (16 characters total).
        FaultId = $"chaos-{Guid.NewGuid():N}"[..16],
        ChannelType = request.ChannelType,
        TenantId = request.TenantId,
        FaultType = request.FaultType,
        FailureProbability = Math.Clamp(request.FailureProbability, 0.0, 1.0),
        LatencyInjection = request.LatencyInjection,
        ErrorCode = request.ErrorCode,
        ErrorMessage = request.ErrorMessage,
        Description = request.Description,
        CreatedAt = now,
        ExpiresAt = now + duration,
        IsActive = true
    };

    _activeFaults[fault.FaultId] = fault;
    _logger.LogWarning("Injected chaos fault {FaultId} for channel {ChannelType}: {FaultType}", fault.FaultId, fault.ChannelType, fault.FaultType);

    return Task.FromResult(fault);
}
|
||||
|
||||
/// <summary>
/// Removes the fault with the given id, logging when one was removed.
/// </summary>
/// <param name="faultId">Identifier of the fault to remove.</param>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns><c>true</c> when a fault was removed; <c>false</c> when the id was unknown.</returns>
public Task<bool> RemoveFaultAsync(string faultId, CancellationToken cancellationToken = default)
{
    if (!_activeFaults.TryRemove(faultId, out _))
    {
        return Task.FromResult(false);
    }

    _logger.LogInformation("Removed chaos fault {FaultId}", faultId);
    return Task.FromResult(true);
}
|
||||
|
||||
/// <summary>
/// Returns a snapshot of the registered faults after purging expired ones.
/// </summary>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
public Task<IReadOnlyList<ChaosFaultInjection>> GetActiveFaultsAsync(CancellationToken cancellationToken = default)
{
    CleanupExpiredFaults();

    // Copy the values so the caller gets a stable list, independent of later changes.
    IReadOnlyList<ChaosFaultInjection> snapshot = _activeFaults.Values.ToList();
    return Task.FromResult(snapshot);
}
|
||||
|
||||
/// <summary>
/// Decides whether an operation on <paramref name="channelType"/> should fail because of an
/// active fault.
/// </summary>
/// <param name="channelType">Channel type of the operation. A fault registered for "*" matches any channel.</param>
/// <param name="tenantId">Tenant of the operation. Faults without a tenant match every tenant.</param>
/// <param name="cancellationToken">Not observed; the method completes synchronously.</param>
/// <returns>
/// A result with <c>ShouldFail = false</c> when the engine is disabled, no fault matches, or the
/// probability roll spares the operation (the matching fault is still reported in that case).
/// Otherwise a result carrying the fault, its configured latency and a simulated exception.
/// </returns>
public Task<ChaosFaultResult> ShouldFailAsync(string channelType, string? tenantId = null, CancellationToken cancellationToken = default)
{
    if (!_options.Enabled)
    {
        return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
    }

    CleanupExpiredFaults();

    var matchingFault = _activeFaults.Values.FirstOrDefault(f =>
        f.IsActive
        && (f.ChannelType == channelType || f.ChannelType == "*")
        && (f.TenantId is null || f.TenantId == tenantId));

    if (matchingFault is null)
    {
        return Task.FromResult(new ChaosFaultResult { ShouldFail = false });
    }

    // System.Random instances are not thread-safe, and this method can be reached from
    // concurrent callers, so serialize access to the shared generator.
    double roll;
    lock (_random)
    {
        roll = _random.NextDouble();
    }

    if (roll >= matchingFault.FailureProbability)
    {
        return Task.FromResult(new ChaosFaultResult { ShouldFail = false, ActiveFault = matchingFault });
    }

    // Update trigger count with compare-and-swap. Writing through the indexer would lose
    // concurrent increments and would re-add a fault that was removed in the meantime.
    var current = matchingFault;
    while (true)
    {
        var bumped = current with { TriggerCount = current.TriggerCount + 1 };
        if (_activeFaults.TryUpdate(current.FaultId, bumped, current))
        {
            break;
        }

        if (!_activeFaults.TryGetValue(current.FaultId, out var latest))
        {
            // Removed or expired concurrently: nothing left to count against.
            break;
        }

        current = latest;
    }

    var exception = matchingFault.FaultType switch
    {
        ChaosFaultType.Outage => new InvalidOperationException(matchingFault.ErrorMessage ?? "Channel outage (chaos)"),
        ChaosFaultType.AuthFailure => new UnauthorizedAccessException(matchingFault.ErrorMessage ?? "Auth failure (chaos)"),
        ChaosFaultType.Timeout => new TimeoutException(matchingFault.ErrorMessage ?? "Timeout (chaos)"),
        ChaosFaultType.RateLimit => new InvalidOperationException(matchingFault.ErrorMessage ?? "Rate limited (chaos)"),
        _ => new Exception(matchingFault.ErrorMessage ?? "Chaos fault")
    };

    return Task.FromResult(new ChaosFaultResult
    {
        ShouldFail = true,
        ActiveFault = matchingFault,
        InjectedLatency = matchingFault.LatencyInjection,
        SimulatedException = exception
    });
}
|
||||
|
||||
/// <summary>
/// Runs the steps of a chaos scenario in order and records the outcome in the test history.
/// </summary>
/// <param name="scenario">Scenario to run. Its timeout bounds the whole run.</param>
/// <param name="cancellationToken">Cancels the run; linked with the scenario timeout.</param>
/// <returns>
/// The run result. Failures, timeouts and other exceptions raised while executing steps are
/// reported through <c>Success = false</c> and <c>Error</c> rather than thrown.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="scenario"/> is <c>null</c>.</exception>
/// <exception cref="InvalidOperationException">The engine is disabled.</exception>
public async Task<ChaosTestResult> RunScenarioAsync(ChaosScenario scenario, CancellationToken cancellationToken = default)
{
    // Number of most recent results kept in the in-memory history.
    const int MaxHistoryEntries = 100;

    ArgumentNullException.ThrowIfNull(scenario);

    if (!_options.Enabled)
    {
        throw new InvalidOperationException("Chaos engine is disabled");
    }

    var testId = $"test-{Guid.NewGuid():N}"[..16];
    var startedAt = _timeProvider.GetUtcNow();
    var stepResults = new List<ChaosStepResult>();
    string? error = null;
    var success = true;

    _logger.LogInformation("Starting chaos scenario {ScenarioId}: {ScenarioName}", scenario.ScenarioId, scenario.Name);

    using var timeoutCts = new CancellationTokenSource(scenario.Timeout);
    using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken, timeoutCts.Token);

    try
    {
        foreach (var step in scenario.Steps)
        {
            var stepResult = await ExecuteStepAsync(step, linkedCts.Token);
            stepResults.Add(stepResult);

            if (stepResult.Success)
            {
                continue;
            }

            // A failed step always fails the scenario; it only ends the run early on request.
            success = false;
            if (scenario.StopOnFirstFailure)
            {
                error = $"Step '{step.Name}' failed: {stepResult.Error}";
                break;
            }
        }
    }
    catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
    {
        success = false;
        error = "Scenario timed out";
    }
    catch (Exception ex)
    {
        // Includes caller-requested cancellation: the run is still recorded as a failed result.
        success = false;
        error = ex.Message;
    }

    var completedAt = _timeProvider.GetUtcNow();
    var result = new ChaosTestResult
    {
        TestId = testId,
        ScenarioId = scenario.ScenarioId,
        ScenarioName = scenario.Name,
        Success = success,
        StartedAt = startedAt,
        CompletedAt = completedAt,
        Duration = completedAt - startedAt,
        StepResults = stepResults,
        Error = error
    };

    lock (_testHistory)
    {
        _testHistory.Add(result);

        // Drop the oldest entries once the history exceeds its cap.
        while (_testHistory.Count > MaxHistoryEntries)
        {
            _testHistory.RemoveAt(0);
        }
    }

    _logger.LogInformation("Chaos scenario {ScenarioId} completed: {Success}", scenario.ScenarioId, success ? "PASSED" : "FAILED");
    return result;
}
|
||||
|
||||
/// <summary>
/// Executes a single scenario step and converts its outcome into a <see cref="ChaosStepResult"/>.
/// </summary>
/// <param name="step">Step to run. An action whose required data is missing performs no work.</param>
/// <param name="cancellationToken">Token observed while injecting/removing faults and while waiting.</param>
/// <returns>
/// A successful result (carrying the injected fault as <c>Data</c> for inject steps), or a failed
/// result holding the assertion error or the exception message.
/// </returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was cancelled while the step was running.
/// </exception>
private async Task<ChaosStepResult> ExecuteStepAsync(ChaosScenarioStep step, CancellationToken cancellationToken)
{
    var executedAt = _timeProvider.GetUtcNow();

    ChaosStepResult Failed(string? error) => new ChaosStepResult
    {
        StepId = step.StepId,
        StepName = step.Name,
        Success = false,
        ExecutedAt = executedAt,
        Duration = _timeProvider.GetUtcNow() - executedAt,
        Error = error
    };

    try
    {
        object? data = null;

        switch (step.Action)
        {
            case ChaosStepAction.InjectFault when step.FaultToInject is not null:
                data = await InjectFaultAsync(step.FaultToInject, cancellationToken);
                break;

            case ChaosStepAction.RemoveFault when step.FaultIdToRemove is not null:
                await RemoveFaultAsync(step.FaultIdToRemove, cancellationToken);
                break;

            case ChaosStepAction.Wait when step.WaitDuration.HasValue:
                await Task.Delay(step.WaitDuration.Value, cancellationToken);
                break;

            case ChaosStepAction.Assert when step.Assertion is not null:
                var assertResult = EvaluateAssertion(step.Assertion);
                if (!assertResult.passed)
                {
                    return Failed(assertResult.error);
                }

                break;
        }

        return new ChaosStepResult
        {
            StepId = step.StepId,
            StepName = step.Name,
            Success = true,
            ExecutedAt = executedAt,
            Duration = _timeProvider.GetUtcNow() - executedAt,
            Data = data
        };
    }
    catch (Exception ex) when (ex is not OperationCanceledException || !cancellationToken.IsCancellationRequested)
    {
        // Cancellation of the supplied token is deliberately not converted into a step failure:
        // it has to reach the caller so a scenario timeout is reported as such.
        return Failed(ex.Message);
    }
}
|
||||
|
||||
/// <summary>
/// Evaluates a scenario assertion.
/// </summary>
/// <remarks>
/// Placeholder: every assertion type currently passes. A production implementation would
/// query actual metrics.
/// </remarks>
/// <param name="assertion">Assertion to evaluate; only its type is inspected.</param>
/// <returns>Always <c>(true, null)</c> at present.</returns>
private (bool passed, string? error) EvaluateAssertion(ChaosAssertion assertion)
{
    switch (assertion.Type)
    {
        case ChaosAssertionType.FallbackTriggered:
        case ChaosAssertionType.AlertFired:
        default:
            return (true, null);
    }
}
|
||||
|
||||
/// <summary>
/// Returns the most recent scenario results, newest first.
/// </summary>
/// <param name="limit">Maximum number of results to return.</param>
/// <param name="cancellationToken">Not observed; the call completes synchronously.</param>
/// <returns>A copy of the newest history entries in reverse chronological order.</returns>
public Task<IReadOnlyList<ChaosTestResult>> GetHistoryAsync(int limit = 50, CancellationToken cancellationToken = default)
{
    List<ChaosTestResult> newestFirst;

    lock (_testHistory)
    {
        // Copy while holding the lock so the caller never sees the live list.
        newestFirst = _testHistory.TakeLast(limit).ToList();
    }

    newestFirst.Reverse();
    return Task.FromResult<IReadOnlyList<ChaosTestResult>>(newestFirst);
}
|
||||
|
||||
/// <summary>
/// Removes every active chaos fault and logs that the set was cleared.
/// </summary>
/// <param name="cancellationToken">Not observed; the call completes synchronously.</param>
/// <returns>An already completed task.</returns>
public Task ClearAllFaultsAsync(CancellationToken cancellationToken = default)
{
    _activeFaults.Clear();

    _logger.LogInformation("Cleared all chaos faults");

    return Task.CompletedTask;
}
|
||||
|
||||
/// <summary>
/// Drops every active fault whose expiry time lies before the current time.
/// Faults without an expiry are kept.
/// </summary>
private void CleanupExpiredFaults()
{
    var cutoff = _timeProvider.GetUtcNow();

    // Collect the ids first, then remove them.
    var expiredIds =
        (from entry in _activeFaults
         where entry.Value.ExpiresAt.HasValue && entry.Value.ExpiresAt < cutoff
         select entry.Key).ToList();

    foreach (var faultId in expiredIds)
    {
        _activeFaults.TryRemove(faultId, out _);
        _logger.LogDebug("Expired chaos fault {FaultId}", faultId);
    }
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,349 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for parking, inspecting, retrying, discarding and purging notification
/// deliveries that have failed permanently after all retries.
/// </summary>
public interface IDeadLetterHandler
{
    /// <summary>Moves a delivery to the dead-letter queue.</summary>
    /// <param name="tenantId">Tenant that owns the delivery.</param>
    /// <param name="deliveryId">Identifier of the failed delivery.</param>
    /// <param name="reason">Why the delivery is being dead-lettered.</param>
    /// <param name="channelType">Channel the delivery was targeting.</param>
    /// <param name="payload">Optional original payload to keep with the entry.</param>
    /// <param name="exception">Optional exception that caused the failure.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The created dead-letter entry.</returns>
    Task<DeadLetteredDelivery> DeadLetterAsync(string tenantId, string deliveryId, DeadLetterReason reason, string channelType, object? payload = null, Exception? exception = null, CancellationToken cancellationToken = default);

    /// <summary>Gets dead-lettered deliveries for a tenant.</summary>
    /// <param name="tenantId">Tenant whose entries are returned.</param>
    /// <param name="query">Optional filter and paging settings.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The matching entries.</returns>
    Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default);

    /// <summary>Retries a dead-lettered delivery.</summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The outcome of the retry, including the entry's new status.</returns>
    Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default);

    /// <summary>Retries all matching dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant whose entries are retried.</param>
    /// <param name="query">Optional filter selecting the entries to retry.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Totals and per-entry results.</returns>
    Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default);

    /// <summary>Discards a dead-lettered delivery.</summary>
    /// <param name="tenantId">Tenant that owns the entry.</param>
    /// <param name="deadLetterId">Identifier of the dead-letter entry.</param>
    /// <param name="reason">Optional explanation recorded with the discard.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns><c>true</c> when the entry was found and discarded; otherwise <c>false</c>.</returns>
    Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default);

    /// <summary>Gets statistics about dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant to report on, or <c>null</c> for all tenants.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Counts by status, reason and channel.</returns>
    Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>Purges old dead-lettered deliveries.</summary>
    /// <param name="tenantId">Tenant to purge, or <c>null</c> for all tenants.</param>
    /// <param name="olderThan">Minimum age an entry must have to be purged.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The number of entries removed.</returns>
    Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Why a delivery was moved to the dead-letter queue.
/// </summary>
public enum DeadLetterReason
{
    MaxRetriesExceeded = 0,
    InvalidPayload = 1,
    ChannelUnavailable = 2,
    AuthenticationFailed = 3,
    RateLimited = 4,
    TemplateRenderFailed = 5,
    ConfigurationError = 6,
    UnknownError = 7
}
|
||||
|
||||
/// <summary>
/// Immutable description of one delivery held in the dead-letter queue.
/// </summary>
public sealed record DeadLetteredDelivery
{
    /// <summary>Identifier of this dead-letter entry.</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Tenant that owns the delivery.</summary>
    public required string TenantId { get; init; }

    /// <summary>Identifier of the delivery that failed.</summary>
    public required string DeliveryId { get; init; }

    /// <summary>Channel the delivery was targeting.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Why the delivery was dead-lettered.</summary>
    public required DeadLetterReason Reason { get; init; }

    /// <summary>Optional free-text detail for <see cref="Reason"/>.</summary>
    public string? ReasonDetails { get; init; }

    /// <summary>Optional payload of the original delivery.</summary>
    public object? OriginalPayload { get; init; }

    /// <summary>Type name of the exception associated with the failure, if any.</summary>
    public string? ExceptionType { get; init; }

    /// <summary>Message of the exception associated with the failure, if any.</summary>
    public string? ExceptionMessage { get; init; }

    /// <summary>Number of delivery attempts.</summary>
    public int AttemptCount { get; init; }

    /// <summary>Time of the first delivery attempt.</summary>
    public DateTimeOffset FirstAttemptAt { get; init; }

    /// <summary>Time the delivery was dead-lettered.</summary>
    public DateTimeOffset DeadLetteredAt { get; init; }

    /// <summary>Time of the most recent retry, if any.</summary>
    public DateTimeOffset? LastRetryAt { get; init; }

    /// <summary>Number of retries performed on this entry.</summary>
    public int RetryCount { get; init; }

    /// <summary>Current status; defaults to <see cref="DeadLetterStatus.Pending"/>.</summary>
    public DeadLetterStatus Status { get; init; } = DeadLetterStatus.Pending;

    /// <summary>Explanation recorded when the entry was discarded, if any.</summary>
    public string? DiscardReason { get; init; }
}
|
||||
|
||||
/// <summary>
/// Lifecycle state of a dead-lettered delivery.
/// </summary>
public enum DeadLetterStatus
{
    Pending = 0,
    Retrying = 1,
    Retried = 2,
    Discarded = 3
}
|
||||
|
||||
/// <summary>
/// Filter and paging settings for looking up dead-lettered deliveries.
/// Filters left unset are not applied.
/// </summary>
public sealed record DeadLetterQuery
{
    /// <summary>Only entries with this reason.</summary>
    public DeadLetterReason? Reason { get; init; }

    /// <summary>Only entries for this channel type.</summary>
    public string? ChannelType { get; init; }

    /// <summary>Only entries in this status.</summary>
    public DeadLetterStatus? Status { get; init; }

    /// <summary>Lower time bound on when the entry was dead-lettered.</summary>
    public DateTimeOffset? After { get; init; }

    /// <summary>Upper time bound on when the entry was dead-lettered.</summary>
    public DateTimeOffset? Before { get; init; }

    /// <summary>Maximum number of entries to return; defaults to 100.</summary>
    public int Limit { get; init; } = 100;

    /// <summary>Number of entries to skip; defaults to 0.</summary>
    public int Offset { get; init; }
}
|
||||
|
||||
/// <summary>
/// Outcome of retrying a single dead-lettered delivery.
/// </summary>
public sealed record DeadLetterRetryResult
{
    /// <summary>Identifier of the entry the retry targeted.</summary>
    public required string DeadLetterId { get; init; }

    /// <summary>Whether the retry succeeded.</summary>
    public bool Success { get; init; }

    /// <summary>Error description when the retry did not succeed.</summary>
    public string? Error { get; init; }

    /// <summary>Status reported for the entry after the attempt.</summary>
    public DeadLetterStatus NewStatus { get; init; }
}
|
||||
|
||||
/// <summary>
/// Aggregated outcome of a bulk retry.
/// </summary>
public sealed record DeadLetterBulkRetryResult
{
    /// <summary>Number of entries a retry was attempted for.</summary>
    public int Total { get; init; }

    /// <summary>Number of retries that succeeded.</summary>
    public int Succeeded { get; init; }

    /// <summary>Number of retries that failed.</summary>
    public int Failed { get; init; }

    /// <summary>Per-entry results; empty by default.</summary>
    public IReadOnlyList<DeadLetterRetryResult> Results { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Point-in-time statistics about dead-lettered deliveries.
/// </summary>
public sealed record DeadLetterStats
{
    /// <summary>When the statistics were computed.</summary>
    public DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the statistics cover, or <c>null</c> when not tenant-specific.</summary>
    public string? TenantId { get; init; }

    /// <summary>Total number of entries.</summary>
    public int TotalCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Pending"/>.</summary>
    public int PendingCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Retrying"/>.</summary>
    public int RetryingCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Retried"/>.</summary>
    public int RetriedCount { get; init; }

    /// <summary>Entries in <see cref="DeadLetterStatus.Discarded"/>.</summary>
    public int DiscardedCount { get; init; }

    /// <summary>Entry count per dead-letter reason; empty by default.</summary>
    public IReadOnlyDictionary<DeadLetterReason, int> ByReason { get; init; } = new Dictionary<DeadLetterReason, int>();

    /// <summary>Entry count per channel type; empty by default.</summary>
    public IReadOnlyDictionary<string, int> ByChannel { get; init; } = new Dictionary<string, int>();

    /// <summary>Dead-letter time of the oldest entry, if any.</summary>
    public DateTimeOffset? OldestDeadLetterAt { get; init; }

    /// <summary>Dead-letter time of the newest entry, if any.</summary>
    public DateTimeOffset? NewestDeadLetterAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration for dead-letter handling, bound from <see cref="SectionName"/>.
/// </summary>
public sealed class DeadLetterOptions
{
    /// <summary>Configuration section these options are read from.</summary>
    public const string SectionName = "Notifier:Observability:DeadLetter";

    /// <summary>Whether dead-letter handling is enabled. Default: <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Maximum retry attempts. Default: 3.</summary>
    public int MaxRetryAttempts { get; set; } = 3;

    /// <summary>Delay between retries. Default: 5 minutes.</summary>
    public TimeSpan RetryDelay { get; set; } = TimeSpan.FromMinutes(5);

    /// <summary>How long entries are retained. Default: 30 days.</summary>
    public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromDays(30);

    /// <summary>Whether automatic purging is enabled. Default: <c>true</c>.</summary>
    public bool AutoPurge { get; set; } = true;

    /// <summary>Interval between purges. Default: 24 hours.</summary>
    public TimeSpan PurgeInterval { get; set; } = TimeSpan.FromHours(24);

    /// <summary>Alert threshold. Default: 100.</summary>
    public int AlertThreshold { get; set; } = 100;
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of dead-letter handler.
/// </summary>
/// <remarks>
/// Entries are stored per tenant in a <see cref="List{T}"/>. Every read or write of such a list
/// is performed while holding a lock on that list instance, and callers only ever receive copies.
/// </remarks>
public sealed class InMemoryDeadLetterHandler : IDeadLetterHandler
{
    private readonly ConcurrentDictionary<string, List<DeadLetteredDelivery>> _deadLetters = new();
    private readonly DeadLetterOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly INotifierMetrics? _metrics;
    private readonly ILogger<InMemoryDeadLetterHandler> _logger;

    /// <summary>Creates the handler.</summary>
    /// <param name="options">Dead-letter options; defaults are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="metrics">Optional metrics sink notified for each dead-lettered delivery.</param>
    /// <param name="logger">Logger (required).</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public InMemoryDeadLetterHandler(
        IOptions<DeadLetterOptions> options,
        TimeProvider timeProvider,
        INotifierMetrics? metrics,
        ILogger<InMemoryDeadLetterHandler> logger)
    {
        _options = options?.Value ?? new DeadLetterOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _metrics = metrics;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public Task<DeadLetteredDelivery> DeadLetterAsync(
        string tenantId,
        string deliveryId,
        DeadLetterReason reason,
        string channelType,
        object? payload = null,
        Exception? exception = null,
        CancellationToken cancellationToken = default)
    {
        var now = _timeProvider.GetUtcNow();
        var deadLetter = new DeadLetteredDelivery
        {
            DeadLetterId = $"dl-{Guid.NewGuid():N}"[..16],
            TenantId = tenantId,
            DeliveryId = deliveryId,
            ChannelType = channelType,
            Reason = reason,
            ReasonDetails = exception?.Message,
            OriginalPayload = payload,
            ExceptionType = exception?.GetType().FullName,
            ExceptionMessage = exception?.Message,
            DeadLetteredAt = now,
            FirstAttemptAt = now,
            Status = DeadLetterStatus.Pending
        };

        var list = _deadLetters.GetOrAdd(tenantId, _ => []);
        lock (list)
        {
            list.Add(deadLetter);
        }

        _metrics?.RecordDeadLetter(tenantId, reason.ToString(), channelType);
        _logger.LogWarning("Dead-lettered delivery {DeliveryId} for tenant {TenantId}: {Reason}", deliveryId, tenantId, reason);

        return Task.FromResult(deadLetter);
    }

    /// <inheritdoc />
    /// <remarks>Results are ordered newest first before offset and limit are applied.</remarks>
    public Task<IReadOnlyList<DeadLetteredDelivery>> GetAsync(
        string tenantId,
        DeadLetterQuery? query = null,
        CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>([]);
        }

        IEnumerable<DeadLetteredDelivery> filtered = Snapshot(list);

        if (query is not null)
        {
            if (query.Reason.HasValue) filtered = filtered.Where(d => d.Reason == query.Reason.Value);
            if (!string.IsNullOrEmpty(query.ChannelType)) filtered = filtered.Where(d => d.ChannelType == query.ChannelType);
            if (query.Status.HasValue) filtered = filtered.Where(d => d.Status == query.Status.Value);
            if (query.After.HasValue) filtered = filtered.Where(d => d.DeadLetteredAt > query.After.Value);
            if (query.Before.HasValue) filtered = filtered.Where(d => d.DeadLetteredAt < query.Before.Value);
        }

        var result = filtered
            .OrderByDescending(d => d.DeadLetteredAt)
            .Skip(query?.Offset ?? 0)
            .Take(query?.Limit ?? 100)
            .ToList();

        return Task.FromResult<IReadOnlyList<DeadLetteredDelivery>>(result);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Marks the entry as <see cref="DeadLetterStatus.Retried"/>, stamps the retry time and
    /// increments its retry count. No delivery is re-sent by this implementation.
    /// </remarks>
    public Task<DeadLetterRetryResult> RetryAsync(string tenantId, string deadLetterId, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(NotFound(deadLetterId));
        }

        lock (list)
        {
            // Look up and update under one lock so the write is based on the entry's current
            // state; a concurrent discard or retry can no longer be overwritten by a stale copy.
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(NotFound(deadLetterId));
            }

            var current = list[index];
            list[index] = current with
            {
                Status = DeadLetterStatus.Retried,
                LastRetryAt = _timeProvider.GetUtcNow(),
                RetryCount = current.RetryCount + 1
            };
        }

        _logger.LogInformation("Retrying dead-lettered delivery {DeadLetterId} for tenant {TenantId}", deadLetterId, tenantId);
        return Task.FromResult(new DeadLetterRetryResult { DeadLetterId = deadLetterId, Success = true, NewStatus = DeadLetterStatus.Retried });
    }

    /// <inheritdoc />
    /// <remarks>Only entries in <see cref="DeadLetterStatus.Pending"/> among the query results are retried.</remarks>
    public async Task<DeadLetterBulkRetryResult> RetryBulkAsync(string tenantId, DeadLetterQuery? query = null, CancellationToken cancellationToken = default)
    {
        var deadLetters = await GetAsync(tenantId, query, cancellationToken);
        var results = new List<DeadLetterRetryResult>();

        foreach (var dl in deadLetters.Where(d => d.Status == DeadLetterStatus.Pending))
        {
            results.Add(await RetryAsync(tenantId, dl.DeadLetterId, cancellationToken));
        }

        return new DeadLetterBulkRetryResult
        {
            Total = results.Count,
            Succeeded = results.Count(r => r.Success),
            Failed = results.Count(r => !r.Success),
            Results = results
        };
    }

    /// <inheritdoc />
    public Task<bool> DiscardAsync(string tenantId, string deadLetterId, string? reason = null, CancellationToken cancellationToken = default)
    {
        if (!_deadLetters.TryGetValue(tenantId, out var list))
        {
            return Task.FromResult(false);
        }

        lock (list)
        {
            var index = list.FindIndex(d => d.DeadLetterId == deadLetterId);
            if (index < 0)
            {
                return Task.FromResult(false);
            }

            list[index] = list[index] with { Status = DeadLetterStatus.Discarded, DiscardReason = reason };
        }

        _logger.LogInformation("Discarded dead-lettered delivery {DeadLetterId} for tenant {TenantId}: {Reason}", deadLetterId, tenantId, reason ?? "No reason");
        return Task.FromResult(true);
    }

    /// <inheritdoc />
    public Task<DeadLetterStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        // Copy each tenant list under its lock; enumerating the live lists would race with
        // concurrent additions and purges.
        var all = new List<DeadLetteredDelivery>();
        if (tenantId is not null)
        {
            if (_deadLetters.TryGetValue(tenantId, out var tenantList))
            {
                all.AddRange(Snapshot(tenantList));
            }
        }
        else
        {
            foreach (var tenantList in _deadLetters.Values)
            {
                all.AddRange(Snapshot(tenantList));
            }
        }

        return Task.FromResult(new DeadLetterStats
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalCount = all.Count,
            PendingCount = all.Count(d => d.Status == DeadLetterStatus.Pending),
            RetryingCount = all.Count(d => d.Status == DeadLetterStatus.Retrying),
            RetriedCount = all.Count(d => d.Status == DeadLetterStatus.Retried),
            DiscardedCount = all.Count(d => d.Status == DeadLetterStatus.Discarded),
            ByReason = all.GroupBy(d => d.Reason).ToDictionary(g => g.Key, g => g.Count()),
            ByChannel = all.GroupBy(d => d.ChannelType).ToDictionary(g => g.Key, g => g.Count()),
            OldestDeadLetterAt = all.MinBy(d => d.DeadLetteredAt)?.DeadLetteredAt,
            NewestDeadLetterAt = all.MaxBy(d => d.DeadLetteredAt)?.DeadLetteredAt
        });
    }

    /// <inheritdoc />
    /// <remarks>Removes entries dead-lettered before "now minus <paramref name="olderThan"/>".</remarks>
    public Task<int> PurgeAsync(string? tenantId, TimeSpan olderThan, CancellationToken cancellationToken = default)
    {
        var cutoff = _timeProvider.GetUtcNow() - olderThan;
        var purged = 0;
        List<string> tenants = tenantId is not null ? [tenantId] : _deadLetters.Keys.ToList();

        foreach (var t in tenants)
        {
            if (!_deadLetters.TryGetValue(t, out var list))
            {
                continue;
            }

            lock (list)
            {
                purged += list.RemoveAll(d => d.DeadLetteredAt < cutoff);
            }
        }

        _logger.LogInformation("Purged {Count} dead-lettered deliveries older than {OlderThan}", purged, olderThan);
        return Task.FromResult(purged);
    }

    /// <summary>Returns a copy of a tenant list taken while holding that list's lock.</summary>
    private static List<DeadLetteredDelivery> Snapshot(List<DeadLetteredDelivery> list)
    {
        lock (list)
        {
            return list.ToList();
        }
    }

    /// <summary>Builds the result returned when the tenant or the entry does not exist.</summary>
    private static DeadLetterRetryResult NotFound(string deadLetterId) =>
        new() { DeadLetterId = deadLetterId, Success = false, Error = "Not found", NewStatus = DeadLetterStatus.Pending };
}
|
||||
@@ -0,0 +1,802 @@
|
||||
using System.Collections.Concurrent;
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Metrics service for the Notifier module: records counters and latencies, updates gauges,
/// and exposes a snapshot of the current values.
/// </summary>
public interface INotifierMetrics
{
    /// <summary>Records a notification delivery attempt.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="channelType">Channel used for the delivery.</param>
    /// <param name="success">Whether the attempt succeeded.</param>
    /// <param name="duration">How long the attempt took.</param>
    void RecordDeliveryAttempt(string tenantId, string channelType, bool success, TimeSpan duration);

    /// <summary>Records an escalation event.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="policyId">Escalation policy involved.</param>
    /// <param name="level">Escalation level the event relates to.</param>
    /// <param name="eventType">Kind of escalation event.</param>
    void RecordEscalation(string tenantId, string policyId, int level, EscalationEventType eventType);

    /// <summary>Records escalation acknowledgment latency.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="policyId">Escalation policy involved.</param>
    /// <param name="latency">Time taken to acknowledge.</param>
    void RecordAckLatency(string tenantId, string policyId, TimeSpan latency);

    /// <summary>Records a storm detection event.</summary>
    /// <param name="tenantId">Tenant affected by the storm.</param>
    /// <param name="eventKind">Kind of event the storm consists of.</param>
    /// <param name="eventType">Kind of storm event.</param>
    /// <param name="suppressedCount">Number of notifications suppressed.</param>
    void RecordStormEvent(string tenantId, string eventKind, StormEventType eventType, int suppressedCount);

    /// <summary>Records a fallback attempt.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="fromChannel">Channel that was abandoned.</param>
    /// <param name="toChannel">Channel that was tried instead.</param>
    /// <param name="success">Whether the fallback succeeded.</param>
    void RecordFallback(string tenantId, string fromChannel, string toChannel, bool success);

    /// <summary>Records a dead-letter event.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="reason">Reason the delivery was dead-lettered.</param>
    /// <param name="channelType">Channel the delivery was targeting.</param>
    void RecordDeadLetter(string tenantId, string reason, string channelType);

    /// <summary>Records digest generation.</summary>
    /// <param name="tenantId">Tenant the digest belongs to.</param>
    /// <param name="scheduleId">Digest schedule involved.</param>
    /// <param name="duration">How long generation took.</param>
    /// <param name="incidentCount">Number of incidents in the digest.</param>
    void RecordDigestGeneration(string tenantId, string scheduleId, TimeSpan duration, int incidentCount);

    /// <summary>Records quiet hours/throttle suppression.</summary>
    /// <param name="tenantId">Tenant the notification belongs to.</param>
    /// <param name="type">Kind of suppression applied.</param>
    /// <param name="eventKind">Kind of event that was suppressed.</param>
    void RecordSuppression(string tenantId, SuppressionType type, string eventKind);

    /// <summary>Records template rendering.</summary>
    /// <param name="tenantId">Tenant the template belongs to.</param>
    /// <param name="templateId">Template that was rendered.</param>
    /// <param name="duration">How long rendering took.</param>
    /// <param name="success">Whether rendering succeeded.</param>
    void RecordTemplateRender(string tenantId, string templateId, TimeSpan duration, bool success);

    /// <summary>Records incident lifecycle event.</summary>
    /// <param name="tenantId">Tenant the incident belongs to.</param>
    /// <param name="eventType">Kind of lifecycle event.</param>
    void RecordIncidentEvent(string tenantId, IncidentEventType eventType);

    /// <summary>Updates active escalations gauge.</summary>
    /// <param name="tenantId">Tenant the value applies to.</param>
    /// <param name="count">New gauge value.</param>
    void SetActiveEscalations(string tenantId, int count);

    /// <summary>Updates active storms gauge.</summary>
    /// <param name="tenantId">Tenant the value applies to.</param>
    /// <param name="count">New gauge value.</param>
    void SetActiveStorms(string tenantId, int count);

    /// <summary>Updates pending deliveries gauge.</summary>
    /// <param name="tenantId">Tenant the value applies to.</param>
    /// <param name="count">New gauge value.</param>
    void SetPendingDeliveries(string tenantId, int count);

    /// <summary>Gets current metrics snapshot.</summary>
    /// <param name="tenantId">Optional tenant to get the snapshot for.</param>
    /// <returns>The current metric values.</returns>
    NotifierMetricsSnapshot GetSnapshot(string? tenantId = null);
}
|
||||
|
||||
/// <summary>
/// Kinds of events in an escalation's lifecycle.
/// </summary>
public enum EscalationEventType
{
    Started = 0,
    LevelAdvanced = 1,
    Acknowledged = 2,
    Resolved = 3,
    Exhausted = 4,
    Timeout = 5
}
|
||||
|
||||
/// <summary>
/// Kinds of notification-storm events.
/// </summary>
public enum StormEventType
{
    Detected = 0,
    Suppressed = 1,
    SummarySent = 2,
    Ended = 3
}
|
||||
|
||||
/// <summary>
/// Kinds of notification suppression.
/// </summary>
public enum SuppressionType
{
    QuietHours = 0,
    Throttle = 1,
    Maintenance = 2,
    Override = 3
}
|
||||
|
||||
/// <summary>
/// Kinds of incident lifecycle events.
/// </summary>
public enum IncidentEventType
{
    Created = 0,
    Updated = 1,
    Acknowledged = 2,
    Resolved = 3,
    Reopened = 4
}
|
||||
|
||||
/// <summary>
/// Point-in-time view of the notifier metrics.
/// </summary>
public sealed record NotifierMetricsSnapshot
{
    /// <summary>When the snapshot was taken.</summary>
    public required DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the snapshot covers, or <c>null</c> when not tenant-specific.</summary>
    public string? TenantId { get; init; }

    // ---- Delivery ----

    /// <summary>Total delivery attempts.</summary>
    public long TotalDeliveryAttempts { get; init; }

    /// <summary>Delivery attempts that succeeded.</summary>
    public long SuccessfulDeliveries { get; init; }

    /// <summary>Delivery attempts that failed.</summary>
    public long FailedDeliveries { get; init; }

    /// <summary>Average delivery latency in milliseconds.</summary>
    public double AverageDeliveryLatencyMs { get; init; }

    /// <summary>95th percentile delivery latency in milliseconds.</summary>
    public double P95DeliveryLatencyMs { get; init; }

    /// <summary>99th percentile delivery latency in milliseconds.</summary>
    public double P99DeliveryLatencyMs { get; init; }

    // ---- Escalation ----

    /// <summary>Total escalations.</summary>
    public long TotalEscalations { get; init; }

    /// <summary>Escalations that were acknowledged.</summary>
    public long EscalationsAcknowledged { get; init; }

    /// <summary>Escalations that were exhausted.</summary>
    public long EscalationsExhausted { get; init; }

    /// <summary>Average acknowledgment latency in milliseconds.</summary>
    public double AverageAckLatencyMs { get; init; }

    /// <summary>Currently active escalations.</summary>
    public int ActiveEscalations { get; init; }

    // ---- Storm ----

    /// <summary>Storms detected.</summary>
    public long StormsDetected { get; init; }

    /// <summary>Notifications suppressed.</summary>
    public long NotificationsSuppressed { get; init; }

    /// <summary>Currently active storms.</summary>
    public int ActiveStorms { get; init; }

    // ---- Fallback ----

    /// <summary>Fallback attempts.</summary>
    public long FallbackAttempts { get; init; }

    /// <summary>Fallback attempts that succeeded.</summary>
    public long FallbackSuccesses { get; init; }

    // ---- Dead-letter ----

    /// <summary>Dead-lettered deliveries.</summary>
    public long DeadLetterCount { get; init; }

    // ---- Incident ----

    /// <summary>Incidents created.</summary>
    public long IncidentsCreated { get; init; }

    /// <summary>Incidents resolved.</summary>
    public long IncidentsResolved { get; init; }

    // ---- Suppression ----

    /// <summary>Suppressions due to quiet hours.</summary>
    public long QuietHoursSuppressions { get; init; }

    /// <summary>Suppressions due to throttling.</summary>
    public long ThrottleSuppressions { get; init; }

    // ---- Queue ----

    /// <summary>Deliveries currently pending.</summary>
    public int PendingDeliveries { get; init; }

    // ---- Channel breakdown ----

    /// <summary>Per-channel metrics keyed by string; empty by default.</summary>
    public IReadOnlyDictionary<string, ChannelMetrics> ChannelMetrics { get; init; } = new Dictionary<string, ChannelMetrics>();
}
|
||||
|
||||
/// <summary>
/// Delivery metrics for one channel.
/// </summary>
public sealed record ChannelMetrics
{
    /// <summary>Channel these metrics describe.</summary>
    public required string ChannelType { get; init; }

    /// <summary>Delivery attempts on the channel.</summary>
    public long Attempts { get; init; }

    /// <summary>Attempts that succeeded.</summary>
    public long Successes { get; init; }

    /// <summary>Attempts that failed.</summary>
    public long Failures { get; init; }

    /// <summary>Average latency in milliseconds.</summary>
    public double AverageLatencyMs { get; init; }

    /// <summary>
    /// Share of successful attempts as a percentage; 0 when there are no attempts.
    /// </summary>
    public double SuccessRate
    {
        get
        {
            if (Attempts > 0)
            {
                return (double)Successes / Attempts * 100;
            }

            return 0;
        }
    }
}
|
||||
|
||||
/// <summary>
/// Configuration for the metrics service, bound from <see cref="SectionName"/>.
/// </summary>
public sealed class NotifierMetricsOptions
{
    /// <summary>Configuration section these options are read from.</summary>
    public const string SectionName = "Notifier:Observability:Metrics";

    /// <summary>Whether metrics collection is enabled. Default: <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Histogram bucket boundaries for latency, in milliseconds.</summary>
    public double[] LatencyBuckets { get; set; } = [10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000];

    /// <summary>How long to retain detailed metrics. Default: 24 hours.</summary>
    public TimeSpan RetentionPeriod { get; set; } = TimeSpan.FromHours(24);

    /// <summary>Whether to include tenant-level breakdown. Default: <c>true</c>.</summary>
    public bool IncludeTenantBreakdown { get; set; } = true;

    /// <summary>Maximum number of tenants to track individually. Default: 1000.</summary>
    public int MaxTrackedTenants { get; set; } = 1000;
}
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of notifier metrics using System.Diagnostics.Metrics.
|
||||
/// </summary>
|
||||
public sealed class DefaultNotifierMetrics : INotifierMetrics, IDisposable
|
||||
{
|
||||
private readonly Meter _meter;
|
||||
private readonly NotifierMetricsOptions _options;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
private readonly ILogger<DefaultNotifierMetrics> _logger;
|
||||
|
||||
// Counters
|
||||
private readonly Counter<long> _deliveryAttempts;
|
||||
private readonly Counter<long> _deliverySuccesses;
|
||||
private readonly Counter<long> _deliveryFailures;
|
||||
private readonly Counter<long> _escalationEvents;
|
||||
private readonly Counter<long> _stormEvents;
|
||||
private readonly Counter<long> _fallbackAttempts;
|
||||
private readonly Counter<long> _deadLetters;
|
||||
private readonly Counter<long> _suppressions;
|
||||
private readonly Counter<long> _incidentEvents;
|
||||
private readonly Counter<long> _templateRenders;
|
||||
|
||||
// Histograms
|
||||
private readonly Histogram<double> _deliveryLatency;
|
||||
private readonly Histogram<double> _ackLatency;
|
||||
private readonly Histogram<double> _digestDuration;
|
||||
private readonly Histogram<double> _templateRenderDuration;
|
||||
|
||||
// Gauges (tracked via observable gauges)
|
||||
private readonly ConcurrentDictionary<string, int> _activeEscalations = new();
|
||||
private readonly ConcurrentDictionary<string, int> _activeStorms = new();
|
||||
private readonly ConcurrentDictionary<string, int> _pendingDeliveries = new();
|
||||
|
||||
// In-memory aggregation for snapshots
|
||||
private readonly ConcurrentDictionary<string, TenantMetricsData> _tenantMetrics = new();
|
||||
private readonly object _lock = new();
|
||||
|
||||
/// <summary>
/// Creates the metrics service and registers all instruments on a "StellaOps.Notifier" meter.
/// </summary>
/// <param name="options">Metrics options; defaults are used when <c>null</c>.</param>
/// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
/// <param name="logger">Logger (required).</param>
/// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
public DefaultNotifierMetrics(
    IOptions<NotifierMetricsOptions> options,
    TimeProvider timeProvider,
    ILogger<DefaultNotifierMetrics> logger)
{
    _options = options?.Value ?? new NotifierMetricsOptions();
    _timeProvider = timeProvider ?? TimeProvider.System;
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    _meter = new Meter("StellaOps.Notifier", "1.0.0");

    // Counters
    _deliveryAttempts = _meter.CreateCounter<long>(name: "notifier.delivery.attempts", unit: "attempts", description: "Total number of delivery attempts");
    _deliverySuccesses = _meter.CreateCounter<long>(name: "notifier.delivery.successes", unit: "deliveries", description: "Number of successful deliveries");
    _deliveryFailures = _meter.CreateCounter<long>(name: "notifier.delivery.failures", unit: "deliveries", description: "Number of failed deliveries");
    _escalationEvents = _meter.CreateCounter<long>(name: "notifier.escalation.events", unit: "events", description: "Number of escalation events");
    _stormEvents = _meter.CreateCounter<long>(name: "notifier.storm.events", unit: "events", description: "Number of storm-related events");
    _fallbackAttempts = _meter.CreateCounter<long>(name: "notifier.fallback.attempts", unit: "attempts", description: "Number of fallback attempts");
    _deadLetters = _meter.CreateCounter<long>(name: "notifier.deadletter.count", unit: "messages", description: "Number of dead-lettered messages");
    _suppressions = _meter.CreateCounter<long>(name: "notifier.suppression.count", unit: "suppressions", description: "Number of suppressed notifications");
    _incidentEvents = _meter.CreateCounter<long>(name: "notifier.incident.events", unit: "events", description: "Number of incident lifecycle events");
    _templateRenders = _meter.CreateCounter<long>(name: "notifier.template.renders", unit: "renders", description: "Number of template render operations");

    // Histograms (all in milliseconds)
    _deliveryLatency = _meter.CreateHistogram<double>(name: "notifier.delivery.latency", unit: "ms", description: "Delivery latency in milliseconds");
    _ackLatency = _meter.CreateHistogram<double>(name: "notifier.escalation.ack_latency", unit: "ms", description: "Acknowledgment latency in milliseconds");
    _digestDuration = _meter.CreateHistogram<double>(name: "notifier.digest.duration", unit: "ms", description: "Digest generation duration in milliseconds");
    _templateRenderDuration = _meter.CreateHistogram<double>(name: "notifier.template.render_duration", unit: "ms", description: "Template render duration in milliseconds");

    // Observable gauges: their values are produced from the per-tenant dictionaries by the callbacks.
    _meter.CreateObservableGauge(
        "notifier.escalations.active",
        () => GetObservableGaugeValues(_activeEscalations),
        unit: "escalations",
        description: "Number of active escalations");

    _meter.CreateObservableGauge(
        "notifier.storms.active",
        () => GetObservableGaugeValues(_activeStorms),
        unit: "storms",
        description: "Number of active notification storms");

    _meter.CreateObservableGauge(
        "notifier.deliveries.pending",
        () => GetObservableGaugeValues(_pendingDeliveries),
        unit: "deliveries",
        description: "Number of pending deliveries");
}
|
||||
|
||||
/// <summary>
/// Records one delivery attempt: bumps the attempt counter and the success or failure counter,
/// records the latency, and updates the tenant and per-channel aggregates.
/// Does nothing when metrics are disabled.
/// </summary>
/// <param name="tenantId">Tenant the delivery belongs to.</param>
/// <param name="channelType">Channel used for the delivery.</param>
/// <param name="success">Whether the attempt succeeded.</param>
/// <param name="duration">How long the attempt took; recorded in milliseconds.</param>
public void RecordDeliveryAttempt(string tenantId, string channelType, bool success, TimeSpan duration)
{
    if (!_options.Enabled)
    {
        return;
    }

    var latencyMs = duration.TotalMilliseconds;

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("channel_type", channelType);
    tags.Add("success", success ? "true" : "false");

    _deliveryAttempts.Add(1, tags);
    _deliveryLatency.Record(latencyMs, tags);
    (success ? _deliverySuccesses : _deliveryFailures).Add(1, tags);

    // Update in-memory aggregation
    UpdateTenantMetrics(tenantId, tenant =>
    {
        tenant.TotalDeliveryAttempts++;
        if (success)
        {
            tenant.SuccessfulDeliveries++;
        }
        else
        {
            tenant.FailedDeliveries++;
        }

        tenant.DeliveryLatencies.Add(latencyMs);

        // Create the per-channel bucket the first time the channel is seen.
        if (!tenant.ChannelMetrics.TryGetValue(channelType, out var channel))
        {
            channel = new MutableChannelMetrics { ChannelType = channelType };
            tenant.ChannelMetrics[channelType] = channel;
        }

        channel.Attempts++;
        if (success)
        {
            channel.Successes++;
        }
        else
        {
            channel.Failures++;
        }

        channel.Latencies.Add(latencyMs);
    });
}
|
||||
|
||||
/// <summary>
/// Counts an escalation event and updates the per-tenant escalation totals.
/// </summary>
/// <param name="tenantId">Tenant the escalation belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="policyId">Escalation policy identifier; emitted as the <c>policy_id</c> tag.</param>
/// <param name="level">Escalation level; emitted as the <c>level</c> tag.</param>
/// <param name="eventType">Kind of escalation event; emitted lower-cased as the <c>event_type</c> tag.</param>
public void RecordEscalation(string tenantId, string policyId, int level, EscalationEventType eventType)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("policy_id", policyId);
    tags.Add("level", level.ToString());
    tags.Add("event_type", eventType.ToString().ToLowerInvariant());

    _escalationEvents.Add(1, tags);

    UpdateTenantMetrics(tenantId, data =>
    {
        // Every event counts towards the total; only two event types have their own tally.
        data.TotalEscalations++;

        if (eventType == EscalationEventType.Acknowledged)
        {
            data.EscalationsAcknowledged++;
        }
        else if (eventType == EscalationEventType.Exhausted)
        {
            data.EscalationsExhausted++;
        }
    });
}
|
||||
|
||||
/// <summary>
/// Records an acknowledgement latency on the histogram and in the per-tenant aggregation.
/// </summary>
/// <param name="tenantId">Tenant the acknowledgement belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="policyId">Escalation policy identifier; emitted as the <c>policy_id</c> tag.</param>
/// <param name="latency">Acknowledgement latency; recorded in milliseconds.</param>
public void RecordAckLatency(string tenantId, string policyId, TimeSpan latency)
{
    if (!_options.Enabled)
    {
        return;
    }

    var latencyMs = latency.TotalMilliseconds;

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("policy_id", policyId);

    _ackLatency.Record(latencyMs, tags);

    UpdateTenantMetrics(tenantId, data => data.AckLatencies.Add(latencyMs));
}
|
||||
|
||||
/// <summary>
/// Counts a storm event and updates the per-tenant storm and suppression totals.
/// </summary>
/// <param name="tenantId">Tenant the storm belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="eventKind">Event kind involved in the storm; emitted as the <c>event_kind</c> tag.</param>
/// <param name="eventType">Kind of storm event; emitted lower-cased as the <c>event_type</c> tag.</param>
/// <param name="suppressedCount">Amount added to the tenant's suppressed-notification total.</param>
public void RecordStormEvent(string tenantId, string eventKind, StormEventType eventType, int suppressedCount)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("event_kind", eventKind);
    tags.Add("event_type", eventType.ToString().ToLowerInvariant());

    _stormEvents.Add(1, tags);

    UpdateTenantMetrics(tenantId, data =>
    {
        if (eventType == StormEventType.Detected)
        {
            data.StormsDetected++;
        }

        // The suppressed count is added for every event type, not only detections.
        data.NotificationsSuppressed += suppressedCount;
    });
}
|
||||
|
||||
/// <summary>
/// Counts a channel fallback attempt and updates the per-tenant fallback totals.
/// </summary>
/// <param name="tenantId">Tenant the fallback belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="fromChannel">Channel that was abandoned; emitted as the <c>from_channel</c> tag.</param>
/// <param name="toChannel">Channel that was tried instead; emitted as the <c>to_channel</c> tag.</param>
/// <param name="success">Whether the fallback succeeded; emitted as the <c>success</c> tag.</param>
public void RecordFallback(string tenantId, string fromChannel, string toChannel, bool success)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("from_channel", fromChannel);
    tags.Add("to_channel", toChannel);
    tags.Add("success", success ? "true" : "false");

    _fallbackAttempts.Add(1, tags);

    UpdateTenantMetrics(tenantId, data =>
    {
        data.FallbackAttempts++;
        if (success)
        {
            data.FallbackSuccesses++;
        }
    });
}
|
||||
|
||||
/// <summary>
/// Counts a dead-lettered notification and increments the tenant's dead-letter total.
/// </summary>
/// <param name="tenantId">Tenant the entry belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="reason">Reason for dead-lettering; emitted as the <c>reason</c> tag.</param>
/// <param name="channelType">Channel of the failed delivery; emitted as the <c>channel_type</c> tag.</param>
public void RecordDeadLetter(string tenantId, string reason, string channelType)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("reason", reason);
    tags.Add("channel_type", channelType);

    _deadLetters.Add(1, tags);

    UpdateTenantMetrics(tenantId, data =>
    {
        data.DeadLetterCount++;
    });
}
|
||||
|
||||
/// <summary>
/// Records how long a digest took to generate.
/// </summary>
/// <param name="tenantId">Tenant the digest belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="scheduleId">Digest schedule identifier; emitted as the <c>schedule_id</c> tag.</param>
/// <param name="duration">Generation time; recorded in milliseconds.</param>
/// <param name="incidentCount">Accepted but not recorded by this method.</param>
public void RecordDigestGeneration(string tenantId, string scheduleId, TimeSpan duration, int incidentCount)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("schedule_id", scheduleId);

    _digestDuration.Record(duration.TotalMilliseconds, tags);
}
|
||||
|
||||
/// <summary>
/// Counts a suppressed notification and updates the matching per-tenant suppression tally.
/// </summary>
/// <param name="tenantId">Tenant the suppression belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="type">Suppression type; emitted lower-cased as the <c>suppression_type</c> tag.</param>
/// <param name="eventKind">Kind of the suppressed event; emitted as the <c>event_kind</c> tag.</param>
public void RecordSuppression(string tenantId, SuppressionType type, string eventKind)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("suppression_type", type.ToString().ToLowerInvariant());
    tags.Add("event_kind", eventKind);

    _suppressions.Add(1, tags);

    UpdateTenantMetrics(tenantId, data =>
    {
        // Only quiet-hours and throttle suppressions have a per-tenant tally.
        if (type == SuppressionType.QuietHours)
        {
            data.QuietHoursSuppressions++;
        }
        else if (type == SuppressionType.Throttle)
        {
            data.ThrottleSuppressions++;
        }
    });
}
|
||||
|
||||
/// <summary>
/// Counts a template render and records its duration.
/// </summary>
/// <param name="tenantId">Tenant the render belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="templateId">Template identifier; emitted as the <c>template_id</c> tag.</param>
/// <param name="duration">Render time; recorded in milliseconds.</param>
/// <param name="success">Whether rendering succeeded; emitted as the <c>success</c> tag.</param>
public void RecordTemplateRender(string tenantId, string templateId, TimeSpan duration, bool success)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("template_id", templateId);
    tags.Add("success", success ? "true" : "false");

    _templateRenders.Add(1, tags);
    _templateRenderDuration.Record(duration.TotalMilliseconds, tags);
}
|
||||
|
||||
/// <summary>
/// Counts an incident lifecycle event and updates the per-tenant created/resolved tallies.
/// </summary>
/// <param name="tenantId">Tenant the incident belongs to; emitted as the <c>tenant_id</c> tag.</param>
/// <param name="eventType">Kind of incident event; emitted lower-cased as the <c>event_type</c> tag.</param>
public void RecordIncidentEvent(string tenantId, IncidentEventType eventType)
{
    if (!_options.Enabled)
    {
        return;
    }

    var tags = new TagList();
    tags.Add("tenant_id", tenantId);
    tags.Add("event_type", eventType.ToString().ToLowerInvariant());

    _incidentEvents.Add(1, tags);

    UpdateTenantMetrics(tenantId, data =>
    {
        // Only creation and resolution have a per-tenant tally.
        if (eventType == IncidentEventType.Created)
        {
            data.IncidentsCreated++;
        }
        else if (eventType == IncidentEventType.Resolved)
        {
            data.IncidentsResolved++;
        }
    });
}
|
||||
|
||||
/// <summary>
/// Stores the current number of active escalations for a tenant.
/// The value is read by the <c>notifier.escalations.active</c> gauge and by snapshots.
/// </summary>
/// <param name="tenantId">Tenant whose value is replaced.</param>
/// <param name="count">New number of active escalations.</param>
public void SetActiveEscalations(string tenantId, int count)
    => _activeEscalations[tenantId] = count;
|
||||
|
||||
/// <summary>
/// Stores the current number of active notification storms for a tenant.
/// The value is read by the <c>notifier.storms.active</c> gauge and by snapshots.
/// </summary>
/// <param name="tenantId">Tenant whose value is replaced.</param>
/// <param name="count">New number of active storms.</param>
public void SetActiveStorms(string tenantId, int count)
    => _activeStorms[tenantId] = count;
|
||||
|
||||
/// <summary>
/// Stores the current number of pending deliveries for a tenant.
/// The value is read by the <c>notifier.deliveries.pending</c> gauge and by snapshots.
/// </summary>
/// <param name="tenantId">Tenant whose value is replaced.</param>
/// <param name="count">New number of pending deliveries.</param>
public void SetPendingDeliveries(string tenantId, int count)
    => _pendingDeliveries[tenantId] = count;
|
||||
|
||||
/// <summary>
/// Builds a point-in-time snapshot of the in-memory metrics.
/// </summary>
/// <param name="tenantId">
/// Tenant to report on, or <c>null</c> to aggregate every tenant that has recorded data.
/// </param>
/// <returns>
/// The tenant's snapshot when <paramref name="tenantId"/> is given; otherwise a snapshot whose
/// counters are summed across tenants and whose latency statistics are computed over the
/// combined samples of all tenants.
/// </returns>
public NotifierMetricsSnapshot GetSnapshot(string? tenantId = null)
{
    if (tenantId is not null)
    {
        return GetTenantSnapshot(tenantId);
    }

    // Accumulate into a private instance. Each tenant is read under the same lock that
    // UpdateTenantMetrics holds while mutating it, so the lists and the channel dictionary
    // are never enumerated while a writer is changing them.
    var total = new TenantMetricsData();

    foreach (var tenant in _tenantMetrics.Values)
    {
        lock (tenant)
        {
            total.TotalDeliveryAttempts += tenant.TotalDeliveryAttempts;
            total.SuccessfulDeliveries += tenant.SuccessfulDeliveries;
            total.FailedDeliveries += tenant.FailedDeliveries;
            total.DeliveryLatencies.AddRange(tenant.DeliveryLatencies);

            total.TotalEscalations += tenant.TotalEscalations;
            total.EscalationsAcknowledged += tenant.EscalationsAcknowledged;
            total.EscalationsExhausted += tenant.EscalationsExhausted;
            total.AckLatencies.AddRange(tenant.AckLatencies);

            total.StormsDetected += tenant.StormsDetected;
            total.NotificationsSuppressed += tenant.NotificationsSuppressed;

            total.FallbackAttempts += tenant.FallbackAttempts;
            total.FallbackSuccesses += tenant.FallbackSuccesses;

            total.DeadLetterCount += tenant.DeadLetterCount;

            total.IncidentsCreated += tenant.IncidentsCreated;
            total.IncidentsResolved += tenant.IncidentsResolved;

            total.QuietHoursSuppressions += tenant.QuietHoursSuppressions;
            total.ThrottleSuppressions += tenant.ThrottleSuppressions;

            foreach (var (channel, source) in tenant.ChannelMetrics)
            {
                if (!total.ChannelMetrics.TryGetValue(channel, out var merged))
                {
                    merged = new MutableChannelMetrics { ChannelType = channel };
                    total.ChannelMetrics[channel] = merged;
                }

                merged.Attempts += source.Attempts;
                merged.Successes += source.Successes;
                merged.Failures += source.Failures;

                // Keep every sample so the channel average covers all tenants rather than
                // only the tenant that happened to be merged last.
                merged.Latencies.AddRange(source.Latencies);
            }
        }
    }

    var channelMetrics = total.ChannelMetrics.ToDictionary(
        kvp => kvp.Key,
        kvp => new ChannelMetrics
        {
            ChannelType = kvp.Key,
            Attempts = kvp.Value.Attempts,
            Successes = kvp.Value.Successes,
            Failures = kvp.Value.Failures,
            AverageLatencyMs = kvp.Value.Latencies.Count > 0 ? kvp.Value.Latencies.Average() : 0
        });

    return new NotifierMetricsSnapshot
    {
        Timestamp = _timeProvider.GetUtcNow(),
        TotalDeliveryAttempts = total.TotalDeliveryAttempts,
        SuccessfulDeliveries = total.SuccessfulDeliveries,
        FailedDeliveries = total.FailedDeliveries,
        AverageDeliveryLatencyMs = total.DeliveryLatencies.Count > 0 ? total.DeliveryLatencies.Average() : 0,
        P95DeliveryLatencyMs = CalculatePercentile(total.DeliveryLatencies, 95),
        P99DeliveryLatencyMs = CalculatePercentile(total.DeliveryLatencies, 99),
        TotalEscalations = total.TotalEscalations,
        EscalationsAcknowledged = total.EscalationsAcknowledged,
        EscalationsExhausted = total.EscalationsExhausted,
        AverageAckLatencyMs = total.AckLatencies.Count > 0 ? total.AckLatencies.Average() : 0,
        ActiveEscalations = _activeEscalations.Values.Sum(),
        StormsDetected = total.StormsDetected,
        NotificationsSuppressed = total.NotificationsSuppressed,
        ActiveStorms = _activeStorms.Values.Sum(),
        FallbackAttempts = total.FallbackAttempts,
        FallbackSuccesses = total.FallbackSuccesses,
        DeadLetterCount = total.DeadLetterCount,
        IncidentsCreated = total.IncidentsCreated,
        IncidentsResolved = total.IncidentsResolved,
        QuietHoursSuppressions = total.QuietHoursSuppressions,
        ThrottleSuppressions = total.ThrottleSuppressions,
        PendingDeliveries = _pendingDeliveries.Values.Sum(),
        ChannelMetrics = channelMetrics
    };
}
|
||||
|
||||
/// <summary>
/// Builds a snapshot for a single tenant.
/// </summary>
/// <param name="tenantId">Tenant to report on.</param>
/// <returns>
/// The tenant's counters and latency statistics, or a snapshot carrying only the timestamp
/// and tenant id when nothing has been recorded for the tenant.
/// </returns>
private NotifierMetricsSnapshot GetTenantSnapshot(string tenantId)
{
    if (!_tenantMetrics.TryGetValue(tenantId, out var metrics))
    {
        return new NotifierMetricsSnapshot
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId
        };
    }

    // UpdateTenantMetrics mutates this instance under the same lock. Reading under it keeps
    // the counters mutually consistent and prevents enumerating a list or dictionary while a
    // writer is adding to it.
    lock (metrics)
    {
        var channelMetrics = metrics.ChannelMetrics.ToDictionary(
            kvp => kvp.Key,
            kvp => new ChannelMetrics
            {
                ChannelType = kvp.Key,
                Attempts = kvp.Value.Attempts,
                Successes = kvp.Value.Successes,
                Failures = kvp.Value.Failures,
                AverageLatencyMs = kvp.Value.Latencies.Count > 0 ? kvp.Value.Latencies.Average() : 0
            });

        return new NotifierMetricsSnapshot
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalDeliveryAttempts = metrics.TotalDeliveryAttempts,
            SuccessfulDeliveries = metrics.SuccessfulDeliveries,
            FailedDeliveries = metrics.FailedDeliveries,
            AverageDeliveryLatencyMs = metrics.DeliveryLatencies.Count > 0 ? metrics.DeliveryLatencies.Average() : 0,
            P95DeliveryLatencyMs = CalculatePercentile(metrics.DeliveryLatencies, 95),
            P99DeliveryLatencyMs = CalculatePercentile(metrics.DeliveryLatencies, 99),
            TotalEscalations = metrics.TotalEscalations,
            EscalationsAcknowledged = metrics.EscalationsAcknowledged,
            EscalationsExhausted = metrics.EscalationsExhausted,
            AverageAckLatencyMs = metrics.AckLatencies.Count > 0 ? metrics.AckLatencies.Average() : 0,
            ActiveEscalations = _activeEscalations.GetValueOrDefault(tenantId, 0),
            StormsDetected = metrics.StormsDetected,
            NotificationsSuppressed = metrics.NotificationsSuppressed,
            ActiveStorms = _activeStorms.GetValueOrDefault(tenantId, 0),
            FallbackAttempts = metrics.FallbackAttempts,
            FallbackSuccesses = metrics.FallbackSuccesses,
            DeadLetterCount = metrics.DeadLetterCount,
            IncidentsCreated = metrics.IncidentsCreated,
            IncidentsResolved = metrics.IncidentsResolved,
            QuietHoursSuppressions = metrics.QuietHoursSuppressions,
            ThrottleSuppressions = metrics.ThrottleSuppressions,
            PendingDeliveries = _pendingDeliveries.GetValueOrDefault(tenantId, 0),
            ChannelMetrics = channelMetrics
        };
    }
}
|
||||
|
||||
/// <summary>
/// Applies <paramref name="update"/> to the tenant's aggregation entry, creating the entry on
/// first use. Does nothing when tenant breakdown is switched off in the options.
/// </summary>
/// <param name="tenantId">Tenant whose entry is updated.</param>
/// <param name="update">Mutation to run while the entry is locked.</param>
private void UpdateTenantMetrics(string tenantId, Action<TenantMetricsData> update)
{
    if (_options.IncludeTenantBreakdown)
    {
        var entry = _tenantMetrics.GetOrAdd(tenantId, static _ => new TenantMetricsData());

        // The entry itself is the monitor, so updates for one tenant run one at a time.
        lock (entry)
        {
            update(entry);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Projects a per-tenant value map into gauge measurements, one per entry, each tagged with
/// its <c>tenant_id</c>. The sequence is evaluated lazily on enumeration.
/// </summary>
/// <param name="values">Current value per tenant.</param>
/// <returns>One measurement per dictionary entry.</returns>
private static IEnumerable<Measurement<int>> GetObservableGaugeValues(ConcurrentDictionary<string, int> values)
{
    return values.Select(static entry =>
        new Measurement<int>(entry.Value, new TagList { { "tenant_id", entry.Key } }));
}
|
||||
|
||||
/// <summary>
/// Returns the nearest-rank percentile of <paramref name="values"/>.
/// </summary>
/// <param name="values">Samples in any order; the list is not modified.</param>
/// <param name="percentile">Percentile to compute, e.g. 95 for p95.</param>
/// <returns>
/// The sample at rank <c>ceil(percentile / 100 * count)</c> in ascending order, with the rank
/// clamped to the valid range; <c>0</c> when there are no samples.
/// </returns>
private static double CalculatePercentile(List<double> values, int percentile)
{
    if (values.Count == 0)
    {
        return 0;
    }

    // Sort a copy so the caller's list keeps its order.
    var ordered = values.ToArray();
    Array.Sort(ordered);

    var rank = (int)Math.Ceiling(percentile / 100.0 * ordered.Length) - 1;
    return ordered[Math.Clamp(rank, 0, ordered.Length - 1)];
}
|
||||
|
||||
/// <summary>
/// Disposes the meter owned by this instance.
/// </summary>
public void Dispose() => _meter.Dispose();
|
||||
|
||||
/// <summary>
/// Mutable per-tenant accumulator. Writers mutate it inside <c>UpdateTenantMetrics</c>,
/// which locks on the instance.
/// </summary>
private sealed class TenantMetricsData
{
    // Deliveries
    public long TotalDeliveryAttempts;
    public long SuccessfulDeliveries;
    public long FailedDeliveries;
    public List<double> DeliveryLatencies = new(); // milliseconds, one entry per attempt

    // Escalations
    public long TotalEscalations;
    public long EscalationsAcknowledged;
    public long EscalationsExhausted;
    public List<double> AckLatencies = new(); // milliseconds

    // Storms
    public long StormsDetected;
    public long NotificationsSuppressed;

    // Fallbacks
    public long FallbackAttempts;
    public long FallbackSuccesses;

    // Dead letters
    public long DeadLetterCount;

    // Incidents
    public long IncidentsCreated;
    public long IncidentsResolved;

    // Suppressions
    public long QuietHoursSuppressions;
    public long ThrottleSuppressions;

    // Per-channel breakdown, keyed by channel type.
    public Dictionary<string, MutableChannelMetrics> ChannelMetrics = new();
}
|
||||
|
||||
/// <summary>
/// Mutable per-channel accumulator held inside a tenant's metrics entry.
/// </summary>
private sealed class MutableChannelMetrics
{
    /// <summary>Channel type this accumulator belongs to.</summary>
    public required string ChannelType { get; init; }

    public long Attempts;
    public long Successes;
    public long Failures;

    // Latency of each recorded attempt, in milliseconds.
    public List<double> Latencies = new();
}
|
||||
}
|
||||
@@ -0,0 +1,395 @@
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Tracing service for the Notifier module, built on the <see cref="Activity"/> API.
/// Every <c>Start*</c> method may return <c>null</c> when no span was started; the remaining
/// methods accept a <c>null</c> activity.
/// </summary>
public interface INotifierTracing
{
    /// <summary>Starts a span for a notification delivery.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Channel type used for the delivery.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType);

    /// <summary>Starts a span for an escalation.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident being escalated.</param>
    /// <param name="policyId">Identifier of the escalation policy.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId);

    /// <summary>Starts a span for digest generation.</summary>
    /// <param name="tenantId">Tenant the digest belongs to.</param>
    /// <param name="scheduleId">Identifier of the digest schedule.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartDigestSpan(string tenantId, string scheduleId);

    /// <summary>Starts a span for a template render.</summary>
    /// <param name="tenantId">Tenant the render belongs to.</param>
    /// <param name="templateId">Identifier of the template.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartTemplateRenderSpan(string tenantId, string templateId);

    /// <summary>Starts a span for event correlation.</summary>
    /// <param name="tenantId">Tenant the event belongs to.</param>
    /// <param name="eventKind">Kind of the event being correlated.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartCorrelationSpan(string tenantId, string eventKind);

    /// <summary>Starts a span for webhook validation.</summary>
    /// <param name="tenantId">Tenant the channel belongs to.</param>
    /// <param name="channelId">Identifier of the channel being validated.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartWebhookValidationSpan(string tenantId, string channelId);

    /// <summary>Adds a named event to a span.</summary>
    /// <param name="activity">Span to add the event to.</param>
    /// <param name="name">Event name.</param>
    /// <param name="attributes">Optional event attributes.</param>
    void AddEvent(
        Activity? activity,
        string name,
        IDictionary<string, object?>? attributes = null);

    /// <summary>Marks a span as failed.</summary>
    /// <param name="activity">Span to mark.</param>
    /// <param name="exception">Optional exception that caused the failure.</param>
    /// <param name="description">Optional status description.</param>
    void SetError(
        Activity? activity,
        Exception? exception = null,
        string? description = null);

    /// <summary>Marks a span as successful.</summary>
    /// <param name="activity">Span to mark.</param>
    void SetOk(Activity? activity);

    /// <summary>Adds custom tags to a span.</summary>
    /// <param name="activity">Span to tag.</param>
    /// <param name="tags">Tags to add.</param>
    void AddTags(Activity? activity, IDictionary<string, object?> tags);

    /// <summary>Starts a span linked to another context (for batch operations).</summary>
    /// <param name="operationName">Name of the new span.</param>
    /// <param name="parentContext">Context the new span is linked to.</param>
    /// <param name="tags">Optional tags for the new span.</param>
    /// <returns>The started activity, or <c>null</c> when no span was started.</returns>
    Activity? StartLinkedSpan(
        string operationName,
        ActivityContext parentContext,
        IDictionary<string, object?>? tags = null);
}
|
||||
|
||||
/// <summary>
/// Configuration for the notifier tracing service.
/// </summary>
public sealed class NotifierTracingOptions
{
    /// <summary>Configuration section name for these options.</summary>
    public const string SectionName = "Notifier:Observability:Tracing";

    /// <summary>Whether tracing is enabled. Defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Name of the activity source. Defaults to <c>StellaOps.Notifier</c>.</summary>
    public string SourceName { get; set; } = "StellaOps.Notifier";

    /// <summary>Whether sensitive data may be included in traces. Defaults to <c>false</c>.</summary>
    public bool IncludeSensitiveData { get; set; }

    /// <summary>Sampling ratio, from 0.0 to 1.0. Defaults to <c>1.0</c>.</summary>
    public double SamplingRatio { get; set; } = 1.0;

    /// <summary>Maximum number of attributes per span. Defaults to <c>128</c>.</summary>
    public int MaxAttributesPerSpan { get; set; } = 128;

    /// <summary>Maximum number of events per span. Defaults to <c>128</c>.</summary>
    public int MaxEventsPerSpan { get; set; } = 128;
}
|
||||
|
||||
/// <summary>
/// Default <see cref="INotifierTracing"/> implementation backed by an
/// <see cref="ActivitySource"/> named after <see cref="NotifierTracingOptions.SourceName"/>.
/// </summary>
public sealed class DefaultNotifierTracing : INotifierTracing, IDisposable
{
    private readonly ActivitySource _activitySource;
    private readonly NotifierTracingOptions _options;
    private readonly ILogger<DefaultNotifierTracing> _logger;

    /// <summary>
    /// Creates the tracing service and its activity source.
    /// </summary>
    /// <param name="options">Tracing options; defaults are used when <c>null</c>.</param>
    /// <param name="logger">Logger for this service.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public DefaultNotifierTracing(
        IOptions<NotifierTracingOptions> options,
        ILogger<DefaultNotifierTracing> logger)
    {
        _options = options?.Value ?? new NotifierTracingOptions();

        if (logger is null)
        {
            throw new ArgumentNullException(nameof(logger));
        }

        _logger = logger;
        _activitySource = new ActivitySource(_options.SourceName, "1.0.0");
    }

    /// <summary>Starts the <c>notifier.delivery</c> span.</summary>
    public Activity? StartDeliverySpan(string tenantId, string deliveryId, string channelType)
        => StartSpan(
            "notifier.delivery",
            ("tenant.id", tenantId),
            ("delivery.id", deliveryId),
            ("channel.type", channelType));

    /// <summary>Starts the <c>notifier.escalation</c> span.</summary>
    public Activity? StartEscalationSpan(string tenantId, string incidentId, string policyId)
        => StartSpan(
            "notifier.escalation",
            ("tenant.id", tenantId),
            ("incident.id", incidentId),
            ("policy.id", policyId));

    /// <summary>Starts the <c>notifier.digest</c> span.</summary>
    public Activity? StartDigestSpan(string tenantId, string scheduleId)
        => StartSpan(
            "notifier.digest",
            ("tenant.id", tenantId),
            ("schedule.id", scheduleId));

    /// <summary>Starts the <c>notifier.template.render</c> span.</summary>
    public Activity? StartTemplateRenderSpan(string tenantId, string templateId)
        => StartSpan(
            "notifier.template.render",
            ("tenant.id", tenantId),
            ("template.id", templateId));

    /// <summary>Starts the <c>notifier.correlation</c> span.</summary>
    public Activity? StartCorrelationSpan(string tenantId, string eventKind)
        => StartSpan(
            "notifier.correlation",
            ("tenant.id", tenantId),
            ("event.kind", eventKind));

    /// <summary>Starts the <c>notifier.webhook.validation</c> span.</summary>
    public Activity? StartWebhookValidationSpan(string tenantId, string channelId)
        => StartSpan(
            "notifier.webhook.validation",
            ("tenant.id", tenantId),
            ("channel.id", channelId));

    /// <summary>
    /// Adds an event to <paramref name="activity"/>; attributes with a <c>null</c> value are
    /// left out. Does nothing when the activity is <c>null</c>.
    /// </summary>
    public void AddEvent(Activity? activity, string name, IDictionary<string, object?>? attributes = null)
    {
        if (activity is null)
        {
            return;
        }

        var eventTags = new ActivityTagsCollection();
        foreach (var attribute in attributes ?? new Dictionary<string, object?>())
        {
            if (attribute.Value is null)
            {
                continue;
            }

            eventTags.Add(attribute.Key, attribute.Value);
        }

        activity.AddEvent(new ActivityEvent(name, tags: eventTags));
    }

    /// <summary>
    /// Sets the error status on <paramref name="activity"/>, using
    /// <paramref name="description"/> or else the exception message as the status text, and
    /// tags the exception type and message. The stack trace is tagged only when
    /// <see cref="NotifierTracingOptions.IncludeSensitiveData"/> is set.
    /// </summary>
    public void SetError(Activity? activity, Exception? exception = null, string? description = null)
    {
        if (activity is null)
        {
            return;
        }

        activity.SetStatus(ActivityStatusCode.Error, description ?? exception?.Message);

        if (exception is null)
        {
            return;
        }

        activity.SetTag("exception.type", exception.GetType().FullName);
        activity.SetTag("exception.message", exception.Message);

        if (_options.IncludeSensitiveData)
        {
            activity.SetTag("exception.stacktrace", exception.StackTrace);
        }
    }

    /// <summary>Sets the OK status on <paramref name="activity"/> when it is not <c>null</c>.</summary>
    public void SetOk(Activity? activity)
    {
        if (activity is not null)
        {
            activity.SetStatus(ActivityStatusCode.Ok);
        }
    }

    /// <summary>
    /// Copies every tag with a non-null value onto <paramref name="activity"/>.
    /// Does nothing when the activity is <c>null</c>.
    /// </summary>
    public void AddTags(Activity? activity, IDictionary<string, object?> tags)
    {
        if (activity is null)
        {
            return;
        }

        foreach (var tag in tags)
        {
            if (tag.Value is null)
            {
                continue;
            }

            activity.SetTag(tag.Key, tag.Value);
        }
    }

    /// <summary>
    /// Starts a span named <paramref name="operationName"/> that carries a link to
    /// <paramref name="parentContext"/>. The context is passed as a link, not as the parent.
    /// </summary>
    public Activity? StartLinkedSpan(string operationName, ActivityContext parentContext, IDictionary<string, object?>? tags = null)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        var linked = _activitySource.StartActivity(
            operationName,
            ActivityKind.Internal,
            parentContext: default,
            links: new[] { new ActivityLink(parentContext) });

        if (linked is null || tags is null)
        {
            return linked;
        }

        AddTags(linked, tags);
        return linked;
    }

    /// <summary>Disposes the underlying activity source.</summary>
    public void Dispose() => _activitySource.Dispose();

    // Shared start-up path for the fixed-name spans: honours the Enabled switch, starts an
    // internal activity and applies the given tags in order.
    private Activity? StartSpan(string spanName, params (string Key, string Value)[] spanTags)
    {
        if (!_options.Enabled)
        {
            return null;
        }

        var activity = _activitySource.StartActivity(spanName, ActivityKind.Internal);
        if (activity is null)
        {
            return null;
        }

        foreach (var (key, value) in spanTags)
        {
            activity.SetTag(key, value);
        }

        return activity;
    }
}
|
||||
|
||||
/// <summary>
/// Extension methods that record notifier-specific details on an <see cref="Activity"/>.
/// Every method is a no-op when the activity is <c>null</c>.
/// </summary>
public static class ActivityExtensions
{
    /// <summary>
    /// Records a delivery result on the activity.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="success">Written to the <c>delivery.success</c> tag.</param>
    /// <param name="httpStatusCode">Written to <c>http.status_code</c> when provided.</param>
    /// <param name="error">Written to <c>delivery.error</c> when non-empty.</param>
    public static void RecordDeliveryResult(this Activity? activity, bool success, int? httpStatusCode = null, string? error = null)
    {
        if (activity is null) return;

        activity.SetTag("delivery.success", success);

        if (httpStatusCode.HasValue)
        {
            activity.SetTag("http.status_code", httpStatusCode.Value);
        }

        if (!string.IsNullOrEmpty(error))
        {
            activity.SetTag("delivery.error", error);
        }
    }

    /// <summary>
    /// Records an escalation level change as span tags and as an
    /// <c>escalation.level.changed</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="level">Written to <c>escalation.level</c> and to the event's <c>level</c> tag.</param>
    /// <param name="target">
    /// Written to <c>escalation.target</c> and to the event's <c>target</c> tag when non-empty;
    /// omitted from both otherwise.
    /// </param>
    public static void RecordEscalationLevel(this Activity? activity, int level, string? target = null)
    {
        if (activity is null) return;

        var hasTarget = !string.IsNullOrEmpty(target);

        activity.SetTag("escalation.level", level);
        if (hasTarget)
        {
            activity.SetTag("escalation.target", target);
        }

        // The event applies the same guard as the span tag, so it never carries a null or
        // empty "target" attribute.
        var eventTags = new ActivityTagsCollection { { "level", level } };
        if (hasTarget)
        {
            eventTags.Add("target", target);
        }

        activity.AddEvent(new ActivityEvent("escalation.level.changed", tags: eventTags));
    }

    /// <summary>
    /// Records storm detection as a <c>storm.detected</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="eventKind">Written to the event's <c>event_kind</c> tag.</param>
    /// <param name="eventCount">Written to the event's <c>event_count</c> tag.</param>
    public static void RecordStormDetected(this Activity? activity, string eventKind, int eventCount)
    {
        if (activity is null) return;

        activity.AddEvent(new ActivityEvent("storm.detected", tags: new ActivityTagsCollection
        {
            { "event_kind", eventKind },
            { "event_count", eventCount }
        }));
    }

    /// <summary>
    /// Records a fallback attempt as a <c>fallback.attempted</c> event.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="fromChannel">Written to the event's <c>from_channel</c> tag.</param>
    /// <param name="toChannel">Written to the event's <c>to_channel</c> tag.</param>
    public static void RecordFallback(this Activity? activity, string fromChannel, string toChannel)
    {
        if (activity is null) return;

        activity.AddEvent(new ActivityEvent("fallback.attempted", tags: new ActivityTagsCollection
        {
            { "from_channel", fromChannel },
            { "to_channel", toChannel }
        }));
    }

    /// <summary>
    /// Records template render details as span tags.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="format">Written to the <c>template.format</c> tag.</param>
    /// <param name="outputLength">Written to the <c>template.output_length</c> tag.</param>
    public static void RecordTemplateRender(this Activity? activity, string format, int outputLength)
    {
        if (activity is null) return;

        activity.SetTag("template.format", format);
        activity.SetTag("template.output_length", outputLength);
    }

    /// <summary>
    /// Records a correlation result as span tags.
    /// </summary>
    /// <param name="activity">Span to annotate.</param>
    /// <param name="correlationKey">Written to the <c>correlation.key</c> tag.</param>
    /// <param name="isNewIncident">Written to the <c>correlation.new_incident</c> tag.</param>
    public static void RecordCorrelationResult(this Activity? activity, string correlationKey, bool isNewIncident)
    {
        if (activity is null) return;

        activity.SetTag("correlation.key", correlationKey);
        activity.SetTag("correlation.new_incident", isNewIncident);
    }
}
|
||||
@@ -1,98 +1,98 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for recording notification metrics and starting tracing activities.
/// </summary>
public interface INotifyMetrics
{
    /// <summary>Records a notification delivery attempt.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="channelType">Channel type used for the delivery.</param>
    /// <param name="status">Status of the attempt.</param>
    /// <param name="duration">Time the attempt took.</param>
    void RecordDeliveryAttempt(
        string tenantId,
        string channelType,
        string status,
        TimeSpan duration);

    /// <summary>Records an escalation event.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="level">Escalation level.</param>
    /// <param name="outcome">Outcome of the escalation.</param>
    void RecordEscalation(string tenantId, int level, string outcome);

    /// <summary>Records a dead-letter entry.</summary>
    /// <param name="tenantId">Tenant the entry belongs to.</param>
    /// <param name="reason">Reason the notification was dead-lettered.</param>
    /// <param name="channelType">Channel type of the failed delivery.</param>
    void RecordDeadLetter(string tenantId, string reason, string channelType);

    /// <summary>Records a rule evaluation.</summary>
    /// <param name="tenantId">Tenant the rule belongs to.</param>
    /// <param name="ruleId">Identifier of the evaluated rule.</param>
    /// <param name="matched">Whether the rule matched.</param>
    /// <param name="duration">Time the evaluation took.</param>
    void RecordRuleEvaluation(
        string tenantId,
        string ruleId,
        bool matched,
        TimeSpan duration);

    /// <summary>Records a template render.</summary>
    /// <param name="tenantId">Tenant the template belongs to.</param>
    /// <param name="templateKey">Key of the rendered template.</param>
    /// <param name="success">Whether rendering succeeded.</param>
    /// <param name="duration">Time the render took.</param>
    void RecordTemplateRender(
        string tenantId,
        string templateKey,
        bool success,
        TimeSpan duration);

    /// <summary>Records a storm detection event.</summary>
    /// <param name="tenantId">Tenant the storm belongs to.</param>
    /// <param name="stormKey">Key identifying the storm.</param>
    /// <param name="decision">Decision taken for the storm.</param>
    void RecordStormEvent(string tenantId, string stormKey, string decision);

    /// <summary>Records a retention cleanup.</summary>
    /// <param name="tenantId">Tenant whose data was cleaned up.</param>
    /// <param name="entityType">Type of entity that was removed.</param>
    /// <param name="deletedCount">Number of entities deleted.</param>
    void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);

    /// <summary>Records the current queue depth for a channel.</summary>
    /// <param name="tenantId">Tenant the queue belongs to.</param>
    /// <param name="channelType">Channel type of the queue.</param>
    /// <param name="depth">Current queue depth.</param>
    void RecordQueueDepth(string tenantId, string channelType, int depth);

    /// <summary>Creates an activity for tracing a delivery.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Channel type used for the delivery.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);

    /// <summary>Creates an activity for tracing an escalation.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident being escalated.</param>
    /// <param name="level">Escalation level.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
|
||||
|
||||
/// <summary>
/// Tag (dimension) names shared by the notification metrics.
/// </summary>
public static class NotifyMetricTags
{
    // Common dimensions
    public const string TenantId = "tenant_id";
    public const string ChannelType = "channel_type";

    // Delivery and escalation
    public const string Status = "status";
    public const string Outcome = "outcome";
    public const string Level = "level";

    // Dead letters
    public const string Reason = "reason";

    // Rules and templates
    public const string RuleId = "rule_id";
    public const string Matched = "matched";
    public const string TemplateKey = "template_key";
    public const string Success = "success";

    // Storms
    public const string StormKey = "storm_key";
    public const string Decision = "decision";

    // Retention
    public const string EntityType = "entity_type";
}
|
||||
|
||||
/// <summary>
/// Instrument names used by the notification metrics.
/// </summary>
public static class NotifyMetricNames
{
    // Delivery
    public const string DeliveryAttempts = "notify.delivery.attempts";
    public const string DeliveryDuration = "notify.delivery.duration";

    // Escalation and dead letters
    public const string EscalationEvents = "notify.escalation.events";
    public const string DeadLetterEntries = "notify.deadletter.entries";

    // Rules
    public const string RuleEvaluations = "notify.rule.evaluations";
    public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";

    // Templates
    public const string TemplateRenders = "notify.template.renders";
    public const string TemplateRenderDuration = "notify.template.render.duration";

    // Storms, retention and queues
    public const string StormEvents = "notify.storm.events";
    public const string RetentionCleanups = "notify.retention.cleanups";
    public const string QueueDepth = "notify.queue.depth";
}
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for recording notification metrics and starting tracing activities.
/// </summary>
public interface INotifyMetrics
{
    /// <summary>Records a notification delivery attempt.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="channelType">Channel type used for the delivery.</param>
    /// <param name="status">Status of the attempt.</param>
    /// <param name="duration">Time the attempt took.</param>
    void RecordDeliveryAttempt(
        string tenantId,
        string channelType,
        string status,
        TimeSpan duration);

    /// <summary>Records an escalation event.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="level">Escalation level.</param>
    /// <param name="outcome">Outcome of the escalation.</param>
    void RecordEscalation(string tenantId, int level, string outcome);

    /// <summary>Records a dead-letter entry.</summary>
    /// <param name="tenantId">Tenant the entry belongs to.</param>
    /// <param name="reason">Reason the notification was dead-lettered.</param>
    /// <param name="channelType">Channel type of the failed delivery.</param>
    void RecordDeadLetter(string tenantId, string reason, string channelType);

    /// <summary>Records a rule evaluation.</summary>
    /// <param name="tenantId">Tenant the rule belongs to.</param>
    /// <param name="ruleId">Identifier of the evaluated rule.</param>
    /// <param name="matched">Whether the rule matched.</param>
    /// <param name="duration">Time the evaluation took.</param>
    void RecordRuleEvaluation(
        string tenantId,
        string ruleId,
        bool matched,
        TimeSpan duration);

    /// <summary>Records a template render.</summary>
    /// <param name="tenantId">Tenant the template belongs to.</param>
    /// <param name="templateKey">Key of the rendered template.</param>
    /// <param name="success">Whether rendering succeeded.</param>
    /// <param name="duration">Time the render took.</param>
    void RecordTemplateRender(
        string tenantId,
        string templateKey,
        bool success,
        TimeSpan duration);

    /// <summary>Records a storm detection event.</summary>
    /// <param name="tenantId">Tenant the storm belongs to.</param>
    /// <param name="stormKey">Key identifying the storm.</param>
    /// <param name="decision">Decision taken for the storm.</param>
    void RecordStormEvent(string tenantId, string stormKey, string decision);

    /// <summary>Records a retention cleanup.</summary>
    /// <param name="tenantId">Tenant whose data was cleaned up.</param>
    /// <param name="entityType">Type of entity that was removed.</param>
    /// <param name="deletedCount">Number of entities deleted.</param>
    void RecordRetentionCleanup(string tenantId, string entityType, int deletedCount);

    /// <summary>Records the current queue depth for a channel.</summary>
    /// <param name="tenantId">Tenant the queue belongs to.</param>
    /// <param name="channelType">Channel type of the queue.</param>
    /// <param name="depth">Current queue depth.</param>
    void RecordQueueDepth(string tenantId, string channelType, int depth);

    /// <summary>Creates an activity for tracing a delivery.</summary>
    /// <param name="tenantId">Tenant the delivery belongs to.</param>
    /// <param name="deliveryId">Identifier of the delivery.</param>
    /// <param name="channelType">Channel type used for the delivery.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartDeliveryActivity(string tenantId, string deliveryId, string channelType);

    /// <summary>Creates an activity for tracing an escalation.</summary>
    /// <param name="tenantId">Tenant the escalation belongs to.</param>
    /// <param name="incidentId">Identifier of the incident being escalated.</param>
    /// <param name="level">Escalation level.</param>
    /// <returns>The activity, or <c>null</c> when none was created.</returns>
    Activity? StartEscalationActivity(string tenantId, string incidentId, int level);
}
|
||||
|
||||
/// <summary>
/// Tag (dimension) names shared by the notification metrics.
/// </summary>
public static class NotifyMetricTags
{
    // Common dimensions
    public const string TenantId = "tenant_id";
    public const string ChannelType = "channel_type";

    // Delivery and escalation
    public const string Status = "status";
    public const string Outcome = "outcome";
    public const string Level = "level";

    // Dead letters
    public const string Reason = "reason";

    // Rules and templates
    public const string RuleId = "rule_id";
    public const string Matched = "matched";
    public const string TemplateKey = "template_key";
    public const string Success = "success";

    // Storms
    public const string StormKey = "storm_key";
    public const string Decision = "decision";

    // Retention
    public const string EntityType = "entity_type";
}
|
||||
|
||||
/// <summary>
/// Instrument names published by the notification system. All names share the
/// <c>notify.</c> prefix.
/// </summary>
public static class NotifyMetricNames
{
    /// <summary>Instrument name <c>notify.delivery.attempts</c>.</summary>
    public const string DeliveryAttempts = "notify.delivery.attempts";

    /// <summary>Instrument name <c>notify.delivery.duration</c>.</summary>
    public const string DeliveryDuration = "notify.delivery.duration";

    /// <summary>Instrument name <c>notify.escalation.events</c>.</summary>
    public const string EscalationEvents = "notify.escalation.events";

    /// <summary>Instrument name <c>notify.deadletter.entries</c>.</summary>
    public const string DeadLetterEntries = "notify.deadletter.entries";

    /// <summary>Instrument name <c>notify.rule.evaluations</c>.</summary>
    public const string RuleEvaluations = "notify.rule.evaluations";

    /// <summary>Instrument name <c>notify.rule.evaluation.duration</c>.</summary>
    public const string RuleEvaluationDuration = "notify.rule.evaluation.duration";

    /// <summary>Instrument name <c>notify.template.renders</c>.</summary>
    public const string TemplateRenders = "notify.template.renders";

    /// <summary>Instrument name <c>notify.template.render.duration</c>.</summary>
    public const string TemplateRenderDuration = "notify.template.render.duration";

    /// <summary>Instrument name <c>notify.storm.events</c>.</summary>
    public const string StormEvents = "notify.storm.events";

    /// <summary>Instrument name <c>notify.retention.cleanups</c>.</summary>
    public const string RetentionCleanups = "notify.retention.cleanups";

    /// <summary>Instrument name <c>notify.queue.depth</c>.</summary>
    public const string QueueDepth = "notify.queue.depth";
}
|
||||
|
||||
@@ -0,0 +1,456 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Hosting;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// Contract for managing per-tenant data retention policies for notifications and
/// related data, and for applying or previewing them.
/// </summary>
public interface IRetentionPolicyService
{
    /// <summary>
    /// Lists every retention policy registered for a tenant.
    /// </summary>
    /// <param name="tenantId">Tenant whose policies are requested.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The tenant's policies.</returns>
    Task<IReadOnlyList<RetentionPolicy>> GetPoliciesAsync(string tenantId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Looks up a single retention policy.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the policy.</param>
    /// <param name="policyId">Identifier of the policy.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The policy; the return type is nullable, so a missing policy can be reported as <c>null</c>.</returns>
    Task<RetentionPolicy?> GetPolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Creates the policy, or updates it when it already exists.
    /// </summary>
    /// <param name="policy">Policy to store.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The policy as stored.</returns>
    Task<RetentionPolicy> UpsertPolicyAsync(RetentionPolicy policy, CancellationToken cancellationToken = default);

    /// <summary>
    /// Deletes a retention policy.
    /// </summary>
    /// <param name="tenantId">Tenant that owns the policy.</param>
    /// <param name="policyId">Identifier of the policy to delete.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A flag describing the outcome of the delete.</returns>
    Task<bool> DeletePolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Applies retention policies and purges old data.
    /// </summary>
    /// <param name="tenantId">Tenant to process; optional and defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A summary of the run.</returns>
    Task<RetentionResult> ApplyAsync(string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>
    /// Returns retention statistics.
    /// </summary>
    /// <param name="tenantId">Tenant to report on; optional and defaults to <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The retention statistics.</returns>
    Task<RetentionStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default);

    /// <summary>
    /// Previews what the tenant's retention policies would affect.
    /// </summary>
    /// <param name="tenantId">Tenant to preview.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The preview.</returns>
    Task<RetentionPreview> PreviewAsync(string tenantId, CancellationToken cancellationToken = default);
}
|
||||
|
||||
/// <summary>
/// Immutable description of a data retention policy owned by a tenant.
/// </summary>
public sealed record RetentionPolicy
{
    /// <summary>Identifier of the policy.</summary>
    public required string PolicyId { get; init; }

    /// <summary>Tenant that owns the policy.</summary>
    public required string TenantId { get; init; }

    /// <summary>Name of the policy.</summary>
    public required string Name { get; init; }

    /// <summary>Optional description of the policy.</summary>
    public string? Description { get; init; }

    /// <summary>Kind of data the policy governs.</summary>
    public required RetentionDataType DataType { get; init; }

    /// <summary>Retention period configured for the policy.</summary>
    public required TimeSpan RetentionPeriod { get; init; }

    /// <summary>Action taken when the retention period expires; defaults to <see cref="RetentionAction.Delete"/>.</summary>
    public RetentionAction Action { get; init; } = RetentionAction.Delete;

    /// <summary>Optional archive destination. NOTE(review): presumably only relevant for <see cref="RetentionAction.Archive"/> - confirm.</summary>
    public string? ArchiveDestination { get; init; }

    /// <summary>Whether the policy is enabled; defaults to <c>true</c>.</summary>
    public bool Enabled { get; init; } = true;

    /// <summary>Optional list of channel types. NOTE(review): presumably a filter - confirm.</summary>
    public IReadOnlyList<string>? ChannelTypes { get; init; }

    /// <summary>Optional list of event kinds. NOTE(review): presumably a filter - confirm.</summary>
    public IReadOnlyList<string>? EventKinds { get; init; }

    /// <summary>Optional minimum count. NOTE(review): semantics are not visible in this file - confirm.</summary>
    public int? MinimumCount { get; init; }

    /// <summary>Creation timestamp.</summary>
    public DateTimeOffset CreatedAt { get; init; }

    /// <summary>Last-update timestamp.</summary>
    public DateTimeOffset UpdatedAt { get; init; }

    /// <summary>Timestamp of the most recent application of this policy, if any.</summary>
    public DateTimeOffset? LastAppliedAt { get; init; }
}
|
||||
|
||||
/// <summary>
/// Kinds of data that a retention policy can govern. Numeric values are spelled out and
/// match the declaration order.
/// </summary>
public enum RetentionDataType
{
    Deliveries = 0,
    DeadLetters = 1,
    Incidents = 2,
    AuditLogs = 3,
    Metrics = 4,
    Templates = 5,
    EscalationHistory = 6,
    DigestHistory = 7,
    InboxNotifications = 8
}
|
||||
|
||||
/// <summary>
/// What happens to data once its retention period has expired. Numeric values are spelled
/// out and match the declaration order; <see cref="Delete"/> is the zero value.
/// </summary>
public enum RetentionAction
{
    Delete = 0,
    Archive = 1,
    Anonymize = 2
}
|
||||
|
||||
/// <summary>
/// Summary of one retention run across the policies that were processed.
/// </summary>
public sealed record RetentionResult
{
    /// <summary>Timestamp of the result.</summary>
    public DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the run was scoped to, if any.</summary>
    public string? TenantId { get; init; }

    /// <summary>Number of policies applied.</summary>
    public int PoliciesApplied { get; init; }

    /// <summary>Total number of deleted items.</summary>
    public int TotalDeleted { get; init; }

    /// <summary>Total number of archived items.</summary>
    public int TotalArchived { get; init; }

    /// <summary>Total number of anonymized items.</summary>
    public int TotalAnonymized { get; init; }

    /// <summary>Duration of the run.</summary>
    public TimeSpan Duration { get; init; }

    /// <summary>Per-policy results; empty by default.</summary>
    public IReadOnlyList<RetentionPolicyResult> PolicyResults { get; init; } = [];

    /// <summary>Error messages collected during the run; empty by default.</summary>
    public IReadOnlyList<string> Errors { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Outcome of applying one retention policy.
/// </summary>
public sealed record RetentionPolicyResult
{
    /// <summary>Identifier of the policy.</summary>
    public required string PolicyId { get; init; }

    /// <summary>Name of the policy.</summary>
    public required string PolicyName { get; init; }

    /// <summary>Kind of data the policy governs.</summary>
    public required RetentionDataType DataType { get; init; }

    /// <summary>Number of items affected.</summary>
    public int AffectedCount { get; init; }

    /// <summary>Action that was taken.</summary>
    public RetentionAction ActionTaken { get; init; }

    /// <summary>Whether the policy was applied successfully.</summary>
    public bool Success { get; init; }

    /// <summary>Error message, if any.</summary>
    public string? Error { get; init; }
}
|
||||
|
||||
/// <summary>
/// Aggregate statistics about retention policies and their effects.
/// </summary>
public sealed record RetentionStats
{
    /// <summary>Timestamp of the statistics.</summary>
    public DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the statistics are scoped to, if any.</summary>
    public string? TenantId { get; init; }

    /// <summary>Number of policies.</summary>
    public int TotalPolicies { get; init; }

    /// <summary>Number of enabled policies.</summary>
    public int EnabledPolicies { get; init; }

    /// <summary>Number of disabled policies.</summary>
    public int DisabledPolicies { get; init; }

    /// <summary>All-time count of deleted items.</summary>
    public long TotalDeletedAllTime { get; init; }

    /// <summary>All-time count of archived items.</summary>
    public long TotalArchivedAllTime { get; init; }

    /// <summary>Timestamp of the last run, if any.</summary>
    public DateTimeOffset? LastRunAt { get; init; }

    /// <summary>Timestamp of the next scheduled run, if any.</summary>
    public DateTimeOffset? NextScheduledRun { get; init; }

    /// <summary>Statistics keyed by data type; an empty dictionary by default.</summary>
    public IReadOnlyDictionary<RetentionDataType, DataTypeStats> ByDataType { get; init; } =
        new Dictionary<RetentionDataType, DataTypeStats>();
}
|
||||
|
||||
/// <summary>
/// Retention statistics for one <see cref="RetentionDataType"/>.
/// </summary>
public sealed record DataTypeStats
{
    /// <summary>Data type these statistics describe.</summary>
    public required RetentionDataType DataType { get; init; }

    /// <summary>Current number of records.</summary>
    public long CurrentCount { get; init; }

    /// <summary>Timestamp of the oldest record, if any.</summary>
    public DateTimeOffset? OldestRecord { get; init; }

    /// <summary>Number of deleted records.</summary>
    public long DeletedCount { get; init; }

    /// <summary>Number of archived records.</summary>
    public long ArchivedCount { get; init; }
}
|
||||
|
||||
/// <summary>
/// Preview of what applying a tenant's retention policies would affect.
/// </summary>
public sealed record RetentionPreview
{
    /// <summary>Timestamp of the preview.</summary>
    public DateTimeOffset Timestamp { get; init; }

    /// <summary>Tenant the preview is scoped to, if any.</summary>
    public string? TenantId { get; init; }

    /// <summary>Total number of items that would be deleted.</summary>
    public int TotalToDelete { get; init; }

    /// <summary>Total number of items that would be archived.</summary>
    public int TotalToArchive { get; init; }

    /// <summary>Total number of items that would be anonymized.</summary>
    public int TotalToAnonymize { get; init; }

    /// <summary>Per-policy preview entries; empty by default.</summary>
    public IReadOnlyList<RetentionPreviewItem> Items { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// Preview entry describing the effect of one retention policy.
/// </summary>
public sealed record RetentionPreviewItem
{
    /// <summary>Identifier of the policy.</summary>
    public required string PolicyId { get; init; }

    /// <summary>Name of the policy.</summary>
    public required string PolicyName { get; init; }

    /// <summary>Kind of data the policy governs.</summary>
    public required RetentionDataType DataType { get; init; }

    /// <summary>Number of items that would be affected.</summary>
    public int AffectedCount { get; init; }

    /// <summary>Action the policy would take.</summary>
    public RetentionAction Action { get; init; }

    /// <summary>Timestamp of the oldest affected item, if known.</summary>
    public DateTimeOffset? OldestAffected { get; init; }

    /// <summary>Timestamp of the newest affected item, if known.</summary>
    public DateTimeOffset? NewestAffected { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration options for the retention service, bound from the
/// <see cref="SectionName"/> configuration section.
/// </summary>
public sealed class RetentionOptions
{
    /// <summary>Configuration section these options are bound from.</summary>
    public const string SectionName = "Notifier:Observability:Retention";

    /// <summary>Master switch for retention; defaults to <c>true</c>.</summary>
    public bool Enabled { get; set; } = true;

    /// <summary>Default retention period; defaults to 90 days.</summary>
    public TimeSpan DefaultRetentionPeriod { get; set; } = TimeSpan.FromDays(90);

    /// <summary>Minimum retention period; defaults to 1 day.</summary>
    public TimeSpan MinimumRetentionPeriod { get; set; } = TimeSpan.FromDays(1);

    /// <summary>Maximum retention period; defaults to 7 x 365 days.</summary>
    public TimeSpan MaximumRetentionPeriod { get; set; } = TimeSpan.FromDays(7 * 365);

    /// <summary>Whether retention runs automatically on a schedule; defaults to <c>true</c>.</summary>
    public bool AutoRun { get; set; } = true;

    /// <summary>Interval between scheduled runs; defaults to 24 hours.</summary>
    public TimeSpan RunInterval { get; set; } = TimeSpan.FromDays(1);

    /// <summary>Time of day, as an offset from midnight, at which the scheduled run happens; defaults to 03:00.</summary>
    public TimeSpan RunTime { get; set; } = TimeSpan.FromHours(3);

    /// <summary>Batch size; defaults to 1000. NOTE(review): not consumed by any code visible in this file - confirm usage.</summary>
    public int BatchSize { get; set; } = 1000;

    /// <summary>Whether runs are dry runs by default; defaults to <c>false</c>.</summary>
    public bool DryRunByDefault { get; set; } = false;
}
|
||||
|
||||
/// <summary>
/// In-memory implementation of <see cref="IRetentionPolicyService"/>.
/// </summary>
/// <remarks>
/// Policies are kept per tenant in a <see cref="List{T}"/>. Every read and write of a
/// tenant's list happens while holding a lock on that list, and iteration always works
/// on a snapshot, so a list is never enumerated while it is being modified. No real data
/// is purged: affected counts come from <see cref="SimulateRetention"/>.
/// </remarks>
public sealed class InMemoryRetentionPolicyService : IRetentionPolicyService
{
    private readonly ConcurrentDictionary<string, List<RetentionPolicy>> _policies = new();
    private readonly ConcurrentDictionary<string, RetentionStats> _stats = new();
    private readonly RetentionOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<InMemoryRetentionPolicyService> _logger;

    /// <summary>
    /// Creates the service.
    /// </summary>
    /// <param name="options">Retention options; default options are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public InMemoryRetentionPolicyService(
        IOptions<RetentionOptions> options,
        TimeProvider timeProvider,
        ILogger<InMemoryRetentionPolicyService> logger)
    {
        _options = options?.Value ?? new RetentionOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Returns a snapshot of the tenant's policies; empty when the tenant has none.
    /// </summary>
    public Task<IReadOnlyList<RetentionPolicy>> GetPoliciesAsync(string tenantId, CancellationToken cancellationToken = default)
    {
        return Task.FromResult<IReadOnlyList<RetentionPolicy>>(SnapshotPolicies(tenantId));
    }

    /// <summary>
    /// Returns the policy with the given id, or <c>null</c> when the tenant or policy is unknown.
    /// </summary>
    public Task<RetentionPolicy?> GetPolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default)
    {
        var match = SnapshotPolicies(tenantId).FirstOrDefault(p => p.PolicyId == policyId);
        return Task.FromResult<RetentionPolicy?>(match);
    }

    /// <summary>
    /// Inserts the policy, or replaces the stored policy with the same id.
    /// <see cref="RetentionPolicy.UpdatedAt"/> is set to the current time;
    /// <see cref="RetentionPolicy.CreatedAt"/> is set on insert and preserved on update.
    /// </summary>
    /// <returns>The policy as stored.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="policy"/> is <c>null</c>.</exception>
    public Task<RetentionPolicy> UpsertPolicyAsync(RetentionPolicy policy, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(policy);

        var now = _timeProvider.GetUtcNow();
        var list = _policies.GetOrAdd(policy.TenantId, static _ => new List<RetentionPolicy>());

        RetentionPolicy updated;
        lock (list)
        {
            var index = list.FindIndex(p => p.PolicyId == policy.PolicyId);
            updated = policy with { UpdatedAt = now, CreatedAt = index < 0 ? now : list[index].CreatedAt };
            if (index >= 0)
            {
                list[index] = updated;
            }
            else
            {
                list.Add(updated);
            }
        }

        _logger.LogInformation("Upserted retention policy {PolicyId} for tenant {TenantId}", policy.PolicyId, policy.TenantId);
        return Task.FromResult(updated);
    }

    /// <summary>
    /// Removes the policy with the given id.
    /// </summary>
    /// <returns><c>true</c> when a policy was removed; otherwise <c>false</c>.</returns>
    public Task<bool> DeletePolicyAsync(string tenantId, string policyId, CancellationToken cancellationToken = default)
    {
        if (!_policies.TryGetValue(tenantId, out var policies))
        {
            return Task.FromResult(false);
        }

        bool removed;
        lock (policies)
        {
            removed = policies.RemoveAll(p => p.PolicyId == policyId) > 0;
        }

        if (removed)
        {
            _logger.LogInformation("Deleted retention policy {PolicyId} for tenant {TenantId}", policyId, tenantId);
        }

        return Task.FromResult(removed);
    }

    /// <summary>
    /// Applies every enabled policy of the given tenant, or of all tenants when
    /// <paramref name="tenantId"/> is <c>null</c>, and stamps
    /// <see cref="RetentionPolicy.LastAppliedAt"/> on each applied policy.
    /// </summary>
    /// <returns>
    /// A summary of the run. The task is canceled when <paramref name="cancellationToken"/>
    /// is signalled before a policy is processed.
    /// </returns>
    public Task<RetentionResult> ApplyAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        var startTime = _timeProvider.GetUtcNow();
        var policyResults = new List<RetentionPolicyResult>();
        var errors = new List<string>();
        var totalDeleted = 0;
        var totalArchived = 0;
        var totalAnonymized = 0;

        List<string> tenantsToProcess = tenantId is not null ? [tenantId] : _policies.Keys.ToList();

        foreach (var tenant in tenantsToProcess)
        {
            if (!_policies.TryGetValue(tenant, out var policies))
            {
                continue;
            }

            // Iterate a snapshot: the loop body writes back into the live list, and
            // List<T> invalidates active enumerators on any modification.
            List<RetentionPolicy> enabledPolicies;
            lock (policies)
            {
                enabledPolicies = policies.Where(p => p.Enabled).ToList();
            }

            foreach (var policy in enabledPolicies)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    return Task.FromCanceled<RetentionResult>(cancellationToken);
                }

                try
                {
                    var affectedCount = SimulateRetention(policy);
                    policyResults.Add(new RetentionPolicyResult
                    {
                        PolicyId = policy.PolicyId,
                        PolicyName = policy.Name,
                        DataType = policy.DataType,
                        AffectedCount = affectedCount,
                        ActionTaken = policy.Action,
                        Success = true
                    });

                    switch (policy.Action)
                    {
                        case RetentionAction.Delete:
                            totalDeleted += affectedCount;
                            break;
                        case RetentionAction.Archive:
                            totalArchived += affectedCount;
                            break;
                        case RetentionAction.Anonymize:
                            totalAnonymized += affectedCount;
                            break;
                    }

                    // Stamp the entry that is currently stored (not the snapshot copy)
                    // so that a concurrent upsert of the same policy is not overwritten.
                    var appliedAt = _timeProvider.GetUtcNow();
                    lock (policies)
                    {
                        var idx = policies.FindIndex(p => p.PolicyId == policy.PolicyId);
                        if (idx >= 0)
                        {
                            policies[idx] = policies[idx] with { LastAppliedAt = appliedAt };
                        }
                    }
                }
                catch (Exception ex)
                {
                    _logger.LogError(ex, "Failed to apply retention policy {PolicyId} for tenant {TenantId}", policy.PolicyId, tenant);
                    errors.Add($"Policy {policy.PolicyId}: {ex.Message}");
                    policyResults.Add(new RetentionPolicyResult
                    {
                        PolicyId = policy.PolicyId,
                        PolicyName = policy.Name,
                        DataType = policy.DataType,
                        Success = false,
                        Error = ex.Message
                    });
                }
            }
        }

        var endTime = _timeProvider.GetUtcNow();
        _logger.LogInformation("Applied retention policies: {Deleted} deleted, {Archived} archived, {Anonymized} anonymized", totalDeleted, totalArchived, totalAnonymized);

        return Task.FromResult(new RetentionResult
        {
            Timestamp = endTime,
            TenantId = tenantId,
            PoliciesApplied = policyResults.Count(r => r.Success),
            TotalDeleted = totalDeleted,
            TotalArchived = totalArchived,
            TotalAnonymized = totalAnonymized,
            Duration = endTime - startTime,
            PolicyResults = policyResults,
            Errors = errors
        });
    }

    /// <summary>
    /// Returns policy counts and the latest <see cref="RetentionPolicy.LastAppliedAt"/> for
    /// the given tenant, or across all tenants when <paramref name="tenantId"/> is
    /// <c>null</c>. Per-data-type statistics are returned with zero counts.
    /// </summary>
    public Task<RetentionStats> GetStatsAsync(string? tenantId = null, CancellationToken cancellationToken = default)
    {
        List<RetentionPolicy> allPolicies = tenantId is not null
            ? SnapshotPolicies(tenantId)
            : _policies.Keys.SelectMany(t => SnapshotPolicies(t)).ToList();

        var byDataType = Enum.GetValues<RetentionDataType>()
            .ToDictionary(dt => dt, dt => new DataTypeStats { DataType = dt, CurrentCount = 0, DeletedCount = 0, ArchivedCount = 0 });

        return Task.FromResult(new RetentionStats
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalPolicies = allPolicies.Count,
            EnabledPolicies = allPolicies.Count(p => p.Enabled),
            DisabledPolicies = allPolicies.Count(p => !p.Enabled),
            // Max over a nullable selector yields null when no policy has been applied yet.
            LastRunAt = allPolicies.Max(p => p.LastAppliedAt),
            ByDataType = byDataType
        });
    }

    /// <summary>
    /// Computes, without changing any state, what each enabled policy of the tenant would affect.
    /// </summary>
    public Task<RetentionPreview> PreviewAsync(string tenantId, CancellationToken cancellationToken = default)
    {
        var items = SnapshotPolicies(tenantId)
            .Where(p => p.Enabled)
            .Select(p => new RetentionPreviewItem
            {
                PolicyId = p.PolicyId,
                PolicyName = p.Name,
                DataType = p.DataType,
                AffectedCount = SimulateRetention(p),
                Action = p.Action
            })
            .ToList();

        return Task.FromResult(new RetentionPreview
        {
            Timestamp = _timeProvider.GetUtcNow(),
            TenantId = tenantId,
            TotalToDelete = items.Where(i => i.Action == RetentionAction.Delete).Sum(i => i.AffectedCount),
            TotalToArchive = items.Where(i => i.Action == RetentionAction.Archive).Sum(i => i.AffectedCount),
            TotalToAnonymize = items.Where(i => i.Action == RetentionAction.Anonymize).Sum(i => i.AffectedCount),
            Items = items
        });
    }

    /// <summary>
    /// Copies the tenant's policy list while holding its lock; returns an empty list for an unknown tenant.
    /// </summary>
    private List<RetentionPolicy> SnapshotPolicies(string tenantId)
    {
        if (!_policies.TryGetValue(tenantId, out var policies))
        {
            return [];
        }

        lock (policies)
        {
            return policies.ToList();
        }
    }

    /// <summary>
    /// Stand-in for a real data-store query: returns a deterministic count of
    /// 100 minus the retention period in whole days, never below zero.
    /// </summary>
    private static int SimulateRetention(RetentionPolicy policy)
    {
        // In production, this would query actual data stores.
        var daysFactor = (int)policy.RetentionPeriod.TotalDays;
        return Math.Max(0, 100 - daysFactor);
    }
}
|
||||
|
||||
/// <summary>
/// Background service that applies retention policies on a schedule.
/// </summary>
/// <remarks>
/// Each iteration waits until the next occurrence of <see cref="RetentionOptions.RunTime"/>
/// (interpreted as a UTC time of day), capped at <see cref="RetentionOptions.RunInterval"/>,
/// and then calls <see cref="IRetentionPolicyService.ApplyAsync"/> for all tenants.
/// All waits go through the injected <see cref="TimeProvider"/>.
/// </remarks>
public sealed class RetentionPolicyRunner : BackgroundService
{
    /// <summary>Back-off applied after a failed run before the loop retries.</summary>
    private static readonly TimeSpan ErrorBackoff = TimeSpan.FromMinutes(5);

    private readonly IRetentionPolicyService _retentionService;
    private readonly RetentionOptions _options;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<RetentionPolicyRunner> _logger;

    /// <summary>
    /// Creates the runner.
    /// </summary>
    /// <param name="retentionService">Service used to apply the policies.</param>
    /// <param name="options">Retention options; default options are used when <c>null</c>.</param>
    /// <param name="timeProvider">Clock; <see cref="TimeProvider.System"/> is used when <c>null</c>.</param>
    /// <param name="logger">Logger.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="retentionService"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    public RetentionPolicyRunner(
        IRetentionPolicyService retentionService,
        IOptions<RetentionOptions> options,
        TimeProvider timeProvider,
        ILogger<RetentionPolicyRunner> logger)
    {
        _retentionService = retentionService ?? throw new ArgumentNullException(nameof(retentionService));
        _options = options?.Value ?? new RetentionOptions();
        _timeProvider = timeProvider ?? TimeProvider.System;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Runs the scheduling loop until <paramref name="stoppingToken"/> is signalled. Returns
    /// immediately when retention or auto-run is disabled.
    /// </summary>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_options.Enabled || !_options.AutoRun)
        {
            _logger.LogInformation("Retention policy runner is disabled");
            return;
        }

        _logger.LogInformation("Retention policy runner started with interval {Interval}", _options.RunInterval);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                var delay = ComputeDelay(_timeProvider.GetUtcNow());
                await Task.Delay(delay, _timeProvider, stoppingToken);

                _logger.LogInformation("Running scheduled retention policy application");
                var result = await _retentionService.ApplyAsync(cancellationToken: stoppingToken);
                _logger.LogInformation("Retention completed: {Deleted} deleted, {Archived} archived in {Duration}ms",
                    result.TotalDeleted, result.TotalArchived, result.Duration.TotalMilliseconds);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger.LogError(ex, "Error running retention policies");

                try
                {
                    await Task.Delay(ErrorBackoff, _timeProvider, stoppingToken);
                }
                catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
                {
                    // Shutdown was requested during the back-off; leave the loop quietly.
                    break;
                }
            }
        }
    }

    /// <summary>
    /// Computes how long to wait before the next run.
    /// </summary>
    /// <param name="now">Current time.</param>
    /// <returns>
    /// Time until the next UTC occurrence of <see cref="RetentionOptions.RunTime"/>, capped
    /// at <see cref="RetentionOptions.RunInterval"/> when that interval is positive.
    /// </returns>
    private TimeSpan ComputeDelay(DateTimeOffset now)
    {
        var utcNow = now.ToUniversalTime();

        // DateTimeOffset.Date is a DateTime with Kind=Unspecified. Re-attach the zero offset
        // explicitly, otherwise the implicit DateTime -> DateTimeOffset conversion would
        // interpret it as local time and skew the schedule on non-UTC hosts.
        var nextRun = new DateTimeOffset(utcNow.Date, TimeSpan.Zero).Add(_options.RunTime);
        if (nextRun <= utcNow)
        {
            nextRun = nextRun.AddDays(1);
        }

        var delay = nextRun - utcNow;

        // A non-positive interval cannot act as a cap: it would yield an invalid delay.
        if (_options.RunInterval > TimeSpan.Zero && delay > _options.RunInterval)
        {
            delay = _options.RunInterval;
        }

        return delay;
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,244 @@
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
|
||||
namespace StellaOps.Notifier.Worker.Observability;
|
||||
|
||||
/// <summary>
/// <see cref="IServiceCollection"/> extensions that register the notifier's observability services.
/// </summary>
public static class ObservabilityServiceExtensions
{
    /// <summary>
    /// Registers metrics, tracing, dead-letter handling, the chaos engine and retention
    /// policies, in that order.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option sections are read from.</param>
    /// <returns>The same service collection.</returns>
    public static IServiceCollection AddNotifierObservability(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var registered = services.AddNotifierMetrics(configuration);
        registered = registered.AddNotifierTracing(configuration);
        registered = registered.AddDeadLetterHandling(configuration);
        registered = registered.AddChaosEngine(configuration);
        return registered.AddRetentionPolicies(configuration);
    }

    /// <summary>
    /// Binds <see cref="NotifierMetricsOptions"/> and registers the default metrics implementation.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The service collection.</returns>
    public static IServiceCollection AddNotifierMetrics(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(NotifierMetricsOptions.SectionName);
        return services
            .Configure<NotifierMetricsOptions>(section)
            .AddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
    }

    /// <summary>
    /// Binds <see cref="NotifierTracingOptions"/> and registers the default tracing implementation.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The service collection.</returns>
    public static IServiceCollection AddNotifierTracing(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(NotifierTracingOptions.SectionName);
        return services
            .Configure<NotifierTracingOptions>(section)
            .AddSingleton<INotifierTracing, DefaultNotifierTracing>();
    }

    /// <summary>
    /// Binds <see cref="DeadLetterOptions"/> and registers the in-memory dead-letter handler.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The service collection.</returns>
    public static IServiceCollection AddDeadLetterHandling(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(DeadLetterOptions.SectionName);
        return services
            .Configure<DeadLetterOptions>(section)
            .AddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
    }

    /// <summary>
    /// Binds <see cref="ChaosEngineOptions"/> and registers the default chaos engine.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The service collection.</returns>
    public static IServiceCollection AddChaosEngine(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(ChaosEngineOptions.SectionName);
        return services
            .Configure<ChaosEngineOptions>(section)
            .AddSingleton<IChaosEngine, DefaultChaosEngine>();
    }

    /// <summary>
    /// Binds <see cref="RetentionOptions"/>, registers the in-memory retention policy service
    /// and adds the <see cref="RetentionPolicyRunner"/> hosted service.
    /// </summary>
    /// <param name="services">Service collection to add to.</param>
    /// <param name="configuration">Configuration the option section is read from.</param>
    /// <returns>The service collection.</returns>
    public static IServiceCollection AddRetentionPolicies(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        var section = configuration.GetSection(RetentionOptions.SectionName);
        return services
            .Configure<RetentionOptions>(section)
            .AddSingleton<IRetentionPolicyService, InMemoryRetentionPolicyService>()
            .AddHostedService<RetentionPolicyRunner>();
    }

    /// <summary>
    /// Starts a builder for customizing the observability services. Nothing is registered
    /// by this call itself.
    /// </summary>
    /// <param name="services">Service collection the builder will add to.</param>
    /// <returns>A new <see cref="ObservabilityServiceBuilder"/> wrapping <paramref name="services"/>.</returns>
    public static ObservabilityServiceBuilder AddNotifierObservability(this IServiceCollection services)
        => new ObservabilityServiceBuilder(services);
}
|
||||
|
||||
/// <summary>
/// Fluent builder for customizing the observability services before the defaults are
/// filled in by <see cref="Build"/>.
/// </summary>
public sealed class ObservabilityServiceBuilder
{
    private readonly IServiceCollection _services;

    /// <summary>
    /// Creates a builder over the given service collection.
    /// </summary>
    /// <param name="services">Service collection to add registrations to.</param>
    /// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
    public ObservabilityServiceBuilder(IServiceCollection services)
    {
        _services = services ?? throw new ArgumentNullException(nameof(services));
    }

    /// <summary>Registers a configuration delegate for the metrics options.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureMetrics(Action<NotifierMetricsOptions> configure)
        => WithOptions(configure);

    /// <summary>Registers a configuration delegate for the tracing options.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureTracing(Action<NotifierTracingOptions> configure)
        => WithOptions(configure);

    /// <summary>Registers a configuration delegate for the dead-letter options.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureDeadLetter(Action<DeadLetterOptions> configure)
        => WithOptions(configure);

    /// <summary>Registers a configuration delegate for the chaos engine options.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureChaos(Action<ChaosEngineOptions> configure)
        => WithOptions(configure);

    /// <summary>Registers a configuration delegate for the retention options.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder ConfigureRetention(Action<RetentionOptions> configure)
        => WithOptions(configure);

    /// <summary>Registers <typeparamref name="T"/> as the singleton metrics implementation.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomMetrics<T>() where T : class, INotifierMetrics
        => WithSingleton<INotifierMetrics, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton tracing implementation.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomTracing<T>() where T : class, INotifierTracing
        => WithSingleton<INotifierTracing, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton dead-letter handler.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomDeadLetterHandler<T>() where T : class, IDeadLetterHandler
        => WithSingleton<IDeadLetterHandler, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton chaos engine.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomChaosEngine<T>() where T : class, IChaosEngine
        => WithSingleton<IChaosEngine, T>();

    /// <summary>Registers <typeparamref name="T"/> as the singleton retention policy service.</summary>
    /// <returns>This builder.</returns>
    public ObservabilityServiceBuilder UseCustomRetentionService<T>() where T : class, IRetentionPolicyService
        => WithSingleton<IRetentionPolicyService, T>();

    /// <summary>
    /// Registers the default implementation for every service type that has no registration
    /// yet, then adds the <see cref="RetentionPolicyRunner"/> hosted service.
    /// </summary>
    /// <returns>The underlying service collection.</returns>
    public IServiceCollection Build()
    {
        // Defaults apply only where no registration for the service type exists.
        _services.TryAddSingleton<INotifierMetrics, DefaultNotifierMetrics>();
        _services.TryAddSingleton<INotifierTracing, DefaultNotifierTracing>();
        _services.TryAddSingleton<IDeadLetterHandler, InMemoryDeadLetterHandler>();
        _services.TryAddSingleton<IChaosEngine, DefaultChaosEngine>();
        _services.TryAddSingleton<IRetentionPolicyService, InMemoryRetentionPolicyService>();

        _services.AddHostedService<RetentionPolicyRunner>();

        return _services;
    }

    /// <summary>Adds an options configuration delegate and returns this builder for chaining.</summary>
    private ObservabilityServiceBuilder WithOptions<TOptions>(Action<TOptions> configure)
        where TOptions : class
    {
        _services.Configure(configure);
        return this;
    }

    /// <summary>Adds a singleton registration and returns this builder for chaining.</summary>
    private ObservabilityServiceBuilder WithSingleton<TService, TImplementation>()
        where TService : class
        where TImplementation : class, TService
    {
        _services.AddSingleton<TService, TImplementation>();
        return this;
    }
}
|
||||
|
||||
/// <summary>
/// File-local helper that adds a singleton registration only when the service type is
/// not registered yet.
/// </summary>
file static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds <typeparamref name="TImplementation"/> as the singleton for
    /// <typeparamref name="TService"/> unless some descriptor for
    /// <typeparamref name="TService"/> is already present in <paramref name="services"/>.
    /// </summary>
    /// <param name="services">Service collection to inspect and add to.</param>
    public static void TryAddSingleton<TService, TImplementation>(this IServiceCollection services)
        where TService : class
        where TImplementation : class, TService
    {
        var alreadyRegistered = services.Any(descriptor => descriptor.ServiceType == typeof(TService));
        if (alreadyRegistered)
        {
            return;
        }

        services.AddSingleton<TService, TImplementation>();
    }
}
|
||||
Reference in New Issue
Block a user