up
This commit is contained in:
@@ -0,0 +1,355 @@
|
||||
using System.Collections.Concurrent;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.Findings.Ledger.Observability;
|
||||
using StellaOps.Findings.Ledger.Options;
|
||||
using StellaOps.Telemetry.Core;
|
||||
|
||||
namespace StellaOps.Findings.Ledger.Services.Incident;
|
||||
|
||||
/// <summary>
/// Incident-mode state plus a sink for diagnostic samples gathered by ledger services.
/// </summary>
/// <remarks>
/// The implementation in this file (<c>LedgerIncidentCoordinator</c>) only retains samples while
/// incident mode is active and the matching capture option is enabled; other implementations may differ.
/// </remarks>
public interface ILedgerIncidentDiagnostics : ILedgerIncidentState
{
    /// <summary>Offers a projection lag observation to the diagnostics buffer.</summary>
    /// <param name="sample">The lag observation to record.</param>
    void RecordProjectionLag(ProjectionLagSample sample);

    /// <summary>Offers the details of a rejected (conflicting) ledger append to the diagnostics buffer.</summary>
    /// <param name="snapshot">The conflict details to record.</param>
    void RecordConflict(ConflictSnapshot snapshot);

    /// <summary>Offers the summary of a replay request to the diagnostics buffer.</summary>
    /// <param name="sample">The replay summary to record.</param>
    void RecordReplayTrace(ReplayTraceSample sample);

    /// <summary>Returns the current incident state together with the samples buffered so far.</summary>
    /// <returns>A point-in-time copy of the diagnostics.</returns>
    IncidentDiagnosticsSnapshot GetDiagnosticsSnapshot();
}
|
||||
|
||||
/// <summary>
/// Read-only view of the ledger's incident-mode state.
/// </summary>
public interface ILedgerIncidentState
{
    /// <summary>Gets a value indicating whether incident mode is currently active for the ledger.</summary>
    bool IsActive { get; }

    /// <summary>Gets the most recently published incident-mode snapshot.</summary>
    LedgerIncidentSnapshot Current { get; }
}
|
||||
|
||||
/// <summary>
/// Publishes notifications when the ledger's incident mode is switched on or off.
/// </summary>
public interface ILedgerIncidentNotifier
{
    /// <summary>Publishes a notification describing the new incident-mode state.</summary>
    /// <param name="snapshot">The state that was just applied.</param>
    /// <param name="cancellationToken">Token used to cancel the publish operation.</param>
    /// <returns>A task that completes when the notification has been handed off.</returns>
    Task PublishIncidentModeChangedAsync(LedgerIncidentSnapshot snapshot, CancellationToken cancellationToken);
}
|
||||
|
||||
/// <summary>
/// <see cref="ILedgerIncidentNotifier"/> that "publishes" incident-mode changes by writing a
/// warning-level log entry; it performs no other I/O.
/// </summary>
public sealed class LoggingLedgerIncidentNotifier : ILedgerIncidentNotifier
{
    private readonly ILogger<LoggingLedgerIncidentNotifier> _logger;

    /// <summary>Creates the notifier.</summary>
    /// <param name="logger">Logger that receives the notification entries.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <see langword="null"/>.</exception>
    public LoggingLedgerIncidentNotifier(ILogger<LoggingLedgerIncidentNotifier> logger)
    {
        ArgumentNullException.ThrowIfNull(logger);
        _logger = logger;
    }

    /// <summary>Logs the incident-mode change and completes synchronously.</summary>
    /// <param name="snapshot">The state that was just applied.</param>
    /// <param name="cancellationToken">Not used; logging completes synchronously.</param>
    /// <returns>An already-completed task.</returns>
    public Task PublishIncidentModeChangedAsync(LedgerIncidentSnapshot snapshot, CancellationToken cancellationToken)
    {
        string modeLabel;
        if (snapshot.IsActive)
        {
            modeLabel = "enabled";
        }
        else
        {
            modeLabel = "disabled";
        }

        // A missing activation id is logged as an empty string rather than null.
        var activationId = snapshot.ActivationId ?? string.Empty;

        _logger.LogWarning(
            "NOTIFICATION: Ledger incident mode {State} (activation_id={ActivationId}, retention_extension_days={ExtensionDays})",
            modeLabel,
            activationId,
            snapshot.RetentionExtensionDays);

        return Task.CompletedTask;
    }
}
|
||||
|
||||
/// <summary>
/// Immutable description of the ledger's incident-mode state as of its last change.
/// </summary>
/// <param name="IsActive">Whether incident mode is active.</param>
/// <param name="ActivationId">Identifier of the activation this state belongs to, if any.</param>
/// <param name="Actor">Who triggered the change, if known.</param>
/// <param name="Reason">Why the change happened, if known.</param>
/// <param name="TenantId">Tenant associated with the activation, if any.</param>
/// <param name="ChangedAt">When this state was applied.</param>
/// <param name="ExpiresAt">When the activation expires, if it has an expiry.</param>
/// <param name="RetentionExtensionDays">Number of days by which retention is extended while this state applies.</param>
public sealed record LedgerIncidentSnapshot(
    bool IsActive,
    string? ActivationId,
    string? Actor,
    string? Reason,
    string? TenantId,
    DateTimeOffset ChangedAt,
    DateTimeOffset? ExpiresAt,
    int RetentionExtensionDays);
|
||||
|
||||
/// <summary>
/// One projection lag observation offered to the incident diagnostics buffer.
/// </summary>
/// <param name="TenantId">Tenant the observation belongs to.</param>
/// <param name="ChainId">Ledger chain the observation belongs to.</param>
/// <param name="SequenceNumber">Sequence number supplied with the observation.</param>
/// <param name="EventType">Event type supplied with the observation.</param>
/// <param name="PolicyVersion">Policy version supplied with the observation.</param>
/// <param name="LagSeconds">Measured lag, in seconds.</param>
/// <param name="RecordedAt">Recorded-at timestamp supplied with the observation.</param>
/// <param name="ObservedAt">When the lag was observed.</param>
public sealed record ProjectionLagSample(
    string TenantId,
    Guid ChainId,
    long SequenceNumber,
    string EventType,
    string PolicyVersion,
    double LagSeconds,
    DateTimeOffset RecordedAt,
    DateTimeOffset ObservedAt);
|
||||
|
||||
/// <summary>
/// Details of a ledger append that was rejected as a conflict, captured for incident diagnostics.
/// </summary>
/// <param name="TenantId">Tenant of the rejected draft.</param>
/// <param name="ChainId">Chain the draft targeted.</param>
/// <param name="SequenceNumber">Sequence number the draft carried.</param>
/// <param name="EventId">Event id of the draft.</param>
/// <param name="EventType">Event type of the draft.</param>
/// <param name="PolicyVersion">Policy version of the draft.</param>
/// <param name="Reason">Conflict reason code (for example <c>sequence_mismatch</c>).</param>
/// <param name="RecordedAt">Recorded-at timestamp of the draft.</param>
/// <param name="ObservedAt">When the conflict was observed.</param>
/// <param name="ActorId">Actor id of the draft, if any.</param>
/// <param name="ActorType">Actor type of the draft, if any.</param>
/// <param name="ExpectedSequence">Sequence number the ledger expected.</param>
/// <param name="ProvidedPreviousHash">Previous hash supplied by the caller, when relevant to the conflict.</param>
/// <param name="ExpectedPreviousHash">Previous hash the ledger expected, when known.</param>
public sealed record ConflictSnapshot(
    string TenantId,
    Guid ChainId,
    long SequenceNumber,
    Guid EventId,
    string EventType,
    string PolicyVersion,
    string Reason,
    DateTimeOffset RecordedAt,
    DateTimeOffset ObservedAt,
    string? ActorId,
    string? ActorType,
    long ExpectedSequence,
    string? ProvidedPreviousHash,
    string? ExpectedPreviousHash);
|
||||
|
||||
/// <summary>
/// Summary of one ledger replay request, captured for incident diagnostics.
/// </summary>
/// <param name="TenantId">Tenant the replay was requested for.</param>
/// <param name="FromSequence">First sequence number reported by the replay.</param>
/// <param name="ToSequence">Last sequence number reported by the replay.</param>
/// <param name="EventsCount">Number of events reported by the replay.</param>
/// <param name="HasMore">Whether the replay reported that more events remain.</param>
/// <param name="DurationMs">Replay duration, in milliseconds.</param>
/// <param name="ObservedAt">When the replay was observed.</param>
/// <param name="ChainFilterCount">Number of chain filters on the request (0 when none).</param>
/// <param name="EventTypeFilterCount">Number of event-type filters on the request (0 when none).</param>
public sealed record ReplayTraceSample(
    string TenantId,
    long FromSequence,
    long ToSequence,
    long EventsCount,
    bool HasMore,
    double DurationMs,
    DateTimeOffset ObservedAt,
    int ChainFilterCount,
    int EventTypeFilterCount);
|
||||
|
||||
/// <summary>
/// Point-in-time copy of the incident state and the diagnostic samples buffered alongside it.
/// </summary>
/// <param name="Incident">Incident-mode state at capture time.</param>
/// <param name="LagSamples">Buffered projection lag samples.</param>
/// <param name="ConflictSnapshots">Buffered append-conflict snapshots.</param>
/// <param name="ReplayTraces">Buffered replay traces.</param>
/// <param name="CapturedAt">When this copy was taken.</param>
public sealed record IncidentDiagnosticsSnapshot(
    LedgerIncidentSnapshot Incident,
    IReadOnlyList<ProjectionLagSample> LagSamples,
    IReadOnlyList<ConflictSnapshot> ConflictSnapshots,
    IReadOnlyList<ReplayTraceSample> ReplayTraces,
    DateTimeOffset CapturedAt);
|
||||
|
||||
/// <summary>
/// Coordinates ledger-specific incident mode behaviour (diagnostics, retention hints, timeline/notification events).
/// </summary>
/// <remarks>
/// Incident state is held as an immutable <see cref="LedgerIncidentSnapshot"/>. Transitions replace the
/// snapshot under a lock and then emit the snapshot they created; readers obtain the latest reference through
/// a volatile field. Diagnostic samples are kept in size-limited concurrent queues.
/// </remarks>
public sealed class LedgerIncidentCoordinator : ILedgerIncidentDiagnostics, IDisposable
{
    /// <summary>Replay traces at or above this duration (in milliseconds) are emitted to the timeline.</summary>
    private const int ReplayTraceLogThresholdMs = 250;

    /// <summary>Minimum spacing between lag timeline events for the same tenant/chain pair.</summary>
    private static readonly TimeSpan LagLogInterval = TimeSpan.FromMinutes(1);

    private readonly LedgerIncidentOptions _options;
    private readonly ILogger<LedgerIncidentCoordinator> _logger;
    private readonly ILedgerIncidentNotifier _notifier;
    private readonly TimeProvider _timeProvider;
    private readonly IIncidentModeService? _incidentModeService;

    private readonly ConcurrentQueue<ProjectionLagSample> _lagSamples = new();
    private readonly ConcurrentQueue<ConflictSnapshot> _conflictSnapshots = new();
    private readonly ConcurrentQueue<ReplayTraceSample> _replayTraces = new();

    // Last time a lag timeline event was emitted, keyed by "tenant:chain".
    private readonly ConcurrentDictionary<string, DateTimeOffset> _lastLagLogByChain = new(StringComparer.Ordinal);

    private readonly object _stateLock = new();

    // Written under _stateLock; volatile so lock-free readers (IsActive/Current) observe the latest snapshot.
    private volatile LedgerIncidentSnapshot _current;

    // 0 = live, 1 = disposed; flipped atomically so event handlers are detached exactly once.
    private int _disposed;

    /// <summary>
    /// Creates the coordinator, validates its options and, when an incident mode service is supplied,
    /// subscribes to it and adopts its current (non-expired) state.
    /// </summary>
    /// <param name="options">Ledger incident options; validated during construction.</param>
    /// <param name="logger">Logger used for timeline events and warnings.</param>
    /// <param name="notifier">Receives incident-mode change notifications.</param>
    /// <param name="timeProvider">Clock source; defaults to <see cref="TimeProvider.System"/>.</param>
    /// <param name="incidentModeService">Optional source of activation/deactivation events.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/>, <paramref name="logger"/> or <paramref name="notifier"/> is <see langword="null"/>.
    /// </exception>
    public LedgerIncidentCoordinator(
        IOptions<LedgerIncidentOptions> options,
        ILogger<LedgerIncidentCoordinator> logger,
        ILedgerIncidentNotifier notifier,
        TimeProvider? timeProvider = null,
        IIncidentModeService? incidentModeService = null)
    {
        _options = (options ?? throw new ArgumentNullException(nameof(options))).Value;
        _options.Validate();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _notifier = notifier ?? throw new ArgumentNullException(nameof(notifier));
        _timeProvider = timeProvider ?? TimeProvider.System;
        _incidentModeService = incidentModeService;

        _current = new LedgerIncidentSnapshot(
            IsActive: false,
            ActivationId: null,
            Actor: null,
            Reason: null,
            TenantId: null,
            ChangedAt: _timeProvider.GetUtcNow(),
            ExpiresAt: null,
            RetentionExtensionDays: 0);

        if (_incidentModeService is not null)
        {
            _incidentModeService.Activated += OnActivated;
            _incidentModeService.Deactivated += OnDeactivated;

            // Read CurrentState once: a second read could observe a different (or null) state.
            if (_incidentModeService.CurrentState is { IsExpired: false } state)
            {
                ApplyIncidentState(state, wasReactivation: false);
            }
        }
    }

    /// <inheritdoc />
    public bool IsActive => _current.IsActive;

    /// <inheritdoc />
    public LedgerIncidentSnapshot Current => _current;

    /// <summary>
    /// Buffers a lag sample while incident mode is active and lag capture is enabled. Samples at or above the
    /// configured threshold are also emitted to the timeline, at most once per minute per tenant/chain.
    /// </summary>
    /// <param name="sample">The lag observation; its <c>ObservedAt</c> value drives the throttle.</param>
    public void RecordProjectionLag(ProjectionLagSample sample)
    {
        if (!_options.Enabled || !IsActive || !_options.CaptureLagTraces)
        {
            return;
        }

        EnqueueWithLimit(_lagSamples, sample, _options.LagTraceBufferSize);

        if (_options.EmitTimelineEvents && sample.LagSeconds >= _options.LagTraceThresholdSeconds)
        {
            var key = $"{sample.TenantId}:{sample.ChainId}";
            if (TryClaimLagLogSlot(key, sample.ObservedAt))
            {
                LedgerTimeline.EmitIncidentLagTrace(_logger, sample);
            }
        }
    }

    /// <summary>
    /// Buffers a conflict snapshot while incident mode is active and conflict capture is enabled, and
    /// emits it to the timeline when timeline events are enabled.
    /// </summary>
    /// <param name="snapshot">The conflict details to record.</param>
    public void RecordConflict(ConflictSnapshot snapshot)
    {
        if (!_options.Enabled || !IsActive || !_options.CaptureConflictSnapshots)
        {
            return;
        }

        EnqueueWithLimit(_conflictSnapshots, snapshot, _options.ConflictSnapshotBufferSize);

        if (_options.EmitTimelineEvents)
        {
            LedgerTimeline.EmitIncidentConflictSnapshot(_logger, snapshot);
        }
    }

    /// <summary>
    /// Buffers a replay trace while incident mode is active and replay capture is enabled. Traces that are
    /// slow (at least <see cref="ReplayTraceLogThresholdMs"/> ms) or report more events remaining are also
    /// emitted to the timeline.
    /// </summary>
    /// <param name="sample">The replay summary to record.</param>
    public void RecordReplayTrace(ReplayTraceSample sample)
    {
        if (!_options.Enabled || !IsActive || !_options.CaptureReplayTraces)
        {
            return;
        }

        EnqueueWithLimit(_replayTraces, sample, _options.ReplayTraceBufferSize);

        if (_options.EmitTimelineEvents &&
            (sample.DurationMs >= ReplayTraceLogThresholdMs || sample.HasMore))
        {
            LedgerTimeline.EmitIncidentReplayTrace(_logger, sample);
        }
    }

    /// <summary>Copies the current incident state and all buffered samples.</summary>
    /// <returns>A snapshot stamped with the current time from the configured clock.</returns>
    public IncidentDiagnosticsSnapshot GetDiagnosticsSnapshot()
    {
        return new IncidentDiagnosticsSnapshot(
            _current,
            _lagSamples.ToArray(),
            _conflictSnapshots.ToArray(),
            _replayTraces.ToArray(),
            _timeProvider.GetUtcNow());
    }

    /// <summary>Handles the incident mode service's activation event.</summary>
    private void OnActivated(object? sender, IncidentModeActivatedEventArgs e)
    {
        ApplyIncidentState(e.State, e.WasReactivation);
    }

    /// <summary>
    /// Handles the incident mode service's deactivation event: publishes an inactive snapshot and emits
    /// the configured timeline event and notification.
    /// </summary>
    private void OnDeactivated(object? sender, IncidentModeDeactivatedEventArgs e)
    {
        if (!_options.Enabled)
        {
            return;
        }

        LedgerIncidentSnapshot snapshot;
        lock (_stateLock)
        {
            snapshot = new LedgerIncidentSnapshot(
                IsActive: false,
                ActivationId: e.State.ActivationId,
                Actor: e.DeactivatedBy,
                Reason: e.Reason.ToString(),
                TenantId: e.State.TenantId,
                ChangedAt: _timeProvider.GetUtcNow(),
                ExpiresAt: e.State.ExpiresAt,
                RetentionExtensionDays: 0);
            _current = snapshot;
        }

        // Emit the snapshot created by THIS transition; _current may already have moved on.
        PublishChange(snapshot, wasReactivation: false);
    }

    /// <summary>
    /// Publishes an active snapshot built from <paramref name="state"/>, optionally resets buffered
    /// diagnostics, and emits the configured timeline event and notification.
    /// </summary>
    /// <param name="state">Incident mode state to adopt.</param>
    /// <param name="wasReactivation">Forwarded to the timeline event.</param>
    private void ApplyIncidentState(IncidentModeState state, bool wasReactivation)
    {
        if (!_options.Enabled)
        {
            return;
        }

        LedgerIncidentSnapshot snapshot;
        lock (_stateLock)
        {
            snapshot = new LedgerIncidentSnapshot(
                IsActive: true,
                ActivationId: state.ActivationId,
                Actor: state.Actor,
                Reason: state.Reason,
                TenantId: state.TenantId,
                ChangedAt: _timeProvider.GetUtcNow(),
                ExpiresAt: state.ExpiresAt,
                RetentionExtensionDays: _options.RetentionExtensionDays);
            _current = snapshot;

            if (_options.ResetDiagnosticsOnActivation)
            {
                ClearDiagnostics();
            }
        }

        // Emit the snapshot created by THIS transition; _current may already have moved on.
        PublishChange(snapshot, wasReactivation);
    }

    /// <summary>Emits the timeline event and fire-and-forget notification for a state change.</summary>
    private void PublishChange(LedgerIncidentSnapshot snapshot, bool wasReactivation)
    {
        if (_options.EmitTimelineEvents)
        {
            LedgerTimeline.EmitIncidentModeChanged(_logger, snapshot, wasReactivation);
        }

        if (_options.EmitNotifications)
        {
            // Fire-and-forget is safe: SafeNotifyAsync observes and logs every failure itself.
            _ = SafeNotifyAsync(snapshot);
        }
    }

    /// <summary>
    /// Publishes the notification and logs (rather than propagates) any failure, whether the notifier
    /// throws synchronously or returns a task that faults later.
    /// </summary>
    private async Task SafeNotifyAsync(LedgerIncidentSnapshot snapshot)
    {
        try
        {
            await _notifier.PublishIncidentModeChangedAsync(snapshot, CancellationToken.None).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to publish incident mode notification.");
        }
    }

    /// <summary>
    /// Atomically decides whether a lag timeline event may be emitted for <paramref name="key"/> at
    /// <paramref name="now"/>, recording <paramref name="now"/> as the last emission when it may.
    /// </summary>
    /// <returns><see langword="true"/> for exactly one caller per throttle window.</returns>
    private bool TryClaimLagLogSlot(string key, DateTimeOffset now)
    {
        if (_lastLagLogByChain.TryGetValue(key, out var lastLogged))
        {
            // TryUpdate only succeeds if no other thread claimed the slot since we read it.
            return now - lastLogged >= LagLogInterval
                && _lastLagLogByChain.TryUpdate(key, now, lastLogged);
        }

        return _lastLagLogByChain.TryAdd(key, now);
    }

    /// <summary>Drops all buffered samples and the per-chain lag throttle state.</summary>
    private void ClearDiagnostics()
    {
        while (_lagSamples.TryDequeue(out _))
        {
        }

        while (_conflictSnapshots.TryDequeue(out _))
        {
        }

        while (_replayTraces.TryDequeue(out _))
        {
        }

        // Throttle timestamps belong to the previous incident; keeping them would also let the map grow forever.
        _lastLagLogByChain.Clear();
    }

    /// <summary>Appends <paramref name="item"/> and discards the oldest entries beyond <paramref name="limit"/>.</summary>
    private static void EnqueueWithLimit<T>(ConcurrentQueue<T> queue, T item, int limit)
    {
        queue.Enqueue(item);
        while (queue.Count > limit && queue.TryDequeue(out _))
        {
        }
    }

    /// <summary>Detaches from the incident mode service. Safe to call more than once.</summary>
    public void Dispose()
    {
        if (Interlocked.Exchange(ref _disposed, 1) != 0)
        {
            return;
        }

        if (_incidentModeService is not null)
        {
            _incidentModeService.Activated -= OnActivated;
            _incidentModeService.Deactivated -= OnDeactivated;
        }
    }
}
|
||||
@@ -5,6 +5,7 @@ using StellaOps.Findings.Ledger.Domain;
|
||||
using StellaOps.Findings.Ledger.Hashing;
|
||||
using StellaOps.Findings.Ledger.Infrastructure;
|
||||
using StellaOps.Findings.Ledger.Observability;
|
||||
using StellaOps.Findings.Ledger.Services.Incident;
|
||||
|
||||
namespace StellaOps.Findings.Ledger.Services;
|
||||
|
||||
@@ -18,15 +19,18 @@ public sealed class LedgerEventWriteService : ILedgerEventWriteService
|
||||
private readonly ILedgerEventRepository _repository;
|
||||
private readonly IMerkleAnchorScheduler _merkleAnchorScheduler;
|
||||
private readonly ILogger<LedgerEventWriteService> _logger;
|
||||
private readonly ILedgerIncidentDiagnostics? _incidentDiagnostics;
|
||||
|
||||
public LedgerEventWriteService(
|
||||
ILedgerEventRepository repository,
|
||||
IMerkleAnchorScheduler merkleAnchorScheduler,
|
||||
ILogger<LedgerEventWriteService> logger)
|
||||
ILogger<LedgerEventWriteService> logger,
|
||||
ILedgerIncidentDiagnostics? incidentDiagnostics = null)
|
||||
{
|
||||
_repository = repository ?? throw new ArgumentNullException(nameof(repository));
|
||||
_merkleAnchorScheduler = merkleAnchorScheduler ?? throw new ArgumentNullException(nameof(merkleAnchorScheduler));
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_incidentDiagnostics = incidentDiagnostics;
|
||||
}
|
||||
|
||||
public async Task<LedgerWriteResult> AppendAsync(LedgerEventDraft draft, CancellationToken cancellationToken)
|
||||
@@ -57,6 +61,7 @@ public sealed class LedgerEventWriteService : ILedgerEventWriteService
|
||||
if (!string.Equals(existing.CanonicalJson, canonicalJson, StringComparison.Ordinal))
|
||||
{
|
||||
LedgerTelemetry.MarkError(activity, "event_id_conflict");
|
||||
RecordConflictSnapshot(draft, expectedSequence: existing.SequenceNumber + 1, reason: "event_id_conflict", expectedPreviousHash: existing.EventHash);
|
||||
return LedgerWriteResult.Conflict(
|
||||
"event_id_conflict",
|
||||
$"Event '{draft.EventId}' already exists with a different payload.");
|
||||
@@ -71,6 +76,7 @@ public sealed class LedgerEventWriteService : ILedgerEventWriteService
|
||||
if (draft.SequenceNumber != expectedSequence)
|
||||
{
|
||||
LedgerTelemetry.MarkError(activity, "sequence_mismatch");
|
||||
RecordConflictSnapshot(draft, expectedSequence, reason: "sequence_mismatch", expectedPreviousHash: chainHead?.EventHash);
|
||||
return LedgerWriteResult.Conflict(
|
||||
"sequence_mismatch",
|
||||
$"Sequence number '{draft.SequenceNumber}' does not match expected '{expectedSequence}'.");
|
||||
@@ -80,6 +86,7 @@ public sealed class LedgerEventWriteService : ILedgerEventWriteService
|
||||
if (draft.ProvidedPreviousHash is not null && !string.Equals(draft.ProvidedPreviousHash, previousHash, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
LedgerTelemetry.MarkError(activity, "previous_hash_mismatch");
|
||||
RecordConflictSnapshot(draft, expectedSequence, reason: "previous_hash_mismatch", providedPreviousHash: draft.ProvidedPreviousHash, expectedPreviousHash: previousHash);
|
||||
return LedgerWriteResult.Conflict(
|
||||
"previous_hash_mismatch",
|
||||
$"Provided previous hash '{draft.ProvidedPreviousHash}' does not match chain head hash '{previousHash}'.");
|
||||
@@ -143,11 +150,13 @@ public sealed class LedgerEventWriteService : ILedgerEventWriteService
|
||||
var persisted = await _repository.GetByEventIdAsync(draft.TenantId, draft.EventId, cancellationToken).ConfigureAwait(false);
|
||||
if (persisted is null)
|
||||
{
|
||||
RecordConflictSnapshot(draft, expectedSequence, reason: "append_failed", expectedPreviousHash: previousHash);
|
||||
return LedgerWriteResult.Conflict("append_failed", "Ledger append failed due to concurrent write.");
|
||||
}
|
||||
|
||||
if (!string.Equals(persisted.CanonicalJson, record.CanonicalJson, StringComparison.Ordinal))
|
||||
{
|
||||
RecordConflictSnapshot(draft, expectedSequence, reason: "event_id_conflict", expectedPreviousHash: persisted.EventHash);
|
||||
return LedgerWriteResult.Conflict("event_id_conflict", "Ledger append raced with conflicting payload.");
|
||||
}
|
||||
|
||||
@@ -157,6 +166,37 @@ public sealed class LedgerEventWriteService : ILedgerEventWriteService
|
||||
return LedgerWriteResult.Success(record);
|
||||
}
|
||||
|
||||
/// <summary>
/// Forwards the details of a rejected append to the incident diagnostics sink, if one was injected.
/// </summary>
/// <param name="draft">The draft that was rejected.</param>
/// <param name="expectedSequence">Sequence number the ledger expected.</param>
/// <param name="reason">Conflict reason code.</param>
/// <param name="providedPreviousHash">Previous hash supplied by the caller, when relevant.</param>
/// <param name="expectedPreviousHash">Previous hash the ledger expected, when known.</param>
private void RecordConflictSnapshot(
    LedgerEventDraft draft,
    long expectedSequence,
    string reason,
    string? providedPreviousHash = null,
    string? expectedPreviousHash = null)
{
    // Diagnostics are optional; without a sink there is nothing to record.
    if (_incidentDiagnostics is not { } diagnostics)
    {
        return;
    }

    diagnostics.RecordConflict(new ConflictSnapshot(
        TenantId: draft.TenantId,
        ChainId: draft.ChainId,
        SequenceNumber: draft.SequenceNumber,
        EventId: draft.EventId,
        EventType: draft.EventType,
        PolicyVersion: draft.PolicyVersion ?? string.Empty,
        Reason: reason,
        RecordedAt: draft.RecordedAt,
        ObservedAt: DateTimeOffset.UtcNow,
        ActorId: draft.ActorId,
        ActorType: draft.ActorType,
        ExpectedSequence: expectedSequence,
        ProvidedPreviousHash: providedPreviousHash,
        ExpectedPreviousHash: expectedPreviousHash));
}
|
||||
|
||||
private static string DetermineSource(LedgerEventDraft draft)
|
||||
{
|
||||
if (draft.SourceRunId.HasValue)
|
||||
|
||||
@@ -154,7 +154,12 @@ public sealed class ScoredFindingsExportService : IScoredFindingsExportService
|
||||
finding.RiskProfileVersion,
|
||||
finding.RiskExplanationId,
|
||||
finding.ExplainRef,
|
||||
finding.UpdatedAt
|
||||
finding.UpdatedAt,
|
||||
finding.AttestationStatus,
|
||||
finding.AttestationCount,
|
||||
finding.VerifiedAttestationCount,
|
||||
finding.FailedAttestationCount,
|
||||
finding.UnverifiedAttestationCount
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
using StellaOps.Findings.Ledger.Infrastructure.Attestation;
|
||||
|
||||
namespace StellaOps.Findings.Ledger.Services;
|
||||
|
||||
/// <summary>
|
||||
@@ -18,6 +20,9 @@ public sealed record ScoredFindingsQuery
|
||||
public int Limit { get; init; } = 50;
|
||||
public ScoredFindingsSortField SortBy { get; init; } = ScoredFindingsSortField.RiskScore;
|
||||
public bool Descending { get; init; } = true;
|
||||
public IReadOnlyList<AttestationType>? AttestationTypes { get; init; }
|
||||
public AttestationVerificationFilter? AttestationVerification { get; init; }
|
||||
public OverallVerificationStatus? AttestationStatus { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -57,6 +62,11 @@ public sealed record ScoredFinding
|
||||
public Guid? RiskExplanationId { get; init; }
|
||||
public string? ExplainRef { get; init; }
|
||||
public DateTimeOffset UpdatedAt { get; init; }
|
||||
public int AttestationCount { get; init; }
|
||||
public int VerifiedAttestationCount { get; init; }
|
||||
public int FailedAttestationCount { get; init; }
|
||||
public int UnverifiedAttestationCount { get; init; }
|
||||
public OverallVerificationStatus AttestationStatus { get; init; } = OverallVerificationStatus.NoAttestations;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -164,7 +164,12 @@ public sealed class ScoredFindingsQueryService : IScoredFindingsQueryService
|
||||
RiskProfileVersion = projection.RiskProfileVersion,
|
||||
RiskExplanationId = projection.RiskExplanationId,
|
||||
ExplainRef = projection.ExplainRef,
|
||||
UpdatedAt = projection.UpdatedAt
|
||||
UpdatedAt = projection.UpdatedAt,
|
||||
AttestationCount = projection.AttestationCount,
|
||||
VerifiedAttestationCount = projection.VerifiedAttestationCount,
|
||||
FailedAttestationCount = projection.FailedAttestationCount,
|
||||
UnverifiedAttestationCount = projection.UnverifiedAttestationCount,
|
||||
AttestationStatus = projection.AttestationStatus
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
namespace StellaOps.Findings.Ledger.Services;
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
@@ -7,6 +8,7 @@ using Microsoft.Extensions.Logging;
|
||||
using StellaOps.Findings.Ledger.Domain;
|
||||
using StellaOps.Findings.Ledger.Infrastructure.Snapshot;
|
||||
using StellaOps.Findings.Ledger.Observability;
|
||||
using StellaOps.Findings.Ledger.Services.Incident;
|
||||
|
||||
/// <summary>
|
||||
/// Service for managing ledger snapshots and time-travel queries.
|
||||
@@ -17,15 +19,18 @@ public sealed class SnapshotService
|
||||
private readonly ITimeTravelRepository _timeTravelRepository;
|
||||
private readonly ILogger<SnapshotService> _logger;
|
||||
private readonly JsonSerializerOptions _jsonOptions;
|
||||
private readonly ILedgerIncidentDiagnostics? _incidentDiagnostics;
|
||||
|
||||
public SnapshotService(
|
||||
ISnapshotRepository snapshotRepository,
|
||||
ITimeTravelRepository timeTravelRepository,
|
||||
ILogger<SnapshotService> logger)
|
||||
ILogger<SnapshotService> logger,
|
||||
ILedgerIncidentDiagnostics? incidentDiagnostics = null)
|
||||
{
|
||||
_snapshotRepository = snapshotRepository;
|
||||
_timeTravelRepository = timeTravelRepository;
|
||||
_logger = logger;
|
||||
_incidentDiagnostics = incidentDiagnostics;
|
||||
_jsonOptions = new JsonSerializerOptions
|
||||
{
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
|
||||
@@ -42,32 +47,33 @@ public sealed class SnapshotService
|
||||
{
|
||||
try
|
||||
{
|
||||
var effectiveInput = ApplyIncidentRetention(input);
|
||||
_logger.LogInformation(
|
||||
"Creating snapshot for tenant {TenantId} at sequence {Sequence} / timestamp {Timestamp}",
|
||||
input.TenantId,
|
||||
input.AtSequence,
|
||||
input.AtTimestamp);
|
||||
effectiveInput.TenantId,
|
||||
effectiveInput.AtSequence,
|
||||
effectiveInput.AtTimestamp);
|
||||
|
||||
// Get current ledger state
|
||||
var currentPoint = await _timeTravelRepository.GetCurrentPointAsync(input.TenantId, ct);
|
||||
var currentPoint = await _timeTravelRepository.GetCurrentPointAsync(effectiveInput.TenantId, ct);
|
||||
|
||||
// Create the snapshot record
|
||||
var snapshot = await _snapshotRepository.CreateAsync(
|
||||
input.TenantId,
|
||||
input,
|
||||
effectiveInput.TenantId,
|
||||
effectiveInput,
|
||||
currentPoint.SequenceNumber,
|
||||
currentPoint.Timestamp,
|
||||
ct);
|
||||
|
||||
// Compute statistics asynchronously
|
||||
var statistics = await ComputeStatisticsAsync(
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SequenceNumber,
|
||||
input.IncludeEntityTypes,
|
||||
effectiveInput.IncludeEntityTypes,
|
||||
ct);
|
||||
|
||||
await _snapshotRepository.UpdateStatisticsAsync(
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SnapshotId,
|
||||
statistics,
|
||||
ct);
|
||||
@@ -79,12 +85,12 @@ public sealed class SnapshotService
|
||||
if (input.Sign)
|
||||
{
|
||||
merkleRoot = await ComputeMerkleRootAsync(
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SequenceNumber,
|
||||
ct);
|
||||
|
||||
await _snapshotRepository.SetMerkleRootAsync(
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SnapshotId,
|
||||
merkleRoot,
|
||||
dsseDigest,
|
||||
@@ -93,20 +99,20 @@ public sealed class SnapshotService
|
||||
|
||||
// Mark as available
|
||||
await _snapshotRepository.UpdateStatusAsync(
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SnapshotId,
|
||||
SnapshotStatus.Available,
|
||||
ct);
|
||||
|
||||
// Retrieve updated snapshot
|
||||
var finalSnapshot = await _snapshotRepository.GetByIdAsync(
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SnapshotId,
|
||||
ct);
|
||||
|
||||
LedgerTimeline.EmitSnapshotCreated(
|
||||
_logger,
|
||||
input.TenantId,
|
||||
effectiveInput.TenantId,
|
||||
snapshot.SnapshotId,
|
||||
snapshot.SequenceNumber,
|
||||
statistics.FindingsCount);
|
||||
@@ -196,7 +202,20 @@ public sealed class SnapshotService
|
||||
ReplayRequest request,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
return await _timeTravelRepository.ReplayEventsAsync(request, ct);
|
||||
var result = await _timeTravelRepository.ReplayEventsAsync(request, ct);
|
||||
|
||||
_incidentDiagnostics?.RecordReplayTrace(new ReplayTraceSample(
|
||||
TenantId: request.TenantId,
|
||||
FromSequence: result.Metadata.FromSequence,
|
||||
ToSequence: result.Metadata.ToSequence,
|
||||
EventsCount: result.Metadata.EventsCount,
|
||||
HasMore: result.Metadata.HasMore,
|
||||
DurationMs: result.Metadata.ReplayDurationMs,
|
||||
ObservedAt: DateTimeOffset.UtcNow,
|
||||
ChainFilterCount: request.ChainIds?.Count ?? 0,
|
||||
EventTypeFilterCount: request.EventTypes?.Count ?? 0));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -249,6 +268,15 @@ public sealed class SnapshotService
|
||||
public async Task<int> ExpireOldSnapshotsAsync(CancellationToken ct = default)
|
||||
{
|
||||
var cutoff = DateTimeOffset.UtcNow;
|
||||
if (_incidentDiagnostics?.IsActive == true && _incidentDiagnostics.Current.RetentionExtensionDays > 0)
|
||||
{
|
||||
cutoff = cutoff.AddDays(-_incidentDiagnostics.Current.RetentionExtensionDays);
|
||||
_logger.LogInformation(
|
||||
"Incident mode active; extending snapshot expiry cutoff by {ExtensionDays} days (activation {ActivationId}).",
|
||||
_incidentDiagnostics.Current.RetentionExtensionDays,
|
||||
_incidentDiagnostics.Current.ActivationId ?? string.Empty);
|
||||
}
|
||||
|
||||
var count = await _snapshotRepository.ExpireSnapshotsAsync(cutoff, ct);
|
||||
|
||||
if (count > 0)
|
||||
@@ -367,4 +395,44 @@ public sealed class SnapshotService
|
||||
var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(input));
|
||||
return Convert.ToHexStringLower(bytes);
|
||||
}
|
||||
|
||||
/// <summary>
/// While incident mode is active with a positive retention extension, returns a copy of
/// <paramref name="input"/> whose expiry (when set) is lengthened by the extension and whose metadata
/// is tagged with the incident details. Otherwise returns <paramref name="input"/> unchanged.
/// </summary>
/// <param name="input">The snapshot request as supplied by the caller.</param>
/// <returns>The request to use for snapshot creation.</returns>
private CreateSnapshotInput ApplyIncidentRetention(CreateSnapshotInput input)
{
    if (_incidentDiagnostics is not { IsActive: true } diagnostics)
    {
        return input;
    }

    var incident = diagnostics.Current;
    var extensionDays = incident.RetentionExtensionDays;
    if (extensionDays <= 0)
    {
        return input;
    }

    // A request without an expiry stays without one; only explicit expiries are lengthened.
    TimeSpan? expiresIn = input.ExpiresIn;
    if (expiresIn is { } requested)
    {
        expiresIn = requested + TimeSpan.FromDays(extensionDays);
    }

    // Copy so the caller's metadata dictionary is never mutated.
    Dictionary<string, object> metadata;
    if (input.Metadata is null)
    {
        metadata = new Dictionary<string, object>();
    }
    else
    {
        metadata = new Dictionary<string, object>(input.Metadata);
    }

    var activationId = incident.ActivationId ?? string.Empty;

    metadata["incident.mode"] = "enabled";
    metadata["incident.activationId"] = activationId;
    metadata["incident.retentionExtensionDays"] = extensionDays;
    metadata["incident.changedAt"] = incident.ChangedAt.ToString("O");
    if (incident.ExpiresAt is { } incidentExpiry)
    {
        metadata["incident.expiresAt"] = incidentExpiry.ToString("O");
    }

    _logger.LogInformation(
        "Incident mode active; extending snapshot retention by {ExtensionDays} days (activation {ActivationId}).",
        extensionDays,
        activationId);

    return input with { ExpiresIn = expiresIn, Metadata = metadata };
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user