Refactor SurfaceCacheValidator to simplify oldest entry calculation
Add global using for Xunit in test project.
Enhance ImportValidatorTests with async validation and quarantine checks.
Implement FileSystemQuarantineServiceTests for quarantine functionality.
Add integration tests for ImportValidator to check monotonicity.
Create BundleVersionTests to validate version parsing and comparison logic.
Implement VersionMonotonicityCheckerTests for monotonicity checks and activation logic.
This commit is contained in:
@@ -0,0 +1,183 @@
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Telemetry.Core.Triage;
|
||||
|
||||
/// <summary>
/// Central registry of the metric instruments used for triage workflow observability:
/// TTFS timings, clicks-to-closure, evidence completeness, decisions and budget violations.
/// </summary>
/// <remarks>
/// Every instrument is created once, during type initialization, from a single
/// <see cref="Meter"/> named <see cref="MeterName"/> with version "1.0.0".
/// </remarks>
public static class TriageMetrics
{
    /// <summary>
    /// Name of the meter that owns every instrument declared on this type.
    /// </summary>
    public const string MeterName = "StellaOps.Triage";

    // Declared first on purpose: static field initializers run in textual order and every
    // instrument below is created from this meter.
    private static readonly Meter TriageMeter = new(MeterName, "1.0.0");

    // ---- TTFS metrics -------------------------------------------------------------------

    /// <summary>
    /// Histogram of the time to skeleton UI render, in seconds.
    /// </summary>
    public static readonly Histogram<double> TtfsSkeletonSeconds = TriageMeter.CreateHistogram<double>(
        "stellaops_ttfs_skeleton_seconds",
        unit: "s",
        description: "Time to skeleton UI render");

    /// <summary>
    /// Histogram of the time to the first evidence pill (the primary TTFS metric), in seconds.
    /// </summary>
    public static readonly Histogram<double> TtfsFirstEvidenceSeconds = TriageMeter.CreateHistogram<double>(
        "stellaops_ttfs_first_evidence_seconds",
        unit: "s",
        description: "Time to first evidence pill (primary TTFS)");

    /// <summary>
    /// Histogram of the time to full evidence load, in seconds.
    /// </summary>
    public static readonly Histogram<double> TtfsFullEvidenceSeconds = TriageMeter.CreateHistogram<double>(
        "stellaops_ttfs_full_evidence_seconds",
        unit: "s",
        description: "Time to full evidence load");

    // ---- Clicks-to-closure --------------------------------------------------------------

    /// <summary>
    /// Histogram of the number of interactions required to complete a triage decision.
    /// </summary>
    public static readonly Histogram<int> ClicksToClosure = TriageMeter.CreateHistogram<int>(
        "stellaops_clicks_to_closure",
        unit: "{clicks}",
        description: "Interactions required to complete triage decision");

    // ---- Evidence completeness ----------------------------------------------------------

    /// <summary>
    /// Histogram of the evidence completeness score (0-4) at decision time.
    /// </summary>
    public static readonly Histogram<int> EvidenceCompleteness = TriageMeter.CreateHistogram<int>(
        "stellaops_evidence_completeness_score",
        unit: "{score}",
        description: "Evidence completeness at decision time (0-4)");

    /// <summary>
    /// Counter of evidence available at decision time, broken down by evidence type.
    /// </summary>
    public static readonly Counter<long> EvidenceByType = TriageMeter.CreateCounter<long>(
        "stellaops_evidence_available_total",
        description: "Count of evidence available by type at decision time");

    // ---- Decision metrics ---------------------------------------------------------------

    /// <summary>
    /// Counter of triage decisions recorded.
    /// </summary>
    public static readonly Counter<long> DecisionsTotal = TriageMeter.CreateCounter<long>(
        "stellaops_triage_decisions_total",
        description: "Total triage decisions recorded");

    /// <summary>
    /// Histogram of the total time from alert open to decision, in seconds.
    /// </summary>
    public static readonly Histogram<double> DecisionDurationSeconds = TriageMeter.CreateHistogram<double>(
        "stellaops_triage_decision_duration_seconds",
        unit: "s",
        description: "Total time from alert open to decision");

    // ---- Budget violations --------------------------------------------------------------

    /// <summary>
    /// Counter of performance budget violations.
    /// </summary>
    public static readonly Counter<long> BudgetViolations = TriageMeter.CreateCounter<long>(
        "stellaops_performance_budget_violations_total",
        description: "Count of performance budget violations");
}
|
||||
|
||||
/// <summary>
/// Immutable bit set describing which kinds of evidence are available, used for
/// completeness tracking.
/// </summary>
/// <remarks>
/// The raw value is stored as given. Bits other than the four flags declared here are kept in
/// <see cref="Value"/> but are ignored by the <c>Has*</c> properties and by
/// <see cref="CompletenessScore"/>.
/// </remarks>
public readonly struct EvidenceBitset
{
    /// <summary>
    /// Reachability evidence bit (1).
    /// </summary>
    public const int Reachability = 1 << 0;

    /// <summary>
    /// Callstack evidence bit (2).
    /// </summary>
    public const int Callstack = 1 << 1;

    /// <summary>
    /// Provenance evidence bit (4).
    /// </summary>
    public const int Provenance = 1 << 2;

    /// <summary>
    /// VEX evidence bit (8).
    /// </summary>
    public const int Vex = 1 << 3;

    /// <summary>
    /// Initializes a new <see cref="EvidenceBitset"/> that wraps <paramref name="value"/> unchanged.
    /// </summary>
    /// <param name="value">Raw bit set; combine the flag constants of this type with bitwise OR.</param>
    public EvidenceBitset(int value)
    {
        Value = value;
    }

    /// <summary>
    /// Gets the raw bit set value exactly as it was supplied to the constructor.
    /// </summary>
    public int Value { get; }

    /// <summary>
    /// Gets whether the <see cref="Reachability"/> bit is set.
    /// </summary>
    public bool HasReachability => (Value & Reachability) != 0;

    /// <summary>
    /// Gets whether the <see cref="Callstack"/> bit is set.
    /// </summary>
    public bool HasCallstack => (Value & Callstack) != 0;

    /// <summary>
    /// Gets whether the <see cref="Provenance"/> bit is set.
    /// </summary>
    public bool HasProvenance => (Value & Provenance) != 0;

    /// <summary>
    /// Gets whether the <see cref="Vex"/> bit is set.
    /// </summary>
    public bool HasVex => (Value & Vex) != 0;

    /// <summary>
    /// Gets the completeness score: the number of the four known evidence bits that are set (0-4).
    /// </summary>
    public int CompletenessScore =>
        (HasReachability ? 1 : 0)
        + (HasCallstack ? 1 : 0)
        + (HasProvenance ? 1 : 0)
        + (HasVex ? 1 : 0);

    /// <summary>
    /// Creates an <see cref="EvidenceBitset"/> from individual evidence flags.
    /// </summary>
    /// <param name="reachability">Whether reachability evidence is present.</param>
    /// <param name="callstack">Whether callstack evidence is present.</param>
    /// <param name="provenance">Whether provenance evidence is present.</param>
    /// <param name="vex">Whether VEX evidence is present.</param>
    /// <returns>A bit set with exactly the bits for the flags that are <c>true</c>.</returns>
    public static EvidenceBitset From(bool reachability, bool callstack, bool provenance, bool vex)
    {
        int bits = (reachability ? Reachability : 0)
            | (callstack ? Callstack : 0)
            | (provenance ? Provenance : 0)
            | (vex ? Vex : 0);

        return new EvidenceBitset(bits);
    }
}
|
||||
@@ -0,0 +1,130 @@
|
||||
using System.Text.Json.Serialization;
|
||||
|
||||
namespace StellaOps.Telemetry.Core.Triage;
|
||||
|
||||
/// <summary>
/// A single TTFS telemetry event reported by the frontend.
/// </summary>
/// <remarks>
/// One shape is shared by every event type; which of the optional members are meaningful
/// depends on <see cref="EventType"/>. Members that are not sent keep their default value.
/// </remarks>
public sealed class TtfsEvent
{
    /// <summary>
    /// Event type: ttfs.skeleton, ttfs.first_evidence, ttfs.full_evidence, decision.recorded
    /// or budget.violation.
    /// </summary>
    [JsonPropertyName("event_type")]
    public required string EventType { get; init; }

    /// <summary>
    /// Identifier of the alert the event belongs to.
    /// </summary>
    [JsonPropertyName("alert_id")]
    public required string AlertId { get; init; }

    /// <summary>
    /// Measured duration in milliseconds.
    /// </summary>
    [JsonPropertyName("duration_ms")]
    public double DurationMs { get; init; }

    /// <summary>
    /// Evidence type (for first_evidence events).
    /// </summary>
    [JsonPropertyName("evidence_type")]
    public string? EvidenceType { get; init; }

    /// <summary>
    /// Evidence completeness score (0-4).
    /// </summary>
    [JsonPropertyName("completeness_score")]
    public int CompletenessScore { get; init; }

    /// <summary>
    /// Click count (for decision events).
    /// </summary>
    [JsonPropertyName("click_count")]
    public int ClickCount { get; init; }

    /// <summary>
    /// Decision status (for decision events).
    /// </summary>
    [JsonPropertyName("decision_status")]
    public string? DecisionStatus { get; init; }

    /// <summary>
    /// Phase that exceeded its budget (for budget violation events).
    /// </summary>
    [JsonPropertyName("phase")]
    public string? Phase { get; init; }

    /// <summary>
    /// Budget limit in milliseconds (for budget violation events).
    /// </summary>
    [JsonPropertyName("budget")]
    public double Budget { get; init; }

    /// <summary>
    /// Raw evidence bit set value.
    /// </summary>
    [JsonPropertyName("evidence_bitset")]
    public int EvidenceBitset { get; init; }

    /// <summary>
    /// Timestamp reported by the client (documented as UTC; the offset is carried by the value itself).
    /// </summary>
    [JsonPropertyName("timestamp")]
    public DateTimeOffset Timestamp { get; init; }
}
|
||||
|
||||
/// <summary>
/// A batch of TTFS events submitted together for ingestion.
/// </summary>
public sealed class TtfsEventBatch
{
    /// <summary>
    /// Events to ingest.
    /// </summary>
    [JsonPropertyName("events")]
    public required IReadOnlyList<TtfsEvent> Events { get; init; }

    /// <summary>
    /// Tenant identifier shared by every event in the batch, if any.
    /// </summary>
    [JsonPropertyName("tenant_id")]
    public string? TenantId { get; init; }

    /// <summary>
    /// Session identifier shared by every event in the batch, if any.
    /// </summary>
    [JsonPropertyName("session_id")]
    public string? SessionId { get; init; }
}
|
||||
|
||||
/// <summary>
/// Known values of the TTFS event type discriminator.
/// </summary>
public static class TtfsEventType
{
    /// <summary>
    /// Skeleton UI rendered.
    /// </summary>
    public const string Skeleton = "ttfs.skeleton";

    /// <summary>
    /// First evidence pill rendered.
    /// </summary>
    public const string FirstEvidence = "ttfs.first_evidence";

    /// <summary>
    /// Full evidence loaded.
    /// </summary>
    public const string FullEvidence = "ttfs.full_evidence";

    /// <summary>
    /// Decision recorded.
    /// </summary>
    public const string DecisionRecorded = "decision.recorded";

    /// <summary>
    /// Performance budget violated.
    /// </summary>
    public const string BudgetViolation = "budget.violation";
}
|
||||
@@ -0,0 +1,216 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.Telemetry.Core.Triage;
|
||||
|
||||
/// <summary>
/// Ingests TTFS telemetry events and records them as <see cref="TriageMetrics"/> measurements
/// and log entries.
/// </summary>
/// <remarks>
/// Events with a negative or non-finite duration are logged and dropped instead of being
/// recorded, so that malformed client input cannot distort the histograms.
/// </remarks>
public sealed class TtfsIngestionService
{
    /// <summary>Budget for the skeleton render, in milliseconds.</summary>
    private const double SkeletonBudgetMs = 200;

    /// <summary>Budget for the first evidence pill, in milliseconds.</summary>
    private const double FirstEvidenceBudgetMs = 500;

    /// <summary>
    /// Click count above which a decision is reported as a budget violation.
    /// The warning message in <see cref="RecordDecisionEvent"/> embeds this value literally.
    /// </summary>
    private const int ClicksToClosureBudget = 6;

    private readonly ILogger<TtfsIngestionService> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="TtfsIngestionService"/> class.
    /// </summary>
    /// <param name="logger">Logger used for per-event diagnostics and budget warnings.</param>
    /// <exception cref="ArgumentNullException"><paramref name="logger"/> is <c>null</c>.</exception>
    public TtfsIngestionService(ILogger<TtfsIngestionService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Ingests every event of a batch, tagging each one with the batch's tenant identifier.
    /// </summary>
    /// <param name="batch">Batch to ingest.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="batch"/> or its event list is <c>null</c>.
    /// </exception>
    public void IngestBatch(TtfsEventBatch batch)
    {
        ArgumentNullException.ThrowIfNull(batch);
        ArgumentNullException.ThrowIfNull(batch.Events);

        foreach (var evt in batch.Events)
        {
            // A null entry must not abort the batch half-way: the events before it have already
            // been recorded, so throwing here would invite a retry that double-counts them.
            if (evt is null)
            {
                _logger.LogWarning("Skipping null TTFS event in batch");
                continue;
            }

            IngestEvent(evt, batch.TenantId);
        }
    }

    /// <summary>
    /// Ingests a single TTFS event, dispatching on its event type.
    /// </summary>
    /// <param name="evt">Event to ingest.</param>
    /// <param name="tenantId">Optional tenant identifier; added as a tag when non-empty.</param>
    /// <exception cref="ArgumentNullException"><paramref name="evt"/> is <c>null</c>.</exception>
    public void IngestEvent(TtfsEvent evt, string? tenantId = null)
    {
        ArgumentNullException.ThrowIfNull(evt);

        // NOTE(review): alert_id (and the client-supplied evidence_type / decision_status / budget
        // added further down) become metric tags; if these values are unbounded they create one
        // time series per value. Confirm this cardinality is acceptable for the metrics backend.
        var tags = new TagList
        {
            { "alert_id", evt.AlertId }
        };

        if (!string.IsNullOrEmpty(tenantId))
        {
            tags.Add("tenant_id", tenantId);
        }

        switch (evt.EventType)
        {
            case TtfsEventType.Skeleton:
                RecordSkeletonEvent(evt, tags);
                break;

            case TtfsEventType.FirstEvidence:
                RecordFirstEvidenceEvent(evt, tags);
                break;

            case TtfsEventType.FullEvidence:
                RecordFullEvidenceEvent(evt, tags);
                break;

            case TtfsEventType.DecisionRecorded:
                RecordDecisionEvent(evt, tags);
                break;

            case TtfsEventType.BudgetViolation:
                RecordBudgetViolation(evt, tags);
                break;

            default:
                _logger.LogWarning("Unknown TTFS event type: {EventType}", evt.EventType);
                break;
        }
    }

    /// <summary>
    /// Converts the event's duration to seconds, rejecting negative and non-finite values.
    /// </summary>
    /// <returns><c>true</c> when the duration is usable; otherwise <c>false</c> after logging a warning.</returns>
    private bool TryGetDurationSeconds(TtfsEvent evt, out double durationSeconds)
    {
        if (double.IsFinite(evt.DurationMs) && evt.DurationMs >= 0)
        {
            durationSeconds = evt.DurationMs / 1000.0;
            return true;
        }

        _logger.LogWarning(
            "Dropping TTFS event {EventType} for alert {AlertId}: invalid duration {DurationMs}ms",
            evt.EventType, evt.AlertId, evt.DurationMs);

        durationSeconds = 0;
        return false;
    }

    /// <summary>
    /// Records the skeleton render time and reports a violation when it exceeds its budget.
    /// </summary>
    private void RecordSkeletonEvent(TtfsEvent evt, TagList tags)
    {
        if (!TryGetDurationSeconds(evt, out var durationSeconds))
        {
            return;
        }

        TriageMetrics.TtfsSkeletonSeconds.Record(durationSeconds, tags);

        _logger.LogDebug(
            "TTFS skeleton for alert {AlertId}: {Duration:F3}s",
            evt.AlertId, durationSeconds);

        if (evt.DurationMs > SkeletonBudgetMs)
        {
            RecordBudgetViolation(evt.AlertId, "skeleton", evt.DurationMs, SkeletonBudgetMs, tags);
        }
    }

    /// <summary>
    /// Records the time to the first evidence pill and reports a violation when it exceeds its budget.
    /// </summary>
    private void RecordFirstEvidenceEvent(TtfsEvent evt, TagList tags)
    {
        if (!TryGetDurationSeconds(evt, out var durationSeconds))
        {
            return;
        }

        // TagList is a struct, so this only extends the local copy, not the caller's list.
        if (!string.IsNullOrEmpty(evt.EvidenceType))
        {
            tags.Add("evidence_type", evt.EvidenceType);
        }

        TriageMetrics.TtfsFirstEvidenceSeconds.Record(durationSeconds, tags);

        _logger.LogDebug(
            "TTFS first evidence for alert {AlertId}: {Duration:F3}s, type={Type}",
            evt.AlertId, durationSeconds, evt.EvidenceType);

        // Only the 500ms first-pill budget is enforced per event; the 1500ms p95 target
        // mentioned alongside it is not checked here.
        if (evt.DurationMs > FirstEvidenceBudgetMs)
        {
            RecordBudgetViolation(evt.AlertId, "first_evidence", evt.DurationMs, FirstEvidenceBudgetMs, tags);
        }
    }

    /// <summary>
    /// Records the full evidence load time, the completeness score, and one count per evidence
    /// type present in the event's bit set.
    /// </summary>
    private void RecordFullEvidenceEvent(TtfsEvent evt, TagList tags)
    {
        if (!TryGetDurationSeconds(evt, out var durationSeconds))
        {
            return;
        }

        tags.Add("completeness", evt.CompletenessScore);

        TriageMetrics.TtfsFullEvidenceSeconds.Record(durationSeconds, tags);
        TriageMetrics.EvidenceCompleteness.Record(evt.CompletenessScore, tags);

        // The per-type counter is tagged with the evidence type only.
        var bitset = new EvidenceBitset(evt.EvidenceBitset);
        CountEvidenceIfPresent(bitset.HasReachability, "reachability");
        CountEvidenceIfPresent(bitset.HasCallstack, "callstack");
        CountEvidenceIfPresent(bitset.HasProvenance, "provenance");
        CountEvidenceIfPresent(bitset.HasVex, "vex");

        _logger.LogDebug(
            "TTFS full evidence for alert {AlertId}: {Duration:F3}s, completeness={Score}",
            evt.AlertId, durationSeconds, evt.CompletenessScore);
    }

    /// <summary>
    /// Increments the evidence-by-type counter for <paramref name="evidenceType"/> when present.
    /// </summary>
    private static void CountEvidenceIfPresent(bool present, string evidenceType)
    {
        if (present)
        {
            TriageMetrics.EvidenceByType.Add(1,
                new KeyValuePair<string, object?>("evidence_type", evidenceType));
        }
    }

    /// <summary>
    /// Records a triage decision: click count, duration and decision total, plus a budget
    /// violation when the click count exceeds the clicks-to-closure budget.
    /// </summary>
    private void RecordDecisionEvent(TtfsEvent evt, TagList tags)
    {
        if (!TryGetDurationSeconds(evt, out var durationSeconds))
        {
            return;
        }

        if (!string.IsNullOrEmpty(evt.DecisionStatus))
        {
            tags.Add("decision_status", evt.DecisionStatus);
        }

        TriageMetrics.ClicksToClosure.Record(evt.ClickCount, tags);
        TriageMetrics.DecisionDurationSeconds.Record(durationSeconds, tags);
        TriageMetrics.DecisionsTotal.Add(1, tags);

        _logger.LogInformation(
            "Triage decision for alert {AlertId}: status={Status}, clicks={Clicks}, duration={Duration:F3}s",
            evt.AlertId, evt.DecisionStatus, evt.ClickCount, durationSeconds);

        // The stated target is "median < 6"; a median cannot be evaluated per event, so each
        // decision with more than six clicks is reported individually.
        // NOTE(review): unlike the timing violations, this measurement carries no alert/tenant
        // tags - confirm whether that is intended.
        if (evt.ClickCount > ClicksToClosureBudget)
        {
            TriageMetrics.BudgetViolations.Add(1,
                new KeyValuePair<string, object?>("phase", "clicks_to_closure"),
                new KeyValuePair<string, object?>("budget", ClicksToClosureBudget));

            _logger.LogWarning(
                "Clicks-to-closure budget exceeded for alert {AlertId}: {Clicks} clicks (budget: 6)",
                evt.AlertId, evt.ClickCount);
        }
    }

    /// <summary>
    /// Records a budget violation reported directly by the client.
    /// </summary>
    private void RecordBudgetViolation(TtfsEvent evt, TagList baseTags) =>
        RecordBudgetViolation(evt.AlertId, evt.Phase, evt.DurationMs, evt.Budget, baseTags);

    /// <summary>
    /// Increments the budget violation counter, tagged with the caller's tags plus the phase and
    /// budget, and logs a warning.
    /// </summary>
    /// <param name="alertId">Alert the violation belongs to (used for logging).</param>
    /// <param name="phase">Phase that exceeded its budget; tagged as "unknown" when <c>null</c>.</param>
    /// <param name="actualMs">Measured duration in milliseconds.</param>
    /// <param name="budgetMs">Budget in milliseconds.</param>
    /// <param name="tags">Tags inherited from the originating event.</param>
    private void RecordBudgetViolation(string alertId, string? phase, double actualMs, double budgetMs, TagList tags)
    {
        // TagList is a struct passed by value, so these additions stay local to this call.
        tags.Add("phase", phase ?? "unknown");
        tags.Add("budget", budgetMs);

        TriageMetrics.BudgetViolations.Add(1, tags);

        _logger.LogWarning(
            "Performance budget exceeded for alert {AlertId}: phase={Phase}, actual={Actual:F0}ms, budget={Budget:F0}ms",
            alertId, phase, actualMs, budgetMs);
    }
}
|
||||
Reference in New Issue
Block a user