up
This commit is contained in:
@@ -85,6 +85,30 @@ public static class TelemetryServiceCollectionExtensions
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
/// Adds the Time-to-Evidence (TTE) metrics recorder and its options to the container.
/// </summary>
/// <remarks>
/// The recorder is registered with <c>TryAddSingleton</c>, and it is built from the
/// resolved <see cref="TimeToEvidenceOptions"/> value the first time it is requested.
/// </remarks>
/// <param name="services">Service collection to mutate.</param>
/// <param name="configureOptions">
/// Optional callback that adjusts the options (for example the SLO targets).
/// Nothing is invoked when it is <c>null</c>.
/// </param>
/// <returns>The same <paramref name="services"/> instance, so calls can be chained.</returns>
/// <exception cref="ArgumentNullException"><paramref name="services"/> is <c>null</c>.</exception>
public static IServiceCollection AddTimeToEvidenceMetrics(
    this IServiceCollection services,
    Action<TimeToEvidenceOptions>? configureOptions = null)
{
    ArgumentNullException.ThrowIfNull(services);

    // The configure step is always registered; it simply does nothing without a callback.
    var optionsBuilder = services.AddOptions<TimeToEvidenceOptions>();
    optionsBuilder.Configure(tteOptions => configureOptions?.Invoke(tteOptions));

    services.TryAddSingleton(provider =>
        new TimeToEvidenceMetrics(
            provider.GetRequiredService<IOptions<TimeToEvidenceOptions>>().Value));

    return services;
}
|
||||
|
||||
/// <summary>
|
||||
/// Registers incident mode services for toggling enhanced telemetry during incidents.
|
||||
/// </summary>
|
||||
|
||||
@@ -0,0 +1,378 @@
|
||||
using System.Diagnostics;
|
||||
using System.Diagnostics.Metrics;
|
||||
|
||||
namespace StellaOps.Telemetry.Core;
|
||||
|
||||
/// <summary>
/// Time-to-Evidence (TTE) metrics for measuring the speed and reliability
/// of the evidence chain in vulnerability triage workflows.
/// </summary>
/// <remarks>
/// All instruments are created on a single <see cref="Meter"/> named <see cref="MeterName"/>.
/// Disposing this object disposes that meter.
/// </remarks>
public sealed class TimeToEvidenceMetrics : IDisposable
{
    /// <summary>
    /// Default meter name for TTE metrics.
    /// </summary>
    public const string MeterName = "StellaOps.TimeToEvidence";

    private readonly Meter _meter;
    private readonly TimeToEvidenceOptions _options;
    private bool _disposed;

    private readonly Histogram<double> _phaseLatencyHistogram;
    private readonly Counter<long> _phaseCompletedCounter;
    private readonly Counter<long> _phaseFailedCounter;
    private readonly Counter<long> _sloBreachCounter;
    private readonly Counter<long> _evidenceAttachedCounter;
    private readonly Counter<long> _decisionMadeCounter;

    /// <summary>
    /// Initializes a new instance of <see cref="TimeToEvidenceMetrics"/>.
    /// </summary>
    /// <param name="options">
    /// Meter version and SLO targets. A default <see cref="TimeToEvidenceOptions"/> is used when <c>null</c>.
    /// </param>
    public TimeToEvidenceMetrics(TimeToEvidenceOptions? options = null)
    {
        _options = options ?? new TimeToEvidenceOptions();
        _meter = new Meter(MeterName, _options.Version);

        _phaseLatencyHistogram = _meter.CreateHistogram<double>(
            name: "tte_phase_latency_seconds",
            unit: "s",
            description: "Latency of TTE phases in seconds.");

        _phaseCompletedCounter = _meter.CreateCounter<long>(
            name: "tte_phase_completed_total",
            unit: "{phase}",
            description: "Total number of completed TTE phases.");

        _phaseFailedCounter = _meter.CreateCounter<long>(
            name: "tte_phase_failed_total",
            unit: "{phase}",
            description: "Total number of failed TTE phases.");

        _sloBreachCounter = _meter.CreateCounter<long>(
            name: "tte_slo_breach_total",
            unit: "{breach}",
            description: "Total number of SLO breaches.");

        _evidenceAttachedCounter = _meter.CreateCounter<long>(
            name: "tte_evidence_attached_total",
            unit: "{evidence}",
            description: "Total number of evidence items attached.");

        _decisionMadeCounter = _meter.CreateCounter<long>(
            name: "tte_decision_made_total",
            unit: "{decision}",
            description: "Total number of VEX decisions made.");
    }

    /// <summary>
    /// Records a phase completion with latency.
    /// </summary>
    /// <remarks>
    /// The latency is recorded on the histogram and the completed counter is incremented.
    /// When an SLO target is configured for <paramref name="phase"/> and the latency is strictly
    /// greater than it, the SLO breach counter is incremented with the same tags.
    /// </remarks>
    /// <param name="phase">Phase that completed.</param>
    /// <param name="latencySeconds">Observed latency of the phase, in seconds.</param>
    /// <param name="tenantId">Optional tenant tag; omitted when null or empty.</param>
    /// <param name="surface">Optional surface tag; omitted when null or empty.</param>
    public void RecordPhaseCompleted(TtePhase phase, double latencySeconds, string? tenantId = null, string? surface = null)
    {
        var tags = CreatePhaseTags(phase, tenantId, surface);
        _phaseLatencyHistogram.Record(latencySeconds, tags);
        _phaseCompletedCounter.Add(1, tags);

        // Check for SLO breach; a null target means no SLO is enforced for this phase.
        var sloTargetSeconds = GetSloTargetSeconds(phase);
        if (sloTargetSeconds.HasValue && latencySeconds > sloTargetSeconds.Value)
        {
            _sloBreachCounter.Add(1, tags);
        }
    }

    /// <summary>
    /// Records a phase failure.
    /// </summary>
    /// <param name="phase">Phase that failed.</param>
    /// <param name="errorCode">Optional error code tag; omitted when null or empty.</param>
    /// <param name="tenantId">Optional tenant tag; omitted when null or empty.</param>
    /// <param name="surface">Optional surface tag; omitted when null or empty.</param>
    public void RecordPhaseFailed(TtePhase phase, string? errorCode = null, string? tenantId = null, string? surface = null)
    {
        var tags = CreatePhaseTags(phase, tenantId, surface, errorCode);
        _phaseFailedCounter.Add(1, tags);
    }

    /// <summary>
    /// Records evidence attachment.
    /// </summary>
    /// <param name="evidenceType">Kind of evidence attached; emitted lower-cased as the <c>evidence_type</c> tag.</param>
    /// <param name="count">Number of items attached. Negative values are ignored.</param>
    /// <param name="tenantId">Optional tenant tag; omitted when null or empty.</param>
    public void RecordEvidenceAttached(TteEvidenceType evidenceType, int count = 1, string? tenantId = null)
    {
        // Counter instruments are meant for non-negative increments, so a negative
        // count is dropped instead of being forwarded to the counter.
        if (count < 0)
        {
            return;
        }

        var tags = new TagList
        {
            { "evidence_type", evidenceType.ToString().ToLowerInvariant() }
        };
        if (!string.IsNullOrEmpty(tenantId))
        {
            tags.Add("tenant_id", tenantId);
        }

        _evidenceAttachedCounter.Add(count, tags);
    }

    /// <summary>
    /// Records a VEX decision.
    /// </summary>
    /// <param name="status">Decision status; emitted lower-cased as the <c>decision_status</c> tag.</param>
    /// <param name="tenantId">Optional tenant tag; omitted when null or empty.</param>
    /// <param name="isAutomated">Emitted as the <c>is_automated</c> tag.</param>
    public void RecordDecisionMade(TteDecisionStatus status, string? tenantId = null, bool isAutomated = false)
    {
        var tags = new TagList
        {
            { "decision_status", status.ToString().ToLowerInvariant() },
            { "is_automated", isAutomated }
        };
        if (!string.IsNullOrEmpty(tenantId))
        {
            tags.Add("tenant_id", tenantId);
        }

        _decisionMadeCounter.Add(1, tags);
    }

    /// <summary>
    /// Records an SLO breach directly.
    /// </summary>
    /// <remarks>
    /// Unlike <see cref="RecordPhaseCompleted"/>, no comparison is made here: the breach counter
    /// is always incremented, and the supplied values are attached as tags.
    /// </remarks>
    /// <param name="phase">Phase whose SLO was breached.</param>
    /// <param name="actualSeconds">Observed duration, emitted as the <c>actual_seconds</c> tag.</param>
    /// <param name="targetSeconds">SLO target, emitted as the <c>target_seconds</c> tag.</param>
    /// <param name="tenantId">Optional tenant tag; omitted when null or empty.</param>
    public void RecordSloBreachDirect(TtePhase phase, double actualSeconds, double targetSeconds, string? tenantId = null)
    {
        var tags = CreatePhaseTags(phase, tenantId, null);

        // NOTE(review): raw double values as tag values probably yield a separate series per
        // distinct value in most metric backends - confirm this cardinality is intended.
        tags.Add("actual_seconds", actualSeconds);
        tags.Add("target_seconds", targetSeconds);
        _sloBreachCounter.Add(1, tags);
    }

    /// <summary>
    /// Starts a measurement scope for a TTE phase.
    /// </summary>
    /// <remarks>
    /// The returned scope starts timing immediately. When it is disposed it reports a completion
    /// only if <see cref="TtePhaseScope.Complete"/> was called; otherwise it reports a failure.
    /// </remarks>
    /// <param name="phase">Phase being measured.</param>
    /// <param name="tenantId">Optional tenant tag passed to the recorded metric.</param>
    /// <param name="surface">Optional surface tag passed to the recorded metric.</param>
    /// <returns>A scope that records the outcome when disposed.</returns>
    public TtePhaseScope MeasurePhase(TtePhase phase, string? tenantId = null, string? surface = null)
    {
        return new TtePhaseScope(this, phase, tenantId, surface);
    }

    // Builds the tag set shared by the phase instruments; optional tags are left out when null or empty.
    private TagList CreatePhaseTags(TtePhase phase, string? tenantId, string? surface, string? errorCode = null)
    {
        var tags = new TagList
        {
            { "phase", phase.ToString().ToLowerInvariant() }
        };
        if (!string.IsNullOrEmpty(tenantId))
        {
            tags.Add("tenant_id", tenantId);
        }

        if (!string.IsNullOrEmpty(surface))
        {
            tags.Add("surface", surface);
        }

        if (!string.IsNullOrEmpty(errorCode))
        {
            tags.Add("error_code", errorCode);
        }

        return tags;
    }

    // Maps a phase to its configured SLO target; null for an unset target or an unknown phase value.
    private double? GetSloTargetSeconds(TtePhase phase)
    {
        return phase switch
        {
            TtePhase.ScanToFinding => _options.SloScanToFindingSeconds,
            TtePhase.FindingToEvidence => _options.SloFindingToEvidenceSeconds,
            TtePhase.EvidenceToDecision => _options.SloEvidenceToDecisionSeconds,
            TtePhase.DecisionToAttestation => _options.SloDecisionToAttestationSeconds,
            TtePhase.AttestationToVerification => _options.SloAttestationToVerificationSeconds,
            TtePhase.VerificationToPolicy => _options.SloVerificationToPolicySeconds,
            TtePhase.EndToEnd => _options.SloEndToEndSeconds,
            _ => null
        };
    }

    /// <inheritdoc/>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;
        _meter.Dispose();
    }

    /// <summary>
    /// Measurement scope for TTE phases.
    /// </summary>
    /// <remarks>
    /// Timing starts when the scope is created and stops on the first <see cref="Dispose"/>.
    /// A scope that is disposed without <see cref="Complete"/> having been called is recorded as failed.
    /// </remarks>
    public sealed class TtePhaseScope : IDisposable
    {
        private readonly TimeToEvidenceMetrics _metrics;
        private readonly TtePhase _phase;
        private readonly string? _tenantId;
        private readonly string? _surface;
        private readonly Stopwatch _stopwatch;
        private bool _completed;
        private bool _disposed;
        private string? _errorCode;

        internal TtePhaseScope(TimeToEvidenceMetrics metrics, TtePhase phase, string? tenantId, string? surface)
        {
            _metrics = metrics;
            _phase = phase;
            _tenantId = tenantId;
            _surface = surface;
            _stopwatch = Stopwatch.StartNew();
        }

        /// <summary>
        /// Marks the phase as failed with an optional error code.
        /// </summary>
        /// <param name="errorCode">Error code attached to the failure metric when the scope is disposed.</param>
        public void Fail(string? errorCode = null)
        {
            _errorCode = errorCode;
            _completed = false;
        }

        /// <summary>
        /// Marks the phase as successfully completed.
        /// </summary>
        public void Complete()
        {
            _completed = true;
        }

        /// <summary>
        /// Stops timing and records the outcome once; later calls do nothing.
        /// </summary>
        public void Dispose()
        {
            // Guard against double disposal so the same phase is never counted twice.
            if (_disposed)
            {
                return;
            }

            _disposed = true;
            _stopwatch.Stop();

            if (_completed)
            {
                _metrics.RecordPhaseCompleted(_phase, _stopwatch.Elapsed.TotalSeconds, _tenantId, _surface);
            }
            else
            {
                _metrics.RecordPhaseFailed(_phase, _errorCode, _tenantId, _surface);
            }
        }
    }
}
|
||||
|
||||
/// <summary>
/// Settings for TTE metrics: the meter version plus one SLO target per phase.
/// </summary>
/// <remarks>
/// Every SLO target is expressed in seconds and is nullable.
/// </remarks>
public sealed class TimeToEvidenceOptions
{
    /// <summary>
    /// Version reported by the meter. Defaults to <c>1.0.0</c>.
    /// </summary>
    public string Version { get; set; } = "1.0.0";

    /// <summary>
    /// Scan-to-finding SLO target, in seconds. Defaults to 30.
    /// </summary>
    public double? SloScanToFindingSeconds { get; set; } = 30d;

    /// <summary>
    /// Finding-to-evidence SLO target, in seconds. Defaults to 5.
    /// </summary>
    public double? SloFindingToEvidenceSeconds { get; set; } = 5d;

    /// <summary>
    /// Evidence-to-decision SLO target, in seconds. Defaults to 10.
    /// </summary>
    public double? SloEvidenceToDecisionSeconds { get; set; } = 10d;

    /// <summary>
    /// Decision-to-attestation SLO target, in seconds. Defaults to 5.
    /// </summary>
    public double? SloDecisionToAttestationSeconds { get; set; } = 5d;

    /// <summary>
    /// Attestation-to-verification SLO target, in seconds. Defaults to 3.
    /// </summary>
    public double? SloAttestationToVerificationSeconds { get; set; } = 3d;

    /// <summary>
    /// Verification-to-policy SLO target, in seconds. Defaults to 2.
    /// </summary>
    public double? SloVerificationToPolicySeconds { get; set; } = 2d;

    /// <summary>
    /// End-to-end triage SLO target, in seconds. Defaults to 60.
    /// </summary>
    public double? SloEndToEndSeconds { get; set; } = 60d;
}
|
||||
|
||||
/// <summary>
/// The stages of the Time-to-Evidence chain, listed in workflow order.
/// </summary>
public enum TtePhase
{
    /// <summary>
    /// Scan completion up to finding creation.
    /// </summary>
    ScanToFinding = 0,

    /// <summary>
    /// Finding creation up to evidence attachment.
    /// </summary>
    FindingToEvidence = 1,

    /// <summary>
    /// Evidence attachment up to the VEX decision.
    /// </summary>
    EvidenceToDecision = 2,

    /// <summary>
    /// VEX decision up to attestation signing.
    /// </summary>
    DecisionToAttestation = 3,

    /// <summary>
    /// Attestation signing up to verification.
    /// </summary>
    AttestationToVerification = 4,

    /// <summary>
    /// Verification up to policy evaluation.
    /// </summary>
    VerificationToPolicy = 5,

    /// <summary>
    /// The whole triage workflow, end to end.
    /// </summary>
    EndToEnd = 6
}
|
||||
|
||||
/// <summary>
/// Kinds of evidence that can be attached in the TTE chain.
/// </summary>
public enum TteEvidenceType
{
    /// <summary>
    /// A DSSE/in-toto attestation.
    /// </summary>
    Attestation = 0,

    /// <summary>
    /// A VEX statement or document.
    /// </summary>
    Vex = 1,

    /// <summary>
    /// An SBOM (SPDX or CycloneDX).
    /// </summary>
    Sbom = 2,

    /// <summary>
    /// The result of a policy evaluation.
    /// </summary>
    PolicyEval = 3,

    /// <summary>
    /// The result of a reachability analysis.
    /// </summary>
    Reachability = 4,

    /// <summary>
    /// A fix pull request.
    /// </summary>
    FixPr = 5
}
|
||||
|
||||
/// <summary>
/// VEX decision outcomes tracked by the TTE metrics.
/// </summary>
public enum TteDecisionStatus
{
    /// <summary>
    /// The product is not affected by the vulnerability.
    /// </summary>
    NotAffected = 0,

    /// <summary>
    /// The product is affected by the vulnerability.
    /// </summary>
    Affected = 1,

    /// <summary>
    /// The vulnerability has been fixed.
    /// </summary>
    Fixed = 2,

    /// <summary>
    /// The vulnerability is still being investigated.
    /// </summary>
    UnderInvestigation = 3
}
|
||||
Reference in New Issue
Block a user