Files
git.stella-ops.org/src/Policy/StellaOps.Policy.Engine/Telemetry/PolicyEngineTelemetry.cs
StellaOps Bot 5e514532df Implement VEX document verification system with issuer management and signature verification
- Added IIssuerDirectory interface for managing VEX document issuers, including methods for registration, revocation, and trust validation.
- Created InMemoryIssuerDirectory class as an in-memory implementation of IIssuerDirectory for testing and single-instance deployments.
- Introduced ISignatureVerifier interface for verifying signatures on VEX documents, with support for multiple signature formats.
- Developed SignatureVerifier class as the default implementation of ISignatureVerifier, allowing extensibility for different signature formats.
- Implemented handlers for DSSE and JWS signature formats, including methods for verification and signature extraction.
- Defined various records and enums for issuer and signature metadata, enhancing the structure and clarity of the verification process.
2025-12-06 13:41:22 +02:00

1107 lines
43 KiB
C#

using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Policy.Engine.Telemetry;
/// <summary>
/// Telemetry instrumentation for the Policy Engine service.
/// Provides metrics, traces, and structured logging correlation.
/// </summary>
public static class PolicyEngineTelemetry
{
/// <summary>
/// The name of the meter used for Policy Engine metrics.
/// Holds the same string value as <see cref="ActivitySourceName"/>.
/// </summary>
public const string MeterName = "StellaOps.Policy.Engine";
/// <summary>
/// The name of the activity source used for Policy Engine traces.
/// Holds the same string value as <see cref="MeterName"/>.
/// </summary>
public const string ActivitySourceName = "StellaOps.Policy.Engine";
// Shared meter from which every instrument in this class is created.
// It is declared ahead of the instrument fields because static field
// initializers run in textual order and each instrument initializer uses it.
private static readonly Meter Meter = new(MeterName);
/// <summary>
/// The activity source used for Policy Engine traces.
/// Created with <see cref="ActivitySourceName"/> as its name.
/// </summary>
public static readonly ActivitySource ActivitySource = new(ActivitySourceName);
// Core policy-run instruments. The braces in each comment list the tag keys
// attached by the corresponding Record* method in this class.

// Histogram: policy_run_seconds{mode,tenant,policy,outcome}
// Recorded by RecordRunDuration.
private static readonly Histogram<double> PolicyRunSecondsHistogram =
Meter.CreateHistogram<double>(
"policy_run_seconds",
unit: "s",
description: "Duration of policy evaluation runs.");
// Gauge: policy_run_queue_depth{tenant}
// Observable gauge: its callback returns the current QueueDepthObservations
// sequence, falling back to an empty sequence when that field is null.
private static readonly ObservableGauge<int> PolicyRunQueueDepthGauge =
Meter.CreateObservableGauge<int>(
"policy_run_queue_depth",
observeValues: () => QueueDepthObservations ?? Enumerable.Empty<Measurement<int>>(),
unit: "jobs",
description: "Current depth of pending policy run jobs per tenant.");
// Counter: policy_rules_fired_total{policy,rule}
// Incremented by RecordRuleFired.
private static readonly Counter<long> PolicyRulesFiredCounter =
Meter.CreateCounter<long>(
"policy_rules_fired_total",
unit: "rules",
description: "Total number of policy rules that fired during evaluation.");
// Counter: policy_vex_overrides_total{policy,vendor}
// Incremented by RecordVexOverride.
private static readonly Counter<long> PolicyVexOverridesCounter =
Meter.CreateCounter<long>(
"policy_vex_overrides_total",
unit: "overrides",
description: "Total number of VEX overrides applied during policy evaluation.");
// Counter: policy_compilation_total{outcome}
// Incremented by RecordCompilation.
private static readonly Counter<long> PolicyCompilationCounter =
Meter.CreateCounter<long>(
"policy_compilation_total",
unit: "compilations",
description: "Total number of policy compilations attempted.");
// Histogram: policy_compilation_seconds{outcome}
// Recorded by RecordCompilation with the same tags as the counter above.
private static readonly Histogram<double> PolicyCompilationSecondsHistogram =
Meter.CreateHistogram<double>(
"policy_compilation_seconds",
unit: "s",
description: "Duration of policy compilation.");
// Counter: policy_simulation_total{tenant,outcome}
// Incremented by RecordSimulation.
private static readonly Counter<long> PolicySimulationCounter =
Meter.CreateCounter<long>(
"policy_simulation_total",
unit: "simulations",
description: "Total number of policy simulations executed.");
// Counter: policy_rate_limit_exceeded_total{tenant,endpoint}
// Incremented by RecordRateLimitExceeded.
private static readonly Counter<long> RateLimitExceededCounter =
Meter.CreateCounter<long>(
"policy_rate_limit_exceeded_total",
unit: "requests",
description: "Total requests rejected due to rate limiting.");
/// <summary>
/// Adds one to the rate-limit rejection counter, tagged with tenant and endpoint.
/// </summary>
/// <param name="tenant">Tenant identifier; "anonymous" is substituted when null.</param>
/// <param name="endpoint">Rate-limited endpoint; "simulation" is substituted when null.</param>
public static void RecordRateLimitExceeded(string? tenant = null, string? endpoint = null)
{
    // Resolve the fallbacks first, then normalize, in the same order as the tags are emitted.
    var tenantTag = NormalizeTag(tenant ?? "anonymous");
    var endpointTag = NormalizeTag(endpoint ?? "simulation");

    var rateLimitTags = new TagList();
    rateLimitTags.Add("tenant", tenantTag);
    rateLimitTags.Add("endpoint", endpointTag);

    RateLimitExceededCounter.Add(1, rateLimitTags);
}
#region Entropy Metrics
// Entropy instruments. All four are written by RecordEntropyPenalty using a
// single "outcome" tag.

// Counter: policy_entropy_penalty_total{outcome}
// Incremented by one per RecordEntropyPenalty call.
private static readonly Counter<long> EntropyPenaltyCounter =
Meter.CreateCounter<long>(
"policy_entropy_penalty_total",
unit: "penalties",
description: "Total entropy penalties computed from scanner evidence.");
// Histogram: policy_entropy_penalty_value{outcome}
// Receives the penalty argument of RecordEntropyPenalty.
private static readonly Histogram<double> EntropyPenaltyHistogram =
Meter.CreateHistogram<double>(
"policy_entropy_penalty_value",
unit: "ratio",
description: "Entropy penalty values (after cap).");
// Histogram: policy_entropy_image_opaque_ratio{outcome}
// Receives the imageOpaqueRatio argument of RecordEntropyPenalty.
private static readonly Histogram<double> EntropyImageOpaqueRatioHistogram =
Meter.CreateHistogram<double>(
"policy_entropy_image_opaque_ratio",
unit: "ratio",
description: "Image opaque ratios observed in layer summaries.");
// Histogram: policy_entropy_top_file_ratio{outcome}
// Only recorded when RecordEntropyPenalty is given a topFileOpaqueRatio value.
private static readonly Histogram<double> EntropyTopFileRatioHistogram =
Meter.CreateHistogram<double>(
"policy_entropy_top_file_ratio",
unit: "ratio",
description: "Opaque ratio of the top offending file when present.");
/// <summary>
/// Records one entropy penalty computation: bumps the penalty counter and
/// records the penalty and image opaque ratio, all tagged with the outcome.
/// </summary>
/// <param name="penalty">Penalty value recorded in the penalty histogram.</param>
/// <param name="outcome">Outcome label used as the "outcome" tag.</param>
/// <param name="imageOpaqueRatio">Image opaque ratio recorded in its histogram.</param>
/// <param name="topFileOpaqueRatio">Optional top-file ratio; recorded only when a value is supplied.</param>
public static void RecordEntropyPenalty(
    double penalty,
    string outcome,
    double imageOpaqueRatio,
    double? topFileOpaqueRatio = null)
{
    var outcomeTags = new TagList();
    outcomeTags.Add("outcome", NormalizeTag(outcome));

    EntropyPenaltyCounter.Add(1, outcomeTags);
    EntropyPenaltyHistogram.Record(penalty, outcomeTags);
    EntropyImageOpaqueRatioHistogram.Record(imageOpaqueRatio, outcomeTags);

    // The top-file histogram is skipped entirely when no ratio was provided.
    if (topFileOpaqueRatio is double topFileRatio)
    {
        EntropyTopFileRatioHistogram.Record(topFileRatio, outcomeTags);
    }
}
#endregion
#region Golden Signals - Latency
// Latency instruments (golden signal: latency).

// Histogram: policy_api_latency_seconds{endpoint,method,status}
// Recorded by RecordApiLatency.
private static readonly Histogram<double> ApiLatencyHistogram =
Meter.CreateHistogram<double>(
"policy_api_latency_seconds",
unit: "s",
description: "API request latency by endpoint.");
// Histogram: policy_evaluation_latency_seconds{tenant,policy}
// Recorded by RecordEvaluationLatency.
private static readonly Histogram<double> EvaluationLatencyHistogram =
Meter.CreateHistogram<double>(
"policy_evaluation_latency_seconds",
unit: "s",
description: "Policy evaluation latency per batch.");
#endregion
#region Golden Signals - Traffic
// Traffic instruments (golden signal: traffic).

// Counter: policy_requests_total{endpoint,method}
// Incremented by RecordRequest.
private static readonly Counter<long> RequestsCounter =
Meter.CreateCounter<long>(
"policy_requests_total",
unit: "requests",
description: "Total API requests by endpoint and method.");
// Counter: policy_evaluations_total{tenant,policy,mode}
// Incremented by RecordEvaluation.
private static readonly Counter<long> EvaluationsCounter =
Meter.CreateCounter<long>(
"policy_evaluations_total",
unit: "evaluations",
description: "Total policy evaluations by tenant, policy, and mode.");
// Counter: policy_findings_materialized_total{tenant,policy}
// Incremented by the count passed to RecordFindingsMaterialized.
private static readonly Counter<long> FindingsMaterializedCounter =
Meter.CreateCounter<long>(
"policy_findings_materialized_total",
unit: "findings",
description: "Total findings materialized during policy evaluation.");
#endregion
#region Golden Signals - Errors
// Error instruments (golden signal: errors).

// Counter: policy_errors_total{type,tenant}
// Incremented by RecordError.
private static readonly Counter<long> ErrorsCounter =
Meter.CreateCounter<long>(
"policy_errors_total",
unit: "errors",
description: "Total errors by type (compilation, evaluation, api, storage).");
// Counter: policy_api_errors_total{endpoint,status_code}
// Incremented by RecordApiError.
private static readonly Counter<long> ApiErrorsCounter =
Meter.CreateCounter<long>(
"policy_api_errors_total",
unit: "errors",
description: "Total API errors by endpoint and status code.");
// Counter: policy_evaluation_failures_total{tenant,policy,reason}
// Incremented by RecordEvaluationFailure.
private static readonly Counter<long> EvaluationFailuresCounter =
Meter.CreateCounter<long>(
"policy_evaluation_failures_total",
unit: "failures",
description: "Total evaluation failures by reason (timeout, determinism, storage, canceled).");
#endregion
#region Golden Signals - Saturation
// Saturation gauges (golden signal: saturation). Each gauge callback returns the
// current value of its backing static field, or an empty sequence when it is null.

// Gauge: policy_concurrent_evaluations{tenant}
// Backed by ConcurrentEvaluationsObservations.
private static readonly ObservableGauge<int> ConcurrentEvaluationsGauge =
Meter.CreateObservableGauge<int>(
"policy_concurrent_evaluations",
observeValues: () => ConcurrentEvaluationsObservations ?? Enumerable.Empty<Measurement<int>>(),
unit: "evaluations",
description: "Current number of concurrent policy evaluations.");
// Gauge: policy_worker_utilization
// Backed by WorkerUtilizationObservations.
private static readonly ObservableGauge<double> WorkerUtilizationGauge =
Meter.CreateObservableGauge<double>(
"policy_worker_utilization",
observeValues: () => WorkerUtilizationObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "Worker pool utilization ratio (0.0 to 1.0).");
#endregion
#region SLO Metrics
// SLO instruments. The two gauges return the current value of their backing
// static field, or an empty sequence when it is null.

// Gauge: policy_slo_burn_rate{slo_name}
// Backed by SloBurnRateObservations.
private static readonly ObservableGauge<double> SloBurnRateGauge =
Meter.CreateObservableGauge<double>(
"policy_slo_burn_rate",
observeValues: () => SloBurnRateObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "SLO burn rate over configured window.");
// Gauge: policy_error_budget_remaining{slo_name}
// Backed by ErrorBudgetObservations.
private static readonly ObservableGauge<double> ErrorBudgetRemainingGauge =
Meter.CreateObservableGauge<double>(
"policy_error_budget_remaining",
observeValues: () => ErrorBudgetObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "Remaining error budget as ratio (0.0 to 1.0).");
// Counter: policy_slo_violations_total{slo_name}
// Incremented by RecordSloViolation.
private static readonly Counter<long> SloViolationsCounter =
Meter.CreateCounter<long>(
"policy_slo_violations_total",
unit: "violations",
description: "Total SLO violations detected.");
#endregion
#region Risk Scoring Metrics
// Risk scoring instruments. The two job/trigger counters are exposed through
// public properties below so callers add to them directly; the duration
// histogram and findings counter are written through Record* methods.

// Counter: policy_risk_scoring_jobs_created_total
private static readonly Counter<long> RiskScoringJobsCreatedCounter =
Meter.CreateCounter<long>(
"policy_risk_scoring_jobs_created_total",
unit: "jobs",
description: "Total risk scoring jobs created.");
// Counter: policy_risk_scoring_triggers_skipped_total
private static readonly Counter<long> RiskScoringTriggersSkippedCounter =
Meter.CreateCounter<long>(
"policy_risk_scoring_triggers_skipped_total",
unit: "triggers",
description: "Total risk scoring triggers skipped due to deduplication.");
// Histogram: policy_risk_scoring_duration_seconds{profile_id,finding_count}
// Recorded by RecordRiskScoringDuration.
private static readonly Histogram<double> RiskScoringDurationHistogram =
Meter.CreateHistogram<double>(
"policy_risk_scoring_duration_seconds",
unit: "s",
description: "Duration of risk scoring job execution.");
// Counter: policy_risk_scoring_findings_scored_total{profile_id}
// Incremented by the count passed to RecordFindingsScored.
private static readonly Counter<long> RiskScoringFindingsScoredCounter =
Meter.CreateCounter<long>(
"policy_risk_scoring_findings_scored_total",
unit: "findings",
description: "Total findings scored by risk scoring jobs.");
/// <summary>
/// Counter for risk scoring jobs created.
/// Returns the underlying <c>policy_risk_scoring_jobs_created_total</c> instrument.
/// </summary>
public static Counter<long> RiskScoringJobsCreated => RiskScoringJobsCreatedCounter;
/// <summary>
/// Counter for risk scoring triggers skipped.
/// Returns the underlying <c>policy_risk_scoring_triggers_skipped_total</c> instrument.
/// </summary>
public static Counter<long> RiskScoringTriggersSkipped => RiskScoringTriggersSkippedCounter;
/// <summary>
/// Records the duration of a risk scoring job in the
/// <c>policy_risk_scoring_duration_seconds</c> histogram.
/// </summary>
/// <param name="seconds">Duration in seconds.</param>
/// <param name="profileId">Profile identifier, emitted as the "profile_id" tag.</param>
/// <param name="findingCount">Number of findings scored, emitted as the "finding_count" tag.</param>
public static void RecordRiskScoringDuration(double seconds, string profileId, int findingCount)
{
    var tags = new TagList
    {
        { "profile_id", NormalizeTag(profileId) },
        // Tag values are machine-readable, so format with the invariant culture
        // rather than whatever culture the calling thread happens to use.
        // NOTE(review): a raw count as a tag value can produce many distinct
        // time series; confirm this cardinality is acceptable for the backend.
        { "finding_count", findingCount.ToString(System.Globalization.CultureInfo.InvariantCulture) },
    };
    RiskScoringDurationHistogram.Record(seconds, tags);
}
/// <summary>
/// Adds the given number of scored findings to the
/// <c>policy_risk_scoring_findings_scored_total</c> counter.
/// </summary>
/// <param name="profileId">Profile identifier, emitted as the "profile_id" tag.</param>
/// <param name="count">Number of findings scored.</param>
public static void RecordFindingsScored(string profileId, long count)
{
    var profileTags = new TagList();
    profileTags.Add("profile_id", NormalizeTag(profileId));

    RiskScoringFindingsScoredCounter.Add(count, profileTags);
}
#endregion
#region Risk Simulation and Events Metrics
// Risk simulation, event and exception instruments. Counters that have a public
// property next to them are exposed so callers can add to them directly; the
// remaining instruments are written through Record* methods in this class.

// Counter: policy_risk_simulations_run_total
private static readonly Counter<long> RiskSimulationsRunCounter =
Meter.CreateCounter<long>(
"policy_risk_simulations_run_total",
unit: "simulations",
description: "Total risk simulations executed.");
// Counter: policy_profile_events_published_total
private static readonly Counter<long> ProfileEventsPublishedCounter =
Meter.CreateCounter<long>(
"policy_profile_events_published_total",
unit: "events",
description: "Total profile lifecycle events published.");
/// <summary>
/// Counter for risk simulations run.
/// Returns the underlying <c>policy_risk_simulations_run_total</c> instrument.
/// </summary>
public static Counter<long> RiskSimulationsRun => RiskSimulationsRunCounter;
/// <summary>
/// Counter for profile events published.
/// Returns the underlying <c>policy_profile_events_published_total</c> instrument.
/// </summary>
public static Counter<long> ProfileEventsPublished => ProfileEventsPublishedCounter;
// Counter: policy_events_processed_total
private static readonly Counter<long> PolicyEventsProcessedCounter =
Meter.CreateCounter<long>(
"policy_events_processed_total",
unit: "events",
description: "Total policy change events processed.");
/// <summary>
/// Counter for policy change events processed.
/// Returns the underlying <c>policy_events_processed_total</c> instrument.
/// </summary>
public static Counter<long> PolicyEventsProcessed => PolicyEventsProcessedCounter;
// Counter: policy_effective_events_published_total
private static readonly Counter<long> PolicyEffectiveEventsPublishedCounter =
Meter.CreateCounter<long>(
"policy_effective_events_published_total",
unit: "events",
description: "Total policy.effective.* events published.");
/// <summary>
/// Counter for policy effective events published.
/// Returns the underlying <c>policy_effective_events_published_total</c> instrument.
/// </summary>
public static Counter<long> PolicyEffectiveEventsPublished => PolicyEffectiveEventsPublishedCounter;
// Counter: policy_reevaluation_jobs_scheduled_total
private static readonly Counter<long> ReEvaluationJobsScheduledCounter =
Meter.CreateCounter<long>(
"policy_reevaluation_jobs_scheduled_total",
unit: "jobs",
description: "Total re-evaluation jobs scheduled.");
/// <summary>
/// Counter for re-evaluation jobs scheduled.
/// Returns the underlying <c>policy_reevaluation_jobs_scheduled_total</c> instrument.
/// </summary>
public static Counter<long> ReEvaluationJobsScheduled => ReEvaluationJobsScheduledCounter;
// Counter: policy_explain_traces_stored_total
private static readonly Counter<long> ExplainTracesStoredCounter =
Meter.CreateCounter<long>(
"policy_explain_traces_stored_total",
unit: "traces",
description: "Total explain traces stored for decision audit.");
/// <summary>
/// Counter for explain traces stored.
/// Returns the underlying <c>policy_explain_traces_stored_total</c> instrument.
/// </summary>
public static Counter<long> ExplainTracesStored => ExplainTracesStoredCounter;
// Counter: policy_effective_decision_map_operations_total
private static readonly Counter<long> EffectiveDecisionMapOperationsCounter =
Meter.CreateCounter<long>(
"policy_effective_decision_map_operations_total",
unit: "operations",
description: "Total effective decision map operations (set, get, invalidate).");
/// <summary>
/// Counter for effective decision map operations.
/// Returns the underlying <c>policy_effective_decision_map_operations_total</c> instrument.
/// </summary>
public static Counter<long> EffectiveDecisionMapOperations => EffectiveDecisionMapOperationsCounter;
// Counter: policy_exception_operations_total{tenant,operation}
// Also incremented by RecordExceptionOperation.
private static readonly Counter<long> ExceptionOperationsCounter =
Meter.CreateCounter<long>(
"policy_exception_operations_total",
unit: "operations",
description: "Total policy exception operations (create, update, revoke, review_*).");
/// <summary>
/// Counter for policy exception operations.
/// Returns the underlying <c>policy_exception_operations_total</c> instrument.
/// </summary>
public static Counter<long> ExceptionOperations => ExceptionOperationsCounter;
// Counter: policy_exception_cache_operations_total{tenant,operation}
// Also incremented by RecordExceptionCacheOperation.
private static readonly Counter<long> ExceptionCacheOperationsCounter =
Meter.CreateCounter<long>(
"policy_exception_cache_operations_total",
unit: "operations",
description: "Total exception cache operations (hit, miss, set, warm, invalidate).");
// Counter: policy_exception_applications_total{tenant,effect}
// Incremented by RecordExceptionApplication.
private static readonly Counter<long> ExceptionApplicationsCounter =
Meter.CreateCounter<long>(
"policy_exception_applications_total",
unit: "applications",
description: "Total applied exceptions during evaluation by effect type.");
// Histogram: policy_exception_application_latency_seconds{tenant,effect}
// Recorded by RecordExceptionApplicationLatency.
private static readonly Histogram<double> ExceptionApplicationLatencyHistogram =
Meter.CreateHistogram<double>(
"policy_exception_application_latency_seconds",
unit: "s",
description: "Latency impact of exception application during evaluation.");
// Counter: policy_exception_lifecycle_total{tenant,event}
// Incremented by RecordExceptionLifecycle.
private static readonly Counter<long> ExceptionLifecycleCounter =
Meter.CreateCounter<long>(
"policy_exception_lifecycle_total",
unit: "events",
description: "Lifecycle events for exceptions (activated, expired, revoked).");
/// <summary>
/// Counter for exception cache operations.
/// Returns the underlying <c>policy_exception_cache_operations_total</c> instrument.
/// </summary>
public static Counter<long> ExceptionCacheOperations => ExceptionCacheOperationsCounter;
#endregion
#region Reachability Metrics
// Reachability instruments and the backing field for the hit-ratio gauge.

// Counter: policy_reachability_applied_total{state}
// Incremented by the count passed to RecordReachabilityApplied.
private static readonly Counter<long> ReachabilityAppliedCounter =
Meter.CreateCounter<long>(
"policy_reachability_applied_total",
unit: "facts",
description: "Total reachability facts applied during policy evaluation.");
// Counter: policy_reachability_cache_hits_total
// Incremented (untagged) by RecordReachabilityCacheHits.
private static readonly Counter<long> ReachabilityCacheHitsCounter =
Meter.CreateCounter<long>(
"policy_reachability_cache_hits_total",
unit: "hits",
description: "Total reachability facts cache hits.");
// Counter: policy_reachability_cache_misses_total
// Incremented (untagged) by RecordReachabilityCacheMisses.
private static readonly Counter<long> ReachabilityCacheMissesCounter =
Meter.CreateCounter<long>(
"policy_reachability_cache_misses_total",
unit: "misses",
description: "Total reachability facts cache misses.");
// Gauge: policy_reachability_cache_hit_ratio
// Observable gauge: its callback returns the current
// ReachabilityCacheHitRatioObservations sequence, or an empty one when null.
private static readonly ObservableGauge<double> ReachabilityCacheHitRatioGauge =
Meter.CreateObservableGauge<double>(
"policy_reachability_cache_hit_ratio",
observeValues: () => ReachabilityCacheHitRatioObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "Reachability facts cache hit ratio (0.0 to 1.0).");
// Counter: policy_reachability_lookups_total{outcome}
// RecordReachabilityLookup adds the batch size to this counter, not 1 per call.
private static readonly Counter<long> ReachabilityLookupsCounter =
Meter.CreateCounter<long>(
"policy_reachability_lookups_total",
unit: "lookups",
description: "Total reachability facts lookup operations.");
// Histogram: policy_reachability_lookup_seconds{outcome}
// Recorded by RecordReachabilityLookup.
private static readonly Histogram<double> ReachabilityLookupSecondsHistogram =
Meter.CreateHistogram<double>(
"policy_reachability_lookup_seconds",
unit: "s",
description: "Duration of reachability facts lookup operations.");
// Backing sequence for the hit-ratio gauge; starts empty and is replaced by
// RegisterReachabilityCacheHitRatioObservation.
private static IEnumerable<Measurement<double>> ReachabilityCacheHitRatioObservations = Enumerable.Empty<Measurement<double>>();
/// <summary>
/// Adds applied reachability facts to the
/// <c>policy_reachability_applied_total</c> counter.
/// </summary>
/// <param name="state">Reachability state (reachable, unreachable, unknown, under_investigation), emitted as the "state" tag.</param>
/// <param name="count">Number of facts to add; defaults to 1.</param>
public static void RecordReachabilityApplied(string state, long count = 1)
{
    var stateTags = new TagList();
    stateTags.Add("state", NormalizeTag(state));

    ReachabilityAppliedCounter.Add(count, stateTags);
}
/// <summary>
/// Adds the given number of hits to the untagged
/// <c>policy_reachability_cache_hits_total</c> counter.
/// </summary>
/// <param name="count">Number of cache hits to add.</param>
public static void RecordReachabilityCacheHits(long count)
    => ReachabilityCacheHitsCounter.Add(count);
/// <summary>
/// Adds the given number of misses to the untagged
/// <c>policy_reachability_cache_misses_total</c> counter.
/// </summary>
/// <param name="count">Number of cache misses to add.</param>
public static void RecordReachabilityCacheMisses(long count)
    => ReachabilityCacheMissesCounter.Add(count);
/// <summary>
/// Records one reachability lookup: adds the batch size to the lookups counter
/// and records the elapsed time, both tagged with the outcome.
/// </summary>
/// <param name="outcome">Outcome (found, not_found, error), emitted as the "outcome" tag.</param>
/// <param name="seconds">Duration in seconds.</param>
/// <param name="batchSize">Number of items looked up; this is the amount added to the counter.</param>
public static void RecordReachabilityLookup(string outcome, double seconds, int batchSize)
{
    var outcomeTags = new TagList();
    outcomeTags.Add("outcome", NormalizeTag(outcome));

    // The counter advances by the number of items in the batch, not by one per call.
    ReachabilityLookupsCounter.Add(batchSize, outcomeTags);
    ReachabilityLookupSecondsHistogram.Record(seconds, outcomeTags);
}
/// <summary>
/// Registers a callback to observe reachability cache hit ratio.
/// </summary>
/// <remarks>
/// The callback is not invoked at registration. It is invoked each time the
/// stored sequence is enumerated, so every read of the gauge's backing
/// sequence sees the callback's current result instead of a one-off snapshot.
/// </remarks>
/// <param name="observeFunc">Function that returns current cache hit ratio measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
public static void RegisterReachabilityCacheHitRatioObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Store a deferred sequence rather than the result of a single call: an
    // iterator body re-runs from the top on every enumeration.
    ReachabilityCacheHitRatioObservations = ObserveOnEnumeration();

    IEnumerable<Measurement<double>> ObserveOnEnumeration()
    {
        // A callback that returns null is treated as "no measurements".
        foreach (var measurement in observeFunc() ?? Enumerable.Empty<Measurement<double>>())
        {
            yield return measurement;
        }
    }
}
#endregion
#region AirGap/Staleness Metrics
// Air-gap / staleness instruments and the backing fields for the two gauges.

// Counter: policy_airgap_staleness_events_total{tenant,event_type}
// Incremented by RecordStalenessEvent.
private static readonly Counter<long> StalenessEventsCounter =
Meter.CreateCounter<long>(
"policy_airgap_staleness_events_total",
unit: "events",
description: "Total staleness events by type (warning, breach, recovered, anchor_missing).");
// Gauge: policy_airgap_sealed
// Observable gauge: its callback returns the current AirGapSealedObservations
// sequence, or an empty one when null.
private static readonly ObservableGauge<int> AirGapSealedGauge =
Meter.CreateObservableGauge<int>(
"policy_airgap_sealed",
observeValues: () => AirGapSealedObservations ?? Enumerable.Empty<Measurement<int>>(),
unit: "boolean",
description: "1 if sealed, 0 if unsealed.");
// Gauge: policy_airgap_anchor_age_seconds
// Observable gauge: its callback returns the current AnchorAgeObservations
// sequence, or an empty one when null.
private static readonly ObservableGauge<int> AnchorAgeGauge =
Meter.CreateObservableGauge<int>(
"policy_airgap_anchor_age_seconds",
observeValues: () => AnchorAgeObservations ?? Enumerable.Empty<Measurement<int>>(),
unit: "s",
description: "Current age of the time anchor in seconds.");
// Backing sequences for the gauges above; both start empty and are replaced by
// RegisterAirGapSealedObservation and RegisterAnchorAgeObservation respectively.
private static IEnumerable<Measurement<int>> AirGapSealedObservations = Enumerable.Empty<Measurement<int>>();
private static IEnumerable<Measurement<int>> AnchorAgeObservations = Enumerable.Empty<Measurement<int>>();
/// <summary>
/// Adds one to the <c>policy_airgap_staleness_events_total</c> counter.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="eventType">Event type (warning, breach, recovered, anchor_missing), emitted as the "event_type" tag.</param>
public static void RecordStalenessEvent(string tenant, string eventType)
{
    var tenantTag = NormalizeTenant(tenant);
    var eventTypeTag = NormalizeTag(eventType);

    var stalenessTags = new TagList();
    stalenessTags.Add("tenant", tenantTag);
    stalenessTags.Add("event_type", eventTypeTag);

    StalenessEventsCounter.Add(1, stalenessTags);
}
/// <summary>
/// Registers a callback to observe air-gap sealed state.
/// </summary>
/// <remarks>
/// The callback is not invoked at registration. It is invoked each time the
/// stored sequence is enumerated, so every read of the gauge's backing
/// sequence sees the callback's current result instead of a one-off snapshot.
/// </remarks>
/// <param name="observeFunc">Function that returns current sealed state measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
public static void RegisterAirGapSealedObservation(Func<IEnumerable<Measurement<int>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Store a deferred sequence rather than the result of a single call: an
    // iterator body re-runs from the top on every enumeration.
    AirGapSealedObservations = ObserveOnEnumeration();

    IEnumerable<Measurement<int>> ObserveOnEnumeration()
    {
        // A callback that returns null is treated as "no measurements".
        foreach (var measurement in observeFunc() ?? Enumerable.Empty<Measurement<int>>())
        {
            yield return measurement;
        }
    }
}
/// <summary>
/// Registers a callback to observe time anchor age.
/// </summary>
/// <remarks>
/// The callback is not invoked at registration. It is invoked each time the
/// stored sequence is enumerated, so every read of the gauge's backing
/// sequence sees the callback's current result instead of a one-off snapshot.
/// </remarks>
/// <param name="observeFunc">Function that returns current anchor age measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
public static void RegisterAnchorAgeObservation(Func<IEnumerable<Measurement<int>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Store a deferred sequence rather than the result of a single call: an
    // iterator body re-runs from the top on every enumeration.
    AnchorAgeObservations = ObserveOnEnumeration();

    IEnumerable<Measurement<int>> ObserveOnEnumeration()
    {
        // A callback that returns null is treated as "no measurements".
        foreach (var measurement in observeFunc() ?? Enumerable.Empty<Measurement<int>>())
        {
            yield return measurement;
        }
    }
}
#endregion
// Storage for observable gauge observations.
// Each field is the sequence returned by the matching gauge's observeValues
// callback. All start empty; the queue depth, concurrent evaluations and worker
// utilization fields are reassigned by their Register*Observation methods.
private static IEnumerable<Measurement<int>> QueueDepthObservations = Enumerable.Empty<Measurement<int>>();
private static IEnumerable<Measurement<int>> ConcurrentEvaluationsObservations = Enumerable.Empty<Measurement<int>>();
private static IEnumerable<Measurement<double>> WorkerUtilizationObservations = Enumerable.Empty<Measurement<double>>();
private static IEnumerable<Measurement<double>> SloBurnRateObservations = Enumerable.Empty<Measurement<double>>();
private static IEnumerable<Measurement<double>> ErrorBudgetObservations = Enumerable.Empty<Measurement<double>>();
/// <summary>
/// Registers a callback to observe queue depth measurements.
/// </summary>
/// <remarks>
/// The callback is not invoked at registration. It is invoked each time the
/// stored sequence is enumerated, so every read of the gauge's backing
/// sequence sees the callback's current result instead of a one-off snapshot.
/// </remarks>
/// <param name="observeFunc">Function that returns current queue depth measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
public static void RegisterQueueDepthObservation(Func<IEnumerable<Measurement<int>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Store a deferred sequence rather than the result of a single call: an
    // iterator body re-runs from the top on every enumeration.
    QueueDepthObservations = ObserveOnEnumeration();

    IEnumerable<Measurement<int>> ObserveOnEnumeration()
    {
        // A callback that returns null is treated as "no measurements".
        foreach (var measurement in observeFunc() ?? Enumerable.Empty<Measurement<int>>())
        {
            yield return measurement;
        }
    }
}
/// <summary>
/// Records how long a policy run took in the <c>policy_run_seconds</c> histogram.
/// </summary>
/// <param name="seconds">Duration in seconds.</param>
/// <param name="mode">Run mode (full, incremental, simulate), emitted as the "mode" tag.</param>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="policy">Policy identifier, emitted as the "policy" tag.</param>
/// <param name="outcome">Outcome of the run (success, failure, canceled), emitted as the "outcome" tag.</param>
public static void RecordRunDuration(double seconds, string mode, string tenant, string policy, string outcome)
{
    var modeTag = NormalizeTag(mode);
    var tenantTag = NormalizeTenant(tenant);
    var policyTag = NormalizeTag(policy);
    var outcomeTag = NormalizeTag(outcome);

    var runTags = new TagList();
    runTags.Add("mode", modeTag);
    runTags.Add("tenant", tenantTag);
    runTags.Add("policy", policyTag);
    runTags.Add("outcome", outcomeTag);

    PolicyRunSecondsHistogram.Record(seconds, runTags);
}
/// <summary>
/// Adds rule firings to the <c>policy_rules_fired_total</c> counter.
/// </summary>
/// <param name="policy">Policy identifier, emitted as the "policy" tag.</param>
/// <param name="rule">Rule identifier, emitted as the "rule" tag.</param>
/// <param name="count">Number of firings to add; defaults to 1.</param>
public static void RecordRuleFired(string policy, string rule, long count = 1)
{
    var policyTag = NormalizeTag(policy);
    var ruleTag = NormalizeTag(rule);

    var ruleTags = new TagList();
    ruleTags.Add("policy", policyTag);
    ruleTags.Add("rule", ruleTag);

    PolicyRulesFiredCounter.Add(count, ruleTags);
}
/// <summary>
/// Adds VEX overrides to the <c>policy_vex_overrides_total</c> counter.
/// </summary>
/// <param name="policy">Policy identifier, emitted as the "policy" tag.</param>
/// <param name="vendor">VEX vendor identifier, emitted as the "vendor" tag.</param>
/// <param name="count">Number of overrides to add; defaults to 1.</param>
public static void RecordVexOverride(string policy, string vendor, long count = 1)
{
    var policyTag = NormalizeTag(policy);
    var vendorTag = NormalizeTag(vendor);

    var overrideTags = new TagList();
    overrideTags.Add("policy", policyTag);
    overrideTags.Add("vendor", vendorTag);

    PolicyVexOverridesCounter.Add(count, overrideTags);
}
/// <summary>
/// Records one policy compilation attempt: bumps the compilation counter and
/// records its duration, both tagged with the outcome.
/// </summary>
/// <param name="outcome">Outcome (success, failure), emitted as the "outcome" tag.</param>
/// <param name="seconds">Duration in seconds.</param>
public static void RecordCompilation(string outcome, double seconds)
{
    var compilationTags = new TagList();
    compilationTags.Add("outcome", NormalizeTag(outcome));

    PolicyCompilationCounter.Add(1, compilationTags);
    PolicyCompilationSecondsHistogram.Record(seconds, compilationTags);
}
/// <summary>
/// Adds one to the <c>policy_simulation_total</c> counter.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="outcome">Outcome (success, failure), emitted as the "outcome" tag.</param>
public static void RecordSimulation(string tenant, string outcome)
{
    var tenantTag = NormalizeTenant(tenant);
    var outcomeTag = NormalizeTag(outcome);

    var simulationTags = new TagList();
    simulationTags.Add("tenant", tenantTag);
    simulationTags.Add("outcome", outcomeTag);

    PolicySimulationCounter.Add(1, simulationTags);
}
/// <summary>
/// Adds one to the <c>policy_exception_operations_total</c> counter.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="operation">Operation type (create, update, revoke, review_create, review_decision_*, etc.), emitted as the "operation" tag.</param>
public static void RecordExceptionOperation(string tenant, string operation)
{
    var tenantTag = NormalizeTenant(tenant);
    var operationTag = NormalizeTag(operation);

    var operationTags = new TagList();
    operationTags.Add("tenant", tenantTag);
    operationTags.Add("operation", operationTag);

    ExceptionOperationsCounter.Add(1, operationTags);
}
/// <summary>
/// Adds one to the <c>policy_exception_cache_operations_total</c> counter.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="operation">Operation type (hit, miss, set, warm, invalidate_*, event_*), emitted as the "operation" tag.</param>
public static void RecordExceptionCacheOperation(string tenant, string operation)
{
    var tenantTag = NormalizeTenant(tenant);
    var operationTag = NormalizeTag(operation);

    var cacheTags = new TagList();
    cacheTags.Add("tenant", tenantTag);
    cacheTags.Add("operation", operationTag);

    ExceptionCacheOperationsCounter.Add(1, cacheTags);
}
/// <summary>
/// Adds one to the <c>policy_exception_applications_total</c> counter for an
/// exception applied during evaluation.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="effectType">Effect type of the applied exception, emitted as the "effect" tag.</param>
public static void RecordExceptionApplication(string tenant, string effectType)
{
    var tenantTag = NormalizeTenant(tenant);
    var effectTag = NormalizeTag(effectType);

    var applicationTags = new TagList();
    applicationTags.Add("tenant", tenantTag);
    applicationTags.Add("effect", effectTag);

    ExceptionApplicationsCounter.Add(1, applicationTags);
}
/// <summary>
/// Records latency attributed to exception application in the
/// <c>policy_exception_application_latency_seconds</c> histogram.
/// </summary>
/// <param name="seconds">Latency in seconds.</param>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="effectType">Effect type of the applied exception, emitted as the "effect" tag.</param>
public static void RecordExceptionApplicationLatency(double seconds, string tenant, string effectType)
{
    var tenantTag = NormalizeTenant(tenant);
    var effectTag = NormalizeTag(effectType);

    var latencyTags = new TagList();
    latencyTags.Add("tenant", tenantTag);
    latencyTags.Add("effect", effectTag);

    ExceptionApplicationLatencyHistogram.Record(seconds, latencyTags);
}
/// <summary>
/// Adds one to the <c>policy_exception_lifecycle_total</c> counter for an
/// exception lifecycle event (activated, expired, revoked).
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="eventType">Lifecycle event type, emitted as the "event" tag.</param>
public static void RecordExceptionLifecycle(string tenant, string eventType)
{
    var tenantTag = NormalizeTenant(tenant);
    var eventTag = NormalizeTag(eventType);

    var lifecycleTags = new TagList();
    lifecycleTags.Add("tenant", tenantTag);
    lifecycleTags.Add("event", eventTag);

    ExceptionLifecycleCounter.Add(1, lifecycleTags);
}
#region Golden Signals - Recording Methods
/// <summary>
/// Records API request latency in the <c>policy_api_latency_seconds</c> histogram.
/// </summary>
/// <param name="seconds">Latency in seconds.</param>
/// <param name="endpoint">API endpoint name, emitted as the "endpoint" tag.</param>
/// <param name="method">HTTP method, emitted as the "method" tag.</param>
/// <param name="statusCode">HTTP status code, emitted as the "status" tag.</param>
public static void RecordApiLatency(double seconds, string endpoint, string method, int statusCode)
{
    var tags = new TagList
    {
        { "endpoint", NormalizeTag(endpoint) },
        { "method", NormalizeTag(method) },
        // Tag values are machine-readable, so format with the invariant culture
        // rather than whatever culture the calling thread happens to use.
        { "status", statusCode.ToString(System.Globalization.CultureInfo.InvariantCulture) },
    };
    ApiLatencyHistogram.Record(seconds, tags);
}
/// <summary>
/// Records batch evaluation latency in the
/// <c>policy_evaluation_latency_seconds</c> histogram.
/// </summary>
/// <param name="seconds">Latency in seconds.</param>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="policy">Policy identifier, emitted as the "policy" tag.</param>
public static void RecordEvaluationLatency(double seconds, string tenant, string policy)
{
    var tenantTag = NormalizeTenant(tenant);
    var policyTag = NormalizeTag(policy);

    var latencyTags = new TagList();
    latencyTags.Add("tenant", tenantTag);
    latencyTags.Add("policy", policyTag);

    EvaluationLatencyHistogram.Record(seconds, latencyTags);
}
/// <summary>
/// Adds one to the <c>policy_requests_total</c> counter.
/// </summary>
/// <param name="endpoint">API endpoint name, emitted as the "endpoint" tag.</param>
/// <param name="method">HTTP method, emitted as the "method" tag.</param>
public static void RecordRequest(string endpoint, string method)
{
    var endpointTag = NormalizeTag(endpoint);
    var methodTag = NormalizeTag(method);

    var requestTags = new TagList();
    requestTags.Add("endpoint", endpointTag);
    requestTags.Add("method", methodTag);

    RequestsCounter.Add(1, requestTags);
}
/// <summary>
/// Adds one to the <c>policy_evaluations_total</c> counter.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="policy">Policy identifier, emitted as the "policy" tag.</param>
/// <param name="mode">Evaluation mode (full, incremental, simulate), emitted as the "mode" tag.</param>
public static void RecordEvaluation(string tenant, string policy, string mode)
{
    var tenantTag = NormalizeTenant(tenant);
    var policyTag = NormalizeTag(policy);
    var modeTag = NormalizeTag(mode);

    var evaluationTags = new TagList();
    evaluationTags.Add("tenant", tenantTag);
    evaluationTags.Add("policy", policyTag);
    evaluationTags.Add("mode", modeTag);

    EvaluationsCounter.Add(1, evaluationTags);
}
/// <summary>
/// Adds materialized findings to the
/// <c>policy_findings_materialized_total</c> counter.
/// </summary>
/// <param name="tenant">Tenant identifier, emitted as the "tenant" tag.</param>
/// <param name="policy">Policy identifier, emitted as the "policy" tag.</param>
/// <param name="count">Number of findings materialized.</param>
public static void RecordFindingsMaterialized(string tenant, string policy, long count)
{
    var tenantTag = NormalizeTenant(tenant);
    var policyTag = NormalizeTag(policy);

    var findingTags = new TagList();
    findingTags.Add("tenant", tenantTag);
    findingTags.Add("policy", policyTag);

    FindingsMaterializedCounter.Add(count, findingTags);
}
/// <summary>
/// Adds one to the <c>policy_errors_total</c> counter.
/// </summary>
/// <param name="errorType">Error type (compilation, evaluation, api, storage), emitted as the "type" tag.</param>
/// <param name="tenant">Optional tenant identifier; passed through tenant normalization and emitted as the "tenant" tag.</param>
public static void RecordError(string errorType, string? tenant = null)
{
    var typeTag = NormalizeTag(errorType);
    var tenantTag = NormalizeTenant(tenant);

    var errorTags = new TagList();
    errorTags.Add("type", typeTag);
    errorTags.Add("tenant", tenantTag);

    ErrorsCounter.Add(1, errorTags);
}
/// <summary>
/// Adds one to the <c>policy_api_errors_total</c> counter.
/// </summary>
/// <param name="endpoint">API endpoint name, emitted as the "endpoint" tag.</param>
/// <param name="statusCode">HTTP status code, emitted as the "status_code" tag.</param>
public static void RecordApiError(string endpoint, int statusCode)
{
    var tags = new TagList
    {
        { "endpoint", NormalizeTag(endpoint) },
        // Tag values are machine-readable, so format with the invariant culture
        // rather than whatever culture the calling thread happens to use.
        { "status_code", statusCode.ToString(System.Globalization.CultureInfo.InvariantCulture) },
    };
    ApiErrorsCounter.Add(1, tags);
}
/// <summary>
/// Counts a single failed policy evaluation, tagged by tenant, policy and reason.
/// </summary>
/// <param name="tenant">Tenant identifier; a null or blank value is reported as "default".</param>
/// <param name="policy">Policy identifier; a null or blank value is reported as "unknown".</param>
/// <param name="reason">Failure reason (timeout, determinism, storage, canceled); a null or blank value is reported as "unknown".</param>
public static void RecordEvaluationFailure(string tenant, string policy, string reason)
{
    var tenantTag = NormalizeTenant(tenant);
    var policyTag = NormalizeTag(policy);
    var reasonTag = NormalizeTag(reason);

    var failureTags = new TagList();
    failureTags.Add("tenant", tenantTag);
    failureTags.Add("policy", policyTag);
    failureTags.Add("reason", reasonTag);

    EvaluationFailuresCounter.Add(1, failureTags);
}
/// <summary>
/// Counts a single SLO violation, tagged by the name of the violated SLO.
/// </summary>
/// <param name="sloName">Name of the SLO that was violated; a null or blank value is reported as "unknown".</param>
public static void RecordSloViolation(string sloName)
{
    var sloNameTag = NormalizeTag(sloName);

    var violationTags = new TagList();
    violationTags.Add("slo_name", sloNameTag);

    SloViolationsCounter.Add(1, violationTags);
}
/// <summary>
/// Registers a callback to observe concurrent evaluations measurements.
/// </summary>
/// <param name="observeFunc">Function that returns current concurrent evaluations measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
/// <remarks>
/// The callback is not invoked at registration time. It is wrapped in a lazy sequence
/// that calls it again on every enumeration, so each read sees current values.
/// </remarks>
public static void RegisterConcurrentEvaluationsObservation(Func<IEnumerable<Measurement<int>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Storing observeFunc() directly would freeze a one-time snapshot taken at
    // registration. Store a deferred sequence that re-invokes the callback instead.
    // NOTE(review): assumes the consumer enumerates ConcurrentEvaluationsObservations
    // on each collection — confirm against the gauge declaration.
    ConcurrentEvaluationsObservations = Observe();

    IEnumerable<Measurement<int>> Observe()
    {
        var measurements = observeFunc();
        if (measurements is null)
        {
            // A callback that yields nothing is treated as "no measurements".
            yield break;
        }

        foreach (var measurement in measurements)
        {
            yield return measurement;
        }
    }
}
/// <summary>
/// Registers a callback to observe worker utilization measurements.
/// </summary>
/// <param name="observeFunc">Function that returns current worker utilization measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
/// <remarks>
/// The callback is not invoked at registration time. It is wrapped in a lazy sequence
/// that calls it again on every enumeration, so each read sees current values.
/// </remarks>
public static void RegisterWorkerUtilizationObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Storing observeFunc() directly would freeze a one-time snapshot taken at
    // registration. Store a deferred sequence that re-invokes the callback instead.
    // NOTE(review): assumes the consumer enumerates WorkerUtilizationObservations
    // on each collection — confirm against the gauge declaration.
    WorkerUtilizationObservations = Observe();

    IEnumerable<Measurement<double>> Observe()
    {
        var measurements = observeFunc();
        if (measurements is null)
        {
            // A callback that yields nothing is treated as "no measurements".
            yield break;
        }

        foreach (var measurement in measurements)
        {
            yield return measurement;
        }
    }
}
/// <summary>
/// Registers a callback to observe SLO burn rate measurements.
/// </summary>
/// <param name="observeFunc">Function that returns current SLO burn rate measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
/// <remarks>
/// The callback is not invoked at registration time. It is wrapped in a lazy sequence
/// that calls it again on every enumeration, so each read sees current values.
/// </remarks>
public static void RegisterSloBurnRateObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Storing observeFunc() directly would freeze a one-time snapshot taken at
    // registration. Store a deferred sequence that re-invokes the callback instead.
    // NOTE(review): assumes the consumer enumerates SloBurnRateObservations
    // on each collection — confirm against the gauge declaration.
    SloBurnRateObservations = Observe();

    IEnumerable<Measurement<double>> Observe()
    {
        var measurements = observeFunc();
        if (measurements is null)
        {
            // A callback that yields nothing is treated as "no measurements".
            yield break;
        }

        foreach (var measurement in measurements)
        {
            yield return measurement;
        }
    }
}
/// <summary>
/// Registers a callback to observe error budget measurements.
/// </summary>
/// <param name="observeFunc">Function that returns current error budget measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
/// <remarks>
/// The callback is not invoked at registration time. It is wrapped in a lazy sequence
/// that calls it again on every enumeration, so each read sees current values.
/// </remarks>
public static void RegisterErrorBudgetObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Storing observeFunc() directly would freeze a one-time snapshot taken at
    // registration. Store a deferred sequence that re-invokes the callback instead.
    // NOTE(review): assumes the consumer enumerates ErrorBudgetObservations
    // on each collection — confirm against the gauge declaration.
    ErrorBudgetObservations = Observe();

    IEnumerable<Measurement<double>> Observe()
    {
        var measurements = observeFunc();
        if (measurements is null)
        {
            // A callback that yields nothing is treated as "no measurements".
            yield break;
        }

        foreach (var measurement in measurements)
        {
            yield return measurement;
        }
    }
}
#endregion
/// <summary>
/// Starts an activity named "policy.select" for selection layer operations.
/// </summary>
/// <param name="tenant">Tenant identifier; a null or blank value is tagged as "default".</param>
/// <param name="policyId">Policy identifier; a null or blank value is tagged as "unknown".</param>
/// <returns>The started activity, or null if the activity source did not create one (e.g. not sampled).</returns>
public static Activity? StartSelectActivity(string? tenant, string? policyId)
{
    var activity = ActivitySource.StartActivity("policy.select", ActivityKind.Internal);
    if (activity is null)
    {
        return null;
    }

    activity.SetTag("tenant", NormalizeTenant(tenant));
    // Use the shared helper so empty/whitespace ids become "unknown" too,
    // matching how the tenant and the metric tags are normalized.
    activity.SetTag("policy.id", NormalizeTag(policyId));
    return activity;
}
/// <summary>
/// Starts an activity named "policy.evaluate" for policy evaluation.
/// </summary>
/// <param name="tenant">Tenant identifier; a null or blank value is tagged as "default".</param>
/// <param name="policyId">Policy identifier; a null or blank value is tagged as "unknown".</param>
/// <param name="runId">Run identifier; a null or blank value is tagged as "unknown".</param>
/// <returns>The started activity, or null if the activity source did not create one (e.g. not sampled).</returns>
public static Activity? StartEvaluateActivity(string? tenant, string? policyId, string? runId)
{
    var activity = ActivitySource.StartActivity("policy.evaluate", ActivityKind.Internal);
    if (activity is null)
    {
        return null;
    }

    activity.SetTag("tenant", NormalizeTenant(tenant));
    // Use the shared helper so empty/whitespace ids become "unknown" too,
    // matching how the tenant and the metric tags are normalized.
    activity.SetTag("policy.id", NormalizeTag(policyId));
    activity.SetTag("run.id", NormalizeTag(runId));
    return activity;
}
/// <summary>
/// Starts an activity named "policy.materialize" for materialization operations.
/// </summary>
/// <param name="tenant">Tenant identifier; a null or blank value is tagged as "default".</param>
/// <param name="policyId">Policy identifier; a null or blank value is tagged as "unknown".</param>
/// <param name="batchSize">Number of items in the batch; recorded as the "batch.size" tag.</param>
/// <returns>The started activity, or null if the activity source did not create one (e.g. not sampled).</returns>
public static Activity? StartMaterializeActivity(string? tenant, string? policyId, int batchSize)
{
    var activity = ActivitySource.StartActivity("policy.materialize", ActivityKind.Internal);
    if (activity is null)
    {
        return null;
    }

    activity.SetTag("tenant", NormalizeTenant(tenant));
    // Use the shared helper so empty/whitespace ids become "unknown" too,
    // matching how the tenant and the metric tags are normalized.
    activity.SetTag("policy.id", NormalizeTag(policyId));
    activity.SetTag("batch.size", batchSize);
    return activity;
}
/// <summary>
/// Starts an activity named "policy.simulate" for simulation operations.
/// </summary>
/// <param name="tenant">Tenant identifier; a null or blank value is tagged as "default".</param>
/// <param name="policyId">Policy identifier; a null or blank value is tagged as "unknown".</param>
/// <returns>The started activity, or null if the activity source did not create one (e.g. not sampled).</returns>
public static Activity? StartSimulateActivity(string? tenant, string? policyId)
{
    var activity = ActivitySource.StartActivity("policy.simulate", ActivityKind.Internal);
    if (activity is null)
    {
        return null;
    }

    activity.SetTag("tenant", NormalizeTenant(tenant));
    // Use the shared helper so empty/whitespace ids become "unknown" too,
    // matching how the tenant and the metric tags are normalized.
    activity.SetTag("policy.id", NormalizeTag(policyId));
    return activity;
}
/// <summary>
/// Starts an activity named "policy.compile" for compilation operations.
/// </summary>
/// <param name="policyId">Policy identifier; a null or blank value is tagged as "unknown".</param>
/// <param name="version">Policy version; a null or blank value is tagged as "unknown".</param>
/// <returns>The started activity, or null if the activity source did not create one (e.g. not sampled).</returns>
public static Activity? StartCompileActivity(string? policyId, string? version)
{
    var activity = ActivitySource.StartActivity("policy.compile", ActivityKind.Internal);
    if (activity is null)
    {
        return null;
    }

    // Use the shared helper so empty/whitespace values become "unknown" too,
    // matching how the metric tags are normalized.
    activity.SetTag("policy.id", NormalizeTag(policyId));
    activity.SetTag("policy.version", NormalizeTag(version));
    return activity;
}
// Maps a null, empty or whitespace-only tenant to "default"; any other value is returned unchanged.
private static string NormalizeTenant(string? tenant)
{
    if (string.IsNullOrWhiteSpace(tenant))
    {
        return "default";
    }

    return tenant;
}
// Maps a null, empty or whitespace-only tag value to "unknown"; any other value is returned unchanged.
private static string NormalizeTag(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return "unknown";
    }

    return value;
}
}