up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
api-governance / spectral-lint (push) Has been cancelled
oas-ci / oas-validate (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled

This commit is contained in:
master
2025-11-27 15:05:48 +02:00
parent 4831c7fcb0
commit e950474a77
278 changed files with 81498 additions and 672 deletions

View File

@@ -0,0 +1,646 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Policy.Engine.Telemetry;
/// <summary>
/// Telemetry instrumentation for the Policy Engine service.
/// Provides metrics, traces, and structured logging correlation.
/// </summary>
/// <remarks>
/// Every instrument is created on one static meter named <see cref="MeterName"/> and every
/// activity is started from <see cref="ActivitySource"/>. Tag values are passed through
/// <c>NormalizeTag</c> / <c>NormalizeTenant</c> so that null, empty or whitespace-only
/// inputs are reported as <c>"unknown"</c> / <c>"default"</c> instead of blank labels.
/// </remarks>
public static class PolicyEngineTelemetry
{
    /// <summary>
    /// The name of the meter used for Policy Engine metrics.
    /// </summary>
    public const string MeterName = "StellaOps.Policy.Engine";

    /// <summary>
    /// The name of the activity source used for Policy Engine traces.
    /// </summary>
    public const string ActivitySourceName = "StellaOps.Policy.Engine";

    private static readonly Meter Meter = new(MeterName);

    /// <summary>
    /// The activity source used for Policy Engine traces.
    /// </summary>
    public static readonly ActivitySource ActivitySource = new(ActivitySourceName);

    // Callbacks feeding the observable gauges. They stay null until the matching
    // Register*Observation method is called; the gauges report no measurements until then.
    // The delegates (not their results) are stored so every collection sees fresh values.
    private static Func<IEnumerable<Measurement<int>>>? QueueDepthSource;
    private static Func<IEnumerable<Measurement<int>>>? ConcurrentEvaluationsSource;
    private static Func<IEnumerable<Measurement<double>>>? WorkerUtilizationSource;
    private static Func<IEnumerable<Measurement<double>>>? SloBurnRateSource;
    private static Func<IEnumerable<Measurement<double>>>? ErrorBudgetSource;

    // Histogram: policy_run_seconds{mode,tenant,policy}
    private static readonly Histogram<double> PolicyRunSecondsHistogram =
        Meter.CreateHistogram<double>(
            "policy_run_seconds",
            unit: "s",
            description: "Duration of policy evaluation runs.");

    // Gauge: policy_run_queue_depth{tenant}
    // The callback is passed positionally (with an explicit type argument) so that overload
    // resolution picks the overload taking a sequence of measurements.
    private static readonly ObservableGauge<int> PolicyRunQueueDepthGauge =
        Meter.CreateObservableGauge<int>(
            "policy_run_queue_depth",
            () => Observe(QueueDepthSource),
            unit: "jobs",
            description: "Current depth of pending policy run jobs per tenant.");

    // Counter: policy_rules_fired_total{policy,rule}
    private static readonly Counter<long> PolicyRulesFiredCounter =
        Meter.CreateCounter<long>(
            "policy_rules_fired_total",
            unit: "rules",
            description: "Total number of policy rules that fired during evaluation.");

    // Counter: policy_vex_overrides_total{policy,vendor}
    private static readonly Counter<long> PolicyVexOverridesCounter =
        Meter.CreateCounter<long>(
            "policy_vex_overrides_total",
            unit: "overrides",
            description: "Total number of VEX overrides applied during policy evaluation.");

    // Counter: policy_compilation_total{outcome}
    private static readonly Counter<long> PolicyCompilationCounter =
        Meter.CreateCounter<long>(
            "policy_compilation_total",
            unit: "compilations",
            description: "Total number of policy compilations attempted.");

    // Histogram: policy_compilation_seconds
    private static readonly Histogram<double> PolicyCompilationSecondsHistogram =
        Meter.CreateHistogram<double>(
            "policy_compilation_seconds",
            unit: "s",
            description: "Duration of policy compilation.");

    // Counter: policy_simulation_total{tenant,outcome}
    private static readonly Counter<long> PolicySimulationCounter =
        Meter.CreateCounter<long>(
            "policy_simulation_total",
            unit: "simulations",
            description: "Total number of policy simulations executed.");

    #region Golden Signals - Latency

    // Histogram: policy_api_latency_seconds{endpoint,method,status}
    private static readonly Histogram<double> ApiLatencyHistogram =
        Meter.CreateHistogram<double>(
            "policy_api_latency_seconds",
            unit: "s",
            description: "API request latency by endpoint.");

    // Histogram: policy_evaluation_latency_seconds{tenant,policy}
    private static readonly Histogram<double> EvaluationLatencyHistogram =
        Meter.CreateHistogram<double>(
            "policy_evaluation_latency_seconds",
            unit: "s",
            description: "Policy evaluation latency per batch.");

    #endregion

    #region Golden Signals - Traffic

    // Counter: policy_requests_total{endpoint,method}
    private static readonly Counter<long> RequestsCounter =
        Meter.CreateCounter<long>(
            "policy_requests_total",
            unit: "requests",
            description: "Total API requests by endpoint and method.");

    // Counter: policy_evaluations_total{tenant,policy,mode}
    private static readonly Counter<long> EvaluationsCounter =
        Meter.CreateCounter<long>(
            "policy_evaluations_total",
            unit: "evaluations",
            description: "Total policy evaluations by tenant, policy, and mode.");

    // Counter: policy_findings_materialized_total{tenant,policy}
    private static readonly Counter<long> FindingsMaterializedCounter =
        Meter.CreateCounter<long>(
            "policy_findings_materialized_total",
            unit: "findings",
            description: "Total findings materialized during policy evaluation.");

    #endregion

    #region Golden Signals - Errors

    // Counter: policy_errors_total{type,tenant}
    private static readonly Counter<long> ErrorsCounter =
        Meter.CreateCounter<long>(
            "policy_errors_total",
            unit: "errors",
            description: "Total errors by type (compilation, evaluation, api, storage).");

    // Counter: policy_api_errors_total{endpoint,status_code}
    private static readonly Counter<long> ApiErrorsCounter =
        Meter.CreateCounter<long>(
            "policy_api_errors_total",
            unit: "errors",
            description: "Total API errors by endpoint and status code.");

    // Counter: policy_evaluation_failures_total{tenant,policy,reason}
    private static readonly Counter<long> EvaluationFailuresCounter =
        Meter.CreateCounter<long>(
            "policy_evaluation_failures_total",
            unit: "failures",
            description: "Total evaluation failures by reason (timeout, determinism, storage, canceled).");

    #endregion

    #region Golden Signals - Saturation

    // Gauge: policy_concurrent_evaluations{tenant}
    private static readonly ObservableGauge<int> ConcurrentEvaluationsGauge =
        Meter.CreateObservableGauge<int>(
            "policy_concurrent_evaluations",
            () => Observe(ConcurrentEvaluationsSource),
            unit: "evaluations",
            description: "Current number of concurrent policy evaluations.");

    // Gauge: policy_worker_utilization
    private static readonly ObservableGauge<double> WorkerUtilizationGauge =
        Meter.CreateObservableGauge<double>(
            "policy_worker_utilization",
            () => Observe(WorkerUtilizationSource),
            unit: "ratio",
            description: "Worker pool utilization ratio (0.0 to 1.0).");

    #endregion

    #region SLO Metrics

    // Gauge: policy_slo_burn_rate{slo_name}
    private static readonly ObservableGauge<double> SloBurnRateGauge =
        Meter.CreateObservableGauge<double>(
            "policy_slo_burn_rate",
            () => Observe(SloBurnRateSource),
            unit: "ratio",
            description: "SLO burn rate over configured window.");

    // Gauge: policy_error_budget_remaining{slo_name}
    private static readonly ObservableGauge<double> ErrorBudgetRemainingGauge =
        Meter.CreateObservableGauge<double>(
            "policy_error_budget_remaining",
            () => Observe(ErrorBudgetSource),
            unit: "ratio",
            description: "Remaining error budget as ratio (0.0 to 1.0).");

    // Counter: policy_slo_violations_total{slo_name}
    private static readonly Counter<long> SloViolationsCounter =
        Meter.CreateCounter<long>(
            "policy_slo_violations_total",
            unit: "violations",
            description: "Total SLO violations detected.");

    #endregion

    #region Risk Scoring Metrics

    // Counter: policy_risk_scoring_jobs_created_total
    private static readonly Counter<long> RiskScoringJobsCreatedCounter =
        Meter.CreateCounter<long>(
            "policy_risk_scoring_jobs_created_total",
            unit: "jobs",
            description: "Total risk scoring jobs created.");

    // Counter: policy_risk_scoring_triggers_skipped_total
    private static readonly Counter<long> RiskScoringTriggersSkippedCounter =
        Meter.CreateCounter<long>(
            "policy_risk_scoring_triggers_skipped_total",
            unit: "triggers",
            description: "Total risk scoring triggers skipped due to deduplication.");

    // Histogram: policy_risk_scoring_duration_seconds
    private static readonly Histogram<double> RiskScoringDurationHistogram =
        Meter.CreateHistogram<double>(
            "policy_risk_scoring_duration_seconds",
            unit: "s",
            description: "Duration of risk scoring job execution.");

    // Counter: policy_risk_scoring_findings_scored_total
    private static readonly Counter<long> RiskScoringFindingsScoredCounter =
        Meter.CreateCounter<long>(
            "policy_risk_scoring_findings_scored_total",
            unit: "findings",
            description: "Total findings scored by risk scoring jobs.");

    /// <summary>
    /// Counter for risk scoring jobs created.
    /// </summary>
    public static Counter<long> RiskScoringJobsCreated => RiskScoringJobsCreatedCounter;

    /// <summary>
    /// Counter for risk scoring triggers skipped.
    /// </summary>
    public static Counter<long> RiskScoringTriggersSkipped => RiskScoringTriggersSkippedCounter;

    /// <summary>
    /// Records risk scoring duration.
    /// </summary>
    /// <param name="seconds">Duration in seconds.</param>
    /// <param name="profileId">Profile identifier.</param>
    /// <param name="findingCount">Number of findings scored.</param>
    public static void RecordRiskScoringDuration(double seconds, string profileId, int findingCount)
    {
        // NOTE(review): "finding_count" is an unbounded tag value and can inflate the number of
        // time series; it is kept because existing dashboards may rely on it - confirm before removing.
        var tags = new TagList
        {
            { "profile_id", NormalizeTag(profileId) },
            { "finding_count", FormatInvariant(findingCount) },
        };
        RiskScoringDurationHistogram.Record(seconds, tags);
    }

    /// <summary>
    /// Records findings scored by risk scoring.
    /// </summary>
    /// <param name="profileId">Profile identifier.</param>
    /// <param name="count">Number of findings scored.</param>
    public static void RecordFindingsScored(string profileId, long count)
    {
        var tags = new TagList
        {
            { "profile_id", NormalizeTag(profileId) },
        };
        RiskScoringFindingsScoredCounter.Add(count, tags);
    }

    #endregion

    /// <summary>
    /// Registers the callback that supplies queue depth measurements.
    /// </summary>
    /// <remarks>
    /// The callback is stored and invoked each time the gauge is observed, so it should be cheap
    /// and should not throw. A later registration replaces the earlier one.
    /// </remarks>
    /// <param name="observeFunc">Function that returns current queue depth measurements.</param>
    /// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
    public static void RegisterQueueDepthObservation(Func<IEnumerable<Measurement<int>>> observeFunc)
    {
        ArgumentNullException.ThrowIfNull(observeFunc);
        QueueDepthSource = observeFunc;
    }

    /// <summary>
    /// Records the duration of a policy run.
    /// </summary>
    /// <param name="seconds">Duration in seconds.</param>
    /// <param name="mode">Run mode (full, incremental, simulate).</param>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policy">Policy identifier.</param>
    /// <param name="outcome">Outcome of the run (success, failure, canceled).</param>
    public static void RecordRunDuration(double seconds, string mode, string tenant, string policy, string outcome)
    {
        var tags = new TagList
        {
            { "mode", NormalizeTag(mode) },
            { "tenant", NormalizeTenant(tenant) },
            { "policy", NormalizeTag(policy) },
            { "outcome", NormalizeTag(outcome) },
        };
        PolicyRunSecondsHistogram.Record(seconds, tags);
    }

    /// <summary>
    /// Records that a policy rule fired during evaluation.
    /// </summary>
    /// <param name="policy">Policy identifier.</param>
    /// <param name="rule">Rule identifier.</param>
    /// <param name="count">Number of times the rule fired.</param>
    public static void RecordRuleFired(string policy, string rule, long count = 1)
    {
        var tags = new TagList
        {
            { "policy", NormalizeTag(policy) },
            { "rule", NormalizeTag(rule) },
        };
        PolicyRulesFiredCounter.Add(count, tags);
    }

    /// <summary>
    /// Records a VEX override applied during policy evaluation.
    /// </summary>
    /// <param name="policy">Policy identifier.</param>
    /// <param name="vendor">VEX vendor identifier.</param>
    /// <param name="count">Number of overrides.</param>
    public static void RecordVexOverride(string policy, string vendor, long count = 1)
    {
        var tags = new TagList
        {
            { "policy", NormalizeTag(policy) },
            { "vendor", NormalizeTag(vendor) },
        };
        PolicyVexOverridesCounter.Add(count, tags);
    }

    /// <summary>
    /// Records a policy compilation attempt: increments the compilation counter and records
    /// the duration, both tagged with the same outcome.
    /// </summary>
    /// <param name="outcome">Outcome (success, failure).</param>
    /// <param name="seconds">Duration in seconds.</param>
    public static void RecordCompilation(string outcome, double seconds)
    {
        var tags = new TagList
        {
            { "outcome", NormalizeTag(outcome) },
        };
        PolicyCompilationCounter.Add(1, tags);
        PolicyCompilationSecondsHistogram.Record(seconds, tags);
    }

    /// <summary>
    /// Records a policy simulation execution.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="outcome">Outcome (success, failure).</param>
    public static void RecordSimulation(string tenant, string outcome)
    {
        var tags = new TagList
        {
            { "tenant", NormalizeTenant(tenant) },
            { "outcome", NormalizeTag(outcome) },
        };
        PolicySimulationCounter.Add(1, tags);
    }

    #region Golden Signals - Recording Methods

    /// <summary>
    /// Records API request latency.
    /// </summary>
    /// <param name="seconds">Latency in seconds.</param>
    /// <param name="endpoint">API endpoint name.</param>
    /// <param name="method">HTTP method.</param>
    /// <param name="statusCode">HTTP status code.</param>
    public static void RecordApiLatency(double seconds, string endpoint, string method, int statusCode)
    {
        var tags = new TagList
        {
            { "endpoint", NormalizeTag(endpoint) },
            { "method", NormalizeTag(method) },
            { "status", FormatInvariant(statusCode) },
        };
        ApiLatencyHistogram.Record(seconds, tags);
    }

    /// <summary>
    /// Records policy evaluation latency for a batch.
    /// </summary>
    /// <param name="seconds">Latency in seconds.</param>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policy">Policy identifier.</param>
    public static void RecordEvaluationLatency(double seconds, string tenant, string policy)
    {
        var tags = new TagList
        {
            { "tenant", NormalizeTenant(tenant) },
            { "policy", NormalizeTag(policy) },
        };
        EvaluationLatencyHistogram.Record(seconds, tags);
    }

    /// <summary>
    /// Records an API request.
    /// </summary>
    /// <param name="endpoint">API endpoint name.</param>
    /// <param name="method">HTTP method.</param>
    public static void RecordRequest(string endpoint, string method)
    {
        var tags = new TagList
        {
            { "endpoint", NormalizeTag(endpoint) },
            { "method", NormalizeTag(method) },
        };
        RequestsCounter.Add(1, tags);
    }

    /// <summary>
    /// Records a policy evaluation execution.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policy">Policy identifier.</param>
    /// <param name="mode">Evaluation mode (full, incremental, simulate).</param>
    public static void RecordEvaluation(string tenant, string policy, string mode)
    {
        var tags = new TagList
        {
            { "tenant", NormalizeTenant(tenant) },
            { "policy", NormalizeTag(policy) },
            { "mode", NormalizeTag(mode) },
        };
        EvaluationsCounter.Add(1, tags);
    }

    /// <summary>
    /// Records findings materialized during policy evaluation.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policy">Policy identifier.</param>
    /// <param name="count">Number of findings materialized.</param>
    public static void RecordFindingsMaterialized(string tenant, string policy, long count)
    {
        var tags = new TagList
        {
            { "tenant", NormalizeTenant(tenant) },
            { "policy", NormalizeTag(policy) },
        };
        FindingsMaterializedCounter.Add(count, tags);
    }

    /// <summary>
    /// Records an error.
    /// </summary>
    /// <param name="errorType">Error type (compilation, evaluation, api, storage).</param>
    /// <param name="tenant">Tenant identifier; reported as "default" when omitted or blank.</param>
    public static void RecordError(string errorType, string? tenant = null)
    {
        var tags = new TagList
        {
            { "type", NormalizeTag(errorType) },
            { "tenant", NormalizeTenant(tenant) },
        };
        ErrorsCounter.Add(1, tags);
    }

    /// <summary>
    /// Records an API error.
    /// </summary>
    /// <param name="endpoint">API endpoint name.</param>
    /// <param name="statusCode">HTTP status code.</param>
    public static void RecordApiError(string endpoint, int statusCode)
    {
        var tags = new TagList
        {
            { "endpoint", NormalizeTag(endpoint) },
            { "status_code", FormatInvariant(statusCode) },
        };
        ApiErrorsCounter.Add(1, tags);
    }

    /// <summary>
    /// Records an evaluation failure.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policy">Policy identifier.</param>
    /// <param name="reason">Failure reason (timeout, determinism, storage, canceled).</param>
    public static void RecordEvaluationFailure(string tenant, string policy, string reason)
    {
        var tags = new TagList
        {
            { "tenant", NormalizeTenant(tenant) },
            { "policy", NormalizeTag(policy) },
            { "reason", NormalizeTag(reason) },
        };
        EvaluationFailuresCounter.Add(1, tags);
    }

    /// <summary>
    /// Records an SLO violation.
    /// </summary>
    /// <param name="sloName">Name of the SLO that was violated.</param>
    public static void RecordSloViolation(string sloName)
    {
        var tags = new TagList
        {
            { "slo_name", NormalizeTag(sloName) },
        };
        SloViolationsCounter.Add(1, tags);
    }

    /// <summary>
    /// Registers the callback that supplies concurrent evaluations measurements.
    /// The callback is invoked on every observation; a later registration replaces the earlier one.
    /// </summary>
    /// <param name="observeFunc">Function that returns current concurrent evaluations measurements.</param>
    /// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
    public static void RegisterConcurrentEvaluationsObservation(Func<IEnumerable<Measurement<int>>> observeFunc)
    {
        ArgumentNullException.ThrowIfNull(observeFunc);
        ConcurrentEvaluationsSource = observeFunc;
    }

    /// <summary>
    /// Registers the callback that supplies worker utilization measurements.
    /// The callback is invoked on every observation; a later registration replaces the earlier one.
    /// </summary>
    /// <param name="observeFunc">Function that returns current worker utilization measurements.</param>
    /// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
    public static void RegisterWorkerUtilizationObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
    {
        ArgumentNullException.ThrowIfNull(observeFunc);
        WorkerUtilizationSource = observeFunc;
    }

    /// <summary>
    /// Registers the callback that supplies SLO burn rate measurements.
    /// The callback is invoked on every observation; a later registration replaces the earlier one.
    /// </summary>
    /// <param name="observeFunc">Function that returns current SLO burn rate measurements.</param>
    /// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
    public static void RegisterSloBurnRateObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
    {
        ArgumentNullException.ThrowIfNull(observeFunc);
        SloBurnRateSource = observeFunc;
    }

    /// <summary>
    /// Registers the callback that supplies error budget measurements.
    /// The callback is invoked on every observation; a later registration replaces the earlier one.
    /// </summary>
    /// <param name="observeFunc">Function that returns current error budget measurements.</param>
    /// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is null.</exception>
    public static void RegisterErrorBudgetObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
    {
        ArgumentNullException.ThrowIfNull(observeFunc);
        ErrorBudgetSource = observeFunc;
    }

    #endregion

    /// <summary>
    /// Starts an activity for selection layer operations.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policyId">Policy identifier.</param>
    /// <returns>The started activity, or null if not sampled.</returns>
    public static Activity? StartSelectActivity(string? tenant, string? policyId)
    {
        var activity = ActivitySource.StartActivity("policy.select", ActivityKind.Internal);
        if (activity is null)
        {
            return null;
        }

        activity.SetTag("tenant", NormalizeTenant(tenant));
        activity.SetTag("policy.id", NormalizeTag(policyId));
        return activity;
    }

    /// <summary>
    /// Starts an activity for policy evaluation.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policyId">Policy identifier.</param>
    /// <param name="runId">Run identifier.</param>
    /// <returns>The started activity, or null if not sampled.</returns>
    public static Activity? StartEvaluateActivity(string? tenant, string? policyId, string? runId)
    {
        var activity = ActivitySource.StartActivity("policy.evaluate", ActivityKind.Internal);
        if (activity is null)
        {
            return null;
        }

        activity.SetTag("tenant", NormalizeTenant(tenant));
        activity.SetTag("policy.id", NormalizeTag(policyId));
        activity.SetTag("run.id", NormalizeTag(runId));
        return activity;
    }

    /// <summary>
    /// Starts an activity for materialization operations.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policyId">Policy identifier.</param>
    /// <param name="batchSize">Number of items in the batch.</param>
    /// <returns>The started activity, or null if not sampled.</returns>
    public static Activity? StartMaterializeActivity(string? tenant, string? policyId, int batchSize)
    {
        var activity = ActivitySource.StartActivity("policy.materialize", ActivityKind.Internal);
        if (activity is null)
        {
            return null;
        }

        activity.SetTag("tenant", NormalizeTenant(tenant));
        activity.SetTag("policy.id", NormalizeTag(policyId));
        activity.SetTag("batch.size", batchSize);
        return activity;
    }

    /// <summary>
    /// Starts an activity for simulation operations.
    /// </summary>
    /// <param name="tenant">Tenant identifier.</param>
    /// <param name="policyId">Policy identifier.</param>
    /// <returns>The started activity, or null if not sampled.</returns>
    public static Activity? StartSimulateActivity(string? tenant, string? policyId)
    {
        var activity = ActivitySource.StartActivity("policy.simulate", ActivityKind.Internal);
        if (activity is null)
        {
            return null;
        }

        activity.SetTag("tenant", NormalizeTenant(tenant));
        activity.SetTag("policy.id", NormalizeTag(policyId));
        return activity;
    }

    /// <summary>
    /// Starts an activity for compilation operations.
    /// </summary>
    /// <param name="policyId">Policy identifier.</param>
    /// <param name="version">Policy version.</param>
    /// <returns>The started activity, or null if not sampled.</returns>
    public static Activity? StartCompileActivity(string? policyId, string? version)
    {
        var activity = ActivitySource.StartActivity("policy.compile", ActivityKind.Internal);
        if (activity is null)
        {
            return null;
        }

        activity.SetTag("policy.id", NormalizeTag(policyId));
        activity.SetTag("policy.version", NormalizeTag(version));
        return activity;
    }

    // Invokes the registered gauge callback, yielding no measurements when nothing has been
    // registered yet or when the callback itself returns null.
    private static IEnumerable<Measurement<T>> Observe<T>(Func<IEnumerable<Measurement<T>>>? source)
        where T : struct
        => source?.Invoke() ?? Enumerable.Empty<Measurement<T>>();

    // Formats integer tag values independently of the current thread culture.
    private static string FormatInvariant(int value)
        => value.ToString(System.Globalization.CultureInfo.InvariantCulture);

    // Blank tenants are reported under the "default" tenant label.
    private static string NormalizeTenant(string? tenant)
        => string.IsNullOrWhiteSpace(tenant) ? "default" : tenant;

    // Blank tag values are reported as "unknown" so that labels are never empty.
    private static string NormalizeTag(string? value)
        => string.IsNullOrWhiteSpace(value) ? "unknown" : value;
}