up
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-11-27 23:44:42 +02:00
parent ef6e4b2067
commit 3b96b2e3ea
298 changed files with 47516 additions and 1168 deletions

View File

@@ -35,9 +35,9 @@ public static class PolicyEngineTelemetry
// Gauge: policy_run_queue_depth{tenant}
private static readonly ObservableGauge<int> PolicyRunQueueDepthGauge =
Meter.CreateObservableGauge(
Meter.CreateObservableGauge<int>(
"policy_run_queue_depth",
observeValue: () => QueueDepthObservations,
observeValues: () => QueueDepthObservations ?? Enumerable.Empty<Measurement<int>>(),
unit: "jobs",
description: "Current depth of pending policy run jobs per tenant.");
@@ -148,17 +148,17 @@ public static class PolicyEngineTelemetry
// Gauge: policy_concurrent_evaluations{tenant}
private static readonly ObservableGauge<int> ConcurrentEvaluationsGauge =
Meter.CreateObservableGauge(
Meter.CreateObservableGauge<int>(
"policy_concurrent_evaluations",
observeValue: () => ConcurrentEvaluationsObservations,
observeValues: () => ConcurrentEvaluationsObservations ?? Enumerable.Empty<Measurement<int>>(),
unit: "evaluations",
description: "Current number of concurrent policy evaluations.");
// Gauge: policy_worker_utilization
private static readonly ObservableGauge<double> WorkerUtilizationGauge =
Meter.CreateObservableGauge(
Meter.CreateObservableGauge<double>(
"policy_worker_utilization",
observeValue: () => WorkerUtilizationObservations,
observeValues: () => WorkerUtilizationObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "Worker pool utilization ratio (0.0 to 1.0).");
@@ -168,17 +168,17 @@ public static class PolicyEngineTelemetry
// Gauge: policy_slo_burn_rate{slo_name}
private static readonly ObservableGauge<double> SloBurnRateGauge =
Meter.CreateObservableGauge(
Meter.CreateObservableGauge<double>(
"policy_slo_burn_rate",
observeValue: () => SloBurnRateObservations,
observeValues: () => SloBurnRateObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "SLO burn rate over configured window.");
// Gauge: policy_error_budget_remaining{slo_name}
private static readonly ObservableGauge<double> ErrorBudgetRemainingGauge =
Meter.CreateObservableGauge(
Meter.CreateObservableGauge<double>(
"policy_error_budget_remaining",
observeValue: () => ErrorBudgetObservations,
observeValues: () => ErrorBudgetObservations ?? Enumerable.Empty<Measurement<double>>(),
unit: "ratio",
description: "Remaining error budget as ratio (0.0 to 1.0).");
@@ -265,6 +265,143 @@ public static class PolicyEngineTelemetry
#endregion
#region Risk Simulation and Events Metrics
// Counter: policy_risk_simulations_run_total
private static readonly Counter<long> RiskSimulationsRunCounter = Meter.CreateCounter<long>(
    name: "policy_risk_simulations_run_total",
    unit: "simulations",
    description: "Total risk simulations executed.");

// Counter: policy_profile_events_published_total
private static readonly Counter<long> ProfileEventsPublishedCounter = Meter.CreateCounter<long>(
    name: "policy_profile_events_published_total",
    unit: "events",
    description: "Total profile lifecycle events published.");

/// <summary>
/// Gets the counter instrument that tracks executed risk simulations.
/// </summary>
public static Counter<long> RiskSimulationsRun
{
    get { return RiskSimulationsRunCounter; }
}

/// <summary>
/// Gets the counter instrument that tracks published profile lifecycle events.
/// </summary>
public static Counter<long> ProfileEventsPublished
{
    get { return ProfileEventsPublishedCounter; }
}
#endregion
#region Reachability Metrics
// Counter: policy_reachability_applied_total{state}
private static readonly Counter<long> ReachabilityAppliedCounter = Meter.CreateCounter<long>(
    name: "policy_reachability_applied_total",
    unit: "facts",
    description: "Total reachability facts applied during policy evaluation.");

// Counter: policy_reachability_cache_hits_total
private static readonly Counter<long> ReachabilityCacheHitsCounter = Meter.CreateCounter<long>(
    name: "policy_reachability_cache_hits_total",
    unit: "hits",
    description: "Total reachability facts cache hits.");

// Counter: policy_reachability_cache_misses_total
private static readonly Counter<long> ReachabilityCacheMissesCounter = Meter.CreateCounter<long>(
    name: "policy_reachability_cache_misses_total",
    unit: "misses",
    description: "Total reachability facts cache misses.");

// Gauge: policy_reachability_cache_hit_ratio
// The callback reads the observation storage field each time it runs and
// falls back to an empty sequence when that field is null.
private static readonly ObservableGauge<double> ReachabilityCacheHitRatioGauge = Meter.CreateObservableGauge<double>(
    name: "policy_reachability_cache_hit_ratio",
    observeValues: () => ReachabilityCacheHitRatioObservations ?? Enumerable.Empty<Measurement<double>>(),
    unit: "ratio",
    description: "Reachability facts cache hit ratio (0.0 to 1.0).");

// Counter: policy_reachability_lookups_total{outcome}
private static readonly Counter<long> ReachabilityLookupsCounter = Meter.CreateCounter<long>(
    name: "policy_reachability_lookups_total",
    unit: "lookups",
    description: "Total reachability facts lookup operations.");

// Histogram: policy_reachability_lookup_seconds
private static readonly Histogram<double> ReachabilityLookupSecondsHistogram = Meter.CreateHistogram<double>(
    name: "policy_reachability_lookup_seconds",
    unit: "s",
    description: "Duration of reachability facts lookup operations.");

// Storage read by the cache hit ratio gauge callback; starts out empty.
private static IEnumerable<Measurement<double>> ReachabilityCacheHitRatioObservations =
    Enumerable.Empty<Measurement<double>>();
/// <summary>
/// Adds <paramref name="count"/> to the applied-reachability counter, tagged with the normalized state.
/// </summary>
/// <param name="state">Reachability state (reachable, unreachable, unknown, under_investigation).</param>
/// <param name="count">Number of facts to add; defaults to 1.</param>
public static void RecordReachabilityApplied(string state, long count = 1)
{
    var stateTags = new TagList();
    stateTags.Add("state", NormalizeTag(state));

    ReachabilityAppliedCounter.Add(count, stateTags);
}
/// <summary>
/// Adds the given number of hits to the reachability cache hit counter.
/// </summary>
/// <param name="count">Number of hits.</param>
public static void RecordReachabilityCacheHits(long count)
    => ReachabilityCacheHitsCounter.Add(count);
/// <summary>
/// Adds the given number of misses to the reachability cache miss counter.
/// </summary>
/// <param name="count">Number of misses.</param>
public static void RecordReachabilityCacheMisses(long count)
    => ReachabilityCacheMissesCounter.Add(count);
/// <summary>
/// Records a reachability lookup: adds <paramref name="batchSize"/> to the lookups counter
/// and records <paramref name="seconds"/> in the lookup duration histogram, both tagged
/// with the normalized outcome.
/// </summary>
/// <param name="outcome">Outcome (found, not_found, error).</param>
/// <param name="seconds">Duration in seconds.</param>
/// <param name="batchSize">Number of items looked up.</param>
public static void RecordReachabilityLookup(string outcome, double seconds, int batchSize)
{
    var outcomeTags = new TagList();
    outcomeTags.Add("outcome", NormalizeTag(outcome));

    ReachabilityLookupsCounter.Add(batchSize, outcomeTags);
    ReachabilityLookupSecondsHistogram.Record(seconds, outcomeTags);
}
/// <summary>
/// Registers a callback to observe reachability cache hit ratio.
/// </summary>
/// <remarks>
/// The callback is not invoked during registration. It is invoked every time the stored
/// observation sequence is enumerated, so each enumeration sees the measurements the
/// callback returns at that moment instead of a snapshot taken at registration time.
/// A <c>null</c> result from the callback produces no measurements. Exceptions thrown by
/// the callback surface during enumeration, not from this method.
/// </remarks>
/// <param name="observeFunc">Function that returns current cache hit ratio measurements.</param>
/// <exception cref="ArgumentNullException"><paramref name="observeFunc"/> is <c>null</c>.</exception>
public static void RegisterReachabilityCacheHitRatioObservation(Func<IEnumerable<Measurement<double>>> observeFunc)
{
    ArgumentNullException.ThrowIfNull(observeFunc);

    // Store a deferred sequence rather than the result of a single call: the iterator body
    // below re-runs (and therefore re-invokes the callback) on every enumeration.
    ReachabilityCacheHitRatioObservations = ObserveOnEnumeration(observeFunc);

    static IEnumerable<Measurement<double>> ObserveOnEnumeration(Func<IEnumerable<Measurement<double>>> callback)
    {
        var current = callback();
        if (current is null)
        {
            yield break;
        }

        foreach (var measurement in current)
        {
            yield return measurement;
        }
    }
}
#endregion
// Storage for observable gauge observations
private static IEnumerable<Measurement<int>> QueueDepthObservations = Enumerable.Empty<Measurement<int>>();
private static IEnumerable<Measurement<int>> ConcurrentEvaluationsObservations = Enumerable.Empty<Measurement<int>>();