Files
git.stella-ops.org/src/Scanner/StellaOps.Scanner.Worker/Diagnostics/ScannerWorkerMetrics.cs
StellaOps Bot 564df71bfb
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled
up
2025-12-13 00:20:26 +02:00

338 lines
13 KiB
C#

using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using StellaOps.Scanner.Surface.Secrets;
using StellaOps.Scanner.Worker.Processing;
namespace StellaOps.Scanner.Worker.Diagnostics;
/// <summary>
/// Holds the counters and histograms emitted by the scanner worker and exposes one
/// recording method per measurement. Every instrument is created on
/// <c>ScannerWorkerInstrumentation.Meter</c>, and every measurement is tagged through
/// the private <c>CreateTags</c> helper.
/// </summary>
public sealed class ScannerWorkerMetrics
{
    // Job lifecycle instruments.
    private readonly Histogram<double> _queueLatencyMs;
    private readonly Histogram<double> _jobDurationMs;
    private readonly Histogram<double> _stageDurationMs;
    private readonly Counter<long> _jobsCompleted;
    private readonly Counter<long> _jobsFailed;

    // Analyzer cache instruments.
    private readonly Counter<long> _languageCacheHits;
    private readonly Counter<long> _languageCacheMisses;
    private readonly Counter<long> _osCacheHits;
    private readonly Counter<long> _osCacheMisses;

    // Registry secret instruments.
    private readonly Counter<long> _registrySecretRequests;
    private readonly Histogram<double> _registrySecretTtlSeconds;

    // Surface manifest / payload instruments.
    private readonly Counter<long> _surfaceManifestsPublished;
    private readonly Counter<long> _surfaceManifestSkipped;
    private readonly Counter<long> _surfaceManifestFailures;
    private readonly Counter<long> _surfacePayloadPersisted;
    private readonly Histogram<double> _surfaceManifestPublishDurationMs;

    /// <summary>
    /// Creates all worker instruments on <c>ScannerWorkerInstrumentation.Meter</c>.
    /// </summary>
    public ScannerWorkerMetrics()
    {
        // All instruments are registered on the same meter, so resolve it once.
        var meter = ScannerWorkerInstrumentation.Meter;

        _queueLatencyMs = meter.CreateHistogram<double>(
            "scanner_worker_queue_latency_ms",
            unit: "ms",
            description: "Time from job enqueue to lease acquisition.");

        _jobDurationMs = meter.CreateHistogram<double>(
            "scanner_worker_job_duration_ms",
            unit: "ms",
            description: "Total processing duration per job.");

        _stageDurationMs = meter.CreateHistogram<double>(
            "scanner_worker_stage_duration_ms",
            unit: "ms",
            description: "Stage execution duration per job.");

        _jobsCompleted = meter.CreateCounter<long>(
            "scanner_worker_jobs_completed_total",
            description: "Number of successfully completed scan jobs.");

        _jobsFailed = meter.CreateCounter<long>(
            "scanner_worker_jobs_failed_total",
            description: "Number of scan jobs that failed permanently.");

        _languageCacheHits = meter.CreateCounter<long>(
            "scanner_worker_language_cache_hits_total",
            description: "Number of language analyzer cache hits encountered by the worker.");

        _languageCacheMisses = meter.CreateCounter<long>(
            "scanner_worker_language_cache_misses_total",
            description: "Number of language analyzer cache misses encountered by the worker.");

        _osCacheHits = meter.CreateCounter<long>(
            "scanner_worker_os_cache_hits_total",
            description: "Number of OS analyzer cache hits encountered by the worker.");

        _osCacheMisses = meter.CreateCounter<long>(
            "scanner_worker_os_cache_misses_total",
            description: "Number of OS analyzer cache misses encountered by the worker.");

        _registrySecretRequests = meter.CreateCounter<long>(
            "scanner_worker_registry_secret_requests_total",
            description: "Number of registry secret resolution attempts performed by the worker.");

        _registrySecretTtlSeconds = meter.CreateHistogram<double>(
            "scanner_worker_registry_secret_ttl_seconds",
            unit: "s",
            description: "Time-to-live in seconds for resolved registry secrets (earliest expiration).");

        _surfaceManifestsPublished = meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_published_total",
            description: "Number of surface manifests successfully published by the worker.");

        _surfaceManifestSkipped = meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_skipped_total",
            description: "Number of surface manifest publish attempts skipped due to missing payloads.");

        _surfaceManifestFailures = meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_failed_total",
            description: "Number of surface manifest publish attempts that failed.");

        _surfacePayloadPersisted = meter.CreateCounter<long>(
            "scanner_worker_surface_payload_persisted_total",
            description: "Number of surface payload artefacts persisted to the local cache.");

        _surfaceManifestPublishDurationMs = meter.CreateHistogram<double>(
            "scanner_worker_surface_manifest_publish_duration_ms",
            unit: "ms",
            description: "Duration in milliseconds to persist and publish surface manifests.");
    }

    /// <summary>
    /// Records the queue latency of a job in milliseconds.
    /// Zero or negative values are ignored.
    /// </summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="latency">Latency to record.</param>
    public void RecordQueueLatency(ScanJobContext context, TimeSpan latency)
    {
        if (latency > TimeSpan.Zero)
        {
            _queueLatencyMs.Record(latency.TotalMilliseconds, CreateTags(context));
        }
    }

    /// <summary>
    /// Records the total processing duration of a job in milliseconds.
    /// Zero or negative values are ignored.
    /// </summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="duration">Duration to record.</param>
    public void RecordJobDuration(ScanJobContext context, TimeSpan duration)
    {
        if (duration > TimeSpan.Zero)
        {
            _jobDurationMs.Record(duration.TotalMilliseconds, CreateTags(context));
        }
    }

    /// <summary>
    /// Records the duration of a single stage in milliseconds, tagged with the stage name.
    /// Zero or negative values are ignored.
    /// </summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="stage">Stage name emitted as the <c>stage</c> tag.</param>
    /// <param name="duration">Duration to record.</param>
    public void RecordStageDuration(ScanJobContext context, string stage, TimeSpan duration)
    {
        if (duration > TimeSpan.Zero)
        {
            _stageDurationMs.Record(duration.TotalMilliseconds, CreateTags(context, stage: stage));
        }
    }

    /// <summary>Adds one to the completed-jobs counter.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    public void IncrementJobCompleted(ScanJobContext context)
        => _jobsCompleted.Add(1, CreateTags(context));

    /// <summary>Adds one to the failed-jobs counter, tagged with the failure reason.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="failureReason">Value emitted as the <c>reason</c> tag.</param>
    public void IncrementJobFailed(ScanJobContext context, string failureReason)
        => _jobsFailed.Add(1, CreateTags(context, failureReason: failureReason));

    /// <summary>Adds one to the language analyzer cache-hit counter.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="analyzerId">Value emitted as the <c>analyzer.id</c> tag.</param>
    public void RecordLanguageCacheHit(ScanJobContext context, string analyzerId)
        => _languageCacheHits.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>Adds one to the language analyzer cache-miss counter.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="analyzerId">Value emitted as the <c>analyzer.id</c> tag.</param>
    public void RecordLanguageCacheMiss(ScanJobContext context, string analyzerId)
        => _languageCacheMisses.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>Adds one to the OS analyzer cache-hit counter.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="analyzerId">Value emitted as the <c>analyzer.id</c> tag.</param>
    public void RecordOsCacheHit(ScanJobContext context, string analyzerId)
        => _osCacheHits.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>Adds one to the OS analyzer cache-miss counter.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="analyzerId">Value emitted as the <c>analyzer.id</c> tag.</param>
    public void RecordOsCacheMiss(ScanJobContext context, string analyzerId)
        => _osCacheMisses.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>
    /// Counts a registry secret request with result <c>resolved</c> and, when at least one
    /// entry carries an expiration, records the time remaining until the earliest one.
    /// </summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="secretName">Value emitted as the <c>secret.name</c> tag.</param>
    /// <param name="secret">Resolved secret; its entry count and expirations are read.</param>
    /// <param name="timeProvider">Clock used to compute the remaining time-to-live.</param>
    public void RecordRegistrySecretResolved(
        ScanJobContext context,
        string secretName,
        RegistryAccessSecret secret,
        TimeProvider timeProvider)
    {
        var measurementTags = CreateTags(
            context,
            secretName: secretName,
            secretResult: "resolved",
            secretEntryCount: secret.Entries.Count);

        _registrySecretRequests.Add(1, measurementTags);

        // No TTL sample is produced when none of the entries expires.
        var remainingSeconds = ComputeTtlSeconds(secret, timeProvider);
        if (remainingSeconds.HasValue)
        {
            _registrySecretTtlSeconds.Record(remainingSeconds.Value, measurementTags);
        }
    }

    /// <summary>Counts a registry secret request with result <c>missing</c>.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="secretName">Value emitted as the <c>secret.name</c> tag.</param>
    public void RecordRegistrySecretMissing(ScanJobContext context, string secretName)
        => _registrySecretRequests.Add(1, CreateTags(context, secretName: secretName, secretResult: "missing"));

    /// <summary>Counts a registry secret request with result <c>failure</c>.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="secretName">Value emitted as the <c>secret.name</c> tag.</param>
    public void RecordRegistrySecretFailure(ScanJobContext context, string secretName)
        => _registrySecretRequests.Add(1, CreateTags(context, secretName: secretName, secretResult: "failure"));

    /// <summary>
    /// Counts a published surface manifest and, for positive durations, records how long
    /// the publish took in milliseconds.
    /// </summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="payloadCount">Payload count tag; negative values are reported as zero.</param>
    /// <param name="duration">Publish duration; zero or negative values are not recorded.</param>
    public void RecordSurfaceManifestPublished(ScanJobContext context, int payloadCount, TimeSpan duration)
    {
        var reportedPayloadCount = Math.Max(payloadCount, 0);

        var measurementTags = CreateTags(
            context,
            surfaceAction: "manifest",
            surfaceResult: "published",
            surfacePayloadCount: reportedPayloadCount);

        _surfaceManifestsPublished.Add(1, measurementTags);

        if (duration > TimeSpan.Zero)
        {
            _surfaceManifestPublishDurationMs.Record(duration.TotalMilliseconds, measurementTags);
        }
    }

    /// <summary>Counts a surface manifest publish that was skipped.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    public void RecordSurfaceManifestSkipped(ScanJobContext context)
        => _surfaceManifestSkipped.Add(1, CreateTags(context, surfaceAction: "manifest", surfaceResult: "skipped"));

    /// <summary>Counts a surface manifest publish that failed, tagged with the failure reason.</summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="failureReason">Value emitted as the <c>reason</c> tag.</param>
    public void RecordSurfaceManifestFailed(ScanJobContext context, string failureReason)
    {
        var measurementTags = CreateTags(
            context,
            surfaceAction: "manifest",
            surfaceResult: "failed",
            failureReason: failureReason);

        _surfaceManifestFailures.Add(1, measurementTags);
    }

    /// <summary>
    /// Counts a persisted surface payload. The kind is trimmed and lower-cased before being
    /// used as a tag; a null, empty, or whitespace-only kind is reported as <c>unknown</c>.
    /// </summary>
    /// <param name="context">Job whose identifiers become measurement tags.</param>
    /// <param name="surfaceKind">Raw payload kind.</param>
    public void RecordSurfacePayloadPersisted(ScanJobContext context, string surfaceKind)
    {
        string kindTag;
        if (string.IsNullOrWhiteSpace(surfaceKind))
        {
            kindTag = "unknown";
        }
        else
        {
            kindTag = surfaceKind.Trim().ToLowerInvariant();
        }

        var measurementTags = CreateTags(
            context,
            surfaceAction: "payload",
            surfaceKind: kindTag,
            surfaceResult: "cached");

        _surfacePayloadPersisted.Add(1, measurementTags);
    }

    /// <summary>
    /// Returns the number of seconds between now and the earliest expiration among the
    /// secret's entries, clamped to zero when that moment has already passed.
    /// </summary>
    /// <returns>The remaining seconds, or <c>null</c> when no entry has an expiration.</returns>
    private static double? ComputeTtlSeconds(RegistryAccessSecret secret, TimeProvider timeProvider)
    {
        DateTimeOffset? soonest = null;

        foreach (var entry in secret.Entries)
        {
            var candidate = entry.ExpiresAt;
            if (candidate is not null && (soonest is null || candidate < soonest))
            {
                soonest = candidate;
            }
        }

        if (soonest is null)
        {
            return null;
        }

        var remainingSeconds = (soonest.Value - timeProvider.GetUtcNow()).TotalSeconds;
        if (remainingSeconds < 0)
        {
            return 0;
        }

        return remainingSeconds;
    }

    /// <summary>
    /// Builds the tag set for a measurement. <c>job.id</c>, <c>scan.id</c> and <c>attempt</c>
    /// are always present; <c>queue</c> and <c>job.kind</c> are copied from the lease metadata
    /// when they hold a non-blank value; each optional argument contributes its tag only when
    /// it is supplied (non-blank for strings, non-null for counts).
    /// </summary>
    private static KeyValuePair<string, object?>[] CreateTags(
        ScanJobContext context,
        string? stage = null,
        string? failureReason = null,
        string? analyzerId = null,
        string? secretName = null,
        string? secretResult = null,
        int? secretEntryCount = null,
        string? surfaceAction = null,
        string? surfaceKind = null,
        string? surfaceResult = null,
        int? surfacePayloadCount = null)
    {
        var tags = new List<KeyValuePair<string, object?>>(8)
        {
            new("job.id", context.JobId),
            new("scan.id", context.ScanId),
            new("attempt", context.Lease.Attempt),
        };

        AppendLeaseMetadata("queue");
        AppendLeaseMetadata("job.kind");

        AppendText("stage", stage);
        AppendText("reason", failureReason);
        AppendText("analyzer.id", analyzerId);
        AppendText("secret.name", secretName);
        AppendText("secret.result", secretResult);
        AppendCount("secret.entries", secretEntryCount);
        AppendText("surface.action", surfaceAction);
        AppendText("surface.kind", surfaceKind);
        AppendText("surface.result", surfaceResult);
        AppendCount("surface.payload_count", surfacePayloadCount);

        return tags.ToArray();

        // Copies a lease metadata entry into a tag of the same name when it is non-blank.
        void AppendLeaseMetadata(string key)
        {
            if (context.Lease.Metadata.TryGetValue(key, out var metadataValue)
                && !string.IsNullOrWhiteSpace(metadataValue))
            {
                tags.Add(new KeyValuePair<string, object?>(key, metadataValue));
            }
        }

        // Adds a string tag unless the value is null, empty, or whitespace.
        void AppendText(string key, string? value)
        {
            if (!string.IsNullOrWhiteSpace(value))
            {
                tags.Add(new KeyValuePair<string, object?>(key, value));
            }
        }

        // Adds an integer tag when a value was supplied.
        void AppendCount(string key, int? value)
        {
            if (value is not null)
            {
                tags.Add(new KeyValuePair<string, object?>(key, value.Value));
            }
        }
    }
}