Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Notify Smoke Test / Notify Unit Tests (push) Has been cancelled
Notify Smoke Test / Notifier Service Tests (push) Has been cancelled
Notify Smoke Test / Notification Smoke Test (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Signals Reachability Scoring & Events / reachability-smoke (push) Has been cancelled
Signals Reachability Scoring & Events / sign-and-upload (push) Has been cancelled
338 lines
13 KiB
C#
338 lines
13 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Diagnostics.Metrics;
|
|
using StellaOps.Scanner.Surface.Secrets;
|
|
using StellaOps.Scanner.Worker.Processing;
|
|
|
|
namespace StellaOps.Scanner.Worker.Diagnostics;
|
|
|
|
/// <summary>
/// Records scanner worker telemetry (job timings, job outcomes, analyzer cache usage,
/// registry secret resolution and surface manifest publishing) on instruments created
/// from <c>ScannerWorkerInstrumentation.Meter</c>.
/// </summary>
/// <remarks>
/// Every measurement is tagged with the job id, scan id and lease attempt taken from the
/// supplied <see cref="ScanJobContext"/>, plus the <c>queue</c> and <c>job.kind</c> lease
/// metadata values when they are present and non-blank.
/// </remarks>
public sealed class ScannerWorkerMetrics
{
    // Job lifecycle.
    private readonly Histogram<double> _queueLatencyMs;
    private readonly Histogram<double> _jobDurationMs;
    private readonly Histogram<double> _stageDurationMs;
    private readonly Counter<long> _jobsCompleted;
    private readonly Counter<long> _jobsFailed;

    // Analyzer caches.
    private readonly Counter<long> _languageCacheHits;
    private readonly Counter<long> _languageCacheMisses;
    private readonly Counter<long> _osCacheHits;
    private readonly Counter<long> _osCacheMisses;

    // Registry secrets.
    private readonly Counter<long> _registrySecretRequests;
    private readonly Histogram<double> _registrySecretTtlSeconds;

    // Surface manifests and payloads.
    private readonly Counter<long> _surfaceManifestsPublished;
    private readonly Counter<long> _surfaceManifestSkipped;
    private readonly Counter<long> _surfaceManifestFailures;
    private readonly Counter<long> _surfacePayloadPersisted;
    private readonly Histogram<double> _surfaceManifestPublishDurationMs;

    /// <summary>
    /// Creates all histograms and counters on the shared scanner worker meter.
    /// </summary>
    public ScannerWorkerMetrics()
    {
        var meter = ScannerWorkerInstrumentation.Meter;

        _queueLatencyMs = meter.CreateHistogram<double>(
            "scanner_worker_queue_latency_ms",
            unit: "ms",
            description: "Time from job enqueue to lease acquisition.");

        _jobDurationMs = meter.CreateHistogram<double>(
            "scanner_worker_job_duration_ms",
            unit: "ms",
            description: "Total processing duration per job.");

        _stageDurationMs = meter.CreateHistogram<double>(
            "scanner_worker_stage_duration_ms",
            unit: "ms",
            description: "Stage execution duration per job.");

        _jobsCompleted = meter.CreateCounter<long>(
            "scanner_worker_jobs_completed_total",
            description: "Number of successfully completed scan jobs.");

        _jobsFailed = meter.CreateCounter<long>(
            "scanner_worker_jobs_failed_total",
            description: "Number of scan jobs that failed permanently.");

        _languageCacheHits = meter.CreateCounter<long>(
            "scanner_worker_language_cache_hits_total",
            description: "Number of language analyzer cache hits encountered by the worker.");

        _languageCacheMisses = meter.CreateCounter<long>(
            "scanner_worker_language_cache_misses_total",
            description: "Number of language analyzer cache misses encountered by the worker.");

        _osCacheHits = meter.CreateCounter<long>(
            "scanner_worker_os_cache_hits_total",
            description: "Number of OS analyzer cache hits encountered by the worker.");

        _osCacheMisses = meter.CreateCounter<long>(
            "scanner_worker_os_cache_misses_total",
            description: "Number of OS analyzer cache misses encountered by the worker.");

        _registrySecretRequests = meter.CreateCounter<long>(
            "scanner_worker_registry_secret_requests_total",
            description: "Number of registry secret resolution attempts performed by the worker.");

        _registrySecretTtlSeconds = meter.CreateHistogram<double>(
            "scanner_worker_registry_secret_ttl_seconds",
            unit: "s",
            description: "Time-to-live in seconds for resolved registry secrets (earliest expiration).");

        _surfaceManifestsPublished = meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_published_total",
            description: "Number of surface manifests successfully published by the worker.");

        _surfaceManifestSkipped = meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_skipped_total",
            description: "Number of surface manifest publish attempts skipped due to missing payloads.");

        _surfaceManifestFailures = meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_failed_total",
            description: "Number of surface manifest publish attempts that failed.");

        _surfacePayloadPersisted = meter.CreateCounter<long>(
            "scanner_worker_surface_payload_persisted_total",
            description: "Number of surface payload artefacts persisted to the local cache.");

        _surfaceManifestPublishDurationMs = meter.CreateHistogram<double>(
            "scanner_worker_surface_manifest_publish_duration_ms",
            unit: "ms",
            description: "Duration in milliseconds to persist and publish surface manifests.");
    }

    /// <summary>
    /// Records the queue latency of a job in milliseconds. Zero or negative values are dropped.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="latency">Measured latency.</param>
    public void RecordQueueLatency(ScanJobContext context, TimeSpan latency)
    {
        if (latency > TimeSpan.Zero)
        {
            _queueLatencyMs.Record(latency.TotalMilliseconds, CreateTags(context));
        }
    }

    /// <summary>
    /// Records the total duration of a job in milliseconds. Zero or negative values are dropped.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="duration">Measured duration.</param>
    public void RecordJobDuration(ScanJobContext context, TimeSpan duration)
    {
        if (duration > TimeSpan.Zero)
        {
            _jobDurationMs.Record(duration.TotalMilliseconds, CreateTags(context));
        }
    }

    /// <summary>
    /// Records the duration of a single stage in milliseconds, tagged with the stage name.
    /// Zero or negative values are dropped.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="stage">Stage name, emitted as the <c>stage</c> tag when non-blank.</param>
    /// <param name="duration">Measured duration.</param>
    public void RecordStageDuration(ScanJobContext context, string stage, TimeSpan duration)
    {
        if (duration > TimeSpan.Zero)
        {
            _stageDurationMs.Record(duration.TotalMilliseconds, CreateTags(context, stage: stage));
        }
    }

    /// <summary>Counts one completed job.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    public void IncrementJobCompleted(ScanJobContext context)
        => _jobsCompleted.Add(1, CreateTags(context));

    /// <summary>Counts one failed job, tagged with the failure reason.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="failureReason">Emitted as the <c>reason</c> tag when non-blank.</param>
    public void IncrementJobFailed(ScanJobContext context, string failureReason)
        => _jobsFailed.Add(1, CreateTags(context, failureReason: failureReason));

    /// <summary>Counts one language analyzer cache hit.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="analyzerId">Emitted as the <c>analyzer.id</c> tag when non-blank.</param>
    public void RecordLanguageCacheHit(ScanJobContext context, string analyzerId)
        => _languageCacheHits.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>Counts one language analyzer cache miss.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="analyzerId">Emitted as the <c>analyzer.id</c> tag when non-blank.</param>
    public void RecordLanguageCacheMiss(ScanJobContext context, string analyzerId)
        => _languageCacheMisses.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>Counts one OS analyzer cache hit.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="analyzerId">Emitted as the <c>analyzer.id</c> tag when non-blank.</param>
    public void RecordOsCacheHit(ScanJobContext context, string analyzerId)
        => _osCacheHits.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>Counts one OS analyzer cache miss.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="analyzerId">Emitted as the <c>analyzer.id</c> tag when non-blank.</param>
    public void RecordOsCacheMiss(ScanJobContext context, string analyzerId)
        => _osCacheMisses.Add(1, CreateTags(context, analyzerId: analyzerId));

    /// <summary>
    /// Counts a successful registry secret resolution and, when at least one entry carries
    /// an expiry, records the time remaining until the earliest expiry.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="secretName">Emitted as the <c>secret.name</c> tag when non-blank.</param>
    /// <param name="secret">Resolved secret; its entry count is emitted as <c>secret.entries</c>.</param>
    /// <param name="timeProvider">Clock used to compute the remaining time-to-live.</param>
    public void RecordRegistrySecretResolved(
        ScanJobContext context,
        string secretName,
        RegistryAccessSecret secret,
        TimeProvider timeProvider)
    {
        var tags = CreateTags(
            context,
            secretName: secretName,
            secretResult: "resolved",
            secretEntryCount: secret.Entries.Count);

        _registrySecretRequests.Add(1, tags);

        // The TTL is only known when some entry has an expiry; otherwise nothing is recorded.
        var ttlSeconds = ComputeTtlSeconds(secret, timeProvider);
        if (ttlSeconds.HasValue)
        {
            _registrySecretTtlSeconds.Record(ttlSeconds.Value, tags);
        }
    }

    /// <summary>Counts a registry secret request whose result is <c>missing</c>.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="secretName">Emitted as the <c>secret.name</c> tag when non-blank.</param>
    public void RecordRegistrySecretMissing(ScanJobContext context, string secretName)
        => CountSecretRequest(context, secretName, "missing");

    /// <summary>Counts a registry secret request whose result is <c>failure</c>.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="secretName">Emitted as the <c>secret.name</c> tag when non-blank.</param>
    public void RecordRegistrySecretFailure(ScanJobContext context, string secretName)
        => CountSecretRequest(context, secretName, "failure");

    /// <summary>
    /// Counts a published surface manifest and records the publish duration when it is positive.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="payloadCount">
    /// Emitted as the <c>surface.payload_count</c> tag; negative values are reported as zero.
    /// </param>
    /// <param name="duration">Publish duration; zero or negative values are not recorded.</param>
    public void RecordSurfaceManifestPublished(ScanJobContext context, int payloadCount, TimeSpan duration)
    {
        var tags = CreateTags(
            context,
            surfaceAction: "manifest",
            surfaceResult: "published",
            surfacePayloadCount: Math.Max(payloadCount, 0));

        _surfaceManifestsPublished.Add(1, tags);

        if (duration > TimeSpan.Zero)
        {
            _surfaceManifestPublishDurationMs.Record(duration.TotalMilliseconds, tags);
        }
    }

    /// <summary>Counts a surface manifest publish attempt that was skipped.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    public void RecordSurfaceManifestSkipped(ScanJobContext context)
        => _surfaceManifestSkipped.Add(
            1,
            CreateTags(context, surfaceAction: "manifest", surfaceResult: "skipped"));

    /// <summary>Counts a failed surface manifest publish attempt, tagged with the reason.</summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="failureReason">Emitted as the <c>reason</c> tag when non-blank.</param>
    public void RecordSurfaceManifestFailed(ScanJobContext context, string failureReason)
        => _surfaceManifestFailures.Add(
            1,
            CreateTags(
                context,
                surfaceAction: "manifest",
                surfaceResult: "failed",
                failureReason: failureReason));

    /// <summary>
    /// Counts a persisted surface payload, tagged with its trimmed, lower-cased kind.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease metadata become tags.</param>
    /// <param name="surfaceKind">Payload kind; null or blank input is reported as <c>unknown</c>.</param>
    public void RecordSurfacePayloadPersisted(ScanJobContext context, string surfaceKind)
    {
        string kindTag;
        if (string.IsNullOrWhiteSpace(surfaceKind))
        {
            kindTag = "unknown";
        }
        else
        {
            kindTag = surfaceKind.Trim().ToLowerInvariant();
        }

        _surfacePayloadPersisted.Add(
            1,
            CreateTags(
                context,
                surfaceAction: "payload",
                surfaceKind: kindTag,
                surfaceResult: "cached"));
    }

    /// <summary>
    /// Adds one registry secret request tagged with the given secret name and result.
    /// </summary>
    private void CountSecretRequest(ScanJobContext context, string secretName, string result)
    {
        var tags = CreateTags(context, secretName: secretName, secretResult: result);
        _registrySecretRequests.Add(1, tags);
    }

    /// <summary>
    /// Returns the number of seconds between now and the earliest entry expiry, clamped to
    /// zero when that expiry is already in the past.
    /// </summary>
    /// <returns>The remaining seconds, or <c>null</c> when no entry has an expiry.</returns>
    private static double? ComputeTtlSeconds(RegistryAccessSecret secret, TimeProvider timeProvider)
    {
        DateTimeOffset? soonest = null;

        foreach (var entry in secret.Entries)
        {
            var expiresAt = entry.ExpiresAt;
            if (expiresAt is not null && (soonest is null || expiresAt < soonest))
            {
                soonest = expiresAt;
            }
        }

        if (!soonest.HasValue)
        {
            return null;
        }

        // The clock is read only once an expiry is known to exist.
        var remaining = (soonest.Value - timeProvider.GetUtcNow()).TotalSeconds;
        if (remaining < 0)
        {
            return 0;
        }

        return remaining;
    }

    /// <summary>
    /// Builds the tag array for a measurement: the job id, scan id and attempt always come
    /// first, followed by the lease metadata and each optional value that was supplied.
    /// String values are skipped when null or blank; numeric values when null.
    /// </summary>
    private static KeyValuePair<string, object?>[] CreateTags(
        ScanJobContext context,
        string? stage = null,
        string? failureReason = null,
        string? analyzerId = null,
        string? secretName = null,
        string? secretResult = null,
        int? secretEntryCount = null,
        string? surfaceAction = null,
        string? surfaceKind = null,
        string? surfaceResult = null,
        int? surfacePayloadCount = null)
    {
        var tags = new List<KeyValuePair<string, object?>>(8)
        {
            new("job.id", context.JobId),
            new("scan.id", context.ScanId),
            new("attempt", context.Lease.Attempt),
        };

        void AddText(string key, string? value)
        {
            if (!string.IsNullOrWhiteSpace(value))
            {
                tags.Add(new KeyValuePair<string, object?>(key, value));
            }
        }

        void AddNumber(string key, int? value)
        {
            if (value.HasValue)
            {
                tags.Add(new KeyValuePair<string, object?>(key, value.Value));
            }
        }

        var metadata = context.Lease.Metadata;

        if (metadata.TryGetValue("queue", out var queueName))
        {
            AddText("queue", queueName);
        }

        if (metadata.TryGetValue("job.kind", out var jobKind))
        {
            AddText("job.kind", jobKind);
        }

        // Emission order below is part of the observable tag array; keep it stable.
        AddText("stage", stage);
        AddText("reason", failureReason);
        AddText("analyzer.id", analyzerId);
        AddText("secret.name", secretName);
        AddText("secret.result", secretResult);
        AddNumber("secret.entries", secretEntryCount);
        AddText("surface.action", surfaceAction);
        AddText("surface.kind", surfaceKind);
        AddText("surface.result", surfaceResult);
        AddNumber("surface.payload_count", surfacePayloadCount);

        return tags.ToArray();
    }
}
|