using System;
using System.Collections.Generic;
using System.Diagnostics.Metrics;
using StellaOps.Scanner.Surface.Secrets;
using StellaOps.Scanner.Worker.Processing;

namespace StellaOps.Scanner.Worker.Diagnostics;

/// <summary>
/// Owns the histograms and counters the scanner worker emits and exposes one recording method per
/// measurement. All instruments are created on <c>ScannerWorkerInstrumentation.Meter</c>, and every
/// measurement is tagged through <see cref="CreateTags"/>.
/// </summary>
public sealed class ScannerWorkerMetrics
{
    private readonly Histogram<double> _queueLatencyMs;
    private readonly Histogram<double> _jobDurationMs;
    private readonly Histogram<double> _stageDurationMs;
    private readonly Counter<long> _jobsCompleted;
    private readonly Counter<long> _jobsFailed;
    private readonly Counter<long> _languageCacheHits;
    private readonly Counter<long> _languageCacheMisses;
    private readonly Counter<long> _osCacheHits;
    private readonly Counter<long> _osCacheMisses;
    private readonly Counter<long> _registrySecretRequests;
    private readonly Histogram<double> _registrySecretTtlSeconds;
    private readonly Counter<long> _surfaceManifestsPublished;
    private readonly Counter<long> _surfaceManifestSkipped;
    private readonly Counter<long> _surfaceManifestFailures;
    private readonly Counter<long> _surfacePayloadPersisted;
    private readonly Histogram<double> _surfaceManifestPublishDurationMs;

    /// <summary>
    /// Creates every instrument used by the worker on <c>ScannerWorkerInstrumentation.Meter</c>.
    /// </summary>
    public ScannerWorkerMetrics()
    {
        _queueLatencyMs = ScannerWorkerInstrumentation.Meter.CreateHistogram<double>(
            "scanner_worker_queue_latency_ms",
            unit: "ms",
            description: "Time from job enqueue to lease acquisition.");
        _jobDurationMs = ScannerWorkerInstrumentation.Meter.CreateHistogram<double>(
            "scanner_worker_job_duration_ms",
            unit: "ms",
            description: "Total processing duration per job.");
        _stageDurationMs = ScannerWorkerInstrumentation.Meter.CreateHistogram<double>(
            "scanner_worker_stage_duration_ms",
            unit: "ms",
            description: "Stage execution duration per job.");
        _jobsCompleted = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_jobs_completed_total",
            description: "Number of successfully completed scan jobs.");
        _jobsFailed = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_jobs_failed_total",
            description: "Number of scan jobs that failed permanently.");
        _languageCacheHits = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_language_cache_hits_total",
            description: "Number of language analyzer cache hits encountered by the worker.");
        _languageCacheMisses = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_language_cache_misses_total",
            description: "Number of language analyzer cache misses encountered by the worker.");
        _osCacheHits = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_os_cache_hits_total",
            description: "Number of OS analyzer cache hits encountered by the worker.");
        _osCacheMisses = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_os_cache_misses_total",
            description: "Number of OS analyzer cache misses encountered by the worker.");
        _registrySecretRequests = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_registry_secret_requests_total",
            description: "Number of registry secret resolution attempts performed by the worker.");
        _registrySecretTtlSeconds = ScannerWorkerInstrumentation.Meter.CreateHistogram<double>(
            "scanner_worker_registry_secret_ttl_seconds",
            unit: "s",
            description: "Time-to-live in seconds for resolved registry secrets (earliest expiration).");
        _surfaceManifestsPublished = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_published_total",
            description: "Number of surface manifests successfully published by the worker.");
        _surfaceManifestSkipped = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_skipped_total",
            description: "Number of surface manifest publish attempts skipped due to missing payloads.");
        _surfaceManifestFailures = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_surface_manifests_failed_total",
            description: "Number of surface manifest publish attempts that failed.");
        _surfacePayloadPersisted = ScannerWorkerInstrumentation.Meter.CreateCounter<long>(
            "scanner_worker_surface_payload_persisted_total",
            description: "Number of surface payload artefacts persisted to the local cache.");
        _surfaceManifestPublishDurationMs = ScannerWorkerInstrumentation.Meter.CreateHistogram<double>(
            "scanner_worker_surface_manifest_publish_duration_ms",
            unit: "ms",
            description: "Duration in milliseconds to persist and publish surface manifests.");
    }

    /// <summary>Records the queue latency in milliseconds for the job.</summary>
    /// <param name="context">Job whose identifiers and lease data become tags.</param>
    /// <param name="latency">Measured latency; zero or negative values are not recorded.</param>
    public void RecordQueueLatency(ScanJobContext context, TimeSpan latency)
    {
        if (latency <= TimeSpan.Zero)
        {
            return;
        }

        _queueLatencyMs.Record(latency.TotalMilliseconds, CreateTags(context));
    }

    /// <summary>Records the total job duration in milliseconds.</summary>
    /// <param name="context">Job whose identifiers and lease data become tags.</param>
    /// <param name="duration">Measured duration; zero or negative values are not recorded.</param>
    public void RecordJobDuration(ScanJobContext context, TimeSpan duration)
    {
        if (duration <= TimeSpan.Zero)
        {
            return;
        }

        _jobDurationMs.Record(duration.TotalMilliseconds, CreateTags(context));
    }

    /// <summary>Records the duration of a single stage in milliseconds, tagged with the stage name.</summary>
    /// <param name="context">Job whose identifiers and lease data become tags.</param>
    /// <param name="stage">Stage name emitted as the <c>stage</c> tag (omitted when blank).</param>
    /// <param name="duration">Measured duration; zero or negative values are not recorded.</param>
    public void RecordStageDuration(ScanJobContext context, string stage, TimeSpan duration)
    {
        if (duration <= TimeSpan.Zero)
        {
            return;
        }

        _stageDurationMs.Record(duration.TotalMilliseconds, CreateTags(context, stage: stage));
    }

    /// <summary>Adds one to the completed-jobs counter.</summary>
    public void IncrementJobCompleted(ScanJobContext context)
    {
        _jobsCompleted.Add(1, CreateTags(context));
    }

    /// <summary>Adds one to the failed-jobs counter, tagged with the failure reason.</summary>
    /// <param name="context">Job whose identifiers and lease data become tags.</param>
    /// <param name="failureReason">Emitted as the <c>reason</c> tag (omitted when blank).</param>
    public void IncrementJobFailed(ScanJobContext context, string failureReason)
    {
        _jobsFailed.Add(1, CreateTags(context, failureReason: failureReason));
    }

    /// <summary>Adds one to the language analyzer cache-hit counter, tagged with <c>analyzer.id</c>.</summary>
    public void RecordLanguageCacheHit(ScanJobContext context, string analyzerId)
    {
        _languageCacheHits.Add(1, CreateTags(context, analyzerId: analyzerId));
    }

    /// <summary>Adds one to the language analyzer cache-miss counter, tagged with <c>analyzer.id</c>.</summary>
    public void RecordLanguageCacheMiss(ScanJobContext context, string analyzerId)
    {
        _languageCacheMisses.Add(1, CreateTags(context, analyzerId: analyzerId));
    }

    /// <summary>Adds one to the OS analyzer cache-hit counter, tagged with <c>analyzer.id</c>.</summary>
    public void RecordOsCacheHit(ScanJobContext context, string analyzerId)
    {
        _osCacheHits.Add(1, CreateTags(context, analyzerId: analyzerId));
    }

    /// <summary>Adds one to the OS analyzer cache-miss counter, tagged with <c>analyzer.id</c>.</summary>
    public void RecordOsCacheMiss(ScanJobContext context, string analyzerId)
    {
        _osCacheMisses.Add(1, CreateTags(context, analyzerId: analyzerId));
    }

    /// <summary>
    /// Counts a registry secret request with result <c>resolved</c> and, when at least one entry
    /// carries an expiration, records the remaining time-to-live in seconds.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease data become tags.</param>
    /// <param name="secretName">Emitted as the <c>secret.name</c> tag (omitted when blank).</param>
    /// <param name="secret">Resolved secret; its entry count is tagged and its expirations are inspected.</param>
    /// <param name="timeProvider">Clock used as "now" when computing the time-to-live.</param>
    public void RecordRegistrySecretResolved(
        ScanJobContext context,
        string secretName,
        RegistryAccessSecret secret,
        TimeProvider timeProvider)
    {
        var tags = CreateTags(
            context,
            secretName: secretName,
            secretResult: "resolved",
            secretEntryCount: secret.Entries.Count);

        _registrySecretRequests.Add(1, tags);

        if (ComputeTtlSeconds(secret, timeProvider) is double ttlSeconds)
        {
            _registrySecretTtlSeconds.Record(ttlSeconds, tags);
        }
    }

    /// <summary>Counts a registry secret request with result <c>missing</c>.</summary>
    public void RecordRegistrySecretMissing(ScanJobContext context, string secretName)
    {
        var tags = CreateTags(context, secretName: secretName, secretResult: "missing");
        _registrySecretRequests.Add(1, tags);
    }

    /// <summary>Counts a registry secret request with result <c>failure</c>.</summary>
    public void RecordRegistrySecretFailure(ScanJobContext context, string secretName)
    {
        var tags = CreateTags(context, secretName: secretName, secretResult: "failure");
        _registrySecretRequests.Add(1, tags);
    }

    /// <summary>
    /// Counts a published surface manifest and, when <paramref name="duration"/> is positive,
    /// records the publish duration in milliseconds with the same tags.
    /// </summary>
    /// <param name="context">Job whose identifiers and lease data become tags.</param>
    /// <param name="payloadCount">Emitted as <c>surface.payload_count</c>; negative values are clamped to zero.</param>
    /// <param name="duration">Publish duration; zero or negative values are not recorded.</param>
    public void RecordSurfaceManifestPublished(ScanJobContext context, int payloadCount, TimeSpan duration)
    {
        if (payloadCount < 0)
        {
            payloadCount = 0;
        }

        var tags = CreateTags(
            context,
            surfaceAction: "manifest",
            surfaceResult: "published",
            surfacePayloadCount: payloadCount);

        _surfaceManifestsPublished.Add(1, tags);

        if (duration > TimeSpan.Zero)
        {
            _surfaceManifestPublishDurationMs.Record(duration.TotalMilliseconds, tags);
        }
    }

    /// <summary>Counts a surface manifest publish attempt with result <c>skipped</c>.</summary>
    public void RecordSurfaceManifestSkipped(ScanJobContext context)
    {
        var tags = CreateTags(context, surfaceAction: "manifest", surfaceResult: "skipped");
        _surfaceManifestSkipped.Add(1, tags);
    }

    /// <summary>Counts a surface manifest publish attempt with result <c>failed</c> and the given reason.</summary>
    public void RecordSurfaceManifestFailed(ScanJobContext context, string failureReason)
    {
        var tags = CreateTags(
            context,
            surfaceAction: "manifest",
            surfaceResult: "failed",
            failureReason: failureReason);

        _surfaceManifestFailures.Add(1, tags);
    }

    /// <summary>
    /// Counts a persisted surface payload. The kind is trimmed and lower-cased (invariant culture)
    /// before tagging; a null or blank kind is reported as <c>unknown</c>.
    /// </summary>
    public void RecordSurfacePayloadPersisted(ScanJobContext context, string surfaceKind)
    {
        var normalizedKind = string.IsNullOrWhiteSpace(surfaceKind)
            ? "unknown"
            : surfaceKind.Trim().ToLowerInvariant();

        var tags = CreateTags(
            context,
            surfaceAction: "payload",
            surfaceKind: normalizedKind,
            surfaceResult: "cached");

        _surfacePayloadPersisted.Add(1, tags);
    }

    /// <summary>
    /// Computes the seconds between "now" and the earliest <c>ExpiresAt</c> among the secret's entries.
    /// </summary>
    /// <returns>
    /// <c>null</c> when no entry has an expiration; otherwise the remaining seconds, with an
    /// already-elapsed expiration reported as <c>0</c>.
    /// </returns>
    private static double? ComputeTtlSeconds(RegistryAccessSecret secret, TimeProvider timeProvider)
    {
        DateTimeOffset? earliest = null;

        foreach (var entry in secret.Entries)
        {
            if (entry.ExpiresAt is null)
            {
                continue;
            }

            if (earliest is null || entry.ExpiresAt < earliest)
            {
                earliest = entry.ExpiresAt;
            }
        }

        if (earliest is null)
        {
            return null;
        }

        var now = timeProvider.GetUtcNow();
        var ttl = (earliest.Value - now).TotalSeconds;
        return ttl < 0 ? 0 : ttl;
    }

    /// <summary>
    /// Builds the tag array for a measurement. <c>job.id</c>, <c>scan.id</c> and <c>attempt</c> are
    /// always present; <c>queue</c> and <c>job.kind</c> are copied from the lease metadata when
    /// present and non-blank; each optional argument contributes its tag only when supplied
    /// (non-blank for strings, non-null for counts).
    /// </summary>
    private static KeyValuePair<string, object?>[] CreateTags(
        ScanJobContext context,
        string? stage = null,
        string? failureReason = null,
        string? analyzerId = null,
        string? secretName = null,
        string? secretResult = null,
        int? secretEntryCount = null,
        string? surfaceAction = null,
        string? surfaceKind = null,
        string? surfaceResult = null,
        int? surfacePayloadCount = null)
    {
        // Eight matches the widest call sites in this class:
        // three fixed tags, two lease-metadata tags and three optional ones.
        var tags = new List<KeyValuePair<string, object?>>(8)
        {
            new("job.id", context.JobId),
            new("scan.id", context.ScanId),
            new("attempt", context.Lease.Attempt),
        };

        if (context.Lease.Metadata.TryGetValue("queue", out var queueName))
        {
            AddIfPresent(tags, "queue", queueName);
        }

        if (context.Lease.Metadata.TryGetValue("job.kind", out var jobKind))
        {
            AddIfPresent(tags, "job.kind", jobKind);
        }

        AddIfPresent(tags, "stage", stage);
        AddIfPresent(tags, "reason", failureReason);
        AddIfPresent(tags, "analyzer.id", analyzerId);
        AddIfPresent(tags, "secret.name", secretName);
        AddIfPresent(tags, "secret.result", secretResult);
        AddIfPresent(tags, "secret.entries", secretEntryCount);
        AddIfPresent(tags, "surface.action", surfaceAction);
        AddIfPresent(tags, "surface.kind", surfaceKind);
        AddIfPresent(tags, "surface.result", surfaceResult);
        AddIfPresent(tags, "surface.payload_count", surfacePayloadCount);

        return tags.ToArray();
    }

    /// <summary>Appends a string tag unless the value is null, empty or whitespace.</summary>
    private static void AddIfPresent(List<KeyValuePair<string, object?>> tags, string key, string? value)
    {
        if (!string.IsNullOrWhiteSpace(value))
        {
            tags.Add(new KeyValuePair<string, object?>(key, value));
        }
    }

    /// <summary>Appends an integer tag unless the value is null.</summary>
    private static void AddIfPresent(List<KeyValuePair<string, object?>> tags, string key, int? value)
    {
        if (value is not null)
        {
            tags.Add(new KeyValuePair<string, object?>(key, value.Value));
        }
    }
}