Consolidate devops folders
@@ -121,6 +121,70 @@ public static class AdvisoryCacheKeys
public static string CveMappingPattern(string prefix = DefaultPrefix)
=> $"{prefix}by:cve:*";

// -------------------------------------------------------------------------
// IDF (Inverse Document Frequency) Cache Keys
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// -------------------------------------------------------------------------

/// <summary>
/// Key for IDF score of a specific package.
/// Format: {prefix}idf:pkg:{normalizedPackageName}
/// </summary>
/// <param name="packageName">The package name (will be normalized).</param>
/// <param name="prefix">Key prefix.</param>
public static string IdfPackage(string packageName, string prefix = DefaultPrefix)
=> $"{prefix}idf:pkg:{NormalizePurl(packageName)}";

/// <summary>
/// Key for IDF corpus statistics (total document count).
/// Format: {prefix}idf:stats:corpus_size
/// </summary>
public static string IdfCorpusSize(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:corpus_size";

/// <summary>
/// Key for IDF last refresh timestamp.
/// Format: {prefix}idf:stats:last_refresh
/// </summary>
public static string IdfLastRefresh(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:last_refresh";

/// <summary>
/// Key for IDF refresh lock (distributed coordination).
/// Format: {prefix}idf:lock:refresh
/// </summary>
public static string IdfRefreshLock(string prefix = DefaultPrefix)
=> $"{prefix}idf:lock:refresh";

/// <summary>
/// Key for document frequency of a package (count of observations containing the package).
/// Format: {prefix}idf:df:{normalizedPackageName}
/// </summary>
public static string IdfDocumentFrequency(string packageName, string prefix = DefaultPrefix)
=> $"{prefix}idf:df:{NormalizePurl(packageName)}";

/// <summary>
/// Pattern to match all IDF package keys (for scanning/cleanup).
/// Format: {prefix}idf:pkg:*
/// </summary>
public static string IdfPackagePattern(string prefix = DefaultPrefix)
=> $"{prefix}idf:pkg:*";

/// <summary>
/// Key for IDF cache hit counter.
/// Format: {prefix}idf:stats:hits
/// </summary>
public static string IdfStatsHits(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:hits";

/// <summary>
/// Key for IDF cache miss counter.
/// Format: {prefix}idf:stats:misses
/// </summary>
public static string IdfStatsMisses(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:misses";

/// <summary>
/// Normalizes a PURL for use as a cache key.
/// </summary>

@@ -0,0 +1,153 @@
// -----------------------------------------------------------------------------
// IPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Interface for package IDF (Inverse Document Frequency) caching
// -----------------------------------------------------------------------------

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Service for computing and caching IDF (Inverse Document Frequency) weights
/// for package keys used in linkset correlation.
/// </summary>
/// <remarks>
/// IDF measures how discriminative a package is across the observation corpus:
/// <code>
/// idf(pkg) = log(N / (1 + df(pkg)))
/// </code>
/// where N = total observations, df = observations containing the package.
///
/// Rare packages (low df) have high IDF → stronger correlation signal.
/// Common packages (high df) have low IDF → weaker correlation signal.
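///
/// Worked example (illustrative numbers only, not real corpus data):
/// <code>
/// // N = 10,000 observations
/// // df = 3     → idf = log(10000 / 4)    ≈ 7.82  (rare package, strong signal)
/// // df = 6,000 → idf = log(10000 / 6001) ≈ 0.51  (common package, weak signal)
/// </code>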
/// </remarks>
public interface IPackageIdfService
{
/// <summary>
/// Gets the IDF weight for a package key.
/// </summary>
/// <param name="packageName">The package name (PURL format).</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>
/// The IDF weight (0.0-1.0 normalized), or null if not cached.
/// Returns null on cache miss or error (graceful degradation).
/// </returns>
Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default);

/// <summary>
/// Gets IDF weights for multiple package keys in a single batch operation.
/// </summary>
/// <param name="packageNames">The package names to look up.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>
/// Dictionary of package name to IDF weight. Missing entries indicate cache miss.
/// </returns>
Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
IEnumerable<string> packageNames,
CancellationToken cancellationToken = default);

/// <summary>
/// Sets the IDF weight for a package key.
/// </summary>
/// <param name="packageName">The package name.</param>
/// <param name="idfWeight">The IDF weight (0.0-1.0 normalized).</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default);

/// <summary>
/// Sets IDF weights for multiple package keys in a single batch operation.
/// </summary>
/// <param name="idfWeights">Dictionary of package name to IDF weight.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task SetIdfBatchAsync(
IReadOnlyDictionary<string, double> idfWeights,
CancellationToken cancellationToken = default);

/// <summary>
/// Updates the corpus statistics used for IDF computation.
/// </summary>
/// <param name="corpusSize">Total number of observations in the corpus.</param>
/// <param name="documentFrequencies">Dictionary of package name to document frequency.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task UpdateCorpusStatsAsync(
long corpusSize,
IReadOnlyDictionary<string, long> documentFrequencies,
CancellationToken cancellationToken = default);

/// <summary>
/// Gets the last refresh timestamp for IDF statistics.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The last refresh time, or null if never refreshed.</returns>
Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default);

/// <summary>
/// Invalidates cached IDF data for a specific package.
/// </summary>
/// <param name="packageName">The package name to invalidate.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default);

/// <summary>
/// Invalidates all cached IDF data.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
Task InvalidateAllAsync(CancellationToken cancellationToken = default);

/// <summary>
/// Whether the IDF cache is enabled and available.
/// </summary>
bool IsEnabled { get; }
}

/// <summary>
/// Configuration options for the package IDF service.
/// </summary>
public sealed class PackageIdfOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "Concelier:PackageIdf";

/// <summary>
/// Whether IDF caching is enabled.
/// </summary>
public bool Enabled { get; set; } = true;

/// <summary>
/// TTL for cached IDF scores.
/// Default: 1 hour.
/// </summary>
public TimeSpan IdfTtl { get; set; } = TimeSpan.FromHours(1);

/// <summary>
/// TTL for corpus statistics.
/// Default: 4 hours.
/// </summary>
public TimeSpan CorpusStatsTtl { get; set; } = TimeSpan.FromHours(4);

/// <summary>
/// Minimum IDF value to cache (to avoid caching very common packages).
/// Default: 0.01.
/// </summary>
public double MinIdfThreshold { get; set; } = 0.01;

/// <summary>
/// Default IDF weight to return on cache miss (uniform weight).
/// Default: 1.0 (no discrimination).
/// </summary>
public double DefaultIdfWeight { get; set; } = 1.0;

/// <summary>
/// Maximum number of IDF entries to cache.
/// Default: 100,000.
/// </summary>
public int MaxCacheEntries { get; set; } = 100_000;

/// <summary>
/// Whether to normalize IDF scores to 0.0-1.0 range.
/// Default: true.
/// </summary>
public bool NormalizeScores { get; set; } = true;
}
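
// Example appsettings.json fragment binding PackageIdfOptions (illustrative values;
// TimeSpan properties bind from "hh:mm:ss" strings under standard options binding):
//
// {
//   "Concelier": {
//     "PackageIdf": {
//       "Enabled": true,
//       "IdfTtl": "01:00:00",
//       "CorpusStatsTtl": "04:00:00",
//       "MinIdfThreshold": 0.01,
//       "NormalizeScores": true
//     }
//   }
// }
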
@@ -0,0 +1,139 @@
// -----------------------------------------------------------------------------
// IdfRefreshHostedService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Background service for periodic IDF weight refresh
// -----------------------------------------------------------------------------

using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Interface for providing IDF corpus statistics from the observation store.
/// </summary>
/// <remarks>
/// This interface should be implemented by the Concelier Core module to provide
/// document frequencies from the actual observation database.
/// </remarks>
public interface IIdfCorpusProvider
{
/// <summary>
/// Gets the total number of observations in the corpus.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Total observation count.</returns>
Task<long> GetCorpusSizeAsync(CancellationToken cancellationToken = default);

/// <summary>
/// Gets document frequencies for all packages in the corpus.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Dictionary mapping package name to the number of observations containing it.</returns>
Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken cancellationToken = default);
}
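
// Minimal implementation sketch (hypothetical store API; the real provider is
// expected to live in Concelier.Core and query the observation database):
//
// public sealed class ObservationStoreIdfCorpusProvider : IIdfCorpusProvider
// {
//     private readonly IObservationStore _store; // hypothetical dependency
//
//     public ObservationStoreIdfCorpusProvider(IObservationStore store) => _store = store;
//
//     public Task<long> GetCorpusSizeAsync(CancellationToken ct = default)
//         => _store.CountObservationsAsync(ct);
//
//     public Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken ct = default)
//         => _store.AggregatePackageDocumentFrequenciesAsync(ct);
// }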

/// <summary>
/// Background service that periodically refreshes IDF weights from the observation corpus.
/// </summary>
public sealed class IdfRefreshHostedService : BackgroundService
{
private readonly IPackageIdfService _idfService;
private readonly IIdfCorpusProvider? _corpusProvider;
private readonly PackageIdfOptions _options;
private readonly ILogger<IdfRefreshHostedService>? _logger;

/// <summary>
/// Initializes a new instance of <see cref="IdfRefreshHostedService"/>.
/// </summary>
public IdfRefreshHostedService(
IPackageIdfService idfService,
IOptions<PackageIdfOptions> options,
IIdfCorpusProvider? corpusProvider = null,
ILogger<IdfRefreshHostedService>? logger = null)
{
_idfService = idfService ?? throw new ArgumentNullException(nameof(idfService));
_corpusProvider = corpusProvider;
_options = options?.Value ?? new PackageIdfOptions();
_logger = logger;
}

/// <inheritdoc />
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
if (!_idfService.IsEnabled)
{
_logger?.LogInformation("IDF refresh service disabled (IDF caching not enabled)");
return;
}

if (_corpusProvider is null)
{
_logger?.LogWarning(
"IDF refresh service has no corpus provider registered. " +
"Register IIdfCorpusProvider to enable automatic IDF refresh.");
return;
}

// Initial delay before first refresh (allow other services to start)
await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken).ConfigureAwait(false);

while (!stoppingToken.IsCancellationRequested)
{
try
{
await RefreshIdfWeightsAsync(stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger?.LogError(ex, "Error during IDF refresh cycle");
}

// Wait for the next refresh interval; reusing IdfTtl (default: 1 hour) keeps refreshes roughly aligned with cache entry expiry
try
{
await Task.Delay(_options.IdfTtl, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
}

_logger?.LogInformation("IDF refresh service stopped");
}

private async Task RefreshIdfWeightsAsync(CancellationToken cancellationToken)
{
_logger?.LogDebug("Starting IDF refresh cycle");

var corpusSize = await _corpusProvider!.GetCorpusSizeAsync(cancellationToken).ConfigureAwait(false);

if (corpusSize == 0)
{
_logger?.LogWarning("IDF refresh skipped: empty corpus");
return;
}

var documentFrequencies = await _corpusProvider.GetDocumentFrequenciesAsync(cancellationToken).ConfigureAwait(false);

if (documentFrequencies.Count == 0)
{
_logger?.LogWarning("IDF refresh skipped: no document frequencies");
return;
}

await _idfService.UpdateCorpusStatsAsync(corpusSize, documentFrequencies, cancellationToken).ConfigureAwait(false);

_logger?.LogInformation(
"IDF refresh completed: corpus={CorpusSize}, packages={PackageCount}",
corpusSize,
documentFrequencies.Count);
}
}
@@ -0,0 +1,249 @@
// -----------------------------------------------------------------------------
// PackageIdfMetrics.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: OpenTelemetry metrics for package IDF caching operations
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Diagnostics.Metrics;

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Metrics instrumentation for the package IDF cache.
/// </summary>
public sealed class PackageIdfMetrics : IDisposable
{
/// <summary>
/// Activity source name for IDF cache operations.
/// </summary>
public const string ActivitySourceName = "StellaOps.Concelier.PackageIdf";

/// <summary>
/// Meter name for IDF cache metrics.
/// </summary>
public const string MeterName = "StellaOps.Concelier.PackageIdf";

private readonly Meter _meter;
private readonly Counter<long> _hitsCounter;
private readonly Counter<long> _missesCounter;
private readonly Counter<long> _refreshCounter;
private readonly Histogram<double> _latencyHistogram;
private readonly Histogram<double> _idfWeightHistogram;
private readonly ObservableGauge<long> _corpusSizeGauge;
private readonly ObservableGauge<long> _cachedEntriesGauge;

private long _lastKnownCorpusSize;
private long _lastKnownCachedEntries;

/// <summary>
/// Activity source for tracing IDF cache operations.
/// </summary>
public static ActivitySource ActivitySource { get; } = new(ActivitySourceName, "1.0.0");

/// <summary>
/// Initializes a new instance of <see cref="PackageIdfMetrics"/>.
/// </summary>
public PackageIdfMetrics()
{
_meter = new Meter(MeterName, "1.0.0");

_hitsCounter = _meter.CreateCounter<long>(
"concelier_linkset_package_idf_hits_total",
unit: "{hits}",
description: "Total number of package IDF cache hits");

_missesCounter = _meter.CreateCounter<long>(
"concelier_linkset_package_idf_misses_total",
unit: "{misses}",
description: "Total number of package IDF cache misses");

_refreshCounter = _meter.CreateCounter<long>(
"concelier_linkset_package_idf_refreshes_total",
unit: "{refreshes}",
description: "Total number of IDF corpus refresh operations");

_latencyHistogram = _meter.CreateHistogram<double>(
"concelier_linkset_package_idf_latency_ms",
unit: "ms",
description: "Package IDF cache operation latency in milliseconds");

_idfWeightHistogram = _meter.CreateHistogram<double>(
"concelier_linkset_package_idf_weight",
unit: "{weight}",
description: "Distribution of package IDF weights (0.0-1.0)");

_corpusSizeGauge = _meter.CreateObservableGauge(
"concelier_linkset_package_idf_corpus_size",
() => _lastKnownCorpusSize,
unit: "{observations}",
description: "Total number of observations in the IDF corpus");

_cachedEntriesGauge = _meter.CreateObservableGauge(
"concelier_linkset_package_idf_cached_entries",
() => _lastKnownCachedEntries,
unit: "{entries}",
description: "Number of cached IDF entries");
}

/// <summary>
/// Records a cache hit.
/// </summary>
public void RecordHit() => _hitsCounter.Add(1);

/// <summary>
/// Records multiple cache hits.
/// </summary>
/// <param name="count">Number of hits.</param>
public void RecordHits(long count) => _hitsCounter.Add(count);

/// <summary>
/// Records a cache miss.
/// </summary>
public void RecordMiss() => _missesCounter.Add(1);

/// <summary>
/// Records multiple cache misses.
/// </summary>
/// <param name="count">Number of misses.</param>
public void RecordMisses(long count) => _missesCounter.Add(count);

/// <summary>
/// Records a corpus refresh operation.
/// </summary>
/// <param name="packageCount">Number of packages refreshed.</param>
public void RecordRefresh(long packageCount = 1)
{
_refreshCounter.Add(1, new KeyValuePair<string, object?>("package_count", packageCount));
}

/// <summary>
/// Records operation latency.
/// </summary>
/// <param name="milliseconds">Latency in milliseconds.</param>
/// <param name="operation">The operation type (get, set, batch_get, refresh).</param>
public void RecordLatency(double milliseconds, string operation)
{
_latencyHistogram.Record(milliseconds, new KeyValuePair<string, object?>("operation", operation));
}

/// <summary>
/// Records an IDF weight observation for distribution analysis.
/// </summary>
/// <param name="weight">The IDF weight (0.0-1.0).</param>
public void RecordIdfWeight(double weight)
{
_idfWeightHistogram.Record(weight);
}

/// <summary>
/// Updates the corpus size gauge.
/// </summary>
/// <param name="size">Current corpus size.</param>
public void UpdateCorpusSize(long size)
{
_lastKnownCorpusSize = size;
}

/// <summary>
/// Updates the cached entries gauge.
/// </summary>
/// <param name="count">Current cached entry count.</param>
public void UpdateCachedEntries(long count)
{
_lastKnownCachedEntries = count;
}

/// <summary>
/// Starts an activity for tracing an IDF cache operation.
/// </summary>
/// <param name="operationName">Name of the operation.</param>
/// <returns>The activity, or null if tracing is disabled.</returns>
public static Activity? StartActivity(string operationName)
{
return ActivitySource.StartActivity(operationName, ActivityKind.Internal);
}

/// <summary>
/// Starts an activity with tags.
/// </summary>
/// <param name="operationName">Name of the operation.</param>
/// <param name="tags">Tags to add to the activity.</param>
/// <returns>The activity, or null if tracing is disabled.</returns>
public static Activity? StartActivity(string operationName, params (string Key, object? Value)[] tags)
{
var activity = ActivitySource.StartActivity(operationName, ActivityKind.Internal);
if (activity is not null)
{
foreach (var (key, value) in tags)
{
activity.SetTag(key, value);
}
}
return activity;
}

/// <inheritdoc />
public void Dispose()
{
_meter.Dispose();
}
}

/// <summary>
/// Extension methods for timing IDF cache operations.
/// </summary>
public static class PackageIdfMetricsExtensions
{
/// <summary>
/// Times an async operation and records the latency.
/// </summary>
public static async Task<T> TimeAsync<T>(
this PackageIdfMetrics? metrics,
string operation,
Func<Task<T>> action)
{
if (metrics is null)
{
return await action().ConfigureAwait(false);
}

var sw = Stopwatch.StartNew();
try
{
return await action().ConfigureAwait(false);
}
finally
{
sw.Stop();
metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
}
}

/// <summary>
/// Times an async operation and records the latency.
/// </summary>
public static async Task TimeAsync(
this PackageIdfMetrics? metrics,
string operation,
Func<Task> action)
{
if (metrics is null)
{
await action().ConfigureAwait(false);
return;
}

var sw = Stopwatch.StartNew();
try
{
await action().ConfigureAwait(false);
}
finally
{
sw.Stop();
metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
}
}
}
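
// Usage sketch (hypothetical call site): a null metrics instance still runs the
// action, just without recording latency.
//
// double? weight = await _metrics.TimeAsync("get",
//     () => idfService.GetIdfAsync("pkg:npm/lodash", cancellationToken));
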
@@ -32,6 +32,10 @@ public static class ServiceCollectionExtensions
services.Configure<ConcelierCacheOptions>(
configuration.GetSection(ConcelierCacheOptions.SectionName));

// Bind package IDF options (CORR-V2-007)
services.Configure<PackageIdfOptions>(
configuration.GetSection(PackageIdfOptions.SectionName));

return AddCoreServices(services, enableWarmup);
}

@@ -39,16 +43,23 @@ public static class ServiceCollectionExtensions
/// Adds Concelier Valkey cache services with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure options.</param>
/// <param name="configureOptions">Action to configure cache options.</param>
/// <param name="configureIdfOptions">Optional action to configure IDF options.</param>
/// <param name="enableWarmup">Whether to enable background cache warmup.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddConcelierValkeyCache(
this IServiceCollection services,
Action<ConcelierCacheOptions> configureOptions,
Action<PackageIdfOptions>? configureIdfOptions = null,
bool enableWarmup = true)
{
services.Configure(configureOptions);

if (configureIdfOptions is not null)
{
services.Configure(configureIdfOptions);
}

return AddCoreServices(services, enableWarmup);
}
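
// Usage sketch (illustrative values, composed at the host's service registration):
//
// services.AddConcelierValkeyCache(
//     cache => cache.Enabled = true,
//     idf => idf.IdfTtl = TimeSpan.FromMinutes(30));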

@@ -59,9 +70,11 @@ public static class ServiceCollectionExtensions

// Register metrics
services.TryAddSingleton<ConcelierCacheMetrics>();
services.TryAddSingleton<PackageIdfMetrics>();

// Register cache service
// Register cache services
services.TryAddSingleton<IAdvisoryCacheService, ValkeyAdvisoryCacheService>();
services.TryAddSingleton<IPackageIdfService, ValkeyPackageIdfService>();

// Register warmup hosted service if enabled
if (enableWarmup)
@@ -69,6 +82,10 @@ public static class ServiceCollectionExtensions
services.AddHostedService<CacheWarmupHostedService>();
}

// Register IDF refresh hosted service (CORR-V2-007)
// Note: Requires IIdfCorpusProvider to be registered by Concelier.Core
services.AddHostedService<IdfRefreshHostedService>();

return services;
}


@@ -0,0 +1,421 @@
// -----------------------------------------------------------------------------
// ValkeyPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Valkey-backed implementation of IPackageIdfService
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Globalization;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StackExchange.Redis;

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Valkey-backed implementation of <see cref="IPackageIdfService"/>.
/// Provides caching for package IDF (Inverse Document Frequency) weights
/// used in linkset correlation scoring.
/// </summary>
/// <remarks>
/// <para>
/// This service caches pre-computed IDF weights with hourly refresh.
/// On cache miss, it returns null to signal the caller should use uniform weights.
/// </para>
/// <para>
/// Key features:
/// - Batch operations for efficient multi-package lookups
/// - Graceful degradation on Valkey errors (returns null, logs warning)
/// - TTL-based expiration with configurable refresh intervals
/// - OpenTelemetry metrics for monitoring cache performance
/// </para>
/// </remarks>
public sealed class ValkeyPackageIdfService : IPackageIdfService
{
private readonly ConcelierCacheConnectionFactory _connectionFactory;
private readonly ConcelierCacheOptions _cacheOptions;
private readonly PackageIdfOptions _idfOptions;
private readonly PackageIdfMetrics? _metrics;
private readonly ILogger<ValkeyPackageIdfService>? _logger;

/// <summary>
/// Initializes a new instance of <see cref="ValkeyPackageIdfService"/>.
/// </summary>
public ValkeyPackageIdfService(
ConcelierCacheConnectionFactory connectionFactory,
IOptions<ConcelierCacheOptions> cacheOptions,
IOptions<PackageIdfOptions> idfOptions,
PackageIdfMetrics? metrics = null,
ILogger<ValkeyPackageIdfService>? logger = null)
{
_connectionFactory = connectionFactory ?? throw new ArgumentNullException(nameof(connectionFactory));
_cacheOptions = cacheOptions?.Value ?? new ConcelierCacheOptions();
_idfOptions = idfOptions?.Value ?? new PackageIdfOptions();
_metrics = metrics;
_logger = logger;
}

/// <inheritdoc />
public bool IsEnabled => _cacheOptions.Enabled && _idfOptions.Enabled;

/// <inheritdoc />
public async Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return null;
}

var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);

var cached = await db.StringGetAsync(key).ConfigureAwait(false);
if (cached.HasValue && double.TryParse((string?)cached, NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
{
await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsHits(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
_metrics?.RecordHit();
_metrics?.RecordIdfWeight(weight);
return weight;
}

await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsMisses(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
_metrics?.RecordMiss();
return null;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to get IDF for package {PackageName}", packageName);
return null; // Graceful degradation
}
finally
{
StopTiming(sw, "get");
}
}

/// <inheritdoc />
public async Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
IEnumerable<string> packageNames,
CancellationToken cancellationToken = default)
{
var names = packageNames?.Where(n => !string.IsNullOrWhiteSpace(n)).Distinct().ToArray()
?? Array.Empty<string>();

if (!IsEnabled || names.Length == 0)
{
return new Dictionary<string, double>();
}

var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var keys = names.Select(n => (RedisKey)AdvisoryCacheKeys.IdfPackage(n, _cacheOptions.KeyPrefix)).ToArray();

var values = await db.StringGetAsync(keys).ConfigureAwait(false);

var result = new Dictionary<string, double>(names.Length);
var hits = 0;
var misses = 0;

for (var i = 0; i < names.Length; i++)
{
if (values[i].HasValue &&
double.TryParse((string?)values[i], NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
{
result[names[i]] = weight;
hits++;
_metrics?.RecordIdfWeight(weight);
}
else
{
misses++;
}
}

if (hits > 0) _metrics?.RecordHits(hits);
if (misses > 0) _metrics?.RecordMisses(misses);

return result;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to batch get IDF for {Count} packages", names.Length);
return new Dictionary<string, double>();
}
finally
{
StopTiming(sw, "batch_get");
}
}

/// <inheritdoc />
public async Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return;
}

// Skip caching weights below threshold (very common packages)
if (idfWeight < _idfOptions.MinIdfThreshold)
{
return;
}

var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);
var value = idfWeight.ToString("F6", CultureInfo.InvariantCulture);

await db.StringSetAsync(key, value, _idfOptions.IdfTtl).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to set IDF for package {PackageName}", packageName);
}
finally
{
StopTiming(sw, "set");
}
}

/// <inheritdoc />
public async Task SetIdfBatchAsync(
IReadOnlyDictionary<string, double> idfWeights,
CancellationToken cancellationToken = default)
{
if (!IsEnabled || idfWeights is null || idfWeights.Count == 0)
{
return;
}

var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);

var entries = idfWeights
.Where(kv => !string.IsNullOrWhiteSpace(kv.Key) && kv.Value >= _idfOptions.MinIdfThreshold)
.Select(kv => new KeyValuePair<RedisKey, RedisValue>(
AdvisoryCacheKeys.IdfPackage(kv.Key, _cacheOptions.KeyPrefix),
kv.Value.ToString("F6", CultureInfo.InvariantCulture)))
.ToArray();

if (entries.Length == 0)
{
return;
}

// Use pipeline for batch set with TTL
var batch = db.CreateBatch();
var tasks = new List<Task>(entries.Length);

foreach (var entry in entries)
{
tasks.Add(batch.StringSetAsync(entry.Key, entry.Value, _idfOptions.IdfTtl));
}

batch.Execute();
await Task.WhenAll(tasks).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to batch set IDF for {Count} packages", idfWeights.Count);
}
finally
{
StopTiming(sw, "batch_set");
}
}

/// <inheritdoc />
public async Task UpdateCorpusStatsAsync(
long corpusSize,
IReadOnlyDictionary<string, long> documentFrequencies,
CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return;
}

var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;

// Update corpus size
await db.StringSetAsync(
AdvisoryCacheKeys.IdfCorpusSize(prefix),
corpusSize.ToString(CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl).ConfigureAwait(false);

// Compute and cache IDF weights
var idfWeights = new Dictionary<string, double>(documentFrequencies.Count);
var maxIdf = 0.0;

foreach (var (packageName, df) in documentFrequencies)
{
// IDF formula: log(N / (1 + df))
var rawIdf = Math.Log((double)corpusSize / (1 + df));
if (rawIdf > maxIdf) maxIdf = rawIdf;
idfWeights[packageName] = rawIdf;
}

// Normalize if configured
if (_idfOptions.NormalizeScores && maxIdf > 0)
{
foreach (var key in idfWeights.Keys.ToArray())
{
idfWeights[key] /= maxIdf;
}
}
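
// Illustrative numbers: corpusSize = 10,000 and df = 3 give rawIdf = log(10000/4) ≈ 7.82;
// if that is maxIdf, it normalizes to 1.0, while df = 6,000 (rawIdf ≈ 0.51) maps to ≈ 0.07.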

// Batch set the normalized IDF weights
await SetIdfBatchAsync(idfWeights, cancellationToken).ConfigureAwait(false);

// Update document frequencies
var batch = db.CreateBatch();
var tasks = new List<Task>(documentFrequencies.Count);

foreach (var (packageName, df) in documentFrequencies)
{
tasks.Add(batch.StringSetAsync(
AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix),
df.ToString(CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl));
}

batch.Execute();
await Task.WhenAll(tasks).ConfigureAwait(false);

// Update last refresh timestamp
await db.StringSetAsync(
AdvisoryCacheKeys.IdfLastRefresh(prefix),
DateTimeOffset.UtcNow.ToString("o", CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl).ConfigureAwait(false);

_metrics?.UpdateCorpusSize(corpusSize);
_metrics?.UpdateCachedEntries(documentFrequencies.Count);
_metrics?.RecordRefresh(documentFrequencies.Count);

_logger?.LogInformation(
"Updated IDF corpus: size={CorpusSize}, packages={PackageCount}",
corpusSize,
documentFrequencies.Count);
}
catch (Exception ex)
{
_logger?.LogError(ex, "Failed to update IDF corpus stats");
}
finally
{
StopTiming(sw, "refresh");
}
}

/// <inheritdoc />
public async Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return null;
}

try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfLastRefresh(_cacheOptions.KeyPrefix);

var cached = await db.StringGetAsync(key).ConfigureAwait(false);
if (cached.HasValue &&
DateTimeOffset.TryParse(cached, CultureInfo.InvariantCulture, DateTimeStyles.RoundtripKind, out var timestamp))
{
return timestamp;
}

return null;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to get IDF last refresh timestamp");
return null;
}
}

/// <inheritdoc />
public async Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return;
}

try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;

await Task.WhenAll(
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfPackage(packageName, prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix))
).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to invalidate IDF for package {PackageName}", packageName);
}
}

/// <inheritdoc />
public async Task InvalidateAllAsync(CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return;
}

try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;

// Delete stats keys
await Task.WhenAll(
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfCorpusSize(prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfLastRefresh(prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsHits(prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsMisses(prefix))
).ConfigureAwait(false);

// Note: Scanning and deleting all idf:pkg:* keys would require SCAN,
// which is expensive. For now, rely on TTL expiration.
_logger?.LogInformation("Invalidated IDF stats; individual package keys will expire via TTL");
}
catch (Exception ex)
{
_logger?.LogError(ex, "Failed to invalidate all IDF cache");
}
}

private Stopwatch? StartTiming()
{
if (_metrics is null) return null;
return Stopwatch.StartNew();
}

private void StopTiming(Stopwatch? sw, string operation)
{
if (sw is null || _metrics is null) return;
sw.Stop();
_metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
}
}
@@ -40,11 +40,33 @@ public sealed record AdvisoryLinksetProvenance(
string? ToolVersion,
string? PolicyHash);

/// <summary>
/// Conflict severity levels for typed penalty calculation.
/// </summary>
public enum ConflictSeverity
{
/// <summary>No penalty; informational only.</summary>
Info = 0,

/// <summary>Minor disagreement; small penalty.</summary>
Soft = 1,

/// <summary>Significant disagreement; should usually prevent high-confidence linking.</summary>
Hard = 2
}

public sealed record AdvisoryLinksetConflict(
string Field,
string Reason,
IReadOnlyList<string>? Values,
IReadOnlyList<string>? SourceIds = null);
IReadOnlyList<string>? SourceIds = null)
{
/// <summary>
/// Severity of the conflict. Defaults to <see cref="ConflictSeverity.Soft"/>.
/// Hard conflicts significantly impact confidence; Info conflicts are purely informational.
/// </summary>
public ConflictSeverity Severity { get; init; } = ConflictSeverity.Soft;
}
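
// Example (hypothetical values): a hard conflict recording two disagreeing CVE ids,
// mirroring how LinksetCorrelationV2 emits "distinct-cves" conflicts.
//
// var conflict = new AdvisoryLinksetConflict(
//     "aliases",
//     "distinct-cves",
//     new[] { "redhat:CVE-2026-0001", "ubuntu:CVE-2026-0002" })
// {
//     Severity = ConflictSeverity.Hard
// };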

internal static class DocumentHelper
{

@@ -0,0 +1,73 @@
// -----------------------------------------------------------------------------
// ILinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Abstraction for linkset correlation with V1/V2 support
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using StellaOps.Concelier.Models;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Service for computing linkset correlation confidence and conflicts.
/// Supports multiple correlation algorithm versions (V1, V2).
/// </summary>
public interface ILinksetCorrelationService
{
/// <summary>
/// Gets the correlation algorithm version being used.
/// </summary>
string Version { get; }

/// <summary>
/// Computes correlation confidence and conflicts for a set of observation inputs.
/// </summary>
(double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null);
}

/// <summary>
/// Unified input model for correlation computation.
/// </summary>
public sealed record CorrelationInput(
string ObservationId,
string? Vendor,
DateTimeOffset? FetchedAt,
IReadOnlyCollection<string> Aliases,
IReadOnlyCollection<string> Purls,
IReadOnlyCollection<string> Cpes,
IReadOnlyCollection<string> References,
IReadOnlyCollection<string>? PatchReferences = null);
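
// Usage sketch (hypothetical data; assumes a resolved ILinksetCorrelationService):
//
// var input = new CorrelationInput(
//     ObservationId: "obs-1",
//     Vendor: "redhat",
//     FetchedAt: DateTimeOffset.UtcNow,
//     Aliases: new[] { "CVE-2026-0001" },
//     Purls: new[] { "pkg:rpm/redhat/openssl" },
//     Cpes: Array.Empty<string>(),
//     References: Array.Empty<string>());
// var (confidence, conflicts) = correlationService.Compute(new[] { input });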

/// <summary>
/// Configuration for the correlation service.
/// </summary>
public sealed class CorrelationServiceOptions
{
/// <summary>
/// Correlation algorithm version. Supported values: "v1", "v2".
/// Default: "v1" for backward compatibility.
/// </summary>
public string Version { get; set; } = "v1";

/// <summary>
/// Optional custom weights for V2 correlation signals.
/// Keys: aliasConnectivity, aliasAuthority, packageCoverage, versionCompatibility,
/// cpeMatch, patchLineage, referenceOverlap, freshness
/// </summary>
public Dictionary<string, double>? Weights { get; set; }

/// <summary>
/// Whether to enable IDF weighting for package keys (V2 only).
/// </summary>
public bool EnableIdfWeighting { get; set; } = true;

/// <summary>
/// Whether to enable text similarity scoring (V2 Phase 3, disabled by default).
/// </summary>
public bool EnableTextSimilarity { get; set; } = false;
}
@@ -0,0 +1,104 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Implementation of ILinksetCorrelationService with V1/V2 support
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Concelier.Models;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Default implementation of <see cref="ILinksetCorrelationService"/>.
/// Supports V1 (intersection-based) and V2 (graph-based) correlation algorithms.
/// </summary>
public sealed class LinksetCorrelationService : ILinksetCorrelationService
{
private readonly CorrelationServiceOptions _options;
private readonly ILogger<LinksetCorrelationService> _logger;
private readonly Func<string, double>? _idfProvider;

public LinksetCorrelationService(
IOptions<CorrelationServiceOptions> options,
ILogger<LinksetCorrelationService> logger,
Func<string, double>? idfProvider = null)
{
_options = options?.Value ?? new CorrelationServiceOptions();
_logger = logger;
_idfProvider = idfProvider;
}

/// <inheritdoc />
public string Version => _options.Version?.ToLowerInvariant() switch
{
"v2" => "v2",
_ => "v1"
};

/// <inheritdoc />
public (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null)
{
if (inputs.Count == 0)
{
return (1.0, Array.Empty<AdvisoryLinksetConflict>());
}

return Version switch
{
"v2" => ComputeV2(inputs, additionalConflicts),
_ => ComputeV1(inputs, additionalConflicts)
};
}

private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV1(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
{
// Convert to V1 input format
var v1Inputs = inputs.Select(i => new LinksetCorrelation.Input(
Vendor: i.Vendor,
FetchedAt: i.FetchedAt,
Aliases: i.Aliases,
Purls: i.Purls,
Cpes: i.Cpes,
References: i.References)).ToArray();

return LinksetCorrelation.Compute(v1Inputs, additionalConflicts);
}

private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV2(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
{
// Convert to V2 input format
var v2Inputs = inputs.Select(i => new LinksetCorrelationV2.InputV2(
ObservationId: i.ObservationId,
Vendor: i.Vendor,
FetchedAt: i.FetchedAt,
Aliases: i.Aliases,
Purls: i.Purls,
Cpes: i.Cpes,
References: i.References,
PatchReferences: i.PatchReferences)).ToArray();

var idfProvider = _options.EnableIdfWeighting ? _idfProvider : null;
var result = LinksetCorrelationV2.Compute(v2Inputs, additionalConflicts, idfProvider);

_logger.LogDebug(
"V2 correlation computed: confidence={Confidence:F3}, conflicts={ConflictCount}, signals={Signals}",
result.Confidence,
result.Conflicts.Count,
string.Join(", ", result.SignalScores.Select(kv => $"{kv.Key}={kv.Value:F2}")));

return (result.Confidence, result.Conflicts);
}
}

@@ -0,0 +1,910 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationV2.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-001 through CORR-V2-008
// Description: V2 correlation algorithm with graph-based alias connectivity,
// version compatibility scoring, patch lineage signals, and typed
// conflict severities.
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using StellaOps.Concelier.Models;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Version relationship classification for affected range comparison.
/// </summary>
public enum VersionRelation
{
/// <summary>Unable to determine relationship.</summary>
Unknown = 0,

/// <summary>Ranges normalize to identical primitives.</summary>
Equivalent = 1,

/// <summary>Ranges have non-empty intersection but are not equal.</summary>
Overlapping = 2,

/// <summary>Ranges have no intersection.</summary>
Disjoint = 3
}

/// <summary>
/// V2 linkset correlation algorithm with graph-based connectivity,
/// typed conflict severities, and multi-signal scoring.
/// </summary>
/// <remarks>
/// Key improvements over V1:
/// - Alias matching uses graph connectivity (LCC ratio) instead of intersection-across-all
/// - PURL matching uses pairwise coverage instead of intersection-across-all
/// - Reference clash only emitted for true contradictions, not zero overlap
/// - Typed conflict severities with per-reason penalties
/// - Patch lineage as high-weight signal
/// - Version compatibility classification (equivalent/overlapping/disjoint)
/// </remarks>
internal static class LinksetCorrelationV2
{
/// <summary>
/// Default correlation weights. Can be overridden via configuration.
/// </summary>
internal static class Weights
{
public const double AliasConnectivity = 0.30;
public const double AliasAuthority = 0.10;
public const double PackageCoverage = 0.20;
public const double VersionCompatibility = 0.10;
public const double CpeMatch = 0.10;
public const double PatchLineage = 0.10;
public const double ReferenceOverlap = 0.05;
public const double Freshness = 0.05;
}
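
// The default weights above sum to 1.0, so the weighted signal sum in Compute
// is a convex combination: each signal is scored in [0, 1], keeping the base
// confidence in [0, 1] even before Clamp01 is applied.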

/// <summary>
/// Conflict penalties by severity and reason.
/// </summary>
internal static class ConflictPenalties
{
public const double DistinctCves = 0.40; // Hard: two different CVEs
public const double DisjointVersionRanges = 0.30; // Hard: same pkg, no overlap
public const double OverlappingRanges = 0.05; // Soft: ranges overlap but differ
public const double SeverityMismatch = 0.05; // Soft: CVSS differs
public const double AliasInconsistency = 0.10; // Soft: non-CVE alias mismatch
public const double ZeroReferenceOverlap = 0.00; // Info: no penalty
}

internal readonly record struct InputV2(
string ObservationId,
string? Vendor,
DateTimeOffset? FetchedAt,
IReadOnlyCollection<string> Aliases,
IReadOnlyCollection<string> Purls,
IReadOnlyCollection<string> Cpes,
IReadOnlyCollection<string> References,
IReadOnlyCollection<string>? PatchReferences = null);

internal readonly record struct CorrelationResult(
double Confidence,
IReadOnlyList<AdvisoryLinksetConflict> Conflicts,
IReadOnlyDictionary<string, double> SignalScores);

/// <summary>
/// Computes correlation confidence and conflicts for a set of observations.
/// </summary>
internal static CorrelationResult Compute(
IReadOnlyCollection<InputV2> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null,
Func<string, double>? packageIdfProvider = null)
{
if (inputs.Count == 0)
{
return new CorrelationResult(
1.0,
Array.Empty<AdvisoryLinksetConflict>(),
ImmutableDictionary<string, double>.Empty);
}

var conflicts = new List<AdvisoryLinksetConflict>();
var signalScores = new Dictionary<string, double>();

// 1. Alias connectivity (graph-based)
var (aliasConnectivity, aliasConflicts) = CalculateAliasConnectivity(inputs);
conflicts.AddRange(aliasConflicts);
signalScores["aliasConnectivity"] = aliasConnectivity;

// 2. Alias authority (scope-based weighting)
var aliasAuthority = CalculateAliasAuthority(inputs);
signalScores["aliasAuthority"] = aliasAuthority;

// 3. Package coverage (pairwise + IDF)
var (packageCoverage, packageConflicts) = CalculatePackageCoverage(inputs, packageIdfProvider);
conflicts.AddRange(packageConflicts);
signalScores["packageCoverage"] = packageCoverage;

// 4. Version compatibility
var (versionScore, versionConflicts) = CalculateVersionCompatibility(inputs);
conflicts.AddRange(versionConflicts);
signalScores["versionCompatibility"] = versionScore;

// 5. CPE match (existing logic, minor adjustments)
var cpeScore = CalculateCpeScore(inputs);
signalScores["cpeMatch"] = cpeScore;

// 6. Patch lineage
var patchScore = CalculatePatchLineageScore(inputs);
signalScores["patchLineage"] = patchScore;

// 7. Reference overlap (positive-only, no conflict on zero)
var referenceScore = CalculateReferenceScore(inputs);
signalScores["referenceOverlap"] = referenceScore;

// 8. Freshness
var freshnessScore = CalculateFreshnessScore(inputs);
signalScores["freshness"] = freshnessScore;

// Calculate base confidence from weighted signals
var baseConfidence = Clamp01(
(Weights.AliasConnectivity * aliasConnectivity) +
(Weights.AliasAuthority * aliasAuthority) +
(Weights.PackageCoverage * packageCoverage) +
(Weights.VersionCompatibility * versionScore) +
(Weights.CpeMatch * cpeScore) +
(Weights.PatchLineage * patchScore) +
(Weights.ReferenceOverlap * referenceScore) +
(Weights.Freshness * freshnessScore));

// Add additional conflicts before penalty calculation
if (additionalConflicts is { Count: > 0 })
{
conflicts.AddRange(additionalConflicts);
}

// Apply typed conflict penalties
var totalPenalty = CalculateTypedPenalty(conflicts);
var finalConfidence = Clamp01(baseConfidence - totalPenalty);

// Ensure minimum confidence when conflicts exist but evidence is present
if (finalConfidence < 0.1 && baseConfidence > 0)
{
finalConfidence = 0.1;
}

return new CorrelationResult(
finalConfidence,
DeduplicateAndSort(conflicts, inputs),
signalScores.ToImmutableDictionary());
}

#region Alias Connectivity (Graph-based)

/// <summary>
/// Calculates alias connectivity using bipartite graph analysis.
/// Returns LCC (largest connected component) ratio instead of intersection.
/// </summary>
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateAliasConnectivity(
IReadOnlyCollection<InputV2> inputs)
{
var conflicts = new List<AdvisoryLinksetConflict>();

if (inputs.Count == 1)
{
return (inputs.First().Aliases.Count > 0 ? 1d : 0d, conflicts);
}

// Build bipartite graph: observation nodes + alias nodes
var observationToAliases = inputs
.ToDictionary(
i => i.ObservationId,
i => i.Aliases.Select(a => a.ToUpperInvariant()).ToHashSet(StringComparer.Ordinal));

// Build adjacency for union-find
var allAliases = observationToAliases.Values.SelectMany(a => a).ToHashSet(StringComparer.Ordinal);

if (allAliases.Count == 0)
{
return (0d, conflicts);
}

// Find connected components using alias-based bridging
var observationIds = inputs.Select(i => i.ObservationId).ToList();
var parent = observationIds.ToDictionary(id => id, id => id);

string Find(string x)
{
if (parent[x] != x)
parent[x] = Find(parent[x]);
return parent[x];
}

void Union(string x, string y)
{
var px = Find(x);
var py = Find(y);
if (px != py)
parent[px] = py;
}

// Connect observations that share any alias
foreach (var alias in allAliases)
{
var observationsWithAlias = observationIds
.Where(id => observationToAliases[id].Contains(alias))
.ToList();

for (int i = 1; i < observationsWithAlias.Count; i++)
{
Union(observationsWithAlias[0], observationsWithAlias[i]);
}
}

// Calculate LCC ratio
var componentSizes = observationIds
.GroupBy(Find)
.Select(g => g.Count())
.ToList();

var largestComponent = componentSizes.Max();
var lccRatio = (double)largestComponent / observationIds.Count;
|
||||
|
||||
// Check for distinct CVEs (true identity conflict)
|
||||
var cveAliases = allAliases
|
||||
.Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
|
||||
.ToHashSet(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
if (cveAliases.Count > 1)
|
||||
{
|
||||
// Multiple distinct CVEs in cluster = hard conflict
|
||||
var values = inputs
|
||||
.Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases.Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase)))}")
|
||||
.Where(v => !v.EndsWith(":<none>"))
|
||||
.OrderBy(v => v, StringComparer.Ordinal)
|
||||
.ToArray();
|
||||
|
||||
if (values.Length > 1)
|
||||
{
|
||||
conflicts.Add(new AdvisoryLinksetConflict(
|
||||
"aliases",
|
||||
"distinct-cves",
|
||||
values)
|
||||
{
|
||||
Severity = ConflictSeverity.Hard
|
||||
});
|
||||
}
|
||||
}
|
||||
else if (lccRatio < 1.0 && allAliases.Count > 0)
|
||||
{
|
||||
// Disconnected observations but no CVE conflict = soft inconsistency
|
||||
var disconnectedObs = observationIds
|
||||
.Where(id => Find(id) != Find(observationIds[0]))
|
||||
.Select(id => inputs.First(i => i.ObservationId == id))
|
||||
.Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases)}")
|
||||
.OrderBy(v => v, StringComparer.Ordinal)
|
||||
.ToArray();
|
||||
|
||||
if (disconnectedObs.Length > 0)
|
||||
{
|
||||
conflicts.Add(new AdvisoryLinksetConflict(
|
||||
"aliases",
|
||||
"alias-inconsistency",
|
||||
disconnectedObs)
|
||||
{
|
||||
Severity = ConflictSeverity.Soft
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return (lccRatio, conflicts);
|
||||
}
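
    // Worked example (illustrative): observations A and B both carry CVE-2026-0001
    // while C carries only a vendor alias, so union-find yields components {A, B}
    // and {C}; the LCC ratio is 2/3 and, with a single CVE in the cluster, only a
    // soft "alias-inconsistency" conflict is recorded.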

    /// <summary>
    /// Calculates alias authority score based on scope hierarchy.
    /// CVE (global) > ECO (ecosystem) > VND (vendor) > DST (distribution).
    /// </summary>
    private static double CalculateAliasAuthority(IReadOnlyCollection<InputV2> inputs)
    {
        var allAliases = inputs.SelectMany(i => i.Aliases).ToHashSet(StringComparer.OrdinalIgnoreCase);

        if (allAliases.Count == 0)
            return 0d;

        // Score based on highest authority alias present
        var hasCve = allAliases.Any(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase));
        var hasGhsa = allAliases.Any(a => a.StartsWith("GHSA-", StringComparison.OrdinalIgnoreCase));
        var hasVendor = allAliases.Any(a =>
            a.StartsWith("RHSA-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("MSRC-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("CISCO-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("VMSA-", StringComparison.OrdinalIgnoreCase));
        var hasDistro = allAliases.Any(a =>
            a.StartsWith("DSA-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("USN-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("SUSE-", StringComparison.OrdinalIgnoreCase));

        if (hasCve) return 1.0;
        if (hasGhsa) return 0.8;
        if (hasVendor) return 0.6;
        if (hasDistro) return 0.4;

        return 0.2; // Unknown alias scheme
    }
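
    // Example (illustrative): a cluster carrying {"GHSA-xxxx-yyyy-zzzz", "DSA-5555-1"}
    // but no CVE scores 0.8, because the highest-authority scheme present (GHSA) wins.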

    #endregion

    #region Package Coverage (Pairwise + IDF)

    /// <summary>
    /// Calculates package coverage using pairwise overlap instead of intersection-across-all.
    /// A thin source with no packages does not collapse the score.
    /// </summary>
    private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculatePackageCoverage(
        IReadOnlyCollection<InputV2> inputs,
        Func<string, double>? idfProvider = null)
    {
        var conflicts = new List<AdvisoryLinksetConflict>();

        var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
        if (inputsWithPackages.Count == 0)
        {
            return (0d, conflicts);
        }

        if (inputsWithPackages.Count == 1)
        {
            return (1d, conflicts); // Single source with packages; nothing to compare against.
        }

        // Extract package keys (without version)
        var packageKeysPerInput = inputsWithPackages
            .Select(i => i.Purls
                .Select(ExtractPackageKey)
                .Where(k => !string.IsNullOrWhiteSpace(k))
                .ToHashSet(StringComparer.Ordinal))
            .ToList();

        // Calculate pairwise overlap with optional IDF weighting
        var totalWeight = 0d;
        var matchedWeight = 0d;
        var allPackages = packageKeysPerInput.SelectMany(p => p).ToHashSet(StringComparer.Ordinal);

        foreach (var pkg in allPackages)
        {
            var idfWeight = idfProvider?.Invoke(pkg) ?? 1.0;
            var inputsWithPkg = packageKeysPerInput.Count(set => set.Contains(pkg));

            totalWeight += idfWeight;
            if (inputsWithPkg > 1)
            {
                // Package appears in multiple sources = positive signal
                matchedWeight += idfWeight * ((double)inputsWithPkg / inputsWithPackages.Count);
            }
        }

        var score = totalWeight > 0 ? matchedWeight / totalWeight : 0d;

        // Check for exact PURL overlap (with version)
        var hasExactOverlap = HasExactPurlOverlap(inputsWithPackages);
        if (hasExactOverlap)
        {
            score = Math.Max(score, 0.8); // Boost for exact match
        }

        // Range divergence is reported as soft conflicts by version scoring;
        // it is no longer emitted here, to avoid double-counting.
        return (Clamp01(score), conflicts);
    }
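
    // Worked example (illustrative IDF values): with two sources, a rare package
    // shared by both (IDF 3.0) and a ubiquitous package seen in only one (IDF 0.5),
    // totalWeight = 3.5 and matchedWeight = 3.0 * (2/2) = 3.0, so the coverage
    // score is 3.0 / 3.5 ≈ 0.857; the rare shared package dominates.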

    #endregion

    #region Version Compatibility

    /// <summary>
    /// Classifies version relationships for shared packages.
    /// </summary>
    private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateVersionCompatibility(
        IReadOnlyCollection<InputV2> inputs)
    {
        var conflicts = new List<AdvisoryLinksetConflict>();

        var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
        if (inputsWithPackages.Count < 2)
        {
            return (0.5d, conflicts); // Neutral when no comparison possible
        }

        // Find shared package keys
        var packageKeysPerInput = inputsWithPackages
            .Select(i => i.Purls
                .Select(ExtractPackageKey)
                .Where(k => !string.IsNullOrWhiteSpace(k))
                .ToHashSet(StringComparer.Ordinal))
            .ToList();

        var sharedPackages = packageKeysPerInput
            .Skip(1)
            .Aggregate(
                new HashSet<string>(packageKeysPerInput[0], StringComparer.Ordinal),
                (acc, next) =>
                {
                    acc.IntersectWith(next);
                    return acc;
                });

        if (sharedPackages.Count == 0)
        {
            return (0.5d, conflicts); // Neutral when no shared packages
        }

        var totalScore = 0d;
        var packageCount = 0;

        foreach (var packageKey in sharedPackages)
        {
            var versionsPerSource = inputsWithPackages
                .Select(i => new
                {
                    i.Vendor,
                    Versions = i.Purls
                        .Where(p => ExtractPackageKey(p) == packageKey)
                        .Select(ExtractVersion)
                        .Where(v => !string.IsNullOrWhiteSpace(v))
                        .ToList()
                })
                .Where(x => x.Versions.Count > 0)
                .ToList();

            if (versionsPerSource.Count < 2)
                continue;

            packageCount++;

            // Classify relationship (simplified; full impl would use SemanticVersionRangeResolver)
            var relation = ClassifyVersionRelation(versionsPerSource.Select(v => v.Versions).ToList());

            switch (relation)
            {
                case VersionRelation.Equivalent:
                    totalScore += 1.0;
                    break;

                case VersionRelation.Overlapping:
                    totalScore += 0.6;
                    var overlapValues = versionsPerSource
                        .Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
                        .OrderBy(x => x, StringComparer.Ordinal)
                        .ToArray();
                    conflicts.Add(new AdvisoryLinksetConflict(
                        $"affected.versions[{packageKey}]",
                        "affected-range-divergence",
                        overlapValues)
                    {
                        Severity = ConflictSeverity.Soft
                    });
                    break;

                case VersionRelation.Disjoint:
                    totalScore += 0.0;
                    var disjointValues = versionsPerSource
                        .Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
                        .OrderBy(x => x, StringComparer.Ordinal)
                        .ToArray();
                    conflicts.Add(new AdvisoryLinksetConflict(
                        $"affected.versions[{packageKey}]",
                        "disjoint-version-ranges",
                        disjointValues)
                    {
                        Severity = ConflictSeverity.Hard
                    });
                    break;

                default:
                    totalScore += 0.5; // Unknown = neutral
                    break;
            }
        }

        var avgScore = packageCount > 0 ? totalScore / packageCount : 0.5;
        return (Clamp01(avgScore), conflicts);
    }

    private static VersionRelation ClassifyVersionRelation(List<List<string>> versionSets)
    {
        if (versionSets.Count < 2)
            return VersionRelation.Unknown;

        var first = versionSets[0].ToHashSet(StringComparer.OrdinalIgnoreCase);
        var allEquivalent = true;
        var anyOverlap = false;

        foreach (var other in versionSets.Skip(1))
        {
            var otherSet = other.ToHashSet(StringComparer.OrdinalIgnoreCase);

            if (!first.SetEquals(otherSet))
                allEquivalent = false;

            if (first.Overlaps(otherSet))
                anyOverlap = true;
        }

        if (allEquivalent)
            return VersionRelation.Equivalent;

        if (anyOverlap)
            return VersionRelation.Overlapping;

        return VersionRelation.Disjoint;
    }
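
    // Examples (illustrative): {"1.2.3"} vs {"1.2.3"} => Equivalent (scores 1.0);
    // {"1.2.3", "1.3.0"} vs {"1.3.0"} => Overlapping (0.6 plus a soft conflict);
    // {"1.2.3"} vs {"2.0.0"} => Disjoint (0.0 plus a hard conflict).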

    #endregion

    #region Patch Lineage

    /// <summary>
    /// Calculates patch lineage correlation.
    /// Exact commit SHA match is a very strong signal.
    /// </summary>
    private static double CalculatePatchLineageScore(IReadOnlyCollection<InputV2> inputs)
    {
        var inputsWithPatches = inputs
            .Where(i => i.PatchReferences?.Count > 0)
            .ToList();

        if (inputsWithPatches.Count < 2)
        {
            return 0d; // No patch data to compare
        }

        // Extract normalized patch references (commit SHAs, PR URLs)
        var patchesPerInput = inputsWithPatches
            .Select(i => i.PatchReferences!
                .Select(NormalizePatchReference)
                .Where(p => p is not null)
                .Select(p => p!)
                .ToHashSet(StringComparer.OrdinalIgnoreCase))
            .ToList();

        // Find any pairwise overlap
        for (int i = 0; i < patchesPerInput.Count; i++)
        {
            for (int j = i + 1; j < patchesPerInput.Count; j++)
            {
                if (patchesPerInput[i].Overlaps(patchesPerInput[j]))
                {
                    // Exact patch match = very strong signal
                    return 1.0;
                }
            }
        }

        return 0d;
    }

    // Hoisted to static fields so the patterns are compiled once rather than per call.
    private static readonly System.Text.RegularExpressions.Regex CommitUrlPattern = new(
        @"(?:github\.com|gitlab\.com)/[^/]+/[^/]+(?:/-)?/commit/([0-9a-f]{7,40})",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Compiled);

    private static readonly System.Text.RegularExpressions.Regex FullShaPattern = new(
        @"\b([0-9a-f]{40})\b",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Compiled);

    private static string? NormalizePatchReference(string reference)
    {
        if (string.IsNullOrWhiteSpace(reference))
            return null;

        // Extract commit SHA from GitHub/GitLab URLs
        var match = CommitUrlPattern.Match(reference);
        if (match.Success)
        {
            return match.Groups[1].Value.ToLowerInvariant();
        }

        // Fall back to a bare full-length (40-char) SHA anywhere in the reference
        match = FullShaPattern.Match(reference);
        if (match.Success)
        {
            return match.Groups[1].Value.ToLowerInvariant();
        }

        return null;
    }
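
    // Examples (illustrative): "https://github.com/acme/libfoo/commit/0a1b2c3d4e5f"
    // and "https://gitlab.com/acme/libfoo/-/commit/0a1b2c3d4e5f" both normalize to
    // "0a1b2c3d4e5f", so two advisories citing the same fix commit score 1.0.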

    #endregion

    #region Reference Score (Positive-Only)

    /// <summary>
    /// Calculates reference overlap as a positive-only signal.
    /// Zero overlap is neutral (0.5), not a conflict.
    /// </summary>
    private static double CalculateReferenceScore(IReadOnlyCollection<InputV2> inputs)
    {
        if (inputs.All(i => i.References.Count == 0))
        {
            return 0.5d; // Neutral when no references
        }

        // Normalize each input's reference set once, then compare pairwise.
        var normalizedSets = inputs
            .Select(i => i.References
                .Select(NormalizeReferenceUrl)
                .ToHashSet(StringComparer.OrdinalIgnoreCase))
            .ToList();

        var maxOverlap = 0d;

        for (var i = 0; i < normalizedSets.Count; i++)
        {
            for (var j = i + 1; j < normalizedSets.Count; j++)
            {
                var first = normalizedSets[i];
                var second = normalizedSets[j];

                var intersection = first.Intersect(second, StringComparer.OrdinalIgnoreCase).Count();
                var denom = Math.Max(first.Count, second.Count);
                var overlap = denom == 0 ? 0d : (double)intersection / denom;

                if (overlap > maxOverlap)
                {
                    maxOverlap = overlap;
                }
            }
        }

        // Map overlap to score: 0 overlap = 0.5 (neutral), 1.0 overlap = 1.0
        return 0.5 + (maxOverlap * 0.5);
    }

    private static string NormalizeReferenceUrl(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
            return string.Empty;

        // Lowercase, remove tracking params, normalize protocol
        var normalized = url.ToLowerInvariant().Trim();

        // Remove common tracking parameters
        var queryIndex = normalized.IndexOf('?');
        if (queryIndex > 0)
        {
            normalized = normalized[..queryIndex];
        }

        // Normalize protocol
        if (normalized.StartsWith("http://"))
        {
            normalized = "https://" + normalized[7..];
        }

        // Remove trailing slash
        return normalized.TrimEnd('/');
    }
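
    // Example (illustrative): "HTTP://Example.com/advisory/?utm_source=feed" and
    // "https://example.com/advisory" normalize to the same key, so they count as
    // an overlap despite protocol, casing, query-string, and trailing-slash noise.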

    #endregion

    #region CPE and Freshness (Minor Updates)

    private static double CalculateCpeScore(IReadOnlyCollection<InputV2> inputs)
    {
        if (inputs.All(i => i.Cpes.Count == 0))
        {
            return 0d;
        }

        var cpeSets = inputs.Select(i => i.Cpes.ToHashSet(StringComparer.OrdinalIgnoreCase)).ToList();
        var exactOverlap = cpeSets.Skip(1).Any(set => set.Overlaps(cpeSets.First()));
        if (exactOverlap)
        {
            return 1d;
        }

        var vendorProductSets = inputs
            .Select(i => i.Cpes.Select(ParseVendorProduct).Where(vp => vp.vendor is not null).ToHashSet())
            .ToList();

        var sharedVendorProduct = vendorProductSets.Skip(1).Any(set => set.Overlaps(vendorProductSets.First()));
        return sharedVendorProduct ? 0.5d : 0d;
    }

    private static (string? vendor, string? product) ParseVendorProduct(string cpe)
    {
        if (string.IsNullOrWhiteSpace(cpe))
        {
            return (null, null);
        }

        var parts = cpe.Split(':');

        // CPE 2.3 formatted string: cpe:2.3:part:vendor:product:version:...
        if (parts.Length >= 5 && parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) && parts[1] == "2.3")
        {
            return (parts[3], parts[4]);
        }

        // CPE 2.2 URI: cpe:/part:vendor:product (the part component carries the
        // leading slash, e.g. "/a", so match on the prefix rather than "/" alone)
        if (parts.Length >= 4 && parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) && parts[1].StartsWith('/'))
        {
            return (parts[2], parts[3]);
        }

        return (null, null);
    }

    private static double CalculateFreshnessScore(IReadOnlyCollection<InputV2> inputs)
    {
        var fetched = inputs
            .Select(i => i.FetchedAt)
            .Where(d => d.HasValue)
            .Select(d => d!.Value)
            .ToList();

        if (fetched.Count <= 1)
        {
            return 0.5d;
        }

        var min = fetched.Min();
        var max = fetched.Max();
        var spread = max - min;

        if (spread <= TimeSpan.FromHours(48))
        {
            return 1d;
        }

        if (spread >= TimeSpan.FromDays(14))
        {
            return 0d;
        }

        var remaining = TimeSpan.FromDays(14) - spread;
        return Clamp01(remaining.TotalSeconds / TimeSpan.FromDays(14).TotalSeconds);
    }
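
    // Worked example (illustrative): fetch timestamps spread over 7 days fall in
    // the linear decay zone, scoring (14d - 7d) / 14d = 0.5; spreads within 48
    // hours score 1.0 and spreads of 14 days or more score 0.0.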

    #endregion

    #region Conflict Penalties

    /// <summary>
    /// Calculates typed penalty based on conflict severities.
    /// </summary>
    private static double CalculateTypedPenalty(IReadOnlyList<AdvisoryLinksetConflict> conflicts)
    {
        if (conflicts.Count == 0)
            return 0d;

        var totalPenalty = 0d;

        foreach (var conflict in conflicts)
        {
            var penalty = conflict.Reason switch
            {
                "distinct-cves" => ConflictPenalties.DistinctCves,
                "disjoint-version-ranges" => ConflictPenalties.DisjointVersionRanges,
                "affected-range-divergence" => ConflictPenalties.OverlappingRanges,
                "severity-mismatch" => ConflictPenalties.SeverityMismatch,
                "alias-inconsistency" => ConflictPenalties.AliasInconsistency,
                "reference-clash" => 0d, // No penalty for reference differences
                _ => 0.05 // Default small penalty for unknown conflicts
            };

            totalPenalty += penalty;
        }

        // Saturate at 0.6 to prevent total collapse
        return Math.Min(totalPenalty, 0.6);
    }
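
    // Example (illustrative, hypothetical penalty constants): one "distinct-cves"
    // conflict at 0.4 plus two "affected-range-divergence" conflicts at 0.15 each
    // would sum to 0.7, but the saturation cap returns 0.6, so evidence-backed
    // clusters are never zeroed out by conflicts alone.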

    #endregion

    #region Helpers

    private static bool HasExactPurlOverlap(IReadOnlyCollection<InputV2> inputs)
    {
        var first = inputs.First().Purls.ToHashSet(StringComparer.Ordinal);
        return inputs.Skip(1).Any(input => input.Purls.Any(first.Contains));
    }

    private static string ExtractPackageKey(string purl)
    {
        if (string.IsNullOrWhiteSpace(purl))
        {
            return string.Empty;
        }

        var atIndex = purl.LastIndexOf('@');
        return atIndex > 0 ? purl[..atIndex] : purl;
    }

    private static string ExtractVersion(string purl)
    {
        if (string.IsNullOrWhiteSpace(purl))
        {
            return string.Empty;
        }

        var atIndex = purl.LastIndexOf('@');
        if (atIndex < 0 || atIndex >= purl.Length - 1)
        {
            return string.Empty;
        }

        var version = purl[(atIndex + 1)..];

        // Remove qualifiers if present
        var qualifierIndex = version.IndexOf('?');
        if (qualifierIndex > 0)
        {
            version = version[..qualifierIndex];
        }

        return version;
    }
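
    // Example (illustrative): for "pkg:npm/lodash@4.17.21?arch=x64",
    // ExtractPackageKey returns "pkg:npm/lodash" and ExtractVersion returns
    // "4.17.21" (the "?arch=x64" qualifier is stripped).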

    private static IReadOnlyList<AdvisoryLinksetConflict> DeduplicateAndSort(
        IEnumerable<AdvisoryLinksetConflict> conflicts,
        IReadOnlyCollection<InputV2> inputs)
    {
        var set = new HashSet<string>(StringComparer.Ordinal);
        var list = new List<AdvisoryLinksetConflict>();

        foreach (var conflict in conflicts)
        {
            var normalizedValues = NormalizeValues(conflict.Values);
            var normalizedSources = NormalizeValues(conflict.SourceIds);
            var key = $"{conflict.Field}|{conflict.Reason}|{string.Join('|', normalizedValues)}";

            if (set.Add(key))
            {
                if (normalizedSources.Count == 0)
                {
                    normalizedSources = inputs
                        .Select(i => i.Vendor ?? "source")
                        .Distinct(StringComparer.OrdinalIgnoreCase)
                        .OrderBy(v => v, StringComparer.Ordinal)
                        .ToArray();
                }

                list.Add(conflict with
                {
                    Values = normalizedValues,
                    SourceIds = normalizedSources
                });
            }
        }

        return list
            .OrderBy(c => c.Field, StringComparer.Ordinal)
            .ThenBy(c => c.Reason, StringComparer.Ordinal)
            .ThenBy(c => string.Join('|', c.Values ?? Array.Empty<string>()), StringComparer.Ordinal)
            .ToList();
    }

    private static double Clamp01(double value) => Math.Clamp(value, 0d, 1d);

    private static string FirstSortedOrDefault(IEnumerable<string> values)
    {
        var first = values
            .Where(v => !string.IsNullOrWhiteSpace(v))
            .Select(v => v.Trim())
            .OrderBy(v => v, StringComparer.Ordinal)
            .FirstOrDefault();
        return string.IsNullOrEmpty(first) ? "<none>" : first;
    }

    private static IReadOnlyList<string> NormalizeValues(IReadOnlyList<string>? values)
    {
        if (values is null || values.Count == 0)
        {
            return Array.Empty<string>();
        }

        return values
            .Where(v => !string.IsNullOrWhiteSpace(v))
            .Select(v => v.Trim())
            .OrderBy(v => v, StringComparer.Ordinal)
            .ToArray();
    }

    #endregion
}
@@ -0,0 +1,331 @@

// -----------------------------------------------------------------------------
// TextSimilarityScorer.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-010
// Description: Deterministic TF-IDF text similarity for linkset correlation
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Computes TF-IDF-based text similarity between advisory descriptions.
/// Used as an optional correlation signal in V2 linkset correlation.
/// </summary>
/// <remarks>
/// <para>
/// This scorer is designed for deterministic, offline operation:
/// - No external NLP dependencies (pure C# implementation)
/// - Configurable stop words and tokenization
/// - Stable output across runs (no randomness)
/// </para>
/// <para>
/// Default weight: 0.05 (low weight, supplementary signal).
/// Feature flag: <c>concelier:correlation:textSimilarity:enabled</c> (default: false).
/// </para>
/// </remarks>
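/// <example>
/// A minimal usage sketch (illustrative; assumes only the members defined in this file):
/// <code>
/// var scorer = new TextSimilarityScorer();
/// var score = scorer.ComputePairwiseSimilarity(
///     "Heap buffer overflow in libfoo allows remote code execution",
///     "libfoo heap overflow lets remote attackers execute arbitrary code");
/// // score is in [0.0, 1.0]; higher means more shared discriminative terms.
/// </code>
/// </example>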
public sealed class TextSimilarityScorer
{
    private static readonly Regex TokenRegex = new(
        @"[a-zA-Z][a-zA-Z0-9_-]{2,}",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    private static readonly HashSet<string> DefaultStopWords = new(StringComparer.OrdinalIgnoreCase)
    {
        // Common English stop words
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
        "be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
        "used", "this", "that", "these", "those", "which", "who", "whom", "whose",
        "what", "where", "when", "why", "how", "all", "each", "every", "both",
        "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
        "own", "same", "so", "than", "too", "very", "just", "also", "now", "here",
        "there", "then", "once", "if", "into", "over", "after", "before", "about",
        // Common vulnerability description words (low discriminative value)
        "vulnerability", "issue", "allows", "attacker", "attack", "remote", "local",
        "user", "code", "execution", "denial", "service", "buffer", "overflow",
        "may", "could", "via", "using", "through", "affected", "version", "versions",
        "product", "software", "application", "component", "module", "function"
    };

    private readonly TextSimilarityOptions _options;
    private readonly HashSet<string> _stopWords;

    /// <summary>
    /// Initializes a new instance of <see cref="TextSimilarityScorer"/>.
    /// </summary>
    /// <param name="options">Configuration options. Null uses defaults.</param>
    public TextSimilarityScorer(TextSimilarityOptions? options = null)
    {
        _options = options ?? new TextSimilarityOptions();
        _stopWords = _options.CustomStopWords is not null
            ? new HashSet<string>(_options.CustomStopWords, StringComparer.OrdinalIgnoreCase)
            : DefaultStopWords;
    }

    /// <summary>
    /// Computes average pairwise TF-IDF cosine similarity across all description pairs.
    /// </summary>
    /// <param name="descriptions">Collection of normalized description texts.</param>
    /// <returns>Average similarity score (0.0-1.0). Returns 0 if fewer than 2 descriptions.</returns>
    public double ComputeAverageSimilarity(IReadOnlyCollection<string> descriptions)
    {
        if (descriptions.Count < 2)
        {
            return 0.0;
        }

        // Filter out empty/null descriptions
        var validDescriptions = descriptions
            .Where(d => !string.IsNullOrWhiteSpace(d))
            .ToArray();

        if (validDescriptions.Length < 2)
        {
            return 0.0;
        }

        // Tokenize all descriptions
        var tokenizedDocs = validDescriptions
            .Select(d => Tokenize(d))
            .ToArray();

        // Build document frequency map
        var documentFrequency = BuildDocumentFrequency(tokenizedDocs);

        // Compute TF-IDF vectors
        var tfidfVectors = tokenizedDocs
            .Select(tokens => ComputeTfIdf(tokens, documentFrequency, tokenizedDocs.Length))
            .ToArray();

        // Compute average pairwise cosine similarity
        var totalSimilarity = 0.0;
        var pairCount = 0;

        for (var i = 0; i < tfidfVectors.Length; i++)
        {
            for (var j = i + 1; j < tfidfVectors.Length; j++)
            {
                totalSimilarity += CosineSimilarity(tfidfVectors[i], tfidfVectors[j]);
                pairCount++;
            }
        }

        return pairCount > 0 ? totalSimilarity / pairCount : 0.0;
    }

    /// <summary>
    /// Computes TF-IDF cosine similarity between two descriptions.
    /// </summary>
    /// <param name="description1">First description text.</param>
    /// <param name="description2">Second description text.</param>
    /// <returns>Similarity score (0.0-1.0).</returns>
    public double ComputePairwiseSimilarity(string description1, string description2)
    {
        if (string.IsNullOrWhiteSpace(description1) || string.IsNullOrWhiteSpace(description2))
        {
            return 0.0;
        }

        var tokens1 = Tokenize(description1);
        var tokens2 = Tokenize(description2);

        if (tokens1.Count == 0 || tokens2.Count == 0)
        {
            return 0.0;
        }

        // For pairwise, use simple term frequency with IDF approximation
        var allTerms = new HashSet<string>(tokens1, StringComparer.OrdinalIgnoreCase);
        allTerms.UnionWith(tokens2);

        // Document frequency (appears in 1 or 2 docs)
        var df = allTerms.ToDictionary(
            t => t,
            t => (tokens1.Contains(t) ? 1 : 0) + (tokens2.Contains(t) ? 1 : 0),
            StringComparer.OrdinalIgnoreCase);

        var vec1 = ComputeTfIdf(tokens1, df, 2);
        var vec2 = ComputeTfIdf(tokens2, df, 2);

        return CosineSimilarity(vec1, vec2);
    }
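
    // Example (illustrative): in this two-document case, a term present in both
    // descriptions gets df = 2 (idf = ln(3/3) + 1 = 1.0) while a term unique to
    // one gets df = 1 (idf = ln(3/2) + 1 ≈ 1.405), so unique terms weigh more.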

    /// <summary>
    /// Tokenizes text into lowercase terms, removing stop words and short tokens.
    /// </summary>
    internal IReadOnlyList<string> Tokenize(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return Array.Empty<string>();
        }

        var matches = TokenRegex.Matches(text);
        var tokens = new List<string>(matches.Count);

        foreach (Match match in matches)
        {
            var token = match.Value.ToLowerInvariant();

            // Skip stop words
            if (_stopWords.Contains(token))
            {
                continue;
            }

            // Skip tokens that are too short
            if (token.Length < _options.MinTokenLength)
            {
                continue;
            }

            // Skip tokens that are all digits (version numbers, etc.)
            if (token.All(char.IsDigit))
            {
                continue;
            }

            tokens.Add(token);
        }

        // Sort for determinism
        tokens.Sort(StringComparer.Ordinal);

        return tokens;
    }

    private static Dictionary<string, int> BuildDocumentFrequency(IReadOnlyList<IReadOnlyList<string>> documents)
    {
        var df = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

        foreach (var doc in documents)
        {
            var uniqueTerms = new HashSet<string>(doc, StringComparer.OrdinalIgnoreCase);
            foreach (var term in uniqueTerms)
            {
                df.TryGetValue(term, out var count);
                df[term] = count + 1;
            }
        }

        return df;
    }

    private Dictionary<string, double> ComputeTfIdf(
        IReadOnlyList<string> tokens,
        Dictionary<string, int> documentFrequency,
        int totalDocuments)
    {
        // Compute term frequency
        var termFrequency = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        foreach (var token in tokens)
        {
            termFrequency.TryGetValue(token, out var count);
            termFrequency[token] = count + 1;
        }

        if (termFrequency.Count == 0)
        {
            return new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
        }

        // Compute TF-IDF
        var tfidf = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
        var maxTf = termFrequency.Values.Max();

        foreach (var (term, tf) in termFrequency)
        {
            // Augmented TF: 0.5 + 0.5 * (tf / max_tf), so the most frequent term maps to 1.0
            var normalizedTf = 0.5 + 0.5 * ((double)tf / maxTf);

            // IDF: log((N + 1) / (df + 1)) + 1 (smoothed IDF to avoid zero)
            // This ensures terms that appear in all documents still have some weight
            documentFrequency.TryGetValue(term, out var df);
            var idf = Math.Log((double)(totalDocuments + 1) / (df + 1)) + 1.0;

            tfidf[term] = normalizedTf * idf;
        }

        return tfidf;
    }
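
    // Worked example (illustrative): with 3 documents and a term whose raw tf is 2
    // against a document maximum of 4, the augmented tf is 0.5 + 0.5 * (2/4) = 0.75;
    // if the term appears in 2 of the 3 documents, idf = ln(4/3) + 1 ≈ 1.288, giving
    // a TF-IDF weight of 0.75 * 1.288 ≈ 0.966.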

    private static double CosineSimilarity(
        Dictionary<string, double> vec1,
        Dictionary<string, double> vec2)
    {
        // Get all terms
        var allTerms = new HashSet<string>(vec1.Keys, StringComparer.OrdinalIgnoreCase);
        allTerms.UnionWith(vec2.Keys);

        // Compute dot product and magnitudes
        var dotProduct = 0.0;
        var mag1 = 0.0;
        var mag2 = 0.0;

        foreach (var term in allTerms)
        {
            vec1.TryGetValue(term, out var v1);
            vec2.TryGetValue(term, out var v2);

            dotProduct += v1 * v2;
            mag1 += v1 * v1;
            mag2 += v2 * v2;
        }

        mag1 = Math.Sqrt(mag1);
        mag2 = Math.Sqrt(mag2);

        if (mag1 < double.Epsilon || mag2 < double.Epsilon)
        {
            return 0.0;
        }

        return dotProduct / (mag1 * mag2);
    }
}

/// <summary>
/// Configuration options for the text similarity scorer.
/// </summary>
public sealed class TextSimilarityOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Concelier:Correlation:TextSimilarity";

    /// <summary>
    /// Whether text similarity scoring is enabled.
    /// Default: false (Phase 3 feature, not yet GA).
    /// </summary>
    public bool Enabled { get; set; } = false;

    /// <summary>
    /// Weight for text similarity in unified scoring.
    /// Default: 0.05.
    /// </summary>
    public double Weight { get; set; } = 0.05;

    /// <summary>
    /// Minimum token length after normalization.
    /// Default: 3.
    /// </summary>
    public int MinTokenLength { get; set; } = 3;

    /// <summary>
    /// Custom stop words list. If null, uses built-in defaults.
    /// </summary>
    public IReadOnlyList<string>? CustomStopWords { get; set; }

    /// <summary>
    /// Whether to apply Porter stemming to tokens.
    /// Default: false (adds complexity, minimal benefit for security text).
    /// </summary>
    public bool EnableStemming { get; set; } = false;
}
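
// Binding sketch (illustrative; assumes the Microsoft.Extensions.Configuration
// binder is available and an IConfiguration instance `configuration` is in scope):
//
//     var options = new TextSimilarityOptions();
//     configuration.GetSection(TextSimilarityOptions.SectionName).Bind(options);
//     var scorer = new TextSimilarityScorer(options);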