devops folders consolidate
@@ -121,6 +121,70 @@ public static class AdvisoryCacheKeys
    public static string CveMappingPattern(string prefix = DefaultPrefix)
        => $"{prefix}by:cve:*";

    // -------------------------------------------------------------------------
    // IDF (Inverse Document Frequency) Cache Keys
    // Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
    // Task: CORR-V2-007
    // -------------------------------------------------------------------------

    /// <summary>
    /// Key for IDF score of a specific package.
    /// Format: {prefix}idf:pkg:{normalizedPackageName}
    /// </summary>
    /// <param name="packageName">The package name (will be normalized).</param>
    /// <param name="prefix">Key prefix.</param>
    public static string IdfPackage(string packageName, string prefix = DefaultPrefix)
        => $"{prefix}idf:pkg:{NormalizePurl(packageName)}";

    /// <summary>
    /// Key for IDF corpus statistics (total document count).
    /// Format: {prefix}idf:stats:corpus_size
    /// </summary>
    public static string IdfCorpusSize(string prefix = DefaultPrefix)
        => $"{prefix}idf:stats:corpus_size";

    /// <summary>
    /// Key for IDF last refresh timestamp.
    /// Format: {prefix}idf:stats:last_refresh
    /// </summary>
    public static string IdfLastRefresh(string prefix = DefaultPrefix)
        => $"{prefix}idf:stats:last_refresh";

    /// <summary>
    /// Key for IDF refresh lock (distributed coordination).
    /// Format: {prefix}idf:lock:refresh
    /// </summary>
    public static string IdfRefreshLock(string prefix = DefaultPrefix)
        => $"{prefix}idf:lock:refresh";

    /// <summary>
    /// Key for document frequency of a package (count of observations containing the package).
    /// Format: {prefix}idf:df:{normalizedPackageName}
    /// </summary>
    public static string IdfDocumentFrequency(string packageName, string prefix = DefaultPrefix)
        => $"{prefix}idf:df:{NormalizePurl(packageName)}";

    /// <summary>
    /// Pattern to match all IDF package keys (for scanning/cleanup).
    /// Format: {prefix}idf:pkg:*
    /// </summary>
    public static string IdfPackagePattern(string prefix = DefaultPrefix)
        => $"{prefix}idf:pkg:*";

    /// <summary>
    /// Key for IDF cache hit counter.
    /// Format: {prefix}idf:stats:hits
    /// </summary>
    public static string IdfStatsHits(string prefix = DefaultPrefix)
        => $"{prefix}idf:stats:hits";

    /// <summary>
    /// Key for IDF cache miss counter.
    /// Format: {prefix}idf:stats:misses
    /// </summary>
    public static string IdfStatsMisses(string prefix = DefaultPrefix)
        => $"{prefix}idf:stats:misses";

    /// <summary>
    /// Normalizes a PURL for use as a cache key.
    /// </summary>
@@ -0,0 +1,153 @@
// -----------------------------------------------------------------------------
// IPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Interface for package IDF (Inverse Document Frequency) caching
// -----------------------------------------------------------------------------

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Service for computing and caching IDF (Inverse Document Frequency) weights
/// for package keys used in linkset correlation.
/// </summary>
/// <remarks>
/// IDF measures how discriminative a package is across the observation corpus:
/// <code>
/// idf(pkg) = log(N / (1 + df(pkg)))
/// </code>
/// where N = total observations and df = observations containing the package.
///
/// Rare packages (low df) have high IDF → stronger correlation signal.
/// Common packages (high df) have low IDF → weaker correlation signal.
/// </remarks>
public interface IPackageIdfService
{
    /// <summary>
    /// Gets the IDF weight for a package key.
    /// </summary>
    /// <param name="packageName">The package name (PURL format).</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>
    /// The IDF weight (0.0-1.0 normalized), or null if not cached.
    /// Returns null on cache miss or error (graceful degradation).
    /// </returns>
    Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets IDF weights for multiple package keys in a single batch operation.
    /// </summary>
    /// <param name="packageNames">The package names to look up.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>
    /// Dictionary of package name to IDF weight. Missing entries indicate a cache miss.
    /// </returns>
    Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
        IEnumerable<string> packageNames,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Sets the IDF weight for a package key.
    /// </summary>
    /// <param name="packageName">The package name.</param>
    /// <param name="idfWeight">The IDF weight (0.0-1.0 normalized).</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default);

    /// <summary>
    /// Sets IDF weights for multiple package keys in a single batch operation.
    /// </summary>
    /// <param name="idfWeights">Dictionary of package name to IDF weight.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task SetIdfBatchAsync(
        IReadOnlyDictionary<string, double> idfWeights,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Updates the corpus statistics used for IDF computation.
    /// </summary>
    /// <param name="corpusSize">Total number of observations in the corpus.</param>
    /// <param name="documentFrequencies">Dictionary of package name to document frequency.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task UpdateCorpusStatsAsync(
        long corpusSize,
        IReadOnlyDictionary<string, long> documentFrequencies,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the last refresh timestamp for IDF statistics.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The last refresh time, or null if never refreshed.</returns>
    Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Invalidates cached IDF data for a specific package.
    /// </summary>
    /// <param name="packageName">The package name to invalidate.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default);

    /// <summary>
    /// Invalidates all cached IDF data.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task InvalidateAllAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Whether the IDF cache is enabled and available.
    /// </summary>
    bool IsEnabled { get; }
}

/// <summary>
/// Configuration options for the package IDF service.
/// </summary>
public sealed class PackageIdfOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Concelier:PackageIdf";

    /// <summary>
    /// Whether IDF caching is enabled.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// TTL for cached IDF scores.
    /// Default: 1 hour.
    /// </summary>
    public TimeSpan IdfTtl { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// TTL for corpus statistics.
    /// Default: 4 hours.
    /// </summary>
    public TimeSpan CorpusStatsTtl { get; set; } = TimeSpan.FromHours(4);

    /// <summary>
    /// Minimum IDF value to cache (to avoid caching very common packages).
    /// Default: 0.01.
    /// </summary>
    public double MinIdfThreshold { get; set; } = 0.01;

    /// <summary>
    /// Default IDF weight to return on cache miss (uniform weight).
    /// Default: 1.0 (no discrimination).
    /// </summary>
    public double DefaultIdfWeight { get; set; } = 1.0;

    /// <summary>
    /// Maximum number of IDF entries to cache.
    /// Default: 100,000.
    /// </summary>
    public int MaxCacheEntries { get; set; } = 100_000;

    /// <summary>
    /// Whether to normalize IDF scores to the 0.0-1.0 range.
    /// Default: true.
    /// </summary>
    public bool NormalizeScores { get; set; } = true;
}
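For orientation, a minimal consumer sketch (illustrative only, not part of the commit; the resolver class and direct use of PackageIdfOptions are assumptions — only IPackageIdfService, GetIdfAsync, and DefaultIdfWeight come from the contract above):

// Hypothetical consumer: resolve a correlation weight for one package,
// falling back to the uniform DefaultIdfWeight on a cache miss, since
// GetIdfAsync returns null on miss or error per the contract above.
public sealed class IdfWeightResolver
{
    private readonly IPackageIdfService _idfService;
    private readonly PackageIdfOptions _options;

    public IdfWeightResolver(IPackageIdfService idfService, PackageIdfOptions options)
    {
        _idfService = idfService;
        _options = options;
    }

    public async Task<double> ResolveAsync(string purl, CancellationToken ct = default)
    {
        var cached = await _idfService.GetIdfAsync(purl, ct).ConfigureAwait(false);
        return cached ?? _options.DefaultIdfWeight; // uniform weight when uncached
    }
}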
@@ -0,0 +1,139 @@
// -----------------------------------------------------------------------------
// IdfRefreshHostedService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Background service for periodic IDF weight refresh
// -----------------------------------------------------------------------------

using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Interface for providing IDF corpus statistics from the observation store.
/// </summary>
/// <remarks>
/// This interface should be implemented by the Concelier Core module to provide
/// document frequencies from the actual observation database.
/// </remarks>
public interface IIdfCorpusProvider
{
    /// <summary>
    /// Gets the total number of observations in the corpus.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Total observation count.</returns>
    Task<long> GetCorpusSizeAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets document frequencies for all packages in the corpus.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Dictionary mapping package name to the number of observations containing it.</returns>
    Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken cancellationToken = default);
}

/// <summary>
/// Background service that periodically refreshes IDF weights from the observation corpus.
/// </summary>
public sealed class IdfRefreshHostedService : BackgroundService
{
    private readonly IPackageIdfService _idfService;
    private readonly IIdfCorpusProvider? _corpusProvider;
    private readonly PackageIdfOptions _options;
    private readonly ILogger<IdfRefreshHostedService>? _logger;

    /// <summary>
    /// Initializes a new instance of <see cref="IdfRefreshHostedService"/>.
    /// </summary>
    public IdfRefreshHostedService(
        IPackageIdfService idfService,
        IOptions<PackageIdfOptions> options,
        IIdfCorpusProvider? corpusProvider = null,
        ILogger<IdfRefreshHostedService>? logger = null)
    {
        _idfService = idfService ?? throw new ArgumentNullException(nameof(idfService));
        _corpusProvider = corpusProvider;
        _options = options?.Value ?? new PackageIdfOptions();
        _logger = logger;
    }

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_idfService.IsEnabled)
        {
            _logger?.LogInformation("IDF refresh service disabled (IDF caching not enabled)");
            return;
        }

        if (_corpusProvider is null)
        {
            _logger?.LogWarning(
                "IDF refresh service has no corpus provider registered. " +
                "Register IIdfCorpusProvider to enable automatic IDF refresh.");
            return;
        }

        // Initial delay before first refresh (allow other services to start)
        await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken).ConfigureAwait(false);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await RefreshIdfWeightsAsync(stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                _logger?.LogError(ex, "Error during IDF refresh cycle");
            }

            // Wait for next refresh interval (default: 1 hour)
            try
            {
                await Task.Delay(_options.IdfTtl, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
        }

        _logger?.LogInformation("IDF refresh service stopped");
    }

    private async Task RefreshIdfWeightsAsync(CancellationToken cancellationToken)
    {
        _logger?.LogDebug("Starting IDF refresh cycle");

        var corpusSize = await _corpusProvider!.GetCorpusSizeAsync(cancellationToken).ConfigureAwait(false);

        if (corpusSize == 0)
        {
            _logger?.LogWarning("IDF refresh skipped: empty corpus");
            return;
        }

        var documentFrequencies = await _corpusProvider.GetDocumentFrequenciesAsync(cancellationToken).ConfigureAwait(false);

        if (documentFrequencies.Count == 0)
        {
            _logger?.LogWarning("IDF refresh skipped: no document frequencies");
            return;
        }

        await _idfService.UpdateCorpusStatsAsync(corpusSize, documentFrequencies, cancellationToken).ConfigureAwait(false);

        _logger?.LogInformation(
            "IDF refresh completed: corpus={CorpusSize}, packages={PackageCount}",
            corpusSize,
            documentFrequencies.Count);
    }
}
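The remarks above say Concelier Core should supply the corpus provider; a minimal sketch of one (hypothetical class, e.g. for tests — a real provider would query the observation store):

// Hypothetical in-memory corpus provider satisfying IIdfCorpusProvider.
public sealed class InMemoryIdfCorpusProvider : IIdfCorpusProvider
{
    private readonly long _corpusSize;
    private readonly IReadOnlyDictionary<string, long> _documentFrequencies;

    public InMemoryIdfCorpusProvider(long corpusSize, IReadOnlyDictionary<string, long> documentFrequencies)
    {
        _corpusSize = corpusSize;
        _documentFrequencies = documentFrequencies;
    }

    public Task<long> GetCorpusSizeAsync(CancellationToken cancellationToken = default)
        => Task.FromResult(_corpusSize);

    public Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken cancellationToken = default)
        => Task.FromResult(_documentFrequencies);
}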
@@ -0,0 +1,249 @@
// -----------------------------------------------------------------------------
// PackageIdfMetrics.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: OpenTelemetry metrics for package IDF caching operations
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Diagnostics.Metrics;

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Metrics instrumentation for the package IDF cache.
/// </summary>
public sealed class PackageIdfMetrics : IDisposable
{
    /// <summary>
    /// Activity source name for IDF cache operations.
    /// </summary>
    public const string ActivitySourceName = "StellaOps.Concelier.PackageIdf";

    /// <summary>
    /// Meter name for IDF cache metrics.
    /// </summary>
    public const string MeterName = "StellaOps.Concelier.PackageIdf";

    private readonly Meter _meter;
    private readonly Counter<long> _hitsCounter;
    private readonly Counter<long> _missesCounter;
    private readonly Counter<long> _refreshCounter;
    private readonly Histogram<double> _latencyHistogram;
    private readonly Histogram<double> _idfWeightHistogram;
    private readonly ObservableGauge<long> _corpusSizeGauge;
    private readonly ObservableGauge<long> _cachedEntriesGauge;

    private long _lastKnownCorpusSize;
    private long _lastKnownCachedEntries;

    /// <summary>
    /// Activity source for tracing IDF cache operations.
    /// </summary>
    public static ActivitySource ActivitySource { get; } = new(ActivitySourceName, "1.0.0");

    /// <summary>
    /// Initializes a new instance of <see cref="PackageIdfMetrics"/>.
    /// </summary>
    public PackageIdfMetrics()
    {
        _meter = new Meter(MeterName, "1.0.0");

        _hitsCounter = _meter.CreateCounter<long>(
            "concelier_linkset_package_idf_hits_total",
            unit: "{hits}",
            description: "Total number of package IDF cache hits");

        _missesCounter = _meter.CreateCounter<long>(
            "concelier_linkset_package_idf_misses_total",
            unit: "{misses}",
            description: "Total number of package IDF cache misses");

        _refreshCounter = _meter.CreateCounter<long>(
            "concelier_linkset_package_idf_refreshes_total",
            unit: "{refreshes}",
            description: "Total number of IDF corpus refresh operations");

        _latencyHistogram = _meter.CreateHistogram<double>(
            "concelier_linkset_package_idf_latency_ms",
            unit: "ms",
            description: "Package IDF cache operation latency in milliseconds");

        _idfWeightHistogram = _meter.CreateHistogram<double>(
            "concelier_linkset_package_idf_weight",
            unit: "{weight}",
            description: "Distribution of package IDF weights (0.0-1.0)");

        _corpusSizeGauge = _meter.CreateObservableGauge(
            "concelier_linkset_package_idf_corpus_size",
            () => _lastKnownCorpusSize,
            unit: "{observations}",
            description: "Total number of observations in the IDF corpus");

        _cachedEntriesGauge = _meter.CreateObservableGauge(
            "concelier_linkset_package_idf_cached_entries",
            () => _lastKnownCachedEntries,
            unit: "{entries}",
            description: "Number of cached IDF entries");
    }

    /// <summary>
    /// Records a cache hit.
    /// </summary>
    public void RecordHit() => _hitsCounter.Add(1);

    /// <summary>
    /// Records multiple cache hits.
    /// </summary>
    /// <param name="count">Number of hits.</param>
    public void RecordHits(long count) => _hitsCounter.Add(count);

    /// <summary>
    /// Records a cache miss.
    /// </summary>
    public void RecordMiss() => _missesCounter.Add(1);

    /// <summary>
    /// Records multiple cache misses.
    /// </summary>
    /// <param name="count">Number of misses.</param>
    public void RecordMisses(long count) => _missesCounter.Add(count);

    /// <summary>
    /// Records a corpus refresh operation.
    /// </summary>
    /// <param name="packageCount">Number of packages refreshed.</param>
    public void RecordRefresh(long packageCount = 1)
    {
        _refreshCounter.Add(1, new KeyValuePair<string, object?>("package_count", packageCount));
    }

    /// <summary>
    /// Records operation latency.
    /// </summary>
    /// <param name="milliseconds">Latency in milliseconds.</param>
    /// <param name="operation">The operation type (get, set, batch_get, refresh).</param>
    public void RecordLatency(double milliseconds, string operation)
    {
        _latencyHistogram.Record(milliseconds, new KeyValuePair<string, object?>("operation", operation));
    }

    /// <summary>
    /// Records an IDF weight observation for distribution analysis.
    /// </summary>
    /// <param name="weight">The IDF weight (0.0-1.0).</param>
    public void RecordIdfWeight(double weight)
    {
        _idfWeightHistogram.Record(weight);
    }

    /// <summary>
    /// Updates the corpus size gauge.
    /// </summary>
    /// <param name="size">Current corpus size.</param>
    public void UpdateCorpusSize(long size)
    {
        _lastKnownCorpusSize = size;
    }

    /// <summary>
    /// Updates the cached entries gauge.
    /// </summary>
    /// <param name="count">Current cached entry count.</param>
    public void UpdateCachedEntries(long count)
    {
        _lastKnownCachedEntries = count;
    }

    /// <summary>
    /// Starts an activity for tracing an IDF cache operation.
    /// </summary>
    /// <param name="operationName">Name of the operation.</param>
    /// <returns>The activity, or null if tracing is disabled.</returns>
    public static Activity? StartActivity(string operationName)
    {
        return ActivitySource.StartActivity(operationName, ActivityKind.Internal);
    }

    /// <summary>
    /// Starts an activity with tags.
    /// </summary>
    /// <param name="operationName">Name of the operation.</param>
    /// <param name="tags">Tags to add to the activity.</param>
    /// <returns>The activity, or null if tracing is disabled.</returns>
    public static Activity? StartActivity(string operationName, params (string Key, object? Value)[] tags)
    {
        var activity = ActivitySource.StartActivity(operationName, ActivityKind.Internal);
        if (activity is not null)
        {
            foreach (var (key, value) in tags)
            {
                activity.SetTag(key, value);
            }
        }
        return activity;
    }

    /// <inheritdoc />
    public void Dispose()
    {
        _meter.Dispose();
    }
}

/// <summary>
/// Extension methods for timing IDF cache operations.
/// </summary>
public static class PackageIdfMetricsExtensions
{
    /// <summary>
    /// Times an async operation and records the latency.
    /// </summary>
    public static async Task<T> TimeAsync<T>(
        this PackageIdfMetrics? metrics,
        string operation,
        Func<Task<T>> action)
    {
        if (metrics is null)
        {
            return await action().ConfigureAwait(false);
        }

        var sw = Stopwatch.StartNew();
        try
        {
            return await action().ConfigureAwait(false);
        }
        finally
        {
            sw.Stop();
            metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
        }
    }

    /// <summary>
    /// Times an async operation and records the latency.
    /// </summary>
    public static async Task TimeAsync(
        this PackageIdfMetrics? metrics,
        string operation,
        Func<Task> action)
    {
        if (metrics is null)
        {
            await action().ConfigureAwait(false);
            return;
        }

        var sw = Stopwatch.StartNew();
        try
        {
            await action().ConfigureAwait(false);
        }
        finally
        {
            sw.Stop();
            metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
        }
    }
}
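A usage sketch for the TimeAsync extension (the metrics and idfService instances are assumed; "get" follows the operation tags documented on RecordLatency):

// Times the lookup and records latency under the "get" operation tag;
// if metrics is null, the extension simply runs the action untimed.
PackageIdfMetrics? metrics = new();
double? weight = await metrics.TimeAsync("get", () => idfService.GetIdfAsync("pkg:npm/lodash"));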
@@ -32,6 +32,10 @@ public static class ServiceCollectionExtensions
        services.Configure<ConcelierCacheOptions>(
            configuration.GetSection(ConcelierCacheOptions.SectionName));

        // Bind package IDF options (CORR-V2-007)
        services.Configure<PackageIdfOptions>(
            configuration.GetSection(PackageIdfOptions.SectionName));

        return AddCoreServices(services, enableWarmup);
    }

@@ -39,16 +43,23 @@ public static class ServiceCollectionExtensions
    /// Adds Concelier Valkey cache services with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure cache options.</param>
    /// <param name="configureIdfOptions">Optional action to configure IDF options.</param>
    /// <param name="enableWarmup">Whether to enable background cache warmup.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddConcelierValkeyCache(
        this IServiceCollection services,
        Action<ConcelierCacheOptions> configureOptions,
        Action<PackageIdfOptions>? configureIdfOptions = null,
        bool enableWarmup = true)
    {
        services.Configure(configureOptions);

        if (configureIdfOptions is not null)
        {
            services.Configure(configureIdfOptions);
        }

        return AddCoreServices(services, enableWarmup);
    }

@@ -59,9 +70,11 @@ public static class ServiceCollectionExtensions

        // Register metrics
        services.TryAddSingleton<ConcelierCacheMetrics>();
        services.TryAddSingleton<PackageIdfMetrics>();

        // Register cache services
        services.TryAddSingleton<IAdvisoryCacheService, ValkeyAdvisoryCacheService>();
        services.TryAddSingleton<IPackageIdfService, ValkeyPackageIdfService>();

        // Register warmup hosted service if enabled
        if (enableWarmup)
@@ -69,6 +82,10 @@ public static class ServiceCollectionExtensions
        {
            services.AddHostedService<CacheWarmupHostedService>();
        }

        // Register IDF refresh hosted service (CORR-V2-007)
        // Note: Requires IIdfCorpusProvider to be registered by Concelier.Core
        services.AddHostedService<IdfRefreshHostedService>();

        return services;
    }
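An illustrative registration call against the overload above (the host builder, the prefix value, and the provider binding are assumptions; KeyPrefix comes from ConcelierCacheOptions as used later in this commit):

// Hypothetical host setup: configure cache + IDF options, then register a
// corpus provider so IdfRefreshHostedService has something to refresh from.
builder.Services.AddConcelierValkeyCache(
    cache => cache.KeyPrefix = "concelier:",
    idf =>
    {
        idf.IdfTtl = TimeSpan.FromMinutes(30);
        idf.NormalizeScores = true;
    });
builder.Services.AddSingleton<IIdfCorpusProvider, InMemoryIdfCorpusProvider>(); // or a store-backed provider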
@@ -0,0 +1,421 @@
// -----------------------------------------------------------------------------
// ValkeyPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Valkey-backed implementation of IPackageIdfService
// -----------------------------------------------------------------------------

using System.Diagnostics;
using System.Globalization;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StackExchange.Redis;

namespace StellaOps.Concelier.Cache.Valkey;

/// <summary>
/// Valkey-backed implementation of <see cref="IPackageIdfService"/>.
/// Provides caching for package IDF (Inverse Document Frequency) weights
/// used in linkset correlation scoring.
/// </summary>
/// <remarks>
/// <para>
/// This service caches pre-computed IDF weights with hourly refresh.
/// On cache miss, it returns null to signal the caller should use uniform weights.
/// </para>
/// <para>
/// Key features:
/// - Batch operations for efficient multi-package lookups
/// - Graceful degradation on Valkey errors (returns null, logs warning)
/// - TTL-based expiration with configurable refresh intervals
/// - OpenTelemetry metrics for monitoring cache performance
/// </para>
/// </remarks>
public sealed class ValkeyPackageIdfService : IPackageIdfService
{
    private readonly ConcelierCacheConnectionFactory _connectionFactory;
    private readonly ConcelierCacheOptions _cacheOptions;
    private readonly PackageIdfOptions _idfOptions;
    private readonly PackageIdfMetrics? _metrics;
    private readonly ILogger<ValkeyPackageIdfService>? _logger;

    /// <summary>
    /// Initializes a new instance of <see cref="ValkeyPackageIdfService"/>.
    /// </summary>
    public ValkeyPackageIdfService(
        ConcelierCacheConnectionFactory connectionFactory,
        IOptions<ConcelierCacheOptions> cacheOptions,
        IOptions<PackageIdfOptions> idfOptions,
        PackageIdfMetrics? metrics = null,
        ILogger<ValkeyPackageIdfService>? logger = null)
    {
        _connectionFactory = connectionFactory ?? throw new ArgumentNullException(nameof(connectionFactory));
        _cacheOptions = cacheOptions?.Value ?? new ConcelierCacheOptions();
        _idfOptions = idfOptions?.Value ?? new PackageIdfOptions();
        _metrics = metrics;
        _logger = logger;
    }

    /// <inheritdoc />
    public bool IsEnabled => _cacheOptions.Enabled && _idfOptions.Enabled;

    /// <inheritdoc />
    public async Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default)
    {
        if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
        {
            return null;
        }

        var sw = StartTiming();
        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);

            var cached = await db.StringGetAsync(key).ConfigureAwait(false);
            if (cached.HasValue && double.TryParse((string?)cached, NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
            {
                await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsHits(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
                _metrics?.RecordHit();
                _metrics?.RecordIdfWeight(weight);
                return weight;
            }

            await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsMisses(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
            _metrics?.RecordMiss();
            return null;
        }
        catch (Exception ex)
        {
            _logger?.LogWarning(ex, "Failed to get IDF for package {PackageName}", packageName);
            return null; // Graceful degradation
        }
        finally
        {
            StopTiming(sw, "get");
        }
    }

    /// <inheritdoc />
    public async Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
        IEnumerable<string> packageNames,
        CancellationToken cancellationToken = default)
    {
        var names = packageNames?.Where(n => !string.IsNullOrWhiteSpace(n)).Distinct().ToArray()
            ?? Array.Empty<string>();

        if (!IsEnabled || names.Length == 0)
        {
            return new Dictionary<string, double>();
        }

        var sw = StartTiming();
        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var keys = names.Select(n => (RedisKey)AdvisoryCacheKeys.IdfPackage(n, _cacheOptions.KeyPrefix)).ToArray();

            var values = await db.StringGetAsync(keys).ConfigureAwait(false);

            var result = new Dictionary<string, double>(names.Length);
            var hits = 0;
            var misses = 0;

            for (var i = 0; i < names.Length; i++)
            {
                if (values[i].HasValue &&
                    double.TryParse((string?)values[i], NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
                {
                    result[names[i]] = weight;
                    hits++;
                    _metrics?.RecordIdfWeight(weight);
                }
                else
                {
                    misses++;
                }
            }

            if (hits > 0) _metrics?.RecordHits(hits);
            if (misses > 0) _metrics?.RecordMisses(misses);

            return result;
        }
        catch (Exception ex)
        {
            _logger?.LogWarning(ex, "Failed to batch get IDF for {Count} packages", names.Length);
            return new Dictionary<string, double>();
        }
        finally
        {
            StopTiming(sw, "batch_get");
        }
    }

    /// <inheritdoc />
    public async Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default)
    {
        if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
        {
            return;
        }

        // Skip caching weights below threshold (very common packages)
        if (idfWeight < _idfOptions.MinIdfThreshold)
        {
            return;
        }

        var sw = StartTiming();
        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);
            var value = idfWeight.ToString("F6", CultureInfo.InvariantCulture);

            await db.StringSetAsync(key, value, _idfOptions.IdfTtl).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger?.LogWarning(ex, "Failed to set IDF for package {PackageName}", packageName);
        }
        finally
        {
            StopTiming(sw, "set");
        }
    }

    /// <inheritdoc />
    public async Task SetIdfBatchAsync(
        IReadOnlyDictionary<string, double> idfWeights,
        CancellationToken cancellationToken = default)
    {
        if (!IsEnabled || idfWeights is null || idfWeights.Count == 0)
        {
            return;
        }

        var sw = StartTiming();
        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);

            var entries = idfWeights
                .Where(kv => !string.IsNullOrWhiteSpace(kv.Key) && kv.Value >= _idfOptions.MinIdfThreshold)
                .Select(kv => new KeyValuePair<RedisKey, RedisValue>(
                    AdvisoryCacheKeys.IdfPackage(kv.Key, _cacheOptions.KeyPrefix),
                    kv.Value.ToString("F6", CultureInfo.InvariantCulture)))
                .ToArray();

            if (entries.Length == 0)
            {
                return;
            }

            // Use pipeline for batch set with TTL
            var batch = db.CreateBatch();
            var tasks = new List<Task>(entries.Length);

            foreach (var entry in entries)
            {
                tasks.Add(batch.StringSetAsync(entry.Key, entry.Value, _idfOptions.IdfTtl));
            }

            batch.Execute();
            await Task.WhenAll(tasks).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger?.LogWarning(ex, "Failed to batch set IDF for {Count} packages", idfWeights.Count);
        }
        finally
        {
            StopTiming(sw, "batch_set");
        }
    }

    /// <inheritdoc />
    public async Task UpdateCorpusStatsAsync(
        long corpusSize,
        IReadOnlyDictionary<string, long> documentFrequencies,
        CancellationToken cancellationToken = default)
    {
        if (!IsEnabled)
        {
            return;
        }

        var sw = StartTiming();
        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var prefix = _cacheOptions.KeyPrefix;

            // Update corpus size
            await db.StringSetAsync(
                AdvisoryCacheKeys.IdfCorpusSize(prefix),
                corpusSize.ToString(CultureInfo.InvariantCulture),
                _idfOptions.CorpusStatsTtl).ConfigureAwait(false);

            // Compute and cache IDF weights
            var idfWeights = new Dictionary<string, double>(documentFrequencies.Count);
            var maxIdf = 0.0;

            foreach (var (packageName, df) in documentFrequencies)
            {
                // IDF formula: log(N / (1 + df))
                var rawIdf = Math.Log((double)corpusSize / (1 + df));
                if (rawIdf > maxIdf) maxIdf = rawIdf;
                idfWeights[packageName] = rawIdf;
            }

            // Normalize if configured
            if (_idfOptions.NormalizeScores && maxIdf > 0)
            {
                foreach (var key in idfWeights.Keys.ToArray())
                {
                    idfWeights[key] /= maxIdf;
                }
            }

            // Batch set the normalized IDF weights
            await SetIdfBatchAsync(idfWeights, cancellationToken).ConfigureAwait(false);

            // Update document frequencies
            var batch = db.CreateBatch();
            var tasks = new List<Task>(documentFrequencies.Count);

            foreach (var (packageName, df) in documentFrequencies)
            {
                tasks.Add(batch.StringSetAsync(
                    AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix),
                    df.ToString(CultureInfo.InvariantCulture),
                    _idfOptions.CorpusStatsTtl));
            }

            batch.Execute();
            await Task.WhenAll(tasks).ConfigureAwait(false);

            // Update last refresh timestamp
            await db.StringSetAsync(
                AdvisoryCacheKeys.IdfLastRefresh(prefix),
                DateTimeOffset.UtcNow.ToString("o", CultureInfo.InvariantCulture),
                _idfOptions.CorpusStatsTtl).ConfigureAwait(false);

            _metrics?.UpdateCorpusSize(corpusSize);
            _metrics?.UpdateCachedEntries(documentFrequencies.Count);
            _metrics?.RecordRefresh(documentFrequencies.Count);

            _logger?.LogInformation(
                "Updated IDF corpus: size={CorpusSize}, packages={PackageCount}",
                corpusSize,
                documentFrequencies.Count);
        }
        catch (Exception ex)
        {
            _logger?.LogError(ex, "Failed to update IDF corpus stats");
        }
        finally
        {
            StopTiming(sw, "refresh");
        }
    }

    /// <inheritdoc />
    public async Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default)
    {
        if (!IsEnabled)
        {
            return null;
        }

        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var key = AdvisoryCacheKeys.IdfLastRefresh(_cacheOptions.KeyPrefix);

            var cached = await db.StringGetAsync(key).ConfigureAwait(false);
            if (cached.HasValue &&
                DateTimeOffset.TryParse(cached, CultureInfo.InvariantCulture, DateTimeStyles.RoundtripKind, out var timestamp))
            {
                return timestamp;
            }

            return null;
        }
        catch (Exception ex)
        {
            _logger?.LogWarning(ex, "Failed to get IDF last refresh timestamp");
            return null;
        }
    }

    /// <inheritdoc />
    public async Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default)
    {
        if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
        {
            return;
        }

        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var prefix = _cacheOptions.KeyPrefix;

            await Task.WhenAll(
                db.KeyDeleteAsync(AdvisoryCacheKeys.IdfPackage(packageName, prefix)),
                db.KeyDeleteAsync(AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix))
            ).ConfigureAwait(false);
        }
        catch (Exception ex)
        {
            _logger?.LogWarning(ex, "Failed to invalidate IDF for package {PackageName}", packageName);
        }
    }

    /// <inheritdoc />
    public async Task InvalidateAllAsync(CancellationToken cancellationToken = default)
    {
        if (!IsEnabled)
        {
            return;
        }

        try
        {
            var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
            var prefix = _cacheOptions.KeyPrefix;

            // Delete stats keys
            await Task.WhenAll(
                db.KeyDeleteAsync(AdvisoryCacheKeys.IdfCorpusSize(prefix)),
                db.KeyDeleteAsync(AdvisoryCacheKeys.IdfLastRefresh(prefix)),
                db.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsHits(prefix)),
                db.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsMisses(prefix))
            ).ConfigureAwait(false);

            // Note: Scanning and deleting all idf:pkg:* keys would require SCAN,
            // which is expensive. For now, rely on TTL expiration.
            _logger?.LogInformation("Invalidated IDF stats; individual package keys will expire via TTL");
        }
        catch (Exception ex)
        {
            _logger?.LogError(ex, "Failed to invalidate all IDF cache");
        }
    }

    private Stopwatch? StartTiming()
    {
        if (_metrics is null) return null;
        return Stopwatch.StartNew();
    }

    private void StopTiming(Stopwatch? sw, string operation)
    {
        if (sw is null || _metrics is null) return;
        sw.Stop();
        _metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
    }
}
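A minimal sketch of the batch lookup with uniform-weight fallback, matching the miss semantics above (idfService, options, and the package names are assumptions):

// Illustrative: batch lookup, falling back to DefaultIdfWeight for misses,
// since GetIdfBatchAsync omits entries that were not cached.
var purls = new[] { "pkg:npm/left-pad", "pkg:pypi/requests" };
var weights = await idfService.GetIdfBatchAsync(purls);
foreach (var purl in purls)
{
    var w = weights.TryGetValue(purl, out var cached) ? cached : options.DefaultIdfWeight;
    // feed w into correlation scoring...
}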
@@ -40,11 +40,33 @@ public sealed record AdvisoryLinksetProvenance(
    string? ToolVersion,
    string? PolicyHash);

/// <summary>
/// Conflict severity levels for typed penalty calculation.
/// </summary>
public enum ConflictSeverity
{
    /// <summary>No penalty; informational only.</summary>
    Info = 0,

    /// <summary>Minor disagreement; small penalty.</summary>
    Soft = 1,

    /// <summary>Significant disagreement; should usually prevent high-confidence linking.</summary>
    Hard = 2
}

public sealed record AdvisoryLinksetConflict(
    string Field,
    string Reason,
    IReadOnlyList<string>? Values,
    IReadOnlyList<string>? SourceIds = null)
{
    /// <summary>
    /// Severity of the conflict. Defaults to <see cref="ConflictSeverity.Soft"/>.
    /// Hard conflicts significantly impact confidence; Info conflicts are purely informational.
    /// </summary>
    public ConflictSeverity Severity { get; init; } = ConflictSeverity.Soft;
}

internal static class DocumentHelper
{
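For illustration, constructing a typed conflict with the record above (the field/reason strings mirror those the V2 algorithm emits later in this commit; the values are invented):

// A hard identity conflict: two observations claiming different CVEs.
var conflict = new AdvisoryLinksetConflict(
    "aliases",
    "distinct-cves",
    new[] { "ghsa:CVE-2026-0001", "redhat:CVE-2026-0002" })
{
    Severity = ConflictSeverity.Hard
};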
@@ -0,0 +1,73 @@
// -----------------------------------------------------------------------------
// ILinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Abstraction for linkset correlation with V1/V2 support
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using StellaOps.Concelier.Models;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Service for computing linkset correlation confidence and conflicts.
/// Supports multiple correlation algorithm versions (V1, V2).
/// </summary>
public interface ILinksetCorrelationService
{
    /// <summary>
    /// Gets the correlation algorithm version being used.
    /// </summary>
    string Version { get; }

    /// <summary>
    /// Computes correlation confidence and conflicts for a set of observation inputs.
    /// </summary>
    (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null);
}

/// <summary>
/// Unified input model for correlation computation.
/// </summary>
public sealed record CorrelationInput(
    string ObservationId,
    string? Vendor,
    DateTimeOffset? FetchedAt,
    IReadOnlyCollection<string> Aliases,
    IReadOnlyCollection<string> Purls,
    IReadOnlyCollection<string> Cpes,
    IReadOnlyCollection<string> References,
    IReadOnlyCollection<string>? PatchReferences = null);

/// <summary>
/// Configuration for the correlation service.
/// </summary>
public sealed class CorrelationServiceOptions
{
    /// <summary>
    /// Correlation algorithm version. Supported values: "v1", "v2".
    /// Default: "v1" for backward compatibility.
    /// </summary>
    public string Version { get; set; } = "v1";

    /// <summary>
    /// Optional custom weights for V2 correlation signals.
    /// Keys: aliasConnectivity, aliasAuthority, packageCoverage, versionCompatibility,
    /// cpeMatch, patchLineage, referenceOverlap, freshness
    /// </summary>
    public Dictionary<string, double>? Weights { get; set; }

    /// <summary>
    /// Whether to enable IDF weighting for package keys (V2 only).
    /// </summary>
    public bool EnableIdfWeighting { get; set; } = true;

    /// <summary>
    /// Whether to enable text similarity scoring (V2 Phase 3, disabled by default).
    /// </summary>
    public bool EnableTextSimilarity { get; set; } = false;
}
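A configuration sketch for opting into V2 (the weight values are invented, and only the key names documented on Weights are assumed; whether unspecified keys keep their defaults is not shown in this commit):

// Hypothetical options wiring: select V2 and override two signal weights.
services.Configure<CorrelationServiceOptions>(opts =>
{
    opts.Version = "v2";
    opts.EnableIdfWeighting = true;
    opts.Weights = new Dictionary<string, double>
    {
        ["aliasConnectivity"] = 0.35,
        ["packageCoverage"] = 0.25,
    };
});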
@@ -0,0 +1,104 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Implementation of ILinksetCorrelationService with V1/V2 support
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Concelier.Models;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Default implementation of <see cref="ILinksetCorrelationService"/>.
/// Supports V1 (intersection-based) and V2 (graph-based) correlation algorithms.
/// </summary>
public sealed class LinksetCorrelationService : ILinksetCorrelationService
{
    private readonly CorrelationServiceOptions _options;
    private readonly ILogger<LinksetCorrelationService> _logger;
    private readonly Func<string, double>? _idfProvider;

    public LinksetCorrelationService(
        IOptions<CorrelationServiceOptions> options,
        ILogger<LinksetCorrelationService> logger,
        Func<string, double>? idfProvider = null)
    {
        _options = options?.Value ?? new CorrelationServiceOptions();
        _logger = logger;
        _idfProvider = idfProvider;
    }

    /// <inheritdoc />
    public string Version => _options.Version?.ToLowerInvariant() switch
    {
        "v2" => "v2",
        _ => "v1"
    };

    /// <inheritdoc />
    public (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null)
    {
        if (inputs.Count == 0)
        {
            return (1.0, Array.Empty<AdvisoryLinksetConflict>());
        }

        return Version switch
        {
            "v2" => ComputeV2(inputs, additionalConflicts),
            _ => ComputeV1(inputs, additionalConflicts)
        };
    }

    private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV1(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
    {
        // Convert to V1 input format
        var v1Inputs = inputs.Select(i => new LinksetCorrelation.Input(
            Vendor: i.Vendor,
            FetchedAt: i.FetchedAt,
            Aliases: i.Aliases,
            Purls: i.Purls,
            Cpes: i.Cpes,
            References: i.References)).ToArray();

        return LinksetCorrelation.Compute(v1Inputs, additionalConflicts);
    }

    private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV2(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
    {
        // Convert to V2 input format
        var v2Inputs = inputs.Select(i => new LinksetCorrelationV2.InputV2(
            ObservationId: i.ObservationId,
            Vendor: i.Vendor,
            FetchedAt: i.FetchedAt,
            Aliases: i.Aliases,
            Purls: i.Purls,
            Cpes: i.Cpes,
            References: i.References,
            PatchReferences: i.PatchReferences)).ToArray();

        var idfProvider = _options.EnableIdfWeighting ? _idfProvider : null;
        var result = LinksetCorrelationV2.Compute(v2Inputs, additionalConflicts, idfProvider);

        _logger.LogDebug(
            "V2 correlation computed: confidence={Confidence:F3}, conflicts={ConflictCount}, signals={Signals}",
            result.Confidence,
            result.Conflicts.Count,
            string.Join(", ", result.SignalScores.Select(kv => $"{kv.Key}={kv.Value:F2}")));

        return (result.Confidence, result.Conflicts);
    }
}
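A call-site sketch against the public Compute surface (all observation data is invented; correlationService is an assumed ILinksetCorrelationService instance):

// Illustrative: two observations that share a CVE and a package should
// correlate with high confidence and no hard conflicts.
var inputs = new[]
{
    new CorrelationInput("obs-1", "ghsa", DateTimeOffset.UtcNow,
        Aliases: new[] { "CVE-2026-0001", "GHSA-xxxx-yyyy-zzzz" },
        Purls: new[] { "pkg:npm/left-pad" },
        Cpes: Array.Empty<string>(),
        References: new[] { "https://example.test/advisory" }),
    new CorrelationInput("obs-2", "nvd", DateTimeOffset.UtcNow,
        Aliases: new[] { "CVE-2026-0001" },
        Purls: new[] { "pkg:npm/left-pad" },
        Cpes: Array.Empty<string>(),
        References: Array.Empty<string>()),
};
var (confidence, conflicts) = correlationService.Compute(inputs);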
@@ -0,0 +1,910 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// LinksetCorrelationV2.cs
|
||||
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
|
||||
// Task: CORR-V2-001 through CORR-V2-008
|
||||
// Description: V2 correlation algorithm with graph-based alias connectivity,
|
||||
// version compatibility scoring, patch lineage signals, and typed
|
||||
// conflict severities.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Collections.Immutable;
|
||||
using System.Linq;
|
||||
using StellaOps.Concelier.Models;
|
||||
|
||||
namespace StellaOps.Concelier.Core.Linksets;
|
||||
|
||||
/// <summary>
|
||||
/// Version relationship classification for affected range comparison.
|
||||
/// </summary>
|
||||
public enum VersionRelation
|
||||
{
|
||||
/// <summary>Unable to determine relationship.</summary>
|
||||
Unknown = 0,
|
||||
|
||||
/// <summary>Ranges normalize to identical primitives.</summary>
|
||||
Equivalent = 1,
|
||||
|
||||
/// <summary>Ranges have non-empty intersection but are not equal.</summary>
|
||||
Overlapping = 2,
|
||||
|
||||
/// <summary>Ranges have no intersection.</summary>
|
||||
Disjoint = 3
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// V2 linkset correlation algorithm with graph-based connectivity,
|
||||
/// typed conflict severities, and multi-signal scoring.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Key improvements over V1:
|
||||
/// - Alias matching uses graph connectivity (LCC ratio) instead of intersection-across-all
|
||||
/// - PURL matching uses pairwise coverage instead of intersection-across-all
|
||||
/// - Reference clash only emitted for true contradictions, not zero overlap
|
||||
/// - Typed conflict severities with per-reason penalties
|
||||
/// - Patch lineage as high-weight signal
|
||||
/// - Version compatibility classification (equivalent/overlapping/disjoint)
|
||||
/// </remarks>
|
||||
internal static class LinksetCorrelationV2
|
||||
{
|
||||
/// <summary>
|
||||
/// Default correlation weights. Can be overridden via configuration.
|
||||
/// </summary>
|
||||
internal static class Weights
|
||||
{
|
||||
public const double AliasConnectivity = 0.30;
|
||||
public const double AliasAuthority = 0.10;
|
||||
public const double PackageCoverage = 0.20;
|
||||
public const double VersionCompatibility = 0.10;
|
||||
public const double CpeMatch = 0.10;
|
||||
public const double PatchLineage = 0.10;
|
||||
public const double ReferenceOverlap = 0.05;
|
||||
public const double Freshness = 0.05;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Conflict penalties by severity and reason.
|
||||
/// </summary>
|
||||
internal static class ConflictPenalties
|
||||
{
|
||||
public const double DistinctCves = 0.40; // Hard: two different CVEs
|
||||
public const double DisjointVersionRanges = 0.30; // Hard: same pkg, no overlap
|
||||
public const double OverlappingRanges = 0.05; // Soft: ranges overlap but differ
|
||||
public const double SeverityMismatch = 0.05; // Soft: CVSS differs
|
||||
public const double AliasInconsistency = 0.10; // Soft: non-CVE alias mismatch
|
||||
public const double ZeroReferenceOverlap = 0.00; // Info: no penalty
|
||||
}
|
||||
|
||||
internal readonly record struct InputV2(
|
||||
string ObservationId,
|
||||
string? Vendor,
|
||||
DateTimeOffset? FetchedAt,
|
||||
IReadOnlyCollection<string> Aliases,
|
||||
IReadOnlyCollection<string> Purls,
|
||||
IReadOnlyCollection<string> Cpes,
|
||||
IReadOnlyCollection<string> References,
|
||||
IReadOnlyCollection<string>? PatchReferences = null);
|
||||
|
||||
internal readonly record struct CorrelationResult(
|
||||
double Confidence,
|
||||
IReadOnlyList<AdvisoryLinksetConflict> Conflicts,
|
||||
IReadOnlyDictionary<string, double> SignalScores);
|
||||
|
||||
/// <summary>
|
||||
/// Computes correlation confidence and conflicts for a set of observations.
|
||||
/// </summary>
|
||||
internal static CorrelationResult Compute(
|
||||
IReadOnlyCollection<InputV2> inputs,
|
||||
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null,
|
||||
Func<string, double>? packageIdfProvider = null)
|
||||
{
|
||||
if (inputs.Count == 0)
|
||||
{
|
||||
return new CorrelationResult(
|
||||
1.0,
|
||||
Array.Empty<AdvisoryLinksetConflict>(),
|
||||
ImmutableDictionary<string, double>.Empty);
|
||||
}
|
||||
|
||||
var conflicts = new List<AdvisoryLinksetConflict>();
|
||||
var signalScores = new Dictionary<string, double>();
|
||||
|
||||
// 1. Alias connectivity (graph-based)
|
||||
var (aliasConnectivity, aliasConflicts) = CalculateAliasConnectivity(inputs);
|
||||
conflicts.AddRange(aliasConflicts);
|
||||
signalScores["aliasConnectivity"] = aliasConnectivity;
|

        // 2. Alias authority (scope-based weighting)
        var aliasAuthority = CalculateAliasAuthority(inputs);
        signalScores["aliasAuthority"] = aliasAuthority;

        // 3. Package coverage (pairwise + IDF)
        var (packageCoverage, packageConflicts) = CalculatePackageCoverage(inputs, packageIdfProvider);
        conflicts.AddRange(packageConflicts);
        signalScores["packageCoverage"] = packageCoverage;

        // 4. Version compatibility
        var (versionScore, versionConflicts) = CalculateVersionCompatibility(inputs);
        conflicts.AddRange(versionConflicts);
        signalScores["versionCompatibility"] = versionScore;

        // 5. CPE match (existing logic, minor adjustments)
        var cpeScore = CalculateCpeScore(inputs);
        signalScores["cpeMatch"] = cpeScore;

        // 6. Patch lineage
        var patchScore = CalculatePatchLineageScore(inputs);
        signalScores["patchLineage"] = patchScore;

        // 7. Reference overlap (positive-only, no conflict on zero)
        var referenceScore = CalculateReferenceScore(inputs);
        signalScores["referenceOverlap"] = referenceScore;

        // 8. Freshness
        var freshnessScore = CalculateFreshnessScore(inputs);
        signalScores["freshness"] = freshnessScore;

        // Calculate base confidence from weighted signals
        var baseConfidence = Clamp01(
            (Weights.AliasConnectivity * aliasConnectivity) +
            (Weights.AliasAuthority * aliasAuthority) +
            (Weights.PackageCoverage * packageCoverage) +
            (Weights.VersionCompatibility * versionScore) +
            (Weights.CpeMatch * cpeScore) +
            (Weights.PatchLineage * patchScore) +
            (Weights.ReferenceOverlap * referenceScore) +
            (Weights.Freshness * freshnessScore));

        // Add additional conflicts before penalty calculation
        if (additionalConflicts is { Count: > 0 })
        {
            conflicts.AddRange(additionalConflicts);
        }

        // Apply typed conflict penalties
        var totalPenalty = CalculateTypedPenalty(conflicts);
        var finalConfidence = Clamp01(baseConfidence - totalPenalty);

        // Ensure minimum confidence when conflicts exist but evidence is present
        if (finalConfidence < 0.1 && baseConfidence > 0)
        {
            finalConfidence = 0.1;
        }

        return new CorrelationResult(
            finalConfidence,
            DeduplicateAndSort(conflicts, inputs),
            signalScores.ToImmutableDictionary());
    }

    #region Alias Connectivity (Graph-based)

    /// <summary>
    /// Calculates alias connectivity using bipartite graph analysis.
    /// Returns LCC (largest connected component) ratio instead of intersection.
    /// </summary>
    private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateAliasConnectivity(
        IReadOnlyCollection<InputV2> inputs)
    {
        var conflicts = new List<AdvisoryLinksetConflict>();

        if (inputs.Count == 1)
        {
            return (inputs.First().Aliases.Count > 0 ? 1d : 0d, conflicts);
        }

        // Build bipartite graph: observation nodes + alias nodes
        var observationToAliases = inputs
            .ToDictionary(
                i => i.ObservationId,
                i => i.Aliases.Select(a => a.ToUpperInvariant()).ToHashSet(StringComparer.Ordinal));

        // Build adjacency for union-find
        var allAliases = observationToAliases.Values.SelectMany(a => a).ToHashSet(StringComparer.Ordinal);

        if (allAliases.Count == 0)
        {
            return (0d, conflicts);
        }

        // Find connected components using alias-based bridging
        var observationIds = inputs.Select(i => i.ObservationId).ToList();
        var parent = observationIds.ToDictionary(id => id, id => id);

        string Find(string x)
        {
            if (parent[x] != x)
                parent[x] = Find(parent[x]);
            return parent[x];
        }

        void Union(string x, string y)
        {
            var px = Find(x);
            var py = Find(y);
            if (px != py)
                parent[px] = py;
        }

        // Connect observations that share any alias
        foreach (var alias in allAliases)
        {
            var observationsWithAlias = observationIds
                .Where(id => observationToAliases[id].Contains(alias))
                .ToList();

            for (int i = 1; i < observationsWithAlias.Count; i++)
            {
                Union(observationsWithAlias[0], observationsWithAlias[i]);
            }
        }

        // Calculate LCC ratio
        var componentSizes = observationIds
            .GroupBy(Find)
            .Select(g => g.Count())
            .ToList();

        var largestComponent = componentSizes.Max();
        var lccRatio = (double)largestComponent / observationIds.Count;

        // Check for distinct CVEs (true identity conflict)
        var cveAliases = allAliases
            .Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
            .ToHashSet(StringComparer.OrdinalIgnoreCase);

        if (cveAliases.Count > 1)
        {
            // Multiple distinct CVEs in cluster = hard conflict
            var values = inputs
                .Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases.Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase)))}")
                .Where(v => !v.EndsWith(":<none>"))
                .OrderBy(v => v, StringComparer.Ordinal)
                .ToArray();

            if (values.Length > 1)
            {
                conflicts.Add(new AdvisoryLinksetConflict(
                    "aliases",
                    "distinct-cves",
                    values)
                {
                    Severity = ConflictSeverity.Hard
                });
            }
        }
        else if (lccRatio < 1.0 && allAliases.Count > 0)
        {
            // Disconnected observations but no CVE conflict = soft inconsistency
            var disconnectedObs = observationIds
                .Where(id => Find(id) != Find(observationIds[0]))
                .Select(id => inputs.First(i => i.ObservationId == id))
                .Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases)}")
                .OrderBy(v => v, StringComparer.Ordinal)
                .ToArray();

            if (disconnectedObs.Length > 0)
            {
                conflicts.Add(new AdvisoryLinksetConflict(
                    "aliases",
                    "alias-inconsistency",
                    disconnectedObs)
                {
                    Severity = ConflictSeverity.Soft
                });
            }
        }

        return (lccRatio, conflicts);
    }
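
    // Worked example (illustrative; mirrors the transitive-bridging unit test):
    // observations A{CVE-2025-1234}, B{CVE-2025-1234, GHSA-aaaa-bbbb-cccc} and
    // C{GHSA-aaaa-bbbb-cccc}. The CVE unions A with B, the GHSA id unions B with
    // C, so Find(A) == Find(C): the LCC spans all three observations and the
    // ratio is 3/3 = 1.0 even though A and C share no alias directly.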

    /// <summary>
    /// Calculates alias authority score based on scope hierarchy:
    /// CVE (global) > GHSA (ecosystem) > vendor advisories > distribution advisories.
    /// </summary>
    private static double CalculateAliasAuthority(IReadOnlyCollection<InputV2> inputs)
    {
        var allAliases = inputs.SelectMany(i => i.Aliases).ToHashSet(StringComparer.OrdinalIgnoreCase);

        if (allAliases.Count == 0)
            return 0d;

        // Score based on highest authority alias present
        var hasCve = allAliases.Any(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase));
        var hasGhsa = allAliases.Any(a => a.StartsWith("GHSA-", StringComparison.OrdinalIgnoreCase));
        var hasVendor = allAliases.Any(a =>
            a.StartsWith("RHSA-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("MSRC-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("CISCO-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("VMSA-", StringComparison.OrdinalIgnoreCase));
        var hasDistro = allAliases.Any(a =>
            a.StartsWith("DSA-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("USN-", StringComparison.OrdinalIgnoreCase) ||
            a.StartsWith("SUSE-", StringComparison.OrdinalIgnoreCase));

        if (hasCve) return 1.0;
        if (hasGhsa) return 0.8;
        if (hasVendor) return 0.6;
        if (hasDistro) return 0.4;

        return 0.2; // Unknown alias scheme
    }
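
    // Example (illustrative): a cluster carrying only { "USN-6001-1", "RHSA-2025:0001" }
    // has no CVE or GHSA alias, so the vendor tier wins and the score is 0.6;
    // adding a single CVE alias would lift it to 1.0.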

    #endregion

    #region Package Coverage (Pairwise + IDF)

    /// <summary>
    /// Calculates package coverage using pairwise overlap instead of intersection-across-all.
    /// A thin source with no packages does not collapse the score.
    /// </summary>
    private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculatePackageCoverage(
        IReadOnlyCollection<InputV2> inputs,
        Func<string, double>? idfProvider = null)
    {
        var conflicts = new List<AdvisoryLinksetConflict>();

        var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
        if (inputsWithPackages.Count == 0)
        {
            return (0d, conflicts);
        }

        if (inputsWithPackages.Count == 1)
        {
            return (inputsWithPackages[0].Purls.Count > 0 ? 1d : 0d, conflicts);
        }

        // Extract package keys (without version)
        var packageKeysPerInput = inputsWithPackages
            .Select(i => i.Purls
                .Select(ExtractPackageKey)
                .Where(k => !string.IsNullOrWhiteSpace(k))
                .ToHashSet(StringComparer.Ordinal))
            .ToList();

        // Calculate pairwise overlap with optional IDF weighting
        var totalWeight = 0d;
        var matchedWeight = 0d;
        var allPackages = packageKeysPerInput.SelectMany(p => p).ToHashSet(StringComparer.Ordinal);

        foreach (var pkg in allPackages)
        {
            var idfWeight = idfProvider?.Invoke(pkg) ?? 1.0;
            var inputsWithPkg = packageKeysPerInput.Count(set => set.Contains(pkg));

            totalWeight += idfWeight;
            if (inputsWithPkg > 1)
            {
                // Package appears in multiple sources = positive signal
                matchedWeight += idfWeight * ((double)inputsWithPkg / inputsWithPackages.Count);
            }
        }

        var score = totalWeight > 0 ? matchedWeight / totalWeight : 0d;

        // Check for exact PURL overlap (with version)
        var hasExactOverlap = HasExactPurlOverlap(inputsWithPackages);
        if (hasExactOverlap)
        {
            score = Math.Max(score, 0.8); // Boost for exact match
        }

        // Range divergence is reported as soft conflicts by version scoring;
        // it is not emitted here, to avoid double-counting.

        return (Clamp01(score), conflicts);
    }
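
    // Worked example (illustrative): three sources with packages where pkg X
    // (idf 2.0) appears in two of them and pkg Y (idf 0.5) in one:
    //   totalWeight   = 2.0 + 0.5 = 2.5
    //   matchedWeight = 2.0 * (2 / 3) ≈ 1.33   (Y is unmatched and adds nothing)
    //   score         ≈ 1.33 / 2.5 ≈ 0.53 before any exact-PURL boost.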

    #endregion

    #region Version Compatibility

    /// <summary>
    /// Classifies version relationships for shared packages.
    /// </summary>
    private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateVersionCompatibility(
        IReadOnlyCollection<InputV2> inputs)
    {
        var conflicts = new List<AdvisoryLinksetConflict>();

        var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
        if (inputsWithPackages.Count < 2)
        {
            return (0.5d, conflicts); // Neutral when no comparison possible
        }

        // Find shared package keys
        var packageKeysPerInput = inputsWithPackages
            .Select(i => i.Purls
                .Select(ExtractPackageKey)
                .Where(k => !string.IsNullOrWhiteSpace(k))
                .ToHashSet(StringComparer.Ordinal))
            .ToList();

        var sharedPackages = packageKeysPerInput
            .Skip(1)
            .Aggregate(
                new HashSet<string>(packageKeysPerInput[0], StringComparer.Ordinal),
                (acc, next) =>
                {
                    acc.IntersectWith(next);
                    return acc;
                });

        if (sharedPackages.Count == 0)
        {
            return (0.5d, conflicts); // Neutral when no shared packages
        }

        var totalScore = 0d;
        var packageCount = 0;

        foreach (var packageKey in sharedPackages)
        {
            var versionsPerSource = inputsWithPackages
                .Select(i => new
                {
                    i.Vendor,
                    Versions = i.Purls
                        .Where(p => ExtractPackageKey(p) == packageKey)
                        .Select(ExtractVersion)
                        .Where(v => !string.IsNullOrWhiteSpace(v))
                        .ToList()
                })
                .Where(x => x.Versions.Count > 0)
                .ToList();

            if (versionsPerSource.Count < 2)
                continue;

            packageCount++;

            // Classify relationship (simplified; full impl would use SemanticVersionRangeResolver)
            var relation = ClassifyVersionRelation(versionsPerSource.Select(v => v.Versions).ToList());

            switch (relation)
            {
                case VersionRelation.Equivalent:
                    totalScore += 1.0;
                    break;

                case VersionRelation.Overlapping:
                    totalScore += 0.6;
                    var overlapValues = versionsPerSource
                        .Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
                        .OrderBy(x => x, StringComparer.Ordinal)
                        .ToArray();
                    conflicts.Add(new AdvisoryLinksetConflict(
                        $"affected.versions[{packageKey}]",
                        "affected-range-divergence",
                        overlapValues)
                    {
                        Severity = ConflictSeverity.Soft
                    });
                    break;

                case VersionRelation.Disjoint:
                    totalScore += 0.0;
                    var disjointValues = versionsPerSource
                        .Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
                        .OrderBy(x => x, StringComparer.Ordinal)
                        .ToArray();
                    conflicts.Add(new AdvisoryLinksetConflict(
                        $"affected.versions[{packageKey}]",
                        "disjoint-version-ranges",
                        disjointValues)
                    {
                        Severity = ConflictSeverity.Hard
                    });
                    break;

                default:
                    totalScore += 0.5; // Unknown = neutral
                    break;
            }
        }

        var avgScore = packageCount > 0 ? totalScore / packageCount : 0.5;
        return (Clamp01(avgScore), conflicts);
    }

    private static VersionRelation ClassifyVersionRelation(List<List<string>> versionSets)
    {
        if (versionSets.Count < 2)
            return VersionRelation.Unknown;

        var first = versionSets[0].ToHashSet(StringComparer.OrdinalIgnoreCase);
        var allEquivalent = true;
        var anyOverlap = false;

        foreach (var other in versionSets.Skip(1))
        {
            var otherSet = other.ToHashSet(StringComparer.OrdinalIgnoreCase);

            if (!first.SetEquals(otherSet))
                allEquivalent = false;

            if (first.Overlaps(otherSet))
                anyOverlap = true;
        }

        if (allEquivalent)
            return VersionRelation.Equivalent;

        if (anyOverlap)
            return VersionRelation.Overlapping;

        return VersionRelation.Disjoint;
    }
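
    // Example (illustrative): {4.17.20, 4.17.21} vs {4.17.19, 4.17.20} share
    // 4.17.20 without being set-equal, so the relation is Overlapping (a soft
    // conflict upstream); {1.0.0} vs {9.0.0} share nothing and classify as
    // Disjoint (a hard conflict upstream).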

    #endregion

    #region Patch Lineage

    private static readonly System.Text.RegularExpressions.Regex CommitUrlPattern = new(
        @"(?:github\.com|gitlab\.com)/[^/]+/[^/]+(?:/-)?/commit/([0-9a-f]{7,40})",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Compiled);

    private static readonly System.Text.RegularExpressions.Regex FullShaPattern = new(
        @"\b([0-9a-f]{40})\b",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>
    /// Calculates patch lineage correlation.
    /// Exact commit SHA match is a very strong signal.
    /// </summary>
    private static double CalculatePatchLineageScore(IReadOnlyCollection<InputV2> inputs)
    {
        var inputsWithPatches = inputs
            .Where(i => i.PatchReferences?.Count > 0)
            .ToList();

        if (inputsWithPatches.Count < 2)
        {
            return 0d; // No patch data to compare
        }

        // Extract normalized patch references (commit SHAs from commit URLs or bare SHAs)
        var patchesPerInput = inputsWithPatches
            .Select(i => i.PatchReferences!
                .Select(NormalizePatchReference)
                .Where(p => p is not null)
                .Select(p => p!)
                .ToHashSet(StringComparer.OrdinalIgnoreCase))
            .ToList();

        // Find any pairwise overlap
        for (int i = 0; i < patchesPerInput.Count; i++)
        {
            for (int j = i + 1; j < patchesPerInput.Count; j++)
            {
                if (patchesPerInput[i].Overlaps(patchesPerInput[j]))
                {
                    // Exact patch match = very strong signal
                    return 1.0;
                }
            }
        }

        return 0d;
    }

    private static string? NormalizePatchReference(string reference)
    {
        if (string.IsNullOrWhiteSpace(reference))
            return null;

        // Extract commit SHA from GitHub/GitLab URLs
        var match = CommitUrlPattern.Match(reference);
        if (match.Success)
        {
            return match.Groups[1].Value.ToLowerInvariant();
        }

        // Fall back to a bare full-length SHA anywhere in the reference
        match = FullShaPattern.Match(reference);
        if (match.Success)
        {
            return match.Groups[1].Value.ToLowerInvariant();
        }

        return null;
    }
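
    // Examples (illustrative inputs):
    //   "https://github.com/org/repo/commit/ABC123DEF4567" -> "abc123def4567"
    //   "Fixed by 0123456789abcdef0123456789abcdef01234567" -> that SHA, lowercased
    //   "https://github.com/org/repo/pull/42"               -> null (nothing to extract)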

    #endregion

    #region Reference Score (Positive-Only)

    /// <summary>
    /// Calculates reference overlap as a positive-only signal.
    /// Zero overlap is neutral (0.5), not a conflict.
    /// </summary>
    private static double CalculateReferenceScore(IReadOnlyCollection<InputV2> inputs)
    {
        if (inputs.All(i => i.References.Count == 0))
        {
            return 0.5d; // Neutral when no references
        }

        var inputList = inputs.ToList();
        var maxOverlap = 0d;

        for (var i = 0; i < inputList.Count; i++)
        {
            for (var j = i + 1; j < inputList.Count; j++)
            {
                var first = inputList[i].References
                    .Select(NormalizeReferenceUrl)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);

                var second = inputList[j].References
                    .Select(NormalizeReferenceUrl)
                    .ToHashSet(StringComparer.OrdinalIgnoreCase);

                var intersection = first.Intersect(second, StringComparer.OrdinalIgnoreCase).Count();
                var denom = Math.Max(first.Count, second.Count);
                var overlap = denom == 0 ? 0d : (double)intersection / denom;

                if (overlap > maxOverlap)
                {
                    maxOverlap = overlap;
                }
            }
        }

        // Map overlap to score: 0 overlap = 0.5 (neutral), 1.0 overlap = 1.0
        return 0.5 + (maxOverlap * 0.5);
    }

    private static string NormalizeReferenceUrl(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
            return string.Empty;

        // Lowercase, strip the query string, normalize protocol
        var normalized = url.ToLowerInvariant().Trim();

        // Strip the query string (drops tracking parameters along with it)
        var queryIndex = normalized.IndexOf('?');
        if (queryIndex > 0)
        {
            normalized = normalized[..queryIndex];
        }

        // Normalize protocol
        if (normalized.StartsWith("http://", StringComparison.Ordinal))
        {
            normalized = "https://" + normalized[7..];
        }

        // Remove trailing slash
        return normalized.TrimEnd('/');
    }
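
    // Example (illustrative, matching the URL-normalization unit test):
    //   "http://Example.COM/advisory?utm_source=test" -> "https://example.com/advisory"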

    #endregion

    #region CPE and Freshness (Minor Updates)

    private static double CalculateCpeScore(IReadOnlyCollection<InputV2> inputs)
    {
        if (inputs.All(i => i.Cpes.Count == 0))
        {
            return 0d;
        }

        var cpeSets = inputs.Select(i => i.Cpes.ToHashSet(StringComparer.OrdinalIgnoreCase)).ToList();
        var exactOverlap = cpeSets.Skip(1).Any(set => set.Overlaps(cpeSets.First()));
        if (exactOverlap)
        {
            return 1d;
        }

        var vendorProductSets = inputs
            .Select(i => i.Cpes.Select(ParseVendorProduct).Where(vp => vp.vendor is not null).ToHashSet())
            .ToList();

        var sharedVendorProduct = vendorProductSets.Skip(1).Any(set => set.Overlaps(vendorProductSets.First()));
        return sharedVendorProduct ? 0.5d : 0d;
    }

    private static (string? vendor, string? product) ParseVendorProduct(string cpe)
    {
        if (string.IsNullOrWhiteSpace(cpe))
        {
            return (null, null);
        }

        var parts = cpe.Split(':');

        // CPE 2.3 formatted string: cpe:2.3:part:vendor:product:version:...
        if (parts.Length >= 6 && parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) && parts[1] == "2.3")
        {
            return (parts[3], parts[4]);
        }

        // CPE 2.2 URI: cpe:/part:vendor:product[:version:...]
        if (parts.Length >= 4 && parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) && parts[1].StartsWith('/'))
        {
            return (parts[2], parts[3]);
        }

        return (null, null);
    }

    private static double CalculateFreshnessScore(IReadOnlyCollection<InputV2> inputs)
    {
        var fetched = inputs
            .Select(i => i.FetchedAt)
            .Where(d => d.HasValue)
            .Select(d => d!.Value)
            .ToList();

        if (fetched.Count <= 1)
        {
            return 0.5d;
        }

        var min = fetched.Min();
        var max = fetched.Max();
        var spread = max - min;

        if (spread <= TimeSpan.FromHours(48))
        {
            return 1d;
        }

        if (spread >= TimeSpan.FromDays(14))
        {
            return 0d;
        }

        var remaining = TimeSpan.FromDays(14) - spread;
        return Clamp01(remaining.TotalSeconds / TimeSpan.FromDays(14).TotalSeconds);
    }
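
    // Worked example (illustrative): a 7-day spread falls between the 48-hour
    // and 14-day cutoffs, scoring (14d - 7d) / 14d = 0.5; a 13-day spread
    // scores (14d - 13d) / 14d ≈ 0.07.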

    #endregion

    #region Conflict Penalties

    /// <summary>
    /// Calculates typed penalty based on conflict severities.
    /// </summary>
    private static double CalculateTypedPenalty(IReadOnlyList<AdvisoryLinksetConflict> conflicts)
    {
        if (conflicts.Count == 0)
            return 0d;

        var totalPenalty = 0d;

        foreach (var conflict in conflicts)
        {
            var penalty = conflict.Reason switch
            {
                "distinct-cves" => ConflictPenalties.DistinctCves,
                "disjoint-version-ranges" => ConflictPenalties.DisjointVersionRanges,
                "affected-range-divergence" => ConflictPenalties.OverlappingRanges,
                "severity-mismatch" => ConflictPenalties.SeverityMismatch,
                "alias-inconsistency" => ConflictPenalties.AliasInconsistency,
                "reference-clash" => 0d, // No penalty for reference differences
                _ => 0.05 // Default small penalty for unknown conflicts
            };

            totalPenalty += penalty;
        }

        // Saturate at 0.6 to prevent total collapse
        return Math.Min(totalPenalty, 0.6);
    }
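
    // Worked example (illustrative): one hard "distinct-cves" conflict plus one
    // soft "affected-range-divergence" conflict add their configured penalties;
    // if many conflicts push the sum past 0.6, the cap keeps strong positive
    // evidence from being wiped out entirely.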

    #endregion

    #region Helpers

    private static bool HasExactPurlOverlap(IReadOnlyCollection<InputV2> inputs)
    {
        var first = inputs.First().Purls.ToHashSet(StringComparer.Ordinal);
        return inputs.Skip(1).Any(input => input.Purls.Any(first.Contains));
    }

    private static string ExtractPackageKey(string purl)
    {
        if (string.IsNullOrWhiteSpace(purl))
        {
            return string.Empty;
        }

        var atIndex = purl.LastIndexOf('@');
        return atIndex > 0 ? purl[..atIndex] : purl;
    }

    private static string ExtractVersion(string purl)
    {
        if (string.IsNullOrWhiteSpace(purl))
        {
            return string.Empty;
        }

        var atIndex = purl.LastIndexOf('@');
        if (atIndex < 0 || atIndex >= purl.Length - 1)
        {
            return string.Empty;
        }

        var version = purl[(atIndex + 1)..];

        // Remove qualifiers if present
        var qualifierIndex = version.IndexOf('?');
        if (qualifierIndex > 0)
        {
            version = version[..qualifierIndex];
        }

        return version;
    }

    private static IReadOnlyList<AdvisoryLinksetConflict> DeduplicateAndSort(
        IEnumerable<AdvisoryLinksetConflict> conflicts,
        IReadOnlyCollection<InputV2> inputs)
    {
        var set = new HashSet<string>(StringComparer.Ordinal);
        var list = new List<AdvisoryLinksetConflict>();

        foreach (var conflict in conflicts)
        {
            var normalizedValues = NormalizeValues(conflict.Values);
            var normalizedSources = NormalizeValues(conflict.SourceIds);
            var key = $"{conflict.Field}|{conflict.Reason}|{string.Join('|', normalizedValues)}";

            if (set.Add(key))
            {
                if (normalizedSources.Count == 0)
                {
                    normalizedSources = inputs
                        .Select(i => i.Vendor ?? "source")
                        .Distinct(StringComparer.OrdinalIgnoreCase)
                        .OrderBy(v => v, StringComparer.Ordinal)
                        .ToArray();
                }

                list.Add(conflict with
                {
                    Values = normalizedValues,
                    SourceIds = normalizedSources
                });
            }
        }

        return list
            .OrderBy(c => c.Field, StringComparer.Ordinal)
            .ThenBy(c => c.Reason, StringComparer.Ordinal)
            .ThenBy(c => string.Join('|', c.Values ?? Array.Empty<string>()), StringComparer.Ordinal)
            .ToList();
    }

    private static double Clamp01(double value) => Math.Clamp(value, 0d, 1d);

    private static string FirstSortedOrDefault(IEnumerable<string> values)
    {
        var first = values
            .Where(v => !string.IsNullOrWhiteSpace(v))
            .Select(v => v.Trim())
            .OrderBy(v => v, StringComparer.Ordinal)
            .FirstOrDefault();
        return string.IsNullOrEmpty(first) ? "<none>" : first;
    }

    private static IReadOnlyList<string> NormalizeValues(IReadOnlyList<string>? values)
    {
        if (values is null || values.Count == 0)
        {
            return Array.Empty<string>();
        }

        return values
            .Where(v => !string.IsNullOrWhiteSpace(v))
            .Select(v => v.Trim())
            .OrderBy(v => v, StringComparer.Ordinal)
            .ToArray();
    }

    #endregion
}

@@ -0,0 +1,331 @@
// -----------------------------------------------------------------------------
// TextSimilarityScorer.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-010
// Description: Deterministic TF-IDF text similarity for linkset correlation
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace StellaOps.Concelier.Core.Linksets;

/// <summary>
/// Computes TF-IDF-based text similarity between advisory descriptions.
/// Used as an optional correlation signal in V2 linkset correlation.
/// </summary>
/// <remarks>
/// <para>
/// This scorer is designed for deterministic, offline operation:
/// - No external NLP dependencies (pure C# implementation)
/// - Configurable stop words and tokenization
/// - Stable output across runs (no randomness)
/// </para>
/// <para>
/// Default weight: 0.05 (low weight, supplementary signal).
/// Feature flag: <c>concelier:correlation:textSimilarity:enabled</c> (default: false).
/// </para>
/// </remarks>
public sealed class TextSimilarityScorer
{
    private static readonly Regex TokenRegex = new(
        @"[a-zA-Z][a-zA-Z0-9_-]{2,}",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    private static readonly HashSet<string> DefaultStopWords = new(StringComparer.OrdinalIgnoreCase)
    {
        // Common English stop words
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
        "be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
        "used", "this", "that", "these", "those", "which", "who", "whom", "whose",
        "what", "where", "when", "why", "how", "all", "each", "every", "both",
        "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
        "own", "same", "so", "than", "too", "very", "just", "also", "now", "here",
        "there", "then", "once", "if", "into", "over", "after", "before", "about",
        // Common vulnerability description words (low discriminative value)
        "vulnerability", "issue", "allows", "attacker", "attack", "remote", "local",
        "user", "code", "execution", "denial", "service", "buffer", "overflow",
        "may", "could", "via", "using", "through", "affected", "version", "versions",
        "product", "software", "application", "component", "module", "function"
    };

    private readonly TextSimilarityOptions _options;
    private readonly HashSet<string> _stopWords;

    /// <summary>
    /// Initializes a new instance of <see cref="TextSimilarityScorer"/>.
    /// </summary>
    /// <param name="options">Configuration options. Null uses defaults.</param>
    public TextSimilarityScorer(TextSimilarityOptions? options = null)
    {
        _options = options ?? new TextSimilarityOptions();
        _stopWords = _options.CustomStopWords is not null
            ? new HashSet<string>(_options.CustomStopWords, StringComparer.OrdinalIgnoreCase)
            : DefaultStopWords;
    }

    /// <summary>
    /// Computes average pairwise TF-IDF cosine similarity across all description pairs.
    /// </summary>
    /// <param name="descriptions">Collection of normalized description texts.</param>
    /// <returns>Average similarity score (0.0-1.0). Returns 0 if fewer than 2 descriptions.</returns>
    public double ComputeAverageSimilarity(IReadOnlyCollection<string> descriptions)
    {
        if (descriptions.Count < 2)
        {
            return 0.0;
        }

        // Filter out empty/null descriptions
        var validDescriptions = descriptions
            .Where(d => !string.IsNullOrWhiteSpace(d))
            .ToArray();

        if (validDescriptions.Length < 2)
        {
            return 0.0;
        }

        // Tokenize all descriptions
        var tokenizedDocs = validDescriptions
            .Select(d => Tokenize(d))
            .ToArray();

        // Build document frequency map
        var documentFrequency = BuildDocumentFrequency(tokenizedDocs);

        // Compute TF-IDF vectors
        var tfidfVectors = tokenizedDocs
            .Select(tokens => ComputeTfIdf(tokens, documentFrequency, tokenizedDocs.Length))
            .ToArray();

        // Compute average pairwise cosine similarity
        var totalSimilarity = 0.0;
        var pairCount = 0;

        for (var i = 0; i < tfidfVectors.Length; i++)
        {
            for (var j = i + 1; j < tfidfVectors.Length; j++)
            {
                totalSimilarity += CosineSimilarity(tfidfVectors[i], tfidfVectors[j]);
                pairCount++;
            }
        }

        return pairCount > 0 ? totalSimilarity / pairCount : 0.0;
    }

    /// <summary>
    /// Computes TF-IDF cosine similarity between two descriptions.
    /// </summary>
    /// <param name="description1">First description text.</param>
    /// <param name="description2">Second description text.</param>
    /// <returns>Similarity score (0.0-1.0).</returns>
    public double ComputePairwiseSimilarity(string description1, string description2)
    {
        if (string.IsNullOrWhiteSpace(description1) || string.IsNullOrWhiteSpace(description2))
        {
            return 0.0;
        }

        var tokens1 = Tokenize(description1);
        var tokens2 = Tokenize(description2);

        if (tokens1.Count == 0 || tokens2.Count == 0)
        {
            return 0.0;
        }

        // For pairwise, use simple term frequency with IDF approximation
        var allTerms = new HashSet<string>(tokens1, StringComparer.OrdinalIgnoreCase);
        allTerms.UnionWith(tokens2);

        // Document frequency (appears in 1 or 2 docs)
        var df = allTerms.ToDictionary(
            t => t,
            t => (tokens1.Contains(t) ? 1 : 0) + (tokens2.Contains(t) ? 1 : 0),
            StringComparer.OrdinalIgnoreCase);

        var vec1 = ComputeTfIdf(tokens1, df, 2);
        var vec2 = ComputeTfIdf(tokens2, df, 2);

        return CosineSimilarity(vec1, vec2);
    }
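
    // Usage sketch (illustrative; the description variables are hypothetical):
    //   var scorer = new TextSimilarityScorer();
    //   var pair   = scorer.ComputePairwiseSimilarity(descriptionA, descriptionB);
    //   var avg    = scorer.ComputeAverageSimilarity(new[] { descA, descB, descC });
    // Both methods return values in [0.0, 1.0]; near-identical token sets approach 1.0.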

    /// <summary>
    /// Tokenizes text into lowercase terms, removing stop words and short tokens.
    /// </summary>
    internal IReadOnlyList<string> Tokenize(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return Array.Empty<string>();
        }

        var matches = TokenRegex.Matches(text);
        var tokens = new List<string>(matches.Count);

        foreach (Match match in matches)
        {
            var token = match.Value.ToLowerInvariant();

            // Skip stop words
            if (_stopWords.Contains(token))
            {
                continue;
            }

            // Skip tokens that are too short
            if (token.Length < _options.MinTokenLength)
            {
                continue;
            }

            // Skip tokens that are all digits (version numbers, etc.)
            if (token.All(char.IsDigit))
            {
                continue;
            }

            tokens.Add(token);
        }

        // Sort for determinism
        tokens.Sort(StringComparer.Ordinal);

        return tokens;
    }
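
    // Example (illustrative): with default options, "The buffer overflow in
    // libfoo 2.1 allows remote attackers" keeps only { "attackers", "libfoo" }:
    // "the", "buffer", "overflow", "allows" and "remote" are stop words, while
    // "in" and "2.1" never match TokenRegex (tokens start with a letter and run
    // at least 3 characters), and the survivors are sorted ordinally.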

    private static Dictionary<string, int> BuildDocumentFrequency(IReadOnlyList<IReadOnlyList<string>> documents)
    {
        var df = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);

        foreach (var doc in documents)
        {
            var uniqueTerms = new HashSet<string>(doc, StringComparer.OrdinalIgnoreCase);
            foreach (var term in uniqueTerms)
            {
                df.TryGetValue(term, out var count);
                df[term] = count + 1;
            }
        }

        return df;
    }

    private Dictionary<string, double> ComputeTfIdf(
        IReadOnlyList<string> tokens,
        Dictionary<string, int> documentFrequency,
        int totalDocuments)
    {
        // Compute term frequency
        var termFrequency = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        foreach (var token in tokens)
        {
            termFrequency.TryGetValue(token, out var count);
            termFrequency[token] = count + 1;
        }

        if (termFrequency.Count == 0)
        {
            return new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
        }

        // Compute TF-IDF
        var tfidf = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
        var maxTf = termFrequency.Values.Max();

        foreach (var (term, tf) in termFrequency)
        {
            // Augmented (normalized) TF: 0.5 + 0.5 * (tf / max_tf)
            var normalizedTf = 0.5 + 0.5 * ((double)tf / maxTf);

            // IDF: log((N + 1) / (df + 1)) + 1 (smoothed IDF to avoid zero)
            // This ensures terms that appear in all documents still have some weight
            documentFrequency.TryGetValue(term, out var df);
            var idf = Math.Log((double)(totalDocuments + 1) / (df + 1)) + 1.0;

            tfidf[term] = normalizedTf * idf;
        }

        return tfidf;
    }
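
    // Worked example (illustrative): with 2 documents, a term at the document's
    // max frequency that appears in both (df = 2) gets
    //   normalizedTf = 0.5 + 0.5 * 1 = 1.0 and idf = ln(3 / 3) + 1 = 1.0,
    // while a term unique to one document (df = 1) gets idf = ln(3 / 2) + 1 ≈ 1.41,
    // so shared terms keep weight instead of vanishing.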

    private static double CosineSimilarity(
        Dictionary<string, double> vec1,
        Dictionary<string, double> vec2)
    {
        // Get all terms
        var allTerms = new HashSet<string>(vec1.Keys, StringComparer.OrdinalIgnoreCase);
        allTerms.UnionWith(vec2.Keys);

        // Compute dot product and magnitudes
        var dotProduct = 0.0;
        var mag1 = 0.0;
        var mag2 = 0.0;

        foreach (var term in allTerms)
        {
            vec1.TryGetValue(term, out var v1);
            vec2.TryGetValue(term, out var v2);

            dotProduct += v1 * v2;
            mag1 += v1 * v1;
            mag2 += v2 * v2;
        }

        mag1 = Math.Sqrt(mag1);
        mag2 = Math.Sqrt(mag2);

        if (mag1 < double.Epsilon || mag2 < double.Epsilon)
        {
            return 0.0;
        }

        return dotProduct / (mag1 * mag2);
    }
}

/// <summary>
/// Configuration options for the text similarity scorer.
/// </summary>
public sealed class TextSimilarityOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Concelier:Correlation:TextSimilarity";

    /// <summary>
    /// Whether text similarity scoring is enabled.
    /// Default: false (Phase 3 feature, not yet GA).
    /// </summary>
    public bool Enabled { get; set; } = false;

    /// <summary>
    /// Weight for text similarity in unified scoring.
    /// Default: 0.05.
    /// </summary>
    public double Weight { get; set; } = 0.05;

    /// <summary>
    /// Minimum token length after normalization.
    /// Default: 3.
    /// </summary>
    public int MinTokenLength { get; set; } = 3;

    /// <summary>
    /// Custom stop words list. If null, uses built-in defaults.
    /// </summary>
    public IReadOnlyList<string>? CustomStopWords { get; set; }

    /// <summary>
    /// Whether to apply Porter stemming to tokens.
    /// Default: false (adds complexity, minimal benefit for security text).
    /// </summary>
    public bool EnableStemming { get; set; } = false;
}

@@ -0,0 +1,379 @@
// -----------------------------------------------------------------------------
// PackageIdfServiceTests.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Unit tests for package IDF keys, options, and conceptual IDF computations
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Linq;
using FluentAssertions;
using StellaOps.TestKit;
using Xunit;

namespace StellaOps.Concelier.Cache.Valkey.Tests;

/// <summary>
/// Unit tests for package IDF caching key generation, options, and IDF formulas.
/// Note: Service-level tests requiring Valkey are in the Integration folder.
/// </summary>
public class PackageIdfKeyTests
{
    #region IDF Key Generation Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfPackage_GeneratesCorrectKey()
    {
        // Arrange
        var packageName = "pkg:npm/lodash@4.17.21";

        // Act
        var key = AdvisoryCacheKeys.IdfPackage(packageName);

        // Assert
        key.Should().Be("concelier:idf:pkg:pkg:npm/lodash@4.17.21");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfPackage_NormalizesToLowercase()
    {
        // Arrange
        var packageName = "pkg:NPM/Lodash@4.17.21";

        // Act
        var key = AdvisoryCacheKeys.IdfPackage(packageName);

        // Assert
        key.Should().Be("concelier:idf:pkg:pkg:npm/lodash@4.17.21");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfPackage_WithCustomPrefix_GeneratesCorrectKey()
    {
        // Arrange
        var packageName = "pkg:npm/express@4.18.2";
        var prefix = "prod:";

        // Act
        var key = AdvisoryCacheKeys.IdfPackage(packageName, prefix);

        // Assert
        key.Should().Be("prod:idf:pkg:pkg:npm/express@4.18.2");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfCorpusSize_GeneratesCorrectKey()
    {
        // Act
        var key = AdvisoryCacheKeys.IdfCorpusSize();

        // Assert
        key.Should().Be("concelier:idf:stats:corpus_size");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfLastRefresh_GeneratesCorrectKey()
    {
        // Act
        var key = AdvisoryCacheKeys.IdfLastRefresh();

        // Assert
        key.Should().Be("concelier:idf:stats:last_refresh");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfRefreshLock_GeneratesCorrectKey()
    {
        // Act
        var key = AdvisoryCacheKeys.IdfRefreshLock();

        // Assert
        key.Should().Be("concelier:idf:lock:refresh");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfDocumentFrequency_GeneratesCorrectKey()
    {
        // Arrange
        var packageName = "pkg:cargo/serde@1.0.0";

        // Act
        var key = AdvisoryCacheKeys.IdfDocumentFrequency(packageName);

        // Assert
        key.Should().Be("concelier:idf:df:pkg:cargo/serde@1.0.0");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfPackagePattern_GeneratesCorrectPattern()
    {
        // Act
        var pattern = AdvisoryCacheKeys.IdfPackagePattern();

        // Assert
        pattern.Should().Be("concelier:idf:pkg:*");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfStatsHits_GeneratesCorrectKey()
    {
        // Act
        var key = AdvisoryCacheKeys.IdfStatsHits();

        // Assert
        key.Should().Be("concelier:idf:stats:hits");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfStatsMisses_GeneratesCorrectKey()
    {
        // Act
        var key = AdvisoryCacheKeys.IdfStatsMisses();

        // Assert
        key.Should().Be("concelier:idf:stats:misses");
    }

    #endregion
}

/// <summary>
/// Tests for PackageIdfOptions defaults and configuration.
/// </summary>
public class PackageIdfOptionsTests
{
    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfOptions_DefaultValues_AreCorrect()
    {
        // Arrange & Act
        var options = new PackageIdfOptions();

        // Assert
        options.Enabled.Should().BeTrue();
        options.IdfTtl.Should().Be(TimeSpan.FromHours(1));
        options.CorpusStatsTtl.Should().Be(TimeSpan.FromHours(4));
        options.MinIdfThreshold.Should().Be(0.01);
        options.DefaultIdfWeight.Should().Be(1.0);
        options.MaxCacheEntries.Should().Be(100_000);
        options.NormalizeScores.Should().BeTrue();
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfOptions_SectionName_IsCorrect()
    {
        // Assert
        PackageIdfOptions.SectionName.Should().Be("Concelier:PackageIdf");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfOptions_CanBeCustomized()
    {
        // Arrange & Act
        var options = new PackageIdfOptions
        {
            Enabled = false,
            IdfTtl = TimeSpan.FromMinutes(30),
            CorpusStatsTtl = TimeSpan.FromHours(2),
            MinIdfThreshold = 0.05,
            DefaultIdfWeight = 0.5,
            MaxCacheEntries = 50_000,
            NormalizeScores = false
        };

        // Assert
        options.Enabled.Should().BeFalse();
        options.IdfTtl.Should().Be(TimeSpan.FromMinutes(30));
        options.CorpusStatsTtl.Should().Be(TimeSpan.FromHours(2));
        options.MinIdfThreshold.Should().Be(0.05);
        options.DefaultIdfWeight.Should().Be(0.5);
        options.MaxCacheEntries.Should().Be(50_000);
        options.NormalizeScores.Should().BeFalse();
    }
}

/// <summary>
/// Tests for IDF formula computation (conceptual validation).
/// </summary>
public class IdfFormulaTests
{
    [Trait("Category", TestCategories.Unit)]
    [Theory]
    [InlineData(10000, 1, 8.52)]     // Rare package: log(10000/2) ≈ 8.52
    [InlineData(10000, 5000, 0.69)]  // Common package: log(10000/5001) ≈ 0.69
    [InlineData(10000, 10000, 0.0)]  // Ubiquitous: log(10000/10001) ≈ 0
    public void IdfFormula_ComputesCorrectly(long corpusSize, long docFrequency, double expectedRawIdf)
    {
        // This test validates the IDF formula used in UpdateCorpusStatsAsync
        // IDF = log(N / (1 + df))

        // Act
        var rawIdf = Math.Log((double)corpusSize / (1 + docFrequency));

        // Assert
        rawIdf.Should().BeApproximately(expectedRawIdf, 0.1);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfFormula_RarePackageHasHighWeight()
    {
        // Arrange
        const long corpusSize = 100_000;
        const long rareDocFrequency = 5;
        const long commonDocFrequency = 50_000;

        // Act
        var rareIdf = Math.Log((double)corpusSize / (1 + rareDocFrequency));
        var commonIdf = Math.Log((double)corpusSize / (1 + commonDocFrequency));

        // Assert - rare package should have much higher IDF
        rareIdf.Should().BeGreaterThan(commonIdf * 5);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfNormalization_ScalesToUnitInterval()
    {
        // Arrange - simulate corpus with various document frequencies
        var corpusSize = 100_000L;
        var documentFrequencies = new Dictionary<string, long>
        {
            ["pkg:npm/lodash"] = 80_000,  // Very common
            ["pkg:npm/express"] = 40_000, // Common
            ["pkg:cargo/serde"] = 10_000, // Moderate
            ["pkg:npm/obscure"] = 100,    // Rare
            ["pkg:cargo/unique"] = 1      // Very rare
        };

        // Act - compute raw IDFs
        var rawIdfs = documentFrequencies.ToDictionary(
            kv => kv.Key,
            kv => Math.Log((double)corpusSize / (1 + kv.Value)));

        var maxIdf = rawIdfs.Values.Max();

        // Normalize to 0-1
        var normalizedIdfs = rawIdfs.ToDictionary(
            kv => kv.Key,
            kv => kv.Value / maxIdf);

        // Assert - all values should be in [0, 1]
        foreach (var (pkg, idf) in normalizedIdfs)
        {
            idf.Should().BeGreaterThanOrEqualTo(0.0, because: $"{pkg} should have non-negative IDF");
            idf.Should().BeLessThanOrEqualTo(1.0, because: $"{pkg} should have IDF ≤ 1.0");
        }

        // The rarest package should have IDF close to 1.0
        normalizedIdfs["pkg:cargo/unique"].Should().BeApproximately(1.0, 0.01);

        // The most common package should have low IDF
        normalizedIdfs["pkg:npm/lodash"].Should().BeLessThan(0.3);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void IdfWeight_DiscriminatesBetweenPackages()
    {
        // This test validates that IDF provides meaningful discrimination
        // for linkset correlation

        // Arrange
        var corpusSize = 50_000L;

        // Package that appears in many advisories (low discrimination)
        var commonPkgDf = 25_000L;
        // Package that appears in few advisories (high discrimination)
        var rarePkgDf = 50L;

        // Act
        var commonIdf = Math.Log((double)corpusSize / (1 + commonPkgDf));
        var rareIdf = Math.Log((double)corpusSize / (1 + rarePkgDf));

        // Normalize
        var maxIdf = Math.Max(commonIdf, rareIdf);
        var commonNorm = commonIdf / maxIdf;
        var rareNorm = rareIdf / maxIdf;

        // Assert
        // When two advisories share a rare package, it should be a stronger
        // correlation signal than when they share a common package
        rareNorm.Should().BeGreaterThan(commonNorm * 3,
            because: "sharing a rare package should be 3x more discriminative than sharing a common package");
    }
}

/// <summary>
/// Tests for PackageIdfMetrics instrumentation.
/// </summary>
public class PackageIdfMetricsTests
{
    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfMetrics_ActivitySourceName_IsCorrect()
    {
        // Assert
        PackageIdfMetrics.ActivitySourceName.Should().Be("StellaOps.Concelier.PackageIdf");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfMetrics_MeterName_IsCorrect()
    {
        // Assert
        PackageIdfMetrics.MeterName.Should().Be("StellaOps.Concelier.PackageIdf");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfMetrics_CanBeCreatedAndDisposed()
    {
        // Arrange & Act
        using var metrics = new PackageIdfMetrics();

        // Assert - no exception thrown
        metrics.Should().NotBeNull();
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfMetrics_RecordsOperations_WithoutException()
    {
        // Arrange
        using var metrics = new PackageIdfMetrics();

        // Act & Assert - none of these should throw
        metrics.RecordHit();
        metrics.RecordHits(5);
        metrics.RecordMiss();
        metrics.RecordMisses(3);
        metrics.RecordRefresh(100);
        metrics.RecordLatency(15.5, "get");
        metrics.RecordIdfWeight(0.75);
        metrics.UpdateCorpusSize(50_000);
        metrics.UpdateCachedEntries(10_000);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void PackageIdfMetrics_StartActivity_ReturnsNullWhenNoListeners()
    {
        // Act
        using var activity = PackageIdfMetrics.StartActivity("test-operation");

        // Assert - no listeners registered, so activity should be null
        // (This is expected behavior for OpenTelemetry when no exporters are configured)
        // Just verify it doesn't throw
    }
}

@@ -0,0 +1,636 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationV2Tests.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-001 through CORR-V2-008
// Description: Comprehensive tests for V2 correlation algorithm
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using FluentAssertions;
using StellaOps.Concelier.Core.Linksets;
using Xunit;

namespace StellaOps.Concelier.Core.Tests.Linksets;

/// <summary>
/// Tests for the V2 linkset correlation algorithm.
/// Validates graph-based alias connectivity, pairwise package coverage,
/// version compatibility, patch lineage, and typed conflict severities.
/// </summary>
public sealed class LinksetCorrelationV2Tests
{
    #region CORR-V2-001: Alias Connectivity (Graph-based)

    [Fact]
    public void AliasConnectivity_TransitiveBridging_CorrectlyLinksThreeSources()
    {
        // Arrange: A has CVE-X, B has CVE-X + GHSA-Y, C has GHSA-Y
        // V1 would produce score=0 (empty intersection)
        // V2 should produce high score via transitive bridging
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234", "GHSA-aaaa-bbbb-cccc" }),
            CreateInput("obs-c", "osv", aliases: new[] { "GHSA-aaaa-bbbb-cccc" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        // With only alias signals: 0.30*1.0 + 0.10*1.0 + neutrals = 0.50
        result.Confidence.Should().BeGreaterThanOrEqualTo(0.5, "transitive bridging should yield positive confidence");
        result.SignalScores["aliasConnectivity"].Should().Be(1.0, "all observations connected via alias graph");
        result.Conflicts.Should().NotContain(c => c.Reason == "alias-inconsistency",
            "no inconsistency when transitively connected");
    }

    [Fact]
    public void AliasConnectivity_DisjointAliases_ProducesLowScoreAndConflict()
    {
        // Arrange: Two sources with completely disjoint aliases (no bridging)
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1111" }),
            CreateInput("obs-b", "vendor", aliases: new[] { "VENDOR-ADV-999" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["aliasConnectivity"].Should().Be(0.5, "50% in LCC (each disconnected)");
        result.Conflicts.Should().Contain(c => c.Reason == "alias-inconsistency");
    }

    [Fact]
    public void AliasConnectivity_DistinctCVEs_ProducesHardConflict()
    {
        // Arrange: Two different CVE identifiers in the cluster = hard conflict
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1111" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-2222" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.Conflicts.Should().Contain(c =>
            c.Reason == "distinct-cves" && c.Severity == ConflictSeverity.Hard);
        result.Confidence.Should().BeLessThan(0.5, "hard conflict should significantly reduce confidence");
    }

    [Fact]
    public void AliasConnectivity_SingleObservation_ReturnsFullScoreWithAliases()
    {
        // Arrange
        var inputs = new[] { CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }) };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["aliasConnectivity"].Should().Be(1.0);
        result.Conflicts.Should().BeEmpty();
    }

    [Fact]
    public void AliasConnectivity_NoAliases_ReturnsZeroScore()
    {
        // Arrange
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: Array.Empty<string>()),
            CreateInput("obs-b", "vendor", aliases: Array.Empty<string>())
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["aliasConnectivity"].Should().Be(0.0);
    }

    #endregion

    #region CORR-V2-002: Package Coverage (Pairwise + IDF)

    [Fact]
    public void PackageCoverage_ThinSource_DoesNotCollapseScore()
    {
        // Arrange: Source A and B share package, Source C has no packages
        // V1 intersection-across-all would produce 0
        // V2 pairwise should still produce positive score
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.20" }),
            CreateInput("obs-c", "vendor", purls: Array.Empty<string>())
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["packageCoverage"].Should().BeGreaterThan(0,
            "thin source should not collapse pairwise coverage");
    }

    [Fact]
    public void PackageCoverage_ExactPurlMatch_BoostsScore()
    {
        // Arrange: Same exact PURL (with version)
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.21" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["packageCoverage"].Should().BeGreaterThanOrEqualTo(0.8,
            "exact PURL match should boost score");
    }

    [Fact]
    public void PackageCoverage_NoOverlap_ReturnsZero()
    {
        // Arrange: Completely different packages
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:pypi/requests@2.28.0" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["packageCoverage"].Should().Be(0);
    }

    [Fact]
    public void PackageCoverage_WithIdfProvider_WeightsRarePackagesHigher()
    {
        // Arrange: Custom IDF provider
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:cargo/obscure-lib@1.0.0" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:cargo/obscure-lib@1.0.0" })
        };

        // IDF provider: rare package gets high weight
        double IdfProvider(string pkg) => pkg.Contains("obscure") ? 5.0 : 1.0;

        // Act
        var result = LinksetCorrelationV2.Compute(inputs, packageIdfProvider: IdfProvider);

        // Assert
        result.SignalScores["packageCoverage"].Should().BeGreaterThan(0.5);
    }

    #endregion
|
||||
|
||||
#region CORR-V2-003: Reference Score (Positive-Only)
|
||||
|
||||
[Fact]
|
||||
public void ReferenceScore_ZeroOverlap_ReturnsNeutral_NoConflict()
|
||||
{
|
||||
// Arrange: Different references from different sources
|
||||
// V1 would emit reference-clash
|
||||
// V2 should return neutral (0.5) with no conflict
|
||||
var inputs = new[]
|
||||
{
|
||||
CreateInput("obs-a", "nvd", references: new[] { "https://nvd.nist.gov/vuln/detail/CVE-2025-1234" }),
|
||||
CreateInput("obs-b", "ghsa", references: new[] { "https://github.com/advisories/GHSA-xxxx" })
|
||||
};
|
||||
|
||||
// Act
|
||||
var result = LinksetCorrelationV2.Compute(inputs);
|
||||
|
||||
// Assert
|
||||
result.SignalScores["referenceOverlap"].Should().Be(0.5, "zero overlap = neutral, not negative");
|
||||
result.Conflicts.Should().NotContain(c => c.Reason == "reference-clash",
|
||||
"no conflict for simple disjoint references");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReferenceScore_PartialOverlap_ProducesPositiveScore()
|
||||
{
|
||||
// Arrange: Some shared references
|
||||
var inputs = new[]
|
||||
{
|
||||
CreateInput("obs-a", "nvd", references: new[]
|
||||
{
|
||||
"https://example.com/advisory",
|
||||
"https://nvd.nist.gov/vuln/detail/CVE-2025-1234"
|
||||
}),
|
||||
CreateInput("obs-b", "ghsa", references: new[]
|
||||
{
|
||||
"https://example.com/advisory",
|
||||
"https://github.com/advisories/GHSA-xxxx"
|
||||
})
|
||||
};
|
||||
|
||||
// Act
|
||||
var result = LinksetCorrelationV2.Compute(inputs);
|
||||
|
||||
// Assert
|
||||
result.SignalScores["referenceOverlap"].Should().BeGreaterThan(0.5);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void ReferenceScore_NormalizesUrls()
|
||||
{
|
||||
// Arrange: Same URL with different casing/protocol
|
||||
var inputs = new[]
|
||||
{
|
||||
CreateInput("obs-a", "nvd", references: new[] { "http://Example.COM/advisory?utm_source=test" }),
|
||||
CreateInput("obs-b", "ghsa", references: new[] { "https://example.com/advisory" })
|
||||
};
|
||||
|
||||
// Act
|
||||
var result = LinksetCorrelationV2.Compute(inputs);
|
||||
|
||||
// Assert: Should match after normalization
|
||||
result.SignalScores["referenceOverlap"].Should().BeGreaterThan(0.5);
|
||||
}
|
||||
|
||||
#endregion

    #region CORR-V2-004: Typed Conflict Severities

    [Fact]
    public void ConflictPenalty_HardConflict_AppliesLargePenalty()
    {
        // Arrange: Distinct CVEs = hard conflict
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1111" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-2222" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        var hardConflict = result.Conflicts.FirstOrDefault(c => c.Severity == ConflictSeverity.Hard);
        hardConflict.Should().NotBeNull();
        result.Confidence.Should().BeLessThan(0.5);
    }

    [Fact]
    public void ConflictPenalty_SoftConflict_AppliesSmallPenalty()
    {
        // Arrange: Same CVE but overlapping version ranges (share at least one version)
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd",
                aliases: new[] { "CVE-2025-1234" },
                purls: new[] { "pkg:npm/lodash@4.17.20", "pkg:npm/lodash@4.17.21" }),
            CreateInput("obs-b", "ghsa",
                aliases: new[] { "CVE-2025-1234" },
                purls: new[] { "pkg:npm/lodash@4.17.20", "pkg:npm/lodash@4.17.19" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert: Should have a soft divergence conflict (overlapping but not equivalent)
        var softConflict = result.Conflicts.FirstOrDefault(c =>
            c.Severity == ConflictSeverity.Soft && c.Reason == "affected-range-divergence");
        softConflict.Should().NotBeNull("overlapping but non-equivalent ranges should produce a soft conflict");
        result.Confidence.Should().BeGreaterThan(0.5, "soft conflicts should not severely impact confidence");
    }

    [Fact]
    public void ConflictPenalty_Saturates_AtMaximum()
    {
        // Arrange: Multiple hard conflicts (disjoint CVEs and disjoint version ranges)
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd",
                aliases: new[] { "CVE-2025-1111" },
                purls: new[] { "pkg:npm/lodash@1.0.0" }),
            CreateInput("obs-b", "ghsa",
                aliases: new[] { "CVE-2025-2222" },
                purls: new[] { "pkg:npm/lodash@9.0.0" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert: Confidence should not go below the 0.1 minimum
        result.Confidence.Should().BeGreaterThanOrEqualTo(0.1);
    }

    #endregion
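
    // Illustrative sketch only (CORR-V2-004): the penalty weights live in the production
    // scorer, not in this diff. The tests above pin down the shape - hard conflicts cost
    // far more than soft ones, and confidence saturates at a 0.1 floor. Both penalty
    // constants below are assumptions chosen to satisfy those assertions.
    private static double ApplyConflictPenaltySketch(double baseConfidence, int hardCount, int softCount)
    {
        const double hardPenalty = 0.4; // assumed: one hard conflict pushes below 0.5
        const double softPenalty = 0.1; // assumed: one soft conflict stays above 0.5
        var penalized = baseConfidence - (hardCount * hardPenalty) - (softCount * softPenalty);
        return Math.Max(0.1, penalized); // saturation floor asserted by ConflictPenalty_Saturates_AtMaximum
    }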

    #region CORR-V2-005: Patch Lineage

    [Fact]
    public void PatchLineage_ExactCommitShaMatch_ProducesHighScore()
    {
        // Arrange: Same commit SHA in patch references
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd",
                aliases: new[] { "CVE-2025-1234" },
                patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" }),
            CreateInput("obs-b", "ghsa",
                aliases: new[] { "CVE-2025-1234" },
                patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["patchLineage"].Should().Be(1.0, "an exact commit SHA match is a very strong signal");
    }

    [Fact]
    public void PatchLineage_DifferentCommits_ProducesZeroScore()
    {
        // Arrange: Different commit SHAs
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd",
                patchReferences: new[] { "https://github.com/org/repo/commit/1111111111111111111111111111111111111111" }),
            CreateInput("obs-b", "ghsa",
                patchReferences: new[] { "https://github.com/org/repo/commit/2222222222222222222222222222222222222222" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["patchLineage"].Should().Be(0);
    }

    [Fact]
    public void PatchLineage_NoPatchData_ReturnsZero()
    {
        // Arrange: No patch references
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["patchLineage"].Should().Be(0);
    }

    #endregion
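
    // Illustrative sketch only (CORR-V2-005): one plausible lineage comparison consistent
    // with the tests above - extract a 40-hex commit SHA from each patch URL and score
    // 1.0 when the SHA sets intersect, 0.0 otherwise (including when no SHA is present).
    // The regex and helper name are hypothetical.
    private static double PatchLineageScoreSketch(IEnumerable<string> patchUrlsA, IEnumerable<string> patchUrlsB)
    {
        static IEnumerable<string> ExtractShas(IEnumerable<string> urls) => urls
            .Select(url => System.Text.RegularExpressions.Regex.Match(url, "[0-9a-f]{40}"))
            .Where(match => match.Success)
            .Select(match => match.Value);

        return ExtractShas(patchUrlsA)
            .Intersect(ExtractShas(patchUrlsB), StringComparer.OrdinalIgnoreCase)
            .Any() ? 1.0 : 0.0;
    }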

    #region CORR-V2-006: Version Compatibility

    [Fact]
    public void VersionCompatibility_EquivalentRanges_ProducesHighScore()
    {
        // Arrange: Same versions for the same package
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.21" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["versionCompatibility"].Should().BeGreaterThanOrEqualTo(0.8);
        result.Conflicts.Should().NotContain(c =>
            c.Reason == "affected-range-divergence" || c.Reason == "disjoint-version-ranges");
    }

    [Fact]
    public void VersionCompatibility_OverlappingRanges_ProducesMediumScoreWithSoftConflict()
    {
        // Arrange: Overlapping but not identical versions
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21", "pkg:npm/lodash@4.17.20" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.20", "pkg:npm/lodash@4.17.19" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.SignalScores["versionCompatibility"].Should().BeInRange(0.4, 0.8);
        result.Conflicts.Should().Contain(c =>
            c.Reason == "affected-range-divergence" && c.Severity == ConflictSeverity.Soft);
    }

    [Fact]
    public void VersionCompatibility_DisjointRanges_ProducesLowScoreWithHardConflict()
    {
        // Arrange: Completely different versions for the same package
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@1.0.0" }),
            CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@9.0.0" })
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.Conflicts.Should().Contain(c =>
            c.Reason == "disjoint-version-ranges" && c.Severity == ConflictSeverity.Hard);
    }

    #endregion
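
    // Illustrative sketch only (CORR-V2-006): a Sorensen-Dice coefficient over the
    // per-package version sets is one simple model that matches the banding above -
    // identical sets give 1.0, the overlapping lodash case gives 2*1/(2+2) = 0.5
    // (inside the asserted [0.4, 0.8] band), and disjoint sets give 0.0.
    private static double VersionOverlapSketch(IReadOnlyCollection<string> purlsA, IReadOnlyCollection<string> purlsB)
    {
        if (purlsA.Count == 0 || purlsB.Count == 0)
        {
            return 0.0;
        }

        var shared = purlsA.Intersect(purlsB, StringComparer.Ordinal).Count();
        return 2.0 * shared / (purlsA.Count + purlsB.Count);
    }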

    #region CORR-V2-008: Integrated Scoring

    [Fact]
    public void IntegratedScoring_HighConfidenceScenario()
    {
        // Arrange: Strong signals across all dimensions
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd",
                aliases: new[] { "CVE-2025-1234" },
                purls: new[] { "pkg:npm/vulnerable-lib@2.0.0" },
                cpes: new[] { "cpe:2.3:a:vendor:vulnerable-lib:2.0.0:*:*:*:*:*:*:*" },
                references: new[] { "https://example.com/advisory" },
                patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" },
                fetchedAt: DateTimeOffset.Parse("2025-01-25T10:00:00Z", CultureInfo.InvariantCulture)),
            CreateInput("obs-b", "ghsa",
                aliases: new[] { "CVE-2025-1234", "GHSA-xxxx-yyyy-zzzz" },
                purls: new[] { "pkg:npm/vulnerable-lib@2.0.0" },
                cpes: new[] { "cpe:2.3:a:vendor:vulnerable-lib:2.0.0:*:*:*:*:*:*:*" },
                references: new[] { "https://example.com/advisory", "https://github.com/advisories/GHSA-xxxx" },
                patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" },
                fetchedAt: DateTimeOffset.Parse("2025-01-25T11:00:00Z", CultureInfo.InvariantCulture))
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.Confidence.Should().BeGreaterThanOrEqualTo(0.85, "all signals strong = high confidence");
        result.Conflicts.Should().BeEmpty();

        // Verify individual signals
        result.SignalScores["aliasConnectivity"].Should().Be(1.0);
        result.SignalScores["aliasAuthority"].Should().Be(1.0); // CVE present
        result.SignalScores["packageCoverage"].Should().BeGreaterThanOrEqualTo(0.8);
        result.SignalScores["patchLineage"].Should().Be(1.0);
        result.SignalScores["freshness"].Should().Be(1.0); // Within 48h
    }

    [Fact]
    public void IntegratedScoring_MixedSignalsScenario()
    {
        // Arrange: Some strong signals, some weak
        // Note: Disconnected aliases will produce an alias-inconsistency conflict
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd",
                aliases: new[] { "CVE-2025-1234" },
                purls: new[] { "pkg:npm/lodash@4.17.21" },
                fetchedAt: DateTimeOffset.Parse("2025-01-10T00:00:00Z", CultureInfo.InvariantCulture)),
            CreateInput("obs-b", "vendor",
                aliases: new[] { "VENDOR-2025-001" }, // No CVE, only a vendor ID
                purls: new[] { "pkg:npm/lodash@4.17.20" }, // Different version
                fetchedAt: DateTimeOffset.Parse("2025-01-25T00:00:00Z", CultureInfo.InvariantCulture)) // 15 days apart
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        // Disconnected aliases + version divergence = conflicts reducing confidence.
        // Minimum confidence is 0.1 when there are conflicts but some evidence.
        result.Confidence.Should().BeInRange(0.1, 0.4, "mixed signals with conflicts = low-moderate confidence");
        result.SignalScores["aliasConnectivity"].Should().BeLessThan(1.0); // Disconnected
        result.SignalScores["freshness"].Should().BeLessThan(0.5); // 15-day spread
    }

    [Fact]
    public void IntegratedScoring_EmptyInputs_ReturnsFullConfidence()
    {
        // Arrange
        var inputs = Array.Empty<LinksetCorrelationV2.InputV2>();

        // Act
        var result = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result.Confidence.Should().Be(1.0);
        result.Conflicts.Should().BeEmpty();
    }

    #endregion
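
    // Illustrative sketch only (CORR-V2-008): the real signal weights are defined in the
    // production scorer, not in this diff; the tests above only constrain the combined
    // result. A weighted average over the named signals, clamped to the 0.1 floor, is
    // the general shape. Every weight below is an assumption for illustration.
    private static double IntegratedConfidenceSketch(IReadOnlyDictionary<string, double> signalScores)
    {
        var assumedWeights = new Dictionary<string, double>
        {
            ["aliasConnectivity"] = 0.25,
            ["aliasAuthority"] = 0.15,
            ["packageCoverage"] = 0.20,
            ["versionCompatibility"] = 0.15,
            ["referenceOverlap"] = 0.10,
            ["patchLineage"] = 0.10,
            ["freshness"] = 0.05,
        };

        var weighted = assumedWeights.Sum(pair => pair.Value * signalScores.GetValueOrDefault(pair.Key));
        return Math.Clamp(weighted, 0.1, 1.0);
    }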

    #region Determinism Tests

    [Fact]
    public void Determinism_SameInputs_ProduceSameOutput()
    {
        // Arrange
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234", "GHSA-xxxx" })
        };

        // Act
        var result1 = LinksetCorrelationV2.Compute(inputs);
        var result2 = LinksetCorrelationV2.Compute(inputs);

        // Assert
        result1.Confidence.Should().Be(result2.Confidence);
        result1.Conflicts.Should().BeEquivalentTo(result2.Conflicts);
        result1.SignalScores.Should().BeEquivalentTo(result2.SignalScores);
    }

    [Fact]
    public void Determinism_InputOrdering_DoesNotAffectResult()
    {
        // Arrange
        var inputsA = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" })
        };

        var inputsB = new[]
        {
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" }),
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" })
        };

        // Act
        var resultA = LinksetCorrelationV2.Compute(inputsA);
        var resultB = LinksetCorrelationV2.Compute(inputsB);

        // Assert
        resultA.Confidence.Should().Be(resultB.Confidence);
    }

    [Fact]
    public void Conflicts_AreDeduplicated()
    {
        // Arrange: Add duplicate conflicts via additionalConflicts.
        // Use inputs that won't generate their own alias-inconsistency.
        var inputs = new[]
        {
            CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
            CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" }) // Same CVE = connected
        };

        var additionalConflicts = new List<AdvisoryLinksetConflict>
        {
            new("custom-field", "custom-reason", new[] { "a", "b" }),
            new("custom-field", "custom-reason", new[] { "a", "b" }) // Duplicate
        };

        // Act
        var result = LinksetCorrelationV2.Compute(inputs, additionalConflicts);

        // Assert: Should deduplicate the additional conflicts
        result.Conflicts.Count(c => c.Reason == "custom-reason").Should().Be(1);
    }

    #endregion
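
    // Illustrative sketch only: a common way to get the order independence asserted in
    // Determinism_InputOrdering_DoesNotAffectResult is to canonicalize input order before
    // scoring. Sorting by ObservationId (an InputV2 field used throughout these tests) is
    // one such scheme; whether the production code does exactly this is not shown here.
    private static IEnumerable<LinksetCorrelationV2.InputV2> CanonicalizeInputsSketch(
        IEnumerable<LinksetCorrelationV2.InputV2> inputs)
        => inputs.OrderBy(input => input.ObservationId, StringComparer.Ordinal);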

    #region Helper Methods

    private static LinksetCorrelationV2.InputV2 CreateInput(
        string observationId,
        string? vendor = null,
        string[]? aliases = null,
        string[]? purls = null,
        string[]? cpes = null,
        string[]? references = null,
        string[]? patchReferences = null,
        DateTimeOffset? fetchedAt = null)
    {
        return new LinksetCorrelationV2.InputV2(
            ObservationId: observationId,
            Vendor: vendor,
            FetchedAt: fetchedAt,
            Aliases: aliases ?? Array.Empty<string>(),
            Purls: purls ?? Array.Empty<string>(),
            Cpes: cpes ?? Array.Empty<string>(),
            References: references ?? Array.Empty<string>(),
            PatchReferences: patchReferences);
    }

    #endregion
}

@@ -0,0 +1,561 @@
// -----------------------------------------------------------------------------
// TextSimilarityScorerTests.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-010
// Description: Unit tests and performance benchmarks for TextSimilarityScorer
// -----------------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using FluentAssertions;
using StellaOps.Concelier.Core.Linksets;
using StellaOps.TestKit;
using Xunit;

namespace StellaOps.Concelier.Core.Tests.Linksets;

/// <summary>
/// Unit tests for <see cref="TextSimilarityScorer"/>.
/// </summary>
public class TextSimilarityScorerTests
{
    private readonly TextSimilarityScorer _scorer = new();

    #region Tokenization Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_EmptyString_ReturnsEmpty()
    {
        // Act
        var tokens = _scorer.Tokenize("");

        // Assert
        tokens.Should().BeEmpty();
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_NullString_ReturnsEmpty()
    {
        // Act
        var tokens = _scorer.Tokenize(null!);

        // Assert
        tokens.Should().BeEmpty();
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_NormalizesToLowercase()
    {
        // Arrange
        var text = "BUFFER OVERFLOW Memory Corruption";

        // Act
        var tokens = _scorer.Tokenize(text);

        // Assert
        tokens.Should().AllSatisfy(t => t.Should().Be(t.ToLowerInvariant()));
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_RemovesStopWords()
    {
        // Arrange
        var text = "The vulnerability allows an attacker to execute code";

        // Act
        var tokens = _scorer.Tokenize(text);

        // Assert - common stop words should be removed
        tokens.Should().NotContain("the");
        tokens.Should().NotContain("an");
        tokens.Should().NotContain("to");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_RemovesShortTokens()
    {
        // Arrange
        var text = "CVE ID in XSS bug";

        // Act
        var tokens = _scorer.Tokenize(text);

        // Assert - tokens shorter than 3 chars should be removed
        tokens.Should().NotContain("id");
        tokens.Should().NotContain("in");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_RemovesNumericTokens()
    {
        // Arrange
        var text = "version 123 release 2024";

        // Act
        var tokens = _scorer.Tokenize(text);

        // Assert - pure numeric tokens should be removed
        tokens.Should().NotContain("123");
        tokens.Should().NotContain("2024");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_KeepsAlphanumericTokens()
    {
        // Arrange
        var text = "CVE2024 log4j2 spring4shell";

        // Act
        var tokens = _scorer.Tokenize(text);

        // Assert - alphanumeric tokens should be kept
        tokens.Should().Contain("cve2024");
        tokens.Should().Contain("log4j2");
        tokens.Should().Contain("spring4shell");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_IsDeterministic()
    {
        // Arrange
        var text = "Memory corruption in JSON parser leads to arbitrary code execution";

        // Act
        var tokens1 = _scorer.Tokenize(text);
        var tokens2 = _scorer.Tokenize(text);

        // Assert
        tokens1.Should().BeEquivalentTo(tokens2, options => options.WithStrictOrdering());
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Tokenize_SortsTokensForDeterminism()
    {
        // Arrange
        var text = "zebra alpha memory parser";

        // Act
        var tokens = _scorer.Tokenize(text);

        // Assert - tokens should be sorted alphabetically
        tokens.Should().BeInAscendingOrder();
    }

    #endregion
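
    // Illustrative sketch only: the tests above pin down the tokenizer's observable
    // behaviour, and a minimal pipeline satisfying all of them looks like this. The
    // stop-word list is a placeholder, not the production list, and the real splitter
    // presumably handles punctuation as well as spaces.
    private static IReadOnlyList<string> TokenizeSketch(string? text, int minTokenLength = 3)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return Array.Empty<string>();
        }

        var stopWords = new HashSet<string> { "the", "a", "an", "to", "and", "or", "but", "in" };
        return text
            .ToLowerInvariant()                                  // Tokenize_NormalizesToLowercase
            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
            .Where(token => token.Length >= minTokenLength)      // Tokenize_RemovesShortTokens
            .Where(token => !token.All(char.IsDigit))            // Tokenize_RemovesNumericTokens
            .Where(token => !stopWords.Contains(token))          // Tokenize_RemovesStopWords
            .Distinct()
            .OrderBy(token => token, StringComparer.Ordinal)     // Tokenize_SortsTokensForDeterminism
            .ToArray();
    }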

    #region Pairwise Similarity Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_IdenticalTexts_ReturnsOne()
    {
        // Arrange
        var text = "A heap-based buffer overflow in libpng allows remote attackers to execute arbitrary code";

        // Act
        var similarity = _scorer.ComputePairwiseSimilarity(text, text);

        // Assert
        similarity.Should().BeApproximately(1.0, 0.01);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_CompletelyDifferent_ReturnsLowScore()
    {
        // Arrange
        var text1 = "SQL injection in database query handler";
        var text2 = "Memory corruption in graphics renderer";

        // Act
        var similarity = _scorer.ComputePairwiseSimilarity(text1, text2);

        // Assert
        similarity.Should().BeLessThan(0.3);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_SimilarDescriptions_ReturnsPositiveScore()
    {
        // Arrange - the same vulnerability described differently
        var text1 = "A heap-based buffer overflow in the PNG image parser allows remote code execution";
        var text2 = "Remote code execution via heap buffer overflow in PNG image processing library";

        // Act
        var similarity = _scorer.ComputePairwiseSimilarity(text1, text2);

        // Assert - TF-IDF similarity for short texts with stop words removed
        // is typically moderate (0.2-0.5 range)
        similarity.Should().BeGreaterThan(0.2);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_EmptyFirst_ReturnsZero()
    {
        // Act
        var similarity = _scorer.ComputePairwiseSimilarity("", "some text here");

        // Assert
        similarity.Should().Be(0.0);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_EmptySecond_ReturnsZero()
    {
        // Act
        var similarity = _scorer.ComputePairwiseSimilarity("some text here", "");

        // Assert
        similarity.Should().Be(0.0);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_OnlyStopWords_ReturnsZero()
    {
        // Arrange - text containing only stop words
        var text1 = "the and or but";
        var text2 = "the and or but";

        // Act
        var similarity = _scorer.ComputePairwiseSimilarity(text1, text2);

        // Assert - no tokens remain after stop word removal
        similarity.Should().Be(0.0);
    }

    #endregion
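
    // Illustrative sketch only: the assertions above describe the scorer as TF-IDF based,
    // and cosine similarity over term vectors is the usual core of such a scorer. This
    // minimal version skips the IDF weighting (corpus statistics are not visible in this
    // diff) and shows only the cosine step over raw token counts.
    private static double CosineSimilaritySketch(IReadOnlyList<string> tokensA, IReadOnlyList<string> tokensB)
    {
        if (tokensA.Count == 0 || tokensB.Count == 0)
        {
            return 0.0; // matches the empty/stop-word-only cases above
        }

        var countsA = tokensA.GroupBy(t => t).ToDictionary(g => g.Key, g => (double)g.Count());
        var countsB = tokensB.GroupBy(t => t).ToDictionary(g => g.Key, g => (double)g.Count());
        var dot = countsA.Sum(pair => pair.Value * countsB.GetValueOrDefault(pair.Key));
        var normA = Math.Sqrt(countsA.Values.Sum(v => v * v));
        var normB = Math.Sqrt(countsB.Values.Sum(v => v * v));
        return dot / (normA * normB); // identical token vectors yield exactly 1.0
    }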

    #region Average Similarity Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputeAverageSimilarity_SingleDescription_ReturnsZero()
    {
        // Arrange
        var descriptions = new[] { "Only one description here" };

        // Act
        var similarity = _scorer.ComputeAverageSimilarity(descriptions);

        // Assert
        similarity.Should().Be(0.0);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputeAverageSimilarity_EmptyCollection_ReturnsZero()
    {
        // Act
        var similarity = _scorer.ComputeAverageSimilarity(Array.Empty<string>());

        // Assert
        similarity.Should().Be(0.0);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputeAverageSimilarity_IdenticalDescriptions_ReturnsOne()
    {
        // Arrange
        var description = "A critical buffer overflow vulnerability in the image processing library";
        var descriptions = new[] { description, description, description };

        // Act
        var similarity = _scorer.ComputeAverageSimilarity(descriptions);

        // Assert
        similarity.Should().BeApproximately(1.0, 0.01);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputeAverageSimilarity_MixedSimilarity_ReturnsReasonableAverage()
    {
        // Arrange - three descriptions of the same CVE from different sources
        var descriptions = new[]
        {
            "A heap-based buffer overflow in libpng before 1.6.37 allows remote attackers to cause denial of service",
            "Buffer overflow vulnerability in PNG library (libpng) can be exploited by remote attackers for DoS",
            "libpng contains a heap overflow that may lead to denial of service when processing malformed PNG files"
        };

        // Act
        var similarity = _scorer.ComputeAverageSimilarity(descriptions);

        // Assert - TF-IDF similarity for related security texts typically
        // produces moderate scores (0.1-0.4 range) after stop word removal
        similarity.Should().BeGreaterThan(0.1);
        similarity.Should().BeLessThanOrEqualTo(1.0);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputeAverageSimilarity_SkipsEmptyDescriptions()
    {
        // Arrange
        var descriptions = new[]
        {
            "A critical vulnerability in the parser",
            "",
            null!,
            " ",
            "A critical vulnerability in the parser"
        };

        // Act
        var similarity = _scorer.ComputeAverageSimilarity(descriptions);

        // Assert - should only consider non-empty descriptions
        similarity.Should().BeApproximately(1.0, 0.01);
    }

    #endregion
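
    // Illustrative sketch only: the behaviour pinned by the tests above - zero for fewer
    // than two usable descriptions, null/blank entries skipped, and a mean over all
    // C(n,2) pairs - can be written directly. pairwiseScore stands in for the scorer's
    // own pairwise computation.
    private static double AverageSimilaritySketch(
        IEnumerable<string?> descriptions,
        Func<string, string, double> pairwiseScore)
    {
        var texts = descriptions
            .Where(d => !string.IsNullOrWhiteSpace(d))
            .Select(d => d!)
            .ToArray();
        if (texts.Length < 2)
        {
            return 0.0;
        }

        var scores = new List<double>();
        for (var i = 0; i < texts.Length; i++)
        {
            for (var j = i + 1; j < texts.Length; j++)
            {
                scores.Add(pairwiseScore(texts[i], texts[j]));
            }
        }

        return scores.Average(); // e.g. 5 descriptions => C(5,2) = 10 pairs
    }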

    #region Options Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void TextSimilarityOptions_DefaultValues_AreCorrect()
    {
        // Arrange & Act
        var options = new TextSimilarityOptions();

        // Assert
        options.Enabled.Should().BeFalse();
        options.Weight.Should().Be(0.05);
        options.MinTokenLength.Should().Be(3);
        options.CustomStopWords.Should().BeNull();
        options.EnableStemming.Should().BeFalse();
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void TextSimilarityOptions_SectionName_IsCorrect()
    {
        // Assert
        TextSimilarityOptions.SectionName.Should().Be("Concelier:Correlation:TextSimilarity");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void Scorer_WithCustomStopWords_UsesCustomList()
    {
        // Arrange
        var options = new TextSimilarityOptions
        {
            CustomStopWords = new[] { "custom", "stop", "words" }
        };
        var scorer = new TextSimilarityScorer(options);

        // Act
        var tokens = scorer.Tokenize("custom stop words remain here");

        // Assert - custom stop words should be removed
        tokens.Should().NotContain("custom");
        tokens.Should().NotContain("stop");
        tokens.Should().NotContain("words");
        tokens.Should().Contain("remain");
        tokens.Should().Contain("here");
    }

    #endregion
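
    // Illustrative note only: SectionName follows the standard .NET options pattern, so
    // registration presumably looks like the usual Configure<T> call (the services and
    // configuration variables are assumed, not taken from this diff):
    //
    //     services.Configure<TextSimilarityOptions>(
    //         configuration.GetSection(TextSimilarityOptions.SectionName));
    //
    // A matching appsettings.json fragment for "Concelier:Correlation:TextSimilarity":
    //
    //     { "Concelier": { "Correlation": { "TextSimilarity": { "Enabled": true, "Weight": 0.05 } } } }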

    #region Real-World Description Fixtures

    [Trait("Category", TestCategories.Unit)]
    [Theory]
    [MemberData(nameof(RealWorldDescriptionFixtures))]
    public void ComputeAverageSimilarity_RealWorldFixtures_ReturnsExpectedRange(
        string[] descriptions,
        double minExpected,
        double maxExpected,
        string scenario)
    {
        // Act
        var similarity = _scorer.ComputeAverageSimilarity(descriptions);

        // Assert
        similarity.Should().BeGreaterThanOrEqualTo(minExpected,
            because: $"scenario '{scenario}' should have similarity >= {minExpected}");
        similarity.Should().BeLessThanOrEqualTo(maxExpected,
            because: $"scenario '{scenario}' should have similarity <= {maxExpected}");
    }

    public static IEnumerable<object[]> RealWorldDescriptionFixtures()
    {
        // CVE-2021-44228 (Log4Shell) - the same vulnerability, different sources.
        // TF-IDF similarity for related security texts is typically 0.1-0.5.
        yield return new object[]
        {
            new[]
            {
                "Apache Log4j2 2.0-beta9 through 2.15.0 (excluding security releases 2.12.2, 2.12.3, and 2.3.1) JNDI features used in configuration, log messages, and parameters do not protect against attacker controlled LDAP and other JNDI related endpoints.",
                "A flaw was found in the Java logging library Apache Log4j in version 2.x. When configured to use a JNDI URL with a LDAP scheme, an attacker can execute arbitrary code.",
                "Remote code execution vulnerability in Apache Log4j2 allows attackers to execute arbitrary code via JNDI lookup in log messages."
            },
            0.05, 0.9, "Log4Shell - same CVE, different sources"
        };

        // Unrelated vulnerabilities - should have low similarity
        yield return new object[]
        {
            new[]
            {
                "SQL injection vulnerability in the login form allows authentication bypass",
                "Cross-site scripting (XSS) in the comments section enables script injection",
                "Buffer overflow in image processing library causes denial of service"
            },
            0.0, 0.4, "Unrelated vulnerabilities"
        };

        // Same library, different CVEs - moderate similarity
        yield return new object[]
        {
            new[]
            {
                "OpenSSL before 3.0.7 allows remote attackers to cause a denial of service via a crafted X.509 certificate",
                "OpenSSL 3.0.x before 3.0.5 contains a heap-based buffer overflow in the SM2 implementation",
                "A timing-based side channel in OpenSSL allows recovery of private key material"
            },
            0.05, 0.6, "Same library (OpenSSL), different CVEs"
        };
    }

    #endregion

    #region Determinism Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputeAverageSimilarity_IsDeterministic()
    {
        // Arrange
        var descriptions = new[]
        {
            "A heap-based buffer overflow in libpng",
            "Buffer overflow in PNG library",
            "libpng heap overflow vulnerability"
        };

        // Act
        var similarity1 = _scorer.ComputeAverageSimilarity(descriptions);
        var similarity2 = _scorer.ComputeAverageSimilarity(descriptions);
        var similarity3 = _scorer.ComputeAverageSimilarity(descriptions);

        // Assert
        similarity1.Should().Be(similarity2);
        similarity2.Should().Be(similarity3);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public void ComputePairwiseSimilarity_IsDeterministic()
    {
        // Arrange
        var text1 = "Memory corruption in JSON parser";
        var text2 = "JSON parser memory corruption vulnerability";

        // Act
        var similarity1 = _scorer.ComputePairwiseSimilarity(text1, text2);
        var similarity2 = _scorer.ComputePairwiseSimilarity(text1, text2);

        // Assert
        similarity1.Should().Be(similarity2);
    }

    #endregion
}

/// <summary>
/// Performance benchmarks for <see cref="TextSimilarityScorer"/>.
/// Target: <= 5ms per pair.
/// </summary>
public class TextSimilarityScorerBenchmarks
{
    private readonly TextSimilarityScorer _scorer = new();

    [Trait("Category", TestCategories.Performance)]
    [Fact]
    public void ComputePairwiseSimilarity_MeetsPerformanceTarget()
    {
        // Arrange - realistic vulnerability descriptions
        var text1 = "A heap-based buffer overflow vulnerability has been discovered in the image processing library libpng version 1.6.37. Remote attackers can exploit this flaw by providing specially crafted PNG files, potentially leading to arbitrary code execution or denial of service conditions.";
        var text2 = "The PNG image handling library (libpng) contains a buffer overflow vulnerability in the row processing function. Exploitation of this issue allows attackers to execute arbitrary code in the context of the application using the affected library.";

        // Warmup
        for (var i = 0; i < 10; i++)
        {
            _scorer.ComputePairwiseSimilarity(text1, text2);
        }

        // Act - measure 100 iterations
        var sw = Stopwatch.StartNew();
        const int iterations = 100;

        for (var i = 0; i < iterations; i++)
        {
            _scorer.ComputePairwiseSimilarity(text1, text2);
        }

        sw.Stop();
        var averageMs = sw.Elapsed.TotalMilliseconds / iterations;

        // Assert - target: <= 5ms per pair
        averageMs.Should().BeLessThanOrEqualTo(5.0,
            because: $"text similarity computation should complete within 5ms per pair (actual: {averageMs:F3} ms)");
    }

    [Trait("Category", TestCategories.Performance)]
    [Fact]
    public void ComputeAverageSimilarity_FiveDescriptions_MeetsPerformanceTarget()
    {
        // Arrange - 5 descriptions = 10 pairs
        var descriptions = new[]
        {
            "Apache Log4j2 JNDI features do not protect against attacker controlled LDAP endpoints",
            "A flaw in Log4j in version 2.x allows attackers to execute arbitrary code via JNDI lookup",
            "Remote code execution in Apache Log4j2 via malicious JNDI lookup patterns",
            "Log4j2 vulnerability allows remote attackers to execute code through JNDI injection",
            "Critical RCE vulnerability in Apache Log4j2 logging library through JNDI features"
        };

        // Warmup
        for (var i = 0; i < 10; i++)
        {
            _scorer.ComputeAverageSimilarity(descriptions);
        }

        // Act
        var sw = Stopwatch.StartNew();
        const int iterations = 100;

        for (var i = 0; i < iterations; i++)
        {
            _scorer.ComputeAverageSimilarity(descriptions);
        }

        sw.Stop();
        var averageMs = sw.Elapsed.TotalMilliseconds / iterations;
        var pairsPerCall = 10; // C(5,2) = 10 pairs
        var msPerPair = averageMs / pairsPerCall;

        // Assert - target: <= 5ms per pair
        msPerPair.Should().BeLessThanOrEqualTo(5.0,
            because: $"text similarity computation should complete within 5ms per pair (actual: {msPerPair:F3} ms)");
    }
}