Consolidate devops folders

This commit is contained in:
master
2026-01-25 23:27:41 +02:00
parent 6e687b523a
commit a743bb9a1d
613 changed files with 8611 additions and 41846 deletions

View File

@@ -121,6 +121,70 @@ public static class AdvisoryCacheKeys
public static string CveMappingPattern(string prefix = DefaultPrefix)
=> $"{prefix}by:cve:*";
// -------------------------------------------------------------------------
// IDF (Inverse Document Frequency) Cache Keys
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// -------------------------------------------------------------------------
/// <summary>
/// Key for IDF score of a specific package.
/// Format: {prefix}idf:pkg:{normalizedPackageName}
/// </summary>
/// <param name="packageName">The package name (will be normalized).</param>
/// <param name="prefix">Key prefix.</param>
public static string IdfPackage(string packageName, string prefix = DefaultPrefix)
=> $"{prefix}idf:pkg:{NormalizePurl(packageName)}";
/// <summary>
/// Key for IDF corpus statistics (total document count).
/// Format: {prefix}idf:stats:corpus_size
/// </summary>
public static string IdfCorpusSize(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:corpus_size";
/// <summary>
/// Key for IDF last refresh timestamp.
/// Format: {prefix}idf:stats:last_refresh
/// </summary>
public static string IdfLastRefresh(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:last_refresh";
/// <summary>
/// Key for IDF refresh lock (distributed coordination).
/// Format: {prefix}idf:lock:refresh
/// </summary>
public static string IdfRefreshLock(string prefix = DefaultPrefix)
=> $"{prefix}idf:lock:refresh";
/// <summary>
/// Key for document frequency of a package (count of observations containing the package).
/// Format: {prefix}idf:df:{normalizedPackageName}
/// </summary>
public static string IdfDocumentFrequency(string packageName, string prefix = DefaultPrefix)
=> $"{prefix}idf:df:{NormalizePurl(packageName)}";
/// <summary>
/// Pattern to match all IDF package keys (for scanning/cleanup).
/// Format: {prefix}idf:pkg:*
/// </summary>
public static string IdfPackagePattern(string prefix = DefaultPrefix)
=> $"{prefix}idf:pkg:*";
/// <summary>
/// Key for IDF cache hit counter.
/// Format: {prefix}idf:stats:hits
/// </summary>
public static string IdfStatsHits(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:hits";
/// <summary>
/// Key for IDF cache miss counter.
/// Format: {prefix}idf:stats:misses
/// </summary>
public static string IdfStatsMisses(string prefix = DefaultPrefix)
=> $"{prefix}idf:stats:misses";
/// <summary>
/// Normalizes a PURL for use as a cache key.
/// </summary>

View File

@@ -0,0 +1,153 @@
// -----------------------------------------------------------------------------
// IPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Interface for package IDF (Inverse Document Frequency) caching
// -----------------------------------------------------------------------------
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Service for computing and caching IDF (Inverse Document Frequency) weights
/// for package keys used in linkset correlation.
/// </summary>
/// <remarks>
/// IDF measures how discriminative a package is across the observation corpus:
/// <code>
/// idf(pkg) = log(N / (1 + df(pkg)))
/// </code>
/// where N = total observations, df = observations containing the package.
///
/// Rare packages (low df) have high IDF → stronger correlation signal.
/// Common packages (high df) have low IDF → weaker correlation signal.
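///
/// A worked example with hypothetical numbers: for N = 10,000 observations,
/// a rare package with df = 9 gets idf = log(10000 / 10) ≈ 6.9, while a
/// ubiquitous one with df = 4,999 gets idf = log(10000 / 5000) ≈ 0.69; after
/// max-normalization (see <see cref="PackageIdfOptions.NormalizeScores"/>)
/// these become roughly 1.0 and 0.1.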
/// </remarks>
public interface IPackageIdfService
{
/// <summary>
/// Gets the IDF weight for a package key.
/// </summary>
/// <param name="packageName">The package name (PURL format).</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>
/// The IDF weight (0.0-1.0 normalized), or null if not cached.
/// Returns null on cache miss or error (graceful degradation).
/// </returns>
Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default);
/// <summary>
/// Gets IDF weights for multiple package keys in a single batch operation.
/// </summary>
/// <param name="packageNames">The package names to look up.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>
/// Dictionary of package name to IDF weight. Missing entries indicate cache miss.
/// </returns>
Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
IEnumerable<string> packageNames,
CancellationToken cancellationToken = default);
/// <summary>
/// Sets the IDF weight for a package key.
/// </summary>
/// <param name="packageName">The package name.</param>
/// <param name="idfWeight">The IDF weight (0.0-1.0 normalized).</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default);
/// <summary>
/// Sets IDF weights for multiple package keys in a single batch operation.
/// </summary>
/// <param name="idfWeights">Dictionary of package name to IDF weight.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task SetIdfBatchAsync(
IReadOnlyDictionary<string, double> idfWeights,
CancellationToken cancellationToken = default);
/// <summary>
/// Updates the corpus statistics used for IDF computation.
/// </summary>
/// <param name="corpusSize">Total number of observations in the corpus.</param>
/// <param name="documentFrequencies">Dictionary of package name to document frequency.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task UpdateCorpusStatsAsync(
long corpusSize,
IReadOnlyDictionary<string, long> documentFrequencies,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the last refresh timestamp for IDF statistics.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The last refresh time, or null if never refreshed.</returns>
Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default);
/// <summary>
/// Invalidates cached IDF data for a specific package.
/// </summary>
/// <param name="packageName">The package name to invalidate.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default);
/// <summary>
/// Invalidates all cached IDF data.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
Task InvalidateAllAsync(CancellationToken cancellationToken = default);
/// <summary>
/// Whether the IDF cache is enabled and available.
/// </summary>
bool IsEnabled { get; }
}
/// <summary>
/// Configuration options for the package IDF service.
/// </summary>
public sealed class PackageIdfOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "Concelier:PackageIdf";
/// <summary>
/// Whether IDF caching is enabled.
/// </summary>
public bool Enabled { get; set; } = true;
/// <summary>
/// TTL for cached IDF scores.
/// Default: 1 hour.
/// </summary>
public TimeSpan IdfTtl { get; set; } = TimeSpan.FromHours(1);
/// <summary>
/// TTL for corpus statistics.
/// Default: 4 hours.
/// </summary>
public TimeSpan CorpusStatsTtl { get; set; } = TimeSpan.FromHours(4);
/// <summary>
/// Minimum IDF value to cache (to avoid caching very common packages).
/// Default: 0.01.
/// </summary>
public double MinIdfThreshold { get; set; } = 0.01;
/// <summary>
/// Default IDF weight to return on cache miss (uniform weight).
/// Default: 1.0 (no discrimination).
/// </summary>
public double DefaultIdfWeight { get; set; } = 1.0;
/// <summary>
/// Maximum number of IDF entries to cache.
/// Default: 100,000.
/// </summary>
public int MaxCacheEntries { get; set; } = 100_000;
/// <summary>
/// Whether to normalize IDF scores to 0.0-1.0 range.
/// Default: true.
/// </summary>
public bool NormalizeScores { get; set; } = true;
}
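// Wiring sketch (hypothetical host code, not part of this commit): uses the
// AddConcelierValkeyCache overload added to ServiceCollectionExtensions below,
// with values mirroring the defaults above.
//
// services.AddConcelierValkeyCache(
//     configureOptions: cache => { /* ConcelierCacheOptions as needed */ },
//     configureIdfOptions: idf =>
//     {
//         idf.Enabled = true;
//         idf.IdfTtl = TimeSpan.FromHours(1);
//         idf.CorpusStatsTtl = TimeSpan.FromHours(4);
//         idf.MinIdfThreshold = 0.01;
//     });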

View File

@@ -0,0 +1,139 @@
// -----------------------------------------------------------------------------
// IdfRefreshHostedService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Background service for periodic IDF weight refresh
// -----------------------------------------------------------------------------
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Interface for providing IDF corpus statistics from the observation store.
/// </summary>
/// <remarks>
/// This interface should be implemented by the Concelier Core module to provide
/// document frequencies from the actual observation database.
/// </remarks>
public interface IIdfCorpusProvider
{
/// <summary>
/// Gets the total number of observations in the corpus.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Total observation count.</returns>
Task<long> GetCorpusSizeAsync(CancellationToken cancellationToken = default);
/// <summary>
/// Gets document frequencies for all packages in the corpus.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Dictionary mapping package name to the number of observations containing it.</returns>
Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken cancellationToken = default);
}
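// A minimal illustrative implementation (hypothetical; the real provider in
// Concelier.Core would query the observation store): GetCorpusSizeAsync
// returns N and GetDocumentFrequenciesAsync returns df per package key.
internal sealed class InMemoryIdfCorpusProvider : IIdfCorpusProvider
{
private readonly long _corpusSize;
private readonly IReadOnlyDictionary<string, long> _documentFrequencies;
public InMemoryIdfCorpusProvider(long corpusSize, IReadOnlyDictionary<string, long> documentFrequencies)
{
_corpusSize = corpusSize;
_documentFrequencies = documentFrequencies;
}
public Task<long> GetCorpusSizeAsync(CancellationToken cancellationToken = default)
=> Task.FromResult(_corpusSize);
public Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken cancellationToken = default)
=> Task.FromResult(_documentFrequencies);
}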
/// <summary>
/// Background service that periodically refreshes IDF weights from the observation corpus.
/// </summary>
public sealed class IdfRefreshHostedService : BackgroundService
{
private readonly IPackageIdfService _idfService;
private readonly IIdfCorpusProvider? _corpusProvider;
private readonly PackageIdfOptions _options;
private readonly ILogger<IdfRefreshHostedService>? _logger;
/// <summary>
/// Initializes a new instance of <see cref="IdfRefreshHostedService"/>.
/// </summary>
public IdfRefreshHostedService(
IPackageIdfService idfService,
IOptions<PackageIdfOptions> options,
IIdfCorpusProvider? corpusProvider = null,
ILogger<IdfRefreshHostedService>? logger = null)
{
_idfService = idfService ?? throw new ArgumentNullException(nameof(idfService));
_corpusProvider = corpusProvider;
_options = options?.Value ?? new PackageIdfOptions();
_logger = logger;
}
/// <inheritdoc />
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
if (!_idfService.IsEnabled)
{
_logger?.LogInformation("IDF refresh service disabled (IDF caching not enabled)");
return;
}
if (_corpusProvider is null)
{
_logger?.LogWarning(
"IDF refresh service has no corpus provider registered. " +
"Register IIdfCorpusProvider to enable automatic IDF refresh.");
return;
}
// Initial delay before first refresh (allow other services to start)
await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken).ConfigureAwait(false);
while (!stoppingToken.IsCancellationRequested)
{
try
{
await RefreshIdfWeightsAsync(stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
catch (Exception ex)
{
_logger?.LogError(ex, "Error during IDF refresh cycle");
}
// Wait for the next refresh interval (reuses IdfTtl; default: 1 hour)
try
{
await Task.Delay(_options.IdfTtl, stoppingToken).ConfigureAwait(false);
}
catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
{
break;
}
}
_logger?.LogInformation("IDF refresh service stopped");
}
private async Task RefreshIdfWeightsAsync(CancellationToken cancellationToken)
{
_logger?.LogDebug("Starting IDF refresh cycle");
var corpusSize = await _corpusProvider!.GetCorpusSizeAsync(cancellationToken).ConfigureAwait(false);
if (corpusSize == 0)
{
_logger?.LogWarning("IDF refresh skipped: empty corpus");
return;
}
var documentFrequencies = await _corpusProvider.GetDocumentFrequenciesAsync(cancellationToken).ConfigureAwait(false);
if (documentFrequencies.Count == 0)
{
_logger?.LogWarning("IDF refresh skipped: no document frequencies");
return;
}
await _idfService.UpdateCorpusStatsAsync(corpusSize, documentFrequencies, cancellationToken).ConfigureAwait(false);
_logger?.LogInformation(
"IDF refresh completed: corpus={CorpusSize}, packages={PackageCount}",
corpusSize,
documentFrequencies.Count);
}
}

View File

@@ -0,0 +1,249 @@
// -----------------------------------------------------------------------------
// PackageIdfMetrics.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: OpenTelemetry metrics for package IDF caching operations
// -----------------------------------------------------------------------------
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Metrics instrumentation for the package IDF cache.
/// </summary>
public sealed class PackageIdfMetrics : IDisposable
{
/// <summary>
/// Activity source name for IDF cache operations.
/// </summary>
public const string ActivitySourceName = "StellaOps.Concelier.PackageIdf";
/// <summary>
/// Meter name for IDF cache metrics.
/// </summary>
public const string MeterName = "StellaOps.Concelier.PackageIdf";
private readonly Meter _meter;
private readonly Counter<long> _hitsCounter;
private readonly Counter<long> _missesCounter;
private readonly Counter<long> _refreshCounter;
private readonly Histogram<double> _latencyHistogram;
private readonly Histogram<double> _idfWeightHistogram;
private readonly ObservableGauge<long> _corpusSizeGauge;
private readonly ObservableGauge<long> _cachedEntriesGauge;
private long _lastKnownCorpusSize;
private long _lastKnownCachedEntries;
/// <summary>
/// Activity source for tracing IDF cache operations.
/// </summary>
public static ActivitySource ActivitySource { get; } = new(ActivitySourceName, "1.0.0");
/// <summary>
/// Initializes a new instance of <see cref="PackageIdfMetrics"/>.
/// </summary>
public PackageIdfMetrics()
{
_meter = new Meter(MeterName, "1.0.0");
_hitsCounter = _meter.CreateCounter<long>(
"concelier_linkset_package_idf_hits_total",
unit: "{hits}",
description: "Total number of package IDF cache hits");
_missesCounter = _meter.CreateCounter<long>(
"concelier_linkset_package_idf_misses_total",
unit: "{misses}",
description: "Total number of package IDF cache misses");
_refreshCounter = _meter.CreateCounter<long>(
"concelier_linkset_package_idf_refreshes_total",
unit: "{refreshes}",
description: "Total number of IDF corpus refresh operations");
_latencyHistogram = _meter.CreateHistogram<double>(
"concelier_linkset_package_idf_latency_ms",
unit: "ms",
description: "Package IDF cache operation latency in milliseconds");
_idfWeightHistogram = _meter.CreateHistogram<double>(
"concelier_linkset_package_idf_weight",
unit: "{weight}",
description: "Distribution of package IDF weights (0.0-1.0)");
_corpusSizeGauge = _meter.CreateObservableGauge(
"concelier_linkset_package_idf_corpus_size",
() => _lastKnownCorpusSize,
unit: "{observations}",
description: "Total number of observations in the IDF corpus");
_cachedEntriesGauge = _meter.CreateObservableGauge(
"concelier_linkset_package_idf_cached_entries",
() => _lastKnownCachedEntries,
unit: "{entries}",
description: "Number of cached IDF entries");
}
/// <summary>
/// Records a cache hit.
/// </summary>
public void RecordHit() => _hitsCounter.Add(1);
/// <summary>
/// Records multiple cache hits.
/// </summary>
/// <param name="count">Number of hits.</param>
public void RecordHits(long count) => _hitsCounter.Add(count);
/// <summary>
/// Records a cache miss.
/// </summary>
public void RecordMiss() => _missesCounter.Add(1);
/// <summary>
/// Records multiple cache misses.
/// </summary>
/// <param name="count">Number of misses.</param>
public void RecordMisses(long count) => _missesCounter.Add(count);
/// <summary>
/// Records a corpus refresh operation.
/// </summary>
/// <param name="packageCount">Number of packages refreshed.</param>
public void RecordRefresh(long packageCount = 1)
{
_refreshCounter.Add(1, new KeyValuePair<string, object?>("package_count", packageCount));
}
/// <summary>
/// Records operation latency.
/// </summary>
/// <param name="milliseconds">Latency in milliseconds.</param>
/// <param name="operation">The operation type (get, set, batch_get, refresh).</param>
public void RecordLatency(double milliseconds, string operation)
{
_latencyHistogram.Record(milliseconds, new KeyValuePair<string, object?>("operation", operation));
}
/// <summary>
/// Records an IDF weight observation for distribution analysis.
/// </summary>
/// <param name="weight">The IDF weight (0.0-1.0).</param>
public void RecordIdfWeight(double weight)
{
_idfWeightHistogram.Record(weight);
}
/// <summary>
/// Updates the corpus size gauge.
/// </summary>
/// <param name="size">Current corpus size.</param>
public void UpdateCorpusSize(long size)
{
_lastKnownCorpusSize = size;
}
/// <summary>
/// Updates the cached entries gauge.
/// </summary>
/// <param name="count">Current cached entry count.</param>
public void UpdateCachedEntries(long count)
{
_lastKnownCachedEntries = count;
}
/// <summary>
/// Starts an activity for tracing an IDF cache operation.
/// </summary>
/// <param name="operationName">Name of the operation.</param>
/// <returns>The activity, or null if tracing is disabled.</returns>
public static Activity? StartActivity(string operationName)
{
return ActivitySource.StartActivity(operationName, ActivityKind.Internal);
}
/// <summary>
/// Starts an activity with tags.
/// </summary>
/// <param name="operationName">Name of the operation.</param>
/// <param name="tags">Tags to add to the activity.</param>
/// <returns>The activity, or null if tracing is disabled.</returns>
public static Activity? StartActivity(string operationName, params (string Key, object? Value)[] tags)
{
var activity = ActivitySource.StartActivity(operationName, ActivityKind.Internal);
if (activity is not null)
{
foreach (var (key, value) in tags)
{
activity.SetTag(key, value);
}
}
return activity;
}
/// <inheritdoc />
public void Dispose()
{
_meter.Dispose();
}
}
/// <summary>
/// Extension methods for timing IDF cache operations.
/// </summary>
public static class PackageIdfMetricsExtensions
{
/// <summary>
/// Times an async operation and records the latency.
/// </summary>
public static async Task<T> TimeAsync<T>(
this PackageIdfMetrics? metrics,
string operation,
Func<Task<T>> action)
{
if (metrics is null)
{
return await action().ConfigureAwait(false);
}
var sw = Stopwatch.StartNew();
try
{
return await action().ConfigureAwait(false);
}
finally
{
sw.Stop();
metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
}
}
/// <summary>
/// Times an async operation and records the latency.
/// </summary>
public static async Task TimeAsync(
this PackageIdfMetrics? metrics,
string operation,
Func<Task> action)
{
if (metrics is null)
{
await action().ConfigureAwait(false);
return;
}
var sw = Stopwatch.StartNew();
try
{
await action().ConfigureAwait(false);
}
finally
{
sw.Stop();
metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
}
}
}
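// Usage sketch for the timing helpers above (hypothetical caller; metrics and
// idfService are assumed injected). Latency is recorded under operation="get"
// even when the lookup throws:
//
// var weight = await metrics.TimeAsync(
//     "get",
//     () => idfService.GetIdfAsync("pkg:npm/lodash"));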

View File

@@ -32,6 +32,10 @@ public static class ServiceCollectionExtensions
services.Configure<ConcelierCacheOptions>(
configuration.GetSection(ConcelierCacheOptions.SectionName));
// Bind package IDF options (CORR-V2-007)
services.Configure<PackageIdfOptions>(
configuration.GetSection(PackageIdfOptions.SectionName));
return AddCoreServices(services, enableWarmup);
}
@@ -39,16 +43,23 @@ public static class ServiceCollectionExtensions
/// Adds Concelier Valkey cache services with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure options.</param>
/// <param name="configureOptions">Action to configure cache options.</param>
/// <param name="configureIdfOptions">Optional action to configure IDF options.</param>
/// <param name="enableWarmup">Whether to enable background cache warmup.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddConcelierValkeyCache(
this IServiceCollection services,
Action<ConcelierCacheOptions> configureOptions,
Action<PackageIdfOptions>? configureIdfOptions = null,
bool enableWarmup = true)
{
services.Configure(configureOptions);
if (configureIdfOptions is not null)
{
services.Configure(configureIdfOptions);
}
return AddCoreServices(services, enableWarmup);
}
@@ -59,9 +70,11 @@ public static class ServiceCollectionExtensions
// Register metrics
services.TryAddSingleton<ConcelierCacheMetrics>();
services.TryAddSingleton<PackageIdfMetrics>();
// Register cache services
services.TryAddSingleton<IAdvisoryCacheService, ValkeyAdvisoryCacheService>();
services.TryAddSingleton<IPackageIdfService, ValkeyPackageIdfService>();
// Register warmup hosted service if enabled
if (enableWarmup)
@@ -69,6 +82,10 @@ public static class ServiceCollectionExtensions
services.AddHostedService<CacheWarmupHostedService>();
}
// Register IDF refresh hosted service (CORR-V2-007)
// Note: Requires IIdfCorpusProvider to be registered by Concelier.Core
services.AddHostedService<IdfRefreshHostedService>();
return services;
}

View File

@@ -0,0 +1,421 @@
// -----------------------------------------------------------------------------
// ValkeyPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Valkey-backed implementation of IPackageIdfService
// -----------------------------------------------------------------------------
using System.Diagnostics;
using System.Globalization;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StackExchange.Redis;
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Valkey-backed implementation of <see cref="IPackageIdfService"/>.
/// Provides caching for package IDF (Inverse Document Frequency) weights
/// used in linkset correlation scoring.
/// </summary>
/// <remarks>
/// <para>
/// This service caches pre-computed IDF weights with hourly refresh.
/// On cache miss, it returns null to signal the caller should use uniform weights.
/// </para>
/// <para>
/// Key features:
/// - Batch operations for efficient multi-package lookups
/// - Graceful degradation on Valkey errors (returns null, logs warning)
/// - TTL-based expiration with configurable refresh intervals
/// - OpenTelemetry metrics for monitoring cache performance
/// </para>
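/// <para>
/// Typical read path (hypothetical caller):
/// <code>
/// var weights = await idfService.GetIdfBatchAsync(packageNames);
/// // Fall back to PackageIdfOptions.DefaultIdfWeight for any package
/// // missing from the result (cache miss).
/// </code>
/// </para>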
/// </remarks>
public sealed class ValkeyPackageIdfService : IPackageIdfService
{
private readonly ConcelierCacheConnectionFactory _connectionFactory;
private readonly ConcelierCacheOptions _cacheOptions;
private readonly PackageIdfOptions _idfOptions;
private readonly PackageIdfMetrics? _metrics;
private readonly ILogger<ValkeyPackageIdfService>? _logger;
/// <summary>
/// Initializes a new instance of <see cref="ValkeyPackageIdfService"/>.
/// </summary>
public ValkeyPackageIdfService(
ConcelierCacheConnectionFactory connectionFactory,
IOptions<ConcelierCacheOptions> cacheOptions,
IOptions<PackageIdfOptions> idfOptions,
PackageIdfMetrics? metrics = null,
ILogger<ValkeyPackageIdfService>? logger = null)
{
_connectionFactory = connectionFactory ?? throw new ArgumentNullException(nameof(connectionFactory));
_cacheOptions = cacheOptions?.Value ?? new ConcelierCacheOptions();
_idfOptions = idfOptions?.Value ?? new PackageIdfOptions();
_metrics = metrics;
_logger = logger;
}
/// <inheritdoc />
public bool IsEnabled => _cacheOptions.Enabled && _idfOptions.Enabled;
/// <inheritdoc />
public async Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return null;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);
var cached = await db.StringGetAsync(key).ConfigureAwait(false);
if (cached.HasValue && double.TryParse((string?)cached, NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
{
await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsHits(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
_metrics?.RecordHit();
_metrics?.RecordIdfWeight(weight);
return weight;
}
await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsMisses(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
_metrics?.RecordMiss();
return null;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to get IDF for package {PackageName}", packageName);
return null; // Graceful degradation
}
finally
{
StopTiming(sw, "get");
}
}
/// <inheritdoc />
public async Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
IEnumerable<string> packageNames,
CancellationToken cancellationToken = default)
{
var names = packageNames?.Where(n => !string.IsNullOrWhiteSpace(n)).Distinct().ToArray()
?? Array.Empty<string>();
if (!IsEnabled || names.Length == 0)
{
return new Dictionary<string, double>();
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var keys = names.Select(n => (RedisKey)AdvisoryCacheKeys.IdfPackage(n, _cacheOptions.KeyPrefix)).ToArray();
var values = await db.StringGetAsync(keys).ConfigureAwait(false);
var result = new Dictionary<string, double>(names.Length);
var hits = 0;
var misses = 0;
for (var i = 0; i < names.Length; i++)
{
if (values[i].HasValue &&
double.TryParse((string?)values[i], NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
{
result[names[i]] = weight;
hits++;
_metrics?.RecordIdfWeight(weight);
}
else
{
misses++;
}
}
if (hits > 0) _metrics?.RecordHits(hits);
if (misses > 0) _metrics?.RecordMisses(misses);
return result;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to batch get IDF for {Count} packages", names.Length);
return new Dictionary<string, double>();
}
finally
{
StopTiming(sw, "batch_get");
}
}
/// <inheritdoc />
public async Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return;
}
// Skip caching weights below threshold (very common packages)
if (idfWeight < _idfOptions.MinIdfThreshold)
{
return;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);
var value = idfWeight.ToString("F6", CultureInfo.InvariantCulture);
await db.StringSetAsync(key, value, _idfOptions.IdfTtl).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to set IDF for package {PackageName}", packageName);
}
finally
{
StopTiming(sw, "set");
}
}
/// <inheritdoc />
public async Task SetIdfBatchAsync(
IReadOnlyDictionary<string, double> idfWeights,
CancellationToken cancellationToken = default)
{
if (!IsEnabled || idfWeights is null || idfWeights.Count == 0)
{
return;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var entries = idfWeights
.Where(kv => !string.IsNullOrWhiteSpace(kv.Key) && kv.Value >= _idfOptions.MinIdfThreshold)
.Select(kv => new KeyValuePair<RedisKey, RedisValue>(
AdvisoryCacheKeys.IdfPackage(kv.Key, _cacheOptions.KeyPrefix),
kv.Value.ToString("F6", CultureInfo.InvariantCulture)))
.ToArray();
if (entries.Length == 0)
{
return;
}
// Use pipeline for batch set with TTL
var batch = db.CreateBatch();
var tasks = new List<Task>(entries.Length);
foreach (var entry in entries)
{
tasks.Add(batch.StringSetAsync(entry.Key, entry.Value, _idfOptions.IdfTtl));
}
batch.Execute();
await Task.WhenAll(tasks).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to batch set IDF for {Count} packages", idfWeights.Count);
}
finally
{
StopTiming(sw, "batch_set");
}
}
/// <inheritdoc />
public async Task UpdateCorpusStatsAsync(
long corpusSize,
IReadOnlyDictionary<string, long> documentFrequencies,
CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;
// Update corpus size
await db.StringSetAsync(
AdvisoryCacheKeys.IdfCorpusSize(prefix),
corpusSize.ToString(CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl).ConfigureAwait(false);
// Compute and cache IDF weights
var idfWeights = new Dictionary<string, double>(documentFrequencies.Count);
var maxIdf = 0.0;
foreach (var (packageName, df) in documentFrequencies)
{
// IDF formula: log(N / (1 + df))
var rawIdf = Math.Log((double)corpusSize / (1 + df));
if (rawIdf > maxIdf) maxIdf = rawIdf;
idfWeights[packageName] = rawIdf;
}
// Normalize if configured
if (_idfOptions.NormalizeScores && maxIdf > 0)
{
foreach (var key in idfWeights.Keys.ToArray())
{
idfWeights[key] /= maxIdf;
}
}
// Batch set the normalized IDF weights
await SetIdfBatchAsync(idfWeights, cancellationToken).ConfigureAwait(false);
// Update document frequencies
var batch = db.CreateBatch();
var tasks = new List<Task>(documentFrequencies.Count);
foreach (var (packageName, df) in documentFrequencies)
{
tasks.Add(batch.StringSetAsync(
AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix),
df.ToString(CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl));
}
batch.Execute();
await Task.WhenAll(tasks).ConfigureAwait(false);
// Update last refresh timestamp
await db.StringSetAsync(
AdvisoryCacheKeys.IdfLastRefresh(prefix),
DateTimeOffset.UtcNow.ToString("o", CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl).ConfigureAwait(false);
_metrics?.UpdateCorpusSize(corpusSize);
_metrics?.UpdateCachedEntries(documentFrequencies.Count);
_metrics?.RecordRefresh(documentFrequencies.Count);
_logger?.LogInformation(
"Updated IDF corpus: size={CorpusSize}, packages={PackageCount}",
corpusSize,
documentFrequencies.Count);
}
catch (Exception ex)
{
_logger?.LogError(ex, "Failed to update IDF corpus stats");
}
finally
{
StopTiming(sw, "refresh");
}
}
/// <inheritdoc />
public async Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return null;
}
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfLastRefresh(_cacheOptions.KeyPrefix);
var cached = await db.StringGetAsync(key).ConfigureAwait(false);
if (cached.HasValue &&
DateTimeOffset.TryParse(cached, CultureInfo.InvariantCulture, DateTimeStyles.RoundtripKind, out var timestamp))
{
return timestamp;
}
return null;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to get IDF last refresh timestamp");
return null;
}
}
/// <inheritdoc />
public async Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return;
}
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;
await Task.WhenAll(
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfPackage(packageName, prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix))
).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to invalidate IDF for package {PackageName}", packageName);
}
}
/// <inheritdoc />
public async Task InvalidateAllAsync(CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return;
}
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;
// Delete stats keys
await Task.WhenAll(
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfCorpusSize(prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfLastRefresh(prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsHits(prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsMisses(prefix))
).ConfigureAwait(false);
// Note: Scanning and deleting all idf:pkg:* keys would require SCAN,
// which is expensive. For now, rely on TTL expiration.
_logger?.LogInformation("Invalidated IDF stats; individual package keys will expire via TTL");
}
catch (Exception ex)
{
_logger?.LogError(ex, "Failed to invalidate all IDF cache");
}
}
private Stopwatch? StartTiming()
{
if (_metrics is null) return null;
return Stopwatch.StartNew();
}
private void StopTiming(Stopwatch? sw, string operation)
{
if (sw is null || _metrics is null) return;
sw.Stop();
_metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
}
}

View File

@@ -40,11 +40,33 @@ public sealed record AdvisoryLinksetProvenance(
string? ToolVersion,
string? PolicyHash);
/// <summary>
/// Conflict severity levels for typed penalty calculation.
/// </summary>
public enum ConflictSeverity
{
/// <summary>No penalty; informational only.</summary>
Info = 0,
/// <summary>Minor disagreement; small penalty.</summary>
Soft = 1,
/// <summary>Significant disagreement; should usually prevent high-confidence linking.</summary>
Hard = 2
}
public sealed record AdvisoryLinksetConflict(
string Field,
string Reason,
IReadOnlyList<string>? Values,
IReadOnlyList<string>? SourceIds = null)
{
/// <summary>
/// Severity of the conflict. Defaults to <see cref="ConflictSeverity.Soft"/>.
/// Hard conflicts significantly impact confidence; Info conflicts are purely informational.
/// </summary>
public ConflictSeverity Severity { get; init; } = ConflictSeverity.Soft;
}
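// Illustrative construction (hypothetical values): a hard identity conflict
// of the kind the V2 correlator emits when a cluster spans two distinct CVEs.
internal static class AdvisoryLinksetConflictExample
{
internal static readonly AdvisoryLinksetConflict DistinctCves =
new("aliases", "distinct-cves", new[] { "redhat:CVE-2026-0001", "ubuntu:CVE-2026-0002" })
{
Severity = ConflictSeverity.Hard
};
}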
internal static class DocumentHelper
{

View File

@@ -0,0 +1,73 @@
// -----------------------------------------------------------------------------
// ILinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Abstraction for linkset correlation with V1/V2 support
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using StellaOps.Concelier.Models;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Service for computing linkset correlation confidence and conflicts.
/// Supports multiple correlation algorithm versions (V1, V2).
/// </summary>
public interface ILinksetCorrelationService
{
/// <summary>
/// Gets the correlation algorithm version being used.
/// </summary>
string Version { get; }
/// <summary>
/// Computes correlation confidence and conflicts for a set of observation inputs.
/// </summary>
(double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null);
}
/// <summary>
/// Unified input model for correlation computation.
/// </summary>
public sealed record CorrelationInput(
string ObservationId,
string? Vendor,
DateTimeOffset? FetchedAt,
IReadOnlyCollection<string> Aliases,
IReadOnlyCollection<string> Purls,
IReadOnlyCollection<string> Cpes,
IReadOnlyCollection<string> References,
IReadOnlyCollection<string>? PatchReferences = null);
/// <summary>
/// Configuration for the correlation service.
/// </summary>
public sealed class CorrelationServiceOptions
{
/// <summary>
/// Correlation algorithm version. Supported values: "v1", "v2".
/// Default: "v1" for backward compatibility.
/// </summary>
public string Version { get; set; } = "v1";
/// <summary>
/// Optional custom weights for V2 correlation signals.
/// Keys: aliasConnectivity, aliasAuthority, packageCoverage, versionCompatibility,
/// cpeMatch, patchLineage, referenceOverlap, freshness
/// </summary>
public Dictionary<string, double>? Weights { get; set; }
/// <summary>
/// Whether to enable IDF weighting for package keys (V2 only).
/// </summary>
public bool EnableIdfWeighting { get; set; } = true;
/// <summary>
/// Whether to enable text similarity scoring (V2 Phase 3, disabled by default).
/// </summary>
public bool EnableTextSimilarity { get; set; } = false;
}
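// Configuration sketch (hypothetical values). Note that in this commit the V2
// signal weights live as constants in LinksetCorrelationV2.Weights, so treat
// the Weights override here as forward-looking rather than wired through:
//
// var correlationOptions = new CorrelationServiceOptions
// {
//     Version = "v2",
//     EnableIdfWeighting = true,
//     Weights = new Dictionary<string, double>
//     {
//         ["aliasConnectivity"] = 0.30,
//         ["packageCoverage"] = 0.20
//     }
// };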

View File

@@ -0,0 +1,104 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Implementation of ILinksetCorrelationService with V1/V2 support
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Concelier.Models;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Default implementation of <see cref="ILinksetCorrelationService"/>.
/// Supports V1 (intersection-based) and V2 (graph-based) correlation algorithms.
/// </summary>
public sealed class LinksetCorrelationService : ILinksetCorrelationService
{
private readonly CorrelationServiceOptions _options;
private readonly ILogger<LinksetCorrelationService> _logger;
private readonly Func<string, double>? _idfProvider;
public LinksetCorrelationService(
IOptions<CorrelationServiceOptions> options,
ILogger<LinksetCorrelationService> logger,
Func<string, double>? idfProvider = null)
{
_options = options?.Value ?? new CorrelationServiceOptions();
_logger = logger;
_idfProvider = idfProvider;
}
/// <inheritdoc />
public string Version => _options.Version?.ToLowerInvariant() switch
{
"v2" => "v2",
_ => "v1"
};
/// <inheritdoc />
public (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null)
{
if (inputs.Count == 0)
{
return (1.0, Array.Empty<AdvisoryLinksetConflict>());
}
return Version switch
{
"v2" => ComputeV2(inputs, additionalConflicts),
_ => ComputeV1(inputs, additionalConflicts)
};
}
private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV1(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
{
// Convert to V1 input format
var v1Inputs = inputs.Select(i => new LinksetCorrelation.Input(
Vendor: i.Vendor,
FetchedAt: i.FetchedAt,
Aliases: i.Aliases,
Purls: i.Purls,
Cpes: i.Cpes,
References: i.References)).ToArray();
return LinksetCorrelation.Compute(v1Inputs, additionalConflicts);
}
private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV2(
IReadOnlyCollection<CorrelationInput> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
{
// Convert to V2 input format
var v2Inputs = inputs.Select(i => new LinksetCorrelationV2.InputV2(
ObservationId: i.ObservationId,
Vendor: i.Vendor,
FetchedAt: i.FetchedAt,
Aliases: i.Aliases,
Purls: i.Purls,
Cpes: i.Cpes,
References: i.References,
PatchReferences: i.PatchReferences)).ToArray();
var idfProvider = _options.EnableIdfWeighting ? _idfProvider : null;
var result = LinksetCorrelationV2.Compute(v2Inputs, additionalConflicts, idfProvider);
_logger.LogDebug(
"V2 correlation computed: confidence={Confidence:F3}, conflicts={ConflictCount}, signals={Signals}",
result.Confidence,
result.Conflicts.Count,
string.Join(", ", result.SignalScores.Select(kv => $"{kv.Key}={kv.Value:F2}")));
return (result.Confidence, result.Conflicts);
}
}

View File

@@ -0,0 +1,910 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationV2.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-001 through CORR-V2-008
// Description: V2 correlation algorithm with graph-based alias connectivity,
// version compatibility scoring, patch lineage signals, and typed
// conflict severities.
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using StellaOps.Concelier.Models;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Version relationship classification for affected range comparison.
/// </summary>
public enum VersionRelation
{
/// <summary>Unable to determine relationship.</summary>
Unknown = 0,
/// <summary>Ranges normalize to identical primitives.</summary>
Equivalent = 1,
/// <summary>Ranges have non-empty intersection but are not equal.</summary>
Overlapping = 2,
/// <summary>Ranges have no intersection.</summary>
Disjoint = 3
}
/// <summary>
/// V2 linkset correlation algorithm with graph-based connectivity,
/// typed conflict severities, and multi-signal scoring.
/// </summary>
/// <remarks>
/// Key improvements over V1:
/// - Alias matching uses graph connectivity (LCC ratio) instead of intersection-across-all
/// - PURL matching uses pairwise coverage instead of intersection-across-all
/// - Reference clash only emitted for true contradictions, not zero overlap
/// - Typed conflict severities with per-reason penalties
/// - Patch lineage as high-weight signal
/// - Version compatibility classification (equivalent/overlapping/disjoint)
/// </remarks>
internal static class LinksetCorrelationV2
{
/// <summary>
/// Default correlation weights. Can be overridden via configuration.
/// </summary>
internal static class Weights
{
public const double AliasConnectivity = 0.30;
public const double AliasAuthority = 0.10;
public const double PackageCoverage = 0.20;
public const double VersionCompatibility = 0.10;
public const double CpeMatch = 0.10;
public const double PatchLineage = 0.10;
public const double ReferenceOverlap = 0.05;
public const double Freshness = 0.05;
}
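// The defaults above sum to 1.00, so with no conflicts the base confidence is
// a convex combination of the eight signal scores, each of which lands in [0, 1].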
/// <summary>
/// Conflict penalties by severity and reason.
/// </summary>
internal static class ConflictPenalties
{
public const double DistinctCves = 0.40; // Hard: two different CVEs
public const double DisjointVersionRanges = 0.30; // Hard: same pkg, no overlap
public const double OverlappingRanges = 0.05; // Soft: ranges overlap but differ
public const double SeverityMismatch = 0.05; // Soft: CVSS differs
public const double AliasInconsistency = 0.10; // Soft: non-CVE alias mismatch
public const double ZeroReferenceOverlap = 0.00; // Info: no penalty
}
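// Worked example (hypothetical scores, assuming CalculateTypedPenalty sums
// per-conflict penalties): base confidence 0.75 with one distinct-cves
// conflict (0.40) and one severity-mismatch (0.05) gives
// Clamp01(0.75 - 0.45) = 0.30; the 0.1 floor in Compute does not engage.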
internal readonly record struct InputV2(
string ObservationId,
string? Vendor,
DateTimeOffset? FetchedAt,
IReadOnlyCollection<string> Aliases,
IReadOnlyCollection<string> Purls,
IReadOnlyCollection<string> Cpes,
IReadOnlyCollection<string> References,
IReadOnlyCollection<string>? PatchReferences = null);
internal readonly record struct CorrelationResult(
double Confidence,
IReadOnlyList<AdvisoryLinksetConflict> Conflicts,
IReadOnlyDictionary<string, double> SignalScores);
/// <summary>
/// Computes correlation confidence and conflicts for a set of observations.
/// </summary>
internal static CorrelationResult Compute(
IReadOnlyCollection<InputV2> inputs,
IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null,
Func<string, double>? packageIdfProvider = null)
{
if (inputs.Count == 0)
{
return new CorrelationResult(
1.0,
Array.Empty<AdvisoryLinksetConflict>(),
ImmutableDictionary<string, double>.Empty);
}
var conflicts = new List<AdvisoryLinksetConflict>();
var signalScores = new Dictionary<string, double>();
// 1. Alias connectivity (graph-based)
var (aliasConnectivity, aliasConflicts) = CalculateAliasConnectivity(inputs);
conflicts.AddRange(aliasConflicts);
signalScores["aliasConnectivity"] = aliasConnectivity;
// 2. Alias authority (scope-based weighting)
var aliasAuthority = CalculateAliasAuthority(inputs);
signalScores["aliasAuthority"] = aliasAuthority;
// 3. Package coverage (pairwise + IDF)
var (packageCoverage, packageConflicts) = CalculatePackageCoverage(inputs, packageIdfProvider);
conflicts.AddRange(packageConflicts);
signalScores["packageCoverage"] = packageCoverage;
// 4. Version compatibility
var (versionScore, versionConflicts) = CalculateVersionCompatibility(inputs);
conflicts.AddRange(versionConflicts);
signalScores["versionCompatibility"] = versionScore;
// 5. CPE match (existing logic, minor adjustments)
var cpeScore = CalculateCpeScore(inputs);
signalScores["cpeMatch"] = cpeScore;
// 6. Patch lineage
var patchScore = CalculatePatchLineageScore(inputs);
signalScores["patchLineage"] = patchScore;
// 7. Reference overlap (positive-only, no conflict on zero)
var referenceScore = CalculateReferenceScore(inputs);
signalScores["referenceOverlap"] = referenceScore;
// 8. Freshness
var freshnessScore = CalculateFreshnessScore(inputs);
signalScores["freshness"] = freshnessScore;
// Calculate base confidence from weighted signals
var baseConfidence = Clamp01(
(Weights.AliasConnectivity * aliasConnectivity) +
(Weights.AliasAuthority * aliasAuthority) +
(Weights.PackageCoverage * packageCoverage) +
(Weights.VersionCompatibility * versionScore) +
(Weights.CpeMatch * cpeScore) +
(Weights.PatchLineage * patchScore) +
(Weights.ReferenceOverlap * referenceScore) +
(Weights.Freshness * freshnessScore));
// Add additional conflicts before penalty calculation
if (additionalConflicts is { Count: > 0 })
{
conflicts.AddRange(additionalConflicts);
}
// Apply typed conflict penalties
var totalPenalty = CalculateTypedPenalty(conflicts);
var finalConfidence = Clamp01(baseConfidence - totalPenalty);
// Ensure minimum confidence when conflicts exist but evidence is present
if (finalConfidence < 0.1 && baseConfidence > 0)
{
finalConfidence = 0.1;
}
return new CorrelationResult(
finalConfidence,
DeduplicateAndSort(conflicts, inputs),
signalScores.ToImmutableDictionary());
}
#region Alias Connectivity (Graph-based)
/// <summary>
/// Calculates alias connectivity using bipartite graph analysis.
/// Returns LCC (largest connected component) ratio instead of intersection.
/// </summary>
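/// <remarks>
/// Worked example (hypothetical observations): if A and B share CVE-2026-0001
/// while C shares no alias with either, union-find yields components {A, B}
/// and {C}, so the LCC ratio is 2/3 ≈ 0.67, versus 0 under V1's
/// intersection-across-all rule.
/// </remarks>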
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateAliasConnectivity(
IReadOnlyCollection<InputV2> inputs)
{
var conflicts = new List<AdvisoryLinksetConflict>();
if (inputs.Count == 1)
{
return (inputs.First().Aliases.Count > 0 ? 1d : 0d, conflicts);
}
// Build bipartite graph: observation nodes + alias nodes
var observationToAliases = inputs
.ToDictionary(
i => i.ObservationId,
i => i.Aliases.Select(a => a.ToUpperInvariant()).ToHashSet(StringComparer.Ordinal));
// Build adjacency for union-find
var allAliases = observationToAliases.Values.SelectMany(a => a).ToHashSet(StringComparer.Ordinal);
if (allAliases.Count == 0)
{
return (0d, conflicts);
}
// Find connected components using alias-based bridging
var observationIds = inputs.Select(i => i.ObservationId).ToList();
var parent = observationIds.ToDictionary(id => id, id => id);
string Find(string x)
{
if (parent[x] != x)
parent[x] = Find(parent[x]);
return parent[x];
}
void Union(string x, string y)
{
var px = Find(x);
var py = Find(y);
if (px != py)
parent[px] = py;
}
// Connect observations that share any alias
foreach (var alias in allAliases)
{
var observationsWithAlias = observationIds
.Where(id => observationToAliases[id].Contains(alias))
.ToList();
for (int i = 1; i < observationsWithAlias.Count; i++)
{
Union(observationsWithAlias[0], observationsWithAlias[i]);
}
}
// Calculate LCC ratio
var componentSizes = observationIds
.GroupBy(Find)
.Select(g => g.Count())
.ToList();
var largestComponent = componentSizes.Max();
var lccRatio = (double)largestComponent / observationIds.Count;
// Check for distinct CVEs (true identity conflict)
var cveAliases = allAliases
.Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
.ToHashSet(StringComparer.OrdinalIgnoreCase);
if (cveAliases.Count > 1)
{
// Multiple distinct CVEs in cluster = hard conflict
var values = inputs
.Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases.Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase)))}")
.Where(v => !v.EndsWith(":<none>"))
.OrderBy(v => v, StringComparer.Ordinal)
.ToArray();
if (values.Length > 1)
{
conflicts.Add(new AdvisoryLinksetConflict(
"aliases",
"distinct-cves",
values)
{
Severity = ConflictSeverity.Hard
});
}
}
else if (lccRatio < 1.0 && allAliases.Count > 0)
{
// Disconnected observations but no CVE conflict = soft inconsistency
var disconnectedObs = observationIds
.Where(id => Find(id) != Find(observationIds[0]))
.Select(id => inputs.First(i => i.ObservationId == id))
.Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases)}")
.OrderBy(v => v, StringComparer.Ordinal)
.ToArray();
if (disconnectedObs.Length > 0)
{
conflicts.Add(new AdvisoryLinksetConflict(
"aliases",
"alias-inconsistency",
disconnectedObs)
{
Severity = ConflictSeverity.Soft
});
}
}
return (lccRatio, conflicts);
}
/// <summary>
/// Calculates alias authority score based on scope hierarchy.
/// CVE (global) > ECO (ecosystem) > VND (vendor) > DST (distribution).
/// </summary>
private static double CalculateAliasAuthority(IReadOnlyCollection<InputV2> inputs)
{
var allAliases = inputs.SelectMany(i => i.Aliases).ToHashSet(StringComparer.OrdinalIgnoreCase);
if (allAliases.Count == 0)
return 0d;
// Score based on highest authority alias present
var hasCve = allAliases.Any(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase));
var hasGhsa = allAliases.Any(a => a.StartsWith("GHSA-", StringComparison.OrdinalIgnoreCase));
var hasVendor = allAliases.Any(a =>
a.StartsWith("RHSA-", StringComparison.OrdinalIgnoreCase) ||
a.StartsWith("MSRC-", StringComparison.OrdinalIgnoreCase) ||
a.StartsWith("CISCO-", StringComparison.OrdinalIgnoreCase) ||
a.StartsWith("VMSA-", StringComparison.OrdinalIgnoreCase));
var hasDistro = allAliases.Any(a =>
a.StartsWith("DSA-", StringComparison.OrdinalIgnoreCase) ||
a.StartsWith("USN-", StringComparison.OrdinalIgnoreCase) ||
a.StartsWith("SUSE-", StringComparison.OrdinalIgnoreCase));
if (hasCve) return 1.0;
if (hasGhsa) return 0.8;
if (hasVendor) return 0.6;
if (hasDistro) return 0.4;
return 0.2; // Unknown alias scheme
}
#endregion
#region Package Coverage (Pairwise + IDF)
/// <summary>
/// Calculates package coverage using pairwise overlap instead of intersection-across-all.
/// A thin source with no packages does not collapse the score.
/// </summary>
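/// <remarks>
/// Worked example (hypothetical inputs, unit IDF weights): across three
/// sources where pkg:npm/a appears in all three and pkg:npm/b in only one,
/// the score is (1.0 * 3/3 + 0) / 2.0 = 0.5 before any exact-PURL boost.
/// </remarks>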
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculatePackageCoverage(
IReadOnlyCollection<InputV2> inputs,
Func<string, double>? idfProvider = null)
{
var conflicts = new List<AdvisoryLinksetConflict>();
var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
if (inputsWithPackages.Count == 0)
{
return (0d, conflicts);
}
if (inputsWithPackages.Count == 1)
{
return (1d, conflicts); // The single remaining source has packages by construction.
}
// Extract package keys (without version)
var packageKeysPerInput = inputsWithPackages
.Select(i => i.Purls
.Select(ExtractPackageKey)
.Where(k => !string.IsNullOrWhiteSpace(k))
.ToHashSet(StringComparer.Ordinal))
.ToList();
// Calculate pairwise overlap with optional IDF weighting
var totalWeight = 0d;
var matchedWeight = 0d;
var allPackages = packageKeysPerInput.SelectMany(p => p).ToHashSet(StringComparer.Ordinal);
foreach (var pkg in allPackages)
{
var idfWeight = idfProvider?.Invoke(pkg) ?? 1.0;
var inputsWithPkg = packageKeysPerInput.Count(set => set.Contains(pkg));
totalWeight += idfWeight;
if (inputsWithPkg > 1)
{
// Package appears in multiple sources = positive signal
matchedWeight += idfWeight * ((double)inputsWithPkg / inputsWithPackages.Count);
}
}
var score = totalWeight > 0 ? matchedWeight / totalWeight : 0d;
// Check for exact PURL overlap (with version)
var hasExactOverlap = HasExactPurlOverlap(inputsWithPackages);
if (hasExactOverlap)
{
score = Math.Max(score, 0.8); // Boost for exact match
}
// Collect range divergence as soft conflicts (handled in version scoring)
// No longer emitted here to avoid double-counting
return (Clamp01(score), conflicts);
}
#endregion
#region Version Compatibility
/// <summary>
/// Classifies version relationships for shared packages.
/// </summary>
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateVersionCompatibility(
IReadOnlyCollection<InputV2> inputs)
{
var conflicts = new List<AdvisoryLinksetConflict>();
var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
if (inputsWithPackages.Count < 2)
{
return (0.5d, conflicts); // Neutral when no comparison possible
}
// Find shared package keys
var packageKeysPerInput = inputsWithPackages
.Select(i => i.Purls
.Select(ExtractPackageKey)
.Where(k => !string.IsNullOrWhiteSpace(k))
.ToHashSet(StringComparer.Ordinal))
.ToList();
var sharedPackages = packageKeysPerInput
.Skip(1)
.Aggregate(
new HashSet<string>(packageKeysPerInput[0], StringComparer.Ordinal),
(acc, next) =>
{
acc.IntersectWith(next);
return acc;
});
if (sharedPackages.Count == 0)
{
return (0.5d, conflicts); // Neutral when no shared packages
}
var totalScore = 0d;
var packageCount = 0;
foreach (var packageKey in sharedPackages)
{
var versionsPerSource = inputsWithPackages
.Select(i => new
{
i.Vendor,
Versions = i.Purls
.Where(p => ExtractPackageKey(p) == packageKey)
.Select(ExtractVersion)
.Where(v => !string.IsNullOrWhiteSpace(v))
.ToList()
})
.Where(x => x.Versions.Count > 0)
.ToList();
if (versionsPerSource.Count < 2)
continue;
packageCount++;
// Classify relationship (simplified; full impl would use SemanticVersionRangeResolver)
var relation = ClassifyVersionRelation(versionsPerSource.Select(v => v.Versions).ToList());
switch (relation)
{
case VersionRelation.Equivalent:
totalScore += 1.0;
break;
case VersionRelation.Overlapping:
totalScore += 0.6;
var overlapValues = versionsPerSource
.Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
.OrderBy(x => x, StringComparer.Ordinal)
.ToArray();
conflicts.Add(new AdvisoryLinksetConflict(
$"affected.versions[{packageKey}]",
"affected-range-divergence",
overlapValues)
{
Severity = ConflictSeverity.Soft
});
break;
case VersionRelation.Disjoint:
totalScore += 0.0;
var disjointValues = versionsPerSource
.Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
.OrderBy(x => x, StringComparer.Ordinal)
.ToArray();
conflicts.Add(new AdvisoryLinksetConflict(
$"affected.versions[{packageKey}]",
"disjoint-version-ranges",
disjointValues)
{
Severity = ConflictSeverity.Hard
});
break;
default:
totalScore += 0.5; // Unknown = neutral
break;
}
}
var avgScore = packageCount > 0 ? totalScore / packageCount : 0.5;
return (Clamp01(avgScore), conflicts);
}
private static VersionRelation ClassifyVersionRelation(List<List<string>> versionSets)
{
if (versionSets.Count < 2)
return VersionRelation.Unknown;
var first = versionSets[0].ToHashSet(StringComparer.OrdinalIgnoreCase);
var allEquivalent = true;
var anyOverlap = false;
foreach (var other in versionSets.Skip(1))
{
var otherSet = other.ToHashSet(StringComparer.OrdinalIgnoreCase);
if (!first.SetEquals(otherSet))
allEquivalent = false;
if (first.Overlaps(otherSet))
anyOverlap = true;
}
if (allEquivalent)
return VersionRelation.Equivalent;
if (anyOverlap)
return VersionRelation.Overlapping;
return VersionRelation.Disjoint;
}
#endregion
#region Patch Lineage
/// <summary>
/// Calculates patch lineage correlation.
/// Exact commit SHA match is a very strong signal.
/// </summary>
private static double CalculatePatchLineageScore(IReadOnlyCollection<InputV2> inputs)
{
var inputsWithPatches = inputs
.Where(i => i.PatchReferences?.Count > 0)
.ToList();
if (inputsWithPatches.Count < 2)
{
return 0d; // No patch data to compare
}
// Extract normalized patch references (commit SHAs, PR URLs)
var patchesPerInput = inputsWithPatches
.Select(i => i.PatchReferences!
.Select(NormalizePatchReference)
.Where(p => p is not null)
.Select(p => p!)
.ToHashSet(StringComparer.OrdinalIgnoreCase))
.ToList();
// Find any pairwise overlap
for (int i = 0; i < patchesPerInput.Count; i++)
{
for (int j = i + 1; j < patchesPerInput.Count; j++)
{
if (patchesPerInput[i].Overlaps(patchesPerInput[j]))
{
// Exact patch match = very strong signal
return 1.0;
}
}
}
return 0d;
}
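// Normalization example (hypothetical URL): both
// "https://github.com/acme/lib/commit/0a1b2c3d4e5f" and a bare 40-hex SHA in
// free text reduce to the lowercase SHA, so two advisories citing the same
// fix commit overlap even when their URL shapes differ.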
private static readonly System.Text.RegularExpressions.Regex CommitUrlPattern = new(
@"(?:github\.com|gitlab\.com)/[^/]+/[^/]+(?:/-)?/commit/([0-9a-f]{7,40})",
System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Compiled);
private static readonly System.Text.RegularExpressions.Regex FullShaPattern = new(
@"\b([0-9a-f]{40})\b",
System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.Compiled);
private static string? NormalizePatchReference(string reference)
{
if (string.IsNullOrWhiteSpace(reference))
return null;
// Extract a commit SHA from GitHub/GitLab commit URLs.
var match = CommitUrlPattern.Match(reference);
if (match.Success)
{
return match.Groups[1].Value.ToLowerInvariant();
}
// Otherwise accept a bare full-length SHA anywhere in the reference.
match = FullShaPattern.Match(reference);
return match.Success ? match.Groups[1].Value.ToLowerInvariant() : null;
}
#endregion
#region Reference Score (Positive-Only)
/// <summary>
/// Calculates reference overlap as a positive-only signal.
/// Zero overlap is neutral (0.5), not a conflict.
/// </summary>
private static double CalculateReferenceScore(IReadOnlyCollection<InputV2> inputs)
{
if (inputs.All(i => i.References.Count == 0))
{
return 0.5d; // Neutral when no references
}
var inputList = inputs.ToList();
var maxOverlap = 0d;
for (var i = 0; i < inputList.Count; i++)
{
for (var j = i + 1; j < inputList.Count; j++)
{
var first = inputList[i].References
.Select(NormalizeReferenceUrl)
.ToHashSet(StringComparer.OrdinalIgnoreCase);
var second = inputList[j].References
.Select(NormalizeReferenceUrl)
.ToHashSet(StringComparer.OrdinalIgnoreCase);
var intersection = first.Intersect(second, StringComparer.OrdinalIgnoreCase).Count();
var denom = Math.Max(first.Count, second.Count);
var overlap = denom == 0 ? 0d : (double)intersection / denom;
if (overlap > maxOverlap)
{
maxOverlap = overlap;
}
}
}
// Map overlap to score: 0 overlap = 0.5 (neutral), 1.0 overlap = 1.0
return 0.5 + (maxOverlap * 0.5);
}
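/// <summary>
/// Normalizes a reference URL for overlap comparison, e.g.
/// "http://Example.COM/advisory?utm_source=x" becomes "https://example.com/advisory".
/// </summary>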
private static string NormalizeReferenceUrl(string url)
{
if (string.IsNullOrWhiteSpace(url))
return string.Empty;
// Lowercase, remove tracking params, normalize protocol
var normalized = url.ToLowerInvariant().Trim();
// Strip the query string entirely (drops tracking parameters and similar noise)
var queryIndex = normalized.IndexOf('?');
if (queryIndex > 0)
{
normalized = normalized[..queryIndex];
}
// Normalize protocol
if (normalized.StartsWith("http://"))
{
normalized = "https://" + normalized[7..];
}
// Remove trailing slash
return normalized.TrimEnd('/');
}
#endregion
#region CPE and Freshness (Minor Updates)
private static double CalculateCpeScore(IReadOnlyCollection<InputV2> inputs)
{
if (inputs.All(i => i.Cpes.Count == 0))
{
return 0d;
}
var cpeSets = inputs.Select(i => i.Cpes.ToHashSet(StringComparer.OrdinalIgnoreCase)).ToList();
var exactOverlap = cpeSets.Skip(1).Any(set => set.Overlaps(cpeSets.First()));
if (exactOverlap)
{
return 1d;
}
var vendorProductSets = inputs
.Select(i => i.Cpes.Select(ParseVendorProduct).Where(vp => vp.vendor is not null).ToHashSet())
.ToList();
var sharedVendorProduct = vendorProductSets.Skip(1).Any(set => set.Overlaps(vendorProductSets.First()));
return sharedVendorProduct ? 0.5d : 0d;
}
private static (string? vendor, string? product) ParseVendorProduct(string cpe)
{
if (string.IsNullOrWhiteSpace(cpe))
{
return (null, null);
}
var parts = cpe.Split(':');
// CPE 2.3 formatted string: cpe:2.3:part:vendor:product:...
if (parts.Length >= 6 && parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) && parts[1] == "2.3")
{
return (parts[3], parts[4]);
}
// Legacy CPE 2.2 URI: cpe:/part:vendor:product[:version...]
if (parts.Length >= 4 && parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) && parts[1].StartsWith('/'))
{
return (parts[2], parts[3]);
}
return (null, null);
}
private static double CalculateFreshnessScore(IReadOnlyCollection<InputV2> inputs)
{
var fetched = inputs
.Select(i => i.FetchedAt)
.Where(d => d.HasValue)
.Select(d => d!.Value)
.ToList();
if (fetched.Count <= 1)
{
return 0.5d;
}
var min = fetched.Min();
var max = fetched.Max();
var spread = max - min;
if (spread <= TimeSpan.FromHours(48))
{
return 1d;
}
if (spread >= TimeSpan.FromDays(14))
{
return 0d;
}
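// Linear decay over the 14-day window. Note this is piecewise: the score
// steps from 1.0 (at <= 48h) down to roughly 0.86 just past 48h, then
// decays linearly to 0 at 14 days.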
var remaining = TimeSpan.FromDays(14) - spread;
return Clamp01(remaining.TotalSeconds / TimeSpan.FromDays(14).TotalSeconds);
}
#endregion
#region Conflict Penalties
/// <summary>
/// Calculates typed penalty based on conflict severities.
/// </summary>
private static double CalculateTypedPenalty(IReadOnlyList<AdvisoryLinksetConflict> conflicts)
{
if (conflicts.Count == 0)
return 0d;
var totalPenalty = 0d;
foreach (var conflict in conflicts)
{
var penalty = conflict.Reason switch
{
"distinct-cves" => ConflictPenalties.DistinctCves,
"disjoint-version-ranges" => ConflictPenalties.DisjointVersionRanges,
"affected-range-divergence" => ConflictPenalties.OverlappingRanges,
"severity-mismatch" => ConflictPenalties.SeverityMismatch,
"alias-inconsistency" => ConflictPenalties.AliasInconsistency,
"reference-clash" => 0d, // No penalty for reference differences
_ => 0.05 // Default small penalty for unknown conflicts
};
totalPenalty += penalty;
}
// Saturate at 0.6 to prevent total collapse
return Math.Min(totalPenalty, 0.6);
}
#endregion
#region Helpers
private static bool HasExactPurlOverlap(IReadOnlyCollection<InputV2> inputs)
{
var first = inputs.First().Purls.ToHashSet(StringComparer.Ordinal);
return inputs.Skip(1).Any(input => input.Purls.Any(first.Contains));
}
private static string ExtractPackageKey(string purl)
{
if (string.IsNullOrWhiteSpace(purl))
{
return string.Empty;
}
var atIndex = purl.LastIndexOf('@');
return atIndex > 0 ? purl[..atIndex] : purl;
}
private static string ExtractVersion(string purl)
{
if (string.IsNullOrWhiteSpace(purl))
{
return string.Empty;
}
var atIndex = purl.LastIndexOf('@');
if (atIndex < 0 || atIndex >= purl.Length - 1)
{
return string.Empty;
}
var version = purl[(atIndex + 1)..];
// Remove qualifiers if present
var qualifierIndex = version.IndexOf('?');
if (qualifierIndex > 0)
{
version = version[..qualifierIndex];
}
return version;
}
private static IReadOnlyList<AdvisoryLinksetConflict> DeduplicateAndSort(
IEnumerable<AdvisoryLinksetConflict> conflicts,
IReadOnlyCollection<InputV2> inputs)
{
var set = new HashSet<string>(StringComparer.Ordinal);
var list = new List<AdvisoryLinksetConflict>();
foreach (var conflict in conflicts)
{
var normalizedValues = NormalizeValues(conflict.Values);
var normalizedSources = NormalizeValues(conflict.SourceIds);
var key = $"{conflict.Field}|{conflict.Reason}|{string.Join('|', normalizedValues)}";
if (set.Add(key))
{
if (normalizedSources.Count == 0)
{
normalizedSources = inputs
.Select(i => i.Vendor ?? "source")
.Distinct(StringComparer.OrdinalIgnoreCase)
.OrderBy(v => v, StringComparer.Ordinal)
.ToArray();
}
list.Add(conflict with
{
Values = normalizedValues,
SourceIds = normalizedSources
});
}
}
return list
.OrderBy(c => c.Field, StringComparer.Ordinal)
.ThenBy(c => c.Reason, StringComparer.Ordinal)
.ThenBy(c => string.Join('|', c.Values ?? Array.Empty<string>()), StringComparer.Ordinal)
.ToList();
}
private static double Clamp01(double value) => Math.Clamp(value, 0d, 1d);
private static string FirstSortedOrDefault(IEnumerable<string> values)
{
var first = values
.Where(v => !string.IsNullOrWhiteSpace(v))
.Select(v => v.Trim())
.OrderBy(v => v, StringComparer.Ordinal)
.FirstOrDefault();
return string.IsNullOrEmpty(first) ? "<none>" : first;
}
private static IReadOnlyList<string> NormalizeValues(IReadOnlyList<string>? values)
{
if (values is null || values.Count == 0)
{
return Array.Empty<string>();
}
return values
.Where(v => !string.IsNullOrWhiteSpace(v))
.Select(v => v.Trim())
.OrderBy(v => v, StringComparer.Ordinal)
.ToArray();
}
#endregion
}
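// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this commit): shows how a
// caller might invoke the V2 scorer. InputV2 and Compute match the tests later
// in this diff; the 0.7 threshold and the host class name are assumptions.
// -----------------------------------------------------------------------------
internal static class LinksetCorrelationV2UsageSketch
{
public static bool LooksCorrelated()
{
var inputs = new[]
{
new LinksetCorrelationV2.InputV2(
ObservationId: "obs-nvd",
Vendor: "nvd",
FetchedAt: DateTimeOffset.UtcNow,
Aliases: new[] { "CVE-2025-1234" },
Purls: new[] { "pkg:npm/lodash@4.17.21" },
Cpes: Array.Empty<string>(),
References: Array.Empty<string>(),
PatchReferences: null),
new LinksetCorrelationV2.InputV2(
ObservationId: "obs-ghsa",
Vendor: "ghsa",
FetchedAt: DateTimeOffset.UtcNow,
Aliases: new[] { "CVE-2025-1234", "GHSA-xxxx-yyyy-zzzz" },
Purls: new[] { "pkg:npm/lodash@4.17.21" },
Cpes: Array.Empty<string>(),
References: Array.Empty<string>(),
PatchReferences: null)
};
var result = LinksetCorrelationV2.Compute(inputs);
// Illustrative acceptance rule: reasonable confidence and no hard conflicts.
return result.Confidence >= 0.7
&& result.Conflicts.All(c => c.Severity != ConflictSeverity.Hard);
}
}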

View File

@@ -0,0 +1,331 @@
// -----------------------------------------------------------------------------
// TextSimilarityScorer.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-010
// Description: Deterministic TF-IDF text similarity for linkset correlation
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Computes TF-IDF-based text similarity between advisory descriptions.
/// Used as an optional correlation signal in V2 linkset correlation.
/// </summary>
/// <remarks>
/// <para>
/// This scorer is designed for deterministic, offline operation:
/// - No external NLP dependencies (pure C# implementation)
/// - Configurable stop words and tokenization
/// - Stable output across runs (no randomness)
/// </para>
/// <para>
/// Default weight: 0.05 (low weight, supplementary signal).
/// Feature flag: <c>concelier:correlation:textSimilarity:enabled</c> (default: false).
/// </para>
/// </remarks>
public sealed class TextSimilarityScorer
{
private static readonly Regex TokenRegex = new(
@"[a-zA-Z][a-zA-Z0-9_-]{2,}",
RegexOptions.Compiled | RegexOptions.CultureInvariant);
private static readonly HashSet<string> DefaultStopWords = new(StringComparer.OrdinalIgnoreCase)
{
// Common English stop words
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
"of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
"be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
"should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
"used", "this", "that", "these", "those", "which", "who", "whom", "whose",
"what", "where", "when", "why", "how", "all", "each", "every", "both",
"few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
"own", "same", "so", "than", "too", "very", "just", "also", "now", "here",
"there", "then", "once", "if", "into", "over", "after", "before", "about",
// Common vulnerability description words (low discriminative value)
"vulnerability", "issue", "allows", "attacker", "attack", "remote", "local",
"user", "code", "execution", "denial", "service", "buffer", "overflow",
"may", "could", "via", "using", "through", "affected", "version", "versions",
"product", "software", "application", "component", "module", "function"
};
private readonly TextSimilarityOptions _options;
private readonly HashSet<string> _stopWords;
/// <summary>
/// Initializes a new instance of <see cref="TextSimilarityScorer"/>.
/// </summary>
/// <param name="options">Configuration options. Null uses defaults.</param>
public TextSimilarityScorer(TextSimilarityOptions? options = null)
{
_options = options ?? new TextSimilarityOptions();
_stopWords = _options.CustomStopWords is not null
? new HashSet<string>(_options.CustomStopWords, StringComparer.OrdinalIgnoreCase)
: DefaultStopWords;
}
/// <summary>
/// Computes average pairwise TF-IDF cosine similarity across all description pairs.
/// </summary>
/// <param name="descriptions">Collection of normalized description texts.</param>
/// <returns>Average similarity score (0.0-1.0). Returns 0 if fewer than 2 descriptions.</returns>
public double ComputeAverageSimilarity(IReadOnlyCollection<string> descriptions)
{
if (descriptions.Count < 2)
{
return 0.0;
}
// Filter out empty/null descriptions
var validDescriptions = descriptions
.Where(d => !string.IsNullOrWhiteSpace(d))
.ToArray();
if (validDescriptions.Length < 2)
{
return 0.0;
}
// Tokenize all descriptions
var tokenizedDocs = validDescriptions
.Select(d => Tokenize(d))
.ToArray();
// Build document frequency map
var documentFrequency = BuildDocumentFrequency(tokenizedDocs);
// Compute TF-IDF vectors
var tfidfVectors = tokenizedDocs
.Select(tokens => ComputeTfIdf(tokens, documentFrequency, tokenizedDocs.Length))
.ToArray();
// Compute average pairwise cosine similarity
var totalSimilarity = 0.0;
var pairCount = 0;
for (var i = 0; i < tfidfVectors.Length; i++)
{
for (var j = i + 1; j < tfidfVectors.Length; j++)
{
totalSimilarity += CosineSimilarity(tfidfVectors[i], tfidfVectors[j]);
pairCount++;
}
}
return pairCount > 0 ? totalSimilarity / pairCount : 0.0;
}
/// <summary>
/// Computes TF-IDF cosine similarity between two descriptions.
/// </summary>
/// <param name="description1">First description text.</param>
/// <param name="description2">Second description text.</param>
/// <returns>Similarity score (0.0-1.0).</returns>
public double ComputePairwiseSimilarity(string description1, string description2)
{
if (string.IsNullOrWhiteSpace(description1) || string.IsNullOrWhiteSpace(description2))
{
return 0.0;
}
var tokens1 = Tokenize(description1);
var tokens2 = Tokenize(description2);
if (tokens1.Count == 0 || tokens2.Count == 0)
{
return 0.0;
}
// For pairwise, use simple term frequency with IDF approximation
var allTerms = new HashSet<string>(tokens1, StringComparer.OrdinalIgnoreCase);
allTerms.UnionWith(tokens2);
// Document frequency (appears in 1 or 2 docs)
var df = allTerms.ToDictionary(
t => t,
t => (tokens1.Contains(t) ? 1 : 0) + (tokens2.Contains(t) ? 1 : 0),
StringComparer.OrdinalIgnoreCase);
var vec1 = ComputeTfIdf(tokens1, df, 2);
var vec2 = ComputeTfIdf(tokens2, df, 2);
return CosineSimilarity(vec1, vec2);
}
/// <summary>
/// Tokenizes text into lowercase terms, removing stop words and short tokens.
/// </summary>
internal IReadOnlyList<string> Tokenize(string text)
{
if (string.IsNullOrWhiteSpace(text))
{
return Array.Empty<string>();
}
var matches = TokenRegex.Matches(text);
var tokens = new List<string>(matches.Count);
foreach (Match match in matches)
{
var token = match.Value.ToLowerInvariant();
// Skip stop words
if (_stopWords.Contains(token))
{
continue;
}
// Skip tokens that are too short
if (token.Length < _options.MinTokenLength)
{
continue;
}
// Skip tokens that are all digits (version numbers, etc.)
if (token.All(char.IsDigit))
{
continue;
}
tokens.Add(token);
}
// Sort for determinism
tokens.Sort(StringComparer.Ordinal);
return tokens;
}
private static Dictionary<string, int> BuildDocumentFrequency(IReadOnlyList<IReadOnlyList<string>> documents)
{
var df = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
foreach (var doc in documents)
{
var uniqueTerms = new HashSet<string>(doc, StringComparer.OrdinalIgnoreCase);
foreach (var term in uniqueTerms)
{
df.TryGetValue(term, out var count);
df[term] = count + 1;
}
}
return df;
}
private Dictionary<string, double> ComputeTfIdf(
IReadOnlyList<string> tokens,
Dictionary<string, int> documentFrequency,
int totalDocuments)
{
// Compute term frequency
var termFrequency = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
foreach (var token in tokens)
{
termFrequency.TryGetValue(token, out var count);
termFrequency[token] = count + 1;
}
if (termFrequency.Count == 0)
{
return new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
}
// Compute TF-IDF
var tfidf = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
var maxTf = termFrequency.Values.Max();
foreach (var (term, tf) in termFrequency)
{
// Augmented TF: 0.5 + 0.5 * (tf / max_tf), so repeated terms saturate at 1.0
var normalizedTf = 0.5 + 0.5 * ((double)tf / maxTf);
// IDF: log((N + 1) / (df + 1)) + 1 (smoothed IDF to avoid zero)
// This ensures terms that appear in all documents still have some weight
documentFrequency.TryGetValue(term, out var df);
var idf = Math.Log((double)(totalDocuments + 1) / (df + 1)) + 1.0;
tfidf[term] = normalizedTf * idf;
}
return tfidf;
}
private static double CosineSimilarity(
Dictionary<string, double> vec1,
Dictionary<string, double> vec2)
{
// Get all terms
var allTerms = new HashSet<string>(vec1.Keys, StringComparer.OrdinalIgnoreCase);
allTerms.UnionWith(vec2.Keys);
// Compute dot product and magnitudes
var dotProduct = 0.0;
var mag1 = 0.0;
var mag2 = 0.0;
foreach (var term in allTerms)
{
vec1.TryGetValue(term, out var v1);
vec2.TryGetValue(term, out var v2);
dotProduct += v1 * v2;
mag1 += v1 * v1;
mag2 += v2 * v2;
}
mag1 = Math.Sqrt(mag1);
mag2 = Math.Sqrt(mag2);
if (mag1 < double.Epsilon || mag2 < double.Epsilon)
{
return 0.0;
}
return dotProduct / (mag1 * mag2);
}
}
/// <summary>
/// Configuration options for the text similarity scorer.
/// </summary>
public sealed class TextSimilarityOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "Concelier:Correlation:TextSimilarity";
/// <summary>
/// Whether text similarity scoring is enabled.
/// Default: false (Phase 3 feature, not yet GA).
/// </summary>
public bool Enabled { get; set; } = false;
/// <summary>
/// Weight for text similarity in unified scoring.
/// Default: 0.05.
/// </summary>
public double Weight { get; set; } = 0.05;
/// <summary>
/// Minimum token length after normalization.
/// Default: 3.
/// </summary>
public int MinTokenLength { get; set; } = 3;
/// <summary>
/// Custom stop words list. If null, uses built-in defaults.
/// </summary>
public IReadOnlyList<string>? CustomStopWords { get; set; }
/// <summary>
/// Whether to apply Porter stemming to tokens.
/// Default: false (adds complexity, minimal benefit for security text).
/// </summary>
public bool EnableStemming { get; set; } = false;
}
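// -----------------------------------------------------------------------------
// Usage sketch (editorial illustration, not part of this commit): shows how the
// scorer might contribute a weighted, flag-gated signal. The helper name and
// gating logic are assumptions; the scorer/options APIs match the code above.
// -----------------------------------------------------------------------------
internal static class TextSimilarityUsageSketch
{
public static double WeightedSignal(IReadOnlyCollection<string> descriptions, TextSimilarityOptions options)
{
// Contribute nothing when the feature flag is off or there is nothing to compare.
if (!options.Enabled || descriptions.Count < 2)
{
return 0.0;
}
var scorer = new TextSimilarityScorer(options);
var similarity = scorer.ComputeAverageSimilarity(descriptions);
// Weighted contribution to the unified score (default weight: 0.05).
return options.Weight * similarity;
}
}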

View File

@@ -0,0 +1,379 @@
// -----------------------------------------------------------------------------
// PackageIdfServiceTests.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Unit tests for package IDF keys, options, and conceptual IDF computations
// -----------------------------------------------------------------------------
using FluentAssertions;
using Xunit;
using StellaOps.TestKit;
namespace StellaOps.Concelier.Cache.Valkey.Tests;
/// <summary>
/// Unit tests for package IDF caching key generation, options, and IDF formulas.
/// Note: Service-level tests requiring Valkey are in the Integration folder.
/// </summary>
public class PackageIdfKeyTests
{
#region IDF Key Generation Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfPackage_GeneratesCorrectKey()
{
// Arrange
var packageName = "pkg:npm/lodash@4.17.21";
// Act
var key = AdvisoryCacheKeys.IdfPackage(packageName);
// Assert
key.Should().Be("concelier:idf:pkg:pkg:npm/lodash@4.17.21");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfPackage_NormalizesToLowercase()
{
// Arrange
var packageName = "pkg:NPM/Lodash@4.17.21";
// Act
var key = AdvisoryCacheKeys.IdfPackage(packageName);
// Assert
key.Should().Be("concelier:idf:pkg:pkg:npm/lodash@4.17.21");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfPackage_WithCustomPrefix_GeneratesCorrectKey()
{
// Arrange
var packageName = "pkg:npm/express@4.18.2";
var prefix = "prod:";
// Act
var key = AdvisoryCacheKeys.IdfPackage(packageName, prefix);
// Assert
key.Should().Be("prod:idf:pkg:pkg:npm/express@4.18.2");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfCorpusSize_GeneratesCorrectKey()
{
// Act
var key = AdvisoryCacheKeys.IdfCorpusSize();
// Assert
key.Should().Be("concelier:idf:stats:corpus_size");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfLastRefresh_GeneratesCorrectKey()
{
// Act
var key = AdvisoryCacheKeys.IdfLastRefresh();
// Assert
key.Should().Be("concelier:idf:stats:last_refresh");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfRefreshLock_GeneratesCorrectKey()
{
// Act
var key = AdvisoryCacheKeys.IdfRefreshLock();
// Assert
key.Should().Be("concelier:idf:lock:refresh");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfDocumentFrequency_GeneratesCorrectKey()
{
// Arrange
var packageName = "pkg:cargo/serde@1.0.0";
// Act
var key = AdvisoryCacheKeys.IdfDocumentFrequency(packageName);
// Assert
key.Should().Be("concelier:idf:df:pkg:cargo/serde@1.0.0");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfPackagePattern_GeneratesCorrectPattern()
{
// Act
var pattern = AdvisoryCacheKeys.IdfPackagePattern();
// Assert
pattern.Should().Be("concelier:idf:pkg:*");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfStatsHits_GeneratesCorrectKey()
{
// Act
var key = AdvisoryCacheKeys.IdfStatsHits();
// Assert
key.Should().Be("concelier:idf:stats:hits");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfStatsMisses_GeneratesCorrectKey()
{
// Act
var key = AdvisoryCacheKeys.IdfStatsMisses();
// Assert
key.Should().Be("concelier:idf:stats:misses");
}
#endregion
}
/// <summary>
/// Tests for PackageIdfOptions defaults and configuration.
/// </summary>
public class PackageIdfOptionsTests
{
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfOptions_DefaultValues_AreCorrect()
{
// Arrange & Act
var options = new PackageIdfOptions();
// Assert
options.Enabled.Should().BeTrue();
options.IdfTtl.Should().Be(TimeSpan.FromHours(1));
options.CorpusStatsTtl.Should().Be(TimeSpan.FromHours(4));
options.MinIdfThreshold.Should().Be(0.01);
options.DefaultIdfWeight.Should().Be(1.0);
options.MaxCacheEntries.Should().Be(100_000);
options.NormalizeScores.Should().BeTrue();
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfOptions_SectionName_IsCorrect()
{
// Assert
PackageIdfOptions.SectionName.Should().Be("Concelier:PackageIdf");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfOptions_CanBeCustomized()
{
// Arrange & Act
var options = new PackageIdfOptions
{
Enabled = false,
IdfTtl = TimeSpan.FromMinutes(30),
CorpusStatsTtl = TimeSpan.FromHours(2),
MinIdfThreshold = 0.05,
DefaultIdfWeight = 0.5,
MaxCacheEntries = 50_000,
NormalizeScores = false
};
// Assert
options.Enabled.Should().BeFalse();
options.IdfTtl.Should().Be(TimeSpan.FromMinutes(30));
options.CorpusStatsTtl.Should().Be(TimeSpan.FromHours(2));
options.MinIdfThreshold.Should().Be(0.05);
options.DefaultIdfWeight.Should().Be(0.5);
options.MaxCacheEntries.Should().Be(50_000);
options.NormalizeScores.Should().BeFalse();
}
}
/// <summary>
/// Tests for IDF formula computation (conceptual validation).
/// </summary>
public class IdfFormulaTests
{
[Trait("Category", TestCategories.Unit)]
[Theory]
[InlineData(10000, 1, 8.52)] // Rare package: log(10000/2) ≈ 8.52
[InlineData(10000, 5000, 0.69)] // Common package: log(10000/5001) ≈ 0.69
[InlineData(10000, 10000, 0.0)] // Ubiquitous: log(10000/10001) ≈ 0
public void IdfFormula_ComputesCorrectly(long corpusSize, long docFrequency, double expectedRawIdf)
{
// This test validates the IDF formula used in UpdateCorpusStatsAsync
// IDF = log(N / (1 + df))
// Act
var rawIdf = Math.Log((double)corpusSize / (1 + docFrequency));
// Assert
rawIdf.Should().BeApproximately(expectedRawIdf, 0.1);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfFormula_RarePackageHasHighWeight()
{
// Arrange
const long corpusSize = 100_000;
const long rareDocFrequency = 5;
const long commonDocFrequency = 50_000;
// Act
var rareIdf = Math.Log((double)corpusSize / (1 + rareDocFrequency));
var commonIdf = Math.Log((double)corpusSize / (1 + commonDocFrequency));
// Assert - rare package should have much higher IDF
rareIdf.Should().BeGreaterThan(commonIdf * 5);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfNormalization_ScalesToUnitInterval()
{
// Arrange - simulate corpus with various document frequencies
var corpusSize = 100_000L;
var documentFrequencies = new Dictionary<string, long>
{
["pkg:npm/lodash"] = 80_000, // Very common
["pkg:npm/express"] = 40_000, // Common
["pkg:cargo/serde"] = 10_000, // Moderate
["pkg:npm/obscure"] = 100, // Rare
["pkg:cargo/unique"] = 1 // Very rare
};
// Act - compute raw IDFs
var rawIdfs = documentFrequencies.ToDictionary(
kv => kv.Key,
kv => Math.Log((double)corpusSize / (1 + kv.Value)));
var maxIdf = rawIdfs.Values.Max();
// Normalize to 0-1
var normalizedIdfs = rawIdfs.ToDictionary(
kv => kv.Key,
kv => kv.Value / maxIdf);
// Assert - all values should be in [0, 1]
foreach (var (pkg, idf) in normalizedIdfs)
{
idf.Should().BeGreaterThanOrEqualTo(0.0, because: $"{pkg} should have non-negative IDF");
idf.Should().BeLessThanOrEqualTo(1.0, because: $"{pkg} should have IDF ≤ 1.0");
}
// The rarest package should have IDF close to 1.0
normalizedIdfs["pkg:cargo/unique"].Should().BeApproximately(1.0, 0.01);
// The most common package should have low IDF
normalizedIdfs["pkg:npm/lodash"].Should().BeLessThan(0.3);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void IdfWeight_DiscriminatesBetweenPackages()
{
// This test validates that IDF provides meaningful discrimination
// for linkset correlation
// Arrange
var corpusSize = 50_000L;
// Package that appears in many advisories (low discrimination)
var commonPkgDf = 25_000L;
// Package that appears in few advisories (high discrimination)
var rarePkgDf = 50L;
// Act
var commonIdf = Math.Log((double)corpusSize / (1 + commonPkgDf));
var rareIdf = Math.Log((double)corpusSize / (1 + rarePkgDf));
// Normalize
var maxIdf = Math.Max(commonIdf, rareIdf);
var commonNorm = commonIdf / maxIdf;
var rareNorm = rareIdf / maxIdf;
// Assert
// When two advisories share a rare package, it should be a stronger
// correlation signal than when they share a common package
rareNorm.Should().BeGreaterThan(commonNorm * 3,
because: "sharing a rare package should be 3x more discriminative than sharing a common package");
}
}
/// <summary>
/// Tests for PackageIdfMetrics instrumentation.
/// </summary>
public class PackageIdfMetricsTests
{
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfMetrics_ActivitySourceName_IsCorrect()
{
// Assert
PackageIdfMetrics.ActivitySourceName.Should().Be("StellaOps.Concelier.PackageIdf");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfMetrics_MeterName_IsCorrect()
{
// Assert
PackageIdfMetrics.MeterName.Should().Be("StellaOps.Concelier.PackageIdf");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfMetrics_CanBeCreatedAndDisposed()
{
// Arrange & Act
using var metrics = new PackageIdfMetrics();
// Assert - no exception thrown
metrics.Should().NotBeNull();
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfMetrics_RecordsOperations_WithoutException()
{
// Arrange
using var metrics = new PackageIdfMetrics();
// Act & Assert - none of these should throw
metrics.RecordHit();
metrics.RecordHits(5);
metrics.RecordMiss();
metrics.RecordMisses(3);
metrics.RecordRefresh(100);
metrics.RecordLatency(15.5, "get");
metrics.RecordIdfWeight(0.75);
metrics.UpdateCorpusSize(50_000);
metrics.UpdateCachedEntries(10_000);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void PackageIdfMetrics_StartActivity_ReturnsNullWhenNoListeners()
{
// Act
using var activity = PackageIdfMetrics.StartActivity("test-operation");
// Assert - no listeners are registered in this test, so the activity is null
// (expected ActivitySource behavior when no exporters/listeners are configured)
activity.Should().BeNull();
}
}

View File

@@ -0,0 +1,636 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationV2Tests.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-001 through CORR-V2-008
// Description: Comprehensive tests for V2 correlation algorithm
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using FluentAssertions;
using StellaOps.Concelier.Core.Linksets;
using Xunit;
namespace StellaOps.Concelier.Core.Tests.Linksets;
/// <summary>
/// Tests for the V2 linkset correlation algorithm.
/// Validates graph-based alias connectivity, pairwise package coverage,
/// version compatibility, patch lineage, and typed conflict severities.
/// </summary>
public sealed class LinksetCorrelationV2Tests
{
#region CORR-V2-001: Alias Connectivity (Graph-based)
[Fact]
public void AliasConnectivity_TransitiveBridging_CorrectlyLinksThreeSources()
{
// Arrange: A has CVE-X, B has CVE-X + GHSA-Y, C has GHSA-Y
// V1 would produce score=0 (empty intersection)
// V2 should produce high score via transitive bridging
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234", "GHSA-aaaa-bbbb-cccc" }),
CreateInput("obs-c", "osv", aliases: new[] { "GHSA-aaaa-bbbb-cccc" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
// With only alias signals: 0.30*1.0 + 0.10*1.0 + neutrals = 0.50
result.Confidence.Should().BeGreaterThanOrEqualTo(0.5, "transitive bridging should yield positive confidence");
result.SignalScores["aliasConnectivity"].Should().Be(1.0, "all observations connected via alias graph");
result.Conflicts.Should().NotContain(c => c.Reason == "alias-inconsistency",
"no inconsistency when transitively connected");
}
[Fact]
public void AliasConnectivity_DisjointAliases_ProducesLowScoreAndConflict()
{
// Arrange: Two sources with completely disjoint aliases (no bridging)
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1111" }),
CreateInput("obs-b", "vendor", aliases: new[] { "VENDOR-ADV-999" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["aliasConnectivity"].Should().Be(0.5, "50% in LCC (each disconnected)");
result.Conflicts.Should().Contain(c => c.Reason == "alias-inconsistency");
}
[Fact]
public void AliasConnectivity_DistinctCVEs_ProducesHardConflict()
{
// Arrange: Two different CVE identifiers in the cluster = hard conflict
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1111" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-2222" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.Conflicts.Should().Contain(c =>
c.Reason == "distinct-cves" && c.Severity == ConflictSeverity.Hard);
result.Confidence.Should().BeLessThan(0.5, "hard conflict should significantly reduce confidence");
}
[Fact]
public void AliasConnectivity_SingleObservation_ReturnsFullScoreWithAliases()
{
// Arrange
var inputs = new[] { CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }) };
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["aliasConnectivity"].Should().Be(1.0);
result.Conflicts.Should().BeEmpty();
}
[Fact]
public void AliasConnectivity_NoAliases_ReturnsZeroScore()
{
// Arrange
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: Array.Empty<string>()),
CreateInput("obs-b", "vendor", aliases: Array.Empty<string>())
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["aliasConnectivity"].Should().Be(0.0);
}
#endregion
#region CORR-V2-002: Package Coverage (Pairwise + IDF)
[Fact]
public void PackageCoverage_ThinSource_DoesNotCollapseScore()
{
// Arrange: Source A and B share package, Source C has no packages
// V1 intersection-across-all would produce 0
// V2 pairwise should still produce positive score
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.20" }),
CreateInput("obs-c", "vendor", purls: Array.Empty<string>())
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["packageCoverage"].Should().BeGreaterThan(0,
"thin source should not collapse pairwise coverage");
}
[Fact]
public void PackageCoverage_ExactPurlMatch_BoostsScore()
{
// Arrange: Same exact PURL (with version)
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.21" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["packageCoverage"].Should().BeGreaterThanOrEqualTo(0.8,
"exact PURL match should boost score");
}
[Fact]
public void PackageCoverage_NoOverlap_ReturnsZero()
{
// Arrange: Completely different packages
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:pypi/requests@2.28.0" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["packageCoverage"].Should().Be(0);
}
[Fact]
public void PackageCoverage_WithIdfProvider_WeightsRarePackagesHigher()
{
// Arrange: Custom IDF provider
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:cargo/obscure-lib@1.0.0" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:cargo/obscure-lib@1.0.0" })
};
// IDF provider: rare package gets high weight
double IdfProvider(string pkg) => pkg.Contains("obscure") ? 5.0 : 1.0;
// Act
var result = LinksetCorrelationV2.Compute(inputs, packageIdfProvider: IdfProvider);
// Assert
result.SignalScores["packageCoverage"].Should().BeGreaterThan(0.5);
}
#endregion
#region CORR-V2-003: Reference Score (Positive-Only)
[Fact]
public void ReferenceScore_ZeroOverlap_ReturnsNeutral_NoConflict()
{
// Arrange: Different references from different sources
// V1 would emit reference-clash
// V2 should return neutral (0.5) with no conflict
var inputs = new[]
{
CreateInput("obs-a", "nvd", references: new[] { "https://nvd.nist.gov/vuln/detail/CVE-2025-1234" }),
CreateInput("obs-b", "ghsa", references: new[] { "https://github.com/advisories/GHSA-xxxx" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["referenceOverlap"].Should().Be(0.5, "zero overlap = neutral, not negative");
result.Conflicts.Should().NotContain(c => c.Reason == "reference-clash",
"no conflict for simple disjoint references");
}
[Fact]
public void ReferenceScore_PartialOverlap_ProducesPositiveScore()
{
// Arrange: Some shared references
var inputs = new[]
{
CreateInput("obs-a", "nvd", references: new[]
{
"https://example.com/advisory",
"https://nvd.nist.gov/vuln/detail/CVE-2025-1234"
}),
CreateInput("obs-b", "ghsa", references: new[]
{
"https://example.com/advisory",
"https://github.com/advisories/GHSA-xxxx"
})
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["referenceOverlap"].Should().BeGreaterThan(0.5);
}
[Fact]
public void ReferenceScore_NormalizesUrls()
{
// Arrange: Same URL with different casing/protocol
var inputs = new[]
{
CreateInput("obs-a", "nvd", references: new[] { "http://Example.COM/advisory?utm_source=test" }),
CreateInput("obs-b", "ghsa", references: new[] { "https://example.com/advisory" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert: Should match after normalization
result.SignalScores["referenceOverlap"].Should().BeGreaterThan(0.5);
}
#endregion
#region CORR-V2-004: Typed Conflict Severities
[Fact]
public void ConflictPenalty_HardConflict_AppliesLargePenalty()
{
// Arrange: Distinct CVEs = hard conflict
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1111" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-2222" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
var hardConflict = result.Conflicts.FirstOrDefault(c => c.Severity == ConflictSeverity.Hard);
hardConflict.Should().NotBeNull();
result.Confidence.Should().BeLessThan(0.5);
}
[Fact]
public void ConflictPenalty_SoftConflict_AppliesSmallPenalty()
{
// Arrange: Same CVE but overlapping version ranges (share at least one version)
var inputs = new[]
{
CreateInput("obs-a", "nvd",
aliases: new[] { "CVE-2025-1234" },
purls: new[] { "pkg:npm/lodash@4.17.20", "pkg:npm/lodash@4.17.21" }),
CreateInput("obs-b", "ghsa",
aliases: new[] { "CVE-2025-1234" },
purls: new[] { "pkg:npm/lodash@4.17.20", "pkg:npm/lodash@4.17.19" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert: Should have soft divergence conflict (overlapping but not equivalent)
var softConflict = result.Conflicts.FirstOrDefault(c =>
c.Severity == ConflictSeverity.Soft && c.Reason == "affected-range-divergence");
softConflict.Should().NotBeNull("overlapping but non-equivalent ranges should produce soft conflict");
result.Confidence.Should().BeGreaterThan(0.5, "soft conflicts should not severely impact confidence");
}
[Fact]
public void ConflictPenalty_Saturates_AtMaximum()
{
// Arrange: Multiple hard conflicts
var inputs = new[]
{
CreateInput("obs-a", "nvd",
aliases: new[] { "CVE-2025-1111" },
purls: new[] { "pkg:npm/lodash@1.0.0" }),
CreateInput("obs-b", "ghsa",
aliases: new[] { "CVE-2025-2222" },
purls: new[] { "pkg:npm/lodash@9.0.0" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert: Confidence should not go below 0.1 minimum
result.Confidence.Should().BeGreaterThanOrEqualTo(0.1);
}
#endregion
#region CORR-V2-005: Patch Lineage
[Fact]
public void PatchLineage_ExactCommitShaMatch_ProducesHighScore()
{
// Arrange: Same commit SHA in patch references
var inputs = new[]
{
CreateInput("obs-a", "nvd",
aliases: new[] { "CVE-2025-1234" },
patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" }),
CreateInput("obs-b", "ghsa",
aliases: new[] { "CVE-2025-1234" },
patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["patchLineage"].Should().Be(1.0, "exact commit SHA match is very strong signal");
}
[Fact]
public void PatchLineage_DifferentCommits_ProducesZeroScore()
{
// Arrange: Different commit SHAs
var inputs = new[]
{
CreateInput("obs-a", "nvd",
patchReferences: new[] { "https://github.com/org/repo/commit/1111111111111111111111111111111111111111" }),
CreateInput("obs-b", "ghsa",
patchReferences: new[] { "https://github.com/org/repo/commit/2222222222222222222222222222222222222222" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["patchLineage"].Should().Be(0);
}
[Fact]
public void PatchLineage_NoPatchData_ReturnsZero()
{
// Arrange: No patch references
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["patchLineage"].Should().Be(0);
}
#endregion
#region CORR-V2-006: Version Compatibility
[Fact]
public void VersionCompatibility_EquivalentRanges_ProducesHighScore()
{
// Arrange: Same versions for same package
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.21" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["versionCompatibility"].Should().BeGreaterThanOrEqualTo(0.8);
result.Conflicts.Should().NotContain(c =>
c.Reason == "affected-range-divergence" || c.Reason == "disjoint-version-ranges");
}
[Fact]
public void VersionCompatibility_OverlappingRanges_ProducesMediumScoreWithSoftConflict()
{
// Arrange: Overlapping but not identical versions
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@4.17.21", "pkg:npm/lodash@4.17.20" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@4.17.20", "pkg:npm/lodash@4.17.19" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.SignalScores["versionCompatibility"].Should().BeInRange(0.4, 0.8);
result.Conflicts.Should().Contain(c =>
c.Reason == "affected-range-divergence" && c.Severity == ConflictSeverity.Soft);
}
[Fact]
public void VersionCompatibility_DisjointRanges_ProducesLowScoreWithHardConflict()
{
// Arrange: Completely different versions for same package
var inputs = new[]
{
CreateInput("obs-a", "nvd", purls: new[] { "pkg:npm/lodash@1.0.0" }),
CreateInput("obs-b", "ghsa", purls: new[] { "pkg:npm/lodash@9.0.0" })
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.Conflicts.Should().Contain(c =>
c.Reason == "disjoint-version-ranges" && c.Severity == ConflictSeverity.Hard);
}
#endregion
#region CORR-V2-008: Integrated Scoring
[Fact]
public void IntegratedScoring_HighConfidenceScenario()
{
// Arrange: Strong signals across all dimensions
var inputs = new[]
{
CreateInput("obs-a", "nvd",
aliases: new[] { "CVE-2025-1234" },
purls: new[] { "pkg:npm/vulnerable-lib@2.0.0" },
cpes: new[] { "cpe:2.3:a:vendor:vulnerable-lib:2.0.0:*:*:*:*:*:*:*" },
references: new[] { "https://example.com/advisory" },
patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" },
fetchedAt: DateTimeOffset.Parse("2025-01-25T10:00:00Z", CultureInfo.InvariantCulture)),
CreateInput("obs-b", "ghsa",
aliases: new[] { "CVE-2025-1234", "GHSA-xxxx-yyyy-zzzz" },
purls: new[] { "pkg:npm/vulnerable-lib@2.0.0" },
cpes: new[] { "cpe:2.3:a:vendor:vulnerable-lib:2.0.0:*:*:*:*:*:*:*" },
references: new[] { "https://example.com/advisory", "https://github.com/advisories/GHSA-xxxx" },
patchReferences: new[] { "https://github.com/org/repo/commit/abc123def456789012345678901234567890abcd" },
fetchedAt: DateTimeOffset.Parse("2025-01-25T11:00:00Z", CultureInfo.InvariantCulture))
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.Confidence.Should().BeGreaterThanOrEqualTo(0.85, "all signals strong = high confidence");
result.Conflicts.Should().BeEmpty();
// Verify individual signals
result.SignalScores["aliasConnectivity"].Should().Be(1.0);
result.SignalScores["aliasAuthority"].Should().Be(1.0); // CVE present
result.SignalScores["packageCoverage"].Should().BeGreaterThanOrEqualTo(0.8);
result.SignalScores["patchLineage"].Should().Be(1.0);
result.SignalScores["freshness"].Should().Be(1.0); // Within 48h
}
[Fact]
public void IntegratedScoring_MixedSignalsScenario()
{
// Arrange: Some strong signals, some weak
// Note: Disconnected aliases will produce alias-inconsistency conflict
var inputs = new[]
{
CreateInput("obs-a", "nvd",
aliases: new[] { "CVE-2025-1234" },
purls: new[] { "pkg:npm/lodash@4.17.21" },
fetchedAt: DateTimeOffset.Parse("2025-01-10T00:00:00Z", CultureInfo.InvariantCulture)),
CreateInput("obs-b", "vendor",
aliases: new[] { "VENDOR-2025-001" }, // No CVE, only vendor ID
purls: new[] { "pkg:npm/lodash@4.17.20" }, // Different version
fetchedAt: DateTimeOffset.Parse("2025-01-25T00:00:00Z", CultureInfo.InvariantCulture)) // 15 days apart
};
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
// Disconnected aliases + version divergence = conflicts reducing confidence
// Minimum confidence is 0.1 when there are conflicts but some evidence
result.Confidence.Should().BeInRange(0.1, 0.4, "mixed signals with conflicts = low-moderate confidence");
result.SignalScores["aliasConnectivity"].Should().BeLessThan(1.0); // Disconnected
result.SignalScores["freshness"].Should().BeLessThan(0.5); // 15 days spread
}
[Fact]
public void IntegratedScoring_EmptyInputs_ReturnsFullConfidence()
{
// Arrange
var inputs = Array.Empty<LinksetCorrelationV2.InputV2>();
// Act
var result = LinksetCorrelationV2.Compute(inputs);
// Assert
result.Confidence.Should().Be(1.0);
result.Conflicts.Should().BeEmpty();
}
#endregion
#region Determinism Tests
[Fact]
public void Determinism_SameInputs_ProduceSameOutput()
{
// Arrange
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234", "GHSA-xxxx" })
};
// Act
var result1 = LinksetCorrelationV2.Compute(inputs);
var result2 = LinksetCorrelationV2.Compute(inputs);
// Assert
result1.Confidence.Should().Be(result2.Confidence);
result1.Conflicts.Should().BeEquivalentTo(result2.Conflicts);
result1.SignalScores.Should().BeEquivalentTo(result2.SignalScores);
}
[Fact]
public void Determinism_InputOrdering_DoesNotAffectResult()
{
// Arrange
var inputsA = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" })
};
var inputsB = new[]
{
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" }),
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" })
};
// Act
var resultA = LinksetCorrelationV2.Compute(inputsA);
var resultB = LinksetCorrelationV2.Compute(inputsB);
// Assert
resultA.Confidence.Should().Be(resultB.Confidence);
}
[Fact]
public void Conflicts_AreDeduplicated()
{
// Arrange: Add duplicate conflicts via additionalConflicts
// Use inputs that won't generate their own alias-inconsistency
var inputs = new[]
{
CreateInput("obs-a", "nvd", aliases: new[] { "CVE-2025-1234" }),
CreateInput("obs-b", "ghsa", aliases: new[] { "CVE-2025-1234" }) // Same CVE = connected
};
var additionalConflicts = new List<AdvisoryLinksetConflict>
{
new("custom-field", "custom-reason", new[] { "a", "b" }),
new("custom-field", "custom-reason", new[] { "a", "b" }) // Duplicate
};
// Act
var result = LinksetCorrelationV2.Compute(inputs, additionalConflicts);
// Assert: Should deduplicate the additional conflicts
result.Conflicts.Count(c => c.Reason == "custom-reason").Should().Be(1);
}
#endregion
#region Helper Methods
private static LinksetCorrelationV2.InputV2 CreateInput(
string observationId,
string? vendor = null,
string[]? aliases = null,
string[]? purls = null,
string[]? cpes = null,
string[]? references = null,
string[]? patchReferences = null,
DateTimeOffset? fetchedAt = null)
{
return new LinksetCorrelationV2.InputV2(
ObservationId: observationId,
Vendor: vendor,
FetchedAt: fetchedAt,
Aliases: aliases ?? Array.Empty<string>(),
Purls: purls ?? Array.Empty<string>(),
Cpes: cpes ?? Array.Empty<string>(),
References: references ?? Array.Empty<string>(),
PatchReferences: patchReferences);
}
#endregion
}

View File

@@ -0,0 +1,561 @@
// -----------------------------------------------------------------------------
// TextSimilarityScorerTests.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-010
// Description: Unit tests and performance benchmarks for TextSimilarityScorer
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using FluentAssertions;
using StellaOps.Concelier.Core.Linksets;
using StellaOps.TestKit;
using Xunit;
namespace StellaOps.Concelier.Core.Tests.Linksets;
/// <summary>
/// Unit tests for <see cref="TextSimilarityScorer"/>.
/// </summary>
public class TextSimilarityScorerTests
{
private readonly TextSimilarityScorer _scorer = new();
#region Tokenization Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_EmptyString_ReturnsEmpty()
{
// Act
var tokens = _scorer.Tokenize("");
// Assert
tokens.Should().BeEmpty();
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_NullString_ReturnsEmpty()
{
// Act
var tokens = _scorer.Tokenize(null!);
// Assert
tokens.Should().BeEmpty();
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_NormalizesToLowercase()
{
// Arrange
var text = "BUFFER OVERFLOW Memory Corruption";
// Act
var tokens = _scorer.Tokenize(text);
// Assert
tokens.Should().AllSatisfy(t => t.Should().Be(t.ToLowerInvariant()));
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_RemovesStopWords()
{
// Arrange
var text = "The vulnerability allows an attacker to execute code";
// Act
var tokens = _scorer.Tokenize(text);
// Assert - common stop words should be removed
tokens.Should().NotContain("the");
tokens.Should().NotContain("an");
tokens.Should().NotContain("to");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_RemovesShortTokens()
{
// Arrange
var text = "CVE ID in XSS bug";
// Act
var tokens = _scorer.Tokenize(text);
// Assert - tokens shorter than 3 chars should be removed
tokens.Should().NotContain("id");
tokens.Should().NotContain("in");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_RemovesNumericTokens()
{
// Arrange
var text = "version 123 release 2024";
// Act
var tokens = _scorer.Tokenize(text);
// Assert - pure numeric tokens should be removed
tokens.Should().NotContain("123");
tokens.Should().NotContain("2024");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_KeepsAlphanumericTokens()
{
// Arrange
var text = "CVE2024 log4j2 spring4shell";
// Act
var tokens = _scorer.Tokenize(text);
// Assert - alphanumeric tokens should be kept
tokens.Should().Contain("cve2024");
tokens.Should().Contain("log4j2");
tokens.Should().Contain("spring4shell");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_IsDeterministic()
{
// Arrange
var text = "Memory corruption in JSON parser leads to arbitrary code execution";
// Act
var tokens1 = _scorer.Tokenize(text);
var tokens2 = _scorer.Tokenize(text);
// Assert
tokens1.Should().BeEquivalentTo(tokens2, options => options.WithStrictOrdering());
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Tokenize_SortsTokensForDeterminism()
{
// Arrange
var text = "zebra alpha memory parser";
// Act
var tokens = _scorer.Tokenize(text);
// Assert - tokens should be sorted alphabetically
tokens.Should().BeInAscendingOrder();
}
#endregion
#region Pairwise Similarity Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_IdenticalTexts_ReturnsOne()
{
// Arrange
var text = "A heap-based buffer overflow in libpng allows remote attackers to execute arbitrary code";
// Act
var similarity = _scorer.ComputePairwiseSimilarity(text, text);
// Assert
similarity.Should().BeApproximately(1.0, 0.01);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_CompletelyDifferent_ReturnsLowScore()
{
// Arrange
var text1 = "SQL injection in database query handler";
var text2 = "Memory corruption in graphics renderer";
// Act
var similarity = _scorer.ComputePairwiseSimilarity(text1, text2);
// Assert
similarity.Should().BeLessThan(0.3);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_SimilarDescriptions_ReturnsPositiveScore()
{
// Arrange - same vulnerability described differently
var text1 = "A heap-based buffer overflow in the PNG image parser allows remote code execution";
var text2 = "Remote code execution via heap buffer overflow in PNG image processing library";
// Act
var similarity = _scorer.ComputePairwiseSimilarity(text1, text2);
// Assert - TF-IDF similarity for short texts with stop words removed
// is typically moderate (0.2-0.5 range)
similarity.Should().BeGreaterThan(0.2);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_EmptyFirst_ReturnsZero()
{
// Act
var similarity = _scorer.ComputePairwiseSimilarity("", "some text here");
// Assert
similarity.Should().Be(0.0);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_EmptySecond_ReturnsZero()
{
// Act
var similarity = _scorer.ComputePairwiseSimilarity("some text here", "");
// Assert
similarity.Should().Be(0.0);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_OnlyStopWords_ReturnsZero()
{
// Arrange - text with only stop words
var text1 = "the and or but";
var text2 = "the and or but";
// Act
var similarity = _scorer.ComputePairwiseSimilarity(text1, text2);
// Assert - no tokens after stop word removal
similarity.Should().Be(0.0);
}
#endregion
#region Average Similarity Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputeAverageSimilarity_SingleDescription_ReturnsZero()
{
// Arrange
var descriptions = new[] { "Only one description here" };
// Act
var similarity = _scorer.ComputeAverageSimilarity(descriptions);
// Assert
similarity.Should().Be(0.0);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputeAverageSimilarity_EmptyCollection_ReturnsZero()
{
// Act
var similarity = _scorer.ComputeAverageSimilarity(Array.Empty<string>());
// Assert
similarity.Should().Be(0.0);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputeAverageSimilarity_IdenticalDescriptions_ReturnsOne()
{
// Arrange
var description = "A critical buffer overflow vulnerability in the image processing library";
var descriptions = new[] { description, description, description };
// Act
var similarity = _scorer.ComputeAverageSimilarity(descriptions);
// Assert
similarity.Should().BeApproximately(1.0, 0.01);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputeAverageSimilarity_MixedSimilarity_ReturnsReasonableAverage()
{
// Arrange - three descriptions about the same CVE from different sources
var descriptions = new[]
{
"A heap-based buffer overflow in libpng before 1.6.37 allows remote attackers to cause denial of service",
"Buffer overflow vulnerability in PNG library (libpng) can be exploited by remote attackers for DoS",
"libpng contains a heap overflow that may lead to denial of service when processing malformed PNG files"
};
// Act
var similarity = _scorer.ComputeAverageSimilarity(descriptions);
// Assert - TF-IDF similarity for related security texts typically
// produces moderate scores (0.1-0.4 range) after stop word removal
similarity.Should().BeGreaterThan(0.1);
similarity.Should().BeLessThanOrEqualTo(1.0);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputeAverageSimilarity_SkipsEmptyDescriptions()
{
// Arrange
var descriptions = new[]
{
"A critical vulnerability in the parser",
"",
null!,
" ",
"A critical vulnerability in the parser"
};
// Act
var similarity = _scorer.ComputeAverageSimilarity(descriptions);
// Assert - should only consider non-empty descriptions
similarity.Should().BeApproximately(1.0, 0.01);
}
#endregion
#region Options Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public void TextSimilarityOptions_DefaultValues_AreCorrect()
{
// Arrange & Act
var options = new TextSimilarityOptions();
// Assert
options.Enabled.Should().BeFalse();
options.Weight.Should().Be(0.05);
options.MinTokenLength.Should().Be(3);
options.CustomStopWords.Should().BeNull();
options.EnableStemming.Should().BeFalse();
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void TextSimilarityOptions_SectionName_IsCorrect()
{
// Assert
TextSimilarityOptions.SectionName.Should().Be("Concelier:Correlation:TextSimilarity");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void Scorer_WithCustomStopWords_UsesCustomList()
{
// Arrange
var options = new TextSimilarityOptions
{
CustomStopWords = new[] { "custom", "stop", "words" }
};
var scorer = new TextSimilarityScorer(options);
// Act
var tokens = scorer.Tokenize("custom stop words remain here");
// Assert - custom stop words should be removed
tokens.Should().NotContain("custom");
tokens.Should().NotContain("stop");
tokens.Should().NotContain("words");
tokens.Should().Contain("remain");
tokens.Should().Contain("here");
}
#endregion
#region Real-World Description Fixtures
[Trait("Category", TestCategories.Unit)]
[Theory]
[MemberData(nameof(RealWorldDescriptionFixtures))]
public void ComputeAverageSimilarity_RealWorldFixtures_ReturnsExpectedRange(
string[] descriptions,
double minExpected,
double maxExpected,
string scenario)
{
// Act
var similarity = _scorer.ComputeAverageSimilarity(descriptions);
// Assert
similarity.Should().BeGreaterThanOrEqualTo(minExpected,
because: $"scenario '{scenario}' should have similarity >= {minExpected}");
similarity.Should().BeLessThanOrEqualTo(maxExpected,
because: $"scenario '{scenario}' should have similarity <= {maxExpected}");
}
public static IEnumerable<object[]> RealWorldDescriptionFixtures()
{
// CVE-2021-44228 (Log4Shell) - same vulnerability, different sources
// TF-IDF similarity for related security texts is typically 0.1-0.5
yield return new object[]
{
new[]
{
"Apache Log4j2 2.0-beta9 through 2.15.0 (excluding security releases 2.12.2, 2.12.3, and 2.3.1) JNDI features used in configuration, log messages, and parameters do not protect against attacker controlled LDAP and other JNDI related endpoints.",
"A flaw was found in the Java logging library Apache Log4j in version 2.x. When configured to use a JNDI URL with a LDAP scheme, an attacker can execute arbitrary code.",
"Remote code execution vulnerability in Apache Log4j2 allows attackers to execute arbitrary code via JNDI lookup in log messages."
},
0.05, 0.9, "Log4Shell - same CVE, different sources"
};
// Unrelated vulnerabilities - should have low similarity
yield return new object[]
{
new[]
{
"SQL injection vulnerability in the login form allows authentication bypass",
"Cross-site scripting (XSS) in the comments section enables script injection",
"Buffer overflow in image processing library causes denial of service"
},
0.0, 0.4, "Unrelated vulnerabilities"
};
// Same library, different CVEs - moderate similarity
yield return new object[]
{
new[]
{
"OpenSSL before 3.0.7 allows remote attackers to cause a denial of service via a crafted X.509 certificate",
"OpenSSL 3.0.x before 3.0.5 contains a heap-based buffer overflow in the SM2 implementation",
"A timing-based side channel in OpenSSL allows recovery of private key material"
},
0.05, 0.6, "Same library (OpenSSL), different CVEs"
};
}
#endregion
#region Determinism Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputeAverageSimilarity_IsDeterministic()
{
// Arrange
var descriptions = new[]
{
"A heap-based buffer overflow in libpng",
"Buffer overflow in PNG library",
"libpng heap overflow vulnerability"
};
// Act
var similarity1 = _scorer.ComputeAverageSimilarity(descriptions);
var similarity2 = _scorer.ComputeAverageSimilarity(descriptions);
var similarity3 = _scorer.ComputeAverageSimilarity(descriptions);
// Assert
similarity1.Should().Be(similarity2);
similarity2.Should().Be(similarity3);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public void ComputePairwiseSimilarity_IsDeterministic()
{
// Arrange
var text1 = "Memory corruption in JSON parser";
var text2 = "JSON parser memory corruption vulnerability";
// Act
var similarity1 = _scorer.ComputePairwiseSimilarity(text1, text2);
var similarity2 = _scorer.ComputePairwiseSimilarity(text1, text2);
// Assert
similarity1.Should().Be(similarity2);
}
#endregion
}
/// <summary>
/// Performance benchmarks for <see cref="TextSimilarityScorer"/>.
/// Target: <= 5ms per pair.
/// </summary>
public class TextSimilarityScorerBenchmarks
{
private readonly TextSimilarityScorer _scorer = new();
[Trait("Category", TestCategories.Performance)]
[Fact]
public void ComputePairwiseSimilarity_MeetsPerformanceTarget()
{
// Arrange - realistic vulnerability descriptions
var text1 = "A heap-based buffer overflow vulnerability has been discovered in the image processing library libpng version 1.6.37. Remote attackers can exploit this flaw by providing specially crafted PNG files, potentially leading to arbitrary code execution or denial of service conditions.";
var text2 = "The PNG image handling library (libpng) contains a buffer overflow vulnerability in the row processing function. Exploitation of this issue allows attackers to execute arbitrary code in the context of the application using the affected library.";
// Warmup
for (var i = 0; i < 10; i++)
{
_scorer.ComputePairwiseSimilarity(text1, text2);
}
// Act - measure 100 iterations
var sw = Stopwatch.StartNew();
const int iterations = 100;
for (var i = 0; i < iterations; i++)
{
_scorer.ComputePairwiseSimilarity(text1, text2);
}
sw.Stop();
var averageMs = sw.Elapsed.TotalMilliseconds / iterations;
// Assert - target: <= 5ms per pair
averageMs.Should().BeLessThanOrEqualTo(5.0,
because: $"text similarity computation should complete within 5ms per pair (actual: {averageMs:F3} ms)");
}
[Trait("Category", TestCategories.Performance)]
[Fact]
public void ComputeAverageSimilarity_FiveDescriptions_MeetsPerformanceTarget()
{
// Arrange - 5 descriptions = 10 pairs
var descriptions = new[]
{
"Apache Log4j2 JNDI features do not protect against attacker controlled LDAP endpoints",
"A flaw in Log4j in version 2.x allows attackers to execute arbitrary code via JNDI lookup",
"Remote code execution in Apache Log4j2 via malicious JNDI lookup patterns",
"Log4j2 vulnerability allows remote attackers to execute code through JNDI injection",
"Critical RCE vulnerability in Apache Log4j2 logging library through JNDI features"
};
// Warmup
for (var i = 0; i < 10; i++)
{
_scorer.ComputeAverageSimilarity(descriptions);
}
// Act
var sw = Stopwatch.StartNew();
const int iterations = 100;
for (var i = 0; i < iterations; i++)
{
_scorer.ComputeAverageSimilarity(descriptions);
}
sw.Stop();
var averageMs = sw.Elapsed.TotalMilliseconds / iterations;
var pairsPerCall = 10; // C(5,2) = 10 pairs
var msPerPair = averageMs / pairsPerCall;
// Assert - target: <= 5ms per pair
msPerPair.Should().BeLessThanOrEqualTo(5.0,
because: $"text similarity computation should complete within 5ms per pair (actual: {msPerPair:F3} ms)");
}
}