Consolidate devops folders

This commit is contained in:
master
2026-01-25 23:27:41 +02:00
parent 6e687b523a
commit a743bb9a1d
613 changed files with 8611 additions and 41846 deletions

View File

@@ -121,6 +121,70 @@ public static class AdvisoryCacheKeys
public static string CveMappingPattern(string prefix = DefaultPrefix)
{
    // Wildcard pattern covering every by:cve mapping key under this prefix.
    return string.Concat(prefix, "by:cve:*");
}
// -------------------------------------------------------------------------
// IDF (Inverse Document Frequency) Cache Keys
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// -------------------------------------------------------------------------

/// <summary>
/// Builds the cache key holding the IDF score of a single package.
/// Format: {prefix}idf:pkg:{normalizedPackageName}
/// </summary>
/// <param name="packageName">Package name; normalized via <see cref="NormalizePurl"/> before use.</param>
/// <param name="prefix">Key prefix scoping the entry.</param>
public static string IdfPackage(string packageName, string prefix = DefaultPrefix)
{
    return prefix + "idf:pkg:" + NormalizePurl(packageName);
}
/// <summary>
/// Builds the key under which the corpus size (total document count) is stored.
/// Format: {prefix}idf:stats:corpus_size
/// </summary>
public static string IdfCorpusSize(string prefix = DefaultPrefix)
{
    return string.Concat(prefix, "idf:stats:corpus_size");
}
/// <summary>
/// Builds the key under which the timestamp of the most recent IDF refresh is stored.
/// Format: {prefix}idf:stats:last_refresh
/// </summary>
public static string IdfLastRefresh(string prefix = DefaultPrefix)
{
    return string.Concat(prefix, "idf:stats:last_refresh");
}
/// <summary>
/// Builds the key of the distributed lock guarding IDF refresh coordination.
/// Format: {prefix}idf:lock:refresh
/// </summary>
public static string IdfRefreshLock(string prefix = DefaultPrefix)
{
    return string.Concat(prefix, "idf:lock:refresh");
}
/// <summary>
/// Builds the key holding a package's document frequency (count of observations
/// containing the package).
/// Format: {prefix}idf:df:{normalizedPackageName}
/// </summary>
public static string IdfDocumentFrequency(string packageName, string prefix = DefaultPrefix)
{
    return prefix + "idf:df:" + NormalizePurl(packageName);
}
/// <summary>
/// Wildcard pattern matching every IDF package key (used for scanning/cleanup).
/// Format: {prefix}idf:pkg:*
/// </summary>
public static string IdfPackagePattern(string prefix = DefaultPrefix)
{
    return string.Concat(prefix, "idf:pkg:*");
}
/// <summary>
/// Builds the key of the IDF cache hit counter.
/// Format: {prefix}idf:stats:hits
/// </summary>
public static string IdfStatsHits(string prefix = DefaultPrefix)
{
    return string.Concat(prefix, "idf:stats:hits");
}
/// <summary>
/// Builds the key of the IDF cache miss counter.
/// Format: {prefix}idf:stats:misses
/// </summary>
public static string IdfStatsMisses(string prefix = DefaultPrefix)
{
    return string.Concat(prefix, "idf:stats:misses");
}
/// <summary>
/// Normalizes a PURL for use as a cache key.
/// </summary>

View File

@@ -0,0 +1,153 @@
// -----------------------------------------------------------------------------
// IPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Interface for package IDF (Inverse Document Frequency) caching
// -----------------------------------------------------------------------------
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Service for computing and caching IDF (Inverse Document Frequency) weights
/// for package keys used in linkset correlation.
/// </summary>
/// <remarks>
/// IDF measures how discriminative a package is across the observation corpus:
/// <code>
/// idf(pkg) = log(N / (1 + df(pkg)))
/// </code>
/// where N = total observations, df = observations containing the package.
///
/// Rare packages (low df) have high IDF → stronger correlation signal.
/// Common packages (high df) have low IDF → weaker correlation signal.
/// </remarks>
public interface IPackageIdfService
{
    /// <summary>
    /// Gets the IDF weight for a package key.
    /// </summary>
    /// <param name="packageName">The package name (PURL format).</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>
    /// The IDF weight (0.0-1.0 normalized), or null if not cached.
    /// Returns null on cache miss or error (graceful degradation).
    /// </returns>
    Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets IDF weights for multiple package keys in a single batch operation.
    /// </summary>
    /// <param name="packageNames">The package names to look up.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>
    /// Dictionary of package name to IDF weight. Missing entries indicate cache miss.
    /// </returns>
    Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
        IEnumerable<string> packageNames,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Sets the IDF weight for a package key.
    /// </summary>
    /// <param name="packageName">The package name.</param>
    /// <param name="idfWeight">The IDF weight (0.0-1.0 normalized).</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default);

    /// <summary>
    /// Sets IDF weights for multiple package keys in a single batch operation.
    /// </summary>
    /// <param name="idfWeights">Dictionary of package name to IDF weight.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task SetIdfBatchAsync(
        IReadOnlyDictionary<string, double> idfWeights,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Updates the corpus statistics used for IDF computation.
    /// </summary>
    /// <param name="corpusSize">Total number of observations in the corpus.</param>
    /// <param name="documentFrequencies">Dictionary of package name to document frequency.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task UpdateCorpusStatsAsync(
        long corpusSize,
        IReadOnlyDictionary<string, long> documentFrequencies,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the last refresh timestamp for IDF statistics.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The last refresh time, or null if never refreshed.</returns>
    Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Invalidates cached IDF data for a specific package.
    /// </summary>
    /// <param name="packageName">The package name to invalidate.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default);

    /// <summary>
    /// Invalidates all cached IDF data.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task InvalidateAllAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Whether the IDF cache is enabled and available.
    /// </summary>
    /// <remarks>
    /// When false, implementations are expected to no-op on writes and return
    /// null/empty on reads (see the per-method return documentation above).
    /// </remarks>
    bool IsEnabled { get; }
}
/// <summary>
/// Configuration options for the package IDF service.
/// </summary>
public sealed class PackageIdfOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Concelier:PackageIdf";

    /// <summary>
    /// Whether IDF caching is enabled.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// TTL for cached IDF scores.
    /// Default: 1 hour.
    /// </summary>
    public TimeSpan IdfTtl { get; set; } = TimeSpan.FromHours(1);

    /// <summary>
    /// TTL for corpus statistics.
    /// Default: 4 hours.
    /// </summary>
    public TimeSpan CorpusStatsTtl { get; set; } = TimeSpan.FromHours(4);

    /// <summary>
    /// Minimum IDF value to cache (to avoid caching very common packages).
    /// Default: 0.01.
    /// </summary>
    public double MinIdfThreshold { get; set; } = 0.01;

    /// <summary>
    /// Default IDF weight to return on cache miss (uniform weight).
    /// Default: 1.0 (no discrimination).
    /// </summary>
    /// <remarks>
    /// NOTE(review): the Valkey service in this change set returns null on a miss
    /// rather than reading this value — presumably consumers apply it; confirm.
    /// </remarks>
    public double DefaultIdfWeight { get; set; } = 1.0;

    /// <summary>
    /// Maximum number of IDF entries to cache.
    /// Default: 100,000.
    /// </summary>
    /// <remarks>
    /// NOTE(review): not enforced by ValkeyPackageIdfService in this change set;
    /// entries appear bounded only by TTL expiration — confirm intended enforcement point.
    /// </remarks>
    public int MaxCacheEntries { get; set; } = 100_000;

    /// <summary>
    /// Whether to normalize IDF scores to 0.0-1.0 range.
    /// Default: true.
    /// </summary>
    public bool NormalizeScores { get; set; } = true;
}

View File

@@ -0,0 +1,139 @@
// -----------------------------------------------------------------------------
// IdfRefreshHostedService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Background service for periodic IDF weight refresh
// -----------------------------------------------------------------------------
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Interface for providing IDF corpus statistics from the observation store.
/// </summary>
/// <remarks>
/// This interface should be implemented by the Concelier Core module to provide
/// document frequencies from the actual observation database.
/// When no implementation is registered, the IDF refresh hosted service logs a
/// warning and stops without refreshing.
/// </remarks>
public interface IIdfCorpusProvider
{
    /// <summary>
    /// Gets the total number of observations in the corpus.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Total observation count.</returns>
    Task<long> GetCorpusSizeAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets document frequencies for all packages in the corpus.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Dictionary mapping package name to the number of observations containing it.</returns>
    Task<IReadOnlyDictionary<string, long>> GetDocumentFrequenciesAsync(CancellationToken cancellationToken = default);
}
/// <summary>
/// Background service that periodically refreshes IDF weights from the observation corpus.
/// </summary>
public sealed class IdfRefreshHostedService : BackgroundService
{
    // Grace period before the first refresh so dependent services can finish starting.
    private static readonly TimeSpan InitialDelay = TimeSpan.FromSeconds(30);

    private readonly IPackageIdfService _idfService;
    private readonly IIdfCorpusProvider? _corpusProvider;
    private readonly PackageIdfOptions _options;
    private readonly ILogger<IdfRefreshHostedService>? _logger;

    /// <summary>
    /// Initializes a new instance of <see cref="IdfRefreshHostedService"/>.
    /// </summary>
    /// <param name="idfService">Cache service that stores the refreshed IDF weights.</param>
    /// <param name="options">IDF options; the refresh cadence derives from <see cref="PackageIdfOptions.IdfTtl"/>.</param>
    /// <param name="corpusProvider">Optional corpus provider; without one the service is a no-op.</param>
    /// <param name="logger">Optional logger.</param>
    public IdfRefreshHostedService(
        IPackageIdfService idfService,
        IOptions<PackageIdfOptions> options,
        IIdfCorpusProvider? corpusProvider = null,
        ILogger<IdfRefreshHostedService>? logger = null)
    {
        _idfService = idfService ?? throw new ArgumentNullException(nameof(idfService));
        _corpusProvider = corpusProvider;
        _options = options?.Value ?? new PackageIdfOptions();
        _logger = logger;
    }

    /// <inheritdoc />
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        if (!_idfService.IsEnabled)
        {
            _logger?.LogInformation("IDF refresh service disabled (IDF caching not enabled)");
            return;
        }

        if (_corpusProvider is null)
        {
            _logger?.LogWarning(
                "IDF refresh service has no corpus provider registered. " +
                "Register IIdfCorpusProvider to enable automatic IDF refresh.");
            return;
        }

        // Initial delay before first refresh (allow other services to start).
        await Task.Delay(InitialDelay, stoppingToken).ConfigureAwait(false);

        // FIX: waiting the full IdfTtl between cycles meant cached entries expired
        // exactly when the next refresh began, creating a guaranteed miss window
        // every cycle. Refresh slightly ahead of the TTL instead.
        var interval = ComputeRefreshInterval(_options.IdfTtl);

        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await RefreshIdfWeightsAsync(stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
            catch (Exception ex)
            {
                // Keep the loop alive; the next cycle may succeed.
                _logger?.LogError(ex, "Error during IDF refresh cycle");
            }

            try
            {
                await Task.Delay(interval, stoppingToken).ConfigureAwait(false);
            }
            catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested)
            {
                break;
            }
        }

        _logger?.LogInformation("IDF refresh service stopped");
    }

    /// <summary>
    /// Derives the wait between refresh cycles from the cache TTL: 90% of the TTL,
    /// with a one-minute floor guarding against zero/negative configured values
    /// (Task.Delay throws on negative spans other than -1ms).
    /// </summary>
    private static TimeSpan ComputeRefreshInterval(TimeSpan ttl)
    {
        var interval = TimeSpan.FromTicks((long)(ttl.Ticks * 0.9));
        return interval >= TimeSpan.FromMinutes(1) ? interval : TimeSpan.FromMinutes(1);
    }

    /// <summary>
    /// Pulls corpus size and document frequencies from the provider and pushes
    /// them into the IDF cache. Skips (with a warning) when the corpus is empty.
    /// </summary>
    private async Task RefreshIdfWeightsAsync(CancellationToken cancellationToken)
    {
        _logger?.LogDebug("Starting IDF refresh cycle");

        var corpusSize = await _corpusProvider!.GetCorpusSizeAsync(cancellationToken).ConfigureAwait(false);
        if (corpusSize == 0)
        {
            _logger?.LogWarning("IDF refresh skipped: empty corpus");
            return;
        }

        var documentFrequencies = await _corpusProvider.GetDocumentFrequenciesAsync(cancellationToken).ConfigureAwait(false);
        if (documentFrequencies.Count == 0)
        {
            _logger?.LogWarning("IDF refresh skipped: no document frequencies");
            return;
        }

        await _idfService.UpdateCorpusStatsAsync(corpusSize, documentFrequencies, cancellationToken).ConfigureAwait(false);

        _logger?.LogInformation(
            "IDF refresh completed: corpus={CorpusSize}, packages={PackageCount}",
            corpusSize,
            documentFrequencies.Count);
    }
}

View File

@@ -0,0 +1,249 @@
// -----------------------------------------------------------------------------
// PackageIdfMetrics.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: OpenTelemetry metrics for package IDF caching operations
// -----------------------------------------------------------------------------
using System.Diagnostics;
using System.Diagnostics.Metrics;
using System.Threading;
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Metrics instrumentation for the package IDF cache.
/// </summary>
public sealed class PackageIdfMetrics : IDisposable
{
    /// <summary>
    /// Activity source name for IDF cache operations.
    /// </summary>
    public const string ActivitySourceName = "StellaOps.Concelier.PackageIdf";

    /// <summary>
    /// Meter name for IDF cache metrics.
    /// </summary>
    public const string MeterName = "StellaOps.Concelier.PackageIdf";

    private readonly Meter _meter;
    private readonly Counter<long> _hitsCounter;
    private readonly Counter<long> _missesCounter;
    private readonly Counter<long> _refreshCounter;
    private readonly Histogram<double> _latencyHistogram;
    private readonly Histogram<double> _idfWeightHistogram;
    private readonly ObservableGauge<long> _corpusSizeGauge;
    private readonly ObservableGauge<long> _cachedEntriesGauge;

    // FIX: these are written by callers and read by the ObservableGauge callbacks
    // on the metrics-collection thread. Access via Interlocked so reads of the
    // 64-bit values are atomic (plain long reads/writes can tear on 32-bit) and
    // updates are immediately visible to the collector.
    private long _lastKnownCorpusSize;
    private long _lastKnownCachedEntries;

    /// <summary>
    /// Activity source for tracing IDF cache operations.
    /// </summary>
    public static ActivitySource ActivitySource { get; } = new(ActivitySourceName, "1.0.0");

    /// <summary>
    /// Initializes a new instance of <see cref="PackageIdfMetrics"/>,
    /// creating the meter and all counters/histograms/gauges.
    /// </summary>
    public PackageIdfMetrics()
    {
        _meter = new Meter(MeterName, "1.0.0");
        _hitsCounter = _meter.CreateCounter<long>(
            "concelier_linkset_package_idf_hits_total",
            unit: "{hits}",
            description: "Total number of package IDF cache hits");
        _missesCounter = _meter.CreateCounter<long>(
            "concelier_linkset_package_idf_misses_total",
            unit: "{misses}",
            description: "Total number of package IDF cache misses");
        _refreshCounter = _meter.CreateCounter<long>(
            "concelier_linkset_package_idf_refreshes_total",
            unit: "{refreshes}",
            description: "Total number of IDF corpus refresh operations");
        _latencyHistogram = _meter.CreateHistogram<double>(
            "concelier_linkset_package_idf_latency_ms",
            unit: "ms",
            description: "Package IDF cache operation latency in milliseconds");
        _idfWeightHistogram = _meter.CreateHistogram<double>(
            "concelier_linkset_package_idf_weight",
            unit: "{weight}",
            description: "Distribution of package IDF weights (0.0-1.0)");
        _corpusSizeGauge = _meter.CreateObservableGauge(
            "concelier_linkset_package_idf_corpus_size",
            () => Interlocked.Read(ref _lastKnownCorpusSize),
            unit: "{observations}",
            description: "Total number of observations in the IDF corpus");
        _cachedEntriesGauge = _meter.CreateObservableGauge(
            "concelier_linkset_package_idf_cached_entries",
            () => Interlocked.Read(ref _lastKnownCachedEntries),
            unit: "{entries}",
            description: "Number of cached IDF entries");
    }

    /// <summary>
    /// Records a cache hit.
    /// </summary>
    public void RecordHit() => _hitsCounter.Add(1);

    /// <summary>
    /// Records multiple cache hits.
    /// </summary>
    /// <param name="count">Number of hits.</param>
    public void RecordHits(long count) => _hitsCounter.Add(count);

    /// <summary>
    /// Records a cache miss.
    /// </summary>
    public void RecordMiss() => _missesCounter.Add(1);

    /// <summary>
    /// Records multiple cache misses.
    /// </summary>
    /// <param name="count">Number of misses.</param>
    public void RecordMisses(long count) => _missesCounter.Add(count);

    /// <summary>
    /// Records a corpus refresh operation.
    /// </summary>
    /// <param name="packageCount">Number of packages refreshed.</param>
    public void RecordRefresh(long packageCount = 1)
    {
        _refreshCounter.Add(1, new KeyValuePair<string, object?>("package_count", packageCount));
    }

    /// <summary>
    /// Records operation latency.
    /// </summary>
    /// <param name="milliseconds">Latency in milliseconds.</param>
    /// <param name="operation">The operation type (get, set, batch_get, refresh).</param>
    public void RecordLatency(double milliseconds, string operation)
    {
        _latencyHistogram.Record(milliseconds, new KeyValuePair<string, object?>("operation", operation));
    }

    /// <summary>
    /// Records an IDF weight observation for distribution analysis.
    /// </summary>
    /// <param name="weight">The IDF weight (0.0-1.0).</param>
    public void RecordIdfWeight(double weight)
    {
        _idfWeightHistogram.Record(weight);
    }

    /// <summary>
    /// Updates the corpus size gauge.
    /// </summary>
    /// <param name="size">Current corpus size.</param>
    public void UpdateCorpusSize(long size)
    {
        Interlocked.Exchange(ref _lastKnownCorpusSize, size);
    }

    /// <summary>
    /// Updates the cached entries gauge.
    /// </summary>
    /// <param name="count">Current cached entry count.</param>
    public void UpdateCachedEntries(long count)
    {
        Interlocked.Exchange(ref _lastKnownCachedEntries, count);
    }

    /// <summary>
    /// Starts an activity for tracing an IDF cache operation.
    /// </summary>
    /// <param name="operationName">Name of the operation.</param>
    /// <returns>The activity, or null if tracing is disabled.</returns>
    public static Activity? StartActivity(string operationName)
    {
        return ActivitySource.StartActivity(operationName, ActivityKind.Internal);
    }

    /// <summary>
    /// Starts an activity with tags.
    /// </summary>
    /// <param name="operationName">Name of the operation.</param>
    /// <param name="tags">Tags to add to the activity.</param>
    /// <returns>The activity, or null if tracing is disabled.</returns>
    public static Activity? StartActivity(string operationName, params (string Key, object? Value)[] tags)
    {
        var activity = ActivitySource.StartActivity(operationName, ActivityKind.Internal);
        if (activity is not null)
        {
            foreach (var (key, value) in tags)
            {
                activity.SetTag(key, value);
            }
        }
        return activity;
    }

    /// <inheritdoc />
    public void Dispose()
    {
        _meter.Dispose();
    }
}
/// <summary>
/// Extension methods for timing IDF cache operations.
/// </summary>
public static class PackageIdfMetricsExtensions
{
    /// <summary>
    /// Runs <paramref name="action"/> and, when <paramref name="metrics"/> is present,
    /// records its wall-clock duration under <paramref name="operation"/>.
    /// A null metrics instance is a pure pass-through.
    /// </summary>
    public static async Task<T> TimeAsync<T>(
        this PackageIdfMetrics? metrics,
        string operation,
        Func<Task<T>> action)
    {
        var stopwatch = metrics is null ? null : Stopwatch.StartNew();
        try
        {
            return await action().ConfigureAwait(false);
        }
        finally
        {
            if (stopwatch is not null)
            {
                stopwatch.Stop();
                metrics!.RecordLatency(stopwatch.Elapsed.TotalMilliseconds, operation);
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="action"/> and, when <paramref name="metrics"/> is present,
    /// records its wall-clock duration under <paramref name="operation"/>.
    /// A null metrics instance is a pure pass-through.
    /// </summary>
    public static async Task TimeAsync(
        this PackageIdfMetrics? metrics,
        string operation,
        Func<Task> action)
    {
        var stopwatch = metrics is null ? null : Stopwatch.StartNew();
        try
        {
            await action().ConfigureAwait(false);
        }
        finally
        {
            if (stopwatch is not null)
            {
                stopwatch.Stop();
                metrics!.RecordLatency(stopwatch.Elapsed.TotalMilliseconds, operation);
            }
        }
    }
}

View File

@@ -32,6 +32,10 @@ public static class ServiceCollectionExtensions
services.Configure<ConcelierCacheOptions>(
configuration.GetSection(ConcelierCacheOptions.SectionName));
// Bind package IDF options (CORR-V2-007)
services.Configure<PackageIdfOptions>(
configuration.GetSection(PackageIdfOptions.SectionName));
return AddCoreServices(services, enableWarmup);
}
@@ -39,16 +43,23 @@ public static class ServiceCollectionExtensions
/// Adds Concelier Valkey cache services with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure options.</param>
/// <param name="configureOptions">Action to configure cache options.</param>
/// <param name="configureIdfOptions">Optional action to configure IDF options.</param>
/// <param name="enableWarmup">Whether to enable background cache warmup.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddConcelierValkeyCache(
    this IServiceCollection services,
    Action<ConcelierCacheOptions> configureOptions,
    Action<PackageIdfOptions>? configureIdfOptions = null,
    bool enableWarmup = true)
{
    services.Configure(configureOptions);
    // IDF options are bound only when the caller supplies a delegate; otherwise
    // the PackageIdfOptions defaults (enabled, 1h IDF TTL) remain in effect.
    if (configureIdfOptions is not null)
    {
        services.Configure(configureIdfOptions);
    }
    return AddCoreServices(services, enableWarmup);
}
@@ -59,9 +70,11 @@ public static class ServiceCollectionExtensions
// Register metrics
services.TryAddSingleton<ConcelierCacheMetrics>();
services.TryAddSingleton<PackageIdfMetrics>();
// Register cache service
// Register cache services
services.TryAddSingleton<IAdvisoryCacheService, ValkeyAdvisoryCacheService>();
services.TryAddSingleton<IPackageIdfService, ValkeyPackageIdfService>();
// Register warmup hosted service if enabled
if (enableWarmup)
@@ -69,6 +82,10 @@ public static class ServiceCollectionExtensions
services.AddHostedService<CacheWarmupHostedService>();
}
// Register IDF refresh hosted service (CORR-V2-007)
// Note: Requires IIdfCorpusProvider to be registered by Concelier.Core
services.AddHostedService<IdfRefreshHostedService>();
return services;
}

View File

@@ -0,0 +1,421 @@
// -----------------------------------------------------------------------------
// ValkeyPackageIdfService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-007
// Description: Valkey-backed implementation of IPackageIdfService
// -----------------------------------------------------------------------------
using System.Diagnostics;
using System.Globalization;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StackExchange.Redis;
namespace StellaOps.Concelier.Cache.Valkey;
/// <summary>
/// Valkey-backed implementation of <see cref="IPackageIdfService"/>.
/// Provides caching for package IDF (Inverse Document Frequency) weights
/// used in linkset correlation scoring.
/// </summary>
/// <remarks>
/// <para>
/// This service caches pre-computed IDF weights with hourly refresh.
/// On cache miss, it returns null to signal the caller should use uniform weights.
/// </para>
/// <para>
/// Key features:
/// - Batch operations for efficient multi-package lookups
/// - Graceful degradation on Valkey errors (returns null, logs warning)
/// - TTL-based expiration with configurable refresh intervals
/// - OpenTelemetry metrics for monitoring cache performance
/// </para>
/// </remarks>
public sealed class ValkeyPackageIdfService : IPackageIdfService
{
private readonly ConcelierCacheConnectionFactory _connectionFactory;
private readonly ConcelierCacheOptions _cacheOptions;
private readonly PackageIdfOptions _idfOptions;
private readonly PackageIdfMetrics? _metrics;
private readonly ILogger<ValkeyPackageIdfService>? _logger;
/// <summary>
/// Initializes a new instance of <see cref="ValkeyPackageIdfService"/>.
/// </summary>
public ValkeyPackageIdfService(
ConcelierCacheConnectionFactory connectionFactory,
IOptions<ConcelierCacheOptions> cacheOptions,
IOptions<PackageIdfOptions> idfOptions,
PackageIdfMetrics? metrics = null,
ILogger<ValkeyPackageIdfService>? logger = null)
{
_connectionFactory = connectionFactory ?? throw new ArgumentNullException(nameof(connectionFactory));
_cacheOptions = cacheOptions?.Value ?? new ConcelierCacheOptions();
_idfOptions = idfOptions?.Value ?? new PackageIdfOptions();
_metrics = metrics;
_logger = logger;
}
/// <inheritdoc />
public bool IsEnabled => _cacheOptions.Enabled && _idfOptions.Enabled;
/// <inheritdoc />
public async Task<double?> GetIdfAsync(string packageName, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return null;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);
var cached = await db.StringGetAsync(key).ConfigureAwait(false);
if (cached.HasValue && double.TryParse((string?)cached, NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
{
await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsHits(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
_metrics?.RecordHit();
_metrics?.RecordIdfWeight(weight);
return weight;
}
await db.StringIncrementAsync(AdvisoryCacheKeys.IdfStatsMisses(_cacheOptions.KeyPrefix)).ConfigureAwait(false);
_metrics?.RecordMiss();
return null;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to get IDF for package {PackageName}", packageName);
return null; // Graceful degradation
}
finally
{
StopTiming(sw, "get");
}
}
/// <inheritdoc />
public async Task<IReadOnlyDictionary<string, double>> GetIdfBatchAsync(
IEnumerable<string> packageNames,
CancellationToken cancellationToken = default)
{
var names = packageNames?.Where(n => !string.IsNullOrWhiteSpace(n)).Distinct().ToArray()
?? Array.Empty<string>();
if (!IsEnabled || names.Length == 0)
{
return new Dictionary<string, double>();
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var keys = names.Select(n => (RedisKey)AdvisoryCacheKeys.IdfPackage(n, _cacheOptions.KeyPrefix)).ToArray();
var values = await db.StringGetAsync(keys).ConfigureAwait(false);
var result = new Dictionary<string, double>(names.Length);
var hits = 0;
var misses = 0;
for (var i = 0; i < names.Length; i++)
{
if (values[i].HasValue &&
double.TryParse((string?)values[i], NumberStyles.Float, CultureInfo.InvariantCulture, out var weight))
{
result[names[i]] = weight;
hits++;
_metrics?.RecordIdfWeight(weight);
}
else
{
misses++;
}
}
if (hits > 0) _metrics?.RecordHits(hits);
if (misses > 0) _metrics?.RecordMisses(misses);
return result;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to batch get IDF for {Count} packages", names.Length);
return new Dictionary<string, double>();
}
finally
{
StopTiming(sw, "batch_get");
}
}
/// <inheritdoc />
public async Task SetIdfAsync(string packageName, double idfWeight, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return;
}
// Skip caching weights below threshold (very common packages)
if (idfWeight < _idfOptions.MinIdfThreshold)
{
return;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfPackage(packageName, _cacheOptions.KeyPrefix);
var value = idfWeight.ToString("F6", CultureInfo.InvariantCulture);
await db.StringSetAsync(key, value, _idfOptions.IdfTtl).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to set IDF for package {PackageName}", packageName);
}
finally
{
StopTiming(sw, "set");
}
}
/// <inheritdoc />
public async Task SetIdfBatchAsync(
IReadOnlyDictionary<string, double> idfWeights,
CancellationToken cancellationToken = default)
{
if (!IsEnabled || idfWeights is null || idfWeights.Count == 0)
{
return;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var entries = idfWeights
.Where(kv => !string.IsNullOrWhiteSpace(kv.Key) && kv.Value >= _idfOptions.MinIdfThreshold)
.Select(kv => new KeyValuePair<RedisKey, RedisValue>(
AdvisoryCacheKeys.IdfPackage(kv.Key, _cacheOptions.KeyPrefix),
kv.Value.ToString("F6", CultureInfo.InvariantCulture)))
.ToArray();
if (entries.Length == 0)
{
return;
}
// Use pipeline for batch set with TTL
var batch = db.CreateBatch();
var tasks = new List<Task>(entries.Length);
foreach (var entry in entries)
{
tasks.Add(batch.StringSetAsync(entry.Key, entry.Value, _idfOptions.IdfTtl));
}
batch.Execute();
await Task.WhenAll(tasks).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to batch set IDF for {Count} packages", idfWeights.Count);
}
finally
{
StopTiming(sw, "batch_set");
}
}
/// <inheritdoc />
public async Task UpdateCorpusStatsAsync(
long corpusSize,
IReadOnlyDictionary<string, long> documentFrequencies,
CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return;
}
var sw = StartTiming();
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;
// Update corpus size
await db.StringSetAsync(
AdvisoryCacheKeys.IdfCorpusSize(prefix),
corpusSize.ToString(CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl).ConfigureAwait(false);
// Compute and cache IDF weights
var idfWeights = new Dictionary<string, double>(documentFrequencies.Count);
var maxIdf = 0.0;
foreach (var (packageName, df) in documentFrequencies)
{
// IDF formula: log(N / (1 + df))
var rawIdf = Math.Log((double)corpusSize / (1 + df));
if (rawIdf > maxIdf) maxIdf = rawIdf;
idfWeights[packageName] = rawIdf;
}
// Normalize if configured
if (_idfOptions.NormalizeScores && maxIdf > 0)
{
foreach (var key in idfWeights.Keys.ToArray())
{
idfWeights[key] /= maxIdf;
}
}
// Batch set the normalized IDF weights
await SetIdfBatchAsync(idfWeights, cancellationToken).ConfigureAwait(false);
// Update document frequencies
var batch = db.CreateBatch();
var tasks = new List<Task>(documentFrequencies.Count);
foreach (var (packageName, df) in documentFrequencies)
{
tasks.Add(batch.StringSetAsync(
AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix),
df.ToString(CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl));
}
batch.Execute();
await Task.WhenAll(tasks).ConfigureAwait(false);
// Update last refresh timestamp
await db.StringSetAsync(
AdvisoryCacheKeys.IdfLastRefresh(prefix),
DateTimeOffset.UtcNow.ToString("o", CultureInfo.InvariantCulture),
_idfOptions.CorpusStatsTtl).ConfigureAwait(false);
_metrics?.UpdateCorpusSize(corpusSize);
_metrics?.UpdateCachedEntries(documentFrequencies.Count);
_metrics?.RecordRefresh(documentFrequencies.Count);
_logger?.LogInformation(
"Updated IDF corpus: size={CorpusSize}, packages={PackageCount}",
corpusSize,
documentFrequencies.Count);
}
catch (Exception ex)
{
_logger?.LogError(ex, "Failed to update IDF corpus stats");
}
finally
{
StopTiming(sw, "refresh");
}
}
/// <inheritdoc />
public async Task<DateTimeOffset?> GetLastRefreshAsync(CancellationToken cancellationToken = default)
{
if (!IsEnabled)
{
return null;
}
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var key = AdvisoryCacheKeys.IdfLastRefresh(_cacheOptions.KeyPrefix);
var cached = await db.StringGetAsync(key).ConfigureAwait(false);
if (cached.HasValue &&
DateTimeOffset.TryParse(cached, CultureInfo.InvariantCulture, DateTimeStyles.RoundtripKind, out var timestamp))
{
return timestamp;
}
return null;
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to get IDF last refresh timestamp");
return null;
}
}
/// <inheritdoc />
public async Task InvalidateAsync(string packageName, CancellationToken cancellationToken = default)
{
if (!IsEnabled || string.IsNullOrWhiteSpace(packageName))
{
return;
}
try
{
var db = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
var prefix = _cacheOptions.KeyPrefix;
await Task.WhenAll(
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfPackage(packageName, prefix)),
db.KeyDeleteAsync(AdvisoryCacheKeys.IdfDocumentFrequency(packageName, prefix))
).ConfigureAwait(false);
}
catch (Exception ex)
{
_logger?.LogWarning(ex, "Failed to invalidate IDF for package {PackageName}", packageName);
}
}
/// <inheritdoc />
public async Task InvalidateAllAsync(CancellationToken cancellationToken = default)
{
    if (!IsEnabled)
    {
        return;
    }

    try
    {
        var database = await _connectionFactory.GetDatabaseAsync(cancellationToken).ConfigureAwait(false);
        var keyPrefix = _cacheOptions.KeyPrefix;

        // Remove only the aggregate stats keys; per-package keys are left to expire.
        var statsDeletions = new[]
        {
            database.KeyDeleteAsync(AdvisoryCacheKeys.IdfCorpusSize(keyPrefix)),
            database.KeyDeleteAsync(AdvisoryCacheKeys.IdfLastRefresh(keyPrefix)),
            database.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsHits(keyPrefix)),
            database.KeyDeleteAsync(AdvisoryCacheKeys.IdfStatsMisses(keyPrefix))
        };
        await Task.WhenAll(statsDeletions).ConfigureAwait(false);

        // Note: Scanning and deleting all idf:pkg:* keys would require SCAN,
        // which is expensive. For now, rely on TTL expiration.
        _logger?.LogInformation("Invalidated IDF stats; individual package keys will expire via TTL");
    }
    catch (Exception ex)
    {
        _logger?.LogError(ex, "Failed to invalidate all IDF cache");
    }
}
// Starts a latency stopwatch, or returns null when metrics are not wired up
// so the timing overhead is skipped entirely.
private Stopwatch? StartTiming() => _metrics is null ? null : Stopwatch.StartNew();
// Stops a stopwatch started by StartTiming (when timing was active) and records
// the elapsed milliseconds against the given operation name.
private void StopTiming(Stopwatch? sw, string operation)
{
    if (sw is not null && _metrics is not null)
    {
        sw.Stop();
        _metrics.RecordLatency(sw.Elapsed.TotalMilliseconds, operation);
    }
}
}

View File

@@ -40,11 +40,33 @@ public sealed record AdvisoryLinksetProvenance(
string? ToolVersion,
string? PolicyHash);
/// <summary>
/// Conflict severity levels for typed penalty calculation.
/// </summary>
/// <remarks>
/// The severity classifies how strongly a conflict should count against linkset
/// confidence; concrete penalty amounts are keyed off the conflict reason string.
/// </remarks>
public enum ConflictSeverity
{
    /// <summary>No penalty; informational only.</summary>
    Info = 0,
    /// <summary>Minor disagreement; small penalty.</summary>
    Soft = 1,
    /// <summary>Significant disagreement; should usually prevent high-confidence linking.</summary>
    Hard = 2
}
/// <summary>
/// A detected disagreement between observations in a linkset.
/// </summary>
/// <param name="Field">The linkset field the conflict applies to (e.g. "aliases").</param>
/// <param name="Reason">Machine-readable conflict reason (e.g. "distinct-cves").</param>
/// <param name="Values">Conflicting values, normalized and sorted for determinism.</param>
/// <param name="SourceIds">Identifiers of the sources contributing the conflict, when known.</param>
// FIX: the span previously contained both the old single-line record terminator and the
// new parameter list (diff residue), which is not valid C#; this is the intended final form.
public sealed record AdvisoryLinksetConflict(
    string Field,
    string Reason,
    IReadOnlyList<string>? Values,
    IReadOnlyList<string>? SourceIds = null)
{
    /// <summary>
    /// Severity of the conflict. Defaults to <see cref="ConflictSeverity.Soft"/>.
    /// Hard conflicts significantly impact confidence; Info conflicts are purely informational.
    /// </summary>
    public ConflictSeverity Severity { get; init; } = ConflictSeverity.Soft;
}
internal static class DocumentHelper
{

View File

@@ -0,0 +1,73 @@
// -----------------------------------------------------------------------------
// ILinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Abstraction for linkset correlation with V1/V2 support
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using StellaOps.Concelier.Models;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Service for computing linkset correlation confidence and conflicts.
/// Supports multiple correlation algorithm versions (V1, V2).
/// </summary>
public interface ILinksetCorrelationService
{
    /// <summary>
    /// Gets the correlation algorithm version being used.
    /// </summary>
    string Version { get; }
    /// <summary>
    /// Computes correlation confidence and conflicts for a set of observation inputs.
    /// </summary>
    /// <param name="inputs">Observation inputs to correlate.</param>
    /// <param name="additionalConflicts">Optional externally detected conflicts merged into the result.</param>
    /// <returns>A confidence score in [0, 1] together with the detected conflicts.</returns>
    (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null);
}
/// <summary>
/// Unified input model for correlation computation.
/// </summary>
/// <param name="ObservationId">Identifier of the source observation.</param>
/// <param name="Vendor">Source vendor name, when known.</param>
/// <param name="FetchedAt">Timestamp the observation was fetched, when known.</param>
/// <param name="Aliases">Advisory aliases (e.g. CVE/GHSA identifiers).</param>
/// <param name="Purls">Package URLs referenced by the observation.</param>
/// <param name="Cpes">CPE identifiers referenced by the observation.</param>
/// <param name="References">Reference URLs cited by the observation.</param>
/// <param name="PatchReferences">Optional patch references (commit/PR URLs); V2 only.</param>
public sealed record CorrelationInput(
    string ObservationId,
    string? Vendor,
    DateTimeOffset? FetchedAt,
    IReadOnlyCollection<string> Aliases,
    IReadOnlyCollection<string> Purls,
    IReadOnlyCollection<string> Cpes,
    IReadOnlyCollection<string> References,
    IReadOnlyCollection<string>? PatchReferences = null);
/// <summary>
/// Configuration for the correlation service.
/// </summary>
public sealed class CorrelationServiceOptions
{
    /// <summary>
    /// Correlation algorithm version. Supported values: "v1", "v2".
    /// Default: "v1" for backward compatibility.
    /// Unrecognized values are treated as "v1" by the service.
    /// </summary>
    public string Version { get; set; } = "v1";
    /// <summary>
    /// Optional custom weights for V2 correlation signals.
    /// Keys: aliasConnectivity, aliasAuthority, packageCoverage, versionCompatibility,
    /// cpeMatch, patchLineage, referenceOverlap, freshness
    /// </summary>
    /// <remarks>
    /// NOTE(review): not consumed anywhere in this file (the V2 algorithm uses its
    /// built-in constant weights) — verify the override wiring exists elsewhere.
    /// </remarks>
    public Dictionary<string, double>? Weights { get; set; }
    /// <summary>
    /// Whether to enable IDF weighting for package keys (V2 only).
    /// </summary>
    public bool EnableIdfWeighting { get; set; } = true;
    /// <summary>
    /// Whether to enable text similarity scoring (V2 Phase 3, disabled by default).
    /// </summary>
    public bool EnableTextSimilarity { get; set; } = false;
}

View File

@@ -0,0 +1,104 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationService.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-008
// Description: Implementation of ILinksetCorrelationService with V1/V2 support
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.Concelier.Models;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Default implementation of <see cref="ILinksetCorrelationService"/>.
/// Supports V1 (intersection-based) and V2 (graph-based) correlation algorithms.
/// </summary>
public sealed class LinksetCorrelationService : ILinksetCorrelationService
{
    private readonly CorrelationServiceOptions _options;
    private readonly ILogger<LinksetCorrelationService> _logger;
    private readonly Func<string, double>? _idfProvider;

    /// <summary>
    /// Creates the correlation service.
    /// </summary>
    /// <param name="options">Correlation configuration; defaults are used when null.</param>
    /// <param name="logger">Logger for diagnostics. Must not be null.</param>
    /// <param name="idfProvider">Optional IDF weight lookup for package keys (used by V2 when enabled).</param>
    public LinksetCorrelationService(
        IOptions<CorrelationServiceOptions> options,
        ILogger<LinksetCorrelationService> logger,
        Func<string, double>? idfProvider = null)
    {
        _options = options?.Value ?? new CorrelationServiceOptions();
        // FIX: fail fast instead of NRE-ing later in ComputeV2's LogDebug call.
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _idfProvider = idfProvider;
    }

    /// <inheritdoc />
    public string Version => string.Equals(_options.Version, "v2", StringComparison.OrdinalIgnoreCase)
        ? "v2"
        : "v1";

    /// <inheritdoc />
    public (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) Compute(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null)
    {
        // FIX: guard the public entry point; previously a null collection threw NRE on Count.
        ArgumentNullException.ThrowIfNull(inputs);

        // An empty cluster is trivially self-consistent.
        if (inputs.Count == 0)
        {
            return (1.0, Array.Empty<AdvisoryLinksetConflict>());
        }

        return Version switch
        {
            "v2" => ComputeV2(inputs, additionalConflicts),
            _ => ComputeV1(inputs, additionalConflicts)
        };
    }

    // Adapts the unified inputs to the V1 record shape and delegates to the V1 algorithm.
    private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV1(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
    {
        var v1Inputs = inputs.Select(i => new LinksetCorrelation.Input(
            Vendor: i.Vendor,
            FetchedAt: i.FetchedAt,
            Aliases: i.Aliases,
            Purls: i.Purls,
            Cpes: i.Cpes,
            References: i.References)).ToArray();
        return LinksetCorrelation.Compute(v1Inputs, additionalConflicts);
    }

    // Adapts the unified inputs to the V2 record shape, applies the IDF provider when
    // enabled, and logs the per-signal breakdown at debug level.
    private (double Confidence, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) ComputeV2(
        IReadOnlyCollection<CorrelationInput> inputs,
        IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts)
    {
        var v2Inputs = inputs.Select(i => new LinksetCorrelationV2.InputV2(
            ObservationId: i.ObservationId,
            Vendor: i.Vendor,
            FetchedAt: i.FetchedAt,
            Aliases: i.Aliases,
            Purls: i.Purls,
            Cpes: i.Cpes,
            References: i.References,
            PatchReferences: i.PatchReferences)).ToArray();

        var idfProvider = _options.EnableIdfWeighting ? _idfProvider : null;
        var result = LinksetCorrelationV2.Compute(v2Inputs, additionalConflicts, idfProvider);

        _logger.LogDebug(
            "V2 correlation computed: confidence={Confidence:F3}, conflicts={ConflictCount}, signals={Signals}",
            result.Confidence,
            result.Conflicts.Count,
            string.Join(", ", result.SignalScores.Select(kv => $"{kv.Key}={kv.Value:F2}")));

        return (result.Confidence, result.Conflicts);
    }
}

View File

@@ -0,0 +1,910 @@
// -----------------------------------------------------------------------------
// LinksetCorrelationV2.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-001 through CORR-V2-008
// Description: V2 correlation algorithm with graph-based alias connectivity,
// version compatibility scoring, patch lineage signals, and typed
// conflict severities.
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using StellaOps.Concelier.Models;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Version relationship classification for affected range comparison.
/// </summary>
/// <remarks>
/// Produced by comparing the version sets reported by different sources for the
/// same package key; drives the version-compatibility signal and its conflicts.
/// </remarks>
public enum VersionRelation
{
    /// <summary>Unable to determine relationship.</summary>
    Unknown = 0,
    /// <summary>Ranges normalize to identical primitives.</summary>
    Equivalent = 1,
    /// <summary>Ranges have non-empty intersection but are not equal.</summary>
    Overlapping = 2,
    /// <summary>Ranges have no intersection.</summary>
    Disjoint = 3
}
/// <summary>
/// V2 linkset correlation algorithm with graph-based connectivity,
/// typed conflict severities, and multi-signal scoring.
/// </summary>
/// <remarks>
/// Key improvements over V1:
/// - Alias matching uses graph connectivity (LCC ratio) instead of intersection-across-all
/// - PURL matching uses pairwise coverage instead of intersection-across-all
/// - Reference clash only emitted for true contradictions, not zero overlap
/// - Typed conflict severities with per-reason penalties
/// - Patch lineage as high-weight signal
/// - Version compatibility classification (equivalent/overlapping/disjoint)
/// </remarks>
internal static class LinksetCorrelationV2
{
/// <summary>
/// Default correlation weights. Can be overridden via configuration.
/// </summary>
/// <remarks>The eight weights sum to 1.0 so the weighted signal sum stays within [0, 1].</remarks>
internal static class Weights
{
    public const double AliasConnectivity = 0.30;
    public const double AliasAuthority = 0.10;
    public const double PackageCoverage = 0.20;
    public const double VersionCompatibility = 0.10;
    public const double CpeMatch = 0.10;
    public const double PatchLineage = 0.10;
    public const double ReferenceOverlap = 0.05;
    public const double Freshness = 0.05;
}
/// <summary>
/// Conflict penalties by severity and reason.
/// </summary>
/// <remarks>
/// Penalties are summed per conflict and saturated at 0.6 before being subtracted
/// from the weighted base confidence.
/// </remarks>
internal static class ConflictPenalties
{
    public const double DistinctCves = 0.40; // Hard: two different CVEs
    public const double DisjointVersionRanges = 0.30; // Hard: same pkg, no overlap
    public const double OverlappingRanges = 0.05; // Soft: ranges overlap but differ
    public const double SeverityMismatch = 0.05; // Soft: CVSS differs
    public const double AliasInconsistency = 0.10; // Soft: non-CVE alias mismatch
    public const double ZeroReferenceOverlap = 0.00; // Info: no penalty
}
/// <summary>
/// Per-observation correlation input: identity plus the alias/package/CPE/reference
/// evidence extracted from one source observation.
/// </summary>
internal readonly record struct InputV2(
    string ObservationId,
    string? Vendor,
    DateTimeOffset? FetchedAt,
    IReadOnlyCollection<string> Aliases,
    IReadOnlyCollection<string> Purls,
    IReadOnlyCollection<string> Cpes,
    IReadOnlyCollection<string> References,
    IReadOnlyCollection<string>? PatchReferences = null);
/// <summary>
/// Outcome of a V2 correlation pass: final confidence in [0, 1], the deduplicated
/// conflicts, and the raw per-signal scores used to build the confidence.
/// </summary>
internal readonly record struct CorrelationResult(
    double Confidence,
    IReadOnlyList<AdvisoryLinksetConflict> Conflicts,
    IReadOnlyDictionary<string, double> SignalScores);
/// <summary>
/// Computes correlation confidence and conflicts for a set of observations.
/// </summary>
/// <param name="inputs">Observation inputs; an empty collection yields full confidence.</param>
/// <param name="additionalConflicts">Optional externally detected conflicts merged in before penalty calculation.</param>
/// <param name="packageIdfProvider">Optional IDF weight lookup per package key; null disables IDF weighting.</param>
/// <returns>Final confidence in [0, 1], deduplicated conflicts, and per-signal scores.</returns>
internal static CorrelationResult Compute(
    IReadOnlyCollection<InputV2> inputs,
    IReadOnlyList<AdvisoryLinksetConflict>? additionalConflicts = null,
    Func<string, double>? packageIdfProvider = null)
{
    if (inputs.Count == 0)
    {
        return new CorrelationResult(
            1.0,
            Array.Empty<AdvisoryLinksetConflict>(),
            ImmutableDictionary<string, double>.Empty);
    }
    var conflicts = new List<AdvisoryLinksetConflict>();
    var signalScores = new Dictionary<string, double>();
    // 1. Alias connectivity (graph-based)
    var (aliasConnectivity, aliasConflicts) = CalculateAliasConnectivity(inputs);
    conflicts.AddRange(aliasConflicts);
    signalScores["aliasConnectivity"] = aliasConnectivity;
    // 2. Alias authority (scope-based weighting)
    var aliasAuthority = CalculateAliasAuthority(inputs);
    signalScores["aliasAuthority"] = aliasAuthority;
    // 3. Package coverage (pairwise + IDF)
    var (packageCoverage, packageConflicts) = CalculatePackageCoverage(inputs, packageIdfProvider);
    conflicts.AddRange(packageConflicts);
    signalScores["packageCoverage"] = packageCoverage;
    // 4. Version compatibility
    var (versionScore, versionConflicts) = CalculateVersionCompatibility(inputs);
    conflicts.AddRange(versionConflicts);
    signalScores["versionCompatibility"] = versionScore;
    // 5. CPE match (existing logic, minor adjustments)
    var cpeScore = CalculateCpeScore(inputs);
    signalScores["cpeMatch"] = cpeScore;
    // 6. Patch lineage
    var patchScore = CalculatePatchLineageScore(inputs);
    signalScores["patchLineage"] = patchScore;
    // 7. Reference overlap (positive-only, no conflict on zero)
    var referenceScore = CalculateReferenceScore(inputs);
    signalScores["referenceOverlap"] = referenceScore;
    // 8. Freshness
    var freshnessScore = CalculateFreshnessScore(inputs);
    signalScores["freshness"] = freshnessScore;
    // Calculate base confidence from weighted signals (weights sum to 1.0, so this stays in [0, 1]).
    var baseConfidence = Clamp01(
        (Weights.AliasConnectivity * aliasConnectivity) +
        (Weights.AliasAuthority * aliasAuthority) +
        (Weights.PackageCoverage * packageCoverage) +
        (Weights.VersionCompatibility * versionScore) +
        (Weights.CpeMatch * cpeScore) +
        (Weights.PatchLineage * patchScore) +
        (Weights.ReferenceOverlap * referenceScore) +
        (Weights.Freshness * freshnessScore));
    // Add additional conflicts before penalty calculation
    if (additionalConflicts is { Count: > 0 })
    {
        conflicts.AddRange(additionalConflicts);
    }
    // Apply typed conflict penalties
    var totalPenalty = CalculateTypedPenalty(conflicts);
    var finalConfidence = Clamp01(baseConfidence - totalPenalty);
    // Ensure minimum confidence when conflicts exist but evidence is present
    if (finalConfidence < 0.1 && baseConfidence > 0)
    {
        finalConfidence = 0.1;
    }
    return new CorrelationResult(
        finalConfidence,
        DeduplicateAndSort(conflicts, inputs),
        signalScores.ToImmutableDictionary());
}
#region Alias Connectivity (Graph-based)
/// <summary>
/// Calculates alias connectivity using bipartite graph analysis.
/// Returns LCC (largest connected component) ratio instead of intersection.
/// </summary>
/// <param name="inputs">Observations whose aliases form the connectivity graph.</param>
/// <returns>The LCC ratio in [0, 1] plus any alias identity conflicts detected.</returns>
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateAliasConnectivity(
    IReadOnlyCollection<InputV2> inputs)
{
    var conflicts = new List<AdvisoryLinksetConflict>();
    if (inputs.Count == 1)
    {
        // A single observation is fully "connected" iff it carries at least one alias.
        return (inputs.First().Aliases.Count > 0 ? 1d : 0d, conflicts);
    }
    // Build bipartite graph: observation nodes + alias nodes.
    // NOTE(review): assumes ObservationId values are unique within the cluster;
    // ToDictionary throws on duplicates — confirm upstream guarantees this.
    var observationToAliases = inputs
        .ToDictionary(
            i => i.ObservationId,
            i => i.Aliases.Select(a => a.ToUpperInvariant()).ToHashSet(StringComparer.Ordinal));
    var allAliases = observationToAliases.Values.SelectMany(a => a).ToHashSet(StringComparer.Ordinal);
    if (allAliases.Count == 0)
    {
        return (0d, conflicts);
    }
    // Union-find over observation ids; observations sharing any alias are merged.
    var observationIds = inputs.Select(i => i.ObservationId).ToList();
    var parent = observationIds.ToDictionary(id => id, id => id);
    string Find(string x)
    {
        if (parent[x] != x)
            parent[x] = Find(parent[x]); // path compression
        return parent[x];
    }
    void Union(string x, string y)
    {
        var px = Find(x);
        var py = Find(y);
        if (px != py)
            parent[px] = py;
    }
    // Connect observations that share any alias
    foreach (var alias in allAliases)
    {
        var observationsWithAlias = observationIds
            .Where(id => observationToAliases[id].Contains(alias))
            .ToList();
        for (int i = 1; i < observationsWithAlias.Count; i++)
        {
            Union(observationsWithAlias[0], observationsWithAlias[i]);
        }
    }
    // LCC ratio = size of the largest connected component over the observation count.
    var componentSizes = observationIds
        .GroupBy(Find)
        .Select(g => g.Count())
        .ToList();
    var largestComponent = componentSizes.Max();
    var lccRatio = (double)largestComponent / observationIds.Count;
    // Check for distinct CVEs (true identity conflict)
    var cveAliases = allAliases
        .Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase))
        .ToHashSet(StringComparer.OrdinalIgnoreCase);
    if (cveAliases.Count > 1)
    {
        // Multiple distinct CVEs in cluster = hard conflict
        var values = inputs
            .Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases.Where(a => a.StartsWith("CVE-", StringComparison.OrdinalIgnoreCase)))}")
            // FIX: use ordinal comparison; the parameterless EndsWith overload is
            // culture-sensitive and can misbehave under some locales (CA1310).
            .Where(v => !v.EndsWith(":<none>", StringComparison.Ordinal))
            .OrderBy(v => v, StringComparer.Ordinal)
            .ToArray();
        if (values.Length > 1)
        {
            conflicts.Add(new AdvisoryLinksetConflict(
                "aliases",
                "distinct-cves",
                values)
            {
                Severity = ConflictSeverity.Hard
            });
        }
    }
    else if (lccRatio < 1.0 && allAliases.Count > 0)
    {
        // Disconnected observations but no CVE conflict = soft inconsistency
        var disconnectedObs = observationIds
            .Where(id => Find(id) != Find(observationIds[0]))
            .Select(id => inputs.First(i => i.ObservationId == id))
            .Select(i => $"{i.Vendor ?? "source"}:{FirstSortedOrDefault(i.Aliases)}")
            .OrderBy(v => v, StringComparer.Ordinal)
            .ToArray();
        if (disconnectedObs.Length > 0)
        {
            conflicts.Add(new AdvisoryLinksetConflict(
                "aliases",
                "alias-inconsistency",
                disconnectedObs)
            {
                Severity = ConflictSeverity.Soft
            });
        }
    }
    return (lccRatio, conflicts);
}
/// <summary>
/// Calculates alias authority score based on scope hierarchy.
/// CVE (global) > ECO (ecosystem) > VND (vendor) > DST (distribution).
/// </summary>
private static double CalculateAliasAuthority(IReadOnlyCollection<InputV2> inputs)
{
    var aliasPool = inputs
        .SelectMany(i => i.Aliases)
        .ToHashSet(StringComparer.OrdinalIgnoreCase);
    if (aliasPool.Count == 0)
    {
        return 0d;
    }

    // True when any alias in the pool starts with one of the given scheme prefixes.
    bool HasScheme(params string[] prefixes)
        => aliasPool.Any(alias => prefixes.Any(p => alias.StartsWith(p, StringComparison.OrdinalIgnoreCase)));

    // Highest-authority scheme present wins.
    if (HasScheme("CVE-"))
    {
        return 1.0;
    }
    if (HasScheme("GHSA-"))
    {
        return 0.8;
    }
    if (HasScheme("RHSA-", "MSRC-", "CISCO-", "VMSA-"))
    {
        return 0.6;
    }
    if (HasScheme("DSA-", "USN-", "SUSE-"))
    {
        return 0.4;
    }

    // Unknown alias scheme.
    return 0.2;
}
#endregion
#region Package Coverage (Pairwise + IDF)
/// <summary>
/// Calculates package coverage using pairwise overlap instead of intersection-across-all.
/// A thin source with no packages does not collapse the score.
/// </summary>
/// <param name="inputs">Observations to compare.</param>
/// <param name="idfProvider">Optional IDF weight lookup per package key; null means uniform weights.</param>
/// <returns>Coverage score in [0, 1]; this signal currently emits no conflicts
/// (version divergence is reported by the version-compatibility signal).</returns>
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculatePackageCoverage(
    IReadOnlyCollection<InputV2> inputs,
    Func<string, double>? idfProvider = null)
{
    var conflicts = new List<AdvisoryLinksetConflict>();
    var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
    if (inputsWithPackages.Count == 0)
    {
        return (0d, conflicts);
    }
    if (inputsWithPackages.Count == 1)
    {
        // FIX: the filter above guarantees Purls.Count > 0 here, so the previous
        // conditional expression was dead code; a single package-bearing source scores 1.
        return (1d, conflicts);
    }
    // Extract package keys (without version)
    var packageKeysPerInput = inputsWithPackages
        .Select(i => i.Purls
            .Select(ExtractPackageKey)
            .Where(k => !string.IsNullOrWhiteSpace(k))
            .ToHashSet(StringComparer.Ordinal))
        .ToList();
    // Calculate pairwise overlap with optional IDF weighting
    var totalWeight = 0d;
    var matchedWeight = 0d;
    var allPackages = packageKeysPerInput.SelectMany(p => p).ToHashSet(StringComparer.Ordinal);
    foreach (var pkg in allPackages)
    {
        var idfWeight = idfProvider?.Invoke(pkg) ?? 1.0;
        var inputsWithPkg = packageKeysPerInput.Count(set => set.Contains(pkg));
        totalWeight += idfWeight;
        if (inputsWithPkg > 1)
        {
            // Package appears in multiple sources = positive signal
            matchedWeight += idfWeight * ((double)inputsWithPkg / inputsWithPackages.Count);
        }
    }
    var score = totalWeight > 0 ? matchedWeight / totalWeight : 0d;
    // Check for exact PURL overlap (with version)
    var hasExactOverlap = HasExactPurlOverlap(inputsWithPackages);
    if (hasExactOverlap)
    {
        score = Math.Max(score, 0.8); // Boost for exact match
    }
    // Range divergence is reported by CalculateVersionCompatibility; not duplicated here.
    return (Clamp01(score), conflicts);
}
#endregion
#region Version Compatibility
/// <summary>
/// Classifies version relationships for shared packages.
/// Equivalent sets score 1.0; overlapping 0.6 (soft conflict); disjoint 0.0
/// (hard conflict); 0.5 is the neutral score when nothing is comparable.
/// </summary>
private static (double Score, IReadOnlyList<AdvisoryLinksetConflict> Conflicts) CalculateVersionCompatibility(
    IReadOnlyCollection<InputV2> inputs)
{
    var conflicts = new List<AdvisoryLinksetConflict>();
    var inputsWithPackages = inputs.Where(i => i.Purls.Count > 0).ToList();
    if (inputsWithPackages.Count < 2)
    {
        return (0.5d, conflicts); // Neutral when no comparison possible
    }
    // Find shared package keys
    var packageKeysPerInput = inputsWithPackages
        .Select(i => i.Purls
            .Select(ExtractPackageKey)
            .Where(k => !string.IsNullOrWhiteSpace(k))
            .ToHashSet(StringComparer.Ordinal))
        .ToList();
    var sharedPackages = packageKeysPerInput
        .Skip(1)
        .Aggregate(
            new HashSet<string>(packageKeysPerInput[0], StringComparer.Ordinal),
            (acc, next) =>
            {
                acc.IntersectWith(next);
                return acc;
            });
    if (sharedPackages.Count == 0)
    {
        return (0.5d, conflicts); // Neutral when no shared packages
    }
    var totalScore = 0d;
    var packageCount = 0;
    foreach (var packageKey in sharedPackages)
    {
        var versionsPerSource = inputsWithPackages
            .Select(i => new
            {
                i.Vendor,
                Versions = i.Purls
                    .Where(p => ExtractPackageKey(p) == packageKey)
                    .Select(ExtractVersion)
                    .Where(v => !string.IsNullOrWhiteSpace(v))
                    .ToList()
            })
            .Where(x => x.Versions.Count > 0)
            .ToList();
        if (versionsPerSource.Count < 2)
            continue;
        packageCount++;
        // Classify relationship (simplified; full impl would use SemanticVersionRangeResolver)
        // FIX: removed an unused "allVersions" aggregation that was computed but never read.
        var relation = ClassifyVersionRelation(versionsPerSource.Select(v => v.Versions).ToList());
        switch (relation)
        {
            case VersionRelation.Equivalent:
                totalScore += 1.0;
                break;
            case VersionRelation.Overlapping:
                totalScore += 0.6;
                var overlapValues = versionsPerSource
                    .Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
                    .OrderBy(x => x, StringComparer.Ordinal)
                    .ToArray();
                conflicts.Add(new AdvisoryLinksetConflict(
                    $"affected.versions[{packageKey}]",
                    "affected-range-divergence",
                    overlapValues)
                {
                    Severity = ConflictSeverity.Soft
                });
                break;
            case VersionRelation.Disjoint:
                totalScore += 0.0;
                var disjointValues = versionsPerSource
                    .Select(v => $"{v.Vendor ?? "source"}:{string.Join(",", v.Versions.OrderBy(x => x))}")
                    .OrderBy(x => x, StringComparer.Ordinal)
                    .ToArray();
                conflicts.Add(new AdvisoryLinksetConflict(
                    $"affected.versions[{packageKey}]",
                    "disjoint-version-ranges",
                    disjointValues)
                {
                    Severity = ConflictSeverity.Hard
                });
                break;
            default:
                totalScore += 0.5; // Unknown = neutral
                break;
        }
    }
    var avgScore = packageCount > 0 ? totalScore / packageCount : 0.5;
    return (Clamp01(avgScore), conflicts);
}
// Classifies how the version sets relate: all sets identical => Equivalent;
// any version shared with the first set => Overlapping; otherwise Disjoint.
// The first set acts as the reference all others are compared against.
private static VersionRelation ClassifyVersionRelation(List<List<string>> versionSets)
{
    if (versionSets.Count < 2)
    {
        return VersionRelation.Unknown;
    }

    var reference = versionSets[0].ToHashSet(StringComparer.OrdinalIgnoreCase);
    var sawDifference = false;
    var sawOverlap = false;

    foreach (var candidate in versionSets.Skip(1))
    {
        var candidateSet = candidate.ToHashSet(StringComparer.OrdinalIgnoreCase);
        sawDifference |= !reference.SetEquals(candidateSet);
        sawOverlap |= reference.Overlaps(candidateSet);
    }

    if (!sawDifference)
    {
        return VersionRelation.Equivalent;
    }

    return sawOverlap ? VersionRelation.Overlapping : VersionRelation.Disjoint;
}
#endregion
#region Patch Lineage
/// <summary>
/// Calculates patch lineage correlation.
/// Exact commit SHA match is a very strong signal.
/// </summary>
private static double CalculatePatchLineageScore(IReadOnlyCollection<InputV2> inputs)
{
    var withPatches = inputs
        .Where(i => i.PatchReferences?.Count > 0)
        .ToList();
    if (withPatches.Count < 2)
    {
        // Need at least two sources carrying patch data to compare.
        return 0d;
    }

    // Normalize every patch reference (commit SHA / PR URL) per source, dropping
    // references that yield no recognizable SHA.
    var normalizedSets = withPatches
        .Select(i => i.PatchReferences!
            .Select(NormalizePatchReference)
            .OfType<string>()
            .ToHashSet(StringComparer.OrdinalIgnoreCase))
        .ToList();

    // Any shared normalized reference between any pair of sources = exact match.
    for (var i = 0; i < normalizedSets.Count; i++)
    {
        for (var j = i + 1; j < normalizedSets.Count; j++)
        {
            if (normalizedSets[i].Overlaps(normalizedSets[j]))
            {
                return 1.0;
            }
        }
    }

    return 0d;
}
/// <summary>
/// Normalizes a patch reference to a lowercase commit SHA, or null when no SHA is found.
/// Recognizes GitHub/GitLab commit URLs (7-40 hex chars) and bare 40-char hex SHAs.
/// </summary>
private static string? NormalizePatchReference(string reference)
{
    if (string.IsNullOrWhiteSpace(reference))
        return null;
    // FIX: use the static Regex.Match overloads so the parsed patterns are served
    // from the process-wide regex cache instead of a new Regex instance being
    // constructed (and re-parsed) on every call.
    var match = System.Text.RegularExpressions.Regex.Match(
        reference,
        @"(?:github\.com|gitlab\.com)/[^/]+/[^/]+(?:/-)?/commit/([0-9a-f]{7,40})",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    if (match.Success)
    {
        return match.Groups[1].Value.ToLowerInvariant();
    }
    // Fall back to a bare full-length (40 hex chars) SHA anywhere in the string.
    match = System.Text.RegularExpressions.Regex.Match(
        reference,
        @"\b([0-9a-f]{40})\b",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    if (match.Success)
    {
        return match.Groups[1].Value.ToLowerInvariant();
    }
    return null;
}
#endregion
#region Reference Score (Positive-Only)
/// <summary>
/// Calculates reference overlap as a positive-only signal.
/// Zero overlap is neutral (0.5), not a conflict.
/// </summary>
private static double CalculateReferenceScore(IReadOnlyCollection<InputV2> inputs)
{
    if (inputs.All(i => i.References.Count == 0))
    {
        // No references anywhere => neutral signal.
        return 0.5d;
    }

    // Normalize each observation's references once up front instead of per pair.
    var normalized = inputs
        .Select(i => i.References
            .Select(NormalizeReferenceUrl)
            .ToHashSet(StringComparer.OrdinalIgnoreCase))
        .ToList();

    // Best pairwise overlap ratio: shared references over the larger set.
    var best = 0d;
    for (var i = 0; i < normalized.Count; i++)
    {
        for (var j = i + 1; j < normalized.Count; j++)
        {
            var larger = Math.Max(normalized[i].Count, normalized[j].Count);
            if (larger == 0)
            {
                continue;
            }
            var shared = normalized[i].Intersect(normalized[j], StringComparer.OrdinalIgnoreCase).Count();
            best = Math.Max(best, (double)shared / larger);
        }
    }

    // Map overlap into [0.5, 1.0]: zero overlap stays neutral.
    return 0.5 + (best * 0.5);
}
// Canonicalizes a reference URL for comparison: trims, lowercases, drops the
// query string, upgrades "http://" to "https://", and strips any trailing slash.
private static string NormalizeReferenceUrl(string url)
{
    if (string.IsNullOrWhiteSpace(url))
    {
        return string.Empty;
    }

    var canonical = url.Trim().ToLowerInvariant();

    // Drop query string (tracking parameters); a '?' at position 0 is left alone.
    var query = canonical.IndexOf('?');
    if (query > 0)
    {
        canonical = canonical[..query];
    }

    if (canonical.StartsWith("http://"))
    {
        canonical = string.Concat("https://", canonical.AsSpan(7));
    }

    return canonical.TrimEnd('/');
}
#endregion
#region CPE and Freshness (Minor Updates)
/// <summary>
/// Scores CPE agreement: 1.0 when any exact CPE is shared between two sources,
/// 0.5 when only a vendor/product pair is shared, otherwise 0.
/// </summary>
private static double CalculateCpeScore(IReadOnlyCollection<InputV2> inputs)
{
    if (inputs.All(i => i.Cpes.Count == 0))
    {
        return 0d;
    }
    var cpeSets = inputs.Select(i => i.Cpes.ToHashSet(StringComparer.OrdinalIgnoreCase)).ToList();
    // FIX: compare every pair of sources instead of anchoring on the first input only;
    // previously a first source without CPEs masked agreement between later sources,
    // contradicting V2's pairwise design.
    if (AnyPairOverlaps(cpeSets))
    {
        return 1d;
    }
    var vendorProductSets = inputs
        .Select(i => i.Cpes.Select(ParseVendorProduct).Where(vp => vp.vendor is not null).ToHashSet())
        .ToList();
    return AnyPairOverlaps(vendorProductSets) ? 0.5d : 0d;
}

/// <summary>True when any two of the given sets share at least one element.</summary>
private static bool AnyPairOverlaps<T>(IReadOnlyList<HashSet<T>> sets)
{
    for (var i = 0; i < sets.Count; i++)
    {
        for (var j = i + 1; j < sets.Count; j++)
        {
            if (sets[i].Overlaps(sets[j]))
            {
                return true;
            }
        }
    }
    return false;
}
/// <summary>
/// Extracts (vendor, product) from a CPE string. Supports CPE 2.3 formatted
/// strings ("cpe:2.3:part:vendor:product:...") and CPE 2.2 URIs
/// ("cpe:/part:vendor:product[:version...]"). Returns (null, null) when unparseable.
/// </summary>
private static (string? vendor, string? product) ParseVendorProduct(string cpe)
{
    if (string.IsNullOrWhiteSpace(cpe))
    {
        return (null, null);
    }
    var parts = cpe.Split(':');
    // CPE 2.2 URI form, checked first. FIX: splitting "cpe:/a:..." yields
    // parts[1] == "/a", so the previous equality check (parts[1] == "/") could never
    // match; additionally, 2.2 URIs with >= 6 segments used to fall into the 2.3
    // branch and return (product, version) as (vendor, product).
    if (parts.Length >= 4 &&
        parts[0].Equals("cpe", StringComparison.OrdinalIgnoreCase) &&
        parts[1].StartsWith("/", StringComparison.Ordinal))
    {
        return (parts[2], parts[3]);
    }
    // CPE 2.3 formatted string: "cpe:2.3:part:vendor:product:...".
    if (parts.Length >= 6 && parts[0].StartsWith("cpe", StringComparison.OrdinalIgnoreCase))
    {
        return (parts[3], parts[4]);
    }
    return (null, null);
}
// Scores how tightly the observations' fetch timestamps cluster: spread within
// 48 hours => 1.0; spread of 14 days or more => 0.0; otherwise the remaining
// fraction of the 14-day window. Neutral 0.5 with fewer than two timestamps.
private static double CalculateFreshnessScore(IReadOnlyCollection<InputV2> inputs)
{
    var timestamps = new List<DateTimeOffset>();
    foreach (var input in inputs)
    {
        if (input.FetchedAt is { } fetchedAt)
        {
            timestamps.Add(fetchedAt);
        }
    }

    if (timestamps.Count <= 1)
    {
        return 0.5d;
    }

    var spread = timestamps.Max() - timestamps.Min();
    if (spread <= TimeSpan.FromHours(48))
    {
        return 1d;
    }
    if (spread >= TimeSpan.FromDays(14))
    {
        return 0d;
    }

    var window = TimeSpan.FromDays(14);
    return Clamp01((window - spread).TotalSeconds / window.TotalSeconds);
}
#endregion
#region Conflict Penalties
/// <summary>
/// Calculates typed penalty based on conflict severities.
/// Per-reason penalties are summed and capped at 0.6 so stacked conflicts
/// cannot zero out the confidence entirely.
/// </summary>
private static double CalculateTypedPenalty(IReadOnlyList<AdvisoryLinksetConflict> conflicts)
{
    if (conflicts.Count == 0)
    {
        return 0d;
    }

    static double PenaltyFor(AdvisoryLinksetConflict conflict) => conflict.Reason switch
    {
        "distinct-cves" => ConflictPenalties.DistinctCves,
        "disjoint-version-ranges" => ConflictPenalties.DisjointVersionRanges,
        "affected-range-divergence" => ConflictPenalties.OverlappingRanges,
        "severity-mismatch" => ConflictPenalties.SeverityMismatch,
        "alias-inconsistency" => ConflictPenalties.AliasInconsistency,
        "reference-clash" => 0d, // reference differences carry no penalty
        _ => 0.05 // small default penalty for unknown conflict reasons
    };

    var total = conflicts.Sum(PenaltyFor);

    // Saturate at 0.6 to prevent total collapse.
    return Math.Min(total, 0.6);
}
#endregion
#region Helpers
// True when any PURL (including its version) from the first input reappears
// verbatim in any other input. Callers guarantee a non-empty collection.
private static bool HasExactPurlOverlap(IReadOnlyCollection<InputV2> inputs)
{
    var anchor = new HashSet<string>(inputs.First().Purls, StringComparer.Ordinal);
    foreach (var candidate in inputs.Skip(1))
    {
        if (candidate.Purls.Any(anchor.Contains))
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Strips the version suffix from a PURL, returning the package key.
/// </summary>
private static string ExtractPackageKey(string purl)
{
    if (string.IsNullOrWhiteSpace(purl))
    {
        return string.Empty;
    }
    var atIndex = purl.LastIndexOf('@');
    // FIX: only treat '@' as the version separator when it appears after the last '/'.
    // Scoped npm purls without a version (e.g. "pkg:npm/@scope/name") previously
    // lost their name because the scope's '@' was mistaken for the version marker.
    var slashIndex = purl.LastIndexOf('/');
    return atIndex > 0 && atIndex > slashIndex ? purl[..atIndex] : purl;
}
/// <summary>
/// Extracts the version component from a PURL (the text after the final '@',
/// with any "?qualifiers" removed); empty string when no version is present.
/// </summary>
private static string ExtractVersion(string purl)
{
    if (string.IsNullOrWhiteSpace(purl))
    {
        return string.Empty;
    }
    var atIndex = purl.LastIndexOf('@');
    // FIX: ignore an '@' that belongs to a scoped package name (i.e. one before the
    // last '/'); e.g. "pkg:npm/@scope/name" has no version, but the old code
    // returned "scope/name".
    if (atIndex < 0 || atIndex >= purl.Length - 1 || atIndex < purl.LastIndexOf('/'))
    {
        return string.Empty;
    }
    var version = purl[(atIndex + 1)..];
    // Remove qualifiers if present ("1.2.3?arch=amd64" -> "1.2.3").
    var qualifierIndex = version.IndexOf('?');
    if (qualifierIndex > 0)
    {
        version = version[..qualifierIndex];
    }
    return version;
}
/// <summary>
/// Deduplicates conflicts by (field, reason, normalized values) and returns them in a
/// deterministic order so repeated runs produce identical conflict lists.
/// </summary>
/// <param name="conflicts">Raw conflicts, possibly containing duplicates.</param>
/// <param name="inputs">Observations used to backfill SourceIds when a conflict has none.</param>
private static IReadOnlyList<AdvisoryLinksetConflict> DeduplicateAndSort(
    IEnumerable<AdvisoryLinksetConflict> conflicts,
    IReadOnlyCollection<InputV2> inputs)
{
    // The dedup key intentionally excludes SourceIds: the same logical conflict
    // reported with different source attribution still collapses to one entry
    // (the first occurrence's sources win).
    var set = new HashSet<string>(StringComparer.Ordinal);
    var list = new List<AdvisoryLinksetConflict>();
    foreach (var conflict in conflicts)
    {
        var normalizedValues = NormalizeValues(conflict.Values);
        var normalizedSources = NormalizeValues(conflict.SourceIds);
        var key = $"{conflict.Field}|{conflict.Reason}|{string.Join('|', normalizedValues)}";
        if (set.Add(key))
        {
            if (normalizedSources.Count == 0)
            {
                // No explicit attribution: fall back to the distinct vendor list.
                normalizedSources = inputs
                    .Select(i => i.Vendor ?? "source")
                    .Distinct(StringComparer.OrdinalIgnoreCase)
                    .OrderBy(v => v, StringComparer.Ordinal)
                    .ToArray();
            }
            list.Add(conflict with
            {
                Values = normalizedValues,
                SourceIds = normalizedSources
            });
        }
    }
    // Stable deterministic ordering: field, then reason, then joined values.
    return list
        .OrderBy(c => c.Field, StringComparer.Ordinal)
        .ThenBy(c => c.Reason, StringComparer.Ordinal)
        .ThenBy(c => string.Join('|', c.Values ?? Array.Empty<string>()), StringComparer.Ordinal)
        .ToList();
}
// Clamps a value to the unit interval [0, 1].
private static double Clamp01(double value)
{
    return Math.Clamp(value, 0d, 1d);
}
// Returns the ordinal-smallest non-blank value (trimmed), or the "<none>"
// sentinel when the sequence has no usable entries.
private static string FirstSortedOrDefault(IEnumerable<string> values)
{
    string? smallest = null;
    foreach (var raw in values)
    {
        if (string.IsNullOrWhiteSpace(raw))
        {
            continue;
        }
        var trimmed = raw.Trim();
        if (smallest is null || StringComparer.Ordinal.Compare(trimmed, smallest) < 0)
        {
            smallest = trimmed;
        }
    }
    return string.IsNullOrEmpty(smallest) ? "<none>" : smallest;
}
// Trims entries, drops blanks, and ordinal-sorts the result; returns an empty
// array for a null or empty input list.
private static IReadOnlyList<string> NormalizeValues(IReadOnlyList<string>? values)
{
    if (values is not { Count: > 0 })
    {
        return Array.Empty<string>();
    }

    var cleaned = new List<string>(values.Count);
    foreach (var value in values)
    {
        if (!string.IsNullOrWhiteSpace(value))
        {
            cleaned.Add(value.Trim());
        }
    }
    cleaned.Sort(StringComparer.Ordinal);
    return cleaned.ToArray();
}
#endregion
}

View File

@@ -0,0 +1,331 @@
// -----------------------------------------------------------------------------
// TextSimilarityScorer.cs
// Sprint: SPRINT_20260125_001_Concelier_linkset_correlation_v2
// Task: CORR-V2-010
// Description: Deterministic TF-IDF text similarity for linkset correlation
// -----------------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
namespace StellaOps.Concelier.Core.Linksets;
/// <summary>
/// Computes TF-IDF-based text similarity between advisory descriptions.
/// Used as an optional correlation signal in V2 linkset correlation.
/// </summary>
/// <remarks>
/// <para>
/// This scorer is designed for deterministic, offline operation:
/// - No external NLP dependencies (pure C# implementation)
/// - Configurable stop words and tokenization
/// - Stable output across runs (no randomness)
/// </para>
/// <para>
/// Default weight: 0.05 (low weight, supplementary signal).
/// Feature flag: <c>concelier:correlation:textSimilarity:enabled</c> (default: false).
/// </para>
/// </remarks>
public sealed class TextSimilarityScorer
{
    // Token shape: a leading ASCII letter followed by at least two more
    // letters/digits/underscores/hyphens — i.e. a minimum of 3 characters.
    private static readonly Regex TokenRegex = new(
        @"[a-zA-Z][a-zA-Z0-9_-]{2,}",
        RegexOptions.Compiled | RegexOptions.CultureInvariant);

    private static readonly HashSet<string> DefaultStopWords = new(StringComparer.OrdinalIgnoreCase)
    {
        // Common English stop words
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
        "be", "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "must", "shall", "can", "need", "dare", "ought",
        "used", "this", "that", "these", "those", "which", "who", "whom", "whose",
        "what", "where", "when", "why", "how", "all", "each", "every", "both",
        "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
        "own", "same", "so", "than", "too", "very", "just", "also", "now", "here",
        "there", "then", "once", "if", "into", "over", "after", "before", "about",
        // Common vulnerability description words (low discriminative value).
        // NOTE: "may"/"could" were listed twice originally; the duplicates are
        // removed here — a HashSet ignored them anyway, so behavior is unchanged.
        "vulnerability", "issue", "allows", "attacker", "attack", "remote", "local",
        "user", "code", "execution", "denial", "service", "buffer", "overflow",
        "via", "using", "through", "affected", "version", "versions",
        "product", "software", "application", "component", "module", "function"
    };

    private readonly TextSimilarityOptions _options;
    private readonly HashSet<string> _stopWords;

    /// <summary>
    /// Initializes a new instance of <see cref="TextSimilarityScorer"/>.
    /// </summary>
    /// <param name="options">Configuration options. Null uses defaults.</param>
    public TextSimilarityScorer(TextSimilarityOptions? options = null)
    {
        _options = options ?? new TextSimilarityOptions();
        // A custom stop-word list, when provided, fully replaces the defaults.
        _stopWords = _options.CustomStopWords is not null
            ? new HashSet<string>(_options.CustomStopWords, StringComparer.OrdinalIgnoreCase)
            : DefaultStopWords;
    }

    /// <summary>
    /// Computes average pairwise TF-IDF cosine similarity across all description pairs.
    /// </summary>
    /// <param name="descriptions">Collection of normalized description texts.</param>
    /// <returns>Average similarity score (0.0-1.0). Returns 0 if fewer than 2 non-blank descriptions.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="descriptions"/> is null.</exception>
    public double ComputeAverageSimilarity(IReadOnlyCollection<string> descriptions)
    {
        // Fix: fail fast with a clear exception instead of an NRE on .Count.
        ArgumentNullException.ThrowIfNull(descriptions);

        if (descriptions.Count < 2)
        {
            return 0.0;
        }

        // Filter out empty/null descriptions
        var validDescriptions = descriptions
            .Where(d => !string.IsNullOrWhiteSpace(d))
            .ToArray();
        if (validDescriptions.Length < 2)
        {
            return 0.0;
        }

        // Tokenize all descriptions
        var tokenizedDocs = validDescriptions
            .Select(d => Tokenize(d))
            .ToArray();

        // Build document frequency map across the whole mini-corpus
        var documentFrequency = BuildDocumentFrequency(tokenizedDocs);

        // Compute one TF-IDF vector per document
        var tfidfVectors = tokenizedDocs
            .Select(tokens => ComputeTfIdf(tokens, documentFrequency, tokenizedDocs.Length))
            .ToArray();

        // Average cosine similarity over all unordered document pairs
        var totalSimilarity = 0.0;
        var pairCount = 0;
        for (var i = 0; i < tfidfVectors.Length; i++)
        {
            for (var j = i + 1; j < tfidfVectors.Length; j++)
            {
                totalSimilarity += CosineSimilarity(tfidfVectors[i], tfidfVectors[j]);
                pairCount++;
            }
        }
        return pairCount > 0 ? totalSimilarity / pairCount : 0.0;
    }

    /// <summary>
    /// Computes TF-IDF cosine similarity between two descriptions.
    /// </summary>
    /// <param name="description1">First description text.</param>
    /// <param name="description2">Second description text.</param>
    /// <returns>Similarity score (0.0-1.0); 0 when either text is blank or yields no tokens.</returns>
    public double ComputePairwiseSimilarity(string description1, string description2)
    {
        if (string.IsNullOrWhiteSpace(description1) || string.IsNullOrWhiteSpace(description2))
        {
            return 0.0;
        }

        var tokens1 = Tokenize(description1);
        var tokens2 = Tokenize(description2);
        if (tokens1.Count == 0 || tokens2.Count == 0)
        {
            return 0.0;
        }

        // Treat the pair as a 2-document corpus and approximate IDF from it.
        // Perf fix: membership tests now use hash sets instead of List.Contains,
        // which was O(terms * tokens). Tokens are already lowercased by
        // Tokenize, so the case-insensitive comparer yields the same membership
        // answers the original ordinal List.Contains produced.
        var terms1 = new HashSet<string>(tokens1, StringComparer.OrdinalIgnoreCase);
        var terms2 = new HashSet<string>(tokens2, StringComparer.OrdinalIgnoreCase);
        var allTerms = new HashSet<string>(terms1, StringComparer.OrdinalIgnoreCase);
        allTerms.UnionWith(terms2);

        // Document frequency (term appears in 1 or 2 of the two docs)
        var df = allTerms.ToDictionary(
            t => t,
            t => (terms1.Contains(t) ? 1 : 0) + (terms2.Contains(t) ? 1 : 0),
            StringComparer.OrdinalIgnoreCase);

        var vec1 = ComputeTfIdf(tokens1, df, 2);
        var vec2 = ComputeTfIdf(tokens2, df, 2);
        return CosineSimilarity(vec1, vec2);
    }

    /// <summary>
    /// Tokenizes text into lowercase terms, removing stop words and short tokens.
    /// The result is sorted ordinally so downstream processing is deterministic.
    /// </summary>
    internal IReadOnlyList<string> Tokenize(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return Array.Empty<string>();
        }

        var matches = TokenRegex.Matches(text);
        var tokens = new List<string>(matches.Count);
        foreach (Match match in matches)
        {
            var token = match.Value.ToLowerInvariant();

            // Skip stop words
            if (_stopWords.Contains(token))
            {
                continue;
            }

            // Skip tokens shorter than the configured minimum. The regex already
            // guarantees at least 3 characters, so this only matters when
            // MinTokenLength > 3.
            if (token.Length < _options.MinTokenLength)
            {
                continue;
            }

            // NOTE: the original also skipped all-digit tokens here; that branch
            // was unreachable because the regex requires a leading letter, so it
            // has been removed.

            tokens.Add(token);
        }

        // Sort for determinism
        tokens.Sort(StringComparer.Ordinal);
        return tokens;
    }

    /// <summary>
    /// Counts, for each term, how many documents contain it at least once.
    /// </summary>
    private static Dictionary<string, int> BuildDocumentFrequency(IReadOnlyList<IReadOnlyList<string>> documents)
    {
        var df = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        foreach (var doc in documents)
        {
            // Count each term once per document, regardless of repetition within it.
            var uniqueTerms = new HashSet<string>(doc, StringComparer.OrdinalIgnoreCase);
            foreach (var term in uniqueTerms)
            {
                df.TryGetValue(term, out var count);
                df[term] = count + 1;
            }
        }
        return df;
    }

    /// <summary>
    /// Computes a sparse TF-IDF vector (term -> weight) for one tokenized document.
    /// </summary>
    private Dictionary<string, double> ComputeTfIdf(
        IReadOnlyList<string> tokens,
        Dictionary<string, int> documentFrequency,
        int totalDocuments)
    {
        // Raw term frequency
        var termFrequency = new Dictionary<string, int>(StringComparer.OrdinalIgnoreCase);
        foreach (var token in tokens)
        {
            termFrequency.TryGetValue(token, out var count);
            termFrequency[token] = count + 1;
        }
        if (termFrequency.Count == 0)
        {
            return new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
        }

        var tfidf = new Dictionary<string, double>(StringComparer.OrdinalIgnoreCase);
        var maxTf = termFrequency.Values.Max();
        foreach (var (term, tf) in termFrequency)
        {
            // Augmented TF (0.5 + 0.5 * tf/max_tf) dampens long-document bias
            var normalizedTf = 0.5 + 0.5 * ((double)tf / maxTf);
            // Smoothed IDF: log((N + 1) / (df + 1)) + 1 keeps terms that appear
            // in every document at a small non-zero weight
            documentFrequency.TryGetValue(term, out var df);
            var idf = Math.Log((double)(totalDocuments + 1) / (df + 1)) + 1.0;
            tfidf[term] = normalizedTf * idf;
        }
        return tfidf;
    }

    /// <summary>
    /// Cosine similarity between two sparse vectors; 0 when either has zero magnitude.
    /// </summary>
    private static double CosineSimilarity(
        Dictionary<string, double> vec1,
        Dictionary<string, double> vec2)
    {
        // Union of both vectors' terms; absent entries read as 0 via TryGetValue
        var allTerms = new HashSet<string>(vec1.Keys, StringComparer.OrdinalIgnoreCase);
        allTerms.UnionWith(vec2.Keys);

        var dotProduct = 0.0;
        var mag1 = 0.0;
        var mag2 = 0.0;
        foreach (var term in allTerms)
        {
            vec1.TryGetValue(term, out var v1);
            vec2.TryGetValue(term, out var v2);
            dotProduct += v1 * v2;
            mag1 += v1 * v1;
            mag2 += v2 * v2;
        }

        mag1 = Math.Sqrt(mag1);
        mag2 = Math.Sqrt(mag2);
        if (mag1 < double.Epsilon || mag2 < double.Epsilon)
        {
            return 0.0;
        }
        return dotProduct / (mag1 * mag2);
    }
}
/// <summary>
/// Configuration options for the text similarity scorer.
/// </summary>
public sealed class TextSimilarityOptions
{
    /// <summary>
    /// Configuration section name used to bind these options.
    /// </summary>
    public const string SectionName = "Concelier:Correlation:TextSimilarity";

    /// <summary>
    /// Whether text similarity scoring is enabled.
    /// Default: false (Phase 3 feature, not yet GA).
    /// </summary>
    public bool Enabled { get; set; }

    /// <summary>
    /// Weight applied to text similarity in unified scoring.
    /// Default: 0.05 (supplementary signal).
    /// </summary>
    public double Weight { get; set; } = 0.05;

    /// <summary>
    /// Minimum token length retained after normalization.
    /// Default: 3.
    /// </summary>
    public int MinTokenLength { get; set; } = 3;

    /// <summary>
    /// Custom stop-word list; when null, the built-in defaults are used.
    /// </summary>
    public IReadOnlyList<string>? CustomStopWords { get; set; }

    /// <summary>
    /// Whether to apply Porter stemming to tokens.
    /// Default: false (adds complexity, minimal benefit for security text).
    /// </summary>
    public bool EnableStemming { get; set; }
}