566 lines
26 KiB
C#
566 lines
26 KiB
C#
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using Npgsql;
|
|
using NpgsqlTypes;
|
|
using StellaOps.AdvisoryAI.KnowledgeSearch;
|
|
using System.Text.Json;
|
|
using System.Diagnostics;
|
|
using System.Globalization;
|
|
using System.Linq;
|
|
|
|
namespace StellaOps.AdvisoryAI.UnifiedSearch;
|
|
|
|
internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
|
|
{
|
|
// Search options resolved once at construction; Enabled and ConnectionString gate every database operation.
private readonly KnowledgeSearchOptions _options;

// Store used by this class to make sure the schema exists before indexing or rebuilding.
private readonly IKnowledgeSearchStore _store;

// Ingestion adapters; they are grouped by their Domain (case-insensitively) when indexing.
private readonly IEnumerable<ISearchIngestionAdapter> _adapters;

private readonly ILogger<UnifiedSearchIndexer> _logger;

/// <summary>
/// Creates the indexer from its injected dependencies.
/// </summary>
/// <param name="options">Options wrapper; a null <c>Value</c> falls back to a default <see cref="KnowledgeSearchOptions"/>.</param>
/// <param name="store">Knowledge search store used to ensure the schema exists.</param>
/// <param name="adapters">Adapters that produce the chunks to index.</param>
/// <param name="logger">Logger for progress and failure messages.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public UnifiedSearchIndexer(
    IOptions<KnowledgeSearchOptions> options,
    IKnowledgeSearchStore store,
    IEnumerable<ISearchIngestionAdapter> adapters,
    ILogger<UnifiedSearchIndexer> logger)
{
    // Guards and assignments are interleaved so arguments are validated in declaration order.
    ArgumentNullException.ThrowIfNull(options);
    _options = options.Value ?? new KnowledgeSearchOptions();

    ArgumentNullException.ThrowIfNull(store);
    _store = store;

    ArgumentNullException.ThrowIfNull(adapters);
    _adapters = adapters;

    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
}
|
|
|
|
/// <summary>
/// Runs an incremental index refresh and discards the resulting summary.
/// </summary>
/// <param name="cancellationToken">Token used to cancel the refresh.</param>
public async Task IndexAllAsync(CancellationToken cancellationToken)
{
    // Same work as IndexAllWithSummaryAsync; callers of this overload do not need the counters.
    _ = await IndexAllWithSummaryAsync(cancellationToken).ConfigureAwait(false);
}
|
|
|
|
/// <summary>
/// Incrementally refreshes the unified search index. For each adapter domain (grouped and ordered
/// case-insensitively) the chunks produced in this cycle are deduplicated and upserted, and stored
/// chunks of that domain that were not produced in this cycle are deleted.
/// </summary>
/// <param name="cancellationToken">Token used to cancel the refresh.</param>
/// <returns>
/// The number of refreshed domains, the number of deduplicated chunks seen, and the elapsed
/// milliseconds (measured after the schema check). All zero when indexing is disabled or no
/// connection string is configured.
/// </returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was cancelled. Cancellation raised inside an adapter is
/// propagated instead of being logged as an adapter failure.
/// </exception>
internal async Task<UnifiedSearchIndexSummary> IndexAllWithSummaryAsync(CancellationToken cancellationToken)
{
    if (!_options.Enabled || string.IsNullOrWhiteSpace(_options.ConnectionString))
    {
        _logger.LogDebug("Unified search indexing skipped because configuration is incomplete.");
        return new UnifiedSearchIndexSummary(0, 0, 0);
    }

    await _store.EnsureSchemaAsync(cancellationToken).ConfigureAwait(false);

    var stopwatch = Stopwatch.StartNew();
    var domains = 0;
    var chunks = 0;
    var changed = 0;
    var removed = 0;

    foreach (var domainGroup in _adapters
        .GroupBy(static adapter => adapter.Domain, StringComparer.OrdinalIgnoreCase)
        .OrderBy(static group => group.Key, StringComparer.OrdinalIgnoreCase))
    {
        cancellationToken.ThrowIfCancellationRequested();

        var domainStopwatch = Stopwatch.StartNew();
        var domain = domainGroup.Key;
        var domainChunks = new List<UnifiedChunk>();
        var hadSuccessfulAdapter = false;

        foreach (var adapter in domainGroup)
        {
            try
            {
                _logger.LogInformation("Unified search indexing adapter '{Adapter}' for domain '{Domain}'.",
                    adapter.GetType().Name,
                    domain);
                var adapterChunks = await adapter.ProduceChunksAsync(cancellationToken).ConfigureAwait(false);
                domainChunks.AddRange(adapterChunks);
                hadSuccessfulAdapter = true;
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // The caller asked us to stop: this is not an adapter failure, so do not log it
                // as one and do not keep invoking the remaining adapters.
                throw;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "Failed to index adapter '{Adapter}' for domain '{Domain}'; continuing with other adapters in this domain.",
                    adapter.GetType().Name,
                    domain);
            }
        }

        if (!hadSuccessfulAdapter)
        {
            // Leave the stored chunks of this domain untouched; pruning against an empty
            // result set would wipe the whole domain.
            _logger.LogWarning(
                "Unified search skipped domain '{Domain}' because all adapters failed in this refresh cycle.",
                domain);
            continue;
        }

        // NOTE(review): when only some adapters of a domain fail, the pruning step below still
        // runs and removes the stored chunks that the failed adapters would have produced.
        // Confirm this is intended.
        var deduplicated = DeduplicateChunks(domainChunks);
        var changedForDomain = 0;
        if (deduplicated.Count > 0)
        {
            changedForDomain = await UpsertChunksAsync(deduplicated, cancellationToken).ConfigureAwait(false);
        }

        var removedForDomain = await DeleteMissingChunksByDomainAsync(
                domain,
                deduplicated.Select(static chunk => chunk.ChunkId).ToArray(),
                cancellationToken)
            .ConfigureAwait(false);

        domainStopwatch.Stop();
        domains++;
        chunks += deduplicated.Count;
        changed += changedForDomain;
        removed += removedForDomain;
        _logger.LogInformation(
            "Unified search refresh domain '{Domain}' completed: seen_chunks={SeenChunkCount}, changed_chunks={ChangedChunkCount}, removed={RemovedCount}, duration_ms={DurationMs}",
            domain,
            deduplicated.Count,
            changedForDomain,
            removedForDomain,
            (long)domainStopwatch.Elapsed.TotalMilliseconds);
    }

    stopwatch.Stop();
    _logger.LogInformation(
        "Unified search incremental indexing completed: domains={DomainCount}, seen_chunks={SeenChunkCount}, changed_chunks={ChangedChunkCount}, removed={RemovedCount}, duration_ms={DurationMs}",
        domains,
        chunks,
        changed,
        removed,
        (long)stopwatch.Elapsed.TotalMilliseconds);

    // The changed/removed counters are only logged; the summary type carries domains, chunks and duration.
    return new UnifiedSearchIndexSummary(domains, chunks, (long)stopwatch.Elapsed.TotalMilliseconds);
}
|
|
|
|
/// <summary>
/// Rebuilds the unified search index domain by domain: all stored chunks of a domain are deleted
/// and the deduplicated chunks produced by its adapters are written back.
/// </summary>
/// <param name="cancellationToken">Token used to cancel the rebuild.</param>
/// <returns>
/// The number of rebuilt domains, the number of deduplicated chunks written, and the elapsed
/// milliseconds (measured after the schema check). All zero when indexing is disabled or no
/// connection string is configured.
/// </returns>
/// <exception cref="OperationCanceledException">
/// <paramref name="cancellationToken"/> was cancelled. Cancellation raised inside an adapter is
/// propagated instead of being logged as an adapter failure, so a cancelled rebuild never
/// proceeds to delete a domain.
/// </exception>
public async Task<UnifiedSearchIndexSummary> RebuildAllAsync(CancellationToken cancellationToken)
{
    if (!_options.Enabled || string.IsNullOrWhiteSpace(_options.ConnectionString))
    {
        _logger.LogDebug("Unified search rebuild skipped because configuration is incomplete.");
        return new UnifiedSearchIndexSummary(0, 0, 0);
    }

    await _store.EnsureSchemaAsync(cancellationToken).ConfigureAwait(false);

    var stopwatch = Stopwatch.StartNew();
    var domains = 0;
    var chunks = 0;

    foreach (var domainGroup in _adapters
        .GroupBy(static adapter => adapter.Domain, StringComparer.OrdinalIgnoreCase)
        .OrderBy(static group => group.Key, StringComparer.OrdinalIgnoreCase))
    {
        cancellationToken.ThrowIfCancellationRequested();

        var domain = domainGroup.Key;
        var domainStopwatch = Stopwatch.StartNew();
        var domainChunks = new List<UnifiedChunk>();
        var hadSuccessfulAdapter = false;

        foreach (var adapter in domainGroup)
        {
            try
            {
                var adapterChunks = await adapter.ProduceChunksAsync(cancellationToken).ConfigureAwait(false);
                domainChunks.AddRange(adapterChunks);
                hadSuccessfulAdapter = true;
            }
            catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
            {
                // Caller-requested cancellation must abort the rebuild rather than be treated
                // as a failed adapter followed by a destructive delete.
                throw;
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex,
                    "Failed to rebuild adapter '{Adapter}' for domain '{Domain}'; continuing with other adapters in this domain.",
                    adapter.GetType().Name,
                    domain);
            }
        }

        if (!hadSuccessfulAdapter)
        {
            // Keep whatever is currently stored for this domain; nothing new was produced.
            _logger.LogWarning(
                "Unified search rebuild skipped domain '{Domain}' because all adapters failed.",
                domain);
            continue;
        }

        // The delete and the following upsert run as separate statements, not in one transaction:
        // if the upsert fails, the domain stays empty until the next successful run.
        await DeleteChunksByDomainAsync(domain, cancellationToken).ConfigureAwait(false);
        var deduplicated = DeduplicateChunks(domainChunks);
        if (deduplicated.Count > 0)
        {
            await UpsertChunksAsync(deduplicated, cancellationToken).ConfigureAwait(false);
        }

        domainStopwatch.Stop();
        domains++;
        chunks += deduplicated.Count;

        _logger.LogInformation(
            "Unified search rebuild domain '{Domain}' completed: chunks={ChunkCount}, duration_ms={DurationMs}",
            domain,
            deduplicated.Count,
            (long)domainStopwatch.Elapsed.TotalMilliseconds);
    }

    stopwatch.Stop();
    return new UnifiedSearchIndexSummary(domains, chunks, (long)stopwatch.Elapsed.TotalMilliseconds);
}
|
|
|
|
/// <summary>
/// Deletes every row of <c>advisoryai.kb_chunk</c> whose <c>domain</c> column equals
/// <paramref name="domain"/>. Does nothing when indexing is disabled or no connection string is set.
/// </summary>
/// <param name="domain">Domain value to match.</param>
/// <param name="cancellationToken">Token used to cancel the command.</param>
public async Task DeleteChunksByDomainAsync(string domain, CancellationToken cancellationToken)
{
    var isConfigured = _options.Enabled && !string.IsNullOrWhiteSpace(_options.ConnectionString);
    if (!isConfigured)
    {
        return;
    }

    // A data source is built and disposed per call.
    await using var dataSource = new NpgsqlDataSourceBuilder(_options.ConnectionString).Build();
    await using var deleteCommand = dataSource.CreateCommand("DELETE FROM advisoryai.kb_chunk WHERE domain = @domain;");
    deleteCommand.CommandTimeout = 60;
    deleteCommand.Parameters.AddWithValue("domain", domain);

    _ = await deleteCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
|
|
|
|
/// <summary>
/// Deletes the rows of <c>advisoryai.kb_chunk</c> for <paramref name="domain"/> whose
/// <c>chunk_id</c> is not in <paramref name="currentChunkIds"/>. When the id collection is
/// empty, every row of the domain is deleted.
/// </summary>
/// <param name="domain">Domain value to match.</param>
/// <param name="currentChunkIds">Chunk ids that must be kept.</param>
/// <param name="cancellationToken">Token used to cancel the command.</param>
/// <returns>
/// The affected-row count reported by the delete, or 0 when indexing is disabled or no
/// connection string is configured.
/// </returns>
private async Task<int> DeleteMissingChunksByDomainAsync(
    string domain,
    IReadOnlyCollection<string> currentChunkIds,
    CancellationToken cancellationToken)
{
    var isConfigured = _options.Enabled && !string.IsNullOrWhiteSpace(_options.ConnectionString);
    if (!isConfigured)
    {
        return 0;
    }

    await using var dataSource = new NpgsqlDataSourceBuilder(_options.ConnectionString).Build();
    await using var pruneCommand = dataSource.CreateCommand();
    pruneCommand.CommandTimeout = 90;
    pruneCommand.Parameters.AddWithValue("domain", domain);

    var keepNothing = currentChunkIds.Count == 0;

    // With nothing to keep, the statement has no id filter and no @chunk_ids parameter.
    pruneCommand.CommandText = keepNothing
        ? "DELETE FROM advisoryai.kb_chunk WHERE domain = @domain;"
        : """
          DELETE FROM advisoryai.kb_chunk
          WHERE domain = @domain
            AND NOT (chunk_id = ANY(@chunk_ids));
          """;

    if (!keepNothing)
    {
        pruneCommand.Parameters.AddWithValue(
            "chunk_ids",
            NpgsqlDbType.Array | NpgsqlDbType.Text,
            currentChunkIds.ToArray());
    }

    var deletedRows = await pruneCommand.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
    return deletedRows;
}
|
|
|
|
/// <summary>
/// Inserts or updates the given chunks in <c>advisoryai.kb_chunk</c>, after making sure a parent
/// <c>advisoryai.kb_doc</c> row exists for every distinct <c>DocId</c>.
/// </summary>
/// <remarks>
/// Each chunk is written by its own statement on one connection; no transaction is opened here.
/// The <c>ON CONFLICT ... DO UPDATE ... WHERE ... IS DISTINCT FROM</c> clause restricts updates to
/// rows whose stored columns differ from the incoming values. The <c>embedding_vec</c> column is
/// only written when it exists in the table.
/// </remarks>
/// <param name="chunks">Chunks to write; an empty list is a no-op.</param>
/// <param name="cancellationToken">Token used to cancel the database calls.</param>
/// <returns>Sum of the affected-row counts reported by the per-chunk statements.</returns>
private async Task<int> UpsertChunksAsync(IReadOnlyList<UnifiedChunk> chunks, CancellationToken cancellationToken)
{
    if (chunks.Count == 0)
    {
        // Nothing to write: skip building a data source and opening a connection.
        return 0;
    }

    await using var dataSource = new NpgsqlDataSourceBuilder(_options.ConnectionString).Build();
    await using var connection = await dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
    var hasEmbeddingVectorColumn = await HasEmbeddingVectorColumnAsync(connection, cancellationToken).ConfigureAwait(false);

    // Ensure parent documents exist for each unique DocId. DistinctBy yields the first chunk
    // seen for every DocId in a single pass, replacing a per-id linear search over all chunks.
    foreach (var representative in chunks.DistinctBy(static c => c.DocId, StringComparer.Ordinal))
    {
        await EnsureDocumentExistsAsync(connection, representative.DocId, representative, cancellationToken).ConfigureAwait(false);
    }

    var sql = hasEmbeddingVectorColumn
        ? """
          INSERT INTO advisoryai.kb_chunk
          (
              chunk_id, doc_id, kind, anchor, section_path,
              span_start, span_end, title, body, body_tsv,
              body_tsv_en, body_tsv_de, body_tsv_fr, body_tsv_es, body_tsv_ru,
              embedding, embedding_vec, metadata, domain, entity_key, entity_type, freshness,
              indexed_at
          )
          VALUES
          (
              @chunk_id, @doc_id, @kind, @anchor, @section_path,
              @span_start, @span_end, @title, @body,
              setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('english', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('english', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('english', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('german', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('german', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('german', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('french', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('french', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('french', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('spanish', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('spanish', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('spanish', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('russian', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('russian', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('russian', coalesce(@body, '')), 'D'),
              @embedding, CAST(@embedding_vector AS vector), @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
              NOW()
          )
          ON CONFLICT (chunk_id) DO UPDATE SET
              doc_id = EXCLUDED.doc_id,
              kind = EXCLUDED.kind,
              anchor = EXCLUDED.anchor,
              section_path = EXCLUDED.section_path,
              span_start = EXCLUDED.span_start,
              span_end = EXCLUDED.span_end,
              title = EXCLUDED.title,
              body = EXCLUDED.body,
              body_tsv = EXCLUDED.body_tsv,
              body_tsv_en = EXCLUDED.body_tsv_en,
              body_tsv_de = EXCLUDED.body_tsv_de,
              body_tsv_fr = EXCLUDED.body_tsv_fr,
              body_tsv_es = EXCLUDED.body_tsv_es,
              body_tsv_ru = EXCLUDED.body_tsv_ru,
              embedding = EXCLUDED.embedding,
              embedding_vec = EXCLUDED.embedding_vec,
              metadata = EXCLUDED.metadata,
              domain = EXCLUDED.domain,
              entity_key = EXCLUDED.entity_key,
              entity_type = EXCLUDED.entity_type,
              freshness = EXCLUDED.freshness,
              indexed_at = NOW()
          WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
             OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
             OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
             OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
             OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
             OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
             OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
             OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
             OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
             OR advisoryai.kb_chunk.body_tsv_en IS DISTINCT FROM EXCLUDED.body_tsv_en
             OR advisoryai.kb_chunk.body_tsv_de IS DISTINCT FROM EXCLUDED.body_tsv_de
             OR advisoryai.kb_chunk.body_tsv_fr IS DISTINCT FROM EXCLUDED.body_tsv_fr
             OR advisoryai.kb_chunk.body_tsv_es IS DISTINCT FROM EXCLUDED.body_tsv_es
             OR advisoryai.kb_chunk.body_tsv_ru IS DISTINCT FROM EXCLUDED.body_tsv_ru
             OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
             OR advisoryai.kb_chunk.embedding_vec IS DISTINCT FROM EXCLUDED.embedding_vec
             OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
             OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
             OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
             OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
             OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
          """
        : """
          INSERT INTO advisoryai.kb_chunk
          (
              chunk_id, doc_id, kind, anchor, section_path,
              span_start, span_end, title, body, body_tsv,
              body_tsv_en, body_tsv_de, body_tsv_fr, body_tsv_es, body_tsv_ru,
              embedding, metadata, domain, entity_key, entity_type, freshness,
              indexed_at
          )
          VALUES
          (
              @chunk_id, @doc_id, @kind, @anchor, @section_path,
              @span_start, @span_end, @title, @body,
              setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('english', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('english', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('english', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('german', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('german', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('german', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('french', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('french', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('french', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('spanish', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('spanish', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('spanish', coalesce(@body, '')), 'D'),
              setweight(to_tsvector('russian', coalesce(@title, '')), 'A') ||
              setweight(to_tsvector('russian', coalesce(@section_path, '')), 'B') ||
              setweight(to_tsvector('russian', coalesce(@body, '')), 'D'),
              @embedding, @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
              NOW()
          )
          ON CONFLICT (chunk_id) DO UPDATE SET
              doc_id = EXCLUDED.doc_id,
              kind = EXCLUDED.kind,
              anchor = EXCLUDED.anchor,
              section_path = EXCLUDED.section_path,
              span_start = EXCLUDED.span_start,
              span_end = EXCLUDED.span_end,
              title = EXCLUDED.title,
              body = EXCLUDED.body,
              body_tsv = EXCLUDED.body_tsv,
              body_tsv_en = EXCLUDED.body_tsv_en,
              body_tsv_de = EXCLUDED.body_tsv_de,
              body_tsv_fr = EXCLUDED.body_tsv_fr,
              body_tsv_es = EXCLUDED.body_tsv_es,
              body_tsv_ru = EXCLUDED.body_tsv_ru,
              embedding = EXCLUDED.embedding,
              metadata = EXCLUDED.metadata,
              domain = EXCLUDED.domain,
              entity_key = EXCLUDED.entity_key,
              entity_type = EXCLUDED.entity_type,
              freshness = EXCLUDED.freshness,
              indexed_at = NOW()
          WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
             OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
             OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
             OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
             OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
             OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
             OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
             OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
             OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
             OR advisoryai.kb_chunk.body_tsv_en IS DISTINCT FROM EXCLUDED.body_tsv_en
             OR advisoryai.kb_chunk.body_tsv_de IS DISTINCT FROM EXCLUDED.body_tsv_de
             OR advisoryai.kb_chunk.body_tsv_fr IS DISTINCT FROM EXCLUDED.body_tsv_fr
             OR advisoryai.kb_chunk.body_tsv_es IS DISTINCT FROM EXCLUDED.body_tsv_es
             OR advisoryai.kb_chunk.body_tsv_ru IS DISTINCT FROM EXCLUDED.body_tsv_ru
             OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
             OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
             OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
             OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
             OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
             OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
          """;

    await using var command = connection.CreateCommand();
    command.CommandText = sql;
    command.CommandTimeout = 120;

    var affectedRows = 0;
    foreach (var chunk in chunks)
    {
        // The command object is reused; its parameters are rebuilt for every chunk.
        command.Parameters.Clear();
        command.Parameters.AddWithValue("chunk_id", chunk.ChunkId);
        command.Parameters.AddWithValue("doc_id", chunk.DocId);
        command.Parameters.AddWithValue("kind", chunk.Kind);
        command.Parameters.AddWithValue("anchor", (object?)chunk.Anchor ?? DBNull.Value);
        command.Parameters.AddWithValue("section_path", (object?)chunk.SectionPath ?? DBNull.Value);
        command.Parameters.AddWithValue("span_start", chunk.SpanStart);
        command.Parameters.AddWithValue("span_end", chunk.SpanEnd);
        command.Parameters.AddWithValue("title", chunk.Title);
        command.Parameters.AddWithValue("body", chunk.Body);

        // A missing embedding is stored as an empty real[] in the "embedding" column.
        var embedding = chunk.Embedding ?? Array.Empty<float>();
        command.Parameters.AddWithValue(
            "embedding",
            NpgsqlDbType.Array | NpgsqlDbType.Real,
            embedding);

        if (hasEmbeddingVectorColumn)
        {
            // An empty embedding would render as "[]", which cannot be cast to a pgvector value
            // (vectors need at least one dimension). Store NULL, exactly as for a missing embedding.
            object vectorLiteral = embedding.Length == 0
                ? DBNull.Value
                : BuildVectorLiteral(embedding);
            command.Parameters.AddWithValue("embedding_vector", vectorLiteral);
        }

        command.Parameters.AddWithValue("metadata", NpgsqlDbType.Jsonb, chunk.Metadata.RootElement.GetRawText());
        command.Parameters.AddWithValue("domain", chunk.Domain);
        command.Parameters.AddWithValue("entity_key", (object?)chunk.EntityKey ?? DBNull.Value);
        command.Parameters.AddWithValue("entity_type", (object?)chunk.EntityType ?? DBNull.Value);
        command.Parameters.AddWithValue("freshness",
            chunk.Freshness.HasValue ? (object)chunk.Freshness.Value : DBNull.Value);

        affectedRows += await command.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
    }

    return affectedRows;
}
|
|
|
|
/// <summary>
/// Checks <c>information_schema.columns</c> for an <c>embedding_vec</c> column on
/// <c>advisoryai.kb_chunk</c>.
/// </summary>
/// <param name="connection">Open connection used to run the probe query.</param>
/// <param name="cancellationToken">Token used to cancel the query.</param>
/// <returns><c>true</c> only when the query returns the boolean value <c>true</c>.</returns>
private static async Task<bool> HasEmbeddingVectorColumnAsync(
    NpgsqlConnection connection,
    CancellationToken cancellationToken)
{
    await using var probe = connection.CreateCommand();
    probe.CommandTimeout = 30;
    probe.CommandText = """
        SELECT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = 'advisoryai'
              AND table_name = 'kb_chunk'
              AND column_name = 'embedding_vec'
        );
        """;

    // Anything other than a boxed 'true' (null, DBNull, another type) counts as "column absent".
    object? scalar = await probe.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);
    return scalar is true;
}
|
|
|
|
/// <summary>
/// Formats the values as a bracketed, comma-separated list (for example <c>[1,2.5]</c>), using
/// the "G9" format and the invariant culture for every component.
/// </summary>
/// <param name="values">Components to format.</param>
/// <returns>The literal text; <c>[]</c> for an empty array.</returns>
private static string BuildVectorLiteral(float[] values)
{
    // "G9" keeps nine significant digits; the invariant culture fixes '.' as the decimal separator.
    string[] components = Array.ConvertAll(
        values,
        static component => component.ToString("G9", CultureInfo.InvariantCulture));

    return $"[{string.Join(",", components)}]";
}
|
|
|
|
/// <summary>
/// Inserts a row into <c>advisoryai.kb_doc</c> for <paramref name="docId"/>, or, when the row
/// already exists, refreshes its <c>title</c>, <c>content_hash</c> and <c>indexed_at</c>.
/// </summary>
/// <param name="connection">Open connection used to run the statement.</param>
/// <param name="docId">Document identifier to insert or refresh.</param>
/// <param name="chunk">Chunk that supplies the domain, title, body hash and source fields.</param>
/// <param name="cancellationToken">Token used to cancel the statement.</param>
private static async Task EnsureDocumentExistsAsync(
    NpgsqlConnection connection,
    string docId,
    UnifiedChunk chunk,
    CancellationToken cancellationToken)
{
    const string upsertDocumentSql = """
        INSERT INTO advisoryai.kb_doc
            (doc_id, doc_type, product, version, source_ref, path, title, content_hash, metadata, indexed_at)
        VALUES (@doc_id, @doc_type, @product, @version, @source_ref, @path, @title, @content_hash, '{}'::jsonb, NOW())
        ON CONFLICT (doc_id) DO UPDATE SET
            title = EXCLUDED.title,
            content_hash = EXCLUDED.content_hash,
            indexed_at = NOW();
        """;

    await using var upsert = connection.CreateCommand();
    upsert.CommandText = upsertDocumentSql;
    upsert.CommandTimeout = 30;

    // product and version are fixed values; the content hash is derived from this one chunk's body.
    var parameters = new (string Name, object Value)[]
    {
        ("doc_id", docId),
        ("doc_type", chunk.Domain),
        ("product", "stella-ops"),
        ("version", "local"),
        ("source_ref", ResolveSourceRef(chunk)),
        ("path", ResolveSourcePath(chunk)),
        ("title", chunk.Title),
        ("content_hash", KnowledgeSearchText.StableId(chunk.Body)),
    };

    foreach (var (name, value) in parameters)
    {
        upsert.Parameters.AddWithValue(name, value);
    }

    _ = await upsert.ExecuteNonQueryAsync(cancellationToken).ConfigureAwait(false);
}
|
|
|
|
/// <summary>
/// Returns the chunk's trimmed <c>EntityKey</c>, or its <c>DocId</c> when the entity key is
/// null, empty or whitespace.
/// </summary>
/// <param name="chunk">Chunk to read from.</param>
private static string ResolveSourceRef(UnifiedChunk chunk)
    => string.IsNullOrWhiteSpace(chunk.EntityKey)
        ? chunk.DocId
        : chunk.EntityKey.Trim();
|
|
|
|
/// <summary>
/// Returns the chunk's <c>DocId</c>, or <c>"{Domain}/{Kind}"</c> when the doc id is null, empty
/// or whitespace.
/// </summary>
/// <param name="chunk">Chunk to read from.</param>
private static string ResolveSourcePath(UnifiedChunk chunk)
    => string.IsNullOrWhiteSpace(chunk.DocId)
        ? $"{chunk.Domain}/{chunk.Kind}"
        : chunk.DocId;
|
|
|
|
/// <summary>
/// Collapses chunks that share a <c>ChunkId</c> (ordinal comparison), keeping the last one
/// encountered, and returns the survivors ordered by <c>ChunkId</c> (ordinal). Chunks whose id is
/// null, empty or whitespace are dropped.
/// </summary>
/// <param name="chunks">Chunks to deduplicate.</param>
/// <returns>An array with one chunk per distinct id.</returns>
private static IReadOnlyList<UnifiedChunk> DeduplicateChunks(IEnumerable<UnifiedChunk> chunks)
{
    return chunks
        .Where(static candidate => !string.IsNullOrWhiteSpace(candidate.ChunkId))
        .GroupBy(static candidate => candidate.ChunkId, StringComparer.Ordinal)
        // Later occurrences override earlier ones with the same id.
        .Select(static sameId => sameId.Last())
        .OrderBy(static survivor => survivor.ChunkId, StringComparer.Ordinal)
        .ToArray();
}
|
|
}
|
|
|
|
/// <summary>
/// Result of an index refresh or rebuild run.
/// </summary>
/// <param name="DomainCount">Number of domains that were processed.</param>
/// <param name="ChunkCount">Number of deduplicated chunks across the processed domains.</param>
/// <param name="DurationMs">Elapsed time of the run in milliseconds.</param>
public sealed record UnifiedSearchIndexSummary(int DomainCount, int ChunkCount, long DurationMs);
|