Harden live-backed unified search weighting and indexing

This commit is contained in:
master
2026-03-08 02:23:43 +02:00
parent c7b7ddf436
commit 145e67a544
26 changed files with 1585 additions and 207 deletions

View File

@@ -5,6 +5,7 @@ using NpgsqlTypes;
using StellaOps.AdvisoryAI.KnowledgeSearch;
using System.Text.Json;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
namespace StellaOps.AdvisoryAI.UnifiedSearch;
@@ -254,6 +255,7 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
{
await using var dataSource = new NpgsqlDataSourceBuilder(_options.ConnectionString).Build();
await using var connection = await dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
var hasEmbeddingVectorColumn = await HasEmbeddingVectorColumnAsync(connection, cancellationToken).ConfigureAwait(false);
// Ensure parent documents exist for each unique DocId
var uniqueDocIds = chunks.Select(static c => c.DocId).Distinct(StringComparer.Ordinal).ToArray();
@@ -263,57 +265,163 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
await EnsureDocumentExistsAsync(connection, docId, chunk, cancellationToken).ConfigureAwait(false);
}
const string sql = """
INSERT INTO advisoryai.kb_chunk
(
chunk_id, doc_id, kind, anchor, section_path,
span_start, span_end, title, body, body_tsv,
embedding, metadata, domain, entity_key, entity_type, freshness,
indexed_at
)
VALUES
(
@chunk_id, @doc_id, @kind, @anchor, @section_path,
@span_start, @span_end, @title, @body,
setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
@embedding, @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
NOW()
)
ON CONFLICT (chunk_id) DO UPDATE SET
doc_id = EXCLUDED.doc_id,
kind = EXCLUDED.kind,
anchor = EXCLUDED.anchor,
section_path = EXCLUDED.section_path,
span_start = EXCLUDED.span_start,
span_end = EXCLUDED.span_end,
title = EXCLUDED.title,
body = EXCLUDED.body,
body_tsv = EXCLUDED.body_tsv,
embedding = EXCLUDED.embedding,
metadata = EXCLUDED.metadata,
domain = EXCLUDED.domain,
entity_key = EXCLUDED.entity_key,
entity_type = EXCLUDED.entity_type,
freshness = EXCLUDED.freshness,
indexed_at = NOW()
WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
""";
var sql = hasEmbeddingVectorColumn
? """
INSERT INTO advisoryai.kb_chunk
(
chunk_id, doc_id, kind, anchor, section_path,
span_start, span_end, title, body, body_tsv,
body_tsv_en, body_tsv_de, body_tsv_fr, body_tsv_es, body_tsv_ru,
embedding, embedding_vec, metadata, domain, entity_key, entity_type, freshness,
indexed_at
)
VALUES
(
@chunk_id, @doc_id, @kind, @anchor, @section_path,
@span_start, @span_end, @title, @body,
setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
setweight(to_tsvector('english', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('english', coalesce(@body, '')), 'D'),
setweight(to_tsvector('german', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('german', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('german', coalesce(@body, '')), 'D'),
setweight(to_tsvector('french', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('french', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('french', coalesce(@body, '')), 'D'),
setweight(to_tsvector('spanish', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('spanish', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('spanish', coalesce(@body, '')), 'D'),
setweight(to_tsvector('russian', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('russian', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('russian', coalesce(@body, '')), 'D'),
@embedding, CAST(@embedding_vector AS vector), @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
NOW()
)
ON CONFLICT (chunk_id) DO UPDATE SET
doc_id = EXCLUDED.doc_id,
kind = EXCLUDED.kind,
anchor = EXCLUDED.anchor,
section_path = EXCLUDED.section_path,
span_start = EXCLUDED.span_start,
span_end = EXCLUDED.span_end,
title = EXCLUDED.title,
body = EXCLUDED.body,
body_tsv = EXCLUDED.body_tsv,
body_tsv_en = EXCLUDED.body_tsv_en,
body_tsv_de = EXCLUDED.body_tsv_de,
body_tsv_fr = EXCLUDED.body_tsv_fr,
body_tsv_es = EXCLUDED.body_tsv_es,
body_tsv_ru = EXCLUDED.body_tsv_ru,
embedding = EXCLUDED.embedding,
embedding_vec = EXCLUDED.embedding_vec,
metadata = EXCLUDED.metadata,
domain = EXCLUDED.domain,
entity_key = EXCLUDED.entity_key,
entity_type = EXCLUDED.entity_type,
freshness = EXCLUDED.freshness,
indexed_at = NOW()
WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
OR advisoryai.kb_chunk.body_tsv_en IS DISTINCT FROM EXCLUDED.body_tsv_en
OR advisoryai.kb_chunk.body_tsv_de IS DISTINCT FROM EXCLUDED.body_tsv_de
OR advisoryai.kb_chunk.body_tsv_fr IS DISTINCT FROM EXCLUDED.body_tsv_fr
OR advisoryai.kb_chunk.body_tsv_es IS DISTINCT FROM EXCLUDED.body_tsv_es
OR advisoryai.kb_chunk.body_tsv_ru IS DISTINCT FROM EXCLUDED.body_tsv_ru
OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
OR advisoryai.kb_chunk.embedding_vec IS DISTINCT FROM EXCLUDED.embedding_vec
OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
"""
: """
INSERT INTO advisoryai.kb_chunk
(
chunk_id, doc_id, kind, anchor, section_path,
span_start, span_end, title, body, body_tsv,
body_tsv_en, body_tsv_de, body_tsv_fr, body_tsv_es, body_tsv_ru,
embedding, metadata, domain, entity_key, entity_type, freshness,
indexed_at
)
VALUES
(
@chunk_id, @doc_id, @kind, @anchor, @section_path,
@span_start, @span_end, @title, @body,
setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
setweight(to_tsvector('english', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('english', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('english', coalesce(@body, '')), 'D'),
setweight(to_tsvector('german', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('german', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('german', coalesce(@body, '')), 'D'),
setweight(to_tsvector('french', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('french', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('french', coalesce(@body, '')), 'D'),
setweight(to_tsvector('spanish', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('spanish', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('spanish', coalesce(@body, '')), 'D'),
setweight(to_tsvector('russian', coalesce(@title, '')), 'A') ||
setweight(to_tsvector('russian', coalesce(@section_path, '')), 'B') ||
setweight(to_tsvector('russian', coalesce(@body, '')), 'D'),
@embedding, @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
NOW()
)
ON CONFLICT (chunk_id) DO UPDATE SET
doc_id = EXCLUDED.doc_id,
kind = EXCLUDED.kind,
anchor = EXCLUDED.anchor,
section_path = EXCLUDED.section_path,
span_start = EXCLUDED.span_start,
span_end = EXCLUDED.span_end,
title = EXCLUDED.title,
body = EXCLUDED.body,
body_tsv = EXCLUDED.body_tsv,
body_tsv_en = EXCLUDED.body_tsv_en,
body_tsv_de = EXCLUDED.body_tsv_de,
body_tsv_fr = EXCLUDED.body_tsv_fr,
body_tsv_es = EXCLUDED.body_tsv_es,
body_tsv_ru = EXCLUDED.body_tsv_ru,
embedding = EXCLUDED.embedding,
metadata = EXCLUDED.metadata,
domain = EXCLUDED.domain,
entity_key = EXCLUDED.entity_key,
entity_type = EXCLUDED.entity_type,
freshness = EXCLUDED.freshness,
indexed_at = NOW()
WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
OR advisoryai.kb_chunk.body_tsv_en IS DISTINCT FROM EXCLUDED.body_tsv_en
OR advisoryai.kb_chunk.body_tsv_de IS DISTINCT FROM EXCLUDED.body_tsv_de
OR advisoryai.kb_chunk.body_tsv_fr IS DISTINCT FROM EXCLUDED.body_tsv_fr
OR advisoryai.kb_chunk.body_tsv_es IS DISTINCT FROM EXCLUDED.body_tsv_es
OR advisoryai.kb_chunk.body_tsv_ru IS DISTINCT FROM EXCLUDED.body_tsv_ru
OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
""";
await using var command = connection.CreateCommand();
command.CommandText = sql;
@@ -336,6 +444,11 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
"embedding",
NpgsqlDbType.Array | NpgsqlDbType.Real,
chunk.Embedding is null ? Array.Empty<float>() : chunk.Embedding);
if (hasEmbeddingVectorColumn)
{
var vectorLiteral = chunk.Embedding is null ? (object)DBNull.Value : BuildVectorLiteral(chunk.Embedding);
command.Parameters.AddWithValue("embedding_vector", vectorLiteral);
}
command.Parameters.AddWithValue("metadata", NpgsqlDbType.Jsonb, chunk.Metadata.RootElement.GetRawText());
command.Parameters.AddWithValue("domain", chunk.Domain);
command.Parameters.AddWithValue("entity_key", (object?)chunk.EntityKey ?? DBNull.Value);
@@ -349,6 +462,32 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
return affectedRows;
}
/// <summary>
/// Checks whether the <c>advisoryai.kb_chunk</c> table has an <c>embedding_vec</c> column.
/// </summary>
/// <param name="connection">Connection used to run the catalog lookup.</param>
/// <param name="cancellationToken">Token that cancels the lookup.</param>
/// <returns>
/// <c>true</c> when <c>information_schema.columns</c> lists the column; <c>false</c> when it
/// does not, or when the scalar result is anything other than a boolean.
/// </returns>
private static async Task<bool> HasEmbeddingVectorColumnAsync(
    NpgsqlConnection connection,
    CancellationToken cancellationToken)
{
    const string columnProbeSql = """
        SELECT EXISTS (
        SELECT 1
        FROM information_schema.columns
        WHERE table_schema = 'advisoryai'
        AND table_name = 'kb_chunk'
        AND column_name = 'embedding_vec'
        );
        """;

    await using var probe = connection.CreateCommand();
    probe.CommandTimeout = 30;
    probe.CommandText = columnProbeSql;

    var scalar = await probe.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);

    // Only a boxed boolean "true" counts; null or any other value means the column is treated as absent.
    return scalar is true;
}
/// <summary>
/// Renders an embedding as a bracketed, comma-separated text literal such as <c>[1,2.5,-3]</c>.
/// </summary>
/// <param name="values">Embedding components to render, in order.</param>
/// <returns>
/// The components joined by commas and wrapped in square brackets; an empty array yields <c>[]</c>.
/// </returns>
private static string BuildVectorLiteral(float[] values)
{
    // "G9" with the invariant culture keeps every float round-trippable and
    // guarantees '.' as the decimal separator regardless of the host locale.
    static string FormatComponent(float component) =>
        component.ToString("G9", CultureInfo.InvariantCulture);

    var components = values.Select(FormatComponent);
    var joined = string.Join(",", components);
    return $"[{joined}]";
}
private static async Task EnsureDocumentExistsAsync(
NpgsqlConnection connection,
string docId,