Harden live-backed unified search weighting and indexing
This commit is contained in:
@@ -5,6 +5,7 @@ using NpgsqlTypes;
|
||||
using StellaOps.AdvisoryAI.KnowledgeSearch;
|
||||
using System.Text.Json;
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
|
||||
namespace StellaOps.AdvisoryAI.UnifiedSearch;
|
||||
@@ -254,6 +255,7 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
|
||||
{
|
||||
await using var dataSource = new NpgsqlDataSourceBuilder(_options.ConnectionString).Build();
|
||||
await using var connection = await dataSource.OpenConnectionAsync(cancellationToken).ConfigureAwait(false);
|
||||
var hasEmbeddingVectorColumn = await HasEmbeddingVectorColumnAsync(connection, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
// Ensure parent documents exist for each unique DocId
|
||||
var uniqueDocIds = chunks.Select(static c => c.DocId).Distinct(StringComparer.Ordinal).ToArray();
|
||||
@@ -263,57 +265,163 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
|
||||
await EnsureDocumentExistsAsync(connection, docId, chunk, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
const string sql = """
|
||||
INSERT INTO advisoryai.kb_chunk
|
||||
(
|
||||
chunk_id, doc_id, kind, anchor, section_path,
|
||||
span_start, span_end, title, body, body_tsv,
|
||||
embedding, metadata, domain, entity_key, entity_type, freshness,
|
||||
indexed_at
|
||||
)
|
||||
VALUES
|
||||
(
|
||||
@chunk_id, @doc_id, @kind, @anchor, @section_path,
|
||||
@span_start, @span_end, @title, @body,
|
||||
setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
|
||||
@embedding, @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
|
||||
NOW()
|
||||
)
|
||||
ON CONFLICT (chunk_id) DO UPDATE SET
|
||||
doc_id = EXCLUDED.doc_id,
|
||||
kind = EXCLUDED.kind,
|
||||
anchor = EXCLUDED.anchor,
|
||||
section_path = EXCLUDED.section_path,
|
||||
span_start = EXCLUDED.span_start,
|
||||
span_end = EXCLUDED.span_end,
|
||||
title = EXCLUDED.title,
|
||||
body = EXCLUDED.body,
|
||||
body_tsv = EXCLUDED.body_tsv,
|
||||
embedding = EXCLUDED.embedding,
|
||||
metadata = EXCLUDED.metadata,
|
||||
domain = EXCLUDED.domain,
|
||||
entity_key = EXCLUDED.entity_key,
|
||||
entity_type = EXCLUDED.entity_type,
|
||||
freshness = EXCLUDED.freshness,
|
||||
indexed_at = NOW()
|
||||
WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
|
||||
OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
|
||||
OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
|
||||
OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
|
||||
OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
|
||||
OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
|
||||
OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
|
||||
OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
|
||||
OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
|
||||
OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
|
||||
OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
|
||||
OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
|
||||
OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
|
||||
OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
|
||||
OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
|
||||
""";
|
||||
var sql = hasEmbeddingVectorColumn
|
||||
? """
|
||||
INSERT INTO advisoryai.kb_chunk
|
||||
(
|
||||
chunk_id, doc_id, kind, anchor, section_path,
|
||||
span_start, span_end, title, body, body_tsv,
|
||||
body_tsv_en, body_tsv_de, body_tsv_fr, body_tsv_es, body_tsv_ru,
|
||||
embedding, embedding_vec, metadata, domain, entity_key, entity_type, freshness,
|
||||
indexed_at
|
||||
)
|
||||
VALUES
|
||||
(
|
||||
@chunk_id, @doc_id, @kind, @anchor, @section_path,
|
||||
@span_start, @span_end, @title, @body,
|
||||
setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('english', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('english', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('english', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('german', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('german', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('german', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('french', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('french', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('french', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('spanish', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('spanish', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('spanish', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('russian', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('russian', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('russian', coalesce(@body, '')), 'D'),
|
||||
@embedding, CAST(@embedding_vector AS vector), @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
|
||||
NOW()
|
||||
)
|
||||
ON CONFLICT (chunk_id) DO UPDATE SET
|
||||
doc_id = EXCLUDED.doc_id,
|
||||
kind = EXCLUDED.kind,
|
||||
anchor = EXCLUDED.anchor,
|
||||
section_path = EXCLUDED.section_path,
|
||||
span_start = EXCLUDED.span_start,
|
||||
span_end = EXCLUDED.span_end,
|
||||
title = EXCLUDED.title,
|
||||
body = EXCLUDED.body,
|
||||
body_tsv = EXCLUDED.body_tsv,
|
||||
body_tsv_en = EXCLUDED.body_tsv_en,
|
||||
body_tsv_de = EXCLUDED.body_tsv_de,
|
||||
body_tsv_fr = EXCLUDED.body_tsv_fr,
|
||||
body_tsv_es = EXCLUDED.body_tsv_es,
|
||||
body_tsv_ru = EXCLUDED.body_tsv_ru,
|
||||
embedding = EXCLUDED.embedding,
|
||||
embedding_vec = EXCLUDED.embedding_vec,
|
||||
metadata = EXCLUDED.metadata,
|
||||
domain = EXCLUDED.domain,
|
||||
entity_key = EXCLUDED.entity_key,
|
||||
entity_type = EXCLUDED.entity_type,
|
||||
freshness = EXCLUDED.freshness,
|
||||
indexed_at = NOW()
|
||||
WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
|
||||
OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
|
||||
OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
|
||||
OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
|
||||
OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
|
||||
OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
|
||||
OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
|
||||
OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
|
||||
OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
|
||||
OR advisoryai.kb_chunk.body_tsv_en IS DISTINCT FROM EXCLUDED.body_tsv_en
|
||||
OR advisoryai.kb_chunk.body_tsv_de IS DISTINCT FROM EXCLUDED.body_tsv_de
|
||||
OR advisoryai.kb_chunk.body_tsv_fr IS DISTINCT FROM EXCLUDED.body_tsv_fr
|
||||
OR advisoryai.kb_chunk.body_tsv_es IS DISTINCT FROM EXCLUDED.body_tsv_es
|
||||
OR advisoryai.kb_chunk.body_tsv_ru IS DISTINCT FROM EXCLUDED.body_tsv_ru
|
||||
OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
|
||||
OR advisoryai.kb_chunk.embedding_vec IS DISTINCT FROM EXCLUDED.embedding_vec
|
||||
OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
|
||||
OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
|
||||
OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
|
||||
OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
|
||||
OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
|
||||
"""
|
||||
: """
|
||||
INSERT INTO advisoryai.kb_chunk
|
||||
(
|
||||
chunk_id, doc_id, kind, anchor, section_path,
|
||||
span_start, span_end, title, body, body_tsv,
|
||||
body_tsv_en, body_tsv_de, body_tsv_fr, body_tsv_es, body_tsv_ru,
|
||||
embedding, metadata, domain, entity_key, entity_type, freshness,
|
||||
indexed_at
|
||||
)
|
||||
VALUES
|
||||
(
|
||||
@chunk_id, @doc_id, @kind, @anchor, @section_path,
|
||||
@span_start, @span_end, @title, @body,
|
||||
setweight(to_tsvector('simple', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('simple', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('simple', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('english', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('english', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('english', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('german', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('german', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('german', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('french', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('french', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('french', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('spanish', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('spanish', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('spanish', coalesce(@body, '')), 'D'),
|
||||
setweight(to_tsvector('russian', coalesce(@title, '')), 'A') ||
|
||||
setweight(to_tsvector('russian', coalesce(@section_path, '')), 'B') ||
|
||||
setweight(to_tsvector('russian', coalesce(@body, '')), 'D'),
|
||||
@embedding, @metadata::jsonb, @domain, @entity_key, @entity_type, @freshness,
|
||||
NOW()
|
||||
)
|
||||
ON CONFLICT (chunk_id) DO UPDATE SET
|
||||
doc_id = EXCLUDED.doc_id,
|
||||
kind = EXCLUDED.kind,
|
||||
anchor = EXCLUDED.anchor,
|
||||
section_path = EXCLUDED.section_path,
|
||||
span_start = EXCLUDED.span_start,
|
||||
span_end = EXCLUDED.span_end,
|
||||
title = EXCLUDED.title,
|
||||
body = EXCLUDED.body,
|
||||
body_tsv = EXCLUDED.body_tsv,
|
||||
body_tsv_en = EXCLUDED.body_tsv_en,
|
||||
body_tsv_de = EXCLUDED.body_tsv_de,
|
||||
body_tsv_fr = EXCLUDED.body_tsv_fr,
|
||||
body_tsv_es = EXCLUDED.body_tsv_es,
|
||||
body_tsv_ru = EXCLUDED.body_tsv_ru,
|
||||
embedding = EXCLUDED.embedding,
|
||||
metadata = EXCLUDED.metadata,
|
||||
domain = EXCLUDED.domain,
|
||||
entity_key = EXCLUDED.entity_key,
|
||||
entity_type = EXCLUDED.entity_type,
|
||||
freshness = EXCLUDED.freshness,
|
||||
indexed_at = NOW()
|
||||
WHERE advisoryai.kb_chunk.doc_id IS DISTINCT FROM EXCLUDED.doc_id
|
||||
OR advisoryai.kb_chunk.kind IS DISTINCT FROM EXCLUDED.kind
|
||||
OR advisoryai.kb_chunk.anchor IS DISTINCT FROM EXCLUDED.anchor
|
||||
OR advisoryai.kb_chunk.section_path IS DISTINCT FROM EXCLUDED.section_path
|
||||
OR advisoryai.kb_chunk.span_start IS DISTINCT FROM EXCLUDED.span_start
|
||||
OR advisoryai.kb_chunk.span_end IS DISTINCT FROM EXCLUDED.span_end
|
||||
OR advisoryai.kb_chunk.title IS DISTINCT FROM EXCLUDED.title
|
||||
OR advisoryai.kb_chunk.body IS DISTINCT FROM EXCLUDED.body
|
||||
OR advisoryai.kb_chunk.body_tsv IS DISTINCT FROM EXCLUDED.body_tsv
|
||||
OR advisoryai.kb_chunk.body_tsv_en IS DISTINCT FROM EXCLUDED.body_tsv_en
|
||||
OR advisoryai.kb_chunk.body_tsv_de IS DISTINCT FROM EXCLUDED.body_tsv_de
|
||||
OR advisoryai.kb_chunk.body_tsv_fr IS DISTINCT FROM EXCLUDED.body_tsv_fr
|
||||
OR advisoryai.kb_chunk.body_tsv_es IS DISTINCT FROM EXCLUDED.body_tsv_es
|
||||
OR advisoryai.kb_chunk.body_tsv_ru IS DISTINCT FROM EXCLUDED.body_tsv_ru
|
||||
OR advisoryai.kb_chunk.embedding IS DISTINCT FROM EXCLUDED.embedding
|
||||
OR advisoryai.kb_chunk.metadata IS DISTINCT FROM EXCLUDED.metadata
|
||||
OR advisoryai.kb_chunk.domain IS DISTINCT FROM EXCLUDED.domain
|
||||
OR advisoryai.kb_chunk.entity_key IS DISTINCT FROM EXCLUDED.entity_key
|
||||
OR advisoryai.kb_chunk.entity_type IS DISTINCT FROM EXCLUDED.entity_type
|
||||
OR advisoryai.kb_chunk.freshness IS DISTINCT FROM EXCLUDED.freshness;
|
||||
""";
|
||||
|
||||
await using var command = connection.CreateCommand();
|
||||
command.CommandText = sql;
|
||||
@@ -336,6 +444,11 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
|
||||
"embedding",
|
||||
NpgsqlDbType.Array | NpgsqlDbType.Real,
|
||||
chunk.Embedding is null ? Array.Empty<float>() : chunk.Embedding);
|
||||
if (hasEmbeddingVectorColumn)
|
||||
{
|
||||
var vectorLiteral = chunk.Embedding is null ? (object)DBNull.Value : BuildVectorLiteral(chunk.Embedding);
|
||||
command.Parameters.AddWithValue("embedding_vector", vectorLiteral);
|
||||
}
|
||||
command.Parameters.AddWithValue("metadata", NpgsqlDbType.Jsonb, chunk.Metadata.RootElement.GetRawText());
|
||||
command.Parameters.AddWithValue("domain", chunk.Domain);
|
||||
command.Parameters.AddWithValue("entity_key", (object?)chunk.EntityKey ?? DBNull.Value);
|
||||
@@ -349,6 +462,32 @@ internal sealed class UnifiedSearchIndexer : IUnifiedSearchIndexer
|
||||
return affectedRows;
|
||||
}
|
||||
|
||||
/// <summary>
/// Checks whether the <c>advisoryai.kb_chunk</c> table has a column named
/// <c>embedding_vec</c> by querying <c>information_schema.columns</c>.
/// </summary>
/// <param name="connection">Connection used to create and run the probe command.</param>
/// <param name="cancellationToken">Token passed to the scalar query execution.</param>
/// <returns>
/// <see langword="true"/> only when the query yields a boolean <c>true</c>;
/// any other result (including a non-boolean scalar) is reported as <see langword="false"/>.
/// </returns>
private static async Task<bool> HasEmbeddingVectorColumnAsync(
    NpgsqlConnection connection,
    CancellationToken cancellationToken)
{
    // The SQL lines sit at the same indentation as the closing quotes so the
    // raw string literal carries no leading whitespace.
    const string columnProbeSql = """
        SELECT EXISTS (
        SELECT 1
        FROM information_schema.columns
        WHERE table_schema = 'advisoryai'
        AND table_name = 'kb_chunk'
        AND column_name = 'embedding_vec'
        );
        """;

    await using var probe = connection.CreateCommand();
    probe.CommandTimeout = 30;
    probe.CommandText = columnProbeSql;

    var scalar = await probe.ExecuteScalarAsync(cancellationToken).ConfigureAwait(false);

    // Matches only a boxed boolean true; null or any other type yields false.
    return scalar is true;
}
|
||||
|
||||
/// <summary>
/// Renders a float array as a bracketed, comma-separated list such as <c>[1,2.5,-3]</c>.
/// </summary>
/// <param name="values">Components to render, in order.</param>
/// <returns>
/// The components formatted with the <c>G9</c> specifier under the invariant culture,
/// joined by commas and wrapped in square brackets; an empty array yields <c>[]</c>.
/// </returns>
private static string BuildVectorLiteral(float[] values)
{
    // Invariant culture keeps '.' as the decimal separator regardless of the host locale.
    var components = Array.ConvertAll(
        values,
        static component => component.ToString("G9", CultureInfo.InvariantCulture));

    return string.Concat("[", string.Join(",", components), "]");
}
|
||||
|
||||
private static async Task EnsureDocumentExistsAsync(
|
||||
NpgsqlConnection connection,
|
||||
string docId,
|
||||
|
||||
Reference in New Issue
Block a user