Files
git.stella-ops.org/devops/tools/ops-scripts/check-advisory-raw-duplicates.sql
2026-01-25 23:27:41 +02:00

47 lines
1.3 KiB
SQL

-- Advisory raw duplicate detection query
-- Surfaces advisory_raw duplicate candidates prior to enabling the idempotency unique index.
-- Intended for staging/offline snapshots.
--
-- Usage:
-- psql -d concelier -f ops/devops/tools/ops-scripts/check-advisory-raw-duplicates.sql
--
-- Environment variables:
-- LIMIT - optional cap on number of duplicate groups to print (default 50).
\echo '== advisory_raw duplicate audit =='
\conninfo
WITH duplicates AS (
SELECT
source_vendor,
upstream_id,
content_hash,
tenant,
COUNT(*) as count,
ARRAY_AGG(id) as ids
FROM advisory_raw
GROUP BY source_vendor, upstream_id, content_hash, tenant
HAVING COUNT(*) > 1
ORDER BY COUNT(*) DESC, source_vendor, upstream_id
LIMIT COALESCE(NULLIF(:'LIMIT', '')::INT, 50)
)
SELECT
'vendor: ' || source_vendor || E'\n' ||
'upstream_id: ' || upstream_id || E'\n' ||
'tenant: ' || COALESCE(tenant, 'NULL') || E'\n' ||
'content_hash: ' || content_hash || E'\n' ||
'count: ' || count || E'\n' ||
'ids: ' || ARRAY_TO_STRING(ids, ', ') AS duplicate_info
FROM duplicates;
SELECT CASE WHEN COUNT(*) = 0
THEN 'No duplicate advisory_raw documents detected.'
ELSE 'Found ' || COUNT(*) || ' duplicate groups.'
END as status
FROM (
SELECT 1 FROM advisory_raw
GROUP BY source_vendor, upstream_id, content_hash, tenant
HAVING COUNT(*) > 1
LIMIT 1
) t;