-- Advisory raw duplicate detection query -- Surfaces advisory_raw duplicate candidates prior to enabling the idempotency unique index. -- Intended for staging/offline snapshots. -- -- Usage: -- psql -d concelier -f ops/devops/tools/ops-scripts/check-advisory-raw-duplicates.sql -- -- Environment variables: -- LIMIT - optional cap on number of duplicate groups to print (default 50). \echo '== advisory_raw duplicate audit ==' \conninfo WITH duplicates AS ( SELECT source_vendor, upstream_id, content_hash, tenant, COUNT(*) as count, ARRAY_AGG(id) as ids FROM advisory_raw GROUP BY source_vendor, upstream_id, content_hash, tenant HAVING COUNT(*) > 1 ORDER BY COUNT(*) DESC, source_vendor, upstream_id LIMIT COALESCE(NULLIF(:'LIMIT', '')::INT, 50) ) SELECT 'vendor: ' || source_vendor || E'\n' || 'upstream_id: ' || upstream_id || E'\n' || 'tenant: ' || COALESCE(tenant, 'NULL') || E'\n' || 'content_hash: ' || content_hash || E'\n' || 'count: ' || count || E'\n' || 'ids: ' || ARRAY_TO_STRING(ids, ', ') AS duplicate_info FROM duplicates; SELECT CASE WHEN COUNT(*) = 0 THEN 'No duplicate advisory_raw documents detected.' ELSE 'Found ' || COUNT(*) || ' duplicate groups.' END as status FROM ( SELECT 1 FROM advisory_raw GROUP BY source_vendor, upstream_id, content_hash, tenant HAVING COUNT(*) > 1 LIMIT 1 ) t;