47 lines
1.3 KiB
SQL
47 lines
1.3 KiB
SQL
-- Advisory raw duplicate detection query
|
|
-- Surfaces advisory_raw duplicate candidates prior to enabling the idempotency unique index.
|
|
-- Intended for staging/offline snapshots.
|
|
--
|
|
-- Usage:
|
|
-- psql -d concelier -f ops/devops/tools/ops-scripts/check-advisory-raw-duplicates.sql
|
|
--
|
|
-- Environment variables:
|
|
-- LIMIT - optional cap on number of duplicate groups to print (default 50).
|
|
|
|
\echo '== advisory_raw duplicate audit =='
|
|
\conninfo
|
|
|
|
WITH duplicates AS (
|
|
SELECT
|
|
source_vendor,
|
|
upstream_id,
|
|
content_hash,
|
|
tenant,
|
|
COUNT(*) as count,
|
|
ARRAY_AGG(id) as ids
|
|
FROM advisory_raw
|
|
GROUP BY source_vendor, upstream_id, content_hash, tenant
|
|
HAVING COUNT(*) > 1
|
|
ORDER BY COUNT(*) DESC, source_vendor, upstream_id
|
|
LIMIT COALESCE(NULLIF(:'LIMIT', '')::INT, 50)
|
|
)
|
|
SELECT
|
|
'vendor: ' || source_vendor || E'\n' ||
|
|
'upstream_id: ' || upstream_id || E'\n' ||
|
|
'tenant: ' || COALESCE(tenant, 'NULL') || E'\n' ||
|
|
'content_hash: ' || content_hash || E'\n' ||
|
|
'count: ' || count || E'\n' ||
|
|
'ids: ' || ARRAY_TO_STRING(ids, ', ') AS duplicate_info
|
|
FROM duplicates;
|
|
|
|
SELECT CASE WHEN COUNT(*) = 0
|
|
THEN 'No duplicate advisory_raw documents detected.'
|
|
ELSE 'Found ' || COUNT(*) || ' duplicate groups.'
|
|
END as status
|
|
FROM (
|
|
SELECT 1 FROM advisory_raw
|
|
GROUP BY source_vendor, upstream_id, content_hash, tenant
|
|
HAVING COUNT(*) > 1
|
|
LIMIT 1
|
|
) t;
|