Files
git.stella-ops.org/scripts/concelier/backfill-store-aoc-19-005.sh
StellaOps Bot 11597679ed
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Scanner Analyzers / Discover Analyzers (push) Has been cancelled
Scanner Analyzers / Build Analyzers (push) Has been cancelled
Scanner Analyzers / Test Language Analyzers (push) Has been cancelled
Scanner Analyzers / Validate Test Fixtures (push) Has been cancelled
Scanner Analyzers / Verify Deterministic Output (push) Has been cancelled
console-runner-image / build-runner-image (push) Has been cancelled
wine-csp-build / Build Wine CSP Image (push) Has been cancelled
wine-csp-build / Integration Tests (push) Has been cancelled
wine-csp-build / Security Scan (push) Has been cancelled
wine-csp-build / Generate SBOM (push) Has been cancelled
wine-csp-build / Publish Image (push) Has been cancelled
wine-csp-build / Air-Gap Bundle (push) Has been cancelled
wine-csp-build / Test Summary (push) Has been cancelled
feat: Implement BerkeleyDB reader for RPM databases
- Added BerkeleyDbReader class to read and extract RPM header blobs from BerkeleyDB hash databases.
- Implemented methods to detect BerkeleyDB format and extract values, including handling of page sizes and magic numbers.
- Added tests for BerkeleyDbReader to ensure correct functionality and header extraction.

feat: Add Yarn PnP data tests

- Created YarnPnpDataTests to validate package resolution and data loading from Yarn PnP cache.
- Implemented tests for resolved keys, package presence, and loading from cache structure.

test: Add egg-info package fixtures for Python tests

- Created egg-info package fixtures for testing Python analyzers.
- Included PKG-INFO, entry_points.txt, and installed-files.txt for comprehensive coverage.

test: Enhance RPM database reader tests

- Added tests for RpmDatabaseReader to validate fallback to legacy packages when SQLite is missing.
- Implemented helper methods to create legacy package files and RPM headers for testing.

test: Implement dual signing tests

- Added DualSignTests to validate secondary signature addition when configured.
- Created stub implementations for crypto providers and key resolvers to facilitate testing.

chore: Update CI script for Playwright Chromium installation

- Modified ci-console-exports.sh to ensure deterministic Chromium binary installation for console exports tests.
- Added checks for Windows compatibility and environment variable setups for Playwright browsers.
2025-12-07 16:24:45 +02:00

88 lines
2.9 KiB
Bash

#!/usr/bin/env bash
set -euo pipefail
# Postgres backfill runner for STORE-AOC-19-005-DEV (Link-Not-Merge raw linksets/chunks)
#
# Usage:
#   PGURI=postgres://.../concelier ./scripts/concelier/backfill-store-aoc-19-005.sh /path/to/linksets-stage-backfill.tar.zst
# Optional:
#   PGSCHEMA=lnm_raw (default), DRY_RUN=1 to stop after extraction
#
# Assumptions:
# - Dataset contains ndjson files: linksets.ndjson, advisory_chunks.ndjson, manifest.json
# - Target staging tables are created by this script if absent:
#     <schema>.linksets_raw(id text primary key, raw jsonb)
#     <schema>.advisory_chunks_raw(id text primary key, raw jsonb)

DATASET_PATH="${1:-}"
if [[ -z "${DATASET_PATH}" || ! -f "${DATASET_PATH}" ]]; then
  echo "Dataset tarball not found. Provide path to linksets-stage-backfill.tar.zst" >&2
  exit 1
fi

PGURI="${PGURI:-${CONCELIER_PG_URI:-}}"
PGSCHEMA="${PGSCHEMA:-lnm_raw}"
DRY_RUN="${DRY_RUN:-0}"
if [[ -z "${PGURI}" ]]; then
  echo "PGURI (or CONCELIER_PG_URI) must be set" >&2
  exit 1
fi

# ON_ERROR_STOP=1: without it psql exits 0 even when a statement inside a
# heredoc fails, which silently defeats `set -e` for every SQL step below.
PSQL=(psql -v ON_ERROR_STOP=1 "${PGURI}")

WORKDIR="$(mktemp -d)"
cleanup() { rm -rf -- "${WORKDIR}"; }
trap cleanup EXIT

echo "==> Dataset: ${DATASET_PATH}"
sha256sum "${DATASET_PATH}"

echo "==> Extracting to ${WORKDIR}"
tar -xf "${DATASET_PATH}" -C "${WORKDIR}"
for required in linksets.ndjson advisory_chunks.ndjson manifest.json; do
  if [[ ! -f "${WORKDIR}/${required}" ]]; then
    echo "Missing required file in dataset: ${required}" >&2
    exit 1
  fi
done

echo "==> Ensuring staging schema/tables exist in Postgres"
"${PSQL[@]}" <<SQL
create schema if not exists ${PGSCHEMA};
create table if not exists ${PGSCHEMA}.linksets_raw (
  id text primary key,
  raw jsonb not null
);
create table if not exists ${PGSCHEMA}.advisory_chunks_raw (
  id text primary key,
  raw jsonb not null
);
SQL

if [[ "${DRY_RUN}" != "0" ]]; then
  echo "DRY_RUN=1 set; extraction and schema verification completed, skipping import."
  exit 0
fi

#######################################
# Convert an ndjson file to a two-column TSV (id, raw-json) and bulk-load it.
# Globals:   PSQL (read)
# Arguments: $1 - path to source .ndjson file
#            $2 - fully-qualified target table (schema.table)
# Returns:   non-zero (aborting via set -e) if jq or psql fails
#######################################
import_ndjson() {
  local src="$1" table="$2"
  local tsv="${src%.ndjson}.tsv"
  # The second @tsv column must be a scalar: `tojson` serializes the document
  # to a string (the raw `.` object would make jq fail with
  # "object cannot be tsv-formatted").
  jq -rc '[._id, tojson] | @tsv' "${src}" >"${tsv}"
  # FORMAT text, not csv: @tsv backslash-escapes tabs/newlines/backslashes,
  # which is exactly the escaping COPY's text format decodes. CSV format would
  # keep those escapes as literal characters and mishandle embedded quotes.
  "${PSQL[@]}" <<SQL
TRUNCATE TABLE ${table};
\copy ${table} (id, raw) FROM '${tsv}' WITH (FORMAT text, DELIMITER E'\t');
SQL
}

echo "==> Importing linksets into ${PGSCHEMA}.linksets_raw"
import_ndjson "${WORKDIR}/linksets.ndjson" "${PGSCHEMA}.linksets_raw"

echo "==> Importing advisory_chunks into ${PGSCHEMA}.advisory_chunks_raw"
import_ndjson "${WORKDIR}/advisory_chunks.ndjson" "${PGSCHEMA}.advisory_chunks_raw"

echo "==> Post-import counts"
"${PSQL[@]}" -tA -c "select 'linksets_raw='||count(*) from ${PGSCHEMA}.linksets_raw;"
"${PSQL[@]}" -tA -c "select 'advisory_chunks_raw='||count(*) from ${PGSCHEMA}.advisory_chunks_raw;"

echo "==> Manifest summary"
cat "${WORKDIR}/manifest.json"
echo "Backfill complete."