#!/usr/bin/env bash
# Deterministic CI runner for reachability benchmark (task BENCH-CI-513-013).
#
# Runs, in order: schema validation, case builds, the Semgrep / Stella / CodeQL
# baselines, truth aggregation, and leaderboard scoring. Any failing step
# aborts the run (set -euo pipefail).
#
# Environment:
#   SOURCE_DATE_EPOCH  optional; defaults to 1730000000 when unset or empty.
set -euo pipefail

# Repository root: the parent of the directory that contains this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Pin environment knobs so repeated runs see the same settings.
export SOURCE_DATE_EPOCH="${SOURCE_DATE_EPOCH:-1730000000}"
export DOTNET_CLI_TELEMETRY_OPTOUT=1
export GIT_TERMINAL_PROMPT=0
export TZ=UTC

# NOTE(review): ensure_bench_jdk is presumably a function defined by
# ensure_jdk.sh; confirm against that file.
source "${ROOT}/tools/java/ensure_jdk.sh"
ensure_bench_jdk

# 1) Validate schemas (truth + submission samples)
python "${ROOT}/tools/validate.py" --schemas "${ROOT}/schemas"

# 2) Build all cases deterministically (including Java via vendored JDK)
python "${ROOT}/tools/build/build_all.py" --cases "${ROOT}/cases"

# 3) Run Semgrep baseline (offline-safe)
bash "${ROOT}/baselines/semgrep/run_all.sh" "${ROOT}/cases" "${ROOT}/out/semgrep-baseline"

# 4) Run Stella baseline (offline-safe, uses truth)
bash "${ROOT}/baselines/stella/run_all.sh" "${ROOT}/cases" "${ROOT}/out/stella-baseline"

# 5) Run CodeQL baseline (offline-safe fallback)
bash "${ROOT}/baselines/codeql/run_all.sh" "${ROOT}/cases" "${ROOT}/out/codeql-baseline"

# 6) Build aggregated truth (merge all truth JSON files)
TRUTH_AGG="${ROOT}/out/truth-aggregated.json"
# The script arguments must sit on the command line itself: the heredoc
# terminator has to be alone on its line, so nothing may follow "PY" below.
python - "${ROOT}/benchmark/truth" "${TRUTH_AGG}" <<'PY'
import json, pathlib, sys

truth_dir = pathlib.Path(sys.argv[1])
out_path = pathlib.Path(sys.argv[2])

cases = []
# Sorted iteration keeps the merged case order stable across runs.
for path in sorted(truth_dir.glob("*.json")):
    doc = json.loads(path.read_text(encoding="utf-8"))
    cases.extend(doc.get("cases", []))

agg = {"version": "1.0.0", "cases": cases}
# Make sure the output directory exists even if no earlier step created it.
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(agg, indent=2, sort_keys=True))
PY

# 7) Leaderboard (using available baselines)
python "${ROOT}/tools/scorer/rb_compare.py" \
  --truth "${TRUTH_AGG}" \
  --submissions \
    "${ROOT}/out/semgrep-baseline/submission.json" \
    "${ROOT}/out/stella-baseline/submission.json" \
    "${ROOT}/out/codeql-baseline/submission.json" \
  --output "${ROOT}/out/leaderboard.json" \
  --text

echo "CI run complete. Outputs under ${ROOT}/out"