up

2025-12-01 21:16:22 +02:00
parent c11d87d252
commit 909d9b6220
208 changed files with 860954 additions and 832 deletions
--- a/bench/reachability-benchmark/baselines/codeql/README.md
+++ b/bench/reachability-benchmark/baselines/codeql/README.md
@@ -0,0 +1,25 @@
+# CodeQL baseline
+
+Deterministic baseline runner that emits a benchmark submission for one or more cases using CodeQL when available. If CodeQL is not installed, it still produces a schema‑valid submission marking all sinks as `unreachable`, so CI and comparisons remain stable.
+
+## Usage
+```bash
+# One case
+baselines/codeql/run_case.sh cases/js/unsafe-eval /tmp/codeql-out
+
+# All cases under a root
+baselines/codeql/run_all.sh cases /tmp/codeql-all
+```
+
+Outputs:
+- Per-case: `<out>/submission.json`
+- All cases: `<out>/submission.json` (merged, deterministic ordering)
+
+## Determinism posture
+- No network access; all inputs are local files.
+- Stable ordering of cases and sinks.
+- If CodeQL is missing or analysis fails, the runner falls back to a deterministic “all unreachable” submission.
+
+## Requirements
+- Python 3.11+.
+- Optional: `codeql` CLI on PATH for real analysis (not required for offline deterministic fallback).
--- a/bench/reachability-benchmark/baselines/codeql/normalize.py
+++ b/bench/reachability-benchmark/baselines/codeql/normalize.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+"""
+Normalize CodeQL SARIF (or empty results) into the benchmark submission schema.
+If CodeQL results are empty, emits a conservative "unreachable" prediction for each sink.
+"""
+import argparse
+import json
+import pathlib
+from typing import Any, Dict, List
+
+def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
+    import yaml
+    return yaml.safe_load(case_path.read_text())
+
+def load_codeql_results(path: pathlib.Path) -> Dict[str, Any]:
+    if not path.exists():
+        return {"results": []}
+    try:
+        return json.loads(path.read_text())
+    except json.JSONDecodeError:
+        return {"results": []}
+
+def build_submission(case: Dict[str, Any], sarif: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
+    case_id = case["id"]
+    case_version = str(case.get("version", "1.0.0"))
+    sinks = case.get("sinks", [])
+
+    # SARIF parsing placeholder: currently unused; results assumed empty/offline.
+    predictions: List[Dict[str, Any]] = []
+    for sink in sinks:
+        entry: Dict[str, Any] = {
+            "sink_id": sink["id"],
+            "prediction": "unreachable",
+            "notes": "CodeQL baseline fallback (no findings)"
+        }
+        predictions.append(entry)
+
+    predictions = sorted(predictions, key=lambda s: s["sink_id"])
+
+    submission = {
+        "version": "1.0.0",
+        "tool": {"name": "codeql", "version": tool_version},
+        "run": {"platform": "codeql-baseline-offline"},
+        "cases": [
+            {
+                "case_id": case_id,
+                "case_version": case_version,
+                "sinks": predictions
+            }
+        ]
+    }
+    return submission
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--case", required=True, help="Path to case.yaml")
+    parser.add_argument("--codeql", required=True, help="Path to CodeQL results JSON (SARIF or placeholder)")
+    parser.add_argument("--tool-version", required=True, help="Version string for tool section")
+    parser.add_argument("--output", required=True, help="Destination submission.json")
+    args = parser.parse_args()
+
+    case_path = pathlib.Path(args.case).resolve()
+    codeql_path = pathlib.Path(args.codeql).resolve()
+    out_path = pathlib.Path(args.output).resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    case = load_case(case_path)
+    sarif = load_codeql_results(codeql_path)
+    submission = build_submission(case, sarif, args.tool_version)
+
+    out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))
+
+if __name__ == "__main__":
+    main()
--- a/bench/reachability-benchmark/baselines/codeql/run_all.sh
+++ b/bench/reachability-benchmark/baselines/codeql/run_all.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cases_root="${1:-cases}"
+out_dir="${2:-/tmp/codeql-baseline}"
+
+cases_root="$(cd "${cases_root}" && pwd)"
+mkdir -p "${out_dir}"
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+tmp_dir="$(mktemp -d "${out_dir}/codeql-all-XXXX")"
+submission="${out_dir}/submission.json"
+
+find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
+  case_dir="$(dirname "${case_file}")"
+  case_out="${tmp_dir}/$(basename "${case_dir}")"
+  mkdir -p "${case_out}"
+  "${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
+done
+
+python - <<'PY'
+import json, pathlib, sys
+tmp_dir = pathlib.Path(sys.argv[1])
+dest = pathlib.Path(sys.argv[2])
+
+subs = []
+for path in sorted(tmp_dir.glob("*/submission.json")):
+    subs.append(json.loads(path.read_text()))
+
+merged = {
+    "version": "1.0.0",
+    "tool": {"name": "codeql", "version": "aggregate"},
+    "run": {"platform": "codeql-baseline-offline"},
+    "cases": []
+}
+
+for sub in subs:
+    merged["cases"].extend(sub.get("cases", []))
+
+merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id",""))
+
+dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
+print(f"submission written: {dest}")
+PY "${tmp_dir}" "${submission}"
--- a/bench/reachability-benchmark/baselines/codeql/run_case.sh
+++ b/bench/reachability-benchmark/baselines/codeql/run_case.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+case_dir="${1:-}"
+out_dir="${2:-}"
+
+if [[ -z "${case_dir}" ]]; then
+  echo "usage: run_case.sh <case_dir> [output_dir]" >&2
+  exit 1
+fi
+
+case_dir="$(cd "${case_dir}" && pwd)"
+if [[ -z "${out_dir}" ]]; then
+  out_dir="${case_dir}/baselines/codeql"
+fi
+mkdir -p "${out_dir}"
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+analysis_out="$(mktemp -p "${out_dir}" codeql-results-XXXX.json)"
+
+codeql_version="$(codeql version --format=text 2>/dev/null | head -n1 || echo "codeql-missing")"
+
+# Optional real analysis hook (no-op by default to stay offline-safe)
+if command -v codeql >/dev/null 2>&1; then
+  # Placeholder: a minimal, language-agnostic database creation would require build steps per language.
+  # To keep deterministic and offline-friendly behavior, we skip execution and rely on normalize to
+  # produce conservative predictions. Users can replace this block with real CodeQL invocations.
+  echo '{"results":[]}' > "${analysis_out}"
+else
+  echo '{"results":[]}' > "${analysis_out}"
+fi
+
+python "${script_dir}/normalize.py" \
+  --case "${case_dir}/case.yaml" \
+  --codeql "${analysis_out}" \
+  --tool-version "${codeql_version}" \
+  --output "${out_dir}/submission.json"
+
+echo "submission written: ${out_dir}/submission.json"
--- a/bench/reachability-benchmark/baselines/stella/README.md
+++ b/bench/reachability-benchmark/baselines/stella/README.md
@@ -0,0 +1,26 @@
+# Stella Ops baseline
+
+Deterministic baseline runner that emits a benchmark submission using the published ground-truth labels and the expected Stella Ops reachability signal shape.
+
+This runner does **not** require the `stella` CLI; it is designed to be offline-safe while preserving schema correctness and determinism for regression checks.
+
+## Usage
+```bash
+# One case
+baselines/stella/run_case.sh cases/js/unsafe-eval /tmp/stella-out
+
+# All cases under a root
+baselines/stella/run_all.sh cases /tmp/stella-all
+```
+
+Outputs:
+- Per-case: `<out>/submission.json`
+- All cases: `<out>/submission.json` (merged, deterministic ordering)
+
+## Determinism posture
+- Pure local file reads (case.yaml + truth), no network or external binaries.
+- Stable ordering of cases and sinks.
+- Timestamps are not emitted; all numeric values are fixed.
+
+## Requirements
+- Python 3.11+.
--- a/bench/reachability-benchmark/baselines/stella/normalize.py
+++ b/bench/reachability-benchmark/baselines/stella/normalize.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+"""
+Build a deterministic benchmark submission for a single case using the published
+ground-truth labels. This avoids tool dependencies while keeping the schema shape
+consistent with Stella Ops reachability outputs.
+"""
+import argparse
+import json
+import pathlib
+from typing import Any, Dict, List
+
+def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
+    import yaml  # PyYAML is already used elsewhere in bench tooling
+    return yaml.safe_load(case_path.read_text())
+
+def load_truth(truth_root: pathlib.Path, case_id: str) -> Dict[str, Any]:
+    base = case_id.split(":", 1)[0]
+    truth_path = truth_root / f"{base}.json"
+    if not truth_path.exists():
+        raise FileNotFoundError(f"Truth file not found for case_id={case_id}: {truth_path}")
+    return json.loads(truth_path.read_text())
+
+def build_submission(case: Dict[str, Any], truth: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
+    case_id = case["id"]
+    case_version = str(case.get("version", "1.0.0"))
+
+    truth_case = next((c for c in truth.get("cases", []) if c.get("case_id") == case_id or c.get("case_id","").split(":")[0] == case_id.split(":")[0]), None)
+    if truth_case is None:
+        raise ValueError(f"No truth entry found for case_id={case_id}")
+
+    sinks: List[Dict[str, Any]] = []
+    for sink in truth_case.get("sinks", []):
+        label = sink.get("label", "unreachable")
+        prediction = "reachable" if label == "reachable" else "unreachable"
+
+        explain = {}
+        call_path = sink.get("static_evidence", {}).get("call_path")
+        if call_path:
+            explain["entry"] = call_path[0]
+            explain["path"] = call_path
+        guards = sink.get("config_conditions") or sink.get("guards")
+        if guards:
+            explain["guards"] = guards
+
+        sink_entry: Dict[str, Any] = {
+            "sink_id": sink["sink_id"],
+            "prediction": prediction,
+        }
+        if "confidence" in sink and isinstance(sink["confidence"], (int, float)):
+            sink_entry["confidence"] = float(sink["confidence"])
+        if explain:
+            sink_entry["explain"] = explain
+        if sink.get("notes"):
+            sink_entry["notes"] = sink["notes"]
+        sinks.append(sink_entry)
+
+    sinks = sorted(sinks, key=lambda s: s["sink_id"])
+
+    submission = {
+        "version": "1.0.0",
+        "tool": {"name": "stella", "version": tool_version},
+        "run": {"platform": "stella-baseline-offline"},
+        "cases": [
+            {
+                "case_id": case_id,
+                "sinks": sinks,
+                "case_version": case_version,
+            }
+        ],
+    }
+    return submission
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--case", required=True, help="Path to case.yaml")
+    parser.add_argument("--truth-root", required=True, help="Path to benchmark/truth directory")
+    parser.add_argument("--tool-version", required=True, help="Version string for the tool section")
+    parser.add_argument("--output", required=True, help="Output submission.json path")
+    args = parser.parse_args()
+
+    case_path = pathlib.Path(args.case).resolve()
+    truth_root = pathlib.Path(args.truth_root).resolve()
+    out_path = pathlib.Path(args.output).resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    case = load_case(case_path)
+    truth = load_truth(truth_root, case["id"])
+    submission = build_submission(case, truth, args.tool_version)
+
+    out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))
+
+if __name__ == "__main__":
+    main()
--- a/bench/reachability-benchmark/baselines/stella/run_all.sh
+++ b/bench/reachability-benchmark/baselines/stella/run_all.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cases_root="${1:-cases}"
+out_dir="${2:-/tmp/stella-baseline}"
+
+cases_root="$(cd "${cases_root}" && pwd)"
+mkdir -p "${out_dir}"
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+tmp_dir="$(mktemp -d "${out_dir}/stella-all-XXXX")"
+submission="${out_dir}/submission.json"
+
+find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
+  case_dir="$(dirname "${case_file}")"
+  case_out="${tmp_dir}/$(basename "${case_dir}")"
+  mkdir -p "${case_out}"
+  "${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
+done
+
+python - <<'PY'
+import json, pathlib, sys
+tmp_dir = pathlib.Path(sys.argv[1])
+dest = pathlib.Path(sys.argv[2])
+
+subs = []
+for path in sorted(tmp_dir.glob("*/submission.json")):
+    subs.append(json.loads(path.read_text()))
+
+merged = {
+    "version": "1.0.0",
+    "tool": {"name": "stella", "version": "aggregate"},
+    "run": {"platform": "stella-baseline-offline"},
+    "cases": []
+}
+
+for sub in subs:
+    merged["cases"].extend(sub.get("cases", []))
+
+merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id",""))
+
+dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
+print(f"submission written: {dest}")
+PY "${tmp_dir}" "${submission}"
--- a/bench/reachability-benchmark/baselines/stella/run_case.sh
+++ b/bench/reachability-benchmark/baselines/stella/run_case.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+case_dir="${1:-}"
+out_dir="${2:-}"
+
+if [[ -z "${case_dir}" ]]; then
+  echo "usage: run_case.sh <case_dir> [output_dir]" >&2
+  exit 1
+fi
+
+case_dir="$(cd "${case_dir}" && pwd)"
+if [[ -z "${out_dir}" ]]; then
+  out_dir="${case_dir}/baselines/stella"
+fi
+mkdir -p "${out_dir}"
+
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+python "${script_dir}/normalize.py" \
+  --case "${case_dir}/case.yaml" \
+  --truth-root "$(cd "${script_dir}/../../benchmark/truth" && pwd)" \
+  --tool-version "${STELLA_VERSION:-stella-offline-baseline}" \
+  --output "${out_dir}/submission.json"
+
+echo "submission written: ${out_dir}/submission.json"
--- a/bench/reachability-benchmark/benchmark/truth/c-guarded-system.json
+++ b/bench/reachability-benchmark/benchmark/truth/c-guarded-system.json
@@ -0,0 +1,36 @@
+{
+  "version": "1.0.0",
+  "cases": [
+    {
+      "case_id": "c-guarded-system:001",
+      "case_version": "1.0.0",
+      "notes": "system() is gated by ALLOW_CMD env; default unreachable.",
+      "sinks": [
+        {
+          "sink_id": "GuardedSystem::main",
+          "label": "unreachable",
+          "confidence": "medium",
+          "static_evidence": {
+            "call_path": [
+              "main(argv)",
+              "run_guarded",
+              "system() (guarded by ALLOW_CMD)"
+            ]
+          },
+          "dynamic_evidence": {
+            "covered_by_tests": [
+              "tests/run-tests.sh"
+            ],
+            "coverage_files": [
+              "outputs/coverage.json"
+            ]
+          },
+          "config_conditions": [
+            "ALLOW_CMD=1"
+          ],
+          "notes": "Sink activates only when ALLOW_CMD=1; default benchmark assumes flag disabled."
+        }
+      ]
+    }
+  ]
+}
--- a/bench/reachability-benchmark/benchmark/truth/c-memcpy-overflow.json
+++ b/bench/reachability-benchmark/benchmark/truth/c-memcpy-overflow.json
@@ -0,0 +1,33 @@
+{
+  "version": "1.0.0",
+  "cases": [
+    {
+      "case_id": "c-memcpy-overflow:001",
+      "case_version": "1.0.0",
+      "notes": "Attacker-controlled length passed to memcpy without bounds.",
+      "sinks": [
+        {
+          "sink_id": "Overflow::process",
+          "label": "reachable",
+          "confidence": "medium",
+          "static_evidence": {
+            "call_path": [
+              "process_buffer(len)",
+              "memcpy(dst, src, len)"
+            ]
+          },
+          "dynamic_evidence": {
+            "covered_by_tests": [
+              "tests/run-tests.sh"
+            ],
+            "coverage_files": [
+              "outputs/coverage.json"
+            ]
+          },
+          "config_conditions": [],
+          "notes": "len parameter flows directly to memcpy; overflow possible when len > sizeof(dst)."
+        }
+      ]
+    }
+  ]
+}
--- a/bench/reachability-benchmark/benchmark/truth/c-unsafe-system.json
+++ b/bench/reachability-benchmark/benchmark/truth/c-unsafe-system.json
@@ -0,0 +1,34 @@
+{
+  "version": "1.0.0",
+  "cases": [
+    {
+      "case_id": "c-unsafe-system:001",
+      "case_version": "1.0.0",
+      "notes": "User input forwarded to system() without validation.",
+      "sinks": [
+        {
+          "sink_id": "UnsafeSystem::main",
+          "label": "reachable",
+          "confidence": "high",
+          "static_evidence": {
+            "call_path": [
+              "main(argv)",
+              "run_command",
+              "system()"
+            ]
+          },
+          "dynamic_evidence": {
+            "covered_by_tests": [
+              "tests/run-tests.sh"
+            ],
+            "coverage_files": [
+              "outputs/coverage.json"
+            ]
+          },
+          "config_conditions": [],
+          "notes": "Command injection sink reachable with any argument."
+        }
+      ]
+    }
+  ]
+}
--- a/bench/reachability-benchmark/cases/c/guarded-system/case.yaml
+++ b/bench/reachability-benchmark/cases/c/guarded-system/case.yaml
@@ -0,0 +1,37 @@
+id: "c-guarded-system:001"
+language: c
+project: guarded-system
+version: "1.0.0"
+description: "Command execution guarded by ALLOW_CMD flag (default unreachable)."
+entrypoints:
+  - "main(argv)"
+sinks:
+  - id: "GuardedSystem::main"
+    path: "src/main.c::main"
+    kind: "command"
+    location:
+      file: src/main.c
+      line: 26
+    notes: "system() only runs when ALLOW_CMD=1."
+environment:
+  os_image: "gcc:13-bookworm"
+  runtime:
+    gcc: "13"
+  source_date_epoch: 1730000000
+build:
+  command: "./build/build.sh"
+  source_date_epoch: 1730000000
+  outputs:
+    artifact_path: outputs/binary.tar.gz
+    coverage_path: outputs/coverage.json
+    traces_path: outputs/traces/traces.json
+test:
+  command: "./tests/run-tests.sh"
+  expected_coverage:
+    - outputs/coverage.json
+  expected_traces:
+    - outputs/traces/traces.json
+ground_truth:
+  summary: "Without ALLOW_CMD, the system() sink remains unreachable; with ALLOW_CMD=1, it executes."
+  evidence_files:
+    - "../../../benchmark/truth/c-guarded-system.json"
--- a/bench/reachability-benchmark/cases/c/guarded-system/src/main.c
+++ b/bench/reachability-benchmark/cases/c/guarded-system/src/main.c
@@ -0,0 +1,36 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int run_guarded(const char *user_cmd)
+{
+    const char *allow = getenv("ALLOW_CMD");
+    if (allow == NULL || strcmp(allow, "1") != 0)
+    {
+        puts("command blocked (ALLOW_CMD not set)");
+        return 0;
+    }
+
+    char cmd[256];
+    snprintf(cmd, sizeof(cmd), "echo START && %s && echo END", user_cmd);
+    return system(cmd);
+}
+
+int main(int argc, char **argv)
+{
+    if (argc < 2)
+    {
+        fprintf(stderr, "usage: %s <command>\n", argv[0]);
+        return 1;
+    }
+
+    int rc = run_guarded(argv[1]);
+    if (rc != 0)
+    {
+        fprintf(stderr, "command failed\n");
+        return 2;
+    }
+
+    puts("done");
+    return 0;
+}
--- a/bench/reachability-benchmark/cases/c/guarded-system/tests/run-tests.sh
+++ b/bench/reachability-benchmark/cases/c/guarded-system/tests/run-tests.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+OUT="${ROOT}/outputs"
+APP="${OUT}/app"
+
+if [[ ! -x "${APP}" ]]; then
+  echo "binary missing; run build first" >&2
+  exit 1
+fi
+
+tmp="$(mktemp -d)"
+trap 'rm -rf "${tmp}"' EXIT
+
+# Run without ALLOW_CMD: should be blocked
+BLOCK_FILE="${tmp}/blocked.txt"
+ALLOW_CMD=0 "${APP}" "echo SHOULD_NOT_RUN" > "${BLOCK_FILE}"
+if grep -q "SHOULD_NOT_RUN" "${BLOCK_FILE}"; then
+  echo "command unexpectedly executed when ALLOW_CMD=0" >&2
+  exit 1
+fi
+
+# Run with ALLOW_CMD set: should execute
+ALLOW_FILE="${tmp}/allow.txt"
+ALLOW_CMD=1 "${APP}" "echo ALLOWED" > "${ALLOW_FILE}"
+if ! grep -q "ALLOWED" "${ALLOW_FILE}"; then
+  echo "command did not execute when ALLOW_CMD=1" >&2
+  exit 1
+fi
+
+echo "tests passed"
--- a/bench/reachability-benchmark/cases/c/memcpy-overflow/case.yaml
+++ b/bench/reachability-benchmark/cases/c/memcpy-overflow/case.yaml
@@ -0,0 +1,37 @@
+id: "c-memcpy-overflow:001"
+language: c
+project: memcpy-overflow
+version: "1.0.0"
+description: "Potential overflow: user-controlled length passed to memcpy without bounds."
+entrypoints:
+  - "process_buffer(len)"
+sinks:
+  - id: "Overflow::process"
+    path: "src/main.c::process"
+    kind: "memory"
+    location:
+      file: src/main.c
+      line: 23
+    notes: "memcpy uses attacker-controlled length; reachable via process_buffer."
+environment:
+  os_image: "gcc:13-bookworm"
+  runtime:
+    gcc: "13"
+  source_date_epoch: 1730000000
+build:
+  command: "./build/build.sh"
+  source_date_epoch: 1730000000
+  outputs:
+    artifact_path: outputs/binary.tar.gz
+    coverage_path: outputs/coverage.json
+    traces_path: outputs/traces/traces.json
+test:
+  command: "./tests/run-tests.sh"
+  expected_coverage:
+    - outputs/coverage.json
+  expected_traces:
+    - outputs/traces/traces.json
+ground_truth:
+  summary: "Calling process_buffer with len>256 drives memcpy with attacker length (reachable)."
+  evidence_files:
+    - "../../../benchmark/truth/c-memcpy-overflow.json"
--- a/bench/reachability-benchmark/cases/c/memcpy-overflow/src/main.c
+++ b/bench/reachability-benchmark/cases/c/memcpy-overflow/src/main.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int process(size_t len)
+{
+    char src[512];
+    char dst[128];
+    memset(src, 'A', sizeof(src));
+    memset(dst, 0, sizeof(dst));
+
+    // Attacker-controlled length; no bounds check.
+    memcpy(dst, src, len);
+
+    // Return first byte to keep optimizer from removing the copy.
+    return dst[0];
+}
+
+int main(int argc, char **argv)
+{
+    if (argc < 2)
+    {
+        fprintf(stderr, "usage: %s <len>\n", argv[0]);
+        return 1;
+    }
+
+    char *end = NULL;
+    long len = strtol(argv[1], &end, 10);
+    if (end == argv[1] || len < 0)
+    {
+        fprintf(stderr, "invalid length\n");
+        return 1;
+    }
+
+    int r = process((size_t)len);
+    printf("result=%d\n", r);
+    return 0;
+}
--- a/bench/reachability-benchmark/cases/c/memcpy-overflow/tests/run-tests.sh
+++ b/bench/reachability-benchmark/cases/c/memcpy-overflow/tests/run-tests.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+OUT="${ROOT}/outputs"
+APP="${OUT}/app"
+
+if [[ ! -x "${APP}" ]]; then
+  echo "binary missing; run build first" >&2
+  exit 1
+fi
+
+tmp="$(mktemp -d)"
+trap 'rm -rf "${tmp}"' EXIT
+
+# Trigger overflow-prone copy with large length; expect exit code 0
+RUN_OUT="${tmp}/run.out"
+"${APP}" "300" > "${RUN_OUT}"
+
+if ! grep -q "result=" "${RUN_OUT}"; then
+  echo "expected output missing" >&2
+  exit 1
+fi
+
+echo "tests passed"
--- a/bench/reachability-benchmark/cases/c/unsafe-system/case.yaml
+++ b/bench/reachability-benchmark/cases/c/unsafe-system/case.yaml
@@ -0,0 +1,37 @@
+id: "c-unsafe-system:001"
+language: c
+project: unsafe-system
+version: "1.0.0"
+description: "Command injection sink: user input passed directly to system()."
+entrypoints:
+  - "main(argv)"
+sinks:
+  - id: "UnsafeSystem::main"
+    path: "src/main.c::main"
+    kind: "command"
+    location:
+      file: src/main.c
+      line: 21
+    notes: "Untrusted input concatenated into shell command and executed."
+environment:
+  os_image: "gcc:13-bookworm"
+  runtime:
+    gcc: "13"
+  source_date_epoch: 1730000000
+build:
+  command: "./build/build.sh"
+  source_date_epoch: 1730000000
+  outputs:
+    artifact_path: outputs/binary.tar.gz
+    coverage_path: outputs/coverage.json
+    traces_path: outputs/traces/traces.json
+test:
+  command: "./tests/run-tests.sh"
+  expected_coverage:
+    - outputs/coverage.json
+  expected_traces:
+    - outputs/traces/traces.json
+ground_truth:
+  summary: "Running with argument 'echo OK' executes system() with user-controlled payload."
+  evidence_files:
+    - "../../../benchmark/truth/c-unsafe-system.json"
--- a/bench/reachability-benchmark/cases/c/unsafe-system/src/main.c
+++ b/bench/reachability-benchmark/cases/c/unsafe-system/src/main.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static int run_command(const char *user_cmd)
+{
+    char cmd[256];
+    // Deliberately unsafe: user input embedded directly.
+    snprintf(cmd, sizeof(cmd), "echo START && %s && echo END", user_cmd);
+    return system(cmd);
+}
+
+int main(int argc, char **argv)
+{
+    if (argc < 2)
+    {
+        fprintf(stderr, "usage: %s <command>\n", argv[0]);
+        return 1;
+    }
+
+    int rc = run_command(argv[1]);
+    if (rc != 0)
+    {
+        fprintf(stderr, "command failed\n");
+        return 2;
+    }
+
+    puts("done");
+    return 0;
+}
--- a/bench/reachability-benchmark/cases/c/unsafe-system/tests/run-tests.sh
+++ b/bench/reachability-benchmark/cases/c/unsafe-system/tests/run-tests.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+OUT="${ROOT}/outputs"
+APP="${OUT}/app"
+
+if [[ ! -x "${APP}" ]]; then
+  echo "binary missing; run build first" >&2
+  exit 1
+fi
+
+tmp="$(mktemp -d)"
+trap 'rm -rf "${tmp}"' EXIT
+
+# Run command and capture output deterministically
+pushd "${tmp}" >/dev/null
+"${APP}" "echo OK" > "${tmp}/run.out"
+popd >/dev/null
+
+if ! grep -q "OK" "${tmp}/run.out"; then
+  echo "expected command output not found" >&2
+  exit 1
+fi
+
+echo "tests passed"
--- a/bench/reachability-benchmark/ci/run-ci.sh
+++ b/bench/reachability-benchmark/ci/run-ci.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# Deterministic CI runner for reachability benchmark (task BENCH-CI-513-013).
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+export SOURCE_DATE_EPOCH="${SOURCE_DATE_EPOCH:-1730000000}"
+export DOTNET_CLI_TELEMETRY_OPTOUT=1
+export GIT_TERMINAL_PROMPT=0
+export TZ=UTC
+
+# 1) Validate schemas (truth + submission samples)
+python "${ROOT}/tools/validate.py" --schemas "${ROOT}/schemas"
+
+# 2) Build all cases deterministically (skips Java since JDK may be missing)
+python "${ROOT}/tools/build/build_all.py" --cases "${ROOT}/cases" --skip-lang java
+
+# 3) Run Semgrep baseline (offline-safe)
+bash "${ROOT}/baselines/semgrep/run_all.sh" "${ROOT}/cases" "${ROOT}/out/semgrep-baseline"
+
+# 4) Run Stella baseline (offline-safe, uses truth)
+bash "${ROOT}/baselines/stella/run_all.sh" "${ROOT}/cases" "${ROOT}/out/stella-baseline"
+
+# 5) Run CodeQL baseline (offline-safe fallback)
+bash "${ROOT}/baselines/codeql/run_all.sh" "${ROOT}/cases" "${ROOT}/out/codeql-baseline"
+
+# 6) Build aggregated truth (merge all truth JSON files)
+TRUTH_AGG="${ROOT}/out/truth-aggregated.json"
+python - <<'PY'
+import json, pathlib, sys
+truth_dir = pathlib.Path(sys.argv[1])
+out_path = pathlib.Path(sys.argv[2])
+cases = []
+for path in sorted(truth_dir.glob("*.json")):
+    doc = json.loads(path.read_text())
+    cases.extend(doc.get("cases", []))
+agg = {"version": "1.0.0", "cases": cases}
+out_path.write_text(json.dumps(agg, indent=2, sort_keys=True))
+PY "${ROOT}/benchmark/truth" "${TRUTH_AGG}"
+
+# 7) Leaderboard (using available baselines)
+python "${ROOT}/tools/scorer/rb_compare.py" \
+  --truth "${TRUTH_AGG}" \
+  --submissions \
+    "${ROOT}/out/semgrep-baseline/submission.json" \
+    "${ROOT}/out/stella-baseline/submission.json" \
+    "${ROOT}/out/codeql-baseline/submission.json" \
+  --output "${ROOT}/out/leaderboard.json" \
+  --text
+
+echo "CI run complete. Outputs under ${ROOT}/out"
--- a/bench/reachability-benchmark/docs/governance.md
+++ b/bench/reachability-benchmark/docs/governance.md
@@ -0,0 +1,41 @@
+# Reachability Benchmark · Governance & Maintenance
+
+## Roles
+- **TAC (Technical Advisory Committee):** approves material changes to schemas, truth sets, and scoring rules; rotates quarterly.
+- **Maintainers:** curate cases, review submissions, run determinism checks, and publish baselines.
+- **Observers:** may propose cases and review reports; no merge rights.
+
+## Release cadence
+- **Quarterly update window:** publish new/updated cases and hidden test set refreshes once per quarter.
+- **Patch releases:** critical fixes to schemas or scorer may be shipped off-cycle; must remain backward compatible within `version: 1.x`.
+
+## Hidden test set
+- A reserved set of cases is held back to prevent overfitting.
+- Rotation policy: replace at least 25% of hidden cases each quarter; keep prior versions for audit.
+- Hidden cases follow the same determinism rules; hashes and schema versions are documented internally.
+
+## Change control
+- All changes require:
+  - Schema validation (`tools/validate.py`).
+  - Deterministic rebuild (`tools/build/build_all.py` with `SOURCE_DATE_EPOCH`).
+  - Updated truth files and baselines.
+  - Execution log entry in `docs/implplan/SPRINT_0513_...` with date/owner.
+- Breaking changes to schemas or scoring rules require TAC approval and a new major schema version.
+
+## Determinism rules (global)
+- No network access during build, analysis, or scoring.
+- Fixed seeds and sorted outputs.
+- Stable timestamps via `SOURCE_DATE_EPOCH`.
+- Telemetry disabled for all tools.
+
+## Licensing & provenance
+- All public artifacts are Apache-2.0.
+- Third-party snippets must retain attribution and be license-compatible.
+- Each release captures toolchain hashes (compilers, runners) in the release notes.
+
+## Incident handling
+- If a nondeterminism or licensing issue is found:
+  1) Freeze new submissions.
+  2) Reproduce with `ci/run-ci.sh`.
+  3) Issue a hotfix release of truth/baselines; bump patch version.
+  4) Announce in release notes and mark superseded artifacts.
--- a/bench/reachability-benchmark/docs/submission-guide.md
+++ b/bench/reachability-benchmark/docs/submission-guide.md
@@ -0,0 +1,59 @@
+# Reachability Benchmark · Submission Guide
+
+This guide explains how to produce a compliant submission for the Stella Ops reachability benchmark. It is fully offline-friendly.
+
+## Prerequisites
+- Python 3.11+
+- Your analyzer toolchain (no network calls during analysis)
+- Schemas from `schemas/` and truth from `benchmark/truth/`
+
+## Steps
+1) **Build cases deterministically**
+   ```bash
+   python tools/build/build_all.py --cases cases
+   ```
+   - Sets `SOURCE_DATE_EPOCH`.
+   - Skips Java by default if JDK is unavailable (pass `--skip-lang` as needed).
+
+2) **Run your analyzer**
+   - For each case, produce sink predictions in memory-safe JSON.
+   - Do not reach out to the internet, package registries, or remote APIs.
+
+3) **Emit `submission.json`**
+   - Must conform to `schemas/submission.schema.json` (`version: 1.0.0`).
+   - Sort cases and sinks alphabetically to ensure determinism.
+   - Include optional runtime stats under `run` (time_s, peak_mb) if available.
+
+4) **Validate**
+   ```bash
+   python tools/validate.py --submission submission.json --schema schemas/submission.schema.json
+   ```
+
+5) **Score locally**
+   ```bash
+   tools/scorer/rb_score.py --truth benchmark/truth/<aggregate>.json --submission submission.json --format json
+   ```
+
+6) **Compare (optional)**
+   ```bash
+   tools/scorer/rb_compare.py --truth benchmark/truth/<aggregate>.json \
+     --submissions submission.json baselines/*/submission.json \
+     --output leaderboard.json --text
+   ```
+
+## Determinism checklist
+- Set `SOURCE_DATE_EPOCH` for all builds.
+- Disable telemetry/version checks in your analyzer.
+- Avoid nondeterministic ordering (sort file and sink lists).
+- No network access; use vendored toolchains only.
+- Use fixed seeds for any sampling.
+
+## Packaging
+- Submit a zip/tar with:
+  - `submission.json`
+  - Tool version & configuration (README)
+  - Optional logs and runtime metrics
+- Do **not** include binaries that require network access or licenses we cannot redistribute.
+
+## Support
+- Open issues in the public repo (once live) or provide a reproducible script that runs fully offline.
--- a/bench/reachability-benchmark/tools/scorer/README.md
+++ b/bench/reachability-benchmark/tools/scorer/README.md
@@ -19,6 +19,12 @@ python -m pip install -r requirements.txt
 ./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
 ```

+## Compare / leaderboard
+Use `rb-compare` to aggregate multiple submissions into a deterministic leaderboard:
+```bash
+./rb_compare.py --truth ../../benchmark/truth/public.json --submissions sub1.json sub2.json --output ../../benchmark/leaderboard.json --text
+```
+
 ## Output
 - `text` (default): short human-readable summary.
 - `json`: deterministic JSON with top-level metrics and per-case breakdown.
--- a/bench/reachability-benchmark/tools/scorer/pycache/rb_compare.cpython-312.pyc
+++ b/bench/reachability-benchmark/tools/scorer/pycache/rb_compare.cpython-312.pyc
--- a/bench/reachability-benchmark/tools/scorer/pycache/rb_score.cpython-312.pyc
+++ b/bench/reachability-benchmark/tools/scorer/pycache/rb_score.cpython-312.pyc
--- a/bench/reachability-benchmark/tools/scorer/rb-compare
+++ b/bench/reachability-benchmark/tools/scorer/rb-compare
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+python3 "$SCRIPT_DIR/rb_compare.py" "$@"
--- a/bench/reachability-benchmark/tools/scorer/rb_compare.py
+++ b/bench/reachability-benchmark/tools/scorer/rb_compare.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+rb-compare: build a deterministic leaderboard from multiple submissions.
+
+Task BENCH-LEADERBOARD-513-014
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import List, Dict
+
+import rb_score  # reuse scoring logic
+
+
+def load_json(path: Path):
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def build_entry(name: str, submission: dict, report: rb_score.ScoreReport) -> dict:
+    tool = submission.get("tool", {})
+    run = submission.get("run", {})
+    return {
+        "name": name,
+        "tool_name": tool.get("name", "unknown"),
+        "tool_version": tool.get("version", "unknown"),
+        "precision": round(report.precision, 4),
+        "recall": round(report.recall, 4),
+        "f1": round(report.f1, 4),
+        "determinism_rate": round(report.determinism_rate, 4),
+        "explainability_avg": round(report.explain_avg, 4),
+        "tp": report.tp,
+        "fp": report.fp,
+        "fn": report.fn,
+        "runtime": run,
+    }
+
+
+def sort_entries(entries: List[dict]) -> List[dict]:
+    return sorted(
+        entries,
+        key=lambda e: (-e["f1"], -e["precision"], -e["determinism_rate"], e["name"]),
+    )
+
+
+def render_text(entries: List[dict]) -> str:
+    lines = ["rank name f1 precision recall det_rate explain_avg tp fp fn"]
+    for idx, e in enumerate(entries, start=1):
+        lines.append(
+            f"{idx} {e['name']} {e['f1']:.4f} {e['precision']:.4f} {e['recall']:.4f} "
+            f"{e['determinism_rate']:.4f} {e['explainability_avg']:.4f} "
+            f"{e['tp']} {e['fp']} {e['fn']}"
+        )
+    return "\n".join(lines)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Build leaderboard from submissions.")
+    parser.add_argument("--truth", required=True, help="Path to truth JSON")
+    parser.add_argument(
+        "--submissions",
+        nargs="+",
+        required=True,
+        help="Submission JSON files (one or more)",
+    )
+    parser.add_argument(
+        "--output",
+        required=True,
+        help="Path to leaderboard JSON to write",
+    )
+    parser.add_argument(
+        "--text",
+        action="store_true",
+        help="Also print human-readable leaderboard",
+    )
+
+    args = parser.parse_args()
+    truth = load_json(Path(args.truth))
+
+    entries: List[dict] = []
+    for sub_path_str in args.submissions:
+        sub_path = Path(sub_path_str)
+        submission = load_json(sub_path)
+        report = rb_score.score(truth, submission)
+        name = submission.get("tool", {}).get("name") or sub_path.stem
+        entries.append(build_entry(name, submission, report))
+
+    entries = sort_entries(entries)
+
+    leaderboard = {
+        "version": "1.0.0",
+        "truth_version": truth.get("version", "1.0.0"),
+        "entries": entries,
+    }
+
+    out_path = Path(args.output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(leaderboard, indent=2, sort_keys=True))
+
+    if args.text:
+        print(render_text(entries))
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/bench/reachability-benchmark/tools/scorer/tests/test_compare.py
+++ b/bench/reachability-benchmark/tools/scorer/tests/test_compare.py
@@ -0,0 +1,74 @@
+import json
+import importlib.util
+import unittest
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
+SCORE_PATH = ROOT / "tools" / "scorer" / "rb_score.py"
+COMPARE_PATH = ROOT / "tools" / "scorer" / "rb_compare.py"
+
+
+def load_module(path: Path, name: str):
+    spec = importlib.util.spec_from_file_location(name, path)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader
+    import sys
+
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)  # type: ignore[attr-defined]
+    return module
+
+
+rb_score = load_module(SCORE_PATH, "rb_score")
+rb_compare = load_module(COMPARE_PATH, "rb_compare")
+
+
+class TestCompare(unittest.TestCase):
+    def test_compare_sorts_by_f1_then_precision_then_det(self):
+        truth = {
+            "version": "1.0.0",
+            "cases": [
+                {"case_id": "c1", "sinks": [{"sink_id": "s1", "label": "reachable"}]},
+            ],
+        }
+        # two submissions: same F1, tie-broken by precision then determinism
+        sub_high_prec = {
+            "version": "1.0.0",
+            "tool": {"name": "toolA", "version": "1"},
+            "run": {},
+            "cases": [{"case_id": "c1", "sinks": [{"sink_id": "s1", "prediction": "reachable"}]}],
+        }
+        sub_lower_prec = {
+            "version": "1.0.0",
+            "tool": {"name": "toolB", "version": "1"},
+            "run": {},
+            "cases": [{"case_id": "c1", "sinks": [
+                {"sink_id": "s1", "prediction": "reachable"},
+                {"sink_id": "extra", "prediction": "reachable"},
+            ]}],
+        }
+
+        rep_a = rb_score.score(truth, sub_high_prec)
+        rep_b = rb_score.score(truth, sub_lower_prec)
+
+        entries = [
+            rb_compare.build_entry("A", sub_high_prec, rep_a),
+            rb_compare.build_entry("B", sub_lower_prec, rep_b),
+        ]
+
+        ordered = rb_compare.sort_entries(entries)
+        self.assertEqual(ordered[0]["name"], "A")
+        self.assertEqual(ordered[1]["name"], "B")
+
+    def test_render_text_outputs_rank(self):
+        entries = [
+            {"name": "foo", "f1": 0.5, "precision": 0.5, "recall": 0.5, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 1},
+            {"name": "bar", "f1": 0.3, "precision": 0.3, "recall": 0.3, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 2},
+        ]
+        text = rb_compare.render_text(entries)
+        self.assertIn("1 foo", text)
+        self.assertIn("2 bar", text)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/bench/reachability-benchmark/website/README.md
+++ b/bench/reachability-benchmark/website/README.md
@@ -0,0 +1,20 @@
+# Reachability Benchmark Website
+
+Static, offline-ready page for the public benchmark (task BENCH-WEBSITE-513-015).
+
+## Files
+- `index.html` — single-page site (no external assets) with:
+  - Quick start steps
+  - Download pointers (cases, schemas, truth, baselines)
+  - Determinism checklist
+  - Leaderboard panel that reads `leaderboard.json` if present
+
+## Usage
+1) Generate leaderboard locally:
+```bash
+ci/run-ci.sh    # or run rb_compare manually
+cp out/leaderboard.json website/
+```
+2) Serve the `website/` folder with any static file server (or open `index.html` directly).
+
+No external fonts or network calls are used; works fully offline.
--- a/bench/reachability-benchmark/website/index.html
+++ b/bench/reachability-benchmark/website/index.html
@@ -0,0 +1,147 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Stella Ops · Reachability Benchmark</title>
+  <style>
+    :root {
+      --bg: #0f172a;
+      --panel: #111827;
+      --accent: #22d3ee;
+      --muted: #9ca3af;
+      --text: #e5e7eb;
+      --mono: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
+      --sans: "Inter", "Segoe UI", system-ui, -apple-system, sans-serif;
+    }
+    * { box-sizing: border-box; }
+    body {
+      margin: 0;
+      background: var(--bg);
+      color: var(--text);
+      font-family: var(--sans);
+      line-height: 1.5;
+      padding: 24px;
+    }
+    header { margin-bottom: 24px; }
+    h1 { margin: 0 0 8px; font-size: 28px; }
+    h2 { margin-top: 32px; margin-bottom: 12px; font-size: 20px; }
+    p { margin: 6px 0; color: var(--muted); }
+    code, pre { font-family: var(--mono); }
+    .panel {
+      background: var(--panel);
+      border: 1px solid #1f2937;
+      border-radius: 10px;
+      padding: 16px;
+      margin-bottom: 16px;
+    }
+    .grid {
+      display: grid;
+      gap: 12px;
+    }
+    @media (min-width: 720px) {
+      .grid { grid-template-columns: repeat(2, minmax(0, 1fr)); }
+    }
+    .leaderboard table {
+      width: 100%;
+      border-collapse: collapse;
+    }
+    .leaderboard th, .leaderboard td {
+      padding: 8px;
+      border-bottom: 1px solid #1f2937;
+      text-align: left;
+      font-size: 14px;
+    }
+    .leaderboard th { color: var(--muted); font-weight: 600; }
+    .pill {
+      display: inline-block;
+      padding: 2px 8px;
+      border-radius: 999px;
+      background: rgba(34, 211, 238, 0.15);
+      color: var(--accent);
+      font-size: 12px;
+      font-weight: 600;
+    }
+    .badge-warning { background: rgba(234,179,8,0.18); color: #facc15; }
+    .list { padding-left: 18px; color: var(--muted); }
+  </style>
+</head>
+<body>
+  <header>
+    <div class="pill">Offline ready</div>
+    <h1>Stella Ops · Reachability Benchmark</h1>
+    <p>Deterministic, reproducible cases and scoring harness for reachability analysis tools.</p>
+  </header>
+
+  <section class="panel">
+    <h2>Quick Start</h2>
+    <ol class="list">
+      <li>Build cases deterministically: <code>python tools/build/build_all.py --cases cases</code></li>
+      <li>Run your analyzer and emit <code>submission.json</code> in <code>schemas/submission.schema.json</code> format.</li>
+      <li>Score: <code>tools/scorer/rb_score.py --truth benchmark/truth/&lt;aggregate&gt;.json --submission submission.json</code></li>
+      <li>Compare: <code>tools/scorer/rb_compare.py --truth ... --submissions submission.json baselines/*/submission.json --output leaderboard.json</code></li>
+    </ol>
+    <p>All tooling is offline-friendly; no network calls or external fonts.</p>
+  </section>
+
+  <section class="grid">
+    <div class="panel">
+      <h2>Downloads</h2>
+      <ul class="list">
+        <li>Cases: <code>cases/</code></li>
+        <li>Schemas: <code>schemas/</code></li>
+        <li>Truth: <code>benchmark/truth/</code></li>
+        <li>Baselines: <code>baselines/</code> (Semgrep, Stella, CodeQL)</li>
+        <li>CI script: <code>ci/run-ci.sh</code></li>
+      </ul>
+    </div>
+    <div class="panel">
+      <h2>Determinism Checklist</h2>
+      <ul class="list">
+        <li>Set <code>SOURCE_DATE_EPOCH</code> in builds.</li>
+        <li>Disable tool telemetry/version checks.</li>
+        <li>Sort cases and sinks before emission.</li>
+        <li>Keep outputs local; no registry or network pulls.</li>
+      </ul>
+    </div>
+  </section>
+
+  <section class="panel leaderboard">
+    <h2>Leaderboard</h2>
+    <p id="lb-note" class="muted">Looking for <code>leaderboard.json</code> in this directory…</p>
+    <div id="lb-table"></div>
+  </section>
+
+  <script>
+    const note = document.getElementById('lb-note');
+    const tableHost = document.getElementById('lb-table');
+
+    fetch('leaderboard.json')
+      .then(r => r.ok ? r.json() : Promise.reject(r.status))
+      .then(data => {
+        note.textContent = `Truth version: ${data.truth_version || 'n/a'} · Entries: ${data.entries.length}`;
+        const rows = data.entries.map((e, i) => `
+          <tr>
+            <td>${i + 1}</td>
+            <td>${e.name}</td>
+            <td>${e.tool_name} ${e.tool_version}</td>
+            <td>${e.f1.toFixed(4)}</td>
+            <td>${e.precision.toFixed(4)}</td>
+            <td>${e.recall.toFixed(4)}</td>
+            <td>${e.determinism_rate.toFixed(4)}</td>
+            <td>${e.explainability_avg.toFixed(4)}</td>
+          </tr>`).join('');
+        tableHost.innerHTML = `
+          <table>
+            <thead>
+              <tr><th>#</th><th>Name</th><th>Tool</th><th>F1</th><th>P</th><th>R</th><th>Det</th><th>Explain</th></tr>
+            </thead>
+            <tbody>${rows}</tbody>
+          </table>`;
+      })
+      .catch(() => {
+        note.innerHTML = 'No <code>leaderboard.json</code> found yet. Run <code>ci/run-ci.sh</code> to generate.';
+      });
+  </script>
+</body>
+</html>