
commit 909d9b6220 (parent c11d87d252)
StellaOps Bot, 2025-12-01 21:16:22 +02:00
208 changed files with 860954 additions and 832 deletions

View File

@@ -19,6 +19,12 @@ python -m pip install -r requirements.txt
./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
```
## Compare / leaderboard
Use `rb-compare` to aggregate multiple submissions into a deterministic leaderboard:
```bash
./rb_compare.py --truth ../../benchmark/truth/public.json --submissions sub1.json sub2.json --output ../../benchmark/leaderboard.json --text
```
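The leaderboard JSON contains a `version`, the `truth_version`, and the sorted `entries` (ranked by F1, then precision, then determinism rate). A trimmed, illustrative example of the shape (tool names and numbers below are made up; keys are emitted in sorted order):
```json
{
  "entries": [
    {
      "determinism_rate": 1.0,
      "explainability_avg": 0.75,
      "f1": 0.8889,
      "fn": 2,
      "fp": 1,
      "name": "toolA",
      "precision": 0.9231,
      "recall": 0.8571,
      "runtime": {},
      "tool_name": "toolA",
      "tool_version": "1.2.0",
      "tp": 12
    }
  ],
  "truth_version": "1.0.0",
  "version": "1.0.0"
}
```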
## Output
- `text` (default): short human-readable summary.
- `json`: deterministic JSON with top-level metrics and per-case breakdown.

View File

@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_compare.py" "$@"

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
rb-compare: build a deterministic leaderboard from multiple submissions.
Task BENCH-LEADERBOARD-513-014
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import List

import rb_score  # reuse scoring logic


def load_json(path: Path):
    return json.loads(path.read_text(encoding="utf-8"))


def build_entry(name: str, submission: dict, report: rb_score.ScoreReport) -> dict:
    tool = submission.get("tool", {})
    run = submission.get("run", {})
    return {
        "name": name,
        "tool_name": tool.get("name", "unknown"),
        "tool_version": tool.get("version", "unknown"),
        "precision": round(report.precision, 4),
        "recall": round(report.recall, 4),
        "f1": round(report.f1, 4),
        "determinism_rate": round(report.determinism_rate, 4),
        "explainability_avg": round(report.explain_avg, 4),
        "tp": report.tp,
        "fp": report.fp,
        "fn": report.fn,
        "runtime": run,
    }


def sort_entries(entries: List[dict]) -> List[dict]:
    # Rank by F1, then precision, then determinism rate (all descending);
    # the entry name is the final, deterministic tie-break.
    return sorted(
        entries,
        key=lambda e: (-e["f1"], -e["precision"], -e["determinism_rate"], e["name"]),
    )


def render_text(entries: List[dict]) -> str:
    lines = ["rank name f1 precision recall det_rate explain_avg tp fp fn"]
    for idx, e in enumerate(entries, start=1):
        lines.append(
            f"{idx} {e['name']} {e['f1']:.4f} {e['precision']:.4f} {e['recall']:.4f} "
            f"{e['determinism_rate']:.4f} {e['explainability_avg']:.4f} "
            f"{e['tp']} {e['fp']} {e['fn']}"
        )
    return "\n".join(lines)


def main() -> int:
    parser = argparse.ArgumentParser(description="Build leaderboard from submissions.")
    parser.add_argument("--truth", required=True, help="Path to truth JSON")
    parser.add_argument(
        "--submissions",
        nargs="+",
        required=True,
        help="Submission JSON files (one or more)",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="Path to leaderboard JSON to write",
    )
    parser.add_argument(
        "--text",
        action="store_true",
        help="Also print human-readable leaderboard",
    )
    args = parser.parse_args()

    truth = load_json(Path(args.truth))
    entries: List[dict] = []
    for sub_path_str in args.submissions:
        sub_path = Path(sub_path_str)
        submission = load_json(sub_path)
        report = rb_score.score(truth, submission)
        # Fall back to the file stem when the submission omits a tool name.
        name = submission.get("tool", {}).get("name") or sub_path.stem
        entries.append(build_entry(name, submission, report))

    entries = sort_entries(entries)
    leaderboard = {
        "version": "1.0.0",
        "truth_version": truth.get("version", "1.0.0"),
        "entries": entries,
    }
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(leaderboard, indent=2, sort_keys=True), encoding="utf-8")
    if args.text:
        print(render_text(entries))
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,74 @@
import importlib.util
import sys
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
SCORE_PATH = ROOT / "tools" / "scorer" / "rb_score.py"
COMPARE_PATH = ROOT / "tools" / "scorer" / "rb_compare.py"


def load_module(path: Path, name: str):
    spec = importlib.util.spec_from_file_location(name, path)
    assert spec and spec.loader
    module = importlib.util.module_from_spec(spec)
    # Register under the requested name so rb_compare's `import rb_score`
    # resolves to the module loaded here.
    sys.modules[spec.name] = module
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module


rb_score = load_module(SCORE_PATH, "rb_score")
rb_compare = load_module(COMPARE_PATH, "rb_compare")


class TestCompare(unittest.TestCase):
    def test_compare_sorts_by_f1_then_precision_then_det(self):
        truth = {
            "version": "1.0.0",
            "cases": [
                {"case_id": "c1", "sinks": [{"sink_id": "s1", "label": "reachable"}]},
            ],
        }
        # toolA matches the truth exactly; toolB predicts an extra sink,
        # which lowers its precision (and F1), so toolA must rank first.
        sub_high_prec = {
            "version": "1.0.0",
            "tool": {"name": "toolA", "version": "1"},
            "run": {},
            "cases": [{"case_id": "c1", "sinks": [{"sink_id": "s1", "prediction": "reachable"}]}],
        }
        sub_lower_prec = {
            "version": "1.0.0",
            "tool": {"name": "toolB", "version": "1"},
            "run": {},
            "cases": [{"case_id": "c1", "sinks": [
                {"sink_id": "s1", "prediction": "reachable"},
                {"sink_id": "extra", "prediction": "reachable"},
            ]}],
        }
        rep_a = rb_score.score(truth, sub_high_prec)
        rep_b = rb_score.score(truth, sub_lower_prec)
        entries = [
            rb_compare.build_entry("A", sub_high_prec, rep_a),
            rb_compare.build_entry("B", sub_lower_prec, rep_b),
        ]
        ordered = rb_compare.sort_entries(entries)
        self.assertEqual(ordered[0]["name"], "A")
        self.assertEqual(ordered[1]["name"], "B")

    def test_render_text_outputs_rank(self):
        entries = [
            {"name": "foo", "f1": 0.5, "precision": 0.5, "recall": 0.5,
             "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 1},
            {"name": "bar", "f1": 0.3, "precision": 0.3, "recall": 0.3,
             "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 2},
        ]
        text = rb_compare.render_text(entries)
        self.assertIn("1 foo", text)
        self.assertIn("2 bar", text)


if __name__ == "__main__":
    unittest.main()