@@ -19,6 +19,12 @@ python -m pip install -r requirements.txt
./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
```

## Compare / leaderboard

Use `rb-compare` to aggregate multiple submissions into a deterministic leaderboard:

```bash
./rb_compare.py --truth ../../benchmark/truth/public.json --submissions sub1.json sub2.json --output ../../benchmark/leaderboard.json --text
```

## Output

- `text` (default): short human-readable summary.
- `json`: deterministic JSON with top-level metrics and per-case breakdown.
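For downstream scripting, here is a minimal sketch (not part of the commit) of reading the leaderboard file written by `--output`; the field names mirror those assembled by `build_entry()` in `rb_compare.py` below, and the path is just the example from the command above:

```python
import json
from pathlib import Path

# Load the leaderboard produced by `rb-compare --output ../../benchmark/leaderboard.json`.
board = json.loads(Path("../../benchmark/leaderboard.json").read_text(encoding="utf-8"))

# Entries are already sorted best-first; print a compact ranking.
for rank, entry in enumerate(board["entries"], start=1):
    print(rank, entry["name"], entry["f1"], entry["precision"], entry["determinism_rate"])
```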
bench/reachability-benchmark/tools/scorer/rb-compare (new file, 4 lines)
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_compare.py" "$@"
bench/reachability-benchmark/tools/scorer/rb_compare.py (new file, 109 lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
rb-compare: build a deterministic leaderboard from multiple submissions.

Task BENCH-LEADERBOARD-513-014
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import List, Dict

import rb_score  # reuse scoring logic


def load_json(path: Path):
    return json.loads(path.read_text(encoding="utf-8"))


def build_entry(name: str, submission: dict, report: rb_score.ScoreReport) -> dict:
    tool = submission.get("tool", {})
    run = submission.get("run", {})
    return {
        "name": name,
        "tool_name": tool.get("name", "unknown"),
        "tool_version": tool.get("version", "unknown"),
        "precision": round(report.precision, 4),
        "recall": round(report.recall, 4),
        "f1": round(report.f1, 4),
        "determinism_rate": round(report.determinism_rate, 4),
        "explainability_avg": round(report.explain_avg, 4),
        "tp": report.tp,
        "fp": report.fp,
        "fn": report.fn,
        "runtime": run,
    }


def sort_entries(entries: List[dict]) -> List[dict]:
    return sorted(
        entries,
        key=lambda e: (-e["f1"], -e["precision"], -e["determinism_rate"], e["name"]),
    )


def render_text(entries: List[dict]) -> str:
    lines = ["rank name f1 precision recall det_rate explain_avg tp fp fn"]
    for idx, e in enumerate(entries, start=1):
        lines.append(
            f"{idx} {e['name']} {e['f1']:.4f} {e['precision']:.4f} {e['recall']:.4f} "
            f"{e['determinism_rate']:.4f} {e['explainability_avg']:.4f} "
            f"{e['tp']} {e['fp']} {e['fn']}"
        )
    return "\n".join(lines)


def main() -> int:
    parser = argparse.ArgumentParser(description="Build leaderboard from submissions.")
    parser.add_argument("--truth", required=True, help="Path to truth JSON")
    parser.add_argument(
        "--submissions",
        nargs="+",
        required=True,
        help="Submission JSON files (one or more)",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="Path to leaderboard JSON to write",
    )
    parser.add_argument(
        "--text",
        action="store_true",
        help="Also print human-readable leaderboard",
    )

    args = parser.parse_args()
    truth = load_json(Path(args.truth))

    entries: List[dict] = []
    for sub_path_str in args.submissions:
        sub_path = Path(sub_path_str)
        submission = load_json(sub_path)
        report = rb_score.score(truth, submission)
        name = submission.get("tool", {}).get("name") or sub_path.stem
        entries.append(build_entry(name, submission, report))

    entries = sort_entries(entries)

    leaderboard = {
        "version": "1.0.0",
        "truth_version": truth.get("version", "1.0.0"),
        "entries": entries,
    }

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(leaderboard, indent=2, sort_keys=True))

    if args.text:
        print(render_text(entries))

    return 0


if __name__ == "__main__":
    sys.exit(main())
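The "deterministic" in the docstring comes from the sort key in `sort_entries`: F1, precision, and determinism rate descending, with the entry name as a final ascending tie-break so equal scores always land in the same order. A small illustration with hypothetical entries (only the keys the sort actually reads are shown):

```python
# Hypothetical entries that tie on every metric; the name tie-break keeps the order stable.
entries = [
    {"name": "zeta", "f1": 0.9, "precision": 0.9, "determinism_rate": 1.0},
    {"name": "alpha", "f1": 0.9, "precision": 0.9, "determinism_rate": 1.0},
]
# Same key as sort_entries() above.
ordered = sorted(entries, key=lambda e: (-e["f1"], -e["precision"], -e["determinism_rate"], e["name"]))
print([e["name"] for e in ordered])  # ['alpha', 'zeta']
```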
@@ -0,0 +1,74 @@
import json
import importlib.util
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
SCORE_PATH = ROOT / "tools" / "scorer" / "rb_score.py"
COMPARE_PATH = ROOT / "tools" / "scorer" / "rb_compare.py"


def load_module(path: Path, name: str):
    spec = importlib.util.spec_from_file_location(name, path)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader
    import sys

    sys.modules[spec.name] = module
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module


rb_score = load_module(SCORE_PATH, "rb_score")
rb_compare = load_module(COMPARE_PATH, "rb_compare")


class TestCompare(unittest.TestCase):
    def test_compare_sorts_by_f1_then_precision_then_det(self):
        truth = {
            "version": "1.0.0",
            "cases": [
                {"case_id": "c1", "sinks": [{"sink_id": "s1", "label": "reachable"}]},
            ],
        }
        # two submissions: toolB predicts an extra (false-positive) sink, so it
        # should score lower precision (and F1) than toolA and rank second
        sub_high_prec = {
            "version": "1.0.0",
            "tool": {"name": "toolA", "version": "1"},
            "run": {},
            "cases": [{"case_id": "c1", "sinks": [{"sink_id": "s1", "prediction": "reachable"}]}],
        }
        sub_lower_prec = {
            "version": "1.0.0",
            "tool": {"name": "toolB", "version": "1"},
            "run": {},
            "cases": [{"case_id": "c1", "sinks": [
                {"sink_id": "s1", "prediction": "reachable"},
                {"sink_id": "extra", "prediction": "reachable"},
            ]}],
        }

        rep_a = rb_score.score(truth, sub_high_prec)
        rep_b = rb_score.score(truth, sub_lower_prec)

        entries = [
            rb_compare.build_entry("A", sub_high_prec, rep_a),
            rb_compare.build_entry("B", sub_lower_prec, rep_b),
        ]

        ordered = rb_compare.sort_entries(entries)
        self.assertEqual(ordered[0]["name"], "A")
        self.assertEqual(ordered[1]["name"], "B")

    def test_render_text_outputs_rank(self):
        entries = [
            {"name": "foo", "f1": 0.5, "precision": 0.5, "recall": 0.5, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 1},
            {"name": "bar", "f1": 0.3, "precision": 0.3, "recall": 0.3, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 2},
        ]
        text = rb_compare.render_text(entries)
        self.assertIn("1 foo", text)
        self.assertIn("2 bar", text)


if __name__ == "__main__":
    unittest.main()