feat: Implement Filesystem and MongoDB provenance writers for PackRun execution context
- Added `FilesystemPackRunProvenanceWriter` to write provenance manifests to the filesystem.
- Introduced `MongoPackRunArtifactReader` to read artifacts from MongoDB.
- Created `MongoPackRunProvenanceWriter` to store provenance manifests in MongoDB.
- Developed unit tests for filesystem and MongoDB provenance writers.
- Established `ITimelineEventStore` and `ITimelineIngestionService` interfaces for timeline event handling.
- Implemented `TimelineIngestionService` to validate and persist timeline events with hashing.
- Created PostgreSQL schema and migration scripts for timeline indexing.
- Added dependency injection support for timeline indexer services.
- Developed tests for timeline ingestion and schema validation.
bench/reachability-benchmark/tools/scorer/README.md
@@ -1,11 +1,34 @@
-# rb-score (placeholder)
+# rb-score

-Planned CLI to score reachability submissions against truth sets.
+Deterministic scorer for the reachability benchmark.

-Future work (BENCH-SCORER-513-008):
-- Validate submission against `schemas/submission.schema.json`.
-- Validate truth against `schemas/truth.schema.json`.
-- Compute precision/recall/F1, explainability score (0-3), runtime stats, determinism rate.
-- Emit JSON report with stable ordering.
+## What it does
+- Validates submissions against `schemas/submission.schema.json` and truth against `schemas/truth.schema.json`.
+- Computes precision/recall/F1 (micro, sink-level).
+- Computes explainability score per prediction (0–3) and averages it.
+- Checks duplicate predictions for determinism (inconsistent duplicates lower the rate).
+- Surfaces runtime metadata from the submission (`run` block).

-For now this folder is a stub; implementation will be added in task 513-008 once schemas stabilize.
+## Install (offline-friendly)
+```bash
+python -m pip install -r requirements.txt
+```

+## Usage
+```bash
+./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
+```

+## Output
+- `text` (default): short human-readable summary.
+- `json`: deterministic JSON with top-level metrics and per-case breakdown.

+## Tests
+```bash
+python -m unittest tests/test_scoring.py
+```

+## Notes
+- Predictions for sinks not present in truth count as false positives (strict posture).
+- Truth sinks with label `unknown` are ignored for FN/FP counting.
+- Explainability tiering: 0=no context; 1=path>=2 nodes; 2=entry + path>=3; 3=guards present.
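Aside (not part of the commit): the scorer can also be driven as a library, which is how the unit tests below exercise it. A minimal sketch, assuming the working directory is `bench/reachability-benchmark` so that `tools.scorer` is importable and the pinned requirements are installed; the input paths are illustrative:

```python
# Sketch: use the scorer as a library instead of the CLI (paths are illustrative).
import json
from pathlib import Path

from tools.scorer import rb_score  # package added in this commit

truth = json.loads(Path("benchmark/truth/public.json").read_text(encoding="utf-8"))
submission = json.loads(Path("benchmark/submissions/sample.json").read_text(encoding="utf-8"))

report = rb_score.score(truth, submission)     # returns a ScoreReport dataclass
as_dict = rb_score.report_as_dict(report)      # same structure as `--format json` output
print(json.dumps(as_dict, sort_keys=True, indent=2))
```

Note that `score()` itself does not run schema validation; that only happens in `main()` in `rb_score.py`.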
bench/reachability-benchmark/tools/scorer/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from . import rb_score

__all__ = ["rb_score"]
bench/reachability-benchmark/tools/scorer/rb-score (new file, 4 lines)
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_score.py" "$@"
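Aside (not part of the commit): the wrapper simply forwards to `rb_score.py`, so the same entry point can also be driven in-process. A small sketch under the same import assumption as the earlier example, with placeholder paths; the exit codes come from `main()` in `rb_score.py` below:

```python
# Sketch: invoke the CLI entry point without spawning a shell (paths are placeholders).
from tools.scorer import rb_score

code = rb_score.main([
    "--truth", "benchmark/truth/public.json",
    "--submission", "benchmark/submissions/sample.json",
    "--format", "json",
])
# Exit codes per main(): 0 = scored, 2 = missing input file, 3 = schema validation failure.
print("scorer exit code:", code)
```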
bench/reachability-benchmark/tools/scorer/rb_score.py (new file, 258 lines)
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""rb-score: deterministic scorer for reachability benchmark submissions.

Features (task BENCH-SCORER-513-008):
- Validate submission and truth against published schemas.
- Compute precision / recall / F1 at sink level (micro-averaged).
- Compute explainability score per prediction (0–3) and average.
- Surface runtime stats from submission metadata.
- Emit deterministic JSON or human-readable text.

Assumptions:
- Truth labels may include "unknown"; these are skipped for FN/FP.
- A prediction for a sink absent in truth counts as FP (strict posture).
- Duplicate predictions for the same sink must agree; disagreement reduces determinism rate.
"""
from __future__ import annotations

import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import yaml
from jsonschema import Draft202012Validator

ROOT = Path(__file__).resolve().parents[1]
SCHEMAS = {
    "truth": ROOT / "schemas" / "truth.schema.json",
    "submission": ROOT / "schemas" / "submission.schema.json",
}


@dataclass
class CaseMetrics:
    case_id: str
    tp: int
    fp: int
    fn: int
    precision: float
    recall: float
    f1: float
    explain_avg: float


@dataclass
class ScoreReport:
    precision: float
    recall: float
    f1: float
    tp: int
    fp: int
    fn: int
    explain_avg: float
    determinism_rate: float
    runtime: Dict[str, object]
    cases: List[CaseMetrics]


def load_json_or_yaml(path: Path):
    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() in {".yaml", ".yml"}:
        return yaml.safe_load(text)
    return json.loads(text)


def validate_against(schema_path: Path, payload) -> Tuple[bool, List[str]]:
    schema = load_json_or_yaml(schema_path)
    validator = Draft202012Validator(schema)
    errors = sorted(validator.iter_errors(payload), key=lambda e: e.path)
    if not errors:
        return True, []
    return False, [f"{'/'.join(str(p) for p in err.path) or '<root>'}: {err.message}" for err in errors]


def safe_div(num: int, denom: int, default: float) -> float:
    if denom == 0:
        return default
    return num / denom


def explain_score(pred: dict) -> int:
    expl = pred.get("explain") or {}
    path = expl.get("path") or []
    entry = expl.get("entry")
    guards = expl.get("guards") or []
    if guards:
        return 3
    if entry and len(path) >= 3:
        return 2
    if len(path) >= 2:
        return 1
    return 0


def determinism_rate(preds: Iterable[dict]) -> float:
    """Detect inconsistent duplicate predictions for the same sink."""
    by_sink: Dict[str, set] = {}
    total_groups = 0
    consistent_groups = 0
    for pred in preds:
        sink_id = pred.get("sink_id")
        if sink_id is None:
            continue
        by_sink.setdefault(sink_id, set()).add(pred.get("prediction"))
    for values in by_sink.values():
        total_groups += 1
        if len(values) == 1:
            consistent_groups += 1
    if total_groups == 0:
        return 1.0
    return consistent_groups / total_groups


def score_case(case_id: str, truth_sinks: Dict[str, str], predicted: List[dict]) -> CaseMetrics:
    truth_reach = {sid for sid, label in truth_sinks.items() if label == "reachable"}
    truth_unreach = {sid for sid, label in truth_sinks.items() if label == "unreachable"}

    pred_reach = {p["sink_id"] for p in predicted if p.get("prediction") == "reachable"}

    tp = len(pred_reach & truth_reach)
    fp = len(pred_reach - truth_reach)
    fn = len(truth_reach - pred_reach)

    precision = safe_div(tp, tp + fp, 1.0)
    recall = safe_div(tp, tp + fn, 1.0)
    f1 = 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)

    explain_scores = [explain_score(p) for p in predicted]
    explain_avg = safe_div(sum(explain_scores), len(explain_scores), 0.0)

    return CaseMetrics(case_id, tp, fp, fn, precision, recall, f1, explain_avg)


def aggregate(cases: List[CaseMetrics], preds: List[dict]) -> ScoreReport:
    tp = sum(c.tp for c in cases)
    fp = sum(c.fp for c in cases)
    fn = sum(c.fn for c in cases)
    precision = safe_div(tp, tp + fp, 1.0)
    recall = safe_div(tp, tp + fn, 1.0)
    f1 = 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)
    explain_avg = safe_div(sum(c.explain_avg for c in cases), len(cases), 0.0) if cases else 0.0
    det_rate = determinism_rate(preds)
    runtime = {}
    return ScoreReport(precision, recall, f1, tp, fp, fn, explain_avg, det_rate, runtime, cases)


def build_truth_index(truth_doc: dict) -> Dict[str, Dict[str, str]]:
    index: Dict[str, Dict[str, str]] = {}
    for case in truth_doc.get("cases", []):
        sinks = {s["sink_id"]: s["label"] for s in case.get("sinks", [])}
        index[case["case_id"]] = sinks
    return index


def score(truth_doc: dict, submission_doc: dict) -> ScoreReport:
    truth_index = build_truth_index(truth_doc)
    cases_metrics: List[CaseMetrics] = []
    all_preds: List[dict] = []

    for sub_case in submission_doc.get("cases", []):
        case_id = sub_case.get("case_id")
        predicted_sinks = sub_case.get("sinks") or []
        all_preds.extend(predicted_sinks)
        truth_sinks = truth_index.get(case_id, {})
        case_metrics = score_case(case_id, truth_sinks, predicted_sinks)
        cases_metrics.append(case_metrics)

    report = aggregate(cases_metrics, all_preds)
    report.runtime = submission_doc.get("run", {})
    return report


def report_as_dict(report: ScoreReport) -> dict:
    return {
        "version": "1.0.0",
        "metrics": {
            "precision": round(report.precision, 4),
            "recall": round(report.recall, 4),
            "f1": round(report.f1, 4),
            "tp": report.tp,
            "fp": report.fp,
            "fn": report.fn,
            "determinism_rate": round(report.determinism_rate, 4),
            "explainability_avg": round(report.explain_avg, 4),
        },
        "runtime": report.runtime,
        "cases": [
            {
                "case_id": c.case_id,
                "precision": round(c.precision, 4),
                "recall": round(c.recall, 4),
                "f1": round(c.f1, 4),
                "tp": c.tp,
                "fp": c.fp,
                "fn": c.fn,
                "explainability_avg": round(c.explain_avg, 4),
            }
            for c in report.cases
        ],
    }


def format_text(report: ScoreReport) -> str:
    lines = []
    lines.append("rb-score summary")
    lines.append(f" precision {report.precision:.4f} recall {report.recall:.4f} f1 {report.f1:.4f}")
    lines.append(f" tp {report.tp} fp {report.fp} fn {report.fn} determinism {report.determinism_rate:.4f} explain_avg {report.explain_avg:.4f}")
    if report.runtime:
        rt = report.runtime
        lines.append(" runtime: " + ", ".join(f"{k}={v}" for k, v in sorted(rt.items())))
    lines.append(" cases:")
    for c in report.cases:
        lines.append(
            f" - {c.case_id}: P {c.precision:.4f} R {c.recall:.4f} F1 {c.f1:.4f} tp {c.tp} fp {c.fp} fn {c.fn} explain_avg {c.explain_avg:.4f}"
        )
    return "\n".join(lines)


def parse_args(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Score reachability benchmark submissions")
    parser.add_argument("--truth", required=True, help="Path to truth JSON")
    parser.add_argument("--submission", required=True, help="Path to submission JSON")
    parser.add_argument("--format", choices=["json", "text"], default="text", help="Output format")
    return parser.parse_args(argv)


def main(argv: List[str]) -> int:
    args = parse_args(argv)
    truth_path = Path(args.truth)
    submission_path = Path(args.submission)

    if not truth_path.exists() or not submission_path.exists():
        print("truth or submission file not found", file=sys.stderr)
        return 2

    truth_doc = load_json_or_yaml(truth_path)
    submission_doc = load_json_or_yaml(submission_path)

    ok_truth, truth_errs = validate_against(SCHEMAS["truth"], truth_doc)
    ok_sub, sub_errs = validate_against(SCHEMAS["submission"], submission_doc)
    if not ok_truth or not ok_sub:
        for msg in truth_errs + sub_errs:
            print(f"validation_error: {msg}", file=sys.stderr)
        return 3

    report = score(truth_doc, submission_doc)

    if args.format == "json":
        print(json.dumps(report_as_dict(report), sort_keys=True, indent=2))
    else:
        print(format_text(report))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
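Aside (not part of the commit): a short illustration of the explainability tiers and the determinism penalty implemented above. The `explain` payloads and sink ids are made up, and the import path assumes the package layout added in this commit; the expected values follow directly from `explain_score` and `determinism_rate`.

```python
# Illustration of the helpers above with made-up payloads.
from tools.scorer.rb_score import determinism_rate, explain_score  # import path is an assumption

assert explain_score({}) == 0                                                         # no context
assert explain_score({"explain": {"path": ["a", "b"]}}) == 1                          # path of >= 2 nodes
assert explain_score({"explain": {"entry": "main", "path": ["a", "b", "c"]}}) == 2    # entry + path >= 3
assert explain_score({"explain": {"guards": ["if user.is_admin"]}}) == 3              # guards present

preds = [
    {"sink_id": "A", "prediction": "reachable"},
    {"sink_id": "A", "prediction": "reachable"},    # duplicate, consistent
    {"sink_id": "B", "prediction": "reachable"},
    {"sink_id": "B", "prediction": "unreachable"},  # duplicate, inconsistent
]
assert determinism_rate(preds) == 0.5  # one of the two sink groups disagrees
```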
bench/reachability-benchmark/tools/scorer/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
jsonschema==4.23.0
PyYAML==6.0.2
Binary file not shown.
bench/reachability-benchmark/tools/scorer/tests/test_scoring.py (new file, 70 lines)
@@ -0,0 +1,70 @@
import json
import importlib.util
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
SCORER_PATH = ROOT / "tools" / "scorer" / "rb_score.py"


def load_module():
    spec = importlib.util.spec_from_file_location("rb_score", SCORER_PATH)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader
    import sys
    sys.modules[spec.name] = module
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module


def load_example(name: str):
    return json.loads((ROOT / "schemas" / "examples" / name).read_text())


rb_score = load_module()


class TestScoring(unittest.TestCase):
    def test_score_perfect_prediction(self):
        truth = load_example("truth.sample.json")
        submission = load_example("submission.sample.json")

        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 1)
        self.assertEqual(report.fp, 0)
        self.assertEqual(report.fn, 0)
        self.assertEqual(report.precision, 1.0)
        self.assertEqual(report.recall, 1.0)
        self.assertEqual(report.f1, 1.0)
        self.assertGreaterEqual(report.explain_avg, 1.0)
        self.assertEqual(report.determinism_rate, 1.0)

    def test_score_false_negative_and_fp(self):
        truth = load_example("truth.sample.json")
        submission = {
            "version": "1.0.0",
            "tool": {"name": "tool", "version": "1"},
            "run": {"platform": "ubuntu"},
            "cases": [
                {
                    "case_id": "js-express-blog:001",
                    "sinks": [
                        {"sink_id": "Deserializer::parse", "prediction": "unreachable"},
                        {"sink_id": "Fake::sink", "prediction": "reachable"},
                    ],
                }
            ],
        }

        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 0)
        self.assertEqual(report.fp, 1)
        self.assertEqual(report.fn, 1)
        self.assertEqual(report.precision, 0.0)
        self.assertEqual(report.recall, 0.0)
        self.assertEqual(report.f1, 0.0)
        self.assertEqual(report.determinism_rate, 1.0)


if __name__ == "__main__":
    unittest.main()
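Aside (not part of the commit): a sketch of a further test that could be appended to `tests/test_scoring.py` (where `rb_score` is already loaded) to pin down the determinism penalty. The truth/submission dicts are hypothetical and trimmed to the fields `score()` actually reads, so they would not pass full schema validation.

```python
# Hypothetical extra test: conflicting duplicate predictions for the same sink
# should lower determinism_rate and still count as a single false positive.
class TestDeterminismPenalty(unittest.TestCase):
    def test_conflicting_duplicates_lower_rate(self):
        truth = {
            "cases": [
                {
                    "case_id": "demo:001",
                    "sinks": [
                        {"sink_id": "A::sink", "label": "reachable"},
                        {"sink_id": "B::sink", "label": "unreachable"},
                    ],
                }
            ]
        }
        submission = {
            "cases": [
                {
                    "case_id": "demo:001",
                    "sinks": [
                        {"sink_id": "A::sink", "prediction": "reachable"},
                        {"sink_id": "B::sink", "prediction": "reachable"},
                        {"sink_id": "B::sink", "prediction": "unreachable"},  # contradicts the line above
                    ],
                }
            ]
        }

        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 1)                  # A::sink correctly predicted reachable
        self.assertEqual(report.fp, 1)                  # B::sink predicted reachable, truth says unreachable
        self.assertEqual(report.fn, 0)
        self.assertEqual(report.determinism_rate, 0.5)  # one of two sink groups is inconsistent
```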