feat: Implement Filesystem and MongoDB provenance writers for PackRun execution context
- Added `FilesystemPackRunProvenanceWriter` to write provenance manifests to the filesystem.
- Introduced `MongoPackRunArtifactReader` to read artifacts from MongoDB.
- Created `MongoPackRunProvenanceWriter` to store provenance manifests in MongoDB.
- Developed unit tests for filesystem and MongoDB provenance writers.
- Established `ITimelineEventStore` and `ITimelineIngestionService` interfaces for timeline event handling.
- Implemented `TimelineIngestionService` to validate and persist timeline events with hashing.
- Created PostgreSQL schema and migration scripts for timeline indexing.
- Added dependency injection support for timeline indexer services.
- Developed tests for timeline ingestion and schema validation.
bench/reachability-benchmark/tools/scorer/README.md
@@ -1,11 +1,34 @@
-# rb-score (placeholder)
+# rb-score

-Planned CLI to score reachability submissions against truth sets.
+Deterministic scorer for the reachability benchmark.

-Future work (BENCH-SCORER-513-008):
-- Validate submission against `schemas/submission.schema.json`.
-- Validate truth against `schemas/truth.schema.json`.
-- Compute precision/recall/F1, explainability score (0-3), runtime stats, determinism rate.
-- Emit JSON report with stable ordering.
+## What it does
+- Validates submissions against `schemas/submission.schema.json` and truth against `schemas/truth.schema.json`.
+- Computes precision/recall/F1 (micro, sink-level).
+- Computes explainability score per prediction (0–3) and averages it.
+- Checks duplicate predictions for determinism (inconsistent duplicates lower the rate).
+- Surfaces runtime metadata from the submission (`run` block).

-For now this folder is a stub; implementation will be added in task 513-008 once schemas stabilize.
+## Install (offline-friendly)
+```bash
+python -m pip install -r requirements.txt
+```

+## Usage
+```bash
+./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
+```

+## Output
+- `text` (default): short human-readable summary.
+- `json`: deterministic JSON with top-level metrics and per-case breakdown.

+## Tests
+```bash
+python -m unittest tests/test_scoring.py
+```

+## Notes
+- Predictions for sinks not present in truth count as false positives (strict posture).
+- Truth sinks with label `unknown` are ignored for FN/FP counting.
+- Explainability tiering: 0=no context; 1=path>=2 nodes; 2=entry + path>=3; 3=guards present.
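Aside (not part of the commit): the scorer can also be driven as a library, which is how the unit tests below exercise it. A minimal sketch, assuming the working directory is `bench/reachability-benchmark` so that `tools.scorer` is importable and the pinned requirements are installed; the input paths are illustrative:

```python
# Sketch: use the scorer as a library instead of the CLI (paths are illustrative).
import json
from pathlib import Path

from tools.scorer import rb_score  # package added in this commit

truth = json.loads(Path("benchmark/truth/public.json").read_text(encoding="utf-8"))
submission = json.loads(Path("benchmark/submissions/sample.json").read_text(encoding="utf-8"))

report = rb_score.score(truth, submission)     # returns a ScoreReport dataclass
as_dict = rb_score.report_as_dict(report)      # same structure as `--format json` output
print(json.dumps(as_dict, sort_keys=True, indent=2))
```

Note that `score()` itself does not run schema validation; that only happens in `main()` in `rb_score.py`.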
bench/reachability-benchmark/tools/scorer/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from . import rb_score

__all__ = ["rb_score"]
bench/reachability-benchmark/tools/scorer/rb-score (new file, 4 lines)
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_score.py" "$@"
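Aside (not part of the commit): the wrapper simply forwards to `rb_score.py`, so the same entry point can also be driven in-process. A small sketch under the same import assumption as the earlier example, with placeholder paths; the exit codes come from `main()` in `rb_score.py` below:

```python
# Sketch: invoke the CLI entry point without spawning a shell (paths are placeholders).
from tools.scorer import rb_score

code = rb_score.main([
    "--truth", "benchmark/truth/public.json",
    "--submission", "benchmark/submissions/sample.json",
    "--format", "json",
])
# Exit codes per main(): 0 = scored, 2 = missing input file, 3 = schema validation failure.
print("scorer exit code:", code)
```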
bench/reachability-benchmark/tools/scorer/rb_score.py (new file, 258 lines)
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""rb-score: deterministic scorer for reachability benchmark submissions.

Features (task BENCH-SCORER-513-008):
- Validate submission and truth against published schemas.
- Compute precision / recall / F1 at sink level (micro-averaged).
- Compute explainability score per prediction (0–3) and average.
- Surface runtime stats from submission metadata.
- Emit deterministic JSON or human-readable text.

Assumptions:
- Truth labels may include "unknown"; these are skipped for FN/FP.
- A prediction for a sink absent in truth counts as FP (strict posture).
- Duplicate predictions for the same sink must agree; disagreement reduces determinism rate.
"""
from __future__ import annotations

import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

import yaml
from jsonschema import Draft202012Validator

ROOT = Path(__file__).resolve().parents[1]
SCHEMAS = {
    "truth": ROOT / "schemas" / "truth.schema.json",
    "submission": ROOT / "schemas" / "submission.schema.json",
}


@dataclass
class CaseMetrics:
    case_id: str
    tp: int
    fp: int
    fn: int
    precision: float
    recall: float
    f1: float
    explain_avg: float


@dataclass
class ScoreReport:
    precision: float
    recall: float
    f1: float
    tp: int
    fp: int
    fn: int
    explain_avg: float
    determinism_rate: float
    runtime: Dict[str, object]
    cases: List[CaseMetrics]


def load_json_or_yaml(path: Path):
    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() in {".yaml", ".yml"}:
        return yaml.safe_load(text)
    return json.loads(text)


def validate_against(schema_path: Path, payload) -> Tuple[bool, List[str]]:
    schema = load_json_or_yaml(schema_path)
    validator = Draft202012Validator(schema)
    errors = sorted(validator.iter_errors(payload), key=lambda e: e.path)
    if not errors:
        return True, []
    return False, [f"{'/'.join(str(p) for p in err.path) or '<root>'}: {err.message}" for err in errors]


def safe_div(num: int, denom: int, default: float) -> float:
    if denom == 0:
        return default
    return num / denom


def explain_score(pred: dict) -> int:
    expl = pred.get("explain") or {}
    path = expl.get("path") or []
    entry = expl.get("entry")
    guards = expl.get("guards") or []
    if guards:
        return 3
    if entry and len(path) >= 3:
        return 2
    if len(path) >= 2:
        return 1
    return 0


def determinism_rate(preds: Iterable[dict]) -> float:
    """Detect inconsistent duplicate predictions for the same sink."""
    by_sink: Dict[str, set] = {}
    total_groups = 0
    consistent_groups = 0
    for pred in preds:
        sink_id = pred.get("sink_id")
        if sink_id is None:
            continue
        by_sink.setdefault(sink_id, set()).add(pred.get("prediction"))
    for values in by_sink.values():
        total_groups += 1
        if len(values) == 1:
            consistent_groups += 1
    if total_groups == 0:
        return 1.0
    return consistent_groups / total_groups


def score_case(case_id: str, truth_sinks: Dict[str, str], predicted: List[dict]) -> CaseMetrics:
    truth_reach = {sid for sid, label in truth_sinks.items() if label == "reachable"}
    truth_unreach = {sid for sid, label in truth_sinks.items() if label == "unreachable"}

    pred_reach = {p["sink_id"] for p in predicted if p.get("prediction") == "reachable"}

    tp = len(pred_reach & truth_reach)
    fp = len(pred_reach - truth_reach)
    fn = len(truth_reach - pred_reach)

    precision = safe_div(tp, tp + fp, 1.0)
    recall = safe_div(tp, tp + fn, 1.0)
    f1 = 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)

    explain_scores = [explain_score(p) for p in predicted]
    explain_avg = safe_div(sum(explain_scores), len(explain_scores), 0.0)

    return CaseMetrics(case_id, tp, fp, fn, precision, recall, f1, explain_avg)


def aggregate(cases: List[CaseMetrics], preds: List[dict]) -> ScoreReport:
    tp = sum(c.tp for c in cases)
    fp = sum(c.fp for c in cases)
    fn = sum(c.fn for c in cases)
    precision = safe_div(tp, tp + fp, 1.0)
    recall = safe_div(tp, tp + fn, 1.0)
    f1 = 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)
    explain_avg = safe_div(sum(c.explain_avg for c in cases), len(cases), 0.0) if cases else 0.0
    det_rate = determinism_rate(preds)
    runtime = {}
    return ScoreReport(precision, recall, f1, tp, fp, fn, explain_avg, det_rate, runtime, cases)


def build_truth_index(truth_doc: dict) -> Dict[str, Dict[str, str]]:
    index: Dict[str, Dict[str, str]] = {}
    for case in truth_doc.get("cases", []):
        sinks = {s["sink_id"]: s["label"] for s in case.get("sinks", [])}
        index[case["case_id"]] = sinks
    return index


def score(truth_doc: dict, submission_doc: dict) -> ScoreReport:
    truth_index = build_truth_index(truth_doc)
    cases_metrics: List[CaseMetrics] = []
    all_preds: List[dict] = []

    for sub_case in submission_doc.get("cases", []):
        case_id = sub_case.get("case_id")
        predicted_sinks = sub_case.get("sinks") or []
        all_preds.extend(predicted_sinks)
        truth_sinks = truth_index.get(case_id, {})
        case_metrics = score_case(case_id, truth_sinks, predicted_sinks)
        cases_metrics.append(case_metrics)

    report = aggregate(cases_metrics, all_preds)
    report.runtime = submission_doc.get("run", {})
    return report


def report_as_dict(report: ScoreReport) -> dict:
    return {
        "version": "1.0.0",
        "metrics": {
            "precision": round(report.precision, 4),
            "recall": round(report.recall, 4),
            "f1": round(report.f1, 4),
            "tp": report.tp,
            "fp": report.fp,
            "fn": report.fn,
            "determinism_rate": round(report.determinism_rate, 4),
            "explainability_avg": round(report.explain_avg, 4),
        },
        "runtime": report.runtime,
        "cases": [
            {
                "case_id": c.case_id,
                "precision": round(c.precision, 4),
                "recall": round(c.recall, 4),
                "f1": round(c.f1, 4),
                "tp": c.tp,
                "fp": c.fp,
                "fn": c.fn,
                "explainability_avg": round(c.explain_avg, 4),
            }
            for c in report.cases
        ],
    }


def format_text(report: ScoreReport) -> str:
    lines = []
    lines.append("rb-score summary")
    lines.append(f" precision {report.precision:.4f} recall {report.recall:.4f} f1 {report.f1:.4f}")
    lines.append(f" tp {report.tp} fp {report.fp} fn {report.fn} determinism {report.determinism_rate:.4f} explain_avg {report.explain_avg:.4f}")
    if report.runtime:
        rt = report.runtime
        lines.append(" runtime: " + ", ".join(f"{k}={v}" for k, v in sorted(rt.items())))
    lines.append(" cases:")
    for c in report.cases:
        lines.append(
            f" - {c.case_id}: P {c.precision:.4f} R {c.recall:.4f} F1 {c.f1:.4f} tp {c.tp} fp {c.fp} fn {c.fn} explain_avg {c.explain_avg:.4f}"
        )
    return "\n".join(lines)


def parse_args(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Score reachability benchmark submissions")
    parser.add_argument("--truth", required=True, help="Path to truth JSON")
    parser.add_argument("--submission", required=True, help="Path to submission JSON")
    parser.add_argument("--format", choices=["json", "text"], default="text", help="Output format")
    return parser.parse_args(argv)


def main(argv: List[str]) -> int:
    args = parse_args(argv)
    truth_path = Path(args.truth)
    submission_path = Path(args.submission)

    if not truth_path.exists() or not submission_path.exists():
        print("truth or submission file not found", file=sys.stderr)
        return 2

    truth_doc = load_json_or_yaml(truth_path)
    submission_doc = load_json_or_yaml(submission_path)

    ok_truth, truth_errs = validate_against(SCHEMAS["truth"], truth_doc)
    ok_sub, sub_errs = validate_against(SCHEMAS["submission"], submission_doc)
    if not ok_truth or not ok_sub:
        for msg in truth_errs + sub_errs:
            print(f"validation_error: {msg}", file=sys.stderr)
        return 3

    report = score(truth_doc, submission_doc)

    if args.format == "json":
        print(json.dumps(report_as_dict(report), sort_keys=True, indent=2))
    else:
        print(format_text(report))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
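Aside (not part of the commit): a short illustration of the explainability tiers and the determinism penalty implemented above. The `explain` payloads and sink ids are made up, and the import path assumes the package layout added in this commit; the expected values follow directly from `explain_score` and `determinism_rate`.

```python
# Illustration of the helpers above with made-up payloads.
from tools.scorer.rb_score import determinism_rate, explain_score  # import path is an assumption

assert explain_score({}) == 0                                                         # no context
assert explain_score({"explain": {"path": ["a", "b"]}}) == 1                          # path of >= 2 nodes
assert explain_score({"explain": {"entry": "main", "path": ["a", "b", "c"]}}) == 2    # entry + path >= 3
assert explain_score({"explain": {"guards": ["if user.is_admin"]}}) == 3              # guards present

preds = [
    {"sink_id": "A", "prediction": "reachable"},
    {"sink_id": "A", "prediction": "reachable"},    # duplicate, consistent
    {"sink_id": "B", "prediction": "reachable"},
    {"sink_id": "B", "prediction": "unreachable"},  # duplicate, inconsistent
]
assert determinism_rate(preds) == 0.5  # one of the two sink groups disagrees
```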
bench/reachability-benchmark/tools/scorer/requirements.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
jsonschema==4.23.0
PyYAML==6.0.2
Binary file not shown.
bench/reachability-benchmark/tools/scorer/tests/test_scoring.py (new file, 70 lines)
@@ -0,0 +1,70 @@
import json
import importlib.util
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
SCORER_PATH = ROOT / "tools" / "scorer" / "rb_score.py"


def load_module():
    spec = importlib.util.spec_from_file_location("rb_score", SCORER_PATH)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader
    import sys
    sys.modules[spec.name] = module
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module


def load_example(name: str):
    return json.loads((ROOT / "schemas" / "examples" / name).read_text())


rb_score = load_module()


class TestScoring(unittest.TestCase):
    def test_score_perfect_prediction(self):
        truth = load_example("truth.sample.json")
        submission = load_example("submission.sample.json")

        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 1)
        self.assertEqual(report.fp, 0)
        self.assertEqual(report.fn, 0)
        self.assertEqual(report.precision, 1.0)
        self.assertEqual(report.recall, 1.0)
        self.assertEqual(report.f1, 1.0)
        self.assertGreaterEqual(report.explain_avg, 1.0)
        self.assertEqual(report.determinism_rate, 1.0)

    def test_score_false_negative_and_fp(self):
        truth = load_example("truth.sample.json")
        submission = {
            "version": "1.0.0",
            "tool": {"name": "tool", "version": "1"},
            "run": {"platform": "ubuntu"},
            "cases": [
                {
                    "case_id": "js-express-blog:001",
                    "sinks": [
                        {"sink_id": "Deserializer::parse", "prediction": "unreachable"},
                        {"sink_id": "Fake::sink", "prediction": "reachable"},
                    ],
                }
            ],
        }

        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 0)
        self.assertEqual(report.fp, 1)
        self.assertEqual(report.fn, 1)
        self.assertEqual(report.precision, 0.0)
        self.assertEqual(report.recall, 0.0)
        self.assertEqual(report.f1, 0.0)
        self.assertEqual(report.determinism_rate, 1.0)


if __name__ == "__main__":
    unittest.main()
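Aside (not part of the commit): a sketch of a further test that could be appended to `tests/test_scoring.py` (where `rb_score` is already loaded) to pin down the determinism penalty. The truth/submission dicts are hypothetical and trimmed to the fields `score()` actually reads, so they would not pass full schema validation.

```python
# Hypothetical extra test: conflicting duplicate predictions for the same sink
# should lower determinism_rate and still count as a single false positive.
class TestDeterminismPenalty(unittest.TestCase):
    def test_conflicting_duplicates_lower_rate(self):
        truth = {
            "cases": [
                {
                    "case_id": "demo:001",
                    "sinks": [
                        {"sink_id": "A::sink", "label": "reachable"},
                        {"sink_id": "B::sink", "label": "unreachable"},
                    ],
                }
            ]
        }
        submission = {
            "cases": [
                {
                    "case_id": "demo:001",
                    "sinks": [
                        {"sink_id": "A::sink", "prediction": "reachable"},
                        {"sink_id": "B::sink", "prediction": "reachable"},
                        {"sink_id": "B::sink", "prediction": "unreachable"},  # contradicts the line above
                    ],
                }
            ]
        }

        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 1)                  # A::sink correctly predicted reachable
        self.assertEqual(report.fp, 1)                  # B::sink predicted reachable, truth says unreachable
        self.assertEqual(report.fn, 0)
        self.assertEqual(report.determinism_rate, 0.5)  # one of two sink groups is inconsistent
```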