feat: Implement Filesystem and MongoDB provenance writers for PackRun execution context

- Added `FilesystemPackRunProvenanceWriter` to write provenance manifests to the filesystem.
- Introduced `MongoPackRunArtifactReader` to read artifacts from MongoDB.
- Created `MongoPackRunProvenanceWriter` to store provenance manifests in MongoDB.
- Developed unit tests for filesystem and MongoDB provenance writers.
- Established `ITimelineEventStore` and `ITimelineIngestionService` interfaces for timeline event handling.
- Implemented `TimelineIngestionService` to validate and persist timeline events with hashing.
- Created PostgreSQL schema and migration scripts for timeline indexing.
- Added dependency injection support for timeline indexer services.
- Developed tests for timeline ingestion and schema validation.
Author: StellaOps Bot
Date: 2025-11-30 15:38:14 +02:00
Parent: 8f54ffa203
Commit: 17d45a6d30
276 changed files with 8618 additions and 688 deletions

View File

@@ -1,11 +1,34 @@
-# rb-score (placeholder)
-Planned CLI to score reachability submissions against truth sets.
-Future work (BENCH-SCORER-513-008):
-- Validate submission against `schemas/submission.schema.json`.
-- Validate truth against `schemas/truth.schema.json`.
-- Compute precision/recall/F1, explainability score (0-3), runtime stats, determinism rate.
-- Emit JSON report with stable ordering.
-For now this folder is a stub; implementation will be added in task 513-008 once schemas stabilize.
+# rb-score
+Deterministic scorer for the reachability benchmark.
+
+## What it does
+- Validates submissions against `schemas/submission.schema.json` and truth against `schemas/truth.schema.json`.
+- Computes precision/recall/F1 (micro, sink-level).
+- Computes explainability score per prediction (0-3) and averages it.
+- Checks duplicate predictions for determinism (inconsistent duplicates lower the rate).
+- Surfaces runtime metadata from the submission (`run` block).
+
+## Install (offline-friendly)
+```bash
+python -m pip install -r requirements.txt
+```
+
+## Usage
+```bash
+./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
+```
+
+## Output
+- `text` (default): short human-readable summary.
+- `json`: deterministic JSON with top-level metrics and per-case breakdown.
+
+## Tests
+```bash
+python -m unittest tests/test_scoring.py
+```
+
+## Notes
+- Predictions for sinks not present in truth count as false positives (strict posture).
+- Truth sinks with label `unknown` are ignored for FN/FP counting.
+- Explainability tiering: 0=no context; 1=path>=2 nodes; 2=entry + path>=3; 3=guards present.
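As a quick illustration of the counting rules documented in the Notes above (not part of the committed README; the sink IDs below are invented), the micro sink-level metrics for one hypothetical case work out as follows:

```python
# Hypothetical truth labels and predictions for a single case (names are made up).
truth = {
    "A::sink": "reachable",
    "B::sink": "reachable",
    "C::sink": "unreachable",
    "D::sink": "unknown",      # ignored for FN/FP counting
}
predicted_reachable = {"A::sink", "C::sink", "E::sink"}  # E::sink is absent from truth

truth_reach = {s for s, label in truth.items() if label == "reachable"}

tp = len(predicted_reachable & truth_reach)   # {"A::sink"}              -> 1
fp = len(predicted_reachable - truth_reach)   # {"C::sink", "E::sink"}   -> 2 (strict posture)
fn = len(truth_reach - predicted_reachable)   # {"B::sink"}              -> 1

precision = tp / (tp + fp)                          # 0.3333
recall = tp / (tp + fn)                             # 0.5
f1 = 2 * precision * recall / (precision + recall)  # 0.4
print(tp, fp, fn, round(precision, 4), round(recall, 4), round(f1, 4))
```

The `unknown` sink contributes nothing to any count, while the prediction for `E::sink`, which truth never mentions, is exactly what the strict posture turns into a false positive.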

View File

@@ -0,0 +1,3 @@
from . import rb_score
__all__ = ["rb_score"]

View File

@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_score.py" "$@"

View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""rb-score: deterministic scorer for reachability benchmark submissions.
Features (task BENCH-SCORER-513-008):
- Validate submission and truth against published schemas.
- Compute precision / recall / F1 at sink level (micro-averaged).
- Compute explainability score per prediction (03) and average.
- Surface runtime stats from submission metadata.
- Emit deterministic JSON or human-readable text.
Assumptions:
- Truth labels may include "unknown"; these are skipped for FN/FP.
- A prediction for a sink absent in truth counts as FP (strict posture).
- Duplicate predictions for the same sink must agree; disagreement reduces determinism rate.
"""
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
import yaml
from jsonschema import Draft202012Validator
ROOT = Path(__file__).resolve().parents[2]  # bench/reachability-benchmark root (schemas/ lives beside tools/)
SCHEMAS = {
    "truth": ROOT / "schemas" / "truth.schema.json",
    "submission": ROOT / "schemas" / "submission.schema.json",
}


@dataclass
class CaseMetrics:
    case_id: str
    tp: int
    fp: int
    fn: int
    precision: float
    recall: float
    f1: float
    explain_avg: float


@dataclass
class ScoreReport:
    precision: float
    recall: float
    f1: float
    tp: int
    fp: int
    fn: int
    explain_avg: float
    determinism_rate: float
    runtime: Dict[str, object]
    cases: List[CaseMetrics]


def load_json_or_yaml(path: Path):
    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() in {".yaml", ".yml"}:
        return yaml.safe_load(text)
    return json.loads(text)


def validate_against(schema_path: Path, payload) -> Tuple[bool, List[str]]:
    schema = load_json_or_yaml(schema_path)
    validator = Draft202012Validator(schema)
    errors = sorted(validator.iter_errors(payload), key=lambda e: e.path)
    if not errors:
        return True, []
    return False, [f"{'/'.join(str(p) for p in err.path) or '<root>'}: {err.message}" for err in errors]


def safe_div(num: int, denom: int, default: float) -> float:
    if denom == 0:
        return default
    return num / denom


def explain_score(pred: dict) -> int:
    expl = pred.get("explain") or {}
    path = expl.get("path") or []
    entry = expl.get("entry")
    guards = expl.get("guards") or []
    if guards:
        return 3
    if entry and len(path) >= 3:
        return 2
    if len(path) >= 2:
        return 1
    return 0


def determinism_rate(preds: Iterable[dict]) -> float:
    """Detect inconsistent duplicate predictions for the same sink."""
    by_sink: Dict[str, set] = {}
    total_groups = 0
    consistent_groups = 0
    for pred in preds:
        sink_id = pred.get("sink_id")
        if sink_id is None:
            continue
        by_sink.setdefault(sink_id, set()).add(pred.get("prediction"))
    for values in by_sink.values():
        total_groups += 1
        if len(values) == 1:
            consistent_groups += 1
    if total_groups == 0:
        return 1.0
    return consistent_groups / total_groups


def score_case(case_id: str, truth_sinks: Dict[str, str], predicted: List[dict]) -> CaseMetrics:
    truth_reach = {sid for sid, label in truth_sinks.items() if label == "reachable"}
    truth_unreach = {sid for sid, label in truth_sinks.items() if label == "unreachable"}
    pred_reach = {p["sink_id"] for p in predicted if p.get("prediction") == "reachable"}
    tp = len(pred_reach & truth_reach)
    fp = len(pred_reach - truth_reach)
    fn = len(truth_reach - pred_reach)
    precision = safe_div(tp, tp + fp, 1.0)
    recall = safe_div(tp, tp + fn, 1.0)
    f1 = 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)
    explain_scores = [explain_score(p) for p in predicted]
    explain_avg = safe_div(sum(explain_scores), len(explain_scores), 0.0)
    return CaseMetrics(case_id, tp, fp, fn, precision, recall, f1, explain_avg)


def aggregate(cases: List[CaseMetrics], preds: List[dict]) -> ScoreReport:
    tp = sum(c.tp for c in cases)
    fp = sum(c.fp for c in cases)
    fn = sum(c.fn for c in cases)
    precision = safe_div(tp, tp + fp, 1.0)
    recall = safe_div(tp, tp + fn, 1.0)
    f1 = 0.0 if (precision + recall) == 0 else 2 * precision * recall / (precision + recall)
    explain_avg = safe_div(sum(c.explain_avg for c in cases), len(cases), 0.0) if cases else 0.0
    det_rate = determinism_rate(preds)
    runtime = {}
    return ScoreReport(precision, recall, f1, tp, fp, fn, explain_avg, det_rate, runtime, cases)


def build_truth_index(truth_doc: dict) -> Dict[str, Dict[str, str]]:
    index: Dict[str, Dict[str, str]] = {}
    for case in truth_doc.get("cases", []):
        sinks = {s["sink_id"]: s["label"] for s in case.get("sinks", [])}
        index[case["case_id"]] = sinks
    return index


def score(truth_doc: dict, submission_doc: dict) -> ScoreReport:
    truth_index = build_truth_index(truth_doc)
    cases_metrics: List[CaseMetrics] = []
    all_preds: List[dict] = []
    for sub_case in submission_doc.get("cases", []):
        case_id = sub_case.get("case_id")
        predicted_sinks = sub_case.get("sinks") or []
        all_preds.extend(predicted_sinks)
        truth_sinks = truth_index.get(case_id, {})
        case_metrics = score_case(case_id, truth_sinks, predicted_sinks)
        cases_metrics.append(case_metrics)
    report = aggregate(cases_metrics, all_preds)
    report.runtime = submission_doc.get("run", {})
    return report


def report_as_dict(report: ScoreReport) -> dict:
    return {
        "version": "1.0.0",
        "metrics": {
            "precision": round(report.precision, 4),
            "recall": round(report.recall, 4),
            "f1": round(report.f1, 4),
            "tp": report.tp,
            "fp": report.fp,
            "fn": report.fn,
            "determinism_rate": round(report.determinism_rate, 4),
            "explainability_avg": round(report.explain_avg, 4),
        },
        "runtime": report.runtime,
        "cases": [
            {
                "case_id": c.case_id,
                "precision": round(c.precision, 4),
                "recall": round(c.recall, 4),
                "f1": round(c.f1, 4),
                "tp": c.tp,
                "fp": c.fp,
                "fn": c.fn,
                "explainability_avg": round(c.explain_avg, 4),
            }
            for c in report.cases
        ],
    }


def format_text(report: ScoreReport) -> str:
    lines = []
    lines.append("rb-score summary")
    lines.append(f" precision {report.precision:.4f} recall {report.recall:.4f} f1 {report.f1:.4f}")
    lines.append(f" tp {report.tp} fp {report.fp} fn {report.fn} determinism {report.determinism_rate:.4f} explain_avg {report.explain_avg:.4f}")
    if report.runtime:
        rt = report.runtime
        lines.append(" runtime: " + ", ".join(f"{k}={v}" for k, v in sorted(rt.items())))
    lines.append(" cases:")
    for c in report.cases:
        lines.append(
            f" - {c.case_id}: P {c.precision:.4f} R {c.recall:.4f} F1 {c.f1:.4f} tp {c.tp} fp {c.fp} fn {c.fn} explain_avg {c.explain_avg:.4f}"
        )
    return "\n".join(lines)


def parse_args(argv: List[str]) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Score reachability benchmark submissions")
    parser.add_argument("--truth", required=True, help="Path to truth JSON")
    parser.add_argument("--submission", required=True, help="Path to submission JSON")
    parser.add_argument("--format", choices=["json", "text"], default="text", help="Output format")
    return parser.parse_args(argv)


def main(argv: List[str]) -> int:
    args = parse_args(argv)
    truth_path = Path(args.truth)
    submission_path = Path(args.submission)
    if not truth_path.exists() or not submission_path.exists():
        print("truth or submission file not found", file=sys.stderr)
        return 2
    truth_doc = load_json_or_yaml(truth_path)
    submission_doc = load_json_or_yaml(submission_path)
    ok_truth, truth_errs = validate_against(SCHEMAS["truth"], truth_doc)
    ok_sub, sub_errs = validate_against(SCHEMAS["submission"], submission_doc)
    if not ok_truth or not ok_sub:
        for msg in truth_errs + sub_errs:
            print(f"validation_error: {msg}", file=sys.stderr)
        return 3
    report = score(truth_doc, submission_doc)
    if args.format == "json":
        print(json.dumps(report_as_dict(report), sort_keys=True, indent=2))
    else:
        print(format_text(report))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
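Because `score()` operates on plain dictionaries and does no file I/O of its own, the explainability tiering and the determinism penalty can be exercised directly. The sketch below is illustrative only; it assumes `rb_score` is importable (for example, when run from `tools/scorer/`), and the case ID and sink names are invented:

```python
# Minimal in-memory exercise of rb_score.score(); all identifiers below are hypothetical.
import json

import rb_score  # assumes tools/scorer/ is on sys.path or is the working directory

truth = {
    "cases": [
        {
            "case_id": "demo:001",
            "sinks": [
                {"sink_id": "Parser::load", "label": "reachable"},
                {"sink_id": "Exec::run", "label": "unreachable"},
            ],
        }
    ]
}
submission = {
    "run": {"platform": "local"},
    "cases": [
        {
            "case_id": "demo:001",
            "sinks": [
                # entry + path of 3 nodes, no guards -> explainability tier 2
                {
                    "sink_id": "Parser::load",
                    "prediction": "reachable",
                    "explain": {"entry": "main", "path": ["main", "handler", "Parser::load"]},
                },
                # duplicate prediction that disagrees with the one above -> hurts determinism
                {"sink_id": "Parser::load", "prediction": "unreachable"},
                {"sink_id": "Exec::run", "prediction": "unreachable"},
            ],
        }
    ]
}

report = rb_score.score(truth, submission)
print(report.tp, report.fp, report.fn)   # 1 0 0
print(report.determinism_rate)           # 0.5 (Parser::load group inconsistent, Exec::run consistent)
print(round(report.explain_avg, 4))      # 0.6667 (tiers 2, 0, 0 averaged over three predictions)
print(json.dumps(rb_score.report_as_dict(report), sort_keys=True, indent=2))
```

Schema validation only happens in `main()`, so in-memory experiments like this skip it deliberately.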

View File

@@ -0,0 +1,2 @@
jsonschema==4.23.0
PyYAML==6.0.2

View File

@@ -0,0 +1,70 @@
import json
import importlib.util
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
SCORER_PATH = ROOT / "tools" / "scorer" / "rb_score.py"


def load_module():
    spec = importlib.util.spec_from_file_location("rb_score", SCORER_PATH)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader
    import sys

    sys.modules[spec.name] = module
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module


def load_example(name: str):
    return json.loads((ROOT / "schemas" / "examples" / name).read_text())


rb_score = load_module()


class TestScoring(unittest.TestCase):
    def test_score_perfect_prediction(self):
        truth = load_example("truth.sample.json")
        submission = load_example("submission.sample.json")
        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 1)
        self.assertEqual(report.fp, 0)
        self.assertEqual(report.fn, 0)
        self.assertEqual(report.precision, 1.0)
        self.assertEqual(report.recall, 1.0)
        self.assertEqual(report.f1, 1.0)
        self.assertGreaterEqual(report.explain_avg, 1.0)
        self.assertEqual(report.determinism_rate, 1.0)

    def test_score_false_negative_and_fp(self):
        truth = load_example("truth.sample.json")
        submission = {
            "version": "1.0.0",
            "tool": {"name": "tool", "version": "1"},
            "run": {"platform": "ubuntu"},
            "cases": [
                {
                    "case_id": "js-express-blog:001",
                    "sinks": [
                        {"sink_id": "Deserializer::parse", "prediction": "unreachable"},
                        {"sink_id": "Fake::sink", "prediction": "reachable"},
                    ],
                }
            ],
        }
        report = rb_score.score(truth, submission)
        self.assertEqual(report.tp, 0)
        self.assertEqual(report.fp, 1)
        self.assertEqual(report.fn, 1)
        self.assertEqual(report.precision, 0.0)
        self.assertEqual(report.recall, 0.0)
        self.assertEqual(report.f1, 0.0)
        self.assertEqual(report.determinism_rate, 1.0)


if __name__ == "__main__":
    unittest.main()
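Both shipped tests only ever observe a determinism rate of 1.0. A possible additional check (a sketch, not part of this commit, with invented sink IDs) could pin the penalty for disagreeing duplicates by calling `determinism_rate` directly, since it needs no truth document:

```python
class TestDeterminism(unittest.TestCase):
    def test_inconsistent_duplicates_lower_rate(self):
        preds = [
            {"sink_id": "A::sink", "prediction": "reachable"},
            {"sink_id": "A::sink", "prediction": "unreachable"},  # disagrees with the line above
            {"sink_id": "B::sink", "prediction": "reachable"},
            {"sink_id": "B::sink", "prediction": "reachable"},    # consistent duplicate
        ]
        # One of the two duplicate groups is consistent -> rate 0.5.
        self.assertEqual(rb_score.determinism_rate(preds), 0.5)
```

Appended to `tests/test_scoring.py`, this reuses the `rb_score` module already loaded at import time by `load_module()`.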