#!/usr/bin/env python3 # SPDX-License-Identifier: AGPL-3.0-or-later # BENCH-AUTO-401-019: Automate population of bench/findings/** from reachbench fixtures """ Populates bench/findings/** with per-CVE VEX decision bundles derived from reachbench fixtures, including reachability evidence, SBOM excerpts, and DSSE envelope stubs. Usage: python scripts/bench/populate-findings.py [--fixtures PATH] [--output PATH] [--dry-run] """ import argparse import hashlib import json import os import sys from datetime import datetime, timezone from pathlib import Path from typing import Any def blake3_hex(data: bytes) -> str: """Compute BLAKE3-256 hash (fallback to SHA-256 if blake3 not installed).""" try: import blake3 return blake3.blake3(data).hexdigest() except ImportError: return "sha256:" + hashlib.sha256(data).hexdigest() def sha256_hex(data: bytes) -> str: """Compute SHA-256 hash.""" return hashlib.sha256(data).hexdigest() def canonical_json(obj: Any) -> str: """Serialize object to canonical JSON (sorted keys, no extra whitespace for hashes).""" return json.dumps(obj, sort_keys=True, separators=(',', ':')) def canonical_json_pretty(obj: Any) -> str: """Serialize object to canonical JSON with indentation for readability.""" return json.dumps(obj, sort_keys=True, indent=2) def load_reachbench_index(fixtures_path: Path) -> dict: """Load the reachbench INDEX.json.""" index_path = fixtures_path / "INDEX.json" if not index_path.exists(): raise FileNotFoundError(f"Reachbench INDEX not found: {index_path}") with open(index_path, 'r', encoding='utf-8') as f: return json.load(f) def load_ground_truth(case_path: Path, variant: str) -> dict | None: """Load ground-truth.json for a variant.""" truth_path = case_path / "images" / variant / "reachgraph.truth.json" if not truth_path.exists(): return None with open(truth_path, 'r', encoding='utf-8') as f: return json.load(f) def create_openvex_decision( cve_id: str, purl: str, status: str, # "not_affected" or "affected" justification: str | None, evidence_hash: str, timestamp: str ) -> dict: """Create an OpenVEX decision document.""" statement = { "@context": "https://openvex.dev/ns/v0.2.0", "@type": "VEX", "author": "StellaOps Bench Automation", "role": "security_team", "timestamp": timestamp, "version": 1, "tooling": "StellaOps/bench-auto@1.0.0", "statements": [ { "vulnerability": { "@id": f"https://nvd.nist.gov/vuln/detail/{cve_id}", "name": cve_id, }, "products": [ {"@id": purl} ], "status": status, } ] } if justification and status == "not_affected": statement["statements"][0]["justification"] = justification # Add action_statement for affected if status == "affected": statement["statements"][0]["action_statement"] = "Upgrade to patched version or apply mitigation." # Add evidence reference statement["statements"][0]["impact_statement"] = f"Evidence hash: {evidence_hash}" return statement def create_dsse_envelope_stub(payload: dict, payload_type: str = "application/vnd.openvex+json") -> dict: """Create a DSSE envelope stub (signature placeholder for actual signing).""" payload_json = canonical_json(payload) payload_b64 = __import__('base64').b64encode(payload_json.encode()).decode() return { "payloadType": payload_type, "payload": payload_b64, "signatures": [ { "keyid": "stella.ops/bench-automation@v1", "sig": "PLACEHOLDER_SIGNATURE_REQUIRES_ACTUAL_SIGNING" } ] } def create_metadata( cve_id: str, purl: str, variant: str, case_id: str, ground_truth: dict | None, timestamp: str ) -> dict: """Create metadata.json for a finding.""" return { "cve_id": cve_id, "purl": purl, "case_id": case_id, "variant": variant, "reachability_status": "reachable" if variant == "reachable" else "unreachable", "ground_truth_schema": ground_truth.get("schema_version") if ground_truth else None, "generated_at": timestamp, "generator": "scripts/bench/populate-findings.py", "generator_version": "1.0.0" } def extract_cve_id(case_id: str) -> str: """Extract CVE ID from case_id, or generate a placeholder.""" # Common patterns: log4j -> CVE-2021-44228, curl -> CVE-2023-38545, etc. cve_mapping = { "log4j": "CVE-2021-44228", "curl": "CVE-2023-38545", "kestrel": "CVE-2023-44487", "spring": "CVE-2022-22965", "openssl": "CVE-2022-3602", "glibc": "CVE-2015-7547", } for key, cve in cve_mapping.items(): if key in case_id.lower(): return cve # Generate placeholder CVE for unknown cases return f"CVE-BENCH-{case_id.upper()[:8]}" def extract_purl(case_id: str, case_data: dict) -> str: """Extract or generate a purl from case data.""" # Use case metadata if available if "purl" in case_data: return case_data["purl"] # Generate based on case_id patterns lang = case_data.get("language", "unknown") version = case_data.get("version", "1.0.0") pkg_type_map = { "java": "maven", "dotnet": "nuget", "go": "golang", "python": "pypi", "rust": "cargo", "native": "generic", } pkg_type = pkg_type_map.get(lang, "generic") return f"pkg:{pkg_type}/{case_id}@{version}" def populate_finding( case_id: str, case_data: dict, case_path: Path, output_dir: Path, timestamp: str, dry_run: bool ) -> dict: """Populate a single CVE finding bundle.""" cve_id = extract_cve_id(case_id) purl = extract_purl(case_id, case_data) results = { "case_id": case_id, "cve_id": cve_id, "variants_processed": [], "errors": [] } for variant in ["reachable", "unreachable"]: variant_path = case_path / "images" / variant if not variant_path.exists(): continue ground_truth = load_ground_truth(case_path, variant) # Determine VEX status based on variant if variant == "reachable": vex_status = "affected" justification = None else: vex_status = "not_affected" justification = "vulnerable_code_not_present" # Create finding directory finding_id = f"{cve_id}-{variant}" finding_dir = output_dir / finding_id evidence_dir = finding_dir / "evidence" if not dry_run: finding_dir.mkdir(parents=True, exist_ok=True) evidence_dir.mkdir(parents=True, exist_ok=True) # Create reachability evidence excerpt evidence = { "schema_version": "richgraph-excerpt/v1", "case_id": case_id, "variant": variant, "ground_truth": ground_truth, "paths": ground_truth.get("paths", []) if ground_truth else [], "generated_at": timestamp } evidence_json = canonical_json_pretty(evidence) evidence_hash = blake3_hex(evidence_json.encode()) if not dry_run: with open(evidence_dir / "reachability.json", 'w', encoding='utf-8') as f: f.write(evidence_json) # Create SBOM excerpt sbom = { "bomFormat": "CycloneDX", "specVersion": "1.6", "version": 1, "metadata": { "timestamp": timestamp, "tools": [{"vendor": "StellaOps", "name": "bench-auto", "version": "1.0.0"}] }, "components": [ { "type": "library", "purl": purl, "name": case_id, "version": case_data.get("version", "1.0.0") } ] } if not dry_run: with open(evidence_dir / "sbom.cdx.json", 'w', encoding='utf-8') as f: json.dump(sbom, f, indent=2, sort_keys=True) # Create OpenVEX decision openvex = create_openvex_decision( cve_id=cve_id, purl=purl, status=vex_status, justification=justification, evidence_hash=evidence_hash, timestamp=timestamp ) if not dry_run: with open(finding_dir / "decision.openvex.json", 'w', encoding='utf-8') as f: json.dump(openvex, f, indent=2, sort_keys=True) # Create DSSE envelope stub dsse = create_dsse_envelope_stub(openvex) if not dry_run: with open(finding_dir / "decision.dsse.json", 'w', encoding='utf-8') as f: json.dump(dsse, f, indent=2, sort_keys=True) # Create Rekor placeholder if not dry_run: with open(finding_dir / "rekor.txt", 'w', encoding='utf-8') as f: f.write(f"# Rekor log entry placeholder\n") f.write(f"# Submit DSSE envelope to Rekor to populate this file\n") f.write(f"log_index: PENDING\n") f.write(f"uuid: PENDING\n") f.write(f"timestamp: {timestamp}\n") # Create metadata metadata = create_metadata( cve_id=cve_id, purl=purl, variant=variant, case_id=case_id, ground_truth=ground_truth, timestamp=timestamp ) if not dry_run: with open(finding_dir / "metadata.json", 'w', encoding='utf-8') as f: json.dump(metadata, f, indent=2, sort_keys=True) results["variants_processed"].append({ "variant": variant, "finding_id": finding_id, "vex_status": vex_status, "evidence_hash": evidence_hash }) return results def main(): parser = argparse.ArgumentParser( description="Populate bench/findings/** from reachbench fixtures" ) parser.add_argument( "--fixtures", type=Path, default=Path("tests/reachability/fixtures/reachbench-2025-expanded"), help="Path to reachbench fixtures directory" ) parser.add_argument( "--output", type=Path, default=Path("bench/findings"), help="Output directory for findings" ) parser.add_argument( "--dry-run", action="store_true", help="Print what would be created without writing files" ) parser.add_argument( "--limit", type=int, default=0, help="Limit number of cases to process (0 = all)" ) args = parser.parse_args() # Resolve paths relative to repo root repo_root = Path(__file__).parent.parent.parent fixtures_path = repo_root / args.fixtures if not args.fixtures.is_absolute() else args.fixtures output_path = repo_root / args.output if not args.output.is_absolute() else args.output print(f"Fixtures path: {fixtures_path}") print(f"Output path: {output_path}") print(f"Dry run: {args.dry_run}") # Load reachbench index try: index = load_reachbench_index(fixtures_path) except FileNotFoundError as e: print(f"Error: {e}", file=sys.stderr) return 1 timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") cases = index.get("cases", []) if args.limit > 0: cases = cases[:args.limit] print(f"Processing {len(cases)} cases...") all_results = [] for case in cases: case_id = case["id"] case_path_rel = case.get("path", f"cases/{case_id}") case_path = fixtures_path / case_path_rel if not case_path.exists(): print(f" Warning: Case path not found: {case_path}") continue print(f" Processing: {case_id}") result = populate_finding( case_id=case_id, case_data=case, case_path=case_path, output_dir=output_path, timestamp=timestamp, dry_run=args.dry_run ) all_results.append(result) for v in result["variants_processed"]: print(f" - {v['finding_id']}: {v['vex_status']}") # Summary total_findings = sum(len(r["variants_processed"]) for r in all_results) print(f"\nGenerated {total_findings} findings from {len(all_results)} cases") if args.dry_run: print("(dry-run mode - no files written)") return 0 if __name__ == "__main__": sys.exit(main())