#!/usr/bin/env python3
# SPDX-License-Identifier: AGPL-3.0-or-later
# BENCH-AUTO-401-019: Automate population of bench/findings/** from reachbench fixtures
"""
Populates bench/findings/** with per-CVE VEX decision bundles derived from
reachbench fixtures, including reachability evidence, SBOM excerpts, and
DSSE envelope stubs.

Usage:
    python scripts/bench/populate-findings.py [--fixtures PATH] [--output PATH] [--dry-run]
"""

import argparse
import base64
import hashlib
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


def blake3_hex(data: bytes) -> str:
    """Compute a BLAKE3-256 hash, falling back to SHA-256 if blake3 is not installed."""
    try:
        import blake3

        # Prefix the digest with its algorithm so consumers can tell the two
        # variants apart; the SHA-256 fallback below does the same.
        return "blake3:" + blake3.blake3(data).hexdigest()
    except ImportError:
        return "sha256:" + hashlib.sha256(data).hexdigest()


def sha256_hex(data: bytes) -> str:
    """Compute a SHA-256 hash."""
    return hashlib.sha256(data).hexdigest()


def canonical_json(obj: Any) -> str:
    """Serialize an object to canonical JSON (sorted keys, no extra whitespace) for hashing."""
    return json.dumps(obj, sort_keys=True, separators=(',', ':'))


def canonical_json_pretty(obj: Any) -> str:
    """Serialize an object to canonical JSON with indentation for readability."""
    return json.dumps(obj, sort_keys=True, indent=2)
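
# Note: populate_finding() hashes the pretty serialization exactly as written
# to disk, so evidence hashes stay reproducible only while the sort_keys and
# indent settings above are unchanged.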


def load_reachbench_index(fixtures_path: Path) -> dict:
    """Load the reachbench INDEX.json."""
    index_path = fixtures_path / "INDEX.json"
    if not index_path.exists():
        raise FileNotFoundError(f"Reachbench INDEX not found: {index_path}")
    with open(index_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_ground_truth(case_path: Path, variant: str) -> dict | None:
    """Load a variant's reachgraph.truth.json, or return None if it is absent."""
    truth_path = case_path / "images" / variant / "reachgraph.truth.json"
    if not truth_path.exists():
        return None
    with open(truth_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def create_openvex_decision(
    cve_id: str,
    purl: str,
    status: str,  # "not_affected" or "affected"
    justification: str | None,
    evidence_hash: str,
    timestamp: str
) -> dict:
    """Create an OpenVEX decision document."""
    doc = {
        "@context": "https://openvex.dev/ns/v0.2.0",
        "@type": "VEX",
        "author": "StellaOps Bench Automation",
        "role": "security_team",
        "timestamp": timestamp,
        "version": 1,
        "tooling": "StellaOps/bench-auto@1.0.0",
        "statements": [
            {
                "vulnerability": {
                    "@id": f"https://nvd.nist.gov/vuln/detail/{cve_id}",
                    "name": cve_id,
                },
                "products": [
                    {"@id": purl}
                ],
                "status": status,
            }
        ]
    }
    stmt = doc["statements"][0]
    # OpenVEX expects a justification only on not_affected statements.
    if justification and status == "not_affected":
        stmt["justification"] = justification
    # Affected statements carry an action_statement instead.
    if status == "affected":
        stmt["action_statement"] = "Upgrade to patched version or apply mitigation."
    # Link the statement back to the reachability evidence by hash.
    stmt["impact_statement"] = f"Evidence hash: {evidence_hash}"
    return doc
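
# A "reachable" variant thus yields a statement with status "affected" and an
# action_statement; an "unreachable" variant yields "not_affected" with
# justification "vulnerable_code_not_present" (see populate_finding below).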


def create_dsse_envelope_stub(payload: dict, payload_type: str = "application/vnd.openvex+json") -> dict:
    """Create a DSSE envelope stub (signature placeholder for actual signing)."""
    payload_json = canonical_json(payload)
    payload_b64 = base64.b64encode(payload_json.encode()).decode()
    return {
        "payloadType": payload_type,
        "payload": payload_b64,
        "signatures": [
            {
                "keyid": "stella.ops/bench-automation@v1",
                "sig": "PLACEHOLDER_SIGNATURE_REQUIRES_ACTUAL_SIGNING"
            }
        ]
    }
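
# Note: a real DSSE signature is computed over the spec's pre-authentication
# encoding (PAE) of payloadType and payload, not over the base64 string in the
# envelope. This stub only fixes the envelope shape; signing is assumed to
# happen out-of-band before the envelope is submitted to Rekor.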


def create_metadata(
    cve_id: str,
    purl: str,
    variant: str,
    case_id: str,
    ground_truth: dict | None,
    timestamp: str
) -> dict:
    """Create metadata.json for a finding."""
    return {
        "cve_id": cve_id,
        "purl": purl,
        "case_id": case_id,
        "variant": variant,
        "reachability_status": "reachable" if variant == "reachable" else "unreachable",
        "ground_truth_schema": ground_truth.get("schema_version") if ground_truth else None,
        "generated_at": timestamp,
        "generator": "scripts/bench/populate-findings.py",
        "generator_version": "1.0.0"
    }


def extract_cve_id(case_id: str) -> str:
    """Extract a CVE ID from case_id, or generate a placeholder."""
    # Well-known fixture names map to their headline CVEs: log4j -> Log4Shell,
    # curl -> SOCKS5 heap overflow, kestrel -> HTTP/2 Rapid Reset,
    # spring -> Spring4Shell, openssl -> punycode overflow,
    # glibc -> getaddrinfo overflow.
    cve_mapping = {
        "log4j": "CVE-2021-44228",
        "curl": "CVE-2023-38545",
        "kestrel": "CVE-2023-44487",
        "spring": "CVE-2022-22965",
        "openssl": "CVE-2022-3602",
        "glibc": "CVE-2015-7547",
    }
    for key, cve in cve_mapping.items():
        if key in case_id.lower():
            return cve
    # Fall back to a clearly synthetic placeholder for unknown cases.
    return f"CVE-BENCH-{case_id.upper()[:8]}"


def extract_purl(case_id: str, case_data: dict) -> str:
    """Extract or generate a purl from case data."""
    # Use an explicit purl from the case metadata if available
    if "purl" in case_data:
        return case_data["purl"]
    # Otherwise synthesize one from the case's language and version
    lang = case_data.get("language", "unknown")
    version = case_data.get("version", "1.0.0")
    pkg_type_map = {
        "java": "maven",
        "dotnet": "nuget",
        "go": "golang",
        "python": "pypi",
        "rust": "cargo",
        "native": "generic",
    }
    pkg_type = pkg_type_map.get(lang, "generic")
    return f"pkg:{pkg_type}/{case_id}@{version}"


def populate_finding(
    case_id: str,
    case_data: dict,
    case_path: Path,
    output_dir: Path,
    timestamp: str,
    dry_run: bool
) -> dict:
    """Populate a single CVE finding bundle."""
    cve_id = extract_cve_id(case_id)
    purl = extract_purl(case_id, case_data)
    results = {
        "case_id": case_id,
        "cve_id": cve_id,
        "variants_processed": [],
        "errors": []
    }
    for variant in ["reachable", "unreachable"]:
        variant_path = case_path / "images" / variant
        if not variant_path.exists():
            continue
        ground_truth = load_ground_truth(case_path, variant)
        # Determine VEX status based on variant
        if variant == "reachable":
            vex_status = "affected"
            justification = None
        else:
            vex_status = "not_affected"
            justification = "vulnerable_code_not_present"
        # Create finding directory
        finding_id = f"{cve_id}-{variant}"
        finding_dir = output_dir / finding_id
        evidence_dir = finding_dir / "evidence"
        if not dry_run:
            finding_dir.mkdir(parents=True, exist_ok=True)
            evidence_dir.mkdir(parents=True, exist_ok=True)
        # Create reachability evidence excerpt
        evidence = {
            "schema_version": "richgraph-excerpt/v1",
            "case_id": case_id,
            "variant": variant,
            "ground_truth": ground_truth,
            "paths": ground_truth.get("paths", []) if ground_truth else [],
            "generated_at": timestamp
        }
        evidence_json = canonical_json_pretty(evidence)
        evidence_hash = blake3_hex(evidence_json.encode())
        if not dry_run:
            with open(evidence_dir / "reachability.json", 'w', encoding='utf-8') as f:
                f.write(evidence_json)
        # Create SBOM excerpt
        sbom = {
            "bomFormat": "CycloneDX",
            "specVersion": "1.6",
            "version": 1,
            "metadata": {
                "timestamp": timestamp,
                "tools": [{"vendor": "StellaOps", "name": "bench-auto", "version": "1.0.0"}]
            },
            "components": [
                {
                    "type": "library",
                    "purl": purl,
                    "name": case_id,
                    "version": case_data.get("version", "1.0.0")
                }
            ]
        }
        if not dry_run:
            with open(evidence_dir / "sbom.cdx.json", 'w', encoding='utf-8') as f:
                json.dump(sbom, f, indent=2, sort_keys=True)
        # Create OpenVEX decision
        openvex = create_openvex_decision(
            cve_id=cve_id,
            purl=purl,
            status=vex_status,
            justification=justification,
            evidence_hash=evidence_hash,
            timestamp=timestamp
        )
        if not dry_run:
            with open(finding_dir / "decision.openvex.json", 'w', encoding='utf-8') as f:
                json.dump(openvex, f, indent=2, sort_keys=True)
        # Create DSSE envelope stub
        dsse = create_dsse_envelope_stub(openvex)
        if not dry_run:
            with open(finding_dir / "decision.dsse.json", 'w', encoding='utf-8') as f:
                json.dump(dsse, f, indent=2, sort_keys=True)
        # Create Rekor placeholder
        if not dry_run:
            with open(finding_dir / "rekor.txt", 'w', encoding='utf-8') as f:
                f.write("# Rekor log entry placeholder\n")
                f.write("# Submit DSSE envelope to Rekor to populate this file\n")
                f.write("log_index: PENDING\n")
                f.write("uuid: PENDING\n")
                f.write(f"timestamp: {timestamp}\n")
        # Create metadata
        metadata = create_metadata(
            cve_id=cve_id,
            purl=purl,
            variant=variant,
            case_id=case_id,
            ground_truth=ground_truth,
            timestamp=timestamp
        )
        if not dry_run:
            with open(finding_dir / "metadata.json", 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, sort_keys=True)
        results["variants_processed"].append({
            "variant": variant,
            "finding_id": finding_id,
            "vex_status": vex_status,
            "evidence_hash": evidence_hash
        })
    return results
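
# Re-running the script overwrites findings in place: directories are created
# with exist_ok=True and every file is rewritten, so stale bundles are replaced
# rather than accumulated. Note that generated_at embeds the run timestamp, so
# evidence hashes differ between runs.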


def main():
    parser = argparse.ArgumentParser(
        description="Populate bench/findings/** from reachbench fixtures"
    )
    parser.add_argument(
        "--fixtures",
        type=Path,
        default=Path("tests/reachability/fixtures/reachbench-2025-expanded"),
        help="Path to reachbench fixtures directory"
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("bench/findings"),
        help="Output directory for findings"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be created without writing files"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Limit number of cases to process (0 = all)"
    )
    args = parser.parse_args()

    # Resolve paths relative to the repo root (scripts/bench/ is two levels down)
    repo_root = Path(__file__).parent.parent.parent
    fixtures_path = args.fixtures if args.fixtures.is_absolute() else repo_root / args.fixtures
    output_path = args.output if args.output.is_absolute() else repo_root / args.output
    print(f"Fixtures path: {fixtures_path}")
    print(f"Output path: {output_path}")
    print(f"Dry run: {args.dry_run}")

    # Load reachbench index
    try:
        index = load_reachbench_index(fixtures_path)
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Capture one timestamp per run so all generated artifacts agree
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    cases = index.get("cases", [])
    if args.limit > 0:
        cases = cases[:args.limit]
    print(f"Processing {len(cases)} cases...")

    all_results = []
    for case in cases:
        case_id = case["id"]
        case_path_rel = case.get("path", f"cases/{case_id}")
        case_path = fixtures_path / case_path_rel
        if not case_path.exists():
            print(f"  Warning: Case path not found: {case_path}")
            continue
        print(f"  Processing: {case_id}")
        result = populate_finding(
            case_id=case_id,
            case_data=case,
            case_path=case_path,
            output_dir=output_path,
            timestamp=timestamp,
            dry_run=args.dry_run
        )
        all_results.append(result)
        for v in result["variants_processed"]:
            print(f"    - {v['finding_id']}: {v['vex_status']}")

    # Summary
    total_findings = sum(len(r["variants_processed"]) for r in all_results)
    print(f"\nGenerated {total_findings} findings from {len(all_results)} cases")
    if args.dry_run:
        print("(dry-run mode - no files written)")
    return 0


if __name__ == "__main__":
    sys.exit(main())