#!/usr/bin/env python3
# SPDX-License-Identifier: AGPL-3.0-or-later
# BENCH-AUTO-401-019: Automate population of bench/findings/** from reachbench fixtures
"""
Populates bench/findings/** with per-CVE VEX decision bundles derived from
reachbench fixtures, including reachability evidence, SBOM excerpts, and
DSSE envelope stubs.

Usage:
    python scripts/bench/populate-findings.py [--fixtures PATH] [--output PATH] [--dry-run]
"""

import argparse
import base64
import hashlib
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


def blake3_hex(data: bytes) -> str:
    """Compute a BLAKE3-256 hash, falling back to SHA-256 if blake3 is not installed."""
    try:
        import blake3

        # Prefix the digest with its algorithm so consumers can tell the two
        # variants apart; the SHA-256 fallback below does the same.
        return "blake3:" + blake3.blake3(data).hexdigest()
    except ImportError:
        return "sha256:" + hashlib.sha256(data).hexdigest()


def sha256_hex(data: bytes) -> str:
    """Compute a SHA-256 hash."""
    return hashlib.sha256(data).hexdigest()


def canonical_json(obj: Any) -> str:
    """Serialize an object to canonical JSON (sorted keys, no extra whitespace) for hashing."""
    return json.dumps(obj, sort_keys=True, separators=(',', ':'))


def canonical_json_pretty(obj: Any) -> str:
    """Serialize an object to canonical JSON with indentation for readability."""
    return json.dumps(obj, sort_keys=True, indent=2)
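
# Note: populate_finding() hashes the pretty serialization exactly as written
# to disk, so evidence hashes stay reproducible only while the sort_keys and
# indent settings above are unchanged.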


def load_reachbench_index(fixtures_path: Path) -> dict:
    """Load the reachbench INDEX.json."""
    index_path = fixtures_path / "INDEX.json"
    if not index_path.exists():
        raise FileNotFoundError(f"Reachbench INDEX not found: {index_path}")
    with open(index_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_ground_truth(case_path: Path, variant: str) -> dict | None:
    """Load a variant's reachgraph.truth.json, or return None if it is absent."""
    truth_path = case_path / "images" / variant / "reachgraph.truth.json"
    if not truth_path.exists():
        return None
    with open(truth_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def create_openvex_decision(
    cve_id: str,
    purl: str,
    status: str,  # "not_affected" or "affected"
    justification: str | None,
    evidence_hash: str,
    timestamp: str
) -> dict:
    """Create an OpenVEX decision document."""
    doc = {
        "@context": "https://openvex.dev/ns/v0.2.0",
        "@type": "VEX",
        "author": "StellaOps Bench Automation",
        "role": "security_team",
        "timestamp": timestamp,
        "version": 1,
        "tooling": "StellaOps/bench-auto@1.0.0",
        "statements": [
            {
                "vulnerability": {
                    "@id": f"https://nvd.nist.gov/vuln/detail/{cve_id}",
                    "name": cve_id,
                },
                "products": [
                    {"@id": purl}
                ],
                "status": status,
            }
        ]
    }
    stmt = doc["statements"][0]
    # OpenVEX expects a justification only on not_affected statements.
    if justification and status == "not_affected":
        stmt["justification"] = justification
    # Affected statements carry an action_statement instead.
    if status == "affected":
        stmt["action_statement"] = "Upgrade to patched version or apply mitigation."
    # Link the statement back to the reachability evidence by hash.
    stmt["impact_statement"] = f"Evidence hash: {evidence_hash}"
    return doc
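
# A "reachable" variant thus yields a statement with status "affected" and an
# action_statement; an "unreachable" variant yields "not_affected" with
# justification "vulnerable_code_not_present" (see populate_finding below).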


def create_dsse_envelope_stub(payload: dict, payload_type: str = "application/vnd.openvex+json") -> dict:
    """Create a DSSE envelope stub (signature placeholder for actual signing)."""
    payload_json = canonical_json(payload)
    payload_b64 = base64.b64encode(payload_json.encode()).decode()
    return {
        "payloadType": payload_type,
        "payload": payload_b64,
        "signatures": [
            {
                "keyid": "stella.ops/bench-automation@v1",
                "sig": "PLACEHOLDER_SIGNATURE_REQUIRES_ACTUAL_SIGNING"
            }
        ]
    }
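
# Note: a real DSSE signature is computed over the spec's pre-authentication
# encoding (PAE) of payloadType and payload, not over the base64 string in the
# envelope. This stub only fixes the envelope shape; signing is assumed to
# happen out-of-band before the envelope is submitted to Rekor.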


def create_metadata(
    cve_id: str,
    purl: str,
    variant: str,
    case_id: str,
    ground_truth: dict | None,
    timestamp: str
) -> dict:
    """Create metadata.json for a finding."""
    return {
        "cve_id": cve_id,
        "purl": purl,
        "case_id": case_id,
        "variant": variant,
        "reachability_status": "reachable" if variant == "reachable" else "unreachable",
        "ground_truth_schema": ground_truth.get("schema_version") if ground_truth else None,
        "generated_at": timestamp,
        "generator": "scripts/bench/populate-findings.py",
        "generator_version": "1.0.0"
    }


def extract_cve_id(case_id: str) -> str:
    """Extract a CVE ID from case_id, or generate a placeholder."""
    # Well-known fixture names map to their headline CVEs: log4j -> Log4Shell,
    # curl -> SOCKS5 heap overflow, kestrel -> HTTP/2 Rapid Reset,
    # spring -> Spring4Shell, openssl -> punycode overflow,
    # glibc -> getaddrinfo overflow.
    cve_mapping = {
        "log4j": "CVE-2021-44228",
        "curl": "CVE-2023-38545",
        "kestrel": "CVE-2023-44487",
        "spring": "CVE-2022-22965",
        "openssl": "CVE-2022-3602",
        "glibc": "CVE-2015-7547",
    }
    for key, cve in cve_mapping.items():
        if key in case_id.lower():
            return cve
    # Fall back to a clearly synthetic placeholder for unknown cases.
    return f"CVE-BENCH-{case_id.upper()[:8]}"


def extract_purl(case_id: str, case_data: dict) -> str:
    """Extract or generate a purl from case data."""
    # Use an explicit purl from the case metadata if available
    if "purl" in case_data:
        return case_data["purl"]
    # Otherwise synthesize one from the case's language and version
    lang = case_data.get("language", "unknown")
    version = case_data.get("version", "1.0.0")
    pkg_type_map = {
        "java": "maven",
        "dotnet": "nuget",
        "go": "golang",
        "python": "pypi",
        "rust": "cargo",
        "native": "generic",
    }
    pkg_type = pkg_type_map.get(lang, "generic")
    return f"pkg:{pkg_type}/{case_id}@{version}"


def populate_finding(
    case_id: str,
    case_data: dict,
    case_path: Path,
    output_dir: Path,
    timestamp: str,
    dry_run: bool
) -> dict:
    """Populate a single CVE finding bundle."""
    cve_id = extract_cve_id(case_id)
    purl = extract_purl(case_id, case_data)
    results = {
        "case_id": case_id,
        "cve_id": cve_id,
        "variants_processed": [],
        "errors": []
    }
    for variant in ["reachable", "unreachable"]:
        variant_path = case_path / "images" / variant
        if not variant_path.exists():
            continue
        ground_truth = load_ground_truth(case_path, variant)
        # Determine VEX status based on variant
        if variant == "reachable":
            vex_status = "affected"
            justification = None
        else:
            vex_status = "not_affected"
            justification = "vulnerable_code_not_present"
        # Create finding directory
        finding_id = f"{cve_id}-{variant}"
        finding_dir = output_dir / finding_id
        evidence_dir = finding_dir / "evidence"
        if not dry_run:
            finding_dir.mkdir(parents=True, exist_ok=True)
            evidence_dir.mkdir(parents=True, exist_ok=True)
        # Create reachability evidence excerpt
        evidence = {
            "schema_version": "richgraph-excerpt/v1",
            "case_id": case_id,
            "variant": variant,
            "ground_truth": ground_truth,
            "paths": ground_truth.get("paths", []) if ground_truth else [],
            "generated_at": timestamp
        }
        evidence_json = canonical_json_pretty(evidence)
        evidence_hash = blake3_hex(evidence_json.encode())
        if not dry_run:
            with open(evidence_dir / "reachability.json", 'w', encoding='utf-8') as f:
                f.write(evidence_json)
        # Create SBOM excerpt
        sbom = {
            "bomFormat": "CycloneDX",
            "specVersion": "1.6",
            "version": 1,
            "metadata": {
                "timestamp": timestamp,
                "tools": [{"vendor": "StellaOps", "name": "bench-auto", "version": "1.0.0"}]
            },
            "components": [
                {
                    "type": "library",
                    "purl": purl,
                    "name": case_id,
                    "version": case_data.get("version", "1.0.0")
                }
            ]
        }
        if not dry_run:
            with open(evidence_dir / "sbom.cdx.json", 'w', encoding='utf-8') as f:
                json.dump(sbom, f, indent=2, sort_keys=True)
        # Create OpenVEX decision
        openvex = create_openvex_decision(
            cve_id=cve_id,
            purl=purl,
            status=vex_status,
            justification=justification,
            evidence_hash=evidence_hash,
            timestamp=timestamp
        )
        if not dry_run:
            with open(finding_dir / "decision.openvex.json", 'w', encoding='utf-8') as f:
                json.dump(openvex, f, indent=2, sort_keys=True)
        # Create DSSE envelope stub
        dsse = create_dsse_envelope_stub(openvex)
        if not dry_run:
            with open(finding_dir / "decision.dsse.json", 'w', encoding='utf-8') as f:
                json.dump(dsse, f, indent=2, sort_keys=True)
        # Create Rekor placeholder
        if not dry_run:
            with open(finding_dir / "rekor.txt", 'w', encoding='utf-8') as f:
                f.write("# Rekor log entry placeholder\n")
                f.write("# Submit DSSE envelope to Rekor to populate this file\n")
                f.write("log_index: PENDING\n")
                f.write("uuid: PENDING\n")
                f.write(f"timestamp: {timestamp}\n")
        # Create metadata
        metadata = create_metadata(
            cve_id=cve_id,
            purl=purl,
            variant=variant,
            case_id=case_id,
            ground_truth=ground_truth,
            timestamp=timestamp
        )
        if not dry_run:
            with open(finding_dir / "metadata.json", 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, sort_keys=True)
        results["variants_processed"].append({
            "variant": variant,
            "finding_id": finding_id,
            "vex_status": vex_status,
            "evidence_hash": evidence_hash
        })
    return results
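
# Re-running the script overwrites findings in place: directories are created
# with exist_ok=True and every file is rewritten, so stale bundles are replaced
# rather than accumulated. Note that generated_at embeds the run timestamp, so
# evidence hashes differ between runs.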


def main():
    parser = argparse.ArgumentParser(
        description="Populate bench/findings/** from reachbench fixtures"
    )
    parser.add_argument(
        "--fixtures",
        type=Path,
        default=Path("tests/reachability/fixtures/reachbench-2025-expanded"),
        help="Path to reachbench fixtures directory"
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("bench/findings"),
        help="Output directory for findings"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be created without writing files"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Limit number of cases to process (0 = all)"
    )
    args = parser.parse_args()

    # Resolve paths relative to the repo root (scripts/bench/ is two levels down)
    repo_root = Path(__file__).parent.parent.parent
    fixtures_path = args.fixtures if args.fixtures.is_absolute() else repo_root / args.fixtures
    output_path = args.output if args.output.is_absolute() else repo_root / args.output
    print(f"Fixtures path: {fixtures_path}")
    print(f"Output path: {output_path}")
    print(f"Dry run: {args.dry_run}")

    # Load reachbench index
    try:
        index = load_reachbench_index(fixtures_path)
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    # Capture one timestamp per run so all generated artifacts agree
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    cases = index.get("cases", [])
    if args.limit > 0:
        cases = cases[:args.limit]
    print(f"Processing {len(cases)} cases...")

    all_results = []
    for case in cases:
        case_id = case["id"]
        case_path_rel = case.get("path", f"cases/{case_id}")
        case_path = fixtures_path / case_path_rel
        if not case_path.exists():
            print(f"  Warning: Case path not found: {case_path}")
            continue
        print(f"  Processing: {case_id}")
        result = populate_finding(
            case_id=case_id,
            case_data=case,
            case_path=case_path,
            output_dir=output_path,
            timestamp=timestamp,
            dry_run=args.dry_run
        )
        all_results.append(result)
        for v in result["variants_processed"]:
            print(f"    - {v['finding_id']}: {v['vex_status']}")

    # Summary
    total_findings = sum(len(r["variants_processed"]) for r in all_results)
    print(f"\nGenerated {total_findings} findings from {len(all_results)} cases")
    if args.dry_run:
        print("(dry-run mode - no files written)")
    return 0


if __name__ == "__main__":
    sys.exit(main())