#!/usr/bin/env python3
# SPDX-License-Identifier: AGPL-3.0-or-later
# BENCH-AUTO-401-019: Automate population of bench/findings/** from reachbench fixtures

"""
Populates bench/findings/** with per-CVE VEX decision bundles derived from
reachbench fixtures, including reachability evidence, SBOM excerpts, and
DSSE envelope stubs.

Usage:
    python scripts/bench/populate-findings.py [--fixtures PATH] [--output PATH] [--dry-run]
"""

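# Each finding bundle written under bench/findings/<CVE-ID>-<variant>/ contains
# (layout derived from populate_finding below):
#   decision.openvex.json       OpenVEX decision document
#   decision.dsse.json          DSSE envelope stub (placeholder signature)
#   rekor.txt                   Rekor transparency-log placeholder
#   metadata.json               Finding metadata
#   evidence/reachability.json  Reachability evidence excerpt
#   evidence/sbom.cdx.json      CycloneDX SBOM excerpt
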
import argparse
import base64
import hashlib
import json
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


def blake3_hex(data: bytes) -> str:
    """Compute a BLAKE3-256 hash, falling back to SHA-256 if blake3 is not installed.

    The result is prefixed with the algorithm name so consumers can tell
    which hash was actually used.
    """
    try:
        import blake3
        return "blake3:" + blake3.blake3(data).hexdigest()
    except ImportError:
        return "sha256:" + hashlib.sha256(data).hexdigest()


def sha256_hex(data: bytes) -> str:
    """Compute a SHA-256 hash."""
    return hashlib.sha256(data).hexdigest()


def canonical_json(obj: Any) -> str:
    """Serialize an object to canonical JSON (sorted keys, no extra whitespace) for hashing."""
    return json.dumps(obj, sort_keys=True, separators=(',', ':'))


def canonical_json_pretty(obj: Any) -> str:
    """Serialize an object to canonical JSON with indentation for readability."""
    return json.dumps(obj, sort_keys=True, indent=2)

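# Example (illustrative): canonical_json({"b": 1, "a": 2}) returns '{"a":2,"b":1}'
# regardless of key insertion order, which keeps evidence hashes reproducible.
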
def load_reachbench_index(fixtures_path: Path) -> dict:
    """Load the reachbench INDEX.json."""
    index_path = fixtures_path / "INDEX.json"
    if not index_path.exists():
        raise FileNotFoundError(f"Reachbench INDEX not found: {index_path}")
    with open(index_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_ground_truth(case_path: Path, variant: str) -> dict | None:
    """Load reachgraph.truth.json (the ground truth) for a variant, if present."""
    truth_path = case_path / "images" / variant / "reachgraph.truth.json"
    if not truth_path.exists():
        return None
    with open(truth_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def create_openvex_decision(
    cve_id: str,
    purl: str,
    status: str,  # "not_affected" or "affected"
    justification: str | None,
    evidence_hash: str,
    timestamp: str
) -> dict:
    """Create an OpenVEX decision document."""
    statement = {
        "@context": "https://openvex.dev/ns/v0.2.0",
        "@type": "VEX",
        "author": "StellaOps Bench Automation",
        "role": "security_team",
        "timestamp": timestamp,
        "version": 1,
        "tooling": "StellaOps/bench-auto@1.0.0",
        "statements": [
            {
                "vulnerability": {
                    "@id": f"https://nvd.nist.gov/vuln/detail/{cve_id}",
                    "name": cve_id,
                },
                "products": [
                    {"@id": purl}
                ],
                "status": status,
            }
        ]
    }

    if justification and status == "not_affected":
        statement["statements"][0]["justification"] = justification

    # Add an action_statement for affected findings
    if status == "affected":
        statement["statements"][0]["action_statement"] = "Upgrade to patched version or apply mitigation."

    # Add the evidence reference
    statement["statements"][0]["impact_statement"] = f"Evidence hash: {evidence_hash}"

    return statement

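# Illustrative statement for a reachable case (abridged; values depend on the fixture):
#   {"vulnerability": {"name": "CVE-2021-44228", ...},
#    "products": [{"@id": "pkg:maven/<case_id>@<version>"}],
#    "status": "affected",
#    "action_statement": "Upgrade to patched version or apply mitigation."}
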
def create_dsse_envelope_stub(payload: dict, payload_type: str = "application/vnd.openvex+json") -> dict:
    """Create a DSSE envelope stub (signature placeholder for actual signing)."""
    payload_json = canonical_json(payload)
    payload_b64 = base64.b64encode(payload_json.encode()).decode()

    return {
        "payloadType": payload_type,
        "payload": payload_b64,
        "signatures": [
            {
                "keyid": "stella.ops/bench-automation@v1",
                "sig": "PLACEHOLDER_SIGNATURE_REQUIRES_ACTUAL_SIGNING"
            }
        ]
    }

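# Note: a real DSSE signature is computed over the PAE ("DSSEv1" pre-authentication
# encoding) of payloadType and payload, not over the raw payload bytes; replacing
# the placeholder therefore requires a DSSE-aware signer.
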
def create_metadata(
    cve_id: str,
    purl: str,
    variant: str,
    case_id: str,
    ground_truth: dict | None,
    timestamp: str
) -> dict:
    """Create metadata.json for a finding."""
    return {
        "cve_id": cve_id,
        "purl": purl,
        "case_id": case_id,
        "variant": variant,
        "reachability_status": "reachable" if variant == "reachable" else "unreachable",
        "ground_truth_schema": ground_truth.get("schema_version") if ground_truth else None,
        "generated_at": timestamp,
        "generator": "scripts/bench/populate-findings.py",
        "generator_version": "1.0.0"
    }


def extract_cve_id(case_id: str) -> str:
    """Extract the CVE ID from case_id, or generate a placeholder."""
    # Common patterns: log4j -> CVE-2021-44228, curl -> CVE-2023-38545, etc.
    cve_mapping = {
        "log4j": "CVE-2021-44228",
        "curl": "CVE-2023-38545",
        "kestrel": "CVE-2023-44487",
        "spring": "CVE-2022-22965",
        "openssl": "CVE-2022-3602",
        "glibc": "CVE-2015-7547",
    }

    for key, cve in cve_mapping.items():
        if key in case_id.lower():
            return cve

    # Generate a placeholder CVE ID for unknown cases
    return f"CVE-BENCH-{case_id.upper()[:8]}"

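# Example (illustrative case id): extract_cve_id("log4j-2021") -> "CVE-2021-44228";
# an unmapped id such as "mylib-case" falls back to "CVE-BENCH-MYLIB-CA".
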
def extract_purl(case_id: str, case_data: dict) -> str:
    """Extract or generate a purl from case data."""
    # Use case metadata if available
    if "purl" in case_data:
        return case_data["purl"]

    # Otherwise generate one from the case language and version
    lang = case_data.get("language", "unknown")
    version = case_data.get("version", "1.0.0")

    pkg_type_map = {
        "java": "maven",
        "dotnet": "nuget",
        "go": "golang",
        "python": "pypi",
        "rust": "cargo",
        "native": "generic",
    }

    pkg_type = pkg_type_map.get(lang, "generic")
    return f"pkg:{pkg_type}/{case_id}@{version}"

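# Example (hypothetical case): a "java" case "acme-parser" at version "2.3.1"
# yields "pkg:maven/acme-parser@2.3.1"; unknown languages fall back to pkg:generic.
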
def populate_finding(
    case_id: str,
    case_data: dict,
    case_path: Path,
    output_dir: Path,
    timestamp: str,
    dry_run: bool
) -> dict:
    """Populate a single CVE finding bundle."""
    cve_id = extract_cve_id(case_id)
    purl = extract_purl(case_id, case_data)

    results = {
        "case_id": case_id,
        "cve_id": cve_id,
        "variants_processed": [],
        "errors": []
    }

    for variant in ["reachable", "unreachable"]:
        variant_path = case_path / "images" / variant
        if not variant_path.exists():
            continue

        ground_truth = load_ground_truth(case_path, variant)

        # Determine the VEX status based on the variant
        if variant == "reachable":
            vex_status = "affected"
            justification = None
        else:
            vex_status = "not_affected"
            justification = "vulnerable_code_not_present"

        # Create the finding directory
        finding_id = f"{cve_id}-{variant}"
        finding_dir = output_dir / finding_id
        evidence_dir = finding_dir / "evidence"

        if not dry_run:
            finding_dir.mkdir(parents=True, exist_ok=True)
            evidence_dir.mkdir(parents=True, exist_ok=True)

        # Create the reachability evidence excerpt
        evidence = {
            "schema_version": "richgraph-excerpt/v1",
            "case_id": case_id,
            "variant": variant,
            "ground_truth": ground_truth,
            "paths": ground_truth.get("paths", []) if ground_truth else [],
            "generated_at": timestamp
        }
        evidence_json = canonical_json_pretty(evidence)
        evidence_hash = blake3_hex(evidence_json.encode())

        if not dry_run:
            with open(evidence_dir / "reachability.json", 'w', encoding='utf-8') as f:
                f.write(evidence_json)

        # Create the SBOM excerpt
        sbom = {
            "bomFormat": "CycloneDX",
            "specVersion": "1.6",
            "version": 1,
            "metadata": {
                "timestamp": timestamp,
                "tools": [{"vendor": "StellaOps", "name": "bench-auto", "version": "1.0.0"}]
            },
            "components": [
                {
                    "type": "library",
                    "purl": purl,
                    "name": case_id,
                    "version": case_data.get("version", "1.0.0")
                }
            ]
        }

        if not dry_run:
            with open(evidence_dir / "sbom.cdx.json", 'w', encoding='utf-8') as f:
                json.dump(sbom, f, indent=2, sort_keys=True)

        # Create the OpenVEX decision
        openvex = create_openvex_decision(
            cve_id=cve_id,
            purl=purl,
            status=vex_status,
            justification=justification,
            evidence_hash=evidence_hash,
            timestamp=timestamp
        )

        if not dry_run:
            with open(finding_dir / "decision.openvex.json", 'w', encoding='utf-8') as f:
                json.dump(openvex, f, indent=2, sort_keys=True)

        # Create the DSSE envelope stub
        dsse = create_dsse_envelope_stub(openvex)

        if not dry_run:
            with open(finding_dir / "decision.dsse.json", 'w', encoding='utf-8') as f:
                json.dump(dsse, f, indent=2, sort_keys=True)

        # Create the Rekor placeholder
        if not dry_run:
            with open(finding_dir / "rekor.txt", 'w', encoding='utf-8') as f:
                f.write("# Rekor log entry placeholder\n")
                f.write("# Submit DSSE envelope to Rekor to populate this file\n")
                f.write("log_index: PENDING\n")
                f.write("uuid: PENDING\n")
                f.write(f"timestamp: {timestamp}\n")

        # Create the metadata
        metadata = create_metadata(
            cve_id=cve_id,
            purl=purl,
            variant=variant,
            case_id=case_id,
            ground_truth=ground_truth,
            timestamp=timestamp
        )

        if not dry_run:
            with open(finding_dir / "metadata.json", 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, sort_keys=True)

        results["variants_processed"].append({
            "variant": variant,
            "finding_id": finding_id,
            "vex_status": vex_status,
            "evidence_hash": evidence_hash
        })

    return results


def main():
    parser = argparse.ArgumentParser(
        description="Populate bench/findings/** from reachbench fixtures"
    )
    parser.add_argument(
        "--fixtures",
        type=Path,
        default=Path("tests/reachability/fixtures/reachbench-2025-expanded"),
        help="Path to reachbench fixtures directory"
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("bench/findings"),
        help="Output directory for findings"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be created without writing files"
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=0,
        help="Limit number of cases to process (0 = all)"
    )

    args = parser.parse_args()

    # Resolve paths relative to the repo root
    repo_root = Path(__file__).parent.parent.parent
    fixtures_path = args.fixtures if args.fixtures.is_absolute() else repo_root / args.fixtures
    output_path = args.output if args.output.is_absolute() else repo_root / args.output

    print(f"Fixtures path: {fixtures_path}")
    print(f"Output path: {output_path}")
    print(f"Dry run: {args.dry_run}")

    # Load the reachbench index
    try:
        index = load_reachbench_index(fixtures_path)
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    cases = index.get("cases", [])
    if args.limit > 0:
        cases = cases[:args.limit]

    print(f"Processing {len(cases)} cases...")

    all_results = []
    for case in cases:
        case_id = case["id"]
        case_path_rel = case.get("path", f"cases/{case_id}")
        case_path = fixtures_path / case_path_rel

        if not case_path.exists():
            print(f"  Warning: Case path not found: {case_path}")
            continue

        print(f"  Processing: {case_id}")
        result = populate_finding(
            case_id=case_id,
            case_data=case,
            case_path=case_path,
            output_dir=output_path,
            timestamp=timestamp,
            dry_run=args.dry_run
        )
        all_results.append(result)

        for v in result["variants_processed"]:
            print(f"    - {v['finding_id']}: {v['vex_status']}")

    # Summary
    total_findings = sum(len(r["variants_processed"]) for r in all_results)
    print(f"\nGenerated {total_findings} findings from {len(all_results)} cases")

    if args.dry_run:
        print("(dry-run mode - no files written)")

    return 0


if __name__ == "__main__":
    sys.exit(main())
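# Example invocation (illustrative):
#   python scripts/bench/populate-findings.py --dry-run --limit 5
# prints the findings that would be generated for the first five cases
# without writing any files.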