up
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
This commit is contained in:
139
samples/graph/scripts/build_explorer_fixture.py
Normal file
139
samples/graph/scripts/build_explorer_fixture.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build vulnerability explorer fixtures (JSON + CSV) from the canonical graph-40k fixture.
|
||||
|
||||
Generates deterministic outputs in `samples/graph/graph-40k/explorer/`:
|
||||
- vuln-explorer.json
|
||||
- vuln-explorer.csv
|
||||
- manifest.json (hashes + counts)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
# Directory layout: this script lives in samples/graph/scripts/, so ROOT is
# samples/graph/ and all fixture paths hang off the graph-40k fixture dir.
ROOT = Path(__file__).resolve().parent.parent
GRAPH_ROOT = ROOT / "graph-40k"
OVERLAY_PATH = GRAPH_ROOT / "overlay.ndjson"  # input: policy overlay NDJSON
OUT_DIR = GRAPH_ROOT / "explorer"  # output directory for explorer fixtures

# Fixed advisory set to keep fixtures stable and small.
# (advisory_id, severity) pairs; they are paired positionally with the first
# overlays in build_records(), so both order and length are significant.
ADVISORIES = [
    ("CVE-2024-0001", "critical"),
    ("CVE-2024-0002", "high"),
    ("CVE-2023-9999", "medium"),
    ("CVE-2025-1234", "low"),
    ("CVE-2022-4242", "none"),
]
|
||||
|
||||
|
||||
def sha256(path: Path) -> str:
    """Return the hex-encoded SHA-256 digest of the file at *path*.

    The file is streamed in 8 KiB chunks so arbitrarily large fixtures can
    be hashed without loading them fully into memory.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while chunk := stream.read(8192):
            digest.update(chunk)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def load_overlays() -> List[dict]:
    """Read overlay records from OVERLAY_PATH (NDJSON), sorted by overlay_id.

    Blank lines are skipped; ordering by overlay_id keeps downstream output
    deterministic regardless of the on-disk line order.
    """
    raw_lines = OVERLAY_PATH.read_text(encoding="utf-8").splitlines()
    overlays = [json.loads(text) for text in raw_lines if text.strip()]
    return sorted(overlays, key=lambda entry: entry["overlay_id"])
|
||||
|
||||
|
||||
def build_records() -> List[dict]:
    """Join the first len(ADVISORIES) overlays with the fixed advisory set.

    Reachability alternates by position (even index => reachable) so the UI
    fixture covers both states; a conflict marker is set whenever a policy
    `deny` verdict coincides with a reachable/affected finding.
    """
    selected = load_overlays()[: len(ADVISORIES)]
    records: List[dict] = []
    for position, (overlay, advisory) in enumerate(zip(selected, ADVISORIES)):
        advisory_id, advisory_sev = advisory
        is_reachable = position % 2 == 0  # alternate reachable/unreachable for UI coverage
        has_conflict = overlay["verdict"] == "deny" and is_reachable
        records.append(
            {
                "component": overlay["node_id"],
                "advisory": advisory_id,
                "advisory_severity": advisory_sev,
                "reachability": "reachable" if is_reachable else "unreachable",
                "status": "affected" if is_reachable else "not_affected",
                "policy_overlay_id": overlay["overlay_id"],
                "policy_verdict": overlay["verdict"],
                "policy_severity": overlay["severity"],
                "policy_rule_id": overlay["rule_id"],
                "evidence": [
                    "sbom:mock-sbom-v1",
                    f"overlay:{overlay['overlay_id']}",
                ],
                "conflict": "policy_deny_vs_scanner_affected" if has_conflict else "",
                "snapshot": overlay["snapshot"],
                "tenant": overlay["tenant"],
            }
        )
    return records
|
||||
|
||||
|
||||
def write_json(records: List[dict], path: Path) -> None:
    """Serialize *records* as pretty-printed, key-sorted JSON at *path*.

    UTF-8 is forced explicitly: the default for ``Path.write_text`` is the
    host locale encoding, which would make the output bytes (and therefore
    the manifest hashes) platform-dependent — this script promises
    deterministic fixtures.
    """
    path.write_text(json.dumps(records, indent=2, sort_keys=True), encoding="utf-8")
|
||||
|
||||
|
||||
def write_csv(records: List[dict], path: Path) -> None:
    """Write *records* to *path* as CSV with a fixed column order.

    The ``evidence`` list is flattened into a single ';'-joined cell so the
    CSV stays one row per record.
    """
    fieldnames = [
        "component",
        "advisory",
        "advisory_severity",
        "reachability",
        "status",
        "policy_overlay_id",
        "policy_verdict",
        "policy_severity",
        "policy_rule_id",
        "evidence",
        "conflict",
        "snapshot",
        "tenant",
    ]
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for record in records:
            writer.writerow({**record, "evidence": ";".join(record["evidence"])})
|
||||
|
||||
|
||||
def write_manifest(json_path: Path, csv_path: Path, count: int, manifest_path: Path) -> None:
    """Write manifest.json recording the advisory ids, record count, and
    SHA-256 hashes of the JSON/CSV fixture files.

    UTF-8 is forced explicitly: ``Path.write_text`` otherwise uses the host
    locale encoding, which would make the manifest bytes platform-dependent
    and break the fixture's determinism guarantee.
    """
    manifest = {
        "fixture": "graph-40k",
        "advisories": [advisory_id for advisory_id, _ in ADVISORIES],
        "count": count,
        "hashes": {
            "vuln-explorer.json": sha256(json_path),
            "vuln-explorer.csv": sha256(csv_path),
        },
    }
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> int:
    """Build all explorer fixture files under OUT_DIR and report the count."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    json_target = OUT_DIR / "vuln-explorer.json"
    csv_target = OUT_DIR / "vuln-explorer.csv"
    manifest_target = OUT_DIR / "manifest.json"

    records = build_records()
    write_json(records, json_target)
    write_csv(records, csv_target)
    write_manifest(json_target, csv_target, len(records), manifest_target)

    print(f"Wrote {len(records)} records to {OUT_DIR}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
179
samples/graph/scripts/generate_canonical.py
Normal file
179
samples/graph/scripts/generate_canonical.py
Normal file
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate canonical SAMPLES-GRAPH-24-003 fixture.
|
||||
|
||||
Outputs:
|
||||
- nodes.ndjson, edges.ndjson, overlay.ndjson
|
||||
- manifest.json with counts and SHA-256 hashes
|
||||
|
||||
Deterministic and offline-only: fixed seed, fixed timestamps, sorted output.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Tuple
|
||||
|
||||
# Fixture identity: fixed tenant/snapshot/timestamp keep the output fully
# deterministic — nothing is derived from the wall clock or environment.
TENANT = "demo-tenant"
SNAPSHOT_ID = "graph-40k-policy-overlay-20251122"
GENERATED_AT = "2025-11-22T00:00:00Z"  # recorded verbatim in manifest.json
DEFAULT_NODE_COUNT = 40_000
SEED = 424_242  # default PRNG seed; overridable via --seed
MAX_FANOUT = 4  # upper bound on DEPENDS_ON edges emitted per node
OVERLAY_INTERVAL = 400  # one overlay per 400 nodes -> ~100 overlays for 40k nodes
OVERLAY_VERDICTS = ("allow", "deny", "defer")
OVERLAY_SEVERITIES = ("none", "low", "medium", "high", "critical")
|
||||
|
||||
|
||||
def sha256(path: Path) -> str:
    """Hex SHA-256 digest of the file at *path*, streamed in 8 KiB blocks."""
    hasher = hashlib.sha256()
    with path.open("rb") as handle:
        while block := handle.read(8192):
            hasher.update(block)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def write_ndjson(path: Path, rows: Iterable[dict]) -> None:
    """Write one canonical JSON object per line to *path*.

    Canonical form = sorted keys + compact separators; newline="\n" pins the
    line ending so hashes match across platforms.
    """
    with path.open("w", encoding="utf-8", newline="\n") as sink:
        for row in rows:
            sink.write(json.dumps(row, sort_keys=True, separators=(",", ":")) + "\n")
|
||||
|
||||
|
||||
def build_nodes(count: int, rng: random.Random) -> List[dict]:
    """Create *count* synthetic pypi component nodes, sorted by id.

    *rng* is accepted for signature symmetry with the other builders but is
    not consulted here — node content is a pure function of the index.
    """
    nodes: List[dict] = []
    for index in range(count):
        patch = index % 5  # cycle patch versions 0..4
        purl = f"pkg:pypi/demo-{index}@1.0.{patch}"
        nodes.append(
            {
                "id": purl,
                "kind": "component",
                "name": f"demo-{index}",
                "purl": purl,
                "tenant": TENANT,
                "version": f"1.0.{patch}",
                "snapshot": SNAPSHOT_ID,
            }
        )
    return sorted(nodes, key=lambda entry: entry["id"])
|
||||
|
||||
|
||||
def build_edges(nodes: List[dict], rng: random.Random) -> List[dict]:
    """Link every node (except the first) to 1..MAX_FANOUT earlier nodes.

    Sampling only from earlier indices keeps the dependency graph acyclic;
    the final (source, target) sort makes output order deterministic. The
    rng calls happen in the same order for a given node list, preserving
    seed-reproducibility.
    """
    edges: List[dict] = []
    for position, node in enumerate(nodes):
        if position == 0:
            continue  # the first node has no earlier nodes to depend on
        fanout = rng.randint(1, min(MAX_FANOUT, position))
        for target_index in rng.sample(range(position), fanout):
            edges.append(
                {
                    "source": node["id"],
                    "target": nodes[target_index]["id"],
                    "kind": "DEPENDS_ON",
                    "provenance": "mock-sbom-v1",
                    "snapshot": SNAPSHOT_ID,
                    "tenant": TENANT,
                }
            )
    edges.sort(key=lambda edge: (edge["source"], edge["target"]))
    return edges
|
||||
|
||||
|
||||
def build_overlays(nodes: List[dict], rng: random.Random) -> List[dict]:
    """Emit one policy.overlay.v1 record for every OVERLAY_INTERVAL-th node.

    Overlay ids are content-addressed as sha256(tenant|node_id|overlay_kind),
    matching the id_scheme advertised in manifest.json.
    """
    overlays: List[dict] = []
    for position, node in enumerate(nodes):
        if position % OVERLAY_INTERVAL != 0:
            continue
        verdict = rng.choice(OVERLAY_VERDICTS)
        severity = rng.choice(OVERLAY_SEVERITIES)
        rule_id = f"RULE-{position:05d}"
        digest_source = f"{TENANT}|{node['id']}|policy.overlay.v1".encode()
        overlays.append(
            {
                "overlay_id": hashlib.sha256(digest_source).hexdigest(),
                "overlay_kind": "policy.overlay.v1",
                "tenant": TENANT,
                "snapshot": SNAPSHOT_ID,
                "node_id": node["id"],
                "verdict": verdict,
                "rule_id": rule_id,
                "severity": severity,
                "explain": f"demo policy decision for {node['name']}",
                # bridge to bench overlay support (optional edge application)
                "source": node["id"],
                "target": f"policy:rule:{rule_id}",
            }
        )
    overlays.sort(key=lambda overlay: overlay["overlay_id"])
    return overlays
|
||||
|
||||
|
||||
def generate(out_dir: Path, node_count: int, seed: int) -> Tuple[Path, Path, Path, Path]:
    """Generate the canonical fixture (NDJSON files + manifest) under *out_dir*.

    Args:
        out_dir: destination directory; created if missing.
        node_count: number of component nodes to synthesize.
        seed: PRNG seed; the same seed always yields byte-identical output.

    Returns:
        (nodes_path, edges_path, overlay_path, manifest_path).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    rng = random.Random(seed)

    # Builders share one rng; call order matters for seed-reproducibility.
    nodes = build_nodes(node_count, rng)
    edges = build_edges(nodes, rng)
    overlays = build_overlays(nodes, rng)

    nodes_path = out_dir / "nodes.ndjson"
    edges_path = out_dir / "edges.ndjson"
    overlay_path = out_dir / "overlay.ndjson"

    write_ndjson(nodes_path, nodes)
    write_ndjson(edges_path, edges)
    write_ndjson(overlay_path, overlays)

    manifest = {
        "snapshot_id": SNAPSHOT_ID,
        "tenant": TENANT,
        "generated_at": GENERATED_AT,
        "seed": seed,
        "counts": {
            "nodes": len(nodes),
            "edges": len(edges),
            "overlays": {"policy.overlay.v1": len(overlays)},
        },
        "hashes": {
            "nodes_ndjson_sha256": sha256(nodes_path),
            "edges_ndjson_sha256": sha256(edges_path),
            "overlay_ndjson_sha256": sha256(overlay_path),
        },
        "overlay": {
            "path": "overlay.ndjson",
            "kind": "policy.overlay.v1",
            "id_scheme": "sha256(tenant|nodeId|overlayKind)",
        },
        "inputs": {"sbom_source": "mock-sbom-v1"},
    }

    manifest_path = out_dir / "manifest.json"
    # Force UTF-8: Path.write_text otherwise uses the host locale encoding,
    # which would make the manifest bytes platform-dependent and contradict
    # the module's "deterministic and offline-only" contract.
    manifest_path.write_text(json.dumps(manifest, indent=2, sort_keys=True), encoding="utf-8")
    return nodes_path, edges_path, overlay_path, manifest_path
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse arguments, generate the fixture, report paths."""
    parser = argparse.ArgumentParser(description="Generate canonical graph fixture (SAMPLES-GRAPH-24-003).")
    parser.add_argument("--out-dir", default="samples/graph/graph-40k", help="Output directory for fixture files")
    parser.add_argument("--nodes", type=int, default=DEFAULT_NODE_COUNT, help="Number of nodes to generate")
    parser.add_argument("--seed", type=int, default=SEED, help="Seed for deterministic generation")
    args = parser.parse_args()

    resolved_out = Path(args.out_dir).resolve()
    nodes_path, edges_path, overlay_path, manifest_path = generate(resolved_out, args.nodes, args.seed)

    print("Generated fixture:")
    print(f" nodes: {nodes_path}")
    print(f" edges: {edges_path}")
    print(f" overlay: {overlay_path}")
    print(f" manifest:{manifest_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user