up

2025-11-29 11:37:00 +02:00
parent 3488b22c0c
commit 8f54ffa203
14 changed files with 685 additions and 4 deletions
--- a/bench/reachability-benchmark/README.md
+++ b/bench/reachability-benchmark/README.md
@@ -27,10 +27,10 @@ Deterministic, reproducible benchmark for reachability analysis tools.

 ## Quick Start (once populated)
 ```bash
-# validate schemas
-npm test ./schemas  # or python -m pytest schemas
+# schema sanity checks (offline)
+python tools/validate.py all schemas/examples

-# score a submission
+# score a submission (coming in task 513-008)
 cd tools/scorer
 ./rb-score --cases ../cases --truth ../benchmark/truth --submission ../benchmark/submissions/sample.json
 ```
--- a/bench/reachability-benchmark/schemas/README.md
+++ b/bench/reachability-benchmark/schemas/README.md
@@ -0,0 +1,22 @@
+# Schemas
+
+- `case.schema.yaml` — case descriptor (language, sinks, deterministic build/test, environment, optional inline ground truth summary).
+- `entrypoints.schema.yaml` — declared entrypoints grouped by type (`http`, `cli`, `scheduled`, etc.).
+- `truth.schema.json` — ground-truth labels + evidence per sink (`reachable`/`unreachable`/`unknown`).
+- `submission.schema.json` — submission format (tool/run metadata, per-sink predictions, explanations).
+- `examples/` — minimal valid examples for each schema.
+
+## Validate quickly
+```bash
+# install minimal deps (offline-friendly, pinned)
+python -m pip install -r ../tools/requirements.txt
+
+# validate individual files
+python ../tools/validate.py case examples/case.sample.yaml
+python ../tools/validate.py entrypoints examples/entrypoints.sample.yaml
+python ../tools/validate.py truth examples/truth.sample.json
+python ../tools/validate.py submission examples/submission.sample.json
+
+# or validate everything in one shot
+python ../tools/validate.py all examples
+```
--- a/bench/reachability-benchmark/schemas/case.schema.yaml
+++ b/bench/reachability-benchmark/schemas/case.schema.yaml
@@ -0,0 +1,145 @@
+$schema: "https://json-schema.org/draft/2020-12/schema"
+$id: "https://stellaops.org/benchmark/reachability/case.schema.yaml"
+title: Reachability Benchmark Case Descriptor
+type: object
+required:
+  - id
+  - language
+  - project
+  - version
+  - sinks
+  - environment
+  - build
+  - test
+properties:
+  id:
+    type: string
+    description: Unique, stable case identifier (e.g. js-express-blog:001)
+    pattern: "^[A-Za-z0-9._:-]+$"
+  language:
+    type: string
+    enum: [js, py, java, c]
+  project:
+    type: string
+    description: Short project name
+  version:
+    type: string
+    description: "Version of the case contents: 1-3 dot-separated numeric parts plus an optional -suffix (e.g. 1.0.0 or 2.1-rc1)"
+    pattern: "^\\d+(\\.\\d+){0,2}(-[A-Za-z0-9._-]+)?$"
+  description:
+    type: string
+  repository:
+    type: string
+    format: uri
+    description: Upstream repo (if vendored); optional for in-repo cases
+  entrypoints:
+    type: array
+    items:
+      type: string
+    uniqueItems: true
+  sinks:
+    type: array
+    minItems: 1
+    items:
+      type: object
+      required: [id, path, kind, location]
+      additionalProperties: false
+      properties:
+        id:
+          type: string
+          pattern: "^[A-Za-z0-9._:-]+$"
+        path:
+          type: string
+          description: Fully-qualified function/method path for the sink
+        kind:
+          type: string
+          enum: [http, file, crypto, process, deserialization, custom]
+        location:
+          type: object
+          required: [file]
+          additionalProperties: false
+          properties:
+            file:
+              type: string
+            line:
+              type: integer
+              minimum: 1
+        notes:
+          type: string
+  environment:
+    type: object
+    required: [os_image]
+    additionalProperties: false
+    properties:
+      os_image:
+        type: string
+        description: Base image or OS identifier (e.g. ubuntu:24.04)
+      runtime:
+        type: object
+        description: Language/runtime versions
+        additionalProperties:
+          type: string
+      compiler:
+        type: string
+      source_date_epoch:
+        type: integer
+        minimum: 0
+  build:
+    type: object
+    required: [command, source_date_epoch]
+    additionalProperties: false
+    properties:
+      command:
+        type: string
+        description: Deterministic build command (invokes Dockerfile/build.sh)
+      source_date_epoch:
+        type: integer
+        minimum: 0
+      env:
+        type: object
+        additionalProperties: true
+      outputs:
+        type: object
+        additionalProperties: false
+        properties:
+          artifact_path:
+            type: string
+          sbom_path:
+            type: string
+          coverage_path:
+            type: string
+          traces_dir:
+            type: string
+  test:
+    type: object
+    required: [command]
+    additionalProperties: false
+    properties:
+      command:
+        type: string
+        description: Oracle test command producing coverage/traces
+      expected_coverage:
+        type: array
+        items:
+          type: string
+      expected_traces:
+        type: array
+        items:
+          type: string
+      env:
+        type: object
+        additionalProperties: true
+  ground_truth:
+    type: object
+    description: Optional inline truth summary (full truth lives in truth files)
+    additionalProperties: false
+    properties:
+      summary:
+        type: string
+      evidence_files:
+        type: array
+        items:
+          type: string
+      notes:
+        type: string
+additionalProperties: false
--- a/bench/reachability-benchmark/schemas/entrypoints.schema.yaml
+++ b/bench/reachability-benchmark/schemas/entrypoints.schema.yaml
@@ -0,0 +1,41 @@
+$schema: "https://json-schema.org/draft/2020-12/schema"
+$id: "https://stellaops.org/benchmark/reachability/entrypoints.schema.yaml"
+title: Reachability Case Entrypoints
+type: object
+required:
+  - case_id
+  - entries
+properties:
+  case_id:
+    type: string
+  entries:
+    type: object
+    minProperties: 1
+    additionalProperties: false
+    patternProperties:
+      "^[a-z][a-z0-9_-]*$":
+        type: array
+        minItems: 1
+        items:
+          type: object
+          required: [id]
+          additionalProperties: false
+          properties:
+            id:
+              type: string
+            route:
+              type: string
+            method:
+              type: string
+            command:
+              type: string
+            schedule:
+              type: string
+            handler:
+              type: string
+            env:
+              type: object
+              additionalProperties: true
+            description:
+              type: string
+additionalProperties: false
--- a/bench/reachability-benchmark/schemas/examples/case.sample.yaml
+++ b/bench/reachability-benchmark/schemas/examples/case.sample.yaml
@@ -0,0 +1,44 @@
+id: "js-express-blog:001"
+language: js
+project: express-blog
+version: "1.0.0"
+description: Minimal blog API with an unsafe deserializer sink.
+repository: "https://example.org/express-blog"
+entrypoints:
+  - "POST /api/posts"
+sinks:
+  - id: "Deserializer::parse"
+    path: "src/deserializer.js::parse"
+    kind: deserialization
+    location:
+      file: src/deserializer.js
+      line: 42
+    notes: "JSON.parse on user input without guards"
+environment:
+  os_image: "ubuntu:24.04"
+  runtime:
+    node: "20.11.0"
+  source_date_epoch: 1730000000
+build:
+  command: "./build/build.sh"
+  source_date_epoch: 1730000000
+  outputs:
+    artifact_path: outputs/binary.tar.gz
+    sbom_path: outputs/sbom.cdx.json
+    coverage_path: outputs/coverage.json
+    traces_dir: outputs/traces
+  env:
+    NODE_ENV: production
+test:
+  command: "npm test"
+  expected_coverage:
+    - outputs/coverage.json
+  expected_traces:
+    - outputs/traces/traces.json
+  env:
+    NODE_ENV: test
+ground_truth:
+  summary: "Unit test test_reachable_deserialization hits the sink"
+  evidence_files:
+    - truth/truth.yaml
+  notes: "FEATURE_JSON_ENABLED must be true for reachability"
--- a/bench/reachability-benchmark/schemas/examples/entrypoints.sample.yaml
+++ b/bench/reachability-benchmark/schemas/examples/entrypoints.sample.yaml
@@ -0,0 +1,17 @@
+case_id: "js-express-blog:001"
+entries:
+  http:
+    - id: "POST /api/posts"
+      route: "/api/posts"
+      method: "POST"
+      handler: "PostsController.create"
+      description: "Create a new post (hits deserializer)"
+  cli:
+    - id: "generate-report"
+      command: "node cli.js generate-report"
+      description: "Generates a report from posts"
+  scheduled:
+    - id: "daily-cleanup"
+      schedule: "0 3 * * *"
+      handler: "CleanupJob.run"
+      description: "Archives soft-deleted posts nightly"
--- a/bench/reachability-benchmark/schemas/examples/submission.sample.json
+++ b/bench/reachability-benchmark/schemas/examples/submission.sample.json
@@ -0,0 +1,46 @@
+{
+  "version": "1.0.0",
+  "tool": {
+    "name": "sample-tool",
+    "version": "0.1.0"
+  },
+  "run": {
+    "commit": "abcd1234",
+    "platform": "ubuntu:24.04",
+    "time_s": 182.4,
+    "peak_mb": 3072
+  },
+  "cases": [
+    {
+      "case_id": "js-express-blog:001",
+      "sinks": [
+        {
+          "sink_id": "Deserializer::parse",
+          "prediction": "reachable",
+          "confidence": 0.88,
+          "explain": {
+            "entry": "POST /api/posts",
+            "path": [
+              "PostsController.create",
+              "PostsService.createFromJson",
+              "Deserializer.parse"
+            ],
+            "guards": [
+              "process.env.FEATURE_JSON_ENABLED === 'true'"
+            ]
+          },
+          "notes": "Observed via dynamic trace"
+        }
+      ]
+    }
+  ],
+  "artifacts": {
+    "sbom": "sha256:deadbeef",
+    "attestation": "sha256:cafebabe"
+  },
+  "submitter": {
+    "name": "Example Corp",
+    "organization": "Example",
+    "contact": "bench@example.org"
+  }
+}
--- a/bench/reachability-benchmark/schemas/examples/truth.sample.json
+++ b/bench/reachability-benchmark/schemas/examples/truth.sample.json
@@ -0,0 +1,37 @@
+{
+  "version": "1.0.0",
+  "cases": [
+    {
+      "case_id": "js-express-blog:001",
+      "case_version": "1.0.0",
+      "notes": "Baseline public case",
+      "sinks": [
+        {
+          "sink_id": "Deserializer::parse",
+          "label": "reachable",
+          "confidence": "high",
+          "dynamic_evidence": {
+            "covered_by_tests": [
+              "tests/test_reachable_deserialization.js::should_reach_sink"
+            ],
+            "coverage_files": [
+              "outputs/coverage.json"
+            ]
+          },
+          "static_evidence": {
+            "call_path": [
+              "POST /api/posts",
+              "PostsController.create",
+              "PostsService.createFromJson",
+              "Deserializer.parse"
+            ]
+          },
+          "config_conditions": [
+            "process.env.FEATURE_JSON_ENABLED == 'true'"
+          ],
+          "notes": "If FEATURE_JSON_ENABLED=false the path is unreachable"
+        }
+      ]
+    }
+  ]
+}
--- a/bench/reachability-benchmark/schemas/submission.schema.json
+++ b/bench/reachability-benchmark/schemas/submission.schema.json
@@ -0,0 +1,104 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://stellaops.org/benchmark/reachability/submission.schema.json",
+  "title": "Reachability Benchmark Submission",
+  "type": "object",
+  "required": ["version", "tool", "run", "cases"],
+  "additionalProperties": false,
+  "properties": {
+    "version": {
+      "type": "string",
+      "enum": ["1.0.0"],
+      "description": "Submission schema version"
+    },
+    "tool": {
+      "type": "object",
+      "required": ["name", "version"],
+      "additionalProperties": false,
+      "properties": {
+        "name": {"type": "string"},
+        "version": {"type": "string"}
+      }
+    },
+    "run": {
+      "type": "object",
+      "required": ["platform"],
+      "additionalProperties": false,
+      "description": "Execution metadata for reproducibility",
+      "properties": {
+        "commit": {"type": "string"},
+        "platform": {"type": "string"},
+        "time_s": {"type": "number", "minimum": 0},
+        "peak_mb": {"type": "number", "minimum": 0}
+      }
+    },
+    "cases": {
+      "type": "array",
+      "minItems": 1,
+      "items": {
+        "type": "object",
+        "required": ["case_id", "sinks"],
+        "additionalProperties": false,
+        "properties": {
+          "case_id": {"type": "string"},
+          "sinks": {
+            "type": "array",
+            "minItems": 1,
+            "items": {
+              "type": "object",
+              "required": ["sink_id", "prediction"],
+              "additionalProperties": false,
+              "properties": {
+                "sink_id": {"type": "string"},
+                "prediction": {
+                  "type": "string",
+                  "enum": ["reachable", "unreachable"]
+                },
+                "confidence": {
+                  "type": "number",
+                  "minimum": 0,
+                  "maximum": 1
+                },
+                "explain": {
+                  "type": "object",
+                  "additionalProperties": false,
+                  "properties": {
+                    "entry": {"type": "string"},
+                    "path": {
+                      "type": "array",
+                      "items": {"type": "string"},
+                      "minItems": 2
+                    },
+                    "guards": {
+                      "type": "array",
+                      "items": {"type": "string"},
+                      "uniqueItems": true
+                    }
+                  }
+                },
+                "notes": {"type": "string"}
+              }
+            }
+          }
+        }
+      }
+    },
+    "artifacts": {
+      "type": "object",
+      "additionalProperties": false,
+      "properties": {
+        "sbom": {"type": "string"},
+        "attestation": {"type": "string"}
+      }
+    },
+    "submitter": {
+      "type": "object",
+      "properties": {
+        "name": {"type": "string"},
+        "organization": {"type": "string"},
+        "contact": {"type": "string", "format": "email"}
+      },
+      "additionalProperties": false
+    }
+  }
+}
--- a/bench/reachability-benchmark/schemas/truth.schema.json
+++ b/bench/reachability-benchmark/schemas/truth.schema.json
@@ -0,0 +1,79 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://stellaops.org/benchmark/reachability/truth.schema.json",
+  "title": "Reachability Benchmark Truth Set",
+  "type": "object",
+  "required": ["version", "cases"],
+  "properties": {
+    "version": {"type": "string", "enum": ["1.0.0"]},
+    "cases": {
+      "type": "array",
+      "minItems": 1,
+      "items": {
+        "type": "object",
+        "required": ["case_id", "sinks"],
+        "additionalProperties": false,
+        "properties": {
+          "case_id": {"type": "string"},
+          "case_version": {"type": "string"},
+          "notes": {"type": "string"},
+          "sinks": {
+            "type": "array",
+            "minItems": 1,
+            "items": {
+              "type": "object",
+              "required": ["sink_id", "label"],
+              "additionalProperties": false,
+              "properties": {
+                "sink_id": {"type": "string"},
+                "label": {
+                  "type": "string",
+                  "enum": ["reachable", "unreachable", "unknown"]
+                },
+                "confidence": {
+                  "type": "string",
+                  "enum": ["high", "medium", "low"],
+                  "default": "high"
+                },
+                "dynamic_evidence": {
+                  "type": "object",
+                  "additionalProperties": false,
+                  "properties": {
+                    "covered_by_tests": {
+                      "type": "array",
+                      "items": {"type": "string"},
+                      "uniqueItems": true
+                    },
+                    "coverage_files": {
+                      "type": "array",
+                      "items": {"type": "string"},
+                      "uniqueItems": true
+                    }
+                  }
+                },
+                "static_evidence": {
+                  "type": "object",
+                  "additionalProperties": false,
+                  "properties": {
+                    "call_path": {
+                      "type": "array",
+                      "items": {"type": "string"},
+                      "minItems": 2
+                    }
+                  }
+                },
+                "config_conditions": {
+                  "type": "array",
+                  "items": {"type": "string"},
+                  "uniqueItems": true
+                },
+                "notes": {"type": "string"}
+              }
+            }
+          }
+        }
+      }
+    }
+  },
+  "additionalProperties": false
+}
--- a/bench/reachability-benchmark/tools/requirements.txt
+++ b/bench/reachability-benchmark/tools/requirements.txt
@@ -0,0 +1,2 @@
+jsonschema==4.23.0
+PyYAML==6.0.2
--- a/bench/reachability-benchmark/tools/scorer/README.md
+++ b/bench/reachability-benchmark/tools/scorer/README.md
@@ -0,0 +1,11 @@
+# rb-score (placeholder)
+
+Planned CLI to score reachability submissions against truth sets.
+
+Future work (BENCH-SCORER-513-008):
+- Validate submission against `schemas/submission.schema.json`.
+- Validate truth against `schemas/truth.schema.json`.
+- Compute precision/recall/F1, explainability score (0-3), runtime stats, determinism rate.
+- Emit JSON report with stable ordering.
+
+For now this folder is a stub; implementation will be added in task 513-008 once schemas stabilize.
--- a/bench/reachability-benchmark/tools/validate.py
+++ b/bench/reachability-benchmark/tools/validate.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+"""Deterministic schema validator for reachability benchmark assets.
+
+Usage examples:
+  python tools/validate.py case schemas/examples/case.sample.yaml
+  python tools/validate.py truth benchmark/truth/public.json
+  python tools/validate.py all schemas/examples
+
+The script is offline-friendly and relies only on pinned deps from
+`tools/requirements.txt`.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Iterable, Tuple
+
+import yaml
+from jsonschema import Draft202012Validator, FormatChecker
+
+# Benchmark root directory: two levels up from this file (tools/validate.py).
+ROOT = Path(__file__).resolve().parents[1]
+# Validation kind -> schema file under <ROOT>/schemas.
+SCHEMAS = {
+    kind: ROOT / "schemas" / filename
+    for kind, filename in (
+        ("case", "case.schema.yaml"),
+        ("entrypoints", "entrypoints.schema.yaml"),
+        ("truth", "truth.schema.json"),
+        ("submission", "submission.schema.json"),
+    )
+}
+
+
+def load_yaml_or_json(path: Path):
+    """Read *path* as UTF-8 and parse it.
+
+    Files with a ``.yaml``/``.yml`` suffix (case-insensitive) go through
+    ``yaml.safe_load``; every other suffix is parsed as JSON.
+    """
+    raw = path.read_text(encoding="utf-8")
+    is_yaml = path.suffix.lower() in {".yaml", ".yml"}
+    return yaml.safe_load(raw) if is_yaml else json.loads(raw)
+
+
+def load_schema(kind: str):
+    """Return the parsed schema registered for *kind* in ``SCHEMAS``.
+
+    Raises ``KeyError`` when *kind* has no entry in ``SCHEMAS``.
+    """
+    return load_yaml_or_json(SCHEMAS[kind])
+
+
+def validate_one(kind: str, payload_path: Path) -> Tuple[bool, Tuple[str, ...]]:
+    """Validate one file against the schema registered for *kind*.
+
+    Returns ``(True, ())`` when the document conforms. Otherwise returns
+    ``(False, messages)`` with one ``"<file>: <location>: <reason>"`` string
+    per violation, ordered by (location, reason) so output is stable between
+    runs. ``<location>`` is the slash-joined path inside the document, or
+    ``<root>`` for top-level violations.
+
+    A payload that cannot be read or parsed is reported as a single failure
+    message instead of raising, so one broken file does not abort a batch.
+    """
+    schema = load_schema(kind)
+    try:
+        document = load_yaml_or_json(payload_path)
+    except (OSError, ValueError, yaml.YAMLError) as exc:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        # Collapse whitespace so multi-line parser errors stay on one line.
+        reason = " ".join(str(exc).split())
+        return False, (f"{payload_path}: <root>: could not load document: {reason}",)
+    validator = Draft202012Validator(schema, format_checker=FormatChecker())
+
+    errors = sorted(validator.iter_errors(document), key=lambda e: (list(e.path), e.message))
+    # Only single quotes inside the replacement fields: reusing the enclosing
+    # double quote is a SyntaxError on interpreters older than Python 3.12.
+    messages = tuple(
+        f"{payload_path}: {'/'.join(str(p) for p in err.path) or '<root>'}: {err.message}"
+        for err in errors
+    )
+    return (not messages), messages
+
+
+def collect_all(directory: Path) -> Iterable[Tuple[str, Path]]:
+    """Yield ``(kind, path)`` for every benchmark data file under *directory*.
+
+    The tree is walked recursively in sorted order. A file is classified by
+    the first kind (in the order case, entrypoints, truth, submission) whose
+    token appears in its lower-cased stem; files matching no token are skipped.
+
+    Only ``.yaml``/``.yml``/``.json`` files are considered, so source files
+    that merely contain a token (``lowercase.js``, ``truth_table.c``) are not
+    handed to the parser. Schema definitions (``*.schema.yaml``,
+    ``*.schema.json``) are skipped as well: they describe payloads rather
+    than being payloads.
+    """
+    mapping = {
+        "case": ("case",),
+        "entrypoints": ("entrypoints", "entrypoint"),
+        "truth": ("truth",),
+        "submission": ("submission",),
+    }
+    data_suffixes = {".yaml", ".yml", ".json"}
+    for path in sorted(directory.rglob("*")):
+        if not path.is_file() or path.suffix.lower() not in data_suffixes:
+            continue
+        stem_lower = path.stem.lower()
+        if stem_lower.endswith(".schema"):
+            continue
+        for kind, tokens in mapping.items():
+            if any(token in stem_lower for token in tokens):
+                yield kind, path
+                break
+
+
+def parse_args():
+    """Parse the command line into a namespace with ``kind`` and ``paths``.
+
+    ``kind`` is one of the four schema names or ``all``; ``paths`` is a
+    non-empty list of the remaining positional arguments.
+    """
+    kinds = ["case", "entrypoints", "truth", "submission", "all"]
+    cli = argparse.ArgumentParser(description="Validate reachability benchmark files against schemas.")
+    cli.add_argument(
+        "kind",
+        choices=kinds,
+        help="Which schema to validate against or 'all' to auto-detect in a directory",
+    )
+    cli.add_argument(
+        "paths",
+        nargs="+",
+        help="File(s) to validate. If kind=all, provide one or more directories to scan.",
+    )
+    return cli.parse_args()
+
+
+def main() -> int:
+    """Validate the files named on the command line and report the outcome.
+
+    With ``kind=all`` each path must be a directory; it is scanned with
+    ``collect_all`` and every detected file is validated against its detected
+    schema. With any other kind each path is validated against that schema.
+
+    Prints ``OK  [kind] path`` for each valid file as it is checked, then one
+    ``FAIL ...`` line per problem after all paths have been processed.
+
+    Returns 0 when there were no failures and 1 otherwise. Missing paths and,
+    for ``kind=all``, paths that are not directories count as failures, so a
+    mistyped argument cannot produce a silent success.
+    """
+    args = parse_args()
+    failures: list[str] = []
+
+    def check(kind: str, path: Path) -> None:
+        # Shared by both modes: print successes now, collect failures for later.
+        ok, messages = validate_one(kind, path)
+        if ok:
+            print(f"OK  [{kind}] {path}")
+        else:
+            failures.extend(messages)
+
+    for raw in args.paths:
+        target = Path(raw)
+        if not target.exists():
+            failures.append(f"{raw}: path not found")
+        elif args.kind != "all":
+            check(args.kind, target)
+        elif target.is_dir():
+            for kind, path in collect_all(target):
+                check(kind, path)
+        else:
+            # Scanning a regular file finds nothing and would look like success.
+            failures.append(f"{raw}: not a directory (kind=all scans directories)")
+
+    for msg in failures:
+        print(f"FAIL {msg}")
+    return 1 if failures else 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())