From 8f54ffa2033dcd9f9558dc7dc15f66fa70615c3c Mon Sep 17 00:00:00 2001 From: StellaOps Bot Date: Sat, 29 Nov 2025 11:37:00 +0200 Subject: [PATCH] up --- bench/reachability-benchmark/README.md | 6 +- .../reachability-benchmark/schemas/README.md | 22 +++ .../schemas/case.schema.yaml | 145 ++++++++++++++++++ .../schemas/entrypoints.schema.yaml | 41 +++++ .../schemas/examples/case.sample.yaml | 44 ++++++ .../schemas/examples/entrypoints.sample.yaml | 17 ++ .../schemas/examples/submission.sample.json | 46 ++++++ .../schemas/examples/truth.sample.json | 37 +++++ .../schemas/submission.schema.json | 104 +++++++++++++ .../schemas/truth.schema.json | 79 ++++++++++ .../tools/requirements.txt | 2 + .../tools/scorer/README.md | 11 ++ .../reachability-benchmark/tools/validate.py | 132 ++++++++++++++++ ...0001_0001_public_reachability_benchmark.md | 3 +- 14 files changed, 685 insertions(+), 4 deletions(-) create mode 100644 bench/reachability-benchmark/schemas/README.md create mode 100644 bench/reachability-benchmark/schemas/case.schema.yaml create mode 100644 bench/reachability-benchmark/schemas/entrypoints.schema.yaml create mode 100644 bench/reachability-benchmark/schemas/examples/case.sample.yaml create mode 100644 bench/reachability-benchmark/schemas/examples/entrypoints.sample.yaml create mode 100644 bench/reachability-benchmark/schemas/examples/submission.sample.json create mode 100644 bench/reachability-benchmark/schemas/examples/truth.sample.json create mode 100644 bench/reachability-benchmark/schemas/submission.schema.json create mode 100644 bench/reachability-benchmark/schemas/truth.schema.json create mode 100644 bench/reachability-benchmark/tools/requirements.txt create mode 100644 bench/reachability-benchmark/tools/scorer/README.md create mode 100644 bench/reachability-benchmark/tools/validate.py diff --git a/bench/reachability-benchmark/README.md b/bench/reachability-benchmark/README.md index 1a4c0b0c6..1a05f8781 100644 --- a/bench/reachability-benchmark/README.md 
+++ b/bench/reachability-benchmark/README.md @@ -27,10 +27,10 @@ Deterministic, reproducible benchmark for reachability analysis tools. ## Quick Start (once populated) ```bash -# validate schemas -npm test ./schemas # or python -m pytest schemas +# schema sanity checks (offline) +python tools/validate.py all schemas/examples -# score a submission +# score a submission (coming in task 513-008) cd tools/scorer ./rb-score --cases ../cases --truth ../benchmark/truth --submission ../benchmark/submissions/sample.json ``` diff --git a/bench/reachability-benchmark/schemas/README.md b/bench/reachability-benchmark/schemas/README.md new file mode 100644 index 000000000..209f19d96 --- /dev/null +++ b/bench/reachability-benchmark/schemas/README.md @@ -0,0 +1,22 @@ +# Schemas + +- `case.schema.yaml` — case descriptor (language, sinks, deterministic build/test, environment, optional inline ground truth summary). +- `entrypoints.schema.yaml` — declared entrypoints grouped by type (`http`, `cli`, `scheduled`, etc.). +- `truth.schema.json` — ground-truth labels + evidence per sink (`reachable`/`unreachable`/`unknown`). +- `submission.schema.json` — submission format (tool/run metadata, per-sink predictions, explanations). +- `examples/` — minimal valid examples for each schema. 
+ +## Validate quickly +```bash +# install minimal deps (offline-friendly, pinned) +python -m pip install -r ../tools/requirements.txt + +# validate individual files +python ../tools/validate.py case examples/case.sample.yaml +python ../tools/validate.py entrypoints examples/entrypoints.sample.yaml +python ../tools/validate.py truth examples/truth.sample.json +python ../tools/validate.py submission examples/submission.sample.json + +# or validate everything in one shot +python ../tools/validate.py all examples +``` diff --git a/bench/reachability-benchmark/schemas/case.schema.yaml b/bench/reachability-benchmark/schemas/case.schema.yaml new file mode 100644 index 000000000..49e42fdd1 --- /dev/null +++ b/bench/reachability-benchmark/schemas/case.schema.yaml @@ -0,0 +1,145 @@ +$schema: "https://json-schema.org/draft/2020-12/schema" +$id: "https://stellaops.org/benchmark/reachability/case.schema.yaml" +title: Reachability Benchmark Case Descriptor +type: object +required: + - id + - language + - project + - version + - sinks + - environment + - build + - test +properties: + id: + type: string + description: Unique, stable case identifier (e.g. 
js-express-blog:001) + pattern: "^[A-Za-z0-9._:-]+$" + language: + type: string + enum: [js, py, java, c] + project: + type: string + description: Short project name + version: + type: string + description: Semantic-ish version of the case contents + pattern: "^\\d+(\\.\\d+){0,2}(-[A-Za-z0-9._-]+)?$" + description: + type: string + repository: + type: string + format: uri + description: Upstream repo (if vendored); optional for in-repo cases + entrypoints: + type: array + items: + type: string + uniqueItems: true + sinks: + type: array + minItems: 1 + items: + type: object + required: [id, path, kind, location] + additionalProperties: false + properties: + id: + type: string + pattern: "^[A-Za-z0-9._:-]+$" + path: + type: string + description: Fully-qualified function/method path for the sink + kind: + type: string + enum: [http, file, crypto, process, deserialization, custom] + location: + type: object + required: [file] + additionalProperties: false + properties: + file: + type: string + line: + type: integer + minimum: 1 + notes: + type: string + environment: + type: object + required: [os_image] + additionalProperties: false + properties: + os_image: + type: string + description: Base image or OS identifier (e.g. 
ubuntu:24.04) + runtime: + type: object + description: Language/runtime versions + additionalProperties: + type: string + compiler: + type: string + source_date_epoch: + type: integer + minimum: 0 + build: + type: object + required: [command, source_date_epoch] + additionalProperties: false + properties: + command: + type: string + description: Deterministic build command (invokes Dockerfile/build.sh) + source_date_epoch: + type: integer + minimum: 0 + env: + type: object + additionalProperties: true + outputs: + type: object + additionalProperties: false + properties: + artifact_path: + type: string + sbom_path: + type: string + coverage_path: + type: string + traces_dir: + type: string + test: + type: object + required: [command] + additionalProperties: false + properties: + command: + type: string + description: Oracle test command producing coverage/traces + expected_coverage: + type: array + items: + type: string + expected_traces: + type: array + items: + type: string + env: + type: object + additionalProperties: true + ground_truth: + type: object + description: Optional inline truth summary (full truth lives in truth files) + additionalProperties: false + properties: + summary: + type: string + evidence_files: + type: array + items: + type: string + notes: + type: string +additionalProperties: false diff --git a/bench/reachability-benchmark/schemas/entrypoints.schema.yaml b/bench/reachability-benchmark/schemas/entrypoints.schema.yaml new file mode 100644 index 000000000..fbd96f810 --- /dev/null +++ b/bench/reachability-benchmark/schemas/entrypoints.schema.yaml @@ -0,0 +1,41 @@ +$schema: "https://json-schema.org/draft/2020-12/schema" +$id: "https://stellaops.org/benchmark/reachability/entrypoints.schema.yaml" +title: Reachability Case Entrypoints +type: object +required: + - case_id + - entries +properties: + case_id: + type: string + entries: + type: object + minProperties: 1 + additionalProperties: false + patternProperties: + "^[a-z][a-z0-9_-]*$": + type: 
array + minItems: 1 + items: + type: object + required: [id] + additionalProperties: false + properties: + id: + type: string + route: + type: string + method: + type: string + command: + type: string + schedule: + type: string + handler: + type: string + env: + type: object + additionalProperties: true + description: + type: string +additionalProperties: false diff --git a/bench/reachability-benchmark/schemas/examples/case.sample.yaml b/bench/reachability-benchmark/schemas/examples/case.sample.yaml new file mode 100644 index 000000000..8c1bfedaa --- /dev/null +++ b/bench/reachability-benchmark/schemas/examples/case.sample.yaml @@ -0,0 +1,44 @@ +id: "js-express-blog:001" +language: js +project: express-blog +version: "1.0.0" +description: Minimal blog API with an unsafe deserializer sink. +repository: "https://example.org/express-blog" +entrypoints: + - "POST /api/posts" +sinks: + - id: "Deserializer::parse" + path: "src/deserializer.js::parse" + kind: deserialization + location: + file: src/deserializer.js + line: 42 + notes: "JSON.parse on user input without guards" +environment: + os_image: "ubuntu:24.04" + runtime: + node: "20.11.0" + source_date_epoch: 1730000000 +build: + command: "./build/build.sh" + source_date_epoch: 1730000000 + outputs: + artifact_path: outputs/binary.tar.gz + sbom_path: outputs/sbom.cdx.json + coverage_path: outputs/coverage.json + traces_dir: outputs/traces + env: + NODE_ENV: production +test: + command: "npm test" + expected_coverage: + - outputs/coverage.json + expected_traces: + - outputs/traces/traces.json + env: + NODE_ENV: test +ground_truth: + summary: "Unit test test_reachable_deserialization hits the sink" + evidence_files: + - truth/truth.yaml + notes: "FEATURE_JSON_ENABLED must be true for reachability" diff --git a/bench/reachability-benchmark/schemas/examples/entrypoints.sample.yaml b/bench/reachability-benchmark/schemas/examples/entrypoints.sample.yaml new file mode 100644 index 000000000..e4d1204f9 --- /dev/null +++ 
b/bench/reachability-benchmark/schemas/examples/entrypoints.sample.yaml @@ -0,0 +1,17 @@ +case_id: "js-express-blog:001" +entries: + http: + - id: "POST /api/posts" + route: "/api/posts" + method: "POST" + handler: "PostsController.create" + description: "Create a new post (hits deserializer)" + cli: + - id: "generate-report" + command: "node cli.js generate-report" + description: "Generates a report from posts" + scheduled: + - id: "daily-cleanup" + schedule: "0 3 * * *" + handler: "CleanupJob.run" + description: "Archives soft-deleted posts nightly" diff --git a/bench/reachability-benchmark/schemas/examples/submission.sample.json b/bench/reachability-benchmark/schemas/examples/submission.sample.json new file mode 100644 index 000000000..6e8f9af32 --- /dev/null +++ b/bench/reachability-benchmark/schemas/examples/submission.sample.json @@ -0,0 +1,46 @@ +{ + "version": "1.0.0", + "tool": { + "name": "sample-tool", + "version": "0.1.0" + }, + "run": { + "commit": "abcd1234", + "platform": "ubuntu:24.04", + "time_s": 182.4, + "peak_mb": 3072 + }, + "cases": [ + { + "case_id": "js-express-blog:001", + "sinks": [ + { + "sink_id": "Deserializer::parse", + "prediction": "reachable", + "confidence": 0.88, + "explain": { + "entry": "POST /api/posts", + "path": [ + "PostsController.create", + "PostsService.createFromJson", + "Deserializer.parse" + ], + "guards": [ + "process.env.FEATURE_JSON_ENABLED === 'true'" + ] + }, + "notes": "Observed via dynamic trace" + } + ] + } + ], + "artifacts": { + "sbom": "sha256:deadbeef", + "attestation": "sha256:cafebabe" + }, + "submitter": { + "name": "Example Corp", + "organization": "Example", + "contact": "bench@example.org" + } +} diff --git a/bench/reachability-benchmark/schemas/examples/truth.sample.json b/bench/reachability-benchmark/schemas/examples/truth.sample.json new file mode 100644 index 000000000..c7116eb5e --- /dev/null +++ b/bench/reachability-benchmark/schemas/examples/truth.sample.json @@ -0,0 +1,37 @@ +{ + "version": 
"1.0.0", + "cases": [ + { + "case_id": "js-express-blog:001", + "case_version": "1.0.0", + "notes": "Baseline public case", + "sinks": [ + { + "sink_id": "Deserializer::parse", + "label": "reachable", + "confidence": "high", + "dynamic_evidence": { + "covered_by_tests": [ + "tests/test_reachable_deserialization.js::should_reach_sink" + ], + "coverage_files": [ + "outputs/coverage.json" + ] + }, + "static_evidence": { + "call_path": [ + "POST /api/posts", + "PostsController.create", + "PostsService.createFromJson", + "Deserializer.parse" + ] + }, + "config_conditions": [ + "process.env.FEATURE_JSON_ENABLED == 'true'" + ], + "notes": "If FEATURE_JSON_ENABLED=false the path is unreachable" + } + ] + } + ] +} diff --git a/bench/reachability-benchmark/schemas/submission.schema.json b/bench/reachability-benchmark/schemas/submission.schema.json new file mode 100644 index 000000000..aeb169b07 --- /dev/null +++ b/bench/reachability-benchmark/schemas/submission.schema.json @@ -0,0 +1,104 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://stellaops.org/benchmark/reachability/submission.schema.json", + "title": "Reachability Benchmark Submission", + "type": "object", + "required": ["version", "tool", "run", "cases"], + "additionalProperties": false, + "properties": { + "version": { + "type": "string", + "enum": ["1.0.0"], + "description": "Submission schema version" + }, + "tool": { + "type": "object", + "required": ["name", "version"], + "additionalProperties": false, + "properties": { + "name": {"type": "string"}, + "version": {"type": "string"} + } + }, + "run": { + "type": "object", + "required": ["platform"], + "additionalProperties": false, + "description": "Execution metadata for reproducibility", + "properties": { + "commit": {"type": "string"}, + "platform": {"type": "string"}, + "time_s": {"type": "number", "minimum": 0}, + "peak_mb": {"type": "number", "minimum": 0} + } + }, + "cases": { + "type": "array", + "minItems": 1, + 
"items": { + "type": "object", + "required": ["case_id", "sinks"], + "additionalProperties": false, + "properties": { + "case_id": {"type": "string"}, + "sinks": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": ["sink_id", "prediction"], + "additionalProperties": false, + "properties": { + "sink_id": {"type": "string"}, + "prediction": { + "type": "string", + "enum": ["reachable", "unreachable"] + }, + "confidence": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "explain": { + "type": "object", + "additionalProperties": false, + "properties": { + "entry": {"type": "string"}, + "path": { + "type": "array", + "items": {"type": "string"}, + "minItems": 2 + }, + "guards": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true + } + } + }, + "notes": {"type": "string"} + } + } + } + } + } + }, + "artifacts": { + "type": "object", + "additionalProperties": false, + "properties": { + "sbom": {"type": "string"}, + "attestation": {"type": "string"} + } + }, + "submitter": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "organization": {"type": "string"}, + "contact": {"type": "string", "format": "email"} + }, + "additionalProperties": false + } + } +} diff --git a/bench/reachability-benchmark/schemas/truth.schema.json b/bench/reachability-benchmark/schemas/truth.schema.json new file mode 100644 index 000000000..6148bbf49 --- /dev/null +++ b/bench/reachability-benchmark/schemas/truth.schema.json @@ -0,0 +1,79 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://stellaops.org/benchmark/reachability/truth.schema.json", + "title": "Reachability Benchmark Truth Set", + "type": "object", + "required": ["version", "cases"], + "properties": { + "version": {"type": "string", "enum": ["1.0.0"]}, + "cases": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": ["case_id", "sinks"], + "additionalProperties": false, + "properties": { 
+ "case_id": {"type": "string"}, + "case_version": {"type": "string"}, + "notes": {"type": "string"}, + "sinks": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": ["sink_id", "label"], + "additionalProperties": false, + "properties": { + "sink_id": {"type": "string"}, + "label": { + "type": "string", + "enum": ["reachable", "unreachable", "unknown"] + }, + "confidence": { + "type": "string", + "enum": ["high", "medium", "low"], + "default": "high" + }, + "dynamic_evidence": { + "type": "object", + "additionalProperties": false, + "properties": { + "covered_by_tests": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true + }, + "coverage_files": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true + } + } + }, + "static_evidence": { + "type": "object", + "additionalProperties": false, + "properties": { + "call_path": { + "type": "array", + "items": {"type": "string"}, + "minItems": 2 + } + } + }, + "config_conditions": { + "type": "array", + "items": {"type": "string"}, + "uniqueItems": true + }, + "notes": {"type": "string"} + } + } + } + } + } + } + }, + "additionalProperties": false +} diff --git a/bench/reachability-benchmark/tools/requirements.txt b/bench/reachability-benchmark/tools/requirements.txt new file mode 100644 index 000000000..ee5f01df5 --- /dev/null +++ b/bench/reachability-benchmark/tools/requirements.txt @@ -0,0 +1,2 @@ +jsonschema==4.23.0 +PyYAML==6.0.2 diff --git a/bench/reachability-benchmark/tools/scorer/README.md b/bench/reachability-benchmark/tools/scorer/README.md new file mode 100644 index 000000000..23aa5f71e --- /dev/null +++ b/bench/reachability-benchmark/tools/scorer/README.md @@ -0,0 +1,11 @@ +# rb-score (placeholder) + +Planned CLI to score reachability submissions against truth sets. + +Future work (BENCH-SCORER-513-008): +- Validate submission against `schemas/submission.schema.json`. +- Validate truth against `schemas/truth.schema.json`. 
+- Compute precision/recall/F1, explainability score (0-3), runtime stats, determinism rate.
+- Emit JSON report with stable ordering.
+
+For now this folder is a stub; implementation will be added in task 513-008 once schemas stabilize.
diff --git a/bench/reachability-benchmark/tools/validate.py b/bench/reachability-benchmark/tools/validate.py
new file mode 100644
index 000000000..22f8d7ae4
--- /dev/null
+++ b/bench/reachability-benchmark/tools/validate.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""Deterministic schema validator for reachability benchmark assets.
+
+Usage examples:
+  python tools/validate.py case schemas/examples/case.sample.yaml
+  python tools/validate.py truth benchmark/truth/public.json
+  python tools/validate.py all schemas/examples
+
+The script is offline-friendly and relies only on pinned deps from
+`tools/requirements.txt`.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Iterable, Tuple
+
+import yaml
+from jsonschema import Draft202012Validator, FormatChecker
+
+# Benchmark root directory: this script lives in <root>/tools/.
+ROOT = Path(__file__).resolve().parent.parent
+SCHEMAS = {
+    "case": ROOT / "schemas" / "case.schema.yaml",
+    "entrypoints": ROOT / "schemas" / "entrypoints.schema.yaml",
+    "truth": ROOT / "schemas" / "truth.schema.json",
+    "submission": ROOT / "schemas" / "submission.schema.json",
+}
+# File suffixes picked up by the directory scan in "all" mode.
+DATA_SUFFIXES = frozenset({".json", ".yaml", ".yml"})
+
+
+def load_yaml_or_json(path: Path):
+    """Parse ``path`` as YAML for .yaml/.yml suffixes, otherwise as JSON."""
+    text = path.read_text(encoding="utf-8")
+    if path.suffix.lower() in {".yaml", ".yml"}:
+        return yaml.safe_load(text)
+    return json.loads(text)
+
+
+def load_schema(kind: str):
+    """Load the schema file registered for ``kind`` in ``SCHEMAS``."""
+    return load_yaml_or_json(SCHEMAS[kind])
+
+
+def _error_sort_key(err):
+    """Sort key for validation errors that tolerates mixed-type paths.
+
+    Error paths hold property names (str) and array indexes (int); comparing
+    the raw lists raises ``TypeError`` when the two meet at the same depth.
+    Indexes keep numeric order and sort ahead of property names.
+    """
+    path_key = [(0, p, "") if isinstance(p, int) else (1, 0, str(p)) for p in err.path]
+    return path_key, err.message
+
+
+def validate_one(kind: str, payload_path: Path) -> Tuple[bool, Tuple[str, ...]]:
+    """Validate one file against the schema for ``kind``.
+
+    Returns ``(True, ())`` when the document is valid, otherwise
+    ``(False, messages)`` with one message per problem, in a stable order.
+    A payload that cannot be read or parsed is reported as a failure message
+    rather than raising.
+    """
+    schema = load_schema(kind)
+    try:
+        document = load_yaml_or_json(payload_path)
+    except (OSError, ValueError, yaml.YAMLError) as exc:
+        return False, (f"{payload_path}: cannot load document: {exc}",)
+
+    validator = Draft202012Validator(schema, format_checker=FormatChecker())
+    errors = sorted(validator.iter_errors(document), key=_error_sort_key)
+    messages = []
+    for err in errors:
+        # Build the location outside the f-string: reusing the enclosing quote
+        # character inside a replacement field is a SyntaxError before 3.12.
+        location = "/".join(str(p) for p in err.path)
+        messages.append(f"{payload_path}: {location}: {err.message}")
+    return not messages, tuple(messages)
+
+
+def collect_all(directory: Path) -> Iterable[Tuple[str, Path]]:
+    """Yield ``(kind, path)`` for each recognizable data file under ``directory``.
+
+    Only JSON/YAML files are considered. The kind is the first entry of
+    ``mapping`` with a token contained in the lower-cased file stem. Paths are
+    visited in sorted order so output is stable.
+    """
+    mapping = {
+        "case": ("case",),
+        "entrypoints": ("entrypoints", "entrypoint"),
+        "truth": ("truth",),
+        "submission": ("submission",),
+    }
+    for path in sorted(directory.rglob("*")):
+        if not path.is_file() or path.suffix.lower() not in DATA_SUFFIXES:
+            continue
+        stem_lower = path.stem.lower()
+        for kind, tokens in mapping.items():
+            if any(token in stem_lower for token in tokens):
+                yield kind, path
+                break
+
+
+def parse_args():
+    """Parse the command line: a schema kind (or ``all``) plus one or more paths."""
+    parser = argparse.ArgumentParser(description="Validate reachability benchmark files against schemas.")
+    parser.add_argument(
+        "kind",
+        choices=["case", "entrypoints", "truth", "submission", "all"],
+        help="Which schema to validate against or 'all' to auto-detect in a directory",
+    )
+    parser.add_argument(
+        "paths",
+        nargs="+",
+        help="File(s) to validate. If kind=all, provide one or more directories to scan.",
+    )
+    return parser.parse_args()
+
+
+def _report(failures: list[str]) -> int:
+    """Print every failure with a ``FAIL`` prefix and return the exit code."""
+    for msg in failures:
+        print(f"FAIL {msg}")
+    return 1 if failures else 0
+
+
+def main() -> int:
+    """CLI entry point: returns 0 when every file validated, 1 otherwise."""
+    args = parse_args()
+    failures: list[str] = []
+
+    if args.kind == "all":
+        for base in args.paths:
+            base_path = Path(base)
+            if not base_path.exists():
+                failures.append(f"{base}: path not found")
+                continue
+            for kind, path in collect_all(base_path):
+                ok, messages = validate_one(kind, path)
+                if ok:
+                    print(f"OK [{kind}] {path}")
+                else:
+                    failures.extend(messages)
+        return _report(failures)
+
+    # Single schema mode
+    for path_str in args.paths:
+        path = Path(path_str)
+        if not path.exists():
+            failures.append(f"{path}: path not found")
+            continue
+        ok, messages = validate_one(args.kind, path)
+        if ok:
+            print(f"OK [{args.kind}] {path}")
+        else:
+            failures.extend(messages)
+    return _report(failures)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff
--git a/docs/implplan/SPRINT_0513_0001_0001_public_reachability_benchmark.md b/docs/implplan/SPRINT_0513_0001_0001_public_reachability_benchmark.md index ad6b08dae..fa84daeed 100644 --- a/docs/implplan/SPRINT_0513_0001_0001_public_reachability_benchmark.md +++ b/docs/implplan/SPRINT_0513_0001_0001_public_reachability_benchmark.md @@ -27,7 +27,7 @@ | # | Task ID | Status | Key dependency / next step | Owners | Task Definition | | --- | --- | --- | --- | --- | --- | | 1 | BENCH-REPO-513-001 | DONE (2025-11-29) | None; foundational. | Bench Guild · DevOps Guild | Create public repository structure: `benchmark/cases///`, `benchmark/schemas/`, `benchmark/tools/scorer/`, `baselines/`, `ci/`, `website/`. Add LICENSE (Apache-2.0), README, CONTRIBUTING.md. | -| 2 | BENCH-SCHEMA-513-002 | TODO | Depends on 513-001. | Bench Guild | Define and publish schemas: `case.schema.yaml` (component, sink, label, evidence), `entrypoints.schema.yaml`, `truth.schema.yaml`, `submission.schema.json`. Include JSON Schema validation. | +| 2 | BENCH-SCHEMA-513-002 | DONE (2025-11-29) | Depends on 513-001. | Bench Guild | Define and publish schemas: `case.schema.yaml` (component, sink, label, evidence), `entrypoints.schema.yaml`, `truth.schema.json`, `submission.schema.json`. Include JSON Schema validation. | | 3 | BENCH-CASES-JS-513-003 | TODO | Depends on 513-002. | Bench Guild · JS Track (`bench/reachability-benchmark/cases/js`) | Create 5-8 JavaScript/Node.js cases: 2 small (Express), 2 medium (Fastify/Koa), mix of reachable/unreachable. Include Dockerfiles, package-lock.json, unit test oracles, coverage output. | | 4 | BENCH-CASES-PY-513-004 | TODO | Depends on 513-002. | Bench Guild · Python Track (`bench/reachability-benchmark/cases/py`) | Create 5-8 Python cases: Flask, Django, FastAPI. Include requirements.txt pinned, pytest oracles, coverage.py output. | | 5 | BENCH-CASES-JAVA-513-005 | TODO | Depends on 513-002. 
| Bench Guild · Java Track (`bench/reachability-benchmark/cases/java`) | Create 5-8 Java cases: Spring Boot, Micronaut. Include pom.xml locked, JUnit oracles, JaCoCo coverage. | @@ -84,3 +84,4 @@ | --- | --- | --- | | 2025-11-27 | Sprint created from product advisory `24-Nov-2025 - Designing a Deterministic Reachability Benchmark.md`; 17 tasks defined across 5 waves. | Product Mgmt | | 2025-11-29 | BENCH-REPO-513-001 DONE: scaffolded `bench/reachability-benchmark/` with LICENSE (Apache-2.0), NOTICE, README, CONTRIBUTING, .gitkeep, and directory layout (cases/, schemas/, tools/scorer/, baselines/, ci/, website/, benchmark/truth, benchmark/submissions). | Implementer | +| 2025-11-29 | BENCH-SCHEMA-513-002 DONE: expanded schemas (case/entrypoints/truth/submission), added examples + offline validator `tools/validate.py`, and pinned requirements for deterministic validation. | Implementer |