Add receipt input JSON and SHA256 hash for CVSS policy scoring tests
- Introduced a new JSON fixture `receipt-input.json` containing base, environmental, and threat metrics for CVSS scoring. - Added corresponding SHA256 hash file `receipt-input.sha256` to ensure integrity of the JSON fixture.
This commit is contained in:
@@ -0,0 +1 @@
|
||||
9e58bdee6304e52539eabdcb46563dd6d71af71659e1c825bb4ee378f18a60ff rules.yaml
|
||||
11
bench/reachability-benchmark/benchmark/CHANGELOG.md
Normal file
11
bench/reachability-benchmark/benchmark/CHANGELOG.md
Normal file
@@ -0,0 +1,11 @@
|
||||
# Reachability Benchmark Changelog
|
||||
|
||||
## 1.0.1 · 2025-12-03
|
||||
- Added manifest schema + sample manifest with hashes, SBOM/attestation entries, and sandbox/redaction metadata.
|
||||
- Added coverage/trace schemas and extended validator to cover them.
|
||||
- Introduced `tools/verify_manifest.py` and deterministic offline kit packaging script.
|
||||
- Added per-language determinism env templates and dataset safety checklist.
|
||||
- Populated SBOM + attestation outputs for JS/PY/C tracks; Java remains blocked on JDK availability.
|
||||
|
||||
## 1.0.0 · 2025-12-01
|
||||
- Initial public dataset, scorer, baselines, and website.
|
||||
@@ -0,0 +1,15 @@
|
||||
# Dataset Safety & Provenance Checklist (RD1–RD10)
|
||||
|
||||
Version: 1.0.1 · Date: 2025-12-03
|
||||
|
||||
- [x] PII/secret scrub: no tokens/URLs; build/test logs redacted. Attested by DSSE when signing manifest.
|
||||
- [x] License compatibility: all cases authored in-repo under Apache-2.0; no third-party snippets are included. NOTICE is up to date.
|
||||
- [x] Feed/tool lockfile: manifest.sample.json pins hashes for schemas, scorer, builder, and baseline submissions (when present).
|
||||
- [x] Published schemas/validators: truth/submission/coverage/trace + manifest schemas; validated via `tools/validate.py` and `tools/verify_manifest.py`.
|
||||
- [x] Evidence bundles: coverage + traces + attestation + sbom recorded per case (sample manifest).
|
||||
- [x] Binary case recipe: `cases/**/build/build.sh` pins `SOURCE_DATE_EPOCH`; env templates live under `benchmark/templates/determinism/`.
|
||||
- [x] Determinism CI: `ci/run-ci.sh` + `tools/verify_manifest.py` run twice to compare hashes; Java track still blocked on JDK availability.
|
||||
- [x] Signed baselines: baseline submissions may include DSSE path in manifest (not required for sample kit); rulepack hashes recorded separately.
|
||||
- [x] Submission policy: CLA/DSSE optional in sample; production kits require DSSE envelope recorded in `signatures`.
|
||||
- [x] Semantic versioning & changelog: see `benchmark/CHANGELOG.md`; manifest `version` mirrors dataset release.
|
||||
- [x] Offline kit packaging: `tools/package_offline_kit.sh` produces deterministic tarball with manifest + schemas + tools.
|
||||
92
bench/reachability-benchmark/benchmark/manifest.sample.json
Normal file
92
bench/reachability-benchmark/benchmark/manifest.sample.json
Normal file
@@ -0,0 +1,92 @@
|
||||
{
|
||||
"schemaVersion": "1.0.0",
|
||||
"kitId": "reachability-benchmark:public-v1",
|
||||
"version": "1.0.1",
|
||||
"createdAt": "2025-12-03T00:00:00Z",
|
||||
"sourceDateEpoch": 1730000000,
|
||||
"resourceLimits": {
|
||||
"cpu": "4",
|
||||
"memory": "8Gi"
|
||||
},
|
||||
"cases": [
|
||||
{
|
||||
"id": "js-unsafe-eval:001",
|
||||
"language": "js",
|
||||
"size": "small",
|
||||
"hashes": {
|
||||
"source": { "path": "cases/js/unsafe-eval", "sha256": "69b0d1cbae1e2c9ddc0f4dba8c6db507e1d3a1c5ea0a0a545c6f3e785529c91c" },
|
||||
"case": { "path": "cases/js/unsafe-eval/case.yaml", "sha256": "a858ff509fda65d69df476e870d9646c6a84744010c812f3d23a88576f20cb6b" },
|
||||
"entrypoints": { "path": "cases/js/unsafe-eval/entrypoints.yaml", "sha256": "77829e728d34c9dc5f56c04784c97f619830ad43bd8410acb3d7134f372a49b3" },
|
||||
"binary": { "path": "cases/js/unsafe-eval/outputs/binary.tar.gz", "sha256": "72da19f28c2c36b6666afcc304514b387de20a5de881d5341067481e8418e23e" },
|
||||
"sbom": { "path": "cases/js/unsafe-eval/outputs/sbom.cdx.json", "sha256": "c00ee1e12b1b6a6237e42174b2fe1393bcf575f6605205a2b84366e867b36d5f" },
|
||||
"coverage": { "path": "cases/js/unsafe-eval/outputs/coverage.json", "sha256": "c2cf5af508d33f6ecdc7c0f10200a02a4c0ddeb8e1fc08b55d9bd4a2d6cb926b" },
|
||||
"traces": { "path": "cases/js/unsafe-eval/outputs/traces/traces.json", "sha256": "6e63c78e091cc9d06acdc5966dd9e54593ca6b0b97f502928de278b3f80adbd8" },
|
||||
"attestation": { "path": "cases/js/unsafe-eval/outputs/attestation.json", "sha256": "be3b0971d805f68730a1c4c0f7a4c3c40dfc7a73099a5524c68759fcc1729d7c" },
|
||||
"truth": { "path": "benchmark/truth/js-unsafe-eval.json", "sha256": "ab42f28ed229eb657ffcb36c3a99287436e1822a4c7d395a94de784457a08f62" }
|
||||
},
|
||||
"truth": {
|
||||
"label": "reachable",
|
||||
"confidence": "high",
|
||||
"rationale": "Unit test hits eval sink via POST /api/exec"
|
||||
},
|
||||
"sandbox": { "network": "loopback", "privileges": "rootless" },
|
||||
"redaction": { "pii": false, "policy": "benchmark-default/v1" }
|
||||
},
|
||||
{
|
||||
"id": "py-fastapi-guarded:104",
|
||||
"language": "py",
|
||||
"size": "small",
|
||||
"hashes": {
|
||||
"source": { "path": "cases/py/fastapi-guarded", "sha256": "0869cab10767ac7e7b33c9bbd634f811d98ce5cdeb244769f1a81949438460fb" },
|
||||
"case": { "path": "cases/py/fastapi-guarded/case.yaml", "sha256": "0add8a5f487ebd21ee20ab88b7c6436fe8471f0a54ab8da0e08c8416aa181346" },
|
||||
"entrypoints": { "path": "cases/py/fastapi-guarded/entrypoints.yaml", "sha256": "47c9dd15bf7c5bb8641893a92791d3f7675ed6adba17b251f609335400d29d41" },
|
||||
"binary": { "path": "cases/py/fastapi-guarded/outputs/binary.tar.gz", "sha256": "ca964fef352dc535b63d35b8f8846cc051e10e54cfd8aceef7566f3c94178b76" },
|
||||
"sbom": { "path": "cases/py/fastapi-guarded/outputs/sbom.cdx.json", "sha256": "13999d8f3d4c9bdb70ea54ad1de613be3f893d79bdd1a53f7c9401e6add88cf0" },
|
||||
"coverage": { "path": "cases/py/fastapi-guarded/outputs/coverage.json", "sha256": "07b1f6dccaa02bd4e1c3e2771064fa3c6e06d02843a724151721ea694762c750" },
|
||||
"traces": { "path": "cases/py/fastapi-guarded/outputs/traces/traces.json", "sha256": "4633748b8b428b45e3702f2f8f5b3f4270728078e26bce1e08900ed1d5bb3046" },
|
||||
"attestation": { "path": "cases/py/fastapi-guarded/outputs/attestation.json", "sha256": "257aa5408a5c6ffe0e193a75a2a54597f8c6f61babfe8aaf26bd47340c3086c3" },
|
||||
"truth": { "path": "benchmark/truth/py-fastapi-guarded.json", "sha256": "f8c62abeb00006621feeb010d0e47d248918dffd6d6e20e0f47d74e1b3642760" }
|
||||
},
|
||||
"truth": {
|
||||
"label": "unreachable",
|
||||
"confidence": "high",
|
||||
"rationale": "Feature flag ALLOW_EXEC must be true before sink executes"
|
||||
},
|
||||
"sandbox": { "network": "loopback", "privileges": "rootless" },
|
||||
"redaction": { "pii": false, "policy": "benchmark-default/v1" }
|
||||
},
|
||||
{
|
||||
"id": "c-unsafe-system:001",
|
||||
"language": "c",
|
||||
"size": "small",
|
||||
"hashes": {
|
||||
"source": { "path": "cases/c/unsafe-system", "sha256": "bc39ab3a3e5cb3944a205912ecad8c1ac4b7d15c64b453c9d34a9a5df7fbbbf4" },
|
||||
"case": { "path": "cases/c/unsafe-system/case.yaml", "sha256": "7799a3a629c22ad47197309f44e32aabbc4e6711ef78d606ba57a7a4974787ce" },
|
||||
"entrypoints": { "path": "cases/c/unsafe-system/entrypoints.yaml", "sha256": "06afee8350460c9d15b26ea9d4ea293e8eb3f4b86b3179e19401fa99947e4490" },
|
||||
"binary": { "path": "cases/c/unsafe-system/outputs/binary.tar.gz", "sha256": "62200167bd660bad6d131b21f941acdfebe00e949e353a53c97b6691ac8f0e49" },
|
||||
"sbom": { "path": "cases/c/unsafe-system/outputs/sbom.cdx.json", "sha256": "4c72a213fc4c646f44b4d0be3c23711b120b2a386374ebaa4897e5058980e0f5" },
|
||||
"coverage": { "path": "cases/c/unsafe-system/outputs/coverage.json", "sha256": "03ba8cf09e7e0ed82e9fa8abb48f92355e894fd56e0c0160a504193a6f6ec48a" },
|
||||
"traces": { "path": "cases/c/unsafe-system/outputs/traces/traces.json", "sha256": "f6469e46a57b8a6e8e17c9b8e78168edd6657ea8a5e1e96fe6ab4a0fc88a734e" },
|
||||
"attestation": { "path": "cases/c/unsafe-system/outputs/attestation.json", "sha256": "c3755088182359a45492170fa8a57d826b605176333d109f4f113bc7ccf85f97" },
|
||||
"truth": { "path": "benchmark/truth/c-unsafe-system.json", "sha256": "9a8200c2cf549b3ac8b19b170e9d34df063351879f19f401d8492e280ad08c13" }
|
||||
},
|
||||
"truth": {
|
||||
"label": "reachable",
|
||||
"confidence": "high",
|
||||
"rationale": "Command injection sink reachable via argv -> system()"
|
||||
},
|
||||
"sandbox": { "network": "loopback", "privileges": "rootless" },
|
||||
"redaction": { "pii": false, "policy": "benchmark-default/v1" }
|
||||
}
|
||||
],
|
||||
"artifacts": {
|
||||
"submissionSchema": { "path": "schemas/submission.schema.json", "sha256": "de5bebb2dbcd085d7896f47a16b9d3837a65fb7f816dcf7e587967d5848c50a7" },
|
||||
"scorer": { "path": "tools/scorer/rb_score.py", "sha256": "32d4f69f5d1d4b87902d6c4f020efde703487d526bf7d42b4438cb2499813f7f" },
|
||||
"baselineSubmissions": []
|
||||
},
|
||||
"tools": {
|
||||
"builder": { "path": "tools/build/build_all.py", "sha256": "64a73f3df9b6f2cdaf5cbb33852b8e9bf443f67cf9dff1573fb635a0252bda9a" },
|
||||
"validator": { "path": "tools/validate.py", "sha256": "776009ef0f3691e60cc87df3f0468181ee7a827be1bd0f73c77fdb68d3ed31c0" }
|
||||
},
|
||||
"signatures": []
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
# Deterministic defaults for C cases
|
||||
SOURCE_DATE_EPOCH=1730000000
|
||||
TZ=UTC
|
||||
LANG=C
|
||||
LC_ALL=C
|
||||
@@ -0,0 +1,3 @@
|
||||
# Deterministic defaults for Java cases
|
||||
SOURCE_DATE_EPOCH=1730000000
|
||||
JAVA_TOOL_OPTIONS="-Duser.country=US -Duser.language=en -Duser.timezone=UTC -Dfile.encoding=UTF-8"
|
||||
@@ -0,0 +1,5 @@
|
||||
# Deterministic defaults for JavaScript/Node cases
|
||||
SOURCE_DATE_EPOCH=1730000000
|
||||
NODE_OPTIONS=--no-deprecation
|
||||
NPM_CONFIG_FUND=false
|
||||
NPM_CONFIG_AUDIT=false
|
||||
@@ -0,0 +1,5 @@
|
||||
# Deterministic defaults for Python cases
|
||||
SOURCE_DATE_EPOCH=1730000000
|
||||
PYTHONHASHSEED=1
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
PYTHONUTF8=1
|
||||
@@ -18,13 +18,18 @@ environment:
|
||||
runtime:
|
||||
gcc: "13"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
outputs:
|
||||
artifact_path: outputs/binary.tar.gz
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_path: outputs/traces/traces.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -35,3 +40,9 @@ ground_truth:
|
||||
summary: "Without ALLOW_CMD, the system() sink remains unreachable; with ALLOW_CMD=1, it executes."
|
||||
evidence_files:
|
||||
- "../../../benchmark/truth/c-guarded-system.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
case_id: "c-guarded-system:001"
|
||||
entries:
|
||||
cli:
|
||||
- id: "main"
|
||||
command: "./app"
|
||||
args: ["<user_input>"]
|
||||
description: "system() guarded by ALLOW_CMD flag"
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"bomFormat": "CycloneDX",
|
||||
"components": [],
|
||||
"metadata": {
|
||||
"component": {
|
||||
"name": "guarded-system",
|
||||
"type": "application",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"timestamp": "1970-01-01T00:00:00Z"
|
||||
},
|
||||
"specVersion": "1.5",
|
||||
"version": 1
|
||||
}
|
||||
@@ -18,13 +18,18 @@ environment:
|
||||
runtime:
|
||||
gcc: "13"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
outputs:
|
||||
artifact_path: outputs/binary.tar.gz
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_path: outputs/traces/traces.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -35,3 +40,9 @@ ground_truth:
|
||||
summary: "Calling process_buffer with len>256 drives memcpy with attacker length (reachable)."
|
||||
evidence_files:
|
||||
- "../../../benchmark/truth/c-memcpy-overflow.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
case_id: "c-memcpy-overflow:001"
|
||||
entries:
|
||||
cli:
|
||||
- id: "process_buffer"
|
||||
command: "./app"
|
||||
args: ["<length>"]
|
||||
description: "User length forwarded to memcpy without bounds"
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"bomFormat": "CycloneDX",
|
||||
"components": [],
|
||||
"metadata": {
|
||||
"component": {
|
||||
"name": "memcpy-overflow",
|
||||
"type": "application",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"timestamp": "1970-01-01T00:00:00Z"
|
||||
},
|
||||
"specVersion": "1.5",
|
||||
"version": 1
|
||||
}
|
||||
@@ -18,13 +18,18 @@ environment:
|
||||
runtime:
|
||||
gcc: "13"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
outputs:
|
||||
artifact_path: outputs/binary.tar.gz
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_path: outputs/traces/traces.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -35,3 +40,9 @@ ground_truth:
|
||||
summary: "Running with argument 'echo OK' executes system() with user-controlled payload."
|
||||
evidence_files:
|
||||
- "../../../benchmark/truth/c-unsafe-system.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
case_id: "c-unsafe-system:001"
|
||||
entries:
|
||||
cli:
|
||||
- id: "main"
|
||||
command: "./app"
|
||||
args: ["<user_input>"]
|
||||
description: "Passes argv directly into system()"
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"bomFormat": "CycloneDX",
|
||||
"components": [],
|
||||
"metadata": {
|
||||
"component": {
|
||||
"name": "unsafe-system",
|
||||
"type": "application",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
"timestamp": "1970-01-01T00:00:00Z"
|
||||
},
|
||||
"specVersion": "1.5",
|
||||
"version": 1
|
||||
}
|
||||
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
java: "21"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./build/build.sh"
|
||||
expected_coverage: []
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Deserialization reachable"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/java-spring-deserialize.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
java: "21"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./build/build.sh"
|
||||
expected_coverage: []
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Guard blocks deserialization unless ALLOW_DESER=true"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/java-spring-guarded.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
node: "20.11.0"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Admin exec endpoint reachable and executes eval"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/js-express-eval.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
node: "20.11.0"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Guard prevents sink unless ALLOW_EXEC=true"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/js-express-guarded.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
node: "20.11.0"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Template rendering reachable via POST /api/render"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/js-fastify-template.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
node: "20.11.0"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Guard prevents sink when FEATURE_ENABLE != 1"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/js-guarded-eval.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
node: "20.11.0"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Unit test triggers eval sink with payload {code: '1+2'}"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/js-unsafe-eval.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
python: "3.12"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Template rendering reachable with autoescape off"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/py-django-ssti.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
python: "3.12"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Guard blocks eval unless ALLOW_EXEC=true"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/py-fastapi-guarded.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
python: "3.12"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Template rendering reachable"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/py-flask-template.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
python: "3.12"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Guard blocks eval when FEATURE_ENABLE != 1"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/py-guarded-exec.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -18,6 +18,9 @@ environment:
|
||||
runtime:
|
||||
python: "3.12"
|
||||
source_date_epoch: 1730000000
|
||||
resource_limits:
|
||||
cpu: "2"
|
||||
memory: "4Gi"
|
||||
build:
|
||||
command: "./build/build.sh"
|
||||
source_date_epoch: 1730000000
|
||||
@@ -26,6 +29,7 @@ build:
|
||||
sbom_path: outputs/sbom.cdx.json
|
||||
coverage_path: outputs/coverage.json
|
||||
traces_dir: outputs/traces
|
||||
attestation_path: outputs/attestation.json
|
||||
test:
|
||||
command: "./tests/run-tests.sh"
|
||||
expected_coverage:
|
||||
@@ -36,3 +40,9 @@ ground_truth:
|
||||
summary: "Eval reachable via POST /api/exec"
|
||||
evidence_files:
|
||||
- "../benchmark/truth/py-unsafe-exec.json"
|
||||
sandbox:
|
||||
network: loopback
|
||||
privileges: rootless
|
||||
redaction:
|
||||
pii: false
|
||||
policy: "benchmark-default/v1"
|
||||
|
||||
Binary file not shown.
@@ -8,11 +8,14 @@ This note closes BENCH-GAPS-513-018, DATASET-GAPS-513-019, and REACH-FIXTURE-GAP
|
||||
## What changed
|
||||
- **Benchmark kit manifest + schema**: `benchmark/schemas/benchmark-manifest.schema.json` with signed/hashed entries for cases, truth, baselines, schemas, and tools. Sample at `benchmark/manifest.sample.json`.
|
||||
- **Offline verifier**: `tools/verify_manifest.py` validates the manifest against local files (hashes, required entries, DSSE envelope presence) to keep runs deterministic and tamper-evident.
|
||||
- **Coverage/trace schemas**: `schemas/coverage.schema.json` and `schemas/trace.schema.json` govern oracle outputs referenced by manifest hashes.
|
||||
- **Submission provenance checks**: manifest requires SHA-256 for submission schema, scorer package, and each baseline submission; DSSE path optional but encouraged.
|
||||
- **Determinism env templates**: manifest captures `sourceDateEpoch` and per-tool pinned versions; cases must provide build seeds in case metadata.
|
||||
- **Unreachability oracles**: truth files must include explicit rationale for unreachable cases; manifest enforces presence of `truth` artifact per case.
|
||||
- **Sandbox/redaction guidance**: case metadata must declare `sandbox` and `redaction` policy fields (schema updated) to ensure PII removal and constrained execution.
|
||||
- **Resource normalization**: manifest records build/runtime resource limits (cpu/memory) for repeatable benchmarking.
|
||||
- **Offline kit & checklist**: dataset safety checklist at `benchmark/checklists/dataset-safety.md`; deterministic packaging via `tools/package_offline_kit.sh`.
|
||||
- **Frozen baselines**: Semgrep rulepack hash pinned at `baselines/semgrep/rules.sha256`; manifest supports hashed baseline submissions.
|
||||
|
||||
## How to use
|
||||
```bash
|
||||
|
||||
@@ -53,7 +53,16 @@ This guide explains how to produce a compliant submission for the Stella Ops rea
|
||||
- `submission.json`
|
||||
- Tool version & configuration (README)
|
||||
- Optional logs and runtime metrics
|
||||
- For production submissions, sign `submission.json` with DSSE and record the envelope under `signatures` in the manifest (see `benchmark/manifest.sample.json`).
|
||||
- Do **not** include binaries that require network access or licenses we cannot redistribute.
|
||||
|
||||
## Provenance & Manifest
|
||||
- Reference kit manifest: `benchmark/manifest.sample.json` (schema: `benchmark/schemas/benchmark-manifest.schema.json`).
|
||||
- Validate your bundle offline:
|
||||
```bash
|
||||
python bench/reachability-benchmark/tools/verify_manifest.py bench/reachability-benchmark/benchmark/manifest.sample.json --root bench/reachability-benchmark
|
||||
```
|
||||
- Determinism templates: `benchmark/templates/determinism/*.env` can be sourced by build scripts per language.
|
||||
|
||||
## Support
|
||||
- Open issues in the public repo (once live) or provide a reproducible script that runs fully offline.
|
||||
|
||||
@@ -11,6 +11,8 @@ required:
|
||||
- environment
|
||||
- build
|
||||
- test
|
||||
- sandbox
|
||||
- redaction
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
@@ -53,7 +55,7 @@ properties:
|
||||
description: Fully-qualified function/method path for the sink
|
||||
kind:
|
||||
type: string
|
||||
enum: [http, file, crypto, process, deserialization, custom]
|
||||
enum: [http, file, crypto, process, deserialization, custom, command, memory]
|
||||
location:
|
||||
type: object
|
||||
required: [file]
|
||||
@@ -84,6 +86,14 @@ properties:
|
||||
source_date_epoch:
|
||||
type: integer
|
||||
minimum: 0
|
||||
resource_limits:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
cpu:
|
||||
type: string
|
||||
memory:
|
||||
type: string
|
||||
build:
|
||||
type: object
|
||||
required: [command, source_date_epoch]
|
||||
@@ -110,6 +120,8 @@ properties:
|
||||
type: string
|
||||
traces_dir:
|
||||
type: string
|
||||
attestation_path:
|
||||
type: string
|
||||
test:
|
||||
type: object
|
||||
required: [command]
|
||||
@@ -142,4 +154,22 @@ properties:
|
||||
type: string
|
||||
notes:
|
||||
type: string
|
||||
sandbox:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
network:
|
||||
type: string
|
||||
enum: [none, loopback, local]
|
||||
privileges:
|
||||
type: string
|
||||
enum: [rootless, root]
|
||||
redaction:
|
||||
type: object
|
||||
additionalProperties: false
|
||||
properties:
|
||||
pii:
|
||||
type: boolean
|
||||
policy:
|
||||
type: string
|
||||
additionalProperties: false
|
||||
|
||||
@@ -29,6 +29,10 @@ properties:
|
||||
type: string
|
||||
command:
|
||||
type: string
|
||||
args:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
schedule:
|
||||
type: string
|
||||
handler:
|
||||
|
||||
29
bench/reachability-benchmark/tools/package_offline_kit.sh
Normal file
29
bench/reachability-benchmark/tools/package_offline_kit.sh
Normal file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env bash
# Package the reachability benchmark kit into a .tar.gz and print its SHA-256.
#
# Usage: package_offline_kit.sh [OUTPUT]
#   OUTPUT             Archive path. Defaults to
#                      <kit root>/out/reachability-benchmark-kit.tar.gz.
#                      A relative OUTPUT is interpreted against the caller's
#                      working directory.
#   SOURCE_DATE_EPOCH  Environment override for the mtime stamped on every
#                      archive entry (default: 1730000000).
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUTPUT="${1:-${ROOT}/out/reachability-benchmark-kit.tar.gz}"
SDE="${SOURCE_DATE_EPOCH:-1730000000}"

mkdir -p "$(dirname "${OUTPUT}")"
# Make OUTPUT absolute before changing directory. Otherwise a relative OUTPUT
# would have its parent created under the caller's cwd while tar and sha256sum
# (which run after the cd below) would look for it under ROOT.
OUTPUT="$(cd "$(dirname "${OUTPUT}")" && pwd)/$(basename "${OUTPUT}")"
cd "${ROOT}"

# Deterministic tarball containing schemas, manifest, truth, cases, tools, and docs.
# Entries are sorted by name, stamped with a fixed mtime, and owned by uid/gid 0.
tar --sort=name --mtime="@${SDE}" --owner=0 --group=0 --numeric-owner \
  -czf "${OUTPUT}" \
  benchmark/manifest.sample.json \
  benchmark/CHANGELOG.md \
  benchmark/checklists \
  benchmark/templates/determinism \
  benchmark/schemas/benchmark-manifest.schema.json \
  benchmark/truth \
  schemas \
  tools/verify_manifest.py tools/validate.py tools/requirements.txt \
  cases \
  baselines \
  ci \
  website \
  docs \
  README.md LICENSE NOTICE

# Emit the archive digest so callers can record or compare it.
sha256sum "${OUTPUT}"
|
||||
137
bench/reachability-benchmark/tools/verify_manifest.py
Normal file
137
bench/reachability-benchmark/tools/verify_manifest.py
Normal file
@@ -0,0 +1,137 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Offline validator for reachability benchmark manifests.
|
||||
|
||||
Usage:
|
||||
python bench/reachability-benchmark/tools/verify_manifest.py bench/reachability-benchmark/benchmark/manifest.sample.json --root bench/reachability-benchmark
|
||||
|
||||
Checks performed:
|
||||
- Manifest validates against `benchmark/schemas/benchmark-manifest.schema.json`.
|
||||
- Every hashed path exists relative to --root (or absolute).
|
||||
- SHA-256 of files/directories matches the manifest values.
|
||||
- Optional DSSE envelopes listed under `dsse` are hashed and compared to envelopeDigest
|
||||
when provided.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable
|
||||
|
||||
from jsonschema import Draft202012Validator
|
||||
|
||||
# Benchmark root: two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parents[1]
# JSON Schema that manifests are validated against.
SCHEMA_PATH = ROOT.joinpath("benchmark", "schemas", "benchmark-manifest.schema.json")
|
||||
|
||||
|
||||
def load_manifest(path: Path) -> Dict:
    """Read the UTF-8 encoded JSON document at *path* and return the parsed value."""
    return json.loads(path.read_bytes().decode("utf-8"))
|
||||
|
||||
|
||||
def compute_sha256(target: Path) -> str:
    """Return the hex SHA-256 digest of a file or of a directory tree.

    For a regular file the digest covers the file's bytes.

    For a directory, every non-directory entry found recursively is visited in
    sorted path order; for each one the digest is fed the entry's path relative
    to *target* (UTF-8 encoded, always with ``/`` separators) followed
    immediately by the entry's bytes. The byte stream is unchanged from earlier
    versions on POSIX, so existing manifest hashes remain valid; using ``/``
    separators makes the same tree hash identically on Windows.

    Args:
        target: File or directory to hash.

    Returns:
        Lowercase hexadecimal SHA-256 digest.

    Raises:
        OSError: If *target* (or an entry below it) cannot be read.
    """
    digest = hashlib.sha256()

    def _feed(file_path: Path) -> None:
        # Stream in 1 MiB chunks so large artifacts are not loaded whole.
        with file_path.open("rb") as handle:
            for chunk in iter(lambda: handle.read(1 << 20), b""):
                digest.update(chunk)

    if not target.is_dir():
        _feed(target)
        return digest.hexdigest()

    for child in sorted(target.rglob("*")):
        if child.is_dir():
            continue
        # as_posix() keeps the hashed name independent of the OS separator.
        digest.update(child.relative_to(target).as_posix().encode("utf-8"))
        _feed(child)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def validate_against_schema(manifest: Dict) -> Iterable[str]:
    """Yield one ``schema:<pointer>: <message>`` string per schema violation.

    The schema is loaded from ``SCHEMA_PATH`` and applied with a Draft 2020-12
    validator. ``<pointer>`` is the ``/``-joined path to the offending element,
    or ``<root>`` when the error path is empty.
    """
    schema_text = SCHEMA_PATH.read_text(encoding="utf-8")
    checker = Draft202012Validator(json.loads(schema_text))
    for problem in checker.iter_errors(manifest):
        location = "/".join(map(str, problem.path))
        if not location:
            location = "<root>"
        yield f"schema:{location}: {problem.message}"
|
||||
|
||||
|
||||
def resolve_path(root: Path, path_value: str) -> Path:
    """Return *path_value* as a path, anchored under *root* unless it is absolute."""
    given = Path(path_value)
    return given if given.is_absolute() else root / given
|
||||
|
||||
|
||||
def validate_hashed_path(root: Path, label: str, spec: Dict, envelope_digest: str | None = None) -> Iterable[str]:
    """Check one ``{"path", "sha256"[, "dsse"]}`` manifest entry against disk.

    Args:
        root: Directory that relative paths in *spec* are resolved against.
        label: Prefix identifying the entry in the returned failure strings.
        spec: Manifest entry with a ``path`` and the expected ``sha256``; an
            optional ``dsse`` key names an envelope file to check as well.
        envelope_digest: Expected SHA-256 of the file named by ``spec["dsse"]``.
            It is only compared when both it and ``spec["dsse"]`` are present.

    Returns:
        A list of failure strings (empty on success), each starting with
        ``invalid:``, ``missing:`` or ``mismatch:``.
    """
    # Schema failures are reported but do not stop the run, so a malformed
    # entry can reach this point; report it instead of raising
    # KeyError/AttributeError/TypeError.
    path_value = spec.get("path") if isinstance(spec, dict) else None
    expected_raw = spec.get("sha256") if isinstance(spec, dict) else None
    if not isinstance(path_value, str) or not isinstance(expected_raw, str):
        return [f"invalid:{label}: expected string 'path' and 'sha256' entries"]

    path = resolve_path(root, path_value)
    if not path.exists():
        return [f"missing:{label}:{path}"]

    errors: list[str] = []
    actual = compute_sha256(path)
    expected = expected_raw.lower()
    if actual.lower() != expected:
        errors.append(f"mismatch:{label}:{path}: expected {expected} got {actual}")

    dsse_path = spec.get("dsse")
    if not dsse_path:
        return errors
    if not isinstance(dsse_path, str):
        errors.append(f"invalid:{label}:dsse: expected a string path")
        return errors

    dsse_full = resolve_path(root, dsse_path)
    if not dsse_full.exists():
        errors.append(f"missing:{label}:dsse:{dsse_full}")
        return errors

    dsse_digest = compute_sha256(dsse_full)
    if envelope_digest and envelope_digest.lower() != dsse_digest.lower():
        errors.append(
            f"mismatch:{label}:dsse:{dsse_full}: expected envelopeDigest {envelope_digest} got {dsse_digest}"
        )
    return errors
|
||||
|
||||
|
||||
def validate_cases(root: Path, manifest: Dict) -> Iterable[str]:
    """Yield hash-check failures for every entry under ``manifest["cases"]``.

    Each item of a case's ``hashes`` mapping is checked with
    ``validate_hashed_path`` under the label ``case:<id>:<key>``; a case with
    no ``id`` is labelled ``<unknown>``.
    """
    for entry in manifest.get("cases", []):
        case_id = entry.get("id", "<unknown>")
        per_case: Dict = entry.get("hashes", {})
        for name in per_case:
            yield from validate_hashed_path(root, f"case:{case_id}:{name}", per_case[name])
|
||||
|
||||
|
||||
def validate_artifacts(root: Path, manifest: Dict) -> Iterable[str]:
    """Yield hash-check failures for the manifest's ``artifacts`` section.

    Checks ``submissionSchema`` and ``scorer`` when present, then each entry of
    ``baselineSubmissions``: its ``submission`` spec and, when given, its
    ``dsse`` spec (passed along with the entry's ``envelopeDigest``).
    """
    section = manifest.get("artifacts", {})

    for name in ("submissionSchema", "scorer"):
        if name not in section:
            continue
        yield from validate_hashed_path(root, f"artifacts:{name}", section[name])

    # A missing or null baselineSubmissions value is treated as an empty list.
    baselines = section.get("baselineSubmissions", []) or []
    for entry in baselines:
        tool = entry.get("tool", "?")
        version = entry.get("version", "?")
        tag = f"baseline:{tool}-{version}"
        yield from validate_hashed_path(root, f"{tag}:submission", entry["submission"])
        envelope_spec = entry.get("dsse")
        if not envelope_spec:
            continue
        yield from validate_hashed_path(root, f"{tag}:dsse", envelope_spec, entry.get("envelopeDigest"))
|
||||
|
||||
|
||||
def validate_tools(root: Path, manifest: Dict) -> Iterable[str]:
    """Yield hash-check failures for the ``builder`` and ``validator`` tool entries.

    Entries absent from ``manifest["tools"]`` are skipped.
    """
    section = manifest.get("tools", {})
    for name in ("builder", "validator"):
        if name not in section:
            continue
        yield from validate_hashed_path(root, f"tools:{name}", section[name])
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: validate a manifest and the files it references.

    Runs the schema, case, artifact and tool checks in that order, prints one
    ``FAIL`` line per problem (or a single ``OK`` line), and returns the
    process exit status: ``1`` if anything failed, ``0`` otherwise.
    """
    cli = argparse.ArgumentParser(description="Validate reachability benchmark manifest and artifacts")
    cli.add_argument("manifest", type=Path, help="Path to manifest JSON")
    cli.add_argument("--root", type=Path, default=ROOT, help="Root directory for relative paths")
    options = cli.parse_args()

    manifest = load_manifest(options.manifest)

    # Collect everything before reporting so one run surfaces all problems.
    problems: list[str] = [
        *validate_against_schema(manifest),
        *validate_cases(options.root, manifest),
        *validate_artifacts(options.root, manifest),
        *validate_tools(options.root, manifest),
    ]

    if not problems:
        print(f"OK manifest {options.manifest} validated")
        return 0

    for problem in problems:
        print(f"FAIL {problem}")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user