bench/reachability-benchmark/baselines/codeql/README.md (new file, 25 lines)
@@ -0,0 +1,25 @@
# CodeQL baseline

Deterministic baseline runner that emits a benchmark submission for one or more cases using CodeQL when available. If CodeQL is not installed, it still produces a schema-valid submission marking all sinks as `unreachable`, so CI and comparisons remain stable.

## Usage

```bash
# One case
baselines/codeql/run_case.sh cases/js/unsafe-eval /tmp/codeql-out

# All cases under a root
baselines/codeql/run_all.sh cases /tmp/codeql-all
```

Outputs:
- Per-case: `<out>/submission.json`
- All cases: `<out>/submission.json` (merged, deterministic ordering)

## Determinism posture
- No network access; all inputs are local files.
- Stable ordering of cases and sinks.
- If CodeQL is missing or analysis fails, the runner falls back to a deterministic "all unreachable" submission.

## Requirements
- Python 3.11+.
- Optional: `codeql` CLI on PATH for real analysis (not required for offline deterministic fallback).
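For reference, a sketch of the fallback submission shape, assembled from `normalize.py` below (the `case_id` and `sink_id` are taken from the `c-unsafe-system` case in this commit, and `codeql-missing` is the version string `run_case.sh` substitutes when the CLI is absent; treat the concrete values as illustrative):

```json
{
  "version": "1.0.0",
  "tool": {"name": "codeql", "version": "codeql-missing"},
  "run": {"platform": "codeql-baseline-offline"},
  "cases": [
    {
      "case_id": "c-unsafe-system:001",
      "case_version": "1.0.0",
      "sinks": [
        {
          "sink_id": "UnsafeSystem::main",
          "prediction": "unreachable",
          "notes": "CodeQL baseline fallback (no findings)"
        }
      ]
    }
  ]
}
```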
bench/reachability-benchmark/baselines/codeql/normalize.py (new file, 74 lines)
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""
Normalize CodeQL SARIF (or empty results) into the benchmark submission schema.
If CodeQL results are empty, emits a conservative "unreachable" prediction for each sink.
"""
import argparse
import json
import pathlib
from typing import Any, Dict, List


def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
    import yaml
    return yaml.safe_load(case_path.read_text())


def load_codeql_results(path: pathlib.Path) -> Dict[str, Any]:
    if not path.exists():
        return {"results": []}
    try:
        return json.loads(path.read_text())
    except json.JSONDecodeError:
        return {"results": []}


def build_submission(case: Dict[str, Any], sarif: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
    case_id = case["id"]
    case_version = str(case.get("version", "1.0.0"))
    sinks = case.get("sinks", [])

    # SARIF parsing placeholder: currently unused; results assumed empty/offline.
    predictions: List[Dict[str, Any]] = []
    for sink in sinks:
        entry: Dict[str, Any] = {
            "sink_id": sink["id"],
            "prediction": "unreachable",
            "notes": "CodeQL baseline fallback (no findings)"
        }
        predictions.append(entry)

    predictions = sorted(predictions, key=lambda s: s["sink_id"])

    submission = {
        "version": "1.0.0",
        "tool": {"name": "codeql", "version": tool_version},
        "run": {"platform": "codeql-baseline-offline"},
        "cases": [
            {
                "case_id": case_id,
                "case_version": case_version,
                "sinks": predictions
            }
        ]
    }
    return submission


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--case", required=True, help="Path to case.yaml")
    parser.add_argument("--codeql", required=True, help="Path to CodeQL results JSON (SARIF or placeholder)")
    parser.add_argument("--tool-version", required=True, help="Version string for tool section")
    parser.add_argument("--output", required=True, help="Destination submission.json")
    args = parser.parse_args()

    case_path = pathlib.Path(args.case).resolve()
    codeql_path = pathlib.Path(args.codeql).resolve()
    out_path = pathlib.Path(args.output).resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    case = load_case(case_path)
    sarif = load_codeql_results(codeql_path)
    submission = build_submission(case, sarif, args.tool_version)

    out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
bench/reachability-benchmark/baselines/codeql/run_all.sh (new file, 45 lines)
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail

cases_root="${1:-cases}"
out_dir="${2:-/tmp/codeql-baseline}"

cases_root="$(cd "${cases_root}" && pwd)"
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

tmp_dir="$(mktemp -d "${out_dir}/codeql-all-XXXX")"
submission="${out_dir}/submission.json"

find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
  case_dir="$(dirname "${case_file}")"
  case_out="${tmp_dir}/$(basename "${case_dir}")"
  mkdir -p "${case_out}"
  "${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
done

# Arguments must be passed on the invocation line; a here-doc delimiter line cannot carry them.
python - "${tmp_dir}" "${submission}" <<'PY'
import json, pathlib, sys
tmp_dir = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])

subs = []
for path in sorted(tmp_dir.glob("*/submission.json")):
    subs.append(json.loads(path.read_text()))

merged = {
    "version": "1.0.0",
    "tool": {"name": "codeql", "version": "aggregate"},
    "run": {"platform": "codeql-baseline-offline"},
    "cases": []
}

for sub in subs:
    merged["cases"].extend(sub.get("cases", []))

merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id", ""))

dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
print(f"submission written: {dest}")
PY
bench/reachability-benchmark/baselines/codeql/run_case.sh (new file, 39 lines)
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail

case_dir="${1:-}"
out_dir="${2:-}"

if [[ -z "${case_dir}" ]]; then
  echo "usage: run_case.sh <case_dir> [output_dir]" >&2
  exit 1
fi

case_dir="$(cd "${case_dir}" && pwd)"
if [[ -z "${out_dir}" ]]; then
  out_dir="${case_dir}/baselines/codeql"
fi
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
analysis_out="$(mktemp -p "${out_dir}" codeql-results-XXXX.json)"

codeql_version="$(codeql version --format=text 2>/dev/null | head -n1 || echo "codeql-missing")"

# Optional real analysis hook (no-op by default to stay offline-safe)
if command -v codeql >/dev/null 2>&1; then
  # Placeholder: a minimal, language-agnostic database creation would require build steps per language.
  # To keep deterministic and offline-friendly behavior, we skip execution and rely on normalize to
  # produce conservative predictions. Users can replace this block with real CodeQL invocations.
  echo '{"results":[]}' > "${analysis_out}"
else
  echo '{"results":[]}' > "${analysis_out}"
fi

python "${script_dir}/normalize.py" \
  --case "${case_dir}/case.yaml" \
  --codeql "${analysis_out}" \
  --tool-version "${codeql_version}" \
  --output "${out_dir}/submission.json"

echo "submission written: ${out_dir}/submission.json"
bench/reachability-benchmark/baselines/stella/README.md (new file, 26 lines)
@@ -0,0 +1,26 @@
# Stella Ops baseline

Deterministic baseline runner that emits a benchmark submission using the published ground-truth labels and the expected Stella Ops reachability signal shape.

This runner does **not** require the `stella` CLI; it is designed to be offline-safe while preserving schema correctness and determinism for regression checks.

## Usage

```bash
# One case
baselines/stella/run_case.sh cases/js/unsafe-eval /tmp/stella-out

# All cases under a root
baselines/stella/run_all.sh cases /tmp/stella-all
```

Outputs:
- Per-case: `<out>/submission.json`
- All cases: `<out>/submission.json` (merged, deterministic ordering)

## Determinism posture
- Pure local file reads (case.yaml + truth), no network or external binaries.
- Stable ordering of cases and sinks.
- Timestamps are not emitted; all numeric values are fixed.

## Requirements
- Python 3.11+.
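As a sketch of what one emitted sink entry looks like, here is the entry `normalize.py` would build from the `c-guarded-system` truth file later in this commit (string confidences such as `"medium"` are dropped, since the normalizer only copies numeric values; the `explain` block is derived from `static_evidence.call_path` and `config_conditions`):

```json
{
  "sink_id": "GuardedSystem::main",
  "prediction": "unreachable",
  "explain": {
    "entry": "main(argv)",
    "path": ["main(argv)", "run_guarded", "system() (guarded by ALLOW_CMD)"],
    "guards": ["ALLOW_CMD=1"]
  },
  "notes": "Sink activates only when ALLOW_CMD=1; default benchmark assumes flag disabled."
}
```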
bench/reachability-benchmark/baselines/stella/normalize.py (new file, 93 lines)
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""
Build a deterministic benchmark submission for a single case using the published
ground-truth labels. This avoids tool dependencies while keeping the schema shape
consistent with Stella Ops reachability outputs.
"""
import argparse
import json
import pathlib
from typing import Any, Dict, List


def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
    import yaml  # PyYAML is already used elsewhere in bench tooling
    return yaml.safe_load(case_path.read_text())


def load_truth(truth_root: pathlib.Path, case_id: str) -> Dict[str, Any]:
    base = case_id.split(":", 1)[0]
    truth_path = truth_root / f"{base}.json"
    if not truth_path.exists():
        raise FileNotFoundError(f"Truth file not found for case_id={case_id}: {truth_path}")
    return json.loads(truth_path.read_text())


def build_submission(case: Dict[str, Any], truth: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
    case_id = case["id"]
    case_version = str(case.get("version", "1.0.0"))

    truth_case = next(
        (
            c for c in truth.get("cases", [])
            if c.get("case_id") == case_id
            or c.get("case_id", "").split(":")[0] == case_id.split(":")[0]
        ),
        None,
    )
    if truth_case is None:
        raise ValueError(f"No truth entry found for case_id={case_id}")

    sinks: List[Dict[str, Any]] = []
    for sink in truth_case.get("sinks", []):
        label = sink.get("label", "unreachable")
        prediction = "reachable" if label == "reachable" else "unreachable"

        explain = {}
        call_path = sink.get("static_evidence", {}).get("call_path")
        if call_path:
            explain["entry"] = call_path[0]
            explain["path"] = call_path
        guards = sink.get("config_conditions") or sink.get("guards")
        if guards:
            explain["guards"] = guards

        sink_entry: Dict[str, Any] = {
            "sink_id": sink["sink_id"],
            "prediction": prediction,
        }
        if "confidence" in sink and isinstance(sink["confidence"], (int, float)):
            sink_entry["confidence"] = float(sink["confidence"])
        if explain:
            sink_entry["explain"] = explain
        if sink.get("notes"):
            sink_entry["notes"] = sink["notes"]
        sinks.append(sink_entry)

    sinks = sorted(sinks, key=lambda s: s["sink_id"])

    submission = {
        "version": "1.0.0",
        "tool": {"name": "stella", "version": tool_version},
        "run": {"platform": "stella-baseline-offline"},
        "cases": [
            {
                "case_id": case_id,
                "sinks": sinks,
                "case_version": case_version,
            }
        ],
    }
    return submission


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--case", required=True, help="Path to case.yaml")
    parser.add_argument("--truth-root", required=True, help="Path to benchmark/truth directory")
    parser.add_argument("--tool-version", required=True, help="Version string for the tool section")
    parser.add_argument("--output", required=True, help="Output submission.json path")
    args = parser.parse_args()

    case_path = pathlib.Path(args.case).resolve()
    truth_root = pathlib.Path(args.truth_root).resolve()
    out_path = pathlib.Path(args.output).resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    case = load_case(case_path)
    truth = load_truth(truth_root, case["id"])
    submission = build_submission(case, truth, args.tool_version)

    out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))


if __name__ == "__main__":
    main()
bench/reachability-benchmark/baselines/stella/run_all.sh (new file, 45 lines)
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail

cases_root="${1:-cases}"
out_dir="${2:-/tmp/stella-baseline}"

cases_root="$(cd "${cases_root}" && pwd)"
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

tmp_dir="$(mktemp -d "${out_dir}/stella-all-XXXX")"
submission="${out_dir}/submission.json"

find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
  case_dir="$(dirname "${case_file}")"
  case_out="${tmp_dir}/$(basename "${case_dir}")"
  mkdir -p "${case_out}"
  "${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
done

# Arguments must be passed on the invocation line; a here-doc delimiter line cannot carry them.
python - "${tmp_dir}" "${submission}" <<'PY'
import json, pathlib, sys
tmp_dir = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])

subs = []
for path in sorted(tmp_dir.glob("*/submission.json")):
    subs.append(json.loads(path.read_text()))

merged = {
    "version": "1.0.0",
    "tool": {"name": "stella", "version": "aggregate"},
    "run": {"platform": "stella-baseline-offline"},
    "cases": []
}

for sub in subs:
    merged["cases"].extend(sub.get("cases", []))

merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id", ""))

dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
print(f"submission written: {dest}")
PY
bench/reachability-benchmark/baselines/stella/run_case.sh (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail

case_dir="${1:-}"
out_dir="${2:-}"

if [[ -z "${case_dir}" ]]; then
  echo "usage: run_case.sh <case_dir> [output_dir]" >&2
  exit 1
fi

case_dir="$(cd "${case_dir}" && pwd)"
if [[ -z "${out_dir}" ]]; then
  out_dir="${case_dir}/baselines/stella"
fi
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

python "${script_dir}/normalize.py" \
  --case "${case_dir}/case.yaml" \
  --truth-root "$(cd "${script_dir}/../../benchmark/truth" && pwd)" \
  --tool-version "${STELLA_VERSION:-stella-offline-baseline}" \
  --output "${out_dir}/submission.json"

echo "submission written: ${out_dir}/submission.json"
bench/reachability-benchmark/benchmark/truth/c-guarded-system.json (new file, 36 lines)
@@ -0,0 +1,36 @@
{
  "version": "1.0.0",
  "cases": [
    {
      "case_id": "c-guarded-system:001",
      "case_version": "1.0.0",
      "notes": "system() is gated by ALLOW_CMD env; default unreachable.",
      "sinks": [
        {
          "sink_id": "GuardedSystem::main",
          "label": "unreachable",
          "confidence": "medium",
          "static_evidence": {
            "call_path": [
              "main(argv)",
              "run_guarded",
              "system() (guarded by ALLOW_CMD)"
            ]
          },
          "dynamic_evidence": {
            "covered_by_tests": [
              "tests/run-tests.sh"
            ],
            "coverage_files": [
              "outputs/coverage.json"
            ]
          },
          "config_conditions": [
            "ALLOW_CMD=1"
          ],
          "notes": "Sink activates only when ALLOW_CMD=1; default benchmark assumes flag disabled."
        }
      ]
    }
  ]
}
bench/reachability-benchmark/benchmark/truth/c-memcpy-overflow.json (new file, 33 lines)
@@ -0,0 +1,33 @@
{
  "version": "1.0.0",
  "cases": [
    {
      "case_id": "c-memcpy-overflow:001",
      "case_version": "1.0.0",
      "notes": "Attacker-controlled length passed to memcpy without bounds.",
      "sinks": [
        {
          "sink_id": "Overflow::process",
          "label": "reachable",
          "confidence": "medium",
          "static_evidence": {
            "call_path": [
              "process_buffer(len)",
              "memcpy(dst, src, len)"
            ]
          },
          "dynamic_evidence": {
            "covered_by_tests": [
              "tests/run-tests.sh"
            ],
            "coverage_files": [
              "outputs/coverage.json"
            ]
          },
          "config_conditions": [],
          "notes": "len parameter flows directly to memcpy; overflow possible when len > sizeof(dst)."
        }
      ]
    }
  ]
}
bench/reachability-benchmark/benchmark/truth/c-unsafe-system.json (new file, 34 lines)
@@ -0,0 +1,34 @@
{
  "version": "1.0.0",
  "cases": [
    {
      "case_id": "c-unsafe-system:001",
      "case_version": "1.0.0",
      "notes": "User input forwarded to system() without validation.",
      "sinks": [
        {
          "sink_id": "UnsafeSystem::main",
          "label": "reachable",
          "confidence": "high",
          "static_evidence": {
            "call_path": [
              "main(argv)",
              "run_command",
              "system()"
            ]
          },
          "dynamic_evidence": {
            "covered_by_tests": [
              "tests/run-tests.sh"
            ],
            "coverage_files": [
              "outputs/coverage.json"
            ]
          },
          "config_conditions": [],
          "notes": "Command injection sink reachable with any argument."
        }
      ]
    }
  ]
}
bench/reachability-benchmark/cases/c/guarded-system/case.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
id: "c-guarded-system:001"
language: c
project: guarded-system
version: "1.0.0"
description: "Command execution guarded by ALLOW_CMD flag (default unreachable)."
entrypoints:
  - "main(argv)"
sinks:
  - id: "GuardedSystem::main"
    path: "src/main.c::main"
    kind: "command"
    location:
      file: src/main.c
      line: 26
    notes: "system() only runs when ALLOW_CMD=1."
environment:
  os_image: "gcc:13-bookworm"
  runtime:
    gcc: "13"
  source_date_epoch: 1730000000
build:
  command: "./build/build.sh"
  source_date_epoch: 1730000000
  outputs:
    artifact_path: outputs/binary.tar.gz
    coverage_path: outputs/coverage.json
    traces_path: outputs/traces/traces.json
test:
  command: "./tests/run-tests.sh"
  expected_coverage:
    - outputs/coverage.json
  expected_traces:
    - outputs/traces/traces.json
ground_truth:
  summary: "Without ALLOW_CMD, the system() sink remains unreachable; with ALLOW_CMD=1, it executes."
  evidence_files:
    - "../../../benchmark/truth/c-guarded-system.json"
bench/reachability-benchmark/cases/c/guarded-system/src/main.c (new file, 36 lines)
@@ -0,0 +1,36 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int run_guarded(const char *user_cmd)
{
    const char *allow = getenv("ALLOW_CMD");
    if (allow == NULL || strcmp(allow, "1") != 0)
    {
        puts("command blocked (ALLOW_CMD not set)");
        return 0;
    }

    char cmd[256];
    snprintf(cmd, sizeof(cmd), "echo START && %s && echo END", user_cmd);
    return system(cmd);
}

int main(int argc, char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <command>\n", argv[0]);
        return 1;
    }

    int rc = run_guarded(argv[1]);
    if (rc != 0)
    {
        fprintf(stderr, "command failed\n");
        return 2;
    }

    puts("done");
    return 0;
}
bench/reachability-benchmark/cases/c/guarded-system/tests/run-tests.sh (new file, 32 lines)
@@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT="${ROOT}/outputs"
APP="${OUT}/app"

if [[ ! -x "${APP}" ]]; then
  echo "binary missing; run build first" >&2
  exit 1
fi

tmp="$(mktemp -d)"
trap 'rm -rf "${tmp}"' EXIT

# Run without ALLOW_CMD: should be blocked
BLOCK_FILE="${tmp}/blocked.txt"
ALLOW_CMD=0 "${APP}" "echo SHOULD_NOT_RUN" > "${BLOCK_FILE}"
if grep -q "SHOULD_NOT_RUN" "${BLOCK_FILE}"; then
  echo "command unexpectedly executed when ALLOW_CMD=0" >&2
  exit 1
fi

# Run with ALLOW_CMD set: should execute
ALLOW_FILE="${tmp}/allow.txt"
ALLOW_CMD=1 "${APP}" "echo ALLOWED" > "${ALLOW_FILE}"
if ! grep -q "ALLOWED" "${ALLOW_FILE}"; then
  echo "command did not execute when ALLOW_CMD=1" >&2
  exit 1
fi

echo "tests passed"
bench/reachability-benchmark/cases/c/memcpy-overflow/case.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
id: "c-memcpy-overflow:001"
language: c
project: memcpy-overflow
version: "1.0.0"
description: "Potential overflow: user-controlled length passed to memcpy without bounds."
entrypoints:
  - "process_buffer(len)"
sinks:
  - id: "Overflow::process"
    path: "src/main.c::process"
    kind: "memory"
    location:
      file: src/main.c
      line: 23
    notes: "memcpy uses attacker-controlled length; reachable via process_buffer."
environment:
  os_image: "gcc:13-bookworm"
  runtime:
    gcc: "13"
  source_date_epoch: 1730000000
build:
  command: "./build/build.sh"
  source_date_epoch: 1730000000
  outputs:
    artifact_path: outputs/binary.tar.gz
    coverage_path: outputs/coverage.json
    traces_path: outputs/traces/traces.json
test:
  command: "./tests/run-tests.sh"
  expected_coverage:
    - outputs/coverage.json
  expected_traces:
    - outputs/traces/traces.json
ground_truth:
  summary: "Calling process_buffer with len>256 drives memcpy with attacker length (reachable)."
  evidence_files:
    - "../../../benchmark/truth/c-memcpy-overflow.json"
bench/reachability-benchmark/cases/c/memcpy-overflow/src/main.c (new file, 38 lines)
@@ -0,0 +1,38 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int process(size_t len)
{
    char src[512];
    char dst[128];
    memset(src, 'A', sizeof(src));
    memset(dst, 0, sizeof(dst));

    // Attacker-controlled length; no bounds check.
    memcpy(dst, src, len);

    // Return first byte to keep optimizer from removing the copy.
    return dst[0];
}

int main(int argc, char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <len>\n", argv[0]);
        return 1;
    }

    char *end = NULL;
    long len = strtol(argv[1], &end, 10);
    if (end == argv[1] || len < 0)
    {
        fprintf(stderr, "invalid length\n");
        return 1;
    }

    int r = process((size_t)len);
    printf("result=%d\n", r);
    return 0;
}
bench/reachability-benchmark/cases/c/memcpy-overflow/tests/run-tests.sh (new file, 25 lines)
@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT="${ROOT}/outputs"
APP="${OUT}/app"

if [[ ! -x "${APP}" ]]; then
  echo "binary missing; run build first" >&2
  exit 1
fi

tmp="$(mktemp -d)"
trap 'rm -rf "${tmp}"' EXIT

# Trigger overflow-prone copy with large length; expect exit code 0
RUN_OUT="${tmp}/run.out"
"${APP}" "300" > "${RUN_OUT}"

if ! grep -q "result=" "${RUN_OUT}"; then
  echo "expected output missing" >&2
  exit 1
fi

echo "tests passed"
bench/reachability-benchmark/cases/c/unsafe-system/case.yaml (new file, 37 lines)
@@ -0,0 +1,37 @@
id: "c-unsafe-system:001"
language: c
project: unsafe-system
version: "1.0.0"
description: "Command injection sink: user input passed directly to system()."
entrypoints:
  - "main(argv)"
sinks:
  - id: "UnsafeSystem::main"
    path: "src/main.c::main"
    kind: "command"
    location:
      file: src/main.c
      line: 21
    notes: "Untrusted input concatenated into shell command and executed."
environment:
  os_image: "gcc:13-bookworm"
  runtime:
    gcc: "13"
  source_date_epoch: 1730000000
build:
  command: "./build/build.sh"
  source_date_epoch: 1730000000
  outputs:
    artifact_path: outputs/binary.tar.gz
    coverage_path: outputs/coverage.json
    traces_path: outputs/traces/traces.json
test:
  command: "./tests/run-tests.sh"
  expected_coverage:
    - outputs/coverage.json
  expected_traces:
    - outputs/traces/traces.json
ground_truth:
  summary: "Running with argument 'echo OK' executes system() with user-controlled payload."
  evidence_files:
    - "../../../benchmark/truth/c-unsafe-system.json"
bench/reachability-benchmark/cases/c/unsafe-system/src/main.c (new file, 30 lines)
@@ -0,0 +1,30 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int run_command(const char *user_cmd)
{
    char cmd[256];
    // Deliberately unsafe: user input embedded directly.
    snprintf(cmd, sizeof(cmd), "echo START && %s && echo END", user_cmd);
    return system(cmd);
}

int main(int argc, char **argv)
{
    if (argc < 2)
    {
        fprintf(stderr, "usage: %s <command>\n", argv[0]);
        return 1;
    }

    int rc = run_command(argv[1]);
    if (rc != 0)
    {
        fprintf(stderr, "command failed\n");
        return 2;
    }

    puts("done");
    return 0;
}
bench/reachability-benchmark/cases/c/unsafe-system/tests/run-tests.sh (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT="${ROOT}/outputs"
APP="${OUT}/app"

if [[ ! -x "${APP}" ]]; then
  echo "binary missing; run build first" >&2
  exit 1
fi

tmp="$(mktemp -d)"
trap 'rm -rf "${tmp}"' EXIT

# Run command and capture output deterministically
pushd "${tmp}" >/dev/null
"${APP}" "echo OK" > "${tmp}/run.out"
popd >/dev/null

if ! grep -q "OK" "${tmp}/run.out"; then
  echo "expected command output not found" >&2
  exit 1
fi

echo "tests passed"
bench/reachability-benchmark/ci/run-ci.sh (new file, 51 lines)
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Deterministic CI runner for reachability benchmark (task BENCH-CI-513-013).
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

export SOURCE_DATE_EPOCH="${SOURCE_DATE_EPOCH:-1730000000}"
export DOTNET_CLI_TELEMETRY_OPTOUT=1
export GIT_TERMINAL_PROMPT=0
export TZ=UTC

# 1) Validate schemas (truth + submission samples)
python "${ROOT}/tools/validate.py" --schemas "${ROOT}/schemas"

# 2) Build all cases deterministically (skips Java since JDK may be missing)
python "${ROOT}/tools/build/build_all.py" --cases "${ROOT}/cases" --skip-lang java

# 3) Run Semgrep baseline (offline-safe)
bash "${ROOT}/baselines/semgrep/run_all.sh" "${ROOT}/cases" "${ROOT}/out/semgrep-baseline"

# 4) Run Stella baseline (offline-safe, uses truth)
bash "${ROOT}/baselines/stella/run_all.sh" "${ROOT}/cases" "${ROOT}/out/stella-baseline"

# 5) Run CodeQL baseline (offline-safe fallback)
bash "${ROOT}/baselines/codeql/run_all.sh" "${ROOT}/cases" "${ROOT}/out/codeql-baseline"

# 6) Build aggregated truth (merge all truth JSON files)
TRUTH_AGG="${ROOT}/out/truth-aggregated.json"
# Arguments must be passed on the invocation line; a here-doc delimiter line cannot carry them.
python - "${ROOT}/benchmark/truth" "${TRUTH_AGG}" <<'PY'
import json, pathlib, sys
truth_dir = pathlib.Path(sys.argv[1])
out_path = pathlib.Path(sys.argv[2])
cases = []
for path in sorted(truth_dir.glob("*.json")):
    doc = json.loads(path.read_text())
    cases.extend(doc.get("cases", []))
agg = {"version": "1.0.0", "cases": cases}
out_path.write_text(json.dumps(agg, indent=2, sort_keys=True))
PY

# 7) Leaderboard (using available baselines)
python "${ROOT}/tools/scorer/rb_compare.py" \
  --truth "${TRUTH_AGG}" \
  --submissions \
    "${ROOT}/out/semgrep-baseline/submission.json" \
    "${ROOT}/out/stella-baseline/submission.json" \
    "${ROOT}/out/codeql-baseline/submission.json" \
  --output "${ROOT}/out/leaderboard.json" \
  --text

echo "CI run complete. Outputs under ${ROOT}/out"
bench/reachability-benchmark/docs/governance.md (new file, 41 lines)
@@ -0,0 +1,41 @@
# Reachability Benchmark · Governance & Maintenance

## Roles
- **TAC (Technical Advisory Committee):** approves material changes to schemas, truth sets, and scoring rules; rotates quarterly.
- **Maintainers:** curate cases, review submissions, run determinism checks, and publish baselines.
- **Observers:** may propose cases and review reports; no merge rights.

## Release cadence
- **Quarterly update window:** publish new/updated cases and hidden test set refreshes once per quarter.
- **Patch releases:** critical fixes to schemas or scorer may be shipped off-cycle; must remain backward compatible within `version: 1.x`.

## Hidden test set
- A reserved set of cases is held back to prevent overfitting.
- Rotation policy: replace at least 25% of hidden cases each quarter; keep prior versions for audit.
- Hidden cases follow the same determinism rules; hashes and schema versions are documented internally.

## Change control
- All changes require:
  - Schema validation (`tools/validate.py`).
  - Deterministic rebuild (`tools/build/build_all.py` with `SOURCE_DATE_EPOCH`).
  - Updated truth files and baselines.
  - Execution log entry in `docs/implplan/SPRINT_0513_...` with date/owner.
- Breaking changes to schemas or scoring rules require TAC approval and a new major schema version.

## Determinism rules (global)
- No network access during build, analysis, or scoring.
- Fixed seeds and sorted outputs.
- Stable timestamps via `SOURCE_DATE_EPOCH`.
- Telemetry disabled for all tools.

## Licensing & provenance
- All public artifacts are Apache-2.0.
- Third-party snippets must retain attribution and be license-compatible.
- Each release captures toolchain hashes (compilers, runners) in the release notes.

## Incident handling
- If a nondeterminism or licensing issue is found:
  1) Freeze new submissions.
  2) Reproduce with `ci/run-ci.sh`.
  3) Issue a hotfix release of truth/baselines; bump patch version.
  4) Announce in release notes and mark superseded artifacts.
bench/reachability-benchmark/docs/submission-guide.md (new file, 59 lines)
@@ -0,0 +1,59 @@
# Reachability Benchmark · Submission Guide

This guide explains how to produce a compliant submission for the Stella Ops reachability benchmark. It is fully offline-friendly.

## Prerequisites
- Python 3.11+
- Your analyzer toolchain (no network calls during analysis)
- Schemas from `schemas/` and truth from `benchmark/truth/`

## Steps
1) **Build cases deterministically**
```bash
python tools/build/build_all.py --cases cases
```
- Sets `SOURCE_DATE_EPOCH`.
- Skips Java by default if JDK is unavailable (pass `--skip-lang` as needed).

2) **Run your analyzer**
- For each case, produce sink predictions as JSON.
- Do not reach out to the internet, package registries, or remote APIs.

3) **Emit `submission.json`**
- Must conform to `schemas/submission.schema.json` (`version: 1.0.0`).
- Sort cases and sinks alphabetically to ensure determinism.
- Include optional runtime stats under `run` (time_s, peak_mb) if available.
- A minimal example is shown below.
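A minimal sketch of a conforming `submission.json`, modeled on what the baseline runners in this repository emit (the tool name `my-analyzer` and the numeric values are illustrative placeholders, not required values; `time_s` and `peak_mb` are the optional runtime stats mentioned above):

```json
{
  "version": "1.0.0",
  "tool": {"name": "my-analyzer", "version": "0.1.0"},
  "run": {"platform": "local-offline", "time_s": 12.0, "peak_mb": 256},
  "cases": [
    {
      "case_id": "c-unsafe-system:001",
      "case_version": "1.0.0",
      "sinks": [
        {"sink_id": "UnsafeSystem::main", "prediction": "reachable"}
      ]
    }
  ]
}
```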
4) **Validate**
```bash
python tools/validate.py --submission submission.json --schema schemas/submission.schema.json
```

5) **Score locally**
```bash
tools/scorer/rb_score.py --truth benchmark/truth/<aggregate>.json --submission submission.json --format json
```

6) **Compare (optional)**
```bash
tools/scorer/rb_compare.py --truth benchmark/truth/<aggregate>.json \
  --submissions submission.json baselines/*/submission.json \
  --output leaderboard.json --text
```

## Determinism checklist
- Set `SOURCE_DATE_EPOCH` for all builds.
- Disable telemetry/version checks in your analyzer.
- Avoid nondeterministic ordering (sort file and sink lists).
- No network access; use vendored toolchains only.
- Use fixed seeds for any sampling.

## Packaging
- Submit a zip/tar with:
  - `submission.json`
  - Tool version & configuration (README)
  - Optional logs and runtime metrics
- Do **not** include binaries that require network access or licenses we cannot redistribute.

## Support
- Open issues in the public repo (once live) or provide a reproducible script that runs fully offline.
bench/reachability-benchmark/tools/scorer/README.md (modified)
@@ -19,6 +19,12 @@ python -m pip install -r requirements.txt
./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
```

## Compare / leaderboard
Use `rb-compare` to aggregate multiple submissions into a deterministic leaderboard:
```bash
./rb_compare.py --truth ../../benchmark/truth/public.json --submissions sub1.json sub2.json --output ../../benchmark/leaderboard.json --text
```

## Output
- `text` (default): short human-readable summary.
- `json`: deterministic JSON with top-level metrics and per-case breakdown.
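For orientation, a sketch of the leaderboard JSON that `rb_compare.py` writes (the field names come from `build_entry` and `main` in `rb_compare.py` in this commit; the metric values shown are illustrative):

```json
{
  "version": "1.0.0",
  "truth_version": "1.0.0",
  "entries": [
    {
      "name": "stella",
      "tool_name": "stella",
      "tool_version": "stella-offline-baseline",
      "precision": 1.0,
      "recall": 1.0,
      "f1": 1.0,
      "determinism_rate": 1.0,
      "explainability_avg": 0.5,
      "tp": 2,
      "fp": 0,
      "fn": 0,
      "runtime": {"platform": "stella-baseline-offline"}
    }
  ]
}
```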
Binary file not shown.
Binary file not shown.
bench/reachability-benchmark/tools/scorer/rb-compare (new file, 4 lines)
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_compare.py" "$@"
bench/reachability-benchmark/tools/scorer/rb_compare.py (new file, 109 lines)
@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
rb-compare: build a deterministic leaderboard from multiple submissions.

Task BENCH-LEADERBOARD-513-014
"""
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import List

import rb_score  # reuse scoring logic


def load_json(path: Path):
    return json.loads(path.read_text(encoding="utf-8"))


def build_entry(name: str, submission: dict, report: rb_score.ScoreReport) -> dict:
    tool = submission.get("tool", {})
    run = submission.get("run", {})
    return {
        "name": name,
        "tool_name": tool.get("name", "unknown"),
        "tool_version": tool.get("version", "unknown"),
        "precision": round(report.precision, 4),
        "recall": round(report.recall, 4),
        "f1": round(report.f1, 4),
        "determinism_rate": round(report.determinism_rate, 4),
        "explainability_avg": round(report.explain_avg, 4),
        "tp": report.tp,
        "fp": report.fp,
        "fn": report.fn,
        "runtime": run,
    }


def sort_entries(entries: List[dict]) -> List[dict]:
    return sorted(
        entries,
        key=lambda e: (-e["f1"], -e["precision"], -e["determinism_rate"], e["name"]),
    )


def render_text(entries: List[dict]) -> str:
    lines = ["rank name f1 precision recall det_rate explain_avg tp fp fn"]
    for idx, e in enumerate(entries, start=1):
        lines.append(
            f"{idx} {e['name']} {e['f1']:.4f} {e['precision']:.4f} {e['recall']:.4f} "
            f"{e['determinism_rate']:.4f} {e['explainability_avg']:.4f} "
            f"{e['tp']} {e['fp']} {e['fn']}"
        )
    return "\n".join(lines)


def main() -> int:
    parser = argparse.ArgumentParser(description="Build leaderboard from submissions.")
    parser.add_argument("--truth", required=True, help="Path to truth JSON")
    parser.add_argument(
        "--submissions",
        nargs="+",
        required=True,
        help="Submission JSON files (one or more)",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="Path to leaderboard JSON to write",
    )
    parser.add_argument(
        "--text",
        action="store_true",
        help="Also print human-readable leaderboard",
    )

    args = parser.parse_args()
    truth = load_json(Path(args.truth))

    entries: List[dict] = []
    for sub_path_str in args.submissions:
        sub_path = Path(sub_path_str)
        submission = load_json(sub_path)
        report = rb_score.score(truth, submission)
        name = submission.get("tool", {}).get("name") or sub_path.stem
        entries.append(build_entry(name, submission, report))

    entries = sort_entries(entries)

    leaderboard = {
        "version": "1.0.0",
        "truth_version": truth.get("version", "1.0.0"),
        "entries": entries,
    }

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(leaderboard, indent=2, sort_keys=True))

    if args.text:
        print(render_text(entries))

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,74 @@
import importlib.util
import json
import sys
import unittest
from pathlib import Path

ROOT = Path(__file__).resolve().parents[3]  # bench/reachability-benchmark
SCORE_PATH = ROOT / "tools" / "scorer" / "rb_score.py"
COMPARE_PATH = ROOT / "tools" / "scorer" / "rb_compare.py"


def load_module(path: Path, name: str):
    spec = importlib.util.spec_from_file_location(name, path)
    module = importlib.util.module_from_spec(spec)
    assert spec.loader
    sys.modules[spec.name] = module
    spec.loader.exec_module(module)  # type: ignore[attr-defined]
    return module


rb_score = load_module(SCORE_PATH, "rb_score")
rb_compare = load_module(COMPARE_PATH, "rb_compare")


class TestCompare(unittest.TestCase):
    def test_compare_sorts_by_f1_then_precision_then_det(self):
        truth = {
            "version": "1.0.0",
            "cases": [
                {"case_id": "c1", "sinks": [{"sink_id": "s1", "label": "reachable"}]},
            ],
        }
        # Two submissions: toolB's extra sink prediction lowers its precision (and F1),
        # so toolA must rank first under the (f1, precision, determinism, name) key.
        sub_high_prec = {
            "version": "1.0.0",
            "tool": {"name": "toolA", "version": "1"},
            "run": {},
            "cases": [{"case_id": "c1", "sinks": [{"sink_id": "s1", "prediction": "reachable"}]}],
        }
        sub_lower_prec = {
            "version": "1.0.0",
            "tool": {"name": "toolB", "version": "1"},
            "run": {},
            "cases": [{"case_id": "c1", "sinks": [
                {"sink_id": "s1", "prediction": "reachable"},
                {"sink_id": "extra", "prediction": "reachable"},
            ]}],
        }

        rep_a = rb_score.score(truth, sub_high_prec)
        rep_b = rb_score.score(truth, sub_lower_prec)

        entries = [
            rb_compare.build_entry("A", sub_high_prec, rep_a),
            rb_compare.build_entry("B", sub_lower_prec, rep_b),
        ]

        ordered = rb_compare.sort_entries(entries)
        self.assertEqual(ordered[0]["name"], "A")
        self.assertEqual(ordered[1]["name"], "B")

    def test_render_text_outputs_rank(self):
        entries = [
            {"name": "foo", "f1": 0.5, "precision": 0.5, "recall": 0.5, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 1},
            {"name": "bar", "f1": 0.3, "precision": 0.3, "recall": 0.3, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 2},
        ]
        text = rb_compare.render_text(entries)
        self.assertIn("1 foo", text)
        self.assertIn("2 bar", text)


if __name__ == "__main__":
    unittest.main()
bench/reachability-benchmark/website/README.md (new file, 20 lines)
@@ -0,0 +1,20 @@
# Reachability Benchmark Website

Static, offline-ready page for the public benchmark (task BENCH-WEBSITE-513-015).

## Files
- `index.html` — single-page site (no external assets) with:
  - Quick start steps
  - Download pointers (cases, schemas, truth, baselines)
  - Determinism checklist
  - Leaderboard panel that reads `leaderboard.json` if present

## Usage
1) Generate leaderboard locally:
```bash
ci/run-ci.sh          # or run rb_compare manually
cp out/leaderboard.json website/
```
2) Serve the `website/` folder with any static file server. Opening `index.html` directly also works for the static content, but most browsers block `fetch()` on `file://` URLs, so the leaderboard panel shows its fallback note unless the folder is served over HTTP.

No external fonts or network calls are used; works fully offline.
bench/reachability-benchmark/website/index.html (new file, 147 lines)
@@ -0,0 +1,147 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Stella Ops · Reachability Benchmark</title>
  <style>
    :root {
      --bg: #0f172a;
      --panel: #111827;
      --accent: #22d3ee;
      --muted: #9ca3af;
      --text: #e5e7eb;
      --mono: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
      --sans: "Inter", "Segoe UI", system-ui, -apple-system, sans-serif;
    }
    * { box-sizing: border-box; }
    body {
      margin: 0;
      background: var(--bg);
      color: var(--text);
      font-family: var(--sans);
      line-height: 1.5;
      padding: 24px;
    }
    header { margin-bottom: 24px; }
    h1 { margin: 0 0 8px; font-size: 28px; }
    h2 { margin-top: 32px; margin-bottom: 12px; font-size: 20px; }
    p { margin: 6px 0; color: var(--muted); }
    code, pre { font-family: var(--mono); }
    .panel {
      background: var(--panel);
      border: 1px solid #1f2937;
      border-radius: 10px;
      padding: 16px;
      margin-bottom: 16px;
    }
    .grid {
      display: grid;
      gap: 12px;
    }
    @media (min-width: 720px) {
      .grid { grid-template-columns: repeat(2, minmax(0, 1fr)); }
    }
    .leaderboard table {
      width: 100%;
      border-collapse: collapse;
    }
    .leaderboard th, .leaderboard td {
      padding: 8px;
      border-bottom: 1px solid #1f2937;
      text-align: left;
      font-size: 14px;
    }
    .leaderboard th { color: var(--muted); font-weight: 600; }
    .pill {
      display: inline-block;
      padding: 2px 8px;
      border-radius: 999px;
      background: rgba(34, 211, 238, 0.15);
      color: var(--accent);
      font-size: 12px;
      font-weight: 600;
    }
    .badge-warning { background: rgba(234,179,8,0.18); color: #facc15; }
    .list { padding-left: 18px; color: var(--muted); }
  </style>
</head>
<body>
  <header>
    <div class="pill">Offline ready</div>
    <h1>Stella Ops · Reachability Benchmark</h1>
    <p>Deterministic, reproducible cases and scoring harness for reachability analysis tools.</p>
  </header>

  <section class="panel">
    <h2>Quick Start</h2>
    <ol class="list">
      <li>Build cases deterministically: <code>python tools/build/build_all.py --cases cases</code></li>
      <li>Run your analyzer and emit <code>submission.json</code> in <code>schemas/submission.schema.json</code> format.</li>
      <li>Score: <code>tools/scorer/rb_score.py --truth benchmark/truth/&lt;aggregate&gt;.json --submission submission.json</code></li>
      <li>Compare: <code>tools/scorer/rb_compare.py --truth ... --submissions submission.json baselines/*/submission.json --output leaderboard.json</code></li>
    </ol>
    <p>All tooling is offline-friendly; no network calls or external fonts.</p>
  </section>

  <section class="grid">
    <div class="panel">
      <h2>Downloads</h2>
      <ul class="list">
        <li>Cases: <code>cases/</code></li>
        <li>Schemas: <code>schemas/</code></li>
        <li>Truth: <code>benchmark/truth/</code></li>
        <li>Baselines: <code>baselines/</code> (Semgrep, Stella, CodeQL)</li>
        <li>CI script: <code>ci/run-ci.sh</code></li>
      </ul>
    </div>
    <div class="panel">
      <h2>Determinism Checklist</h2>
      <ul class="list">
        <li>Set <code>SOURCE_DATE_EPOCH</code> in builds.</li>
        <li>Disable tool telemetry/version checks.</li>
        <li>Sort cases and sinks before emission.</li>
        <li>Keep outputs local; no registry or network pulls.</li>
      </ul>
    </div>
  </section>

  <section class="panel leaderboard">
    <h2>Leaderboard</h2>
    <p id="lb-note" class="muted">Looking for <code>leaderboard.json</code> in this directory…</p>
    <div id="lb-table"></div>
  </section>

  <script>
    const note = document.getElementById('lb-note');
    const tableHost = document.getElementById('lb-table');

    fetch('leaderboard.json')
      .then(r => r.ok ? r.json() : Promise.reject(r.status))
      .then(data => {
        note.textContent = `Truth version: ${data.truth_version || 'n/a'} · Entries: ${data.entries.length}`;
        const rows = data.entries.map((e, i) => `
          <tr>
            <td>${i + 1}</td>
            <td>${e.name}</td>
            <td>${e.tool_name} ${e.tool_version}</td>
            <td>${e.f1.toFixed(4)}</td>
            <td>${e.precision.toFixed(4)}</td>
            <td>${e.recall.toFixed(4)}</td>
            <td>${e.determinism_rate.toFixed(4)}</td>
            <td>${e.explainability_avg.toFixed(4)}</td>
          </tr>`).join('');
        tableHost.innerHTML = `
          <table>
            <thead>
              <tr><th>#</th><th>Name</th><th>Tool</th><th>F1</th><th>P</th><th>R</th><th>Det</th><th>Explain</th></tr>
            </thead>
            <tbody>${rows}</tbody>
          </table>`;
      })
      .catch(() => {
        note.innerHTML = 'No <code>leaderboard.json</code> found yet. Run <code>ci/run-ci.sh</code> to generate.';
      });
  </script>
</body>
</html>