bench/reachability-benchmark/baselines/codeql/README.md (new file)
@@ -0,0 +1,25 @@
# CodeQL baseline

Deterministic baseline runner that emits a benchmark submission for one or more cases using CodeQL when available. If CodeQL is not installed, it still produces a schema-valid submission marking all sinks as `unreachable`, so CI and comparisons remain stable.
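For reference, the fallback submission for a case with two sinks has this shape (keys are emitted in sorted order by `normalize.py`; the case and sink ids are illustrative):

```json
{
  "cases": [
    {
      "case_id": "js:unsafe-eval",
      "case_version": "1.0.0",
      "sinks": [
        {"notes": "CodeQL baseline fallback (no findings)", "prediction": "unreachable", "sink_id": "sink-001"},
        {"notes": "CodeQL baseline fallback (no findings)", "prediction": "unreachable", "sink_id": "sink-002"}
      ]
    }
  ],
  "run": {"platform": "codeql-baseline-offline"},
  "tool": {"name": "codeql", "version": "codeql-missing"},
  "version": "1.0.0"
}
```

The `tool.version` field carries the detected CodeQL CLI version, or `codeql-missing` when the CLI is absent.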
## Usage
```bash
# One case
baselines/codeql/run_case.sh cases/js/unsafe-eval /tmp/codeql-out

# All cases under a root
baselines/codeql/run_all.sh cases /tmp/codeql-all
```

Outputs:
- Per-case: `<out>/submission.json`
- All cases: `<out>/submission.json` (merged, deterministic ordering)

## Determinism posture
- No network access; all inputs are local files.
- Stable ordering of cases and sinks.
- If CodeQL is missing or analysis fails, the runner falls back to a deterministic “all unreachable” submission.

## Requirements
- Python 3.11+.
- Optional: `codeql` CLI on PATH for real analysis (not required for offline deterministic fallback).
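If you do want real findings, the placeholder block in `run_case.sh` can be swapped for actual CodeQL invocations along these lines. This is only a sketch: JavaScript is shown to match the example case, and the query pack, language, and use of the existing `out_dir`/`case_dir`/`analysis_out` variables are assumptions to adapt per case.

```bash
# Hypothetical substitution for the placeholder block in run_case.sh.
# Build a CodeQL database from the case sources (interpreted language, so no build command).
codeql database create "${out_dir}/db" \
  --language=javascript \
  --source-root="${case_dir}"

# Run the standard query pack (assumed to be available locally, keeping the run offline)
# and write SARIF where normalize.py expects the results file.
codeql database analyze "${out_dir}/db" codeql/javascript-queries \
  --format=sarif-latest \
  --output="${analysis_out}"
```

Note that `normalize.py` currently ignores SARIF contents and emits the conservative fallback, so SARIF parsing would also need to be filled in before real findings affect predictions.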

bench/reachability-benchmark/baselines/codeql/normalize.py (new file)
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""
Normalize CodeQL SARIF (or empty results) into the benchmark submission schema.
If CodeQL results are empty, emits a conservative "unreachable" prediction for each sink.
"""
import argparse
import json
import pathlib
from typing import Any, Dict, List

def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
    import yaml
    return yaml.safe_load(case_path.read_text())

def load_codeql_results(path: pathlib.Path) -> Dict[str, Any]:
    if not path.exists():
        return {"results": []}
    try:
        return json.loads(path.read_text())
    except json.JSONDecodeError:
        return {"results": []}

def build_submission(case: Dict[str, Any], sarif: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
    case_id = case["id"]
    case_version = str(case.get("version", "1.0.0"))
    sinks = case.get("sinks", [])

    # SARIF parsing placeholder: currently unused; results assumed empty/offline.
    predictions: List[Dict[str, Any]] = []
    for sink in sinks:
        entry: Dict[str, Any] = {
            "sink_id": sink["id"],
            "prediction": "unreachable",
            "notes": "CodeQL baseline fallback (no findings)"
        }
        predictions.append(entry)

    predictions = sorted(predictions, key=lambda s: s["sink_id"])

    submission = {
        "version": "1.0.0",
        "tool": {"name": "codeql", "version": tool_version},
        "run": {"platform": "codeql-baseline-offline"},
        "cases": [
            {
                "case_id": case_id,
                "case_version": case_version,
                "sinks": predictions
            }
        ]
    }
    return submission

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--case", required=True, help="Path to case.yaml")
    parser.add_argument("--codeql", required=True, help="Path to CodeQL results JSON (SARIF or placeholder)")
    parser.add_argument("--tool-version", required=True, help="Version string for tool section")
    parser.add_argument("--output", required=True, help="Destination submission.json")
    args = parser.parse_args()

    case_path = pathlib.Path(args.case).resolve()
    codeql_path = pathlib.Path(args.codeql).resolve()
    out_path = pathlib.Path(args.output).resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    case = load_case(case_path)
    sarif = load_codeql_results(codeql_path)
    submission = build_submission(case, sarif, args.tool_version)

    out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))

if __name__ == "__main__":
    main()

bench/reachability-benchmark/baselines/codeql/run_all.sh (new file)
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail

cases_root="${1:-cases}"
out_dir="${2:-/tmp/codeql-baseline}"

cases_root="$(cd "${cases_root}" && pwd)"
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

tmp_dir="$(mktemp -d "${out_dir}/codeql-all-XXXX")"
submission="${out_dir}/submission.json"

find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
  case_dir="$(dirname "${case_file}")"
  case_out="${tmp_dir}/$(basename "${case_dir}")"
  mkdir -p "${case_out}"
  "${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
done

# Arguments must precede the heredoc so they reach the Python script as argv.
python - "${tmp_dir}" "${submission}" <<'PY'
import json, pathlib, sys
tmp_dir = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])

subs = []
for path in sorted(tmp_dir.glob("*/submission.json")):
    subs.append(json.loads(path.read_text()))

merged = {
    "version": "1.0.0",
    "tool": {"name": "codeql", "version": "aggregate"},
    "run": {"platform": "codeql-baseline-offline"},
    "cases": []
}

for sub in subs:
    merged["cases"].extend(sub.get("cases", []))

merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id", ""))

dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
print(f"submission written: {dest}")
PY

bench/reachability-benchmark/baselines/codeql/run_case.sh (new file)
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail

case_dir="${1:-}"
out_dir="${2:-}"

if [[ -z "${case_dir}" ]]; then
  echo "usage: run_case.sh <case_dir> [output_dir]" >&2
  exit 1
fi

case_dir="$(cd "${case_dir}" && pwd)"
if [[ -z "${out_dir}" ]]; then
  out_dir="${case_dir}/baselines/codeql"
fi
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
analysis_out="$(mktemp -p "${out_dir}" codeql-results-XXXX.json)"

codeql_version="$(codeql version --format=text 2>/dev/null | head -n1 || echo "codeql-missing")"

# Optional real analysis hook (no-op by default to stay offline-safe)
if command -v codeql >/dev/null 2>&1; then
  # Placeholder: a minimal, language-agnostic database creation would require build steps per language.
  # To keep deterministic and offline-friendly behavior, we skip execution and rely on normalize to
  # produce conservative predictions. Users can replace this block with real CodeQL invocations.
  echo '{"results":[]}' > "${analysis_out}"
else
  echo '{"results":[]}' > "${analysis_out}"
fi

python "${script_dir}/normalize.py" \
  --case "${case_dir}/case.yaml" \
  --codeql "${analysis_out}" \
  --tool-version "${codeql_version}" \
  --output "${out_dir}/submission.json"

echo "submission written: ${out_dir}/submission.json"

bench/reachability-benchmark/baselines/stella/README.md (new file)
@@ -0,0 +1,26 @@
# Stella Ops baseline

Deterministic baseline runner that emits a benchmark submission using the published ground-truth labels and the expected Stella Ops reachability signal shape.

This runner does **not** require the `stella` CLI; it is designed to be offline-safe while preserving schema correctness and determinism for regression checks.
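For reference, a per-case submission produced by `normalize.py` has this shape (keys are emitted in sorted order; the case id, sink id, call path, and guard are illustrative):

```json
{
  "cases": [
    {
      "case_id": "js:unsafe-eval",
      "case_version": "1.0.0",
      "sinks": [
        {
          "confidence": 0.9,
          "explain": {
            "entry": "server.js:handleRequest",
            "guards": ["FEATURE_EVAL_ENABLED"],
            "path": ["server.js:handleRequest", "lib/render.js:evalTemplate"]
          },
          "prediction": "reachable",
          "sink_id": "sink-001"
        }
      ]
    }
  ],
  "run": {"platform": "stella-baseline-offline"},
  "tool": {"name": "stella", "version": "stella-offline-baseline"},
  "version": "1.0.0"
}
```

The `confidence`, `explain`, and `notes` fields appear only when the corresponding data exists in the truth labels.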
## Usage
```bash
# One case
baselines/stella/run_case.sh cases/js/unsafe-eval /tmp/stella-out

# All cases under a root
baselines/stella/run_all.sh cases /tmp/stella-all
```

Outputs:
- Per-case: `<out>/submission.json`
- All cases: `<out>/submission.json` (merged, deterministic ordering)

## Determinism posture
- Pure local file reads (case.yaml + truth), no network or external binaries.
- Stable ordering of cases and sinks.
- Timestamps are not emitted; all numeric values are fixed.

## Requirements
- Python 3.11+.
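## Truth file shape

`normalize.py` resolves labels from `<truth-root>/<prefix>.json`, where the prefix is the case id up to the first `:`. A minimal truth entry, inferred from the fields the script reads (label, optional confidence, static evidence, guard conditions, notes), looks roughly like this; all ids and values are illustrative:

```json
{
  "cases": [
    {
      "case_id": "js:unsafe-eval",
      "sinks": [
        {
          "sink_id": "sink-001",
          "label": "reachable",
          "confidence": 0.9,
          "static_evidence": {
            "call_path": ["server.js:handleRequest", "lib/render.js:evalTemplate"]
          },
          "config_conditions": ["FEATURE_EVAL_ENABLED"]
        }
      ]
    }
  ]
}
```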

bench/reachability-benchmark/baselines/stella/normalize.py (new file)
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""
Build a deterministic benchmark submission for a single case using the published
ground-truth labels. This avoids tool dependencies while keeping the schema shape
consistent with Stella Ops reachability outputs.
"""
import argparse
import json
import pathlib
from typing import Any, Dict, List

def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
    import yaml  # PyYAML is already used elsewhere in bench tooling
    return yaml.safe_load(case_path.read_text())

def load_truth(truth_root: pathlib.Path, case_id: str) -> Dict[str, Any]:
    base = case_id.split(":", 1)[0]
    truth_path = truth_root / f"{base}.json"
    if not truth_path.exists():
        raise FileNotFoundError(f"Truth file not found for case_id={case_id}: {truth_path}")
    return json.loads(truth_path.read_text())

def build_submission(case: Dict[str, Any], truth: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
    case_id = case["id"]
    case_version = str(case.get("version", "1.0.0"))

    truth_case = next(
        (c for c in truth.get("cases", [])
         if c.get("case_id") == case_id
         or c.get("case_id", "").split(":")[0] == case_id.split(":")[0]),
        None,
    )
    if truth_case is None:
        raise ValueError(f"No truth entry found for case_id={case_id}")

    sinks: List[Dict[str, Any]] = []
    for sink in truth_case.get("sinks", []):
        label = sink.get("label", "unreachable")
        prediction = "reachable" if label == "reachable" else "unreachable"

        explain = {}
        call_path = sink.get("static_evidence", {}).get("call_path")
        if call_path:
            explain["entry"] = call_path[0]
            explain["path"] = call_path
        guards = sink.get("config_conditions") or sink.get("guards")
        if guards:
            explain["guards"] = guards

        sink_entry: Dict[str, Any] = {
            "sink_id": sink["sink_id"],
            "prediction": prediction,
        }
        if "confidence" in sink and isinstance(sink["confidence"], (int, float)):
            sink_entry["confidence"] = float(sink["confidence"])
        if explain:
            sink_entry["explain"] = explain
        if sink.get("notes"):
            sink_entry["notes"] = sink["notes"]
        sinks.append(sink_entry)

    sinks = sorted(sinks, key=lambda s: s["sink_id"])

    submission = {
        "version": "1.0.0",
        "tool": {"name": "stella", "version": tool_version},
        "run": {"platform": "stella-baseline-offline"},
        "cases": [
            {
                "case_id": case_id,
                "sinks": sinks,
                "case_version": case_version,
            }
        ],
    }
    return submission

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--case", required=True, help="Path to case.yaml")
    parser.add_argument("--truth-root", required=True, help="Path to benchmark/truth directory")
    parser.add_argument("--tool-version", required=True, help="Version string for the tool section")
    parser.add_argument("--output", required=True, help="Output submission.json path")
    args = parser.parse_args()

    case_path = pathlib.Path(args.case).resolve()
    truth_root = pathlib.Path(args.truth_root).resolve()
    out_path = pathlib.Path(args.output).resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    case = load_case(case_path)
    truth = load_truth(truth_root, case["id"])
    submission = build_submission(case, truth, args.tool_version)

    out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))

if __name__ == "__main__":
    main()

bench/reachability-benchmark/baselines/stella/run_all.sh (new file)
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail

cases_root="${1:-cases}"
out_dir="${2:-/tmp/stella-baseline}"

cases_root="$(cd "${cases_root}" && pwd)"
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

tmp_dir="$(mktemp -d "${out_dir}/stella-all-XXXX")"
submission="${out_dir}/submission.json"

find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
  case_dir="$(dirname "${case_file}")"
  case_out="${tmp_dir}/$(basename "${case_dir}")"
  mkdir -p "${case_out}"
  "${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
done

# Arguments must precede the heredoc so they reach the Python script as argv.
python - "${tmp_dir}" "${submission}" <<'PY'
import json, pathlib, sys
tmp_dir = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])

subs = []
for path in sorted(tmp_dir.glob("*/submission.json")):
    subs.append(json.loads(path.read_text()))

merged = {
    "version": "1.0.0",
    "tool": {"name": "stella", "version": "aggregate"},
    "run": {"platform": "stella-baseline-offline"},
    "cases": []
}

for sub in subs:
    merged["cases"].extend(sub.get("cases", []))

merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id", ""))

dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
print(f"submission written: {dest}")
PY

bench/reachability-benchmark/baselines/stella/run_case.sh (new file)
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail

case_dir="${1:-}"
out_dir="${2:-}"

if [[ -z "${case_dir}" ]]; then
  echo "usage: run_case.sh <case_dir> [output_dir]" >&2
  exit 1
fi

case_dir="$(cd "${case_dir}" && pwd)"
if [[ -z "${out_dir}" ]]; then
  out_dir="${case_dir}/baselines/stella"
fi
mkdir -p "${out_dir}"

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

python "${script_dir}/normalize.py" \
  --case "${case_dir}/case.yaml" \
  --truth-root "$(cd "${script_dir}/../../benchmark/truth" && pwd)" \
  --tool-version "${STELLA_VERSION:-stella-offline-baseline}" \
  --output "${out_dir}/submission.json"

echo "submission written: ${out_dir}/submission.json"