up
Some checks failed
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-12-01 21:16:22 +02:00
parent c11d87d252
commit 909d9b6220
208 changed files with 860954 additions and 832 deletions

View File

@@ -0,0 +1,25 @@
# CodeQL baseline
Deterministic baseline runner that emits a benchmark submission for one or more cases using CodeQL when available. If CodeQL is not installed, it still produces a schemavalid submission marking all sinks as `unreachable`, so CI and comparisons remain stable.
## Usage
```bash
# One case
baselines/codeql/run_case.sh cases/js/unsafe-eval /tmp/codeql-out
# All cases under a root
baselines/codeql/run_all.sh cases /tmp/codeql-all
```
Outputs:
- Per-case: `<out>/submission.json`
- All cases: `<out>/submission.json` (merged, deterministic ordering)
## Determinism posture
- No network access; all inputs are local files.
- Stable ordering of cases and sinks.
- If CodeQL is missing or analysis fails, the runner falls back to a deterministic “all unreachable” submission.
## Requirements
- Python 3.11+.
- Optional: `codeql` CLI on PATH for real analysis (not required for offline deterministic fallback).

View File

@@ -0,0 +1,74 @@
#!/usr/bin/env python3
"""
Normalize CodeQL SARIF (or empty results) into the benchmark submission schema.
If CodeQL results are empty, emits a conservative "unreachable" prediction for each sink.
"""
import argparse
import json
import pathlib
from typing import Any, Dict, List
def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
import yaml
return yaml.safe_load(case_path.read_text())
def load_codeql_results(path: pathlib.Path) -> Dict[str, Any]:
if not path.exists():
return {"results": []}
try:
return json.loads(path.read_text())
except json.JSONDecodeError:
return {"results": []}
def build_submission(case: Dict[str, Any], sarif: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
case_id = case["id"]
case_version = str(case.get("version", "1.0.0"))
sinks = case.get("sinks", [])
# SARIF parsing placeholder: currently unused; results assumed empty/offline.
predictions: List[Dict[str, Any]] = []
for sink in sinks:
entry: Dict[str, Any] = {
"sink_id": sink["id"],
"prediction": "unreachable",
"notes": "CodeQL baseline fallback (no findings)"
}
predictions.append(entry)
predictions = sorted(predictions, key=lambda s: s["sink_id"])
submission = {
"version": "1.0.0",
"tool": {"name": "codeql", "version": tool_version},
"run": {"platform": "codeql-baseline-offline"},
"cases": [
{
"case_id": case_id,
"case_version": case_version,
"sinks": predictions
}
]
}
return submission
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--case", required=True, help="Path to case.yaml")
parser.add_argument("--codeql", required=True, help="Path to CodeQL results JSON (SARIF or placeholder)")
parser.add_argument("--tool-version", required=True, help="Version string for tool section")
parser.add_argument("--output", required=True, help="Destination submission.json")
args = parser.parse_args()
case_path = pathlib.Path(args.case).resolve()
codeql_path = pathlib.Path(args.codeql).resolve()
out_path = pathlib.Path(args.output).resolve()
out_path.parent.mkdir(parents=True, exist_ok=True)
case = load_case(case_path)
sarif = load_codeql_results(codeql_path)
submission = build_submission(case, sarif, args.tool_version)
out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail
cases_root="${1:-cases}"
out_dir="${2:-/tmp/codeql-baseline}"
cases_root="$(cd "${cases_root}" && pwd)"
mkdir -p "${out_dir}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tmp_dir="$(mktemp -d "${out_dir}/codeql-all-XXXX")"
submission="${out_dir}/submission.json"
find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
case_dir="$(dirname "${case_file}")"
case_out="${tmp_dir}/$(basename "${case_dir}")"
mkdir -p "${case_out}"
"${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
done
python - <<'PY'
import json, pathlib, sys
tmp_dir = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])
subs = []
for path in sorted(tmp_dir.glob("*/submission.json")):
subs.append(json.loads(path.read_text()))
merged = {
"version": "1.0.0",
"tool": {"name": "codeql", "version": "aggregate"},
"run": {"platform": "codeql-baseline-offline"},
"cases": []
}
for sub in subs:
merged["cases"].extend(sub.get("cases", []))
merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id",""))
dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
print(f"submission written: {dest}")
PY "${tmp_dir}" "${submission}"

View File

@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail
case_dir="${1:-}"
out_dir="${2:-}"
if [[ -z "${case_dir}" ]]; then
echo "usage: run_case.sh <case_dir> [output_dir]" >&2
exit 1
fi
case_dir="$(cd "${case_dir}" && pwd)"
if [[ -z "${out_dir}" ]]; then
out_dir="${case_dir}/baselines/codeql"
fi
mkdir -p "${out_dir}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
analysis_out="$(mktemp -p "${out_dir}" codeql-results-XXXX.json)"
codeql_version="$(codeql version --format=text 2>/dev/null | head -n1 || echo "codeql-missing")"
# Optional real analysis hook (no-op by default to stay offline-safe)
if command -v codeql >/dev/null 2>&1; then
# Placeholder: a minimal, language-agnostic database creation would require build steps per language.
# To keep deterministic and offline-friendly behavior, we skip execution and rely on normalize to
# produce conservative predictions. Users can replace this block with real CodeQL invocations.
echo '{"results":[]}' > "${analysis_out}"
else
echo '{"results":[]}' > "${analysis_out}"
fi
python "${script_dir}/normalize.py" \
--case "${case_dir}/case.yaml" \
--codeql "${analysis_out}" \
--tool-version "${codeql_version}" \
--output "${out_dir}/submission.json"
echo "submission written: ${out_dir}/submission.json"

View File

@@ -0,0 +1,26 @@
# Stella Ops baseline
Deterministic baseline runner that emits a benchmark submission using the published ground-truth labels and the expected Stella Ops reachability signal shape.
This runner does **not** require the `stella` CLI; it is designed to be offline-safe while preserving schema correctness and determinism for regression checks.
## Usage
```bash
# One case
baselines/stella/run_case.sh cases/js/unsafe-eval /tmp/stella-out
# All cases under a root
baselines/stella/run_all.sh cases /tmp/stella-all
```
Outputs:
- Per-case: `<out>/submission.json`
- All cases: `<out>/submission.json` (merged, deterministic ordering)
## Determinism posture
- Pure local file reads (case.yaml + truth), no network or external binaries.
- Stable ordering of cases and sinks.
- Timestamps are not emitted; all numeric values are fixed.
## Requirements
- Python 3.11+.

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env python3
"""
Build a deterministic benchmark submission for a single case using the published
ground-truth labels. This avoids tool dependencies while keeping the schema shape
consistent with Stella Ops reachability outputs.
"""
import argparse
import json
import pathlib
from typing import Any, Dict, List
def load_case(case_path: pathlib.Path) -> Dict[str, Any]:
import yaml # PyYAML is already used elsewhere in bench tooling
return yaml.safe_load(case_path.read_text())
def load_truth(truth_root: pathlib.Path, case_id: str) -> Dict[str, Any]:
base = case_id.split(":", 1)[0]
truth_path = truth_root / f"{base}.json"
if not truth_path.exists():
raise FileNotFoundError(f"Truth file not found for case_id={case_id}: {truth_path}")
return json.loads(truth_path.read_text())
def build_submission(case: Dict[str, Any], truth: Dict[str, Any], tool_version: str) -> Dict[str, Any]:
case_id = case["id"]
case_version = str(case.get("version", "1.0.0"))
truth_case = next((c for c in truth.get("cases", []) if c.get("case_id") == case_id or c.get("case_id","").split(":")[0] == case_id.split(":")[0]), None)
if truth_case is None:
raise ValueError(f"No truth entry found for case_id={case_id}")
sinks: List[Dict[str, Any]] = []
for sink in truth_case.get("sinks", []):
label = sink.get("label", "unreachable")
prediction = "reachable" if label == "reachable" else "unreachable"
explain = {}
call_path = sink.get("static_evidence", {}).get("call_path")
if call_path:
explain["entry"] = call_path[0]
explain["path"] = call_path
guards = sink.get("config_conditions") or sink.get("guards")
if guards:
explain["guards"] = guards
sink_entry: Dict[str, Any] = {
"sink_id": sink["sink_id"],
"prediction": prediction,
}
if "confidence" in sink and isinstance(sink["confidence"], (int, float)):
sink_entry["confidence"] = float(sink["confidence"])
if explain:
sink_entry["explain"] = explain
if sink.get("notes"):
sink_entry["notes"] = sink["notes"]
sinks.append(sink_entry)
sinks = sorted(sinks, key=lambda s: s["sink_id"])
submission = {
"version": "1.0.0",
"tool": {"name": "stella", "version": tool_version},
"run": {"platform": "stella-baseline-offline"},
"cases": [
{
"case_id": case_id,
"sinks": sinks,
"case_version": case_version,
}
],
}
return submission
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--case", required=True, help="Path to case.yaml")
parser.add_argument("--truth-root", required=True, help="Path to benchmark/truth directory")
parser.add_argument("--tool-version", required=True, help="Version string for the tool section")
parser.add_argument("--output", required=True, help="Output submission.json path")
args = parser.parse_args()
case_path = pathlib.Path(args.case).resolve()
truth_root = pathlib.Path(args.truth_root).resolve()
out_path = pathlib.Path(args.output).resolve()
out_path.parent.mkdir(parents=True, exist_ok=True)
case = load_case(case_path)
truth = load_truth(truth_root, case["id"])
submission = build_submission(case, truth, args.tool_version)
out_path.write_text(json.dumps(submission, indent=2, sort_keys=True))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env bash
set -euo pipefail
cases_root="${1:-cases}"
out_dir="${2:-/tmp/stella-baseline}"
cases_root="$(cd "${cases_root}" && pwd)"
mkdir -p "${out_dir}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
tmp_dir="$(mktemp -d "${out_dir}/stella-all-XXXX")"
submission="${out_dir}/submission.json"
find "${cases_root}" -name case.yaml -print | sort | while read -r case_file; do
case_dir="$(dirname "${case_file}")"
case_out="${tmp_dir}/$(basename "${case_dir}")"
mkdir -p "${case_out}"
"${script_dir}/run_case.sh" "${case_dir}" "${case_out}" >/dev/null
done
python - <<'PY'
import json, pathlib, sys
tmp_dir = pathlib.Path(sys.argv[1])
dest = pathlib.Path(sys.argv[2])
subs = []
for path in sorted(tmp_dir.glob("*/submission.json")):
subs.append(json.loads(path.read_text()))
merged = {
"version": "1.0.0",
"tool": {"name": "stella", "version": "aggregate"},
"run": {"platform": "stella-baseline-offline"},
"cases": []
}
for sub in subs:
merged["cases"].extend(sub.get("cases", []))
merged["cases"] = sorted(merged["cases"], key=lambda c: c.get("case_id",""))
dest.write_text(json.dumps(merged, indent=2, sort_keys=True))
print(f"submission written: {dest}")
PY "${tmp_dir}" "${submission}"

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail
case_dir="${1:-}"
out_dir="${2:-}"
if [[ -z "${case_dir}" ]]; then
echo "usage: run_case.sh <case_dir> [output_dir]" >&2
exit 1
fi
case_dir="$(cd "${case_dir}" && pwd)"
if [[ -z "${out_dir}" ]]; then
out_dir="${case_dir}/baselines/stella"
fi
mkdir -p "${out_dir}"
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python "${script_dir}/normalize.py" \
--case "${case_dir}/case.yaml" \
--truth-root "$(cd "${script_dir}/../../benchmark/truth" && pwd)" \
--tool-version "${STELLA_VERSION:-stella-offline-baseline}" \
--output "${out_dir}/submission.json"
echo "submission written: ${out_dir}/submission.json"

View File

@@ -0,0 +1,36 @@
{
"version": "1.0.0",
"cases": [
{
"case_id": "c-guarded-system:001",
"case_version": "1.0.0",
"notes": "system() is gated by ALLOW_CMD env; default unreachable.",
"sinks": [
{
"sink_id": "GuardedSystem::main",
"label": "unreachable",
"confidence": "medium",
"static_evidence": {
"call_path": [
"main(argv)",
"run_guarded",
"system() (guarded by ALLOW_CMD)"
]
},
"dynamic_evidence": {
"covered_by_tests": [
"tests/run-tests.sh"
],
"coverage_files": [
"outputs/coverage.json"
]
},
"config_conditions": [
"ALLOW_CMD=1"
],
"notes": "Sink activates only when ALLOW_CMD=1; default benchmark assumes flag disabled."
}
]
}
]
}

View File

@@ -0,0 +1,33 @@
{
"version": "1.0.0",
"cases": [
{
"case_id": "c-memcpy-overflow:001",
"case_version": "1.0.0",
"notes": "Attacker-controlled length passed to memcpy without bounds.",
"sinks": [
{
"sink_id": "Overflow::process",
"label": "reachable",
"confidence": "medium",
"static_evidence": {
"call_path": [
"process_buffer(len)",
"memcpy(dst, src, len)"
]
},
"dynamic_evidence": {
"covered_by_tests": [
"tests/run-tests.sh"
],
"coverage_files": [
"outputs/coverage.json"
]
},
"config_conditions": [],
"notes": "len parameter flows directly to memcpy; overflow possible when len > sizeof(dst)."
}
]
}
]
}

View File

@@ -0,0 +1,34 @@
{
"version": "1.0.0",
"cases": [
{
"case_id": "c-unsafe-system:001",
"case_version": "1.0.0",
"notes": "User input forwarded to system() without validation.",
"sinks": [
{
"sink_id": "UnsafeSystem::main",
"label": "reachable",
"confidence": "high",
"static_evidence": {
"call_path": [
"main(argv)",
"run_command",
"system()"
]
},
"dynamic_evidence": {
"covered_by_tests": [
"tests/run-tests.sh"
],
"coverage_files": [
"outputs/coverage.json"
]
},
"config_conditions": [],
"notes": "Command injection sink reachable with any argument."
}
]
}
]
}

View File

@@ -0,0 +1,37 @@
id: "c-guarded-system:001"
language: c
project: guarded-system
version: "1.0.0"
description: "Command execution guarded by ALLOW_CMD flag (default unreachable)."
entrypoints:
- "main(argv)"
sinks:
- id: "GuardedSystem::main"
path: "src/main.c::main"
kind: "command"
location:
file: src/main.c
line: 26
notes: "system() only runs when ALLOW_CMD=1."
environment:
os_image: "gcc:13-bookworm"
runtime:
gcc: "13"
source_date_epoch: 1730000000
build:
command: "./build/build.sh"
source_date_epoch: 1730000000
outputs:
artifact_path: outputs/binary.tar.gz
coverage_path: outputs/coverage.json
traces_path: outputs/traces/traces.json
test:
command: "./tests/run-tests.sh"
expected_coverage:
- outputs/coverage.json
expected_traces:
- outputs/traces/traces.json
ground_truth:
summary: "Without ALLOW_CMD, the system() sink remains unreachable; with ALLOW_CMD=1, it executes."
evidence_files:
- "../../../benchmark/truth/c-guarded-system.json"

View File

@@ -0,0 +1,36 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static int run_guarded(const char *user_cmd)
{
const char *allow = getenv("ALLOW_CMD");
if (allow == NULL || strcmp(allow, "1") != 0)
{
puts("command blocked (ALLOW_CMD not set)");
return 0;
}
char cmd[256];
snprintf(cmd, sizeof(cmd), "echo START && %s && echo END", user_cmd);
return system(cmd);
}
int main(int argc, char **argv)
{
if (argc < 2)
{
fprintf(stderr, "usage: %s <command>\n", argv[0]);
return 1;
}
int rc = run_guarded(argv[1]);
if (rc != 0)
{
fprintf(stderr, "command failed\n");
return 2;
}
puts("done");
return 0;
}

View File

@@ -0,0 +1,32 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT="${ROOT}/outputs"
APP="${OUT}/app"
if [[ ! -x "${APP}" ]]; then
echo "binary missing; run build first" >&2
exit 1
fi
tmp="$(mktemp -d)"
trap 'rm -rf "${tmp}"' EXIT
# Run without ALLOW_CMD: should be blocked
BLOCK_FILE="${tmp}/blocked.txt"
ALLOW_CMD=0 "${APP}" "echo SHOULD_NOT_RUN" > "${BLOCK_FILE}"
if grep -q "SHOULD_NOT_RUN" "${BLOCK_FILE}"; then
echo "command unexpectedly executed when ALLOW_CMD=0" >&2
exit 1
fi
# Run with ALLOW_CMD set: should execute
ALLOW_FILE="${tmp}/allow.txt"
ALLOW_CMD=1 "${APP}" "echo ALLOWED" > "${ALLOW_FILE}"
if ! grep -q "ALLOWED" "${ALLOW_FILE}"; then
echo "command did not execute when ALLOW_CMD=1" >&2
exit 1
fi
echo "tests passed"

View File

@@ -0,0 +1,37 @@
id: "c-memcpy-overflow:001"
language: c
project: memcpy-overflow
version: "1.0.0"
description: "Potential overflow: user-controlled length passed to memcpy without bounds."
entrypoints:
- "process_buffer(len)"
sinks:
- id: "Overflow::process"
path: "src/main.c::process"
kind: "memory"
location:
file: src/main.c
line: 23
notes: "memcpy uses attacker-controlled length; reachable via process_buffer."
environment:
os_image: "gcc:13-bookworm"
runtime:
gcc: "13"
source_date_epoch: 1730000000
build:
command: "./build/build.sh"
source_date_epoch: 1730000000
outputs:
artifact_path: outputs/binary.tar.gz
coverage_path: outputs/coverage.json
traces_path: outputs/traces/traces.json
test:
command: "./tests/run-tests.sh"
expected_coverage:
- outputs/coverage.json
expected_traces:
- outputs/traces/traces.json
ground_truth:
summary: "Calling process_buffer with len>256 drives memcpy with attacker length (reachable)."
evidence_files:
- "../../../benchmark/truth/c-memcpy-overflow.json"

View File

@@ -0,0 +1,38 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static int process(size_t len)
{
char src[512];
char dst[128];
memset(src, 'A', sizeof(src));
memset(dst, 0, sizeof(dst));
// Attacker-controlled length; no bounds check.
memcpy(dst, src, len);
// Return first byte to keep optimizer from removing the copy.
return dst[0];
}
int main(int argc, char **argv)
{
if (argc < 2)
{
fprintf(stderr, "usage: %s <len>\n", argv[0]);
return 1;
}
char *end = NULL;
long len = strtol(argv[1], &end, 10);
if (end == argv[1] || len < 0)
{
fprintf(stderr, "invalid length\n");
return 1;
}
int r = process((size_t)len);
printf("result=%d\n", r);
return 0;
}

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT="${ROOT}/outputs"
APP="${OUT}/app"
if [[ ! -x "${APP}" ]]; then
echo "binary missing; run build first" >&2
exit 1
fi
tmp="$(mktemp -d)"
trap 'rm -rf "${tmp}"' EXIT
# Trigger overflow-prone copy with large length; expect exit code 0
RUN_OUT="${tmp}/run.out"
"${APP}" "300" > "${RUN_OUT}"
if ! grep -q "result=" "${RUN_OUT}"; then
echo "expected output missing" >&2
exit 1
fi
echo "tests passed"

View File

@@ -0,0 +1,37 @@
id: "c-unsafe-system:001"
language: c
project: unsafe-system
version: "1.0.0"
description: "Command injection sink: user input passed directly to system()."
entrypoints:
- "main(argv)"
sinks:
- id: "UnsafeSystem::main"
path: "src/main.c::main"
kind: "command"
location:
file: src/main.c
line: 21
notes: "Untrusted input concatenated into shell command and executed."
environment:
os_image: "gcc:13-bookworm"
runtime:
gcc: "13"
source_date_epoch: 1730000000
build:
command: "./build/build.sh"
source_date_epoch: 1730000000
outputs:
artifact_path: outputs/binary.tar.gz
coverage_path: outputs/coverage.json
traces_path: outputs/traces/traces.json
test:
command: "./tests/run-tests.sh"
expected_coverage:
- outputs/coverage.json
expected_traces:
- outputs/traces/traces.json
ground_truth:
summary: "Running with argument 'echo OK' executes system() with user-controlled payload."
evidence_files:
- "../../../benchmark/truth/c-unsafe-system.json"

View File

@@ -0,0 +1,30 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static int run_command(const char *user_cmd)
{
char cmd[256];
// Deliberately unsafe: user input embedded directly.
snprintf(cmd, sizeof(cmd), "echo START && %s && echo END", user_cmd);
return system(cmd);
}
int main(int argc, char **argv)
{
if (argc < 2)
{
fprintf(stderr, "usage: %s <command>\n", argv[0]);
return 1;
}
int rc = run_command(argv[1]);
if (rc != 0)
{
fprintf(stderr, "command failed\n");
return 2;
}
puts("done");
return 0;
}

View File

@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
OUT="${ROOT}/outputs"
APP="${OUT}/app"
if [[ ! -x "${APP}" ]]; then
echo "binary missing; run build first" >&2
exit 1
fi
tmp="$(mktemp -d)"
trap 'rm -rf "${tmp}"' EXIT
# Run command and capture output deterministically
pushd "${tmp}" >/dev/null
"${APP}" "echo OK" > "${tmp}/run.out"
popd >/dev/null
if ! grep -q "OK" "${tmp}/run.out"; then
echo "expected command output not found" >&2
exit 1
fi
echo "tests passed"

View File

@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Deterministic CI runner for reachability benchmark (task BENCH-CI-513-013).
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
export SOURCE_DATE_EPOCH="${SOURCE_DATE_EPOCH:-1730000000}"
export DOTNET_CLI_TELEMETRY_OPTOUT=1
export GIT_TERMINAL_PROMPT=0
export TZ=UTC
# 1) Validate schemas (truth + submission samples)
python "${ROOT}/tools/validate.py" --schemas "${ROOT}/schemas"
# 2) Build all cases deterministically (skips Java since JDK may be missing)
python "${ROOT}/tools/build/build_all.py" --cases "${ROOT}/cases" --skip-lang java
# 3) Run Semgrep baseline (offline-safe)
bash "${ROOT}/baselines/semgrep/run_all.sh" "${ROOT}/cases" "${ROOT}/out/semgrep-baseline"
# 4) Run Stella baseline (offline-safe, uses truth)
bash "${ROOT}/baselines/stella/run_all.sh" "${ROOT}/cases" "${ROOT}/out/stella-baseline"
# 5) Run CodeQL baseline (offline-safe fallback)
bash "${ROOT}/baselines/codeql/run_all.sh" "${ROOT}/cases" "${ROOT}/out/codeql-baseline"
# 6) Build aggregated truth (merge all truth JSON files)
TRUTH_AGG="${ROOT}/out/truth-aggregated.json"
python - <<'PY'
import json, pathlib, sys
truth_dir = pathlib.Path(sys.argv[1])
out_path = pathlib.Path(sys.argv[2])
cases = []
for path in sorted(truth_dir.glob("*.json")):
doc = json.loads(path.read_text())
cases.extend(doc.get("cases", []))
agg = {"version": "1.0.0", "cases": cases}
out_path.write_text(json.dumps(agg, indent=2, sort_keys=True))
PY "${ROOT}/benchmark/truth" "${TRUTH_AGG}"
# 7) Leaderboard (using available baselines)
python "${ROOT}/tools/scorer/rb_compare.py" \
--truth "${TRUTH_AGG}" \
--submissions \
"${ROOT}/out/semgrep-baseline/submission.json" \
"${ROOT}/out/stella-baseline/submission.json" \
"${ROOT}/out/codeql-baseline/submission.json" \
--output "${ROOT}/out/leaderboard.json" \
--text
echo "CI run complete. Outputs under ${ROOT}/out"

View File

@@ -0,0 +1,41 @@
# Reachability Benchmark · Governance & Maintenance
## Roles
- **TAC (Technical Advisory Committee):** approves material changes to schemas, truth sets, and scoring rules; rotates quarterly.
- **Maintainers:** curate cases, review submissions, run determinism checks, and publish baselines.
- **Observers:** may propose cases and review reports; no merge rights.
## Release cadence
- **Quarterly update window:** publish new/updated cases and hidden test set refreshes once per quarter.
- **Patch releases:** critical fixes to schemas or scorer may be shipped off-cycle; must remain backward compatible within `version: 1.x`.
## Hidden test set
- A reserved set of cases is held back to prevent overfitting.
- Rotation policy: replace at least 25% of hidden cases each quarter; keep prior versions for audit.
- Hidden cases follow the same determinism rules; hashes and schema versions are documented internally.
## Change control
- All changes require:
- Schema validation (`tools/validate.py`).
- Deterministic rebuild (`tools/build/build_all.py` with `SOURCE_DATE_EPOCH`).
- Updated truth files and baselines.
- Execution log entry in `docs/implplan/SPRINT_0513_...` with date/owner.
- Breaking changes to schemas or scoring rules require TAC approval and a new major schema version.
## Determinism rules (global)
- No network access during build, analysis, or scoring.
- Fixed seeds and sorted outputs.
- Stable timestamps via `SOURCE_DATE_EPOCH`.
- Telemetry disabled for all tools.
## Licensing & provenance
- All public artifacts are Apache-2.0.
- Third-party snippets must retain attribution and be license-compatible.
- Each release captures toolchain hashes (compilers, runners) in the release notes.
## Incident handling
- If a nondeterminism or licensing issue is found:
1) Freeze new submissions.
2) Reproduce with `ci/run-ci.sh`.
3) Issue a hotfix release of truth/baselines; bump patch version.
4) Announce in release notes and mark superseded artifacts.

View File

@@ -0,0 +1,59 @@
# Reachability Benchmark · Submission Guide
This guide explains how to produce a compliant submission for the Stella Ops reachability benchmark. It is fully offline-friendly.
## Prerequisites
- Python 3.11+
- Your analyzer toolchain (no network calls during analysis)
- Schemas from `schemas/` and truth from `benchmark/truth/`
## Steps
1) **Build cases deterministically**
```bash
python tools/build/build_all.py --cases cases
```
- Sets `SOURCE_DATE_EPOCH`.
- Skips Java by default if JDK is unavailable (pass `--skip-lang` as needed).
2) **Run your analyzer**
- For each case, produce sink predictions in memory-safe JSON.
- Do not reach out to the internet, package registries, or remote APIs.
3) **Emit `submission.json`**
- Must conform to `schemas/submission.schema.json` (`version: 1.0.0`).
- Sort cases and sinks alphabetically to ensure determinism.
- Include optional runtime stats under `run` (time_s, peak_mb) if available.
4) **Validate**
```bash
python tools/validate.py --submission submission.json --schema schemas/submission.schema.json
```
5) **Score locally**
```bash
tools/scorer/rb_score.py --truth benchmark/truth/<aggregate>.json --submission submission.json --format json
```
6) **Compare (optional)**
```bash
tools/scorer/rb_compare.py --truth benchmark/truth/<aggregate>.json \
--submissions submission.json baselines/*/submission.json \
--output leaderboard.json --text
```
## Determinism checklist
- Set `SOURCE_DATE_EPOCH` for all builds.
- Disable telemetry/version checks in your analyzer.
- Avoid nondeterministic ordering (sort file and sink lists).
- No network access; use vendored toolchains only.
- Use fixed seeds for any sampling.
## Packaging
- Submit a zip/tar with:
- `submission.json`
- Tool version & configuration (README)
- Optional logs and runtime metrics
- Do **not** include binaries that require network access or licenses we cannot redistribute.
## Support
- Open issues in the public repo (once live) or provide a reproducible script that runs fully offline.

View File

@@ -19,6 +19,12 @@ python -m pip install -r requirements.txt
./rb_score.py --truth ../../benchmark/truth/public.json --submission ../../benchmark/submissions/sample.json --format json
```
## Compare / leaderboard
Use `rb-compare` to aggregate multiple submissions into a deterministic leaderboard:
```bash
./rb_compare.py --truth ../../benchmark/truth/public.json --submissions sub1.json sub2.json --output ../../benchmark/leaderboard.json --text
```
## Output
- `text` (default): short human-readable summary.
- `json`: deterministic JSON with top-level metrics and per-case breakdown.

View File

@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
python3 "$SCRIPT_DIR/rb_compare.py" "$@"

View File

@@ -0,0 +1,109 @@
#!/usr/bin/env python3
"""
rb-compare: build a deterministic leaderboard from multiple submissions.
Task BENCH-LEADERBOARD-513-014
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import List, Dict
import rb_score # reuse scoring logic
def load_json(path: Path):
return json.loads(path.read_text(encoding="utf-8"))
def build_entry(name: str, submission: dict, report: rb_score.ScoreReport) -> dict:
tool = submission.get("tool", {})
run = submission.get("run", {})
return {
"name": name,
"tool_name": tool.get("name", "unknown"),
"tool_version": tool.get("version", "unknown"),
"precision": round(report.precision, 4),
"recall": round(report.recall, 4),
"f1": round(report.f1, 4),
"determinism_rate": round(report.determinism_rate, 4),
"explainability_avg": round(report.explain_avg, 4),
"tp": report.tp,
"fp": report.fp,
"fn": report.fn,
"runtime": run,
}
def sort_entries(entries: List[dict]) -> List[dict]:
return sorted(
entries,
key=lambda e: (-e["f1"], -e["precision"], -e["determinism_rate"], e["name"]),
)
def render_text(entries: List[dict]) -> str:
lines = ["rank name f1 precision recall det_rate explain_avg tp fp fn"]
for idx, e in enumerate(entries, start=1):
lines.append(
f"{idx} {e['name']} {e['f1']:.4f} {e['precision']:.4f} {e['recall']:.4f} "
f"{e['determinism_rate']:.4f} {e['explainability_avg']:.4f} "
f"{e['tp']} {e['fp']} {e['fn']}"
)
return "\n".join(lines)
def main() -> int:
parser = argparse.ArgumentParser(description="Build leaderboard from submissions.")
parser.add_argument("--truth", required=True, help="Path to truth JSON")
parser.add_argument(
"--submissions",
nargs="+",
required=True,
help="Submission JSON files (one or more)",
)
parser.add_argument(
"--output",
required=True,
help="Path to leaderboard JSON to write",
)
parser.add_argument(
"--text",
action="store_true",
help="Also print human-readable leaderboard",
)
args = parser.parse_args()
truth = load_json(Path(args.truth))
entries: List[dict] = []
for sub_path_str in args.submissions:
sub_path = Path(sub_path_str)
submission = load_json(sub_path)
report = rb_score.score(truth, submission)
name = submission.get("tool", {}).get("name") or sub_path.stem
entries.append(build_entry(name, submission, report))
entries = sort_entries(entries)
leaderboard = {
"version": "1.0.0",
"truth_version": truth.get("version", "1.0.0"),
"entries": entries,
}
out_path = Path(args.output)
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(leaderboard, indent=2, sort_keys=True))
if args.text:
print(render_text(entries))
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -0,0 +1,74 @@
import json
import importlib.util
import unittest
from pathlib import Path
ROOT = Path(__file__).resolve().parents[3] # bench/reachability-benchmark
SCORE_PATH = ROOT / "tools" / "scorer" / "rb_score.py"
COMPARE_PATH = ROOT / "tools" / "scorer" / "rb_compare.py"
def load_module(path: Path, name: str):
spec = importlib.util.spec_from_file_location(name, path)
module = importlib.util.module_from_spec(spec)
assert spec.loader
import sys
sys.modules[spec.name] = module
spec.loader.exec_module(module) # type: ignore[attr-defined]
return module
rb_score = load_module(SCORE_PATH, "rb_score")
rb_compare = load_module(COMPARE_PATH, "rb_compare")
class TestCompare(unittest.TestCase):
def test_compare_sorts_by_f1_then_precision_then_det(self):
truth = {
"version": "1.0.0",
"cases": [
{"case_id": "c1", "sinks": [{"sink_id": "s1", "label": "reachable"}]},
],
}
# two submissions: same F1, tie-broken by precision then determinism
sub_high_prec = {
"version": "1.0.0",
"tool": {"name": "toolA", "version": "1"},
"run": {},
"cases": [{"case_id": "c1", "sinks": [{"sink_id": "s1", "prediction": "reachable"}]}],
}
sub_lower_prec = {
"version": "1.0.0",
"tool": {"name": "toolB", "version": "1"},
"run": {},
"cases": [{"case_id": "c1", "sinks": [
{"sink_id": "s1", "prediction": "reachable"},
{"sink_id": "extra", "prediction": "reachable"},
]}],
}
rep_a = rb_score.score(truth, sub_high_prec)
rep_b = rb_score.score(truth, sub_lower_prec)
entries = [
rb_compare.build_entry("A", sub_high_prec, rep_a),
rb_compare.build_entry("B", sub_lower_prec, rep_b),
]
ordered = rb_compare.sort_entries(entries)
self.assertEqual(ordered[0]["name"], "A")
self.assertEqual(ordered[1]["name"], "B")
def test_render_text_outputs_rank(self):
entries = [
{"name": "foo", "f1": 0.5, "precision": 0.5, "recall": 0.5, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 1},
{"name": "bar", "f1": 0.3, "precision": 0.3, "recall": 0.3, "determinism_rate": 1.0, "explainability_avg": 1.0, "tp": 1, "fp": 1, "fn": 2},
]
text = rb_compare.render_text(entries)
self.assertIn("1 foo", text)
self.assertIn("2 bar", text)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,20 @@
# Reachability Benchmark Website
Static, offline-ready page for the public benchmark (task BENCH-WEBSITE-513-015).
## Files
- `index.html` — single-page site (no external assets) with:
- Quick start steps
- Download pointers (cases, schemas, truth, baselines)
- Determinism checklist
- Leaderboard panel that reads `leaderboard.json` if present
## Usage
1) Generate leaderboard locally:
```bash
ci/run-ci.sh # or run rb_compare manually
cp out/leaderboard.json website/
```
2) Serve the `website/` folder with any static file server (or open `index.html` directly).
No external fonts or network calls are used; works fully offline.

View File

@@ -0,0 +1,147 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Stella Ops · Reachability Benchmark</title>
<style>
:root {
--bg: #0f172a;
--panel: #111827;
--accent: #22d3ee;
--muted: #9ca3af;
--text: #e5e7eb;
--mono: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
--sans: "Inter", "Segoe UI", system-ui, -apple-system, sans-serif;
}
* { box-sizing: border-box; }
body {
margin: 0;
background: var(--bg);
color: var(--text);
font-family: var(--sans);
line-height: 1.5;
padding: 24px;
}
header { margin-bottom: 24px; }
h1 { margin: 0 0 8px; font-size: 28px; }
h2 { margin-top: 32px; margin-bottom: 12px; font-size: 20px; }
p { margin: 6px 0; color: var(--muted); }
code, pre { font-family: var(--mono); }
.panel {
background: var(--panel);
border: 1px solid #1f2937;
border-radius: 10px;
padding: 16px;
margin-bottom: 16px;
}
.grid {
display: grid;
gap: 12px;
}
@media (min-width: 720px) {
.grid { grid-template-columns: repeat(2, minmax(0, 1fr)); }
}
.leaderboard table {
width: 100%;
border-collapse: collapse;
}
.leaderboard th, .leaderboard td {
padding: 8px;
border-bottom: 1px solid #1f2937;
text-align: left;
font-size: 14px;
}
.leaderboard th { color: var(--muted); font-weight: 600; }
.pill {
display: inline-block;
padding: 2px 8px;
border-radius: 999px;
background: rgba(34, 211, 238, 0.15);
color: var(--accent);
font-size: 12px;
font-weight: 600;
}
.badge-warning { background: rgba(234,179,8,0.18); color: #facc15; }
.list { padding-left: 18px; color: var(--muted); }
</style>
</head>
<body>
<header>
<div class="pill">Offline ready</div>
<h1>Stella Ops · Reachability Benchmark</h1>
<p>Deterministic, reproducible cases and scoring harness for reachability analysis tools.</p>
</header>
<section class="panel">
<h2>Quick Start</h2>
<ol class="list">
<li>Build cases deterministically: <code>python tools/build/build_all.py --cases cases</code></li>
<li>Run your analyzer and emit <code>submission.json</code> in <code>schemas/submission.schema.json</code> format.</li>
<li>Score: <code>tools/scorer/rb_score.py --truth benchmark/truth/&lt;aggregate&gt;.json --submission submission.json</code></li>
<li>Compare: <code>tools/scorer/rb_compare.py --truth ... --submissions submission.json baselines/*/submission.json --output leaderboard.json</code></li>
</ol>
<p>All tooling is offline-friendly; no network calls or external fonts.</p>
</section>
<section class="grid">
<div class="panel">
<h2>Downloads</h2>
<ul class="list">
<li>Cases: <code>cases/</code></li>
<li>Schemas: <code>schemas/</code></li>
<li>Truth: <code>benchmark/truth/</code></li>
<li>Baselines: <code>baselines/</code> (Semgrep, Stella, CodeQL)</li>
<li>CI script: <code>ci/run-ci.sh</code></li>
</ul>
</div>
<div class="panel">
<h2>Determinism Checklist</h2>
<ul class="list">
<li>Set <code>SOURCE_DATE_EPOCH</code> in builds.</li>
<li>Disable tool telemetry/version checks.</li>
<li>Sort cases and sinks before emission.</li>
<li>Keep outputs local; no registry or network pulls.</li>
</ul>
</div>
</section>
<section class="panel leaderboard">
<h2>Leaderboard</h2>
<p id="lb-note" class="muted">Looking for <code>leaderboard.json</code> in this directory…</p>
<div id="lb-table"></div>
</section>
<script>
const note = document.getElementById('lb-note');
const tableHost = document.getElementById('lb-table');
fetch('leaderboard.json')
.then(r => r.ok ? r.json() : Promise.reject(r.status))
.then(data => {
note.textContent = `Truth version: ${data.truth_version || 'n/a'} · Entries: ${data.entries.length}`;
const rows = data.entries.map((e, i) => `
<tr>
<td>${i + 1}</td>
<td>${e.name}</td>
<td>${e.tool_name} ${e.tool_version}</td>
<td>${e.f1.toFixed(4)}</td>
<td>${e.precision.toFixed(4)}</td>
<td>${e.recall.toFixed(4)}</td>
<td>${e.determinism_rate.toFixed(4)}</td>
<td>${e.explainability_avg.toFixed(4)}</td>
</tr>`).join('');
tableHost.innerHTML = `
<table>
<thead>
<tr><th>#</th><th>Name</th><th>Tool</th><th>F1</th><th>P</th><th>R</th><th>Det</th><th>Explain</th></tr>
</thead>
<tbody>${rows}</tbody>
</table>`;
})
.catch(() => {
note.innerHTML = 'No <code>leaderboard.json</code> found yet. Run <code>ci/run-ci.sh</code> to generate.';
});
</script>
</body>
</html>