git.stella-ops.org/src/Bench/StellaOps.Bench/ImpactIndex/impact_index_bench.py
"""ImpactIndex throughput benchmark harness.
This harness replays a deterministic productKey dataset and records cold vs warm
lookup performance. It is intentionally offline-friendly and relies only on the
provided NDJSON inputs.
"""

import argparse
import gc
import hashlib
import json
import random
import statistics
import time
import tracemalloc
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Tuple

def percentile(values: List[float], pct: float) -> float:
    """Return an interpolated percentile to keep outputs deterministic."""
    if not values:
        return 0.0
    ordered = sorted(values)
    k = (len(ordered) - 1) * (pct / 100.0)
    lower = int(k)
    upper = min(lower + 1, len(ordered) - 1)
    if lower == upper:
        return float(ordered[lower])
    fraction = k - lower
    return float(ordered[lower] + (ordered[upper] - ordered[lower]) * fraction)
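
# Worked example (values are illustrative, not from the dataset): for the samples
# [10.0, 20.0, 30.0, 40.0] the 95th percentile interpolates between the two largest
# values, so percentile([10.0, 20.0, 30.0, 40.0], 95) == 38.5.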

def load_product_keys(path: Path) -> List[str]:
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line)["productKey"] for line in handle if line.strip()]

class ImpactIndexBench:
    def __init__(self, seed: int, threads: int):
        self.rng = random.Random(seed)
        self.threads = threads
        self.cache = {}
        self.cache_hits = 0
        self.cache_misses = 0

    def _compute_cost(self, product_key: str) -> int:
        digest = hashlib.blake2b(product_key.encode("utf-8"), digest_size=16).digest()
        local_rng = random.Random(hashlib.sha1(product_key.encode("utf-8")).hexdigest())
        iterations = 40 + (digest[0] % 30)
        value = 0
        for i in range(iterations):
            value ^= (digest[i % len(digest)] + i * 31) & 0xFFFFFFFF
            value ^= local_rng.randint(0, 1024)
        # Simple deterministic cost proxy
        return value

    def resolve(self, product_key: str) -> int:
        if product_key in self.cache:
            self.cache_hits += 1
            return self.cache[product_key]
        cost = self._compute_cost(product_key)
        enriched = (cost % 1000) + 1
        self.cache[product_key] = enriched
        self.cache_misses += 1
        return enriched
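
# Cold vs warm behaviour in a nutshell (the productKey below is illustrative):
# the first resolve() for a key computes and caches its value (a miss), and any
# repeat resolve() for the same key is served from the cache (a hit):
#   bench = ImpactIndexBench(seed=20250101, threads=1)
#   bench.resolve("pkg:npm/example@1.0.0")  # miss -> cache_misses == 1
#   bench.resolve("pkg:npm/example@1.0.0")  # hit  -> cache_hits == 1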

def run_pass(pass_name: str, bench: ImpactIndexBench, product_keys: Iterable[str]) -> Tuple[dict, List[float]]:
    started_at = datetime.now(timezone.utc).isoformat()
    timings_ms: List[float] = []
    gc.collect()
    tracemalloc.start()
    start = time.perf_counter()
    for key in product_keys:
        t0 = time.perf_counter()
        bench.resolve(key)
        timings_ms.append((time.perf_counter() - t0) * 1000.0)
    duration_ms = (time.perf_counter() - start) * 1000.0
    current_bytes, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    # GC stats are coarse; we surface gen2 collections as a proxy for managed pressure.
    if hasattr(gc, "get_stats"):
        gc_stats = gc.get_stats()
        gc_gen2 = gc_stats[2]["collections"] if len(gc_stats) > 2 else 0
    else:
        counts = gc.get_count()
        gc_gen2 = counts[2] if len(counts) > 2 else 0
    throughput = (len(timings_ms) / (duration_ms / 1000.0)) if duration_ms else 0.0
    record = {
        "pass": pass_name,
        "startedAtUtc": started_at,
        "durationMs": round(duration_ms, 3),
        "throughput_items_per_sec": round(throughput, 3),
        "p95Ms": round(percentile(timings_ms, 95), 3),
        "p99Ms": round(percentile(timings_ms, 99), 3),
        "maxMs": round(max(timings_ms) if timings_ms else 0.0, 3),
        # Both memory figures are derived from tracemalloc's peak of traced Python
        # allocations; true process RSS is not sampled here.
        "rssMb": round(peak_bytes / (1024 * 1024), 3),
        "managedMb": round(peak_bytes / (1024 * 1024), 3),
        "gc_gen2": gc_gen2,
        "cacheHitRate": round(
            bench.cache_hits / max(1, (bench.cache_hits + bench.cache_misses)), 4
        ),
    }
    return record, timings_ms
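
# Shape of one emitted NDJSON record (keys sorted by write_ndjson below; the numbers
# are made up purely for illustration):
#   {"cacheHitRate":0.0,"durationMs":41.2,"gc_gen2":3,"managedMb":1.8,"maxMs":0.21,
#    "p95Ms":0.006,"p99Ms":0.009,"pass":"cold","rssMb":1.8,
#    "startedAtUtc":"2025-01-01T00:00:00+00:00","throughput_items_per_sec":242718.447}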

def write_ndjson(records: List[dict], output: Path):
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record, separators=(",", ":"), sort_keys=True) + "\n")

def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="ImpactIndex throughput benchmark")
    parser.add_argument("--input", required=True, help="Path to products-10k.ndjson dataset")
    parser.add_argument("--output", default="results/impactindex.ndjson", help="Output NDJSON path")
    parser.add_argument("--threads", type=int, default=1, help="Thread count (deterministic when 1)")
    parser.add_argument("--seed", type=int, default=20250101, help="Seed for deterministic runs")
    return parser.parse_args(argv)
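
# Typical invocation (the dataset path is illustrative; only --input is required):
#   python impact_index_bench.py --input datasets/products-10k.ndjson \
#       --output results/impactindex.ndjson --seed 20250101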

def main(argv: List[str] | None = None):
    args = parse_args(argv)
    dataset_path = Path(args.input)
    product_keys = load_product_keys(dataset_path)
    bench = ImpactIndexBench(seed=args.seed, threads=args.threads)
    cold_record, cold_timings = run_pass("cold", bench, product_keys)
    warm_record, warm_timings = run_pass("warm", bench, product_keys)
    output_path = Path(args.output)
    write_ndjson([cold_record, warm_record], output_path)
    print(f"Wrote {output_path} with {len(product_keys)} productKeys")
    return 0

if __name__ == "__main__":
    raise SystemExit(main())