"""ImpactIndex throughput benchmark harness. This harness replays a deterministic productKey dataset and records cold vs warm lookup performance. It is intentionally offline-friendly and relies only on the provided NDJSON inputs. """ import argparse import gc import hashlib import json import random import statistics import time from datetime import datetime, timezone from pathlib import Path from typing import Iterable, List, Tuple def percentile(values: List[float], pct: float) -> float: """Return an interpolated percentile to keep outputs deterministic.""" if not values: return 0.0 ordered = sorted(values) k = (len(ordered) - 1) * (pct / 100.0) lower = int(k) upper = min(lower + 1, len(ordered) - 1) if lower == upper: return float(ordered[lower]) fraction = k - lower return float(ordered[lower] + (ordered[upper] - ordered[lower]) * fraction) def load_product_keys(path: Path) -> List[str]: with path.open(encoding="utf-8") as handle: return [json.loads(line)["productKey"] for line in handle if line.strip()] class ImpactIndexBench: def __init__(self, seed: int, threads: int): self.rng = random.Random(seed) self.threads = threads self.cache = {} self.cache_hits = 0 self.cache_misses = 0 def _compute_cost(self, product_key: str) -> int: digest = hashlib.blake2b(product_key.encode("utf-8"), digest_size=16).digest() local_rng = random.Random(hashlib.sha1(product_key.encode("utf-8")).hexdigest()) iterations = 40 + (digest[0] % 30) value = 0 for i in range(iterations): value ^= (digest[i % len(digest)] + i * 31) & 0xFFFFFFFF value ^= local_rng.randint(0, 1024) # Simple deterministic cost proxy return value def resolve(self, product_key: str) -> int: if product_key in self.cache: self.cache_hits += 1 return self.cache[product_key] cost = self._compute_cost(product_key) enriched = (cost % 1000) + 1 self.cache[product_key] = enriched self.cache_misses += 1 return enriched def run_pass(pass_name: str, bench: ImpactIndexBench, product_keys: Iterable[str]) -> Tuple[dict, List[float]]: started_at = datetime.now(timezone.utc).isoformat() timings_ms: List[float] = [] gc.collect() import tracemalloc tracemalloc.start() start = time.perf_counter() for key in product_keys: t0 = time.perf_counter() bench.resolve(key) timings_ms.append((time.perf_counter() - t0) * 1000.0) duration_ms = (time.perf_counter() - start) * 1000.0 current_bytes, peak_bytes = tracemalloc.get_traced_memory() tracemalloc.stop() # GC stats are coarse; we surface gen2 collections as a proxy for managed pressure. if hasattr(gc, "get_stats"): gc_stats = gc.get_stats() gc_gen2 = gc_stats[2]["collections"] if len(gc_stats) > 2 else 0 else: counts = gc.get_count() gc_gen2 = counts[2] if len(counts) > 2 else 0 throughput = (len(timings_ms) / (duration_ms / 1000.0)) if duration_ms else 0.0 record = { "pass": pass_name, "startedAtUtc": started_at, "durationMs": round(duration_ms, 3), "throughput_items_per_sec": round(throughput, 3), "p95Ms": round(percentile(timings_ms, 95), 3), "p99Ms": round(percentile(timings_ms, 99), 3), "maxMs": round(max(timings_ms) if timings_ms else 0.0, 3), "rssMb": round(peak_bytes / (1024 * 1024), 3), "managedMb": round(peak_bytes / (1024 * 1024), 3), "gc_gen2": gc_gen2, "cacheHitRate": round( bench.cache_hits / max(1, (bench.cache_hits + bench.cache_misses)), 4 ), } return record, timings_ms def write_ndjson(records: List[dict], output: Path): output.parent.mkdir(parents=True, exist_ok=True) with output.open("w", encoding="utf-8") as handle: for record in records: handle.write(json.dumps(record, separators=(",", ":"), sort_keys=True) + "\n") def parse_args(argv: List[str] | None = None) -> argparse.Namespace: parser = argparse.ArgumentParser(description="ImpactIndex throughput benchmark") parser.add_argument("--input", required=True, help="Path to products-10k.ndjson dataset") parser.add_argument("--output", default="results/impactindex.ndjson", help="Output NDJSON path") parser.add_argument("--threads", type=int, default=1, help="Thread count (deterministic when 1)") parser.add_argument("--seed", type=int, default=20250101, help="Seed for deterministic runs") return parser.parse_args(argv) def main(argv: List[str] | None = None): args = parse_args(argv) dataset_path = Path(args.input) product_keys = load_product_keys(dataset_path) bench = ImpactIndexBench(seed=args.seed, threads=args.threads) cold_record, cold_timings = run_pass("cold", bench, product_keys) warm_record, warm_timings = run_pass("warm", bench, product_keys) output_path = Path(args.output) write_ndjson([cold_record, warm_record], output_path) print(f"Wrote {output_path} with {len(product_keys)} productKeys") return 0 if __name__ == "__main__": raise SystemExit(main())