"""ImpactIndex throughput benchmark harness.
|
|
|
|
This harness replays a deterministic productKey dataset and records cold vs warm
|
|
lookup performance. It is intentionally offline-friendly and relies only on the
|
|
provided NDJSON inputs.
|
|
"""

import argparse
import gc
import hashlib
import json
import random
import time
import tracemalloc
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Tuple


def percentile(values: List[float], pct: float) -> float:
    """Return an interpolated percentile to keep outputs deterministic."""
    if not values:
        return 0.0
    ordered = sorted(values)
    k = (len(ordered) - 1) * (pct / 100.0)
    lower = int(k)
    upper = min(lower + 1, len(ordered) - 1)
    if lower == upper:
        return float(ordered[lower])
    fraction = k - lower
    return float(ordered[lower] + (ordered[upper] - ordered[lower]) * fraction)
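
# Worked example (illustrative): percentile([1.0, 2.0, 3.0, 4.0], 95) gives
# k = 3 * 0.95 = 2.85, so the result interpolates between ordered[2] and
# ordered[3]: 3.0 + 0.85 * (4.0 - 3.0) == 3.85.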


def load_product_keys(path: Path) -> List[str]:
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line)["productKey"] for line in handle if line.strip()]
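
# Assumed input shape: one JSON object per line with at least a "productKey"
# field, e.g. {"productKey": "acme/widget-0001"} (key value illustrative).
# Blank lines are skipped; a line missing the field raises KeyError.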


class ImpactIndexBench:
    def __init__(self, seed: int, threads: int):
        # threads is accepted for CLI parity, but the replay loop below runs
        # single-threaded, which is what keeps results deterministic.
        self.rng = random.Random(seed)
        self.threads = threads
        self.cache = {}
        self.cache_hits = 0
        self.cache_misses = 0

    def _compute_cost(self, product_key: str) -> int:
        digest = hashlib.blake2b(product_key.encode("utf-8"), digest_size=16).digest()
        local_rng = random.Random(hashlib.sha1(product_key.encode("utf-8")).hexdigest())
        iterations = 40 + (digest[0] % 30)
        value = 0
        for i in range(iterations):
            value ^= (digest[i % len(digest)] + i * 31) & 0xFFFFFFFF
            value ^= local_rng.randint(0, 1024)
        # Simple deterministic cost proxy: the work per key depends only on
        # the key itself, so cold-pass timings are reproducible across runs.
        return value

    def resolve(self, product_key: str) -> int:
        if product_key in self.cache:
            self.cache_hits += 1
            return self.cache[product_key]

        cost = self._compute_cost(product_key)
        enriched = (cost % 1000) + 1
        self.cache[product_key] = enriched
        self.cache_misses += 1
        return enriched
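
# Cache semantics (sketch, assuming unique keys in the dataset): the cold pass
# misses on every key and the warm pass hits on every key; because the hit and
# miss counters are cumulative across passes, the warm record reports a
# cacheHitRate near 0.5 rather than 1.0.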


def run_pass(pass_name: str, bench: ImpactIndexBench, product_keys: Iterable[str]) -> Tuple[dict, List[float]]:
    started_at = datetime.now(timezone.utc).isoformat()
    timings_ms: List[float] = []

    gc.collect()
    tracemalloc.start()
    start = time.perf_counter()
    for key in product_keys:
        t0 = time.perf_counter()
        bench.resolve(key)
        timings_ms.append((time.perf_counter() - t0) * 1000.0)
    duration_ms = (time.perf_counter() - start) * 1000.0
    _, peak_bytes = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # GC stats are coarse; we surface gen2 collections as a proxy for managed pressure.
    if hasattr(gc, "get_stats"):
        gc_stats = gc.get_stats()
        gc_gen2 = gc_stats[2]["collections"] if len(gc_stats) > 2 else 0
    else:
        # Fallback: gc.get_count() reports tracked-object counts, not
        # collection counts, so this is only a rough stand-in.
        counts = gc.get_count()
        gc_gen2 = counts[2] if len(counts) > 2 else 0

    throughput = (len(timings_ms) / (duration_ms / 1000.0)) if duration_ms else 0.0
    record = {
        "pass": pass_name,
        "startedAtUtc": started_at,
        "durationMs": round(duration_ms, 3),
        "throughput_items_per_sec": round(throughput, 3),
        "p95Ms": round(percentile(timings_ms, 95), 3),
        "p99Ms": round(percentile(timings_ms, 99), 3),
        "maxMs": round(max(timings_ms) if timings_ms else 0.0, 3),
        # tracemalloc tracks Python-level allocations only; true process RSS
        # is not available without an extra dependency, so both memory fields
        # report the traced peak.
        "rssMb": round(peak_bytes / (1024 * 1024), 3),
        "managedMb": round(peak_bytes / (1024 * 1024), 3),
        "gc_gen2": gc_gen2,
        "cacheHitRate": round(
            bench.cache_hits / max(1, (bench.cache_hits + bench.cache_misses)), 4
        ),
    }
    return record, timings_ms
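
# Each record becomes one NDJSON line; keys (sorted when serialized by
# write_ndjson below): cacheHitRate, durationMs, gc_gen2, managedMb, maxMs,
# p95Ms, p99Ms, pass, rssMb, startedAtUtc, throughput_items_per_sec.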


def write_ndjson(records: List[dict], output: Path):
    output.parent.mkdir(parents=True, exist_ok=True)
    with output.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record, separators=(",", ":"), sort_keys=True) + "\n")
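
# Design note: compact separators plus sort_keys yield byte-stable lines, so
# reruns stay diffable and the output is presumably safe to checksum in CI.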


def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="ImpactIndex throughput benchmark")
    parser.add_argument("--input", required=True, help="Path to products-10k.ndjson dataset")
    parser.add_argument("--output", default="results/impactindex.ndjson", help="Output NDJSON path")
    parser.add_argument("--threads", type=int, default=1, help="Thread count (deterministic when 1)")
    parser.add_argument("--seed", type=int, default=20250101, help="Seed for deterministic runs")
    return parser.parse_args(argv)
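
# Example invocation (script name and input path assumed):
#   python impactindex_bench.py --input datasets/products-10k.ndjson \
#       --output results/impactindex.ndjson --seed 20250101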


def main(argv: List[str] | None = None):
    args = parse_args(argv)
    dataset_path = Path(args.input)
    product_keys = load_product_keys(dataset_path)

    bench = ImpactIndexBench(seed=args.seed, threads=args.threads)
    # The warm pass reuses the same bench instance, so every key resolved in
    # the cold pass is served from the cache here.
    cold_record, _cold_timings = run_pass("cold", bench, product_keys)
    warm_record, _warm_timings = run_pass("warm", bench, product_keys)

    output_path = Path(args.output)
    write_ndjson([cold_record, warm_record], output_path)
    print(f"Wrote {output_path} with {len(product_keys)} productKeys")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())