save progress

This commit is contained in:
StellaOps Bot
2025-12-18 09:10:36 +02:00
parent b4235c134c
commit 28823a8960
169 changed files with 11995 additions and 449 deletions


@@ -0,0 +1,869 @@
Here's a compact, practical blueprint for bringing **EPSS** into your stack without chaos: a **3-layer ingestion model** that keeps raw data, produces clean probabilities, and emits "signal-ready" events your risk engine can use immediately.
---
# Why this matters (super short)
* **EPSS** = predicted probability a vuln will be exploited soon.
* Mixing “raw EPSS feed” directly into decisions makes audits, rollbacks, and model upgrades painful.
* A **layered model** lets you **version probability evolution**, compare vendors, and train **meta-predictors on deltas** (how risk changes over time), not just on snapshots.
---
# The three layers (and how they map to StellaOps)
1. **Raw feed layer (immutable)**
* **Goal:** Store exactly what the provider sent (EPSS v4 CSV/JSON, schema drift and all).
* **Stella modules:** `Concelier` (preserve/prune source) writes; `Authority` handles signatures/hashes.
* **Storage:** `postgres.epss_raw` (partitioned by day); blob column for the untouched payload; SHA-256 of the source file.
* **Why:** Full provenance + deterministic replay.
2. **Normalized probabilistic layer**
* **Goal:** Clean, typed tables keyed by `cve_id`, with **probability, percentile, model_version, asof_ts**.
* **Stella modules:** `Excititor` (transform); `Policy Engine` reads.
* **Storage:** `postgres.epss_prob` with a **surrogate key** `(cve_id, model_version, asof_ts)` and computed **delta fields** vs previous `asof_ts`.
* **Extras:** Keep optional vendor columns (e.g., FIRST, custom regressors) to compare models side-by-side.
3. **Signal-ready layer (risk engine contracts)**
* **Goal:** Pre-chewed "events" your **Signals/Router** can route instantly.
* **What's inside:** Only the fields needed for gating and UI: `cve_id`, `prob_now`, `prob_delta`, `percentile`, `risk_band`, `explain_hash`.
* **Emit:** `first_signal`, `risk_increase`, `risk_decrease`, `quieted` with **idempotent event keys**.
* **Stella modules:** `Signals` publishes, `Router` fans out, `Timeline` records; `Notify` handles subscriptions.
---
# Minimal Postgres schema (ready to paste)
```sql
-- 1) Raw (immutable)
create table epss_raw (
id bigserial primary key,
source_uri text not null,
ingestion_ts timestamptz not null default now(),
asof_date date not null,
payload jsonb not null,
payload_sha256 bytea not null
);
create index on epss_raw (asof_date);
-- 2) Normalized
create table epss_prob (
id bigserial primary key,
cve_id text not null,
model_version text not null, -- e.g., 'EPSS-4.0-Falcon-2025-12'
asof_ts timestamptz not null,
probability double precision not null,
percentile double precision,
features jsonb, -- optional: normalized features used
unique (cve_id, model_version, asof_ts)
);
-- delta against prior point (materialized view or nightly job)
create materialized view epss_prob_delta as
select p.*,
p.probability - lag(p.probability) over (partition by cve_id, model_version order by asof_ts) as prob_delta
from epss_prob p;
-- 3) Signal-ready
create table epss_signal (
signal_id bigserial primary key,
cve_id text not null,
asof_ts timestamptz not null,
probability double precision not null,
prob_delta double precision,
risk_band text not null, -- e.g., 'LOW/MED/HIGH/CRITICAL'
model_version text not null,
explain_hash bytea not null, -- hash of inputs -> deterministic
unique (cve_id, model_version, asof_ts)
);
```
---
# C# ingestion skeleton (StellaOps.Scanner.Worker.DotNet style)
```csharp
// Assumed injected deps: HttpClient http, IDbConnection pg (Dapper-style), message bus "bus"
// 1) Fetch & store raw (Concelier)
public async Task IngestRawAsync(Uri src, DateOnly asOfDate) {
var bytes = await http.GetByteArrayAsync(src);
var sha = SHA256.HashData(bytes);
await pg.ExecuteAsync(
"insert into epss_raw(source_uri, asof_date, payload, payload_sha256) values (@u,@d,@p::jsonb,@s)",
new { u = src.ToString(), d = asOfDate, p = Encoding.UTF8.GetString(bytes), s = sha }); // assumes a JSON payload; CSV must be wrapped before the ::jsonb cast
}
// 2) Normalize (Excititor)
public async Task NormalizeAsync(DateOnly asOfDate, string modelVersion) {
var raws = await pg.QueryAsync<string>("select payload::text from epss_raw where asof_date=@d", new { d = asOfDate });
foreach (var payload in raws) {
foreach (var row in ParseCsvOrJson(payload)) {
await pg.ExecuteAsync(
@"insert into epss_prob(cve_id, model_version, asof_ts, probability, percentile, features)
values (@cve,@mv,@ts,@prob,@pct,@feat)
on conflict do nothing",
new { cve = row.Cve, mv = modelVersion, ts = row.AsOf, prob = row.Prob, pct = row.Pctl, feat = row.Features });
}
}
}
// 3) Emit signal-ready (Signals)
public async Task EmitSignalsAsync(string modelVersion, double deltaThreshold) {
var rows = await pg.QueryAsync(@"select cve_id, asof_ts, probability,
probability - lag(probability) over (partition by cve_id, model_version order by asof_ts) as prob_delta
from epss_prob where model_version=@mv", new { mv = modelVersion });
foreach (var r in rows) {
var band = Band(r.probability); // map to LOW/MED/HIGH/CRITICAL
if (Math.Abs(r.prob_delta ?? 0) >= deltaThreshold) {
var explainHash = DeterministicExplainHash(r);
await pg.ExecuteAsync(@"insert into epss_signal
(cve_id, asof_ts, probability, prob_delta, risk_band, model_version, explain_hash)
values (@c,@t,@p,@d,@b,@mv,@h)
on conflict do nothing",
new { c = r.cve_id, t = r.asof_ts, p = r.probability, d = r.prob_delta, b = band, mv = modelVersion, h = explainHash });
await bus.PublishAsync("risk.epss.delta", new {
cve = r.cve_id, ts = r.asof_ts, prob = r.probability, delta = r.prob_delta, band, model = modelVersion, explain = Convert.ToHexString(explainHash)
});
}
}
}
```
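The `DeterministicExplainHash` helper above is left abstract. A minimal sketch, assuming the hash covers a canonical, culture-invariant rendering of the signal inputs so identical inputs always produce the same evidence key:
```csharp
using System.Globalization;
using System.Security.Cryptography;
using System.Text;

// Minimal sketch: field order and formatting must stay fixed, or determinism breaks.
static byte[] DeterministicExplainHash(string cveId, DateTimeOffset asOfTs, double probability, string modelVersion)
{
    var canonical = string.Join("|",
        cveId,
        asOfTs.ToUniversalTime().ToString("O"),                    // ISO 8601, UTC
        probability.ToString("R", CultureInfo.InvariantCulture),   // round-trip double
        modelVersion);
    return SHA256.HashData(Encoding.UTF8.GetBytes(canonical));
}
```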
---
# Versioning & experiments (the secret sauce)
* **Model namespace:** `EPSS-4.0-<regressor-name>-<date>` so you can run multiple variants in parallel.
* **Delta training:** Train a small meta-predictor on **Δ-probability** to forecast **"risk jumps in the next N days."**
* **A/B in production:** Route `model_version=x` to 50% of projects; compare **time-to-patch (MTTA)** and **false-alarm rate**.
---
# Policy & UI wiring (quick contracts)
**Policy gates** (OPA/Rego or internal rules):
* Block if `risk_band ∈ {HIGH, CRITICAL}` **AND** `prob_delta >= 0.1` in last 72h.
* Soften if asset not reachable or mitigated by VEX.
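As a sketch, the first gate could read like this (helper name and thresholds are illustrative, not a fixed Stella Ops API):
```csharp
// Illustrative gate: block when the band is HIGH/CRITICAL and the recent delta is large.
static bool ShouldBlock(string riskBand, double? probDelta, TimeSpan sinceLastDelta) =>
    (riskBand is "HIGH" or "CRITICAL")
    && (probDelta ?? 0) >= 0.1
    && sinceLastDelta <= TimeSpan.FromHours(72);
```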
**UI (Evidence pane):**
* Show **sparkline of EPSS over time**, highlight last Δ.
* “Why now?” button reveals **explain_hash** → deterministic evidence payload.
---
# Ops & reliability
* Daily ingestion with **idempotent** runs (raw SHA guard).
* Backfills: renormalize from `epss_raw` for any new model without redownloading.
* **Deterministic replay:** export `(raw, transform code hash, model_version)` alongside results.
---
If you want, I can drop this as a ready-to-run **.sql + .csproj** seed with a tiny CLI (`ingest`, `normalize`, `emit`) tailored to your `Postgres + Valkey` profile.
Below is a “do this, then this” implementation guide for a **layered EPSS pipeline** inside **Stella Ops**, with concrete schemas, job boundaries, idempotency rules, and the tricky edge cases (model-version shifts, noise control, backfills).
I'll assume:
* **Postgres** is your system of record, **Valkey** is available for caching,
* you run **.NET workers** (like `StellaOps.Scanner.Worker.DotNet`),
* Stella modules you referenced map roughly like this:
* **Concelier** = ingest + preserve/prune raw sources
* **Authority** = provenance (hashes, immutability, signature-like guarantees)
* **Excititor** = transform/normalize
* **Signals / Router / Timeline / Notify** = event pipeline + audit trail + subscriptions
I'll anchor the EPSS feed details to FIRST's docs:
* The data feed fields are `cve`, `epss`, `percentile` and are refreshed daily. ([FIRST][1])
* Historical daily `.csv.gz` files exist at `https://epss.empiricalsecurity.com/epss_scores-YYYY-mm-dd.csv.gz`. ([FIRST][1])
* The API base is `https://api.first.org/data/v1/epss` and supports per-CVE and time-series queries. ([FIRST][2])
* FIRST notes model-version shifts (v2/v3/v4) and that the daily files include a leading `#` comment indicating model version/publish date (important for delta correctness). ([FIRST][1])
* FIRST's guidance: use **probability** as the primary score and **show percentile alongside it**; raw feeds provide both as decimals in 0–1. ([FIRST][3])
---
## 0) Target architecture and data contracts
### The 3 layers and what must be true in each
1. **Raw layer (immutable)**
* You can replay exactly what you ingested, byte-for-byte.
* Contains: file bytes or object-store pointer, headers (ETag, Last-Modified), SHA-256, parsed “header comment” (the `# …` line), ingestion status.
2. **Normalized probability layer (typed, queryable, historical)**
* One row per `(model_name, asof_date, cve_id)`.
* Contains: `epss` probability (0–1), `percentile` (0–1), `model_version` (from file header comment if available).
* Built for joins against vulnerability inventory and for time series.
3. **Signal-ready layer (risk engine contract)**
* Contains only actionable changes (crossing thresholds, jumps, newly-scored, etc.), ideally scoped to **observed CVEs** in your environment to avoid noise.
* Events are idempotent, audit-friendly, and versioned.
---
## 1) Data source choice and acquisition strategy
### Prefer the daily bulk `.csv.gz` over paging the API for full refresh
* FIRST explicitly documents the “ALL CVEs for a date” bulk file URL pattern. ([FIRST][2])
* The API is great for:
* “give me EPSS for this CVE list”
* “give me last 30 days time series for CVE X” ([FIRST][2])
**Recommendation**
* Daily job pulls the bulk file for “latest available date”.
* A separate on-demand endpoint uses the API time-series for UI convenience (optional).
### Robust “latest available date” probing
Because the “current day” file may not be published when your cron fires:
Algorithm:
1. Let `d0 = UtcToday`.
2. For `d in [d0, d0-1, d0-2, d0-3]`:
* Try `GET https://epss.empiricalsecurity.com/epss_scores-{d:yyyy-MM-dd}.csv.gz`
* If HTTP 200: ingest that as `asof_date = d` and stop.
3. If none succeed: fail the job with a clear message + alert.
This avoids timezone and publishing-time ambiguity.
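A minimal sketch of that probe, assuming an injected `HttpClient` and a server that answers `HEAD` (fall back to a headers-only `GET` otherwise):
```csharp
// Probe today plus up to three prior days; return the first date whose bulk file exists.
static async Task<(DateOnly AsOf, Uri Uri)?> ProbeLatestDateAsync(HttpClient http, CancellationToken ct)
{
    var d0 = DateOnly.FromDateTime(DateTime.UtcNow);
    for (var back = 0; back <= 3; back++)
    {
        var d = d0.AddDays(-back);
        var uri = new Uri($"https://epss.empiricalsecurity.com/epss_scores-{d:yyyy-MM-dd}.csv.gz");
        using var req = new HttpRequestMessage(HttpMethod.Head, uri);
        using var resp = await http.SendAsync(req, HttpCompletionOption.ResponseHeadersRead, ct);
        if (resp.IsSuccessStatusCode) return (d, uri);
    }
    return null; // caller fails the job with a clear message + alert
}
```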
---
## 2) Layer 1: Raw feed (Concelier + Authority)
### 2.1 Schema for raw + lineage
Use a dedicated schema `epss` so the pipeline is easy to reason about.
```sql
create schema if not exists epss;
-- Immutable file-level record
create table if not exists epss.raw_file (
raw_id bigserial primary key,
source_uri text not null,
asof_date date not null,
fetched_at timestamptz not null default now(),
http_etag text,
http_last_modified timestamptz,
content_len bigint,
content_sha256 bytea not null,
-- first non-empty comment lines like "# model=... date=..."
header_comment text,
model_version text,
model_published_on date,
-- storage: either inline bytea OR object storage pointer
storage_kind text not null default 'pg_bytea', -- 'pg_bytea' | 's3' | 'fs'
storage_ref text,
content_gz bytea, -- nullable if stored externally
parse_status text not null default 'pending', -- pending|parsed|failed
parse_error text,
unique (source_uri, asof_date, content_sha256)
);
create index if not exists ix_epss_raw_file_asof on epss.raw_file(asof_date);
create index if not exists ix_epss_raw_file_status on epss.raw_file(parse_status);
```
**Why store `model_version` here?**
FIRST warns that model updates cause “major shifts” and the daily files include a `#` comment with model version/publish date. If you ignore this, your delta logic will misfire on model-change days. ([FIRST][1])
### 2.2 Raw ingestion idempotency rules
A run is “already ingested” if:
* a row exists for `(source_uri, asof_date)` with the same `content_sha256`, OR
* you implement “single truth per day” and treat any new sha for the same date as “replace” (rare, but can happen).
Recommended:
* **Treat as replace only if** you're confident the source can republish the same date. If not, keep both but mark the superseded one.
### 2.3 Raw ingestion implementation details (.NET)
**Key constraints**
* Download as a stream (`ResponseHeadersRead`)
* Compute SHA-256 while streaming
* Store bytes or stream them into object storage
* Capture ETag/Last-Modified headers if present
Pseudo-implementation structure:
* `EpssFetchJob`
* `ProbeLatestDateAsync()`
* `DownloadAsync(uri)`
* `ExtractHeaderCommentAsync(gzipStream)` (read a few first lines after decompression)
* `InsertRawFileRecord(...)` (Concelier + Authority)
**Header comment extraction**
FIRST indicates files may start with `# ... model version ... publish date ...`. ([FIRST][1])
So do:
* Decompress
* Read lines until you find the first non-empty non-`#` line (that's likely the CSV header / first row)
* Save the concatenated `#` lines as `header_comment`
* Regex best-effort parse:
* `model_version`: something like `v2025.03.14`
* `model_published_on`: `YYYY-MM-DD`
If parsing fails, still store `header_comment`.
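A sketch of that extraction, decompressing on the fly and consuming only the leading `#` lines (the regex is best-effort, as noted above):
```csharp
using System.IO.Compression;
using System.Text.RegularExpressions;

// Read leading '#' comment lines from the gzip'd CSV; stop at the first data/header line.
static (string HeaderComment, string? ModelVersion) ExtractHeaderComment(Stream gzipped)
{
    using var gz = new GZipStream(gzipped, CompressionMode.Decompress, leaveOpen: true);
    using var reader = new StreamReader(gz);
    var comments = new List<string>();
    for (var line = reader.ReadLine(); line != null; line = reader.ReadLine())
    {
        if (string.IsNullOrWhiteSpace(line)) continue;
        if (!line.StartsWith('#')) break;            // likely the CSV header / first row
        comments.Add(line);
    }
    var header = string.Join("\n", comments);
    var m = Regex.Match(header, @"v\d{4}\.\d{2}\.\d{2}");   // e.g., v2025.03.14
    return (header, m.Success ? m.Value : null);
}
```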
### 2.4 Pruning raw (Concelier “preserve-prune”)
Define retention policy:
* Keep **raw bytes** 90–180 days (cheap enough; each `.csv.gz` is usually a few to a few tens of MB)
* Keep **metadata** forever (tiny, essential for audits)
Nightly cleanup job:
* delete `content_gz` or external object for `raw_file` older than retention
* keep row but set `storage_kind='pruned'`, `content_gz=null`, `storage_ref=null`
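A minimal sketch of that cleanup (Dapper-style; 180-day retention hard-coded for illustration, and externally stored objects would need deleting first):
```csharp
// Nightly prune: drop raw bytes past retention but keep the audit metadata row.
await pg.ExecuteAsync(@"
    update epss.raw_file
       set storage_kind = 'pruned', content_gz = null, storage_ref = null
     where fetched_at < now() - interval '180 days'
       and storage_kind <> 'pruned';");
```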
---
## 3) Layer 2: Normalized probability tables (Excititor)
### 3.1 Core normalized table design
Requirements:
* Efficient time series per CVE
* Efficient “latest score per CVE”
* Efficient join to “observed vulnerabilities” tables
#### Daily score table (partitioned)
```sql
create table if not exists epss.daily_score (
model_name text not null, -- 'FIRST_EPSS'
asof_date date not null,
cve_id text not null,
epss double precision not null,
percentile double precision,
model_version text, -- from raw header if available
raw_id bigint references epss.raw_file(raw_id),
loaded_at timestamptz not null default now(),
-- Guards
constraint ck_epss_range check (epss >= 0.0 and epss <= 1.0),
constraint ck_percentile_range check (percentile is null or (percentile >= 0.0 and percentile <= 1.0)),
primary key (model_name, asof_date, cve_id)
) partition by range (asof_date);
-- Example monthly partitions (create via migration script generator)
create table if not exists epss.daily_score_2025_12
partition of epss.daily_score for values from ('2025-12-01') to ('2026-01-01');
create index if not exists ix_epss_daily_score_cve on epss.daily_score (model_name, cve_id, asof_date desc);
create index if not exists ix_epss_daily_score_epss on epss.daily_score (model_name, asof_date, epss desc);
create index if not exists ix_epss_daily_score_pct on epss.daily_score (model_name, asof_date, percentile desc);
```
**Field semantics**
* `epss` is the probability of exploitation in the next 30 days, 0–1. ([FIRST][1])
* `percentile` is relative rank among all scored vulnerabilities. ([FIRST][1])
### 3.2 Maintain a “latest” table for fast joins
Don't compute latest via window functions in hot paths (policy evaluation / scoring). Materialize it.
```sql
create table if not exists epss.latest_score (
model_name text not null,
cve_id text not null,
asof_date date not null,
epss double precision not null,
percentile double precision,
model_version text,
updated_at timestamptz not null default now(),
primary key (model_name, cve_id)
);
create index if not exists ix_epss_latest_epss on epss.latest_score(model_name, epss desc);
create index if not exists ix_epss_latest_pct on epss.latest_score(model_name, percentile desc);
```
Update logic (after loading a day):
* Upsert each CVE (or do a set-based upsert):
* `asof_date` should only move forward
* if a backfill loads an older day, do not overwrite latest
### 3.3 Delta table for change detection
Store deltas per day (this powers signals and “sparkline deltas”).
```sql
create table if not exists epss.daily_delta (
model_name text not null,
asof_date date not null,
cve_id text not null,
epss double precision not null,
prev_asof_date date,
prev_epss double precision,
epss_delta double precision,
percentile double precision,
prev_percentile double precision,
percentile_delta double precision,
model_version text,
prev_model_version text,
is_model_change boolean not null default false,
created_at timestamptz not null default now(),
primary key (model_name, asof_date, cve_id)
);
create index if not exists ix_epss_daily_delta_cve on epss.daily_delta(model_name, cve_id, asof_date desc);
create index if not exists ix_epss_daily_delta_delta on epss.daily_delta(model_name, asof_date, epss_delta desc);
```
**Model update handling**
* On a model version change day (v3→v4 etc), many deltas will jump.
* FIRST explicitly warns model shifts. ([FIRST][1])
So:
* detect if today's `model_version != previous_day.model_version`
* set `is_model_change = true`
* optionally **suppress delta-based signals** that day (or emit a separate “MODEL_UPDATED” event)
### 3.4 Normalization job mechanics
Implement `EpssNormalizeJob`:
1. Select `raw_file` rows where `parse_status='pending'`.
2. Decompress `content_gz` or fetch from object store.
3. Parse CSV:
* skip `#` comment lines
* expect columns: `cve,epss,percentile` (FIRST documents these fields). ([FIRST][1])
4. Validate:
* CVE format: `^CVE-\d{4}-\d{4,}$`
* numeric parse for epss/percentile
* range checks (0–1)
5. Load into Postgres fast:
* Use `COPY` (binary import) into a **staging table** `epss.stage_score`
* Then set-based insert into `epss.daily_score`
6. Update `epss.raw_file.parse_status='parsed'` or `failed`.
#### Staging table pattern
```sql
create unlogged table if not exists epss.stage_score (
model_name text not null,
asof_date date not null,
cve_id text not null,
epss double precision not null,
percentile double precision,
model_version text,
raw_id bigint not null
);
```
In the job:
* `truncate epss.stage_score;`
* `COPY epss.stage_score FROM STDIN (FORMAT BINARY)`
* Then (transactionally):
* `delete from epss.daily_score where model_name=@m and asof_date=@d;` *(idempotency for reruns)*
* `insert into epss.daily_score (...) select ... from epss.stage_score;`
This avoids `ON CONFLICT` overhead and guarantees deterministic reruns.
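A sketch of the `COPY` step with `NpgsqlBinaryImporter` (assuming Npgsql's async importer API and a parsed `ScoreRow` record with the obvious fields):
```csharp
using Npgsql;
using NpgsqlTypes;

// Stream parsed rows into the unlogged staging table via binary COPY.
await using var writer = await conn.BeginBinaryImportAsync(
    "copy epss.stage_score (model_name, asof_date, cve_id, epss, percentile, model_version, raw_id) from stdin (format binary)");
foreach (var r in rows)
{
    await writer.StartRowAsync();
    await writer.WriteAsync(r.ModelName, NpgsqlDbType.Text);
    await writer.WriteAsync(r.AsOfDate, NpgsqlDbType.Date);
    await writer.WriteAsync(r.CveId, NpgsqlDbType.Text);
    await writer.WriteAsync(r.Epss, NpgsqlDbType.Double);
    if (r.Percentile is double pct) await writer.WriteAsync(pct, NpgsqlDbType.Double);
    else await writer.WriteNullAsync();
    if (r.ModelVersion is string mv) await writer.WriteAsync(mv, NpgsqlDbType.Text);
    else await writer.WriteNullAsync();
    await writer.WriteAsync(r.RawId, NpgsqlDbType.Bigint);
}
await writer.CompleteAsync();
```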
### 3.5 Delta + latest materialization job
Implement `EpssMaterializeJob` after successful daily_score insert.
**Compute previous available date**
```sql
-- previous date available for that model_name
select max(asof_date)
from epss.daily_score
where model_name = @model
and asof_date < @asof_date;
```
**Populate delta (set-based)**
```sql
insert into epss.daily_delta (
model_name, asof_date, cve_id,
epss, prev_asof_date, prev_epss, epss_delta,
percentile, prev_percentile, percentile_delta,
model_version, prev_model_version, is_model_change
)
select
cur.model_name,
cur.asof_date,
cur.cve_id,
cur.epss,
prev.asof_date as prev_asof_date,
prev.epss as prev_epss,
cur.epss - prev.epss as epss_delta,
cur.percentile,
prev.percentile as prev_percentile,
(cur.percentile - prev.percentile) as percentile_delta,
cur.model_version,
prev.model_version,
(cur.model_version is not null and prev.model_version is not null and cur.model_version <> prev.model_version) as is_model_change
from epss.daily_score cur
left join epss.daily_score prev
on prev.model_name = cur.model_name
and prev.asof_date = @prev_asof_date
and prev.cve_id = cur.cve_id
where cur.model_name = @model
and cur.asof_date = @asof_date;
```
**Update latest_score (set-based upsert)**
```sql
insert into epss.latest_score(model_name, cve_id, asof_date, epss, percentile, model_version)
select model_name, cve_id, asof_date, epss, percentile, model_version
from epss.daily_score
where model_name=@model and asof_date=@asof_date
on conflict (model_name, cve_id) do update
set asof_date = excluded.asof_date,
epss = excluded.epss,
percentile = excluded.percentile,
model_version = excluded.model_version,
updated_at = now()
where epss.latest_score.asof_date < excluded.asof_date;
```
---
## 4) Layer 3: Signal-ready output (Signals + Router + Timeline + Notify)
### 4.1 Decide what “signal” means in Stella Ops
You do **not** want to emit 300k events daily.
You want “actionable” events, ideally:
* only for CVEs that are **observed** in your tenants environment, and
* only when something meaningful happens.
Examples:
* Risk band changes (based on percentile or probability)
* ΔEPSS crosses a threshold (e.g., jump ≥ 0.05)
* Newly scored CVEs that are present in environment
* Model version change day → one summary event instead of 300k deltas
### 4.2 Risk band mapping (internal heuristic)
FIRST explicitly does **not** “officially bin” EPSS scores; binning is subjective. ([FIRST][3])
But operationally you'll want bands. Use config-driven thresholds.
Default band function based on percentile:
* `CRITICAL` if `percentile >= 0.995`
* `HIGH` if `percentile >= 0.99`
* `MEDIUM` if `percentile >= 0.90`
* else `LOW`
Store these in config per tenant/policy pack.
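A config-driven sketch of that mapping (thresholds are tenant policy, not FIRST-official bins):
```csharp
// Percentile-based banding; thresholds come from tenant/policy-pack config.
public sealed record BandThresholds(double Critical = 0.995, double High = 0.99, double Medium = 0.90);

public static string Band(double? percentile, BandThresholds t) => percentile switch
{
    null => "LOW",                               // unscored or percentile missing
    var p when p >= t.Critical => "CRITICAL",
    var p when p >= t.High     => "HIGH",
    var p when p >= t.Medium   => "MEDIUM",
    _ => "LOW"
};
```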
### 4.3 Signal table for idempotency + audit
```sql
create table if not exists epss.signal (
signal_id bigserial primary key,
tenant_id uuid not null,
model_name text not null,
asof_date date not null,
cve_id text not null,
event_type text not null, -- 'RISK_BAND_UP' | 'RISK_SPIKE' | 'MODEL_UPDATED' | ...
risk_band text,
epss double precision,
epss_delta double precision,
percentile double precision,
percentile_delta double precision,
is_model_change boolean not null default false,
-- deterministic idempotency key
dedupe_key text not null,
payload jsonb not null,
created_at timestamptz not null default now(),
unique (tenant_id, dedupe_key)
);
create index if not exists ix_epss_signal_tenant_date on epss.signal(tenant_id, asof_date desc);
create index if not exists ix_epss_signal_cve on epss.signal(tenant_id, cve_id, asof_date desc);
```
**Dedupe key pattern**
Make it deterministic:
```
dedupe_key = $"{model_name}:{asof_date:yyyy-MM-dd}:{cve_id}:{event_type}:{band_before}->{band_after}"
```
### 4.4 Signal generation job
Implement `EpssSignalJob(tenant)`:
1. Get the tenant's **observed CVEs** from your vuln inventory (whatever your table is; call it `vuln.instance`):
* only open/unremediated vulns
* optionally only “reachable” or “internet exposed” assets
2. Join against today's `epss.daily_delta` (or `epss.daily_score` if you skipped delta):
Pseudo-SQL:
```sql
select d.*
from epss.daily_delta d
join vuln.observed_cve oc
on oc.tenant_id = @tenant
and oc.cve_id = d.cve_id
where d.model_name=@model
and d.asof_date=@asof_date;
```
3. Suppress noise:
* if `is_model_change=true`, skip “delta spike” events and instead emit one `MODEL_UPDATED` summary event per tenant (and maybe per policy domain).
* else evaluate:
* `abs(epss_delta) >= delta_threshold`
* band change
* percentile crosses a cutoff
4. Insert into `epss.signal` with dedupe key, then publish to Signals bus:
* topic: `signals.epss`
* payload includes `tenant_id`, `cve_id`, `asof_date`, `epss`, `percentile`, deltas, band, and an `evidence` block.
5. Timeline + Notify:
* Timeline: record the event (what changed, when, data source sha)
* Notify: notify subscribed channels (Slack/email/etc) based on tenant policy
### 4.5 Evidence payload structure
Keep evidence deterministic + replayable:
```json
{
"source": {
"provider": "FIRST",
"feed": "epss_scores-YYYY-MM-DD.csv.gz",
"asof_date": "2025-12-17",
"raw_sha256": "…",
"model_version": "v2025.03.14",
"header_comment": "# ... "
},
"metrics": {
"epss": 0.153,
"percentile": 0.92,
"epss_delta": 0.051,
"percentile_delta": 0.03
},
"decision": {
"event_type": "RISK_SPIKE",
"thresholds": {
"delta_threshold": 0.05,
"critical_percentile": 0.995
}
}
}
```
This aligns with FIRSTs recommendation to present probability with percentile when possible. ([FIRST][3])
---
## 5) Integration points inside Stella Ops
### 5.1 Policy Engine usage
Policy Engine should **only** read from Layer 2 (normalized) and Layer 3 (signals), never raw.
Patterns:
* For gating decisions: query `epss.latest_score` for each CVE in a build/image/SBOM scan result.
* For “why was this blocked?”: show evidence that references `raw_sha256` and `model_version`.
### 5.2 Vuln scoring pipeline
When you compute “Stella Risk Score” for a vuln instance:
* Join `vuln_instance.cve_id` → `epss.latest_score`
* Combine with CVSS, KEV, exploit maturity, asset exposure, etc.
* EPSS alone is **threat likelihood**, not impact; FIRST explicitly says it's not a complete picture of risk. ([FIRST][4])
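Purely to illustrate that "combine" step, a hypothetical blend (weights and factor names are invented, not a Stella Ops formula):
```csharp
// Hypothetical blend: EPSS supplies likelihood; CVSS/exposure supply impact context.
static double StellaRiskScore(double epss, double cvssBase, bool inKev, bool internetExposed)
{
    var likelihood = inKev ? Math.Max(epss, 0.9) : epss;   // KEV membership floors likelihood
    var impact     = cvssBase / 10.0;                      // normalize CVSS 0-10 to 0-1
    var exposure   = internetExposed ? 1.0 : 0.6;          // asset-exposure damper
    return likelihood * impact * exposure;                 // 0-1 composite
}
```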
### 5.3 UI display
Recommended UI string (per FIRST guidance):
* Show **probability** as a percent + show percentile:
* `15.3% (92nd percentile)` ([FIRST][3])
For sparklines:
* Use `epss.daily_score` time series for last N days
* Annotate model-version change days (vertical marker)
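A sketch of the sparkline fetch (Dapper-style; `cveId` is the caller's parameter):
```csharp
// Last-N-days time series for one CVE; annotate model_version changes client-side.
var points = await pg.QueryAsync<(DateTime AsOf, double Epss, string? ModelVersion)>(@"
    select asof_date, epss, model_version
    from epss.daily_score
    where model_name = @model and cve_id = @cve
      and asof_date >= current_date - @days
    order by asof_date", new { model = "FIRST_EPSS", cve = cveId, days = 30 });
```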
---
## 6) Operational hardening
### 6.1 Scheduling
* Run daily at a fixed time in UTC.
* Probe back up to three days for the latest file.
### 6.2 Exactly-once semantics
Use three safeguards:
1. `epss.raw_file` uniqueness on `(source_uri, asof_date, sha256)`
2. Transactional load:
* delete existing `daily_score` for that `(model_name, asof_date)`
* insert freshly parsed rows
3. Advisory lock per `(model_name, asof_date)` to prevent concurrent loads:
* `pg_advisory_xact_lock(hashtext(model_name), (asof_date - date '1970-01-01'))` (a `date` cannot cast directly to `int`; subtracting an epoch date yields integer days)
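A sketch of taking that lock inside the load transaction (Dapper-style; a transaction-scoped advisory lock releases automatically at commit or rollback):
```csharp
await using var tx = await conn.BeginTransactionAsync();
// Two-int advisory key: hash of model name + days since epoch for the as-of date.
await conn.ExecuteAsync(
    "select pg_advisory_xact_lock(hashtext(@m), (@d::date - date '1970-01-01'))",
    new { m = modelName, d = asOfDate }, tx);
// ... delete + re-insert epss.daily_score for (model_name, asof_date) ...
await tx.CommitAsync();
```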
### 6.3 Monitoring (must-have metrics)
Emit metrics per job stage:
* download success/failure
* bytes downloaded
* sha256 computed
* rows parsed
* parse error count
* rows inserted into `daily_score`
* delta rows created
* signal events emitted
* “model version changed” boolean
Alert conditions:
* no new asof_date ingested for > 48 hours
* parse failure
* row count drops by > X% from previous day (data anomaly)
### 6.4 Backfills
Implement `epss backfill --from 2021-04-14 --to 2025-12-17`:
* Fetch raw files for each day
* Normalize daily_score
* Materialize latest and delta
* **Disable signals** during bulk backfill (or route to “silent” topic) to avoid spamming.
FIRST notes historical data begins 2021-04-14. ([FIRST][1])
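A sketch of the backfill driver, using the job boundaries from §7 (the `RunAsync` method names are hypothetical):
```csharp
// Walk the range oldest-first so delta/latest materialize in order.
for (var d = from; d <= to; d = d.AddDays(1))
{
    var raw = await fetchJob.RunAsync(d);      // Layer 1: epss.raw_file (skips if already ingested)
    if (raw is null) continue;                 // no file published for that day
    await normalizeJob.RunAsync(d);            // Layer 2: epss.daily_score
    await materializeJob.RunAsync(d);          // Layer 2: daily_delta + latest_score
    // EpssSignalJob intentionally skipped (or routed to a silent topic) during backfill.
}
```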
---
## 7) Reference .NET job skeletons
### Job boundaries
* `EpssFetchJob` → writes `epss.raw_file`
* `EpssNormalizeJob` → fills `epss.daily_score`
* `EpssMaterializeJob` → updates `epss.daily_delta` and `epss.latest_score`
* `EpssSignalJob` → per-tenant emission into `epss.signal` + bus publish
### Performance notes
* Use `GZipStream` + `StreamReader` line-by-line (no full file into memory)
* Use `NpgsqlBinaryImporter` for `COPY` into staging
* Use set-based SQL for delta/latest
---
## 8) The “gotchas” that make or break EPSS pipelines
1. **Model version changes create false deltas**
Store `model_version` and mark `is_model_change`. FIRST explicitly warns about score shifts on model updates and notes v4 began publishing on 2025-03-17. ([FIRST][1])
2. **Percentile is relative; probability is primary**
Probability should remain your canonical numeric score; percentile provides context for humans. ([FIRST][3])
3. **Don't emit global events**
Restrict signals to observed CVEs per tenant/environment.
4. **Keep raw provenance**
Your audit story depends on storing:
* exact source URI, as-of date, sha256, header comment
---
## 9) Minimal “definition of done” checklist
### Data correctness
* [ ] For a known CVE, `epss.latest_score` matches the daily file for the latest asof_date
* [ ] `epss` and `percentile` ranges enforced (0–1)
* [ ] Model version extracted when present; otherwise stored as null but header_comment preserved
* [ ] Delta rows created and `is_model_change` flips on version changes
### Operational
* [ ] Daily job retries on transient HTTP failures
* [ ] Alert if no new asof_date in 48h
* [ ] Raw bytes retention + metadata retention
### Product
* [ ] UI displays `probability% (percentile)` per FIRST recommendation ([FIRST][3])
* [ ] Signal events link to evidence (raw sha, model version, asof date)
* [ ] Policy Engine consumes `latest_score` only (never raw)
---
If you want, I can also provide:
* a **migration script generator** that auto-creates monthly partitions for `epss.daily_score`,
* an example **Valkey caching strategy** (`epss:latest:{cve}` with a 48h TTL, warmed only for observed CVEs),
* and a concrete **“observed CVE” join contract** (what columns to expose from your vuln inventory so EPSS signals stay noise-free).
[1]: https://www.first.org/epss/data_stats "Exploit Prediction Scoring System (EPSS)"
[2]: https://www.first.org/epss/api "Exploit Prediction Scoring System (EPSS)"
[3]: https://www.first.org/epss/articles/prob_percentile_bins "Exploit Prediction Scoring System (EPSS)"
[4]: https://www.first.org/epss/faq "EPSS Frequently Asked Questions"