feat(docs): Add comprehensive documentation for Vexer, Vulnerability Explorer, and Zastava modules
- Introduced AGENTS.md, README.md, TASKS.md, and implementation_plan.md for Vexer, detailing mission, responsibilities, key components, and operational notes. - Established similar documentation structure for Vulnerability Explorer and Zastava modules, including their respective workflows, integrations, and observability notes. - Created risk scoring profiles documentation outlining the core workflow, factor model, governance, and deliverables. - Ensured all modules adhere to the Aggregation-Only Contract and maintain determinism and provenance in outputs.
This commit is contained in:
205
docs/modules/zastava/operations/runtime-grafana-dashboard.json
Normal file
205
docs/modules/zastava/operations/runtime-grafana-dashboard.json
Normal file
@@ -0,0 +1,205 @@
{
  "title": "Zastava Runtime Plane",
  "uid": "zastava-runtime",
  "timezone": "utc",
  "schemaVersion": 38,
  "version": 1,
  "refresh": "30s",
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "panels": [
    {
      "id": 1,
      "type": "timeseries",
      "title": "Observer Event Rate",
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "targets": [
        {
          "expr": "sum by (tenant,component,kind) (rate(zastava_runtime_events_total{tenant=~\"$tenant\"}[5m]))",
          "legendFormat": "{{tenant}}/{{component}}/{{kind}}"
        }
      ],
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "1/s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "showLegend": true,
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    },
    {
      "id": 2,
      "type": "timeseries",
      "title": "Admission Decisions",
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "targets": [
        {
          "expr": "sum by (decision) (rate(zastava_admission_decisions_total{tenant=~\"$tenant\"}[5m]))",
          "legendFormat": "{{decision}}"
        }
      ],
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "1/s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 20
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "showLegend": true,
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    },
    {
      "id": 3,
      "type": "timeseries",
      "title": "Backend Latency P95",
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket{tenant=~\"$tenant\"}[5m])))",
          "legendFormat": "p95 latency"
        }
      ],
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "ms",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "orange",
                "value": 500
              },
              {
                "color": "red",
                "value": 750
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "showLegend": true,
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    }
  ],
  "templating": {
    "list": [
      {
        "name": "datasource",
        "type": "datasource",
        "query": "prometheus",
        "label": "Prometheus",
        "current": {
          "text": "Prometheus",
          "value": "Prometheus"
        }
      },
      {
        "name": "tenant",
        "type": "query",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "definition": "label_values(zastava_runtime_events_total, tenant)",
        "refresh": 1,
        "hide": 0,
        "current": {
          "text": ".*",
          "value": ".*"
        },
        "regex": "",
        "includeAll": true,
        "multi": true,
        "sort": 1
      }
    ]
  },
  "annotations": {
    "list": [
      {
        "name": "Deployments",
        "type": "tags",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "enable": true,
        "iconColor": "rgba(255, 96, 96, 1)"
      }
    ]
  }
}
@@ -0,0 +1,31 @@
# Prometheus alert rules for the Zastava runtime plane (Observer + Webhook).
# Thresholds are kept in step with the Grafana dashboard
# (runtime-grafana-dashboard.json) and the runtime operations runbook.
groups:
  - name: zastava-runtime
    interval: 30s
    rules:
      - alert: ZastavaRuntimeEventsSilent
        # Observer should always emit events on busy nodes; total silence for
        # 15 minutes indicates a stalled DaemonSet or broken runtime mounts.
        expr: sum(rate(zastava_runtime_events_total[10m])) == 0
        for: 15m
        labels:
          severity: warning
          service: zastava-runtime
        annotations:
          summary: "Observer events stalled"
          description: "No runtime events emitted in the last 15 minutes. Check observer DaemonSet health and container runtime mounts."
      - alert: ZastavaRuntimeBackendLatencyHigh
        # The histogram is recorded in milliseconds (…_latency_ms_bucket), so
        # the p95 threshold matching the summary ("above 750 ms") and the
        # dashboard's red threshold is 750 — the previous value of 0.75 would
        # have fired on 0.75 ms, i.e. permanently.
        expr: histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket[5m]))) > 750
        for: 10m
        labels:
          severity: critical
          service: zastava-runtime
        annotations:
          summary: "Runtime backend latency p95 above 750 ms"
          description: "Latency to Scanner runtime APIs is elevated. Inspect Scanner.WebService readiness, Authority OpTok issuance, and cluster network."
      - alert: ZastavaAdmissionDenySpike
        # rate() yields per-second values; scale by 60 so the threshold matches
        # the documented "more than 20 pod admissions per minute".
        # NOTE(review): the Grafana panel's red threshold is 20 on a 1/s axis —
        # confirm which unit was intended and align the dashboard if needed.
        expr: sum(rate(zastava_admission_decisions_total{decision="deny"}[5m])) * 60 > 20
        for: 5m
        labels:
          severity: warning
          service: zastava-runtime
        annotations:
          summary: "Admission webhook denies exceeding threshold"
          description: "Webhook is denying more than 20 pod admissions per minute. Confirm policy verdicts and consider fail-open exception for impacted namespaces."
174
docs/modules/zastava/operations/runtime.md
Normal file
174
docs/modules/zastava/operations/runtime.md
Normal file
@@ -0,0 +1,174 @@
|
||||
# Zastava Runtime Operations Runbook
|
||||
|
||||
This runbook covers the runtime plane (Observer DaemonSet + Admission Webhook).
|
||||
It aligns with `Sprint 12 – Runtime Guardrails` and assumes components consume
|
||||
`StellaOps.Zastava.Core` (`AddZastavaRuntimeCore(...)`).
|
||||
|
||||
## 1. Prerequisites
|
||||
|
||||
- **Authority client credentials** – service principal `zastava-runtime` with scopes
|
||||
`aud:scanner` and `api:scanner.runtime.write`. Provision DPoP keys and mTLS client
|
||||
certs before rollout.
|
||||
- **Scanner/WebService reachability** – cluster DNS entry (e.g. `scanner.internal`)
|
||||
resolvable from every node running Observer/Webhook.
|
||||
- **Host mounts** – read-only access to `/proc`, container runtime state
|
||||
(`/var/lib/containerd`, `/var/run/containerd/containerd.sock`) and scratch space
|
||||
(`/var/run/zastava`).
|
||||
- **Offline kit bundle** – operators staging air-gapped installs must download
|
||||
`offline-kit/zastava-runtime-{version}.tar.zst` containing container images,
|
||||
Grafana dashboards, and Prometheus rules referenced below.
|
||||
- **Secrets** – Authority OpTok cache dir, DPoP private keys, and webhook TLS secrets
|
||||
live outside git. For air-gapped installs copy them to the sealed secrets vault.
|
||||
|
||||
### 1.1 Telemetry quick reference
|
||||
|
||||
| Metric | Description | Notes |
|
||||
|--------|-------------|-------|
|
||||
| `zastava.runtime.events.total{tenant,component,kind}` | Rate of observer events sent to Scanner | Expect >0 on busy nodes. |
|
||||
| `zastava.runtime.backend.latency.ms` | Histogram (ms) for `/runtime/events` and `/policy/runtime` calls | P95 & P99 drive alerting. |
|
||||
| `zastava.admission.decisions.total{decision}` | Admission verdict counts | Track deny spikes or fail-open fallbacks. |
|
||||
| `zastava.admission.cache.hits.total` | (future) Cache utilisation once Observer batches land | Placeholder until Observer tasks 12-004 complete. |
|
||||
|
||||
## 2. Deployment workflows
|
||||
|
||||
### 2.1 Fresh install (Helm overlay)
|
||||
|
||||
1. Load offline kit bundle: `oras cp offline-kit/zastava-runtime-*.tar.zst oci://registry.internal/zastava`.
|
||||
2. Render values:
|
||||
- `zastava.runtime.tenant`, `environment`, `deployment` (cluster identifier).
|
||||
- `zastava.runtime.authority` block (issuer, clientId, audience, DPoP toggle).
|
||||
- `zastava.runtime.metrics.commonTags.cluster` for Prometheus labels.
|
||||
3. Pre-create secrets:
|
||||
- `zastava-authority-dpop` (JWK + private key).
|
||||
- `zastava-authority-mtls` (client cert/key chain).
|
||||
- `zastava-webhook-tls` (serving cert; CSR bundle if using auto-approval).
|
||||
4. Deploy Observer DaemonSet and Webhook chart:
|
||||
```sh
|
||||
helm upgrade --install zastava-runtime deploy/helm/zastava \
|
||||
-f values/zastava-runtime.yaml \
|
||||
--namespace stellaops \
|
||||
--create-namespace
|
||||
```
|
||||
5. Verify:
|
||||
- `kubectl -n stellaops get pods -l app=zastava-observer` ready.
|
||||
- `kubectl -n stellaops logs ds/zastava-observer --tail=20` shows
|
||||
`Issued runtime OpTok` audit line with DPoP token type.
|
||||
- Admission webhook registered: `kubectl get validatingwebhookconfiguration zastava-webhook`.
|
||||
|
||||
### 2.2 Upgrades
|
||||
|
||||
1. Scale webhook deployment to `--replicas=3` (rolling).
|
||||
2. Drain one node per AZ to ensure Observer tolerates disruption.
|
||||
3. Apply chart upgrade; watch `zastava.runtime.backend.latency.ms` P95 (<250 ms).
|
||||
4. Post-upgrade, run smoke tests:
|
||||
- Apply unsigned Pod manifest → expect `deny` (policy fail).
|
||||
- Apply signed Pod manifest → expect `allow`.
|
||||
5. Record upgrade in ops log with Git SHA + Helm chart version.
|
||||
|
||||
### 2.3 Rollback
|
||||
|
||||
1. Use Helm revision history: `helm history zastava-runtime`.
|
||||
2. Rollback: `helm rollback zastava-runtime <revision>`.
|
||||
3. Invalidate cached OpToks:
|
||||
```sh
|
||||
kubectl -n stellaops exec deploy/zastava-webhook -- \
|
||||
zastava-webhook invalidate-op-token --audience scanner
|
||||
```
|
||||
4. Confirm observers reconnect via metrics (`rate(zastava_runtime_events_total[5m])`).
|
||||
|
||||
## 3. Authority & security guardrails
|
||||
|
||||
- Tokens must be `DPoP` type when `requireDpop=true`. Logs emit
|
||||
`authority.token.issue` scope with decision data; absence indicates misconfig.
|
||||
- `requireMutualTls=true` enforces mTLS during token acquisition. Disable only in
|
||||
lab clusters; expect warning log `Mutual TLS requirement disabled`.
|
||||
- Static fallback tokens (`allowStaticTokenFallback=true`) should exist only during
|
||||
initial bootstrap. Rotate nightly; preference is to disable once Authority reachable.
|
||||
- Audit every change in `zastava.runtime.authority` through change management.
|
||||
Use `kubectl get secret zastava-authority-dpop -o jsonpath='{.metadata.annotations.revision}'`
|
||||
to confirm key rotation.
|
||||
|
||||
## 4. Incident response
|
||||
|
||||
### 4.1 Authority offline
|
||||
|
||||
1. Check Prometheus alert `ZastavaAuthorityTokenStale`.
|
||||
2. Inspect Observer logs for `authority.token.fallback` scope.
|
||||
3. If fallback engaged, verify static token validity duration; rotate secret if older than 24 h.
|
||||
4. Once Authority restored, delete static fallback secret and restart pods to rebind DPoP keys.
|
||||
|
||||
### 4.2 Scanner/WebService latency spike
|
||||
|
||||
1. Alert `ZastavaRuntimeBackendLatencyHigh` fires at P95 > 750 ms sustained for 10 minutes (the rule's `for: 10m` hold period).
|
||||
2. Run backend health: `kubectl -n scanner exec deploy/scanner-web -- curl -f localhost:8080/healthz/ready`.
|
||||
3. If backend degraded, auto buffer may throttle. Confirm disk-backed queue size via
|
||||
`kubectl logs ds/zastava-observer | grep buffer.drops`.
|
||||
4. Consider enabling fail-open for namespaces listed in runbook Appendix B (temporary).
|
||||
|
||||
### 4.3 Admission deny storm
|
||||
|
||||
1. Alert `ZastavaAdmissionDenySpike` indicates >20 denies/minute.
|
||||
2. Pull sample: `kubectl logs deploy/zastava-webhook --since=10m | jq '.decision'`.
|
||||
3. Cross-check policy backlog in Scanner (`/policy/runtime` logs). Engage application
|
||||
owner; optionally set namespace to `failOpenNamespaces` after risk assessment.
|
||||
|
||||
## 5. Offline kit & air-gapped notes
|
||||
|
||||
- Bundle contents:
|
||||
- Observer/Webhook container images (multi-arch).
|
||||
- `docs/modules/zastava/operations/runtime-prometheus-rules.yaml` + Grafana dashboard JSON.
|
||||
- Sample `zastava-runtime.values.yaml`.
|
||||
- Verification:
|
||||
- Validate signature: `cosign verify-blob offline-kit/zastava-runtime-*.tar.zst --certificate offline-kit/zastava-runtime.cert`.
|
||||
- Extract Prometheus rules into offline monitoring cluster (`/etc/prometheus/rules.d`).
|
||||
- Import Grafana dashboard via `grafana-cli --config ...`.
|
||||
|
||||
## 6. Observability assets
|
||||
|
||||
- Prometheus alert rules: `docs/modules/zastava/operations/runtime-prometheus-rules.yaml`.
|
||||
- Grafana dashboard JSON: `docs/modules/zastava/operations/runtime-grafana-dashboard.json`.
|
||||
- Add both to the monitoring repo (`ops/monitoring/zastava`) and reference them in
|
||||
the Offline Kit manifest.
|
||||
|
||||
## 7. Build-id correlation & symbol retrieval
|
||||
|
||||
Runtime events emitted by Observer now include `process.buildId` (from the ELF
|
||||
`NT_GNU_BUILD_ID` note) and Scanner `/policy/runtime` surfaces the most recent
|
||||
`buildIds` list per digest. Operators can use these hashes to locate debug
|
||||
artifacts during incident response:
|
||||
|
||||
1. Capture the hash from CLI/webhook/Scanner API—for example:
|
||||
```bash
|
||||
stellaops-cli runtime policy test --image <digest> --namespace <ns>
|
||||
```
|
||||
Copy one of the `Build IDs` (e.g.
|
||||
`5f0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789`).
|
||||
2. Derive the debug path (`<aa>/<rest>` under `.build-id`) and check it exists:
|
||||
```bash
|
||||
ls /var/opt/debug/.build-id/5f/0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789.debug
|
||||
```
|
||||
3. If the file is missing, rehydrate it from Offline Kit bundles or the
|
||||
`debug-store` object bucket (mirror of release artefacts):
|
||||
```bash
|
||||
oras cp oci://registry.internal/debug-store:latest . --include \
|
||||
"5f/0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789.debug"
|
||||
```
|
||||
4. Confirm the running process advertises the same GNU build-id before
|
||||
symbolising:
|
||||
```bash
|
||||
readelf -n /proc/$(pgrep -f payments-api | head -n1)/exe | grep -i 'Build ID'
|
||||
```
|
||||
5. Attach the `.debug` file in `gdb`/`lldb`, feed it to `eu-unstrip`, or cache it
|
||||
in `debuginfod` for fleet-wide symbol resolution:
|
||||
```bash
|
||||
debuginfod-find debuginfo 5f0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789 >/tmp/payments-api.debug
|
||||
```
|
||||
6. For musl-based images, expect shorter build-id footprints. Missing hashes in
|
||||
runtime events indicate stripped binaries without the GNU note—schedule a
|
||||
rebuild with `-Wl,--build-id` enabled or add the binary to the debug-store
|
||||
allowlist so the scanner can surface a fallback symbol package.
|
||||
|
||||
Monitor `scanner.policy.runtime` responses for the `buildIds` field; absence of
|
||||
data after ZASTAVA-OBS-17-005 implies containers launched before the Observer
|
||||
upgrade or non-ELF entrypoints (static scripts). Re-run the workload or restart
|
||||
Observer to trigger a fresh capture if symbol parity is required.
|
||||
Reference in New Issue
Block a user