feat(docs): Add comprehensive documentation for Vexer, Vulnerability Explorer, and Zastava modules
- Introduced AGENTS.md, README.md, TASKS.md, and implementation_plan.md for Vexer, detailing mission, responsibilities, key components, and operational notes. - Established similar documentation structure for Vulnerability Explorer and Zastava modules, including their respective workflows, integrations, and observability notes. - Created risk scoring profiles documentation outlining the core workflow, factor model, governance, and deliverables. - Ensured all modules adhere to the Aggregation-Only Contract and maintain determinism and provenance in outputs.
This commit is contained in:
205
docs/modules/zastava/operations/runtime-grafana-dashboard.json
Normal file
205
docs/modules/zastava/operations/runtime-grafana-dashboard.json
Normal file
@@ -0,0 +1,205 @@
{
  "title": "Zastava Runtime Plane",
  "uid": "zastava-runtime",
  "timezone": "utc",
  "schemaVersion": 38,
  "version": 1,
  "refresh": "30s",
  "time": {
    "from": "now-6h",
    "to": "now"
  },
  "panels": [
    {
      "id": 1,
      "type": "timeseries",
      "title": "Observer Event Rate",
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "targets": [
        {
          "expr": "sum by (tenant,component,kind) (rate(zastava_runtime_events_total{tenant=~\"$tenant\"}[5m]))",
          "legendFormat": "{{tenant}}/{{component}}/{{kind}}"
        }
      ],
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "1/s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "showLegend": true,
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    },
    {
      "id": 2,
      "type": "timeseries",
      "title": "Admission Decisions",
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "targets": [
        {
          "expr": "sum by (decision) (rate(zastava_admission_decisions_total{tenant=~\"$tenant\"}[5m]))",
          "legendFormat": "{{decision}}"
        }
      ],
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 12,
        "y": 0
      },
      "fieldConfig": {
        "defaults": {
          "unit": "1/s",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "red",
                "value": 20
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "showLegend": true,
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    },
    {
      "id": 3,
      "type": "timeseries",
      "title": "Backend Latency P95",
      "datasource": {
        "type": "prometheus",
        "uid": "${datasource}"
      },
      "targets": [
        {
          "expr": "histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket{tenant=~\"$tenant\"}[5m])))",
          "legendFormat": "p95 latency"
        }
      ],
      "gridPos": {
        "h": 8,
        "w": 12,
        "x": 0,
        "y": 8
      },
      "fieldConfig": {
        "defaults": {
          "unit": "ms",
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {
                "color": "green"
              },
              {
                "color": "orange",
                "value": 500
              },
              {
                "color": "red",
                "value": 750
              }
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "legend": {
          "showLegend": true,
          "placement": "bottom"
        },
        "tooltip": {
          "mode": "multi"
        }
      }
    }
  ],
  "templating": {
    "list": [
      {
        "name": "datasource",
        "type": "datasource",
        "query": "prometheus",
        "label": "Prometheus",
        "current": {
          "text": "Prometheus",
          "value": "Prometheus"
        }
      },
      {
        "name": "tenant",
        "type": "query",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "definition": "label_values(zastava_runtime_events_total, tenant)",
        "refresh": 1,
        "hide": 0,
        "current": {
          "text": ".*",
          "value": ".*"
        },
        "regex": "",
        "includeAll": true,
        "multi": true,
        "sort": 1
      }
    ]
  },
  "annotations": {
    "list": [
      {
        "name": "Deployments",
        "type": "tags",
        "datasource": {
          "type": "prometheus",
          "uid": "${datasource}"
        },
        "enable": true,
        "iconColor": "rgba(255, 96, 96, 1)"
      }
    ]
  }
}
@@ -0,0 +1,31 @@
# Prometheus alert rules for the Zastava runtime plane (Observer + Webhook).
# Thresholds are kept in step with the Grafana dashboard
# (runtime-grafana-dashboard.json) and the runtime operations runbook.
groups:
  - name: zastava-runtime
    interval: 30s
    rules:
      - alert: ZastavaRuntimeEventsSilent
        # Observer should always emit events on busy nodes; total silence for
        # 15 minutes indicates a stalled DaemonSet or broken runtime mounts.
        expr: sum(rate(zastava_runtime_events_total[10m])) == 0
        for: 15m
        labels:
          severity: warning
          service: zastava-runtime
        annotations:
          summary: "Observer events stalled"
          description: "No runtime events emitted in the last 15 minutes. Check observer DaemonSet health and container runtime mounts."
      - alert: ZastavaRuntimeBackendLatencyHigh
        # The histogram is recorded in milliseconds (…_latency_ms_bucket), so
        # the p95 threshold matching the summary ("above 750 ms") and the
        # dashboard's red threshold is 750 — the previous value of 0.75 would
        # have fired on 0.75 ms, i.e. permanently.
        expr: histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket[5m]))) > 750
        for: 10m
        labels:
          severity: critical
          service: zastava-runtime
        annotations:
          summary: "Runtime backend latency p95 above 750 ms"
          description: "Latency to Scanner runtime APIs is elevated. Inspect Scanner.WebService readiness, Authority OpTok issuance, and cluster network."
      - alert: ZastavaAdmissionDenySpike
        # rate() yields per-second values; scale by 60 so the threshold matches
        # the documented "more than 20 pod admissions per minute".
        # NOTE(review): the Grafana panel's red threshold is 20 on a 1/s axis —
        # confirm which unit was intended and align the dashboard if needed.
        expr: sum(rate(zastava_admission_decisions_total{decision="deny"}[5m])) * 60 > 20
        for: 5m
        labels:
          severity: warning
          service: zastava-runtime
        annotations:
          summary: "Admission webhook denies exceeding threshold"
          description: "Webhook is denying more than 20 pod admissions per minute. Confirm policy verdicts and consider fail-open exception for impacted namespaces."
174
docs/modules/zastava/operations/runtime.md
Normal file
174
docs/modules/zastava/operations/runtime.md
Normal file
@@ -0,0 +1,174 @@
|
||||
# Zastava Runtime Operations Runbook
|
||||
|
||||
This runbook covers the runtime plane (Observer DaemonSet + Admission Webhook).
|
||||
It aligns with `Sprint 12 – Runtime Guardrails` and assumes components consume
|
||||
`StellaOps.Zastava.Core` (`AddZastavaRuntimeCore(...)`).
|
||||
|
||||
## 1. Prerequisites
|
||||
|
||||
- **Authority client credentials** – service principal `zastava-runtime` with scopes
|
||||
`aud:scanner` and `api:scanner.runtime.write`. Provision DPoP keys and mTLS client
|
||||
certs before rollout.
|
||||
- **Scanner/WebService reachability** – cluster DNS entry (e.g. `scanner.internal`)
|
||||
resolvable from every node running Observer/Webhook.
|
||||
- **Host mounts** – read-only access to `/proc`, container runtime state
|
||||
(`/var/lib/containerd`, `/var/run/containerd/containerd.sock`) and scratch space
|
||||
(`/var/run/zastava`).
|
||||
- **Offline kit bundle** – operators staging air-gapped installs must download
|
||||
`offline-kit/zastava-runtime-{version}.tar.zst` containing container images,
|
||||
Grafana dashboards, and Prometheus rules referenced below.
|
||||
- **Secrets** – Authority OpTok cache dir, DPoP private keys, and webhook TLS secrets
|
||||
live outside git. For air-gapped installs copy them to the sealed secrets vault.
|
||||
|
||||
### 1.1 Telemetry quick reference
|
||||
|
||||
| Metric | Description | Notes |
|
||||
|--------|-------------|-------|
|
||||
| `zastava.runtime.events.total{tenant,component,kind}` | Rate of observer events sent to Scanner | Expect >0 on busy nodes. |
|
||||
| `zastava.runtime.backend.latency.ms` | Histogram (ms) for `/runtime/events` and `/policy/runtime` calls | P95 & P99 drive alerting. |
|
||||
| `zastava.admission.decisions.total{decision}` | Admission verdict counts | Track deny spikes or fail-open fallbacks. |
|
||||
| `zastava.admission.cache.hits.total` | (future) Cache utilisation once Observer batches land | Placeholder until Observer tasks 12-004 complete. |
|
||||
|
||||
## 2. Deployment workflows
|
||||
|
||||
### 2.1 Fresh install (Helm overlay)
|
||||
|
||||
1. Load offline kit bundle: `oras cp offline-kit/zastava-runtime-*.tar.zst oci://registry.internal/zastava`.
|
||||
2. Render values:
|
||||
- `zastava.runtime.tenant`, `environment`, `deployment` (cluster identifier).
|
||||
- `zastava.runtime.authority` block (issuer, clientId, audience, DPoP toggle).
|
||||
- `zastava.runtime.metrics.commonTags.cluster` for Prometheus labels.
|
||||
3. Pre-create secrets:
|
||||
- `zastava-authority-dpop` (JWK + private key).
|
||||
- `zastava-authority-mtls` (client cert/key chain).
|
||||
- `zastava-webhook-tls` (serving cert; CSR bundle if using auto-approval).
|
||||
4. Deploy Observer DaemonSet and Webhook chart:
|
||||
```sh
|
||||
helm upgrade --install zastava-runtime deploy/helm/zastava \
|
||||
-f values/zastava-runtime.yaml \
|
||||
--namespace stellaops \
|
||||
--create-namespace
|
||||
```
|
||||
5. Verify:
|
||||
- `kubectl -n stellaops get pods -l app=zastava-observer` ready.
|
||||
- `kubectl -n stellaops logs ds/zastava-observer --tail=20` shows
|
||||
`Issued runtime OpTok` audit line with DPoP token type.
|
||||
- Admission webhook registered: `kubectl get validatingwebhookconfiguration zastava-webhook`.
|
||||
|
||||
### 2.2 Upgrades
|
||||
|
||||
1. Scale webhook deployment to `--replicas=3` (rolling).
|
||||
2. Drain one node per AZ to ensure Observer tolerates disruption.
|
||||
3. Apply chart upgrade; watch `zastava.runtime.backend.latency.ms` P95 (<250 ms).
|
||||
4. Post-upgrade, run smoke tests:
|
||||
- Apply unsigned Pod manifest → expect `deny` (policy fail).
|
||||
- Apply signed Pod manifest → expect `allow`.
|
||||
5. Record upgrade in ops log with Git SHA + Helm chart version.
|
||||
|
||||
### 2.3 Rollback
|
||||
|
||||
1. Use Helm revision history: `helm history zastava-runtime`.
|
||||
2. Rollback: `helm rollback zastava-runtime <revision>`.
|
||||
3. Invalidate cached OpToks:
|
||||
```sh
|
||||
kubectl -n stellaops exec deploy/zastava-webhook -- \
|
||||
zastava-webhook invalidate-op-token --audience scanner
|
||||
```
|
||||
4. Confirm observers reconnect via metrics (`rate(zastava_runtime_events_total[5m])`).
|
||||
|
||||
## 3. Authority & security guardrails
|
||||
|
||||
- Tokens must be `DPoP` type when `requireDpop=true`. Logs emit
|
||||
`authority.token.issue` scope with decision data; absence indicates misconfig.
|
||||
- `requireMutualTls=true` enforces mTLS during token acquisition. Disable only in
|
||||
lab clusters; expect warning log `Mutual TLS requirement disabled`.
|
||||
- Static fallback tokens (`allowStaticTokenFallback=true`) should exist only during
|
||||
initial bootstrap. Rotate nightly; preference is to disable once Authority reachable.
|
||||
- Audit every change in `zastava.runtime.authority` through change management.
|
||||
Use `kubectl get secret zastava-authority-dpop -o jsonpath='{.metadata.annotations.revision}'`
|
||||
to confirm key rotation.
|
||||
|
||||
## 4. Incident response
|
||||
|
||||
### 4.1 Authority offline
|
||||
|
||||
1. Check Prometheus alert `ZastavaAuthorityTokenStale`.
|
||||
2. Inspect Observer logs for `authority.token.fallback` scope.
|
||||
3. If fallback engaged, verify static token validity duration; rotate secret if older than 24 h.
|
||||
4. Once Authority restored, delete static fallback secret and restart pods to rebind DPoP keys.
|
||||
|
||||
### 4.2 Scanner/WebService latency spike
|
||||
|
||||
1. Alert `ZastavaRuntimeBackendLatencyHigh` fires at P95 > 750 ms sustained for 10 minutes (the rule's `for: 10m` hold period).
|
||||
2. Run backend health: `kubectl -n scanner exec deploy/scanner-web -- curl -f localhost:8080/healthz/ready`.
|
||||
3. If backend degraded, auto buffer may throttle. Confirm disk-backed queue size via
|
||||
`kubectl logs ds/zastava-observer | grep buffer.drops`.
|
||||
4. Consider enabling fail-open for namespaces listed in runbook Appendix B (temporary).
|
||||
|
||||
### 4.3 Admission deny storm
|
||||
|
||||
1. Alert `ZastavaAdmissionDenySpike` indicates >20 denies/minute.
|
||||
2. Pull sample: `kubectl logs deploy/zastava-webhook --since=10m | jq '.decision'`.
|
||||
3. Cross-check policy backlog in Scanner (`/policy/runtime` logs). Engage application
|
||||
owner; optionally set namespace to `failOpenNamespaces` after risk assessment.
|
||||
|
||||
## 5. Offline kit & air-gapped notes
|
||||
|
||||
- Bundle contents:
|
||||
- Observer/Webhook container images (multi-arch).
|
||||
- `docs/modules/zastava/operations/runtime-prometheus-rules.yaml` + Grafana dashboard JSON.
|
||||
- Sample `zastava-runtime.values.yaml`.
|
||||
- Verification:
|
||||
- Validate signature: `cosign verify-blob offline-kit/zastava-runtime-*.tar.zst --certificate offline-kit/zastava-runtime.cert`.
|
||||
- Extract Prometheus rules into offline monitoring cluster (`/etc/prometheus/rules.d`).
|
||||
- Import Grafana dashboard via `grafana-cli --config ...`.
|
||||
|
||||
## 6. Observability assets
|
||||
|
||||
- Prometheus alert rules: `docs/modules/zastava/operations/runtime-prometheus-rules.yaml`.
|
||||
- Grafana dashboard JSON: `docs/modules/zastava/operations/runtime-grafana-dashboard.json`.
|
||||
- Add both to the monitoring repo (`ops/monitoring/zastava`) and reference them in
|
||||
the Offline Kit manifest.
|
||||
|
||||
## 7. Build-id correlation & symbol retrieval
|
||||
|
||||
Runtime events emitted by Observer now include `process.buildId` (from the ELF
|
||||
`NT_GNU_BUILD_ID` note) and Scanner `/policy/runtime` surfaces the most recent
|
||||
`buildIds` list per digest. Operators can use these hashes to locate debug
|
||||
artifacts during incident response:
|
||||
|
||||
1. Capture the hash from CLI/webhook/Scanner API—for example:
|
||||
```bash
|
||||
stellaops-cli runtime policy test --image <digest> --namespace <ns>
|
||||
```
|
||||
Copy one of the `Build IDs` (e.g.
|
||||
`5f0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789`).
|
||||
2. Derive the debug path (`<aa>/<rest>` under `.build-id`) and check it exists:
|
||||
```bash
|
||||
ls /var/opt/debug/.build-id/5f/0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789.debug
|
||||
```
|
||||
3. If the file is missing, rehydrate it from Offline Kit bundles or the
|
||||
`debug-store` object bucket (mirror of release artefacts):
|
||||
```bash
|
||||
oras cp oci://registry.internal/debug-store:latest . --include \
|
||||
"5f/0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789.debug"
|
||||
```
|
||||
4. Confirm the running process advertises the same GNU build-id before
|
||||
symbolising:
|
||||
```bash
|
||||
readelf -n /proc/$(pgrep -f payments-api | head -n1)/exe | grep -i 'Build ID'
|
||||
```
|
||||
5. Attach the `.debug` file in `gdb`/`lldb`, feed it to `eu-unstrip`, or cache it
|
||||
in `debuginfod` for fleet-wide symbol resolution:
|
||||
```bash
|
||||
debuginfod-find debuginfo 5f0c7c3cb4d9f8a4f1c1d5c6b7e8f90123456789 >/tmp/payments-api.debug
|
||||
```
|
||||
6. For musl-based images, expect shorter build-id footprints. Missing hashes in
|
||||
runtime events indicate stripped binaries without the GNU note—schedule a
|
||||
rebuild with `-Wl,--build-id` enabled or add the binary to the debug-store
|
||||
allowlist so the scanner can surface a fallback symbol package.
|
||||
|
||||
Monitor `scanner.policy.runtime` responses for the `buildIds` field; absence of
|
||||
data after ZASTAVA-OBS-17-005 implies containers launched before the Observer
|
||||
upgrade or non-ELF entrypoints (static scripts). Re-run the workload or restart
|
||||
Observer to trigger a fresh capture if symbol parity is required.
|
||||
Reference in New Issue
Block a user