Add tests and implement timeline ingestion options with NATS and Redis subscribers
- Introduced `BinaryReachabilityLifterTests` to validate binary lifting functionality.
- Created `PackRunWorkerOptions` for configuring worker paths and execution persistence.
- Added `TimelineIngestionOptions` for configuring NATS and Redis ingestion transports.
- Implemented `NatsTimelineEventSubscriber` for subscribing to NATS events.
- Developed `RedisTimelineEventSubscriber` for reading from Redis Streams.
- Added `TimelineEnvelopeParser` to normalize incoming event envelopes.
- Created unit tests for `TimelineEnvelopeParser` to ensure correct field mapping.
- Implemented `TimelineAuthorizationAuditSink` for logging authorization outcomes.
ops/devops/docker/Dockerfile.console (new file, +40 lines)
@@ -0,0 +1,40 @@
# syntax=docker/dockerfile:1.7
# Multi-stage Angular console image with non-root runtime (DOCKER-44-001)
ARG NODE_IMAGE=node:20-bullseye-slim
ARG NGINX_IMAGE=nginxinc/nginx-unprivileged:1.27-alpine
ARG APP_DIR=src/UI/StellaOps.UI
ARG DIST_DIR=dist
ARG APP_PORT=8080

FROM ${NODE_IMAGE} AS build
ENV npm_config_fund=false npm_config_audit=false SOURCE_DATE_EPOCH=1704067200
WORKDIR /app
COPY ${APP_DIR}/package*.json ./
RUN npm ci --prefer-offline --no-progress --cache .npm
COPY ${APP_DIR}/ ./
RUN npm run build -- --configuration=production --output-path=${DIST_DIR}

FROM ${NGINX_IMAGE} AS runtime
ARG APP_PORT
ENV APP_PORT=${APP_PORT}
USER 101
WORKDIR /
COPY --from=build /app/${DIST_DIR}/ /usr/share/nginx/html/
COPY ops/devops/docker/healthcheck-frontend.sh /usr/local/bin/healthcheck-frontend.sh
RUN rm -f /etc/nginx/conf.d/default.conf && \
    cat > /etc/nginx/conf.d/default.conf <<CONF
server {
  listen ${APP_PORT};
  listen [::]:${APP_PORT};
  server_name _;
  root /usr/share/nginx/html;
  location / {
    try_files \$uri \$uri/ /index.html;
  }
}
CONF

EXPOSE ${APP_PORT}
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
  CMD /usr/local/bin/healthcheck-frontend.sh
CMD ["nginx","-g","daemon off;"]
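For reference, the matching local build command (the tag mirrors what the `build-all.sh` helper added later in this commit produces with its defaults; adjust as needed):

```bash
# Build the console image from the repo root; APP_DIR/APP_PORT match services-matrix.env.
docker build -f ops/devops/docker/Dockerfile.console \
  --build-arg APP_DIR=src/UI/StellaOps.UI \
  --build-arg APP_PORT=8080 \
  -t stellaops/console:dev .
```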
ops/devops/docker/Dockerfile.hardened.template (modified)
@@ -7,6 +7,7 @@ ARG RUNTIME_IMAGE=mcr.microsoft.com/dotnet/aspnet:10.0-bookworm-slim
 ARG APP_PROJECT=src/Service/Service.csproj
 ARG CONFIGURATION=Release
 ARG PUBLISH_DIR=/app/publish
+ARG APP_BINARY=StellaOps.Service
 ARG APP_USER=stella
 ARG APP_UID=10001
 ARG APP_GID=10001
@@ -38,7 +39,8 @@ COPY --chown=${APP_UID}:${APP_GID} ops/devops/docker/healthcheck.sh /usr/local/b
 ENV ASPNETCORE_URLS=http://+:${APP_PORT} \
     DOTNET_EnableDiagnostics=0 \
     DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 \
-    COMPlus_EnableDiagnostics=0
+    COMPlus_EnableDiagnostics=0 \
+    APP_BINARY=${APP_BINARY}

 USER ${APP_UID}:${APP_GID}
 EXPOSE ${APP_PORT}
@@ -50,4 +52,5 @@ RUN chmod 500 /app && \
     find /app -maxdepth 1 -type f -exec chmod 400 {} \; && \
     find /app -maxdepth 1 -type d -exec chmod 500 {} \;

-ENTRYPOINT ["./StellaOps.Service"]
+# Use shell form so APP_BINARY env can be expanded without duplicating the template per service
+ENTRYPOINT ["sh","-c","exec ./\"$APP_BINARY\""]
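The new ENTRYPOINT works because `sh -c` expands `$APP_BINARY` at container start and `exec` replaces the shell, so PID 1 is the service itself. A quick illustration (the binary name is taken from `services-matrix.env`, `echo` stands in for the actual exec):

```bash
# How the shell-form ENTRYPOINT resolves APP_BINARY at runtime.
APP_BINARY=StellaOps.Concelier.WebService sh -c 'echo exec ./"$APP_BINARY"'
# -> exec ./StellaOps.Concelier.WebService
```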
ops/devops/docker/README.md (modified)
@@ -6,6 +6,7 @@ The reusable multi-stage scaffold lives at `ops/devops/docker/Dockerfile.hardene
 - .NET 10 SDK/runtime images provided via offline mirror (`SDK_IMAGE` / `RUNTIME_IMAGE`).
 - `APP_PROJECT` path to the service csproj.
 - `healthcheck.sh` copied from `ops/devops/docker/` (already referenced by the template).
+- Optional: `APP_BINARY` (assembly name, defaults to `StellaOps.Service`) and `APP_PORT`.

 Copy the template next to the service and set build args in CI (per-service matrix) to avoid maintaining divergent Dockerfiles.

@@ -41,7 +42,7 @@ USER ${APP_UID}:${APP_GID}
 EXPOSE ${APP_PORT}
 HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 CMD /usr/local/bin/healthcheck.sh
 RUN chmod 500 /app && find /app -maxdepth 1 -type f -exec chmod 400 {} \; && find /app -maxdepth 1 -type d -exec chmod 500 {} \;
-ENTRYPOINT ["./StellaOps.Service"]
+ENTRYPOINT ["sh","-c","exec ./\"$APP_BINARY\""]
 ```

 Build stage (per service) should:
@@ -56,6 +57,13 @@ Required checks:
 - Health endpoints exposed: `/health/liveness`, `/health/readiness`, `/version`, `/metrics`.
 - Image SBOM generated (syft) in pipeline; attach cosign attestations (see DOCKER-44-002).

+Service matrix & helper:
+- Build args for the core services are enumerated in `ops/devops/docker/services-matrix.env` (API, Console, Orchestrator, Task Runner, Concelier, Excititor, Policy, Notify, Export, AdvisoryAI).
+- `ops/devops/docker/build-all.sh` reads the matrix and builds/tags images from the shared template with consistent non-root/health defaults. Override `REGISTRY` and `TAG_SUFFIX` to publish.
+
+Console (Angular) image:
+- Use `ops/devops/docker/Dockerfile.console` for the UI (Angular v17). It builds with `node:20-bullseye-slim`, serves via `nginxinc/nginx-unprivileged`, includes `healthcheck-frontend.sh`, and runs as non-root UID 101. Build with `docker build -f ops/devops/docker/Dockerfile.console --build-arg APP_DIR=src/UI/StellaOps.UI .`.
+
 SBOM & attestation helper (DOCKER-44-002):
 - Script: `ops/devops/docker/sbom_attest.sh <image> [out-dir] [cosign-key]`
 - Emits SPDX (`*.spdx.json`) and CycloneDX (`*.cdx.json`) with `SOURCE_DATE_EPOCH` pinned for reproducibility.
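A typical invocation, following the signature documented above (the output directory and image tag are illustrative):

```bash
# Generate SPDX + CycloneDX SBOMs for a freshly built image; pass a cosign key to attest.
./ops/devops/docker/sbom_attest.sh stellaops/concelier:dev out/sbom
ls out/sbom   # *.spdx.json and *.cdx.json, reproducible via the pinned SOURCE_DATE_EPOCH
```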
ops/devops/docker/build-all.sh (new file, +50 lines)
@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# Build hardened images for the core services using the shared template/matrix (DOCKER-44-001)
set -euo pipefail

ROOT=${ROOT:-"$(git rev-parse --show-toplevel)"}
MATRIX=${MATRIX:-"${ROOT}/ops/devops/docker/services-matrix.env"}
REGISTRY=${REGISTRY:-"stellaops"}
TAG_SUFFIX=${TAG_SUFFIX:-"dev"}
SDK_IMAGE=${SDK_IMAGE:-"mcr.microsoft.com/dotnet/sdk:10.0-bookworm-slim"}
RUNTIME_IMAGE=${RUNTIME_IMAGE:-"mcr.microsoft.com/dotnet/aspnet:10.0-bookworm-slim"}

if [[ ! -f "${MATRIX}" ]]; then
  echo "matrix file not found: ${MATRIX}" >&2
  exit 1
fi

echo "Building services from ${MATRIX} -> ${REGISTRY}/<service>:${TAG_SUFFIX}" >&2

while IFS='|' read -r service dockerfile project binary port; do
  [[ -z "${service}" || "${service}" =~ ^# ]] && continue
  image="${REGISTRY}/${service}:${TAG_SUFFIX}"
  df_path="${ROOT}/${dockerfile}"
  if [[ ! -f "${df_path}" ]]; then
    echo "skipping ${service}: dockerfile missing (${df_path})" >&2
    continue
  fi

  if [[ "${dockerfile}" == *"Dockerfile.console"* ]]; then
    # Angular console build uses its dedicated Dockerfile
    echo "[console] ${service} -> ${image}" >&2
    docker build \
      -f "${df_path}" "${ROOT}" \
      --build-arg APP_DIR="${project}" \
      --build-arg APP_PORT="${port}" \
      -t "${image}"
  else
    echo "[service] ${service} -> ${image}" >&2
    docker build \
      -f "${df_path}" "${ROOT}" \
      --build-arg SDK_IMAGE="${SDK_IMAGE}" \
      --build-arg RUNTIME_IMAGE="${RUNTIME_IMAGE}" \
      --build-arg APP_PROJECT="${project}" \
      --build-arg APP_BINARY="${binary}" \
      --build-arg APP_PORT="${port}" \
      -t "${image}"
  fi

done < "${MATRIX}"

echo "Build complete. Remember to enforce readOnlyRootFilesystem at deploy time and run sbom_attest.sh (DOCKER-44-002)." >&2
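Per the defaults above, a plain run tags images as `stellaops/<service>:dev`; a publish run retargets the registry and tag (the registry host below is hypothetical):

```bash
# Local build with defaults.
./ops/devops/docker/build-all.sh

# Publish-oriented build: override registry and tag, then push out-of-band.
REGISTRY=registry.example.internal/stellaops TAG_SUFFIX=2025.1 \
  ./ops/devops/docker/build-all.sh
```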
ops/devops/docker/healthcheck-frontend.sh (new file, +10 lines)
@@ -0,0 +1,10 @@
#!/bin/sh
set -eu
HOST="${HEALTH_HOST:-127.0.0.1}"
PORT="${HEALTH_PORT:-8080}"
PATH_CHECK="${HEALTH_PATH:-/}"
USER_AGENT="stellaops-frontend-healthcheck"

wget -qO- --header="User-Agent: ${USER_AGENT}" \
  --timeout="${HEALTH_TIMEOUT:-4}" \
  "http://${HOST}:${PORT}${PATH_CHECK}" >/dev/null
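The probe can be exercised against a running console container via `docker exec`, since the script is baked into the image (the tag assumes the console build shown earlier):

```bash
# Start the console, give nginx a moment, then run the baked-in healthcheck.
docker run -d --rm --name console-smoke -p 8080:8080 stellaops/console:dev
sleep 2
docker exec console-smoke /usr/local/bin/healthcheck-frontend.sh && echo healthy
docker rm -f console-smoke
```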
ops/devops/docker/services-matrix.env (new file, +12 lines)
@@ -0,0 +1,12 @@
# service|dockerfile|project|binary|port
# Paths are relative to repo root; dockerfile is usually the shared hardened template.
api|ops/devops/docker/Dockerfile.hardened.template|src/VulnExplorer/StellaOps.VulnExplorer.Api/StellaOps.VulnExplorer.Api.csproj|StellaOps.VulnExplorer.Api|8080
orchestrator|ops/devops/docker/Dockerfile.hardened.template|src/Orchestrator/StellaOps.Orchestrator.WebService/StellaOps.Orchestrator.WebService.csproj|StellaOps.Orchestrator.WebService|8080
task-runner|ops/devops/docker/Dockerfile.hardened.template|src/Orchestrator/StellaOps.Orchestrator.Worker/StellaOps.Orchestrator.Worker.csproj|StellaOps.Orchestrator.Worker|8081
concelier|ops/devops/docker/Dockerfile.hardened.template|src/Concelier/StellaOps.Concelier.WebService/StellaOps.Concelier.WebService.csproj|StellaOps.Concelier.WebService|8080
excititor|ops/devops/docker/Dockerfile.hardened.template|src/Excititor/StellaOps.Excititor.WebService/StellaOps.Excititor.WebService.csproj|StellaOps.Excititor.WebService|8080
policy|ops/devops/docker/Dockerfile.hardened.template|src/Policy/StellaOps.Policy.Gateway/StellaOps.Policy.Gateway.csproj|StellaOps.Policy.Gateway|8084
notify|ops/devops/docker/Dockerfile.hardened.template|src/Notify/StellaOps.Notify.WebService/StellaOps.Notify.WebService.csproj|StellaOps.Notify.WebService|8080
export|ops/devops/docker/Dockerfile.hardened.template|src/ExportCenter/StellaOps.ExportCenter.WebService/StellaOps.ExportCenter.WebService.csproj|StellaOps.ExportCenter.WebService|8080
advisoryai|ops/devops/docker/Dockerfile.hardened.template|src/AdvisoryAI/StellaOps.AdvisoryAI.WebService/StellaOps.AdvisoryAI.WebService.csproj|StellaOps.AdvisoryAI.WebService|8080
console|ops/devops/docker/Dockerfile.console|src/UI/StellaOps.UI|StellaOps.UI|8080
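The loop in `build-all.sh` above is the canonical consumer of this format; for a one-off rebuild of a single service, a sketch along these lines works (bash, run from the repo root; `concelier` chosen arbitrarily):

```bash
# Pull one row out of the matrix and feed its fields to docker build.
row="$(grep '^concelier|' ops/devops/docker/services-matrix.env)"
IFS='|' read -r service dockerfile project binary port <<<"${row}"
docker build -f "${dockerfile}" . \
  --build-arg APP_PROJECT="${project}" \
  --build-arg APP_BINARY="${binary}" \
  --build-arg APP_PORT="${port}" \
  -t "stellaops/${service}:dev"
```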
ops/devops/tenant/README.md (new file, +34 lines)
@@ -0,0 +1,34 @@
# Tenant audit & chaos kit (DEVOPS-TEN-49-001)

Artifacts live in this folder to cover tenant audit logging, usage metrics, JWKS outage chaos, and load/perf benchmarks.

## What’s here
- `recording-rules.yaml` – Prometheus recordings for per-tenant rate/error/latency and JWKS cache ratio.
- `alerts.yaml` – Alert rules for error rate, JWKS cache miss spike, p95 latency, auth failures, and rate limit hits.
- `dashboards/tenant-audit.json` – Grafana dashboard with tenant/service variables.
- `k6-tenant-load.js` – Multi-tenant load/perf scenario (read/write 90/10, tenant header, configurable paths).
- `jwks-chaos.sh` – iptables-based JWKS dropper for chaos drills.

## Import & wiring
1. Load `recording-rules.yaml` and `alerts.yaml` into the Prometheus rule groups for the tenancy stack.
2. Import `dashboards/tenant-audit.json` into Grafana (folder `StellaOps / Tenancy`).
3. Ensure services emit `tenant` labels on request metrics and structured logs (`tenant`, `subject`, `action`, `resource`, `result`, `traceId`).

## Load/perf (k6)
```bash
BASE_URL=https://api.stella.local \
TENANTS=tenant-a,tenant-b,tenant-c \
TENANT_HEADER=X-StellaOps-Tenant \
VUS=5000 DURATION=15m \
k6 run ops/devops/tenant/k6-tenant-load.js
```
Adjust `TENANT_READ_PATHS` / `TENANT_WRITE_PATHS` to point at Policy/Vuln/Notify endpoints. Default thresholds: p95 <300ms (read), <600ms (write), error rate <0.5%.

## JWKS chaos drill
```bash
JWKS_HOST=authority.local JWKS_PORT=8440 DURATION=300 \
./ops/devops/tenant/jwks-chaos.sh &
BASE_URL=https://api.stella.local TENANTS=tenant-a,tenant-b \
k6 run ops/devops/tenant/k6-tenant-load.js
```
Run on an isolated agent with sudo/iptables available. Watch `jwks_cache_hit_ratio:5m`, `tenant_error_rate:5m`, and alerts `jwks_cache_miss_spike` / `tenant_auth_failures_spike`.
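`jwks-chaos.sh` itself is not shown in this diff; for orientation, a minimal sketch of the iptables drop-and-restore pattern such a drill typically follows (variable names mirror the README invocation above):

```bash
#!/usr/bin/env bash
# NOT the committed jwks-chaos.sh; a sketch of the technique it is described as using.
set -euo pipefail
JWKS_HOST=${JWKS_HOST:?target host required}
JWKS_PORT=${JWKS_PORT:-8440}
DURATION=${DURATION:-300}

rule=(OUTPUT -p tcp -d "${JWKS_HOST}" --dport "${JWKS_PORT}" -j DROP)
sudo iptables -A "${rule[@]}"
trap 'sudo iptables -D "${rule[@]}"' EXIT   # always restore egress, even on ctrl-c
echo "dropping JWKS egress to ${JWKS_HOST}:${JWKS_PORT} for ${DURATION}s" >&2
sleep "${DURATION}"
```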
ops/devops/tenant/alerts.yaml (modified)
@@ -19,6 +19,14 @@ groups:
       annotations:
         summary: JWKS cache miss rate spike
         description: JWKS miss ratio above 20% may indicate outage or cache expiry.
+    - alert: tenant_latency_p95_high
+      expr: tenant_latency_p95:5m > 0.6
+      for: 10m
+      labels:
+        severity: warn
+      annotations:
+        summary: Tenant p95 latency high
+        description: Per-tenant p95 latency over 600ms for 10m.
     - alert: tenant_rate_limit_exceeded
       expr: rate(tenant_rate_limit_hits_total[5m]) > 10
       for: 5m
@@ -27,3 +35,11 @@ groups:
       annotations:
         summary: Frequent rate limits hits
         description: Tenant rate limit exceeded more than 10 times per 5m window.
+    - alert: tenant_auth_failures_spike
+      expr: rate(auth_token_validation_failures_total{tenant!=""}[5m]) > 5
+      for: 5m
+      labels:
+        severity: page
+      annotations:
+        summary: Tenant auth failures elevated
+        description: Token validation failures exceed 5 per 5m for tenant-scoped traffic.
@@ -25,12 +25,15 @@ Scope: deploy audit pipeline, capture tenant usage metrics, run JWKS outage chao
 - Multi-tenant spread: at least 10 tenants, randomised per VU; ensure metrics maintain `tenant` label cardinality cap (<= 1000 active tenants).

 ## Implementation steps
-- Add dashboards (Grafana folder `StellaOps / Tenancy`) with panels for per-tenant latency, error rate, rate-limit hits, JWKS cache hit rate.
-- Alert rules: `tenant_error_rate_gt_0_5pct`, `jwks_cache_miss_spike`, `tenant_rate_limit_exceeded`.
-- CI: add chaos test job stub (uses docker-compose + iptables fault) gated behind manual approval.
-- Docs: update `deploy/README.md` Tenancy section once dashboards/alerts live.
+- Add dashboards (Grafana folder `StellaOps / Tenancy`) with panels for per-tenant latency, error rate, rate-limit hits, JWKS cache hit rate, auth failures.
+- Alert rules: `tenant_error_rate_gt_0_5pct`, `jwks_cache_miss_spike`, `tenant_rate_limit_exceeded`, `tenant_latency_p95_high`, `tenant_auth_failures_spike` with supporting recording rules in `recording-rules.yaml`.
+- Load/perf: k6 scenario `k6-tenant-load.js` (read/write 90/10, random tenants, headers configurable) targeting 5k RPS.
+- Chaos: reusable script `jwks-chaos.sh` + CI stub in `README.md` describing manual-gated run to drop JWKS egress while k6 runs.
+- Docs: update `deploy/README.md` Tenancy section once dashboards/alerts live. Status: added Tenancy Observability section with import steps.

 ## Artefacts
 - Dashboard JSON: `ops/devops/tenant/dashboards/tenant-audit.json`
 - Alert rules: `ops/devops/tenant/alerts.yaml`
 - Recording rules: `ops/devops/tenant/recording-rules.yaml`
+- Load/perf harness: `ops/devops/tenant/k6-tenant-load.js`
+- Chaos script: `ops/devops/tenant/jwks-chaos.sh`
ops/devops/tenant/dashboards/tenant-audit.json (modified)
@@ -1,11 +1,18 @@
 {
   "title": "Tenant Audit & Auth",
   "timezone": "utc",
+  "templating": {
+    "list": [
+      { "name": "tenant", "type": "query", "datasource": "Prometheus", "query": "label_values(tenant_requests_total, tenant)", "refresh": 2, "multi": true, "includeAll": true },
+      { "name": "service", "type": "query", "datasource": "Prometheus", "query": "label_values(tenant_requests_total, service)", "refresh": 2, "multi": true, "includeAll": true }
+    ]
+  },
   "panels": [
-    {"type": "timeseries", "title": "Tenant request latency p95", "targets": [{"expr": "histogram_quantile(0.95, rate(tenant_requests_duration_seconds_bucket[5m]))"}]},
-    {"type": "timeseries", "title": "Tenant error rate", "targets": [{"expr": "sum(rate(tenant_requests_total{status=~\"5..\"}[5m])) / sum(rate(tenant_requests_total[5m]))"}]},
-    {"type": "timeseries", "title": "JWKS cache hit rate", "targets": [{"expr": "rate(auth_jwks_cache_hits_total[5m]) / (rate(auth_jwks_cache_hits_total[5m]) + rate(auth_jwks_cache_misses_total[5m]))"}]},
-    {"type": "timeseries", "title": "Rate limit hits", "targets": [{"expr": "rate(tenant_rate_limit_hits_total[5m])"}]},
-    {"type": "timeseries", "title": "Tenant queue depth", "targets": [{"expr": "tenant_queue_depth"}]}
+    { "type": "timeseries", "title": "p95 latency (by service)", "targets": [ { "expr": "tenant_latency_p95:5m{tenant=~\"$tenant\",service=~\"$service\"}" } ] },
+    { "type": "timeseries", "title": "Error rate", "targets": [ { "expr": "tenant_error_rate:5m{tenant=~\"$tenant\",service=~\"$service\"}" } ] },
+    { "type": "timeseries", "title": "Requests per second", "targets": [ { "expr": "tenant_requests_rate:5m{tenant=~\"$tenant\",service=~\"$service\"}" } ] },
+    { "type": "timeseries", "title": "JWKS cache hit ratio", "targets": [ { "expr": "jwks_cache_hit_ratio:5m" } ] },
+    { "type": "timeseries", "title": "Auth validation failures", "targets": [ { "expr": "rate(auth_token_validation_failures_total{tenant!=\"\",tenant=~\"$tenant\"}[5m])" } ] },
+    { "type": "timeseries", "title": "Rate limit hits", "targets": [ { "expr": "tenant_rate_limit_hits:5m{tenant=~\"$tenant\",service=~\"$service\"}" } ] }
   ]
 }
ops/devops/tenant/k6-tenant-load.js (new file, +84 lines)
@@ -0,0 +1,84 @@
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate, Trend } from 'k6/metrics';

const BASE_URL = __ENV.BASE_URL || 'http://localhost:8080';
const TENANT_HEADER = __ENV.TENANT_HEADER || 'X-StellaOps-Tenant';
const TENANTS = (__ENV.TENANTS || 'tenant-a,tenant-b,tenant-c,tenant-d,tenant-e,tenant-f,tenant-g,tenant-h,tenant-i,tenant-j')
  .split(',')
  .map((t) => t.trim())
  .filter(Boolean);
const READ_PATHS = (__ENV.TENANT_READ_PATHS || '/api/v1/policy/effective,/api/v1/vuln/search?limit=50,/notify/api/v1/events?limit=20,/health/readiness')
  .split(',')
  .map((p) => p.trim())
  .filter(Boolean);
const WRITE_PATHS = (__ENV.TENANT_WRITE_PATHS || '/api/v1/policy/evaluate,/notify/api/v1/test,/api/v1/tasks/submit')
  .split(',')
  .map((p) => p.trim())
  .filter(Boolean);

const READ_FRACTION = Number(__ENV.READ_FRACTION || '0.9');
const SLEEP_MS = Number(__ENV.SLEEP_MS || '250');
let seed = Number(__ENV.SEED || '1');

// Deterministic LCG so tenant/path selection is reproducible for a given SEED.
function rnd() {
  seed = (seed * 1664525 + 1013904223) >>> 0;
  return seed / 4294967296;
}

export const options = {
  vus: Number(__ENV.VUS || '250'),
  duration: __ENV.DURATION || '10m',
  thresholds: {
    http_req_failed: ['rate<0.005'],
    http_req_duration: ['p(95)<300'],
    'tenant_write_duration': ['p(95)<600'],
    'tenant_auth_failures': ['rate<0.01'],
  },
};

const readDuration = new Trend('tenant_read_duration', true);
const writeDuration = new Trend('tenant_write_duration', true);
const authFailures = new Rate('tenant_auth_failures');

function pick(list) {
  return list[Math.floor(rnd() * list.length)];
}

export default function () {
  const tenant = pick(TENANTS);
  const doWrite = rnd() > READ_FRACTION;
  const path = doWrite ? pick(WRITE_PATHS) : pick(READ_PATHS);

  const headers = {
    [TENANT_HEADER]: tenant,
    'Content-Type': 'application/json',
  };

  const url = `${BASE_URL}${path}`;
  const payload = JSON.stringify({
    tenant,
    traceId: __VU + '-' + Date.now(),
    now: new Date().toISOString(),
    sample: 'tenant-chaos',
  });

  const params = { headers, tags: { tenant, path, kind: doWrite ? 'write' : 'read' } };
  const res = doWrite ? http.post(url, payload, params) : http.get(url, params);

  check(res, {
    'status ok': (r) => r.status >= 200 && r.status < 300,
  });
  // Record a sample on every request so the Rate is a true failure ratio
  // (adding only 1s on 401/403 would pin the rate at 100% whenever any failure occurs).
  authFailures.add(res.status === 401 || res.status === 403);

  if (doWrite) {
    writeDuration.add(res.timings.duration);
  } else {
    readDuration.add(res.timings.duration);
  }

  sleep(SLEEP_MS / 1000);
}
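Before committing to the full 5k-VU benchmark, a short smoke run with the same script validates endpoints, headers, and thresholds (values illustrative):

```bash
# 30-second smoke run against a local stack; scale VUS/DURATION up for the real benchmark.
BASE_URL=http://localhost:8080 \
TENANTS=tenant-a,tenant-b \
VUS=10 DURATION=30s \
k6 run ops/devops/tenant/k6-tenant-load.js
```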
ops/devops/tenant/recording-rules.yaml (new file, +18 lines)
@@ -0,0 +1,18 @@
# Recording rules supporting tenant audit dashboards/alerts (DEVOPS-TEN-49-001)
apiVersion: 1
groups:
  - name: tenant-sli
    interval: 30s
    rules:
      - record: tenant_requests_rate:5m
        expr: sum by (tenant, service) (rate(tenant_requests_total[5m]))
      - record: tenant_error_rate:5m
        expr: sum by (tenant, service) (rate(tenant_requests_total{status=~"5.."}[5m])) /
          clamp_min(sum by (tenant, service) (rate(tenant_requests_total[5m])), 1)
      - record: tenant_latency_p95:5m
        expr: histogram_quantile(0.95, sum by (le, tenant, service) (rate(tenant_requests_duration_seconds_bucket[5m])))
      - record: jwks_cache_hit_ratio:5m
        expr: rate(auth_jwks_cache_hits_total[5m]) /
          clamp_min(rate(auth_jwks_cache_hits_total[5m]) + rate(auth_jwks_cache_misses_total[5m]), 1)
      - record: tenant_rate_limit_hits:5m
        expr: sum by (tenant, service) (rate(tenant_rate_limit_hits_total[5m]))
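Once loaded, the recorded series can be spot-checked straight from the Prometheus HTTP API (the host/port and tenant label below are assumptions; `jq` is optional pretty-printing):

```bash
# Query one of the recorded SLI series to confirm the rules evaluate.
curl -sG http://localhost:9090/api/v1/query \
  --data-urlencode 'query=tenant_error_rate:5m{tenant="tenant-a"}' | jq .
```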