Files
git.stella-ops.org/docs/ops/zastava-runtime-prometheus-rules.yaml
master 17d861e4ab
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
up
2025-10-24 09:15:37 +03:00

32 lines
1.4 KiB
YAML

# Prometheus alerting rules for the Zastava runtime components.
# A single rule group, evaluated every 30s, holding three alerts:
#   - ZastavaRuntimeEventsSilent       : the runtime event counter has stopped moving
#   - ZastavaRuntimeBackendLatencyHigh : p95 backend latency above 750 ms
#   - ZastavaAdmissionDenySpike        : admission denies above 20 per minute
groups:
  - name: zastava-runtime
    interval: 30s
    rules:
      # Fires when the summed event rate is exactly zero for 15m.
      # `sum(rate(...)) == 0` yields an empty result (and so never fires) when
      # the series does not exist at all, e.g. no observer is being scraped;
      # the `absent()` arm covers that case as well.
      - alert: ZastavaRuntimeEventsSilent
        expr: >-
          (sum(rate(zastava_runtime_events_total[10m])) == 0)
          or absent(zastava_runtime_events_total)
        for: 15m
        labels:
          severity: warning
          service: zastava-runtime
        annotations:
          summary: "Observer events stalled"
          description: "No runtime events emitted in the last 15 minutes. Check observer DaemonSet health and container runtime mounts."

      # p95 over all series of the latency histogram, 5m window.
      # histogram_quantile() returns a value in the unit of the bucket bounds.
      # The metric name carries an `_ms` suffix, so the threshold is written in
      # milliseconds (750) to match the "750 ms" stated in the summary.
      # NOTE(review): assumes bucket bounds really are milliseconds as the
      # metric name implies - confirm against the instrumentation.
      - alert: ZastavaRuntimeBackendLatencyHigh
        expr: >-
          histogram_quantile(
          0.95,
          sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket[5m]))
          ) > 750
        for: 10m
        labels:
          severity: critical
          service: zastava-runtime
        annotations:
          summary: "Runtime backend latency p95 above 750 ms"
          description: "Latency to Scanner runtime APIs is elevated. Inspect Scanner.WebService readiness, Authority OpTok issuance, and cluster network."

      # rate() is a per-second average; multiply by 60 so the comparison is in
      # denies per minute, which is the unit the description states.
      - alert: ZastavaAdmissionDenySpike
        expr: >-
          sum(rate(zastava_admission_decisions_total{decision="deny"}[5m])) * 60
          > 20
        for: 5m
        labels:
          severity: warning
          service: zastava-runtime
        annotations:
          summary: "Admission webhook denies exceeding threshold"
          description: "Webhook is denying more than 20 pod admissions per minute. Confirm policy verdicts and consider fail-open exception for impacted namespaces."