This commit is contained in:
		
							
								
								
									
										31
									
								
								docs/ops/zastava-runtime-prometheus-rules.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								docs/ops/zastava-runtime-prometheus-rules.yaml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| # Prometheus alerting rules for the zastava-runtime rule group. | ||||
| groups: | ||||
|   - name: zastava-runtime | ||||
|     # Every rule in this group is evaluated once per interval. | ||||
|     interval: 30s | ||||
|     rules: | ||||
|       # Fires when no runtime events are being emitted. | ||||
|       # A bare `== 0` comparison yields an empty result when the series does not | ||||
|       # exist at all (nothing ever exported it, or every target went stale), so | ||||
|       # the `absent()` branch keeps the alert firing in that case as well. | ||||
|       - alert: ZastavaRuntimeEventsSilent | ||||
|         expr: sum(rate(zastava_runtime_events_total[10m])) == 0 or absent(zastava_runtime_events_total) | ||||
|         # Condition must hold continuously for this long before the alert fires. | ||||
|         for: 15m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           service: zastava-runtime | ||||
|         annotations: | ||||
|           summary: "Observer events stalled" | ||||
|           description: "No runtime events emitted in the last 15 minutes. Check observer DaemonSet health and container runtime mounts." | ||||
|       # Fires when the p95 of backend latency stays above 750 ms. | ||||
|       # The histogram name carries the `_ms` suffix, so bucket bounds are taken | ||||
|       # to be milliseconds; the threshold is therefore 750 (not 0.75) to match | ||||
|       # the 750 ms stated in the summary. | ||||
|       - alert: ZastavaRuntimeBackendLatencyHigh | ||||
|         expr: histogram_quantile(0.95, sum by (le) (rate(zastava_runtime_backend_latency_ms_bucket[5m]))) > 750 | ||||
|         # Condition must hold continuously for this long before the alert fires. | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           service: zastava-runtime | ||||
|         annotations: | ||||
|           summary: "Runtime backend latency p95 above 750 ms" | ||||
|           description: "Latency to Scanner runtime APIs is elevated. Inspect Scanner.WebService readiness, Authority OpTok issuance, and cluster network." | ||||
|       # Fires when admission denies exceed 20 per minute. | ||||
|       # rate() returns a per-second value, so it is scaled by 60 to compare | ||||
|       # against the per-minute threshold stated in the description; without the | ||||
|       # scaling the alert would need 20 denies per second to trigger. | ||||
|       - alert: ZastavaAdmissionDenySpike | ||||
|         expr: sum(rate(zastava_admission_decisions_total{decision="deny"}[5m])) * 60 > 20 | ||||
|         # Condition must hold continuously for this long before the alert fires. | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           service: zastava-runtime | ||||
|         annotations: | ||||
|           summary: "Admission webhook denies exceeding threshold" | ||||
|           description: "Webhook is denying more than 20 pod admissions per minute. Confirm policy verdicts and consider fail-open exception for impacted namespaces." | ||||
		Reference in New Issue
	
	Block a user