work
This commit is contained in:
30
ops/devops/orchestrator/alerts.yaml
Normal file
30
ops/devops/orchestrator/alerts.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
groups:
  # Alerting rules for the orchestrator service: queue backlog, job failure
  # rate, and stalled lease renewals.
  - name: orchestrator-core
    rules:
      # Fires when job_queue_depth stays above 500 for 10 minutes.
      - alert: OrchestratorQueueDepthHigh
        expr: job_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Queue depth high"
          description: "job_queue_depth exceeded 500 for 10m"

      # rate() returns a per-second average over the 5m window, so this
      # threshold is 5 failures per SECOND, not per minute.
      # NOTE(review): if 5 failures/minute was the intended threshold, change
      # the expression to `rate(job_failures_total[5m]) * 60 > 5` and adjust
      # the description accordingly - confirm with the service owners.
      - alert: OrchestratorFailuresHigh
        expr: rate(job_failures_total[5m]) > 5
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Job failures elevated"
          description: "Failure rate above 5/s in last 5m"

      # Fires when the 5m rate of lease_extensions_total is exactly zero while
      # job_queue_depth is above zero, sustained for 5 minutes.
      # NOTE(review): `and` only keeps series whose label sets match on both
      # sides; verify both metrics carry identical labels, otherwise this
      # rule can never fire - TODO confirm.
      - alert: OrchestratorLeaseStall
        expr: rate(lease_extensions_total[5m]) == 0 and job_queue_depth > 0
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Leases stalled"
          description: "No lease renewals while queue has items"
|
||||
Reference in New Issue
Block a user