Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
70 lines
2.4 KiB
YAML
70 lines
2.4 KiB
YAML
# Prometheus alerting rules for the orchestrator service.
# Every rule carries `service: orchestrator` plus a `severity` label
# (warning or critical).
groups:
  - name: orchestrator-core
    rules:
      # Job queue depth has stayed above 500 for 10 minutes.
      - alert: OrchestratorQueueDepthHigh
        expr: job_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Queue depth high"
          description: "job_queue_depth exceeded 500 for 10m"

      # Job failures exceed 5 per minute.
      # rate() returns a per-second average, so it is scaled by 60 to match
      # the per-minute threshold stated in the description. The unscaled
      # comparison only fired at 5 failures per *second* (300/min).
      - alert: OrchestratorFailuresHigh
        expr: rate(job_failures_total[5m]) * 60 > 5
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Job failures elevated"
          description: "Failure rate above 5/min in last 5m"

      # No lease extensions were recorded over 5 minutes while the queue is
      # non-empty.
      # NOTE(review): `and` keeps a left-hand series only when a right-hand
      # series with an identical label set exists. Confirm that
      # lease_extensions_total and job_queue_depth expose matching labels,
      # otherwise add an `on(...)` / `ignoring(...)` modifier.
      - alert: OrchestratorLeaseStall
        expr: rate(lease_extensions_total[5m]) == 0 and job_queue_depth > 0
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Leases stalled"
          description: "No lease renewals while queue has items"

      # Dead-letter queue depth has stayed above 10 for 10 minutes.
      - alert: OrchestratorDLQDepthHigh
        expr: job_dlq_depth > 10
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "DLQ depth high"
          description: "Dead-letter queue depth above 10 for 10m"

      # The 5-minute average of the rate limiter backpressure ratio exceeds 0.5.
      - alert: OrchestratorBackpressure
        expr: avg_over_time(rate_limiter_backpressure_ratio[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Backpressure elevated"
          description: "Rate limiter backpressure >50% over 5m"

      # Failures summed per jobType exceed 3 per minute.
      # rate() is per-second, so the sum is scaled by 60 to match the
      # per-minute threshold stated in the description.
      - alert: OrchestratorErrorCluster
        expr: sum by(jobType) (rate(job_failures_total[5m])) * 60 > 3
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Error cluster detected"
          description: "Failure rate >3/min for a job type"

      # Two-window burn-rate check: the failure ratio must exceed 2% over 5m
      # AND 1% over 30m before the alert becomes pending.
      # clamp_min(..., 1) floors the denominator at 1, which prevents
      # division by zero.
      # NOTE(review): because rate() is per-second, the floor also means the
      # ratio is understated whenever fewer than 1 job/s is processed.
      # Confirm this is the intended low-traffic behaviour.
      - alert: OrchestratorFailureBurnRateHigh
        expr: |
          (rate(job_failures_total[5m]) / clamp_min(rate(job_processed_total[5m]), 1)) > 0.02
          and
          (rate(job_failures_total[30m]) / clamp_min(rate(job_processed_total[30m]), 1)) > 0.01
        for: 10m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Failure burn rate breaching SLO"
          description: "5m/30m failure burn rate above 2%/1% SLO; investigate upstream jobs and dependencies."