CI/CD consolidation

This commit is contained in:
StellaOps Bot
2025-12-26 17:32:23 +02:00
parent a866eb6277
commit c786faae84
638 changed files with 3821 additions and 181 deletions

View File

@@ -1,69 +0,0 @@
# Prometheus alerting rules for the orchestrator service.
#
# Unit note: rate() returns a per-second average. Rules whose threshold is
# documented per minute multiply the rate by 60 so the expression and the
# description agree.
groups:
  - name: orchestrator-core
    rules:
      # job_queue_depth has stayed above 500 for the whole 10m hold period.
      - alert: OrchestratorQueueDepthHigh
        expr: job_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Queue depth high"
          description: "job_queue_depth exceeded 500 for 10m"

      # Per-series failure rate over a 5m window, converted from per-second
      # to per-minute (* 60) to match the documented 5/min threshold.
      - alert: OrchestratorFailuresHigh
        expr: rate(job_failures_total[5m]) * 60 > 5
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Job failures elevated"
          description: "Failure rate above 5/min in last 5m"

      # No lease extensions recorded over 5m while the queue is non-empty.
      # NOTE(review): `and` only keeps series whose label sets match on both
      # sides, and an absent lease_extensions_total series yields no result;
      # confirm both metrics are exported with the same labels.
      - alert: OrchestratorLeaseStall
        expr: rate(lease_extensions_total[5m]) == 0 and job_queue_depth > 0
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Leases stalled"
          description: "No lease renewals while queue has items"

      # job_dlq_depth has stayed above 10 for the whole 10m hold period.
      - alert: OrchestratorDLQDepthHigh
        expr: job_dlq_depth > 10
        for: 10m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "DLQ depth high"
          description: "Dead-letter queue depth above 10 for 10m"

      # 5m average of rate_limiter_backpressure_ratio exceeds 0.5 (50%).
      - alert: OrchestratorBackpressure
        expr: avg_over_time(rate_limiter_backpressure_ratio[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
          service: orchestrator
        annotations:
          summary: "Backpressure elevated"
          description: "Rate limiter backpressure >50% over 5m"

      # Failure rate aggregated per jobType, converted from per-second to
      # per-minute (* 60) to match the documented 3/min threshold.
      - alert: OrchestratorErrorCluster
        expr: sum by(jobType) (rate(job_failures_total[5m])) * 60 > 3
        for: 5m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Error cluster detected"
          description: "Failure rate >3/min for a job type"

      # Two-window check: failures/processed must exceed 2% over 5m AND 1%
      # over 30m. clamp_min(..., 1) floors the denominator at 1 job/s, which
      # avoids division by zero.
      # NOTE(review): that floor also understates the ratio whenever
      # throughput is below 1 job/s; confirm this damping is intended.
      - alert: OrchestratorFailureBurnRateHigh
        expr: |
          (rate(job_failures_total[5m]) / clamp_min(rate(job_processed_total[5m]), 1)) > 0.02
          and
          (rate(job_failures_total[30m]) / clamp_min(rate(job_processed_total[30m]), 1)) > 0.01
        for: 10m
        labels:
          severity: critical
          service: orchestrator
        annotations:
          summary: "Failure burn rate breaching SLO"
          description: "5m/30m failure burn rate above 2%/1% SLO; investigate upstream jobs and dependencies."