# Files
# git.stella-ops.org/devops/observability/signals-alerts.yaml
# 2025-12-26 18:11:06 +02:00
#
# 55 lines
# 1.9 KiB
# YAML

---
# Prometheus-style alerting rules for the Signals service.
# One rule group; every rule carries `service: signals` plus a severity label.
groups:
  - name: signals-pipeline
    rules:
      # p95 of the reachability scoring duration histogram, computed from
      # 5m bucket rates summed over every label except `le`. Condition must
      # hold above 2 (seconds, per the metric name) for 10 minutes.
      - alert: SignalsScoringLatencyP95High
        expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
        for: 10m
        labels:
          severity: warning
          service: signals
        annotations:
          summary: "Signals scoring latency high (p95)"
          description: "Reachability scoring p95 exceeds 2s for 10m"

      # Miss rate divided by (hit rate + miss rate), both over 5m windows;
      # condition must hold above 0.3 for 10 minutes.
      # NOTE(review): clamp_min(..., 1) floors the denominator at 1/s, so
      # while hits + misses is below 1/s the left-hand side is the raw miss
      # rate rather than a ratio. Presumably a divide-by-zero guard; confirm
      # that under-reporting at low traffic is intended.
      - alert: SignalsCacheMissRateHigh
        expr: |
          clamp_min(rate(signals_cache_misses_total[5m]), 0)
            / clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1) > 0.3
        for: 10m
        labels:
          severity: warning
          service: signals
        annotations:
          summary: "Signals cache miss rate high"
          description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."

      # Matches while signals_cache_available equals 0; must hold for 2 minutes.
      # NOTE(review): `== 0` returns nothing when the series is absent, so
      # this rule stays silent if the metric stops being exported; confirm
      # that case is covered elsewhere.
      - alert: SignalsCacheDown
        expr: signals_cache_available == 0
        for: 2m
        labels:
          severity: critical
          service: signals
        annotations:
          summary: "Signals cache unavailable"
          description: "Redis cache reported unavailable for >2m"

      # Per `sensor` label: seconds elapsed since the newest last-seen
      # timestamp. The condition becomes true past 900 s; `for: 5m` then
      # delays firing by a further 5 minutes.
      - alert: SignalsSensorStaleness
        expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
        for: 5m
        labels:
          severity: warning
          service: signals
        annotations:
          summary: "Signals sensor stale"
          description: "No updates from sensor for >15 minutes"

      # Failure rate divided by total ingestion rate over 5m windows;
      # condition must hold above 0.05 for 5 minutes.
      # NOTE(review): same denominator floor as the cache miss rule. Below
      # 1 ingestion/s the comparison is against the absolute failure rate,
      # not the failure ratio. Confirm this is intended for a critical alert.
      - alert: SignalsIngestionErrorRate
        expr: clamp_min(rate(signals_ingestion_failures_total[5m]), 0) / clamp_min(rate(signals_ingestion_total[5m]), 1) > 0.05
        for: 5m
        labels:
          severity: critical
          service: signals
        annotations:
          summary: "Signals ingestion failures elevated"
          description: "Ingestion failure ratio above 5% over 5m"