Files
git.stella-ops.org/ops/devops/observability/signals-alerts.yaml
StellaOps Bot 9f6e6f7fb3
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
Signals CI & Image / signals-ci (push) Has been cancelled
Policy Lint & Smoke / policy-lint (push) Has been cancelled
Policy Simulation / policy-simulate (push) Has been cancelled
SDK Publish & Sign / sdk-publish (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Concelier Attestation Tests / attestation-tests (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
up
2025-11-25 22:09:44 +02:00

55 lines
1.9 KiB
YAML

groups:
- name: signals-pipeline
rules:
- alert: SignalsScoringLatencyP95High
expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
for: 10m
labels:
severity: warning
service: signals
annotations:
summary: "Signals scoring latency high (p95)"
description: "Reachability scoring p95 exceeds 2s for 10m"
- alert: SignalsCacheMissRateHigh
expr: |
clamp_min(rate(signals_cache_misses_total[5m]), 0)
/ clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1) > 0.3
for: 10m
labels:
severity: warning
service: signals
annotations:
summary: "Signals cache miss rate high"
description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."
- alert: SignalsCacheDown
expr: signals_cache_available == 0
for: 2m
labels:
severity: critical
service: signals
annotations:
summary: "Signals cache unavailable"
description: "Redis cache reported unavailable for >2m"
- alert: SignalsSensorStaleness
expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
for: 5m
labels:
severity: warning
service: signals
annotations:
summary: "Signals sensor stale"
description: "No updates from sensor for >15 minutes"
- alert: SignalsIngestionErrorRate
expr: clamp_min(rate(signals_ingestion_failures_total[5m]), 0) / clamp_min(rate(signals_ingestion_total[5m]), 1) > 0.05
for: 5m
labels:
severity: critical
service: signals
annotations:
summary: "Signals ingestion failures elevated"
description: "Ingestion failure ratio above 5% over 5m"