Files
git.stella-ops.org/ops/devops/lnm/alerts/lnm-alerts.yaml
StellaOps Bot 2e70c9fdb6
Some checks failed
LNM Migration CI / build-runner (push) Has been cancelled
Ledger OpenAPI CI / deprecation-check (push) Has been cancelled
Docs CI / lint-and-preview (push) Has been cancelled
Airgap Sealed CI Smoke / sealed-smoke (push) Has been cancelled
Ledger Packs CI / build-pack (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Ledger OpenAPI CI / validate-oas (push) Has been cancelled
Ledger OpenAPI CI / check-wellknown (push) Has been cancelled
Ledger Packs CI / verify-pack (push) Has been cancelled
LNM Migration CI / validate-metrics (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
up
2025-12-14 18:33:02 +02:00

58 lines
1.8 KiB
YAML

# LNM Migration Alert Rules
# Prometheus alerting rules for linkset/advisory migrations
groups:
- name: lnm-migration
rules:
- alert: LnmMigrationErrorRate
expr: rate(lnm_migration_errors_total[5m]) > 0.1
for: 5m
labels:
severity: warning
team: concelier
annotations:
summary: "LNM migration error rate elevated"
description: "Migration errors: {{ $value | printf \"%.2f\" }}/s"
- alert: LnmBackfillStalled
expr: increase(lnm_backfill_processed_total[10m]) == 0 and lnm_backfill_running == 1
for: 10m
labels:
severity: critical
team: concelier
annotations:
summary: "LNM backfill stalled"
description: "No progress in 10 minutes while backfill is running"
- alert: LnmLinksetCountMismatch
expr: abs(lnm_linksets_total - lnm_linksets_expected) > 100
for: 15m
labels:
severity: warning
team: concelier
annotations:
summary: "Linkset count mismatch"
description: "Expected {{ $labels.expected }}, got {{ $value }}"
- alert: LnmObservationsBacklogHigh
expr: lnm_observations_backlog > 10000
for: 5m
labels:
severity: warning
team: excititor
annotations:
summary: "Advisory observations backlog high"
description: "Backlog: {{ $value }} items"
- name: lnm-sla
rules:
- alert: LnmIngestToApiLatencyHigh
expr: histogram_quantile(0.95, rate(lnm_ingest_to_api_latency_seconds_bucket[5m])) > 30
for: 10m
labels:
severity: warning
team: platform
annotations:
summary: "Ingest to API latency exceeds SLA"
description: "P95 latency: {{ $value | printf \"%.1f\" }}s (SLA: 30s)"