Files
git.stella-ops.org/devops/offline/fixtures/telemetry/dashboards/ledger/alerts.yml

40 lines
1.6 KiB
YAML

# Prometheus alerting rules for the ledger service.
# A single rule group; `interval` sets how often the group's rules are
# evaluated. Each `expr` is a folded scalar (`>-`), so the line breaks inside
# it collapse to single spaces and no trailing newline is added.
groups:
  - name: ledger-observability
    interval: 30s
    rules:
      # p95 of ledger_write_latency_seconds, derived per tenant from the 5m
      # rate of the histogram buckets. The 0.12 threshold is the 120ms quoted
      # in the description; the condition must hold for 10m before firing.
      - alert: LedgerWriteLatencyHighP95
        expr: >-
          histogram_quantile(0.95,
          sum(rate(ledger_write_latency_seconds_bucket[5m])) by (le, tenant))
          > 0.12
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Ledger write latency p95 high (tenant {{ $labels.tenant }})"
          description: "ledger_write_latency_seconds p95 > 120ms for >10m. Check DB/queue."

      # Highest ledger_projection_lag_seconds sample seen in the trailing 10m
      # window, compared against 30.
      # NOTE(review): because max_over_time looks back 10m, one sample above
      # the threshold keeps this expression true for the whole window, which
      # is as long as `for` - confirm that this stickiness is intended.
      - alert: ProjectionLagHigh
        expr: >-
          max_over_time(ledger_projection_lag_seconds[10m]) > 30
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Ledger projection lag high"
          description: "projection lag over 30s; projections falling behind ingest."

      # Any non-zero 15m failure rate of the Merkle anchor counter, grouped by
      # tenant and reason so both labels are available to the annotations.
      - alert: MerkleAnchorFailures
        expr: >-
          sum(rate(ledger_merkle_anchor_failures_total[15m]))
          by (tenant, reason) > 0
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: "Merkle anchor failures (tenant {{ $labels.tenant }})"
          description: "Anchoring failures detected (reason={{ $labels.reason }}). Investigate signing/storage."

      # Any non-zero 10m failure rate of the attachment failure counter,
      # grouped by tenant and stage (both labels are used in the summary).
      - alert: AttachmentFailures
        expr: >-
          sum(rate(ledger_attachments_encryption_failures_total[10m]))
          by (tenant, stage) > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Attachment pipeline failures (tenant {{ $labels.tenant }}, stage {{ $labels.stage }})"
          description: "Attachment encryption/sign/upload reported failures in the last 10m."