---
# Notify SLO Alerts
# Prometheus alerting rules for the notification service.
#
# One rule group (notify-slo) holding three alerts: delivery success ratio,
# backlog depth, and P99 delivery latency. Every alert carries a `severity`
# label and `service: notify`.

groups:
  - name: notify-slo
    rules:
      # Success ratio = rate of successful deliveries / rate of all
      # deliveries, each taken over a 5m window and summed across all series.
      # The condition is "ratio below 0.99" and must hold for 5m (`for`)
      # before the alert fires.
      # NOTE(review): with zero delivery traffic the denominator is 0, so the
      # ratio is presumably NaN and the alert stays silent -- confirm that is
      # the intended behaviour.
      - alert: NotifyDeliverySuccessSLO
        expr: |
          (
            sum(rate(notify_delivery_success_total[5m])) /
            sum(rate(notify_delivery_total[5m]))
          ) < 0.99
        for: 5m
        labels:
          severity: critical
          service: notify
        annotations:
          summary: "Notification delivery success rate below SLO"
          # $value is the ratio from `expr`, rendered as a percentage.
          description: "Current success rate: {{ $value | humanizePercentage }}"

      # Backlog depth above 10000, sustained for 10m. The expression has no
      # aggregation, so each `notify_backlog_depth` series is compared on its
      # own.
      - alert: NotifyBacklogDepth
        expr: notify_backlog_depth > 10000
        for: 10m
        labels:
          severity: warning
          service: notify
        annotations:
          summary: "Notification backlog depth high"
          description: "Current backlog: {{ $value }} notifications"

      # 0.99 quantile of delivery duration, computed from the histogram
      # buckets (5m rate, summed by `le`). The condition is "quantile above 5"
      # (seconds, going by the metric name) and must hold for 5m.
      - alert: NotifyLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum(rate(notify_delivery_duration_seconds_bucket[5m])) by (le)
          ) > 5
        for: 5m
        labels:
          severity: warning
          service: notify
        annotations:
          summary: "Notification delivery P99 latency high"
          description: "P99 latency: {{ $value | humanizeDuration }}"