up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
This commit is contained in:
36
ops/devops/observability/alerts-slo.yaml
Normal file
36
ops/devops/observability/alerts-slo.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
groups:
|
||||
- name: slo-burn
|
||||
rules:
|
||||
- alert: SLOBurnRateFast
|
||||
expr: |
|
||||
(rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
|
||||
4 * (1 - 0.99)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
team: devops
|
||||
annotations:
|
||||
summary: "Fast burn: 99% SLO breached"
|
||||
description: "Error budget burn (5m) exceeds fast threshold."
|
||||
- alert: SLOBurnRateSlow
|
||||
expr: |
|
||||
(rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
|
||||
1 * (1 - 0.99)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
team: devops
|
||||
annotations:
|
||||
summary: "Slow burn: 99% SLO at risk"
|
||||
description: "Error budget burn (1h) exceeds slow threshold."
|
||||
- name: slo-webhook
|
||||
rules:
|
||||
- alert: SLOWebhookFailures
|
||||
expr: rate(slo_webhook_failures_total[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
team: devops
|
||||
annotations:
|
||||
summary: "SLO webhook failures"
|
||||
description: "Webhook emitter has failures in last 5m."
|
||||
26
ops/devops/observability/grafana/slo-burn.json
Normal file
26
ops/devops/observability/grafana/slo-burn.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"title": "SLO Burn",
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Error rate",
|
||||
"targets": [
|
||||
{ "expr": "rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])", "legendFormat": "5m" },
|
||||
{ "expr": "rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])", "legendFormat": "1h" }
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": { "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.01 } ] } }
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Budget used (24h)",
|
||||
"targets": [
|
||||
{ "expr": "(sum_over_time(service_request_errors_total[24h]) / sum_over_time(service_requests_total[24h]))" }
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"version": 1
|
||||
}
|
||||
Reference in New Issue
Block a user