up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
This commit is contained in:
42
ops/devops/exporter/alerts.yaml
Normal file
42
ops/devops/exporter/alerts.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
# Prometheus-style alerting rules for the exporter.
# All rules live in one group and carry team=devops; severity is set per rule.
groups:
  - name: exporter
    rules:
      # Per-second rate of processed jobs (5m window) stays below 1
      # for 10 minutes.
      - alert: ExporterThroughputLow
        expr: rate(exporter_jobs_processed_total[5m]) < 1
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "Exporter throughput low"
          description: "Processed <1 job/s over last 5m (current {{ $value }})."

      # Ratio of failed-job rate to processed-job rate (both 5m windows)
      # stays above 0.02 for 5 minutes.
      - alert: ExporterFailuresHigh
        expr: >-
          rate(exporter_jobs_failed_total[5m])
          / rate(exporter_jobs_processed_total[5m])
          > 0.02
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: "Exporter failure rate >2%"
          description: "Failure rate {{ $value | humanizePercentage }} over last 5m."

      # 0.95 quantile of job duration, computed from the histogram buckets
      # summed by `le`, stays above 3 for 5 minutes.
      - alert: ExporterLatencyP95High
        expr: >-
          histogram_quantile(0.95,
          sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))
          > 3
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "Exporter job p95 latency high"
          description: "Job p95 latency {{ $value }}s over last 5m (threshold 3s)."

      # exporter_queue_depth stays above 500 for 10 minutes.
      - alert: ExporterQueueDepthHigh
        expr: exporter_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "Exporter queue depth high"
          description: "Queue depth {{ $value }} exceeds 500 for >10m."
|
||||
29
ops/devops/exporter/grafana/exporter-overview.json
Normal file
29
ops/devops/exporter/grafana/exporter-overview.json
Normal file
@@ -0,0 +1,29 @@
|
||||
{
  "title": "Exporter Overview",
  "time": { "from": "now-24h", "to": "now" },
  "panels": [
    {
      "id": 1,
      "type": "stat",
      "title": "Queue depth",
      "gridPos": { "h": 8, "w": 6, "x": 0, "y": 0 },
      "targets": [{ "expr": "exporter_queue_depth" }]
    },
    {
      "id": 2,
      "type": "timeseries",
      "title": "Jobs processed / failed",
      "gridPos": { "h": 8, "w": 18, "x": 6, "y": 0 },
      "targets": [
        { "expr": "rate(exporter_jobs_processed_total[5m])", "legendFormat": "processed" },
        { "expr": "rate(exporter_jobs_failed_total[5m])", "legendFormat": "failed" }
      ]
    },
    {
      "id": 3,
      "type": "timeseries",
      "title": "Job duration p50/p95",
      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 },
      "targets": [
        { "expr": "histogram_quantile(0.5, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50" },
        { "expr": "histogram_quantile(0.95, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}
|
||||
Reference in New Issue
Block a user