# ExportCenter Alert Rules
# SLO burn-rate alerts for export service reliability

groups:
- name: export-center-slo
  interval: 30s
  rules:
  # SLO: 99.5% success rate target
  # Error budget: 0.5% (432 errors per day at 86400 requests/day)
  #
  # Both alerts compare the failed/total ratio against a multiple of the
  # 0.5% budget (0.005). The "budget consumed" figures below assume a
  # 30-day SLO window (720h): consumed = burn_rate * window_hours / 720.

  # Fast burn - 14.4x burn rate, i.e. 2% of the budget consumed in 1 hour
  # (critical)
  - alert: ExportCenterHighErrorBurnRate
    expr: |
      (
        sum(rate(export_runs_failed_total[1h]))
        /
        sum(rate(export_runs_total[1h]))
      ) > (14.4 * 0.005)
    for: 2m
    labels:
      severity: critical
      service: export-center
      slo: availability
    annotations:
      summary: 'ExportCenter high error burn rate'
      description: >-
        Error rate is {{ $value | humanizePercentage }} over the last hour,
        consuming error budget at 14.4x the sustainable rate.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-error-rate'

  # Slow burn - 6x burn rate, i.e. 5% of the budget consumed in 6 hours
  # (warning)
  - alert: ExportCenterElevatedErrorBurnRate
    expr: |
      (
        sum(rate(export_runs_failed_total[6h]))
        /
        sum(rate(export_runs_total[6h]))
      ) > (6 * 0.005)
    for: 5m
    labels:
      severity: warning
      service: export-center
      slo: availability
    annotations:
      summary: 'ExportCenter elevated error burn rate'
      description: >-
        Error rate is {{ $value | humanizePercentage }} over the last 6 hours,
        consuming error budget at 6x the sustainable rate.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/elevated-error-rate'

- name: export-center-latency
  interval: 30s
  rules:
  # SLO: 95% of exports complete within 120s

  # Warning: p95 export duration (5m rate window) above the 120s target
  # for 5 minutes.
  - alert: ExportCenterHighLatency
    expr: |
      histogram_quantile(0.95,
        sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
      ) > 120
    for: 5m
    labels:
      severity: warning
      service: export-center
      slo: latency
    annotations:
      summary: 'ExportCenter high latency'
      description: >-
        95th percentile export duration is {{ $value | humanizeDuration }},
        exceeding 120s SLO target.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-latency'

  # Critical: p99 export duration (5m rate window) above 300s (5 minutes)
  # for 2 minutes.
  - alert: ExportCenterCriticalLatency
    expr: |
      histogram_quantile(0.99,
        sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
      ) > 300
    for: 2m
    labels:
      severity: critical
      service: export-center
      slo: latency
    annotations:
      summary: 'ExportCenter critical latency'
      description: >-
        99th percentile export duration is {{ $value | humanizeDuration }},
        indicating severe performance degradation.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/critical-latency'

- name: export-center-capacity
  interval: 60s
  rules:
  # Queue buildup: more than 50 exports in progress (summed across all
  # series) for 5 minutes.
  - alert: ExportCenterHighConcurrency
    expr: sum(export_runs_in_progress) > 50
    for: 5m
    labels:
      severity: warning
      service: export-center
    annotations:
      summary: 'ExportCenter high concurrency'
      description: >-
        {{ $value }} exports currently in progress.
        Consider scaling or investigating slow exports.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/high-concurrency'

  # Stuck exports: p99 duration of runs whose status is not "completed"
  # exceeds 1800s (30 minutes) over a 1h rate window.
  # NOTE(review): a duration histogram is presumably only observed when a
  # run ends, so runs that never finish may not show up here - confirm
  # against the service instrumentation.
  - alert: ExportCenterStuckExports
    expr: |
      histogram_quantile(0.99,
        sum(rate(export_run_duration_seconds_bucket{status!="completed"}[1h])) by (le)
      ) > 1800
    for: 10m
    labels:
      severity: warning
      service: export-center
    annotations:
      summary: 'ExportCenter potentially stuck exports'
      description: >-
        Some exports may be stuck - 99th percentile duration for incomplete
        exports exceeds 30 minutes.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/stuck-exports'

- name: export-center-errors
  interval: 30s
  rules:
  # Specific error code spike: any single error_code failing faster than
  # 0.1 failures/s (5m rate) for 5 minutes. One alert instance per
  # error_code label value.
  - alert: ExportCenterErrorCodeSpike
    expr: |
      sum by (error_code) (
        rate(export_runs_failed_total[5m])
      ) > 0.1
    for: 5m
    labels:
      severity: warning
      service: export-center
    annotations:
      summary: 'ExportCenter error code spike: {{ $labels.error_code }}'
      description: >-
        Error code {{ $labels.error_code }} is occurring at
        {{ $value | humanize }}/s rate.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/error-codes'

  # No successful exports in 15 minutes (when there is traffic)
  #
  # `or vector(0)` matters: if no export_runs_success_total series exists
  # in the window (e.g. nothing has succeeded since the last restart),
  # sum(rate(...)) returns an empty vector rather than 0, the `== 0`
  # comparison matches nothing, and the alert could never fire in exactly
  # the total-outage case it is meant to catch. Both sides of `and` carry
  # an empty label set, so they still match one-to-one.
  - alert: ExportCenterNoSuccessfulExports
    expr: |
      (
        sum(rate(export_runs_total[15m])) > 0
      )
      and
      (
        (sum(rate(export_runs_success_total[15m])) or vector(0)) == 0
      )
    for: 10m
    labels:
      severity: critical
      service: export-center
    annotations:
      summary: 'ExportCenter no successful exports'
      description: >-
        No exports have completed successfully in the last 15 minutes
        despite ongoing attempts.
      runbook_url: 'https://docs.stellaops.io/runbooks/export-center/no-successful-exports'

- name: export-center-deprecation
  interval: 5m
  rules:
  # Deprecated endpoint usage: informational alert raised when the
  # deprecated-endpoint access counter has a non-zero 1h rate that
  # persists for a further hour.
  - alert: ExportCenterDeprecatedEndpointUsage
    expr: |
      sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
    for: 1h
    labels:
      severity: info
      service: export-center
    annotations:
      summary: 'Deprecated export endpoints still in use'
      description: >-
        Legacy /exports endpoints are still being accessed at
        {{ $value | humanize }}/s. Migration to v1 API recommended.
      runbook_url: 'https://docs.stellaops.io/api/export-center/migration'