# ExportCenter Alert Rules
# SLO burn-rate, latency, capacity, error and deprecation alerts for the
# export service.
#
# Every rule carries a `service: export-center` label and a `runbook_url`
# annotation; SLO-derived rules additionally carry an `slo` label.
groups:
  # Availability SLO: ratio of failed export runs to all export runs.
  - name: export-center-slo
    interval: 30s
    rules:
      # SLO: 99.5% success rate target
      # Error budget: 0.5% (432 errors per day at 86400 requests/day)

      # Fast burn - 2% of a 30-day budget consumed in 1 hour (critical).
      # Threshold is 14.4 * 0.005 = 7.2% failed runs over the last hour.
      - alert: ExportCenterHighErrorBurnRate
        expr: |
          (
            sum(rate(export_runs_failed_total[1h]))
            /
            sum(rate(export_runs_total[1h]))
          ) > (14.4 * 0.005)
        for: 2m
        labels:
          severity: critical
          service: export-center
          slo: availability
        annotations:
          summary: "ExportCenter high error burn rate"
          description: "Error rate is {{ $value | humanizePercentage }} over the last hour, consuming error budget at 14.4x the sustainable rate."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-error-rate"

      # Slow burn - 5% of a 30-day budget consumed in 6 hours (warning).
      # Threshold is 6 * 0.005 = 3% failed runs over the last 6 hours.
      - alert: ExportCenterElevatedErrorBurnRate
        expr: |
          (
            sum(rate(export_runs_failed_total[6h]))
            /
            sum(rate(export_runs_total[6h]))
          ) > (6 * 0.005)
        for: 5m
        labels:
          severity: warning
          service: export-center
          slo: availability
        annotations:
          summary: "ExportCenter elevated error burn rate"
          description: "Error rate is {{ $value | humanizePercentage }} over the last 6 hours, consuming error budget at 6x the sustainable rate."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/elevated-error-rate"

  # Latency SLO: quantiles estimated from the export duration histogram.
  - name: export-center-latency
    interval: 30s
    rules:
      # SLO: 95% of exports complete within 120s

      # Fast burn - p95 latency exceeding threshold
      - alert: ExportCenterHighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
          ) > 120
        for: 5m
        labels:
          severity: warning
          service: export-center
          slo: latency
        annotations:
          summary: "ExportCenter high latency"
          description: "95th percentile export duration is {{ $value | humanizeDuration }}, exceeding 120s SLO target."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-latency"

      # Critical latency - p99 exceeding 5 minutes
      - alert: ExportCenterCriticalLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
          ) > 300
        for: 2m
        labels:
          severity: critical
          service: export-center
          slo: latency
        annotations:
          summary: "ExportCenter critical latency"
          description: "99th percentile export duration is {{ $value | humanizeDuration }}, indicating severe performance degradation."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/critical-latency"

  # Capacity: in-flight export volume and long-running exports.
  - name: export-center-capacity
    interval: 60s
    rules:
      # Queue buildup warning - more than 50 exports in progress in total
      - alert: ExportCenterHighConcurrency
        expr: sum(export_runs_in_progress) > 50
        for: 5m
        labels:
          severity: warning
          service: export-center
        annotations:
          summary: "ExportCenter high concurrency"
          description: "{{ $value }} exports currently in progress. Consider scaling or investigating slow exports."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-concurrency"

      # Stuck exports - exports running longer than 30 minutes
      # NOTE(review): this selector assumes export_run_duration_seconds has
      # a `status` label and that runs which have not completed are observed
      # into the histogram - confirm against the service instrumentation.
      - alert: ExportCenterStuckExports
        expr: |
          histogram_quantile(0.99,
            sum(rate(export_run_duration_seconds_bucket{status!="completed"}[1h])) by (le)
          ) > 1800
        for: 10m
        labels:
          severity: warning
          service: export-center
        annotations:
          summary: "ExportCenter potentially stuck exports"
          description: "Some exports may be stuck - 99th percentile duration for incomplete exports exceeds 30 minutes."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/stuck-exports"

  # Error-specific alerts that are not expressed as SLO burn rates.
  - name: export-center-errors
    interval: 30s
    rules:
      # Specific error code spike - any single error_code above 0.1 failures/s
      - alert: ExportCenterErrorCodeSpike
        expr: |
          sum by (error_code) (
            rate(export_runs_failed_total[5m])
          ) > 0.1
        for: 5m
        labels:
          severity: warning
          service: export-center
        annotations:
          summary: "ExportCenter error code spike: {{ $labels.error_code }}"
          description: "Error code {{ $labels.error_code }} is occurring at {{ $value | humanize }}/s rate."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/error-codes"

      # No successful exports in 15 minutes (when there is traffic)
      # `or vector(0)` supplies a zero when the success counter has no
      # series at all: sum() over a missing series returns an empty vector,
      # so without the fallback the `== 0` comparison would match nothing
      # and the alert could never fire in that situation.
      - alert: ExportCenterNoSuccessfulExports
        expr: |
          (
            sum(rate(export_runs_total[15m])) > 0
          )
          and
          (
            (sum(rate(export_runs_success_total[15m])) or vector(0)) == 0
          )
        for: 10m
        labels:
          severity: critical
          service: export-center
        annotations:
          summary: "ExportCenter no successful exports"
          description: "No exports have completed successfully in the last 15 minutes despite ongoing attempts."
          runbook_url: "https://docs.stellaops.io/runbooks/export-center/no-successful-exports"

  # Deprecation tracking for legacy endpoints (informational only).
  - name: export-center-deprecation
    interval: 5m
    rules:
      # Deprecated endpoint usage - any access sustained for an hour
      - alert: ExportCenterDeprecatedEndpointUsage
        expr: |
          sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
        for: 1h
        labels:
          severity: info
          service: export-center
        annotations:
          summary: "Deprecated export endpoints still in use"
          description: "Legacy /exports endpoints are still being accessed at {{ $value | humanize }}/s. Migration to v1 API recommended."
          runbook_url: "https://docs.stellaops.io/api/export-center/migration"