# File: git.stella-ops.org/devops/telemetry/alerts/stella-p0-alerts.yml
# Repository viewer metadata: 119 lines, 5.3 KiB, YAML

# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
# Task: P0M-006 - Alerting Rules
# P0 Product Metrics Alert Rules
groups:
  # Single rule group holding every P0 product-metric alert. Each metric has a
  # warning-level and a critical-level rule (the determinism metric has one
  # rule per regression severity instead).
  - name: stella-p0-metrics
    rules:
      # ---------------------------------------------------------------------
      # P0M-001: Time to First Verified Release
      # Per-tenant P90 of the histogram, computed from a 24h rate window.
      # `tenant` survives the aggregation, so annotations can reference it.
      # ---------------------------------------------------------------------

      # Warning: P90 above 14400 s (4 hours), sustained for 1 hour.
      - alert: StellaTimeToFirstReleaseHigh
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (
              rate(stella_time_to_first_verified_release_seconds_bucket[24h])
            )
          ) > 14400
        for: 1h
        labels:
          severity: warning
          category: adoption
        annotations:
          summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # Critical: same query, threshold raised to 86400 s (24 hours).
      - alert: StellaTimeToFirstReleaseCritical
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (
              rate(stella_time_to_first_verified_release_seconds_bucket[24h])
            )
          ) > 86400
        for: 1h
        labels:
          severity: critical
          category: adoption
        annotations:
          summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # ---------------------------------------------------------------------
      # P0M-002: Why Blocked Latency
      # Per-tenant P90 of the histogram, computed from a 1h rate window.
      # ---------------------------------------------------------------------

      # Warning: P90 above 300 s (5 minutes), sustained for 30 minutes.
      - alert: StellaWhyBlockedLatencyHigh
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (
              rate(stella_why_blocked_latency_seconds_bucket[1h])
            )
          ) > 300
        for: 30m
        labels:
          severity: warning
          category: usability
        annotations:
          summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # Critical: same query, threshold raised to 3600 s (1 hour).
      - alert: StellaWhyBlockedLatencyCritical
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (
              rate(stella_why_blocked_latency_seconds_bucket[1h])
            )
          ) > 3600
        for: 30m
        labels:
          severity: critical
          category: usability
        annotations:
          summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # ---------------------------------------------------------------------
      # P0M-003: Support Burden
      # Compares the summed counter value per (tenant, month) to a threshold
      # and fires immediately (for: 0m).
      # NOTE(review): the raw counter is compared, not increase(); presumably
      # the `month` label scopes each series to one calendar month and the
      # exporter survives restarts without resetting it - confirm.
      # ---------------------------------------------------------------------

      # Warning: more than 30 minutes.
      - alert: StellaSupportBurdenHigh
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 30
        for: 0m
        labels:
          severity: warning
          category: operations
        annotations:
          summary: "Support burden high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # Critical: more than 60 minutes.
      - alert: StellaSupportBurdenCritical
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 60
        for: 0m
        labels:
          severity: critical
          category: operations
        annotations:
          summary: "Support burden critically high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # ---------------------------------------------------------------------
      # P0M-004: Determinism Regressions
      # One rule per value of the metric's own `severity` label. No
      # aggregation is applied, so every label on the series is kept.
      # NOTE(review): annotations reference `tenant` and `component`;
      # presumably the counter carries both labels - confirm.
      # ---------------------------------------------------------------------

      # Critical: any policy-level regression in the last 5 minutes.
      - alert: StellaDeterminismRegression
        expr: |
          increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
        for: 0m
        labels:
          severity: critical
          category: reliability
        annotations:
          summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning: any semantic-level regression in the last hour.
      - alert: StellaDeterminismRegressionSemantic
        expr: |
          increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning: more than 5 bitwise-level regressions in the last 24 hours.
      - alert: StellaDeterminismRegressionBitwise
        expr: |
          increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
          # increase() extrapolates, so $value is usually fractional; round it
          # to a whole count. Single-quoted so the template's inner double
          # quotes need no escaping.
          description: '{{ $value | printf "%.0f" }} bitwise-level determinism failures in {{ $labels.component }} in last 24h'
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"