# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
# Task: P0M-006 - Alerting Rules
# P0 Product Metrics Alert Rules
#
# One rule group covering the four P0 product metrics (P0M-001 .. P0M-004).
# Conventions used throughout this file:
#   - Latency-style metrics alert on the P90 computed from histogram buckets,
#     aggregated per tenant; thresholds in `expr` are in seconds.
#   - Each metric has a `warning` rule and a stricter `critical` rule. Both
#     conditions can be true at once; nothing in this file suppresses the
#     warning while the critical alert is firing.
#   - `for: 0m` means the alert fires on the first evaluation in which the
#     expression returns a result (no pending period).
---
groups:
  - name: stella-p0-metrics
    rules:
      # ---------------------------------------------------------------------
      # P0M-001: Time to First Verified Release
      # P90 over a 24h rate window, per tenant.
      # ---------------------------------------------------------------------

      # Warning: P90 above 14400 s (4 hours), sustained for 1h.
      - alert: StellaTimeToFirstReleaseHigh
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (rate(stella_time_to_first_verified_release_seconds_bucket[24h]))
          ) > 14400
        for: 1h
        labels:
          severity: warning
          category: adoption
        annotations:
          summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # Critical: P90 above 86400 s (24 hours), sustained for 1h.
      - alert: StellaTimeToFirstReleaseCritical
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (rate(stella_time_to_first_verified_release_seconds_bucket[24h]))
          ) > 86400
        for: 1h
        labels:
          severity: critical
          category: adoption
        annotations:
          summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # ---------------------------------------------------------------------
      # P0M-002: Why Blocked Latency
      # P90 over a 1h rate window, per tenant.
      # ---------------------------------------------------------------------

      # Warning: P90 above 300 s (5 minutes), sustained for 30m.
      - alert: StellaWhyBlockedLatencyHigh
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (rate(stella_why_blocked_latency_seconds_bucket[1h]))
          ) > 300
        for: 30m
        labels:
          severity: warning
          category: usability
        annotations:
          summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # Critical: P90 above 3600 s (1 hour), sustained for 30m.
      - alert: StellaWhyBlockedLatencyCritical
        expr: |
          histogram_quantile(
            0.90,
            sum by (le, tenant) (rate(stella_why_blocked_latency_seconds_bucket[1h]))
          ) > 3600
        for: 30m
        labels:
          severity: critical
          category: usability
        annotations:
          summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # ---------------------------------------------------------------------
      # P0M-003: Support Burden
      # Compares the raw summed counter value per (tenant, month) against a
      # threshold in minutes; no rate()/increase() is applied.
      # NOTE(review): presumably the exporter labels the counter by calendar
      # month so the sum is a month-to-date total - confirm against the
      # instrumentation.
      # ---------------------------------------------------------------------

      # Warning: more than 30 minutes.
      - alert: StellaSupportBurdenHigh
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 30
        for: 0m
        labels:
          severity: warning
          category: operations
        annotations:
          summary: "Support burden high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # Critical: more than 60 minutes.
      - alert: StellaSupportBurdenCritical
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 60
        for: 0m
        labels:
          severity: critical
          category: operations
        annotations:
          summary: "Support burden critically high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # ---------------------------------------------------------------------
      # P0M-004: Determinism Regressions
      # The metric's own `severity` label selects the regression level
      # (policy / semantic / bitwise). The rule-level `severity` label below
      # overwrites it on the resulting alert, so the regression level is only
      # recoverable from the alert name.
      # These expressions are not aggregated, so one alert is produced per
      # matching series.
      # NOTE(review): increase() needs at least two samples inside the window;
      # a series that first appears already at a non-zero value is not
      # detected unless the exporter pre-initialises the counter to 0 -
      # confirm against the instrumentation.
      # ---------------------------------------------------------------------

      # Critical: any policy-level regression in the last 5 minutes.
      - alert: StellaDeterminismRegression
        expr: |
          increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
        for: 0m
        labels:
          severity: critical
          category: reliability
        annotations:
          summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning: any semantic-level regression in the last hour.
      - alert: StellaDeterminismRegressionSemantic
        expr: |
          increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning: more than 5 bitwise-level regressions in the last 24 hours.
      # increase() extrapolates and typically yields a non-integer, so the
      # count is rounded for display in the description.
      - alert: StellaDeterminismRegressionBitwise
        expr: |
          increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
          description: '{{ printf "%.0f" $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h'
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"