# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
# Task: P0M-006 - Alerting Rules
# P0 Product Metrics Alert Rules
#
# Prometheus alerting rules for the P0 product metrics. Each metric has a
# warning-level rule and one or more stricter/critical variants that share
# the same runbook. Thresholds are written in the metric's own unit (seconds
# for the *_seconds histograms, minutes for the support-burden counter).

groups:
  - name: stella-p0-metrics
    rules:
      # P0M-001: Time to First Verified Release
      # P90 per tenant, computed from bucket rates over a 24h window.
      # 14400 s = 4 hours; the condition must hold for 1h before firing.
      - alert: StellaTimeToFirstReleaseHigh
        expr: |
          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
        for: 1h
        labels:
          severity: warning
          category: adoption
        annotations:
          summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # Same expression as the warning rule with an 86400 s (24 hour) threshold.
      # The warning rule has no upper bound, so both fire above 24 hours.
      - alert: StellaTimeToFirstReleaseCritical
        expr: |
          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
        for: 1h
        labels:
          severity: critical
          category: adoption
        annotations:
          summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # P0M-002: Why Blocked Latency
      # P90 per tenant over a 1h rate window; 300 s = 5 minutes.
      # The condition must hold for 30m before firing.
      - alert: StellaWhyBlockedLatencyHigh
        expr: |
          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
        for: 30m
        labels:
          severity: warning
          category: usability
        annotations:
          summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # Same expression as the warning rule with a 3600 s (1 hour) threshold.
      - alert: StellaWhyBlockedLatencyCritical
        expr: |
          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
        for: 30m
        labels:
          severity: critical
          category: usability
        annotations:
          summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # P0M-003: Support Burden
      # Compares the summed counter value per (tenant, month) against the
      # threshold directly (no rate/increase). `for: 0m` means the alert fires
      # on the first evaluation in which the sum exceeds 30.
      - alert: StellaSupportBurdenHigh
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 30
        for: 0m
        labels:
          severity: warning
          category: operations
        annotations:
          summary: "Support burden high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # Same expression as the warning rule with a threshold of 60.
      - alert: StellaSupportBurdenCritical
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 60
        for: 0m
        labels:
          severity: critical
          category: operations
        annotations:
          summary: "Support burden critically high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # P0M-004: Determinism Regressions
      # The three rules below select one value of the metric's `severity`
      # label each (policy / semantic / bitwise). Note that the rule-level
      # `severity` label replaces that series label on the resulting alert.
      #
      # Policy level: any increase within the last 5m fires immediately.
      - alert: StellaDeterminismRegression
        expr: |
          increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
        for: 0m
        labels:
          severity: critical
          category: reliability
        annotations:
          summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Semantic level: any increase within the last 1h fires immediately.
      - alert: StellaDeterminismRegressionSemantic
        expr: |
          increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Bitwise level: tolerated up to 5 within 24h; fires once the increase
      # over the window exceeds 5.
      - alert: StellaDeterminismRegressionBitwise
        expr: |
          increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
          description: "{{ $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"