# git.stella-ops.org/devops/telemetry/alerts/stella-p0-alerts.yml

# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
# Task: P0M-006 - Alerting Rules
# P0 Product Metrics Alert Rules

# Single rule group containing every P0 product-metric alert in this file.
# No group-level `interval` is set here, so the evaluation interval comes from
# the Prometheus configuration that loads this file.
groups:
  - name: stella-p0-metrics
    rules:
      # P0M-001: Time to First Verified Release
      # Warning tier. Estimates the per-tenant P90 of
      # stella_time_to_first_verified_release_seconds from bucket rates over a
      # 24h window and alerts once it has stayed above 14400 s (4 hours) for 1h.
      # The aggregation keeps only `le` and `tenant`; `le` is consumed by
      # histogram_quantile, so `tenant` is the only series label left for the
      # templates below.
      - alert: StellaTimeToFirstReleaseHigh
        expr: |
          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
        for: 1h
        labels:
          severity: warning
          category: adoption
        annotations:
          summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
          # $value is the P90 in seconds; humanizeDuration renders it as a duration.
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # Critical tier. Same P90 expression as StellaTimeToFirstReleaseHigh with
      # the threshold raised to 86400 s (24 hours); the condition must hold for
      # 1h. Any value above 86400 also exceeds the warning threshold, so both
      # alerts are active together.
      # NOTE(review): confirm Alertmanager inhibits the warning when the
      # critical alert fires, if duplicate notifications are unwanted.
      - alert: StellaTimeToFirstReleaseCritical
        expr: |
          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
        for: 1h
        labels:
          severity: critical
          category: adoption
        annotations:
          summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
          # $value is the P90 in seconds; humanizeDuration renders it as a duration.
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # P0M-002: Why Blocked Latency
      # Warning tier. Estimates the per-tenant P90 of
      # stella_why_blocked_latency_seconds from bucket rates over a 1h window
      # and alerts once it has stayed above 300 s (5 minutes) for 30m.
      # Only the `tenant` label survives the aggregation and is available to
      # the annotation templates.
      - alert: StellaWhyBlockedLatencyHigh
        expr: |
          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
        for: 30m
        labels:
          severity: warning
          category: usability
        annotations:
          summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
          # $value is the P90 in seconds; humanizeDuration renders it as a duration.
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # Critical tier. Same P90 expression as StellaWhyBlockedLatencyHigh with
      # the threshold raised to 3600 s (1 hour); the condition must hold for
      # 30m. Any value above 3600 also exceeds the warning threshold, so both
      # alerts are active together.
      # NOTE(review): confirm Alertmanager inhibits the warning when the
      # critical alert fires, if duplicate notifications are unwanted.
      - alert: StellaWhyBlockedLatencyCritical
        expr: |
          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
        for: 30m
        labels:
          severity: critical
          category: usability
        annotations:
          summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
          # $value is the P90 in seconds; humanizeDuration renders it as a duration.
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # P0M-003: Support Burden
      # Warning tier. Sums stella_support_burden_minutes_total per
      # (tenant, month) and fires as soon as the sum exceeds 30 (for: 0m means
      # no pending period).
      # NOTE(review): the raw counter value is compared, not a rate()/increase().
      # Presumably the `month` label scopes each series to one calendar month so
      # the value is that month's cumulative total - confirm against the
      # instrumentation. Also confirm how process restarts (counter resets) and
      # series for past months are expected to behave; as written the alert
      # stays active for as long as a matching series is above the threshold.
      - alert: StellaSupportBurdenHigh
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 30
        for: 0m
        labels:
          severity: warning
          category: operations
        annotations:
          summary: "Support burden high for tenant {{ $labels.tenant }}"
          # `tenant` and `month` are the two labels kept by the aggregation.
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # Critical tier. Same per-(tenant, month) sum as StellaSupportBurdenHigh
      # with the threshold raised to 60; fires immediately (for: 0m). Any value
      # above 60 also exceeds the warning threshold, so both alerts are active
      # together.
      # NOTE(review): the raw counter value is compared, not a rate()/increase().
      # Presumably the `month` label scopes each series to one calendar month -
      # confirm against the instrumentation, including behaviour across process
      # restarts and for series belonging to past months.
      - alert: StellaSupportBurdenCritical
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 60
        for: 0m
        labels:
          severity: critical
          category: operations
        annotations:
          summary: "Support burden critically high for tenant {{ $labels.tenant }}"
          # `tenant` and `month` are the two labels kept by the aggregation.
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # P0M-004: Determinism Regressions
      # Critical alert for policy-level regressions. Fires immediately
      # (for: 0m) for any series of stella_determinism_regressions_total with
      # severity="policy" whose counter increased within the last 5m. There is
      # no aggregation, so the alert is raised per matching series and keeps
      # that series' labels.
      # NOTE(review): increase() needs at least two samples inside the window,
      # so a series that first appears already incremented produces no increase
      # - confirm the counter is pre-initialised to 0 for each label set.
      - alert: StellaDeterminismRegression
        expr: |
          increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
        for: 0m
        labels:
          # Overwrites the metric's own `severity` label ("policy") on the
          # resulting alert; rule labels win over conflicting series labels.
          severity: critical
          category: reliability
        annotations:
          # Assumes the metric carries `tenant` and `component` labels - TODO confirm.
          summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning alert for semantic-level regressions. Fires immediately
      # (for: 0m) for any series of stella_determinism_regressions_total with
      # severity="semantic" whose counter increased within the last 1h. There
      # is no aggregation, so the alert is raised per matching series and keeps
      # that series' labels.
      # NOTE(review): increase() needs at least two samples inside the window,
      # so a series that first appears already incremented produces no increase
      # - confirm the counter is pre-initialised to 0 for each label set.
      - alert: StellaDeterminismRegressionSemantic
        expr: |
          increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
        for: 0m
        labels:
          # Overwrites the metric's own `severity` label ("semantic") on the
          # resulting alert; rule labels win over conflicting series labels.
          severity: warning
          category: reliability
        annotations:
          # Assumes the metric carries `tenant` and `component` labels - TODO confirm.
          summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning alert for repeated bitwise-level regressions. Fires immediately
      # (for: 0m) for any series of stella_determinism_regressions_total with
      # severity="bitwise" whose counter increase over the last 24h exceeds 5.
      # There is no aggregation, so the threshold applies per matching series
      # and the alert keeps that series' labels.
      # NOTE(review): increase() needs at least two samples inside the window,
      # so a series that first appears already incremented contributes nothing
      # for that first step - confirm the counter is pre-initialised to 0.
      - alert: StellaDeterminismRegressionBitwise
        expr: |
          increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
        for: 0m
        labels:
          # Overwrites the metric's own `severity` label ("bitwise") on the
          # resulting alert; rule labels win over conflicting series labels.
          severity: warning
          category: reliability
        annotations:
          # Assumes the metric carries `tenant` and `component` labels - TODO confirm.
          summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
          # increase() extrapolates to the edges of the 24h window, so $value is
          # normally fractional; round it so the message reads as a failure count.
          description: '{{ printf "%.0f" $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h'
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"