---
# TTFS (Time to First Signal) Alert Rules
# Reference: SPRINT_0341_0001_0001 Task T10
# These alerts monitor SLOs for the TTFS experience
#
# Layout: four rule groups, each with its own evaluation interval:
#   ttfs-slo                 latency, cache, error and breach-rate rules (30s)
#   ttfs-availability        scrape-target and signal-presence rules (1m)
#   ttfs-ux                  bounce-rate and open-to-action rules (1m)
#   ttfs-failure-signatures  failure-signature trend rules (30s)

groups:
  # Latency / cache / error / breach-rate rules, evaluated every 30 seconds.
  - name: ttfs-slo
    interval: 30s
    rules:
      # Primary SLO: P95 latency must be under 5 seconds.
      # Aggregated by (le, surface), so one alert instance fires per surface.
      - alert: TtfsP95High
        expr: |
          histogram_quantile(0.95, sum(rate(ttfs_latency_seconds_bucket[5m])) by (le, surface)) > 5
        for: 5m
        labels:
          severity: page
          component: ttfs
          slo: ttfs-latency
        annotations:
          summary: "TTFS P95 latency exceeds 5s for {{ $labels.surface }}"
          description: "Time to First Signal P95 is {{ $value | humanizeDuration }} for surface {{ $labels.surface }}. This breaches the TTFS SLO."
          runbook: "docs/runbooks/ttfs-latency-high.md"
          dashboard: "https://grafana.stellaops.local/d/ttfs-overview"

      # Cache performance: Hit rate should be above 70%.
      # Ratio of cache hits to all signals over a 5-minute window.
      - alert: TtfsCacheHitRateLow
        expr: |
          sum(rate(ttfs_cache_hit_total[5m])) / sum(rate(ttfs_signal_total[5m])) < 0.7
        for: 10m
        labels:
          severity: warning
          component: ttfs
        annotations:
          summary: "TTFS cache hit rate below 70%"
          description: "Cache hit rate is {{ $value | humanizePercentage }}. Low cache hit rates increase TTFS latency."
          runbook: "docs/runbooks/ttfs-cache-performance.md"

      # Error rate: Should be under 1%.
      # Ratio of errors to all signals over a 5-minute window.
      - alert: TtfsErrorRateHigh
        expr: |
          sum(rate(ttfs_error_total[5m])) / sum(rate(ttfs_signal_total[5m])) > 0.01
        for: 5m
        labels:
          severity: warning
          component: ttfs
        annotations:
          summary: "TTFS error rate exceeds 1%"
          description: "Error rate is {{ $value | humanizePercentage }}. Check logs for FirstSignalService errors."
          runbook: "docs/runbooks/ttfs-error-investigation.md"

      # SLO breach counter: Too many breaches in a short window
      # (more than 10 over the last 5 minutes, sustained for 1 minute).
      - alert: TtfsSloBreach
        expr: |
          sum(increase(ttfs_slo_breach_total[5m])) > 10
        for: 1m
        labels:
          severity: page
          component: ttfs
          slo: ttfs-breach-rate
        annotations:
          summary: "TTFS SLO breach rate high"
          description: "{{ $value }} SLO breaches in last 5 minutes. Immediate investigation required."
          runbook: "docs/runbooks/ttfs-slo-breach.md"

      # Endpoint latency: HTTP endpoint should respond within 500ms (P95).
      # The route matcher selects the first-signal route for any run.
      - alert: FirstSignalEndpointLatencyHigh
        expr: |
          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{route=~"/api/v1/orchestrator/runs/.*/first-signal"}[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: warning
          component: ttfs
        annotations:
          summary: "First signal endpoint P95 latency > 500ms"
          description: "The /first-signal API endpoint P95 is {{ $value | humanizeDuration }}. This is the API-level latency only."
          runbook: "docs/runbooks/first-signal-api-slow.md"

  # Availability rules, evaluated every minute.
  - name: ttfs-availability
    interval: 1m
    rules:
      # Availability: First signal endpoint should be available.
      # Fires when the "orchestrator" scrape job reports up == 0 for 2 minutes.
      # NOTE(review): the other paging alerts in this file use severity "page";
      # confirm that "critical" is routed as intended.
      - alert: FirstSignalEndpointDown
        expr: |
          up{job="orchestrator"} == 0
        for: 2m
        labels:
          severity: critical
          component: ttfs
        annotations:
          summary: "Orchestrator (First Signal provider) is down"
          description: "The Orchestrator service is not responding. First Signal functionality is unavailable."
          runbook: "docs/runbooks/orchestrator-down.md"

      # No signals being generated.
      # rate() returns an empty result when the series does not exist, so the
      # "== 0" comparison alone can never fire for a missing metric. absent()
      # covers that case, which the description calls out as a possible cause.
      - alert: TtfsNoSignals
        expr: |
          (sum(rate(ttfs_signal_total[10m])) == 0) or absent(ttfs_signal_total)
        for: 15m
        labels:
          severity: warning
          component: ttfs
        annotations:
          summary: "No TTFS signals generated in 15 minutes"
          description: "No First Signal events have been recorded. This could indicate no active runs or a metric collection issue."

  # User-experience rules, evaluated every minute.
  - name: ttfs-ux
    interval: 1m
    rules:
      # UX: High bounce rate indicates poor experience.
      # Ratio of bounces to page views over 5 minutes, sustained for 30 minutes.
      - alert: TtfsBounceRateHigh
        expr: |
          sum(rate(ttfs_bounce_total[5m])) / sum(rate(ttfs_page_view_total[5m])) > 0.5
        for: 30m
        labels:
          severity: warning
          component: ttfs
          area: ux
        annotations:
          summary: "TTFS page bounce rate exceeds 50%"
          description: "More than 50% of users are leaving the run page within 10 seconds. This may indicate poor First Signal experience."

      # UX: Long open-to-action time.
      # 75th percentile of open-to-action time above 30 seconds for 1 hour.
      - alert: TtfsOpenToActionSlow
        expr: |
          histogram_quantile(0.75, sum(rate(ttfs_open_to_action_seconds_bucket[15m])) by (le)) > 30
        for: 1h
        labels:
          severity: info
          component: ttfs
          area: ux
        annotations:
          summary: "75% of users take >30s to first action"
          description: "Users are taking a long time to act on First Signal. Consider UX improvements."

  # Failure-signature trend rules, evaluated every 30 seconds.
  - name: ttfs-failure-signatures
    interval: 30s
    rules:
      # New failure pattern emerging (more than 1 new signature per second).
      - alert: TtfsNewFailurePatternHigh
        expr: |
          sum(rate(ttfs_failure_signature_new_total[5m])) > 1
        for: 10m
        labels:
          severity: warning
          component: ttfs
        annotations:
          summary: "High rate of new failure signatures"
          description: "New failure patterns are being detected at {{ $value }}/s. This may indicate a new class of errors."

      # Failure signature confidence upgrades (more than 5 in the last hour).
      - alert: TtfsFailureSignatureConfidenceUpgrade
        expr: |
          sum(increase(ttfs_failure_signature_confidence_upgrade_total[1h])) > 5
        for: 5m
        labels:
          severity: info
          component: ttfs
        annotations:
          summary: "Multiple failure signatures upgraded to high confidence"
          description: "{{ $value }} failure signatures have been upgraded to high confidence in the last hour."