Some checks failed
Lighthouse CI / Lighthouse Audit (push) Waiting to run
Lighthouse CI / Axe Accessibility Audit (push) Waiting to run
Manifest Integrity / Validate Schema Integrity (push) Waiting to run
Manifest Integrity / Validate Contract Documents (push) Waiting to run
Manifest Integrity / Validate Pack Fixtures (push) Waiting to run
Manifest Integrity / Audit SHA256SUMS Files (push) Waiting to run
Manifest Integrity / Verify Merkle Roots (push) Waiting to run
Policy Lint & Smoke / policy-lint (push) Waiting to run
Policy Simulation / policy-simulate (push) Waiting to run
Docs CI / lint-and-preview (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Findings Ledger CI / build-test (push) Has been cancelled
Findings Ledger CI / migration-validation (push) Has been cancelled
Findings Ledger CI / generate-manifest (push) Has been cancelled
- Implemented tests for Cryptographic Failures (A02) to ensure proper handling of sensitive data, secure algorithms, and key management. - Added tests for Security Misconfiguration (A05) to validate production configurations, security headers, CORS settings, and feature management. - Developed tests for Authentication Failures (A07) to enforce strong password policies, rate limiting, session management, and MFA support. - Created tests for Software and Data Integrity Failures (A08) to verify artifact signatures, SBOM integrity, attestation chains, and feed updates.
160 lines
6.0 KiB
YAML
160 lines
6.0 KiB
YAML
# TTFS (Time to First Signal) Alert Rules
|
|
# Reference: SPRINT_0341_0001_0001 Task T10
|
|
# These alerts monitor SLOs for the TTFS experience
|
|
|
|
groups:
# Group ttfs-slo: latency, cache, error-rate and breach-count alerts for the
# TTFS SLOs. The group is evaluated every 30 seconds (`interval`).
- name: ttfs-slo
  interval: 30s
  rules:
  # Primary SLO: P95 latency must be under 5 seconds.
  # The quantile is computed per `surface`, so each surface alerts on its own
  # and `$labels.surface` is available to the annotations.
  - alert: TtfsP95High
    expr: |
      histogram_quantile(0.95, sum(rate(ttfs_latency_seconds_bucket[5m])) by (le, surface)) > 5
    for: 5m
    labels:
      severity: page
      component: ttfs
      slo: ttfs-latency
    annotations:
      summary: "TTFS P95 latency exceeds 5s for {{ $labels.surface }}"
      description: "Time to First Signal P95 is {{ $value | humanizeDuration }} for surface {{ $labels.surface }}. This breaches the TTFS SLO."
      runbook: "docs/runbooks/ttfs-latency-high.md"
      dashboard: "https://grafana.stellaops.local/d/ttfs-overview"

  # Cache performance: hit rate should be above 70%.
  # Ratio of cache hits to all signals over 5 minutes, summed across all series.
  - alert: TtfsCacheHitRateLow
    expr: |
      sum(rate(ttfs_cache_hit_total[5m])) / sum(rate(ttfs_signal_total[5m])) < 0.7
    for: 10m
    labels:
      severity: warning
      component: ttfs
    annotations:
      summary: "TTFS cache hit rate below 70%"
      description: "Cache hit rate is {{ $value | humanizePercentage }}. Low cache hit rates increase TTFS latency."
      runbook: "docs/runbooks/ttfs-cache-performance.md"

  # Error rate: should be under 1%.
  # Ratio of errors to all signals over 5 minutes, summed across all series.
  - alert: TtfsErrorRateHigh
    expr: |
      sum(rate(ttfs_error_total[5m])) / sum(rate(ttfs_signal_total[5m])) > 0.01
    for: 5m
    labels:
      severity: warning
      component: ttfs
    annotations:
      summary: "TTFS error rate exceeds 1%"
      description: "Error rate is {{ $value | humanizePercentage }}. Check logs for FirstSignalService errors."
      runbook: "docs/runbooks/ttfs-error-investigation.md"

  # SLO breach counter: too many breaches in a short window.
  # increase() extrapolates over the range, so $value is usually not a whole
  # number; it is rounded with printf so the description reads as a count.
  # The description is single-quoted because the template contains double
  # quotes.
  - alert: TtfsSloBreach
    expr: |
      sum(increase(ttfs_slo_breach_total[5m])) > 10
    for: 1m
    labels:
      severity: page
      component: ttfs
      slo: ttfs-breach-rate
    annotations:
      summary: "TTFS SLO breach rate high"
      description: '{{ $value | printf "%.0f" }} SLO breaches in last 5 minutes. Immediate investigation required.'
      runbook: "docs/runbooks/ttfs-slo-breach.md"

  # Endpoint latency: the HTTP endpoint should respond within 500ms (P95).
  # The route matcher is a regex; `.*` stands in for the run segment of the
  # path (presumably the run id - confirm against the emitted `route` label).
  - alert: FirstSignalEndpointLatencyHigh
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{route=~"/api/v1/orchestrator/runs/.*/first-signal"}[5m])) by (le)) > 0.5
    for: 5m
    labels:
      severity: warning
      component: ttfs
    annotations:
      summary: "First signal endpoint P95 latency > 500ms"
      description: "The /first-signal API endpoint P95 is {{ $value | humanizeDuration }}. This is the API-level latency only."
      runbook: "docs/runbooks/first-signal-api-slow.md"

# Group ttfs-availability: liveness of the First Signal provider and of the
# signal stream itself. The group is evaluated every minute (`interval`).
- name: ttfs-availability
  interval: 1m
  rules:
  # Availability: the first signal endpoint should be available.
  # Matches every scrape target of job "orchestrator" whose `up` sample is 0.
  - alert: FirstSignalEndpointDown
    expr: |
      up{job="orchestrator"} == 0
    for: 2m
    labels:
      severity: critical
      component: ttfs
    annotations:
      summary: "Orchestrator (First Signal provider) is down"
      description: "The Orchestrator service is not responding. First Signal functionality is unavailable."
      runbook: "docs/runbooks/orchestrator-down.md"

  # No signals being generated.
  # `or vector(0)` supplies a 0 sample when ttfs_signal_total has no series at
  # all; without it the expression returns an empty result in that case and the
  # "metric collection issue" scenario named in the description would never
  # fire.
  - alert: TtfsNoSignals
    expr: |
      (sum(rate(ttfs_signal_total[10m])) or vector(0)) == 0
    for: 15m
    labels:
      severity: warning
      component: ttfs
    annotations:
      summary: "No TTFS signals generated in 15 minutes"
      description: "No First Signal events have been recorded. This could indicate no active runs or a metric collection issue."

# Group ttfs-ux: user-experience indicators for the First Signal page.
# The group is evaluated every minute (`interval`).
- name: ttfs-ux
  interval: 1m
  rules:
  # UX: a high bounce rate indicates a poor experience.
  # Ratio of bounces to page views over 5 minutes, summed across all series.
  # NOTE(review): the 10-second bounce window quoted in the description is
  # defined by whatever increments ttfs_bounce_total - confirm it still holds.
  - alert: TtfsBounceRateHigh
    expr: |
      sum(rate(ttfs_bounce_total[5m])) / sum(rate(ttfs_page_view_total[5m])) > 0.5
    for: 30m
    labels:
      severity: warning
      component: ttfs
      area: ux
    annotations:
      summary: "TTFS page bounce rate exceeds 50%"
      description: "More than 50% of users are leaving the run page within 10 seconds. This may indicate poor First Signal experience."

  # UX: long open-to-action time.
  # The expression checks the 75th percentile, i.e. the slowest quarter of
  # observations is above 30s; the summary states exactly that rather than
  # claiming 75% of users are slow.
  - alert: TtfsOpenToActionSlow
    expr: |
      histogram_quantile(0.75, sum(rate(ttfs_open_to_action_seconds_bucket[15m])) by (le)) > 30
    for: 1h
    labels:
      severity: info
      component: ttfs
      area: ux
    annotations:
      summary: "P75 time to first action exceeds 30s (at least 25% of users take >30s)"
      description: "Users are taking a long time to act on First Signal. Consider UX improvements."

# Group ttfs-failure-signatures: trends in detected failure signatures.
# The group is evaluated every 30 seconds (`interval`).
- name: ttfs-failure-signatures
  interval: 30s
  rules:
  # New failure pattern emerging.
  # Fires when more than one new signature per second is recorded (5m rate).
  # $value is a per-second float, so it is passed through `humanize` to keep
  # the description readable.
  - alert: TtfsNewFailurePatternHigh
    expr: |
      sum(rate(ttfs_failure_signature_new_total[5m])) > 1
    for: 10m
    labels:
      severity: warning
      component: ttfs
    annotations:
      summary: "High rate of new failure signatures"
      description: "New failure patterns are being detected at {{ $value | humanize }}/s. This may indicate a new class of errors."

  # Failure signature confidence upgrades.
  # increase() extrapolates over the range, so $value is usually not a whole
  # number; it is rounded with printf so the description reads as a count.
  # The description is single-quoted because the template contains double
  # quotes.
  - alert: TtfsFailureSignatureConfidenceUpgrade
    expr: |
      sum(increase(ttfs_failure_signature_confidence_upgrade_total[1h])) > 5
    for: 5m
    labels:
      severity: info
      component: ttfs
    annotations:
      summary: "Multiple failure signatures upgraded to high confidence"
      description: '{{ $value | printf "%.0f" }} failure signatures have been upgraded to high confidence in the last hour.'