devops folders consolidate
This commit is contained in:
36
deploy/telemetry/alerts/alerts-slo.yaml
Normal file
36
deploy/telemetry/alerts/alerts-slo.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
# SLO alert rules: error-budget burn (fast and slow windows) plus SLO webhook
# delivery failures. The burn thresholds are multiples of the 1% error budget
# implied by the 99% target, written as N * (1 - 0.99).
groups:
  - name: slo-burn
    rules:
      # Fast window: 5m error ratio above 4x the error budget, sustained 5m.
      - alert: SLOBurnRateFast
        expr: |
          (rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
          4 * (1 - 0.99)
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: "Fast burn: 99% SLO breached"
          description: "Error budget burn (5m) exceeds fast threshold."

      # Slow window: 1h error ratio above 1x the error budget, sustained 1h.
      - alert: SLOBurnRateSlow
        expr: |
          (rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
          1 * (1 - 0.99)
        for: 1h
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "Slow burn: 99% SLO at risk"
          description: "Error budget burn (1h) exceeds slow threshold."

  - name: slo-webhook
    rules:
      # Any non-zero webhook failure rate (5m window) that persists for 10m.
      - alert: SLOWebhookFailures
        expr: rate(slo_webhook_failures_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "SLO webhook failures"
          description: "Webhook emitter has failures in last 5m."
|
||||
164
deploy/telemetry/alerts/export-center-alerts.yaml
Normal file
164
deploy/telemetry/alerts/export-center-alerts.yaml
Normal file
@@ -0,0 +1,164 @@
|
||||
# ExportCenter Alert Rules
|
||||
# SLO Burn-rate alerts for export service reliability
|
||||
|
||||
groups:
|
||||
- name: export-center-slo
|
||||
interval: 30s
|
||||
rules:
|
||||
# SLO: 99.5% success rate target
|
||||
# Error budget: 0.5% (432 errors per day at 86400 requests/day)
|
||||
|
||||
# Fast burn - 2% budget consumption in 1 hour (critical)
|
||||
- alert: ExportCenterHighErrorBurnRate
  # Failed/total export ratio over 1h, compared against 14.4x the 0.5%
  # error budget (0.005); must hold for 2m before firing.
  expr: |
    (
      sum(rate(export_runs_failed_total[1h]))
      /
      sum(rate(export_runs_total[1h]))
    ) > (14.4 * 0.005)
  for: 2m
  labels:
    severity: critical
    service: export-center
    slo: availability
  annotations:
    summary: "ExportCenter high error burn rate"
    description: "Error rate is {{ $value | humanizePercentage }} over the last hour, consuming error budget at 14.4x the sustainable rate."
    runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-error-rate"
|
||||
|
||||
# Slow burn - 5% budget consumption in 6 hours (warning)
|
||||
- alert: ExportCenterElevatedErrorBurnRate
  # Failed/total export ratio over 6h, compared against 6x the 0.5%
  # error budget (0.005); must hold for 5m before firing.
  expr: |
    (
      sum(rate(export_runs_failed_total[6h]))
      /
      sum(rate(export_runs_total[6h]))
    ) > (6 * 0.005)
  for: 5m
  labels:
    severity: warning
    service: export-center
    slo: availability
  annotations:
    summary: "ExportCenter elevated error burn rate"
    description: "Error rate is {{ $value | humanizePercentage }} over the last 6 hours, consuming error budget at 6x the sustainable rate."
    runbook_url: "https://docs.stellaops.io/runbooks/export-center/elevated-error-rate"
|
||||
|
||||
# Export duration alerts derived from the export_run_duration_seconds
# histogram, evaluated every 30s.
- name: export-center-latency
  interval: 30s
  rules:
    # SLO: 95% of exports complete within 120s
    # Warning when the 5m p95 stays above 120s for 5m.
    - alert: ExportCenterHighLatency
      expr: |
        histogram_quantile(0.95,
          sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
        ) > 120
      for: 5m
      labels:
        severity: warning
        service: export-center
        slo: latency
      annotations:
        summary: "ExportCenter high latency"
        description: "95th percentile export duration is {{ $value | humanizeDuration }}, exceeding 120s SLO target."
        runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-latency"

    # Critical when the 5m p99 stays above 300s (5 minutes) for 2m.
    - alert: ExportCenterCriticalLatency
      expr: |
        histogram_quantile(0.99,
          sum(rate(export_run_duration_seconds_bucket[5m])) by (le)
        ) > 300
      for: 2m
      labels:
        severity: critical
        service: export-center
        slo: latency
      annotations:
        summary: "ExportCenter critical latency"
        description: "99th percentile export duration is {{ $value | humanizeDuration }}, indicating severe performance degradation."
        runbook_url: "https://docs.stellaops.io/runbooks/export-center/critical-latency"
|
||||
|
||||
# Capacity alerts for the export service, evaluated every 60s.
- name: export-center-capacity
  interval: 60s
  rules:
    # More than 50 exports in progress (summed over all series) for 5m.
    - alert: ExportCenterHighConcurrency
      expr: sum(export_runs_in_progress) > 50
      for: 5m
      labels:
        severity: warning
        service: export-center
      annotations:
        summary: "ExportCenter high concurrency"
        description: "{{ $value }} exports currently in progress. Consider scaling or investigating slow exports."
        runbook_url: "https://docs.stellaops.io/runbooks/export-center/high-concurrency"

    # p99 duration (1h window) of runs whose status is not "completed"
    # above 1800s (30 minutes) for 10m.
    # NOTE(review): this only sees runs that have already been observed into
    # the histogram; confirm that in-flight exports are recorded there,
    # otherwise truly stuck runs are invisible to this rule.
    - alert: ExportCenterStuckExports
      expr: |
        histogram_quantile(0.99,
          sum(rate(export_run_duration_seconds_bucket{status!="completed"}[1h])) by (le)
        ) > 1800
      for: 10m
      labels:
        severity: warning
        service: export-center
      annotations:
        summary: "ExportCenter potentially stuck exports"
        description: "Some exports may be stuck - 99th percentile duration for incomplete exports exceeds 30 minutes."
        runbook_url: "https://docs.stellaops.io/runbooks/export-center/stuck-exports"
|
||||
|
||||
- name: export-center-errors
|
||||
interval: 30s
|
||||
rules:
|
||||
# Specific error code spike
|
||||
- alert: ExportCenterErrorCodeSpike
  # One alert instance per error_code whose 5m failure rate exceeds
  # 0.1 failures/s for 5m.
  expr: |
    sum by (error_code) (
      rate(export_runs_failed_total[5m])
    ) > 0.1
  for: 5m
  labels:
    severity: warning
    service: export-center
  annotations:
    summary: "ExportCenter error code spike: {{ $labels.error_code }}"
    description: "Error code {{ $labels.error_code }} is occurring at {{ $value | humanize }}/s rate."
    runbook_url: "https://docs.stellaops.io/runbooks/export-center/error-codes"
|
||||
|
||||
# No successful exports in 15 minutes (when there is traffic)
|
||||
- alert: ExportCenterNoSuccessfulExports
  # Fires when exports are being attempted (total rate > 0 over 15m) but
  # the success rate over the same window is zero, sustained for 10m.
  #
  # `or vector(0)` covers the case where no export_runs_success_total series
  # exists at all (e.g. nothing has ever succeeded since start-up). Without
  # it, sum() returns an empty vector, the `== 0` comparison yields nothing,
  # and the alert cannot fire in exactly the situation it is meant to catch.
  expr: |
    (
      sum(rate(export_runs_total[15m])) > 0
    )
    and
    (
      (sum(rate(export_runs_success_total[15m])) or vector(0)) == 0
    )
  for: 10m
  labels:
    severity: critical
    service: export-center
  annotations:
    summary: "ExportCenter no successful exports"
    description: "No exports have completed successfully in the last 15 minutes despite ongoing attempts."
    runbook_url: "https://docs.stellaops.io/runbooks/export-center/no-successful-exports"
|
||||
|
||||
# Informational tracking of deprecated endpoint usage, evaluated every 5m.
- name: export-center-deprecation
  interval: 5m
  rules:
    # Any access to deprecated endpoints (1h rate > 0) persisting for 1h.
    - alert: ExportCenterDeprecatedEndpointUsage
      expr: |
        sum(rate(export_center_deprecated_endpoint_access_total[1h])) > 0
      for: 1h
      labels:
        severity: info
        service: export-center
      annotations:
        summary: "Deprecated export endpoints still in use"
        description: "Legacy /exports endpoints are still being accessed at {{ $value | humanize }}/s. Migration to v1 API recommended."
        runbook_url: "https://docs.stellaops.io/api/export-center/migration"
|
||||
52
deploy/telemetry/alerts/policy-alerts.yaml
Normal file
52
deploy/telemetry/alerts/policy-alerts.yaml
Normal file
@@ -0,0 +1,52 @@
|
||||
groups:
|
||||
- name: policy-pipeline
|
||||
rules:
|
||||
- alert: PolicyCompileLatencyP99High
  # p99 of policy_compile_duration_seconds (5m rate, all series merged by
  # bucket) above 5s, sustained for 10m.
  expr: histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le)) > 5
  for: 10m
  labels:
    severity: warning
    service: policy
  annotations:
    summary: "Policy compile latency elevated (p99)"
    description: "p99 compile duration has been >5s for 10m"
|
||||
|
||||
- alert: PolicySimulationQueueBacklog
  # Total simulation queue depth (summed over all series) above 100 for 10m.
  expr: sum(policy_simulation_queue_depth) > 100
  for: 10m
  labels:
    severity: warning
    service: policy
  annotations:
    summary: "Policy simulation backlog"
    description: "Simulation queue depth above 100 for 10m"
|
||||
|
||||
- alert: PolicyApprovalLatencyHigh
  # p95 of policy_approval_latency_seconds (5m rate, merged by bucket)
  # above 30s, sustained for 15m.
  expr: histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le)) > 30
  for: 15m
  labels:
    severity: critical
    service: policy
  annotations:
    summary: "Policy approval latency high"
    description: "p95 approval latency above 30s for 15m"
|
||||
|
||||
- alert: PolicyPromotionFailureRate
  # Share of promotions that failed over the last 15m, alerting above 20%
  # when sustained for 10m.
  #
  # Both sides are aggregated with sum() so the division is a true
  # failures / all-outcomes ratio. Without aggregation, one-to-one label
  # matching pairs the outcome="failure" series only with itself, so the
  # other outcomes never reach the denominator.
  #
  # The denominator is deliberately not clamped: clamp_min(..., 1) would
  # floor it at 1 promotion/s and turn the check into an absolute failure
  # rate. With no promotions the result is 0/0 = NaN, which never satisfies
  # `> 0.2`, so idle periods do not alert.
  expr: |
    (
      sum(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]))
      /
      sum(rate(policy_promotion_outcomes_total[15m]))
    ) > 0.2
  for: 10m
  labels:
    severity: critical
    service: policy
  annotations:
    summary: "Policy promotion failure rate elevated"
    description: "Failures exceed 20% of promotions over 15m"
|
||||
|
||||
- alert: PolicyPromotionStall
  # No successful promotions over 10m while the simulation queue is
  # non-empty, sustained for 10m.
  #
  # The success rate is aggregated with sum() so both operands of `and`
  # carry an empty label set. The un-aggregated rate keeps labels such as
  # outcome="success", which never match the label-less sum() on the right,
  # so the rule could never fire.
  # `or vector(0)` treats a missing success series (nothing has ever
  # succeeded) as a zero rate rather than as "no data".
  expr: |
    (sum(rate(policy_promotion_outcomes_total{outcome="success"}[10m])) or vector(0)) == 0
    and
    sum(policy_simulation_queue_depth) > 0
  for: 10m
  labels:
    severity: warning
    service: policy
  annotations:
    summary: "Policy promotion stalled"
    description: "No successful promotions while work is queued"
|
||||
42
deploy/telemetry/alerts/scanner-fn-drift-alerts.yaml
Normal file
42
deploy/telemetry/alerts/scanner-fn-drift-alerts.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
# Scanner FN-Drift Alert Rules
|
||||
# SLO alerts for false-negative drift thresholds (30-day rolling window)
|
||||
|
||||
groups:
|
||||
- name: scanner-fn-drift
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: ScannerFnDriftWarning
  # scanner_fn_drift_percent above 1.0 for 5m.
  expr: scanner_fn_drift_percent > 1.0
  for: 5m
  labels:
    severity: warning
    service: scanner
    slo: fn-drift
  annotations:
    summary: "Scanner FN-Drift rate above warning threshold"
    # The metric is already expressed in percent (threshold 1.0 == 1.0%), so
    # it is printed as-is. humanizePercentage expects a 0-1 ratio and would
    # multiply by 100, rendering 1.5 as "150%".
    description: 'FN-Drift is {{ printf "%.2f" $value }}% (> 1.0%) over the 30-day rolling window.'
    runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-warning"
|
||||
|
||||
- alert: ScannerFnDriftCritical
  # scanner_fn_drift_percent above 2.5 for 5m.
  expr: scanner_fn_drift_percent > 2.5
  for: 5m
  labels:
    severity: critical
    service: scanner
    slo: fn-drift
  annotations:
    summary: "Scanner FN-Drift rate above critical threshold"
    # The metric is already expressed in percent (threshold 2.5 == 2.5%), so
    # it is printed as-is. humanizePercentage expects a 0-1 ratio and would
    # multiply by 100, rendering 3.0 as "300%".
    description: 'FN-Drift is {{ printf "%.2f" $value }}% (> 2.5%) over the 30-day rolling window.'
    runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-critical"
|
||||
|
||||
- alert: ScannerFnDriftEngineViolation
  # Any non-zero engine-caused drift count, held for 1m.
  expr: scanner_fn_drift_cause_engine > 0
  for: 1m
  labels:
    # NOTE(review): "page" differs from the info/warning/critical values used
    # by the other rules here; confirm Alertmanager routes it.
    severity: page
    service: scanner
    slo: determinism
  annotations:
    summary: "Engine-caused FN drift detected (determinism violation)"
    description: "Engine-caused FN drift count is {{ $value }} (> 0). This indicates non-feed, non-policy changes affecting outcomes."
    runbook_url: "https://docs.stellaops.io/runbooks/scanner/fn-drift-engine-violation"
|
||||
54
deploy/telemetry/alerts/signals-alerts.yaml
Normal file
54
deploy/telemetry/alerts/signals-alerts.yaml
Normal file
@@ -0,0 +1,54 @@
|
||||
groups:
|
||||
- name: signals-pipeline
|
||||
rules:
|
||||
- alert: SignalsScoringLatencyP95High
  # p95 of signals_reachability_scoring_duration_seconds (5m rate, merged
  # by bucket) above 2s, sustained for 10m.
  expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
  for: 10m
  labels:
    severity: warning
    service: signals
  annotations:
    summary: "Signals scoring latency high (p95)"
    description: "Reachability scoring p95 exceeds 2s for 10m"
|
||||
|
||||
- alert: SignalsCacheMissRateHigh
  # Cache miss ratio = misses / (hits + misses) over 5m, alerting above 30%
  # when sustained for 10m.
  #
  # The denominator is not clamped: clamp_min(..., 1) would floor it at
  # 1 lookup/s, so below that traffic level the expression stops being a
  # ratio and under-reports the miss share. With no lookups at all the
  # result is 0/0 = NaN, which never satisfies `> 0.3`.
  # rate() of a counter is never negative, so no clamp is needed on the
  # numerator either.
  expr: |
    rate(signals_cache_misses_total[5m])
    / (rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m])) > 0.3
  for: 10m
  labels:
    severity: warning
    service: signals
  annotations:
    summary: "Signals cache miss rate high"
    description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."
|
||||
|
||||
- alert: SignalsCacheDown
  # signals_cache_available reporting 0 for 2m.
  expr: signals_cache_available == 0
  for: 2m
  labels:
    severity: critical
    service: signals
  annotations:
    summary: "Signals cache unavailable"
    description: "Redis cache reported unavailable for >2m"
|
||||
|
||||
- alert: SignalsSensorStaleness
  # Per sensor: seconds since the newest last-seen timestamp exceed 900
  # (15 minutes), sustained for 5m.
  expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
  for: 5m
  labels:
    severity: warning
    service: signals
  annotations:
    summary: "Signals sensor stale"
    description: "No updates from sensor for >15 minutes"
|
||||
|
||||
- alert: SignalsIngestionErrorRate
  # Ingestion failure ratio = failures / total over 5m, alerting above 5%
  # when sustained for 5m.
  #
  # The denominator is not clamped: clamp_min(..., 1) would floor it at
  # 1 ingestion/s, so below that traffic level the expression becomes an
  # absolute failure rate rather than the documented ratio. With no
  # ingestion the result is 0/0 = NaN, which never satisfies `> 0.05`.
  # NOTE(review): assumes both counters carry the same label set; if the
  # failure counter has extra labels, aggregate both sides with sum().
  expr: rate(signals_ingestion_failures_total[5m]) / rate(signals_ingestion_total[5m]) > 0.05
  for: 5m
  labels:
    severity: critical
    service: signals
  annotations:
    summary: "Signals ingestion failures elevated"
    description: "Ingestion failure ratio above 5% over 5m"
|
||||
118
deploy/telemetry/alerts/stella-p0-alerts.yml
Normal file
118
deploy/telemetry/alerts/stella-p0-alerts.yml
Normal file
@@ -0,0 +1,118 @@
|
||||
# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
|
||||
# Task: P0M-006 - Alerting Rules
|
||||
# P0 Product Metrics Alert Rules
|
||||
|
||||
groups:
|
||||
# P0 product-metric alerts. Most rules come in warning/critical pairs that
# share an expression and differ only in threshold.
- name: stella-p0-metrics
  rules:
    # P0M-001: Time to First Verified Release
    # Per-tenant P90 over a 24h window; warning above 14400s, held for 1h.
    - alert: StellaTimeToFirstReleaseHigh
      expr: |
        histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
      for: 1h
      labels:
        severity: warning
        category: adoption
      annotations:
        summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
        description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
        runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

    # Same quantile; critical above 86400s, held for 1h.
    - alert: StellaTimeToFirstReleaseCritical
      expr: |
        histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
      for: 1h
      labels:
        severity: critical
        category: adoption
      annotations:
        summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
        description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
        runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

    # P0M-002: Why Blocked Latency
    # Per-tenant P90 over a 1h window; warning above 300s, held for 30m.
    - alert: StellaWhyBlockedLatencyHigh
      expr: |
        histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
      for: 30m
      labels:
        severity: warning
        category: usability
      annotations:
        summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
        description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
        runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

    # Same quantile; critical above 3600s, held for 30m.
    - alert: StellaWhyBlockedLatencyCritical
      expr: |
        histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
      for: 30m
      labels:
        severity: critical
        category: usability
      annotations:
        summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
        description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
        runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

    # P0M-003: Support Burden
    # Summed per (tenant, month); warning above 30. `for: 0m` fires on the
    # first evaluation that crosses the threshold.
    - alert: StellaSupportBurdenHigh
      expr: |
        sum by (tenant, month) (stella_support_burden_minutes_total) > 30
      for: 0m
      labels:
        severity: warning
        category: operations
      annotations:
        summary: "Support burden high for tenant {{ $labels.tenant }}"
        description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
        runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

    # Same aggregation; critical above 60.
    - alert: StellaSupportBurdenCritical
      expr: |
        sum by (tenant, month) (stella_support_burden_minutes_total) > 60
      for: 0m
      labels:
        severity: critical
        category: operations
      annotations:
        summary: "Support burden critically high for tenant {{ $labels.tenant }}"
        description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
        runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

    # P0M-004: Determinism Regressions
    # Any increase of the severity="policy" counter within 5m fires at once.
    - alert: StellaDeterminismRegression
      expr: |
        increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
      for: 0m
      labels:
        severity: critical
        category: reliability
      annotations:
        summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
        description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
        runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

    # Any increase of the severity="semantic" counter within 1h.
    - alert: StellaDeterminismRegressionSemantic
      expr: |
        increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
      for: 0m
      labels:
        severity: warning
        category: reliability
      annotations:
        summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
        description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
        runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

    # More than 5 severity="bitwise" regressions within 24h.
    - alert: StellaDeterminismRegressionBitwise
      expr: |
        increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
      for: 0m
      labels:
        severity: warning
        category: reliability
      annotations:
        summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
        description: "{{ $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h"
        runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
|
||||
62
deploy/telemetry/alerts/triage-alerts.yaml
Normal file
62
deploy/telemetry/alerts/triage-alerts.yaml
Normal file
@@ -0,0 +1,62 @@
|
||||
groups:
|
||||
- name: triage-ttfs
|
||||
rules:
|
||||
- alert: TriageTtfsFirstEvidenceP95High
  # p95 of stellaops_ttfs_first_evidence_seconds (5m rate, merged by
  # bucket) above 1.5s, sustained for 10m.
  expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le)) > 1.5
  for: 10m
  labels:
    severity: critical
    service: triage
  annotations:
    summary: "TTFS first evidence p95 high"
    description: "TTFS first-evidence p95 exceeds 1.5s for 10m (triage experience degraded)."
|
||||
|
||||
- alert: TriageTtfsSkeletonP95High
  # p95 of stellaops_ttfs_skeleton_seconds (5m rate, merged by bucket)
  # above 0.2s, sustained for 10m.
  expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le)) > 0.2
  for: 10m
  labels:
    severity: warning
    service: triage
  annotations:
    summary: "TTFS skeleton p95 high"
    description: "TTFS skeleton p95 exceeds 200ms for 10m."
|
||||
|
||||
- alert: TriageTtfsFullEvidenceP95High
  # p95 of stellaops_ttfs_full_evidence_seconds (5m rate, merged by
  # bucket) above 1.5s, sustained for 10m.
  expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le)) > 1.5
  for: 10m
  labels:
    severity: warning
    service: triage
  annotations:
    summary: "TTFS full evidence p95 high"
    description: "TTFS full-evidence p95 exceeds 1.5s for 10m."
|
||||
|
||||
- alert: TriageClicksToClosureMedianHigh
  # Median (0.50 quantile) of the stellaops_clicks_to_closure histogram
  # (5m rate, merged by bucket) above 6, sustained for 15m.
  expr: histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le)) > 6
  for: 15m
  labels:
    severity: warning
    service: triage
  annotations:
    summary: "Clicks-to-closure median high"
    description: "Median clicks-to-closure exceeds 6 for 15m."
|
||||
|
||||
- alert: TriageEvidenceCompletenessAvgLow
  # Average evidence completeness score over 15m (rate of _sum divided by
  # rate of _count), alerting below 3.6 when sustained for 30m.
  #
  # The denominator is not clamped. clamp_min(..., 1) would floor the
  # observation rate at 1/s, so with no traffic the "average" evaluates to
  # 0/1 = 0 and with light traffic it is scaled far too low; both satisfy
  # `< 3.6` and fire falsely. Unclamped, no traffic gives 0/0 = NaN, which
  # never satisfies the comparison.
  expr: (sum(rate(stellaops_evidence_completeness_score_sum[15m])) / sum(rate(stellaops_evidence_completeness_score_count[15m]))) < 3.6
  for: 30m
  labels:
    severity: warning
    service: triage
  annotations:
    summary: "Evidence completeness below target"
    description: "Average evidence completeness score below 3.6 (90%) for 30m."
|
||||
|
||||
- alert: TriageBudgetViolationRateHigh
  # One alert instance per phase whose 5m violation rate exceeds 0.05/s
  # for 10m.
  expr: sum(rate(stellaops_performance_budget_violations_total[5m])) by (phase) > 0.05
  for: 10m
  labels:
    severity: warning
    service: triage
  annotations:
    summary: "Performance budget violations elevated"
    description: "Performance budget violation rate exceeds 0.05/s for 10m."
|
||||
Reference in New Issue
Block a user