synergy moats product advisory implementations
This commit is contained in:
118
devops/telemetry/alerts/stella-p0-alerts.yml
Normal file
118
devops/telemetry/alerts/stella-p0-alerts.yml
Normal file
@@ -0,0 +1,118 @@
|
||||
# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
|
||||
# Task: P0M-006 - Alerting Rules
|
||||
# P0 Product Metrics Alert Rules
|
||||
|
||||
groups:
|
||||
- name: stella-p0-metrics
|
||||
rules:
|
||||
# P0M-001: Time to First Verified Release
|
||||
# Warning tier: the per-tenant P90 of the first-verified-release histogram,
# computed over a 24h rate window, has stayed above 14400 s (4 h) for 1 h.
- alert: StellaTimeToFirstReleaseHigh
  expr: |
    histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
  for: 1h
  labels:
    severity: warning
    category: adoption
  annotations:
    summary: 'Time to first verified release is high for tenant {{ $labels.tenant }}'
    # $value is the P90 in seconds; humanizeDuration renders it as a duration.
    description: 'P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)'
    runbook_url: 'https://docs.stella-ops.org/runbooks/adoption-onboarding'
|
||||
|
||||
# Critical tier of the same P90 expression: threshold raised to 86400 s
# (24 h), still evaluated per tenant over a 24h rate window and held for 1 h.
- alert: StellaTimeToFirstReleaseCritical
  expr: |
    histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
  for: 1h
  labels:
    severity: critical
    category: adoption
  annotations:
    summary: 'Time to first verified release critically high for tenant {{ $labels.tenant }}'
    description: 'P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)'
    runbook_url: 'https://docs.stella-ops.org/runbooks/adoption-onboarding'
|
||||
|
||||
# P0M-002: Why Blocked Latency
|
||||
# Warning tier: the per-tenant P90 of stella_why_blocked_latency_seconds,
# computed over a 1h rate window, has stayed above 300 s (5 min) for 30 min.
- alert: StellaWhyBlockedLatencyHigh
  expr: |
    histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
  for: 30m
  labels:
    severity: warning
    category: usability
  annotations:
    summary: 'Why-blocked latency is high for tenant {{ $labels.tenant }}'
    # Kept double-quoted because the text itself contains single quotes.
    description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
    runbook_url: 'https://docs.stella-ops.org/runbooks/usability-explain'
|
||||
|
||||
# Critical tier of the why-blocked P90 expression: threshold raised to
# 3600 s (1 h), same 1h rate window, per tenant, held for 30 min.
- alert: StellaWhyBlockedLatencyCritical
  expr: |
    histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
  for: 30m
  labels:
    severity: critical
    category: usability
  annotations:
    summary: 'Why-blocked latency critically high for tenant {{ $labels.tenant }}'
    # Kept double-quoted because the text itself contains single quotes.
    description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
    runbook_url: 'https://docs.stella-ops.org/runbooks/usability-explain'
|
||||
|
||||
# P0M-003: Support Burden
|
||||
# Warning tier: stella_support_burden_minutes_total summed per (tenant, month)
# exceeds 30. With `for: 0m` there is no hold period before the alert fires.
# NOTE(review): this compares the raw counter value, not increase(); presumably
# the `month` label scopes each series to one calendar month - confirm with
# the emitter, and check how counter resets are handled.
- alert: StellaSupportBurdenHigh
  expr: |
    sum by (tenant, month) (stella_support_burden_minutes_total) > 30
  for: 0m
  labels:
    severity: warning
    category: operations
  annotations:
    summary: 'Support burden high for tenant {{ $labels.tenant }}'
    description: 'Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)'
    runbook_url: 'https://docs.stella-ops.org/runbooks/support-optimization'
|
||||
|
||||
# Critical tier: stella_support_burden_minutes_total summed per (tenant, month)
# exceeds 60. With `for: 0m` there is no hold period before the alert fires.
# NOTE(review): this compares the raw counter value, not increase(); presumably
# the `month` label scopes each series to one calendar month - confirm with
# the emitter, and check how counter resets are handled.
- alert: StellaSupportBurdenCritical
  expr: |
    sum by (tenant, month) (stella_support_burden_minutes_total) > 60
  for: 0m
  labels:
    severity: critical
    category: operations
  annotations:
    summary: 'Support burden critically high for tenant {{ $labels.tenant }}'
    description: 'Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)'
    runbook_url: 'https://docs.stella-ops.org/runbooks/support-optimization'
|
||||
|
||||
# P0M-004: Determinism Regressions
|
||||
# Critical: any increase of the severity="policy" determinism-regression
# counter within the last 5 minutes fires at once (`for: 0m`).
# The expression is not aggregated, so the series' own labels (tenant,
# component) remain available to the annotation templates below.
- alert: StellaDeterminismRegression
  expr: |
    increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
  for: 0m
  labels:
    # NOTE(review): this key collides with the metric's own `severity` label
    # used in the selector; Prometheus documents that rule labels overwrite
    # conflicting ones, so the fired alert carries severity=critical.
    severity: critical
    category: reliability
  annotations:
    summary: 'Policy-level determinism regression detected for tenant {{ $labels.tenant }}'
    description: 'Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions'
    runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'
|
||||
|
||||
# Warning: any increase of the severity="semantic" determinism-regression
# counter within the last hour fires at once (`for: 0m`).
- alert: StellaDeterminismRegressionSemantic
  expr: |
    increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
  for: 0m
  labels:
    # NOTE(review): this key collides with the metric's own `severity` label
    # used in the selector; Prometheus documents that rule labels overwrite
    # conflicting ones, so the fired alert carries severity=warning.
    severity: warning
    category: reliability
  annotations:
    summary: 'Semantic determinism regression detected for tenant {{ $labels.tenant }}'
    description: 'Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged'
    runbook_url: 'https://docs.stella-ops.org/runbooks/determinism-failure'
|
||||
|
||||
# Warning: more than 5 severity="bitwise" determinism regressions counted by
# increase() over the last 24 hours; fires at once (`for: 0m`).
- alert: StellaDeterminismRegressionBitwise
  expr: |
    increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
  for: 0m
  labels:
    # NOTE(review): this key collides with the metric's own `severity` label
    # used in the selector; Prometheus documents that rule labels overwrite
    # conflicting ones, so the fired alert carries severity=warning.
    severity: warning
    category: reliability
  annotations:
    summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
    # increase() extrapolates across the window, so $value is usually
    # fractional (e.g. 6.0021); render it as a whole count for readers.
    description: '{{ printf "%.0f" $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h'
    runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
|
||||
308
devops/telemetry/grafana/dashboards/stella-ops-p0-metrics.json
Normal file
308
devops/telemetry/grafana/dashboards/stella-ops-p0-metrics.json
Normal file
@@ -0,0 +1,308 @@
|
||||
{
|
||||
"__comment": "Sprint: SPRINT_20260117_028_Telemetry_p0_metrics - P0 Product Metrics Dashboard",
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Time from fresh install to first successful verified promotion",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 14400 },
          { "color": "red", "value": 86400 }
        ]
      },
      "unit": "s"
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
  "id": 1,
  "options": {
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "showThresholdLabels": false,
    "showThresholdMarkers": true
  },
  "title": "Time to First Verified Release (P90)",
  "type": "gauge",
  "targets": [
    {
      "expr": "histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket{tenant=~\"$tenant\"}[24h])) by (le))",
      "legendFormat": "P90",
      "refId": "A"
    }
  ]
},
|
||||
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Time from block decision to user viewing explanation",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 300 },
          { "color": "red", "value": 3600 }
        ]
      },
      "unit": "s"
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
  "id": 2,
  "options": {
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "showThresholdLabels": false,
    "showThresholdMarkers": true
  },
  "title": "Why Blocked Latency (P90)",
  "type": "gauge",
  "targets": [
    {
      "expr": "histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket{tenant=~\"$tenant\"}[24h])) by (le))",
      "legendFormat": "P90",
      "refId": "A"
    }
  ]
},
|
||||
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Support minutes per tenant this month",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 30 },
          { "color": "red", "value": 60 }
        ]
      },
      "unit": "m"
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
  "id": 3,
  "options": {
    "displayMode": "lcd",
    "minVizHeight": 10,
    "minVizWidth": 0,
    "orientation": "horizontal",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "showUnfilled": true
  },
  "title": "Support Burden (minutes/month)",
  "type": "bargauge",
  "targets": [
    {
      "expr": "sum by (tenant, category) (stella_support_burden_minutes_total{month=~\"$month\", tenant=~\"$tenant\"})",
      "legendFormat": "{{tenant}} - {{category}}",
      "refId": "A"
    }
  ]
},
|
||||
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Determinism regression count by severity",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "thresholds"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "red", "value": 1 }
        ]
      },
      "unit": "short"
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
  "id": 4,
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "textMode": "auto"
  },
  "title": "Determinism Regressions",
  "type": "stat",
  "targets": [
    {
      "expr": "sum by (severity) (stella_determinism_regressions_total{tenant=~\"$tenant\"})",
      "legendFormat": "{{severity}}",
      "refId": "A"
    }
  ]
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
  "description": "Time to first release heatmap over time",
  "gridPos": {
    "h": 8,
    "w": 24,
    "x": 0,
    "y": 16
  },
  "id": 5,
  "options": {
    "calculate": false,
    "cellGap": 1,
    "color": {
      "exponent": 0.5,
      "fill": "dark-orange",
      "mode": "scheme",
      "reverse": false,
      "scale": "exponential",
      "scheme": "Oranges",
      "steps": 64
    },
    "exemplars": { "color": "rgba(255,0,255,0.7)" },
    "filterValues": { "le": 1e-9 },
    "legend": { "show": true },
    "rowsFrame": { "layout": "auto" },
    "tooltip": { "show": true, "yHistogram": false },
    "yAxis": {
      "axisPlacement": "left",
      "reverse": false,
      "unit": "s"
    }
  },
  "title": "Time to First Release Distribution",
  "type": "heatmap",
  "targets": [
    {
      "expr": "sum(rate(stella_time_to_first_verified_release_seconds_bucket{tenant=~\"$tenant\"}[1h])) by (le)",
      "format": "heatmap",
      "legendFormat": "{{le}}",
      "refId": "A"
    }
  ]
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["stella-ops", "p0-metrics", "product"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
  "current": {},
  "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
  "definition": "label_values(stella_time_to_first_verified_release_seconds_count, tenant)",
  "hide": 0,
  "includeAll": true,
  "label": "Tenant",
  "multi": true,
  "name": "tenant",
  "options": [],
  "query": {
    "query": "label_values(stella_time_to_first_verified_release_seconds_count, tenant)",
    "refId": "StandardVariableQuery"
  },
  "refresh": 1,
  "regex": "",
  "skipUrlSync": false,
  "sort": 1,
  "type": "query"
},
|
||||
{
  "current": {},
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "definition": "label_values(stella_support_burden_minutes_total, month)",
  "hide": 0,
  "includeAll": false,
  "label": "Month",
  "multi": false,
  "name": "month",
  "options": [],
  "query": {
    "query": "label_values(stella_support_burden_minutes_total, month)",
    "refId": "StandardVariableQuery"
  },
  "refresh": 1,
  "regex": "",
  "skipUrlSync": false,
  "sort": 2,
  "type": "query"
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-7d",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "utc",
|
||||
"title": "Stella Ops P0 Product Metrics",
|
||||
"uid": "stella-ops-p0-metrics",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
Reference in New Issue
Block a user