synergy moats product advisory implementations

This commit is contained in:
master
2026-01-17 01:30:03 +02:00
parent 77ff029205
commit 702a27ac83
112 changed files with 21356 additions and 127 deletions

View File

@@ -0,0 +1,38 @@
-- -----------------------------------------------------------------------------
-- V20260117__create_doctor_reports_table.sql
-- Sprint: SPRINT_20260117_025_Doctor_coverage_expansion
-- Task: DOC-EXP-005 - Persistent Report Storage
-- Description: Migration to create doctor_reports table for persistent storage
-- -----------------------------------------------------------------------------
-- Persistent store for Doctor diagnostic reports: one row per run_id.
-- The report body is kept as a GZip-compressed JSON blob; the overall severity
-- and the per-outcome counters are stored next to it as plain columns.
CREATE TABLE IF NOT EXISTS doctor_reports (
    -- Run identity and timing. completed_at is the only nullable column.
    run_id                 VARCHAR(64),
    started_at             TIMESTAMPTZ  NOT NULL,
    completed_at           TIMESTAMPTZ,

    -- Summary columns; every counter defaults to 0.
    overall_severity       VARCHAR(16)  NOT NULL,
    passed_count           INTEGER      NOT NULL DEFAULT 0,
    warning_count          INTEGER      NOT NULL DEFAULT 0,
    failed_count           INTEGER      NOT NULL DEFAULT 0,
    skipped_count          INTEGER      NOT NULL DEFAULT 0,
    info_count             INTEGER      NOT NULL DEFAULT 0,
    total_count            INTEGER      NOT NULL DEFAULT 0,

    -- Compressed report payload and the row's insertion time.
    report_json_compressed BYTEA        NOT NULL,
    created_at             TIMESTAMPTZ  NOT NULL DEFAULT NOW(),

    PRIMARY KEY (run_id)
);
-- Secondary indexes; IF NOT EXISTS lets each statement be re-run safely.

-- started_at, newest first: listing reports by date.
CREATE INDEX IF NOT EXISTS idx_doctor_reports_started_at
    ON doctor_reports (started_at DESC);

-- created_at: retention cleanup.
CREATE INDEX IF NOT EXISTS idx_doctor_reports_created_at
    ON doctor_reports (created_at);

-- overall_severity: filtering reports by severity.
CREATE INDEX IF NOT EXISTS idx_doctor_reports_severity
    ON doctor_reports (overall_severity);
-- Catalog descriptions for the table and for its compressed payload column.
COMMENT ON TABLE doctor_reports
    IS 'Stores Doctor diagnostic reports with compression for audit trail';

COMMENT ON COLUMN doctor_reports.report_json_compressed
    IS 'GZip compressed JSON report data';

View File

@@ -0,0 +1,118 @@
# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
# Task: P0M-006 - Alerting Rules
#
# Alerting rules for the P0 product metrics (P0M-001 .. P0M-004).
# Each metric has a warning rule and a stricter critical rule over the same
# expression; only the threshold (and, for determinism, the window) differs.
groups:
  - name: stella-p0-metrics
    rules:
      # ---------------------------------------------------------------------
      # P0M-001: Time to First Verified Release
      # Per-tenant P90 over a 24h rate window, in seconds.
      # ---------------------------------------------------------------------

      # Warning: P90 above 14400 s (4 hours), sustained for 1h.
      - alert: StellaTimeToFirstReleaseHigh
        expr: |
          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
        for: 1h
        labels:
          severity: warning
          category: adoption
        annotations:
          summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # Critical: P90 above 86400 s (24 hours), sustained for 1h.
      - alert: StellaTimeToFirstReleaseCritical
        expr: |
          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
        for: 1h
        labels:
          severity: critical
          category: adoption
        annotations:
          summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"

      # ---------------------------------------------------------------------
      # P0M-002: Why Blocked Latency
      # Per-tenant P90 over a 1h rate window, in seconds.
      # ---------------------------------------------------------------------

      # Warning: P90 above 300 s (5 minutes), sustained for 30m.
      - alert: StellaWhyBlockedLatencyHigh
        expr: |
          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
        for: 30m
        labels:
          severity: warning
          category: usability
        annotations:
          summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # Critical: P90 above 3600 s (1 hour), sustained for 30m.
      - alert: StellaWhyBlockedLatencyCritical
        expr: |
          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
        for: 30m
        labels:
          severity: critical
          category: usability
        annotations:
          summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"

      # ---------------------------------------------------------------------
      # P0M-003: Support Burden
      # Sum of the support-minutes counter per (tenant, month); fires
      # immediately (for: 0m) once the sum crosses the threshold.
      # ---------------------------------------------------------------------

      # Warning: more than 30 minutes.
      - alert: StellaSupportBurdenHigh
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 30
        for: 0m
        labels:
          severity: warning
          category: operations
        annotations:
          summary: "Support burden high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # Critical: more than 60 minutes.
      - alert: StellaSupportBurdenCritical
        expr: |
          sum by (tenant, month) (stella_support_burden_minutes_total) > 60
        for: 0m
        labels:
          severity: critical
          category: operations
        annotations:
          summary: "Support burden critically high for tenant {{ $labels.tenant }}"
          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"

      # ---------------------------------------------------------------------
      # P0M-004: Determinism Regressions
      # One rule per value of the metric's own `severity` label
      # (policy / semantic / bitwise). The rule-level `severity` label below
      # replaces that series label on the fired alert.
      # ---------------------------------------------------------------------

      # Critical: any policy-level regression within the last 5m.
      - alert: StellaDeterminismRegression
        expr: |
          increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0
        for: 0m
        labels:
          severity: critical
          category: reliability
        annotations:
          summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning: any semantic-level regression within the last 1h.
      - alert: StellaDeterminismRegressionSemantic
        expr: |
          increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
          description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

      # Warning: more than 5 bitwise-level regressions within the last 24h.
      - alert: StellaDeterminismRegressionBitwise
        expr: |
          increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
        for: 0m
        labels:
          severity: warning
          category: reliability
        annotations:
          summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
          description: "{{ $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h"
          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"

View File

@@ -0,0 +1,308 @@
{
"__comment": "Sprint: SPRINT_20260117_028_Telemetry_p0_metrics - P0 Product Metrics Dashboard",
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Time from fresh install to first successful verified promotion",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "palette-classic"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 14400 },
          { "color": "red", "value": 86400 }
        ]
      },
      "unit": "s"
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
  "id": 1,
  "options": {
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "showThresholdLabels": false,
    "showThresholdMarkers": true
  },
  "title": "Time to First Verified Release (P90)",
  "type": "gauge",
  "targets": [
    {
      "expr": "histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket{tenant=~\"$tenant\"}[24h])) by (le))",
      "legendFormat": "P90",
      "refId": "A"
    }
  ]
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "description": "Time from block decision to user viewing explanation",
  "fieldConfig": {
    "defaults": {
      "color": {
        "mode": "palette-classic"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          { "color": "green", "value": null },
          { "color": "yellow", "value": 300 },
          { "color": "red", "value": 3600 }
        ]
      },
      "unit": "s"
    }
  },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
  "id": 2,
  "options": {
    "orientation": "auto",
    "reduceOptions": {
      "calcs": ["lastNotNull"],
      "fields": "",
      "values": false
    },
    "showThresholdLabels": false,
    "showThresholdMarkers": true
  },
  "title": "Why Blocked Latency (P90)",
  "type": "gauge",
  "targets": [
    {
      "expr": "histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket{tenant=~\"$tenant\"}[24h])) by (le))",
      "legendFormat": "P90",
      "refId": "A"
    }
  ]
},
{
  "id": 3,
  "type": "bargauge",
  "title": "Support Burden (minutes/month)",
  "description": "Support minutes per tenant this month",
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 0,
    "y": 8
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (tenant, category) (stella_support_burden_minutes_total{month=~\"$month\", tenant=~\"$tenant\"})",
      "legendFormat": "{{tenant}} - {{category}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "m",
      "color": {
        "mode": "palette-classic"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "yellow",
            "value": 30
          },
          {
            "color": "red",
            "value": 60
          }
        ]
      }
    }
  },
  "options": {
    "displayMode": "lcd",
    "orientation": "horizontal",
    "showUnfilled": true,
    "minVizHeight": 10,
    "minVizWidth": 0,
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    }
  }
},
{
  "id": 4,
  "type": "stat",
  "title": "Determinism Regressions",
  "description": "Determinism regression count by severity",
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "gridPos": {
    "h": 8,
    "w": 12,
    "x": 12,
    "y": 8
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum by (severity) (stella_determinism_regressions_total{tenant=~\"$tenant\"})",
      "legendFormat": "{{severity}}"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "unit": "short",
      "color": {
        "mode": "palette-classic"
      },
      "mappings": [],
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {
            "color": "green",
            "value": null
          },
          {
            "color": "red",
            "value": 1
          }
        ]
      }
    }
  },
  "options": {
    "colorMode": "value",
    "graphMode": "area",
    "justifyMode": "auto",
    "orientation": "auto",
    "textMode": "auto",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    }
  }
},
{
  "id": 5,
  "type": "heatmap",
  "title": "Time to First Release Distribution",
  "description": "Time to first release heatmap over time",
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "gridPos": {
    "h": 8,
    "w": 24,
    "x": 0,
    "y": 16
  },
  "targets": [
    {
      "refId": "A",
      "expr": "sum(rate(stella_time_to_first_verified_release_seconds_bucket{tenant=~\"$tenant\"}[1h])) by (le)",
      "format": "heatmap",
      "legendFormat": "{{le}}"
    }
  ],
  "options": {
    "calculate": false,
    "cellGap": 1,
    "color": {
      "mode": "scheme",
      "scheme": "Oranges",
      "fill": "dark-orange",
      "scale": "exponential",
      "exponent": 0.5,
      "steps": 64,
      "reverse": false
    },
    "exemplars": {
      "color": "rgba(255,0,255,0.7)"
    },
    "filterValues": {
      "le": 1e-9
    },
    "legend": {
      "show": true
    },
    "rowsFrame": {
      "layout": "auto"
    },
    "tooltip": {
      "show": true,
      "yHistogram": false
    },
    "yAxis": {
      "axisPlacement": "left",
      "unit": "s",
      "reverse": false
    }
  }
}
],
"refresh": "30s",
"schemaVersion": 38,
"style": "dark",
"tags": ["stella-ops", "p0-metrics", "product"],
"templating": {
"list": [
{
  "current": {},
  "hide": 0,
  "includeAll": false,
  "label": "Data source",
  "multi": false,
  "name": "DS_PROMETHEUS",
  "options": [],
  "query": "prometheus",
  "refresh": 1,
  "regex": "",
  "skipUrlSync": false,
  "type": "datasource"
},
{
  "current": {},
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "definition": "label_values(stella_time_to_first_verified_release_seconds_count, tenant)",
  "hide": 0,
  "includeAll": true,
  "label": "Tenant",
  "multi": true,
  "name": "tenant",
  "options": [],
  "query": {
    "query": "label_values(stella_time_to_first_verified_release_seconds_count, tenant)",
    "refId": "StandardVariableQuery"
  },
  "refresh": 1,
  "regex": "",
  "skipUrlSync": false,
  "sort": 1,
  "type": "query"
},
{
  "current": {},
  "datasource": {
    "type": "prometheus",
    "uid": "${DS_PROMETHEUS}"
  },
  "definition": "label_values(stella_support_burden_minutes_total, month)",
  "hide": 0,
  "includeAll": false,
  "label": "Month",
  "multi": false,
  "name": "month",
  "options": [],
  "query": {
    "query": "label_values(stella_support_burden_minutes_total, month)",
    "refId": "StandardVariableQuery"
  },
  "refresh": 2,
  "regex": "",
  "skipUrlSync": false,
  "sort": 2,
  "type": "query"
}
]
},
"time": {
"from": "now-7d",
"to": "now"
},
"timepicker": {},
"timezone": "utc",
"title": "Stella Ops P0 Product Metrics",
"uid": "stella-ops-p0-metrics",
"version": 1,
"weekStart": ""
}