synergy moats product advisory implementations

2026-01-17 01:30:03 +02:00
parent 77ff029205
commit 702a27ac83
112 changed files with 21356 additions and 127 deletions
--- a/devops/database/migrations/V20260117__create_doctor_reports_table.sql
+++ b/devops/database/migrations/V20260117__create_doctor_reports_table.sql
@@ -0,0 +1,38 @@
+-- -----------------------------------------------------------------------------
+-- V20260117__create_doctor_reports_table.sql
+-- Sprint: SPRINT_20260117_025_Doctor_coverage_expansion
+-- Task: DOC-EXP-005 - Persistent Report Storage
+-- Description: Migration to create doctor_reports table for persistent storage
+-- -----------------------------------------------------------------------------
+
+-- Doctor reports table for persistent storage: one row per run_id; outcome counters are constrained to be non-negative
+CREATE TABLE IF NOT EXISTS doctor_reports (
+    run_id VARCHAR(64) PRIMARY KEY,
+    started_at TIMESTAMPTZ NOT NULL,
+    completed_at TIMESTAMPTZ,               -- nullable; presumably unset until the run finishes (confirm with the writer)
+    overall_severity VARCHAR(16) NOT NULL,
+    passed_count INTEGER NOT NULL DEFAULT 0 CHECK (passed_count >= 0),
+    warning_count INTEGER NOT NULL DEFAULT 0 CHECK (warning_count >= 0),
+    failed_count INTEGER NOT NULL DEFAULT 0 CHECK (failed_count >= 0),
+    skipped_count INTEGER NOT NULL DEFAULT 0 CHECK (skipped_count >= 0),
+    info_count INTEGER NOT NULL DEFAULT 0 CHECK (info_count >= 0),
+    total_count INTEGER NOT NULL DEFAULT 0 CHECK (total_count >= 0),
+    report_json_compressed BYTEA NOT NULL,  -- GZip compressed JSON report data
+    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- Descending index on started_at, for listing reports by date
+CREATE INDEX IF NOT EXISTS idx_doctor_reports_started_at
+    ON doctor_reports (started_at DESC);
+
+-- Index on created_at, for retention cleanup
+CREATE INDEX IF NOT EXISTS idx_doctor_reports_created_at
+    ON doctor_reports (created_at);
+
+-- Index on overall_severity, for filtering reports by severity
+CREATE INDEX IF NOT EXISTS idx_doctor_reports_severity
+    ON doctor_reports (overall_severity);
+
+-- Catalog comments: one for the table itself, one for the compressed report payload column
+COMMENT ON TABLE doctor_reports IS 'Stores Doctor diagnostic reports with compression for audit trail';
+COMMENT ON COLUMN doctor_reports.report_json_compressed IS 'GZip compressed JSON report data';
--- a/devops/telemetry/alerts/stella-p0-alerts.yml
+++ b/devops/telemetry/alerts/stella-p0-alerts.yml
@@ -0,0 +1,118 @@
+# Sprint: SPRINT_20260117_028_Telemetry_p0_metrics
+# Task: P0M-006 - Alerting Rules
+# P0 Product Metrics Alert Rules
+
+groups:
+  - name: stella-p0-metrics
+    rules:
+      # P0M-001: Time to First Verified Release - per-tenant P90 computed from the histogram buckets over a 24h rate window
+      - alert: StellaTimeToFirstReleaseHigh  # warning tier: P90 above 14400 s (4 hours)
+        expr: |
+          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 14400
+        for: 1h  # the condition must hold for 1h before the alert fires
+        labels:
+          severity: warning
+          category: adoption
+        annotations:
+          summary: "Time to first verified release is high for tenant {{ $labels.tenant }}"
+          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 4 hours)"
+          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"
+          
+      - alert: StellaTimeToFirstReleaseCritical  # critical tier: same expression, P90 above 86400 s (24 hours)
+        expr: |
+          histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket[24h])) by (le, tenant)) > 86400
+        for: 1h  # the condition must hold for 1h before the alert fires
+        labels:
+          severity: critical
+          category: adoption
+        annotations:
+          summary: "Time to first verified release critically high for tenant {{ $labels.tenant }}"
+          description: "P90 time to first verified release is {{ $value | humanizeDuration }} (threshold: 24 hours)"
+          runbook_url: "https://docs.stella-ops.org/runbooks/adoption-onboarding"
+
+      # P0M-002: Why Blocked Latency - per-tenant P90 computed from the histogram buckets over a 1h rate window
+      - alert: StellaWhyBlockedLatencyHigh  # warning tier: P90 above 300 s (5 minutes)
+        expr: |
+          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 300
+        for: 30m  # the condition must hold for 30m before the alert fires
+        labels:
+          severity: warning
+          category: usability
+        annotations:
+          summary: "Why-blocked latency is high for tenant {{ $labels.tenant }}"
+          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 5 minutes)"
+          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"
+
+      - alert: StellaWhyBlockedLatencyCritical  # critical tier: same expression, P90 above 3600 s (1 hour)
+        expr: |
+          histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket[1h])) by (le, tenant)) > 3600
+        for: 30m  # the condition must hold for 30m before the alert fires
+        labels:
+          severity: critical
+          category: usability
+        annotations:
+          summary: "Why-blocked latency critically high for tenant {{ $labels.tenant }}"
+          description: "P90 time to answer 'why blocked' is {{ $value | humanizeDuration }} (threshold: 1 hour)"
+          runbook_url: "https://docs.stella-ops.org/runbooks/usability-explain"
+
+      # P0M-003: Support Burden - counter summed per (tenant, month). NOTE(review): series for past months keep matching while still exported - confirm old month labels are dropped.
+      - alert: StellaSupportBurdenHigh  # warning tier: summed value above 30 (minutes, per the annotation text)
+        expr: |
+          sum by (tenant, month) (stella_support_burden_minutes_total) > 30
+        for: 0m  # no hold period: fires on the first evaluation that exceeds the threshold
+        labels:
+          severity: warning
+          category: operations
+        annotations:
+          summary: "Support burden high for tenant {{ $labels.tenant }}"
+          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 30 minutes)"
+          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"
+
+      - alert: StellaSupportBurdenCritical  # critical tier: same expression, summed value above 60
+        expr: |
+          sum by (tenant, month) (stella_support_burden_minutes_total) > 60
+        for: 0m  # no hold period: fires on the first evaluation that exceeds the threshold
+        labels:
+          severity: critical
+          category: operations
+        annotations:
+          summary: "Support burden critically high for tenant {{ $labels.tenant }}"
+          description: "Support time for {{ $labels.tenant }} in {{ $labels.month }} is {{ $value }} minutes (threshold: 60 minutes)"
+          runbook_url: "https://docs.stella-ops.org/runbooks/support-optimization"
+
+      # P0M-004: Determinism Regressions. increase() cannot see a counter series that first appears already > 0, so the policy/semantic rules add an "or (... > 0 unless ... offset <window>)" arm for newly created series.
+      - alert: StellaDeterminismRegression  # critical: any policy-level regression within the last 5m
+        expr: |
+          (increase(stella_determinism_regressions_total{severity="policy"}[5m]) > 0) or (stella_determinism_regressions_total{severity="policy"} > 0 unless stella_determinism_regressions_total{severity="policy"} offset 5m)
+        for: 0m  # no hold period: fires on the first evaluation that matches
+        labels:
+          severity: critical
+          category: reliability
+        annotations:
+          summary: "Policy-level determinism regression detected for tenant {{ $labels.tenant }}"
+          description: "Determinism failure in {{ $labels.component }} component - same inputs produced different policy decisions"
+          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
+
+      - alert: StellaDeterminismRegressionSemantic  # warning: any semantic-level regression within the last 1h
+        expr: |
+          (increase(stella_determinism_regressions_total{severity="semantic"}[1h]) > 0) or (stella_determinism_regressions_total{severity="semantic"} > 0 unless stella_determinism_regressions_total{severity="semantic"} offset 1h)
+        for: 0m  # no hold period: fires on the first evaluation that matches
+        labels:
+          severity: warning
+          category: reliability
+        annotations:
+          summary: "Semantic determinism regression detected for tenant {{ $labels.tenant }}"
+          description: "Semantic-level determinism failure in {{ $labels.component }} - outputs differ but policy decision unchanged"
+          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
+
+      - alert: StellaDeterminismRegressionBitwise  # warning: more than 5 bitwise-level regressions within the last 24h
+        expr: |
+          increase(stella_determinism_regressions_total{severity="bitwise"}[24h]) > 5
+        for: 0m  # no hold period: fires on the first evaluation that matches
+        labels:
+          severity: warning
+          category: reliability
+        annotations:
+          summary: "Multiple bitwise determinism regressions for tenant {{ $labels.tenant }}"
+          description: "{{ printf \"%.0f\" $value }} bitwise-level determinism failures in {{ $labels.component }} in last 24h"
+          runbook_url: "https://docs.stella-ops.org/runbooks/determinism-failure"
--- a/devops/telemetry/grafana/dashboards/stella-ops-p0-metrics.json
+++ b/devops/telemetry/grafana/dashboards/stella-ops-p0-metrics.json
@@ -0,0 +1,308 @@
+{
+  "__comment": "Sprint: SPRINT_20260117_028_Telemetry_p0_metrics - P0 Product Metrics Dashboard",
+  "annotations": {
+    "list": []
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Time from fresh install to first successful verified promotion",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 14400 },
+              { "color": "red", "value": 86400 }
+            ]
+          },
+          "unit": "s"
+        }
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "id": 1,
+      "options": {
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "showThresholdLabels": false,
+        "showThresholdMarkers": true
+      },
+      "title": "Time to First Verified Release (P90)",
+      "type": "gauge",
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.90, sum(rate(stella_time_to_first_verified_release_seconds_bucket{tenant=~\"$tenant\"}[24h])) by (le))",
+          "legendFormat": "P90",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Time from block decision to user viewing explanation",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 300 },
+              { "color": "red", "value": 3600 }
+            ]
+          },
+          "unit": "s"
+        }
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "id": 2,
+      "options": {
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "showThresholdLabels": false,
+        "showThresholdMarkers": true
+      },
+      "title": "Why Blocked Latency (P90)",
+      "type": "gauge",
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.90, sum(rate(stella_why_blocked_latency_seconds_bucket{tenant=~\"$tenant\"}[24h])) by (le))",
+          "legendFormat": "P90",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Support minutes per tenant this month",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "yellow", "value": 30 },
+              { "color": "red", "value": 60 }
+            ]
+          },
+          "unit": "m"
+        }
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "id": 3,
+      "options": {
+        "displayMode": "lcd",
+        "minVizHeight": 10,
+        "minVizWidth": 0,
+        "orientation": "horizontal",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "showUnfilled": true
+      },
+      "title": "Support Burden (minutes/month)",
+      "type": "bargauge",
+      "targets": [
+        {
+          "expr": "sum by (tenant, category) (stella_support_burden_minutes_total{month=~\"$month\", tenant=~\"$tenant\"})",
+          "legendFormat": "{{tenant}} - {{category}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Determinism regression count by severity",
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "red", "value": 1 }
+            ]
+          },
+          "unit": "short"
+        }
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 },
+      "id": 4,
+      "options": {
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto",
+        "orientation": "auto",
+        "reduceOptions": {
+          "calcs": ["lastNotNull"],
+          "fields": "",
+          "values": false
+        },
+        "textMode": "auto"
+      },
+      "title": "Determinism Regressions",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "sum by (severity) (stella_determinism_regressions_total{tenant=~\"$tenant\"})",
+          "legendFormat": "{{severity}}",
+          "refId": "A"
+        }
+      ]
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Time to first release heatmap over time",
+      "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 },
+      "id": 5,
+      "options": {
+        "calculate": false,
+        "cellGap": 1,
+        "color": {
+          "exponent": 0.5,
+          "fill": "dark-orange",
+          "mode": "scheme",
+          "reverse": false,
+          "scale": "exponential",
+          "scheme": "Oranges",
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
+        },
+        "filterValues": {
+          "le": 1e-9
+        },
+        "legend": {
+          "show": true
+        },
+        "rowsFrame": {
+          "layout": "auto"
+        },
+        "tooltip": {
+          "show": true,
+          "yHistogram": false
+        },
+        "yAxis": {
+          "axisPlacement": "left",
+          "reverse": false,
+          "unit": "s"
+        }
+      },
+      "title": "Time to First Release Distribution",
+      "type": "heatmap",
+      "targets": [
+        {
+          "expr": "sum(rate(stella_time_to_first_verified_release_seconds_bucket{tenant=~\"$tenant\"}[1h])) by (le)",
+          "format": "heatmap",
+          "legendFormat": "{{le}}",
+          "refId": "A"
+        }
+      ]
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "style": "dark",
+  "tags": ["stella-ops", "p0-metrics", "product"],
+  "templating": {
+    "list": [
+      {
+        "current": {},
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${DS_PROMETHEUS}"
+        },
+        "definition": "label_values(stella_time_to_first_verified_release_seconds_count, tenant)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Tenant",
+        "multi": true,
+        "name": "tenant",
+        "options": [],
+        "query": {
+          "query": "label_values(stella_time_to_first_verified_release_seconds_count, tenant)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      },
+      {
+        "current": {},
+        "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
+        "definition": "label_values(stella_support_burden_minutes_total, month)",
+        "hide": 0,
+        "includeAll": false,
+        "label": "Month",
+        "multi": false,
+        "name": "month",
+        "options": [],
+        "query": { "query": "label_values(stella_support_burden_minutes_total, month)", "refId": "StandardVariableQuery" },
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 2,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-7d",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "utc",
+  "title": "Stella Ops P0 Product Metrics",
+  "uid": "stella-ops-p0-metrics",
+  "version": 1,
+  "weekStart": ""
+}