Add comprehensive security tests for OWASP A02, A05, A07, and A08 categories

- Implemented tests for Cryptographic Failures (A02) to ensure proper handling of sensitive data, secure algorithms, and key management.
- Added tests for Security Misconfiguration (A05) to validate production configurations, security headers, CORS settings, and feature management.
- Developed tests for Authentication Failures (A07) to enforce strong password policies, rate limiting, session management, and MFA support.
- Created tests for Software and Data Integrity Failures (A08) to verify artifact signatures, SBOM integrity, attestation chains, and feed updates.
2025-12-16 16:40:19 +02:00
parent 415eff1207
commit 2170a58734
206 changed files with 30547 additions and 534 deletions
--- a/docs/modules/telemetry/operations/alerts/ttfs-alerts.yaml
+++ b/docs/modules/telemetry/operations/alerts/ttfs-alerts.yaml
@@ -0,0 +1,159 @@
+# TTFS (Time to First Signal) Alert Rules
+# Reference: SPRINT_0341_0001_0001 Task T10
+# These alerts monitor SLOs for the TTFS experience
+
+groups:
+  - name: ttfs-slo
+    interval: 30s
+    rules:
+      # Primary SLO: P95 of ttfs_latency_seconds (from 5m bucket rates, computed separately per `surface`) must be under 5 seconds
+      - alert: TtfsP95High
+        expr: |
+          histogram_quantile(0.95, sum(rate(ttfs_latency_seconds_bucket[5m])) by (le, surface)) > 5
+        for: 5m  # the condition must hold for 5 minutes before the alert fires
+        labels:
+          severity: page
+          component: ttfs
+          slo: ttfs-latency
+        annotations:
+          summary: "TTFS P95 latency exceeds 5s for {{ $labels.surface }}"
+          description: "Time to First Signal P95 is {{ $value | humanizeDuration }} for surface {{ $labels.surface }}. This breaches the TTFS SLO."
+          runbook: "docs/runbooks/ttfs-latency-high.md"
+          dashboard: "https://grafana.stellaops.local/d/ttfs-overview"
+
+      # Cache performance: hit rate (ttfs_cache_hit_total / ttfs_signal_total, 5m rates summed over all series) should be above 70%
+      - alert: TtfsCacheHitRateLow
+        expr: |
+          sum(rate(ttfs_cache_hit_total[5m])) / sum(rate(ttfs_signal_total[5m])) < 0.7
+        for: 10m  # the ratio must stay below 0.7 for 10 minutes before the alert fires
+        labels:
+          severity: warning
+          component: ttfs
+        annotations:
+          summary: "TTFS cache hit rate below 70%"
+          description: "Cache hit rate is {{ $value | humanizePercentage }}. Low cache hit rates increase TTFS latency."
+          runbook: "docs/runbooks/ttfs-cache-performance.md"
+
+      # Error rate: ttfs_error_total / ttfs_signal_total (5m rates summed over all series) should be under 1%
+      - alert: TtfsErrorRateHigh
+        expr: |
+          sum(rate(ttfs_error_total[5m])) / sum(rate(ttfs_signal_total[5m])) > 0.01
+        for: 5m  # the ratio must stay above 0.01 for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          component: ttfs
+        annotations:
+          summary: "TTFS error rate exceeds 1%"
+          description: "Error rate is {{ $value | humanizePercentage }}. Check logs for FirstSignalService errors."
+          runbook: "docs/runbooks/ttfs-error-investigation.md"
+
+      # SLO breach counter: more than 10 breaches (summed over all series) in a 5m window; increase() can be fractional, so the description rounds $value
+      - alert: TtfsSloBreach
+        expr: |
+          sum(increase(ttfs_slo_breach_total[5m])) > 10
+        for: 1m  # the condition must hold for 1 minute before the alert fires
+        labels:
+          severity: page
+          component: ttfs
+          slo: ttfs-breach-rate
+        annotations:
+          summary: "TTFS SLO breach rate high"
+          description: '{{ printf "%.0f" $value }} SLO breaches in last 5 minutes. Immediate investigation required.'
+          runbook: "docs/runbooks/ttfs-slo-breach.md"
+
+      # Endpoint latency: P95 of http_request_duration_seconds over all routes matching the first-signal regex (aggregated by `le` only) should be within 500ms
+      - alert: FirstSignalEndpointLatencyHigh
+        expr: |
+          histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{route=~"/api/v1/orchestrator/runs/.*/first-signal"}[5m])) by (le)) > 0.5
+        for: 5m  # the condition must hold for 5 minutes before the alert fires
+        labels:
+          severity: warning
+          component: ttfs
+        annotations:
+          summary: "First signal endpoint P95 latency > 500ms"
+          description: "The /first-signal API endpoint P95 is {{ $value | humanizeDuration }}. This is the API-level latency only."
+          runbook: "docs/runbooks/first-signal-api-slow.md"
+
+  - name: ttfs-availability
+    interval: 1m
+    rules:
+      # Availability: fires when a target of job "orchestrator" reports up == 0; this checks the scrape target, not the first-signal endpoint itself
+      - alert: FirstSignalEndpointDown
+        expr: |
+          up{job="orchestrator"} == 0
+        for: 2m  # NOTE(review): an `up` series that disappears entirely would not match `== 0` - confirm whether absent() coverage is wanted
+        labels:
+          severity: critical
+          component: ttfs
+        annotations:
+          summary: "Orchestrator (First Signal provider) is down"
+          description: "The Orchestrator service is not responding. First Signal functionality is unavailable."
+          runbook: "docs/runbooks/orchestrator-down.md"
+
+      # No signals being generated: fires when the 10m signal rate is zero OR the ttfs_signal_total metric is missing entirely (metric collection issue)
+      - alert: TtfsNoSignals
+        expr: |
+          sum(rate(ttfs_signal_total[10m])) == 0 or absent(ttfs_signal_total)
+        for: 15m  # the condition must hold for 15 minutes before the alert fires
+        labels:
+          severity: warning
+          component: ttfs
+        annotations:
+          summary: "No TTFS signals generated in 15 minutes"
+          description: "No First Signal events have been recorded. This could indicate no active runs or a metric collection issue."
+
+  - name: ttfs-ux
+    interval: 1m
+    rules:
+      # UX: bounce rate (ttfs_bounce_total / ttfs_page_view_total, 5m rates summed over all series) above 50% indicates poor experience
+      - alert: TtfsBounceRateHigh
+        expr: |
+          sum(rate(ttfs_bounce_total[5m])) / sum(rate(ttfs_page_view_total[5m])) > 0.5
+        for: 30m  # the ratio must stay above 0.5 for 30 minutes before the alert fires
+        labels:
+          severity: warning
+          component: ttfs
+          area: ux
+        annotations:
+          summary: "TTFS page bounce rate exceeds 50%"
+          # NOTE(review): the 10-second bounce window below is not set by this rule; presumably it is applied where ttfs_bounce_total is incremented - verify
+          description: "More than 50% of users are leaving the run page within 10 seconds. This may indicate poor First Signal experience."
+
+      # UX: Long open-to-action time - the 75th percentile (from 15m bucket rates) is above 30 seconds, i.e. more than 25% of users take longer than 30s
+      - alert: TtfsOpenToActionSlow
+        expr: |
+          histogram_quantile(0.75, sum(rate(ttfs_open_to_action_seconds_bucket[15m])) by (le)) > 30
+        for: 1h  # the condition must hold for 1 hour before the alert fires
+        labels:
+          severity: info
+          component: ttfs
+          area: ux
+        annotations:
+          summary: "P75 open-to-action time exceeds 30s (more than 25% of users take >30s to first action)"
+          description: "Open-to-action P75 is {{ $value | humanizeDuration }}. Users are taking a long time to act on First Signal. Consider UX improvements."
+
+  - name: ttfs-failure-signatures
+    interval: 30s
+    rules:
+      # New failure pattern emerging: more than 1 new failure signature per second (5m rate summed over all series)
+      - alert: TtfsNewFailurePatternHigh
+        expr: |
+          sum(rate(ttfs_failure_signature_new_total[5m])) > 1
+        for: 10m  # the condition must hold for 10 minutes before the alert fires
+        labels:
+          severity: warning
+          component: ttfs
+        annotations:
+          summary: "High rate of new failure signatures"
+          description: "New failure patterns are being detected at {{ $value | humanize }}/s. This may indicate a new class of errors."
+
+      # Failure signature confidence upgrades: more than 5 in the last hour; increase() can be fractional, so the description rounds $value
+      - alert: TtfsFailureSignatureConfidenceUpgrade
+        expr: |
+          sum(increase(ttfs_failure_signature_confidence_upgrade_total[1h])) > 5
+        for: 5m  # the condition must hold for 5 minutes before the alert fires
+        labels:
+          severity: info
+          component: ttfs
+        annotations:
+          summary: "Multiple failure signatures upgraded to high confidence"
+          description: '{{ printf "%.0f" $value }} failure signatures have been upgraded to high confidence in the last hour.'
--- a/docs/modules/telemetry/operations/dashboards/ttfs-observability.json
+++ b/docs/modules/telemetry/operations/dashboards/ttfs-observability.json
@@ -0,0 +1,552 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": {
+          "type": "datasource",
+          "uid": "grafana"
+        },
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "description": "Time to First Signal (TTFS) observability dashboard for StellaOps",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "liveNow": false,
+  "panels": [
+    {
+      "title": "TTFS P50/P95/P99 by Surface",
+      "type": "timeseries",
+      "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
+      "id": 1,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(ttfs_latency_seconds_bucket{surface=~\"$surface\"}[5m])) by (le, surface))",
+          "legendFormat": "P50 - {{surface}}",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(ttfs_latency_seconds_bucket{surface=~\"$surface\"}[5m])) by (le, surface))",
+          "legendFormat": "P95 - {{surface}}",
+          "refId": "B"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(ttfs_latency_seconds_bucket{surface=~\"$surface\"}[5m])) by (le, surface))",
+          "legendFormat": "P99 - {{surface}}",
+          "refId": "C"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 2, "color": "yellow" },
+              { "value": 5, "color": "red" }
+            ]
+          },
+          "custom": {
+            "lineWidth": 1,
+            "fillOpacity": 10,
+            "showPoints": "auto"
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "table",
+          "placement": "bottom",
+          "calcs": ["mean", "max", "lastNotNull"]
+        },
+        "tooltip": {
+          "mode": "multi",
+          "sort": "desc"
+        }
+      }
+    },
+    {
+      "title": "Cache Hit Rate",
+      "type": "stat",
+      "gridPos": { "x": 12, "y": 0, "w": 6, "h": 4 },
+      "id": 2,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(ttfs_cache_hit_total[5m])) / sum(rate(ttfs_signal_total[5m]))",
+          "legendFormat": "Hit Rate",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "red" },
+              { "value": 0.7, "color": "yellow" },
+              { "value": 0.9, "color": "green" }
+            ]
+          },
+          "mappings": []
+        },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": {
+          "values": false,
+          "calcs": ["lastNotNull"],
+          "fields": ""
+        },
+        "orientation": "auto",
+        "textMode": "auto",
+        "colorMode": "value",
+        "graphMode": "area",
+        "justifyMode": "auto"
+      }
+    },
+    {
+      "title": "SLO Breaches (P95 > 5s)",
+      "type": "stat",
+      "gridPos": { "x": 18, "y": 0, "w": 6, "h": 4 },
+      "id": 3,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ttfs_slo_breach_total[1h]))",
+          "legendFormat": "Breaches (1h)",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 1, "color": "yellow" },
+              { "value": 10, "color": "red" }
+            ]
+          },
+          "mappings": []
+        },
+        "overrides": []
+      },
+      "options": {
+        "reduceOptions": {
+          "values": false,
+          "calcs": ["lastNotNull"],
+          "fields": ""
+        },
+        "orientation": "auto",
+        "textMode": "auto",
+        "colorMode": "background",
+        "graphMode": "none",
+        "justifyMode": "auto"
+      }
+    },
+    {
+      "title": "Signal Source Distribution",
+      "type": "piechart",
+      "gridPos": { "x": 12, "y": 4, "w": 6, "h": 4 },
+      "id": 4,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (signal_source) (rate(ttfs_signal_total[1h]))",
+          "legendFormat": "{{signal_source}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "mappings": []
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "right"
+        },
+        "pieType": "pie",
+        "tooltip": {
+          "mode": "single"
+        }
+      }
+    },
+    {
+      "title": "Failure Signature Matches",
+      "type": "stat",
+      "gridPos": { "x": 18, "y": 4, "w": 6, "h": 4 },
+      "id": 5,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(ttfs_failure_signature_match_total[5m]))",
+          "legendFormat": "Matches/s",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "blue" }
+            ]
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "title": "Signals by Kind",
+      "type": "timeseries",
+      "gridPos": { "x": 0, "y": 8, "w": 12, "h": 6 },
+      "id": 6,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum by (kind) (rate(ttfs_signal_total[5m]))",
+          "legendFormat": "{{kind}}",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": {
+            "lineWidth": 1,
+            "fillOpacity": 20,
+            "stacking": {
+              "mode": "normal",
+              "group": "A"
+            }
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        }
+      }
+    },
+    {
+      "title": "Error Rate",
+      "type": "timeseries",
+      "gridPos": { "x": 12, "y": 8, "w": 12, "h": 6 },
+      "id": 7,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(ttfs_error_total[5m])) / sum(rate(ttfs_signal_total[5m]))",
+          "legendFormat": "Error Rate",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "max": 0.1,
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 0.01, "color": "yellow" },
+              { "value": 0.05, "color": "red" }
+            ]
+          },
+          "custom": {
+            "lineWidth": 2,
+            "fillOpacity": 10
+          }
+        },
+        "overrides": []
+      },
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        }
+      }
+    },
+    {
+      "title": "TTFS Latency Heatmap",
+      "type": "heatmap",
+      "gridPos": { "x": 0, "y": 14, "w": 12, "h": 8 },
+      "id": 8,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ttfs_latency_seconds_bucket{surface=~\"$surface\"}[1m])) by (le)",
+          "legendFormat": "{{le}}",
+          "format": "heatmap",
+          "refId": "A"
+        }
+      ],
+      "options": {
+        "calculate": false,
+        "yAxis": {
+          "axisPlacement": "left",
+          "unit": "s"
+        },
+        "color": {
+          "scheme": "Spectral",
+          "mode": "scheme"
+        },
+        "cellGap": 1
+      }
+    },
+    {
+      "title": "First Signal Endpoint Latency",
+      "type": "timeseries",
+      "gridPos": { "x": 12, "y": 14, "w": 12, "h": 8 },
+      "id": 9,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{route=~\"/api/v1/orchestrator/runs/.*/first-signal\"}[5m])) by (le))",
+          "legendFormat": "P50",
+          "refId": "A"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{route=~\"/api/v1/orchestrator/runs/.*/first-signal\"}[5m])) by (le))",
+          "legendFormat": "P95",
+          "refId": "B"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{route=~\"/api/v1/orchestrator/runs/.*/first-signal\"}[5m])) by (le))",
+          "legendFormat": "P99",
+          "refId": "C"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 0.3, "color": "yellow" },
+              { "value": 0.5, "color": "red" }
+            ]
+          },
+          "custom": {
+            "lineWidth": 1,
+            "fillOpacity": 10
+          }
+        },
+        "overrides": []
+      }
+    },
+    {
+      "title": "Open→Action Time Distribution",
+      "type": "histogram",
+      "gridPos": { "x": 0, "y": 22, "w": 8, "h": 6 },
+      "id": 10,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(increase(ttfs_open_to_action_seconds_bucket[5m])) by (le)",
+          "legendFormat": "{{le}}",
+          "format": "heatmap",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "s"
+        }
+      }
+    },
+    {
+      "title": "Bounce Rate (< 10s)",
+      "type": "stat",
+      "gridPos": { "x": 8, "y": 22, "w": 4, "h": 6 },
+      "id": 11,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "sum(rate(ttfs_bounce_total[5m])) / sum(rate(ttfs_page_view_total[5m]))",
+          "legendFormat": "Bounce Rate",
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              { "value": null, "color": "green" },
+              { "value": 0.3, "color": "yellow" },
+              { "value": 0.5, "color": "red" }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Top Failure Signatures",
+      "type": "table",
+      "gridPos": { "x": 12, "y": 22, "w": 12, "h": 6 },
+      "id": 12,
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${datasource}"
+      },
+      "targets": [
+        {
+          "expr": "topk(10, sum by (error_token, error_code) (ttfs_failure_signature_hit_total))",
+          "legendFormat": "{{error_token}} ({{error_code}})",
+          "format": "table",
+          "instant": true,
+          "refId": "A"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "align": "auto"
+          }
+        },
+        "overrides": [
+          {
+            "matcher": { "id": "byName", "options": "Value" },
+            "properties": [
+              { "id": "displayName", "value": "Hit Count" }
+            ]
+          }
+        ]
+      },
+      "transformations": [
+        {
+          "id": "organize",
+          "options": {
+            "excludeByName": {
+              "Time": true
+            },
+            "renameByName": {
+              "error_token": "Token",
+              "error_code": "Code"
+            }
+          }
+        }
+      ]
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 38,
+  "style": "dark",
+  "tags": ["ttfs", "ux", "slo", "stellaops"],
+  "templating": {
+    "list": [
+      {
+        "current": {
+          "selected": false,
+          "text": "Prometheus",
+          "value": "prometheus"
+        },
+        "hide": 0,
+        "includeAll": false,
+        "label": "Datasource",
+        "multi": false,
+        "name": "datasource",
+        "options": [],
+        "query": "prometheus",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false,
+        "type": "datasource"
+      },
+      {
+        "allValue": ".*",
+        "current": {
+          "selected": true,
+          "text": "All",
+          "value": "$__all"
+        },
+        "datasource": {
+          "type": "prometheus",
+          "uid": "${datasource}"
+        },
+        "definition": "label_values(ttfs_latency_seconds_bucket, surface)",
+        "hide": 0,
+        "includeAll": true,
+        "label": "Surface",
+        "multi": true,
+        "name": "surface",
+        "options": [],
+        "query": {
+          "query": "label_values(ttfs_latency_seconds_bucket, surface)",
+          "refId": "PrometheusVariableQueryEditor-VariableQuery"
+        },
+        "refresh": 2,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 1,
+        "type": "query"
+      }
+    ]
+  },
+  "time": {
+    "from": "now-6h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "utc",
+  "title": "TTFS - Time to First Signal",
+  "uid": "ttfs-overview",
+  "version": 1,
+  "weekStart": ""
+}