CI/CD consolidation
devops/observability/alerts-slo.yaml (new file, 36 lines)
@@ -0,0 +1,36 @@
groups:
  - name: slo-burn
    rules:
      - alert: SLOBurnRateFast
        expr: |
          (rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
          4 * (1 - 0.99)
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: "Fast burn: 99% SLO breached"
          description: "Error budget burn (5m) exceeds fast threshold."
      - alert: SLOBurnRateSlow
        expr: |
          (rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
          1 * (1 - 0.99)
        for: 1h
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "Slow burn: 99% SLO at risk"
          description: "Error budget burn (1h) exceeds slow threshold."
  - name: slo-webhook
    rules:
      - alert: SLOWebhookFailures
        expr: rate(slo_webhook_failures_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "SLO webhook failures"
          description: "Webhook emitter has failures in last 5m."
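The incident-mode automation documented later in this commit reads a recording rule via `scalar(slo:burn_rate:fast)`, which this file does not define. A minimal sketch of such a recording rule, assuming the same request/error counters and the 99% objective used above, could look like:

```yaml
# Illustrative recording rule, not part of this commit: exposes the fast-burn
# multiple (5m error ratio divided by the 1% error budget).
groups:
  - name: slo-recording
    rules:
      - record: slo:burn_rate:fast
        expr: |
          (rate(service_request_errors_total[5m]) / rate(service_requests_total[5m]))
          / (1 - 0.99)
```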
devops/observability/grafana/policy-pipeline.json (new file, 78 lines)
@@ -0,0 +1,78 @@
{
  "schemaVersion": 39,
  "title": "Policy Pipeline",
  "panels": [
    {
      "type": "stat",
      "title": "Compile p99 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 2}},
      "targets": [
        {"expr": "histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le))"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Compile Duration (p95/p50)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 2}},
      "targets": [
        {"expr": "histogram_quantile(0.95, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95"},
        {"expr": "histogram_quantile(0.50, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50"}
      ]
    },
    {
      "type": "stat",
      "title": "Simulation Queue Depth",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "none"}},
      "targets": [{"expr": "sum(policy_simulation_queue_depth)"}]
    },
    {
      "type": "timeseries",
      "title": "Queue Depth by Stage",
      "datasource": "Prometheus",
      "targets": [{"expr": "policy_simulation_queue_depth", "legendFormat": "{{stage}}"}],
      "fieldConfig": {"defaults": {"unit": "none"}}
    },
    {
      "type": "stat",
      "title": "Approval p95 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 1}},
      "targets": [
        {"expr": "histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le))"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Approval Latency",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 1}},
      "targets": [
        {"expr": "histogram_quantile(0.90, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le))", "legendFormat": "p90"},
        {"expr": "histogram_quantile(0.50, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le))", "legendFormat": "p50"}
      ]
    },
    {
      "type": "gauge",
      "title": "Promotion Success Rate (30m)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}},
      "options": {"reduceOptions": {"calcs": ["last"]}, "orientation": "horizontal"},
      "targets": [
        {"expr": "100 * clamp_min(rate(policy_promotion_outcomes_total{outcome=\"success\"}[30m]),0) / clamp_min(rate(policy_promotion_outcomes_total[30m]),1)"}
      ]
    },
    {
      "type": "barchart",
      "title": "Promotion Outcomes",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "1/s"}},
      "options": {"displayMode": "series"},
      "targets": [
        {"expr": "rate(policy_promotion_outcomes_total[5m])", "legendFormat": "{{outcome}}"}
      ]
    }
  ]
}
devops/observability/grafana/signals-pipeline.json (new file, 74 lines)
@@ -0,0 +1,74 @@
{
  "schemaVersion": 39,
  "title": "Signals Pipeline",
  "panels": [
    {
      "type": "stat",
      "title": "Scoring p95 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 2}},
      "targets": [
        {"expr": "histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le))"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Scoring Duration p95/p50",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 2}},
      "targets": [
        {"expr": "histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95"},
        {"expr": "histogram_quantile(0.50, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50"}
      ]
    },
    {
      "type": "gauge",
      "title": "Cache Hit Ratio (5m)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}},
      "options": {"reduceOptions": {"calcs": ["last"]}, "orientation": "horizontal"},
      "targets": [
        {"expr": "100 * clamp_min(rate(signals_cache_hits_total[5m]),0) / clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1)"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Cache Hits/Misses",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "1/s"}},
      "targets": [
        {"expr": "rate(signals_cache_hits_total[5m])", "legendFormat": "hits"},
        {"expr": "rate(signals_cache_misses_total[5m])", "legendFormat": "misses"}
      ]
    },
    {
      "type": "stat",
      "title": "Sensors Reporting",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "none"}},
      "targets": [
        {"expr": "count(max_over_time(signals_sensor_last_seen_timestamp_seconds[15m]))"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Sensor Staleness",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s"}},
      "targets": [
        {"expr": "time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor)", "legendFormat": "{{sensor}}"}
      ]
    },
    {
      "type": "barchart",
      "title": "Ingestion Outcomes",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "1/s"}},
      "options": {"displayMode": "series"},
      "targets": [
        {"expr": "rate(signals_ingestion_total[5m])", "legendFormat": "total"},
        {"expr": "rate(signals_ingestion_failures_total[5m])", "legendFormat": "failures"}
      ]
    }
  ]
}
devops/observability/grafana/slo-burn.json (new file, 26 lines)
@@ -0,0 +1,26 @@
{
  "title": "SLO Burn",
  "time": { "from": "now-24h", "to": "now" },
  "panels": [
    {
      "type": "timeseries",
      "title": "Error rate",
      "targets": [
        { "expr": "rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])", "legendFormat": "5m" },
        { "expr": "rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])", "legendFormat": "1h" }
      ],
      "fieldConfig": {
        "defaults": { "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 0.01 } ] } }
      }
    },
    {
      "type": "stat",
      "title": "Budget used (24h)",
      "targets": [
        { "expr": "(increase(service_request_errors_total[24h]) / increase(service_requests_total[24h]))" }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}
devops/observability/grafana/triage-ttfs.json (new file, 97 lines)
@@ -0,0 +1,97 @@
{
  "schemaVersion": 39,
  "title": "Triage TTFS",
  "panels": [
    {
      "type": "stat",
      "title": "TTFS First Evidence p95 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 3}},
      "targets": [
        {"expr": "histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le))"}
      ]
    },
    {
      "type": "timeseries",
      "title": "TTFS First Evidence p50/p95 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 3}},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le))", "legendFormat": "p50"},
        {"expr": "histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le))", "legendFormat": "p95"}
      ]
    },
    {
      "type": "timeseries",
      "title": "TTFS Skeleton p50/p95 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 3}},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le))", "legendFormat": "p50"},
        {"expr": "histogram_quantile(0.95, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le))", "legendFormat": "p95"}
      ]
    },
    {
      "type": "timeseries",
      "title": "TTFS Full Evidence p50/p95 (s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "s", "decimals": 3}},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le))", "legendFormat": "p50"},
        {"expr": "histogram_quantile(0.95, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le))", "legendFormat": "p95"}
      ]
    },
    {
      "type": "stat",
      "title": "Clicks-to-Closure Median",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "none", "decimals": 1}},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le))"}
      ]
    },
    {
      "type": "timeseries",
      "title": "Clicks-to-Closure p50/p95",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "none", "decimals": 1}},
      "targets": [
        {"expr": "histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le))", "legendFormat": "p50"},
        {"expr": "histogram_quantile(0.95, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le))", "legendFormat": "p95"}
      ]
    },
    {
      "type": "stat",
      "title": "Evidence Completeness Avg (%)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "percent", "decimals": 1}},
      "targets": [
        {
          "expr": "100 * (sum(rate(stellaops_evidence_completeness_score_sum[5m])) / clamp_min(sum(rate(stellaops_evidence_completeness_score_count[5m])), 1)) / 4"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Evidence Completeness Avg (%)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "percent", "decimals": 1}},
      "targets": [
        {
          "expr": "100 * (sum(rate(stellaops_evidence_completeness_score_sum[5m])) / clamp_min(sum(rate(stellaops_evidence_completeness_score_count[5m])), 1)) / 4",
          "legendFormat": "avg"
        }
      ]
    },
    {
      "type": "barchart",
      "title": "Budget Violations Rate (1/s)",
      "datasource": "Prometheus",
      "fieldConfig": {"defaults": {"unit": "1/s"}},
      "options": {"displayMode": "series"},
      "targets": [
        {"expr": "sum(rate(stellaops_performance_budget_violations_total[5m])) by (phase)", "legendFormat": "{{phase}}"}
      ]
    }
  ]
}
devops/observability/incident-mode.md (new file, 49 lines)
@@ -0,0 +1,49 @@
# Incident Mode Automation (DEVOPS-OBS-55-001)

## What it does
- Auto-enables an *incident* feature flag when SLO burn rate crosses a threshold.
- Writes deterministic retention overrides (hours) for downstream storage/ingest.
- Auto-clears after a cooldown once burn is back under the reset threshold.
- Offline-friendly: no external calls; pure file outputs under `out/incident-mode/`.

## Inputs
- Burn rate multiple (fast-burn): required.
- Thresholds/cooldown/retention configurable via CLI flags or env vars.
- Optional note for audit context.

## Outputs
- `flag.json` — enabled/disabled + burn rate and note.
- `retention.json` — retention override hours + applied time.
- `last_burn.txt`, `cooldown.txt` — trace for automation/testing.
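Illustrative shapes for these files (a sketch only; field names such as `enabled`, `burn_rate`, `retention_hours`, and `applied_at` are assumptions inferred from the descriptions above, not the script's actual schema):

`flag.json` (hypothetical):

```json
{ "enabled": true, "burn_rate": 3.2, "note": "api error burst" }
```

`retention.json` (hypothetical):

```json
{ "retention_hours": 48, "applied_at": "2025-01-01T00:00:00Z" }
```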

## Usage
```bash
# Activate if burn >= 2.5; otherwise decay the cooldown and clear after 15 min below 0.4
scripts/observability/incident-mode.sh \
  --burn-rate 3.2 \
  --threshold 2.5 \
  --reset-threshold 0.4 \
  --cooldown-mins 15 \
  --retention-hours 48 \
  --note "api error burst"

# Later (burn back to normal):
scripts/observability/incident-mode.sh --burn-rate 0.2 --reset-threshold 0.4 --cooldown-mins 15
```
Outputs land in `out/incident-mode/` by default (override with `--state-dir`).

## Integration hooks
- Prometheus rule should page on SLOBurnRateFast (already in `alerts-slo.yaml`).
- A small runner (cron/workflow) can feed burn rate into this script from PromQL (`scalar(slo:burn_rate:fast)`), then distribute `flag.json` via configmap/secret; a sketch follows below.
- Downstream services can read `retention.json` to temporarily raise retention windows during incident mode.
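A minimal runner sketch for the cron/workflow hook above (illustrative only; the Prometheus URL, the `slo:burn_rate:fast` recording rule, and the ConfigMap distribution step are assumptions about the surrounding setup):

```bash
#!/usr/bin/env bash
# Illustrative incident-mode runner; adjust PROM_URL and the distribution step to your environment.
set -euo pipefail

PROM_URL="${PROM_URL:-http://prometheus:9090}"

# Read the current fast-burn multiple (assumes a slo:burn_rate:fast recording rule exists).
burn=$(curl -fsS --get "${PROM_URL}/api/v1/query" \
  --data-urlencode 'query=scalar(slo:burn_rate:fast)' \
  | jq -r '.data.result[1]')

scripts/observability/incident-mode.sh \
  --burn-rate "${burn}" \
  --threshold 2.5 \
  --reset-threshold 0.4 \
  --cooldown-mins 15 \
  --retention-hours 48 \
  --note "auto: burn-rate runner"

# Distribute the flag to consumers, e.g. as a ConfigMap.
kubectl create configmap incident-mode-flag \
  --from-file=out/incident-mode/flag.json \
  --dry-run=client -o yaml | kubectl apply -f -
```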

## Determinism
- Timestamps are UTC ISO-8601; no network dependencies.
- State is contained under the chosen `state-dir` for reproducible runs.

## Clearing / reset
- Cooldown counter increments only when burn stays below reset threshold.
- Once cooldown minutes are met, `flag.json` flips `enabled=false` and the script leaves prior retention files untouched (downstream can prune separately).
devops/observability/policy-alerts.yaml (new file, 52 lines)
@@ -0,0 +1,52 @@
groups:
  - name: policy-pipeline
    rules:
      - alert: PolicyCompileLatencyP99High
        expr: histogram_quantile(0.99, sum(rate(policy_compile_duration_seconds_bucket[5m])) by (le)) > 5
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy compile latency elevated (p99)"
          description: "p99 compile duration has been >5s for 10m"

      - alert: PolicySimulationQueueBacklog
        expr: sum(policy_simulation_queue_depth) > 100
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy simulation backlog"
          description: "Simulation queue depth above 100 for 10m"

      - alert: PolicyApprovalLatencyHigh
        expr: histogram_quantile(0.95, sum(rate(policy_approval_latency_seconds_bucket[5m])) by (le)) > 30
        for: 15m
        labels:
          severity: critical
          service: policy
        annotations:
          summary: "Policy approval latency high"
          description: "p95 approval latency above 30s for 15m"

      - alert: PolicyPromotionFailureRate
        expr: clamp_min(rate(policy_promotion_outcomes_total{outcome="failure"}[15m]), 0) / clamp_min(rate(policy_promotion_outcomes_total[15m]), 1) > 0.2
        for: 10m
        labels:
          severity: critical
          service: policy
        annotations:
          summary: "Policy promotion failure rate elevated"
          description: "Failures exceed 20% of promotions over 15m"

      - alert: PolicyPromotionStall
        expr: rate(policy_promotion_outcomes_total{outcome="success"}[10m]) == 0 and sum(policy_simulation_queue_depth) > 0
        for: 10m
        labels:
          severity: warning
          service: policy
        annotations:
          summary: "Policy promotion stalled"
          description: "No successful promotions while work is queued"
devops/observability/policy-playbook.md (new file, 39 lines)
@@ -0,0 +1,39 @@
# Policy Pipeline Playbook

Scope: policy compile → simulation → approval → promotion path.

## Dashboards
- Grafana: import `ops/devops/observability/grafana/policy-pipeline.json` (datasource `Prometheus`).
- Key tiles: Compile p99, Simulation Queue Depth, Approval p95, Promotion Success Rate, Promotion Outcomes.
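To provision the dashboard without the UI, it can be pushed through Grafana's dashboard API; a sketch (the Grafana URL and `GRAFANA_TOKEN` are placeholders, not values from this commit):

```bash
# Wrap the dashboard JSON as the import endpoint expects and POST it.
jq -n --slurpfile d ops/devops/observability/grafana/policy-pipeline.json \
  '{dashboard: $d[0], overwrite: true}' \
  | curl -fsS -X POST "https://grafana.example.internal/api/dashboards/db" \
      -H "Authorization: Bearer ${GRAFANA_TOKEN}" \
      -H "Content-Type: application/json" \
      --data-binary @-
```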

## Alerts (Prometheus)
- Rules: `ops/devops/observability/policy-alerts.yaml`
  - `PolicyCompileLatencyP99High` (p99 > 5s for 10m)
  - `PolicySimulationQueueBacklog` (queue depth > 100 for 10m)
  - `PolicyApprovalLatencyHigh` (p95 > 30s for 15m)
  - `PolicyPromotionFailureRate` (failures >20% over 15m)
  - `PolicyPromotionStall` (no successes while queue non-empty for 10m)
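Before rollout, the rules file can be sanity-checked offline with promtool (shipped with Prometheus):

```bash
promtool check rules ops/devops/observability/policy-alerts.yaml
```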

## Runbook
1. **Compile latency alert**
   - Check build nodes for CPU caps; verify cache hits for the policy engine.
   - Rolling-restart a single runner; if the issue persists, scale policy compile workers (+1) or purge stale cache.
2. **Simulation backlog**
   - Inspect the queue per stage (panel "Queue Depth by Stage").
   - If the backlog is limited to one stage, increase concurrency for that stage or drain stuck items; otherwise, add workers.
3. **Approval latency high**
   - Look for blocked approvals (UI/API outages). Re-run the approval service health check; fail over to the standby.
4. **Promotion failure rate/stall**
   - Pull recent logs for the promotion job; compare failure reasons (policy validation vs. target registry).
   - If the errors are registry-side, pause promotions and file an incident with the registry owner; if they are policy-validation failures, revert the latest policy change or apply an override to unblock critical tenants.
5. **Verification**
   - After mitigation, ensure the promotion success rate gauge recovers above 95% and queues drain to baseline (<10); the queries below cover both checks.
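For the verification step, the same expressions the dashboard uses can be queried ad hoc (the Prometheus URL is a placeholder):

```bash
# Promotion success rate over the last 30m; should recover above 95%.
curl -fsS --get "${PROM_URL:-http://prometheus:9090}/api/v1/query" \
  --data-urlencode 'query=100 * clamp_min(rate(policy_promotion_outcomes_total{outcome="success"}[30m]),0) / clamp_min(rate(policy_promotion_outcomes_total[30m]),1)'

# Simulation queue depth by stage; should drain back toward baseline (<10).
curl -fsS --get "${PROM_URL:-http://prometheus:9090}/api/v1/query" \
  --data-urlencode 'query=sum(policy_simulation_queue_depth) by (stage)'
```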

## Escalation
- Primary: Policy On-Call (week N roster).
- Secondary: DevOps Guild (release).
- Page if two critical alerts fire concurrently or any critical alert lasts >30m.

## Notes
- Metrics assumed available: `policy_compile_duration_seconds_bucket`, `policy_simulation_queue_depth`, `policy_approval_latency_seconds_bucket`, `policy_promotion_outcomes_total{outcome=*}`.
- Keep alert thresholds stable unless load profile changes; adjust in Git with approval from Policy + DevOps leads.
devops/observability/signals-alerts.yaml (new file, 54 lines)
@@ -0,0 +1,54 @@
groups:
  - name: signals-pipeline
    rules:
      - alert: SignalsScoringLatencyP95High
        expr: histogram_quantile(0.95, sum(rate(signals_reachability_scoring_duration_seconds_bucket[5m])) by (le)) > 2
        for: 10m
        labels:
          severity: warning
          service: signals
        annotations:
          summary: "Signals scoring latency high (p95)"
          description: "Reachability scoring p95 exceeds 2s for 10m"

      - alert: SignalsCacheMissRateHigh
        expr: |
          clamp_min(rate(signals_cache_misses_total[5m]), 0)
            / clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1) > 0.3
        for: 10m
        labels:
          severity: warning
          service: signals
        annotations:
          summary: "Signals cache miss rate high"
          description: "Cache miss ratio >30% over 10m; investigate Redis or key churn."

      - alert: SignalsCacheDown
        expr: signals_cache_available == 0
        for: 2m
        labels:
          severity: critical
          service: signals
        annotations:
          summary: "Signals cache unavailable"
          description: "Redis cache reported unavailable for >2m"

      - alert: SignalsSensorStaleness
        expr: time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor) > 900
        for: 5m
        labels:
          severity: warning
          service: signals
        annotations:
          summary: "Signals sensor stale"
          description: "No updates from sensor for >15 minutes"

      - alert: SignalsIngestionErrorRate
        expr: clamp_min(rate(signals_ingestion_failures_total[5m]), 0) / clamp_min(rate(signals_ingestion_total[5m]), 1) > 0.05
        for: 5m
        labels:
          severity: critical
          service: signals
        annotations:
          summary: "Signals ingestion failures elevated"
          description: "Ingestion failure ratio above 5% over 5m"
devops/observability/signals-playbook.md (new file, 40 lines)
@@ -0,0 +1,40 @@
# Signals Pipeline Playbook

Scope: Signals ingestion, cache, scoring, and sensor freshness.

## Dashboards
- Grafana: import `ops/devops/observability/grafana/signals-pipeline.json` (datasource `Prometheus`).
- Key tiles: Scoring p95, Cache hit ratio, Sensor staleness, Ingestion outcomes.

## Alerts
- Rules: `ops/devops/observability/signals-alerts.yaml`
  - `SignalsScoringLatencyP95High` (p95 > 2s for 10m)
  - `SignalsCacheMissRateHigh` (miss ratio >30% for 10m)
  - `SignalsCacheDown` (cache unavailable >2m)
  - `SignalsSensorStaleness` (no update >15m)
  - `SignalsIngestionErrorRate` (failures >5%)
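These rules can also be unit-tested offline with `promtool test rules`; a minimal sketch for the cache-down alert (the test file name and layout are illustrative, not part of this commit):

```yaml
# signals-alerts.test.yaml (hypothetical); run with: promtool test rules signals-alerts.test.yaml
rule_files:
  - signals-alerts.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'signals_cache_available'
        values: '0x10'   # cache reports unavailable for the whole window
    alert_rule_test:
      - eval_time: 5m
        alertname: SignalsCacheDown
        exp_alerts:
          - exp_labels:
              severity: critical
              service: signals
```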

## Runbook
1. **Scoring latency high**
   - Check Mongo/Redis health; inspect CPU on workers.
   - Scale Signals API pods or increase cache TTL to reduce load.
2. **Cache miss rate / cache down**
   - Validate Redis connectivity/ACLs; a flush is not recommended unless there is a key explosion.
   - Increase the cache TTL; ensure the connection string matches the deployment.
3. **Sensor staleness**
   - Identify stale sensors from the alert's `sensor` label; verify upstream pipeline/log shipping.
   - If a sensor has been retired, update the allowlist to silence expected gaps.
4. **Ingestion errors**
   - Tail ingestion logs; classify errors (schema vs. storage).
   - If artifacts are rejected, check the storage path and disk fullness; add capacity or rotate.
5. **Verification**
   - After mitigation, ensure the cache hit ratio is >90%, scoring p95 is <2s, and the staleness panel is near baseline (<5m); see the queries below.
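For the verification step, the hit-ratio and staleness expressions from the dashboard can be queried directly (the Prometheus URL is a placeholder):

```bash
# Cache hit ratio over the last 5m; target is above 90%.
curl -fsS --get "${PROM_URL:-http://prometheus:9090}/api/v1/query" \
  --data-urlencode 'query=100 * clamp_min(rate(signals_cache_hits_total[5m]),0) / clamp_min(rate(signals_cache_hits_total[5m]) + rate(signals_cache_misses_total[5m]), 1)'

# Per-sensor staleness in seconds; should sit well under the 900s alert threshold.
curl -fsS --get "${PROM_URL:-http://prometheus:9090}/api/v1/query" \
  --data-urlencode 'query=time() - max(signals_sensor_last_seen_timestamp_seconds) by (sensor)'
```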

## Escalation
- Primary: Signals on-call.
- Secondary: DevOps Guild (observability).
- Page when critical alerts persist >20m or when the cache-down and scoring-latency alerts co-occur.

## Notes
- Metrics expected: `signals_reachability_scoring_duration_seconds_bucket`, `signals_cache_hits_total`, `signals_cache_misses_total`, `signals_cache_available`, `signals_sensor_last_seen_timestamp_seconds`, `signals_ingestion_total`, `signals_ingestion_failures_total`.
- Keep thresholds version-controlled; align with Policy Engine consumers if scoring SLAs change.
devops/observability/triage-alerts.yaml (new file, 62 lines)
@@ -0,0 +1,62 @@
groups:
  - name: triage-ttfs
    rules:
      - alert: TriageTtfsFirstEvidenceP95High
        expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_first_evidence_seconds_bucket[5m])) by (le)) > 1.5
        for: 10m
        labels:
          severity: critical
          service: triage
        annotations:
          summary: "TTFS first evidence p95 high"
          description: "TTFS first-evidence p95 exceeds 1.5s for 10m (triage experience degraded)."

      - alert: TriageTtfsSkeletonP95High
        expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_skeleton_seconds_bucket[5m])) by (le)) > 0.2
        for: 10m
        labels:
          severity: warning
          service: triage
        annotations:
          summary: "TTFS skeleton p95 high"
          description: "TTFS skeleton p95 exceeds 200ms for 10m."

      - alert: TriageTtfsFullEvidenceP95High
        expr: histogram_quantile(0.95, sum(rate(stellaops_ttfs_full_evidence_seconds_bucket[5m])) by (le)) > 1.5
        for: 10m
        labels:
          severity: warning
          service: triage
        annotations:
          summary: "TTFS full evidence p95 high"
          description: "TTFS full-evidence p95 exceeds 1.5s for 10m."

      - alert: TriageClicksToClosureMedianHigh
        expr: histogram_quantile(0.50, sum(rate(stellaops_clicks_to_closure_bucket[5m])) by (le)) > 6
        for: 15m
        labels:
          severity: warning
          service: triage
        annotations:
          summary: "Clicks-to-closure median high"
          description: "Median clicks-to-closure exceeds 6 for 15m."

      - alert: TriageEvidenceCompletenessAvgLow
        expr: (sum(rate(stellaops_evidence_completeness_score_sum[15m])) / clamp_min(sum(rate(stellaops_evidence_completeness_score_count[15m])), 1)) < 3.6
        for: 30m
        labels:
          severity: warning
          service: triage
        annotations:
          summary: "Evidence completeness below target"
          description: "Average evidence completeness score below 3.6 (90%) for 30m."

      - alert: TriageBudgetViolationRateHigh
        expr: sum(rate(stellaops_performance_budget_violations_total[5m])) by (phase) > 0.05
        for: 10m
        labels:
          severity: warning
          service: triage
        annotations:
          summary: "Performance budget violations elevated"
          description: "Performance budget violation rate exceeds 0.05/s for 10m."