up
This commit is contained in:
24
ops/devops/attestation/ALERTS.md
Normal file
24
ops/devops/attestation/ALERTS.md
Normal file
@@ -0,0 +1,24 @@
|
||||
# Attestation Alerts & Dashboards (DEVOPS-ATTEST-75-001)
|
||||
|
||||
## Prometheus alert rules
|
||||
File: `ops/devops/attestation/attestation-alerts.yaml`
|
||||
- `AttestorSignLatencyP95High`: p95 signing latency > 2s for 5m.
|
||||
- `AttestorVerifyLatencyP95High`: p95 verification latency > 2s for 5m.
|
||||
- `AttestorVerifyFailureRate`: verification failures / requests > 2% over 5m.
|
||||
- `AttestorKeyRotationStale`: signing key not rotated in more than 30d (condition must hold for 10m).
|
||||
|
||||
Metrics expected:
|
||||
- `attestor_sign_duration_seconds_bucket`
|
||||
- `attestor_verify_duration_seconds_bucket`
|
||||
- `attestor_verify_failures_total`
|
||||
- `attestor_verify_requests_total`
|
||||
- `attestor_key_last_rotated_seconds` (gauge of Unix epoch seconds of last rotation)
|
||||
|
||||
## Grafana
|
||||
File: `ops/devops/attestation/grafana/attestation-latency.json`
|
||||
- Panels: signing p50/p95, verification p50/p95, failure rate, key-age gauge, last 24h error counts.
|
||||
|
||||
## Runbook
|
||||
- Verify that Prometheus scrapes the `attestor_*` metrics exposed by the Attestor service.
|
||||
- Ensure Alertmanager routes alerts labeled `team=devops` to the on-call rotation.
|
||||
- Key rotation alert: rotate the key via the standard KMS workflow; acknowledge the alert once a new `attestor_key_last_rotated_seconds` value is observed.
|
||||
43
ops/devops/attestation/attestation-alerts.yaml
Normal file
43
ops/devops/attestation/attestation-alerts.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
groups:
|
||||
- name: attestor-latency
|
||||
rules:
|
||||
# Warns when the p95 signing latency, estimated from the
# attestor_sign_duration_seconds histogram buckets over a 5m rate window,
# stays above the 2s threshold for 5 minutes.
- alert: AttestorSignLatencyP95High
  # Folded scalar: the two lines below join into a single-line expression.
  expr: >-
    histogram_quantile(0.95,
    sum(rate(attestor_sign_duration_seconds_bucket[5m])) by (le)) > 2
  for: 5m
  labels:
    severity: warning
    team: devops
  annotations:
    summary: 'Attestor signing latency p95 high'
    description: 'Signing p95 is {{ $value }}s over the last 5m (threshold 2s).'
|
||||
# Warns when the p95 verification latency, estimated from the
# attestor_verify_duration_seconds histogram buckets over a 5m rate window,
# stays above the 2s threshold for 5 minutes.
- alert: AttestorVerifyLatencyP95High
  # Folded scalar: the two lines below join into a single-line expression.
  expr: >-
    histogram_quantile(0.95,
    sum(rate(attestor_verify_duration_seconds_bucket[5m])) by (le)) > 2
  for: 5m
  labels:
    severity: warning
    team: devops
  annotations:
    summary: 'Attestor verification latency p95 high'
    description: 'Verification p95 is {{ $value }}s over the last 5m (threshold 2s).'
|
||||
- name: attestor-errors
|
||||
rules:
|
||||
# Critical alert: more than 2% of verification requests failed over the last
# 5 minutes, sustained for 5 minutes.
#
# Both counters are summed before dividing so that the ratio is computed
# across all series. Dividing the raw per-series rates would require the two
# metrics to carry identical label sets; any mismatch would make the division
# return no data and the alert would silently never fire.
- alert: AttestorVerifyFailureRate
  # Folded scalar: the lines below join into a single-line expression.
  expr: >-
    sum(rate(attestor_verify_failures_total[5m]))
    /
    sum(rate(attestor_verify_requests_total[5m]))
    > 0.02
  for: 5m
  labels:
    severity: critical
    team: devops
  annotations:
    summary: "Attestor verification failure rate above 2%"
    description: "Verification failure rate is {{ $value | humanizePercentage }} over last 5m."
|
||||
- name: attestor-keys
|
||||
rules:
|
||||
# Warns when the signing key's age exceeds 30 days. Age is computed as the
# current time minus attestor_key_last_rotated_seconds; the threshold
# 60*60*24*30 is 30 days expressed in seconds. The condition must hold for
# 10 minutes before the alert fires.
- alert: AttestorKeyRotationStale
  expr: '(time() - attestor_key_last_rotated_seconds) > 60*60*24*30'
  for: 10m
  labels:
    severity: warning
    team: devops
  annotations:
    summary: 'Attestor signing key rotation overdue'
    description: 'Signing key has not rotated in >30d ({{ $value }} seconds).'
|
||||
38
ops/devops/attestation/grafana/attestation-latency.json
Normal file
38
ops/devops/attestation/grafana/attestation-latency.json
Normal file
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"title": "Attestor Latency & Errors",
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"panels": [
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Signing latency p50/p95",
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.5, sum(rate(attestor_sign_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(attestor_sign_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "timeseries",
|
||||
"title": "Verification latency p50/p95",
|
||||
"targets": [
|
||||
{ "expr": "histogram_quantile(0.5, sum(rate(attestor_verify_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p50" },
|
||||
{ "expr": "histogram_quantile(0.95, sum(rate(attestor_verify_duration_seconds_bucket[5m])) by (le))", "legendFormat": "p95" }
|
||||
]
|
||||
},
|
||||
{
  "type": "timeseries",
  "title": "Verification failure rate",
  "description": "Fraction of verification requests that failed, using 5m rates summed across all series.",
  "targets": [
    {
      "expr": "sum(rate(attestor_verify_failures_total[5m])) / sum(rate(attestor_verify_requests_total[5m]))",
      "legendFormat": "failure rate"
    }
  ]
},
|
||||
{
|
||||
"type": "stat",
|
||||
"title": "Key age (days)",
|
||||
"targets": [
|
||||
{ "expr": "(time() - attestor_key_last_rotated_seconds) / 86400" }
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 39,
|
||||
"version": 1
|
||||
}
|
||||
Reference in New Issue
Block a user