up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled
This commit is contained in:
32
ops/devops/evidence-locker/alerts.yaml
Normal file
32
ops/devops/evidence-locker/alerts.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
---
# Alerting rules for the evidence locker (Prometheus rule-group layout).
groups:
  - name: evidence-locker
    rules:
      # Retention gauge has differed from 180 for 10 minutes.
      - alert: EvidenceLockerRetentionDrift
        expr: evidence_retention_days != 180
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Evidence locker retention drift'
          description: 'Configured retention {{ $value }}d differs from target 180d.'

      # WORM flag has read 0 for 5 minutes.
      - alert: EvidenceLockerWormDisabled
        expr: evidence_worm_enabled == 0
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'WORM/immutability disabled'
          description: 'Evidence locker WORM not enabled.'

      # Time since the last recorded backup has exceeded 3600 s for 10 minutes.
      - alert: EvidenceLockerBackupLag
        expr: (time() - evidence_last_backup_seconds) > 3600
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Evidence locker backup lag > 1h'
          description: 'Last backup older than 1 hour.'
|
||||
23
ops/devops/evidence-locker/grafana/evidence-locker.json
Normal file
23
ops/devops/evidence-locker/grafana/evidence-locker.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
  "title": "Evidence Locker",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "type": "stat",
      "title": "WORM enabled",
      "targets": [
        {
          "expr": "evidence_worm_enabled"
        }
      ]
    },
    {
      "type": "stat",
      "title": "Retention days",
      "targets": [
        {
          "expr": "evidence_retention_days"
        }
      ]
    },
    {
      "type": "stat",
      "title": "Backup lag (seconds)",
      "targets": [
        {
          "expr": "time() - evidence_last_backup_seconds"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}
|
||||
42
ops/devops/exporter/alerts.yaml
Normal file
42
ops/devops/exporter/alerts.yaml
Normal file
@@ -0,0 +1,42 @@
|
||||
---
# Alerting rules for the exporter (Prometheus rule-group layout).
groups:
  - name: exporter
    rules:
      # Per-second processing rate (5m window) below 1 for 10 minutes.
      - alert: ExporterThroughputLow
        expr: rate(exporter_jobs_processed_total[5m]) < 1
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Exporter throughput low'
          description: 'Processed <1 job/s over last 5m (current {{ $value }}).'

      # Failed-to-processed rate ratio above 0.02 for 5 minutes.
      - alert: ExporterFailuresHigh
        expr: rate(exporter_jobs_failed_total[5m]) / rate(exporter_jobs_processed_total[5m]) > 0.02
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'Exporter failure rate >2%'
          description: 'Failure rate {{ $value | humanizePercentage }} over last 5m.'

      # 95th percentile from the duration histogram buckets (summed by le)
      # above 3 for 5 minutes.
      - alert: ExporterLatencyP95High
        expr: histogram_quantile(0.95, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le)) > 3
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Exporter job p95 latency high'
          description: 'Job p95 latency {{ $value }}s over last 5m (threshold 3s).'

      # Queue depth gauge above 500 for 10 minutes.
      - alert: ExporterQueueDepthHigh
        expr: exporter_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Exporter queue depth high'
          description: 'Queue depth {{ $value }} exceeds 500 for >10m.'
|
||||
29
ops/devops/exporter/grafana/exporter-overview.json
Normal file
29
ops/devops/exporter/grafana/exporter-overview.json
Normal file
@@ -0,0 +1,29 @@
|
||||
{
  "title": "Exporter Overview",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "type": "stat",
      "title": "Queue depth",
      "targets": [
        {
          "expr": "exporter_queue_depth"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Jobs processed / failed",
      "targets": [
        {
          "expr": "rate(exporter_jobs_processed_total[5m])",
          "legendFormat": "processed"
        },
        {
          "expr": "rate(exporter_jobs_failed_total[5m])",
          "legendFormat": "failed"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Job duration p50/p95",
      "targets": [
        {
          "expr": "histogram_quantile(0.5, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "expr": "histogram_quantile(0.95, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}
|
||||
36
ops/devops/observability/alerts-slo.yaml
Normal file
36
ops/devops/observability/alerts-slo.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
---
# SLO burn-rate and webhook alerting rules (Prometheus rule-group layout).
# Both burn thresholds are written against a 0.99 target, i.e. (1 - 0.99).
groups:
  - name: slo-burn
    rules:
      # 5m error ratio above 4 x (1 - 0.99) for 5 minutes.
      - alert: SLOBurnRateFast
        expr: |
          (rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
          4 * (1 - 0.99)
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'Fast burn: 99% SLO breached'
          description: 'Error budget burn (5m) exceeds fast threshold.'
      # 1h error ratio above 1 x (1 - 0.99) for 1 hour.
      - alert: SLOBurnRateSlow
        expr: |
          (rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
          1 * (1 - 0.99)
        for: 1h
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Slow burn: 99% SLO at risk'
          description: 'Error budget burn (1h) exceeds slow threshold.'
  - name: slo-webhook
    rules:
      # Non-zero webhook failure rate (5m window) for 10 minutes.
      - alert: SLOWebhookFailures
        expr: rate(slo_webhook_failures_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'SLO webhook failures'
          description: 'Webhook emitter has failures in last 5m.'
|
||||
26
ops/devops/observability/grafana/slo-burn.json
Normal file
26
ops/devops/observability/grafana/slo-burn.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
  "title": "SLO Burn",
  "time": { "from": "now-24h", "to": "now" },
  "panels": [
    {
      "type": "timeseries",
      "title": "Error rate",
      "targets": [
        { "expr": "rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])", "legendFormat": "5m" },
        { "expr": "rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])", "legendFormat": "1h" }
      ],
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "red", "value": 0.01 }
            ]
          }
        }
      }
    },
    {
      "type": "stat",
      "title": "Budget used (24h)",
      "description": "Share of requests that failed over the trailing 24h: increase of the error counter divided by increase of the request counter.",
      "targets": [
        { "expr": "(increase(service_request_errors_total[24h]) / increase(service_requests_total[24h]))" }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}
|
||||
22
ops/devops/provenance/alerts.yaml
Normal file
22
ops/devops/provenance/alerts.yaml
Normal file
@@ -0,0 +1,22 @@
|
||||
---
# Alerting rules for provenance signing (Prometheus rule-group layout).
groups:
  - name: provenance
    rules:
      # Seconds since the last key rotation exceed 60*60*24*90 (90 days)
      # for 10 minutes.
      - alert: ProvenanceKeyRotationOverdue
        expr: (time() - provenance_last_key_rotation_seconds) > 60*60*24*90
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Provenance signing key rotation overdue'
          description: 'Last rotation {{ $value }} seconds ago (>90d).'

      # Non-zero signing failure rate (5m window) for 5 minutes.
      - alert: ProvenanceSignerFailures
        expr: rate(provenance_sign_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'Provenance signer failures detected'
          description: 'Signer failure rate non-zero in last 5m.'
|
||||
22
ops/devops/provenance/grafana/provenance-overview.json
Normal file
22
ops/devops/provenance/grafana/provenance-overview.json
Normal file
@@ -0,0 +1,22 @@
|
||||
{
  "title": "Provenance Signing",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "type": "stat",
      "title": "Last key rotation (days)",
      "targets": [
        {
          "expr": "(time() - provenance_last_key_rotation_seconds) / 86400"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Signing failures",
      "targets": [
        {
          "expr": "rate(provenance_sign_failures_total[5m])",
          "legendFormat": "failures/s"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}
|
||||
21
ops/devops/symbols/alerts.yaml
Normal file
21
ops/devops/symbols/alerts.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
---
# Availability alerting rules for Symbols.Server (Prometheus rule-group layout).
groups:
  - name: symbols-availability
    rules:
      # The scrape target for job "symbols" has reported up == 0 for 5 minutes.
      - alert: SymbolsDown
        expr: up{job="symbols"} == 0
        for: 5m
        labels:
          severity: page
          service: symbols
        annotations:
          summary: 'Symbols.Server instance is down'
          description: 'symbols scrape target has been down for 5 minutes'
      # Any non-zero rate of requests with a 5xx status, sustained for 2 minutes.
      - alert: SymbolsErrorRateHigh
        expr: rate(http_requests_total{job="symbols",status=~"5.."}[5m]) > 0
        for: 2m
        labels:
          severity: critical
          service: symbols
        annotations:
          summary: 'Symbols.Server error rate is elevated'
          description: '5xx responses detected for Symbols.Server'
|
||||
43
ops/devops/symbols/docker-compose.symbols.yaml
Normal file
43
ops/devops/symbols/docker-compose.symbols.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
---
# Compose stack: Symbols.Server plus the MongoDB and MinIO services it is
# configured to talk to.
version: '3.9'
services:
  mongo:
    image: mongo:7.0
    restart: unless-stopped
    command:
      - mongod
      - --bind_ip_all
    ports:
      - '27017:27017'
  minio:
    image: minio/minio:RELEASE.2024-08-17T00-00-00Z
    restart: unless-stopped
    # NOTE(review): root credentials are plain text and are repeated in the
    # symbols service environment below; presumably acceptable only for CI
    # use (the network is named symbols-ci) - confirm before reuse.
    environment:
      MINIO_ROOT_USER: minio
      MINIO_ROOT_PASSWORD: minio123
    # S3 API is published on 9000, the console on 9001.
    command: server /data --console-address :9001
    ports:
      - '9000:9000'
      - '9001:9001'
  symbols:
    # NOTE(review): "edge" looks like a moving tag - confirm whether a pinned
    # version is wanted here.
    image: ghcr.io/stella-ops/symbols-server:edge
    depends_on:
      - mongo
      - minio
    environment:
      Mongo__ConnectionString: mongodb://mongo:27017/symbols
      Storage__Provider: S3
      Storage__S3__Endpoint: http://minio:9000
      Storage__S3__Bucket: symbols
      # Same key pair as the MinIO root user defined above.
      Storage__S3__AccessKeyId: minio
      Storage__S3__SecretAccessKey: minio123
      Storage__S3__UsePathStyle: 'true'
      Logging__Console__FormatterName: json
    ports:
      - '8080:8080'
    # Probes /healthz inside the container with curl every 10s; up to 6
    # retries after a 10s start period.
    healthcheck:
      test:
        - CMD
        - curl
        - -fsS
        - http://localhost:8080/healthz
      interval: 10s
      timeout: 5s
      retries: 6
      start_period: 10s
networks:
  default:
    name: symbols-ci
|
||||
18
ops/devops/symbols/values.yaml
Normal file
18
ops/devops/symbols/values.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
---
# Minimal values stub for Symbols.Server deployment
image:
  repository: ghcr.io/stella-ops/symbols-server
  tag: edge

# MongoDB connection used by the server.
mongodb:
  enabled: true
  connectionString: 'mongodb://mongo:27017/symbols'

# S3-compatible storage (MinIO) endpoint, bucket and credentials.
# NOTE(review): secretKey is stored in plain text - presumably a stub value
# to be overridden per environment; confirm.
minio:
  enabled: true
  endpoint: 'http://minio:9000'
  bucket: 'symbols'
  accessKey: 'minio'
  secretKey: 'minio123'

ingress:
  enabled: false
|
||||
Reference in New Issue
Block a user