up
Some checks failed
Docs CI / lint-and-preview (push) Has been cancelled
AOC Guard CI / aoc-guard (push) Has been cancelled
AOC Guard CI / aoc-verify (push) Has been cancelled
Export Center CI / export-ci (push) Has been cancelled
Symbols Server CI / symbols-smoke (push) Has been cancelled
devportal-offline / build-offline (push) Has been cancelled

This commit is contained in:
StellaOps Bot
2025-11-24 20:57:49 +02:00
parent 46c8c47d06
commit 7c39058386
92 changed files with 3549 additions and 157 deletions

View File

@@ -0,0 +1,32 @@
# Prometheus alerting rules for the evidence locker.
# Covers three conditions: retention drift, WORM/immutability being
# switched off, and backups falling behind.
groups:
  - name: evidence-locker
    rules:
      # Retention reported by the locker is anything other than 180 days
      # for at least 10 minutes.
      - alert: EvidenceLockerRetentionDrift
        expr: evidence_retention_days != 180
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Evidence locker retention drift'
          description: 'Configured retention {{ $value }}d differs from target 180d.'

      # The WORM flag gauge reads 0 for at least 5 minutes.
      - alert: EvidenceLockerWormDisabled
        expr: evidence_worm_enabled == 0
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'WORM/immutability disabled'
          description: 'Evidence locker WORM not enabled.'

      # More than 3600 s between now and the last-backup value.
      # NOTE(review): presumes evidence_last_backup_seconds is a Unix
      # timestamp in seconds; confirm against the metric's producer.
      - alert: EvidenceLockerBackupLag
        expr: (time() - evidence_last_backup_seconds) > 3600
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Evidence locker backup lag > 1h'
          description: 'Last backup older than 1 hour.'

View File

@@ -0,0 +1,23 @@
{
  "title": "Evidence Locker",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "type": "stat",
      "title": "WORM enabled",
      "targets": [
        { "expr": "evidence_worm_enabled" }
      ]
    },
    {
      "type": "stat",
      "title": "Retention days",
      "targets": [
        { "expr": "evidence_retention_days" }
      ]
    },
    {
      "type": "stat",
      "title": "Backup lag (seconds)",
      "targets": [
        { "expr": "time() - evidence_last_backup_seconds" }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}

View File

@@ -0,0 +1,42 @@
# Prometheus alerting rules for the exporter: throughput, failure ratio,
# p95 job latency and queue depth.
groups:
  - name: exporter
    rules:
      # Per-second processed-job rate (5m window) stays below 1 for 10 minutes.
      - alert: ExporterThroughputLow
        expr: rate(exporter_jobs_processed_total[5m]) < 1
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Exporter throughput low'
          description: 'Processed <1 job/s over last 5m (current {{ $value }}).'

      # Failed-to-processed rate ratio above 0.02 (2%) for 5 minutes.
      - alert: ExporterFailuresHigh
        expr: rate(exporter_jobs_failed_total[5m]) / rate(exporter_jobs_processed_total[5m]) > 0.02
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'Exporter failure rate >2%'
          description: 'Failure rate {{ $value | humanizePercentage }} over last 5m.'

      # 95th percentile of the job-duration histogram, aggregated over all
      # series by bucket boundary (le), exceeds 3 seconds for 5 minutes.
      - alert: ExporterLatencyP95High
        expr: histogram_quantile(0.95, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le)) > 3
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Exporter job p95 latency high'
          description: 'Job p95 latency {{ $value }}s over last 5m (threshold 3s).'

      # Queue depth gauge above 500 for 10 minutes.
      - alert: ExporterQueueDepthHigh
        expr: exporter_queue_depth > 500
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Exporter queue depth high'
          description: 'Queue depth {{ $value }} exceeds 500 for >10m.'

View File

@@ -0,0 +1,29 @@
{
  "title": "Exporter Overview",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "type": "stat",
      "title": "Queue depth",
      "targets": [
        { "expr": "exporter_queue_depth" }
      ]
    },
    {
      "type": "timeseries",
      "title": "Jobs processed / failed",
      "targets": [
        {
          "expr": "rate(exporter_jobs_processed_total[5m])",
          "legendFormat": "processed"
        },
        {
          "expr": "rate(exporter_jobs_failed_total[5m])",
          "legendFormat": "failed"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Job duration p50/p95",
      "targets": [
        {
          "expr": "histogram_quantile(0.5, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p50"
        },
        {
          "expr": "histogram_quantile(0.95, sum(rate(exporter_job_duration_seconds_bucket[5m])) by (le))",
          "legendFormat": "p95"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}

View File

@@ -0,0 +1,36 @@
# Prometheus alerting rules for SLO error-budget burn and for the SLO
# webhook emitter. Thresholds are multiples of (1 - 0.99), i.e. the error
# allowance of the 99% target named in the summaries.
groups:
  - name: slo-burn
    rules:
      # 5m error ratio above 4x the allowance, sustained for 5 minutes.
      - alert: SLOBurnRateFast
        expr: |
          (rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])) >
          4 * (1 - 0.99)
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'Fast burn: 99% SLO breached'
          description: 'Error budget burn (5m) exceeds fast threshold.'

      # 1h error ratio above 1x the allowance, sustained for 1 hour.
      - alert: SLOBurnRateSlow
        expr: |
          (rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])) >
          1 * (1 - 0.99)
        for: 1h
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Slow burn: 99% SLO at risk'
          description: 'Error budget burn (1h) exceeds slow threshold.'

  - name: slo-webhook
    rules:
      # Any non-zero webhook failure rate (5m window) lasting 10 minutes.
      - alert: SLOWebhookFailures
        expr: rate(slo_webhook_failures_total[5m]) > 0
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'SLO webhook failures'
          description: 'Webhook emitter has failures in last 5m.'

View File

@@ -0,0 +1,26 @@
{
  "title": "SLO Burn",
  "time": { "from": "now-24h", "to": "now" },
  "panels": [
    {
      "type": "timeseries",
      "title": "Error rate",
      "targets": [
        { "expr": "rate(service_request_errors_total[5m]) / rate(service_requests_total[5m])", "legendFormat": "5m" },
        { "expr": "rate(service_request_errors_total[1h]) / rate(service_requests_total[1h])", "legendFormat": "1h" }
      ],
      "fieldConfig": {
        "defaults": {
          "thresholds": {
            "mode": "absolute",
            "steps": [
              { "color": "green", "value": null },
              { "color": "red", "value": 0.01 }
            ]
          }
        }
      }
    },
    {
      "type": "stat",
      "title": "Budget used (24h)",
      "targets": [
        { "expr": "(increase(service_request_errors_total[24h]) / increase(service_requests_total[24h]))" }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}

View File

@@ -0,0 +1,22 @@
# Prometheus alerting rules for provenance signing: key-rotation age and
# signer failures.
groups:
  - name: provenance
    rules:
      # Seconds since the last rotation value exceed 60*60*24*90 (90 days).
      # NOTE(review): presumes provenance_last_key_rotation_seconds is a
      # Unix timestamp in seconds; confirm against the metric's producer.
      - alert: ProvenanceKeyRotationOverdue
        expr: (time() - provenance_last_key_rotation_seconds) > 60*60*24*90
        for: 10m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: 'Provenance signing key rotation overdue'
          description: 'Last rotation {{ $value }} seconds ago (>90d).'

      # Any non-zero signing-failure rate (5m window) lasting 5 minutes.
      - alert: ProvenanceSignerFailures
        expr: rate(provenance_sign_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: 'Provenance signer failures detected'
          description: 'Signer failure rate non-zero in last 5m.'

View File

@@ -0,0 +1,22 @@
{
  "title": "Provenance Signing",
  "time": {
    "from": "now-24h",
    "to": "now"
  },
  "panels": [
    {
      "type": "stat",
      "title": "Last key rotation (days)",
      "targets": [
        {
          "expr": "(time() - provenance_last_key_rotation_seconds) / 86400"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Signing failures",
      "targets": [
        {
          "expr": "rate(provenance_sign_failures_total[5m])",
          "legendFormat": "failures/s"
        }
      ]
    }
  ],
  "schemaVersion": 39,
  "version": 1
}

View File

@@ -0,0 +1,21 @@
# Prometheus alerting rules for Symbols.Server availability and 5xx errors.
groups:
  - name: symbols-availability
    rules:
      # The scrape target for job "symbols" reports up == 0 for 5 minutes.
      - alert: SymbolsDown
        expr: up{job="symbols"} == 0
        for: 5m
        labels:
          severity: page
          service: symbols
        annotations:
          summary: 'Symbols.Server instance is down'
          description: 'symbols scrape target has been down for 5 minutes'

      # Any non-zero rate of responses whose status matches 5.. over a 5m
      # window, lasting 2 minutes.
      - alert: SymbolsErrorRateHigh
        expr: rate(http_requests_total{job="symbols",status=~"5.."}[5m]) > 0
        for: 2m
        labels:
          severity: critical
          service: symbols
        annotations:
          summary: 'Symbols.Server error rate is elevated'
          description: '5xx responses detected for Symbols.Server'

View File

@@ -0,0 +1,43 @@
# Docker Compose stack: Symbols.Server plus the MongoDB and MinIO services
# it is configured to use. The default network is named symbols-ci.
version: "3.9"
services:
  # MongoDB, listening on all interfaces and published on host port 27017.
  mongo:
    image: mongo:7.0
    restart: unless-stopped
    command:
      - "mongod"
      - "--bind_ip_all"
    ports:
      - "27017:27017"

  # MinIO object store; 9000 and 9001 are published, and the console
  # address is set to :9001.
  minio:
    image: minio/minio:RELEASE.2024-08-17T00-00-00Z
    restart: unless-stopped
    environment:
      # NOTE(review): credentials are hard-coded in the file; presumably
      # acceptable only for throwaway CI environments - confirm.
      MINIO_ROOT_USER: minio
      MINIO_ROOT_PASSWORD: minio123
    command: 'server /data --console-address :9001'
    ports:
      - "9000:9000"
      - "9001:9001"

  # Symbols.Server, pointed at the mongo and minio services above.
  symbols:
    image: ghcr.io/stella-ops/symbols-server:edge
    # Start-order dependency only; no health condition is attached.
    depends_on: [mongo, minio]
    environment:
      Mongo__ConnectionString: mongodb://mongo:27017/symbols
      Storage__Provider: S3
      Storage__S3__Endpoint: http://minio:9000
      Storage__S3__Bucket: symbols
      # Same values as the MinIO root user/password configured above.
      Storage__S3__AccessKeyId: minio
      Storage__S3__SecretAccessKey: minio123
      Storage__S3__UsePathStyle: "true"
      Logging__Console__FormatterName: json
    ports:
      - "8080:8080"
    healthcheck:
      # NOTE(review): requires curl inside the symbols-server image - verify.
      test:
        - "CMD"
        - "curl"
        - "-fsS"
        - "http://localhost:8080/healthz"
      interval: 10s
      timeout: 5s
      retries: 6
      start_period: 10s

networks:
  default:
    name: symbols-ci

View File

@@ -0,0 +1,18 @@
# Minimal values stub for Symbols.Server deployment.
# Image coordinates, MongoDB and MinIO settings, ingress switched off.
image:
  repository: ghcr.io/stella-ops/symbols-server
  tag: edge

mongodb:
  enabled: true
  connectionString: 'mongodb://mongo:27017/symbols'

minio:
  enabled: true
  endpoint: 'http://minio:9000'
  bucket: 'symbols'
  # NOTE(review): plain-text credentials in a values file; presumably stub
  # values for CI only - confirm they are overridden for real deployments.
  accessKey: 'minio'
  secretKey: 'minio123'

ingress:
  enabled: false