audit work, fixed StellaOps.sln warnings/errors, fixed tests, sprints work, new advisories
This commit is contained in:
119
devops/observability/alerting/hlc-alerts.yaml
Normal file
119
devops/observability/alerting/hlc-alerts.yaml
Normal file
@@ -0,0 +1,119 @@
|
||||
# HLC Queue Alerting Rules
# Sprint: SPRINT_20260105_002_004_BE_hlc_integration_tests
# Task: INT-018 - Create alerts for HLC anomalies
#
# Prometheus alerting rules for HLC ordering, chain verification, batch
# snapshots and air-gap sync. Every rule carries three labels (severity, team,
# runbook) and four annotations (summary, description, impact, action).

groups:
  - name: hlc_alerts
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # Critical: Chain verification failures indicate tampering or corruption
      # Fires when the failure counter grew at all over the last 5 minutes and
      # the condition has held for 1 minute.
      - alert: HlcChainVerificationFailure
        expr: increase(scheduler_chain_verification_failures_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#chain-verification-failure
        annotations:
          summary: "HLC chain verification failure detected"
          description: "Chain verification failure on node {{ $labels.node_id }} for tenant {{ $labels.tenant_id }}. This may indicate data tampering or corruption."
          impact: "Audit trail integrity compromised. Investigation required."
          action: "1. Check scheduler_log table for gaps. 2. Verify no unauthorized changes. 3. Review chain head consistency."

      # Critical: Clock skew exceeds tolerance - can cause ordering issues
      # Fires per series when more than 5 rejections accumulated within
      # 5 minutes, sustained for 2 minutes.
      - alert: HlcClockSkewExceedsTolerance
        expr: increase(hlc_clock_skew_rejections_total[5m]) > 5
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#clock-skew
        annotations:
          summary: "HLC clock skew rejections on {{ $labels.node_id }}"
          description: "Node {{ $labels.node_id }} is rejecting HLC updates due to clock skew. {{ $value }} rejections in last 5 minutes."
          impact: "Job ordering may be inconsistent. Distributed consistency at risk."
          action: "1. Check NTP synchronization on affected node. 2. Verify time sources. 3. Consider increasing skew tolerance temporarily."

      # Warning: Physical time offset is drifting
      # abs() makes the check symmetric: an offset beyond 0.5 s in either
      # direction triggers once it has persisted for 5 minutes.
      - alert: HlcPhysicalTimeOffset
        expr: abs(hlc_physical_time_offset_seconds) > 0.5
        for: 5m
        labels:
          severity: warning
          team: infrastructure
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#time-offset
        annotations:
          summary: "HLC physical time offset on {{ $labels.node_id }}"
          description: "HLC physical time is {{ $value }}s offset from wall clock on {{ $labels.node_id }}."
          impact: "May cause timestamp ordering anomalies in logs and diagnostics."
          action: "Monitor NTP status and consider clock synchronization."

      # Warning: High merge conflict rate in air-gap sync
      # Evaluated per series (no aggregation), so the 100-per-hour threshold
      # applies to each label combination separately.
      - alert: HlcMergeConflictRateHigh
        expr: increase(airgap_merge_conflicts_total[1h]) > 100
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#merge-conflicts
        annotations:
          summary: "High HLC merge conflict rate during air-gap sync"
          description: "{{ $value }} merge conflicts detected in the last hour for conflict type {{ $labels.conflict_type }}."
          impact: "Air-gap sync may be producing unexpected results or dropping jobs."
          action: "1. Review conflict resolution logs. 2. Check for duplicate job submissions. 3. Verify offline node clocks."

      # Warning: Air-gap sync duration increasing
      # p95 over a 15-minute rate window, aggregated across all series
      # (only the histogram `le` label is kept).
      - alert: HlcSyncDurationHigh
        expr: histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[15m])) by (le)) > 30
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#slow-sync
        annotations:
          summary: "Air-gap sync duration is high"
          description: "95th percentile sync duration is {{ $value }}s, exceeding 30s threshold."
          impact: "Air-gap import operations are slow, may delay job processing."
          action: "1. Check bundle sizes. 2. Verify database performance. 3. Consider chunking large bundles."

      # Info: HLC enqueue rate is zero (may be expected in some deployments)
      # NOTE(review): sum(rate(...)) yields no result when the counter series
      # does not exist at all, so this rule only fires when the metric is
      # exported and flat. If "metric never exported" should also alert,
      # consider adding `or absent(scheduler_hlc_enqueues_total)` - confirm.
      - alert: HlcEnqueueRateZero
        expr: sum(rate(scheduler_hlc_enqueues_total[10m])) == 0
        for: 30m
        labels:
          severity: info
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#no-enqueues
        annotations:
          summary: "No HLC enqueues in last 30 minutes"
          description: "No jobs have been enqueued with HLC timestamps in the last 30 minutes."
          impact: "May be expected if no jobs are scheduled, or may indicate HLC ordering is disabled."
          action: "Verify EnableHlcOrdering configuration if HLC ordering is expected."

      # Warning: Batch snapshot creation failing
      # Any growth of the failure counter within 5 minutes, sustained for
      # 2 minutes.
      - alert: HlcBatchSnapshotFailures
        expr: increase(scheduler_batch_snapshot_failures_total[5m]) > 0
        for: 2m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#batch-snapshot-failure
        annotations:
          summary: "Batch snapshot creation failures"
          description: "{{ $value }} batch snapshot creation failures in the last 5 minutes."
          impact: "DSSE-signed batch proofs may be missing for affected time ranges."
          action: "1. Check signing key availability. 2. Verify database connectivity. 3. Review batch size limits."

      # Critical: Multiple nodes with same node ID (configuration error)
      # The inner `group by` collapses hlc_ticks_total to one series per
      # (node_id, instance) pair; the outer `count by` then counts distinct
      # instances per node_id.
      - alert: HlcDuplicateNodeId
        expr: count by (node_id) (group by (node_id, instance) (hlc_ticks_total)) > 1
        for: 5m
        labels:
          severity: critical
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#duplicate-node-id
        annotations:
          summary: "Duplicate HLC node ID detected"
          description: "Multiple instances are using node_id={{ $labels.node_id }}. This will cause ordering conflicts."
          impact: "Critical: Job ordering and chain integrity will be compromised."
          action: "Immediately reconfigure affected instances with unique node IDs."
|
||||
290
devops/observability/grafana/hlc-queue-metrics.json
Normal file
290
devops/observability/grafana/hlc-queue-metrics.json
Normal file
@@ -0,0 +1,290 @@
|
||||
{
  "dashboard": {
    "id": null,
    "uid": "stellaops-hlc-metrics",
    "title": "StellaOps HLC Queue Metrics",
    "description": "Hybrid Logical Clock ordering metrics for the Scheduler queue",
    "tags": ["stellaops", "hlc", "scheduler", "audit"],
    "timezone": "utc",
    "schemaVersion": 39,
    "version": 1,
    "refresh": "30s",
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "panels": [
      {
        "id": 1,
        "title": "HLC Tick Rate",
        "description": "Rate of HLC tick operations per second",
        "type": "timeseries",
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
        "fieldConfig": {
          "defaults": {
            "unit": "ops",
            "custom": { "drawStyle": "line", "lineInterpolation": "smooth" }
          }
        },
        "targets": [
          {
            "expr": "rate(hlc_ticks_total[1m])",
            "legendFormat": "{{node_id}}",
            "refId": "A"
          }
        ]
      },
      {
        "id": 2,
        "title": "Clock Skew Rejections",
        "description": "HLC rejections due to clock skew exceeding tolerance",
        "type": "stat",
        "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "mode": "absolute",
              "steps": [
                { "color": "green", "value": null },
                { "color": "yellow", "value": 1 },
                { "color": "red", "value": 10 }
              ]
            }
          }
        },
        "targets": [
          {
            "expr": "sum(increase(hlc_clock_skew_rejections_total[1h]))",
            "refId": "A"
          }
        ]
      },
      {
        "id": 3,
        "title": "Physical Time Offset",
        "description": "Difference between HLC physical time and wall clock",
        "type": "gauge",
        "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
        "fieldConfig": {
          "defaults": {
            "unit": "ms",
            "thresholds": {
              "mode": "absolute",
              "steps": [
                { "color": "green", "value": null },
                { "color": "yellow", "value": 100 },
                { "color": "red", "value": 1000 }
              ]
            },
            "max": 5000
          }
        },
        "targets": [
          {
            "expr": "max(abs(hlc_physical_time_offset_seconds)) * 1000",
            "refId": "A"
          }
        ]
      },
      {
        "id": 4,
        "title": "Scheduler HLC Enqueues",
        "description": "Rate of jobs enqueued with HLC timestamps",
        "type": "timeseries",
        "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
        "fieldConfig": {
          "defaults": {
            "unit": "ops",
            "custom": { "drawStyle": "bars", "fillOpacity": 50 }
          }
        },
        "targets": [
          {
            "expr": "rate(scheduler_hlc_enqueues_total[5m])",
            "legendFormat": "{{tenant_id}}",
            "refId": "A"
          }
        ]
      },
      {
        "id": 5,
        "title": "Chain Verifications",
        "description": "Chain verification operations by result",
        "type": "timeseries",
        "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
        "fieldConfig": {
          "defaults": {
            "unit": "ops"
          },
          "overrides": [
            {
              "matcher": { "id": "byName", "options": "valid" },
              "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }]
            },
            {
              "matcher": { "id": "byName", "options": "invalid" },
              "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }]
            }
          ]
        },
        "targets": [
          {
            "expr": "rate(scheduler_chain_verifications_total[5m])",
            "legendFormat": "{{result}}",
            "refId": "A"
          }
        ]
      },
      {
        "id": 6,
        "title": "Verification Failures",
        "description": "Chain verification failures - indicates tampering or corruption",
        "type": "stat",
        "gridPos": { "h": 4, "w": 6, "x": 12, "y": 12 },
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "mode": "absolute",
              "steps": [
                { "color": "green", "value": null },
                { "color": "red", "value": 1 }
              ]
            }
          }
        },
        "targets": [
          {
            "expr": "sum(increase(scheduler_chain_verification_failures_total[1h]))",
            "refId": "A"
          }
        ]
      },
      {
        "id": 7,
        "title": "Batch Snapshots",
        "description": "Batch snapshot creation rate",
        "type": "stat",
        "gridPos": { "h": 4, "w": 6, "x": 18, "y": 12 },
        "fieldConfig": {
          "defaults": {
            "unit": "short"
          }
        },
        "targets": [
          {
            "expr": "sum(increase(scheduler_batch_snapshots_total[1h]))",
            "refId": "A"
          }
        ]
      },
      {
        "id": 8,
        "title": "Air-Gap Bundle Exports",
        "description": "Rate of air-gap bundles exported",
        "type": "timeseries",
        "gridPos": { "h": 8, "w": 8, "x": 0, "y": 16 },
        "fieldConfig": {
          "defaults": {
            "unit": "ops"
          }
        },
        "targets": [
          {
            "expr": "rate(airgap_bundles_exported_total[5m])",
            "legendFormat": "{{node_id}}",
            "refId": "A"
          }
        ]
      },
      {
        "id": 9,
        "title": "Air-Gap Bundle Imports",
        "description": "Rate of air-gap bundles imported",
        "type": "timeseries",
        "gridPos": { "h": 8, "w": 8, "x": 8, "y": 16 },
        "fieldConfig": {
          "defaults": {
            "unit": "ops"
          }
        },
        "targets": [
          {
            "expr": "rate(airgap_bundles_imported_total[5m])",
            "legendFormat": "imported",
            "refId": "A"
          }
        ]
      },
      {
        "id": 10,
        "title": "Air-Gap Merge Conflicts",
        "description": "Merge conflicts by type during air-gap sync",
        "type": "stat",
        "gridPos": { "h": 4, "w": 8, "x": 16, "y": 16 },
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "mode": "absolute",
              "steps": [
                { "color": "green", "value": null },
                { "color": "yellow", "value": 1 },
                { "color": "red", "value": 10 }
              ]
            }
          }
        },
        "targets": [
          {
            "expr": "sum by (conflict_type) (increase(airgap_merge_conflicts_total[1h]))",
            "legendFormat": "{{conflict_type}}",
            "refId": "A"
          }
        ]
      },
      {
        "id": 11,
        "title": "Sync Duration",
        "description": "Air-gap sync operation duration percentiles",
        "type": "timeseries",
        "gridPos": { "h": 8, "w": 8, "x": 16, "y": 20 },
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(airgap_sync_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p50",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p95",
            "refId": "B"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(airgap_sync_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "p99",
            "refId": "C"
          }
        ]
      }
    ],
    "annotations": {
      "list": [
        {
          "name": "Deployments",
          "datasource": "-- Grafana --",
          "enable": true,
          "iconColor": "blue"
        }
      ]
    }
  },
  "folderId": 0,
  "overwrite": true
}
|
||||
Reference in New Issue
Block a user