# File: git.stella-ops.org/devops/observability/alerting/hlc-alerts.yaml
# (120 lines, 6.2 KiB, YAML)

# HLC Queue Alerting Rules
# Sprint: SPRINT_20260105_002_004_BE_hlc_integration_tests
# Task: INT-018 - Create alerts for HLC anomalies
groups:
  # One rule group carrying every HLC alert below; it is evaluated every minute.
  - name: hlc_alerts
    interval: 1m
    rules:
# Critical: a failed chain verification points at tampering or corruption.
      - alert: HlcChainVerificationFailure
        # Matches when the failure counter grew within the last 5m; the
        # condition must hold for 1m before the alert fires.
        expr: 'increase(scheduler_chain_verification_failures_total[5m]) > 0'
        for: 1m
        labels:
          team: scheduler
          severity: critical
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#chain-verification-failure'
        annotations:
          summary: 'HLC chain verification failure detected'
          description: >-
            Chain verification failure on node {{ $labels.node_id }}
            for tenant {{ $labels.tenant_id }}.
            This may indicate data tampering or corruption.
          impact: 'Audit trail integrity compromised. Investigation required.'
          action: >-
            1. Check scheduler_log table for gaps.
            2. Verify no unauthorized changes.
            3. Review chain head consistency.
# Critical: clock skew beyond tolerance, which can lead to ordering issues.
      - alert: HlcClockSkewExceedsTolerance
        # Matches when the rejection counter grew by more than 5 over the last
        # 5m; the condition must hold for 2m before the alert fires.
        expr: 'increase(hlc_clock_skew_rejections_total[5m]) > 5'
        for: 2m
        labels:
          team: infrastructure
          severity: critical
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#clock-skew'
        annotations:
          summary: 'HLC clock skew rejections on {{ $labels.node_id }}'
          description: >-
            Node {{ $labels.node_id }} is rejecting HLC updates due to clock skew.
            {{ $value }} rejections in last 5 minutes.
          impact: 'Job ordering may be inconsistent. Distributed consistency at risk.'
          action: >-
            1. Check NTP synchronization on affected node.
            2. Verify time sources.
            3. Consider increasing skew tolerance temporarily.
# Warning: the HLC physical time is drifting away from the wall clock.
      - alert: HlcPhysicalTimeOffset
        # abs() makes the check symmetric: an offset in either direction beyond
        # 0.5 (seconds, per the metric name) matches. Must hold for 5m.
        expr: 'abs(hlc_physical_time_offset_seconds) > 0.5'
        for: 5m
        labels:
          team: infrastructure
          severity: warning
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#time-offset'
        annotations:
          summary: 'HLC physical time offset on {{ $labels.node_id }}'
          description: >-
            HLC physical time is {{ $value }}s offset from wall clock
            on {{ $labels.node_id }}.
          impact: 'May cause timestamp ordering anomalies in logs and diagnostics.'
          action: 'Monitor NTP status and consider clock synchronization.'
# Warning: air-gap sync is producing an unusually high number of merge conflicts.
      - alert: HlcMergeConflictRateHigh
        # Matches when the conflict counter grew by more than 100 over the last
        # hour; the condition must hold for 10m before the alert fires.
        expr: 'increase(airgap_merge_conflicts_total[1h]) > 100'
        for: 10m
        labels:
          team: scheduler
          severity: warning
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#merge-conflicts'
        annotations:
          summary: 'High HLC merge conflict rate during air-gap sync'
          description: >-
            {{ $value }} merge conflicts detected in the last hour
            for conflict type {{ $labels.conflict_type }}.
          impact: 'Air-gap sync may be producing unexpected results or dropping jobs.'
          action: >-
            1. Review conflict resolution logs.
            2. Check for duplicate job submissions.
            3. Verify offline node clocks.
# Warning: air-gap sync is taking longer than it should.
      - alert: HlcSyncDurationHigh
        # p95 of the sync-duration histogram over a 15m rate window. Buckets are
        # summed by `le` only, so every other label is aggregated away and the
        # quantile is global. Must stay above 30 for 10m.
        expr: 'histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[15m])) by (le)) > 30'
        for: 10m
        labels:
          team: scheduler
          severity: warning
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#slow-sync'
        annotations:
          summary: 'Air-gap sync duration is high'
          description: >-
            95th percentile sync duration is {{ $value }}s,
            exceeding 30s threshold.
          impact: 'Air-gap import operations are slow, may delay job processing.'
          action: >-
            1. Check bundle sizes.
            2. Verify database performance.
            3. Consider chunking large bundles.
# Info: no HLC enqueues observed (may be expected in some deployments).
      - alert: HlcEnqueueRateZero
        # `sum(rate(...)) == 0` only matches while the counter series exists. If
        # the counter is not exported at all, the sum is an empty result and the
        # comparison can never match, so the alert would stay silent. `absent()`
        # covers that case. Both branches yield a label-less result, so the
        # alert keeps a single identity whichever branch matches.
        # NOTE(review): presumably the counter is missing when HLC ordering is
        # disabled - confirm against the scheduler's metric registration.
        expr: >-
          (sum(rate(scheduler_hlc_enqueues_total[10m])) == 0)
          or absent(scheduler_hlc_enqueues_total)
        for: 30m
        labels:
          severity: info
          team: scheduler
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#no-enqueues'
        annotations:
          summary: 'No HLC enqueues in last 30 minutes'
          description: 'No jobs have been enqueued with HLC timestamps in the last 30 minutes.'
          impact: >-
            May be expected if no jobs are scheduled,
            or may indicate HLC ordering is disabled.
          action: 'Verify EnableHlcOrdering configuration if HLC ordering is expected.'
# Warning: batch snapshots are failing to be created.
      - alert: HlcBatchSnapshotFailures
        # Matches when the failure counter grew within the last 5m; the
        # condition must hold for 2m before the alert fires.
        expr: 'increase(scheduler_batch_snapshot_failures_total[5m]) > 0'
        for: 2m
        labels:
          team: scheduler
          severity: warning
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#batch-snapshot-failure'
        annotations:
          summary: 'Batch snapshot creation failures'
          description: '{{ $value }} batch snapshot creation failures in the last 5 minutes.'
          impact: 'DSSE-signed batch proofs may be missing for affected time ranges.'
          action: >-
            1. Check signing key availability.
            2. Verify database connectivity.
            3. Review batch size limits.
# Critical: several instances share one node ID (configuration error).
      - alert: HlcDuplicateNodeId
        # Inner `group` collapses the series to one per (node_id, instance)
        # pair; outer `count` then counts instances per node_id. More than one
        # instance for a node_id matches. Must hold for 5m.
        expr: 'count by (node_id) (group by (node_id, instance) (hlc_ticks_total)) > 1'
        for: 5m
        labels:
          team: scheduler
          severity: critical
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#duplicate-node-id'
        annotations:
          summary: 'Duplicate HLC node ID detected'
          description: >-
            Multiple instances are using node_id={{ $labels.node_id }}.
            This will cause ordering conflicts.
          impact: 'Critical: Job ordering and chain integrity will be compromised.'
          action: 'Immediately reconfigure affected instances with unique node IDs.'