---
# HLC Queue Alerting Rules
# Sprint: SPRINT_20260105_002_004_BE_hlc_integration_tests
# Task: INT-018 - Create alerts for HLC anomalies
#
# Prometheus alerting rules for HLC anomalies. All rules live in the single
# `hlc_alerts` group below.

groups:
  - name: hlc_alerts
    # Evaluation interval for every rule in this group.
    interval: 1m
    rules:
# Critical: Chain verification failures indicate tampering or corruption
|
|
- alert: HlcChainVerificationFailure
|
|
expr: increase(scheduler_chain_verification_failures_total[5m]) > 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
team: scheduler
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#chain-verification-failure
|
|
annotations:
|
|
summary: "HLC chain verification failure detected"
|
|
description: "Chain verification failure on node {{ $labels.node_id }} for tenant {{ $labels.tenant_id }}. This may indicate data tampering or corruption."
|
|
impact: "Audit trail integrity compromised. Investigation required."
|
|
action: "1. Check scheduler_log table for gaps. 2. Verify no unauthorized changes. 3. Review chain head consistency."
|
|
|
|
# Critical: Clock skew exceeds tolerance - can cause ordering issues
|
|
- alert: HlcClockSkewExceedsTolerance
|
|
expr: increase(hlc_clock_skew_rejections_total[5m]) > 5
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
team: infrastructure
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#clock-skew
|
|
annotations:
|
|
summary: "HLC clock skew rejections on {{ $labels.node_id }}"
|
|
description: "Node {{ $labels.node_id }} is rejecting HLC updates due to clock skew. {{ $value }} rejections in last 5 minutes."
|
|
impact: "Job ordering may be inconsistent. Distributed consistency at risk."
|
|
action: "1. Check NTP synchronization on affected node. 2. Verify time sources. 3. Consider increasing skew tolerance temporarily."
|
|
|
|
# Warning: Physical time offset is drifting
|
|
- alert: HlcPhysicalTimeOffset
|
|
expr: abs(hlc_physical_time_offset_seconds) > 0.5
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
team: infrastructure
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#time-offset
|
|
annotations:
|
|
summary: "HLC physical time offset on {{ $labels.node_id }}"
|
|
description: "HLC physical time is {{ $value }}s offset from wall clock on {{ $labels.node_id }}."
|
|
impact: "May cause timestamp ordering anomalies in logs and diagnostics."
|
|
action: "Monitor NTP status and consider clock synchronization."
|
|
|
|
# Warning: High merge conflict rate in air-gap sync
|
|
- alert: HlcMergeConflictRateHigh
|
|
expr: increase(airgap_merge_conflicts_total[1h]) > 100
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
team: scheduler
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#merge-conflicts
|
|
annotations:
|
|
summary: "High HLC merge conflict rate during air-gap sync"
|
|
description: "{{ $value }} merge conflicts detected in the last hour for conflict type {{ $labels.conflict_type }}."
|
|
impact: "Air-gap sync may be producing unexpected results or dropping jobs."
|
|
action: "1. Review conflict resolution logs. 2. Check for duplicate job submissions. 3. Verify offline node clocks."
|
|
|
|
# Warning: Air-gap sync duration increasing
|
|
- alert: HlcSyncDurationHigh
|
|
expr: histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[15m])) by (le)) > 30
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
team: scheduler
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#slow-sync
|
|
annotations:
|
|
summary: "Air-gap sync duration is high"
|
|
description: "95th percentile sync duration is {{ $value }}s, exceeding 30s threshold."
|
|
impact: "Air-gap import operations are slow, may delay job processing."
|
|
action: "1. Check bundle sizes. 2. Verify database performance. 3. Consider chunking large bundles."
|
|
|
|
# Info: HLC enqueue rate is zero (may be expected in some deployments)
|
|
- alert: HlcEnqueueRateZero
|
|
expr: sum(rate(scheduler_hlc_enqueues_total[10m])) == 0
|
|
for: 30m
|
|
labels:
|
|
severity: info
|
|
team: scheduler
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#no-enqueues
|
|
annotations:
|
|
summary: "No HLC enqueues in last 30 minutes"
|
|
description: "No jobs have been enqueued with HLC timestamps in the last 30 minutes."
|
|
impact: "May be expected if no jobs are scheduled, or may indicate HLC ordering is disabled."
|
|
action: "Verify EnableHlcOrdering configuration if HLC ordering is expected."
|
|
|
|
# Warning: Batch snapshot creation failing
|
|
- alert: HlcBatchSnapshotFailures
|
|
expr: increase(scheduler_batch_snapshot_failures_total[5m]) > 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
team: scheduler
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#batch-snapshot-failure
|
|
annotations:
|
|
summary: "Batch snapshot creation failures"
|
|
description: "{{ $value }} batch snapshot creation failures in the last 5 minutes."
|
|
impact: "DSSE-signed batch proofs may be missing for affected time ranges."
|
|
action: "1. Check signing key availability. 2. Verify database connectivity. 3. Review batch size limits."
|
|
|
|
# Critical: Multiple nodes with same node ID (configuration error)
|
|
- alert: HlcDuplicateNodeId
|
|
expr: count by (node_id) (group by (node_id, instance) (hlc_ticks_total)) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
team: scheduler
|
|
runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#duplicate-node-id
|
|
annotations:
|
|
summary: "Duplicate HLC node ID detected"
|
|
description: "Multiple instances are using node_id={{ $labels.node_id }}. This will cause ordering conflicts."
|
|
impact: "Critical: Job ordering and chain integrity will be compromised."
|
|
action: "Immediately reconfigure affected instances with unique node IDs."
|