---
# HLC Queue Alerting Rules
# Sprint: SPRINT_20260105_002_004_BE_hlc_integration_tests
# Task: INT-018 - Create alerts for HLC anomalies

groups:
  - name: hlc_alerts
    interval: 1m
    rules:
      # Critical: Chain verification failures indicate tampering or corruption
      - alert: HlcChainVerificationFailure
        expr: increase(scheduler_chain_verification_failures_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          team: scheduler
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#chain-verification-failure"
        annotations:
          summary: "HLC chain verification failure detected"
          description: "Chain verification failure on node {{ $labels.node_id }} for tenant {{ $labels.tenant_id }}. This may indicate data tampering or corruption."
          impact: "Audit trail integrity compromised. Investigation required."
          action: "1. Check scheduler_log table for gaps. 2. Verify no unauthorized changes. 3. Review chain head consistency."

      # Critical: Clock skew exceeds tolerance - can cause ordering issues
      - alert: HlcClockSkewExceedsTolerance
        expr: increase(hlc_clock_skew_rejections_total[5m]) > 5
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#clock-skew"
        annotations:
          summary: "HLC clock skew rejections on {{ $labels.node_id }}"
          description: "Node {{ $labels.node_id }} is rejecting HLC updates due to clock skew. {{ $value }} rejections in last 5 minutes."
          impact: "Job ordering may be inconsistent. Distributed consistency at risk."
          action: "1. Check NTP synchronization on affected node. 2. Verify time sources. 3. Consider increasing skew tolerance temporarily."

      # Warning: Physical time offset is drifting
      - alert: HlcPhysicalTimeOffset
        expr: abs(hlc_physical_time_offset_seconds) > 0.5
        for: 5m
        labels:
          severity: warning
          team: infrastructure
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#time-offset"
        annotations:
          summary: "HLC physical time offset on {{ $labels.node_id }}"
          description: "HLC physical time is {{ $value }}s offset from wall clock on {{ $labels.node_id }}."
          impact: "May cause timestamp ordering anomalies in logs and diagnostics."
          action: "Monitor NTP status and consider clock synchronization."

      # Warning: High merge conflict rate in air-gap sync
      - alert: HlcMergeConflictRateHigh
        expr: increase(airgap_merge_conflicts_total[1h]) > 100
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#merge-conflicts"
        annotations:
          summary: "High HLC merge conflict rate during air-gap sync"
          description: "{{ $value }} merge conflicts detected in the last hour for conflict type {{ $labels.conflict_type }}."
          impact: "Air-gap sync may be producing unexpected results or dropping jobs."
          action: "1. Review conflict resolution logs. 2. Check for duplicate job submissions. 3. Verify offline node clocks."

      # Warning: Air-gap sync duration increasing
      - alert: HlcSyncDurationHigh
        expr: histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[15m])) by (le)) > 30
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#slow-sync"
        annotations:
          summary: "Air-gap sync duration is high"
          description: "95th percentile sync duration is {{ $value }}s, exceeding 30s threshold."
          impact: "Air-gap import operations are slow, may delay job processing."
          action: "1. Check bundle sizes. 2. Verify database performance. 3. Consider chunking large bundles."

      # Info: HLC enqueue rate is zero (may be expected in some deployments)
      - alert: HlcEnqueueRateZero
        expr: sum(rate(scheduler_hlc_enqueues_total[10m])) == 0
        for: 30m
        labels:
          severity: info
          team: scheduler
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#no-enqueues"
        annotations:
          summary: "No HLC enqueues in last 30 minutes"
          description: "No jobs have been enqueued with HLC timestamps in the last 30 minutes."
          impact: "May be expected if no jobs are scheduled, or may indicate HLC ordering is disabled."
          action: "Verify EnableHlcOrdering configuration if HLC ordering is expected."

      # Warning: Batch snapshot creation failing
      - alert: HlcBatchSnapshotFailures
        expr: increase(scheduler_batch_snapshot_failures_total[5m]) > 0
        for: 2m
        labels:
          severity: warning
          team: scheduler
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#batch-snapshot-failure"
        annotations:
          summary: "Batch snapshot creation failures"
          description: "{{ $value }} batch snapshot creation failures in the last 5 minutes."
          impact: "DSSE-signed batch proofs may be missing for affected time ranges."
          action: "1. Check signing key availability. 2. Verify database connectivity. 3. Review batch size limits."

      # Critical: Multiple nodes with same node ID (configuration error)
      - alert: HlcDuplicateNodeId
        expr: count by (node_id) (group by (node_id, instance) (hlc_ticks_total)) > 1
        for: 5m
        labels:
          severity: critical
          team: scheduler
          runbook: "https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#duplicate-node-id"
        annotations:
          summary: "Duplicate HLC node ID detected"
          description: "Multiple instances are using node_id={{ $labels.node_id }}. This will cause ordering conflicts."
          impact: "Critical: Job ordering and chain integrity will be compromised."
          action: "Immediately reconfigure affected instances with unique node IDs."