# Source: git.stella-ops.org/devops/observability/alerting/hlc-alerts.yaml

# HLC Queue Alerting Rules
# Sprint: SPRINT_20260105_002_004_BE_hlc_integration_tests
# Task: INT-018 - Create alerts for HLC anomalies

# Top-level list of rule groups defined by this file.
groups:
  # Group holding the HLC alerting rules.
  - name: hlc_alerts
    # Evaluation interval for the rules in this group.
    interval: 1m
    # Alerting rules; each entry has an expression, a hold duration (`for`),
    # routing labels, and human-readable annotations.
    rules:
      # Critical: any growth of the chain verification failure counter over the
      # trailing 5 minutes, held for 1 minute, is treated as possible tampering
      # or corruption.
      - alert: HlcChainVerificationFailure
        expr: 'increase(scheduler_chain_verification_failures_total[5m]) > 0'
        for: 1m
        labels:
          severity: critical
          team: scheduler
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#chain-verification-failure'
        annotations:
          summary: 'HLC chain verification failure detected'
          description: >-
            Chain verification failure on node {{ $labels.node_id }}
            for tenant {{ $labels.tenant_id }}.
            This may indicate data tampering or corruption.
          impact: 'Audit trail integrity compromised. Investigation required.'
          action: >-
            1. Check scheduler_log table for gaps.
            2. Verify no unauthorized changes.
            3. Review chain head consistency.

      # Critical: Clock skew exceeds tolerance - can cause ordering issues.
      # Fires when more than 5 skew rejections accumulate within a 5-minute
      # window and the condition holds for 2 minutes.
      - alert: HlcClockSkewExceedsTolerance
        expr: increase(hlc_clock_skew_rejections_total[5m]) > 5
        for: 2m
        labels:
          severity: critical
          team: infrastructure
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#clock-skew
        annotations:
          summary: "HLC clock skew rejections on {{ $labels.node_id }}"
          # increase() extrapolates across the window, so $value is normally a
          # fractional number; render it as a whole count of rejections.
          description: 'Node {{ $labels.node_id }} is rejecting HLC updates due to clock skew. {{ printf "%.0f" $value }} rejections in last 5 minutes.'
          impact: "Job ordering may be inconsistent. Distributed consistency at risk."
          action: "1. Check NTP synchronization on affected node. 2. Verify time sources. 3. Consider increasing skew tolerance temporarily."

      # Warning: the absolute HLC physical time offset has stayed above 0.5
      # seconds for 5 minutes, i.e. the clock is drifting.
      - alert: HlcPhysicalTimeOffset
        expr: 'abs(hlc_physical_time_offset_seconds) > 0.5'
        for: 5m
        labels:
          severity: warning
          team: infrastructure
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#time-offset'
        annotations:
          summary: 'HLC physical time offset on {{ $labels.node_id }}'
          description: >-
            HLC physical time is {{ $value }}s offset
            from wall clock on {{ $labels.node_id }}.
          impact: 'May cause timestamp ordering anomalies in logs and diagnostics.'
          action: 'Monitor NTP status and consider clock synchronization.'

      # Warning: High merge conflict rate in air-gap sync.
      # Fires when more than 100 merge conflicts accumulate within one hour
      # and the condition holds for 10 minutes.
      - alert: HlcMergeConflictRateHigh
        expr: increase(airgap_merge_conflicts_total[1h]) > 100
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#merge-conflicts
        annotations:
          summary: "High HLC merge conflict rate during air-gap sync"
          # increase() extrapolates across the window, so $value is normally a
          # fractional number; render it as a whole count of conflicts.
          description: '{{ printf "%.0f" $value }} merge conflicts detected in the last hour for conflict type {{ $labels.conflict_type }}.'
          impact: "Air-gap sync may be producing unexpected results or dropping jobs."
          action: "1. Review conflict resolution logs. 2. Check for duplicate job submissions. 3. Verify offline node clocks."

      # Warning: the 95th percentile of air-gap sync duration, computed from
      # the histogram buckets over 15 minutes, has exceeded 30 seconds for
      # 10 minutes.
      - alert: HlcSyncDurationHigh
        expr: >-
          histogram_quantile(0.95, sum(rate(airgap_sync_duration_seconds_bucket[15m])) by (le)) > 30
        for: 10m
        labels:
          severity: warning
          team: scheduler
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#slow-sync'
        annotations:
          summary: 'Air-gap sync duration is high'
          description: '95th percentile sync duration is {{ $value }}s, exceeding 30s threshold.'
          impact: 'Air-gap import operations are slow, may delay job processing.'
          action: >-
            1. Check bundle sizes.
            2. Verify database performance.
            3. Consider chunking large bundles.

      # Info: HLC enqueue rate is zero (may be expected in some deployments).
      # Fires when the summed enqueue rate over 10 minutes is zero, or when the
      # enqueue counter has no series at all, for 30 minutes.
      - alert: HlcEnqueueRateZero
        # sum(rate(...)) yields an empty result when the metric has no series
        # (for example, if it is never exported), so "== 0" alone could never
        # match in that case; absent() covers it.
        # NOTE(review): absent() also matches while the scrape target is down;
        # acceptable at info severity, but confirm against routing.
        expr: (sum(rate(scheduler_hlc_enqueues_total[10m])) == 0) or absent(scheduler_hlc_enqueues_total)
        for: 30m
        labels:
          severity: info
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#no-enqueues
        annotations:
          summary: "No HLC enqueues in last 30 minutes"
          description: "No jobs have been enqueued with HLC timestamps in the last 30 minutes."
          impact: "May be expected if no jobs are scheduled, or may indicate HLC ordering is disabled."
          action: "Verify EnableHlcOrdering configuration if HLC ordering is expected."

      # Warning: Batch snapshot creation failing.
      # Fires when the batch snapshot failure counter grows at all within a
      # 5-minute window and the condition holds for 2 minutes.
      - alert: HlcBatchSnapshotFailures
        expr: increase(scheduler_batch_snapshot_failures_total[5m]) > 0
        for: 2m
        labels:
          severity: warning
          team: scheduler
          runbook: https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#batch-snapshot-failure
        annotations:
          summary: "Batch snapshot creation failures"
          # increase() extrapolates across the window, so $value is normally a
          # fractional number; render it as a whole count of failures.
          description: '{{ printf "%.0f" $value }} batch snapshot creation failures in the last 5 minutes.'
          impact: "DSSE-signed batch proofs may be missing for affected time ranges."
          action: "1. Check signing key availability. 2. Verify database connectivity. 3. Review batch size limits."

      # Critical: more than one distinct instance reports hlc_ticks_total under
      # the same node_id for 5 minutes (configuration error).
      - alert: HlcDuplicateNodeId
        expr: 'count by (node_id) (group by (node_id, instance) (hlc_ticks_total)) > 1'
        for: 5m
        labels:
          severity: critical
          team: scheduler
          runbook: 'https://docs.stellaops.internal/operations/runbooks/hlc-troubleshooting#duplicate-node-id'
        annotations:
          summary: 'Duplicate HLC node ID detected'
          description: >-
            Multiple instances are using node_id={{ $labels.node_id }}.
            This will cause ordering conflicts.
          impact: 'Critical: Job ordering and chain integrity will be compromised.'
          action: 'Immediately reconfigure affected instances with unique node IDs.'