Align AOC tasks for Excititor and Concelier
This commit is contained in:
		@@ -1,42 +1,42 @@
 | 
			
		||||
# Alerting rules for the scheduler worker.
groups:
  - name: scheduler-worker
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
 | 
			
		||||
      # Critical when failed planner runs exceed 5% of all planner runs
      # (5-minute rates) and the condition holds for 10 minutes.
      - alert: SchedulerPlannerFailuresHigh
        # Folded scalar: the lines below join into a single PromQL expression.
        expr: >-
          sum(rate(scheduler_planner_runs_total{status="failed"}[5m]))
          /
          sum(rate(scheduler_planner_runs_total[5m])) > 0.05
        for: 10m
        labels:
          severity: critical
          service: scheduler-worker
        annotations:
          summary: "Planner failure ratio above 5%"
          description: >-
            More than 5% of planning runs are failing. Inspect scheduler logs
            and ImpactIndex connectivity before queues back up.
 | 
			
		||||
      # Warning when the p95 planner latency, estimated from 5-minute histogram
      # bucket rates, stays above 45 for 10 minutes.
      - alert: SchedulerPlannerLatencyHigh
        # Folded scalar: the lines below join into a single PromQL expression.
        expr: >-
          histogram_quantile(0.95,
          sum by (le) (rate(scheduler_planner_latency_seconds_bucket[5m])))
          > 45
        for: 10m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Planner latency p95 above 45s"
          description: >-
            Planning latency p95 stayed above 45 seconds for 10 minutes. Check
            ImpactIndex, Mongo, or external selectors to prevent missed SLAs.
 | 
			
		||||
      # Warning when the 15-minute maximum of the runner backlog is above 500
      # and that condition holds for 15 minutes.
      # NOTE(review): max_over_time keeps the window maximum, so one sample
      # above 500 keeps this expression true until it leaves the 15m window.
      # Confirm that spike sensitivity is intended; min_over_time would
      # require a sustained backlog instead.
      - alert: SchedulerRunnerBacklogGrowing
        expr: >-
          max_over_time(scheduler_runner_backlog[15m]) > 500
        for: 15m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Runner backlog above 500 images"
          description: >-
            Runner backlog exceeded 500 images over the last 15 minutes. Verify
            runner workers, scanner availability, and rate limits.
 | 
			
		||||
      # Warning when there are active runs and the active-run gauge of every
      # series has been flat (window max equals window min) over 30 minutes.
      #
      # Both operands of `and` are aggregated with sum() so they share the
      # same (empty) label set. Without the aggregation on the right-hand
      # side, `and` compares the label-less sum() result against per-series
      # label sets, never finds a match, and the alert can never fire.
      # Since max >= min for each series, the sums are equal only when every
      # series is flat.
      - alert: SchedulerRunStuck
        # Folded scalar: the lines below join into a single PromQL expression.
        expr: >-
          sum(scheduler_runs_active) > 0
          and
          sum(max_over_time(scheduler_runs_active[30m]))
          == sum(min_over_time(scheduler_runs_active[30m]))
        # The condition must keep holding for 30m of evaluations, in addition
        # to the 30m look-back window used in the expression.
        for: 30m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Scheduler runs stuck without progress"
          description: >-
            Active runs count has remained flat for 30 minutes. Investigate
            stuck segments or scanner timeouts.
 | 
			
		||||
# Alerting rules for the scheduler worker.
groups:
  - name: scheduler-worker
    # Evaluation interval for every rule in this group.
    interval: 30s
    rules:
 | 
			
		||||
      # Critical when failed planner runs exceed 5% of all planner runs
      # (5-minute rates) and the condition holds for 10 minutes.
      - alert: SchedulerPlannerFailuresHigh
        # Folded scalar: the lines below join into a single PromQL expression.
        expr: >-
          sum(rate(scheduler_planner_runs_total{status="failed"}[5m]))
          /
          sum(rate(scheduler_planner_runs_total[5m])) > 0.05
        for: 10m
        labels:
          severity: critical
          service: scheduler-worker
        annotations:
          summary: "Planner failure ratio above 5%"
          description: >-
            More than 5% of planning runs are failing. Inspect scheduler logs
            and ImpactIndex connectivity before queues back up.
 | 
			
		||||
      # Warning when the p95 planner latency, estimated from 5-minute histogram
      # bucket rates, stays above 45 for 10 minutes.
      - alert: SchedulerPlannerLatencyHigh
        # Folded scalar: the lines below join into a single PromQL expression.
        expr: >-
          histogram_quantile(0.95,
          sum by (le) (rate(scheduler_planner_latency_seconds_bucket[5m])))
          > 45
        for: 10m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Planner latency p95 above 45s"
          description: >-
            Planning latency p95 stayed above 45 seconds for 10 minutes. Check
            ImpactIndex, Mongo, or external selectors to prevent missed SLAs.
 | 
			
		||||
      # Warning when the 15-minute maximum of the runner backlog is above 500
      # and that condition holds for 15 minutes.
      # NOTE(review): max_over_time keeps the window maximum, so one sample
      # above 500 keeps this expression true until it leaves the 15m window.
      # Confirm that spike sensitivity is intended; min_over_time would
      # require a sustained backlog instead.
      - alert: SchedulerRunnerBacklogGrowing
        expr: >-
          max_over_time(scheduler_runner_backlog[15m]) > 500
        for: 15m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Runner backlog above 500 images"
          description: >-
            Runner backlog exceeded 500 images over the last 15 minutes. Verify
            runner workers, scanner availability, and rate limits.
 | 
			
		||||
      # Warning when there are active runs and the active-run gauge of every
      # series has been flat (window max equals window min) over 30 minutes.
      #
      # Both operands of `and` are aggregated with sum() so they share the
      # same (empty) label set. Without the aggregation on the right-hand
      # side, `and` compares the label-less sum() result against per-series
      # label sets, never finds a match, and the alert can never fire.
      # Since max >= min for each series, the sums are equal only when every
      # series is flat.
      - alert: SchedulerRunStuck
        # Folded scalar: the lines below join into a single PromQL expression.
        expr: >-
          sum(scheduler_runs_active) > 0
          and
          sum(max_over_time(scheduler_runs_active[30m]))
          == sum(min_over_time(scheduler_runs_active[30m]))
        # The condition must keep holding for 30m of evaluations, in addition
        # to the 30m look-back window used in the expression.
        for: 30m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Scheduler runs stuck without progress"
          description: >-
            Active runs count has remained flat for 30 minutes. Investigate
            stuck segments or scanner timeouts.
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user