---
# Prometheus alerting rules for the scheduler-worker service.
#
# One rule group, evaluated every 30s, containing four alerts:
#   - SchedulerPlannerFailuresHigh  (critical) failed / total planner runs
#   - SchedulerPlannerLatencyHigh   (warning)  p95 planner latency
#   - SchedulerRunnerBacklogGrowing (warning)  runner backlog threshold
#   - SchedulerRunStuck             (warning)  active-run count not changing
groups:
  - name: scheduler-worker
    interval: 30s
    rules:
      # Ratio of failed planner runs to all planner runs, both taken as
      # 5m rates summed across every series. Fires after the ratio has
      # stayed above 0.05 for 10 minutes.
      - alert: SchedulerPlannerFailuresHigh
        expr: >-
          sum(rate(scheduler_planner_runs_total{status="failed"}[5m]))
          /
          sum(rate(scheduler_planner_runs_total[5m]))
          > 0.05
        for: 10m
        labels:
          severity: critical
          service: scheduler-worker
        annotations:
          summary: "Planner failure ratio above 5%"
          description: >-
            More than 5% of planning runs are failing.
            Inspect scheduler logs and ImpactIndex connectivity
            before queues back up.

      # 95th percentile of the planner latency histogram, computed from
      # 5m bucket rates aggregated by `le` only (all other labels are
      # summed away). Fires after p95 has stayed above 45 for 10 minutes.
      - alert: SchedulerPlannerLatencyHigh
        expr: >-
          histogram_quantile(0.95,
          sum by (le) (rate(scheduler_planner_latency_seconds_bucket[5m])))
          > 45
        for: 10m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Planner latency p95 above 45s"
          description: >-
            Planning latency p95 stayed above 45 seconds for 10 minutes.
            Check ImpactIndex, Mongo, or external selectors
            to prevent missed SLAs.

      # Per-series check: the highest backlog sample seen in the trailing
      # 15m window is above 500. Because max_over_time is used, a single
      # sample above 500 keeps the condition true for the next 15 minutes.
      - alert: SchedulerRunnerBacklogGrowing
        expr: max_over_time(scheduler_runner_backlog[15m]) > 500
        for: 15m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Runner backlog above 500 images"
          description: >-
            Runner backlog exceeded 500 images over the last 15 minutes.
            Verify runner workers, scanner availability, and rate limits.

      # Total active runs is non-zero AND that total has not changed over
      # the trailing 30m window (its max equals its min).
      #
      # Both sides of `and` must carry the same label set to match. The
      # flatness check is therefore run on the same aggregated total via a
      # subquery, so each side has an empty label set. Comparing the
      # label-less sum() against per-series max/min_over_time results would
      # never match for series that carry job/instance labels, and the
      # alert could not fire.
      #
      # The 30m window plus `for: 30m` means the condition has to keep
      # holding for a further 30 minutes before the alert fires.
      - alert: SchedulerRunStuck
        expr: >-
          sum(scheduler_runs_active) > 0
          and
          max_over_time(sum(scheduler_runs_active)[30m:])
          == min_over_time(sum(scheduler_runs_active)[30m:])
        for: 30m
        labels:
          severity: warning
          service: scheduler-worker
        annotations:
          summary: "Scheduler runs stuck without progress"
          description: >-
            Active runs count has remained flat for 30 minutes.
            Investigate stuck segments or scanner timeouts.