feat: Implement runner execution pipeline with planner dispatch and execution services
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				Docs CI / lint-and-preview (push) Has been cancelled
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	Docs CI / lint-and-preview (push) Has been cancelled
				
			- Introduced RunnerBackgroundService to handle execution of runner segments.
			- Added RunnerExecutionService for processing segments and aggregating results.
			- Implemented PlannerQueueDispatchService to manage dispatching of planner messages.
			- Created PlannerQueueDispatcherBackgroundService for leasing and processing planner queue messages.
			- Developed ScannerReportClient for interacting with the scanner service.
			- Enhanced observability with SchedulerWorkerMetrics for tracking planner and runner performance.
			- Added comprehensive documentation for the new runner execution pipeline and observability metrics.
			- Implemented event emission for rescan activity and scanner report readiness.
This commit is contained in:
		
							
								
								
									
										42
									
								
								docs/ops/scheduler-worker-prometheus-rules.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								docs/ops/scheduler-worker-prometheus-rules.yaml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,42 @@ | ||||
| # Alerting rules built on the scheduler worker's planner and runner metrics. | ||||
| groups: | ||||
|   - name: scheduler-worker | ||||
|     # How often the rules in this group are evaluated. | ||||
|     interval: 30s | ||||
|     rules: | ||||
|       # Share of planner runs with status="failed" among all planner runs, | ||||
|       # both measured as 5m rates. Fires once the share has stayed above | ||||
|       # 0.05 for 10 minutes. | ||||
|       - alert: SchedulerPlannerFailuresHigh | ||||
|         expr: >- | ||||
|           sum(rate(scheduler_planner_runs_total{status="failed"}[5m])) | ||||
|           / | ||||
|           sum(rate(scheduler_planner_runs_total[5m])) > 0.05 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|           service: scheduler-worker | ||||
|         annotations: | ||||
|           summary: 'Planner failure ratio above 5%' | ||||
|           description: >- | ||||
|             More than 5% of planning runs are failing. | ||||
|             Inspect scheduler logs and ImpactIndex connectivity | ||||
|             before queues back up. | ||||
|       # 95th-percentile planner latency, estimated from the 5m rate of the | ||||
|       # latency histogram buckets aggregated by `le`. Fires once it has | ||||
|       # stayed above 45 for 10 minutes. | ||||
|       - alert: SchedulerPlannerLatencyHigh | ||||
|         expr: >- | ||||
|           histogram_quantile(0.95, | ||||
|           sum by (le) (rate(scheduler_planner_latency_seconds_bucket[5m]))) | ||||
|           > 45 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           service: scheduler-worker | ||||
|         annotations: | ||||
|           summary: 'Planner latency p95 above 45s' | ||||
|           description: >- | ||||
|             Planning latency p95 stayed above 45 seconds for 10 minutes. | ||||
|             Check ImpactIndex, Mongo, or external selectors | ||||
|             to prevent missed SLAs. | ||||
|       # Highest runner backlog sample seen in the trailing 15m window. | ||||
|       # Fires once that maximum has stayed above 500 for 15 minutes. | ||||
|       # NOTE(review): max_over_time keeps the condition true for the whole | ||||
|       # window after a single sample above the threshold - confirm this | ||||
|       # sensitivity is intended. | ||||
|       - alert: SchedulerRunnerBacklogGrowing | ||||
|         expr: >- | ||||
|           max_over_time(scheduler_runner_backlog[15m]) | ||||
|           > 500 | ||||
|         for: 15m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           service: scheduler-worker | ||||
|         annotations: | ||||
|           summary: 'Runner backlog above 500 images' | ||||
|           description: >- | ||||
|             Runner backlog exceeded 500 images over the last 15 minutes. | ||||
|             Verify runner workers, scanner availability, and rate limits. | ||||
|       # Fires when the total number of active runs is non-zero and has not | ||||
|       # changed at all during the trailing 30m window, for 30 minutes. | ||||
|       # | ||||
|       # Both operands of `and` must carry the same label set, otherwise the | ||||
|       # result is always empty. `sum(...)` has no labels, so the flatness | ||||
|       # check is computed over the same summed series via a subquery | ||||
|       # (`sum(...)[30m:]`) rather than over the raw, labelled series. | ||||
|       - alert: SchedulerRunStuck | ||||
|         expr: >- | ||||
|           sum(scheduler_runs_active) > 0 | ||||
|           and | ||||
|           max_over_time(sum(scheduler_runs_active)[30m:]) | ||||
|           == | ||||
|           min_over_time(sum(scheduler_runs_active)[30m:]) | ||||
|         for: 30m | ||||
|         labels: | ||||
|           severity: warning | ||||
|           service: scheduler-worker | ||||
|         annotations: | ||||
|           summary: "Scheduler runs stuck without progress" | ||||
|           description: "Active runs count has remained flat for 30 minutes. Investigate stuck segments or scanner timeouts." | ||||
		Reference in New Issue
	
	Block a user