# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove the .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = preferred over cloud providers, but after llama-server (10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for the Ollama server
  # The default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if the primary model fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep the model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use the model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size in tokens
  numCtx: 4096

  # Number of tokens to predict (-1 = no explicit limit; maxTokens applies instead)
  numPredict: -1

# GPU Configuration
gpu:
  # Number of layers to offload to the GPU (0 = CPU only)
  # -1 = offload all layers to the GPU
  numGpu: 0

# Request Configuration
request:
  # Request timeout as hh:mm:ss (local inference can be slow, so allow a generous window)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull the model if it is not found locally
  # WARNING: requires internet access; keep disabled for air-gapped deployments
  autoPull: false

  # Verify model integrity after pulling
  verifyPull: true

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true
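
# Quick Verification (reference only; comments are not parsed by the provider)
# A minimal sketch, assuming the default baseUrl above and the 'ollama' CLI on PATH:
#
#   curl http://localhost:11434/api/tags   # server reachable? returns the locally installed models
#   ollama pull llama3:8b                  # fetch the configured model manually if autoPull is disabled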
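
# API Mapping (reference only)
# A rough sketch of how the settings above could map onto a request to Ollama's
# /api/generate endpoint. Field names follow the public Ollama API; the exact
# payload this provider builds may differ, and "prompt"/"stream" are placeholders.
# num_predict is shown as maxTokens on the assumption that the provider
# substitutes it when numPredict is -1.
#
#   POST http://localhost:11434/api/generate
#   {
#     "model": "llama3:8b",
#     "prompt": "<your prompt>",
#     "stream": false,
#     "keep_alive": "5m",
#     "options": {
#       "temperature": 0.0,
#       "seed": 42,
#       "top_p": 1.0,
#       "top_k": 40,
#       "repeat_penalty": 1.1,
#       "num_ctx": 4096,
#       "num_predict": 4096,
#       "num_gpu": 0
#     }
#   }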