# Ollama LLM Provider Configuration
# Documentation: https://ollama.ai/
#
# Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model:   ollama pull llama3:8b
# Start server:   ollama serve
#
# Ollama provides an easy way to run local LLMs with automatic GPU detection.

# Provider metadata
provider:
  id: ollama
  name: Ollama
  description: Local LLM inference via Ollama

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set lower than cloud providers for local-first operation
priority: 20

# Server Configuration
server:
  # Ollama server URL
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Model to use (must be pulled first: ollama pull <model>)
  # Popular options:
  #   - llama3:8b    (8B params, good quality/speed balance)
  #   - llama3:70b   (70B params, higher quality, needs more RAM)
  #   - mistral:7b   (7B params, fast)
  #   - mixtral:8x7b (MoE, good quality)
  #   - codellama:7b (code-focused)
  #   - phi3:mini    (small, fast)
  name: "llama3:8b"

  # Fallback models
  fallbacks:
    - "mistral:7b"
    - "phi3:mini"

  # Keep model loaded in memory (reduces latency for repeated requests)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited)
  numPredict: -1

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# GPU Configuration
gpu:
  # Number of model layers to offload to the GPU (0 = CPU only)
  numGpu: 0

  # Use GPU for embedding (if available)
  useGpuForEmbedding: false

# Model Management
management:
  # Auto-pull model if not found
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging
logging:
  logBodies: false
  logUsage: true
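
# ---------------------------------------------------------------------------
# Usage notes (examples only; not read by the provider)
# ---------------------------------------------------------------------------

# To verify that the server and health endpoint configured above are reachable,
# you can query the same /api/tags route the health check uses; it returns the
# locally pulled models as JSON (assumes Ollama's default port 11434):
#
#   curl http://localhost:11434/api/tags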
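
# For a sense of how the inference settings map onto a request, below is a
# hand-written sketch against Ollama's /api/generate endpoint using Ollama's
# native option names (temperature, seed, top_p, top_k, repeat_penalty,
# num_ctx, num_predict). The camelCase keys in this file are this provider's
# names for the same settings; the exact translation is done by whatever
# client consumes this file, so treat the shape below as illustrative:
#
#   curl http://localhost:11434/api/generate -d '{
#     "model": "llama3:8b",
#     "prompt": "Why is the sky blue?",
#     "stream": false,
#     "keep_alive": "5m",
#     "options": {
#       "temperature": 0,
#       "seed": 42,
#       "top_p": 1.0,
#       "top_k": 40,
#       "repeat_penalty": 1.1,
#       "num_ctx": 4096,
#       "num_predict": -1
#     }
#   }'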
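
# To manage models by hand (useful since autoPull is false above), the Ollama
# CLI covers the common cases:
#
#   ollama pull <model>    # download a model, e.g. ollama pull mistral:7b
#   ollama list            # show locally available models
#   ollama ps              # show loaded models and whether they run on CPU or GPU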