# Ollama LLM Provider Configuration
# Documentation: https://ollama.ai/
#
# Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model: ollama pull llama3:8b
# Start server: ollama serve
#
# Ollama provides an easy way to run local LLMs with automatic GPU detection.

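# Verify the installation and list locally available models:
#   ollama list
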
# Provider metadata
provider:
  id: ollama
  name: Ollama
  description: Local LLM inference via Ollama

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set lower than cloud providers for local-first operation
priority: 20

# Server Configuration
server:
  # Ollama server URL
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

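# Quick reachability check against the endpoint above (GET /api/tags returns
# the locally available models), assuming the default baseUrl:
#   curl http://localhost:11434/api/tags
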
# Model Configuration
model:
  # Model to use (must be pulled first: ollama pull <model>)
  # Popular options:
  # - llama3:8b (8B params, good quality/speed balance)
  # - llama3:70b (70B params, higher quality, needs more RAM)
  # - mistral:7b (7B params, fast)
  # - mixtral:8x7b (MoE, good quality)
  # - codellama:7b (code-focused)
  # - phi3:mini (small, fast)
  name: "llama3:8b"

  # Fallback models
  fallbacks:
    - "mistral:7b"
    - "phi3:mini"

  # Keep model loaded in memory (reduces latency for repeated requests)
  keepAlive: "5m"

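# keepAlive presumably maps to Ollama's keep_alive request field, which accepts
# a duration string ("5m", "1h"), "0" to unload the model immediately after a
# request, or a negative value (e.g. "-1") to keep it loaded indefinitely.
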
# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited)
  numPredict: -1

  # Stop sequences
  stopSequences: []

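# For reference, Ollama exposes these settings as snake_case fields in the
# "options" object of its /api/generate and /api/chat endpoints. Assuming the
# client translates the keys above accordingly, an equivalent raw request is:
#   curl http://localhost:11434/api/generate -d '{
#     "model": "llama3:8b",
#     "prompt": "Hello",
#     "options": {
#       "temperature": 0, "seed": 42, "top_p": 1.0, "top_k": 40,
#       "repeat_penalty": 1.1, "num_ctx": 4096, "num_predict": -1
#     }
#   }'
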
# Request Configuration
request:
  # Request timeout (hh:mm:ss)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay (hh:mm:ss)
  retryDelay: "00:00:02"

# GPU Configuration
gpu:
  # Number of GPU layers to use (0 = CPU only)
  numGpu: 0

  # Use GPU for embedding (if available)
  useGpuForEmbedding: false

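# numGpu is assumed to map to Ollama's num_gpu option, i.e. how many model
# layers are offloaded to the GPU; when the value is not forced, Ollama
# auto-detects the GPU and offloads as many layers as fit in VRAM.
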
# Model Management
management:
  # Auto-pull model if not found
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

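# Models can also be pulled manually, via the CLI or the HTTP API:
#   ollama pull llama3:8b
#   curl http://localhost:11434/api/pull -d '{"name": "llama3:8b"}'
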
# Logging
logging:
  # Log full request/response bodies (may include prompts and completions)
  logBodies: false

  # Log token usage statistics
  logUsage: true