Refactor code structure and optimize performance across multiple modules
etc/llm-providers/ollama.yaml.sample (new file, 87 lines)
@@ -0,0 +1,87 @@
# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = prefer over cloud, but after llama-server (10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for Ollama server
  # Default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited, use maxTokens)
  numPredict: -1

# GPU Configuration
gpu:
  # Number of GPU layers to offload (0 = CPU only)
  # -1 = offload all layers to GPU
  numGpu: 0

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull model if not found locally
  # WARNING: Requires internet access, disable for airgap
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true
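For context, here is a minimal sketch (not part of this commit) of how a provider implementation might translate these settings into an Ollama /api/generate request. The function name, the etc/llm-providers/ollama.yaml path, and the PyYAML/requests dependencies are illustrative assumptions; the request-body fields (model, prompt, stream, keep_alive) and the option names (temperature, seed, top_p, top_k, repeat_penalty, num_ctx, num_predict, num_gpu) are standard Ollama API parameters.

import yaml      # PyYAML, assumed available
import requests  # assumed available

def generate(prompt: str, config_path: str = "etc/llm-providers/ollama.yaml") -> str:
    # Load the provider config produced from the sample above.
    with open(config_path) as fh:
        cfg = yaml.safe_load(fh)

    inf = cfg["inference"]
    body = {
        "model": cfg["model"]["name"],
        "prompt": prompt,
        "stream": False,
        "keep_alive": cfg["model"]["keepAlive"],
        # Ollama expects snake_case option names, unlike the camelCase YAML keys.
        "options": {
            "temperature": inf["temperature"],
            "seed": inf["seed"],
            "top_p": inf["topP"],
            "top_k": inf["topK"],
            "repeat_penalty": inf["repeatPenalty"],
            "num_ctx": inf["numCtx"],
            "num_predict": inf["numPredict"],
            "num_gpu": cfg["gpu"]["numGpu"],
        },
    }
    # 300 seconds mirrors the "00:05:00" request timeout in the sample;
    # retry and fallback-model handling are left out of this sketch.
    resp = requests.post(f'{cfg["server"]["baseUrl"]}/api/generate', json=body, timeout=300)
    resp.raise_for_status()
    return resp.json()["response"]

With temperature 0.0 and a fixed seed, repeated calls with the same model and prompt should produce identical output, which is the reproducibility goal the sample's comments call out.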