git.stella-ops.org/etc/llm-providers/ollama.yaml.sample

# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove the .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = preferred over cloud providers, but after llama-server (priority 10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for the Ollama server
  # Default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"
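  # To sanity-check connectivity before enabling this provider, the health
  # endpoint can be queried directly (example assumes the default local
  # install on port 11434):
  #   curl http://localhost:11434/api/tags
  # A JSON list of locally available models indicates the server is up.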

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if the primary fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"
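  # The configured models must already exist locally unless autoPull (below)
  # is enabled. A typical workflow, assuming the Ollama CLI is installed on
  # the host:
  #   ollama list              # show locally available models
  #   ollama pull llama3:8b    # fetch the primary model if it is missing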

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited, use maxTokens above)
  numPredict: -1
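  # For reference, these settings broadly correspond to Ollama's generation
  # options; the exact mapping is provider-specific, but the field names in
  # the sketch below follow the Ollama REST API:
  #   curl http://localhost:11434/api/generate -d '{
  #     "model": "llama3:8b",
  #     "prompt": "ping",
  #     "options": { "temperature": 0, "seed": 42, "top_p": 1.0,
  #                  "top_k": 40, "repeat_penalty": 1.1,
  #                  "num_ctx": 4096, "num_predict": -1 }
  #   }'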

# GPU Configuration
gpu:
  # Number of GPU layers to offload (0 = CPU only)
  # -1 = offload all layers to GPU
  numGpu: 0
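  # Whether layers actually land on the GPU can be checked after a request
  # with the Ollama CLI (assuming it is installed on the host):
  #   ollama ps    # shows loaded models and their CPU/GPU placement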

# Request Configuration
request:
  # Request timeout in hh:mm:ss (kept long because local inference can be slow)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull the model if it is not found locally
  # WARNING: requires internet access; keep disabled for air-gapped deployments
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true
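  # For air-gapped deployments, keep autoPull disabled and stage models on a
  # connected host instead (example, assuming the Ollama CLI is available):
  #   ollama pull llama3:8b
  # then move the downloaded model data to the air-gapped host through your
  # normal offline transfer process.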

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true