# Ollama LLM Provider Configuration
# Documentation: https://ollama.ai/
#
# Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model: ollama pull llama3:8b
# Start server: ollama serve
#
# Ollama provides an easy way to run local LLMs with automatic GPU detection.

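# Verify the installation and list locally available models:
#   ollama list
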
# Provider metadata
provider:
  id: ollama
  name: Ollama
  description: Local LLM inference via Ollama

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set lower than cloud providers for local-first operation
priority: 20

# Server Configuration
server:
  # Ollama server URL
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

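# Quick reachability check against the endpoint above (GET /api/tags returns
# the locally available models), assuming the default baseUrl:
#   curl http://localhost:11434/api/tags
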
# Model Configuration
model:
  # Model to use (must be pulled first: ollama pull <model>)
  # Popular options:
  # - llama3:8b (8B params, good quality/speed balance)
  # - llama3:70b (70B params, higher quality, needs more RAM)
  # - mistral:7b (7B params, fast)
  # - mixtral:8x7b (MoE, good quality)
  # - codellama:7b (code-focused)
  # - phi3:mini (small, fast)
  name: "llama3:8b"

  # Fallback models
  fallbacks:
    - "mistral:7b"
    - "phi3:mini"

  # Keep model loaded in memory (reduces latency for repeated requests)
  keepAlive: "5m"

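# keepAlive presumably maps to Ollama's keep_alive request field, which accepts
# a duration string ("5m", "1h"), "0" to unload the model immediately after a
# request, or a negative value (e.g. "-1") to keep it loaded indefinitely.
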
# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited)
  numPredict: -1

  # Stop sequences
  stopSequences: []

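# For reference, Ollama exposes these settings as snake_case fields in the
# "options" object of its /api/generate and /api/chat endpoints. Assuming the
# client translates the keys above accordingly, an equivalent raw request is:
#   curl http://localhost:11434/api/generate -d '{
#     "model": "llama3:8b",
#     "prompt": "Hello",
#     "options": {
#       "temperature": 0, "seed": 42, "top_p": 1.0, "top_k": 40,
#       "repeat_penalty": 1.1, "num_ctx": 4096, "num_predict": -1
#     }
#   }'
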
# Request Configuration
request:
  # Request timeout (hh:mm:ss)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay (hh:mm:ss)
  retryDelay: "00:00:02"

# GPU Configuration
gpu:
  # Number of GPU layers to use (0 = CPU only)
  numGpu: 0

  # Use GPU for embedding (if available)
  useGpuForEmbedding: false

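# numGpu is assumed to map to Ollama's num_gpu option, i.e. how many model
# layers are offloaded to the GPU; when the value is not forced, Ollama
# auto-detects the GPU and offloads as many layers as fit in VRAM.
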
# Model Management
management:
  # Auto-pull model if not found
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

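# Models can also be pulled manually, via the CLI or the HTTP API:
#   ollama pull llama3:8b
#   curl http://localhost:11434/api/pull -d '{"name": "llama3:8b"}'
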
# Logging
logging:
  # Log full request/response bodies (may include prompts and completions)
  logBodies: false

  # Log token usage statistics
  logUsage: true