# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove the .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = preferred over cloud providers, but after llama-server (10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for the Ollama server
  # The default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if the primary model fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep the model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use the model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size in tokens
  numCtx: 4096

  # Number of tokens to predict (-1 = no explicit limit; maxTokens applies instead)
  numPredict: -1

# GPU Configuration
gpu:
  # Number of layers to offload to the GPU (0 = CPU only)
  # -1 = offload all layers to the GPU
  numGpu: 0

# Request Configuration
request:
  # Request timeout as hh:mm:ss (local inference can be slow, so allow a generous window)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull the model if it is not found locally
  # WARNING: requires internet access; keep disabled for air-gapped deployments
  autoPull: false

  # Verify model integrity after pulling
  verifyPull: true

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true
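
# Quick Verification (reference only; comments are not parsed by the provider)
# A minimal sketch, assuming the default baseUrl above and the 'ollama' CLI on PATH:
#
#   curl http://localhost:11434/api/tags   # server reachable? returns the locally installed models
#   ollama pull llama3:8b                  # fetch the configured model manually if autoPull is disabled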
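
# API Mapping (reference only)
# A rough sketch of how the settings above could map onto a request to Ollama's
# /api/generate endpoint. Field names follow the public Ollama API; the exact
# payload this provider builds may differ, and "prompt"/"stream" are placeholders.
# num_predict is shown as maxTokens on the assumption that the provider
# substitutes it when numPredict is -1.
#
#   POST http://localhost:11434/api/generate
#   {
#     "model": "llama3:8b",
#     "prompt": "<your prompt>",
#     "stream": false,
#     "keep_alive": "5m",
#     "options": {
#       "temperature": 0.0,
#       "seed": 42,
#       "top_p": 1.0,
#       "top_k": 40,
#       "repeat_penalty": 1.1,
#       "num_ctx": 4096,
#       "num_predict": 4096,
#       "num_gpu": 0
#     }
#   }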