Refactor code structure and optimize performance across multiple modules
etc/llm-providers/ollama.yaml.sample (new file, 87 lines)
@@ -0,0 +1,87 @@
# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = prefer over cloud, but after llama-server (10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for Ollama server
  # Default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited, use maxTokens)
  numPredict: -1

# GPU Configuration
gpu:
  # Number of GPU layers to offload (0 = CPU only)
  # -1 = offload all layers to GPU
  numGpu: 0

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull model if not found locally
  # WARNING: Requires internet access, disable for airgap
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true
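For context, here is a minimal sketch (not part of this commit) of how a provider implementation might translate these settings into an Ollama /api/generate request. The function name, the etc/llm-providers/ollama.yaml path, and the PyYAML/requests dependencies are illustrative assumptions; the request-body fields (model, prompt, stream, keep_alive) and the option names (temperature, seed, top_p, top_k, repeat_penalty, num_ctx, num_predict, num_gpu) are standard Ollama API parameters.

import yaml      # PyYAML, assumed available
import requests  # assumed available

def generate(prompt: str, config_path: str = "etc/llm-providers/ollama.yaml") -> str:
    # Load the provider config produced from the sample above.
    with open(config_path) as fh:
        cfg = yaml.safe_load(fh)

    inf = cfg["inference"]
    body = {
        "model": cfg["model"]["name"],
        "prompt": prompt,
        "stream": False,
        "keep_alive": cfg["model"]["keepAlive"],
        # Ollama expects snake_case option names, unlike the camelCase YAML keys.
        "options": {
            "temperature": inf["temperature"],
            "seed": inf["seed"],
            "top_p": inf["topP"],
            "top_k": inf["topK"],
            "repeat_penalty": inf["repeatPenalty"],
            "num_ctx": inf["numCtx"],
            "num_predict": inf["numPredict"],
            "num_gpu": cfg["gpu"]["numGpu"],
        },
    }
    # 300 seconds mirrors the "00:05:00" request timeout in the sample;
    # retry and fallback-model handling are left out of this sketch.
    resp = requests.post(f'{cfg["server"]["baseUrl"]}/api/generate', json=body, timeout=300)
    resp.raise_for_status()
    return resp.json()["response"]

With temperature 0.0 and a fixed seed, repeated calls with the same model and prompt should produce identical output, which is the reproducibility goal the sample's comments call out.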