# llama.cpp Server LLM Provider configuration template
# This is the PRIMARY provider for OFFLINE/AIRGAP deployments.
# Copy to llama-server.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Lower priority number = higher preference (10 = prefer over cloud providers)
enabled: true
priority: 10

# Server Configuration
server:
  # Base URL for the llama.cpp server
  # Start llama.cpp with: llama-server -m model.gguf --host 0.0.0.0 --port 8080
  baseUrl: "http://localhost:8080"

  # API key if the server requires authentication (--api-key flag)
  apiKey: ""

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model name (for logging and identification)
  name: "llama3-8b-q4km"

  # Path to the model file (informational; the model is loaded by the server)
  modelPath: "/models/llama-3-8b-instruct.Q4_K_M.gguf"

  # Expected model digest (SHA-256) for verification
  # Ensures the correct model is loaded in airgap environments
  expectedDigest: ""

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context length (must match the server's -c flag)
  contextLength: 4096

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Bundle Configuration (for airgap deployments)
bundle:
  # Path to the signed model bundle (.stellaops-model directory)
  # Created using: stella model bundle --sign
  bundlePath: ""

  # Verify the bundle signature before loading
  verifySignature: true

  # Cryptographic scheme for verification
  # Options: ed25519, ecdsa-p256, gost3410, sm2
  cryptoScheme: "ed25519"

# Logging Configuration
logging:
  # Log health check results
  logHealthChecks: false

  # Log token usage statistics
  logUsage: true

# Performance Tuning
performance:
  # Number of threads for inference (-t flag on the server)
  # 0 = auto-detect
  threads: 0

  # Batch size for prompt processing
  batchSize: 512

  # Number of contexts for parallel requests
  parallelContexts: 1
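
# --- Example: starting llama.cpp to match this configuration ---
# Illustrative only; flag names and defaults can vary between llama.cpp
# releases, so verify them against `llama-server --help` for your build.
# The model path, port, context size, batch size, and parallel slot count
# below mirror the sample values above.
#
#   llama-server \
#     -m /models/llama-3-8b-instruct.Q4_K_M.gguf \
#     --host 0.0.0.0 --port 8080 \
#     -c 4096 -b 512 --parallel 1 \
#     --api-key "<your-api-key>"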
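
# --- Example: checking server health before enabling this provider ---
# The path matches server.healthEndpoint above; a plain curl call is enough
# to confirm the server is up and the model has loaded. The exact response
# body varies by llama.cpp version, so treat its shape as an assumption.
#
#   curl -fsS http://localhost:8080/health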
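
# --- Example: computing model.expectedDigest for airgap verification ---
# A minimal sketch using standard coreutils, run on the machine that prepares
# the model for transfer. The exact format this provider expects in
# expectedDigest (bare hex vs. a "sha256:" prefix) is an assumption to
# confirm against the provider documentation.
#
#   sha256sum /models/llama-3-8b-instruct.Q4_K_M.gguf
#   # prints: <64-character hex digest>  <path>
#   # copy the hex digest into model.expectedDigest above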