# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
#   llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
#   llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

  # Enable/disable this provider
  enabled: true

  # Priority for provider selection (lower = higher priority)
  # Set to 10 for offline-first deployments
  priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (if server requires authentication)
  # Start server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model identifier (for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational - model loaded on server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, verify model integrity on connection
  expectedDigest: null

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects seed for deterministic output when temp=0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match server -c parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout (local inference may be slower)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational - actual settings on server)
hardware:
  # Device for inference
  # Options: cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null  # eidas, fips, gost, sm

# Logging
logging:
  logBodies: false
  logUsage: true

  # Log server health check results
  logHealthChecks: false
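
# Quick connectivity check (example only): the commands below are a minimal sketch
# assuming the default baseUrl above and the llama.cpp server's built-in /health and
# OpenAI-compatible /v1/chat/completions endpoints; adjust host/port to your deployment.
#
#   curl http://localhost:8080/health
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "local-llama", "messages": [{"role": "user", "content": "ping"}], "temperature": 0, "seed": 42}'
#
# If apiKey is set, also pass: -H "Authorization: Bearer your-secret-key"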