# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
#   llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
#   llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.
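
# Example: a single startup command matching the defaults in this file
# (illustrative; adjust the model path, and drop --api-key if authentication
# is not required):
#   llama-server -m /path/to/model.gguf --host 0.0.0.0 --port 8080 -c 4096 --api-key your-secret-key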

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set to 10 for offline-first deployments
priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (if the server requires authentication)
  # Start the server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"
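
# Quick connectivity check against the configured health endpoint (illustrative):
#   curl http://localhost:8080/health
# If the server was started with --api-key, authenticated endpoints expect a
# Bearer token, e.g.:
#   curl -H "Authorization: Bearer your-secret-key" http://localhost:8080/v1/models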

# Model Configuration
model:
  # Model identifier (for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational - model loaded on server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, verify model integrity on connection
  expectedDigest: null
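
# The expected digest can be computed from the model file on disk (illustrative):
#   sha256sum /path/to/model.gguf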

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects seed for deterministic output when temperature is 0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match server -c parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []
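
# These settings map onto llama.cpp's native /completion endpoint roughly as
# follows (illustrative request; field names are the server's, not this file's):
#   curl http://localhost:8080/completion -d '{
#     "prompt": "Hello",
#     "temperature": 0,
#     "n_predict": 4096,
#     "seed": 42,
#     "top_p": 1.0,
#     "top_k": 40,
#     "repeat_penalty": 1.1,
#     "stop": []
#   }'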

# Request Configuration
request:
  # Request timeout (local inference may be slower)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational - actual settings on server)
hardware:
  # Device for inference
  # Options: cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0
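
# The corresponding knobs are passed to the server at launch (illustrative; the
# actual device is determined by which llama.cpp build/backend is installed):
#   llama-server -m model.gguf --port 8080 -t 8 -ngl 35
# where -t sets the number of CPU threads and -ngl sets the number of layers
# offloaded to the GPU.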

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null # eidas, fips, gost, sm
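
# Hypothetical bundle layout for offline deployments (assumption - the exact
# layout and manifest format are defined by the bundling tool, not llama.cpp):
#   /opt/models/bundle/
#     model.gguf
#     manifest.json   # model name, path, SHA-256 digest
#     manifest.sig    # detached signature checked when verifySignature: true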

# Logging
logging:
  logBodies: false
  logUsage: true

  # Log server health check results
  logHealthChecks: false