# llama.cpp Server LLM Provider configuration template
# This is the PRIMARY provider for OFFLINE/AIRGAP deployments.
# Copy to llama-server.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Lower priority number = higher preference (10 = prefer over cloud providers)
enabled: true
priority: 10

# Server Configuration
server:
  # Base URL for llama.cpp server
  # Start llama.cpp with: llama-server -m model.gguf --host 0.0.0.0 --port 8080
  # NOTE: --host 0.0.0.0 makes the server listen on all network interfaces;
  # use --host 127.0.0.1 when the client runs on the same machine.
  baseUrl: "http://localhost:8080"

  # API key if server requires authentication (--api-key flag)
  # Keep real keys out of version control.
  apiKey: ""

  # Health check endpoint (path relative to baseUrl)
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model name (for logging and identification)
  name: "llama3-8b-q4km"

  # Path to model file (informational, model is loaded on server)
  modelPath: "/models/llama-3-8b-instruct.Q4_K_M.gguf"

  # Expected model digest (SHA-256) for verification
  # Ensures the correct model is loaded in airgap environments
  # Keep the value quoted so a digit-only digest is never parsed as a number.
  expectedDigest: ""

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate
  # NOTE(review): this equals contextLength below; prompt and generated tokens
  # presumably share that window, so confirm there is room left for the prompt.
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context length (must match server's -c flag)
  contextLength: 4096

# Request Configuration
request:
  # Request timeout (longer for local inference)
  # Presumably hh:mm:ss (5 minutes here) - confirm against the consumer.
  # Must stay quoted: YAML 1.1 parsers read bare digits-and-colons as a
  # sexagesimal number.
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Bundle Configuration (for airgap deployments)
bundle:
  # Path to signed model bundle (.stellaops-model directory)
  # Created using: stella model bundle --sign
  bundlePath: ""

  # Verify bundle signature before loading
  verifySignature: true

  # Cryptographic scheme for verification
  # Options: ed25519, ecdsa-p256, gost3410, sm2
  cryptoScheme: "ed25519"

# Logging Configuration
logging:
  # Log health check results
  logHealthChecks: false

  # Log token usage statistics
  logUsage: true

# Performance Tuning
performance:
  # Number of threads for inference (-t flag on server)
  # 0 = auto-detect
  threads: 0

  # Batch size for prompt processing
  batchSize: 512

  # Context size for parallel requests
  # NOTE(review): the key name suggests a count of parallel contexts rather
  # than a size - confirm against the consumer.
  parallelContexts: 1