Refactor code structure and optimize performance across multiple modules
etc/llm-providers/llama-server.yaml.sample (new file, 96 lines)
@@ -0,0 +1,96 @@
# llama.cpp Server LLM Provider configuration template
# This is the PRIMARY provider for OFFLINE/AIRGAP deployments.
# Copy to llama-server.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Lower priority number = higher preference (10 = prefer over cloud providers)
enabled: true
priority: 10

# Server Configuration
server:
  # Base URL for llama.cpp server
  # Start llama.cpp with: llama-server -m model.gguf --host 0.0.0.0 --port 8080
  baseUrl: "http://localhost:8080"

  # API key if server requires authentication (--api-key flag)
  apiKey: ""

  # Health check endpoint
  healthEndpoint: "/health"

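  # Illustrative readiness check against the baseUrl and healthEndpoint above
  # (assumes the stock llama-server /health route):
  #   curl -fsS http://localhost:8080/health
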
# Model Configuration
model:
  # Model name (for logging and identification)
  name: "llama3-8b-q4km"

  # Path to model file (informational, model is loaded on server)
  modelPath: "/models/llama-3-8b-instruct.Q4_K_M.gguf"

  # Expected model digest (SHA-256) for verification
  # Ensures the correct model is loaded in airgap environments
  expectedDigest: ""

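  # Illustrative way to compute the digest for expectedDigest using standard
  # tooling (whether a bare hex string or a prefixed form is expected is not
  # specified in this sample):
  #   sha256sum /models/llama-3-8b-instruct.Q4_K_M.gguf
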
# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context length (must match server's -c flag)
  contextLength: 4096

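  # Illustrative mapping of the settings above onto a llama.cpp /completion
  # request body (field names assumed from the upstream llama-server API):
  #   { "temperature": 0.0, "seed": 42, "top_p": 1.0, "top_k": 40,
  #     "repeat_penalty": 1.1, "n_predict": 4096 }
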
# Request Configuration
request:
  # Request timeout, hh:mm:ss (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Bundle Configuration (for airgap deployments)
bundle:
  # Path to signed model bundle (.stellaops-model directory)
  # Created using: stella model bundle --sign
  bundlePath: ""

  # Verify bundle signature before loading
  verifySignature: true

  # Cryptographic scheme for verification
  # Options: ed25519, ecdsa-p256, gost3410, sm2
  cryptoScheme: "ed25519"

# Logging Configuration
logging:
  # Log health check results
  logHealthChecks: false

  # Log token usage statistics
  logUsage: true

# Performance Tuning
performance:
  # Number of threads for inference (-t flag on server)
  # 0 = auto-detect
  threads: 0

  # Batch size for prompt processing
  batchSize: 512

  # Number of parallel request contexts (server's -np flag)
  parallelContexts: 1

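# Illustrative llama-server invocation consistent with the settings in this file
# (flag names taken from upstream llama.cpp; -t is omitted so threads auto-detect,
# matching threads: 0 above):
#   llama-server -m /models/llama-3-8b-instruct.Q4_K_M.gguf \
#     --host 0.0.0.0 --port 8080 -c 4096 -b 512 -np 1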