# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
# llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
# llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.
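#
# Example launch matching the defaults in this file (a sketch; adjust the
# model path and flags for your deployment):
#   llama-server -m /path/to/model.gguf --host 0.0.0.0 --port 8080 -c 4096 --api-key your-secret-key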

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

  # Enable/disable this provider
  enabled: true

  # Priority for provider selection (lower = higher priority)
  # Set to 10 for offline-first deployments
  priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (only needed if the server requires authentication)
  # Start the server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"
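
  # Example: probe the health endpoint manually (assumes the default baseUrl
  # above and a running server):
  #   curl http://localhost:8080/health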

# Model Configuration
model:
  # Model identifier (used for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational; the model is loaded on the server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, model integrity is verified on connection
  expectedDigest: null
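
  # Example: compute the digest of the GGUF file referenced by modelPath
  # (standard coreutils; the path is illustrative):
  #   sha256sum /path/to/model.gguf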

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects the seed for deterministic output when temperature is 0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match the server's -c/--ctx-size parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []
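
  # Sketch of how these parameters map onto a request to the server's
  # OpenAI-compatible endpoint (field names follow OpenAI conventions; the
  # payload is illustrative):
  #   curl http://localhost:8080/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "local-llama", "temperature": 0, "max_tokens": 4096, "seed": 42,
  #          "messages": [{"role": "user", "content": "ping"}]}'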

# Request Configuration
request:
  # Request timeout as hh:mm:ss (local inference may be slower than hosted APIs)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational; the actual settings are applied when launching the server)
hardware:
  # Device for inference
  # Options: auto, cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0
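
  # These values are not pushed to the server; the equivalent llama.cpp flags
  # are passed when launching it, e.g.:
  #   llama-server -m model.gguf -ngl 99 -t 8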

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null # eidas, fips, gost, sm
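
  # Example for an airgapped install (the path is hypothetical; cryptoScheme
  # must be one of the values listed above):
  #   bundlePath: /var/lib/stella/bundles/llama-server
  #   verifySignature: true
  #   cryptoScheme: fips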

# Logging
logging:
  # Log full request/response bodies
  logBodies: false

  # Log token usage statistics
  logUsage: true

  # Log server health check results
  logHealthChecks: false