# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
#   llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
#   llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.
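
# Example: a single startup command matching the defaults in this file
# (illustrative; adjust the model path, and drop --api-key if authentication
# is not required):
#   llama-server -m /path/to/model.gguf --host 0.0.0.0 --port 8080 -c 4096 --api-key your-secret-key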

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set to 10 for offline-first deployments
priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (if the server requires authentication)
  # Start the server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"
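
# Quick connectivity check against the configured health endpoint (illustrative):
#   curl http://localhost:8080/health
# If the server was started with --api-key, authenticated endpoints expect a
# Bearer token, e.g.:
#   curl -H "Authorization: Bearer your-secret-key" http://localhost:8080/v1/models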

# Model Configuration
model:
  # Model identifier (for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational - model loaded on server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, verify model integrity on connection
  expectedDigest: null
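
# The expected digest can be computed from the model file on disk (illustrative):
#   sha256sum /path/to/model.gguf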

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects seed for deterministic output when temperature is 0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match server -c parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []
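
# These settings map onto llama.cpp's native /completion endpoint roughly as
# follows (illustrative request; field names are the server's, not this file's):
#   curl http://localhost:8080/completion -d '{
#     "prompt": "Hello",
#     "temperature": 0,
#     "n_predict": 4096,
#     "seed": 42,
#     "top_p": 1.0,
#     "top_k": 40,
#     "repeat_penalty": 1.1,
#     "stop": []
#   }'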

# Request Configuration
request:
  # Request timeout (local inference may be slower)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational - actual settings on server)
hardware:
  # Device for inference
  # Options: cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0
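
# The corresponding knobs are passed to the server at launch (illustrative; the
# actual device is determined by which llama.cpp build/backend is installed):
#   llama-server -m model.gguf --port 8080 -t 8 -ngl 35
# where -t sets the number of CPU threads and -ngl sets the number of layers
# offloaded to the GPU.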

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null # eidas, fips, gost, sm
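
# Hypothetical bundle layout for offline deployments (assumption - the exact
# layout and manifest format are defined by the bundling tool, not llama.cpp):
#   /opt/models/bundle/
#     model.gguf
#     manifest.json   # model name, path, SHA-256 digest
#     manifest.sig    # detached signature checked when verifySignature: true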

# Logging
logging:
  logBodies: false
  logUsage: true

  # Log server health check results
  logHealthChecks: false