# git.stella-ops.org/etc/llm-providers/llama-server.yaml.sample

# llama.cpp Server LLM Provider configuration template
# This is the PRIMARY provider for OFFLINE/AIRGAP deployments.
# Copy to llama-server.yaml (remove .sample extension) and configure.

# Provider enabled state and priority.
# A lower priority number means a higher preference; 10 is chosen so that
# this provider is preferred over cloud providers.
enabled: true
priority: 10

# Server Configuration
server:
  # Base URL of the llama.cpp server.
  # Example launch command:
  #   llama-server -m model.gguf --host 0.0.0.0 --port 8080
  # NOTE(review): --host 0.0.0.0 listens on all network interfaces; when the
  # server runs on the same machine as this client, 127.0.0.1 may be
  # sufficient - confirm for your deployment.
  baseUrl: 'http://localhost:8080'

  # API key; needed only when the server requires authentication
  # (i.e. it was started with the --api-key flag).
  apiKey: ''

  # Endpoint used for health checks.
  healthEndpoint: '/health'

# Model Configuration
model:
  # Name of the model, used for logging and identification.
  name: 'llama3-8b-q4km'

  # Location of the model file. Informational only: the model itself is
  # loaded by the server.
  modelPath: '/models/llama-3-8b-instruct.Q4_K_M.gguf'

  # Expected SHA-256 digest of the model, used for verification.
  # Ensures the correct model is loaded in airgap environments.
  expectedDigest: ''

# Inference Parameters
inference:
  # Sampling temperature. 0 = deterministic output (REQUIRED for
  # reproducibility).
  temperature: 0.0

  # Maximum number of tokens to generate.
  # NOTE(review): this equals contextLength below. Presumably the prompt and
  # the generated tokens share one context window, which would leave no room
  # for the prompt at this value - confirm and lower if needed.
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism).
  seed: 42

  # Nucleus sampling threshold (top-p).
  topP: 1.0

  # Top-k sampling.
  topK: 40

  # Repeat penalty (1.0 = no penalty).
  repeatPenalty: 1.1

  # Context length. Must match the value passed to the server's -c flag.
  contextLength: 4096

# Request Configuration
request:
  # Per-request timeout. Kept long because local inference is slower.
  # NOTE(review): presumably formatted as hh:mm:ss (i.e. 5 minutes) - confirm
  # against the consuming code.
  timeout: '00:05:00'

  # Maximum number of retries when a request fails.
  maxRetries: 2

# Model Bundle Configuration (for airgap deployments)
bundle:
  # Location of the signed model bundle (a .stellaops-model directory).
  # Bundles are created using: stella model bundle --sign
  bundlePath: ''

  # Whether the bundle signature is verified before loading.
  verifySignature: true

  # Cryptographic scheme used for verification.
  # Options: ed25519, ecdsa-p256, gost3410, sm2
  cryptoScheme: 'ed25519'

# Logging Configuration
logging:
  # Whether health check results are logged.
  logHealthChecks: false

  # Whether token usage statistics are logged.
  logUsage: true

# Performance Tuning
performance:
  # Number of threads used for inference (corresponds to the server's -t
  # flag). 0 = auto-detect.
  threads: 0

  # Batch size for prompt processing.
  batchSize: 512

  # Contexts for parallel requests.
  # NOTE(review): the original comment called this a "context size", but the
  # key name and the value 1 suggest a count of parallel contexts - confirm
  # against the consuming code.
  parallelContexts: 1