Refactor code structure and optimize performance across multiple modules
etc/llm-providers/claude.yaml.sample (new file, 81 lines)
@@ -0,0 +1,81 @@
# Claude (Anthropic) LLM Provider configuration template
# Copy to claude.yaml (remove .sample extension) and configure.
# Environment variable ANTHROPIC_API_KEY can be used instead of api.apiKey.

# Provider enabled state and priority (lower = higher priority)
enabled: true
priority: 100

# API Configuration
api:
  # API key - use environment variable reference or set directly
  # Environment variable: ANTHROPIC_API_KEY
  apiKey: "${ANTHROPIC_API_KEY}"

  # Base URL for API requests
  baseUrl: "https://api.anthropic.com"

  # API version header
  apiVersion: "2023-06-01"

# Model Configuration
model:
  # Primary model name
  # Options: claude-sonnet-4-20250514, claude-opus-4-20250514, claude-3-5-sonnet-20241022
  name: "claude-sonnet-4-20250514"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "claude-3-5-sonnet-20241022"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic, higher = more creative
  # For reproducibility in StellaOps, use 0
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Nucleus sampling (top-p)
  # 1.0 = disabled, lower values = more focused
  topP: 1.0

  # Top-k sampling (0 = disabled)
  # Lower values = more focused
  topK: 0

# Extended Thinking (Claude's reasoning feature)
thinking:
  # Enable extended thinking for complex reasoning tasks
  enabled: false

  # Budget tokens for thinking process
  budgetTokens: 10000

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

# Logging Configuration
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage statistics
  logUsage: true

# Rate Limiting
rateLimit:
  # Requests per minute limit (0 = no limit)
  requestsPerMinute: 0

  # Tokens per minute limit (0 = no limit)
  tokensPerMinute: 0

  # Backoff duration when rate limited
  backoff: "00:01:00"
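Note: the `${ANTHROPIC_API_KEY}` value above (and the `${VAR:-default}` form used in registry.yaml below) implies environment-variable substitution at load time, but the loader itself is not part of this diff. A minimal sketch of how such references could be expanded, written in Python with PyYAML purely for illustration; the real consumer of these files may resolve them differently:

import os
import re
import yaml  # PyYAML, assumed here purely for illustration

# Matches ${VAR} and ${VAR:-default} references inside string values.
_ENV_REF = re.compile(r"\$\{(?P<name>[A-Za-z_][A-Za-z0-9_]*)(?::-(?P<default>[^}]*))?\}")

def expand_env_refs(value: str) -> str:
    """Replace ${VAR} / ${VAR:-default} with environment values (empty if unset)."""
    def _sub(match: re.Match) -> str:
        return os.environ.get(match.group("name"), match.group("default") or "")
    return _ENV_REF.sub(_sub, value)

def load_provider_config(path: str) -> dict:
    """Load a provider YAML file and expand env references in all string values."""
    with open(path, encoding="utf-8") as fh:
        raw = yaml.safe_load(fh)

    def _walk(node):
        if isinstance(node, dict):
            return {key: _walk(val) for key, val in node.items()}
        if isinstance(node, list):
            return [_walk(val) for val in node]
        if isinstance(node, str):
            return expand_env_refs(node)
        return node

    return _walk(raw)

# Example: config = load_provider_config("etc/llm-providers/claude.yaml")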
etc/llm-providers/llama-server.yaml.sample (new file, 96 lines)
@@ -0,0 +1,96 @@
# llama.cpp Server LLM Provider configuration template
# This is the PRIMARY provider for OFFLINE/AIRGAP deployments.
# Copy to llama-server.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Lower priority number = higher preference (10 = prefer over cloud providers)
enabled: true
priority: 10

# Server Configuration
server:
  # Base URL for llama.cpp server
  # Start llama.cpp with: llama-server -m model.gguf --host 0.0.0.0 --port 8080
  baseUrl: "http://localhost:8080"

  # API key if server requires authentication (--api-key flag)
  apiKey: ""

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model name (for logging and identification)
  name: "llama3-8b-q4km"

  # Path to model file (informational, model is loaded on server)
  modelPath: "/models/llama-3-8b-instruct.Q4_K_M.gguf"

  # Expected model digest (SHA-256) for verification
  # Ensures the correct model is loaded in airgap environments
  expectedDigest: ""

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context length (must match server's -c flag)
  contextLength: 4096

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Bundle Configuration (for airgap deployments)
bundle:
  # Path to signed model bundle (.stellaops-model directory)
  # Created using: stella model bundle --sign
  bundlePath: ""

  # Verify bundle signature before loading
  verifySignature: true

  # Cryptographic scheme for verification
  # Options: ed25519, ecdsa-p256, gost3410, sm2
  cryptoScheme: "ed25519"

# Logging Configuration
logging:
  # Log health check results
  logHealthChecks: false

  # Log token usage statistics
  logUsage: true

# Performance Tuning
performance:
  # Number of threads for inference (-t flag on server)
  # 0 = auto-detect
  threads: 0

  # Batch size for prompt processing
  batchSize: 512

  # Context size for parallel requests
  parallelContexts: 1
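Note: `expectedDigest` above is described as a SHA-256 over the model file, but the verification code is not part of this diff. A minimal sketch of the check, assuming direct read access to the GGUF file and a plain lowercase hex digest; the real provider may format or enforce this differently:

import hashlib

def verify_model_digest(model_path: str, expected_digest: str) -> bool:
    """Compare the SHA-256 of the model file against the configured expectedDigest.

    Treating an empty expectedDigest (as in the sample) as "check disabled" is an
    assumption, not something stated in the diff.
    """
    if not expected_digest:
        return True
    sha256 = hashlib.sha256()
    with open(model_path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):  # stream in 1 MiB chunks
            sha256.update(chunk)
    return sha256.hexdigest() == expected_digest.strip().lower()

# Example:
# ok = verify_model_digest("/models/llama-3-8b-instruct.Q4_K_M.gguf",
#                          cfg["model"]["expectedDigest"])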
etc/llm-providers/ollama.yaml.sample (new file, 87 lines)
@@ -0,0 +1,87 @@
# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = prefer over cloud, but after llama-server (10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for Ollama server
  # Default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited, use maxTokens)
  numPredict: -1

# GPU Configuration
gpu:
  # Number of GPU layers to offload (0 = CPU only)
  # -1 = offload all layers to GPU
  numGpu: 0

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull model if not found locally
  # WARNING: Requires internet access, disable for airgap
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true
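Note: the provider adapter that consumes this file is not in the diff, so the mapping from these camelCase keys onto Ollama's request fields (keepAlive to keep_alive, numCtx to num_ctx, and so on) is an assumption for illustration. A minimal sketch of a non-streaming call against Ollama's /api/generate endpoint using the sample values, in Python with only the standard library:

import json
import urllib.request

def ollama_generate(cfg: dict, prompt: str) -> str:
    """Send one non-streaming generate request built from the sample config values."""
    inference = cfg["inference"]
    body = {
        "model": cfg["model"]["name"],
        "prompt": prompt,
        "stream": False,
        "keep_alive": cfg["model"]["keepAlive"],
        "options": {
            "temperature": inference["temperature"],
            "seed": inference["seed"],
            "top_p": inference["topP"],
            "top_k": inference["topK"],
            "repeat_penalty": inference["repeatPenalty"],
            "num_ctx": inference["numCtx"],
            "num_predict": inference["numPredict"],
        },
    }
    req = urllib.request.Request(
        cfg["server"]["baseUrl"] + "/api/generate",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    # 300 s corresponds to the sample's request.timeout of "00:05:00".
    with urllib.request.urlopen(req, timeout=300) as resp:
        return json.loads(resp.read())["response"]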
etc/llm-providers/openai.yaml.sample (new file, 87 lines)
@@ -0,0 +1,87 @@
# OpenAI LLM Provider configuration template
# Copy to openai.yaml (remove .sample extension) and configure.
# Environment variable OPENAI_API_KEY can be used instead of api.apiKey.

# Provider enabled state and priority (lower = higher priority)
enabled: true
priority: 100

# API Configuration
api:
  # API key - use environment variable reference or set directly
  # Environment variable: OPENAI_API_KEY
  apiKey: "${OPENAI_API_KEY}"

  # Base URL for API requests
  # Default: https://api.openai.com/v1
  # For Azure OpenAI: https://{resource}.openai.azure.com/openai/deployments/{deployment}
  baseUrl: "https://api.openai.com/v1"

  # Organization ID (optional, for multi-org accounts)
  organizationId: ""

  # API version (required for Azure OpenAI, e.g., "2024-02-15-preview")
  apiVersion: ""

# Model Configuration
model:
  # Primary model name
  # Options: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
  # For Azure: use your deployment name
  name: "gpt-4o"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "gpt-4o-mini"
    - "gpt-3.5-turbo"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic, higher = more creative
  # For reproducibility in StellaOps, use 0
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (when temperature=0)
  seed: 42

  # Nucleus sampling (top-p)
  # 1.0 = disabled, lower values = more focused
  topP: 1.0

  # Frequency penalty (-2.0 to 2.0)
  # Positive = reduce repetition of tokens already used
  frequencyPenalty: 0.0

  # Presence penalty (-2.0 to 2.0)
  # Positive = encourage new topics
  presencePenalty: 0.0

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

# Logging Configuration
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage statistics
  logUsage: true

# Rate Limiting
rateLimit:
  # Requests per minute limit (0 = no limit)
  requestsPerMinute: 0

  # Tokens per minute limit (0 = no limit)
  tokensPerMinute: 0

  # Backoff duration when rate limited
  backoff: "00:01:00"
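Note: the timeout and backoff values throughout these samples ("00:02:00", "00:05:00", "00:01:00") appear to follow an HH:MM:SS duration convention. The diff does not show how they are parsed; a minimal sketch for a consumer outside the main codebase, assuming plain HH:MM:SS values with no day or fractional components:

def parse_timespan(value: str) -> float:
    """Parse the "HH:MM:SS" duration strings used in these samples into seconds."""
    hours, minutes, seconds = (float(part) for part in value.split(":"))
    return hours * 3600 + minutes * 60 + seconds

assert parse_timespan("00:02:00") == 120.0  # request.timeout
assert parse_timespan("00:01:00") == 60.0   # rateLimit.backoff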
etc/llm-providers/registry.yaml (new file, 92 lines)
@@ -0,0 +1,92 @@
# LLM Provider Registry
# AI/LLM provider configurations for Advisory AI

version: "1.0"
category: llm-providers

# Global LLM settings
defaults:
  enabled: false  # Explicitly enable providers
  timeout: "00:02:00"
  maxRetries: 3

# ============================================================================
# LLM PROVIDERS
# ============================================================================
providers:
  # Cloud providers
  claude:
    enabled: false
    priority: 100
    config: claude.yaml
    description: "Anthropic Claude (Claude 3.5/4)"
    mode: remote

  openai:
    enabled: false
    priority: 90
    config: openai.yaml
    description: "OpenAI GPT-4/4o"
    mode: remote

  azure-openai:
    enabled: false
    priority: 90
    config: azure-openai.yaml
    description: "Azure OpenAI Service"
    mode: remote

  # Local providers (for air-gap)
  ollama:
    enabled: false
    priority: 80
    config: ollama.yaml
    description: "Ollama local inference"
    mode: local

  llama-server:
    enabled: false
    priority: 80
    config: llama-server.yaml
    description: "llama.cpp HTTP server"
    mode: local

  vllm:
    enabled: false
    priority: 80
    config: vllm.yaml
    description: "vLLM inference server"
    mode: local

# ============================================================================
# INFERENCE SETTINGS
# ============================================================================
inference:
  # Mode: remote, local, hybrid
  mode: "${ADVISORY_AI_INFERENCE_MODE:-local}"

  # Fallback chain
  fallbackChain:
    - claude
    - openai
    - ollama

  # Model selection
  modelSelection:
    # Task-specific model overrides
    explanation: ""     # Use default
    remediation: ""     # Use default
    classification: ""  # Use default

# ============================================================================
# OFFLINE/AIR-GAP SETTINGS
# ============================================================================
offline:
  # Signed model bundle path
  modelBundlePath: "${ADVISORY_AI_MODEL_BUNDLE_PATH:-/opt/stellaops/offline/models}"

  # Verify bundle signatures
  verifySignatures: true

  # Public key for signature verification
  publicKeyPath: "${ADVISORY_AI_MODEL_PUBKEY:-/etc/stellaops/model-signing-pubkey.pem}"
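Note: the registry combines per-provider priorities with an explicit fallbackChain, and the diff does not spell out how the two interact. The sketch below simply orders enabled providers by priority (using the "lower = higher priority" convention stated in the provider samples) and filters the declared chain to enabled entries; both choices are assumptions about the intended semantics, not the project's actual resolution logic:

def ordered_providers(registry: dict) -> list[str]:
    """Return enabled provider names ordered by priority (lower number tried first)."""
    providers = registry["providers"]
    enabled = [name for name, entry in providers.items() if entry.get("enabled")]
    return sorted(enabled, key=lambda name: providers[name]["priority"])

def resolve_fallback_chain(registry: dict) -> list[str]:
    """Keep only fallbackChain entries that are enabled, preserving declared order."""
    enabled = set(ordered_providers(registry))
    return [name for name in registry["inference"]["fallbackChain"] if name in enabled]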