CI/CD consolidation
84  etc/llm-providers/claude.yaml  Normal file
@@ -0,0 +1,84 @@
# Claude (Anthropic) LLM Provider Configuration
# Documentation: https://docs.anthropic.com/en/api

# Provider metadata
provider:
  id: claude
  name: Claude
  description: Anthropic Claude models via API

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
priority: 100

# API Configuration
api:
  # API key (can also use ANTHROPIC_API_KEY environment variable)
  apiKey: "${ANTHROPIC_API_KEY}"

  # Base URL for API requests
  baseUrl: "https://api.anthropic.com"

  # API version header
  apiVersion: "2023-06-01"

# Model Configuration
model:
  # Model to use for inference
  # Options: claude-sonnet-4-20250514, claude-opus-4-20250514, claude-3-5-haiku-20241022
  name: "claude-sonnet-4-20250514"

  # Fallback models if primary is unavailable
  fallbacks:
    - "claude-3-5-haiku-20241022"

# Inference Parameters
inference:
  # Temperature (0 = deterministic, 1 = creative)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling (0 = disabled)
  topK: 0

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

  # Retry delay
  retryDelay: "00:00:01"

# Rate Limiting
rateLimit:
  # Requests per minute (0 = unlimited)
  requestsPerMinute: 0

  # Tokens per minute (0 = unlimited)
  tokensPerMinute: 0

# Extended Thinking (Claude 3.7+ feature)
thinking:
  # Enable extended thinking for complex reasoning
  enabled: false

  # Budget tokens for thinking (when enabled)
  budgetTokens: 10000

# Logging
logging:
  logBodies: false
  logUsage: true
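The Claude file relies on a convention the consuming application has to implement: `${VAR}` placeholders such as `${ANTHROPIC_API_KEY}` are expanded from the environment at load time. A minimal sketch of that expansion, assuming a Python consumer with PyYAML available; `load_provider_config` is a hypothetical helper, not code from this commit.

```python
# Minimal sketch (not the repository's loader): read claude.yaml and expand
# ${VAR} placeholders such as ${ANTHROPIC_API_KEY} from the environment.
import os
import yaml  # PyYAML; assumed available

def load_provider_config(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read()
    # os.path.expandvars replaces ${ANTHROPIC_API_KEY} with the env value,
    # leaving the placeholder untouched if the variable is not set.
    return yaml.safe_load(os.path.expandvars(raw))

cfg = load_provider_config("etc/llm-providers/claude.yaml")
print(cfg["provider"]["id"], cfg["model"]["name"], cfg["api"]["apiVersion"])
```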
122  etc/llm-providers/llama-server.yaml  Normal file
@@ -0,0 +1,122 @@
# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
#   llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
#   llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set to 10 for offline-first deployments
priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (if server requires authentication)
  # Start server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model identifier (for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational - model loaded on server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, verify model integrity on connection
  expectedDigest: null

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects seed for deterministic output when temp=0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match server -c parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout (local inference may be slower)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational - actual settings on server)
hardware:
  # Device for inference
  # Options: cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null  # eidas, fips, gost, sm

# Logging
logging:
  logBodies: false
  logUsage: true

  # Log server health check results
  logHealthChecks: false
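llama-server.yaml leaves `modelPath` and `expectedDigest` as null, but when both are set the comments say the model's integrity should be verified. A sketch of what that SHA-256 check could look like, assuming the digest is stored as a hex string; the helper name and chunked-read approach are illustrative, not code from this commit.

```python
# Minimal sketch of the expectedDigest check described above (hypothetical
# helper): hash the local GGUF file and compare against the SHA-256 value
# recorded in llama-server.yaml.
import hashlib

def verify_model_digest(model_path: str, expected_digest: str) -> bool:
    sha = hashlib.sha256()
    with open(model_path, "rb") as fh:
        # Read in 1 MiB chunks so large GGUF files are not loaded into memory.
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            sha.update(chunk)
    return sha.hexdigest().lower() == expected_digest.lower()

# Only meaningful when modelPath and expectedDigest are both set, e.g.:
# verify_model_digest("/models/model.gguf", "<sha256 hex from the config>")
```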
110  etc/llm-providers/ollama.yaml  Normal file
@@ -0,0 +1,110 @@
# Ollama LLM Provider Configuration
# Documentation: https://ollama.ai/
#
# Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model:  ollama pull llama3:8b
# Start server:  ollama serve
#
# Ollama provides an easy way to run local LLMs with automatic GPU detection.

# Provider metadata
provider:
  id: ollama
  name: Ollama
  description: Local LLM inference via Ollama

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set lower than cloud providers for local-first operation
priority: 20

# Server Configuration
server:
  # Ollama server URL
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Model to use (must be pulled first: ollama pull <model>)
  # Popular options:
  #   - llama3:8b    (8B params, good quality/speed balance)
  #   - llama3:70b   (70B params, higher quality, needs more RAM)
  #   - mistral:7b   (7B params, fast)
  #   - mixtral:8x7b (MoE, good quality)
  #   - codellama:7b (code-focused)
  #   - phi3:mini    (small, fast)
  name: "llama3:8b"

  # Fallback models
  fallbacks:
    - "mistral:7b"
    - "phi3:mini"

  # Keep model loaded in memory (reduces latency for repeated requests)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited)
  numPredict: -1

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# GPU Configuration
gpu:
  # Number of GPU layers to use (0 = CPU only)
  numGpu: 0

  # Use GPU for embedding (if available)
  useGpuForEmbedding: false

# Model Management
management:
  # Auto-pull model if not found
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging
logging:
  logBodies: false
  logUsage: true
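The Ollama file points `healthEndpoint` at `/api/tags`, which doubles as a way to confirm that the configured model has actually been pulled before it is selected. A small sketch under that assumption; `model_is_available` is a hypothetical helper, and only the endpoint and defaults named in the config are used.

```python
# Minimal sketch (hypothetical, not code from this commit): hit the configured
# health endpoint and check whether the configured model has been pulled.
# Ollama's GET /api/tags responds with {"models": [{"name": "llama3:8b", ...}, ...]}.
import json
import urllib.request

def model_is_available(base_url: str, health_endpoint: str, model_name: str) -> bool:
    with urllib.request.urlopen(base_url + health_endpoint, timeout=5) as resp:
        tags = json.load(resp)
    return any(m.get("name") == model_name for m in tags.get("models", []))

# Mirrors the defaults above: baseUrl http://localhost:11434, healthEndpoint /api/tags.
# model_is_available("http://localhost:11434", "/api/tags", "llama3:8b")
```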
94  etc/llm-providers/openai.yaml  Normal file
@@ -0,0 +1,94 @@
# OpenAI LLM Provider Configuration
# Documentation: https://platform.openai.com/docs/api-reference

# Provider metadata
provider:
  id: openai
  name: OpenAI
  description: OpenAI GPT models via API

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# When multiple providers are available, the one with the lowest priority value is used
priority: 100

# API Configuration
api:
  # API key (can also use OPENAI_API_KEY environment variable)
  # Environment variables are expanded: ${OPENAI_API_KEY}
  apiKey: "${OPENAI_API_KEY}"

  # Base URL for API requests
  # Default: https://api.openai.com/v1
  # For Azure OpenAI, use: https://{resource}.openai.azure.com/openai/deployments/{deployment}
  baseUrl: "https://api.openai.com/v1"

  # Organization ID (optional)
  organizationId: null

  # API version (for Azure OpenAI)
  apiVersion: null

# Model Configuration
model:
  # Model to use for inference
  # Options: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
  name: "gpt-4o"

  # Fallback models if primary is unavailable
  fallbacks:
    - "gpt-4o-mini"
    - "gpt-4-turbo"

# Inference Parameters
inference:
  # Temperature (0 = deterministic, 1 = creative)
  # For security analysis, use 0 for reproducibility
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (when temperature=0)
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Frequency penalty (-2.0 to 2.0)
  frequencyPenalty: 0

  # Presence penalty (-2.0 to 2.0)
  presencePenalty: 0

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

  # Retry delay (exponential backoff base)
  retryDelay: "00:00:01"

# Rate Limiting
rateLimit:
  # Requests per minute (0 = unlimited)
  requestsPerMinute: 0

  # Tokens per minute (0 = unlimited)
  tokensPerMinute: 0

# Logging
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage
  logUsage: true
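All four files share the same selection rule: enabled providers compete on priority, and the lowest value wins. A sketch of that rule as it would apply to this directory, assuming the loader treats `enabled` and `priority` as top-level keys; `select_provider` and the fallback default of 100 are assumptions, not code from this commit.

```python
# Minimal sketch of the priority rule shared by the four provider files
# (hypothetical): load every config, keep the enabled ones, and pick the
# lowest priority value (lower = higher priority).
import glob
import yaml  # PyYAML; assumed available

def select_provider(config_dir: str = "etc/llm-providers") -> dict:
    configs = []
    for path in glob.glob(f"{config_dir}/*.yaml"):
        with open(path, "r", encoding="utf-8") as fh:
            configs.append(yaml.safe_load(fh))
    enabled = [c for c in configs if c.get("enabled", False)]
    return min(enabled, key=lambda c: c.get("priority", 100))

# With the defaults in this commit: llama-server (10) beats ollama (20),
# which beats claude and openai (both 100).
```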