CI/CD consolidation

StellaOps Bot
2025-12-26 17:32:23 +02:00
parent a866eb6277
commit c786faae84
638 changed files with 3821 additions and 181 deletions

@@ -0,0 +1,84 @@
# Claude (Anthropic) LLM Provider Configuration
# Documentation: https://docs.anthropic.com/en/api

# Provider metadata
provider:
  id: claude
  name: Claude
  description: Anthropic Claude models via API

  # Enable/disable this provider
  enabled: true

  # Priority for provider selection (lower = higher priority)
  priority: 100

# API Configuration
api:
  # API key (can also use ANTHROPIC_API_KEY environment variable)
  apiKey: "${ANTHROPIC_API_KEY}"

  # Base URL for API requests
  baseUrl: "https://api.anthropic.com"

  # API version header
  apiVersion: "2023-06-01"

# Model Configuration
model:
  # Model to use for inference
  # Options: claude-sonnet-4-20250514, claude-opus-4-20250514, claude-3-5-haiku-20241022
  name: "claude-sonnet-4-20250514"

  # Fallback models if primary is unavailable
  fallbacks:
    - "claude-3-5-haiku-20241022"

# Inference Parameters
inference:
  # Temperature (0 = deterministic, 1 = creative)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling (0 = disabled)
  topK: 0

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

  # Retry delay
  retryDelay: "00:00:01"

# Rate Limiting
rateLimit:
  # Requests per minute (0 = unlimited)
  requestsPerMinute: 0

  # Tokens per minute (0 = unlimited)
  tokensPerMinute: 0

# Extended Thinking (supported on Claude 3.7+ models)
thinking:
  # Enable extended thinking for complex reasoning
  enabled: false

  # Budget tokens for thinking (when enabled)
  budgetTokens: 10000

# Logging
logging:
  logBodies: false
  logUsage: true
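
# Example (illustrative sketch, not part of the provider contract): the kind of
# Messages API call the settings above correspond to, based on Anthropic's public
# API documentation. The exact payload this provider emits may differ, and the
# "Hello" prompt is a placeholder.
#
#   curl https://api.anthropic.com/v1/messages \
#     -H "x-api-key: $ANTHROPIC_API_KEY" \
#     -H "anthropic-version: 2023-06-01" \
#     -H "content-type: application/json" \
#     -d '{
#       "model": "claude-sonnet-4-20250514",
#       "max_tokens": 4096,
#       "temperature": 0,
#       "messages": [{"role": "user", "content": "Hello"}]
#     }'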

@@ -0,0 +1,122 @@
# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
#   llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
#   llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

  # Enable/disable this provider
  enabled: true

  # Priority for provider selection (lower = higher priority)
  # Set to 10 for offline-first deployments
  priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (if server requires authentication)
  # Start server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model identifier (for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational - model loaded on server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, verify model integrity on connection
  expectedDigest: null

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects seed for deterministic output when temperature=0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match server -c parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout (local inference may be slower)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational - actual settings on server)
hardware:
  # Device for inference
  # Options: cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null  # eidas, fips, gost, sm

# Logging
logging:
  logBodies: false
  logUsage: true

  # Log server health check results
  logHealthChecks: false
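
# Example (illustrative sketch): checking the configured health endpoint and issuing
# a request against llama.cpp's OpenAI-compatible route with the values above. How
# this provider actually maps the settings onto the request is an assumption; the
# "Hello" prompt is a placeholder.
#
#   # Health check (reports an "ok" status once the model is loaded)
#   curl http://localhost:8080/health
#
#   # Chat completion via the OpenAI-compatible API
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#       "model": "local-llama",
#       "temperature": 0,
#       "seed": 42,
#       "max_tokens": 4096,
#       "messages": [{"role": "user", "content": "Hello"}]
#     }'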

@@ -0,0 +1,110 @@
# Ollama LLM Provider Configuration
# Documentation: https://ollama.ai/
#
# Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model:   ollama pull llama3:8b
# Start server:   ollama serve
#
# Ollama provides an easy way to run local LLMs with automatic GPU detection.

# Provider metadata
provider:
  id: ollama
  name: Ollama
  description: Local LLM inference via Ollama

  # Enable/disable this provider
  enabled: true

  # Priority for provider selection (lower = higher priority)
  # Set lower than cloud providers for local-first operation
  priority: 20

# Server Configuration
server:
  # Ollama server URL
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Model to use (must be pulled first: ollama pull <model>)
  # Popular options:
  #   - llama3:8b    (8B params, good quality/speed balance)
  #   - llama3:70b   (70B params, higher quality, needs more RAM)
  #   - mistral:7b   (7B params, fast)
  #   - mixtral:8x7b (MoE, good quality)
  #   - codellama:7b (code-focused)
  #   - phi3:mini    (small, fast)
  name: "llama3:8b"

  # Fallback models
  fallbacks:
    - "mistral:7b"
    - "phi3:mini"

  # Keep model loaded in memory (reduces latency for repeated requests)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited)
  numPredict: -1

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# GPU Configuration
gpu:
  # Number of GPU layers to use (0 = CPU only)
  numGpu: 0

  # Use GPU for embedding (if available)
  useGpuForEmbedding: false

# Model Management
management:
  # Auto-pull model if not found
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging
logging:
  logBodies: false
  logUsage: true
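
# Example (illustrative sketch): listing pulled models via the health-check endpoint
# and sending a chat request with the options above, following Ollama's public REST
# API. How this provider maps the settings is an assumption; the "Hello" prompt is a
# placeholder.
#
#   # List available models (same endpoint used for health checks)
#   curl http://localhost:11434/api/tags
#
#   # Deterministic chat request
#   curl http://localhost:11434/api/chat -d '{
#     "model": "llama3:8b",
#     "stream": false,
#     "keep_alive": "5m",
#     "messages": [{"role": "user", "content": "Hello"}],
#     "options": {"temperature": 0, "seed": 42, "num_ctx": 4096, "num_predict": -1}
#   }'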

@@ -0,0 +1,94 @@
# OpenAI LLM Provider Configuration
# Documentation: https://platform.openai.com/docs/api-reference

# Provider metadata
provider:
  id: openai
  name: OpenAI
  description: OpenAI GPT models via API

  # Enable/disable this provider
  enabled: true

  # Priority for provider selection (lower = higher priority)
  # When multiple providers are available, the one with the lowest priority value is used
  priority: 100

# API Configuration
api:
  # API key (can also use OPENAI_API_KEY environment variable)
  # Environment variables are expanded: ${OPENAI_API_KEY}
  apiKey: "${OPENAI_API_KEY}"

  # Base URL for API requests
  # Default: https://api.openai.com/v1
  # For Azure OpenAI, use: https://{resource}.openai.azure.com/openai/deployments/{deployment}
  baseUrl: "https://api.openai.com/v1"

  # Organization ID (optional)
  organizationId: null

  # API version (for Azure OpenAI)
  apiVersion: null

# Model Configuration
model:
  # Model to use for inference
  # Options: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
  name: "gpt-4o"

  # Fallback models if primary is unavailable
  fallbacks:
    - "gpt-4o-mini"
    - "gpt-4-turbo"

# Inference Parameters
inference:
  # Temperature (0 = deterministic, 1 = creative)
  # For security analysis, use 0 for reproducibility
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (when temperature=0)
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Frequency penalty (-2.0 to 2.0)
  frequencyPenalty: 0

  # Presence penalty (-2.0 to 2.0)
  presencePenalty: 0

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

  # Retry delay (exponential backoff base)
  retryDelay: "00:00:01"

# Rate Limiting
rateLimit:
  # Requests per minute (0 = unlimited)
  requestsPerMinute: 0

  # Tokens per minute (0 = unlimited)
  tokensPerMinute: 0

# Logging
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage
  logUsage: true
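
# Example (illustrative sketch): the Chat Completions request these settings roughly
# translate to, based on OpenAI's public API. The provider's exact payload may differ,
# and the "Hello" prompt is a placeholder.
#
#   curl https://api.openai.com/v1/chat/completions \
#     -H "Authorization: Bearer $OPENAI_API_KEY" \
#     -H "Content-Type: application/json" \
#     -d '{
#       "model": "gpt-4o",
#       "temperature": 0,
#       "seed": 42,
#       "max_tokens": 4096,
#       "messages": [{"role": "user", "content": "Hello"}]
#     }'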