CI/CD consolidation
84  etc/llm-providers/claude.yaml  Normal file
@@ -0,0 +1,84 @@
# Claude (Anthropic) LLM Provider Configuration
# Documentation: https://docs.anthropic.com/en/api

# Provider metadata
provider:
  id: claude
  name: Claude
  description: Anthropic Claude models via API

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
priority: 100

# API Configuration
api:
  # API key (can also use ANTHROPIC_API_KEY environment variable)
  apiKey: "${ANTHROPIC_API_KEY}"

  # Base URL for API requests
  baseUrl: "https://api.anthropic.com"

  # API version header
  apiVersion: "2023-06-01"

# Model Configuration
model:
  # Model to use for inference
  # Options: claude-sonnet-4-20250514, claude-opus-4-20250514, claude-3-5-haiku-20241022
  name: "claude-sonnet-4-20250514"

  # Fallback models if primary is unavailable
  fallbacks:
    - "claude-3-5-haiku-20241022"

# Inference Parameters
inference:
  # Temperature (0 = deterministic, 1 = creative)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling (0 = disabled)
  topK: 0

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

  # Retry delay
  retryDelay: "00:00:01"

# Rate Limiting
rateLimit:
  # Requests per minute (0 = unlimited)
  requestsPerMinute: 0

  # Tokens per minute (0 = unlimited)
  tokensPerMinute: 0

# Extended Thinking (Claude 3.7+ feature)
thinking:
  # Enable extended thinking for complex reasoning
  enabled: false

  # Budget tokens for thinking (when enabled)
  budgetTokens: 10000

# Logging
logging:
  logBodies: false
  logUsage: true
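The Claude file relies on a convention the consuming application has to implement: `${VAR}` placeholders such as `${ANTHROPIC_API_KEY}` are expanded from the environment at load time. A minimal sketch of that expansion, assuming a Python consumer with PyYAML available; `load_provider_config` is a hypothetical helper, not code from this commit.

```python
# Minimal sketch (not the repository's loader): read claude.yaml and expand
# ${VAR} placeholders such as ${ANTHROPIC_API_KEY} from the environment.
import os
import yaml  # PyYAML; assumed available

def load_provider_config(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as fh:
        raw = fh.read()
    # os.path.expandvars replaces ${ANTHROPIC_API_KEY} with the env value,
    # leaving the placeholder untouched if the variable is not set.
    return yaml.safe_load(os.path.expandvars(raw))

cfg = load_provider_config("etc/llm-providers/claude.yaml")
print(cfg["provider"]["id"], cfg["model"]["name"], cfg["api"]["apiVersion"])
```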
122  etc/llm-providers/llama-server.yaml  Normal file
@@ -0,0 +1,122 @@
# llama.cpp Server LLM Provider Configuration
# Documentation: https://github.com/ggerganov/llama.cpp/tree/master/examples/server
#
# Start llama.cpp server:
#   llama-server -m /path/to/model.gguf --port 8080 --host 0.0.0.0
#
# For GPU acceleration:
#   llama-server -m model.gguf --port 8080 -ngl 99
#
# This provider enables OFFLINE/AIRGAP operation by using locally-hosted models.

# Provider metadata
provider:
  id: llama-server
  name: llama.cpp Server
  description: Local LLM inference via llama.cpp HTTP server (OpenAI-compatible API)

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set to 10 for offline-first deployments
priority: 10

# Server Configuration
server:
  # Server URL
  # Default: http://localhost:8080
  baseUrl: "http://localhost:8080"

  # API key (if server requires authentication)
  # Start server with: --api-key your-secret-key
  apiKey: null

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model identifier (for logging/tracing)
  # The actual model is loaded on the server at startup
  name: "local-llama"

  # Model file path (informational - model loaded on server)
  # Used for bundle verification and documentation
  modelPath: null

  # Expected model digest (SHA-256)
  # If set, verify model integrity on connection
  expectedDigest: null

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  # llama.cpp respects seed for deterministic output when temp=0
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length (must match server -c parameter)
  contextLength: 4096

  # Batch size
  batchSize: 512

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout (local inference may be slower)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# Hardware Configuration (informational - actual settings on server)
hardware:
  # Device for inference
  # Options: cpu, cuda, rocm, metal, vulkan
  device: "auto"

  # Number of GPU layers to offload
  gpuLayers: 0

  # Number of CPU threads (0 = auto)
  threads: 0

# Model Bundle (for airgap deployments)
bundle:
  # Path to model bundle directory
  bundlePath: null

  # Verify bundle signature on startup
  verifySignature: true

  # Crypto scheme for signature verification
  cryptoScheme: null  # eidas, fips, gost, sm

# Logging
logging:
  logBodies: false
  logUsage: true

  # Log server health check results
  logHealthChecks: false
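llama-server.yaml leaves `modelPath` and `expectedDigest` as null, but when both are set the comments say the model's integrity should be verified. A sketch of what that SHA-256 check could look like, assuming the digest is stored as a hex string; the helper name and chunked-read approach are illustrative, not code from this commit.

```python
# Minimal sketch of the expectedDigest check described above (hypothetical
# helper): hash the local GGUF file and compare against the SHA-256 value
# recorded in llama-server.yaml.
import hashlib

def verify_model_digest(model_path: str, expected_digest: str) -> bool:
    sha = hashlib.sha256()
    with open(model_path, "rb") as fh:
        # Read in 1 MiB chunks so large GGUF files are not loaded into memory.
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            sha.update(chunk)
    return sha.hexdigest().lower() == expected_digest.lower()

# Only meaningful when modelPath and expectedDigest are both set, e.g.:
# verify_model_digest("/models/model.gguf", "<sha256 hex from the config>")
```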
110  etc/llm-providers/ollama.yaml  Normal file
@@ -0,0 +1,110 @@
# Ollama LLM Provider Configuration
# Documentation: https://ollama.ai/
#
# Install Ollama: curl -fsSL https://ollama.ai/install.sh | sh
# Pull a model:  ollama pull llama3:8b
# Start server:  ollama serve
#
# Ollama provides an easy way to run local LLMs with automatic GPU detection.

# Provider metadata
provider:
  id: ollama
  name: Ollama
  description: Local LLM inference via Ollama

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# Set lower than cloud providers for local-first operation
priority: 20

# Server Configuration
server:
  # Ollama server URL
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Model to use (must be pulled first: ollama pull <model>)
  # Popular options:
  #   - llama3:8b    (8B params, good quality/speed balance)
  #   - llama3:70b   (70B params, higher quality, needs more RAM)
  #   - mistral:7b   (7B params, fast)
  #   - mixtral:8x7b (MoE, good quality)
  #   - codellama:7b (code-focused)
  #   - phi3:mini    (small, fast)
  name: "llama3:8b"

  # Fallback models
  fallbacks:
    - "mistral:7b"
    - "phi3:mini"

  # Keep model loaded in memory (reduces latency for repeated requests)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature (0 = deterministic)
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty
  repeatPenalty: 1.1

  # Context length
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited)
  numPredict: -1

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

  # Retry delay
  retryDelay: "00:00:02"

# GPU Configuration
gpu:
  # Number of GPU layers to use (0 = CPU only)
  numGpu: 0

  # Use GPU for embedding (if available)
  useGpuForEmbedding: false

# Model Management
management:
  # Auto-pull model if not found
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging
logging:
  logBodies: false
  logUsage: true
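The Ollama file points `healthEndpoint` at `/api/tags`, which doubles as a way to confirm that the configured model has actually been pulled before it is selected. A small sketch under that assumption; `model_is_available` is a hypothetical helper, and only the endpoint and defaults named in the config are used.

```python
# Minimal sketch (hypothetical, not code from this commit): hit the configured
# health endpoint and check whether the configured model has been pulled.
# Ollama's GET /api/tags responds with {"models": [{"name": "llama3:8b", ...}, ...]}.
import json
import urllib.request

def model_is_available(base_url: str, health_endpoint: str, model_name: str) -> bool:
    with urllib.request.urlopen(base_url + health_endpoint, timeout=5) as resp:
        tags = json.load(resp)
    return any(m.get("name") == model_name for m in tags.get("models", []))

# Mirrors the defaults above: baseUrl http://localhost:11434, healthEndpoint /api/tags.
# model_is_available("http://localhost:11434", "/api/tags", "llama3:8b")
```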
94  etc/llm-providers/openai.yaml  Normal file
@@ -0,0 +1,94 @@
# OpenAI LLM Provider Configuration
# Documentation: https://platform.openai.com/docs/api-reference

# Provider metadata
provider:
  id: openai
  name: OpenAI
  description: OpenAI GPT models via API

# Enable/disable this provider
enabled: true

# Priority for provider selection (lower = higher priority)
# When multiple providers are available, the one with the lowest priority value is used
priority: 100

# API Configuration
api:
  # API key (can also use OPENAI_API_KEY environment variable)
  # Environment variables are expanded: ${OPENAI_API_KEY}
  apiKey: "${OPENAI_API_KEY}"

  # Base URL for API requests
  # Default: https://api.openai.com/v1
  # For Azure OpenAI, use: https://{resource}.openai.azure.com/openai/deployments/{deployment}
  baseUrl: "https://api.openai.com/v1"

  # Organization ID (optional)
  organizationId: null

  # API version (for Azure OpenAI)
  apiVersion: null

# Model Configuration
model:
  # Model to use for inference
  # Options: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
  name: "gpt-4o"

  # Fallback models if primary is unavailable
  fallbacks:
    - "gpt-4o-mini"
    - "gpt-4-turbo"

# Inference Parameters
inference:
  # Temperature (0 = deterministic, 1 = creative)
  # For security analysis, use 0 for reproducibility
  temperature: 0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (when temperature=0)
  seed: 42

  # Top-p (nucleus sampling)
  topP: 1.0

  # Frequency penalty (-2.0 to 2.0)
  frequencyPenalty: 0

  # Presence penalty (-2.0 to 2.0)
  presencePenalty: 0

  # Stop sequences
  stopSequences: []

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

  # Retry delay (exponential backoff base)
  retryDelay: "00:00:01"

# Rate Limiting
rateLimit:
  # Requests per minute (0 = unlimited)
  requestsPerMinute: 0

  # Tokens per minute (0 = unlimited)
  tokensPerMinute: 0

# Logging
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage
  logUsage: true
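All four files share the same selection rule: enabled providers compete on priority, and the lowest value wins. A sketch of that rule as it would apply to this directory, assuming the loader treats `enabled` and `priority` as top-level keys; `select_provider` and the fallback default of 100 are assumptions, not code from this commit.

```python
# Minimal sketch of the priority rule shared by the four provider files
# (hypothetical): load every config, keep the enabled ones, and pick the
# lowest priority value (lower = higher priority).
import glob
import yaml  # PyYAML; assumed available

def select_provider(config_dir: str = "etc/llm-providers") -> dict:
    configs = []
    for path in glob.glob(f"{config_dir}/*.yaml"):
        with open(path, "r", encoding="utf-8") as fh:
            configs.append(yaml.safe_load(fh))
    enabled = [c for c in configs if c.get("enabled", False)]
    return min(enabled, key=lambda c: c.get("priority", 100))

# With the defaults in this commit: llama-server (10) beats ollama (20),
# which beats claude and openai (both 100).
```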