Refactor code structure and optimize performance across multiple modules
etc/llm-providers/claude.yaml.sample (new file, 81 lines)
@@ -0,0 +1,81 @@
# Claude (Anthropic) LLM Provider configuration template
# Copy to claude.yaml (remove .sample extension) and configure.
# Environment variable ANTHROPIC_API_KEY can be used instead of api.apiKey.

# Provider enabled state and priority (lower = higher priority)
enabled: true
priority: 100

# API Configuration
api:
  # API key - use environment variable reference or set directly
  # Environment variable: ANTHROPIC_API_KEY
  apiKey: "${ANTHROPIC_API_KEY}"

  # Base URL for API requests
  baseUrl: "https://api.anthropic.com"

  # API version header
  apiVersion: "2023-06-01"

# Model Configuration
model:
  # Primary model name
  # Options: claude-sonnet-4-20250514, claude-opus-4-20250514, claude-3-5-sonnet-20241022
  name: "claude-sonnet-4-20250514"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "claude-3-5-sonnet-20241022"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic, higher = more creative
  # For reproducibility in StellaOps, use 0
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Nucleus sampling (top-p)
  # 1.0 = disabled, lower values = more focused
  topP: 1.0

  # Top-k sampling (0 = disabled)
  # Lower values = more focused
  topK: 0

# Extended Thinking (Claude's reasoning feature)
thinking:
  # Enable extended thinking for complex reasoning tasks
  enabled: false

  # Budget tokens for thinking process
  budgetTokens: 10000

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

# Logging Configuration
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage statistics
  logUsage: true

# Rate Limiting
rateLimit:
  # Requests per minute limit (0 = no limit)
  requestsPerMinute: 0

  # Tokens per minute limit (0 = no limit)
  tokensPerMinute: 0

  # Backoff duration when rate limited
  backoff: "00:01:00"
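Note: the `${ANTHROPIC_API_KEY}` value above (and the `${VAR:-default}` form used in registry.yaml below) implies environment-variable substitution at load time, but the loader itself is not part of this diff. A minimal sketch of how such references could be expanded, written in Python with PyYAML purely for illustration; the real consumer of these files may resolve them differently:

import os
import re
import yaml  # PyYAML, assumed here purely for illustration

# Matches ${VAR} and ${VAR:-default} references inside string values.
_ENV_REF = re.compile(r"\$\{(?P<name>[A-Za-z_][A-Za-z0-9_]*)(?::-(?P<default>[^}]*))?\}")

def expand_env_refs(value: str) -> str:
    """Replace ${VAR} / ${VAR:-default} with environment values (empty if unset)."""
    def _sub(match: re.Match) -> str:
        return os.environ.get(match.group("name"), match.group("default") or "")
    return _ENV_REF.sub(_sub, value)

def load_provider_config(path: str) -> dict:
    """Load a provider YAML file and expand env references in all string values."""
    with open(path, encoding="utf-8") as fh:
        raw = yaml.safe_load(fh)

    def _walk(node):
        if isinstance(node, dict):
            return {key: _walk(val) for key, val in node.items()}
        if isinstance(node, list):
            return [_walk(val) for val in node]
        if isinstance(node, str):
            return expand_env_refs(node)
        return node

    return _walk(raw)

# Example: config = load_provider_config("etc/llm-providers/claude.yaml")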
etc/llm-providers/llama-server.yaml.sample (new file, 96 lines)
@@ -0,0 +1,96 @@
# llama.cpp Server LLM Provider configuration template
# This is the PRIMARY provider for OFFLINE/AIRGAP deployments.
# Copy to llama-server.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Lower priority number = higher preference (10 = prefer over cloud providers)
enabled: true
priority: 10

# Server Configuration
server:
  # Base URL for llama.cpp server
  # Start llama.cpp with: llama-server -m model.gguf --host 0.0.0.0 --port 8080
  baseUrl: "http://localhost:8080"

  # API key if server requires authentication (--api-key flag)
  apiKey: ""

  # Health check endpoint
  healthEndpoint: "/health"

# Model Configuration
model:
  # Model name (for logging and identification)
  name: "llama3-8b-q4km"

  # Path to model file (informational, model is loaded on server)
  modelPath: "/models/llama-3-8b-instruct.Q4_K_M.gguf"

  # Expected model digest (SHA-256) for verification
  # Ensures the correct model is loaded in airgap environments
  expectedDigest: ""

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context length (must match server's -c flag)
  contextLength: 4096

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Bundle Configuration (for airgap deployments)
bundle:
  # Path to signed model bundle (.stellaops-model directory)
  # Created using: stella model bundle --sign
  bundlePath: ""

  # Verify bundle signature before loading
  verifySignature: true

  # Cryptographic scheme for verification
  # Options: ed25519, ecdsa-p256, gost3410, sm2
  cryptoScheme: "ed25519"

# Logging Configuration
logging:
  # Log health check results
  logHealthChecks: false

  # Log token usage statistics
  logUsage: true

# Performance Tuning
performance:
  # Number of threads for inference (-t flag on server)
  # 0 = auto-detect
  threads: 0

  # Batch size for prompt processing
  batchSize: 512

  # Context size for parallel requests
  parallelContexts: 1
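Note: `expectedDigest` above is described as a SHA-256 over the model file, but the verification code is not part of this diff. A minimal sketch of the check, assuming direct read access to the GGUF file and a plain lowercase hex digest; the real provider may format or enforce this differently:

import hashlib

def verify_model_digest(model_path: str, expected_digest: str) -> bool:
    """Compare the SHA-256 of the model file against the configured expectedDigest.

    Treating an empty expectedDigest (as in the sample) as "check disabled" is an
    assumption, not something stated in the diff.
    """
    if not expected_digest:
        return True
    sha256 = hashlib.sha256()
    with open(model_path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):  # stream in 1 MiB chunks
            sha256.update(chunk)
    return sha256.hexdigest() == expected_digest.strip().lower()

# Example:
# ok = verify_model_digest("/models/llama-3-8b-instruct.Q4_K_M.gguf",
#                          cfg["model"]["expectedDigest"])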
etc/llm-providers/ollama.yaml.sample (new file, 87 lines)
@@ -0,0 +1,87 @@
# Ollama LLM Provider configuration template
# For local inference using Ollama.
# Copy to ollama.yaml (remove .sample extension) and configure.

# Provider enabled state and priority
# Priority 20 = prefer over cloud, but after llama-server (10)
enabled: true
priority: 20

# Server Configuration
server:
  # Base URL for Ollama server
  # Default Ollama port is 11434
  baseUrl: "http://localhost:11434"

  # Health check endpoint
  healthEndpoint: "/api/tags"

# Model Configuration
model:
  # Primary model name
  # Use 'ollama list' to see available models
  # Common options: llama3:8b, llama3:70b, codellama:13b, mistral:7b
  name: "llama3:8b"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "llama3:latest"
    - "mistral:7b"

  # Keep model loaded in memory (prevents unloading between requests)
  # Options: "5m", "10m", "1h", "-1" (forever)
  keepAlive: "5m"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic (REQUIRED for reproducibility)
  temperature: 0.0

  # Maximum tokens to generate (-1 = use model default)
  maxTokens: 4096

  # Random seed for reproducibility (REQUIRED for determinism)
  seed: 42

  # Nucleus sampling (top-p)
  topP: 1.0

  # Top-k sampling
  topK: 40

  # Repeat penalty (1.0 = no penalty)
  repeatPenalty: 1.1

  # Context window size
  numCtx: 4096

  # Number of tokens to predict (-1 = unlimited, use maxTokens)
  numPredict: -1

# GPU Configuration
gpu:
  # Number of GPU layers to offload (0 = CPU only)
  # -1 = offload all layers to GPU
  numGpu: 0

# Request Configuration
request:
  # Request timeout (longer for local inference)
  timeout: "00:05:00"

  # Maximum retries on failure
  maxRetries: 2

# Model Management
management:
  # Automatically pull model if not found locally
  # WARNING: Requires internet access, disable for airgap
  autoPull: false

  # Verify model integrity after pull
  verifyPull: true

# Logging Configuration
logging:
  # Log token usage statistics
  logUsage: true
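Note: the provider adapter that consumes this file is not in the diff, so the mapping from these camelCase keys onto Ollama's request fields (keepAlive to keep_alive, numCtx to num_ctx, and so on) is an assumption for illustration. A minimal sketch of a non-streaming call against Ollama's /api/generate endpoint using the sample values, in Python with only the standard library:

import json
import urllib.request

def ollama_generate(cfg: dict, prompt: str) -> str:
    """Send one non-streaming generate request built from the sample config values."""
    inference = cfg["inference"]
    body = {
        "model": cfg["model"]["name"],
        "prompt": prompt,
        "stream": False,
        "keep_alive": cfg["model"]["keepAlive"],
        "options": {
            "temperature": inference["temperature"],
            "seed": inference["seed"],
            "top_p": inference["topP"],
            "top_k": inference["topK"],
            "repeat_penalty": inference["repeatPenalty"],
            "num_ctx": inference["numCtx"],
            "num_predict": inference["numPredict"],
        },
    }
    req = urllib.request.Request(
        cfg["server"]["baseUrl"] + "/api/generate",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    # 300 s corresponds to the sample's request.timeout of "00:05:00".
    with urllib.request.urlopen(req, timeout=300) as resp:
        return json.loads(resp.read())["response"]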
etc/llm-providers/openai.yaml.sample (new file, 87 lines)
@@ -0,0 +1,87 @@
# OpenAI LLM Provider configuration template
# Copy to openai.yaml (remove .sample extension) and configure.
# Environment variable OPENAI_API_KEY can be used instead of api.apiKey.

# Provider enabled state and priority (lower = higher priority)
enabled: true
priority: 100

# API Configuration
api:
  # API key - use environment variable reference or set directly
  # Environment variable: OPENAI_API_KEY
  apiKey: "${OPENAI_API_KEY}"

  # Base URL for API requests
  # Default: https://api.openai.com/v1
  # For Azure OpenAI: https://{resource}.openai.azure.com/openai/deployments/{deployment}
  baseUrl: "https://api.openai.com/v1"

  # Organization ID (optional, for multi-org accounts)
  organizationId: ""

  # API version (required for Azure OpenAI, e.g., "2024-02-15-preview")
  apiVersion: ""

# Model Configuration
model:
  # Primary model name
  # Options: gpt-4o, gpt-4o-mini, gpt-4-turbo, gpt-4, gpt-3.5-turbo
  # For Azure: use your deployment name
  name: "gpt-4o"

  # Fallback models (tried in order if primary fails)
  fallbacks:
    - "gpt-4o-mini"
    - "gpt-3.5-turbo"

# Inference Parameters
inference:
  # Temperature: 0 = deterministic, higher = more creative
  # For reproducibility in StellaOps, use 0
  temperature: 0.0

  # Maximum tokens to generate
  maxTokens: 4096

  # Random seed for reproducibility (when temperature=0)
  seed: 42

  # Nucleus sampling (top-p)
  # 1.0 = disabled, lower values = more focused
  topP: 1.0

  # Frequency penalty (-2.0 to 2.0)
  # Positive = reduce repetition of tokens already used
  frequencyPenalty: 0.0

  # Presence penalty (-2.0 to 2.0)
  # Positive = encourage new topics
  presencePenalty: 0.0

# Request Configuration
request:
  # Request timeout
  timeout: "00:02:00"

  # Maximum retries on failure
  maxRetries: 3

# Logging Configuration
logging:
  # Log request/response bodies (WARNING: may contain sensitive data)
  logBodies: false

  # Log token usage statistics
  logUsage: true

# Rate Limiting
rateLimit:
  # Requests per minute limit (0 = no limit)
  requestsPerMinute: 0

  # Tokens per minute limit (0 = no limit)
  tokensPerMinute: 0

  # Backoff duration when rate limited
  backoff: "00:01:00"
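Note: the timeout and backoff values throughout these samples ("00:02:00", "00:05:00", "00:01:00") appear to follow an HH:MM:SS duration convention. The diff does not show how they are parsed; a minimal sketch for a consumer outside the main codebase, assuming plain HH:MM:SS values with no day or fractional components:

def parse_timespan(value: str) -> float:
    """Parse the "HH:MM:SS" duration strings used in these samples into seconds."""
    hours, minutes, seconds = (float(part) for part in value.split(":"))
    return hours * 3600 + minutes * 60 + seconds

assert parse_timespan("00:02:00") == 120.0  # request.timeout
assert parse_timespan("00:01:00") == 60.0   # rateLimit.backoff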
etc/llm-providers/registry.yaml (new file, 92 lines)
@@ -0,0 +1,92 @@
# LLM Provider Registry
# AI/LLM provider configurations for Advisory AI

version: "1.0"
category: llm-providers

# Global LLM settings
defaults:
  enabled: false  # Explicitly enable providers
  timeout: "00:02:00"
  maxRetries: 3

# ============================================================================
# LLM PROVIDERS
# ============================================================================
providers:
  # Cloud providers
  claude:
    enabled: false
    priority: 100
    config: claude.yaml
    description: "Anthropic Claude (Claude 3.5/4)"
    mode: remote

  openai:
    enabled: false
    priority: 90
    config: openai.yaml
    description: "OpenAI GPT-4/4o"
    mode: remote

  azure-openai:
    enabled: false
    priority: 90
    config: azure-openai.yaml
    description: "Azure OpenAI Service"
    mode: remote

  # Local providers (for air-gap)
  ollama:
    enabled: false
    priority: 80
    config: ollama.yaml
    description: "Ollama local inference"
    mode: local

  llama-server:
    enabled: false
    priority: 80
    config: llama-server.yaml
    description: "llama.cpp HTTP server"
    mode: local

  vllm:
    enabled: false
    priority: 80
    config: vllm.yaml
    description: "vLLM inference server"
    mode: local

# ============================================================================
# INFERENCE SETTINGS
# ============================================================================
inference:
  # Mode: remote, local, hybrid
  mode: "${ADVISORY_AI_INFERENCE_MODE:-local}"

  # Fallback chain
  fallbackChain:
    - claude
    - openai
    - ollama

  # Model selection
  modelSelection:
    # Task-specific model overrides
    explanation: ""     # Use default
    remediation: ""     # Use default
    classification: ""  # Use default

# ============================================================================
# OFFLINE/AIR-GAP SETTINGS
# ============================================================================
offline:
  # Signed model bundle path
  modelBundlePath: "${ADVISORY_AI_MODEL_BUNDLE_PATH:-/opt/stellaops/offline/models}"

  # Verify bundle signatures
  verifySignatures: true

  # Public key for signature verification
  publicKeyPath: "${ADVISORY_AI_MODEL_PUBKEY:-/etc/stellaops/model-signing-pubkey.pem}"
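Note: the registry combines per-provider priorities with an explicit fallbackChain, and the diff does not spell out how the two interact. The sketch below simply orders enabled providers by priority (using the "lower = higher priority" convention stated in the provider samples) and filters the declared chain to enabled entries; both choices are assumptions about the intended semantics, not the project's actual resolution logic:

def ordered_providers(registry: dict) -> list[str]:
    """Return enabled provider names ordered by priority (lower number tried first)."""
    providers = registry["providers"]
    enabled = [name for name, entry in providers.items() if entry.get("enabled")]
    return sorted(enabled, key=lambda name: providers[name]["priority"])

def resolve_fallback_chain(registry: dict) -> list[str]:
    """Keep only fallbackChain entries that are enabled, preserving declared order."""
    enabled = set(ordered_providers(registry))
    return [name for name in registry["inference"]["fallbackChain"] if name in enabled]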