# LLM Provider Plugins

> **Sprint:** SPRINT_20251226_019_AI_offline_inference
> **Tasks:** OFFLINE-07, OFFLINE-08, OFFLINE-09

This guide documents the LLM (Large Language Model) provider plugin architecture for AI-powered advisory analysis, explanations, and remediation planning.

## Overview

StellaOps supports multiple LLM backends through a unified plugin architecture:

| Provider | Type | Use Case | Priority |
|----------|------|----------|----------|
| **llama-server** | Local | Airgap/offline deployment | 10 (highest) |
| **ollama** | Local | Development, edge deployment | 20 |
| **openai** | Cloud | GPT-4o for high-quality output | 100 |
| **claude** | Cloud | Claude Sonnet for complex reasoning | 100 |

## Architecture

### Plugin Interface

```csharp
public interface ILlmProviderPlugin : IAvailabilityPlugin
{
    string ProviderId { get; }            // "openai", "claude", "llama-server", "ollama"
    string DisplayName { get; }           // Human-readable name
    string Description { get; }           // Provider description
    string DefaultConfigFileName { get; } // "openai.yaml", etc.

    ILlmProvider Create(IServiceProvider services, IConfiguration configuration);
    LlmProviderConfigValidation ValidateConfiguration(IConfiguration configuration);
}
```
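`ILlmProviderPlugin` extends `IAvailabilityPlugin`, which is not reproduced in this guide. Based on the members the custom plugin example later in this guide implements (`Name` and `IsAvailable`), a minimal sketch of that base contract might look like the following; treat the exact shape as an assumption.

```csharp
// Hypothetical sketch of the base availability contract; the real
// IAvailabilityPlugin definition lives in the plugin SDK and may differ.
public interface IAvailabilityPlugin
{
    string Name { get; }                         // Human-readable plugin name
    bool IsAvailable(IServiceProvider services); // Can this plugin run in the current host?
}
```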
### Provider Interface

```csharp
public interface ILlmProvider : IDisposable
{
    string ProviderId { get; }

    Task<bool> IsAvailableAsync(CancellationToken cancellationToken = default);

    Task<LlmCompletionResult> CompleteAsync(
        LlmCompletionRequest request,
        CancellationToken cancellationToken = default);

    IAsyncEnumerable<LlmStreamChunk> CompleteStreamAsync(
        LlmCompletionRequest request,
        CancellationToken cancellationToken = default);
}
```

### Request and Response

```csharp
public record LlmCompletionRequest
{
    public string? SystemPrompt { get; init; }
    public required string UserPrompt { get; init; }
    public string? Model { get; init; }
    public double Temperature { get; init; } = 0;  // 0 = deterministic
    public int MaxTokens { get; init; } = 4096;
    public int? Seed { get; init; }                // For reproducibility
    public IReadOnlyList<string>? StopSequences { get; init; }
    public string? RequestId { get; init; }
}

public record LlmCompletionResult
{
    public required string Content { get; init; }
    public required string ModelId { get; init; }
    public required string ProviderId { get; init; }
    public int? InputTokens { get; init; }
    public int? OutputTokens { get; init; }
    public long? TotalTimeMs { get; init; }
    public string? FinishReason { get; init; }
    public bool Deterministic { get; init; }
}
```
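The streaming API yields `LlmStreamChunk` values, which this guide does not otherwise define. Inferring from how the streaming example later in this guide consumes chunks (`Content`, `IsFinal`, `FinishReason`), a plausible sketch is shown below; the actual record may carry additional fields.

```csharp
// Assumed shape of a streamed chunk, inferred from the streaming example;
// not an authoritative definition.
public record LlmStreamChunk
{
    public required string Content { get; init; } // Incremental text delta
    public bool IsFinal { get; init; }            // True for the terminating chunk
    public string? FinishReason { get; init; }    // Populated on the final chunk
}
```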
## Configuration

### Directory Structure

```
etc/
  llm-providers/
    openai.yaml          # OpenAI configuration
    claude.yaml          # Claude/Anthropic configuration
    llama-server.yaml    # llama.cpp server configuration
    ollama.yaml          # Ollama configuration
```

### Environment Variables

| Variable | Provider | Description |
|----------|----------|-------------|
| `OPENAI_API_KEY` | OpenAI | API key for OpenAI |
| `ANTHROPIC_API_KEY` | Claude | API key for Anthropic |
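The keys can be supplied through the environment before the host process starts and are referenced via the `${...}` placeholders in the provider YAML files. For example (plain shell syntax; how you inject secrets in production depends on your deployment):

```bash
# Export API keys for the cloud providers (not needed for airgap deployments)
export OPENAI_API_KEY="<your-openai-api-key>"
export ANTHROPIC_API_KEY="<your-anthropic-api-key>"
```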
### Priority System

Providers are selected by priority (a lower value means higher preference):

```yaml
# llama-server.yaml - highest priority for offline
priority: 10

# ollama.yaml - second priority for local
priority: 20

# openai.yaml / claude.yaml - cloud fallback
priority: 100
```
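A minimal sketch of what priority-based selection could look like is shown below. This is illustrative only: the real `ILlmProviderFactory.GetDefaultProvider` also folds in configuration loading and caching, and its implementation may differ.

```csharp
// Illustrative only: pick the enabled provider with the lowest priority value
// whose backend currently reports itself as available.
public static async Task<ILlmProvider?> SelectByPriorityAsync(
    IEnumerable<(ILlmProvider Provider, int Priority, bool Enabled)> candidates,
    CancellationToken cancellationToken = default)
{
    foreach (var candidate in candidates
        .Where(c => c.Enabled)
        .OrderBy(c => c.Priority))
    {
        if (await candidate.Provider.IsAvailableAsync(cancellationToken))
        {
            return candidate.Provider;
        }
    }

    return null; // Caller decides how to handle "no providers available"
}
```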
## Provider Details

### OpenAI Provider

Supports the OpenAI API and Azure OpenAI Service.

```yaml
# etc/llm-providers/openai.yaml
enabled: true
priority: 100

api:
  apiKey: "${OPENAI_API_KEY}"
  baseUrl: "https://api.openai.com/v1"
  organizationId: ""
  apiVersion: ""          # Required for Azure OpenAI

model:
  name: "gpt-4o"
  fallbacks:
    - "gpt-4o-mini"

inference:
  temperature: 0.0
  maxTokens: 4096
  seed: 42
  topP: 1.0
  frequencyPenalty: 0.0
  presencePenalty: 0.0

request:
  timeout: "00:02:00"
  maxRetries: 3
```

**Azure OpenAI Configuration:**

```yaml
api:
  baseUrl: "https://{resource}.openai.azure.com/openai/deployments/{deployment}"
  apiKey: "${AZURE_OPENAI_KEY}"
  apiVersion: "2024-02-15-preview"
```
### Claude Provider

Supports the Anthropic Claude API.

```yaml
# etc/llm-providers/claude.yaml
enabled: true
priority: 100

api:
  apiKey: "${ANTHROPIC_API_KEY}"
  baseUrl: "https://api.anthropic.com"
  apiVersion: "2023-06-01"

model:
  name: "claude-sonnet-4-20250514"
  fallbacks:
    - "claude-3-5-sonnet-20241022"

inference:
  temperature: 0.0
  maxTokens: 4096
  topP: 1.0
  topK: 0

thinking:
  enabled: false
  budgetTokens: 10000

request:
  timeout: "00:02:00"
  maxRetries: 3
```
### llama.cpp Server Provider

**Primary provider for airgap/offline deployments.**

```yaml
# etc/llm-providers/llama-server.yaml
enabled: true
priority: 10                     # Highest priority

server:
  baseUrl: "http://localhost:8080"
  apiKey: ""
  healthEndpoint: "/health"

model:
  name: "llama3-8b-q4km"
  modelPath: "/models/llama-3-8b-instruct.Q4_K_M.gguf"
  expectedDigest: "sha256:..."   # For airgap verification

inference:
  temperature: 0.0
  maxTokens: 4096
  seed: 42
  topP: 1.0
  topK: 40
  repeatPenalty: 1.1
  contextLength: 4096

bundle:
  bundlePath: "/bundles/llama3-8b.stellaops-model"
  verifySignature: true
  cryptoScheme: "ed25519"

request:
  timeout: "00:05:00"
  maxRetries: 2
```

**Starting the llama.cpp server:**

```bash
# Basic server
llama-server -m model.gguf --host 0.0.0.0 --port 8080

# With GPU acceleration (offload 35 layers to the GPU)
llama-server -m model.gguf --host 0.0.0.0 --port 8080 -ngl 35

# With API key authentication
llama-server -m model.gguf --host 0.0.0.0 --port 8080 --api-key "your-key"
```
### Ollama Provider

For local development and edge deployments.

```yaml
# etc/llm-providers/ollama.yaml
enabled: true
priority: 20

server:
  baseUrl: "http://localhost:11434"
  healthEndpoint: "/api/tags"

model:
  name: "llama3:8b"
  fallbacks:
    - "mistral:7b"
  keepAlive: "5m"

inference:
  temperature: 0.0
  maxTokens: 4096
  seed: 42
  topP: 1.0
  topK: 40
  repeatPenalty: 1.1
  numCtx: 4096

gpu:
  numGpu: 0        # 0 = CPU only, -1 = all layers on GPU

management:
  autoPull: false  # Disable for airgap
  verifyPull: true

request:
  timeout: "00:05:00"
  maxRetries: 2
```
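With `autoPull` disabled, the configured models have to be present on the Ollama host already. In connected (non-airgap) environments they can be pulled ahead of time; in airgap environments, import them through your offline model bundle process instead.

```bash
# Pre-pull the configured models on the Ollama host (connected environments only)
ollama pull llama3:8b
ollama pull mistral:7b

# Confirm the models are available
ollama list
```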
## Usage

### Dependency Injection

```csharp
// Program.cs or Startup.cs
services.AddLlmProviderPlugins("etc/llm-providers");

// Or with explicit configuration
services.AddLlmProviderPlugins(catalog =>
{
    catalog.LoadConfigurationsFromDirectory("etc/llm-providers");
    // Optionally register custom plugins
    catalog.RegisterPlugin(new CustomLlmProviderPlugin());
});
```

### Using the Provider Factory

```csharp
public class AdvisoryExplanationService
{
    private readonly ILlmProviderFactory _providerFactory;

    public AdvisoryExplanationService(ILlmProviderFactory providerFactory)
        => _providerFactory = providerFactory;

    public async Task<string> GenerateExplanationAsync(
        string vulnerabilityId,
        CancellationToken cancellationToken)
    {
        // Get the default (highest priority available) provider
        var provider = _providerFactory.GetDefaultProvider();

        var request = new LlmCompletionRequest
        {
            SystemPrompt = "You are a security analyst explaining vulnerabilities.",
            UserPrompt = $"Explain {vulnerabilityId} in plain language.",
            Temperature = 0,  // Deterministic
            Seed = 42,        // Reproducible
            MaxTokens = 2048
        };

        var result = await provider.CompleteAsync(request, cancellationToken);
        return result.Content;
    }
}
```
### Provider Selection

```csharp
// Get a specific provider
var openaiProvider = _providerFactory.GetProvider("openai");
var claudeProvider = _providerFactory.GetProvider("claude");
var llamaProvider = _providerFactory.GetProvider("llama-server");

// List available providers
var available = _providerFactory.AvailableProviders;
// Returns: ["llama-server", "ollama", "openai", "claude"]
```

### Automatic Fallback

```csharp
// Create a fallback provider that tries providers in order
var fallbackProvider = new FallbackLlmProvider(
    _providerFactory,
    providerOrder: ["llama-server", "ollama", "openai", "claude"],
    _logger);

// Uses the first available provider, falls back on failure
var result = await fallbackProvider.CompleteAsync(request, cancellationToken);
```

### Streaming Responses

```csharp
var provider = _providerFactory.GetDefaultProvider();

await foreach (var chunk in provider.CompleteStreamAsync(request, cancellationToken))
{
    Console.Write(chunk.Content);

    if (chunk.IsFinal)
    {
        Console.WriteLine($"\n[Finished: {chunk.FinishReason}]");
    }
}
```
## Determinism Requirements

For reproducible AI outputs (required for attestations):

| Setting | Value | Purpose |
|---------|-------|---------|
| `temperature` | `0.0` | No randomness in token selection |
| `seed` | `42` | Fixed random seed |
| `topK` | `1` | Single token selection (optional) |

```yaml
inference:
  temperature: 0.0
  seed: 42
  topK: 1    # Most deterministic
```

**Verification:**

```csharp
var result = await provider.CompleteAsync(request, cancellationToken);

if (!result.Deterministic)
{
    _logger.LogWarning("Output may not be reproducible");
}
```
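To go beyond the `Deterministic` flag, you can check reproducibility empirically by issuing the same request twice and comparing the outputs. A minimal sketch follows; the hashing approach and helper name are illustrative, not part of the provider API.

```csharp
using System.Security.Cryptography;
using System.Text;

// Illustrative helper: run the same request twice and compare content hashes.
// Identical hashes are strong evidence the configuration is reproducible.
static async Task<bool> IsReproducibleAsync(
    ILlmProvider provider,
    LlmCompletionRequest request,
    CancellationToken cancellationToken = default)
{
    var first = await provider.CompleteAsync(request, cancellationToken);
    var second = await provider.CompleteAsync(request, cancellationToken);

    static string Sha256Hex(string text) =>
        Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(text)));

    return Sha256Hex(first.Content) == Sha256Hex(second.Content);
}
```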
## Offline/Airgap Deployment

### Recommended Configuration

```
etc/llm-providers/
  llama-server.yaml    # Primary - enabled, priority: 10
  ollama.yaml          # Backup - enabled, priority: 20
  openai.yaml          # Disabled or missing
  claude.yaml          # Disabled or missing
```
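If you keep the cloud provider files around (for example, to re-enable them later), setting `enabled: false` is enough to exclude them from selection; deleting the files has the same effect.

```yaml
# openai.yaml / claude.yaml in an airgap deployment
enabled: false
```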
### Model Bundle Verification

For airgap environments, use signed model bundles:

```yaml
# llama-server.yaml
bundle:
  bundlePath: "/bundles/llama3-8b.stellaops-model"
  verifySignature: true
  cryptoScheme: "ed25519"

model:
  expectedDigest: "sha256:abc123..."
```

**Creating a model bundle:**

```bash
# Create signed bundle
stella model bundle \
  --model /models/llama-3-8b-instruct.Q4_K_M.gguf \
  --sign \
  --output /bundles/llama3-8b.stellaops-model

# Verify bundle
stella model verify /bundles/llama3-8b.stellaops-model
```
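Independently of the bundle signature, the `expectedDigest` check can be reproduced by hashing the GGUF file yourself. A minimal sketch, assuming the digest is a plain SHA-256 of the model file (the provider's actual verification may differ):

```csharp
using System.Security.Cryptography;

// Illustrative digest check, assuming expectedDigest is "sha256:<hex>" over the raw file.
static async Task<bool> MatchesExpectedDigestAsync(string modelPath, string expectedDigest)
{
    await using var stream = File.OpenRead(modelPath);
    using var sha256 = SHA256.Create();
    var hash = await sha256.ComputeHashAsync(stream);

    var actual = "sha256:" + Convert.ToHexString(hash).ToLowerInvariant();
    return string.Equals(actual, expectedDigest, StringComparison.OrdinalIgnoreCase);
}
```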
## Custom Plugins

To add support for a new LLM provider:

```csharp
public sealed class CustomLlmProviderPlugin : ILlmProviderPlugin
{
    public string Name => "Custom LLM Provider";
    public string ProviderId => "custom";
    public string DisplayName => "Custom LLM";
    public string Description => "Custom LLM backend";
    public string DefaultConfigFileName => "custom.yaml";

    public bool IsAvailable(IServiceProvider services) => true;

    public ILlmProvider Create(IServiceProvider services, IConfiguration configuration)
    {
        var config = CustomConfig.FromConfiguration(configuration);
        var httpClientFactory = services.GetRequiredService<IHttpClientFactory>();
        var logger = services.GetRequiredService<ILogger<CustomLlmProvider>>();
        return new CustomLlmProvider(httpClientFactory.CreateClient(), config, logger);
    }

    public LlmProviderConfigValidation ValidateConfiguration(IConfiguration configuration)
    {
        // Validate configuration
        return LlmProviderConfigValidation.Success();
    }
}
```
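`Create` above returns a `CustomLlmProvider`, which must implement `ILlmProvider`. A minimal skeleton is sketched below; the `CustomConfig` type, the HTTP wiring, and the placeholder responses are assumptions you would replace with real calls to your backend.

```csharp
using System.Runtime.CompilerServices;

public sealed class CustomLlmProvider : ILlmProvider
{
    private readonly HttpClient _httpClient;   // Used for calls to your backend
    private readonly CustomConfig _config;     // Hypothetical config type from the plugin example
    private readonly ILogger<CustomLlmProvider> _logger;

    public CustomLlmProvider(HttpClient httpClient, CustomConfig config, ILogger<CustomLlmProvider> logger)
        => (_httpClient, _config, _logger) = (httpClient, config, logger);

    public string ProviderId => "custom";

    public Task<bool> IsAvailableAsync(CancellationToken cancellationToken = default)
        => Task.FromResult(true); // Replace with a real health check against your backend

    public async Task<LlmCompletionResult> CompleteAsync(
        LlmCompletionRequest request,
        CancellationToken cancellationToken = default)
    {
        // Call your backend here; this placeholder returns an empty completion.
        await Task.CompletedTask;
        return new LlmCompletionResult
        {
            Content = string.Empty,
            ModelId = request.Model ?? "custom-model",
            ProviderId = ProviderId,
            Deterministic = request.Temperature == 0 && request.Seed is not null
        };
    }

    public async IAsyncEnumerable<LlmStreamChunk> CompleteStreamAsync(
        LlmCompletionRequest request,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        // Simplest possible streaming: emit the whole completion as a single final chunk.
        var result = await CompleteAsync(request, cancellationToken);
        yield return new LlmStreamChunk
        {
            Content = result.Content,
            IsFinal = true,
            FinishReason = result.FinishReason
        };
    }

    public void Dispose()
    {
        // HttpClient instances from IHttpClientFactory are managed by the factory.
    }
}
```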
Register the custom plugin:

```csharp
services.AddLlmProviderPlugins(catalog =>
{
    catalog.RegisterPlugin(new CustomLlmProviderPlugin());
    catalog.LoadConfigurationsFromDirectory("etc/llm-providers");
});
```

## Telemetry

LLM operations emit structured logs:

```json
{
  "timestamp": "2025-12-26T10:30:00Z",
  "operation": "llm_completion",
  "providerId": "llama-server",
  "model": "llama3-8b-q4km",
  "inputTokens": 1234,
  "outputTokens": 567,
  "totalTimeMs": 2345,
  "deterministic": true,
  "finishReason": "stop"
}
```
## Performance Comparison

| Provider | Latency (TTFT) | Throughput | Cost | Offline |
|----------|----------------|------------|------|---------|
| **llama-server** | 50-200ms | 20-50 tok/s | Free | Yes |
| **ollama** | 100-500ms | 15-40 tok/s | Free | Yes |
| **openai (gpt-4o)** | 200-500ms | 50-100 tok/s | $$$ | No |
| **claude (sonnet)** | 300-600ms | 40-80 tok/s | $$$ | No |

*Note: Local performance depends heavily on hardware (GPU, RAM, CPU).*

## Troubleshooting

### Provider Not Available

```
InvalidOperationException: No LLM providers are available.
```

**Solutions:**

1. Check that configuration files exist in `etc/llm-providers/`.
2. Verify that API keys are set (environment variables or config).
3. For local providers, ensure the server is running:

   ```bash
   # llama-server
   curl http://localhost:8080/health

   # ollama
   curl http://localhost:11434/api/tags
   ```
### Non-Deterministic Output

```
Warning: Output may not be reproducible
```

**Solutions:**

1. Set `temperature: 0.0` in the configuration.
2. Set `seed: 42` (or any other fixed value).
3. Use the same model version across environments.

### Timeout Errors

```
TaskCanceledException: The request was canceled due to timeout.
```

**Solutions:**

1. Increase `request.timeout` in the configuration.
2. For local inference, ensure sufficient hardware resources.
3. Reduce `maxTokens` if appropriate.

## Related Documentation

- [AI Attestations](./ai-attestations.md)
- [Offline Model Bundles](./offline-model-bundles.md)
- [Advisory AI Architecture](../architecture.md)
- [Configuration Reference](../../../../etc/llm-providers/)