using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Security.Cryptography;

namespace StellaOps.AdvisoryAI.Inference;

/// <summary>
/// Local LLM runtime using llama.cpp bindings.
/// Sprint: SPRINT_20251226_019_AI_offline_inference
/// Task: OFFLINE-05
/// </summary>
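/// <remarks>
/// Typical call sequence (illustrative sketch; only members already used in this file are shown):
/// <code>
/// await runtime.LoadModelAsync(config);
/// var verified = await runtime.VerifyDigestAsync(expectedDigest);
/// var result = await runtime.GenerateAsync(prompt);
/// </code>
/// </remarks>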
public sealed class LlamaCppRuntime : ILocalLlmRuntime
{
    private LocalLlmConfig? _config;
    private bool _modelLoaded;
    private string? _computedDigest;

    public string RuntimeType => "llama.cpp";

    public Task LoadModelAsync(LocalLlmConfig config, CancellationToken cancellationToken = default)
    {
        _config = config;

        // Verify the model file exists before marking the runtime as loaded.
        if (!File.Exists(config.ModelPath))
        {
            throw new FileNotFoundException($"Model file not found: {config.ModelPath}");
        }

        // In a real implementation, this would:
        // 1. Load the GGUF/GGML model file
        // 2. Initialize the llama.cpp context with the config settings
        // 3. Verify the weights digest if required
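        //
        // For instance, with the LLamaSharp bindings (an assumed choice of bindings;
        // exact property names vary between versions) the load could look roughly like:
        //
        //   var parameters = new ModelParams(config.ModelPath) { ContextSize = (uint)config.ContextLength };
        //   _weights = LLamaWeights.LoadFromFile(parameters);
        //   _context = _weights.CreateContext(parameters);
        //
        // where _weights and _context would be new fields released in UnloadModelAsync/Dispose.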

        _modelLoaded = true;
        return Task.CompletedTask;
    }

    public Task UnloadModelAsync(CancellationToken cancellationToken = default)
    {
        _modelLoaded = false;
        _config = null;
        _computedDigest = null;
        return Task.CompletedTask;
    }

    public Task<LocalModelStatus> GetStatusAsync(CancellationToken cancellationToken = default)
    {
        return Task.FromResult(new LocalModelStatus
        {
            Loaded = _modelLoaded,
            ModelPath = _config?.ModelPath ?? string.Empty,
            // Treat the digest as verified only after VerifyDigestAsync has computed one that matches.
            DigestVerified = _computedDigest is not null &&
                string.Equals(_computedDigest, _config?.WeightsDigest, StringComparison.OrdinalIgnoreCase),
            MemoryBytes = _modelLoaded ? EstimateMemoryUsage() : 0,
            Device = _config?.Device.ToString() ?? "Unknown",
            ContextSize = _config?.ContextLength ?? 0
        });
    }

    public async Task<LocalInferenceResult> GenerateAsync(string prompt, CancellationToken cancellationToken = default)
    {
        if (!_modelLoaded || _config is null)
        {
            throw new InvalidOperationException("Model not loaded");
        }

        var stopwatch = Stopwatch.StartNew();
        var firstTokenTime = 0L;

        // In a real implementation, this would call llama.cpp inference.
        // For now, return a placeholder response.
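        //
        // With the assumed LLamaSharp bindings, the real call could look roughly like:
        //
        //   var executor = new StatelessExecutor(_weights, parameters);
        //   var buffer = new StringBuilder();
        //   await foreach (var piece in executor.InferAsync(prompt, inferenceParams, cancellationToken))
        //   {
        //       if (buffer.Length == 0) { firstTokenTime = stopwatch.ElapsedMilliseconds; }
        //       buffer.Append(piece);
        //   }
        //
        // (_weights, parameters and inferenceParams are illustrative; they are not defined in this file.)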

        await Task.Delay(100, cancellationToken); // Simulate time to first token
        firstTokenTime = stopwatch.ElapsedMilliseconds;

        await Task.Delay(400, cancellationToken); // Simulate remaining generation

        stopwatch.Stop();

        var generatedContent = GeneratePlaceholderResponse(prompt);
        var tokensGenerated = generatedContent.Split(' ').Length;

        return new LocalInferenceResult
        {
            Content = generatedContent,
            TokensGenerated = tokensGenerated,
            InferenceTimeMs = stopwatch.ElapsedMilliseconds,
            TimeToFirstTokenMs = firstTokenTime,
            ModelId = $"local:{Path.GetFileName(_config.ModelPath)}",
            Deterministic = _config.Temperature == 0,
            Seed = _config.Seed
        };
    }

    public async IAsyncEnumerable<string> GenerateStreamAsync(
        string prompt,
        [EnumeratorCancellation] CancellationToken cancellationToken = default)
    {
        if (!_modelLoaded || _config is null)
        {
            throw new InvalidOperationException("Model not loaded");
        }

        // Simulate streaming output word by word.
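        // A real implementation would instead forward pieces from the bindings' async stream,
        // e.g. `await foreach (var piece in executor.InferAsync(...)) { yield return piece; }`,
        // assuming an executor field like the one sketched in GenerateAsync.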
        var words = GeneratePlaceholderResponse(prompt).Split(' ');
        foreach (var word in words)
        {
            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            await Task.Delay(50, cancellationToken);
            yield return word + " ";
        }
    }

    public async Task<bool> VerifyDigestAsync(string expectedDigest, CancellationToken cancellationToken = default)
    {
        if (_config is null || !File.Exists(_config.ModelPath))
        {
            return false;
        }

        using var sha256 = SHA256.Create();
        await using var stream = File.OpenRead(_config.ModelPath);
        var hash = await sha256.ComputeHashAsync(stream, cancellationToken);
        _computedDigest = Convert.ToHexStringLower(hash);

        return string.Equals(_computedDigest, expectedDigest, StringComparison.OrdinalIgnoreCase);
    }

    private long EstimateMemoryUsage()
    {
        if (_config is null)
        {
            return 0;
        }

        // Rough estimate: the model file size (which already reflects quantization)
        // plus a per-token allowance for the KV cache.
        var baseSize = new FileInfo(_config.ModelPath).Length;
        var contextOverhead = _config.ContextLength * 4096L; // Rough KV cache estimate
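        // For example, a 4096-token context adds 4096 * 4096 bytes ≈ 16 MiB on top of the file size.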

        return baseSize + contextOverhead;
    }

    private static string GeneratePlaceholderResponse(string prompt)
    {
        // In a real implementation, this would be actual LLM output.
        if (prompt.Contains("explain", StringComparison.OrdinalIgnoreCase))
        {
            return "This vulnerability affects the component by allowing unauthorized access. " +
                   "The vulnerable code path is reachable from the application entry point. " +
                   "Evidence: [EVIDENCE:sbom-001] Component is present in SBOM. " +
                   "[EVIDENCE:reach-001] Call graph shows reachability.";
        }

        if (prompt.Contains("remediat", StringComparison.OrdinalIgnoreCase))
        {
            return "Recommended remediation: Upgrade the affected component to the patched version. " +
                   "- Update package.json: dependency@1.0.0 -> dependency@1.0.1 " +
                   "- Run npm install to update lockfile " +
                   "- Verify with npm audit";
        }

        if (prompt.Contains("policy", StringComparison.OrdinalIgnoreCase))
        {
            return "Parsed policy intent: Override rule for critical severity. " +
                   "Conditions: severity = critical, scope = production. " +
                   "Actions: set_verdict = block.";
        }

        return "Analysis complete. The finding has been evaluated based on available evidence.";
    }

    public void Dispose()
    {
        _modelLoaded = false;
        _config = null;
        _computedDigest = null;
    }
}