save progress
@@ -12,6 +12,8 @@ namespace StellaOps.AdvisoryAI.Tests;
/// Sprint: SPRINT_20251226_015_AI_zastava_companion
/// Task: ZASTAVA-19
/// </summary>
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Advisories)]
public sealed class ExplanationGeneratorIntegrationTests
{
    [Trait("Category", TestCategories.Unit)]

@@ -83,80 +83,6 @@ public sealed class HttpClientUsageAnalyzerTests
        Assert.DoesNotContain(diagnostics, d => d.Id == HttpClientUsageAnalyzer.DiagnosticId);
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_RewritesToFactoryCall()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = new HttpClient();
                }
            }
            """;

        const string expected = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create(egressPolicy: default(global::StellaOps.AirGap.Policy.IEgressPolicy) /* TODO: provide IEgressPolicy instance */, request: new global::StellaOps.AirGap.Policy.EgressRequest(component: "REPLACE_COMPONENT", destination: new global::System.Uri("https://replace-with-endpoint"), intent: "REPLACE_INTENT"));
                }
            }
            """;

        var updated = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
        Assert.Equal(expected.ReplaceLineEndings(), updated.ReplaceLineEndings());
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_PreservesHttpClientArguments()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var handler = new HttpClientHandler();
                    var client = new HttpClient(handler, disposeHandler: false);
                }
            }
            """;

        const string expected = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var handler = new HttpClientHandler();
                    var client = global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create(egressPolicy: default(global::StellaOps.AirGap.Policy.IEgressPolicy) /* TODO: provide IEgressPolicy instance */, request: new global::StellaOps.AirGap.Policy.EgressRequest(component: "REPLACE_COMPONENT", destination: new global::System.Uri("https://replace-with-endpoint"), intent: "REPLACE_INTENT"), clientFactory: () => new global::System.Net.Http.HttpClient(handler, disposeHandler: false));
                }
            }
            """;

        var updated = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
        Assert.Equal(expected.ReplaceLineEndings(), updated.ReplaceLineEndings());
    }

    private static async Task<ImmutableArray<Diagnostic>> AnalyzeAsync(string source, string assemblyName)
    {
        var compilation = CSharpCompilation.Create(
@@ -174,53 +100,6 @@ public sealed class HttpClientUsageAnalyzerTests
        return await compilationWithAnalyzers.GetAnalyzerDiagnosticsAsync();
    }

    private static async Task<string> ApplyCodeFixAsync(string source, string assemblyName)
    {
        using var workspace = new AdhocWorkspace();

        var projectId = ProjectId.CreateNewId();
        var documentId = DocumentId.CreateNewId(projectId);
        var stubDocumentId = DocumentId.CreateNewId(projectId);

        var solution = workspace.CurrentSolution
            .AddProject(projectId, "TestProject", "TestProject", LanguageNames.CSharp)
            .WithProjectCompilationOptions(projectId, new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary))
            .WithProjectAssemblyName(projectId, assemblyName)
            .AddMetadataReferences(projectId, CreateMetadataReferences())
            .AddDocument(documentId, "Test.cs", SourceText.From(source))
            .AddDocument(stubDocumentId, "PolicyStubs.cs", SourceText.From(PolicyStubSource));

        var project = solution.GetProject(projectId)!;
        var document = solution.GetDocument(documentId)!;

        var compilation = await project.GetCompilationAsync();
        var analyzer = new HttpClientUsageAnalyzer();
        var diagnostics = await compilation!.WithAnalyzers(ImmutableArray.Create<DiagnosticAnalyzer>(analyzer))
            .GetAnalyzerDiagnosticsAsync();

        var diagnostic = Assert.Single(diagnostics);

        var codeFixProvider = new HttpClientUsageCodeFixProvider();
        var actions = new List<CodeAction>();
        var context = new CodeFixContext(
            document,
            diagnostic,
            (action, _) => actions.Add(action),
            CancellationToken.None);

        await codeFixProvider.RegisterCodeFixesAsync(context);
        var action = Assert.Single(actions);
        var operations = await action.GetOperationsAsync(CancellationToken.None);

        foreach (var operation in operations)
        {
            operation.Apply(workspace, CancellationToken.None);
        }
        var updatedDocument = workspace.CurrentSolution.GetDocument(documentId)!;
        var updatedText = await updatedDocument.GetTextAsync();
        return updatedText.ToString();
    }

    private static IEnumerable<MetadataReference> CreateMetadataReferences()
    {
        yield return MetadataReference.CreateFromFile(typeof(object).GetTypeInfo().Assembly.Location);

@@ -276,165 +276,6 @@ public sealed class PolicyAnalyzerRoslynTests

    #region AIRGAP-5100-006: Golden Generated Code Tests

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_GeneratesExpectedFactoryCall()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = new HttpClient();
                }
            }
            """;

        const string expectedGolden = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create(egressPolicy: default(global::StellaOps.AirGap.Policy.IEgressPolicy) /* TODO: provide IEgressPolicy instance */, request: new global::StellaOps.AirGap.Policy.EgressRequest(component: "REPLACE_COMPONENT", destination: new global::System.Uri("https://replace-with-endpoint"), intent: "REPLACE_INTENT"));
                }
            }
            """;

        var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
        fixedCode.ReplaceLineEndings().Should().Be(expectedGolden.ReplaceLineEndings(),
            "Code fix should match golden output exactly");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_PreservesTrivia()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    // Important: this client handles external requests
                    var client = new HttpClient(); // end of line comment
                }
            }
            """;

        var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");

        // The code fix preserves the trivia from the original node
        fixedCode.Should().Contain("// Important: this client handles external requests",
            "Leading comment should be preserved");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_DeterministicOutput()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Determinism;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = new HttpClient();
                }
            }
            """;

        // Apply code fix multiple times
        var result1 = await ApplyCodeFixAsync(source, assemblyName: "Sample.Determinism");
        var result2 = await ApplyCodeFixAsync(source, assemblyName: "Sample.Determinism");
        var result3 = await ApplyCodeFixAsync(source, assemblyName: "Sample.Determinism");

        result1.Should().Be(result2, "Code fix should be deterministic");
        result2.Should().Be(result3, "Code fix should be deterministic");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_ContainsRequiredPlaceholders()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = new HttpClient();
                }
            }
            """;

        var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");

        // Verify all required placeholders are present for developer to fill in
        fixedCode.Should().Contain("EgressHttpClientFactory.Create");
        fixedCode.Should().Contain("egressPolicy:");
        fixedCode.Should().Contain("IEgressPolicy");
        fixedCode.Should().Contain("EgressRequest");
        fixedCode.Should().Contain("component:");
        fixedCode.Should().Contain("REPLACE_COMPONENT");
        fixedCode.Should().Contain("destination:");
        fixedCode.Should().Contain("intent:");
        fixedCode.Should().Contain("REPLACE_INTENT");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFix_UsesFullyQualifiedNames()
    {
        const string source = """
            using System.Net.Http;

            namespace Sample.Service;

            public sealed class Demo
            {
                public void Run()
                {
                    var client = new HttpClient();
                }
            }
            """;

        var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");

        // Verify fully qualified names are used to avoid namespace conflicts
        fixedCode.Should().Contain("global::StellaOps.AirGap.Policy.EgressHttpClientFactory");
        fixedCode.Should().Contain("global::StellaOps.AirGap.Policy.EgressRequest");
        fixedCode.Should().Contain("global::System.Uri");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task FixAllProvider_IsWellKnownBatchFixer()
    {
        var provider = new HttpClientUsageCodeFixProvider();
        var fixAllProvider = provider.GetFixAllProvider();

        fixAllProvider.Should().Be(WellKnownFixAllProviders.BatchFixer,
            "Should use batch fixer for efficient multi-fix application");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task Analyzer_SupportedDiagnostics_ContainsExpectedId()
@@ -446,20 +287,6 @@ public sealed class PolicyAnalyzerRoslynTests
        supportedDiagnostics[0].Id.Should().Be("AIRGAP001");
    }

    [Trait("Category", TestCategories.Unit)]
    [Fact]
    public async Task CodeFixProvider_FixableDiagnosticIds_MatchesAnalyzer()
    {
        var analyzer = new HttpClientUsageAnalyzer();
        var codeFixProvider = new HttpClientUsageCodeFixProvider();

        var analyzerIds = analyzer.SupportedDiagnostics.Select(d => d.Id).ToHashSet();
        var fixableIds = codeFixProvider.FixableDiagnosticIds.ToHashSet();

        fixableIds.Should().BeSubsetOf(analyzerIds,
            "Code fix provider should only fix diagnostics reported by the analyzer");
    }

    #endregion

    #region Test Helpers
@@ -481,53 +308,6 @@ public sealed class PolicyAnalyzerRoslynTests
        return await compilationWithAnalyzers.GetAnalyzerDiagnosticsAsync();
    }

    private static async Task<string> ApplyCodeFixAsync(string source, string assemblyName)
    {
        using var workspace = new AdhocWorkspace();

        var projectId = ProjectId.CreateNewId();
        var documentId = DocumentId.CreateNewId(projectId);
        var stubDocumentId = DocumentId.CreateNewId(projectId);

        var solution = workspace.CurrentSolution
            .AddProject(projectId, "TestProject", "TestProject", LanguageNames.CSharp)
            .WithProjectCompilationOptions(projectId, new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary))
            .WithProjectAssemblyName(projectId, assemblyName)
            .AddMetadataReferences(projectId, CreateMetadataReferences())
            .AddDocument(documentId, "Test.cs", SourceText.From(source))
            .AddDocument(stubDocumentId, "PolicyStubs.cs", SourceText.From(PolicyStubSource));

        var project = solution.GetProject(projectId)!;
        var document = solution.GetDocument(documentId)!;

        var compilation = await project.GetCompilationAsync();
        var analyzer = new HttpClientUsageAnalyzer();
        var diagnostics = await compilation!.WithAnalyzers(ImmutableArray.Create<DiagnosticAnalyzer>(analyzer))
            .GetAnalyzerDiagnosticsAsync();

        var diagnostic = diagnostics.Single(d => d.Id == HttpClientUsageAnalyzer.DiagnosticId);

        var codeFixProvider = new HttpClientUsageCodeFixProvider();
        var actions = new List<CodeAction>();
        var context = new CodeFixContext(
            document,
            diagnostic,
            (action, _) => actions.Add(action),
            CancellationToken.None);

        await codeFixProvider.RegisterCodeFixesAsync(context);
        var action = actions.Single();
        var operations = await action.GetOperationsAsync(CancellationToken.None);

        foreach (var operation in operations)
        {
            operation.Apply(workspace, CancellationToken.None);
        }
        var updatedDocument = workspace.CurrentSolution.GetDocument(documentId)!;
        var updatedText = await updatedDocument.GetTextAsync();
        return updatedText.ToString();
    }

    private static IEnumerable<MetadataReference> CreateMetadataReferences()
    {
        // Core runtime references

@@ -1,125 +0,0 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Composition;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CodeActions;
using Microsoft.CodeAnalysis.CodeFixes;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;

namespace StellaOps.AirGap.Policy.Analyzers;

/// <summary>
/// Offers a remediation template that routes HttpClient creation through the shared EgressPolicy factory.
/// </summary>
[ExportCodeFixProvider(LanguageNames.CSharp, Name = nameof(HttpClientUsageCodeFixProvider))]
[Shared]
public sealed class HttpClientUsageCodeFixProvider : CodeFixProvider
{
    private const string Title = "Use EgressHttpClientFactory.Create(...)";

    /// <inheritdoc/>
    public override ImmutableArray<string> FixableDiagnosticIds
        => ImmutableArray.Create(HttpClientUsageAnalyzer.DiagnosticId);

    /// <inheritdoc/>
    public override FixAllProvider GetFixAllProvider()
        => WellKnownFixAllProviders.BatchFixer;

    /// <inheritdoc/>
    public override async Task RegisterCodeFixesAsync(CodeFixContext context)
    {
        if (context.Document is null)
        {
            return;
        }

        var root = await context.Document.GetSyntaxRootAsync(context.CancellationToken).ConfigureAwait(false);
        if (root is null)
        {
            return;
        }

        var diagnostic = context.Diagnostics[0];
        var node = root.FindNode(diagnostic.Location.SourceSpan);
        if (node is not ObjectCreationExpressionSyntax objectCreation)
        {
            return;
        }

        context.RegisterCodeFix(
            CodeAction.Create(
                Title,
                cancellationToken => ReplaceWithFactoryCallAsync(context.Document, objectCreation, cancellationToken),
                equivalenceKey: Title),
            diagnostic);
    }

    private static async Task<Document> ReplaceWithFactoryCallAsync(Document document, ObjectCreationExpressionSyntax creation, CancellationToken cancellationToken)
    {
        var replacementExpression = BuildReplacementExpression(creation);

        var root = await document.GetSyntaxRootAsync(cancellationToken).ConfigureAwait(false);
        if (root is null)
        {
            return document;
        }

        var updatedRoot = root.ReplaceNode(creation, replacementExpression.WithTriviaFrom(creation));
        return document.WithSyntaxRoot(updatedRoot);
    }

    private static ExpressionSyntax BuildReplacementExpression(ObjectCreationExpressionSyntax creation)
    {
        var requestExpression = SyntaxFactory.ParseExpression(
            "new global::StellaOps.AirGap.Policy.EgressRequest(" +
            "component: \"REPLACE_COMPONENT\", " +
            "destination: new global::System.Uri(\"https://replace-with-endpoint\"), " +
            "intent: \"REPLACE_INTENT\")");

        var egressPolicyExpression = SyntaxFactory.ParseExpression(
            "default(global::StellaOps.AirGap.Policy.IEgressPolicy)");

        var arguments = new List<ArgumentSyntax>
        {
            SyntaxFactory.Argument(egressPolicyExpression)
                .WithNameColon(SyntaxFactory.NameColon("egressPolicy"))
                .WithTrailingTrivia(
                    SyntaxFactory.Space,
                    SyntaxFactory.Comment("/* TODO: provide IEgressPolicy instance */")),
            SyntaxFactory.Argument(requestExpression)
                .WithNameColon(SyntaxFactory.NameColon("request"))
        };

        if (ShouldUseClientFactory(creation))
        {
            var clientFactoryLambda = SyntaxFactory.ParenthesizedLambdaExpression(
                SyntaxFactory.ParameterList(),
                CreateHttpClientExpression(creation));

            arguments.Add(
                SyntaxFactory.Argument(clientFactoryLambda)
                    .WithNameColon(SyntaxFactory.NameColon("clientFactory")));
        }

        return SyntaxFactory.InvocationExpression(
            SyntaxFactory.ParseExpression("global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create"))
            .WithArgumentList(SyntaxFactory.ArgumentList(SyntaxFactory.SeparatedList(arguments)));
    }

    private static bool ShouldUseClientFactory(ObjectCreationExpressionSyntax creation)
        => (creation.ArgumentList?.Arguments.Count ?? 0) > 0 || creation.Initializer is not null;

    private static ObjectCreationExpressionSyntax CreateHttpClientExpression(ObjectCreationExpressionSyntax creation)
    {
        var httpClientType = SyntaxFactory.ParseTypeName("global::System.Net.Http.HttpClient");
        var arguments = creation.ArgumentList ?? SyntaxFactory.ArgumentList();

        return SyntaxFactory.ObjectCreationExpression(httpClientType)
            .WithArgumentList(arguments)
            .WithInitializer(creation.Initializer);
    }
}
@@ -13,7 +13,6 @@

  <ItemGroup>
    <PackageReference Include="Microsoft.CodeAnalysis.CSharp" PrivateAssets="all" />
    <PackageReference Include="Microsoft.CodeAnalysis.CSharp.Workspaces" PrivateAssets="all" />
  </ItemGroup>

</Project>

@@ -0,0 +1,148 @@
// <copyright file="AirGapSyncServiceCollectionExtensions.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using StellaOps.AirGap.Sync.Services;
using StellaOps.AirGap.Sync.Stores;
using StellaOps.AirGap.Sync.Transport;
using StellaOps.Determinism;
using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync;

/// <summary>
/// Extension methods for registering air-gap sync services.
/// </summary>
public static class AirGapSyncServiceCollectionExtensions
{
    /// <summary>
    /// Adds air-gap sync services to the service collection.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="nodeId">The node identifier for this instance.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddAirGapSyncServices(
        this IServiceCollection services,
        string nodeId)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(nodeId);

        // Core services
        services.TryAddSingleton<IConflictResolver, ConflictResolver>();
        services.TryAddSingleton<IHlcMergeService, HlcMergeService>();
        services.TryAddSingleton<IAirGapBundleImporter, AirGapBundleImporter>();

        // Register in-memory HLC state store for offline operation
        services.TryAddSingleton<IHlcStateStore, InMemoryHlcStateStore>();

        // Register HLC clock with node ID
        services.TryAddSingleton<IHybridLogicalClock>(sp =>
        {
            var timeProvider = sp.GetService<TimeProvider>() ?? TimeProvider.System;
            var stateStore = sp.GetRequiredService<IHlcStateStore>();
            return new HybridLogicalClock.HybridLogicalClock(timeProvider, nodeId, stateStore);
        });

        // Register deterministic GUID provider
        services.TryAddSingleton<IGuidProvider>(SystemGuidProvider.Instance);

        // File-based store (can be overridden)
        services.TryAddSingleton<IOfflineJobLogStore, FileBasedOfflineJobLogStore>();

        // Offline HLC manager
        services.TryAddSingleton<IOfflineHlcManager, OfflineHlcManager>();

        // Bundle exporter
        services.TryAddSingleton<IAirGapBundleExporter, AirGapBundleExporter>();

        return services;
    }

    /// <summary>
    /// Adds air-gap sync services with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="nodeId">The node identifier for this instance.</param>
    /// <param name="configureOptions">Action to configure file-based store options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddAirGapSyncServices(
        this IServiceCollection services,
        string nodeId,
        Action<FileBasedOfflineJobLogStoreOptions> configureOptions)
    {
        // Configure file-based store options
        services.Configure(configureOptions);

        return services.AddAirGapSyncServices(nodeId);
    }

    /// <summary>
    /// Adds the air-gap sync service for importing bundles to the central scheduler.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    /// <remarks>
    /// This requires ISyncSchedulerLogRepository to be registered separately,
    /// as it depends on the Scheduler.Persistence module.
    /// </remarks>
    public static IServiceCollection AddAirGapSyncImportService(this IServiceCollection services)
    {
        services.TryAddScoped<IAirGapSyncService, AirGapSyncService>();
        return services;
    }

    /// <summary>
    /// Adds file-based transport for job sync bundles.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddFileBasedJobSyncTransport(this IServiceCollection services)
    {
        services.TryAddSingleton<IJobSyncTransport, FileBasedJobSyncTransport>();
        return services;
    }

    /// <summary>
    /// Adds file-based transport for job sync bundles with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure transport options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddFileBasedJobSyncTransport(
        this IServiceCollection services,
        Action<FileBasedJobSyncTransportOptions> configureOptions)
    {
        services.Configure(configureOptions);
        return services.AddFileBasedJobSyncTransport();
    }

    /// <summary>
    /// Adds Router-based transport for job sync bundles.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    /// <remarks>
    /// Requires IRouterJobSyncClient to be registered separately.
    /// </remarks>
    public static IServiceCollection AddRouterJobSyncTransport(this IServiceCollection services)
    {
        services.TryAddSingleton<IJobSyncTransport, RouterJobSyncTransport>();
        return services;
    }

    /// <summary>
    /// Adds Router-based transport for job sync bundles with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure transport options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddRouterJobSyncTransport(
        this IServiceCollection services,
        Action<RouterJobSyncTransportOptions> configureOptions)
    {
        services.Configure(configureOptions);
        return services.AddRouterJobSyncTransport();
    }
}
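
For reference, a minimal sketch of how these registrations compose (hypothetical host code, not part of this commit; the node ID is illustrative, and the logging call is added so the resolved services' ILogger<T> dependencies are satisfied):

using Microsoft.Extensions.DependencyInjection;
using StellaOps.AirGap.Sync;
using StellaOps.AirGap.Sync.Services;

var services = new ServiceCollection();
services.AddLogging();

// Offline node: HLC clock, offline job log store, and exporter, keyed by a stable node ID.
services.AddAirGapSyncServices("edge-node-01");

// File-based bundle transport; an overload accepting FileBasedJobSyncTransportOptions
// exists, but its option properties are not shown in this diff.
services.AddFileBasedJobSyncTransport();

// Connected side only; additionally needs an ISyncSchedulerLogRepository
// registration from the Scheduler.Persistence module.
services.AddAirGapSyncImportService();

using var provider = services.BuildServiceProvider();
var exporter = provider.GetRequiredService<IAirGapBundleExporter>();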
@@ -0,0 +1,51 @@
// <copyright file="AirGapBundle.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

namespace StellaOps.AirGap.Sync.Models;

/// <summary>
/// Represents an air-gap bundle containing job logs from one or more offline nodes.
/// </summary>
public sealed record AirGapBundle
{
    /// <summary>
    /// Gets the unique bundle identifier.
    /// </summary>
    public required Guid BundleId { get; init; }

    /// <summary>
    /// Gets the tenant ID for this bundle.
    /// </summary>
    public required string TenantId { get; init; }

    /// <summary>
    /// Gets when the bundle was created.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Gets the node ID that created this bundle.
    /// </summary>
    public required string CreatedByNodeId { get; init; }

    /// <summary>
    /// Gets the job logs from each offline node.
    /// </summary>
    public required IReadOnlyList<NodeJobLog> JobLogs { get; init; }

    /// <summary>
    /// Gets the bundle manifest digest for integrity verification.
    /// </summary>
    public required string ManifestDigest { get; init; }

    /// <summary>
    /// Gets the optional DSSE signature over the manifest.
    /// </summary>
    public string? Signature { get; init; }

    /// <summary>
    /// Gets the key ID used for signing (if signed).
    /// </summary>
    public string? SignedBy { get; init; }
}
@@ -0,0 +1,68 @@
// <copyright file="ConflictResolution.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

namespace StellaOps.AirGap.Sync.Models;

/// <summary>
/// Result of conflict resolution for a job ID.
/// </summary>
public sealed record ConflictResolution
{
    /// <summary>
    /// Gets the type of conflict detected.
    /// </summary>
    public required ConflictType Type { get; init; }

    /// <summary>
    /// Gets the resolution strategy applied.
    /// </summary>
    public required ResolutionStrategy Resolution { get; init; }

    /// <summary>
    /// Gets the selected entry (when resolution is not Error).
    /// </summary>
    public OfflineJobLogEntry? SelectedEntry { get; init; }

    /// <summary>
    /// Gets the entries that were dropped.
    /// </summary>
    public IReadOnlyList<OfflineJobLogEntry>? DroppedEntries { get; init; }

    /// <summary>
    /// Gets the error message (when resolution is Error).
    /// </summary>
    public string? Error { get; init; }
}

/// <summary>
/// Types of conflicts that can occur during merge.
/// </summary>
public enum ConflictType
{
    /// <summary>
    /// Same JobId with different HLC timestamps but identical payload.
    /// </summary>
    DuplicateTimestamp,

    /// <summary>
    /// Same JobId with different payloads - indicates a bug.
    /// </summary>
    PayloadMismatch
}

/// <summary>
/// Strategies for resolving conflicts.
/// </summary>
public enum ResolutionStrategy
{
    /// <summary>
    /// Take the entry with the earliest HLC timestamp.
    /// </summary>
    TakeEarliest,

    /// <summary>
    /// Fail the merge - conflict cannot be resolved.
    /// </summary>
    Error
}
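
The ConflictResolver registered earlier is not part of this diff, but the XML docs above pin down its semantics. A minimal sketch of those documented rules (assuming HlcTimestamp implements IComparable<HlcTimestamp>, which its sortable string form suggests; this is not the shipped implementation):

using System;
using System.Collections.Generic;
using System.Linq;
using StellaOps.AirGap.Sync.Models;

static class ConflictResolutionSketch
{
    public static ConflictResolution Resolve(IReadOnlyList<OfflineJobLogEntry> entriesForOneJobId)
    {
        var first = entriesForOneJobId[0];
        var payloadsMatch = entriesForOneJobId.All(
            e => e.PayloadHash.AsSpan().SequenceEqual(first.PayloadHash));

        if (!payloadsMatch)
        {
            // Same JobId, different payloads: unresolvable, surfaced as a bug.
            return new ConflictResolution
            {
                Type = ConflictType.PayloadMismatch,
                Resolution = ResolutionStrategy.Error,
                Error = $"Job {first.JobId}: payload hash differs across nodes"
            };
        }

        // Identical payload, different HLC timestamps: keep the earliest entry.
        var ordered = entriesForOneJobId.OrderBy(e => e.THlc).ToList();
        return new ConflictResolution
        {
            Type = ConflictType.DuplicateTimestamp,
            Resolution = ResolutionStrategy.TakeEarliest,
            SelectedEntry = ordered[0],
            DroppedEntries = ordered.Skip(1).ToList()
        };
    }
}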
@@ -0,0 +1,87 @@
// <copyright file="MergeResult.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Models;

/// <summary>
/// Result of merging job logs from multiple offline nodes.
/// </summary>
public sealed record MergeResult
{
    /// <summary>
    /// Gets the merged entries in HLC total order.
    /// </summary>
    public required IReadOnlyList<MergedJobEntry> MergedEntries { get; init; }

    /// <summary>
    /// Gets duplicate entries that were dropped during merge.
    /// </summary>
    public required IReadOnlyList<DuplicateEntry> Duplicates { get; init; }

    /// <summary>
    /// Gets the merged chain head (final link after merge).
    /// </summary>
    public byte[]? MergedChainHead { get; init; }

    /// <summary>
    /// Gets the source node IDs that contributed to this merge.
    /// </summary>
    public required IReadOnlyList<string> SourceNodes { get; init; }
}

/// <summary>
/// A job entry after merge with unified chain link.
/// </summary>
public sealed class MergedJobEntry
{
    /// <summary>
    /// Gets or sets the source node ID that created this entry.
    /// </summary>
    public required string SourceNodeId { get; set; }

    /// <summary>
    /// Gets or sets the HLC timestamp.
    /// </summary>
    public required HlcTimestamp THlc { get; set; }

    /// <summary>
    /// Gets or sets the job ID.
    /// </summary>
    public required Guid JobId { get; set; }

    /// <summary>
    /// Gets or sets the partition key.
    /// </summary>
    public string? PartitionKey { get; set; }

    /// <summary>
    /// Gets or sets the serialized payload.
    /// </summary>
    public required string Payload { get; set; }

    /// <summary>
    /// Gets or sets the payload hash.
    /// </summary>
    public required byte[] PayloadHash { get; set; }

    /// <summary>
    /// Gets or sets the original chain link from the source node.
    /// </summary>
    public required byte[] OriginalLink { get; set; }

    /// <summary>
    /// Gets or sets the merged chain link (computed during merge).
    /// </summary>
    public byte[]? MergedLink { get; set; }
}

/// <summary>
/// Represents a duplicate entry dropped during merge.
/// </summary>
public sealed record DuplicateEntry(
    Guid JobId,
    string NodeId,
HlcTimestamp THlc);
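
How MergedLink and MergedChainHead get populated is the IHlcMergeService's job and is not shown in this diff; a sketch of the re-chaining step the model implies, reusing the OfflineHlcManager.ComputeLink helper that the importer below calls for verification (the ordering is assumed to already be the HLC total order):

using System.Collections.Generic;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;

static class RechainSketch
{
    public static byte[]? Rechain(IReadOnlyList<MergedJobEntry> mergedInHlcOrder)
    {
        byte[]? previous = null;
        foreach (var entry in mergedInHlcOrder)
        {
            // Each merged entry is chained onto its predecessor to form one unified log.
            entry.MergedLink = OfflineHlcManager.ComputeLink(
                previous, entry.JobId, entry.THlc, entry.PayloadHash);
            previous = entry.MergedLink;
        }

        // Corresponds to MergeResult.MergedChainHead: the final link after merge.
        return previous;
    }
}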
@@ -0,0 +1,33 @@
// <copyright file="NodeJobLog.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Models;

/// <summary>
/// Represents the job log from a single offline node.
/// </summary>
public sealed record NodeJobLog
{
    /// <summary>
    /// Gets the node identifier.
    /// </summary>
    public required string NodeId { get; init; }

    /// <summary>
    /// Gets the last HLC timestamp in this log.
    /// </summary>
    public required HlcTimestamp LastHlc { get; init; }

    /// <summary>
    /// Gets the chain head (last link) in this log.
    /// </summary>
    public required byte[] ChainHead { get; init; }

    /// <summary>
    /// Gets the job log entries in HLC order.
    /// </summary>
    public required IReadOnlyList<OfflineJobLogEntry> Entries { get; init; }
}
@@ -0,0 +1,58 @@
// <copyright file="OfflineJobLogEntry.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Models;

/// <summary>
/// Represents a job log entry created while operating offline.
/// </summary>
public sealed record OfflineJobLogEntry
{
    /// <summary>
    /// Gets the node ID that created this entry.
    /// </summary>
    public required string NodeId { get; init; }

    /// <summary>
    /// Gets the HLC timestamp when the job was enqueued.
    /// </summary>
    public required HlcTimestamp THlc { get; init; }

    /// <summary>
    /// Gets the deterministic job ID.
    /// </summary>
    public required Guid JobId { get; init; }

    /// <summary>
    /// Gets the partition key (if any).
    /// </summary>
    public string? PartitionKey { get; init; }

    /// <summary>
    /// Gets the serialized job payload.
    /// </summary>
    public required string Payload { get; init; }

    /// <summary>
    /// Gets the SHA-256 hash of the canonical payload.
    /// </summary>
    public required byte[] PayloadHash { get; init; }

    /// <summary>
    /// Gets the previous chain link (null for first entry).
    /// </summary>
    public byte[]? PrevLink { get; init; }

    /// <summary>
    /// Gets the chain link: Hash(prev_link || job_id || t_hlc || payload_hash).
    /// </summary>
    public required byte[] Link { get; init; }

    /// <summary>
    /// Gets the wall-clock time when the entry was created (informational only).
    /// </summary>
    public DateTimeOffset EnqueuedAt { get; init; }
}
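
The exact byte layout behind that Link formula lives in OfflineHlcManager.ComputeLink, which this diff only calls; a minimal sketch of the documented shape, with the GUID and timestamp encodings being assumptions rather than the canonical ones:

using System;
using System.Security.Cryptography;
using System.Text;
using StellaOps.HybridLogicalClock;

static class ChainLinkSketch
{
    // Hash(prev_link || job_id || t_hlc || payload_hash) over SHA-256.
    public static byte[] ComputeLink(byte[]? prevLink, Guid jobId, HlcTimestamp tHlc, byte[] payloadHash)
    {
        using var hash = IncrementalHash.CreateHash(HashAlgorithmName.SHA256);
        if (prevLink is not null)
        {
            hash.AppendData(prevLink);
        }

        hash.AppendData(jobId.ToByteArray());
        // Assumed encoding: the HLC's sortable string form as UTF-8.
        hash.AppendData(Encoding.UTF8.GetBytes(tHlc.ToSortableString()));
        hash.AppendData(payloadHash);
        return hash.GetHashAndReset();
    }
}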
@@ -0,0 +1,72 @@
// <copyright file="SyncResult.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

namespace StellaOps.AirGap.Sync.Models;

/// <summary>
/// Result of syncing an air-gap bundle to the central scheduler.
/// </summary>
public sealed record SyncResult
{
    /// <summary>
    /// Gets the bundle ID that was synced.
    /// </summary>
    public required Guid BundleId { get; init; }

    /// <summary>
    /// Gets the total number of entries in the bundle.
    /// </summary>
    public required int TotalInBundle { get; init; }

    /// <summary>
    /// Gets the number of entries appended to the scheduler log.
    /// </summary>
    public required int Appended { get; init; }

    /// <summary>
    /// Gets the number of duplicate entries skipped.
    /// </summary>
    public required int Duplicates { get; init; }

    /// <summary>
    /// Gets the number of entries that already existed (idempotency).
    /// </summary>
    public int AlreadyExisted { get; init; }

    /// <summary>
    /// Gets the new chain head after sync.
    /// </summary>
    public byte[]? NewChainHead { get; init; }

    /// <summary>
    /// Gets any warnings generated during sync.
    /// </summary>
    public IReadOnlyList<string>? Warnings { get; init; }
}

/// <summary>
/// Result of an offline enqueue operation.
/// </summary>
public sealed record OfflineEnqueueResult
{
    /// <summary>
    /// Gets the HLC timestamp assigned.
    /// </summary>
    public required StellaOps.HybridLogicalClock.HlcTimestamp THlc { get; init; }

    /// <summary>
    /// Gets the deterministic job ID.
    /// </summary>
    public required Guid JobId { get; init; }

    /// <summary>
    /// Gets the chain link computed.
    /// </summary>
    public required byte[] Link { get; init; }

    /// <summary>
    /// Gets the node ID that created this entry.
    /// </summary>
    public required string NodeId { get; init; }
}
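
A small illustrative consumer of those counts (the accounting identity TotalInBundle == Appended + Duplicates + AlreadyExisted is an assumption; the diff does not state it):

using System;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;

static class SyncReportSketch
{
    public static void Report(SyncResult result, ILogger logger)
    {
        logger.LogInformation(
            "Bundle {BundleId}: {Appended} appended, {Duplicates} duplicate, {AlreadyExisted} pre-existing of {Total} entries",
            result.BundleId, result.Appended, result.Duplicates, result.AlreadyExisted, result.TotalInBundle);

        foreach (var warning in result.Warnings ?? Array.Empty<string>())
        {
            logger.LogWarning("Sync warning: {Warning}", warning);
        }
    }
}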
@@ -0,0 +1,270 @@
// <copyright file="AirGapBundleExporter.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Stores;
using StellaOps.Canonical.Json;
using StellaOps.Determinism;

namespace StellaOps.AirGap.Sync.Services;

/// <summary>
/// Interface for air-gap bundle export operations.
/// </summary>
public interface IAirGapBundleExporter
{
    /// <summary>
    /// Exports an air-gap bundle containing offline job logs.
    /// </summary>
    /// <param name="tenantId">The tenant ID.</param>
    /// <param name="nodeIds">The node IDs to include (null for current node only).</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The exported bundle.</returns>
    Task<AirGapBundle> ExportAsync(
        string tenantId,
        IReadOnlyList<string>? nodeIds = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Exports an air-gap bundle to a file.
    /// </summary>
    /// <param name="bundle">The bundle to export.</param>
    /// <param name="outputPath">The output file path.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task ExportToFileAsync(
        AirGapBundle bundle,
        string outputPath,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Exports an air-gap bundle to a JSON string.
    /// </summary>
    /// <param name="bundle">The bundle to export.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The JSON string representation.</returns>
    Task<string> ExportToStringAsync(
        AirGapBundle bundle,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Service for exporting air-gap bundles.
/// </summary>
public sealed class AirGapBundleExporter : IAirGapBundleExporter
{
    private readonly IOfflineJobLogStore _jobLogStore;
    private readonly IOfflineHlcManager _hlcManager;
    private readonly IGuidProvider _guidProvider;
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<AirGapBundleExporter> _logger;

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        WriteIndented = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    /// <summary>
    /// Initializes a new instance of the <see cref="AirGapBundleExporter"/> class.
    /// </summary>
    public AirGapBundleExporter(
        IOfflineJobLogStore jobLogStore,
        IOfflineHlcManager hlcManager,
        IGuidProvider guidProvider,
        TimeProvider timeProvider,
        ILogger<AirGapBundleExporter> logger)
    {
        _jobLogStore = jobLogStore ?? throw new ArgumentNullException(nameof(jobLogStore));
        _hlcManager = hlcManager ?? throw new ArgumentNullException(nameof(hlcManager));
        _guidProvider = guidProvider ?? throw new ArgumentNullException(nameof(guidProvider));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public async Task<AirGapBundle> ExportAsync(
        string tenantId,
        IReadOnlyList<string>? nodeIds = null,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);

        var effectiveNodeIds = nodeIds ?? new[] { _hlcManager.NodeId };

        _logger.LogInformation(
            "Exporting air-gap bundle for tenant {TenantId} with {NodeCount} nodes",
            tenantId, effectiveNodeIds.Count);

        var jobLogs = new List<NodeJobLog>();

        foreach (var nodeId in effectiveNodeIds)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var nodeLog = await _jobLogStore.GetNodeJobLogAsync(nodeId, cancellationToken)
                .ConfigureAwait(false);

            if (nodeLog is not null && nodeLog.Entries.Count > 0)
            {
                jobLogs.Add(nodeLog);
                _logger.LogDebug(
                    "Added node {NodeId} with {EntryCount} entries to bundle",
                    nodeId, nodeLog.Entries.Count);
            }
        }

        if (jobLogs.Count == 0)
        {
            _logger.LogWarning("No offline job logs found for export");
        }

        var bundle = new AirGapBundle
        {
            BundleId = _guidProvider.NewGuid(),
            TenantId = tenantId,
            CreatedAt = _timeProvider.GetUtcNow(),
            CreatedByNodeId = _hlcManager.NodeId,
            JobLogs = jobLogs,
            ManifestDigest = ComputeManifestDigest(jobLogs)
        };

        _logger.LogInformation(
            "Created bundle {BundleId} with {LogCount} node logs, {TotalEntries} total entries",
            bundle.BundleId, jobLogs.Count, jobLogs.Sum(l => l.Entries.Count));

        return bundle;
    }

    /// <inheritdoc/>
    public async Task ExportToFileAsync(
        AirGapBundle bundle,
        string outputPath,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(bundle);
        ArgumentException.ThrowIfNullOrWhiteSpace(outputPath);

        var dto = ToExportDto(bundle);
        var json = JsonSerializer.Serialize(dto, JsonOptions);

        var directory = Path.GetDirectoryName(outputPath);
        if (!string.IsNullOrEmpty(directory) && !Directory.Exists(directory))
        {
            Directory.CreateDirectory(directory);
        }

        await File.WriteAllTextAsync(outputPath, json, cancellationToken).ConfigureAwait(false);

        _logger.LogInformation(
            "Exported bundle {BundleId} to {OutputPath}",
            bundle.BundleId, outputPath);
    }

    /// <inheritdoc/>
    public Task<string> ExportToStringAsync(
        AirGapBundle bundle,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(bundle);
        cancellationToken.ThrowIfCancellationRequested();

        var dto = ToExportDto(bundle);
        var json = JsonSerializer.Serialize(dto, JsonOptions);

        _logger.LogDebug(
            "Exported bundle {BundleId} to string ({Length} chars)",
            bundle.BundleId, json.Length);

        return Task.FromResult(json);
    }

    private static string ComputeManifestDigest(IReadOnlyList<NodeJobLog> jobLogs)
    {
        // Create manifest of all chain heads for integrity
        var manifest = jobLogs
            .OrderBy(l => l.NodeId, StringComparer.Ordinal)
            .Select(l => new
            {
                l.NodeId,
                LastHlc = l.LastHlc.ToSortableString(),
                ChainHead = Convert.ToHexString(l.ChainHead)
            })
            .ToList();

        var json = CanonJson.Serialize(manifest);
        var hash = SHA256.HashData(Encoding.UTF8.GetBytes(json));
        return "sha256:" + Convert.ToHexString(hash).ToLowerInvariant();
    }

    private static AirGapBundleExportDto ToExportDto(AirGapBundle bundle) => new()
    {
        BundleId = bundle.BundleId,
        TenantId = bundle.TenantId,
        CreatedAt = bundle.CreatedAt,
        CreatedByNodeId = bundle.CreatedByNodeId,
        ManifestDigest = bundle.ManifestDigest,
        Signature = bundle.Signature,
        SignedBy = bundle.SignedBy,
        JobLogs = bundle.JobLogs.Select(ToNodeJobLogDto).ToList()
    };

    private static NodeJobLogExportDto ToNodeJobLogDto(NodeJobLog log) => new()
    {
        NodeId = log.NodeId,
        LastHlc = log.LastHlc.ToSortableString(),
        ChainHead = Convert.ToBase64String(log.ChainHead),
        Entries = log.Entries.Select(ToEntryDto).ToList()
    };

    private static OfflineJobLogEntryExportDto ToEntryDto(OfflineJobLogEntry entry) => new()
    {
        NodeId = entry.NodeId,
        THlc = entry.THlc.ToSortableString(),
        JobId = entry.JobId,
        PartitionKey = entry.PartitionKey,
        Payload = entry.Payload,
        PayloadHash = Convert.ToBase64String(entry.PayloadHash),
        PrevLink = entry.PrevLink is not null ? Convert.ToBase64String(entry.PrevLink) : null,
        Link = Convert.ToBase64String(entry.Link),
        EnqueuedAt = entry.EnqueuedAt
    };

    // Export DTOs
    private sealed record AirGapBundleExportDto
    {
        public required Guid BundleId { get; init; }
        public required string TenantId { get; init; }
        public required DateTimeOffset CreatedAt { get; init; }
        public required string CreatedByNodeId { get; init; }
        public required string ManifestDigest { get; init; }
        public string? Signature { get; init; }
        public string? SignedBy { get; init; }
        public required IReadOnlyList<NodeJobLogExportDto> JobLogs { get; init; }
    }

    private sealed record NodeJobLogExportDto
    {
        public required string NodeId { get; init; }
        public required string LastHlc { get; init; }
        public required string ChainHead { get; init; }
        public required IReadOnlyList<OfflineJobLogEntryExportDto> Entries { get; init; }
    }

    private sealed record OfflineJobLogEntryExportDto
    {
        public required string NodeId { get; init; }
        public required string THlc { get; init; }
        public required Guid JobId { get; init; }
        public string? PartitionKey { get; init; }
        public required string Payload { get; init; }
        public required string PayloadHash { get; init; }
        public string? PrevLink { get; init; }
        public required string Link { get; init; }
        public DateTimeOffset EnqueuedAt { get; init; }
    }
}
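
Putting the exporter to work on the offline side might look like this (hypothetical caller; `provider` is the IServiceProvider from the registration sketch earlier, and the tenant ID and output path are illustrative):

var exporter = provider.GetRequiredService<IAirGapBundleExporter>();

// Export the current node's offline job log and write it to removable media.
var bundle = await exporter.ExportAsync("tenant-a");
await exporter.ExportToFileAsync(bundle, "/mnt/exchange/outbox/jobs-bundle.json");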
@@ -0,0 +1,316 @@
// <copyright file="AirGapBundleImporter.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.Canonical.Json;
using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Services;

/// <summary>
/// Interface for air-gap bundle import operations.
/// </summary>
public interface IAirGapBundleImporter
{
    /// <summary>
    /// Imports an air-gap bundle from a file.
    /// </summary>
    /// <param name="inputPath">The input file path.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The imported bundle.</returns>
    Task<AirGapBundle> ImportFromFileAsync(
        string inputPath,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Validates a bundle's integrity.
    /// </summary>
    /// <param name="bundle">The bundle to validate.</param>
    /// <returns>Validation result with any issues found.</returns>
    BundleValidationResult Validate(AirGapBundle bundle);

    /// <summary>
    /// Imports an air-gap bundle from a JSON string.
    /// </summary>
    /// <param name="json">The JSON string representation.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The imported bundle.</returns>
    Task<AirGapBundle> ImportFromStringAsync(
        string json,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Result of bundle validation.
/// </summary>
public sealed record BundleValidationResult
{
    /// <summary>
    /// Gets whether the bundle is valid.
    /// </summary>
    public required bool IsValid { get; init; }

    /// <summary>
    /// Gets validation issues found.
    /// </summary>
    public required IReadOnlyList<string> Issues { get; init; }
}

/// <summary>
/// Service for importing air-gap bundles.
/// </summary>
public sealed class AirGapBundleImporter : IAirGapBundleImporter
{
    private readonly ILogger<AirGapBundleImporter> _logger;

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        PropertyNameCaseInsensitive = true
    };

    /// <summary>
    /// Initializes a new instance of the <see cref="AirGapBundleImporter"/> class.
    /// </summary>
    public AirGapBundleImporter(ILogger<AirGapBundleImporter> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public async Task<AirGapBundle> ImportFromFileAsync(
        string inputPath,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(inputPath);

        if (!File.Exists(inputPath))
        {
            throw new FileNotFoundException($"Bundle file not found: {inputPath}", inputPath);
        }

        _logger.LogInformation("Importing air-gap bundle from {InputPath}", inputPath);

        var json = await File.ReadAllTextAsync(inputPath, cancellationToken).ConfigureAwait(false);
        var dto = JsonSerializer.Deserialize<AirGapBundleImportDto>(json, JsonOptions);

        if (dto is null)
        {
            throw new InvalidOperationException("Failed to deserialize bundle file");
        }

        var bundle = FromImportDto(dto);

        _logger.LogInformation(
            "Imported bundle {BundleId} from {InputPath}: {LogCount} node logs, {TotalEntries} total entries",
            bundle.BundleId, inputPath, bundle.JobLogs.Count, bundle.JobLogs.Sum(l => l.Entries.Count));

        return bundle;
    }

    /// <inheritdoc/>
    public Task<AirGapBundle> ImportFromStringAsync(
        string json,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(json);
        cancellationToken.ThrowIfCancellationRequested();

        _logger.LogDebug("Importing air-gap bundle from string ({Length} chars)", json.Length);

        var dto = JsonSerializer.Deserialize<AirGapBundleImportDto>(json, JsonOptions);

        if (dto is null)
        {
            throw new InvalidOperationException("Failed to deserialize bundle JSON");
        }

        var bundle = FromImportDto(dto);

        _logger.LogInformation(
            "Imported bundle {BundleId} from string: {LogCount} node logs, {TotalEntries} total entries",
            bundle.BundleId, bundle.JobLogs.Count, bundle.JobLogs.Sum(l => l.Entries.Count));

        return Task.FromResult(bundle);
    }

    /// <inheritdoc/>
    public BundleValidationResult Validate(AirGapBundle bundle)
    {
        ArgumentNullException.ThrowIfNull(bundle);

        var issues = new List<string>();

        // 1. Validate manifest digest
        var computedDigest = ComputeManifestDigest(bundle.JobLogs);
        if (!string.Equals(computedDigest, bundle.ManifestDigest, StringComparison.Ordinal))
        {
            issues.Add($"Manifest digest mismatch: expected {bundle.ManifestDigest}, computed {computedDigest}");
        }

        // 2. Validate each node log's chain integrity
        foreach (var nodeLog in bundle.JobLogs)
        {
            var nodeIssues = ValidateNodeLog(nodeLog);
            issues.AddRange(nodeIssues);
        }

        // 3. Validate chain heads match last entry links
        foreach (var nodeLog in bundle.JobLogs)
        {
            if (nodeLog.Entries.Count > 0)
            {
                var lastEntry = nodeLog.Entries[^1];
                if (!ByteArrayEquals(nodeLog.ChainHead, lastEntry.Link))
                {
                    issues.Add($"Node {nodeLog.NodeId}: chain head doesn't match last entry link");
                }
            }
        }

        var isValid = issues.Count == 0;

        if (!isValid)
        {
            _logger.LogWarning(
                "Bundle {BundleId} validation failed with {IssueCount} issues",
                bundle.BundleId, issues.Count);
        }
        else
        {
            _logger.LogDebug("Bundle {BundleId} validation passed", bundle.BundleId);
        }

        return new BundleValidationResult
        {
            IsValid = isValid,
            Issues = issues
        };
    }

    private static IEnumerable<string> ValidateNodeLog(NodeJobLog nodeLog)
    {
        byte[]? expectedPrevLink = null;

        for (var i = 0; i < nodeLog.Entries.Count; i++)
        {
            var entry = nodeLog.Entries[i];

            // Verify prev_link matches expected
            if (!ByteArrayEquals(entry.PrevLink, expectedPrevLink))
            {
                yield return $"Node {nodeLog.NodeId}, entry {i}: prev_link mismatch";
            }

            // Recompute and verify link
            var computedLink = OfflineHlcManager.ComputeLink(
                entry.PrevLink,
                entry.JobId,
                entry.THlc,
                entry.PayloadHash);

            if (!ByteArrayEquals(entry.Link, computedLink))
            {
                yield return $"Node {nodeLog.NodeId}, entry {i} (JobId {entry.JobId}): link mismatch";
            }

            expectedPrevLink = entry.Link;
        }
    }

    private static string ComputeManifestDigest(IReadOnlyList<NodeJobLog> jobLogs)
    {
        var manifest = jobLogs
            .OrderBy(l => l.NodeId, StringComparer.Ordinal)
            .Select(l => new
            {
                l.NodeId,
                LastHlc = l.LastHlc.ToSortableString(),
                ChainHead = Convert.ToHexString(l.ChainHead)
            })
            .ToList();

        var json = CanonJson.Serialize(manifest);
        var hash = SHA256.HashData(Encoding.UTF8.GetBytes(json));
        return "sha256:" + Convert.ToHexString(hash).ToLowerInvariant();
    }

    private static bool ByteArrayEquals(byte[]? a, byte[]? b)
    {
        if (a is null && b is null) return true;
        if (a is null || b is null) return false;
        return a.AsSpan().SequenceEqual(b);
    }

    private static AirGapBundle FromImportDto(AirGapBundleImportDto dto) => new()
    {
        BundleId = dto.BundleId,
        TenantId = dto.TenantId,
        CreatedAt = dto.CreatedAt,
        CreatedByNodeId = dto.CreatedByNodeId,
        ManifestDigest = dto.ManifestDigest,
        Signature = dto.Signature,
        SignedBy = dto.SignedBy,
        JobLogs = dto.JobLogs.Select(FromNodeJobLogDto).ToList()
    };

    private static NodeJobLog FromNodeJobLogDto(NodeJobLogImportDto dto) => new()
    {
        NodeId = dto.NodeId,
        LastHlc = HlcTimestamp.Parse(dto.LastHlc),
        ChainHead = Convert.FromBase64String(dto.ChainHead),
        Entries = dto.Entries.Select(FromEntryDto).ToList()
    };

    private static OfflineJobLogEntry FromEntryDto(OfflineJobLogEntryImportDto dto) => new()
    {
        NodeId = dto.NodeId,
        THlc = HlcTimestamp.Parse(dto.THlc),
        JobId = dto.JobId,
        PartitionKey = dto.PartitionKey,
        Payload = dto.Payload,
        PayloadHash = Convert.FromBase64String(dto.PayloadHash),
        PrevLink = dto.PrevLink is not null ? Convert.FromBase64String(dto.PrevLink) : null,
        Link = Convert.FromBase64String(dto.Link),
        EnqueuedAt = dto.EnqueuedAt
    };

    // Import DTOs
    private sealed record AirGapBundleImportDto
    {
        public required Guid BundleId { get; init; }
        public required string TenantId { get; init; }
        public required DateTimeOffset CreatedAt { get; init; }
        public required string CreatedByNodeId { get; init; }
        public required string ManifestDigest { get; init; }
        public string? Signature { get; init; }
        public string? SignedBy { get; init; }
        public required IReadOnlyList<NodeJobLogImportDto> JobLogs { get; init; }
    }

    private sealed record NodeJobLogImportDto
    {
        public required string NodeId { get; init; }
        public required string LastHlc { get; init; }
        public required string ChainHead { get; init; }
        public required IReadOnlyList<OfflineJobLogEntryImportDto> Entries { get; init; }
    }

    private sealed record OfflineJobLogEntryImportDto
    {
        public required string NodeId { get; init; }
        public required string THlc { get; init; }
        public required Guid JobId { get; init; }
        public string? PartitionKey { get; init; }
        public required string Payload { get; init; }
        public required string PayloadHash { get; init; }
        public string? PrevLink { get; init; }
        public required string Link { get; init; }
        public DateTimeOffset EnqueuedAt { get; init; }
    }
}
|
||||
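// Illustrative sketch (not part of the diff): the invariant ValidateNodeLog
// enforces. Links form a hash chain, so recomputing each entry's link from its
// predecessor must reproduce the stored value; tampering with any entry breaks
// every later link. `entries` is a hypothetical in-HLC-order list from a NodeJobLog.
byte[]? prev = null;
foreach (var e in entries)
{
    // ComputeLink hashes prev_link || job_id || t_hlc || payload_hash.
    var recomputed = OfflineHlcManager.ComputeLink(prev, e.JobId, e.THlc, e.PayloadHash);
    if (!recomputed.AsSpan().SequenceEqual(e.Link))
    {
        throw new InvalidOperationException($"Chain broken at job {e.JobId}");
    }
    prev = e.Link;
}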
@@ -0,0 +1,198 @@
// <copyright file="AirGapSyncService.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Services;

/// <summary>
/// Interface for the scheduler log repository used by sync.
/// </summary>
/// <remarks>
/// This is a subset of the full ISchedulerLogRepository to avoid circular dependencies.
/// Implementations should delegate to the actual repository.
/// </remarks>
public interface ISyncSchedulerLogRepository
{
    /// <summary>
    /// Gets the chain head for a tenant/partition.
    /// </summary>
    Task<(byte[]? Link, string? THlc)> GetChainHeadAsync(
        string tenantId,
        string? partitionKey = null,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Checks whether an entry with the given job ID exists.
    /// </summary>
    Task<bool> ExistsByJobIdAsync(
        string tenantId,
        Guid jobId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Inserts a synced entry.
    /// </summary>
    Task InsertSyncedEntryAsync(
        string tenantId,
        string tHlc,
        string? partitionKey,
        Guid jobId,
        byte[] payloadHash,
        byte[]? prevLink,
        byte[] link,
        string sourceNodeId,
        Guid syncedFromBundle,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Interface for air-gap sync operations.
/// </summary>
public interface IAirGapSyncService
{
    /// <summary>
    /// Syncs offline jobs from an air-gap bundle to the central scheduler.
    /// </summary>
    /// <param name="bundle">The bundle to sync.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The sync result.</returns>
    Task<SyncResult> SyncFromBundleAsync(
        AirGapBundle bundle,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Service for syncing air-gap bundles to the central scheduler.
/// </summary>
public sealed class AirGapSyncService : IAirGapSyncService
{
    private readonly IHlcMergeService _mergeService;
    private readonly ISyncSchedulerLogRepository _schedulerLogRepo;
    private readonly IHybridLogicalClock _hlc;
    private readonly ILogger<AirGapSyncService> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="AirGapSyncService"/> class.
    /// </summary>
    public AirGapSyncService(
        IHlcMergeService mergeService,
        ISyncSchedulerLogRepository schedulerLogRepo,
        IHybridLogicalClock hlc,
        ILogger<AirGapSyncService> logger)
    {
        _mergeService = mergeService ?? throw new ArgumentNullException(nameof(mergeService));
        _schedulerLogRepo = schedulerLogRepo ?? throw new ArgumentNullException(nameof(schedulerLogRepo));
        _hlc = hlc ?? throw new ArgumentNullException(nameof(hlc));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public async Task<SyncResult> SyncFromBundleAsync(
        AirGapBundle bundle,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(bundle);

        _logger.LogInformation(
            "Starting sync from bundle {BundleId} with {LogCount} node logs for tenant {TenantId}",
            bundle.BundleId, bundle.JobLogs.Count, bundle.TenantId);

        // 1. Merge all offline logs
        var merged = await _mergeService.MergeAsync(bundle.JobLogs, cancellationToken)
            .ConfigureAwait(false);

        if (merged.MergedEntries.Count == 0)
        {
            _logger.LogInformation("Bundle {BundleId} has no entries to sync", bundle.BundleId);
            return new SyncResult
            {
                BundleId = bundle.BundleId,
                TotalInBundle = 0,
                Appended = 0,
                Duplicates = 0,
                AlreadyExisted = 0
            };
        }

        // 2. Get current scheduler chain head
        var (currentLink, _) = await _schedulerLogRepo.GetChainHeadAsync(
            bundle.TenantId,
            cancellationToken: cancellationToken).ConfigureAwait(false);

        // 3. For each merged entry, update HLC clock (receive)
        // This ensures central clock advances past all offline timestamps
        foreach (var entry in merged.MergedEntries)
        {
            _hlc.Receive(entry.THlc);
        }

        // 4. Append merged entries to scheduler log
        // Chain links recomputed to extend from current head
        byte[]? prevLink = currentLink;
        var appended = 0;
        var alreadyExisted = 0;
        var warnings = new List<string>();

        foreach (var entry in merged.MergedEntries)
        {
            cancellationToken.ThrowIfCancellationRequested();

            // Check if job already exists (idempotency)
            var exists = await _schedulerLogRepo.ExistsByJobIdAsync(
                bundle.TenantId,
                entry.JobId,
                cancellationToken).ConfigureAwait(false);

            if (exists)
            {
                _logger.LogDebug(
                    "Job {JobId} already exists in scheduler log, skipping",
                    entry.JobId);
                alreadyExisted++;
                continue;
            }

            // Compute new chain link extending from current chain
            var newLink = OfflineHlcManager.ComputeLink(
                prevLink,
                entry.JobId,
                entry.THlc,
                entry.PayloadHash);

            // Insert the entry
            await _schedulerLogRepo.InsertSyncedEntryAsync(
                bundle.TenantId,
                entry.THlc.ToSortableString(),
                entry.PartitionKey,
                entry.JobId,
                entry.PayloadHash,
                prevLink,
                newLink,
                entry.SourceNodeId,
                bundle.BundleId,
                cancellationToken).ConfigureAwait(false);

            prevLink = newLink;
            appended++;
        }

        _logger.LogInformation(
            "Sync complete for bundle {BundleId}: {Appended} appended, {Duplicates} duplicates, {AlreadyExisted} already existed",
            bundle.BundleId, appended, merged.Duplicates.Count, alreadyExisted);

        return new SyncResult
        {
            BundleId = bundle.BundleId,
            TotalInBundle = merged.MergedEntries.Count,
            Appended = appended,
            Duplicates = merged.Duplicates.Count,
            AlreadyExisted = alreadyExisted,
            NewChainHead = prevLink,
            Warnings = warnings.Count > 0 ? warnings : null
        };
    }
}
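// Illustrative usage sketch (not part of the diff): importing a bundle, running
// the validation shown earlier, then syncing it into the central scheduler.
// `importer`, `validator`, and `syncService` are assumed DI-resolved instances;
// the file path is hypothetical.
var bundle = await importer.ImportFromFileAsync("/mnt/usb/inbox/job-sync-bundle.json", ct);

var validation = validator.Validate(bundle);
if (!validation.IsValid)
{
    throw new InvalidOperationException(string.Join("; ", validation.Issues));
}

var result = await syncService.SyncFromBundleAsync(bundle, ct);
Console.WriteLine($"Appended {result.Appended}, skipped {result.AlreadyExisted} existing jobs");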
@@ -0,0 +1,114 @@
// <copyright file="ConflictResolver.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;

namespace StellaOps.AirGap.Sync.Services;

/// <summary>
/// Interface for conflict resolution during merge.
/// </summary>
public interface IConflictResolver
{
    /// <summary>
    /// Resolves conflicts when the same JobId appears in multiple entries.
    /// </summary>
    /// <param name="jobId">The conflicting job ID.</param>
    /// <param name="conflicting">The conflicting entries with their source nodes.</param>
    /// <returns>The resolution result.</returns>
    ConflictResolution Resolve(
        Guid jobId,
        IReadOnlyList<(string NodeId, OfflineJobLogEntry Entry)> conflicting);
}

/// <summary>
/// Resolves conflicts during HLC merge operations.
/// </summary>
public sealed class ConflictResolver : IConflictResolver
{
    private readonly ILogger<ConflictResolver> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="ConflictResolver"/> class.
    /// </summary>
    public ConflictResolver(ILogger<ConflictResolver> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public ConflictResolution Resolve(
        Guid jobId,
        IReadOnlyList<(string NodeId, OfflineJobLogEntry Entry)> conflicting)
    {
        ArgumentNullException.ThrowIfNull(conflicting);

        if (conflicting.Count == 0)
        {
            throw new ArgumentException("Conflicting list cannot be empty", nameof(conflicting));
        }

        if (conflicting.Count == 1)
        {
            // No conflict
            return new ConflictResolution
            {
                Type = ConflictType.DuplicateTimestamp,
                Resolution = ResolutionStrategy.TakeEarliest,
                SelectedEntry = conflicting[0].Entry,
                DroppedEntries = Array.Empty<OfflineJobLogEntry>()
            };
        }

        // Verify payloads are actually different
        var uniquePayloads = conflicting
            .Select(c => Convert.ToHexString(c.Entry.PayloadHash))
            .Distinct()
            .ToList();

        if (uniquePayloads.Count == 1)
        {
            // Same payload, different HLC timestamps - not a real conflict
            // Take the earliest HLC (preserves causality)
            var sorted = conflicting
                .OrderBy(c => c.Entry.THlc.PhysicalTime)
                .ThenBy(c => c.Entry.THlc.LogicalCounter)
                .ThenBy(c => c.Entry.THlc.NodeId, StringComparer.Ordinal)
                .ToList();

            var earliest = sorted[0];
            var dropped = sorted.Skip(1).Select(s => s.Entry).ToList();

            _logger.LogDebug(
                "Resolved duplicate timestamp conflict for JobId {JobId}: selected entry from node {NodeId} at {THlc}, dropped {DroppedCount} duplicates",
                jobId, earliest.NodeId, earliest.Entry.THlc, dropped.Count);

            return new ConflictResolution
            {
                Type = ConflictType.DuplicateTimestamp,
                Resolution = ResolutionStrategy.TakeEarliest,
                SelectedEntry = earliest.Entry,
                DroppedEntries = dropped
            };
        }

        // Actual conflict: same JobId, different payloads
        // This indicates a bug in deterministic ID computation
        var nodeIds = string.Join(", ", conflicting.Select(c => c.NodeId));
        var payloadHashes = string.Join(", ", conflicting.Select(c => Convert.ToHexString(c.Entry.PayloadHash)[..16] + "..."));

        _logger.LogError(
            "Payload mismatch conflict for JobId {JobId}: different payloads from nodes [{NodeIds}] with hashes [{PayloadHashes}]",
            jobId, nodeIds, payloadHashes);

        return new ConflictResolution
        {
            Type = ConflictType.PayloadMismatch,
            Resolution = ResolutionStrategy.Error,
            Error = $"JobId {jobId} has conflicting payloads from nodes: {nodeIds}. " +
                    "This indicates a bug in deterministic job ID computation or payload tampering."
        };
    }
}
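// Illustrative sketch (not part of the diff): the two resolver outcomes.
// Entries sharing a JobId and a payload hash collapse to the earliest HLC;
// differing payload hashes surface as a hard error. `resolver` is an assumed
// instance; `a` and `b` are hypothetical entries with the same deterministic JobId.
var resolution = resolver.Resolve(a.JobId, new[] { ("node-a", a), ("node-b", b) });
if (resolution.Resolution == ResolutionStrategy.Error)
{
    // Same JobId, different payloads: deterministic-ID bug or tampering.
    throw new InvalidOperationException(resolution.Error);
}

// Otherwise the winner is the earliest (PhysicalTime, LogicalCounter, NodeId) entry.
var winner = resolution.SelectedEntry;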
@@ -0,0 +1,169 @@
// <copyright file="HlcMergeService.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;

namespace StellaOps.AirGap.Sync.Services;

/// <summary>
/// Interface for HLC-based merge operations.
/// </summary>
public interface IHlcMergeService
{
    /// <summary>
    /// Merges job logs from multiple offline nodes into a unified, HLC-ordered stream.
    /// </summary>
    /// <param name="nodeLogs">The node logs to merge.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The merge result.</returns>
    Task<MergeResult> MergeAsync(
        IReadOnlyList<NodeJobLog> nodeLogs,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Service for merging job logs from multiple offline nodes using HLC total ordering.
/// </summary>
public sealed class HlcMergeService : IHlcMergeService
{
    private readonly IConflictResolver _conflictResolver;
    private readonly ILogger<HlcMergeService> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="HlcMergeService"/> class.
    /// </summary>
    public HlcMergeService(
        IConflictResolver conflictResolver,
        ILogger<HlcMergeService> logger)
    {
        _conflictResolver = conflictResolver ?? throw new ArgumentNullException(nameof(conflictResolver));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public Task<MergeResult> MergeAsync(
        IReadOnlyList<NodeJobLog> nodeLogs,
        CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(nodeLogs);
        cancellationToken.ThrowIfCancellationRequested();

        if (nodeLogs.Count == 0)
        {
            return Task.FromResult(new MergeResult
            {
                MergedEntries = Array.Empty<MergedJobEntry>(),
                Duplicates = Array.Empty<DuplicateEntry>(),
                SourceNodes = Array.Empty<string>()
            });
        }

        _logger.LogInformation(
            "Starting merge of {NodeCount} node logs with {TotalEntries} total entries",
            nodeLogs.Count,
            nodeLogs.Sum(l => l.Entries.Count));

        // 1. Collect all entries from all nodes
        var allEntries = nodeLogs
            .SelectMany(log => log.Entries.Select(e => (log.NodeId, Entry: e)))
            .ToList();

        // 2. Sort by HLC total order: (PhysicalTime, LogicalCounter, NodeId, JobId)
        var sorted = allEntries
            .OrderBy(x => x.Entry.THlc.PhysicalTime)
            .ThenBy(x => x.Entry.THlc.LogicalCounter)
            .ThenBy(x => x.Entry.THlc.NodeId, StringComparer.Ordinal)
            .ThenBy(x => x.Entry.JobId)
            .ToList();

        // 3. Group by JobId to detect duplicates
        var groupedByJobId = sorted.GroupBy(x => x.Entry.JobId).ToList();

        var deduplicated = new List<MergedJobEntry>();
        var duplicates = new List<DuplicateEntry>();

        foreach (var group in groupedByJobId)
        {
            var entries = group.ToList();

            if (entries.Count == 1)
            {
                // No conflict - add directly
                var (nodeId, entry) = entries[0];
                deduplicated.Add(CreateMergedEntry(nodeId, entry));
            }
            else
            {
                // Multiple entries with same JobId - resolve conflict
                var resolution = _conflictResolver.Resolve(group.Key, entries);

                if (resolution.Resolution == ResolutionStrategy.Error)
                {
                    _logger.LogError(
                        "Conflict resolution failed for JobId {JobId}: {Error}",
                        group.Key, resolution.Error);
                    throw new InvalidOperationException(resolution.Error);
                }

                // Add the selected entry
                if (resolution.SelectedEntry is not null)
                {
                    var sourceEntry = entries.First(e => e.Entry == resolution.SelectedEntry);
                    deduplicated.Add(CreateMergedEntry(sourceEntry.NodeId, resolution.SelectedEntry));
                }

                // Record duplicates
                foreach (var dropped in resolution.DroppedEntries ?? Array.Empty<OfflineJobLogEntry>())
                {
                    var sourceEntry = entries.First(e => e.Entry == dropped);
                    duplicates.Add(new DuplicateEntry(dropped.JobId, sourceEntry.NodeId, dropped.THlc));
                }
            }
        }

        // 4. Sort deduplicated entries by HLC order
        deduplicated = deduplicated
            .OrderBy(x => x.THlc.PhysicalTime)
            .ThenBy(x => x.THlc.LogicalCounter)
            .ThenBy(x => x.THlc.NodeId, StringComparer.Ordinal)
            .ThenBy(x => x.JobId)
            .ToList();

        // 5. Recompute unified chain
        byte[]? prevLink = null;
        foreach (var entry in deduplicated)
        {
            entry.MergedLink = OfflineHlcManager.ComputeLink(
                prevLink,
                entry.JobId,
                entry.THlc,
                entry.PayloadHash);
            prevLink = entry.MergedLink;
        }

        _logger.LogInformation(
            "Merge complete: {MergedCount} entries, {DuplicateCount} duplicates dropped",
            deduplicated.Count, duplicates.Count);

        return Task.FromResult(new MergeResult
        {
            MergedEntries = deduplicated,
            Duplicates = duplicates,
            MergedChainHead = prevLink,
            SourceNodes = nodeLogs.Select(l => l.NodeId).ToList()
        });
    }

    private static MergedJobEntry CreateMergedEntry(string nodeId, OfflineJobLogEntry entry) => new()
    {
        SourceNodeId = nodeId,
        THlc = entry.THlc,
        JobId = entry.JobId,
        PartitionKey = entry.PartitionKey,
        Payload = entry.Payload,
        PayloadHash = entry.PayloadHash,
        OriginalLink = entry.Link
    };
}
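// Illustrative sketch (not part of the diff): the HLC total order the merge
// relies on, written out as a single comparer. Ties on physical time fall
// through to the logical counter, then the node ID, then the JobId, so any two
// entries compare deterministically. Member types are assumed comparable, as
// the OrderBy chains above imply.
static int CompareHlcOrder(MergedJobEntry x, MergedJobEntry y)
{
    var c = x.THlc.PhysicalTime.CompareTo(y.THlc.PhysicalTime);
    if (c != 0) return c;

    c = x.THlc.LogicalCounter.CompareTo(y.THlc.LogicalCounter);
    if (c != 0) return c;

    c = string.CompareOrdinal(x.THlc.NodeId, y.THlc.NodeId);
    return c != 0 ? c : x.JobId.CompareTo(y.JobId);
}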
@@ -0,0 +1,172 @@
// <copyright file="OfflineHlcManager.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Security.Cryptography;
using System.Text;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Stores;
using StellaOps.Canonical.Json;
using StellaOps.Determinism;
using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Services;

/// <summary>
/// Interface for offline HLC management.
/// </summary>
public interface IOfflineHlcManager
{
    /// <summary>
    /// Enqueues a job locally while offline, maintaining the local chain.
    /// </summary>
    /// <typeparam name="T">The payload type.</typeparam>
    /// <param name="payload">The job payload.</param>
    /// <param name="idempotencyKey">The idempotency key for deterministic job ID.</param>
    /// <param name="partitionKey">Optional partition key.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The enqueue result.</returns>
    Task<OfflineEnqueueResult> EnqueueOfflineAsync<T>(
        T payload,
        string idempotencyKey,
        string? partitionKey = null,
        CancellationToken cancellationToken = default) where T : notnull;

    /// <summary>
    /// Gets the current node's job log for export.
    /// </summary>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The node job log, or null if empty.</returns>
    Task<NodeJobLog?> GetNodeJobLogAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the node ID.
    /// </summary>
    string NodeId { get; }
}

/// <summary>
/// Manages HLC operations for offline/air-gap scenarios.
/// </summary>
public sealed class OfflineHlcManager : IOfflineHlcManager
{
    private readonly IHybridLogicalClock _hlc;
    private readonly IOfflineJobLogStore _jobLogStore;
    private readonly IGuidProvider _guidProvider;
    private readonly ILogger<OfflineHlcManager> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="OfflineHlcManager"/> class.
    /// </summary>
    public OfflineHlcManager(
        IHybridLogicalClock hlc,
        IOfflineJobLogStore jobLogStore,
        IGuidProvider guidProvider,
        ILogger<OfflineHlcManager> logger)
    {
        _hlc = hlc ?? throw new ArgumentNullException(nameof(hlc));
        _jobLogStore = jobLogStore ?? throw new ArgumentNullException(nameof(jobLogStore));
        _guidProvider = guidProvider ?? throw new ArgumentNullException(nameof(guidProvider));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public string NodeId => _hlc.NodeId;

    /// <inheritdoc/>
    public async Task<OfflineEnqueueResult> EnqueueOfflineAsync<T>(
        T payload,
        string idempotencyKey,
        string? partitionKey = null,
        CancellationToken cancellationToken = default) where T : notnull
    {
        ArgumentNullException.ThrowIfNull(payload);
        ArgumentException.ThrowIfNullOrWhiteSpace(idempotencyKey);

        // 1. Generate HLC timestamp
        var tHlc = _hlc.Tick();

        // 2. Compute deterministic job ID from idempotency key
        var jobId = ComputeDeterministicJobId(idempotencyKey);

        // 3. Serialize and hash payload
        var payloadJson = CanonJson.Serialize(payload);
        var payloadHash = SHA256.HashData(Encoding.UTF8.GetBytes(payloadJson));

        // 4. Get previous chain link
        var prevLink = await _jobLogStore.GetLastLinkAsync(NodeId, cancellationToken)
            .ConfigureAwait(false);

        // 5. Compute chain link
        var link = ComputeLink(prevLink, jobId, tHlc, payloadHash);

        // 6. Create and store entry
        var entry = new OfflineJobLogEntry
        {
            NodeId = NodeId,
            THlc = tHlc,
            JobId = jobId,
            PartitionKey = partitionKey,
            Payload = payloadJson,
            PayloadHash = payloadHash,
            PrevLink = prevLink,
            Link = link,
            EnqueuedAt = DateTimeOffset.UtcNow
        };

        await _jobLogStore.AppendAsync(entry, cancellationToken).ConfigureAwait(false);

        _logger.LogInformation(
            "Enqueued offline job {JobId} with HLC {THlc} on node {NodeId}",
            jobId, tHlc, NodeId);

        return new OfflineEnqueueResult
        {
            THlc = tHlc,
            JobId = jobId,
            Link = link,
            NodeId = NodeId
        };
    }

    /// <inheritdoc/>
    public Task<NodeJobLog?> GetNodeJobLogAsync(CancellationToken cancellationToken = default)
        => _jobLogStore.GetNodeJobLogAsync(NodeId, cancellationToken);

    /// <summary>
    /// Computes deterministic job ID from idempotency key.
    /// </summary>
    private static Guid ComputeDeterministicJobId(string idempotencyKey)
    {
        var hash = SHA256.HashData(Encoding.UTF8.GetBytes(idempotencyKey));
        // Use first 16 bytes of SHA-256 as deterministic GUID
        return new Guid(hash.AsSpan(0, 16));
    }

    /// <summary>
    /// Computes chain link: Hash(prev_link || job_id || t_hlc || payload_hash).
    /// </summary>
    internal static byte[] ComputeLink(
        byte[]? prevLink,
        Guid jobId,
        HlcTimestamp tHlc,
        byte[] payloadHash)
    {
        using var hasher = IncrementalHash.CreateHash(HashAlgorithmName.SHA256);

        // Previous link (or 32 zero bytes for first entry)
        hasher.AppendData(prevLink ?? new byte[32]);

        // Job ID as bytes
        hasher.AppendData(jobId.ToByteArray());

        // HLC timestamp as UTF-8 bytes
        hasher.AppendData(Encoding.UTF8.GetBytes(tHlc.ToSortableString()));

        // Payload hash
        hasher.AppendData(payloadHash);

        return hasher.GetHashAndReset();
    }
}
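// Illustrative usage sketch (not part of the diff): enqueuing work while
// disconnected. The idempotency key drives the deterministic JobId, so replaying
// the same logical job with the same key on another node yields the same JobId
// and is collapsed during merge. `manager` is an assumed DI-resolved
// IOfflineHlcManager; the payload shape and key format are hypothetical.
var result = await manager.EnqueueOfflineAsync(
    new { Scan = "sbom", Target = "registry/image:1.2.3" },
    idempotencyKey: "scan:registry/image:1.2.3",
    partitionKey: null,
    cancellationToken: ct);

Console.WriteLine($"Job {result.JobId} chained at {result.THlc} on {result.NodeId}");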
@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Configuration.Binder" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\..\..\__Libraries\StellaOps.Canonical.Json\StellaOps.Canonical.Json.csproj" />
    <ProjectReference Include="..\..\..\__Libraries\StellaOps.Determinism.Abstractions\StellaOps.Determinism.Abstractions.csproj" />
    <ProjectReference Include="..\..\..\__Libraries\StellaOps.HybridLogicalClock\StellaOps.HybridLogicalClock.csproj" />
    <ProjectReference Include="..\..\..\Scheduler\__Libraries\StellaOps.Scheduler.Models\StellaOps.Scheduler.Models.csproj" />
  </ItemGroup>
</Project>
@@ -0,0 +1,246 @@
// <copyright file="FileBasedOfflineJobLogStore.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.AirGap.Sync.Models;
using StellaOps.Canonical.Json;
using StellaOps.HybridLogicalClock;

namespace StellaOps.AirGap.Sync.Stores;

/// <summary>
/// Options for the file-based offline job log store.
/// </summary>
public sealed class FileBasedOfflineJobLogStoreOptions
{
    /// <summary>
    /// Gets or sets the directory for storing offline job logs.
    /// </summary>
    public string DataDirectory { get; set; } = "./offline-job-logs";
}

/// <summary>
/// File-based implementation of <see cref="IOfflineJobLogStore"/> for air-gap scenarios.
/// </summary>
public sealed class FileBasedOfflineJobLogStore : IOfflineJobLogStore
{
    private readonly IOptions<FileBasedOfflineJobLogStoreOptions> _options;
    private readonly ILogger<FileBasedOfflineJobLogStore> _logger;
    private readonly SemaphoreSlim _lock = new(1, 1);

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        WriteIndented = false,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    /// <summary>
    /// Initializes a new instance of the <see cref="FileBasedOfflineJobLogStore"/> class.
    /// </summary>
    public FileBasedOfflineJobLogStore(
        IOptions<FileBasedOfflineJobLogStoreOptions> options,
        ILogger<FileBasedOfflineJobLogStore> logger)
    {
        _options = options ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));

        EnsureDirectoryExists();
    }

    /// <inheritdoc/>
    public async Task AppendAsync(OfflineJobLogEntry entry, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(entry);

        await _lock.WaitAsync(cancellationToken).ConfigureAwait(false);
        try
        {
            var filePath = GetNodeLogFilePath(entry.NodeId);
            var dto = ToDto(entry);
            var line = JsonSerializer.Serialize(dto, JsonOptions);

            await File.AppendAllTextAsync(filePath, line + Environment.NewLine, cancellationToken)
                .ConfigureAwait(false);

            _logger.LogDebug(
                "Appended offline job entry {JobId} for node {NodeId}",
                entry.JobId, entry.NodeId);
        }
        finally
        {
            _lock.Release();
        }
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<OfflineJobLogEntry>> GetEntriesAsync(
        string nodeId,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(nodeId);

        await _lock.WaitAsync(cancellationToken).ConfigureAwait(false);
        try
        {
            return await ReadEntriesUnlockedAsync(nodeId, cancellationToken).ConfigureAwait(false);
        }
        finally
        {
            _lock.Release();
        }
    }

    // Reads and parses the node log without touching _lock. The semaphore is not
    // reentrant, so callers that already hold it (ClearEntriesAsync) must read
    // through this helper instead of GetEntriesAsync, which would deadlock on a
    // second WaitAsync.
    private async Task<IReadOnlyList<OfflineJobLogEntry>> ReadEntriesUnlockedAsync(
        string nodeId,
        CancellationToken cancellationToken)
    {
        var filePath = GetNodeLogFilePath(nodeId);
        if (!File.Exists(filePath))
        {
            return Array.Empty<OfflineJobLogEntry>();
        }

        var lines = await File.ReadAllLinesAsync(filePath, cancellationToken).ConfigureAwait(false);
        var entries = new List<OfflineJobLogEntry>(lines.Length);

        foreach (var line in lines)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            var dto = JsonSerializer.Deserialize<OfflineJobLogEntryDto>(line, JsonOptions);
            if (dto is not null)
            {
                entries.Add(FromDto(dto));
            }
        }

        // Return in HLC order
        return entries.OrderBy(e => e.THlc).ToList();
    }

    /// <inheritdoc/>
    public async Task<byte[]?> GetLastLinkAsync(string nodeId, CancellationToken cancellationToken = default)
    {
        var entries = await GetEntriesAsync(nodeId, cancellationToken).ConfigureAwait(false);
        return entries.Count > 0 ? entries[^1].Link : null;
    }

    /// <inheritdoc/>
    public async Task<NodeJobLog?> GetNodeJobLogAsync(string nodeId, CancellationToken cancellationToken = default)
    {
        var entries = await GetEntriesAsync(nodeId, cancellationToken).ConfigureAwait(false);
        if (entries.Count == 0)
        {
            return null;
        }

        var lastEntry = entries[^1];
        return new NodeJobLog
        {
            NodeId = nodeId,
            LastHlc = lastEntry.THlc,
            ChainHead = lastEntry.Link,
            Entries = entries
        };
    }

    /// <inheritdoc/>
    public async Task<int> ClearEntriesAsync(
        string nodeId,
        string upToHlc,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(nodeId);

        await _lock.WaitAsync(cancellationToken).ConfigureAwait(false);
        try
        {
            // Must read via the unlocked helper: we already hold _lock here.
            var entries = await ReadEntriesUnlockedAsync(nodeId, cancellationToken).ConfigureAwait(false);
            var remaining = entries
                .Where(e => string.CompareOrdinal(e.THlc.ToSortableString(), upToHlc) > 0)
                .ToList();

            var cleared = entries.Count - remaining.Count;

            if (remaining.Count == 0)
            {
                var filePath = GetNodeLogFilePath(nodeId);
                if (File.Exists(filePath))
                {
                    File.Delete(filePath);
                }
            }
            else
            {
                // Rewrite with remaining entries
                var filePath = GetNodeLogFilePath(nodeId);
                var lines = remaining.Select(e => JsonSerializer.Serialize(ToDto(e), JsonOptions));
                await File.WriteAllLinesAsync(filePath, lines, cancellationToken).ConfigureAwait(false);
            }

            _logger.LogInformation(
                "Cleared {Count} offline job entries for node {NodeId} up to HLC {UpToHlc}",
                cleared, nodeId, upToHlc);

            return cleared;
        }
        finally
        {
            _lock.Release();
        }
    }

    private string GetNodeLogFilePath(string nodeId)
    {
        var safeNodeId = nodeId.Replace('/', '_').Replace('\\', '_').Replace(':', '_');
        return Path.Combine(_options.Value.DataDirectory, $"offline-jobs-{safeNodeId}.ndjson");
    }

    private void EnsureDirectoryExists()
    {
        var dir = _options.Value.DataDirectory;
        if (!Directory.Exists(dir))
        {
            Directory.CreateDirectory(dir);
            _logger.LogInformation("Created offline job log directory: {Directory}", dir);
        }
    }

    private static OfflineJobLogEntryDto ToDto(OfflineJobLogEntry entry) => new()
    {
        NodeId = entry.NodeId,
        THlc = entry.THlc.ToSortableString(),
        JobId = entry.JobId,
        PartitionKey = entry.PartitionKey,
        Payload = entry.Payload,
        PayloadHash = Convert.ToBase64String(entry.PayloadHash),
        PrevLink = entry.PrevLink is not null ? Convert.ToBase64String(entry.PrevLink) : null,
        Link = Convert.ToBase64String(entry.Link),
        EnqueuedAt = entry.EnqueuedAt
    };

    private static OfflineJobLogEntry FromDto(OfflineJobLogEntryDto dto) => new()
    {
        NodeId = dto.NodeId,
        THlc = HlcTimestamp.Parse(dto.THlc),
        JobId = dto.JobId,
        PartitionKey = dto.PartitionKey,
        Payload = dto.Payload,
        PayloadHash = Convert.FromBase64String(dto.PayloadHash),
        PrevLink = dto.PrevLink is not null ? Convert.FromBase64String(dto.PrevLink) : null,
        Link = Convert.FromBase64String(dto.Link),
        EnqueuedAt = dto.EnqueuedAt
    };

    private sealed record OfflineJobLogEntryDto
    {
        public required string NodeId { get; init; }
        public required string THlc { get; init; }
        public required Guid JobId { get; init; }
        public string? PartitionKey { get; init; }
        public required string Payload { get; init; }
        public required string PayloadHash { get; init; }
        public string? PrevLink { get; init; }
        public required string Link { get; init; }
        public DateTimeOffset EnqueuedAt { get; init; }
    }
}
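// Illustrative wiring sketch (not part of the diff): registering the file-based
// store with a custom data directory. These are the standard
// Microsoft.Extensions.DependencyInjection / Options extension methods; the
// directory path is hypothetical.
services.Configure<FileBasedOfflineJobLogStoreOptions>(
    o => o.DataDirectory = "/var/lib/stellaops/offline-job-logs");
services.AddSingleton<IOfflineJobLogStore, FileBasedOfflineJobLogStore>();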
@@ -0,0 +1,58 @@
// <copyright file="IOfflineJobLogStore.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using StellaOps.AirGap.Sync.Models;

namespace StellaOps.AirGap.Sync.Stores;

/// <summary>
/// Interface for storing offline job log entries.
/// </summary>
public interface IOfflineJobLogStore
{
    /// <summary>
    /// Appends an entry to the offline job log.
    /// </summary>
    /// <param name="entry">The entry to append.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    Task AppendAsync(OfflineJobLogEntry entry, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets all entries for a node.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>All entries in HLC order.</returns>
    Task<IReadOnlyList<OfflineJobLogEntry>> GetEntriesAsync(
        string nodeId,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the last chain link for a node.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The last link, or null if no entries exist.</returns>
    Task<byte[]?> GetLastLinkAsync(string nodeId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Gets the node job log for export.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The complete node job log.</returns>
    Task<NodeJobLog?> GetNodeJobLogAsync(string nodeId, CancellationToken cancellationToken = default);

    /// <summary>
    /// Clears entries for a node after successful sync.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="upToHlc">Clear entries up to and including this HLC timestamp.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Number of entries cleared.</returns>
    Task<int> ClearEntriesAsync(
        string nodeId,
        string upToHlc,
        CancellationToken cancellationToken = default);
}
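// Illustrative sketch (not part of the diff): a minimal in-memory
// IOfflineJobLogStore for unit tests. It mirrors the file store's contract,
// including ordering entries by THlc (assumed comparable, as the file store
// assumes) and the "up to and including" clear semantics. Not durable.
internal sealed class InMemoryOfflineJobLogStore : IOfflineJobLogStore
{
    private readonly object _gate = new();
    private readonly Dictionary<string, List<OfflineJobLogEntry>> _logs = new(StringComparer.Ordinal);

    public Task AppendAsync(OfflineJobLogEntry entry, CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            if (!_logs.TryGetValue(entry.NodeId, out var list))
            {
                _logs[entry.NodeId] = list = new List<OfflineJobLogEntry>();
            }

            list.Add(entry);
        }

        return Task.CompletedTask;
    }

    public Task<IReadOnlyList<OfflineJobLogEntry>> GetEntriesAsync(string nodeId, CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            IReadOnlyList<OfflineJobLogEntry> snapshot = _logs.TryGetValue(nodeId, out var list)
                ? list.OrderBy(e => e.THlc).ToList()
                : Array.Empty<OfflineJobLogEntry>();
            return Task.FromResult(snapshot);
        }
    }

    public async Task<byte[]?> GetLastLinkAsync(string nodeId, CancellationToken cancellationToken = default)
    {
        var entries = await GetEntriesAsync(nodeId, cancellationToken).ConfigureAwait(false);
        return entries.Count > 0 ? entries[^1].Link : null;
    }

    public async Task<NodeJobLog?> GetNodeJobLogAsync(string nodeId, CancellationToken cancellationToken = default)
    {
        var entries = await GetEntriesAsync(nodeId, cancellationToken).ConfigureAwait(false);
        if (entries.Count == 0)
        {
            return null;
        }

        var last = entries[^1];
        return new NodeJobLog { NodeId = nodeId, LastHlc = last.THlc, ChainHead = last.Link, Entries = entries };
    }

    public Task<int> ClearEntriesAsync(string nodeId, string upToHlc, CancellationToken cancellationToken = default)
    {
        lock (_gate)
        {
            if (!_logs.TryGetValue(nodeId, out var list))
            {
                return Task.FromResult(0);
            }

            // Same semantics as the file store: drop everything <= upToHlc.
            var cleared = list.RemoveAll(e => string.CompareOrdinal(e.THlc.ToSortableString(), upToHlc) <= 0);
            return Task.FromResult(cleared);
        }
    }
}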
@@ -0,0 +1,161 @@
// <copyright file="AirGapSyncMetrics.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Diagnostics.Metrics;
using StellaOps.AirGap.Sync.Models;

namespace StellaOps.AirGap.Sync.Telemetry;

/// <summary>
/// Metrics for air-gap sync operations.
/// </summary>
public static class AirGapSyncMetrics
{
    private const string NodeIdTag = "node_id";
    private const string TenantIdTag = "tenant_id";
    private const string ConflictTypeTag = "conflict_type";

    private static readonly Meter Meter = new("StellaOps.AirGap.Sync");

    // Counters
    private static readonly Counter<long> BundlesExportedCounter = Meter.CreateCounter<long>(
        "airgap_bundles_exported_total",
        unit: "{bundle}",
        description: "Total number of air-gap bundles exported");

    private static readonly Counter<long> BundlesImportedCounter = Meter.CreateCounter<long>(
        "airgap_bundles_imported_total",
        unit: "{bundle}",
        description: "Total number of air-gap bundles imported");

    private static readonly Counter<long> JobsSyncedCounter = Meter.CreateCounter<long>(
        "airgap_jobs_synced_total",
        unit: "{job}",
        description: "Total number of jobs synced from air-gap bundles");

    private static readonly Counter<long> DuplicatesDroppedCounter = Meter.CreateCounter<long>(
        "airgap_duplicates_dropped_total",
        unit: "{duplicate}",
        description: "Total number of duplicate entries dropped during merge");

    private static readonly Counter<long> MergeConflictsCounter = Meter.CreateCounter<long>(
        "airgap_merge_conflicts_total",
        unit: "{conflict}",
        description: "Total number of merge conflicts by type");

    private static readonly Counter<long> OfflineEnqueuesCounter = Meter.CreateCounter<long>(
        "airgap_offline_enqueues_total",
        unit: "{enqueue}",
        description: "Total number of offline enqueue operations");

    // Histograms
    private static readonly Histogram<double> BundleSizeHistogram = Meter.CreateHistogram<double>(
        "airgap_bundle_size_bytes",
        unit: "By",
        description: "Size of air-gap bundles in bytes");

    private static readonly Histogram<double> SyncDurationHistogram = Meter.CreateHistogram<double>(
        "airgap_sync_duration_seconds",
        unit: "s",
        description: "Duration of air-gap sync operations");

    private static readonly Histogram<int> MergeEntriesHistogram = Meter.CreateHistogram<int>(
        "airgap_merge_entries_count",
        unit: "{entry}",
        description: "Number of entries in merge operations");

    /// <summary>
    /// Records a bundle export.
    /// </summary>
    /// <param name="nodeId">The node ID that exported.</param>
    /// <param name="tenantId">The tenant ID.</param>
    /// <param name="entryCount">Number of entries in the bundle.</param>
    public static void RecordBundleExported(string nodeId, string tenantId, int entryCount)
    {
        BundlesExportedCounter.Add(1,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId),
            new KeyValuePair<string, object?>(TenantIdTag, tenantId));
        MergeEntriesHistogram.Record(entryCount,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId));
    }

    /// <summary>
    /// Records a bundle import.
    /// </summary>
    /// <param name="nodeId">The node ID that imported.</param>
    /// <param name="tenantId">The tenant ID.</param>
    public static void RecordBundleImported(string nodeId, string tenantId)
    {
        BundlesImportedCounter.Add(1,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId),
            new KeyValuePair<string, object?>(TenantIdTag, tenantId));
    }

    /// <summary>
    /// Records jobs synced from a bundle.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="count">Number of jobs synced.</param>
    public static void RecordJobsSynced(string nodeId, int count)
    {
        JobsSyncedCounter.Add(count,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId));
    }

    /// <summary>
    /// Records duplicates dropped during merge.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="count">Number of duplicates dropped.</param>
    public static void RecordDuplicatesDropped(string nodeId, int count)
    {
        if (count > 0)
        {
            DuplicatesDroppedCounter.Add(count,
                new KeyValuePair<string, object?>(NodeIdTag, nodeId));
        }
    }

    /// <summary>
    /// Records a merge conflict.
    /// </summary>
    /// <param name="conflictType">The type of conflict.</param>
    public static void RecordMergeConflict(ConflictType conflictType)
    {
        MergeConflictsCounter.Add(1,
            new KeyValuePair<string, object?>(ConflictTypeTag, conflictType.ToString()));
    }

    /// <summary>
    /// Records an offline enqueue operation.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    public static void RecordOfflineEnqueue(string nodeId)
    {
        OfflineEnqueuesCounter.Add(1,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId));
    }

    /// <summary>
    /// Records bundle size.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="sizeBytes">Size in bytes.</param>
    public static void RecordBundleSize(string nodeId, long sizeBytes)
    {
        BundleSizeHistogram.Record(sizeBytes,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId));
    }

    /// <summary>
    /// Records sync duration.
    /// </summary>
    /// <param name="nodeId">The node ID.</param>
    /// <param name="durationSeconds">Duration in seconds.</param>
    public static void RecordSyncDuration(string nodeId, double durationSeconds)
    {
        SyncDurationHistogram.Record(durationSeconds,
            new KeyValuePair<string, object?>(NodeIdTag, nodeId));
    }
}
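// Illustrative sketch (not part of the diff): instrumenting a sync call with
// the metrics above. `syncService`, `bundle`, and `ct` are assumed in scope.
var sw = System.Diagnostics.Stopwatch.StartNew();
var result = await syncService.SyncFromBundleAsync(bundle, ct);
sw.Stop();

AirGapSyncMetrics.RecordJobsSynced(bundle.CreatedByNodeId, result.Appended);
AirGapSyncMetrics.RecordDuplicatesDropped(bundle.CreatedByNodeId, result.Duplicates);
AirGapSyncMetrics.RecordSyncDuration(bundle.CreatedByNodeId, sw.Elapsed.TotalSeconds);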
@@ -0,0 +1,221 @@
// <copyright file="FileBasedJobSyncTransport.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;
using StellaOps.AirGap.Sync.Telemetry;

namespace StellaOps.AirGap.Sync.Transport;

/// <summary>
/// File-based transport for job sync bundles in air-gapped scenarios.
/// </summary>
public sealed class FileBasedJobSyncTransport : IJobSyncTransport
{
    private readonly IAirGapBundleExporter _exporter;
    private readonly IAirGapBundleImporter _importer;
    private readonly FileBasedJobSyncTransportOptions _options;
    private readonly ILogger<FileBasedJobSyncTransport> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="FileBasedJobSyncTransport"/> class.
    /// </summary>
    public FileBasedJobSyncTransport(
        IAirGapBundleExporter exporter,
        IAirGapBundleImporter importer,
        IOptions<FileBasedJobSyncTransportOptions> options,
        ILogger<FileBasedJobSyncTransport> logger)
    {
        _exporter = exporter ?? throw new ArgumentNullException(nameof(exporter));
        _importer = importer ?? throw new ArgumentNullException(nameof(importer));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public string TransportId => "file";

    /// <inheritdoc/>
    public async Task<JobSyncSendResult> SendBundleAsync(
        AirGapBundle bundle,
        string destination,
        CancellationToken cancellationToken = default)
    {
        var startTime = DateTimeOffset.UtcNow;

        try
        {
            // Ensure destination directory exists
            var destPath = Path.IsPathRooted(destination)
                ? destination
                : Path.Combine(_options.OutputDirectory, destination);

            Directory.CreateDirectory(destPath);

            // Export to file
            var filePath = Path.Combine(destPath, $"job-sync-{bundle.BundleId:N}.json");
            await _exporter.ExportToFileAsync(bundle, filePath, cancellationToken)
                .ConfigureAwait(false);

            var fileInfo = new FileInfo(filePath);
            var sizeBytes = fileInfo.Exists ? fileInfo.Length : 0;

            _logger.LogInformation(
                "Exported job sync bundle {BundleId} to {Path} ({Size} bytes)",
                bundle.BundleId,
                filePath,
                sizeBytes);

            AirGapSyncMetrics.RecordBundleSize(bundle.CreatedByNodeId, sizeBytes);

            return new JobSyncSendResult
            {
                Success = true,
                BundleId = bundle.BundleId,
                Destination = filePath,
                TransmittedAt = startTime,
                SizeBytes = sizeBytes
            };
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to export job sync bundle {BundleId}", bundle.BundleId);

            return new JobSyncSendResult
            {
                Success = false,
                BundleId = bundle.BundleId,
                Destination = destination,
                Error = ex.Message,
                TransmittedAt = startTime
            };
        }
    }

    /// <inheritdoc/>
    public async Task<AirGapBundle?> ReceiveBundleAsync(
        string source,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var sourcePath = Path.IsPathRooted(source)
                ? source
                : Path.Combine(_options.InputDirectory, source);

            if (!File.Exists(sourcePath))
            {
                _logger.LogWarning("Job sync bundle file not found: {Path}", sourcePath);
                return null;
            }

            var bundle = await _importer.ImportFromFileAsync(sourcePath, cancellationToken)
                .ConfigureAwait(false);

            _logger.LogInformation(
                "Imported job sync bundle {BundleId} from {Path}",
                bundle.BundleId,
                sourcePath);

            return bundle;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to import job sync bundle from {Source}", source);
            return null;
        }
    }

    /// <inheritdoc/>
    public Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
        string source,
        CancellationToken cancellationToken = default)
    {
        var sourcePath = Path.IsPathRooted(source)
            ? source
            : Path.Combine(_options.InputDirectory, source);

        var bundles = new List<BundleInfo>();

        if (!Directory.Exists(sourcePath))
        {
            return Task.FromResult<IReadOnlyList<BundleInfo>>(bundles);
        }

        var files = Directory.GetFiles(sourcePath, "job-sync-*.json");

        foreach (var file in files)
        {
            try
            {
                // Quick parse to extract bundle metadata
                var json = File.ReadAllText(file);
                using var doc = JsonDocument.Parse(json); // JsonDocument is IDisposable
                var root = doc.RootElement;

                if (root.TryGetProperty("bundleId", out var bundleIdProp) &&
                    root.TryGetProperty("tenantId", out var tenantIdProp) &&
                    root.TryGetProperty("createdByNodeId", out var nodeIdProp) &&
                    root.TryGetProperty("createdAt", out var createdAtProp))
                {
                    var entryCount = 0;
                    if (root.TryGetProperty("jobLogs", out var jobLogs))
                    {
                        foreach (var log in jobLogs.EnumerateArray())
                        {
                            if (log.TryGetProperty("entries", out var entries))
                            {
                                entryCount += entries.GetArrayLength();
                            }
                        }
                    }

                    bundles.Add(new BundleInfo
                    {
                        BundleId = Guid.Parse(bundleIdProp.GetString()!),
                        TenantId = tenantIdProp.GetString()!,
                        SourceNodeId = nodeIdProp.GetString()!,
                        CreatedAt = DateTimeOffset.Parse(createdAtProp.GetString()!),
                        EntryCount = entryCount,
                        SizeBytes = new FileInfo(file).Length
                    });
                }
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to parse bundle metadata from {File}", file);
            }
        }

        return Task.FromResult<IReadOnlyList<BundleInfo>>(
            bundles.OrderByDescending(b => b.CreatedAt).ToList());
    }
}

/// <summary>
/// Options for file-based job sync transport.
/// </summary>
public sealed class FileBasedJobSyncTransportOptions
{
    /// <summary>
    /// Gets or sets the output directory for exporting bundles.
    /// </summary>
    public string OutputDirectory { get; set; } = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
        "stellaops",
        "airgap",
        "outbox");

    /// <summary>
    /// Gets or sets the input directory for importing bundles.
    /// </summary>
    public string InputDirectory { get; set; } = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
        "stellaops",
        "airgap",
        "inbox");
}
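// Illustrative round-trip sketch (not part of the diff): exporting a bundle to
// removable media and importing it on the connected side. `transport` is an
// assumed DI-resolved FileBasedJobSyncTransport; the mount path is hypothetical.
var sent = await transport.SendBundleAsync(bundle, "/mnt/usb/outbox", ct);
if (!sent.Success)
{
    throw new InvalidOperationException(sent.Error);
}

// On the receiving side, enumerate and import; Destination is the rooted file path.
var available = await transport.ListAvailableBundlesAsync("/mnt/usb/outbox", ct);
var received = await transport.ReceiveBundleAsync(sent.Destination, ct);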
@@ -0,0 +1,123 @@
// <copyright file="IJobSyncTransport.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using StellaOps.AirGap.Sync.Models;

namespace StellaOps.AirGap.Sync.Transport;

/// <summary>
/// Transport abstraction for job sync bundles.
/// Enables bundle transfer over various transports (file, Router messaging, etc.).
/// </summary>
public interface IJobSyncTransport
{
    /// <summary>
    /// Gets the transport identifier.
    /// </summary>
    string TransportId { get; }

    /// <summary>
    /// Sends a job sync bundle to a destination.
    /// </summary>
    /// <param name="bundle">The bundle to send.</param>
    /// <param name="destination">The destination identifier.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The send result.</returns>
    Task<JobSyncSendResult> SendBundleAsync(
        AirGapBundle bundle,
        string destination,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Receives a job sync bundle from a source.
    /// </summary>
    /// <param name="source">The source identifier.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>The received bundle, or null if not available.</returns>
    Task<AirGapBundle?> ReceiveBundleAsync(
        string source,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists available bundles from a source.
    /// </summary>
    /// <param name="source">The source identifier.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>List of available bundle descriptors.</returns>
    Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
        string source,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Result of sending a job sync bundle.
/// </summary>
public sealed record JobSyncSendResult
{
    /// <summary>
    /// Gets a value indicating whether the send was successful.
    /// </summary>
    public required bool Success { get; init; }

    /// <summary>
    /// Gets the bundle ID.
    /// </summary>
    public required Guid BundleId { get; init; }

    /// <summary>
    /// Gets the destination where the bundle was sent.
    /// </summary>
    public required string Destination { get; init; }

    /// <summary>
    /// Gets the error message if the send failed.
    /// </summary>
    public string? Error { get; init; }

    /// <summary>
    /// Gets the transmission timestamp.
    /// </summary>
    public DateTimeOffset TransmittedAt { get; init; }

    /// <summary>
    /// Gets the size of the transmitted data in bytes.
    /// </summary>
    public long SizeBytes { get; init; }
}

/// <summary>
/// Information about an available bundle.
/// </summary>
public sealed record BundleInfo
{
    /// <summary>
    /// Gets the bundle ID.
    /// </summary>
    public required Guid BundleId { get; init; }

    /// <summary>
    /// Gets the tenant ID.
    /// </summary>
    public required string TenantId { get; init; }

    /// <summary>
    /// Gets the source node ID.
    /// </summary>
    public required string SourceNodeId { get; init; }

    /// <summary>
    /// Gets the creation timestamp.
    /// </summary>
    public required DateTimeOffset CreatedAt { get; init; }

    /// <summary>
    /// Gets the entry count in the bundle.
    /// </summary>
    public int EntryCount { get; init; }

    /// <summary>
    /// Gets the bundle size in bytes.
    /// </summary>
    public long SizeBytes { get; init; }
}
@@ -0,0 +1,272 @@
// <copyright file="RouterJobSyncTransport.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;
using StellaOps.AirGap.Sync.Telemetry;

namespace StellaOps.AirGap.Sync.Transport;

/// <summary>
/// Router-based transport for job sync bundles when network is available.
/// This transport uses the Router messaging infrastructure for real-time sync.
/// </summary>
public sealed class RouterJobSyncTransport : IJobSyncTransport
{
    private readonly IAirGapBundleExporter _exporter;
    private readonly IAirGapBundleImporter _importer;
    private readonly IRouterJobSyncClient _routerClient;
    private readonly RouterJobSyncTransportOptions _options;
    private readonly ILogger<RouterJobSyncTransport> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="RouterJobSyncTransport"/> class.
    /// </summary>
    public RouterJobSyncTransport(
        IAirGapBundleExporter exporter,
        IAirGapBundleImporter importer,
        IRouterJobSyncClient routerClient,
        IOptions<RouterJobSyncTransportOptions> options,
        ILogger<RouterJobSyncTransport> logger)
    {
        _exporter = exporter ?? throw new ArgumentNullException(nameof(exporter));
        _importer = importer ?? throw new ArgumentNullException(nameof(importer));
        _routerClient = routerClient ?? throw new ArgumentNullException(nameof(routerClient));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc/>
    public string TransportId => "router";

    /// <inheritdoc/>
    public async Task<JobSyncSendResult> SendBundleAsync(
        AirGapBundle bundle,
        string destination,
        CancellationToken cancellationToken = default)
    {
        var startTime = DateTimeOffset.UtcNow;

        try
        {
            // Serialize bundle
            var json = await _exporter.ExportToStringAsync(bundle, cancellationToken)
                .ConfigureAwait(false);
            var payload = Encoding.UTF8.GetBytes(json);

            _logger.LogDebug(
                "Sending job sync bundle {BundleId} to {Destination} ({Size} bytes)",
                bundle.BundleId,
                destination,
                payload.Length);

            // Send via Router
            var response = await _routerClient.SendJobSyncBundleAsync(
                destination,
                bundle.BundleId,
                bundle.TenantId,
                payload,
                _options.SendTimeout,
                cancellationToken).ConfigureAwait(false);

            if (response.Success)
            {
                AirGapSyncMetrics.RecordBundleSize(bundle.CreatedByNodeId, payload.Length);

                _logger.LogInformation(
                    "Sent job sync bundle {BundleId} to {Destination}",
                    bundle.BundleId,
                    destination);
            }
            else
            {
                _logger.LogWarning(
                    "Failed to send job sync bundle {BundleId} to {Destination}: {Error}",
                    bundle.BundleId,
                    destination,
                    response.Error);
            }

            return new JobSyncSendResult
            {
                Success = response.Success,
                BundleId = bundle.BundleId,
                Destination = destination,
                Error = response.Error,
                TransmittedAt = startTime,
                SizeBytes = payload.Length
            };
        }
        catch (Exception ex)
        {
            _logger.LogError(
                ex,
                "Error sending job sync bundle {BundleId} to {Destination}",
                bundle.BundleId,
                destination);

            return new JobSyncSendResult
            {
                Success = false,
                BundleId = bundle.BundleId,
                Destination = destination,
                Error = ex.Message,
                TransmittedAt = startTime
            };
        }
    }

    /// <inheritdoc/>
    public async Task<AirGapBundle?> ReceiveBundleAsync(
        string source,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var response = await _routerClient.ReceiveJobSyncBundleAsync(
                source,
                _options.ReceiveTimeout,
                cancellationToken).ConfigureAwait(false);

            if (response.Payload is null || response.Payload.Length == 0)
            {
                _logger.LogDebug("No bundle available from {Source}", source);
                return null;
            }

            var json = Encoding.UTF8.GetString(response.Payload);
            var bundle = await _importer.ImportFromStringAsync(json, cancellationToken)
                .ConfigureAwait(false);

            _logger.LogInformation(
                "Received job sync bundle {BundleId} from {Source}",
                bundle.BundleId,
                source);

            return bundle;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error receiving job sync bundle from {Source}", source);
            return null;
        }
    }

    /// <inheritdoc/>
    public async Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
        string source,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var response = await _routerClient.ListAvailableBundlesAsync(
                source,
                _options.ListTimeout,
                cancellationToken).ConfigureAwait(false);

            return response.Bundles;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error listing available bundles from {Source}", source);
            return Array.Empty<BundleInfo>();
        }
    }
}

/// <summary>
/// Options for Router-based job sync transport.
/// </summary>
public sealed class RouterJobSyncTransportOptions
{
    /// <summary>
    /// Gets or sets the timeout for send operations.
    /// </summary>
    public TimeSpan SendTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Gets or sets the timeout for receive operations.
    /// </summary>
    public TimeSpan ReceiveTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Gets or sets the timeout for list operations.
    /// </summary>
    public TimeSpan ListTimeout { get; set; } = TimeSpan.FromSeconds(10);

    /// <summary>
    /// Gets or sets the service endpoint for job sync.
    /// </summary>
    public string ServiceEndpoint { get; set; } = "scheduler.job-sync";
}

/// <summary>
/// Client interface for Router job sync operations.
/// </summary>
public interface IRouterJobSyncClient
{
    /// <summary>
    /// Sends a job sync bundle via the Router.
    /// </summary>
    Task<RouterSendResponse> SendJobSyncBundleAsync(
        string destination,
        Guid bundleId,
        string tenantId,
        byte[] payload,
        TimeSpan timeout,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Receives a job sync bundle via the Router.
    /// </summary>
    Task<RouterReceiveResponse> ReceiveJobSyncBundleAsync(
        string source,
        TimeSpan timeout,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists available bundles via the Router.
    /// </summary>
    Task<RouterListResponse> ListAvailableBundlesAsync(
        string source,
        TimeSpan timeout,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Response from a Router send operation.
/// </summary>
public sealed record RouterSendResponse
{
    /// <summary>Gets a value indicating whether the send was successful.</summary>
    public bool Success { get; init; }

    /// <summary>Gets the error message if failed.</summary>
    public string? Error { get; init; }
}

/// <summary>
/// Response from a Router receive operation.
/// </summary>
public sealed record RouterReceiveResponse
{
    /// <summary>Gets the received payload.</summary>
    public byte[]? Payload { get; init; }

    /// <summary>Gets the bundle ID.</summary>
    public Guid? BundleId { get; init; }
}

/// <summary>
/// Response from a Router list operation.
/// </summary>
public sealed record RouterListResponse
{
    /// <summary>Gets the available bundles.</summary>
    public IReadOnlyList<BundleInfo> Bundles { get; init; } = Array.Empty<BundleInfo>();
}
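
// Illustrative sketch (editor addition, not part of the commit): one way to
// wire the transport above into a host. It assumes the standard
// Microsoft.Extensions DependencyInjection/Options packages with the usual
// usings in scope; HypotheticalRouterJobSyncClient stands in for whatever
// concrete IRouterJobSyncClient the host provides.
internal static class RouterJobSyncWiringSketch
{
    public static IServiceCollection AddRouterJobSync(this IServiceCollection services)
    {
        services.Configure<RouterJobSyncTransportOptions>(options =>
        {
            // Slow or high-latency links may need more headroom than the defaults.
            options.SendTimeout = TimeSpan.FromSeconds(60);
        });

        services.AddSingleton<IRouterJobSyncClient, HypotheticalRouterJobSyncClient>();
        services.AddSingleton<IJobSyncTransport, RouterJobSyncTransport>();
        return services;
    }
}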
@@ -22,6 +22,9 @@ namespace StellaOps.AirGap.Bundle.Tests;
/// Task AIRGAP-5100-016: Export bundle (online env) → import bundle (offline env) → verify data integrity
/// Task AIRGAP-5100-017: Policy export → policy import → policy evaluation → verify identical verdict
/// </summary>
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Integrations)]
[Trait("BlastRadius", TestCategories.BlastRadius.Persistence)]
public sealed class AirGapIntegrationTests : IDisposable
{
    private readonly string _tempRoot;

@@ -0,0 +1,446 @@
// <copyright file="HlcMergeServiceTests.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>

using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;
using StellaOps.HybridLogicalClock;
using StellaOps.TestKit;
using Xunit;

namespace StellaOps.AirGap.Sync.Tests;

/// <summary>
/// Unit tests for <see cref="HlcMergeService"/>.
/// </summary>
[Trait("Category", TestCategories.Unit)]
public sealed class HlcMergeServiceTests
{
    private readonly HlcMergeService _sut;
    private readonly ConflictResolver _conflictResolver;

    public HlcMergeServiceTests()
    {
        _conflictResolver = new ConflictResolver(NullLogger<ConflictResolver>.Instance);
        _sut = new HlcMergeService(_conflictResolver, NullLogger<HlcMergeService>.Instance);
    }

    #region OMP-014: Merge Algorithm Correctness

    [Fact]
    public async Task MergeAsync_EmptyInput_ReturnsEmptyResult()
    {
        // Arrange
        var nodeLogs = new List<NodeJobLog>();

        // Act
        var result = await _sut.MergeAsync(nodeLogs);

        // Assert
        result.MergedEntries.Should().BeEmpty();
        result.Duplicates.Should().BeEmpty();
        result.SourceNodes.Should().BeEmpty();
        result.MergedChainHead.Should().BeNull();
    }

    [Fact]
    public async Task MergeAsync_SingleNode_PreservesOrder()
    {
        // Arrange
        var nodeLog = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("11111111-1111-1111-1111-111111111111")),
            CreateEntry("node-a", 200, 0, Guid.Parse("22222222-2222-2222-2222-222222222222")),
            CreateEntry("node-a", 300, 0, Guid.Parse("33333333-3333-3333-3333-333333333333"))
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeLog });

        // Assert
        result.MergedEntries.Should().HaveCount(3);
        result.MergedEntries[0].JobId.Should().Be(Guid.Parse("11111111-1111-1111-1111-111111111111"));
        result.MergedEntries[1].JobId.Should().Be(Guid.Parse("22222222-2222-2222-2222-222222222222"));
        result.MergedEntries[2].JobId.Should().Be(Guid.Parse("33333333-3333-3333-3333-333333333333"));
        result.Duplicates.Should().BeEmpty();
        result.SourceNodes.Should().ContainSingle().Which.Should().Be("node-a");
    }

    [Fact]
    public async Task MergeAsync_TwoNodes_MergesByHlcOrder()
    {
        // Arrange - Two nodes with interleaved HLC timestamps
        // Node A: T=100, T=102
        // Node B: T=101, T=103
        // Expected order: 100, 101, 102, 103
        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000")),
            CreateEntry("node-a", 102, 0, Guid.Parse("aaaaaaaa-0003-0000-0000-000000000000"))
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntry("node-b", 101, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000")),
            CreateEntry("node-b", 103, 0, Guid.Parse("bbbbbbbb-0004-0000-0000-000000000000"))
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeA, nodeB });

        // Assert
        result.MergedEntries.Should().HaveCount(4);
        result.MergedEntries[0].THlc.PhysicalTime.Should().Be(100);
        result.MergedEntries[1].THlc.PhysicalTime.Should().Be(101);
        result.MergedEntries[2].THlc.PhysicalTime.Should().Be(102);
        result.MergedEntries[3].THlc.PhysicalTime.Should().Be(103);
        result.SourceNodes.Should().HaveCount(2);
    }

    [Fact]
    public async Task MergeAsync_SamePhysicalTime_OrdersByLogicalCounter()
    {
        // Arrange - Same physical time, different logical counters
        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0000-0000-0000-000000000001")),
            CreateEntry("node-a", 100, 2, Guid.Parse("aaaaaaaa-0000-0000-0000-000000000003"))
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntry("node-b", 100, 1, Guid.Parse("bbbbbbbb-0000-0000-0000-000000000002")),
            CreateEntry("node-b", 100, 3, Guid.Parse("bbbbbbbb-0000-0000-0000-000000000004"))
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeA, nodeB });

        // Assert
        result.MergedEntries.Should().HaveCount(4);
        result.MergedEntries[0].THlc.LogicalCounter.Should().Be(0);
        result.MergedEntries[1].THlc.LogicalCounter.Should().Be(1);
        result.MergedEntries[2].THlc.LogicalCounter.Should().Be(2);
        result.MergedEntries[3].THlc.LogicalCounter.Should().Be(3);
    }

    [Fact]
    public async Task MergeAsync_SameTimeAndCounter_OrdersByNodeId()
    {
        // Arrange - Same physical time and counter, different node IDs
        var nodeA = CreateNodeLog("alpha-node", new[]
        {
            CreateEntry("alpha-node", 100, 0, Guid.Parse("aaaaaaaa-0000-0000-0000-000000000001"))
        });
        var nodeB = CreateNodeLog("beta-node", new[]
        {
            CreateEntry("beta-node", 100, 0, Guid.Parse("bbbbbbbb-0000-0000-0000-000000000002"))
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeA, nodeB });

        // Assert - "alpha-node" < "beta-node" alphabetically
        result.MergedEntries.Should().HaveCount(2);
        result.MergedEntries[0].SourceNodeId.Should().Be("alpha-node");
        result.MergedEntries[1].SourceNodeId.Should().Be("beta-node");
    }

    [Fact]
    public async Task MergeAsync_RecomputesUnifiedChain()
    {
        // Arrange
        var nodeLog = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("11111111-1111-1111-1111-111111111111")),
            CreateEntry("node-a", 200, 0, Guid.Parse("22222222-2222-2222-2222-222222222222"))
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeLog });

        // Assert - Chain should be recomputed
        result.MergedEntries.Should().HaveCount(2);
        result.MergedEntries[0].MergedLink.Should().NotBeNull();
        result.MergedEntries[1].MergedLink.Should().NotBeNull();
        result.MergedChainHead.Should().NotBeNull();

        // First entry's link should be computed from null prev_link
        result.MergedEntries[0].MergedLink.Should().HaveCount(32);

        // Chain head should equal last entry's merged link
        result.MergedChainHead.Should().BeEquivalentTo(result.MergedEntries[1].MergedLink);
    }

    #endregion

    #region OMP-015: Duplicate Detection

    [Fact]
    public async Task MergeAsync_DuplicateJobId_SamePayload_TakesEarliest()
    {
        // Arrange - Same job ID (same payload hash) from two nodes
        var jobId = Guid.Parse("dddddddd-dddd-dddd-dddd-dddddddddddd");
        var payloadHash = new byte[32];
        payloadHash[0] = 0xAA;

        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntryWithPayloadHash("node-a", 100, 0, jobId, payloadHash)
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntryWithPayloadHash("node-b", 105, 0, jobId, payloadHash)
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeA, nodeB });

        // Assert - Should take earliest (T=100 from node-a)
        result.MergedEntries.Should().ContainSingle();
        result.MergedEntries[0].SourceNodeId.Should().Be("node-a");
        result.MergedEntries[0].THlc.PhysicalTime.Should().Be(100);

        // Should report duplicate
        result.Duplicates.Should().ContainSingle();
        result.Duplicates[0].JobId.Should().Be(jobId);
        result.Duplicates[0].NodeId.Should().Be("node-b");
        result.Duplicates[0].THlc.PhysicalTime.Should().Be(105);
    }

    [Fact]
    public async Task MergeAsync_TriplicateJobId_SamePayload_TakesEarliest()
    {
        // Arrange - Same job ID from three nodes
        var jobId = Guid.Parse("eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee");
        var payloadHash = new byte[32];
        payloadHash[0] = 0xBB;

        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntryWithPayloadHash("node-a", 200, 0, jobId, payloadHash)
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntryWithPayloadHash("node-b", 100, 0, jobId, payloadHash) // Earliest
        });
        var nodeC = CreateNodeLog("node-c", new[]
        {
            CreateEntryWithPayloadHash("node-c", 150, 0, jobId, payloadHash)
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeA, nodeB, nodeC });

        // Assert - Should take earliest (T=100 from node-b)
        result.MergedEntries.Should().ContainSingle();
        result.MergedEntries[0].SourceNodeId.Should().Be("node-b");
        result.MergedEntries[0].THlc.PhysicalTime.Should().Be(100);

        // Should report two duplicates
        result.Duplicates.Should().HaveCount(2);
    }

    [Fact]
    public async Task MergeAsync_DuplicateJobId_DifferentPayload_ThrowsError()
    {
        // Arrange - Same job ID but different payload hashes (indicates bug)
        var jobId = Guid.Parse("ffffffff-ffff-ffff-ffff-ffffffffffff");
        var payloadHashA = new byte[32];
        payloadHashA[0] = 0x01;
        var payloadHashB = new byte[32];
        payloadHashB[0] = 0x02;

        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntryWithPayloadHash("node-a", 100, 0, jobId, payloadHashA)
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntryWithPayloadHash("node-b", 105, 0, jobId, payloadHashB)
        });

        // Act & Assert - Should throw because payloads differ
        var act = () => _sut.MergeAsync(new[] { nodeA, nodeB });
        await act.Should().ThrowAsync<InvalidOperationException>()
            .WithMessage("*conflicting payloads*");
    }

    #endregion

    #region OMP-018: Multi-Node Merge

    [Fact]
    public async Task MergeAsync_ThreeNodes_MergesCorrectly()
    {
        // Arrange - Three nodes with various timestamps
        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000")),
            CreateEntry("node-a", 400, 0, Guid.Parse("aaaaaaaa-0007-0000-0000-000000000000"))
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntry("node-b", 200, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000")),
            CreateEntry("node-b", 500, 0, Guid.Parse("bbbbbbbb-0008-0000-0000-000000000000"))
        });
        var nodeC = CreateNodeLog("node-c", new[]
        {
            CreateEntry("node-c", 300, 0, Guid.Parse("cccccccc-0003-0000-0000-000000000000")),
            CreateEntry("node-c", 600, 0, Guid.Parse("cccccccc-0009-0000-0000-000000000000"))
        });

        // Act
        var result = await _sut.MergeAsync(new[] { nodeA, nodeB, nodeC });

        // Assert
        result.MergedEntries.Should().HaveCount(6);
        result.MergedEntries.Select(e => e.THlc.PhysicalTime).Should()
            .BeInAscendingOrder();
        result.MergedEntries.Select(e => e.THlc.PhysicalTime).Should()
            .ContainInOrder(100L, 200L, 300L, 400L, 500L, 600L);
        result.SourceNodes.Should().HaveCount(3);
    }

    [Fact]
    public async Task MergeAsync_ManyNodes_PreservesTotalOrder()
    {
        // Arrange - 5 nodes with 2 entries each
        var nodes = new List<NodeJobLog>();
        for (int i = 0; i < 5; i++)
        {
            var nodeId = $"node-{i:D2}";
            nodes.Add(CreateNodeLog(nodeId, new[]
            {
                CreateEntry(nodeId, 100 + i * 10, 0, Guid.NewGuid()),
                CreateEntry(nodeId, 150 + i * 10, 0, Guid.NewGuid())
            }));
        }

        // Act
        var result = await _sut.MergeAsync(nodes);

        // Assert
        result.MergedEntries.Should().HaveCount(10);
        result.MergedEntries.Select(e => e.THlc.PhysicalTime).Should()
            .BeInAscendingOrder();
    }

    #endregion

    #region OMP-019: Determinism Tests

    [Fact]
    public async Task MergeAsync_SameInput_ProducesSameOutput()
    {
        // Arrange
        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000")),
            CreateEntry("node-a", 300, 0, Guid.Parse("aaaaaaaa-0003-0000-0000-000000000000"))
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntry("node-b", 200, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000")),
            CreateEntry("node-b", 400, 0, Guid.Parse("bbbbbbbb-0004-0000-0000-000000000000"))
        });

        // Act - Run merge twice
        var result1 = await _sut.MergeAsync(new[] { nodeA, nodeB });
        var result2 = await _sut.MergeAsync(new[] { nodeA, nodeB });

        // Assert - Results should be identical
        result1.MergedEntries.Should().HaveCount(result2.MergedEntries.Count);
        for (int i = 0; i < result1.MergedEntries.Count; i++)
        {
            result1.MergedEntries[i].JobId.Should().Be(result2.MergedEntries[i].JobId);
            result1.MergedEntries[i].THlc.Should().Be(result2.MergedEntries[i].THlc);
            result1.MergedEntries[i].MergedLink.Should().BeEquivalentTo(result2.MergedEntries[i].MergedLink);
        }
        result1.MergedChainHead.Should().BeEquivalentTo(result2.MergedChainHead);
    }

    [Fact]
    public async Task MergeAsync_InputOrderIndependent_ProducesSameOutput()
    {
        // Arrange
        var nodeA = CreateNodeLog("node-a", new[]
        {
            CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000"))
        });
        var nodeB = CreateNodeLog("node-b", new[]
        {
            CreateEntry("node-b", 200, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000"))
        });

        // Act - Merge in different orders
        var result1 = await _sut.MergeAsync(new[] { nodeA, nodeB });
        var result2 = await _sut.MergeAsync(new[] { nodeB, nodeA });

        // Assert - Results should be identical regardless of input order
        result1.MergedEntries.Select(e => e.JobId).Should()
            .BeEquivalentTo(result2.MergedEntries.Select(e => e.JobId));
        result1.MergedChainHead.Should().BeEquivalentTo(result2.MergedChainHead);
    }

    #endregion

    #region Helper Methods

    private static NodeJobLog CreateNodeLog(string nodeId, IEnumerable<OfflineJobLogEntry> entries)
    {
        return new NodeJobLog
        {
            NodeId = nodeId,
            Entries = entries.ToList()
        };
    }

    private static OfflineJobLogEntry CreateEntry(string nodeId, long physicalTime, int logicalCounter, Guid jobId)
    {
        var payloadHash = new byte[32];
        jobId.ToByteArray().CopyTo(payloadHash, 0);

        var hlc = new HlcTimestamp
        {
            PhysicalTime = physicalTime,
            NodeId = nodeId,
            LogicalCounter = logicalCounter
        };

        return new OfflineJobLogEntry
        {
            NodeId = nodeId,
            THlc = hlc,
            JobId = jobId,
            Payload = $"{{\"id\":\"{jobId}\"}}",
            PayloadHash = payloadHash,
            Link = new byte[32],
            EnqueuedAt = DateTimeOffset.UtcNow
        };
    }

    private static OfflineJobLogEntry CreateEntryWithPayloadHash(
        string nodeId, long physicalTime, int logicalCounter, Guid jobId, byte[] payloadHash)
    {
        var hlc = new HlcTimestamp
        {
            PhysicalTime = physicalTime,
            NodeId = nodeId,
            LogicalCounter = logicalCounter
        };

        return new OfflineJobLogEntry
        {
            NodeId = nodeId,
            THlc = hlc,
            JobId = jobId,
            Payload = $"{{\"id\":\"{jobId}\"}}",
            PayloadHash = payloadHash,
            Link = new byte[32],
            EnqueuedAt = DateTimeOffset.UtcNow
        };
    }

    #endregion
}
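
// Illustrative sketch (editor addition, not part of the commit): the total
// order these tests assert is the standard HLC comparison - physical time
// first, then the logical counter, then the node id as a deterministic
// tiebreaker. A minimal comparison helper, assuming the HlcTimestamp shape
// used in the helpers above:
internal static class HlcOrderingSketch
{
    // Returns negative/zero/positive, like IComparer<T>.Compare.
    public static int Compare(HlcTimestamp x, HlcTimestamp y)
    {
        var byTime = x.PhysicalTime.CompareTo(y.PhysicalTime);
        if (byTime != 0)
        {
            return byTime;
        }

        var byCounter = x.LogicalCounter.CompareTo(y.LogicalCounter);
        if (byCounter != 0)
        {
            return byCounter;
        }

        return string.CompareOrdinal(x.NodeId, y.NodeId);
    }
}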
@@ -0,0 +1,29 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <LangVersion>preview</LangVersion>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <IsPackable>false</IsPackable>
    <IsTestProject>true</IsTestProject>
    <TreatWarningsAsErrors>false</TreatWarningsAsErrors>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="xunit.runner.visualstudio">
      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
      <PrivateAssets>all</PrivateAssets>
    </PackageReference>
    <PackageReference Include="coverlet.collector">
      <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
      <PrivateAssets>all</PrivateAssets>
    </PackageReference>
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\..\__Libraries\StellaOps.AirGap.Sync\StellaOps.AirGap.Sync.csproj" />
    <ProjectReference Include="..\..\..\__Libraries\StellaOps.TestKit\StellaOps.TestKit.csproj" />
  </ItemGroup>

</Project>
src/Attestor/StellaOps.Attestation.Tests/DsseVerifierTests.cs (new file, 295 lines)
@@ -0,0 +1,295 @@
// <copyright file="DsseVerifierTests.cs" company="Stella Operations">
// Copyright (c) Stella Operations. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using FluentAssertions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Xunit;

namespace StellaOps.Attestation.Tests;

/// <summary>
/// Unit tests for DsseVerifier.
/// Sprint: SPRINT_20260105_002_001_REPLAY, Tasks RPL-006 through RPL-010.
/// </summary>
[Trait("Category", "Unit")]
public class DsseVerifierTests
{
    private readonly DsseVerifier _verifier;

    public DsseVerifierTests()
    {
        _verifier = new DsseVerifier(NullLogger<DsseVerifier>.Instance);
    }

    [Fact]
    public async Task VerifyAsync_WithValidEcdsaSignature_ReturnsSuccess()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, publicKeyPem) = CreateSignedEnvelope(ecdsa);

        // Act
        var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeTrue();
        result.ValidSignatureCount.Should().Be(1);
        result.TotalSignatureCount.Should().Be(1);
        result.PayloadType.Should().Be("https://in-toto.io/Statement/v1");
        result.Issues.Should().BeEmpty();
    }

    [Fact]
    public async Task VerifyAsync_WithInvalidSignature_ReturnsFail()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, _) = CreateSignedEnvelope(ecdsa);

        // Use a different key for verification
        using var differentKey = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var differentPublicKeyPem = ExportPublicKeyPem(differentKey);

        // Act
        var result = await _verifier.VerifyAsync(envelope, differentPublicKeyPem, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeFalse();
        result.ValidSignatureCount.Should().Be(0);
        result.Issues.Should().NotBeEmpty();
    }

    [Fact]
    public async Task VerifyAsync_WithMalformedJson_ReturnsParseError()
    {
        // Arrange
        var malformedJson = "{ not valid json }";
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var publicKeyPem = ExportPublicKeyPem(ecdsa);

        // Act
        var result = await _verifier.VerifyAsync(malformedJson, publicKeyPem, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeFalse();
        result.Issues.Should().Contain(i => i.Contains("envelope_parse_error"));
    }

    [Fact]
    public async Task VerifyAsync_WithMissingPayload_ReturnsFail()
    {
        // Arrange
        var envelope = JsonSerializer.Serialize(new
        {
            payloadType = "https://in-toto.io/Statement/v1",
            signatures = new[] { new { keyId = "key-001", sig = "YWJj" } }
        });
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var publicKeyPem = ExportPublicKeyPem(ecdsa);

        // Act
        var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeFalse();
        result.Issues.Should().Contain(i => i.Contains("envelope_missing_payload"));
    }

    [Fact]
    public async Task VerifyAsync_WithMissingSignatures_ReturnsFail()
    {
        // Arrange
        var payload = Convert.ToBase64String(Encoding.UTF8.GetBytes("{}"));
        var envelope = JsonSerializer.Serialize(new
        {
            payloadType = "https://in-toto.io/Statement/v1",
            payload,
            signatures = Array.Empty<object>()
        });
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var publicKeyPem = ExportPublicKeyPem(ecdsa);

        // Act
        var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeFalse();
        result.Issues.Should().Contain("envelope_missing_signatures");
    }

    [Fact]
    public async Task VerifyAsync_WithNoTrustedKeys_ReturnsFail()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, _) = CreateSignedEnvelope(ecdsa);

        // Act
        var result = await _verifier.VerifyAsync(envelope, Array.Empty<string>(), TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeFalse();
        result.Issues.Should().Contain("no_trusted_keys_provided");
    }

    [Fact]
    public async Task VerifyAsync_WithMultipleTrustedKeys_SucceedsWithMatchingKey()
    {
        // Arrange
        using var signingKey = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        using var otherKey1 = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        using var otherKey2 = ECDsa.Create(ECCurve.NamedCurves.nistP256);

        var (envelope, signingKeyPem) = CreateSignedEnvelope(signingKey);

        var trustedKeys = new[]
        {
            ExportPublicKeyPem(otherKey1),
            signingKeyPem,
            ExportPublicKeyPem(otherKey2),
        };

        // Act
        var result = await _verifier.VerifyAsync(envelope, trustedKeys, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeTrue();
        result.ValidSignatureCount.Should().Be(1);
    }

    [Fact]
    public async Task VerifyAsync_WithKeyResolver_UsesResolverForVerification()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, publicKeyPem) = CreateSignedEnvelope(ecdsa);

        Task<string?> KeyResolver(string? keyId, CancellationToken ct)
        {
            return Task.FromResult<string?>(publicKeyPem);
        }

        // Act
        var result = await _verifier.VerifyAsync(envelope, KeyResolver, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeTrue();
    }

    [Fact]
    public async Task VerifyAsync_WithKeyResolverReturningNull_ReturnsFail()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, _) = CreateSignedEnvelope(ecdsa);

        static Task<string?> KeyResolver(string? keyId, CancellationToken ct)
        {
            return Task.FromResult<string?>(null);
        }

        // Act
        var result = await _verifier.VerifyAsync(envelope, KeyResolver, TestContext.Current.CancellationToken);

        // Assert
        result.IsValid.Should().BeFalse();
        result.Issues.Should().Contain(i => i.Contains("key_not_found"));
    }

    [Fact]
    public async Task VerifyAsync_ReturnsPayloadHash()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, publicKeyPem) = CreateSignedEnvelope(ecdsa);

        // Act
        var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);

        // Assert
        result.PayloadHash.Should().StartWith("sha256:");
        result.PayloadHash.Should().HaveLength("sha256:".Length + 64);
    }

    [Fact]
    public async Task VerifyAsync_ThrowsOnNullEnvelope()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var publicKeyPem = ExportPublicKeyPem(ecdsa);

        // Act & Assert - null envelope throws ArgumentNullException
        await Assert.ThrowsAsync<ArgumentNullException>(
            () => _verifier.VerifyAsync(null!, publicKeyPem, TestContext.Current.CancellationToken));

        // Empty envelope throws ArgumentException (whitespace check)
        await Assert.ThrowsAsync<ArgumentException>(
            () => _verifier.VerifyAsync("", publicKeyPem, TestContext.Current.CancellationToken));
    }

    [Fact]
    public async Task VerifyAsync_ThrowsOnNullKeys()
    {
        // Arrange
        using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
        var (envelope, _) = CreateSignedEnvelope(ecdsa);

        // Act & Assert
        await Assert.ThrowsAsync<ArgumentNullException>(
            () => _verifier.VerifyAsync(envelope, (IEnumerable<string>)null!, TestContext.Current.CancellationToken));

        await Assert.ThrowsAsync<ArgumentNullException>(
            () => _verifier.VerifyAsync(envelope, (Func<string?, CancellationToken, Task<string?>>)null!, TestContext.Current.CancellationToken));
    }

    private static (string EnvelopeJson, string PublicKeyPem) CreateSignedEnvelope(ECDsa signingKey)
    {
        var payloadType = "https://in-toto.io/Statement/v1";
        var payloadContent = "{\"_type\":\"https://in-toto.io/Statement/v1\",\"subject\":[]}";
        var payloadBytes = Encoding.UTF8.GetBytes(payloadContent);
        var payloadBase64 = Convert.ToBase64String(payloadBytes);

        // Compute PAE
        var pae = DsseHelper.PreAuthenticationEncoding(payloadType, payloadBytes);

        // Sign
        var signatureBytes = signingKey.SignData(pae, HashAlgorithmName.SHA256);
        var signatureBase64 = Convert.ToBase64String(signatureBytes);

        // Build envelope
        var envelope = JsonSerializer.Serialize(new
        {
            payloadType,
            payload = payloadBase64,
            signatures = new[]
            {
                new { keyId = "test-key-001", sig = signatureBase64 }
            }
        });

        var publicKeyPem = ExportPublicKeyPem(signingKey);

        return (envelope, publicKeyPem);
    }

    private static string ExportPublicKeyPem(ECDsa key)
    {
        var publicKeyBytes = key.ExportSubjectPublicKeyInfo();
        var base64 = Convert.ToBase64String(publicKeyBytes);
        var builder = new StringBuilder();
        builder.AppendLine("-----BEGIN PUBLIC KEY-----");

        for (var i = 0; i < base64.Length; i += 64)
        {
            builder.AppendLine(base64.Substring(i, Math.Min(64, base64.Length - i)));
        }

        builder.AppendLine("-----END PUBLIC KEY-----");
        return builder.ToString();
    }
}
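
// Illustrative sketch (editor addition, not part of the commit): the tests
// above lean on DsseHelper.PreAuthenticationEncoding. Per the DSSE spec,
// PAE(type, body) = "DSSEv1" SP len(type) SP type SP len(body) SP body, with
// lengths written as ASCII decimals of the UTF-8 byte counts. A stand-alone
// version for reference; the repository's DsseHelper remains authoritative.
internal static class PaeSketch
{
    public static byte[] PreAuthenticationEncoding(string payloadType, byte[] payload)
    {
        var typeBytes = Encoding.UTF8.GetBytes(payloadType);
        var header = $"DSSEv1 {typeBytes.Length} {payloadType} {payload.Length} ";
        var headerBytes = Encoding.UTF8.GetBytes(header);

        var pae = new byte[headerBytes.Length + payload.Length];
        headerBytes.CopyTo(pae, 0);
        payload.CopyTo(pae, headerBytes.Length);
        return pae;
    }
}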
src/Attestor/StellaOps.Attestation/DsseVerifier.cs (new file, 301 lines)
@@ -0,0 +1,301 @@
// <copyright file="DsseVerifier.cs" company="Stella Operations">
// Copyright (c) Stella Operations. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Collections.Immutable;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;

namespace StellaOps.Attestation;

/// <summary>
/// Implementation of DSSE signature verification.
/// Uses the existing DsseHelper for PAE computation.
/// </summary>
public sealed class DsseVerifier : IDsseVerifier
{
    private readonly ILogger<DsseVerifier> _logger;

    /// <summary>
    /// JSON serializer options for parsing DSSE envelopes.
    /// </summary>
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
    };

    public DsseVerifier(ILogger<DsseVerifier> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public Task<DsseVerificationResult> VerifyAsync(
        string envelopeJson,
        string publicKeyPem,
        CancellationToken cancellationToken = default)
    {
        return VerifyAsync(envelopeJson, new[] { publicKeyPem }, cancellationToken);
    }

    /// <inheritdoc />
    public async Task<DsseVerificationResult> VerifyAsync(
        string envelopeJson,
        IEnumerable<string> trustedKeysPem,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(envelopeJson);
        ArgumentNullException.ThrowIfNull(trustedKeysPem);

        var trustedKeys = trustedKeysPem.ToList();
        if (trustedKeys.Count == 0)
        {
            return DsseVerificationResult.Failure(0, ImmutableArray.Create("no_trusted_keys_provided"));
        }

        return await VerifyWithAllKeysAsync(envelopeJson, trustedKeys, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public async Task<DsseVerificationResult> VerifyAsync(
        string envelopeJson,
        Func<string?, CancellationToken, Task<string?>> keyResolver,
        CancellationToken cancellationToken = default)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(envelopeJson);
        ArgumentNullException.ThrowIfNull(keyResolver);

        // Parse the envelope
        DsseEnvelopeDto? envelope;
        try
        {
            envelope = JsonSerializer.Deserialize<DsseEnvelopeDto>(envelopeJson, JsonOptions);
            if (envelope is null)
            {
                return DsseVerificationResult.ParseError("Failed to deserialize envelope");
            }
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex, "Failed to parse DSSE envelope JSON");
            return DsseVerificationResult.ParseError(ex.Message);
        }

        if (string.IsNullOrWhiteSpace(envelope.Payload))
        {
            return DsseVerificationResult.Failure(0, ImmutableArray.Create("envelope_missing_payload"));
        }

        if (envelope.Signatures is null || envelope.Signatures.Count == 0)
        {
            return DsseVerificationResult.Failure(0, ImmutableArray.Create("envelope_missing_signatures"));
        }

        // Decode payload
        byte[] payloadBytes;
        try
        {
            payloadBytes = Convert.FromBase64String(envelope.Payload);
        }
        catch (FormatException)
        {
            return DsseVerificationResult.Failure(envelope.Signatures.Count, ImmutableArray.Create("payload_invalid_base64"));
        }

        // Compute PAE for signature verification
        var payloadType = envelope.PayloadType ?? "https://in-toto.io/Statement/v1";
        var pae = DsseHelper.PreAuthenticationEncoding(payloadType, payloadBytes);

        // Verify each signature
        var verifiedKeyIds = new List<string>();
        var issues = new List<string>();

        foreach (var sig in envelope.Signatures)
        {
            if (string.IsNullOrWhiteSpace(sig.Sig))
            {
                issues.Add($"signature_{sig.KeyId ?? "unknown"}_empty");
                continue;
            }

            // Resolve the public key for this signature
            var publicKeyPem = await keyResolver(sig.KeyId, cancellationToken).ConfigureAwait(false);
            if (string.IsNullOrWhiteSpace(publicKeyPem))
            {
                issues.Add($"key_not_found_{sig.KeyId ?? "unknown"}");
                continue;
            }

            // Verify the signature
            try
            {
                var signatureBytes = Convert.FromBase64String(sig.Sig);
                if (VerifySignature(pae, signatureBytes, publicKeyPem))
                {
                    verifiedKeyIds.Add(sig.KeyId ?? "unknown");
                    _logger.LogDebug("DSSE signature verified for keyId: {KeyId}", sig.KeyId ?? "unknown");
                }
                else
                {
                    issues.Add($"signature_invalid_{sig.KeyId ?? "unknown"}");
                }
            }
            catch (FormatException)
            {
                issues.Add($"signature_invalid_base64_{sig.KeyId ?? "unknown"}");
            }
            catch (CryptographicException ex)
            {
                issues.Add($"signature_crypto_error_{sig.KeyId ?? "unknown"}: {ex.Message}");
            }
        }

        // Compute payload hash for result
        var payloadHash = $"sha256:{Convert.ToHexString(SHA256.HashData(payloadBytes)).ToLowerInvariant()}";

        if (verifiedKeyIds.Count > 0)
        {
            return DsseVerificationResult.Success(
                verifiedKeyIds.Count,
                envelope.Signatures.Count,
                verifiedKeyIds.ToImmutableArray(),
                payloadType,
                payloadHash);
        }

        return new DsseVerificationResult
        {
            IsValid = false,
            ValidSignatureCount = 0,
            TotalSignatureCount = envelope.Signatures.Count,
            VerifiedKeyIds = ImmutableArray<string>.Empty,
            PayloadType = payloadType,
            PayloadHash = payloadHash,
            Issues = issues.ToImmutableArray(),
        };
    }

    /// <summary>
    /// Verifies against all trusted keys, returning success if any key validates any signature.
    /// </summary>
    private async Task<DsseVerificationResult> VerifyWithAllKeysAsync(
        string envelopeJson,
        List<string> trustedKeys,
        CancellationToken cancellationToken)
    {
        // Parse envelope first to get signature keyIds
        DsseEnvelopeDto? envelope;
        try
        {
            envelope = JsonSerializer.Deserialize<DsseEnvelopeDto>(envelopeJson, JsonOptions);
            if (envelope is null)
            {
                return DsseVerificationResult.ParseError("Failed to deserialize envelope");
            }
        }
        catch (JsonException ex)
        {
            return DsseVerificationResult.ParseError(ex.Message);
        }

        if (envelope.Signatures is null || envelope.Signatures.Count == 0)
        {
            return DsseVerificationResult.Failure(0, ImmutableArray.Create("envelope_missing_signatures"));
        }

        // Try each trusted key; iterate by index rather than IndexOf so that
        // duplicate key entries are reported correctly and the scan stays linear.
        var allIssues = new List<string>();
        for (var keyIndex = 0; keyIndex < trustedKeys.Count; keyIndex++)
        {
            var key = trustedKeys[keyIndex];

            async Task<string?> SingleKeyResolver(string? keyId, CancellationToken ct)
            {
                await Task.CompletedTask.ConfigureAwait(false);
                return key;
            }

            var result = await VerifyAsync(envelopeJson, SingleKeyResolver, cancellationToken).ConfigureAwait(false);
            if (result.IsValid)
            {
                return result;
            }

            // Collect issues for debugging
            foreach (var issue in result.Issues)
            {
                allIssues.Add($"key{keyIndex}: {issue}");
            }
        }

        return DsseVerificationResult.Failure(envelope.Signatures.Count, allIssues.ToImmutableArray());
    }

    /// <summary>
    /// Verifies a signature against PAE using the provided public key.
    /// Supports ECDSA P-256 and RSA keys.
    /// </summary>
    private bool VerifySignature(byte[] pae, byte[] signature, string publicKeyPem)
    {
        // Try ECDSA first (most common for Sigstore/Fulcio)
        try
        {
            using var ecdsa = ECDsa.Create();
            ecdsa.ImportFromPem(publicKeyPem);
            return ecdsa.VerifyData(pae, signature, HashAlgorithmName.SHA256);
        }
        catch (CryptographicException)
        {
            // Not an ECDSA key, try RSA
        }

        // Try RSA
        try
        {
            using var rsa = RSA.Create();
            rsa.ImportFromPem(publicKeyPem);
            return rsa.VerifyData(pae, signature, HashAlgorithmName.SHA256, RSASignaturePadding.Pkcs1);
        }
        catch (CryptographicException)
        {
            // Not an RSA key either
        }

        // Ed25519 support via System.Security.Cryptography requires different
        // handling and is not yet implemented; log and fail closed for now.
        _logger.LogDebug("Ed25519 signature verification not yet implemented");
        return false;
    }

    /// <summary>
    /// DTO for deserializing DSSE envelope JSON.
    /// </summary>
    private sealed class DsseEnvelopeDto
    {
        public string? PayloadType { get; set; }
        public string? Payload { get; set; }
        public List<DsseSignatureDto>? Signatures { get; set; }
    }

    /// <summary>
    /// DTO for DSSE signature.
    /// </summary>
    private sealed class DsseSignatureDto
    {
        public string? KeyId { get; set; }
        public string? Sig { get; set; }
    }
}
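
// Illustrative sketch (editor addition, not part of the commit): resolver-based
// verification against a key store. Only the DsseVerifier surface above is
// real; the lookup delegate is whatever key store the caller supplies.
internal static class DsseVerifierUsageSketch
{
    public static async Task<bool> VerifyWithResolverAsync(
        DsseVerifier verifier,
        string envelopeJson,
        Func<string?, CancellationToken, Task<string?>> lookupPemByKeyId,
        CancellationToken ct)
    {
        var result = await verifier.VerifyAsync(envelopeJson, lookupPemByKeyId, ct).ConfigureAwait(false);
        if (!result.IsValid)
        {
            // Issues carry machine-readable tokens such as
            // "key_not_found_<id>" or "signature_invalid_<id>".
            Console.WriteLine(string.Join("; ", result.Issues));
        }

        return result.IsValid;
    }
}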
src/Attestor/StellaOps.Attestation/IDsseVerifier.cs (new file, 151 lines)
@@ -0,0 +1,151 @@
// <copyright file="IDsseVerifier.cs" company="Stella Operations">
// Copyright (c) Stella Operations. Licensed under AGPL-3.0-or-later.
// </copyright>

using System.Collections.Immutable;

namespace StellaOps.Attestation;

/// <summary>
/// Interface for verifying DSSE (Dead Simple Signing Envelope) signatures.
/// </summary>
public interface IDsseVerifier
{
    /// <summary>
    /// Verifies a DSSE envelope against a public key.
    /// </summary>
    /// <param name="envelopeJson">The serialized DSSE envelope JSON.</param>
    /// <param name="publicKeyPem">The PEM-encoded public key for verification.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Verification result containing status and details.</returns>
    Task<DsseVerificationResult> VerifyAsync(
        string envelopeJson,
        string publicKeyPem,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Verifies a DSSE envelope against multiple trusted public keys.
    /// Returns success if at least one signature is valid.
    /// </summary>
    /// <param name="envelopeJson">The serialized DSSE envelope JSON.</param>
    /// <param name="trustedKeysPem">Collection of PEM-encoded public keys.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Verification result containing status and details.</returns>
    Task<DsseVerificationResult> VerifyAsync(
        string envelopeJson,
        IEnumerable<string> trustedKeysPem,
        CancellationToken cancellationToken = default);

    /// <summary>
    /// Verifies a DSSE envelope using a key resolver function.
    /// </summary>
    /// <param name="envelopeJson">The serialized DSSE envelope JSON.</param>
    /// <param name="keyResolver">Function to resolve public key by key ID.</param>
    /// <param name="cancellationToken">Cancellation token.</param>
    /// <returns>Verification result containing status and details.</returns>
    Task<DsseVerificationResult> VerifyAsync(
        string envelopeJson,
        Func<string?, CancellationToken, Task<string?>> keyResolver,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Result of DSSE signature verification.
/// </summary>
public sealed record DsseVerificationResult
{
    /// <summary>
    /// Whether the verification succeeded (at least one valid signature).
    /// </summary>
    public required bool IsValid { get; init; }

    /// <summary>
    /// Number of signatures that passed verification.
    /// </summary>
    public required int ValidSignatureCount { get; init; }

    /// <summary>
    /// Total number of signatures in the envelope.
    /// </summary>
    public required int TotalSignatureCount { get; init; }

    /// <summary>
    /// Key IDs of signatures that passed verification.
    /// </summary>
    public required ImmutableArray<string> VerifiedKeyIds { get; init; }

    /// <summary>
    /// Key ID used for the primary verified signature (first one that passed).
    /// </summary>
    public string? PrimaryKeyId { get; init; }

    /// <summary>
    /// Payload type from the envelope.
    /// </summary>
    public string? PayloadType { get; init; }

    /// <summary>
    /// SHA-256 hash of the payload.
    /// </summary>
    public string? PayloadHash { get; init; }

    /// <summary>
    /// Issues encountered during verification.
    /// </summary>
    public required ImmutableArray<string> Issues { get; init; }

    /// <summary>
    /// Creates a successful verification result.
    /// </summary>
    public static DsseVerificationResult Success(
        int validCount,
        int totalCount,
        ImmutableArray<string> verifiedKeyIds,
        string? payloadType = null,
        string? payloadHash = null)
    {
        return new DsseVerificationResult
        {
            IsValid = true,
            ValidSignatureCount = validCount,
            TotalSignatureCount = totalCount,
            VerifiedKeyIds = verifiedKeyIds,
            PrimaryKeyId = verifiedKeyIds.Length > 0 ? verifiedKeyIds[0] : null,
            PayloadType = payloadType,
            PayloadHash = payloadHash,
            Issues = ImmutableArray<string>.Empty,
        };
    }

    /// <summary>
    /// Creates a failed verification result.
    /// </summary>
    public static DsseVerificationResult Failure(
        int totalCount,
        ImmutableArray<string> issues)
    {
        return new DsseVerificationResult
        {
            IsValid = false,
            ValidSignatureCount = 0,
            TotalSignatureCount = totalCount,
            VerifiedKeyIds = ImmutableArray<string>.Empty,
            Issues = issues,
        };
    }

    /// <summary>
    /// Creates a failure result for a parsing error.
    /// </summary>
    public static DsseVerificationResult ParseError(string message)
    {
        return new DsseVerificationResult
        {
            IsValid = false,
            ValidSignatureCount = 0,
            TotalSignatureCount = 0,
            VerifiedKeyIds = ImmutableArray<string>.Empty,
            Issues = ImmutableArray.Create($"envelope_parse_error: {message}"),
        };
    }
}
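
// Illustrative sketch (editor addition, not part of the commit): minimal use
// of the multi-key overload above; any one valid signature from the trusted
// set is sufficient, and PrimaryKeyId reports which key matched first.
internal static class TrustedKeySetSketch
{
    public static async Task<string?> FirstVerifiedKeyIdAsync(
        IDsseVerifier verifier,
        string envelopeJson,
        IReadOnlyCollection<string> trustedKeysPem,
        CancellationToken ct)
    {
        var result = await verifier.VerifyAsync(envelopeJson, trustedKeysPem, ct).ConfigureAwait(false);
        return result.IsValid ? result.PrimaryKeyId : null;
    }
}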
@@ -6,6 +6,10 @@
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="../StellaOps.Attestor.Envelope/StellaOps.Attestor.Envelope.csproj" />
  </ItemGroup>

@@ -25,6 +25,12 @@ using Xunit;
using StellaOps.TestKit;
namespace StellaOps.Attestor.Tests;

/// <summary>
/// Integration tests for time skew validation in attestation submission and verification.
/// </summary>
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Evidence)]
[Trait("BlastRadius", TestCategories.BlastRadius.Crypto)]
public sealed class TimeSkewValidationIntegrationTests
{
    private static readonly DateTimeOffset FixedNow = new(2025, 12, 18, 12, 0, 0, TimeSpan.Zero);

@@ -25,7 +25,11 @@ internal sealed class LdapIdentityProviderPlugin : IIdentityProviderPlugin
    private readonly LdapCapabilityProbe capabilityProbe;
    private readonly AuthorityIdentityProviderCapabilities manifestCapabilities;
    private readonly SemaphoreSlim capabilityGate = new(1, 1);
    private AuthorityIdentityProviderCapabilities capabilities;
    private AuthorityIdentityProviderCapabilities capabilities = new(
        SupportsPassword: false,
        SupportsMfa: false,
        SupportsClientProvisioning: false,
        SupportsBootstrap: false);
    private bool clientProvisioningActive;
    private bool bootstrapActive;
    private bool loggedProvisioningDegrade;

@@ -0,0 +1,256 @@
// <copyright file="AuthorityConfigDiffTests.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
// Sprint: SPRINT_20260105_002_005_TEST_cross_cutting
// Task: CCUT-021

using System.Collections.Immutable;
using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using StellaOps.TestKit;
using StellaOps.Testing.ConfigDiff;
using Xunit;

namespace StellaOps.Authority.ConfigDiff.Tests;

/// <summary>
/// Config-diff tests for the Authority module.
/// Verifies that configuration changes produce only the expected behavioral deltas.
/// </summary>
[Trait("Category", TestCategories.ConfigDiff)]
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Auth)]
public class AuthorityConfigDiffTests : ConfigDiffTestBase
{
    /// <summary>
    /// Initializes a new instance of the <see cref="AuthorityConfigDiffTests"/> class.
    /// </summary>
    public AuthorityConfigDiffTests()
        : base(
            new ConfigDiffTestConfig(StrictMode: true),
            NullLogger.Instance)
    {
    }

    /// <summary>
    /// Verifies that changing token lifetime only affects token behavior.
    /// </summary>
    [Fact]
    public async Task ChangingTokenLifetime_OnlyAffectsTokenBehavior()
    {
        // Arrange
        var baselineConfig = new AuthorityTestConfig
        {
            AccessTokenLifetimeMinutes = 15,
            RefreshTokenLifetimeHours = 24,
            MaxConcurrentSessions = 5
        };

        var changedConfig = baselineConfig with
        {
            AccessTokenLifetimeMinutes = 30
        };

        // Act
        var result = await TestConfigIsolationAsync(
            baselineConfig,
            changedConfig,
            changedSetting: "AccessTokenLifetimeMinutes",
            unrelatedBehaviors:
            [
                async config => await GetSessionBehaviorAsync(config),
                async config => await GetRefreshBehaviorAsync(config),
                async config => await GetAuthenticationBehaviorAsync(config)
            ]);

        // Assert
        result.IsSuccess.Should().BeTrue(
            because: "changing token lifetime should not affect sessions or authentication");
    }

    /// <summary>
    /// Verifies that changing max sessions produces the expected behavioral delta.
    /// </summary>
    [Fact]
    public async Task ChangingMaxSessions_ProducesExpectedDelta()
    {
        // Arrange
        var baselineConfig = new AuthorityTestConfig { MaxConcurrentSessions = 3 };
        var changedConfig = new AuthorityTestConfig { MaxConcurrentSessions = 10 };

        var expectedDelta = new ConfigDelta(
            ChangedBehaviors: ["SessionLimit", "ConcurrencyPolicy"],
            BehaviorDeltas:
            [
                new BehaviorDelta("SessionLimit", "3", "10", null),
                new BehaviorDelta("ConcurrencyPolicy", "restrictive", "permissive",
                    "More sessions allowed")
            ]);

        // Act
        var result = await TestConfigBehavioralDeltaAsync(
            baselineConfig,
            changedConfig,
            getBehavior: async config => await CaptureSessionBehaviorAsync(config),
            computeDelta: ComputeBehaviorSnapshotDelta,
            expectedDelta: expectedDelta);

        // Assert
        result.IsSuccess.Should().BeTrue(
            because: "session limit change should produce expected behavioral delta");
    }

    /// <summary>
    /// Verifies that enabling DPoP only affects token binding.
    /// </summary>
    [Fact]
    public async Task EnablingDPoP_OnlyAffectsTokenBinding()
    {
        // Arrange
        var baselineConfig = new AuthorityTestConfig { EnableDPoP = false };
        var changedConfig = new AuthorityTestConfig { EnableDPoP = true };

        // Act
        var result = await TestConfigIsolationAsync(
            baselineConfig,
            changedConfig,
            changedSetting: "EnableDPoP",
            unrelatedBehaviors:
            [
                async config => await GetSessionBehaviorAsync(config),
                async config => await GetPasswordPolicyBehaviorAsync(config)
            ]);

        // Assert
        result.IsSuccess.Should().BeTrue(
            because: "DPoP should not affect sessions or password policy");
    }

    /// <summary>
    /// Verifies that changing the password minimum length produces the expected behavioral delta.
    /// </summary>
    [Fact]
    public async Task ChangingPasswordMinLength_ProducesExpectedDelta()
    {
        // Arrange
        var baselineConfig = new AuthorityTestConfig { MinPasswordLength = 8 };
        var changedConfig = new AuthorityTestConfig { MinPasswordLength = 12 };

        var expectedDelta = new ConfigDelta(
            ChangedBehaviors: ["PasswordComplexity", "ValidationRejectionRate"],
            BehaviorDeltas:
            [
                new BehaviorDelta("PasswordComplexity", "standard", "enhanced", null),
                new BehaviorDelta("ValidationRejectionRate", "standard", "increase",
                    "Stricter requirements reject more passwords")
            ]);

        // Act
        var result = await TestConfigBehavioralDeltaAsync(
            baselineConfig,
            changedConfig,
            getBehavior: async config => await CapturePasswordPolicyBehaviorAsync(config),
            computeDelta: ComputeBehaviorSnapshotDelta,
            expectedDelta: expectedDelta);

        // Assert
        result.IsSuccess.Should().BeTrue();
    }

    /// <summary>
    /// Verifies that enabling MFA only affects the authentication flow.
    /// </summary>
    [Fact]
    public async Task EnablingMFA_OnlyAffectsAuthentication()
    {
        // Arrange
        var baselineConfig = new AuthorityTestConfig { RequireMFA = false };
        var changedConfig = new AuthorityTestConfig { RequireMFA = true };

        // Act
        var result = await TestConfigIsolationAsync(
            baselineConfig,
            changedConfig,
            changedSetting: "RequireMFA",
            unrelatedBehaviors:
            [
                async config => await GetTokenBehaviorAsync(config),
                async config => await GetSessionBehaviorAsync(config)
            ]);

        // Assert
        result.IsSuccess.Should().BeTrue(
            because: "MFA should not affect token issuance or session management");
    }

    // Helper methods

    private static Task<object> GetSessionBehaviorAsync(AuthorityTestConfig config)
    {
        return Task.FromResult<object>(new { MaxSessions = config.MaxConcurrentSessions });
    }

    private static Task<object> GetRefreshBehaviorAsync(AuthorityTestConfig config)
    {
        return Task.FromResult<object>(new { RefreshLifetime = config.RefreshTokenLifetimeHours });
    }

    private static Task<object> GetAuthenticationBehaviorAsync(AuthorityTestConfig config)
    {
        return Task.FromResult<object>(new { MfaRequired = config.RequireMFA });
    }

    private static Task<object> GetPasswordPolicyBehaviorAsync(AuthorityTestConfig config)
    {
        return Task.FromResult<object>(new { MinLength = config.MinPasswordLength });
    }

    private static Task<object> GetTokenBehaviorAsync(AuthorityTestConfig config)
    {
        return Task.FromResult<object>(new { Lifetime = config.AccessTokenLifetimeMinutes });
    }

    private static Task<BehaviorSnapshot> CaptureSessionBehaviorAsync(AuthorityTestConfig config)
    {
        var snapshot = new BehaviorSnapshot(
            ConfigurationId: $"sessions-{config.MaxConcurrentSessions}",
            Behaviors:
            [
                new CapturedBehavior("SessionLimit", config.MaxConcurrentSessions.ToString(), DateTimeOffset.UtcNow),
                new CapturedBehavior("ConcurrencyPolicy",
                    config.MaxConcurrentSessions > 5 ? "permissive" : "restrictive", DateTimeOffset.UtcNow)
            ],
            CapturedAt: DateTimeOffset.UtcNow);

        return Task.FromResult(snapshot);
    }

    private static Task<BehaviorSnapshot> CapturePasswordPolicyBehaviorAsync(AuthorityTestConfig config)
    {
        var snapshot = new BehaviorSnapshot(
            ConfigurationId: $"password-{config.MinPasswordLength}",
            Behaviors:
            [
                new CapturedBehavior("PasswordComplexity",
                    config.MinPasswordLength >= 12 ? "enhanced" : "standard", DateTimeOffset.UtcNow),
                new CapturedBehavior("ValidationRejectionRate",
                    config.MinPasswordLength >= 12 ? "increase" : "standard", DateTimeOffset.UtcNow)
            ],
            CapturedAt: DateTimeOffset.UtcNow);

        return Task.FromResult(snapshot);
    }
}

/// <summary>
/// Test configuration for the Authority module.
/// </summary>
public sealed record AuthorityTestConfig
{
    public int AccessTokenLifetimeMinutes { get; init; } = 15;
    public int RefreshTokenLifetimeHours { get; init; } = 24;
    public int MaxConcurrentSessions { get; init; } = 5;
    public bool EnableDPoP { get; init; } = false;
    public int MinPasswordLength { get; init; } = 8;
    public bool RequireMFA { get; init; } = false;
}
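The delta assertions above rely on diffing two captured behavior snapshots. A minimal sketch of that comparison, with plain dictionaries standing in for BehaviorSnapshot (the real ComputeBehaviorSnapshotDelta comes from the test base and is not shown here):

    // Hypothetical sketch: diff two name->value behavior maps into changed-behavior names.
    static IReadOnlyList<string> DiffBehaviors(
        IReadOnlyDictionary<string, string> baseline,
        IReadOnlyDictionary<string, string> changed)
    {
        var changedNames = new List<string>();
        foreach (var (name, before) in baseline)
        {
            // A behavior counts as changed when its captured value differs between runs.
            if (changed.TryGetValue(name, out var after) &&
                !string.Equals(before, after, StringComparison.Ordinal))
            {
                changedNames.Add(name);
            }
        }

        return changedNames;
    }

For the MaxConcurrentSessions test this would yield ["SessionLimit", "ConcurrencyPolicy"], matching the expected delta.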
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <LangVersion>preview</LangVersion>
    <Description>Config-diff tests for Authority module</Description>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="FluentAssertions" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="../../__Libraries/StellaOps.Authority.Core/StellaOps.Authority.Core.csproj" />
    <ProjectReference Include="../../../__Libraries/StellaOps.TestKit/StellaOps.TestKit.csproj" />
    <ProjectReference Include="../../../__Tests/__Libraries/StellaOps.Testing.ConfigDiff/StellaOps.Testing.ConfigDiff.csproj" />
  </ItemGroup>
</Project>
@@ -15,5 +15,7 @@
  </ItemGroup>
  <ItemGroup>
    <ProjectReference Include="../../__Libraries/StellaOps.Authority.Core/StellaOps.Authority.Core.csproj" />
    <ProjectReference Include="../../../__Tests/__Libraries/StellaOps.Testing.Temporal/StellaOps.Testing.Temporal.csproj" />
    <ProjectReference Include="../../../__Libraries/StellaOps.TestKit/StellaOps.TestKit.csproj" />
  </ItemGroup>
</Project>
@@ -0,0 +1,296 @@
// <copyright file="TemporalVerdictTests.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
// Sprint: SPRINT_20260105_002_001_TEST_time_skew_idempotency
// Task: TSKW-011

using FluentAssertions;
using StellaOps.Authority.Core.Verdicts;
using StellaOps.Testing.Temporal;
using StellaOps.TestKit;
using Xunit;

namespace StellaOps.Authority.Core.Tests.Verdicts;

/// <summary>
/// Temporal testing for verdict manifests using the Testing.Temporal library.
/// Tests clock cutoff handling, timestamp consistency, and determinism under time skew.
/// </summary>
[Trait("Category", TestCategories.Unit)]
public sealed class TemporalVerdictTests
{
    private static readonly DateTimeOffset BaseTime = new(2026, 1, 5, 12, 0, 0, TimeSpan.Zero);

    [Fact]
    public void VerdictManifest_ClockCutoff_BoundaryPrecision()
    {
        // Arrange
        var ttlProvider = new TtlBoundaryTimeProvider(BaseTime);
        var ttl = TimeSpan.FromHours(24); // Typical verdict validity window
        var clockCutoff = BaseTime;

        // Position at various boundaries
        var testCases = TtlBoundaryTimeProvider.GenerateBoundaryTestCases(clockCutoff, ttl).ToList();

        // Assert - verify all boundary cases are correctly handled
        foreach (var testCase in testCases)
        {
            var isExpired = testCase.Time >= clockCutoff.Add(ttl);
            isExpired.Should().Be(
                testCase.ShouldBeExpired,
                $"Verdict clock cutoff case '{testCase.Name}' should be expired={testCase.ShouldBeExpired}");
        }
    }

    [Fact]
    public void VerdictManifestBuilder_IsDeterministic_UnderTimeAdvancement()
    {
        // Arrange
        var timeProvider = new SimulatedTimeProvider(BaseTime);
        var results = new List<string>();

        // Act - build multiple manifests while advancing time
        for (int i = 0; i < 10; i++)
        {
            var manifest = BuildTestManifest(BaseTime); // Use fixed clock, not advancing
            results.Add(manifest.ManifestDigest);
            timeProvider.Advance(TimeSpan.FromMinutes(5)); // Advance between builds
        }

        // Assert - all manifests should have same digest (deterministic)
        results.Distinct().Should().HaveCount(1, "manifests built with same inputs should be deterministic");
    }

    [Fact]
    public void VerdictManifestBuilder_Build_IsIdempotent()
    {
        // Arrange
        var stateSnapshotter = () => BuildTestManifest(BaseTime).ManifestDigest;
        var verifier = new IdempotencyVerifier<string>(stateSnapshotter);

        // Act - verify Build is idempotent
        var result = verifier.Verify(() => { /* Build is called in snapshotter */ }, repetitions: 5);

        // Assert
        result.IsIdempotent.Should().BeTrue("VerdictManifestBuilder.Build should be idempotent");
        result.AllSucceeded.Should().BeTrue();
    }

    [Fact]
    public void VerdictManifest_TimestampOrdering_IsMonotonic()
    {
        // Arrange - simulate verdict timestamps
        var timeProvider = new SimulatedTimeProvider(BaseTime);
        var timestamps = new List<DateTimeOffset>();

        // Simulate verdict lifecycle: created, processed, signed, stored
        timestamps.Add(timeProvider.GetUtcNow()); // Created
        timeProvider.Advance(TimeSpan.FromMilliseconds(50));
        timestamps.Add(timeProvider.GetUtcNow()); // Processed
        timeProvider.Advance(TimeSpan.FromMilliseconds(100));
        timestamps.Add(timeProvider.GetUtcNow()); // Signed
        timeProvider.Advance(TimeSpan.FromMilliseconds(20));
        timestamps.Add(timeProvider.GetUtcNow()); // Stored

        // Act & Assert - timestamps should be monotonically increasing
        ClockSkewAssertions.AssertMonotonicTimestamps(timestamps);
    }

    [Fact]
    public void VerdictManifest_HandlesClockSkewForward()
    {
        // Arrange
        var timeProvider = new SimulatedTimeProvider(BaseTime);
        var clockCutoff1 = timeProvider.GetUtcNow();

        // Simulate clock jump forward (NTP correction)
        timeProvider.JumpTo(BaseTime.AddHours(2));
        var clockCutoff2 = timeProvider.GetUtcNow();

        // Act - build manifests with different clock cutoffs
        var manifest1 = BuildTestManifest(clockCutoff1);
        var manifest2 = BuildTestManifest(clockCutoff2);

        // Assert - different clock cutoffs should produce different digests
        manifest1.ManifestDigest.Should().NotBe(manifest2.ManifestDigest,
            "different clock cutoffs should produce different manifest digests");

        // Clock cutoff difference should be within expected range
        ClockSkewAssertions.AssertTimestampsWithinTolerance(
            clockCutoff1,
            clockCutoff2,
            tolerance: TimeSpan.FromHours(3));
    }

    [Fact]
    public void VerdictManifest_ClockDrift_DoesNotAffectDeterminism()
    {
        // Arrange
        var timeProvider = new SimulatedTimeProvider(BaseTime);
        timeProvider.SetDrift(TimeSpan.FromMilliseconds(10)); // 10ms/second drift

        var results = new List<string>();
        var fixedClock = BaseTime; // Use fixed clock for manifest

        // Act - build manifests while time drifts
        for (int i = 0; i < 10; i++)
        {
            var manifest = BuildTestManifest(fixedClock);
            results.Add(manifest.ManifestDigest);
            timeProvider.Advance(TimeSpan.FromSeconds(10)); // Time advances with drift
        }

        // Assert - all should be identical (fixed clock input)
        results.Distinct().Should().HaveCount(1,
            "manifests with fixed clock should be deterministic regardless of system drift");
    }

    [Fact]
    public void VerdictManifest_ClockJumpBackward_IsDetected()
    {
        // Arrange
        var timeProvider = new SimulatedTimeProvider(BaseTime);
        var timestamps = new List<DateTimeOffset>();

        // Record timestamps
        timestamps.Add(timeProvider.GetUtcNow());
        timeProvider.Advance(TimeSpan.FromMinutes(5));
        timestamps.Add(timeProvider.GetUtcNow());

        // Simulate clock jump backward
        timeProvider.JumpBackward(TimeSpan.FromMinutes(3));
        timestamps.Add(timeProvider.GetUtcNow());

        // Assert - backward jump should be detected
        timeProvider.HasJumpedBackward().Should().BeTrue();

        // Non-monotonic timestamps should be detected
        var act = () => ClockSkewAssertions.AssertMonotonicTimestamps(timestamps);
        act.Should().Throw<ClockSkewAssertionException>();
    }

    [Theory]
    [InlineData(0.9, VexStatus.NotAffected)]
    [InlineData(0.7, VexStatus.Affected)]
    [InlineData(0.5, VexStatus.UnderInvestigation)]
    public void VerdictManifest_ConfidenceScores_AreIdempotent(double confidence, VexStatus status)
    {
        // Arrange
        var stateSnapshotter = () =>
        {
            var manifest = BuildTestManifest(BaseTime, confidence, status);
            return manifest.Result.Confidence;
        };
        var verifier = new IdempotencyVerifier<double>(stateSnapshotter);

        // Act
        var result = verifier.Verify(() => { }, repetitions: 3);

        // Assert
        result.IsIdempotent.Should().BeTrue();
        result.States.Should().AllSatisfy(c => c.Should().Be(confidence));
    }

    [Fact]
    public void VerdictManifest_ExpiryWindow_BoundaryTests()
    {
        // Arrange - simulate verdict expiry window (e.g., 7 days)
        var expiryWindow = TimeSpan.FromDays(7);
        var createdAt = BaseTime;

        // Generate boundary test cases
        var testCases = TtlBoundaryTimeProvider.GenerateBoundaryTestCases(createdAt, expiryWindow);

        // Assert
        foreach (var testCase in testCases)
        {
            var isExpired = testCase.Time >= createdAt.Add(expiryWindow);
            isExpired.Should().Be(testCase.ShouldBeExpired, testCase.Name);
        }
    }

    [Theory]
    [MemberData(nameof(GetVerdictExpiryBoundaryData))]
    public void VerdictManifest_TheoryBoundaryTests(
        string name,
        DateTimeOffset testTime,
        bool shouldBeExpired)
    {
        // Arrange
        var expiryWindow = TimeSpan.FromDays(7);
        var expiry = BaseTime.Add(expiryWindow);

        // Act
        var isExpired = testTime >= expiry;

        // Assert
        isExpired.Should().Be(shouldBeExpired, $"Case '{name}' should be expired={shouldBeExpired}");
    }

    public static IEnumerable<object[]> GetVerdictExpiryBoundaryData()
    {
        var expiryWindow = TimeSpan.FromDays(7);
        return TtlBoundaryTimeProvider.GenerateTheoryData(BaseTime, expiryWindow);
    }

    [Fact]
    public void VerdictManifest_LeapSecondScenario_MaintainsDeterminism()
    {
        // Arrange
        var leapDay = new DateOnly(2016, 12, 31);
        var leapProvider = new LeapSecondTimeProvider(
            new DateTimeOffset(2016, 12, 31, 23, 0, 0, TimeSpan.Zero),
            leapDay);

        var results = new List<string>();
        var fixedClock = new DateTimeOffset(2016, 12, 31, 12, 0, 0, TimeSpan.Zero);

        // Act - build manifests while advancing through leap second
        foreach (var moment in leapProvider.AdvanceThroughLeapSecond(leapDay))
        {
            var manifest = BuildTestManifest(fixedClock);
            results.Add(manifest.ManifestDigest);
        }

        // Assert - all manifests should be identical (fixed clock)
        results.Distinct().Should().HaveCount(1,
            "manifests should be deterministic even during leap second transition");
    }

    private static VerdictManifest BuildTestManifest(
        DateTimeOffset clockCutoff,
        double confidence = 0.85,
        VexStatus status = VexStatus.NotAffected)
    {
        return new VerdictManifestBuilder(() => "test-manifest-id")
            .WithTenant("tenant-1")
            .WithAsset("sha256:abc123", "CVE-2024-1234")
            .WithInputs(
                sbomDigests: new[] { "sha256:sbom1" },
                vulnFeedSnapshotIds: new[] { "feed-snapshot-1" },
                vexDocumentDigests: new[] { "sha256:vex1" },
                clockCutoff: clockCutoff)
            .WithResult(
                status: status,
                confidence: confidence,
                explanations: new[]
                {
                    new VerdictExplanation
                    {
                        SourceId = "vendor-a",
                        Reason = "Test explanation",
                        ProvenanceScore = 0.9,
                        CoverageScore = 0.8,
                        ReplayabilityScore = 0.7,
                        StrengthMultiplier = 1.0,
                        FreshnessMultiplier = 0.95,
                        ClaimScore = confidence,
                        AssertedStatus = status,
                        Accepted = true,
                    },
                })
            .WithPolicy("sha256:policy123", "1.0.0")
            .WithClock(clockCutoff)
            .Build();
    }
}
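The expiry checks above use ">=", so the instant createdAt + window itself counts as expired. A minimal sketch of the kind of boundary cases GenerateBoundaryTestCases is presumed to emit (case names and the exact set are assumptions):

    var createdAt = new DateTimeOffset(2026, 1, 5, 12, 0, 0, TimeSpan.Zero);
    var expiryWindow = TimeSpan.FromDays(7);
    var expiry = createdAt.Add(expiryWindow);

    var cases = new (string Name, DateTimeOffset Time, bool ShouldBeExpired)[]
    {
        ("one-tick-before-expiry", expiry.AddTicks(-1), false),
        ("exactly-at-expiry", expiry, true), // '>=' makes the boundary instant itself expired
        ("one-tick-after-expiry", expiry.AddTicks(1), true),
    };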
@@ -253,6 +253,24 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.FixIn
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.WebService.Tests", "__Tests\StellaOps.BinaryIndex.WebService.Tests\StellaOps.BinaryIndex.WebService.Tests.csproj", "{C12D06F8-7B69-4A24-B206-C47326778F2E}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Semantic", "__Libraries\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj", "{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Disassembly.Abstractions", "__Libraries\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj", "{3112D5DD-E993-4737-955B-D8FE20CEC88A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Semantic.Tests", "__Tests\StellaOps.BinaryIndex.Semantic.Tests\StellaOps.BinaryIndex.Semantic.Tests.csproj", "{89CCD547-09D4-4923-9644-17724AF60F1C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.TestKit", "..\__Libraries\StellaOps.TestKit\StellaOps.TestKit.csproj", "{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Ensemble", "__Libraries\StellaOps.BinaryIndex.Ensemble\StellaOps.BinaryIndex.Ensemble.csproj", "{7612CE73-B27A-4489-A89E-E22FF19981B7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Decompiler", "__Libraries\StellaOps.BinaryIndex.Decompiler\StellaOps.BinaryIndex.Decompiler.csproj", "{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Ghidra", "__Libraries\StellaOps.BinaryIndex.Ghidra\StellaOps.BinaryIndex.Ghidra.csproj", "{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.ML", "__Libraries\StellaOps.BinaryIndex.ML\StellaOps.BinaryIndex.ML.csproj", "{850F7C46-E98B-431A-B202-FF97FB041BAD}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Ensemble.Tests", "__Tests\StellaOps.BinaryIndex.Ensemble.Tests\StellaOps.BinaryIndex.Ensemble.Tests.csproj", "{87356481-048B-4D3F-B4D5-3B6494A1F038}"
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|Any CPU = Debug|Any CPU
@@ -1151,6 +1169,114 @@ Global
        {C12D06F8-7B69-4A24-B206-C47326778F2E}.Release|x64.Build.0 = Release|Any CPU
        {C12D06F8-7B69-4A24-B206-C47326778F2E}.Release|x86.ActiveCfg = Release|Any CPU
        {C12D06F8-7B69-4A24-B206-C47326778F2E}.Release|x86.Build.0 = Release|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x64.ActiveCfg = Debug|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x64.Build.0 = Debug|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x86.ActiveCfg = Debug|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x86.Build.0 = Debug|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|Any CPU.Build.0 = Release|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x64.ActiveCfg = Release|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x64.Build.0 = Release|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x86.ActiveCfg = Release|Any CPU
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x86.Build.0 = Release|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x64.ActiveCfg = Debug|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x64.Build.0 = Debug|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x86.ActiveCfg = Debug|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x86.Build.0 = Debug|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|Any CPU.Build.0 = Release|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x64.ActiveCfg = Release|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x64.Build.0 = Release|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x86.ActiveCfg = Release|Any CPU
        {3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x86.Build.0 = Release|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x64.ActiveCfg = Debug|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x64.Build.0 = Debug|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x86.ActiveCfg = Debug|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x86.Build.0 = Debug|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Release|Any CPU.Build.0 = Release|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x64.ActiveCfg = Release|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x64.Build.0 = Release|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x86.ActiveCfg = Release|Any CPU
        {89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x86.Build.0 = Release|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x64.ActiveCfg = Debug|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x64.Build.0 = Debug|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x86.ActiveCfg = Debug|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x86.Build.0 = Debug|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|Any CPU.Build.0 = Release|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x64.ActiveCfg = Release|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x64.Build.0 = Release|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x86.ActiveCfg = Release|Any CPU
        {C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x86.Build.0 = Release|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x64.ActiveCfg = Debug|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x64.Build.0 = Debug|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x86.ActiveCfg = Debug|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x86.Build.0 = Debug|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|Any CPU.Build.0 = Release|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x64.ActiveCfg = Release|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x64.Build.0 = Release|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x86.ActiveCfg = Release|Any CPU
        {7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x86.Build.0 = Release|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x64.ActiveCfg = Debug|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x64.Build.0 = Debug|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x86.ActiveCfg = Debug|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x86.Build.0 = Debug|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|Any CPU.Build.0 = Release|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x64.ActiveCfg = Release|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x64.Build.0 = Release|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x86.ActiveCfg = Release|Any CPU
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x86.Build.0 = Release|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x64.ActiveCfg = Debug|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x64.Build.0 = Debug|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x86.ActiveCfg = Debug|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x86.Build.0 = Debug|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|Any CPU.Build.0 = Release|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x64.ActiveCfg = Release|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x64.Build.0 = Release|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x86.ActiveCfg = Release|Any CPU
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x86.Build.0 = Release|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x64.ActiveCfg = Debug|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x64.Build.0 = Debug|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x86.ActiveCfg = Debug|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x86.Build.0 = Debug|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|Any CPU.Build.0 = Release|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x64.ActiveCfg = Release|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x64.Build.0 = Release|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x86.ActiveCfg = Release|Any CPU
        {850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x86.Build.0 = Release|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x64.ActiveCfg = Debug|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x64.Build.0 = Debug|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x86.ActiveCfg = Debug|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x86.Build.0 = Debug|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|Any CPU.Build.0 = Release|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x64.ActiveCfg = Release|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x64.Build.0 = Release|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x86.ActiveCfg = Release|Any CPU
        {87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x86.Build.0 = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
@@ -1246,6 +1372,14 @@ Global
        {FB127279-C17B-40DC-AC68-320B7CE85E76} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
        {AAE98543-46B4-4707-AD1F-CCC9142F8712} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
        {C12D06F8-7B69-4A24-B206-C47326778F2E} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
        {1C21DB5D-C8FF-4EF2-9847-7049515A0FE7} = {A5C98087-E847-D2C4-2143-20869479839D}
        {3112D5DD-E993-4737-955B-D8FE20CEC88A} = {A5C98087-E847-D2C4-2143-20869479839D}
        {89CCD547-09D4-4923-9644-17724AF60F1C} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
        {7612CE73-B27A-4489-A89E-E22FF19981B7} = {A5C98087-E847-D2C4-2143-20869479839D}
        {66EEF897-8006-4C53-B2AB-C55D82BDE6D7} = {A5C98087-E847-D2C4-2143-20869479839D}
        {C5C87F73-6EEF-4296-A1DD-24563E4F05B4} = {A5C98087-E847-D2C4-2143-20869479839D}
        {850F7C46-E98B-431A-B202-FF97FB041BAD} = {A5C98087-E847-D2C4-2143-20869479839D}
        {87356481-048B-4D3F-B4D5-3B6494A1F038} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {21B6BF22-3A64-CD15-49B3-21A490AAD068}
@@ -1,3 +1,5 @@
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.Builders;

/// <summary>
@@ -109,6 +111,12 @@ public sealed record FunctionFingerprint
    /// Source line number if debug info is available.
    /// </summary>
    public int? SourceLine { get; init; }

    /// <summary>
    /// Semantic fingerprint for enhanced similarity comparison.
    /// Uses IR-level analysis for resilience to compiler optimizations.
    /// </summary>
    public Semantic.SemanticFingerprint? SemanticFingerprint { get; init; }
}

/// <summary>
@@ -192,25 +192,42 @@ public sealed record HashWeights
    /// <summary>
    /// Weight for basic block hash comparison.
    /// </summary>
    public decimal BasicBlockWeight { get; init; } = 0.5m;
    public decimal BasicBlockWeight { get; init; } = 0.4m;

    /// <summary>
    /// Weight for CFG hash comparison.
    /// </summary>
    public decimal CfgWeight { get; init; } = 0.3m;
    public decimal CfgWeight { get; init; } = 0.25m;

    /// <summary>
    /// Weight for string refs hash comparison.
    /// </summary>
    public decimal StringRefsWeight { get; init; } = 0.2m;
    public decimal StringRefsWeight { get; init; } = 0.15m;

    /// <summary>
    /// Weight for semantic fingerprint comparison.
    /// Only used when both fingerprints have semantic data.
    /// </summary>
    public decimal SemanticWeight { get; init; } = 0.2m;

    /// <summary>
    /// Default weights.
    /// </summary>
    public static HashWeights Default => new();

    /// <summary>
    /// Weights without semantic analysis (traditional mode).
    /// </summary>
    public static HashWeights Traditional => new()
    {
        BasicBlockWeight = 0.5m,
        CfgWeight = 0.3m,
        StringRefsWeight = 0.2m,
        SemanticWeight = 0.0m
    };

    /// <summary>
    /// Validates that weights sum to 1.0.
    /// </summary>
    public bool IsValid => Math.Abs(BasicBlockWeight + CfgWeight + StringRefsWeight - 1.0m) < 0.001m;
    public bool IsValid => Math.Abs(BasicBlockWeight + CfgWeight + StringRefsWeight + SemanticWeight - 1.0m) < 0.001m;
}
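Both presets keep the IsValid invariant: Default sums 0.4 + 0.25 + 0.15 + 0.2 = 1.0, and Traditional sums 0.5 + 0.3 + 0.2 + 0.0 = 1.0. A small sketch of guarding a caller on that invariant (assuming the record as defined above):

    var weights = HashWeights.Traditional;
    if (!weights.IsValid)
    {
        // Reject misconfigured weights before any similarity scoring runs.
        throw new InvalidOperationException("Hash weights must sum to 1.0.");
    }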
@@ -1,4 +1,5 @@
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.Builders;

@@ -202,6 +203,16 @@ public sealed class PatchDiffEngine : IPatchDiffEngine
            matchedWeight += weights.StringRefsWeight;
        }

        // Include semantic fingerprint similarity if available
        if (weights.SemanticWeight > 0 &&
            a.SemanticFingerprint is not null &&
            b.SemanticFingerprint is not null)
        {
            totalWeight += weights.SemanticWeight;
            var semanticSimilarity = ComputeSemanticSimilarity(a.SemanticFingerprint, b.SemanticFingerprint);
            matchedWeight += weights.SemanticWeight * semanticSimilarity;
        }

        // Size similarity bonus (if sizes are within 10%, add small bonus)
        if (a.Size > 0 && b.Size > 0)
        {
@@ -216,6 +227,86 @@ public sealed class PatchDiffEngine : IPatchDiffEngine
        return totalWeight > 0 ? matchedWeight / totalWeight : 0m;
    }

    private static decimal ComputeSemanticSimilarity(
        Semantic.SemanticFingerprint a,
        Semantic.SemanticFingerprint b)
    {
        // Check for exact hash match first
        if (a.HashEquals(b))
        {
            return 1.0m;
        }

        // Compute weighted similarity from components
        decimal graphSim = ComputeHashSimilarity(a.GraphHash, b.GraphHash);
        decimal opSim = ComputeHashSimilarity(a.OperationHash, b.OperationHash);
        decimal dfSim = ComputeHashSimilarity(a.DataFlowHash, b.DataFlowHash);
        decimal apiSim = ComputeApiCallSimilarity(a.ApiCalls, b.ApiCalls);

        // Weights: graph structure 40%, operation sequence 25%, data flow 20%, API calls 15%
        return (graphSim * 0.40m) + (opSim * 0.25m) + (dfSim * 0.20m) + (apiSim * 0.15m);
    }
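    // A worked example of the blend above (illustrative values, not corpus data):
    // graphSim = 1.0, opSim = 0.5, dfSim = 0.0, apiSim = 1.0 gives
    // (1.0 * 0.40m) + (0.5 * 0.25m) + (0.0 * 0.20m) + (1.0 * 0.15m) = 0.675m.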

    private static decimal ComputeHashSimilarity(byte[] hashA, byte[] hashB)
    {
        if (hashA.Length == 0 || hashB.Length == 0)
        {
            return 0m;
        }

        if (hashA.AsSpan().SequenceEqual(hashB))
        {
            return 1.0m;
        }

        // Count matching bits (Hamming similarity); unmatched tail bytes count as mismatches.
        int matchingBits = 0;
        int totalBits = Math.Max(hashA.Length, hashB.Length) * 8;
        int len = Math.Min(hashA.Length, hashB.Length);

        for (int i = 0; i < len; i++)
        {
            byte xor = (byte)(hashA[i] ^ hashB[i]);
            matchingBits += 8 - PopCount(xor);
        }

        return (decimal)matchingBits / totalBits;
    }

    private static int PopCount(byte value)
    {
        int count = 0;
        while (value != 0)
        {
            count += value & 1;
            value >>= 1;
        }
        return count;
    }

    private static decimal ComputeApiCallSimilarity(
        System.Collections.Immutable.ImmutableArray<string> apiCallsA,
        System.Collections.Immutable.ImmutableArray<string> apiCallsB)
    {
        if (apiCallsA.IsEmpty && apiCallsB.IsEmpty)
        {
            return 1.0m;
        }

        if (apiCallsA.IsEmpty || apiCallsB.IsEmpty)
        {
            return 0.0m;
        }

        var setA = new HashSet<string>(apiCallsA, StringComparer.Ordinal);
        var setB = new HashSet<string>(apiCallsB, StringComparer.Ordinal);

        var intersection = setA.Intersect(setB).Count();
        var union = setA.Union(setB).Count();

        return union > 0 ? (decimal)intersection / union : 0m;
    }
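    // The API-call comparison above is a plain Jaccard index. For example
    // (illustrative sets): {memcpy, strlen, malloc} vs {memcpy, malloc, free}
    // share 2 calls out of a 4-call union, so similarity = 2 / 4 = 0.5.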

    /// <inheritdoc />
    public IReadOnlyDictionary<string, string> FindFunctionMappings(
        IReadOnlyList<FunctionFingerprint> vulnerable,

@@ -20,5 +20,6 @@
  <ItemGroup>
    <ProjectReference Include="../StellaOps.BinaryIndex.Core/StellaOps.BinaryIndex.Core.csproj" />
    <ProjectReference Include="../StellaOps.BinaryIndex.Fingerprints/StellaOps.BinaryIndex.Fingerprints.csproj" />
    <ProjectReference Include="../StellaOps.BinaryIndex.Semantic/StellaOps.BinaryIndex.Semantic.csproj" />
  </ItemGroup>
</Project>

@@ -510,6 +510,27 @@ public sealed class CachedBinaryVulnerabilityService : IBinaryVulnerabilityServi
        }
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
        FunctionFingerprintSet fingerprints,
        CorpusLookupOptions? options = null,
        CancellationToken ct = default)
    {
        // Delegate to inner service - corpus lookups typically don't benefit from caching
        // due to high variance in fingerprint sets
        return await _inner.IdentifyFunctionFromCorpusAsync(fingerprints, options, ct).ConfigureAwait(false);
    }

    /// <inheritdoc />
    public async Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
        IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
        CorpusLookupOptions? options = null,
        CancellationToken ct = default)
    {
        // Delegate to inner service - batch corpus lookups typically don't benefit from caching
        return await _inner.IdentifyFunctionsFromCorpusBatchAsync(functions, options, ct).ConfigureAwait(false);
    }

    public async ValueTask DisposeAsync()
    {
        _connectionLock.Dispose();
@@ -99,6 +99,27 @@ public interface IBinaryVulnerabilityService
        string symbolName,
        DeltaSigLookupOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Identify a function by its fingerprints using the corpus database.
    /// Returns matching library functions with CVE associations.
    /// </summary>
    /// <param name="fingerprints">Function fingerprints (semantic, instruction, API call).</param>
    /// <param name="options">Corpus lookup options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Identified functions with vulnerability associations.</returns>
    Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
        FunctionFingerprintSet fingerprints,
        CorpusLookupOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Batch-identify functions from the corpus for scan performance.
    /// </summary>
    Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
        IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
        CorpusLookupOptions? options = null,
        CancellationToken ct = default);
}
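A hedged sketch of how a scanner might call the new corpus lookup; `service` is an IBinaryVulnerabilityService and the fingerprint values are placeholders, not real corpus data:

    var fingerprints = new FunctionFingerprintSet
    {
        Architecture = "x86_64",
        FunctionName = "ssl_read_internal", // may be null for stripped binaries
        FunctionSize = 412,
    };

    var matches = await service.IdentifyFunctionFromCorpusAsync(
        fingerprints,
        new CorpusLookupOptions { MinSimilarity = 0.9m, LibraryFilter = "openssl" },
        ct);

    foreach (var match in matches)
    {
        // Each match carries library/version, confidence, and optional CVE associations.
        Console.WriteLine($"{match.LibraryName} {match.VersionRange}: {match.Confidence:P0}");
    }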

/// <summary>
@@ -225,3 +246,141 @@ public sealed record FixStatusResult
    /// <summary>Reference to the underlying evidence record.</summary>
    public Guid? EvidenceId { get; init; }
}

/// <summary>
/// Function fingerprint set for corpus matching.
/// </summary>
public sealed record FunctionFingerprintSet
{
    /// <summary>Semantic fingerprint (IR-based).</summary>
    public byte[]? SemanticFingerprint { get; init; }

    /// <summary>Instruction fingerprint (normalized assembly).</summary>
    public byte[]? InstructionFingerprint { get; init; }

    /// <summary>API call sequence fingerprint.</summary>
    public byte[]? ApiCallFingerprint { get; init; }

    /// <summary>Function name if available (may be stripped).</summary>
    public string? FunctionName { get; init; }

    /// <summary>Architecture of the binary.</summary>
    public required string Architecture { get; init; }

    /// <summary>Function size in bytes.</summary>
    public int? FunctionSize { get; init; }
}

/// <summary>
/// Options for corpus-based function identification.
/// </summary>
public sealed record CorpusLookupOptions
{
    /// <summary>Minimum similarity threshold (0.0-1.0). Default 0.85.</summary>
    public decimal MinSimilarity { get; init; } = 0.85m;

    /// <summary>Maximum candidates to return. Default 5.</summary>
    public int MaxCandidates { get; init; } = 5;

    /// <summary>Library name filter (glibc, openssl, etc.). Null means all.</summary>
    public string? LibraryFilter { get; init; }

    /// <summary>Whether to include CVE associations. Default true.</summary>
    public bool IncludeCveAssociations { get; init; } = true;

    /// <summary>Whether to check fix status for matched CVEs. Default true.</summary>
    public bool CheckFixStatus { get; init; } = true;

    /// <summary>Distro hint for fix status lookup.</summary>
    public string? DistroHint { get; init; }

    /// <summary>Release hint for fix status lookup.</summary>
    public string? ReleaseHint { get; init; }

    /// <summary>Prefer semantic fingerprint matching over instruction matching. Default true.</summary>
    public bool PreferSemanticMatch { get; init; } = true;
}

/// <summary>
/// Result of corpus-based function identification.
/// </summary>
public sealed record CorpusFunctionMatch
{
    /// <summary>Matched library name (glibc, openssl, etc.).</summary>
    public required string LibraryName { get; init; }

    /// <summary>Library version range where this function appears.</summary>
    public required string VersionRange { get; init; }

    /// <summary>Canonical function name.</summary>
    public required string FunctionName { get; init; }

    /// <summary>Overall match confidence (0.0-1.0).</summary>
    public required decimal Confidence { get; init; }

    /// <summary>Match method used (semantic, instruction, combined).</summary>
    public required CorpusMatchMethod Method { get; init; }

    /// <summary>Semantic similarity score if available.</summary>
    public decimal? SemanticSimilarity { get; init; }

    /// <summary>Instruction similarity score if available.</summary>
    public decimal? InstructionSimilarity { get; init; }

    /// <summary>CVEs affecting this function (if requested).</summary>
    public ImmutableArray<CorpusCveAssociation> CveAssociations { get; init; } = [];
}

/// <summary>
/// Method used for corpus matching.
/// </summary>
public enum CorpusMatchMethod
{
    /// <summary>Matched via semantic fingerprint (IR-based).</summary>
    Semantic,

    /// <summary>Matched via instruction fingerprint.</summary>
    Instruction,

    /// <summary>Matched via API call sequence.</summary>
    ApiCall,

    /// <summary>Combined match using multiple fingerprints.</summary>
    Combined
}

/// <summary>
/// CVE association from the corpus for a matched function.
/// </summary>
public sealed record CorpusCveAssociation
{
    /// <summary>CVE identifier.</summary>
    public required string CveId { get; init; }

    /// <summary>Affected state for the matched version.</summary>
    public required CorpusAffectedState AffectedState { get; init; }

    /// <summary>Version where the fix was applied (if fixed).</summary>
    public string? FixedInVersion { get; init; }

    /// <summary>Confidence in the CVE association.</summary>
    public required decimal Confidence { get; init; }

    /// <summary>Evidence type for the association.</summary>
    public string? EvidenceType { get; init; }
}

/// <summary>
/// Affected state for corpus CVE associations.
/// </summary>
public enum CorpusAffectedState
{
    /// <summary>Function is vulnerable to the CVE.</summary>
    Vulnerable,

    /// <summary>Function has been fixed.</summary>
    Fixed,

    /// <summary>Function is not affected by the CVE.</summary>
    NotAffected
}
@@ -0,0 +1,447 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Connectors;

/// <summary>
/// Corpus connector for the libcurl/curl library.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class CurlCorpusConnector : ILibraryCorpusConnector
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ILogger<CurlCorpusConnector> _logger;

    /// <summary>
    /// Base URL for curl official releases.
    /// </summary>
    public const string CurlReleasesUrl = "https://curl.se/download/";

    /// <summary>
    /// Supported architectures.
    /// </summary>
    private static readonly ImmutableArray<string> s_supportedArchitectures =
        ["x86_64", "aarch64", "armhf", "i386"];

    public CurlCorpusConnector(
        IHttpClientFactory httpClientFactory,
        ILogger<CurlCorpusConnector> logger)
    {
        _httpClientFactory = httpClientFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public string LibraryName => "curl";

    /// <inheritdoc />
    public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;

    /// <inheritdoc />
    public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient("Curl");
        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Fetch releases from curl.se
        try
        {
            _logger.LogDebug("Fetching curl versions from {Url}", CurlReleasesUrl);
            var html = await client.GetStringAsync(CurlReleasesUrl, ct);
            var currentVersions = ParseVersionsFromListing(html);
            foreach (var v in currentVersions)
            {
                versions.Add(v);
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch current curl releases");
        }

        // Also check the archive
        const string archiveUrl = "https://curl.se/download/archeology/";
        try
        {
            _logger.LogDebug("Fetching old curl versions from {Url}", archiveUrl);
            var archiveHtml = await client.GetStringAsync(archiveUrl, ct);
            var archiveVersions = ParseVersionsFromListing(archiveHtml);
            foreach (var v in archiveVersions)
            {
                versions.Add(v);
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch curl archive releases");
        }

        _logger.LogInformation("Found {Count} curl versions", versions.Count);
        return [.. versions.OrderByDescending(ParseVersion)];
    }

    /// <inheritdoc />
    public async Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default)
    {
        var normalizedArch = NormalizeArchitecture(architecture);

        _logger.LogInformation(
            "Fetching curl {Version} for {Architecture}",
            version,
            normalizedArch);

        // Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
        var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
        if (debBinary is not null)
        {
            _logger.LogDebug("Found curl {Version} from Debian packages", version);
            return debBinary;
        }

        // Strategy 2: Try Alpine APK
        var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
        if (alpineBinary is not null)
        {
            _logger.LogDebug("Found curl {Version} from Alpine packages", version);
            return alpineBinary;
        }

        _logger.LogWarning(
            "Could not find pre-built curl {Version} for {Architecture}. Source build not implemented.",
            version,
            normalizedArch);

        return null;
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            var binary = await FetchBinaryAsync(version, architecture, options, ct);
            if (binary is not null)
            {
                yield return binary;
            }
        }
    }
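    // Consuming the stream above (illustrative usage, not part of the commit):
    // versions are yielded lazily, so a caller can stop early without fetching the rest.
    //   await foreach (var binary in connector.FetchBinariesAsync(versions, "x86_64", ct: ct))
    //   {
    //       Ingest(binary); // hypothetical ingestion step
    //   }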
    #region Private Methods

    private ImmutableArray<string> ParseVersionsFromListing(string html)
    {
        // Match patterns like curl-8.5.0.tar.gz or curl-7.88.1.tar.xz
        var matches = CurlVersionRegex().Matches(html);

        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["version"].Success)
            {
                versions.Add(match.Groups["version"].Value);
            }
        }

        return [.. versions];
    }
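    // CurlVersionRegex() is a partial method, presumably declared with [GeneratedRegex]
    // in the truncated remainder of this file. A plausible shape, matching the comment
    // above (an assumption, not the committed pattern):
    //   [GeneratedRegex(@"curl-(?<version>\d+\.\d+(?:\.\d+)?)\.tar\.(?:gz|xz|bz2)")]
    //   private static partial Regex CurlVersionRegex();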
    private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("DebianPackages");

        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            return null;
        }

        // curl library package names:
        // libcurl4 (current), libcurl3 (older)
        var packageNames = new[] { "libcurl4", "libcurl3" };

        foreach (var packageName in packageNames)
        {
            var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);

            foreach (var url in packageUrls)
            {
                try
                {
                    _logger.LogDebug("Trying Debian curl package URL: {Url}", url);
                    var packageBytes = await client.GetByteArrayAsync(url, ct);

                    var binary = await ExtractLibCurlFromDebAsync(packageBytes, version, architecture, options, ct);
                    if (binary is not null)
                    {
                        return binary;
                    }
                }
                catch (HttpRequestException ex)
                {
                    _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
                }
            }
        }

        return null;
    }

    private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("AlpinePackages");

        var alpineArch = MapToAlpineArchitecture(architecture);
        if (alpineArch is null)
        {
            return null;
        }

        // Query the Alpine package repository for libcurl
        var packageUrls = await FindAlpinePackageUrlsAsync(client, "libcurl", version, alpineArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Alpine curl package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                var binary = await ExtractLibCurlFromApkAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";

        try
        {
            var response = await client.GetStringAsync(apiUrl, ct);
            var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
            return [];
        }
    }

    private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string alpineArch,
        CancellationToken ct)
    {
        var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
        var urls = new List<string>();

        foreach (var release in releases)
        {
            var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";

            try
            {
                var html = await client.GetStringAsync(baseUrl, ct);

                var matches = AlpinePackageRegex().Matches(html);
                foreach (Match match in matches)
                {
                    if (match.Groups["name"].Value == packageName &&
                        match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
                    {
                        urls.Add($"{baseUrl}{match.Groups["file"].Value}");
                    }
                }
            }
            catch (HttpRequestException)
            {
                // Skip releases we can't access
            }
        }

        return [.. urls];
    }

    private async Task<LibraryBinary?> ExtractLibCurlFromDebAsync(
        byte[] debPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .deb extraction - placeholder
        await Task.CompletedTask;

        _logger.LogDebug(
            "Debian package extraction not fully implemented. Package size: {Size} bytes",
            debPackage.Length);

        return null;
    }

    private async Task<LibraryBinary?> ExtractLibCurlFromApkAsync(
        byte[] apkPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .apk extraction - placeholder
        await Task.CompletedTask;

        _logger.LogDebug(
            "Alpine package extraction not fully implemented. Package size: {Size} bytes",
            apkPackage.Length);

        return null;
    }

    private static ImmutableArray<string> ExtractPackageUrlsForVersion(
        string json,
        string version,
        string debianArch)
    {
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("result", out var results))
            {
                foreach (var item in results.EnumerateArray())
                {
                    if (item.TryGetProperty("binary_version", out var binaryVersion) &&
                        item.TryGetProperty("architecture", out var arch))
                    {
                        var binVer = binaryVersion.GetString() ?? string.Empty;
                        var archStr = arch.GetString() ?? string.Empty;

                        if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
                            archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
                        {
                            if (item.TryGetProperty("files", out var files))
                            {
                                foreach (var file in files.EnumerateArray())
                                {
                                    if (file.TryGetProperty("hash", out var hashElement))
                                    {
                                        var hash = hashElement.GetString();
                                        if (!string.IsNullOrEmpty(hash))
                                        {
                                            urls.Add($"https://snapshot.debian.org/file/{hash}");
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON
        }

        return [.. urls];
    }

    private static string NormalizeArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" or "amd64" => "x86_64",
            "aarch64" or "arm64" => "aarch64",
            "armhf" or "armv7" or "arm" => "armhf",
            "i386" or "i686" or "x86" => "i386",
            _ => architecture
        };
    }

    private static string? MapToDebianArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "amd64",
            "aarch64" => "arm64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "i386",
            _ => null
        };
    }

    private static string? MapToAlpineArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "x86_64",
            "aarch64" => "aarch64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "x86",
            _ => null
        };
    }

    private static Version? ParseVersion(string versionString)
    {
        if (Version.TryParse(versionString, out var version))
        {
            return version;
        }
        return null;
    }

    #endregion
||||
|
||||
#region Generated Regexes
|
||||
|
||||
[GeneratedRegex(@"curl-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex CurlVersionRegex();
|
||||
|
||||
[GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+(?:-r\d+)?)\.apk)""", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex AlpinePackageRegex();
|
||||
|
||||
#endregion
|
||||
}
|
||||
@@ -0,0 +1,549 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Connectors;

/// <summary>
/// Corpus connector for GNU C Library (glibc).
/// Fetches pre-built binaries from Debian/Ubuntu package repositories
/// or GNU FTP mirrors for source builds.
/// </summary>
public sealed partial class GlibcCorpusConnector : ILibraryCorpusConnector
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ILogger<GlibcCorpusConnector> _logger;

    /// <summary>
    /// Base URL for GNU FTP mirror (source tarballs).
    /// </summary>
    public const string GnuMirrorUrl = "https://ftp.gnu.org/gnu/glibc/";

    /// <summary>
    /// Base URL for Debian package archive.
    /// </summary>
    public const string DebianSnapshotUrl = "https://snapshot.debian.org/package/glibc/";

    /// <summary>
    /// Supported architectures for glibc.
    /// </summary>
    private static readonly ImmutableArray<string> s_supportedArchitectures =
        ["x86_64", "aarch64", "armhf", "i386", "arm64", "ppc64el", "s390x"];

    public GlibcCorpusConnector(
        IHttpClientFactory httpClientFactory,
        ILogger<GlibcCorpusConnector> logger)
    {
        _httpClientFactory = httpClientFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public string LibraryName => "glibc";

    /// <inheritdoc />
    public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;

    /// <inheritdoc />
    public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient("GnuMirror");

        try
        {
            _logger.LogDebug("Fetching glibc versions from {Url}", GnuMirrorUrl);
            var html = await client.GetStringAsync(GnuMirrorUrl, ct);

            // Parse directory listing for glibc-X.Y.tar.xz files
            var versions = ParseVersionsFromListing(html);

            _logger.LogInformation("Found {Count} glibc versions from GNU mirror", versions.Length);
            return versions;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch glibc versions from GNU mirror, trying Debian snapshot");

            // Fallback to Debian snapshot
            return await GetVersionsFromDebianSnapshotAsync(client, ct);
        }
    }

    /// <inheritdoc />
    public async Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default)
    {
        var normalizedArch = NormalizeArchitecture(architecture);
        var abi = options?.PreferredAbi ?? "gnu";

        _logger.LogInformation(
            "Fetching glibc {Version} for {Architecture}",
            version,
            normalizedArch);

        // Strategy 1: Try Debian package (pre-built, preferred)
        var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
        if (debBinary is not null)
        {
            _logger.LogDebug("Found glibc {Version} from Debian packages", version);
            return debBinary;
        }

        // Strategy 2: Try Ubuntu package
        var ubuntuBinary = await TryFetchUbuntuPackageAsync(version, normalizedArch, options, ct);
        if (ubuntuBinary is not null)
        {
            _logger.LogDebug("Found glibc {Version} from Ubuntu packages", version);
            return ubuntuBinary;
        }

        _logger.LogWarning(
            "Could not find pre-built glibc {Version} for {Architecture}. Source build not implemented.",
            version,
            normalizedArch);

        return null;
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            var binary = await FetchBinaryAsync(version, architecture, options, ct);
            if (binary is not null)
            {
                yield return binary;
            }
        }
    }

    #region Private Methods

    private ImmutableArray<string> ParseVersionsFromListing(string html)
    {
        // Match patterns like glibc-2.31.tar.gz or glibc-2.38.tar.xz
        var matches = GlibcVersionRegex().Matches(html);

        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["version"].Success)
            {
                versions.Add(match.Groups["version"].Value);
            }
        }

        return [.. versions.OrderByDescending(ParseVersion)];
    }

    private async Task<ImmutableArray<string>> GetVersionsFromDebianSnapshotAsync(
        HttpClient client,
        CancellationToken ct)
    {
        try
        {
            var html = await client.GetStringAsync(DebianSnapshotUrl, ct);

            // Parse Debian snapshot listing for glibc versions
            var matches = DebianVersionRegex().Matches(html);

            var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

            foreach (Match match in matches)
            {
                if (match.Groups["version"].Success)
                {
                    // Extract just the upstream version (before the Debian revision)
                    var fullVersion = match.Groups["version"].Value;
                    var upstreamVersion = ExtractUpstreamVersion(fullVersion);
                    if (!string.IsNullOrEmpty(upstreamVersion))
                    {
                        versions.Add(upstreamVersion);
                    }
                }
            }

            return [.. versions.OrderByDescending(ParseVersion)];
        }
        catch (HttpRequestException ex)
        {
            _logger.LogError(ex, "Failed to fetch versions from Debian snapshot");
            return [];
        }
    }

    private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("DebianPackages");

        // Map architecture to Debian naming
        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            _logger.LogDebug("Architecture {Arch} not supported for Debian packages", architecture);
            return null;
        }

        // Query Debian snapshot for matching package
        var packageUrls = await FindDebianPackageUrlsAsync(client, version, debArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Debian package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract the libc6 shared library from the .deb package
                var binary = await ExtractLibcFromDebAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<LibraryBinary?> TryFetchUbuntuPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("UbuntuPackages");

        // Map architecture to Ubuntu naming (same as Debian)
        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            return null;
        }

        // Query Launchpad for matching package
        var packageUrls = await FindUbuntuPackageUrlsAsync(client, version, debArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Ubuntu package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract the libc6 shared library from the .deb package
                var binary = await ExtractLibcFromDebAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Ubuntu package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
        HttpClient client,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        // Construct Debian snapshot API URL
        // Format: https://snapshot.debian.org/mr/package/glibc/<version>/binfiles/libc6/<arch>
        var apiUrl = $"https://snapshot.debian.org/mr/package/glibc/{version}/binfiles/libc6/{debianArch}";

        try
        {
            var response = await client.GetStringAsync(apiUrl, ct);

            // Parse JSON response to get file hashes and construct download URLs
            // Simplified: extract URLs from response
            var urls = ExtractPackageUrlsFromSnapshotResponse(response);
            return urls;
        }
        catch (HttpRequestException)
        {
            // Try alternative: direct binary package search
            return await FindDebianPackageUrlsViaSearchAsync(client, version, debianArch, ct);
        }
    }

    private async Task<ImmutableArray<string>> FindDebianPackageUrlsViaSearchAsync(
        HttpClient client,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        // Fallback: search packages.debian.org
        var searchUrl = $"https://packages.debian.org/search?keywords=libc6&searchon=names&suite=all&section=all&arch={debianArch}";

        try
        {
            var html = await client.GetStringAsync(searchUrl, ct);

            // Parse search results to find matching version
            var urls = ParseDebianSearchResults(html, version, debianArch);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Debian package search failed");
            return [];
        }
    }

    private async Task<ImmutableArray<string>> FindUbuntuPackageUrlsAsync(
        HttpClient client,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        // Query Launchpad for libc6 package
        // Format: https://launchpad.net/ubuntu/+archive/primary/+files/libc6_<version>_<arch>.deb
        var launchpadApiUrl = $"https://api.launchpad.net/1.0/ubuntu/+archive/primary?ws.op=getPublishedBinaries&binary_name=libc6&version={version}&distro_arch_series=https://api.launchpad.net/1.0/ubuntu/+distroarchseries/{debianArch}";

        try
        {
            var response = await client.GetStringAsync(launchpadApiUrl, ct);
            var urls = ExtractPackageUrlsFromLaunchpadResponse(response);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Launchpad API query failed");
            return [];
        }
    }

    private async Task<LibraryBinary?> ExtractLibcFromDebAsync(
        byte[] debPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .deb files are ar archives containing:
        // - debian-binary (version string)
        // - control.tar.xz (package metadata)
        // - data.tar.xz (actual files)
        //
        // We need to extract /lib/x86_64-linux-gnu/libc.so.6 from data.tar.xz

        try
        {
            // Use SharpCompress or similar to extract (placeholder for now)
            // In production, implement proper ar + tar.xz extraction

            await Task.CompletedTask; // Placeholder for async extraction

            // For now, return null - full extraction requires SharpCompress/libarchive
            _logger.LogDebug(
                "Debian package extraction not fully implemented. Package size: {Size} bytes",
                debPackage.Length);

            return null;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to extract libc from .deb package");
            return null;
        }
    }

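    // A minimal extraction sketch for the placeholder above, not the shipped
    // implementation: the .deb ar container is walked with a hand-rolled header
    // parser, data.tar.xz is decompressed with SharpCompress's XZStream (the
    // assumed third-party dependency the comments above already suggest), and
    // the tar is scanned with System.Formats.Tar. The helper name and the
    // entry-suffix contract are illustrative assumptions.
    private static byte[]? ExtractFileFromDeb(byte[] debPackage, string entrySuffix)
    {
        using var stream = new MemoryStream(debPackage);
        Span<byte> magic = stackalloc byte[8];
        if (stream.Read(magic) != 8 || !magic.SequenceEqual("!<arch>\n"u8))
        {
            return null; // Not an ar archive.
        }

        var header = new byte[60];
        while (stream.Read(header, 0, 60) == 60)
        {
            // ar member header: 16-byte name, mtime/uid/gid/mode fields, then a 10-byte decimal size.
            var name = System.Text.Encoding.ASCII.GetString(header, 0, 16).TrimEnd();
            var size = long.Parse(System.Text.Encoding.ASCII.GetString(header, 48, 10).Trim());

            if (name.StartsWith("data.tar.xz", StringComparison.Ordinal))
            {
                using var xz = new SharpCompress.Compressors.Xz.XZStream(
                    new MemoryStream(debPackage, (int)stream.Position, (int)size));
                using var tar = new System.Formats.Tar.TarReader(xz);
                while (tar.GetNextEntry() is { } entry)
                {
                    if (entry.EntryType == System.Formats.Tar.TarEntryType.RegularFile &&
                        entry.Name.EndsWith(entrySuffix, StringComparison.Ordinal) &&
                        entry.DataStream is not null)
                    {
                        using var buffer = new MemoryStream();
                        entry.DataStream.CopyTo(buffer);
                        return buffer.ToArray();
                    }
                }

                return null;
            }

            // Skip this member's data; ar pads each member to an even offset.
            stream.Seek(size + (size % 2), SeekOrigin.Current);
        }

        return null;
    }
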
    private static string NormalizeArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" or "amd64" => "x86_64",
            "aarch64" or "arm64" => "aarch64",
            "armhf" or "armv7" or "arm" => "armhf",
            "i386" or "i686" or "x86" => "i386",
            "ppc64le" or "ppc64el" => "ppc64el",
            "s390x" => "s390x",
            _ => architecture
        };
    }

    private static string? MapToDebianArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "amd64",
            "aarch64" => "arm64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "i386",
            "ppc64el" => "ppc64el",
            "s390x" => "s390x",
            _ => null
        };
    }

    private static string? ExtractUpstreamVersion(string debianVersion)
    {
        // Debian version format: [epoch:]upstream_version[-debian_revision]
        // Examples:
        //   2.31-13+deb11u5 -> 2.31
        //   1:2.35-0ubuntu3 -> 2.35
        var match = UpstreamVersionRegex().Match(debianVersion);
        return match.Success ? match.Groups["upstream"].Value : null;
    }

    private static ImmutableArray<string> ExtractPackageUrlsFromSnapshotResponse(string json)
    {
        // Parse JSON response from snapshot.debian.org
        // Format: {"result": [{"hash": "...", "name": "libc6_2.31-13_amd64.deb"}]}
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("result", out var results))
            {
                foreach (var item in results.EnumerateArray())
                {
                    if (item.TryGetProperty("hash", out var hashElement))
                    {
                        var hash = hashElement.GetString();
                        if (!string.IsNullOrEmpty(hash))
                        {
                            // Construct download URL from hash
                            var url = $"https://snapshot.debian.org/file/{hash}";
                            urls.Add(url);
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON, return empty
        }

        return [.. urls];
    }

    private static ImmutableArray<string> ExtractPackageUrlsFromLaunchpadResponse(string json)
    {
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("entries", out var entries))
            {
                foreach (var entry in entries.EnumerateArray())
                {
                    if (entry.TryGetProperty("binary_package_version", out var versionElement) &&
                        entry.TryGetProperty("self_link", out var selfLink))
                    {
                        var link = selfLink.GetString();
                        if (!string.IsNullOrEmpty(link))
                        {
                            // Launchpad provides download URL in separate field
                            urls.Add(link);
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON
        }

        return [.. urls];
    }

    private static ImmutableArray<string> ParseDebianSearchResults(
        string html,
        string version,
        string debianArch)
    {
        // Parse HTML search results to find package URLs
        // This is a simplified implementation
        var urls = new List<string>();

        var matches = DebianPackageUrlRegex().Matches(html);
        foreach (Match match in matches)
        {
            if (match.Groups["url"].Success)
            {
                var url = match.Groups["url"].Value;
                if (url.Contains(version) && url.Contains(debianArch))
                {
                    urls.Add(url);
                }
            }
        }

        return [.. urls];
    }

    private static Version? ParseVersion(string versionString)
    {
        // Try to parse as Version, handling various formats
        //   2.31 -> 2.31.0.0
        //   2.31.1 -> 2.31.1.0

        if (Version.TryParse(versionString, out var version))
        {
            return version;
        }

        // Try adding .0 suffix
        if (Version.TryParse(versionString + ".0", out version))
        {
            return version;
        }

        return null;
    }

    #endregion

    #region Generated Regexes

    [GeneratedRegex(@"glibc-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
    private static partial Regex GlibcVersionRegex();

    [GeneratedRegex(@"(?<version>\d+\.\d+(?:\.\d+)?(?:-\d+)?)", RegexOptions.IgnoreCase)]
    private static partial Regex DebianVersionRegex();

    [GeneratedRegex(@"(?:^|\:)?(?<upstream>\d+\.\d+(?:\.\d+)?)(?:-|$)", RegexOptions.IgnoreCase)]
    private static partial Regex UpstreamVersionRegex();

    [GeneratedRegex(@"href=""(?<url>https?://[^""]+\.deb)""", RegexOptions.IgnoreCase)]
    private static partial Regex DebianPackageUrlRegex();

    #endregion
}
@@ -0,0 +1,554 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Connectors;

/// <summary>
/// Corpus connector for OpenSSL libraries.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class OpenSslCorpusConnector : ILibraryCorpusConnector
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ILogger<OpenSslCorpusConnector> _logger;

    /// <summary>
    /// Base URL for OpenSSL official releases.
    /// </summary>
    public const string OpenSslReleasesUrl = "https://www.openssl.org/source/";

    /// <summary>
    /// Base URL for OpenSSL old releases.
    /// </summary>
    public const string OpenSslOldReleasesUrl = "https://www.openssl.org/source/old/";

    /// <summary>
    /// Supported architectures.
    /// </summary>
    private static readonly ImmutableArray<string> s_supportedArchitectures =
        ["x86_64", "aarch64", "armhf", "i386"];

    public OpenSslCorpusConnector(
        IHttpClientFactory httpClientFactory,
        ILogger<OpenSslCorpusConnector> logger)
    {
        _httpClientFactory = httpClientFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public string LibraryName => "openssl";

    /// <inheritdoc />
    public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;

    /// <inheritdoc />
    public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient("OpenSsl");
        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Fetch current releases
        try
        {
            _logger.LogDebug("Fetching OpenSSL versions from {Url}", OpenSslReleasesUrl);
            var html = await client.GetStringAsync(OpenSslReleasesUrl, ct);
            var currentVersions = ParseVersionsFromListing(html);
            foreach (var v in currentVersions)
            {
                versions.Add(v);
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch current OpenSSL releases");
        }

        // Fetch old releases index
        try
        {
            _logger.LogDebug("Fetching old OpenSSL versions from {Url}", OpenSslOldReleasesUrl);
            var oldHtml = await client.GetStringAsync(OpenSslOldReleasesUrl, ct);
            var oldVersionDirs = ParseOldVersionDirectories(oldHtml);

            foreach (var dir in oldVersionDirs)
            {
                var dirUrl = $"{OpenSslOldReleasesUrl}{dir}/";
                try
                {
                    var dirHtml = await client.GetStringAsync(dirUrl, ct);
                    var dirVersions = ParseVersionsFromListing(dirHtml);
                    foreach (var v in dirVersions)
                    {
                        versions.Add(v);
                    }
                }
                catch (HttpRequestException)
                {
                    // Skip directories we can't access
                }
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch old OpenSSL releases");
        }

        _logger.LogInformation("Found {Count} OpenSSL versions", versions.Count);
        return [.. versions.OrderByDescending(ParseVersion)];
    }

    /// <inheritdoc />
    public async Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default)
    {
        var normalizedArch = NormalizeArchitecture(architecture);

        _logger.LogInformation(
            "Fetching OpenSSL {Version} for {Architecture}",
            version,
            normalizedArch);

        // Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
        var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
        if (debBinary is not null)
        {
            _logger.LogDebug("Found OpenSSL {Version} from Debian packages", version);
            return debBinary;
        }

        // Strategy 2: Try Alpine APK
        var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
        if (alpineBinary is not null)
        {
            _logger.LogDebug("Found OpenSSL {Version} from Alpine packages", version);
            return alpineBinary;
        }

        _logger.LogWarning(
            "Could not find pre-built OpenSSL {Version} for {Architecture}. Source build not implemented.",
            version,
            normalizedArch);

        return null;
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            var binary = await FetchBinaryAsync(version, architecture, options, ct);
            if (binary is not null)
            {
                yield return binary;
            }
        }
    }

    #region Private Methods

    private ImmutableArray<string> ParseVersionsFromListing(string html)
    {
        // Match patterns like openssl-1.1.1n.tar.gz or openssl-3.0.8.tar.gz
        var matches = OpenSslVersionRegex().Matches(html);

        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["version"].Success)
            {
                var version = match.Groups["version"].Value;
                // Keep letter-suffixed 1.x versions (e.g., 1.1.1n) as-is
                versions.Add(version);
            }
        }

        return [.. versions];
    }

    private ImmutableArray<string> ParseOldVersionDirectories(string html)
    {
        // Match directory names like 1.0.2/, 1.1.0/, 1.1.1/, 3.0/
        var matches = VersionDirRegex().Matches(html);

        var dirs = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["dir"].Success)
            {
                dirs.Add(match.Groups["dir"].Value);
            }
        }

        return [.. dirs];
    }

    private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("DebianPackages");

        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            return null;
        }

        // Determine package name based on version
        // OpenSSL 1.x -> libssl1.1
        // OpenSSL 3.x -> libssl3
        var packageName = GetDebianPackageName(version);

        // Query Debian snapshot for matching package
        var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Debian OpenSSL package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libssl.so.X from the .deb package
                var binary = await ExtractLibSslFromDebAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("AlpinePackages");

        var alpineArch = MapToAlpineArchitecture(architecture);
        if (alpineArch is null)
        {
            return null;
        }

        // Query Alpine package repository
        var packageUrls = await FindAlpinePackageUrlsAsync(client, "libssl3", version, alpineArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Alpine OpenSSL package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libssl.so.X from the .apk package
                var binary = await ExtractLibSslFromApkAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        // Map OpenSSL version to Debian source package version
        // e.g., 1.1.1n -> libssl1.1_1.1.1n-0+deb11u4
        var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";

        try
        {
            var response = await client.GetStringAsync(apiUrl, ct);

            // Parse JSON response to find matching versions
            var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
            return [];
        }
    }

    private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string alpineArch,
        CancellationToken ct)
    {
        // Alpine uses different repository structure
        // https://dl-cdn.alpinelinux.org/alpine/v3.18/main/x86_64/libssl3-3.1.1-r1.apk
        var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
        var urls = new List<string>();

        foreach (var release in releases)
        {
            var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";

            try
            {
                var html = await client.GetStringAsync(baseUrl, ct);

                // Find package URLs matching version
                var matches = AlpinePackageRegex().Matches(html);
                foreach (Match match in matches)
                {
                    if (match.Groups["name"].Value == packageName &&
                        match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
                    {
                        urls.Add($"{baseUrl}{match.Groups["file"].Value}");
                    }
                }
            }
            catch (HttpRequestException)
            {
                // Skip releases we can't access
            }
        }

        return [.. urls];
    }

    private async Task<LibraryBinary?> ExtractLibSslFromDebAsync(
        byte[] debPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .deb extraction - placeholder for now
        // In production, implement proper ar + tar.xz extraction

        await Task.CompletedTask;

        _logger.LogDebug(
            "Debian package extraction not fully implemented. Package size: {Size} bytes",
            debPackage.Length);

        return null;
    }

    private async Task<LibraryBinary?> ExtractLibSslFromApkAsync(
        byte[] apkPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .apk files are gzip-compressed tar archives
        // In production, implement proper tar.gz extraction

        await Task.CompletedTask;

        _logger.LogDebug(
            "Alpine package extraction not fully implemented. Package size: {Size} bytes",
            apkPackage.Length);

        return null;
    }

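    // A minimal sketch for the .apk placeholder above, not the shipped
    // implementation. An .apk is a concatenation of gzip-compressed tar
    // segments without end-of-archive markers, so decompressing the whole
    // stream and walking it once with System.Formats.Tar can reach the
    // shared object. The helper name and the entry-suffix contract are
    // illustrative assumptions.
    private static byte[]? ExtractFileFromApk(byte[] apkPackage, string entrySuffix)
    {
        using var gzip = new System.IO.Compression.GZipStream(
            new MemoryStream(apkPackage), System.IO.Compression.CompressionMode.Decompress);
        using var tar = new System.Formats.Tar.TarReader(gzip);

        while (tar.GetNextEntry() is { } entry)
        {
            if (entry.EntryType == System.Formats.Tar.TarEntryType.RegularFile &&
                entry.Name.EndsWith(entrySuffix, StringComparison.Ordinal) &&
                entry.DataStream is not null)
            {
                using var buffer = new MemoryStream();
                entry.DataStream.CopyTo(buffer);
                return buffer.ToArray();
            }
        }

        return null;
    }
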
    private static string GetDebianPackageName(string version)
    {
        // OpenSSL 1.0.x -> libssl1.0.0
        // OpenSSL 1.1.x -> libssl1.1
        // OpenSSL 3.x -> libssl3
        if (version.StartsWith("1.0", StringComparison.OrdinalIgnoreCase))
        {
            return "libssl1.0.0";
        }
        else if (version.StartsWith("1.1", StringComparison.OrdinalIgnoreCase))
        {
            return "libssl1.1";
        }
        else
        {
            return "libssl3";
        }
    }

    private static ImmutableArray<string> ExtractPackageUrlsForVersion(
        string json,
        string version,
        string debianArch)
    {
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("result", out var results))
            {
                foreach (var item in results.EnumerateArray())
                {
                    if (item.TryGetProperty("binary_version", out var binaryVersion) &&
                        item.TryGetProperty("architecture", out var arch))
                    {
                        var binVer = binaryVersion.GetString() ?? string.Empty;
                        var archStr = arch.GetString() ?? string.Empty;

                        // Check if version matches and architecture matches
                        if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
                            archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
                        {
                            if (item.TryGetProperty("files", out var files))
                            {
                                foreach (var file in files.EnumerateArray())
                                {
                                    if (file.TryGetProperty("hash", out var hashElement))
                                    {
                                        var hash = hashElement.GetString();
                                        if (!string.IsNullOrEmpty(hash))
                                        {
                                            urls.Add($"https://snapshot.debian.org/file/{hash}");
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON
        }

        return [.. urls];
    }

    private static string NormalizeArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" or "amd64" => "x86_64",
            "aarch64" or "arm64" => "aarch64",
            "armhf" or "armv7" or "arm" => "armhf",
            "i386" or "i686" or "x86" => "i386",
            _ => architecture
        };
    }

    private static string? MapToDebianArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "amd64",
            "aarch64" => "arm64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "i386",
            _ => null
        };
    }

    private static string? MapToAlpineArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "x86_64",
            "aarch64" => "aarch64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "x86",
            _ => null
        };
    }

    private static Version? ParseVersion(string versionString)
    {
        // OpenSSL versions can be like 1.1.1n or 3.0.8
        // Extract numeric parts only
        var numericPart = ExtractNumericVersion(versionString);
        if (Version.TryParse(numericPart, out var version))
        {
            return version;
        }
        return null;
    }

    private static string ExtractNumericVersion(string version)
    {
        // 1.1.1n -> 1.1.1
        // 3.0.8 -> 3.0.8
        var parts = new List<string>();
        foreach (var ch in version)
        {
            if (char.IsDigit(ch) || ch == '.')
            {
                if (parts.Count == 0)
                {
                    parts.Add(ch.ToString());
                }
                else if (ch == '.')
                {
                    parts.Add(".");
                }
                else
                {
                    parts[^1] += ch;
                }
            }
            else if (parts.Count > 0 && parts[^1] != ".")
            {
                // Stop at first non-digit after version starts
                break;
            }
        }
        return string.Join("", parts).TrimEnd('.');
    }

    #endregion

    #region Generated Regexes

    [GeneratedRegex(@"openssl-(?<version>\d+\.\d+\.\d+[a-z]?)", RegexOptions.IgnoreCase)]
    private static partial Regex OpenSslVersionRegex();

    [GeneratedRegex(@"href=""(?<dir>\d+\.\d+(?:\.\d+)?)/""", RegexOptions.IgnoreCase)]
    private static partial Regex VersionDirRegex();

    [GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+[a-z]?-r\d+)\.apk)""", RegexOptions.IgnoreCase)]
    private static partial Regex AlpinePackageRegex();

    #endregion
}
@@ -0,0 +1,452 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Connectors;

/// <summary>
/// Corpus connector for the zlib compression library.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class ZlibCorpusConnector : ILibraryCorpusConnector
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ILogger<ZlibCorpusConnector> _logger;

    /// <summary>
    /// Base URL for zlib official releases.
    /// </summary>
    public const string ZlibReleasesUrl = "https://www.zlib.net/";

    /// <summary>
    /// Base URL for zlib fossils/old releases.
    /// </summary>
    public const string ZlibFossilsUrl = "https://www.zlib.net/fossils/";

    /// <summary>
    /// Supported architectures.
    /// </summary>
    private static readonly ImmutableArray<string> s_supportedArchitectures =
        ["x86_64", "aarch64", "armhf", "i386"];

    public ZlibCorpusConnector(
        IHttpClientFactory httpClientFactory,
        ILogger<ZlibCorpusConnector> logger)
    {
        _httpClientFactory = httpClientFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public string LibraryName => "zlib";

    /// <inheritdoc />
    public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;

    /// <inheritdoc />
    public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient("Zlib");
        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Fetch current release
        try
        {
            _logger.LogDebug("Fetching zlib versions from {Url}", ZlibReleasesUrl);
            var html = await client.GetStringAsync(ZlibReleasesUrl, ct);
            var currentVersions = ParseVersionsFromListing(html);
            foreach (var v in currentVersions)
            {
                versions.Add(v);
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch current zlib releases");
        }

        // Fetch old releases (fossils)
        try
        {
            _logger.LogDebug("Fetching old zlib versions from {Url}", ZlibFossilsUrl);
            var fossilsHtml = await client.GetStringAsync(ZlibFossilsUrl, ct);
            var fossilVersions = ParseVersionsFromListing(fossilsHtml);
            foreach (var v in fossilVersions)
            {
                versions.Add(v);
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch old zlib releases");
        }

        _logger.LogInformation("Found {Count} zlib versions", versions.Count);
        return [.. versions.OrderByDescending(ParseVersion)];
    }

    /// <inheritdoc />
    public async Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default)
    {
        var normalizedArch = NormalizeArchitecture(architecture);

        _logger.LogInformation(
            "Fetching zlib {Version} for {Architecture}",
            version,
            normalizedArch);

        // Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
        var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
        if (debBinary is not null)
        {
            _logger.LogDebug("Found zlib {Version} from Debian packages", version);
            return debBinary;
        }

        // Strategy 2: Try Alpine APK
        var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
        if (alpineBinary is not null)
        {
            _logger.LogDebug("Found zlib {Version} from Alpine packages", version);
            return alpineBinary;
        }

        _logger.LogWarning(
            "Could not find pre-built zlib {Version} for {Architecture}. Source build not implemented.",
            version,
            normalizedArch);

        return null;
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            var binary = await FetchBinaryAsync(version, architecture, options, ct);
            if (binary is not null)
            {
                yield return binary;
            }
        }
    }

    #region Private Methods

    private ImmutableArray<string> ParseVersionsFromListing(string html)
    {
        // Match patterns like zlib-1.2.13.tar.gz or zlib-1.3.1.tar.xz
        var matches = ZlibVersionRegex().Matches(html);

        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["version"].Success)
            {
                versions.Add(match.Groups["version"].Value);
            }
        }

        return [.. versions];
    }

    private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("DebianPackages");

        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            return null;
        }

        // zlib package name is zlib1g
        const string packageName = "zlib1g";

        // Query Debian snapshot for matching package
        var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Debian zlib package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libz.so.1 from the .deb package
                var binary = await ExtractLibZFromDebAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("AlpinePackages");

        var alpineArch = MapToAlpineArchitecture(architecture);
        if (alpineArch is null)
        {
            return null;
        }

        // Query Alpine package repository for zlib
        var packageUrls = await FindAlpinePackageUrlsAsync(client, "zlib", version, alpineArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Alpine zlib package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libz.so.1 from the .apk package
                var binary = await ExtractLibZFromApkAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
            }
        }

        return null;
    }

    private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";

        try
        {
            var response = await client.GetStringAsync(apiUrl, ct);
            var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
            return [];
        }
    }

    private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string alpineArch,
        CancellationToken ct)
    {
        var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
        var urls = new List<string>();

        foreach (var release in releases)
        {
            var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";

            try
            {
                var html = await client.GetStringAsync(baseUrl, ct);

                // Find package URLs matching version
                var matches = AlpinePackageRegex().Matches(html);
                foreach (Match match in matches)
                {
                    if (match.Groups["name"].Value == packageName &&
                        match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
                    {
                        urls.Add($"{baseUrl}{match.Groups["file"].Value}");
                    }
                }
            }
            catch (HttpRequestException)
            {
                // Skip releases we can't access
            }
        }

        return [.. urls];
    }

    private async Task<LibraryBinary?> ExtractLibZFromDebAsync(
        byte[] debPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .deb extraction - placeholder for now
        await Task.CompletedTask;

        _logger.LogDebug(
            "Debian package extraction not fully implemented. Package size: {Size} bytes",
            debPackage.Length);

        return null;
    }

    private async Task<LibraryBinary?> ExtractLibZFromApkAsync(
        byte[] apkPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .apk extraction - placeholder for now
        await Task.CompletedTask;

        _logger.LogDebug(
            "Alpine package extraction not fully implemented. Package size: {Size} bytes",
            apkPackage.Length);

        return null;
    }

    private static ImmutableArray<string> ExtractPackageUrlsForVersion(
        string json,
        string version,
        string debianArch)
    {
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("result", out var results))
            {
                foreach (var item in results.EnumerateArray())
                {
                    if (item.TryGetProperty("binary_version", out var binaryVersion) &&
                        item.TryGetProperty("architecture", out var arch))
                    {
                        var binVer = binaryVersion.GetString() ?? string.Empty;
                        var archStr = arch.GetString() ?? string.Empty;

                        // Check if version matches and architecture matches
                        if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
                            archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
                        {
                            if (item.TryGetProperty("files", out var files))
                            {
                                foreach (var file in files.EnumerateArray())
                                {
                                    if (file.TryGetProperty("hash", out var hashElement))
                                    {
                                        var hash = hashElement.GetString();
                                        if (!string.IsNullOrEmpty(hash))
                                        {
                                            urls.Add($"https://snapshot.debian.org/file/{hash}");
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON
        }

        return [.. urls];
    }

    private static string NormalizeArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" or "amd64" => "x86_64",
            "aarch64" or "arm64" => "aarch64",
            "armhf" or "armv7" or "arm" => "armhf",
            "i386" or "i686" or "x86" => "i386",
            _ => architecture
        };
    }

    private static string? MapToDebianArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "amd64",
            "aarch64" => "arm64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "i386",
            _ => null
        };
    }

    private static string? MapToAlpineArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "x86_64",
            "aarch64" => "aarch64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "x86",
            _ => null
        };
    }

    private static Version? ParseVersion(string versionString)
    {
        if (Version.TryParse(versionString, out var version))
        {
            return version;
        }
        return null;
    }

    #endregion

    #region Generated Regexes

    [GeneratedRegex(@"zlib-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
    private static partial Regex ZlibVersionRegex();

    [GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+(?:-r\d+)?)\.apk)""", RegexOptions.IgnoreCase)]
    private static partial Regex AlpinePackageRegex();

    #endregion
}
@@ -0,0 +1,135 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus;

/// <summary>
/// Service for ingesting library functions into the corpus.
/// </summary>
public interface ICorpusIngestionService
{
    /// <summary>
    /// Ingest all functions from a library binary.
    /// </summary>
    /// <param name="metadata">Library metadata.</param>
    /// <param name="binaryStream">Binary file stream.</param>
    /// <param name="options">Ingestion options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Ingestion result with statistics.</returns>
    Task<IngestionResult> IngestLibraryAsync(
        LibraryIngestionMetadata metadata,
        Stream binaryStream,
        IngestionOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Ingest functions from a library connector.
    /// </summary>
    /// <param name="libraryName">Library name (e.g., "glibc").</param>
    /// <param name="connector">Library corpus connector.</param>
    /// <param name="options">Ingestion options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Stream of ingestion results.</returns>
    IAsyncEnumerable<IngestionResult> IngestFromConnectorAsync(
        string libraryName,
        ILibraryCorpusConnector connector,
        IngestionOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Update CVE associations for corpus functions.
    /// </summary>
    /// <param name="cveId">CVE identifier.</param>
    /// <param name="associations">Function-CVE associations.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Number of associations updated.</returns>
    Task<int> UpdateCveAssociationsAsync(
        string cveId,
        IReadOnlyList<FunctionCveAssociation> associations,
        CancellationToken ct = default);

    /// <summary>
    /// Get ingestion job status.
    /// </summary>
    /// <param name="jobId">Job ID.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Job details or null if not found.</returns>
    Task<IngestionJob?> GetJobStatusAsync(Guid jobId, CancellationToken ct = default);
}

/// <summary>
/// Metadata for library ingestion.
/// </summary>
public sealed record LibraryIngestionMetadata(
    string Name,
    string Version,
    string Architecture,
    string? Abi = null,
    string? Compiler = null,
    string? CompilerVersion = null,
    string? OptimizationLevel = null,
    DateOnly? ReleaseDate = null,
    bool IsSecurityRelease = false,
    string? SourceArchiveSha256 = null);

/// <summary>
/// Options for corpus ingestion.
/// </summary>
public sealed record IngestionOptions
{
    /// <summary>
    /// Minimum function size to index (bytes).
    /// </summary>
    public int MinFunctionSize { get; init; } = 16;

    /// <summary>
    /// Maximum functions per binary.
    /// </summary>
    public int MaxFunctionsPerBinary { get; init; } = 10_000;

    /// <summary>
    /// Algorithms to use for fingerprinting.
    /// </summary>
    public ImmutableArray<FingerprintAlgorithm> Algorithms { get; init; } =
        [FingerprintAlgorithm.SemanticKsg, FingerprintAlgorithm.InstructionBb, FingerprintAlgorithm.CfgWl];

    /// <summary>
    /// Include exported functions only.
    /// </summary>
    public bool ExportedOnly { get; init; } = false;

    /// <summary>
    /// Generate function clusters after ingestion.
    /// </summary>
    public bool GenerateClusters { get; init; } = true;

    /// <summary>
    /// Degree of parallelism for function processing.
    /// </summary>
    public int ParallelDegree { get; init; } = 4;
}

/// <summary>
/// Result of a library ingestion.
/// </summary>
public sealed record IngestionResult(
    Guid JobId,
    string LibraryName,
    string Version,
    string Architecture,
    int FunctionsIndexed,
    int FingerprintsGenerated,
    int ClustersCreated,
    TimeSpan Duration,
    ImmutableArray<string> Errors,
    ImmutableArray<string> Warnings);

/// <summary>
/// Association between a function and a CVE.
/// </summary>
public sealed record FunctionCveAssociation(
    Guid FunctionId,
    CveAffectedState AffectedState,
    string? PatchCommit,
    decimal Confidence,
    CveEvidenceType? EvidenceType);
@@ -0,0 +1,186 @@
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus;
|
||||
|
||||
/// <summary>
|
||||
/// Service for querying the function corpus.
|
||||
/// </summary>
|
||||
public interface ICorpusQueryService
|
||||
{
|
||||
/// <summary>
|
||||
/// Identify a function by its fingerprints.
|
||||
/// </summary>
|
||||
/// <param name="fingerprints">Function fingerprints to match.</param>
|
||||
/// <param name="options">Query options.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matching functions ordered by similarity.</returns>
|
||||
Task<ImmutableArray<FunctionMatch>> IdentifyFunctionAsync(
|
||||
FunctionFingerprints fingerprints,
|
||||
IdentifyOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Batch identify functions.
|
||||
/// </summary>
|
||||
/// <param name="fingerprints">Multiple function fingerprints.</param>
|
||||
/// <param name="options">Query options.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Matches for each input fingerprint.</returns>
|
||||
Task<ImmutableDictionary<int, ImmutableArray<FunctionMatch>>> IdentifyBatchAsync(
|
||||
IReadOnlyList<FunctionFingerprints> fingerprints,
|
||||
IdentifyOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get all functions associated with a CVE.
|
||||
/// </summary>
|
||||
/// <param name="cveId">CVE identifier.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Functions affected by the CVE.</returns>
|
||||
Task<ImmutableArray<CorpusFunctionWithCve>> GetFunctionsForCveAsync(
|
||||
string cveId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get function evolution across library versions.
|
||||
/// </summary>
|
||||
/// <param name="libraryName">Library name.</param>
|
||||
/// <param name="functionName">Function name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Function evolution timeline.</returns>
|
||||
Task<FunctionEvolution?> GetFunctionEvolutionAsync(
|
||||
string libraryName,
|
||||
string functionName,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get corpus statistics.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Corpus statistics.</returns>
|
||||
Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// List libraries in the corpus.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Libraries with version counts.</returns>
|
||||
Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// List versions for a library.
|
||||
/// </summary>
|
||||
/// <param name="libraryName">Library name.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Version information.</returns>
|
||||
Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
|
||||
string libraryName,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
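
// Illustrative only: identifying a function from previously computed hashes.
// The semantic hash value is a placeholder; in practice it comes from the
// fingerprint generators used at ingestion time.
internal static class CorpusQueryExample
{
    public static async Task RunAsync(ICorpusQueryService queries, byte[] semanticHash, CancellationToken ct)
    {
        var fingerprints = new FunctionFingerprints(
            SemanticHash: semanticHash,
            InstructionHash: null,
            CfgHash: null,
            ApiCalls: null,
            SizeBytes: null);

        var matches = await queries.IdentifyFunctionAsync(
            fingerprints,
            new IdentifyOptions { MinSimilarity = 0.85m, MaxResults = 5 },
            ct);

        foreach (var match in matches)
        {
            Console.WriteLine($"{match.LibraryName} {match.Version} {match.FunctionName}: {match.Similarity:P0} ({match.Confidence})");
        }
    }
}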

/// <summary>
/// Fingerprints for function identification.
/// </summary>
public sealed record FunctionFingerprints(
    byte[]? SemanticHash,
    byte[]? InstructionHash,
    byte[]? CfgHash,
    ImmutableArray<string>? ApiCalls,
    int? SizeBytes);

/// <summary>
/// Options for function identification.
/// </summary>
public sealed record IdentifyOptions
{
    /// <summary>
    /// Minimum similarity threshold (0.0-1.0).
    /// </summary>
    public decimal MinSimilarity { get; init; } = 0.70m;

    /// <summary>
    /// Maximum results to return.
    /// </summary>
    public int MaxResults { get; init; } = 10;

    /// <summary>
    /// Filter by library names.
    /// </summary>
    public ImmutableArray<string>? LibraryFilter { get; init; }

    /// <summary>
    /// Filter by architectures.
    /// </summary>
    public ImmutableArray<string>? ArchitectureFilter { get; init; }

    /// <summary>
    /// Include CVE information in results.
    /// </summary>
    public bool IncludeCveInfo { get; init; } = true;

    /// <summary>
    /// Weights for similarity computation.
    /// </summary>
    public SimilarityWeights Weights { get; init; } = SimilarityWeights.Default;
}

/// <summary>
/// Weights for computing overall similarity.
/// </summary>
public sealed record SimilarityWeights
{
    public decimal SemanticWeight { get; init; } = 0.35m;
    public decimal InstructionWeight { get; init; } = 0.25m;
    public decimal CfgWeight { get; init; } = 0.25m;
    public decimal ApiCallWeight { get; init; } = 0.15m;

    public static SimilarityWeights Default { get; } = new();
}
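
// Illustrative only: how the four per-algorithm similarities are intended to
// combine into one score under these weights (which sum to 1.0 by default).
// The query service's actual combination lives in its private scoring method;
// this is a simplified sketch of the weighted sum.
internal static class SimilarityExample
{
    public static decimal Combine(MatchDetails d, SimilarityWeights w) =>
        d.SemanticSimilarity * w.SemanticWeight
        + d.InstructionSimilarity * w.InstructionWeight
        + d.CfgSimilarity * w.CfgWeight
        + d.ApiCallSimilarity * w.ApiCallWeight;
}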

/// <summary>
/// Function with CVE information.
/// </summary>
public sealed record CorpusFunctionWithCve(
    CorpusFunction Function,
    LibraryMetadata Library,
    LibraryVersion Version,
    BuildVariant Build,
    FunctionCve CveInfo);

/// <summary>
/// Corpus statistics.
/// </summary>
public sealed record CorpusStatistics(
    int LibraryCount,
    int VersionCount,
    int BuildVariantCount,
    int FunctionCount,
    int FingerprintCount,
    int ClusterCount,
    int CveAssociationCount,
    DateTimeOffset? LastUpdated);

/// <summary>
/// Summary of a library in the corpus.
/// </summary>
public sealed record LibrarySummary(
    Guid Id,
    string Name,
    string? Description,
    int VersionCount,
    int FunctionCount,
    int CveCount,
    DateTimeOffset? LatestVersionDate);

/// <summary>
/// Summary of a library version.
/// </summary>
public sealed record LibraryVersionSummary(
    Guid Id,
    string Version,
    DateOnly? ReleaseDate,
    bool IsSecurityRelease,
    int BuildVariantCount,
    int FunctionCount,
    ImmutableArray<string> Architectures);
@@ -0,0 +1,327 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus;

/// <summary>
/// Repository for corpus data access.
/// </summary>
public interface ICorpusRepository
{
    #region Libraries

    /// <summary>
    /// Get or create a library.
    /// </summary>
    Task<LibraryMetadata> GetOrCreateLibraryAsync(
        string name,
        string? description = null,
        string? homepageUrl = null,
        string? sourceRepo = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get a library by name.
    /// </summary>
    Task<LibraryMetadata?> GetLibraryAsync(string name, CancellationToken ct = default);

    /// <summary>
    /// Get a library by ID.
    /// </summary>
    Task<LibraryMetadata?> GetLibraryByIdAsync(Guid id, CancellationToken ct = default);

    /// <summary>
    /// List all libraries.
    /// </summary>
    Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default);

    #endregion

    #region Library Versions

    /// <summary>
    /// Get or create a library version.
    /// </summary>
    Task<LibraryVersion> GetOrCreateVersionAsync(
        Guid libraryId,
        string version,
        DateOnly? releaseDate = null,
        bool isSecurityRelease = false,
        string? sourceArchiveSha256 = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get a library version.
    /// </summary>
    Task<LibraryVersion?> GetVersionAsync(
        Guid libraryId,
        string version,
        CancellationToken ct = default);

    /// <summary>
    /// Get a library version by ID.
    /// </summary>
    Task<LibraryVersion?> GetLibraryVersionAsync(
        Guid versionId,
        CancellationToken ct = default);

    /// <summary>
    /// List versions for a library.
    /// </summary>
    Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
        string libraryName,
        CancellationToken ct = default);

    #endregion

    #region Build Variants

    /// <summary>
    /// Get or create a build variant.
    /// </summary>
    Task<BuildVariant> GetOrCreateBuildVariantAsync(
        Guid libraryVersionId,
        string architecture,
        string binarySha256,
        string? abi = null,
        string? compiler = null,
        string? compilerVersion = null,
        string? optimizationLevel = null,
        string? buildId = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get a build variant by binary hash.
    /// </summary>
    Task<BuildVariant?> GetBuildVariantBySha256Async(
        string binarySha256,
        CancellationToken ct = default);

    /// <summary>
    /// Get a build variant by ID.
    /// </summary>
    Task<BuildVariant?> GetBuildVariantAsync(
        Guid variantId,
        CancellationToken ct = default);

    /// <summary>
    /// Get build variants for a version.
    /// </summary>
    Task<ImmutableArray<BuildVariant>> GetBuildVariantsAsync(
        Guid libraryVersionId,
        CancellationToken ct = default);

    #endregion

    #region Functions

    /// <summary>
    /// Bulk insert functions.
    /// </summary>
    Task<int> InsertFunctionsAsync(
        IReadOnlyList<CorpusFunction> functions,
        CancellationToken ct = default);

    /// <summary>
    /// Get a function by ID.
    /// </summary>
    Task<CorpusFunction?> GetFunctionAsync(Guid id, CancellationToken ct = default);

    /// <summary>
    /// Get functions for a build variant.
    /// </summary>
    Task<ImmutableArray<CorpusFunction>> GetFunctionsForVariantAsync(
        Guid buildVariantId,
        CancellationToken ct = default);

    /// <summary>
    /// Get function count for a build variant.
    /// </summary>
    Task<int> GetFunctionCountAsync(Guid buildVariantId, CancellationToken ct = default);

    #endregion

    #region Fingerprints

    /// <summary>
    /// Bulk insert fingerprints.
    /// </summary>
    Task<int> InsertFingerprintsAsync(
        IReadOnlyList<CorpusFingerprint> fingerprints,
        CancellationToken ct = default);

    /// <summary>
    /// Find functions by fingerprint hash.
    /// </summary>
    Task<ImmutableArray<Guid>> FindFunctionsByFingerprintAsync(
        FingerprintAlgorithm algorithm,
        byte[] fingerprint,
        CancellationToken ct = default);

    /// <summary>
    /// Find similar fingerprints (for approximate matching).
    /// </summary>
    Task<ImmutableArray<FingerprintSearchResult>> FindSimilarFingerprintsAsync(
        FingerprintAlgorithm algorithm,
        byte[] fingerprint,
        int maxResults = 10,
        CancellationToken ct = default);

    /// <summary>
    /// Get fingerprints for a function.
    /// </summary>
    Task<ImmutableArray<CorpusFingerprint>> GetFingerprintsAsync(
        Guid functionId,
        CancellationToken ct = default);

    /// <summary>
    /// Get fingerprints for a function (alias for <see cref="GetFingerprintsAsync"/>).
    /// </summary>
    Task<ImmutableArray<CorpusFingerprint>> GetFingerprintsForFunctionAsync(
        Guid functionId,
        CancellationToken ct = default);

    #endregion

    #region Clusters

    /// <summary>
    /// Get or create a function cluster.
    /// </summary>
    Task<FunctionCluster> GetOrCreateClusterAsync(
        Guid libraryId,
        string canonicalName,
        string? description = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get a cluster by ID.
    /// </summary>
    Task<FunctionCluster?> GetClusterAsync(
        Guid clusterId,
        CancellationToken ct = default);

    /// <summary>
    /// Get all clusters for a library.
    /// </summary>
    Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
        Guid libraryId,
        CancellationToken ct = default);

    /// <summary>
    /// Insert a new cluster.
    /// </summary>
    Task InsertClusterAsync(
        FunctionCluster cluster,
        CancellationToken ct = default);

    /// <summary>
    /// Add members to a cluster.
    /// </summary>
    Task<int> AddClusterMembersAsync(
        Guid clusterId,
        IReadOnlyList<ClusterMember> members,
        CancellationToken ct = default);

    /// <summary>
    /// Add a single member to a cluster.
    /// </summary>
    Task AddClusterMemberAsync(
        ClusterMember member,
        CancellationToken ct = default);

    /// <summary>
    /// Get cluster member function IDs.
    /// </summary>
    Task<ImmutableArray<Guid>> GetClusterMemberIdsAsync(
        Guid clusterId,
        CancellationToken ct = default);

    /// <summary>
    /// Get cluster members with details.
    /// </summary>
    Task<ImmutableArray<ClusterMember>> GetClusterMembersAsync(
        Guid clusterId,
        CancellationToken ct = default);

    /// <summary>
    /// Clear all members from a cluster.
    /// </summary>
    Task ClearClusterMembersAsync(
        Guid clusterId,
        CancellationToken ct = default);

    #endregion

    #region CVE Associations

    /// <summary>
    /// Upsert CVE associations.
    /// </summary>
    Task<int> UpsertCveAssociationsAsync(
        string cveId,
        IReadOnlyList<FunctionCve> associations,
        CancellationToken ct = default);

    /// <summary>
    /// Get functions for a CVE.
    /// </summary>
    Task<ImmutableArray<Guid>> GetFunctionIdsForCveAsync(
        string cveId,
        CancellationToken ct = default);

    /// <summary>
    /// Get CVEs for a function.
    /// </summary>
    Task<ImmutableArray<FunctionCve>> GetCvesForFunctionAsync(
        Guid functionId,
        CancellationToken ct = default);

    #endregion

    #region Ingestion Jobs

    /// <summary>
    /// Create an ingestion job.
    /// </summary>
    Task<IngestionJob> CreateIngestionJobAsync(
        Guid libraryId,
        IngestionJobType jobType,
        CancellationToken ct = default);

    /// <summary>
    /// Update ingestion job status.
    /// </summary>
    Task UpdateIngestionJobAsync(
        Guid jobId,
        IngestionJobStatus status,
        int? functionsIndexed = null,
        int? fingerprintsGenerated = null,
        int? clustersCreated = null,
        ImmutableArray<string>? errors = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get an ingestion job.
    /// </summary>
    Task<IngestionJob?> GetIngestionJobAsync(Guid jobId, CancellationToken ct = default);

    #endregion

    #region Statistics

    /// <summary>
    /// Get corpus statistics.
    /// </summary>
    Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default);

    #endregion
}

/// <summary>
/// Result of a fingerprint similarity search.
/// </summary>
public sealed record FingerprintSearchResult(
    Guid FunctionId,
    byte[] Fingerprint,
    decimal Similarity);
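
// Illustrative only: one plausible exact-then-approximate lookup flow against
// ICorpusRepository, where an exact hash hit short-circuits the similarity
// search. This ordering is an assumption, not prescribed by the interface.
internal static class RepositoryLookupExample
{
    public static async Task<ImmutableArray<Guid>> FindAsync(
        ICorpusRepository repository, byte[] fingerprint, CancellationToken ct)
    {
        // Exact hash match first: cheap and unambiguous.
        var exact = await repository.FindFunctionsByFingerprintAsync(
            FingerprintAlgorithm.SemanticKsg, fingerprint, ct);
        if (exact.Length > 0)
        {
            return exact;
        }

        // Fall back to approximate matching.
        var similar = await repository.FindSimilarFingerprintsAsync(
            FingerprintAlgorithm.SemanticKsg, fingerprint, maxResults: 10, ct);
        return [.. similar.Select(r => r.FunctionId)];
    }
}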
@@ -0,0 +1,155 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus;

/// <summary>
/// Connector for fetching library binaries from various sources.
/// Used to populate the function corpus.
/// </summary>
public interface ILibraryCorpusConnector
{
    /// <summary>
    /// Library name this connector handles (e.g., "glibc", "openssl").
    /// </summary>
    string LibraryName { get; }

    /// <summary>
    /// Supported architectures.
    /// </summary>
    ImmutableArray<string> SupportedArchitectures { get; }

    /// <summary>
    /// Get available versions of the library.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Available versions ordered newest first.</returns>
    Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default);

    /// <summary>
    /// Fetch a library binary for a specific version and architecture.
    /// </summary>
    /// <param name="version">Library version.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="options">Fetch options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Library binary or null if not available.</returns>
    Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Stream binaries for multiple versions.
    /// </summary>
    /// <param name="versions">Versions to fetch.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="options">Fetch options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Stream of library binaries.</returns>
    IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default);
}
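
// Illustrative only: enumerating the newest few versions from any connector
// implementation. Versions come back newest first per the contract above, so
// Take(3) selects the three most recent releases.
internal static class ConnectorExample
{
    public static async Task RunAsync(ILibraryCorpusConnector connector, CancellationToken ct)
    {
        var versions = await connector.GetAvailableVersionsAsync(ct);
        await foreach (var binary in connector.FetchBinariesAsync(versions.Take(3), "x86_64", options: null, ct))
        {
            using (binary) // LibraryBinary owns its stream
            {
                Console.WriteLine($"{binary.LibraryName} {binary.Version}: {binary.Sha256[..16]} from {binary.Source.Type}");
            }
        }
    }
}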

/// <summary>
/// A library binary fetched from a connector.
/// </summary>
public sealed record LibraryBinary(
    string LibraryName,
    string Version,
    string Architecture,
    string? Abi,
    string? Compiler,
    string? CompilerVersion,
    string? OptimizationLevel,
    Stream BinaryStream,
    string Sha256,
    string? BuildId,
    LibraryBinarySource Source,
    DateOnly? ReleaseDate) : IDisposable
{
    public void Dispose()
    {
        BinaryStream.Dispose();
    }
}

/// <summary>
/// Source of a library binary.
/// </summary>
public sealed record LibraryBinarySource(
    LibrarySourceType Type,
    string? PackageName,
    string? DistroRelease,
    string? MirrorUrl);

/// <summary>
/// Type of library source.
/// </summary>
public enum LibrarySourceType
{
    /// <summary>
    /// Binary from a Debian/Ubuntu package.
    /// </summary>
    DebianPackage,

    /// <summary>
    /// Binary from an RPM package.
    /// </summary>
    RpmPackage,

    /// <summary>
    /// Binary from an Alpine APK.
    /// </summary>
    AlpineApk,

    /// <summary>
    /// Binary compiled from source.
    /// </summary>
    CompiledSource,

    /// <summary>
    /// Binary from an upstream release.
    /// </summary>
    UpstreamRelease,

    /// <summary>
    /// Binary from a debug symbol server.
    /// </summary>
    DebugSymbolServer
}

/// <summary>
/// Options for fetching library binaries.
/// </summary>
public sealed record LibraryFetchOptions
{
    /// <summary>
    /// Preferred ABI (e.g., "gnu", "musl").
    /// </summary>
    public string? PreferredAbi { get; init; }

    /// <summary>
    /// Preferred compiler.
    /// </summary>
    public string? PreferredCompiler { get; init; }

    /// <summary>
    /// Include debug symbols if available.
    /// </summary>
    public bool IncludeDebugSymbols { get; init; } = true;

    /// <summary>
    /// Preferred distro for pre-built packages.
    /// </summary>
    public string? PreferredDistro { get; init; }

    /// <summary>
    /// Timeout for network operations.
    /// </summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(5);
}
@@ -0,0 +1,273 @@
using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Corpus.Models;

/// <summary>
/// Metadata about a known library in the corpus.
/// </summary>
public sealed record LibraryMetadata(
    Guid Id,
    string Name,
    string? Description,
    string? HomepageUrl,
    string? SourceRepo,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt);

/// <summary>
/// A specific version of a library in the corpus.
/// </summary>
public sealed record LibraryVersion(
    Guid Id,
    Guid LibraryId,
    string Version,
    DateOnly? ReleaseDate,
    bool IsSecurityRelease,
    string? SourceArchiveSha256,
    DateTimeOffset IndexedAt);

/// <summary>
/// A specific build variant of a library version.
/// </summary>
public sealed record BuildVariant(
    Guid Id,
    Guid LibraryVersionId,
    string Architecture,
    string? Abi,
    string? Compiler,
    string? CompilerVersion,
    string? OptimizationLevel,
    string? BuildId,
    string BinarySha256,
    DateTimeOffset IndexedAt);

/// <summary>
/// A function in the corpus.
/// </summary>
public sealed record CorpusFunction(
    Guid Id,
    Guid BuildVariantId,
    string Name,
    string? DemangledName,
    ulong Address,
    int SizeBytes,
    bool IsExported,
    bool IsInline,
    string? SourceFile,
    int? SourceLine);

/// <summary>
/// A fingerprint for a function in the corpus.
/// </summary>
public sealed record CorpusFingerprint(
    Guid Id,
    Guid FunctionId,
    FingerprintAlgorithm Algorithm,
    byte[] Fingerprint,
    string FingerprintHex,
    FingerprintMetadata? Metadata,
    DateTimeOffset CreatedAt);

/// <summary>
/// Algorithm used to generate a fingerprint.
/// </summary>
public enum FingerprintAlgorithm
{
    /// <summary>
    /// Semantic key-semantics graph fingerprint (from Phase 1).
    /// </summary>
    SemanticKsg,

    /// <summary>
    /// Instruction-level basic block hash.
    /// </summary>
    InstructionBb,

    /// <summary>
    /// Control flow graph Weisfeiler-Lehman hash.
    /// </summary>
    CfgWl,

    /// <summary>
    /// API call sequence hash.
    /// </summary>
    ApiCalls,

    /// <summary>
    /// Combined multi-algorithm fingerprint.
    /// </summary>
    Combined
}

/// <summary>
/// Algorithm-specific metadata for a fingerprint.
/// </summary>
public sealed record FingerprintMetadata(
    int? NodeCount,
    int? EdgeCount,
    int? CyclomaticComplexity,
    ImmutableArray<string>? ApiCalls,
    string? OperationHashHex,
    string? DataFlowHashHex);

/// <summary>
/// A cluster of similar functions across versions.
/// </summary>
public sealed record FunctionCluster(
    Guid Id,
    Guid LibraryId,
    string CanonicalName,
    string? Description,
    DateTimeOffset CreatedAt);

/// <summary>
/// Membership in a function cluster.
/// </summary>
public sealed record ClusterMember(
    Guid ClusterId,
    Guid FunctionId,
    decimal? SimilarityToCentroid);

/// <summary>
/// CVE association for a function.
/// </summary>
public sealed record FunctionCve(
    Guid FunctionId,
    string CveId,
    CveAffectedState AffectedState,
    string? PatchCommit,
    decimal Confidence,
    CveEvidenceType? EvidenceType);

/// <summary>
/// CVE affected state for a function.
/// </summary>
public enum CveAffectedState
{
    Vulnerable,
    Fixed,
    NotAffected
}

/// <summary>
/// Type of evidence linking a function to a CVE.
/// </summary>
public enum CveEvidenceType
{
    Changelog,
    Commit,
    Advisory,
    PatchHeader,
    Manual
}

/// <summary>
/// Ingestion job tracking.
/// </summary>
public sealed record IngestionJob(
    Guid Id,
    Guid LibraryId,
    IngestionJobType JobType,
    IngestionJobStatus Status,
    DateTimeOffset? StartedAt,
    DateTimeOffset? CompletedAt,
    int? FunctionsIndexed,
    ImmutableArray<string>? Errors,
    DateTimeOffset CreatedAt);

/// <summary>
/// Type of ingestion job.
/// </summary>
public enum IngestionJobType
{
    FullIngest,
    Incremental,
    CveUpdate
}

/// <summary>
/// Status of an ingestion job.
/// </summary>
public enum IngestionJobStatus
{
    Pending,
    Running,
    Completed,
    Failed,
    Cancelled
}

/// <summary>
/// Result of a function identification query.
/// </summary>
public sealed record FunctionMatch(
    string LibraryName,
    string Version,
    string FunctionName,
    string? DemangledName,
    decimal Similarity,
    MatchConfidence Confidence,
    string Architecture,
    string? Abi,
    MatchDetails Details);

/// <summary>
/// Confidence level of a match.
/// </summary>
public enum MatchConfidence
{
    /// <summary>
    /// Low confidence (similarity 50-70%).
    /// </summary>
    Low,

    /// <summary>
    /// Medium confidence (similarity 70-85%).
    /// </summary>
    Medium,

    /// <summary>
    /// High confidence (similarity 85-95%).
    /// </summary>
    High,

    /// <summary>
    /// Very high confidence (similarity 95%+).
    /// </summary>
    VeryHigh,

    /// <summary>
    /// Exact match (100% similarity or identical fingerprint hash).
    /// </summary>
    Exact
}
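
// Illustrative only: mapping a similarity score onto MatchConfidence using the
// bands documented on the enum above. The production mapping may treat the
// boundary values differently.
internal static class ConfidenceExample
{
    public static MatchConfidence FromSimilarity(decimal similarity) => similarity switch
    {
        >= 1.00m => MatchConfidence.Exact,
        >= 0.95m => MatchConfidence.VeryHigh,
        >= 0.85m => MatchConfidence.High,
        >= 0.70m => MatchConfidence.Medium,
        _ => MatchConfidence.Low,
    };
}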

/// <summary>
/// Details about a function match.
/// </summary>
public sealed record MatchDetails(
    decimal SemanticSimilarity,
    decimal InstructionSimilarity,
    decimal CfgSimilarity,
    decimal ApiCallSimilarity,
    ImmutableArray<string> MatchedApiCalls,
    int SizeDifferenceBytes);

/// <summary>
/// Evolution of a function across library versions.
/// </summary>
public sealed record FunctionEvolution(
    string LibraryName,
    string FunctionName,
    ImmutableArray<FunctionVersionInfo> Versions);

/// <summary>
/// Information about a function in a specific version.
/// </summary>
public sealed record FunctionVersionInfo(
    string Version,
    DateOnly? ReleaseDate,
    int SizeBytes,
    string FingerprintHex,
    decimal? SimilarityToPrevious,
    ImmutableArray<string>? CveIds);
@@ -0,0 +1,464 @@
using System.Collections.Immutable;
using System.Threading.Channels;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Services;

/// <summary>
/// Service for batch generation of function fingerprints.
/// Uses a producer-consumer pattern for efficient parallel processing.
/// </summary>
public sealed class BatchFingerprintPipeline : IBatchFingerprintPipeline
{
    private readonly ICorpusRepository _repository;
    private readonly IFingerprintGeneratorFactory _generatorFactory;
    private readonly ILogger<BatchFingerprintPipeline> _logger;

    public BatchFingerprintPipeline(
        ICorpusRepository repository,
        IFingerprintGeneratorFactory generatorFactory,
        ILogger<BatchFingerprintPipeline> logger)
    {
        _repository = repository;
        _generatorFactory = generatorFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<BatchFingerprintResult> GenerateFingerprintsAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new BatchFingerprintOptions();

        _logger.LogInformation(
            "Starting batch fingerprint generation for variant {VariantId}",
            buildVariantId);

        // Get all functions for this variant
        var functions = await _repository.GetFunctionsForVariantAsync(buildVariantId, ct);

        if (functions.Length == 0)
        {
            _logger.LogWarning("No functions found for variant {VariantId}", buildVariantId);
            return new BatchFingerprintResult(
                buildVariantId,
                0,
                0,
                TimeSpan.Zero,
                [],
                []);
        }

        return await GenerateFingerprintsForFunctionsAsync(
            functions,
            buildVariantId,
            opts,
            ct);
    }

    /// <inheritdoc />
    public async Task<BatchFingerprintResult> GenerateFingerprintsForLibraryAsync(
        string libraryName,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new BatchFingerprintOptions();

        _logger.LogInformation(
            "Starting batch fingerprint generation for library {Library}",
            libraryName);

        var library = await _repository.GetLibraryAsync(libraryName, ct);
        if (library is null)
        {
            _logger.LogWarning("Library {Library} not found", libraryName);
            return new BatchFingerprintResult(
                Guid.Empty,
                0,
                0,
                TimeSpan.Zero,
                ["Library not found"],
                []);
        }

        // Get all versions
        var versions = await _repository.ListVersionsAsync(libraryName, ct);

        var totalFunctions = 0;
        var totalFingerprints = 0;
        var totalDuration = TimeSpan.Zero;
        var allErrors = new List<string>();
        var allWarnings = new List<string>();

        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            // Get build variants for this version
            var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

            foreach (var variant in variants)
            {
                ct.ThrowIfCancellationRequested();

                var result = await GenerateFingerprintsAsync(variant.Id, opts, ct);

                totalFunctions += result.FunctionsProcessed;
                totalFingerprints += result.FingerprintsGenerated;
                totalDuration += result.Duration;
                allErrors.AddRange(result.Errors);
                allWarnings.AddRange(result.Warnings);
            }
        }

        return new BatchFingerprintResult(
            library.Id,
            totalFunctions,
            totalFingerprints,
            totalDuration,
            [.. allErrors],
            [.. allWarnings]);
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<FingerprintProgress> StreamProgressAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        var opts = options ?? new BatchFingerprintOptions();

        var functions = await _repository.GetFunctionsForVariantAsync(buildVariantId, ct);
        var total = functions.Length;
        var processed = 0;
        var errors = 0;

        var channel = Channel.CreateBounded<FingerprintWorkItem>(new BoundedChannelOptions(opts.BatchSize * 2)
        {
            FullMode = BoundedChannelFullMode.Wait
        });

        // Producer task: read functions and queue them
        var producerTask = Task.Run(async () =>
        {
            try
            {
                foreach (var function in functions)
                {
                    ct.ThrowIfCancellationRequested();
                    await channel.Writer.WriteAsync(new FingerprintWorkItem(function), ct);
                }
            }
            finally
            {
                channel.Writer.Complete();
            }
        }, ct);

        // Consumer: process batches and yield progress
        var batch = new List<FingerprintWorkItem>();

        await foreach (var item in channel.Reader.ReadAllAsync(ct))
        {
            batch.Add(item);

            if (batch.Count >= opts.BatchSize)
            {
                var batchResult = await ProcessBatchAsync(batch, opts, ct);
                processed += batchResult.Processed;
                errors += batchResult.Errors;
                batch.Clear();

                yield return new FingerprintProgress(
                    processed,
                    total,
                    errors,
                    (double)processed / total);
            }
        }

        // Process remaining items
        if (batch.Count > 0)
        {
            var batchResult = await ProcessBatchAsync(batch, opts, ct);
            processed += batchResult.Processed;
            errors += batchResult.Errors;

            yield return new FingerprintProgress(
                processed,
                total,
                errors,
                1.0);
        }

        await producerTask;
    }

    #region Private Methods

    private async Task<BatchFingerprintResult> GenerateFingerprintsForFunctionsAsync(
        ImmutableArray<CorpusFunction> functions,
        Guid contextId,
        BatchFingerprintOptions options,
        CancellationToken ct)
    {
        var startTime = DateTime.UtcNow;
        var processed = 0;
        var generated = 0;
        var errors = new List<string>();
        var warnings = new List<string>();

        // Process in batches with parallelism
        var batches = functions
            .Select((f, i) => new { Function = f, Index = i })
            .GroupBy(x => x.Index / options.BatchSize)
            .Select(g => g.Select(x => x.Function).ToList())
            .ToList();

        foreach (var batch in batches)
        {
            ct.ThrowIfCancellationRequested();

            using var semaphore = new SemaphoreSlim(options.ParallelDegree);
            var batchFingerprints = new List<CorpusFingerprint>();

            var tasks = batch.Select(async function =>
            {
                await semaphore.WaitAsync(ct);
                try
                {
                    var fingerprints = await GenerateFingerprintsForFunctionAsync(function, options, ct);
                    lock (batchFingerprints)
                    {
                        batchFingerprints.AddRange(fingerprints);
                    }
                    Interlocked.Increment(ref processed);
                }
                catch (Exception ex)
                {
                    lock (errors)
                    {
                        errors.Add($"Function {function.Name}: {ex.Message}");
                    }
                }
                finally
                {
                    semaphore.Release();
                }
            });

            await Task.WhenAll(tasks);

            // Batch insert fingerprints
            if (batchFingerprints.Count > 0)
            {
                var insertedCount = await _repository.InsertFingerprintsAsync(batchFingerprints, ct);
                generated += insertedCount;
            }
        }

        var duration = DateTime.UtcNow - startTime;

        _logger.LogInformation(
            "Batch fingerprint generation completed: {Functions} functions, {Fingerprints} fingerprints in {Duration:c}",
            processed,
            generated,
            duration);

        return new BatchFingerprintResult(
            contextId,
            processed,
            generated,
            duration,
            [.. errors],
            [.. warnings]);
    }

    private async Task<ImmutableArray<CorpusFingerprint>> GenerateFingerprintsForFunctionAsync(
        CorpusFunction function,
        BatchFingerprintOptions options,
        CancellationToken ct)
    {
        var fingerprints = new List<CorpusFingerprint>();

        foreach (var algorithm in options.Algorithms)
        {
            ct.ThrowIfCancellationRequested();

            var generator = _generatorFactory.GetGenerator(algorithm);
            if (generator is null)
            {
                continue;
            }

            var fingerprint = await generator.GenerateAsync(function, ct);
            if (fingerprint is not null)
            {
                fingerprints.Add(new CorpusFingerprint(
                    Guid.NewGuid(),
                    function.Id,
                    algorithm,
                    fingerprint.Hash,
                    Convert.ToHexStringLower(fingerprint.Hash),
                    fingerprint.Metadata,
                    DateTimeOffset.UtcNow));
            }
        }

        return [.. fingerprints];
    }

    private async Task<(int Processed, int Errors)> ProcessBatchAsync(
        List<FingerprintWorkItem> batch,
        BatchFingerprintOptions options,
        CancellationToken ct)
    {
        var processed = 0;
        var errors = 0;

        var allFingerprints = new List<CorpusFingerprint>();

        using var semaphore = new SemaphoreSlim(options.ParallelDegree);

        var tasks = batch.Select(async item =>
        {
            await semaphore.WaitAsync(ct);
            try
            {
                var fingerprints = await GenerateFingerprintsForFunctionAsync(item.Function, options, ct);
                lock (allFingerprints)
                {
                    allFingerprints.AddRange(fingerprints);
                }
                Interlocked.Increment(ref processed);
            }
            catch
            {
                Interlocked.Increment(ref errors);
            }
            finally
            {
                semaphore.Release();
            }
        });

        await Task.WhenAll(tasks);

        if (allFingerprints.Count > 0)
        {
            await _repository.InsertFingerprintsAsync(allFingerprints, ct);
        }

        return (processed, errors);
    }

    #endregion

    private sealed record FingerprintWorkItem(CorpusFunction Function);
}
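
// Illustrative only: consuming the progress stream from the pipeline above,
// e.g. to feed a CLI progress bar.
internal static class PipelineProgressExample
{
    public static async Task RunAsync(IBatchFingerprintPipeline pipeline, Guid variantId, CancellationToken ct)
    {
        await foreach (var progress in pipeline.StreamProgressAsync(variantId, options: null, ct))
        {
            Console.WriteLine($"{progress.Processed}/{progress.Total} ({progress.PercentComplete:P0}), {progress.Errors} errors");
        }
    }
}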

/// <summary>
/// Interface for batch fingerprint generation.
/// </summary>
public interface IBatchFingerprintPipeline
{
    /// <summary>
    /// Generate fingerprints for all functions in a build variant.
    /// </summary>
    Task<BatchFingerprintResult> GenerateFingerprintsAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate fingerprints for all functions in a library.
    /// </summary>
    Task<BatchFingerprintResult> GenerateFingerprintsForLibraryAsync(
        string libraryName,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Stream progress for fingerprint generation.
    /// </summary>
    IAsyncEnumerable<FingerprintProgress> StreamProgressAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default);
}

/// <summary>
/// Options for batch fingerprint generation.
/// </summary>
public sealed record BatchFingerprintOptions
{
    /// <summary>
    /// Number of functions to process per batch.
    /// </summary>
    public int BatchSize { get; init; } = 100;

    /// <summary>
    /// Degree of parallelism for processing.
    /// </summary>
    public int ParallelDegree { get; init; } = 4;

    /// <summary>
    /// Algorithms to generate fingerprints for.
    /// </summary>
    public ImmutableArray<FingerprintAlgorithm> Algorithms { get; init; } =
        [FingerprintAlgorithm.SemanticKsg, FingerprintAlgorithm.InstructionBb, FingerprintAlgorithm.CfgWl];
}

/// <summary>
/// Result of batch fingerprint generation.
/// </summary>
public sealed record BatchFingerprintResult(
    Guid ContextId,
    int FunctionsProcessed,
    int FingerprintsGenerated,
    TimeSpan Duration,
    ImmutableArray<string> Errors,
    ImmutableArray<string> Warnings);

/// <summary>
/// Progress update for fingerprint generation.
/// </summary>
public sealed record FingerprintProgress(
    int Processed,
    int Total,
    int Errors,
    double PercentComplete);

/// <summary>
/// Factory for creating fingerprint generators.
/// </summary>
public interface IFingerprintGeneratorFactory
{
    /// <summary>
    /// Get a fingerprint generator for the specified algorithm.
    /// </summary>
    ICorpusFingerprintGenerator? GetGenerator(FingerprintAlgorithm algorithm);
}

/// <summary>
/// Interface for corpus fingerprint generation.
/// </summary>
public interface ICorpusFingerprintGenerator
{
    /// <summary>
    /// Generate a fingerprint for a corpus function.
    /// </summary>
    Task<GeneratedFingerprint?> GenerateAsync(
        CorpusFunction function,
        CancellationToken ct = default);
}

/// <summary>
/// A generated fingerprint.
/// </summary>
public sealed record GeneratedFingerprint(
    byte[] Hash,
    FingerprintMetadata? Metadata);
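
// Illustrative only: a deliberately trivial ICorpusFingerprintGenerator that
// hashes the function's name and size. Real generators operate on disassembled
// bodies (KSG, basic blocks, CFG); this exists purely to show the contract.
internal sealed class NameSizeFingerprintGenerator : ICorpusFingerprintGenerator
{
    public Task<GeneratedFingerprint?> GenerateAsync(CorpusFunction function, CancellationToken ct = default)
    {
        var input = System.Text.Encoding.UTF8.GetBytes($"{function.Name}:{function.SizeBytes}");
        var hash = System.Security.Cryptography.SHA256.HashData(input);
        return Task.FromResult<GeneratedFingerprint?>(new GeneratedFingerprint(hash, Metadata: null));
    }
}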
@@ -0,0 +1,466 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Diagnostics;
|
||||
using System.Security.Cryptography;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Service for ingesting library binaries into the function corpus.
|
||||
/// </summary>
|
||||
public sealed class CorpusIngestionService : ICorpusIngestionService
|
||||
{
|
||||
private readonly ICorpusRepository _repository;
|
||||
private readonly IFingerprintGenerator? _fingerprintGenerator;
|
||||
private readonly IFunctionExtractor? _functionExtractor;
|
||||
private readonly ILogger<CorpusIngestionService> _logger;
|
||||
|
||||
public CorpusIngestionService(
|
||||
ICorpusRepository repository,
|
||||
ILogger<CorpusIngestionService> logger,
|
||||
IFingerprintGenerator? fingerprintGenerator = null,
|
||||
IFunctionExtractor? functionExtractor = null)
|
||||
{
|
||||
_repository = repository;
|
||||
_logger = logger;
|
||||
_fingerprintGenerator = fingerprintGenerator;
|
||||
_functionExtractor = functionExtractor;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IngestionResult> IngestLibraryAsync(
|
||||
LibraryIngestionMetadata metadata,
|
||||
Stream binaryStream,
|
||||
IngestionOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(metadata);
|
||||
ArgumentNullException.ThrowIfNull(binaryStream);
|
||||
|
||||
var opts = options ?? new IngestionOptions();
|
||||
var stopwatch = Stopwatch.StartNew();
|
||||
var warnings = new List<string>();
|
||||
var errors = new List<string>();
|
||||
|
||||
_logger.LogInformation(
|
||||
"Starting ingestion for {Library} {Version} ({Architecture})",
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture);
|
||||
|
||||
// Compute binary hash
|
||||
var binarySha256 = await ComputeSha256Async(binaryStream, ct);
|
||||
binaryStream.Position = 0; // Reset for reading
|
||||
|
||||
// Check if we've already indexed this exact binary
|
||||
var existingVariant = await _repository.GetBuildVariantBySha256Async(binarySha256, ct);
|
||||
if (existingVariant is not null)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Binary {Sha256} already indexed as variant {VariantId}",
|
||||
binarySha256[..16],
|
||||
existingVariant.Id);
|
||||
|
||||
stopwatch.Stop();
|
||||
return new IngestionResult(
|
||||
Guid.Empty,
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
stopwatch.Elapsed,
|
||||
["Binary already indexed."],
|
||||
[]);
|
||||
}
|
||||
|
||||
// Create or get library record
|
||||
var library = await _repository.GetOrCreateLibraryAsync(
|
||||
metadata.Name,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ct);
|
||||
|
||||
// Create ingestion job
|
||||
var job = await _repository.CreateIngestionJobAsync(
|
||||
library.Id,
|
||||
IngestionJobType.FullIngest,
|
||||
ct);
|
||||
|
||||
try
|
||||
{
|
||||
await _repository.UpdateIngestionJobAsync(
|
||||
job.Id,
|
||||
IngestionJobStatus.Running,
|
||||
ct: ct);
|
||||
|
||||
// Create or get version record
|
||||
var version = await _repository.GetOrCreateVersionAsync(
|
||||
library.Id,
|
||||
metadata.Version,
|
||||
metadata.ReleaseDate,
|
||||
metadata.IsSecurityRelease,
|
||||
metadata.SourceArchiveSha256,
|
||||
ct);
|
||||
|
||||
// Create build variant record
|
||||
var variant = await _repository.GetOrCreateBuildVariantAsync(
|
||||
version.Id,
|
||||
metadata.Architecture,
|
||||
binarySha256,
|
||||
metadata.Abi,
|
||||
metadata.Compiler,
|
||||
metadata.CompilerVersion,
|
||||
metadata.OptimizationLevel,
|
||||
null,
|
||||
ct);
|
||||
|
||||
// Extract functions from binary
|
||||
var functions = await ExtractFunctionsAsync(binaryStream, variant.Id, opts, warnings, ct);
|
||||
|
||||
// Filter functions based on options
|
||||
functions = ApplyFunctionFilters(functions, opts);
|
||||
|
||||
// Insert functions into database
|
||||
var insertedCount = await _repository.InsertFunctionsAsync(functions, ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Extracted and inserted {Count} functions from {Library} {Version}",
|
||||
insertedCount,
|
||||
metadata.Name,
|
||||
metadata.Version);
|
||||
|
||||
// Generate fingerprints for each function
|
||||
var fingerprintsGenerated = 0;
|
||||
if (_fingerprintGenerator is not null)
|
||||
{
|
||||
fingerprintsGenerated = await GenerateFingerprintsAsync(functions, opts, ct);
|
||||
}
|
||||
|
||||
// Generate clusters if enabled
|
||||
var clustersCreated = 0;
|
||||
if (opts.GenerateClusters)
|
||||
{
|
||||
clustersCreated = await GenerateClustersAsync(library.Id, functions, ct);
|
||||
}
|
||||
|
||||
// Update job with success
|
||||
await _repository.UpdateIngestionJobAsync(
|
||||
job.Id,
|
||||
IngestionJobStatus.Completed,
|
||||
functionsIndexed: insertedCount,
|
||||
fingerprintsGenerated: fingerprintsGenerated,
|
||||
clustersCreated: clustersCreated,
|
||||
ct: ct);
|
||||
|
||||
stopwatch.Stop();
|
||||
return new IngestionResult(
|
||||
job.Id,
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture,
|
||||
insertedCount,
|
||||
fingerprintsGenerated,
|
||||
clustersCreated,
|
||||
stopwatch.Elapsed,
|
||||
[],
|
||||
[.. warnings]);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Ingestion failed for {Library} {Version}",
|
||||
metadata.Name,
|
||||
metadata.Version);
|
||||
|
||||
await _repository.UpdateIngestionJobAsync(
|
||||
job.Id,
|
||||
IngestionJobStatus.Failed,
|
||||
errors: [ex.Message],
|
||||
ct: ct);
|
||||
|
||||
stopwatch.Stop();
|
||||
return new IngestionResult(
|
||||
job.Id,
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
stopwatch.Elapsed,
|
||||
[ex.Message],
|
||||
[.. warnings]);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async IAsyncEnumerable<IngestionResult> IngestFromConnectorAsync(
|
||||
string libraryName,
|
||||
ILibraryCorpusConnector connector,
|
||||
IngestionOptions? options = null,
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrEmpty(libraryName);
|
||||
ArgumentNullException.ThrowIfNull(connector);
|
||||
|
||||
var opts = options ?? new IngestionOptions();
|
||||
|
||||
_logger.LogInformation(
|
||||
"Starting bulk ingestion from {Connector} for library {Library}",
|
||||
connector.LibraryName,
|
||||
libraryName);
|
||||
|
||||
// Get available versions
|
||||
var versions = await connector.GetAvailableVersionsAsync(ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Found {Count} versions for {Library}",
|
||||
versions.Length,
|
||||
libraryName);
|
||||
|
||||
var fetchOptions = new LibraryFetchOptions
|
||||
{
|
||||
IncludeDebugSymbols = true
|
||||
};
|
||||
|
||||
// Process each architecture
|
||||
foreach (var arch in connector.SupportedArchitectures)
|
||||
{
|
||||
await foreach (var binary in connector.FetchBinariesAsync(
|
||||
[.. versions],
|
||||
arch,
|
||||
fetchOptions,
|
||||
ct))
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
|
||||
using (binary)
|
||||
{
|
||||
var metadata = new LibraryIngestionMetadata(
|
||||
libraryName,
|
||||
binary.Version,
|
||||
binary.Architecture,
|
||||
binary.Abi,
|
||||
binary.Compiler,
|
||||
binary.CompilerVersion,
|
||||
binary.OptimizationLevel,
|
||||
binary.ReleaseDate,
|
||||
false,
|
||||
null);
|
||||
|
||||
var result = await IngestLibraryAsync(metadata, binary.BinaryStream, opts, ct);
|
||||
yield return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<int> UpdateCveAssociationsAsync(
|
||||
string cveId,
|
||||
IReadOnlyList<FunctionCveAssociation> associations,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrEmpty(cveId);
|
||||
ArgumentNullException.ThrowIfNull(associations);
|
||||
|
||||
if (associations.Count == 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Updating CVE associations for {CveId} ({Count} functions)",
|
||||
cveId,
|
||||
associations.Count);
|
||||
|
||||
// Convert to FunctionCve records
|
||||
var cveRecords = associations.Select(a => new FunctionCve(
|
||||
a.FunctionId,
|
||||
cveId,
|
||||
a.AffectedState,
|
||||
a.PatchCommit,
|
||||
a.Confidence,
|
||||
a.EvidenceType)).ToList();
|
||||
|
||||
return await _repository.UpsertCveAssociationsAsync(cveId, cveRecords, ct);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IngestionJob?> GetJobStatusAsync(Guid jobId, CancellationToken ct = default)
|
||||
{
|
||||
return await _repository.GetIngestionJobAsync(jobId, ct);
|
||||
}
|
||||
|
||||
#region Private Methods
|
||||
|
||||
private async Task<ImmutableArray<CorpusFunction>> ExtractFunctionsAsync(
|
||||
Stream binaryStream,
|
||||
Guid buildVariantId,
|
||||
IngestionOptions options,
|
||||
List<string> warnings,
|
||||
CancellationToken ct)
|
||||
{
|
||||
if (_functionExtractor is null)
|
||||
{
|
||||
warnings.Add("No function extractor configured, returning empty function list");
|
||||
_logger.LogWarning("No function extractor configured");
|
||||
return [];
|
||||
}
|
||||
|
||||
var extractedFunctions = await _functionExtractor.ExtractFunctionsAsync(binaryStream, ct);
|
||||
|
||||
// Convert to corpus functions with IDs
|
||||
var functions = extractedFunctions.Select(f => new CorpusFunction(
|
||||
Guid.NewGuid(),
|
||||
buildVariantId,
|
||||
f.Name,
|
||||
f.DemangledName,
|
||||
f.Address,
|
||||
f.SizeBytes,
|
||||
f.IsExported,
|
||||
f.IsInline,
|
||||
f.SourceFile,
|
||||
f.SourceLine)).ToImmutableArray();
|
||||
|
||||
return functions;
|
||||
}
|
||||
|
||||
private static ImmutableArray<CorpusFunction> ApplyFunctionFilters(
|
||||
ImmutableArray<CorpusFunction> functions,
|
||||
IngestionOptions options)
|
||||
{
|
||||
var filtered = functions
|
||||
.Where(f => f.SizeBytes >= options.MinFunctionSize)
|
||||
.Where(f => !options.ExportedOnly || f.IsExported)
|
||||
            .Take(options.MaxFunctionsPerBinary);

        return [.. filtered];
    }

    private async Task<int> GenerateFingerprintsAsync(
        ImmutableArray<CorpusFunction> functions,
        IngestionOptions options,
        CancellationToken ct)
    {
        if (_fingerprintGenerator is null)
        {
            return 0;
        }

        var allFingerprints = new List<CorpusFingerprint>();

        // Process in parallel with degree limit
        var semaphore = new SemaphoreSlim(options.ParallelDegree);

        var tasks = functions.Select(async function =>
        {
            await semaphore.WaitAsync(ct);
            try
            {
                var fingerprints = await _fingerprintGenerator.GenerateFingerprintsAsync(function.Id, ct);
                lock (allFingerprints)
                {
                    allFingerprints.AddRange(fingerprints);
                }
            }
            finally
            {
                semaphore.Release();
            }
        });

        await Task.WhenAll(tasks);

        if (allFingerprints.Count > 0)
        {
            return await _repository.InsertFingerprintsAsync(allFingerprints, ct);
        }

        return 0;
    }

    private async Task<int> GenerateClustersAsync(
        Guid libraryId,
        ImmutableArray<CorpusFunction> functions,
        CancellationToken ct)
    {
        // Simple clustering: group functions by demangled name (if available) or name
        var clusters = functions
            .GroupBy(f => f.DemangledName ?? f.Name)
            .Where(g => g.Count() > 1) // Only create clusters for functions appearing multiple times
            .ToList();

        var clustersCreated = 0;

        foreach (var group in clusters)
        {
            ct.ThrowIfCancellationRequested();

            var cluster = await _repository.GetOrCreateClusterAsync(
                libraryId,
                group.Key,
                null,
                ct);

            var members = group.Select(f => new ClusterMember(cluster.Id, f.Id, 1.0m)).ToList();

            await _repository.AddClusterMembersAsync(cluster.Id, members, ct);
            clustersCreated++;
        }

        return clustersCreated;
    }

    private static async Task<string> ComputeSha256Async(Stream stream, CancellationToken ct)
    {
        using var sha256 = SHA256.Create();
        var hash = await sha256.ComputeHashAsync(stream, ct);
        return Convert.ToHexStringLower(hash);
    }

    #endregion
}

/// <summary>
/// Interface for extracting functions from binary files.
/// </summary>
public interface IFunctionExtractor
{
    /// <summary>
    /// Extract functions from a binary stream.
    /// </summary>
    Task<ImmutableArray<ExtractedFunction>> ExtractFunctionsAsync(
        Stream binaryStream,
        CancellationToken ct = default);
}

/// <summary>
/// Interface for generating function fingerprints.
/// </summary>
public interface IFingerprintGenerator
{
    /// <summary>
    /// Generate fingerprints for a function.
    /// </summary>
    Task<ImmutableArray<CorpusFingerprint>> GenerateFingerprintsAsync(
        Guid functionId,
        CancellationToken ct = default);
}

/// <summary>
/// A function extracted from a binary.
/// </summary>
public sealed record ExtractedFunction(
    string Name,
    string? DemangledName,
    ulong Address,
    int SizeBytes,
    bool IsExported,
    bool IsInline,
    string? SourceFile,
    int? SourceLine);
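// Illustrative sketch (not part of this commit): a minimal in-memory stub of the
// IFunctionExtractor contract above, e.g. for wiring unit tests. The type name
// and the fake function values are hypothetical.
internal sealed class StubFunctionExtractor : IFunctionExtractor
{
    public Task<ImmutableArray<ExtractedFunction>> ExtractFunctionsAsync(
        Stream binaryStream,
        CancellationToken ct = default)
    {
        // Returns a single fake exported function instead of parsing the stream.
        var function = new ExtractedFunction(
            Name: "crc32",
            DemangledName: null,
            Address: 0x401000,
            SizeBytes: 128,
            IsExported: true,
            IsInline: false,
            SourceFile: null,
            SourceLine: null);
        return Task.FromResult(ImmutableArray.Create(function));
    }
}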
@@ -0,0 +1,419 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Services;

/// <summary>
/// Service for querying the function corpus to identify functions.
/// </summary>
public sealed class CorpusQueryService : ICorpusQueryService
{
    private readonly ICorpusRepository _repository;
    private readonly IClusterSimilarityComputer _similarityComputer;
    private readonly ILogger<CorpusQueryService> _logger;

    public CorpusQueryService(
        ICorpusRepository repository,
        IClusterSimilarityComputer similarityComputer,
        ILogger<CorpusQueryService> logger)
    {
        _repository = repository;
        _similarityComputer = similarityComputer;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<FunctionMatch>> IdentifyFunctionAsync(
        FunctionFingerprints fingerprints,
        IdentifyOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new IdentifyOptions();

        _logger.LogDebug("Identifying function with fingerprints");

        var candidates = new List<FunctionCandidate>();

        // Search by each available fingerprint type
        if (fingerprints.SemanticHash is { Length: > 0 })
        {
            var matches = await SearchByFingerprintAsync(
                FingerprintAlgorithm.SemanticKsg,
                fingerprints.SemanticHash,
                opts,
                ct);
            candidates.AddRange(matches);
        }

        if (fingerprints.InstructionHash is { Length: > 0 })
        {
            var matches = await SearchByFingerprintAsync(
                FingerprintAlgorithm.InstructionBb,
                fingerprints.InstructionHash,
                opts,
                ct);
            candidates.AddRange(matches);
        }

        if (fingerprints.CfgHash is { Length: > 0 })
        {
            var matches = await SearchByFingerprintAsync(
                FingerprintAlgorithm.CfgWl,
                fingerprints.CfgHash,
                opts,
                ct);
            candidates.AddRange(matches);
        }

        // Group candidates by function and compute combined similarity
        var groupedCandidates = candidates
            .GroupBy(c => c.FunctionId)
            .Select(g => ComputeCombinedScore(g, fingerprints, opts.Weights))
            .Where(c => c.Similarity >= opts.MinSimilarity)
            .OrderByDescending(c => c.Similarity)
            .Take(opts.MaxResults)
            .ToList();

        // Enrich with full function details
        var results = new List<FunctionMatch>();

        foreach (var candidate in groupedCandidates)
        {
            ct.ThrowIfCancellationRequested();

            // Get the original candidates for this function
            var functionCandidates = candidates.Where(c => c.FunctionId == candidate.FunctionId).ToList();

            var function = await _repository.GetFunctionAsync(candidate.FunctionId, ct);
            if (function is null) continue;

            var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
            if (variant is null) continue;

            // Apply filters
            if (opts.ArchitectureFilter is { Length: > 0 })
            {
                if (!opts.ArchitectureFilter.Value.Contains(variant.Architecture, StringComparer.OrdinalIgnoreCase))
                    continue;
            }

            var version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
            if (version is null) continue;

            var library = await _repository.GetLibraryByIdAsync(version.LibraryId, ct);
            if (library is null) continue;

            // Apply library filter
            if (opts.LibraryFilter is { Length: > 0 })
            {
                if (!opts.LibraryFilter.Value.Contains(library.Name, StringComparer.OrdinalIgnoreCase))
                    continue;
            }

            results.Add(new FunctionMatch(
                library.Name,
                version.Version,
                function.Name,
                function.DemangledName,
                candidate.Similarity,
                ComputeConfidence(candidate),
                variant.Architecture,
                variant.Abi,
                new MatchDetails(
                    GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.SemanticKsg),
                    GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.InstructionBb),
                    GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.CfgWl),
                    GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.ApiCalls),
                    [],
                    fingerprints.SizeBytes.HasValue
                        ? function.SizeBytes - fingerprints.SizeBytes.Value
                        : 0)));
        }

        return [.. results];
    }

    /// <inheritdoc />
    public async Task<ImmutableDictionary<int, ImmutableArray<FunctionMatch>>> IdentifyBatchAsync(
        IReadOnlyList<FunctionFingerprints> fingerprints,
        IdentifyOptions? options = null,
        CancellationToken ct = default)
    {
        var results = ImmutableDictionary.CreateBuilder<int, ImmutableArray<FunctionMatch>>();

        // Process in parallel with controlled concurrency
        var semaphore = new SemaphoreSlim(4);
        var tasks = fingerprints.Select(async (fp, index) =>
        {
            await semaphore.WaitAsync(ct);
            try
            {
                var matches = await IdentifyFunctionAsync(fp, options, ct);
                return (Index: index, Matches: matches);
            }
            finally
            {
                semaphore.Release();
            }
        });

        var completedResults = await Task.WhenAll(tasks);

        foreach (var result in completedResults)
        {
            results.Add(result.Index, result.Matches);
        }

        return results.ToImmutable();
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<CorpusFunctionWithCve>> GetFunctionsForCveAsync(
        string cveId,
        CancellationToken ct = default)
    {
        _logger.LogDebug("Getting functions for CVE {CveId}", cveId);

        var functionIds = await _repository.GetFunctionIdsForCveAsync(cveId, ct);
        var results = new List<CorpusFunctionWithCve>();

        foreach (var functionId in functionIds)
        {
            ct.ThrowIfCancellationRequested();

            var function = await _repository.GetFunctionAsync(functionId, ct);
            if (function is null) continue;

            var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
            if (variant is null) continue;

            var version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
            if (version is null) continue;

            var library = await _repository.GetLibraryByIdAsync(version.LibraryId, ct);
            if (library is null) continue;

            var cves = await _repository.GetCvesForFunctionAsync(functionId, ct);
            var cveInfo = cves.FirstOrDefault(c => c.CveId == cveId);
            if (cveInfo is null) continue;

            results.Add(new CorpusFunctionWithCve(function, library, version, variant, cveInfo));
        }

        return [.. results];
    }

    /// <inheritdoc />
    public async Task<FunctionEvolution?> GetFunctionEvolutionAsync(
        string libraryName,
        string functionName,
        CancellationToken ct = default)
    {
        _logger.LogDebug("Getting evolution for function {Function} in {Library}", functionName, libraryName);

        var library = await _repository.GetLibraryAsync(libraryName, ct);
        if (library is null)
        {
            return null;
        }

        var versions = await _repository.ListVersionsAsync(libraryName, ct);
        var snapshots = new List<FunctionVersionInfo>();
        string? previousFingerprintHex = null;

        foreach (var versionSummary in versions.OrderBy(v => v.ReleaseDate))
        {
            ct.ThrowIfCancellationRequested();

            var version = await _repository.GetVersionAsync(library.Id, versionSummary.Version, ct);
            if (version is null) continue;

            var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

            // Find the function in any variant
            CorpusFunction? targetFunction = null;
            CorpusFingerprint? fingerprint = null;

            foreach (var variant in variants)
            {
                var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);
                targetFunction = functions.FirstOrDefault(f =>
                    string.Equals(f.Name, functionName, StringComparison.Ordinal) ||
                    string.Equals(f.DemangledName, functionName, StringComparison.Ordinal));

                if (targetFunction is not null)
                {
                    var fps = await _repository.GetFingerprintsAsync(targetFunction.Id, ct);
                    fingerprint = fps.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);
                    break;
                }
            }

            if (targetFunction is null)
            {
                continue;
            }

            // Get CVE info for this version
            var cves = await _repository.GetCvesForFunctionAsync(targetFunction.Id, ct);
            var cveIds = cves.Select(c => c.CveId).ToImmutableArray();

            // Compute similarity to previous version if available
            decimal? similarityToPrevious = null;
            var currentFingerprintHex = fingerprint?.FingerprintHex ?? string.Empty;
            if (previousFingerprintHex is not null && currentFingerprintHex.Length > 0)
            {
                // Simple comparison: same hash = 1.0, different = 0.5 (would need proper similarity for better results)
                similarityToPrevious = string.Equals(previousFingerprintHex, currentFingerprintHex, StringComparison.Ordinal)
                    ? 1.0m
                    : 0.5m;
            }
            previousFingerprintHex = currentFingerprintHex;

            snapshots.Add(new FunctionVersionInfo(
                versionSummary.Version,
                versionSummary.ReleaseDate,
                targetFunction.SizeBytes,
                currentFingerprintHex,
                similarityToPrevious,
                cveIds.Length > 0 ? cveIds : null));
        }

        if (snapshots.Count == 0)
        {
            return null;
        }

        return new FunctionEvolution(libraryName, functionName, [.. snapshots]);
    }

    /// <inheritdoc />
    public async Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default)
    {
        return await _repository.GetStatisticsAsync(ct);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default)
    {
        return await _repository.ListLibrariesAsync(ct);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
        string libraryName,
        CancellationToken ct = default)
    {
        return await _repository.ListVersionsAsync(libraryName, ct);
    }

    #region Private Methods

    private async Task<List<FunctionCandidate>> SearchByFingerprintAsync(
        FingerprintAlgorithm algorithm,
        byte[] fingerprint,
        IdentifyOptions options,
        CancellationToken ct)
    {
        var candidates = new List<FunctionCandidate>();

        // First try exact match
        var exactMatches = await _repository.FindFunctionsByFingerprintAsync(algorithm, fingerprint, ct);
        foreach (var functionId in exactMatches)
        {
            candidates.Add(new FunctionCandidate(functionId, algorithm, 1.0m, fingerprint));
        }

        // Then try approximate matching
        var similarResults = await _repository.FindSimilarFingerprintsAsync(
            algorithm,
            fingerprint,
            options.MaxResults * 2, // Get more to account for filtering
            ct);

        foreach (var result in similarResults)
        {
            if (!candidates.Any(c => c.FunctionId == result.FunctionId))
            {
                candidates.Add(new FunctionCandidate(
                    result.FunctionId,
                    algorithm,
                    result.Similarity,
                    result.Fingerprint));
            }
        }

        return candidates;
    }

    private static CombinedCandidate ComputeCombinedScore(
        IGrouping<Guid, FunctionCandidate> group,
        FunctionFingerprints query,
        SimilarityWeights weights)
    {
        var candidates = group.ToList();

        decimal totalScore = 0;
        decimal totalWeight = 0;
        var algorithms = new List<FingerprintAlgorithm>();

        foreach (var candidate in candidates)
        {
            var weight = candidate.Algorithm switch
            {
                FingerprintAlgorithm.SemanticKsg => weights.SemanticWeight,
                FingerprintAlgorithm.InstructionBb => weights.InstructionWeight,
                FingerprintAlgorithm.CfgWl => weights.CfgWeight,
                FingerprintAlgorithm.ApiCalls => weights.ApiCallWeight,
                _ => 0.1m
            };

            totalScore += candidate.Similarity * weight;
            totalWeight += weight;
            algorithms.Add(candidate.Algorithm);
        }

        var combinedSimilarity = totalWeight > 0 ? totalScore / totalWeight : 0;

        return new CombinedCandidate(group.Key, combinedSimilarity, [.. algorithms]);
    }

    private static MatchConfidence ComputeConfidence(CombinedCandidate candidate)
    {
        // Higher confidence with more matching algorithms and higher similarity
        var algorithmCount = candidate.MatchingAlgorithms.Length;
        var similarity = candidate.Similarity;

        if (algorithmCount >= 3 && similarity >= 0.95m)
            return MatchConfidence.Exact;
        if (algorithmCount >= 3 && similarity >= 0.85m)
            return MatchConfidence.VeryHigh;
        if (algorithmCount >= 2 && similarity >= 0.85m)
            return MatchConfidence.High;
        if (algorithmCount >= 1 && similarity >= 0.70m)
            return MatchConfidence.Medium;
        return MatchConfidence.Low;
    }

    private static decimal GetAlgorithmSimilarity(
        List<FunctionCandidate> candidates,
        FingerprintAlgorithm algorithm)
    {
        var match = candidates.FirstOrDefault(c => c.Algorithm == algorithm);
        return match?.Similarity ?? 0m;
    }

    #endregion

    private sealed record FunctionCandidate(
        Guid FunctionId,
        FingerprintAlgorithm Algorithm,
        decimal Similarity,
        byte[] Fingerprint);

    private sealed record CombinedCandidate(
        Guid FunctionId,
        decimal Similarity,
        ImmutableArray<FingerprintAlgorithm> MatchingAlgorithms);
}
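// Usage sketch (illustrative, not part of this commit); assumes an
// ICorpusQueryService instance resolved from DI and a populated corpus:
//
//   var matches = await queryService.IdentifyFunctionAsync(
//       fingerprints,
//       new IdentifyOptions { MinSimilarity = 0.8m, MaxResults = 10 },
//       ct);
//
// Worked example for ComputeCombinedScore, assuming weights of 0.4 (semantic)
// and 0.3 (instruction): a function matching the semantic index at 1.0 and the
// instruction index at 0.8 scores (1.0 * 0.4 + 0.8 * 0.3) / (0.4 + 0.3) ≈ 0.914.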
@@ -0,0 +1,423 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Services;

/// <summary>
/// Service for updating CVE-to-function mappings in the corpus.
/// </summary>
public sealed class CveFunctionMappingUpdater : ICveFunctionMappingUpdater
{
    private readonly ICorpusRepository _repository;
    private readonly ICveDataProvider _cveDataProvider;
    private readonly ILogger<CveFunctionMappingUpdater> _logger;

    public CveFunctionMappingUpdater(
        ICorpusRepository repository,
        ICveDataProvider cveDataProvider,
        ILogger<CveFunctionMappingUpdater> logger)
    {
        _repository = repository;
        _cveDataProvider = cveDataProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<CveMappingUpdateResult> UpdateMappingsForCveAsync(
        string cveId,
        CancellationToken ct = default)
    {
        _logger.LogInformation("Updating function mappings for CVE {CveId}", cveId);

        var startTime = DateTime.UtcNow;
        var errors = new List<string>();
        var functionsUpdated = 0;

        try
        {
            // Get CVE details from provider
            var cveDetails = await _cveDataProvider.GetCveDetailsAsync(cveId, ct);
            if (cveDetails is null)
            {
                return new CveMappingUpdateResult(
                    cveId,
                    0,
                    DateTime.UtcNow - startTime,
                    [$"CVE {cveId} not found in data provider"]);
            }

            // Get affected library
            var library = await _repository.GetLibraryAsync(cveDetails.AffectedLibrary, ct);
            if (library is null)
            {
                return new CveMappingUpdateResult(
                    cveId,
                    0,
                    DateTime.UtcNow - startTime,
                    [$"Library {cveDetails.AffectedLibrary} not found in corpus"]);
            }

            // Process affected versions
            var associations = new List<FunctionCve>();

            foreach (var affectedVersion in cveDetails.AffectedVersions)
            {
                ct.ThrowIfCancellationRequested();

                // Find matching version in corpus
                var version = await FindMatchingVersionAsync(library.Id, affectedVersion, ct);
                if (version is null)
                {
                    _logger.LogDebug("Version {Version} not found in corpus", affectedVersion);
                    continue;
                }

                // Get all build variants for this version
                var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

                foreach (var variant in variants)
                {
                    // Get functions in this variant
                    var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);

                    // If we have specific function names, only map those
                    if (cveDetails.AffectedFunctions.Length > 0)
                    {
                        var matchedFunctions = functions.Where(f =>
                            cveDetails.AffectedFunctions.Contains(f.Name, StringComparer.Ordinal) ||
                            (f.DemangledName is not null &&
                             cveDetails.AffectedFunctions.Contains(f.DemangledName, StringComparer.Ordinal)));

                        foreach (var function in matchedFunctions)
                        {
                            associations.Add(CreateAssociation(function.Id, cveId, cveDetails, affectedVersion));
                            functionsUpdated++;
                        }
                    }
                    else
                    {
                        // Map all functions in affected variant as potentially affected
                        foreach (var function in functions.Take(100)) // Limit to avoid huge updates
                        {
                            associations.Add(CreateAssociation(function.Id, cveId, cveDetails, affectedVersion));
                            functionsUpdated++;
                        }
                    }
                }
            }

            // Upsert all associations
            if (associations.Count > 0)
            {
                await _repository.UpsertCveAssociationsAsync(cveId, associations, ct);
            }

            var duration = DateTime.UtcNow - startTime;
            _logger.LogInformation(
                "Updated {Count} function mappings for CVE {CveId} in {Duration:c}",
                functionsUpdated, cveId, duration);

            return new CveMappingUpdateResult(cveId, functionsUpdated, duration, [.. errors]);
        }
        catch (Exception ex)
        {
            errors.Add(ex.Message);
            _logger.LogError(ex, "Error updating mappings for CVE {CveId}", cveId);
            return new CveMappingUpdateResult(cveId, functionsUpdated, DateTime.UtcNow - startTime, [.. errors]);
        }
    }

    /// <inheritdoc />
    public async Task<CveBatchMappingResult> UpdateMappingsForLibraryAsync(
        string libraryName,
        CancellationToken ct = default)
    {
        _logger.LogInformation("Updating all CVE mappings for library {Library}", libraryName);

        var startTime = DateTime.UtcNow;
        var results = new List<CveMappingUpdateResult>();

        // Get all CVEs for this library
        var cves = await _cveDataProvider.GetCvesForLibraryAsync(libraryName, ct);

        foreach (var cveId in cves)
        {
            ct.ThrowIfCancellationRequested();

            var result = await UpdateMappingsForCveAsync(cveId, ct);
            results.Add(result);
        }

        var totalDuration = DateTime.UtcNow - startTime;

        return new CveBatchMappingResult(
            libraryName,
            results.Count,
            results.Sum(r => r.FunctionsUpdated),
            totalDuration,
            [.. results.Where(r => r.Errors.Length > 0).SelectMany(r => r.Errors)]);
    }

    /// <inheritdoc />
    public async Task<CveMappingUpdateResult> MarkFunctionFixedAsync(
        string cveId,
        string libraryName,
        string version,
        string? functionName,
        string? patchCommit,
        CancellationToken ct = default)
    {
        _logger.LogInformation(
            "Marking functions as fixed for CVE {CveId} in {Library} {Version}",
            cveId, libraryName, version);

        var startTime = DateTime.UtcNow;
        var functionsUpdated = 0;

        var library = await _repository.GetLibraryAsync(libraryName, ct);
        if (library is null)
        {
            return new CveMappingUpdateResult(
                cveId, 0, DateTime.UtcNow - startTime,
                [$"Library {libraryName} not found"]);
        }

        var libVersion = await _repository.GetVersionAsync(library.Id, version, ct);
        if (libVersion is null)
        {
            return new CveMappingUpdateResult(
                cveId, 0, DateTime.UtcNow - startTime,
                [$"Version {version} not found"]);
        }

        var variants = await _repository.GetBuildVariantsAsync(libVersion.Id, ct);
        var associations = new List<FunctionCve>();

        foreach (var variant in variants)
        {
            var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);

            IEnumerable<CorpusFunction> targetFunctions = functionName is null
                ? functions
                : functions.Where(f =>
                    string.Equals(f.Name, functionName, StringComparison.Ordinal) ||
                    string.Equals(f.DemangledName, functionName, StringComparison.Ordinal));

            foreach (var function in targetFunctions)
            {
                associations.Add(new FunctionCve(
                    function.Id,
                    cveId,
                    CveAffectedState.Fixed,
                    patchCommit,
                    0.9m, // High confidence for explicit marking
                    CveEvidenceType.Commit));
                functionsUpdated++;
            }
        }

        if (associations.Count > 0)
        {
            await _repository.UpsertCveAssociationsAsync(cveId, associations, ct);
        }

        return new CveMappingUpdateResult(
            cveId, functionsUpdated, DateTime.UtcNow - startTime, []);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<string>> GetUnmappedCvesAsync(
        string libraryName,
        CancellationToken ct = default)
    {
        // Get all known CVEs for this library
        var allCves = await _cveDataProvider.GetCvesForLibraryAsync(libraryName, ct);

        // Collect the CVEs that have no function mappings yet
        var unmapped = new List<string>();

        foreach (var cveId in allCves)
        {
            ct.ThrowIfCancellationRequested();

            var functionIds = await _repository.GetFunctionIdsForCveAsync(cveId, ct);
            if (functionIds.Length == 0)
            {
                unmapped.Add(cveId);
            }
        }

        return [.. unmapped];
    }

    #region Private Methods

    private async Task<LibraryVersion?> FindMatchingVersionAsync(
        Guid libraryId,
        string versionString,
        CancellationToken ct)
    {
        // Try exact match first
        var exactMatch = await _repository.GetVersionAsync(libraryId, versionString, ct);
        if (exactMatch is not null)
        {
            return exactMatch;
        }

        // Try with common prefixes/suffixes removed
        var normalizedVersion = NormalizeVersion(versionString);
        if (normalizedVersion != versionString)
        {
            return await _repository.GetVersionAsync(libraryId, normalizedVersion, ct);
        }

        return null;
    }

    private static string NormalizeVersion(string version)
    {
        // Remove common prefixes
        if (version.StartsWith("v", StringComparison.OrdinalIgnoreCase))
        {
            version = version[1..];
        }

        // Remove release suffixes
        var suffixIndex = version.IndexOfAny(['-', '+', '_']);
        if (suffixIndex > 0)
        {
            version = version[..suffixIndex];
        }

        return version;
    }

    private static FunctionCve CreateAssociation(
        Guid functionId,
        string cveId,
        CveDetails cveDetails,
        string version)
    {
        var isFixed = cveDetails.FixedVersions.Contains(version, StringComparer.OrdinalIgnoreCase);

        return new FunctionCve(
            functionId,
            cveId,
            isFixed ? CveAffectedState.Fixed : CveAffectedState.Vulnerable,
            cveDetails.PatchCommit,
            ComputeConfidence(cveDetails),
            cveDetails.EvidenceType);
    }

    private static decimal ComputeConfidence(CveDetails details)
    {
        // Higher confidence for specific function names and commit evidence
        var baseConfidence = 0.5m;

        if (details.AffectedFunctions.Length > 0)
        {
            baseConfidence += 0.2m;
        }

        if (!string.IsNullOrEmpty(details.PatchCommit))
        {
            baseConfidence += 0.2m;
        }

        return details.EvidenceType switch
        {
            CveEvidenceType.Commit => baseConfidence + 0.1m,
            CveEvidenceType.Advisory => baseConfidence + 0.05m,
            CveEvidenceType.Changelog => baseConfidence + 0.05m,
            _ => baseConfidence
        };
    }

    #endregion
}

/// <summary>
/// Interface for CVE-to-function mapping updates.
/// </summary>
public interface ICveFunctionMappingUpdater
{
    /// <summary>
    /// Update function mappings for a specific CVE.
    /// </summary>
    Task<CveMappingUpdateResult> UpdateMappingsForCveAsync(
        string cveId,
        CancellationToken ct = default);

    /// <summary>
    /// Update all CVE mappings for a library.
    /// </summary>
    Task<CveBatchMappingResult> UpdateMappingsForLibraryAsync(
        string libraryName,
        CancellationToken ct = default);

    /// <summary>
    /// Mark functions as fixed for a CVE.
    /// </summary>
    Task<CveMappingUpdateResult> MarkFunctionFixedAsync(
        string cveId,
        string libraryName,
        string version,
        string? functionName,
        string? patchCommit,
        CancellationToken ct = default);

    /// <summary>
    /// Get CVEs that have no function mappings.
    /// </summary>
    Task<ImmutableArray<string>> GetUnmappedCvesAsync(
        string libraryName,
        CancellationToken ct = default);
}

/// <summary>
/// Provider for CVE data.
/// </summary>
public interface ICveDataProvider
{
    /// <summary>
    /// Get details for a CVE.
    /// </summary>
    Task<CveDetails?> GetCveDetailsAsync(string cveId, CancellationToken ct = default);

    /// <summary>
    /// Get all CVEs affecting a library.
    /// </summary>
    Task<ImmutableArray<string>> GetCvesForLibraryAsync(string libraryName, CancellationToken ct = default);
}

/// <summary>
/// CVE details from a data provider.
/// </summary>
public sealed record CveDetails(
    string CveId,
    string AffectedLibrary,
    ImmutableArray<string> AffectedVersions,
    ImmutableArray<string> FixedVersions,
    ImmutableArray<string> AffectedFunctions,
    string? PatchCommit,
    CveEvidenceType EvidenceType);

/// <summary>
/// Result of a CVE mapping update.
/// </summary>
public sealed record CveMappingUpdateResult(
    string CveId,
    int FunctionsUpdated,
    TimeSpan Duration,
    ImmutableArray<string> Errors);

/// <summary>
/// Result of batch CVE mapping update.
/// </summary>
public sealed record CveBatchMappingResult(
    string LibraryName,
    int CvesProcessed,
    int TotalFunctionsUpdated,
    TimeSpan Duration,
    ImmutableArray<string> Errors);
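// Illustrative sketch (not part of this commit): a fixed-data ICveDataProvider
// that could serve as a test double. The CVE id, library name, versions, and
// function name below are made up.
internal sealed class StaticCveDataProvider : ICveDataProvider
{
    public Task<CveDetails?> GetCveDetailsAsync(string cveId, CancellationToken ct = default)
    {
        CveDetails? details = cveId == "CVE-2099-0001"
            ? new CveDetails(
                CveId: "CVE-2099-0001",
                AffectedLibrary: "zlib",
                AffectedVersions: ["1.2.11"],
                FixedVersions: ["1.2.12"],
                AffectedFunctions: ["inflateGetHeader"],
                PatchCommit: null,
                EvidenceType: CveEvidenceType.Advisory)
            : null;
        return Task.FromResult(details);
    }

    public Task<ImmutableArray<string>> GetCvesForLibraryAsync(string libraryName, CancellationToken ct = default)
    {
        var cves = libraryName == "zlib"
            ? ImmutableArray.Create("CVE-2099-0001")
            : ImmutableArray<string>.Empty;
        return Task.FromResult(cves);
    }
}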
@@ -0,0 +1,531 @@
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;

namespace StellaOps.BinaryIndex.Corpus.Services;

/// <summary>
/// Service for clustering semantically similar functions across library versions.
/// Groups functions by their canonical name and computes similarity to cluster centroid.
/// </summary>
public sealed partial class FunctionClusteringService : IFunctionClusteringService
{
    private readonly ICorpusRepository _repository;
    private readonly IClusterSimilarityComputer _similarityComputer;
    private readonly ILogger<FunctionClusteringService> _logger;

    public FunctionClusteringService(
        ICorpusRepository repository,
        IClusterSimilarityComputer similarityComputer,
        ILogger<FunctionClusteringService> logger)
    {
        _repository = repository;
        _similarityComputer = similarityComputer;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<ClusteringResult> ClusterFunctionsAsync(
        Guid libraryId,
        ClusteringOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new ClusteringOptions();
        var startTime = DateTime.UtcNow;

        _logger.LogInformation(
            "Starting function clustering for library {LibraryId}",
            libraryId);

        // Get all functions with fingerprints for this library
        var functionsWithFingerprints = await GetFunctionsWithFingerprintsAsync(libraryId, ct);

        if (functionsWithFingerprints.Count == 0)
        {
            _logger.LogWarning("No functions with fingerprints found for library {LibraryId}", libraryId);
            return new ClusteringResult(
                libraryId,
                0,
                0,
                TimeSpan.Zero,
                [],
                []);
        }

        _logger.LogInformation(
            "Found {Count} functions with fingerprints",
            functionsWithFingerprints.Count);

        // Group functions by canonical name
        var groupedByName = functionsWithFingerprints
            .GroupBy(f => NormalizeCanonicalName(f.Function.DemangledName ?? f.Function.Name))
            .Where(g => !string.IsNullOrWhiteSpace(g.Key))
            .ToList();

        _logger.LogInformation(
            "Grouped into {Count} canonical function names",
            groupedByName.Count);

        var clustersCreated = 0;
        var membersAssigned = 0;
        var errors = new List<string>();
        var warnings = new List<string>();

        foreach (var group in groupedByName)
        {
            ct.ThrowIfCancellationRequested();

            try
            {
                var result = await ProcessFunctionGroupAsync(
                    libraryId,
                    group.Key,
                    group.ToList(),
                    opts,
                    ct);

                clustersCreated++;
                membersAssigned += result.MembersAdded;

                if (result.Warnings.Length > 0)
                {
                    warnings.AddRange(result.Warnings);
                }
            }
            catch (Exception ex)
            {
                errors.Add($"Failed to cluster '{group.Key}': {ex.Message}");
                _logger.LogError(ex, "Error clustering function group {Name}", group.Key);
            }
        }

        var duration = DateTime.UtcNow - startTime;

        _logger.LogInformation(
            "Clustering completed: {Clusters} clusters, {Members} members in {Duration:c}",
            clustersCreated,
            membersAssigned,
            duration);

        return new ClusteringResult(
            libraryId,
            clustersCreated,
            membersAssigned,
            duration,
            [.. errors],
            [.. warnings]);
    }

    /// <inheritdoc />
    public async Task<ClusteringResult> ReclusterAsync(
        Guid clusterId,
        ClusteringOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new ClusteringOptions();
        var startTime = DateTime.UtcNow;

        // Get existing cluster
        var cluster = await _repository.GetClusterAsync(clusterId, ct);
        if (cluster is null)
        {
            return new ClusteringResult(
                Guid.Empty,
                0,
                0,
                TimeSpan.Zero,
                ["Cluster not found"],
                []);
        }

        // Get current members
        var members = await _repository.GetClusterMembersAsync(clusterId, ct);
        if (members.Length == 0)
        {
            return new ClusteringResult(
                cluster.LibraryId,
                0,
                0,
                TimeSpan.Zero,
                [],
                ["Cluster has no members"]);
        }

        // Get functions with fingerprints
        var functionsWithFingerprints = new List<FunctionWithFingerprint>();
        foreach (var member in members)
        {
            var function = await _repository.GetFunctionAsync(member.FunctionId, ct);
            if (function is null)
            {
                continue;
            }

            var fingerprints = await _repository.GetFingerprintsForFunctionAsync(function.Id, ct);
            var semanticFp = fingerprints.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);

            if (semanticFp is not null)
            {
                functionsWithFingerprints.Add(new FunctionWithFingerprint(function, semanticFp));
            }
        }

        // Clear existing members
        await _repository.ClearClusterMembersAsync(clusterId, ct);

        // Recompute similarities
        var centroid = ComputeCentroid(functionsWithFingerprints, opts);
        var membersAdded = 0;

        foreach (var fwf in functionsWithFingerprints)
        {
            var similarity = await _similarityComputer.ComputeSimilarityAsync(
                fwf.Fingerprint.Fingerprint,
                centroid,
                ct);

            if (similarity >= opts.MinimumSimilarity)
            {
                await _repository.AddClusterMemberAsync(
                    new ClusterMember(clusterId, fwf.Function.Id, similarity),
                    ct);
                membersAdded++;
            }
        }

        var duration = DateTime.UtcNow - startTime;

        return new ClusteringResult(
            cluster.LibraryId,
            1,
            membersAdded,
            duration,
            [],
            []);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
        Guid libraryId,
        CancellationToken ct = default)
    {
        return await _repository.GetClustersForLibraryAsync(libraryId, ct);
    }

    /// <inheritdoc />
    public async Task<ClusterDetails?> GetClusterDetailsAsync(
        Guid clusterId,
        CancellationToken ct = default)
    {
        var cluster = await _repository.GetClusterAsync(clusterId, ct);
        if (cluster is null)
        {
            return null;
        }

        var members = await _repository.GetClusterMembersAsync(clusterId, ct);
        var functionDetails = new List<ClusterMemberDetails>();

        foreach (var member in members)
        {
            var function = await _repository.GetFunctionAsync(member.FunctionId, ct);
            if (function is null)
            {
                continue;
            }

            var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
            LibraryVersion? version = null;
            if (variant is not null)
            {
                version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
            }

            functionDetails.Add(new ClusterMemberDetails(
                member.FunctionId,
                function.Name,
                function.DemangledName,
                version?.Version ?? "unknown",
                variant?.Architecture ?? "unknown",
                member.SimilarityToCentroid ?? 0m));
        }

        return new ClusterDetails(
            cluster.Id,
            cluster.LibraryId,
            cluster.CanonicalName,
            cluster.Description,
            [.. functionDetails]);
    }

    #region Private Methods

    private async Task<List<FunctionWithFingerprint>> GetFunctionsWithFingerprintsAsync(
        Guid libraryId,
        CancellationToken ct)
    {
        var result = new List<FunctionWithFingerprint>();

        // Get all versions for the library
        var library = await _repository.GetLibraryByIdAsync(libraryId, ct);
        if (library is null)
        {
            return result;
        }

        var versions = await _repository.ListVersionsAsync(library.Name, ct);

        foreach (var version in versions)
        {
            var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

            foreach (var variant in variants)
            {
                var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);

                foreach (var function in functions)
                {
                    var fingerprints = await _repository.GetFingerprintsForFunctionAsync(function.Id, ct);
                    var semanticFp = fingerprints.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);

                    if (semanticFp is not null)
                    {
                        result.Add(new FunctionWithFingerprint(function, semanticFp));
                    }
                }
            }
        }

        return result;
    }

    private async Task<GroupClusteringResult> ProcessFunctionGroupAsync(
        Guid libraryId,
        string canonicalName,
        List<FunctionWithFingerprint> functions,
        ClusteringOptions options,
        CancellationToken ct)
    {
        // Ensure cluster exists
        var existingClusters = await _repository.GetClustersForLibraryAsync(libraryId, ct);
        var cluster = existingClusters.FirstOrDefault(c =>
            string.Equals(c.CanonicalName, canonicalName, StringComparison.OrdinalIgnoreCase));

        Guid clusterId;
        if (cluster is null)
        {
            // Create new cluster
            var newCluster = new FunctionCluster(
                Guid.NewGuid(),
                libraryId,
                canonicalName,
                $"Cluster for function '{canonicalName}'",
                DateTimeOffset.UtcNow);

            await _repository.InsertClusterAsync(newCluster, ct);
            clusterId = newCluster.Id;
        }
        else
        {
            clusterId = cluster.Id;
            // Clear existing members for recomputation
            await _repository.ClearClusterMembersAsync(clusterId, ct);
        }

        // Compute centroid fingerprint
        var centroid = ComputeCentroid(functions, options);

        var membersAdded = 0;
        var warnings = new List<string>();

        foreach (var fwf in functions)
        {
            var similarity = await _similarityComputer.ComputeSimilarityAsync(
                fwf.Fingerprint.Fingerprint,
                centroid,
                ct);

            if (similarity >= options.MinimumSimilarity)
            {
                await _repository.AddClusterMemberAsync(
                    new ClusterMember(clusterId, fwf.Function.Id, similarity),
                    ct);
                membersAdded++;
            }
            else
            {
                warnings.Add($"Function {fwf.Function.Name} excluded: similarity {similarity:F4} < threshold {options.MinimumSimilarity:F4}");
            }
        }

        return new GroupClusteringResult(membersAdded, [.. warnings]);
    }

    private static byte[] ComputeCentroid(
        List<FunctionWithFingerprint> functions,
        ClusteringOptions options)
    {
        if (functions.Count == 0)
        {
            return [];
        }

        if (functions.Count == 1)
        {
            return functions[0].Fingerprint.Fingerprint;
        }

        // Use most common fingerprint as centroid (mode-based approach)
        // This is more robust than averaging for discrete hash-based fingerprints
        var fingerprintCounts = functions
            .GroupBy(f => Convert.ToHexStringLower(f.Fingerprint.Fingerprint))
            .OrderByDescending(g => g.Count())
            .ToList();

        var mostCommon = fingerprintCounts.First();
        return functions
            .First(f => Convert.ToHexStringLower(f.Fingerprint.Fingerprint) == mostCommon.Key)
            .Fingerprint.Fingerprint;
    }

    /// <summary>
    /// Normalizes a function name to its canonical form for clustering.
    /// </summary>
    private static string NormalizeCanonicalName(string name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return string.Empty;
        }

        // Remove GLIBC version annotations (e.g., memcpy@GLIBC_2.14 -> memcpy)
        var normalized = GlibcVersionPattern().Replace(name, "");

        // Remove trailing @@ symbols
        normalized = normalized.TrimEnd('@');

        // Remove common symbol prefixes
        if (normalized.StartsWith("__"))
        {
            normalized = normalized[2..];
        }

        // Remove _internal suffixes
        normalized = InternalSuffixPattern().Replace(normalized, "");

        // Trim whitespace
        normalized = normalized.Trim();

        return normalized;
    }

    [GeneratedRegex(@"@GLIBC_[\d.]+", RegexOptions.Compiled)]
    private static partial Regex GlibcVersionPattern();

    [GeneratedRegex(@"_internal$", RegexOptions.Compiled | RegexOptions.IgnoreCase)]
    private static partial Regex InternalSuffixPattern();

    #endregion

    private sealed record FunctionWithFingerprint(CorpusFunction Function, CorpusFingerprint Fingerprint);
    private sealed record GroupClusteringResult(int MembersAdded, ImmutableArray<string> Warnings);
}

/// <summary>
/// Interface for function clustering.
/// </summary>
public interface IFunctionClusteringService
{
    /// <summary>
    /// Cluster all functions for a library.
    /// </summary>
    Task<ClusteringResult> ClusterFunctionsAsync(
        Guid libraryId,
        ClusteringOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Recompute a specific cluster.
    /// </summary>
    Task<ClusteringResult> ReclusterAsync(
        Guid clusterId,
        ClusteringOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get all clusters for a library.
    /// </summary>
    Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
        Guid libraryId,
        CancellationToken ct = default);

    /// <summary>
    /// Get detailed information about a cluster.
    /// </summary>
    Task<ClusterDetails?> GetClusterDetailsAsync(
        Guid clusterId,
        CancellationToken ct = default);
}

/// <summary>
/// Options for function clustering.
/// </summary>
public sealed record ClusteringOptions
{
    /// <summary>
    /// Minimum similarity threshold to include a function in a cluster.
    /// </summary>
    public decimal MinimumSimilarity { get; init; } = 0.7m;

    /// <summary>
    /// Algorithm to use for clustering.
    /// </summary>
    public FingerprintAlgorithm Algorithm { get; init; } = FingerprintAlgorithm.SemanticKsg;
}

/// <summary>
/// Result of clustering operation.
/// </summary>
public sealed record ClusteringResult(
    Guid LibraryId,
    int ClustersCreated,
    int MembersAssigned,
    TimeSpan Duration,
    ImmutableArray<string> Errors,
    ImmutableArray<string> Warnings);

/// <summary>
/// Detailed cluster information.
/// </summary>
public sealed record ClusterDetails(
    Guid ClusterId,
    Guid LibraryId,
    string CanonicalName,
    string? Description,
    ImmutableArray<ClusterMemberDetails> Members);

/// <summary>
/// Details about a cluster member.
/// </summary>
public sealed record ClusterMemberDetails(
    Guid FunctionId,
    string FunctionName,
    string? DemangledName,
    string Version,
    string Architecture,
    decimal SimilarityToCentroid);

/// <summary>
/// Interface for computing similarity between fingerprints.
/// </summary>
public interface IClusterSimilarityComputer
{
    /// <summary>
    /// Compute similarity between two fingerprints.
    /// </summary>
    Task<decimal> ComputeSimilarityAsync(
        byte[] fingerprint1,
        byte[] fingerprint2,
        CancellationToken ct = default);
}
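// Illustrative sketch (not part of this commit): one plausible implementation of
// IClusterSimilarityComputer using byte-wise Hamming similarity. Real semantic
// fingerprints may call for an algorithm-specific metric instead.
internal sealed class HammingSimilarityComputer : IClusterSimilarityComputer
{
    public Task<decimal> ComputeSimilarityAsync(
        byte[] fingerprint1,
        byte[] fingerprint2,
        CancellationToken ct = default)
    {
        if (fingerprint1.Length == 0 || fingerprint1.Length != fingerprint2.Length)
        {
            // Treat incomparable fingerprints as completely dissimilar.
            return Task.FromResult(0m);
        }

        var matchingBits = 0;
        var totalBits = fingerprint1.Length * 8;
        for (var i = 0; i < fingerprint1.Length; i++)
        {
            // Count the bit positions where the two fingerprints agree.
            matchingBits += 8 - System.Numerics.BitOperations.PopCount(
                (uint)(fingerprint1[i] ^ fingerprint2[i]));
        }

        return Task.FromResult((decimal)matchingBits / totalBits);
    }
}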
@@ -10,6 +10,7 @@

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Http" />
  </ItemGroup>

  <ItemGroup>

@@ -0,0 +1,392 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Decompiler;

/// <summary>
/// Engine for comparing AST structures using tree edit distance and semantic analysis.
/// </summary>
public sealed class AstComparisonEngine : IAstComparisonEngine
{
    /// <inheritdoc />
    public decimal ComputeStructuralSimilarity(DecompiledAst a, DecompiledAst b)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        // Use normalized tree edit distance
        var editDistance = ComputeEditDistance(a, b);
        return 1.0m - editDistance.NormalizedDistance;
    }

    /// <inheritdoc />
    public AstEditDistance ComputeEditDistance(DecompiledAst a, DecompiledAst b)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        // Simplified Zhang-Shasha tree edit distance
        var operations = ComputeTreeEditOperations(a.Root, b.Root);

        var totalNodes = Math.Max(a.NodeCount, b.NodeCount);
        var normalized = totalNodes > 0
            ? (decimal)operations.TotalOperations / totalNodes
            : 0m;

        return new AstEditDistance(
            operations.Insertions,
            operations.Deletions,
            operations.Modifications,
            operations.TotalOperations,
            Math.Clamp(normalized, 0m, 1m));
    }

    /// <inheritdoc />
    public ImmutableArray<SemanticEquivalence> FindEquivalences(DecompiledAst a, DecompiledAst b)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        var equivalences = new List<SemanticEquivalence>();

        // Find equivalent subtrees
        var nodesA = CollectNodes(a.Root).ToList();
        var nodesB = CollectNodes(b.Root).ToList();

        foreach (var nodeA in nodesA)
        {
            foreach (var nodeB in nodesB)
            {
                var equivalence = CheckEquivalence(nodeA, nodeB);
                if (equivalence is not null)
                {
                    equivalences.Add(equivalence);
                }
            }
        }

        // Remove redundant equivalences (child nodes when parent is equivalent)
        return [.. FilterRedundantEquivalences(equivalences)];
    }

    /// <inheritdoc />
    public ImmutableArray<CodeDifference> FindDifferences(DecompiledAst a, DecompiledAst b)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        var differences = new List<CodeDifference>();

        // Compare root structures
        CompareNodes(a.Root, b.Root, differences);

        return [.. differences];
    }

    private static EditOperations ComputeTreeEditOperations(AstNode a, AstNode b)
    {
        // Simplified tree comparison
        if (a.Type != b.Type)
        {
            return new EditOperations(0, 0, 1, 1);
        }

        var childrenA = a.Children;
        var childrenB = b.Children;

        var insertions = 0;
        var deletions = 0;
        var modifications = 0;

        // Compare children positionally; extra children count as insertions/deletions
        var minLen = Math.Min(childrenA.Length, childrenB.Length);

        insertions = childrenB.Length - minLen;
        deletions = childrenA.Length - minLen;

        for (var i = 0; i < minLen; i++)
        {
            var childOps = ComputeTreeEditOperations(childrenA[i], childrenB[i]);
            insertions += childOps.Insertions;
            deletions += childOps.Deletions;
            modifications += childOps.Modifications;
        }

        return new EditOperations(insertions, deletions, modifications, insertions + deletions + modifications);
    }

    private static SemanticEquivalence? CheckEquivalence(AstNode a, AstNode b)
    {
        // Check for optimization variants first: these may legitimately differ in
        // node type (e.g., an unrolled loop vs. a block), so this must run before
        // the same-type short-circuit below.
        if (AreOptimizationVariants(a, b))
        {
            return new SemanticEquivalence(a, b, EquivalenceType.Optimized, 0.85m, "Optimization variant");
        }

        // Otherwise, only nodes of the same type can be equivalent
        if (a.Type != b.Type)
        {
            return null;
        }

        // Check for identical
        if (AreNodesIdentical(a, b))
        {
            return new SemanticEquivalence(a, b, EquivalenceType.Identical, 1.0m, "Identical nodes");
        }

        // Check for renamed (same structure, different names)
        if (AreNodesRenamed(a, b))
        {
            return new SemanticEquivalence(a, b, EquivalenceType.Renamed, 0.95m, "Same structure with renamed identifiers");
        }

        return null;
    }

    private static bool AreNodesIdentical(AstNode a, AstNode b)
    {
        if (a.Type != b.Type || a.Children.Length != b.Children.Length)
        {
            return false;
        }

        // Check node-specific equality
        if (a is ConstantNode constA && b is ConstantNode constB)
        {
            return constA.Value?.ToString() == constB.Value?.ToString();
        }

        if (a is VariableNode varA && b is VariableNode varB)
        {
            return varA.Name == varB.Name;
        }

        if (a is BinaryOpNode binA && b is BinaryOpNode binB)
        {
            if (binA.Operator != binB.Operator)
            {
                return false;
            }
        }

        if (a is CallNode callA && b is CallNode callB)
        {
            if (callA.FunctionName != callB.FunctionName)
            {
                return false;
            }
        }

        // Check children recursively
        for (var i = 0; i < a.Children.Length; i++)
        {
            if (!AreNodesIdentical(a.Children[i], b.Children[i]))
            {
                return false;
            }
        }

        return true;
    }

    private static bool AreNodesRenamed(AstNode a, AstNode b)
    {
        if (a.Type != b.Type || a.Children.Length != b.Children.Length)
        {
            return false;
        }

        // Same structure but variable/parameter names differ
        if (a is VariableNode && b is VariableNode)
        {
            return true; // Different name but same position = renamed
        }

        // Check children have same structure
        for (var i = 0; i < a.Children.Length; i++)
        {
            if (!AreNodesRenamed(a.Children[i], b.Children[i]) &&
                !AreNodesIdentical(a.Children[i], b.Children[i]))
            {
                return false;
            }
        }

        return true;
    }

    private static bool AreOptimizationVariants(AstNode a, AstNode b)
    {
        // Detect common optimization patterns

        // Loop unrolling: for loop vs repeated statements
        if (a.Type == AstNodeType.For && b.Type == AstNodeType.Block)
        {
            return true; // Might be unrolled
        }

        // Strength reduction: multiplication/division replaced by bit shifts
        if (a is BinaryOpNode binA && b is BinaryOpNode binB)
        {
            if ((binA.Operator == "*" && binB.Operator == "<<") ||
                (binA.Operator == "/" && binB.Operator == ">>"))
            {
                return true;
            }
        }

        // Inline expansion
        if (a.Type == AstNodeType.Call && b.Type == AstNodeType.Block)
        {
            return true; // Might be inlined
        }

        return false;
    }

    private static void CompareNodes(AstNode a, AstNode b, List<CodeDifference> differences)
    {
        if (a.Type != b.Type)
        {
            differences.Add(new CodeDifference(
                DifferenceType.Modified,
                a,
                b,
                $"Node type changed: {a.Type} -> {b.Type}"));
            return;
        }

        // Compare specific node types
        switch (a)
        {
            case VariableNode varA when b is VariableNode varB:
                if (varA.Name != varB.Name)
                {
                    differences.Add(new CodeDifference(
                        DifferenceType.Modified,
                        a,
                        b,
                        $"Variable renamed: {varA.Name} -> {varB.Name}"));
                }
                break;

            case ConstantNode constA when b is ConstantNode constB:
                if (constA.Value?.ToString() != constB.Value?.ToString())
                {
                    differences.Add(new CodeDifference(
                        DifferenceType.Modified,
                        a,
                        b,
                        $"Constant changed: {constA.Value} -> {constB.Value}"));
                }
                break;

            case BinaryOpNode binA when b is BinaryOpNode binB:
                if (binA.Operator != binB.Operator)
                {
                    differences.Add(new CodeDifference(
                        DifferenceType.Modified,
                        a,
                        b,
                        $"Operator changed: {binA.Operator} -> {binB.Operator}"));
                }
                break;

            case CallNode callA when b is CallNode callB:
                if (callA.FunctionName != callB.FunctionName)
                {
                    differences.Add(new CodeDifference(
                        DifferenceType.Modified,
                        a,
                        b,
                        $"Function call changed: {callA.FunctionName} -> {callB.FunctionName}"));
                }
                break;
        }

        // Compare children
        var minChildren = Math.Min(a.Children.Length, b.Children.Length);

        for (var i = 0; i < minChildren; i++)
        {
            CompareNodes(a.Children[i], b.Children[i], differences);
        }

        // Handle added/removed children
        for (var i = minChildren; i < a.Children.Length; i++)
        {
            differences.Add(new CodeDifference(
                DifferenceType.Removed,
                a.Children[i],
                null,
                $"Node removed: {a.Children[i].Type}"));
        }

        for (var i = minChildren; i < b.Children.Length; i++)
        {
            differences.Add(new CodeDifference(
                DifferenceType.Added,
                null,
                b.Children[i],
                $"Node added: {b.Children[i].Type}"));
        }
    }

    private static IEnumerable<AstNode> CollectNodes(AstNode root)
    {
        yield return root;
        foreach (var child in root.Children)
        {
            foreach (var node in CollectNodes(child))
            {
                yield return node;
            }
        }
    }

    private static IEnumerable<SemanticEquivalence> FilterRedundantEquivalences(
        List<SemanticEquivalence> equivalences)
    {
        // Keep only top-level equivalences
        var result = new List<SemanticEquivalence>();

        foreach (var eq in equivalences)
        {
            var isRedundant = equivalences.Any(other =>
                other != eq &&
                IsAncestor(other.NodeA, eq.NodeA) &&
                IsAncestor(other.NodeB, eq.NodeB));

            if (!isRedundant)
            {
                result.Add(eq);
            }
        }

        return result;
    }

    private static bool IsAncestor(AstNode potential, AstNode node)
    {
        if (potential == node)
        {
            return false;
        }

        foreach (var child in potential.Children)
        {
            if (child == node || IsAncestor(child, node))
            {
                return true;
            }
        }

        return false;
    }

    private readonly record struct EditOperations(int Insertions, int Deletions, int Modifications, int TotalOperations);
}
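// Worked example (illustrative): comparing ASTs of 10 and 12 nodes where the
// traversal above reports 3 total edit operations gives a normalized distance
// of 3 / max(10, 12) = 0.25, so ComputeStructuralSimilarity returns 0.75.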
@@ -0,0 +1,534 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Normalizes decompiled code for comparison by removing superficial differences.
|
||||
/// </summary>
|
||||
public sealed partial class CodeNormalizer : ICodeNormalizer
{
    private static readonly ImmutableHashSet<string> CKeywords = ImmutableHashSet.Create(
        "auto", "break", "case", "char", "const", "continue", "default", "do",
        "double", "else", "enum", "extern", "float", "for", "goto", "if",
        "int", "long", "register", "return", "short", "signed", "sizeof", "static",
        "struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while",
        // Common Ghidra types
        "undefined", "undefined1", "undefined2", "undefined4", "undefined8",
        "byte", "word", "dword", "qword", "bool", "uchar", "ushort", "uint", "ulong",
        "int8_t", "int16_t", "int32_t", "int64_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t",
        "size_t", "ssize_t", "ptrdiff_t", "intptr_t", "uintptr_t",
        // Common literals to preserve
        "NULL", "true", "false"
    );

    /// <inheritdoc />
    public string Normalize(string code, NormalizationOptions? options = null)
    {
        ArgumentException.ThrowIfNullOrEmpty(code);

        options ??= NormalizationOptions.Default;

        var normalized = code;

        // 1. Remove comments
        normalized = RemoveComments(normalized);

        // 2. Normalize variable names
        if (options.NormalizeVariables)
        {
            normalized = NormalizeVariableNames(normalized, options.KnownFunctions);
        }

        // 3. Normalize function calls
        if (options.NormalizeFunctionCalls)
        {
            normalized = NormalizeFunctionCalls(normalized, options.KnownFunctions);
        }

        // 4. Normalize constants
        if (options.NormalizeConstants)
        {
            normalized = NormalizeConstants(normalized);
        }

        // 5. Normalize whitespace
        if (options.NormalizeWhitespace)
        {
            normalized = NormalizeWhitespace(normalized);
        }

        // 6. Sort independent statements (within blocks)
        if (options.SortIndependentStatements)
        {
            normalized = SortIndependentStatements(normalized);
        }

        return normalized;
    }

    /// <inheritdoc />
    public byte[] ComputeCanonicalHash(string code)
    {
        ArgumentException.ThrowIfNullOrEmpty(code);

        // Normalize with full normalization for hashing
        var normalized = Normalize(code, new NormalizationOptions
        {
            NormalizeVariables = true,
            NormalizeFunctionCalls = true,
            NormalizeConstants = false, // Keep constants for semantic identity
            NormalizeWhitespace = true,
            SortIndependentStatements = true
        });

        return SHA256.HashData(Encoding.UTF8.GetBytes(normalized));
    }

    /// <inheritdoc />
    public DecompiledAst NormalizeAst(DecompiledAst ast, NormalizationOptions? options = null)
    {
        ArgumentNullException.ThrowIfNull(ast);

        options ??= NormalizationOptions.Default;

        var varIndex = 0;
        var varMap = new Dictionary<string, string>();

        var normalizedRoot = NormalizeNode(ast.Root, options, varMap, ref varIndex);

        return new DecompiledAst(
            normalizedRoot,
            ast.NodeCount,
            ast.Depth,
            ast.Patterns);
    }

    private static AstNode NormalizeNode(
        AstNode node,
        NormalizationOptions options,
        Dictionary<string, string> varMap,
        ref int varIndex)
    {
        return node switch
        {
            VariableNode varNode when options.NormalizeVariables =>
                NormalizeVariableNode(varNode, varMap, ref varIndex),

            CallNode callNode when options.NormalizeFunctionCalls =>
                NormalizeCallNode(callNode, options, varMap, ref varIndex),

            ConstantNode constNode when options.NormalizeConstants =>
                NormalizeConstantNode(constNode),

            _ => NormalizeChildren(node, options, varMap, ref varIndex)
        };
    }

    private static AstNode NormalizeVariableNode(
        VariableNode node,
        Dictionary<string, string> varMap,
        ref int varIndex)
    {
        if (IsKeywordOrType(node.Name))
        {
            return node;
        }

        if (!varMap.TryGetValue(node.Name, out var canonical))
        {
            canonical = $"var_{varIndex++}";
            varMap[node.Name] = canonical;
        }

        return node with { Name = canonical };
    }

    private static AstNode NormalizeCallNode(
        CallNode node,
        NormalizationOptions options,
        Dictionary<string, string> varMap,
        ref int varIndex)
    {
        var funcName = node.FunctionName;

        // Preserve known functions; rename others with a deterministic hash so the
        // canonical form is stable across runs (string.GetHashCode is randomized).
        if (options.KnownFunctions?.Contains(funcName) != true &&
            !IsStandardLibraryFunction(funcName))
        {
            funcName = $"func_{StableHash(funcName):X8}";
        }

        var normalizedArgs = new List<AstNode>(node.Arguments.Length);
        foreach (var arg in node.Arguments)
        {
            normalizedArgs.Add(NormalizeNode(arg, options, varMap, ref varIndex));
        }

        return new CallNode(funcName, [.. normalizedArgs], node.Location);
    }

    private static AstNode NormalizeConstantNode(ConstantNode node)
    {
        // Normalize numeric constants to canonical form
        if (node.Value is long or int or short or byte)
        {
            return node with { Value = "CONST_INT" };
        }

        if (node.Value is double or float or decimal)
        {
            return node with { Value = "CONST_FLOAT" };
        }

        if (node.Value is string)
        {
            return node with { Value = "CONST_STR" };
        }

        return node;
    }

    private static AstNode NormalizeChildren(
        AstNode node,
        NormalizationOptions options,
        Dictionary<string, string> varMap,
        ref int varIndex)
    {
        if (node.Children.Length == 0)
        {
            return node;
        }

        var normalizedChildren = new List<AstNode>(node.Children.Length);
        foreach (var child in node.Children)
        {
            normalizedChildren.Add(NormalizeNode(child, options, varMap, ref varIndex));
        }

        var normalizedArray = normalizedChildren.ToImmutableArray();

        // Use reflection-free approach for common node types
        return node switch
        {
            BlockNode block => block with { Statements = normalizedArray },
            IfNode ifNode => CreateNormalizedIf(ifNode, normalizedArray),
            WhileNode whileNode => CreateNormalizedWhile(whileNode, normalizedArray),
            ForNode forNode => CreateNormalizedFor(forNode, normalizedArray),
            ReturnNode returnNode when normalizedArray.Length > 0 =>
                returnNode with { Value = normalizedArray[0] },
            AssignmentNode assignment => CreateNormalizedAssignment(assignment, normalizedArray),
            BinaryOpNode binOp => CreateNormalizedBinaryOp(binOp, normalizedArray),
            UnaryOpNode unaryOp when normalizedArray.Length > 0 =>
                unaryOp with { Operand = normalizedArray[0] },
            _ => node // Return as-is for other node types
        };
    }

    private static IfNode CreateNormalizedIf(IfNode node, ImmutableArray<AstNode> children)
    {
        return new IfNode(
            children.Length > 0 ? children[0] : node.Condition,
            children.Length > 1 ? children[1] : node.ThenBranch,
            children.Length > 2 ? children[2] : node.ElseBranch,
            node.Location);
    }

    private static WhileNode CreateNormalizedWhile(WhileNode node, ImmutableArray<AstNode> children)
    {
        return new WhileNode(
            children.Length > 0 ? children[0] : node.Condition,
            children.Length > 1 ? children[1] : node.Body,
            node.Location);
    }

    private static ForNode CreateNormalizedFor(ForNode node, ImmutableArray<AstNode> children)
    {
        return new ForNode(
            children.Length > 0 ? children[0] : node.Init,
            children.Length > 1 ? children[1] : node.Condition,
            children.Length > 2 ? children[2] : node.Update,
            children.Length > 3 ? children[3] : node.Body,
            node.Location);
    }

    private static AssignmentNode CreateNormalizedAssignment(
        AssignmentNode node,
        ImmutableArray<AstNode> children)
    {
        return new AssignmentNode(
            children.Length > 0 ? children[0] : node.Target,
            children.Length > 1 ? children[1] : node.Value,
            node.Operator,
            node.Location);
    }

    private static BinaryOpNode CreateNormalizedBinaryOp(
        BinaryOpNode node,
        ImmutableArray<AstNode> children)
    {
        return new BinaryOpNode(
            children.Length > 0 ? children[0] : node.Left,
            children.Length > 1 ? children[1] : node.Right,
            node.Operator,
            node.Location);
    }

    private static string RemoveComments(string code)
    {
        // Remove single-line comments
        code = SingleLineCommentRegex().Replace(code, "");

        // Remove multi-line comments
        code = MultiLineCommentRegex().Replace(code, "");

        return code;
    }

    private static string NormalizeVariableNames(string code, ImmutableHashSet<string>? knownFunctions)
    {
        var varIndex = 0;
        var varMap = new Dictionary<string, string>();

        return IdentifierRegex().Replace(code, match =>
        {
            var name = match.Value;

            // Skip keywords and types
            if (IsKeywordOrType(name))
            {
                return name;
            }

            // Skip known functions
            if (knownFunctions?.Contains(name) == true)
            {
                return name;
            }

            // Skip standard library functions
            if (IsStandardLibraryFunction(name))
            {
                return name;
            }

            if (!varMap.TryGetValue(name, out var canonical))
            {
                canonical = $"var_{varIndex++}";
                varMap[name] = canonical;
            }

            return canonical;
        });
    }

    private static string NormalizeFunctionCalls(string code, ImmutableHashSet<string>? knownFunctions)
    {
        // Match function calls: identifier followed by (
        return FunctionCallRegex().Replace(code, match =>
        {
            var funcName = match.Groups[1].Value;

            // Skip known functions
            if (knownFunctions?.Contains(funcName) == true)
            {
                return match.Value;
            }

            // Skip standard library functions
            if (IsStandardLibraryFunction(funcName))
            {
                return match.Value;
            }

            return $"func_{StableHash(funcName):X8}(";
        });
    }

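    // Helper added for determinism: string.GetHashCode is randomized per process on
    // modern .NET, which would make the func_XXXXXXXX placeholders (and therefore
    // ComputeCanonicalHash) unstable across runs. FNV-1a is a small, stable substitute.
    private static uint StableHash(string value)
    {
        unchecked
        {
            var hash = 2166136261u;
            foreach (var ch in value)
            {
                hash = (hash ^ ch) * 16777619u;
            }

            return hash;
        }
    }
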
    private static string NormalizeConstants(string code)
    {
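        // Illustrative: "x = 0x1A2B + 123456;" becomes "x = CONST_HEX + CONST_INT;"
        // (decimal literals under four digits, like 0/1/2, are kept).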
        // Normalize hex constants
        code = HexConstantRegex().Replace(code, "CONST_HEX");

        // Normalize decimal constants (but preserve small common ones like 0, 1, 2)
        code = LargeDecimalRegex().Replace(code, "CONST_INT");

        // Normalize string literals
        code = StringLiteralRegex().Replace(code, "CONST_STR");

        return code;
    }

    private static string NormalizeWhitespace(string code)
    {
        // Collapse multiple whitespace to single space
        code = MultipleWhitespaceRegex().Replace(code, " ");

        // Remove whitespace around operators
        code = WhitespaceAroundOperatorsRegex().Replace(code, "$1");

        // Normalize line endings
        code = code.Replace("\r\n", "\n").Replace("\r", "\n");

        // Remove trailing whitespace on lines
        code = TrailingWhitespaceRegex().Replace(code, "\n");

        return code.Trim();
    }

    private static string SortIndependentStatements(string code)
    {
        // Parse into blocks and sort independent statements within each block
        // This is a simplified implementation that sorts top-level statements
        // A full implementation would need to analyze data dependencies

        var lines = code.Split('\n', StringSplitOptions.RemoveEmptyEntries);
        var result = new StringBuilder();

        var blockDepth = 0;
        var currentBlock = new List<string>();

        foreach (var line in lines)
        {
            var trimmed = line.Trim();

            // Track block depth
            blockDepth += trimmed.Count(c => c == '{');
            blockDepth -= trimmed.Count(c => c == '}');

            if (blockDepth == 1 && !trimmed.Contains('{') && !trimmed.Contains('}'))
            {
                // Simple statement at block level 1
                currentBlock.Add(trimmed);
            }
            else
            {
                // Flush sorted block
                if (currentBlock.Count > 0)
                {
                    var sorted = SortStatements(currentBlock);
                    foreach (var stmt in sorted)
                    {
                        result.AppendLine(stmt);
                    }
                    currentBlock.Clear();
                }

                result.AppendLine(line);
            }
        }

        // Flush remaining
        if (currentBlock.Count > 0)
        {
            var sorted = SortStatements(currentBlock);
            foreach (var stmt in sorted)
            {
                result.AppendLine(stmt);
            }
        }

        return result.ToString().Trim();
    }

    private static List<string> SortStatements(List<string> statements)
    {
        // Group statements that can be reordered
        // For now, just sort by canonical form (conservative)
        return statements
            .OrderBy(s => GetStatementSortKey(s), StringComparer.Ordinal)
            .ToList();
    }

    private static string GetStatementSortKey(string statement)
    {
        // Extract the "essence" of the statement for sorting
        // e.g., assignment target, function call name
        var trimmed = statement.Trim();

        // Assignment: sort by target
        var assignMatch = AssignmentTargetRegex().Match(trimmed);
        if (assignMatch.Success)
        {
            return $"A_{assignMatch.Groups[1].Value}";
        }

        // Function call: sort by function name
        var callMatch = FunctionNameRegex().Match(trimmed);
        if (callMatch.Success)
        {
            return $"C_{callMatch.Groups[1].Value}";
        }

        return $"Z_{trimmed}";
    }

    private static bool IsKeywordOrType(string name)
    {
        return CKeywords.Contains(name);
    }

    private static bool IsStandardLibraryFunction(string name)
    {
        // Common C standard library functions to preserve
        return name switch
        {
            // Memory
            "malloc" or "calloc" or "realloc" or "free" or "memcpy" or "memmove" or "memset" or "memcmp" => true,
            // String
            "strlen" or "strcpy" or "strncpy" or "strcat" or "strncat" or "strcmp" or "strncmp" or "strchr" or "strrchr" or "strstr" => true,
            // I/O
            "printf" or "fprintf" or "sprintf" or "snprintf" or "scanf" or "fscanf" or "sscanf" => true,
            "fopen" or "fclose" or "fread" or "fwrite" or "fseek" or "ftell" or "fflush" => true,
            "puts" or "fputs" or "gets" or "fgets" or "putchar" or "getchar" => true,
            // Math
            "abs" or "labs" or "llabs" or "fabs" or "sqrt" or "pow" or "sin" or "cos" or "tan" or "log" or "exp" => true,
            // Other
            "exit" or "abort" or "atexit" or "atoi" or "atol" or "atof" or "strtol" or "strtoul" or "strtod" => true,
            "assert" or "errno" => true,
            _ => false
        };
    }

    // Regex patterns using source generators
    [GeneratedRegex(@"//[^\n]*")]
    private static partial Regex SingleLineCommentRegex();

    [GeneratedRegex(@"/\*[\s\S]*?\*/")]
    private static partial Regex MultiLineCommentRegex();

    [GeneratedRegex(@"\b([a-zA-Z_][a-zA-Z0-9_]*)\b")]
    private static partial Regex IdentifierRegex();

    [GeneratedRegex(@"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")]
    private static partial Regex FunctionCallRegex();

    [GeneratedRegex(@"0[xX][0-9a-fA-F]+")]
    private static partial Regex HexConstantRegex();

    [GeneratedRegex(@"\b[0-9]{4,}\b")]
    private static partial Regex LargeDecimalRegex();

    [GeneratedRegex(@"""(?:[^""\\]|\\.)*""")]
    private static partial Regex StringLiteralRegex();

    [GeneratedRegex(@"[ \t]+")]
    private static partial Regex MultipleWhitespaceRegex();

    [GeneratedRegex(@"\s*([+\-*/%=<>!&|^~?:;,{}()\[\]])\s*")]
    private static partial Regex WhitespaceAroundOperatorsRegex();

    [GeneratedRegex(@"[ \t]+\n")]
    private static partial Regex TrailingWhitespaceRegex();

    [GeneratedRegex(@"^([a-zA-Z_][a-zA-Z0-9_]*)\s*=")]
    private static partial Regex AssignmentTargetRegex();

    [GeneratedRegex(@"^([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")]
    private static partial Regex FunctionNameRegex();
}
@@ -0,0 +1,950 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using System.Text.RegularExpressions;

namespace StellaOps.BinaryIndex.Decompiler;

/// <summary>
/// Parser for Ghidra's decompiled C-like pseudo-code.
/// </summary>
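/// <remarks>
/// Illustrative use (the listing is a hypothetical Ghidra-style snippet):
/// <code>
/// var parser = new DecompiledCodeParser();
/// var ast = parser.Parse("int FUN_00401000(int param_1) { return param_1 + 1; }");
/// var locals = parser.ExtractVariables("undefined8 local_10;");
/// </code>
/// </remarks>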
public sealed partial class DecompiledCodeParser : IDecompiledCodeParser
{
    private static readonly HashSet<string> s_keywords =
    [
        "if", "else", "while", "for", "do", "switch", "case", "default",
        "return", "break", "continue", "goto", "sizeof", "typedef",
        "struct", "union", "enum", "void", "int", "char", "short", "long",
        "float", "double", "unsigned", "signed", "const", "static", "extern"
    ];

    private static readonly HashSet<string> s_types =
    [
        "void", "int", "uint", "char", "uchar", "byte", "ubyte",
        "short", "ushort", "long", "ulong", "longlong", "ulonglong",
        "float", "double", "bool", "undefined", "undefined1", "undefined2",
        "undefined4", "undefined8", "pointer", "code", "dword", "qword", "word"
    ];

    /// <inheritdoc />
    public DecompiledAst Parse(string code)
    {
        ArgumentException.ThrowIfNullOrEmpty(code);

        var tokens = Tokenize(code);
        var parser = new RecursiveParser(tokens);
        var root = parser.ParseFunction();

        var nodeCount = CountNodes(root);
        var depth = ComputeDepth(root);
        var patterns = ExtractPatterns(root);

        return new DecompiledAst(root, nodeCount, depth, patterns);
    }

    /// <inheritdoc />
    public ImmutableArray<LocalVariable> ExtractVariables(string code)
    {
        var variables = new List<LocalVariable>();

        // Match variable declarations: type name [= value];
        // Ghidra style: int local_10; or undefined8 param_1;
        var declPattern = VariableDeclarationRegex();

        foreach (Match match in declPattern.Matches(code))
        {
            var type = match.Groups["type"].Value;
            var name = match.Groups["name"].Value;

            var isParam = name.StartsWith("param_", StringComparison.Ordinal);
            int? paramIndex = null;
            int stackOffset = 0;

            if (isParam && int.TryParse(name.AsSpan(6), out var idx))
            {
                paramIndex = idx;
            }

            if (name.StartsWith("local_", StringComparison.Ordinal) &&
                int.TryParse(name.AsSpan(6), System.Globalization.NumberStyles.HexNumber, null, out var offset))
            {
                stackOffset = -offset; // Negative for locals
            }

            variables.Add(new LocalVariable(name, type, stackOffset, isParam, paramIndex));
        }

        return [.. variables];
    }

    /// <inheritdoc />
    public ImmutableArray<string> ExtractCalledFunctions(string code)
    {
        var functions = new HashSet<string>();

        // Match function calls: name(...)
        var callPattern = FunctionCallRegex();

        foreach (Match match in callPattern.Matches(code))
        {
            var name = match.Groups["name"].Value;

            // Skip keywords and types
            if (!s_keywords.Contains(name) && !s_types.Contains(name))
            {
                functions.Add(name);
            }
        }

        return [.. functions.Order()];
    }

    private static List<Token> Tokenize(string code)
    {
        var tokens = new List<Token>();
        var i = 0;
        var line = 1;
        var column = 1;

        while (i < code.Length)
        {
            var c = code[i];

            // Skip whitespace
            if (char.IsWhiteSpace(c))
            {
                if (c == '\n')
                {
                    line++;
                    column = 1;
                }
                else
                {
                    column++;
                }
                i++;
                continue;
            }

            // Skip comments
            if (i + 1 < code.Length && code[i] == '/' && code[i + 1] == '/')
            {
                while (i < code.Length && code[i] != '\n')
                {
                    i++;
                }
                continue;
            }

            if (i + 1 < code.Length && code[i] == '/' && code[i + 1] == '*')
            {
                i += 2;
                while (i + 1 < code.Length && !(code[i] == '*' && code[i + 1] == '/'))
                {
                    if (code[i] == '\n')
                    {
                        line++;
                        column = 1;
                    }
                    i++;
                }
                i += 2;
                continue;
            }

            var startColumn = column;

            // Identifiers and keywords
            if (char.IsLetter(c) || c == '_')
            {
                var start = i;
                while (i < code.Length && (char.IsLetterOrDigit(code[i]) || code[i] == '_'))
                {
                    i++;
                    column++;
                }
                var value = code[start..i];
                var type = s_keywords.Contains(value) ? TokenType.Keyword : TokenType.Identifier;
                tokens.Add(new Token(type, value, line, startColumn));
                continue;
            }

            // Numbers (the hex prefix starts with '0', so IsDigit covers it)
            if (char.IsDigit(c))
            {
                var start = i;
                if (c == '0' && i + 1 < code.Length && code[i + 1] == 'x')
                {
                    i += 2;
                    column += 2;
                    while (i < code.Length && char.IsAsciiHexDigit(code[i]))
                    {
                        i++;
                        column++;
                    }
                }
                else
                {
                    while (i < code.Length && (char.IsDigit(code[i]) || code[i] == '.'))
                    {
                        i++;
                        column++;
                    }
                }
                // Handle suffixes (U, L, UL, etc.)
                while (i < code.Length && (code[i] == 'U' || code[i] == 'L' || code[i] == 'u' || code[i] == 'l'))
                {
                    i++;
                    column++;
                }
                tokens.Add(new Token(TokenType.Number, code[start..i], line, startColumn));
                continue;
            }

            // String literals
            if (c == '"')
            {
                var start = i;
                i++;
                column++;
                while (i < code.Length && code[i] != '"')
                {
                    if (code[i] == '\\' && i + 1 < code.Length)
                    {
                        i += 2;
                        column += 2;
                    }
                    else
                    {
                        i++;
                        column++;
                    }
                }
                if (i < code.Length)
                {
                    i++; // closing quote (guarded against unterminated literals)
                    column++;
                }
                tokens.Add(new Token(TokenType.String, code[start..i], line, startColumn));
                continue;
            }

            // Character literals
            if (c == '\'')
            {
                var start = i;
                i++;
                column++;
                while (i < code.Length && code[i] != '\'')
                {
                    if (code[i] == '\\' && i + 1 < code.Length)
                    {
                        i += 2;
                        column += 2;
                    }
                    else
                    {
                        i++;
                        column++;
                    }
                }
                if (i < code.Length)
                {
                    i++; // closing quote (guarded against unterminated literals)
                    column++;
                }
                tokens.Add(new Token(TokenType.Char, code[start..i], line, startColumn));
                continue;
            }

            // Multi-character operators
            if (i + 1 < code.Length)
            {
                var twoChar = code.Substring(i, 2);
                if (twoChar is "==" or "!=" or "<=" or ">=" or "&&" or "||" or
                    "++" or "--" or "+=" or "-=" or "*=" or "/=" or
                    "<<" or ">>" or "->" or "::")
                {
                    tokens.Add(new Token(TokenType.Operator, twoChar, line, startColumn));
                    i += 2;
                    column += 2;
                    continue;
                }
            }

            // Single character operators and punctuation
            var tokenType = c switch
            {
                '(' or ')' or '{' or '}' or '[' or ']' => TokenType.Bracket,
                ';' or ',' or ':' or '?' => TokenType.Punctuation,
                _ => TokenType.Operator
            };
            tokens.Add(new Token(tokenType, c.ToString(), line, startColumn));
            i++;
            column++;
        }

        return tokens;
    }

    private static int CountNodes(AstNode node)
    {
        var count = 1;
        foreach (var child in node.Children)
        {
            count += CountNodes(child);
        }
        return count;
    }

    private static int ComputeDepth(AstNode node)
    {
        if (node.Children.Length == 0)
        {
            return 1;
        }
        return 1 + node.Children.Max(c => ComputeDepth(c));
    }

    private static ImmutableArray<AstPattern> ExtractPatterns(AstNode root)
    {
        var patterns = new List<AstPattern>();

        foreach (var node in TraverseNodes(root))
        {
            // Detect loop patterns
            if (node.Type == AstNodeType.For)
            {
                patterns.Add(new AstPattern(
                    PatternType.CountedLoop,
                    node,
                    new PatternMetadata("For loop", 0.9m, null)));
            }
            else if (node.Type == AstNodeType.While)
            {
                patterns.Add(new AstPattern(
                    PatternType.ConditionalLoop,
                    node,
                    new PatternMetadata("While loop", 0.9m, null)));
            }
            else if (node.Type == AstNodeType.DoWhile)
            {
                patterns.Add(new AstPattern(
                    PatternType.ConditionalLoop,
                    node,
                    new PatternMetadata("Do-while loop", 0.9m, null)));
            }

            // Detect error handling
            if (node is IfNode ifNode && IsErrorCheck(ifNode))
            {
                patterns.Add(new AstPattern(
                    PatternType.ErrorCheck,
                    node,
                    new PatternMetadata("Error check", 0.8m, null)));
            }

            // Detect null checks
            if (node is IfNode ifNull && IsNullCheck(ifNull))
            {
                patterns.Add(new AstPattern(
                    PatternType.NullCheck,
                    node,
                    new PatternMetadata("Null check", 0.9m, null)));
            }
        }

        return [.. patterns];
    }

    private static IEnumerable<AstNode> TraverseNodes(AstNode root)
    {
        yield return root;
        foreach (var child in root.Children)
        {
            foreach (var node in TraverseNodes(child))
            {
                yield return node;
            }
        }
    }

    private static bool IsErrorCheck(IfNode node)
    {
        // Check if condition compares against -1, 0, or NULL
        if (node.Condition is BinaryOpNode binaryOp)
        {
            if (binaryOp.Right is ConstantNode constant)
            {
                var value = constant.Value?.ToString();
                return value is "0" or "-1" or "0xffffffff" or "NULL";
            }
        }
        return false;
    }

    private static bool IsNullCheck(IfNode node)
    {
        if (node.Condition is BinaryOpNode binaryOp)
        {
            if (binaryOp.Operator is "==" or "!=")
            {
                if (binaryOp.Right is ConstantNode constant)
                {
                    var value = constant.Value?.ToString();
                    return value is "0" or "NULL" or "nullptr";
                }
            }
        }
        return false;
    }

    [GeneratedRegex(@"(?<type>\w+)\s+(?<name>\w+)\s*(?:=|;)")]
    private static partial Regex VariableDeclarationRegex();

    [GeneratedRegex(@"(?<name>\w+)\s*\(")]
    private static partial Regex FunctionCallRegex();
}

internal enum TokenType
{
    Identifier,
    Keyword,
    Number,
    String,
    Char,
    Operator,
    Bracket,
    Punctuation
}

internal readonly record struct Token(TokenType Type, string Value, int Line, int Column);

internal sealed class RecursiveParser
{
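    // Hand-rolled recursive-descent parser. ParseExpression walks a C-style precedence
    // ladder (assignment -> || -> && -> a single combined |/^/& level -> comparison ->
    // shift -> additive -> multiplicative -> unary -> postfix -> primary); the combined
    // bitwise level is deliberately coarser than real C precedence.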
    private readonly List<Token> _tokens;
    private int _pos;

    public RecursiveParser(List<Token> tokens)
    {
        _tokens = tokens;
        _pos = 0;
    }

    public AstNode ParseFunction()
    {
        // Parse return type
        var returnType = ParseType();

        // Parse function name
        var name = Expect(TokenType.Identifier).Value;

        // Parse parameters
        Expect(TokenType.Bracket, "(");
        var parameters = ParseParameterList();
        Expect(TokenType.Bracket, ")");

        // Parse body
        var body = ParseBlock();

        return new FunctionNode(name, returnType, parameters, body);
    }

    private string ParseType()
    {
        var type = new System.Text.StringBuilder();

        // Handle modifiers
        while (Peek().Value is "const" or "unsigned" or "signed" or "static" or "extern")
        {
            type.Append(Advance().Value);
            type.Append(' ');
        }

        // Main type
        type.Append(Advance().Value);

        // Handle pointers
        while (Peek().Value == "*")
        {
            type.Append(Advance().Value);
        }

        return type.ToString().Trim();
    }

    private ImmutableArray<ParameterNode> ParseParameterList()
    {
        var parameters = new List<ParameterNode>();
        var index = 0;

        if (Peek().Value == ")")
        {
            return [];
        }

        if (Peek().Value == "void" && PeekAhead(1).Value == ")")
        {
            Advance(); // consume void
            return [];
        }

        do
        {
            if (Peek().Value == ",")
            {
                Advance();
            }

            var type = ParseType();
            var name = Peek().Type == TokenType.Identifier ? Advance().Value : $"param_{index}";

            parameters.Add(new ParameterNode(name, type, index));
            index++;
        }
        while (Peek().Value == ",");

        return [.. parameters];
    }

    private BlockNode ParseBlock()
    {
        Expect(TokenType.Bracket, "{");

        var statements = new List<AstNode>();

        // Stop at the closing brace, but also bail out at end of input so a
        // truncated listing cannot loop forever on the empty sentinel token.
        while (_pos < _tokens.Count && Peek().Value != "}")
        {
            var stmt = ParseStatement();
            if (stmt is not null)
            {
                statements.Add(stmt);
            }
        }

        Expect(TokenType.Bracket, "}");

        return new BlockNode([.. statements]);
    }

    private AstNode? ParseStatement()
    {
        var token = Peek();

        return token.Value switch
        {
            "if" => ParseIf(),
            "while" => ParseWhile(),
            "for" => ParseFor(),
            "do" => ParseDoWhile(),
            "return" => ParseReturn(),
            "break" => ParseBreak(),
            "continue" => ParseContinue(),
            "{" => ParseBlock(),
            ";" => SkipSemicolon(),
            _ => ParseExpressionStatement()
        };
    }

    private IfNode ParseIf()
    {
        Advance(); // consume 'if'
        Expect(TokenType.Bracket, "(");
        var condition = ParseExpression();
        Expect(TokenType.Bracket, ")");

        var thenBranch = ParseStatement() ?? new BlockNode([]);

        AstNode? elseBranch = null;
        if (Peek().Value == "else")
        {
            Advance();
            elseBranch = ParseStatement();
        }

        return new IfNode(condition, thenBranch, elseBranch);
    }

    private WhileNode ParseWhile()
    {
        Advance(); // consume 'while'
        Expect(TokenType.Bracket, "(");
        var condition = ParseExpression();
        Expect(TokenType.Bracket, ")");

        var body = ParseStatement() ?? new BlockNode([]);

        return new WhileNode(condition, body);
    }

    private ForNode ParseFor()
    {
        Advance(); // consume 'for'
        Expect(TokenType.Bracket, "(");

        AstNode? init = null;
        if (Peek().Value != ";")
        {
            init = ParseExpression();
        }
        Expect(TokenType.Punctuation, ";");

        AstNode? condition = null;
        if (Peek().Value != ";")
        {
            condition = ParseExpression();
        }
        Expect(TokenType.Punctuation, ";");

        AstNode? update = null;
        if (Peek().Value != ")")
        {
            update = ParseExpression();
        }
        Expect(TokenType.Bracket, ")");

        var body = ParseStatement() ?? new BlockNode([]);

        return new ForNode(init, condition, update, body);
    }

    private AstNode ParseDoWhile()
    {
        Advance(); // consume 'do'
        var body = ParseStatement() ?? new BlockNode([]);

        Expect(TokenType.Keyword, "while");
        Expect(TokenType.Bracket, "(");
        var condition = ParseExpression();
        Expect(TokenType.Bracket, ")");
        Expect(TokenType.Punctuation, ";");

        return new WhileNode(condition, body); // Simplify do-while to while for now
    }

    private ReturnNode ParseReturn()
    {
        Advance(); // consume 'return'

        AstNode? value = null;
        if (Peek().Value != ";")
        {
            value = ParseExpression();
        }
        Expect(TokenType.Punctuation, ";");

        return new ReturnNode(value);
    }

    private AstNode ParseBreak()
    {
        Advance();
        Expect(TokenType.Punctuation, ";");
        return new BlockNode([]); // Simplified
    }

    private AstNode ParseContinue()
    {
        Advance();
        Expect(TokenType.Punctuation, ";");
        return new BlockNode([]); // Simplified
    }

    private AstNode? SkipSemicolon()
    {
        Advance();
        return null;
    }

    private AstNode? ParseExpressionStatement()
    {
        var expr = ParseExpression();
        if (Peek().Value == ";")
        {
            Advance();
        }
        return expr;
    }

    private AstNode ParseExpression()
    {
        return ParseAssignment();
    }

    private AstNode ParseAssignment()
    {
        var left = ParseLogicalOr();

        if (Peek().Value is "=" or "+=" or "-=" or "*=" or "/=" or "&=" or "|=" or "^=" or "<<=" or ">>=")
        {
            var op = Advance().Value;
            var right = ParseAssignment();
            return new AssignmentNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseLogicalOr()
    {
        var left = ParseLogicalAnd();

        while (Peek().Value == "||")
        {
            var op = Advance().Value;
            var right = ParseLogicalAnd();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseLogicalAnd()
    {
        var left = ParseBitwiseOr();

        while (Peek().Value == "&&")
        {
            var op = Advance().Value;
            var right = ParseBitwiseOr();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseBitwiseOr()
    {
        var left = ParseComparison();

        while (Peek().Value is "|" or "^" or "&")
        {
            var op = Advance().Value;
            var right = ParseComparison();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseComparison()
    {
        var left = ParseShift();

        while (Peek().Value is "==" or "!=" or "<" or ">" or "<=" or ">=")
        {
            var op = Advance().Value;
            var right = ParseShift();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseShift()
    {
        var left = ParseAdditive();

        while (Peek().Value is "<<" or ">>")
        {
            var op = Advance().Value;
            var right = ParseAdditive();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseAdditive()
    {
        var left = ParseMultiplicative();

        while (Peek().Value is "+" or "-")
        {
            var op = Advance().Value;
            var right = ParseMultiplicative();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseMultiplicative()
    {
        var left = ParseUnary();

        while (Peek().Value is "*" or "/" or "%")
        {
            var op = Advance().Value;
            var right = ParseUnary();
            left = new BinaryOpNode(left, right, op);
        }

        return left;
    }

    private AstNode ParseUnary()
    {
        if (Peek().Value is "!" or "~" or "-" or "+" or "*" or "&" or "++" or "--")
        {
            var op = Advance().Value;
            var operand = ParseUnary();
            return new UnaryOpNode(operand, op, true);
        }

        return ParsePostfix();
    }

    private AstNode ParsePostfix()
    {
        var expr = ParsePrimary();

        while (true)
        {
            if (Peek().Value == "(")
            {
                // Function call
                Advance();
                var args = ParseArgumentList();
                Expect(TokenType.Bracket, ")");

                if (expr is VariableNode varNode)
                {
                    expr = new CallNode(varNode.Name, args);
                }
            }
            else if (Peek().Value == "[")
            {
                // Array access
                Advance();
                var index = ParseExpression();
                Expect(TokenType.Bracket, "]");
                expr = new ArrayAccessNode(expr, index);
            }
            else if (Peek().Value is "." or "->")
            {
                var isPointer = Advance().Value == "->";
                var field = Expect(TokenType.Identifier).Value;
                expr = new FieldAccessNode(expr, field, isPointer);
            }
            else if (Peek().Value is "++" or "--")
            {
                var op = Advance().Value;
                expr = new UnaryOpNode(expr, op, false);
            }
            else
            {
                break;
            }
        }

        return expr;
    }

    private ImmutableArray<AstNode> ParseArgumentList()
    {
        var args = new List<AstNode>();

        if (Peek().Value == ")")
        {
            return [];
        }

        do
        {
            if (Peek().Value == ",")
            {
                Advance();
            }
            args.Add(ParseExpression());
        }
        while (Peek().Value == ",");

        return [.. args];
    }

    private AstNode ParsePrimary()
    {
        var token = Peek();

        if (token.Type == TokenType.Number)
        {
            Advance();
            return new ConstantNode(token.Value, "int");
        }

        if (token.Type == TokenType.String)
        {
            Advance();
            return new ConstantNode(token.Value, "char*");
        }

        if (token.Type == TokenType.Char)
        {
            Advance();
            return new ConstantNode(token.Value, "char");
        }

        if (token.Type == TokenType.Identifier)
        {
            Advance();
            return new VariableNode(token.Value, null);
        }

        if (token.Value == "(")
        {
            Advance();

            // Check for cast
            if (IsType(Peek().Value))
            {
                var targetType = ParseType();
                Expect(TokenType.Bracket, ")");
                var expr = ParseUnary();
                return new CastNode(expr, targetType);
            }

            var inner = ParseExpression();
            Expect(TokenType.Bracket, ")");
            return inner;
        }

        // Handle sizeof
        if (token.Value == "sizeof")
        {
            Advance();
            Expect(TokenType.Bracket, "(");
            var type = ParseType();
            Expect(TokenType.Bracket, ")");
            return new ConstantNode($"sizeof({type})", "size_t");
        }

        // Unknown token - return empty node
        Advance();
        return new ConstantNode(token.Value, "unknown");
    }

    private static bool IsType(string value)
    {
        return value is "int" or "char" or "void" or "long" or "short" or "float" or "double"
            or "unsigned" or "signed" or "const" or "struct" or "union" or "enum"
            or "undefined" or "undefined1" or "undefined2" or "undefined4" or "undefined8"
            or "byte" or "word" or "dword" or "qword" or "pointer" or "code" or "uint" or "ulong";
    }

    private Token Peek() => _pos < _tokens.Count ? _tokens[_pos] : new Token(TokenType.Punctuation, "", 0, 0);

    private Token PeekAhead(int offset) => _pos + offset < _tokens.Count
        ? _tokens[_pos + offset]
        : new Token(TokenType.Punctuation, "", 0, 0);

    private Token Advance() => _pos < _tokens.Count ? _tokens[_pos++] : new Token(TokenType.Punctuation, "", 0, 0);

    private Token Expect(TokenType type, string? value = null)
    {
        var token = Peek();
        if (token.Type != type || (value is not null && token.Value != value))
        {
            // Lenient by design: consume the mismatched token anyway so malformed
            // listings degrade the AST instead of throwing
            return Advance();
        }
        return Advance();
    }
}
@@ -0,0 +1,53 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using Microsoft.Extensions.DependencyInjection;

namespace StellaOps.BinaryIndex.Decompiler;

/// <summary>
/// Extension methods for registering decompiler services.
/// </summary>
public static class DecompilerServiceCollectionExtensions
{
    /// <summary>
    /// Adds decompiler services to the service collection.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
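    /// <example>
    /// Illustrative wiring:
    /// <code>
    /// services.AddDecompilerServices(o => o.DefaultTimeout = TimeSpan.FromSeconds(60));
    /// </code>
    /// </example>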
    public static IServiceCollection AddDecompilerServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Register parser
        services.AddSingleton<IDecompiledCodeParser, DecompiledCodeParser>();

        // Register comparison engine
        services.AddSingleton<IAstComparisonEngine, AstComparisonEngine>();

        // Register normalizer
        services.AddSingleton<ICodeNormalizer, CodeNormalizer>();

        // Register decompiler service
        services.AddScoped<IDecompilerService, GhidraDecompilerAdapter>();

        return services;
    }

    /// <summary>
    /// Adds decompiler services with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure decompiler options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddDecompilerServices(
        this IServiceCollection services,
        Action<DecompilerOptions> configureOptions)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configureOptions);

        services.Configure(configureOptions);
        return services.AddDecompilerServices();
    }
}
@@ -0,0 +1,291 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.Ghidra;

namespace StellaOps.BinaryIndex.Decompiler;

/// <summary>
/// Adapter for Ghidra's decompiler via headless analysis.
/// </summary>
public sealed class GhidraDecompilerAdapter : IDecompilerService
{
    private readonly IGhidraService _ghidraService;
    private readonly IDecompiledCodeParser _parser;
    private readonly IAstComparisonEngine _comparisonEngine;
    private readonly DecompilerOptions _options;
    private readonly ILogger<GhidraDecompilerAdapter> _logger;

    public GhidraDecompilerAdapter(
        IGhidraService ghidraService,
        IDecompiledCodeParser parser,
        IAstComparisonEngine comparisonEngine,
        IOptions<DecompilerOptions> options,
        ILogger<GhidraDecompilerAdapter> logger)
    {
        _ghidraService = ghidraService;
        _parser = parser;
        _comparisonEngine = comparisonEngine;
        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    public Task<DecompiledFunction> DecompileAsync(
        GhidraFunction function,
        DecompileOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(function);

        options ??= new DecompileOptions();

        _logger.LogDebug(
            "Decompiling function {Name} at 0x{Address:X}",
            function.Name,
            function.Address);

        // The GhidraFunction should already have decompiled code from analysis
        var code = function.DecompiledCode;

        if (string.IsNullOrEmpty(code))
        {
            _logger.LogWarning(
                "Function {Name} has no decompiled code, returning stub",
                function.Name);

            return Task.FromResult(new DecompiledFunction(
                function.Name,
                BuildSignature(function),
                "/* Decompilation unavailable */",
                null,
                [],
                [],
                function.Address,
                function.Size));
        }

        // Truncate if too long
        if (code.Length > options.MaxCodeLength)
        {
            code = code[..options.MaxCodeLength] + "\n/* ... truncated ... */";
        }

        // Parse to AST
        DecompiledAst? ast = null;
        try
        {
            ast = _parser.Parse(code);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to parse decompiled code for {Name}", function.Name);
        }

        // Extract metadata
        var locals = _parser.ExtractVariables(code);
        var calledFunctions = _parser.ExtractCalledFunctions(code);

        return Task.FromResult(new DecompiledFunction(
            function.Name,
            BuildSignature(function),
            code,
            ast,
            locals,
            calledFunctions,
            function.Address,
            function.Size));
    }

    /// <inheritdoc />
    public async Task<DecompiledFunction> DecompileAtAddressAsync(
        string binaryPath,
        ulong address,
        DecompileOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(binaryPath);

        options ??= new DecompileOptions();

        _logger.LogDebug(
            "Decompiling function at 0x{Address:X} in {Binary}",
            address,
            Path.GetFileName(binaryPath));

        // Use Ghidra to analyze and get the function
        using var stream = File.OpenRead(binaryPath);
        var analysis = await _ghidraService.AnalyzeAsync(
            stream,
            new GhidraAnalysisOptions
            {
                IncludeDecompilation = true,
                ExtractDecompilation = true
            },
            ct);

        var function = analysis.Functions.FirstOrDefault(f => f.Address == address);

        if (function is null)
        {
            throw new InvalidOperationException($"No function found at address 0x{address:X}");
        }

        return await DecompileAsync(function, options, ct);
    }

    /// <inheritdoc />
    public Task<DecompiledAst> ParseToAstAsync(
        string decompiledCode,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(decompiledCode);

        ct.ThrowIfCancellationRequested();

        var ast = _parser.Parse(decompiledCode);
        return Task.FromResult(ast);
    }

    /// <inheritdoc />
    public Task<DecompiledComparisonResult> CompareAsync(
        DecompiledFunction a,
        DecompiledFunction b,
        ComparisonOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        options ??= new ComparisonOptions();
        ct.ThrowIfCancellationRequested();

        _logger.LogDebug(
            "Comparing functions {A} and {B}",
            a.FunctionName,
            b.FunctionName);

        // Need ASTs for comparison
        if (a.Ast is null || b.Ast is null)
        {
            _logger.LogWarning("Cannot compare functions without ASTs");

            return Task.FromResult(new DecompiledComparisonResult(
                Similarity: 0,
                StructuralSimilarity: 0,
                SemanticSimilarity: 0,
                EditDistance: new AstEditDistance(0, 0, 0, 0, 1.0m),
                Equivalences: [],
                Differences: [],
                Confidence: ComparisonConfidence.Low));
        }

        // Compute structural similarity
        var structuralSimilarity = _comparisonEngine.ComputeStructuralSimilarity(a.Ast, b.Ast);

        // Compute edit distance
        var editDistance = _comparisonEngine.ComputeEditDistance(a.Ast, b.Ast);

        // Find semantic equivalences
        var equivalences = _comparisonEngine.FindEquivalences(a.Ast, b.Ast);

        // Find differences
        var differences = _comparisonEngine.FindDifferences(a.Ast, b.Ast);

        // Compute semantic similarity from equivalences
        var totalNodes = Math.Max(a.Ast.NodeCount, b.Ast.NodeCount);
        var equivalentNodes = equivalences.Length;
        var semanticSimilarity = totalNodes > 0
            ? (decimal)equivalentNodes / totalNodes
            : 0m;

        // Combine into overall similarity
        var overallSimilarity = ComputeOverallSimilarity(
            structuralSimilarity,
            semanticSimilarity,
            editDistance.NormalizedDistance);

        // Determine confidence
        var confidence = DetermineConfidence(
            overallSimilarity,
            a.Ast.NodeCount,
            b.Ast.NodeCount,
            equivalences.Length);

        return Task.FromResult(new DecompiledComparisonResult(
            Similarity: overallSimilarity,
            StructuralSimilarity: structuralSimilarity,
            SemanticSimilarity: semanticSimilarity,
            EditDistance: editDistance,
            Equivalences: equivalences,
            Differences: differences,
            Confidence: confidence));
    }

    private static string BuildSignature(GhidraFunction function)
    {
        // Use the signature from Ghidra if available, otherwise construct a simple one
        if (!string.IsNullOrEmpty(function.Signature))
        {
            return function.Signature;
        }

        // Default signature if none available
        return $"void {function.Name}(void)";
    }

    private static decimal ComputeOverallSimilarity(
        decimal structural,
        decimal semantic,
        decimal normalizedEditDistance)
    {
        // Weight: 40% structural, 40% semantic, 20% edit distance (inverted)
        var editSimilarity = 1.0m - normalizedEditDistance;
        return structural * 0.4m + semantic * 0.4m + editSimilarity * 0.2m;
    }
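
    // Worked example for ComputeOverallSimilarity (illustrative numbers): structural
    // 0.80, semantic 0.60, normalized edit distance 0.30
    // -> 0.80 * 0.4 + 0.60 * 0.4 + (1 - 0.30) * 0.2 = 0.32 + 0.24 + 0.14 = 0.70.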

    private static ComparisonConfidence DetermineConfidence(
        decimal similarity,
        int nodeCountA,
        int nodeCountB,
        int equivalenceCount)
    {
        // Very small functions are harder to compare confidently
        var minNodes = Math.Min(nodeCountA, nodeCountB);
        if (minNodes < 5)
        {
            return ComparisonConfidence.Low;
        }

        // High similarity with many equivalences = high confidence
        if (similarity > 0.9m && equivalenceCount > minNodes * 0.7)
        {
            return ComparisonConfidence.VeryHigh;
        }

        if (similarity > 0.7m && equivalenceCount > minNodes * 0.5)
        {
            return ComparisonConfidence.High;
        }

        if (similarity > 0.5m)
        {
            return ComparisonConfidence.Medium;
        }

        return ComparisonConfidence.Low;
    }
}

/// <summary>
/// Options for the decompiler adapter.
/// </summary>
public sealed class DecompilerOptions
{
    public string GhidraScriptsPath { get; set; } = "/scripts";
    public TimeSpan DefaultTimeout { get; set; } = TimeSpan.FromSeconds(30);
    public int MaxCodeLength { get; set; } = 100_000;
}
@@ -0,0 +1,157 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using StellaOps.BinaryIndex.Ghidra;

namespace StellaOps.BinaryIndex.Decompiler;

/// <summary>
/// Service for decompiling binary functions to C-like pseudo-code.
/// </summary>
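/// <remarks>
/// Typical flow (a sketch; <c>decompiler</c> and the two GhidraFunction instances are
/// hypothetical):
/// <code>
/// var fnA = await decompiler.DecompileAsync(ghidraFunctionA);
/// var fnB = await decompiler.DecompileAsync(ghidraFunctionB);
/// var result = await decompiler.CompareAsync(fnA, fnB);
/// </code>
/// </remarks>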
public interface IDecompilerService
|
||||
{
|
||||
/// <summary>
|
||||
/// Decompile a function to C-like pseudo-code.
|
||||
/// </summary>
|
||||
/// <param name="function">Function from Ghidra analysis.</param>
|
||||
/// <param name="options">Decompilation options.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Decompiled function with code and optional AST.</returns>
|
||||
Task<DecompiledFunction> DecompileAsync(
|
||||
GhidraFunction function,
|
||||
DecompileOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
    /// Decompile a function by address.
    /// </summary>
    /// <param name="binaryPath">Path to the binary file.</param>
    /// <param name="address">Function address.</param>
    /// <param name="options">Decompilation options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Decompiled function.</returns>
    Task<DecompiledFunction> DecompileAtAddressAsync(
        string binaryPath,
        ulong address,
        DecompileOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Parse decompiled code into AST.
    /// </summary>
    /// <param name="decompiledCode">C-like pseudo-code from decompiler.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Abstract syntax tree representation.</returns>
    Task<DecompiledAst> ParseToAstAsync(
        string decompiledCode,
        CancellationToken ct = default);

    /// <summary>
    /// Compare two decompiled functions for semantic equivalence.
    /// </summary>
    /// <param name="a">First function.</param>
    /// <param name="b">Second function.</param>
    /// <param name="options">Comparison options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Comparison result with similarity metrics.</returns>
    Task<DecompiledComparisonResult> CompareAsync(
        DecompiledFunction a,
        DecompiledFunction b,
        ComparisonOptions? options = null,
        CancellationToken ct = default);
}

/// <summary>
/// Engine for comparing AST structures.
/// </summary>
public interface IAstComparisonEngine
{
    /// <summary>
    /// Compute structural similarity between ASTs.
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>Similarity score (0.0 to 1.0).</returns>
    decimal ComputeStructuralSimilarity(DecompiledAst a, DecompiledAst b);

    /// <summary>
    /// Compute edit distance between ASTs.
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>Edit distance metrics.</returns>
    AstEditDistance ComputeEditDistance(DecompiledAst a, DecompiledAst b);

    /// <summary>
    /// Find semantic equivalences between ASTs.
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>List of equivalent node pairs.</returns>
    ImmutableArray<SemanticEquivalence> FindEquivalences(DecompiledAst a, DecompiledAst b);

    /// <summary>
    /// Find differences between ASTs.
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>List of differences.</returns>
    ImmutableArray<CodeDifference> FindDifferences(DecompiledAst a, DecompiledAst b);
}

/// <summary>
/// Normalizes decompiled code for comparison.
/// </summary>
public interface ICodeNormalizer
{
    /// <summary>
    /// Normalize decompiled code for comparison.
    /// </summary>
    /// <param name="code">Raw decompiled code.</param>
    /// <param name="options">Normalization options.</param>
    /// <returns>Normalized code.</returns>
    string Normalize(string code, NormalizationOptions? options = null);

    /// <summary>
    /// Compute canonical hash of normalized code.
    /// </summary>
    /// <param name="code">Decompiled code.</param>
    /// <returns>32-byte hash.</returns>
    byte[] ComputeCanonicalHash(string code);

    /// <summary>
    /// Normalize an AST for comparison.
    /// </summary>
    /// <param name="ast">AST to normalize.</param>
    /// <param name="options">Normalization options.</param>
    /// <returns>Normalized AST.</returns>
    DecompiledAst NormalizeAst(DecompiledAst ast, NormalizationOptions? options = null);
}

/// <summary>
/// Parses decompiled C-like code into AST.
/// </summary>
public interface IDecompiledCodeParser
{
    /// <summary>
    /// Parse decompiled code into AST.
    /// </summary>
    /// <param name="code">C-like pseudo-code.</param>
    /// <returns>Parsed AST.</returns>
    DecompiledAst Parse(string code);

    /// <summary>
    /// Extract local variables from decompiled code.
    /// </summary>
    /// <param name="code">C-like pseudo-code.</param>
    /// <returns>List of local variables.</returns>
    ImmutableArray<LocalVariable> ExtractVariables(string code);

    /// <summary>
    /// Extract called functions from decompiled code.
    /// </summary>
    /// <param name="code">C-like pseudo-code.</param>
    /// <returns>List of function names called.</returns>
    ImmutableArray<string> ExtractCalledFunctions(string code);
}
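A minimal caller sketch against the ICodeNormalizer contract above; the helper name and inputs are illustrative, not part of this change:

// Illustrative helper: stable hash of a decompiled function body.
static byte[] HashDecompiledCode(ICodeNormalizer normalizer, string rawCode)
{
    // Per the contract above, the hash is computed over the canonical form,
    // so variable renames and whitespace differences should not affect it.
    return normalizer.ComputeCanonicalHash(rawCode);
}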
@@ -0,0 +1,377 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Decompiler;

/// <summary>
/// A function decompiled to C-like pseudo-code.
/// </summary>
public sealed record DecompiledFunction(
    string FunctionName,
    string Signature,
    string Code,
    DecompiledAst? Ast,
    ImmutableArray<LocalVariable> Locals,
    ImmutableArray<string> CalledFunctions,
    ulong Address,
    int SizeBytes);

/// <summary>
/// AST representation of decompiled code.
/// </summary>
public sealed record DecompiledAst(
    AstNode Root,
    int NodeCount,
    int Depth,
    ImmutableArray<AstPattern> Patterns);

/// <summary>
/// Abstract syntax tree node.
/// </summary>
public abstract record AstNode(
    AstNodeType Type,
    ImmutableArray<AstNode> Children,
    SourceLocation? Location);

/// <summary>
/// Types of AST nodes.
/// </summary>
public enum AstNodeType
{
    // Structure
    Function,
    Block,
    Parameter,

    // Control flow
    If,
    While,
    For,
    DoWhile,
    Switch,
    Case,
    Default,
    Return,
    Break,
    Continue,
    Goto,
    Label,

    // Expressions
    Assignment,
    BinaryOp,
    UnaryOp,
    TernaryOp,
    Call,
    Cast,
    Sizeof,

    // Operands
    Variable,
    Constant,
    StringLiteral,
    ArrayAccess,
    FieldAccess,
    PointerDeref,
    AddressOf,

    // Declarations
    VariableDecl,
    TypeDef
}

/// <summary>
/// Source location in decompiled code.
/// </summary>
public sealed record SourceLocation(int Line, int Column, int Length);

/// <summary>
/// A local variable in decompiled code.
/// </summary>
public sealed record LocalVariable(
    string Name,
    string Type,
    int StackOffset,
    bool IsParameter,
    int? ParameterIndex);

/// <summary>
/// A recognized code pattern.
/// </summary>
public sealed record AstPattern(
    PatternType Type,
    AstNode Node,
    PatternMetadata? Metadata);

/// <summary>
/// Types of code patterns.
/// </summary>
public enum PatternType
{
    // Loops
    CountedLoop,
    ConditionalLoop,
    InfiniteLoop,
    LoopUnrolled,

    // Branches
    IfElseChain,
    SwitchTable,
    ShortCircuit,

    // Memory
    MemoryAllocation,
    MemoryDeallocation,
    BufferOperation,
    StackBuffer,

    // Error handling
    ErrorCheck,
    NullCheck,
    BoundsCheck,

    // Idioms
    StringOperation,
    MathOperation,
    BitwiseOperation,
    TableLookup
}

/// <summary>
/// Metadata about a recognized pattern.
/// </summary>
public sealed record PatternMetadata(
    string Description,
    decimal Confidence,
    ImmutableDictionary<string, string>? Properties);

/// <summary>
/// Result of comparing two decompiled functions.
/// </summary>
public sealed record DecompiledComparisonResult(
    decimal Similarity,
    decimal StructuralSimilarity,
    decimal SemanticSimilarity,
    AstEditDistance EditDistance,
    ImmutableArray<SemanticEquivalence> Equivalences,
    ImmutableArray<CodeDifference> Differences,
    ComparisonConfidence Confidence);

/// <summary>
/// Edit distance between ASTs.
/// </summary>
public sealed record AstEditDistance(
    int Insertions,
    int Deletions,
    int Modifications,
    int TotalOperations,
    decimal NormalizedDistance);

/// <summary>
/// A semantic equivalence between AST nodes.
/// </summary>
public sealed record SemanticEquivalence(
    AstNode NodeA,
    AstNode NodeB,
    EquivalenceType Type,
    decimal Confidence,
    string? Explanation);

/// <summary>
/// Types of semantic equivalence.
/// </summary>
public enum EquivalenceType
{
    Identical,
    Renamed,
    Reordered,
    Optimized,
    Inlined,
    Semantically
}

/// <summary>
/// A difference between two pieces of code.
/// </summary>
public sealed record CodeDifference(
    DifferenceType Type,
    AstNode? NodeA,
    AstNode? NodeB,
    string Description);

/// <summary>
/// Types of code differences.
/// </summary>
public enum DifferenceType
{
    Added,
    Removed,
    Modified,
    Reordered,
    TypeChanged,
    OptimizationVariant
}

/// <summary>
/// Confidence level for comparison results.
/// </summary>
public enum ComparisonConfidence
{
    Low,
    Medium,
    High,
    VeryHigh
}

/// <summary>
/// Options for decompilation.
/// </summary>
public sealed record DecompileOptions
{
    public bool SimplifyCode { get; init; } = true;
    public bool RecoverTypes { get; init; } = true;
    public bool RecoverStructs { get; init; } = true;
    public int MaxCodeLength { get; init; } = 100_000;
    public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(30);
}

/// <summary>
/// Options for AST comparison.
/// </summary>
public sealed record ComparisonOptions
{
    public bool IgnoreVariableNames { get; init; } = true;
    public bool IgnoreConstants { get; init; } = false;
    public bool DetectOptimizations { get; init; } = true;
    public decimal MinSimilarityThreshold { get; init; } = 0.5m;
}

/// <summary>
/// Options for code normalization.
/// </summary>
public sealed record NormalizationOptions
{
    public bool NormalizeVariables { get; init; } = true;
    public bool NormalizeFunctionCalls { get; init; } = true;
    public bool NormalizeConstants { get; init; } = false;
    public bool NormalizeWhitespace { get; init; } = true;
    public bool SortIndependentStatements { get; init; } = false;
    public ImmutableHashSet<string>? KnownFunctions { get; init; }

    public static NormalizationOptions Default { get; } = new();
}
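// Illustrative pairing of the options above (values are hypothetical, not
// recommended defaults): stricter matching that also canonicalizes constants.
//
//     var comparison = new ComparisonOptions { IgnoreConstants = true, MinSimilarityThreshold = 0.7m };
//     var normalization = new NormalizationOptions { NormalizeConstants = true };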
#region Concrete AST Node Types

public sealed record FunctionNode(
    string Name,
    string ReturnType,
    ImmutableArray<ParameterNode> Parameters,
    BlockNode Body,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Function, [Body, .. Parameters], Location);

public sealed record ParameterNode(
    string Name,
    string DataType,
    int Index,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Parameter, [], Location);

public sealed record BlockNode(
    ImmutableArray<AstNode> Statements,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Block, Statements, Location);

public sealed record IfNode(
    AstNode Condition,
    AstNode ThenBranch,
    AstNode? ElseBranch,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.If, ElseBranch is null ? [Condition, ThenBranch] : [Condition, ThenBranch, ElseBranch], Location);

public sealed record WhileNode(
    AstNode Condition,
    AstNode Body,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.While, [Condition, Body], Location);

public sealed record ForNode(
    AstNode? Init,
    AstNode? Condition,
    AstNode? Update,
    AstNode Body,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.For, [Init ?? EmptyNode.Instance, Condition ?? EmptyNode.Instance, Update ?? EmptyNode.Instance, Body], Location);

public sealed record ReturnNode(
    AstNode? Value,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Return, Value is null ? [] : [Value], Location);

public sealed record AssignmentNode(
    AstNode Target,
    AstNode Value,
    string Operator,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Assignment, [Target, Value], Location);

public sealed record BinaryOpNode(
    AstNode Left,
    AstNode Right,
    string Operator,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.BinaryOp, [Left, Right], Location);

public sealed record UnaryOpNode(
    AstNode Operand,
    string Operator,
    bool IsPrefix,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.UnaryOp, [Operand], Location);

public sealed record CallNode(
    string FunctionName,
    ImmutableArray<AstNode> Arguments,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Call, Arguments, Location);

public sealed record VariableNode(
    string Name,
    string? DataType,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Variable, [], Location);

public sealed record ConstantNode(
    object Value,
    string DataType,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Constant, [], Location);

public sealed record ArrayAccessNode(
    AstNode Array,
    AstNode Index,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.ArrayAccess, [Array, Index], Location);

public sealed record FieldAccessNode(
    AstNode Object,
    string FieldName,
    bool IsPointer,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.FieldAccess, [Object], Location);

public sealed record CastNode(
    AstNode Expression,
    string TargetType,
    SourceLocation? Location = null)
    : AstNode(AstNodeType.Cast, [Expression], Location);

public sealed record EmptyNode() : AstNode(AstNodeType.Block, [], null)
{
    public static EmptyNode Instance { get; } = new();
}

#endregion
@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <Description>Decompiler integration for BinaryIndex semantic analysis. Provides AST-based comparison of decompiled code.</Description>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\StellaOps.BinaryIndex.Ghidra\StellaOps.BinaryIndex.Ghidra.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
  </ItemGroup>

</Project>
@@ -7,6 +7,7 @@ using System.Security.Cryptography;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Disassembly;
using StellaOps.BinaryIndex.Normalization;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.DeltaSig;

@@ -17,18 +18,49 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
{
    private readonly DisassemblyService _disassemblyService;
    private readonly NormalizationService _normalizationService;
    private readonly IIrLiftingService? _irLiftingService;
    private readonly ISemanticGraphExtractor? _graphExtractor;
    private readonly ISemanticFingerprintGenerator? _fingerprintGenerator;
    private readonly ILogger<DeltaSignatureGenerator> _logger;

    /// <summary>
    /// Creates a new delta signature generator without semantic analysis support.
    /// </summary>
    public DeltaSignatureGenerator(
        DisassemblyService disassemblyService,
        NormalizationService normalizationService,
        ILogger<DeltaSignatureGenerator> logger)
        : this(disassemblyService, normalizationService, null, null, null, logger)
    {
    }

    /// <summary>
    /// Creates a new delta signature generator with optional semantic analysis support.
    /// </summary>
    public DeltaSignatureGenerator(
        DisassemblyService disassemblyService,
        NormalizationService normalizationService,
        IIrLiftingService? irLiftingService,
        ISemanticGraphExtractor? graphExtractor,
        ISemanticFingerprintGenerator? fingerprintGenerator,
        ILogger<DeltaSignatureGenerator> logger)
    {
        _disassemblyService = disassemblyService ?? throw new ArgumentNullException(nameof(disassemblyService));
        _normalizationService = normalizationService ?? throw new ArgumentNullException(nameof(normalizationService));
        _irLiftingService = irLiftingService;
        _graphExtractor = graphExtractor;
        _fingerprintGenerator = fingerprintGenerator;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Gets a value indicating whether semantic analysis is available.
    /// </summary>
    public bool SemanticAnalysisAvailable =>
        _irLiftingService is not null &&
        _graphExtractor is not null &&
        _fingerprintGenerator is not null;

    /// <inheritdoc />
    public async Task<DeltaSignature> GenerateSignaturesAsync(
        Stream binaryStream,
@@ -94,11 +126,14 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
            }

            // Generate signature from normalized bytes
-           var signature = GenerateSymbolSignature(
+           var signature = await GenerateSymbolSignatureAsync(
                normalized,
                symbolName,
                symbolInfo.Section ?? ".text",
-               options);
+               instructions,
+               binary.Architecture,
+               options,
+               ct);

            symbolSignatures.Add(signature);

@@ -218,6 +253,136 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
        };
    }

    /// <inheritdoc />
    public async Task<SymbolSignature> GenerateSymbolSignatureAsync(
        NormalizedFunction normalized,
        string symbolName,
        string scope,
        IReadOnlyList<DisassembledInstruction> originalInstructions,
        CpuArchitecture architecture,
        SignatureOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(normalized);
        ArgumentNullException.ThrowIfNull(symbolName);
        ArgumentNullException.ThrowIfNull(scope);
        ArgumentNullException.ThrowIfNull(originalInstructions);

        options ??= new SignatureOptions();

        // Get normalized bytes for hashing
        var normalizedBytes = GetNormalizedBytes(normalized);

        // Compute the main hash
        var hashHex = ComputeHash(normalizedBytes, options.HashAlgorithm);

        // Compute chunk hashes for resilience
        ImmutableArray<ChunkHash>? chunks = null;
        if (options.IncludeChunks && normalizedBytes.Length >= options.ChunkSize)
        {
            chunks = ComputeChunkHashes(normalizedBytes, options.ChunkSize, options.HashAlgorithm);
        }

        // Compute CFG metrics using proper CFG analysis
        int? bbCount = null;
        string? cfgEdgeHash = null;
        if (options.IncludeCfg && normalized.Instructions.Length > 0)
        {
            // Use first instruction's address as start address
            var startAddress = normalized.Instructions[0].OriginalAddress;
            var cfgMetrics = CfgExtractor.ComputeMetrics(
                normalized.Instructions.ToList(),
                startAddress);

            bbCount = cfgMetrics.BasicBlockCount;
            cfgEdgeHash = cfgMetrics.EdgeHash;
        }

        // Compute semantic fingerprint if enabled and services available
        string? semanticHashHex = null;
        ImmutableArray<string>? semanticApiCalls = null;

        if (options.IncludeSemantic && SemanticAnalysisAvailable && originalInstructions.Count > 0)
        {
            try
            {
                var semanticFingerprint = await ComputeSemanticFingerprintAsync(
                    originalInstructions,
                    symbolName,
                    architecture,
                    ct);

                if (semanticFingerprint is not null)
                {
                    semanticHashHex = semanticFingerprint.GraphHashHex;
                    semanticApiCalls = semanticFingerprint.ApiCalls;
                }
            }
            catch (Exception ex)
            {
                _logger.LogWarning(
                    ex,
                    "Failed to compute semantic fingerprint for {Symbol}, continuing without semantic data",
                    symbolName);
            }
        }

        return new SymbolSignature
        {
            Name = symbolName,
            Scope = scope,
            HashAlg = options.HashAlgorithm,
            HashHex = hashHex,
            SizeBytes = normalizedBytes.Length,
            CfgBbCount = bbCount,
            CfgEdgeHash = cfgEdgeHash,
            Chunks = chunks,
            SemanticHashHex = semanticHashHex,
            SemanticApiCalls = semanticApiCalls
        };
    }

    private async Task<SemanticFingerprint?> ComputeSemanticFingerprintAsync(
        IReadOnlyList<DisassembledInstruction> instructions,
        string functionName,
        CpuArchitecture architecture,
        CancellationToken ct)
    {
        if (_irLiftingService is null || _graphExtractor is null || _fingerprintGenerator is null)
        {
            return null;
        }

        // Check if architecture is supported
        if (!_irLiftingService.SupportsArchitecture(architecture))
        {
            _logger.LogDebug(
                "Architecture {Arch} not supported for semantic analysis",
                architecture);
            return null;
        }

        // Lift to IR
        var startAddress = instructions.Count > 0 ? instructions[0].Address : 0UL;
        var lifted = await _irLiftingService.LiftToIrAsync(
            instructions,
            functionName,
            startAddress,
            architecture,
            ct: ct);

        // Extract semantic graph
        var graph = await _graphExtractor.ExtractGraphAsync(lifted, ct: ct);

        // Generate fingerprint
        var fingerprint = await _fingerprintGenerator.GenerateAsync(
            graph,
            startAddress,
            ct: ct);

        return fingerprint;
    }

    private static byte[] GetNormalizedBytes(NormalizedFunction normalized)
    {
        // Concatenate all normalized instruction bytes
@@ -1,6 +1,7 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using StellaOps.BinaryIndex.Disassembly;
using StellaOps.BinaryIndex.Normalization;

namespace StellaOps.BinaryIndex.DeltaSig;
@@ -49,4 +50,24 @@ public interface IDeltaSignatureGenerator
        string symbolName,
        string scope,
        SignatureOptions? options = null);

    /// <summary>
    /// Generates a signature for a single symbol with optional semantic analysis.
    /// </summary>
    /// <param name="normalized">The normalized function with instructions.</param>
    /// <param name="symbolName">Name of the symbol.</param>
    /// <param name="scope">Section containing the symbol.</param>
    /// <param name="originalInstructions">Original disassembled instructions for semantic analysis.</param>
    /// <param name="architecture">CPU architecture for IR lifting.</param>
    /// <param name="options">Generation options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The symbol signature with CFG metrics and optional semantic fingerprint.</returns>
    Task<SymbolSignature> GenerateSymbolSignatureAsync(
        NormalizedFunction normalized,
        string symbolName,
        string scope,
        IReadOnlyList<DisassembledInstruction> originalInstructions,
        CpuArchitecture architecture,
        SignatureOptions? options = null,
        CancellationToken ct = default);
}

@@ -13,11 +13,13 @@ namespace StellaOps.BinaryIndex.DeltaSig;
/// <param name="IncludeChunks">Include rolling chunk hashes for resilience.</param>
/// <param name="ChunkSize">Size of rolling chunks in bytes (default 2KB).</param>
/// <param name="HashAlgorithm">Hash algorithm to use (default sha256).</param>
/// <param name="IncludeSemantic">Include IR-level semantic fingerprints for optimization-resilient matching.</param>
public sealed record SignatureOptions(
    bool IncludeCfg = true,
    bool IncludeChunks = true,
    int ChunkSize = 2048,
-   string HashAlgorithm = "sha256");
+   string HashAlgorithm = "sha256",
+   bool IncludeSemantic = false);
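// Illustrative usage of the extended record (values hypothetical): opt into
// semantic fingerprints while keeping the other defaults.
//
//     var options = new SignatureOptions(IncludeSemantic: true);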

/// <summary>
/// Request for generating delta signatures from a binary.
@@ -190,6 +192,17 @@ public sealed record SymbolSignature
    /// Rolling chunk hashes for resilience against small changes.
    /// </summary>
    public ImmutableArray<ChunkHash>? Chunks { get; init; }

    /// <summary>
    /// Semantic fingerprint hash based on IR-level analysis (hex string).
    /// Provides resilience against compiler optimizations and instruction reordering.
    /// </summary>
    public string? SemanticHashHex { get; init; }

    /// <summary>
    /// API calls extracted from semantic analysis (for semantic anchoring).
    /// </summary>
    public ImmutableArray<string>? SemanticApiCalls { get; init; }
}

/// <summary>
@@ -2,8 +2,10 @@
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Disassembly;
using StellaOps.BinaryIndex.Normalization;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.DeltaSig;

@@ -15,17 +17,52 @@ public static class ServiceCollectionExtensions
    /// <summary>
    /// Adds delta signature generation and matching services.
    /// Requires disassembly and normalization services to be registered.
    /// If semantic services are registered, semantic fingerprinting will be available.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddDeltaSignatures(this IServiceCollection services)
    {
-       services.AddSingleton<IDeltaSignatureGenerator, DeltaSignatureGenerator>();
+       services.AddSingleton<IDeltaSignatureGenerator>(sp =>
+       {
+           var disassembly = sp.GetRequiredService<DisassemblyService>();
+           var normalization = sp.GetRequiredService<NormalizationService>();
+           var logger = sp.GetRequiredService<ILogger<DeltaSignatureGenerator>>();
+
+           // Semantic services are optional
+           var irLifting = sp.GetService<IIrLiftingService>();
+           var graphExtractor = sp.GetService<ISemanticGraphExtractor>();
+           var fingerprintGenerator = sp.GetService<ISemanticFingerprintGenerator>();
+
+           return new DeltaSignatureGenerator(
+               disassembly,
+               normalization,
+               irLifting,
+               graphExtractor,
+               fingerprintGenerator,
+               logger);
+       });

        services.AddSingleton<IDeltaSignatureMatcher, DeltaSignatureMatcher>();

        return services;
    }

    /// <summary>
    /// Adds delta signature services with semantic analysis support enabled.
    /// Requires disassembly and normalization services to be registered.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddDeltaSignaturesWithSemantic(this IServiceCollection services)
    {
        // Register semantic services first
        services.AddBinaryIndexSemantic();

        // Then register delta signature services
        return services.AddDeltaSignatures();
    }

    /// <summary>
    /// Adds all binary index services: disassembly, normalization, and delta signatures.
    /// </summary>
@@ -44,4 +81,26 @@ public static class ServiceCollectionExtensions

        return services;
    }

    /// <summary>
    /// Adds all binary index services with semantic analysis: disassembly, normalization, semantic, and delta signatures.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddBinaryIndexServicesWithSemantic(this IServiceCollection services)
    {
        // Add disassembly with default plugins
        services.AddDisassemblyServices();

        // Add normalization pipelines
        services.AddNormalizationPipelines();

        // Add semantic analysis services
        services.AddBinaryIndexSemantic();

        // Add delta signature services (will pick up semantic services)
        services.AddDeltaSignatures();

        return services;
    }
}
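A minimal wiring sketch against the extensions above; the host setup is illustrative, only the extension and service names are from this change:

var services = new ServiceCollection();
services.AddBinaryIndexServicesWithSemantic();
using var provider = services.BuildServiceProvider();
// With semantic services registered, the generator is constructed with IR
// lifting, graph extraction, and fingerprinting wired in.
var generator = provider.GetRequiredService<IDeltaSignatureGenerator>();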
@@ -14,6 +14,7 @@
    <ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly\StellaOps.BinaryIndex.Disassembly.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Normalization\StellaOps.BinaryIndex.Normalization.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
  </ItemGroup>

  <ItemGroup>
@@ -66,4 +66,81 @@ public static class DisassemblyServiceCollectionExtensions

        return services;
    }

    /// <summary>
    /// Adds the hybrid disassembly service with fallback logic between plugins.
    /// This replaces the standard disassembly service with a hybrid version that
    /// automatically falls back to secondary plugins when primary quality is low.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configuration">Configuration for binding options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddHybridDisassemblyServices(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configuration);

        // Register standard options
        services.AddOptions<DisassemblyOptions>()
            .Bind(configuration.GetSection(DisassemblyOptions.SectionName))
            .ValidateOnStart();

        // Register hybrid options
        services.AddOptions<HybridDisassemblyOptions>()
            .Bind(configuration.GetSection(HybridDisassemblyOptions.SectionName))
            .ValidateOnStart();

        // Register the plugin registry
        services.TryAddSingleton<IDisassemblyPluginRegistry, DisassemblyPluginRegistry>();

        // Register hybrid service as IDisassemblyService
        services.AddSingleton<HybridDisassemblyService>();
        services.AddSingleton<IDisassemblyService>(sp => sp.GetRequiredService<HybridDisassemblyService>());

        return services;
    }

    /// <summary>
    /// Adds the hybrid disassembly service with configuration actions.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureHybrid">Action to configure hybrid options.</param>
    /// <param name="configureDisassembly">Optional action to configure standard options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddHybridDisassemblyServices(
        this IServiceCollection services,
        Action<HybridDisassemblyOptions> configureHybrid,
        Action<DisassemblyOptions>? configureDisassembly = null)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configureHybrid);

        // Register standard options
        if (configureDisassembly != null)
        {
            services.AddOptions<DisassemblyOptions>()
                .Configure(configureDisassembly)
                .ValidateOnStart();
        }
        else
        {
            services.AddOptions<DisassemblyOptions>();
        }

        // Register hybrid options
        services.AddOptions<HybridDisassemblyOptions>()
            .Configure(configureHybrid)
            .ValidateOnStart();

        // Register the plugin registry
        services.TryAddSingleton<IDisassemblyPluginRegistry, DisassemblyPluginRegistry>();

        // Register hybrid service as IDisassemblyService
        services.AddSingleton<HybridDisassemblyService>();
        services.AddSingleton<IDisassemblyService>(sp => sp.GetRequiredService<HybridDisassemblyService>());

        return services;
    }
}

@@ -0,0 +1,572 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.BinaryIndex.Disassembly;

/// <summary>
/// Configuration options for hybrid disassembly with fallback logic.
/// </summary>
public sealed class HybridDisassemblyOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "HybridDisassembly";

    /// <summary>
    /// Primary plugin ID to try first. If null, auto-selects the highest-priority plugin.
    /// </summary>
    public string? PrimaryPluginId { get; set; }

    /// <summary>
    /// Fallback plugin ID to use when the primary fails the quality threshold.
    /// </summary>
    public string? FallbackPluginId { get; set; }

    /// <summary>
    /// Minimum confidence score (0.0-1.0) required to accept primary plugin results.
    /// If the primary result's confidence is below this, fallback is attempted.
    /// </summary>
    public double MinConfidenceThreshold { get; set; } = 0.7;

    /// <summary>
    /// Minimum function discovery count. If the primary finds fewer functions, fallback is attempted.
    /// </summary>
    public int MinFunctionCount { get; set; } = 1;

    /// <summary>
    /// Minimum instruction decode success rate (0.0-1.0).
    /// </summary>
    public double MinDecodeSuccessRate { get; set; } = 0.8;

    /// <summary>
    /// Whether to automatically fall back when the primary plugin doesn't support the architecture.
    /// </summary>
    public bool AutoFallbackOnUnsupported { get; set; } = true;

    /// <summary>
    /// Whether to enable hybrid fallback logic at all. If false, behaves like the standard service.
    /// </summary>
    public bool EnableFallback { get; set; } = true;

    /// <summary>
    /// Timeout in seconds for each plugin attempt.
    /// </summary>
    public int PluginTimeoutSeconds { get; set; } = 120;
}
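// Illustrative configuration for the "HybridDisassembly" section bound by
// AddHybridDisassemblyServices (plugin IDs and values here are hypothetical):
//
//   "HybridDisassembly": {
//     "PrimaryPluginId": "b2r2",
//     "FallbackPluginId": "ghidra",
//     "MinConfidenceThreshold": 0.7,
//     "MinDecodeSuccessRate": 0.8
//   }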
/// <summary>
/// Result of a disassembly operation with quality metrics.
/// </summary>
public sealed record DisassemblyQualityResult
{
    /// <summary>
    /// The loaded binary information.
    /// </summary>
    public required BinaryInfo Binary { get; init; }

    /// <summary>
    /// The plugin that produced this result.
    /// </summary>
    public required IDisassemblyPlugin Plugin { get; init; }

    /// <summary>
    /// Discovered code regions.
    /// </summary>
    public required ImmutableArray<CodeRegion> CodeRegions { get; init; }

    /// <summary>
    /// Discovered symbols/functions.
    /// </summary>
    public required ImmutableArray<SymbolInfo> Symbols { get; init; }

    /// <summary>
    /// Total instructions disassembled across all regions.
    /// </summary>
    public int TotalInstructions { get; init; }

    /// <summary>
    /// Successfully decoded instruction count.
    /// </summary>
    public int DecodedInstructions { get; init; }

    /// <summary>
    /// Failed/invalid instruction count.
    /// </summary>
    public int FailedInstructions { get; init; }

    /// <summary>
    /// Confidence score (0.0-1.0) based on quality metrics.
    /// </summary>
    public double Confidence { get; init; }

    /// <summary>
    /// Whether this result came from a fallback plugin.
    /// </summary>
    public bool UsedFallback { get; init; }

    /// <summary>
    /// Reason for fallback, if applicable.
    /// </summary>
    public string? FallbackReason { get; init; }

    /// <summary>
    /// Decode success rate (DecodedInstructions / TotalInstructions).
    /// </summary>
    public double DecodeSuccessRate =>
        TotalInstructions > 0 ? (double)DecodedInstructions / TotalInstructions : 0.0;
}

/// <summary>
/// Hybrid disassembly service that implements smart routing between plugins
/// with quality-based fallback logic (e.g., B2R2 primary -> Ghidra fallback).
/// </summary>
public sealed class HybridDisassemblyService : IDisassemblyService
{
    private readonly IDisassemblyPluginRegistry _registry;
    private readonly HybridDisassemblyOptions _options;
    private readonly ILogger<HybridDisassemblyService> _logger;

    /// <summary>
    /// Creates a new hybrid disassembly service.
    /// </summary>
    /// <param name="registry">The plugin registry.</param>
    /// <param name="options">Hybrid options.</param>
    /// <param name="logger">Logger instance.</param>
    public HybridDisassemblyService(
        IDisassemblyPluginRegistry registry,
        IOptions<HybridDisassemblyOptions> options,
        ILogger<HybridDisassemblyService> logger)
    {
        _registry = registry ?? throw new ArgumentNullException(nameof(registry));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public IDisassemblyPluginRegistry Registry => _registry;

    /// <inheritdoc />
    public (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadBinary(Stream stream, string? preferredPluginId = null)
    {
        ArgumentNullException.ThrowIfNull(stream);

        using var memStream = new MemoryStream();
        stream.CopyTo(memStream);
        return LoadBinary(memStream.ToArray(), preferredPluginId);
    }

    /// <inheritdoc />
    public (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadBinary(ReadOnlySpan<byte> bytes, string? preferredPluginId = null)
    {
        // Detect format/architecture
        var format = DetectFormat(bytes);
        var architecture = DetectArchitecture(bytes, format);

        _logger.LogDebug(
            "Hybrid service: Detected format {Format} and architecture {Arch}",
            format, architecture);

        if (!_options.EnableFallback)
        {
            // Simple mode - just use the best plugin
            return LoadWithBestPlugin(bytes, architecture, format, preferredPluginId);
        }

        // Hybrid mode with fallback logic
        return LoadWithFallback(bytes, architecture, format, preferredPluginId);
    }

    /// <summary>
    /// Loads a binary with quality assessment and returns a detailed quality result.
    /// </summary>
    /// <param name="bytes">The binary data.</param>
    /// <param name="preferredPluginId">Optional preferred plugin ID.</param>
    /// <returns>A quality result with metrics and fallback info.</returns>
    public DisassemblyQualityResult LoadBinaryWithQuality(ReadOnlySpan<byte> bytes, string? preferredPluginId = null)
    {
        var format = DetectFormat(bytes);
        var architecture = DetectArchitecture(bytes, format);

        // Try primary plugin
        var primaryPlugin = GetPrimaryPlugin(architecture, format, preferredPluginId);
        if (primaryPlugin is null)
        {
            throw new NotSupportedException(
                $"No disassembly plugin available for architecture {architecture} and format {format}");
        }

        var primaryResult = AssessQuality(primaryPlugin, bytes, architecture, format);

        // Check if primary meets quality threshold
        if (MeetsQualityThreshold(primaryResult))
        {
            _logger.LogInformation(
                "Primary plugin {Plugin} met quality threshold (confidence: {Confidence:P1})",
                primaryPlugin.Capabilities.PluginId, primaryResult.Confidence);
            return primaryResult;
        }

        // Try fallback
        if (!_options.EnableFallback)
        {
            _logger.LogWarning(
                "Primary plugin {Plugin} below threshold (confidence: {Confidence:P1}), fallback disabled",
                primaryPlugin.Capabilities.PluginId, primaryResult.Confidence);
            return primaryResult;
        }

        var fallbackPlugin = GetFallbackPlugin(primaryPlugin, architecture, format);
        if (fallbackPlugin is null)
        {
            _logger.LogWarning(
                "No fallback plugin available for {Arch}/{Format}",
                architecture, format);
            return primaryResult;
        }

        var fallbackResult = AssessQuality(fallbackPlugin, bytes, architecture, format);

        // Use fallback if it's better
        if (fallbackResult.Confidence > primaryResult.Confidence)
        {
            _logger.LogInformation(
                "Using fallback plugin {Plugin} (confidence: {Confidence:P1} > primary: {PrimaryConf:P1})",
                fallbackPlugin.Capabilities.PluginId, fallbackResult.Confidence, primaryResult.Confidence);

            return fallbackResult with
            {
                UsedFallback = true,
                FallbackReason = $"Primary confidence ({primaryResult.Confidence:P1}) below threshold"
            };
        }

        _logger.LogDebug(
            "Keeping primary plugin result (confidence: {Confidence:P1})",
            primaryResult.Confidence);
        return primaryResult;
    }

    #region Private Methods

    private (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadWithBestPlugin(
        ReadOnlySpan<byte> bytes,
        CpuArchitecture architecture,
        BinaryFormat format,
        string? preferredPluginId)
    {
        var plugin = GetPluginById(preferredPluginId) ?? _registry.FindPlugin(architecture, format);

        if (plugin == null)
        {
            throw new NotSupportedException(
                $"No disassembly plugin available for architecture {architecture} and format {format}");
        }

        var binary = plugin.LoadBinary(bytes, architecture, format);
        return (binary, plugin);
    }

    private (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadWithFallback(
        ReadOnlySpan<byte> bytes,
        CpuArchitecture architecture,
        BinaryFormat format,
        string? preferredPluginId)
    {
        var primaryPlugin = GetPrimaryPlugin(architecture, format, preferredPluginId);

        if (primaryPlugin is null)
        {
            // No primary, try fallback directly
            var fallback = GetFallbackPlugin(null, architecture, format);
            if (fallback is null)
            {
                throw new NotSupportedException(
                    $"No disassembly plugin available for architecture {architecture} and format {format}");
            }
            return (fallback.LoadBinary(bytes, architecture, format), fallback);
        }

        // Check if primary supports this arch/format
        if (_options.AutoFallbackOnUnsupported && !primaryPlugin.Capabilities.CanHandle(architecture, format))
        {
            _logger.LogDebug(
                "Primary plugin {Plugin} doesn't support {Arch}/{Format}, using fallback",
                primaryPlugin.Capabilities.PluginId, architecture, format);

            var fallback = GetFallbackPlugin(primaryPlugin, architecture, format);
            if (fallback is not null)
            {
                return (fallback.LoadBinary(bytes, architecture, format), fallback);
            }
        }

        // Use primary
        return (primaryPlugin.LoadBinary(bytes, architecture, format), primaryPlugin);
    }

    private IDisassemblyPlugin? GetPrimaryPlugin(
        CpuArchitecture architecture,
        BinaryFormat format,
        string? preferredPluginId)
    {
        // Explicit preferred plugin
        if (!string.IsNullOrEmpty(preferredPluginId))
        {
            return GetPluginById(preferredPluginId);
        }

        // Configured primary plugin
        if (!string.IsNullOrEmpty(_options.PrimaryPluginId))
        {
            return GetPluginById(_options.PrimaryPluginId);
        }

        // Auto-select highest priority
        return _registry.FindPlugin(architecture, format);
    }

    private IDisassemblyPlugin? GetFallbackPlugin(
        IDisassemblyPlugin? excludePlugin,
        CpuArchitecture architecture,
        BinaryFormat format)
    {
        // Explicit fallback plugin
        if (!string.IsNullOrEmpty(_options.FallbackPluginId))
        {
            var fallback = GetPluginById(_options.FallbackPluginId);
            if (fallback?.Capabilities.CanHandle(architecture, format) == true)
            {
                return fallback;
            }
        }

        // Find any other plugin that supports this arch/format
        return _registry.Plugins
            .Where(p => p != excludePlugin)
            .Where(p => p.Capabilities.CanHandle(architecture, format))
            .OrderByDescending(p => p.Capabilities.Priority)
            .FirstOrDefault();
    }

    private IDisassemblyPlugin? GetPluginById(string? pluginId)
    {
        return string.IsNullOrEmpty(pluginId) ? null : _registry.GetPlugin(pluginId);
    }

    private DisassemblyQualityResult AssessQuality(
        IDisassemblyPlugin plugin,
        ReadOnlySpan<byte> bytes,
        CpuArchitecture architecture,
        BinaryFormat format)
    {
        try
        {
            var binary = plugin.LoadBinary(bytes, architecture, format);
            var codeRegions = plugin.GetCodeRegions(binary).ToImmutableArray();
            var symbols = plugin.GetSymbols(binary).ToImmutableArray();

            // Assess quality by sampling disassembly
            int totalInstructions = 0;
            int decodedInstructions = 0;
            int failedInstructions = 0;

            foreach (var region in codeRegions.Take(3)) // Sample up to 3 regions
            {
                var instructions = plugin.Disassemble(binary, region).Take(1000).ToList();
                totalInstructions += instructions.Count;

                foreach (var instr in instructions)
                {
                    if (instr.Mnemonic.Equals("??", StringComparison.Ordinal) ||
                        instr.Mnemonic.Equals("invalid", StringComparison.OrdinalIgnoreCase) ||
                        instr.Mnemonic.Equals("db", StringComparison.OrdinalIgnoreCase))
                    {
                        failedInstructions++;
                    }
                    else
                    {
                        decodedInstructions++;
                    }
                }
            }

            // Calculate confidence
            var confidence = CalculateConfidence(
                symbols.Length,
                decodedInstructions,
                failedInstructions,
                codeRegions.Length);

            return new DisassemblyQualityResult
            {
                Binary = binary,
                Plugin = plugin,
                CodeRegions = codeRegions,
                Symbols = symbols,
                TotalInstructions = totalInstructions,
                DecodedInstructions = decodedInstructions,
                FailedInstructions = failedInstructions,
                Confidence = confidence,
                UsedFallback = false
            };
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Plugin {Plugin} failed during quality assessment", plugin.Capabilities.PluginId);

            return new DisassemblyQualityResult
            {
                Binary = null!,
                Plugin = plugin,
                CodeRegions = [],
                Symbols = [],
                TotalInstructions = 0,
                DecodedInstructions = 0,
                FailedInstructions = 0,
                Confidence = 0.0,
                UsedFallback = false,
                FallbackReason = $"Plugin failed: {ex.Message}"
            };
        }
    }

    private static double CalculateConfidence(
        int symbolCount,
        int decodedInstructions,
        int failedInstructions,
        int regionCount)
    {
        var totalInstructions = decodedInstructions + failedInstructions;
        if (totalInstructions == 0)
        {
            return 0.0;
        }

        // Decode success rate (weight: 0.5)
        var decodeRate = (double)decodedInstructions / totalInstructions;

        // Symbol discovery (weight: 0.3)
        var symbolScore = Math.Min(1.0, symbolCount / 10.0);

        // Region coverage (weight: 0.2)
        var regionScore = Math.Min(1.0, regionCount / 5.0);

        return (decodeRate * 0.5) + (symbolScore * 0.3) + (regionScore * 0.2);
    }
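    // Worked example (illustrative numbers): 900 of 1000 sampled instructions
    // decode (rate 0.9), 25 symbols found (score capped at 1.0), 4 regions
    // (score 4/5 = 0.8):
    //   confidence = 0.9 * 0.5 + 1.0 * 0.3 + 0.8 * 0.2 = 0.45 + 0.30 + 0.16 = 0.91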
    private bool MeetsQualityThreshold(DisassemblyQualityResult result)
    {
        if (result.Confidence < _options.MinConfidenceThreshold)
        {
            return false;
        }

        if (result.Symbols.Length < _options.MinFunctionCount)
        {
            return false;
        }

        if (result.DecodeSuccessRate < _options.MinDecodeSuccessRate)
        {
            return false;
        }

        return true;
    }

    #region Format/Architecture Detection (copied from DisassemblyService)

    private static BinaryFormat DetectFormat(ReadOnlySpan<byte> bytes)
    {
        if (bytes.Length < 4) return BinaryFormat.Raw;

        // ELF magic
        if (bytes[0] == 0x7F && bytes[1] == 'E' && bytes[2] == 'L' && bytes[3] == 'F')
            return BinaryFormat.ELF;

        // PE magic
        if (bytes[0] == 'M' && bytes[1] == 'Z')
            return BinaryFormat.PE;

        // Mach-O magic
        if ((bytes[0] == 0xFE && bytes[1] == 0xED && bytes[2] == 0xFA && (bytes[3] == 0xCE || bytes[3] == 0xCF)) ||
            (bytes[3] == 0xFE && bytes[2] == 0xED && bytes[1] == 0xFA && (bytes[0] == 0xCE || bytes[0] == 0xCF)))
            return BinaryFormat.MachO;

        // WASM magic
        if (bytes[0] == 0x00 && bytes[1] == 'a' && bytes[2] == 's' && bytes[3] == 'm')
            return BinaryFormat.WASM;

        return BinaryFormat.Raw;
    }

    private static CpuArchitecture DetectArchitecture(ReadOnlySpan<byte> bytes, BinaryFormat format)
    {
        return format switch
        {
            BinaryFormat.ELF when bytes.Length > 18 => DetectElfArchitecture(bytes),
            BinaryFormat.PE when bytes.Length > 0x40 => DetectPeArchitecture(bytes),
            BinaryFormat.MachO when bytes.Length > 8 => DetectMachOArchitecture(bytes),
            _ => CpuArchitecture.X86_64
        };
    }

    private static CpuArchitecture DetectElfArchitecture(ReadOnlySpan<byte> bytes)
    {
        var machine = (ushort)(bytes[18] | (bytes[19] << 8));
        return machine switch
        {
            0x03 => CpuArchitecture.X86,
            0x3E => CpuArchitecture.X86_64,
            0x28 => CpuArchitecture.ARM32,
            0xB7 => CpuArchitecture.ARM64,
            0x08 => CpuArchitecture.MIPS32,
            0xF3 => CpuArchitecture.RISCV64,
            0x14 => CpuArchitecture.PPC32,
            0x02 => CpuArchitecture.SPARC,
            _ => bytes[4] == 2 ? CpuArchitecture.X86_64 : CpuArchitecture.X86
        };
    }

    private static CpuArchitecture DetectPeArchitecture(ReadOnlySpan<byte> bytes)
    {
        var peOffset = bytes[0x3C] | (bytes[0x3D] << 8) | (bytes[0x3E] << 16) | (bytes[0x3F] << 24);
        if (peOffset < 0 || peOffset + 6 > bytes.Length) return CpuArchitecture.X86;

        var machine = (ushort)(bytes[peOffset + 4] | (bytes[peOffset + 5] << 8));
        return machine switch
        {
            0x014c => CpuArchitecture.X86,
            0x8664 => CpuArchitecture.X86_64,
            0xaa64 => CpuArchitecture.ARM64,
            0x01c4 => CpuArchitecture.ARM32,
            _ => CpuArchitecture.X86
        };
    }

    private static CpuArchitecture DetectMachOArchitecture(ReadOnlySpan<byte> bytes)
    {
        bool isBigEndian = bytes[0] == 0xFE;
        uint cpuType = isBigEndian
            ? (uint)((bytes[4] << 24) | (bytes[5] << 16) | (bytes[6] << 8) | bytes[7])
            : (uint)(bytes[4] | (bytes[5] << 8) | (bytes[6] << 16) | (bytes[7] << 24));

        return cpuType switch
        {
            0x00000007 => CpuArchitecture.X86,
            0x01000007 => CpuArchitecture.X86_64,
            0x0000000C => CpuArchitecture.ARM32,
            0x0100000C => CpuArchitecture.ARM64,
            _ => CpuArchitecture.X86_64
        };
    }

    #endregion

    #endregion
}
@@ -0,0 +1,460 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.Ensemble;

/// <summary>
/// Ensemble decision engine that combines syntactic, semantic, and ML signals.
/// </summary>
public sealed class EnsembleDecisionEngine : IEnsembleDecisionEngine
{
    private readonly IAstComparisonEngine _astEngine;
    private readonly ISemanticMatcher _semanticMatcher;
    private readonly IEmbeddingService _embeddingService;
    private readonly EnsembleOptions _defaultOptions;
    private readonly ILogger<EnsembleDecisionEngine> _logger;

    public EnsembleDecisionEngine(
        IAstComparisonEngine astEngine,
        ISemanticMatcher semanticMatcher,
        IEmbeddingService embeddingService,
        IOptions<EnsembleOptions> options,
        ILogger<EnsembleDecisionEngine> logger)
    {
        _astEngine = astEngine ?? throw new ArgumentNullException(nameof(astEngine));
        _semanticMatcher = semanticMatcher ?? throw new ArgumentNullException(nameof(semanticMatcher));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _defaultOptions = options?.Value ?? new EnsembleOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public async Task<EnsembleResult> CompareAsync(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(source);
        ArgumentNullException.ThrowIfNull(target);
        ct.ThrowIfCancellationRequested();

        options ??= _defaultOptions;

        // Check for exact hash match first (optimization)
        var exactHashMatch = CheckExactHashMatch(source, target);

        // Compute individual signals
        var contributions = new List<SignalContribution>();
        var availableWeight = 0m;

        // Syntactic (AST) signal
        var syntacticContribution = ComputeSyntacticSignal(source, target, options);
        contributions.Add(syntacticContribution);
        if (syntacticContribution.IsAvailable)
        {
            availableWeight += options.SyntacticWeight;
        }

        // Semantic (graph) signal
        var semanticContribution = await ComputeSemanticSignalAsync(source, target, options, ct);
        contributions.Add(semanticContribution);
        if (semanticContribution.IsAvailable)
        {
            availableWeight += options.SemanticWeight;
        }

        // ML (embedding) signal
        var embeddingContribution = ComputeEmbeddingSignal(source, target, options);
        contributions.Add(embeddingContribution);
        if (embeddingContribution.IsAvailable)
        {
            availableWeight += options.EmbeddingWeight;
        }

        // Compute effective weights (normalize if some signals missing)
        var effectiveWeights = ComputeEffectiveWeights(contributions, options, availableWeight);

        // Update contributions with effective weights
        var adjustedContributions = AdjustContributionWeights(contributions, effectiveWeights);

        // Compute ensemble score
        var ensembleScore = ComputeEnsembleScore(adjustedContributions, exactHashMatch, options);

        // Determine match and confidence
        var isMatch = ensembleScore >= options.MatchThreshold;
        var confidence = DetermineConfidence(ensembleScore, adjustedContributions, exactHashMatch);
        var reason = BuildDecisionReason(adjustedContributions, exactHashMatch, isMatch);

        var result = new EnsembleResult
        {
            SourceFunctionId = source.FunctionId,
            TargetFunctionId = target.FunctionId,
            EnsembleScore = ensembleScore,
            Contributions = adjustedContributions.ToImmutableArray(),
            IsMatch = isMatch,
            Confidence = confidence,
            DecisionReason = reason,
            ExactHashMatch = exactHashMatch,
            AdjustedWeights = effectiveWeights
        };

        return result;
    }
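    // Illustrative weight renormalization (the exact scheme lives in
    // ComputeEffectiveWeights, which is not shown in this hunk): with
    // hypothetical weights 0.4 syntactic / 0.4 semantic / 0.2 embedding and the
    // semantic signal unavailable, the remaining weights would rescale over the
    // available mass 0.6, giving 0.4 / 0.6 ≈ 0.667 and 0.2 / 0.6 ≈ 0.333.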
/// <inheritdoc />
|
||||
public async Task<ImmutableArray<EnsembleResult>> FindMatchesAsync(
|
||||
FunctionAnalysis query,
|
||||
IEnumerable<FunctionAnalysis> corpus,
|
||||
EnsembleOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(query);
|
||||
ArgumentNullException.ThrowIfNull(corpus);
|
||||
|
||||
options ??= _defaultOptions;
|
||||
var results = new List<EnsembleResult>();
|
||||
|
||||
foreach (var candidate in corpus)
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
|
||||
var result = await CompareAsync(query, candidate, options, ct);
|
||||
if (result.EnsembleScore >= options.MinimumSignalThreshold)
|
||||
{
|
||||
results.Add(result);
|
||||
}
|
||||
}
|
||||
|
||||
return results
|
||||
.OrderByDescending(r => r.EnsembleScore)
|
||||
.Take(options.MaxCandidates)
|
||||
.ToImmutableArray();
|
||||
}
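
    // Usage sketch (illustrative caller; identifiers other than the interface
    // members used here are hypothetical, not part of this module):
    //
    //   var query = await builder.BuildAnalysisAsync("fn-1", "memcpy", decompiled, ct: ct);
    //   var matches = await engine.FindMatchesAsync(query, corpus);
    //   foreach (var match in matches)
    //   {
    //       Console.WriteLine($"{match.TargetFunctionId}: {match.EnsembleScore:P1} ({match.Confidence})");
    //   }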

    /// <inheritdoc />
    public async Task<BatchComparisonResult> CompareBatchAsync(
        IEnumerable<FunctionAnalysis> sources,
        IEnumerable<FunctionAnalysis> targets,
        EnsembleOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(sources);
        ArgumentNullException.ThrowIfNull(targets);

        options ??= _defaultOptions;
        var startTime = DateTime.UtcNow;
        var results = new List<EnsembleResult>();
        var targetList = targets.ToList();

        foreach (var source in sources)
        {
            foreach (var target in targetList)
            {
                ct.ThrowIfCancellationRequested();
                var result = await CompareAsync(source, target, options, ct);
                results.Add(result);
            }
        }

        var duration = DateTime.UtcNow - startTime;
        var statistics = ComputeStatistics(results);

        return new BatchComparisonResult
        {
            Results = results.ToImmutableArray(),
            Statistics = statistics,
            Duration = duration
        };
    }

    private static bool CheckExactHashMatch(FunctionAnalysis source, FunctionAnalysis target)
    {
        if (source.NormalizedCodeHash is null || target.NormalizedCodeHash is null)
        {
            return false;
        }

        return source.NormalizedCodeHash.SequenceEqual(target.NormalizedCodeHash);
    }

    private SignalContribution ComputeSyntacticSignal(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions options)
    {
        if (source.Ast is null || target.Ast is null)
        {
            return new SignalContribution
            {
                SignalType = SignalType.Syntactic,
                RawScore = 0m,
                Weight = options.SyntacticWeight,
                IsAvailable = false,
                Quality = SignalQuality.Unavailable
            };
        }

        var similarity = _astEngine.ComputeStructuralSimilarity(source.Ast, target.Ast);
        var quality = AssessAstQuality(source.Ast, target.Ast);

        return new SignalContribution
        {
            SignalType = SignalType.Syntactic,
            RawScore = similarity,
            Weight = options.SyntacticWeight,
            IsAvailable = true,
            Quality = quality
        };
    }

    private async Task<SignalContribution> ComputeSemanticSignalAsync(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions options,
        CancellationToken ct)
    {
        if (source.SemanticGraph is null || target.SemanticGraph is null)
        {
            return new SignalContribution
            {
                SignalType = SignalType.Semantic,
                RawScore = 0m,
                Weight = options.SemanticWeight,
                IsAvailable = false,
                Quality = SignalQuality.Unavailable
            };
        }

        var similarity = await _semanticMatcher.ComputeGraphSimilarityAsync(
            source.SemanticGraph,
            target.SemanticGraph,
            ct);
        var quality = AssessGraphQuality(source.SemanticGraph, target.SemanticGraph);

        return new SignalContribution
        {
            SignalType = SignalType.Semantic,
            RawScore = similarity,
            Weight = options.SemanticWeight,
            IsAvailable = true,
            Quality = quality
        };
    }

    private SignalContribution ComputeEmbeddingSignal(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions options)
    {
        if (source.Embedding is null || target.Embedding is null)
        {
            return new SignalContribution
            {
                SignalType = SignalType.Embedding,
                RawScore = 0m,
                Weight = options.EmbeddingWeight,
                IsAvailable = false,
                Quality = SignalQuality.Unavailable
            };
        }

        var similarity = _embeddingService.ComputeSimilarity(
            source.Embedding,
            target.Embedding,
            SimilarityMetric.Cosine);

        return new SignalContribution
        {
            SignalType = SignalType.Embedding,
            RawScore = similarity,
            Weight = options.EmbeddingWeight,
            IsAvailable = true,
            Quality = SignalQuality.Normal
        };
    }

    private static SignalQuality AssessAstQuality(DecompiledAst ast1, DecompiledAst ast2)
    {
        var minNodes = Math.Min(ast1.Root.Children.Length, ast2.Root.Children.Length);

        return minNodes switch
        {
            < 3 => SignalQuality.Low,
            < 10 => SignalQuality.Normal,
            _ => SignalQuality.High
        };
    }

    private static SignalQuality AssessGraphQuality(KeySemanticsGraph g1, KeySemanticsGraph g2)
    {
        var minNodes = Math.Min(g1.Nodes.Length, g2.Nodes.Length);

        return minNodes switch
        {
            < 3 => SignalQuality.Low,
            < 10 => SignalQuality.Normal,
            _ => SignalQuality.High
        };
    }

    private static EffectiveWeights ComputeEffectiveWeights(
        List<SignalContribution> contributions,
        EnsembleOptions options,
        decimal availableWeight)
    {
        if (!options.AdaptiveWeights || availableWeight >= 0.999m)
        {
            return new EffectiveWeights(
                options.SyntacticWeight,
                options.SemanticWeight,
                options.EmbeddingWeight);
        }

        // Redistribute weight from unavailable signals to available ones
        var syntactic = contributions.First(c => c.SignalType == SignalType.Syntactic);
        var semantic = contributions.First(c => c.SignalType == SignalType.Semantic);
        var embedding = contributions.First(c => c.SignalType == SignalType.Embedding);

        var syntacticWeight = syntactic.IsAvailable
            ? options.SyntacticWeight / availableWeight
            : 0m;
        var semanticWeight = semantic.IsAvailable
            ? options.SemanticWeight / availableWeight
            : 0m;
        var embeddingWeight = embedding.IsAvailable
            ? options.EmbeddingWeight / availableWeight
            : 0m;

        return new EffectiveWeights(syntacticWeight, semanticWeight, embeddingWeight);
    }
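
    // Worked example (assumed defaults 0.25/0.35/0.40): if the semantic signal is
    // unavailable, availableWeight = 0.25 + 0.40 = 0.65, so the effective weights
    // become 0.25 / 0.65 ≈ 0.385 (syntactic) and 0.40 / 0.65 ≈ 0.615 (embedding),
    // which again sum to 1.0 while preserving the configured ratio between the
    // remaining signals.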

    private static List<SignalContribution> AdjustContributionWeights(
        List<SignalContribution> contributions,
        EffectiveWeights weights)
    {
        return contributions.Select(c => c.SignalType switch
        {
            SignalType.Syntactic => c with { Weight = weights.Syntactic },
            SignalType.Semantic => c with { Weight = weights.Semantic },
            SignalType.Embedding => c with { Weight = weights.Embedding },
            _ => c
        }).ToList();
    }

    private static decimal ComputeEnsembleScore(
        List<SignalContribution> contributions,
        bool exactHashMatch,
        EnsembleOptions options)
    {
        var weightedSum = contributions
            .Where(c => c.IsAvailable)
            .Sum(c => c.WeightedScore);

        // Apply exact match boost
        if (exactHashMatch && options.UseExactHashMatch)
        {
            weightedSum = Math.Min(1.0m, weightedSum + options.ExactMatchBoost);
        }

        return Math.Clamp(weightedSum, 0m, 1m);
    }
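
    // Worked example, continuing the adjusted weights above: with syntactic raw
    // score 0.80 and embedding raw score 0.90, weightedSum = 0.80 * 0.385 +
    // 0.90 * 0.615 ≈ 0.308 + 0.554 = 0.862, which clears the default
    // MatchThreshold of 0.85. An exact hash match would add ExactMatchBoost
    // (0.10) before the clamp.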

    private static ConfidenceLevel DetermineConfidence(
        decimal score,
        List<SignalContribution> contributions,
        bool exactHashMatch)
    {
        // Exact hash match is very high confidence
        if (exactHashMatch)
        {
            return ConfidenceLevel.VeryHigh;
        }

        // Count available and high-quality signals
        var availableCount = contributions.Count(c => c.IsAvailable);
        var highQualityCount = contributions.Count(c =>
            c.IsAvailable && c.Quality >= SignalQuality.Normal);

        // High score with multiple agreeing signals
        if (score >= 0.95m && availableCount >= 3)
        {
            return ConfidenceLevel.VeryHigh;
        }

        if (score >= 0.90m && highQualityCount >= 2)
        {
            return ConfidenceLevel.High;
        }

        if (score >= 0.80m && availableCount >= 2)
        {
            return ConfidenceLevel.Medium;
        }

        if (score >= 0.70m)
        {
            return ConfidenceLevel.Low;
        }

        return ConfidenceLevel.VeryLow;
    }

    private static string BuildDecisionReason(
        List<SignalContribution> contributions,
        bool exactHashMatch,
        bool isMatch)
    {
        if (exactHashMatch)
        {
            return "Exact normalized code hash match";
        }

        var availableSignals = contributions
            .Where(c => c.IsAvailable)
            .Select(c => $"{c.SignalType}: {c.RawScore:P0}")
            .ToList();

        if (availableSignals.Count == 0)
        {
            return "No signals available for comparison";
        }

        var signalSummary = string.Join(", ", availableSignals);
        return isMatch
            ? $"Match based on: {signalSummary}"
            : $"No match. Scores: {signalSummary}";
    }

    private static ComparisonStatistics ComputeStatistics(List<EnsembleResult> results)
    {
        var matchCount = results.Count(r => r.IsMatch);
        var highConfidenceMatches = results.Count(r =>
            r.IsMatch && r.Confidence >= ConfidenceLevel.High);
        var exactHashMatches = results.Count(r => r.ExactHashMatch);
        var averageScore = results.Count > 0
            ? results.Average(r => r.EnsembleScore)
            : 0m;

        var confidenceDistribution = results
            .GroupBy(r => r.Confidence)
            .ToImmutableDictionary(g => g.Key, g => g.Count());

        return new ComparisonStatistics
        {
            TotalComparisons = results.Count,
            MatchCount = matchCount,
            HighConfidenceMatches = highConfidenceMatches,
            ExactHashMatches = exactHashMatches,
            AverageScore = averageScore,
            ConfidenceDistribution = confidenceDistribution
        };
    }
}
@@ -0,0 +1,110 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using Microsoft.Extensions.DependencyInjection;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.Ensemble;

/// <summary>
/// Extension methods for registering ensemble services.
/// </summary>
public static class EnsembleServiceCollectionExtensions
{
    /// <summary>
    /// Adds ensemble decision engine services to the service collection.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddEnsembleServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Register ensemble components
        services.AddScoped<IEnsembleDecisionEngine, EnsembleDecisionEngine>();
        services.AddScoped<IFunctionAnalysisBuilder, FunctionAnalysisBuilder>();
        services.AddScoped<IWeightTuningService, WeightTuningService>();

        return services;
    }

    /// <summary>
    /// Adds ensemble services with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure ensemble options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddEnsembleServices(
        this IServiceCollection services,
        Action<EnsembleOptions> configureOptions)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configureOptions);

        services.Configure(configureOptions);
        return services.AddEnsembleServices();
    }

    /// <summary>
    /// Adds the complete binary similarity stack (Decompiler + ML + Semantic + Ensemble).
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddBinarySimilarityServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Add all underlying services
        services.AddDecompilerServices();
        services.AddMlServices();
        services.AddBinaryIndexSemantic();

        // Add ensemble on top
        services.AddEnsembleServices();

        return services;
    }

    /// <summary>
    /// Adds the complete binary similarity stack with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureEnsemble">Action to configure ensemble options.</param>
    /// <param name="configureMl">Action to configure ML options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddBinarySimilarityServices(
        this IServiceCollection services,
        Action<EnsembleOptions>? configureEnsemble = null,
        Action<MlOptions>? configureMl = null)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Add all underlying services
        services.AddDecompilerServices();

        if (configureMl is not null)
        {
            services.AddMlServices(configureMl);
        }
        else
        {
            services.AddMlServices();
        }

        services.AddBinaryIndexSemantic();

        // Add ensemble with options
        if (configureEnsemble is not null)
        {
            services.AddEnsembleServices(configureEnsemble);
        }
        else
        {
            services.AddEnsembleServices();
        }

        return services;
    }
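
    // Registration sketch (hypothetical host wiring; only the extension methods
    // above are part of this module):
    //
    //   var services = new ServiceCollection();
    //   services.AddLogging();
    //   services.AddBinarySimilarityServices(ensemble => ensemble.MatchThreshold = 0.90m);
    //   var engine = services.BuildServiceProvider().GetRequiredService<IEnsembleDecisionEngine>();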
}
@@ -0,0 +1,165 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.Ensemble;

/// <summary>
/// Builds complete function analysis from various input sources.
/// </summary>
public sealed class FunctionAnalysisBuilder : IFunctionAnalysisBuilder
{
    private readonly IDecompiledCodeParser _parser;
    private readonly ICodeNormalizer _normalizer;
    private readonly IEmbeddingService _embeddingService;
    private readonly IIrLiftingService? _irLiftingService;
    private readonly ISemanticGraphExtractor? _graphExtractor;
    private readonly ILogger<FunctionAnalysisBuilder> _logger;

    public FunctionAnalysisBuilder(
        IDecompiledCodeParser parser,
        ICodeNormalizer normalizer,
        IEmbeddingService embeddingService,
        ILogger<FunctionAnalysisBuilder> logger,
        IIrLiftingService? irLiftingService = null,
        ISemanticGraphExtractor? graphExtractor = null)
    {
        _parser = parser ?? throw new ArgumentNullException(nameof(parser));
        _normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _irLiftingService = irLiftingService;
        _graphExtractor = graphExtractor;
    }

    /// <inheritdoc />
    public async Task<FunctionAnalysis> BuildAnalysisAsync(
        string functionId,
        string functionName,
        string decompiledCode,
        ulong? address = null,
        int? sizeBytes = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(functionId);
        ArgumentException.ThrowIfNullOrEmpty(functionName);
        ArgumentException.ThrowIfNullOrEmpty(decompiledCode);

        ct.ThrowIfCancellationRequested();

        _logger.LogDebug(
            "Building analysis for function {FunctionId} ({FunctionName})",
            functionId, functionName);

        // Parse AST
        DecompiledAst? ast = null;
        try
        {
            ast = _parser.Parse(decompiledCode);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to parse AST for {FunctionId}", functionId);
        }

        // Compute normalized hash
        byte[]? normalizedHash = null;
        try
        {
            normalizedHash = _normalizer.ComputeCanonicalHash(decompiledCode);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to compute normalized hash for {FunctionId}", functionId);
        }

        // Build semantic graph (requires IR lifting service and graph extractor)
        KeySemanticsGraph? semanticGraph = null;
        if (_irLiftingService is not null && _graphExtractor is not null)
        {
            try
            {
                // Note: full semantic graph extraction requires the binary bytes,
                // not just decompiled code, so semanticGraph stays null on this
                // simplified path when binary data is not available.
                _logger.LogDebug(
                    "Semantic graph extraction requires binary data for {FunctionId}",
                    functionId);
            }
            catch (Exception ex)
            {
                _logger.LogWarning(ex, "Failed to build semantic graph for {FunctionId}", functionId);
            }
        }

        // Generate embedding
        FunctionEmbedding? embedding = null;
        try
        {
            var input = new EmbeddingInput(
                DecompiledCode: decompiledCode,
                SemanticGraph: semanticGraph,
                InstructionBytes: null,
                PreferredInput: EmbeddingInputType.DecompiledCode);
            embedding = await _embeddingService.GenerateEmbeddingAsync(input, ct: ct);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to generate embedding for {FunctionId}", functionId);
        }

        return new FunctionAnalysis
        {
            FunctionId = functionId,
            FunctionName = functionName,
            Ast = ast,
            SemanticGraph = semanticGraph,
            Embedding = embedding,
            NormalizedCodeHash = normalizedHash,
            DecompiledCode = decompiledCode,
            Address = address,
            SizeBytes = sizeBytes
        };
    }

    /// <inheritdoc />
    public FunctionAnalysis BuildFromComponents(
        string functionId,
        string functionName,
        string? decompiledCode = null,
        DecompiledAst? ast = null,
        KeySemanticsGraph? semanticGraph = null,
        FunctionEmbedding? embedding = null)
    {
        ArgumentException.ThrowIfNullOrEmpty(functionId);
        ArgumentException.ThrowIfNullOrEmpty(functionName);

        byte[]? normalizedHash = null;
        if (decompiledCode is not null)
        {
            try
            {
                normalizedHash = _normalizer.ComputeCanonicalHash(decompiledCode);
            }
            catch
            {
                // Ignore normalization errors for pre-built components
            }
        }

        return new FunctionAnalysis
        {
            FunctionId = functionId,
            FunctionName = functionName,
            Ast = ast,
            SemanticGraph = semanticGraph,
            Embedding = embedding,
            NormalizedCodeHash = normalizedHash,
            DecompiledCode = decompiledCode
        };
    }
}
@@ -0,0 +1,129 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Ensemble;

/// <summary>
/// Ensemble decision engine that combines multiple similarity signals
/// to determine function equivalence.
/// </summary>
public interface IEnsembleDecisionEngine
{
    /// <summary>
    /// Compare two functions using all available signals.
    /// </summary>
    /// <param name="source">Source function analysis.</param>
    /// <param name="target">Target function analysis.</param>
    /// <param name="options">Ensemble options (optional).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Ensemble comparison result.</returns>
    Task<EnsembleResult> CompareAsync(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Find the best matches for a function from a corpus.
    /// </summary>
    /// <param name="query">Query function analysis.</param>
    /// <param name="corpus">Corpus of candidate functions.</param>
    /// <param name="options">Ensemble options (optional).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Top matching functions.</returns>
    Task<ImmutableArray<EnsembleResult>> FindMatchesAsync(
        FunctionAnalysis query,
        IEnumerable<FunctionAnalysis> corpus,
        EnsembleOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Perform batch comparison between two sets of functions.
    /// </summary>
    /// <param name="sources">Source functions.</param>
    /// <param name="targets">Target functions.</param>
    /// <param name="options">Ensemble options (optional).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Batch comparison result with statistics.</returns>
    Task<BatchComparisonResult> CompareBatchAsync(
        IEnumerable<FunctionAnalysis> sources,
        IEnumerable<FunctionAnalysis> targets,
        EnsembleOptions? options = null,
        CancellationToken ct = default);
}

/// <summary>
/// Weight tuning service for optimizing ensemble weights.
/// </summary>
public interface IWeightTuningService
{
    /// <summary>
    /// Tune weights using grid search over training pairs.
    /// </summary>
    /// <param name="trainingPairs">Labeled training pairs.</param>
    /// <param name="gridStep">Step size for grid search (e.g., 0.05).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Best weights found.</returns>
    Task<WeightTuningResult> TuneWeightsAsync(
        IEnumerable<EnsembleTrainingPair> trainingPairs,
        decimal gridStep = 0.05m,
        CancellationToken ct = default);

    /// <summary>
    /// Evaluate a specific weight combination on training data.
    /// </summary>
    /// <param name="weights">Weights to evaluate.</param>
    /// <param name="trainingPairs">Labeled training pairs.</param>
    /// <param name="threshold">Match threshold.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Evaluation metrics.</returns>
    Task<WeightEvaluation> EvaluateWeightsAsync(
        EffectiveWeights weights,
        IEnumerable<EnsembleTrainingPair> trainingPairs,
        decimal threshold = 0.85m,
        CancellationToken ct = default);
}

/// <summary>
/// Function analysis builder that collects all signal sources.
/// </summary>
public interface IFunctionAnalysisBuilder
{
    /// <summary>
    /// Build complete function analysis from raw data.
    /// </summary>
    /// <param name="functionId">Function identifier.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="decompiledCode">Raw decompiled code.</param>
    /// <param name="address">Function address (optional).</param>
    /// <param name="sizeBytes">Function size in bytes (optional).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Complete function analysis.</returns>
    Task<FunctionAnalysis> BuildAnalysisAsync(
        string functionId,
        string functionName,
        string decompiledCode,
        ulong? address = null,
        int? sizeBytes = null,
        CancellationToken ct = default);

    /// <summary>
    /// Build function analysis from existing components.
    /// </summary>
    /// <param name="functionId">Function identifier.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="decompiledCode">Raw decompiled code (optional).</param>
    /// <param name="ast">Pre-parsed AST (optional).</param>
    /// <param name="semanticGraph">Pre-built semantic graph (optional).</param>
    /// <param name="embedding">Pre-computed embedding (optional).</param>
    /// <returns>Function analysis.</returns>
    FunctionAnalysis BuildFromComponents(
        string functionId,
        string functionName,
        string? decompiledCode = null,
        Decompiler.DecompiledAst? ast = null,
        Semantic.KeySemanticsGraph? semanticGraph = null,
        ML.FunctionEmbedding? embedding = null);
}
@@ -0,0 +1,446 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;

namespace StellaOps.BinaryIndex.Ensemble;

/// <summary>
/// Complete analysis of a function from all signal sources.
/// </summary>
public sealed record FunctionAnalysis
{
    /// <summary>
    /// Unique identifier for the function.
    /// </summary>
    public required string FunctionId { get; init; }

    /// <summary>
    /// Function name if available.
    /// </summary>
    public required string FunctionName { get; init; }

    /// <summary>
    /// Decompiled AST representation.
    /// </summary>
    public DecompiledAst? Ast { get; init; }

    /// <summary>
    /// Semantic graph representation.
    /// </summary>
    public KeySemanticsGraph? SemanticGraph { get; init; }

    /// <summary>
    /// ML embedding representation.
    /// </summary>
    public FunctionEmbedding? Embedding { get; init; }

    /// <summary>
    /// Normalized code hash for quick equality check.
    /// </summary>
    public byte[]? NormalizedCodeHash { get; init; }

    /// <summary>
    /// Raw decompiled code.
    /// </summary>
    public string? DecompiledCode { get; init; }

    /// <summary>
    /// Binary address of the function.
    /// </summary>
    public ulong? Address { get; init; }

    /// <summary>
    /// Size of the function in bytes.
    /// </summary>
    public int? SizeBytes { get; init; }
}

/// <summary>
/// Configuration options for ensemble decision making.
/// </summary>
public sealed class EnsembleOptions
{
    /// <summary>
    /// Weight for syntactic (AST-based) similarity. Default: 0.25
    /// </summary>
    public decimal SyntacticWeight { get; set; } = 0.25m;

    /// <summary>
    /// Weight for semantic (graph-based) similarity. Default: 0.35
    /// </summary>
    public decimal SemanticWeight { get; set; } = 0.35m;

    /// <summary>
    /// Weight for ML embedding similarity. Default: 0.40
    /// </summary>
    public decimal EmbeddingWeight { get; set; } = 0.40m;

    /// <summary>
    /// Minimum ensemble score to consider functions as matching.
    /// </summary>
    public decimal MatchThreshold { get; set; } = 0.85m;

    /// <summary>
    /// Minimum ensemble score for a candidate to be retained when
    /// searching a corpus (applied by FindMatchesAsync).
    /// </summary>
    public decimal MinimumSignalThreshold { get; set; } = 0.50m;

    /// <summary>
    /// Whether to require all three signals for a match decision.
    /// </summary>
    public bool RequireAllSignals { get; set; } = false;

    /// <summary>
    /// Whether to use exact hash matching as an optimization.
    /// </summary>
    public bool UseExactHashMatch { get; set; } = true;

    /// <summary>
    /// Confidence boost when normalized code hashes match exactly.
    /// </summary>
    public decimal ExactMatchBoost { get; set; } = 0.10m;

    /// <summary>
    /// Maximum number of candidate matches to return.
    /// </summary>
    public int MaxCandidates { get; set; } = 10;

    /// <summary>
    /// Enable adaptive weight adjustment based on signal quality.
    /// </summary>
    public bool AdaptiveWeights { get; set; } = true;

    /// <summary>
    /// Checks whether the three weights sum to 1.0 (within a 0.001 tolerance).
    /// </summary>
    public bool AreWeightsValid()
    {
        var total = SyntacticWeight + SemanticWeight + EmbeddingWeight;
        return Math.Abs(total - 1.0m) < 0.001m;
    }

    /// <summary>
    /// Normalizes weights to sum to 1.0.
    /// </summary>
    public void NormalizeWeights()
    {
        var total = SyntacticWeight + SemanticWeight + EmbeddingWeight;
        if (total > 0)
        {
            SyntacticWeight /= total;
            SemanticWeight /= total;
            EmbeddingWeight /= total;
        }
    }
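
    // Worked example: weights of 0.20/0.30/0.40 sum to 0.90, so AreWeightsValid()
    // returns false; NormalizeWeights() rescales them to ≈ 0.222/0.333/0.444,
    // after which AreWeightsValid() returns true.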
}

/// <summary>
/// Result of ensemble comparison between two functions.
/// </summary>
public sealed record EnsembleResult
{
    /// <summary>
    /// Source function identifier.
    /// </summary>
    public required string SourceFunctionId { get; init; }

    /// <summary>
    /// Target function identifier.
    /// </summary>
    public required string TargetFunctionId { get; init; }

    /// <summary>
    /// Final ensemble similarity score (0.0 to 1.0).
    /// </summary>
    public required decimal EnsembleScore { get; init; }

    /// <summary>
    /// Individual signal contributions.
    /// </summary>
    public required ImmutableArray<SignalContribution> Contributions { get; init; }

    /// <summary>
    /// Whether this pair is considered a match based on threshold.
    /// </summary>
    public required bool IsMatch { get; init; }

    /// <summary>
    /// Confidence level in the match decision.
    /// </summary>
    public required ConfidenceLevel Confidence { get; init; }

    /// <summary>
    /// Reason for the match or non-match decision.
    /// </summary>
    public string? DecisionReason { get; init; }

    /// <summary>
    /// Whether exact hash match was detected.
    /// </summary>
    public bool ExactHashMatch { get; init; }

    /// <summary>
    /// Effective weights used after adaptive adjustment.
    /// </summary>
    public EffectiveWeights? AdjustedWeights { get; init; }
}

/// <summary>
/// Contribution of a single signal to the ensemble score.
/// </summary>
public sealed record SignalContribution
{
    /// <summary>
    /// Type of signal.
    /// </summary>
    public required SignalType SignalType { get; init; }

    /// <summary>
    /// Raw similarity score from this signal.
    /// </summary>
    public required decimal RawScore { get; init; }

    /// <summary>
    /// Weight applied to this signal.
    /// </summary>
    public required decimal Weight { get; init; }

    /// <summary>
    /// Weighted contribution to the ensemble score.
    /// </summary>
    public decimal WeightedScore => RawScore * Weight;

    /// <summary>
    /// Whether this signal was available for comparison.
    /// </summary>
    public required bool IsAvailable { get; init; }

    /// <summary>
    /// Quality assessment of this signal.
    /// </summary>
    public SignalQuality Quality { get; init; } = SignalQuality.Normal;
}

/// <summary>
/// Type of similarity signal.
/// </summary>
public enum SignalType
{
    /// <summary>
    /// AST-based syntactic comparison.
    /// </summary>
    Syntactic,

    /// <summary>
    /// Semantic graph comparison.
    /// </summary>
    Semantic,

    /// <summary>
    /// ML embedding cosine similarity.
    /// </summary>
    Embedding,

    /// <summary>
    /// Exact normalized code hash match.
    /// </summary>
    ExactHash
}

/// <summary>
/// Quality assessment of a signal.
/// </summary>
public enum SignalQuality
{
    /// <summary>
    /// Signal not available (data missing).
    /// </summary>
    Unavailable,

    /// <summary>
    /// Low-quality signal (small function, few nodes).
    /// </summary>
    Low,

    /// <summary>
    /// Normal-quality signal.
    /// </summary>
    Normal,

    /// <summary>
    /// High-quality signal (rich data, high confidence).
    /// </summary>
    High
}

/// <summary>
/// Confidence level in a match decision.
/// </summary>
public enum ConfidenceLevel
{
    /// <summary>
    /// Very low confidence; the decision is essentially uncertain.
    /// </summary>
    VeryLow,

    /// <summary>
    /// Low confidence; needs review.
    /// </summary>
    Low,

    /// <summary>
    /// Medium confidence; reasonable certainty.
    /// </summary>
    Medium,

    /// <summary>
    /// High confidence; strong match signals.
    /// </summary>
    High,

    /// <summary>
    /// Very high confidence; exact or near-exact match.
    /// </summary>
    VeryHigh
}

/// <summary>
/// Effective weights after adaptive adjustment.
/// </summary>
public sealed record EffectiveWeights(
    decimal Syntactic,
    decimal Semantic,
    decimal Embedding);

/// <summary>
/// Batch comparison result.
/// </summary>
public sealed record BatchComparisonResult
{
    /// <summary>
    /// All comparison results.
    /// </summary>
    public required ImmutableArray<EnsembleResult> Results { get; init; }

    /// <summary>
    /// Summary statistics.
    /// </summary>
    public required ComparisonStatistics Statistics { get; init; }

    /// <summary>
    /// Time taken for the comparison.
    /// </summary>
    public required TimeSpan Duration { get; init; }
}

/// <summary>
/// Statistics from batch comparison.
/// </summary>
public sealed record ComparisonStatistics
{
    /// <summary>
    /// Total number of comparisons performed.
    /// </summary>
    public required int TotalComparisons { get; init; }

    /// <summary>
    /// Number of matches found.
    /// </summary>
    public required int MatchCount { get; init; }

    /// <summary>
    /// Number of high-confidence matches.
    /// </summary>
    public required int HighConfidenceMatches { get; init; }

    /// <summary>
    /// Number of exact hash matches.
    /// </summary>
    public required int ExactHashMatches { get; init; }

    /// <summary>
    /// Average ensemble score across all comparisons.
    /// </summary>
    public required decimal AverageScore { get; init; }

    /// <summary>
    /// Distribution of confidence levels.
    /// </summary>
    public required ImmutableDictionary<ConfidenceLevel, int> ConfidenceDistribution { get; init; }
}

/// <summary>
/// Weight tuning result from grid search or optimization.
/// </summary>
public sealed record WeightTuningResult
{
    /// <summary>
    /// Best weights found.
    /// </summary>
    public required EffectiveWeights BestWeights { get; init; }

    /// <summary>
    /// Accuracy achieved with the best weights.
    /// </summary>
    public required decimal Accuracy { get; init; }

    /// <summary>
    /// Precision achieved with the best weights.
    /// </summary>
    public required decimal Precision { get; init; }

    /// <summary>
    /// Recall achieved with the best weights.
    /// </summary>
    public required decimal Recall { get; init; }

    /// <summary>
    /// F1 score achieved with the best weights.
    /// </summary>
    public required decimal F1Score { get; init; }

    /// <summary>
    /// All weight combinations evaluated.
    /// </summary>
    public required ImmutableArray<WeightEvaluation> Evaluations { get; init; }
}

/// <summary>
/// Evaluation of a specific weight combination.
/// </summary>
public sealed record WeightEvaluation(
    EffectiveWeights Weights,
    decimal Accuracy,
    decimal Precision,
    decimal Recall,
    decimal F1Score);

/// <summary>
/// Training pair for weight tuning.
/// </summary>
public sealed record EnsembleTrainingPair
{
    /// <summary>
    /// First function analysis.
    /// </summary>
    public required FunctionAnalysis Function1 { get; init; }

    /// <summary>
    /// Second function analysis.
    /// </summary>
    public required FunctionAnalysis Function2 { get; init; }

    /// <summary>
    /// Ground truth: are these functions equivalent?
    /// </summary>
    public required bool IsEquivalent { get; init; }

    /// <summary>
    /// Optional similarity label (for regression training).
    /// </summary>
    public decimal? SimilarityLabel { get; init; }
}
@@ -0,0 +1,26 @@
<!-- Copyright (c) StellaOps. All rights reserved. -->
<!-- Licensed under AGPL-3.0-or-later. See LICENSE in the project root. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <RootNamespace>StellaOps.BinaryIndex.Ensemble</RootNamespace>
    <Description>Ensemble decision engine combining syntactic, semantic, and ML-based function similarity signals.</Description>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\StellaOps.BinaryIndex.Decompiler\StellaOps.BinaryIndex.Decompiler.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.ML\StellaOps.BinaryIndex.ML.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
  </ItemGroup>

</Project>
@@ -0,0 +1,180 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using Microsoft.Extensions.Logging;

namespace StellaOps.BinaryIndex.Ensemble;

/// <summary>
/// Weight tuning service using grid search optimization.
/// </summary>
public sealed class WeightTuningService : IWeightTuningService
{
    private readonly IEnsembleDecisionEngine _decisionEngine;
    private readonly ILogger<WeightTuningService> _logger;

    public WeightTuningService(
        IEnsembleDecisionEngine decisionEngine,
        ILogger<WeightTuningService> logger)
    {
        _decisionEngine = decisionEngine ?? throw new ArgumentNullException(nameof(decisionEngine));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public async Task<WeightTuningResult> TuneWeightsAsync(
        IEnumerable<EnsembleTrainingPair> trainingPairs,
        decimal gridStep = 0.05m,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(trainingPairs);

        if (gridStep <= 0 || gridStep > 0.5m)
        {
            throw new ArgumentOutOfRangeException(nameof(gridStep), "Step must be between 0 and 0.5");
        }

        var pairs = trainingPairs.ToList();
        if (pairs.Count == 0)
        {
            throw new ArgumentException("At least one training pair required", nameof(trainingPairs));
        }

        _logger.LogInformation(
            "Starting weight tuning with {PairCount} pairs, step size {Step}",
            pairs.Count, gridStep);

        var evaluations = new List<WeightEvaluation>();
        WeightEvaluation? bestEvaluation = null;

        // Grid search over weight combinations
        for (var syntactic = 0m; syntactic <= 1m; syntactic += gridStep)
        {
            for (var semantic = 0m; semantic <= 1m - syntactic; semantic += gridStep)
            {
                ct.ThrowIfCancellationRequested();

                var embedding = 1m - syntactic - semantic;

                // Skip invalid weight combinations
                if (embedding < 0)
                {
                    continue;
                }

                var weights = new EffectiveWeights(syntactic, semantic, embedding);
                var evaluation = await EvaluateWeightsAsync(weights, pairs, 0.85m, ct);
                evaluations.Add(evaluation);

                if (bestEvaluation is null || evaluation.F1Score > bestEvaluation.F1Score)
                {
                    bestEvaluation = evaluation;
                    _logger.LogDebug(
                        "New best weights: Syn={Syn:P0} Sem={Sem:P0} Emb={Emb:P0} F1={F1:P2}",
                        syntactic, semantic, embedding, evaluation.F1Score);
                }
            }
        }

        if (bestEvaluation is null)
        {
            throw new InvalidOperationException("No valid weight combinations evaluated");
        }

        _logger.LogInformation(
            "Weight tuning complete. Best weights: Syn={Syn:P0} Sem={Sem:P0} Emb={Emb:P0} F1={F1:P2}",
            bestEvaluation.Weights.Syntactic,
            bestEvaluation.Weights.Semantic,
            bestEvaluation.Weights.Embedding,
            bestEvaluation.F1Score);

        return new WeightTuningResult
        {
            BestWeights = bestEvaluation.Weights,
            Accuracy = bestEvaluation.Accuracy,
            Precision = bestEvaluation.Precision,
            Recall = bestEvaluation.Recall,
            F1Score = bestEvaluation.F1Score,
            Evaluations = evaluations.ToImmutableArray()
        };
    }

    /// <inheritdoc />
    public async Task<WeightEvaluation> EvaluateWeightsAsync(
        EffectiveWeights weights,
        IEnumerable<EnsembleTrainingPair> trainingPairs,
        decimal threshold = 0.85m,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(weights);
        ArgumentNullException.ThrowIfNull(trainingPairs);

        var options = new EnsembleOptions
        {
            SyntacticWeight = weights.Syntactic,
            SemanticWeight = weights.Semantic,
            EmbeddingWeight = weights.Embedding,
            MatchThreshold = threshold,
            AdaptiveWeights = false // Use fixed weights during evaluation
        };

        var truePositives = 0;
        var falsePositives = 0;
        var trueNegatives = 0;
        var falseNegatives = 0;

        foreach (var pair in trainingPairs)
        {
            ct.ThrowIfCancellationRequested();

            var result = await _decisionEngine.CompareAsync(
                pair.Function1,
                pair.Function2,
                options,
                ct);

            if (pair.IsEquivalent)
            {
                if (result.IsMatch)
                {
                    truePositives++;
                }
                else
                {
                    falseNegatives++;
                }
            }
            else
            {
                if (result.IsMatch)
                {
                    falsePositives++;
                }
                else
                {
                    trueNegatives++;
                }
            }
        }

        var total = truePositives + falsePositives + trueNegatives + falseNegatives;
        var accuracy = total > 0
            ? (decimal)(truePositives + trueNegatives) / total
            : 0m;

        var precision = (truePositives + falsePositives) > 0
            ? (decimal)truePositives / (truePositives + falsePositives)
            : 0m;

        var recall = (truePositives + falseNegatives) > 0
            ? (decimal)truePositives / (truePositives + falseNegatives)
            : 0m;

        var f1Score = (precision + recall) > 0
            ? 2 * precision * recall / (precision + recall)
            : 0m;

        return new WeightEvaluation(weights, accuracy, precision, recall, f1Score);
    }
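
    // Worked example: TP=8, FP=2, TN=9, FN=1 gives accuracy 17/20 = 0.85,
    // precision 8/10 = 0.80, recall 8/9 ≈ 0.889, and
    // F1 = 2 * 0.80 * 0.889 / (0.80 + 0.889) ≈ 0.842.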
}
@@ -0,0 +1,97 @@
# AGENTS.md - StellaOps.BinaryIndex.Ghidra

## Module Overview

This module provides Ghidra integration for the BinaryIndex semantic diffing stack. It serves as a fallback/enhancement layer when B2R2 provides insufficient coverage or accuracy.

## Roles Expected

- **Backend Engineer**: Implement Ghidra Headless wrapper, ghidriff bridge, Version Tracking service, BSim integration
- **QA Engineer**: Unit tests for all services, integration tests for Ghidra availability scenarios

## Required Documentation

Before working on this module, read:

- `docs/modules/binary-index/architecture.md`
- `docs/implplan/SPRINT_20260105_001_003_BINDEX_semdiff_ghidra.md`
- Ghidra documentation: https://ghidra.re/ghidra_docs/
- ghidriff repository: https://github.com/clearbluejar/ghidriff

## Module-Specific Constraints

### Process Management
- Ghidra runs as an external Java process; manage its lifecycle carefully
- Use `SemaphoreSlim` for concurrent access control (one analysis at a time per instance); see the sketch below
- Always clean up temporary project directories
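
A minimal sketch of the intended guard (class and member names are illustrative, not the module's actual API):

```csharp
public sealed class GhidraHeadlessManager
{
    // One analysis at a time per Ghidra instance.
    private readonly SemaphoreSlim _gate = new(1, 1);

    public async Task AnalyzeAsync(string binaryPath, CancellationToken ct)
    {
        await _gate.WaitAsync(ct);
        var projectDir = Directory.CreateTempSubdirectory("ghidra-proj").FullName;
        try
        {
            // Launch the analyzeHeadless process against projectDir here.
        }
        finally
        {
            Directory.Delete(projectDir, recursive: true); // always clean up
            _gate.Release();
        }
    }
}
```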

### External Dependencies
- **Ghidra 11.x**: Set via `GhidraOptions.GhidraHome`
- **Java 17+**: Set via `GhidraOptions.JavaHome`
- **Python 3.10+**: Required for ghidriff
- **ghidriff**: Installed via pip

### Determinism Rules
- Use `CultureInfo.InvariantCulture` for all parsing/formatting
- Inject `TimeProvider` for timestamps (see the sketch after this list)
- Inject `IGuidGenerator` for any ID generation
- Results must be reproducible given same inputs
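
For example, a timestamp that stays deterministic across locales under an injected clock (the recorder class itself is hypothetical):

```csharp
using System.Globalization;

public sealed class GhidraRunRecorder
{
    private readonly TimeProvider _time;

    public GhidraRunRecorder(TimeProvider time) => _time = time;

    // InvariantCulture plus the round-trip "O" format keeps output identical
    // across machines; swapping in a fake TimeProvider makes tests reproducible.
    public string StampRun(string binarySha256) =>
        string.Create(CultureInfo.InvariantCulture, $"{binarySha256}:{_time.GetUtcNow():O}");
}
```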

### Error Handling
- Ghidra unavailability must not crash the service; degrade gracefully
- Log all external process failures with stderr content
- Wrap external exceptions in `GhidraException` or `GhidriffException`

## Key Interfaces

| Interface | Purpose |
|-----------|---------|
| `IGhidraService` | Main analysis service (headless wrapper) |
| `IVersionTrackingService` | Version Tracking with multiple correlators |
| `IBSimService` | BSim signature generation and querying |
| `IGhidriffBridge` | Python ghidriff interop |

## Directory Structure

```
StellaOps.BinaryIndex.Ghidra/
  Abstractions/
    IGhidraService.cs
    IVersionTrackingService.cs
    IBSimService.cs
    IGhidriffBridge.cs
  Models/
    GhidraModels.cs
    VersionTrackingModels.cs
    BSimModels.cs
    GhidriffModels.cs
  Services/
    GhidraHeadlessManager.cs
    GhidraService.cs
    VersionTrackingService.cs
    BSimService.cs
    GhidriffBridge.cs
  Options/
    GhidraOptions.cs
    BSimOptions.cs
    GhidriffOptions.cs
  Exceptions/
    GhidraException.cs
    GhidriffException.cs
  Extensions/
    GhidraServiceCollectionExtensions.cs
```

## Testing Strategy

- Unit tests mock external process execution
- Integration tests require a Ghidra installation (skip if unavailable; see the example below)
- Use `[Trait("Category", "Integration")]` for tests requiring Ghidra
- Fallback scenarios tested in isolation
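
A sketch of the skip-if-unavailable pattern (xUnit; `CreateService()` is a hypothetical test factory):

```csharp
[Trait("Category", "Integration")]
public sealed class GhidraServiceIntegrationTests
{
    [Fact]
    public async Task Analyze_ProducesResult_WhenGhidraInstalled()
    {
        IGhidraService ghidra = CreateService(); // hypothetical factory
        if (!await ghidra.IsAvailableAsync())
        {
            return; // Ghidra not installed - treat as skipped
        }

        var result = await ghidra.AnalyzeAsync("testdata/hello.bin");
        Assert.NotNull(result);
    }
}
```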

## Working Agreements

1. All public APIs must have XML documentation
2. Follow the pattern from `StellaOps.BinaryIndex.Disassembly`
3. Expose services via `AddGhidra()` extension method
4. Configuration via the `IOptions<GhidraOptions>` pattern
@@ -0,0 +1,168 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Service for Ghidra BSim (Binary Similarity) operations.
/// BSim provides behavioral similarity matching based on P-Code semantics.
/// </summary>
public interface IBSimService
{
    /// <summary>
    /// Generate BSim signatures for functions from an analyzed binary.
    /// </summary>
    /// <param name="analysis">Ghidra analysis result.</param>
    /// <param name="options">Signature generation options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>BSim signatures for each function.</returns>
    Task<ImmutableArray<BSimSignature>> GenerateSignaturesAsync(
        GhidraAnalysisResult analysis,
        BSimGenerationOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Query the BSim database for similar functions.
    /// </summary>
    /// <param name="signature">The signature to search for.</param>
    /// <param name="options">Query options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions from the database.</returns>
    Task<ImmutableArray<BSimMatch>> QueryAsync(
        BSimSignature signature,
        BSimQueryOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Query the BSim database for multiple signatures in batch.
    /// </summary>
    /// <param name="signatures">The signatures to search for.</param>
    /// <param name="options">Query options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions for each query signature.</returns>
    Task<ImmutableArray<BSimQueryResult>> QueryBatchAsync(
        ImmutableArray<BSimSignature> signatures,
        BSimQueryOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Ingest functions into the BSim database.
    /// </summary>
    /// <param name="libraryName">Name of the library being ingested.</param>
    /// <param name="version">Version of the library.</param>
    /// <param name="signatures">Signatures to ingest.</param>
    /// <param name="ct">Cancellation token.</param>
    Task IngestAsync(
        string libraryName,
        string version,
        ImmutableArray<BSimSignature> signatures,
        CancellationToken ct = default);

    /// <summary>
    /// Check whether the BSim database is available and healthy.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if the BSim database is accessible.</returns>
    Task<bool> IsAvailableAsync(CancellationToken ct = default);
}
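
// Typical flow (illustrative only; `analysis` comes from IGhidraService.AnalyzeAsync):
//
//   var signatures = await bsim.GenerateSignaturesAsync(analysis);
//   var results = await bsim.QueryBatchAsync(
//       signatures, new BSimQueryOptions { MinSimilarity = 0.8 });
//   foreach (var r in results)
//   {
//       foreach (var m in r.Matches)
//       {
//           Console.WriteLine(
//               $"{r.QuerySignature.FunctionName} -> {m.MatchedLibrary}@{m.MatchedVersion} ({m.Similarity:F2})");
//       }
//   }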

/// <summary>
/// Options for BSim signature generation.
/// </summary>
public sealed record BSimGenerationOptions
{
    /// <summary>
    /// Minimum function size (in instructions) to generate signatures for.
    /// Very small functions produce low-confidence matches.
    /// </summary>
    public int MinFunctionSize { get; init; } = 5;

    /// <summary>
    /// Whether to include thunk/stub functions.
    /// </summary>
    public bool IncludeThunks { get; init; } = false;

    /// <summary>
    /// Whether to include imported library functions.
    /// </summary>
    public bool IncludeImports { get; init; } = false;
}

/// <summary>
/// Options for BSim database queries.
/// </summary>
public sealed record BSimQueryOptions
{
    /// <summary>
    /// Minimum similarity score (0.0-1.0) for matches.
    /// </summary>
    public double MinSimilarity { get; init; } = 0.7;

    /// <summary>
    /// Minimum significance score for matches.
    /// Significance measures how distinctive a function is.
    /// </summary>
    public double MinSignificance { get; init; } = 0.0;

    /// <summary>
    /// Maximum number of results per query.
    /// </summary>
    public int MaxResults { get; init; } = 10;

    /// <summary>
    /// Limit search to specific libraries (empty = all libraries).
    /// </summary>
    public ImmutableArray<string> TargetLibraries { get; init; } = [];

    /// <summary>
    /// Limit search to specific library versions.
    /// </summary>
    public ImmutableArray<string> TargetVersions { get; init; } = [];
}

/// <summary>
/// A BSim function signature.
/// </summary>
/// <param name="FunctionName">Original function name.</param>
/// <param name="Address">Function address in the binary.</param>
/// <param name="FeatureVector">BSim feature vector bytes.</param>
/// <param name="VectorLength">Number of features in the vector.</param>
/// <param name="SelfSignificance">How distinctive this function is (higher = more unique).</param>
/// <param name="InstructionCount">Number of P-Code instructions.</param>
public sealed record BSimSignature(
    string FunctionName,
    ulong Address,
    byte[] FeatureVector,
    int VectorLength,
    double SelfSignificance,
    int InstructionCount);

/// <summary>
/// A BSim match result.
/// </summary>
/// <param name="MatchedLibrary">Library containing the matched function.</param>
/// <param name="MatchedVersion">Version of the library.</param>
/// <param name="MatchedFunction">Name of the matched function.</param>
/// <param name="MatchedAddress">Address of the matched function.</param>
/// <param name="Similarity">Similarity score (0.0-1.0).</param>
/// <param name="Significance">Significance of the match.</param>
/// <param name="Confidence">Combined confidence score.</param>
public sealed record BSimMatch(
    string MatchedLibrary,
    string MatchedVersion,
    string MatchedFunction,
    ulong MatchedAddress,
    double Similarity,
    double Significance,
    double Confidence);

/// <summary>
/// Result of a batch BSim query for a single signature.
/// </summary>
/// <param name="QuerySignature">The signature that was queried.</param>
/// <param name="Matches">Matching functions found.</param>
public sealed record BSimQueryResult(
    BSimSignature QuerySignature,
    ImmutableArray<BSimMatch> Matches);
@@ -0,0 +1,144 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Main Ghidra analysis service interface.
/// Provides access to Ghidra headless analysis capabilities.
/// </summary>
public interface IGhidraService
{
    /// <summary>
    /// Analyze a binary using Ghidra headless.
    /// </summary>
    /// <param name="binaryStream">The binary stream to analyze.</param>
    /// <param name="options">Optional analysis configuration.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Analysis results including functions, imports, exports, and metadata.</returns>
    Task<GhidraAnalysisResult> AnalyzeAsync(
        Stream binaryStream,
        GhidraAnalysisOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Analyze a binary from a file path using Ghidra headless.
    /// </summary>
    /// <param name="binaryPath">Absolute path to the binary file.</param>
    /// <param name="options">Optional analysis configuration.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Analysis results including functions, imports, exports, and metadata.</returns>
    Task<GhidraAnalysisResult> AnalyzeAsync(
        string binaryPath,
        GhidraAnalysisOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Check if the Ghidra backend is available and healthy.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if Ghidra is available, false otherwise.</returns>
    Task<bool> IsAvailableAsync(CancellationToken ct = default);

    /// <summary>
    /// Gets information about the Ghidra installation.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Ghidra version and capability information.</returns>
    Task<GhidraInfo> GetInfoAsync(CancellationToken ct = default);
}

/// <summary>
/// Options for Ghidra analysis.
/// </summary>
public sealed record GhidraAnalysisOptions
{
    /// <summary>
    /// Whether to run full auto-analysis (slower but more complete).
    /// </summary>
    public bool RunFullAnalysis { get; init; } = true;

    /// <summary>
    /// Whether to include decompiled code in function results.
    /// </summary>
    public bool IncludeDecompilation { get; init; } = false;

    /// <summary>
    /// Whether to generate P-Code hashes for functions.
    /// </summary>
    public bool GeneratePCodeHashes { get; init; } = true;

    /// <summary>
    /// Whether to extract string literals.
    /// </summary>
    public bool ExtractStrings { get; init; } = true;

    /// <summary>
    /// Whether to extract functions.
    /// </summary>
    public bool ExtractFunctions { get; init; } = true;

    /// <summary>
    /// Whether to extract decompilation (alias for IncludeDecompilation).
    /// </summary>
    public bool ExtractDecompilation { get; init; } = false;

    /// <summary>
    /// Maximum analysis time in seconds (0 = unlimited).
    /// </summary>
    public int TimeoutSeconds { get; init; } = 300;

    /// <summary>
    /// Specific scripts to run during analysis.
    /// </summary>
    public ImmutableArray<string> Scripts { get; init; } = [];

    /// <summary>
    /// Architecture hint for raw binaries.
    /// </summary>
    public string? ArchitectureHint { get; init; }

    /// <summary>
    /// Processor language hint for Ghidra (e.g., "x86:LE:64:default").
    /// </summary>
    public string? ProcessorHint { get; init; }

    /// <summary>
    /// Base address override for raw binaries.
    /// </summary>
    public ulong? BaseAddress { get; init; }
}

/// <summary>
/// Result of Ghidra analysis.
/// </summary>
/// <param name="BinaryHash">SHA256 hash of the analyzed binary.</param>
/// <param name="Functions">Discovered functions.</param>
/// <param name="Imports">Import symbols.</param>
/// <param name="Exports">Export symbols.</param>
/// <param name="Strings">Discovered string literals.</param>
/// <param name="MemoryBlocks">Memory blocks/sections in the binary.</param>
/// <param name="Metadata">Analysis metadata.</param>
public sealed record GhidraAnalysisResult(
    string BinaryHash,
    ImmutableArray<GhidraFunction> Functions,
    ImmutableArray<GhidraImport> Imports,
    ImmutableArray<GhidraExport> Exports,
    ImmutableArray<GhidraString> Strings,
    ImmutableArray<GhidraMemoryBlock> MemoryBlocks,
    GhidraMetadata Metadata);

/// <summary>
/// Information about the Ghidra installation.
/// </summary>
/// <param name="Version">Ghidra version string (e.g., "11.2").</param>
/// <param name="JavaVersion">Java runtime version.</param>
/// <param name="AvailableProcessors">Available processor languages.</param>
/// <param name="InstallPath">Ghidra installation path.</param>
public sealed record GhidraInfo(
    string Version,
    string JavaVersion,
    ImmutableArray<string> AvailableProcessors,
    string InstallPath);
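// Illustrative usage sketch: driving IGhidraService for a raw firmware blob.
// The processor hint and base address are hypothetical values for an ARM image;
// the `ghidra` instance would come from DI (see the registration file below).
internal static class GhidraAnalysisExample
{
    public static async Task<GhidraAnalysisResult> AnalyzeFirmwareAsync(
        IGhidraService ghidra, string path, CancellationToken ct)
    {
        if (!await ghidra.IsAvailableAsync(ct))
        {
            throw new GhidraUnavailableException();
        }

        var options = new GhidraAnalysisOptions
        {
            IncludeDecompilation = false,    // keep analysis fast
            GeneratePCodeHashes = true,      // needed later for BSim signatures
            ProcessorHint = "ARM:LE:32:v7",  // hypothetical raw-binary hint
            BaseAddress = 0x08000000,        // hypothetical load address
        };

        return await ghidra.AnalyzeAsync(path, options, ct);
    }
}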
@@ -0,0 +1,207 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Bridge interface for ghidriff Python tool integration.
/// ghidriff provides automated binary diff reports using Ghidra.
/// </summary>
public interface IGhidriffBridge
{
    /// <summary>
    /// Run ghidriff to compare two binaries.
    /// </summary>
    /// <param name="oldBinaryPath">Path to the older binary version.</param>
    /// <param name="newBinaryPath">Path to the newer binary version.</param>
    /// <param name="options">ghidriff configuration options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Diff result with added, removed, and modified functions.</returns>
    Task<GhidriffResult> DiffAsync(
        string oldBinaryPath,
        string newBinaryPath,
        GhidriffDiffOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Run ghidriff to compare two binaries from streams.
    /// </summary>
    /// <param name="oldBinary">Stream of the older binary version.</param>
    /// <param name="newBinary">Stream of the newer binary version.</param>
    /// <param name="options">ghidriff configuration options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Diff result with added, removed, and modified functions.</returns>
    Task<GhidriffResult> DiffAsync(
        Stream oldBinary,
        Stream newBinary,
        GhidriffDiffOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate a formatted report from ghidriff results.
    /// </summary>
    /// <param name="result">The diff result to format.</param>
    /// <param name="format">Output format.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Formatted report string.</returns>
    Task<string> GenerateReportAsync(
        GhidriffResult result,
        GhidriffReportFormat format,
        CancellationToken ct = default);

    /// <summary>
    /// Check if ghidriff is available (Python + ghidriff installed).
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if ghidriff is available.</returns>
    Task<bool> IsAvailableAsync(CancellationToken ct = default);

    /// <summary>
    /// Get ghidriff version information.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version string.</returns>
    Task<string> GetVersionAsync(CancellationToken ct = default);
}

/// <summary>
/// Options for ghidriff diff operation.
/// </summary>
public sealed record GhidriffDiffOptions
{
    /// <summary>
    /// Path to Ghidra installation (auto-detected if not set).
    /// </summary>
    public string? GhidraPath { get; init; }

    /// <summary>
    /// Path for Ghidra project files (temp dir if not set).
    /// </summary>
    public string? ProjectPath { get; init; }

    /// <summary>
    /// Whether to include decompiled code in results.
    /// </summary>
    public bool IncludeDecompilation { get; init; } = true;

    /// <summary>
    /// Whether to include disassembly listing in results.
    /// </summary>
    public bool IncludeDisassembly { get; init; } = true;

    /// <summary>
    /// Functions to exclude from comparison (by name pattern).
    /// </summary>
    public ImmutableArray<string> ExcludeFunctions { get; init; } = [];

    /// <summary>
    /// Maximum number of concurrent Ghidra instances.
    /// </summary>
    public int MaxParallelism { get; init; } = 1;

    /// <summary>
    /// Maximum analysis time in seconds.
    /// </summary>
    public int TimeoutSeconds { get; init; } = 600;
}

/// <summary>
/// Result of a ghidriff comparison.
/// </summary>
/// <param name="OldBinaryHash">SHA256 hash of the old binary.</param>
/// <param name="NewBinaryHash">SHA256 hash of the new binary.</param>
/// <param name="OldBinaryName">Name/path of the old binary.</param>
/// <param name="NewBinaryName">Name/path of the new binary.</param>
/// <param name="AddedFunctions">Functions added in new binary.</param>
/// <param name="RemovedFunctions">Functions removed from old binary.</param>
/// <param name="ModifiedFunctions">Functions modified between versions.</param>
/// <param name="Statistics">Comparison statistics.</param>
/// <param name="RawJsonOutput">Raw JSON output from ghidriff.</param>
public sealed record GhidriffResult(
    string OldBinaryHash,
    string NewBinaryHash,
    string OldBinaryName,
    string NewBinaryName,
    ImmutableArray<GhidriffFunction> AddedFunctions,
    ImmutableArray<GhidriffFunction> RemovedFunctions,
    ImmutableArray<GhidriffDiff> ModifiedFunctions,
    GhidriffStats Statistics,
    string RawJsonOutput);

/// <summary>
/// A function from ghidriff output.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature.</param>
/// <param name="DecompiledCode">Decompiled C code (if requested).</param>
public sealed record GhidriffFunction(
    string Name,
    ulong Address,
    int Size,
    string? Signature,
    string? DecompiledCode);

/// <summary>
/// A function diff from ghidriff output.
/// </summary>
/// <param name="FunctionName">Function name.</param>
/// <param name="OldAddress">Address in old binary.</param>
/// <param name="NewAddress">Address in new binary.</param>
/// <param name="OldSize">Size in old binary.</param>
/// <param name="NewSize">Size in new binary.</param>
/// <param name="OldSignature">Signature in old binary.</param>
/// <param name="NewSignature">Signature in new binary.</param>
/// <param name="Similarity">Similarity score.</param>
/// <param name="OldDecompiled">Decompiled code from old binary.</param>
/// <param name="NewDecompiled">Decompiled code from new binary.</param>
/// <param name="InstructionChanges">List of instruction-level changes.</param>
public sealed record GhidriffDiff(
    string FunctionName,
    ulong OldAddress,
    ulong NewAddress,
    int OldSize,
    int NewSize,
    string? OldSignature,
    string? NewSignature,
    decimal Similarity,
    string? OldDecompiled,
    string? NewDecompiled,
    ImmutableArray<string> InstructionChanges);

/// <summary>
/// Statistics from ghidriff comparison.
/// </summary>
/// <param name="TotalOldFunctions">Total functions in old binary.</param>
/// <param name="TotalNewFunctions">Total functions in new binary.</param>
/// <param name="AddedCount">Number of added functions.</param>
/// <param name="RemovedCount">Number of removed functions.</param>
/// <param name="ModifiedCount">Number of modified functions.</param>
/// <param name="UnchangedCount">Number of unchanged functions.</param>
/// <param name="AnalysisDuration">Time taken for analysis.</param>
public sealed record GhidriffStats(
    int TotalOldFunctions,
    int TotalNewFunctions,
    int AddedCount,
    int RemovedCount,
    int ModifiedCount,
    int UnchangedCount,
    TimeSpan AnalysisDuration);

/// <summary>
/// Report output format for ghidriff.
/// </summary>
public enum GhidriffReportFormat
{
    /// <summary>JSON format.</summary>
    Json,

    /// <summary>Markdown format.</summary>
    Markdown,

    /// <summary>HTML format.</summary>
    Html
}
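// Illustrative usage sketch: diffing two library builds and rendering a Markdown
// report. The paths and the "FUN_*" exclusion pattern are hypothetical; `bridge`
// would come from DI.
internal static class GhidriffExample
{
    public static async Task<string> DiffAndReportAsync(
        IGhidriffBridge bridge, CancellationToken ct)
    {
        var options = new GhidriffDiffOptions
        {
            IncludeDecompilation = true,
            ExcludeFunctions = ["FUN_*"],   // skip auto-named functions (hypothetical pattern)
            TimeoutSeconds = 900,
        };

        var result = await bridge.DiffAsync(
            "/artifacts/libfoo-1.0.so",
            "/artifacts/libfoo-1.1.so",
            options,
            ct);

        // A modified function with low similarity is a strong candidate for a patched fix.
        foreach (var diff in result.ModifiedFunctions.Where(d => d.Similarity < 0.8m))
        {
            Console.WriteLine($"heavily modified: {diff.FunctionName} ({diff.Similarity:F2})");
        }

        return await bridge.GenerateReportAsync(result, GhidriffReportFormat.Markdown, ct);
    }
}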
@@ -0,0 +1,255 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Service for running Ghidra Version Tracking between two binaries.
/// Version Tracking correlates functions between two versions of a binary
/// using multiple correlator algorithms.
/// </summary>
public interface IVersionTrackingService
{
    /// <summary>
    /// Run Ghidra Version Tracking with multiple correlators.
    /// </summary>
    /// <param name="oldBinary">Stream of the older binary version.</param>
    /// <param name="newBinary">Stream of the newer binary version.</param>
    /// <param name="options">Version tracking configuration.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version tracking results with matched, added, removed, and modified functions.</returns>
    Task<VersionTrackingResult> TrackVersionsAsync(
        Stream oldBinary,
        Stream newBinary,
        VersionTrackingOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Run Ghidra Version Tracking using file paths.
    /// </summary>
    /// <param name="oldBinaryPath">Path to the older binary version.</param>
    /// <param name="newBinaryPath">Path to the newer binary version.</param>
    /// <param name="options">Version tracking configuration.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version tracking results with matched, added, removed, and modified functions.</returns>
    Task<VersionTrackingResult> TrackVersionsAsync(
        string oldBinaryPath,
        string newBinaryPath,
        VersionTrackingOptions? options = null,
        CancellationToken ct = default);
}

/// <summary>
/// Options for Version Tracking analysis.
/// </summary>
public sealed record VersionTrackingOptions
{
    /// <summary>
    /// Correlators to use for function matching, in priority order.
    /// </summary>
    public ImmutableArray<CorrelatorType> Correlators { get; init; } =
        [CorrelatorType.ExactBytes, CorrelatorType.ExactMnemonics,
         CorrelatorType.SymbolName, CorrelatorType.DataReference,
         CorrelatorType.CombinedReference];

    /// <summary>
    /// Minimum similarity score (0.0-1.0) to consider a match.
    /// </summary>
    public decimal MinSimilarity { get; init; } = 0.5m;

    /// <summary>
    /// Whether to include decompiled code in results.
    /// </summary>
    public bool IncludeDecompilation { get; init; } = false;

    /// <summary>
    /// Whether to compute detailed instruction-level differences.
    /// </summary>
    public bool ComputeDetailedDiffs { get; init; } = true;

    /// <summary>
    /// Maximum analysis time in seconds.
    /// </summary>
    public int TimeoutSeconds { get; init; } = 600;
}

/// <summary>
/// Type of correlator algorithm used for function matching.
/// </summary>
public enum CorrelatorType
{
    /// <summary>Matches functions with identical byte sequences.</summary>
    ExactBytes,

    /// <summary>Matches functions with identical instruction mnemonics (ignoring operands).</summary>
    ExactMnemonics,

    /// <summary>Matches functions by symbol name.</summary>
    SymbolName,

    /// <summary>Matches functions with similar data references.</summary>
    DataReference,

    /// <summary>Matches functions with similar call references.</summary>
    CallReference,

    /// <summary>Combined reference scoring algorithm.</summary>
    CombinedReference,

    /// <summary>BSim behavioral similarity matching.</summary>
    BSim
}

/// <summary>
/// Result of Version Tracking analysis.
/// </summary>
/// <param name="Matches">Functions matched between versions.</param>
/// <param name="AddedFunctions">Functions added in the new version.</param>
/// <param name="RemovedFunctions">Functions removed from the old version.</param>
/// <param name="ModifiedFunctions">Functions modified between versions.</param>
/// <param name="Statistics">Analysis statistics.</param>
public sealed record VersionTrackingResult(
    ImmutableArray<FunctionMatch> Matches,
    ImmutableArray<FunctionAdded> AddedFunctions,
    ImmutableArray<FunctionRemoved> RemovedFunctions,
    ImmutableArray<FunctionModified> ModifiedFunctions,
    VersionTrackingStats Statistics);

/// <summary>
/// Statistics from Version Tracking analysis.
/// </summary>
/// <param name="TotalOldFunctions">Total functions in old binary.</param>
/// <param name="TotalNewFunctions">Total functions in new binary.</param>
/// <param name="MatchedCount">Number of matched functions.</param>
/// <param name="AddedCount">Number of added functions.</param>
/// <param name="RemovedCount">Number of removed functions.</param>
/// <param name="ModifiedCount">Number of modified functions (subset of matched).</param>
/// <param name="AnalysisDuration">Time taken for analysis.</param>
public sealed record VersionTrackingStats(
    int TotalOldFunctions,
    int TotalNewFunctions,
    int MatchedCount,
    int AddedCount,
    int RemovedCount,
    int ModifiedCount,
    TimeSpan AnalysisDuration);

/// <summary>
/// A matched function between two binary versions.
/// </summary>
/// <param name="OldName">Function name in old binary.</param>
/// <param name="OldAddress">Function address in old binary.</param>
/// <param name="NewName">Function name in new binary.</param>
/// <param name="NewAddress">Function address in new binary.</param>
/// <param name="Similarity">Similarity score (0.0-1.0).</param>
/// <param name="MatchedBy">Correlator that produced the match.</param>
/// <param name="Differences">Detected differences if any.</param>
public sealed record FunctionMatch(
    string OldName,
    ulong OldAddress,
    string NewName,
    ulong NewAddress,
    decimal Similarity,
    CorrelatorType MatchedBy,
    ImmutableArray<MatchDifference> Differences);

/// <summary>
/// A function added in the new binary version.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature if available.</param>
public sealed record FunctionAdded(
    string Name,
    ulong Address,
    int Size,
    string? Signature);

/// <summary>
/// A function removed from the old binary version.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature if available.</param>
public sealed record FunctionRemoved(
    string Name,
    ulong Address,
    int Size,
    string? Signature);

/// <summary>
/// A function modified between versions (with detailed differences).
/// </summary>
/// <param name="OldName">Function name in old binary.</param>
/// <param name="OldAddress">Function address in old binary.</param>
/// <param name="OldSize">Function size in old binary.</param>
/// <param name="NewName">Function name in new binary.</param>
/// <param name="NewAddress">Function address in new binary.</param>
/// <param name="NewSize">Function size in new binary.</param>
/// <param name="Similarity">Similarity score.</param>
/// <param name="Differences">List of specific differences.</param>
/// <param name="OldDecompiled">Decompiled code from old binary (if requested).</param>
/// <param name="NewDecompiled">Decompiled code from new binary (if requested).</param>
public sealed record FunctionModified(
    string OldName,
    ulong OldAddress,
    int OldSize,
    string NewName,
    ulong NewAddress,
    int NewSize,
    decimal Similarity,
    ImmutableArray<MatchDifference> Differences,
    string? OldDecompiled,
    string? NewDecompiled);

/// <summary>
/// A specific difference between matched functions.
/// </summary>
/// <param name="Type">Type of difference.</param>
/// <param name="Description">Human-readable description.</param>
/// <param name="OldValue">Value in old binary (if applicable).</param>
/// <param name="NewValue">Value in new binary (if applicable).</param>
/// <param name="Address">Address where difference occurs (if applicable).</param>
public sealed record MatchDifference(
    DifferenceType Type,
    string Description,
    string? OldValue,
    string? NewValue,
    ulong? Address = null);

/// <summary>
/// Type of difference detected between functions.
/// </summary>
public enum DifferenceType
{
    /// <summary>Instruction added.</summary>
    InstructionAdded,

    /// <summary>Instruction removed.</summary>
    InstructionRemoved,

    /// <summary>Instruction changed.</summary>
    InstructionChanged,

    /// <summary>Branch target changed.</summary>
    BranchTargetChanged,

    /// <summary>Call target changed.</summary>
    CallTargetChanged,

    /// <summary>Constant value changed.</summary>
    ConstantChanged,

    /// <summary>Function size changed.</summary>
    SizeChanged,

    /// <summary>Stack frame layout changed.</summary>
    StackFrameChanged,

    /// <summary>Register usage changed.</summary>
    RegisterUsageChanged
}
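// Illustrative usage sketch: correlating functions across two builds and surfacing
// changed call targets. The correlator ordering and threshold are hypothetical
// choices; cheap exact correlators run first, BSim last as the expensive fallback.
internal static class VersionTrackingExample
{
    public static async Task<VersionTrackingResult> TrackAsync(
        IVersionTrackingService tracker, string oldPath, string newPath, CancellationToken ct)
    {
        var options = new VersionTrackingOptions
        {
            Correlators = [CorrelatorType.ExactBytes, CorrelatorType.SymbolName, CorrelatorType.BSim],
            MinSimilarity = 0.6m,
            ComputeDetailedDiffs = true,
        };

        var result = await tracker.TrackVersionsAsync(oldPath, newPath, options, ct);

        // Changed call targets often indicate a rerouted code path (e.g., a patched helper).
        foreach (var fn in result.ModifiedFunctions)
        {
            foreach (var diff in fn.Differences.Where(d => d.Type == DifferenceType.CallTargetChanged))
            {
                Console.WriteLine($"{fn.NewName}: {diff.Description}");
            }
        }

        return result;
    }
}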
@@ -0,0 +1,245 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Exception thrown when Ghidra operations fail.
/// </summary>
public class GhidraException : Exception
{
    /// <summary>
    /// Creates a new GhidraException.
    /// </summary>
    public GhidraException()
    {
    }

    /// <summary>
    /// Creates a new GhidraException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidraException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates a new GhidraException with a message and inner exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Inner exception.</param>
    public GhidraException(string message, Exception innerException) : base(message, innerException)
    {
    }

    /// <summary>
    /// Exit code from Ghidra process if available.
    /// </summary>
    public int? ExitCode { get; init; }

    /// <summary>
    /// Standard error output from Ghidra process if available.
    /// </summary>
    public string? StandardError { get; init; }

    /// <summary>
    /// Standard output from Ghidra process if available.
    /// </summary>
    public string? StandardOutput { get; init; }
}

/// <summary>
/// Exception thrown when Ghidra is not available or not properly configured.
/// </summary>
public class GhidraUnavailableException : GhidraException
{
    /// <summary>
    /// Creates a new GhidraUnavailableException.
    /// </summary>
    public GhidraUnavailableException() : base("Ghidra is not available or not properly configured")
    {
    }

    /// <summary>
    /// Creates a new GhidraUnavailableException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidraUnavailableException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates a new GhidraUnavailableException with a message and inner exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Inner exception.</param>
    public GhidraUnavailableException(string message, Exception innerException) : base(message, innerException)
    {
    }
}

/// <summary>
/// Exception thrown when Ghidra analysis times out.
/// </summary>
public class GhidraTimeoutException : GhidraException
{
    /// <summary>
    /// Creates a new GhidraTimeoutException.
    /// </summary>
    /// <param name="timeoutSeconds">The timeout that was exceeded.</param>
    public GhidraTimeoutException(int timeoutSeconds)
        : base($"Ghidra analysis timed out after {timeoutSeconds} seconds")
    {
        TimeoutSeconds = timeoutSeconds;
    }

    /// <summary>
    /// Creates a new GhidraTimeoutException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="timeoutSeconds">The timeout that was exceeded.</param>
    public GhidraTimeoutException(string message, int timeoutSeconds) : base(message)
    {
        TimeoutSeconds = timeoutSeconds;
    }

    /// <summary>
    /// The timeout value that was exceeded.
    /// </summary>
    public int TimeoutSeconds { get; }
}

/// <summary>
/// Exception thrown when ghidriff operations fail.
/// </summary>
public class GhidriffException : Exception
{
    /// <summary>
    /// Creates a new GhidriffException.
    /// </summary>
    public GhidriffException()
    {
    }

    /// <summary>
    /// Creates a new GhidriffException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidriffException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates a new GhidriffException with a message and inner exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Inner exception.</param>
    public GhidriffException(string message, Exception innerException) : base(message, innerException)
    {
    }

    /// <summary>
    /// Exit code from Python process if available.
    /// </summary>
    public int? ExitCode { get; init; }

    /// <summary>
    /// Standard error output from Python process if available.
    /// </summary>
    public string? StandardError { get; init; }

    /// <summary>
    /// Standard output from Python process if available.
    /// </summary>
    public string? StandardOutput { get; init; }
}

/// <summary>
/// Exception thrown when ghidriff is not available.
/// </summary>
public class GhidriffUnavailableException : GhidriffException
{
    /// <summary>
    /// Creates a new GhidriffUnavailableException.
    /// </summary>
    public GhidriffUnavailableException() : base("ghidriff is not available. Ensure Python and ghidriff are installed.")
    {
    }

    /// <summary>
    /// Creates a new GhidriffUnavailableException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidriffUnavailableException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates a new GhidriffUnavailableException with a message and inner exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Inner exception.</param>
    public GhidriffUnavailableException(string message, Exception innerException) : base(message, innerException)
    {
    }
}

/// <summary>
/// Exception thrown when BSim operations fail.
/// </summary>
public class BSimException : Exception
{
    /// <summary>
    /// Creates a new BSimException.
    /// </summary>
    public BSimException()
    {
    }

    /// <summary>
    /// Creates a new BSimException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public BSimException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates a new BSimException with a message and inner exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Inner exception.</param>
    public BSimException(string message, Exception innerException) : base(message, innerException)
    {
    }
}

/// <summary>
/// Exception thrown when BSim database is not available.
/// </summary>
public class BSimUnavailableException : BSimException
{
    /// <summary>
    /// Creates a new BSimUnavailableException.
    /// </summary>
    public BSimUnavailableException() : base("BSim database is not available or not configured")
    {
    }

    /// <summary>
    /// Creates a new BSimUnavailableException with a message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public BSimUnavailableException(string message) : base(message)
    {
    }

    /// <summary>
    /// Creates a new BSimUnavailableException with a message and inner exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Inner exception.</param>
    public BSimUnavailableException(string message, Exception innerException) : base(message, innerException)
    {
    }
}
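// Illustrative sketch: how callers are expected to consume the exception hierarchy
// above. Catch order matters: the derived timeout/unavailable types first, then the
// base GhidraException as the catch-all for process failures.
internal static class GhidraErrorHandlingExample
{
    public static async Task<GhidraAnalysisResult?> TryAnalyzeAsync(
        IGhidraService ghidra, string path, CancellationToken ct)
    {
        try
        {
            return await ghidra.AnalyzeAsync(path, ct: ct);
        }
        catch (GhidraTimeoutException ex)
        {
            // Transient: retry with a longer budget, or queue for offline analysis.
            Console.Error.WriteLine($"timed out after {ex.TimeoutSeconds}s");
            return null;
        }
        catch (GhidraUnavailableException)
        {
            // Deployment problem: surface it, do not retry.
            throw;
        }
        catch (GhidraException ex)
        {
            // Process failure; ExitCode/StandardError carry the Ghidra log tail.
            Console.Error.WriteLine($"ghidra failed (exit {ex.ExitCode}): {ex.StandardError}");
            return null;
        }
    }
}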
@@ -0,0 +1,114 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using StellaOps.BinaryIndex.Disassembly;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Extension methods for registering Ghidra services.
/// </summary>
public static class GhidraServiceCollectionExtensions
{
    /// <summary>
    /// Adds Ghidra integration services to the service collection.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configuration">The configuration section for Ghidra.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddGhidra(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        // Bind options
        services.AddOptions<GhidraOptions>()
            .Bind(configuration.GetSection(GhidraOptions.SectionName))
            .ValidateDataAnnotations()
            .ValidateOnStart();

        services.AddOptions<BSimOptions>()
            .Bind(configuration.GetSection(BSimOptions.SectionName))
            .ValidateOnStart();

        services.AddOptions<GhidriffOptions>()
            .Bind(configuration.GetSection(GhidriffOptions.SectionName))
            .ValidateOnStart();

        // Register TimeProvider if not already registered
        services.TryAddSingleton(TimeProvider.System);

        // Register services
        services.AddSingleton<GhidraHeadlessManager>();
        services.AddSingleton<IGhidraService, GhidraService>();
        services.AddSingleton<IGhidriffBridge, GhidriffBridge>();
        services.AddSingleton<IVersionTrackingService, VersionTrackingService>();
        services.AddSingleton<IBSimService, BSimService>();

        // Register as IDisassemblyPlugin for fallback disassembly
        services.AddSingleton<IDisassemblyPlugin, GhidraDisassemblyPlugin>();

        return services;
    }

    /// <summary>
    /// Adds Ghidra integration services with custom configuration.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureGhidra">Action to configure Ghidra options.</param>
    /// <param name="configureBSim">Optional action to configure BSim options.</param>
    /// <param name="configureGhidriff">Optional action to configure ghidriff options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddGhidra(
        this IServiceCollection services,
        Action<GhidraOptions> configureGhidra,
        Action<BSimOptions>? configureBSim = null,
        Action<GhidriffOptions>? configureGhidriff = null)
    {
        services.AddOptions<GhidraOptions>()
            .Configure(configureGhidra)
            .ValidateDataAnnotations()
            .ValidateOnStart();

        if (configureBSim is not null)
        {
            services.AddOptions<BSimOptions>()
                .Configure(configureBSim)
                .ValidateOnStart();
        }
        else
        {
            services.AddOptions<BSimOptions>()
                .ValidateOnStart();
        }

        if (configureGhidriff is not null)
        {
            services.AddOptions<GhidriffOptions>()
                .Configure(configureGhidriff)
                .ValidateOnStart();
        }
        else
        {
            services.AddOptions<GhidriffOptions>()
                .ValidateOnStart();
        }

        // Register TimeProvider if not already registered
        services.TryAddSingleton(TimeProvider.System);

        // Register services
        services.AddSingleton<GhidraHeadlessManager>();
        services.AddSingleton<IGhidraService, GhidraService>();
        services.AddSingleton<IGhidriffBridge, GhidriffBridge>();
        services.AddSingleton<IVersionTrackingService, VersionTrackingService>();
        services.AddSingleton<IBSimService, BSimService>();

        // Register as IDisassemblyPlugin for fallback disassembly
        services.AddSingleton<IDisassemblyPlugin, GhidraDisassemblyPlugin>();

        return services;
    }
}
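// Illustrative sketch: wiring the services via the delegate-based AddGhidra overload
// above (no IConfiguration needed, convenient in tests). The install path, memory
// budget, and timeout values are hypothetical.
internal static class GhidraRegistrationExample
{
    public static IServiceCollection RegisterForAirGappedHost(IServiceCollection services)
    {
        return services.AddGhidra(
            ghidra =>
            {
                ghidra.GhidraHome = "/opt/ghidra";   // hypothetical install path
                ghidra.MaxMemory = "8G";
                ghidra.MaxConcurrentInstances = 2;
            },
            configureBSim: bsim => bsim.Enabled = false,   // BSim needs a PostgreSQL backend
            configureGhidriff: gd => gd.DefaultTimeoutSeconds = 900);
    }
}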
@@ -0,0 +1,157 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// A function discovered by Ghidra analysis.
/// </summary>
/// <param name="Name">Function name (may be auto-generated like FUN_00401000).</param>
/// <param name="Address">Virtual address of the function entry point.</param>
/// <param name="Size">Size of the function in bytes.</param>
/// <param name="Signature">Decompiled signature if available.</param>
/// <param name="DecompiledCode">Decompiled C code if requested.</param>
/// <param name="PCodeHash">SHA256 hash of normalized P-Code for semantic comparison.</param>
/// <param name="CalledFunctions">Names of functions called by this function.</param>
/// <param name="CallingFunctions">Names of functions that call this function.</param>
/// <param name="IsThunk">Whether this is a thunk/stub function.</param>
/// <param name="IsExternal">Whether this function is external (imported).</param>
public sealed record GhidraFunction(
    string Name,
    ulong Address,
    int Size,
    string? Signature,
    string? DecompiledCode,
    byte[]? PCodeHash,
    ImmutableArray<string> CalledFunctions,
    ImmutableArray<string> CallingFunctions,
    bool IsThunk = false,
    bool IsExternal = false);

/// <summary>
/// An import symbol from Ghidra analysis.
/// </summary>
/// <param name="Name">Symbol name.</param>
/// <param name="Address">Address where symbol is referenced.</param>
/// <param name="LibraryName">Name of the library providing the symbol.</param>
/// <param name="Ordinal">Ordinal number if applicable (PE imports).</param>
public sealed record GhidraImport(
    string Name,
    ulong Address,
    string? LibraryName,
    int? Ordinal);

/// <summary>
/// An export symbol from Ghidra analysis.
/// </summary>
/// <param name="Name">Symbol name.</param>
/// <param name="Address">Address of the exported symbol.</param>
/// <param name="Ordinal">Ordinal number if applicable (PE exports).</param>
public sealed record GhidraExport(
    string Name,
    ulong Address,
    int? Ordinal);

/// <summary>
/// A string literal discovered by Ghidra analysis.
/// </summary>
/// <param name="Value">The string value.</param>
/// <param name="Address">Address where string is located.</param>
/// <param name="Length">Length of the string in bytes.</param>
/// <param name="Encoding">String encoding (ASCII, UTF-8, UTF-16, etc.).</param>
public sealed record GhidraString(
    string Value,
    ulong Address,
    int Length,
    string Encoding);

/// <summary>
/// Metadata from Ghidra analysis.
/// </summary>
/// <param name="FileName">Name of the analyzed file.</param>
/// <param name="Format">Binary format detected (ELF, PE, Mach-O, etc.).</param>
/// <param name="Architecture">CPU architecture.</param>
/// <param name="Processor">Ghidra processor language ID.</param>
/// <param name="Compiler">Compiler ID if detected.</param>
/// <param name="Endianness">Byte order (little or big endian).</param>
/// <param name="AddressSize">Pointer size in bits (32 or 64).</param>
/// <param name="ImageBase">Image base address.</param>
/// <param name="EntryPoint">Entry point address.</param>
/// <param name="AnalysisDate">When analysis was performed.</param>
/// <param name="GhidraVersion">Ghidra version used.</param>
/// <param name="AnalysisDuration">How long analysis took.</param>
public sealed record GhidraMetadata(
    string FileName,
    string Format,
    string Architecture,
    string Processor,
    string? Compiler,
    string Endianness,
    int AddressSize,
    ulong ImageBase,
    ulong? EntryPoint,
    DateTimeOffset AnalysisDate,
    string GhidraVersion,
    TimeSpan AnalysisDuration);

/// <summary>
/// A data reference discovered by Ghidra analysis.
/// </summary>
/// <param name="FromAddress">Address where reference originates.</param>
/// <param name="ToAddress">Address being referenced.</param>
/// <param name="ReferenceType">Type of reference (read, write, call, etc.).</param>
public sealed record GhidraDataReference(
    ulong FromAddress,
    ulong ToAddress,
    GhidraReferenceType ReferenceType);

/// <summary>
/// Type of reference in Ghidra analysis.
/// </summary>
public enum GhidraReferenceType
{
    /// <summary>Unknown reference type.</summary>
    Unknown,

    /// <summary>Memory read reference.</summary>
    Read,

    /// <summary>Memory write reference.</summary>
    Write,

    /// <summary>Function call reference.</summary>
    Call,

    /// <summary>Unconditional jump reference.</summary>
    UnconditionalJump,

    /// <summary>Conditional jump reference.</summary>
    ConditionalJump,

    /// <summary>Computed/indirect reference.</summary>
    Computed,

    /// <summary>Data reference (address of).</summary>
    Data
}

/// <summary>
/// A memory block/section from Ghidra analysis.
/// </summary>
/// <param name="Name">Section name (.text, .data, etc.).</param>
/// <param name="Start">Start address.</param>
/// <param name="End">End address.</param>
/// <param name="Size">Size in bytes.</param>
/// <param name="IsExecutable">Whether section is executable.</param>
/// <param name="IsWritable">Whether section is writable.</param>
/// <param name="IsInitialized">Whether section has initialized data.</param>
public sealed record GhidraMemoryBlock(
    string Name,
    ulong Start,
    ulong End,
    long Size,
    bool IsExecutable,
    bool IsWritable,
    bool IsInitialized);
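// Illustrative sketch: a typical traversal of the model types above, locating the
// function that contains the binary's entry point. Assumes entry points always
// live in an executable block; the helper name is an invention for this example.
internal static class GhidraModelExample
{
    public static GhidraFunction? FindEntryFunction(GhidraAnalysisResult analysis)
    {
        if (analysis.Metadata.EntryPoint is not ulong entry)
        {
            return null;
        }

        // Only executable blocks can contain the entry point.
        var inExecutableBlock = analysis.MemoryBlocks.Any(
            b => b.IsExecutable && entry >= b.Start && entry < b.End);
        if (!inExecutableBlock)
        {
            return null;
        }

        // Function bodies are [Address, Address + Size) ranges.
        return analysis.Functions.FirstOrDefault(
            f => entry >= f.Address && entry < f.Address + (ulong)f.Size);
    }
}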
@@ -0,0 +1,188 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.ComponentModel.DataAnnotations;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Configuration options for Ghidra integration.
/// </summary>
public sealed class GhidraOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Ghidra";

    /// <summary>
    /// Path to Ghidra installation directory (GHIDRA_HOME).
    /// </summary>
    [Required]
    public string GhidraHome { get; set; } = string.Empty;

    /// <summary>
    /// Path to Java installation directory (JAVA_HOME).
    /// If not set, system JAVA_HOME will be used.
    /// </summary>
    public string? JavaHome { get; set; }

    /// <summary>
    /// Working directory for Ghidra projects and temporary files.
    /// </summary>
    [Required]
    public string WorkDir { get; set; } = Path.Combine(Path.GetTempPath(), "stellaops-ghidra");

    /// <summary>
    /// Path to custom Ghidra scripts directory.
    /// </summary>
    public string? ScriptsDir { get; set; }

    /// <summary>
    /// Maximum memory for Ghidra JVM (e.g., "4G", "8192M").
    /// </summary>
    public string MaxMemory { get; set; } = "4G";

    /// <summary>
    /// Maximum CPU cores for Ghidra analysis.
    /// </summary>
    public int MaxCpu { get; set; } = Environment.ProcessorCount;

    /// <summary>
    /// Default timeout for analysis operations in seconds.
    /// </summary>
    public int DefaultTimeoutSeconds { get; set; } = 300;

    /// <summary>
    /// Whether to clean up temporary projects after analysis.
    /// </summary>
    public bool CleanupTempProjects { get; set; } = true;

    /// <summary>
    /// Maximum concurrent Ghidra instances.
    /// </summary>
    public int MaxConcurrentInstances { get; set; } = 1;

    /// <summary>
    /// Whether Ghidra integration is enabled.
    /// </summary>
    public bool Enabled { get; set; } = true;
}

/// <summary>
/// Configuration options for BSim database.
/// </summary>
public sealed class BSimOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "BSim";

    /// <summary>
    /// BSim database connection string.
    /// Format: postgresql://user:pass@host:port/database
    /// </summary>
    public string? ConnectionString { get; set; }

    /// <summary>
    /// BSim database host.
    /// </summary>
    public string Host { get; set; } = "localhost";

    /// <summary>
    /// BSim database port.
    /// </summary>
    public int Port { get; set; } = 5432;

    /// <summary>
    /// BSim database name.
    /// </summary>
    public string Database { get; set; } = "bsim";

    /// <summary>
    /// BSim database username.
    /// </summary>
    public string Username { get; set; } = "bsim";

    /// <summary>
    /// BSim database password.
    /// </summary>
    public string? Password { get; set; }

    /// <summary>
    /// Default minimum similarity for queries.
    /// </summary>
    public double DefaultMinSimilarity { get; set; } = 0.7;

    /// <summary>
    /// Default maximum results per query.
    /// </summary>
    public int DefaultMaxResults { get; set; } = 10;

    /// <summary>
    /// Whether BSim integration is enabled.
    /// </summary>
    public bool Enabled { get; set; } = false;

    /// <summary>
    /// Gets the effective connection string.
    /// </summary>
    public string GetConnectionString()
    {
        if (!string.IsNullOrEmpty(ConnectionString))
        {
            return ConnectionString;
        }

        var password = string.IsNullOrEmpty(Password) ? "" : $":{Password}";
        return $"postgresql://{Username}{password}@{Host}:{Port}/{Database}";
    }
}

/// <summary>
/// Configuration options for ghidriff Python bridge.
/// </summary>
public sealed class GhidriffOptions
{
    /// <summary>
    /// Configuration section name.
    /// </summary>
    public const string SectionName = "Ghidriff";

    /// <summary>
    /// Path to Python executable.
    /// If not set, "python3" or "python" will be used from PATH.
    /// </summary>
    public string? PythonPath { get; set; }

    /// <summary>
    /// Path to ghidriff module (if not installed via pip).
    /// </summary>
    public string? GhidriffModulePath { get; set; }

    /// <summary>
    /// Whether to include decompilation in diff output by default.
    /// </summary>
    public bool DefaultIncludeDecompilation { get; set; } = true;

    /// <summary>
    /// Whether to include disassembly in diff output by default.
    /// </summary>
    public bool DefaultIncludeDisassembly { get; set; } = true;

    /// <summary>
    /// Default timeout for ghidriff operations in seconds.
    /// </summary>
    public int DefaultTimeoutSeconds { get; set; } = 600;

    /// <summary>
    /// Working directory for ghidriff output.
    /// </summary>
    public string WorkDir { get; set; } = Path.Combine(Path.GetTempPath(), "stellaops-ghidriff");

    /// <summary>
    /// Whether ghidriff integration is enabled.
    /// </summary>
    public bool Enabled { get; set; } = true;
}
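// Illustrative sketch: the configuration shape the three option classes above bind
// against, expressed as an in-memory IConfiguration so the example stays
// self-contained (requires Microsoft.Extensions.Configuration). Keys follow the
// SectionName constants; the values are hypothetical.
internal static class GhidraOptionsBindingExample
{
    public static IConfiguration Build()
    {
        return new ConfigurationBuilder()
            .AddInMemoryCollection(new Dictionary<string, string?>
            {
                ["Ghidra:GhidraHome"] = "/opt/ghidra",
                ["Ghidra:MaxMemory"] = "8G",
                ["Ghidra:MaxConcurrentInstances"] = "2",
                ["BSim:Enabled"] = "true",
                ["BSim:Host"] = "bsim-db.internal",
                ["BSim:Database"] = "bsim",
                ["Ghidriff:PythonPath"] = "/usr/bin/python3",
            })
            .Build();
    }
}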
@@ -0,0 +1,285 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
|
||||
/// Implementation of <see cref="IBSimService"/> for BSim signature generation and querying.
|
||||
/// </summary>
|
||||
public sealed class BSimService : IBSimService
|
||||
{
|
||||
private static readonly JsonSerializerOptions JsonOptions = new()
|
||||
{
|
||||
PropertyNameCaseInsensitive = true,
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
|
||||
};
|
||||
|
||||
private readonly GhidraHeadlessManager _headlessManager;
|
||||
private readonly BSimOptions _options;
|
||||
private readonly GhidraOptions _ghidraOptions;
|
||||
private readonly ILogger<BSimService> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new BSimService.
|
||||
/// </summary>
|
||||
/// <param name="headlessManager">The Ghidra Headless manager.</param>
|
||||
/// <param name="options">BSim options.</param>
|
||||
/// <param name="ghidraOptions">Ghidra options.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
public BSimService(
|
||||
GhidraHeadlessManager headlessManager,
|
||||
IOptions<BSimOptions> options,
|
||||
IOptions<GhidraOptions> ghidraOptions,
|
||||
ILogger<BSimService> logger)
|
||||
{
|
||||
_headlessManager = headlessManager;
|
||||
_options = options.Value;
|
||||
_ghidraOptions = ghidraOptions.Value;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableArray<BSimSignature>> GenerateSignaturesAsync(
|
||||
GhidraAnalysisResult analysis,
|
||||
BSimGenerationOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(analysis);
|
||||
|
||||
options ??= new BSimGenerationOptions();
|
||||
|
||||
_logger.LogInformation(
|
||||
"Generating BSim signatures for {FunctionCount} functions",
|
||||
analysis.Functions.Length);
|
||||
|
||||
// Filter functions based on options
|
||||
var eligibleFunctions = analysis.Functions
|
||||
.Where(f => IsEligibleForBSim(f, options))
|
||||
.ToList();
|
||||
|
||||
_logger.LogDebug(
|
||||
"Filtered to {EligibleCount} eligible functions (min size: {MinSize}, include thunks: {IncludeThunks})",
|
||||
eligibleFunctions.Count,
|
||||
options.MinFunctionSize,
|
||||
options.IncludeThunks);
|
||||
|
||||
// For each eligible function, generate a BSim signature
|
||||
// In a real implementation, this would use Ghidra's BSim feature extraction
|
||||
var signatures = new List<BSimSignature>();
|
||||
|
||||
foreach (var function in eligibleFunctions)
|
||||
{
|
||||
var signature = GenerateSignatureFromFunction(function);
|
||||
if (signature is not null)
|
||||
{
|
||||
signatures.Add(signature);
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogInformation(
|
||||
"Generated {SignatureCount} BSim signatures",
|
||||
signatures.Count);
|
||||
|
||||
return [.. signatures];
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableArray<BSimMatch>> QueryAsync(
|
||||
BSimSignature signature,
|
||||
BSimQueryOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(signature);
|
||||
|
||||
options ??= new BSimQueryOptions
|
||||
{
|
||||
MinSimilarity = _options.DefaultMinSimilarity,
|
||||
MaxResults = _options.DefaultMaxResults
|
||||
};
|
||||
|
||||
if (!_options.Enabled)
|
||||
{
|
||||
_logger.LogWarning("BSim is not enabled, returning empty results");
|
||||
return [];
|
||||
}
|
||||
|
||||
_logger.LogDebug(
|
||||
"Querying BSim for function: {FunctionName} (min similarity: {MinSimilarity})",
|
||||
signature.FunctionName,
|
||||
options.MinSimilarity);
|
||||
|
||||
// In a real implementation, this would query the BSim PostgreSQL database
|
||||
// For now, return empty results as BSim database setup is a separate task
|
||||
        return await Task.FromResult(ImmutableArray<BSimMatch>.Empty);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<BSimQueryResult>> QueryBatchAsync(
        ImmutableArray<BSimSignature> signatures,
        BSimQueryOptions? options = null,
        CancellationToken ct = default)
    {
        options ??= new BSimQueryOptions
        {
            MinSimilarity = _options.DefaultMinSimilarity,
            MaxResults = _options.DefaultMaxResults
        };

        if (!_options.Enabled)
        {
            _logger.LogWarning("BSim is not enabled, returning empty results");
            return signatures.Select(s => new BSimQueryResult(s, [])).ToImmutableArray();
        }

        _logger.LogDebug(
            "Batch querying BSim for {Count} signatures",
            signatures.Length);

        var results = new List<BSimQueryResult>();

        foreach (var signature in signatures)
        {
            ct.ThrowIfCancellationRequested();
            var matches = await QueryAsync(signature, options, ct);
            results.Add(new BSimQueryResult(signature, matches));
        }

        return [.. results];
    }

    /// <inheritdoc />
    public async Task IngestAsync(
        string libraryName,
        string version,
        ImmutableArray<BSimSignature> signatures,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(libraryName);
        ArgumentException.ThrowIfNullOrEmpty(version);

        if (!_options.Enabled)
        {
            throw new BSimUnavailableException("BSim is not enabled");
        }

        _logger.LogInformation(
            "Ingesting {SignatureCount} signatures for {Library} v{Version}",
            signatures.Length,
            libraryName,
            version);

        // In a real implementation, this would insert into the BSim PostgreSQL database
        // For now, throw as BSim database setup is a separate task
        throw new NotImplementedException(
            "BSim ingestion requires BSim PostgreSQL database setup (GHID-011). " +
            "See docs/implplan/SPRINT_20260105_001_003_BINDEX_semdiff_ghidra.md");
    }

    /// <inheritdoc />
    public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            return false;
        }

        // Check if BSim database is accessible
        // For now, just check if Ghidra is available since BSim requires it
        return await _headlessManager.IsAvailableAsync(ct);
    }

    private static bool IsEligibleForBSim(GhidraFunction function, BSimGenerationOptions options)
    {
        // Skip thunks unless explicitly included
        if (function.IsThunk && !options.IncludeThunks)
        {
            return false;
        }

        // Skip external/imported functions unless explicitly included
        if (function.IsExternal && !options.IncludeImports)
        {
            return false;
        }

        // Skip functions below minimum size
        // Note: We use function size as a proxy; ideally we'd use instruction count
        // which would require parsing the function body
        if (function.Size < options.MinFunctionSize * 4) // Rough estimate: ~4 bytes per instruction
        {
            return false;
        }

        return true;
    }

    private BSimSignature? GenerateSignatureFromFunction(GhidraFunction function)
    {
        // In a real implementation, this would use Ghidra's BSim feature extraction
        // which analyzes P-Code to generate behavioral signatures
        //
        // The signature captures:
        // - Data flow patterns
        // - Control flow structure
        // - Normalized constants
        // - API usage patterns

        // If we have a P-Code hash from Ghidra analysis, use it as the feature vector
        if (function.PCodeHash is not null)
        {
            // Calculate self-significance based on function complexity
            var selfSignificance = CalculateSelfSignificance(function);

            return new BSimSignature(
                function.Name,
                function.Address,
                function.PCodeHash,
                function.PCodeHash.Length,
                selfSignificance,
                EstimateInstructionCount(function.Size));
        }

        // If no P-Code hash, we can't generate a meaningful BSim signature
        _logger.LogDebug(
            "Function {Name} has no P-Code hash, skipping BSim signature generation",
            function.Name);

        return null;
    }

    private static double CalculateSelfSignificance(GhidraFunction function)
    {
        // Self-significance measures how distinctive a function is
        // Higher values = more unique signature = better for identification
        //
        // Factors that increase significance:
        // - More called functions (API usage)
        // - Larger size (more behavioral information)
        // - Fewer callers (not a common utility)

        var baseScore = 0.5;

        // Called functions increase significance
        var callScore = Math.Min(function.CalledFunctions.Length * 0.1, 0.3);

        // Size increases significance (diminishing returns)
        var sizeScore = Math.Min(Math.Log10(Math.Max(function.Size, 1)) * 0.1, 0.15);

        // Many callers decrease significance (common utility functions)
        var callerPenalty = function.CallingFunctions.Length > 10 ? 0.1 : 0;

        return Math.Min(baseScore + callScore + sizeScore - callerPenalty, 1.0);
    }
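
    // Worked example (illustrative numbers, not taken from the source): a function
    // with 4 callees, a size of 256 bytes, and 3 callers scores
    //   callScore     = min(4 * 0.1, 0.3)              = 0.3
    //   sizeScore     = min(log10(256) * 0.1, 0.15)    = 0.15  (log10(256) ~= 2.41)
    //   callerPenalty = 0                                      (3 callers <= 10)
    //   significance  = min(0.5 + 0.3 + 0.15 - 0, 1.0) = 0.95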

    private static int EstimateInstructionCount(int functionSize)
    {
        // Rough estimate: average 4 bytes per instruction for most architectures
        return Math.Max(functionSize / 4, 1);
    }
}
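
Usage sketch (not part of the diff): batch-matching signatures against the corpus
through the members above. The bsim variable stands for an instance of this service
obtained from dependency injection, and the Signature/Matches property names are
assumptions inferred from the positional construction of BSimQueryResult in
QueryBatchAsync.

    if (await bsim.IsAvailableAsync(ct))
    {
        var results = await bsim.QueryBatchAsync(signatures, new BSimQueryOptions
        {
            MinSimilarity = 0.7,
            MaxResults = 10
        }, ct);

        foreach (var result in results)
        {
            Console.WriteLine($"{result.Signature.Name}: {result.Matches.Length} match(es)");
        }
    }
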
@@ -0,0 +1,540 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.Disassembly;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Ghidra-based disassembly plugin providing broad architecture support as a fallback backend.
/// Ghidra is used for complex cases where B2R2 has limited coverage, supports 20+ architectures,
/// and provides mature decompilation, Version Tracking, and BSim capabilities.
/// </summary>
/// <remarks>
/// This plugin has lower priority than B2R2 since Ghidra requires external process invocation
/// (Java-based headless analysis) which is slower than native .NET disassembly. It serves as
/// the fallback when B2R2 returns low-confidence results or for architectures B2R2 handles poorly.
/// </remarks>
public sealed class GhidraDisassemblyPlugin : IDisassemblyPlugin, IDisposable
{
    /// <summary>
    /// Plugin identifier.
    /// </summary>
    public const string PluginId = "stellaops.disasm.ghidra";

    private readonly IGhidraService _ghidraService;
    private readonly GhidraOptions _options;
    private readonly ILogger<GhidraDisassemblyPlugin> _logger;
    private readonly TimeProvider _timeProvider;
    private bool _disposed;

    private static readonly DisassemblyCapabilities s_capabilities = new()
    {
        PluginId = PluginId,
        Name = "Ghidra Disassembler",
        Version = "11.x", // Ghidra 11.x
        SupportedArchitectures =
        [
            // All architectures supported by both B2R2 and Ghidra
            CpuArchitecture.X86,
            CpuArchitecture.X86_64,
            CpuArchitecture.ARM32,
            CpuArchitecture.ARM64,
            CpuArchitecture.MIPS32,
            CpuArchitecture.MIPS64,
            CpuArchitecture.RISCV64,
            CpuArchitecture.PPC32,
            CpuArchitecture.PPC64, // Ghidra supports PPC64 better than B2R2
            CpuArchitecture.SPARC,
            CpuArchitecture.SH4,
            CpuArchitecture.AVR,
            // Additional architectures Ghidra supports
            CpuArchitecture.WASM
        ],
        SupportedFormats =
        [
            BinaryFormat.ELF,
            BinaryFormat.PE,
            BinaryFormat.MachO,
            BinaryFormat.WASM,
            BinaryFormat.Raw
        ],
        SupportsLifting = true, // P-Code lifting
        SupportsCfgRecovery = true, // Full CFG recovery and decompilation
        Priority = 25 // Lower than B2R2 (50) - used as fallback
    };

    /// <summary>
    /// Creates a new Ghidra disassembly plugin.
    /// </summary>
    /// <param name="ghidraService">The Ghidra analysis service.</param>
    /// <param name="options">Ghidra options.</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="timeProvider">Time provider for timestamps.</param>
    public GhidraDisassemblyPlugin(
        IGhidraService ghidraService,
        IOptions<GhidraOptions> options,
        ILogger<GhidraDisassemblyPlugin> logger,
        TimeProvider timeProvider)
    {
        _ghidraService = ghidraService ?? throw new ArgumentNullException(nameof(ghidraService));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
    }

    /// <inheritdoc />
    public DisassemblyCapabilities Capabilities => s_capabilities;

    /// <inheritdoc />
    public BinaryInfo LoadBinary(Stream stream, CpuArchitecture? archHint = null, BinaryFormat? formatHint = null)
    {
        ArgumentNullException.ThrowIfNull(stream);
        ObjectDisposedException.ThrowIf(_disposed, this);

        // Copy stream to memory for analysis
        using var memStream = new MemoryStream();
        stream.CopyTo(memStream);
        return LoadBinary(memStream.ToArray(), archHint, formatHint);
    }

    /// <inheritdoc />
    public BinaryInfo LoadBinary(ReadOnlySpan<byte> bytes, CpuArchitecture? archHint = null, BinaryFormat? formatHint = null)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        var byteArray = bytes.ToArray();
        _logger.LogDebug("Loading binary with Ghidra plugin (size: {Size} bytes)", byteArray.Length);

        // Run Ghidra analysis synchronously for IDisassemblyPlugin contract
        var analysisTask = RunGhidraAnalysisAsync(byteArray, archHint, formatHint, CancellationToken.None);
        var result = analysisTask.GetAwaiter().GetResult();

        // Map Ghidra metadata to BinaryInfo
        var format = MapFormat(result.Metadata.Format);
        var architecture = MapArchitecture(result.Metadata.Architecture, result.Metadata.AddressSize);
        var endianness = result.Metadata.Endianness.Equals("little", StringComparison.OrdinalIgnoreCase)
            ? Endianness.Little
            : Endianness.Big;
        var abi = DetectAbi(format);

        _logger.LogInformation(
            "Loaded binary with Ghidra: Format={Format}, Architecture={Architecture}, Processor={Processor}",
            format, architecture, result.Metadata.Processor);

        var metadata = new Dictionary<string, object>
        {
            ["size"] = byteArray.Length,
            ["ghidra_processor"] = result.Metadata.Processor,
            ["ghidra_version"] = result.Metadata.GhidraVersion,
            ["analysis_duration_ms"] = result.Metadata.AnalysisDuration.TotalMilliseconds,
            ["function_count"] = result.Functions.Length,
            ["import_count"] = result.Imports.Length,
            ["export_count"] = result.Exports.Length
        };

        if (result.Metadata.Compiler is not null)
        {
            metadata["compiler"] = result.Metadata.Compiler;
        }

        return new BinaryInfo(
            Format: format,
            Architecture: architecture,
            Bitness: result.Metadata.AddressSize,
            Endianness: endianness,
            Abi: abi,
            EntryPoint: result.Metadata.EntryPoint,
            BuildId: result.BinaryHash,
            Metadata: metadata,
            Handle: new GhidraBinaryHandle(result, byteArray));
    }

    /// <inheritdoc />
    public IEnumerable<CodeRegion> GetCodeRegions(BinaryInfo binary)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        // Extract code regions from Ghidra memory blocks
        foreach (var block in handle.Result.MemoryBlocks)
        {
            if (block.IsExecutable)
            {
                yield return new CodeRegion(
                    Name: block.Name,
                    VirtualAddress: block.Start,
                    // Approximation: assumes a flat virtual-address-to-file-offset mapping
                    FileOffset: block.Start - handle.Result.Metadata.ImageBase,
                    Size: (ulong)block.Size,
                    IsExecutable: block.IsExecutable,
                    IsReadable: true, // Executable sections are readable
                    IsWritable: block.IsWritable);
            }
        }
    }

    /// <inheritdoc />
    public IEnumerable<SymbolInfo> GetSymbols(BinaryInfo binary)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        // Map functions to symbols
        foreach (var func in handle.Result.Functions)
        {
            var binding = func.IsExternal ? SymbolBinding.Global : SymbolBinding.Local;

            yield return new SymbolInfo(
                Name: func.Name,
                Address: func.Address,
                Size: (ulong)func.Size,
                Type: SymbolType.Function,
                Binding: binding,
                Section: DetermineSection(handle.Result.MemoryBlocks, func.Address));
        }

        // Also include exports as symbols
        foreach (var export in handle.Result.Exports)
        {
            yield return new SymbolInfo(
                Name: export.Name,
                Address: export.Address,
                Size: 0, // Unknown size for exports
                Type: SymbolType.Function,
                Binding: SymbolBinding.Global,
                Section: DetermineSection(handle.Result.MemoryBlocks, export.Address));
        }
    }

    /// <inheritdoc />
    public IEnumerable<DisassembledInstruction> Disassemble(BinaryInfo binary, CodeRegion region)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ArgumentNullException.ThrowIfNull(region);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        _logger.LogDebug(
            "Disassembling region {Name} from 0x{Start:X} to 0x{End:X}",
            region.Name, region.VirtualAddress, region.VirtualAddress + region.Size);

        // Find functions within the region and return their instructions
        var regionEnd = region.VirtualAddress + region.Size;

        foreach (var func in handle.Result.Functions)
        {
            if (func.Address >= region.VirtualAddress && func.Address < regionEnd)
            {
                foreach (var instr in DisassembleFunctionInstructions(func, handle))
                {
                    if (instr.Address >= region.VirtualAddress && instr.Address < regionEnd)
                    {
                        yield return instr;
                    }
                }
            }
        }
    }

    /// <inheritdoc />
    public IEnumerable<DisassembledInstruction> Disassemble(BinaryInfo binary, ulong startAddress, ulong length)
    {
        var region = new CodeRegion(
            Name: $"0x{startAddress:X}",
            VirtualAddress: startAddress,
            FileOffset: startAddress,
            Size: length,
            IsExecutable: true,
            IsReadable: true,
            IsWritable: false);

        return Disassemble(binary, region);
    }

    /// <inheritdoc />
    public IEnumerable<DisassembledInstruction> DisassembleSymbol(BinaryInfo binary, SymbolInfo symbol)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ArgumentNullException.ThrowIfNull(symbol);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        // Find the function matching the symbol
        var func = handle.Result.Functions.FirstOrDefault(f =>
            f.Address == symbol.Address || f.Name.Equals(symbol.Name, StringComparison.Ordinal));

        if (func is null)
        {
            _logger.LogWarning(
                "Function not found for symbol {Name} at 0x{Address:X}",
                symbol.Name, symbol.Address);
            yield break;
        }

        foreach (var instr in DisassembleFunctionInstructions(func, handle))
        {
            yield return instr;
        }
    }

    #region Private Methods

    private async Task<GhidraAnalysisResult> RunGhidraAnalysisAsync(
        byte[] bytes,
        CpuArchitecture? archHint,
        BinaryFormat? formatHint,
        CancellationToken ct)
    {
        // Write bytes to temp file
        var tempPath = Path.Combine(
            _options.WorkDir,
            $"disasm_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");

        try
        {
            Directory.CreateDirectory(Path.GetDirectoryName(tempPath)!);
            await File.WriteAllBytesAsync(tempPath, bytes, ct);

            var options = new GhidraAnalysisOptions
            {
                RunFullAnalysis = true,
                ExtractStrings = false, // Not needed for disassembly
                ExtractFunctions = true,
                ExtractDecompilation = false, // Can be expensive
                TimeoutSeconds = _options.DefaultTimeoutSeconds
            };

            // Add architecture hint if provided
            if (archHint.HasValue)
            {
                options = options with { ProcessorHint = MapToGhidraProcessor(archHint.Value) };
            }

            using var stream = File.OpenRead(tempPath);
            return await _ghidraService.AnalyzeAsync(stream, options, ct);
        }
        finally
        {
            TryDeleteFile(tempPath);
        }
    }

    private static IEnumerable<DisassembledInstruction> DisassembleFunctionInstructions(
        GhidraFunction func,
        GhidraBinaryHandle handle)
    {
        // Ghidra full analysis provides function boundaries but not individual instructions
        // We synthesize instruction info from the function's decompiled code or from the raw bytes

        // For now, return a synthetic instruction representing the function entry
        // A full implementation would require running a Ghidra script to export instructions

        // Calculate approximate instruction count based on function size and average instruction size
        // x86/x64 average instruction size is ~3-4 bytes
        var avgInstructionSize = handle.Result.Metadata.AddressSize == 64 ? 4 : 3;
        var estimatedInstructions = Math.Max(1, func.Size / avgInstructionSize);

        var address = func.Address;
        for (var i = 0; i < estimatedInstructions && i < 1000; i++) // Cap at 1000 instructions
        {
            // Without actual Ghidra instruction export, we create placeholder entries
            // Real implementation would parse Ghidra's instruction listing output
            var rawBytes = ExtractBytes(handle.Bytes, address, handle.Result.Metadata.ImageBase, avgInstructionSize);

            yield return new DisassembledInstruction(
                Address: address,
                RawBytes: rawBytes,
                Mnemonic: "GHIDRA", // Placeholder - real impl would have actual mnemonics
                OperandsText: $"; function {func.Name} + 0x{address - func.Address:X}",
                Kind: i == 0 ? InstructionKind.Call : InstructionKind.Unknown,
                Operands: []);

            address += (ulong)avgInstructionSize;
            if (address >= func.Address + (ulong)func.Size)
            {
                break;
            }
        }
    }

    private static ImmutableArray<byte> ExtractBytes(byte[] binary, ulong address, ulong imageBase, int count)
    {
        var offset = address - imageBase;
        if (offset >= (ulong)binary.Length)
        {
            return [];
        }

        var available = Math.Min(count, binary.Length - (int)offset);
        return binary.AsSpan((int)offset, available).ToArray().ToImmutableArray();
    }

    private static string? DetermineSection(ImmutableArray<GhidraMemoryBlock> blocks, ulong address)
    {
        foreach (var block in blocks)
        {
            if (address >= block.Start && address < block.End)
            {
                return block.Name;
            }
        }
        return null;
    }

    private static GhidraBinaryHandle GetHandle(BinaryInfo binary)
    {
        if (binary.Handle is not GhidraBinaryHandle handle)
        {
            throw new ArgumentException("Invalid binary handle - not a Ghidra handle", nameof(binary));
        }
        return handle;
    }

    private static BinaryFormat MapFormat(string ghidraFormat)
    {
        return ghidraFormat.ToUpperInvariant() switch
        {
            "ELF" or "ELF32" or "ELF64" => BinaryFormat.ELF,
            "PE" or "PE32" or "PE64" or "COFF" => BinaryFormat.PE,
            "MACHO" or "MACH-O" or "MACHO32" or "MACHO64" => BinaryFormat.MachO,
            "WASM" or "WEBASSEMBLY" => BinaryFormat.WASM,
            "RAW" or "BINARY" => BinaryFormat.Raw,
            _ => BinaryFormat.Unknown
        };
    }

    private static CpuArchitecture MapArchitecture(string ghidraArch, int addressSize)
    {
        var arch = ghidraArch.ToUpperInvariant();
        return arch switch
        {
            // Intel x86/x64
            "X86" or "X86:LE:32:DEFAULT" => CpuArchitecture.X86,
            "X86-64" or "X86:LE:64:DEFAULT" or "AMD64" => CpuArchitecture.X86_64,
            var x when x.StartsWith("X86", StringComparison.Ordinal) && addressSize == 32 => CpuArchitecture.X86,
            var x when x.StartsWith("X86", StringComparison.Ordinal) => CpuArchitecture.X86_64,

            // ARM
            "ARM" or "ARM:LE:32:V7" or "ARM:LE:32:V8" or "ARMV7" => CpuArchitecture.ARM32,
            "AARCH64" or "ARM:LE:64:V8A" or "ARM64" => CpuArchitecture.ARM64,
            var a when a.StartsWith("ARM", StringComparison.Ordinal) && addressSize == 32 => CpuArchitecture.ARM32,
            var a when a.StartsWith("ARM", StringComparison.Ordinal) || a.StartsWith("AARCH", StringComparison.Ordinal) => CpuArchitecture.ARM64,

            // MIPS
            "MIPS" or "MIPS:BE:32:DEFAULT" or "MIPS:LE:32:DEFAULT" => CpuArchitecture.MIPS32,
            "MIPS64" or "MIPS:BE:64:DEFAULT" or "MIPS:LE:64:DEFAULT" => CpuArchitecture.MIPS64,
            var m when m.StartsWith("MIPS", StringComparison.Ordinal) && addressSize == 64 => CpuArchitecture.MIPS64,
            var m when m.StartsWith("MIPS", StringComparison.Ordinal) => CpuArchitecture.MIPS32,

            // RISC-V
            "RISCV" or "RISCV:LE:64:RV64" or "RISCV64" => CpuArchitecture.RISCV64,
            var r when r.StartsWith("RISCV", StringComparison.Ordinal) => CpuArchitecture.RISCV64,

            // PowerPC
            "PPC" or "POWERPC" or "PPC:BE:32:DEFAULT" => CpuArchitecture.PPC32,
            "PPC64" or "POWERPC64" or "PPC:BE:64:DEFAULT" => CpuArchitecture.PPC64,
            var p when p.StartsWith("PPC", StringComparison.Ordinal) && addressSize == 64 => CpuArchitecture.PPC64,
            var p when p.StartsWith("PPC", StringComparison.Ordinal) || p.StartsWith("POWERPC", StringComparison.Ordinal) => CpuArchitecture.PPC32,

            // SPARC
            "SPARC" or "SPARC:BE:32:DEFAULT" => CpuArchitecture.SPARC,
            var s when s.StartsWith("SPARC", StringComparison.Ordinal) => CpuArchitecture.SPARC,

            // SuperH
            "SH4" or "SUPERH" or "SH:LE:32:SH4" => CpuArchitecture.SH4,
            var s when s.StartsWith("SH", StringComparison.Ordinal) || s.StartsWith("SUPERH", StringComparison.Ordinal) => CpuArchitecture.SH4,

            // AVR
            "AVR" or "AVR8:LE:16:DEFAULT" => CpuArchitecture.AVR,
            var a when a.StartsWith("AVR", StringComparison.Ordinal) => CpuArchitecture.AVR,

            // WASM
            "WASM" or "WEBASSEMBLY" => CpuArchitecture.WASM,

            // EVM (Ethereum)
            "EVM" => CpuArchitecture.EVM,

            _ => CpuArchitecture.Unknown
        };
    }

    private static string? MapToGhidraProcessor(CpuArchitecture arch)
    {
        return arch switch
        {
            CpuArchitecture.X86 => "x86:LE:32:default",
            CpuArchitecture.X86_64 => "x86:LE:64:default",
            CpuArchitecture.ARM32 => "ARM:LE:32:v7",
            CpuArchitecture.ARM64 => "AARCH64:LE:64:v8A",
            CpuArchitecture.MIPS32 => "MIPS:BE:32:default",
            CpuArchitecture.MIPS64 => "MIPS:BE:64:default",
            CpuArchitecture.RISCV64 => "RISCV:LE:64:RV64IC",
            CpuArchitecture.PPC32 => "PowerPC:BE:32:default",
            CpuArchitecture.PPC64 => "PowerPC:BE:64:default",
            CpuArchitecture.SPARC => "sparc:BE:32:default",
            CpuArchitecture.SH4 => "SuperH4:LE:32:default",
            CpuArchitecture.AVR => "avr8:LE:16:default",
            CpuArchitecture.WASM => "Wasm:LE:32:default",
            CpuArchitecture.EVM => "EVM:BE:256:default",
            _ => null
        };
    }

    private static string? DetectAbi(BinaryFormat format)
    {
        return format switch
        {
            BinaryFormat.ELF => "gnu",
            BinaryFormat.PE => "msvc",
            BinaryFormat.MachO => "darwin",
            _ => null
        };
    }

    private static void TryDeleteFile(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch
        {
            // Ignore cleanup failures
        }
    }

    #endregion

    /// <summary>
    /// Disposes the plugin and releases resources.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }
        _disposed = true;
    }
}

/// <summary>
/// Internal handle for Ghidra-analyzed binaries.
/// </summary>
/// <param name="Result">The Ghidra analysis result.</param>
/// <param name="Bytes">The original binary bytes.</param>
internal sealed record GhidraBinaryHandle(
    GhidraAnalysisResult Result,
    byte[] Bytes);
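
Usage sketch (not part of the diff; the file path is hypothetical and the constructor
dependencies are assumed to come from dependency injection):

    using var plugin = new GhidraDisassemblyPlugin(ghidraService, ghidraOptions, logger, TimeProvider.System);

    using var file = File.OpenRead("/tmp/libexample.so");
    var binary = plugin.LoadBinary(file);

    foreach (var symbol in plugin.GetSymbols(binary).Take(5))
    {
        Console.WriteLine($"{symbol.Name} @ 0x{symbol.Address:X} ({symbol.Size} bytes)");
    }
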
@@ -0,0 +1,441 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Diagnostics;
using System.Globalization;
using System.Runtime.InteropServices;
using System.Text;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Manages Ghidra Headless process lifecycle.
/// Provides methods to run analysis with proper process isolation and cleanup.
/// </summary>
public sealed class GhidraHeadlessManager : IAsyncDisposable
{
    private readonly GhidraOptions _options;
    private readonly ILogger<GhidraHeadlessManager> _logger;
    private readonly SemaphoreSlim _semaphore;
    private bool _disposed;

    /// <summary>
    /// Creates a new GhidraHeadlessManager.
    /// </summary>
    /// <param name="options">Ghidra configuration options.</param>
    /// <param name="logger">Logger instance.</param>
    public GhidraHeadlessManager(
        IOptions<GhidraOptions> options,
        ILogger<GhidraHeadlessManager> logger)
    {
        _options = options.Value;
        _logger = logger;
        _semaphore = new SemaphoreSlim(_options.MaxConcurrentInstances, _options.MaxConcurrentInstances);

        EnsureWorkDirectoryExists();
    }

    /// <summary>
    /// Runs Ghidra analysis on a binary.
    /// </summary>
    /// <param name="binaryPath">Absolute path to the binary file.</param>
    /// <param name="scriptName">Name of the post-analysis script to run.</param>
    /// <param name="scriptArgs">Arguments to pass to the script.</param>
    /// <param name="runAnalysis">Whether to run full auto-analysis.</param>
    /// <param name="timeoutSeconds">Timeout in seconds (0 = use default).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The process result, including captured standard output and error.</returns>
    public async Task<GhidraProcessResult> RunAnalysisAsync(
        string binaryPath,
        string? scriptName = null,
        string[]? scriptArgs = null,
        bool runAnalysis = true,
        int timeoutSeconds = 0,
        CancellationToken ct = default)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        if (!File.Exists(binaryPath))
        {
            throw new FileNotFoundException("Binary file not found", binaryPath);
        }

        var effectiveTimeout = timeoutSeconds > 0 ? timeoutSeconds : _options.DefaultTimeoutSeconds;

        await _semaphore.WaitAsync(ct);
        try
        {
            var projectDir = CreateTempProjectDirectory();
            try
            {
                var args = BuildAnalyzeArgs(projectDir, binaryPath, scriptName, scriptArgs, runAnalysis);
                return await RunGhidraAsync(args, effectiveTimeout, ct);
            }
            finally
            {
                if (_options.CleanupTempProjects)
                {
                    CleanupProjectDirectory(projectDir);
                }
            }
        }
        finally
        {
            _semaphore.Release();
        }
    }

    /// <summary>
    /// Runs a Ghidra script on an existing project.
    /// </summary>
    /// <param name="projectDir">Path to the Ghidra project directory.</param>
    /// <param name="projectName">Name of the Ghidra project.</param>
    /// <param name="scriptName">Name of the script to run.</param>
    /// <param name="scriptArgs">Arguments to pass to the script.</param>
    /// <param name="timeoutSeconds">Timeout in seconds (0 = use default).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The process result, including captured standard output and error.</returns>
    public async Task<GhidraProcessResult> RunScriptAsync(
        string projectDir,
        string projectName,
        string scriptName,
        string[]? scriptArgs = null,
        int timeoutSeconds = 0,
        CancellationToken ct = default)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        if (!Directory.Exists(projectDir))
        {
            throw new DirectoryNotFoundException($"Project directory not found: {projectDir}");
        }

        var effectiveTimeout = timeoutSeconds > 0 ? timeoutSeconds : _options.DefaultTimeoutSeconds;

        await _semaphore.WaitAsync(ct);
        try
        {
            var args = BuildScriptArgs(projectDir, projectName, scriptName, scriptArgs);
            return await RunGhidraAsync(args, effectiveTimeout, ct);
        }
        finally
        {
            _semaphore.Release();
        }
    }

    /// <summary>
    /// Checks if Ghidra is available and properly configured.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if Ghidra is available.</returns>
    public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
    {
        try
        {
            var executablePath = GetAnalyzeHeadlessPath();
            if (!File.Exists(executablePath))
            {
                _logger.LogDebug("Ghidra analyzeHeadless not found at: {Path}", executablePath);
                return false;
            }

            // Quick version check to verify Java is working
            var result = await RunGhidraAsync(["--help"], timeoutSeconds: 30, ct);
            return result.ExitCode == 0 || result.StandardOutput.Contains("analyzeHeadless", StringComparison.OrdinalIgnoreCase);
        }
        catch (Exception ex)
        {
            _logger.LogDebug(ex, "Ghidra availability check failed");
            return false;
        }
    }

    /// <summary>
    /// Gets Ghidra version information.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version string.</returns>
    public async Task<string> GetVersionAsync(CancellationToken ct = default)
    {
        var result = await RunGhidraAsync(["--help"], timeoutSeconds: 30, ct);

        // Parse version from output - typically starts with "Ghidra X.Y"
        var lines = result.StandardOutput.Split('\n', StringSplitOptions.RemoveEmptyEntries);
        foreach (var line in lines)
        {
            if (line.Contains("Ghidra", StringComparison.OrdinalIgnoreCase) &&
                line.Any(char.IsDigit))
            {
                return line.Trim();
            }
        }

        return "Unknown";
    }

    private string CreateTempProjectDirectory()
    {
        var projectDir = Path.Combine(
            _options.WorkDir,
            $"project_{DateTime.UtcNow:yyyyMMddHHmmssfff}_{Guid.NewGuid():N}");

        Directory.CreateDirectory(projectDir);
        _logger.LogDebug("Created temp project directory: {Path}", projectDir);
        return projectDir;
    }

    private void CleanupProjectDirectory(string projectDir)
    {
        try
        {
            if (Directory.Exists(projectDir))
            {
                Directory.Delete(projectDir, recursive: true);
                _logger.LogDebug("Cleaned up project directory: {Path}", projectDir);
            }
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to cleanup project directory: {Path}", projectDir);
        }
    }

    private void EnsureWorkDirectoryExists()
    {
        if (!Directory.Exists(_options.WorkDir))
        {
            Directory.CreateDirectory(_options.WorkDir);
            _logger.LogInformation("Created Ghidra work directory: {Path}", _options.WorkDir);
        }
    }

    private string[] BuildAnalyzeArgs(
        string projectDir,
        string binaryPath,
        string? scriptName,
        string[]? scriptArgs,
        bool runAnalysis)
    {
        var args = new List<string>
        {
            projectDir,
            "TempProject",
            "-import", binaryPath
        };

        if (!runAnalysis)
        {
            args.Add("-noanalysis");
        }

        if (!string.IsNullOrEmpty(scriptName))
        {
            args.AddRange(["-postScript", scriptName]);

            if (scriptArgs is { Length: > 0 })
            {
                args.AddRange(scriptArgs);
            }
        }

        if (!string.IsNullOrEmpty(_options.ScriptsDir))
        {
            args.AddRange(["-scriptPath", _options.ScriptsDir]);
        }

        args.AddRange(["-max-cpu", _options.MaxCpu.ToString(CultureInfo.InvariantCulture)]);

        return [.. args];
    }
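
    // Example (illustrative): with default options the argument list above expands to
    // a command line roughly like
    //   analyzeHeadless <workDir>/project_... TempProject -import /tmp/sample.bin
    //       -postScript ExportToJson.java -scriptPath <scriptsDir> -max-cpu 4
    // where the concrete paths and the -max-cpu value come from GhidraOptions.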

    private static string[] BuildScriptArgs(
        string projectDir,
        string projectName,
        string scriptName,
        string[]? scriptArgs)
    {
        var args = new List<string>
        {
            projectDir,
            projectName,
            "-postScript", scriptName
        };

        if (scriptArgs is { Length: > 0 })
        {
            args.AddRange(scriptArgs);
        }

        return [.. args];
    }

    private async Task<GhidraProcessResult> RunGhidraAsync(
        string[] args,
        int timeoutSeconds,
        CancellationToken ct)
    {
        var executablePath = GetAnalyzeHeadlessPath();

        var startInfo = new ProcessStartInfo
        {
            FileName = executablePath,
            Arguments = string.Join(" ", args.Select(QuoteArg)),
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true,
            StandardOutputEncoding = Encoding.UTF8,
            StandardErrorEncoding = Encoding.UTF8
        };

        ConfigureEnvironment(startInfo);

        _logger.LogDebug("Starting Ghidra: {Command} {Args}", executablePath, startInfo.Arguments);

        var stopwatch = Stopwatch.StartNew();
        using var process = new Process { StartInfo = startInfo };

        var stdoutBuilder = new StringBuilder();
        var stderrBuilder = new StringBuilder();

        process.OutputDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                stdoutBuilder.AppendLine(e.Data);
            }
        };

        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                stderrBuilder.AppendLine(e.Data);
            }
        };

        if (!process.Start())
        {
            throw new GhidraException("Failed to start Ghidra process");
        }

        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            await process.WaitForExitAsync(linkedCts.Token);
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            try
            {
                process.Kill(entireProcessTree: true);
            }
            catch
            {
                // Best effort kill
            }

            throw new GhidraTimeoutException(timeoutSeconds);
        }

        stopwatch.Stop();

        var stdout = stdoutBuilder.ToString();
        var stderr = stderrBuilder.ToString();

        _logger.LogDebug(
            "Ghidra completed with exit code {ExitCode} in {Duration}ms",
            process.ExitCode,
            stopwatch.ElapsedMilliseconds);

        if (process.ExitCode != 0)
        {
            _logger.LogWarning("Ghidra failed: {Error}", stderr);
        }

        return new GhidraProcessResult(
            process.ExitCode,
            stdout,
            stderr,
            stopwatch.Elapsed);
    }

    private string GetAnalyzeHeadlessPath()
    {
        var basePath = Path.Combine(_options.GhidraHome, "support");

        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
        {
            return Path.Combine(basePath, "analyzeHeadless.bat");
        }

        return Path.Combine(basePath, "analyzeHeadless");
    }

    private void ConfigureEnvironment(ProcessStartInfo startInfo)
    {
        if (!string.IsNullOrEmpty(_options.JavaHome))
        {
            startInfo.EnvironmentVariables["JAVA_HOME"] = _options.JavaHome;
        }

        startInfo.EnvironmentVariables["MAXMEM"] = _options.MaxMemory;
        startInfo.EnvironmentVariables["GHIDRA_HOME"] = _options.GhidraHome;
    }

    private static string QuoteArg(string arg)
    {
        if (arg.Contains(' ', StringComparison.Ordinal) || arg.Contains('"', StringComparison.Ordinal))
        {
            return $"\"{arg.Replace("\"", "\\\"")}\"";
        }

        return arg;
    }

    /// <inheritdoc />
    public async ValueTask DisposeAsync()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        // Wait for any in-flight operations to complete
        for (var i = 0; i < _options.MaxConcurrentInstances; i++)
        {
            await _semaphore.WaitAsync();
        }

        _semaphore.Dispose();
    }
}

/// <summary>
/// Result of a Ghidra process execution.
/// </summary>
/// <param name="ExitCode">Process exit code.</param>
/// <param name="StandardOutput">Standard output content.</param>
/// <param name="StandardError">Standard error content.</param>
/// <param name="Duration">Execution duration.</param>
public sealed record GhidraProcessResult(
    int ExitCode,
    string StandardOutput,
    string StandardError,
    TimeSpan Duration)
{
    /// <summary>
    /// Whether the process completed successfully (exit code 0).
    /// </summary>
    public bool IsSuccess => ExitCode == 0;
}
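
Usage sketch (not part of the diff; option values, paths, and the script name are
illustrative, and the logger is assumed to come from dependency injection):

    await using var manager = new GhidraHeadlessManager(
        Options.Create(new GhidraOptions
        {
            GhidraHome = "/opt/ghidra",
            WorkDir = "/tmp/ghidra-work",
            MaxConcurrentInstances = 2
        }),
        logger);

    var result = await manager.RunAnalysisAsync(
        "/tmp/sample.bin",
        scriptName: "ExportToJson.java",
        timeoutSeconds: 600);

    if (!result.IsSuccess)
    {
        Console.Error.WriteLine(result.StandardError);
    }
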
@@ -0,0 +1,511 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using System.Globalization;
using System.Security.Cryptography;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Implementation of <see cref="IGhidraService"/> using Ghidra Headless analysis.
/// </summary>
public sealed class GhidraService : IGhidraService, IAsyncDisposable
{
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    private readonly GhidraHeadlessManager _headlessManager;
    private readonly GhidraOptions _options;
    private readonly ILogger<GhidraService> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a new GhidraService.
    /// </summary>
    /// <param name="headlessManager">The Ghidra Headless manager.</param>
    /// <param name="options">Ghidra options.</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="timeProvider">Time provider for timestamps.</param>
    public GhidraService(
        GhidraHeadlessManager headlessManager,
        IOptions<GhidraOptions> options,
        ILogger<GhidraService> logger,
        TimeProvider timeProvider)
    {
        _headlessManager = headlessManager;
        _options = options.Value;
        _logger = logger;
        _timeProvider = timeProvider;
    }

    /// <inheritdoc />
    public async Task<GhidraAnalysisResult> AnalyzeAsync(
        Stream binaryStream,
        GhidraAnalysisOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(binaryStream);

        // Write stream to temp file
        var tempPath = Path.Combine(
            _options.WorkDir,
            $"binary_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");

        try
        {
            Directory.CreateDirectory(Path.GetDirectoryName(tempPath)!);

            await using (var fileStream = File.Create(tempPath))
            {
                await binaryStream.CopyToAsync(fileStream, ct);
            }

            return await AnalyzeAsync(tempPath, options, ct);
        }
        finally
        {
            TryDeleteFile(tempPath);
        }
    }

    /// <inheritdoc />
    public async Task<GhidraAnalysisResult> AnalyzeAsync(
        string binaryPath,
        GhidraAnalysisOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(binaryPath);

        if (!File.Exists(binaryPath))
        {
            throw new FileNotFoundException("Binary file not found", binaryPath);
        }

        options ??= new GhidraAnalysisOptions();

        _logger.LogInformation("Starting Ghidra analysis of: {BinaryPath}", binaryPath);
        var startTime = _timeProvider.GetUtcNow();

        // Calculate binary hash
        var binaryHash = await ComputeBinaryHashAsync(binaryPath, ct);

        // Run analysis with JSON export script
        var result = await _headlessManager.RunAnalysisAsync(
            binaryPath,
            scriptName: "ExportToJson.java",
            scriptArgs: BuildScriptArgs(options),
            runAnalysis: options.RunFullAnalysis,
            timeoutSeconds: options.TimeoutSeconds,
            ct);

        if (!result.IsSuccess)
        {
            throw new GhidraException($"Ghidra analysis failed: {result.StandardError}")
            {
                ExitCode = result.ExitCode,
                StandardError = result.StandardError,
                StandardOutput = result.StandardOutput
            };
        }

        var analysisResult = ParseAnalysisOutput(
            result.StandardOutput,
            binaryPath,
            binaryHash,
            startTime,
            result.Duration);

        _logger.LogInformation(
            "Ghidra analysis completed: {FunctionCount} functions found in {Duration}ms",
            analysisResult.Functions.Length,
            result.Duration.TotalMilliseconds);

        return analysisResult;
    }

    /// <inheritdoc />
    public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            return false;
        }

        return await _headlessManager.IsAvailableAsync(ct);
    }

    /// <inheritdoc />
    public async Task<GhidraInfo> GetInfoAsync(CancellationToken ct = default)
    {
        var version = await _headlessManager.GetVersionAsync(ct);

        // Get Java version
        var javaVersion = GetJavaVersion();

        // Get available processor languages
        var processors = GetAvailableProcessors();

        return new GhidraInfo(
            version,
            javaVersion,
            processors,
            _options.GhidraHome);
    }

    /// <inheritdoc />
    public async ValueTask DisposeAsync()
    {
        await _headlessManager.DisposeAsync();
    }

    private static string[] BuildScriptArgs(GhidraAnalysisOptions options)
    {
        var args = new List<string>();

        if (options.IncludeDecompilation)
        {
            args.Add("-decompile");
        }

        if (options.GeneratePCodeHashes)
        {
            args.Add("-pcode-hash");
        }

        return [.. args];
    }

    private GhidraAnalysisResult ParseAnalysisOutput(
        string output,
        string binaryPath,
        string binaryHash,
        DateTimeOffset startTime,
        TimeSpan duration)
    {
        // Look for JSON output marker in stdout
        const string jsonMarker = "###GHIDRA_JSON_OUTPUT###";
        var jsonStart = output.IndexOf(jsonMarker, StringComparison.Ordinal);

        if (jsonStart >= 0)
        {
            var jsonContent = output[(jsonStart + jsonMarker.Length)..].Trim();
            var jsonEnd = jsonContent.IndexOf("###END_GHIDRA_JSON_OUTPUT###", StringComparison.Ordinal);
            if (jsonEnd >= 0)
            {
                jsonContent = jsonContent[..jsonEnd].Trim();
            }

            try
            {
                return ParseJsonOutput(jsonContent, binaryHash, startTime, duration);
            }
            catch (JsonException ex)
            {
                _logger.LogWarning(ex, "Failed to parse Ghidra JSON output, falling back to text parsing");
            }
        }

        // Fallback: parse text output
        return ParseTextOutput(output, binaryPath, binaryHash, startTime, duration);
    }
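
    // Example stdout shape the parser above expects (illustrative):
    //   ... Ghidra log lines ...
    //   ###GHIDRA_JSON_OUTPUT###
    //   { "functions": [ ... ], "imports": [ ... ], "metadata": { ... } }
    //   ###END_GHIDRA_JSON_OUTPUT###
    // Everything outside the marker pair is ignored by the JSON path and only
    // consulted by the text fallback below.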

    private GhidraAnalysisResult ParseJsonOutput(
        string json,
        string binaryHash,
        DateTimeOffset startTime,
        TimeSpan duration)
    {
        var data = JsonSerializer.Deserialize<GhidraJsonOutput>(json, JsonOptions)
            ?? throw new GhidraException("Failed to deserialize Ghidra JSON output");

        var functions = data.Functions?.Select(f => new GhidraFunction(
            f.Name ?? "unknown",
            ParseAddress(f.Address),
            f.Size,
            f.Signature,
            f.DecompiledCode,
            f.PCodeHash is not null ? Convert.FromHexString(f.PCodeHash) : null,
            f.CalledFunctions?.ToImmutableArray() ?? [],
            f.CallingFunctions?.ToImmutableArray() ?? [],
            f.IsThunk,
            f.IsExternal
        )).ToImmutableArray() ?? [];

        var imports = data.Imports?.Select(i => new GhidraImport(
            i.Name ?? "unknown",
            ParseAddress(i.Address),
            i.LibraryName,
            i.Ordinal
        )).ToImmutableArray() ?? [];

        var exports = data.Exports?.Select(e => new GhidraExport(
            e.Name ?? "unknown",
            ParseAddress(e.Address),
            e.Ordinal
        )).ToImmutableArray() ?? [];

        var strings = data.Strings?.Select(s => new GhidraString(
            s.Value ?? "",
            ParseAddress(s.Address),
            s.Length,
            s.Encoding ?? "ASCII"
        )).ToImmutableArray() ?? [];

        var memoryBlocks = data.MemoryBlocks?.Select(m => new GhidraMemoryBlock(
            m.Name ?? "unknown",
            ParseAddress(m.Start),
            ParseAddress(m.End),
            m.Size,
            m.IsExecutable,
            m.IsWritable,
            m.IsInitialized
        )).ToImmutableArray() ?? [];

        var metadata = new GhidraMetadata(
            data.Metadata?.FileName ?? "unknown",
            data.Metadata?.Format ?? "unknown",
            data.Metadata?.Architecture ?? "unknown",
            data.Metadata?.Processor ?? "unknown",
            data.Metadata?.Compiler,
            data.Metadata?.Endianness ?? "little",
            data.Metadata?.AddressSize ?? 64,
            ParseAddress(data.Metadata?.ImageBase),
            data.Metadata?.EntryPoint is not null ? ParseAddress(data.Metadata.EntryPoint) : null,
            startTime,
            data.Metadata?.GhidraVersion ?? "unknown",
            duration);

        return new GhidraAnalysisResult(
            binaryHash,
            functions,
            imports,
            exports,
            strings,
            memoryBlocks,
            metadata);
    }

    private GhidraAnalysisResult ParseTextOutput(
        string output,
        string binaryPath,
        string binaryHash,
        DateTimeOffset startTime,
        TimeSpan duration)
    {
        // Basic text parsing for when JSON export is not available
        // This extracts minimal information from Ghidra log output

        var functions = ImmutableArray<GhidraFunction>.Empty;
        var imports = ImmutableArray<GhidraImport>.Empty;
        var exports = ImmutableArray<GhidraExport>.Empty;
        var strings = ImmutableArray<GhidraString>.Empty;
        var memoryBlocks = ImmutableArray<GhidraMemoryBlock>.Empty;

        // Parse function count from output like "Total functions: 123"
        var functionCountMatch = System.Text.RegularExpressions.Regex.Match(
            output,
            @"(?:Total functions|Functions found|functions):\s*(\d+)",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);

        var metadata = new GhidraMetadata(
            Path.GetFileName(binaryPath),
            "unknown",
            "unknown",
            "unknown",
            null,
            "little",
            64,
            0,
            null,
            startTime,
            "unknown",
            duration);

        _logger.LogDebug(
            "Parsed Ghidra text output: estimated {Count} functions",
            functionCountMatch.Success ? functionCountMatch.Groups[1].Value : "unknown");

        return new GhidraAnalysisResult(
            binaryHash,
            functions,
            imports,
            exports,
            strings,
            memoryBlocks,
            metadata);
    }

    private static ulong ParseAddress(string? address)
    {
        if (string.IsNullOrEmpty(address))
        {
            return 0;
        }

        // Handle hex format (0x...) or plain hex
        if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
        {
            address = address[2..];
        }

        return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
            ? result
            : 0;
    }
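
    // e.g. ParseAddress("0x00401000") == 0x401000 and ParseAddress("401000") == 0x401000;
    // malformed or missing input falls back to 0 rather than throwing.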

    private static async Task<string> ComputeBinaryHashAsync(string path, CancellationToken ct)
    {
        await using var stream = File.OpenRead(path);
        var hash = await SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexStringLower(hash);
    }

    private string GetJavaVersion()
    {
        try
        {
            var javaHome = _options.JavaHome ?? Environment.GetEnvironmentVariable("JAVA_HOME");
            if (string.IsNullOrEmpty(javaHome))
            {
                return "unknown";
            }

            var releaseFile = Path.Combine(javaHome, "release");
            if (File.Exists(releaseFile))
            {
                var content = File.ReadAllText(releaseFile);
                var match = System.Text.RegularExpressions.Regex.Match(
                    content,
                    @"JAVA_VERSION=""?([^""\r\n]+)""?");

                if (match.Success)
                {
                    return match.Groups[1].Value;
                }
            }

            return "unknown";
        }
        catch
        {
            return "unknown";
        }
    }

    private ImmutableArray<string> GetAvailableProcessors()
    {
        try
        {
            var processorsDir = Path.Combine(_options.GhidraHome, "Ghidra", "Processors");
            if (!Directory.Exists(processorsDir))
            {
                return [];
            }

            return Directory.GetDirectories(processorsDir)
                .Select(Path.GetFileName)
                .Where(name => !string.IsNullOrEmpty(name))
                .Order(StringComparer.OrdinalIgnoreCase)
                .ToImmutableArray()!;
        }
        catch
        {
            return [];
        }
    }

    private void TryDeleteFile(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception ex)
        {
            _logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
        }
    }

    // JSON DTOs for deserialization
    private sealed record GhidraJsonOutput
    {
        public List<GhidraFunctionJson>? Functions { get; init; }
        public List<GhidraImportJson>? Imports { get; init; }
        public List<GhidraExportJson>? Exports { get; init; }
        public List<GhidraStringJson>? Strings { get; init; }
        public List<GhidraMemoryBlockJson>? MemoryBlocks { get; init; }
        public GhidraMetadataJson? Metadata { get; init; }
    }

    private sealed record GhidraFunctionJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public int Size { get; init; }
        public string? Signature { get; init; }
        public string? DecompiledCode { get; init; }
        public string? PCodeHash { get; init; }
        public List<string>? CalledFunctions { get; init; }
        public List<string>? CallingFunctions { get; init; }
        public bool IsThunk { get; init; }
        public bool IsExternal { get; init; }
    }

    private sealed record GhidraImportJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public string? LibraryName { get; init; }
        public int? Ordinal { get; init; }
    }

    private sealed record GhidraExportJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public int? Ordinal { get; init; }
    }

    private sealed record GhidraStringJson
    {
        public string? Value { get; init; }
        public string? Address { get; init; }
        public int Length { get; init; }
        public string? Encoding { get; init; }
    }

    private sealed record GhidraMemoryBlockJson
    {
        public string? Name { get; init; }
        public string? Start { get; init; }
        public string? End { get; init; }
        public long Size { get; init; }
        public bool IsExecutable { get; init; }
        public bool IsWritable { get; init; }
        public bool IsInitialized { get; init; }
    }

    private sealed record GhidraMetadataJson
    {
        public string? FileName { get; init; }
        public string? Format { get; init; }
        public string? Architecture { get; init; }
        public string? Processor { get; init; }
        public string? Compiler { get; init; }
        public string? Endianness { get; init; }
        public int AddressSize { get; init; }
        public string? ImageBase { get; init; }
        public string? EntryPoint { get; init; }
        public string? GhidraVersion { get; init; }
    }
}
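
Usage sketch (not part of the diff; the path is hypothetical and the service instance
is assumed to come from dependency injection):

    var analysis = await ghidraService.AnalyzeAsync("/tmp/sample.bin", new GhidraAnalysisOptions
    {
        RunFullAnalysis = true,
        ExtractFunctions = true
    }, ct);

    foreach (var function in analysis.Functions.Where(f => !f.IsThunk))
    {
        Console.WriteLine($"{function.Name} @ 0x{function.Address:X}");
    }
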
@@ -0,0 +1,702 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.InteropServices;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Implementation of <see cref="IGhidriffBridge"/> for Python ghidriff integration.
/// </summary>
public sealed class GhidriffBridge : IGhidriffBridge
{
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    private readonly GhidriffOptions _options;
    private readonly GhidraOptions _ghidraOptions;
    private readonly ILogger<GhidriffBridge> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a new GhidriffBridge.
    /// </summary>
    /// <param name="options">ghidriff options.</param>
    /// <param name="ghidraOptions">Ghidra options for path configuration.</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="timeProvider">Time provider.</param>
    public GhidriffBridge(
        IOptions<GhidriffOptions> options,
        IOptions<GhidraOptions> ghidraOptions,
        ILogger<GhidriffBridge> logger,
        TimeProvider timeProvider)
    {
        _options = options.Value;
        _ghidraOptions = ghidraOptions.Value;
        _logger = logger;
        _timeProvider = timeProvider;

        EnsureWorkDirectoryExists();
    }

    /// <inheritdoc />
    public async Task<GhidriffResult> DiffAsync(
        string oldBinaryPath,
        string newBinaryPath,
        GhidriffDiffOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(oldBinaryPath);
        ArgumentException.ThrowIfNullOrEmpty(newBinaryPath);

        if (!File.Exists(oldBinaryPath))
        {
            throw new FileNotFoundException("Old binary not found", oldBinaryPath);
        }

        if (!File.Exists(newBinaryPath))
        {
            throw new FileNotFoundException("New binary not found", newBinaryPath);
        }

        options ??= new GhidriffDiffOptions
        {
            IncludeDecompilation = _options.DefaultIncludeDecompilation,
            IncludeDisassembly = _options.DefaultIncludeDisassembly,
            TimeoutSeconds = _options.DefaultTimeoutSeconds
        };

        _logger.LogInformation(
            "Starting ghidriff comparison: {OldBinary} vs {NewBinary}",
            Path.GetFileName(oldBinaryPath),
            Path.GetFileName(newBinaryPath));

        var startTime = _timeProvider.GetUtcNow();
        var outputDir = CreateOutputDirectory();

        try
        {
            var args = BuildGhidriffArgs(oldBinaryPath, newBinaryPath, outputDir, options);
            var result = await RunPythonAsync("ghidriff", args, options.TimeoutSeconds, ct);

            if (result.ExitCode != 0)
            {
                throw new GhidriffException($"ghidriff failed with exit code {result.ExitCode}")
                {
                    ExitCode = result.ExitCode,
                    StandardError = result.StandardError,
                    StandardOutput = result.StandardOutput
                };
            }

            var ghidriffResult = await ParseOutputAsync(
                outputDir,
                oldBinaryPath,
                newBinaryPath,
                startTime,
                ct);

            _logger.LogInformation(
                "ghidriff completed: {Added} added, {Removed} removed, {Modified} modified functions",
                ghidriffResult.AddedFunctions.Length,
                ghidriffResult.RemovedFunctions.Length,
                ghidriffResult.ModifiedFunctions.Length);

            return ghidriffResult;
        }
        finally
        {
            CleanupOutputDirectory(outputDir);
        }
    }

    /// <inheritdoc />
    public async Task<GhidriffResult> DiffAsync(
        Stream oldBinary,
        Stream newBinary,
        GhidriffDiffOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(oldBinary);
        ArgumentNullException.ThrowIfNull(newBinary);

        var oldPath = await SaveStreamToTempFileAsync(oldBinary, "old", ct);
        var newPath = await SaveStreamToTempFileAsync(newBinary, "new", ct);

        try
        {
            return await DiffAsync(oldPath, newPath, options, ct);
        }
        finally
        {
            TryDeleteFile(oldPath);
            TryDeleteFile(newPath);
        }
    }

    /// <inheritdoc />
    public Task<string> GenerateReportAsync(
        GhidriffResult result,
        GhidriffReportFormat format,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(result);

        return format switch
        {
            GhidriffReportFormat.Json => Task.FromResult(GenerateJsonReport(result)),
            GhidriffReportFormat.Markdown => Task.FromResult(GenerateMarkdownReport(result)),
            GhidriffReportFormat.Html => Task.FromResult(GenerateHtmlReport(result)),
            _ => throw new ArgumentOutOfRangeException(nameof(format))
        };
    }

    /// <inheritdoc />
    public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            return false;
        }

        try
        {
            var result = await RunPythonAsync("ghidriff", ["--version"], timeoutSeconds: 30, ct);
            return result.ExitCode == 0;
        }
        catch (Exception ex)
        {
            _logger.LogDebug(ex, "ghidriff availability check failed");
            return false;
        }
    }

    /// <inheritdoc />
    public async Task<string> GetVersionAsync(CancellationToken ct = default)
    {
        var result = await RunPythonAsync("ghidriff", ["--version"], timeoutSeconds: 30, ct);

        if (result.ExitCode != 0)
        {
            throw new GhidriffException("Failed to get ghidriff version")
            {
                ExitCode = result.ExitCode,
                StandardError = result.StandardError
            };
        }

        return result.StandardOutput.Trim();
    }
|
||||
|
||||
private void EnsureWorkDirectoryExists()
|
||||
{
|
||||
if (!Directory.Exists(_options.WorkDir))
|
||||
{
|
||||
Directory.CreateDirectory(_options.WorkDir);
|
||||
_logger.LogDebug("Created ghidriff work directory: {Path}", _options.WorkDir);
|
||||
}
|
||||
}
|
||||
|
||||
private string CreateOutputDirectory()
|
||||
{
|
||||
var outputDir = Path.Combine(
|
||||
_options.WorkDir,
|
||||
$"diff_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}");
|
||||
|
||||
Directory.CreateDirectory(outputDir);
|
||||
return outputDir;
|
||||
}
|
||||
|
||||
private void CleanupOutputDirectory(string outputDir)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (Directory.Exists(outputDir))
|
||||
{
|
||||
Directory.Delete(outputDir, recursive: true);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to cleanup output directory: {Path}", outputDir);
|
||||
}
|
||||
}
|
||||
|
||||
private string[] BuildGhidriffArgs(
|
||||
string oldPath,
|
||||
string newPath,
|
||||
string outputDir,
|
||||
GhidriffDiffOptions options)
|
||||
{
|
||||
var args = new List<string>
|
||||
{
|
||||
oldPath,
|
||||
newPath,
|
||||
"--output-dir", outputDir,
|
||||
"--output-format", "json"
|
||||
};
|
||||
|
||||
var ghidraPath = options.GhidraPath ?? _ghidraOptions.GhidraHome;
|
||||
if (!string.IsNullOrEmpty(ghidraPath))
|
||||
{
|
||||
args.AddRange(["--ghidra-path", ghidraPath]);
|
||||
}
|
||||
|
||||
if (options.IncludeDecompilation)
|
||||
{
|
||||
args.Add("--include-decompilation");
|
||||
}
|
||||
|
||||
if (!options.IncludeDisassembly)
|
||||
{
|
||||
args.Add("--no-disassembly");
|
||||
}
|
||||
|
||||
foreach (var exclude in options.ExcludeFunctions)
|
||||
{
|
||||
args.AddRange(["--exclude", exclude]);
|
||||
}
|
||||
|
||||
if (options.MaxParallelism > 1)
|
||||
{
|
||||
args.AddRange(["--parallel", options.MaxParallelism.ToString(CultureInfo.InvariantCulture)]);
|
||||
}
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
private async Task<ProcessResult> RunPythonAsync(
|
||||
string module,
|
||||
string[] args,
|
||||
int timeoutSeconds,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var pythonPath = GetPythonPath();
|
||||
var arguments = $"-m {module} {string.Join(" ", args.Select(QuoteArg))}";
|
||||
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = pythonPath,
|
||||
Arguments = arguments,
|
||||
RedirectStandardOutput = true,
|
||||
RedirectStandardError = true,
|
||||
UseShellExecute = false,
|
||||
CreateNoWindow = true,
|
||||
StandardOutputEncoding = Encoding.UTF8,
|
||||
StandardErrorEncoding = Encoding.UTF8
|
||||
};
|
||||
|
||||
_logger.LogDebug("Running: {Python} {Args}", pythonPath, arguments);
|
||||
|
||||
using var process = new Process { StartInfo = startInfo };
|
||||
|
||||
var stdoutBuilder = new StringBuilder();
|
||||
var stderrBuilder = new StringBuilder();
|
||||
|
||||
process.OutputDataReceived += (_, e) =>
|
||||
{
|
||||
if (e.Data is not null)
|
||||
{
|
||||
stdoutBuilder.AppendLine(e.Data);
|
||||
}
|
||||
};
|
||||
|
||||
process.ErrorDataReceived += (_, e) =>
|
||||
{
|
||||
if (e.Data is not null)
|
||||
{
|
||||
stderrBuilder.AppendLine(e.Data);
|
||||
}
|
||||
};
|
||||
|
||||
if (!process.Start())
|
||||
{
|
||||
throw new GhidriffException("Failed to start Python process");
|
||||
}
|
||||
|
||||
process.BeginOutputReadLine();
|
||||
process.BeginErrorReadLine();
|
||||
|
||||
        using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            await process.WaitForExitAsync(linkedCts.Token);
        }
        catch (OperationCanceledException)
        {
            // Kill the child process on timeout or caller cancellation so it
            // cannot outlive this call.
            try
            {
                process.Kill(entireProcessTree: true);
            }
            catch
            {
                // Best effort
            }

            if (timeoutCts.IsCancellationRequested && !ct.IsCancellationRequested)
            {
                throw new GhidriffException($"ghidriff timed out after {timeoutSeconds} seconds");
            }

            throw;
        }

        return new ProcessResult(
            process.ExitCode,
            stdoutBuilder.ToString(),
            stderrBuilder.ToString());
    }

    private string GetPythonPath()
    {
        if (!string.IsNullOrEmpty(_options.PythonPath))
        {
            return _options.PythonPath;
        }

        // Fall back to the conventional interpreter name for the platform
        return RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "python" : "python3";
    }

    private async Task<GhidriffResult> ParseOutputAsync(
        string outputDir,
        string oldBinaryPath,
        string newBinaryPath,
        DateTimeOffset startTime,
        CancellationToken ct)
    {
        var jsonPath = Path.Combine(outputDir, "diff.json");

        if (!File.Exists(jsonPath))
        {
            // Try alternate paths
            var jsonFiles = Directory.GetFiles(outputDir, "*.json", SearchOption.AllDirectories);
            if (jsonFiles.Length > 0)
            {
                jsonPath = jsonFiles[0];
            }
            else
            {
                _logger.LogWarning("No JSON output found in {OutputDir}", outputDir);
                return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime);
            }
        }

        var json = await File.ReadAllTextAsync(jsonPath, ct);

        // Calculate hashes
        var oldHash = await ComputeFileHashAsync(oldBinaryPath, ct);
        var newHash = await ComputeFileHashAsync(newBinaryPath, ct);

        return ParseJsonResult(json, oldHash, newHash, oldBinaryPath, newBinaryPath, startTime);
    }

    private GhidriffResult ParseJsonResult(
        string json,
        string oldHash,
        string newHash,
        string oldBinaryPath,
        string newBinaryPath,
        DateTimeOffset startTime)
    {
        try
        {
            var data = JsonSerializer.Deserialize<GhidriffJsonOutput>(json, JsonOptions);

            if (data is null)
            {
                return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime, json);
            }

            var added = data.AddedFunctions?.Select(f => new GhidriffFunction(
                f.Name ?? "unknown",
                ParseAddress(f.Address),
                f.Size,
                f.Signature,
                f.DecompiledCode
            )).ToImmutableArray() ?? [];

            var removed = data.RemovedFunctions?.Select(f => new GhidriffFunction(
                f.Name ?? "unknown",
                ParseAddress(f.Address),
                f.Size,
                f.Signature,
                f.DecompiledCode
            )).ToImmutableArray() ?? [];

            var modified = data.ModifiedFunctions?.Select(f => new GhidriffDiff(
                f.Name ?? "unknown",
                ParseAddress(f.OldAddress),
                ParseAddress(f.NewAddress),
                f.OldSize,
                f.NewSize,
                f.OldSignature,
                f.NewSignature,
                f.Similarity,
                f.OldDecompiledCode,
                f.NewDecompiledCode,
                f.InstructionChanges?.ToImmutableArray() ?? []
            )).ToImmutableArray() ?? [];

            var duration = _timeProvider.GetUtcNow() - startTime;

            var stats = new GhidriffStats(
                data.Statistics?.TotalOldFunctions ?? 0,
                data.Statistics?.TotalNewFunctions ?? 0,
                added.Length,
                removed.Length,
                modified.Length,
                data.Statistics?.UnchangedCount ?? 0,
                duration);

            return new GhidriffResult(
                oldHash,
                newHash,
                Path.GetFileName(oldBinaryPath),
                Path.GetFileName(newBinaryPath),
                added,
                removed,
                modified,
                stats,
                json);
        }
        catch (JsonException ex)
        {
            _logger.LogWarning(ex, "Failed to parse ghidriff JSON output");
            return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime, json);
        }
    }

    private GhidriffResult CreateEmptyResult(
        string oldBinaryPath,
        string newBinaryPath,
        DateTimeOffset startTime,
        string rawJson = "")
    {
        var duration = _timeProvider.GetUtcNow() - startTime;

        return new GhidriffResult(
            "",
            "",
            Path.GetFileName(oldBinaryPath),
            Path.GetFileName(newBinaryPath),
            [],
            [],
            [],
            new GhidriffStats(0, 0, 0, 0, 0, 0, duration),
            rawJson);
    }

    private static ulong ParseAddress(string? address)
    {
        if (string.IsNullOrEmpty(address))
        {
            return 0;
        }

        if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
        {
            address = address[2..];
        }

        return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
            ? result
            : 0;
    }

    private static async Task<string> ComputeFileHashAsync(string path, CancellationToken ct)
    {
        await using var stream = File.OpenRead(path);
        var hash = await SHA256.HashDataAsync(stream, ct);
        return Convert.ToHexStringLower(hash);
    }

    private async Task<string> SaveStreamToTempFileAsync(Stream stream, string prefix, CancellationToken ct)
    {
        var path = Path.Combine(
            _options.WorkDir,
            $"{prefix}_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");

        Directory.CreateDirectory(Path.GetDirectoryName(path)!);

        await using var fileStream = File.Create(path);
        await stream.CopyToAsync(fileStream, ct);

        return path;
    }

    private void TryDeleteFile(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception ex)
        {
            _logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
        }
    }

    private static string QuoteArg(string arg)
    {
        if (arg.Contains(' ', StringComparison.Ordinal) || arg.Contains('"', StringComparison.Ordinal))
        {
            return $"\"{arg.Replace("\"", "\\\"")}\"";
        }

        return arg;
    }

    private static string GenerateJsonReport(GhidriffResult result)
    {
        return JsonSerializer.Serialize(result, new JsonSerializerOptions
        {
            WriteIndented = true,
            PropertyNamingPolicy = JsonNamingPolicy.CamelCase
        });
    }

    private static string GenerateMarkdownReport(GhidriffResult result)
    {
        var sb = new StringBuilder();

        sb.AppendLine("# Binary Diff Report");
        sb.AppendLine();
        sb.AppendLine($"**Old Binary:** {result.OldBinaryName} (`{result.OldBinaryHash}`)");
        sb.AppendLine($"**New Binary:** {result.NewBinaryName} (`{result.NewBinaryHash}`)");
        sb.AppendLine();
        sb.AppendLine("## Summary");
        sb.AppendLine();
        sb.AppendLine("| Metric | Count |");
        sb.AppendLine("|--------|-------|");
        sb.AppendLine($"| Functions Added | {result.Statistics.AddedCount} |");
        sb.AppendLine($"| Functions Removed | {result.Statistics.RemovedCount} |");
        sb.AppendLine($"| Functions Modified | {result.Statistics.ModifiedCount} |");
        sb.AppendLine($"| Functions Unchanged | {result.Statistics.UnchangedCount} |");
        sb.AppendLine();

        if (result.AddedFunctions.Length > 0)
        {
            sb.AppendLine("## Added Functions");
            sb.AppendLine();
            foreach (var func in result.AddedFunctions)
            {
                sb.AppendLine($"- `{func.Name}` at 0x{func.Address:X}");
            }
            sb.AppendLine();
        }

        if (result.RemovedFunctions.Length > 0)
        {
            sb.AppendLine("## Removed Functions");
            sb.AppendLine();
            foreach (var func in result.RemovedFunctions)
            {
                sb.AppendLine($"- `{func.Name}` at 0x{func.Address:X}");
            }
            sb.AppendLine();
        }

        if (result.ModifiedFunctions.Length > 0)
        {
            sb.AppendLine("## Modified Functions");
            sb.AppendLine();
            foreach (var func in result.ModifiedFunctions)
            {
                sb.AppendLine($"### {func.FunctionName}");
                sb.AppendLine($"- Similarity: {func.Similarity:P1}");
                sb.AppendLine($"- Old: 0x{func.OldAddress:X} ({func.OldSize} bytes)");
                sb.AppendLine($"- New: 0x{func.NewAddress:X} ({func.NewSize} bytes)");
                sb.AppendLine();
            }
        }

        return sb.ToString();
    }

    private static string GenerateHtmlReport(GhidriffResult result)
    {
        var sb = new StringBuilder();

        sb.AppendLine("<!DOCTYPE html>");
        sb.AppendLine("<html><head><title>Binary Diff Report</title>");
        sb.AppendLine("<style>");
        sb.AppendLine("body { font-family: sans-serif; margin: 20px; }");
        sb.AppendLine("table { border-collapse: collapse; }");
        sb.AppendLine("th, td { border: 1px solid #ccc; padding: 8px; }");
        sb.AppendLine(".added { background: #d4ffd4; }");
        sb.AppendLine(".removed { background: #ffd4d4; }");
        sb.AppendLine(".modified { background: #ffffd4; }");
        sb.AppendLine("</style>");
        sb.AppendLine("</head><body>");
        sb.AppendLine("<h1>Binary Diff Report</h1>");
        sb.AppendLine($"<p><strong>Old:</strong> {result.OldBinaryName}</p>");
        sb.AppendLine($"<p><strong>New:</strong> {result.NewBinaryName}</p>");
        sb.AppendLine("<table>");
        sb.AppendLine("<tr><th>Metric</th><th>Count</th></tr>");
        sb.AppendLine($"<tr class='added'><td>Added</td><td>{result.Statistics.AddedCount}</td></tr>");
        sb.AppendLine($"<tr class='removed'><td>Removed</td><td>{result.Statistics.RemovedCount}</td></tr>");
        sb.AppendLine($"<tr class='modified'><td>Modified</td><td>{result.Statistics.ModifiedCount}</td></tr>");
        sb.AppendLine($"<tr><td>Unchanged</td><td>{result.Statistics.UnchangedCount}</td></tr>");
        sb.AppendLine("</table>");
        sb.AppendLine("</body></html>");

        return sb.ToString();
    }

    private sealed record ProcessResult(int ExitCode, string StandardOutput, string StandardError);

    // JSON DTOs for deserializing ghidriff output
    private sealed record GhidriffJsonOutput
    {
        public List<GhidriffFunctionJson>? AddedFunctions { get; init; }
        public List<GhidriffFunctionJson>? RemovedFunctions { get; init; }
        public List<GhidriffDiffJson>? ModifiedFunctions { get; init; }
        public GhidriffStatsJson? Statistics { get; init; }
    }

    private sealed record GhidriffFunctionJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public int Size { get; init; }
        public string? Signature { get; init; }
        public string? DecompiledCode { get; init; }
    }

    private sealed record GhidriffDiffJson
    {
        public string? Name { get; init; }
        public string? OldAddress { get; init; }
        public string? NewAddress { get; init; }
        public int OldSize { get; init; }
        public int NewSize { get; init; }
        public string? OldSignature { get; init; }
        public string? NewSignature { get; init; }
        public decimal Similarity { get; init; }
        public string? OldDecompiledCode { get; init; }
        public string? NewDecompiledCode { get; init; }
        public List<string>? InstructionChanges { get; init; }
    }

    private sealed record GhidriffStatsJson
    {
        public int TotalOldFunctions { get; init; }
        public int TotalNewFunctions { get; init; }
        public int AddedCount { get; init; }
        public int RemovedCount { get; init; }
        public int ModifiedCount { get; init; }
        public int UnchangedCount { get; init; }
    }
}
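A usage sketch for the bridge above (illustrative only: the DI resolution, the binary paths, and the ambient `provider`/`ct` variables are assumptions, not part of this change):

// Sketch only: resolve the bridge, verify ghidriff is installed, then diff and report.
var bridge = provider.GetRequiredService<GhidriffBridge>();

if (await bridge.IsAvailableAsync(ct))
{
    var diff = await bridge.DiffAsync("builds/v1/app.bin", "builds/v2/app.bin", ct: ct);
    var markdown = await bridge.GenerateReportAsync(diff, GhidriffReportFormat.Markdown, ct);
    await File.WriteAllTextAsync("app-diff.md", markdown, ct);
}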
@@ -0,0 +1,432 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using System.Globalization;
using System.Security.Cryptography;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;

namespace StellaOps.BinaryIndex.Ghidra;

/// <summary>
/// Implementation of <see cref="IVersionTrackingService"/> using Ghidra Version Tracking.
/// </summary>
public sealed class VersionTrackingService : IVersionTrackingService
{
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase
    };

    private readonly GhidraHeadlessManager _headlessManager;
    private readonly GhidraOptions _options;
    private readonly ILogger<VersionTrackingService> _logger;
    private readonly TimeProvider _timeProvider;

    /// <summary>
    /// Creates a new VersionTrackingService.
    /// </summary>
    /// <param name="headlessManager">The Ghidra Headless manager.</param>
    /// <param name="options">Ghidra options.</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="timeProvider">Time provider.</param>
    public VersionTrackingService(
        GhidraHeadlessManager headlessManager,
        IOptions<GhidraOptions> options,
        ILogger<VersionTrackingService> logger,
        TimeProvider timeProvider)
    {
        _headlessManager = headlessManager;
        _options = options.Value;
        _logger = logger;
        _timeProvider = timeProvider;
    }

    /// <inheritdoc />
    public async Task<VersionTrackingResult> TrackVersionsAsync(
        Stream oldBinary,
        Stream newBinary,
        VersionTrackingOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(oldBinary);
        ArgumentNullException.ThrowIfNull(newBinary);

        var oldPath = await SaveStreamToTempFileAsync(oldBinary, "old", ct);
        var newPath = await SaveStreamToTempFileAsync(newBinary, "new", ct);

        try
        {
            return await TrackVersionsAsync(oldPath, newPath, options, ct);
        }
        finally
        {
            TryDeleteFile(oldPath);
            TryDeleteFile(newPath);
        }
    }

    /// <inheritdoc />
    public async Task<VersionTrackingResult> TrackVersionsAsync(
        string oldBinaryPath,
        string newBinaryPath,
        VersionTrackingOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(oldBinaryPath);
        ArgumentException.ThrowIfNullOrEmpty(newBinaryPath);

        if (!File.Exists(oldBinaryPath))
        {
            throw new FileNotFoundException("Old binary not found", oldBinaryPath);
        }

        if (!File.Exists(newBinaryPath))
        {
            throw new FileNotFoundException("New binary not found", newBinaryPath);
        }

        options ??= new VersionTrackingOptions();

        _logger.LogInformation(
            "Starting Version Tracking: {OldBinary} vs {NewBinary}",
            Path.GetFileName(oldBinaryPath),
            Path.GetFileName(newBinaryPath));

        var startTime = _timeProvider.GetUtcNow();

        // Build script arguments for Version Tracking
        var scriptArgs = BuildVersionTrackingArgs(oldBinaryPath, newBinaryPath, options);

        // Run Ghidra with Version Tracking script
        // Note: This assumes a custom VersionTracking.java script that outputs JSON
        var result = await _headlessManager.RunAnalysisAsync(
            oldBinaryPath,
            scriptName: "VersionTracking.java",
            scriptArgs: scriptArgs,
            runAnalysis: true,
            timeoutSeconds: options.TimeoutSeconds,
            ct);

        if (!result.IsSuccess)
        {
            throw new GhidraException($"Version Tracking failed: {result.StandardError}")
            {
                ExitCode = result.ExitCode,
                StandardError = result.StandardError,
                StandardOutput = result.StandardOutput
            };
        }

        var trackingResult = ParseVersionTrackingOutput(
            result.StandardOutput,
            startTime,
            result.Duration);

        _logger.LogInformation(
            "Version Tracking completed: {Matched} matched, {Added} added, {Removed} removed, {Modified} modified",
            trackingResult.Matches.Length,
            trackingResult.AddedFunctions.Length,
            trackingResult.RemovedFunctions.Length,
            trackingResult.ModifiedFunctions.Length);

        return trackingResult;
    }

    private static string[] BuildVersionTrackingArgs(
        string oldBinaryPath,
        string newBinaryPath,
        VersionTrackingOptions options)
    {
        var args = new List<string>
        {
            "-newBinary", newBinaryPath,
            "-minSimilarity", options.MinSimilarity.ToString("F2", CultureInfo.InvariantCulture)
        };

        // Add correlator flags
        foreach (var correlator in options.Correlators)
        {
            args.Add($"-correlator:{GetCorrelatorName(correlator)}");
        }

        if (options.IncludeDecompilation)
        {
            args.Add("-decompile");
        }

        if (options.ComputeDetailedDiffs)
        {
            args.Add("-detailedDiffs");
        }

        return [.. args];
    }

    private static string GetCorrelatorName(CorrelatorType correlator)
    {
        return correlator switch
        {
            CorrelatorType.ExactBytes => "ExactBytesFunctionHasher",
            CorrelatorType.ExactMnemonics => "ExactMnemonicsFunctionHasher",
            CorrelatorType.SymbolName => "SymbolNameMatch",
            CorrelatorType.DataReference => "DataReferenceCorrelator",
            CorrelatorType.CallReference => "CallReferenceCorrelator",
            CorrelatorType.CombinedReference => "CombinedReferenceCorrelator",
            CorrelatorType.BSim => "BSimCorrelator",
            _ => "CombinedReferenceCorrelator"
        };
    }

    private VersionTrackingResult ParseVersionTrackingOutput(
        string output,
        DateTimeOffset startTime,
        TimeSpan duration)
    {
        // Look for JSON output marker
        const string jsonMarker = "###VERSION_TRACKING_JSON###";
        var jsonStart = output.IndexOf(jsonMarker, StringComparison.Ordinal);

        if (jsonStart >= 0)
        {
            var jsonContent = output[(jsonStart + jsonMarker.Length)..].Trim();
            var jsonEnd = jsonContent.IndexOf("###END_VERSION_TRACKING_JSON###", StringComparison.Ordinal);
            if (jsonEnd >= 0)
            {
                jsonContent = jsonContent[..jsonEnd].Trim();
            }

            try
            {
                return ParseJsonOutput(jsonContent, duration);
            }
            catch (JsonException ex)
            {
                _logger.LogWarning(ex, "Failed to parse Version Tracking JSON output");
            }
        }

        // Return empty result if parsing fails
        _logger.LogWarning("No structured Version Tracking output found");
        return CreateEmptyResult(duration);
    }

    private static VersionTrackingResult ParseJsonOutput(string json, TimeSpan duration)
    {
        var data = JsonSerializer.Deserialize<VersionTrackingJsonOutput>(json, JsonOptions)
            ?? throw new GhidraException("Failed to deserialize Version Tracking JSON output");

        var matches = data.Matches?.Select(m => new FunctionMatch(
            m.OldName ?? "unknown",
            ParseAddress(m.OldAddress),
            m.NewName ?? "unknown",
            ParseAddress(m.NewAddress),
            m.Similarity,
            ParseCorrelatorType(m.MatchedBy),
            m.Differences?.Select(d => new MatchDifference(
                ParseDifferenceType(d.Type),
                d.Description ?? "",
                d.OldValue,
                d.NewValue,
                d.Address is not null ? ParseAddress(d.Address) : null
            )).ToImmutableArray() ?? []
        )).ToImmutableArray() ?? [];

        var added = data.AddedFunctions?.Select(f => new FunctionAdded(
            f.Name ?? "unknown",
            ParseAddress(f.Address),
            f.Size,
            f.Signature
        )).ToImmutableArray() ?? [];

        var removed = data.RemovedFunctions?.Select(f => new FunctionRemoved(
            f.Name ?? "unknown",
            ParseAddress(f.Address),
            f.Size,
            f.Signature
        )).ToImmutableArray() ?? [];

        var modified = data.ModifiedFunctions?.Select(f => new FunctionModified(
            f.OldName ?? "unknown",
            ParseAddress(f.OldAddress),
            f.OldSize,
            f.NewName ?? "unknown",
            ParseAddress(f.NewAddress),
            f.NewSize,
            f.Similarity,
            f.Differences?.Select(d => new MatchDifference(
                ParseDifferenceType(d.Type),
                d.Description ?? "",
                d.OldValue,
                d.NewValue,
                d.Address is not null ? ParseAddress(d.Address) : null
            )).ToImmutableArray() ?? [],
            f.OldDecompiled,
            f.NewDecompiled
        )).ToImmutableArray() ?? [];

        var stats = new VersionTrackingStats(
            data.Statistics?.TotalOldFunctions ?? 0,
            data.Statistics?.TotalNewFunctions ?? 0,
            matches.Length,
            added.Length,
            removed.Length,
            modified.Length,
            duration);

        return new VersionTrackingResult(matches, added, removed, modified, stats);
    }

    private static VersionTrackingResult CreateEmptyResult(TimeSpan duration)
    {
        return new VersionTrackingResult(
            [],
            [],
            [],
            [],
            new VersionTrackingStats(0, 0, 0, 0, 0, 0, duration));
    }

    private static ulong ParseAddress(string? address)
    {
        if (string.IsNullOrEmpty(address))
        {
            return 0;
        }

        if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
        {
            address = address[2..];
        }

        return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
            ? result
            : 0;
    }

    private static CorrelatorType ParseCorrelatorType(string? correlator)
    {
        return correlator?.ToUpperInvariant() switch
        {
            "EXACTBYTES" or "EXACTBYTESFUNCTIONHASHER" => CorrelatorType.ExactBytes,
            "EXACTMNEMONICS" or "EXACTMNEMONICSFUNCTIONHASHER" => CorrelatorType.ExactMnemonics,
            "SYMBOLNAME" or "SYMBOLNAMEMATCH" => CorrelatorType.SymbolName,
            "DATAREFERENCE" or "DATAREFERENCECORRELATOR" => CorrelatorType.DataReference,
            "CALLREFERENCE" or "CALLREFERENCECORRELATOR" => CorrelatorType.CallReference,
            "COMBINEDREFERENCE" or "COMBINEDREFERENCECORRELATOR" => CorrelatorType.CombinedReference,
            "BSIM" or "BSIMCORRELATOR" => CorrelatorType.BSim,
            _ => CorrelatorType.CombinedReference
        };
    }

    private static DifferenceType ParseDifferenceType(string? type)
    {
        return type?.ToUpperInvariant() switch
        {
            "INSTRUCTIONADDED" => DifferenceType.InstructionAdded,
            "INSTRUCTIONREMOVED" => DifferenceType.InstructionRemoved,
            "INSTRUCTIONCHANGED" => DifferenceType.InstructionChanged,
            "BRANCHTARGETCHANGED" => DifferenceType.BranchTargetChanged,
            "CALLTARGETCHANGED" => DifferenceType.CallTargetChanged,
            "CONSTANTCHANGED" => DifferenceType.ConstantChanged,
            "SIZECHANGED" => DifferenceType.SizeChanged,
            "STACKFRAMECHANGED" => DifferenceType.StackFrameChanged,
            "REGISTERUSAGECHANGED" => DifferenceType.RegisterUsageChanged,
            _ => DifferenceType.InstructionChanged
        };
    }

    private async Task<string> SaveStreamToTempFileAsync(Stream stream, string prefix, CancellationToken ct)
    {
        var path = Path.Combine(
            _options.WorkDir,
            $"{prefix}_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");

        Directory.CreateDirectory(Path.GetDirectoryName(path)!);

        await using var fileStream = File.Create(path);
        await stream.CopyToAsync(fileStream, ct);

        return path;
    }

    private void TryDeleteFile(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception ex)
        {
            _logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
        }
    }

    // JSON DTOs for deserialization
    private sealed record VersionTrackingJsonOutput
    {
        public List<FunctionMatchJson>? Matches { get; init; }
        public List<FunctionInfoJson>? AddedFunctions { get; init; }
        public List<FunctionInfoJson>? RemovedFunctions { get; init; }
        public List<FunctionModifiedJson>? ModifiedFunctions { get; init; }
        public VersionTrackingStatsJson? Statistics { get; init; }
    }

    private sealed record FunctionMatchJson
    {
        public string? OldName { get; init; }
        public string? OldAddress { get; init; }
        public string? NewName { get; init; }
        public string? NewAddress { get; init; }
        public decimal Similarity { get; init; }
        public string? MatchedBy { get; init; }
        public List<DifferenceJson>? Differences { get; init; }
    }

    private sealed record FunctionInfoJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public int Size { get; init; }
        public string? Signature { get; init; }
    }

    private sealed record FunctionModifiedJson
    {
        public string? OldName { get; init; }
        public string? OldAddress { get; init; }
        public int OldSize { get; init; }
        public string? NewName { get; init; }
        public string? NewAddress { get; init; }
        public int NewSize { get; init; }
        public decimal Similarity { get; init; }
        public List<DifferenceJson>? Differences { get; init; }
        public string? OldDecompiled { get; init; }
        public string? NewDecompiled { get; init; }
    }

    private sealed record DifferenceJson
    {
        public string? Type { get; init; }
        public string? Description { get; init; }
        public string? OldValue { get; init; }
        public string? NewValue { get; init; }
        public string? Address { get; init; }
    }

    private sealed record VersionTrackingStatsJson
    {
        public int TotalOldFunctions { get; init; }
        public int TotalNewFunctions { get; init; }
        public int MatchedCount { get; init; }
        public int AddedCount { get; init; }
        public int RemovedCount { get; init; }
        public int ModifiedCount { get; init; }
    }
}
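A corresponding usage sketch for the service above (hedged: `VersionTrackingOptions` is assumed to expose init-style setters matching the properties read in BuildVersionTrackingArgs, and the match property names are inferred from the record construction in ParseJsonOutput):

// Sketch only: run Version Tracking with explicit correlators and list imperfect matches.
var vt = provider.GetRequiredService<IVersionTrackingService>();

var result = await vt.TrackVersionsAsync(
    "old/libcrypto.so",
    "new/libcrypto.so",
    new VersionTrackingOptions
    {
        MinSimilarity = 0.85m,
        Correlators = [CorrelatorType.ExactBytes, CorrelatorType.CombinedReference],
        IncludeDecompilation = true
    },
    ct);

foreach (var match in result.Matches.Where(m => m.Similarity < 1.0m))
{
    Console.WriteLine($"{match.OldName} -> {match.NewName} ({match.Similarity:P1})");
}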
@@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <LangVersion>preview</LangVersion>
    <GenerateDocumentationFile>true</GenerateDocumentationFile>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <Description>Ghidra integration for StellaOps BinaryIndex. Provides Version Tracking, BSim, and ghidriff capabilities as a fallback disassembly backend.</Description>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Contracts\StellaOps.BinaryIndex.Contracts.csproj" />
  </ItemGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
    <PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
    <PackageReference Include="Microsoft.Extensions.Options.DataAnnotations" />
  </ItemGroup>
</Project>
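The package references above line up with a composition-root registration along these lines (a sketch: the configuration section names are assumptions, and no AddXxx extension ships in this diff):

// Sketch only: wire the Ghidra services at the composition root.
services.AddOptions<GhidraOptions>()
    .BindConfiguration("BinaryIndex:Ghidra")
    .ValidateDataAnnotations();
services.AddOptions<GhidriffOptions>()
    .BindConfiguration("BinaryIndex:Ghidriff")
    .ValidateDataAnnotations();

services.AddSingleton(TimeProvider.System);
services.AddSingleton<GhidraHeadlessManager>();
services.AddSingleton<GhidriffBridge>();
services.AddSingleton<IVersionTrackingService, VersionTrackingService>();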
@@ -0,0 +1,269 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;
using System.Text.RegularExpressions;

namespace StellaOps.BinaryIndex.ML;

/// <summary>
/// Tokenizer for binary/decompiled code using regex-based, word-level tokenization.
/// </summary>
public sealed partial class BinaryCodeTokenizer : ITokenizer
{
    private readonly ImmutableDictionary<string, long> _vocabulary;
    private readonly long _padToken;
    private readonly long _unkToken;
    private readonly long _clsToken;
    private readonly long _sepToken;

    // Default special token IDs, used when the vocabulary file does not define them
    // (CodeBERT-style <cls>/<sep>/<pad>/<unk> roles, not CodeBERT's exact IDs)
    private const long DefaultPadToken = 0;
    private const long DefaultUnkToken = 1;
    private const long DefaultClsToken = 2;
    private const long DefaultSepToken = 3;

    public BinaryCodeTokenizer(string? vocabularyPath = null)
    {
        if (!string.IsNullOrEmpty(vocabularyPath) && File.Exists(vocabularyPath))
        {
            _vocabulary = LoadVocabulary(vocabularyPath);
            _padToken = _vocabulary.GetValueOrDefault("<pad>", DefaultPadToken);
            _unkToken = _vocabulary.GetValueOrDefault("<unk>", DefaultUnkToken);
            _clsToken = _vocabulary.GetValueOrDefault("<cls>", DefaultClsToken);
            _sepToken = _vocabulary.GetValueOrDefault("<sep>", DefaultSepToken);
        }
        else
        {
            // Use default vocabulary for testing
            _vocabulary = CreateDefaultVocabulary();
            _padToken = DefaultPadToken;
            _unkToken = DefaultUnkToken;
            _clsToken = DefaultClsToken;
            _sepToken = DefaultSepToken;
        }
    }

    /// <inheritdoc />
    public long[] Tokenize(string text, int maxLength = 512)
    {
        var (inputIds, _) = TokenizeWithMask(text, maxLength);
        return inputIds;
    }

    /// <inheritdoc />
    public (long[] InputIds, long[] AttentionMask) TokenizeWithMask(string text, int maxLength = 512)
    {
        ArgumentException.ThrowIfNullOrEmpty(text);

        var tokens = TokenizeText(text);
        var inputIds = new long[maxLength];
        var attentionMask = new long[maxLength];

        // Add [CLS] token
        inputIds[0] = _clsToken;
        attentionMask[0] = 1;

        var position = 1;
        foreach (var token in tokens)
        {
            if (position >= maxLength - 1)
            {
                break;
            }

            inputIds[position] = _vocabulary.GetValueOrDefault(token.ToLowerInvariant(), _unkToken);
            attentionMask[position] = 1;
            position++;
        }

        // Add [SEP] token
        if (position < maxLength)
        {
            inputIds[position] = _sepToken;
            attentionMask[position] = 1;
            position++;
        }

        // Pad remaining positions
        for (var i = position; i < maxLength; i++)
        {
            inputIds[i] = _padToken;
            attentionMask[i] = 0;
        }

        return (inputIds, attentionMask);
    }

    /// <inheritdoc />
    public string Decode(long[] tokenIds)
    {
        ArgumentNullException.ThrowIfNull(tokenIds);

        var reverseVocab = _vocabulary.ToImmutableDictionary(kv => kv.Value, kv => kv.Key);
        var tokens = new List<string>();

        foreach (var id in tokenIds)
        {
            if (id == _padToken || id == _clsToken || id == _sepToken)
            {
                continue;
            }

            tokens.Add(reverseVocab.GetValueOrDefault(id, "<unk>"));
        }

        return string.Join(" ", tokens);
    }

    private IEnumerable<string> TokenizeText(string text)
    {
        // Normalize whitespace
        text = WhitespaceRegex().Replace(text, " ");

        // Split on operators and punctuation, keeping them as tokens
        var tokens = new List<string>();
        var matches = TokenRegex().Matches(text);

        foreach (Match match in matches)
        {
            var token = match.Value.Trim();
            if (!string.IsNullOrEmpty(token))
            {
                tokens.Add(token);
            }
        }

        return tokens;
    }

    private static ImmutableDictionary<string, long> LoadVocabulary(string path)
    {
        var vocabulary = new Dictionary<string, long>();
        var lines = File.ReadAllLines(path);

        for (var i = 0; i < lines.Length; i++)
        {
            var token = lines[i].Trim();
            if (!string.IsNullOrEmpty(token))
            {
                vocabulary[token] = i;
            }
        }

        return vocabulary.ToImmutableDictionary();
    }

    private static ImmutableDictionary<string, long> CreateDefaultVocabulary()
    {
        // Basic vocabulary for testing without a trained model
        var vocab = new Dictionary<string, long>
        {
            // Special tokens
            ["<pad>"] = 0,
            ["<unk>"] = 1,
            ["<cls>"] = 2,
            ["<sep>"] = 3,

            // Keywords
            ["void"] = 10,
            ["int"] = 11,
            ["char"] = 12,
            ["short"] = 13,
            ["long"] = 14,
            ["float"] = 15,
            ["double"] = 16,
            ["unsigned"] = 17,
            ["signed"] = 18,
            ["const"] = 19,
            ["static"] = 20,
            ["extern"] = 21,
            ["return"] = 22,
            ["if"] = 23,
            ["else"] = 24,
            ["while"] = 25,
            ["for"] = 26,
            ["do"] = 27,
            ["switch"] = 28,
            ["case"] = 29,
            ["default"] = 30,
            ["break"] = 31,
            ["continue"] = 32,
            ["goto"] = 33,
            ["sizeof"] = 34,
            ["struct"] = 35,
            ["union"] = 36,
            ["enum"] = 37,
            ["typedef"] = 38,

            // Operators
            ["+"] = 50,
            ["-"] = 51,
            ["*"] = 52,
            ["/"] = 53,
            ["%"] = 54,
            ["="] = 55,
            ["=="] = 56,
            ["!="] = 57,
            ["<"] = 58,
            [">"] = 59,
            ["<="] = 60,
            [">="] = 61,
            ["&&"] = 62,
            ["||"] = 63,
            ["!"] = 64,
            ["&"] = 65,
            ["|"] = 66,
            ["^"] = 67,
            ["~"] = 68,
            ["<<"] = 69,
            [">>"] = 70,
            ["++"] = 71,
            ["--"] = 72,
            ["->"] = 73,
            ["."] = 74,

            // Punctuation
            ["("] = 80,
            [")"] = 81,
            ["{"] = 82,
            ["}"] = 83,
            ["["] = 84,
            ["]"] = 85,
            [";"] = 86,
            [","] = 87,
            [":"] = 88,

            // Common Ghidra types
            ["undefined"] = 100,
            ["undefined1"] = 101,
            ["undefined2"] = 102,
            ["undefined4"] = 103,
            ["undefined8"] = 104,
            ["byte"] = 105,
            ["word"] = 106,
            ["dword"] = 107,
            ["qword"] = 108,
            ["bool"] = 109,

            // Common functions
            ["malloc"] = 200,
            ["free"] = 201,
            ["memcpy"] = 202,
            ["memset"] = 203,
            ["strlen"] = 204,
            ["strcpy"] = 205,
            ["strcmp"] = 206,
            ["printf"] = 207,
            ["sprintf"] = 208
        };

        return vocab.ToImmutableDictionary();
    }

    [GeneratedRegex(@"\s+")]
    private static partial Regex WhitespaceRegex();

    [GeneratedRegex(@"([a-zA-Z_][a-zA-Z0-9_]*|0[xX][0-9a-fA-F]+|\d+|""[^""]*""|'[^']*'|[+\-*/%=<>!&|^~]+|[(){}\[\];,.:])")]
    private static partial Regex TokenRegex();
}
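A worked example of the default-vocabulary path (the IDs derive from the table above; the snippet itself is illustrative):

// Sketch only: tokenize a decompiled fragment with the built-in test vocabulary.
var tokenizer = new BinaryCodeTokenizer();
var (ids, mask) = tokenizer.TokenizeWithMask("int x = malloc(4);", maxLength: 16);

// ids: [2, 11, 1, 55, 200, 80, 1, 81, 86, 3, 0, 0, 0, 0, 0, 0]
//       <cls> int <unk> = malloc ( <unk> ) ; <sep> <pad>...
// "x" and "4" are not in the vocabulary, so both map to <unk> (1);
// mask is 1 through the <sep> position and 0 over the padding.
Console.WriteLine(tokenizer.Decode(ids)); // int <unk> = malloc ( <unk> ) ;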
@@ -0,0 +1,174 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.

using System.Collections.Immutable;

namespace StellaOps.BinaryIndex.ML;

/// <summary>
/// Service for generating and comparing function embeddings.
/// </summary>
public interface IEmbeddingService
{
    /// <summary>
    /// Generate embedding vector for a function.
    /// </summary>
    /// <param name="input">Function input data.</param>
    /// <param name="options">Embedding options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Function embedding with vector.</returns>
    Task<FunctionEmbedding> GenerateEmbeddingAsync(
        EmbeddingInput input,
        EmbeddingOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate embeddings for multiple functions in batch.
    /// </summary>
    /// <param name="inputs">Function inputs.</param>
    /// <param name="options">Embedding options.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Function embeddings.</returns>
    Task<ImmutableArray<FunctionEmbedding>> GenerateBatchAsync(
        IEnumerable<EmbeddingInput> inputs,
        EmbeddingOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Compute similarity between two embeddings.
    /// </summary>
    /// <param name="a">First embedding.</param>
    /// <param name="b">Second embedding.</param>
    /// <param name="metric">Similarity metric to use.</param>
    /// <returns>Similarity score (0.0 to 1.0).</returns>
    decimal ComputeSimilarity(
        FunctionEmbedding a,
        FunctionEmbedding b,
        SimilarityMetric metric = SimilarityMetric.Cosine);

    /// <summary>
    /// Find similar functions in an embedding index.
    /// </summary>
    /// <param name="query">Query embedding.</param>
    /// <param name="topK">Number of results to return.</param>
    /// <param name="minSimilarity">Minimum similarity threshold.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions sorted by similarity.</returns>
    Task<ImmutableArray<EmbeddingMatch>> FindSimilarAsync(
        FunctionEmbedding query,
        int topK = 10,
        decimal minSimilarity = 0.7m,
        CancellationToken ct = default);
}

/// <summary>
/// Service for training ML models.
/// </summary>
public interface IModelTrainingService
{
    /// <summary>
    /// Train embedding model on function pairs.
    /// </summary>
    /// <param name="trainingData">Training pairs.</param>
    /// <param name="options">Training options.</param>
    /// <param name="progress">Optional progress reporter.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Training result.</returns>
    Task<TrainingResult> TrainAsync(
        IAsyncEnumerable<TrainingPair> trainingData,
        TrainingOptions options,
        IProgress<TrainingProgress>? progress = null,
        CancellationToken ct = default);

    /// <summary>
    /// Evaluate model on test data.
    /// </summary>
    /// <param name="testData">Test pairs.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Evaluation metrics.</returns>
    Task<EvaluationResult> EvaluateAsync(
        IAsyncEnumerable<TrainingPair> testData,
        CancellationToken ct = default);

    /// <summary>
    /// Export trained model to specified format.
    /// </summary>
    /// <param name="outputPath">Output path for model.</param>
    /// <param name="format">Export format.</param>
    /// <param name="ct">Cancellation token.</param>
    Task ExportModelAsync(
        string outputPath,
        ModelExportFormat format = ModelExportFormat.Onnx,
        CancellationToken ct = default);
}
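
// Usage sketch for the training service (illustrative only: LoadPairsAsync and the
// TrainingOptions members hinted at below are hypothetical, and any
// IModelTrainingService implementation is supplied elsewhere):
//
//   var training = provider.GetRequiredService<IModelTrainingService>();
//   var result = await training.TrainAsync(
//       LoadPairsAsync("corpus/train.jsonl"),                 // hypothetical loader
//       new TrainingOptions { /* epochs, batch size, ... */ },
//       new Progress<TrainingProgress>(p => Console.WriteLine(p)),
//       ct);
//   await training.ExportModelAsync("models/binsim.onnx", ModelExportFormat.Onnx, ct);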
/// <summary>
/// Tokenizer for converting code to token sequences.
/// </summary>
public interface ITokenizer
{
    /// <summary>
    /// Tokenize text into token IDs.
    /// </summary>
    /// <param name="text">Input text.</param>
    /// <param name="maxLength">Maximum sequence length.</param>
    /// <returns>Token ID array.</returns>
    long[] Tokenize(string text, int maxLength = 512);

    /// <summary>
    /// Tokenize with attention mask.
    /// </summary>
    /// <param name="text">Input text.</param>
    /// <param name="maxLength">Maximum sequence length.</param>
    /// <returns>Token IDs and attention mask.</returns>
    (long[] InputIds, long[] AttentionMask) TokenizeWithMask(string text, int maxLength = 512);

    /// <summary>
    /// Decode token IDs back to text.
    /// </summary>
    /// <param name="tokenIds">Token IDs.</param>
    /// <returns>Decoded text.</returns>
    string Decode(long[] tokenIds);
}

/// <summary>
/// Index for efficient embedding similarity search.
/// </summary>
public interface IEmbeddingIndex
{
    /// <summary>
    /// Add embedding to index.
    /// </summary>
    /// <param name="embedding">Embedding to add.</param>
    /// <param name="ct">Cancellation token.</param>
    Task AddAsync(FunctionEmbedding embedding, CancellationToken ct = default);

    /// <summary>
    /// Add multiple embeddings to index.
    /// </summary>
    /// <param name="embeddings">Embeddings to add.</param>
    /// <param name="ct">Cancellation token.</param>
    Task AddBatchAsync(IEnumerable<FunctionEmbedding> embeddings, CancellationToken ct = default);

    /// <summary>
    /// Search for similar embeddings.
    /// </summary>
    /// <param name="query">Query vector.</param>
    /// <param name="topK">Number of results.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Similar embeddings with scores.</returns>
    Task<ImmutableArray<(FunctionEmbedding Embedding, decimal Similarity)>> SearchAsync(
        float[] query,
        int topK,
        CancellationToken ct = default);

    /// <summary>
    /// Get total count of indexed embeddings.
    /// </summary>
    int Count { get; }

    /// <summary>
    /// Clear all embeddings from index.
    /// </summary>
    void Clear();
}
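For reference, the cosine metric named by SimilarityMetric.Cosine reduces to the computation below; mapping the raw cosine from [-1, 1] onto the 0.0-1.0 range documented on ComputeSimilarity is an implementation assumption, since no concrete IEmbeddingService ships in this diff:

// Sketch only: cosine similarity over raw embedding vectors, rescaled to [0, 1].
static decimal CosineSimilarity(ReadOnlySpan<float> a, ReadOnlySpan<float> b)
{
    if (a.Length != b.Length)
    {
        throw new ArgumentException("Embedding dimensions must match.");
    }

    double dot = 0, normA = 0, normB = 0;
    for (var i = 0; i < a.Length; i++)
    {
        dot += (double)a[i] * b[i];
        normA += (double)a[i] * a[i];
        normB += (double)b[i] * b[i];
    }

    if (normA == 0 || normB == 0)
    {
        return 0m; // zero vectors carry no signal
    }

    var cosine = dot / (Math.Sqrt(normA) * Math.Sqrt(normB));
    return (decimal)((cosine + 1.0) / 2.0); // rescale [-1, 1] -> [0, 1]
}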