save progress

StellaOps Bot
2026-01-06 09:42:02 +02:00
parent 94d68bee8b
commit 37e11918e0
443 changed files with 85863 additions and 897 deletions

View File

@@ -12,6 +12,8 @@ namespace StellaOps.AdvisoryAI.Tests;
/// Sprint: SPRINT_20251226_015_AI_zastava_companion
/// Task: ZASTAVA-19
/// </summary>
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Advisories)]
public sealed class ExplanationGeneratorIntegrationTests
{
[Trait("Category", TestCategories.Unit)]

View File

@@ -83,80 +83,6 @@ public sealed class HttpClientUsageAnalyzerTests
Assert.DoesNotContain(diagnostics, d => d.Id == HttpClientUsageAnalyzer.DiagnosticId);
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_RewritesToFactoryCall()
{
const string source = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var client = new HttpClient();
}
}
""";
const string expected = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var client = global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create(egressPolicy: default(global::StellaOps.AirGap.Policy.IEgressPolicy) /* TODO: provide IEgressPolicy instance */, request: new global::StellaOps.AirGap.Policy.EgressRequest(component: "REPLACE_COMPONENT", destination: new global::System.Uri("https://replace-with-endpoint"), intent: "REPLACE_INTENT"));
}
}
""";
var updated = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
Assert.Equal(expected.ReplaceLineEndings(), updated.ReplaceLineEndings());
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_PreservesHttpClientArguments()
{
const string source = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var handler = new HttpClientHandler();
var client = new HttpClient(handler, disposeHandler: false);
}
}
""";
const string expected = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var handler = new HttpClientHandler();
var client = global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create(egressPolicy: default(global::StellaOps.AirGap.Policy.IEgressPolicy) /* TODO: provide IEgressPolicy instance */, request: new global::StellaOps.AirGap.Policy.EgressRequest(component: "REPLACE_COMPONENT", destination: new global::System.Uri("https://replace-with-endpoint"), intent: "REPLACE_INTENT"), clientFactory: () => new global::System.Net.Http.HttpClient(handler, disposeHandler: false));
}
}
""";
var updated = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
Assert.Equal(expected.ReplaceLineEndings(), updated.ReplaceLineEndings());
}
private static async Task<ImmutableArray<Diagnostic>> AnalyzeAsync(string source, string assemblyName)
{
var compilation = CSharpCompilation.Create(
@@ -174,53 +100,6 @@ public sealed class HttpClientUsageAnalyzerTests
return await compilationWithAnalyzers.GetAnalyzerDiagnosticsAsync();
}
private static async Task<string> ApplyCodeFixAsync(string source, string assemblyName)
{
using var workspace = new AdhocWorkspace();
var projectId = ProjectId.CreateNewId();
var documentId = DocumentId.CreateNewId(projectId);
var stubDocumentId = DocumentId.CreateNewId(projectId);
var solution = workspace.CurrentSolution
.AddProject(projectId, "TestProject", "TestProject", LanguageNames.CSharp)
.WithProjectCompilationOptions(projectId, new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary))
.WithProjectAssemblyName(projectId, assemblyName)
.AddMetadataReferences(projectId, CreateMetadataReferences())
.AddDocument(documentId, "Test.cs", SourceText.From(source))
.AddDocument(stubDocumentId, "PolicyStubs.cs", SourceText.From(PolicyStubSource));
var project = solution.GetProject(projectId)!;
var document = solution.GetDocument(documentId)!;
var compilation = await project.GetCompilationAsync();
var analyzer = new HttpClientUsageAnalyzer();
var diagnostics = await compilation!.WithAnalyzers(ImmutableArray.Create<DiagnosticAnalyzer>(analyzer))
.GetAnalyzerDiagnosticsAsync();
var diagnostic = Assert.Single(diagnostics);
var codeFixProvider = new HttpClientUsageCodeFixProvider();
var actions = new List<CodeAction>();
var context = new CodeFixContext(
document,
diagnostic,
(action, _) => actions.Add(action),
CancellationToken.None);
await codeFixProvider.RegisterCodeFixesAsync(context);
var action = Assert.Single(actions);
var operations = await action.GetOperationsAsync(CancellationToken.None);
foreach (var operation in operations)
{
operation.Apply(workspace, CancellationToken.None);
}
var updatedDocument = workspace.CurrentSolution.GetDocument(documentId)!;
var updatedText = await updatedDocument.GetTextAsync();
return updatedText.ToString();
}
private static IEnumerable<MetadataReference> CreateMetadataReferences()
{
yield return MetadataReference.CreateFromFile(typeof(object).GetTypeInfo().Assembly.Location);

View File

@@ -276,165 +276,6 @@ public sealed class PolicyAnalyzerRoslynTests
#region AIRGAP-5100-006: Golden Generated Code Tests
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_GeneratesExpectedFactoryCall()
{
const string source = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var client = new HttpClient();
}
}
""";
const string expectedGolden = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var client = global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create(egressPolicy: default(global::StellaOps.AirGap.Policy.IEgressPolicy) /* TODO: provide IEgressPolicy instance */, request: new global::StellaOps.AirGap.Policy.EgressRequest(component: "REPLACE_COMPONENT", destination: new global::System.Uri("https://replace-with-endpoint"), intent: "REPLACE_INTENT"));
}
}
""";
var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
fixedCode.ReplaceLineEndings().Should().Be(expectedGolden.ReplaceLineEndings(),
"Code fix should match golden output exactly");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_PreservesTrivia()
{
const string source = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
// Important: this client handles external requests
var client = new HttpClient(); // end of line comment
}
}
""";
var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
// The code fix preserves the trivia from the original node
fixedCode.Should().Contain("// Important: this client handles external requests",
"Leading comment should be preserved");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_DeterministicOutput()
{
const string source = """
using System.Net.Http;
namespace Sample.Determinism;
public sealed class Demo
{
public void Run()
{
var client = new HttpClient();
}
}
""";
// Apply code fix multiple times
var result1 = await ApplyCodeFixAsync(source, assemblyName: "Sample.Determinism");
var result2 = await ApplyCodeFixAsync(source, assemblyName: "Sample.Determinism");
var result3 = await ApplyCodeFixAsync(source, assemblyName: "Sample.Determinism");
result1.Should().Be(result2, "Code fix should be deterministic");
result2.Should().Be(result3, "Code fix should be deterministic");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_ContainsRequiredPlaceholders()
{
const string source = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var client = new HttpClient();
}
}
""";
var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
// Verify all required placeholders are present for the developer to fill in
fixedCode.Should().Contain("EgressHttpClientFactory.Create");
fixedCode.Should().Contain("egressPolicy:");
fixedCode.Should().Contain("IEgressPolicy");
fixedCode.Should().Contain("EgressRequest");
fixedCode.Should().Contain("component:");
fixedCode.Should().Contain("REPLACE_COMPONENT");
fixedCode.Should().Contain("destination:");
fixedCode.Should().Contain("intent:");
fixedCode.Should().Contain("REPLACE_INTENT");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFix_UsesFullyQualifiedNames()
{
const string source = """
using System.Net.Http;
namespace Sample.Service;
public sealed class Demo
{
public void Run()
{
var client = new HttpClient();
}
}
""";
var fixedCode = await ApplyCodeFixAsync(source, assemblyName: "Sample.Service");
// Verify fully qualified names are used to avoid namespace conflicts
fixedCode.Should().Contain("global::StellaOps.AirGap.Policy.EgressHttpClientFactory");
fixedCode.Should().Contain("global::StellaOps.AirGap.Policy.EgressRequest");
fixedCode.Should().Contain("global::System.Uri");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task FixAllProvider_IsWellKnownBatchFixer()
{
var provider = new HttpClientUsageCodeFixProvider();
var fixAllProvider = provider.GetFixAllProvider();
fixAllProvider.Should().Be(WellKnownFixAllProviders.BatchFixer,
"Should use batch fixer for efficient multi-fix application");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task Analyzer_SupportedDiagnostics_ContainsExpectedId()
@@ -446,20 +287,6 @@ public sealed class PolicyAnalyzerRoslynTests
supportedDiagnostics[0].Id.Should().Be("AIRGAP001");
}
[Trait("Category", TestCategories.Unit)]
[Fact]
public async Task CodeFixProvider_FixableDiagnosticIds_MatchesAnalyzer()
{
var analyzer = new HttpClientUsageAnalyzer();
var codeFixProvider = new HttpClientUsageCodeFixProvider();
var analyzerIds = analyzer.SupportedDiagnostics.Select(d => d.Id).ToHashSet();
var fixableIds = codeFixProvider.FixableDiagnosticIds.ToHashSet();
fixableIds.Should().BeSubsetOf(analyzerIds,
"Code fix provider should only fix diagnostics reported by the analyzer");
}
#endregion
#region Test Helpers
@@ -481,53 +308,6 @@ public sealed class PolicyAnalyzerRoslynTests
return await compilationWithAnalyzers.GetAnalyzerDiagnosticsAsync();
}
private static async Task<string> ApplyCodeFixAsync(string source, string assemblyName)
{
using var workspace = new AdhocWorkspace();
var projectId = ProjectId.CreateNewId();
var documentId = DocumentId.CreateNewId(projectId);
var stubDocumentId = DocumentId.CreateNewId(projectId);
var solution = workspace.CurrentSolution
.AddProject(projectId, "TestProject", "TestProject", LanguageNames.CSharp)
.WithProjectCompilationOptions(projectId, new CSharpCompilationOptions(OutputKind.DynamicallyLinkedLibrary))
.WithProjectAssemblyName(projectId, assemblyName)
.AddMetadataReferences(projectId, CreateMetadataReferences())
.AddDocument(documentId, "Test.cs", SourceText.From(source))
.AddDocument(stubDocumentId, "PolicyStubs.cs", SourceText.From(PolicyStubSource));
var project = solution.GetProject(projectId)!;
var document = solution.GetDocument(documentId)!;
var compilation = await project.GetCompilationAsync();
var analyzer = new HttpClientUsageAnalyzer();
var diagnostics = await compilation!.WithAnalyzers(ImmutableArray.Create<DiagnosticAnalyzer>(analyzer))
.GetAnalyzerDiagnosticsAsync();
var diagnostic = diagnostics.Single(d => d.Id == HttpClientUsageAnalyzer.DiagnosticId);
var codeFixProvider = new HttpClientUsageCodeFixProvider();
var actions = new List<CodeAction>();
var context = new CodeFixContext(
document,
diagnostic,
(action, _) => actions.Add(action),
CancellationToken.None);
await codeFixProvider.RegisterCodeFixesAsync(context);
var action = actions.Single();
var operations = await action.GetOperationsAsync(CancellationToken.None);
foreach (var operation in operations)
{
operation.Apply(workspace, CancellationToken.None);
}
var updatedDocument = workspace.CurrentSolution.GetDocument(documentId)!;
var updatedText = await updatedDocument.GetTextAsync();
return updatedText.ToString();
}
private static IEnumerable<MetadataReference> CreateMetadataReferences()
{
// Core runtime references

View File

@@ -1,125 +0,0 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Composition;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.CodeAnalysis;
using Microsoft.CodeAnalysis.CodeActions;
using Microsoft.CodeAnalysis.CodeFixes;
using Microsoft.CodeAnalysis.CSharp;
using Microsoft.CodeAnalysis.CSharp.Syntax;
namespace StellaOps.AirGap.Policy.Analyzers;
/// <summary>
/// Offers a remediation template that routes HttpClient creation through the shared EgressPolicy factory.
/// </summary>
[ExportCodeFixProvider(LanguageNames.CSharp, Name = nameof(HttpClientUsageCodeFixProvider))]
[Shared]
public sealed class HttpClientUsageCodeFixProvider : CodeFixProvider
{
private const string Title = "Use EgressHttpClientFactory.Create(...)";
/// <inheritdoc/>
public override ImmutableArray<string> FixableDiagnosticIds
=> ImmutableArray.Create(HttpClientUsageAnalyzer.DiagnosticId);
/// <inheritdoc/>
public override FixAllProvider GetFixAllProvider()
=> WellKnownFixAllProviders.BatchFixer;
/// <inheritdoc/>
public override async Task RegisterCodeFixesAsync(CodeFixContext context)
{
if (context.Document is null)
{
return;
}
var root = await context.Document.GetSyntaxRootAsync(context.CancellationToken).ConfigureAwait(false);
if (root is null)
{
return;
}
var diagnostic = context.Diagnostics[0];
var node = root.FindNode(diagnostic.Location.SourceSpan);
if (node is not ObjectCreationExpressionSyntax objectCreation)
{
return;
}
context.RegisterCodeFix(
CodeAction.Create(
Title,
cancellationToken => ReplaceWithFactoryCallAsync(context.Document, objectCreation, cancellationToken),
equivalenceKey: Title),
diagnostic);
}
private static async Task<Document> ReplaceWithFactoryCallAsync(Document document, ObjectCreationExpressionSyntax creation, CancellationToken cancellationToken)
{
var replacementExpression = BuildReplacementExpression(creation);
var root = await document.GetSyntaxRootAsync(cancellationToken).ConfigureAwait(false);
if (root is null)
{
return document;
}
var updatedRoot = root.ReplaceNode(creation, replacementExpression.WithTriviaFrom(creation));
return document.WithSyntaxRoot(updatedRoot);
}
private static ExpressionSyntax BuildReplacementExpression(ObjectCreationExpressionSyntax creation)
{
var requestExpression = SyntaxFactory.ParseExpression(
"new global::StellaOps.AirGap.Policy.EgressRequest(" +
"component: \"REPLACE_COMPONENT\", " +
"destination: new global::System.Uri(\"https://replace-with-endpoint\"), " +
"intent: \"REPLACE_INTENT\")");
var egressPolicyExpression = SyntaxFactory.ParseExpression(
"default(global::StellaOps.AirGap.Policy.IEgressPolicy)");
var arguments = new List<ArgumentSyntax>
{
SyntaxFactory.Argument(egressPolicyExpression)
.WithNameColon(SyntaxFactory.NameColon("egressPolicy"))
.WithTrailingTrivia(
SyntaxFactory.Space,
SyntaxFactory.Comment("/* TODO: provide IEgressPolicy instance */")),
SyntaxFactory.Argument(requestExpression)
.WithNameColon(SyntaxFactory.NameColon("request"))
};
if (ShouldUseClientFactory(creation))
{
var clientFactoryLambda = SyntaxFactory.ParenthesizedLambdaExpression(
SyntaxFactory.ParameterList(),
CreateHttpClientExpression(creation));
arguments.Add(
SyntaxFactory.Argument(clientFactoryLambda)
.WithNameColon(SyntaxFactory.NameColon("clientFactory")));
}
return SyntaxFactory.InvocationExpression(
SyntaxFactory.ParseExpression("global::StellaOps.AirGap.Policy.EgressHttpClientFactory.Create"))
.WithArgumentList(SyntaxFactory.ArgumentList(SyntaxFactory.SeparatedList(arguments)));
}
private static bool ShouldUseClientFactory(ObjectCreationExpressionSyntax creation)
=> (creation.ArgumentList?.Arguments.Count ?? 0) > 0 || creation.Initializer is not null;
private static ObjectCreationExpressionSyntax CreateHttpClientExpression(ObjectCreationExpressionSyntax creation)
{
var httpClientType = SyntaxFactory.ParseTypeName("global::System.Net.Http.HttpClient");
var arguments = creation.ArgumentList ?? SyntaxFactory.ArgumentList();
return SyntaxFactory.ObjectCreationExpression(httpClientType)
.WithArgumentList(arguments)
.WithInitializer(creation.Initializer);
}
}

View File

@@ -13,7 +13,6 @@
<ItemGroup>
<PackageReference Include="Microsoft.CodeAnalysis.CSharp" PrivateAssets="all" />
<PackageReference Include="Microsoft.CodeAnalysis.CSharp.Workspaces" PrivateAssets="all" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,148 @@
// <copyright file="AirGapSyncServiceCollectionExtensions.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using StellaOps.AirGap.Sync.Services;
using StellaOps.AirGap.Sync.Stores;
using StellaOps.AirGap.Sync.Transport;
using StellaOps.Determinism;
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync;
/// <summary>
/// Extension methods for registering air-gap sync services.
/// </summary>
public static class AirGapSyncServiceCollectionExtensions
{
/// <summary>
/// Adds air-gap sync services to the service collection.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="nodeId">The node identifier for this instance.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddAirGapSyncServices(
this IServiceCollection services,
string nodeId)
{
ArgumentException.ThrowIfNullOrWhiteSpace(nodeId);
// Core services
services.TryAddSingleton<IConflictResolver, ConflictResolver>();
services.TryAddSingleton<IHlcMergeService, HlcMergeService>();
services.TryAddSingleton<IAirGapBundleImporter, AirGapBundleImporter>();
// Register in-memory HLC state store for offline operation
services.TryAddSingleton<IHlcStateStore, InMemoryHlcStateStore>();
// Register HLC clock with node ID
services.TryAddSingleton<IHybridLogicalClock>(sp =>
{
var timeProvider = sp.GetService<TimeProvider>() ?? TimeProvider.System;
var stateStore = sp.GetRequiredService<IHlcStateStore>();
return new HybridLogicalClock.HybridLogicalClock(timeProvider, nodeId, stateStore);
});
// Register deterministic GUID provider
services.TryAddSingleton<IGuidProvider>(SystemGuidProvider.Instance);
// File-based store (can be overridden)
services.TryAddSingleton<IOfflineJobLogStore, FileBasedOfflineJobLogStore>();
// Offline HLC manager
services.TryAddSingleton<IOfflineHlcManager, OfflineHlcManager>();
// Bundle exporter
services.TryAddSingleton<IAirGapBundleExporter, AirGapBundleExporter>();
return services;
}
/// <summary>
/// Adds air-gap sync services with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="nodeId">The node identifier for this instance.</param>
/// <param name="configureOptions">Action to configure file-based store options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddAirGapSyncServices(
this IServiceCollection services,
string nodeId,
Action<FileBasedOfflineJobLogStoreOptions> configureOptions)
{
// Configure file-based store options
services.Configure(configureOptions);
return services.AddAirGapSyncServices(nodeId);
}
/// <summary>
/// Adds the air-gap sync service for importing bundles to the central scheduler.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
/// <remarks>
/// This requires ISyncSchedulerLogRepository to be registered separately,
/// as it depends on the Scheduler.Persistence module.
/// </remarks>
public static IServiceCollection AddAirGapSyncImportService(this IServiceCollection services)
{
services.TryAddScoped<IAirGapSyncService, AirGapSyncService>();
return services;
}
/// <summary>
/// Adds file-based transport for job sync bundles.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddFileBasedJobSyncTransport(this IServiceCollection services)
{
services.TryAddSingleton<IJobSyncTransport, FileBasedJobSyncTransport>();
return services;
}
/// <summary>
/// Adds file-based transport for job sync bundles with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure transport options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddFileBasedJobSyncTransport(
this IServiceCollection services,
Action<FileBasedJobSyncTransportOptions> configureOptions)
{
services.Configure(configureOptions);
return services.AddFileBasedJobSyncTransport();
}
/// <summary>
/// Adds Router-based transport for job sync bundles.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
/// <remarks>
/// Requires IRouterJobSyncClient to be registered separately.
/// </remarks>
public static IServiceCollection AddRouterJobSyncTransport(this IServiceCollection services)
{
services.TryAddSingleton<IJobSyncTransport, RouterJobSyncTransport>();
return services;
}
/// <summary>
/// Adds Router-based transport for job sync bundles with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure transport options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddRouterJobSyncTransport(
this IServiceCollection services,
Action<RouterJobSyncTransportOptions> configureOptions)
{
services.Configure(configureOptions);
return services.AddRouterJobSyncTransport();
}
}
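// Illustrative sketch only (not part of the original commit): one way these
// extensions might be wired in a host. The node id is a placeholder, and the
// options callback is left empty because the option members are not shown here.
internal static class AirGapSyncRegistrationSketch
{
    internal static IServiceCollection RegisterSketch(IServiceCollection services)
    {
        // Core sync services keyed to a hypothetical edge node identifier.
        services.AddAirGapSyncServices("edge-node-01", configureOptions: _ => { });
        // File-based transport for moving bundles over removable media.
        services.AddFileBasedJobSyncTransport();
        return services;
    }
}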

View File

@@ -0,0 +1,51 @@
// <copyright file="AirGapBundle.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
namespace StellaOps.AirGap.Sync.Models;
/// <summary>
/// Represents an air-gap bundle containing job logs from one or more offline nodes.
/// </summary>
public sealed record AirGapBundle
{
/// <summary>
/// Gets the unique bundle identifier.
/// </summary>
public required Guid BundleId { get; init; }
/// <summary>
/// Gets the tenant ID for this bundle.
/// </summary>
public required string TenantId { get; init; }
/// <summary>
/// Gets when the bundle was created.
/// </summary>
public required DateTimeOffset CreatedAt { get; init; }
/// <summary>
/// Gets the node ID that created this bundle.
/// </summary>
public required string CreatedByNodeId { get; init; }
/// <summary>
/// Gets the job logs from each offline node.
/// </summary>
public required IReadOnlyList<NodeJobLog> JobLogs { get; init; }
/// <summary>
/// Gets the bundle manifest digest for integrity verification.
/// </summary>
public required string ManifestDigest { get; init; }
/// <summary>
/// Gets the optional DSSE signature over the manifest.
/// </summary>
public string? Signature { get; init; }
/// <summary>
/// Gets the key ID used for signing (if signed).
/// </summary>
public string? SignedBy { get; init; }
}

View File

@@ -0,0 +1,68 @@
// <copyright file="ConflictResolution.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
namespace StellaOps.AirGap.Sync.Models;
/// <summary>
/// Result of conflict resolution for a job ID.
/// </summary>
public sealed record ConflictResolution
{
/// <summary>
/// Gets the type of conflict detected.
/// </summary>
public required ConflictType Type { get; init; }
/// <summary>
/// Gets the resolution strategy applied.
/// </summary>
public required ResolutionStrategy Resolution { get; init; }
/// <summary>
/// Gets the selected entry (when resolution is not Error).
/// </summary>
public OfflineJobLogEntry? SelectedEntry { get; init; }
/// <summary>
/// Gets the entries that were dropped.
/// </summary>
public IReadOnlyList<OfflineJobLogEntry>? DroppedEntries { get; init; }
/// <summary>
/// Gets the error message (when resolution is Error).
/// </summary>
public string? Error { get; init; }
}
/// <summary>
/// Types of conflicts that can occur during merge.
/// </summary>
public enum ConflictType
{
/// <summary>
/// Same JobId with different HLC timestamps but identical payload.
/// </summary>
DuplicateTimestamp,
/// <summary>
/// Same JobId with different payloads - indicates a bug.
/// </summary>
PayloadMismatch
}
/// <summary>
/// Strategies for resolving conflicts.
/// </summary>
public enum ResolutionStrategy
{
/// <summary>
/// Take the entry with the earliest HLC timestamp.
/// </summary>
TakeEarliest,
/// <summary>
/// Fail the merge - conflict cannot be resolved.
/// </summary>
Error
}
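// Illustrative consumer sketch (not part of the original commit): how merge code
// might branch on a resolution. The helper name is a placeholder.
internal static class ConflictResolutionSketch
{
    internal static OfflineJobLogEntry PickOrThrow(ConflictResolution resolution)
    {
        // TakeEarliest carries the surviving entry; Error carries a diagnostic message.
        return resolution.Resolution switch
        {
            ResolutionStrategy.TakeEarliest => resolution.SelectedEntry!,
            _ => throw new InvalidOperationException(resolution.Error)
        };
    }
}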

View File

@@ -0,0 +1,87 @@
// <copyright file="MergeResult.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Models;
/// <summary>
/// Result of merging job logs from multiple offline nodes.
/// </summary>
public sealed record MergeResult
{
/// <summary>
/// Gets the merged entries in HLC total order.
/// </summary>
public required IReadOnlyList<MergedJobEntry> MergedEntries { get; init; }
/// <summary>
/// Gets duplicate entries that were dropped during merge.
/// </summary>
public required IReadOnlyList<DuplicateEntry> Duplicates { get; init; }
/// <summary>
/// Gets the merged chain head (final link after merge).
/// </summary>
public byte[]? MergedChainHead { get; init; }
/// <summary>
/// Gets the source node IDs that contributed to this merge.
/// </summary>
public required IReadOnlyList<string> SourceNodes { get; init; }
}
/// <summary>
/// A job entry after merge with unified chain link.
/// </summary>
public sealed class MergedJobEntry
{
/// <summary>
/// Gets or sets the source node ID that created this entry.
/// </summary>
public required string SourceNodeId { get; set; }
/// <summary>
/// Gets or sets the HLC timestamp.
/// </summary>
public required HlcTimestamp THlc { get; set; }
/// <summary>
/// Gets or sets the job ID.
/// </summary>
public required Guid JobId { get; set; }
/// <summary>
/// Gets or sets the partition key.
/// </summary>
public string? PartitionKey { get; set; }
/// <summary>
/// Gets or sets the serialized payload.
/// </summary>
public required string Payload { get; set; }
/// <summary>
/// Gets or sets the payload hash.
/// </summary>
public required byte[] PayloadHash { get; set; }
/// <summary>
/// Gets or sets the original chain link from the source node.
/// </summary>
public required byte[] OriginalLink { get; set; }
/// <summary>
/// Gets or sets the merged chain link (computed during merge).
/// </summary>
public byte[]? MergedLink { get; set; }
}
/// <summary>
/// Represents a duplicate entry dropped during merge.
/// </summary>
public sealed record DuplicateEntry(
Guid JobId,
string NodeId,
HlcTimestamp THlc);

View File

@@ -0,0 +1,33 @@
// <copyright file="NodeJobLog.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Models;
/// <summary>
/// Represents the job log from a single offline node.
/// </summary>
public sealed record NodeJobLog
{
/// <summary>
/// Gets the node identifier.
/// </summary>
public required string NodeId { get; init; }
/// <summary>
/// Gets the last HLC timestamp in this log.
/// </summary>
public required HlcTimestamp LastHlc { get; init; }
/// <summary>
/// Gets the chain head (last link) in this log.
/// </summary>
public required byte[] ChainHead { get; init; }
/// <summary>
/// Gets the job log entries in HLC order.
/// </summary>
public required IReadOnlyList<OfflineJobLogEntry> Entries { get; init; }
}

View File

@@ -0,0 +1,58 @@
// <copyright file="OfflineJobLogEntry.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Models;
/// <summary>
/// Represents a job log entry created while operating offline.
/// </summary>
public sealed record OfflineJobLogEntry
{
/// <summary>
/// Gets the node ID that created this entry.
/// </summary>
public required string NodeId { get; init; }
/// <summary>
/// Gets the HLC timestamp when the job was enqueued.
/// </summary>
public required HlcTimestamp THlc { get; init; }
/// <summary>
/// Gets the deterministic job ID.
/// </summary>
public required Guid JobId { get; init; }
/// <summary>
/// Gets the partition key (if any).
/// </summary>
public string? PartitionKey { get; init; }
/// <summary>
/// Gets the serialized job payload.
/// </summary>
public required string Payload { get; init; }
/// <summary>
/// Gets the SHA-256 hash of the canonical payload.
/// </summary>
public required byte[] PayloadHash { get; init; }
/// <summary>
/// Gets the previous chain link (null for first entry).
/// </summary>
public byte[]? PrevLink { get; init; }
/// <summary>
/// Gets the chain link: Hash(prev_link || job_id || t_hlc || payload_hash).
/// </summary>
public required byte[] Link { get; init; }
/// <summary>
/// Gets the wall-clock time when the entry was created (informational only).
/// </summary>
public DateTimeOffset EnqueuedAt { get; init; }
}
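// Illustrative sketch (an assumption, not the shipped OfflineHlcManager.ComputeLink):
// one plausible byte layout for Link = SHA256(prev_link || job_id || t_hlc || payload_hash).
// The real encoding used by the chain may differ.
internal static class ChainLinkSketch
{
    internal static byte[] ComputeLinkSketch(byte[]? prevLink, Guid jobId, string tHlcSortable, byte[] payloadHash)
    {
        using var buffer = new MemoryStream();
        if (prevLink is not null)
        {
            buffer.Write(prevLink);
        }
        buffer.Write(jobId.ToByteArray());
        buffer.Write(System.Text.Encoding.UTF8.GetBytes(tHlcSortable));
        buffer.Write(payloadHash);
        return System.Security.Cryptography.SHA256.HashData(buffer.ToArray());
    }
}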

View File

@@ -0,0 +1,72 @@
// <copyright file="SyncResult.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
namespace StellaOps.AirGap.Sync.Models;
/// <summary>
/// Result of syncing an air-gap bundle to the central scheduler.
/// </summary>
public sealed record SyncResult
{
/// <summary>
/// Gets the bundle ID that was synced.
/// </summary>
public required Guid BundleId { get; init; }
/// <summary>
/// Gets the total number of entries in the bundle.
/// </summary>
public required int TotalInBundle { get; init; }
/// <summary>
/// Gets the number of entries appended to the scheduler log.
/// </summary>
public required int Appended { get; init; }
/// <summary>
/// Gets the number of duplicate entries skipped.
/// </summary>
public required int Duplicates { get; init; }
/// <summary>
/// Gets the number of entries that already existed (idempotency).
/// </summary>
public int AlreadyExisted { get; init; }
/// <summary>
/// Gets the new chain head after sync.
/// </summary>
public byte[]? NewChainHead { get; init; }
/// <summary>
/// Gets any warnings generated during sync.
/// </summary>
public IReadOnlyList<string>? Warnings { get; init; }
}
/// <summary>
/// Result of an offline enqueue operation.
/// </summary>
public sealed record OfflineEnqueueResult
{
/// <summary>
/// Gets the HLC timestamp assigned.
/// </summary>
public required StellaOps.HybridLogicalClock.HlcTimestamp THlc { get; init; }
/// <summary>
/// Gets the deterministic job ID.
/// </summary>
public required Guid JobId { get; init; }
/// <summary>
/// Gets the chain link computed.
/// </summary>
public required byte[] Link { get; init; }
/// <summary>
/// Gets the node ID that created this entry.
/// </summary>
public required string NodeId { get; init; }
}

View File

@@ -0,0 +1,270 @@
// <copyright file="AirGapBundleExporter.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Stores;
using StellaOps.Canonical.Json;
using StellaOps.Determinism;
namespace StellaOps.AirGap.Sync.Services;
/// <summary>
/// Interface for air-gap bundle export operations.
/// </summary>
public interface IAirGapBundleExporter
{
/// <summary>
/// Exports an air-gap bundle containing offline job logs.
/// </summary>
/// <param name="tenantId">The tenant ID.</param>
/// <param name="nodeIds">The node IDs to include (null for current node only).</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The exported bundle.</returns>
Task<AirGapBundle> ExportAsync(
string tenantId,
IReadOnlyList<string>? nodeIds = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Exports an air-gap bundle to a file.
/// </summary>
/// <param name="bundle">The bundle to export.</param>
/// <param name="outputPath">The output file path.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task ExportToFileAsync(
AirGapBundle bundle,
string outputPath,
CancellationToken cancellationToken = default);
/// <summary>
/// Exports an air-gap bundle to a JSON string.
/// </summary>
/// <param name="bundle">The bundle to export.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The JSON string representation.</returns>
Task<string> ExportToStringAsync(
AirGapBundle bundle,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Service for exporting air-gap bundles.
/// </summary>
public sealed class AirGapBundleExporter : IAirGapBundleExporter
{
private readonly IOfflineJobLogStore _jobLogStore;
private readonly IOfflineHlcManager _hlcManager;
private readonly IGuidProvider _guidProvider;
private readonly TimeProvider _timeProvider;
private readonly ILogger<AirGapBundleExporter> _logger;
private static readonly JsonSerializerOptions JsonOptions = new()
{
WriteIndented = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
/// <summary>
/// Initializes a new instance of the <see cref="AirGapBundleExporter"/> class.
/// </summary>
public AirGapBundleExporter(
IOfflineJobLogStore jobLogStore,
IOfflineHlcManager hlcManager,
IGuidProvider guidProvider,
TimeProvider timeProvider,
ILogger<AirGapBundleExporter> logger)
{
_jobLogStore = jobLogStore ?? throw new ArgumentNullException(nameof(jobLogStore));
_hlcManager = hlcManager ?? throw new ArgumentNullException(nameof(hlcManager));
_guidProvider = guidProvider ?? throw new ArgumentNullException(nameof(guidProvider));
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public async Task<AirGapBundle> ExportAsync(
string tenantId,
IReadOnlyList<string>? nodeIds = null,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(tenantId);
var effectiveNodeIds = nodeIds ?? new[] { _hlcManager.NodeId };
_logger.LogInformation(
"Exporting air-gap bundle for tenant {TenantId} with {NodeCount} nodes",
tenantId, effectiveNodeIds.Count);
var jobLogs = new List<NodeJobLog>();
foreach (var nodeId in effectiveNodeIds)
{
cancellationToken.ThrowIfCancellationRequested();
var nodeLog = await _jobLogStore.GetNodeJobLogAsync(nodeId, cancellationToken)
.ConfigureAwait(false);
if (nodeLog is not null && nodeLog.Entries.Count > 0)
{
jobLogs.Add(nodeLog);
_logger.LogDebug(
"Added node {NodeId} with {EntryCount} entries to bundle",
nodeId, nodeLog.Entries.Count);
}
}
if (jobLogs.Count == 0)
{
_logger.LogWarning("No offline job logs found for export");
}
var bundle = new AirGapBundle
{
BundleId = _guidProvider.NewGuid(),
TenantId = tenantId,
CreatedAt = _timeProvider.GetUtcNow(),
CreatedByNodeId = _hlcManager.NodeId,
JobLogs = jobLogs,
ManifestDigest = ComputeManifestDigest(jobLogs)
};
_logger.LogInformation(
"Created bundle {BundleId} with {LogCount} node logs, {TotalEntries} total entries",
bundle.BundleId, jobLogs.Count, jobLogs.Sum(l => l.Entries.Count));
return bundle;
}
/// <inheritdoc/>
public async Task ExportToFileAsync(
AirGapBundle bundle,
string outputPath,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(bundle);
ArgumentException.ThrowIfNullOrWhiteSpace(outputPath);
var dto = ToExportDto(bundle);
var json = JsonSerializer.Serialize(dto, JsonOptions);
var directory = Path.GetDirectoryName(outputPath);
if (!string.IsNullOrEmpty(directory) && !Directory.Exists(directory))
{
Directory.CreateDirectory(directory);
}
await File.WriteAllTextAsync(outputPath, json, cancellationToken).ConfigureAwait(false);
_logger.LogInformation(
"Exported bundle {BundleId} to {OutputPath}",
bundle.BundleId, outputPath);
}
/// <inheritdoc/>
public Task<string> ExportToStringAsync(
AirGapBundle bundle,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(bundle);
cancellationToken.ThrowIfCancellationRequested();
var dto = ToExportDto(bundle);
var json = JsonSerializer.Serialize(dto, JsonOptions);
_logger.LogDebug(
"Exported bundle {BundleId} to string ({Length} chars)",
bundle.BundleId, json.Length);
return Task.FromResult(json);
}
private static string ComputeManifestDigest(IReadOnlyList<NodeJobLog> jobLogs)
{
// Create a manifest of all chain heads for integrity verification
var manifest = jobLogs
.OrderBy(l => l.NodeId, StringComparer.Ordinal)
.Select(l => new
{
l.NodeId,
LastHlc = l.LastHlc.ToSortableString(),
ChainHead = Convert.ToHexString(l.ChainHead)
})
.ToList();
var json = CanonJson.Serialize(manifest);
var hash = SHA256.HashData(Encoding.UTF8.GetBytes(json));
return "sha256:" + Convert.ToHexString(hash).ToLowerInvariant();
}
private static AirGapBundleExportDto ToExportDto(AirGapBundle bundle) => new()
{
BundleId = bundle.BundleId,
TenantId = bundle.TenantId,
CreatedAt = bundle.CreatedAt,
CreatedByNodeId = bundle.CreatedByNodeId,
ManifestDigest = bundle.ManifestDigest,
Signature = bundle.Signature,
SignedBy = bundle.SignedBy,
JobLogs = bundle.JobLogs.Select(ToNodeJobLogDto).ToList()
};
private static NodeJobLogExportDto ToNodeJobLogDto(NodeJobLog log) => new()
{
NodeId = log.NodeId,
LastHlc = log.LastHlc.ToSortableString(),
ChainHead = Convert.ToBase64String(log.ChainHead),
Entries = log.Entries.Select(ToEntryDto).ToList()
};
private static OfflineJobLogEntryExportDto ToEntryDto(OfflineJobLogEntry entry) => new()
{
NodeId = entry.NodeId,
THlc = entry.THlc.ToSortableString(),
JobId = entry.JobId,
PartitionKey = entry.PartitionKey,
Payload = entry.Payload,
PayloadHash = Convert.ToBase64String(entry.PayloadHash),
PrevLink = entry.PrevLink is not null ? Convert.ToBase64String(entry.PrevLink) : null,
Link = Convert.ToBase64String(entry.Link),
EnqueuedAt = entry.EnqueuedAt
};
// Export DTOs
private sealed record AirGapBundleExportDto
{
public required Guid BundleId { get; init; }
public required string TenantId { get; init; }
public required DateTimeOffset CreatedAt { get; init; }
public required string CreatedByNodeId { get; init; }
public required string ManifestDigest { get; init; }
public string? Signature { get; init; }
public string? SignedBy { get; init; }
public required IReadOnlyList<NodeJobLogExportDto> JobLogs { get; init; }
}
private sealed record NodeJobLogExportDto
{
public required string NodeId { get; init; }
public required string LastHlc { get; init; }
public required string ChainHead { get; init; }
public required IReadOnlyList<OfflineJobLogEntryExportDto> Entries { get; init; }
}
private sealed record OfflineJobLogEntryExportDto
{
public required string NodeId { get; init; }
public required string THlc { get; init; }
public required Guid JobId { get; init; }
public string? PartitionKey { get; init; }
public required string Payload { get; init; }
public required string PayloadHash { get; init; }
public string? PrevLink { get; init; }
public required string Link { get; init; }
public DateTimeOffset EnqueuedAt { get; init; }
}
}
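// Illustrative sketch (not part of the original commit): exporting the current
// node's offline log to removable media. The tenant id and path are placeholders.
internal static class BundleExportSketch
{
    internal static async Task ExportCurrentNodeAsync(IAirGapBundleExporter exporter, CancellationToken ct)
    {
        // nodeIds: null limits the bundle to the calling node's log.
        var bundle = await exporter.ExportAsync("tenant-a", nodeIds: null, ct).ConfigureAwait(false);
        await exporter.ExportToFileAsync(bundle, "/mnt/usb/outbox/bundle.json", ct).ConfigureAwait(false);
    }
}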

View File

@@ -0,0 +1,316 @@
// <copyright file="AirGapBundleImporter.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.Canonical.Json;
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Services;
/// <summary>
/// Interface for air-gap bundle import operations.
/// </summary>
public interface IAirGapBundleImporter
{
/// <summary>
/// Imports an air-gap bundle from a file.
/// </summary>
/// <param name="inputPath">The input file path.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The imported bundle.</returns>
Task<AirGapBundle> ImportFromFileAsync(
string inputPath,
CancellationToken cancellationToken = default);
/// <summary>
/// Validates a bundle's integrity.
/// </summary>
/// <param name="bundle">The bundle to validate.</param>
/// <returns>Validation result with any issues found.</returns>
BundleValidationResult Validate(AirGapBundle bundle);
/// <summary>
/// Imports an air-gap bundle from a JSON string.
/// </summary>
/// <param name="json">The JSON string representation.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The imported bundle.</returns>
Task<AirGapBundle> ImportFromStringAsync(
string json,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Result of bundle validation.
/// </summary>
public sealed record BundleValidationResult
{
/// <summary>
/// Gets whether the bundle is valid.
/// </summary>
public required bool IsValid { get; init; }
/// <summary>
/// Gets validation issues found.
/// </summary>
public required IReadOnlyList<string> Issues { get; init; }
}
/// <summary>
/// Service for importing air-gap bundles.
/// </summary>
public sealed class AirGapBundleImporter : IAirGapBundleImporter
{
private readonly ILogger<AirGapBundleImporter> _logger;
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
PropertyNameCaseInsensitive = true
};
/// <summary>
/// Initializes a new instance of the <see cref="AirGapBundleImporter"/> class.
/// </summary>
public AirGapBundleImporter(ILogger<AirGapBundleImporter> logger)
{
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public async Task<AirGapBundle> ImportFromFileAsync(
string inputPath,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(inputPath);
if (!File.Exists(inputPath))
{
throw new FileNotFoundException($"Bundle file not found: {inputPath}", inputPath);
}
_logger.LogInformation("Importing air-gap bundle from {InputPath}", inputPath);
var json = await File.ReadAllTextAsync(inputPath, cancellationToken).ConfigureAwait(false);
var dto = JsonSerializer.Deserialize<AirGapBundleImportDto>(json, JsonOptions);
if (dto is null)
{
throw new InvalidOperationException("Failed to deserialize bundle file");
}
var bundle = FromImportDto(dto);
_logger.LogInformation(
"Imported bundle {BundleId} from {InputPath}: {LogCount} node logs, {TotalEntries} total entries",
bundle.BundleId, inputPath, bundle.JobLogs.Count, bundle.JobLogs.Sum(l => l.Entries.Count));
return bundle;
}
/// <inheritdoc/>
public Task<AirGapBundle> ImportFromStringAsync(
string json,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(json);
cancellationToken.ThrowIfCancellationRequested();
_logger.LogDebug("Importing air-gap bundle from string ({Length} chars)", json.Length);
var dto = JsonSerializer.Deserialize<AirGapBundleImportDto>(json, JsonOptions);
if (dto is null)
{
throw new InvalidOperationException("Failed to deserialize bundle JSON");
}
var bundle = FromImportDto(dto);
_logger.LogInformation(
"Imported bundle {BundleId} from string: {LogCount} node logs, {TotalEntries} total entries",
bundle.BundleId, bundle.JobLogs.Count, bundle.JobLogs.Sum(l => l.Entries.Count));
return Task.FromResult(bundle);
}
/// <inheritdoc/>
public BundleValidationResult Validate(AirGapBundle bundle)
{
ArgumentNullException.ThrowIfNull(bundle);
var issues = new List<string>();
// 1. Validate manifest digest
var computedDigest = ComputeManifestDigest(bundle.JobLogs);
if (!string.Equals(computedDigest, bundle.ManifestDigest, StringComparison.Ordinal))
{
issues.Add($"Manifest digest mismatch: expected {bundle.ManifestDigest}, computed {computedDigest}");
}
// 2. Validate each node log's chain integrity
foreach (var nodeLog in bundle.JobLogs)
{
var nodeIssues = ValidateNodeLog(nodeLog);
issues.AddRange(nodeIssues);
}
// 3. Validate chain heads match last entry links
foreach (var nodeLog in bundle.JobLogs)
{
if (nodeLog.Entries.Count > 0)
{
var lastEntry = nodeLog.Entries[^1];
if (!ByteArrayEquals(nodeLog.ChainHead, lastEntry.Link))
{
issues.Add($"Node {nodeLog.NodeId}: chain head doesn't match last entry link");
}
}
}
var isValid = issues.Count == 0;
if (!isValid)
{
_logger.LogWarning(
"Bundle {BundleId} validation failed with {IssueCount} issues",
bundle.BundleId, issues.Count);
}
else
{
_logger.LogDebug("Bundle {BundleId} validation passed", bundle.BundleId);
}
return new BundleValidationResult
{
IsValid = isValid,
Issues = issues
};
}
private static IEnumerable<string> ValidateNodeLog(NodeJobLog nodeLog)
{
byte[]? expectedPrevLink = null;
for (var i = 0; i < nodeLog.Entries.Count; i++)
{
var entry = nodeLog.Entries[i];
// Verify prev_link matches expected
if (!ByteArrayEquals(entry.PrevLink, expectedPrevLink))
{
yield return $"Node {nodeLog.NodeId}, entry {i}: prev_link mismatch";
}
// Recompute and verify link
var computedLink = OfflineHlcManager.ComputeLink(
entry.PrevLink,
entry.JobId,
entry.THlc,
entry.PayloadHash);
if (!ByteArrayEquals(entry.Link, computedLink))
{
yield return $"Node {nodeLog.NodeId}, entry {i} (JobId {entry.JobId}): link mismatch";
}
expectedPrevLink = entry.Link;
}
}
private static string ComputeManifestDigest(IReadOnlyList<NodeJobLog> jobLogs)
{
var manifest = jobLogs
.OrderBy(l => l.NodeId, StringComparer.Ordinal)
.Select(l => new
{
l.NodeId,
LastHlc = l.LastHlc.ToSortableString(),
ChainHead = Convert.ToHexString(l.ChainHead)
})
.ToList();
var json = CanonJson.Serialize(manifest);
var hash = SHA256.HashData(Encoding.UTF8.GetBytes(json));
return "sha256:" + Convert.ToHexString(hash).ToLowerInvariant();
}
private static bool ByteArrayEquals(byte[]? a, byte[]? b)
{
if (a is null && b is null) return true;
if (a is null || b is null) return false;
return a.AsSpan().SequenceEqual(b);
}
private static AirGapBundle FromImportDto(AirGapBundleImportDto dto) => new()
{
BundleId = dto.BundleId,
TenantId = dto.TenantId,
CreatedAt = dto.CreatedAt,
CreatedByNodeId = dto.CreatedByNodeId,
ManifestDigest = dto.ManifestDigest,
Signature = dto.Signature,
SignedBy = dto.SignedBy,
JobLogs = dto.JobLogs.Select(FromNodeJobLogDto).ToList()
};
private static NodeJobLog FromNodeJobLogDto(NodeJobLogImportDto dto) => new()
{
NodeId = dto.NodeId,
LastHlc = HlcTimestamp.Parse(dto.LastHlc),
ChainHead = Convert.FromBase64String(dto.ChainHead),
Entries = dto.Entries.Select(FromEntryDto).ToList()
};
private static OfflineJobLogEntry FromEntryDto(OfflineJobLogEntryImportDto dto) => new()
{
NodeId = dto.NodeId,
THlc = HlcTimestamp.Parse(dto.THlc),
JobId = dto.JobId,
PartitionKey = dto.PartitionKey,
Payload = dto.Payload,
PayloadHash = Convert.FromBase64String(dto.PayloadHash),
PrevLink = dto.PrevLink is not null ? Convert.FromBase64String(dto.PrevLink) : null,
Link = Convert.FromBase64String(dto.Link),
EnqueuedAt = dto.EnqueuedAt
};
// Import DTOs
private sealed record AirGapBundleImportDto
{
public required Guid BundleId { get; init; }
public required string TenantId { get; init; }
public required DateTimeOffset CreatedAt { get; init; }
public required string CreatedByNodeId { get; init; }
public required string ManifestDigest { get; init; }
public string? Signature { get; init; }
public string? SignedBy { get; init; }
public required IReadOnlyList<NodeJobLogImportDto> JobLogs { get; init; }
}
private sealed record NodeJobLogImportDto
{
public required string NodeId { get; init; }
public required string LastHlc { get; init; }
public required string ChainHead { get; init; }
public required IReadOnlyList<OfflineJobLogEntryImportDto> Entries { get; init; }
}
private sealed record OfflineJobLogEntryImportDto
{
public required string NodeId { get; init; }
public required string THlc { get; init; }
public required Guid JobId { get; init; }
public string? PartitionKey { get; init; }
public required string Payload { get; init; }
public required string PayloadHash { get; init; }
public string? PrevLink { get; init; }
public required string Link { get; init; }
public DateTimeOffset EnqueuedAt { get; init; }
}
}
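// Illustrative sketch (not part of the original commit): import a bundle and
// refuse to continue when validation fails. The path argument is a placeholder.
internal static class BundleImportSketch
{
    internal static async Task<AirGapBundle> ImportVerifiedAsync(
        IAirGapBundleImporter importer,
        string path,
        CancellationToken ct)
    {
        var bundle = await importer.ImportFromFileAsync(path, ct).ConfigureAwait(false);
        var validation = importer.Validate(bundle);
        if (!validation.IsValid)
        {
            // A digest or chain mismatch means the bundle must not be synced.
            throw new InvalidOperationException(string.Join("; ", validation.Issues));
        }
        return bundle;
    }
}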

View File

@@ -0,0 +1,198 @@
// <copyright file="AirGapSyncService.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Services;
/// <summary>
/// Interface for the scheduler log repository used by sync.
/// </summary>
/// <remarks>
/// This is a subset of the full ISchedulerLogRepository to avoid circular dependencies.
/// Implementations should delegate to the actual repository.
/// </remarks>
public interface ISyncSchedulerLogRepository
{
/// <summary>
/// Gets the chain head for a tenant/partition.
/// </summary>
Task<(byte[]? Link, string? THlc)> GetChainHeadAsync(
string tenantId,
string? partitionKey = null,
CancellationToken cancellationToken = default);
/// <summary>
/// Checks whether an entry with the given job ID exists.
/// </summary>
Task<bool> ExistsByJobIdAsync(
string tenantId,
Guid jobId,
CancellationToken cancellationToken = default);
/// <summary>
/// Inserts a synced entry.
/// </summary>
Task InsertSyncedEntryAsync(
string tenantId,
string tHlc,
string? partitionKey,
Guid jobId,
byte[] payloadHash,
byte[]? prevLink,
byte[] link,
string sourceNodeId,
Guid syncedFromBundle,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Interface for air-gap sync operations.
/// </summary>
public interface IAirGapSyncService
{
/// <summary>
/// Syncs offline jobs from an air-gap bundle to the central scheduler.
/// </summary>
/// <param name="bundle">The bundle to sync.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The sync result.</returns>
Task<SyncResult> SyncFromBundleAsync(
AirGapBundle bundle,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Service for syncing air-gap bundles to the central scheduler.
/// </summary>
public sealed class AirGapSyncService : IAirGapSyncService
{
private readonly IHlcMergeService _mergeService;
private readonly ISyncSchedulerLogRepository _schedulerLogRepo;
private readonly IHybridLogicalClock _hlc;
private readonly ILogger<AirGapSyncService> _logger;
/// <summary>
/// Initializes a new instance of the <see cref="AirGapSyncService"/> class.
/// </summary>
public AirGapSyncService(
IHlcMergeService mergeService,
ISyncSchedulerLogRepository schedulerLogRepo,
IHybridLogicalClock hlc,
ILogger<AirGapSyncService> logger)
{
_mergeService = mergeService ?? throw new ArgumentNullException(nameof(mergeService));
_schedulerLogRepo = schedulerLogRepo ?? throw new ArgumentNullException(nameof(schedulerLogRepo));
_hlc = hlc ?? throw new ArgumentNullException(nameof(hlc));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public async Task<SyncResult> SyncFromBundleAsync(
AirGapBundle bundle,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(bundle);
_logger.LogInformation(
"Starting sync from bundle {BundleId} with {LogCount} node logs for tenant {TenantId}",
bundle.BundleId, bundle.JobLogs.Count, bundle.TenantId);
// 1. Merge all offline logs
var merged = await _mergeService.MergeAsync(bundle.JobLogs, cancellationToken)
.ConfigureAwait(false);
if (merged.MergedEntries.Count == 0)
{
_logger.LogInformation("Bundle {BundleId} has no entries to sync", bundle.BundleId);
return new SyncResult
{
BundleId = bundle.BundleId,
TotalInBundle = 0,
Appended = 0,
Duplicates = 0,
AlreadyExisted = 0
};
}
// 2. Get current scheduler chain head
var (currentLink, _) = await _schedulerLogRepo.GetChainHeadAsync(
bundle.TenantId,
cancellationToken: cancellationToken).ConfigureAwait(false);
// 3. For each merged entry, update the HLC via receive.
// This ensures the central clock advances past all offline timestamps.
foreach (var entry in merged.MergedEntries)
{
_hlc.Receive(entry.THlc);
}
// 4. Append merged entries to the scheduler log.
// Chain links are recomputed so imported entries extend from the current head.
byte[]? prevLink = currentLink;
var appended = 0;
var alreadyExisted = 0;
var warnings = new List<string>();
foreach (var entry in merged.MergedEntries)
{
cancellationToken.ThrowIfCancellationRequested();
// Check if job already exists (idempotency)
var exists = await _schedulerLogRepo.ExistsByJobIdAsync(
bundle.TenantId,
entry.JobId,
cancellationToken).ConfigureAwait(false);
if (exists)
{
_logger.LogDebug(
"Job {JobId} already exists in scheduler log, skipping",
entry.JobId);
alreadyExisted++;
continue;
}
// Compute new chain link extending from current chain
var newLink = OfflineHlcManager.ComputeLink(
prevLink,
entry.JobId,
entry.THlc,
entry.PayloadHash);
// Insert the entry
await _schedulerLogRepo.InsertSyncedEntryAsync(
bundle.TenantId,
entry.THlc.ToSortableString(),
entry.PartitionKey,
entry.JobId,
entry.PayloadHash,
prevLink,
newLink,
entry.SourceNodeId,
bundle.BundleId,
cancellationToken).ConfigureAwait(false);
prevLink = newLink;
appended++;
}
_logger.LogInformation(
"Sync complete for bundle {BundleId}: {Appended} appended, {Duplicates} duplicates, {AlreadyExisted} already existed",
bundle.BundleId, appended, merged.Duplicates.Count, alreadyExisted);
return new SyncResult
{
BundleId = bundle.BundleId,
TotalInBundle = merged.MergedEntries.Count,
Appended = appended,
Duplicates = merged.Duplicates.Count,
AlreadyExisted = alreadyExisted,
NewChainHead = prevLink,
Warnings = warnings.Count > 0 ? warnings : null
};
}
}
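// Illustrative sketch (not part of the original commit): the import-then-sync
// round trip at the central scheduler. Both services are assumed to come from DI.
internal static class AirGapSyncSketch
{
    internal static async Task<SyncResult> SyncFileAsync(
        IAirGapBundleImporter importer,
        IAirGapSyncService syncService,
        string bundlePath,
        CancellationToken ct)
    {
        var bundle = await importer.ImportFromFileAsync(bundlePath, ct).ConfigureAwait(false);
        // SyncFromBundleAsync merges node logs, advances the HLC, and appends idempotently.
        return await syncService.SyncFromBundleAsync(bundle, ct).ConfigureAwait(false);
    }
}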

View File

@@ -0,0 +1,114 @@
// <copyright file="ConflictResolver.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
namespace StellaOps.AirGap.Sync.Services;
/// <summary>
/// Interface for conflict resolution during merge.
/// </summary>
public interface IConflictResolver
{
/// <summary>
/// Resolves conflicts when the same JobId appears in multiple entries.
/// </summary>
/// <param name="jobId">The conflicting job ID.</param>
/// <param name="conflicting">The conflicting entries with their source nodes.</param>
/// <returns>The resolution result.</returns>
ConflictResolution Resolve(
Guid jobId,
IReadOnlyList<(string NodeId, OfflineJobLogEntry Entry)> conflicting);
}
/// <summary>
/// Resolves conflicts during HLC merge operations.
/// </summary>
public sealed class ConflictResolver : IConflictResolver
{
private readonly ILogger<ConflictResolver> _logger;
/// <summary>
/// Initializes a new instance of the <see cref="ConflictResolver"/> class.
/// </summary>
public ConflictResolver(ILogger<ConflictResolver> logger)
{
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public ConflictResolution Resolve(
Guid jobId,
IReadOnlyList<(string NodeId, OfflineJobLogEntry Entry)> conflicting)
{
ArgumentNullException.ThrowIfNull(conflicting);
if (conflicting.Count == 0)
{
throw new ArgumentException("Conflicting list cannot be empty", nameof(conflicting));
}
if (conflicting.Count == 1)
{
// No conflict
return new ConflictResolution
{
Type = ConflictType.DuplicateTimestamp,
Resolution = ResolutionStrategy.TakeEarliest,
SelectedEntry = conflicting[0].Entry,
DroppedEntries = Array.Empty<OfflineJobLogEntry>()
};
}
// Verify payloads are actually different
var uniquePayloads = conflicting
.Select(c => Convert.ToHexString(c.Entry.PayloadHash))
.Distinct()
.ToList();
if (uniquePayloads.Count == 1)
{
// Same payload, different HLC timestamps - not a real conflict
// Take the earliest HLC (preserves causality)
var sorted = conflicting
.OrderBy(c => c.Entry.THlc.PhysicalTime)
.ThenBy(c => c.Entry.THlc.LogicalCounter)
.ThenBy(c => c.Entry.THlc.NodeId, StringComparer.Ordinal)
.ToList();
var earliest = sorted[0];
var dropped = sorted.Skip(1).Select(s => s.Entry).ToList();
_logger.LogDebug(
"Resolved duplicate timestamp conflict for JobId {JobId}: selected entry from node {NodeId} at {THlc}, dropped {DroppedCount} duplicates",
jobId, earliest.NodeId, earliest.Entry.THlc, dropped.Count);
return new ConflictResolution
{
Type = ConflictType.DuplicateTimestamp,
Resolution = ResolutionStrategy.TakeEarliest,
SelectedEntry = earliest.Entry,
DroppedEntries = dropped
};
}
// Actual conflict: same JobId, different payloads
// This indicates a bug in deterministic ID computation
var nodeIds = string.Join(", ", conflicting.Select(c => c.NodeId));
var payloadHashes = string.Join(", ", conflicting.Select(c => Convert.ToHexString(c.Entry.PayloadHash)[..16] + "..."));
_logger.LogError(
"Payload mismatch conflict for JobId {JobId}: different payloads from nodes [{NodeIds}] with hashes [{PayloadHashes}]",
jobId, nodeIds, payloadHashes);
return new ConflictResolution
{
Type = ConflictType.PayloadMismatch,
Resolution = ResolutionStrategy.Error,
Error = $"JobId {jobId} has conflicting payloads from nodes: {nodeIds}. " +
"This indicates a bug in deterministic job ID computation or payload tampering."
};
}
}
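
A minimal usage sketch for the resolver above. The OfflineJobLogEntry and HlcTimestamp object initializers mirror the test fixtures later in this commit; treat the exact property set as an assumption of the sketch rather than the canonical model shape.

    using Microsoft.Extensions.Logging.Abstractions;
    using StellaOps.AirGap.Sync.Models;
    using StellaOps.AirGap.Sync.Services;

    var resolver = new ConflictResolver(NullLogger<ConflictResolver>.Instance);
    var jobId = Guid.Parse("dddddddd-dddd-dddd-dddd-dddddddddddd");
    var sharedHash = new byte[32]; // identical payload hash on both nodes

    OfflineJobLogEntry Entry(string nodeId, long physicalTime) => new()
    {
        NodeId = nodeId,
        THlc = new HlcTimestamp { PhysicalTime = physicalTime, LogicalCounter = 0, NodeId = nodeId },
        JobId = jobId,
        Payload = "{}",
        PayloadHash = sharedHash,
        Link = new byte[32],
        EnqueuedAt = DateTimeOffset.UtcNow
    };

    // Same JobId + same payload hash from two nodes: a duplicate, not a real conflict.
    var resolution = resolver.Resolve(jobId, new[]
    {
        ("node-a", Entry("node-a", 100)),
        ("node-b", Entry("node-b", 105))
    });

    // TakeEarliest keeps the node-a entry (T=100) and drops the node-b copy.
    Console.WriteLine($"{resolution.Resolution}: kept {resolution.SelectedEntry?.NodeId}");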

View File

@@ -0,0 +1,169 @@
// <copyright file="HlcMergeService.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
namespace StellaOps.AirGap.Sync.Services;
/// <summary>
/// Interface for HLC-based merge operations.
/// </summary>
public interface IHlcMergeService
{
/// <summary>
/// Merges job logs from multiple offline nodes into a unified, HLC-ordered stream.
/// </summary>
/// <param name="nodeLogs">The node logs to merge.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The merge result.</returns>
Task<MergeResult> MergeAsync(
IReadOnlyList<NodeJobLog> nodeLogs,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Service for merging job logs from multiple offline nodes using HLC total ordering.
/// </summary>
public sealed class HlcMergeService : IHlcMergeService
{
private readonly IConflictResolver _conflictResolver;
private readonly ILogger<HlcMergeService> _logger;
/// <summary>
/// Initializes a new instance of the <see cref="HlcMergeService"/> class.
/// </summary>
public HlcMergeService(
IConflictResolver conflictResolver,
ILogger<HlcMergeService> logger)
{
_conflictResolver = conflictResolver ?? throw new ArgumentNullException(nameof(conflictResolver));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public Task<MergeResult> MergeAsync(
IReadOnlyList<NodeJobLog> nodeLogs,
CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(nodeLogs);
cancellationToken.ThrowIfCancellationRequested();
if (nodeLogs.Count == 0)
{
return Task.FromResult(new MergeResult
{
MergedEntries = Array.Empty<MergedJobEntry>(),
Duplicates = Array.Empty<DuplicateEntry>(),
SourceNodes = Array.Empty<string>()
});
}
_logger.LogInformation(
"Starting merge of {NodeCount} node logs with {TotalEntries} total entries",
nodeLogs.Count,
nodeLogs.Sum(l => l.Entries.Count));
// 1. Collect all entries from all nodes
var allEntries = nodeLogs
.SelectMany(log => log.Entries.Select(e => (log.NodeId, Entry: e)))
.ToList();
// 2. Sort by HLC total order: (PhysicalTime, LogicalCounter, NodeId, JobId)
var sorted = allEntries
.OrderBy(x => x.Entry.THlc.PhysicalTime)
.ThenBy(x => x.Entry.THlc.LogicalCounter)
.ThenBy(x => x.Entry.THlc.NodeId, StringComparer.Ordinal)
.ThenBy(x => x.Entry.JobId)
.ToList();
// 3. Group by JobId to detect duplicates
var groupedByJobId = sorted.GroupBy(x => x.Entry.JobId).ToList();
var deduplicated = new List<MergedJobEntry>();
var duplicates = new List<DuplicateEntry>();
foreach (var group in groupedByJobId)
{
var entries = group.ToList();
if (entries.Count == 1)
{
// No conflict - add directly
var (nodeId, entry) = entries[0];
deduplicated.Add(CreateMergedEntry(nodeId, entry));
}
else
{
// Multiple entries with same JobId - resolve conflict
var resolution = _conflictResolver.Resolve(group.Key, entries);
if (resolution.Resolution == ResolutionStrategy.Error)
{
_logger.LogError(
"Conflict resolution failed for JobId {JobId}: {Error}",
group.Key, resolution.Error);
throw new InvalidOperationException(resolution.Error);
}
// Add the selected entry
if (resolution.SelectedEntry is not null)
{
var sourceEntry = entries.First(e => e.Entry == resolution.SelectedEntry);
deduplicated.Add(CreateMergedEntry(sourceEntry.NodeId, resolution.SelectedEntry));
}
// Record duplicates
foreach (var dropped in resolution.DroppedEntries ?? Array.Empty<OfflineJobLogEntry>())
{
var sourceEntry = entries.First(e => e.Entry == dropped);
duplicates.Add(new DuplicateEntry(dropped.JobId, sourceEntry.NodeId, dropped.THlc));
}
}
}
// 4. Sort deduplicated entries by HLC order
deduplicated = deduplicated
.OrderBy(x => x.THlc.PhysicalTime)
.ThenBy(x => x.THlc.LogicalCounter)
.ThenBy(x => x.THlc.NodeId, StringComparer.Ordinal)
.ThenBy(x => x.JobId)
.ToList();
// 5. Recompute unified chain
byte[]? prevLink = null;
foreach (var entry in deduplicated)
{
entry.MergedLink = OfflineHlcManager.ComputeLink(
prevLink,
entry.JobId,
entry.THlc,
entry.PayloadHash);
prevLink = entry.MergedLink;
}
_logger.LogInformation(
"Merge complete: {MergedCount} entries, {DuplicateCount} duplicates dropped",
deduplicated.Count, duplicates.Count);
return Task.FromResult(new MergeResult
{
MergedEntries = deduplicated,
Duplicates = duplicates,
MergedChainHead = prevLink,
SourceNodes = nodeLogs.Select(l => l.NodeId).ToList()
});
}
private static MergedJobEntry CreateMergedEntry(string nodeId, OfflineJobLogEntry entry) => new()
{
SourceNodeId = nodeId,
THlc = entry.THlc,
JobId = entry.JobId,
PartitionKey = entry.PartitionKey,
Payload = entry.Payload,
PayloadHash = entry.PayloadHash,
OriginalLink = entry.Link
};
}
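
A merge sketch under the same model-shape assumptions as the resolver example: two nodes with interleaved HLC timestamps collapse into one totally ordered stream with a freshly computed chain head.

    using System.Linq;
    using Microsoft.Extensions.Logging.Abstractions;
    using StellaOps.AirGap.Sync.Models;
    using StellaOps.AirGap.Sync.Services;

    var merger = new HlcMergeService(
        new ConflictResolver(NullLogger<ConflictResolver>.Instance),
        NullLogger<HlcMergeService>.Instance);

    NodeJobLog Log(string nodeId, params long[] physicalTimes) => new()
    {
        NodeId = nodeId,
        Entries = physicalTimes.Select(t => new OfflineJobLogEntry
        {
            NodeId = nodeId,
            THlc = new HlcTimestamp { PhysicalTime = t, LogicalCounter = 0, NodeId = nodeId },
            JobId = Guid.NewGuid(),
            Payload = "{}",
            PayloadHash = new byte[32],
            Link = new byte[32],
            EnqueuedAt = DateTimeOffset.UtcNow
        }).ToList()
    };

    // Interleaved inputs (100, 300) and (200, 400) come back as 100, 200, 300, 400.
    var result = await merger.MergeAsync(new[] { Log("node-a", 100, 300), Log("node-b", 200, 400) });
    Console.WriteLine($"{result.MergedEntries.Count} entries; chain head {Convert.ToHexString(result.MergedChainHead!)[..8]}...");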

View File

@@ -0,0 +1,172 @@
// <copyright file="OfflineHlcManager.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Security.Cryptography;
using System.Text;
using Microsoft.Extensions.Logging;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Stores;
using StellaOps.Canonical.Json;
using StellaOps.Determinism;
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Services;
/// <summary>
/// Interface for offline HLC management.
/// </summary>
public interface IOfflineHlcManager
{
/// <summary>
/// Enqueues a job locally while offline, maintaining the local chain.
/// </summary>
/// <typeparam name="T">The payload type.</typeparam>
/// <param name="payload">The job payload.</param>
/// <param name="idempotencyKey">The idempotency key for deterministic job ID.</param>
/// <param name="partitionKey">Optional partition key.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The enqueue result.</returns>
Task<OfflineEnqueueResult> EnqueueOfflineAsync<T>(
T payload,
string idempotencyKey,
string? partitionKey = null,
CancellationToken cancellationToken = default) where T : notnull;
/// <summary>
/// Gets the current node's job log for export.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The node job log, or null if empty.</returns>
Task<NodeJobLog?> GetNodeJobLogAsync(CancellationToken cancellationToken = default);
/// <summary>
/// Gets the node ID.
/// </summary>
string NodeId { get; }
}
/// <summary>
/// Manages HLC operations for offline/air-gap scenarios.
/// </summary>
public sealed class OfflineHlcManager : IOfflineHlcManager
{
private readonly IHybridLogicalClock _hlc;
private readonly IOfflineJobLogStore _jobLogStore;
private readonly IGuidProvider _guidProvider;
private readonly ILogger<OfflineHlcManager> _logger;
/// <summary>
/// Initializes a new instance of the <see cref="OfflineHlcManager"/> class.
/// </summary>
public OfflineHlcManager(
IHybridLogicalClock hlc,
IOfflineJobLogStore jobLogStore,
IGuidProvider guidProvider,
ILogger<OfflineHlcManager> logger)
{
_hlc = hlc ?? throw new ArgumentNullException(nameof(hlc));
_jobLogStore = jobLogStore ?? throw new ArgumentNullException(nameof(jobLogStore));
_guidProvider = guidProvider ?? throw new ArgumentNullException(nameof(guidProvider));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public string NodeId => _hlc.NodeId;
/// <inheritdoc/>
public async Task<OfflineEnqueueResult> EnqueueOfflineAsync<T>(
T payload,
string idempotencyKey,
string? partitionKey = null,
CancellationToken cancellationToken = default) where T : notnull
{
ArgumentNullException.ThrowIfNull(payload);
ArgumentException.ThrowIfNullOrWhiteSpace(idempotencyKey);
// 1. Generate HLC timestamp
var tHlc = _hlc.Tick();
// 2. Compute deterministic job ID from idempotency key
var jobId = ComputeDeterministicJobId(idempotencyKey);
// 3. Serialize and hash payload
var payloadJson = CanonJson.Serialize(payload);
var payloadHash = SHA256.HashData(Encoding.UTF8.GetBytes(payloadJson));
// 4. Get previous chain link
var prevLink = await _jobLogStore.GetLastLinkAsync(NodeId, cancellationToken)
.ConfigureAwait(false);
// 5. Compute chain link
var link = ComputeLink(prevLink, jobId, tHlc, payloadHash);
// 6. Create and store entry
var entry = new OfflineJobLogEntry
{
NodeId = NodeId,
THlc = tHlc,
JobId = jobId,
PartitionKey = partitionKey,
Payload = payloadJson,
PayloadHash = payloadHash,
PrevLink = prevLink,
Link = link,
EnqueuedAt = DateTimeOffset.UtcNow
};
await _jobLogStore.AppendAsync(entry, cancellationToken).ConfigureAwait(false);
_logger.LogInformation(
"Enqueued offline job {JobId} with HLC {THlc} on node {NodeId}",
jobId, tHlc, NodeId);
return new OfflineEnqueueResult
{
THlc = tHlc,
JobId = jobId,
Link = link,
NodeId = NodeId
};
}
/// <inheritdoc/>
public Task<NodeJobLog?> GetNodeJobLogAsync(CancellationToken cancellationToken = default)
=> _jobLogStore.GetNodeJobLogAsync(NodeId, cancellationToken);
/// <summary>
/// Computes deterministic job ID from idempotency key.
/// </summary>
private Guid ComputeDeterministicJobId(string idempotencyKey)
{
var hash = SHA256.HashData(Encoding.UTF8.GetBytes(idempotencyKey));
// Use first 16 bytes of SHA-256 as deterministic GUID
return new Guid(hash.AsSpan(0, 16));
}
/// <summary>
/// Computes chain link: Hash(prev_link || job_id || t_hlc || payload_hash).
/// </summary>
internal static byte[] ComputeLink(
byte[]? prevLink,
Guid jobId,
HlcTimestamp tHlc,
byte[] payloadHash)
{
using var hasher = IncrementalHash.CreateHash(HashAlgorithmName.SHA256);
// Previous link (or 32 zero bytes for first entry)
hasher.AppendData(prevLink ?? new byte[32]);
// Job ID as bytes
hasher.AppendData(jobId.ToByteArray());
// HLC timestamp as UTF-8 bytes
hasher.AppendData(Encoding.UTF8.GetBytes(tHlc.ToSortableString()));
// Payload hash
hasher.AppendData(payloadHash);
return hasher.GetHashAndReset();
}
}
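
The deterministic-ID derivation above, reproduced standalone to show why offline re-submissions dedupe cleanly: the JobId is a pure function of the idempotency key, so two air-gapped nodes enqueuing the same logical work compute the same GUID, and the merge layer drops one copy instead of flagging a conflict.

    using System.Security.Cryptography;
    using System.Text;

    // Same derivation as OfflineHlcManager.ComputeDeterministicJobId:
    // the first 16 bytes of SHA-256(idempotencyKey) become the GUID.
    static Guid DeterministicJobId(string idempotencyKey)
    {
        var hash = SHA256.HashData(Encoding.UTF8.GetBytes(idempotencyKey));
        return new Guid(hash.AsSpan(0, 16));
    }

    Console.WriteLine(DeterministicJobId("tenant-1/scan/sha256:abc123"));
    Console.WriteLine(DeterministicJobId("tenant-1/scan/sha256:abc123")); // identical GUID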

View File

@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Canonical.Json\StellaOps.Canonical.Json.csproj" />
<ProjectReference Include="..\..\..\__Libraries\StellaOps.Determinism.Abstractions\StellaOps.Determinism.Abstractions.csproj" />
<ProjectReference Include="..\..\..\__Libraries\StellaOps.HybridLogicalClock\StellaOps.HybridLogicalClock.csproj" />
<ProjectReference Include="..\..\..\Scheduler\__Libraries\StellaOps.Scheduler.Models\StellaOps.Scheduler.Models.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,246 @@
// <copyright file="FileBasedOfflineJobLogStore.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.AirGap.Sync.Models;
using StellaOps.Canonical.Json;
using StellaOps.HybridLogicalClock;
namespace StellaOps.AirGap.Sync.Stores;
/// <summary>
/// Options for the file-based offline job log store.
/// </summary>
public sealed class FileBasedOfflineJobLogStoreOptions
{
/// <summary>
/// Gets or sets the directory for storing offline job logs.
/// </summary>
public string DataDirectory { get; set; } = "./offline-job-logs";
}
/// <summary>
/// File-based implementation of <see cref="IOfflineJobLogStore"/> for air-gap scenarios.
/// </summary>
public sealed class FileBasedOfflineJobLogStore : IOfflineJobLogStore
{
private readonly IOptions<FileBasedOfflineJobLogStoreOptions> _options;
private readonly ILogger<FileBasedOfflineJobLogStore> _logger;
private readonly SemaphoreSlim _lock = new(1, 1);
private static readonly JsonSerializerOptions JsonOptions = new()
{
WriteIndented = false,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
/// <summary>
/// Initializes a new instance of the <see cref="FileBasedOfflineJobLogStore"/> class.
/// </summary>
public FileBasedOfflineJobLogStore(
IOptions<FileBasedOfflineJobLogStoreOptions> options,
ILogger<FileBasedOfflineJobLogStore> logger)
{
_options = options ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
EnsureDirectoryExists();
}
/// <inheritdoc/>
public async Task AppendAsync(OfflineJobLogEntry entry, CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(entry);
await _lock.WaitAsync(cancellationToken).ConfigureAwait(false);
try
{
var filePath = GetNodeLogFilePath(entry.NodeId);
var dto = ToDto(entry);
var line = JsonSerializer.Serialize(dto, JsonOptions);
await File.AppendAllTextAsync(filePath, line + Environment.NewLine, cancellationToken)
.ConfigureAwait(false);
_logger.LogDebug(
"Appended offline job entry {JobId} for node {NodeId}",
entry.JobId, entry.NodeId);
}
finally
{
_lock.Release();
}
}
/// <inheritdoc/>
public async Task<IReadOnlyList<OfflineJobLogEntry>> GetEntriesAsync(
string nodeId,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(nodeId);
await _lock.WaitAsync(cancellationToken).ConfigureAwait(false);
try
{
return await ReadEntriesUnlockedAsync(nodeId, cancellationToken).ConfigureAwait(false);
}
finally
{
_lock.Release();
}
}
/// <summary>
/// Reads entries without taking the semaphore; callers must already hold <see cref="_lock"/>.
/// Extracted so <see cref="ClearEntriesAsync"/> can reuse it while holding the non-reentrant lock.
/// </summary>
private async Task<IReadOnlyList<OfflineJobLogEntry>> ReadEntriesUnlockedAsync(
string nodeId,
CancellationToken cancellationToken)
{
var filePath = GetNodeLogFilePath(nodeId);
if (!File.Exists(filePath))
{
return Array.Empty<OfflineJobLogEntry>();
}
var lines = await File.ReadAllLinesAsync(filePath, cancellationToken).ConfigureAwait(false);
var entries = new List<OfflineJobLogEntry>(lines.Length);
foreach (var line in lines)
{
if (string.IsNullOrWhiteSpace(line))
{
continue;
}
var dto = JsonSerializer.Deserialize<OfflineJobLogEntryDto>(line, JsonOptions);
if (dto is not null)
{
entries.Add(FromDto(dto));
}
}
// Return in HLC order
return entries.OrderBy(e => e.THlc).ToList();
}
/// <inheritdoc/>
public async Task<byte[]?> GetLastLinkAsync(string nodeId, CancellationToken cancellationToken = default)
{
var entries = await GetEntriesAsync(nodeId, cancellationToken).ConfigureAwait(false);
return entries.Count > 0 ? entries[^1].Link : null;
}
/// <inheritdoc/>
public async Task<NodeJobLog?> GetNodeJobLogAsync(string nodeId, CancellationToken cancellationToken = default)
{
var entries = await GetEntriesAsync(nodeId, cancellationToken).ConfigureAwait(false);
if (entries.Count == 0)
{
return null;
}
var lastEntry = entries[^1];
return new NodeJobLog
{
NodeId = nodeId,
LastHlc = lastEntry.THlc,
ChainHead = lastEntry.Link,
Entries = entries
};
}
/// <inheritdoc/>
public async Task<int> ClearEntriesAsync(
string nodeId,
string upToHlc,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(nodeId);
await _lock.WaitAsync(cancellationToken).ConfigureAwait(false);
try
{
// GetEntriesAsync would re-acquire the non-reentrant semaphore and deadlock here,
// so read through the unlocked helper while the lock is already held.
var entries = await ReadEntriesUnlockedAsync(nodeId, cancellationToken).ConfigureAwait(false);
var remaining = entries
.Where(e => string.CompareOrdinal(e.THlc.ToSortableString(), upToHlc) > 0)
.ToList();
var cleared = entries.Count - remaining.Count;
if (remaining.Count == 0)
{
var filePath = GetNodeLogFilePath(nodeId);
if (File.Exists(filePath))
{
File.Delete(filePath);
}
}
else
{
// Rewrite with remaining entries
var filePath = GetNodeLogFilePath(nodeId);
var lines = remaining.Select(e => JsonSerializer.Serialize(ToDto(e), JsonOptions));
await File.WriteAllLinesAsync(filePath, lines, cancellationToken).ConfigureAwait(false);
}
_logger.LogInformation(
"Cleared {Count} offline job entries for node {NodeId} up to HLC {UpToHlc}",
cleared, nodeId, upToHlc);
return cleared;
}
finally
{
_lock.Release();
}
}
private string GetNodeLogFilePath(string nodeId)
{
var safeNodeId = nodeId.Replace('/', '_').Replace('\\', '_').Replace(':', '_');
return Path.Combine(_options.Value.DataDirectory, $"offline-jobs-{safeNodeId}.ndjson");
}
private void EnsureDirectoryExists()
{
var dir = _options.Value.DataDirectory;
if (!Directory.Exists(dir))
{
Directory.CreateDirectory(dir);
_logger.LogInformation("Created offline job log directory: {Directory}", dir);
}
}
private static OfflineJobLogEntryDto ToDto(OfflineJobLogEntry entry) => new()
{
NodeId = entry.NodeId,
THlc = entry.THlc.ToSortableString(),
JobId = entry.JobId,
PartitionKey = entry.PartitionKey,
Payload = entry.Payload,
PayloadHash = Convert.ToBase64String(entry.PayloadHash),
PrevLink = entry.PrevLink is not null ? Convert.ToBase64String(entry.PrevLink) : null,
Link = Convert.ToBase64String(entry.Link),
EnqueuedAt = entry.EnqueuedAt
};
private static OfflineJobLogEntry FromDto(OfflineJobLogEntryDto dto) => new()
{
NodeId = dto.NodeId,
THlc = HlcTimestamp.Parse(dto.THlc),
JobId = dto.JobId,
PartitionKey = dto.PartitionKey,
Payload = dto.Payload,
PayloadHash = Convert.FromBase64String(dto.PayloadHash),
PrevLink = dto.PrevLink is not null ? Convert.FromBase64String(dto.PrevLink) : null,
Link = Convert.FromBase64String(dto.Link),
EnqueuedAt = dto.EnqueuedAt
};
private sealed record OfflineJobLogEntryDto
{
public required string NodeId { get; init; }
public required string THlc { get; init; }
public required Guid JobId { get; init; }
public string? PartitionKey { get; init; }
public required string Payload { get; init; }
public required string PayloadHash { get; init; }
public string? PrevLink { get; init; }
public required string Link { get; init; }
public DateTimeOffset EnqueuedAt { get; init; }
}
}
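
A quick append/read roundtrip against the store above. The directory and entry values are placeholders, and the model initializers carry the same shape assumptions as the earlier sketches.

    using Microsoft.Extensions.Logging.Abstractions;
    using Microsoft.Extensions.Options;
    using StellaOps.AirGap.Sync.Models;
    using StellaOps.AirGap.Sync.Stores;
    using StellaOps.HybridLogicalClock;

    var store = new FileBasedOfflineJobLogStore(
        Options.Create(new FileBasedOfflineJobLogStoreOptions { DataDirectory = "./offline-job-logs" }),
        NullLogger<FileBasedOfflineJobLogStore>.Instance);

    await store.AppendAsync(new OfflineJobLogEntry
    {
        NodeId = "node-a",
        THlc = new HlcTimestamp { PhysicalTime = 100, LogicalCounter = 0, NodeId = "node-a" },
        JobId = Guid.NewGuid(),
        Payload = "{}",
        PayloadHash = new byte[32],
        Link = new byte[32],
        EnqueuedAt = DateTimeOffset.UtcNow
    });

    // Entries come back in HLC order from the per-node NDJSON file.
    var log = await store.GetNodeJobLogAsync("node-a");
    Console.WriteLine($"{log?.Entries.Count} entries for {log?.NodeId}");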

View File

@@ -0,0 +1,58 @@
// <copyright file="IOfflineJobLogStore.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using StellaOps.AirGap.Sync.Models;
namespace StellaOps.AirGap.Sync.Stores;
/// <summary>
/// Interface for storing offline job log entries.
/// </summary>
public interface IOfflineJobLogStore
{
/// <summary>
/// Appends an entry to the offline job log.
/// </summary>
/// <param name="entry">The entry to append.</param>
/// <param name="cancellationToken">Cancellation token.</param>
Task AppendAsync(OfflineJobLogEntry entry, CancellationToken cancellationToken = default);
/// <summary>
/// Gets all entries for a node.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>All entries in HLC order.</returns>
Task<IReadOnlyList<OfflineJobLogEntry>> GetEntriesAsync(
string nodeId,
CancellationToken cancellationToken = default);
/// <summary>
/// Gets the last chain link for a node.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The last link, or null if no entries exist.</returns>
Task<byte[]?> GetLastLinkAsync(string nodeId, CancellationToken cancellationToken = default);
/// <summary>
/// Gets the node job log for export.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The complete node job log.</returns>
Task<NodeJobLog?> GetNodeJobLogAsync(string nodeId, CancellationToken cancellationToken = default);
/// <summary>
/// Clears entries for a node after successful sync.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="upToHlc">Clear entries up to and including this HLC timestamp.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Number of entries cleared.</returns>
Task<int> ClearEntriesAsync(
string nodeId,
string upToHlc,
CancellationToken cancellationToken = default);
}
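
One plausible composition-root wiring for these abstractions. This is a sketch, not shipped registration code; the IHybridLogicalClock and IGuidProvider registrations come from their own libraries and are elided.

    using Microsoft.Extensions.DependencyInjection;
    using StellaOps.AirGap.Sync.Services;
    using StellaOps.AirGap.Sync.Stores;

    var services = new ServiceCollection();
    services.AddLogging();
    services.Configure<FileBasedOfflineJobLogStoreOptions>(
        o => o.DataDirectory = "/var/lib/stellaops/offline-jobs");
    services.AddSingleton<IOfflineJobLogStore, FileBasedOfflineJobLogStore>();
    services.AddSingleton<IConflictResolver, ConflictResolver>();
    services.AddSingleton<IHlcMergeService, HlcMergeService>();
    services.AddSingleton<IOfflineHlcManager, OfflineHlcManager>();
    // OfflineHlcManager also needs IHybridLogicalClock and IGuidProvider,
    // registered by the HLC and determinism libraries respectively.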

View File

@@ -0,0 +1,161 @@
// <copyright file="AirGapSyncMetrics.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Diagnostics.Metrics;
using StellaOps.AirGap.Sync.Models;
namespace StellaOps.AirGap.Sync.Telemetry;
/// <summary>
/// Metrics for air-gap sync operations.
/// </summary>
public static class AirGapSyncMetrics
{
private const string NodeIdTag = "node_id";
private const string TenantIdTag = "tenant_id";
private const string ConflictTypeTag = "conflict_type";
private static readonly Meter Meter = new("StellaOps.AirGap.Sync");
// Counters
private static readonly Counter<long> BundlesExportedCounter = Meter.CreateCounter<long>(
"airgap_bundles_exported_total",
unit: "{bundle}",
description: "Total number of air-gap bundles exported");
private static readonly Counter<long> BundlesImportedCounter = Meter.CreateCounter<long>(
"airgap_bundles_imported_total",
unit: "{bundle}",
description: "Total number of air-gap bundles imported");
private static readonly Counter<long> JobsSyncedCounter = Meter.CreateCounter<long>(
"airgap_jobs_synced_total",
unit: "{job}",
description: "Total number of jobs synced from air-gap bundles");
private static readonly Counter<long> DuplicatesDroppedCounter = Meter.CreateCounter<long>(
"airgap_duplicates_dropped_total",
unit: "{duplicate}",
description: "Total number of duplicate entries dropped during merge");
private static readonly Counter<long> MergeConflictsCounter = Meter.CreateCounter<long>(
"airgap_merge_conflicts_total",
unit: "{conflict}",
description: "Total number of merge conflicts by type");
private static readonly Counter<long> OfflineEnqueuesCounter = Meter.CreateCounter<long>(
"airgap_offline_enqueues_total",
unit: "{enqueue}",
description: "Total number of offline enqueue operations");
// Histograms
private static readonly Histogram<double> BundleSizeHistogram = Meter.CreateHistogram<double>(
"airgap_bundle_size_bytes",
unit: "By",
description: "Size of air-gap bundles in bytes");
private static readonly Histogram<double> SyncDurationHistogram = Meter.CreateHistogram<double>(
"airgap_sync_duration_seconds",
unit: "s",
description: "Duration of air-gap sync operations");
private static readonly Histogram<int> MergeEntriesHistogram = Meter.CreateHistogram<int>(
"airgap_merge_entries_count",
unit: "{entry}",
description: "Number of entries in merge operations");
/// <summary>
/// Records a bundle export.
/// </summary>
/// <param name="nodeId">The node ID that exported.</param>
/// <param name="tenantId">The tenant ID.</param>
/// <param name="entryCount">Number of entries in the bundle.</param>
public static void RecordBundleExported(string nodeId, string tenantId, int entryCount)
{
BundlesExportedCounter.Add(1,
new KeyValuePair<string, object?>(NodeIdTag, nodeId),
new KeyValuePair<string, object?>(TenantIdTag, tenantId));
MergeEntriesHistogram.Record(entryCount,
new KeyValuePair<string, object?>(NodeIdTag, nodeId));
}
/// <summary>
/// Records a bundle import.
/// </summary>
/// <param name="nodeId">The node ID that imported.</param>
/// <param name="tenantId">The tenant ID.</param>
public static void RecordBundleImported(string nodeId, string tenantId)
{
BundlesImportedCounter.Add(1,
new KeyValuePair<string, object?>(NodeIdTag, nodeId),
new KeyValuePair<string, object?>(TenantIdTag, tenantId));
}
/// <summary>
/// Records jobs synced from a bundle.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="count">Number of jobs synced.</param>
public static void RecordJobsSynced(string nodeId, int count)
{
JobsSyncedCounter.Add(count,
new KeyValuePair<string, object?>(NodeIdTag, nodeId));
}
/// <summary>
/// Records duplicates dropped during merge.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="count">Number of duplicates dropped.</param>
public static void RecordDuplicatesDropped(string nodeId, int count)
{
if (count > 0)
{
DuplicatesDroppedCounter.Add(count,
new KeyValuePair<string, object?>(NodeIdTag, nodeId));
}
}
/// <summary>
/// Records a merge conflict.
/// </summary>
/// <param name="conflictType">The type of conflict.</param>
public static void RecordMergeConflict(ConflictType conflictType)
{
MergeConflictsCounter.Add(1,
new KeyValuePair<string, object?>(ConflictTypeTag, conflictType.ToString()));
}
/// <summary>
/// Records an offline enqueue operation.
/// </summary>
/// <param name="nodeId">The node ID.</param>
public static void RecordOfflineEnqueue(string nodeId)
{
OfflineEnqueuesCounter.Add(1,
new KeyValuePair<string, object?>(NodeIdTag, nodeId));
}
/// <summary>
/// Records bundle size.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="sizeBytes">Size in bytes.</param>
public static void RecordBundleSize(string nodeId, long sizeBytes)
{
BundleSizeHistogram.Record(sizeBytes,
new KeyValuePair<string, object?>(NodeIdTag, nodeId));
}
/// <summary>
/// Records sync duration.
/// </summary>
/// <param name="nodeId">The node ID.</param>
/// <param name="durationSeconds">Duration in seconds.</param>
public static void RecordSyncDuration(string nodeId, double durationSeconds)
{
SyncDurationHistogram.Record(durationSeconds,
new KeyValuePair<string, object?>(NodeIdTag, nodeId));
}
}
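
Because everything above hangs off a Meter named "StellaOps.AirGap.Sync", any in-process MeterListener (or an OpenTelemetry reader subscribed to that meter name) can observe sync activity. A minimal sketch:

    using System.Diagnostics.Metrics;
    using StellaOps.AirGap.Sync.Telemetry;

    using var listener = new MeterListener();
    listener.InstrumentPublished = (instrument, l) =>
    {
        if (instrument.Meter.Name == "StellaOps.AirGap.Sync")
        {
            l.EnableMeasurementEvents(instrument);
        }
    };
    listener.SetMeasurementEventCallback<long>(
        (instrument, value, tags, state) => Console.WriteLine($"{instrument.Name} += {value}"));
    listener.Start();

    AirGapSyncMetrics.RecordJobsSynced("node-a", 42); // prints: airgap_jobs_synced_total += 42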

View File

@@ -0,0 +1,221 @@
// <copyright file="FileBasedJobSyncTransport.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;
using StellaOps.AirGap.Sync.Telemetry;
namespace StellaOps.AirGap.Sync.Transport;
/// <summary>
/// File-based transport for job sync bundles in air-gapped scenarios.
/// </summary>
public sealed class FileBasedJobSyncTransport : IJobSyncTransport
{
private readonly IAirGapBundleExporter _exporter;
private readonly IAirGapBundleImporter _importer;
private readonly FileBasedJobSyncTransportOptions _options;
private readonly ILogger<FileBasedJobSyncTransport> _logger;
/// <summary>
/// Initializes a new instance of the <see cref="FileBasedJobSyncTransport"/> class.
/// </summary>
public FileBasedJobSyncTransport(
IAirGapBundleExporter exporter,
IAirGapBundleImporter importer,
IOptions<FileBasedJobSyncTransportOptions> options,
ILogger<FileBasedJobSyncTransport> logger)
{
_exporter = exporter ?? throw new ArgumentNullException(nameof(exporter));
_importer = importer ?? throw new ArgumentNullException(nameof(importer));
_options = options?.Value ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public string TransportId => "file";
/// <inheritdoc/>
public async Task<JobSyncSendResult> SendBundleAsync(
AirGapBundle bundle,
string destination,
CancellationToken cancellationToken = default)
{
var startTime = DateTimeOffset.UtcNow;
try
{
// Ensure destination directory exists
var destPath = Path.IsPathRooted(destination)
? destination
: Path.Combine(_options.OutputDirectory, destination);
Directory.CreateDirectory(destPath);
// Export to file
var filePath = Path.Combine(destPath, $"job-sync-{bundle.BundleId:N}.json");
await _exporter.ExportToFileAsync(bundle, filePath, cancellationToken)
.ConfigureAwait(false);
var fileInfo = new FileInfo(filePath);
var sizeBytes = fileInfo.Exists ? fileInfo.Length : 0;
_logger.LogInformation(
"Exported job sync bundle {BundleId} to {Path} ({Size} bytes)",
bundle.BundleId,
filePath,
sizeBytes);
AirGapSyncMetrics.RecordBundleSize(bundle.CreatedByNodeId, sizeBytes);
return new JobSyncSendResult
{
Success = true,
BundleId = bundle.BundleId,
Destination = filePath,
TransmittedAt = startTime,
SizeBytes = sizeBytes
};
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to export job sync bundle {BundleId}", bundle.BundleId);
return new JobSyncSendResult
{
Success = false,
BundleId = bundle.BundleId,
Destination = destination,
Error = ex.Message,
TransmittedAt = startTime
};
}
}
/// <inheritdoc/>
public async Task<AirGapBundle?> ReceiveBundleAsync(
string source,
CancellationToken cancellationToken = default)
{
try
{
var sourcePath = Path.IsPathRooted(source)
? source
: Path.Combine(_options.InputDirectory, source);
if (!File.Exists(sourcePath))
{
_logger.LogWarning("Job sync bundle file not found: {Path}", sourcePath);
return null;
}
var bundle = await _importer.ImportFromFileAsync(sourcePath, cancellationToken)
.ConfigureAwait(false);
_logger.LogInformation(
"Imported job sync bundle {BundleId} from {Path}",
bundle.BundleId,
sourcePath);
return bundle;
}
catch (Exception ex)
{
_logger.LogError(ex, "Failed to import job sync bundle from {Source}", source);
return null;
}
}
/// <inheritdoc/>
public Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
string source,
CancellationToken cancellationToken = default)
{
var sourcePath = Path.IsPathRooted(source)
? source
: Path.Combine(_options.InputDirectory, source);
var bundles = new List<BundleInfo>();
if (!Directory.Exists(sourcePath))
{
return Task.FromResult<IReadOnlyList<BundleInfo>>(bundles);
}
var files = Directory.GetFiles(sourcePath, "job-sync-*.json");
foreach (var file in files)
{
try
{
// Quick parse to extract bundle metadata
var json = File.ReadAllText(file);
using var doc = JsonDocument.Parse(json); // JsonDocument is IDisposable; dispose to return pooled buffers
var root = doc.RootElement;
if (root.TryGetProperty("bundleId", out var bundleIdProp) &&
root.TryGetProperty("tenantId", out var tenantIdProp) &&
root.TryGetProperty("createdByNodeId", out var nodeIdProp) &&
root.TryGetProperty("createdAt", out var createdAtProp))
{
var entryCount = 0;
if (root.TryGetProperty("jobLogs", out var jobLogs))
{
foreach (var log in jobLogs.EnumerateArray())
{
if (log.TryGetProperty("entries", out var entries))
{
entryCount += entries.GetArrayLength();
}
}
}
bundles.Add(new BundleInfo
{
BundleId = Guid.Parse(bundleIdProp.GetString()!),
TenantId = tenantIdProp.GetString()!,
SourceNodeId = nodeIdProp.GetString()!,
CreatedAt = createdAtProp.GetDateTimeOffset(),
EntryCount = entryCount,
SizeBytes = new FileInfo(file).Length
});
}
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to parse bundle metadata from {File}", file);
}
}
return Task.FromResult<IReadOnlyList<BundleInfo>>(
bundles.OrderByDescending(b => b.CreatedAt).ToList());
}
}
/// <summary>
/// Options for file-based job sync transport.
/// </summary>
public sealed class FileBasedJobSyncTransportOptions
{
/// <summary>
/// Gets or sets the output directory for exporting bundles.
/// </summary>
public string OutputDirectory { get; set; } = Path.Combine(
Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
"stellaops",
"airgap",
"outbox");
/// <summary>
/// Gets or sets the input directory for importing bundles.
/// </summary>
public string InputDirectory { get; set; } = Path.Combine(
Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
"stellaops",
"airgap",
"inbox");
}
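
A configuration sketch for sneakernet use: point the outbox and inbox at removable media. The paths are placeholders, and the transport's IAirGapBundleExporter / IAirGapBundleImporter dependencies are assumed to be registered elsewhere in this change set.

    using Microsoft.Extensions.DependencyInjection;
    using StellaOps.AirGap.Sync.Transport;

    var services = new ServiceCollection();
    services.AddLogging();
    services.Configure<FileBasedJobSyncTransportOptions>(o =>
    {
        o.OutputDirectory = "/mnt/usb/stellaops/outbox"; // written by SendBundleAsync
        o.InputDirectory = "/mnt/usb/stellaops/inbox";   // scanned by ListAvailableBundlesAsync
    });
    services.AddSingleton<IJobSyncTransport, FileBasedJobSyncTransport>();
    // Also requires IAirGapBundleExporter / IAirGapBundleImporter registrations.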

View File

@@ -0,0 +1,123 @@
// <copyright file="IJobSyncTransport.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using StellaOps.AirGap.Sync.Models;
namespace StellaOps.AirGap.Sync.Transport;
/// <summary>
/// Transport abstraction for job sync bundles.
/// Enables bundle transfer over various transports (file, Router messaging, etc.).
/// </summary>
public interface IJobSyncTransport
{
/// <summary>
/// Gets the transport identifier.
/// </summary>
string TransportId { get; }
/// <summary>
/// Sends a job sync bundle to a destination.
/// </summary>
/// <param name="bundle">The bundle to send.</param>
/// <param name="destination">The destination identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The send result.</returns>
Task<JobSyncSendResult> SendBundleAsync(
AirGapBundle bundle,
string destination,
CancellationToken cancellationToken = default);
/// <summary>
/// Receives a job sync bundle from a source.
/// </summary>
/// <param name="source">The source identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The received bundle, or null if not available.</returns>
Task<AirGapBundle?> ReceiveBundleAsync(
string source,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists available bundles from a source.
/// </summary>
/// <param name="source">The source identifier.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Metadata describing the available bundles.</returns>
Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
string source,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Result of sending a job sync bundle.
/// </summary>
public sealed record JobSyncSendResult
{
/// <summary>
/// Gets a value indicating whether the send was successful.
/// </summary>
public required bool Success { get; init; }
/// <summary>
/// Gets the bundle ID.
/// </summary>
public required Guid BundleId { get; init; }
/// <summary>
/// Gets the destination where the bundle was sent.
/// </summary>
public required string Destination { get; init; }
/// <summary>
/// Gets the error message if the send failed.
/// </summary>
public string? Error { get; init; }
/// <summary>
/// Gets the transmission timestamp.
/// </summary>
public DateTimeOffset TransmittedAt { get; init; }
/// <summary>
/// Gets the size of the transmitted data in bytes.
/// </summary>
public long SizeBytes { get; init; }
}
/// <summary>
/// Information about an available bundle.
/// </summary>
public sealed record BundleInfo
{
/// <summary>
/// Gets the bundle ID.
/// </summary>
public required Guid BundleId { get; init; }
/// <summary>
/// Gets the tenant ID.
/// </summary>
public required string TenantId { get; init; }
/// <summary>
/// Gets the source node ID.
/// </summary>
public required string SourceNodeId { get; init; }
/// <summary>
/// Gets the creation timestamp.
/// </summary>
public required DateTimeOffset CreatedAt { get; init; }
/// <summary>
/// Gets the entry count in the bundle.
/// </summary>
public int EntryCount { get; init; }
/// <summary>
/// Gets the bundle size in bytes.
/// </summary>
public long SizeBytes { get; init; }
}
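
An in-memory test double for the contract above, handy for exercising sync flows without disk or Router access. AirGapBundle's CreatedAt property is assumed from the "createdAt" JSON field the file transport reads; everything else comes straight from this interface.

    using System.Linq;
    using StellaOps.AirGap.Sync.Models;
    using StellaOps.AirGap.Sync.Transport;

    /// <summary>In-memory IJobSyncTransport for tests; bundles queue per destination.</summary>
    public sealed class InMemoryJobSyncTransport : IJobSyncTransport
    {
        private readonly Dictionary<string, Queue<AirGapBundle>> _queues = new();

        public string TransportId => "memory";

        public Task<JobSyncSendResult> SendBundleAsync(
            AirGapBundle bundle, string destination, CancellationToken cancellationToken = default)
        {
            if (!_queues.TryGetValue(destination, out var queue))
            {
                _queues[destination] = queue = new Queue<AirGapBundle>();
            }
            queue.Enqueue(bundle);
            return Task.FromResult(new JobSyncSendResult
            {
                Success = true,
                BundleId = bundle.BundleId,
                Destination = destination,
                TransmittedAt = DateTimeOffset.UtcNow
            });
        }

        public Task<AirGapBundle?> ReceiveBundleAsync(
            string source, CancellationToken cancellationToken = default)
            => Task.FromResult<AirGapBundle?>(
                _queues.TryGetValue(source, out var queue) && queue.Count > 0 ? queue.Dequeue() : null);

        public Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
            string source, CancellationToken cancellationToken = default)
            => Task.FromResult<IReadOnlyList<BundleInfo>>(
                _queues.TryGetValue(source, out var queue)
                    ? queue.Select(b => new BundleInfo
                    {
                        BundleId = b.BundleId,
                        TenantId = b.TenantId,
                        SourceNodeId = b.CreatedByNodeId,
                        CreatedAt = b.CreatedAt // assumed property, mirrors the "createdAt" JSON field
                    }).ToList()
                    : Array.Empty<BundleInfo>());
    }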

View File

@@ -0,0 +1,272 @@
// <copyright file="RouterJobSyncTransport.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Text;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;
using StellaOps.AirGap.Sync.Telemetry;
namespace StellaOps.AirGap.Sync.Transport;
/// <summary>
/// Router-based transport for job sync bundles when network is available.
/// This transport uses the Router messaging infrastructure for real-time sync.
/// </summary>
public sealed class RouterJobSyncTransport : IJobSyncTransport
{
private readonly IAirGapBundleExporter _exporter;
private readonly IAirGapBundleImporter _importer;
private readonly IRouterJobSyncClient _routerClient;
private readonly RouterJobSyncTransportOptions _options;
private readonly ILogger<RouterJobSyncTransport> _logger;
/// <summary>
/// Initializes a new instance of the <see cref="RouterJobSyncTransport"/> class.
/// </summary>
public RouterJobSyncTransport(
IAirGapBundleExporter exporter,
IAirGapBundleImporter importer,
IRouterJobSyncClient routerClient,
IOptions<RouterJobSyncTransportOptions> options,
ILogger<RouterJobSyncTransport> logger)
{
_exporter = exporter ?? throw new ArgumentNullException(nameof(exporter));
_importer = importer ?? throw new ArgumentNullException(nameof(importer));
_routerClient = routerClient ?? throw new ArgumentNullException(nameof(routerClient));
_options = options?.Value ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc/>
public string TransportId => "router";
/// <inheritdoc/>
public async Task<JobSyncSendResult> SendBundleAsync(
AirGapBundle bundle,
string destination,
CancellationToken cancellationToken = default)
{
var startTime = DateTimeOffset.UtcNow;
try
{
// Serialize bundle
var json = await _exporter.ExportToStringAsync(bundle, cancellationToken)
.ConfigureAwait(false);
var payload = Encoding.UTF8.GetBytes(json);
_logger.LogDebug(
"Sending job sync bundle {BundleId} to {Destination} ({Size} bytes)",
bundle.BundleId,
destination,
payload.Length);
// Send via Router
var response = await _routerClient.SendJobSyncBundleAsync(
destination,
bundle.BundleId,
bundle.TenantId,
payload,
_options.SendTimeout,
cancellationToken).ConfigureAwait(false);
if (response.Success)
{
AirGapSyncMetrics.RecordBundleSize(bundle.CreatedByNodeId, payload.Length);
_logger.LogInformation(
"Sent job sync bundle {BundleId} to {Destination}",
bundle.BundleId,
destination);
}
else
{
_logger.LogWarning(
"Failed to send job sync bundle {BundleId} to {Destination}: {Error}",
bundle.BundleId,
destination,
response.Error);
}
return new JobSyncSendResult
{
Success = response.Success,
BundleId = bundle.BundleId,
Destination = destination,
Error = response.Error,
TransmittedAt = startTime,
SizeBytes = payload.Length
};
}
catch (Exception ex)
{
_logger.LogError(
ex,
"Error sending job sync bundle {BundleId} to {Destination}",
bundle.BundleId,
destination);
return new JobSyncSendResult
{
Success = false,
BundleId = bundle.BundleId,
Destination = destination,
Error = ex.Message,
TransmittedAt = startTime
};
}
}
/// <inheritdoc/>
public async Task<AirGapBundle?> ReceiveBundleAsync(
string source,
CancellationToken cancellationToken = default)
{
try
{
var response = await _routerClient.ReceiveJobSyncBundleAsync(
source,
_options.ReceiveTimeout,
cancellationToken).ConfigureAwait(false);
if (response.Payload is null || response.Payload.Length == 0)
{
_logger.LogDebug("No bundle available from {Source}", source);
return null;
}
var json = Encoding.UTF8.GetString(response.Payload);
var bundle = await _importer.ImportFromStringAsync(json, cancellationToken)
.ConfigureAwait(false);
_logger.LogInformation(
"Received job sync bundle {BundleId} from {Source}",
bundle.BundleId,
source);
return bundle;
}
catch (Exception ex)
{
_logger.LogError(ex, "Error receiving job sync bundle from {Source}", source);
return null;
}
}
/// <inheritdoc/>
public async Task<IReadOnlyList<BundleInfo>> ListAvailableBundlesAsync(
string source,
CancellationToken cancellationToken = default)
{
try
{
var response = await _routerClient.ListAvailableBundlesAsync(
source,
_options.ListTimeout,
cancellationToken).ConfigureAwait(false);
return response.Bundles;
}
catch (Exception ex)
{
_logger.LogError(ex, "Error listing available bundles from {Source}", source);
return Array.Empty<BundleInfo>();
}
}
}
/// <summary>
/// Options for Router-based job sync transport.
/// </summary>
public sealed class RouterJobSyncTransportOptions
{
/// <summary>
/// Gets or sets the timeout for send operations.
/// </summary>
public TimeSpan SendTimeout { get; set; } = TimeSpan.FromSeconds(30);
/// <summary>
/// Gets or sets the timeout for receive operations.
/// </summary>
public TimeSpan ReceiveTimeout { get; set; } = TimeSpan.FromSeconds(30);
/// <summary>
/// Gets or sets the timeout for list operations.
/// </summary>
public TimeSpan ListTimeout { get; set; } = TimeSpan.FromSeconds(10);
/// <summary>
/// Gets or sets the service endpoint for job sync.
/// </summary>
public string ServiceEndpoint { get; set; } = "scheduler.job-sync";
}
/// <summary>
/// Client interface for Router job sync operations.
/// </summary>
public interface IRouterJobSyncClient
{
/// <summary>
/// Sends a job sync bundle via the Router.
/// </summary>
Task<RouterSendResponse> SendJobSyncBundleAsync(
string destination,
Guid bundleId,
string tenantId,
byte[] payload,
TimeSpan timeout,
CancellationToken cancellationToken = default);
/// <summary>
/// Receives a job sync bundle via the Router.
/// </summary>
Task<RouterReceiveResponse> ReceiveJobSyncBundleAsync(
string source,
TimeSpan timeout,
CancellationToken cancellationToken = default);
/// <summary>
/// Lists available bundles via the Router.
/// </summary>
Task<RouterListResponse> ListAvailableBundlesAsync(
string source,
TimeSpan timeout,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Response from a Router send operation.
/// </summary>
public sealed record RouterSendResponse
{
/// <summary>Gets a value indicating whether the send was successful.</summary>
public bool Success { get; init; }
/// <summary>Gets the error message if failed.</summary>
public string? Error { get; init; }
}
/// <summary>
/// Response from a Router receive operation.
/// </summary>
public sealed record RouterReceiveResponse
{
/// <summary>Gets the received payload.</summary>
public byte[]? Payload { get; init; }
/// <summary>Gets the bundle ID.</summary>
public Guid? BundleId { get; init; }
}
/// <summary>
/// Response from a Router list operation.
/// </summary>
public sealed record RouterListResponse
{
/// <summary>Gets the available bundles.</summary>
public IReadOnlyList<BundleInfo> Bundles { get; init; } = Array.Empty<BundleInfo>();
}
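
A no-op IRouterJobSyncClient stub that satisfies the contract above for tests or fully disconnected deployments; a production client would bridge these calls onto Router messaging.

    using StellaOps.AirGap.Sync.Transport;

    public sealed class NullRouterJobSyncClient : IRouterJobSyncClient
    {
        public Task<RouterSendResponse> SendJobSyncBundleAsync(
            string destination, Guid bundleId, string tenantId, byte[] payload,
            TimeSpan timeout, CancellationToken cancellationToken = default)
            => Task.FromResult(new RouterSendResponse { Success = false, Error = "router unavailable" });

        public Task<RouterReceiveResponse> ReceiveJobSyncBundleAsync(
            string source, TimeSpan timeout, CancellationToken cancellationToken = default)
            => Task.FromResult(new RouterReceiveResponse()); // null Payload => "no bundle available"

        public Task<RouterListResponse> ListAvailableBundlesAsync(
            string source, TimeSpan timeout, CancellationToken cancellationToken = default)
            => Task.FromResult(new RouterListResponse());
    }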

View File

@@ -22,6 +22,9 @@ namespace StellaOps.AirGap.Bundle.Tests;
/// Task AIRGAP-5100-016: Export bundle (online env) → import bundle (offline env) → verify data integrity
/// Task AIRGAP-5100-017: Policy export → policy import → policy evaluation → verify identical verdict
/// </summary>
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Integrations)]
[Trait("BlastRadius", TestCategories.BlastRadius.Persistence)]
public sealed class AirGapIntegrationTests : IDisposable
{
private readonly string _tempRoot;

View File

@@ -0,0 +1,446 @@
// <copyright file="HlcMergeServiceTests.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using StellaOps.AirGap.Sync.Models;
using StellaOps.AirGap.Sync.Services;
using StellaOps.HybridLogicalClock;
using StellaOps.TestKit;
using Xunit;
namespace StellaOps.AirGap.Sync.Tests;
/// <summary>
/// Unit tests for <see cref="HlcMergeService"/>.
/// </summary>
[Trait("Category", TestCategories.Unit)]
public sealed class HlcMergeServiceTests
{
private readonly HlcMergeService _sut;
private readonly ConflictResolver _conflictResolver;
public HlcMergeServiceTests()
{
_conflictResolver = new ConflictResolver(NullLogger<ConflictResolver>.Instance);
_sut = new HlcMergeService(_conflictResolver, NullLogger<HlcMergeService>.Instance);
}
#region OMP-014: Merge Algorithm Correctness
[Fact]
public async Task MergeAsync_EmptyInput_ReturnsEmptyResult()
{
// Arrange
var nodeLogs = new List<NodeJobLog>();
// Act
var result = await _sut.MergeAsync(nodeLogs);
// Assert
result.MergedEntries.Should().BeEmpty();
result.Duplicates.Should().BeEmpty();
result.SourceNodes.Should().BeEmpty();
result.MergedChainHead.Should().BeNull();
}
[Fact]
public async Task MergeAsync_SingleNode_PreservesOrder()
{
// Arrange
var nodeLog = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("11111111-1111-1111-1111-111111111111")),
CreateEntry("node-a", 200, 0, Guid.Parse("22222222-2222-2222-2222-222222222222")),
CreateEntry("node-a", 300, 0, Guid.Parse("33333333-3333-3333-3333-333333333333"))
});
// Act
var result = await _sut.MergeAsync(new[] { nodeLog });
// Assert
result.MergedEntries.Should().HaveCount(3);
result.MergedEntries[0].JobId.Should().Be(Guid.Parse("11111111-1111-1111-1111-111111111111"));
result.MergedEntries[1].JobId.Should().Be(Guid.Parse("22222222-2222-2222-2222-222222222222"));
result.MergedEntries[2].JobId.Should().Be(Guid.Parse("33333333-3333-3333-3333-333333333333"));
result.Duplicates.Should().BeEmpty();
result.SourceNodes.Should().ContainSingle().Which.Should().Be("node-a");
}
[Fact]
public async Task MergeAsync_TwoNodes_MergesByHlcOrder()
{
// Arrange - Two nodes with interleaved HLC timestamps
// Node A: T=100, T=102
// Node B: T=101, T=103
// Expected order: 100, 101, 102, 103
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000")),
CreateEntry("node-a", 102, 0, Guid.Parse("aaaaaaaa-0003-0000-0000-000000000000"))
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntry("node-b", 101, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000")),
CreateEntry("node-b", 103, 0, Guid.Parse("bbbbbbbb-0004-0000-0000-000000000000"))
});
// Act
var result = await _sut.MergeAsync(new[] { nodeA, nodeB });
// Assert
result.MergedEntries.Should().HaveCount(4);
result.MergedEntries[0].THlc.PhysicalTime.Should().Be(100);
result.MergedEntries[1].THlc.PhysicalTime.Should().Be(101);
result.MergedEntries[2].THlc.PhysicalTime.Should().Be(102);
result.MergedEntries[3].THlc.PhysicalTime.Should().Be(103);
result.SourceNodes.Should().HaveCount(2);
}
[Fact]
public async Task MergeAsync_SamePhysicalTime_OrdersByLogicalCounter()
{
// Arrange - Same physical time, different logical counters
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0000-0000-0000-000000000001")),
CreateEntry("node-a", 100, 2, Guid.Parse("aaaaaaaa-0000-0000-0000-000000000003"))
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntry("node-b", 100, 1, Guid.Parse("bbbbbbbb-0000-0000-0000-000000000002")),
CreateEntry("node-b", 100, 3, Guid.Parse("bbbbbbbb-0000-0000-0000-000000000004"))
});
// Act
var result = await _sut.MergeAsync(new[] { nodeA, nodeB });
// Assert
result.MergedEntries.Should().HaveCount(4);
result.MergedEntries[0].THlc.LogicalCounter.Should().Be(0);
result.MergedEntries[1].THlc.LogicalCounter.Should().Be(1);
result.MergedEntries[2].THlc.LogicalCounter.Should().Be(2);
result.MergedEntries[3].THlc.LogicalCounter.Should().Be(3);
}
[Fact]
public async Task MergeAsync_SameTimeAndCounter_OrdersByNodeId()
{
// Arrange - Same physical time and counter, different node IDs
var nodeA = CreateNodeLog("alpha-node", new[]
{
CreateEntry("alpha-node", 100, 0, Guid.Parse("aaaaaaaa-0000-0000-0000-000000000001"))
});
var nodeB = CreateNodeLog("beta-node", new[]
{
CreateEntry("beta-node", 100, 0, Guid.Parse("bbbbbbbb-0000-0000-0000-000000000002"))
});
// Act
var result = await _sut.MergeAsync(new[] { nodeA, nodeB });
// Assert - "alpha-node" < "beta-node" alphabetically
result.MergedEntries.Should().HaveCount(2);
result.MergedEntries[0].SourceNodeId.Should().Be("alpha-node");
result.MergedEntries[1].SourceNodeId.Should().Be("beta-node");
}
[Fact]
public async Task MergeAsync_RecomputesUnifiedChain()
{
// Arrange
var nodeLog = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("11111111-1111-1111-1111-111111111111")),
CreateEntry("node-a", 200, 0, Guid.Parse("22222222-2222-2222-2222-222222222222"))
});
// Act
var result = await _sut.MergeAsync(new[] { nodeLog });
// Assert - Chain should be recomputed
result.MergedEntries.Should().HaveCount(2);
result.MergedEntries[0].MergedLink.Should().NotBeNull();
result.MergedEntries[1].MergedLink.Should().NotBeNull();
result.MergedChainHead.Should().NotBeNull();
// First entry's link should be computed from null prev_link
result.MergedEntries[0].MergedLink.Should().HaveCount(32);
// Chain head should equal last entry's merged link
result.MergedChainHead.Should().BeEquivalentTo(result.MergedEntries[1].MergedLink);
}
#endregion
#region OMP-015: Duplicate Detection
[Fact]
public async Task MergeAsync_DuplicateJobId_SamePayload_TakesEarliest()
{
// Arrange - Same job ID (same payload hash) from two nodes
var jobId = Guid.Parse("dddddddd-dddd-dddd-dddd-dddddddddddd");
var payloadHash = new byte[32];
payloadHash[0] = 0xAA;
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntryWithPayloadHash("node-a", 100, 0, jobId, payloadHash)
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntryWithPayloadHash("node-b", 105, 0, jobId, payloadHash)
});
// Act
var result = await _sut.MergeAsync(new[] { nodeA, nodeB });
// Assert - Should take earliest (T=100 from node-a)
result.MergedEntries.Should().ContainSingle();
result.MergedEntries[0].SourceNodeId.Should().Be("node-a");
result.MergedEntries[0].THlc.PhysicalTime.Should().Be(100);
// Should report duplicate
result.Duplicates.Should().ContainSingle();
result.Duplicates[0].JobId.Should().Be(jobId);
result.Duplicates[0].NodeId.Should().Be("node-b");
result.Duplicates[0].THlc.PhysicalTime.Should().Be(105);
}
[Fact]
public async Task MergeAsync_TriplicateJobId_SamePayload_TakesEarliest()
{
// Arrange - Same job ID from three nodes
var jobId = Guid.Parse("eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee");
var payloadHash = new byte[32];
payloadHash[0] = 0xBB;
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntryWithPayloadHash("node-a", 200, 0, jobId, payloadHash)
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntryWithPayloadHash("node-b", 100, 0, jobId, payloadHash) // Earliest
});
var nodeC = CreateNodeLog("node-c", new[]
{
CreateEntryWithPayloadHash("node-c", 150, 0, jobId, payloadHash)
});
// Act
var result = await _sut.MergeAsync(new[] { nodeA, nodeB, nodeC });
// Assert - Should take earliest (T=100 from node-b)
result.MergedEntries.Should().ContainSingle();
result.MergedEntries[0].NodeId.Should().Be("node-b");
result.MergedEntries[0].THlc.PhysicalTime.Should().Be(100);
// Should report two duplicates
result.Duplicates.Should().HaveCount(2);
}
[Fact]
public async Task MergeAsync_DuplicateJobId_DifferentPayload_ThrowsError()
{
// Arrange - Same job ID but different payload hashes (indicates bug)
var jobId = Guid.Parse("ffffffff-ffff-ffff-ffff-ffffffffffff");
var payloadHashA = new byte[32];
payloadHashA[0] = 0x01;
var payloadHashB = new byte[32];
payloadHashB[0] = 0x02;
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntryWithPayloadHash("node-a", 100, 0, jobId, payloadHashA)
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntryWithPayloadHash("node-b", 105, 0, jobId, payloadHashB)
});
// Act & Assert - Should throw because payloads differ
var act = () => _sut.MergeAsync(new[] { nodeA, nodeB });
await act.Should().ThrowAsync<InvalidOperationException>()
.WithMessage("*conflicting payloads*");
}
#endregion
#region OMP-018: Multi-Node Merge
[Fact]
public async Task MergeAsync_ThreeNodes_MergesCorrectly()
{
// Arrange - Three nodes with various timestamps
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000")),
CreateEntry("node-a", 400, 0, Guid.Parse("aaaaaaaa-0007-0000-0000-000000000000"))
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntry("node-b", 200, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000")),
CreateEntry("node-b", 500, 0, Guid.Parse("bbbbbbbb-0008-0000-0000-000000000000"))
});
var nodeC = CreateNodeLog("node-c", new[]
{
CreateEntry("node-c", 300, 0, Guid.Parse("cccccccc-0003-0000-0000-000000000000")),
CreateEntry("node-c", 600, 0, Guid.Parse("cccccccc-0009-0000-0000-000000000000"))
});
// Act
var result = await _sut.MergeAsync(new[] { nodeA, nodeB, nodeC });
// Assert
result.MergedEntries.Should().HaveCount(6);
result.MergedEntries.Select(e => e.THlc.PhysicalTime).Should()
.BeInAscendingOrder();
result.MergedEntries.Select(e => e.THlc.PhysicalTime).Should()
.ContainInOrder(100L, 200L, 300L, 400L, 500L, 600L);
result.SourceNodes.Should().HaveCount(3);
}
[Fact]
public async Task MergeAsync_ManyNodes_PreservesTotalOrder()
{
// Arrange - 5 nodes with 2 entries each
var nodes = new List<NodeJobLog>();
for (int i = 0; i < 5; i++)
{
var nodeId = $"node-{i:D2}";
nodes.Add(CreateNodeLog(nodeId, new[]
{
CreateEntry(nodeId, 100 + i * 10, 0, Guid.NewGuid()),
CreateEntry(nodeId, 150 + i * 10, 0, Guid.NewGuid())
}));
}
// Act
var result = await _sut.MergeAsync(nodes);
// Assert
result.MergedEntries.Should().HaveCount(10);
result.MergedEntries.Select(e => e.THlc.PhysicalTime).Should()
.BeInAscendingOrder();
}
#endregion
#region OMP-019: Determinism Tests
[Fact]
public async Task MergeAsync_SameInput_ProducesSameOutput()
{
// Arrange
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000")),
CreateEntry("node-a", 300, 0, Guid.Parse("aaaaaaaa-0003-0000-0000-000000000000"))
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntry("node-b", 200, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000")),
CreateEntry("node-b", 400, 0, Guid.Parse("bbbbbbbb-0004-0000-0000-000000000000"))
});
// Act - Run merge twice
var result1 = await _sut.MergeAsync(new[] { nodeA, nodeB });
var result2 = await _sut.MergeAsync(new[] { nodeA, nodeB });
// Assert - Results should be identical
result1.MergedEntries.Should().HaveCount(result2.MergedEntries.Count);
for (int i = 0; i < result1.MergedEntries.Count; i++)
{
result1.MergedEntries[i].JobId.Should().Be(result2.MergedEntries[i].JobId);
result1.MergedEntries[i].THlc.Should().Be(result2.MergedEntries[i].THlc);
result1.MergedEntries[i].MergedLink.Should().BeEquivalentTo(result2.MergedEntries[i].MergedLink);
}
result1.MergedChainHead.Should().BeEquivalentTo(result2.MergedChainHead);
}
[Fact]
public async Task MergeAsync_InputOrderIndependent_ProducesSameOutput()
{
// Arrange
var nodeA = CreateNodeLog("node-a", new[]
{
CreateEntry("node-a", 100, 0, Guid.Parse("aaaaaaaa-0001-0000-0000-000000000000"))
});
var nodeB = CreateNodeLog("node-b", new[]
{
CreateEntry("node-b", 200, 0, Guid.Parse("bbbbbbbb-0002-0000-0000-000000000000"))
});
// Act - Merge in different orders
var result1 = await _sut.MergeAsync(new[] { nodeA, nodeB });
var result2 = await _sut.MergeAsync(new[] { nodeB, nodeA });
// Assert - Results should be identical regardless of input order
result1.MergedEntries.Select(e => e.JobId).Should()
.BeEquivalentTo(result2.MergedEntries.Select(e => e.JobId));
result1.MergedChainHead.Should().BeEquivalentTo(result2.MergedChainHead);
}
#endregion
#region Helper Methods
private static NodeJobLog CreateNodeLog(string nodeId, IEnumerable<OfflineJobLogEntry> entries)
{
return new NodeJobLog
{
NodeId = nodeId,
Entries = entries.ToList()
};
}
private static OfflineJobLogEntry CreateEntry(string nodeId, long physicalTime, int logicalCounter, Guid jobId)
{
var payloadHash = new byte[32];
jobId.ToByteArray().CopyTo(payloadHash, 0);
var hlc = new HlcTimestamp
{
PhysicalTime = physicalTime,
NodeId = nodeId,
LogicalCounter = logicalCounter
};
return new OfflineJobLogEntry
{
NodeId = nodeId,
THlc = hlc,
JobId = jobId,
Payload = $"{{\"id\":\"{jobId}\"}}",
PayloadHash = payloadHash,
Link = new byte[32],
EnqueuedAt = DateTimeOffset.UtcNow
};
}
private static OfflineJobLogEntry CreateEntryWithPayloadHash(
string nodeId, long physicalTime, int logicalCounter, Guid jobId, byte[] payloadHash)
{
var hlc = new HlcTimestamp
{
PhysicalTime = physicalTime,
NodeId = nodeId,
LogicalCounter = logicalCounter
};
return new OfflineJobLogEntry
{
NodeId = nodeId,
THlc = hlc,
JobId = jobId,
Payload = $"{{\"id\":\"{jobId}\"}}",
PayloadHash = payloadHash,
Link = new byte[32],
EnqueuedAt = DateTimeOffset.UtcNow
};
}
#endregion
}

View File

@@ -0,0 +1,29 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<LangVersion>preview</LangVersion>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<IsTestProject>true</IsTestProject>
<TreatWarningsAsErrors>false</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="xunit.runner.visualstudio">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="coverlet.collector">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\__Libraries\StellaOps.AirGap.Sync\StellaOps.AirGap.Sync.csproj" />
<ProjectReference Include="..\..\..\__Libraries\StellaOps.TestKit\StellaOps.TestKit.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,295 @@
// <copyright file="DsseVerifierTests.cs" company="Stella Operations">
// Copyright (c) Stella Operations. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using FluentAssertions;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
using Xunit;
namespace StellaOps.Attestation.Tests;
/// <summary>
/// Unit tests for DsseVerifier.
/// Sprint: SPRINT_20260105_002_001_REPLAY, Tasks RPL-006 through RPL-010.
/// </summary>
[Trait("Category", "Unit")]
public class DsseVerifierTests
{
private readonly DsseVerifier _verifier;
public DsseVerifierTests()
{
_verifier = new DsseVerifier(NullLogger<DsseVerifier>.Instance);
}
[Fact]
public async Task VerifyAsync_WithValidEcdsaSignature_ReturnsSuccess()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, publicKeyPem) = CreateSignedEnvelope(ecdsa);
// Act
var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeTrue();
result.ValidSignatureCount.Should().Be(1);
result.TotalSignatureCount.Should().Be(1);
result.PayloadType.Should().Be("https://in-toto.io/Statement/v1");
result.Issues.Should().BeEmpty();
}
[Fact]
public async Task VerifyAsync_WithInvalidSignature_ReturnsFail()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, _) = CreateSignedEnvelope(ecdsa);
// Use a different key for verification
using var differentKey = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var differentPublicKeyPem = ExportPublicKeyPem(differentKey);
// Act
var result = await _verifier.VerifyAsync(envelope, differentPublicKeyPem, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeFalse();
result.ValidSignatureCount.Should().Be(0);
result.Issues.Should().NotBeEmpty();
}
[Fact]
public async Task VerifyAsync_WithMalformedJson_ReturnsParseError()
{
// Arrange
var malformedJson = "{ not valid json }";
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var publicKeyPem = ExportPublicKeyPem(ecdsa);
// Act
var result = await _verifier.VerifyAsync(malformedJson, publicKeyPem, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeFalse();
result.Issues.Should().Contain(i => i.Contains("envelope_parse_error"));
}
[Fact]
public async Task VerifyAsync_WithMissingPayload_ReturnsFail()
{
// Arrange
var envelope = JsonSerializer.Serialize(new
{
payloadType = "https://in-toto.io/Statement/v1",
signatures = new[] { new { keyId = "key-001", sig = "YWJj" } }
});
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var publicKeyPem = ExportPublicKeyPem(ecdsa);
// Act
var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeFalse();
result.Issues.Should().Contain(i => i.Contains("envelope_missing_payload"));
}
[Fact]
public async Task VerifyAsync_WithMissingSignatures_ReturnsFail()
{
// Arrange
var payload = Convert.ToBase64String(Encoding.UTF8.GetBytes("{}"));
var envelope = JsonSerializer.Serialize(new
{
payloadType = "https://in-toto.io/Statement/v1",
payload,
signatures = Array.Empty<object>()
});
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var publicKeyPem = ExportPublicKeyPem(ecdsa);
// Act
var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeFalse();
result.Issues.Should().Contain("envelope_missing_signatures");
}
[Fact]
public async Task VerifyAsync_WithNoTrustedKeys_ReturnsFail()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, _) = CreateSignedEnvelope(ecdsa);
// Act
var result = await _verifier.VerifyAsync(envelope, Array.Empty<string>(), TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeFalse();
result.Issues.Should().Contain("no_trusted_keys_provided");
}
[Fact]
public async Task VerifyAsync_WithMultipleTrustedKeys_SucceedsWithMatchingKey()
{
// Arrange
using var signingKey = ECDsa.Create(ECCurve.NamedCurves.nistP256);
using var otherKey1 = ECDsa.Create(ECCurve.NamedCurves.nistP256);
using var otherKey2 = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, signingKeyPem) = CreateSignedEnvelope(signingKey);
var trustedKeys = new[]
{
ExportPublicKeyPem(otherKey1),
signingKeyPem,
ExportPublicKeyPem(otherKey2),
};
// Act
var result = await _verifier.VerifyAsync(envelope, trustedKeys, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeTrue();
result.ValidSignatureCount.Should().Be(1);
}
[Fact]
public async Task VerifyAsync_WithKeyResolver_UsesResolverForVerification()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, publicKeyPem) = CreateSignedEnvelope(ecdsa);
Task<string?> KeyResolver(string? keyId, CancellationToken ct)
{
return Task.FromResult<string?>(publicKeyPem);
}
// Act
var result = await _verifier.VerifyAsync(envelope, KeyResolver, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeTrue();
}
[Fact]
public async Task VerifyAsync_WithKeyResolverReturningNull_ReturnsFail()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, _) = CreateSignedEnvelope(ecdsa);
static Task<string?> KeyResolver(string? keyId, CancellationToken ct)
{
return Task.FromResult<string?>(null);
}
// Act
var result = await _verifier.VerifyAsync(envelope, KeyResolver, TestContext.Current.CancellationToken);
// Assert
result.IsValid.Should().BeFalse();
result.Issues.Should().Contain(i => i.Contains("key_not_found"));
}
[Fact]
public async Task VerifyAsync_ReturnsPayloadHash()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, publicKeyPem) = CreateSignedEnvelope(ecdsa);
// Act
var result = await _verifier.VerifyAsync(envelope, publicKeyPem, TestContext.Current.CancellationToken);
// Assert
result.PayloadHash.Should().StartWith("sha256:");
result.PayloadHash.Should().HaveLength("sha256:".Length + 64);
}
[Fact]
public async Task VerifyAsync_ThrowsOnNullEnvelope()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var publicKeyPem = ExportPublicKeyPem(ecdsa);
// Act & Assert - null envelope throws ArgumentNullException
await Assert.ThrowsAsync<ArgumentNullException>(
() => _verifier.VerifyAsync(null!, publicKeyPem, TestContext.Current.CancellationToken));
// Empty envelope throws ArgumentException (whitespace check)
await Assert.ThrowsAsync<ArgumentException>(
() => _verifier.VerifyAsync("", publicKeyPem, TestContext.Current.CancellationToken));
}
[Fact]
public async Task VerifyAsync_ThrowsOnNullKeys()
{
// Arrange
using var ecdsa = ECDsa.Create(ECCurve.NamedCurves.nistP256);
var (envelope, _) = CreateSignedEnvelope(ecdsa);
// Act & Assert
await Assert.ThrowsAsync<ArgumentNullException>(
() => _verifier.VerifyAsync(envelope, (IEnumerable<string>)null!, TestContext.Current.CancellationToken));
await Assert.ThrowsAsync<ArgumentNullException>(
() => _verifier.VerifyAsync(envelope, (Func<string?, CancellationToken, Task<string?>>)null!, TestContext.Current.CancellationToken));
}
private static (string EnvelopeJson, string PublicKeyPem) CreateSignedEnvelope(ECDsa signingKey)
{
var payloadType = "https://in-toto.io/Statement/v1";
var payloadContent = "{\"_type\":\"https://in-toto.io/Statement/v1\",\"subject\":[]}";
var payloadBytes = Encoding.UTF8.GetBytes(payloadContent);
var payloadBase64 = Convert.ToBase64String(payloadBytes);
// Compute PAE
var pae = DsseHelper.PreAuthenticationEncoding(payloadType, payloadBytes);
// Sign
var signatureBytes = signingKey.SignData(pae, HashAlgorithmName.SHA256);
var signatureBase64 = Convert.ToBase64String(signatureBytes);
// Build envelope
var envelope = JsonSerializer.Serialize(new
{
payloadType,
payload = payloadBase64,
signatures = new[]
{
new { keyId = "test-key-001", sig = signatureBase64 }
}
});
var publicKeyPem = ExportPublicKeyPem(signingKey);
return (envelope, publicKeyPem);
}
private static string ExportPublicKeyPem(ECDsa key)
{
var publicKeyBytes = key.ExportSubjectPublicKeyInfo();
var base64 = Convert.ToBase64String(publicKeyBytes);
var builder = new StringBuilder();
builder.AppendLine("-----BEGIN PUBLIC KEY-----");
for (var i = 0; i < base64.Length; i += 64)
{
builder.AppendLine(base64.Substring(i, Math.Min(64, base64.Length - i)));
}
builder.AppendLine("-----END PUBLIC KEY-----");
return builder.ToString();
}
}

View File

@@ -0,0 +1,301 @@
// <copyright file="DsseVerifier.cs" company="Stella Operations">
// Copyright (c) Stella Operations. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Collections.Immutable;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
namespace StellaOps.Attestation;
/// <summary>
/// Implementation of DSSE signature verification.
/// Uses the existing DsseHelper for PAE computation.
/// </summary>
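/// <remarks>
/// Verification recomputes the DSSE pre-authentication encoding
/// ("DSSEv1" SP LEN(type) SP type SP LEN(payload) SP payload) and checks
/// each signature against it. A minimal usage sketch, assuming a
/// NullLogger and a PEM-encoded key already in hand:
/// <code>
/// var verifier = new DsseVerifier(NullLogger&lt;DsseVerifier&gt;.Instance);
/// var result = await verifier.VerifyAsync(envelopeJson, publicKeyPem);
/// if (!result.IsValid)
/// {
///     Console.WriteLine(string.Join(", ", result.Issues));
/// }
/// </code>
/// </remarks>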
public sealed class DsseVerifier : IDsseVerifier
{
private readonly ILogger<DsseVerifier> _logger;
/// <summary>
/// JSON serializer options for parsing DSSE envelopes.
/// </summary>
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
};
public DsseVerifier(ILogger<DsseVerifier> logger)
{
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public Task<DsseVerificationResult> VerifyAsync(
string envelopeJson,
string publicKeyPem,
CancellationToken cancellationToken = default)
{
return VerifyAsync(envelopeJson, new[] { publicKeyPem }, cancellationToken);
}
/// <inheritdoc />
public async Task<DsseVerificationResult> VerifyAsync(
string envelopeJson,
IEnumerable<string> trustedKeysPem,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(envelopeJson);
ArgumentNullException.ThrowIfNull(trustedKeysPem);
var trustedKeys = trustedKeysPem.ToList();
if (trustedKeys.Count == 0)
{
return DsseVerificationResult.Failure(0, ImmutableArray.Create("no_trusted_keys_provided"));
}
return await VerifyWithAllKeysAsync(envelopeJson, trustedKeys, cancellationToken).ConfigureAwait(false);
}
/// <inheritdoc />
public async Task<DsseVerificationResult> VerifyAsync(
string envelopeJson,
Func<string?, CancellationToken, Task<string?>> keyResolver,
CancellationToken cancellationToken = default)
{
ArgumentException.ThrowIfNullOrWhiteSpace(envelopeJson);
ArgumentNullException.ThrowIfNull(keyResolver);
// Parse the envelope
DsseEnvelopeDto? envelope;
try
{
envelope = JsonSerializer.Deserialize<DsseEnvelopeDto>(envelopeJson, JsonOptions);
if (envelope is null)
{
return DsseVerificationResult.ParseError("Failed to deserialize envelope");
}
}
catch (JsonException ex)
{
_logger.LogWarning(ex, "Failed to parse DSSE envelope JSON");
return DsseVerificationResult.ParseError(ex.Message);
}
if (string.IsNullOrWhiteSpace(envelope.Payload))
{
return DsseVerificationResult.Failure(0, ImmutableArray.Create("envelope_missing_payload"));
}
if (envelope.Signatures is null || envelope.Signatures.Count == 0)
{
return DsseVerificationResult.Failure(0, ImmutableArray.Create("envelope_missing_signatures"));
}
// Decode payload
byte[] payloadBytes;
try
{
payloadBytes = Convert.FromBase64String(envelope.Payload);
}
catch (FormatException)
{
return DsseVerificationResult.Failure(envelope.Signatures.Count, ImmutableArray.Create("payload_invalid_base64"));
}
// Compute PAE for signature verification
var payloadType = envelope.PayloadType ?? "https://in-toto.io/Statement/v1";
var pae = DsseHelper.PreAuthenticationEncoding(payloadType, payloadBytes);
// Verify each signature
var verifiedKeyIds = new List<string>();
var issues = new List<string>();
foreach (var sig in envelope.Signatures)
{
if (string.IsNullOrWhiteSpace(sig.Sig))
{
issues.Add($"signature_{sig.KeyId ?? "unknown"}_empty");
continue;
}
// Resolve the public key for this signature
var publicKeyPem = await keyResolver(sig.KeyId, cancellationToken).ConfigureAwait(false);
if (string.IsNullOrWhiteSpace(publicKeyPem))
{
issues.Add($"key_not_found_{sig.KeyId ?? "unknown"}");
continue;
}
// Verify the signature
try
{
var signatureBytes = Convert.FromBase64String(sig.Sig);
if (VerifySignature(pae, signatureBytes, publicKeyPem))
{
verifiedKeyIds.Add(sig.KeyId ?? "unknown");
_logger.LogDebug("DSSE signature verified for keyId: {KeyId}", sig.KeyId ?? "unknown");
}
else
{
issues.Add($"signature_invalid_{sig.KeyId ?? "unknown"}");
}
}
catch (FormatException)
{
issues.Add($"signature_invalid_base64_{sig.KeyId ?? "unknown"}");
}
catch (CryptographicException ex)
{
issues.Add($"signature_crypto_error_{sig.KeyId ?? "unknown"}: {ex.Message}");
}
}
// Compute payload hash for result
var payloadHash = $"sha256:{Convert.ToHexString(SHA256.HashData(payloadBytes)).ToLowerInvariant()}";
if (verifiedKeyIds.Count > 0)
{
return DsseVerificationResult.Success(
verifiedKeyIds.Count,
envelope.Signatures.Count,
verifiedKeyIds.ToImmutableArray(),
payloadType,
payloadHash);
}
return new DsseVerificationResult
{
IsValid = false,
ValidSignatureCount = 0,
TotalSignatureCount = envelope.Signatures.Count,
VerifiedKeyIds = ImmutableArray<string>.Empty,
PayloadType = payloadType,
PayloadHash = payloadHash,
Issues = issues.ToImmutableArray(),
};
}
/// <summary>
/// Verifies against all trusted keys, returning success if any key validates any signature.
/// </summary>
private async Task<DsseVerificationResult> VerifyWithAllKeysAsync(
string envelopeJson,
List<string> trustedKeys,
CancellationToken cancellationToken)
{
// Parse envelope first to get signature keyIds
DsseEnvelopeDto? envelope;
try
{
envelope = JsonSerializer.Deserialize<DsseEnvelopeDto>(envelopeJson, JsonOptions);
if (envelope is null)
{
return DsseVerificationResult.ParseError("Failed to deserialize envelope");
}
}
catch (JsonException ex)
{
return DsseVerificationResult.ParseError(ex.Message);
}
if (envelope.Signatures is null || envelope.Signatures.Count == 0)
{
return DsseVerificationResult.Failure(0, ImmutableArray.Create("envelope_missing_signatures"));
}
// Try each trusted key
var allIssues = new List<string>();
for (var keyIndex = 0; keyIndex < trustedKeys.Count; keyIndex++)
{
var key = trustedKeys[keyIndex];
Task<string?> SingleKeyResolver(string? keyId, CancellationToken ct)
=> Task.FromResult<string?>(key);
var result = await VerifyAsync(envelopeJson, SingleKeyResolver, cancellationToken).ConfigureAwait(false);
if (result.IsValid)
{
return result;
}
// Collect issues for debugging
foreach (var issue in result.Issues)
{
allIssues.Add($"key{keyIndex}: {issue}");
}
}
return DsseVerificationResult.Failure(envelope.Signatures.Count, allIssues.ToImmutableArray());
}
/// <summary>
/// Verifies a signature against PAE using the provided public key.
/// Supports ECDSA P-256 and RSA keys.
/// </summary>
private bool VerifySignature(byte[] pae, byte[] signature, string publicKeyPem)
{
// Try ECDSA first (most common for Sigstore/Fulcio)
try
{
using var ecdsa = ECDsa.Create();
ecdsa.ImportFromPem(publicKeyPem);
return ecdsa.VerifyData(pae, signature, HashAlgorithmName.SHA256);
}
catch (CryptographicException)
{
// Not an ECDSA key, try RSA
}
// Try RSA
try
{
using var rsa = RSA.Create();
rsa.ImportFromPem(publicKeyPem);
return rsa.VerifyData(pae, signature, HashAlgorithmName.SHA256, RSASignaturePadding.Pkcs1);
}
catch (CryptographicException)
{
// Not an RSA key either
}
// Ed25519 is not yet supported here: it requires different handling
// than ECDSA/RSA, so log at debug level and report no match.
_logger.LogDebug("Ed25519 signature verification not yet implemented");
return false;
}
/// <summary>
/// DTO for deserializing DSSE envelope JSON.
/// </summary>
private sealed class DsseEnvelopeDto
{
public string? PayloadType { get; set; }
public string? Payload { get; set; }
public List<DsseSignatureDto>? Signatures { get; set; }
}
/// <summary>
/// DTO for DSSE signature.
/// </summary>
private sealed class DsseSignatureDto
{
public string? KeyId { get; set; }
public string? Sig { get; set; }
}
}

View File

@@ -0,0 +1,151 @@
// <copyright file="IDsseVerifier.cs" company="Stella Operations">
// Copyright (c) Stella Operations. Licensed under AGPL-3.0-or-later.
// </copyright>
using System.Collections.Immutable;
namespace StellaOps.Attestation;
/// <summary>
/// Interface for verifying DSSE (Dead Simple Signing Envelope) signatures.
/// </summary>
public interface IDsseVerifier
{
/// <summary>
/// Verifies a DSSE envelope against a public key.
/// </summary>
/// <param name="envelopeJson">The serialized DSSE envelope JSON.</param>
/// <param name="publicKeyPem">The PEM-encoded public key for verification.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Verification result containing status and details.</returns>
Task<DsseVerificationResult> VerifyAsync(
string envelopeJson,
string publicKeyPem,
CancellationToken cancellationToken = default);
/// <summary>
/// Verifies a DSSE envelope against multiple trusted public keys.
/// Returns success if at least one signature is valid.
/// </summary>
/// <param name="envelopeJson">The serialized DSSE envelope JSON.</param>
/// <param name="trustedKeysPem">Collection of PEM-encoded public keys.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Verification result containing status and details.</returns>
Task<DsseVerificationResult> VerifyAsync(
string envelopeJson,
IEnumerable<string> trustedKeysPem,
CancellationToken cancellationToken = default);
/// <summary>
/// Verifies a DSSE envelope using a key resolver function.
/// </summary>
/// <param name="envelopeJson">The serialized DSSE envelope JSON.</param>
/// <param name="keyResolver">Function to resolve public key by key ID.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>Verification result containing status and details.</returns>
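/// <example>
/// A resolver sketch backed by an in-memory key map (the dictionary and
/// variable names are illustrative):
/// <code>
/// var keys = new Dictionary&lt;string, string&gt; { ["key-001"] = publicKeyPem };
/// Task&lt;string?&gt; Resolver(string? keyId, CancellationToken ct) =&gt;
///     Task.FromResult(keyId is not null &amp;&amp; keys.TryGetValue(keyId, out var pem) ? pem : null);
/// var result = await verifier.VerifyAsync(envelopeJson, Resolver);
/// </code>
/// </example>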
Task<DsseVerificationResult> VerifyAsync(
string envelopeJson,
Func<string?, CancellationToken, Task<string?>> keyResolver,
CancellationToken cancellationToken = default);
}
/// <summary>
/// Result of DSSE signature verification.
/// </summary>
public sealed record DsseVerificationResult
{
/// <summary>
/// Whether the verification succeeded (at least one valid signature).
/// </summary>
public required bool IsValid { get; init; }
/// <summary>
/// Number of signatures that passed verification.
/// </summary>
public required int ValidSignatureCount { get; init; }
/// <summary>
/// Total number of signatures in the envelope.
/// </summary>
public required int TotalSignatureCount { get; init; }
/// <summary>
/// Key IDs of signatures that passed verification.
/// </summary>
public required ImmutableArray<string> VerifiedKeyIds { get; init; }
/// <summary>
/// Key ID used for the primary verified signature (first one that passed).
/// </summary>
public string? PrimaryKeyId { get; init; }
/// <summary>
/// Payload type from the envelope.
/// </summary>
public string? PayloadType { get; init; }
/// <summary>
/// SHA-256 hash of the payload.
/// </summary>
public string? PayloadHash { get; init; }
/// <summary>
/// Issues encountered during verification.
/// </summary>
public required ImmutableArray<string> Issues { get; init; }
/// <summary>
/// Creates a successful verification result.
/// </summary>
public static DsseVerificationResult Success(
int validCount,
int totalCount,
ImmutableArray<string> verifiedKeyIds,
string? payloadType = null,
string? payloadHash = null)
{
return new DsseVerificationResult
{
IsValid = true,
ValidSignatureCount = validCount,
TotalSignatureCount = totalCount,
VerifiedKeyIds = verifiedKeyIds,
PrimaryKeyId = verifiedKeyIds.Length > 0 ? verifiedKeyIds[0] : null,
PayloadType = payloadType,
PayloadHash = payloadHash,
Issues = ImmutableArray<string>.Empty,
};
}
/// <summary>
/// Creates a failed verification result.
/// </summary>
public static DsseVerificationResult Failure(
int totalCount,
ImmutableArray<string> issues)
{
return new DsseVerificationResult
{
IsValid = false,
ValidSignatureCount = 0,
TotalSignatureCount = totalCount,
VerifiedKeyIds = ImmutableArray<string>.Empty,
Issues = issues,
};
}
/// <summary>
/// Creates a failure result for a parsing error.
/// </summary>
public static DsseVerificationResult ParseError(string message)
{
return new DsseVerificationResult
{
IsValid = false,
ValidSignatureCount = 0,
TotalSignatureCount = 0,
VerifiedKeyIds = ImmutableArray<string>.Empty,
Issues = ImmutableArray.Create($"envelope_parse_error: {message}"),
};
}
}

View File

@@ -6,6 +6,10 @@
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../StellaOps.Attestor.Envelope/StellaOps.Attestor.Envelope.csproj" />
</ItemGroup>

View File

@@ -25,6 +25,12 @@ using Xunit;
using StellaOps.TestKit;
namespace StellaOps.Attestor.Tests;
/// <summary>
/// Integration tests for time skew validation in attestation submission and verification.
/// </summary>
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Evidence)]
[Trait("BlastRadius", TestCategories.BlastRadius.Crypto)]
public sealed class TimeSkewValidationIntegrationTests
{
private static readonly DateTimeOffset FixedNow = new(2025, 12, 18, 12, 0, 0, TimeSpan.Zero);

View File

@@ -25,7 +25,11 @@ internal sealed class LdapIdentityProviderPlugin : IIdentityProviderPlugin
private readonly LdapCapabilityProbe capabilityProbe;
private readonly AuthorityIdentityProviderCapabilities manifestCapabilities;
private readonly SemaphoreSlim capabilityGate = new(1, 1);
private AuthorityIdentityProviderCapabilities capabilities;
private AuthorityIdentityProviderCapabilities capabilities = new(
SupportsPassword: false,
SupportsMfa: false,
SupportsClientProvisioning: false,
SupportsBootstrap: false);
private bool clientProvisioningActive;
private bool bootstrapActive;
private bool loggedProvisioningDegrade;

View File

@@ -0,0 +1,256 @@
// <copyright file="AuthorityConfigDiffTests.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
// Sprint: SPRINT_20260105_002_005_TEST_cross_cutting
// Task: CCUT-021
using System.Collections.Immutable;
using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using StellaOps.TestKit;
using StellaOps.Testing.ConfigDiff;
using Xunit;
namespace StellaOps.Authority.ConfigDiff.Tests;
/// <summary>
/// Config-diff tests for the Authority module.
/// Verifies that configuration changes produce only expected behavioral deltas.
/// </summary>
[Trait("Category", TestCategories.ConfigDiff)]
[Trait("Category", TestCategories.Integration)]
[Trait("BlastRadius", TestCategories.BlastRadius.Auth)]
public class AuthorityConfigDiffTests : ConfigDiffTestBase
{
/// <summary>
/// Initializes a new instance of the <see cref="AuthorityConfigDiffTests"/> class.
/// </summary>
public AuthorityConfigDiffTests()
: base(
new ConfigDiffTestConfig(StrictMode: true),
NullLogger.Instance)
{
}
/// <summary>
/// Verifies that changing token lifetime only affects token behavior.
/// </summary>
[Fact]
public async Task ChangingTokenLifetime_OnlyAffectsTokenBehavior()
{
// Arrange
var baselineConfig = new AuthorityTestConfig
{
AccessTokenLifetimeMinutes = 15,
RefreshTokenLifetimeHours = 24,
MaxConcurrentSessions = 5
};
var changedConfig = baselineConfig with
{
AccessTokenLifetimeMinutes = 30
};
// Act
var result = await TestConfigIsolationAsync(
baselineConfig,
changedConfig,
changedSetting: "AccessTokenLifetimeMinutes",
unrelatedBehaviors:
[
async config => await GetSessionBehaviorAsync(config),
async config => await GetRefreshBehaviorAsync(config),
async config => await GetAuthenticationBehaviorAsync(config)
]);
// Assert
result.IsSuccess.Should().BeTrue(
because: "changing token lifetime should not affect sessions or authentication");
}
/// <summary>
/// Verifies that changing max sessions produces expected behavioral delta.
/// </summary>
[Fact]
public async Task ChangingMaxSessions_ProducesExpectedDelta()
{
// Arrange
var baselineConfig = new AuthorityTestConfig { MaxConcurrentSessions = 3 };
var changedConfig = new AuthorityTestConfig { MaxConcurrentSessions = 10 };
var expectedDelta = new ConfigDelta(
ChangedBehaviors: ["SessionLimit", "ConcurrencyPolicy"],
BehaviorDeltas:
[
new BehaviorDelta("SessionLimit", "3", "10", null),
new BehaviorDelta("ConcurrencyPolicy", "restrictive", "permissive",
"More sessions allowed")
]);
// Act
var result = await TestConfigBehavioralDeltaAsync(
baselineConfig,
changedConfig,
getBehavior: async config => await CaptureSessionBehaviorAsync(config),
computeDelta: ComputeBehaviorSnapshotDelta,
expectedDelta: expectedDelta);
// Assert
result.IsSuccess.Should().BeTrue(
because: "session limit change should produce expected behavioral delta");
}
/// <summary>
/// Verifies that enabling DPoP only affects token binding.
/// </summary>
[Fact]
public async Task EnablingDPoP_OnlyAffectsTokenBinding()
{
// Arrange
var baselineConfig = new AuthorityTestConfig { EnableDPoP = false };
var changedConfig = new AuthorityTestConfig { EnableDPoP = true };
// Act
var result = await TestConfigIsolationAsync(
baselineConfig,
changedConfig,
changedSetting: "EnableDPoP",
unrelatedBehaviors:
[
async config => await GetSessionBehaviorAsync(config),
async config => await GetPasswordPolicyBehaviorAsync(config)
]);
// Assert
result.IsSuccess.Should().BeTrue(
because: "DPoP should not affect sessions or password policy");
}
/// <summary>
/// Verifies that changing password policy produces expected changes.
/// </summary>
[Fact]
public async Task ChangingPasswordMinLength_ProducesExpectedDelta()
{
// Arrange
var baselineConfig = new AuthorityTestConfig { MinPasswordLength = 8 };
var changedConfig = new AuthorityTestConfig { MinPasswordLength = 12 };
var expectedDelta = new ConfigDelta(
ChangedBehaviors: ["PasswordComplexity", "ValidationRejectionRate"],
BehaviorDeltas:
[
new BehaviorDelta("PasswordComplexity", "standard", "enhanced", null),
new BehaviorDelta("ValidationRejectionRate", "increase", null,
"Stricter requirements reject more passwords")
]);
// Act
var result = await TestConfigBehavioralDeltaAsync(
baselineConfig,
changedConfig,
getBehavior: async config => await CapturePasswordPolicyBehaviorAsync(config),
computeDelta: ComputeBehaviorSnapshotDelta,
expectedDelta: expectedDelta);
// Assert
result.IsSuccess.Should().BeTrue();
}
/// <summary>
/// Verifies that enabling MFA only affects authentication flow.
/// </summary>
[Fact]
public async Task EnablingMFA_OnlyAffectsAuthentication()
{
// Arrange
var baselineConfig = new AuthorityTestConfig { RequireMFA = false };
var changedConfig = new AuthorityTestConfig { RequireMFA = true };
// Act
var result = await TestConfigIsolationAsync(
baselineConfig,
changedConfig,
changedSetting: "RequireMFA",
unrelatedBehaviors:
[
async config => await GetTokenBehaviorAsync(config),
async config => await GetSessionBehaviorAsync(config)
]);
// Assert
result.IsSuccess.Should().BeTrue(
because: "MFA should not affect token issuance or session management");
}
// Helper methods
private static Task<object> GetSessionBehaviorAsync(AuthorityTestConfig config)
{
return Task.FromResult<object>(new { MaxSessions = config.MaxConcurrentSessions });
}
private static Task<object> GetRefreshBehaviorAsync(AuthorityTestConfig config)
{
return Task.FromResult<object>(new { RefreshLifetime = config.RefreshTokenLifetimeHours });
}
private static Task<object> GetAuthenticationBehaviorAsync(AuthorityTestConfig config)
{
return Task.FromResult<object>(new { MfaRequired = config.RequireMFA });
}
private static Task<object> GetPasswordPolicyBehaviorAsync(AuthorityTestConfig config)
{
return Task.FromResult<object>(new { MinLength = config.MinPasswordLength });
}
private static Task<object> GetTokenBehaviorAsync(AuthorityTestConfig config)
{
return Task.FromResult<object>(new { Lifetime = config.AccessTokenLifetimeMinutes });
}
private static Task<BehaviorSnapshot> CaptureSessionBehaviorAsync(AuthorityTestConfig config)
{
var snapshot = new BehaviorSnapshot(
ConfigurationId: $"sessions-{config.MaxConcurrentSessions}",
Behaviors:
[
new CapturedBehavior("SessionLimit", config.MaxConcurrentSessions.ToString(), DateTimeOffset.UtcNow),
new CapturedBehavior("ConcurrencyPolicy",
config.MaxConcurrentSessions > 5 ? "permissive" : "restrictive", DateTimeOffset.UtcNow)
],
CapturedAt: DateTimeOffset.UtcNow);
return Task.FromResult(snapshot);
}
private static Task<BehaviorSnapshot> CapturePasswordPolicyBehaviorAsync(AuthorityTestConfig config)
{
var snapshot = new BehaviorSnapshot(
ConfigurationId: $"password-{config.MinPasswordLength}",
Behaviors:
[
new CapturedBehavior("PasswordComplexity",
config.MinPasswordLength >= 12 ? "enhanced" : "standard", DateTimeOffset.UtcNow),
new CapturedBehavior("ValidationRejectionRate",
config.MinPasswordLength >= 12 ? "increase" : "standard", DateTimeOffset.UtcNow)
],
CapturedAt: DateTimeOffset.UtcNow);
return Task.FromResult(snapshot);
}
}
/// <summary>
/// Test configuration for Authority module.
/// </summary>
public sealed record AuthorityTestConfig
{
public int AccessTokenLifetimeMinutes { get; init; } = 15;
public int RefreshTokenLifetimeHours { get; init; } = 24;
public int MaxConcurrentSessions { get; init; } = 5;
public bool EnableDPoP { get; init; } = false;
public int MinPasswordLength { get; init; } = 8;
public bool RequireMFA { get; init; } = false;
}

View File

@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="utf-8"?>
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<LangVersion>preview</LangVersion>
<Description>Config-diff tests for Authority module</Description>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="FluentAssertions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../../__Libraries/StellaOps.Authority.Core/StellaOps.Authority.Core.csproj" />
<ProjectReference Include="../../../__Libraries/StellaOps.TestKit/StellaOps.TestKit.csproj" />
<ProjectReference Include="../../../__Tests/__Libraries/StellaOps.Testing.ConfigDiff/StellaOps.Testing.ConfigDiff.csproj" />
</ItemGroup>
</Project>

View File

@@ -15,5 +15,7 @@
</ItemGroup>
<ItemGroup>
<ProjectReference Include="../../__Libraries/StellaOps.Authority.Core/StellaOps.Authority.Core.csproj" />
<ProjectReference Include="../../../__Tests/__Libraries/StellaOps.Testing.Temporal/StellaOps.Testing.Temporal.csproj" />
<ProjectReference Include="../../../__Libraries/StellaOps.TestKit/StellaOps.TestKit.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,296 @@
// <copyright file="TemporalVerdictTests.cs" company="StellaOps">
// Copyright (c) StellaOps. Licensed under AGPL-3.0-or-later.
// </copyright>
// Sprint: SPRINT_20260105_002_001_TEST_time_skew_idempotency
// Task: TSKW-011
using FluentAssertions;
using StellaOps.Authority.Core.Verdicts;
using StellaOps.Testing.Temporal;
using StellaOps.TestKit;
using Xunit;
namespace StellaOps.Authority.Core.Tests.Verdicts;
/// <summary>
/// Temporal testing for verdict manifests using the Testing.Temporal library.
/// Tests clock cutoff handling, timestamp consistency, and determinism under time skew.
/// </summary>
[Trait("Category", TestCategories.Unit)]
public sealed class TemporalVerdictTests
{
private static readonly DateTimeOffset BaseTime = new(2026, 1, 5, 12, 0, 0, TimeSpan.Zero);
[Fact]
public void VerdictManifest_ClockCutoff_BoundaryPrecision()
{
// Arrange
var ttl = TimeSpan.FromHours(24); // Typical verdict validity window
var clockCutoff = BaseTime;
// Position at various boundaries
var testCases = TtlBoundaryTimeProvider.GenerateBoundaryTestCases(clockCutoff, ttl).ToList();
// Assert - verify all boundary cases are correctly handled
foreach (var testCase in testCases)
{
var isExpired = testCase.Time >= clockCutoff.Add(ttl);
isExpired.Should().Be(
testCase.ShouldBeExpired,
$"Verdict clock cutoff case '{testCase.Name}' should be expired={testCase.ShouldBeExpired}");
}
}
[Fact]
public void VerdictManifestBuilder_IsDeterministic_UnderTimeAdvancement()
{
// Arrange
var timeProvider = new SimulatedTimeProvider(BaseTime);
var results = new List<string>();
// Act - build multiple manifests while advancing time
for (int i = 0; i < 10; i++)
{
var manifest = BuildTestManifest(BaseTime); // Use fixed clock, not advancing
results.Add(manifest.ManifestDigest);
timeProvider.Advance(TimeSpan.FromMinutes(5)); // Advance between builds
}
// Assert - all manifests should have same digest (deterministic)
results.Distinct().Should().HaveCount(1, "manifests built with same inputs should be deterministic");
}
[Fact]
public void VerdictManifestBuilder_Build_IsIdempotent()
{
// Arrange
var stateSnapshotter = () => BuildTestManifest(BaseTime).ManifestDigest;
var verifier = new IdempotencyVerifier<string>(stateSnapshotter);
// Act - verify Build is idempotent
var result = verifier.Verify(() => { /* Build is called in snapshotter */ }, repetitions: 5);
// Assert
result.IsIdempotent.Should().BeTrue("VerdictManifestBuilder.Build should be idempotent");
result.AllSucceeded.Should().BeTrue();
}
[Fact]
public void VerdictManifest_TimestampOrdering_IsMonotonic()
{
// Arrange - simulate verdict timestamps
var timeProvider = new SimulatedTimeProvider(BaseTime);
var timestamps = new List<DateTimeOffset>();
// Simulate verdict lifecycle: created, processed, signed, stored
timestamps.Add(timeProvider.GetUtcNow()); // Created
timeProvider.Advance(TimeSpan.FromMilliseconds(50));
timestamps.Add(timeProvider.GetUtcNow()); // Processed
timeProvider.Advance(TimeSpan.FromMilliseconds(100));
timestamps.Add(timeProvider.GetUtcNow()); // Signed
timeProvider.Advance(TimeSpan.FromMilliseconds(20));
timestamps.Add(timeProvider.GetUtcNow()); // Stored
// Act & Assert - timestamps should be monotonically increasing
ClockSkewAssertions.AssertMonotonicTimestamps(timestamps);
}
[Fact]
public void VerdictManifest_HandlesClockSkewForward()
{
// Arrange
var timeProvider = new SimulatedTimeProvider(BaseTime);
var clockCutoff1 = timeProvider.GetUtcNow();
// Simulate clock jump forward (NTP correction)
timeProvider.JumpTo(BaseTime.AddHours(2));
var clockCutoff2 = timeProvider.GetUtcNow();
// Act - build manifests with different clock cutoffs
var manifest1 = BuildTestManifest(clockCutoff1);
var manifest2 = BuildTestManifest(clockCutoff2);
// Assert - different clock cutoffs should produce different digests
manifest1.ManifestDigest.Should().NotBe(manifest2.ManifestDigest,
"different clock cutoffs should produce different manifest digests");
// Clock cutoff difference should be within expected range
ClockSkewAssertions.AssertTimestampsWithinTolerance(
clockCutoff1,
clockCutoff2,
tolerance: TimeSpan.FromHours(3));
}
[Fact]
public void VerdictManifest_ClockDrift_DoesNotAffectDeterminism()
{
// Arrange
var timeProvider = new SimulatedTimeProvider(BaseTime);
timeProvider.SetDrift(TimeSpan.FromMilliseconds(10)); // 10ms/second drift
var results = new List<string>();
var fixedClock = BaseTime; // Use fixed clock for manifest
// Act - build manifests while time drifts
for (int i = 0; i < 10; i++)
{
var manifest = BuildTestManifest(fixedClock);
results.Add(manifest.ManifestDigest);
timeProvider.Advance(TimeSpan.FromSeconds(10)); // Time advances with drift
}
// Assert - all should be identical (fixed clock input)
results.Distinct().Should().HaveCount(1,
"manifests with fixed clock should be deterministic regardless of system drift");
}
[Fact]
public void VerdictManifest_ClockJumpBackward_IsDetected()
{
// Arrange
var timeProvider = new SimulatedTimeProvider(BaseTime);
var timestamps = new List<DateTimeOffset>();
// Record timestamps
timestamps.Add(timeProvider.GetUtcNow());
timeProvider.Advance(TimeSpan.FromMinutes(5));
timestamps.Add(timeProvider.GetUtcNow());
// Simulate clock jump backward
timeProvider.JumpBackward(TimeSpan.FromMinutes(3));
timestamps.Add(timeProvider.GetUtcNow());
// Assert - backward jump should be detected
timeProvider.HasJumpedBackward().Should().BeTrue();
// Non-monotonic timestamps should be detected
var act = () => ClockSkewAssertions.AssertMonotonicTimestamps(timestamps);
act.Should().Throw<ClockSkewAssertionException>();
}
[Theory]
[InlineData(0.9, VexStatus.NotAffected)]
[InlineData(0.7, VexStatus.Affected)]
[InlineData(0.5, VexStatus.UnderInvestigation)]
public void VerdictManifest_ConfidenceScores_AreIdempotent(double confidence, VexStatus status)
{
// Arrange
var stateSnapshotter = () =>
{
var manifest = BuildTestManifest(BaseTime, confidence, status);
return manifest.Result.Confidence;
};
var verifier = new IdempotencyVerifier<double>(stateSnapshotter);
// Act
var result = verifier.Verify(() => { }, repetitions: 3);
// Assert
result.IsIdempotent.Should().BeTrue();
result.States.Should().AllSatisfy(c => c.Should().Be(confidence));
}
[Fact]
public void VerdictManifest_ExpiryWindow_BoundaryTests()
{
// Arrange - simulate verdict expiry window (e.g., 7 days)
var expiryWindow = TimeSpan.FromDays(7);
var createdAt = BaseTime;
// Generate boundary test cases
var testCases = TtlBoundaryTimeProvider.GenerateBoundaryTestCases(createdAt, expiryWindow);
// Assert
foreach (var testCase in testCases)
{
var isExpired = testCase.Time >= createdAt.Add(expiryWindow);
isExpired.Should().Be(testCase.ShouldBeExpired, testCase.Name);
}
}
[Theory]
[MemberData(nameof(GetVerdictExpiryBoundaryData))]
public void VerdictManifest_TheoryBoundaryTests(
string name,
DateTimeOffset testTime,
bool shouldBeExpired)
{
// Arrange
var expiryWindow = TimeSpan.FromDays(7);
var expiry = BaseTime.Add(expiryWindow);
// Act
var isExpired = testTime >= expiry;
// Assert
isExpired.Should().Be(shouldBeExpired, $"Case '{name}' should be expired={shouldBeExpired}");
}
public static IEnumerable<object[]> GetVerdictExpiryBoundaryData()
{
var expiryWindow = TimeSpan.FromDays(7);
return TtlBoundaryTimeProvider.GenerateTheoryData(BaseTime, expiryWindow);
}
[Fact]
public void VerdictManifest_LeapSecondScenario_MaintainsDeterminism()
{
// Arrange
var leapDay = new DateOnly(2016, 12, 31);
var leapProvider = new LeapSecondTimeProvider(
new DateTimeOffset(2016, 12, 31, 23, 0, 0, TimeSpan.Zero),
leapDay);
var results = new List<string>();
var fixedClock = new DateTimeOffset(2016, 12, 31, 12, 0, 0, TimeSpan.Zero);
// Act - build manifests while advancing through leap second
foreach (var _ in leapProvider.AdvanceThroughLeapSecond(leapDay))
{
var manifest = BuildTestManifest(fixedClock);
results.Add(manifest.ManifestDigest);
}
// Assert - all manifests should be identical (fixed clock)
results.Distinct().Should().HaveCount(1,
"manifests should be deterministic even during leap second transition");
}
private static VerdictManifest BuildTestManifest(
DateTimeOffset clockCutoff,
double confidence = 0.85,
VexStatus status = VexStatus.NotAffected)
{
return new VerdictManifestBuilder(() => "test-manifest-id")
.WithTenant("tenant-1")
.WithAsset("sha256:abc123", "CVE-2024-1234")
.WithInputs(
sbomDigests: new[] { "sha256:sbom1" },
vulnFeedSnapshotIds: new[] { "feed-snapshot-1" },
vexDocumentDigests: new[] { "sha256:vex1" },
clockCutoff: clockCutoff)
.WithResult(
status: status,
confidence: confidence,
explanations: new[]
{
new VerdictExplanation
{
SourceId = "vendor-a",
Reason = "Test explanation",
ProvenanceScore = 0.9,
CoverageScore = 0.8,
ReplayabilityScore = 0.7,
StrengthMultiplier = 1.0,
FreshnessMultiplier = 0.95,
ClaimScore = confidence,
AssertedStatus = status,
Accepted = true,
},
})
.WithPolicy("sha256:policy123", "1.0.0")
.WithClock(clockCutoff)
.Build();
}
}

View File

@@ -253,6 +253,24 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.FixIn
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.WebService.Tests", "__Tests\StellaOps.BinaryIndex.WebService.Tests\StellaOps.BinaryIndex.WebService.Tests.csproj", "{C12D06F8-7B69-4A24-B206-C47326778F2E}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Semantic", "__Libraries\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj", "{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Disassembly.Abstractions", "__Libraries\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj", "{3112D5DD-E993-4737-955B-D8FE20CEC88A}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Semantic.Tests", "__Tests\StellaOps.BinaryIndex.Semantic.Tests\StellaOps.BinaryIndex.Semantic.Tests.csproj", "{89CCD547-09D4-4923-9644-17724AF60F1C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.TestKit", "..\__Libraries\StellaOps.TestKit\StellaOps.TestKit.csproj", "{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Ensemble", "__Libraries\StellaOps.BinaryIndex.Ensemble\StellaOps.BinaryIndex.Ensemble.csproj", "{7612CE73-B27A-4489-A89E-E22FF19981B7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Decompiler", "__Libraries\StellaOps.BinaryIndex.Decompiler\StellaOps.BinaryIndex.Decompiler.csproj", "{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Ghidra", "__Libraries\StellaOps.BinaryIndex.Ghidra\StellaOps.BinaryIndex.Ghidra.csproj", "{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.ML", "__Libraries\StellaOps.BinaryIndex.ML\StellaOps.BinaryIndex.ML.csproj", "{850F7C46-E98B-431A-B202-FF97FB041BAD}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StellaOps.BinaryIndex.Ensemble.Tests", "__Tests\StellaOps.BinaryIndex.Ensemble.Tests\StellaOps.BinaryIndex.Ensemble.Tests.csproj", "{87356481-048B-4D3F-B4D5-3B6494A1F038}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -1151,6 +1169,114 @@ Global
{C12D06F8-7B69-4A24-B206-C47326778F2E}.Release|x64.Build.0 = Release|Any CPU
{C12D06F8-7B69-4A24-B206-C47326778F2E}.Release|x86.ActiveCfg = Release|Any CPU
{C12D06F8-7B69-4A24-B206-C47326778F2E}.Release|x86.Build.0 = Release|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x64.ActiveCfg = Debug|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x64.Build.0 = Debug|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x86.ActiveCfg = Debug|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Debug|x86.Build.0 = Debug|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|Any CPU.Build.0 = Release|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x64.ActiveCfg = Release|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x64.Build.0 = Release|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x86.ActiveCfg = Release|Any CPU
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7}.Release|x86.Build.0 = Release|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x64.ActiveCfg = Debug|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x64.Build.0 = Debug|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x86.ActiveCfg = Debug|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Debug|x86.Build.0 = Debug|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|Any CPU.Build.0 = Release|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x64.ActiveCfg = Release|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x64.Build.0 = Release|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x86.ActiveCfg = Release|Any CPU
{3112D5DD-E993-4737-955B-D8FE20CEC88A}.Release|x86.Build.0 = Release|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x64.ActiveCfg = Debug|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x64.Build.0 = Debug|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x86.ActiveCfg = Debug|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Debug|x86.Build.0 = Debug|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Release|Any CPU.Build.0 = Release|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x64.ActiveCfg = Release|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x64.Build.0 = Release|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x86.ActiveCfg = Release|Any CPU
{89CCD547-09D4-4923-9644-17724AF60F1C}.Release|x86.Build.0 = Release|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x64.ActiveCfg = Debug|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x64.Build.0 = Debug|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x86.ActiveCfg = Debug|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Debug|x86.Build.0 = Debug|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|Any CPU.Build.0 = Release|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x64.ActiveCfg = Release|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x64.Build.0 = Release|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x86.ActiveCfg = Release|Any CPU
{C064F3B6-AF8E-4C92-A2FB-3BEF9FB7CC92}.Release|x86.Build.0 = Release|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x64.ActiveCfg = Debug|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x64.Build.0 = Debug|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x86.ActiveCfg = Debug|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Debug|x86.Build.0 = Debug|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|Any CPU.Build.0 = Release|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x64.ActiveCfg = Release|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x64.Build.0 = Release|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x86.ActiveCfg = Release|Any CPU
{7612CE73-B27A-4489-A89E-E22FF19981B7}.Release|x86.Build.0 = Release|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|Any CPU.Build.0 = Debug|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x64.ActiveCfg = Debug|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x64.Build.0 = Debug|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x86.ActiveCfg = Debug|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Debug|x86.Build.0 = Debug|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|Any CPU.Build.0 = Release|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x64.ActiveCfg = Release|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x64.Build.0 = Release|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x86.ActiveCfg = Release|Any CPU
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7}.Release|x86.Build.0 = Release|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x64.ActiveCfg = Debug|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x64.Build.0 = Debug|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x86.ActiveCfg = Debug|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Debug|x86.Build.0 = Debug|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|Any CPU.Build.0 = Release|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x64.ActiveCfg = Release|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x64.Build.0 = Release|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x86.ActiveCfg = Release|Any CPU
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4}.Release|x86.Build.0 = Release|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|Any CPU.Build.0 = Debug|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x64.ActiveCfg = Debug|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x64.Build.0 = Debug|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x86.ActiveCfg = Debug|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Debug|x86.Build.0 = Debug|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|Any CPU.ActiveCfg = Release|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|Any CPU.Build.0 = Release|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x64.ActiveCfg = Release|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x64.Build.0 = Release|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x86.ActiveCfg = Release|Any CPU
{850F7C46-E98B-431A-B202-FF97FB041BAD}.Release|x86.Build.0 = Release|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|Any CPU.Build.0 = Debug|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x64.ActiveCfg = Debug|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x64.Build.0 = Debug|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x86.ActiveCfg = Debug|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Debug|x86.Build.0 = Debug|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|Any CPU.ActiveCfg = Release|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|Any CPU.Build.0 = Release|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x64.ActiveCfg = Release|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x64.Build.0 = Release|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x86.ActiveCfg = Release|Any CPU
{87356481-048B-4D3F-B4D5-3B6494A1F038}.Release|x86.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -1246,6 +1372,14 @@ Global
{FB127279-C17B-40DC-AC68-320B7CE85E76} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
{AAE98543-46B4-4707-AD1F-CCC9142F8712} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
{C12D06F8-7B69-4A24-B206-C47326778F2E} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
{1C21DB5D-C8FF-4EF2-9847-7049515A0FE7} = {A5C98087-E847-D2C4-2143-20869479839D}
{3112D5DD-E993-4737-955B-D8FE20CEC88A} = {A5C98087-E847-D2C4-2143-20869479839D}
{89CCD547-09D4-4923-9644-17724AF60F1C} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
{7612CE73-B27A-4489-A89E-E22FF19981B7} = {A5C98087-E847-D2C4-2143-20869479839D}
{66EEF897-8006-4C53-B2AB-C55D82BDE6D7} = {A5C98087-E847-D2C4-2143-20869479839D}
{C5C87F73-6EEF-4296-A1DD-24563E4F05B4} = {A5C98087-E847-D2C4-2143-20869479839D}
{850F7C46-E98B-431A-B202-FF97FB041BAD} = {A5C98087-E847-D2C4-2143-20869479839D}
{87356481-048B-4D3F-B4D5-3B6494A1F038} = {BB76B5A5-14BA-E317-828D-110B711D71F5}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {21B6BF22-3A64-CD15-49B3-21A490AAD068}

View File

@@ -1,3 +1,5 @@
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.Builders;
/// <summary>
@@ -109,6 +111,12 @@ public sealed record FunctionFingerprint
/// Source line number if debug info available.
/// </summary>
public int? SourceLine { get; init; }
/// <summary>
/// Semantic fingerprint for enhanced similarity comparison.
/// Uses IR-level analysis for resilience to compiler optimizations.
/// </summary>
public Semantic.SemanticFingerprint? SemanticFingerprint { get; init; }
}
/// <summary>

View File

@@ -192,25 +192,42 @@ public sealed record HashWeights
/// <summary>
/// Weight for basic block hash comparison.
/// </summary>
public decimal BasicBlockWeight { get; init; } = 0.5m;
public decimal BasicBlockWeight { get; init; } = 0.4m;
/// <summary>
/// Weight for CFG hash comparison.
/// </summary>
public decimal CfgWeight { get; init; } = 0.3m;
public decimal CfgWeight { get; init; } = 0.25m;
/// <summary>
/// Weight for string refs hash comparison.
/// </summary>
public decimal StringRefsWeight { get; init; } = 0.2m;
public decimal StringRefsWeight { get; init; } = 0.15m;
/// <summary>
/// Weight for semantic fingerprint comparison.
/// Only used when both fingerprints have semantic data.
/// </summary>
public decimal SemanticWeight { get; init; } = 0.2m;
/// <summary>
/// Default weights.
/// </summary>
public static HashWeights Default => new();
/// <summary>
/// Weights without semantic analysis (traditional mode).
/// </summary>
public static HashWeights Traditional => new()
{
BasicBlockWeight = 0.5m,
CfgWeight = 0.3m,
StringRefsWeight = 0.2m,
SemanticWeight = 0.0m
};
/// <summary>
/// Validates that weights sum to 1.0.
/// </summary>
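/// <remarks>
/// The 0.001 tolerance absorbs decimal rounding. Both presets validate:
/// Default (0.4 + 0.25 + 0.15 + 0.2) and Traditional (0.5 + 0.3 + 0.2 + 0.0)
/// each sum to 1.0.
/// </remarks>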
public bool IsValid => Math.Abs(BasicBlockWeight + CfgWeight + StringRefsWeight - 1.0m) < 0.001m;
public bool IsValid => Math.Abs(BasicBlockWeight + CfgWeight + StringRefsWeight + SemanticWeight - 1.0m) < 0.001m;
}

View File

@@ -1,4 +1,5 @@
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.Builders;
@@ -202,6 +203,16 @@ public sealed class PatchDiffEngine : IPatchDiffEngine
matchedWeight += weights.StringRefsWeight;
}
// Include semantic fingerprint similarity if available
if (weights.SemanticWeight > 0 &&
a.SemanticFingerprint is not null &&
b.SemanticFingerprint is not null)
{
totalWeight += weights.SemanticWeight;
var semanticSimilarity = ComputeSemanticSimilarity(a.SemanticFingerprint, b.SemanticFingerprint);
matchedWeight += weights.SemanticWeight * semanticSimilarity;
}
// Size similarity bonus (if sizes are within 10%, add small bonus)
if (a.Size > 0 && b.Size > 0)
{
@@ -216,6 +227,86 @@ public sealed class PatchDiffEngine : IPatchDiffEngine
return totalWeight > 0 ? matchedWeight / totalWeight : 0m;
}
private static decimal ComputeSemanticSimilarity(
Semantic.SemanticFingerprint a,
Semantic.SemanticFingerprint b)
{
// Check for exact hash match first
if (a.HashEquals(b))
{
return 1.0m;
}
// Compute weighted similarity from components
decimal graphSim = ComputeHashSimilarity(a.GraphHash, b.GraphHash);
decimal opSim = ComputeHashSimilarity(a.OperationHash, b.OperationHash);
decimal dfSim = ComputeHashSimilarity(a.DataFlowHash, b.DataFlowHash);
decimal apiSim = ComputeApiCallSimilarity(a.ApiCalls, b.ApiCalls);
// Weights: graph structure 40%, operation sequence 25%, data flow 20%, API calls 15%
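// Worked example (illustrative values): graphSim=1.0, opSim=0.8, dfSim=0.5,
// apiSim=0.6 => 0.40 + 0.20 + 0.10 + 0.09 = 0.79.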
return (graphSim * 0.40m) + (opSim * 0.25m) + (dfSim * 0.20m) + (apiSim * 0.15m);
}
private static decimal ComputeHashSimilarity(byte[] hashA, byte[] hashB)
{
if (hashA.Length == 0 || hashB.Length == 0)
{
return 0m;
}
if (hashA.AsSpan().SequenceEqual(hashB))
{
return 1.0m;
}
// Count matching bits (Hamming similarity). Bits beyond the shorter
// hash count as mismatched, so unequal lengths lower the score.
int matchingBits = 0;
int totalBits = Math.Max(hashA.Length, hashB.Length) * 8;
int len = Math.Min(hashA.Length, hashB.Length);
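// Per-byte example: 0xF0 vs 0x0F XOR to 0xFF (8 set bits), contributing
// 0 matching bits; identical bytes contribute all 8.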
for (int i = 0; i < len; i++)
{
byte xor = (byte)(hashA[i] ^ hashB[i]);
matchingBits += 8 - PopCount(xor);
}
return (decimal)matchingBits / totalBits;
}
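// Software popcount over a single byte; equivalent to
// System.Numerics.BitOperations.PopCount with a uint cast.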
private static int PopCount(byte value)
{
int count = 0;
while (value != 0)
{
count += value & 1;
value >>= 1;
}
return count;
}
private static decimal ComputeApiCallSimilarity(
System.Collections.Immutable.ImmutableArray<string> apiCallsA,
System.Collections.Immutable.ImmutableArray<string> apiCallsB)
{
if (apiCallsA.IsEmpty && apiCallsB.IsEmpty)
{
return 1.0m;
}
if (apiCallsA.IsEmpty || apiCallsB.IsEmpty)
{
return 0.0m;
}
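// Jaccard index over distinct API names, e.g. {open, read, close} vs
// {open, write, close}: |intersection| = 2, |union| = 4 => 0.5.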
var setA = new HashSet<string>(apiCallsA, StringComparer.Ordinal);
var setB = new HashSet<string>(apiCallsB, StringComparer.Ordinal);
var intersection = setA.Intersect(setB).Count();
var union = setA.Union(setB).Count();
return union > 0 ? (decimal)intersection / union : 0m;
}
/// <inheritdoc />
public IReadOnlyDictionary<string, string> FindFunctionMappings(
IReadOnlyList<FunctionFingerprint> vulnerable,

View File

@@ -20,5 +20,6 @@
<ItemGroup>
<ProjectReference Include="../StellaOps.BinaryIndex.Core/StellaOps.BinaryIndex.Core.csproj" />
<ProjectReference Include="../StellaOps.BinaryIndex.Fingerprints/StellaOps.BinaryIndex.Fingerprints.csproj" />
<ProjectReference Include="../StellaOps.BinaryIndex.Semantic/StellaOps.BinaryIndex.Semantic.csproj" />
</ItemGroup>
</Project>

View File

@@ -510,6 +510,27 @@ public sealed class CachedBinaryVulnerabilityService : IBinaryVulnerabilityServi
}
}
/// <inheritdoc />
public async Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
FunctionFingerprintSet fingerprints,
CorpusLookupOptions? options = null,
CancellationToken ct = default)
{
// Delegate to inner service - corpus lookups typically don't benefit from caching
// due to high variance in fingerprint sets
return await _inner.IdentifyFunctionFromCorpusAsync(fingerprints, options, ct).ConfigureAwait(false);
}
/// <inheritdoc />
public async Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
CorpusLookupOptions? options = null,
CancellationToken ct = default)
{
// Delegate to inner service - batch corpus lookups typically don't benefit from caching
return await _inner.IdentifyFunctionsFromCorpusBatchAsync(functions, options, ct).ConfigureAwait(false);
}
public async ValueTask DisposeAsync()
{
_connectionLock.Dispose();

View File

@@ -99,6 +99,27 @@ public interface IBinaryVulnerabilityService
string symbolName,
DeltaSigLookupOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Identify a function by its fingerprints using the corpus database.
/// Returns matching library functions with CVE associations.
/// </summary>
/// <param name="fingerprints">Function fingerprints (semantic, instruction, API call).</param>
/// <param name="options">Corpus lookup options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Identified functions with vulnerability associations.</returns>
Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
FunctionFingerprintSet fingerprints,
CorpusLookupOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Batch identify functions from corpus for scan performance.
/// </summary>
Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
CorpusLookupOptions? options = null,
CancellationToken ct = default);
}
/// <summary>
@@ -225,3 +246,141 @@ public sealed record FixStatusResult
/// <summary>Reference to the underlying evidence record.</summary>
public Guid? EvidenceId { get; init; }
}
/// <summary>
/// Function fingerprint set for corpus matching.
/// </summary>
public sealed record FunctionFingerprintSet
{
/// <summary>Semantic fingerprint (IR-based).</summary>
public byte[]? SemanticFingerprint { get; init; }
/// <summary>Instruction fingerprint (normalized assembly).</summary>
public byte[]? InstructionFingerprint { get; init; }
/// <summary>API call sequence fingerprint.</summary>
public byte[]? ApiCallFingerprint { get; init; }
/// <summary>Function name if available (may be stripped).</summary>
public string? FunctionName { get; init; }
/// <summary>Architecture of the binary.</summary>
public required string Architecture { get; init; }
/// <summary>Function size in bytes.</summary>
public int? FunctionSize { get; init; }
}
/// <summary>
/// Options for corpus-based function identification.
/// </summary>
public sealed record CorpusLookupOptions
{
/// <summary>Minimum similarity threshold (0.0-1.0). Default 0.85.</summary>
public decimal MinSimilarity { get; init; } = 0.85m;
/// <summary>Maximum candidates to return. Default 5.</summary>
public int MaxCandidates { get; init; } = 5;
/// <summary>Library name filter (glibc, openssl, etc.). Null means all.</summary>
public string? LibraryFilter { get; init; }
/// <summary>Whether to include CVE associations. Default true.</summary>
public bool IncludeCveAssociations { get; init; } = true;
/// <summary>Whether to check fix status for matched CVEs. Default true.</summary>
public bool CheckFixStatus { get; init; } = true;
/// <summary>Distro hint for fix status lookup.</summary>
public string? DistroHint { get; init; }
/// <summary>Release hint for fix status lookup.</summary>
public string? ReleaseHint { get; init; }
/// <summary>Prefer semantic fingerprint matching over instruction. Default true.</summary>
public bool PreferSemanticMatch { get; init; } = true;
}
/// <summary>
/// Result of corpus-based function identification.
/// </summary>
public sealed record CorpusFunctionMatch
{
/// <summary>Matched library name (glibc, openssl, etc.).</summary>
public required string LibraryName { get; init; }
/// <summary>Library version range where this function appears.</summary>
public required string VersionRange { get; init; }
/// <summary>Canonical function name.</summary>
public required string FunctionName { get; init; }
/// <summary>Overall match confidence (0.0-1.0).</summary>
public required decimal Confidence { get; init; }
/// <summary>Match method used (semantic, instruction, combined).</summary>
public required CorpusMatchMethod Method { get; init; }
/// <summary>Semantic similarity score if available.</summary>
public decimal? SemanticSimilarity { get; init; }
/// <summary>Instruction similarity score if available.</summary>
public decimal? InstructionSimilarity { get; init; }
/// <summary>CVEs affecting this function (if requested).</summary>
public ImmutableArray<CorpusCveAssociation> CveAssociations { get; init; } = [];
}
/// <summary>
/// Method used for corpus matching.
/// </summary>
public enum CorpusMatchMethod
{
/// <summary>Matched via semantic fingerprint (IR-based).</summary>
Semantic,
/// <summary>Matched via instruction fingerprint.</summary>
Instruction,
/// <summary>Matched via API call sequence.</summary>
ApiCall,
/// <summary>Combined match using multiple fingerprints.</summary>
Combined
}
/// <summary>
/// CVE association from corpus for a matched function.
/// </summary>
public sealed record CorpusCveAssociation
{
/// <summary>CVE identifier.</summary>
public required string CveId { get; init; }
/// <summary>Affected state for the matched version.</summary>
public required CorpusAffectedState AffectedState { get; init; }
/// <summary>Version where fix was applied (if fixed).</summary>
public string? FixedInVersion { get; init; }
/// <summary>Confidence in the CVE association.</summary>
public required decimal Confidence { get; init; }
/// <summary>Evidence type for the association.</summary>
public string? EvidenceType { get; init; }
}
/// <summary>
/// Affected state for corpus CVE associations.
/// </summary>
public enum CorpusAffectedState
{
/// <summary>Function is vulnerable to the CVE.</summary>
Vulnerable,
/// <summary>Function has been fixed.</summary>
Fixed,
/// <summary>Function is not affected by the CVE.</summary>
NotAffected
}
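// Hypothetical usage sketch (service resolution and fingerprint bytes are
// assumptions; only the contracts above are defined in this file):
//
// var fingerprints = new FunctionFingerprintSet
// {
//     SemanticFingerprint = semanticBytes,
//     Architecture = "x86_64",
//     FunctionSize = 412,
// };
// var matches = await service.IdentifyFunctionFromCorpusAsync(
//     fingerprints,
//     new CorpusLookupOptions { MinSimilarity = 0.90m, LibraryFilter = "openssl" });
// foreach (var match in matches)
// {
//     // Inspect match.Confidence, match.Method, match.CveAssociations ...
// }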

View File

@@ -0,0 +1,447 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Connectors;
/// <summary>
/// Corpus connector for libcurl/curl library.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class CurlCorpusConnector : ILibraryCorpusConnector
{
private readonly IHttpClientFactory _httpClientFactory;
private readonly ILogger<CurlCorpusConnector> _logger;
/// <summary>
/// Base URL for curl official releases.
/// </summary>
public const string CurlReleasesUrl = "https://curl.se/download/";
/// <summary>
/// Supported architectures.
/// </summary>
private static readonly ImmutableArray<string> s_supportedArchitectures =
["x86_64", "aarch64", "armhf", "i386"];
public CurlCorpusConnector(
IHttpClientFactory httpClientFactory,
ILogger<CurlCorpusConnector> logger)
{
_httpClientFactory = httpClientFactory;
_logger = logger;
}
/// <inheritdoc />
public string LibraryName => "curl";
/// <inheritdoc />
public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;
/// <inheritdoc />
public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
{
var client = _httpClientFactory.CreateClient("Curl");
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
// Fetch releases from curl.se
try
{
_logger.LogDebug("Fetching curl versions from {Url}", CurlReleasesUrl);
var html = await client.GetStringAsync(CurlReleasesUrl, ct);
var currentVersions = ParseVersionsFromListing(html);
foreach (var v in currentVersions)
{
versions.Add(v);
}
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch current curl releases");
}
// Also check archive
const string archiveUrl = "https://curl.se/download/archeology/";
try
{
_logger.LogDebug("Fetching old curl versions from {Url}", archiveUrl);
var archiveHtml = await client.GetStringAsync(archiveUrl, ct);
var archiveVersions = ParseVersionsFromListing(archiveHtml);
foreach (var v in archiveVersions)
{
versions.Add(v);
}
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch curl archive releases");
}
_logger.LogInformation("Found {Count} curl versions", versions.Count);
return [.. versions.OrderByDescending(ParseVersion)];
}
/// <inheritdoc />
public async Task<LibraryBinary?> FetchBinaryAsync(
string version,
string architecture,
LibraryFetchOptions? options = null,
CancellationToken ct = default)
{
var normalizedArch = NormalizeArchitecture(architecture);
_logger.LogInformation(
"Fetching curl {Version} for {Architecture}",
version,
normalizedArch);
// Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
if (debBinary is not null)
{
_logger.LogDebug("Found curl {Version} from Debian packages", version);
return debBinary;
}
// Strategy 2: Try Alpine APK
var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
if (alpineBinary is not null)
{
_logger.LogDebug("Found curl {Version} from Alpine packages", version);
return alpineBinary;
}
_logger.LogWarning(
"Could not find pre-built curl {Version} for {Architecture}. Source build not implemented.",
version,
normalizedArch);
return null;
}
/// <inheritdoc />
public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
IEnumerable<string> versions,
string architecture,
LibraryFetchOptions? options = null,
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
foreach (var version in versions)
{
ct.ThrowIfCancellationRequested();
var binary = await FetchBinaryAsync(version, architecture, options, ct);
if (binary is not null)
{
yield return binary;
}
}
}
#region Private Methods
private ImmutableArray<string> ParseVersionsFromListing(string html)
{
// Match patterns like curl-8.5.0.tar.gz or curl-7.88.1.tar.xz
var matches = CurlVersionRegex().Matches(html);
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
foreach (Match match in matches)
{
if (match.Groups["version"].Success)
{
versions.Add(match.Groups["version"].Value);
}
}
return [.. versions];
}
private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("DebianPackages");
var debArch = MapToDebianArchitecture(architecture);
if (debArch is null)
{
return null;
}
// curl library package names:
// libcurl4 (current), libcurl3 (older)
var packageNames = new[] { "libcurl4", "libcurl3" };
foreach (var packageName in packageNames)
{
var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Debian curl package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
var binary = await ExtractLibCurlFromDebAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
}
}
}
return null;
}
private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("AlpinePackages");
var alpineArch = MapToAlpineArchitecture(architecture);
if (alpineArch is null)
{
return null;
}
// Query Alpine package repository for libcurl
var packageUrls = await FindAlpinePackageUrlsAsync(client, "libcurl", version, alpineArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Alpine curl package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
var binary = await ExtractLibCurlFromApkAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
}
}
return null;
}
private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
HttpClient client,
string packageName,
string version,
string debianArch,
CancellationToken ct)
{
var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";
try
{
var response = await client.GetStringAsync(apiUrl, ct);
var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
return urls;
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
return [];
}
}
private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
HttpClient client,
string packageName,
string version,
string alpineArch,
CancellationToken ct)
{
var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
var urls = new List<string>();
foreach (var release in releases)
{
var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";
try
{
var html = await client.GetStringAsync(baseUrl, ct);
var matches = AlpinePackageRegex().Matches(html);
foreach (Match match in matches)
{
if (match.Groups["name"].Value == packageName &&
match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
{
urls.Add($"{baseUrl}{match.Groups["file"].Value}");
}
}
}
catch (HttpRequestException)
{
// Skip releases we can't access
}
}
return [.. urls];
}
private async Task<LibraryBinary?> ExtractLibCurlFromDebAsync(
byte[] debPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .deb extraction - placeholder
await Task.CompletedTask;
_logger.LogDebug(
"Debian package extraction not fully implemented. Package size: {Size} bytes",
debPackage.Length);
return null;
}
private async Task<LibraryBinary?> ExtractLibCurlFromApkAsync(
byte[] apkPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .apk extraction - placeholder
await Task.CompletedTask;
_logger.LogDebug(
"Alpine package extraction not fully implemented. Package size: {Size} bytes",
apkPackage.Length);
return null;
}
private static ImmutableArray<string> ExtractPackageUrlsForVersion(
string json,
string version,
string debianArch)
{
var urls = new List<string>();
try
{
using var doc = System.Text.Json.JsonDocument.Parse(json);
if (doc.RootElement.TryGetProperty("result", out var results))
{
foreach (var item in results.EnumerateArray())
{
if (item.TryGetProperty("binary_version", out var binaryVersion) &&
item.TryGetProperty("architecture", out var arch))
{
var binVer = binaryVersion.GetString() ?? string.Empty;
var archStr = arch.GetString() ?? string.Empty;
if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
{
if (item.TryGetProperty("files", out var files))
{
foreach (var file in files.EnumerateArray())
{
if (file.TryGetProperty("hash", out var hashElement))
{
var hash = hashElement.GetString();
if (!string.IsNullOrEmpty(hash))
{
urls.Add($"https://snapshot.debian.org/file/{hash}");
}
}
}
}
}
}
}
}
}
catch (System.Text.Json.JsonException)
{
// Invalid JSON
}
return [.. urls];
}
private static string NormalizeArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" or "amd64" => "x86_64",
"aarch64" or "arm64" => "aarch64",
"armhf" or "armv7" or "arm" => "armhf",
"i386" or "i686" or "x86" => "i386",
_ => architecture
};
}
private static string? MapToDebianArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "amd64",
"aarch64" => "arm64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "i386",
_ => null
};
}
private static string? MapToAlpineArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "x86_64",
"aarch64" => "aarch64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "x86",
_ => null
};
}
private static Version? ParseVersion(string versionString)
{
if (Version.TryParse(versionString, out var version))
{
return version;
}
return null;
}
#endregion
#region Generated Regexes
[GeneratedRegex(@"curl-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
private static partial Regex CurlVersionRegex();
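// Example match for the Alpine regex below (illustrative):
// href="libcurl-8.5.0-r0.apk" yields name = "libcurl",
// version = "8.5.0-r0", file = "libcurl-8.5.0-r0.apk".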
[GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+(?:-r\d+)?)\.apk)""", RegexOptions.IgnoreCase)]
private static partial Regex AlpinePackageRegex();
#endregion
}
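// Hypothetical DI wiring sketch (assumes Microsoft.Extensions.DependencyInjection
// and Microsoft.Extensions.Http are referenced; the named clients must match
// the CreateClient(...) calls above):
//
// services.AddHttpClient("Curl");
// services.AddHttpClient("DebianPackages");
// services.AddHttpClient("AlpinePackages");
// services.AddSingleton<ILibraryCorpusConnector, CurlCorpusConnector>();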

View File

@@ -0,0 +1,549 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Connectors;
/// <summary>
/// Corpus connector for GNU C Library (glibc).
/// Fetches pre-built binaries from Debian/Ubuntu package repositories
/// or GNU FTP mirrors for source builds.
/// </summary>
public sealed partial class GlibcCorpusConnector : ILibraryCorpusConnector
{
private readonly IHttpClientFactory _httpClientFactory;
private readonly ILogger<GlibcCorpusConnector> _logger;
/// <summary>
/// Base URL for GNU FTP mirror (source tarballs).
/// </summary>
public const string GnuMirrorUrl = "https://ftp.gnu.org/gnu/glibc/";
/// <summary>
/// Base URL for Debian package archive.
/// </summary>
public const string DebianSnapshotUrl = "https://snapshot.debian.org/package/glibc/";
/// <summary>
/// Supported architectures for glibc.
/// </summary>
private static readonly ImmutableArray<string> s_supportedArchitectures =
["x86_64", "aarch64", "armhf", "i386", "arm64", "ppc64el", "s390x"];
public GlibcCorpusConnector(
IHttpClientFactory httpClientFactory,
ILogger<GlibcCorpusConnector> logger)
{
_httpClientFactory = httpClientFactory;
_logger = logger;
}
/// <inheritdoc />
public string LibraryName => "glibc";
/// <inheritdoc />
public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;
/// <inheritdoc />
public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
{
var client = _httpClientFactory.CreateClient("GnuMirror");
try
{
_logger.LogDebug("Fetching glibc versions from {Url}", GnuMirrorUrl);
var html = await client.GetStringAsync(GnuMirrorUrl, ct);
// Parse directory listing for glibc-X.Y.tar.xz files
var versions = ParseVersionsFromListing(html);
_logger.LogInformation("Found {Count} glibc versions from GNU mirror", versions.Length);
return versions;
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch glibc versions from GNU mirror, trying Debian snapshot");
// Fallback to Debian snapshot
return await GetVersionsFromDebianSnapshotAsync(client, ct);
}
}
/// <inheritdoc />
public async Task<LibraryBinary?> FetchBinaryAsync(
string version,
string architecture,
LibraryFetchOptions? options = null,
CancellationToken ct = default)
{
var normalizedArch = NormalizeArchitecture(architecture);
var abi = options?.PreferredAbi ?? "gnu";
_logger.LogInformation(
"Fetching glibc {Version} for {Architecture}",
version,
normalizedArch);
// Strategy 1: Try Debian package (pre-built, preferred)
var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
if (debBinary is not null)
{
_logger.LogDebug("Found glibc {Version} from Debian packages", version);
return debBinary;
}
// Strategy 2: Try Ubuntu package
var ubuntuBinary = await TryFetchUbuntuPackageAsync(version, normalizedArch, options, ct);
if (ubuntuBinary is not null)
{
_logger.LogDebug("Found glibc {Version} from Ubuntu packages", version);
return ubuntuBinary;
}
_logger.LogWarning(
"Could not find pre-built glibc {Version} for {Architecture}. Source build not implemented.",
version,
normalizedArch);
return null;
}
/// <inheritdoc />
public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
IEnumerable<string> versions,
string architecture,
LibraryFetchOptions? options = null,
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
foreach (var version in versions)
{
ct.ThrowIfCancellationRequested();
var binary = await FetchBinaryAsync(version, architecture, options, ct);
if (binary is not null)
{
yield return binary;
}
}
}
#region Private Methods
private ImmutableArray<string> ParseVersionsFromListing(string html)
{
// Match patterns like glibc-2.31.tar.gz or glibc-2.38.tar.xz
var matches = GlibcVersionRegex().Matches(html);
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
foreach (Match match in matches)
{
if (match.Groups["version"].Success)
{
versions.Add(match.Groups["version"].Value);
}
}
return [.. versions.OrderByDescending(ParseVersion)];
}
private async Task<ImmutableArray<string>> GetVersionsFromDebianSnapshotAsync(
HttpClient client,
CancellationToken ct)
{
try
{
var html = await client.GetStringAsync(DebianSnapshotUrl, ct);
// Parse Debian snapshot listing for glibc versions
var matches = DebianVersionRegex().Matches(html);
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
foreach (Match match in matches)
{
if (match.Groups["version"].Success)
{
// Extract just the upstream version (before the Debian revision)
var fullVersion = match.Groups["version"].Value;
var upstreamVersion = ExtractUpstreamVersion(fullVersion);
if (!string.IsNullOrEmpty(upstreamVersion))
{
versions.Add(upstreamVersion);
}
}
}
return [.. versions.OrderByDescending(ParseVersion)];
}
catch (HttpRequestException ex)
{
_logger.LogError(ex, "Failed to fetch versions from Debian snapshot");
return [];
}
}
private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("DebianPackages");
// Map architecture to Debian naming
var debArch = MapToDebianArchitecture(architecture);
if (debArch is null)
{
_logger.LogDebug("Architecture {Arch} not supported for Debian packages", architecture);
return null;
}
// Query Debian snapshot for matching package
var packageUrls = await FindDebianPackageUrlsAsync(client, version, debArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Debian package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
// Extract the libc6 shared library from the .deb package
var binary = await ExtractLibcFromDebAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
}
}
return null;
}
private async Task<LibraryBinary?> TryFetchUbuntuPackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("UbuntuPackages");
// Map architecture to Ubuntu naming (same as Debian)
var debArch = MapToDebianArchitecture(architecture);
if (debArch is null)
{
return null;
}
// Query Launchpad for matching package
var packageUrls = await FindUbuntuPackageUrlsAsync(client, version, debArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Ubuntu package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
// Extract the libc6 shared library from the .deb package
var binary = await ExtractLibcFromDebAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Ubuntu package from {Url}", url);
}
}
return null;
}
private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
HttpClient client,
string version,
string debianArch,
CancellationToken ct)
{
// Construct Debian snapshot API URL
// Format: https://snapshot.debian.org/mr/package/glibc/<version>/binfiles/libc6/<arch>
var apiUrl = $"https://snapshot.debian.org/mr/package/glibc/{version}/binfiles/libc6/{debianArch}";
try
{
var response = await client.GetStringAsync(apiUrl, ct);
// Parse JSON response to get file hashes and construct download URLs
// Simplified: extract URLs from response
var urls = ExtractPackageUrlsFromSnapshotResponse(response);
return urls;
}
catch (HttpRequestException)
{
// Try alternative: direct binary package search
return await FindDebianPackageUrlsViaSearchAsync(client, version, debianArch, ct);
}
}
private async Task<ImmutableArray<string>> FindDebianPackageUrlsViaSearchAsync(
HttpClient client,
string version,
string debianArch,
CancellationToken ct)
{
// Fallback: search packages.debian.org
var searchUrl = $"https://packages.debian.org/search?keywords=libc6&searchon=names&suite=all&section=all&arch={debianArch}";
try
{
var html = await client.GetStringAsync(searchUrl, ct);
// Parse search results to find matching version
var urls = ParseDebianSearchResults(html, version, debianArch);
return urls;
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Debian package search failed");
return [];
}
}
private async Task<ImmutableArray<string>> FindUbuntuPackageUrlsAsync(
HttpClient client,
string version,
string debianArch,
CancellationToken ct)
{
// Query Launchpad for libc6 package
// Format: https://launchpad.net/ubuntu/+archive/primary/+files/libc6_<version>_<arch>.deb
var launchpadApiUrl = $"https://api.launchpad.net/1.0/ubuntu/+archive/primary?ws.op=getPublishedBinaries&binary_name=libc6&version={version}&distro_arch_series=https://api.launchpad.net/1.0/ubuntu/+distroarchseries/{debianArch}";
try
{
var response = await client.GetStringAsync(launchpadApiUrl, ct);
var urls = ExtractPackageUrlsFromLaunchpadResponse(response);
return urls;
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Launchpad API query failed");
return [];
}
}
private async Task<LibraryBinary?> ExtractLibcFromDebAsync(
byte[] debPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .deb files are ar archives containing:
// - debian-binary (version string)
// - control.tar.xz (package metadata)
// - data.tar.xz (actual files)
//
// We need to extract /lib/x86_64-linux-gnu/libc.so.6 from data.tar.xz
try
{
// Use SharpCompress or similar to extract (placeholder for now)
// In production, implement proper ar + tar.xz extraction
await Task.CompletedTask; // Placeholder for async extraction
// For now, return null - full extraction requires SharpCompress/libarchive
_logger.LogDebug(
"Debian package extraction not fully implemented. Package size: {Size} bytes",
debPackage.Length);
return null;
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to extract libc from .deb package");
return null;
}
}
private static string NormalizeArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" or "amd64" => "x86_64",
"aarch64" or "arm64" => "aarch64",
"armhf" or "armv7" or "arm" => "armhf",
"i386" or "i686" or "x86" => "i386",
"ppc64le" or "ppc64el" => "ppc64el",
"s390x" => "s390x",
_ => architecture
};
}
private static string? MapToDebianArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "amd64",
"aarch64" => "arm64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "i386",
"ppc64el" => "ppc64el",
"s390x" => "s390x",
_ => null
};
}
private static string? ExtractUpstreamVersion(string debianVersion)
{
// Debian version format: [epoch:]upstream_version[-debian_revision]
// Examples:
// 2.31-13+deb11u5 -> 2.31
// 1:2.35-0ubuntu3 -> 2.35
var match = UpstreamVersionRegex().Match(debianVersion);
return match.Success ? match.Groups["upstream"].Value : null;
}
private static ImmutableArray<string> ExtractPackageUrlsFromSnapshotResponse(string json)
{
// Parse JSON response from snapshot.debian.org
// Format: {"result": [{"hash": "...", "name": "libc6_2.31-13_amd64.deb"}]}
var urls = new List<string>();
try
{
using var doc = System.Text.Json.JsonDocument.Parse(json);
if (doc.RootElement.TryGetProperty("result", out var results))
{
foreach (var item in results.EnumerateArray())
{
if (item.TryGetProperty("hash", out var hashElement))
{
var hash = hashElement.GetString();
if (!string.IsNullOrEmpty(hash))
{
// Construct download URL from hash
var url = $"https://snapshot.debian.org/file/{hash}";
urls.Add(url);
}
}
}
}
}
catch (System.Text.Json.JsonException)
{
// Invalid JSON, return empty
}
return [.. urls];
}
private static ImmutableArray<string> ExtractPackageUrlsFromLaunchpadResponse(string json)
{
var urls = new List<string>();
try
{
using var doc = System.Text.Json.JsonDocument.Parse(json);
if (doc.RootElement.TryGetProperty("entries", out var entries))
{
foreach (var entry in entries.EnumerateArray())
{
if (entry.TryGetProperty("binary_package_version", out var versionElement) &&
entry.TryGetProperty("self_link", out var selfLink))
{
var link = selfLink.GetString();
if (!string.IsNullOrEmpty(link))
{
// Launchpad provides download URL in separate field
urls.Add(link);
}
}
}
}
}
catch (System.Text.Json.JsonException)
{
// Invalid JSON
}
return [.. urls];
}
private static ImmutableArray<string> ParseDebianSearchResults(
string html,
string version,
string debianArch)
{
// Parse HTML search results to find package URLs
// This is a simplified implementation
var urls = new List<string>();
var matches = DebianPackageUrlRegex().Matches(html);
foreach (Match match in matches)
{
if (match.Groups["url"].Success)
{
var url = match.Groups["url"].Value;
if (url.Contains(version) && url.Contains(debianArch))
{
urls.Add(url);
}
}
}
return [.. urls];
}
private static Version? ParseVersion(string versionString)
{
// Try to parse as Version, handling various formats
// 2.31 -> 2.31.0.0
// 2.31.1 -> 2.31.1.0
if (Version.TryParse(versionString, out var version))
{
return version;
}
// Try adding .0 suffix
if (Version.TryParse(versionString + ".0", out version))
{
return version;
}
return null;
}
#endregion
#region Generated Regexes
[GeneratedRegex(@"glibc-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
private static partial Regex GlibcVersionRegex();
[GeneratedRegex(@"(?<version>\d+\.\d+(?:\.\d+)?(?:-\d+)?)", RegexOptions.IgnoreCase)]
private static partial Regex DebianVersionRegex();
[GeneratedRegex(@"(?:^|\:)?(?<upstream>\d+\.\d+(?:\.\d+)?)(?:-|$)", RegexOptions.IgnoreCase)]
private static partial Regex UpstreamVersionRegex();
[GeneratedRegex(@"href=""(?<url>https?://[^""]+\.deb)""", RegexOptions.IgnoreCase)]
private static partial Regex DebianPackageUrlRegex();
#endregion
}

View File

@@ -0,0 +1,554 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Connectors;
/// <summary>
/// Corpus connector for OpenSSL libraries.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class OpenSslCorpusConnector : ILibraryCorpusConnector
{
private readonly IHttpClientFactory _httpClientFactory;
private readonly ILogger<OpenSslCorpusConnector> _logger;
/// <summary>
/// Base URL for OpenSSL official releases.
/// </summary>
public const string OpenSslReleasesUrl = "https://www.openssl.org/source/";
/// <summary>
/// Base URL for OpenSSL old releases.
/// </summary>
public const string OpenSslOldReleasesUrl = "https://www.openssl.org/source/old/";
/// <summary>
/// Supported architectures.
/// </summary>
private static readonly ImmutableArray<string> s_supportedArchitectures =
["x86_64", "aarch64", "armhf", "i386"];
public OpenSslCorpusConnector(
IHttpClientFactory httpClientFactory,
ILogger<OpenSslCorpusConnector> logger)
{
_httpClientFactory = httpClientFactory;
_logger = logger;
}
/// <inheritdoc />
public string LibraryName => "openssl";
/// <inheritdoc />
public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;
/// <inheritdoc />
public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
{
var client = _httpClientFactory.CreateClient("OpenSsl");
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
// Fetch current releases
try
{
_logger.LogDebug("Fetching OpenSSL versions from {Url}", OpenSslReleasesUrl);
var html = await client.GetStringAsync(OpenSslReleasesUrl, ct);
var currentVersions = ParseVersionsFromListing(html);
foreach (var v in currentVersions)
{
versions.Add(v);
}
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch current OpenSSL releases");
}
// Fetch old releases index
try
{
_logger.LogDebug("Fetching old OpenSSL versions from {Url}", OpenSslOldReleasesUrl);
var oldHtml = await client.GetStringAsync(OpenSslOldReleasesUrl, ct);
var oldVersionDirs = ParseOldVersionDirectories(oldHtml);
foreach (var dir in oldVersionDirs)
{
var dirUrl = $"{OpenSslOldReleasesUrl}{dir}/";
try
{
var dirHtml = await client.GetStringAsync(dirUrl, ct);
var dirVersions = ParseVersionsFromListing(dirHtml);
foreach (var v in dirVersions)
{
versions.Add(v);
}
}
catch (HttpRequestException)
{
// Skip directories we can't access
}
}
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch old OpenSSL releases");
}
_logger.LogInformation("Found {Count} OpenSSL versions", versions.Count);
return [.. versions.OrderByDescending(ParseVersion)];
}
/// <inheritdoc />
public async Task<LibraryBinary?> FetchBinaryAsync(
string version,
string architecture,
LibraryFetchOptions? options = null,
CancellationToken ct = default)
{
var normalizedArch = NormalizeArchitecture(architecture);
_logger.LogInformation(
"Fetching OpenSSL {Version} for {Architecture}",
version,
normalizedArch);
// Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
if (debBinary is not null)
{
_logger.LogDebug("Found OpenSSL {Version} from Debian packages", version);
return debBinary;
}
// Strategy 2: Try Alpine APK
var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
if (alpineBinary is not null)
{
_logger.LogDebug("Found OpenSSL {Version} from Alpine packages", version);
return alpineBinary;
}
_logger.LogWarning(
"Could not find pre-built OpenSSL {Version} for {Architecture}. Source build not implemented.",
version,
normalizedArch);
return null;
}
/// <inheritdoc />
public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
IEnumerable<string> versions,
string architecture,
LibraryFetchOptions? options = null,
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
foreach (var version in versions)
{
ct.ThrowIfCancellationRequested();
var binary = await FetchBinaryAsync(version, architecture, options, ct);
if (binary is not null)
{
yield return binary;
}
}
}
#region Private Methods
private ImmutableArray<string> ParseVersionsFromListing(string html)
{
// Match patterns like openssl-1.1.1n.tar.gz or openssl-3.0.8.tar.gz
var matches = OpenSslVersionRegex().Matches(html);
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
foreach (Match match in matches)
{
if (match.Groups["version"].Success)
{
var version = match.Groups["version"].Value;
// Versions are already canonical (e.g., 1.1.1n, 3.0.8); no normalization needed
versions.Add(version);
}
}
return [.. versions];
}
private ImmutableArray<string> ParseOldVersionDirectories(string html)
{
// Match directory names like 1.0.2/, 1.1.0/, 1.1.1/, 3.0/
var matches = VersionDirRegex().Matches(html);
var dirs = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
foreach (Match match in matches)
{
if (match.Groups["dir"].Success)
{
dirs.Add(match.Groups["dir"].Value);
}
}
return [.. dirs];
}
private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("DebianPackages");
var debArch = MapToDebianArchitecture(architecture);
if (debArch is null)
{
return null;
}
// Determine package name based on version
// OpenSSL 1.x -> libssl1.1
// OpenSSL 3.x -> libssl3
var packageName = GetDebianPackageName(version);
// Query Debian snapshot for matching package
var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Debian OpenSSL package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
// Extract libssl.so.X from the .deb package
var binary = await ExtractLibSslFromDebAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
}
}
return null;
}
private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("AlpinePackages");
var alpineArch = MapToAlpineArchitecture(architecture);
if (alpineArch is null)
{
return null;
}
// Query Alpine package repository
var packageUrls = await FindAlpinePackageUrlsAsync(client, "libssl3", version, alpineArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Alpine OpenSSL package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
// Extract libssl.so.X from the .apk package
var binary = await ExtractLibSslFromApkAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
}
}
return null;
}
private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
HttpClient client,
string packageName,
string version,
string debianArch,
CancellationToken ct)
{
// Map OpenSSL version to Debian source package version
// e.g., 1.1.1n -> libssl1.1_1.1.1n-0+deb11u4
var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";
try
{
var response = await client.GetStringAsync(apiUrl, ct);
// Parse JSON response to find matching versions
var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
return urls;
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
return [];
}
}
private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
HttpClient client,
string packageName,
string version,
string alpineArch,
CancellationToken ct)
{
// Alpine uses different repository structure
// https://dl-cdn.alpinelinux.org/alpine/v3.18/main/x86_64/libssl3-3.1.1-r1.apk
var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
var urls = new List<string>();
foreach (var release in releases)
{
var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";
try
{
var html = await client.GetStringAsync(baseUrl, ct);
// Find package URLs matching version
var matches = AlpinePackageRegex().Matches(html);
foreach (Match match in matches)
{
if (match.Groups["name"].Value == packageName &&
match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
{
urls.Add($"{baseUrl}{match.Groups["file"].Value}");
}
}
}
catch (HttpRequestException)
{
// Skip releases we can't access
}
}
return [.. urls];
}
private async Task<LibraryBinary?> ExtractLibSslFromDebAsync(
byte[] debPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .deb extraction - placeholder for now
// In production, implement proper ar + tar.xz extraction
await Task.CompletedTask;
_logger.LogDebug(
"Debian package extraction not fully implemented. Package size: {Size} bytes",
debPackage.Length);
return null;
}
private async Task<LibraryBinary?> ExtractLibSslFromApkAsync(
byte[] apkPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .apk files are gzip-compressed tar archives
// In production, implement proper tar.gz extraction
await Task.CompletedTask;
_logger.LogDebug(
"Alpine package extraction not fully implemented. Package size: {Size} bytes",
apkPackage.Length);
return null;
}
private static string GetDebianPackageName(string version)
{
// OpenSSL 1.0.x -> libssl1.0.0
// OpenSSL 1.1.x -> libssl1.1
// OpenSSL 3.x -> libssl3
if (version.StartsWith("1.0", StringComparison.OrdinalIgnoreCase))
{
return "libssl1.0.0";
}
else if (version.StartsWith("1.1", StringComparison.OrdinalIgnoreCase))
{
return "libssl1.1";
}
else
{
return "libssl3";
}
}
private static ImmutableArray<string> ExtractPackageUrlsForVersion(
string json,
string version,
string debianArch)
{
var urls = new List<string>();
try
{
using var doc = System.Text.Json.JsonDocument.Parse(json);
if (doc.RootElement.TryGetProperty("result", out var results))
{
foreach (var item in results.EnumerateArray())
{
if (item.TryGetProperty("binary_version", out var binaryVersion) &&
item.TryGetProperty("architecture", out var arch))
{
var binVer = binaryVersion.GetString() ?? string.Empty;
var archStr = arch.GetString() ?? string.Empty;
// Check if version matches and architecture matches
if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
{
if (item.TryGetProperty("files", out var files))
{
foreach (var file in files.EnumerateArray())
{
if (file.TryGetProperty("hash", out var hashElement))
{
var hash = hashElement.GetString();
if (!string.IsNullOrEmpty(hash))
{
urls.Add($"https://snapshot.debian.org/file/{hash}");
}
}
}
}
}
}
}
}
}
catch (System.Text.Json.JsonException)
{
// Invalid JSON
}
return [.. urls];
}
private static string NormalizeArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" or "amd64" => "x86_64",
"aarch64" or "arm64" => "aarch64",
"armhf" or "armv7" or "arm" => "armhf",
"i386" or "i686" or "x86" => "i386",
_ => architecture
};
}
private static string? MapToDebianArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "amd64",
"aarch64" => "arm64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "i386",
_ => null
};
}
private static string? MapToAlpineArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "x86_64",
"aarch64" => "aarch64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "x86",
_ => null
};
}
private static Version? ParseVersion(string versionString)
{
// OpenSSL versions can be like 1.1.1n or 3.0.8
// Extract numeric parts only
var numericPart = ExtractNumericVersion(versionString);
if (Version.TryParse(numericPart, out var version))
{
return version;
}
return null;
}
private static string ExtractNumericVersion(string version)
{
// 1.1.1n -> 1.1.1
// 3.0.8 -> 3.0.8
var parts = new List<string>();
foreach (var ch in version)
{
if (char.IsDigit(ch) || ch == '.')
{
if (parts.Count == 0)
{
parts.Add(ch.ToString());
}
else if (ch == '.')
{
parts.Add(".");
}
else
{
parts[^1] += ch;
}
}
else if (parts.Count > 0 && parts[^1] != ".")
{
// Stop at first non-digit after version starts
break;
}
}
return string.Join("", parts).TrimEnd('.');
}
#endregion
#region Generated Regexes
[GeneratedRegex(@"openssl-(?<version>\d+\.\d+\.\d+[a-z]?)", RegexOptions.IgnoreCase)]
private static partial Regex OpenSslVersionRegex();
[GeneratedRegex(@"href=""(?<dir>\d+\.\d+(?:\.\d+)?)/""", RegexOptions.IgnoreCase)]
private static partial Regex VersionDirRegex();
[GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+[a-z]?-r\d+)\.apk)""", RegexOptions.IgnoreCase)]
private static partial Regex AlpinePackageRegex();
#endregion
}

View File

@@ -0,0 +1,452 @@
using System.Collections.Immutable;
using System.Net.Http;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Connectors;
/// <summary>
/// Corpus connector for zlib compression library.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class ZlibCorpusConnector : ILibraryCorpusConnector
{
private readonly IHttpClientFactory _httpClientFactory;
private readonly ILogger<ZlibCorpusConnector> _logger;
/// <summary>
/// Base URL for zlib official releases.
/// </summary>
public const string ZlibReleasesUrl = "https://www.zlib.net/";
/// <summary>
/// Base URL for zlib fossils/old releases.
/// </summary>
public const string ZlibFossilsUrl = "https://www.zlib.net/fossils/";
/// <summary>
/// Supported architectures.
/// </summary>
private static readonly ImmutableArray<string> s_supportedArchitectures =
["x86_64", "aarch64", "armhf", "i386"];
public ZlibCorpusConnector(
IHttpClientFactory httpClientFactory,
ILogger<ZlibCorpusConnector> logger)
{
_httpClientFactory = httpClientFactory;
_logger = logger;
}
/// <inheritdoc />
public string LibraryName => "zlib";
/// <inheritdoc />
public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;
/// <inheritdoc />
public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
{
var client = _httpClientFactory.CreateClient("Zlib");
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
// Fetch current release
try
{
_logger.LogDebug("Fetching zlib versions from {Url}", ZlibReleasesUrl);
var html = await client.GetStringAsync(ZlibReleasesUrl, ct);
var currentVersions = ParseVersionsFromListing(html);
foreach (var v in currentVersions)
{
versions.Add(v);
}
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch current zlib releases");
}
// Fetch old releases (fossils)
try
{
_logger.LogDebug("Fetching old zlib versions from {Url}", ZlibFossilsUrl);
var fossilsHtml = await client.GetStringAsync(ZlibFossilsUrl, ct);
var fossilVersions = ParseVersionsFromListing(fossilsHtml);
foreach (var v in fossilVersions)
{
versions.Add(v);
}
}
catch (HttpRequestException ex)
{
_logger.LogWarning(ex, "Failed to fetch old zlib releases");
}
_logger.LogInformation("Found {Count} zlib versions", versions.Count);
return [.. versions.OrderByDescending(ParseVersion)];
}
/// <inheritdoc />
public async Task<LibraryBinary?> FetchBinaryAsync(
string version,
string architecture,
LibraryFetchOptions? options = null,
CancellationToken ct = default)
{
var normalizedArch = NormalizeArchitecture(architecture);
_logger.LogInformation(
"Fetching zlib {Version} for {Architecture}",
version,
normalizedArch);
// Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
if (debBinary is not null)
{
_logger.LogDebug("Found zlib {Version} from Debian packages", version);
return debBinary;
}
// Strategy 2: Try Alpine APK
var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
if (alpineBinary is not null)
{
_logger.LogDebug("Found zlib {Version} from Alpine packages", version);
return alpineBinary;
}
_logger.LogWarning(
"Could not find pre-built zlib {Version} for {Architecture}. Source build not implemented.",
version,
normalizedArch);
return null;
}
/// <inheritdoc />
public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
IEnumerable<string> versions,
string architecture,
LibraryFetchOptions? options = null,
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
foreach (var version in versions)
{
ct.ThrowIfCancellationRequested();
var binary = await FetchBinaryAsync(version, architecture, options, ct);
if (binary is not null)
{
yield return binary;
}
}
}
#region Private Methods
private ImmutableArray<string> ParseVersionsFromListing(string html)
{
// Match patterns like zlib-1.2.13.tar.gz or zlib-1.3.1.tar.xz
var matches = ZlibVersionRegex().Matches(html);
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
foreach (Match match in matches)
{
if (match.Groups["version"].Success)
{
versions.Add(match.Groups["version"].Value);
}
}
return [.. versions];
}
private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("DebianPackages");
var debArch = MapToDebianArchitecture(architecture);
if (debArch is null)
{
return null;
}
// zlib package name is zlib1g
const string packageName = "zlib1g";
// Query Debian snapshot for matching package
var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Debian zlib package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
// Extract libz.so.1 from the .deb package
var binary = await ExtractLibZFromDebAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
}
}
return null;
}
private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
var client = _httpClientFactory.CreateClient("AlpinePackages");
var alpineArch = MapToAlpineArchitecture(architecture);
if (alpineArch is null)
{
return null;
}
// Query Alpine package repository for zlib
var packageUrls = await FindAlpinePackageUrlsAsync(client, "zlib", version, alpineArch, ct);
foreach (var url in packageUrls)
{
try
{
_logger.LogDebug("Trying Alpine zlib package URL: {Url}", url);
var packageBytes = await client.GetByteArrayAsync(url, ct);
// Extract libz.so.1 from the .apk package
var binary = await ExtractLibZFromApkAsync(packageBytes, version, architecture, options, ct);
if (binary is not null)
{
return binary;
}
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
}
}
return null;
}
private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
HttpClient client,
string packageName,
string version,
string debianArch,
CancellationToken ct)
{
var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";
try
{
var response = await client.GetStringAsync(apiUrl, ct);
var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
return urls;
}
catch (HttpRequestException ex)
{
_logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
return [];
}
}
private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
HttpClient client,
string packageName,
string version,
string alpineArch,
CancellationToken ct)
{
var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
var urls = new List<string>();
foreach (var release in releases)
{
var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";
try
{
var html = await client.GetStringAsync(baseUrl, ct);
// Find package URLs matching version
var matches = AlpinePackageRegex().Matches(html);
foreach (Match match in matches)
{
if (match.Groups["name"].Value == packageName &&
match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
{
urls.Add($"{baseUrl}{match.Groups["file"].Value}");
}
}
}
catch (HttpRequestException)
{
// Skip releases we can't access
}
}
return [.. urls];
}
private async Task<LibraryBinary?> ExtractLibZFromDebAsync(
byte[] debPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .deb extraction - placeholder for now
await Task.CompletedTask;
_logger.LogDebug(
"Debian package extraction not fully implemented. Package size: {Size} bytes",
debPackage.Length);
return null;
}
private async Task<LibraryBinary?> ExtractLibZFromApkAsync(
byte[] apkPackage,
string version,
string architecture,
LibraryFetchOptions? options,
CancellationToken ct)
{
// .apk extraction - placeholder for now
await Task.CompletedTask;
_logger.LogDebug(
"Alpine package extraction not fully implemented. Package size: {Size} bytes",
apkPackage.Length);
return null;
}
private static ImmutableArray<string> ExtractPackageUrlsForVersion(
string json,
string version,
string debianArch)
{
var urls = new List<string>();
try
{
using var doc = System.Text.Json.JsonDocument.Parse(json);
if (doc.RootElement.TryGetProperty("result", out var results))
{
foreach (var item in results.EnumerateArray())
{
if (item.TryGetProperty("binary_version", out var binaryVersion) &&
item.TryGetProperty("architecture", out var arch))
{
var binVer = binaryVersion.GetString() ?? string.Empty;
var archStr = arch.GetString() ?? string.Empty;
// Check if version matches and architecture matches
if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
{
if (item.TryGetProperty("files", out var files))
{
foreach (var file in files.EnumerateArray())
{
if (file.TryGetProperty("hash", out var hashElement))
{
var hash = hashElement.GetString();
if (!string.IsNullOrEmpty(hash))
{
urls.Add($"https://snapshot.debian.org/file/{hash}");
}
}
}
}
}
}
}
}
}
catch (System.Text.Json.JsonException)
{
// Invalid JSON
}
return [.. urls];
}
private static string NormalizeArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" or "amd64" => "x86_64",
"aarch64" or "arm64" => "aarch64",
"armhf" or "armv7" or "arm" => "armhf",
"i386" or "i686" or "x86" => "i386",
_ => architecture
};
}
private static string? MapToDebianArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "amd64",
"aarch64" => "arm64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "i386",
_ => null
};
}
private static string? MapToAlpineArchitecture(string architecture)
{
return architecture.ToLowerInvariant() switch
{
"x86_64" => "x86_64",
"aarch64" => "aarch64",
"armhf" or "armv7" => "armhf",
"i386" or "i686" => "x86",
_ => null
};
}
private static Version? ParseVersion(string versionString)
{
if (Version.TryParse(versionString, out var version))
{
return version;
}
return null;
}
#endregion
#region Generated Regexes
[GeneratedRegex(@"zlib-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
private static partial Regex ZlibVersionRegex();
[GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+(?:-r\d+)?)\.apk)""", RegexOptions.IgnoreCase)]
private static partial Regex AlpinePackageRegex();
#endregion
}

View File

@@ -0,0 +1,135 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus;
/// <summary>
/// Service for ingesting library functions into the corpus.
/// </summary>
public interface ICorpusIngestionService
{
/// <summary>
/// Ingest all functions from a library binary.
/// </summary>
/// <param name="metadata">Library metadata.</param>
/// <param name="binaryStream">Binary file stream.</param>
/// <param name="options">Ingestion options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Ingestion result with statistics.</returns>
Task<IngestionResult> IngestLibraryAsync(
LibraryIngestionMetadata metadata,
Stream binaryStream,
IngestionOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Ingest functions from a library connector.
/// </summary>
/// <param name="libraryName">Library name (e.g., "glibc").</param>
/// <param name="connector">Library corpus connector.</param>
/// <param name="options">Ingestion options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Stream of ingestion results.</returns>
IAsyncEnumerable<IngestionResult> IngestFromConnectorAsync(
string libraryName,
ILibraryCorpusConnector connector,
IngestionOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Update CVE associations for corpus functions.
/// </summary>
/// <param name="cveId">CVE identifier.</param>
/// <param name="associations">Function-CVE associations.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Number of associations updated.</returns>
Task<int> UpdateCveAssociationsAsync(
string cveId,
IReadOnlyList<FunctionCveAssociation> associations,
CancellationToken ct = default);
/// <summary>
/// Get ingestion job status.
/// </summary>
/// <param name="jobId">Job ID.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Job details or null if not found.</returns>
Task<IngestionJob?> GetJobStatusAsync(Guid jobId, CancellationToken ct = default);
}
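// Hedged usage sketch: one way a host might drive IngestLibraryAsync end to end.
// The service instance, file path, and metadata values are hypothetical; only the
// contracts defined in this file are assumed.
public static class CorpusIngestionExample
{
    public static async Task RunAsync(ICorpusIngestionService service, CancellationToken ct = default)
    {
        // Hypothetical on-disk binary; any seekable stream works.
        await using var stream = File.OpenRead("/tmp/libz.so.1.3.1");
        var metadata = new LibraryIngestionMetadata(
            Name: "zlib",
            Version: "1.3.1",
            Architecture: "x86_64");
        var result = await service.IngestLibraryAsync(metadata, stream, ct: ct);
        Console.WriteLine(
            $"Indexed {result.FunctionsIndexed} functions and " +
            $"{result.FingerprintsGenerated} fingerprints in {result.Duration}.");
    }
}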
/// <summary>
/// Metadata for library ingestion.
/// </summary>
public sealed record LibraryIngestionMetadata(
string Name,
string Version,
string Architecture,
string? Abi = null,
string? Compiler = null,
string? CompilerVersion = null,
string? OptimizationLevel = null,
DateOnly? ReleaseDate = null,
bool IsSecurityRelease = false,
string? SourceArchiveSha256 = null);
/// <summary>
/// Options for corpus ingestion.
/// </summary>
public sealed record IngestionOptions
{
/// <summary>
/// Minimum function size to index (bytes).
/// </summary>
public int MinFunctionSize { get; init; } = 16;
/// <summary>
/// Maximum functions per binary.
/// </summary>
public int MaxFunctionsPerBinary { get; init; } = 10_000;
/// <summary>
/// Algorithms to use for fingerprinting.
/// </summary>
public ImmutableArray<FingerprintAlgorithm> Algorithms { get; init; } =
[FingerprintAlgorithm.SemanticKsg, FingerprintAlgorithm.InstructionBb, FingerprintAlgorithm.CfgWl];
/// <summary>
/// Include exported functions only.
/// </summary>
public bool ExportedOnly { get; init; } = false;
/// <summary>
/// Generate function clusters after ingestion.
/// </summary>
public bool GenerateClusters { get; init; } = true;
/// <summary>
/// Parallel degree for function processing.
/// </summary>
public int ParallelDegree { get; init; } = 4;
}
/// <summary>
/// Result of a library ingestion.
/// </summary>
public sealed record IngestionResult(
Guid JobId,
string LibraryName,
string Version,
string Architecture,
int FunctionsIndexed,
int FingerprintsGenerated,
int ClustersCreated,
TimeSpan Duration,
ImmutableArray<string> Errors,
ImmutableArray<string> Warnings);
/// <summary>
/// Association between a function and a CVE.
/// </summary>
public sealed record FunctionCveAssociation(
Guid FunctionId,
CveAffectedState AffectedState,
string? PatchCommit,
decimal Confidence,
CveEvidenceType? EvidenceType);

View File

@@ -0,0 +1,186 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus;
/// <summary>
/// Service for querying the function corpus.
/// </summary>
public interface ICorpusQueryService
{
/// <summary>
/// Identify a function by its fingerprints.
/// </summary>
/// <param name="fingerprints">Function fingerprints to match.</param>
/// <param name="options">Query options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Matching functions ordered by similarity.</returns>
Task<ImmutableArray<FunctionMatch>> IdentifyFunctionAsync(
FunctionFingerprints fingerprints,
IdentifyOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Batch identify functions.
/// </summary>
/// <param name="fingerprints">Multiple function fingerprints.</param>
/// <param name="options">Query options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Matches for each input fingerprint.</returns>
Task<ImmutableDictionary<int, ImmutableArray<FunctionMatch>>> IdentifyBatchAsync(
IReadOnlyList<FunctionFingerprints> fingerprints,
IdentifyOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Get all functions associated with a CVE.
/// </summary>
/// <param name="cveId">CVE identifier.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Functions affected by the CVE.</returns>
Task<ImmutableArray<CorpusFunctionWithCve>> GetFunctionsForCveAsync(
string cveId,
CancellationToken ct = default);
/// <summary>
/// Get function evolution across library versions.
/// </summary>
/// <param name="libraryName">Library name.</param>
/// <param name="functionName">Function name.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Function evolution timeline.</returns>
Task<FunctionEvolution?> GetFunctionEvolutionAsync(
string libraryName,
string functionName,
CancellationToken ct = default);
/// <summary>
/// Get corpus statistics.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>Corpus statistics.</returns>
Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default);
/// <summary>
/// List libraries in the corpus.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>Libraries with version counts.</returns>
Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default);
/// <summary>
/// List versions for a library.
/// </summary>
/// <param name="libraryName">Library name.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Version information.</returns>
Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
string libraryName,
CancellationToken ct = default);
}
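// Hedged usage sketch for IdentifyFunctionAsync. The hash bytes below are
// placeholders, not real fingerprints; only the contracts in this file are assumed.
public static class CorpusQueryExample
{
    public static async Task RunAsync(ICorpusQueryService queries, CancellationToken ct = default)
    {
        var fingerprints = new FunctionFingerprints(
            SemanticHash: new byte[] { 0x01, 0x02, 0x03 }, // placeholder bytes
            InstructionHash: null,
            CfgHash: null,
            ApiCalls: null,
            SizeBytes: 512);
        var matches = await queries.IdentifyFunctionAsync(
            fingerprints,
            new IdentifyOptions { MinSimilarity = 0.80m, MaxResults = 5 },
            ct);
        foreach (var match in matches)
        {
            Console.WriteLine(
                $"{match.LibraryName} {match.Version} {match.FunctionName}: " +
                $"{match.Similarity:P0} ({match.Confidence})");
        }
    }
}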
/// <summary>
/// Fingerprints for function identification.
/// </summary>
public sealed record FunctionFingerprints(
byte[]? SemanticHash,
byte[]? InstructionHash,
byte[]? CfgHash,
ImmutableArray<string>? ApiCalls,
int? SizeBytes);
/// <summary>
/// Options for function identification.
/// </summary>
public sealed record IdentifyOptions
{
/// <summary>
/// Minimum similarity threshold (0.0-1.0).
/// </summary>
public decimal MinSimilarity { get; init; } = 0.70m;
/// <summary>
/// Maximum results to return.
/// </summary>
public int MaxResults { get; init; } = 10;
/// <summary>
/// Filter by library names.
/// </summary>
public ImmutableArray<string>? LibraryFilter { get; init; }
/// <summary>
/// Filter by architectures.
/// </summary>
public ImmutableArray<string>? ArchitectureFilter { get; init; }
/// <summary>
/// Include CVE information in results.
/// </summary>
public bool IncludeCveInfo { get; init; } = true;
/// <summary>
/// Weights for similarity computation.
/// </summary>
public SimilarityWeights Weights { get; init; } = SimilarityWeights.Default;
}
/// <summary>
/// Weights for computing overall similarity.
/// </summary>
public sealed record SimilarityWeights
{
public decimal SemanticWeight { get; init; } = 0.35m;
public decimal InstructionWeight { get; init; } = 0.25m;
public decimal CfgWeight { get; init; } = 0.25m;
public decimal ApiCallWeight { get; init; } = 0.15m;
public static SimilarityWeights Default { get; } = new();
}
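// Hedged sketch: scores combine as a weighted average, sum(score_i * w_i) / sum(w_i),
// mirroring how the query service combines per-algorithm scores. With the defaults and
// illustrative scores (0.9, 0.8, 0.7, 0.6): 0.9*0.35 + 0.8*0.25 + 0.7*0.25 + 0.6*0.15 = 0.78.
public static class SimilarityWeightsExample
{
    public static decimal Combine(
        SimilarityWeights w,
        decimal semantic,
        decimal instruction,
        decimal cfg,
        decimal apiCalls)
    {
        var totalWeight = w.SemanticWeight + w.InstructionWeight + w.CfgWeight + w.ApiCallWeight;
        var weighted = semantic * w.SemanticWeight
                     + instruction * w.InstructionWeight
                     + cfg * w.CfgWeight
                     + apiCalls * w.ApiCallWeight;
        return totalWeight > 0 ? weighted / totalWeight : 0m;
    }
}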
/// <summary>
/// Function with CVE information.
/// </summary>
public sealed record CorpusFunctionWithCve(
CorpusFunction Function,
LibraryMetadata Library,
LibraryVersion Version,
BuildVariant Build,
FunctionCve CveInfo);
/// <summary>
/// Corpus statistics.
/// </summary>
public sealed record CorpusStatistics(
int LibraryCount,
int VersionCount,
int BuildVariantCount,
int FunctionCount,
int FingerprintCount,
int ClusterCount,
int CveAssociationCount,
DateTimeOffset? LastUpdated);
/// <summary>
/// Summary of a library in the corpus.
/// </summary>
public sealed record LibrarySummary(
Guid Id,
string Name,
string? Description,
int VersionCount,
int FunctionCount,
int CveCount,
DateTimeOffset? LatestVersionDate);
/// <summary>
/// Summary of a library version.
/// </summary>
public sealed record LibraryVersionSummary(
Guid Id,
string Version,
DateOnly? ReleaseDate,
bool IsSecurityRelease,
int BuildVariantCount,
int FunctionCount,
ImmutableArray<string> Architectures);

View File

@@ -0,0 +1,327 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus;
/// <summary>
/// Repository for corpus data access.
/// </summary>
public interface ICorpusRepository
{
#region Libraries
/// <summary>
/// Get or create a library.
/// </summary>
Task<LibraryMetadata> GetOrCreateLibraryAsync(
string name,
string? description = null,
string? homepageUrl = null,
string? sourceRepo = null,
CancellationToken ct = default);
/// <summary>
/// Get a library by name.
/// </summary>
Task<LibraryMetadata?> GetLibraryAsync(string name, CancellationToken ct = default);
/// <summary>
/// Get a library by ID.
/// </summary>
Task<LibraryMetadata?> GetLibraryByIdAsync(Guid id, CancellationToken ct = default);
/// <summary>
/// List all libraries.
/// </summary>
Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default);
#endregion
#region Library Versions
/// <summary>
/// Get or create a library version.
/// </summary>
Task<LibraryVersion> GetOrCreateVersionAsync(
Guid libraryId,
string version,
DateOnly? releaseDate = null,
bool isSecurityRelease = false,
string? sourceArchiveSha256 = null,
CancellationToken ct = default);
/// <summary>
/// Get a library version.
/// </summary>
Task<LibraryVersion?> GetVersionAsync(
Guid libraryId,
string version,
CancellationToken ct = default);
/// <summary>
/// Get a library version by ID.
/// </summary>
Task<LibraryVersion?> GetLibraryVersionAsync(
Guid versionId,
CancellationToken ct = default);
/// <summary>
/// List versions for a library.
/// </summary>
Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
string libraryName,
CancellationToken ct = default);
#endregion
#region Build Variants
/// <summary>
/// Get or create a build variant.
/// </summary>
Task<BuildVariant> GetOrCreateBuildVariantAsync(
Guid libraryVersionId,
string architecture,
string binarySha256,
string? abi = null,
string? compiler = null,
string? compilerVersion = null,
string? optimizationLevel = null,
string? buildId = null,
CancellationToken ct = default);
/// <summary>
/// Get a build variant by binary hash.
/// </summary>
Task<BuildVariant?> GetBuildVariantBySha256Async(
string binarySha256,
CancellationToken ct = default);
/// <summary>
/// Get a build variant by ID.
/// </summary>
Task<BuildVariant?> GetBuildVariantAsync(
Guid variantId,
CancellationToken ct = default);
/// <summary>
/// Get build variants for a version.
/// </summary>
Task<ImmutableArray<BuildVariant>> GetBuildVariantsAsync(
Guid libraryVersionId,
CancellationToken ct = default);
#endregion
#region Functions
/// <summary>
/// Bulk insert functions.
/// </summary>
Task<int> InsertFunctionsAsync(
IReadOnlyList<CorpusFunction> functions,
CancellationToken ct = default);
/// <summary>
/// Get a function by ID.
/// </summary>
Task<CorpusFunction?> GetFunctionAsync(Guid id, CancellationToken ct = default);
/// <summary>
/// Get functions for a build variant.
/// </summary>
Task<ImmutableArray<CorpusFunction>> GetFunctionsForVariantAsync(
Guid buildVariantId,
CancellationToken ct = default);
/// <summary>
/// Get function count for a build variant.
/// </summary>
Task<int> GetFunctionCountAsync(Guid buildVariantId, CancellationToken ct = default);
#endregion
#region Fingerprints
/// <summary>
/// Bulk insert fingerprints.
/// </summary>
Task<int> InsertFingerprintsAsync(
IReadOnlyList<CorpusFingerprint> fingerprints,
CancellationToken ct = default);
/// <summary>
/// Find functions by fingerprint hash.
/// </summary>
Task<ImmutableArray<Guid>> FindFunctionsByFingerprintAsync(
FingerprintAlgorithm algorithm,
byte[] fingerprint,
CancellationToken ct = default);
/// <summary>
/// Find similar fingerprints (for approximate matching).
/// </summary>
Task<ImmutableArray<FingerprintSearchResult>> FindSimilarFingerprintsAsync(
FingerprintAlgorithm algorithm,
byte[] fingerprint,
int maxResults = 10,
CancellationToken ct = default);
/// <summary>
/// Get fingerprints for a function.
/// </summary>
Task<ImmutableArray<CorpusFingerprint>> GetFingerprintsAsync(
Guid functionId,
CancellationToken ct = default);
/// <summary>
    /// Get fingerprints for a function (alias of <see cref="GetFingerprintsAsync"/>).
/// </summary>
Task<ImmutableArray<CorpusFingerprint>> GetFingerprintsForFunctionAsync(
Guid functionId,
CancellationToken ct = default);
#endregion
#region Clusters
/// <summary>
/// Get or create a function cluster.
/// </summary>
Task<FunctionCluster> GetOrCreateClusterAsync(
Guid libraryId,
string canonicalName,
string? description = null,
CancellationToken ct = default);
/// <summary>
/// Get a cluster by ID.
/// </summary>
Task<FunctionCluster?> GetClusterAsync(
Guid clusterId,
CancellationToken ct = default);
/// <summary>
/// Get all clusters for a library.
/// </summary>
Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
Guid libraryId,
CancellationToken ct = default);
/// <summary>
/// Insert a new cluster.
/// </summary>
Task InsertClusterAsync(
FunctionCluster cluster,
CancellationToken ct = default);
/// <summary>
/// Add members to a cluster.
/// </summary>
Task<int> AddClusterMembersAsync(
Guid clusterId,
IReadOnlyList<ClusterMember> members,
CancellationToken ct = default);
/// <summary>
/// Add a single member to a cluster.
/// </summary>
Task AddClusterMemberAsync(
ClusterMember member,
CancellationToken ct = default);
/// <summary>
/// Get cluster members.
/// </summary>
Task<ImmutableArray<Guid>> GetClusterMemberIdsAsync(
Guid clusterId,
CancellationToken ct = default);
/// <summary>
/// Get cluster members with details.
/// </summary>
Task<ImmutableArray<ClusterMember>> GetClusterMembersAsync(
Guid clusterId,
CancellationToken ct = default);
/// <summary>
/// Clear all members from a cluster.
/// </summary>
Task ClearClusterMembersAsync(
Guid clusterId,
CancellationToken ct = default);
#endregion
#region CVE Associations
/// <summary>
/// Upsert CVE associations.
/// </summary>
Task<int> UpsertCveAssociationsAsync(
string cveId,
IReadOnlyList<FunctionCve> associations,
CancellationToken ct = default);
/// <summary>
/// Get functions for a CVE.
/// </summary>
Task<ImmutableArray<Guid>> GetFunctionIdsForCveAsync(
string cveId,
CancellationToken ct = default);
/// <summary>
/// Get CVEs for a function.
/// </summary>
Task<ImmutableArray<FunctionCve>> GetCvesForFunctionAsync(
Guid functionId,
CancellationToken ct = default);
#endregion
#region Ingestion Jobs
/// <summary>
/// Create an ingestion job.
/// </summary>
Task<IngestionJob> CreateIngestionJobAsync(
Guid libraryId,
IngestionJobType jobType,
CancellationToken ct = default);
/// <summary>
/// Update ingestion job status.
/// </summary>
Task UpdateIngestionJobAsync(
Guid jobId,
IngestionJobStatus status,
int? functionsIndexed = null,
int? fingerprintsGenerated = null,
int? clustersCreated = null,
ImmutableArray<string>? errors = null,
CancellationToken ct = default);
/// <summary>
/// Get ingestion job.
/// </summary>
Task<IngestionJob?> GetIngestionJobAsync(Guid jobId, CancellationToken ct = default);
#endregion
#region Statistics
/// <summary>
/// Get corpus statistics.
/// </summary>
Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default);
#endregion
}
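// Hedged sketch: the create-or-get chain an ingester typically walks through this
// repository. The literal values are placeholders; only the contracts above are assumed.
public static class CorpusRepositoryExample
{
    public static async Task<BuildVariant> EnsureVariantAsync(
        ICorpusRepository repo,
        CancellationToken ct = default)
    {
        var library = await repo.GetOrCreateLibraryAsync("openssl", ct: ct);
        var version = await repo.GetOrCreateVersionAsync(library.Id, "3.0.13", ct: ct);
        return await repo.GetOrCreateBuildVariantAsync(
            version.Id,
            architecture: "x86_64",
            // Placeholder digest; a real caller passes the binary's actual SHA-256.
            binarySha256: new string('0', 64),
            ct: ct);
    }
}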
/// <summary>
/// Result of a fingerprint similarity search.
/// </summary>
public sealed record FingerprintSearchResult(
Guid FunctionId,
byte[] Fingerprint,
decimal Similarity);

View File

@@ -0,0 +1,155 @@
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus;
/// <summary>
/// Connector for fetching library binaries from various sources.
/// Used to populate the function corpus.
/// </summary>
public interface ILibraryCorpusConnector
{
/// <summary>
/// Library name this connector handles (e.g., "glibc", "openssl").
/// </summary>
string LibraryName { get; }
/// <summary>
/// Supported architectures.
/// </summary>
ImmutableArray<string> SupportedArchitectures { get; }
/// <summary>
/// Get available versions of the library.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>Available versions ordered newest first.</returns>
Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default);
/// <summary>
/// Fetch a library binary for a specific version and architecture.
/// </summary>
/// <param name="version">Library version.</param>
/// <param name="architecture">Target architecture.</param>
/// <param name="options">Fetch options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Library binary or null if not available.</returns>
Task<LibraryBinary?> FetchBinaryAsync(
string version,
string architecture,
LibraryFetchOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Stream binaries for multiple versions.
/// </summary>
/// <param name="versions">Versions to fetch.</param>
/// <param name="architecture">Target architecture.</param>
/// <param name="options">Fetch options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Stream of library binaries.</returns>
IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
IEnumerable<string> versions,
string architecture,
LibraryFetchOptions? options = null,
CancellationToken ct = default);
}
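// Hedged usage sketch: streams binaries from a connector, disposing each LibraryBinary
// (and the stream it owns). The connector instance is hypothetical; only the contract
// above is assumed.
public static class LibraryConnectorExample
{
    public static async Task RunAsync(ILibraryCorpusConnector connector, CancellationToken ct = default)
    {
        var versions = await connector.GetAvailableVersionsAsync(ct);
        await foreach (var binary in connector.FetchBinariesAsync(versions.Take(3), "x86_64", ct: ct))
        {
            using (binary) // LibraryBinary owns its stream; dispose after use
            {
                Console.WriteLine($"{binary.LibraryName} {binary.Version}: sha256={binary.Sha256[..12]}...");
            }
        }
    }
}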
/// <summary>
/// A library binary fetched from a connector.
/// </summary>
public sealed record LibraryBinary(
string LibraryName,
string Version,
string Architecture,
string? Abi,
string? Compiler,
string? CompilerVersion,
string? OptimizationLevel,
Stream BinaryStream,
string Sha256,
string? BuildId,
LibraryBinarySource Source,
DateOnly? ReleaseDate) : IDisposable
{
public void Dispose()
{
BinaryStream.Dispose();
}
}
/// <summary>
/// Source of a library binary.
/// </summary>
public sealed record LibraryBinarySource(
LibrarySourceType Type,
string? PackageName,
string? DistroRelease,
string? MirrorUrl);
/// <summary>
/// Type of library source.
/// </summary>
public enum LibrarySourceType
{
/// <summary>
/// Binary from Debian/Ubuntu package.
/// </summary>
DebianPackage,
/// <summary>
/// Binary from RPM package.
/// </summary>
RpmPackage,
/// <summary>
/// Binary from Alpine APK.
/// </summary>
AlpineApk,
/// <summary>
/// Binary compiled from source.
/// </summary>
CompiledSource,
/// <summary>
/// Binary from upstream release.
/// </summary>
UpstreamRelease,
/// <summary>
/// Binary from debug symbol server.
/// </summary>
DebugSymbolServer
}
/// <summary>
/// Options for fetching library binaries.
/// </summary>
public sealed record LibraryFetchOptions
{
/// <summary>
/// Preferred ABI (e.g., "gnu", "musl").
/// </summary>
public string? PreferredAbi { get; init; }
/// <summary>
/// Preferred compiler.
/// </summary>
public string? PreferredCompiler { get; init; }
/// <summary>
/// Include debug symbols if available.
/// </summary>
public bool IncludeDebugSymbols { get; init; } = true;
/// <summary>
/// Preferred distro for pre-built packages.
/// </summary>
public string? PreferredDistro { get; init; }
/// <summary>
/// Timeout for network operations.
/// </summary>
public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(5);
}

View File

@@ -0,0 +1,273 @@
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Corpus.Models;
/// <summary>
/// Metadata about a known library in the corpus.
/// </summary>
public sealed record LibraryMetadata(
Guid Id,
string Name,
string? Description,
string? HomepageUrl,
string? SourceRepo,
DateTimeOffset CreatedAt,
DateTimeOffset UpdatedAt);
/// <summary>
/// A specific version of a library in the corpus.
/// </summary>
public sealed record LibraryVersion(
Guid Id,
Guid LibraryId,
string Version,
DateOnly? ReleaseDate,
bool IsSecurityRelease,
string? SourceArchiveSha256,
DateTimeOffset IndexedAt);
/// <summary>
/// A specific build variant of a library version.
/// </summary>
public sealed record BuildVariant(
Guid Id,
Guid LibraryVersionId,
string Architecture,
string? Abi,
string? Compiler,
string? CompilerVersion,
string? OptimizationLevel,
string? BuildId,
string BinarySha256,
DateTimeOffset IndexedAt);
/// <summary>
/// A function in the corpus.
/// </summary>
public sealed record CorpusFunction(
Guid Id,
Guid BuildVariantId,
string Name,
string? DemangledName,
ulong Address,
int SizeBytes,
bool IsExported,
bool IsInline,
string? SourceFile,
int? SourceLine);
/// <summary>
/// A fingerprint for a function in the corpus.
/// </summary>
public sealed record CorpusFingerprint(
Guid Id,
Guid FunctionId,
FingerprintAlgorithm Algorithm,
byte[] Fingerprint,
string FingerprintHex,
FingerprintMetadata? Metadata,
DateTimeOffset CreatedAt);
/// <summary>
/// Algorithm used to generate a fingerprint.
/// </summary>
public enum FingerprintAlgorithm
{
/// <summary>
/// Semantic key-semantics graph fingerprint (from Phase 1).
/// </summary>
SemanticKsg,
/// <summary>
/// Instruction-level basic block hash.
/// </summary>
InstructionBb,
/// <summary>
/// Control flow graph Weisfeiler-Lehman hash.
/// </summary>
CfgWl,
/// <summary>
/// API call sequence hash.
/// </summary>
ApiCalls,
/// <summary>
/// Combined multi-algorithm fingerprint.
/// </summary>
Combined
}
/// <summary>
/// Algorithm-specific metadata for a fingerprint.
/// </summary>
public sealed record FingerprintMetadata(
int? NodeCount,
int? EdgeCount,
int? CyclomaticComplexity,
ImmutableArray<string>? ApiCalls,
string? OperationHashHex,
string? DataFlowHashHex);
/// <summary>
/// A cluster of similar functions across versions.
/// </summary>
public sealed record FunctionCluster(
Guid Id,
Guid LibraryId,
string CanonicalName,
string? Description,
DateTimeOffset CreatedAt);
/// <summary>
/// Membership in a function cluster.
/// </summary>
public sealed record ClusterMember(
Guid ClusterId,
Guid FunctionId,
decimal? SimilarityToCentroid);
/// <summary>
/// CVE association for a function.
/// </summary>
public sealed record FunctionCve(
Guid FunctionId,
string CveId,
CveAffectedState AffectedState,
string? PatchCommit,
decimal Confidence,
CveEvidenceType? EvidenceType);
/// <summary>
/// CVE affected state for a function.
/// </summary>
public enum CveAffectedState
{
Vulnerable,
Fixed,
NotAffected
}
/// <summary>
/// Type of evidence linking a function to a CVE.
/// </summary>
public enum CveEvidenceType
{
Changelog,
Commit,
Advisory,
PatchHeader,
Manual
}
/// <summary>
/// Ingestion job tracking.
/// </summary>
public sealed record IngestionJob(
Guid Id,
Guid LibraryId,
IngestionJobType JobType,
IngestionJobStatus Status,
DateTimeOffset? StartedAt,
DateTimeOffset? CompletedAt,
int? FunctionsIndexed,
ImmutableArray<string>? Errors,
DateTimeOffset CreatedAt);
/// <summary>
/// Type of ingestion job.
/// </summary>
public enum IngestionJobType
{
FullIngest,
Incremental,
CveUpdate
}
/// <summary>
/// Status of an ingestion job.
/// </summary>
public enum IngestionJobStatus
{
Pending,
Running,
Completed,
Failed,
Cancelled
}
/// <summary>
/// Result of a function identification query.
/// </summary>
public sealed record FunctionMatch(
string LibraryName,
string Version,
string FunctionName,
string? DemangledName,
decimal Similarity,
MatchConfidence Confidence,
string Architecture,
string? Abi,
MatchDetails Details);
/// <summary>
/// Confidence level of a match.
/// </summary>
public enum MatchConfidence
{
/// <summary>
/// Low confidence (similarity 50-70%).
/// </summary>
Low,
/// <summary>
/// Medium confidence (similarity 70-85%).
/// </summary>
Medium,
/// <summary>
/// High confidence (similarity 85-95%).
/// </summary>
High,
/// <summary>
/// Very high confidence (similarity 95%+).
/// </summary>
VeryHigh,
/// <summary>
    /// Exact match (100% similarity or identical fingerprint hash).
/// </summary>
Exact
}
/// <summary>
/// Details about a function match.
/// </summary>
public sealed record MatchDetails(
decimal SemanticSimilarity,
decimal InstructionSimilarity,
decimal CfgSimilarity,
decimal ApiCallSimilarity,
ImmutableArray<string> MatchedApiCalls,
int SizeDifferenceBytes);
/// <summary>
/// Evolution of a function across library versions.
/// </summary>
public sealed record FunctionEvolution(
string LibraryName,
string FunctionName,
ImmutableArray<FunctionVersionInfo> Versions);
/// <summary>
/// Information about a function in a specific version.
/// </summary>
public sealed record FunctionVersionInfo(
string Version,
DateOnly? ReleaseDate,
int SizeBytes,
string FingerprintHex,
decimal? SimilarityToPrevious,
ImmutableArray<string>? CveIds);

View File

@@ -0,0 +1,464 @@
using System.Collections.Immutable;
using System.Threading.Channels;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Services;
/// <summary>
/// Service for batch generation of function fingerprints.
/// Uses a producer-consumer pattern for efficient parallel processing.
/// </summary>
public sealed class BatchFingerprintPipeline : IBatchFingerprintPipeline
{
private readonly ICorpusRepository _repository;
private readonly IFingerprintGeneratorFactory _generatorFactory;
private readonly ILogger<BatchFingerprintPipeline> _logger;
public BatchFingerprintPipeline(
ICorpusRepository repository,
IFingerprintGeneratorFactory generatorFactory,
ILogger<BatchFingerprintPipeline> logger)
{
_repository = repository;
_generatorFactory = generatorFactory;
_logger = logger;
}
/// <inheritdoc />
public async Task<BatchFingerprintResult> GenerateFingerprintsAsync(
Guid buildVariantId,
BatchFingerprintOptions? options = null,
CancellationToken ct = default)
{
var opts = options ?? new BatchFingerprintOptions();
_logger.LogInformation(
"Starting batch fingerprint generation for variant {VariantId}",
buildVariantId);
// Get all functions for this variant
var functions = await _repository.GetFunctionsForVariantAsync(buildVariantId, ct);
if (functions.Length == 0)
{
_logger.LogWarning("No functions found for variant {VariantId}", buildVariantId);
return new BatchFingerprintResult(
buildVariantId,
0,
0,
TimeSpan.Zero,
[],
[]);
}
return await GenerateFingerprintsForFunctionsAsync(
functions,
buildVariantId,
opts,
ct);
}
/// <inheritdoc />
public async Task<BatchFingerprintResult> GenerateFingerprintsForLibraryAsync(
string libraryName,
BatchFingerprintOptions? options = null,
CancellationToken ct = default)
{
var opts = options ?? new BatchFingerprintOptions();
_logger.LogInformation(
"Starting batch fingerprint generation for library {Library}",
libraryName);
var library = await _repository.GetLibraryAsync(libraryName, ct);
if (library is null)
{
_logger.LogWarning("Library {Library} not found", libraryName);
return new BatchFingerprintResult(
Guid.Empty,
0,
0,
TimeSpan.Zero,
["Library not found"],
[]);
}
// Get all versions
var versions = await _repository.ListVersionsAsync(libraryName, ct);
var totalFunctions = 0;
var totalFingerprints = 0;
var totalDuration = TimeSpan.Zero;
var allErrors = new List<string>();
var allWarnings = new List<string>();
foreach (var version in versions)
{
ct.ThrowIfCancellationRequested();
// Get build variants for this version
var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);
foreach (var variant in variants)
{
ct.ThrowIfCancellationRequested();
var result = await GenerateFingerprintsAsync(variant.Id, opts, ct);
totalFunctions += result.FunctionsProcessed;
totalFingerprints += result.FingerprintsGenerated;
totalDuration += result.Duration;
allErrors.AddRange(result.Errors);
allWarnings.AddRange(result.Warnings);
}
}
return new BatchFingerprintResult(
library.Id,
totalFunctions,
totalFingerprints,
totalDuration,
[.. allErrors],
[.. allWarnings]);
}
/// <inheritdoc />
public async IAsyncEnumerable<FingerprintProgress> StreamProgressAsync(
Guid buildVariantId,
BatchFingerprintOptions? options = null,
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
var opts = options ?? new BatchFingerprintOptions();
var functions = await _repository.GetFunctionsForVariantAsync(buildVariantId, ct);
var total = functions.Length;
var processed = 0;
var errors = 0;
var channel = Channel.CreateBounded<FingerprintWorkItem>(new BoundedChannelOptions(opts.BatchSize * 2)
{
FullMode = BoundedChannelFullMode.Wait
});
// Producer task: read functions and queue them
var producerTask = Task.Run(async () =>
{
try
{
foreach (var function in functions)
{
ct.ThrowIfCancellationRequested();
await channel.Writer.WriteAsync(new FingerprintWorkItem(function), ct);
}
            }
            catch (Exception ex)
            {
                // Fault the channel so the consumer observes producer failures promptly
                // instead of draining a half-written channel first.
                channel.Writer.TryComplete(ex);
                throw;
            }
            finally
            {
                channel.Writer.TryComplete();
            }
}, ct);
// Consumer: process batches and yield progress
var batch = new List<FingerprintWorkItem>();
await foreach (var item in channel.Reader.ReadAllAsync(ct))
{
batch.Add(item);
if (batch.Count >= opts.BatchSize)
{
var batchResult = await ProcessBatchAsync(batch, opts, ct);
processed += batchResult.Processed;
errors += batchResult.Errors;
batch.Clear();
yield return new FingerprintProgress(
processed,
total,
errors,
(double)processed / total);
}
}
// Process remaining items
if (batch.Count > 0)
{
var batchResult = await ProcessBatchAsync(batch, opts, ct);
processed += batchResult.Processed;
errors += batchResult.Errors;
yield return new FingerprintProgress(
processed,
total,
errors,
1.0);
}
await producerTask;
}
#region Private Methods
private async Task<BatchFingerprintResult> GenerateFingerprintsForFunctionsAsync(
ImmutableArray<CorpusFunction> functions,
Guid contextId,
BatchFingerprintOptions options,
CancellationToken ct)
{
var startTime = DateTime.UtcNow;
var processed = 0;
var generated = 0;
var errors = new List<string>();
var warnings = new List<string>();
// Process in batches with parallelism
var batches = functions
.Select((f, i) => new { Function = f, Index = i })
.GroupBy(x => x.Index / options.BatchSize)
.Select(g => g.Select(x => x.Function).ToList())
.ToList();
foreach (var batch in batches)
{
ct.ThrowIfCancellationRequested();
            using var semaphore = new SemaphoreSlim(options.ParallelDegree);
var batchFingerprints = new List<CorpusFingerprint>();
var tasks = batch.Select(async function =>
{
await semaphore.WaitAsync(ct);
try
{
var fingerprints = await GenerateFingerprintsForFunctionAsync(function, options, ct);
lock (batchFingerprints)
{
batchFingerprints.AddRange(fingerprints);
}
Interlocked.Increment(ref processed);
}
catch (Exception ex)
{
lock (errors)
{
errors.Add($"Function {function.Name}: {ex.Message}");
}
}
finally
{
semaphore.Release();
}
});
await Task.WhenAll(tasks);
// Batch insert fingerprints
if (batchFingerprints.Count > 0)
{
var insertedCount = await _repository.InsertFingerprintsAsync(batchFingerprints, ct);
generated += insertedCount;
}
}
var duration = DateTime.UtcNow - startTime;
_logger.LogInformation(
"Batch fingerprint generation completed: {Functions} functions, {Fingerprints} fingerprints in {Duration:c}",
processed,
generated,
duration);
return new BatchFingerprintResult(
contextId,
processed,
generated,
duration,
[.. errors],
[.. warnings]);
}
private async Task<ImmutableArray<CorpusFingerprint>> GenerateFingerprintsForFunctionAsync(
CorpusFunction function,
BatchFingerprintOptions options,
CancellationToken ct)
{
var fingerprints = new List<CorpusFingerprint>();
foreach (var algorithm in options.Algorithms)
{
ct.ThrowIfCancellationRequested();
var generator = _generatorFactory.GetGenerator(algorithm);
if (generator is null)
{
continue;
}
var fingerprint = await generator.GenerateAsync(function, ct);
if (fingerprint is not null)
{
fingerprints.Add(new CorpusFingerprint(
Guid.NewGuid(),
function.Id,
algorithm,
fingerprint.Hash,
Convert.ToHexStringLower(fingerprint.Hash),
fingerprint.Metadata,
DateTimeOffset.UtcNow));
}
}
return [.. fingerprints];
}
private async Task<(int Processed, int Errors)> ProcessBatchAsync(
List<FingerprintWorkItem> batch,
BatchFingerprintOptions options,
CancellationToken ct)
{
var processed = 0;
var errors = 0;
var allFingerprints = new List<CorpusFingerprint>();
        using var semaphore = new SemaphoreSlim(options.ParallelDegree);
var tasks = batch.Select(async item =>
{
await semaphore.WaitAsync(ct);
try
{
var fingerprints = await GenerateFingerprintsForFunctionAsync(item.Function, options, ct);
lock (allFingerprints)
{
allFingerprints.AddRange(fingerprints);
}
Interlocked.Increment(ref processed);
}
catch
{
Interlocked.Increment(ref errors);
}
finally
{
semaphore.Release();
}
});
await Task.WhenAll(tasks);
if (allFingerprints.Count > 0)
{
await _repository.InsertFingerprintsAsync(allFingerprints, ct);
}
return (processed, errors);
}
#endregion
private sealed record FingerprintWorkItem(CorpusFunction Function);
}
/// <summary>
/// Interface for batch fingerprint generation.
/// </summary>
public interface IBatchFingerprintPipeline
{
/// <summary>
/// Generate fingerprints for all functions in a build variant.
/// </summary>
Task<BatchFingerprintResult> GenerateFingerprintsAsync(
Guid buildVariantId,
BatchFingerprintOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Generate fingerprints for all functions in a library.
/// </summary>
Task<BatchFingerprintResult> GenerateFingerprintsForLibraryAsync(
string libraryName,
BatchFingerprintOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Stream progress for fingerprint generation.
/// </summary>
IAsyncEnumerable<FingerprintProgress> StreamProgressAsync(
Guid buildVariantId,
BatchFingerprintOptions? options = null,
CancellationToken ct = default);
}
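// Hedged usage sketch: consumes StreamProgressAsync to report incremental progress.
// The variant id comes from the caller; only the interface above is assumed.
public static class BatchFingerprintExample
{
    public static async Task RunAsync(
        IBatchFingerprintPipeline pipeline,
        Guid variantId,
        CancellationToken ct = default)
    {
        await foreach (var progress in pipeline.StreamProgressAsync(variantId, ct: ct))
        {
            Console.WriteLine(
                $"{progress.Processed}/{progress.Total} ({progress.PercentComplete:P0}), " +
                $"{progress.Errors} errors");
        }
    }
}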
/// <summary>
/// Options for batch fingerprint generation.
/// </summary>
public sealed record BatchFingerprintOptions
{
/// <summary>
/// Number of functions to process per batch.
/// </summary>
public int BatchSize { get; init; } = 100;
/// <summary>
/// Degree of parallelism for processing.
/// </summary>
public int ParallelDegree { get; init; } = 4;
/// <summary>
/// Algorithms to generate fingerprints for.
/// </summary>
public ImmutableArray<FingerprintAlgorithm> Algorithms { get; init; } =
[FingerprintAlgorithm.SemanticKsg, FingerprintAlgorithm.InstructionBb, FingerprintAlgorithm.CfgWl];
}
/// <summary>
/// Result of batch fingerprint generation.
/// </summary>
public sealed record BatchFingerprintResult(
Guid ContextId,
int FunctionsProcessed,
int FingerprintsGenerated,
TimeSpan Duration,
ImmutableArray<string> Errors,
ImmutableArray<string> Warnings);
/// <summary>
/// Progress update for fingerprint generation.
/// </summary>
public sealed record FingerprintProgress(
int Processed,
int Total,
int Errors,
double PercentComplete);
/// <summary>
/// Factory for creating fingerprint generators.
/// </summary>
public interface IFingerprintGeneratorFactory
{
/// <summary>
/// Get a fingerprint generator for the specified algorithm.
/// </summary>
ICorpusFingerprintGenerator? GetGenerator(FingerprintAlgorithm algorithm);
}
/// <summary>
/// Interface for corpus fingerprint generation.
/// </summary>
public interface ICorpusFingerprintGenerator
{
/// <summary>
/// Generate a fingerprint for a corpus function.
/// </summary>
Task<GeneratedFingerprint?> GenerateAsync(
CorpusFunction function,
CancellationToken ct = default);
}
/// <summary>
/// A generated fingerprint.
/// </summary>
public sealed record GeneratedFingerprint(
byte[] Hash,
FingerprintMetadata? Metadata);
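// Hedged sketch: a minimal dictionary-backed IFingerprintGeneratorFactory. Generator
// registrations are supplied by the caller; nothing beyond the contracts in this file
// is assumed.
public sealed class DictionaryFingerprintGeneratorFactory : IFingerprintGeneratorFactory
{
    private readonly IReadOnlyDictionary<FingerprintAlgorithm, ICorpusFingerprintGenerator> _generators;

    public DictionaryFingerprintGeneratorFactory(
        IReadOnlyDictionary<FingerprintAlgorithm, ICorpusFingerprintGenerator> generators)
    {
        _generators = generators;
    }

    /// <inheritdoc />
    public ICorpusFingerprintGenerator? GetGenerator(FingerprintAlgorithm algorithm)
        => _generators.TryGetValue(algorithm, out var generator) ? generator : null;
}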

View File

@@ -0,0 +1,466 @@
using System.Collections.Immutable;
using System.Diagnostics;
using System.Security.Cryptography;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Services;
/// <summary>
/// Service for ingesting library binaries into the function corpus.
/// </summary>
public sealed class CorpusIngestionService : ICorpusIngestionService
{
private readonly ICorpusRepository _repository;
private readonly IFingerprintGenerator? _fingerprintGenerator;
private readonly IFunctionExtractor? _functionExtractor;
private readonly ILogger<CorpusIngestionService> _logger;
public CorpusIngestionService(
ICorpusRepository repository,
ILogger<CorpusIngestionService> logger,
IFingerprintGenerator? fingerprintGenerator = null,
IFunctionExtractor? functionExtractor = null)
{
_repository = repository;
_logger = logger;
_fingerprintGenerator = fingerprintGenerator;
_functionExtractor = functionExtractor;
}
/// <inheritdoc />
public async Task<IngestionResult> IngestLibraryAsync(
LibraryIngestionMetadata metadata,
Stream binaryStream,
IngestionOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(metadata);
ArgumentNullException.ThrowIfNull(binaryStream);
var opts = options ?? new IngestionOptions();
var stopwatch = Stopwatch.StartNew();
var warnings = new List<string>();
var errors = new List<string>();
_logger.LogInformation(
"Starting ingestion for {Library} {Version} ({Architecture})",
metadata.Name,
metadata.Version,
metadata.Architecture);
// Compute binary hash
var binarySha256 = await ComputeSha256Async(binaryStream, ct);
        binaryStream.Position = 0; // Rewind after hashing; the stream must be seekable
// Check if we've already indexed this exact binary
var existingVariant = await _repository.GetBuildVariantBySha256Async(binarySha256, ct);
if (existingVariant is not null)
{
_logger.LogInformation(
"Binary {Sha256} already indexed as variant {VariantId}",
binarySha256[..16],
existingVariant.Id);
stopwatch.Stop();
return new IngestionResult(
Guid.Empty,
metadata.Name,
metadata.Version,
metadata.Architecture,
0,
0,
0,
stopwatch.Elapsed,
["Binary already indexed."],
[]);
}
// Create or get library record
var library = await _repository.GetOrCreateLibraryAsync(
metadata.Name,
null,
null,
null,
ct);
// Create ingestion job
var job = await _repository.CreateIngestionJobAsync(
library.Id,
IngestionJobType.FullIngest,
ct);
try
{
await _repository.UpdateIngestionJobAsync(
job.Id,
IngestionJobStatus.Running,
ct: ct);
// Create or get version record
var version = await _repository.GetOrCreateVersionAsync(
library.Id,
metadata.Version,
metadata.ReleaseDate,
metadata.IsSecurityRelease,
metadata.SourceArchiveSha256,
ct);
// Create build variant record
var variant = await _repository.GetOrCreateBuildVariantAsync(
version.Id,
metadata.Architecture,
binarySha256,
metadata.Abi,
metadata.Compiler,
metadata.CompilerVersion,
metadata.OptimizationLevel,
null,
ct);
// Extract functions from binary
var functions = await ExtractFunctionsAsync(binaryStream, variant.Id, opts, warnings, ct);
// Filter functions based on options
functions = ApplyFunctionFilters(functions, opts);
// Insert functions into database
var insertedCount = await _repository.InsertFunctionsAsync(functions, ct);
_logger.LogInformation(
"Extracted and inserted {Count} functions from {Library} {Version}",
insertedCount,
metadata.Name,
metadata.Version);
// Generate fingerprints for each function
var fingerprintsGenerated = 0;
if (_fingerprintGenerator is not null)
{
fingerprintsGenerated = await GenerateFingerprintsAsync(functions, opts, ct);
}
// Generate clusters if enabled
var clustersCreated = 0;
if (opts.GenerateClusters)
{
clustersCreated = await GenerateClustersAsync(library.Id, functions, ct);
}
// Update job with success
await _repository.UpdateIngestionJobAsync(
job.Id,
IngestionJobStatus.Completed,
functionsIndexed: insertedCount,
fingerprintsGenerated: fingerprintsGenerated,
clustersCreated: clustersCreated,
ct: ct);
stopwatch.Stop();
return new IngestionResult(
job.Id,
metadata.Name,
metadata.Version,
metadata.Architecture,
insertedCount,
fingerprintsGenerated,
clustersCreated,
stopwatch.Elapsed,
[],
[.. warnings]);
}
catch (Exception ex)
{
_logger.LogError(ex,
"Ingestion failed for {Library} {Version}",
metadata.Name,
metadata.Version);
await _repository.UpdateIngestionJobAsync(
job.Id,
IngestionJobStatus.Failed,
errors: [ex.Message],
ct: ct);
stopwatch.Stop();
return new IngestionResult(
job.Id,
metadata.Name,
metadata.Version,
metadata.Architecture,
0,
0,
0,
stopwatch.Elapsed,
[ex.Message],
[.. warnings]);
}
}
/// <inheritdoc />
public async IAsyncEnumerable<IngestionResult> IngestFromConnectorAsync(
string libraryName,
ILibraryCorpusConnector connector,
IngestionOptions? options = null,
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(libraryName);
ArgumentNullException.ThrowIfNull(connector);
var opts = options ?? new IngestionOptions();
_logger.LogInformation(
"Starting bulk ingestion from {Connector} for library {Library}",
connector.LibraryName,
libraryName);
// Get available versions
var versions = await connector.GetAvailableVersionsAsync(ct);
_logger.LogInformation(
"Found {Count} versions for {Library}",
versions.Length,
libraryName);
var fetchOptions = new LibraryFetchOptions
{
IncludeDebugSymbols = true
};
// Process each architecture
foreach (var arch in connector.SupportedArchitectures)
{
await foreach (var binary in connector.FetchBinariesAsync(
[.. versions],
arch,
fetchOptions,
ct))
{
ct.ThrowIfCancellationRequested();
using (binary)
{
var metadata = new LibraryIngestionMetadata(
libraryName,
binary.Version,
binary.Architecture,
binary.Abi,
binary.Compiler,
binary.CompilerVersion,
binary.OptimizationLevel,
binary.ReleaseDate,
false,
null);
var result = await IngestLibraryAsync(metadata, binary.BinaryStream, opts, ct);
yield return result;
}
}
}
}
/// <inheritdoc />
public async Task<int> UpdateCveAssociationsAsync(
string cveId,
IReadOnlyList<FunctionCveAssociation> associations,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(cveId);
ArgumentNullException.ThrowIfNull(associations);
if (associations.Count == 0)
{
return 0;
}
_logger.LogInformation(
"Updating CVE associations for {CveId} ({Count} functions)",
cveId,
associations.Count);
// Convert to FunctionCve records
var cveRecords = associations.Select(a => new FunctionCve(
a.FunctionId,
cveId,
a.AffectedState,
a.PatchCommit,
a.Confidence,
a.EvidenceType)).ToList();
return await _repository.UpsertCveAssociationsAsync(cveId, cveRecords, ct);
}
/// <inheritdoc />
public async Task<IngestionJob?> GetJobStatusAsync(Guid jobId, CancellationToken ct = default)
{
return await _repository.GetIngestionJobAsync(jobId, ct);
}
#region Private Methods
private async Task<ImmutableArray<CorpusFunction>> ExtractFunctionsAsync(
Stream binaryStream,
Guid buildVariantId,
IngestionOptions options,
List<string> warnings,
CancellationToken ct)
{
if (_functionExtractor is null)
{
warnings.Add("No function extractor configured, returning empty function list");
_logger.LogWarning("No function extractor configured");
return [];
}
var extractedFunctions = await _functionExtractor.ExtractFunctionsAsync(binaryStream, ct);
// Convert to corpus functions with IDs
var functions = extractedFunctions.Select(f => new CorpusFunction(
Guid.NewGuid(),
buildVariantId,
f.Name,
f.DemangledName,
f.Address,
f.SizeBytes,
f.IsExported,
f.IsInline,
f.SourceFile,
f.SourceLine)).ToImmutableArray();
return functions;
}
private static ImmutableArray<CorpusFunction> ApplyFunctionFilters(
ImmutableArray<CorpusFunction> functions,
IngestionOptions options)
{
var filtered = functions
.Where(f => f.SizeBytes >= options.MinFunctionSize)
.Where(f => !options.ExportedOnly || f.IsExported)
.Take(options.MaxFunctionsPerBinary);
return [.. filtered];
}
private async Task<int> GenerateFingerprintsAsync(
ImmutableArray<CorpusFunction> functions,
IngestionOptions options,
CancellationToken ct)
{
if (_fingerprintGenerator is null)
{
return 0;
}
var allFingerprints = new List<CorpusFingerprint>();
// Process in parallel with degree limit
        using var semaphore = new SemaphoreSlim(options.ParallelDegree);
var tasks = functions.Select(async function =>
{
await semaphore.WaitAsync(ct);
try
{
var fingerprints = await _fingerprintGenerator.GenerateFingerprintsAsync(function.Id, ct);
lock (allFingerprints)
{
allFingerprints.AddRange(fingerprints);
}
}
finally
{
semaphore.Release();
}
});
await Task.WhenAll(tasks);
if (allFingerprints.Count > 0)
{
return await _repository.InsertFingerprintsAsync(allFingerprints, ct);
}
return 0;
}
private async Task<int> GenerateClustersAsync(
Guid libraryId,
ImmutableArray<CorpusFunction> functions,
CancellationToken ct)
{
// Simple clustering: group functions by demangled name (if available) or name
var clusters = functions
.GroupBy(f => f.DemangledName ?? f.Name)
.Where(g => g.Count() > 1) // Only create clusters for functions appearing multiple times
.ToList();
var clustersCreated = 0;
foreach (var group in clusters)
{
ct.ThrowIfCancellationRequested();
var cluster = await _repository.GetOrCreateClusterAsync(
libraryId,
group.Key,
null,
ct);
            var members = group.Select(f => new ClusterMember(cluster.Id, f.Id, 1.0m)).ToList(); // 1.0 is a placeholder: members are grouped by identical name
await _repository.AddClusterMembersAsync(cluster.Id, members, ct);
clustersCreated++;
}
return clustersCreated;
}
private static async Task<string> ComputeSha256Async(Stream stream, CancellationToken ct)
{
using var sha256 = SHA256.Create();
var hash = await sha256.ComputeHashAsync(stream, ct);
return Convert.ToHexStringLower(hash);
}
#endregion
}
/// <summary>
/// Interface for extracting functions from binary files.
/// </summary>
public interface IFunctionExtractor
{
/// <summary>
/// Extract functions from a binary stream.
/// </summary>
Task<ImmutableArray<ExtractedFunction>> ExtractFunctionsAsync(
Stream binaryStream,
CancellationToken ct = default);
}
/// <summary>
/// Interface for generating function fingerprints.
/// </summary>
public interface IFingerprintGenerator
{
/// <summary>
/// Generate fingerprints for a function.
/// </summary>
Task<ImmutableArray<CorpusFingerprint>> GenerateFingerprintsAsync(
Guid functionId,
CancellationToken ct = default);
}
/// <summary>
/// A function extracted from a binary.
/// </summary>
public sealed record ExtractedFunction(
string Name,
string? DemangledName,
ulong Address,
int SizeBytes,
bool IsExported,
bool IsInline,
string? SourceFile,
int? SourceLine);
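// Hedged sketch: a canned IFunctionExtractor useful as a test stand-in. A real
// extractor would parse ELF/PE symbol tables; this one just returns a fixed list.
public sealed class StaticFunctionExtractor : IFunctionExtractor
{
    private readonly ImmutableArray<ExtractedFunction> _functions;

    public StaticFunctionExtractor(ImmutableArray<ExtractedFunction> functions)
    {
        _functions = functions;
    }

    public Task<ImmutableArray<ExtractedFunction>> ExtractFunctionsAsync(
        Stream binaryStream,
        CancellationToken ct = default)
        => Task.FromResult(_functions);
}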

View File

@@ -0,0 +1,419 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Services;
/// <summary>
/// Service for querying the function corpus to identify functions.
/// </summary>
public sealed class CorpusQueryService : ICorpusQueryService
{
private readonly ICorpusRepository _repository;
private readonly IClusterSimilarityComputer _similarityComputer;
private readonly ILogger<CorpusQueryService> _logger;
public CorpusQueryService(
ICorpusRepository repository,
IClusterSimilarityComputer similarityComputer,
ILogger<CorpusQueryService> logger)
{
_repository = repository;
_similarityComputer = similarityComputer;
_logger = logger;
}
/// <inheritdoc />
public async Task<ImmutableArray<FunctionMatch>> IdentifyFunctionAsync(
FunctionFingerprints fingerprints,
IdentifyOptions? options = null,
CancellationToken ct = default)
{
var opts = options ?? new IdentifyOptions();
_logger.LogDebug("Identifying function with fingerprints");
var candidates = new List<FunctionCandidate>();
// Search by each available fingerprint type
if (fingerprints.SemanticHash is { Length: > 0 })
{
var matches = await SearchByFingerprintAsync(
FingerprintAlgorithm.SemanticKsg,
fingerprints.SemanticHash,
opts,
ct);
candidates.AddRange(matches);
}
if (fingerprints.InstructionHash is { Length: > 0 })
{
var matches = await SearchByFingerprintAsync(
FingerprintAlgorithm.InstructionBb,
fingerprints.InstructionHash,
opts,
ct);
candidates.AddRange(matches);
}
if (fingerprints.CfgHash is { Length: > 0 })
{
var matches = await SearchByFingerprintAsync(
FingerprintAlgorithm.CfgWl,
fingerprints.CfgHash,
opts,
ct);
candidates.AddRange(matches);
}
// Group candidates by function and compute combined similarity
var groupedCandidates = candidates
.GroupBy(c => c.FunctionId)
.Select(g => ComputeCombinedScore(g, fingerprints, opts.Weights))
.Where(c => c.Similarity >= opts.MinSimilarity)
.OrderByDescending(c => c.Similarity)
.Take(opts.MaxResults)
.ToList();
// Enrich with full function details
var results = new List<FunctionMatch>();
foreach (var candidate in groupedCandidates)
{
ct.ThrowIfCancellationRequested();
// Get the original candidates for this function
var functionCandidates = candidates.Where(c => c.FunctionId == candidate.FunctionId).ToList();
var function = await _repository.GetFunctionAsync(candidate.FunctionId, ct);
if (function is null) continue;
var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
if (variant is null) continue;
// Apply filters
if (opts.ArchitectureFilter is { Length: > 0 })
{
if (!opts.ArchitectureFilter.Value.Contains(variant.Architecture, StringComparer.OrdinalIgnoreCase))
continue;
}
var version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
if (version is null) continue;
var library = await _repository.GetLibraryByIdAsync(version.LibraryId, ct);
if (library is null) continue;
// Apply library filter
if (opts.LibraryFilter is { Length: > 0 })
{
if (!opts.LibraryFilter.Value.Contains(library.Name, StringComparer.OrdinalIgnoreCase))
continue;
}
results.Add(new FunctionMatch(
library.Name,
version.Version,
function.Name,
function.DemangledName,
candidate.Similarity,
ComputeConfidence(candidate),
variant.Architecture,
variant.Abi,
new MatchDetails(
GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.SemanticKsg),
GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.InstructionBb),
GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.CfgWl),
GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.ApiCalls),
[],
fingerprints.SizeBytes.HasValue
? function.SizeBytes - fingerprints.SizeBytes.Value
: 0)));
}
return [.. results];
}
/// <inheritdoc />
public async Task<ImmutableDictionary<int, ImmutableArray<FunctionMatch>>> IdentifyBatchAsync(
IReadOnlyList<FunctionFingerprints> fingerprints,
IdentifyOptions? options = null,
CancellationToken ct = default)
{
var results = ImmutableDictionary.CreateBuilder<int, ImmutableArray<FunctionMatch>>();
// Process in parallel with controlled concurrency
        using var semaphore = new SemaphoreSlim(4);
var tasks = fingerprints.Select(async (fp, index) =>
{
await semaphore.WaitAsync(ct);
try
{
var matches = await IdentifyFunctionAsync(fp, options, ct);
return (Index: index, Matches: matches);
}
finally
{
semaphore.Release();
}
});
var completedResults = await Task.WhenAll(tasks);
foreach (var result in completedResults)
{
results.Add(result.Index, result.Matches);
}
return results.ToImmutable();
}
/// <inheritdoc />
public async Task<ImmutableArray<CorpusFunctionWithCve>> GetFunctionsForCveAsync(
string cveId,
CancellationToken ct = default)
{
_logger.LogDebug("Getting functions for CVE {CveId}", cveId);
var functionIds = await _repository.GetFunctionIdsForCveAsync(cveId, ct);
var results = new List<CorpusFunctionWithCve>();
foreach (var functionId in functionIds)
{
ct.ThrowIfCancellationRequested();
var function = await _repository.GetFunctionAsync(functionId, ct);
if (function is null) continue;
var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
if (variant is null) continue;
var version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
if (version is null) continue;
var library = await _repository.GetLibraryByIdAsync(version.LibraryId, ct);
if (library is null) continue;
var cves = await _repository.GetCvesForFunctionAsync(functionId, ct);
var cveInfo = cves.FirstOrDefault(c => c.CveId == cveId);
if (cveInfo is null) continue;
results.Add(new CorpusFunctionWithCve(function, library, version, variant, cveInfo));
}
return [.. results];
}
/// <inheritdoc />
public async Task<FunctionEvolution?> GetFunctionEvolutionAsync(
string libraryName,
string functionName,
CancellationToken ct = default)
{
_logger.LogDebug("Getting evolution for function {Function} in {Library}", functionName, libraryName);
var library = await _repository.GetLibraryAsync(libraryName, ct);
if (library is null)
{
return null;
}
var versions = await _repository.ListVersionsAsync(libraryName, ct);
var snapshots = new List<FunctionVersionInfo>();
string? previousFingerprintHex = null;
foreach (var versionSummary in versions.OrderBy(v => v.ReleaseDate))
{
ct.ThrowIfCancellationRequested();
var version = await _repository.GetVersionAsync(library.Id, versionSummary.Version, ct);
if (version is null) continue;
var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);
// Find the function in any variant
CorpusFunction? targetFunction = null;
CorpusFingerprint? fingerprint = null;
foreach (var variant in variants)
{
var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);
targetFunction = functions.FirstOrDefault(f =>
string.Equals(f.Name, functionName, StringComparison.Ordinal) ||
string.Equals(f.DemangledName, functionName, StringComparison.Ordinal));
if (targetFunction is not null)
{
var fps = await _repository.GetFingerprintsAsync(targetFunction.Id, ct);
fingerprint = fps.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);
break;
}
}
if (targetFunction is null)
{
continue;
}
// Get CVE info for this version
var cves = await _repository.GetCvesForFunctionAsync(targetFunction.Id, ct);
var cveIds = cves.Select(c => c.CveId).ToImmutableArray();
// Compute similarity to previous version if available
decimal? similarityToPrevious = null;
var currentFingerprintHex = fingerprint?.FingerprintHex ?? string.Empty;
if (previousFingerprintHex is not null && currentFingerprintHex.Length > 0)
{
// Simple comparison: same hash = 1.0, different = 0.5 (would need proper similarity for better results)
similarityToPrevious = string.Equals(previousFingerprintHex, currentFingerprintHex, StringComparison.Ordinal)
? 1.0m
: 0.5m;
}
previousFingerprintHex = currentFingerprintHex;
snapshots.Add(new FunctionVersionInfo(
versionSummary.Version,
versionSummary.ReleaseDate,
targetFunction.SizeBytes,
currentFingerprintHex,
similarityToPrevious,
cveIds.Length > 0 ? cveIds : null));
}
if (snapshots.Count == 0)
{
return null;
}
return new FunctionEvolution(libraryName, functionName, [.. snapshots]);
}
/// <inheritdoc />
public async Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default)
{
return await _repository.GetStatisticsAsync(ct);
}
/// <inheritdoc />
public async Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default)
{
return await _repository.ListLibrariesAsync(ct);
}
/// <inheritdoc />
public async Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
string libraryName,
CancellationToken ct = default)
{
return await _repository.ListVersionsAsync(libraryName, ct);
}
#region Private Methods
private async Task<List<FunctionCandidate>> SearchByFingerprintAsync(
FingerprintAlgorithm algorithm,
byte[] fingerprint,
IdentifyOptions options,
CancellationToken ct)
{
var candidates = new List<FunctionCandidate>();
// First try exact match
var exactMatches = await _repository.FindFunctionsByFingerprintAsync(algorithm, fingerprint, ct);
foreach (var functionId in exactMatches)
{
candidates.Add(new FunctionCandidate(functionId, algorithm, 1.0m, fingerprint));
}
// Then try approximate matching
var similarResults = await _repository.FindSimilarFingerprintsAsync(
algorithm,
fingerprint,
options.MaxResults * 2, // Get more to account for filtering
ct);
foreach (var result in similarResults)
{
if (!candidates.Any(c => c.FunctionId == result.FunctionId))
{
candidates.Add(new FunctionCandidate(
result.FunctionId,
algorithm,
result.Similarity,
result.Fingerprint));
}
}
return candidates;
}
private static CombinedCandidate ComputeCombinedScore(
IGrouping<Guid, FunctionCandidate> group,
FunctionFingerprints query,
SimilarityWeights weights)
{
var candidates = group.ToList();
decimal totalScore = 0;
decimal totalWeight = 0;
var algorithms = new List<FingerprintAlgorithm>();
foreach (var candidate in candidates)
{
var weight = candidate.Algorithm switch
{
FingerprintAlgorithm.SemanticKsg => weights.SemanticWeight,
FingerprintAlgorithm.InstructionBb => weights.InstructionWeight,
FingerprintAlgorithm.CfgWl => weights.CfgWeight,
FingerprintAlgorithm.ApiCalls => weights.ApiCallWeight,
_ => 0.1m
};
totalScore += candidate.Similarity * weight;
totalWeight += weight;
algorithms.Add(candidate.Algorithm);
}
var combinedSimilarity = totalWeight > 0 ? totalScore / totalWeight : 0;
return new CombinedCandidate(group.Key, combinedSimilarity, [.. algorithms]);
}
private static MatchConfidence ComputeConfidence(CombinedCandidate candidate)
{
// Higher confidence with more matching algorithms and higher similarity
var algorithmCount = candidate.MatchingAlgorithms.Length;
var similarity = candidate.Similarity;
if (algorithmCount >= 3 && similarity >= 0.95m)
return MatchConfidence.Exact;
if (algorithmCount >= 3 && similarity >= 0.85m)
return MatchConfidence.VeryHigh;
if (algorithmCount >= 2 && similarity >= 0.85m)
return MatchConfidence.High;
if (algorithmCount >= 1 && similarity >= 0.70m)
return MatchConfidence.Medium;
return MatchConfidence.Low;
}
private static decimal GetAlgorithmSimilarity(
List<FunctionCandidate> candidates,
FingerprintAlgorithm algorithm)
{
var match = candidates.FirstOrDefault(c => c.Algorithm == algorithm);
return match?.Similarity ?? 0m;
}
#endregion
private sealed record FunctionCandidate(
Guid FunctionId,
FingerprintAlgorithm Algorithm,
decimal Similarity,
byte[] Fingerprint);
private sealed record CombinedCandidate(
Guid FunctionId,
decimal Similarity,
ImmutableArray<FingerprintAlgorithm> MatchingAlgorithms);
}

View File

@@ -0,0 +1,423 @@
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Services;
/// <summary>
/// Service for updating CVE-to-function mappings in the corpus.
/// </summary>
public sealed class CveFunctionMappingUpdater : ICveFunctionMappingUpdater
{
private readonly ICorpusRepository _repository;
private readonly ICveDataProvider _cveDataProvider;
private readonly ILogger<CveFunctionMappingUpdater> _logger;
public CveFunctionMappingUpdater(
ICorpusRepository repository,
ICveDataProvider cveDataProvider,
ILogger<CveFunctionMappingUpdater> logger)
{
_repository = repository;
_cveDataProvider = cveDataProvider;
_logger = logger;
}
/// <inheritdoc />
public async Task<CveMappingUpdateResult> UpdateMappingsForCveAsync(
string cveId,
CancellationToken ct = default)
{
_logger.LogInformation("Updating function mappings for CVE {CveId}", cveId);
var startTime = DateTime.UtcNow;
var errors = new List<string>();
var functionsUpdated = 0;
try
{
// Get CVE details from provider
var cveDetails = await _cveDataProvider.GetCveDetailsAsync(cveId, ct);
if (cveDetails is null)
{
return new CveMappingUpdateResult(
cveId,
0,
DateTime.UtcNow - startTime,
[$"CVE {cveId} not found in data provider"]);
}
// Get affected library
var library = await _repository.GetLibraryAsync(cveDetails.AffectedLibrary, ct);
if (library is null)
{
return new CveMappingUpdateResult(
cveId,
0,
DateTime.UtcNow - startTime,
[$"Library {cveDetails.AffectedLibrary} not found in corpus"]);
}
// Process affected versions
var associations = new List<FunctionCve>();
foreach (var affectedVersion in cveDetails.AffectedVersions)
{
ct.ThrowIfCancellationRequested();
// Find matching version in corpus
var version = await FindMatchingVersionAsync(library.Id, affectedVersion, ct);
if (version is null)
{
_logger.LogDebug("Version {Version} not found in corpus", affectedVersion);
continue;
}
// Get all build variants for this version
var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);
foreach (var variant in variants)
{
// Get functions in this variant
var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);
// If we have specific function names, only map those
if (cveDetails.AffectedFunctions.Length > 0)
{
var matchedFunctions = functions.Where(f =>
cveDetails.AffectedFunctions.Contains(f.Name, StringComparer.Ordinal) ||
(f.DemangledName is not null &&
cveDetails.AffectedFunctions.Contains(f.DemangledName, StringComparer.Ordinal)));
foreach (var function in matchedFunctions)
{
associations.Add(CreateAssociation(function.Id, cveId, cveDetails, affectedVersion));
functionsUpdated++;
}
}
else
{
// Map all functions in affected variant as potentially affected
foreach (var function in functions.Take(100)) // Limit to avoid huge updates
{
associations.Add(CreateAssociation(function.Id, cveId, cveDetails, affectedVersion));
functionsUpdated++;
}
}
}
}
// Upsert all associations
if (associations.Count > 0)
{
await _repository.UpsertCveAssociationsAsync(cveId, associations, ct);
}
var duration = DateTime.UtcNow - startTime;
_logger.LogInformation(
"Updated {Count} function mappings for CVE {CveId} in {Duration:c}",
functionsUpdated, cveId, duration);
return new CveMappingUpdateResult(cveId, functionsUpdated, duration, [.. errors]);
}
catch (Exception ex)
{
errors.Add(ex.Message);
_logger.LogError(ex, "Error updating mappings for CVE {CveId}", cveId);
return new CveMappingUpdateResult(cveId, functionsUpdated, DateTime.UtcNow - startTime, [.. errors]);
}
}
/// <inheritdoc />
public async Task<CveBatchMappingResult> UpdateMappingsForLibraryAsync(
string libraryName,
CancellationToken ct = default)
{
_logger.LogInformation("Updating all CVE mappings for library {Library}", libraryName);
var startTime = DateTime.UtcNow;
var results = new List<CveMappingUpdateResult>();
// Get all CVEs for this library
var cves = await _cveDataProvider.GetCvesForLibraryAsync(libraryName, ct);
foreach (var cveId in cves)
{
ct.ThrowIfCancellationRequested();
var result = await UpdateMappingsForCveAsync(cveId, ct);
results.Add(result);
}
var totalDuration = DateTime.UtcNow - startTime;
return new CveBatchMappingResult(
libraryName,
results.Count,
results.Sum(r => r.FunctionsUpdated),
totalDuration,
[.. results.Where(r => r.Errors.Length > 0).SelectMany(r => r.Errors)]);
}
/// <inheritdoc />
public async Task<CveMappingUpdateResult> MarkFunctionFixedAsync(
string cveId,
string libraryName,
string version,
string? functionName,
string? patchCommit,
CancellationToken ct = default)
{
_logger.LogInformation(
"Marking functions as fixed for CVE {CveId} in {Library} {Version}",
cveId, libraryName, version);
var startTime = DateTime.UtcNow;
var functionsUpdated = 0;
var library = await _repository.GetLibraryAsync(libraryName, ct);
if (library is null)
{
return new CveMappingUpdateResult(
cveId, 0, DateTime.UtcNow - startTime,
[$"Library {libraryName} not found"]);
}
var libVersion = await _repository.GetVersionAsync(library.Id, version, ct);
if (libVersion is null)
{
return new CveMappingUpdateResult(
cveId, 0, DateTime.UtcNow - startTime,
[$"Version {version} not found"]);
}
var variants = await _repository.GetBuildVariantsAsync(libVersion.Id, ct);
var associations = new List<FunctionCve>();
foreach (var variant in variants)
{
var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);
IEnumerable<CorpusFunction> targetFunctions = functionName is null
? functions
: functions.Where(f =>
string.Equals(f.Name, functionName, StringComparison.Ordinal) ||
string.Equals(f.DemangledName, functionName, StringComparison.Ordinal));
foreach (var function in targetFunctions)
{
associations.Add(new FunctionCve(
function.Id,
cveId,
CveAffectedState.Fixed,
patchCommit,
0.9m, // High confidence for explicit marking
CveEvidenceType.Commit));
functionsUpdated++;
}
}
if (associations.Count > 0)
{
await _repository.UpsertCveAssociationsAsync(cveId, associations, ct);
}
return new CveMappingUpdateResult(
cveId, functionsUpdated, DateTime.UtcNow - startTime, []);
}
/// <inheritdoc />
public async Task<ImmutableArray<string>> GetUnmappedCvesAsync(
string libraryName,
CancellationToken ct = default)
{
// Get all known CVEs for this library
var allCves = await _cveDataProvider.GetCvesForLibraryAsync(libraryName, ct);
// Keep only the CVEs that have no function mappings yet
var unmapped = new List<string>();
foreach (var cveId in allCves)
{
ct.ThrowIfCancellationRequested();
var functionIds = await _repository.GetFunctionIdsForCveAsync(cveId, ct);
if (functionIds.Length == 0)
{
unmapped.Add(cveId);
}
}
return [.. unmapped];
}
#region Private Methods
private async Task<LibraryVersion?> FindMatchingVersionAsync(
Guid libraryId,
string versionString,
CancellationToken ct)
{
// Try exact match first
var exactMatch = await _repository.GetVersionAsync(libraryId, versionString, ct);
if (exactMatch is not null)
{
return exactMatch;
}
// Try with common prefixes/suffixes removed
var normalizedVersion = NormalizeVersion(versionString);
if (normalizedVersion != versionString)
{
return await _repository.GetVersionAsync(libraryId, normalizedVersion, ct);
}
return null;
}
private static string NormalizeVersion(string version)
{
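// E.g. "v1.2.3-rc1" -> "1.2.3" and "1.1.1w+deb12" -> "1.1.1w".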
// Remove common prefixes
if (version.StartsWith("v", StringComparison.OrdinalIgnoreCase))
{
version = version[1..];
}
// Remove release suffixes
var suffixIndex = version.IndexOfAny(['-', '+', '_']);
if (suffixIndex > 0)
{
version = version[..suffixIndex];
}
return version;
}
private static FunctionCve CreateAssociation(
Guid functionId,
string cveId,
CveDetails cveDetails,
string version)
{
var isFixed = cveDetails.FixedVersions.Contains(version, StringComparer.OrdinalIgnoreCase);
return new FunctionCve(
functionId,
cveId,
isFixed ? CveAffectedState.Fixed : CveAffectedState.Vulnerable,
cveDetails.PatchCommit,
ComputeConfidence(cveDetails),
cveDetails.EvidenceType);
}
private static decimal ComputeConfidence(CveDetails details)
{
// Higher confidence for specific function names and commit evidence
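// E.g. named functions (+0.2), a patch commit (+0.2) and commit evidence
// (+0.1) on top of the 0.5 base yield the maximum confidence of 1.0.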
var baseConfidence = 0.5m;
if (details.AffectedFunctions.Length > 0)
{
baseConfidence += 0.2m;
}
if (!string.IsNullOrEmpty(details.PatchCommit))
{
baseConfidence += 0.2m;
}
return details.EvidenceType switch
{
CveEvidenceType.Commit => baseConfidence + 0.1m,
CveEvidenceType.Advisory => baseConfidence + 0.05m,
CveEvidenceType.Changelog => baseConfidence + 0.05m,
_ => baseConfidence
};
}
#endregion
}
/// <summary>
/// Interface for CVE-to-function mapping updates.
/// </summary>
public interface ICveFunctionMappingUpdater
{
/// <summary>
/// Update function mappings for a specific CVE.
/// </summary>
Task<CveMappingUpdateResult> UpdateMappingsForCveAsync(
string cveId,
CancellationToken ct = default);
/// <summary>
/// Update all CVE mappings for a library.
/// </summary>
Task<CveBatchMappingResult> UpdateMappingsForLibraryAsync(
string libraryName,
CancellationToken ct = default);
/// <summary>
/// Mark functions as fixed for a CVE.
/// </summary>
Task<CveMappingUpdateResult> MarkFunctionFixedAsync(
string cveId,
string libraryName,
string version,
string? functionName,
string? patchCommit,
CancellationToken ct = default);
/// <summary>
/// Get CVEs that have no function mappings.
/// </summary>
Task<ImmutableArray<string>> GetUnmappedCvesAsync(
string libraryName,
CancellationToken ct = default);
}
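// Usage sketch (wiring is illustrative, not prescriptive):
//   var updater = serviceProvider.GetRequiredService<ICveFunctionMappingUpdater>();
//   var result = await updater.UpdateMappingsForCveAsync("CVE-2014-0160", ct);
//   logger.LogInformation("{Cve}: {Count} functions mapped", result.CveId, result.FunctionsUpdated);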
/// <summary>
/// Provider for CVE data.
/// </summary>
public interface ICveDataProvider
{
/// <summary>
/// Get details for a CVE.
/// </summary>
Task<CveDetails?> GetCveDetailsAsync(string cveId, CancellationToken ct = default);
/// <summary>
/// Get all CVEs affecting a library.
/// </summary>
Task<ImmutableArray<string>> GetCvesForLibraryAsync(string libraryName, CancellationToken ct = default);
}
/// <summary>
/// CVE details from a data provider.
/// </summary>
public sealed record CveDetails(
string CveId,
string AffectedLibrary,
ImmutableArray<string> AffectedVersions,
ImmutableArray<string> FixedVersions,
ImmutableArray<string> AffectedFunctions,
string? PatchCommit,
CveEvidenceType EvidenceType);
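// Example payload (illustrative values, modeled on CVE-2014-0160/Heartbleed):
//   new CveDetails("CVE-2014-0160", "openssl", ["1.0.1f"], ["1.0.1g"],
//       ["tls1_process_heartbeat"], "96db902", CveEvidenceType.Commit)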
/// <summary>
/// Result of a CVE mapping update.
/// </summary>
public sealed record CveMappingUpdateResult(
string CveId,
int FunctionsUpdated,
TimeSpan Duration,
ImmutableArray<string> Errors);
/// <summary>
/// Result of batch CVE mapping update.
/// </summary>
public sealed record CveBatchMappingResult(
string LibraryName,
int CvesProcessed,
int TotalFunctionsUpdated,
TimeSpan Duration,
ImmutableArray<string> Errors);

View File

@@ -0,0 +1,531 @@
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Corpus.Models;
namespace StellaOps.BinaryIndex.Corpus.Services;
/// <summary>
/// Service for clustering semantically similar functions across library versions.
/// Groups functions by their canonical name and computes similarity to cluster centroid.
/// </summary>
public sealed partial class FunctionClusteringService : IFunctionClusteringService
{
private readonly ICorpusRepository _repository;
private readonly IClusterSimilarityComputer _similarityComputer;
private readonly ILogger<FunctionClusteringService> _logger;
public FunctionClusteringService(
ICorpusRepository repository,
IClusterSimilarityComputer similarityComputer,
ILogger<FunctionClusteringService> logger)
{
_repository = repository;
_similarityComputer = similarityComputer;
_logger = logger;
}
/// <inheritdoc />
public async Task<ClusteringResult> ClusterFunctionsAsync(
Guid libraryId,
ClusteringOptions? options = null,
CancellationToken ct = default)
{
var opts = options ?? new ClusteringOptions();
var startTime = DateTime.UtcNow;
_logger.LogInformation(
"Starting function clustering for library {LibraryId}",
libraryId);
// Get all functions with fingerprints for this library
var functionsWithFingerprints = await GetFunctionsWithFingerprintsAsync(libraryId, ct);
if (functionsWithFingerprints.Count == 0)
{
_logger.LogWarning("No functions with fingerprints found for library {LibraryId}", libraryId);
return new ClusteringResult(
libraryId,
0,
0,
TimeSpan.Zero,
[],
[]);
}
_logger.LogInformation(
"Found {Count} functions with fingerprints",
functionsWithFingerprints.Count);
// Group functions by canonical name
var groupedByName = functionsWithFingerprints
.GroupBy(f => NormalizeCanonicalName(f.Function.DemangledName ?? f.Function.Name))
.Where(g => !string.IsNullOrWhiteSpace(g.Key))
.ToList();
_logger.LogInformation(
"Grouped into {Count} canonical function names",
groupedByName.Count);
var clustersCreated = 0;
var membersAssigned = 0;
var errors = new List<string>();
var warnings = new List<string>();
foreach (var group in groupedByName)
{
ct.ThrowIfCancellationRequested();
try
{
var result = await ProcessFunctionGroupAsync(
libraryId,
group.Key,
group.ToList(),
opts,
ct);
clustersCreated++;
membersAssigned += result.MembersAdded;
if (result.Warnings.Length > 0)
{
warnings.AddRange(result.Warnings);
}
}
catch (Exception ex)
{
errors.Add($"Failed to cluster '{group.Key}': {ex.Message}");
_logger.LogError(ex, "Error clustering function group {Name}", group.Key);
}
}
var duration = DateTime.UtcNow - startTime;
_logger.LogInformation(
"Clustering completed: {Clusters} clusters, {Members} members in {Duration:c}",
clustersCreated,
membersAssigned,
duration);
return new ClusteringResult(
libraryId,
clustersCreated,
membersAssigned,
duration,
[.. errors],
[.. warnings]);
}
/// <inheritdoc />
public async Task<ClusteringResult> ReclusterAsync(
Guid clusterId,
ClusteringOptions? options = null,
CancellationToken ct = default)
{
var opts = options ?? new ClusteringOptions();
var startTime = DateTime.UtcNow;
// Get existing cluster
var cluster = await _repository.GetClusterAsync(clusterId, ct);
if (cluster is null)
{
return new ClusteringResult(
Guid.Empty,
0,
0,
TimeSpan.Zero,
["Cluster not found"],
[]);
}
// Get current members
var members = await _repository.GetClusterMembersAsync(clusterId, ct);
if (members.Length == 0)
{
return new ClusteringResult(
cluster.LibraryId,
0,
0,
TimeSpan.Zero,
[],
["Cluster has no members"]);
}
// Get functions with fingerprints
var functionsWithFingerprints = new List<FunctionWithFingerprint>();
foreach (var member in members)
{
var function = await _repository.GetFunctionAsync(member.FunctionId, ct);
if (function is null)
{
continue;
}
var fingerprints = await _repository.GetFingerprintsForFunctionAsync(function.Id, ct);
var semanticFp = fingerprints.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);
if (semanticFp is not null)
{
functionsWithFingerprints.Add(new FunctionWithFingerprint(function, semanticFp));
}
}
// Clear existing members
await _repository.ClearClusterMembersAsync(clusterId, ct);
// Recompute similarities
var centroid = ComputeCentroid(functionsWithFingerprints, opts);
var membersAdded = 0;
foreach (var fwf in functionsWithFingerprints)
{
var similarity = await _similarityComputer.ComputeSimilarityAsync(
fwf.Fingerprint.Fingerprint,
centroid,
ct);
if (similarity >= opts.MinimumSimilarity)
{
await _repository.AddClusterMemberAsync(
new ClusterMember(clusterId, fwf.Function.Id, similarity),
ct);
membersAdded++;
}
}
var duration = DateTime.UtcNow - startTime;
return new ClusteringResult(
cluster.LibraryId,
1,
membersAdded,
duration,
[],
[]);
}
/// <inheritdoc />
public async Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
Guid libraryId,
CancellationToken ct = default)
{
return await _repository.GetClustersForLibraryAsync(libraryId, ct);
}
/// <inheritdoc />
public async Task<ClusterDetails?> GetClusterDetailsAsync(
Guid clusterId,
CancellationToken ct = default)
{
var cluster = await _repository.GetClusterAsync(clusterId, ct);
if (cluster is null)
{
return null;
}
var members = await _repository.GetClusterMembersAsync(clusterId, ct);
var functionDetails = new List<ClusterMemberDetails>();
foreach (var member in members)
{
var function = await _repository.GetFunctionAsync(member.FunctionId, ct);
if (function is null)
{
continue;
}
var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
LibraryVersion? version = null;
if (variant is not null)
{
version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
}
functionDetails.Add(new ClusterMemberDetails(
member.FunctionId,
function.Name,
function.DemangledName,
version?.Version ?? "unknown",
variant?.Architecture ?? "unknown",
member.SimilarityToCentroid ?? 0m));
}
return new ClusterDetails(
cluster.Id,
cluster.LibraryId,
cluster.CanonicalName,
cluster.Description,
[.. functionDetails]);
}
#region Private Methods
private async Task<List<FunctionWithFingerprint>> GetFunctionsWithFingerprintsAsync(
Guid libraryId,
CancellationToken ct)
{
var result = new List<FunctionWithFingerprint>();
// Get all versions for the library
var library = await _repository.GetLibraryByIdAsync(libraryId, ct);
if (library is null)
{
return result;
}
var versions = await _repository.ListVersionsAsync(library.Name, ct);
foreach (var version in versions)
{
var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);
foreach (var variant in variants)
{
var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);
foreach (var function in functions)
{
var fingerprints = await _repository.GetFingerprintsForFunctionAsync(function.Id, ct);
var semanticFp = fingerprints.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);
if (semanticFp is not null)
{
result.Add(new FunctionWithFingerprint(function, semanticFp));
}
}
}
}
return result;
}
private async Task<GroupClusteringResult> ProcessFunctionGroupAsync(
Guid libraryId,
string canonicalName,
List<FunctionWithFingerprint> functions,
ClusteringOptions options,
CancellationToken ct)
{
// Ensure cluster exists
var existingClusters = await _repository.GetClustersForLibraryAsync(libraryId, ct);
var cluster = existingClusters.FirstOrDefault(c =>
string.Equals(c.CanonicalName, canonicalName, StringComparison.OrdinalIgnoreCase));
Guid clusterId;
if (cluster is null)
{
// Create new cluster
var newCluster = new FunctionCluster(
Guid.NewGuid(),
libraryId,
canonicalName,
$"Cluster for function '{canonicalName}'",
DateTimeOffset.UtcNow);
await _repository.InsertClusterAsync(newCluster, ct);
clusterId = newCluster.Id;
}
else
{
clusterId = cluster.Id;
// Clear existing members for recomputation
await _repository.ClearClusterMembersAsync(clusterId, ct);
}
// Compute centroid fingerprint
var centroid = ComputeCentroid(functions, options);
var membersAdded = 0;
var warnings = new List<string>();
foreach (var fwf in functions)
{
var similarity = await _similarityComputer.ComputeSimilarityAsync(
fwf.Fingerprint.Fingerprint,
centroid,
ct);
if (similarity >= options.MinimumSimilarity)
{
await _repository.AddClusterMemberAsync(
new ClusterMember(clusterId, fwf.Function.Id, similarity),
ct);
membersAdded++;
}
else
{
warnings.Add($"Function {fwf.Function.Name} excluded: similarity {similarity:F4} < threshold {options.MinimumSimilarity:F4}");
}
}
return new GroupClusteringResult(membersAdded, [.. warnings]);
}
private static byte[] ComputeCentroid(
List<FunctionWithFingerprint> functions,
ClusteringOptions options)
{
if (functions.Count == 0)
{
return [];
}
if (functions.Count == 1)
{
return functions[0].Fingerprint.Fingerprint;
}
// Use most common fingerprint as centroid (mode-based approach)
// This is more robust than averaging for discrete hash-based fingerprints
var fingerprintCounts = functions
.GroupBy(f => Convert.ToHexStringLower(f.Fingerprint.Fingerprint))
.OrderByDescending(g => g.Count())
.ToList();
var mostCommon = fingerprintCounts.First();
return functions
.First(f => Convert.ToHexStringLower(f.Fingerprint.Fingerprint) == mostCommon.Key)
.Fingerprint.Fingerprint;
}
/// <summary>
/// Normalizes a function name to its canonical form for clustering.
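/// For example, "memcpy@GLIBC_2.14" becomes "memcpy" and
/// "__res_query_internal" becomes "res_query".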
/// </summary>
private static string NormalizeCanonicalName(string name)
{
if (string.IsNullOrWhiteSpace(name))
{
return string.Empty;
}
// Remove GLIBC version annotations (e.g., memcpy@GLIBC_2.14 -> memcpy)
var normalized = GlibcVersionPattern().Replace(name, "");
// Remove trailing @@ symbols
normalized = normalized.TrimEnd('@');
// Remove common symbol prefixes
if (normalized.StartsWith("__"))
{
normalized = normalized[2..];
}
// Remove _internal suffixes
normalized = InternalSuffixPattern().Replace(normalized, "");
// Trim whitespace
normalized = normalized.Trim();
return normalized;
}
[GeneratedRegex(@"@GLIBC_[\d.]+", RegexOptions.Compiled)]
private static partial Regex GlibcVersionPattern();
[GeneratedRegex(@"_internal$", RegexOptions.Compiled | RegexOptions.IgnoreCase)]
private static partial Regex InternalSuffixPattern();
#endregion
private sealed record FunctionWithFingerprint(CorpusFunction Function, CorpusFingerprint Fingerprint);
private sealed record GroupClusteringResult(int MembersAdded, ImmutableArray<string> Warnings);
}
/// <summary>
/// Interface for function clustering.
/// </summary>
public interface IFunctionClusteringService
{
/// <summary>
/// Cluster all functions for a library.
/// </summary>
Task<ClusteringResult> ClusterFunctionsAsync(
Guid libraryId,
ClusteringOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Recompute a specific cluster.
/// </summary>
Task<ClusteringResult> ReclusterAsync(
Guid clusterId,
ClusteringOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Get all clusters for a library.
/// </summary>
Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
Guid libraryId,
CancellationToken ct = default);
/// <summary>
/// Get detailed information about a cluster.
/// </summary>
Task<ClusterDetails?> GetClusterDetailsAsync(
Guid clusterId,
CancellationToken ct = default);
}
/// <summary>
/// Options for function clustering.
/// </summary>
public sealed record ClusteringOptions
{
/// <summary>
/// Minimum similarity threshold to include a function in a cluster.
/// </summary>
public decimal MinimumSimilarity { get; init; } = 0.7m;
/// <summary>
/// Algorithm to use for clustering.
/// </summary>
public FingerprintAlgorithm Algorithm { get; init; } = FingerprintAlgorithm.SemanticKsg;
}
/// <summary>
/// Result of clustering operation.
/// </summary>
public sealed record ClusteringResult(
Guid LibraryId,
int ClustersCreated,
int MembersAssigned,
TimeSpan Duration,
ImmutableArray<string> Errors,
ImmutableArray<string> Warnings);
/// <summary>
/// Detailed cluster information.
/// </summary>
public sealed record ClusterDetails(
Guid ClusterId,
Guid LibraryId,
string CanonicalName,
string? Description,
ImmutableArray<ClusterMemberDetails> Members);
/// <summary>
/// Details about a cluster member.
/// </summary>
public sealed record ClusterMemberDetails(
Guid FunctionId,
string FunctionName,
string? DemangledName,
string Version,
string Architecture,
decimal SimilarityToCentroid);
/// <summary>
/// Interface for computing similarity between fingerprints.
/// </summary>
public interface IClusterSimilarityComputer
{
/// <summary>
/// Compute similarity between two fingerprints.
/// </summary>
Task<decimal> ComputeSimilarityAsync(
byte[] fingerprint1,
byte[] fingerprint2,
CancellationToken ct = default);
}

View File

@@ -10,6 +10,7 @@
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Http" />
</ItemGroup>
<ItemGroup>

View File

@@ -0,0 +1,392 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// Engine for comparing AST structures using tree edit distance and semantic analysis.
/// </summary>
public sealed class AstComparisonEngine : IAstComparisonEngine
{
/// <inheritdoc />
public decimal ComputeStructuralSimilarity(DecompiledAst a, DecompiledAst b)
{
ArgumentNullException.ThrowIfNull(a);
ArgumentNullException.ThrowIfNull(b);
// Use normalized tree edit distance
var editDistance = ComputeEditDistance(a, b);
return 1.0m - editDistance.NormalizedDistance;
}
/// <inheritdoc />
public AstEditDistance ComputeEditDistance(DecompiledAst a, DecompiledAst b)
{
ArgumentNullException.ThrowIfNull(a);
ArgumentNullException.ThrowIfNull(b);
// Greedy positional tree diff: a cheap approximation of true tree edit
// distance (no optimal alignment as in Zhang-Shasha)
var operations = ComputeTreeEditOperations(a.Root, b.Root);
var totalNodes = Math.Max(a.NodeCount, b.NodeCount);
var normalized = totalNodes > 0
? (decimal)operations.TotalOperations / totalNodes
: 0m;
return new AstEditDistance(
operations.Insertions,
operations.Deletions,
operations.Modifications,
operations.TotalOperations,
Math.Clamp(normalized, 0m, 1m));
}
/// <inheritdoc />
public ImmutableArray<SemanticEquivalence> FindEquivalences(DecompiledAst a, DecompiledAst b)
{
ArgumentNullException.ThrowIfNull(a);
ArgumentNullException.ThrowIfNull(b);
var equivalences = new List<SemanticEquivalence>();
// Find equivalent subtrees
var nodesA = CollectNodes(a.Root).ToList();
var nodesB = CollectNodes(b.Root).ToList();
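// Pairwise scan is O(|A| * |B|); tolerable at single-function AST sizes.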
foreach (var nodeA in nodesA)
{
foreach (var nodeB in nodesB)
{
var equivalence = CheckEquivalence(nodeA, nodeB);
if (equivalence is not null)
{
equivalences.Add(equivalence);
}
}
}
// Remove redundant equivalences (child nodes when parent is equivalent)
return [.. FilterRedundantEquivalences(equivalences)];
}
/// <inheritdoc />
public ImmutableArray<CodeDifference> FindDifferences(DecompiledAst a, DecompiledAst b)
{
ArgumentNullException.ThrowIfNull(a);
ArgumentNullException.ThrowIfNull(b);
var differences = new List<CodeDifference>();
// Compare root structures
CompareNodes(a.Root, b.Root, differences);
return [.. differences];
}
private static EditOperations ComputeTreeEditOperations(AstNode a, AstNode b)
{
// Simplified tree comparison
if (a.Type != b.Type)
{
return new EditOperations(0, 0, 1, 1);
}
var childrenA = a.Children;
var childrenB = b.Children;
var insertions = 0;
var deletions = 0;
var modifications = 0;
// Compare children positionally; surplus children on either side count
// as insertions (extra in b) or deletions (missing from a).
var minLen = Math.Min(childrenA.Length, childrenB.Length);
insertions = childrenB.Length - minLen;
deletions = childrenA.Length - minLen;
for (var i = 0; i < minLen; i++)
{
var childOps = ComputeTreeEditOperations(childrenA[i], childrenB[i]);
insertions += childOps.Insertions;
deletions += childOps.Deletions;
modifications += childOps.Modifications;
}
return new EditOperations(insertions, deletions, modifications, insertions + deletions + modifications);
}
private static SemanticEquivalence? CheckEquivalence(AstNode a, AstNode b)
{
// Same type - potential equivalence
if (a.Type != b.Type)
{
return null;
}
// Check for identical
if (AreNodesIdentical(a, b))
{
return new SemanticEquivalence(a, b, EquivalenceType.Identical, 1.0m, "Identical nodes");
}
// Check for renamed (same structure, different names)
if (AreNodesRenamed(a, b))
{
return new SemanticEquivalence(a, b, EquivalenceType.Renamed, 0.95m, "Same structure with renamed identifiers");
}
// Check for optimization variants
if (AreOptimizationVariants(a, b))
{
return new SemanticEquivalence(a, b, EquivalenceType.Optimized, 0.85m, "Optimization variant");
}
return null;
}
private static bool AreNodesIdentical(AstNode a, AstNode b)
{
if (a.Type != b.Type || a.Children.Length != b.Children.Length)
{
return false;
}
// Check node-specific equality
if (a is ConstantNode constA && b is ConstantNode constB)
{
return constA.Value?.ToString() == constB.Value?.ToString();
}
if (a is VariableNode varA && b is VariableNode varB)
{
return varA.Name == varB.Name;
}
if (a is BinaryOpNode binA && b is BinaryOpNode binB)
{
if (binA.Operator != binB.Operator)
{
return false;
}
}
if (a is CallNode callA && b is CallNode callB)
{
if (callA.FunctionName != callB.FunctionName)
{
return false;
}
}
// Check children recursively
for (var i = 0; i < a.Children.Length; i++)
{
if (!AreNodesIdentical(a.Children[i], b.Children[i]))
{
return false;
}
}
return true;
}
private static bool AreNodesRenamed(AstNode a, AstNode b)
{
if (a.Type != b.Type || a.Children.Length != b.Children.Length)
{
return false;
}
// Same structure but variable/parameter names differ
if (a is VariableNode && b is VariableNode)
{
return true; // Positionally matched variable; the identical-name case is classified as Identical earlier
}
// Check children have same structure
for (var i = 0; i < a.Children.Length; i++)
{
if (!AreNodesRenamed(a.Children[i], b.Children[i]) &&
!AreNodesIdentical(a.Children[i], b.Children[i]))
{
return false;
}
}
return true;
}
private static bool AreOptimizationVariants(AstNode a, AstNode b)
{
// Detect common optimization patterns
// Loop unrolling: for loop vs repeated statements
if (a.Type == AstNodeType.For && b.Type == AstNodeType.Block)
{
return true; // Might be unrolled
}
// Strength reduction: multiply/divide replaced by shifts (e.g. x * 8 -> x << 3)
if (a is BinaryOpNode binA && b is BinaryOpNode binB)
{
if ((binA.Operator == "*" && binB.Operator == "<<") ||
(binA.Operator == "/" && binB.Operator == ">>"))
{
return true;
}
}
// Inline expansion
if (a.Type == AstNodeType.Call && b.Type == AstNodeType.Block)
{
return true; // Might be inlined
}
return false;
}
private static void CompareNodes(AstNode a, AstNode b, List<CodeDifference> differences)
{
if (a.Type != b.Type)
{
differences.Add(new CodeDifference(
DifferenceType.Modified,
a,
b,
$"Node type changed: {a.Type} -> {b.Type}"));
return;
}
// Compare specific node types
switch (a)
{
case VariableNode varA when b is VariableNode varB:
if (varA.Name != varB.Name)
{
differences.Add(new CodeDifference(
DifferenceType.Modified,
a,
b,
$"Variable renamed: {varA.Name} -> {varB.Name}"));
}
break;
case ConstantNode constA when b is ConstantNode constB:
if (constA.Value?.ToString() != constB.Value?.ToString())
{
differences.Add(new CodeDifference(
DifferenceType.Modified,
a,
b,
$"Constant changed: {constA.Value} -> {constB.Value}"));
}
break;
case BinaryOpNode binA when b is BinaryOpNode binB:
if (binA.Operator != binB.Operator)
{
differences.Add(new CodeDifference(
DifferenceType.Modified,
a,
b,
$"Operator changed: {binA.Operator} -> {binB.Operator}"));
}
break;
case CallNode callA when b is CallNode callB:
if (callA.FunctionName != callB.FunctionName)
{
differences.Add(new CodeDifference(
DifferenceType.Modified,
a,
b,
$"Function call changed: {callA.FunctionName} -> {callB.FunctionName}"));
}
break;
}
// Compare children
var minChildren = Math.Min(a.Children.Length, b.Children.Length);
for (var i = 0; i < minChildren; i++)
{
CompareNodes(a.Children[i], b.Children[i], differences);
}
// Handle added/removed children
for (var i = minChildren; i < a.Children.Length; i++)
{
differences.Add(new CodeDifference(
DifferenceType.Removed,
a.Children[i],
null,
$"Node removed: {a.Children[i].Type}"));
}
for (var i = minChildren; i < b.Children.Length; i++)
{
differences.Add(new CodeDifference(
DifferenceType.Added,
null,
b.Children[i],
$"Node added: {b.Children[i].Type}"));
}
}
private static IEnumerable<AstNode> CollectNodes(AstNode root)
{
yield return root;
foreach (var child in root.Children)
{
foreach (var node in CollectNodes(child))
{
yield return node;
}
}
}
private static IEnumerable<SemanticEquivalence> FilterRedundantEquivalences(
List<SemanticEquivalence> equivalences)
{
// Keep only top-level equivalences
var result = new List<SemanticEquivalence>();
foreach (var eq in equivalences)
{
var isRedundant = equivalences.Any(other =>
other != eq &&
IsAncestor(other.NodeA, eq.NodeA) &&
IsAncestor(other.NodeB, eq.NodeB));
if (!isRedundant)
{
result.Add(eq);
}
}
return result;
}
private static bool IsAncestor(AstNode potential, AstNode node)
{
if (potential == node)
{
return false;
}
foreach (var child in potential.Children)
{
if (child == node || IsAncestor(child, node))
{
return true;
}
}
return false;
}
private readonly record struct EditOperations(int Insertions, int Deletions, int Modifications, int TotalOperations);
}

View File

@@ -0,0 +1,534 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Security.Cryptography;
using System.Text;
using System.Text.RegularExpressions;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// Normalizes decompiled code for comparison by removing superficial differences.
/// </summary>
public sealed partial class CodeNormalizer : ICodeNormalizer
{
private static readonly ImmutableHashSet<string> CKeywords = ImmutableHashSet.Create(
"auto", "break", "case", "char", "const", "continue", "default", "do",
"double", "else", "enum", "extern", "float", "for", "goto", "if",
"int", "long", "register", "return", "short", "signed", "sizeof", "static",
"struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while",
// Common Ghidra types
"undefined", "undefined1", "undefined2", "undefined4", "undefined8",
"byte", "word", "dword", "qword", "bool", "uchar", "ushort", "uint", "ulong",
"int8_t", "int16_t", "int32_t", "int64_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t",
"size_t", "ssize_t", "ptrdiff_t", "intptr_t", "uintptr_t",
// Common literals to preserve
"NULL", "true", "false"
);
/// <inheritdoc />
public string Normalize(string code, NormalizationOptions? options = null)
{
ArgumentException.ThrowIfNullOrEmpty(code);
options ??= NormalizationOptions.Default;
var normalized = code;
// 1. Remove comments
normalized = RemoveComments(normalized);
// 2. Normalize variable names
if (options.NormalizeVariables)
{
normalized = NormalizeVariableNames(normalized, options.KnownFunctions);
}
// 3. Normalize function calls
if (options.NormalizeFunctionCalls)
{
normalized = NormalizeFunctionCalls(normalized, options.KnownFunctions);
}
// 4. Normalize constants
if (options.NormalizeConstants)
{
normalized = NormalizeConstants(normalized);
}
// 5. Sort independent statements (within blocks) while line structure is
//    still intact; the whitespace pass below strips newlines around
//    punctuation, which would turn line-based sorting into a no-op.
if (options.SortIndependentStatements)
{
normalized = SortIndependentStatements(normalized);
}
// 6. Normalize whitespace
if (options.NormalizeWhitespace)
{
normalized = NormalizeWhitespace(normalized);
}
return normalized;
}
/// <inheritdoc />
public byte[] ComputeCanonicalHash(string code)
{
ArgumentException.ThrowIfNullOrEmpty(code);
// Normalize with full normalization for hashing
var normalized = Normalize(code, new NormalizationOptions
{
NormalizeVariables = true,
NormalizeFunctionCalls = true,
NormalizeConstants = false, // Keep constants for semantic identity
NormalizeWhitespace = true,
SortIndependentStatements = true
});
return SHA256.HashData(Encoding.UTF8.GetBytes(normalized));
}
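// Two decompilations that differ only in identifier names, call-site
// labels, whitespace, or independent statement order therefore produce
// the same 32-byte SHA-256 digest.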
/// <inheritdoc />
public DecompiledAst NormalizeAst(DecompiledAst ast, NormalizationOptions? options = null)
{
ArgumentNullException.ThrowIfNull(ast);
options ??= NormalizationOptions.Default;
var varIndex = 0;
var varMap = new Dictionary<string, string>();
var normalizedRoot = NormalizeNode(ast.Root, options, varMap, ref varIndex);
return new DecompiledAst(
normalizedRoot,
ast.NodeCount,
ast.Depth,
ast.Patterns);
}
private static AstNode NormalizeNode(
AstNode node,
NormalizationOptions options,
Dictionary<string, string> varMap,
ref int varIndex)
{
return node switch
{
VariableNode varNode when options.NormalizeVariables =>
NormalizeVariableNode(varNode, varMap, ref varIndex),
CallNode callNode when options.NormalizeFunctionCalls =>
NormalizeCallNode(callNode, options, varMap, ref varIndex),
ConstantNode constNode when options.NormalizeConstants =>
NormalizeConstantNode(constNode),
_ => NormalizeChildren(node, options, varMap, ref varIndex)
};
}
private static AstNode NormalizeVariableNode(
VariableNode node,
Dictionary<string, string> varMap,
ref int varIndex)
{
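// Renames are assigned in first-seen order ("local_10" -> "var_0",
// "iVar1" -> "var_1", ...) so two normalizations of equivalent code
// line up identifier-for-identifier.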
if (IsKeywordOrType(node.Name))
{
return node;
}
if (!varMap.TryGetValue(node.Name, out var canonical))
{
canonical = $"var_{varIndex++}";
varMap[node.Name] = canonical;
}
return node with { Name = canonical };
}
private static AstNode NormalizeCallNode(
CallNode node,
NormalizationOptions options,
Dictionary<string, string> varMap,
ref int varIndex)
{
var funcName = node.FunctionName;
// Preserve known functions
if (options.KnownFunctions?.Contains(funcName) != true &&
!IsStandardLibraryFunction(funcName))
{
funcName = $"func_{funcName.GetHashCode():X8}";
}
var normalizedArgs = new List<AstNode>(node.Arguments.Length);
foreach (var arg in node.Arguments)
{
normalizedArgs.Add(NormalizeNode(arg, options, varMap, ref varIndex));
}
return new CallNode(funcName, [.. normalizedArgs], node.Location);
}
private static AstNode NormalizeConstantNode(ConstantNode node)
{
// Normalize numeric constants to canonical form
if (node.Value is long or int or short or byte)
{
return node with { Value = "CONST_INT" };
}
if (node.Value is double or float or decimal)
{
return node with { Value = "CONST_FLOAT" };
}
if (node.Value is string)
{
return node with { Value = "CONST_STR" };
}
return node;
}
private static AstNode NormalizeChildren(
AstNode node,
NormalizationOptions options,
Dictionary<string, string> varMap,
ref int varIndex)
{
if (node.Children.Length == 0)
{
return node;
}
var normalizedChildren = new List<AstNode>(node.Children.Length);
foreach (var child in node.Children)
{
normalizedChildren.Add(NormalizeNode(child, options, varMap, ref varIndex));
}
var normalizedArray = normalizedChildren.ToImmutableArray();
// Use reflection-free approach for common node types
return node switch
{
BlockNode block => block with { Statements = normalizedArray },
IfNode ifNode => CreateNormalizedIf(ifNode, normalizedArray),
WhileNode whileNode => CreateNormalizedWhile(whileNode, normalizedArray),
ForNode forNode => CreateNormalizedFor(forNode, normalizedArray),
ReturnNode returnNode when normalizedArray.Length > 0 =>
returnNode with { Value = normalizedArray[0] },
AssignmentNode assignment => CreateNormalizedAssignment(assignment, normalizedArray),
BinaryOpNode binOp => CreateNormalizedBinaryOp(binOp, normalizedArray),
UnaryOpNode unaryOp when normalizedArray.Length > 0 =>
unaryOp with { Operand = normalizedArray[0] },
_ => node // Return as-is for other node types
};
}
private static IfNode CreateNormalizedIf(IfNode node, ImmutableArray<AstNode> children)
{
return new IfNode(
children.Length > 0 ? children[0] : node.Condition,
children.Length > 1 ? children[1] : node.ThenBranch,
children.Length > 2 ? children[2] : node.ElseBranch,
node.Location);
}
private static WhileNode CreateNormalizedWhile(WhileNode node, ImmutableArray<AstNode> children)
{
return new WhileNode(
children.Length > 0 ? children[0] : node.Condition,
children.Length > 1 ? children[1] : node.Body,
node.Location);
}
private static ForNode CreateNormalizedFor(ForNode node, ImmutableArray<AstNode> children)
{
return new ForNode(
children.Length > 0 ? children[0] : node.Init,
children.Length > 1 ? children[1] : node.Condition,
children.Length > 2 ? children[2] : node.Update,
children.Length > 3 ? children[3] : node.Body,
node.Location);
}
private static AssignmentNode CreateNormalizedAssignment(
AssignmentNode node,
ImmutableArray<AstNode> children)
{
return new AssignmentNode(
children.Length > 0 ? children[0] : node.Target,
children.Length > 1 ? children[1] : node.Value,
node.Operator,
node.Location);
}
private static BinaryOpNode CreateNormalizedBinaryOp(
BinaryOpNode node,
ImmutableArray<AstNode> children)
{
return new BinaryOpNode(
children.Length > 0 ? children[0] : node.Left,
children.Length > 1 ? children[1] : node.Right,
node.Operator,
node.Location);
}
private static string RemoveComments(string code)
{
// Remove single-line comments
code = SingleLineCommentRegex().Replace(code, "");
// Remove multi-line comments
code = MultiLineCommentRegex().Replace(code, "");
return code;
}
private static string NormalizeVariableNames(string code, ImmutableHashSet<string>? knownFunctions)
{
var varIndex = 0;
var varMap = new Dictionary<string, string>();
return IdentifierRegex().Replace(code, match =>
{
var name = match.Value;
// Skip keywords and types
if (IsKeywordOrType(name))
{
return name;
}
// Skip known functions
if (knownFunctions?.Contains(name) == true)
{
return name;
}
// Skip standard library functions
if (IsStandardLibraryFunction(name))
{
return name;
}
if (!varMap.TryGetValue(name, out var canonical))
{
canonical = $"var_{varIndex++}";
varMap[name] = canonical;
}
return canonical;
});
}
private static string NormalizeFunctionCalls(string code, ImmutableHashSet<string>? knownFunctions)
{
// Match function calls: identifier followed by (
return FunctionCallRegex().Replace(code, match =>
{
var funcName = match.Groups[1].Value;
// Skip known functions
if (knownFunctions?.Contains(funcName) == true)
{
return match.Value;
}
// Skip standard library functions
if (IsStandardLibraryFunction(funcName))
{
return match.Value;
}
return $"func_{funcName.GetHashCode():X8}(";
});
}
private static string NormalizeConstants(string code)
{
// Normalize hex constants
code = HexConstantRegex().Replace(code, "CONST_HEX");
// Normalize decimal constants (but preserve small common ones like 0, 1, 2)
code = LargeDecimalRegex().Replace(code, "CONST_INT");
// Normalize string literals
code = StringLiteralRegex().Replace(code, "CONST_STR");
return code;
}
private static string NormalizeWhitespace(string code)
{
// Collapse multiple whitespace to single space
code = MultipleWhitespaceRegex().Replace(code, " ");
// Remove whitespace around operators
code = WhitespaceAroundOperatorsRegex().Replace(code, "$1");
// Normalize line endings
code = code.Replace("\r\n", "\n").Replace("\r", "\n");
// Remove trailing whitespace on lines
code = TrailingWhitespaceRegex().Replace(code, "\n");
return code.Trim();
}
private static string SortIndependentStatements(string code)
{
// Parse into blocks and sort independent statements within each block
// This is a simplified implementation that sorts top-level statements
// A full implementation would need to analyze data dependencies
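// E.g. "x = f(); y = g();" and "y = g(); x = f();" normalize to the same
// order; reordering is only safe when statements are independent, hence
// the conservative sort key in SortStatements below.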
var lines = code.Split('\n', StringSplitOptions.RemoveEmptyEntries);
var result = new StringBuilder();
var blockDepth = 0;
var currentBlock = new List<string>();
foreach (var line in lines)
{
var trimmed = line.Trim();
// Track block depth
blockDepth += trimmed.Count(c => c == '{');
blockDepth -= trimmed.Count(c => c == '}');
if (blockDepth == 1 && !trimmed.Contains('{') && !trimmed.Contains('}'))
{
// Simple statement at block level 1
currentBlock.Add(trimmed);
}
else
{
// Flush sorted block
if (currentBlock.Count > 0)
{
var sorted = SortStatements(currentBlock);
foreach (var stmt in sorted)
{
result.AppendLine(stmt);
}
currentBlock.Clear();
}
result.AppendLine(line);
}
}
// Flush remaining
if (currentBlock.Count > 0)
{
var sorted = SortStatements(currentBlock);
foreach (var stmt in sorted)
{
result.AppendLine(stmt);
}
}
return result.ToString().Trim();
}
private static List<string> SortStatements(List<string> statements)
{
// Group statements that can be reordered
// For now, just sort by canonical form (conservative)
return statements
.OrderBy(s => GetStatementSortKey(s), StringComparer.Ordinal)
.ToList();
}
private static string GetStatementSortKey(string statement)
{
// Extract the "essence" of the statement for sorting
// e.g., assignment target, function call name
var trimmed = statement.Trim();
// Assignment: sort by target
var assignMatch = AssignmentTargetRegex().Match(trimmed);
if (assignMatch.Success)
{
return $"A_{assignMatch.Groups[1].Value}";
}
// Function call: sort by function name
var callMatch = FunctionNameRegex().Match(trimmed);
if (callMatch.Success)
{
return $"C_{callMatch.Groups[1].Value}";
}
return $"Z_{trimmed}";
}
private static bool IsKeywordOrType(string name)
{
return CKeywords.Contains(name);
}
private static bool IsStandardLibraryFunction(string name)
{
// Common C standard library functions to preserve
return name switch
{
// Memory
"malloc" or "calloc" or "realloc" or "free" or "memcpy" or "memmove" or "memset" or "memcmp" => true,
// String
"strlen" or "strcpy" or "strncpy" or "strcat" or "strncat" or "strcmp" or "strncmp" or "strchr" or "strrchr" or "strstr" => true,
// I/O
"printf" or "fprintf" or "sprintf" or "snprintf" or "scanf" or "fscanf" or "sscanf" => true,
"fopen" or "fclose" or "fread" or "fwrite" or "fseek" or "ftell" or "fflush" => true,
"puts" or "fputs" or "gets" or "fgets" or "putchar" or "getchar" => true,
// Math
"abs" or "labs" or "llabs" or "fabs" or "sqrt" or "pow" or "sin" or "cos" or "tan" or "log" or "exp" => true,
// Other
"exit" or "abort" or "atexit" or "atoi" or "atol" or "atof" or "strtol" or "strtoul" or "strtod" => true,
"assert" or "errno" => true,
_ => false
};
}
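/// <summary>
/// Deterministic 32-bit FNV-1a hash used for canonical function renaming.
/// string.GetHashCode() is randomized per process on modern .NET, which
/// would make <see cref="ComputeCanonicalHash"/> unstable across runs.
/// </summary>
private static uint StableHash(string value)
{
var hash = 2166136261u;
foreach (var ch in value)
{
hash ^= ch;
hash *= 16777619u;
}
return hash;
}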
// Regex patterns using source generators
[GeneratedRegex(@"//[^\n]*")]
private static partial Regex SingleLineCommentRegex();
[GeneratedRegex(@"/\*[\s\S]*?\*/")]
private static partial Regex MultiLineCommentRegex();
[GeneratedRegex(@"\b([a-zA-Z_][a-zA-Z0-9_]*)\b")]
private static partial Regex IdentifierRegex();
[GeneratedRegex(@"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")]
private static partial Regex FunctionCallRegex();
[GeneratedRegex(@"0[xX][0-9a-fA-F]+")]
private static partial Regex HexConstantRegex();
[GeneratedRegex(@"\b[0-9]{4,}\b")]
private static partial Regex LargeDecimalRegex();
[GeneratedRegex(@"""(?:[^""\\]|\\.)*""")]
private static partial Regex StringLiteralRegex();
[GeneratedRegex(@"[ \t]+")]
private static partial Regex MultipleWhitespaceRegex();
[GeneratedRegex(@"\s*([+\-*/%=<>!&|^~?:;,{}()\[\]])\s*")]
private static partial Regex WhitespaceAroundOperatorsRegex();
[GeneratedRegex(@"[ \t]+\n")]
private static partial Regex TrailingWhitespaceRegex();
[GeneratedRegex(@"^([a-zA-Z_][a-zA-Z0-9_]*)\s*=")]
private static partial Regex AssignmentTargetRegex();
[GeneratedRegex(@"^([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")]
private static partial Regex FunctionNameRegex();
}

View File

@@ -0,0 +1,950 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Text.RegularExpressions;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// Parser for Ghidra's decompiled C-like pseudo-code.
/// </summary>
public sealed partial class DecompiledCodeParser : IDecompiledCodeParser
{
private static readonly HashSet<string> s_keywords =
[
"if", "else", "while", "for", "do", "switch", "case", "default",
"return", "break", "continue", "goto", "sizeof", "typedef",
"struct", "union", "enum", "void", "int", "char", "short", "long",
"float", "double", "unsigned", "signed", "const", "static", "extern"
];
private static readonly HashSet<string> s_types =
[
"void", "int", "uint", "char", "uchar", "byte", "ubyte",
"short", "ushort", "long", "ulong", "longlong", "ulonglong",
"float", "double", "bool", "undefined", "undefined1", "undefined2",
"undefined4", "undefined8", "pointer", "code", "dword", "qword", "word"
];
/// <inheritdoc />
public DecompiledAst Parse(string code)
{
ArgumentException.ThrowIfNullOrEmpty(code);
var tokens = Tokenize(code);
var parser = new RecursiveParser(tokens);
var root = parser.ParseFunction();
var nodeCount = CountNodes(root);
var depth = ComputeDepth(root);
var patterns = ExtractPatterns(root);
return new DecompiledAst(root, nodeCount, depth, patterns);
}
/// <inheritdoc />
public ImmutableArray<LocalVariable> ExtractVariables(string code)
{
var variables = new List<LocalVariable>();
// Match variable declarations: type name [= value];
// Ghidra style: int local_10; or undefined8 param_1;
var declPattern = VariableDeclarationRegex();
foreach (Match match in declPattern.Matches(code))
{
var type = match.Groups["type"].Value;
var name = match.Groups["name"].Value;
var isParam = name.StartsWith("param_", StringComparison.Ordinal);
int? paramIndex = null;
int stackOffset = 0;
if (isParam && int.TryParse(name.AsSpan(6), out var idx))
{
paramIndex = idx;
}
if (name.StartsWith("local_", StringComparison.Ordinal) &&
int.TryParse(name.AsSpan(6), System.Globalization.NumberStyles.HexNumber, null, out var offset))
{
stackOffset = -offset; // Negative for locals
}
variables.Add(new LocalVariable(name, type, stackOffset, isParam, paramIndex));
}
return [.. variables];
}
/// <inheritdoc />
public ImmutableArray<string> ExtractCalledFunctions(string code)
{
var functions = new HashSet<string>();
// Match function calls: name(...)
var callPattern = FunctionCallRegex();
foreach (Match match in callPattern.Matches(code))
{
var name = match.Groups["name"].Value;
// Skip keywords and types
if (!s_keywords.Contains(name) && !s_types.Contains(name))
{
functions.Add(name);
}
}
return [.. functions.Order()];
}
private static List<Token> Tokenize(string code)
{
var tokens = new List<Token>();
var i = 0;
var line = 1;
var column = 1;
while (i < code.Length)
{
var c = code[i];
// Skip whitespace
if (char.IsWhiteSpace(c))
{
if (c == '\n')
{
line++;
column = 1;
}
else
{
column++;
}
i++;
continue;
}
// Skip comments
if (i + 1 < code.Length && code[i] == '/' && code[i + 1] == '/')
{
while (i < code.Length && code[i] != '\n')
{
i++;
}
continue;
}
if (i + 1 < code.Length && code[i] == '/' && code[i + 1] == '*')
{
i += 2;
while (i + 1 < code.Length && !(code[i] == '*' && code[i + 1] == '/'))
{
if (code[i] == '\n')
{
line++;
column = 1;
}
i++;
}
i += 2;
continue;
}
var startColumn = column;
// Identifiers and keywords
if (char.IsLetter(c) || c == '_')
{
var start = i;
while (i < code.Length && (char.IsLetterOrDigit(code[i]) || code[i] == '_'))
{
i++;
column++;
}
var value = code[start..i];
var type = s_keywords.Contains(value) ? TokenType.Keyword : TokenType.Identifier;
tokens.Add(new Token(type, value, line, startColumn));
continue;
}
// Numbers
if (char.IsDigit(c) || (c == '0' && i + 1 < code.Length && code[i + 1] == 'x'))
{
var start = i;
if (c == '0' && i + 1 < code.Length && code[i + 1] == 'x')
{
i += 2;
column += 2;
while (i < code.Length && char.IsAsciiHexDigit(code[i]))
{
i++;
column++;
}
}
else
{
while (i < code.Length && (char.IsDigit(code[i]) || code[i] == '.'))
{
i++;
column++;
}
}
// Handle suffixes (U, L, UL, etc.)
while (i < code.Length && (code[i] == 'U' || code[i] == 'L' || code[i] == 'u' || code[i] == 'l'))
{
i++;
column++;
}
tokens.Add(new Token(TokenType.Number, code[start..i], line, startColumn));
continue;
}
// String literals
if (c == '"')
{
var start = i;
i++;
column++;
while (i < code.Length && code[i] != '"')
{
if (code[i] == '\\' && i + 1 < code.Length)
{
i += 2;
column += 2;
}
else
{
i++;
column++;
}
}
i++; // closing quote
column++;
tokens.Add(new Token(TokenType.String, code[start..i], line, startColumn));
continue;
}
// Character literals
if (c == '\'')
{
var start = i;
i++;
column++;
while (i < code.Length && code[i] != '\'')
{
if (code[i] == '\\' && i + 1 < code.Length)
{
i += 2;
column += 2;
}
else
{
i++;
column++;
}
}
i++; // closing quote
column++;
tokens.Add(new Token(TokenType.Char, code[start..i], line, startColumn));
continue;
}
// Multi-character operators
if (i + 1 < code.Length)
{
var twoChar = code.Substring(i, 2);
if (twoChar is "==" or "!=" or "<=" or ">=" or "&&" or "||" or
"++" or "--" or "+=" or "-=" or "*=" or "/=" or
"<<" or ">>" or "->" or "::")
{
tokens.Add(new Token(TokenType.Operator, twoChar, line, startColumn));
i += 2;
column += 2;
continue;
}
}
// Single character operators and punctuation
var tokenType = c switch
{
'(' or ')' or '{' or '}' or '[' or ']' => TokenType.Bracket,
';' or ',' or ':' or '?' => TokenType.Punctuation,
_ => TokenType.Operator
};
tokens.Add(new Token(tokenType, c.ToString(), line, startColumn));
i++;
column++;
}
return tokens;
}
private static int CountNodes(AstNode node)
{
var count = 1;
foreach (var child in node.Children)
{
count += CountNodes(child);
}
return count;
}
private static int ComputeDepth(AstNode node)
{
if (node.Children.Length == 0)
{
return 1;
}
return 1 + node.Children.Max(c => ComputeDepth(c));
}
private static ImmutableArray<AstPattern> ExtractPatterns(AstNode root)
{
var patterns = new List<AstPattern>();
foreach (var node in TraverseNodes(root))
{
// Detect loop patterns
if (node.Type == AstNodeType.For)
{
patterns.Add(new AstPattern(
PatternType.CountedLoop,
node,
new PatternMetadata("For loop", 0.9m, null)));
}
else if (node.Type == AstNodeType.While)
{
patterns.Add(new AstPattern(
PatternType.ConditionalLoop,
node,
new PatternMetadata("While loop", 0.9m, null)));
}
else if (node.Type == AstNodeType.DoWhile)
{
patterns.Add(new AstPattern(
PatternType.ConditionalLoop,
node,
new PatternMetadata("Do-while loop", 0.9m, null)));
}
// Detect error handling
if (node is IfNode ifNode && IsErrorCheck(ifNode))
{
patterns.Add(new AstPattern(
PatternType.ErrorCheck,
node,
new PatternMetadata("Error check", 0.8m, null)));
}
// Detect null checks
if (node is IfNode ifNull && IsNullCheck(ifNull))
{
patterns.Add(new AstPattern(
PatternType.NullCheck,
node,
new PatternMetadata("Null check", 0.9m, null)));
}
}
return [.. patterns];
}
private static IEnumerable<AstNode> TraverseNodes(AstNode root)
{
yield return root;
foreach (var child in root.Children)
{
foreach (var node in TraverseNodes(child))
{
yield return node;
}
}
}
private static bool IsErrorCheck(IfNode node)
{
// Check if condition compares against -1, 0, or NULL
if (node.Condition is BinaryOpNode binaryOp)
{
if (binaryOp.Right is ConstantNode constant)
{
var value = constant.Value?.ToString();
return value is "0" or "-1" or "0xffffffff" or "NULL";
}
}
return false;
}
private static bool IsNullCheck(IfNode node)
{
if (node.Condition is BinaryOpNode binaryOp)
{
if (binaryOp.Operator is "==" or "!=")
{
if (binaryOp.Right is ConstantNode constant)
{
var value = constant.Value?.ToString();
return value is "0" or "NULL" or "nullptr";
}
}
}
return false;
}
[GeneratedRegex(@"(?<type>\w+)\s+(?<name>\w+)\s*(?:=|;)", RegexOptions.Compiled)]
private static partial Regex VariableDeclarationRegex();
[GeneratedRegex(@"(?<name>\w+)\s*\(", RegexOptions.Compiled)]
private static partial Regex FunctionCallRegex();
}
internal enum TokenType
{
Identifier,
Keyword,
Number,
String,
Char,
Operator,
Bracket,
Punctuation
}
internal readonly record struct Token(TokenType Type, string Value, int Line, int Column);
internal sealed class RecursiveParser
{
private readonly List<Token> _tokens;
private int _pos;
public RecursiveParser(List<Token> tokens)
{
_tokens = tokens;
_pos = 0;
}
public AstNode ParseFunction()
{
// Parse return type
var returnType = ParseType();
// Parse function name
var name = Expect(TokenType.Identifier).Value;
// Parse parameters
Expect(TokenType.Bracket, "(");
var parameters = ParseParameterList();
Expect(TokenType.Bracket, ")");
// Parse body
var body = ParseBlock();
return new FunctionNode(name, returnType, parameters, body);
}
private string ParseType()
{
var type = new System.Text.StringBuilder();
// Handle modifiers
while (Peek().Value is "const" or "unsigned" or "signed" or "static" or "extern")
{
type.Append(Advance().Value);
type.Append(' ');
}
// Main type
type.Append(Advance().Value);
// Handle pointers
while (Peek().Value == "*")
{
type.Append(Advance().Value);
}
return type.ToString().Trim();
}
private ImmutableArray<ParameterNode> ParseParameterList()
{
var parameters = new List<ParameterNode>();
var index = 0;
if (Peek().Value == ")")
{
return [];
}
if (Peek().Value == "void" && PeekAhead(1).Value == ")")
{
Advance(); // consume void
return [];
}
do
{
if (Peek().Value == ",")
{
Advance();
}
var type = ParseType();
var name = Peek().Type == TokenType.Identifier ? Advance().Value : $"param_{index}";
parameters.Add(new ParameterNode(name, type, index));
index++;
}
while (Peek().Value == ",");
return [.. parameters];
}
private BlockNode ParseBlock()
{
Expect(TokenType.Bracket, "{");
var statements = new List<AstNode>();
while (Peek().Value != "}")
{
var stmt = ParseStatement();
if (stmt is not null)
{
statements.Add(stmt);
}
}
Expect(TokenType.Bracket, "}");
return new BlockNode([.. statements]);
}
private AstNode? ParseStatement()
{
var token = Peek();
return token.Value switch
{
"if" => ParseIf(),
"while" => ParseWhile(),
"for" => ParseFor(),
"do" => ParseDoWhile(),
"return" => ParseReturn(),
"break" => ParseBreak(),
"continue" => ParseContinue(),
"{" => ParseBlock(),
";" => SkipSemicolon(),
_ => ParseExpressionStatement()
};
}
private IfNode ParseIf()
{
Advance(); // consume 'if'
Expect(TokenType.Bracket, "(");
var condition = ParseExpression();
Expect(TokenType.Bracket, ")");
var thenBranch = ParseStatement() ?? new BlockNode([]);
AstNode? elseBranch = null;
if (Peek().Value == "else")
{
Advance();
elseBranch = ParseStatement();
}
return new IfNode(condition, thenBranch, elseBranch);
}
private WhileNode ParseWhile()
{
Advance(); // consume 'while'
Expect(TokenType.Bracket, "(");
var condition = ParseExpression();
Expect(TokenType.Bracket, ")");
var body = ParseStatement() ?? new BlockNode([]);
return new WhileNode(condition, body);
}
private ForNode ParseFor()
{
Advance(); // consume 'for'
Expect(TokenType.Bracket, "(");
AstNode? init = null;
if (Peek().Value != ";")
{
init = ParseExpression();
}
Expect(TokenType.Punctuation, ";");
AstNode? condition = null;
if (Peek().Value != ";")
{
condition = ParseExpression();
}
Expect(TokenType.Punctuation, ";");
AstNode? update = null;
if (Peek().Value != ")")
{
update = ParseExpression();
}
Expect(TokenType.Bracket, ")");
var body = ParseStatement() ?? new BlockNode([]);
return new ForNode(init, condition, update, body);
}
private AstNode ParseDoWhile()
{
Advance(); // consume 'do'
var body = ParseStatement() ?? new BlockNode([]);
Expect(TokenType.Keyword, "while");
Expect(TokenType.Bracket, "(");
var condition = ParseExpression();
Expect(TokenType.Bracket, ")");
Expect(TokenType.Punctuation, ";");
return new WhileNode(condition, body); // Simplify do-while to while for now
}
private ReturnNode ParseReturn()
{
Advance(); // consume 'return'
AstNode? value = null;
if (Peek().Value != ";")
{
value = ParseExpression();
}
Expect(TokenType.Punctuation, ";");
return new ReturnNode(value);
}
private AstNode ParseBreak()
{
Advance();
Expect(TokenType.Punctuation, ";");
return new BlockNode([]); // Simplified
}
private AstNode ParseContinue()
{
Advance();
Expect(TokenType.Punctuation, ";");
return new BlockNode([]); // Simplified
}
private AstNode? SkipSemicolon()
{
Advance();
return null;
}
private AstNode? ParseExpressionStatement()
{
var expr = ParseExpression();
if (Peek().Value == ";")
{
Advance();
}
return expr;
}
private AstNode ParseExpression()
{
return ParseAssignment();
}
private AstNode ParseAssignment()
{
var left = ParseLogicalOr();
if (Peek().Value is "=" or "+=" or "-=" or "*=" or "/=" or "&=" or "|=" or "^=" or "<<=" or ">>=")
{
var op = Advance().Value;
var right = ParseAssignment();
return new AssignmentNode(left, right, op);
}
return left;
}
private AstNode ParseLogicalOr()
{
var left = ParseLogicalAnd();
while (Peek().Value == "||")
{
var op = Advance().Value;
var right = ParseLogicalAnd();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseLogicalAnd()
{
var left = ParseBitwiseOr();
while (Peek().Value == "&&")
{
var op = Advance().Value;
var right = ParseBitwiseOr();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseBitwiseOr()
{
var left = ParseComparison();
while (Peek().Value is "|" or "^" or "&")
{
var op = Advance().Value;
var right = ParseComparison();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseComparison()
{
var left = ParseShift();
while (Peek().Value is "==" or "!=" or "<" or ">" or "<=" or ">=")
{
var op = Advance().Value;
var right = ParseShift();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseShift()
{
var left = ParseAdditive();
while (Peek().Value is "<<" or ">>")
{
var op = Advance().Value;
var right = ParseAdditive();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseAdditive()
{
var left = ParseMultiplicative();
while (Peek().Value is "+" or "-")
{
var op = Advance().Value;
var right = ParseMultiplicative();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseMultiplicative()
{
var left = ParseUnary();
while (Peek().Value is "*" or "/" or "%")
{
var op = Advance().Value;
var right = ParseUnary();
left = new BinaryOpNode(left, right, op);
}
return left;
}
private AstNode ParseUnary()
{
if (Peek().Value is "!" or "~" or "-" or "+" or "*" or "&" or "++" or "--")
{
var op = Advance().Value;
var operand = ParseUnary();
return new UnaryOpNode(operand, op, true);
}
return ParsePostfix();
}
private AstNode ParsePostfix()
{
var expr = ParsePrimary();
while (true)
{
if (Peek().Value == "(")
{
// Function call
Advance();
var args = ParseArgumentList();
Expect(TokenType.Bracket, ")");
if (expr is VariableNode varNode)
{
expr = new CallNode(varNode.Name, args);
}
}
else if (Peek().Value == "[")
{
// Array access
Advance();
var index = ParseExpression();
Expect(TokenType.Bracket, "]");
expr = new ArrayAccessNode(expr, index);
}
else if (Peek().Value is "." or "->")
{
var isPointer = Advance().Value == "->";
var field = Expect(TokenType.Identifier).Value;
expr = new FieldAccessNode(expr, field, isPointer);
}
else if (Peek().Value is "++" or "--")
{
var op = Advance().Value;
expr = new UnaryOpNode(expr, op, false);
}
else
{
break;
}
}
return expr;
}
private ImmutableArray<AstNode> ParseArgumentList()
{
var args = new List<AstNode>();
if (Peek().Value == ")")
{
return [];
}
do
{
if (Peek().Value == ",")
{
Advance();
}
args.Add(ParseExpression());
}
while (Peek().Value == ",");
return [.. args];
}
private AstNode ParsePrimary()
{
var token = Peek();
if (token.Type == TokenType.Number)
{
Advance();
return new ConstantNode(token.Value, "int");
}
if (token.Type == TokenType.String)
{
Advance();
return new ConstantNode(token.Value, "char*");
}
if (token.Type == TokenType.Char)
{
Advance();
return new ConstantNode(token.Value, "char");
}
if (token.Type == TokenType.Identifier)
{
Advance();
return new VariableNode(token.Value, null);
}
if (token.Value == "(")
{
Advance();
// Check for cast
if (IsType(Peek().Value))
{
var targetType = ParseType();
Expect(TokenType.Bracket, ")");
var expr = ParseUnary();
return new CastNode(expr, targetType);
}
var inner = ParseExpression();
Expect(TokenType.Bracket, ")");
return inner;
}
// Handle sizeof
if (token.Value == "sizeof")
{
Advance();
Expect(TokenType.Bracket, "(");
var type = ParseType();
Expect(TokenType.Bracket, ")");
return new ConstantNode($"sizeof({type})", "size_t");
}
// Unknown token - return empty node
Advance();
return new ConstantNode(token.Value, "unknown");
}
private static bool IsType(string value)
{
return value is "int" or "char" or "void" or "long" or "short" or "float" or "double"
or "unsigned" or "signed" or "const" or "struct" or "union" or "enum"
or "undefined" or "undefined1" or "undefined2" or "undefined4" or "undefined8"
or "byte" or "word" or "dword" or "qword" or "pointer" or "code" or "uint" or "ulong";
}
private Token Peek() => _pos < _tokens.Count ? _tokens[_pos] : new Token(TokenType.Punctuation, "", 0, 0);
private Token PeekAhead(int offset) => _pos + offset < _tokens.Count
? _tokens[_pos + offset]
: new Token(TokenType.Punctuation, "", 0, 0);
private Token Advance() => _pos < _tokens.Count ? _tokens[_pos++] : new Token(TokenType.Punctuation, "", 0, 0);
private Token Expect(TokenType type, string? value = null)
{
var token = Peek();
if (token.Type != type || (value is not null && token.Value != value))
{
// Lenient recovery: consume the mismatched token and keep parsing
// rather than throwing on malformed decompiler output.
return Advance();
}
return Advance();
}
}
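// Illustrative sketch (not part of the original change): driving the lenient
// parser with a hand-built token stream for "int f(void) { return 0; }".
// The real token stream comes from the tokenizer in DecompiledCodeParser;
// the literal tokens below are assumptions made for this example.
internal static class RecursiveParserExample
{
    public static FunctionNode ParseTrivialFunction()
    {
        var tokens = new List<Token>
        {
            new(TokenType.Keyword, "int", 1, 1),
            new(TokenType.Identifier, "f", 1, 5),
            new(TokenType.Bracket, "(", 1, 6),
            new(TokenType.Keyword, "void", 1, 7),
            new(TokenType.Bracket, ")", 1, 11),
            new(TokenType.Bracket, "{", 1, 13),
            new(TokenType.Keyword, "return", 1, 15),
            new(TokenType.Number, "0", 1, 22),
            new(TokenType.Punctuation, ";", 1, 23),
            new(TokenType.Bracket, "}", 1, 25),
        };
        // ParseFunction always yields a FunctionNode, so the cast is safe.
        return (FunctionNode)new RecursiveParser(tokens).ParseFunction();
    }
}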

View File

@@ -0,0 +1,53 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using Microsoft.Extensions.DependencyInjection;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// Extension methods for registering decompiler services.
/// </summary>
public static class DecompilerServiceCollectionExtensions
{
/// <summary>
/// Adds decompiler services to the service collection.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddDecompilerServices(this IServiceCollection services)
{
ArgumentNullException.ThrowIfNull(services);
// Register parser
services.AddSingleton<IDecompiledCodeParser, DecompiledCodeParser>();
// Register comparison engine
services.AddSingleton<IAstComparisonEngine, AstComparisonEngine>();
// Register normalizer
services.AddSingleton<ICodeNormalizer, CodeNormalizer>();
// Register decompiler service
services.AddScoped<IDecompilerService, GhidraDecompilerAdapter>();
return services;
}
/// <summary>
/// Adds decompiler services with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure decompiler options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddDecompilerServices(
this IServiceCollection services,
Action<DecompilerOptions> configureOptions)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(configureOptions);
services.Configure(configureOptions);
return services.AddDecompilerServices();
}
}
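// Illustrative wiring sketch (not part of the original change): a host that
// wants a longer decompilation budget can use the configuring overload.
internal static class DecompilerWiringExample
{
    public static IServiceCollection AddDecompilerWithLongTimeout(IServiceCollection services)
        => services.AddDecompilerServices(options =>
            options.DefaultTimeout = TimeSpan.FromSeconds(60));
}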

View File

@@ -0,0 +1,291 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.Ghidra;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// Adapter for Ghidra's decompiler via headless analysis.
/// </summary>
public sealed class GhidraDecompilerAdapter : IDecompilerService
{
private readonly IGhidraService _ghidraService;
private readonly IDecompiledCodeParser _parser;
private readonly IAstComparisonEngine _comparisonEngine;
private readonly DecompilerOptions _options;
private readonly ILogger<GhidraDecompilerAdapter> _logger;
public GhidraDecompilerAdapter(
IGhidraService ghidraService,
IDecompiledCodeParser parser,
IAstComparisonEngine comparisonEngine,
IOptions<DecompilerOptions> options,
ILogger<GhidraDecompilerAdapter> logger)
{
_ghidraService = ghidraService;
_parser = parser;
_comparisonEngine = comparisonEngine;
_options = options.Value;
_logger = logger;
}
/// <inheritdoc />
// No awaits in this path; return a completed task to avoid CS1998, which
// this project's TreatWarningsAsErrors would promote to a build error.
public Task<DecompiledFunction> DecompileAsync(
GhidraFunction function,
DecompileOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(function);
options ??= new DecompileOptions();
_logger.LogDebug(
"Decompiling function {Name} at 0x{Address:X}",
function.Name,
function.Address);
// The GhidraFunction should already have decompiled code from analysis
var code = function.DecompiledCode;
if (string.IsNullOrEmpty(code))
{
_logger.LogWarning(
"Function {Name} has no decompiled code, returning stub",
function.Name);
return Task.FromResult(new DecompiledFunction(
    function.Name,
    BuildSignature(function),
    "/* Decompilation unavailable */",
    null,
    [],
    [],
    function.Address,
    function.Size));
}
// Truncate if too long
if (code.Length > options.MaxCodeLength)
{
code = code[..options.MaxCodeLength] + "\n/* ... truncated ... */";
}
// Parse to AST
DecompiledAst? ast = null;
try
{
ast = _parser.Parse(code);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to parse decompiled code for {Name}", function.Name);
}
// Extract metadata
var locals = _parser.ExtractVariables(code);
var calledFunctions = _parser.ExtractCalledFunctions(code);
return Task.FromResult(new DecompiledFunction(
    function.Name,
    BuildSignature(function),
    code,
    ast,
    locals,
    calledFunctions,
    function.Address,
    function.Size));
}
/// <inheritdoc />
public async Task<DecompiledFunction> DecompileAtAddressAsync(
string binaryPath,
ulong address,
DecompileOptions? options = null,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(binaryPath);
options ??= new DecompileOptions();
_logger.LogDebug(
"Decompiling function at 0x{Address:X} in {Binary}",
address,
Path.GetFileName(binaryPath));
// Use Ghidra to analyze and get the function
using var stream = File.OpenRead(binaryPath);
var analysis = await _ghidraService.AnalyzeAsync(
stream,
new GhidraAnalysisOptions
{
IncludeDecompilation = true,
ExtractDecompilation = true
},
ct);
var function = analysis.Functions.FirstOrDefault(f => f.Address == address);
if (function is null)
{
throw new InvalidOperationException($"No function found at address 0x{address:X}");
}
return await DecompileAsync(function, options, ct);
}
/// <inheritdoc />
public Task<DecompiledAst> ParseToAstAsync(
string decompiledCode,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(decompiledCode);
ct.ThrowIfCancellationRequested();
var ast = _parser.Parse(decompiledCode);
return Task.FromResult(ast);
}
/// <inheritdoc />
public Task<DecompiledComparisonResult> CompareAsync(
DecompiledFunction a,
DecompiledFunction b,
ComparisonOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(a);
ArgumentNullException.ThrowIfNull(b);
options ??= new ComparisonOptions();
ct.ThrowIfCancellationRequested();
_logger.LogDebug(
"Comparing functions {A} and {B}",
a.FunctionName,
b.FunctionName);
// Need ASTs for comparison
if (a.Ast is null || b.Ast is null)
{
_logger.LogWarning("Cannot compare functions without ASTs");
return Task.FromResult(new DecompiledComparisonResult(
Similarity: 0,
StructuralSimilarity: 0,
SemanticSimilarity: 0,
EditDistance: new AstEditDistance(0, 0, 0, 0, 1.0m),
Equivalences: [],
Differences: [],
Confidence: ComparisonConfidence.Low));
}
// Compute structural similarity
var structuralSimilarity = _comparisonEngine.ComputeStructuralSimilarity(a.Ast, b.Ast);
// Compute edit distance
var editDistance = _comparisonEngine.ComputeEditDistance(a.Ast, b.Ast);
// Find semantic equivalences
var equivalences = _comparisonEngine.FindEquivalences(a.Ast, b.Ast);
// Find differences
var differences = _comparisonEngine.FindDifferences(a.Ast, b.Ast);
// Compute semantic similarity from equivalences
var totalNodes = Math.Max(a.Ast.NodeCount, b.Ast.NodeCount);
var equivalentNodes = equivalences.Length;
var semanticSimilarity = totalNodes > 0
? (decimal)equivalentNodes / totalNodes
: 0m;
// Combine into overall similarity
var overallSimilarity = ComputeOverallSimilarity(
structuralSimilarity,
semanticSimilarity,
editDistance.NormalizedDistance);
// Determine confidence
var confidence = DetermineConfidence(
overallSimilarity,
a.Ast.NodeCount,
b.Ast.NodeCount,
equivalences.Length);
return Task.FromResult(new DecompiledComparisonResult(
Similarity: overallSimilarity,
StructuralSimilarity: structuralSimilarity,
SemanticSimilarity: semanticSimilarity,
EditDistance: editDistance,
Equivalences: equivalences,
Differences: differences,
Confidence: confidence));
}
private static string BuildSignature(GhidraFunction function)
{
// Use the signature from Ghidra if available, otherwise construct a simple one
if (!string.IsNullOrEmpty(function.Signature))
{
return function.Signature;
}
// Default signature if none available
return $"void {function.Name}(void)";
}
private static decimal ComputeOverallSimilarity(
decimal structural,
decimal semantic,
decimal normalizedEditDistance)
{
// Weight: 40% structural, 40% semantic, 20% edit distance (inverted)
var editSimilarity = 1.0m - normalizedEditDistance;
return structural * 0.4m + semantic * 0.4m + editSimilarity * 0.2m;
}
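// Worked example (illustrative numbers): structural = 0.80, semantic = 0.60,
// normalized edit distance = 0.30 (edit similarity 0.70) gives
// 0.80*0.4 + 0.60*0.4 + 0.70*0.2 = 0.70 overall.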
private static ComparisonConfidence DetermineConfidence(
decimal similarity,
int nodeCountA,
int nodeCountB,
int equivalenceCount)
{
// Very small functions are harder to compare confidently
var minNodes = Math.Min(nodeCountA, nodeCountB);
if (minNodes < 5)
{
return ComparisonConfidence.Low;
}
// High similarity with many equivalences = high confidence
if (similarity > 0.9m && equivalenceCount > minNodes * 0.7)
{
return ComparisonConfidence.VeryHigh;
}
if (similarity > 0.7m && equivalenceCount > minNodes * 0.5)
{
return ComparisonConfidence.High;
}
if (similarity > 0.5m)
{
return ComparisonConfidence.Medium;
}
return ComparisonConfidence.Low;
}
}
/// <summary>
/// Options for the decompiler adapter.
/// </summary>
public sealed class DecompilerOptions
{
public string GhidraScriptsPath { get; set; } = "/scripts";
public TimeSpan DefaultTimeout { get; set; } = TimeSpan.FromSeconds(30);
public int MaxCodeLength { get; set; } = 100_000;
}

View File

@@ -0,0 +1,157 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Ghidra;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// Service for decompiling binary functions to C-like pseudo-code.
/// </summary>
public interface IDecompilerService
{
/// <summary>
/// Decompile a function to C-like pseudo-code.
/// </summary>
/// <param name="function">Function from Ghidra analysis.</param>
/// <param name="options">Decompilation options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Decompiled function with code and optional AST.</returns>
Task<DecompiledFunction> DecompileAsync(
GhidraFunction function,
DecompileOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Decompile a function by address.
/// </summary>
/// <param name="binaryPath">Path to the binary file.</param>
/// <param name="address">Function address.</param>
/// <param name="options">Decompilation options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Decompiled function.</returns>
Task<DecompiledFunction> DecompileAtAddressAsync(
string binaryPath,
ulong address,
DecompileOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Parse decompiled code into AST.
/// </summary>
/// <param name="decompiledCode">C-like pseudo-code from decompiler.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Abstract syntax tree representation.</returns>
Task<DecompiledAst> ParseToAstAsync(
string decompiledCode,
CancellationToken ct = default);
/// <summary>
/// Compare two decompiled functions for semantic equivalence.
/// </summary>
/// <param name="a">First function.</param>
/// <param name="b">Second function.</param>
/// <param name="options">Comparison options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Comparison result with similarity metrics.</returns>
Task<DecompiledComparisonResult> CompareAsync(
DecompiledFunction a,
DecompiledFunction b,
ComparisonOptions? options = null,
CancellationToken ct = default);
}
/// <summary>
/// Engine for comparing AST structures.
/// </summary>
public interface IAstComparisonEngine
{
/// <summary>
/// Compute structural similarity between ASTs.
/// </summary>
/// <param name="a">First AST.</param>
/// <param name="b">Second AST.</param>
/// <returns>Similarity score (0.0 to 1.0).</returns>
decimal ComputeStructuralSimilarity(DecompiledAst a, DecompiledAst b);
/// <summary>
/// Compute edit distance between ASTs.
/// </summary>
/// <param name="a">First AST.</param>
/// <param name="b">Second AST.</param>
/// <returns>Edit distance metrics.</returns>
AstEditDistance ComputeEditDistance(DecompiledAst a, DecompiledAst b);
/// <summary>
/// Find semantic equivalences between ASTs.
/// </summary>
/// <param name="a">First AST.</param>
/// <param name="b">Second AST.</param>
/// <returns>List of equivalent node pairs.</returns>
ImmutableArray<SemanticEquivalence> FindEquivalences(DecompiledAst a, DecompiledAst b);
/// <summary>
/// Find differences between ASTs.
/// </summary>
/// <param name="a">First AST.</param>
/// <param name="b">Second AST.</param>
/// <returns>List of differences.</returns>
ImmutableArray<CodeDifference> FindDifferences(DecompiledAst a, DecompiledAst b);
}
/// <summary>
/// Normalizes decompiled code for comparison.
/// </summary>
public interface ICodeNormalizer
{
/// <summary>
/// Normalize decompiled code for comparison.
/// </summary>
/// <param name="code">Raw decompiled code.</param>
/// <param name="options">Normalization options.</param>
/// <returns>Normalized code.</returns>
string Normalize(string code, NormalizationOptions? options = null);
/// <summary>
/// Compute canonical hash of normalized code.
/// </summary>
/// <param name="code">Decompiled code.</param>
/// <returns>32-byte hash.</returns>
byte[] ComputeCanonicalHash(string code);
/// <summary>
/// Normalize an AST for comparison.
/// </summary>
/// <param name="ast">AST to normalize.</param>
/// <param name="options">Normalization options.</param>
/// <returns>Normalized AST.</returns>
DecompiledAst NormalizeAst(DecompiledAst ast, NormalizationOptions? options = null);
}
/// <summary>
/// Parses decompiled C-like code into AST.
/// </summary>
public interface IDecompiledCodeParser
{
/// <summary>
/// Parse decompiled code into AST.
/// </summary>
/// <param name="code">C-like pseudo-code.</param>
/// <returns>Parsed AST.</returns>
DecompiledAst Parse(string code);
/// <summary>
/// Extract local variables from decompiled code.
/// </summary>
/// <param name="code">C-like pseudo-code.</param>
/// <returns>List of local variables.</returns>
ImmutableArray<LocalVariable> ExtractVariables(string code);
/// <summary>
/// Extract called functions from decompiled code.
/// </summary>
/// <param name="code">C-like pseudo-code.</param>
/// <returns>List of function names called.</returns>
ImmutableArray<string> ExtractCalledFunctions(string code);
}
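// Illustrative sketch (not part of the original change): gating a match on
// the comparison result. The threshold and confidence floor are assumptions.
internal static class ComparisonExample
{
    public static async Task<bool> AreLikelySameAsync(
        IDecompilerService decompiler,
        DecompiledFunction a,
        DecompiledFunction b,
        CancellationToken ct)
    {
        var options = new ComparisonOptions { MinSimilarityThreshold = 0.6m };
        var result = await decompiler.CompareAsync(a, b, options, ct);
        return result.Similarity >= options.MinSimilarityThreshold
            && result.Confidence >= ComparisonConfidence.Medium;
    }
}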

View File

@@ -0,0 +1,377 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Decompiler;
/// <summary>
/// A function decompiled to C-like pseudo-code.
/// </summary>
public sealed record DecompiledFunction(
string FunctionName,
string Signature,
string Code,
DecompiledAst? Ast,
ImmutableArray<LocalVariable> Locals,
ImmutableArray<string> CalledFunctions,
ulong Address,
int SizeBytes);
/// <summary>
/// AST representation of decompiled code.
/// </summary>
public sealed record DecompiledAst(
AstNode Root,
int NodeCount,
int Depth,
ImmutableArray<AstPattern> Patterns);
/// <summary>
/// Abstract syntax tree node.
/// </summary>
public abstract record AstNode(
AstNodeType Type,
ImmutableArray<AstNode> Children,
SourceLocation? Location);
/// <summary>
/// Types of AST nodes.
/// </summary>
public enum AstNodeType
{
// Structure
Function,
Block,
Parameter,
// Control flow
If,
While,
For,
DoWhile,
Switch,
Case,
Default,
Return,
Break,
Continue,
Goto,
Label,
// Expressions
Assignment,
BinaryOp,
UnaryOp,
TernaryOp,
Call,
Cast,
Sizeof,
// Operands
Variable,
Constant,
StringLiteral,
ArrayAccess,
FieldAccess,
PointerDeref,
AddressOf,
// Declarations
VariableDecl,
TypeDef
}
/// <summary>
/// Source location in decompiled code.
/// </summary>
public sealed record SourceLocation(int Line, int Column, int Length);
/// <summary>
/// A local variable in decompiled code.
/// </summary>
public sealed record LocalVariable(
string Name,
string Type,
int StackOffset,
bool IsParameter,
int? ParameterIndex);
/// <summary>
/// A recognized code pattern.
/// </summary>
public sealed record AstPattern(
PatternType Type,
AstNode Node,
PatternMetadata? Metadata);
/// <summary>
/// Types of code patterns.
/// </summary>
public enum PatternType
{
// Loops
CountedLoop,
ConditionalLoop,
InfiniteLoop,
LoopUnrolled,
// Branches
IfElseChain,
SwitchTable,
ShortCircuit,
// Memory
MemoryAllocation,
MemoryDeallocation,
BufferOperation,
StackBuffer,
// Error handling
ErrorCheck,
NullCheck,
BoundsCheck,
// Idioms
StringOperation,
MathOperation,
BitwiseOperation,
TableLookup
}
/// <summary>
/// Metadata about a recognized pattern.
/// </summary>
public sealed record PatternMetadata(
string Description,
decimal Confidence,
ImmutableDictionary<string, string>? Properties);
/// <summary>
/// Result of comparing two decompiled functions.
/// </summary>
public sealed record DecompiledComparisonResult(
decimal Similarity,
decimal StructuralSimilarity,
decimal SemanticSimilarity,
AstEditDistance EditDistance,
ImmutableArray<SemanticEquivalence> Equivalences,
ImmutableArray<CodeDifference> Differences,
ComparisonConfidence Confidence);
/// <summary>
/// Edit distance between ASTs.
/// </summary>
public sealed record AstEditDistance(
int Insertions,
int Deletions,
int Modifications,
int TotalOperations,
decimal NormalizedDistance);
/// <summary>
/// A semantic equivalence between AST nodes.
/// </summary>
public sealed record SemanticEquivalence(
AstNode NodeA,
AstNode NodeB,
EquivalenceType Type,
decimal Confidence,
string? Explanation);
/// <summary>
/// Types of semantic equivalence.
/// </summary>
public enum EquivalenceType
{
Identical,
Renamed,
Reordered,
Optimized,
Inlined,
Semantically
}
/// <summary>
/// A difference between two pieces of code.
/// </summary>
public sealed record CodeDifference(
DifferenceType Type,
AstNode? NodeA,
AstNode? NodeB,
string Description);
/// <summary>
/// Types of code differences.
/// </summary>
public enum DifferenceType
{
Added,
Removed,
Modified,
Reordered,
TypeChanged,
OptimizationVariant
}
/// <summary>
/// Confidence level for comparison results.
/// </summary>
public enum ComparisonConfidence
{
Low,
Medium,
High,
VeryHigh
}
/// <summary>
/// Options for decompilation.
/// </summary>
public sealed record DecompileOptions
{
public bool SimplifyCode { get; init; } = true;
public bool RecoverTypes { get; init; } = true;
public bool RecoverStructs { get; init; } = true;
public int MaxCodeLength { get; init; } = 100_000;
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(30);
}
/// <summary>
/// Options for AST comparison.
/// </summary>
public sealed record ComparisonOptions
{
public bool IgnoreVariableNames { get; init; } = true;
public bool IgnoreConstants { get; init; } = false;
public bool DetectOptimizations { get; init; } = true;
public decimal MinSimilarityThreshold { get; init; } = 0.5m;
}
/// <summary>
/// Options for code normalization.
/// </summary>
public sealed record NormalizationOptions
{
public bool NormalizeVariables { get; init; } = true;
public bool NormalizeFunctionCalls { get; init; } = true;
public bool NormalizeConstants { get; init; } = false;
public bool NormalizeWhitespace { get; init; } = true;
public bool SortIndependentStatements { get; init; } = false;
public ImmutableHashSet<string>? KnownFunctions { get; init; }
public static NormalizationOptions Default { get; } = new();
}
#region Concrete AST Node Types
public sealed record FunctionNode(
string Name,
string ReturnType,
ImmutableArray<ParameterNode> Parameters,
BlockNode Body,
SourceLocation? Location = null)
: AstNode(AstNodeType.Function, [Body, .. Parameters], Location);
public sealed record ParameterNode(
string Name,
string DataType,
int Index,
SourceLocation? Location = null)
: AstNode(AstNodeType.Parameter, [], Location);
public sealed record BlockNode(
ImmutableArray<AstNode> Statements,
SourceLocation? Location = null)
: AstNode(AstNodeType.Block, Statements, Location);
public sealed record IfNode(
AstNode Condition,
AstNode ThenBranch,
AstNode? ElseBranch,
SourceLocation? Location = null)
: AstNode(AstNodeType.If, ElseBranch is null ? [Condition, ThenBranch] : [Condition, ThenBranch, ElseBranch], Location);
public sealed record WhileNode(
AstNode Condition,
AstNode Body,
SourceLocation? Location = null)
: AstNode(AstNodeType.While, [Condition, Body], Location);
public sealed record ForNode(
AstNode? Init,
AstNode? Condition,
AstNode? Update,
AstNode Body,
SourceLocation? Location = null)
: AstNode(AstNodeType.For, [Init ?? EmptyNode.Instance, Condition ?? EmptyNode.Instance, Update ?? EmptyNode.Instance, Body], Location);
public sealed record ReturnNode(
AstNode? Value,
SourceLocation? Location = null)
: AstNode(AstNodeType.Return, Value is null ? [] : [Value], Location);
public sealed record AssignmentNode(
AstNode Target,
AstNode Value,
string Operator,
SourceLocation? Location = null)
: AstNode(AstNodeType.Assignment, [Target, Value], Location);
public sealed record BinaryOpNode(
AstNode Left,
AstNode Right,
string Operator,
SourceLocation? Location = null)
: AstNode(AstNodeType.BinaryOp, [Left, Right], Location);
public sealed record UnaryOpNode(
AstNode Operand,
string Operator,
bool IsPrefix,
SourceLocation? Location = null)
: AstNode(AstNodeType.UnaryOp, [Operand], Location);
public sealed record CallNode(
string FunctionName,
ImmutableArray<AstNode> Arguments,
SourceLocation? Location = null)
: AstNode(AstNodeType.Call, Arguments, Location);
public sealed record VariableNode(
string Name,
string? DataType,
SourceLocation? Location = null)
: AstNode(AstNodeType.Variable, [], Location);
public sealed record ConstantNode(
object Value,
string DataType,
SourceLocation? Location = null)
: AstNode(AstNodeType.Constant, [], Location);
public sealed record ArrayAccessNode(
AstNode Array,
AstNode Index,
SourceLocation? Location = null)
: AstNode(AstNodeType.ArrayAccess, [Array, Index], Location);
public sealed record FieldAccessNode(
AstNode Object,
string FieldName,
bool IsPointer,
SourceLocation? Location = null)
: AstNode(AstNodeType.FieldAccess, [Object], Location);
public sealed record CastNode(
AstNode Expression,
string TargetType,
SourceLocation? Location = null)
: AstNode(AstNodeType.Cast, [Expression], Location);
public sealed record EmptyNode() : AstNode(AstNodeType.Block, [], null)
{
public static EmptyNode Instance { get; } = new();
}
#endregion
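// Illustrative sketch (not part of the original change): hand-building the
// AST for the statement "x = x + 1;" from the concrete node records above.
internal static class AstExample
{
    public static AssignmentNode IncrementX()
    {
        var x = new VariableNode("x", "int");
        var one = new ConstantNode("1", "int");
        return new AssignmentNode(x, new BinaryOpNode(x, one, "+"), "=");
    }
}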

View File

@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Description>Decompiler integration for BinaryIndex semantic analysis. Provides AST-based comparison of decompiled code.</Description>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\StellaOps.BinaryIndex.Ghidra\StellaOps.BinaryIndex.Ghidra.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
</Project>

View File

@@ -7,6 +7,7 @@ using System.Security.Cryptography;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Disassembly;
using StellaOps.BinaryIndex.Normalization;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.DeltaSig;
@@ -17,18 +18,49 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
{
private readonly DisassemblyService _disassemblyService;
private readonly NormalizationService _normalizationService;
private readonly IIrLiftingService? _irLiftingService;
private readonly ISemanticGraphExtractor? _graphExtractor;
private readonly ISemanticFingerprintGenerator? _fingerprintGenerator;
private readonly ILogger<DeltaSignatureGenerator> _logger;
/// <summary>
/// Creates a new delta signature generator without semantic analysis support.
/// </summary>
public DeltaSignatureGenerator(
DisassemblyService disassemblyService,
NormalizationService normalizationService,
ILogger<DeltaSignatureGenerator> logger)
: this(disassemblyService, normalizationService, null, null, null, logger)
{
}
/// <summary>
/// Creates a new delta signature generator with optional semantic analysis support.
/// </summary>
public DeltaSignatureGenerator(
DisassemblyService disassemblyService,
NormalizationService normalizationService,
IIrLiftingService? irLiftingService,
ISemanticGraphExtractor? graphExtractor,
ISemanticFingerprintGenerator? fingerprintGenerator,
ILogger<DeltaSignatureGenerator> logger)
{
_disassemblyService = disassemblyService ?? throw new ArgumentNullException(nameof(disassemblyService));
_normalizationService = normalizationService ?? throw new ArgumentNullException(nameof(normalizationService));
_irLiftingService = irLiftingService;
_graphExtractor = graphExtractor;
_fingerprintGenerator = fingerprintGenerator;
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <summary>
/// Gets a value indicating whether semantic analysis is available.
/// </summary>
public bool SemanticAnalysisAvailable =>
_irLiftingService is not null &&
_graphExtractor is not null &&
_fingerprintGenerator is not null;
/// <inheritdoc />
public async Task<DeltaSignature> GenerateSignaturesAsync(
Stream binaryStream,
@@ -94,11 +126,14 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
}
// Generate signature from normalized bytes
var signature = await GenerateSymbolSignatureAsync(
    normalized,
    symbolName,
    symbolInfo.Section ?? ".text",
    instructions,
    binary.Architecture,
    options,
    ct);
symbolSignatures.Add(signature);
@@ -218,6 +253,136 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
};
}
/// <inheritdoc />
public async Task<SymbolSignature> GenerateSymbolSignatureAsync(
NormalizedFunction normalized,
string symbolName,
string scope,
IReadOnlyList<DisassembledInstruction> originalInstructions,
CpuArchitecture architecture,
SignatureOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(normalized);
ArgumentNullException.ThrowIfNull(symbolName);
ArgumentNullException.ThrowIfNull(scope);
ArgumentNullException.ThrowIfNull(originalInstructions);
options ??= new SignatureOptions();
// Get normalized bytes for hashing
var normalizedBytes = GetNormalizedBytes(normalized);
// Compute the main hash
var hashHex = ComputeHash(normalizedBytes, options.HashAlgorithm);
// Compute chunk hashes for resilience
ImmutableArray<ChunkHash>? chunks = null;
if (options.IncludeChunks && normalizedBytes.Length >= options.ChunkSize)
{
chunks = ComputeChunkHashes(normalizedBytes, options.ChunkSize, options.HashAlgorithm);
}
// Compute CFG metrics using proper CFG analysis
int? bbCount = null;
string? cfgEdgeHash = null;
if (options.IncludeCfg && normalized.Instructions.Length > 0)
{
// Use first instruction's address as start address
var startAddress = normalized.Instructions[0].OriginalAddress;
var cfgMetrics = CfgExtractor.ComputeMetrics(
normalized.Instructions.ToList(),
startAddress);
bbCount = cfgMetrics.BasicBlockCount;
cfgEdgeHash = cfgMetrics.EdgeHash;
}
// Compute semantic fingerprint if enabled and services available
string? semanticHashHex = null;
ImmutableArray<string>? semanticApiCalls = null;
if (options.IncludeSemantic && SemanticAnalysisAvailable && originalInstructions.Count > 0)
{
try
{
var semanticFingerprint = await ComputeSemanticFingerprintAsync(
originalInstructions,
symbolName,
architecture,
ct);
if (semanticFingerprint is not null)
{
semanticHashHex = semanticFingerprint.GraphHashHex;
semanticApiCalls = semanticFingerprint.ApiCalls;
}
}
catch (Exception ex)
{
_logger.LogWarning(
ex,
"Failed to compute semantic fingerprint for {Symbol}, continuing without semantic data",
symbolName);
}
}
return new SymbolSignature
{
Name = symbolName,
Scope = scope,
HashAlg = options.HashAlgorithm,
HashHex = hashHex,
SizeBytes = normalizedBytes.Length,
CfgBbCount = bbCount,
CfgEdgeHash = cfgEdgeHash,
Chunks = chunks,
SemanticHashHex = semanticHashHex,
SemanticApiCalls = semanticApiCalls
};
}
private async Task<SemanticFingerprint?> ComputeSemanticFingerprintAsync(
IReadOnlyList<DisassembledInstruction> instructions,
string functionName,
CpuArchitecture architecture,
CancellationToken ct)
{
if (_irLiftingService is null || _graphExtractor is null || _fingerprintGenerator is null)
{
return null;
}
// Check if architecture is supported
if (!_irLiftingService.SupportsArchitecture(architecture))
{
_logger.LogDebug(
"Architecture {Arch} not supported for semantic analysis",
architecture);
return null;
}
// Lift to IR
var startAddress = instructions.Count > 0 ? instructions[0].Address : 0UL;
var lifted = await _irLiftingService.LiftToIrAsync(
instructions,
functionName,
startAddress,
architecture,
ct: ct);
// Extract semantic graph
var graph = await _graphExtractor.ExtractGraphAsync(lifted, ct: ct);
// Generate fingerprint
var fingerprint = await _fingerprintGenerator.GenerateAsync(
graph,
startAddress,
ct: ct);
return fingerprint;
}
private static byte[] GetNormalizedBytes(NormalizedFunction normalized)
{
// Concatenate all normalized instruction bytes

View File

@@ -1,6 +1,7 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using StellaOps.BinaryIndex.Disassembly;
using StellaOps.BinaryIndex.Normalization;
namespace StellaOps.BinaryIndex.DeltaSig;
@@ -49,4 +50,24 @@ public interface IDeltaSignatureGenerator
string symbolName,
string scope,
SignatureOptions? options = null);
/// <summary>
/// Generates a signature for a single symbol with optional semantic analysis.
/// </summary>
/// <param name="normalized">The normalized function with instructions.</param>
/// <param name="symbolName">Name of the symbol.</param>
/// <param name="scope">Section containing the symbol.</param>
/// <param name="originalInstructions">Original disassembled instructions for semantic analysis.</param>
/// <param name="architecture">CPU architecture for IR lifting.</param>
/// <param name="options">Generation options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>The symbol signature with CFG metrics and optional semantic fingerprint.</returns>
Task<SymbolSignature> GenerateSymbolSignatureAsync(
NormalizedFunction normalized,
string symbolName,
string scope,
IReadOnlyList<DisassembledInstruction> originalInstructions,
CpuArchitecture architecture,
SignatureOptions? options = null,
CancellationToken ct = default);
}

View File

@@ -13,11 +13,13 @@ namespace StellaOps.BinaryIndex.DeltaSig;
/// <param name="IncludeChunks">Include rolling chunk hashes for resilience.</param>
/// <param name="ChunkSize">Size of rolling chunks in bytes (default 2KB).</param>
/// <param name="HashAlgorithm">Hash algorithm to use (default sha256).</param>
/// <param name="IncludeSemantic">Include IR-level semantic fingerprints for optimization-resilient matching.</param>
public sealed record SignatureOptions(
bool IncludeCfg = true,
bool IncludeChunks = true,
int ChunkSize = 2048,
string HashAlgorithm = "sha256");
string HashAlgorithm = "sha256",
bool IncludeSemantic = false);
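// Example (illustrative): opt in to semantic fingerprints while keeping the
// CFG and chunk-hash defaults.
//   var options = new SignatureOptions(IncludeSemantic: true);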
/// <summary>
/// Request for generating delta signatures from a binary.
@@ -190,6 +192,17 @@ public sealed record SymbolSignature
/// Rolling chunk hashes for resilience against small changes.
/// </summary>
public ImmutableArray<ChunkHash>? Chunks { get; init; }
/// <summary>
/// Semantic fingerprint hash based on IR-level analysis (hex string).
/// Provides resilience against compiler optimizations and instruction reordering.
/// </summary>
public string? SemanticHashHex { get; init; }
/// <summary>
/// API calls extracted from semantic analysis (for semantic anchoring).
/// </summary>
public ImmutableArray<string>? SemanticApiCalls { get; init; }
}
/// <summary>

View File

@@ -2,8 +2,10 @@
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Disassembly;
using StellaOps.BinaryIndex.Normalization;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.DeltaSig;
@@ -15,17 +17,52 @@ public static class ServiceCollectionExtensions
/// <summary>
/// Adds delta signature generation and matching services.
/// Requires disassembly and normalization services to be registered.
/// If semantic services are registered, semantic fingerprinting will be available.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddDeltaSignatures(this IServiceCollection services)
{
services.AddSingleton<IDeltaSignatureGenerator>(sp =>
{
var disassembly = sp.GetRequiredService<DisassemblyService>();
var normalization = sp.GetRequiredService<NormalizationService>();
var logger = sp.GetRequiredService<ILogger<DeltaSignatureGenerator>>();
// Semantic services are optional
var irLifting = sp.GetService<IIrLiftingService>();
var graphExtractor = sp.GetService<ISemanticGraphExtractor>();
var fingerprintGenerator = sp.GetService<ISemanticFingerprintGenerator>();
return new DeltaSignatureGenerator(
disassembly,
normalization,
irLifting,
graphExtractor,
fingerprintGenerator,
logger);
});
services.AddSingleton<IDeltaSignatureMatcher, DeltaSignatureMatcher>();
return services;
}
/// <summary>
/// Adds delta signature services with semantic analysis support enabled.
/// Requires disassembly and normalization services to be registered.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddDeltaSignaturesWithSemantic(this IServiceCollection services)
{
// Register semantic services first
services.AddBinaryIndexSemantic();
// Then register delta signature services
return services.AddDeltaSignatures();
}
/// <summary>
/// Adds all binary index services: disassembly, normalization, and delta signatures.
/// </summary>
@@ -44,4 +81,26 @@ public static class ServiceCollectionExtensions
return services;
}
/// <summary>
/// Adds all binary index services with semantic analysis: disassembly, normalization, semantic, and delta signatures.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddBinaryIndexServicesWithSemantic(this IServiceCollection services)
{
// Add disassembly with default plugins
services.AddDisassemblyServices();
// Add normalization pipelines
services.AddNormalizationPipelines();
// Add semantic analysis services
services.AddBinaryIndexSemantic();
// Add delta signature services (will pick up semantic services)
services.AddDeltaSignatures();
return services;
}
}
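// Illustrative usage (not part of the original change): hosts that want
// optimization-resilient matching can combine the one-call setup with an
// opt-in at signature-generation time:
//   services.AddBinaryIndexServicesWithSemantic();
//   var options = new SignatureOptions(IncludeSemantic: true);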

View File

@@ -14,6 +14,7 @@
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly\StellaOps.BinaryIndex.Disassembly.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.Normalization\StellaOps.BinaryIndex.Normalization.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
</ItemGroup>
<ItemGroup>

View File

@@ -66,4 +66,81 @@ public static class DisassemblyServiceCollectionExtensions
return services;
}
/// <summary>
/// Adds the hybrid disassembly service with fallback logic between plugins.
/// This replaces the standard disassembly service with a hybrid version that
/// automatically falls back to secondary plugins when primary quality is low.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configuration">Configuration for binding options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddHybridDisassemblyServices(
this IServiceCollection services,
IConfiguration configuration)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(configuration);
// Register standard options
services.AddOptions<DisassemblyOptions>()
.Bind(configuration.GetSection(DisassemblyOptions.SectionName))
.ValidateOnStart();
// Register hybrid options
services.AddOptions<HybridDisassemblyOptions>()
.Bind(configuration.GetSection(HybridDisassemblyOptions.SectionName))
.ValidateOnStart();
// Register the plugin registry
services.TryAddSingleton<IDisassemblyPluginRegistry, DisassemblyPluginRegistry>();
// Register hybrid service as IDisassemblyService
services.AddSingleton<HybridDisassemblyService>();
services.AddSingleton<IDisassemblyService>(sp => sp.GetRequiredService<HybridDisassemblyService>());
return services;
}
/// <summary>
/// Adds the hybrid disassembly service with configuration actions.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureHybrid">Action to configure hybrid options.</param>
/// <param name="configureDisassembly">Optional action to configure standard options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddHybridDisassemblyServices(
this IServiceCollection services,
Action<HybridDisassemblyOptions> configureHybrid,
Action<DisassemblyOptions>? configureDisassembly = null)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(configureHybrid);
// Register standard options
if (configureDisassembly != null)
{
services.AddOptions<DisassemblyOptions>()
.Configure(configureDisassembly)
.ValidateOnStart();
}
else
{
services.AddOptions<DisassemblyOptions>();
}
// Register hybrid options
services.AddOptions<HybridDisassemblyOptions>()
.Configure(configureHybrid)
.ValidateOnStart();
// Register the plugin registry
services.TryAddSingleton<IDisassemblyPluginRegistry, DisassemblyPluginRegistry>();
// Register hybrid service as IDisassemblyService
services.AddSingleton<HybridDisassemblyService>();
services.AddSingleton<IDisassemblyService>(sp => sp.GetRequiredService<HybridDisassemblyService>());
return services;
}
}
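// Illustrative wiring sketch (not part of the original change): enable hybrid
// fallback with an explicit confidence floor. The plugin ID is an assumption.
internal static class HybridWiringExample
{
    public static IServiceCollection AddHybrid(IServiceCollection services)
        => services.AddHybridDisassemblyServices(hybrid =>
        {
            hybrid.FallbackPluginId = "ghidra";
            hybrid.MinConfidenceThreshold = 0.8;
        });
}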

View File

@@ -0,0 +1,572 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.BinaryIndex.Disassembly;
/// <summary>
/// Configuration options for hybrid disassembly with fallback logic.
/// </summary>
public sealed class HybridDisassemblyOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "HybridDisassembly";
/// <summary>
/// Primary plugin ID to try first. If null, auto-selects highest priority plugin.
/// </summary>
public string? PrimaryPluginId { get; set; }
/// <summary>
/// Fallback plugin ID to use when primary fails quality threshold.
/// </summary>
public string? FallbackPluginId { get; set; }
/// <summary>
/// Minimum confidence score (0.0-1.0) required to accept primary plugin results.
/// If primary result confidence is below this, fallback is attempted.
/// </summary>
public double MinConfidenceThreshold { get; set; } = 0.7;
/// <summary>
/// Minimum function discovery count. If primary finds fewer functions, fallback is attempted.
/// </summary>
public int MinFunctionCount { get; set; } = 1;
/// <summary>
/// Minimum instruction decode success rate (0.0-1.0).
/// </summary>
public double MinDecodeSuccessRate { get; set; } = 0.8;
/// <summary>
/// Whether to automatically fallback when primary plugin doesn't support the architecture.
/// </summary>
public bool AutoFallbackOnUnsupported { get; set; } = true;
/// <summary>
/// Whether to enable hybrid fallback logic at all. If false, behaves like standard service.
/// </summary>
public bool EnableFallback { get; set; } = true;
/// <summary>
/// Timeout in seconds for each plugin attempt.
/// </summary>
public int PluginTimeoutSeconds { get; set; } = 120;
}
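// Illustrative appsettings.json fragment (plugin IDs are assumptions) that
// AddHybridDisassemblyServices(configuration) binds to this section:
//
//   "HybridDisassembly": {
//     "PrimaryPluginId": "b2r2",
//     "FallbackPluginId": "ghidra",
//     "MinConfidenceThreshold": 0.7
//   }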
/// <summary>
/// Result of a disassembly operation with quality metrics.
/// </summary>
public sealed record DisassemblyQualityResult
{
/// <summary>
/// The loaded binary information.
/// </summary>
public required BinaryInfo Binary { get; init; }
/// <summary>
/// The plugin that produced this result.
/// </summary>
public required IDisassemblyPlugin Plugin { get; init; }
/// <summary>
/// Discovered code regions.
/// </summary>
public required ImmutableArray<CodeRegion> CodeRegions { get; init; }
/// <summary>
/// Discovered symbols/functions.
/// </summary>
public required ImmutableArray<SymbolInfo> Symbols { get; init; }
/// <summary>
/// Total instructions disassembled across all regions.
/// </summary>
public int TotalInstructions { get; init; }
/// <summary>
/// Successfully decoded instructions count.
/// </summary>
public int DecodedInstructions { get; init; }
/// <summary>
/// Failed/invalid instruction count.
/// </summary>
public int FailedInstructions { get; init; }
/// <summary>
/// Confidence score (0.0-1.0) based on quality metrics.
/// </summary>
public double Confidence { get; init; }
/// <summary>
/// Whether this result came from a fallback plugin.
/// </summary>
public bool UsedFallback { get; init; }
/// <summary>
/// Reason for fallback if applicable.
/// </summary>
public string? FallbackReason { get; init; }
/// <summary>
/// Decode success rate (DecodedInstructions / TotalInstructions).
/// </summary>
public double DecodeSuccessRate =>
TotalInstructions > 0 ? (double)DecodedInstructions / TotalInstructions : 0.0;
}
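// Illustrative sketch (not part of the original change): probing quality
// metrics from the hybrid service. "bytes" is assumed to hold a binary image.
internal static class QualityProbeExample
{
    public static void Probe(HybridDisassemblyService service, byte[] bytes, ILogger logger)
    {
        var result = service.LoadBinaryWithQuality(bytes);
        logger.LogInformation(
            "Plugin {Plugin}: confidence {Confidence:P1}, decode rate {Rate:P1}, fallback={Fallback}",
            result.Plugin.Capabilities.PluginId,
            result.Confidence,
            result.DecodeSuccessRate,
            result.UsedFallback);
    }
}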
/// <summary>
/// Hybrid disassembly service that implements smart routing between plugins
/// with quality-based fallback logic (e.g., B2R2 primary -> Ghidra fallback).
/// </summary>
public sealed class HybridDisassemblyService : IDisassemblyService
{
private readonly IDisassemblyPluginRegistry _registry;
private readonly HybridDisassemblyOptions _options;
private readonly ILogger<HybridDisassemblyService> _logger;
/// <summary>
/// Creates a new hybrid disassembly service.
/// </summary>
/// <param name="registry">The plugin registry.</param>
/// <param name="options">Hybrid options.</param>
/// <param name="logger">Logger instance.</param>
public HybridDisassemblyService(
IDisassemblyPluginRegistry registry,
IOptions<HybridDisassemblyOptions> options,
ILogger<HybridDisassemblyService> logger)
{
_registry = registry ?? throw new ArgumentNullException(nameof(registry));
_options = options?.Value ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public IDisassemblyPluginRegistry Registry => _registry;
/// <inheritdoc />
public (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadBinary(Stream stream, string? preferredPluginId = null)
{
ArgumentNullException.ThrowIfNull(stream);
using var memStream = new MemoryStream();
stream.CopyTo(memStream);
return LoadBinary(memStream.ToArray(), preferredPluginId);
}
/// <inheritdoc />
public (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadBinary(ReadOnlySpan<byte> bytes, string? preferredPluginId = null)
{
// Detect format/architecture
var format = DetectFormat(bytes);
var architecture = DetectArchitecture(bytes, format);
_logger.LogDebug(
"Hybrid service: Detected format {Format} and architecture {Arch}",
format, architecture);
if (!_options.EnableFallback)
{
// Simple mode - just use the best plugin
return LoadWithBestPlugin(bytes, architecture, format, preferredPluginId);
}
// Hybrid mode with fallback logic
return LoadWithFallback(bytes, architecture, format, preferredPluginId);
}
/// <summary>
/// Loads binary with quality assessment and returns detailed quality result.
/// </summary>
/// <param name="bytes">The binary data.</param>
/// <param name="preferredPluginId">Optional preferred plugin ID.</param>
/// <returns>A quality result with metrics and fallback info.</returns>
public DisassemblyQualityResult LoadBinaryWithQuality(ReadOnlySpan<byte> bytes, string? preferredPluginId = null)
{
var format = DetectFormat(bytes);
var architecture = DetectArchitecture(bytes, format);
// Try primary plugin
var primaryPlugin = GetPrimaryPlugin(architecture, format, preferredPluginId);
if (primaryPlugin is null)
{
throw new NotSupportedException(
$"No disassembly plugin available for architecture {architecture} and format {format}");
}
var primaryResult = AssessQuality(primaryPlugin, bytes, architecture, format);
// Check if primary meets quality threshold
if (MeetsQualityThreshold(primaryResult))
{
_logger.LogInformation(
"Primary plugin {Plugin} met quality threshold (confidence: {Confidence:P1})",
primaryPlugin.Capabilities.PluginId, primaryResult.Confidence);
return primaryResult;
}
// Try fallback
if (!_options.EnableFallback)
{
_logger.LogWarning(
"Primary plugin {Plugin} below threshold (confidence: {Confidence:P1}), fallback disabled",
primaryPlugin.Capabilities.PluginId, primaryResult.Confidence);
return primaryResult;
}
var fallbackPlugin = GetFallbackPlugin(primaryPlugin, architecture, format);
if (fallbackPlugin is null)
{
_logger.LogWarning(
"No fallback plugin available for {Arch}/{Format}",
architecture, format);
return primaryResult;
}
var fallbackResult = AssessQuality(fallbackPlugin, bytes, architecture, format);
// Use fallback if it's better
if (fallbackResult.Confidence > primaryResult.Confidence)
{
_logger.LogInformation(
"Using fallback plugin {Plugin} (confidence: {Confidence:P1} > primary: {PrimaryConf:P1})",
fallbackPlugin.Capabilities.PluginId, fallbackResult.Confidence, primaryResult.Confidence);
return fallbackResult with
{
UsedFallback = true,
FallbackReason = $"Primary confidence ({primaryResult.Confidence:P1}) below threshold"
};
}
_logger.LogDebug(
"Keeping primary plugin result (confidence: {Confidence:P1})",
primaryResult.Confidence);
return primaryResult;
}
#region Private Methods
private (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadWithBestPlugin(
ReadOnlySpan<byte> bytes,
CpuArchitecture architecture,
BinaryFormat format,
string? preferredPluginId)
{
var plugin = GetPluginById(preferredPluginId) ?? _registry.FindPlugin(architecture, format);
if (plugin == null)
{
throw new NotSupportedException(
$"No disassembly plugin available for architecture {architecture} and format {format}");
}
var binary = plugin.LoadBinary(bytes, architecture, format);
return (binary, plugin);
}
private (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadWithFallback(
ReadOnlySpan<byte> bytes,
CpuArchitecture architecture,
BinaryFormat format,
string? preferredPluginId)
{
var primaryPlugin = GetPrimaryPlugin(architecture, format, preferredPluginId);
if (primaryPlugin is null)
{
// No primary, try fallback directly
var fallback = GetFallbackPlugin(null, architecture, format);
if (fallback is null)
{
throw new NotSupportedException(
$"No disassembly plugin available for architecture {architecture} and format {format}");
}
return (fallback.LoadBinary(bytes, architecture, format), fallback);
}
// Check if primary supports this arch/format
if (_options.AutoFallbackOnUnsupported && !primaryPlugin.Capabilities.CanHandle(architecture, format))
{
_logger.LogDebug(
"Primary plugin {Plugin} doesn't support {Arch}/{Format}, using fallback",
primaryPlugin.Capabilities.PluginId, architecture, format);
var fallback = GetFallbackPlugin(primaryPlugin, architecture, format);
if (fallback is not null)
{
return (fallback.LoadBinary(bytes, architecture, format), fallback);
}
}
// Use primary
return (primaryPlugin.LoadBinary(bytes, architecture, format), primaryPlugin);
}
private IDisassemblyPlugin? GetPrimaryPlugin(
CpuArchitecture architecture,
BinaryFormat format,
string? preferredPluginId)
{
// Explicit preferred plugin
if (!string.IsNullOrEmpty(preferredPluginId))
{
return GetPluginById(preferredPluginId);
}
// Configured primary plugin
if (!string.IsNullOrEmpty(_options.PrimaryPluginId))
{
return GetPluginById(_options.PrimaryPluginId);
}
// Auto-select highest priority
return _registry.FindPlugin(architecture, format);
}
private IDisassemblyPlugin? GetFallbackPlugin(
IDisassemblyPlugin? excludePlugin,
CpuArchitecture architecture,
BinaryFormat format)
{
// Explicit fallback plugin
if (!string.IsNullOrEmpty(_options.FallbackPluginId))
{
var fallback = GetPluginById(_options.FallbackPluginId);
if (fallback?.Capabilities.CanHandle(architecture, format) == true)
{
return fallback;
}
}
// Find any other plugin that supports this arch/format
return _registry.Plugins
.Where(p => p != excludePlugin)
.Where(p => p.Capabilities.CanHandle(architecture, format))
.OrderByDescending(p => p.Capabilities.Priority)
.FirstOrDefault();
}
private IDisassemblyPlugin? GetPluginById(string? pluginId)
{
return string.IsNullOrEmpty(pluginId) ? null : _registry.GetPlugin(pluginId);
}
private DisassemblyQualityResult AssessQuality(
IDisassemblyPlugin plugin,
ReadOnlySpan<byte> bytes,
CpuArchitecture architecture,
BinaryFormat format)
{
try
{
var binary = plugin.LoadBinary(bytes, architecture, format);
var codeRegions = plugin.GetCodeRegions(binary).ToImmutableArray();
var symbols = plugin.GetSymbols(binary).ToImmutableArray();
// Assess quality by sampling disassembly
int totalInstructions = 0;
int decodedInstructions = 0;
int failedInstructions = 0;
foreach (var region in codeRegions.Take(3)) // Sample up to 3 regions
{
var instructions = plugin.Disassemble(binary, region).Take(1000).ToList();
totalInstructions += instructions.Count;
foreach (var instr in instructions)
{
if (instr.Mnemonic.Equals("??", StringComparison.Ordinal) ||
instr.Mnemonic.Equals("invalid", StringComparison.OrdinalIgnoreCase) ||
instr.Mnemonic.Equals("db", StringComparison.OrdinalIgnoreCase))
{
failedInstructions++;
}
else
{
decodedInstructions++;
}
}
}
// Calculate confidence
var confidence = CalculateConfidence(
symbols.Length,
decodedInstructions,
failedInstructions,
codeRegions.Length);
return new DisassemblyQualityResult
{
Binary = binary,
Plugin = plugin,
CodeRegions = codeRegions,
Symbols = symbols,
TotalInstructions = totalInstructions,
DecodedInstructions = decodedInstructions,
FailedInstructions = failedInstructions,
Confidence = confidence,
UsedFallback = false
};
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Plugin {Plugin} failed during quality assessment", plugin.Capabilities.PluginId);
return new DisassemblyQualityResult
{
Binary = null!,
Plugin = plugin,
CodeRegions = [],
Symbols = [],
TotalInstructions = 0,
DecodedInstructions = 0,
FailedInstructions = 0,
Confidence = 0.0,
UsedFallback = false,
FallbackReason = $"Plugin failed: {ex.Message}"
};
}
}
private static double CalculateConfidence(
int symbolCount,
int decodedInstructions,
int failedInstructions,
int regionCount)
{
var totalInstructions = decodedInstructions + failedInstructions;
if (totalInstructions == 0)
{
return 0.0;
}
// Decode success rate (weight: 0.5)
var decodeRate = (double)decodedInstructions / totalInstructions;
// Symbol discovery (weight: 0.3)
var symbolScore = Math.Min(1.0, symbolCount / 10.0);
// Region coverage (weight: 0.2)
var regionScore = Math.Min(1.0, regionCount / 5.0);
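        // Worked example (illustrative numbers): 950 decoded vs 50 failed => decodeRate 0.95;
        // 8 symbols => symbolScore 0.8; 4 regions => regionScore 0.8;
        // confidence = (0.95 * 0.5) + (0.8 * 0.3) + (0.8 * 0.2) = 0.875.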
return (decodeRate * 0.5) + (symbolScore * 0.3) + (regionScore * 0.2);
}
private bool MeetsQualityThreshold(DisassemblyQualityResult result)
{
if (result.Confidence < _options.MinConfidenceThreshold)
{
return false;
}
if (result.Symbols.Length < _options.MinFunctionCount)
{
return false;
}
if (result.DecodeSuccessRate < _options.MinDecodeSuccessRate)
{
return false;
}
return true;
}
#region Format/Architecture Detection (copied from DisassemblyService)
private static BinaryFormat DetectFormat(ReadOnlySpan<byte> bytes)
{
if (bytes.Length < 4) return BinaryFormat.Raw;
// ELF magic
if (bytes[0] == 0x7F && bytes[1] == 'E' && bytes[2] == 'L' && bytes[3] == 'F')
return BinaryFormat.ELF;
// PE magic
if (bytes[0] == 'M' && bytes[1] == 'Z')
return BinaryFormat.PE;
// Mach-O magic
if ((bytes[0] == 0xFE && bytes[1] == 0xED && bytes[2] == 0xFA && (bytes[3] == 0xCE || bytes[3] == 0xCF)) ||
(bytes[3] == 0xFE && bytes[2] == 0xED && bytes[1] == 0xFA && (bytes[0] == 0xCE || bytes[0] == 0xCF)))
return BinaryFormat.MachO;
// WASM magic
if (bytes[0] == 0x00 && bytes[1] == 'a' && bytes[2] == 's' && bytes[3] == 'm')
return BinaryFormat.WASM;
return BinaryFormat.Raw;
}
private static CpuArchitecture DetectArchitecture(ReadOnlySpan<byte> bytes, BinaryFormat format)
{
return format switch
{
            BinaryFormat.ELF when bytes.Length > 19 => DetectElfArchitecture(bytes), // needs bytes[19]
BinaryFormat.PE when bytes.Length > 0x40 => DetectPeArchitecture(bytes),
BinaryFormat.MachO when bytes.Length > 8 => DetectMachOArchitecture(bytes),
_ => CpuArchitecture.X86_64
};
}
private static CpuArchitecture DetectElfArchitecture(ReadOnlySpan<byte> bytes)
{
var machine = (ushort)(bytes[18] | (bytes[19] << 8));
return machine switch
{
0x03 => CpuArchitecture.X86,
0x3E => CpuArchitecture.X86_64,
0x28 => CpuArchitecture.ARM32,
0xB7 => CpuArchitecture.ARM64,
0x08 => CpuArchitecture.MIPS32,
0xF3 => CpuArchitecture.RISCV64,
0x14 => CpuArchitecture.PPC32,
0x02 => CpuArchitecture.SPARC,
_ => bytes[4] == 2 ? CpuArchitecture.X86_64 : CpuArchitecture.X86
};
}
private static CpuArchitecture DetectPeArchitecture(ReadOnlySpan<byte> bytes)
{
var peOffset = bytes[0x3C] | (bytes[0x3D] << 8) | (bytes[0x3E] << 16) | (bytes[0x3F] << 24);
if (peOffset < 0 || peOffset + 6 > bytes.Length) return CpuArchitecture.X86;
var machine = (ushort)(bytes[peOffset + 4] | (bytes[peOffset + 5] << 8));
return machine switch
{
0x014c => CpuArchitecture.X86,
0x8664 => CpuArchitecture.X86_64,
0xaa64 => CpuArchitecture.ARM64,
0x01c4 => CpuArchitecture.ARM32,
_ => CpuArchitecture.X86
};
}
private static CpuArchitecture DetectMachOArchitecture(ReadOnlySpan<byte> bytes)
{
bool isBigEndian = bytes[0] == 0xFE;
uint cpuType = isBigEndian
? (uint)((bytes[4] << 24) | (bytes[5] << 16) | (bytes[6] << 8) | bytes[7])
: (uint)(bytes[4] | (bytes[5] << 8) | (bytes[6] << 16) | (bytes[7] << 24));
return cpuType switch
{
0x00000007 => CpuArchitecture.X86,
0x01000007 => CpuArchitecture.X86_64,
0x0000000C => CpuArchitecture.ARM32,
0x0100000C => CpuArchitecture.ARM64,
_ => CpuArchitecture.X86_64
};
}
#endregion
#endregion
}

View File

@@ -0,0 +1,460 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.Ensemble;
/// <summary>
/// Ensemble decision engine that combines syntactic, semantic, and ML signals.
/// </summary>
public sealed class EnsembleDecisionEngine : IEnsembleDecisionEngine
{
private readonly IAstComparisonEngine _astEngine;
private readonly ISemanticMatcher _semanticMatcher;
private readonly IEmbeddingService _embeddingService;
private readonly EnsembleOptions _defaultOptions;
private readonly ILogger<EnsembleDecisionEngine> _logger;
public EnsembleDecisionEngine(
IAstComparisonEngine astEngine,
ISemanticMatcher semanticMatcher,
IEmbeddingService embeddingService,
IOptions<EnsembleOptions> options,
ILogger<EnsembleDecisionEngine> logger)
{
_astEngine = astEngine ?? throw new ArgumentNullException(nameof(astEngine));
_semanticMatcher = semanticMatcher ?? throw new ArgumentNullException(nameof(semanticMatcher));
_embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
_defaultOptions = options?.Value ?? new EnsembleOptions();
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public async Task<EnsembleResult> CompareAsync(
FunctionAnalysis source,
FunctionAnalysis target,
EnsembleOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(source);
ArgumentNullException.ThrowIfNull(target);
ct.ThrowIfCancellationRequested();
options ??= _defaultOptions;
// Check for exact hash match first (optimization)
var exactHashMatch = CheckExactHashMatch(source, target);
// Compute individual signals
var contributions = new List<SignalContribution>();
var availableWeight = 0m;
// Syntactic (AST) signal
var syntacticContribution = ComputeSyntacticSignal(source, target, options);
contributions.Add(syntacticContribution);
if (syntacticContribution.IsAvailable)
{
availableWeight += options.SyntacticWeight;
}
// Semantic (graph) signal
var semanticContribution = await ComputeSemanticSignalAsync(source, target, options, ct);
contributions.Add(semanticContribution);
if (semanticContribution.IsAvailable)
{
availableWeight += options.SemanticWeight;
}
// ML (embedding) signal
var embeddingContribution = ComputeEmbeddingSignal(source, target, options);
contributions.Add(embeddingContribution);
if (embeddingContribution.IsAvailable)
{
availableWeight += options.EmbeddingWeight;
}
// Compute effective weights (normalize if some signals missing)
var effectiveWeights = ComputeEffectiveWeights(contributions, options, availableWeight);
// Update contributions with effective weights
var adjustedContributions = AdjustContributionWeights(contributions, effectiveWeights);
// Compute ensemble score
var ensembleScore = ComputeEnsembleScore(adjustedContributions, exactHashMatch, options);
// Determine match and confidence
var isMatch = ensembleScore >= options.MatchThreshold;
var confidence = DetermineConfidence(ensembleScore, adjustedContributions, exactHashMatch);
var reason = BuildDecisionReason(adjustedContributions, exactHashMatch, isMatch);
var result = new EnsembleResult
{
SourceFunctionId = source.FunctionId,
TargetFunctionId = target.FunctionId,
EnsembleScore = ensembleScore,
Contributions = adjustedContributions.ToImmutableArray(),
IsMatch = isMatch,
Confidence = confidence,
DecisionReason = reason,
ExactHashMatch = exactHashMatch,
AdjustedWeights = effectiveWeights
};
return result;
}
/// <inheritdoc />
public async Task<ImmutableArray<EnsembleResult>> FindMatchesAsync(
FunctionAnalysis query,
IEnumerable<FunctionAnalysis> corpus,
EnsembleOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(query);
ArgumentNullException.ThrowIfNull(corpus);
options ??= _defaultOptions;
var results = new List<EnsembleResult>();
foreach (var candidate in corpus)
{
ct.ThrowIfCancellationRequested();
var result = await CompareAsync(query, candidate, options, ct);
if (result.EnsembleScore >= options.MinimumSignalThreshold)
{
results.Add(result);
}
}
return results
.OrderByDescending(r => r.EnsembleScore)
.Take(options.MaxCandidates)
.ToImmutableArray();
}
/// <inheritdoc />
public async Task<BatchComparisonResult> CompareBatchAsync(
IEnumerable<FunctionAnalysis> sources,
IEnumerable<FunctionAnalysis> targets,
EnsembleOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(sources);
ArgumentNullException.ThrowIfNull(targets);
options ??= _defaultOptions;
var startTime = DateTime.UtcNow;
var results = new List<EnsembleResult>();
var targetList = targets.ToList();
foreach (var source in sources)
{
foreach (var target in targetList)
{
ct.ThrowIfCancellationRequested();
var result = await CompareAsync(source, target, options, ct);
results.Add(result);
}
}
var duration = DateTime.UtcNow - startTime;
var statistics = ComputeStatistics(results);
return new BatchComparisonResult
{
Results = results.ToImmutableArray(),
Statistics = statistics,
Duration = duration
};
}
private static bool CheckExactHashMatch(FunctionAnalysis source, FunctionAnalysis target)
{
if (source.NormalizedCodeHash is null || target.NormalizedCodeHash is null)
{
return false;
}
return source.NormalizedCodeHash.SequenceEqual(target.NormalizedCodeHash);
}
private SignalContribution ComputeSyntacticSignal(
FunctionAnalysis source,
FunctionAnalysis target,
EnsembleOptions options)
{
if (source.Ast is null || target.Ast is null)
{
return new SignalContribution
{
SignalType = SignalType.Syntactic,
RawScore = 0m,
Weight = options.SyntacticWeight,
IsAvailable = false,
Quality = SignalQuality.Unavailable
};
}
var similarity = _astEngine.ComputeStructuralSimilarity(source.Ast, target.Ast);
var quality = AssessAstQuality(source.Ast, target.Ast);
return new SignalContribution
{
SignalType = SignalType.Syntactic,
RawScore = similarity,
Weight = options.SyntacticWeight,
IsAvailable = true,
Quality = quality
};
}
private async Task<SignalContribution> ComputeSemanticSignalAsync(
FunctionAnalysis source,
FunctionAnalysis target,
EnsembleOptions options,
CancellationToken ct)
{
if (source.SemanticGraph is null || target.SemanticGraph is null)
{
return new SignalContribution
{
SignalType = SignalType.Semantic,
RawScore = 0m,
Weight = options.SemanticWeight,
IsAvailable = false,
Quality = SignalQuality.Unavailable
};
}
var similarity = await _semanticMatcher.ComputeGraphSimilarityAsync(
source.SemanticGraph,
target.SemanticGraph,
ct);
var quality = AssessGraphQuality(source.SemanticGraph, target.SemanticGraph);
return new SignalContribution
{
SignalType = SignalType.Semantic,
RawScore = similarity,
Weight = options.SemanticWeight,
IsAvailable = true,
Quality = quality
};
}
private SignalContribution ComputeEmbeddingSignal(
FunctionAnalysis source,
FunctionAnalysis target,
EnsembleOptions options)
{
if (source.Embedding is null || target.Embedding is null)
{
return new SignalContribution
{
SignalType = SignalType.Embedding,
RawScore = 0m,
Weight = options.EmbeddingWeight,
IsAvailable = false,
Quality = SignalQuality.Unavailable
};
}
var similarity = _embeddingService.ComputeSimilarity(
source.Embedding,
target.Embedding,
SimilarityMetric.Cosine);
return new SignalContribution
{
SignalType = SignalType.Embedding,
RawScore = similarity,
Weight = options.EmbeddingWeight,
IsAvailable = true,
Quality = SignalQuality.Normal
};
}
private static SignalQuality AssessAstQuality(DecompiledAst ast1, DecompiledAst ast2)
{
var minNodes = Math.Min(ast1.Root.Children.Length, ast2.Root.Children.Length);
return minNodes switch
{
< 3 => SignalQuality.Low,
< 10 => SignalQuality.Normal,
_ => SignalQuality.High
};
}
private static SignalQuality AssessGraphQuality(KeySemanticsGraph g1, KeySemanticsGraph g2)
{
var minNodes = Math.Min(g1.Nodes.Length, g2.Nodes.Length);
return minNodes switch
{
< 3 => SignalQuality.Low,
< 10 => SignalQuality.Normal,
_ => SignalQuality.High
};
}
private static EffectiveWeights ComputeEffectiveWeights(
List<SignalContribution> contributions,
EnsembleOptions options,
decimal availableWeight)
{
if (!options.AdaptiveWeights || availableWeight >= 0.999m)
{
return new EffectiveWeights(
options.SyntacticWeight,
options.SemanticWeight,
options.EmbeddingWeight);
}
// Redistribute weight from unavailable signals to available ones
var syntactic = contributions.First(c => c.SignalType == SignalType.Syntactic);
var semantic = contributions.First(c => c.SignalType == SignalType.Semantic);
var embedding = contributions.First(c => c.SignalType == SignalType.Embedding);
var syntacticWeight = syntactic.IsAvailable
? options.SyntacticWeight / availableWeight
: 0m;
var semanticWeight = semantic.IsAvailable
? options.SemanticWeight / availableWeight
: 0m;
var embeddingWeight = embedding.IsAvailable
? options.EmbeddingWeight / availableWeight
: 0m;
return new EffectiveWeights(syntacticWeight, semanticWeight, embeddingWeight);
}
private static List<SignalContribution> AdjustContributionWeights(
List<SignalContribution> contributions,
EffectiveWeights weights)
{
return contributions.Select(c => c.SignalType switch
{
SignalType.Syntactic => c with { Weight = weights.Syntactic },
SignalType.Semantic => c with { Weight = weights.Semantic },
SignalType.Embedding => c with { Weight = weights.Embedding },
_ => c
}).ToList();
}
private static decimal ComputeEnsembleScore(
List<SignalContribution> contributions,
bool exactHashMatch,
EnsembleOptions options)
{
var weightedSum = contributions
.Where(c => c.IsAvailable)
.Sum(c => c.WeightedScore);
// Apply exact match boost
if (exactHashMatch && options.UseExactHashMatch)
{
weightedSum = Math.Min(1.0m, weightedSum + options.ExactMatchBoost);
}
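        // Illustrative: the default weights (0.25, 0.35, 0.40) with raw scores (0.90, 0.80, 0.85)
        // give 0.225 + 0.280 + 0.340 = 0.845; an exact-hash boost of 0.10 lifts it to 0.945.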
return Math.Clamp(weightedSum, 0m, 1m);
}
private static ConfidenceLevel DetermineConfidence(
decimal score,
List<SignalContribution> contributions,
bool exactHashMatch)
{
// Exact hash match is very high confidence
if (exactHashMatch)
{
return ConfidenceLevel.VeryHigh;
}
// Count available high-quality signals
var availableCount = contributions.Count(c => c.IsAvailable);
var highQualityCount = contributions.Count(c =>
c.IsAvailable && c.Quality >= SignalQuality.Normal);
// High score with multiple agreeing signals
if (score >= 0.95m && availableCount >= 3)
{
return ConfidenceLevel.VeryHigh;
}
if (score >= 0.90m && highQualityCount >= 2)
{
return ConfidenceLevel.High;
}
if (score >= 0.80m && availableCount >= 2)
{
return ConfidenceLevel.Medium;
}
if (score >= 0.70m)
{
return ConfidenceLevel.Low;
}
return ConfidenceLevel.VeryLow;
}
private static string BuildDecisionReason(
List<SignalContribution> contributions,
bool exactHashMatch,
bool isMatch)
{
if (exactHashMatch)
{
return "Exact normalized code hash match";
}
var availableSignals = contributions
.Where(c => c.IsAvailable)
.Select(c => $"{c.SignalType}: {c.RawScore:P0}")
.ToList();
if (availableSignals.Count == 0)
{
return "No signals available for comparison";
}
var signalSummary = string.Join(", ", availableSignals);
return isMatch
? $"Match based on: {signalSummary}"
: $"No match. Scores: {signalSummary}";
}
private static ComparisonStatistics ComputeStatistics(List<EnsembleResult> results)
{
var matchCount = results.Count(r => r.IsMatch);
var highConfidenceMatches = results.Count(r =>
r.IsMatch && r.Confidence >= ConfidenceLevel.High);
var exactHashMatches = results.Count(r => r.ExactHashMatch);
var averageScore = results.Count > 0
? results.Average(r => r.EnsembleScore)
: 0m;
var confidenceDistribution = results
.GroupBy(r => r.Confidence)
.ToImmutableDictionary(g => g.Key, g => g.Count());
return new ComparisonStatistics
{
TotalComparisons = results.Count,
MatchCount = matchCount,
HighConfidenceMatches = highConfidenceMatches,
ExactHashMatches = exactHashMatches,
AverageScore = averageScore,
ConfidenceDistribution = confidenceDistribution
};
}
}

View File

@@ -0,0 +1,110 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using Microsoft.Extensions.DependencyInjection;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.Ensemble;
/// <summary>
/// Extension methods for registering ensemble services.
/// </summary>
public static class EnsembleServiceCollectionExtensions
{
/// <summary>
/// Adds ensemble decision engine services to the service collection.
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddEnsembleServices(this IServiceCollection services)
{
ArgumentNullException.ThrowIfNull(services);
// Register ensemble components
services.AddScoped<IEnsembleDecisionEngine, EnsembleDecisionEngine>();
services.AddScoped<IFunctionAnalysisBuilder, FunctionAnalysisBuilder>();
services.AddScoped<IWeightTuningService, WeightTuningService>();
return services;
}
/// <summary>
/// Adds ensemble services with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureOptions">Action to configure ensemble options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddEnsembleServices(
this IServiceCollection services,
Action<EnsembleOptions> configureOptions)
{
ArgumentNullException.ThrowIfNull(services);
ArgumentNullException.ThrowIfNull(configureOptions);
services.Configure(configureOptions);
return services.AddEnsembleServices();
}
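    // Example (illustrative) host wiring using this overload:
    //   services.AddEnsembleServices(o => o.MatchThreshold = 0.90m);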
/// <summary>
/// Adds the complete binary similarity stack (Decompiler + ML + Semantic + Ensemble).
/// </summary>
/// <param name="services">The service collection.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddBinarySimilarityServices(this IServiceCollection services)
{
ArgumentNullException.ThrowIfNull(services);
// Add all underlying services
services.AddDecompilerServices();
services.AddMlServices();
services.AddBinaryIndexSemantic();
// Add ensemble on top
services.AddEnsembleServices();
return services;
}
/// <summary>
/// Adds the complete binary similarity stack with custom options.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureEnsemble">Action to configure ensemble options.</param>
/// <param name="configureMl">Action to configure ML options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddBinarySimilarityServices(
this IServiceCollection services,
Action<EnsembleOptions>? configureEnsemble = null,
Action<MlOptions>? configureMl = null)
{
ArgumentNullException.ThrowIfNull(services);
// Add all underlying services
services.AddDecompilerServices();
if (configureMl is not null)
{
services.AddMlServices(configureMl);
}
else
{
services.AddMlServices();
}
services.AddBinaryIndexSemantic();
// Add ensemble with options
if (configureEnsemble is not null)
{
services.AddEnsembleServices(configureEnsemble);
}
else
{
services.AddEnsembleServices();
}
return services;
}
}

View File

@@ -0,0 +1,165 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using Microsoft.Extensions.Logging;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.Ensemble;
/// <summary>
/// Builds complete function analysis from various input sources.
/// </summary>
public sealed class FunctionAnalysisBuilder : IFunctionAnalysisBuilder
{
private readonly IDecompiledCodeParser _parser;
private readonly ICodeNormalizer _normalizer;
private readonly IEmbeddingService _embeddingService;
private readonly IIrLiftingService? _irLiftingService;
private readonly ISemanticGraphExtractor? _graphExtractor;
private readonly ILogger<FunctionAnalysisBuilder> _logger;
public FunctionAnalysisBuilder(
IDecompiledCodeParser parser,
ICodeNormalizer normalizer,
IEmbeddingService embeddingService,
ILogger<FunctionAnalysisBuilder> logger,
IIrLiftingService? irLiftingService = null,
ISemanticGraphExtractor? graphExtractor = null)
{
_parser = parser ?? throw new ArgumentNullException(nameof(parser));
_normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
_embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_irLiftingService = irLiftingService;
_graphExtractor = graphExtractor;
}
/// <inheritdoc />
public async Task<FunctionAnalysis> BuildAnalysisAsync(
string functionId,
string functionName,
string decompiledCode,
ulong? address = null,
int? sizeBytes = null,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(functionId);
ArgumentException.ThrowIfNullOrEmpty(functionName);
ArgumentException.ThrowIfNullOrEmpty(decompiledCode);
ct.ThrowIfCancellationRequested();
_logger.LogDebug(
"Building analysis for function {FunctionId} ({FunctionName})",
functionId, functionName);
// Parse AST
DecompiledAst? ast = null;
try
{
ast = _parser.Parse(decompiledCode);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to parse AST for {FunctionId}", functionId);
}
// Compute normalized hash
byte[]? normalizedHash = null;
try
{
normalizedHash = _normalizer.ComputeCanonicalHash(decompiledCode);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to compute normalized hash for {FunctionId}", functionId);
}
// Build semantic graph (requires IR lifting service and graph extractor)
KeySemanticsGraph? semanticGraph = null;
if (_irLiftingService is not null && _graphExtractor is not null)
{
try
{
// Note: Full semantic graph extraction requires binary bytes,
// not just decompiled code. This is a simplified path that
// sets semanticGraph to null when binary data is not available.
_logger.LogDebug(
"Semantic graph extraction requires binary data for {FunctionId}",
functionId);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to build semantic graph for {FunctionId}", functionId);
}
}
// Generate embedding
FunctionEmbedding? embedding = null;
try
{
var input = new EmbeddingInput(
DecompiledCode: decompiledCode,
SemanticGraph: semanticGraph,
InstructionBytes: null,
PreferredInput: EmbeddingInputType.DecompiledCode);
embedding = await _embeddingService.GenerateEmbeddingAsync(input, ct: ct);
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to generate embedding for {FunctionId}", functionId);
}
return new FunctionAnalysis
{
FunctionId = functionId,
FunctionName = functionName,
Ast = ast,
SemanticGraph = semanticGraph,
Embedding = embedding,
NormalizedCodeHash = normalizedHash,
DecompiledCode = decompiledCode,
Address = address,
SizeBytes = sizeBytes
};
}
/// <inheritdoc />
public FunctionAnalysis BuildFromComponents(
string functionId,
string functionName,
string? decompiledCode = null,
DecompiledAst? ast = null,
KeySemanticsGraph? semanticGraph = null,
FunctionEmbedding? embedding = null)
{
ArgumentException.ThrowIfNullOrEmpty(functionId);
ArgumentException.ThrowIfNullOrEmpty(functionName);
byte[]? normalizedHash = null;
if (decompiledCode is not null)
{
try
{
normalizedHash = _normalizer.ComputeCanonicalHash(decompiledCode);
}
catch
{
// Ignore normalization errors for components
}
}
return new FunctionAnalysis
{
FunctionId = functionId,
FunctionName = functionName,
Ast = ast,
SemanticGraph = semanticGraph,
Embedding = embedding,
NormalizedCodeHash = normalizedHash,
DecompiledCode = decompiledCode
};
}
}

View File

@@ -0,0 +1,129 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Ensemble;
/// <summary>
/// Ensemble decision engine that combines multiple similarity signals
/// to determine function equivalence.
/// </summary>
public interface IEnsembleDecisionEngine
{
/// <summary>
/// Compare two functions using all available signals.
/// </summary>
/// <param name="source">Source function analysis.</param>
/// <param name="target">Target function analysis.</param>
/// <param name="options">Ensemble options (optional).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Ensemble comparison result.</returns>
Task<EnsembleResult> CompareAsync(
FunctionAnalysis source,
FunctionAnalysis target,
EnsembleOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Find the best matches for a function from a corpus.
/// </summary>
/// <param name="query">Query function analysis.</param>
/// <param name="corpus">Corpus of candidate functions.</param>
/// <param name="options">Ensemble options (optional).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Top matching functions.</returns>
Task<ImmutableArray<EnsembleResult>> FindMatchesAsync(
FunctionAnalysis query,
IEnumerable<FunctionAnalysis> corpus,
EnsembleOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Perform batch comparison between two sets of functions.
/// </summary>
/// <param name="sources">Source functions.</param>
/// <param name="targets">Target functions.</param>
/// <param name="options">Ensemble options (optional).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Batch comparison result with statistics.</returns>
Task<BatchComparisonResult> CompareBatchAsync(
IEnumerable<FunctionAnalysis> sources,
IEnumerable<FunctionAnalysis> targets,
EnsembleOptions? options = null,
CancellationToken ct = default);
}
/// <summary>
/// Weight tuning service for optimizing ensemble weights.
/// </summary>
public interface IWeightTuningService
{
/// <summary>
/// Tune weights using grid search over training pairs.
/// </summary>
/// <param name="trainingPairs">Labeled training pairs.</param>
/// <param name="gridStep">Step size for grid search (e.g., 0.05).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Best weights found.</returns>
Task<WeightTuningResult> TuneWeightsAsync(
IEnumerable<EnsembleTrainingPair> trainingPairs,
decimal gridStep = 0.05m,
CancellationToken ct = default);
/// <summary>
/// Evaluate a specific weight combination on training data.
/// </summary>
/// <param name="weights">Weights to evaluate.</param>
/// <param name="trainingPairs">Labeled training pairs.</param>
/// <param name="threshold">Match threshold.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Evaluation metrics.</returns>
Task<WeightEvaluation> EvaluateWeightsAsync(
EffectiveWeights weights,
IEnumerable<EnsembleTrainingPair> trainingPairs,
decimal threshold = 0.85m,
CancellationToken ct = default);
}
/// <summary>
/// Function analysis builder that collects all signal sources.
/// </summary>
public interface IFunctionAnalysisBuilder
{
/// <summary>
/// Build complete function analysis from raw data.
/// </summary>
/// <param name="functionId">Function identifier.</param>
/// <param name="functionName">Function name.</param>
/// <param name="decompiledCode">Raw decompiled code.</param>
/// <param name="address">Function address (optional).</param>
/// <param name="sizeBytes">Function size in bytes (optional).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Complete function analysis.</returns>
Task<FunctionAnalysis> BuildAnalysisAsync(
string functionId,
string functionName,
string decompiledCode,
ulong? address = null,
int? sizeBytes = null,
CancellationToken ct = default);
/// <summary>
/// Build function analysis from existing components.
/// </summary>
/// <param name="functionId">Function identifier.</param>
/// <param name="functionName">Function name.</param>
/// <param name="decompiledCode">Raw decompiled code (optional).</param>
/// <param name="ast">Pre-parsed AST (optional).</param>
/// <param name="semanticGraph">Pre-built semantic graph (optional).</param>
/// <param name="embedding">Pre-computed embedding (optional).</param>
/// <returns>Function analysis.</returns>
FunctionAnalysis BuildFromComponents(
string functionId,
string functionName,
string? decompiledCode = null,
Decompiler.DecompiledAst? ast = null,
Semantic.KeySemanticsGraph? semanticGraph = null,
ML.FunctionEmbedding? embedding = null);
}

View File

@@ -0,0 +1,446 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using StellaOps.BinaryIndex.Decompiler;
using StellaOps.BinaryIndex.ML;
using StellaOps.BinaryIndex.Semantic;
namespace StellaOps.BinaryIndex.Ensemble;
/// <summary>
/// Complete analysis of a function from all signal sources.
/// </summary>
public sealed record FunctionAnalysis
{
/// <summary>
/// Unique identifier for the function.
/// </summary>
public required string FunctionId { get; init; }
/// <summary>
/// Function name if available.
/// </summary>
public required string FunctionName { get; init; }
/// <summary>
/// Decompiled AST representation.
/// </summary>
public DecompiledAst? Ast { get; init; }
/// <summary>
/// Semantic graph representation.
/// </summary>
public KeySemanticsGraph? SemanticGraph { get; init; }
/// <summary>
/// ML embedding representation.
/// </summary>
public FunctionEmbedding? Embedding { get; init; }
/// <summary>
/// Normalized code hash for quick equality check.
/// </summary>
public byte[]? NormalizedCodeHash { get; init; }
/// <summary>
/// Raw decompiled code.
/// </summary>
public string? DecompiledCode { get; init; }
/// <summary>
/// Binary address of the function.
/// </summary>
public ulong? Address { get; init; }
/// <summary>
/// Size of the function in bytes.
/// </summary>
public int? SizeBytes { get; init; }
}
/// <summary>
/// Configuration options for ensemble decision making.
/// </summary>
public sealed class EnsembleOptions
{
/// <summary>
/// Weight for syntactic (AST-based) similarity. Default: 0.25
/// </summary>
public decimal SyntacticWeight { get; set; } = 0.25m;
/// <summary>
/// Weight for semantic (graph-based) similarity. Default: 0.35
/// </summary>
public decimal SemanticWeight { get; set; } = 0.35m;
/// <summary>
/// Weight for ML embedding similarity. Default: 0.40
/// </summary>
public decimal EmbeddingWeight { get; set; } = 0.40m;
/// <summary>
/// Minimum ensemble score to consider functions as matching.
/// </summary>
public decimal MatchThreshold { get; set; } = 0.85m;
/// <summary>
    /// Minimum ensemble score a candidate must reach to be retained when searching a corpus.
/// </summary>
public decimal MinimumSignalThreshold { get; set; } = 0.50m;
/// <summary>
/// Whether to require all three signals for a match decision.
/// </summary>
public bool RequireAllSignals { get; set; } = false;
/// <summary>
/// Whether to use exact hash matching as an optimization.
/// </summary>
public bool UseExactHashMatch { get; set; } = true;
/// <summary>
/// Confidence boost when normalized code hashes match exactly.
/// </summary>
public decimal ExactMatchBoost { get; set; } = 0.10m;
/// <summary>
/// Maximum number of candidate matches to return.
/// </summary>
public int MaxCandidates { get; set; } = 10;
/// <summary>
/// Enable adaptive weight adjustment based on signal quality.
/// </summary>
public bool AdaptiveWeights { get; set; } = true;
/// <summary>
    /// Returns true when the three weights sum to 1.0 (within a 0.001 tolerance).
/// </summary>
public bool AreWeightsValid()
{
var total = SyntacticWeight + SemanticWeight + EmbeddingWeight;
return Math.Abs(total - 1.0m) < 0.001m;
}
/// <summary>
/// Normalizes weights to sum to 1.0.
/// </summary>
public void NormalizeWeights()
{
var total = SyntacticWeight + SemanticWeight + EmbeddingWeight;
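        // e.g. (0.5, 0.5, 0.5) has total 1.5 and normalizes to (1/3, 1/3, 1/3).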
if (total > 0)
{
SyntacticWeight /= total;
SemanticWeight /= total;
EmbeddingWeight /= total;
}
}
}
/// <summary>
/// Result of ensemble comparison between two functions.
/// </summary>
public sealed record EnsembleResult
{
/// <summary>
/// Source function identifier.
/// </summary>
public required string SourceFunctionId { get; init; }
/// <summary>
/// Target function identifier.
/// </summary>
public required string TargetFunctionId { get; init; }
/// <summary>
/// Final ensemble similarity score (0.0 to 1.0).
/// </summary>
public required decimal EnsembleScore { get; init; }
/// <summary>
/// Individual signal contributions.
/// </summary>
public required ImmutableArray<SignalContribution> Contributions { get; init; }
/// <summary>
/// Whether this pair is considered a match based on threshold.
/// </summary>
public required bool IsMatch { get; init; }
/// <summary>
/// Confidence level in the match decision.
/// </summary>
public required ConfidenceLevel Confidence { get; init; }
/// <summary>
/// Reason for the match or non-match decision.
/// </summary>
public string? DecisionReason { get; init; }
/// <summary>
/// Whether exact hash match was detected.
/// </summary>
public bool ExactHashMatch { get; init; }
/// <summary>
/// Effective weights used after adaptive adjustment.
/// </summary>
public EffectiveWeights? AdjustedWeights { get; init; }
}
/// <summary>
/// Contribution of a single signal to the ensemble score.
/// </summary>
public sealed record SignalContribution
{
/// <summary>
/// Type of signal.
/// </summary>
public required SignalType SignalType { get; init; }
/// <summary>
/// Raw similarity score from this signal.
/// </summary>
public required decimal RawScore { get; init; }
/// <summary>
/// Weight applied to this signal.
/// </summary>
public required decimal Weight { get; init; }
/// <summary>
/// Weighted contribution to ensemble score.
/// </summary>
public decimal WeightedScore => RawScore * Weight;
/// <summary>
/// Whether this signal was available for comparison.
/// </summary>
public required bool IsAvailable { get; init; }
/// <summary>
/// Quality assessment of this signal.
/// </summary>
public SignalQuality Quality { get; init; } = SignalQuality.Normal;
}
/// <summary>
/// Type of similarity signal.
/// </summary>
public enum SignalType
{
/// <summary>
/// AST-based syntactic comparison.
/// </summary>
Syntactic,
/// <summary>
/// Semantic graph comparison.
/// </summary>
Semantic,
/// <summary>
/// ML embedding cosine similarity.
/// </summary>
Embedding,
/// <summary>
/// Exact normalized code hash match.
/// </summary>
ExactHash
}
/// <summary>
/// Quality assessment of a signal.
/// </summary>
public enum SignalQuality
{
/// <summary>
/// Signal not available (data missing).
/// </summary>
Unavailable,
/// <summary>
/// Low quality signal (small function, few nodes).
/// </summary>
Low,
/// <summary>
/// Normal quality signal.
/// </summary>
Normal,
/// <summary>
/// High quality signal (rich data, high confidence).
/// </summary>
High
}
/// <summary>
/// Confidence level in a match decision.
/// </summary>
public enum ConfidenceLevel
{
/// <summary>
/// Very low confidence, likely uncertain.
/// </summary>
VeryLow,
/// <summary>
/// Low confidence, needs review.
/// </summary>
Low,
/// <summary>
/// Medium confidence, reasonable certainty.
/// </summary>
Medium,
/// <summary>
/// High confidence, strong match signals.
/// </summary>
High,
/// <summary>
/// Very high confidence, exact or near-exact match.
/// </summary>
VeryHigh
}
/// <summary>
/// Effective weights after adaptive adjustment.
/// </summary>
public sealed record EffectiveWeights(
decimal Syntactic,
decimal Semantic,
decimal Embedding);
/// <summary>
/// Batch comparison result.
/// </summary>
public sealed record BatchComparisonResult
{
/// <summary>
/// All comparison results.
/// </summary>
public required ImmutableArray<EnsembleResult> Results { get; init; }
/// <summary>
/// Summary statistics.
/// </summary>
public required ComparisonStatistics Statistics { get; init; }
/// <summary>
/// Time taken for comparison.
/// </summary>
public required TimeSpan Duration { get; init; }
}
/// <summary>
/// Statistics from batch comparison.
/// </summary>
public sealed record ComparisonStatistics
{
/// <summary>
/// Total number of comparisons performed.
/// </summary>
public required int TotalComparisons { get; init; }
/// <summary>
/// Number of matches found.
/// </summary>
public required int MatchCount { get; init; }
/// <summary>
/// Number of high-confidence matches.
/// </summary>
public required int HighConfidenceMatches { get; init; }
/// <summary>
/// Number of exact hash matches.
/// </summary>
public required int ExactHashMatches { get; init; }
/// <summary>
/// Average ensemble score across all comparisons.
/// </summary>
public required decimal AverageScore { get; init; }
/// <summary>
/// Distribution of confidence levels.
/// </summary>
public required ImmutableDictionary<ConfidenceLevel, int> ConfidenceDistribution { get; init; }
}
/// <summary>
/// Weight tuning result from grid search or optimization.
/// </summary>
public sealed record WeightTuningResult
{
/// <summary>
/// Best weights found.
/// </summary>
public required EffectiveWeights BestWeights { get; init; }
/// <summary>
/// Accuracy achieved with best weights.
/// </summary>
public required decimal Accuracy { get; init; }
/// <summary>
/// Precision achieved with best weights.
/// </summary>
public required decimal Precision { get; init; }
/// <summary>
/// Recall achieved with best weights.
/// </summary>
public required decimal Recall { get; init; }
/// <summary>
/// F1 score achieved with best weights.
/// </summary>
public required decimal F1Score { get; init; }
/// <summary>
/// All weight combinations evaluated.
/// </summary>
public required ImmutableArray<WeightEvaluation> Evaluations { get; init; }
}
/// <summary>
/// Evaluation of a specific weight combination.
/// </summary>
public sealed record WeightEvaluation(
EffectiveWeights Weights,
decimal Accuracy,
decimal Precision,
decimal Recall,
decimal F1Score);
/// <summary>
/// Training pair for weight tuning.
/// </summary>
public sealed record EnsembleTrainingPair
{
/// <summary>
/// First function analysis.
/// </summary>
public required FunctionAnalysis Function1 { get; init; }
/// <summary>
/// Second function analysis.
/// </summary>
public required FunctionAnalysis Function2 { get; init; }
/// <summary>
/// Ground truth: are these functions equivalent?
/// </summary>
public required bool IsEquivalent { get; init; }
/// <summary>
/// Optional similarity label (for regression training).
/// </summary>
public decimal? SimilarityLabel { get; init; }
}

View File

@@ -0,0 +1,26 @@
<!-- Copyright (c) StellaOps. All rights reserved. -->
<!-- Licensed under AGPL-3.0-or-later. See LICENSE in the project root. -->
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<RootNamespace>StellaOps.BinaryIndex.Ensemble</RootNamespace>
<Description>Ensemble decision engine combining syntactic, semantic, and ML-based function similarity signals.</Description>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\StellaOps.BinaryIndex.Decompiler\StellaOps.BinaryIndex.Decompiler.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.ML\StellaOps.BinaryIndex.ML.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,180 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
namespace StellaOps.BinaryIndex.Ensemble;
/// <summary>
/// Weight tuning service using grid search optimization.
/// </summary>
public sealed class WeightTuningService : IWeightTuningService
{
private readonly IEnsembleDecisionEngine _decisionEngine;
private readonly ILogger<WeightTuningService> _logger;
public WeightTuningService(
IEnsembleDecisionEngine decisionEngine,
ILogger<WeightTuningService> logger)
{
_decisionEngine = decisionEngine ?? throw new ArgumentNullException(nameof(decisionEngine));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
}
/// <inheritdoc />
public async Task<WeightTuningResult> TuneWeightsAsync(
IEnumerable<EnsembleTrainingPair> trainingPairs,
decimal gridStep = 0.05m,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(trainingPairs);
if (gridStep <= 0 || gridStep > 0.5m)
{
throw new ArgumentOutOfRangeException(nameof(gridStep), "Step must be between 0 and 0.5");
}
var pairs = trainingPairs.ToList();
if (pairs.Count == 0)
{
throw new ArgumentException("At least one training pair required", nameof(trainingPairs));
}
_logger.LogInformation(
"Starting weight tuning with {PairCount} pairs, step size {Step}",
pairs.Count, gridStep);
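        // The loops below enumerate (syntactic, semantic) pairs on a simplex grid, with
        // embedding taking the remainder; the default 0.05 step yields 21 + 20 + ... + 1 = 231 triples.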
var evaluations = new List<WeightEvaluation>();
WeightEvaluation? bestEvaluation = null;
// Grid search over weight combinations
for (var syntactic = 0m; syntactic <= 1m; syntactic += gridStep)
{
for (var semantic = 0m; semantic <= 1m - syntactic; semantic += gridStep)
{
ct.ThrowIfCancellationRequested();
var embedding = 1m - syntactic - semantic;
// Skip invalid weight combinations
if (embedding < 0)
{
continue;
}
var weights = new EffectiveWeights(syntactic, semantic, embedding);
var evaluation = await EvaluateWeightsAsync(weights, pairs, 0.85m, ct);
evaluations.Add(evaluation);
if (bestEvaluation is null || evaluation.F1Score > bestEvaluation.F1Score)
{
bestEvaluation = evaluation;
_logger.LogDebug(
"New best weights: Syn={Syn:P0} Sem={Sem:P0} Emb={Emb:P0} F1={F1:P2}",
syntactic, semantic, embedding, evaluation.F1Score);
}
}
}
if (bestEvaluation is null)
{
throw new InvalidOperationException("No valid weight combinations evaluated");
}
_logger.LogInformation(
"Weight tuning complete. Best weights: Syn={Syn:P0} Sem={Sem:P0} Emb={Emb:P0} F1={F1:P2}",
bestEvaluation.Weights.Syntactic,
bestEvaluation.Weights.Semantic,
bestEvaluation.Weights.Embedding,
bestEvaluation.F1Score);
return new WeightTuningResult
{
BestWeights = bestEvaluation.Weights,
Accuracy = bestEvaluation.Accuracy,
Precision = bestEvaluation.Precision,
Recall = bestEvaluation.Recall,
F1Score = bestEvaluation.F1Score,
Evaluations = evaluations.ToImmutableArray()
};
}
/// <inheritdoc />
public async Task<WeightEvaluation> EvaluateWeightsAsync(
EffectiveWeights weights,
IEnumerable<EnsembleTrainingPair> trainingPairs,
decimal threshold = 0.85m,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(weights);
ArgumentNullException.ThrowIfNull(trainingPairs);
var options = new EnsembleOptions
{
SyntacticWeight = weights.Syntactic,
SemanticWeight = weights.Semantic,
EmbeddingWeight = weights.Embedding,
MatchThreshold = threshold,
AdaptiveWeights = false // Use fixed weights during evaluation
};
var truePositives = 0;
var falsePositives = 0;
var trueNegatives = 0;
var falseNegatives = 0;
foreach (var pair in trainingPairs)
{
ct.ThrowIfCancellationRequested();
var result = await _decisionEngine.CompareAsync(
pair.Function1,
pair.Function2,
options,
ct);
if (pair.IsEquivalent)
{
if (result.IsMatch)
{
truePositives++;
}
else
{
falseNegatives++;
}
}
else
{
if (result.IsMatch)
{
falsePositives++;
}
else
{
trueNegatives++;
}
}
}
var total = truePositives + falsePositives + trueNegatives + falseNegatives;
var accuracy = total > 0
? (decimal)(truePositives + trueNegatives) / total
: 0m;
var precision = (truePositives + falsePositives) > 0
? (decimal)truePositives / (truePositives + falsePositives)
: 0m;
var recall = (truePositives + falseNegatives) > 0
? (decimal)truePositives / (truePositives + falseNegatives)
: 0m;
var f1Score = (precision + recall) > 0
? 2 * precision * recall / (precision + recall)
: 0m;
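        // Illustrative check: TP=8, FP=2, TN=9, FN=1 => accuracy 0.85, precision 0.80,
        // recall 8/9 ~= 0.889, F1 ~= 0.842.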
return new WeightEvaluation(weights, accuracy, precision, recall, f1Score);
}
}

View File

@@ -0,0 +1,97 @@
# AGENTS.md - StellaOps.BinaryIndex.Ghidra
## Module Overview
This module provides Ghidra integration for the BinaryIndex semantic diffing stack. It serves as a fallback/enhancement layer when B2R2 provides insufficient coverage or accuracy.
## Roles Expected
- **Backend Engineer**: Implement Ghidra Headless wrapper, ghidriff bridge, Version Tracking service, BSim integration
- **QA Engineer**: Unit tests for all services, integration tests for Ghidra availability scenarios
## Required Documentation
Before working on this module, read:
- `docs/modules/binary-index/architecture.md`
- `docs/implplan/SPRINT_20260105_001_003_BINDEX_semdiff_ghidra.md`
- Ghidra documentation: https://ghidra.re/ghidra_docs/
- ghidriff repository: https://github.com/clearbluejar/ghidriff
## Module-Specific Constraints
### Process Management
- Ghidra runs as an external Java process - manage its lifecycle carefully
- Use SemaphoreSlim for concurrent access control (one analysis at a time per instance); see the sketch below
- Always clean up temporary project directories
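A minimal sketch of the gating and cleanup pattern, assuming a hypothetical `RunHeadlessAsync` helper inside `GhidraHeadlessManager`:
```csharp
private readonly SemaphoreSlim _gate = new(1, 1); // one analysis at a time per instance

public async Task<GhidraAnalysisResult> AnalyzeAsync(string binaryPath, CancellationToken ct)
{
    await _gate.WaitAsync(ct);
    var projectDir = Directory.CreateTempSubdirectory("ghidra-proj").FullName;
    try
    {
        return await RunHeadlessAsync(binaryPath, projectDir, ct); // hypothetical helper
    }
    finally
    {
        Directory.Delete(projectDir, recursive: true); // always remove the temp project
        _gate.Release();
    }
}
```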
### External Dependencies
- **Ghidra 11.x**: Set via `GhidraOptions.GhidraHome`
- **Java 17+**: Set via `GhidraOptions.JavaHome`
- **Python 3.10+**: Required for ghidriff
- **ghidriff**: Installed via pip
### Determinism Rules
- Use `CultureInfo.InvariantCulture` for all parsing/formatting
- Inject `TimeProvider` for timestamps
- Inject `IGuidGenerator` for any ID generation
- Results must be reproducible given the same inputs; see the sketch below
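A sketch of the injection pattern; the exact `IGuidGenerator` member shape is an assumption:
```csharp
using System.Globalization;

public sealed class ExampleDeterministicService
{
    private readonly TimeProvider _time;
    private readonly IGuidGenerator _guids; // module abstraction; member shape assumed

    public ExampleDeterministicService(TimeProvider time, IGuidGenerator guids)
        => (_time, _guids) = (time, guids);

    public string FormatScore(double score)
        => score.ToString("F4", CultureInfo.InvariantCulture); // invariant formatting

    public (Guid Id, DateTimeOffset CreatedAt) NewRecord()
        => (_guids.NewGuid(), _time.GetUtcNow()); // both injected, replayable in tests
}
```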
### Error Handling
- Ghidra unavailability must not crash the caller - degrade gracefully
- Log all external process failures with stderr content
- Wrap external exceptions in `GhidraException` or `GhidriffException`, as in the sketch below
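A sketch of the wrapping rule; `RunProcessAsync`, its result shape, and the `GhidraException` constructors are assumptions:
```csharp
try
{
    var run = await RunProcessAsync(javaPath, arguments, ct); // hypothetical process helper
    if (run.ExitCode != 0)
    {
        _logger.LogError("Ghidra headless failed: {StdErr}", run.StdErr);
        throw new GhidraException($"Ghidra exited with code {run.ExitCode}");
    }
}
catch (Exception ex) when (ex is not GhidraException and not OperationCanceledException)
{
    throw new GhidraException("Ghidra headless invocation failed", ex);
}
```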
## Key Interfaces
| Interface | Purpose |
|-----------|---------|
| `IGhidraService` | Main analysis service (headless wrapper) |
| `IVersionTrackingService` | Version Tracking with multiple correlators |
| `IBSimService` | BSim signature generation and querying |
| `IGhidriffBridge` | Python ghidriff interop |
## Directory Structure
```
StellaOps.BinaryIndex.Ghidra/
Abstractions/
IGhidraService.cs
IVersionTrackingService.cs
IBSimService.cs
IGhidriffBridge.cs
Models/
GhidraModels.cs
VersionTrackingModels.cs
BSimModels.cs
GhidriffModels.cs
Services/
GhidraHeadlessManager.cs
GhidraService.cs
VersionTrackingService.cs
BSimService.cs
GhidriffBridge.cs
Options/
GhidraOptions.cs
BSimOptions.cs
GhidriffOptions.cs
Exceptions/
GhidraException.cs
GhidriffException.cs
Extensions/
GhidraServiceCollectionExtensions.cs
```
## Testing Strategy
- Unit tests mock external process execution
- Integration tests require Ghidra installation (skip if unavailable)
- Use `[Trait("Category", "Integration")]` for tests requiring Ghidra (example below)
- Fallback scenarios tested in isolation
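An assumed xunit shape for an integration test that skips when Ghidra is missing; the fixture path is illustrative:
```csharp
[Trait("Category", "Integration")]
[Fact]
public async Task Analyze_RealBinary_ReturnsFunctions()
{
    if (!await _ghidra.IsAvailableAsync())
    {
        return; // treat as a soft skip when Ghidra is not installed
    }

    var result = await _ghidra.AnalyzeAsync("fixtures/sample.elf");
    Assert.NotEmpty(result.Functions);
}
```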
## Working Agreements
1. All public APIs must have XML documentation
2. Follow the pattern from `StellaOps.BinaryIndex.Disassembly`
3. Expose services via `AddGhidra()` extension method
4. Configuration via `IOptions<GhidraOptions>` pattern (see the wiring sketch below)
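A hypothetical host wiring under these agreements; the configure overload of `AddGhidra` is assumed and the paths are placeholders:
```csharp
services.AddGhidra(options =>
{
    options.GhidraHome = "/opt/ghidra";          // Ghidra 11.x install
    options.JavaHome = "/usr/lib/jvm/java-17";   // Java 17+ runtime
});
```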

View File

@@ -0,0 +1,168 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Service for Ghidra BSim (Binary Similarity) operations.
/// BSim provides behavioral similarity matching based on P-Code semantics.
/// </summary>
public interface IBSimService
{
/// <summary>
/// Generate BSim signatures for functions from an analyzed binary.
/// </summary>
/// <param name="analysis">Ghidra analysis result.</param>
/// <param name="options">Signature generation options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>BSim signatures for each function.</returns>
Task<ImmutableArray<BSimSignature>> GenerateSignaturesAsync(
GhidraAnalysisResult analysis,
BSimGenerationOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Query BSim database for similar functions.
/// </summary>
/// <param name="signature">The signature to search for.</param>
/// <param name="options">Query options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Matching functions from the database.</returns>
Task<ImmutableArray<BSimMatch>> QueryAsync(
BSimSignature signature,
BSimQueryOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Query BSim database for multiple signatures in batch.
/// </summary>
/// <param name="signatures">The signatures to search for.</param>
/// <param name="options">Query options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Matching functions for each query signature.</returns>
Task<ImmutableArray<BSimQueryResult>> QueryBatchAsync(
ImmutableArray<BSimSignature> signatures,
BSimQueryOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Ingest functions into BSim database.
/// </summary>
/// <param name="libraryName">Name of the library being ingested.</param>
/// <param name="version">Version of the library.</param>
/// <param name="signatures">Signatures to ingest.</param>
/// <param name="ct">Cancellation token.</param>
Task IngestAsync(
string libraryName,
string version,
ImmutableArray<BSimSignature> signatures,
CancellationToken ct = default);
/// <summary>
/// Check if BSim database is available and healthy.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>True if BSim database is accessible.</returns>
Task<bool> IsAvailableAsync(CancellationToken ct = default);
}
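// Illustrative generate-then-query flow (variable names are assumptions):
//   var sigs = await bsim.GenerateSignaturesAsync(analysis, ct: ct);
//   var hits = await bsim.QueryAsync(sigs[0], new BSimQueryOptions { MinSimilarity = 0.8 }, ct);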
/// <summary>
/// Options for BSim signature generation.
/// </summary>
public sealed record BSimGenerationOptions
{
/// <summary>
/// Minimum function size (in instructions) to generate signatures for.
/// Very small functions produce low-confidence matches.
/// </summary>
public int MinFunctionSize { get; init; } = 5;
/// <summary>
/// Whether to include thunk/stub functions.
/// </summary>
public bool IncludeThunks { get; init; } = false;
/// <summary>
/// Whether to include imported library functions.
/// </summary>
public bool IncludeImports { get; init; } = false;
}
/// <summary>
/// Options for BSim database queries.
/// </summary>
public sealed record BSimQueryOptions
{
/// <summary>
/// Minimum similarity score (0.0-1.0) for matches.
/// </summary>
public double MinSimilarity { get; init; } = 0.7;
/// <summary>
/// Minimum significance score for matches.
/// Significance measures how distinctive a function is.
/// </summary>
public double MinSignificance { get; init; } = 0.0;
/// <summary>
/// Maximum number of results per query.
/// </summary>
public int MaxResults { get; init; } = 10;
/// <summary>
/// Limit search to specific libraries (empty = all libraries).
/// </summary>
public ImmutableArray<string> TargetLibraries { get; init; } = [];
/// <summary>
/// Limit search to specific library versions.
/// </summary>
public ImmutableArray<string> TargetVersions { get; init; } = [];
}
/// <summary>
/// A BSim function signature.
/// </summary>
/// <param name="FunctionName">Original function name.</param>
/// <param name="Address">Function address in the binary.</param>
/// <param name="FeatureVector">BSim feature vector bytes.</param>
/// <param name="VectorLength">Number of features in the vector.</param>
/// <param name="SelfSignificance">How distinctive this function is (higher = more unique).</param>
/// <param name="InstructionCount">Number of P-Code instructions.</param>
public sealed record BSimSignature(
string FunctionName,
ulong Address,
byte[] FeatureVector,
int VectorLength,
double SelfSignificance,
int InstructionCount);
/// <summary>
/// A BSim match result.
/// </summary>
/// <param name="MatchedLibrary">Library containing the matched function.</param>
/// <param name="MatchedVersion">Version of the library.</param>
/// <param name="MatchedFunction">Name of the matched function.</param>
/// <param name="MatchedAddress">Address of the matched function.</param>
/// <param name="Similarity">Similarity score (0.0-1.0).</param>
/// <param name="Significance">Significance of the match.</param>
/// <param name="Confidence">Combined confidence score.</param>
public sealed record BSimMatch(
string MatchedLibrary,
string MatchedVersion,
string MatchedFunction,
ulong MatchedAddress,
double Similarity,
double Significance,
double Confidence);
/// <summary>
/// Result of a batch BSim query for a single signature.
/// </summary>
/// <param name="QuerySignature">The signature that was queried.</param>
/// <param name="Matches">Matching functions found.</param>
public sealed record BSimQueryResult(
BSimSignature QuerySignature,
ImmutableArray<BSimMatch> Matches);

View File

@@ -0,0 +1,144 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Main Ghidra analysis service interface.
/// Provides access to Ghidra Headless analysis capabilities.
/// </summary>
public interface IGhidraService
{
/// <summary>
/// Analyze a binary using Ghidra headless.
/// </summary>
/// <param name="binaryStream">The binary stream to analyze.</param>
/// <param name="options">Optional analysis configuration.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Analysis results including functions, imports, exports, and metadata.</returns>
Task<GhidraAnalysisResult> AnalyzeAsync(
Stream binaryStream,
GhidraAnalysisOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Analyze a binary from a file path using Ghidra headless.
/// </summary>
/// <param name="binaryPath">Absolute path to the binary file.</param>
/// <param name="options">Optional analysis configuration.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Analysis results including functions, imports, exports, and metadata.</returns>
Task<GhidraAnalysisResult> AnalyzeAsync(
string binaryPath,
GhidraAnalysisOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Check if Ghidra backend is available and healthy.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>True if Ghidra is available, false otherwise.</returns>
Task<bool> IsAvailableAsync(CancellationToken ct = default);
/// <summary>
/// Gets information about the Ghidra installation.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>Ghidra version and capability information.</returns>
Task<GhidraInfo> GetInfoAsync(CancellationToken ct = default);
}
/// <summary>
/// Options for Ghidra analysis.
/// </summary>
public sealed record GhidraAnalysisOptions
{
/// <summary>
/// Whether to run full auto-analysis (slower but more complete).
/// </summary>
public bool RunFullAnalysis { get; init; } = true;
/// <summary>
/// Whether to include decompiled code in function results.
/// </summary>
public bool IncludeDecompilation { get; init; } = false;
/// <summary>
/// Whether to generate P-Code hashes for functions.
/// </summary>
public bool GeneratePCodeHashes { get; init; } = true;
/// <summary>
/// Whether to extract string literals.
/// </summary>
public bool ExtractStrings { get; init; } = true;
/// <summary>
/// Whether to extract functions.
/// </summary>
public bool ExtractFunctions { get; init; } = true;
/// <summary>
/// Whether to extract decompilation (alias for IncludeDecompilation).
/// </summary>
public bool ExtractDecompilation { get; init; } = false;
/// <summary>
/// Maximum analysis time in seconds (0 = unlimited).
/// </summary>
public int TimeoutSeconds { get; init; } = 300;
/// <summary>
/// Specific scripts to run during analysis.
/// </summary>
public ImmutableArray<string> Scripts { get; init; } = [];
/// <summary>
/// Architecture hint for raw binaries.
/// </summary>
public string? ArchitectureHint { get; init; }
/// <summary>
/// Processor language hint for Ghidra (e.g., "x86:LE:64:default").
/// </summary>
public string? ProcessorHint { get; init; }
/// <summary>
/// Base address override for raw binaries.
/// </summary>
public ulong? BaseAddress { get; init; }
}
/// <summary>
/// Result of Ghidra analysis.
/// </summary>
/// <param name="BinaryHash">SHA256 hash of the analyzed binary.</param>
/// <param name="Functions">Discovered functions.</param>
/// <param name="Imports">Import symbols.</param>
/// <param name="Exports">Export symbols.</param>
/// <param name="Strings">Discovered string literals.</param>
/// <param name="MemoryBlocks">Memory blocks/sections in the binary.</param>
/// <param name="Metadata">Analysis metadata.</param>
public sealed record GhidraAnalysisResult(
string BinaryHash,
ImmutableArray<GhidraFunction> Functions,
ImmutableArray<GhidraImport> Imports,
ImmutableArray<GhidraExport> Exports,
ImmutableArray<GhidraString> Strings,
ImmutableArray<GhidraMemoryBlock> MemoryBlocks,
GhidraMetadata Metadata);
/// <summary>
/// Information about the Ghidra installation.
/// </summary>
/// <param name="Version">Ghidra version string (e.g., "11.2").</param>
/// <param name="JavaVersion">Java runtime version.</param>
/// <param name="AvailableProcessors">Available processor languages.</param>
/// <param name="InstallPath">Ghidra installation path.</param>
public sealed record GhidraInfo(
string Version,
string JavaVersion,
ImmutableArray<string> AvailableProcessors,
string InstallPath);

View File

@@ -0,0 +1,207 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Bridge interface for ghidriff Python tool integration.
/// ghidriff provides automated binary diff reports using Ghidra.
/// </summary>
public interface IGhidriffBridge
{
/// <summary>
/// Run ghidriff to compare two binaries.
/// </summary>
/// <param name="oldBinaryPath">Path to the older binary version.</param>
/// <param name="newBinaryPath">Path to the newer binary version.</param>
/// <param name="options">ghidriff configuration options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Diff result with added, removed, and modified functions.</returns>
Task<GhidriffResult> DiffAsync(
string oldBinaryPath,
string newBinaryPath,
GhidriffDiffOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Run ghidriff to compare two binaries from streams.
/// </summary>
/// <param name="oldBinary">Stream of the older binary version.</param>
/// <param name="newBinary">Stream of the newer binary version.</param>
/// <param name="options">ghidriff configuration options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Diff result with added, removed, and modified functions.</returns>
Task<GhidriffResult> DiffAsync(
Stream oldBinary,
Stream newBinary,
GhidriffDiffOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Generate a formatted report from ghidriff results.
/// </summary>
/// <param name="result">The diff result to format.</param>
/// <param name="format">Output format.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Formatted report string.</returns>
Task<string> GenerateReportAsync(
GhidriffResult result,
GhidriffReportFormat format,
CancellationToken ct = default);
/// <summary>
/// Check if ghidriff is available (Python + ghidriff installed).
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>True if ghidriff is available.</returns>
Task<bool> IsAvailableAsync(CancellationToken ct = default);
/// <summary>
/// Get ghidriff version information.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>Version string.</returns>
Task<string> GetVersionAsync(CancellationToken ct = default);
}
/// <summary>
/// Options for ghidriff diff operation.
/// </summary>
public sealed record GhidriffDiffOptions
{
/// <summary>
/// Path to Ghidra installation (auto-detected if not set).
/// </summary>
public string? GhidraPath { get; init; }
/// <summary>
/// Path for Ghidra project files (temp dir if not set).
/// </summary>
public string? ProjectPath { get; init; }
/// <summary>
/// Whether to include decompiled code in results.
/// </summary>
public bool IncludeDecompilation { get; init; } = true;
/// <summary>
/// Whether to include disassembly listing in results.
/// </summary>
public bool IncludeDisassembly { get; init; } = true;
/// <summary>
/// Functions to exclude from comparison (by name pattern).
/// </summary>
public ImmutableArray<string> ExcludeFunctions { get; init; } = [];
/// <summary>
/// Maximum number of concurrent Ghidra instances.
/// </summary>
public int MaxParallelism { get; init; } = 1;
/// <summary>
/// Maximum analysis time in seconds.
/// </summary>
public int TimeoutSeconds { get; init; } = 600;
}
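/// <summary>
/// Illustrative usage sketch (not part of the original commit): diffing two
/// library versions and rendering a Markdown report. The bridge instance and
/// file paths are hypothetical.
/// </summary>
internal static class GhidriffUsageExample
{
    public static async Task<string> DiffAndReportAsync(
        IGhidriffBridge bridge,
        CancellationToken ct)
    {
        var options = new GhidriffDiffOptions
        {
            IncludeDecompilation = true,
            ExcludeFunctions = ["FUN_*"], // skip auto-named functions
            TimeoutSeconds = 900
        };
        var result = await bridge.DiffAsync(
            "/bins/libexample.so.1.0",
            "/bins/libexample.so.1.1",
            options,
            ct);
        return await bridge.GenerateReportAsync(result, GhidriffReportFormat.Markdown, ct);
    }
}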
/// <summary>
/// Result of a ghidriff comparison.
/// </summary>
/// <param name="OldBinaryHash">SHA256 hash of the old binary.</param>
/// <param name="NewBinaryHash">SHA256 hash of the new binary.</param>
/// <param name="OldBinaryName">Name/path of the old binary.</param>
/// <param name="NewBinaryName">Name/path of the new binary.</param>
/// <param name="AddedFunctions">Functions added in new binary.</param>
/// <param name="RemovedFunctions">Functions removed from old binary.</param>
/// <param name="ModifiedFunctions">Functions modified between versions.</param>
/// <param name="Statistics">Comparison statistics.</param>
/// <param name="RawJsonOutput">Raw JSON output from ghidriff.</param>
public sealed record GhidriffResult(
string OldBinaryHash,
string NewBinaryHash,
string OldBinaryName,
string NewBinaryName,
ImmutableArray<GhidriffFunction> AddedFunctions,
ImmutableArray<GhidriffFunction> RemovedFunctions,
ImmutableArray<GhidriffDiff> ModifiedFunctions,
GhidriffStats Statistics,
string RawJsonOutput);
/// <summary>
/// A function from ghidriff output.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature.</param>
/// <param name="DecompiledCode">Decompiled C code (if requested).</param>
public sealed record GhidriffFunction(
string Name,
ulong Address,
int Size,
string? Signature,
string? DecompiledCode);
/// <summary>
/// A function diff from ghidriff output.
/// </summary>
/// <param name="FunctionName">Function name.</param>
/// <param name="OldAddress">Address in old binary.</param>
/// <param name="NewAddress">Address in new binary.</param>
/// <param name="OldSize">Size in old binary.</param>
/// <param name="NewSize">Size in new binary.</param>
/// <param name="OldSignature">Signature in old binary.</param>
/// <param name="NewSignature">Signature in new binary.</param>
/// <param name="Similarity">Similarity score.</param>
/// <param name="OldDecompiled">Decompiled code from old binary.</param>
/// <param name="NewDecompiled">Decompiled code from new binary.</param>
/// <param name="InstructionChanges">List of instruction-level changes.</param>
public sealed record GhidriffDiff(
string FunctionName,
ulong OldAddress,
ulong NewAddress,
int OldSize,
int NewSize,
string? OldSignature,
string? NewSignature,
decimal Similarity,
string? OldDecompiled,
string? NewDecompiled,
ImmutableArray<string> InstructionChanges);
/// <summary>
/// Statistics from ghidriff comparison.
/// </summary>
/// <param name="TotalOldFunctions">Total functions in old binary.</param>
/// <param name="TotalNewFunctions">Total functions in new binary.</param>
/// <param name="AddedCount">Number of added functions.</param>
/// <param name="RemovedCount">Number of removed functions.</param>
/// <param name="ModifiedCount">Number of modified functions.</param>
/// <param name="UnchangedCount">Number of unchanged functions.</param>
/// <param name="AnalysisDuration">Time taken for analysis.</param>
public sealed record GhidriffStats(
int TotalOldFunctions,
int TotalNewFunctions,
int AddedCount,
int RemovedCount,
int ModifiedCount,
int UnchangedCount,
TimeSpan AnalysisDuration);
/// <summary>
/// Report output format for ghidriff.
/// </summary>
public enum GhidriffReportFormat
{
/// <summary>JSON format.</summary>
Json,
/// <summary>Markdown format.</summary>
Markdown,
/// <summary>HTML format.</summary>
Html
}

View File

@@ -0,0 +1,255 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Service for running Ghidra Version Tracking between two binaries.
/// Version Tracking correlates functions between two versions of a binary
/// using multiple correlator algorithms.
/// </summary>
public interface IVersionTrackingService
{
/// <summary>
/// Run Ghidra Version Tracking with multiple correlators.
/// </summary>
/// <param name="oldBinary">Stream of the older binary version.</param>
/// <param name="newBinary">Stream of the newer binary version.</param>
/// <param name="options">Version tracking configuration.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Version tracking results with matched, added, removed, and modified functions.</returns>
Task<VersionTrackingResult> TrackVersionsAsync(
Stream oldBinary,
Stream newBinary,
VersionTrackingOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Run Ghidra Version Tracking using file paths.
/// </summary>
/// <param name="oldBinaryPath">Path to the older binary version.</param>
/// <param name="newBinaryPath">Path to the newer binary version.</param>
/// <param name="options">Version tracking configuration.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Version tracking results with matched, added, removed, and modified functions.</returns>
Task<VersionTrackingResult> TrackVersionsAsync(
string oldBinaryPath,
string newBinaryPath,
VersionTrackingOptions? options = null,
CancellationToken ct = default);
}
/// <summary>
/// Options for Version Tracking analysis.
/// </summary>
public sealed record VersionTrackingOptions
{
/// <summary>
/// Correlators to use for function matching, in priority order.
/// </summary>
public ImmutableArray<CorrelatorType> Correlators { get; init; } =
[CorrelatorType.ExactBytes, CorrelatorType.ExactMnemonics,
CorrelatorType.SymbolName, CorrelatorType.DataReference,
CorrelatorType.CombinedReference];
/// <summary>
/// Minimum similarity score (0.0-1.0) to consider a match.
/// </summary>
public decimal MinSimilarity { get; init; } = 0.5m;
/// <summary>
/// Whether to include decompiled code in results.
/// </summary>
public bool IncludeDecompilation { get; init; } = false;
/// <summary>
/// Whether to compute detailed instruction-level differences.
/// </summary>
public bool ComputeDetailedDiffs { get; init; } = true;
/// <summary>
/// Maximum analysis time in seconds.
/// </summary>
public int TimeoutSeconds { get; init; } = 600;
}
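/// <summary>
/// Illustrative usage sketch (not part of the original commit): restricting
/// Version Tracking to exact and symbol correlators with a stricter match
/// threshold. The service instance and paths are hypothetical.
/// </summary>
internal static class VersionTrackingUsageExample
{
    public static Task<VersionTrackingResult> TrackAsync(
        IVersionTrackingService tracker,
        CancellationToken ct)
    {
        var options = new VersionTrackingOptions
        {
            Correlators = [CorrelatorType.ExactBytes, CorrelatorType.SymbolName],
            MinSimilarity = 0.8m,
            ComputeDetailedDiffs = true
        };
        return tracker.TrackVersionsAsync("/bins/old/app", "/bins/new/app", options, ct);
    }
}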
/// <summary>
/// Type of correlator algorithm used for function matching.
/// </summary>
public enum CorrelatorType
{
/// <summary>Matches functions with identical byte sequences.</summary>
ExactBytes,
/// <summary>Matches functions with identical instruction mnemonics (ignoring operands).</summary>
ExactMnemonics,
/// <summary>Matches functions by symbol name.</summary>
SymbolName,
/// <summary>Matches functions with similar data references.</summary>
DataReference,
/// <summary>Matches functions with similar call references.</summary>
CallReference,
/// <summary>Combined reference scoring algorithm.</summary>
CombinedReference,
/// <summary>BSim behavioral similarity matching.</summary>
BSim
}
/// <summary>
/// Result of Version Tracking analysis.
/// </summary>
/// <param name="Matches">Functions matched between versions.</param>
/// <param name="AddedFunctions">Functions added in the new version.</param>
/// <param name="RemovedFunctions">Functions removed from the old version.</param>
/// <param name="ModifiedFunctions">Functions modified between versions.</param>
/// <param name="Statistics">Analysis statistics.</param>
public sealed record VersionTrackingResult(
ImmutableArray<FunctionMatch> Matches,
ImmutableArray<FunctionAdded> AddedFunctions,
ImmutableArray<FunctionRemoved> RemovedFunctions,
ImmutableArray<FunctionModified> ModifiedFunctions,
VersionTrackingStats Statistics);
/// <summary>
/// Statistics from Version Tracking analysis.
/// </summary>
/// <param name="TotalOldFunctions">Total functions in old binary.</param>
/// <param name="TotalNewFunctions">Total functions in new binary.</param>
/// <param name="MatchedCount">Number of matched functions.</param>
/// <param name="AddedCount">Number of added functions.</param>
/// <param name="RemovedCount">Number of removed functions.</param>
/// <param name="ModifiedCount">Number of modified functions (subset of matched).</param>
/// <param name="AnalysisDuration">Time taken for analysis.</param>
public sealed record VersionTrackingStats(
int TotalOldFunctions,
int TotalNewFunctions,
int MatchedCount,
int AddedCount,
int RemovedCount,
int ModifiedCount,
TimeSpan AnalysisDuration);
/// <summary>
/// A matched function between two binary versions.
/// </summary>
/// <param name="OldName">Function name in old binary.</param>
/// <param name="OldAddress">Function address in old binary.</param>
/// <param name="NewName">Function name in new binary.</param>
/// <param name="NewAddress">Function address in new binary.</param>
/// <param name="Similarity">Similarity score (0.0-1.0).</param>
/// <param name="MatchedBy">Correlator that produced the match.</param>
/// <param name="Differences">Detected differences if any.</param>
public sealed record FunctionMatch(
string OldName,
ulong OldAddress,
string NewName,
ulong NewAddress,
decimal Similarity,
CorrelatorType MatchedBy,
ImmutableArray<MatchDifference> Differences);
/// <summary>
/// A function added in the new binary version.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature if available.</param>
public sealed record FunctionAdded(
string Name,
ulong Address,
int Size,
string? Signature);
/// <summary>
/// A function removed from the old binary version.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature if available.</param>
public sealed record FunctionRemoved(
string Name,
ulong Address,
int Size,
string? Signature);
/// <summary>
/// A function modified between versions (with detailed differences).
/// </summary>
/// <param name="OldName">Function name in old binary.</param>
/// <param name="OldAddress">Function address in old binary.</param>
/// <param name="OldSize">Function size in old binary.</param>
/// <param name="NewName">Function name in new binary.</param>
/// <param name="NewAddress">Function address in new binary.</param>
/// <param name="NewSize">Function size in new binary.</param>
/// <param name="Similarity">Similarity score.</param>
/// <param name="Differences">List of specific differences.</param>
/// <param name="OldDecompiled">Decompiled code from old binary (if requested).</param>
/// <param name="NewDecompiled">Decompiled code from new binary (if requested).</param>
public sealed record FunctionModified(
string OldName,
ulong OldAddress,
int OldSize,
string NewName,
ulong NewAddress,
int NewSize,
decimal Similarity,
ImmutableArray<MatchDifference> Differences,
string? OldDecompiled,
string? NewDecompiled);
/// <summary>
/// A specific difference between matched functions.
/// </summary>
/// <param name="Type">Type of difference.</param>
/// <param name="Description">Human-readable description.</param>
/// <param name="OldValue">Value in old binary (if applicable).</param>
/// <param name="NewValue">Value in new binary (if applicable).</param>
/// <param name="Address">Address where difference occurs (if applicable).</param>
public sealed record MatchDifference(
DifferenceType Type,
string Description,
string? OldValue,
string? NewValue,
ulong? Address = null);
/// <summary>
/// Type of difference detected between functions.
/// </summary>
public enum DifferenceType
{
/// <summary>Instruction added.</summary>
InstructionAdded,
/// <summary>Instruction removed.</summary>
InstructionRemoved,
/// <summary>Instruction changed.</summary>
InstructionChanged,
/// <summary>Branch target changed.</summary>
BranchTargetChanged,
/// <summary>Call target changed.</summary>
CallTargetChanged,
/// <summary>Constant value changed.</summary>
ConstantChanged,
/// <summary>Function size changed.</summary>
SizeChanged,
/// <summary>Stack frame layout changed.</summary>
StackFrameChanged,
/// <summary>Register usage changed.</summary>
RegisterUsageChanged
}
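/// <summary>
/// Illustrative sketch (not part of the original commit): filtering Version
/// Tracking output for call-target changes, a common signal when triaging patches.
/// </summary>
internal static class VersionTrackingResultExample
{
    public static IEnumerable<string> SummarizeCallTargetChanges(VersionTrackingResult result)
    {
        foreach (var modified in result.ModifiedFunctions)
        {
            foreach (var diff in modified.Differences)
            {
                if (diff.Type == DifferenceType.CallTargetChanged)
                {
                    yield return $"{modified.OldName}: {diff.Description} ({diff.OldValue} -> {diff.NewValue})";
                }
            }
        }
    }
}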

View File

@@ -0,0 +1,245 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Exception thrown when Ghidra operations fail.
/// </summary>
public class GhidraException : Exception
{
/// <summary>
/// Creates a new GhidraException.
/// </summary>
public GhidraException()
{
}
/// <summary>
/// Creates a new GhidraException with a message.
/// </summary>
/// <param name="message">Error message.</param>
public GhidraException(string message) : base(message)
{
}
/// <summary>
/// Creates a new GhidraException with a message and inner exception.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="innerException">Inner exception.</param>
public GhidraException(string message, Exception innerException) : base(message, innerException)
{
}
/// <summary>
/// Exit code from Ghidra process if available.
/// </summary>
public int? ExitCode { get; init; }
/// <summary>
/// Standard error output from Ghidra process if available.
/// </summary>
public string? StandardError { get; init; }
/// <summary>
/// Standard output from Ghidra process if available.
/// </summary>
public string? StandardOutput { get; init; }
}
/// <summary>
/// Exception thrown when Ghidra is not available or not properly configured.
/// </summary>
public class GhidraUnavailableException : GhidraException
{
/// <summary>
/// Creates a new GhidraUnavailableException.
/// </summary>
public GhidraUnavailableException() : base("Ghidra is not available or not properly configured")
{
}
/// <summary>
/// Creates a new GhidraUnavailableException with a message.
/// </summary>
/// <param name="message">Error message.</param>
public GhidraUnavailableException(string message) : base(message)
{
}
/// <summary>
/// Creates a new GhidraUnavailableException with a message and inner exception.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="innerException">Inner exception.</param>
public GhidraUnavailableException(string message, Exception innerException) : base(message, innerException)
{
}
}
/// <summary>
/// Exception thrown when Ghidra analysis times out.
/// </summary>
public class GhidraTimeoutException : GhidraException
{
/// <summary>
/// Creates a new GhidraTimeoutException.
/// </summary>
/// <param name="timeoutSeconds">The timeout that was exceeded.</param>
public GhidraTimeoutException(int timeoutSeconds)
: base($"Ghidra analysis timed out after {timeoutSeconds} seconds")
{
TimeoutSeconds = timeoutSeconds;
}
/// <summary>
/// Creates a new GhidraTimeoutException with a message.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="timeoutSeconds">The timeout that was exceeded.</param>
public GhidraTimeoutException(string message, int timeoutSeconds) : base(message)
{
TimeoutSeconds = timeoutSeconds;
}
/// <summary>
/// The timeout value that was exceeded.
/// </summary>
public int TimeoutSeconds { get; }
}
/// <summary>
/// Exception thrown when ghidriff operations fail.
/// </summary>
public class GhidriffException : Exception
{
/// <summary>
/// Creates a new GhidriffException.
/// </summary>
public GhidriffException()
{
}
/// <summary>
/// Creates a new GhidriffException with a message.
/// </summary>
/// <param name="message">Error message.</param>
public GhidriffException(string message) : base(message)
{
}
/// <summary>
/// Creates a new GhidriffException with a message and inner exception.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="innerException">Inner exception.</param>
public GhidriffException(string message, Exception innerException) : base(message, innerException)
{
}
/// <summary>
/// Exit code from Python process if available.
/// </summary>
public int? ExitCode { get; init; }
/// <summary>
/// Standard error output from Python process if available.
/// </summary>
public string? StandardError { get; init; }
/// <summary>
/// Standard output from Python process if available.
/// </summary>
public string? StandardOutput { get; init; }
}
/// <summary>
/// Exception thrown when ghidriff is not available.
/// </summary>
public class GhidriffUnavailableException : GhidriffException
{
/// <summary>
/// Creates a new GhidriffUnavailableException.
/// </summary>
public GhidriffUnavailableException() : base("ghidriff is not available. Ensure Python and ghidriff are installed.")
{
}
/// <summary>
/// Creates a new GhidriffUnavailableException with a message.
/// </summary>
/// <param name="message">Error message.</param>
public GhidriffUnavailableException(string message) : base(message)
{
}
/// <summary>
/// Creates a new GhidriffUnavailableException with a message and inner exception.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="innerException">Inner exception.</param>
public GhidriffUnavailableException(string message, Exception innerException) : base(message, innerException)
{
}
}
/// <summary>
/// Exception thrown when BSim operations fail.
/// </summary>
public class BSimException : Exception
{
/// <summary>
/// Creates a new BSimException.
/// </summary>
public BSimException()
{
}
/// <summary>
/// Creates a new BSimException with a message.
/// </summary>
/// <param name="message">Error message.</param>
public BSimException(string message) : base(message)
{
}
/// <summary>
/// Creates a new BSimException with a message and inner exception.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="innerException">Inner exception.</param>
public BSimException(string message, Exception innerException) : base(message, innerException)
{
}
}
/// <summary>
/// Exception thrown when BSim database is not available.
/// </summary>
public class BSimUnavailableException : BSimException
{
/// <summary>
/// Creates a new BSimUnavailableException.
/// </summary>
public BSimUnavailableException() : base("BSim database is not available or not configured")
{
}
/// <summary>
/// Creates a new BSimUnavailableException with a message.
/// </summary>
/// <param name="message">Error message.</param>
public BSimUnavailableException(string message) : base(message)
{
}
/// <summary>
/// Creates a new BSimUnavailableException with a message and inner exception.
/// </summary>
/// <param name="message">Error message.</param>
/// <param name="innerException">Inner exception.</param>
public BSimUnavailableException(string message, Exception innerException) : base(message, innerException)
{
}
}
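/// <summary>
/// Illustrative sketch (not part of the original commit): the intended catch
/// order. Derived exceptions (timeout, unavailable) are caught before the
/// GhidraException base so diagnostics like ExitCode and StandardError are kept.
/// </summary>
internal static class GhidraErrorHandlingExample
{
    public static async Task<GhidraAnalysisResult?> TryAnalyzeAsync(
        IGhidraService ghidra,
        string path,
        CancellationToken ct)
    {
        try
        {
            return await ghidra.AnalyzeAsync(path, options: null, ct);
        }
        catch (GhidraTimeoutException ex)
        {
            Console.Error.WriteLine($"Ghidra analysis exceeded {ex.TimeoutSeconds}s");
            return null;
        }
        catch (GhidraUnavailableException)
        {
            return null; // caller should fall back to another disassembly backend
        }
        catch (GhidraException ex)
        {
            Console.Error.WriteLine($"Ghidra failed (exit {ex.ExitCode}): {ex.StandardError}");
            return null;
        }
    }
}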

View File

@@ -0,0 +1,114 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using StellaOps.BinaryIndex.Disassembly;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Extension methods for registering Ghidra services.
/// </summary>
public static class GhidraServiceCollectionExtensions
{
/// <summary>
/// Adds Ghidra integration services to the service collection.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configuration">The configuration section for Ghidra.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddGhidra(
this IServiceCollection services,
IConfiguration configuration)
{
// Bind options
services.AddOptions<GhidraOptions>()
.Bind(configuration.GetSection(GhidraOptions.SectionName))
.ValidateDataAnnotations()
.ValidateOnStart();
services.AddOptions<BSimOptions>()
.Bind(configuration.GetSection(BSimOptions.SectionName))
.ValidateOnStart();
services.AddOptions<GhidriffOptions>()
.Bind(configuration.GetSection(GhidriffOptions.SectionName))
.ValidateOnStart();
// Register TimeProvider if not already registered
services.TryAddSingleton(TimeProvider.System);
// Register services
services.AddSingleton<GhidraHeadlessManager>();
services.AddSingleton<IGhidraService, GhidraService>();
services.AddSingleton<IGhidriffBridge, GhidriffBridge>();
services.AddSingleton<IVersionTrackingService, VersionTrackingService>();
services.AddSingleton<IBSimService, BSimService>();
// Register as IDisassemblyPlugin for fallback disassembly
services.AddSingleton<IDisassemblyPlugin, GhidraDisassemblyPlugin>();
return services;
}
/// <summary>
/// Adds Ghidra integration services with custom configuration.
/// </summary>
/// <param name="services">The service collection.</param>
/// <param name="configureGhidra">Action to configure Ghidra options.</param>
/// <param name="configureBSim">Optional action to configure BSim options.</param>
/// <param name="configureGhidriff">Optional action to configure ghidriff options.</param>
/// <returns>The service collection for chaining.</returns>
public static IServiceCollection AddGhidra(
this IServiceCollection services,
Action<GhidraOptions> configureGhidra,
Action<BSimOptions>? configureBSim = null,
Action<GhidriffOptions>? configureGhidriff = null)
{
services.AddOptions<GhidraOptions>()
.Configure(configureGhidra)
.ValidateDataAnnotations()
.ValidateOnStart();
if (configureBSim is not null)
{
services.AddOptions<BSimOptions>()
.Configure(configureBSim)
.ValidateOnStart();
}
else
{
services.AddOptions<BSimOptions>()
.ValidateOnStart();
}
if (configureGhidriff is not null)
{
services.AddOptions<GhidriffOptions>()
.Configure(configureGhidriff)
.ValidateOnStart();
}
else
{
services.AddOptions<GhidriffOptions>()
.ValidateOnStart();
}
// Register TimeProvider if not already registered
services.TryAddSingleton(TimeProvider.System);
// Register services
services.AddSingleton<GhidraHeadlessManager>();
services.AddSingleton<IGhidraService, GhidraService>();
services.AddSingleton<IGhidriffBridge, GhidriffBridge>();
services.AddSingleton<IVersionTrackingService, VersionTrackingService>();
services.AddSingleton<IBSimService, BSimService>();
// Register as IDisassemblyPlugin for fallback disassembly
services.AddSingleton<IDisassemblyPlugin, GhidraDisassemblyPlugin>();
return services;
}
}
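/// <summary>
/// Illustrative sketch (not part of the original commit): code-based registration
/// with placeholder paths; BSim stays disabled until its PostgreSQL database exists.
/// </summary>
internal static class GhidraRegistrationExample
{
    public static IServiceCollection AddGhidraWithDefaults(IServiceCollection services)
    {
        return services.AddGhidra(
            configureGhidra: ghidra =>
            {
                ghidra.GhidraHome = "/opt/ghidra";
                ghidra.MaxMemory = "8G";
                ghidra.MaxConcurrentInstances = 2;
            },
            configureBSim: bsim => bsim.Enabled = false);
    }
}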

View File

@@ -0,0 +1,157 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// A function discovered by Ghidra analysis.
/// </summary>
/// <param name="Name">Function name (may be auto-generated like FUN_00401000).</param>
/// <param name="Address">Virtual address of the function entry point.</param>
/// <param name="Size">Size of the function in bytes.</param>
/// <param name="Signature">Decompiled signature if available.</param>
/// <param name="DecompiledCode">Decompiled C code if requested.</param>
/// <param name="PCodeHash">SHA256 hash of normalized P-Code for semantic comparison.</param>
/// <param name="CalledFunctions">Names of functions called by this function.</param>
/// <param name="CallingFunctions">Names of functions that call this function.</param>
/// <param name="IsThunk">Whether this is a thunk/stub function.</param>
/// <param name="IsExternal">Whether this function is external (imported).</param>
public sealed record GhidraFunction(
string Name,
ulong Address,
int Size,
string? Signature,
string? DecompiledCode,
byte[]? PCodeHash,
ImmutableArray<string> CalledFunctions,
ImmutableArray<string> CallingFunctions,
bool IsThunk = false,
bool IsExternal = false);
/// <summary>
/// An import symbol from Ghidra analysis.
/// </summary>
/// <param name="Name">Symbol name.</param>
/// <param name="Address">Address where symbol is referenced.</param>
/// <param name="LibraryName">Name of the library providing the symbol.</param>
/// <param name="Ordinal">Ordinal number if applicable (PE imports).</param>
public sealed record GhidraImport(
string Name,
ulong Address,
string? LibraryName,
int? Ordinal);
/// <summary>
/// An export symbol from Ghidra analysis.
/// </summary>
/// <param name="Name">Symbol name.</param>
/// <param name="Address">Address of the exported symbol.</param>
/// <param name="Ordinal">Ordinal number if applicable (PE exports).</param>
public sealed record GhidraExport(
string Name,
ulong Address,
int? Ordinal);
/// <summary>
/// A string literal discovered by Ghidra analysis.
/// </summary>
/// <param name="Value">The string value.</param>
/// <param name="Address">Address where string is located.</param>
/// <param name="Length">Length of the string in bytes.</param>
/// <param name="Encoding">String encoding (ASCII, UTF-8, UTF-16, etc.).</param>
public sealed record GhidraString(
string Value,
ulong Address,
int Length,
string Encoding);
/// <summary>
/// Metadata from Ghidra analysis.
/// </summary>
/// <param name="FileName">Name of the analyzed file.</param>
/// <param name="Format">Binary format detected (ELF, PE, Mach-O, etc.).</param>
/// <param name="Architecture">CPU architecture.</param>
/// <param name="Processor">Ghidra processor language ID.</param>
/// <param name="Compiler">Compiler ID if detected.</param>
/// <param name="Endianness">Byte order (little or big endian).</param>
/// <param name="AddressSize">Pointer size in bits (32 or 64).</param>
/// <param name="ImageBase">Image base address.</param>
/// <param name="EntryPoint">Entry point address.</param>
/// <param name="AnalysisDate">When analysis was performed.</param>
/// <param name="GhidraVersion">Ghidra version used.</param>
/// <param name="AnalysisDuration">How long analysis took.</param>
public sealed record GhidraMetadata(
string FileName,
string Format,
string Architecture,
string Processor,
string? Compiler,
string Endianness,
int AddressSize,
ulong ImageBase,
ulong? EntryPoint,
DateTimeOffset AnalysisDate,
string GhidraVersion,
TimeSpan AnalysisDuration);
/// <summary>
/// A data reference discovered by Ghidra analysis.
/// </summary>
/// <param name="FromAddress">Address where reference originates.</param>
/// <param name="ToAddress">Address being referenced.</param>
/// <param name="ReferenceType">Type of reference (read, write, call, etc.).</param>
public sealed record GhidraDataReference(
ulong FromAddress,
ulong ToAddress,
GhidraReferenceType ReferenceType);
/// <summary>
/// Type of reference in Ghidra analysis.
/// </summary>
public enum GhidraReferenceType
{
/// <summary>Unknown reference type.</summary>
Unknown,
/// <summary>Memory read reference.</summary>
Read,
/// <summary>Memory write reference.</summary>
Write,
/// <summary>Function call reference.</summary>
Call,
/// <summary>Unconditional jump reference.</summary>
UnconditionalJump,
/// <summary>Conditional jump reference.</summary>
ConditionalJump,
/// <summary>Computed/indirect reference.</summary>
Computed,
/// <summary>Data reference (address of).</summary>
Data
}
/// <summary>
/// A memory block/section from Ghidra analysis.
/// </summary>
/// <param name="Name">Section name (.text, .data, etc.).</param>
/// <param name="Start">Start address.</param>
/// <param name="End">End address.</param>
/// <param name="Size">Size in bytes.</param>
/// <param name="IsExecutable">Whether section is executable.</param>
/// <param name="IsWritable">Whether section is writable.</param>
/// <param name="IsInitialized">Whether section has initialized data.</param>
public sealed record GhidraMemoryBlock(
string Name,
ulong Start,
ulong End,
long Size,
bool IsExecutable,
bool IsWritable,
bool IsInitialized);

View File

@@ -0,0 +1,188 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.ComponentModel.DataAnnotations;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Configuration options for Ghidra integration.
/// </summary>
public sealed class GhidraOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "Ghidra";
/// <summary>
/// Path to Ghidra installation directory (GHIDRA_HOME).
/// </summary>
[Required]
public string GhidraHome { get; set; } = string.Empty;
/// <summary>
/// Path to Java installation directory (JAVA_HOME).
/// If not set, system JAVA_HOME will be used.
/// </summary>
public string? JavaHome { get; set; }
/// <summary>
/// Working directory for Ghidra projects and temporary files.
/// </summary>
[Required]
public string WorkDir { get; set; } = Path.Combine(Path.GetTempPath(), "stellaops-ghidra");
/// <summary>
/// Path to custom Ghidra scripts directory.
/// </summary>
public string? ScriptsDir { get; set; }
/// <summary>
/// Maximum memory for Ghidra JVM (e.g., "4G", "8192M").
/// </summary>
public string MaxMemory { get; set; } = "4G";
/// <summary>
/// Maximum CPU cores for Ghidra analysis.
/// </summary>
public int MaxCpu { get; set; } = Environment.ProcessorCount;
/// <summary>
/// Default timeout for analysis operations in seconds.
/// </summary>
public int DefaultTimeoutSeconds { get; set; } = 300;
/// <summary>
/// Whether to clean up temporary projects after analysis.
/// </summary>
public bool CleanupTempProjects { get; set; } = true;
/// <summary>
/// Maximum concurrent Ghidra instances.
/// </summary>
public int MaxConcurrentInstances { get; set; } = 1;
/// <summary>
/// Whether Ghidra integration is enabled.
/// </summary>
public bool Enabled { get; set; } = true;
}
/// <summary>
/// Configuration options for BSim database.
/// </summary>
public sealed class BSimOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "BSim";
/// <summary>
/// BSim database connection string.
/// Format: postgresql://user:pass@host:port/database
/// </summary>
public string? ConnectionString { get; set; }
/// <summary>
/// BSim database host.
/// </summary>
public string Host { get; set; } = "localhost";
/// <summary>
/// BSim database port.
/// </summary>
public int Port { get; set; } = 5432;
/// <summary>
/// BSim database name.
/// </summary>
public string Database { get; set; } = "bsim";
/// <summary>
/// BSim database username.
/// </summary>
public string Username { get; set; } = "bsim";
/// <summary>
/// BSim database password.
/// </summary>
public string? Password { get; set; }
/// <summary>
/// Default minimum similarity for queries.
/// </summary>
public double DefaultMinSimilarity { get; set; } = 0.7;
/// <summary>
/// Default maximum results per query.
/// </summary>
public int DefaultMaxResults { get; set; } = 10;
/// <summary>
/// Whether BSim integration is enabled.
/// </summary>
public bool Enabled { get; set; } = false;
/// <summary>
/// Gets the effective connection string.
/// </summary>
public string GetConnectionString()
{
if (!string.IsNullOrEmpty(ConnectionString))
{
return ConnectionString;
}
var password = string.IsNullOrEmpty(Password) ? "" : $":{Password}";
return $"postgresql://{Username}{password}@{Host}:{Port}/{Database}";
}
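// Worked example (illustrative, not part of the original commit): with
// Host = "bsim-db", Username = "bsim", Password = "s3cret" and no explicit
// ConnectionString, GetConnectionString() returns
// "postgresql://bsim:s3cret@bsim-db:5432/bsim"; with Password unset it returns
// "postgresql://bsim@bsim-db:5432/bsim".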
}
/// <summary>
/// Configuration options for ghidriff Python bridge.
/// </summary>
public sealed class GhidriffOptions
{
/// <summary>
/// Configuration section name.
/// </summary>
public const string SectionName = "Ghidriff";
/// <summary>
/// Path to Python executable.
/// If not set, "python3" or "python" will be used from PATH.
/// </summary>
public string? PythonPath { get; set; }
/// <summary>
/// Path to ghidriff module (if not installed via pip).
/// </summary>
public string? GhidriffModulePath { get; set; }
/// <summary>
/// Whether to include decompilation in diff output by default.
/// </summary>
public bool DefaultIncludeDecompilation { get; set; } = true;
/// <summary>
/// Whether to include disassembly in diff output by default.
/// </summary>
public bool DefaultIncludeDisassembly { get; set; } = true;
/// <summary>
/// Default timeout for ghidriff operations in seconds.
/// </summary>
public int DefaultTimeoutSeconds { get; set; } = 600;
/// <summary>
/// Working directory for ghidriff output.
/// </summary>
public string WorkDir { get; set; } = Path.Combine(Path.GetTempPath(), "stellaops-ghidriff");
/// <summary>
/// Whether ghidriff integration is enabled.
/// </summary>
public bool Enabled { get; set; } = true;
}
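// Illustrative configuration sketch (not part of the original commit): the three
// option classes above bind to sibling sections of appsettings.json. All values
// below are placeholders.
//
//   {
//     "Ghidra":   { "GhidraHome": "/opt/ghidra", "MaxMemory": "4G" },
//     "BSim":     { "Enabled": false, "Host": "bsim-db" },
//     "Ghidriff": { "PythonPath": "/usr/bin/python3", "DefaultTimeoutSeconds": 600 }
//   }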

View File

@@ -0,0 +1,285 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Globalization;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Implementation of <see cref="IBSimService"/> for BSim signature generation and querying.
/// </summary>
public sealed class BSimService : IBSimService
{
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
private readonly GhidraHeadlessManager _headlessManager;
private readonly BSimOptions _options;
private readonly GhidraOptions _ghidraOptions;
private readonly ILogger<BSimService> _logger;
/// <summary>
/// Creates a new BSimService.
/// </summary>
/// <param name="headlessManager">The Ghidra Headless manager.</param>
/// <param name="options">BSim options.</param>
/// <param name="ghidraOptions">Ghidra options.</param>
/// <param name="logger">Logger instance.</param>
public BSimService(
GhidraHeadlessManager headlessManager,
IOptions<BSimOptions> options,
IOptions<GhidraOptions> ghidraOptions,
ILogger<BSimService> logger)
{
_headlessManager = headlessManager;
_options = options.Value;
_ghidraOptions = ghidraOptions.Value;
_logger = logger;
}
/// <inheritdoc />
public async Task<ImmutableArray<BSimSignature>> GenerateSignaturesAsync(
GhidraAnalysisResult analysis,
BSimGenerationOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(analysis);
options ??= new BSimGenerationOptions();
_logger.LogInformation(
"Generating BSim signatures for {FunctionCount} functions",
analysis.Functions.Length);
// Filter functions based on options
var eligibleFunctions = analysis.Functions
.Where(f => IsEligibleForBSim(f, options))
.ToList();
_logger.LogDebug(
"Filtered to {EligibleCount} eligible functions (min size: {MinSize}, include thunks: {IncludeThunks})",
eligibleFunctions.Count,
options.MinFunctionSize,
options.IncludeThunks);
// For each eligible function, generate a BSim signature
// In a real implementation, this would use Ghidra's BSim feature extraction
var signatures = new List<BSimSignature>();
foreach (var function in eligibleFunctions)
{
var signature = GenerateSignatureFromFunction(function);
if (signature is not null)
{
signatures.Add(signature);
}
}
_logger.LogInformation(
"Generated {SignatureCount} BSim signatures",
signatures.Count);
return [.. signatures];
}
/// <inheritdoc />
public async Task<ImmutableArray<BSimMatch>> QueryAsync(
BSimSignature signature,
BSimQueryOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(signature);
options ??= new BSimQueryOptions
{
MinSimilarity = _options.DefaultMinSimilarity,
MaxResults = _options.DefaultMaxResults
};
if (!_options.Enabled)
{
_logger.LogWarning("BSim is not enabled, returning empty results");
return [];
}
_logger.LogDebug(
"Querying BSim for function: {FunctionName} (min similarity: {MinSimilarity})",
signature.FunctionName,
options.MinSimilarity);
// In a real implementation, this would query the BSim PostgreSQL database
// For now, return empty results as BSim database setup is a separate task
return await Task.FromResult(ImmutableArray<BSimMatch>.Empty);
}
/// <inheritdoc />
public async Task<ImmutableArray<BSimQueryResult>> QueryBatchAsync(
ImmutableArray<BSimSignature> signatures,
BSimQueryOptions? options = null,
CancellationToken ct = default)
{
options ??= new BSimQueryOptions
{
MinSimilarity = _options.DefaultMinSimilarity,
MaxResults = _options.DefaultMaxResults
};
if (!_options.Enabled)
{
_logger.LogWarning("BSim is not enabled, returning empty results");
return signatures.Select(s => new BSimQueryResult(s, [])).ToImmutableArray();
}
_logger.LogDebug(
"Batch querying BSim for {Count} signatures",
signatures.Length);
var results = new List<BSimQueryResult>();
foreach (var signature in signatures)
{
ct.ThrowIfCancellationRequested();
var matches = await QueryAsync(signature, options, ct);
results.Add(new BSimQueryResult(signature, matches));
}
return [.. results];
}
/// <inheritdoc />
public async Task IngestAsync(
string libraryName,
string version,
ImmutableArray<BSimSignature> signatures,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(libraryName);
ArgumentException.ThrowIfNullOrEmpty(version);
if (!_options.Enabled)
{
throw new BSimUnavailableException("BSim is not enabled");
}
_logger.LogInformation(
"Ingesting {SignatureCount} signatures for {Library} v{Version}",
signatures.Length,
libraryName,
version);
// In a real implementation, this would insert into the BSim PostgreSQL database
// For now, throw as BSim database setup is a separate task
throw new NotImplementedException(
"BSim ingestion requires BSim PostgreSQL database setup (GHID-011). " +
"See docs/implplan/SPRINT_20260105_001_003_BINDEX_semdiff_ghidra.md");
}
/// <inheritdoc />
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
{
if (!_options.Enabled)
{
return false;
}
// Check if BSim database is accessible
// For now, just check if Ghidra is available since BSim requires it
return await _headlessManager.IsAvailableAsync(ct);
}
private static bool IsEligibleForBSim(GhidraFunction function, BSimGenerationOptions options)
{
// Skip thunks unless explicitly included
if (function.IsThunk && !options.IncludeThunks)
{
return false;
}
// Skip external/imported functions unless explicitly included
if (function.IsExternal && !options.IncludeImports)
{
return false;
}
// Skip functions below minimum size
// Note: We use function size as a proxy; ideally we'd use instruction count
// which would require parsing the function body
if (function.Size < options.MinFunctionSize * 4) // Rough estimate: ~4 bytes per instruction
{
return false;
}
return true;
}
private BSimSignature? GenerateSignatureFromFunction(GhidraFunction function)
{
// In a real implementation, this would use Ghidra's BSim feature extraction
// which analyzes P-Code to generate behavioral signatures
//
// The signature captures:
// - Data flow patterns
// - Control flow structure
// - Normalized constants
// - API usage patterns
// If we have a P-Code hash from Ghidra analysis, use it as the feature vector
if (function.PCodeHash is not null)
{
// Calculate self-significance based on function complexity
var selfSignificance = CalculateSelfSignificance(function);
return new BSimSignature(
function.Name,
function.Address,
function.PCodeHash,
function.PCodeHash.Length,
selfSignificance,
EstimateInstructionCount(function.Size));
}
// If no P-Code hash, we can't generate a meaningful BSim signature
_logger.LogDebug(
"Function {Name} has no P-Code hash, skipping BSim signature generation",
function.Name);
return null;
}
private static double CalculateSelfSignificance(GhidraFunction function)
{
// Self-significance measures how distinctive a function is
// Higher values = more unique signature = better for identification
//
// Factors that increase significance:
// - More called functions (API usage)
// - Larger size (more behavioral information)
// - Fewer callers (not a common utility)
var baseScore = 0.5;
// Called functions increase significance
var callScore = Math.Min(function.CalledFunctions.Length * 0.1, 0.3);
// Size increases significance (diminishing returns)
var sizeScore = Math.Min(Math.Log10(Math.Max(function.Size, 1)) * 0.1, 0.15);
// Many callers decrease significance (common utility functions)
var callerPenalty = function.CallingFunctions.Length > 10 ? 0.1 : 0;
return Math.Min(baseScore + callScore + sizeScore - callerPenalty, 1.0);
}
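// Worked example (illustrative): a 1000-byte function with 3 callees and 2 callers
// scores base 0.5 + callScore min(3 * 0.1, 0.3) = 0.3
// + sizeScore min(log10(1000) * 0.1, 0.15) = 0.15 - callerPenalty 0 (2 <= 10 callers),
// giving min(0.95, 1.0) = 0.95.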
private static int EstimateInstructionCount(int functionSize)
{
// Rough estimate: average 4 bytes per instruction for most architectures
return Math.Max(functionSize / 4, 1);
}
}
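/// <summary>
/// Illustrative pipeline sketch (not part of the original commit): analyze a
/// binary, derive BSim signatures, and batch-query the corpus. Parameters and
/// the binary path are hypothetical.
/// </summary>
internal static class BSimPipelineExample
{
    public static async Task<ImmutableArray<BSimQueryResult>> IdentifyAsync(
        IGhidraService ghidra,
        IBSimService bsim,
        string binaryPath,
        CancellationToken ct)
    {
        var analysis = await ghidra.AnalyzeAsync(
            binaryPath,
            new GhidraAnalysisOptions { GeneratePCodeHashes = true },
            ct);
        var signatures = await bsim.GenerateSignaturesAsync(analysis, null, ct);
        return await bsim.QueryBatchAsync(signatures, null, ct);
    }
}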

View File

@@ -0,0 +1,540 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using StellaOps.BinaryIndex.Disassembly;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Ghidra-based disassembly plugin providing broad architecture support as a fallback backend.
/// Ghidra is used for complex cases where B2R2 has limited coverage; it supports 20+
/// architectures and provides mature decompilation, Version Tracking, and BSim capabilities.
/// </summary>
/// <remarks>
/// This plugin has lower priority than B2R2 since Ghidra requires external process invocation
/// (Java-based headless analysis) which is slower than native .NET disassembly. It serves as
/// the fallback when B2R2 returns low-confidence results or for architectures B2R2 handles poorly.
/// </remarks>
public sealed class GhidraDisassemblyPlugin : IDisassemblyPlugin, IDisposable
{
/// <summary>
/// Plugin identifier.
/// </summary>
public const string PluginId = "stellaops.disasm.ghidra";
private readonly IGhidraService _ghidraService;
private readonly GhidraOptions _options;
private readonly ILogger<GhidraDisassemblyPlugin> _logger;
private readonly TimeProvider _timeProvider;
private bool _disposed;
private static readonly DisassemblyCapabilities s_capabilities = new()
{
PluginId = PluginId,
Name = "Ghidra Disassembler",
Version = "11.x", // Ghidra 11.x
SupportedArchitectures =
[
// All architectures supported by both B2R2 and Ghidra
CpuArchitecture.X86,
CpuArchitecture.X86_64,
CpuArchitecture.ARM32,
CpuArchitecture.ARM64,
CpuArchitecture.MIPS32,
CpuArchitecture.MIPS64,
CpuArchitecture.RISCV64,
CpuArchitecture.PPC32,
CpuArchitecture.PPC64, // Ghidra supports PPC64 better than B2R2
CpuArchitecture.SPARC,
CpuArchitecture.SH4,
CpuArchitecture.AVR,
// Additional architectures Ghidra supports
CpuArchitecture.WASM
],
SupportedFormats =
[
BinaryFormat.ELF,
BinaryFormat.PE,
BinaryFormat.MachO,
BinaryFormat.WASM,
BinaryFormat.Raw
],
SupportsLifting = true, // P-Code lifting
SupportsCfgRecovery = true, // Full CFG recovery and decompilation
Priority = 25 // Lower than B2R2 (50) - used as fallback
};
/// <summary>
/// Creates a new Ghidra disassembly plugin.
/// </summary>
/// <param name="ghidraService">The Ghidra analysis service.</param>
/// <param name="options">Ghidra options.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="timeProvider">Time provider for timestamps.</param>
public GhidraDisassemblyPlugin(
IGhidraService ghidraService,
IOptions<GhidraOptions> options,
ILogger<GhidraDisassemblyPlugin> logger,
TimeProvider timeProvider)
{
_ghidraService = ghidraService ?? throw new ArgumentNullException(nameof(ghidraService));
_options = options?.Value ?? throw new ArgumentNullException(nameof(options));
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
_timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
}
/// <inheritdoc />
public DisassemblyCapabilities Capabilities => s_capabilities;
/// <inheritdoc />
public BinaryInfo LoadBinary(Stream stream, CpuArchitecture? archHint = null, BinaryFormat? formatHint = null)
{
ArgumentNullException.ThrowIfNull(stream);
ObjectDisposedException.ThrowIf(_disposed, this);
// Copy stream to memory for analysis
using var memStream = new MemoryStream();
stream.CopyTo(memStream);
return LoadBinary(memStream.ToArray(), archHint, formatHint);
}
/// <inheritdoc />
public BinaryInfo LoadBinary(ReadOnlySpan<byte> bytes, CpuArchitecture? archHint = null, BinaryFormat? formatHint = null)
{
ObjectDisposedException.ThrowIf(_disposed, this);
var byteArray = bytes.ToArray();
_logger.LogDebug("Loading binary with Ghidra plugin (size: {Size} bytes)", byteArray.Length);
// Run Ghidra analysis synchronously for IDisassemblyPlugin contract
var analysisTask = RunGhidraAnalysisAsync(byteArray, archHint, formatHint, CancellationToken.None);
var result = analysisTask.GetAwaiter().GetResult();
// Map Ghidra metadata to BinaryInfo
var format = MapFormat(result.Metadata.Format);
var architecture = MapArchitecture(result.Metadata.Architecture, result.Metadata.AddressSize);
var endianness = result.Metadata.Endianness.Equals("little", StringComparison.OrdinalIgnoreCase)
? Endianness.Little
: Endianness.Big;
var abi = DetectAbi(format);
_logger.LogInformation(
"Loaded binary with Ghidra: Format={Format}, Architecture={Architecture}, Processor={Processor}",
format, architecture, result.Metadata.Processor);
var metadata = new Dictionary<string, object>
{
["size"] = byteArray.Length,
["ghidra_processor"] = result.Metadata.Processor,
["ghidra_version"] = result.Metadata.GhidraVersion,
["analysis_duration_ms"] = result.Metadata.AnalysisDuration.TotalMilliseconds,
["function_count"] = result.Functions.Length,
["import_count"] = result.Imports.Length,
["export_count"] = result.Exports.Length
};
if (result.Metadata.Compiler is not null)
{
metadata["compiler"] = result.Metadata.Compiler;
}
return new BinaryInfo(
Format: format,
Architecture: architecture,
Bitness: result.Metadata.AddressSize,
Endianness: endianness,
Abi: abi,
EntryPoint: result.Metadata.EntryPoint,
BuildId: result.BinaryHash,
Metadata: metadata,
Handle: new GhidraBinaryHandle(result, byteArray));
}
/// <inheritdoc />
public IEnumerable<CodeRegion> GetCodeRegions(BinaryInfo binary)
{
ArgumentNullException.ThrowIfNull(binary);
ObjectDisposedException.ThrowIf(_disposed, this);
var handle = GetHandle(binary);
// Extract code regions from Ghidra memory blocks
foreach (var block in handle.Result.MemoryBlocks)
{
if (block.IsExecutable)
{
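// FileOffset is approximated as the RVA (block start minus image base);
// Ghidra memory blocks do not expose the raw file offset here.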
yield return new CodeRegion(
Name: block.Name,
VirtualAddress: block.Start,
FileOffset: block.Start - handle.Result.Metadata.ImageBase,
Size: (ulong)block.Size,
IsExecutable: block.IsExecutable,
IsReadable: true, // Executable sections are readable
IsWritable: block.IsWritable);
}
}
}
/// <inheritdoc />
public IEnumerable<SymbolInfo> GetSymbols(BinaryInfo binary)
{
ArgumentNullException.ThrowIfNull(binary);
ObjectDisposedException.ThrowIf(_disposed, this);
var handle = GetHandle(binary);
// Map functions to symbols
foreach (var func in handle.Result.Functions)
{
var binding = func.IsExternal ? SymbolBinding.Global : SymbolBinding.Local;
yield return new SymbolInfo(
Name: func.Name,
Address: func.Address,
Size: (ulong)func.Size,
Type: SymbolType.Function,
Binding: binding,
Section: DetermineSection(handle.Result.MemoryBlocks, func.Address));
}
// Also include exports as symbols
foreach (var export in handle.Result.Exports)
{
yield return new SymbolInfo(
Name: export.Name,
Address: export.Address,
Size: 0, // Unknown size for exports
Type: SymbolType.Function,
Binding: SymbolBinding.Global,
Section: DetermineSection(handle.Result.MemoryBlocks, export.Address));
}
}
/// <inheritdoc />
public IEnumerable<DisassembledInstruction> Disassemble(BinaryInfo binary, CodeRegion region)
{
ArgumentNullException.ThrowIfNull(binary);
ArgumentNullException.ThrowIfNull(region);
ObjectDisposedException.ThrowIf(_disposed, this);
var handle = GetHandle(binary);
_logger.LogDebug(
"Disassembling region {Name} from 0x{Start:X} to 0x{End:X}",
region.Name, region.VirtualAddress, region.VirtualAddress + region.Size);
// Find functions within the region and return their instructions
var regionEnd = region.VirtualAddress + region.Size;
foreach (var func in handle.Result.Functions)
{
if (func.Address >= region.VirtualAddress && func.Address < regionEnd)
{
foreach (var instr in DisassembleFunctionInstructions(func, handle))
{
if (instr.Address >= region.VirtualAddress && instr.Address < regionEnd)
{
yield return instr;
}
}
}
}
}
/// <inheritdoc />
public IEnumerable<DisassembledInstruction> Disassemble(BinaryInfo binary, ulong startAddress, ulong length)
{
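// Synthetic region for an arbitrary address range; FileOffset is approximated
// by the virtual address since no section mapping is performed here.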
var region = new CodeRegion(
Name: $"0x{startAddress:X}",
VirtualAddress: startAddress,
FileOffset: startAddress,
Size: length,
IsExecutable: true,
IsReadable: true,
IsWritable: false);
return Disassemble(binary, region);
}
/// <inheritdoc />
public IEnumerable<DisassembledInstruction> DisassembleSymbol(BinaryInfo binary, SymbolInfo symbol)
{
ArgumentNullException.ThrowIfNull(binary);
ArgumentNullException.ThrowIfNull(symbol);
ObjectDisposedException.ThrowIf(_disposed, this);
var handle = GetHandle(binary);
// Find the function matching the symbol
var func = handle.Result.Functions.FirstOrDefault(f =>
f.Address == symbol.Address || f.Name.Equals(symbol.Name, StringComparison.Ordinal));
if (func is null)
{
_logger.LogWarning(
"Function not found for symbol {Name} at 0x{Address:X}",
symbol.Name, symbol.Address);
yield break;
}
foreach (var instr in DisassembleFunctionInstructions(func, handle))
{
yield return instr;
}
}
#region Private Methods
private async Task<GhidraAnalysisResult> RunGhidraAnalysisAsync(
byte[] bytes,
CpuArchitecture? archHint,
BinaryFormat? formatHint,
CancellationToken ct)
{
// Write bytes to temp file
var tempPath = Path.Combine(
_options.WorkDir,
$"disasm_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
try
{
Directory.CreateDirectory(Path.GetDirectoryName(tempPath)!);
await File.WriteAllBytesAsync(tempPath, bytes, ct);
var options = new GhidraAnalysisOptions
{
RunFullAnalysis = true,
ExtractStrings = false, // Not needed for disassembly
ExtractFunctions = true,
ExtractDecompilation = false, // Can be expensive
TimeoutSeconds = _options.DefaultTimeoutSeconds
};
// Add architecture hint if provided
if (archHint.HasValue)
{
options = options with { ProcessorHint = MapToGhidraProcessor(archHint.Value) };
}
using var stream = File.OpenRead(tempPath);
return await _ghidraService.AnalyzeAsync(stream, options, ct);
}
finally
{
TryDeleteFile(tempPath);
}
}
private static IEnumerable<DisassembledInstruction> DisassembleFunctionInstructions(
GhidraFunction func,
GhidraBinaryHandle handle)
{
// Ghidra's full analysis exports function boundaries but not individual instructions.
// For now we synthesize placeholder instruction entries by slicing the raw bytes at an
// estimated instruction stride; a full implementation would run a Ghidra script to
// export the actual instruction listing.
// Calculate approximate instruction count based on function size and average instruction size
// x86/x64 average instruction size is ~3-4 bytes
var avgInstructionSize = handle.Result.Metadata.AddressSize == 64 ? 4 : 3;
var estimatedInstructions = Math.Max(1, func.Size / avgInstructionSize);
var address = func.Address;
for (var i = 0; i < estimatedInstructions && i < 1000; i++) // Cap at 1000 instructions
{
// Without actual Ghidra instruction export, we create placeholder entries
// Real implementation would parse Ghidra's instruction listing output
var rawBytes = ExtractBytes(handle.Bytes, address, handle.Result.Metadata.ImageBase, avgInstructionSize);
yield return new DisassembledInstruction(
Address: address,
RawBytes: rawBytes,
Mnemonic: "GHIDRA", // Placeholder - real impl would have actual mnemonics
OperandsText: $"; function {func.Name} + 0x{address - func.Address:X}",
Kind: i == 0 ? InstructionKind.Call : InstructionKind.Unknown,
Operands: []);
address += (ulong)avgInstructionSize;
if (address >= func.Address + (ulong)func.Size)
{
break;
}
}
}
private static ImmutableArray<byte> ExtractBytes(byte[] binary, ulong address, ulong imageBase, int count)
{
var offset = address - imageBase;
if (offset >= (ulong)binary.Length)
{
return [];
}
var available = Math.Min(count, binary.Length - (int)offset);
return binary.AsSpan((int)offset, available).ToArray().ToImmutableArray();
}
private static string? DetermineSection(ImmutableArray<GhidraMemoryBlock> blocks, ulong address)
{
foreach (var block in blocks)
{
if (address >= block.Start && address < block.End)
{
return block.Name;
}
}
return null;
}
private static GhidraBinaryHandle GetHandle(BinaryInfo binary)
{
if (binary.Handle is not GhidraBinaryHandle handle)
{
throw new ArgumentException("Invalid binary handle - not a Ghidra handle", nameof(binary));
}
return handle;
}
private static BinaryFormat MapFormat(string ghidraFormat)
{
return ghidraFormat.ToUpperInvariant() switch
{
"ELF" or "ELF32" or "ELF64" => BinaryFormat.ELF,
"PE" or "PE32" or "PE64" or "COFF" => BinaryFormat.PE,
"MACHO" or "MACH-O" or "MACHO32" or "MACHO64" => BinaryFormat.MachO,
"WASM" or "WEBASSEMBLY" => BinaryFormat.WASM,
"RAW" or "BINARY" => BinaryFormat.Raw,
_ => BinaryFormat.Unknown
};
}
private static CpuArchitecture MapArchitecture(string ghidraArch, int addressSize)
{
var arch = ghidraArch.ToUpperInvariant();
return arch switch
{
// Intel x86/x64
"X86" or "X86:LE:32:DEFAULT" => CpuArchitecture.X86,
"X86-64" or "X86:LE:64:DEFAULT" or "AMD64" => CpuArchitecture.X86_64,
var x when x.StartsWith("X86", StringComparison.Ordinal) && addressSize == 32 => CpuArchitecture.X86,
var x when x.StartsWith("X86", StringComparison.Ordinal) => CpuArchitecture.X86_64,
// ARM
"ARM" or "ARM:LE:32:V7" or "ARM:LE:32:V8" or "ARMV7" => CpuArchitecture.ARM32,
"AARCH64" or "ARM:LE:64:V8A" or "ARM64" => CpuArchitecture.ARM64,
var a when a.StartsWith("ARM", StringComparison.Ordinal) && addressSize == 32 => CpuArchitecture.ARM32,
var a when a.StartsWith("ARM", StringComparison.Ordinal) || a.StartsWith("AARCH", StringComparison.Ordinal) => CpuArchitecture.ARM64,
// MIPS
"MIPS" or "MIPS:BE:32:DEFAULT" or "MIPS:LE:32:DEFAULT" => CpuArchitecture.MIPS32,
"MIPS64" or "MIPS:BE:64:DEFAULT" or "MIPS:LE:64:DEFAULT" => CpuArchitecture.MIPS64,
var m when m.StartsWith("MIPS", StringComparison.Ordinal) && addressSize == 64 => CpuArchitecture.MIPS64,
var m when m.StartsWith("MIPS", StringComparison.Ordinal) => CpuArchitecture.MIPS32,
// RISC-V
"RISCV" or "RISCV:LE:64:RV64" or "RISCV64" => CpuArchitecture.RISCV64,
var r when r.StartsWith("RISCV", StringComparison.Ordinal) => CpuArchitecture.RISCV64,
// PowerPC
"PPC" or "POWERPC" or "PPC:BE:32:DEFAULT" => CpuArchitecture.PPC32,
"PPC64" or "POWERPC64" or "PPC:BE:64:DEFAULT" => CpuArchitecture.PPC64,
var p when p.StartsWith("PPC", StringComparison.Ordinal) && addressSize == 64 => CpuArchitecture.PPC64,
var p when p.StartsWith("PPC", StringComparison.Ordinal) || p.StartsWith("POWERPC", StringComparison.Ordinal) => CpuArchitecture.PPC32,
// SPARC
"SPARC" or "SPARC:BE:32:DEFAULT" => CpuArchitecture.SPARC,
var s when s.StartsWith("SPARC", StringComparison.Ordinal) => CpuArchitecture.SPARC,
// SuperH
"SH4" or "SUPERH" or "SH:LE:32:SH4" => CpuArchitecture.SH4,
var s when s.StartsWith("SH", StringComparison.Ordinal) || s.StartsWith("SUPERH", StringComparison.Ordinal) => CpuArchitecture.SH4,
// AVR
"AVR" or "AVR8:LE:16:DEFAULT" => CpuArchitecture.AVR,
var a when a.StartsWith("AVR", StringComparison.Ordinal) => CpuArchitecture.AVR,
// WASM
"WASM" or "WEBASSEMBLY" => CpuArchitecture.WASM,
// EVM (Ethereum)
"EVM" => CpuArchitecture.EVM,
_ => CpuArchitecture.Unknown
};
}
private static string? MapToGhidraProcessor(CpuArchitecture arch)
{
return arch switch
{
CpuArchitecture.X86 => "x86:LE:32:default",
CpuArchitecture.X86_64 => "x86:LE:64:default",
CpuArchitecture.ARM32 => "ARM:LE:32:v7",
CpuArchitecture.ARM64 => "AARCH64:LE:64:v8A",
CpuArchitecture.MIPS32 => "MIPS:BE:32:default",
CpuArchitecture.MIPS64 => "MIPS:BE:64:default",
CpuArchitecture.RISCV64 => "RISCV:LE:64:RV64IC",
CpuArchitecture.PPC32 => "PowerPC:BE:32:default",
CpuArchitecture.PPC64 => "PowerPC:BE:64:default",
CpuArchitecture.SPARC => "sparc:BE:32:default",
CpuArchitecture.SH4 => "SuperH4:LE:32:default",
CpuArchitecture.AVR => "avr8:LE:16:default",
CpuArchitecture.WASM => "Wasm:LE:32:default",
CpuArchitecture.EVM => "EVM:BE:256:default",
_ => null
};
}
private static string? DetectAbi(BinaryFormat format)
{
return format switch
{
BinaryFormat.ELF => "gnu",
BinaryFormat.PE => "msvc",
BinaryFormat.MachO => "darwin",
_ => null
};
}
private static void TryDeleteFile(string path)
{
try
{
if (File.Exists(path))
{
File.Delete(path);
}
}
catch
{
// Ignore cleanup failures
}
}
#endregion
/// <summary>
/// Disposes the plugin and releases resources.
/// </summary>
public void Dispose()
{
if (_disposed)
{
return;
}
_disposed = true;
}
}
/// <summary>
/// Internal handle for Ghidra-analyzed binaries.
/// </summary>
/// <param name="Result">The Ghidra analysis result.</param>
/// <param name="Bytes">The original binary bytes.</param>
internal sealed record GhidraBinaryHandle(
GhidraAnalysisResult Result,
byte[] Bytes);

View File

@@ -0,0 +1,441 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Diagnostics;
using System.Globalization;
using System.Runtime.InteropServices;
using System.Text;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Manages Ghidra Headless process lifecycle.
/// Provides methods to run analysis with proper process isolation and cleanup.
/// </summary>
public sealed class GhidraHeadlessManager : IAsyncDisposable
{
private readonly GhidraOptions _options;
private readonly ILogger<GhidraHeadlessManager> _logger;
private readonly SemaphoreSlim _semaphore;
private bool _disposed;
/// <summary>
/// Creates a new GhidraHeadlessManager.
/// </summary>
/// <param name="options">Ghidra configuration options.</param>
/// <param name="logger">Logger instance.</param>
public GhidraHeadlessManager(
IOptions<GhidraOptions> options,
ILogger<GhidraHeadlessManager> logger)
{
_options = options.Value;
_logger = logger;
_semaphore = new SemaphoreSlim(_options.MaxConcurrentInstances, _options.MaxConcurrentInstances);
EnsureWorkDirectoryExists();
}
/// <summary>
/// Runs Ghidra analysis on a binary.
/// </summary>
/// <param name="binaryPath">Absolute path to the binary file.</param>
/// <param name="scriptName">Name of the post-analysis script to run.</param>
/// <param name="scriptArgs">Arguments to pass to the script.</param>
/// <param name="runAnalysis">Whether to run full auto-analysis.</param>
/// <param name="timeoutSeconds">Timeout in seconds (0 = use default).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Process result containing the exit code, captured output, and duration.</returns>
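/// <example>
/// A minimal call sketch (the binary path and timeout are placeholders):
/// <code>
/// var run = await manager.RunAnalysisAsync(
///     "/tmp/sample.bin",
///     scriptName: "ExportToJson.java",
///     timeoutSeconds: 300,
///     ct: cancellationToken);
/// if (!run.IsSuccess)
/// {
///     // Inspect run.StandardError for Ghidra diagnostics.
/// }
/// </code>
/// </example>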
public async Task<GhidraProcessResult> RunAnalysisAsync(
string binaryPath,
string? scriptName = null,
string[]? scriptArgs = null,
bool runAnalysis = true,
int timeoutSeconds = 0,
CancellationToken ct = default)
{
ObjectDisposedException.ThrowIf(_disposed, this);
if (!File.Exists(binaryPath))
{
throw new FileNotFoundException("Binary file not found", binaryPath);
}
var effectiveTimeout = timeoutSeconds > 0 ? timeoutSeconds : _options.DefaultTimeoutSeconds;
await _semaphore.WaitAsync(ct);
try
{
var projectDir = CreateTempProjectDirectory();
try
{
var args = BuildAnalyzeArgs(projectDir, binaryPath, scriptName, scriptArgs, runAnalysis);
return await RunGhidraAsync(args, effectiveTimeout, ct);
}
finally
{
if (_options.CleanupTempProjects)
{
CleanupProjectDirectory(projectDir);
}
}
}
finally
{
_semaphore.Release();
}
}
/// <summary>
/// Runs a Ghidra script on an existing project.
/// </summary>
/// <param name="projectDir">Path to the Ghidra project directory.</param>
/// <param name="projectName">Name of the Ghidra project.</param>
/// <param name="scriptName">Name of the script to run.</param>
/// <param name="scriptArgs">Arguments to pass to the script.</param>
/// <param name="timeoutSeconds">Timeout in seconds (0 = use default).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Process result containing the exit code, captured output, and duration.</returns>
public async Task<GhidraProcessResult> RunScriptAsync(
string projectDir,
string projectName,
string scriptName,
string[]? scriptArgs = null,
int timeoutSeconds = 0,
CancellationToken ct = default)
{
ObjectDisposedException.ThrowIf(_disposed, this);
if (!Directory.Exists(projectDir))
{
throw new DirectoryNotFoundException($"Project directory not found: {projectDir}");
}
var effectiveTimeout = timeoutSeconds > 0 ? timeoutSeconds : _options.DefaultTimeoutSeconds;
await _semaphore.WaitAsync(ct);
try
{
var args = BuildScriptArgs(projectDir, projectName, scriptName, scriptArgs);
return await RunGhidraAsync(args, effectiveTimeout, ct);
}
finally
{
_semaphore.Release();
}
}
/// <summary>
/// Checks if Ghidra is available and properly configured.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>True if Ghidra is available.</returns>
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
{
try
{
var executablePath = GetAnalyzeHeadlessPath();
if (!File.Exists(executablePath))
{
_logger.LogDebug("Ghidra analyzeHeadless not found at: {Path}", executablePath);
return false;
}
// Quick version check to verify Java is working
var result = await RunGhidraAsync(["--help"], timeoutSeconds: 30, ct);
return result.ExitCode == 0 || result.StandardOutput.Contains("analyzeHeadless", StringComparison.OrdinalIgnoreCase);
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Ghidra availability check failed");
return false;
}
}
/// <summary>
/// Gets Ghidra version information.
/// </summary>
/// <param name="ct">Cancellation token.</param>
/// <returns>Version string.</returns>
public async Task<string> GetVersionAsync(CancellationToken ct = default)
{
var result = await RunGhidraAsync(["--help"], timeoutSeconds: 30, ct);
// Parse version from output - typically starts with "Ghidra X.Y"
var lines = result.StandardOutput.Split('\n', StringSplitOptions.RemoveEmptyEntries);
foreach (var line in lines)
{
if (line.Contains("Ghidra", StringComparison.OrdinalIgnoreCase) &&
line.Any(char.IsDigit))
{
return line.Trim();
}
}
return "Unknown";
}
private string CreateTempProjectDirectory()
{
var projectDir = Path.Combine(
_options.WorkDir,
$"project_{DateTime.UtcNow:yyyyMMddHHmmssfff}_{Guid.NewGuid():N}");
Directory.CreateDirectory(projectDir);
_logger.LogDebug("Created temp project directory: {Path}", projectDir);
return projectDir;
}
private void CleanupProjectDirectory(string projectDir)
{
try
{
if (Directory.Exists(projectDir))
{
Directory.Delete(projectDir, recursive: true);
_logger.LogDebug("Cleaned up project directory: {Path}", projectDir);
}
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to cleanup project directory: {Path}", projectDir);
}
}
private void EnsureWorkDirectoryExists()
{
if (!Directory.Exists(_options.WorkDir))
{
Directory.CreateDirectory(_options.WorkDir);
_logger.LogInformation("Created Ghidra work directory: {Path}", _options.WorkDir);
}
}
private string[] BuildAnalyzeArgs(
string projectDir,
string binaryPath,
string? scriptName,
string[]? scriptArgs,
bool runAnalysis)
{
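// Resulting invocation (illustrative; bracketed pieces are optional):
//   analyzeHeadless <projectDir> TempProject -import <binary>
//     [-noanalysis] [-postScript <script> <args...>] [-scriptPath <dir>] -max-cpu <n>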
var args = new List<string>
{
projectDir,
"TempProject",
"-import", binaryPath
};
if (!runAnalysis)
{
args.Add("-noanalysis");
}
if (!string.IsNullOrEmpty(scriptName))
{
args.AddRange(["-postScript", scriptName]);
if (scriptArgs is { Length: > 0 })
{
args.AddRange(scriptArgs);
}
}
if (!string.IsNullOrEmpty(_options.ScriptsDir))
{
args.AddRange(["-scriptPath", _options.ScriptsDir]);
}
args.AddRange(["-max-cpu", _options.MaxCpu.ToString(CultureInfo.InvariantCulture)]);
return [.. args];
}
private static string[] BuildScriptArgs(
string projectDir,
string projectName,
string scriptName,
string[]? scriptArgs)
{
var args = new List<string>
{
projectDir,
projectName,
"-postScript", scriptName
};
if (scriptArgs is { Length: > 0 })
{
args.AddRange(scriptArgs);
}
return [.. args];
}
private async Task<GhidraProcessResult> RunGhidraAsync(
string[] args,
int timeoutSeconds,
CancellationToken ct)
{
var executablePath = GetAnalyzeHeadlessPath();
var startInfo = new ProcessStartInfo
{
FileName = executablePath,
Arguments = string.Join(" ", args.Select(QuoteArg)),
RedirectStandardOutput = true,
RedirectStandardError = true,
UseShellExecute = false,
CreateNoWindow = true,
StandardOutputEncoding = Encoding.UTF8,
StandardErrorEncoding = Encoding.UTF8
};
ConfigureEnvironment(startInfo);
_logger.LogDebug("Starting Ghidra: {Command} {Args}", executablePath, startInfo.Arguments);
var stopwatch = Stopwatch.StartNew();
using var process = new Process { StartInfo = startInfo };
var stdoutBuilder = new StringBuilder();
var stderrBuilder = new StringBuilder();
process.OutputDataReceived += (_, e) =>
{
if (e.Data is not null)
{
stdoutBuilder.AppendLine(e.Data);
}
};
process.ErrorDataReceived += (_, e) =>
{
if (e.Data is not null)
{
stderrBuilder.AppendLine(e.Data);
}
};
if (!process.Start())
{
throw new GhidraException("Failed to start Ghidra process");
}
process.BeginOutputReadLine();
process.BeginErrorReadLine();
using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);
try
{
await process.WaitForExitAsync(linkedCts.Token);
}
catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
{
try
{
process.Kill(entireProcessTree: true);
}
catch
{
// Best effort kill
}
throw new GhidraTimeoutException(timeoutSeconds);
}
stopwatch.Stop();
var stdout = stdoutBuilder.ToString();
var stderr = stderrBuilder.ToString();
_logger.LogDebug(
"Ghidra completed with exit code {ExitCode} in {Duration}ms",
process.ExitCode,
stopwatch.ElapsedMilliseconds);
if (process.ExitCode != 0)
{
_logger.LogWarning("Ghidra failed: {Error}", stderr);
}
return new GhidraProcessResult(
process.ExitCode,
stdout,
stderr,
stopwatch.Elapsed);
}
private string GetAnalyzeHeadlessPath()
{
var basePath = Path.Combine(_options.GhidraHome, "support");
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
return Path.Combine(basePath, "analyzeHeadless.bat");
}
return Path.Combine(basePath, "analyzeHeadless");
}
private void ConfigureEnvironment(ProcessStartInfo startInfo)
{
if (!string.IsNullOrEmpty(_options.JavaHome))
{
startInfo.EnvironmentVariables["JAVA_HOME"] = _options.JavaHome;
}
startInfo.EnvironmentVariables["MAXMEM"] = _options.MaxMemory;
startInfo.EnvironmentVariables["GHIDRA_HOME"] = _options.GhidraHome;
}
private static string QuoteArg(string arg)
{
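// e.g. my file.bin -> "my file.bin";  a"b -> "a\"b";  plain args pass through unchanged.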
if (arg.Contains(' ', StringComparison.Ordinal) || arg.Contains('"', StringComparison.Ordinal))
{
return $"\"{arg.Replace("\"", "\\\"")}\"";
}
return arg;
}
/// <inheritdoc />
public async ValueTask DisposeAsync()
{
if (_disposed)
{
return;
}
_disposed = true;
// Wait for any in-flight operations to complete
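// Acquiring every slot guarantees no RunAnalysisAsync/RunScriptAsync call still
// holds the semaphore, so disposal cannot race an in-flight Ghidra process.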
for (var i = 0; i < _options.MaxConcurrentInstances; i++)
{
await _semaphore.WaitAsync();
}
_semaphore.Dispose();
}
}
/// <summary>
/// Result of a Ghidra process execution.
/// </summary>
/// <param name="ExitCode">Process exit code.</param>
/// <param name="StandardOutput">Standard output content.</param>
/// <param name="StandardError">Standard error content.</param>
/// <param name="Duration">Execution duration.</param>
public sealed record GhidraProcessResult(
int ExitCode,
string StandardOutput,
string StandardError,
TimeSpan Duration)
{
/// <summary>
/// Whether the process completed successfully (exit code 0).
/// </summary>
public bool IsSuccess => ExitCode == 0;
}

View File

@@ -0,0 +1,511 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Globalization;
using System.Security.Cryptography;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Implementation of <see cref="IGhidraService"/> using Ghidra Headless analysis.
/// </summary>
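/// <remarks>
/// Minimal usage sketch (the path is a placeholder; assumes DI wiring for options,
/// logging, and the headless manager):
/// <code>
/// var analysis = await ghidraService.AnalyzeAsync(
///     "/tmp/libfoo.so",
///     new GhidraAnalysisOptions { RunFullAnalysis = true, TimeoutSeconds = 600 });
/// Console.WriteLine($"{analysis.Functions.Length} functions recovered");
/// </code>
/// </remarks>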
public sealed class GhidraService : IGhidraService, IAsyncDisposable
{
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
private readonly GhidraHeadlessManager _headlessManager;
private readonly GhidraOptions _options;
private readonly ILogger<GhidraService> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Creates a new GhidraService.
/// </summary>
/// <param name="headlessManager">The Ghidra Headless manager.</param>
/// <param name="options">Ghidra options.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="timeProvider">Time provider for timestamps.</param>
public GhidraService(
GhidraHeadlessManager headlessManager,
IOptions<GhidraOptions> options,
ILogger<GhidraService> logger,
TimeProvider timeProvider)
{
_headlessManager = headlessManager;
_options = options.Value;
_logger = logger;
_timeProvider = timeProvider;
}
/// <inheritdoc />
public async Task<GhidraAnalysisResult> AnalyzeAsync(
Stream binaryStream,
GhidraAnalysisOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(binaryStream);
// Write stream to temp file
var tempPath = Path.Combine(
_options.WorkDir,
$"binary_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
try
{
Directory.CreateDirectory(Path.GetDirectoryName(tempPath)!);
await using (var fileStream = File.Create(tempPath))
{
await binaryStream.CopyToAsync(fileStream, ct);
}
return await AnalyzeAsync(tempPath, options, ct);
}
finally
{
TryDeleteFile(tempPath);
}
}
/// <inheritdoc />
public async Task<GhidraAnalysisResult> AnalyzeAsync(
string binaryPath,
GhidraAnalysisOptions? options = null,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(binaryPath);
if (!File.Exists(binaryPath))
{
throw new FileNotFoundException("Binary file not found", binaryPath);
}
options ??= new GhidraAnalysisOptions();
_logger.LogInformation("Starting Ghidra analysis of: {BinaryPath}", binaryPath);
var startTime = _timeProvider.GetUtcNow();
// Calculate binary hash
var binaryHash = await ComputeBinaryHashAsync(binaryPath, ct);
// Run analysis with JSON export script
var result = await _headlessManager.RunAnalysisAsync(
binaryPath,
scriptName: "ExportToJson.java",
scriptArgs: BuildScriptArgs(options),
runAnalysis: options.RunFullAnalysis,
timeoutSeconds: options.TimeoutSeconds,
ct);
if (!result.IsSuccess)
{
throw new GhidraException($"Ghidra analysis failed: {result.StandardError}")
{
ExitCode = result.ExitCode,
StandardError = result.StandardError,
StandardOutput = result.StandardOutput
};
}
var analysisResult = ParseAnalysisOutput(
result.StandardOutput,
binaryPath,
binaryHash,
startTime,
result.Duration);
_logger.LogInformation(
"Ghidra analysis completed: {FunctionCount} functions found in {Duration}ms",
analysisResult.Functions.Length,
result.Duration.TotalMilliseconds);
return analysisResult;
}
/// <inheritdoc />
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
{
if (!_options.Enabled)
{
return false;
}
return await _headlessManager.IsAvailableAsync(ct);
}
/// <inheritdoc />
public async Task<GhidraInfo> GetInfoAsync(CancellationToken ct = default)
{
var version = await _headlessManager.GetVersionAsync(ct);
// Get Java version
var javaVersion = GetJavaVersion();
// Get available processor languages
var processors = GetAvailableProcessors();
return new GhidraInfo(
version,
javaVersion,
processors,
_options.GhidraHome);
}
/// <inheritdoc />
public async ValueTask DisposeAsync()
{
await _headlessManager.DisposeAsync();
}
private static string[] BuildScriptArgs(GhidraAnalysisOptions options)
{
var args = new List<string>();
if (options.IncludeDecompilation)
{
args.Add("-decompile");
}
if (options.GeneratePCodeHashes)
{
args.Add("-pcode-hash");
}
return [.. args];
}
private GhidraAnalysisResult ParseAnalysisOutput(
string output,
string binaryPath,
string binaryHash,
DateTimeOffset startTime,
TimeSpan duration)
{
// Look for JSON output marker in stdout
const string jsonMarker = "###GHIDRA_JSON_OUTPUT###";
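// Expected stdout shape (illustrative; produced by the ExportToJson.java script):
//   ...Ghidra analysis log lines...
//   ###GHIDRA_JSON_OUTPUT###
//   { "functions": [...], "imports": [...], "metadata": { ... } }
//   ###END_GHIDRA_JSON_OUTPUT###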
var jsonStart = output.IndexOf(jsonMarker, StringComparison.Ordinal);
if (jsonStart >= 0)
{
var jsonContent = output[(jsonStart + jsonMarker.Length)..].Trim();
var jsonEnd = jsonContent.IndexOf("###END_GHIDRA_JSON_OUTPUT###", StringComparison.Ordinal);
if (jsonEnd >= 0)
{
jsonContent = jsonContent[..jsonEnd].Trim();
}
try
{
return ParseJsonOutput(jsonContent, binaryHash, startTime, duration);
}
catch (JsonException ex)
{
_logger.LogWarning(ex, "Failed to parse Ghidra JSON output, falling back to text parsing");
}
}
// Fallback: parse text output
return ParseTextOutput(output, binaryPath, binaryHash, startTime, duration);
}
private GhidraAnalysisResult ParseJsonOutput(
string json,
string binaryHash,
DateTimeOffset startTime,
TimeSpan duration)
{
var data = JsonSerializer.Deserialize<GhidraJsonOutput>(json, JsonOptions)
?? throw new GhidraException("Failed to deserialize Ghidra JSON output");
var functions = data.Functions?.Select(f => new GhidraFunction(
f.Name ?? "unknown",
ParseAddress(f.Address),
f.Size,
f.Signature,
f.DecompiledCode,
f.PCodeHash is not null ? Convert.FromHexString(f.PCodeHash) : null,
f.CalledFunctions?.ToImmutableArray() ?? [],
f.CallingFunctions?.ToImmutableArray() ?? [],
f.IsThunk,
f.IsExternal
)).ToImmutableArray() ?? [];
var imports = data.Imports?.Select(i => new GhidraImport(
i.Name ?? "unknown",
ParseAddress(i.Address),
i.LibraryName,
i.Ordinal
)).ToImmutableArray() ?? [];
var exports = data.Exports?.Select(e => new GhidraExport(
e.Name ?? "unknown",
ParseAddress(e.Address),
e.Ordinal
)).ToImmutableArray() ?? [];
var strings = data.Strings?.Select(s => new GhidraString(
s.Value ?? "",
ParseAddress(s.Address),
s.Length,
s.Encoding ?? "ASCII"
)).ToImmutableArray() ?? [];
var memoryBlocks = data.MemoryBlocks?.Select(m => new GhidraMemoryBlock(
m.Name ?? "unknown",
ParseAddress(m.Start),
ParseAddress(m.End),
m.Size,
m.IsExecutable,
m.IsWritable,
m.IsInitialized
)).ToImmutableArray() ?? [];
var metadata = new GhidraMetadata(
data.Metadata?.FileName ?? "unknown",
data.Metadata?.Format ?? "unknown",
data.Metadata?.Architecture ?? "unknown",
data.Metadata?.Processor ?? "unknown",
data.Metadata?.Compiler,
data.Metadata?.Endianness ?? "little",
data.Metadata?.AddressSize ?? 64,
ParseAddress(data.Metadata?.ImageBase),
data.Metadata?.EntryPoint is not null ? ParseAddress(data.Metadata.EntryPoint) : null,
startTime,
data.Metadata?.GhidraVersion ?? "unknown",
duration);
return new GhidraAnalysisResult(
binaryHash,
functions,
imports,
exports,
strings,
memoryBlocks,
metadata);
}
private GhidraAnalysisResult ParseTextOutput(
string output,
string binaryPath,
string binaryHash,
DateTimeOffset startTime,
TimeSpan duration)
{
// Basic text parsing for when JSON export is not available
// This extracts minimal information from Ghidra log output
var functions = ImmutableArray<GhidraFunction>.Empty;
var imports = ImmutableArray<GhidraImport>.Empty;
var exports = ImmutableArray<GhidraExport>.Empty;
var strings = ImmutableArray<GhidraString>.Empty;
var memoryBlocks = ImmutableArray<GhidraMemoryBlock>.Empty;
// Parse function count from output like "Total functions: 123"
var functionCountMatch = System.Text.RegularExpressions.Regex.Match(
output,
@"(?:Total functions|Functions found|functions):\s*(\d+)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
var metadata = new GhidraMetadata(
Path.GetFileName(binaryPath),
"unknown",
"unknown",
"unknown",
null,
"little",
64,
0,
null,
startTime,
"unknown",
duration);
_logger.LogDebug(
"Parsed Ghidra text output: estimated {Count} functions",
functionCountMatch.Success ? functionCountMatch.Groups[1].Value : "unknown");
return new GhidraAnalysisResult(
binaryHash,
functions,
imports,
exports,
strings,
memoryBlocks,
metadata);
}
private static ulong ParseAddress(string? address)
{
if (string.IsNullOrEmpty(address))
{
return 0;
}
// Handle hex format (0x...) or plain hex
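// e.g. "0x00401000" and "00401000" both yield 0x401000; unparsable input yields 0.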
if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
{
address = address[2..];
}
return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
? result
: 0;
}
private static async Task<string> ComputeBinaryHashAsync(string path, CancellationToken ct)
{
await using var stream = File.OpenRead(path);
var hash = await SHA256.HashDataAsync(stream, ct);
return Convert.ToHexStringLower(hash);
}
private string GetJavaVersion()
{
try
{
var javaHome = _options.JavaHome ?? Environment.GetEnvironmentVariable("JAVA_HOME");
if (string.IsNullOrEmpty(javaHome))
{
return "unknown";
}
var releaseFile = Path.Combine(javaHome, "release");
if (File.Exists(releaseFile))
{
var content = File.ReadAllText(releaseFile);
var match = System.Text.RegularExpressions.Regex.Match(
content,
@"JAVA_VERSION=""?([^""\r\n]+)""?");
if (match.Success)
{
return match.Groups[1].Value;
}
}
return "unknown";
}
catch
{
return "unknown";
}
}
private ImmutableArray<string> GetAvailableProcessors()
{
try
{
var processorsDir = Path.Combine(_options.GhidraHome, "Ghidra", "Processors");
if (!Directory.Exists(processorsDir))
{
return [];
}
return Directory.GetDirectories(processorsDir)
.Select(Path.GetFileName)
.Where(name => !string.IsNullOrEmpty(name))
.Order(StringComparer.OrdinalIgnoreCase)
.ToImmutableArray()!;
}
catch
{
return [];
}
}
private void TryDeleteFile(string path)
{
try
{
if (File.Exists(path))
{
File.Delete(path);
}
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
}
}
// JSON DTOs for deserialization
private sealed record GhidraJsonOutput
{
public List<GhidraFunctionJson>? Functions { get; init; }
public List<GhidraImportJson>? Imports { get; init; }
public List<GhidraExportJson>? Exports { get; init; }
public List<GhidraStringJson>? Strings { get; init; }
public List<GhidraMemoryBlockJson>? MemoryBlocks { get; init; }
public GhidraMetadataJson? Metadata { get; init; }
}
private sealed record GhidraFunctionJson
{
public string? Name { get; init; }
public string? Address { get; init; }
public int Size { get; init; }
public string? Signature { get; init; }
public string? DecompiledCode { get; init; }
public string? PCodeHash { get; init; }
public List<string>? CalledFunctions { get; init; }
public List<string>? CallingFunctions { get; init; }
public bool IsThunk { get; init; }
public bool IsExternal { get; init; }
}
private sealed record GhidraImportJson
{
public string? Name { get; init; }
public string? Address { get; init; }
public string? LibraryName { get; init; }
public int? Ordinal { get; init; }
}
private sealed record GhidraExportJson
{
public string? Name { get; init; }
public string? Address { get; init; }
public int? Ordinal { get; init; }
}
private sealed record GhidraStringJson
{
public string? Value { get; init; }
public string? Address { get; init; }
public int Length { get; init; }
public string? Encoding { get; init; }
}
private sealed record GhidraMemoryBlockJson
{
public string? Name { get; init; }
public string? Start { get; init; }
public string? End { get; init; }
public long Size { get; init; }
public bool IsExecutable { get; init; }
public bool IsWritable { get; init; }
public bool IsInitialized { get; init; }
}
private sealed record GhidraMetadataJson
{
public string? FileName { get; init; }
public string? Format { get; init; }
public string? Architecture { get; init; }
public string? Processor { get; init; }
public string? Compiler { get; init; }
public string? Endianness { get; init; }
public int AddressSize { get; init; }
public string? ImageBase { get; init; }
public string? EntryPoint { get; init; }
public string? GhidraVersion { get; init; }
}
}

View File

@@ -0,0 +1,702 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Diagnostics;
using System.Globalization;
using System.Runtime.InteropServices;
using System.Security.Cryptography;
using System.Text;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Implementation of <see cref="IGhidriffBridge"/> for Python ghidriff integration.
/// </summary>
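/// <remarks>
/// Minimal usage sketch (paths are placeholders; requires a Python environment with
/// ghidriff installed):
/// <code>
/// var diff = await bridge.DiffAsync("/tmp/libfoo-1.0.so", "/tmp/libfoo-1.1.so");
/// var markdown = await bridge.GenerateReportAsync(diff, GhidriffReportFormat.Markdown);
/// </code>
/// </remarks>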
public sealed class GhidriffBridge : IGhidriffBridge
{
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
private readonly GhidriffOptions _options;
private readonly GhidraOptions _ghidraOptions;
private readonly ILogger<GhidriffBridge> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Creates a new GhidriffBridge.
/// </summary>
/// <param name="options">ghidriff options.</param>
/// <param name="ghidraOptions">Ghidra options for path configuration.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="timeProvider">Time provider.</param>
public GhidriffBridge(
IOptions<GhidriffOptions> options,
IOptions<GhidraOptions> ghidraOptions,
ILogger<GhidriffBridge> logger,
TimeProvider timeProvider)
{
_options = options.Value;
_ghidraOptions = ghidraOptions.Value;
_logger = logger;
_timeProvider = timeProvider;
EnsureWorkDirectoryExists();
}
/// <inheritdoc />
public async Task<GhidriffResult> DiffAsync(
string oldBinaryPath,
string newBinaryPath,
GhidriffDiffOptions? options = null,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(oldBinaryPath);
ArgumentException.ThrowIfNullOrEmpty(newBinaryPath);
if (!File.Exists(oldBinaryPath))
{
throw new FileNotFoundException("Old binary not found", oldBinaryPath);
}
if (!File.Exists(newBinaryPath))
{
throw new FileNotFoundException("New binary not found", newBinaryPath);
}
options ??= new GhidriffDiffOptions
{
IncludeDecompilation = _options.DefaultIncludeDecompilation,
IncludeDisassembly = _options.DefaultIncludeDisassembly,
TimeoutSeconds = _options.DefaultTimeoutSeconds
};
_logger.LogInformation(
"Starting ghidriff comparison: {OldBinary} vs {NewBinary}",
Path.GetFileName(oldBinaryPath),
Path.GetFileName(newBinaryPath));
var startTime = _timeProvider.GetUtcNow();
var outputDir = CreateOutputDirectory();
try
{
var args = BuildGhidriffArgs(oldBinaryPath, newBinaryPath, outputDir, options);
var result = await RunPythonAsync("ghidriff", args, options.TimeoutSeconds, ct);
if (result.ExitCode != 0)
{
throw new GhidriffException($"ghidriff failed with exit code {result.ExitCode}")
{
ExitCode = result.ExitCode,
StandardError = result.StandardError,
StandardOutput = result.StandardOutput
};
}
var ghidriffResult = await ParseOutputAsync(
outputDir,
oldBinaryPath,
newBinaryPath,
startTime,
ct);
_logger.LogInformation(
"ghidriff completed: {Added} added, {Removed} removed, {Modified} modified functions",
ghidriffResult.AddedFunctions.Length,
ghidriffResult.RemovedFunctions.Length,
ghidriffResult.ModifiedFunctions.Length);
return ghidriffResult;
}
finally
{
CleanupOutputDirectory(outputDir);
}
}
/// <inheritdoc />
public async Task<GhidriffResult> DiffAsync(
Stream oldBinary,
Stream newBinary,
GhidriffDiffOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(oldBinary);
ArgumentNullException.ThrowIfNull(newBinary);
var oldPath = await SaveStreamToTempFileAsync(oldBinary, "old", ct);
var newPath = await SaveStreamToTempFileAsync(newBinary, "new", ct);
try
{
return await DiffAsync(oldPath, newPath, options, ct);
}
finally
{
TryDeleteFile(oldPath);
TryDeleteFile(newPath);
}
}
/// <inheritdoc />
public Task<string> GenerateReportAsync(
GhidriffResult result,
GhidriffReportFormat format,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(result);
return format switch
{
GhidriffReportFormat.Json => Task.FromResult(GenerateJsonReport(result)),
GhidriffReportFormat.Markdown => Task.FromResult(GenerateMarkdownReport(result)),
GhidriffReportFormat.Html => Task.FromResult(GenerateHtmlReport(result)),
_ => throw new ArgumentOutOfRangeException(nameof(format))
};
}
/// <inheritdoc />
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
{
if (!_options.Enabled)
{
return false;
}
try
{
var result = await RunPythonAsync("ghidriff", ["--version"], timeoutSeconds: 30, ct);
return result.ExitCode == 0;
}
catch (Exception ex)
{
_logger.LogDebug(ex, "ghidriff availability check failed");
return false;
}
}
/// <inheritdoc />
public async Task<string> GetVersionAsync(CancellationToken ct = default)
{
var result = await RunPythonAsync("ghidriff", ["--version"], timeoutSeconds: 30, ct);
if (result.ExitCode != 0)
{
throw new GhidriffException("Failed to get ghidriff version")
{
ExitCode = result.ExitCode,
StandardError = result.StandardError
};
}
return result.StandardOutput.Trim();
}
private void EnsureWorkDirectoryExists()
{
if (!Directory.Exists(_options.WorkDir))
{
Directory.CreateDirectory(_options.WorkDir);
_logger.LogDebug("Created ghidriff work directory: {Path}", _options.WorkDir);
}
}
private string CreateOutputDirectory()
{
var outputDir = Path.Combine(
_options.WorkDir,
$"diff_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}");
Directory.CreateDirectory(outputDir);
return outputDir;
}
private void CleanupOutputDirectory(string outputDir)
{
try
{
if (Directory.Exists(outputDir))
{
Directory.Delete(outputDir, recursive: true);
}
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Failed to cleanup output directory: {Path}", outputDir);
}
}
private string[] BuildGhidriffArgs(
string oldPath,
string newPath,
string outputDir,
GhidriffDiffOptions options)
{
var args = new List<string>
{
oldPath,
newPath,
"--output-dir", outputDir,
"--output-format", "json"
};
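// Illustrative command line as assembled here (flag spellings follow this bridge's
// conventions; verify against the installed ghidriff version):
//   python -m ghidriff old.bin new.bin --output-dir <dir> --output-format json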
var ghidraPath = options.GhidraPath ?? _ghidraOptions.GhidraHome;
if (!string.IsNullOrEmpty(ghidraPath))
{
args.AddRange(["--ghidra-path", ghidraPath]);
}
if (options.IncludeDecompilation)
{
args.Add("--include-decompilation");
}
if (!options.IncludeDisassembly)
{
args.Add("--no-disassembly");
}
foreach (var exclude in options.ExcludeFunctions)
{
args.AddRange(["--exclude", exclude]);
}
if (options.MaxParallelism > 1)
{
args.AddRange(["--parallel", options.MaxParallelism.ToString(CultureInfo.InvariantCulture)]);
}
return [.. args];
}
private async Task<ProcessResult> RunPythonAsync(
string module,
string[] args,
int timeoutSeconds,
CancellationToken ct)
{
var pythonPath = GetPythonPath();
var arguments = $"-m {module} {string.Join(" ", args.Select(QuoteArg))}";
var startInfo = new ProcessStartInfo
{
FileName = pythonPath,
Arguments = arguments,
RedirectStandardOutput = true,
RedirectStandardError = true,
UseShellExecute = false,
CreateNoWindow = true,
StandardOutputEncoding = Encoding.UTF8,
StandardErrorEncoding = Encoding.UTF8
};
_logger.LogDebug("Running: {Python} {Args}", pythonPath, arguments);
using var process = new Process { StartInfo = startInfo };
var stdoutBuilder = new StringBuilder();
var stderrBuilder = new StringBuilder();
process.OutputDataReceived += (_, e) =>
{
if (e.Data is not null)
{
stdoutBuilder.AppendLine(e.Data);
}
};
process.ErrorDataReceived += (_, e) =>
{
if (e.Data is not null)
{
stderrBuilder.AppendLine(e.Data);
}
};
if (!process.Start())
{
throw new GhidriffException("Failed to start Python process");
}
process.BeginOutputReadLine();
process.BeginErrorReadLine();
using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);
try
{
await process.WaitForExitAsync(linkedCts.Token);
}
catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
{
try
{
process.Kill(entireProcessTree: true);
}
catch
{
// Best effort
}
throw new GhidriffException($"ghidriff timed out after {timeoutSeconds} seconds");
}
return new ProcessResult(
process.ExitCode,
stdoutBuilder.ToString(),
stderrBuilder.ToString());
}
private string GetPythonPath()
{
if (!string.IsNullOrEmpty(_options.PythonPath))
{
return _options.PythonPath;
}
// Try to find Python
return RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "python" : "python3";
}
private async Task<GhidriffResult> ParseOutputAsync(
string outputDir,
string oldBinaryPath,
string newBinaryPath,
DateTimeOffset startTime,
CancellationToken ct)
{
var jsonPath = Path.Combine(outputDir, "diff.json");
if (!File.Exists(jsonPath))
{
// Try alternate paths
var jsonFiles = Directory.GetFiles(outputDir, "*.json", SearchOption.AllDirectories);
if (jsonFiles.Length > 0)
{
jsonPath = jsonFiles[0];
}
else
{
_logger.LogWarning("No JSON output found in {OutputDir}", outputDir);
return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime);
}
}
var json = await File.ReadAllTextAsync(jsonPath, ct);
// Calculate hashes
var oldHash = await ComputeFileHashAsync(oldBinaryPath, ct);
var newHash = await ComputeFileHashAsync(newBinaryPath, ct);
return ParseJsonResult(json, oldHash, newHash, oldBinaryPath, newBinaryPath, startTime);
}
private GhidriffResult ParseJsonResult(
string json,
string oldHash,
string newHash,
string oldBinaryPath,
string newBinaryPath,
DateTimeOffset startTime)
{
try
{
var data = JsonSerializer.Deserialize<GhidriffJsonOutput>(json, JsonOptions);
if (data is null)
{
return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime, json);
}
var added = data.AddedFunctions?.Select(f => new GhidriffFunction(
f.Name ?? "unknown",
ParseAddress(f.Address),
f.Size,
f.Signature,
f.DecompiledCode
)).ToImmutableArray() ?? [];
var removed = data.RemovedFunctions?.Select(f => new GhidriffFunction(
f.Name ?? "unknown",
ParseAddress(f.Address),
f.Size,
f.Signature,
f.DecompiledCode
)).ToImmutableArray() ?? [];
var modified = data.ModifiedFunctions?.Select(f => new GhidriffDiff(
f.Name ?? "unknown",
ParseAddress(f.OldAddress),
ParseAddress(f.NewAddress),
f.OldSize,
f.NewSize,
f.OldSignature,
f.NewSignature,
f.Similarity,
f.OldDecompiledCode,
f.NewDecompiledCode,
f.InstructionChanges?.ToImmutableArray() ?? []
)).ToImmutableArray() ?? [];
var duration = _timeProvider.GetUtcNow() - startTime;
var stats = new GhidriffStats(
data.Statistics?.TotalOldFunctions ?? 0,
data.Statistics?.TotalNewFunctions ?? 0,
added.Length,
removed.Length,
modified.Length,
data.Statistics?.UnchangedCount ?? 0,
duration);
return new GhidriffResult(
oldHash,
newHash,
Path.GetFileName(oldBinaryPath),
Path.GetFileName(newBinaryPath),
added,
removed,
modified,
stats,
json);
}
catch (JsonException ex)
{
_logger.LogWarning(ex, "Failed to parse ghidriff JSON output");
return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime, json);
}
}
private GhidriffResult CreateEmptyResult(
string oldBinaryPath,
string newBinaryPath,
DateTimeOffset startTime,
string rawJson = "")
{
var duration = _timeProvider.GetUtcNow() - startTime;
return new GhidriffResult(
"",
"",
Path.GetFileName(oldBinaryPath),
Path.GetFileName(newBinaryPath),
[],
[],
[],
new GhidriffStats(0, 0, 0, 0, 0, 0, duration),
rawJson);
}
private static ulong ParseAddress(string? address)
{
if (string.IsNullOrEmpty(address))
{
return 0;
}
if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
{
address = address[2..];
}
return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
? result
: 0;
}
private static async Task<string> ComputeFileHashAsync(string path, CancellationToken ct)
{
await using var stream = File.OpenRead(path);
var hash = await SHA256.HashDataAsync(stream, ct);
return Convert.ToHexStringLower(hash);
}
private async Task<string> SaveStreamToTempFileAsync(Stream stream, string prefix, CancellationToken ct)
{
var path = Path.Combine(
_options.WorkDir,
$"{prefix}_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
Directory.CreateDirectory(Path.GetDirectoryName(path)!);
await using var fileStream = File.Create(path);
await stream.CopyToAsync(fileStream, ct);
return path;
}
private void TryDeleteFile(string path)
{
try
{
if (File.Exists(path))
{
File.Delete(path);
}
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
}
}
private static string QuoteArg(string arg)
{
if (arg.Contains(' ', StringComparison.Ordinal) || arg.Contains('"', StringComparison.Ordinal))
{
return $"\"{arg.Replace("\"", "\\\"")}\"";
}
return arg;
}
private static string GenerateJsonReport(GhidriffResult result)
{
return JsonSerializer.Serialize(result, new JsonSerializerOptions
{
WriteIndented = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
});
}
private static string GenerateMarkdownReport(GhidriffResult result)
{
var sb = new StringBuilder();
sb.AppendLine($"# Binary Diff Report");
sb.AppendLine();
sb.AppendLine($"**Old Binary:** {result.OldBinaryName} (`{result.OldBinaryHash}`)");
sb.AppendLine($"**New Binary:** {result.NewBinaryName} (`{result.NewBinaryHash}`)");
sb.AppendLine();
sb.AppendLine($"## Summary");
sb.AppendLine();
sb.AppendLine($"| Metric | Count |");
sb.AppendLine($"|--------|-------|");
sb.AppendLine($"| Functions Added | {result.Statistics.AddedCount} |");
sb.AppendLine($"| Functions Removed | {result.Statistics.RemovedCount} |");
sb.AppendLine($"| Functions Modified | {result.Statistics.ModifiedCount} |");
sb.AppendLine($"| Functions Unchanged | {result.Statistics.UnchangedCount} |");
sb.AppendLine();
if (result.AddedFunctions.Length > 0)
{
sb.AppendLine($"## Added Functions");
sb.AppendLine();
foreach (var func in result.AddedFunctions)
{
sb.AppendLine($"- `{func.Name}` at 0x{func.Address:X}");
}
sb.AppendLine();
}
if (result.RemovedFunctions.Length > 0)
{
sb.AppendLine($"## Removed Functions");
sb.AppendLine();
foreach (var func in result.RemovedFunctions)
{
sb.AppendLine($"- `{func.Name}` at 0x{func.Address:X}");
}
sb.AppendLine();
}
if (result.ModifiedFunctions.Length > 0)
{
sb.AppendLine($"## Modified Functions");
sb.AppendLine();
foreach (var func in result.ModifiedFunctions)
{
sb.AppendLine($"### {func.FunctionName}");
sb.AppendLine($"- Similarity: {func.Similarity:P1}");
sb.AppendLine($"- Old: 0x{func.OldAddress:X} ({func.OldSize} bytes)");
sb.AppendLine($"- New: 0x{func.NewAddress:X} ({func.NewSize} bytes)");
sb.AppendLine();
}
}
return sb.ToString();
}
private static string GenerateHtmlReport(GhidriffResult result)
{
var sb = new StringBuilder();
sb.AppendLine("<!DOCTYPE html>");
sb.AppendLine("<html><head><title>Binary Diff Report</title>");
sb.AppendLine("<style>");
sb.AppendLine("body { font-family: sans-serif; margin: 20px; }");
sb.AppendLine("table { border-collapse: collapse; }");
sb.AppendLine("th, td { border: 1px solid #ccc; padding: 8px; }");
sb.AppendLine(".added { background: #d4ffd4; }");
sb.AppendLine(".removed { background: #ffd4d4; }");
sb.AppendLine(".modified { background: #ffffd4; }");
sb.AppendLine("</style>");
sb.AppendLine("</head><body>");
sb.AppendLine($"<h1>Binary Diff Report</h1>");
sb.AppendLine($"<p><strong>Old:</strong> {result.OldBinaryName}</p>");
sb.AppendLine($"<p><strong>New:</strong> {result.NewBinaryName}</p>");
sb.AppendLine($"<table>");
sb.AppendLine($"<tr><th>Metric</th><th>Count</th></tr>");
sb.AppendLine($"<tr class='added'><td>Added</td><td>{result.Statistics.AddedCount}</td></tr>");
sb.AppendLine($"<tr class='removed'><td>Removed</td><td>{result.Statistics.RemovedCount}</td></tr>");
sb.AppendLine($"<tr class='modified'><td>Modified</td><td>{result.Statistics.ModifiedCount}</td></tr>");
sb.AppendLine($"<tr><td>Unchanged</td><td>{result.Statistics.UnchangedCount}</td></tr>");
sb.AppendLine("</table>");
sb.AppendLine("</body></html>");
return sb.ToString();
}
// JSON DTOs
private sealed record ProcessResult(int ExitCode, string StandardOutput, string StandardError);
private sealed record GhidriffJsonOutput
{
public List<GhidriffFunctionJson>? AddedFunctions { get; init; }
public List<GhidriffFunctionJson>? RemovedFunctions { get; init; }
public List<GhidriffDiffJson>? ModifiedFunctions { get; init; }
public GhidriffStatsJson? Statistics { get; init; }
}
private sealed record GhidriffFunctionJson
{
public string? Name { get; init; }
public string? Address { get; init; }
public int Size { get; init; }
public string? Signature { get; init; }
public string? DecompiledCode { get; init; }
}
private sealed record GhidriffDiffJson
{
public string? Name { get; init; }
public string? OldAddress { get; init; }
public string? NewAddress { get; init; }
public int OldSize { get; init; }
public int NewSize { get; init; }
public string? OldSignature { get; init; }
public string? NewSignature { get; init; }
public decimal Similarity { get; init; }
public string? OldDecompiledCode { get; init; }
public string? NewDecompiledCode { get; init; }
public List<string>? InstructionChanges { get; init; }
}
private sealed record GhidriffStatsJson
{
public int TotalOldFunctions { get; init; }
public int TotalNewFunctions { get; init; }
public int AddedCount { get; init; }
public int RemovedCount { get; init; }
public int ModifiedCount { get; init; }
public int UnchangedCount { get; init; }
}
}

View File

@@ -0,0 +1,432 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Globalization;
using System.Security.Cryptography;
using System.Text.Json;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
namespace StellaOps.BinaryIndex.Ghidra;
/// <summary>
/// Implementation of <see cref="IVersionTrackingService"/> using Ghidra Version Tracking.
/// </summary>
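/// <remarks>
/// Minimal usage sketch (paths are placeholders; relies on a VersionTracking.java
/// script being available in the configured scripts directory):
/// <code>
/// var tracking = await versionTracker.TrackVersionsAsync(oldPath, newPath);
/// foreach (var match in tracking.Matches)
/// {
///     // inspect similarity, correlator type, and per-instruction differences
/// }
/// </code>
/// </remarks>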
public sealed class VersionTrackingService : IVersionTrackingService
{
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
private readonly GhidraHeadlessManager _headlessManager;
private readonly GhidraOptions _options;
private readonly ILogger<VersionTrackingService> _logger;
private readonly TimeProvider _timeProvider;
/// <summary>
/// Creates a new VersionTrackingService.
/// </summary>
/// <param name="headlessManager">The Ghidra Headless manager.</param>
/// <param name="options">Ghidra options.</param>
/// <param name="logger">Logger instance.</param>
/// <param name="timeProvider">Time provider.</param>
public VersionTrackingService(
GhidraHeadlessManager headlessManager,
IOptions<GhidraOptions> options,
ILogger<VersionTrackingService> logger,
TimeProvider timeProvider)
{
_headlessManager = headlessManager;
_options = options.Value;
_logger = logger;
_timeProvider = timeProvider;
}
/// <inheritdoc />
public async Task<VersionTrackingResult> TrackVersionsAsync(
Stream oldBinary,
Stream newBinary,
VersionTrackingOptions? options = null,
CancellationToken ct = default)
{
ArgumentNullException.ThrowIfNull(oldBinary);
ArgumentNullException.ThrowIfNull(newBinary);
var oldPath = await SaveStreamToTempFileAsync(oldBinary, "old", ct);
var newPath = await SaveStreamToTempFileAsync(newBinary, "new", ct);
try
{
return await TrackVersionsAsync(oldPath, newPath, options, ct);
}
finally
{
TryDeleteFile(oldPath);
TryDeleteFile(newPath);
}
}
/// <inheritdoc />
public async Task<VersionTrackingResult> TrackVersionsAsync(
string oldBinaryPath,
string newBinaryPath,
VersionTrackingOptions? options = null,
CancellationToken ct = default)
{
ArgumentException.ThrowIfNullOrEmpty(oldBinaryPath);
ArgumentException.ThrowIfNullOrEmpty(newBinaryPath);
if (!File.Exists(oldBinaryPath))
{
throw new FileNotFoundException("Old binary not found", oldBinaryPath);
}
if (!File.Exists(newBinaryPath))
{
throw new FileNotFoundException("New binary not found", newBinaryPath);
}
options ??= new VersionTrackingOptions();
_logger.LogInformation(
"Starting Version Tracking: {OldBinary} vs {NewBinary}",
Path.GetFileName(oldBinaryPath),
Path.GetFileName(newBinaryPath));
var startTime = _timeProvider.GetUtcNow();
// Build script arguments for Version Tracking
var scriptArgs = BuildVersionTrackingArgs(oldBinaryPath, newBinaryPath, options);
// Run Ghidra with Version Tracking script
// Note: This assumes a custom VersionTracking.java script that outputs JSON
var result = await _headlessManager.RunAnalysisAsync(
oldBinaryPath,
scriptName: "VersionTracking.java",
scriptArgs: scriptArgs,
runAnalysis: true,
timeoutSeconds: options.TimeoutSeconds,
ct);
if (!result.IsSuccess)
{
throw new GhidraException($"Version Tracking failed: {result.StandardError}")
{
ExitCode = result.ExitCode,
StandardError = result.StandardError,
StandardOutput = result.StandardOutput
};
}
var trackingResult = ParseVersionTrackingOutput(
result.StandardOutput,
startTime,
result.Duration);
_logger.LogInformation(
"Version Tracking completed: {Matched} matched, {Added} added, {Removed} removed, {Modified} modified",
trackingResult.Matches.Length,
trackingResult.AddedFunctions.Length,
trackingResult.RemovedFunctions.Length,
trackingResult.ModifiedFunctions.Length);
return trackingResult;
}
private static string[] BuildVersionTrackingArgs(
string oldBinaryPath,
string newBinaryPath,
VersionTrackingOptions options)
{
var args = new List<string>
{
"-newBinary", newBinaryPath,
"-minSimilarity", options.MinSimilarity.ToString("F2", CultureInfo.InvariantCulture)
};
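// Illustrative script argument vector (consumed by the VersionTracking.java script
// rather than by analyzeHeadless itself):
//   -newBinary /tmp/new.bin -minSimilarity 0.70 -correlator:SymbolNameMatch -decompile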
// Add correlator flags
foreach (var correlator in options.Correlators)
{
args.Add($"-correlator:{GetCorrelatorName(correlator)}");
}
if (options.IncludeDecompilation)
{
args.Add("-decompile");
}
if (options.ComputeDetailedDiffs)
{
args.Add("-detailedDiffs");
}
return [.. args];
}
private static string GetCorrelatorName(CorrelatorType correlator)
{
return correlator switch
{
CorrelatorType.ExactBytes => "ExactBytesFunctionHasher",
CorrelatorType.ExactMnemonics => "ExactMnemonicsFunctionHasher",
CorrelatorType.SymbolName => "SymbolNameMatch",
CorrelatorType.DataReference => "DataReferenceCorrelator",
CorrelatorType.CallReference => "CallReferenceCorrelator",
CorrelatorType.CombinedReference => "CombinedReferenceCorrelator",
CorrelatorType.BSim => "BSimCorrelator",
_ => "CombinedReferenceCorrelator"
};
}
private VersionTrackingResult ParseVersionTrackingOutput(
string output,
DateTimeOffset startTime,
TimeSpan duration)
{
// Look for JSON output marker
const string jsonMarker = "###VERSION_TRACKING_JSON###";
var jsonStart = output.IndexOf(jsonMarker, StringComparison.Ordinal);
if (jsonStart >= 0)
{
var jsonContent = output[(jsonStart + jsonMarker.Length)..].Trim();
var jsonEnd = jsonContent.IndexOf("###END_VERSION_TRACKING_JSON###", StringComparison.Ordinal);
if (jsonEnd >= 0)
{
jsonContent = jsonContent[..jsonEnd].Trim();
}
try
{
return ParseJsonOutput(jsonContent, duration);
}
catch (JsonException ex)
{
_logger.LogWarning(ex, "Failed to parse Version Tracking JSON output");
}
}
// Return empty result if parsing fails
_logger.LogWarning("No structured Version Tracking output found");
return CreateEmptyResult(duration);
}
private static VersionTrackingResult ParseJsonOutput(string json, TimeSpan duration)
{
var data = JsonSerializer.Deserialize<VersionTrackingJsonOutput>(json, JsonOptions)
?? throw new GhidraException("Failed to deserialize Version Tracking JSON output");
var matches = data.Matches?.Select(m => new FunctionMatch(
m.OldName ?? "unknown",
ParseAddress(m.OldAddress),
m.NewName ?? "unknown",
ParseAddress(m.NewAddress),
m.Similarity,
ParseCorrelatorType(m.MatchedBy),
m.Differences?.Select(d => new MatchDifference(
ParseDifferenceType(d.Type),
d.Description ?? "",
d.OldValue,
d.NewValue,
d.Address is not null ? ParseAddress(d.Address) : null
)).ToImmutableArray() ?? []
)).ToImmutableArray() ?? [];
var added = data.AddedFunctions?.Select(f => new FunctionAdded(
f.Name ?? "unknown",
ParseAddress(f.Address),
f.Size,
f.Signature
)).ToImmutableArray() ?? [];
var removed = data.RemovedFunctions?.Select(f => new FunctionRemoved(
f.Name ?? "unknown",
ParseAddress(f.Address),
f.Size,
f.Signature
)).ToImmutableArray() ?? [];
var modified = data.ModifiedFunctions?.Select(f => new FunctionModified(
f.OldName ?? "unknown",
ParseAddress(f.OldAddress),
f.OldSize,
f.NewName ?? "unknown",
ParseAddress(f.NewAddress),
f.NewSize,
f.Similarity,
f.Differences?.Select(d => new MatchDifference(
ParseDifferenceType(d.Type),
d.Description ?? "",
d.OldValue,
d.NewValue,
d.Address is not null ? ParseAddress(d.Address) : null
)).ToImmutableArray() ?? [],
f.OldDecompiled,
f.NewDecompiled
)).ToImmutableArray() ?? [];
var stats = new VersionTrackingStats(
data.Statistics?.TotalOldFunctions ?? 0,
data.Statistics?.TotalNewFunctions ?? 0,
matches.Length,
added.Length,
removed.Length,
modified.Length,
duration);
return new VersionTrackingResult(matches, added, removed, modified, stats);
}
private static VersionTrackingResult CreateEmptyResult(TimeSpan duration)
{
return new VersionTrackingResult(
[],
[],
[],
[],
new VersionTrackingStats(0, 0, 0, 0, 0, 0, duration));
}
private static ulong ParseAddress(string? address)
{
if (string.IsNullOrEmpty(address))
{
return 0;
}
if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
{
address = address[2..];
}
return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
? result
: 0;
}
private static CorrelatorType ParseCorrelatorType(string? correlator)
{
return correlator?.ToUpperInvariant() switch
{
"EXACTBYTES" or "EXACTBYTESFUNCTIONHASHER" => CorrelatorType.ExactBytes,
"EXACTMNEMONICS" or "EXACTMNEMONICSFUNCTIONHASHER" => CorrelatorType.ExactMnemonics,
"SYMBOLNAME" or "SYMBOLNAMEMATCH" => CorrelatorType.SymbolName,
"DATAREFERENCE" or "DATAREFERENCECORRELATOR" => CorrelatorType.DataReference,
"CALLREFERENCE" or "CALLREFERENCECORRELATOR" => CorrelatorType.CallReference,
"COMBINEDREFERENCE" or "COMBINEDREFERENCECORRELATOR" => CorrelatorType.CombinedReference,
"BSIM" or "BSIMCORRELATOR" => CorrelatorType.BSim,
_ => CorrelatorType.CombinedReference
};
}
private static DifferenceType ParseDifferenceType(string? type)
{
return type?.ToUpperInvariant() switch
{
"INSTRUCTIONADDED" => DifferenceType.InstructionAdded,
"INSTRUCTIONREMOVED" => DifferenceType.InstructionRemoved,
"INSTRUCTIONCHANGED" => DifferenceType.InstructionChanged,
"BRANCHTARGETCHANGED" => DifferenceType.BranchTargetChanged,
"CALLTARGETCHANGED" => DifferenceType.CallTargetChanged,
"CONSTANTCHANGED" => DifferenceType.ConstantChanged,
"SIZECHANGED" => DifferenceType.SizeChanged,
"STACKFRAMECHANGED" => DifferenceType.StackFrameChanged,
"REGISTERUSAGECHANGED" => DifferenceType.RegisterUsageChanged,
_ => DifferenceType.InstructionChanged
};
}
private async Task<string> SaveStreamToTempFileAsync(Stream stream, string prefix, CancellationToken ct)
{
var path = Path.Combine(
_options.WorkDir,
$"{prefix}_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
Directory.CreateDirectory(Path.GetDirectoryName(path)!);
await using var fileStream = File.Create(path);
await stream.CopyToAsync(fileStream, ct);
return path;
}
private void TryDeleteFile(string path)
{
try
{
if (File.Exists(path))
{
File.Delete(path);
}
}
catch (Exception ex)
{
_logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
}
}
// JSON DTOs for deserialization
private sealed record VersionTrackingJsonOutput
{
public List<FunctionMatchJson>? Matches { get; init; }
public List<FunctionInfoJson>? AddedFunctions { get; init; }
public List<FunctionInfoJson>? RemovedFunctions { get; init; }
public List<FunctionModifiedJson>? ModifiedFunctions { get; init; }
public VersionTrackingStatsJson? Statistics { get; init; }
}
private sealed record FunctionMatchJson
{
public string? OldName { get; init; }
public string? OldAddress { get; init; }
public string? NewName { get; init; }
public string? NewAddress { get; init; }
public decimal Similarity { get; init; }
public string? MatchedBy { get; init; }
public List<DifferenceJson>? Differences { get; init; }
}
private sealed record FunctionInfoJson
{
public string? Name { get; init; }
public string? Address { get; init; }
public int Size { get; init; }
public string? Signature { get; init; }
}
private sealed record FunctionModifiedJson
{
public string? OldName { get; init; }
public string? OldAddress { get; init; }
public int OldSize { get; init; }
public string? NewName { get; init; }
public string? NewAddress { get; init; }
public int NewSize { get; init; }
public decimal Similarity { get; init; }
public List<DifferenceJson>? Differences { get; init; }
public string? OldDecompiled { get; init; }
public string? NewDecompiled { get; init; }
}
private sealed record DifferenceJson
{
public string? Type { get; init; }
public string? Description { get; init; }
public string? OldValue { get; init; }
public string? NewValue { get; init; }
public string? Address { get; init; }
}
private sealed record VersionTrackingStatsJson
{
public int TotalOldFunctions { get; init; }
public int TotalNewFunctions { get; init; }
public int MatchedCount { get; init; }
public int AddedCount { get; init; }
public int RemovedCount { get; init; }
public int ModifiedCount { get; init; }
}
}

View File

@@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>preview</LangVersion>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Description>Ghidra integration for StellaOps BinaryIndex. Provides Version Tracking, BSim, and ghidriff capabilities as a fallback disassembly backend.</Description>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
<ProjectReference Include="..\StellaOps.BinaryIndex.Contracts\StellaOps.BinaryIndex.Contracts.csproj" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
<PackageReference Include="Microsoft.Extensions.Options" />
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
<PackageReference Include="Microsoft.Extensions.Options.DataAnnotations" />
</ItemGroup>
</Project>
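The Options package references above suggest configuration-bound, validated settings. A minimal wiring sketch under stated assumptions: GhidraOptions and the "BinaryIndex:Ghidra" section name are hypothetical and do not appear in this diff.
// Hypothetical registration sketch; GhidraOptions and the section name are assumptions.
public static IServiceCollection AddGhidraBackend(this IServiceCollection services, IConfiguration configuration)
{
    services.AddOptions<GhidraOptions>()
        .Bind(configuration.GetSection("BinaryIndex:Ghidra"))
        .ValidateDataAnnotations() // honors [Required], [Range], etc. on the options type
        .ValidateOnStart();        // fail fast at host startup rather than on first use
    return services;
}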

View File

@@ -0,0 +1,269 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
using System.Text.RegularExpressions;
namespace StellaOps.BinaryIndex.ML;
/// <summary>
/// Tokenizer for binary/decompiled code using regex-based word-level tokenization with CodeBERT-style special tokens.
/// </summary>
public sealed partial class BinaryCodeTokenizer : ITokenizer
{
private readonly ImmutableDictionary<string, long> _vocabulary;
private readonly long _padToken;
private readonly long _unkToken;
private readonly long _clsToken;
private readonly long _sepToken;
// Fallback special-token IDs (CodeBERT-style <pad>/<unk>/<cls>/<sep> specials), used when no vocabulary file supplies them
private const long DefaultPadToken = 0;
private const long DefaultUnkToken = 1;
private const long DefaultClsToken = 2;
private const long DefaultSepToken = 3;
public BinaryCodeTokenizer(string? vocabularyPath = null)
{
if (!string.IsNullOrEmpty(vocabularyPath) && File.Exists(vocabularyPath))
{
_vocabulary = LoadVocabulary(vocabularyPath);
_padToken = _vocabulary.GetValueOrDefault("<pad>", DefaultPadToken);
_unkToken = _vocabulary.GetValueOrDefault("<unk>", DefaultUnkToken);
_clsToken = _vocabulary.GetValueOrDefault("<cls>", DefaultClsToken);
_sepToken = _vocabulary.GetValueOrDefault("<sep>", DefaultSepToken);
}
else
{
// Use default vocabulary for testing
_vocabulary = CreateDefaultVocabulary();
_padToken = DefaultPadToken;
_unkToken = DefaultUnkToken;
_clsToken = DefaultClsToken;
_sepToken = DefaultSepToken;
}
}
/// <inheritdoc />
public long[] Tokenize(string text, int maxLength = 512)
{
var (inputIds, _) = TokenizeWithMask(text, maxLength);
return inputIds;
}
/// <inheritdoc />
public (long[] InputIds, long[] AttentionMask) TokenizeWithMask(string text, int maxLength = 512)
{
ArgumentException.ThrowIfNullOrEmpty(text);
var tokens = TokenizeText(text);
var inputIds = new long[maxLength];
var attentionMask = new long[maxLength];
// Add [CLS] token
inputIds[0] = _clsToken;
attentionMask[0] = 1;
var position = 1;
foreach (var token in tokens)
{
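// Stop one slot early so there is always room for the trailing <sep> token.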
if (position >= maxLength - 1)
{
break;
}
inputIds[position] = _vocabulary.GetValueOrDefault(token.ToLowerInvariant(), _unkToken);
attentionMask[position] = 1;
position++;
}
// Add [SEP] token
if (position < maxLength)
{
inputIds[position] = _sepToken;
attentionMask[position] = 1;
position++;
}
// Pad remaining positions
for (var i = position; i < maxLength; i++)
{
inputIds[i] = _padToken;
attentionMask[i] = 0;
}
return (inputIds, attentionMask);
}
/// <inheritdoc />
public string Decode(long[] tokenIds)
{
ArgumentNullException.ThrowIfNull(tokenIds);
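// Note: the reverse map is rebuilt on every call; cache it if Decode is used on a hot path.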
var reverseVocab = _vocabulary.ToImmutableDictionary(kv => kv.Value, kv => kv.Key);
var tokens = new List<string>();
foreach (var id in tokenIds)
{
if (id == _padToken || id == _clsToken || id == _sepToken)
{
continue;
}
tokens.Add(reverseVocab.GetValueOrDefault(id, "<unk>"));
}
return string.Join(" ", tokens);
}
private IEnumerable<string> TokenizeText(string text)
{
// Normalize whitespace
text = WhitespaceRegex().Replace(text, " ");
// Split on operators and punctuation, keeping them as tokens
var tokens = new List<string>();
var matches = TokenRegex().Matches(text);
foreach (Match match in matches)
{
var token = match.Value.Trim();
if (!string.IsNullOrEmpty(token))
{
tokens.Add(token);
}
}
return tokens;
}
private static ImmutableDictionary<string, long> LoadVocabulary(string path)
{
var vocabulary = new Dictionary<string, long>();
var lines = File.ReadAllLines(path);
for (var i = 0; i < lines.Length; i++)
{
var token = lines[i].Trim();
if (!string.IsNullOrEmpty(token))
{
vocabulary[token] = i;
}
}
return vocabulary.ToImmutableDictionary();
}
private static ImmutableDictionary<string, long> CreateDefaultVocabulary()
{
// Basic vocabulary for testing without model
var vocab = new Dictionary<string, long>
{
// Special tokens
["<pad>"] = 0,
["<unk>"] = 1,
["<cls>"] = 2,
["<sep>"] = 3,
// Keywords
["void"] = 10,
["int"] = 11,
["char"] = 12,
["short"] = 13,
["long"] = 14,
["float"] = 15,
["double"] = 16,
["unsigned"] = 17,
["signed"] = 18,
["const"] = 19,
["static"] = 20,
["extern"] = 21,
["return"] = 22,
["if"] = 23,
["else"] = 24,
["while"] = 25,
["for"] = 26,
["do"] = 27,
["switch"] = 28,
["case"] = 29,
["default"] = 30,
["break"] = 31,
["continue"] = 32,
["goto"] = 33,
["sizeof"] = 34,
["struct"] = 35,
["union"] = 36,
["enum"] = 37,
["typedef"] = 38,
// Operators
["+"] = 50,
["-"] = 51,
["*"] = 52,
["/"] = 53,
["%"] = 54,
["="] = 55,
["=="] = 56,
["!="] = 57,
["<"] = 58,
[">"] = 59,
["<="] = 60,
[">="] = 61,
["&&"] = 62,
["||"] = 63,
["!"] = 64,
["&"] = 65,
["|"] = 66,
["^"] = 67,
["~"] = 68,
["<<"] = 69,
[">>"] = 70,
["++"] = 71,
["--"] = 72,
["->"] = 73,
["."] = 74,
// Punctuation
["("] = 80,
[")"] = 81,
["{"] = 82,
["}"] = 83,
["["] = 84,
["]"] = 85,
[";"] = 86,
[","] = 87,
[":"] = 88,
// Common Ghidra types
["undefined"] = 100,
["undefined1"] = 101,
["undefined2"] = 102,
["undefined4"] = 103,
["undefined8"] = 104,
["byte"] = 105,
["word"] = 106,
["dword"] = 107,
["qword"] = 108,
["bool"] = 109,
// Common functions
["malloc"] = 200,
["free"] = 201,
["memcpy"] = 202,
["memset"] = 203,
["strlen"] = 204,
["strcpy"] = 205,
["strcmp"] = 206,
["printf"] = 207,
["sprintf"] = 208
};
return vocab.ToImmutableDictionary();
}
[GeneratedRegex(@"\s+")]
private static partial Regex WhitespaceRegex();
[GeneratedRegex(@"([a-zA-Z_][a-zA-Z0-9_]*|0[xX][0-9a-fA-F]+|\d+|""[^""]*""|'[^']*'|[+\-*/%=<>!&|^~]+|[(){}\[\];,.:])")]
private static partial Regex TokenRegex();
}
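A quick round-trip sketch against the built-in default vocabulary (no vocabulary file supplied); identifiers and numeric literals outside that small vocabulary decode as <unk>:
var tokenizer = new BinaryCodeTokenizer();
var (inputIds, attentionMask) = tokenizer.TokenizeWithMask("if (x == 0) { return malloc(16); }", maxLength: 32);
// inputIds[0] holds <cls>; attentionMask is 1 for real tokens and 0 for padding.
Console.WriteLine(tokenizer.Decode(inputIds));
// => "if ( <unk> == <unk> ) { return malloc ( <unk> ) ; }"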

View File

@@ -0,0 +1,174 @@
// Copyright (c) StellaOps. All rights reserved.
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
using System.Collections.Immutable;
namespace StellaOps.BinaryIndex.ML;
/// <summary>
/// Service for generating and comparing function embeddings.
/// </summary>
public interface IEmbeddingService
{
/// <summary>
/// Generate embedding vector for a function.
/// </summary>
/// <param name="input">Function input data.</param>
/// <param name="options">Embedding options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Function embedding with vector.</returns>
Task<FunctionEmbedding> GenerateEmbeddingAsync(
EmbeddingInput input,
EmbeddingOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Generate embeddings for multiple functions in batch.
/// </summary>
/// <param name="inputs">Function inputs.</param>
/// <param name="options">Embedding options.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Function embeddings.</returns>
Task<ImmutableArray<FunctionEmbedding>> GenerateBatchAsync(
IEnumerable<EmbeddingInput> inputs,
EmbeddingOptions? options = null,
CancellationToken ct = default);
/// <summary>
/// Compute similarity between two embeddings.
/// </summary>
/// <param name="a">First embedding.</param>
/// <param name="b">Second embedding.</param>
/// <param name="metric">Similarity metric to use.</param>
/// <returns>Similarity score (0.0 to 1.0).</returns>
decimal ComputeSimilarity(
FunctionEmbedding a,
FunctionEmbedding b,
SimilarityMetric metric = SimilarityMetric.Cosine);
/// <summary>
/// Find similar functions in an embedding index.
/// </summary>
/// <param name="query">Query embedding.</param>
/// <param name="topK">Number of results to return.</param>
/// <param name="minSimilarity">Minimum similarity threshold.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Matching functions sorted by similarity.</returns>
Task<ImmutableArray<EmbeddingMatch>> FindSimilarAsync(
FunctionEmbedding query,
int topK = 10,
decimal minSimilarity = 0.7m,
CancellationToken ct = default);
}
/// <summary>
/// Service for training ML models.
/// </summary>
public interface IModelTrainingService
{
/// <summary>
/// Train embedding model on function pairs.
/// </summary>
/// <param name="trainingData">Training pairs.</param>
/// <param name="options">Training options.</param>
/// <param name="progress">Optional progress reporter.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Training result.</returns>
Task<TrainingResult> TrainAsync(
IAsyncEnumerable<TrainingPair> trainingData,
TrainingOptions options,
IProgress<TrainingProgress>? progress = null,
CancellationToken ct = default);
/// <summary>
/// Evaluate model on test data.
/// </summary>
/// <param name="testData">Test pairs.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Evaluation metrics.</returns>
Task<EvaluationResult> EvaluateAsync(
IAsyncEnumerable<TrainingPair> testData,
CancellationToken ct = default);
/// <summary>
/// Export trained model to specified format.
/// </summary>
/// <param name="outputPath">Output path for model.</param>
/// <param name="format">Export format.</param>
/// <param name="ct">Cancellation token.</param>
Task ExportModelAsync(
string outputPath,
ModelExportFormat format = ModelExportFormat.Onnx,
CancellationToken ct = default);
}
/// <summary>
/// Tokenizer for converting code to token sequences.
/// </summary>
public interface ITokenizer
{
/// <summary>
/// Tokenize text into token IDs.
/// </summary>
/// <param name="text">Input text.</param>
/// <param name="maxLength">Maximum sequence length.</param>
/// <returns>Token ID array.</returns>
long[] Tokenize(string text, int maxLength = 512);
/// <summary>
/// Tokenize with attention mask.
/// </summary>
/// <param name="text">Input text.</param>
/// <param name="maxLength">Maximum sequence length.</param>
/// <returns>Token IDs and attention mask.</returns>
(long[] InputIds, long[] AttentionMask) TokenizeWithMask(string text, int maxLength = 512);
/// <summary>
/// Decode token IDs back to text.
/// </summary>
/// <param name="tokenIds">Token IDs.</param>
/// <returns>Decoded text.</returns>
string Decode(long[] tokenIds);
}
/// <summary>
/// Index for efficient embedding similarity search.
/// </summary>
public interface IEmbeddingIndex
{
/// <summary>
/// Add embedding to index.
/// </summary>
/// <param name="embedding">Embedding to add.</param>
/// <param name="ct">Cancellation token.</param>
Task AddAsync(FunctionEmbedding embedding, CancellationToken ct = default);
/// <summary>
/// Add multiple embeddings to index.
/// </summary>
/// <param name="embeddings">Embeddings to add.</param>
/// <param name="ct">Cancellation token.</param>
Task AddBatchAsync(IEnumerable<FunctionEmbedding> embeddings, CancellationToken ct = default);
/// <summary>
/// Search for similar embeddings.
/// </summary>
/// <param name="query">Query vector.</param>
/// <param name="topK">Number of results.</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>Similar embeddings with scores.</returns>
Task<ImmutableArray<(FunctionEmbedding Embedding, decimal Similarity)>> SearchAsync(
float[] query,
int topK,
CancellationToken ct = default);
/// <summary>
/// Get total count of indexed embeddings.
/// </summary>
int Count { get; }
/// <summary>
/// Clear all embeddings from index.
/// </summary>
void Clear();
}
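For illustration, a minimal brute-force in-memory realization of IEmbeddingIndex (linear scan with cosine similarity; a production index would use an ANN structure such as HNSW). FunctionEmbedding's definition is not part of this diff, so the float[] Vector property below is an assumption:
// Illustrative sketch only; assumes FunctionEmbedding exposes a float[] Vector property.
public sealed class InMemoryEmbeddingIndex : IEmbeddingIndex
{
    private readonly List<FunctionEmbedding> _items = new();

    public int Count => _items.Count;

    public Task AddAsync(FunctionEmbedding embedding, CancellationToken ct = default)
    {
        _items.Add(embedding);
        return Task.CompletedTask;
    }

    public Task AddBatchAsync(IEnumerable<FunctionEmbedding> embeddings, CancellationToken ct = default)
    {
        _items.AddRange(embeddings);
        return Task.CompletedTask;
    }

    public Task<ImmutableArray<(FunctionEmbedding Embedding, decimal Similarity)>> SearchAsync(
        float[] query, int topK, CancellationToken ct = default)
    {
        ct.ThrowIfCancellationRequested();
        var results = _items
            .Select(e => (Embedding: e, Similarity: (decimal)Cosine(query, e.Vector)))
            .OrderByDescending(r => r.Similarity)
            .Take(topK)
            .ToImmutableArray();
        return Task.FromResult(results);
    }

    public void Clear() => _items.Clear();

    private static float Cosine(float[] a, float[] b)
    {
        float dot = 0f, normA = 0f, normB = 0f;
        for (var i = 0; i < a.Length; i++)
        {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        // Small epsilon guards against division by zero on degenerate vectors.
        return dot / (MathF.Sqrt(normA) * MathF.Sqrt(normB) + 1e-12f);
    }
}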

Some files were not shown because too many files have changed in this diff.