Merge branch 'main' of https://git.stella-ops.org/stella-ops.org/git.stella-ops.org
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Builders;
|
||||
|
||||
/// <summary>
|
||||
@@ -109,6 +111,12 @@ public sealed record FunctionFingerprint
|
||||
/// Source line number if debug info available.
|
||||
/// </summary>
|
||||
public int? SourceLine { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Semantic fingerprint for enhanced similarity comparison.
|
||||
/// Uses IR-level analysis for resilience to compiler optimizations.
|
||||
/// </summary>
|
||||
public Semantic.SemanticFingerprint? SemanticFingerprint { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -192,25 +192,42 @@ public sealed record HashWeights
|
||||
/// <summary>
|
||||
/// Weight for basic block hash comparison.
|
||||
/// </summary>
|
||||
public decimal BasicBlockWeight { get; init; } = 0.5m;
|
||||
public decimal BasicBlockWeight { get; init; } = 0.4m;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for CFG hash comparison.
|
||||
/// </summary>
|
||||
public decimal CfgWeight { get; init; } = 0.3m;
|
||||
public decimal CfgWeight { get; init; } = 0.25m;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for string refs hash comparison.
|
||||
/// </summary>
|
||||
public decimal StringRefsWeight { get; init; } = 0.2m;
|
||||
public decimal StringRefsWeight { get; init; } = 0.15m;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for semantic fingerprint comparison.
|
||||
/// Only used when both fingerprints have semantic data.
|
||||
/// </summary>
|
||||
public decimal SemanticWeight { get; init; } = 0.2m;
|
||||
|
||||
/// <summary>
|
||||
/// Default weights.
|
||||
/// </summary>
|
||||
public static HashWeights Default => new();
|
||||
|
||||
/// <summary>
|
||||
/// Weights without semantic analysis (traditional mode).
|
||||
/// </summary>
|
||||
public static HashWeights Traditional => new()
|
||||
{
|
||||
BasicBlockWeight = 0.5m,
|
||||
CfgWeight = 0.3m,
|
||||
StringRefsWeight = 0.2m,
|
||||
SemanticWeight = 0.0m
|
||||
};
|
||||
|
||||
/// <summary>
|
||||
/// Validates that weights sum to 1.0.
|
||||
/// </summary>
|
||||
public bool IsValid => Math.Abs(BasicBlockWeight + CfgWeight + StringRefsWeight - 1.0m) < 0.001m;
|
||||
public bool IsValid => Math.Abs(BasicBlockWeight + CfgWeight + StringRefsWeight + SemanticWeight - 1.0m) < 0.001m;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Builders;
|
||||
|
||||
@@ -202,6 +203,16 @@ public sealed class PatchDiffEngine : IPatchDiffEngine
|
||||
matchedWeight += weights.StringRefsWeight;
|
||||
}
|
||||
|
||||
// Include semantic fingerprint similarity if available
|
||||
if (weights.SemanticWeight > 0 &&
|
||||
a.SemanticFingerprint is not null &&
|
||||
b.SemanticFingerprint is not null)
|
||||
{
|
||||
totalWeight += weights.SemanticWeight;
|
||||
var semanticSimilarity = ComputeSemanticSimilarity(a.SemanticFingerprint, b.SemanticFingerprint);
|
||||
matchedWeight += weights.SemanticWeight * semanticSimilarity;
|
||||
}
|
||||
|
||||
// Size similarity bonus (if sizes are within 10%, add small bonus)
|
||||
if (a.Size > 0 && b.Size > 0)
|
||||
{
|
||||
@@ -216,6 +227,86 @@ public sealed class PatchDiffEngine : IPatchDiffEngine
|
||||
return totalWeight > 0 ? matchedWeight / totalWeight : 0m;
|
||||
}
|
||||
|
||||
private static decimal ComputeSemanticSimilarity(
|
||||
Semantic.SemanticFingerprint a,
|
||||
Semantic.SemanticFingerprint b)
|
||||
{
|
||||
// Check for exact hash match first
|
||||
if (a.HashEquals(b))
|
||||
{
|
||||
return 1.0m;
|
||||
}
|
||||
|
||||
// Compute weighted similarity from components
|
||||
decimal graphSim = ComputeHashSimilarity(a.GraphHash, b.GraphHash);
|
||||
decimal opSim = ComputeHashSimilarity(a.OperationHash, b.OperationHash);
|
||||
decimal dfSim = ComputeHashSimilarity(a.DataFlowHash, b.DataFlowHash);
|
||||
decimal apiSim = ComputeApiCallSimilarity(a.ApiCalls, b.ApiCalls);
|
||||
|
||||
// Weights: graph structure 40%, operation sequence 25%, data flow 20%, API calls 15%
|
||||
return (graphSim * 0.40m) + (opSim * 0.25m) + (dfSim * 0.20m) + (apiSim * 0.15m);
|
||||
}
|
||||
|
||||
private static decimal ComputeHashSimilarity(byte[] hashA, byte[] hashB)
|
||||
{
|
||||
if (hashA.Length == 0 || hashB.Length == 0)
|
||||
{
|
||||
return 0m;
|
||||
}
|
||||
|
||||
if (hashA.AsSpan().SequenceEqual(hashB))
|
||||
{
|
||||
return 1.0m;
|
||||
}
|
||||
|
||||
// Count matching bits (Hamming similarity)
|
||||
int matchingBits = 0;
|
||||
int totalBits = hashA.Length * 8;
|
||||
int len = Math.Min(hashA.Length, hashB.Length);
|
||||
|
||||
for (int i = 0; i < len; i++)
|
||||
{
|
||||
byte xor = (byte)(hashA[i] ^ hashB[i]);
|
||||
matchingBits += 8 - PopCount(xor);
|
||||
}
|
||||
|
||||
return (decimal)matchingBits / totalBits;
|
||||
}
|
||||
|
||||
private static int PopCount(byte value)
|
||||
{
|
||||
int count = 0;
|
||||
while (value != 0)
|
||||
{
|
||||
count += value & 1;
|
||||
value >>= 1;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private static decimal ComputeApiCallSimilarity(
|
||||
System.Collections.Immutable.ImmutableArray<string> apiCallsA,
|
||||
System.Collections.Immutable.ImmutableArray<string> apiCallsB)
|
||||
{
|
||||
if (apiCallsA.IsEmpty && apiCallsB.IsEmpty)
|
||||
{
|
||||
return 1.0m;
|
||||
}
|
||||
|
||||
if (apiCallsA.IsEmpty || apiCallsB.IsEmpty)
|
||||
{
|
||||
return 0.0m;
|
||||
}
|
||||
|
||||
var setA = new HashSet<string>(apiCallsA, StringComparer.Ordinal);
|
||||
var setB = new HashSet<string>(apiCallsB, StringComparer.Ordinal);
|
||||
|
||||
var intersection = setA.Intersect(setB).Count();
|
||||
var union = setA.Union(setB).Count();
|
||||
|
||||
return union > 0 ? (decimal)intersection / union : 0m;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public IReadOnlyDictionary<string, string> FindFunctionMappings(
|
||||
IReadOnlyList<FunctionFingerprint> vulnerable,
|
||||
|
||||
@@ -20,5 +20,6 @@
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="../StellaOps.BinaryIndex.Core/StellaOps.BinaryIndex.Core.csproj" />
|
||||
<ProjectReference Include="../StellaOps.BinaryIndex.Fingerprints/StellaOps.BinaryIndex.Fingerprints.csproj" />
|
||||
<ProjectReference Include="../StellaOps.BinaryIndex.Semantic/StellaOps.BinaryIndex.Semantic.csproj" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
|
||||
@@ -510,6 +510,27 @@ public sealed class CachedBinaryVulnerabilityService : IBinaryVulnerabilityServi
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
|
||||
FunctionFingerprintSet fingerprints,
|
||||
CorpusLookupOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
// Delegate to inner service - corpus lookups typically don't benefit from caching
|
||||
// due to high variance in fingerprint sets
|
||||
return await _inner.IdentifyFunctionFromCorpusAsync(fingerprints, options, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
|
||||
IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
|
||||
CorpusLookupOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
// Delegate to inner service - batch corpus lookups typically don't benefit from caching
|
||||
return await _inner.IdentifyFunctionsFromCorpusBatchAsync(functions, options, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
public async ValueTask DisposeAsync()
|
||||
{
|
||||
_connectionLock.Dispose();
|
||||
|
||||
@@ -99,6 +99,27 @@ public interface IBinaryVulnerabilityService
|
||||
string symbolName,
|
||||
DeltaSigLookupOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Identify a function by its fingerprints using the corpus database.
|
||||
/// Returns matching library functions with CVE associations.
|
||||
/// </summary>
|
||||
/// <param name="fingerprints">Function fingerprints (semantic, instruction, API call).</param>
|
||||
/// <param name="options">Corpus lookup options.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Identified functions with vulnerability associations.</returns>
|
||||
Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
|
||||
FunctionFingerprintSet fingerprints,
|
||||
CorpusLookupOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Batch identify functions from corpus for scan performance.
|
||||
/// </summary>
|
||||
Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
|
||||
IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
|
||||
CorpusLookupOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
@@ -225,3 +246,141 @@ public sealed record FixStatusResult
|
||||
/// <summary>Reference to the underlying evidence record.</summary>
|
||||
public Guid? EvidenceId { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Function fingerprint set for corpus matching.
|
||||
/// </summary>
|
||||
public sealed record FunctionFingerprintSet
|
||||
{
|
||||
/// <summary>Semantic fingerprint (IR-based).</summary>
|
||||
public byte[]? SemanticFingerprint { get; init; }
|
||||
|
||||
/// <summary>Instruction fingerprint (normalized assembly).</summary>
|
||||
public byte[]? InstructionFingerprint { get; init; }
|
||||
|
||||
/// <summary>API call sequence fingerprint.</summary>
|
||||
public byte[]? ApiCallFingerprint { get; init; }
|
||||
|
||||
/// <summary>Function name if available (may be stripped).</summary>
|
||||
public string? FunctionName { get; init; }
|
||||
|
||||
/// <summary>Architecture of the binary.</summary>
|
||||
public required string Architecture { get; init; }
|
||||
|
||||
/// <summary>Function size in bytes.</summary>
|
||||
public int? FunctionSize { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for corpus-based function identification.
|
||||
/// </summary>
|
||||
public sealed record CorpusLookupOptions
|
||||
{
|
||||
/// <summary>Minimum similarity threshold (0.0-1.0). Default 0.85.</summary>
|
||||
public decimal MinSimilarity { get; init; } = 0.85m;
|
||||
|
||||
/// <summary>Maximum candidates to return. Default 5.</summary>
|
||||
public int MaxCandidates { get; init; } = 5;
|
||||
|
||||
/// <summary>Library name filter (glibc, openssl, etc.). Null means all.</summary>
|
||||
public string? LibraryFilter { get; init; }
|
||||
|
||||
/// <summary>Whether to include CVE associations. Default true.</summary>
|
||||
public bool IncludeCveAssociations { get; init; } = true;
|
||||
|
||||
/// <summary>Whether to check fix status for matched CVEs. Default true.</summary>
|
||||
public bool CheckFixStatus { get; init; } = true;
|
||||
|
||||
/// <summary>Distro hint for fix status lookup.</summary>
|
||||
public string? DistroHint { get; init; }
|
||||
|
||||
/// <summary>Release hint for fix status lookup.</summary>
|
||||
public string? ReleaseHint { get; init; }
|
||||
|
||||
/// <summary>Prefer semantic fingerprint matching over instruction. Default true.</summary>
|
||||
public bool PreferSemanticMatch { get; init; } = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of corpus-based function identification.
|
||||
/// </summary>
|
||||
public sealed record CorpusFunctionMatch
|
||||
{
|
||||
/// <summary>Matched library name (glibc, openssl, etc.).</summary>
|
||||
public required string LibraryName { get; init; }
|
||||
|
||||
/// <summary>Library version range where this function appears.</summary>
|
||||
public required string VersionRange { get; init; }
|
||||
|
||||
/// <summary>Canonical function name.</summary>
|
||||
public required string FunctionName { get; init; }
|
||||
|
||||
/// <summary>Overall match confidence (0.0-1.0).</summary>
|
||||
public required decimal Confidence { get; init; }
|
||||
|
||||
/// <summary>Match method used (semantic, instruction, combined).</summary>
|
||||
public required CorpusMatchMethod Method { get; init; }
|
||||
|
||||
/// <summary>Semantic similarity score if available.</summary>
|
||||
public decimal? SemanticSimilarity { get; init; }
|
||||
|
||||
/// <summary>Instruction similarity score if available.</summary>
|
||||
public decimal? InstructionSimilarity { get; init; }
|
||||
|
||||
/// <summary>CVEs affecting this function (if requested).</summary>
|
||||
public ImmutableArray<CorpusCveAssociation> CveAssociations { get; init; } = [];
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Method used for corpus matching.
|
||||
/// </summary>
|
||||
public enum CorpusMatchMethod
|
||||
{
|
||||
/// <summary>Matched via semantic fingerprint (IR-based).</summary>
|
||||
Semantic,
|
||||
|
||||
/// <summary>Matched via instruction fingerprint.</summary>
|
||||
Instruction,
|
||||
|
||||
/// <summary>Matched via API call sequence.</summary>
|
||||
ApiCall,
|
||||
|
||||
/// <summary>Combined match using multiple fingerprints.</summary>
|
||||
Combined
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// CVE association from corpus for a matched function.
|
||||
/// </summary>
|
||||
public sealed record CorpusCveAssociation
|
||||
{
|
||||
/// <summary>CVE identifier.</summary>
|
||||
public required string CveId { get; init; }
|
||||
|
||||
/// <summary>Affected state for the matched version.</summary>
|
||||
public required CorpusAffectedState AffectedState { get; init; }
|
||||
|
||||
/// <summary>Version where fix was applied (if fixed).</summary>
|
||||
public string? FixedInVersion { get; init; }
|
||||
|
||||
/// <summary>Confidence in the CVE association.</summary>
|
||||
public required decimal Confidence { get; init; }
|
||||
|
||||
/// <summary>Evidence type for the association.</summary>
|
||||
public string? EvidenceType { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Affected state for corpus CVE associations.
|
||||
/// </summary>
|
||||
public enum CorpusAffectedState
|
||||
{
|
||||
/// <summary>Function is vulnerable to the CVE.</summary>
|
||||
Vulnerable,
|
||||
|
||||
/// <summary>Function has been fixed.</summary>
|
||||
Fixed,
|
||||
|
||||
/// <summary>Function is not affected by the CVE.</summary>
|
||||
NotAffected
|
||||
}
|
||||
|
||||
@@ -0,0 +1,447 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Net.Http;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Connectors;
|
||||
|
||||
/// <summary>
|
||||
/// Corpus connector for libcurl/curl library.
|
||||
/// Fetches pre-built binaries from distribution packages or official releases.
|
||||
/// </summary>
|
||||
public sealed partial class CurlCorpusConnector : ILibraryCorpusConnector
|
||||
{
|
||||
private readonly IHttpClientFactory _httpClientFactory;
|
||||
private readonly ILogger<CurlCorpusConnector> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Base URL for curl official releases.
|
||||
/// </summary>
|
||||
public const string CurlReleasesUrl = "https://curl.se/download/";
|
||||
|
||||
/// <summary>
|
||||
/// Supported architectures.
|
||||
/// </summary>
|
||||
private static readonly ImmutableArray<string> s_supportedArchitectures =
|
||||
["x86_64", "aarch64", "armhf", "i386"];
|
||||
|
||||
public CurlCorpusConnector(
|
||||
IHttpClientFactory httpClientFactory,
|
||||
ILogger<CurlCorpusConnector> logger)
|
||||
{
|
||||
_httpClientFactory = httpClientFactory;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public string LibraryName => "curl";
|
||||
|
||||
/// <inheritdoc />
|
||||
public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
|
||||
{
|
||||
var client = _httpClientFactory.CreateClient("Curl");
|
||||
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
// Fetch releases from curl.se
|
||||
try
|
||||
{
|
||||
_logger.LogDebug("Fetching curl versions from {Url}", CurlReleasesUrl);
|
||||
var html = await client.GetStringAsync(CurlReleasesUrl, ct);
|
||||
var currentVersions = ParseVersionsFromListing(html);
|
||||
foreach (var v in currentVersions)
|
||||
{
|
||||
versions.Add(v);
|
||||
}
|
||||
}
|
||||
catch (HttpRequestException ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to fetch current curl releases");
|
||||
}
|
||||
|
||||
// Also check archive
|
||||
const string archiveUrl = "https://curl.se/download/archeology/";
|
||||
try
|
||||
{
|
||||
_logger.LogDebug("Fetching old curl versions from {Url}", archiveUrl);
|
||||
var archiveHtml = await client.GetStringAsync(archiveUrl, ct);
|
||||
var archiveVersions = ParseVersionsFromListing(archiveHtml);
|
||||
foreach (var v in archiveVersions)
|
||||
{
|
||||
versions.Add(v);
|
||||
}
|
||||
}
|
||||
catch (HttpRequestException ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to fetch curl archive releases");
|
||||
}
|
||||
|
||||
_logger.LogInformation("Found {Count} curl versions", versions.Count);
|
||||
return [.. versions.OrderByDescending(ParseVersion)];
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<LibraryBinary?> FetchBinaryAsync(
|
||||
string version,
|
||||
string architecture,
|
||||
LibraryFetchOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var normalizedArch = NormalizeArchitecture(architecture);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Fetching curl {Version} for {Architecture}",
|
||||
version,
|
||||
normalizedArch);
|
||||
|
||||
// Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
|
||||
var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
|
||||
if (debBinary is not null)
|
||||
{
|
||||
_logger.LogDebug("Found curl {Version} from Debian packages", version);
|
||||
return debBinary;
|
||||
}
|
||||
|
||||
// Strategy 2: Try Alpine APK
|
||||
var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
|
||||
if (alpineBinary is not null)
|
||||
{
|
||||
_logger.LogDebug("Found curl {Version} from Alpine packages", version);
|
||||
return alpineBinary;
|
||||
}
|
||||
|
||||
_logger.LogWarning(
|
||||
"Could not find pre-built curl {Version} for {Architecture}. Source build not implemented.",
|
||||
version,
|
||||
normalizedArch);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
|
||||
IEnumerable<string> versions,
|
||||
string architecture,
|
||||
LibraryFetchOptions? options = null,
|
||||
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
|
||||
{
|
||||
foreach (var version in versions)
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
|
||||
var binary = await FetchBinaryAsync(version, architecture, options, ct);
|
||||
if (binary is not null)
|
||||
{
|
||||
yield return binary;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#region Private Methods
|
||||
|
||||
private ImmutableArray<string> ParseVersionsFromListing(string html)
|
||||
{
|
||||
// Match patterns like curl-8.5.0.tar.gz or curl-7.88.1.tar.xz
|
||||
var matches = CurlVersionRegex().Matches(html);
|
||||
|
||||
var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
|
||||
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Groups["version"].Success)
|
||||
{
|
||||
versions.Add(match.Groups["version"].Value);
|
||||
}
|
||||
}
|
||||
|
||||
return [.. versions];
|
||||
}
|
||||
|
||||
private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
|
||||
string version,
|
||||
string architecture,
|
||||
LibraryFetchOptions? options,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var client = _httpClientFactory.CreateClient("DebianPackages");
|
||||
|
||||
var debArch = MapToDebianArchitecture(architecture);
|
||||
if (debArch is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
// curl library package names:
|
||||
// libcurl4 (current), libcurl3 (older)
|
||||
var packageNames = new[] { "libcurl4", "libcurl3" };
|
||||
|
||||
foreach (var packageName in packageNames)
|
||||
{
|
||||
var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);
|
||||
|
||||
foreach (var url in packageUrls)
|
||||
{
|
||||
try
|
||||
{
|
||||
_logger.LogDebug("Trying Debian curl package URL: {Url}", url);
|
||||
var packageBytes = await client.GetByteArrayAsync(url, ct);
|
||||
|
||||
var binary = await ExtractLibCurlFromDebAsync(packageBytes, version, architecture, options, ct);
|
||||
if (binary is not null)
|
||||
{
|
||||
return binary;
|
||||
}
|
||||
}
|
||||
catch (HttpRequestException ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
|
||||
string version,
|
||||
string architecture,
|
||||
LibraryFetchOptions? options,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var client = _httpClientFactory.CreateClient("AlpinePackages");
|
||||
|
||||
var alpineArch = MapToAlpineArchitecture(architecture);
|
||||
if (alpineArch is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
// Query Alpine package repository for libcurl
|
||||
var packageUrls = await FindAlpinePackageUrlsAsync(client, "libcurl", version, alpineArch, ct);
|
||||
|
||||
foreach (var url in packageUrls)
|
||||
{
|
||||
try
|
||||
{
|
||||
_logger.LogDebug("Trying Alpine curl package URL: {Url}", url);
|
||||
var packageBytes = await client.GetByteArrayAsync(url, ct);
|
||||
|
||||
var binary = await ExtractLibCurlFromApkAsync(packageBytes, version, architecture, options, ct);
|
||||
if (binary is not null)
|
||||
{
|
||||
return binary;
|
||||
}
|
||||
}
|
||||
catch (HttpRequestException ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
|
||||
HttpClient client,
|
||||
string packageName,
|
||||
string version,
|
||||
string debianArch,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";
|
||||
|
||||
try
|
||||
{
|
||||
var response = await client.GetStringAsync(apiUrl, ct);
|
||||
var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
|
||||
return urls;
|
||||
}
|
||||
catch (HttpRequestException ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
|
||||
HttpClient client,
|
||||
string packageName,
|
||||
string version,
|
||||
string alpineArch,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
|
||||
var urls = new List<string>();
|
||||
|
||||
foreach (var release in releases)
|
||||
{
|
||||
var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";
|
||||
|
||||
try
|
||||
{
|
||||
var html = await client.GetStringAsync(baseUrl, ct);
|
||||
|
||||
var matches = AlpinePackageRegex().Matches(html);
|
||||
foreach (Match match in matches)
|
||||
{
|
||||
if (match.Groups["name"].Value == packageName &&
|
||||
match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
urls.Add($"{baseUrl}{match.Groups["file"].Value}");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (HttpRequestException)
|
||||
{
|
||||
// Skip releases we can't access
|
||||
}
|
||||
}
|
||||
|
||||
return [.. urls];
|
||||
}
|
||||
|
||||
private async Task<LibraryBinary?> ExtractLibCurlFromDebAsync(
|
||||
byte[] debPackage,
|
||||
string version,
|
||||
string architecture,
|
||||
LibraryFetchOptions? options,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// .deb extraction - placeholder
|
||||
await Task.CompletedTask;
|
||||
|
||||
_logger.LogDebug(
|
||||
"Debian package extraction not fully implemented. Package size: {Size} bytes",
|
||||
debPackage.Length);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private async Task<LibraryBinary?> ExtractLibCurlFromApkAsync(
|
||||
byte[] apkPackage,
|
||||
string version,
|
||||
string architecture,
|
||||
LibraryFetchOptions? options,
|
||||
CancellationToken ct)
|
||||
{
|
||||
// .apk extraction - placeholder
|
||||
await Task.CompletedTask;
|
||||
|
||||
_logger.LogDebug(
|
||||
"Alpine package extraction not fully implemented. Package size: {Size} bytes",
|
||||
apkPackage.Length);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private static ImmutableArray<string> ExtractPackageUrlsForVersion(
|
||||
string json,
|
||||
string version,
|
||||
string debianArch)
|
||||
{
|
||||
var urls = new List<string>();
|
||||
|
||||
try
|
||||
{
|
||||
using var doc = System.Text.Json.JsonDocument.Parse(json);
|
||||
|
||||
if (doc.RootElement.TryGetProperty("result", out var results))
|
||||
{
|
||||
foreach (var item in results.EnumerateArray())
|
||||
{
|
||||
if (item.TryGetProperty("binary_version", out var binaryVersion) &&
|
||||
item.TryGetProperty("architecture", out var arch))
|
||||
{
|
||||
var binVer = binaryVersion.GetString() ?? string.Empty;
|
||||
var archStr = arch.GetString() ?? string.Empty;
|
||||
|
||||
if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
|
||||
archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
if (item.TryGetProperty("files", out var files))
|
||||
{
|
||||
foreach (var file in files.EnumerateArray())
|
||||
{
|
||||
if (file.TryGetProperty("hash", out var hashElement))
|
||||
{
|
||||
var hash = hashElement.GetString();
|
||||
if (!string.IsNullOrEmpty(hash))
|
||||
{
|
||||
urls.Add($"https://snapshot.debian.org/file/{hash}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (System.Text.Json.JsonException)
|
||||
{
|
||||
// Invalid JSON
|
||||
}
|
||||
|
||||
return [.. urls];
|
||||
}
|
||||
|
||||
private static string NormalizeArchitecture(string architecture)
|
||||
{
|
||||
return architecture.ToLowerInvariant() switch
|
||||
{
|
||||
"x86_64" or "amd64" => "x86_64",
|
||||
"aarch64" or "arm64" => "aarch64",
|
||||
"armhf" or "armv7" or "arm" => "armhf",
|
||||
"i386" or "i686" or "x86" => "i386",
|
||||
_ => architecture
|
||||
};
|
||||
}
|
||||
|
||||
private static string? MapToDebianArchitecture(string architecture)
|
||||
{
|
||||
return architecture.ToLowerInvariant() switch
|
||||
{
|
||||
"x86_64" => "amd64",
|
||||
"aarch64" => "arm64",
|
||||
"armhf" or "armv7" => "armhf",
|
||||
"i386" or "i686" => "i386",
|
||||
_ => null
|
||||
};
|
||||
}
|
||||
|
||||
private static string? MapToAlpineArchitecture(string architecture)
|
||||
{
|
||||
return architecture.ToLowerInvariant() switch
|
||||
{
|
||||
"x86_64" => "x86_64",
|
||||
"aarch64" => "aarch64",
|
||||
"armhf" or "armv7" => "armhf",
|
||||
"i386" or "i686" => "x86",
|
||||
_ => null
|
||||
};
|
||||
}
|
||||
|
||||
private static Version? ParseVersion(string versionString)
|
||||
{
|
||||
if (Version.TryParse(versionString, out var version))
|
||||
{
|
||||
return version;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Generated Regexes
|
||||
|
||||
[GeneratedRegex(@"curl-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex CurlVersionRegex();
|
||||
|
||||
[GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+(?:-r\d+)?)\.apk)""", RegexOptions.IgnoreCase)]
|
||||
private static partial Regex AlpinePackageRegex();
|
||||
|
||||
#endregion
|
||||
}
|
||||
@@ -0,0 +1,549 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Net.Http;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Http;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Connectors;
|
||||
|
||||
/// <summary>
|
||||
/// Corpus connector for GNU C Library (glibc).
|
||||
/// Fetches pre-built binaries from Debian/Ubuntu package repositories
|
||||
/// or GNU FTP mirrors for source builds.
|
||||
/// </summary>
|
||||
public sealed partial class GlibcCorpusConnector : ILibraryCorpusConnector
|
||||
{
|
||||
private readonly IHttpClientFactory _httpClientFactory;
|
||||
private readonly ILogger<GlibcCorpusConnector> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Base URL for GNU FTP mirror (source tarballs).
|
||||
/// </summary>
|
||||
public const string GnuMirrorUrl = "https://ftp.gnu.org/gnu/glibc/";
|
||||
|
||||
/// <summary>
|
||||
/// Base URL for Debian package archive.
|
||||
/// </summary>
|
||||
public const string DebianSnapshotUrl = "https://snapshot.debian.org/package/glibc/";
|
||||
|
||||
/// <summary>
|
||||
/// Supported architectures for glibc.
|
||||
/// </summary>
|
||||
private static readonly ImmutableArray<string> s_supportedArchitectures =
|
||||
["x86_64", "aarch64", "armhf", "i386", "arm64", "ppc64el", "s390x"];
|
||||
|
||||
public GlibcCorpusConnector(
|
||||
IHttpClientFactory httpClientFactory,
|
||||
ILogger<GlibcCorpusConnector> logger)
|
||||
{
|
||||
_httpClientFactory = httpClientFactory;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public string LibraryName => "glibc";
|
||||
|
||||
/// <inheritdoc />
|
||||
public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
|
||||
{
|
||||
var client = _httpClientFactory.CreateClient("GnuMirror");
|
||||
|
||||
try
|
||||
{
|
||||
_logger.LogDebug("Fetching glibc versions from {Url}", GnuMirrorUrl);
|
||||
var html = await client.GetStringAsync(GnuMirrorUrl, ct);
|
||||
|
||||
// Parse directory listing for glibc-X.Y.tar.xz files
|
||||
var versions = ParseVersionsFromListing(html);
|
||||
|
||||
_logger.LogInformation("Found {Count} glibc versions from GNU mirror", versions.Length);
|
||||
return versions;
|
||||
}
|
||||
catch (HttpRequestException ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to fetch glibc versions from GNU mirror, trying Debian snapshot");
|
||||
|
||||
// Fallback to Debian snapshot
|
||||
return await GetVersionsFromDebianSnapshotAsync(client, ct);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Tries pre-built distribution packages in preference order (Debian first,
/// then Ubuntu). Returns null when no pre-built binary is found; building
/// glibc from source is not implemented.
/// </remarks>
public async Task<LibraryBinary?> FetchBinaryAsync(
    string version,
    string architecture,
    LibraryFetchOptions? options = null,
    CancellationToken ct = default)
{
    var normalizedArch = NormalizeArchitecture(architecture);
    // Fix: removed dead local `abi` (`options?.PreferredAbi ?? "gnu"`) — it was
    // computed but never used. If ABI selection is intended here, it should be
    // forwarded to the package lookups instead.

    _logger.LogInformation(
        "Fetching glibc {Version} for {Architecture}",
        version,
        normalizedArch);

    // Strategy 1: Try Debian package (pre-built, preferred)
    var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
    if (debBinary is not null)
    {
        _logger.LogDebug("Found glibc {Version} from Debian packages", version);
        return debBinary;
    }

    // Strategy 2: Try Ubuntu package
    var ubuntuBinary = await TryFetchUbuntuPackageAsync(version, normalizedArch, options, ct);
    if (ubuntuBinary is not null)
    {
        _logger.LogDebug("Found glibc {Version} from Ubuntu packages", version);
        return ubuntuBinary;
    }

    _logger.LogWarning(
        "Could not find pre-built glibc {Version} for {Architecture}. Source build not implemented.",
        version,
        normalizedArch);

    return null;
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Fetches each requested version sequentially and streams the results;
/// versions for which no binary is found are silently skipped.
/// </remarks>
public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
    IEnumerable<string> versions,
    string architecture,
    LibraryFetchOptions? options = null,
    [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
    foreach (var version in versions)
    {
        // Honor cancellation between per-version fetches.
        ct.ThrowIfCancellationRequested();

        var binary = await FetchBinaryAsync(version, architecture, options, ct);
        if (binary is not null)
        {
            yield return binary;
        }
    }
}
|
||||
|
||||
#region Private Methods
|
||||
|
||||
/// <summary>
/// Extracts distinct glibc version strings ("X.Y" or "X.Y.Z") from a mirror
/// directory listing and returns them newest-first.
/// </summary>
private ImmutableArray<string> ParseVersionsFromListing(string html)
{
    // De-duplicate case-insensitively while scanning the tarball names.
    var unique = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (Match match in GlibcVersionRegex().Matches(html))
    {
        if (match.Groups["version"] is { Success: true } versionGroup)
        {
            unique.Add(versionGroup.Value);
        }
    }

    // ParseVersion yields null for unparseable entries, which sort together.
    return [.. unique.OrderByDescending(ParseVersion)];
}
|
||||
|
||||
/// <summary>
/// Fallback version discovery: scrapes the Debian snapshot listing for glibc
/// package versions when the primary GNU mirror is unreachable.
/// Returns an empty array on HTTP failure.
/// </summary>
private async Task<ImmutableArray<string>> GetVersionsFromDebianSnapshotAsync(
    HttpClient client,
    CancellationToken ct)
{
    try
    {
        var html = await client.GetStringAsync(DebianSnapshotUrl, ct);

        // Parse Debian snapshot listing for glibc versions
        var matches = DebianVersionRegex().Matches(html);

        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["version"].Success)
            {
                // Extract just the upstream version (before the Debian revision),
                // e.g. "2.31-13+deb11u5" -> "2.31".
                var fullVersion = match.Groups["version"].Value;
                var upstreamVersion = ExtractUpstreamVersion(fullVersion);
                if (!string.IsNullOrEmpty(upstreamVersion))
                {
                    versions.Add(upstreamVersion);
                }
            }
        }

        // Newest first; unparseable versions sort together (null keys).
        return [.. versions.OrderByDescending(ParseVersion)];
    }
    catch (HttpRequestException ex)
    {
        _logger.LogError(ex, "Failed to fetch versions from Debian snapshot");
        return [];
    }
}
|
||||
|
||||
/// <summary>
/// Attempts to obtain a glibc binary from Debian's package archives.
/// Returns null when the architecture is unsupported, no candidate URL is
/// found, or every candidate download/extraction fails.
/// </summary>
private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
    string version,
    string architecture,
    LibraryFetchOptions? options,
    CancellationToken ct)
{
    var client = _httpClientFactory.CreateClient("DebianPackages");

    // Map architecture to Debian naming (e.g. x86_64 -> amd64)
    var debArch = MapToDebianArchitecture(architecture);
    if (debArch is null)
    {
        _logger.LogDebug("Architecture {Arch} not supported for Debian packages", architecture);
        return null;
    }

    // Query Debian snapshot for matching package
    var packageUrls = await FindDebianPackageUrlsAsync(client, version, debArch, ct);

    // Try each candidate in order; per-URL HTTP failures are non-fatal so the
    // next mirror/snapshot entry can be attempted.
    foreach (var url in packageUrls)
    {
        try
        {
            _logger.LogDebug("Trying Debian package URL: {Url}", url);
            var packageBytes = await client.GetByteArrayAsync(url, ct);

            // Extract the libc6 shared library from the .deb package
            var binary = await ExtractLibcFromDebAsync(packageBytes, version, architecture, options, ct);
            if (binary is not null)
            {
                return binary;
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
        }
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Attempts to obtain a glibc binary from Ubuntu (Launchpad) packages.
/// Mirrors the Debian strategy: map the architecture, enumerate candidate
/// URLs, then download and extract until one succeeds. Returns null on failure.
/// </summary>
private async Task<LibraryBinary?> TryFetchUbuntuPackageAsync(
    string version,
    string architecture,
    LibraryFetchOptions? options,
    CancellationToken ct)
{
    var client = _httpClientFactory.CreateClient("UbuntuPackages");

    // Map architecture to Ubuntu naming (same as Debian)
    var debArch = MapToDebianArchitecture(architecture);
    if (debArch is null)
    {
        return null;
    }

    // Query Launchpad for matching package
    var packageUrls = await FindUbuntuPackageUrlsAsync(client, version, debArch, ct);

    foreach (var url in packageUrls)
    {
        try
        {
            _logger.LogDebug("Trying Ubuntu package URL: {Url}", url);
            var packageBytes = await client.GetByteArrayAsync(url, ct);

            // Extract the libc6 shared library from the .deb package
            var binary = await ExtractLibcFromDebAsync(packageBytes, version, architecture, options, ct);
            if (binary is not null)
            {
                return binary;
            }
        }
        catch (HttpRequestException ex)
        {
            // Non-fatal: fall through to the next candidate URL.
            _logger.LogDebug(ex, "Failed to download Ubuntu package from {Url}", url);
        }
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Resolves candidate download URLs for the libc6 package on the Debian
/// snapshot service; falls back to a packages.debian.org search when the
/// machine-readable API request fails.
/// </summary>
private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
    HttpClient client,
    string version,
    string debianArch,
    CancellationToken ct)
{
    // Construct Debian snapshot API URL
    // Format: https://snapshot.debian.org/mr/package/glibc/<version>/binfiles/libc6/<arch>
    var apiUrl = $"https://snapshot.debian.org/mr/package/glibc/{version}/binfiles/libc6/{debianArch}";

    try
    {
        var response = await client.GetStringAsync(apiUrl, ct);

        // Parse JSON response to get file hashes and construct download URLs
        // Simplified: extract URLs from response
        var urls = ExtractPackageUrlsFromSnapshotResponse(response);
        return urls;
    }
    catch (HttpRequestException)
    {
        // Try alternative: direct binary package search
        return await FindDebianPackageUrlsViaSearchAsync(client, version, debianArch, ct);
    }
}
|
||||
|
||||
/// <summary>
/// Last-resort URL discovery: scrapes the packages.debian.org HTML search
/// results for libc6 .deb links matching the requested version/architecture.
/// Returns an empty array on HTTP failure.
/// </summary>
private async Task<ImmutableArray<string>> FindDebianPackageUrlsViaSearchAsync(
    HttpClient client,
    string version,
    string debianArch,
    CancellationToken ct)
{
    // Fallback: search packages.debian.org
    var searchUrl = $"https://packages.debian.org/search?keywords=libc6&searchon=names&suite=all&section=all&arch={debianArch}";

    try
    {
        var html = await client.GetStringAsync(searchUrl, ct);

        // Parse search results to find matching version
        var urls = ParseDebianSearchResults(html, version, debianArch);
        return urls;
    }
    catch (HttpRequestException ex)
    {
        _logger.LogDebug(ex, "Debian package search failed");
        return [];
    }
}
|
||||
|
||||
/// <summary>
/// Queries the Launchpad API for published libc6 binaries matching the
/// requested version and architecture. Returns an empty array on HTTP failure.
/// </summary>
private async Task<ImmutableArray<string>> FindUbuntuPackageUrlsAsync(
    HttpClient client,
    string version,
    string debianArch,
    CancellationToken ct)
{
    // Query Launchpad for libc6 package
    // Format: https://launchpad.net/ubuntu/+archive/primary/+files/libc6_<version>_<arch>.deb
    // NOTE(review): Launchpad's distro_arch_series links are normally of the
    // form .../ubuntu/<series>/<arch>; passing only the architecture here may
    // never match — verify against the Launchpad API before relying on this.
    var launchpadApiUrl = $"https://api.launchpad.net/1.0/ubuntu/+archive/primary?ws.op=getPublishedBinaries&binary_name=libc6&version={version}&distro_arch_series=https://api.launchpad.net/1.0/ubuntu/+distroarchseries/{debianArch}";

    try
    {
        var response = await client.GetStringAsync(launchpadApiUrl, ct);
        var urls = ExtractPackageUrlsFromLaunchpadResponse(response);
        return urls;
    }
    catch (HttpRequestException ex)
    {
        _logger.LogDebug(ex, "Launchpad API query failed");
        return [];
    }
}
|
||||
|
||||
/// <summary>
/// Extracts the libc shared object from a downloaded .deb package.
/// Currently a placeholder: logs the package size and returns null until
/// proper ar + tar.xz extraction is implemented.
/// </summary>
private async Task<LibraryBinary?> ExtractLibcFromDebAsync(
    byte[] debPackage,
    string version,
    string architecture,
    LibraryFetchOptions? options,
    CancellationToken ct)
{
    // .deb files are ar archives containing:
    // - debian-binary (version string)
    // - control.tar.xz (package metadata)
    // - data.tar.xz (actual files)
    //
    // We need to extract /lib/x86_64-linux-gnu/libc.so.6 from data.tar.xz

    try
    {
        // Use SharpCompress or similar to extract (placeholder for now)
        // In production, implement proper ar + tar.xz extraction

        await Task.CompletedTask; // Placeholder for async extraction

        // For now, return null - full extraction requires SharpCompress/libarchive
        _logger.LogDebug(
            "Debian package extraction not fully implemented. Package size: {Size} bytes",
            debPackage.Length);

        return null;
    }
    catch (Exception ex) when (ex is not OperationCanceledException)
    {
        // Fix: the previous bare `catch (Exception)` also swallowed
        // OperationCanceledException, turning caller-requested cancellation
        // into a logged warning and a null result. The filter lets
        // cancellation propagate while still containing extraction failures.
        _logger.LogWarning(ex, "Failed to extract libc from .deb package");
        return null;
    }
}
|
||||
|
||||
/// <summary>
/// Canonicalizes common architecture aliases to this connector's preferred
/// spelling (e.g. "amd64" -> "x86_64"). Unrecognized values are returned
/// unchanged, preserving their original casing.
/// </summary>
private static string NormalizeArchitecture(string architecture)
{
    var alias = architecture.ToLowerInvariant();

    if (alias is "x86_64" or "amd64")
    {
        return "x86_64";
    }
    if (alias is "aarch64" or "arm64")
    {
        return "aarch64";
    }
    if (alias is "armhf" or "armv7" or "arm")
    {
        return "armhf";
    }
    if (alias is "i386" or "i686" or "x86")
    {
        return "i386";
    }
    if (alias is "ppc64le" or "ppc64el")
    {
        return "ppc64el";
    }
    if (alias is "s390x")
    {
        return "s390x";
    }

    // Unknown: pass the caller's value through untouched.
    return architecture;
}
|
||||
|
||||
/// <summary>
/// Translates a canonical architecture name into Debian's package-archive
/// naming (e.g. "x86_64" -> "amd64"). Returns null for architectures that
/// Debian packaging does not cover here.
/// </summary>
private static string? MapToDebianArchitecture(string architecture)
{
    switch (architecture.ToLowerInvariant())
    {
        case "x86_64":
            return "amd64";
        case "aarch64":
            return "arm64";
        case "armhf":
        case "armv7":
            return "armhf";
        case "i386":
        case "i686":
            return "i386";
        case "ppc64el":
            return "ppc64el";
        case "s390x":
            return "s390x";
        default:
            // Unsupported: callers treat null as "skip Debian lookup".
            return null;
    }
}
|
||||
|
||||
/// <summary>
/// Pulls the upstream version out of a Debian version string.
/// Debian format is [epoch:]upstream_version[-debian_revision], e.g.
/// "2.31-13+deb11u5" -> "2.31" and "1:2.35-0ubuntu3" -> "2.35".
/// Returns null when no upstream version can be recognized.
/// </summary>
private static string? ExtractUpstreamVersion(string debianVersion)
{
    var match = UpstreamVersionRegex().Match(debianVersion);
    if (!match.Success)
    {
        return null;
    }

    return match.Groups["upstream"].Value;
}
|
||||
|
||||
/// <summary>
/// Converts a snapshot.debian.org "binfiles" JSON response into direct
/// download URLs. The response shape is
/// {"result": [{"hash": "...", "name": "libc6_2.31-13_amd64.deb"}]},
/// and each hash is fetchable at https://snapshot.debian.org/file/&lt;hash&gt;.
/// Malformed JSON yields an empty array.
/// </summary>
private static ImmutableArray<string> ExtractPackageUrlsFromSnapshotResponse(string json)
{
    var builder = ImmutableArray.CreateBuilder<string>();

    try
    {
        using var doc = System.Text.Json.JsonDocument.Parse(json);

        if (doc.RootElement.TryGetProperty("result", out var results))
        {
            foreach (var entry in results.EnumerateArray())
            {
                if (entry.TryGetProperty("hash", out var hashElement) &&
                    hashElement.GetString() is { Length: > 0 } hash)
                {
                    // Content-addressed download URL derived from the file hash.
                    builder.Add($"https://snapshot.debian.org/file/{hash}");
                }
            }
        }
    }
    catch (System.Text.Json.JsonException)
    {
        // Invalid JSON: treat as "no packages found".
    }

    return builder.ToImmutable();
}
|
||||
|
||||
/// <summary>
/// Extracts candidate URLs from a Launchpad getPublishedBinaries JSON
/// response ({"entries":[...]}), keeping entries that carry both a
/// binary_package_version and a non-empty self_link.
/// NOTE(review): self_link is the Launchpad API resource URL, not a direct
/// .deb download URL — kept as-is to preserve existing behavior; confirm that
/// callers resolve it before downloading.
/// Malformed JSON yields an empty array.
/// </summary>
private static ImmutableArray<string> ExtractPackageUrlsFromLaunchpadResponse(string json)
{
    var builder = ImmutableArray.CreateBuilder<string>();

    try
    {
        using var doc = System.Text.Json.JsonDocument.Parse(json);

        if (doc.RootElement.TryGetProperty("entries", out var entries))
        {
            foreach (var entry in entries.EnumerateArray())
            {
                // Both properties must be present; the version value itself
                // is only used as an existence gate here.
                if (!entry.TryGetProperty("binary_package_version", out _) ||
                    !entry.TryGetProperty("self_link", out var selfLink))
                {
                    continue;
                }

                if (selfLink.GetString() is { Length: > 0 } link)
                {
                    builder.Add(link);
                }
            }
        }
    }
    catch (System.Text.Json.JsonException)
    {
        // Invalid JSON: return what we have (empty).
    }

    return builder.ToImmutable();
}
|
||||
|
||||
/// <summary>
/// Scrapes a packages.debian.org search-results page for .deb hyperlinks,
/// keeping only URLs that mention both the requested version and Debian
/// architecture. This is a simplified substring-based filter.
/// </summary>
private static ImmutableArray<string> ParseDebianSearchResults(
    string html,
    string version,
    string debianArch)
{
    var builder = ImmutableArray.CreateBuilder<string>();

    foreach (Match match in DebianPackageUrlRegex().Matches(html))
    {
        var urlGroup = match.Groups["url"];
        if (!urlGroup.Success)
        {
            continue;
        }

        var candidate = urlGroup.Value;
        if (candidate.Contains(version) && candidate.Contains(debianArch))
        {
            builder.Add(candidate);
        }
    }

    return builder.ToImmutable();
}
|
||||
|
||||
/// <summary>
/// Parses a version string into <see cref="Version"/>, tolerating the
/// two-component form glibc uses ("2.31"); a ".0" is appended as a retry so
/// single-component strings like "2" also parse. Returns null when neither
/// attempt succeeds.
/// </summary>
private static Version? ParseVersion(string versionString)
{
    // Try the raw string first, then the ".0"-padded form.
    foreach (var candidate in new[] { versionString, versionString + ".0" })
    {
        if (Version.TryParse(candidate, out var parsed))
        {
            return parsed;
        }
    }

    return null;
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Generated Regexes

// Matches release tarball names such as "glibc-2.38" or "glibc-2.38.1".
[GeneratedRegex(@"glibc-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
private static partial Regex GlibcVersionRegex();

// Loose numeric matcher for the Debian snapshot listing ("2.31", "2.31-13").
// NOTE(review): this will also match unrelated numbers elsewhere in the page;
// results are filtered downstream via ExtractUpstreamVersion.
[GeneratedRegex(@"(?<version>\d+\.\d+(?:\.\d+)?(?:-\d+)?)", RegexOptions.IgnoreCase)]
private static partial Regex DebianVersionRegex();

// Pulls the upstream component out of "[epoch:]upstream[-revision]",
// e.g. "1:2.35-0ubuntu3" -> "2.35".
[GeneratedRegex(@"(?:^|\:)?(?<upstream>\d+\.\d+(?:\.\d+)?)(?:-|$)", RegexOptions.IgnoreCase)]
private static partial Regex UpstreamVersionRegex();

// Absolute hrefs ending in ".deb" in packages.debian.org search results.
[GeneratedRegex(@"href=""(?<url>https?://[^""]+\.deb)""", RegexOptions.IgnoreCase)]
private static partial Regex DebianPackageUrlRegex();

#endregion
|
||||
}
|
||||
@@ -0,0 +1,554 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Net.Http;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Connectors;
|
||||
|
||||
/// <summary>
/// Corpus connector for OpenSSL libraries.
/// Fetches pre-built binaries from distribution packages or official releases.
/// </summary>
public sealed partial class OpenSslCorpusConnector : ILibraryCorpusConnector
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ILogger<OpenSslCorpusConnector> _logger;

    /// <summary>
    /// Base URL for OpenSSL official releases.
    /// </summary>
    public const string OpenSslReleasesUrl = "https://www.openssl.org/source/";

    /// <summary>
    /// Base URL for OpenSSL old releases.
    /// </summary>
    public const string OpenSslOldReleasesUrl = "https://www.openssl.org/source/old/";

    /// <summary>
    /// Supported architectures.
    /// </summary>
    private static readonly ImmutableArray<string> s_supportedArchitectures =
        ["x86_64", "aarch64", "armhf", "i386"];

    /// <summary>
    /// Creates the connector with its HTTP client factory and logger.
    /// </summary>
    public OpenSslCorpusConnector(
        IHttpClientFactory httpClientFactory,
        ILogger<OpenSslCorpusConnector> logger)
    {
        _httpClientFactory = httpClientFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public string LibraryName => "openssl";

    /// <inheritdoc />
    public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;

    /// <inheritdoc />
    /// <remarks>
    /// Combines the current releases page with the "old" releases tree
    /// (one sub-directory per minor series); each source failing is non-fatal.
    /// </remarks>
    public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient("OpenSsl");
        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Fetch current releases
        try
        {
            _logger.LogDebug("Fetching OpenSSL versions from {Url}", OpenSslReleasesUrl);
            var html = await client.GetStringAsync(OpenSslReleasesUrl, ct);
            var currentVersions = ParseVersionsFromListing(html);
            foreach (var v in currentVersions)
            {
                versions.Add(v);
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch current OpenSSL releases");
        }

        // Fetch old releases index
        try
        {
            _logger.LogDebug("Fetching old OpenSSL versions from {Url}", OpenSslOldReleasesUrl);
            var oldHtml = await client.GetStringAsync(OpenSslOldReleasesUrl, ct);
            var oldVersionDirs = ParseOldVersionDirectories(oldHtml);

            // Each old-release directory (e.g. "1.1.1/") holds the tarballs
            // for that series; scan each one for concrete versions.
            foreach (var dir in oldVersionDirs)
            {
                var dirUrl = $"{OpenSslOldReleasesUrl}{dir}/";
                try
                {
                    var dirHtml = await client.GetStringAsync(dirUrl, ct);
                    var dirVersions = ParseVersionsFromListing(dirHtml);
                    foreach (var v in dirVersions)
                    {
                        versions.Add(v);
                    }
                }
                catch (HttpRequestException)
                {
                    // Skip directories we can't access
                }
            }
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch old OpenSSL releases");
        }

        _logger.LogInformation("Found {Count} OpenSSL versions", versions.Count);
        // Newest first; ParseVersion drops letter suffixes ("1.1.1n" -> 1.1.1),
        // so same-series patch letters sort together.
        return [.. versions.OrderByDescending(ParseVersion)];
    }

    /// <inheritdoc />
    /// <remarks>
    /// Tries pre-built distribution packages in preference order (Debian,
    /// then Alpine); returns null when none is found — source builds are not
    /// implemented.
    /// </remarks>
    public async Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default)
    {
        var normalizedArch = NormalizeArchitecture(architecture);

        _logger.LogInformation(
            "Fetching OpenSSL {Version} for {Architecture}",
            version,
            normalizedArch);

        // Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
        var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
        if (debBinary is not null)
        {
            _logger.LogDebug("Found OpenSSL {Version} from Debian packages", version);
            return debBinary;
        }

        // Strategy 2: Try Alpine APK
        var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
        if (alpineBinary is not null)
        {
            _logger.LogDebug("Found OpenSSL {Version} from Alpine packages", version);
            return alpineBinary;
        }

        _logger.LogWarning(
            "Could not find pre-built OpenSSL {Version} for {Architecture}. Source build not implemented.",
            version,
            normalizedArch);

        return null;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Sequentially fetches each requested version, silently skipping those
    /// for which no binary could be located.
    /// </remarks>
    public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        foreach (var version in versions)
        {
            // Honor cancellation between per-version fetches.
            ct.ThrowIfCancellationRequested();

            var binary = await FetchBinaryAsync(version, architecture, options, ct);
            if (binary is not null)
            {
                yield return binary;
            }
        }
    }

    #region Private Methods

    /// <summary>
    /// Extracts distinct OpenSSL version strings (e.g. "1.1.1n", "3.0.8")
    /// from a releases-page directory listing. Order is not significant here;
    /// the caller sorts the combined set.
    /// </summary>
    private ImmutableArray<string> ParseVersionsFromListing(string html)
    {
        // Match patterns like openssl-1.1.1n.tar.gz or openssl-3.0.8.tar.gz
        var matches = OpenSslVersionRegex().Matches(html);

        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["version"].Success)
            {
                var version = match.Groups["version"].Value;
                // Normalize version: 1.1.1n -> 1.1.1n, 3.0.8 -> 3.0.8
                versions.Add(version);
            }
        }

        return [.. versions];
    }

    /// <summary>
    /// Extracts series sub-directory names (e.g. "1.1.1", "3.0") from the
    /// old-releases index page.
    /// </summary>
    private ImmutableArray<string> ParseOldVersionDirectories(string html)
    {
        // Match directory names like 1.0.2/, 1.1.0/, 1.1.1/, 3.0/
        var matches = VersionDirRegex().Matches(html);

        var dirs = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in matches)
        {
            if (match.Groups["dir"].Success)
            {
                dirs.Add(match.Groups["dir"].Value);
            }
        }

        return [.. dirs];
    }

    /// <summary>
    /// Attempts to obtain an OpenSSL binary from Debian packages. Returns
    /// null when the architecture is unsupported or every candidate fails.
    /// </summary>
    private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("DebianPackages");

        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            return null;
        }

        // Determine package name based on version
        // OpenSSL 1.x -> libssl1.1
        // OpenSSL 3.x -> libssl3
        var packageName = GetDebianPackageName(version);

        // Query Debian snapshot for matching package
        var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);

        // Per-URL download failures are non-fatal; try the next candidate.
        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Debian OpenSSL package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libssl.so.X from the .deb package
                var binary = await ExtractLibSslFromDebAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
            }
        }

        return null;
    }

    /// <summary>
    /// Attempts to obtain an OpenSSL binary from Alpine's package CDN.
    /// NOTE(review): the package name is hard-coded to "libssl3", so 1.x
    /// versions will never match an Alpine package here — confirm intent.
    /// </summary>
    private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("AlpinePackages");

        var alpineArch = MapToAlpineArchitecture(architecture);
        if (alpineArch is null)
        {
            return null;
        }

        // Query Alpine package repository
        var packageUrls = await FindAlpinePackageUrlsAsync(client, "libssl3", version, alpineArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Alpine OpenSSL package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libssl.so.X from the .apk package
                var binary = await ExtractLibSslFromApkAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
            }
        }

        return null;
    }

    /// <summary>
    /// Queries the snapshot.debian.org machine-readable API for all published
    /// binaries of the given package, then filters them to the requested
    /// version/architecture. Returns an empty array on HTTP failure.
    /// </summary>
    private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        // Map OpenSSL version to Debian source package version
        // e.g., 1.1.1n -> libssl1.1_1.1.1n-0+deb11u4
        var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";

        try
        {
            var response = await client.GetStringAsync(apiUrl, ct);

            // Parse JSON response to find matching versions
            var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
            return [];
        }
    }

    /// <summary>
    /// Scans recent Alpine release branches for an .apk whose package name
    /// matches exactly and whose version starts with the requested version.
    /// Inaccessible branches are skipped silently.
    /// </summary>
    private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string alpineArch,
        CancellationToken ct)
    {
        // Alpine uses different repository structure
        // https://dl-cdn.alpinelinux.org/alpine/v3.18/main/x86_64/libssl3-3.1.1-r1.apk
        var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
        var urls = new List<string>();

        foreach (var release in releases)
        {
            var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";

            try
            {
                var html = await client.GetStringAsync(baseUrl, ct);

                // Find package URLs matching version
                var matches = AlpinePackageRegex().Matches(html);
                foreach (Match match in matches)
                {
                    if (match.Groups["name"].Value == packageName &&
                        match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
                    {
                        urls.Add($"{baseUrl}{match.Groups["file"].Value}");
                    }
                }
            }
            catch (HttpRequestException)
            {
                // Skip releases we can't access
            }
        }

        return [.. urls];
    }

    /// <summary>
    /// Extracts libssl from a downloaded .deb package.
    /// Currently a placeholder: logs the package size and returns null until
    /// proper ar + tar.xz extraction is implemented.
    /// </summary>
    private async Task<LibraryBinary?> ExtractLibSslFromDebAsync(
        byte[] debPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .deb extraction - placeholder for now
        // In production, implement proper ar + tar.xz extraction

        await Task.CompletedTask;

        _logger.LogDebug(
            "Debian package extraction not fully implemented. Package size: {Size} bytes",
            debPackage.Length);

        return null;
    }

    /// <summary>
    /// Extracts libssl from a downloaded Alpine .apk package.
    /// Currently a placeholder: logs the package size and returns null until
    /// proper tar.gz extraction is implemented.
    /// </summary>
    private async Task<LibraryBinary?> ExtractLibSslFromApkAsync(
        byte[] apkPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .apk files are gzip-compressed tar archives
        // In production, implement proper tar.gz extraction

        await Task.CompletedTask;

        _logger.LogDebug(
            "Alpine package extraction not fully implemented. Package size: {Size} bytes",
            apkPackage.Length);

        return null;
    }

    /// <summary>
    /// Maps an OpenSSL version to the Debian binary package that ships its
    /// shared library. Anything that is not a 1.0.x or 1.1.x version falls
    /// through to libssl3.
    /// </summary>
    private static string GetDebianPackageName(string version)
    {
        // OpenSSL 1.0.x -> libssl1.0.0
        // OpenSSL 1.1.x -> libssl1.1
        // OpenSSL 3.x -> libssl3
        if (version.StartsWith("1.0", StringComparison.OrdinalIgnoreCase))
        {
            return "libssl1.0.0";
        }
        else if (version.StartsWith("1.1", StringComparison.OrdinalIgnoreCase))
        {
            return "libssl1.1";
        }
        else
        {
            return "libssl3";
        }
    }

    /// <summary>
    /// Filters a snapshot.debian.org binary-listing JSON response down to
    /// download URLs whose binary_version contains the requested version and
    /// whose architecture matches exactly. Malformed JSON yields empty.
    /// </summary>
    private static ImmutableArray<string> ExtractPackageUrlsForVersion(
        string json,
        string version,
        string debianArch)
    {
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("result", out var results))
            {
                foreach (var item in results.EnumerateArray())
                {
                    if (item.TryGetProperty("binary_version", out var binaryVersion) &&
                        item.TryGetProperty("architecture", out var arch))
                    {
                        var binVer = binaryVersion.GetString() ?? string.Empty;
                        var archStr = arch.GetString() ?? string.Empty;

                        // Check if version matches and architecture matches
                        if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
                            archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
                        {
                            if (item.TryGetProperty("files", out var files))
                            {
                                foreach (var file in files.EnumerateArray())
                                {
                                    if (file.TryGetProperty("hash", out var hashElement))
                                    {
                                        var hash = hashElement.GetString();
                                        if (!string.IsNullOrEmpty(hash))
                                        {
                                            // Content-addressed download URL.
                                            urls.Add($"https://snapshot.debian.org/file/{hash}");
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON
        }

        return [.. urls];
    }

    /// <summary>
    /// Canonicalizes architecture aliases to this connector's preferred
    /// spelling; unknown values pass through unchanged.
    /// </summary>
    private static string NormalizeArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" or "amd64" => "x86_64",
            "aarch64" or "arm64" => "aarch64",
            "armhf" or "armv7" or "arm" => "armhf",
            "i386" or "i686" or "x86" => "i386",
            _ => architecture
        };
    }

    /// <summary>
    /// Translates a canonical architecture into Debian's naming; null means
    /// "not available as a Debian package here".
    /// </summary>
    private static string? MapToDebianArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "amd64",
            "aarch64" => "arm64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "i386",
            _ => null
        };
    }

    /// <summary>
    /// Translates a canonical architecture into Alpine's naming (note: 32-bit
    /// x86 is "x86" on Alpine); null means unsupported.
    /// </summary>
    private static string? MapToAlpineArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "x86_64",
            "aarch64" => "aarch64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "x86",
            _ => null
        };
    }

    /// <summary>
    /// Parses an OpenSSL version for sorting. Letter suffixes are stripped
    /// first ("1.1.1n" -> "1.1.1"), so patch letters within one series
    /// compare equal. Returns null for unparseable input.
    /// </summary>
    private static Version? ParseVersion(string versionString)
    {
        // OpenSSL versions can be like 1.1.1n or 3.0.8
        // Extract numeric parts only
        var numericPart = ExtractNumericVersion(versionString);
        if (Version.TryParse(numericPart, out var version))
        {
            return version;
        }
        return null;
    }

    /// <summary>
    /// Strips a trailing letter suffix from an OpenSSL version string,
    /// keeping only the leading digits-and-dots prefix:
    /// "1.1.1n" -> "1.1.1", "3.0.8" -> "3.0.8".
    /// </summary>
    private static string ExtractNumericVersion(string version)
    {
        // 1.1.1n -> 1.1.1
        // 3.0.8 -> 3.0.8
        var parts = new List<string>();
        foreach (var ch in version)
        {
            if (char.IsDigit(ch) || ch == '.')
            {
                if (parts.Count == 0)
                {
                    parts.Add(ch.ToString());
                }
                else if (ch == '.')
                {
                    parts.Add(".");
                }
                else
                {
                    parts[^1] += ch;
                }
            }
            else if (parts.Count > 0 && parts[^1] != ".")
            {
                // Stop at first non-digit after version starts
                break;
            }
        }
        return string.Join("", parts).TrimEnd('.');
    }

    #endregion

    #region Generated Regexes

    // Matches tarball names like "openssl-1.1.1n" / "openssl-3.0.8"
    // (three numeric components plus an optional patch letter).
    [GeneratedRegex(@"openssl-(?<version>\d+\.\d+\.\d+[a-z]?)", RegexOptions.IgnoreCase)]
    private static partial Regex OpenSslVersionRegex();

    // Matches series sub-directory links like href="1.1.1/" on the old-releases page.
    [GeneratedRegex(@"href=""(?<dir>\d+\.\d+(?:\.\d+)?)/""", RegexOptions.IgnoreCase)]
    private static partial Regex VersionDirRegex();

    // Matches Alpine package links like href="libssl3-3.1.1-r1.apk",
    // capturing the full filename, package name, and versioned suffix.
    [GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+[a-z]?-r\d+)\.apk)""", RegexOptions.IgnoreCase)]
    private static partial Regex AlpinePackageRegex();

    #endregion
}
|
||||
@@ -0,0 +1,452 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Net.Http;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Connectors;
|
||||
|
||||
/// <summary>
/// Corpus connector for the zlib compression library.
/// Fetches pre-built binaries from distribution packages (Debian, Alpine);
/// building from source is not implemented.
/// </summary>
public sealed partial class ZlibCorpusConnector : ILibraryCorpusConnector
{
    private readonly IHttpClientFactory _httpClientFactory;
    private readonly ILogger<ZlibCorpusConnector> _logger;

    /// <summary>
    /// Base URL for zlib official releases.
    /// </summary>
    public const string ZlibReleasesUrl = "https://www.zlib.net/";

    /// <summary>
    /// Base URL for zlib fossils/old releases.
    /// </summary>
    public const string ZlibFossilsUrl = "https://www.zlib.net/fossils/";

    /// <summary>
    /// Supported architectures (normalized names, see <see cref="NormalizeArchitecture"/>).
    /// </summary>
    private static readonly ImmutableArray<string> s_supportedArchitectures =
        ["x86_64", "aarch64", "armhf", "i386"];

    public ZlibCorpusConnector(
        IHttpClientFactory httpClientFactory,
        ILogger<ZlibCorpusConnector> logger)
    {
        _httpClientFactory = httpClientFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public string LibraryName => "zlib";

    /// <inheritdoc />
    public ImmutableArray<string> SupportedArchitectures => s_supportedArchitectures;

    /// <inheritdoc />
    public async Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default)
    {
        var client = _httpClientFactory.CreateClient("Zlib");
        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Current releases and historical "fossils" live on separate pages.
        // A failure on either page is non-fatal so partial results can still be returned.
        try
        {
            _logger.LogDebug("Fetching zlib versions from {Url}", ZlibReleasesUrl);
            var html = await client.GetStringAsync(ZlibReleasesUrl, ct);
            versions.UnionWith(ParseVersionsFromListing(html));
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch current zlib releases");
        }

        try
        {
            _logger.LogDebug("Fetching old zlib versions from {Url}", ZlibFossilsUrl);
            var fossilsHtml = await client.GetStringAsync(ZlibFossilsUrl, ct);
            versions.UnionWith(ParseVersionsFromListing(fossilsHtml));
        }
        catch (HttpRequestException ex)
        {
            _logger.LogWarning(ex, "Failed to fetch old zlib releases");
        }

        _logger.LogInformation("Found {Count} zlib versions", versions.Count);

        // Unparseable versions yield null keys and sort to the end of the descending order.
        return [.. versions.OrderByDescending(ParseVersion)];
    }

    /// <inheritdoc />
    public async Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default)
    {
        var normalizedArch = NormalizeArchitecture(architecture);

        _logger.LogInformation(
            "Fetching zlib {Version} for {Architecture}",
            version,
            normalizedArch);

        // Strategy 1: Try Debian/Ubuntu package (pre-built, preferred)
        var debBinary = await TryFetchDebianPackageAsync(version, normalizedArch, options, ct);
        if (debBinary is not null)
        {
            _logger.LogDebug("Found zlib {Version} from Debian packages", version);
            return debBinary;
        }

        // Strategy 2: Try Alpine APK
        var alpineBinary = await TryFetchAlpinePackageAsync(version, normalizedArch, options, ct);
        if (alpineBinary is not null)
        {
            _logger.LogDebug("Found zlib {Version} from Alpine packages", version);
            return alpineBinary;
        }

        _logger.LogWarning(
            "Could not find pre-built zlib {Version} for {Architecture}. Source build not implemented.",
            version,
            normalizedArch);

        return null;
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            // Versions without a pre-built binary are skipped rather than yielded as null.
            var binary = await FetchBinaryAsync(version, architecture, options, ct);
            if (binary is not null)
            {
                yield return binary;
            }
        }
    }

    #region Private Methods

    /// <summary>
    /// Extracts distinct version strings from an HTML directory listing by
    /// matching archive names like zlib-1.2.13.tar.gz or zlib-1.3.1.tar.xz.
    /// </summary>
    private static ImmutableArray<string> ParseVersionsFromListing(string html)
    {
        var versions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (Match match in ZlibVersionRegex().Matches(html))
        {
            if (match.Groups["version"].Success)
            {
                versions.Add(match.Groups["version"].Value);
            }
        }

        return [.. versions];
    }

    /// <summary>
    /// Attempts to download and extract libz from a Debian package for the given version.
    /// Returns null when no matching package is found or extraction fails.
    /// </summary>
    private async Task<LibraryBinary?> TryFetchDebianPackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("DebianPackages");

        var debArch = MapToDebianArchitecture(architecture);
        if (debArch is null)
        {
            return null;
        }

        // zlib package name is zlib1g
        const string packageName = "zlib1g";

        // Query Debian snapshot for matching package
        var packageUrls = await FindDebianPackageUrlsAsync(client, packageName, version, debArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Debian zlib package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libz.so.1 from the .deb package
                var binary = await ExtractLibZFromDebAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                // Try the next candidate URL; failures here are expected for stale snapshots.
                _logger.LogDebug(ex, "Failed to download Debian package from {Url}", url);
            }
        }

        return null;
    }

    /// <summary>
    /// Attempts to download and extract libz from an Alpine APK for the given version.
    /// Returns null when no matching package is found or extraction fails.
    /// </summary>
    private async Task<LibraryBinary?> TryFetchAlpinePackageAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        var client = _httpClientFactory.CreateClient("AlpinePackages");

        var alpineArch = MapToAlpineArchitecture(architecture);
        if (alpineArch is null)
        {
            return null;
        }

        // Query Alpine package repository for zlib
        var packageUrls = await FindAlpinePackageUrlsAsync(client, "zlib", version, alpineArch, ct);

        foreach (var url in packageUrls)
        {
            try
            {
                _logger.LogDebug("Trying Alpine zlib package URL: {Url}", url);
                var packageBytes = await client.GetByteArrayAsync(url, ct);

                // Extract libz.so.1 from the .apk package
                var binary = await ExtractLibZFromApkAsync(packageBytes, version, architecture, options, ct);
                if (binary is not null)
                {
                    return binary;
                }
            }
            catch (HttpRequestException ex)
            {
                _logger.LogDebug(ex, "Failed to download Alpine package from {Url}", url);
            }
        }

        return null;
    }

    /// <summary>
    /// Queries the snapshot.debian.org machine-readable API for download URLs
    /// of the given binary package, filtered by version and architecture.
    /// </summary>
    private async Task<ImmutableArray<string>> FindDebianPackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string debianArch,
        CancellationToken ct)
    {
        var apiUrl = $"https://snapshot.debian.org/mr/binary/{packageName}/";

        try
        {
            var response = await client.GetStringAsync(apiUrl, ct);
            var urls = ExtractPackageUrlsForVersion(response, version, debianArch);
            return urls;
        }
        catch (HttpRequestException ex)
        {
            _logger.LogDebug(ex, "Debian snapshot API query failed for {Package}", packageName);
            return [];
        }
    }

    /// <summary>
    /// Scans several recent Alpine release indexes for .apk links whose name and
    /// version prefix match the requested package.
    /// </summary>
    private async Task<ImmutableArray<string>> FindAlpinePackageUrlsAsync(
        HttpClient client,
        string packageName,
        string version,
        string alpineArch,
        CancellationToken ct)
    {
        var releases = new[] { "v3.20", "v3.19", "v3.18", "v3.17" };
        var urls = new List<string>();

        foreach (var release in releases)
        {
            var baseUrl = $"https://dl-cdn.alpinelinux.org/alpine/{release}/main/{alpineArch}/";

            try
            {
                var html = await client.GetStringAsync(baseUrl, ct);

                // Find package URLs matching version
                var matches = AlpinePackageRegex().Matches(html);
                foreach (Match match in matches)
                {
                    if (match.Groups["name"].Value == packageName &&
                        match.Groups["version"].Value.StartsWith(version, StringComparison.OrdinalIgnoreCase))
                    {
                        urls.Add($"{baseUrl}{match.Groups["file"].Value}");
                    }
                }
            }
            catch (HttpRequestException ex)
            {
                // Skip releases we can't access, but leave a trace for diagnostics.
                _logger.LogDebug(ex, "Alpine listing unavailable for release {Release}", release);
            }
        }

        return [.. urls];
    }

    /// <summary>
    /// Extracts libz.so.1 from a downloaded .deb package.
    /// NOTE: extraction is not implemented yet; always returns null.
    /// </summary>
    private async Task<LibraryBinary?> ExtractLibZFromDebAsync(
        byte[] debPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .deb extraction - placeholder for now
        await Task.CompletedTask;

        _logger.LogDebug(
            "Debian package extraction not fully implemented. Package size: {Size} bytes",
            debPackage.Length);

        return null;
    }

    /// <summary>
    /// Extracts libz.so.1 from a downloaded .apk package.
    /// NOTE: extraction is not implemented yet; always returns null.
    /// </summary>
    private async Task<LibraryBinary?> ExtractLibZFromApkAsync(
        byte[] apkPackage,
        string version,
        string architecture,
        LibraryFetchOptions? options,
        CancellationToken ct)
    {
        // .apk extraction - placeholder for now
        await Task.CompletedTask;

        _logger.LogDebug(
            "Alpine package extraction not fully implemented. Package size: {Size} bytes",
            apkPackage.Length);

        return null;
    }

    /// <summary>
    /// Parses the snapshot.debian.org JSON response and builds file-download URLs
    /// for entries matching the requested version substring and architecture.
    /// Malformed JSON yields an empty result.
    /// </summary>
    private static ImmutableArray<string> ExtractPackageUrlsForVersion(
        string json,
        string version,
        string debianArch)
    {
        var urls = new List<string>();

        try
        {
            using var doc = System.Text.Json.JsonDocument.Parse(json);

            if (doc.RootElement.TryGetProperty("result", out var results))
            {
                foreach (var item in results.EnumerateArray())
                {
                    if (item.TryGetProperty("binary_version", out var binaryVersion) &&
                        item.TryGetProperty("architecture", out var arch))
                    {
                        var binVer = binaryVersion.GetString() ?? string.Empty;
                        var archStr = arch.GetString() ?? string.Empty;

                        // Check if version matches and architecture matches
                        if (binVer.Contains(version, StringComparison.OrdinalIgnoreCase) &&
                            archStr.Equals(debianArch, StringComparison.OrdinalIgnoreCase))
                        {
                            if (item.TryGetProperty("files", out var files))
                            {
                                foreach (var file in files.EnumerateArray())
                                {
                                    if (file.TryGetProperty("hash", out var hashElement))
                                    {
                                        var hash = hashElement.GetString();
                                        if (!string.IsNullOrEmpty(hash))
                                        {
                                            // Snapshot serves files addressed by content hash.
                                            urls.Add($"https://snapshot.debian.org/file/{hash}");
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        catch (System.Text.Json.JsonException)
        {
            // Invalid JSON - treat as "no results" rather than failing the fetch.
        }

        return [.. urls];
    }

    /// <summary>
    /// Normalizes architecture aliases to the canonical names used internally.
    /// Unknown values pass through unchanged.
    /// </summary>
    private static string NormalizeArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" or "amd64" => "x86_64",
            "aarch64" or "arm64" => "aarch64",
            "armhf" or "armv7" or "arm" => "armhf",
            "i386" or "i686" or "x86" => "i386",
            _ => architecture
        };
    }

    /// <summary>
    /// Maps a normalized architecture to its Debian name, or null if unsupported.
    /// </summary>
    private static string? MapToDebianArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "amd64",
            "aarch64" => "arm64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "i386",
            _ => null
        };
    }

    /// <summary>
    /// Maps a normalized architecture to its Alpine name, or null if unsupported.
    /// </summary>
    private static string? MapToAlpineArchitecture(string architecture)
    {
        return architecture.ToLowerInvariant() switch
        {
            "x86_64" => "x86_64",
            "aarch64" => "aarch64",
            "armhf" or "armv7" => "armhf",
            "i386" or "i686" => "x86",
            _ => null
        };
    }

    /// <summary>
    /// Parses a dotted version string into a <see cref="Version"/> for sorting,
    /// or null when the string is not a valid version.
    /// </summary>
    private static Version? ParseVersion(string versionString)
    {
        return Version.TryParse(versionString, out var version) ? version : null;
    }

    #endregion

    #region Generated Regexes

    // Matches zlib archive names like zlib-1.2.13.tar.gz, capturing the version.
    [GeneratedRegex(@"zlib-(?<version>\d+\.\d+(?:\.\d+)?)", RegexOptions.IgnoreCase)]
    private static partial Regex ZlibVersionRegex();

    // Matches Alpine .apk links, capturing file name, package name, and version (with optional -rN).
    [GeneratedRegex(@"href=""(?<file>(?<name>[a-z0-9_-]+)-(?<version>[0-9.]+(?:-r\d+)?)\.apk)""", RegexOptions.IgnoreCase)]
    private static partial Regex AlpinePackageRegex();

    #endregion
}
|
||||
@@ -0,0 +1,135 @@
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus;
|
||||
|
||||
/// <summary>
/// Service for ingesting library functions into the corpus.
/// Supports direct binary ingestion, connector-driven bulk ingestion,
/// CVE association updates, and job status queries.
/// </summary>
public interface ICorpusIngestionService
{
    /// <summary>
    /// Ingest all functions from a library binary.
    /// </summary>
    /// <param name="metadata">Library metadata (name, version, architecture, build info).</param>
    /// <param name="binaryStream">Binary file stream.</param>
    /// <param name="options">Ingestion options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Ingestion result with statistics.</returns>
    Task<IngestionResult> IngestLibraryAsync(
        LibraryIngestionMetadata metadata,
        Stream binaryStream,
        IngestionOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Ingest functions from a library connector.
    /// </summary>
    /// <param name="libraryName">Library name (e.g., "glibc").</param>
    /// <param name="connector">Library corpus connector supplying binaries.</param>
    /// <param name="options">Ingestion options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Stream of ingestion results, one per ingested binary.</returns>
    IAsyncEnumerable<IngestionResult> IngestFromConnectorAsync(
        string libraryName,
        ILibraryCorpusConnector connector,
        IngestionOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Update CVE associations for corpus functions.
    /// </summary>
    /// <param name="cveId">CVE identifier.</param>
    /// <param name="associations">Function-CVE associations to apply.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Number of associations updated.</returns>
    Task<int> UpdateCveAssociationsAsync(
        string cveId,
        IReadOnlyList<FunctionCveAssociation> associations,
        CancellationToken ct = default);

    /// <summary>
    /// Get ingestion job status.
    /// </summary>
    /// <param name="jobId">Job ID.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Job details or null if not found.</returns>
    Task<IngestionJob?> GetJobStatusAsync(Guid jobId, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Metadata for library ingestion.
/// </summary>
/// <param name="Name">Library name (e.g. "zlib").</param>
/// <param name="Version">Library version string.</param>
/// <param name="Architecture">Target architecture of the binary.</param>
/// <param name="Abi">ABI identifier, if known.</param>
/// <param name="Compiler">Compiler used to build the binary, if known.</param>
/// <param name="CompilerVersion">Compiler version, if known.</param>
/// <param name="OptimizationLevel">Optimization level used at build time, if known.</param>
/// <param name="ReleaseDate">Upstream release date, if known.</param>
/// <param name="IsSecurityRelease">Whether the release is flagged as a security release.</param>
/// <param name="SourceArchiveSha256">SHA-256 of the source archive, if known.</param>
public sealed record LibraryIngestionMetadata(
    string Name,
    string Version,
    string Architecture,
    string? Abi = null,
    string? Compiler = null,
    string? CompilerVersion = null,
    string? OptimizationLevel = null,
    DateOnly? ReleaseDate = null,
    bool IsSecurityRelease = false,
    string? SourceArchiveSha256 = null);
|
||||
|
||||
/// <summary>
/// Options for corpus ingestion.
/// </summary>
public sealed record IngestionOptions
{
    /// <summary>
    /// Minimum function size to index (bytes). Smaller functions are skipped.
    /// </summary>
    public int MinFunctionSize { get; init; } = 16;

    /// <summary>
    /// Maximum functions per binary.
    /// </summary>
    public int MaxFunctionsPerBinary { get; init; } = 10_000;

    /// <summary>
    /// Algorithms to use for fingerprinting. Defaults to semantic,
    /// instruction-level, and CFG fingerprints.
    /// </summary>
    public ImmutableArray<FingerprintAlgorithm> Algorithms { get; init; } =
        [FingerprintAlgorithm.SemanticKsg, FingerprintAlgorithm.InstructionBb, FingerprintAlgorithm.CfgWl];

    /// <summary>
    /// Include exported functions only. When false, all functions are considered.
    /// </summary>
    public bool ExportedOnly { get; init; } = false;

    /// <summary>
    /// Generate function clusters after ingestion.
    /// </summary>
    public bool GenerateClusters { get; init; } = true;

    /// <summary>
    /// Parallel degree for function processing.
    /// </summary>
    public int ParallelDegree { get; init; } = 4;
}
|
||||
|
||||
/// <summary>
/// Result of a library ingestion.
/// </summary>
/// <param name="JobId">Identifier of the ingestion job that produced this result.</param>
/// <param name="LibraryName">Library name.</param>
/// <param name="Version">Library version.</param>
/// <param name="Architecture">Target architecture.</param>
/// <param name="FunctionsIndexed">Number of functions indexed.</param>
/// <param name="FingerprintsGenerated">Number of fingerprints generated.</param>
/// <param name="ClustersCreated">Number of function clusters created.</param>
/// <param name="Duration">Wall-clock duration of the ingestion.</param>
/// <param name="Errors">Errors encountered during ingestion.</param>
/// <param name="Warnings">Non-fatal warnings encountered during ingestion.</param>
public sealed record IngestionResult(
    Guid JobId,
    string LibraryName,
    string Version,
    string Architecture,
    int FunctionsIndexed,
    int FingerprintsGenerated,
    int ClustersCreated,
    TimeSpan Duration,
    ImmutableArray<string> Errors,
    ImmutableArray<string> Warnings);
|
||||
|
||||
/// <summary>
/// Association between a function and a CVE.
/// </summary>
/// <param name="FunctionId">Corpus function identifier.</param>
/// <param name="AffectedState">The function's state with respect to the CVE.</param>
/// <param name="PatchCommit">Patch commit identifier, if known.</param>
/// <param name="Confidence">Confidence score for the association.</param>
/// <param name="EvidenceType">Type of evidence backing the association, if known.</param>
public sealed record FunctionCveAssociation(
    Guid FunctionId,
    CveAffectedState AffectedState,
    string? PatchCommit,
    decimal Confidence,
    CveEvidenceType? EvidenceType);
|
||||
@@ -0,0 +1,186 @@
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus;
|
||||
|
||||
/// <summary>
/// Service for querying the function corpus: function identification,
/// CVE lookups, function evolution, and corpus statistics.
/// </summary>
public interface ICorpusQueryService
{
    /// <summary>
    /// Identify a function by its fingerprints.
    /// </summary>
    /// <param name="fingerprints">Function fingerprints to match.</param>
    /// <param name="options">Query options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions ordered by similarity.</returns>
    Task<ImmutableArray<FunctionMatch>> IdentifyFunctionAsync(
        FunctionFingerprints fingerprints,
        IdentifyOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Batch identify functions.
    /// </summary>
    /// <param name="fingerprints">Multiple function fingerprints.</param>
    /// <param name="options">Query options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matches for each input fingerprint, keyed by input index.</returns>
    Task<ImmutableDictionary<int, ImmutableArray<FunctionMatch>>> IdentifyBatchAsync(
        IReadOnlyList<FunctionFingerprints> fingerprints,
        IdentifyOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get all functions associated with a CVE.
    /// </summary>
    /// <param name="cveId">CVE identifier.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Functions affected by the CVE.</returns>
    Task<ImmutableArray<CorpusFunctionWithCve>> GetFunctionsForCveAsync(
        string cveId,
        CancellationToken ct = default);

    /// <summary>
    /// Get function evolution across library versions.
    /// </summary>
    /// <param name="libraryName">Library name.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Function evolution timeline, or null if not found.</returns>
    Task<FunctionEvolution?> GetFunctionEvolutionAsync(
        string libraryName,
        string functionName,
        CancellationToken ct = default);

    /// <summary>
    /// Get corpus statistics.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Corpus statistics.</returns>
    Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default);

    /// <summary>
    /// List libraries in the corpus.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Libraries with version counts.</returns>
    Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default);

    /// <summary>
    /// List versions for a library.
    /// </summary>
    /// <param name="libraryName">Library name.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version information.</returns>
    Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
        string libraryName,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Fingerprints for function identification. Any component may be null when
/// unavailable for the function being identified.
/// </summary>
/// <param name="SemanticHash">Semantic fingerprint hash.</param>
/// <param name="InstructionHash">Instruction-level fingerprint hash.</param>
/// <param name="CfgHash">Control-flow-graph fingerprint hash.</param>
/// <param name="ApiCalls">Names of APIs called by the function.</param>
/// <param name="SizeBytes">Function size in bytes.</param>
public sealed record FunctionFingerprints(
    byte[]? SemanticHash,
    byte[]? InstructionHash,
    byte[]? CfgHash,
    ImmutableArray<string>? ApiCalls,
    int? SizeBytes);
|
||||
|
||||
/// <summary>
/// Options for function identification.
/// </summary>
public sealed record IdentifyOptions
{
    /// <summary>
    /// Minimum similarity threshold (0.0-1.0); matches below it are excluded.
    /// </summary>
    public decimal MinSimilarity { get; init; } = 0.70m;

    /// <summary>
    /// Maximum results to return.
    /// </summary>
    public int MaxResults { get; init; } = 10;

    /// <summary>
    /// Filter by library names. Null means no library filtering.
    /// </summary>
    public ImmutableArray<string>? LibraryFilter { get; init; }

    /// <summary>
    /// Filter by architectures. Null means no architecture filtering.
    /// </summary>
    public ImmutableArray<string>? ArchitectureFilter { get; init; }

    /// <summary>
    /// Include CVE information in results.
    /// </summary>
    public bool IncludeCveInfo { get; init; } = true;

    /// <summary>
    /// Weights for similarity computation.
    /// </summary>
    public SimilarityWeights Weights { get; init; } = SimilarityWeights.Default;
}
|
||||
|
||||
/// <summary>
/// Weights for computing overall similarity. The four default weights sum to 1.0.
/// </summary>
public sealed record SimilarityWeights
{
    /// <summary>Weight for semantic fingerprint similarity.</summary>
    public decimal SemanticWeight { get; init; } = 0.35m;

    /// <summary>Weight for instruction-level fingerprint similarity.</summary>
    public decimal InstructionWeight { get; init; } = 0.25m;

    /// <summary>Weight for control-flow-graph fingerprint similarity.</summary>
    public decimal CfgWeight { get; init; } = 0.25m;

    /// <summary>Weight for API-call-set similarity.</summary>
    public decimal ApiCallWeight { get; init; } = 0.15m;

    /// <summary>Default weights.</summary>
    public static SimilarityWeights Default { get; } = new();
}
|
||||
|
||||
/// <summary>
/// Function with CVE information, including its library/version/build context.
/// </summary>
/// <param name="Function">The corpus function.</param>
/// <param name="Library">The library the function belongs to.</param>
/// <param name="Version">The library version containing the function.</param>
/// <param name="Build">The build variant containing the function.</param>
/// <param name="CveInfo">The CVE association for the function.</param>
public sealed record CorpusFunctionWithCve(
    CorpusFunction Function,
    LibraryMetadata Library,
    LibraryVersion Version,
    BuildVariant Build,
    FunctionCve CveInfo);
|
||||
|
||||
/// <summary>
/// Corpus statistics.
/// </summary>
/// <param name="LibraryCount">Number of libraries in the corpus.</param>
/// <param name="VersionCount">Number of library versions.</param>
/// <param name="BuildVariantCount">Number of build variants.</param>
/// <param name="FunctionCount">Number of indexed functions.</param>
/// <param name="FingerprintCount">Number of stored fingerprints.</param>
/// <param name="ClusterCount">Number of function clusters.</param>
/// <param name="CveAssociationCount">Number of function-CVE associations.</param>
/// <param name="LastUpdated">Timestamp of the last corpus update, if any.</param>
public sealed record CorpusStatistics(
    int LibraryCount,
    int VersionCount,
    int BuildVariantCount,
    int FunctionCount,
    int FingerprintCount,
    int ClusterCount,
    int CveAssociationCount,
    DateTimeOffset? LastUpdated);
|
||||
|
||||
/// <summary>
/// Summary of a library in the corpus.
/// </summary>
/// <param name="Id">Library identifier.</param>
/// <param name="Name">Library name.</param>
/// <param name="Description">Library description, if any.</param>
/// <param name="VersionCount">Number of versions indexed for the library.</param>
/// <param name="FunctionCount">Number of functions indexed for the library.</param>
/// <param name="CveCount">Number of CVEs associated with the library.</param>
/// <param name="LatestVersionDate">Date of the most recent indexed version, if known.</param>
public sealed record LibrarySummary(
    Guid Id,
    string Name,
    string? Description,
    int VersionCount,
    int FunctionCount,
    int CveCount,
    DateTimeOffset? LatestVersionDate);
|
||||
|
||||
/// <summary>
/// Summary of a library version.
/// </summary>
/// <param name="Id">Version identifier.</param>
/// <param name="Version">Version string.</param>
/// <param name="ReleaseDate">Upstream release date, if known.</param>
/// <param name="IsSecurityRelease">Whether the release is flagged as a security release.</param>
/// <param name="BuildVariantCount">Number of build variants indexed for the version.</param>
/// <param name="FunctionCount">Number of functions indexed for the version.</param>
/// <param name="Architectures">Architectures with indexed build variants.</param>
public sealed record LibraryVersionSummary(
    Guid Id,
    string Version,
    DateOnly? ReleaseDate,
    bool IsSecurityRelease,
    int BuildVariantCount,
    int FunctionCount,
    ImmutableArray<string> Architectures);
|
||||
@@ -0,0 +1,327 @@
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus;
|
||||
|
||||
/// <summary>
|
||||
/// Repository for corpus data access.
|
||||
/// </summary>
|
||||
public interface ICorpusRepository
|
||||
{
|
||||
#region Libraries
|
||||
|
||||
/// <summary>
|
||||
/// Get or create a library.
|
||||
/// </summary>
|
||||
Task<LibraryMetadata> GetOrCreateLibraryAsync(
|
||||
string name,
|
||||
string? description = null,
|
||||
string? homepageUrl = null,
|
||||
string? sourceRepo = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a library by name.
|
||||
/// </summary>
|
||||
Task<LibraryMetadata?> GetLibraryAsync(string name, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a library by ID.
|
||||
/// </summary>
|
||||
Task<LibraryMetadata?> GetLibraryByIdAsync(Guid id, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// List all libraries.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Library Versions
|
||||
|
||||
/// <summary>
|
||||
/// Get or create a library version.
|
||||
/// </summary>
|
||||
Task<LibraryVersion> GetOrCreateVersionAsync(
|
||||
Guid libraryId,
|
||||
string version,
|
||||
DateOnly? releaseDate = null,
|
||||
bool isSecurityRelease = false,
|
||||
string? sourceArchiveSha256 = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a library version.
|
||||
/// </summary>
|
||||
Task<LibraryVersion?> GetVersionAsync(
|
||||
Guid libraryId,
|
||||
string version,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a library version by ID.
|
||||
/// </summary>
|
||||
Task<LibraryVersion?> GetLibraryVersionAsync(
|
||||
Guid versionId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// List versions for a library.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
|
||||
string libraryName,
|
||||
CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Build Variants
|
||||
|
||||
/// <summary>
|
||||
/// Get or create a build variant.
|
||||
/// </summary>
|
||||
Task<BuildVariant> GetOrCreateBuildVariantAsync(
|
||||
Guid libraryVersionId,
|
||||
string architecture,
|
||||
string binarySha256,
|
||||
string? abi = null,
|
||||
string? compiler = null,
|
||||
string? compilerVersion = null,
|
||||
string? optimizationLevel = null,
|
||||
string? buildId = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a build variant by binary hash.
|
||||
/// </summary>
|
||||
Task<BuildVariant?> GetBuildVariantBySha256Async(
|
||||
string binarySha256,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a build variant by ID.
|
||||
/// </summary>
|
||||
Task<BuildVariant?> GetBuildVariantAsync(
|
||||
Guid variantId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get build variants for a version.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<BuildVariant>> GetBuildVariantsAsync(
|
||||
Guid libraryVersionId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Functions
|
||||
|
||||
/// <summary>
|
||||
/// Bulk insert functions.
|
||||
/// </summary>
|
||||
Task<int> InsertFunctionsAsync(
|
||||
IReadOnlyList<CorpusFunction> functions,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a function by ID.
|
||||
/// </summary>
|
||||
Task<CorpusFunction?> GetFunctionAsync(Guid id, CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get functions for a build variant.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<CorpusFunction>> GetFunctionsForVariantAsync(
|
||||
Guid buildVariantId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get function count for a build variant.
|
||||
/// </summary>
|
||||
Task<int> GetFunctionCountAsync(Guid buildVariantId, CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Fingerprints
|
||||
|
||||
/// <summary>
|
||||
/// Bulk insert fingerprints.
|
||||
/// </summary>
|
||||
Task<int> InsertFingerprintsAsync(
|
||||
IReadOnlyList<CorpusFingerprint> fingerprints,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Find functions by fingerprint hash.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<Guid>> FindFunctionsByFingerprintAsync(
|
||||
FingerprintAlgorithm algorithm,
|
||||
byte[] fingerprint,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Find similar fingerprints (for approximate matching).
|
||||
/// </summary>
|
||||
Task<ImmutableArray<FingerprintSearchResult>> FindSimilarFingerprintsAsync(
|
||||
FingerprintAlgorithm algorithm,
|
||||
byte[] fingerprint,
|
||||
int maxResults = 10,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get fingerprints for a function.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<CorpusFingerprint>> GetFingerprintsAsync(
|
||||
Guid functionId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get fingerprints for a function (alias).
|
||||
/// </summary>
|
||||
Task<ImmutableArray<CorpusFingerprint>> GetFingerprintsForFunctionAsync(
|
||||
Guid functionId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Clusters
|
||||
|
||||
/// <summary>
|
||||
/// Get or create a function cluster.
|
||||
/// </summary>
|
||||
Task<FunctionCluster> GetOrCreateClusterAsync(
|
||||
Guid libraryId,
|
||||
string canonicalName,
|
||||
string? description = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get a cluster by ID.
|
||||
/// </summary>
|
||||
Task<FunctionCluster?> GetClusterAsync(
|
||||
Guid clusterId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get all clusters for a library.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
|
||||
Guid libraryId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Insert a new cluster.
|
||||
/// </summary>
|
||||
Task InsertClusterAsync(
|
||||
FunctionCluster cluster,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Add members to a cluster.
|
||||
/// </summary>
|
||||
Task<int> AddClusterMembersAsync(
|
||||
Guid clusterId,
|
||||
IReadOnlyList<ClusterMember> members,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Add a single member to a cluster.
|
||||
/// </summary>
|
||||
Task AddClusterMemberAsync(
|
||||
ClusterMember member,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get cluster members.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<Guid>> GetClusterMemberIdsAsync(
|
||||
Guid clusterId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get cluster members with details.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<ClusterMember>> GetClusterMembersAsync(
|
||||
Guid clusterId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Clear all members from a cluster.
|
||||
/// </summary>
|
||||
Task ClearClusterMembersAsync(
|
||||
Guid clusterId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region CVE Associations
|
||||
|
||||
/// <summary>
|
||||
/// Upsert CVE associations.
|
||||
/// </summary>
|
||||
Task<int> UpsertCveAssociationsAsync(
|
||||
string cveId,
|
||||
IReadOnlyList<FunctionCve> associations,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get functions for a CVE.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<Guid>> GetFunctionIdsForCveAsync(
|
||||
string cveId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get CVEs for a function.
|
||||
/// </summary>
|
||||
Task<ImmutableArray<FunctionCve>> GetCvesForFunctionAsync(
|
||||
Guid functionId,
|
||||
CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Ingestion Jobs
|
||||
|
||||
/// <summary>
|
||||
/// Create an ingestion job.
|
||||
/// </summary>
|
||||
Task<IngestionJob> CreateIngestionJobAsync(
|
||||
Guid libraryId,
|
||||
IngestionJobType jobType,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Update ingestion job status.
|
||||
/// </summary>
|
||||
Task UpdateIngestionJobAsync(
|
||||
Guid jobId,
|
||||
IngestionJobStatus status,
|
||||
int? functionsIndexed = null,
|
||||
int? fingerprintsGenerated = null,
|
||||
int? clustersCreated = null,
|
||||
ImmutableArray<string>? errors = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
/// <summary>
|
||||
/// Get ingestion job.
|
||||
/// </summary>
|
||||
Task<IngestionJob?> GetIngestionJobAsync(Guid jobId, CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
|
||||
#region Statistics
|
||||
|
||||
/// <summary>
|
||||
/// Get corpus statistics.
|
||||
/// </summary>
|
||||
Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default);
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
/// One hit from a fingerprint similarity search.
/// </summary>
/// <param name="FunctionId">Identifier of the matched corpus function.</param>
/// <param name="Fingerprint">Raw bytes of the stored fingerprint that matched.</param>
/// <param name="Similarity">Similarity score for the match (higher means closer).</param>
public sealed record FingerprintSearchResult(
    Guid FunctionId,
    byte[] Fingerprint,
    decimal Similarity);
|
||||
@@ -0,0 +1,155 @@
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus;
|
||||
|
||||
/// <summary>
/// Connector for fetching library binaries from various sources.
/// Implementations feed the function corpus with binaries to index.
/// </summary>
public interface ILibraryCorpusConnector
{
    /// <summary>
    /// Library name this connector handles (e.g., "glibc", "openssl").
    /// </summary>
    string LibraryName { get; }

    /// <summary>
    /// Architectures this connector can supply binaries for.
    /// </summary>
    ImmutableArray<string> SupportedArchitectures { get; }

    /// <summary>
    /// List the versions of the library available from this source.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Available versions ordered newest first.</returns>
    Task<ImmutableArray<string>> GetAvailableVersionsAsync(CancellationToken ct = default);

    /// <summary>
    /// Fetch a single library binary for a version/architecture pair.
    /// </summary>
    /// <param name="version">Library version to fetch.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="options">Optional fetch preferences; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The binary, or null when this source has no matching artifact.</returns>
    Task<LibraryBinary?> FetchBinaryAsync(
        string version,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Stream binaries for several versions of the same architecture.
    /// </summary>
    /// <param name="versions">Versions to fetch.</param>
    /// <param name="architecture">Target architecture.</param>
    /// <param name="options">Optional fetch preferences; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Asynchronous stream of fetched binaries.</returns>
    IAsyncEnumerable<LibraryBinary> FetchBinariesAsync(
        IEnumerable<string> versions,
        string architecture,
        LibraryFetchOptions? options = null,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// A library binary fetched from a connector. Owns <paramref name="BinaryStream"/>;
/// dispose the record to release the underlying stream.
/// </summary>
/// <param name="LibraryName">Name of the library (e.g., "openssl").</param>
/// <param name="Version">Library version string.</param>
/// <param name="Architecture">Target architecture of the binary.</param>
/// <param name="Abi">ABI variant, if known (e.g., "gnu", "musl").</param>
/// <param name="Compiler">Compiler used to build the binary, if known.</param>
/// <param name="CompilerVersion">Compiler version, if known.</param>
/// <param name="OptimizationLevel">Optimization level, if known.</param>
/// <param name="BinaryStream">Stream containing the binary contents; disposed with the record.</param>
/// <param name="Sha256">SHA-256 of the binary contents (hex).</param>
/// <param name="BuildId">Build identifier embedded in the binary, if any.</param>
/// <param name="Source">Where the binary was obtained from.</param>
/// <param name="ReleaseDate">Upstream release date, if known.</param>
public sealed record LibraryBinary(
    string LibraryName,
    string Version,
    string Architecture,
    string? Abi,
    string? Compiler,
    string? CompilerVersion,
    string? OptimizationLevel,
    Stream BinaryStream,
    string Sha256,
    string? BuildId,
    LibraryBinarySource Source,
    DateOnly? ReleaseDate) : IDisposable
{
    /// <summary>Releases the owned binary stream.</summary>
    public void Dispose() => BinaryStream.Dispose();
}
|
||||
|
||||
/// <summary>
/// Provenance of a fetched library binary.
/// </summary>
/// <param name="Type">Kind of source the binary came from.</param>
/// <param name="PackageName">Distro package name, when sourced from a package.</param>
/// <param name="DistroRelease">Distro release identifier, when applicable.</param>
/// <param name="MirrorUrl">Mirror URL the artifact was downloaded from, if any.</param>
public sealed record LibraryBinarySource(
    LibrarySourceType Type,
    string? PackageName,
    string? DistroRelease,
    string? MirrorUrl);
|
||||
|
||||
/// <summary>
/// Kind of source a library binary was obtained from.
/// </summary>
public enum LibrarySourceType
{
    /// <summary>Binary extracted from a Debian/Ubuntu package.</summary>
    DebianPackage,

    /// <summary>Binary extracted from an RPM package.</summary>
    RpmPackage,

    /// <summary>Binary extracted from an Alpine APK.</summary>
    AlpineApk,

    /// <summary>Binary compiled locally from source.</summary>
    CompiledSource,

    /// <summary>Binary taken from an upstream release artifact.</summary>
    UpstreamRelease,

    /// <summary>Binary retrieved from a debug symbol server.</summary>
    DebugSymbolServer
}
|
||||
|
||||
/// <summary>
/// Preferences applied when fetching library binaries from a connector.
/// All properties are optional; unset values leave the choice to the connector.
/// </summary>
public sealed record LibraryFetchOptions
{
    /// <summary>Preferred ABI (e.g., "gnu", "musl").</summary>
    public string? PreferredAbi { get; init; }

    /// <summary>Preferred compiler for pre-built artifacts.</summary>
    public string? PreferredCompiler { get; init; }

    /// <summary>Whether to include debug symbols when available. Defaults to true.</summary>
    public bool IncludeDebugSymbols { get; init; } = true;

    /// <summary>Preferred distribution for pre-built packages.</summary>
    public string? PreferredDistro { get; init; }

    /// <summary>Timeout applied to network operations. Defaults to five minutes.</summary>
    public TimeSpan Timeout { get; init; } = TimeSpan.FromMinutes(5);
}
|
||||
@@ -0,0 +1,273 @@
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
/// <summary>
/// Metadata about a known library in the corpus.
/// </summary>
/// <param name="Id">Corpus-assigned library identifier.</param>
/// <param name="Name">Library name (e.g., "glibc").</param>
/// <param name="Description">Free-form description, if any.</param>
/// <param name="HomepageUrl">Project homepage URL, if known.</param>
/// <param name="SourceRepo">Source repository URL, if known.</param>
/// <param name="CreatedAt">When the record was created.</param>
/// <param name="UpdatedAt">When the record was last updated.</param>
public sealed record LibraryMetadata(
    Guid Id,
    string Name,
    string? Description,
    string? HomepageUrl,
    string? SourceRepo,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt);
|
||||
|
||||
/// <summary>
/// A specific version of a library tracked in the corpus.
/// </summary>
/// <param name="Id">Corpus-assigned version identifier.</param>
/// <param name="LibraryId">Identifier of the owning library.</param>
/// <param name="Version">Version string.</param>
/// <param name="ReleaseDate">Upstream release date, if known.</param>
/// <param name="IsSecurityRelease">True when the version is a security release.</param>
/// <param name="SourceArchiveSha256">SHA-256 of the source archive, if known.</param>
/// <param name="IndexedAt">When this version was indexed.</param>
public sealed record LibraryVersion(
    Guid Id,
    Guid LibraryId,
    string Version,
    DateOnly? ReleaseDate,
    bool IsSecurityRelease,
    string? SourceArchiveSha256,
    DateTimeOffset IndexedAt);
|
||||
|
||||
/// <summary>
/// A specific build variant (architecture/ABI/compiler combination) of a library version.
/// </summary>
/// <param name="Id">Corpus-assigned variant identifier.</param>
/// <param name="LibraryVersionId">Identifier of the owning library version.</param>
/// <param name="Architecture">Target architecture of the build.</param>
/// <param name="Abi">ABI variant, if known.</param>
/// <param name="Compiler">Compiler used, if known.</param>
/// <param name="CompilerVersion">Compiler version, if known.</param>
/// <param name="OptimizationLevel">Optimization level, if known.</param>
/// <param name="BuildId">Build identifier embedded in the binary, if any.</param>
/// <param name="BinarySha256">SHA-256 of the binary (hex).</param>
/// <param name="IndexedAt">When this variant was indexed.</param>
public sealed record BuildVariant(
    Guid Id,
    Guid LibraryVersionId,
    string Architecture,
    string? Abi,
    string? Compiler,
    string? CompilerVersion,
    string? OptimizationLevel,
    string? BuildId,
    string BinarySha256,
    DateTimeOffset IndexedAt);
|
||||
|
||||
/// <summary>
/// A single function extracted from a build variant in the corpus.
/// </summary>
/// <param name="Id">Corpus-assigned function identifier.</param>
/// <param name="BuildVariantId">Identifier of the owning build variant.</param>
/// <param name="Name">Raw (possibly mangled) symbol name.</param>
/// <param name="DemangledName">Demangled name, if available.</param>
/// <param name="Address">Virtual address of the function in the binary.</param>
/// <param name="SizeBytes">Function size in bytes.</param>
/// <param name="IsExported">True when the symbol is exported.</param>
/// <param name="IsInline">True when the function was inlined.</param>
/// <param name="SourceFile">Source file path from debug info, if available.</param>
/// <param name="SourceLine">Source line number from debug info, if available.</param>
public sealed record CorpusFunction(
    Guid Id,
    Guid BuildVariantId,
    string Name,
    string? DemangledName,
    ulong Address,
    int SizeBytes,
    bool IsExported,
    bool IsInline,
    string? SourceFile,
    int? SourceLine);
|
||||
|
||||
/// <summary>
/// A stored fingerprint for a corpus function.
/// </summary>
/// <param name="Id">Corpus-assigned fingerprint identifier.</param>
/// <param name="FunctionId">Identifier of the fingerprinted function.</param>
/// <param name="Algorithm">Algorithm that produced the fingerprint.</param>
/// <param name="Fingerprint">Raw fingerprint bytes.</param>
/// <param name="FingerprintHex">Hex encoding of <paramref name="Fingerprint"/>.</param>
/// <param name="Metadata">Algorithm-specific metadata, if any.</param>
/// <param name="CreatedAt">When the fingerprint was generated.</param>
public sealed record CorpusFingerprint(
    Guid Id,
    Guid FunctionId,
    FingerprintAlgorithm Algorithm,
    byte[] Fingerprint,
    string FingerprintHex,
    FingerprintMetadata? Metadata,
    DateTimeOffset CreatedAt);
|
||||
|
||||
/// <summary>
/// Algorithm used to generate a function fingerprint.
/// </summary>
public enum FingerprintAlgorithm
{
    /// <summary>Semantic key-semantics graph fingerprint (from Phase 1).</summary>
    SemanticKsg,

    /// <summary>Instruction-level basic block hash.</summary>
    InstructionBb,

    /// <summary>Control flow graph Weisfeiler-Lehman hash.</summary>
    CfgWl,

    /// <summary>API call sequence hash.</summary>
    ApiCalls,

    /// <summary>Combined multi-algorithm fingerprint.</summary>
    Combined
}
|
||||
|
||||
/// <summary>
/// Algorithm-specific metadata recorded alongside a fingerprint.
/// All members are optional; which ones are populated depends on the algorithm.
/// </summary>
/// <param name="NodeCount">Graph node count, when graph-based.</param>
/// <param name="EdgeCount">Graph edge count, when graph-based.</param>
/// <param name="CyclomaticComplexity">Cyclomatic complexity, if computed.</param>
/// <param name="ApiCalls">API calls observed in the function, if recorded.</param>
/// <param name="OperationHashHex">Hex hash of the operation sequence, if recorded.</param>
/// <param name="DataFlowHashHex">Hex hash of the data-flow structure, if recorded.</param>
public sealed record FingerprintMetadata(
    int? NodeCount,
    int? EdgeCount,
    int? CyclomaticComplexity,
    ImmutableArray<string>? ApiCalls,
    string? OperationHashHex,
    string? DataFlowHashHex);
|
||||
|
||||
/// <summary>
/// A cluster grouping similar functions across library versions.
/// </summary>
/// <param name="Id">Corpus-assigned cluster identifier.</param>
/// <param name="LibraryId">Identifier of the library the cluster belongs to.</param>
/// <param name="CanonicalName">Canonical function name for the cluster.</param>
/// <param name="Description">Optional free-form description.</param>
/// <param name="CreatedAt">When the cluster was created.</param>
public sealed record FunctionCluster(
    Guid Id,
    Guid LibraryId,
    string CanonicalName,
    string? Description,
    DateTimeOffset CreatedAt);
|
||||
|
||||
/// <summary>
/// Membership of a function in a function cluster.
/// </summary>
/// <param name="ClusterId">Identifier of the cluster.</param>
/// <param name="FunctionId">Identifier of the member function.</param>
/// <param name="SimilarityToCentroid">Similarity of this member to the cluster centroid, if computed.</param>
public sealed record ClusterMember(
    Guid ClusterId,
    Guid FunctionId,
    decimal? SimilarityToCentroid);
|
||||
|
||||
/// <summary>
/// Association between a corpus function and a CVE.
/// </summary>
/// <param name="FunctionId">Identifier of the associated function.</param>
/// <param name="CveId">CVE identifier (e.g., "CVE-2024-1234").</param>
/// <param name="AffectedState">Whether the function is vulnerable, fixed, or not affected.</param>
/// <param name="PatchCommit">Commit that fixed the issue, if known.</param>
/// <param name="Confidence">Confidence in the association.</param>
/// <param name="EvidenceType">Kind of evidence backing the association, if any.</param>
public sealed record FunctionCve(
    Guid FunctionId,
    string CveId,
    CveAffectedState AffectedState,
    string? PatchCommit,
    decimal Confidence,
    CveEvidenceType? EvidenceType);
|
||||
|
||||
/// <summary>
/// Whether a function is affected by a CVE.
/// </summary>
public enum CveAffectedState
{
    /// <summary>The function contains the vulnerability.</summary>
    Vulnerable,

    /// <summary>The function contains the fix.</summary>
    Fixed,

    /// <summary>The function is unrelated to the CVE.</summary>
    NotAffected
}
|
||||
|
||||
/// <summary>
/// Kind of evidence linking a function to a CVE.
/// </summary>
public enum CveEvidenceType
{
    /// <summary>Derived from a changelog entry.</summary>
    Changelog,

    /// <summary>Derived from a fix commit.</summary>
    Commit,

    /// <summary>Derived from a security advisory.</summary>
    Advisory,

    /// <summary>Derived from a patch header.</summary>
    PatchHeader,

    /// <summary>Entered manually by an analyst.</summary>
    Manual
}
|
||||
|
||||
/// <summary>
/// Tracking record for a corpus ingestion job.
/// </summary>
/// <param name="Id">Job identifier.</param>
/// <param name="LibraryId">Identifier of the library being ingested.</param>
/// <param name="JobType">Kind of ingestion performed.</param>
/// <param name="Status">Current job status.</param>
/// <param name="StartedAt">When the job started running, if it has.</param>
/// <param name="CompletedAt">When the job finished, if it has.</param>
/// <param name="FunctionsIndexed">Number of functions indexed so far, if reported.</param>
/// <param name="Errors">Errors accumulated by the job, if any.</param>
/// <param name="CreatedAt">When the job record was created.</param>
public sealed record IngestionJob(
    Guid Id,
    Guid LibraryId,
    IngestionJobType JobType,
    IngestionJobStatus Status,
    DateTimeOffset? StartedAt,
    DateTimeOffset? CompletedAt,
    int? FunctionsIndexed,
    ImmutableArray<string>? Errors,
    DateTimeOffset CreatedAt);
|
||||
|
||||
/// <summary>
/// Kind of corpus ingestion job.
/// </summary>
public enum IngestionJobType
{
    /// <summary>Ingest everything from scratch.</summary>
    FullIngest,

    /// <summary>Ingest only new or changed data.</summary>
    Incremental,

    /// <summary>Refresh CVE associations only.</summary>
    CveUpdate
}
|
||||
|
||||
/// <summary>
/// Lifecycle status of an ingestion job.
/// </summary>
public enum IngestionJobStatus
{
    /// <summary>Created but not yet started.</summary>
    Pending,

    /// <summary>Currently executing.</summary>
    Running,

    /// <summary>Finished successfully.</summary>
    Completed,

    /// <summary>Finished with a failure.</summary>
    Failed,

    /// <summary>Stopped before completion.</summary>
    Cancelled
}
|
||||
|
||||
/// <summary>
/// Result of a function identification query against the corpus.
/// </summary>
/// <param name="LibraryName">Name of the matched library.</param>
/// <param name="Version">Matched library version.</param>
/// <param name="FunctionName">Raw symbol name of the matched function.</param>
/// <param name="DemangledName">Demangled name, if available.</param>
/// <param name="Similarity">Overall similarity score.</param>
/// <param name="Confidence">Bucketed confidence level for the match.</param>
/// <param name="Architecture">Architecture of the matched build.</param>
/// <param name="Abi">ABI of the matched build, if known.</param>
/// <param name="Details">Per-algorithm breakdown of the match.</param>
public sealed record FunctionMatch(
    string LibraryName,
    string Version,
    string FunctionName,
    string? DemangledName,
    decimal Similarity,
    MatchConfidence Confidence,
    string Architecture,
    string? Abi,
    MatchDetails Details);
|
||||
|
||||
/// <summary>
/// Bucketed confidence level of a function match.
/// </summary>
public enum MatchConfidence
{
    /// <summary>Low confidence (similarity 50-70%).</summary>
    Low,

    /// <summary>Medium confidence (similarity 70-85%).</summary>
    Medium,

    /// <summary>High confidence (similarity 85-95%).</summary>
    High,

    /// <summary>Very high confidence (similarity 95%+).</summary>
    VeryHigh,

    /// <summary>Exact match (100% or hash collision).</summary>
    Exact
}
|
||||
|
||||
/// <summary>
/// Per-algorithm breakdown of a function match.
/// </summary>
/// <param name="SemanticSimilarity">Similarity from the semantic fingerprint.</param>
/// <param name="InstructionSimilarity">Similarity from instruction-level hashes.</param>
/// <param name="CfgSimilarity">Similarity from CFG hashes.</param>
/// <param name="ApiCallSimilarity">Similarity from API call sequences.</param>
/// <param name="MatchedApiCalls">API calls common to both functions.</param>
/// <param name="SizeDifferenceBytes">Size difference between the two functions, in bytes.</param>
public sealed record MatchDetails(
    decimal SemanticSimilarity,
    decimal InstructionSimilarity,
    decimal CfgSimilarity,
    decimal ApiCallSimilarity,
    ImmutableArray<string> MatchedApiCalls,
    int SizeDifferenceBytes);
|
||||
|
||||
/// <summary>
/// History of a single function across library versions.
/// </summary>
/// <param name="LibraryName">Name of the library.</param>
/// <param name="FunctionName">Name of the tracked function.</param>
/// <param name="Versions">Per-version information for the function.</param>
public sealed record FunctionEvolution(
    string LibraryName,
    string FunctionName,
    ImmutableArray<FunctionVersionInfo> Versions);
|
||||
|
||||
/// <summary>
/// Snapshot of a function in one specific library version.
/// </summary>
/// <param name="Version">Library version string.</param>
/// <param name="ReleaseDate">Upstream release date, if known.</param>
/// <param name="SizeBytes">Function size in bytes in this version.</param>
/// <param name="FingerprintHex">Hex-encoded fingerprint of this version.</param>
/// <param name="SimilarityToPrevious">Similarity to the previous version's function, if computed.</param>
/// <param name="CveIds">CVEs associated with this version of the function, if any.</param>
public sealed record FunctionVersionInfo(
    string Version,
    DateOnly? ReleaseDate,
    int SizeBytes,
    string FingerprintHex,
    decimal? SimilarityToPrevious,
    ImmutableArray<string>? CveIds);
|
||||
@@ -0,0 +1,464 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Threading.Channels;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Services;
|
||||
|
||||
/// <summary>
/// Service for batch generation of function fingerprints.
/// Reads functions from the corpus repository, fingerprints them with the
/// configured algorithms in parallel, and persists results in batches.
/// <see cref="StreamProgressAsync"/> uses a bounded-channel producer/consumer
/// pattern so callers can observe progress incrementally.
/// </summary>
public sealed class BatchFingerprintPipeline : IBatchFingerprintPipeline
{
    private readonly ICorpusRepository _repository;
    private readonly IFingerprintGeneratorFactory _generatorFactory;
    private readonly ILogger<BatchFingerprintPipeline> _logger;

    public BatchFingerprintPipeline(
        ICorpusRepository repository,
        IFingerprintGeneratorFactory generatorFactory,
        ILogger<BatchFingerprintPipeline> logger)
    {
        _repository = repository;
        _generatorFactory = generatorFactory;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<BatchFingerprintResult> GenerateFingerprintsAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new BatchFingerprintOptions();

        _logger.LogInformation(
            "Starting batch fingerprint generation for variant {VariantId}",
            buildVariantId);

        // Get all functions for this variant.
        var functions = await _repository.GetFunctionsForVariantAsync(buildVariantId, ct);

        if (functions.Length == 0)
        {
            _logger.LogWarning("No functions found for variant {VariantId}", buildVariantId);
            return new BatchFingerprintResult(
                buildVariantId,
                0,
                0,
                TimeSpan.Zero,
                [],
                []);
        }

        return await GenerateFingerprintsForFunctionsAsync(
            functions,
            buildVariantId,
            opts,
            ct);
    }

    /// <inheritdoc />
    public async Task<BatchFingerprintResult> GenerateFingerprintsForLibraryAsync(
        string libraryName,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new BatchFingerprintOptions();

        _logger.LogInformation(
            "Starting batch fingerprint generation for library {Library}",
            libraryName);

        var library = await _repository.GetLibraryAsync(libraryName, ct);
        if (library is null)
        {
            _logger.LogWarning("Library {Library} not found", libraryName);
            return new BatchFingerprintResult(
                Guid.Empty,
                0,
                0,
                TimeSpan.Zero,
                ["Library not found"],
                []);
        }

        // Walk every version and every build variant, aggregating results.
        var versions = await _repository.ListVersionsAsync(libraryName, ct);

        var totalFunctions = 0;
        var totalFingerprints = 0;
        var totalDuration = TimeSpan.Zero;
        var allErrors = new List<string>();
        var allWarnings = new List<string>();

        foreach (var version in versions)
        {
            ct.ThrowIfCancellationRequested();

            var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

            foreach (var variant in variants)
            {
                ct.ThrowIfCancellationRequested();

                var result = await GenerateFingerprintsAsync(variant.Id, opts, ct);

                totalFunctions += result.FunctionsProcessed;
                totalFingerprints += result.FingerprintsGenerated;
                totalDuration += result.Duration;
                allErrors.AddRange(result.Errors);
                allWarnings.AddRange(result.Warnings);
            }
        }

        return new BatchFingerprintResult(
            library.Id,
            totalFunctions,
            totalFingerprints,
            totalDuration,
            [.. allErrors],
            [.. allWarnings]);
    }

    /// <inheritdoc />
    public async IAsyncEnumerable<FingerprintProgress> StreamProgressAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
    {
        var opts = options ?? new BatchFingerprintOptions();

        var functions = await _repository.GetFunctionsForVariantAsync(buildVariantId, ct);
        var total = functions.Length;
        var processed = 0;
        var errors = 0;

        var channel = Channel.CreateBounded<FingerprintWorkItem>(new BoundedChannelOptions(opts.BatchSize * 2)
        {
            FullMode = BoundedChannelFullMode.Wait
        });

        // Producer: queue every function. On failure, complete the channel WITH
        // the exception so the consumer's ReadAllAsync faults promptly instead
        // of ending silently and only surfacing the error at the final await.
        var producerTask = Task.Run(async () =>
        {
            try
            {
                foreach (var function in functions)
                {
                    ct.ThrowIfCancellationRequested();
                    await channel.Writer.WriteAsync(new FingerprintWorkItem(function), ct);
                }

                channel.Writer.TryComplete();
            }
            catch (Exception ex)
            {
                channel.Writer.TryComplete(ex);
                throw;
            }
        }, ct);

        // Consumer: accumulate work items into batches and yield progress
        // after each persisted batch.
        var batch = new List<FingerprintWorkItem>();

        await foreach (var item in channel.Reader.ReadAllAsync(ct))
        {
            batch.Add(item);

            if (batch.Count >= opts.BatchSize)
            {
                var batchResult = await ProcessBatchAsync(batch, opts, ct);
                processed += batchResult.Processed;
                errors += batchResult.Errors;
                batch.Clear();

                yield return new FingerprintProgress(
                    processed,
                    total,
                    errors,
                    (double)processed / total);
            }
        }

        // Flush the final partial batch, if any.
        if (batch.Count > 0)
        {
            var batchResult = await ProcessBatchAsync(batch, opts, ct);
            processed += batchResult.Processed;
            errors += batchResult.Errors;

            yield return new FingerprintProgress(
                processed,
                total,
                errors,
                1.0);
        }

        await producerTask;
    }

    #region Private Methods

    /// <summary>
    /// Fingerprint the given functions in batches with bounded parallelism,
    /// persisting each batch's fingerprints before moving to the next.
    /// </summary>
    private async Task<BatchFingerprintResult> GenerateFingerprintsForFunctionsAsync(
        ImmutableArray<CorpusFunction> functions,
        Guid contextId,
        BatchFingerprintOptions options,
        CancellationToken ct)
    {
        var startTime = DateTime.UtcNow;
        var processed = 0;
        var generated = 0;
        var errors = new List<string>();
        var warnings = new List<string>();

        // Split the function list into fixed-size batches.
        var batches = functions
            .Select((f, i) => new { Function = f, Index = i })
            .GroupBy(x => x.Index / options.BatchSize)
            .Select(g => g.Select(x => x.Function).ToList())
            .ToList();

        // One semaphore for the whole run (hoisted out of the loop and
        // disposed deterministically; previously a new undisposed instance
        // was allocated per batch).
        using var semaphore = new SemaphoreSlim(options.ParallelDegree);

        foreach (var batch in batches)
        {
            ct.ThrowIfCancellationRequested();

            var batchFingerprints = new List<CorpusFingerprint>();

            var tasks = batch.Select(async function =>
            {
                await semaphore.WaitAsync(ct);
                try
                {
                    var fingerprints = await GenerateFingerprintsForFunctionAsync(function, options, ct);
                    lock (batchFingerprints)
                    {
                        batchFingerprints.AddRange(fingerprints);
                    }
                    Interlocked.Increment(ref processed);
                }
                catch (Exception ex)
                {
                    lock (errors)
                    {
                        errors.Add($"Function {function.Name}: {ex.Message}");
                    }
                }
                finally
                {
                    semaphore.Release();
                }
            });

            await Task.WhenAll(tasks);

            // Persist this batch in one round trip.
            if (batchFingerprints.Count > 0)
            {
                var insertedCount = await _repository.InsertFingerprintsAsync(batchFingerprints, ct);
                generated += insertedCount;
            }
        }

        var duration = DateTime.UtcNow - startTime;

        _logger.LogInformation(
            "Batch fingerprint generation completed: {Functions} functions, {Fingerprints} fingerprints in {Duration:c}",
            processed,
            generated,
            duration);

        return new BatchFingerprintResult(
            contextId,
            processed,
            generated,
            duration,
            [.. errors],
            [.. warnings]);
    }

    /// <summary>
    /// Generate one fingerprint per requested algorithm for a single function.
    /// Algorithms without a registered generator, or whose generator returns
    /// null, are skipped silently.
    /// </summary>
    private async Task<ImmutableArray<CorpusFingerprint>> GenerateFingerprintsForFunctionAsync(
        CorpusFunction function,
        BatchFingerprintOptions options,
        CancellationToken ct)
    {
        var fingerprints = new List<CorpusFingerprint>();

        foreach (var algorithm in options.Algorithms)
        {
            ct.ThrowIfCancellationRequested();

            var generator = _generatorFactory.GetGenerator(algorithm);
            if (generator is null)
            {
                continue;
            }

            var fingerprint = await generator.GenerateAsync(function, ct);
            if (fingerprint is not null)
            {
                fingerprints.Add(new CorpusFingerprint(
                    Guid.NewGuid(),
                    function.Id,
                    algorithm,
                    fingerprint.Hash,
                    Convert.ToHexStringLower(fingerprint.Hash),
                    fingerprint.Metadata,
                    DateTimeOffset.UtcNow));
            }
        }

        return [.. fingerprints];
    }

    /// <summary>
    /// Fingerprint one batch of work items with bounded parallelism and
    /// persist the results. Per-item failures increment the error count
    /// without aborting the batch.
    /// </summary>
    private async Task<(int Processed, int Errors)> ProcessBatchAsync(
        List<FingerprintWorkItem> batch,
        BatchFingerprintOptions options,
        CancellationToken ct)
    {
        var processed = 0;
        var errors = 0;

        var allFingerprints = new List<CorpusFingerprint>();

        // Dispose the throttle deterministically (was previously leaked).
        using var semaphore = new SemaphoreSlim(options.ParallelDegree);

        var tasks = batch.Select(async item =>
        {
            await semaphore.WaitAsync(ct);
            try
            {
                var fingerprints = await GenerateFingerprintsForFunctionAsync(item.Function, options, ct);
                lock (allFingerprints)
                {
                    allFingerprints.AddRange(fingerprints);
                }
                Interlocked.Increment(ref processed);
            }
            catch
            {
                Interlocked.Increment(ref errors);
            }
            finally
            {
                semaphore.Release();
            }
        });

        await Task.WhenAll(tasks);

        if (allFingerprints.Count > 0)
        {
            await _repository.InsertFingerprintsAsync(allFingerprints, ct);
        }

        return (processed, errors);
    }

    #endregion

    /// <summary>Internal unit of work: one function awaiting fingerprinting.</summary>
    private sealed record FingerprintWorkItem(CorpusFunction Function);
}
|
||||
|
||||
/// <summary>
/// Batch fingerprint generation over the function corpus.
/// </summary>
public interface IBatchFingerprintPipeline
{
    /// <summary>
    /// Generate fingerprints for every function in a build variant.
    /// </summary>
    Task<BatchFingerprintResult> GenerateFingerprintsAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate fingerprints for every function across all versions and
    /// build variants of a library.
    /// </summary>
    Task<BatchFingerprintResult> GenerateFingerprintsForLibraryAsync(
        string libraryName,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate fingerprints for a build variant, yielding progress updates
    /// as batches complete.
    /// </summary>
    IAsyncEnumerable<FingerprintProgress> StreamProgressAsync(
        Guid buildVariantId,
        BatchFingerprintOptions? options = null,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Tuning knobs for batch fingerprint generation.
/// </summary>
public sealed record BatchFingerprintOptions
{
    /// <summary>Functions processed per batch. Defaults to 100.</summary>
    public int BatchSize { get; init; } = 100;

    /// <summary>Maximum concurrent fingerprinting tasks. Defaults to 4.</summary>
    public int ParallelDegree { get; init; } = 4;

    /// <summary>
    /// Algorithms to generate fingerprints with. Defaults to semantic KSG,
    /// instruction basic-block, and CFG Weisfeiler-Lehman.
    /// </summary>
    public ImmutableArray<FingerprintAlgorithm> Algorithms { get; init; } =
        [FingerprintAlgorithm.SemanticKsg, FingerprintAlgorithm.InstructionBb, FingerprintAlgorithm.CfgWl];
}
|
||||
|
||||
/// <summary>
/// Outcome of a batch fingerprint generation run.
/// </summary>
/// <param name="ContextId">The variant or library the run was scoped to.</param>
/// <param name="FunctionsProcessed">Number of functions processed successfully.</param>
/// <param name="FingerprintsGenerated">Number of fingerprints persisted.</param>
/// <param name="Duration">Total wall-clock duration of the run.</param>
/// <param name="Errors">Per-function error messages, if any.</param>
/// <param name="Warnings">Non-fatal warnings, if any.</param>
public sealed record BatchFingerprintResult(
    Guid ContextId,
    int FunctionsProcessed,
    int FingerprintsGenerated,
    TimeSpan Duration,
    ImmutableArray<string> Errors,
    ImmutableArray<string> Warnings);
|
||||
|
||||
/// <summary>
/// Incremental progress update emitted while generating fingerprints.
/// </summary>
/// <param name="Processed">Functions processed so far.</param>
/// <param name="Total">Total functions in the run.</param>
/// <param name="Errors">Failures so far.</param>
/// <param name="PercentComplete">Fraction complete in [0, 1].</param>
public sealed record FingerprintProgress(
    int Processed,
    int Total,
    int Errors,
    double PercentComplete);
|
||||
|
||||
/// <summary>
/// Resolves fingerprint generators by algorithm.
/// </summary>
public interface IFingerprintGeneratorFactory
{
    /// <summary>
    /// Get the generator for an algorithm, or null when none is registered.
    /// </summary>
    ICorpusFingerprintGenerator? GetGenerator(FingerprintAlgorithm algorithm);
}
|
||||
|
||||
/// <summary>
/// Generates fingerprints for corpus functions using one specific algorithm.
/// </summary>
public interface ICorpusFingerprintGenerator
{
    /// <summary>
    /// Generate a fingerprint for the given function, or null when the
    /// function cannot be fingerprinted by this generator.
    /// </summary>
    Task<GeneratedFingerprint?> GenerateAsync(
        CorpusFunction function,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Output of a fingerprint generator.
/// </summary>
/// <param name="Hash">Raw fingerprint bytes.</param>
/// <param name="Metadata">Algorithm-specific metadata, if any.</param>
public sealed record GeneratedFingerprint(
    byte[] Hash,
    FingerprintMetadata? Metadata);
|
||||
@@ -0,0 +1,466 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Diagnostics;
|
||||
using System.Security.Cryptography;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Service for ingesting library binaries into the function corpus.
|
||||
/// </summary>
|
||||
public sealed class CorpusIngestionService : ICorpusIngestionService
|
||||
{
|
||||
private readonly ICorpusRepository _repository;
|
||||
private readonly IFingerprintGenerator? _fingerprintGenerator;
|
||||
private readonly IFunctionExtractor? _functionExtractor;
|
||||
private readonly ILogger<CorpusIngestionService> _logger;
|
||||
|
||||
public CorpusIngestionService(
|
||||
ICorpusRepository repository,
|
||||
ILogger<CorpusIngestionService> logger,
|
||||
IFingerprintGenerator? fingerprintGenerator = null,
|
||||
IFunctionExtractor? functionExtractor = null)
|
||||
{
|
||||
_repository = repository;
|
||||
_logger = logger;
|
||||
_fingerprintGenerator = fingerprintGenerator;
|
||||
_functionExtractor = functionExtractor;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<IngestionResult> IngestLibraryAsync(
|
||||
LibraryIngestionMetadata metadata,
|
||||
Stream binaryStream,
|
||||
IngestionOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(metadata);
|
||||
ArgumentNullException.ThrowIfNull(binaryStream);
|
||||
|
||||
var opts = options ?? new IngestionOptions();
|
||||
var stopwatch = Stopwatch.StartNew();
|
||||
var warnings = new List<string>();
|
||||
var errors = new List<string>();
|
||||
|
||||
_logger.LogInformation(
|
||||
"Starting ingestion for {Library} {Version} ({Architecture})",
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture);
|
||||
|
||||
// Compute binary hash
|
||||
var binarySha256 = await ComputeSha256Async(binaryStream, ct);
|
||||
binaryStream.Position = 0; // Reset for reading
|
||||
|
||||
// Check if we've already indexed this exact binary
|
||||
var existingVariant = await _repository.GetBuildVariantBySha256Async(binarySha256, ct);
|
||||
if (existingVariant is not null)
|
||||
{
|
||||
_logger.LogInformation(
|
||||
"Binary {Sha256} already indexed as variant {VariantId}",
|
||||
binarySha256[..16],
|
||||
existingVariant.Id);
|
||||
|
||||
stopwatch.Stop();
|
||||
return new IngestionResult(
|
||||
Guid.Empty,
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
stopwatch.Elapsed,
|
||||
["Binary already indexed."],
|
||||
[]);
|
||||
}
|
||||
|
||||
// Create or get library record
|
||||
var library = await _repository.GetOrCreateLibraryAsync(
|
||||
metadata.Name,
|
||||
null,
|
||||
null,
|
||||
null,
|
||||
ct);
|
||||
|
||||
// Create ingestion job
|
||||
var job = await _repository.CreateIngestionJobAsync(
|
||||
library.Id,
|
||||
IngestionJobType.FullIngest,
|
||||
ct);
|
||||
|
||||
try
|
||||
{
|
||||
await _repository.UpdateIngestionJobAsync(
|
||||
job.Id,
|
||||
IngestionJobStatus.Running,
|
||||
ct: ct);
|
||||
|
||||
// Create or get version record
|
||||
var version = await _repository.GetOrCreateVersionAsync(
|
||||
library.Id,
|
||||
metadata.Version,
|
||||
metadata.ReleaseDate,
|
||||
metadata.IsSecurityRelease,
|
||||
metadata.SourceArchiveSha256,
|
||||
ct);
|
||||
|
||||
// Create build variant record
|
||||
var variant = await _repository.GetOrCreateBuildVariantAsync(
|
||||
version.Id,
|
||||
metadata.Architecture,
|
||||
binarySha256,
|
||||
metadata.Abi,
|
||||
metadata.Compiler,
|
||||
metadata.CompilerVersion,
|
||||
metadata.OptimizationLevel,
|
||||
null,
|
||||
ct);
|
||||
|
||||
// Extract functions from binary
|
||||
var functions = await ExtractFunctionsAsync(binaryStream, variant.Id, opts, warnings, ct);
|
||||
|
||||
// Filter functions based on options
|
||||
functions = ApplyFunctionFilters(functions, opts);
|
||||
|
||||
// Insert functions into database
|
||||
var insertedCount = await _repository.InsertFunctionsAsync(functions, ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Extracted and inserted {Count} functions from {Library} {Version}",
|
||||
insertedCount,
|
||||
metadata.Name,
|
||||
metadata.Version);
|
||||
|
||||
// Generate fingerprints for each function
|
||||
var fingerprintsGenerated = 0;
|
||||
if (_fingerprintGenerator is not null)
|
||||
{
|
||||
fingerprintsGenerated = await GenerateFingerprintsAsync(functions, opts, ct);
|
||||
}
|
||||
|
||||
// Generate clusters if enabled
|
||||
var clustersCreated = 0;
|
||||
if (opts.GenerateClusters)
|
||||
{
|
||||
clustersCreated = await GenerateClustersAsync(library.Id, functions, ct);
|
||||
}
|
||||
|
||||
// Update job with success
|
||||
await _repository.UpdateIngestionJobAsync(
|
||||
job.Id,
|
||||
IngestionJobStatus.Completed,
|
||||
functionsIndexed: insertedCount,
|
||||
fingerprintsGenerated: fingerprintsGenerated,
|
||||
clustersCreated: clustersCreated,
|
||||
ct: ct);
|
||||
|
||||
stopwatch.Stop();
|
||||
return new IngestionResult(
|
||||
job.Id,
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture,
|
||||
insertedCount,
|
||||
fingerprintsGenerated,
|
||||
clustersCreated,
|
||||
stopwatch.Elapsed,
|
||||
[],
|
||||
[.. warnings]);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex,
|
||||
"Ingestion failed for {Library} {Version}",
|
||||
metadata.Name,
|
||||
metadata.Version);
|
||||
|
||||
await _repository.UpdateIngestionJobAsync(
|
||||
job.Id,
|
||||
IngestionJobStatus.Failed,
|
||||
errors: [ex.Message],
|
||||
ct: ct);
|
||||
|
||||
stopwatch.Stop();
|
||||
return new IngestionResult(
|
||||
job.Id,
|
||||
metadata.Name,
|
||||
metadata.Version,
|
||||
metadata.Architecture,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
stopwatch.Elapsed,
|
||||
[ex.Message],
|
||||
[.. warnings]);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
public async IAsyncEnumerable<IngestionResult> IngestFromConnectorAsync(
    string libraryName,
    ILibraryCorpusConnector connector,
    IngestionOptions? options = null,
    [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken ct = default)
{
    ArgumentException.ThrowIfNullOrEmpty(libraryName);
    ArgumentNullException.ThrowIfNull(connector);

    var effectiveOptions = options ?? new IngestionOptions();

    _logger.LogInformation(
        "Starting bulk ingestion from {Connector} for library {Library}",
        connector.LibraryName,
        libraryName);

    var versions = await connector.GetAvailableVersionsAsync(ct);

    _logger.LogInformation(
        "Found {Count} versions for {Library}",
        versions.Length,
        libraryName);

    var fetchOptions = new LibraryFetchOptions
    {
        IncludeDebugSymbols = true
    };

    // Ingest every fetched binary for each architecture the connector supports,
    // streaming results back to the caller one ingestion at a time.
    foreach (var architecture in connector.SupportedArchitectures)
    {
        await foreach (var binary in connector.FetchBinariesAsync(
            [.. versions],
            architecture,
            fetchOptions,
            ct))
        {
            ct.ThrowIfCancellationRequested();

            // Dispose each fetched binary as soon as its ingestion completes.
            using (binary)
            {
                var metadata = new LibraryIngestionMetadata(
                    libraryName,
                    binary.Version,
                    binary.Architecture,
                    binary.Abi,
                    binary.Compiler,
                    binary.CompilerVersion,
                    binary.OptimizationLevel,
                    binary.ReleaseDate,
                    false,
                    null);

                yield return await IngestLibraryAsync(metadata, binary.BinaryStream, effectiveOptions, ct);
            }
        }
    }
}
|
||||
|
||||
/// <inheritdoc />
public async Task<int> UpdateCveAssociationsAsync(
    string cveId,
    IReadOnlyList<FunctionCveAssociation> associations,
    CancellationToken ct = default)
{
    ArgumentException.ThrowIfNullOrEmpty(cveId);
    ArgumentNullException.ThrowIfNull(associations);

    if (associations.Count == 0)
    {
        return 0;
    }

    _logger.LogInformation(
        "Updating CVE associations for {CveId} ({Count} functions)",
        cveId,
        associations.Count);

    // Project each association onto the persistence record shape.
    var cveRecords = new List<FunctionCve>(associations.Count);
    foreach (var association in associations)
    {
        cveRecords.Add(new FunctionCve(
            association.FunctionId,
            cveId,
            association.AffectedState,
            association.PatchCommit,
            association.Confidence,
            association.EvidenceType));
    }

    return await _repository.UpsertCveAssociationsAsync(cveId, cveRecords, ct);
}
|
||||
|
||||
/// <inheritdoc />
public async Task<IngestionJob?> GetJobStatusAsync(Guid jobId, CancellationToken ct = default) =>
    await _repository.GetIngestionJobAsync(jobId, ct);
|
||||
|
||||
#region Private Methods
|
||||
|
||||
/// <summary>
/// Extracts functions from the binary stream and attaches them to the given build
/// variant, assigning a fresh ID to each. Returns empty (with a warning recorded)
/// when no extractor is configured.
/// </summary>
private async Task<ImmutableArray<CorpusFunction>> ExtractFunctionsAsync(
    Stream binaryStream,
    Guid buildVariantId,
    IngestionOptions options,
    List<string> warnings,
    CancellationToken ct)
{
    if (_functionExtractor is null)
    {
        warnings.Add("No function extractor configured, returning empty function list");
        _logger.LogWarning("No function extractor configured");
        return [];
    }

    var extracted = await _functionExtractor.ExtractFunctionsAsync(binaryStream, ct);

    // Convert each extracted function into a corpus record for this variant.
    var builder = ImmutableArray.CreateBuilder<CorpusFunction>(extracted.Length);
    foreach (var fn in extracted)
    {
        builder.Add(new CorpusFunction(
            Guid.NewGuid(),
            buildVariantId,
            fn.Name,
            fn.DemangledName,
            fn.Address,
            fn.SizeBytes,
            fn.IsExported,
            fn.IsInline,
            fn.SourceFile,
            fn.SourceLine));
    }

    return builder.MoveToImmutable();
}
|
||||
|
||||
/// <summary>
/// Filters functions by minimum size and (optionally) export status, capping the
/// result at the configured per-binary maximum.
/// </summary>
private static ImmutableArray<CorpusFunction> ApplyFunctionFilters(
    ImmutableArray<CorpusFunction> functions,
    IngestionOptions options)
{
    var kept = ImmutableArray.CreateBuilder<CorpusFunction>();

    foreach (var function in functions)
    {
        // Stop once the cap is reached (mirrors Take's stop-after-n semantics).
        if (kept.Count >= options.MaxFunctionsPerBinary)
        {
            break;
        }

        if (function.SizeBytes < options.MinFunctionSize)
        {
            continue;
        }

        if (options.ExportedOnly && !function.IsExported)
        {
            continue;
        }

        kept.Add(function);
    }

    return kept.ToImmutable();
}
|
||||
|
||||
/// <summary>
/// Generates fingerprints for every function (bounded parallelism per
/// <see cref="IngestionOptions.ParallelDegree"/>) and persists them in one batch.
/// </summary>
/// <returns>Number of fingerprints inserted; 0 when no generator is configured.</returns>
private async Task<int> GenerateFingerprintsAsync(
    ImmutableArray<CorpusFunction> functions,
    IngestionOptions options,
    CancellationToken ct)
{
    if (_fingerprintGenerator is null)
    {
        return 0;
    }

    var allFingerprints = new List<CorpusFingerprint>();

    // Process in parallel with degree limit.
    // Fix: SemaphoreSlim is IDisposable and was previously leaked.
    using var semaphore = new SemaphoreSlim(options.ParallelDegree);

    var tasks = functions.Select(async function =>
    {
        await semaphore.WaitAsync(ct);
        try
        {
            var fingerprints = await _fingerprintGenerator.GenerateFingerprintsAsync(function.Id, ct);

            // The list is shared across worker tasks; guard mutation.
            lock (allFingerprints)
            {
                allFingerprints.AddRange(fingerprints);
            }
        }
        finally
        {
            semaphore.Release();
        }
    });

    await Task.WhenAll(tasks);

    if (allFingerprints.Count > 0)
    {
        return await _repository.InsertFingerprintsAsync(allFingerprints, ct);
    }

    return 0;
}
|
||||
|
||||
/// <summary>
/// Simple clustering: functions sharing a display name (demangled when present)
/// form a cluster; names occurring only once are skipped.
/// </summary>
/// <returns>The number of clusters created.</returns>
private async Task<int> GenerateClustersAsync(
    Guid libraryId,
    ImmutableArray<CorpusFunction> functions,
    CancellationToken ct)
{
    // Bucket functions by display name, preserving first-seen order.
    var buckets = new Dictionary<string, List<CorpusFunction>>();
    foreach (var function in functions)
    {
        var key = function.DemangledName ?? function.Name;
        if (!buckets.TryGetValue(key, out var bucket))
        {
            bucket = [];
            buckets[key] = bucket;
        }

        bucket.Add(function);
    }

    var clustersCreated = 0;

    foreach (var (name, bucket) in buckets)
    {
        // Only names appearing more than once become clusters.
        if (bucket.Count < 2)
        {
            continue;
        }

        ct.ThrowIfCancellationRequested();

        var cluster = await _repository.GetOrCreateClusterAsync(
            libraryId,
            name,
            null,
            ct);

        var members = bucket
            .Select(f => new ClusterMember(cluster.Id, f.Id, 1.0m))
            .ToList();

        await _repository.AddClusterMembersAsync(cluster.Id, members, ct);
        clustersCreated++;
    }

    return clustersCreated;
}
|
||||
|
||||
/// <summary>
/// Computes the SHA-256 of the entire stream (from its current position) and
/// returns it as a lowercase hex string. The stream position is NOT reset here;
/// callers rewind it themselves.
/// </summary>
private static async Task<string> ComputeSha256Async(Stream stream, CancellationToken ct)
{
    // Static one-shot API avoids allocating and disposing a hasher instance
    // (.NET 7+; the file already relies on newer APIs such as Convert.ToHexStringLower).
    var hash = await SHA256.HashDataAsync(stream, ct);
    return Convert.ToHexStringLower(hash);
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
/// Interface for extracting functions from binary files.
/// </summary>
public interface IFunctionExtractor
{
    /// <summary>
    /// Extract functions from a binary stream.
    /// </summary>
    /// <param name="binaryStream">Readable stream over the binary image. Callers in this
    /// file rewind the stream to position 0 before invoking this method.</param>
    /// <param name="ct">Token used to cancel the extraction.</param>
    /// <returns>All functions discovered in the binary; empty when none are found.</returns>
    Task<ImmutableArray<ExtractedFunction>> ExtractFunctionsAsync(
        Stream binaryStream,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Interface for generating function fingerprints.
/// </summary>
public interface IFingerprintGenerator
{
    /// <summary>
    /// Generate fingerprints for a function.
    /// </summary>
    /// <param name="functionId">ID of an already-persisted corpus function.</param>
    /// <param name="ct">Token used to cancel generation.</param>
    /// <returns>Zero or more fingerprints for the function (one per algorithm,
    /// presumably — TODO confirm against implementations).</returns>
    Task<ImmutableArray<CorpusFingerprint>> GenerateFingerprintsAsync(
        Guid functionId,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// A function extracted from a binary.
/// </summary>
/// <param name="Name">Raw (possibly mangled) symbol name.</param>
/// <param name="DemangledName">Demangled name when available; otherwise null.</param>
/// <param name="Address">Function start address within the binary image.</param>
/// <param name="SizeBytes">Function size in bytes.</param>
/// <param name="IsExported">True when the symbol is exported from the binary.</param>
/// <param name="IsInline">True when the function was inlined (debug-info derived,
/// presumably — confirm against the extractor).</param>
/// <param name="SourceFile">Source file path when debug info is available.</param>
/// <param name="SourceLine">Source line number when debug info is available.</param>
public sealed record ExtractedFunction(
    string Name,
    string? DemangledName,
    ulong Address,
    int SizeBytes,
    bool IsExported,
    bool IsInline,
    string? SourceFile,
    int? SourceLine);
|
||||
@@ -0,0 +1,419 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Service for querying the function corpus to identify functions.
|
||||
/// </summary>
|
||||
public sealed class CorpusQueryService : ICorpusQueryService
|
||||
{
|
||||
private readonly ICorpusRepository _repository;
// NOTE(review): not referenced by any method visible in this file — presumably
// reserved for cluster-level similarity queries; confirm before removing.
private readonly IClusterSimilarityComputer _similarityComputer;
private readonly ILogger<CorpusQueryService> _logger;

/// <summary>
/// Creates the corpus query service.
/// </summary>
/// <param name="repository">Corpus persistence layer used for all lookups.</param>
/// <param name="similarityComputer">Cluster similarity computer.</param>
/// <param name="logger">Diagnostic logger.</param>
/// <exception cref="ArgumentNullException">Any dependency is null.</exception>
public CorpusQueryService(
    ICorpusRepository repository,
    IClusterSimilarityComputer similarityComputer,
    ILogger<CorpusQueryService> logger)
{
    // Fail fast on mis-wired DI (fix: dependencies were previously unvalidated,
    // unlike the ThrowIfNull style used elsewhere in this codebase).
    ArgumentNullException.ThrowIfNull(repository);
    ArgumentNullException.ThrowIfNull(similarityComputer);
    ArgumentNullException.ThrowIfNull(logger);

    _repository = repository;
    _similarityComputer = similarityComputer;
    _logger = logger;
}
|
||||
|
||||
/// <inheritdoc />
public async Task<ImmutableArray<FunctionMatch>> IdentifyFunctionAsync(
    FunctionFingerprints fingerprints,
    IdentifyOptions? options = null,
    CancellationToken ct = default)
{
    var opts = options ?? new IdentifyOptions();

    _logger.LogDebug("Identifying function with fingerprints");

    // One entry per (function, algorithm) hit; a function matched by several
    // algorithms appears several times and is merged below.
    var candidates = new List<FunctionCandidate>();

    // Search by each available fingerprint type (absent/empty hashes are skipped).
    if (fingerprints.SemanticHash is { Length: > 0 })
    {
        var matches = await SearchByFingerprintAsync(
            FingerprintAlgorithm.SemanticKsg,
            fingerprints.SemanticHash,
            opts,
            ct);
        candidates.AddRange(matches);
    }

    if (fingerprints.InstructionHash is { Length: > 0 })
    {
        var matches = await SearchByFingerprintAsync(
            FingerprintAlgorithm.InstructionBb,
            fingerprints.InstructionHash,
            opts,
            ct);
        candidates.AddRange(matches);
    }

    if (fingerprints.CfgHash is { Length: > 0 })
    {
        var matches = await SearchByFingerprintAsync(
            FingerprintAlgorithm.CfgWl,
            fingerprints.CfgHash,
            opts,
            ct);
        candidates.AddRange(matches);
    }

    // Group candidates by function and compute combined (weighted) similarity,
    // then threshold, rank, and cap before the expensive enrichment loop.
    var groupedCandidates = candidates
        .GroupBy(c => c.FunctionId)
        .Select(g => ComputeCombinedScore(g, fingerprints, opts.Weights))
        .Where(c => c.Similarity >= opts.MinSimilarity)
        .OrderByDescending(c => c.Similarity)
        .Take(opts.MaxResults)
        .ToList();

    // Enrich with full function details. Note: architecture/library filters are
    // applied AFTER Take(MaxResults), so fewer than MaxResults may be returned
    // even when more filtered matches exist — TODO confirm this is intended.
    var results = new List<FunctionMatch>();

    foreach (var candidate in groupedCandidates)
    {
        ct.ThrowIfCancellationRequested();

        // Get the original per-algorithm candidates for this function (used for
        // the per-algorithm similarity breakdown in MatchDetails).
        var functionCandidates = candidates.Where(c => c.FunctionId == candidate.FunctionId).ToList();

        // Walk function -> variant -> version -> library; a missing link means
        // the record was removed from the corpus, so the candidate is dropped.
        var function = await _repository.GetFunctionAsync(candidate.FunctionId, ct);
        if (function is null) continue;

        var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
        if (variant is null) continue;

        // Apply filters
        if (opts.ArchitectureFilter is { Length: > 0 })
        {
            if (!opts.ArchitectureFilter.Value.Contains(variant.Architecture, StringComparer.OrdinalIgnoreCase))
                continue;
        }

        var version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
        if (version is null) continue;

        var library = await _repository.GetLibraryByIdAsync(version.LibraryId, ct);
        if (library is null) continue;

        // Apply library filter
        if (opts.LibraryFilter is { Length: > 0 })
        {
            if (!opts.LibraryFilter.Value.Contains(library.Name, StringComparer.OrdinalIgnoreCase))
                continue;
        }

        results.Add(new FunctionMatch(
            library.Name,
            version.Version,
            function.Name,
            function.DemangledName,
            candidate.Similarity,
            ComputeConfidence(candidate),
            variant.Architecture,
            variant.Abi,
            new MatchDetails(
                GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.SemanticKsg),
                GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.InstructionBb),
                GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.CfgWl),
                GetAlgorithmSimilarity(functionCandidates, FingerprintAlgorithm.ApiCalls),
                [],
                // Size delta: corpus function size minus query size (signed).
                fingerprints.SizeBytes.HasValue
                    ? function.SizeBytes - fingerprints.SizeBytes.Value
                    : 0)));
    }

    return [.. results];
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Identifies each fingerprint set independently and returns results keyed by the
/// input index. Runs at most 4 identifications concurrently.
/// </remarks>
public async Task<ImmutableDictionary<int, ImmutableArray<FunctionMatch>>> IdentifyBatchAsync(
    IReadOnlyList<FunctionFingerprints> fingerprints,
    IdentifyOptions? options = null,
    CancellationToken ct = default)
{
    var results = ImmutableDictionary.CreateBuilder<int, ImmutableArray<FunctionMatch>>();

    // Process in parallel with controlled concurrency.
    // Fix: SemaphoreSlim is IDisposable and was previously leaked.
    using var semaphore = new SemaphoreSlim(4);
    var tasks = fingerprints.Select(async (fp, index) =>
    {
        await semaphore.WaitAsync(ct);
        try
        {
            var matches = await IdentifyFunctionAsync(fp, options, ct);
            return (Index: index, Matches: matches);
        }
        finally
        {
            semaphore.Release();
        }
    });

    var completedResults = await Task.WhenAll(tasks);

    foreach (var result in completedResults)
    {
        results.Add(result.Index, result.Matches);
    }

    return results.ToImmutable();
}
|
||||
|
||||
/// <inheritdoc />
public async Task<ImmutableArray<CorpusFunctionWithCve>> GetFunctionsForCveAsync(
    string cveId,
    CancellationToken ct = default)
{
    _logger.LogDebug("Getting functions for CVE {CveId}", cveId);

    var functionIds = await _repository.GetFunctionIdsForCveAsync(cveId, ct);
    var matches = new List<CorpusFunctionWithCve>();

    foreach (var functionId in functionIds)
    {
        ct.ThrowIfCancellationRequested();

        // Resolve the full function -> variant -> version -> library chain; skip
        // the entry if any link has been removed from the corpus.
        var function = await _repository.GetFunctionAsync(functionId, ct);
        if (function is null)
        {
            continue;
        }

        var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
        if (variant is null)
        {
            continue;
        }

        var version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
        if (version is null)
        {
            continue;
        }

        var library = await _repository.GetLibraryByIdAsync(version.LibraryId, ct);
        if (library is null)
        {
            continue;
        }

        // Keep only the association record for the requested CVE.
        var cves = await _repository.GetCvesForFunctionAsync(functionId, ct);
        var cveInfo = cves.FirstOrDefault(c => c.CveId == cveId);
        if (cveInfo is null)
        {
            continue;
        }

        matches.Add(new CorpusFunctionWithCve(function, library, version, variant, cveInfo));
    }

    return [.. matches];
}
|
||||
|
||||
/// <inheritdoc />
public async Task<FunctionEvolution?> GetFunctionEvolutionAsync(
    string libraryName,
    string functionName,
    CancellationToken ct = default)
{
    _logger.LogDebug("Getting evolution for function {Function} in {Library}", functionName, libraryName);

    var library = await _repository.GetLibraryAsync(libraryName, ct);
    if (library is null)
    {
        return null;
    }

    var versions = await _repository.ListVersionsAsync(libraryName, ct);
    var snapshots = new List<FunctionVersionInfo>();
    // Fingerprint of the previously processed version, used to compute the
    // version-to-version similarity below.
    string? previousFingerprintHex = null;

    // Walk versions in release order so "previous" means chronologically earlier.
    foreach (var versionSummary in versions.OrderBy(v => v.ReleaseDate))
    {
        ct.ThrowIfCancellationRequested();

        var version = await _repository.GetVersionAsync(library.Id, versionSummary.Version, ct);
        if (version is null) continue;

        var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

        // Find the function in any variant — the first variant containing it wins.
        CorpusFunction? targetFunction = null;
        CorpusFingerprint? fingerprint = null;

        foreach (var variant in variants)
        {
            var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);
            targetFunction = functions.FirstOrDefault(f =>
                string.Equals(f.Name, functionName, StringComparison.Ordinal) ||
                string.Equals(f.DemangledName, functionName, StringComparison.Ordinal));

            if (targetFunction is not null)
            {
                // Only the semantic (KSG) fingerprint participates in evolution tracking.
                var fps = await _repository.GetFingerprintsAsync(targetFunction.Id, ct);
                fingerprint = fps.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);
                break;
            }
        }

        // Function absent from this version: no snapshot. Note previousFingerprintHex
        // is NOT cleared, so similarity is computed across the gap.
        if (targetFunction is null)
        {
            continue;
        }

        // Get CVE info for this version
        var cves = await _repository.GetCvesForFunctionAsync(targetFunction.Id, ct);
        var cveIds = cves.Select(c => c.CveId).ToImmutableArray();

        // Compute similarity to previous version if available
        decimal? similarityToPrevious = null;
        var currentFingerprintHex = fingerprint?.FingerprintHex ?? string.Empty;
        if (previousFingerprintHex is not null && currentFingerprintHex.Length > 0)
        {
            // Simple comparison: same hash = 1.0, different = 0.5 (would need proper similarity for better results)
            similarityToPrevious = string.Equals(previousFingerprintHex, currentFingerprintHex, StringComparison.Ordinal)
                ? 1.0m
                : 0.5m;
        }
        previousFingerprintHex = currentFingerprintHex;

        snapshots.Add(new FunctionVersionInfo(
            versionSummary.Version,
            versionSummary.ReleaseDate,
            targetFunction.SizeBytes,
            currentFingerprintHex,
            similarityToPrevious,
            // Null (not empty) when the function has no CVE associations.
            cveIds.Length > 0 ? cveIds : null));
    }

    // No version ever contained the function: treat as not found.
    if (snapshots.Count == 0)
    {
        return null;
    }

    return new FunctionEvolution(libraryName, functionName, [.. snapshots]);
}
|
||||
|
||||
/// <inheritdoc />
public async Task<CorpusStatistics> GetStatisticsAsync(CancellationToken ct = default) =>
    await _repository.GetStatisticsAsync(ct);
|
||||
|
||||
/// <inheritdoc />
public async Task<ImmutableArray<LibrarySummary>> ListLibrariesAsync(CancellationToken ct = default) =>
    await _repository.ListLibrariesAsync(ct);
|
||||
|
||||
/// <inheritdoc />
public async Task<ImmutableArray<LibraryVersionSummary>> ListVersionsAsync(
    string libraryName,
    CancellationToken ct = default) =>
    await _repository.ListVersionsAsync(libraryName, ct);
|
||||
|
||||
#region Private Methods
|
||||
|
||||
/// <summary>
/// Finds candidate functions for one fingerprint: exact hash matches first
/// (similarity 1.0), then approximate matches, de-duplicated by function ID
/// so an exact hit is never shadowed by its own approximate entry.
/// </summary>
private async Task<List<FunctionCandidate>> SearchByFingerprintAsync(
    FingerprintAlgorithm algorithm,
    byte[] fingerprint,
    IdentifyOptions options,
    CancellationToken ct)
{
    var candidates = new List<FunctionCandidate>();

    // First try exact match
    var exactMatches = await _repository.FindFunctionsByFingerprintAsync(algorithm, fingerprint, ct);
    foreach (var functionId in exactMatches)
    {
        candidates.Add(new FunctionCandidate(functionId, algorithm, 1.0m, fingerprint));
    }

    // Then try approximate matching
    var similarResults = await _repository.FindSimilarFingerprintsAsync(
        algorithm,
        fingerprint,
        options.MaxResults * 2, // Get more to account for filtering
        ct);

    // Fix: de-duplicate via a HashSet instead of List.Any — the previous form
    // was O(n^2) over the candidate list.
    var seen = new HashSet<Guid>(candidates.Select(c => c.FunctionId));
    foreach (var result in similarResults)
    {
        if (seen.Add(result.FunctionId))
        {
            candidates.Add(new FunctionCandidate(
                result.FunctionId,
                algorithm,
                result.Similarity,
                result.Fingerprint));
        }
    }

    return candidates;
}
|
||||
|
||||
/// <summary>
/// Merges all per-algorithm hits for one function into a single weighted-average
/// similarity. Algorithms without a configured weight contribute a small residual
/// weight (0.1) so they still count.
/// </summary>
private static CombinedCandidate ComputeCombinedScore(
    IGrouping<Guid, FunctionCandidate> group,
    FunctionFingerprints query,
    SimilarityWeights weights)
{
    decimal weightedSum = 0;
    decimal weightSum = 0;
    var matchedAlgorithms = new List<FingerprintAlgorithm>();

    foreach (var candidate in group)
    {
        decimal weight;
        switch (candidate.Algorithm)
        {
            case FingerprintAlgorithm.SemanticKsg:
                weight = weights.SemanticWeight;
                break;
            case FingerprintAlgorithm.InstructionBb:
                weight = weights.InstructionWeight;
                break;
            case FingerprintAlgorithm.CfgWl:
                weight = weights.CfgWeight;
                break;
            case FingerprintAlgorithm.ApiCalls:
                weight = weights.ApiCallWeight;
                break;
            default:
                weight = 0.1m;
                break;
        }

        weightedSum += candidate.Similarity * weight;
        weightSum += weight;
        matchedAlgorithms.Add(candidate.Algorithm);
    }

    var combined = weightSum > 0 ? weightedSum / weightSum : 0;

    return new CombinedCandidate(group.Key, combined, [.. matchedAlgorithms]);
}
|
||||
|
||||
/// <summary>
/// Maps a combined candidate to a confidence tier: confidence rises with both the
/// number of corroborating algorithms and the combined similarity score.
/// </summary>
private static MatchConfidence ComputeConfidence(CombinedCandidate candidate)
{
    var matched = candidate.MatchingAlgorithms.Length;
    var score = candidate.Similarity;

    if (matched >= 3)
    {
        if (score >= 0.95m)
        {
            return MatchConfidence.Exact;
        }

        if (score >= 0.85m)
        {
            return MatchConfidence.VeryHigh;
        }
    }

    if (matched >= 2 && score >= 0.85m)
    {
        return MatchConfidence.High;
    }

    return matched >= 1 && score >= 0.70m
        ? MatchConfidence.Medium
        : MatchConfidence.Low;
}
|
||||
|
||||
/// <summary>
/// Similarity reported by the first candidate for the given algorithm, or 0 when
/// that algorithm produced no candidate for this function.
/// </summary>
private static decimal GetAlgorithmSimilarity(
    List<FunctionCandidate> candidates,
    FingerprintAlgorithm algorithm)
{
    foreach (var candidate in candidates)
    {
        if (candidate.Algorithm == algorithm)
        {
            return candidate.Similarity;
        }
    }

    return 0m;
}
|
||||
|
||||
#endregion
|
||||
|
||||
/// <summary>
/// One fingerprint hit for a corpus function produced by a single algorithm
/// (exact matches carry similarity 1.0).
/// </summary>
private sealed record FunctionCandidate(
    Guid FunctionId,
    FingerprintAlgorithm Algorithm,
    decimal Similarity,
    byte[] Fingerprint);

/// <summary>
/// Per-function result after merging all algorithm hits into one weighted score,
/// retaining which algorithms contributed.
/// </summary>
private sealed record CombinedCandidate(
    Guid FunctionId,
    decimal Similarity,
    ImmutableArray<FingerprintAlgorithm> MatchingAlgorithms);
|
||||
}
|
||||
@@ -0,0 +1,423 @@
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Services;
|
||||
|
||||
/// <summary>
|
||||
/// Service for updating CVE-to-function mappings in the corpus.
|
||||
/// </summary>
|
||||
public sealed class CveFunctionMappingUpdater : ICveFunctionMappingUpdater
|
||||
{
|
||||
private readonly ICorpusRepository _repository;
private readonly ICveDataProvider _cveDataProvider;
private readonly ILogger<CveFunctionMappingUpdater> _logger;

/// <summary>
/// Creates the CVE-to-function mapping updater.
/// </summary>
/// <param name="repository">Corpus persistence layer.</param>
/// <param name="cveDataProvider">Source of CVE details and per-library CVE lists.</param>
/// <param name="logger">Diagnostic logger.</param>
/// <exception cref="ArgumentNullException">Any dependency is null.</exception>
public CveFunctionMappingUpdater(
    ICorpusRepository repository,
    ICveDataProvider cveDataProvider,
    ILogger<CveFunctionMappingUpdater> logger)
{
    // Fail fast on mis-wired DI (fix: dependencies were previously unvalidated,
    // unlike the ThrowIfNull style used elsewhere in this codebase).
    ArgumentNullException.ThrowIfNull(repository);
    ArgumentNullException.ThrowIfNull(cveDataProvider);
    ArgumentNullException.ThrowIfNull(logger);

    _repository = repository;
    _cveDataProvider = cveDataProvider;
    _logger = logger;
}
|
||||
|
||||
/// <inheritdoc />
public async Task<CveMappingUpdateResult> UpdateMappingsForCveAsync(
    string cveId,
    CancellationToken ct = default)
{
    _logger.LogInformation("Updating function mappings for CVE {CveId}", cveId);

    var startTime = DateTime.UtcNow;
    // NOTE(review): errors is only populated in the catch block, so the success
    // path always returns an empty error list — confirm that is intended.
    var errors = new List<string>();
    var functionsUpdated = 0;

    try
    {
        // Get CVE details from provider
        var cveDetails = await _cveDataProvider.GetCveDetailsAsync(cveId, ct);
        if (cveDetails is null)
        {
            return new CveMappingUpdateResult(
                cveId,
                0,
                DateTime.UtcNow - startTime,
                [$"CVE {cveId} not found in data provider"]);
        }

        // Get affected library
        var library = await _repository.GetLibraryAsync(cveDetails.AffectedLibrary, ct);
        if (library is null)
        {
            return new CveMappingUpdateResult(
                cveId,
                0,
                DateTime.UtcNow - startTime,
                [$"Library {cveDetails.AffectedLibrary} not found in corpus"]);
        }

        // Process affected versions: accumulate all associations, then upsert once.
        var associations = new List<FunctionCve>();

        foreach (var affectedVersion in cveDetails.AffectedVersions)
        {
            ct.ThrowIfCancellationRequested();

            // Find matching version in corpus; versions unknown to the corpus
            // are silently skipped (debug-logged only).
            var version = await FindMatchingVersionAsync(library.Id, affectedVersion, ct);
            if (version is null)
            {
                _logger.LogDebug("Version {Version} not found in corpus", affectedVersion);
                continue;
            }

            // Get all build variants for this version
            var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

            foreach (var variant in variants)
            {
                // Get functions in this variant
                var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);

                // If we have specific function names, only map those (matching
                // either the raw or demangled name, ordinal comparison).
                if (cveDetails.AffectedFunctions.Length > 0)
                {
                    var matchedFunctions = functions.Where(f =>
                        cveDetails.AffectedFunctions.Contains(f.Name, StringComparer.Ordinal) ||
                        (f.DemangledName is not null &&
                         cveDetails.AffectedFunctions.Contains(f.DemangledName, StringComparer.Ordinal)));

                    foreach (var function in matchedFunctions)
                    {
                        associations.Add(CreateAssociation(function.Id, cveId, cveDetails, affectedVersion));
                        functionsUpdated++;
                    }
                }
                else
                {
                    // Map all functions in affected variant as potentially affected
                    foreach (var function in functions.Take(100)) // Limit to avoid huge updates
                    {
                        associations.Add(CreateAssociation(function.Id, cveId, cveDetails, affectedVersion));
                        functionsUpdated++;
                    }
                }
            }
        }

        // Upsert all associations
        if (associations.Count > 0)
        {
            await _repository.UpsertCveAssociationsAsync(cveId, associations, ct);
        }

        var duration = DateTime.UtcNow - startTime;
        _logger.LogInformation(
            "Updated {Count} function mappings for CVE {CveId} in {Duration:c}",
            functionsUpdated, cveId, duration);

        return new CveMappingUpdateResult(cveId, functionsUpdated, duration, [.. errors]);
    }
    catch (Exception ex)
    {
        // Failure is reported via the result (with the counted-so-far total)
        // rather than rethrown; cancellation also lands here — TODO confirm
        // OperationCanceledException should not propagate instead.
        errors.Add(ex.Message);
        _logger.LogError(ex, "Error updating mappings for CVE {CveId}", cveId);
        return new CveMappingUpdateResult(cveId, functionsUpdated, DateTime.UtcNow - startTime, [.. errors]);
    }
}
|
||||
|
||||
/// <inheritdoc />
public async Task<CveBatchMappingResult> UpdateMappingsForLibraryAsync(
    string libraryName,
    CancellationToken ct = default)
{
    _logger.LogInformation("Updating all CVE mappings for library {Library}", libraryName);

    var startTime = DateTime.UtcNow;

    // Refresh mappings CVE-by-CVE, sequentially, so repository load stays bounded.
    var cveIds = await _cveDataProvider.GetCvesForLibraryAsync(libraryName, ct);
    var perCveResults = new List<CveMappingUpdateResult>();

    foreach (var cveId in cveIds)
    {
        ct.ThrowIfCancellationRequested();

        perCveResults.Add(await UpdateMappingsForCveAsync(cveId, ct));
    }

    var totalFunctions = perCveResults.Sum(r => r.FunctionsUpdated);
    var allErrors = perCveResults
        .Where(r => r.Errors.Length > 0)
        .SelectMany(r => r.Errors);

    return new CveBatchMappingResult(
        libraryName,
        perCveResults.Count,
        totalFunctions,
        DateTime.UtcNow - startTime,
        [.. allErrors]);
}
|
||||
|
||||
/// <inheritdoc />
public async Task<CveMappingUpdateResult> MarkFunctionFixedAsync(
    string cveId,
    string libraryName,
    string version,
    string? functionName,
    string? patchCommit,
    CancellationToken ct = default)
{
    _logger.LogInformation(
        "Marking functions as fixed for CVE {CveId} in {Library} {Version}",
        cveId, libraryName, version);

    var startTime = DateTime.UtcNow;
    var functionsUpdated = 0;

    var library = await _repository.GetLibraryAsync(libraryName, ct);
    if (library is null)
    {
        return new CveMappingUpdateResult(
            cveId, 0, DateTime.UtcNow - startTime,
            [$"Library {libraryName} not found"]);
    }

    var libVersion = await _repository.GetVersionAsync(library.Id, version, ct);
    if (libVersion is null)
    {
        return new CveMappingUpdateResult(
            cveId, 0, DateTime.UtcNow - startTime,
            [$"Version {version} not found"]);
    }

    var variants = await _repository.GetBuildVariantsAsync(libVersion.Id, ct);
    var associations = new List<FunctionCve>();

    foreach (var variant in variants)
    {
        var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);

        // Null functionName means "mark every function in the variant"; otherwise
        // match the raw or demangled name exactly (ordinal).
        IEnumerable<CorpusFunction> targetFunctions = functionName is null
            ? functions
            : functions.Where(f =>
                string.Equals(f.Name, functionName, StringComparison.Ordinal) ||
                string.Equals(f.DemangledName, functionName, StringComparison.Ordinal));

        foreach (var function in targetFunctions)
        {
            associations.Add(new FunctionCve(
                function.Id,
                cveId,
                CveAffectedState.Fixed,
                patchCommit,
                0.9m, // High confidence for explicit marking
                CveEvidenceType.Commit));
            functionsUpdated++;
        }
    }

    // Single batched upsert for all marked functions.
    if (associations.Count > 0)
    {
        await _repository.UpsertCveAssociationsAsync(cveId, associations, ct);
    }

    return new CveMappingUpdateResult(
        cveId, functionsUpdated, DateTime.UtcNow - startTime, []);
}
|
||||
|
||||
/// <inheritdoc />
public async Task<ImmutableArray<string>> GetUnmappedCvesAsync(
    string libraryName,
    CancellationToken ct = default)
{
    // Every CVE the external data provider knows for this library.
    var knownCves = await _cveDataProvider.GetCvesForLibraryAsync(libraryName, ct);

    // Keep only CVEs that have zero function-level mappings in the corpus.
    var builder = ImmutableArray.CreateBuilder<string>();

    foreach (var candidate in knownCves)
    {
        ct.ThrowIfCancellationRequested();

        var mappedFunctionIds = await _repository.GetFunctionIdsForCveAsync(candidate, ct);
        if (mappedFunctionIds.Length > 0)
        {
            continue;
        }

        builder.Add(candidate);
    }

    return builder.ToImmutable();
}
|
||||
|
||||
#region Private Methods
|
||||
|
||||
/// <summary>
/// Resolves a version string to a stored <see cref="LibraryVersion"/>:
/// first verbatim, then with a normalized form (leading 'v' and release
/// suffixes stripped). Returns null when neither lookup matches.
/// </summary>
private async Task<LibraryVersion?> FindMatchingVersionAsync(
    Guid libraryId,
    string versionString,
    CancellationToken ct)
{
    // Try exact match first
    var exactMatch = await _repository.GetVersionAsync(libraryId, versionString, ct);
    if (exactMatch is not null)
    {
        return exactMatch;
    }

    // Try with common prefixes/suffixes removed
    var normalizedVersion = NormalizeVersion(versionString);
    if (normalizedVersion != versionString)
    {
        // Second (and final) lookup attempt; may itself return null.
        return await _repository.GetVersionAsync(libraryId, normalizedVersion, ct);
    }

    // Normalization changed nothing, so there is no further candidate to try.
    return null;
}
|
||||
|
||||
/// <summary>
/// Normalizes a version string: strips one leading 'v'/'V' prefix and
/// truncates at the first pre-release/build separator ('-', '+', '_').
/// A separator at position 0 is preserved, so "-rc1" stays "-rc1".
/// </summary>
private static string NormalizeVersion(string version)
{
    var result = version;

    // Drop a single leading 'v' or 'V' (e.g. "v1.2.3" -> "1.2.3").
    if (result.StartsWith("v", StringComparison.OrdinalIgnoreCase))
    {
        result = result.Substring(1);
    }

    // Truncate at the first suffix separator unless it starts the string.
    var separatorAt = result.IndexOfAny(['-', '+', '_']);
    return separatorAt > 0 ? result[..separatorAt] : result;
}
|
||||
|
||||
/// <summary>
/// Builds a function-to-CVE association, deriving the affected state from
/// whether <paramref name="version"/> appears in the CVE's fixed-version list.
/// </summary>
private static FunctionCve CreateAssociation(
    Guid functionId,
    string cveId,
    CveDetails cveDetails,
    string version)
{
    // Case-insensitive membership test: version strings from feeds vary in casing.
    var isFixed = cveDetails.FixedVersions.Contains(version, StringComparer.OrdinalIgnoreCase);

    return new FunctionCve(
        functionId,
        cveId,
        isFixed ? CveAffectedState.Fixed : CveAffectedState.Vulnerable,
        cveDetails.PatchCommit,
        ComputeConfidence(cveDetails),
        cveDetails.EvidenceType);
}
|
||||
|
||||
/// <summary>
/// Scores how trustworthy a CVE association is, starting from a 0.5 baseline
/// and rewarding named affected functions, a known patch commit, and stronger
/// evidence types. The maximum reachable score is 1.0.
/// </summary>
private static decimal ComputeConfidence(CveDetails details)
{
    var confidence = 0.5m;

    // Named affected functions pin the vulnerability to specific symbols.
    if (details.AffectedFunctions.Length > 0)
    {
        confidence += 0.2m;
    }

    // A concrete patch commit is strong corroborating evidence.
    if (!string.IsNullOrEmpty(details.PatchCommit))
    {
        confidence += 0.2m;
    }

    // Evidence-type bonus: commits are the most trustworthy source.
    switch (details.EvidenceType)
    {
        case CveEvidenceType.Commit:
            confidence += 0.1m;
            break;
        case CveEvidenceType.Advisory:
        case CveEvidenceType.Changelog:
            confidence += 0.05m;
            break;
    }

    return confidence;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
/// Interface for CVE-to-function mapping updates.
/// Implementations correlate CVE advisory data with corpus functions and
/// persist the resulting associations.
/// </summary>
public interface ICveFunctionMappingUpdater
{
    /// <summary>
    /// Update function mappings for a specific CVE.
    /// </summary>
    /// <param name="cveId">Identifier of the CVE to (re)map.</param>
    Task<CveMappingUpdateResult> UpdateMappingsForCveAsync(
        string cveId,
        CancellationToken ct = default);

    /// <summary>
    /// Update all CVE mappings for a library.
    /// </summary>
    /// <param name="libraryName">Name of the library whose CVEs are processed.</param>
    Task<CveBatchMappingResult> UpdateMappingsForLibraryAsync(
        string libraryName,
        CancellationToken ct = default);

    /// <summary>
    /// Mark functions as fixed for a CVE.
    /// </summary>
    /// <param name="functionName">Specific function to mark, or null to mark all functions of the version.</param>
    /// <param name="patchCommit">Optional commit hash recorded as the fix evidence.</param>
    Task<CveMappingUpdateResult> MarkFunctionFixedAsync(
        string cveId,
        string libraryName,
        string version,
        string? functionName,
        string? patchCommit,
        CancellationToken ct = default);

    /// <summary>
    /// Get CVEs that have no function mappings.
    /// </summary>
    Task<ImmutableArray<string>> GetUnmappedCvesAsync(
        string libraryName,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Provider for CVE data (e.g. an advisory feed or vulnerability database).
/// </summary>
public interface ICveDataProvider
{
    /// <summary>
    /// Get details for a CVE. Returns null when the CVE is unknown to the provider.
    /// </summary>
    Task<CveDetails?> GetCveDetailsAsync(string cveId, CancellationToken ct = default);

    /// <summary>
    /// Get all CVEs affecting a library.
    /// </summary>
    Task<ImmutableArray<string>> GetCvesForLibraryAsync(string libraryName, CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// CVE details from a data provider.
/// </summary>
/// <param name="CveId">The CVE identifier (e.g. "CVE-2024-1234").</param>
/// <param name="AffectedLibrary">Name of the library the CVE applies to.</param>
/// <param name="AffectedVersions">Versions known to be vulnerable.</param>
/// <param name="FixedVersions">Versions where the fix is present.</param>
/// <param name="AffectedFunctions">Function symbol names named by the advisory; may be empty.</param>
/// <param name="PatchCommit">Fixing commit hash if known.</param>
/// <param name="EvidenceType">Kind of evidence backing this record.</param>
public sealed record CveDetails(
    string CveId,
    string AffectedLibrary,
    ImmutableArray<string> AffectedVersions,
    ImmutableArray<string> FixedVersions,
    ImmutableArray<string> AffectedFunctions,
    string? PatchCommit,
    CveEvidenceType EvidenceType);
|
||||
|
||||
/// <summary>
/// Result of a CVE mapping update.
/// </summary>
/// <param name="CveId">The CVE that was processed.</param>
/// <param name="FunctionsUpdated">Number of function associations written.</param>
/// <param name="Duration">Wall-clock time of the operation.</param>
/// <param name="Errors">Human-readable error messages; empty on full success.</param>
public sealed record CveMappingUpdateResult(
    string CveId,
    int FunctionsUpdated,
    TimeSpan Duration,
    ImmutableArray<string> Errors);
|
||||
|
||||
/// <summary>
/// Result of batch CVE mapping update.
/// </summary>
/// <param name="LibraryName">Library whose CVEs were processed.</param>
/// <param name="CvesProcessed">Count of CVEs attempted.</param>
/// <param name="TotalFunctionsUpdated">Sum of function associations written across all CVEs.</param>
/// <param name="Duration">Wall-clock time of the batch.</param>
/// <param name="Errors">Aggregated per-CVE error messages.</param>
public sealed record CveBatchMappingResult(
    string LibraryName,
    int CvesProcessed,
    int TotalFunctionsUpdated,
    TimeSpan Duration,
    ImmutableArray<string> Errors);
|
||||
@@ -0,0 +1,531 @@
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Corpus.Services;
|
||||
|
||||
/// <summary>
/// Service for clustering semantically similar functions across library versions.
/// Groups functions by their canonical name and computes similarity to the
/// cluster centroid (the modal fingerprint of the group).
/// </summary>
public sealed partial class FunctionClusteringService : IFunctionClusteringService
{
    private readonly ICorpusRepository _repository;
    private readonly IClusterSimilarityComputer _similarityComputer;
    private readonly ILogger<FunctionClusteringService> _logger;

    public FunctionClusteringService(
        ICorpusRepository repository,
        IClusterSimilarityComputer similarityComputer,
        ILogger<FunctionClusteringService> logger)
    {
        _repository = repository;
        _similarityComputer = similarityComputer;
        _logger = logger;
    }

    /// <inheritdoc />
    public async Task<ClusteringResult> ClusterFunctionsAsync(
        Guid libraryId,
        ClusteringOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new ClusteringOptions();
        var startTime = DateTime.UtcNow;

        _logger.LogInformation(
            "Starting function clustering for library {LibraryId}",
            libraryId);

        // Get all functions with semantic fingerprints for this library.
        var functionsWithFingerprints = await GetFunctionsWithFingerprintsAsync(libraryId, ct);

        if (functionsWithFingerprints.Count == 0)
        {
            _logger.LogWarning("No functions with fingerprints found for library {LibraryId}", libraryId);
            return new ClusteringResult(
                libraryId,
                0,
                0,
                TimeSpan.Zero,
                [],
                []);
        }

        _logger.LogInformation(
            "Found {Count} functions with fingerprints",
            functionsWithFingerprints.Count);

        // Group functions by canonical name; groups whose normalized name is
        // empty (e.g. pure-decoration symbols) are dropped.
        var groupedByName = functionsWithFingerprints
            .GroupBy(f => NormalizeCanonicalName(f.Function.DemangledName ?? f.Function.Name))
            .Where(g => !string.IsNullOrWhiteSpace(g.Key))
            .ToList();

        _logger.LogInformation(
            "Grouped into {Count} canonical function names",
            groupedByName.Count);

        var clustersCreated = 0;
        var membersAssigned = 0;
        var errors = new List<string>();
        var warnings = new List<string>();

        foreach (var group in groupedByName)
        {
            ct.ThrowIfCancellationRequested();

            try
            {
                var result = await ProcessFunctionGroupAsync(
                    libraryId,
                    group.Key,
                    group.ToList(),
                    opts,
                    ct);

                // NOTE(review): this counts every successfully processed group,
                // including groups whose cluster already existed and was only
                // refreshed — "created" is really "created or refreshed".
                clustersCreated++;
                membersAssigned += result.MembersAdded;

                if (result.Warnings.Length > 0)
                {
                    warnings.AddRange(result.Warnings);
                }
            }
            catch (Exception ex)
            {
                // A failure in one group must not abort the whole run; record and continue.
                errors.Add($"Failed to cluster '{group.Key}': {ex.Message}");
                _logger.LogError(ex, "Error clustering function group {Name}", group.Key);
            }
        }

        var duration = DateTime.UtcNow - startTime;

        _logger.LogInformation(
            "Clustering completed: {Clusters} clusters, {Members} members in {Duration:c}",
            clustersCreated,
            membersAssigned,
            duration);

        return new ClusteringResult(
            libraryId,
            clustersCreated,
            membersAssigned,
            duration,
            [.. errors],
            [.. warnings]);
    }

    /// <inheritdoc />
    public async Task<ClusteringResult> ReclusterAsync(
        Guid clusterId,
        ClusteringOptions? options = null,
        CancellationToken ct = default)
    {
        var opts = options ?? new ClusteringOptions();
        var startTime = DateTime.UtcNow;

        // Get existing cluster; bail out with an error result when unknown.
        var cluster = await _repository.GetClusterAsync(clusterId, ct);
        if (cluster is null)
        {
            return new ClusteringResult(
                Guid.Empty,
                0,
                0,
                TimeSpan.Zero,
                ["Cluster not found"],
                []);
        }

        // Get current members; an empty cluster is reported as a warning, not an error.
        var members = await _repository.GetClusterMembersAsync(clusterId, ct);
        if (members.Length == 0)
        {
            return new ClusteringResult(
                cluster.LibraryId,
                0,
                0,
                TimeSpan.Zero,
                [],
                ["Cluster has no members"]);
        }

        // Re-load each member's function and its semantic fingerprint; members
        // whose function or fingerprint no longer exists are silently dropped.
        var functionsWithFingerprints = new List<FunctionWithFingerprint>();
        foreach (var member in members)
        {
            var function = await _repository.GetFunctionAsync(member.FunctionId, ct);
            if (function is null)
            {
                continue;
            }

            var fingerprints = await _repository.GetFingerprintsForFunctionAsync(function.Id, ct);
            var semanticFp = fingerprints.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);

            if (semanticFp is not null)
            {
                functionsWithFingerprints.Add(new FunctionWithFingerprint(function, semanticFp));
            }
        }

        // Clear existing members, then re-assign against a fresh centroid.
        await _repository.ClearClusterMembersAsync(clusterId, ct);

        var centroid = ComputeCentroid(functionsWithFingerprints, opts);
        var membersAdded = await AssignMembersAsync(
            clusterId,
            functionsWithFingerprints,
            centroid,
            opts.MinimumSimilarity,
            warnings: null,
            ct);

        var duration = DateTime.UtcNow - startTime;

        return new ClusteringResult(
            cluster.LibraryId,
            1,
            membersAdded,
            duration,
            [],
            []);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
        Guid libraryId,
        CancellationToken ct = default)
    {
        return await _repository.GetClustersForLibraryAsync(libraryId, ct);
    }

    /// <inheritdoc />
    public async Task<ClusterDetails?> GetClusterDetailsAsync(
        Guid clusterId,
        CancellationToken ct = default)
    {
        var cluster = await _repository.GetClusterAsync(clusterId, ct);
        if (cluster is null)
        {
            return null;
        }

        var members = await _repository.GetClusterMembersAsync(clusterId, ct);
        var functionDetails = new List<ClusterMemberDetails>();

        foreach (var member in members)
        {
            var function = await _repository.GetFunctionAsync(member.FunctionId, ct);
            if (function is null)
            {
                continue;
            }

            // Resolve variant and version for display; either may be missing,
            // in which case "unknown" placeholders are used.
            var variant = await _repository.GetBuildVariantAsync(function.BuildVariantId, ct);
            LibraryVersion? version = null;
            if (variant is not null)
            {
                version = await _repository.GetLibraryVersionAsync(variant.LibraryVersionId, ct);
            }

            functionDetails.Add(new ClusterMemberDetails(
                member.FunctionId,
                function.Name,
                function.DemangledName,
                version?.Version ?? "unknown",
                variant?.Architecture ?? "unknown",
                member.SimilarityToCentroid ?? 0m));
        }

        return new ClusterDetails(
            cluster.Id,
            cluster.LibraryId,
            cluster.CanonicalName,
            cluster.Description,
            [.. functionDetails]);
    }

    #region Private Methods

    /// <summary>
    /// Walks library -> versions -> build variants -> functions and returns every
    /// function that has a SemanticKsg fingerprint.
    /// </summary>
    private async Task<List<FunctionWithFingerprint>> GetFunctionsWithFingerprintsAsync(
        Guid libraryId,
        CancellationToken ct)
    {
        var result = new List<FunctionWithFingerprint>();

        var library = await _repository.GetLibraryByIdAsync(libraryId, ct);
        if (library is null)
        {
            return result;
        }

        var versions = await _repository.ListVersionsAsync(library.Name, ct);

        foreach (var version in versions)
        {
            var variants = await _repository.GetBuildVariantsAsync(version.Id, ct);

            foreach (var variant in variants)
            {
                var functions = await _repository.GetFunctionsForVariantAsync(variant.Id, ct);

                foreach (var function in functions)
                {
                    var fingerprints = await _repository.GetFingerprintsForFunctionAsync(function.Id, ct);
                    var semanticFp = fingerprints.FirstOrDefault(f => f.Algorithm == FingerprintAlgorithm.SemanticKsg);

                    if (semanticFp is not null)
                    {
                        result.Add(new FunctionWithFingerprint(function, semanticFp));
                    }
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Creates (or refreshes) the cluster for one canonical name and assigns
    /// every sufficiently similar function as a member.
    /// </summary>
    private async Task<GroupClusteringResult> ProcessFunctionGroupAsync(
        Guid libraryId,
        string canonicalName,
        List<FunctionWithFingerprint> functions,
        ClusteringOptions options,
        CancellationToken ct)
    {
        // Reuse an existing cluster with the same canonical name when present.
        var existingClusters = await _repository.GetClustersForLibraryAsync(libraryId, ct);
        var cluster = existingClusters.FirstOrDefault(c =>
            string.Equals(c.CanonicalName, canonicalName, StringComparison.OrdinalIgnoreCase));

        Guid clusterId;
        if (cluster is null)
        {
            var newCluster = new FunctionCluster(
                Guid.NewGuid(),
                libraryId,
                canonicalName,
                $"Cluster for function '{canonicalName}'",
                DateTimeOffset.UtcNow);

            await _repository.InsertClusterAsync(newCluster, ct);
            clusterId = newCluster.Id;
        }
        else
        {
            clusterId = cluster.Id;
            // Clear existing members for recomputation.
            await _repository.ClearClusterMembersAsync(clusterId, ct);
        }

        var centroid = ComputeCentroid(functions, options);

        var warnings = new List<string>();
        var membersAdded = await AssignMembersAsync(
            clusterId,
            functions,
            centroid,
            options.MinimumSimilarity,
            warnings,
            ct);

        return new GroupClusteringResult(membersAdded, [.. warnings]);
    }

    /// <summary>
    /// Adds every function whose similarity to <paramref name="centroid"/> meets
    /// <paramref name="minimumSimilarity"/> as a member of the cluster. When
    /// <paramref name="warnings"/> is non-null, excluded functions are recorded there.
    /// Returns the number of members added.
    /// </summary>
    private async Task<int> AssignMembersAsync(
        Guid clusterId,
        List<FunctionWithFingerprint> functions,
        byte[] centroid,
        decimal minimumSimilarity,
        List<string>? warnings,
        CancellationToken ct)
    {
        var membersAdded = 0;

        foreach (var fwf in functions)
        {
            var similarity = await _similarityComputer.ComputeSimilarityAsync(
                fwf.Fingerprint.Fingerprint,
                centroid,
                ct);

            if (similarity >= minimumSimilarity)
            {
                await _repository.AddClusterMemberAsync(
                    new ClusterMember(clusterId, fwf.Function.Id, similarity),
                    ct);
                membersAdded++;
            }
            else
            {
                warnings?.Add($"Function {fwf.Function.Name} excluded: similarity {similarity:F4} < threshold {minimumSimilarity:F4}");
            }
        }

        return membersAdded;
    }

    /// <summary>
    /// Picks the centroid fingerprint for a group: the most common fingerprint
    /// (mode), which is more robust than averaging for discrete hash-based
    /// fingerprints. <paramref name="options"/> is currently unused but kept for
    /// signature stability.
    /// </summary>
    private static byte[] ComputeCentroid(
        List<FunctionWithFingerprint> functions,
        ClusteringOptions options)
    {
        if (functions.Count == 0)
        {
            return [];
        }

        if (functions.Count == 1)
        {
            return functions[0].Fingerprint.Fingerprint;
        }

        // The largest group already carries its members, so take the fingerprint
        // bytes of its first element directly — no need to re-hex and re-scan
        // the input list as a second pass.
        return functions
            .GroupBy(f => Convert.ToHexStringLower(f.Fingerprint.Fingerprint))
            .OrderByDescending(g => g.Count())
            .First()
            .First()
            .Fingerprint.Fingerprint;
    }

    /// <summary>
    /// Normalizes a function name to its canonical form for clustering:
    /// strips GLIBC version annotations, trailing '@' markers, a leading "__"
    /// prefix, and a trailing "_internal" suffix.
    /// </summary>
    private static string NormalizeCanonicalName(string name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return string.Empty;
        }

        // Remove GLIBC version annotations (e.g. memcpy@GLIBC_2.14 -> memcpy)
        var normalized = GlibcVersionPattern().Replace(name, "");

        // Remove trailing @@ symbols
        normalized = normalized.TrimEnd('@');

        // Remove common symbol prefixes. Ordinal comparison: symbol names are
        // machine identifiers, not culture-sensitive text (CA1310).
        if (normalized.StartsWith("__", StringComparison.Ordinal))
        {
            normalized = normalized[2..];
        }

        // Remove _internal suffixes
        normalized = InternalSuffixPattern().Replace(normalized, "");

        return normalized.Trim();
    }

    [GeneratedRegex(@"@GLIBC_[\d.]+", RegexOptions.Compiled)]
    private static partial Regex GlibcVersionPattern();

    [GeneratedRegex(@"_internal$", RegexOptions.Compiled | RegexOptions.IgnoreCase)]
    private static partial Regex InternalSuffixPattern();

    #endregion

    // Pairs a corpus function with its SemanticKsg fingerprint record.
    private sealed record FunctionWithFingerprint(CorpusFunction Function, CorpusFingerprint Fingerprint);

    // Per-group outcome: members written plus any exclusion warnings.
    private sealed record GroupClusteringResult(int MembersAdded, ImmutableArray<string> Warnings);
}
|
||||
|
||||
/// <summary>
/// Interface for function clustering across library versions.
/// </summary>
public interface IFunctionClusteringService
{
    /// <summary>
    /// Cluster all functions for a library.
    /// </summary>
    /// <param name="options">Clustering thresholds; defaults are used when null.</param>
    Task<ClusteringResult> ClusterFunctionsAsync(
        Guid libraryId,
        ClusteringOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Recompute a specific cluster from its current members.
    /// </summary>
    Task<ClusteringResult> ReclusterAsync(
        Guid clusterId,
        ClusteringOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Get all clusters for a library.
    /// </summary>
    Task<ImmutableArray<FunctionCluster>> GetClustersForLibraryAsync(
        Guid libraryId,
        CancellationToken ct = default);

    /// <summary>
    /// Get detailed information about a cluster, or null when it does not exist.
    /// </summary>
    Task<ClusterDetails?> GetClusterDetailsAsync(
        Guid clusterId,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Options for function clustering.
/// </summary>
public sealed record ClusteringOptions
{
    /// <summary>
    /// Minimum similarity threshold to include a function in a cluster.
    /// Functions scoring below this against the centroid are excluded.
    /// </summary>
    public decimal MinimumSimilarity { get; init; } = 0.7m;

    /// <summary>
    /// Algorithm to use for clustering.
    /// </summary>
    public FingerprintAlgorithm Algorithm { get; init; } = FingerprintAlgorithm.SemanticKsg;
}
|
||||
|
||||
/// <summary>
/// Result of clustering operation.
/// </summary>
/// <param name="LibraryId">Library that was clustered.</param>
/// <param name="ClustersCreated">Number of function groups processed into clusters.</param>
/// <param name="MembersAssigned">Total cluster memberships written.</param>
/// <param name="Duration">Wall-clock time of the operation.</param>
/// <param name="Errors">Per-group failures that did not abort the run.</param>
/// <param name="Warnings">Non-fatal notices (e.g. below-threshold exclusions).</param>
public sealed record ClusteringResult(
    Guid LibraryId,
    int ClustersCreated,
    int MembersAssigned,
    TimeSpan Duration,
    ImmutableArray<string> Errors,
    ImmutableArray<string> Warnings);
|
||||
|
||||
/// <summary>
/// Detailed cluster information, including the resolved member list.
/// </summary>
public sealed record ClusterDetails(
    Guid ClusterId,
    Guid LibraryId,
    string CanonicalName,
    string? Description,
    ImmutableArray<ClusterMemberDetails> Members);
|
||||
|
||||
/// <summary>
/// Details about a cluster member: identity, display names, provenance
/// (version/architecture), and its similarity to the cluster centroid.
/// </summary>
public sealed record ClusterMemberDetails(
    Guid FunctionId,
    string FunctionName,
    string? DemangledName,
    string Version,
    string Architecture,
    decimal SimilarityToCentroid);
|
||||
|
||||
/// <summary>
/// Interface for computing similarity between fingerprints.
/// </summary>
public interface IClusterSimilarityComputer
{
    /// <summary>
    /// Compute similarity between two fingerprints.
    /// Result is a score in a decimal scale comparable against
    /// <see cref="ClusteringOptions.MinimumSimilarity"/>.
    /// </summary>
    Task<decimal> ComputeSimilarityAsync(
        byte[] fingerprint1,
        byte[] fingerprint2,
        CancellationToken ct = default);
}
|
||||
@@ -10,6 +10,7 @@
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Http" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -0,0 +1,392 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Engine for comparing AST structures using tree edit distance and semantic analysis.
|
||||
/// </summary>
|
||||
public sealed class AstComparisonEngine : IAstComparisonEngine
|
||||
{
|
||||
/// <inheritdoc />
public decimal ComputeStructuralSimilarity(DecompiledAst a, DecompiledAst b)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);

    // Similarity is the complement of the normalized tree edit distance:
    // identical trees -> 1.0, maximally different trees -> 0.0.
    var editDistance = ComputeEditDistance(a, b);
    return 1.0m - editDistance.NormalizedDistance;
}
|
||||
|
||||
/// <inheritdoc />
public AstEditDistance ComputeEditDistance(DecompiledAst a, DecompiledAst b)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);

    // Simplified Zhang-Shasha tree edit distance: children are compared
    // positionally rather than via a full optimal alignment.
    var operations = ComputeTreeEditOperations(a.Root, b.Root);

    // Normalize by the larger tree so the distance is in [0, 1];
    // guard against division by zero for empty trees.
    var totalNodes = Math.Max(a.NodeCount, b.NodeCount);
    var normalized = totalNodes > 0
        ? (decimal)operations.TotalOperations / totalNodes
        : 0m;

    return new AstEditDistance(
        operations.Insertions,
        operations.Deletions,
        operations.Modifications,
        operations.TotalOperations,
        Math.Clamp(normalized, 0m, 1m));
}
|
||||
|
||||
/// <inheritdoc />
public ImmutableArray<SemanticEquivalence> FindEquivalences(DecompiledAst a, DecompiledAst b)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);

    var equivalences = new List<SemanticEquivalence>();

    // All-pairs comparison of subtrees — O(|A| * |B|); acceptable for
    // function-sized ASTs but worth revisiting for very large trees.
    var nodesA = CollectNodes(a.Root).ToList();
    var nodesB = CollectNodes(b.Root).ToList();

    foreach (var nodeA in nodesA)
    {
        foreach (var nodeB in nodesB)
        {
            var equivalence = CheckEquivalence(nodeA, nodeB);
            if (equivalence is not null)
            {
                equivalences.Add(equivalence);
            }
        }
    }

    // Remove redundant equivalences (child nodes when parent is equivalent)
    return [.. FilterRedundantEquivalences(equivalences)];
}
|
||||
|
||||
/// <inheritdoc />
public ImmutableArray<CodeDifference> FindDifferences(DecompiledAst a, DecompiledAst b)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);

    var differences = new List<CodeDifference>();

    // Recursive positional comparison from the roots; differences are
    // appended to the list as they are found.
    CompareNodes(a.Root, b.Root, differences);

    return [.. differences];
}
|
||||
|
||||
/// <summary>
/// Recursively counts insert/delete/modify operations needed to turn tree
/// <paramref name="a"/> into tree <paramref name="b"/>. Simplified: children
/// are compared positionally, not via a full Zhang-Shasha alignment.
/// </summary>
private static EditOperations ComputeTreeEditOperations(AstNode a, AstNode b)
{
    // A type mismatch counts as a single modification; the subtrees are not
    // descended into further.
    if (a.Type != b.Type)
    {
        return new EditOperations(0, 0, 1, 1);
    }

    var childrenA = a.Children;
    var childrenB = b.Children;

    // Positional comparison: surplus children on b's side are insertions,
    // surplus on a's side are deletions. (Removed an unused maxLen local and
    // the redundant zero-init/reassign of these counters.)
    var minLen = Math.Min(childrenA.Length, childrenB.Length);
    var insertions = childrenB.Length - minLen;
    var deletions = childrenA.Length - minLen;
    var modifications = 0;

    for (var i = 0; i < minLen; i++)
    {
        var childOps = ComputeTreeEditOperations(childrenA[i], childrenB[i]);
        insertions += childOps.Insertions;
        deletions += childOps.Deletions;
        modifications += childOps.Modifications;
    }

    return new EditOperations(insertions, deletions, modifications, insertions + deletions + modifications);
}
|
||||
|
||||
/// <summary>
/// Classifies two nodes as identical, renamed, or optimization variants,
/// returning null when no equivalence applies. Confidence scores are fixed
/// per category (1.0 / 0.95 / 0.85).
/// </summary>
private static SemanticEquivalence? CheckEquivalence(AstNode a, AstNode b)
{
    // Same type - potential equivalence
    if (a.Type != b.Type)
    {
        return null;
    }

    // Check for identical
    if (AreNodesIdentical(a, b))
    {
        return new SemanticEquivalence(a, b, EquivalenceType.Identical, 1.0m, "Identical nodes");
    }

    // Check for renamed (same structure, different names)
    if (AreNodesRenamed(a, b))
    {
        return new SemanticEquivalence(a, b, EquivalenceType.Renamed, 0.95m, "Same structure with renamed identifiers");
    }

    // Check for optimization variants
    if (AreOptimizationVariants(a, b))
    {
        return new SemanticEquivalence(a, b, EquivalenceType.Optimized, 0.85m, "Optimization variant");
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Deep structural equality: same node types, same node-specific payloads
/// (constant value, variable name, operator, callee), and identical children.
/// </summary>
private static bool AreNodesIdentical(AstNode a, AstNode b)
{
    if (a.Type != b.Type || a.Children.Length != b.Children.Length)
    {
        return false;
    }

    // Check node-specific equality.
    // NOTE(review): constants and variables return here without recursing into
    // children — assumes those node kinds are leaves; confirm against AstNode.
    if (a is ConstantNode constA && b is ConstantNode constB)
    {
        return constA.Value?.ToString() == constB.Value?.ToString();
    }

    if (a is VariableNode varA && b is VariableNode varB)
    {
        return varA.Name == varB.Name;
    }

    // Operators and callees must match, but their operands/arguments are
    // still compared through the child loop below.
    if (a is BinaryOpNode binA && b is BinaryOpNode binB)
    {
        if (binA.Operator != binB.Operator)
        {
            return false;
        }
    }

    if (a is CallNode callA && b is CallNode callB)
    {
        if (callA.FunctionName != callB.FunctionName)
        {
            return false;
        }
    }

    // Check children recursively
    for (var i = 0; i < a.Children.Length; i++)
    {
        if (!AreNodesIdentical(a.Children[i], b.Children[i]))
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// True when the two subtrees have identical shape but differing identifier
/// names (alpha-renaming). Heuristic, not a strict alpha-equivalence check.
/// </summary>
private static bool AreNodesRenamed(AstNode a, AstNode b)
{
    if (a.Type != b.Type || a.Children.Length != b.Children.Length)
    {
        return false;
    }

    // Same structure but variable/parameter names differ
    if (a is VariableNode && b is VariableNode)
    {
        return true; // Different name but same position = renamed
    }

    // NOTE(review): two leaf nodes of the same non-variable type (e.g. two
    // ConstantNodes with different values) fall through the loop below and
    // also count as "renamed" — confirm this is intended.

    // Check children have same structure
    for (var i = 0; i < a.Children.Length; i++)
    {
        if (!AreNodesRenamed(a.Children[i], b.Children[i]) &&
            !AreNodesIdentical(a.Children[i], b.Children[i]))
        {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/// <summary>
/// Heuristically detects common compiler-optimization correspondences:
/// loop unrolling (for -> block), strength reduction (* -> &lt;&lt;, / -> &gt;&gt;),
/// and inline expansion (call -> block). These checks are intentionally
/// permissive — a type match alone triggers the loop/inline cases — so
/// results should be treated as candidates, not proofs.
/// </summary>
private static bool AreOptimizationVariants(AstNode a, AstNode b)
{
    // Loop unrolling: for loop vs repeated statements
    if (a.Type == AstNodeType.For && b.Type == AstNodeType.Block)
    {
        return true; // Might be unrolled
    }

    // Strength reduction: multiplication/division replaced by shifts
    if (a is BinaryOpNode binA && b is BinaryOpNode binB)
    {
        if ((binA.Operator == "*" && binB.Operator == "<<") ||
            (binA.Operator == "/" && binB.Operator == ">>"))
        {
            return true;
        }
    }

    // Inline expansion
    if (a.Type == AstNodeType.Call && b.Type == AstNodeType.Block)
    {
        return true; // Might be inlined
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Recursively diffs two subtrees, appending Modified/Added/Removed entries
/// to <paramref name="differences"/>. Children are matched positionally;
/// a type mismatch stops descent into that pair.
/// </summary>
private static void CompareNodes(AstNode a, AstNode b, List<CodeDifference> differences)
{
    if (a.Type != b.Type)
    {
        differences.Add(new CodeDifference(
            DifferenceType.Modified,
            a,
            b,
            $"Node type changed: {a.Type} -> {b.Type}"));
        return;
    }

    // Compare payloads of specific node types (names, values, operators, callees).
    switch (a)
    {
        case VariableNode varA when b is VariableNode varB:
            if (varA.Name != varB.Name)
            {
                differences.Add(new CodeDifference(
                    DifferenceType.Modified,
                    a,
                    b,
                    $"Variable renamed: {varA.Name} -> {varB.Name}"));
            }
            break;

        case ConstantNode constA when b is ConstantNode constB:
            if (constA.Value?.ToString() != constB.Value?.ToString())
            {
                differences.Add(new CodeDifference(
                    DifferenceType.Modified,
                    a,
                    b,
                    $"Constant changed: {constA.Value} -> {constB.Value}"));
            }
            break;

        case BinaryOpNode binA when b is BinaryOpNode binB:
            if (binA.Operator != binB.Operator)
            {
                differences.Add(new CodeDifference(
                    DifferenceType.Modified,
                    a,
                    b,
                    $"Operator changed: {binA.Operator} -> {binB.Operator}"));
            }
            break;

        case CallNode callA when b is CallNode callB:
            if (callA.FunctionName != callB.FunctionName)
            {
                differences.Add(new CodeDifference(
                    DifferenceType.Modified,
                    a,
                    b,
                    $"Function call changed: {callA.FunctionName} -> {callB.FunctionName}"));
            }
            break;
    }

    // Compare the overlapping prefix of children pairwise.
    var minChildren = Math.Min(a.Children.Length, b.Children.Length);

    for (var i = 0; i < minChildren; i++)
    {
        CompareNodes(a.Children[i], b.Children[i], differences);
    }

    // Surplus children in a were removed; surplus in b were added.
    for (var i = minChildren; i < a.Children.Length; i++)
    {
        differences.Add(new CodeDifference(
            DifferenceType.Removed,
            a.Children[i],
            null,
            $"Node removed: {a.Children[i].Type}"));
    }

    for (var i = minChildren; i < b.Children.Length; i++)
    {
        differences.Add(new CodeDifference(
            DifferenceType.Added,
            null,
            b.Children[i],
            $"Node added: {b.Children[i].Type}"));
    }
}
|
||||
|
||||
/// <summary>
/// Enumerates <paramref name="root"/> and all of its descendants in
/// pre-order (parent before children, children left to right).
/// </summary>
private static IEnumerable<AstNode> CollectNodes(AstNode root)
{
    var pending = new Stack<AstNode>();
    pending.Push(root);

    while (pending.Count > 0)
    {
        var current = pending.Pop();
        yield return current;

        // Push right-to-left so the leftmost child is visited first.
        for (var i = current.Children.Length - 1; i >= 0; i--)
        {
            pending.Push(current.Children[i]);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Drops equivalences that are nested inside another reported equivalence,
/// keeping only the top-level pairs. An entry is redundant when some other
/// entry's nodes are ancestors of both of its nodes.
/// </summary>
private static IEnumerable<SemanticEquivalence> FilterRedundantEquivalences(
    List<SemanticEquivalence> equivalences)
{
    return equivalences
        .Where(eq => !equivalences.Any(other =>
            other != eq &&
            IsAncestor(other.NodeA, eq.NodeA) &&
            IsAncestor(other.NodeB, eq.NodeB)))
        .ToList();
}
|
||||
|
||||
/// <summary>
/// True when <paramref name="potential"/> is a strict ancestor of
/// <paramref name="node"/> (a node is not its own ancestor).
/// </summary>
private static bool IsAncestor(AstNode potential, AstNode node)
{
    if (potential == node)
    {
        return false;
    }

    // A child equal to the target, or any subtree containing it, suffices.
    return potential.Children.Any(child => child == node || IsAncestor(child, node));
}
|
||||
|
||||
// Summary counts for a tree edit script. NOTE(review): TotalOperations is
// presumably Insertions + Deletions + Modifications, but the producer is
// outside this view — confirm before relying on that invariant.
private readonly record struct EditOperations(int Insertions, int Deletions, int Modifications, int TotalOperations);
|
||||
}
|
||||
@@ -0,0 +1,534 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Normalizes decompiled code for comparison by removing superficial differences.
|
||||
/// </summary>
|
||||
public sealed partial class CodeNormalizer : ICodeNormalizer
|
||||
{
|
||||
private static readonly ImmutableHashSet<string> CKeywords = ImmutableHashSet.Create(
|
||||
"auto", "break", "case", "char", "const", "continue", "default", "do",
|
||||
"double", "else", "enum", "extern", "float", "for", "goto", "if",
|
||||
"int", "long", "register", "return", "short", "signed", "sizeof", "static",
|
||||
"struct", "switch", "typedef", "union", "unsigned", "void", "volatile", "while",
|
||||
// Common Ghidra types
|
||||
"undefined", "undefined1", "undefined2", "undefined4", "undefined8",
|
||||
"byte", "word", "dword", "qword", "bool", "uchar", "ushort", "uint", "ulong",
|
||||
"int8_t", "int16_t", "int32_t", "int64_t", "uint8_t", "uint16_t", "uint32_t", "uint64_t",
|
||||
"size_t", "ssize_t", "ptrdiff_t", "intptr_t", "uintptr_t",
|
||||
// Common function names to preserve
|
||||
"NULL", "true", "false"
|
||||
);
|
||||
|
||||
/// <inheritdoc />
public string Normalize(string code, NormalizationOptions? options = null)
{
    ArgumentException.ThrowIfNullOrEmpty(code);

    var opts = options ?? NormalizationOptions.Default;

    // Fixed pipeline: comments are always stripped, every other pass is
    // opt-in via the options. Pass order matters (e.g. variable renaming
    // runs on comment-free text, whitespace collapse runs near the end).
    var text = RemoveComments(code);

    if (opts.NormalizeVariables)
    {
        text = NormalizeVariableNames(text, opts.KnownFunctions);
    }

    if (opts.NormalizeFunctionCalls)
    {
        text = NormalizeFunctionCalls(text, opts.KnownFunctions);
    }

    if (opts.NormalizeConstants)
    {
        text = NormalizeConstants(text);
    }

    if (opts.NormalizeWhitespace)
    {
        text = NormalizeWhitespace(text);
    }

    if (opts.SortIndependentStatements)
    {
        text = SortIndependentStatements(text);
    }

    return text;
}
|
||||
|
||||
/// <inheritdoc />
public byte[] ComputeCanonicalHash(string code)
{
    ArgumentException.ThrowIfNullOrEmpty(code);

    // Hash on the fully normalized form — everything canonicalized except
    // constants, which are kept because they carry semantic identity.
    var canonicalOptions = new NormalizationOptions
    {
        NormalizeVariables = true,
        NormalizeFunctionCalls = true,
        NormalizeConstants = false, // Keep constants for semantic identity
        NormalizeWhitespace = true,
        SortIndependentStatements = true
    };

    var canonical = Normalize(code, canonicalOptions);
    return SHA256.HashData(Encoding.UTF8.GetBytes(canonical));
}
|
||||
|
||||
/// <inheritdoc />
public DecompiledAst NormalizeAst(DecompiledAst ast, NormalizationOptions? options = null)
{
    ArgumentNullException.ThrowIfNull(ast);

    var effective = options ?? NormalizationOptions.Default;

    // Rename map and counter are shared across the whole tree so that the
    // same original name always maps to the same canonical name.
    var nextVar = 0;
    var renames = new Dictionary<string, string>();

    var root = NormalizeNode(ast.Root, effective, renames, ref nextVar);

    // Normalization replaces nodes one-for-one, so the original node count,
    // depth, and detected patterns still describe the result.
    return new DecompiledAst(root, ast.NodeCount, ast.Depth, ast.Patterns);
}
|
||||
|
||||
/// <summary>
/// Dispatches a node to the matching normalization pass when that pass is
/// enabled; anything else falls through to child normalization.
/// </summary>
private static AstNode NormalizeNode(
    AstNode node,
    NormalizationOptions options,
    Dictionary<string, string> varMap,
    ref int varIndex)
{
    if (node is VariableNode varNode && options.NormalizeVariables)
    {
        return NormalizeVariableNode(varNode, varMap, ref varIndex);
    }

    if (node is CallNode callNode && options.NormalizeFunctionCalls)
    {
        return NormalizeCallNode(callNode, options, varMap, ref varIndex);
    }

    if (node is ConstantNode constNode && options.NormalizeConstants)
    {
        return NormalizeConstantNode(constNode);
    }

    return NormalizeChildren(node, options, varMap, ref varIndex);
}
|
||||
|
||||
/// <summary>
/// Maps a variable name to a stable canonical "var_N" alias; keywords and
/// type names are never renamed. The map guarantees the same original name
/// always yields the same alias.
/// </summary>
private static AstNode NormalizeVariableNode(
    VariableNode node,
    Dictionary<string, string> varMap,
    ref int varIndex)
{
    if (IsKeywordOrType(node.Name))
    {
        return node;
    }

    if (varMap.TryGetValue(node.Name, out var existing))
    {
        return node with { Name = existing };
    }

    var fresh = $"var_{varIndex++}";
    varMap[node.Name] = fresh;
    return node with { Name = fresh };
}
|
||||
|
||||
/// <summary>
/// Canonicalizes a call: unknown callee names become a stable
/// "func_XXXXXXXX" tag, and all arguments are normalized recursively.
/// Known functions (caller-supplied or C standard library) keep their name.
/// </summary>
private static AstNode NormalizeCallNode(
    CallNode node,
    NormalizationOptions options,
    Dictionary<string, string> varMap,
    ref int varIndex)
{
    var funcName = node.FunctionName;

    // Preserve known functions
    if (options.KnownFunctions?.Contains(funcName) != true &&
        !IsStandardLibraryFunction(funcName))
    {
        // BUGFIX: previously used funcName.GetHashCode(), which is randomized
        // per process in .NET, so canonical names — and the canonical hash
        // built on top of them — were not stable across runs. Derive the tag
        // from SHA-256 instead: first 4 bytes as upper-case hex gives the same
        // 8-character width as the old {hash:X8} format, deterministically.
        var tag = Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(funcName)), 0, 4);
        funcName = $"func_{tag}";
    }

    var normalizedArgs = new List<AstNode>(node.Arguments.Length);
    foreach (var arg in node.Arguments)
    {
        normalizedArgs.Add(NormalizeNode(arg, options, varMap, ref varIndex));
    }

    return new CallNode(funcName, [.. normalizedArgs], node.Location);
}
|
||||
|
||||
/// <summary>
/// Replaces a constant's value with a category placeholder (CONST_INT /
/// CONST_FLOAT / CONST_STR); values of any other runtime type pass through.
/// </summary>
private static AstNode NormalizeConstantNode(ConstantNode node) => node.Value switch
{
    long or int or short or byte => node with { Value = "CONST_INT" },
    double or float or decimal => node with { Value = "CONST_FLOAT" },
    string => node with { Value = "CONST_STR" },
    _ => node
};
|
||||
|
||||
/// <summary>
/// Normalizes every child, then rebuilds the node around the normalized
/// child array using per-type constructors (no reflection). The positional
/// meaning of the child array is type-specific; see the CreateNormalized*
/// helpers.
/// </summary>
private static AstNode NormalizeChildren(
    AstNode node,
    NormalizationOptions options,
    Dictionary<string, string> varMap,
    ref int varIndex)
{
    // Leaf: nothing to rebuild.
    if (node.Children.Length == 0)
    {
        return node;
    }

    var normalizedChildren = new List<AstNode>(node.Children.Length);
    foreach (var child in node.Children)
    {
        normalizedChildren.Add(NormalizeNode(child, options, varMap, ref varIndex));
    }

    var normalizedArray = normalizedChildren.ToImmutableArray();

    // Use reflection-free approach for common node types
    return node switch
    {
        BlockNode block => block with { Statements = normalizedArray },
        IfNode ifNode => CreateNormalizedIf(ifNode, normalizedArray),
        WhileNode whileNode => CreateNormalizedWhile(whileNode, normalizedArray),
        ForNode forNode => CreateNormalizedFor(forNode, normalizedArray),
        ReturnNode returnNode when normalizedArray.Length > 0 =>
            returnNode with { Value = normalizedArray[0] },
        AssignmentNode assignment => CreateNormalizedAssignment(assignment, normalizedArray),
        BinaryOpNode binOp => CreateNormalizedBinaryOp(binOp, normalizedArray),
        UnaryOpNode unaryOp when normalizedArray.Length > 0 =>
            unaryOp with { Operand = normalizedArray[0] },
        // NOTE(review): for node types without a case the ORIGINAL node is
        // returned and the normalized children are discarded — confirm that
        // is intended for uncommon node types.
        _ => node // Return as-is for other node types
    };
}
|
||||
|
||||
// The helpers below rebuild structured nodes from a flat normalized-children
// array. Children are positional; any missing slot falls back to the
// corresponding ORIGINAL (un-normalized) sub-node.

/// <summary>Rebuilds an if from [condition, then-branch, else-branch].</summary>
private static IfNode CreateNormalizedIf(IfNode node, ImmutableArray<AstNode> children)
{
    return new IfNode(
        children.Length > 0 ? children[0] : node.Condition,
        children.Length > 1 ? children[1] : node.ThenBranch,
        children.Length > 2 ? children[2] : node.ElseBranch,
        node.Location);
}

/// <summary>Rebuilds a while from [condition, body].</summary>
private static WhileNode CreateNormalizedWhile(WhileNode node, ImmutableArray<AstNode> children)
{
    return new WhileNode(
        children.Length > 0 ? children[0] : node.Condition,
        children.Length > 1 ? children[1] : node.Body,
        node.Location);
}

/// <summary>Rebuilds a for from [init, condition, update, body].</summary>
private static ForNode CreateNormalizedFor(ForNode node, ImmutableArray<AstNode> children)
{
    return new ForNode(
        children.Length > 0 ? children[0] : node.Init,
        children.Length > 1 ? children[1] : node.Condition,
        children.Length > 2 ? children[2] : node.Update,
        children.Length > 3 ? children[3] : node.Body,
        node.Location);
}

/// <summary>Rebuilds an assignment from [target, value]; operator is kept.</summary>
private static AssignmentNode CreateNormalizedAssignment(
    AssignmentNode node,
    ImmutableArray<AstNode> children)
{
    return new AssignmentNode(
        children.Length > 0 ? children[0] : node.Target,
        children.Length > 1 ? children[1] : node.Value,
        node.Operator,
        node.Location);
}

/// <summary>Rebuilds a binary op from [left, right]; operator is kept.</summary>
private static BinaryOpNode CreateNormalizedBinaryOp(
    BinaryOpNode node,
    ImmutableArray<AstNode> children)
{
    return new BinaryOpNode(
        children.Length > 0 ? children[0] : node.Left,
        children.Length > 1 ? children[1] : node.Right,
        node.Operator,
        node.Location);
}
|
||||
|
||||
/// <summary>
/// Strips "//" line comments first, then "/* ... */" block comments.
/// </summary>
private static string RemoveComments(string code) =>
    MultiLineCommentRegex().Replace(
        SingleLineCommentRegex().Replace(code, ""),
        "");
|
||||
|
||||
/// <summary>
/// Rewrites every identifier that is not a keyword/type, a caller-known
/// function, or a C standard library name into a stable "var_N" alias.
/// The same original identifier always gets the same alias within one call.
/// </summary>
private static string NormalizeVariableNames(string code, ImmutableHashSet<string>? knownFunctions)
{
    var nextIndex = 0;
    var renames = new Dictionary<string, string>();

    return IdentifierRegex().Replace(code, match =>
    {
        var name = match.Value;

        // Names with fixed meaning pass through untouched.
        if (IsKeywordOrType(name) ||
            knownFunctions?.Contains(name) == true ||
            IsStandardLibraryFunction(name))
        {
            return name;
        }

        if (!renames.TryGetValue(name, out var canonical))
        {
            canonical = $"var_{nextIndex++}";
            renames[name] = canonical;
        }

        return canonical;
    });
}
|
||||
|
||||
/// <summary>
/// Rewrites call sites of unknown functions to a stable "func_XXXXXXXX("
/// form; caller-known and C standard library functions keep their names.
/// </summary>
private static string NormalizeFunctionCalls(string code, ImmutableHashSet<string>? knownFunctions)
{
    // Match function calls: identifier followed by (
    return FunctionCallRegex().Replace(code, match =>
    {
        var funcName = match.Groups[1].Value;

        // Skip known functions
        if (knownFunctions?.Contains(funcName) == true)
        {
            return match.Value;
        }

        // Skip standard library functions
        if (IsStandardLibraryFunction(funcName))
        {
            return match.Value;
        }

        // BUGFIX: previously used funcName.GetHashCode(), which is randomized
        // per process in .NET, so the normalized text differed between runs
        // and broke canonical-hash stability. Use a SHA-256-derived tag:
        // first 4 bytes as upper-case hex = 8 chars, matching the old
        // {hash:X8} width, but deterministic.
        var tag = Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(funcName)), 0, 4);
        return $"func_{tag}(";
    });
}
|
||||
|
||||
/// <summary>
/// Replaces literals with category placeholders: hex constants, decimal
/// constants of 4+ digits (small common values like 0/1/2 are preserved),
/// and string literals.
/// </summary>
private static string NormalizeConstants(string code)
{
    var result = HexConstantRegex().Replace(code, "CONST_HEX");
    result = LargeDecimalRegex().Replace(result, "CONST_INT");
    return StringLiteralRegex().Replace(result, "CONST_STR");
}
|
||||
|
||||
/// <summary>
/// Canonicalizes whitespace. Pass order is significant: operator-adjacent
/// whitespace is removed before line-ending normalization, and the operator
/// pattern uses \s*, so it also joins lines that break at an operator or
/// punctuation character.
/// </summary>
private static string NormalizeWhitespace(string code)
{
    // Collapse multiple whitespace to single space
    code = MultipleWhitespaceRegex().Replace(code, " ");

    // Remove whitespace around operators
    code = WhitespaceAroundOperatorsRegex().Replace(code, "$1");

    // Normalize line endings
    code = code.Replace("\r\n", "\n").Replace("\r", "\n");

    // Remove trailing whitespace on lines
    code = TrailingWhitespaceRegex().Replace(code, "\n");

    return code.Trim();
}
|
||||
|
||||
/// <summary>
/// Sorts consecutive simple statements found at brace depth 1 into a
/// canonical order so that compiler statement reordering does not show up
/// as a difference.
/// </summary>
/// <remarks>
/// Simplified: no data-dependency analysis — every depth-1 run is sorted by
/// its textual sort key. Brace depth is tracked by counting raw '{'/'}'
/// characters per line, so braces inside string literals are miscounted —
/// NOTE(review): presumably acceptable because comments are stripped and
/// inputs are decompiler output; confirm.
/// </remarks>
private static string SortIndependentStatements(string code)
{
    // Parse into blocks and sort independent statements within each block
    // This is a simplified implementation that sorts top-level statements
    // A full implementation would need to analyze data dependencies

    var lines = code.Split('\n', StringSplitOptions.RemoveEmptyEntries);
    var result = new StringBuilder();

    var blockDepth = 0;
    var currentBlock = new List<string>();   // current run of sortable statements

    foreach (var line in lines)
    {
        var trimmed = line.Trim();

        // Track block depth
        blockDepth += trimmed.Count(c => c == '{');
        blockDepth -= trimmed.Count(c => c == '}');

        if (blockDepth == 1 && !trimmed.Contains('{') && !trimmed.Contains('}'))
        {
            // Simple statement at block level 1
            currentBlock.Add(trimmed);
        }
        else
        {
            // A brace or depth change ends the run: flush it in sorted order.
            if (currentBlock.Count > 0)
            {
                var sorted = SortStatements(currentBlock);
                foreach (var stmt in sorted)
                {
                    result.AppendLine(stmt);
                }
                currentBlock.Clear();
            }

            result.AppendLine(line);
        }
    }

    // Flush remaining
    if (currentBlock.Count > 0)
    {
        var sorted = SortStatements(currentBlock);
        foreach (var stmt in sorted)
        {
            result.AppendLine(stmt);
        }
    }

    return result.ToString().Trim();
}
|
||||
|
||||
/// <summary>
/// Orders a run of reorderable statements deterministically by their
/// canonical sort key (ordinal comparison, so the order is culture-free).
/// </summary>
private static List<string> SortStatements(List<string> statements) =>
    statements
        .OrderBy(GetStatementSortKey, StringComparer.Ordinal)
        .ToList();
|
||||
|
||||
/// <summary>
/// Derives a sort key from a statement's "essence": assignments sort by
/// target (prefix "A_"), calls by callee (prefix "C_"), everything else by
/// its trimmed text (prefix "Z_"), so the categories group together.
/// </summary>
private static string GetStatementSortKey(string statement)
{
    var trimmed = statement.Trim();

    var assignMatch = AssignmentTargetRegex().Match(trimmed);
    if (assignMatch.Success)
    {
        return $"A_{assignMatch.Groups[1].Value}";
    }

    var callMatch = FunctionNameRegex().Match(trimmed);
    return callMatch.Success
        ? $"C_{callMatch.Groups[1].Value}"
        : $"Z_{trimmed}";
}
|
||||
|
||||
/// <summary>True when <paramref name="name"/> is a C keyword or a known type name.</summary>
private static bool IsKeywordOrType(string name) => CKeywords.Contains(name);
|
||||
|
||||
/// <summary>
/// True when <paramref name="name"/> is a well-known C standard library
/// symbol that should keep its real name during normalization.
/// </summary>
private static bool IsStandardLibraryFunction(string name)
{
    // Memory management
    if (name is "malloc" or "calloc" or "realloc" or "free" or
        "memcpy" or "memmove" or "memset" or "memcmp")
    {
        return true;
    }

    // String handling
    if (name is "strlen" or "strcpy" or "strncpy" or "strcat" or "strncat" or
        "strcmp" or "strncmp" or "strchr" or "strrchr" or "strstr")
    {
        return true;
    }

    // Formatted and file I/O
    if (name is "printf" or "fprintf" or "sprintf" or "snprintf" or
        "scanf" or "fscanf" or "sscanf" or
        "fopen" or "fclose" or "fread" or "fwrite" or "fseek" or "ftell" or "fflush" or
        "puts" or "fputs" or "gets" or "fgets" or "putchar" or "getchar")
    {
        return true;
    }

    // Math
    if (name is "abs" or "labs" or "llabs" or "fabs" or "sqrt" or "pow" or
        "sin" or "cos" or "tan" or "log" or "exp")
    {
        return true;
    }

    // Process control, conversions, diagnostics
    return name is "exit" or "abort" or "atexit" or
        "atoi" or "atol" or "atof" or "strtol" or "strtoul" or "strtod" or
        "assert" or "errno";
}
|
||||
|
||||
// Regex patterns using source generators

// "//" to end of line (no newline consumed).
[GeneratedRegex(@"//[^\n]*")]
private static partial Regex SingleLineCommentRegex();

// "/* ... */", non-greedy, may span lines.
[GeneratedRegex(@"/\*[\s\S]*?\*/")]
private static partial Regex MultiLineCommentRegex();

// A C identifier (also matches keywords; callers filter those out).
[GeneratedRegex(@"\b([a-zA-Z_][a-zA-Z0-9_]*)\b")]
private static partial Regex IdentifierRegex();

// Identifier immediately followed by "(" — a call site (group 1 = callee).
[GeneratedRegex(@"\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")]
private static partial Regex FunctionCallRegex();

// Hex literal: 0x / 0X followed by hex digits.
[GeneratedRegex(@"0[xX][0-9a-fA-F]+")]
private static partial Regex HexConstantRegex();

// Decimal literal of 4+ digits (small constants like 0/1/2 are kept).
[GeneratedRegex(@"\b[0-9]{4,}\b")]
private static partial Regex LargeDecimalRegex();

// Double-quoted string literal with backslash escapes.
[GeneratedRegex(@"""(?:[^""\\]|\\.)*""")]
private static partial Regex StringLiteralRegex();

// Runs of spaces/tabs (newlines excluded on purpose).
[GeneratedRegex(@"[ \t]+")]
private static partial Regex MultipleWhitespaceRegex();

// Whitespace (incl. newlines) around an operator/punctuation char; replaced
// with just the captured character.
[GeneratedRegex(@"\s*([+\-*/%=<>!&|^~?:;,{}()\[\]])\s*")]
private static partial Regex WhitespaceAroundOperatorsRegex();

// Spaces/tabs before a newline.
[GeneratedRegex(@"[ \t]+\n")]
private static partial Regex TrailingWhitespaceRegex();

// Statement beginning "name =" — assignment target in group 1.
[GeneratedRegex(@"^([a-zA-Z_][a-zA-Z0-9_]*)\s*=")]
private static partial Regex AssignmentTargetRegex();

// Statement beginning "name(" — callee in group 1.
[GeneratedRegex(@"^([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")]
private static partial Regex FunctionNameRegex();
|
||||
}
|
||||
@@ -0,0 +1,950 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Parser for Ghidra's decompiled C-like pseudo-code.
|
||||
/// </summary>
|
||||
public sealed partial class DecompiledCodeParser : IDecompiledCodeParser
|
||||
{
|
||||
private static readonly HashSet<string> s_keywords =
|
||||
[
|
||||
"if", "else", "while", "for", "do", "switch", "case", "default",
|
||||
"return", "break", "continue", "goto", "sizeof", "typedef",
|
||||
"struct", "union", "enum", "void", "int", "char", "short", "long",
|
||||
"float", "double", "unsigned", "signed", "const", "static", "extern"
|
||||
];
|
||||
|
||||
private static readonly HashSet<string> s_types =
|
||||
[
|
||||
"void", "int", "uint", "char", "uchar", "byte", "ubyte",
|
||||
"short", "ushort", "long", "ulong", "longlong", "ulonglong",
|
||||
"float", "double", "bool", "undefined", "undefined1", "undefined2",
|
||||
"undefined4", "undefined8", "pointer", "code", "dword", "qword", "word"
|
||||
];
|
||||
|
||||
/// <inheritdoc />
public DecompiledAst Parse(string code)
{
    ArgumentException.ThrowIfNullOrEmpty(code);

    // Lex, then build the AST with the hand-written recursive-descent parser.
    var root = new RecursiveParser(Tokenize(code)).ParseFunction();

    // Derive the summary metrics and detected patterns from the tree.
    return new DecompiledAst(
        root,
        CountNodes(root),
        ComputeDepth(root),
        ExtractPatterns(root));
}
|
||||
|
||||
/// <inheritdoc />
public ImmutableArray<LocalVariable> ExtractVariables(string code)
{
    var variables = new List<LocalVariable>();

    // Match variable declarations: type name [= value];
    // Ghidra style: int local_10; or undefined8 param_1;
    foreach (Match match in VariableDeclarationRegex().Matches(code))
    {
        var type = match.Groups["type"].Value;
        var name = match.Groups["name"].Value;

        // Ghidra parameter names look like "param_1", "param_2", ...
        // ("param_".Length == 6, hence AsSpan(6)).
        var isParam = name.StartsWith("param_", StringComparison.Ordinal);
        int? paramIndex = null;
        var stackOffset = 0;

        if (isParam && int.TryParse(name.AsSpan(6), out var idx))
        {
            paramIndex = idx;
        }

        // Ghidra local names encode the hex stack offset: "local_10" => -0x10.
        if (name.StartsWith("local_", StringComparison.Ordinal) &&
            int.TryParse(name.AsSpan(6), System.Globalization.NumberStyles.HexNumber, null, out var offset))
        {
            stackOffset = -offset; // Negative for locals
        }

        variables.Add(new LocalVariable(name, type, stackOffset, isParam, paramIndex));
        // FIX: removed the unused varIndex counter the original incremented
        // but never read.
    }

    return [.. variables];
}
|
||||
|
||||
/// <inheritdoc />
public ImmutableArray<string> ExtractCalledFunctions(string code)
{
    // Anything shaped like "name(" that is not a keyword or type name is a
    // call target; Distinct collapses repeats, Order makes output stable.
    var names = FunctionCallRegex()
        .Matches(code)
        .Select(m => m.Groups["name"].Value)
        .Where(n => !s_keywords.Contains(n) && !s_types.Contains(n))
        .Distinct();

    return [.. names.Order()];
}
|
||||
|
||||
/// <summary>
/// Lexes Ghidra-style pseudo-C into a flat token list. Comments and
/// whitespace are skipped; 1-based line/column positions are recorded on
/// each token.
/// </summary>
private static List<Token> Tokenize(string code)
{
    var tokens = new List<Token>();
    var i = 0;
    var line = 1;
    var column = 1;

    while (i < code.Length)
    {
        var c = code[i];

        // Whitespace: position bookkeeping only.
        if (char.IsWhiteSpace(c))
        {
            if (c == '\n')
            {
                line++;
                column = 1;
            }
            else
            {
                column++;
            }
            i++;
            continue;
        }

        // Line comment: skip to end of line ('\n' itself is handled above).
        if (i + 1 < code.Length && c == '/' && code[i + 1] == '/')
        {
            while (i < code.Length && code[i] != '\n')
            {
                i++;
            }
            continue;
        }

        // Block comment: skip to the closing "*/" (or end of input).
        if (i + 1 < code.Length && c == '/' && code[i + 1] == '*')
        {
            i += 2;
            column += 2; // FIX: column was not advanced inside block comments
            while (i + 1 < code.Length && !(code[i] == '*' && code[i + 1] == '/'))
            {
                if (code[i] == '\n')
                {
                    line++;
                    column = 1;
                }
                else
                {
                    column++;
                }
                i++;
            }
            i += 2;
            column += 2;
            continue;
        }

        var startColumn = column;

        // Identifiers and keywords.
        if (char.IsLetter(c) || c == '_')
        {
            var start = i;
            while (i < code.Length && (char.IsLetterOrDigit(code[i]) || code[i] == '_'))
            {
                i++;
                column++;
            }
            var value = code[start..i];
            var type = s_keywords.Contains(value) ? TokenType.Keyword : TokenType.Identifier;
            tokens.Add(new Token(type, value, line, startColumn));
            continue;
        }

        // Numbers: hex (0x...), decimal/float, with optional U/L suffixes.
        // (The original also tested `c == '0' && next == 'x'`, which is
        // redundant — '0' is already a digit.)
        if (char.IsDigit(c))
        {
            var start = i;
            if (c == '0' && i + 1 < code.Length && code[i + 1] == 'x')
            {
                i += 2;
                column += 2;
                while (i < code.Length && char.IsAsciiHexDigit(code[i]))
                {
                    i++;
                    column++;
                }
            }
            else
            {
                while (i < code.Length && (char.IsDigit(code[i]) || code[i] == '.'))
                {
                    i++;
                    column++;
                }
            }
            // Handle suffixes (U, L, UL, etc.)
            while (i < code.Length && (code[i] == 'U' || code[i] == 'L' || code[i] == 'u' || code[i] == 'l'))
            {
                i++;
                column++;
            }
            tokens.Add(new Token(TokenType.Number, code[start..i], line, startColumn));
            continue;
        }

        // String literals (escape-aware).
        if (c == '"')
        {
            var start = i;
            i++;
            column++;
            while (i < code.Length && code[i] != '"')
            {
                if (code[i] == '\\' && i + 1 < code.Length)
                {
                    i += 2;
                    column += 2;
                }
                else
                {
                    i++;
                    column++;
                }
            }
            // BUGFIX: on an unterminated literal the original advanced i past
            // code.Length unconditionally, making code[start..i] throw
            // ArgumentOutOfRangeException. Only consume the closing quote
            // when it is actually present.
            if (i < code.Length)
            {
                i++;
                column++;
            }
            tokens.Add(new Token(TokenType.String, code[start..i], line, startColumn));
            continue;
        }

        // Character literals (escape-aware); same EOF guard as strings.
        if (c == '\'')
        {
            var start = i;
            i++;
            column++;
            while (i < code.Length && code[i] != '\'')
            {
                if (code[i] == '\\' && i + 1 < code.Length)
                {
                    i += 2;
                    column += 2;
                }
                else
                {
                    i++;
                    column++;
                }
            }
            if (i < code.Length)
            {
                i++;
                column++;
            }
            tokens.Add(new Token(TokenType.Char, code[start..i], line, startColumn));
            continue;
        }

        // Multi-character operators.
        if (i + 1 < code.Length)
        {
            var twoChar = code.Substring(i, 2);
            if (twoChar is "==" or "!=" or "<=" or ">=" or "&&" or "||" or
                "++" or "--" or "+=" or "-=" or "*=" or "/=" or
                "<<" or ">>" or "->" or "::")
            {
                tokens.Add(new Token(TokenType.Operator, twoChar, line, startColumn));
                i += 2;
                column += 2;
                continue;
            }
        }

        // Single character operators and punctuation.
        var tokenType = c switch
        {
            '(' or ')' or '{' or '}' or '[' or ']' => TokenType.Bracket,
            ';' or ',' or ':' or '?' => TokenType.Punctuation,
            _ => TokenType.Operator
        };
        tokens.Add(new Token(tokenType, c.ToString(), line, startColumn));
        i++;
        column++;
    }

    return tokens;
}
|
||||
|
||||
/// <summary>Total number of nodes in the subtree, including the node itself.</summary>
private static int CountNodes(AstNode node) =>
    1 + node.Children.Sum(CountNodes);

/// <summary>Height of the subtree; a leaf has depth 1.</summary>
private static int ComputeDepth(AstNode node) =>
    node.Children.Length == 0
        ? 1
        : 1 + node.Children.Max(ComputeDepth);
|
||||
|
||||
/// <summary>
/// Walks the tree and records recognizable structural patterns: loop kinds,
/// error checks, and null checks. A single if-node can contribute both an
/// error check and a null check.
/// </summary>
private static ImmutableArray<AstPattern> ExtractPatterns(AstNode root)
{
    var patterns = new List<AstPattern>();

    foreach (var node in TraverseNodes(root))
    {
        // Loop patterns, keyed purely on the node type.
        switch (node.Type)
        {
            case AstNodeType.For:
                patterns.Add(new AstPattern(
                    PatternType.CountedLoop,
                    node,
                    new PatternMetadata("For loop", 0.9m, null)));
                break;

            case AstNodeType.While:
                patterns.Add(new AstPattern(
                    PatternType.ConditionalLoop,
                    node,
                    new PatternMetadata("While loop", 0.9m, null)));
                break;

            case AstNodeType.DoWhile:
                patterns.Add(new AstPattern(
                    PatternType.ConditionalLoop,
                    node,
                    new PatternMetadata("Do-while loop", 0.9m, null)));
                break;
        }

        // Branch patterns, keyed on the condition shape.
        if (node is IfNode branch)
        {
            if (IsErrorCheck(branch))
            {
                patterns.Add(new AstPattern(
                    PatternType.ErrorCheck,
                    node,
                    new PatternMetadata("Error check", 0.8m, null)));
            }

            if (IsNullCheck(branch))
            {
                patterns.Add(new AstPattern(
                    PatternType.NullCheck,
                    node,
                    new PatternMetadata("Null check", 0.9m, null)));
            }
        }
    }

    return [.. patterns];
}
|
||||
|
||||
/// <summary>
/// Pre-order traversal of the subtree rooted at <paramref name="root"/>.
/// </summary>
private static IEnumerable<AstNode> TraverseNodes(AstNode root)
{
    var stack = new Stack<AstNode>();
    stack.Push(root);

    while (stack.Count > 0)
    {
        var current = stack.Pop();
        yield return current;

        // Push right-to-left so children are yielded left-to-right.
        for (var i = current.Children.Length - 1; i >= 0; i--)
        {
            stack.Push(current.Children[i]);
        }
    }
}
|
||||
|
||||
/// <summary>
/// True when the if-condition is a binary comparison whose right operand is
/// a typical error sentinel (0, -1, 0xffffffff, NULL).
/// </summary>
private static bool IsErrorCheck(IfNode node) =>
    node.Condition is BinaryOpNode { Right: ConstantNode sentinel } &&
    sentinel.Value?.ToString() is "0" or "-1" or "0xffffffff" or "NULL";

/// <summary>
/// True when the if-condition is an equality/inequality comparison against
/// 0, NULL, or nullptr.
/// </summary>
private static bool IsNullCheck(IfNode node) =>
    node.Condition is BinaryOpNode { Operator: "==" or "!=", Right: ConstantNode sentinel } &&
    sentinel.Value?.ToString() is "0" or "NULL" or "nullptr";
|
||||
|
||||
// "type name" followed by '=' or ';' — a declaration or declaration-with-init.
// NOTE(review): this also matches non-declaration shapes like "return x =";
// presumably callers tolerate false positives — confirm.
[GeneratedRegex(@"(?<type>\w+)\s+(?<name>\w+)\s*(?:=|;)", RegexOptions.Compiled)]
private static partial Regex VariableDeclarationRegex();

// Identifier immediately followed by '(' — call site or function header.
[GeneratedRegex(@"(?<name>\w+)\s*\(", RegexOptions.Compiled)]
private static partial Regex FunctionCallRegex();
|
||||
}
|
||||
|
||||
/// <summary>Token categories produced by the pseudo-code tokenizer.</summary>
internal enum TokenType
{
    Identifier,   // names not in the keyword set (variables, functions, types)
    Keyword,      // entries of s_keywords
    Number,       // decimal/hex literal, including U/L suffixes
    String,       // double-quoted literal, quotes included in Value
    Char,         // single-quoted literal, quotes included in Value
    Operator,     // single- or two-character operator
    Bracket,      // ( ) { } [ ]
    Punctuation   // ; , : ?
}

/// <summary>A lexical token with its 1-based source line/column.</summary>
internal readonly record struct Token(TokenType Type, string Value, int Line, int Column);
|
||||
|
||||
internal sealed class RecursiveParser
|
||||
{
|
||||
private readonly List<Token> _tokens;
|
||||
private int _pos;
|
||||
|
||||
/// <summary>Creates a parser positioned at the start of the token stream.</summary>
public RecursiveParser(List<Token> tokens)
{
    _tokens = tokens;
    _pos = 0;
}
|
||||
|
||||
/// <summary>
/// Parses a single decompiled function of the shape
/// "ret-type name(params) { body }" and returns its AST root.
/// </summary>
public AstNode ParseFunction()
{
    // Parse return type
    var returnType = ParseType();

    // Parse function name
    var name = Expect(TokenType.Identifier).Value;

    // Parse parameters
    Expect(TokenType.Bracket, "(");
    var parameters = ParseParameterList();
    Expect(TokenType.Bracket, ")");

    // Parse body
    var body = ParseBlock();

    return new FunctionNode(name, returnType, parameters, body);
}
|
||||
|
||||
/// <summary>
/// Parses a type reference: optional modifiers, the base type token, and
/// any trailing '*' pointer markers, joined into a single string.
/// </summary>
private string ParseType()
{
    var sb = new System.Text.StringBuilder();

    // Leading modifiers, each followed by a space.
    while (Peek().Value is "const" or "unsigned" or "signed" or "static" or "extern")
    {
        sb.Append(Advance().Value).Append(' ');
    }

    // The base type token.
    sb.Append(Advance().Value);

    // Pointer stars attach directly (no separating space).
    while (Peek().Value == "*")
    {
        sb.Append(Advance().Value);
    }

    return sb.ToString().Trim();
}
|
||||
|
||||
private ImmutableArray<ParameterNode> ParseParameterList()
|
||||
{
|
||||
var parameters = new List<ParameterNode>();
|
||||
var index = 0;
|
||||
|
||||
if (Peek().Value == ")")
|
||||
{
|
||||
return [];
|
||||
}
|
||||
|
||||
if (Peek().Value == "void" && PeekAhead(1).Value == ")")
|
||||
{
|
||||
Advance(); // consume void
|
||||
return [];
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
if (Peek().Value == ",")
|
||||
{
|
||||
Advance();
|
||||
}
|
||||
|
||||
var type = ParseType();
|
||||
var name = Peek().Type == TokenType.Identifier ? Advance().Value : $"param_{index}";
|
||||
|
||||
parameters.Add(new ParameterNode(name, type, index));
|
||||
index++;
|
||||
}
|
||||
while (Peek().Value == ",");
|
||||
|
||||
return [.. parameters];
|
||||
}
|
||||
|
||||
private BlockNode ParseBlock()
|
||||
{
|
||||
Expect(TokenType.Bracket, "{");
|
||||
|
||||
var statements = new List<AstNode>();
|
||||
|
||||
while (Peek().Value != "}")
|
||||
{
|
||||
var stmt = ParseStatement();
|
||||
if (stmt is not null)
|
||||
{
|
||||
statements.Add(stmt);
|
||||
}
|
||||
}
|
||||
|
||||
Expect(TokenType.Bracket, "}");
|
||||
|
||||
return new BlockNode([.. statements]);
|
||||
}
|
||||
|
||||
private AstNode? ParseStatement()
|
||||
{
|
||||
var token = Peek();
|
||||
|
||||
return token.Value switch
|
||||
{
|
||||
"if" => ParseIf(),
|
||||
"while" => ParseWhile(),
|
||||
"for" => ParseFor(),
|
||||
"do" => ParseDoWhile(),
|
||||
"return" => ParseReturn(),
|
||||
"break" => ParseBreak(),
|
||||
"continue" => ParseContinue(),
|
||||
"{" => ParseBlock(),
|
||||
";" => SkipSemicolon(),
|
||||
_ => ParseExpressionStatement()
|
||||
};
|
||||
}
|
||||
|
||||
private IfNode ParseIf()
|
||||
{
|
||||
Advance(); // consume 'if'
|
||||
Expect(TokenType.Bracket, "(");
|
||||
var condition = ParseExpression();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
|
||||
var thenBranch = ParseStatement() ?? new BlockNode([]);
|
||||
|
||||
AstNode? elseBranch = null;
|
||||
if (Peek().Value == "else")
|
||||
{
|
||||
Advance();
|
||||
elseBranch = ParseStatement();
|
||||
}
|
||||
|
||||
return new IfNode(condition, thenBranch, elseBranch);
|
||||
}
|
||||
|
||||
private WhileNode ParseWhile()
|
||||
{
|
||||
Advance(); // consume 'while'
|
||||
Expect(TokenType.Bracket, "(");
|
||||
var condition = ParseExpression();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
|
||||
var body = ParseStatement() ?? new BlockNode([]);
|
||||
|
||||
return new WhileNode(condition, body);
|
||||
}
|
||||
|
||||
private ForNode ParseFor()
|
||||
{
|
||||
Advance(); // consume 'for'
|
||||
Expect(TokenType.Bracket, "(");
|
||||
|
||||
AstNode? init = null;
|
||||
if (Peek().Value != ";")
|
||||
{
|
||||
init = ParseExpression();
|
||||
}
|
||||
Expect(TokenType.Punctuation, ";");
|
||||
|
||||
AstNode? condition = null;
|
||||
if (Peek().Value != ";")
|
||||
{
|
||||
condition = ParseExpression();
|
||||
}
|
||||
Expect(TokenType.Punctuation, ";");
|
||||
|
||||
AstNode? update = null;
|
||||
if (Peek().Value != ")")
|
||||
{
|
||||
update = ParseExpression();
|
||||
}
|
||||
Expect(TokenType.Bracket, ")");
|
||||
|
||||
var body = ParseStatement() ?? new BlockNode([]);
|
||||
|
||||
return new ForNode(init, condition, update, body);
|
||||
}
|
||||
|
||||
private AstNode ParseDoWhile()
|
||||
{
|
||||
Advance(); // consume 'do'
|
||||
var body = ParseStatement() ?? new BlockNode([]);
|
||||
|
||||
Expect(TokenType.Keyword, "while");
|
||||
Expect(TokenType.Bracket, "(");
|
||||
var condition = ParseExpression();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
Expect(TokenType.Punctuation, ";");
|
||||
|
||||
return new WhileNode(condition, body); // Simplify do-while to while for now
|
||||
}
|
||||
|
||||
private ReturnNode ParseReturn()
|
||||
{
|
||||
Advance(); // consume 'return'
|
||||
|
||||
AstNode? value = null;
|
||||
if (Peek().Value != ";")
|
||||
{
|
||||
value = ParseExpression();
|
||||
}
|
||||
Expect(TokenType.Punctuation, ";");
|
||||
|
||||
return new ReturnNode(value);
|
||||
}
|
||||
|
||||
private AstNode ParseBreak()
|
||||
{
|
||||
Advance();
|
||||
Expect(TokenType.Punctuation, ";");
|
||||
return new BlockNode([]); // Simplified
|
||||
}
|
||||
|
||||
private AstNode ParseContinue()
|
||||
{
|
||||
Advance();
|
||||
Expect(TokenType.Punctuation, ";");
|
||||
return new BlockNode([]); // Simplified
|
||||
}
|
||||
|
||||
private AstNode? SkipSemicolon()
|
||||
{
|
||||
Advance();
|
||||
return null;
|
||||
}
|
||||
|
||||
private AstNode? ParseExpressionStatement()
|
||||
{
|
||||
var expr = ParseExpression();
|
||||
if (Peek().Value == ";")
|
||||
{
|
||||
Advance();
|
||||
}
|
||||
return expr;
|
||||
}
|
||||
|
||||
private AstNode ParseExpression()
|
||||
{
|
||||
return ParseAssignment();
|
||||
}
|
||||
|
||||
private AstNode ParseAssignment()
|
||||
{
|
||||
var left = ParseLogicalOr();
|
||||
|
||||
if (Peek().Value is "=" or "+=" or "-=" or "*=" or "/=" or "&=" or "|=" or "^=" or "<<=" or ">>=")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseAssignment();
|
||||
return new AssignmentNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseLogicalOr()
|
||||
{
|
||||
var left = ParseLogicalAnd();
|
||||
|
||||
while (Peek().Value == "||")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseLogicalAnd();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseLogicalAnd()
|
||||
{
|
||||
var left = ParseBitwiseOr();
|
||||
|
||||
while (Peek().Value == "&&")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseBitwiseOr();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseBitwiseOr()
|
||||
{
|
||||
var left = ParseComparison();
|
||||
|
||||
while (Peek().Value is "|" or "^" or "&")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseComparison();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseComparison()
|
||||
{
|
||||
var left = ParseShift();
|
||||
|
||||
while (Peek().Value is "==" or "!=" or "<" or ">" or "<=" or ">=")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseShift();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseShift()
|
||||
{
|
||||
var left = ParseAdditive();
|
||||
|
||||
while (Peek().Value is "<<" or ">>")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseAdditive();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseAdditive()
|
||||
{
|
||||
var left = ParseMultiplicative();
|
||||
|
||||
while (Peek().Value is "+" or "-")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseMultiplicative();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseMultiplicative()
|
||||
{
|
||||
var left = ParseUnary();
|
||||
|
||||
while (Peek().Value is "*" or "/" or "%")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var right = ParseUnary();
|
||||
left = new BinaryOpNode(left, right, op);
|
||||
}
|
||||
|
||||
return left;
|
||||
}
|
||||
|
||||
private AstNode ParseUnary()
|
||||
{
|
||||
if (Peek().Value is "!" or "~" or "-" or "+" or "*" or "&" or "++" or "--")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
var operand = ParseUnary();
|
||||
return new UnaryOpNode(operand, op, true);
|
||||
}
|
||||
|
||||
return ParsePostfix();
|
||||
}
|
||||
|
||||
private AstNode ParsePostfix()
|
||||
{
|
||||
var expr = ParsePrimary();
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (Peek().Value == "(")
|
||||
{
|
||||
// Function call
|
||||
Advance();
|
||||
var args = ParseArgumentList();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
|
||||
if (expr is VariableNode varNode)
|
||||
{
|
||||
expr = new CallNode(varNode.Name, args);
|
||||
}
|
||||
}
|
||||
else if (Peek().Value == "[")
|
||||
{
|
||||
// Array access
|
||||
Advance();
|
||||
var index = ParseExpression();
|
||||
Expect(TokenType.Bracket, "]");
|
||||
expr = new ArrayAccessNode(expr, index);
|
||||
}
|
||||
else if (Peek().Value is "." or "->")
|
||||
{
|
||||
var isPointer = Advance().Value == "->";
|
||||
var field = Expect(TokenType.Identifier).Value;
|
||||
expr = new FieldAccessNode(expr, field, isPointer);
|
||||
}
|
||||
else if (Peek().Value is "++" or "--")
|
||||
{
|
||||
var op = Advance().Value;
|
||||
expr = new UnaryOpNode(expr, op, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return expr;
|
||||
}
|
||||
|
||||
private ImmutableArray<AstNode> ParseArgumentList()
|
||||
{
|
||||
var args = new List<AstNode>();
|
||||
|
||||
if (Peek().Value == ")")
|
||||
{
|
||||
return [];
|
||||
}
|
||||
|
||||
do
|
||||
{
|
||||
if (Peek().Value == ",")
|
||||
{
|
||||
Advance();
|
||||
}
|
||||
args.Add(ParseExpression());
|
||||
}
|
||||
while (Peek().Value == ",");
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
private AstNode ParsePrimary()
|
||||
{
|
||||
var token = Peek();
|
||||
|
||||
if (token.Type == TokenType.Number)
|
||||
{
|
||||
Advance();
|
||||
return new ConstantNode(token.Value, "int");
|
||||
}
|
||||
|
||||
if (token.Type == TokenType.String)
|
||||
{
|
||||
Advance();
|
||||
return new ConstantNode(token.Value, "char*");
|
||||
}
|
||||
|
||||
if (token.Type == TokenType.Char)
|
||||
{
|
||||
Advance();
|
||||
return new ConstantNode(token.Value, "char");
|
||||
}
|
||||
|
||||
if (token.Type == TokenType.Identifier)
|
||||
{
|
||||
Advance();
|
||||
return new VariableNode(token.Value, null);
|
||||
}
|
||||
|
||||
if (token.Value == "(")
|
||||
{
|
||||
Advance();
|
||||
|
||||
// Check for cast
|
||||
if (IsType(Peek().Value))
|
||||
{
|
||||
var targetType = ParseType();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
var expr = ParseUnary();
|
||||
return new CastNode(expr, targetType);
|
||||
}
|
||||
|
||||
var inner = ParseExpression();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
return inner;
|
||||
}
|
||||
|
||||
// Handle sizeof
|
||||
if (token.Value == "sizeof")
|
||||
{
|
||||
Advance();
|
||||
Expect(TokenType.Bracket, "(");
|
||||
var type = ParseType();
|
||||
Expect(TokenType.Bracket, ")");
|
||||
return new ConstantNode($"sizeof({type})", "size_t");
|
||||
}
|
||||
|
||||
// Unknown token - return empty node
|
||||
Advance();
|
||||
return new ConstantNode(token.Value, "unknown");
|
||||
}
|
||||
|
||||
private static bool IsType(string value)
|
||||
{
|
||||
return value is "int" or "char" or "void" or "long" or "short" or "float" or "double"
|
||||
or "unsigned" or "signed" or "const" or "struct" or "union" or "enum"
|
||||
or "undefined" or "undefined1" or "undefined2" or "undefined4" or "undefined8"
|
||||
or "byte" or "word" or "dword" or "qword" or "pointer" or "code" or "uint" or "ulong";
|
||||
}
|
||||
|
||||
private Token Peek() => _pos < _tokens.Count ? _tokens[_pos] : new Token(TokenType.Punctuation, "", 0, 0);
|
||||
|
||||
private Token PeekAhead(int offset) => _pos + offset < _tokens.Count
|
||||
? _tokens[_pos + offset]
|
||||
: new Token(TokenType.Punctuation, "", 0, 0);
|
||||
|
||||
private Token Advance() => _pos < _tokens.Count ? _tokens[_pos++] : new Token(TokenType.Punctuation, "", 0, 0);
|
||||
|
||||
private Token Expect(TokenType type, string? value = null)
|
||||
{
|
||||
var token = Peek();
|
||||
if (token.Type != type || (value is not null && token.Value != value))
|
||||
{
|
||||
// Skip unexpected tokens
|
||||
return Advance();
|
||||
}
|
||||
return Advance();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Extension methods for registering decompiler services.
|
||||
/// </summary>
|
||||
public static class DecompilerServiceCollectionExtensions
{
    /// <summary>
    /// Registers the decompiler pipeline: code parser, AST comparison engine
    /// and code normalizer as singletons, plus the Ghidra-backed decompiler
    /// service as scoped.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The same collection, for chaining.</returns>
    public static IServiceCollection AddDecompilerServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        return services
            .AddSingleton<IDecompiledCodeParser, DecompiledCodeParser>()
            .AddSingleton<IAstComparisonEngine, AstComparisonEngine>()
            .AddSingleton<ICodeNormalizer, CodeNormalizer>()
            .AddScoped<IDecompilerService, GhidraDecompilerAdapter>();
    }

    /// <summary>
    /// Registers the decompiler pipeline and applies a custom
    /// <see cref="DecompilerOptions"/> configuration callback.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Callback that mutates the options instance.</param>
    /// <returns>The same collection, for chaining.</returns>
    public static IServiceCollection AddDecompilerServices(
        this IServiceCollection services,
        Action<DecompilerOptions> configureOptions)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configureOptions);

        services.Configure(configureOptions);
        return services.AddDecompilerServices();
    }
}
|
||||
@@ -0,0 +1,291 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Adapter for Ghidra's decompiler via headless analysis.
|
||||
/// </summary>
|
||||
public sealed class GhidraDecompilerAdapter : IDecompilerService
{
    private readonly IGhidraService _ghidraService;
    private readonly IDecompiledCodeParser _parser;
    private readonly IAstComparisonEngine _comparisonEngine;
    // NOTE(review): not read anywhere in this class yet; retained so configured
    // DecompilerOptions (e.g. DefaultTimeout) can be honored later — confirm
    // intended use before removing.
    private readonly DecompilerOptions _options;
    private readonly ILogger<GhidraDecompilerAdapter> _logger;

    public GhidraDecompilerAdapter(
        IGhidraService ghidraService,
        IDecompiledCodeParser parser,
        IAstComparisonEngine comparisonEngine,
        IOptions<DecompilerOptions> options,
        ILogger<GhidraDecompilerAdapter> logger)
    {
        _ghidraService = ghidraService;
        _parser = parser;
        _comparisonEngine = comparisonEngine;
        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Runs synchronously: the decompiled code is already attached to the
    /// <see cref="GhidraFunction"/> by the earlier analysis pass, so there is
    /// nothing to await. (The original was declared <c>async</c> with no
    /// awaits — compiler warning CS1998.)
    /// </remarks>
    public Task<DecompiledFunction> DecompileAsync(
        GhidraFunction function,
        DecompileOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(function);

        options ??= new DecompileOptions();
        ct.ThrowIfCancellationRequested();

        _logger.LogDebug(
            "Decompiling function {Name} at 0x{Address:X}",
            function.Name,
            function.Address);

        // The GhidraFunction should already have decompiled code from analysis.
        var code = function.DecompiledCode;

        if (string.IsNullOrEmpty(code))
        {
            _logger.LogWarning(
                "Function {Name} has no decompiled code, returning stub",
                function.Name);

            return Task.FromResult(new DecompiledFunction(
                function.Name,
                BuildSignature(function),
                "/* Decompilation unavailable */",
                null,
                [],
                [],
                function.Address,
                function.Size));
        }

        // Truncate overly long output so downstream parsing stays bounded.
        if (code.Length > options.MaxCodeLength)
        {
            code = code[..options.MaxCodeLength] + "\n/* ... truncated ... */";
        }

        // Parse to AST. Parsing is best-effort: on failure the function is
        // still returned, just with a null AST.
        DecompiledAst? ast = null;
        try
        {
            ast = _parser.Parse(code);
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to parse decompiled code for {Name}", function.Name);
        }

        // Extract lightweight metadata from the code text.
        var locals = _parser.ExtractVariables(code);
        var calledFunctions = _parser.ExtractCalledFunctions(code);

        return Task.FromResult(new DecompiledFunction(
            function.Name,
            BuildSignature(function),
            code,
            ast,
            locals,
            calledFunctions,
            function.Address,
            function.Size));
    }

    /// <inheritdoc />
    /// <remarks>
    /// Runs a full Ghidra analysis with decompilation enabled, then locates
    /// the requested function by exact address match.
    /// </remarks>
    /// <exception cref="InvalidOperationException">
    /// No function exists at <paramref name="address"/>.
    /// </exception>
    public async Task<DecompiledFunction> DecompileAtAddressAsync(
        string binaryPath,
        ulong address,
        DecompileOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(binaryPath);

        options ??= new DecompileOptions();

        _logger.LogDebug(
            "Decompiling function at 0x{Address:X} in {Binary}",
            address,
            Path.GetFileName(binaryPath));

        // Use Ghidra to analyze the binary and get its functions.
        using var stream = File.OpenRead(binaryPath);
        var analysis = await _ghidraService.AnalyzeAsync(
            stream,
            new GhidraAnalysisOptions
            {
                IncludeDecompilation = true,
                ExtractDecompilation = true
            },
            ct);

        var function = analysis.Functions.FirstOrDefault(f => f.Address == address);

        if (function is null)
        {
            throw new InvalidOperationException($"No function found at address 0x{address:X}");
        }

        return await DecompileAsync(function, options, ct);
    }

    /// <inheritdoc />
    public Task<DecompiledAst> ParseToAstAsync(
        string decompiledCode,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(decompiledCode);

        ct.ThrowIfCancellationRequested();

        var ast = _parser.Parse(decompiledCode);
        return Task.FromResult(ast);
    }

    /// <inheritdoc />
    /// <remarks>
    /// Both functions must carry parsed ASTs; otherwise a zero-similarity,
    /// low-confidence result is returned rather than throwing.
    /// </remarks>
    public Task<DecompiledComparisonResult> CompareAsync(
        DecompiledFunction a,
        DecompiledFunction b,
        ComparisonOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        options ??= new ComparisonOptions();
        ct.ThrowIfCancellationRequested();

        _logger.LogDebug(
            "Comparing functions {A} and {B}",
            a.FunctionName,
            b.FunctionName);

        // Need ASTs for comparison
        if (a.Ast is null || b.Ast is null)
        {
            _logger.LogWarning("Cannot compare functions without ASTs");

            return Task.FromResult(new DecompiledComparisonResult(
                Similarity: 0,
                StructuralSimilarity: 0,
                SemanticSimilarity: 0,
                EditDistance: new AstEditDistance(0, 0, 0, 0, 1.0m),
                Equivalences: [],
                Differences: [],
                Confidence: ComparisonConfidence.Low));
        }

        // Compute structural similarity
        var structuralSimilarity = _comparisonEngine.ComputeStructuralSimilarity(a.Ast, b.Ast);

        // Compute edit distance
        var editDistance = _comparisonEngine.ComputeEditDistance(a.Ast, b.Ast);

        // Find semantic equivalences
        var equivalences = _comparisonEngine.FindEquivalences(a.Ast, b.Ast);

        // Find differences
        var differences = _comparisonEngine.FindDifferences(a.Ast, b.Ast);

        // Semantic similarity = share of nodes (of the larger AST) that have
        // an equivalent counterpart.
        var totalNodes = Math.Max(a.Ast.NodeCount, b.Ast.NodeCount);
        var equivalentNodes = equivalences.Length;
        var semanticSimilarity = totalNodes > 0
            ? (decimal)equivalentNodes / totalNodes
            : 0m;

        // Combine into overall similarity
        var overallSimilarity = ComputeOverallSimilarity(
            structuralSimilarity,
            semanticSimilarity,
            editDistance.NormalizedDistance);

        // Determine confidence
        var confidence = DetermineConfidence(
            overallSimilarity,
            a.Ast.NodeCount,
            b.Ast.NodeCount,
            equivalences.Length);

        return Task.FromResult(new DecompiledComparisonResult(
            Similarity: overallSimilarity,
            StructuralSimilarity: structuralSimilarity,
            SemanticSimilarity: semanticSimilarity,
            EditDistance: editDistance,
            Equivalences: equivalences,
            Differences: differences,
            Confidence: confidence));
    }

    /// <summary>
    /// Uses the signature reported by Ghidra when present, otherwise a
    /// placeholder "void name(void)".
    /// </summary>
    private static string BuildSignature(GhidraFunction function)
    {
        if (!string.IsNullOrEmpty(function.Signature))
        {
            return function.Signature;
        }

        // Default signature if none available
        return $"void {function.Name}(void)";
    }

    /// <summary>
    /// Blends the three metrics: 40% structural, 40% semantic, 20% inverted
    /// edit distance. All inputs and the result are in [0, 1].
    /// </summary>
    private static decimal ComputeOverallSimilarity(
        decimal structural,
        decimal semantic,
        decimal normalizedEditDistance)
    {
        var editSimilarity = 1.0m - normalizedEditDistance;
        return structural * 0.4m + semantic * 0.4m + editSimilarity * 0.2m;
    }

    /// <summary>
    /// Heuristic confidence bucketing: tiny functions (&lt;5 nodes) are always
    /// Low; otherwise thresholds on similarity plus the fraction of nodes with
    /// found equivalences.
    /// </summary>
    private static ComparisonConfidence DetermineConfidence(
        decimal similarity,
        int nodeCountA,
        int nodeCountB,
        int equivalenceCount)
    {
        // Very small functions are harder to compare confidently
        var minNodes = Math.Min(nodeCountA, nodeCountB);
        if (minNodes < 5)
        {
            return ComparisonConfidence.Low;
        }

        // High similarity with many equivalences = high confidence
        if (similarity > 0.9m && equivalenceCount > minNodes * 0.7)
        {
            return ComparisonConfidence.VeryHigh;
        }

        if (similarity > 0.7m && equivalenceCount > minNodes * 0.5)
        {
            return ComparisonConfidence.High;
        }

        if (similarity > 0.5m)
        {
            return ComparisonConfidence.Medium;
        }

        return ComparisonConfidence.Low;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for the decompiler adapter.
|
||||
/// </summary>
|
||||
public sealed class DecompilerOptions
{
    /// <summary>Directory containing the Ghidra analysis scripts.</summary>
    public string GhidraScriptsPath { get; set; } = "/scripts";

    /// <summary>Default timeout applied to decompiler operations.</summary>
    public TimeSpan DefaultTimeout { get; set; } = TimeSpan.FromSeconds(30);

    /// <summary>Maximum decompiled-code length, in characters.</summary>
    public int MaxCodeLength { get; set; } = 100_000;
}
|
||||
@@ -0,0 +1,157 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// Service for decompiling binary functions to C-like pseudo-code.
|
||||
/// </summary>
|
||||
public interface IDecompilerService
{
    /// <summary>
    /// Decompile a function to C-like pseudo-code.
    /// </summary>
    /// <param name="function">Function from Ghidra analysis.</param>
    /// <param name="options">Decompilation options; null uses defaults.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Decompiled function with code and optional AST.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="function"/> is null.</exception>
    Task<DecompiledFunction> DecompileAsync(
        GhidraFunction function,
        DecompileOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Decompile a function by address.
    /// </summary>
    /// <param name="binaryPath">Path to the binary file.</param>
    /// <param name="address">Function address.</param>
    /// <param name="options">Decompilation options; null uses defaults.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Decompiled function.</returns>
    /// <exception cref="ArgumentException"><paramref name="binaryPath"/> is null or empty.</exception>
    /// <exception cref="InvalidOperationException">No function exists at <paramref name="address"/>.</exception>
    Task<DecompiledFunction> DecompileAtAddressAsync(
        string binaryPath,
        ulong address,
        DecompileOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Parse decompiled code into AST.
    /// </summary>
    /// <param name="decompiledCode">C-like pseudo-code from decompiler.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Abstract syntax tree representation.</returns>
    /// <exception cref="ArgumentException"><paramref name="decompiledCode"/> is null or empty.</exception>
    Task<DecompiledAst> ParseToAstAsync(
        string decompiledCode,
        CancellationToken ct = default);

    /// <summary>
    /// Compare two decompiled functions for semantic equivalence.
    /// Both inputs should carry parsed ASTs; implementations may degrade to a
    /// zero-similarity, low-confidence result when an AST is missing.
    /// </summary>
    /// <param name="a">First function.</param>
    /// <param name="b">Second function.</param>
    /// <param name="options">Comparison options; null uses defaults.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Comparison result with similarity metrics.</returns>
    Task<DecompiledComparisonResult> CompareAsync(
        DecompiledFunction a,
        DecompiledFunction b,
        ComparisonOptions? options = null,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
|
||||
/// Engine for comparing AST structures.
|
||||
/// </summary>
|
||||
public interface IAstComparisonEngine
{
    /// <summary>
    /// Compute structural similarity between ASTs (shape/topology only,
    /// independent of identifier names).
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>Similarity score (0.0 to 1.0).</returns>
    decimal ComputeStructuralSimilarity(DecompiledAst a, DecompiledAst b);

    /// <summary>
    /// Compute edit distance between ASTs. NormalizedDistance is consumed as
    /// (1 - distance) when blending into an overall similarity, so it is
    /// expected to lie in [0, 1].
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>Edit distance metrics.</returns>
    AstEditDistance ComputeEditDistance(DecompiledAst a, DecompiledAst b);

    /// <summary>
    /// Find semantic equivalences between ASTs (node pairs that compute the
    /// same thing even if written differently).
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>List of equivalent node pairs.</returns>
    ImmutableArray<SemanticEquivalence> FindEquivalences(DecompiledAst a, DecompiledAst b);

    /// <summary>
    /// Find differences between ASTs.
    /// </summary>
    /// <param name="a">First AST.</param>
    /// <param name="b">Second AST.</param>
    /// <returns>List of differences.</returns>
    ImmutableArray<CodeDifference> FindDifferences(DecompiledAst a, DecompiledAst b);
}
|
||||
|
||||
/// <summary>
|
||||
/// Normalizes decompiled code for comparison.
|
||||
/// </summary>
|
||||
public interface ICodeNormalizer
{
    /// <summary>
    /// Normalize decompiled code for comparison.
    /// </summary>
    /// <param name="code">Raw decompiled code.</param>
    /// <param name="options">Normalization options; null uses defaults.</param>
    /// <returns>Normalized code.</returns>
    string Normalize(string code, NormalizationOptions? options = null);

    /// <summary>
    /// Compute canonical hash of normalized code.
    /// NOTE(review): the hash algorithm is implementation-defined here; the
    /// contract only fixes the output length at 32 bytes — confirm the
    /// implementation if stable cross-version hashes are required.
    /// </summary>
    /// <param name="code">Decompiled code.</param>
    /// <returns>32-byte hash.</returns>
    byte[] ComputeCanonicalHash(string code);

    /// <summary>
    /// Normalize an AST for comparison.
    /// </summary>
    /// <param name="ast">AST to normalize.</param>
    /// <param name="options">Normalization options; null uses defaults.</param>
    /// <returns>Normalized AST.</returns>
    DecompiledAst NormalizeAst(DecompiledAst ast, NormalizationOptions? options = null);
}
|
||||
|
||||
/// <summary>
|
||||
/// Parses decompiled C-like code into AST.
|
||||
/// </summary>
|
||||
public interface IDecompiledCodeParser
{
    /// <summary>
    /// Parse decompiled code into AST.
    /// </summary>
    /// <param name="code">C-like pseudo-code.</param>
    /// <returns>Parsed AST.</returns>
    DecompiledAst Parse(string code);

    /// <summary>
    /// Extract local variables from decompiled code.
    /// </summary>
    /// <param name="code">C-like pseudo-code.</param>
    /// <returns>List of local variables.</returns>
    ImmutableArray<LocalVariable> ExtractVariables(string code);

    /// <summary>
    /// Extract called functions from decompiled code.
    /// </summary>
    /// <param name="code">C-like pseudo-code.</param>
    /// <returns>List of function names called.</returns>
    ImmutableArray<string> ExtractCalledFunctions(string code);
}
|
||||
@@ -0,0 +1,377 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Decompiler;
|
||||
|
||||
/// <summary>
|
||||
/// A function decompiled to C-like pseudo-code.
|
||||
/// </summary>
|
||||
public sealed record DecompiledFunction(
    string FunctionName,                        // name reported by the decompiler
    string Signature,                           // full C-style signature
    string Code,                                // decompiled pseudo-code (may be a stub/truncated)
    DecompiledAst? Ast,                         // null when parsing failed or was skipped
    ImmutableArray<LocalVariable> Locals,       // locals extracted from the code text
    ImmutableArray<string> CalledFunctions,     // names of functions this one calls
    ulong Address,                              // function start address in the binary
    int SizeBytes);                             // function size in bytes
|
||||
|
||||
/// <summary>
|
||||
/// AST representation of decompiled code.
|
||||
/// </summary>
|
||||
public sealed record DecompiledAst(
    AstNode Root,                          // root of the parsed tree
    int NodeCount,                         // total number of nodes in the tree
    int Depth,                             // maximum nesting depth
    ImmutableArray<AstPattern> Patterns);  // recognized code idioms, if any
|
||||
|
||||
/// <summary>
|
||||
/// Abstract syntax tree node.
|
||||
/// </summary>
|
||||
public abstract record AstNode(
    AstNodeType Type,                  // discriminator mirroring the concrete node kind
    ImmutableArray<AstNode> Children,  // child nodes, empty for leaves
    SourceLocation? Location);         // null when no positional info is available
|
||||
|
||||
/// <summary>
|
||||
/// Types of AST nodes.
|
||||
/// </summary>
|
||||
public enum AstNodeType
{
    // Structure
    Function,
    Block,
    Parameter,

    // Control flow
    If,
    While,
    For,
    DoWhile,
    Switch,
    Case,
    Default,
    Return,
    Break,
    Continue,
    Goto,
    Label,

    // Expressions
    Assignment,
    BinaryOp,
    UnaryOp,
    TernaryOp,     // cond ? a : b
    Call,
    Cast,          // C-style (type)expr
    Sizeof,

    // Operands
    Variable,
    Constant,
    StringLiteral,
    ArrayAccess,   // a[i]
    FieldAccess,   // a.b or a->b
    PointerDeref,  // *p
    AddressOf,     // &x

    // Declarations
    VariableDecl,
    TypeDef
}
|
||||
|
||||
/// <summary>
|
||||
/// Source location in decompiled code.
|
||||
/// </summary>
|
||||
// NOTE(review): line/column origin (0- or 1-based) is set by the tokenizer and
// not established here — confirm against the lexer before relying on it.
public sealed record SourceLocation(int Line, int Column, int Length);
|
||||
|
||||
/// <summary>
|
||||
/// A local variable in decompiled code.
|
||||
/// </summary>
|
||||
public sealed record LocalVariable(
    string Name,
    string Type,            // type as written in the decompiled source
    int StackOffset,        // offset reported by the decompiler; units not fixed here
    bool IsParameter,
    int? ParameterIndex);   // presumably set only when IsParameter is true — confirm at producer
|
||||
|
||||
/// <summary>
|
||||
/// A recognized code pattern.
|
||||
/// </summary>
|
||||
public sealed record AstPattern(
    PatternType Type,
    AstNode Node,                // the subtree the pattern was matched on
    PatternMetadata? Metadata);  // optional extra detail about the match
|
||||
|
||||
/// <summary>
|
||||
/// Types of code patterns.
|
||||
/// </summary>
|
||||
public enum PatternType
|
||||
{
|
||||
// Loops
|
||||
CountedLoop,
|
||||
ConditionalLoop,
|
||||
InfiniteLoop,
|
||||
LoopUnrolled,
|
||||
|
||||
// Branches
|
||||
IfElseChain,
|
||||
SwitchTable,
|
||||
ShortCircuit,
|
||||
|
||||
// Memory
|
||||
MemoryAllocation,
|
||||
MemoryDeallocation,
|
||||
BufferOperation,
|
||||
StackBuffer,
|
||||
|
||||
// Error handling
|
||||
ErrorCheck,
|
||||
NullCheck,
|
||||
BoundsCheck,
|
||||
|
||||
// Idioms
|
||||
StringOperation,
|
||||
MathOperation,
|
||||
BitwiseOperation,
|
||||
TableLookup
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Metadata about a recognized pattern.
|
||||
/// </summary>
|
||||
public sealed record PatternMetadata(
|
||||
string Description,
|
||||
decimal Confidence,
|
||||
ImmutableDictionary<string, string>? Properties);
|
||||
|
||||
/// <summary>
|
||||
/// Result of comparing two decompiled functions.
|
||||
/// </summary>
|
||||
public sealed record DecompiledComparisonResult(
|
||||
decimal Similarity,
|
||||
decimal StructuralSimilarity,
|
||||
decimal SemanticSimilarity,
|
||||
AstEditDistance EditDistance,
|
||||
ImmutableArray<SemanticEquivalence> Equivalences,
|
||||
ImmutableArray<CodeDifference> Differences,
|
||||
ComparisonConfidence Confidence);
|
||||
|
||||
/// <summary>
|
||||
/// Edit distance between ASTs.
|
||||
/// </summary>
|
||||
public sealed record AstEditDistance(
|
||||
int Insertions,
|
||||
int Deletions,
|
||||
int Modifications,
|
||||
int TotalOperations,
|
||||
decimal NormalizedDistance);
|
||||
|
||||
/// <summary>
|
||||
/// A semantic equivalence between AST nodes.
|
||||
/// </summary>
|
||||
public sealed record SemanticEquivalence(
|
||||
AstNode NodeA,
|
||||
AstNode NodeB,
|
||||
EquivalenceType Type,
|
||||
decimal Confidence,
|
||||
string? Explanation);
|
||||
|
||||
/// <summary>
|
||||
/// Types of semantic equivalence.
|
||||
/// </summary>
|
||||
public enum EquivalenceType
|
||||
{
|
||||
Identical,
|
||||
Renamed,
|
||||
Reordered,
|
||||
Optimized,
|
||||
Inlined,
|
||||
Semantically
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A difference between two pieces of code.
|
||||
/// </summary>
|
||||
public sealed record CodeDifference(
|
||||
DifferenceType Type,
|
||||
AstNode? NodeA,
|
||||
AstNode? NodeB,
|
||||
string Description);
|
||||
|
||||
/// <summary>
|
||||
/// Types of code differences.
|
||||
/// </summary>
|
||||
public enum DifferenceType
|
||||
{
|
||||
Added,
|
||||
Removed,
|
||||
Modified,
|
||||
Reordered,
|
||||
TypeChanged,
|
||||
OptimizationVariant
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Confidence level for comparison results.
|
||||
/// </summary>
|
||||
public enum ComparisonConfidence
|
||||
{
|
||||
Low,
|
||||
Medium,
|
||||
High,
|
||||
VeryHigh
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for decompilation.
|
||||
/// </summary>
|
||||
public sealed record DecompileOptions
|
||||
{
|
||||
public bool SimplifyCode { get; init; } = true;
|
||||
public bool RecoverTypes { get; init; } = true;
|
||||
public bool RecoverStructs { get; init; } = true;
|
||||
public int MaxCodeLength { get; init; } = 100_000;
|
||||
public TimeSpan Timeout { get; init; } = TimeSpan.FromSeconds(30);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for AST comparison.
|
||||
/// </summary>
|
||||
public sealed record ComparisonOptions
|
||||
{
|
||||
public bool IgnoreVariableNames { get; init; } = true;
|
||||
public bool IgnoreConstants { get; init; } = false;
|
||||
public bool DetectOptimizations { get; init; } = true;
|
||||
public decimal MinSimilarityThreshold { get; init; } = 0.5m;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for code normalization.
|
||||
/// </summary>
|
||||
public sealed record NormalizationOptions
|
||||
{
|
||||
public bool NormalizeVariables { get; init; } = true;
|
||||
public bool NormalizeFunctionCalls { get; init; } = true;
|
||||
public bool NormalizeConstants { get; init; } = false;
|
||||
public bool NormalizeWhitespace { get; init; } = true;
|
||||
public bool SortIndependentStatements { get; init; } = false;
|
||||
public ImmutableHashSet<string>? KnownFunctions { get; init; }
|
||||
|
||||
public static NormalizationOptions Default { get; } = new();
|
||||
}
|
||||
|
||||
#region Concrete AST Node Types
|
||||
|
||||
public sealed record FunctionNode(
|
||||
string Name,
|
||||
string ReturnType,
|
||||
ImmutableArray<ParameterNode> Parameters,
|
||||
BlockNode Body,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Function, [Body, .. Parameters], Location);
|
||||
|
||||
public sealed record ParameterNode(
|
||||
string Name,
|
||||
string DataType,
|
||||
int Index,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Parameter, [], Location);
|
||||
|
||||
public sealed record BlockNode(
|
||||
ImmutableArray<AstNode> Statements,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Block, Statements, Location);
|
||||
|
||||
public sealed record IfNode(
|
||||
AstNode Condition,
|
||||
AstNode ThenBranch,
|
||||
AstNode? ElseBranch,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.If, ElseBranch is null ? [Condition, ThenBranch] : [Condition, ThenBranch, ElseBranch], Location);
|
||||
|
||||
public sealed record WhileNode(
|
||||
AstNode Condition,
|
||||
AstNode Body,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.While, [Condition, Body], Location);
|
||||
|
||||
public sealed record ForNode(
|
||||
AstNode? Init,
|
||||
AstNode? Condition,
|
||||
AstNode? Update,
|
||||
AstNode Body,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.For, [Init ?? EmptyNode.Instance, Condition ?? EmptyNode.Instance, Update ?? EmptyNode.Instance, Body], Location);
|
||||
|
||||
public sealed record ReturnNode(
|
||||
AstNode? Value,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Return, Value is null ? [] : [Value], Location);
|
||||
|
||||
public sealed record AssignmentNode(
|
||||
AstNode Target,
|
||||
AstNode Value,
|
||||
string Operator,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Assignment, [Target, Value], Location);
|
||||
|
||||
public sealed record BinaryOpNode(
|
||||
AstNode Left,
|
||||
AstNode Right,
|
||||
string Operator,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.BinaryOp, [Left, Right], Location);
|
||||
|
||||
public sealed record UnaryOpNode(
|
||||
AstNode Operand,
|
||||
string Operator,
|
||||
bool IsPrefix,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.UnaryOp, [Operand], Location);
|
||||
|
||||
public sealed record CallNode(
|
||||
string FunctionName,
|
||||
ImmutableArray<AstNode> Arguments,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Call, Arguments, Location);
|
||||
|
||||
public sealed record VariableNode(
|
||||
string Name,
|
||||
string? DataType,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Variable, [], Location);
|
||||
|
||||
public sealed record ConstantNode(
|
||||
object Value,
|
||||
string DataType,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Constant, [], Location);
|
||||
|
||||
public sealed record ArrayAccessNode(
|
||||
AstNode Array,
|
||||
AstNode Index,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.ArrayAccess, [Array, Index], Location);
|
||||
|
||||
public sealed record FieldAccessNode(
|
||||
AstNode Object,
|
||||
string FieldName,
|
||||
bool IsPointer,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.FieldAccess, [Object], Location);
|
||||
|
||||
public sealed record CastNode(
|
||||
AstNode Expression,
|
||||
string TargetType,
|
||||
SourceLocation? Location = null)
|
||||
: AstNode(AstNodeType.Cast, [Expression], Location);
|
||||
|
||||
public sealed record EmptyNode() : AstNode(AstNodeType.Block, [], null)
|
||||
{
|
||||
public static EmptyNode Instance { get; } = new();
|
||||
}
|
||||
|
||||
#endregion
|
||||
@@ -0,0 +1,22 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<Description>Decompiler integration for BinaryIndex semantic analysis. Provides AST-based comparison of decompiled code.</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Ghidra\StellaOps.BinaryIndex.Ghidra.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
@@ -7,6 +7,7 @@ using System.Security.Cryptography;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
using StellaOps.BinaryIndex.Normalization;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig;
|
||||
|
||||
@@ -17,18 +18,49 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
|
||||
{
|
||||
private readonly DisassemblyService _disassemblyService;
|
||||
private readonly NormalizationService _normalizationService;
|
||||
private readonly IIrLiftingService? _irLiftingService;
|
||||
private readonly ISemanticGraphExtractor? _graphExtractor;
|
||||
private readonly ISemanticFingerprintGenerator? _fingerprintGenerator;
|
||||
private readonly ILogger<DeltaSignatureGenerator> _logger;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new delta signature generator without semantic analysis support.
|
||||
/// </summary>
|
||||
public DeltaSignatureGenerator(
|
||||
DisassemblyService disassemblyService,
|
||||
NormalizationService normalizationService,
|
||||
ILogger<DeltaSignatureGenerator> logger)
|
||||
: this(disassemblyService, normalizationService, null, null, null, logger)
|
||||
{
|
||||
_disassemblyService = disassemblyService;
|
||||
_normalizationService = normalizationService;
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new delta signature generator with optional semantic analysis support.
|
||||
/// </summary>
|
||||
public DeltaSignatureGenerator(
|
||||
DisassemblyService disassemblyService,
|
||||
NormalizationService normalizationService,
|
||||
IIrLiftingService? irLiftingService,
|
||||
ISemanticGraphExtractor? graphExtractor,
|
||||
ISemanticFingerprintGenerator? fingerprintGenerator,
|
||||
ILogger<DeltaSignatureGenerator> logger)
|
||||
{
|
||||
_disassemblyService = disassemblyService ?? throw new ArgumentNullException(nameof(disassemblyService));
|
||||
_normalizationService = normalizationService ?? throw new ArgumentNullException(nameof(normalizationService));
|
||||
_irLiftingService = irLiftingService;
|
||||
_graphExtractor = graphExtractor;
|
||||
_fingerprintGenerator = fingerprintGenerator;
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a value indicating whether semantic analysis is available.
|
||||
/// </summary>
|
||||
public bool SemanticAnalysisAvailable =>
|
||||
_irLiftingService is not null &&
|
||||
_graphExtractor is not null &&
|
||||
_fingerprintGenerator is not null;
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<DeltaSignature> GenerateSignaturesAsync(
|
||||
Stream binaryStream,
|
||||
@@ -94,11 +126,14 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
|
||||
}
|
||||
|
||||
// Generate signature from normalized bytes
|
||||
var signature = GenerateSymbolSignature(
|
||||
var signature = await GenerateSymbolSignatureAsync(
|
||||
normalized,
|
||||
symbolName,
|
||||
symbolInfo.Section ?? ".text",
|
||||
options);
|
||||
instructions,
|
||||
binary.Architecture,
|
||||
options,
|
||||
ct);
|
||||
|
||||
symbolSignatures.Add(signature);
|
||||
|
||||
@@ -218,6 +253,136 @@ public sealed class DeltaSignatureGenerator : IDeltaSignatureGenerator
|
||||
};
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<SymbolSignature> GenerateSymbolSignatureAsync(
|
||||
NormalizedFunction normalized,
|
||||
string symbolName,
|
||||
string scope,
|
||||
IReadOnlyList<DisassembledInstruction> originalInstructions,
|
||||
CpuArchitecture architecture,
|
||||
SignatureOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(normalized);
|
||||
ArgumentNullException.ThrowIfNull(symbolName);
|
||||
ArgumentNullException.ThrowIfNull(scope);
|
||||
ArgumentNullException.ThrowIfNull(originalInstructions);
|
||||
|
||||
options ??= new SignatureOptions();
|
||||
|
||||
// Get normalized bytes for hashing
|
||||
var normalizedBytes = GetNormalizedBytes(normalized);
|
||||
|
||||
// Compute the main hash
|
||||
var hashHex = ComputeHash(normalizedBytes, options.HashAlgorithm);
|
||||
|
||||
// Compute chunk hashes for resilience
|
||||
ImmutableArray<ChunkHash>? chunks = null;
|
||||
if (options.IncludeChunks && normalizedBytes.Length >= options.ChunkSize)
|
||||
{
|
||||
chunks = ComputeChunkHashes(normalizedBytes, options.ChunkSize, options.HashAlgorithm);
|
||||
}
|
||||
|
||||
// Compute CFG metrics using proper CFG analysis
|
||||
int? bbCount = null;
|
||||
string? cfgEdgeHash = null;
|
||||
if (options.IncludeCfg && normalized.Instructions.Length > 0)
|
||||
{
|
||||
// Use first instruction's address as start address
|
||||
var startAddress = normalized.Instructions[0].OriginalAddress;
|
||||
var cfgMetrics = CfgExtractor.ComputeMetrics(
|
||||
normalized.Instructions.ToList(),
|
||||
startAddress);
|
||||
|
||||
bbCount = cfgMetrics.BasicBlockCount;
|
||||
cfgEdgeHash = cfgMetrics.EdgeHash;
|
||||
}
|
||||
|
||||
// Compute semantic fingerprint if enabled and services available
|
||||
string? semanticHashHex = null;
|
||||
ImmutableArray<string>? semanticApiCalls = null;
|
||||
|
||||
if (options.IncludeSemantic && SemanticAnalysisAvailable && originalInstructions.Count > 0)
|
||||
{
|
||||
try
|
||||
{
|
||||
var semanticFingerprint = await ComputeSemanticFingerprintAsync(
|
||||
originalInstructions,
|
||||
symbolName,
|
||||
architecture,
|
||||
ct);
|
||||
|
||||
if (semanticFingerprint is not null)
|
||||
{
|
||||
semanticHashHex = semanticFingerprint.GraphHashHex;
|
||||
semanticApiCalls = semanticFingerprint.ApiCalls;
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(
|
||||
ex,
|
||||
"Failed to compute semantic fingerprint for {Symbol}, continuing without semantic data",
|
||||
symbolName);
|
||||
}
|
||||
}
|
||||
|
||||
return new SymbolSignature
|
||||
{
|
||||
Name = symbolName,
|
||||
Scope = scope,
|
||||
HashAlg = options.HashAlgorithm,
|
||||
HashHex = hashHex,
|
||||
SizeBytes = normalizedBytes.Length,
|
||||
CfgBbCount = bbCount,
|
||||
CfgEdgeHash = cfgEdgeHash,
|
||||
Chunks = chunks,
|
||||
SemanticHashHex = semanticHashHex,
|
||||
SemanticApiCalls = semanticApiCalls
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<SemanticFingerprint?> ComputeSemanticFingerprintAsync(
|
||||
IReadOnlyList<DisassembledInstruction> instructions,
|
||||
string functionName,
|
||||
CpuArchitecture architecture,
|
||||
CancellationToken ct)
|
||||
{
|
||||
if (_irLiftingService is null || _graphExtractor is null || _fingerprintGenerator is null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if architecture is supported
|
||||
if (!_irLiftingService.SupportsArchitecture(architecture))
|
||||
{
|
||||
_logger.LogDebug(
|
||||
"Architecture {Arch} not supported for semantic analysis",
|
||||
architecture);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Lift to IR
|
||||
var startAddress = instructions.Count > 0 ? instructions[0].Address : 0UL;
|
||||
var lifted = await _irLiftingService.LiftToIrAsync(
|
||||
instructions,
|
||||
functionName,
|
||||
startAddress,
|
||||
architecture,
|
||||
ct: ct);
|
||||
|
||||
// Extract semantic graph
|
||||
var graph = await _graphExtractor.ExtractGraphAsync(lifted, ct: ct);
|
||||
|
||||
// Generate fingerprint
|
||||
var fingerprint = await _fingerprintGenerator.GenerateAsync(
|
||||
graph,
|
||||
startAddress,
|
||||
ct: ct);
|
||||
|
||||
return fingerprint;
|
||||
}
|
||||
|
||||
private static byte[] GetNormalizedBytes(NormalizedFunction normalized)
|
||||
{
|
||||
// Concatenate all normalized instruction bytes
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
using StellaOps.BinaryIndex.Normalization;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig;
|
||||
@@ -49,4 +50,24 @@ public interface IDeltaSignatureGenerator
|
||||
string symbolName,
|
||||
string scope,
|
||||
SignatureOptions? options = null);
|
||||
|
||||
/// <summary>
|
||||
/// Generates a signature for a single symbol with optional semantic analysis.
|
||||
/// </summary>
|
||||
/// <param name="normalized">The normalized function with instructions.</param>
|
||||
/// <param name="symbolName">Name of the symbol.</param>
|
||||
/// <param name="scope">Section containing the symbol.</param>
|
||||
/// <param name="originalInstructions">Original disassembled instructions for semantic analysis.</param>
|
||||
/// <param name="architecture">CPU architecture for IR lifting.</param>
|
||||
/// <param name="options">Generation options.</param>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>The symbol signature with CFG metrics and optional semantic fingerprint.</returns>
|
||||
Task<SymbolSignature> GenerateSymbolSignatureAsync(
|
||||
NormalizedFunction normalized,
|
||||
string symbolName,
|
||||
string scope,
|
||||
IReadOnlyList<DisassembledInstruction> originalInstructions,
|
||||
CpuArchitecture architecture,
|
||||
SignatureOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
@@ -13,11 +13,13 @@ namespace StellaOps.BinaryIndex.DeltaSig;
|
||||
/// <param name="IncludeChunks">Include rolling chunk hashes for resilience.</param>
|
||||
/// <param name="ChunkSize">Size of rolling chunks in bytes (default 2KB).</param>
|
||||
/// <param name="HashAlgorithm">Hash algorithm to use (default sha256).</param>
|
||||
/// <param name="IncludeSemantic">Include IR-level semantic fingerprints for optimization-resilient matching.</param>
|
||||
public sealed record SignatureOptions(
|
||||
bool IncludeCfg = true,
|
||||
bool IncludeChunks = true,
|
||||
int ChunkSize = 2048,
|
||||
string HashAlgorithm = "sha256");
|
||||
string HashAlgorithm = "sha256",
|
||||
bool IncludeSemantic = false);
|
||||
|
||||
/// <summary>
|
||||
/// Request for generating delta signatures from a binary.
|
||||
@@ -190,6 +192,17 @@ public sealed record SymbolSignature
|
||||
/// Rolling chunk hashes for resilience against small changes.
|
||||
/// </summary>
|
||||
public ImmutableArray<ChunkHash>? Chunks { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Semantic fingerprint hash based on IR-level analysis (hex string).
|
||||
/// Provides resilience against compiler optimizations and instruction reordering.
|
||||
/// </summary>
|
||||
public string? SemanticHashHex { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// API calls extracted from semantic analysis (for semantic anchoring).
|
||||
/// </summary>
|
||||
public ImmutableArray<string>? SemanticApiCalls { get; init; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
||||
@@ -2,8 +2,10 @@
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
using StellaOps.BinaryIndex.Normalization;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.DeltaSig;
|
||||
|
||||
@@ -15,17 +17,52 @@ public static class ServiceCollectionExtensions
|
||||
/// <summary>
|
||||
/// Adds delta signature generation and matching services.
|
||||
/// Requires disassembly and normalization services to be registered.
|
||||
/// If semantic services are registered, semantic fingerprinting will be available.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection.</param>
|
||||
/// <returns>The service collection for chaining.</returns>
|
||||
public static IServiceCollection AddDeltaSignatures(this IServiceCollection services)
|
||||
{
|
||||
services.AddSingleton<IDeltaSignatureGenerator, DeltaSignatureGenerator>();
|
||||
services.AddSingleton<IDeltaSignatureGenerator>(sp =>
|
||||
{
|
||||
var disassembly = sp.GetRequiredService<DisassemblyService>();
|
||||
var normalization = sp.GetRequiredService<NormalizationService>();
|
||||
var logger = sp.GetRequiredService<ILogger<DeltaSignatureGenerator>>();
|
||||
|
||||
// Semantic services are optional
|
||||
var irLifting = sp.GetService<IIrLiftingService>();
|
||||
var graphExtractor = sp.GetService<ISemanticGraphExtractor>();
|
||||
var fingerprintGenerator = sp.GetService<ISemanticFingerprintGenerator>();
|
||||
|
||||
return new DeltaSignatureGenerator(
|
||||
disassembly,
|
||||
normalization,
|
||||
irLifting,
|
||||
graphExtractor,
|
||||
fingerprintGenerator,
|
||||
logger);
|
||||
});
|
||||
|
||||
services.AddSingleton<IDeltaSignatureMatcher, DeltaSignatureMatcher>();
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds delta signature services with semantic analysis support enabled.
|
||||
/// Requires disassembly and normalization services to be registered.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection.</param>
|
||||
/// <returns>The service collection for chaining.</returns>
|
||||
public static IServiceCollection AddDeltaSignaturesWithSemantic(this IServiceCollection services)
|
||||
{
|
||||
// Register semantic services first
|
||||
services.AddBinaryIndexSemantic();
|
||||
|
||||
// Then register delta signature services
|
||||
return services.AddDeltaSignatures();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds all binary index services: disassembly, normalization, and delta signatures.
|
||||
/// </summary>
|
||||
@@ -44,4 +81,26 @@ public static class ServiceCollectionExtensions
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds all binary index services with semantic analysis: disassembly, normalization, semantic, and delta signatures.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection.</param>
|
||||
/// <returns>The service collection for chaining.</returns>
|
||||
public static IServiceCollection AddBinaryIndexServicesWithSemantic(this IServiceCollection services)
|
||||
{
|
||||
// Add disassembly with default plugins
|
||||
services.AddDisassemblyServices();
|
||||
|
||||
// Add normalization pipelines
|
||||
services.AddNormalizationPipelines();
|
||||
|
||||
// Add semantic analysis services
|
||||
services.AddBinaryIndexSemantic();
|
||||
|
||||
// Add delta signature services (will pick up semantic services)
|
||||
services.AddDeltaSignatures();
|
||||
|
||||
return services;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly\StellaOps.BinaryIndex.Disassembly.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Normalization\StellaOps.BinaryIndex.Normalization.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -66,4 +66,81 @@ public static class DisassemblyServiceCollectionExtensions
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds the hybrid disassembly service with fallback logic between plugins.
|
||||
/// This replaces the standard disassembly service with a hybrid version that
|
||||
/// automatically falls back to secondary plugins when primary quality is low.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection.</param>
|
||||
/// <param name="configuration">Configuration for binding options.</param>
|
||||
/// <returns>The service collection for chaining.</returns>
|
||||
public static IServiceCollection AddHybridDisassemblyServices(
|
||||
this IServiceCollection services,
|
||||
IConfiguration configuration)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(configuration);
|
||||
|
||||
// Register standard options
|
||||
services.AddOptions<DisassemblyOptions>()
|
||||
.Bind(configuration.GetSection(DisassemblyOptions.SectionName))
|
||||
.ValidateOnStart();
|
||||
|
||||
// Register hybrid options
|
||||
services.AddOptions<HybridDisassemblyOptions>()
|
||||
.Bind(configuration.GetSection(HybridDisassemblyOptions.SectionName))
|
||||
.ValidateOnStart();
|
||||
|
||||
// Register the plugin registry
|
||||
services.TryAddSingleton<IDisassemblyPluginRegistry, DisassemblyPluginRegistry>();
|
||||
|
||||
// Register hybrid service as IDisassemblyService
|
||||
services.AddSingleton<HybridDisassemblyService>();
|
||||
services.AddSingleton<IDisassemblyService>(sp => sp.GetRequiredService<HybridDisassemblyService>());
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adds the hybrid disassembly service with configuration actions.
|
||||
/// </summary>
|
||||
/// <param name="services">The service collection.</param>
|
||||
/// <param name="configureHybrid">Action to configure hybrid options.</param>
|
||||
/// <param name="configureDisassembly">Optional action to configure standard options.</param>
|
||||
/// <returns>The service collection for chaining.</returns>
|
||||
public static IServiceCollection AddHybridDisassemblyServices(
|
||||
this IServiceCollection services,
|
||||
Action<HybridDisassemblyOptions> configureHybrid,
|
||||
Action<DisassemblyOptions>? configureDisassembly = null)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(services);
|
||||
ArgumentNullException.ThrowIfNull(configureHybrid);
|
||||
|
||||
// Register standard options
|
||||
if (configureDisassembly != null)
|
||||
{
|
||||
services.AddOptions<DisassemblyOptions>()
|
||||
.Configure(configureDisassembly)
|
||||
.ValidateOnStart();
|
||||
}
|
||||
else
|
||||
{
|
||||
services.AddOptions<DisassemblyOptions>();
|
||||
}
|
||||
|
||||
// Register hybrid options
|
||||
services.AddOptions<HybridDisassemblyOptions>()
|
||||
.Configure(configureHybrid)
|
||||
.ValidateOnStart();
|
||||
|
||||
// Register the plugin registry
|
||||
services.TryAddSingleton<IDisassemblyPluginRegistry, DisassemblyPluginRegistry>();
|
||||
|
||||
// Register hybrid service as IDisassemblyService
|
||||
services.AddSingleton<HybridDisassemblyService>();
|
||||
services.AddSingleton<IDisassemblyService>(sp => sp.GetRequiredService<HybridDisassemblyService>());
|
||||
|
||||
return services;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,572 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Disassembly;
|
||||
|
||||
/// <summary>
|
||||
/// Configuration options for hybrid disassembly with fallback logic.
|
||||
/// </summary>
|
||||
public sealed class HybridDisassemblyOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Configuration section name.
|
||||
/// </summary>
|
||||
public const string SectionName = "HybridDisassembly";
|
||||
|
||||
/// <summary>
|
||||
/// Primary plugin ID to try first. If null, auto-selects highest priority plugin.
|
||||
/// </summary>
|
||||
public string? PrimaryPluginId { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Fallback plugin ID to use when primary fails quality threshold.
|
||||
/// </summary>
|
||||
public string? FallbackPluginId { get; set; }
|
||||
|
||||
/// <summary>
|
||||
/// Minimum confidence score (0.0-1.0) required to accept primary plugin results.
|
||||
/// If primary result confidence is below this, fallback is attempted.
|
||||
/// </summary>
|
||||
public double MinConfidenceThreshold { get; set; } = 0.7;
|
||||
|
||||
/// <summary>
|
||||
/// Minimum function discovery count. If primary finds fewer functions, fallback is attempted.
|
||||
/// </summary>
|
||||
public int MinFunctionCount { get; set; } = 1;
|
||||
|
||||
/// <summary>
|
||||
/// Minimum instruction decode success rate (0.0-1.0).
|
||||
/// </summary>
|
||||
public double MinDecodeSuccessRate { get; set; } = 0.8;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to automatically fallback when primary plugin doesn't support the architecture.
|
||||
/// </summary>
|
||||
public bool AutoFallbackOnUnsupported { get; set; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to enable hybrid fallback logic at all. If false, behaves like standard service.
|
||||
/// </summary>
|
||||
public bool EnableFallback { get; set; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Timeout in seconds for each plugin attempt.
|
||||
/// </summary>
|
||||
public int PluginTimeoutSeconds { get; set; } = 120;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of a disassembly operation with quality metrics.
|
||||
/// </summary>
|
||||
public sealed record DisassemblyQualityResult
|
||||
{
|
||||
/// <summary>
|
||||
/// The loaded binary information.
|
||||
/// </summary>
|
||||
public required BinaryInfo Binary { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// The plugin that produced this result.
|
||||
/// </summary>
|
||||
public required IDisassemblyPlugin Plugin { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Discovered code regions.
|
||||
/// </summary>
|
||||
public required ImmutableArray<CodeRegion> CodeRegions { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Discovered symbols/functions.
|
||||
/// </summary>
|
||||
public required ImmutableArray<SymbolInfo> Symbols { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Total instructions disassembled across all regions.
|
||||
/// </summary>
|
||||
public int TotalInstructions { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Successfully decoded instructions count.
|
||||
/// </summary>
|
||||
public int DecodedInstructions { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Failed/invalid instruction count.
|
||||
/// </summary>
|
||||
public int FailedInstructions { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Confidence score (0.0-1.0) based on quality metrics.
|
||||
/// </summary>
|
||||
public double Confidence { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Whether this result came from a fallback plugin.
|
||||
/// </summary>
|
||||
public bool UsedFallback { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Reason for fallback if applicable.
|
||||
/// </summary>
|
||||
public string? FallbackReason { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Decode success rate (DecodedInstructions / TotalInstructions).
|
||||
/// </summary>
|
||||
public double DecodeSuccessRate =>
|
||||
TotalInstructions > 0 ? (double)DecodedInstructions / TotalInstructions : 0.0;
|
||||
}
|
||||
|
||||
/// <summary>
/// Hybrid disassembly service that implements smart routing between plugins
/// with quality-based fallback logic (e.g., B2R2 primary -> Ghidra fallback).
/// </summary>
public sealed class HybridDisassemblyService : IDisassemblyService
{
    private readonly IDisassemblyPluginRegistry _registry;
    private readonly HybridDisassemblyOptions _options;
    private readonly ILogger<HybridDisassemblyService> _logger;

    /// <summary>
    /// Creates a new hybrid disassembly service.
    /// </summary>
    /// <param name="registry">The plugin registry.</param>
    /// <param name="options">Hybrid options.</param>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException">Thrown when any dependency is null.</exception>
    public HybridDisassemblyService(
        IDisassemblyPluginRegistry registry,
        IOptions<HybridDisassemblyOptions> options,
        ILogger<HybridDisassemblyService> logger)
    {
        _registry = registry ?? throw new ArgumentNullException(nameof(registry));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public IDisassemblyPluginRegistry Registry => _registry;

    /// <inheritdoc />
    public (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadBinary(Stream stream, string? preferredPluginId = null)
    {
        ArgumentNullException.ThrowIfNull(stream);

        // Buffer the full stream so the span-based overload can do magic-byte sniffing.
        using var memStream = new MemoryStream();
        stream.CopyTo(memStream);
        return LoadBinary(memStream.ToArray(), preferredPluginId);
    }

    /// <inheritdoc />
    public (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadBinary(ReadOnlySpan<byte> bytes, string? preferredPluginId = null)
    {
        // Detect format/architecture
        var format = DetectFormat(bytes);
        var architecture = DetectArchitecture(bytes, format);

        _logger.LogDebug(
            "Hybrid service: Detected format {Format} and architecture {Arch}",
            format, architecture);

        if (!_options.EnableFallback)
        {
            // Simple mode - just use the best plugin
            return LoadWithBestPlugin(bytes, architecture, format, preferredPluginId);
        }

        // Hybrid mode with fallback logic
        return LoadWithFallback(bytes, architecture, format, preferredPluginId);
    }

    /// <summary>
    /// Loads binary with quality assessment and returns detailed quality result.
    /// Runs the primary plugin first; if it falls below the configured quality
    /// thresholds and fallback is enabled, the fallback plugin's result is used
    /// when its confidence is strictly higher.
    /// </summary>
    /// <param name="bytes">The binary data.</param>
    /// <param name="preferredPluginId">Optional preferred plugin ID.</param>
    /// <returns>A quality result with metrics and fallback info.</returns>
    /// <exception cref="NotSupportedException">Thrown when no plugin supports the detected architecture/format.</exception>
    public DisassemblyQualityResult LoadBinaryWithQuality(ReadOnlySpan<byte> bytes, string? preferredPluginId = null)
    {
        var format = DetectFormat(bytes);
        var architecture = DetectArchitecture(bytes, format);

        // Try primary plugin
        var primaryPlugin = GetPrimaryPlugin(architecture, format, preferredPluginId);
        if (primaryPlugin is null)
        {
            throw new NotSupportedException(
                $"No disassembly plugin available for architecture {architecture} and format {format}");
        }

        var primaryResult = AssessQuality(primaryPlugin, bytes, architecture, format);

        // Check if primary meets quality threshold
        if (MeetsQualityThreshold(primaryResult))
        {
            _logger.LogInformation(
                "Primary plugin {Plugin} met quality threshold (confidence: {Confidence:P1})",
                primaryPlugin.Capabilities.PluginId, primaryResult.Confidence);
            return primaryResult;
        }

        // Try fallback
        if (!_options.EnableFallback)
        {
            _logger.LogWarning(
                "Primary plugin {Plugin} below threshold (confidence: {Confidence:P1}), fallback disabled",
                primaryPlugin.Capabilities.PluginId, primaryResult.Confidence);
            return primaryResult;
        }

        var fallbackPlugin = GetFallbackPlugin(primaryPlugin, architecture, format);
        if (fallbackPlugin is null)
        {
            _logger.LogWarning(
                "No fallback plugin available for {Arch}/{Format}",
                architecture, format);
            return primaryResult;
        }

        var fallbackResult = AssessQuality(fallbackPlugin, bytes, architecture, format);

        // Use fallback only if it is strictly better than the primary result.
        if (fallbackResult.Confidence > primaryResult.Confidence)
        {
            _logger.LogInformation(
                "Using fallback plugin {Plugin} (confidence: {Confidence:P1} > primary: {PrimaryConf:P1})",
                fallbackPlugin.Capabilities.PluginId, fallbackResult.Confidence, primaryResult.Confidence);

            return fallbackResult with
            {
                UsedFallback = true,
                FallbackReason = $"Primary confidence ({primaryResult.Confidence:P1}) below threshold"
            };
        }

        _logger.LogDebug(
            "Keeping primary plugin result (confidence: {Confidence:P1})",
            primaryResult.Confidence);
        return primaryResult;
    }

    #region Private Methods

    // Simple (non-fallback) routing: preferred plugin if given, else registry's best match.
    private (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadWithBestPlugin(
        ReadOnlySpan<byte> bytes,
        CpuArchitecture architecture,
        BinaryFormat format,
        string? preferredPluginId)
    {
        var plugin = GetPluginById(preferredPluginId) ?? _registry.FindPlugin(architecture, format);

        if (plugin == null)
        {
            throw new NotSupportedException(
                $"No disassembly plugin available for architecture {architecture} and format {format}");
        }

        var binary = plugin.LoadBinary(bytes, architecture, format);
        return (binary, plugin);
    }

    // Routing with fallback: use the primary unless it is missing or (when
    // AutoFallbackOnUnsupported is set) cannot handle the detected arch/format.
    private (BinaryInfo Binary, IDisassemblyPlugin Plugin) LoadWithFallback(
        ReadOnlySpan<byte> bytes,
        CpuArchitecture architecture,
        BinaryFormat format,
        string? preferredPluginId)
    {
        var primaryPlugin = GetPrimaryPlugin(architecture, format, preferredPluginId);

        if (primaryPlugin is null)
        {
            // No primary, try fallback directly
            var fallback = GetFallbackPlugin(null, architecture, format);
            if (fallback is null)
            {
                throw new NotSupportedException(
                    $"No disassembly plugin available for architecture {architecture} and format {format}");
            }
            return (fallback.LoadBinary(bytes, architecture, format), fallback);
        }

        // Check if primary supports this arch/format
        if (_options.AutoFallbackOnUnsupported && !primaryPlugin.Capabilities.CanHandle(architecture, format))
        {
            _logger.LogDebug(
                "Primary plugin {Plugin} doesn't support {Arch}/{Format}, using fallback",
                primaryPlugin.Capabilities.PluginId, architecture, format);

            var fallback = GetFallbackPlugin(primaryPlugin, architecture, format);
            if (fallback is not null)
            {
                return (fallback.LoadBinary(bytes, architecture, format), fallback);
            }
        }

        // Use primary
        return (primaryPlugin.LoadBinary(bytes, architecture, format), primaryPlugin);
    }

    // Primary selection order: explicit caller preference, configured primary, registry best match.
    private IDisassemblyPlugin? GetPrimaryPlugin(
        CpuArchitecture architecture,
        BinaryFormat format,
        string? preferredPluginId)
    {
        // Explicit preferred plugin
        if (!string.IsNullOrEmpty(preferredPluginId))
        {
            return GetPluginById(preferredPluginId);
        }

        // Configured primary plugin
        if (!string.IsNullOrEmpty(_options.PrimaryPluginId))
        {
            return GetPluginById(_options.PrimaryPluginId);
        }

        // Auto-select highest priority
        return _registry.FindPlugin(architecture, format);
    }

    // Fallback selection: configured fallback if it can handle the input,
    // otherwise the highest-priority other plugin that can.
    private IDisassemblyPlugin? GetFallbackPlugin(
        IDisassemblyPlugin? excludePlugin,
        CpuArchitecture architecture,
        BinaryFormat format)
    {
        // Explicit fallback plugin
        if (!string.IsNullOrEmpty(_options.FallbackPluginId))
        {
            var fallback = GetPluginById(_options.FallbackPluginId);
            if (fallback?.Capabilities.CanHandle(architecture, format) == true)
            {
                return fallback;
            }
        }

        // Find any other plugin that supports this arch/format
        return _registry.Plugins
            .Where(p => p != excludePlugin)
            .Where(p => p.Capabilities.CanHandle(architecture, format))
            .OrderByDescending(p => p.Capabilities.Priority)
            .FirstOrDefault();
    }

    private IDisassemblyPlugin? GetPluginById(string? pluginId)
    {
        return string.IsNullOrEmpty(pluginId) ? null : _registry.GetPlugin(pluginId);
    }

    // Loads the binary with the given plugin and samples disassembly to build
    // quality metrics. A plugin failure is converted into a zero-confidence
    // result (with the message in FallbackReason) rather than propagated.
    private DisassemblyQualityResult AssessQuality(
        IDisassemblyPlugin plugin,
        ReadOnlySpan<byte> bytes,
        CpuArchitecture architecture,
        BinaryFormat format)
    {
        try
        {
            var binary = plugin.LoadBinary(bytes, architecture, format);
            var codeRegions = plugin.GetCodeRegions(binary).ToImmutableArray();
            var symbols = plugin.GetSymbols(binary).ToImmutableArray();

            // Assess quality by sampling disassembly
            int totalInstructions = 0;
            int decodedInstructions = 0;
            int failedInstructions = 0;

            foreach (var region in codeRegions.Take(3)) // Sample up to 3 regions
            {
                // Cap at 1000 instructions per region to bound assessment cost.
                var instructions = plugin.Disassemble(binary, region).Take(1000).ToList();
                totalInstructions += instructions.Count;

                foreach (var instr in instructions)
                {
                    // "??", "invalid" and "db" are common disassembler markers
                    // for bytes that could not be decoded as an instruction.
                    if (instr.Mnemonic.Equals("??", StringComparison.Ordinal) ||
                        instr.Mnemonic.Equals("invalid", StringComparison.OrdinalIgnoreCase) ||
                        instr.Mnemonic.Equals("db", StringComparison.OrdinalIgnoreCase))
                    {
                        failedInstructions++;
                    }
                    else
                    {
                        decodedInstructions++;
                    }
                }
            }

            // Calculate confidence
            var confidence = CalculateConfidence(
                symbols.Length,
                decodedInstructions,
                failedInstructions,
                codeRegions.Length);

            return new DisassemblyQualityResult
            {
                Binary = binary,
                Plugin = plugin,
                CodeRegions = codeRegions,
                Symbols = symbols,
                TotalInstructions = totalInstructions,
                DecodedInstructions = decodedInstructions,
                FailedInstructions = failedInstructions,
                Confidence = confidence,
                UsedFallback = false
            };
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Plugin {Plugin} failed during quality assessment", plugin.Capabilities.PluginId);

            // NOTE(review): Binary is a required non-null property; callers of a
            // zero-confidence result must not dereference it.
            return new DisassemblyQualityResult
            {
                Binary = null!,
                Plugin = plugin,
                CodeRegions = [],
                Symbols = [],
                TotalInstructions = 0,
                DecodedInstructions = 0,
                FailedInstructions = 0,
                Confidence = 0.0,
                UsedFallback = false,
                FallbackReason = $"Plugin failed: {ex.Message}"
            };
        }
    }

    // Weighted confidence: decode rate 0.5, symbol discovery 0.3 (saturates at
    // 10 symbols), region coverage 0.2 (saturates at 5 regions).
    private static double CalculateConfidence(
        int symbolCount,
        int decodedInstructions,
        int failedInstructions,
        int regionCount)
    {
        var totalInstructions = decodedInstructions + failedInstructions;
        if (totalInstructions == 0)
        {
            return 0.0;
        }

        // Decode success rate (weight: 0.5)
        var decodeRate = (double)decodedInstructions / totalInstructions;

        // Symbol discovery (weight: 0.3)
        var symbolScore = Math.Min(1.0, symbolCount / 10.0);

        // Region coverage (weight: 0.2)
        var regionScore = Math.Min(1.0, regionCount / 5.0);

        return (decodeRate * 0.5) + (symbolScore * 0.3) + (regionScore * 0.2);
    }

    // All three configured minimums must be met for a result to be accepted
    // without trying the fallback plugin.
    private bool MeetsQualityThreshold(DisassemblyQualityResult result)
    {
        if (result.Confidence < _options.MinConfidenceThreshold)
        {
            return false;
        }

        if (result.Symbols.Length < _options.MinFunctionCount)
        {
            return false;
        }

        if (result.DecodeSuccessRate < _options.MinDecodeSuccessRate)
        {
            return false;
        }

        return true;
    }

    #region Format/Architecture Detection (copied from DisassemblyService)

    // Magic-byte sniffing; anything unrecognized is treated as raw bytes.
    private static BinaryFormat DetectFormat(ReadOnlySpan<byte> bytes)
    {
        if (bytes.Length < 4) return BinaryFormat.Raw;

        // ELF magic
        if (bytes[0] == 0x7F && bytes[1] == 'E' && bytes[2] == 'L' && bytes[3] == 'F')
            return BinaryFormat.ELF;

        // PE magic
        if (bytes[0] == 'M' && bytes[1] == 'Z')
            return BinaryFormat.PE;

        // Mach-O magic (both byte orders, 32- and 64-bit variants)
        if ((bytes[0] == 0xFE && bytes[1] == 0xED && bytes[2] == 0xFA && (bytes[3] == 0xCE || bytes[3] == 0xCF)) ||
            (bytes[3] == 0xFE && bytes[2] == 0xED && bytes[1] == 0xFA && (bytes[0] == 0xCE || bytes[0] == 0xCF)))
            return BinaryFormat.MachO;

        // WASM magic
        if (bytes[0] == 0x00 && bytes[1] == 'a' && bytes[2] == 's' && bytes[3] == 'm')
            return BinaryFormat.WASM;

        return BinaryFormat.Raw;
    }

    private static CpuArchitecture DetectArchitecture(ReadOnlySpan<byte> bytes, BinaryFormat format)
    {
        return format switch
        {
            BinaryFormat.ELF when bytes.Length > 18 => DetectElfArchitecture(bytes),
            BinaryFormat.PE when bytes.Length > 0x40 => DetectPeArchitecture(bytes),
            BinaryFormat.MachO when bytes.Length > 8 => DetectMachOArchitecture(bytes),
            _ => CpuArchitecture.X86_64
        };
    }

    // Reads e_machine (little-endian u16 at offset 18) from the ELF header.
    private static CpuArchitecture DetectElfArchitecture(ReadOnlySpan<byte> bytes)
    {
        var machine = (ushort)(bytes[18] | (bytes[19] << 8));
        return machine switch
        {
            0x03 => CpuArchitecture.X86,
            0x3E => CpuArchitecture.X86_64,
            0x28 => CpuArchitecture.ARM32,
            0xB7 => CpuArchitecture.ARM64,
            0x08 => CpuArchitecture.MIPS32,
            0xF3 => CpuArchitecture.RISCV64,
            0x14 => CpuArchitecture.PPC32,
            0x02 => CpuArchitecture.SPARC,
            // Unknown machine: fall back on EI_CLASS (byte 4: 2 = 64-bit).
            _ => bytes[4] == 2 ? CpuArchitecture.X86_64 : CpuArchitecture.X86
        };
    }

    // Follows e_lfanew (offset 0x3C) to the COFF header and reads the Machine field.
    private static CpuArchitecture DetectPeArchitecture(ReadOnlySpan<byte> bytes)
    {
        var peOffset = bytes[0x3C] | (bytes[0x3D] << 8) | (bytes[0x3E] << 16) | (bytes[0x3F] << 24);
        // Compare against bytes.Length - 6 instead of peOffset + 6 so a huge
        // peOffset cannot overflow int and slip past the bounds check.
        if (peOffset < 0 || peOffset > bytes.Length - 6) return CpuArchitecture.X86;

        var machine = (ushort)(bytes[peOffset + 4] | (bytes[peOffset + 5] << 8));
        return machine switch
        {
            0x014c => CpuArchitecture.X86,
            0x8664 => CpuArchitecture.X86_64,
            0xaa64 => CpuArchitecture.ARM64,
            0x01c4 => CpuArchitecture.ARM32,
            _ => CpuArchitecture.X86
        };
    }

    // Reads cputype (u32 at offset 4), honoring the endianness implied by the magic.
    private static CpuArchitecture DetectMachOArchitecture(ReadOnlySpan<byte> bytes)
    {
        bool isBigEndian = bytes[0] == 0xFE;
        uint cpuType = isBigEndian
            ? (uint)((bytes[4] << 24) | (bytes[5] << 16) | (bytes[6] << 8) | bytes[7])
            : (uint)(bytes[4] | (bytes[5] << 8) | (bytes[6] << 16) | (bytes[7] << 24));

        return cpuType switch
        {
            0x00000007 => CpuArchitecture.X86,
            0x01000007 => CpuArchitecture.X86_64,
            0x0000000C => CpuArchitecture.ARM32,
            0x0100000C => CpuArchitecture.ARM64,
            _ => CpuArchitecture.X86_64
        };
    }

    #endregion

    #endregion
}
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.Decompiler;
|
||||
using StellaOps.BinaryIndex.ML;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ensemble;
|
||||
|
||||
/// <summary>
/// Ensemble decision engine that combines syntactic (AST), semantic (graph),
/// and ML (embedding) similarity signals into a single weighted match score.
/// </summary>
public sealed class EnsembleDecisionEngine : IEnsembleDecisionEngine
{
    private readonly IAstComparisonEngine _astEngine;
    private readonly ISemanticMatcher _semanticMatcher;
    private readonly IEmbeddingService _embeddingService;
    private readonly EnsembleOptions _defaultOptions;
    private readonly ILogger<EnsembleDecisionEngine> _logger;

    /// <summary>
    /// Creates a new ensemble decision engine.
    /// </summary>
    /// <param name="astEngine">AST comparison engine for the syntactic signal.</param>
    /// <param name="semanticMatcher">Graph matcher for the semantic signal.</param>
    /// <param name="embeddingService">Embedding service for the ML signal.</param>
    /// <param name="options">Default ensemble options (a fresh default is used when null).</param>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException">Thrown when a required dependency is null.</exception>
    public EnsembleDecisionEngine(
        IAstComparisonEngine astEngine,
        ISemanticMatcher semanticMatcher,
        IEmbeddingService embeddingService,
        IOptions<EnsembleOptions> options,
        ILogger<EnsembleDecisionEngine> logger)
    {
        _astEngine = astEngine ?? throw new ArgumentNullException(nameof(astEngine));
        _semanticMatcher = semanticMatcher ?? throw new ArgumentNullException(nameof(semanticMatcher));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _defaultOptions = options?.Value ?? new EnsembleOptions();
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public async Task<EnsembleResult> CompareAsync(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(source);
        ArgumentNullException.ThrowIfNull(target);
        ct.ThrowIfCancellationRequested();

        options ??= _defaultOptions;

        // Check for exact hash match first (optimization)
        var exactHashMatch = CheckExactHashMatch(source, target);

        // Compute individual signals, tracking how much configured weight is
        // actually backed by an available signal.
        var contributions = new List<SignalContribution>();
        var availableWeight = 0m;

        // Syntactic (AST) signal
        var syntacticContribution = ComputeSyntacticSignal(source, target, options);
        contributions.Add(syntacticContribution);
        if (syntacticContribution.IsAvailable)
        {
            availableWeight += options.SyntacticWeight;
        }

        // Semantic (graph) signal
        var semanticContribution = await ComputeSemanticSignalAsync(source, target, options, ct);
        contributions.Add(semanticContribution);
        if (semanticContribution.IsAvailable)
        {
            availableWeight += options.SemanticWeight;
        }

        // ML (embedding) signal
        var embeddingContribution = ComputeEmbeddingSignal(source, target, options);
        contributions.Add(embeddingContribution);
        if (embeddingContribution.IsAvailable)
        {
            availableWeight += options.EmbeddingWeight;
        }

        // Compute effective weights (normalize if some signals missing)
        var effectiveWeights = ComputeEffectiveWeights(contributions, options, availableWeight);

        // Update contributions with effective weights
        var adjustedContributions = AdjustContributionWeights(contributions, effectiveWeights);

        // Compute ensemble score
        var ensembleScore = ComputeEnsembleScore(adjustedContributions, exactHashMatch, options);

        // Determine match and confidence
        var isMatch = ensembleScore >= options.MatchThreshold;
        var confidence = DetermineConfidence(ensembleScore, adjustedContributions, exactHashMatch);
        var reason = BuildDecisionReason(adjustedContributions, exactHashMatch, isMatch);

        var result = new EnsembleResult
        {
            SourceFunctionId = source.FunctionId,
            TargetFunctionId = target.FunctionId,
            EnsembleScore = ensembleScore,
            Contributions = adjustedContributions.ToImmutableArray(),
            IsMatch = isMatch,
            Confidence = confidence,
            DecisionReason = reason,
            ExactHashMatch = exactHashMatch,
            AdjustedWeights = effectiveWeights
        };

        return result;
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<EnsembleResult>> FindMatchesAsync(
        FunctionAnalysis query,
        IEnumerable<FunctionAnalysis> corpus,
        EnsembleOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(query);
        ArgumentNullException.ThrowIfNull(corpus);

        options ??= _defaultOptions;
        var results = new List<EnsembleResult>();

        foreach (var candidate in corpus)
        {
            ct.ThrowIfCancellationRequested();

            var result = await CompareAsync(query, candidate, options, ct);
            // Keep only candidates that clear the minimum signal threshold.
            if (result.EnsembleScore >= options.MinimumSignalThreshold)
            {
                results.Add(result);
            }
        }

        return results
            .OrderByDescending(r => r.EnsembleScore)
            .Take(options.MaxCandidates)
            .ToImmutableArray();
    }

    /// <inheritdoc />
    public async Task<BatchComparisonResult> CompareBatchAsync(
        IEnumerable<FunctionAnalysis> sources,
        IEnumerable<FunctionAnalysis> targets,
        EnsembleOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(sources);
        ArgumentNullException.ThrowIfNull(targets);

        options ??= _defaultOptions;
        var startTime = DateTime.UtcNow;
        var results = new List<EnsembleResult>();
        // Materialize targets once; sources may be a one-shot enumerable.
        var targetList = targets.ToList();

        foreach (var source in sources)
        {
            foreach (var target in targetList)
            {
                ct.ThrowIfCancellationRequested();
                var result = await CompareAsync(source, target, options, ct);
                results.Add(result);
            }
        }

        var duration = DateTime.UtcNow - startTime;
        var statistics = ComputeStatistics(results);

        return new BatchComparisonResult
        {
            Results = results.ToImmutableArray(),
            Statistics = statistics,
            Duration = duration
        };
    }

    // True only when both sides carry a normalized code hash and the hashes
    // are element-wise equal.
    private static bool CheckExactHashMatch(FunctionAnalysis source, FunctionAnalysis target)
    {
        if (source.NormalizedCodeHash is null || target.NormalizedCodeHash is null)
        {
            return false;
        }

        return source.NormalizedCodeHash.SequenceEqual(target.NormalizedCodeHash);
    }

    // AST structural similarity; unavailable when either side has no AST.
    private SignalContribution ComputeSyntacticSignal(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions options)
    {
        if (source.Ast is null || target.Ast is null)
        {
            return new SignalContribution
            {
                SignalType = SignalType.Syntactic,
                RawScore = 0m,
                Weight = options.SyntacticWeight,
                IsAvailable = false,
                Quality = SignalQuality.Unavailable
            };
        }

        var similarity = _astEngine.ComputeStructuralSimilarity(source.Ast, target.Ast);
        var quality = AssessAstQuality(source.Ast, target.Ast);

        return new SignalContribution
        {
            SignalType = SignalType.Syntactic,
            RawScore = similarity,
            Weight = options.SyntacticWeight,
            IsAvailable = true,
            Quality = quality
        };
    }

    // Semantic graph similarity; unavailable when either side has no graph.
    private async Task<SignalContribution> ComputeSemanticSignalAsync(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions options,
        CancellationToken ct)
    {
        if (source.SemanticGraph is null || target.SemanticGraph is null)
        {
            return new SignalContribution
            {
                SignalType = SignalType.Semantic,
                RawScore = 0m,
                Weight = options.SemanticWeight,
                IsAvailable = false,
                Quality = SignalQuality.Unavailable
            };
        }

        var similarity = await _semanticMatcher.ComputeGraphSimilarityAsync(
            source.SemanticGraph,
            target.SemanticGraph,
            ct);
        var quality = AssessGraphQuality(source.SemanticGraph, target.SemanticGraph);

        return new SignalContribution
        {
            SignalType = SignalType.Semantic,
            RawScore = similarity,
            Weight = options.SemanticWeight,
            IsAvailable = true,
            Quality = quality
        };
    }

    // Embedding cosine similarity; unavailable when either side has no embedding.
    private SignalContribution ComputeEmbeddingSignal(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions options)
    {
        if (source.Embedding is null || target.Embedding is null)
        {
            return new SignalContribution
            {
                SignalType = SignalType.Embedding,
                RawScore = 0m,
                Weight = options.EmbeddingWeight,
                IsAvailable = false,
                Quality = SignalQuality.Unavailable
            };
        }

        var similarity = _embeddingService.ComputeSimilarity(
            source.Embedding,
            target.Embedding,
            SimilarityMetric.Cosine);

        return new SignalContribution
        {
            SignalType = SignalType.Embedding,
            RawScore = similarity,
            Weight = options.EmbeddingWeight,
            IsAvailable = true,
            Quality = SignalQuality.Normal
        };
    }

    // Quality heuristic: the smaller AST's root child count decides the tier.
    private static SignalQuality AssessAstQuality(DecompiledAst ast1, DecompiledAst ast2)
    {
        var minNodes = Math.Min(ast1.Root.Children.Length, ast2.Root.Children.Length);

        return minNodes switch
        {
            < 3 => SignalQuality.Low,
            < 10 => SignalQuality.Normal,
            _ => SignalQuality.High
        };
    }

    // Quality heuristic: the smaller graph's node count decides the tier.
    private static SignalQuality AssessGraphQuality(KeySemanticsGraph g1, KeySemanticsGraph g2)
    {
        var minNodes = Math.Min(g1.Nodes.Length, g2.Nodes.Length);

        return minNodes switch
        {
            < 3 => SignalQuality.Low,
            < 10 => SignalQuality.Normal,
            _ => SignalQuality.High
        };
    }

    // When adaptive weighting is on and some signals are missing, their weight
    // is redistributed proportionally across the available signals so the
    // effective weights still sum to 1.
    private static EffectiveWeights ComputeEffectiveWeights(
        List<SignalContribution> contributions,
        EnsembleOptions options,
        decimal availableWeight)
    {
        if (!options.AdaptiveWeights || availableWeight >= 0.999m)
        {
            return new EffectiveWeights(
                options.SyntacticWeight,
                options.SemanticWeight,
                options.EmbeddingWeight);
        }

        // Redistribute weight from unavailable signals to available ones
        var syntactic = contributions.First(c => c.SignalType == SignalType.Syntactic);
        var semantic = contributions.First(c => c.SignalType == SignalType.Semantic);
        var embedding = contributions.First(c => c.SignalType == SignalType.Embedding);

        var syntacticWeight = syntactic.IsAvailable
            ? options.SyntacticWeight / availableWeight
            : 0m;
        var semanticWeight = semantic.IsAvailable
            ? options.SemanticWeight / availableWeight
            : 0m;
        var embeddingWeight = embedding.IsAvailable
            ? options.EmbeddingWeight / availableWeight
            : 0m;

        return new EffectiveWeights(syntacticWeight, semanticWeight, embeddingWeight);
    }

    // Rewrites each contribution with its normalized (effective) weight.
    private static List<SignalContribution> AdjustContributionWeights(
        List<SignalContribution> contributions,
        EffectiveWeights weights)
    {
        return contributions.Select(c => c.SignalType switch
        {
            SignalType.Syntactic => c with { Weight = weights.Syntactic },
            SignalType.Semantic => c with { Weight = weights.Semantic },
            SignalType.Embedding => c with { Weight = weights.Embedding },
            _ => c
        }).ToList();
    }

    // Weighted sum of available signals, optionally boosted (capped at 1) by
    // an exact hash match, then clamped to [0, 1].
    private static decimal ComputeEnsembleScore(
        List<SignalContribution> contributions,
        bool exactHashMatch,
        EnsembleOptions options)
    {
        var weightedSum = contributions
            .Where(c => c.IsAvailable)
            .Sum(c => c.WeightedScore);

        // Apply exact match boost
        if (exactHashMatch && options.UseExactHashMatch)
        {
            weightedSum = Math.Min(1.0m, weightedSum + options.ExactMatchBoost);
        }

        return Math.Clamp(weightedSum, 0m, 1m);
    }

    // Confidence tiers combine score level with how many (high-quality)
    // signals were actually available to back it.
    private static ConfidenceLevel DetermineConfidence(
        decimal score,
        List<SignalContribution> contributions,
        bool exactHashMatch)
    {
        // Exact hash match is very high confidence
        if (exactHashMatch)
        {
            return ConfidenceLevel.VeryHigh;
        }

        // Count available high-quality signals
        var availableCount = contributions.Count(c => c.IsAvailable);
        var highQualityCount = contributions.Count(c =>
            c.IsAvailable && c.Quality >= SignalQuality.Normal);

        // High score with multiple agreeing signals
        if (score >= 0.95m && availableCount >= 3)
        {
            return ConfidenceLevel.VeryHigh;
        }

        if (score >= 0.90m && highQualityCount >= 2)
        {
            return ConfidenceLevel.High;
        }

        if (score >= 0.80m && availableCount >= 2)
        {
            return ConfidenceLevel.Medium;
        }

        if (score >= 0.70m)
        {
            return ConfidenceLevel.Low;
        }

        return ConfidenceLevel.VeryLow;
    }

    // Human-readable explanation of the decision, listing per-signal scores.
    private static string BuildDecisionReason(
        List<SignalContribution> contributions,
        bool exactHashMatch,
        bool isMatch)
    {
        if (exactHashMatch)
        {
            return "Exact normalized code hash match";
        }

        var availableSignals = contributions
            .Where(c => c.IsAvailable)
            .Select(c => $"{c.SignalType}: {c.RawScore:P0}")
            .ToList();

        if (availableSignals.Count == 0)
        {
            return "No signals available for comparison";
        }

        var signalSummary = string.Join(", ", availableSignals);
        return isMatch
            ? $"Match based on: {signalSummary}"
            : $"No match. Scores: {signalSummary}";
    }

    // Aggregates batch results into summary statistics.
    private static ComparisonStatistics ComputeStatistics(List<EnsembleResult> results)
    {
        var matchCount = results.Count(r => r.IsMatch);
        var highConfidenceMatches = results.Count(r =>
            r.IsMatch && r.Confidence >= ConfidenceLevel.High);
        var exactHashMatches = results.Count(r => r.ExactHashMatch);
        var averageScore = results.Count > 0
            ? results.Average(r => r.EnsembleScore)
            : 0m;

        var confidenceDistribution = results
            .GroupBy(r => r.Confidence)
            .ToImmutableDictionary(g => g.Key, g => g.Count());

        return new ComparisonStatistics
        {
            TotalComparisons = results.Count,
            MatchCount = matchCount,
            HighConfidenceMatches = highConfidenceMatches,
            ExactHashMatches = exactHashMatches,
            AverageScore = averageScore,
            ConfidenceDistribution = confidenceDistribution
        };
    }
}
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using StellaOps.BinaryIndex.Decompiler;
|
||||
using StellaOps.BinaryIndex.ML;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ensemble;
|
||||
|
||||
/// <summary>
/// Dependency-injection registration helpers for the ensemble decision engine
/// and the full binary-similarity stack it sits on top of.
/// </summary>
public static class EnsembleServiceCollectionExtensions
{
    /// <summary>
    /// Registers the ensemble decision engine, analysis builder, and weight tuner.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The same service collection, for chaining.</returns>
    public static IServiceCollection AddEnsembleServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        services.AddScoped<IEnsembleDecisionEngine, EnsembleDecisionEngine>();
        services.AddScoped<IFunctionAnalysisBuilder, FunctionAnalysisBuilder>();
        services.AddScoped<IWeightTuningService, WeightTuningService>();

        return services;
    }

    /// <summary>
    /// Registers ensemble services and applies custom <see cref="EnsembleOptions"/>.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure ensemble options.</param>
    /// <returns>The same service collection, for chaining.</returns>
    public static IServiceCollection AddEnsembleServices(
        this IServiceCollection services,
        Action<EnsembleOptions> configureOptions)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configureOptions);

        services.Configure(configureOptions);
        return services.AddEnsembleServices();
    }

    /// <summary>
    /// Registers the complete binary similarity stack
    /// (Decompiler + ML + Semantic + Ensemble) with default options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The same service collection, for chaining.</returns>
    public static IServiceCollection AddBinarySimilarityServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Underlying signal providers first, ensemble layered on top.
        services.AddDecompilerServices();
        services.AddMlServices();
        services.AddBinaryIndexSemantic();
        services.AddEnsembleServices();

        return services;
    }

    /// <summary>
    /// Registers the complete binary similarity stack, optionally applying
    /// custom ensemble and/or ML options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureEnsemble">Action to configure ensemble options.</param>
    /// <param name="configureMl">Action to configure ML options.</param>
    /// <returns>The same service collection, for chaining.</returns>
    public static IServiceCollection AddBinarySimilarityServices(
        this IServiceCollection services,
        Action<EnsembleOptions>? configureEnsemble = null,
        Action<MlOptions>? configureMl = null)
    {
        ArgumentNullException.ThrowIfNull(services);

        services.AddDecompilerServices();

        // Only route through the options-taking overloads when a delegate was supplied.
        _ = configureMl is null
            ? services.AddMlServices()
            : services.AddMlServices(configureMl);

        services.AddBinaryIndexSemantic();

        _ = configureEnsemble is null
            ? services.AddEnsembleServices()
            : services.AddEnsembleServices(configureEnsemble);

        return services;
    }
}
|
||||
@@ -0,0 +1,165 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Decompiler;
|
||||
using StellaOps.BinaryIndex.ML;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ensemble;
|
||||
|
||||
/// <summary>
/// Builds complete <see cref="FunctionAnalysis"/> instances from raw decompiled
/// code or from pre-computed components. Each signal (AST, hash, graph,
/// embedding) is produced best-effort: a failure in one signal is logged and
/// leaves that field null rather than aborting the whole analysis.
/// Cancellation, however, is always propagated — it is never swallowed as a
/// signal failure.
/// </summary>
public sealed class FunctionAnalysisBuilder : IFunctionAnalysisBuilder
{
    private readonly IDecompiledCodeParser _parser;
    private readonly ICodeNormalizer _normalizer;
    private readonly IEmbeddingService _embeddingService;
    private readonly IIrLiftingService? _irLiftingService;
    private readonly ISemanticGraphExtractor? _graphExtractor;
    private readonly ILogger<FunctionAnalysisBuilder> _logger;

    public FunctionAnalysisBuilder(
        IDecompiledCodeParser parser,
        ICodeNormalizer normalizer,
        IEmbeddingService embeddingService,
        ILogger<FunctionAnalysisBuilder> logger,
        IIrLiftingService? irLiftingService = null,
        ISemanticGraphExtractor? graphExtractor = null)
    {
        _parser = parser ?? throw new ArgumentNullException(nameof(parser));
        _normalizer = normalizer ?? throw new ArgumentNullException(nameof(normalizer));
        _embeddingService = embeddingService ?? throw new ArgumentNullException(nameof(embeddingService));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _irLiftingService = irLiftingService;
        _graphExtractor = graphExtractor;
    }

    /// <inheritdoc />
    public async Task<FunctionAnalysis> BuildAnalysisAsync(
        string functionId,
        string functionName,
        string decompiledCode,
        ulong? address = null,
        int? sizeBytes = null,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(functionId);
        ArgumentException.ThrowIfNullOrEmpty(functionName);
        ArgumentException.ThrowIfNullOrEmpty(decompiledCode);

        ct.ThrowIfCancellationRequested();

        _logger.LogDebug(
            "Building analysis for function {FunctionId} ({FunctionName})",
            functionId, functionName);

        // Parse AST (best-effort; a parse failure leaves Ast null).
        DecompiledAst? ast = null;
        try
        {
            ast = _parser.Parse(decompiledCode);
        }
        catch (OperationCanceledException)
        {
            // Never downgrade cancellation to a warning — let it propagate.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to parse AST for {FunctionId}", functionId);
        }

        // Compute normalized hash (best-effort).
        byte[]? normalizedHash = null;
        try
        {
            normalizedHash = _normalizer.ComputeCanonicalHash(decompiledCode);
        }
        catch (OperationCanceledException)
        {
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to compute normalized hash for {FunctionId}", functionId);
        }

        // Semantic graph extraction requires the original binary bytes, which
        // this entry point does not receive — only decompiled text. The graph
        // therefore stays null here even when the IR services are registered.
        KeySemanticsGraph? semanticGraph = null;
        if (_irLiftingService is not null && _graphExtractor is not null)
        {
            _logger.LogDebug(
                "Semantic graph extraction requires binary data for {FunctionId}",
                functionId);
        }

        // Generate embedding (best-effort).
        FunctionEmbedding? embedding = null;
        try
        {
            var input = new EmbeddingInput(
                DecompiledCode: decompiledCode,
                SemanticGraph: semanticGraph,
                InstructionBytes: null,
                PreferredInput: EmbeddingInputType.DecompiledCode);
            embedding = await _embeddingService.GenerateEmbeddingAsync(input, ct: ct).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogWarning(ex, "Failed to generate embedding for {FunctionId}", functionId);
        }

        return new FunctionAnalysis
        {
            FunctionId = functionId,
            FunctionName = functionName,
            Ast = ast,
            SemanticGraph = semanticGraph,
            Embedding = embedding,
            NormalizedCodeHash = normalizedHash,
            DecompiledCode = decompiledCode,
            Address = address,
            SizeBytes = sizeBytes
        };
    }

    /// <inheritdoc />
    public FunctionAnalysis BuildFromComponents(
        string functionId,
        string functionName,
        string? decompiledCode = null,
        DecompiledAst? ast = null,
        KeySemanticsGraph? semanticGraph = null,
        FunctionEmbedding? embedding = null)
    {
        ArgumentException.ThrowIfNullOrEmpty(functionId);
        ArgumentException.ThrowIfNullOrEmpty(functionName);

        // Derive the hash opportunistically when code is supplied; callers of
        // this overload provide pre-computed components, so hash failures are
        // deliberately silent here.
        byte[]? normalizedHash = null;
        if (decompiledCode is not null)
        {
            try
            {
                normalizedHash = _normalizer.ComputeCanonicalHash(decompiledCode);
            }
            catch
            {
                // Ignore normalization errors for components
            }
        }

        return new FunctionAnalysis
        {
            FunctionId = functionId,
            FunctionName = functionName,
            Ast = ast,
            SemanticGraph = semanticGraph,
            Embedding = embedding,
            NormalizedCodeHash = normalizedHash,
            DecompiledCode = decompiledCode
        };
    }
}
|
||||
@@ -0,0 +1,129 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ensemble;
|
||||
|
||||
/// <summary>
/// Combines multiple similarity signals (syntactic, semantic, embedding,
/// exact hash) into a single function-equivalence decision.
/// </summary>
public interface IEnsembleDecisionEngine
{
    /// <summary>
    /// Compares two functions using every signal available on both sides.
    /// </summary>
    /// <param name="source">Source function analysis.</param>
    /// <param name="target">Target function analysis.</param>
    /// <param name="options">Ensemble options; defaults are used when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The ensemble comparison result.</returns>
    Task<EnsembleResult> CompareAsync(
        FunctionAnalysis source,
        FunctionAnalysis target,
        EnsembleOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Finds the best-matching candidates for a query function within a corpus.
    /// </summary>
    /// <param name="query">Query function analysis.</param>
    /// <param name="corpus">Candidate functions to search.</param>
    /// <param name="options">Ensemble options; defaults are used when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The top matching functions.</returns>
    Task<ImmutableArray<EnsembleResult>> FindMatchesAsync(
        FunctionAnalysis query,
        IEnumerable<FunctionAnalysis> corpus,
        EnsembleOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Compares two sets of functions pairwise and reports aggregate statistics.
    /// </summary>
    /// <param name="sources">Source functions.</param>
    /// <param name="targets">Target functions.</param>
    /// <param name="options">Ensemble options; defaults are used when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Batch comparison result including statistics.</returns>
    Task<BatchComparisonResult> CompareBatchAsync(
        IEnumerable<FunctionAnalysis> sources,
        IEnumerable<FunctionAnalysis> targets,
        EnsembleOptions? options = null,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Optimizes the ensemble's signal weights against labeled training data.
/// </summary>
public interface IWeightTuningService
{
    /// <summary>
    /// Runs a grid search over weight combinations and returns the best one.
    /// </summary>
    /// <param name="trainingPairs">Labeled training pairs.</param>
    /// <param name="gridStep">Grid-search step size (e.g. 0.05).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The best weights found, with their metrics.</returns>
    Task<WeightTuningResult> TuneWeightsAsync(
        IEnumerable<EnsembleTrainingPair> trainingPairs,
        decimal gridStep = 0.05m,
        CancellationToken ct = default);

    /// <summary>
    /// Scores a single weight combination against labeled training data.
    /// </summary>
    /// <param name="weights">Weights to evaluate.</param>
    /// <param name="trainingPairs">Labeled training pairs.</param>
    /// <param name="threshold">Score threshold above which a pair counts as a match.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Accuracy/precision/recall/F1 for this combination.</returns>
    Task<WeightEvaluation> EvaluateWeightsAsync(
        EffectiveWeights weights,
        IEnumerable<EnsembleTrainingPair> trainingPairs,
        decimal threshold = 0.85m,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Assembles <see cref="FunctionAnalysis"/> instances, either from raw
/// decompiled code or from pre-computed signal components.
/// </summary>
public interface IFunctionAnalysisBuilder
{
    /// <summary>
    /// Builds a complete analysis from raw decompiled code, deriving every
    /// available signal.
    /// </summary>
    /// <param name="functionId">Function identifier.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="decompiledCode">Raw decompiled code.</param>
    /// <param name="address">Function address (optional).</param>
    /// <param name="sizeBytes">Function size in bytes (optional).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The complete function analysis.</returns>
    Task<FunctionAnalysis> BuildAnalysisAsync(
        string functionId,
        string functionName,
        string decompiledCode,
        ulong? address = null,
        int? sizeBytes = null,
        CancellationToken ct = default);

    /// <summary>
    /// Assembles an analysis from components the caller has already computed.
    /// </summary>
    /// <param name="functionId">Function identifier.</param>
    /// <param name="functionName">Function name.</param>
    /// <param name="decompiledCode">Raw decompiled code (optional).</param>
    /// <param name="ast">Pre-parsed AST (optional).</param>
    /// <param name="semanticGraph">Pre-built semantic graph (optional).</param>
    /// <param name="embedding">Pre-computed embedding (optional).</param>
    /// <returns>The assembled function analysis.</returns>
    FunctionAnalysis BuildFromComponents(
        string functionId,
        string functionName,
        string? decompiledCode = null,
        Decompiler.DecompiledAst? ast = null,
        Semantic.KeySemanticsGraph? semanticGraph = null,
        ML.FunctionEmbedding? embedding = null);
}
|
||||
@@ -0,0 +1,446 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Decompiler;
|
||||
using StellaOps.BinaryIndex.ML;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ensemble;
|
||||
|
||||
/// <summary>
/// Aggregated view of one function across every signal source (AST, semantic
/// graph, ML embedding, normalized hash). All signal fields are nullable:
/// a null signal simply means it was unavailable for that function.
/// </summary>
public sealed record FunctionAnalysis
{
    /// <summary>Unique identifier for the function.</summary>
    public required string FunctionId { get; init; }

    /// <summary>Function name if available.</summary>
    public required string FunctionName { get; init; }

    /// <summary>Decompiled AST representation.</summary>
    public DecompiledAst? Ast { get; init; }

    /// <summary>Semantic graph representation.</summary>
    public KeySemanticsGraph? SemanticGraph { get; init; }

    /// <summary>ML embedding representation.</summary>
    public FunctionEmbedding? Embedding { get; init; }

    /// <summary>Normalized code hash for a quick exact-equality check.</summary>
    public byte[]? NormalizedCodeHash { get; init; }

    /// <summary>Raw decompiled code.</summary>
    public string? DecompiledCode { get; init; }

    /// <summary>Binary address of the function.</summary>
    public ulong? Address { get; init; }

    /// <summary>Size of the function in bytes.</summary>
    public int? SizeBytes { get; init; }
}
|
||||
|
||||
/// <summary>
/// Configuration for ensemble decision making. The three signal weights
/// (syntactic, semantic, embedding) are expected to sum to 1.0; use
/// <see cref="AreWeightsValid"/> to check and <see cref="NormalizeWeights"/>
/// to repair.
/// </summary>
public sealed class EnsembleOptions
{
    // Allowed drift when checking that the weights sum to 1.0.
    private const decimal WeightTolerance = 0.001m;

    /// <summary>Weight for syntactic (AST-based) similarity. Default: 0.25</summary>
    public decimal SyntacticWeight { get; set; } = 0.25m;

    /// <summary>Weight for semantic (graph-based) similarity. Default: 0.35</summary>
    public decimal SemanticWeight { get; set; } = 0.35m;

    /// <summary>Weight for ML embedding similarity. Default: 0.40</summary>
    public decimal EmbeddingWeight { get; set; } = 0.40m;

    /// <summary>Minimum ensemble score for two functions to count as a match.</summary>
    public decimal MatchThreshold { get; set; } = 0.85m;

    /// <summary>Minimum score for an individual signal to be considered valid.</summary>
    public decimal MinimumSignalThreshold { get; set; } = 0.50m;

    /// <summary>Whether all three signals must be present for a match decision.</summary>
    public bool RequireAllSignals { get; set; } = false;

    /// <summary>Whether exact hash matching is used as a fast-path optimization.</summary>
    public bool UseExactHashMatch { get; set; } = true;

    /// <summary>Confidence boost applied when normalized code hashes match exactly.</summary>
    public decimal ExactMatchBoost { get; set; } = 0.10m;

    /// <summary>Maximum number of candidate matches to return.</summary>
    public int MaxCandidates { get; set; } = 10;

    /// <summary>Enables adaptive weight adjustment based on signal quality.</summary>
    public bool AdaptiveWeights { get; set; } = true;

    /// <summary>
    /// Returns true when the three signal weights sum to 1.0 (within tolerance).
    /// </summary>
    public bool AreWeightsValid() =>
        Math.Abs(SyntacticWeight + SemanticWeight + EmbeddingWeight - 1.0m) < WeightTolerance;

    /// <summary>
    /// Rescales the three signal weights so they sum to 1.0.
    /// No-op when the current total is not positive.
    /// </summary>
    public void NormalizeWeights()
    {
        var total = SyntacticWeight + SemanticWeight + EmbeddingWeight;
        if (total <= 0)
        {
            return;
        }

        SyntacticWeight /= total;
        SemanticWeight /= total;
        EmbeddingWeight /= total;
    }
}
|
||||
|
||||
/// <summary>
/// Outcome of an ensemble comparison between one source and one target
/// function, including the per-signal breakdown behind the final score.
/// </summary>
public sealed record EnsembleResult
{
    /// <summary>Source function identifier.</summary>
    public required string SourceFunctionId { get; init; }

    /// <summary>Target function identifier.</summary>
    public required string TargetFunctionId { get; init; }

    /// <summary>Final ensemble similarity score (0.0 to 1.0).</summary>
    public required decimal EnsembleScore { get; init; }

    /// <summary>Per-signal contributions to the ensemble score.</summary>
    public required ImmutableArray<SignalContribution> Contributions { get; init; }

    /// <summary>Whether the pair counts as a match under the configured threshold.</summary>
    public required bool IsMatch { get; init; }

    /// <summary>Confidence level in the match decision.</summary>
    public required ConfidenceLevel Confidence { get; init; }

    /// <summary>Human-readable reason for the match or non-match decision.</summary>
    public string? DecisionReason { get; init; }

    /// <summary>Whether an exact normalized-hash match was detected.</summary>
    public bool ExactHashMatch { get; init; }

    /// <summary>Weights actually used after adaptive adjustment, if any.</summary>
    public EffectiveWeights? AdjustedWeights { get; init; }
}
|
||||
|
||||
/// <summary>
/// One signal's share of the ensemble score: its raw similarity, the weight
/// it was given, and metadata about its availability and quality.
/// </summary>
public sealed record SignalContribution
{
    /// <summary>Type of signal.</summary>
    public required SignalType SignalType { get; init; }

    /// <summary>Raw similarity score produced by this signal.</summary>
    public required decimal RawScore { get; init; }

    /// <summary>Weight applied to this signal.</summary>
    public required decimal Weight { get; init; }

    /// <summary>This signal's weighted contribution (raw score times weight).</summary>
    public decimal WeightedScore => RawScore * Weight;

    /// <summary>Whether this signal was available for the comparison.</summary>
    public required bool IsAvailable { get; init; }

    /// <summary>Quality assessment of this signal.</summary>
    public SignalQuality Quality { get; init; } = SignalQuality.Normal;
}
|
||||
|
||||
/// <summary>
/// The kind of similarity signal feeding the ensemble.
/// </summary>
public enum SignalType
{
    /// <summary>AST-based syntactic comparison.</summary>
    Syntactic,

    /// <summary>Semantic graph comparison.</summary>
    Semantic,

    /// <summary>ML embedding cosine similarity.</summary>
    Embedding,

    /// <summary>Exact normalized code hash match.</summary>
    ExactHash
}
|
||||
|
||||
/// <summary>
/// How much a signal can be trusted for a given comparison.
/// </summary>
public enum SignalQuality
{
    /// <summary>Signal not available (data missing).</summary>
    Unavailable,

    /// <summary>Low quality signal (small function, few nodes).</summary>
    Low,

    /// <summary>Normal quality signal.</summary>
    Normal,

    /// <summary>High quality signal (rich data, high confidence).</summary>
    High
}
|
||||
|
||||
/// <summary>
/// Graded confidence in a match decision, from uncertain to near-exact.
/// Ordinal ordering matters: higher values mean stronger confidence.
/// </summary>
public enum ConfidenceLevel
{
    /// <summary>Very low confidence, likely uncertain.</summary>
    VeryLow,

    /// <summary>Low confidence, needs review.</summary>
    Low,

    /// <summary>Medium confidence, reasonable certainty.</summary>
    Medium,

    /// <summary>High confidence, strong match signals.</summary>
    High,

    /// <summary>Very high confidence, exact or near-exact match.</summary>
    VeryHigh
}
|
||||
|
||||
/// <summary>
/// The concrete (syntactic, semantic, embedding) weight triple in effect for a
/// comparison, after any adaptive adjustment.
/// </summary>
public sealed record EffectiveWeights(
    decimal Syntactic,
    decimal Semantic,
    decimal Embedding);
|
||||
|
||||
/// <summary>
/// Outcome of a batch comparison: every pairwise result, aggregate statistics,
/// and the wall-clock time the batch took.
/// </summary>
public sealed record BatchComparisonResult
{
    /// <summary>All comparison results.</summary>
    public required ImmutableArray<EnsembleResult> Results { get; init; }

    /// <summary>Summary statistics.</summary>
    public required ComparisonStatistics Statistics { get; init; }

    /// <summary>Time taken for the comparison.</summary>
    public required TimeSpan Duration { get; init; }
}
|
||||
|
||||
/// <summary>
/// Aggregate counters computed over one batch of ensemble comparisons.
/// </summary>
public sealed record ComparisonStatistics
{
    /// <summary>Total number of comparisons performed.</summary>
    public required int TotalComparisons { get; init; }

    /// <summary>Number of matches found.</summary>
    public required int MatchCount { get; init; }

    /// <summary>Number of high-confidence matches.</summary>
    public required int HighConfidenceMatches { get; init; }

    /// <summary>Number of exact hash matches.</summary>
    public required int ExactHashMatches { get; init; }

    /// <summary>Average ensemble score across all comparisons.</summary>
    public required decimal AverageScore { get; init; }

    /// <summary>How many results fell into each confidence level.</summary>
    public required ImmutableDictionary<ConfidenceLevel, int> ConfidenceDistribution { get; init; }
}
|
||||
|
||||
/// <summary>
/// Output of a weight-tuning run: the winning weight triple, the metrics it
/// achieved, and every combination that was evaluated along the way.
/// </summary>
public sealed record WeightTuningResult
{
    /// <summary>Best weights found.</summary>
    public required EffectiveWeights BestWeights { get; init; }

    /// <summary>Accuracy achieved with the best weights.</summary>
    public required decimal Accuracy { get; init; }

    /// <summary>Precision achieved with the best weights.</summary>
    public required decimal Precision { get; init; }

    /// <summary>Recall achieved with the best weights.</summary>
    public required decimal Recall { get; init; }

    /// <summary>F1 score achieved with the best weights.</summary>
    public required decimal F1Score { get; init; }

    /// <summary>All weight combinations evaluated.</summary>
    public required ImmutableArray<WeightEvaluation> Evaluations { get; init; }
}
|
||||
|
||||
/// <summary>
/// Metrics measured for one specific weight combination during tuning.
/// </summary>
public sealed record WeightEvaluation(
    EffectiveWeights Weights,
    decimal Accuracy,
    decimal Precision,
    decimal Recall,
    decimal F1Score);
|
||||
|
||||
/// <summary>
/// A labeled pair of function analyses used as ground truth for weight tuning.
/// </summary>
public sealed record EnsembleTrainingPair
{
    /// <summary>First function analysis.</summary>
    public required FunctionAnalysis Function1 { get; init; }

    /// <summary>Second function analysis.</summary>
    public required FunctionAnalysis Function2 { get; init; }

    /// <summary>Ground truth: are these functions equivalent?</summary>
    public required bool IsEquivalent { get; init; }

    /// <summary>Optional graded similarity label, for regression-style training.</summary>
    public decimal? SimilarityLabel { get; init; }
}
|
||||
@@ -0,0 +1,26 @@
|
||||
<!-- Copyright (c) StellaOps. All rights reserved. -->
<!-- Licensed under AGPL-3.0-or-later. See LICENSE in the project root. -->
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <RootNamespace>StellaOps.BinaryIndex.Ensemble</RootNamespace>
    <Description>Ensemble decision engine combining syntactic, semantic, and ML-based function similarity signals.</Description>
  </PropertyGroup>

  <!-- Signal-provider projects the ensemble layers on top of. -->
  <ItemGroup>
    <ProjectReference Include="..\StellaOps.BinaryIndex.Decompiler\StellaOps.BinaryIndex.Decompiler.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.ML\StellaOps.BinaryIndex.ML.csproj" />
    <ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
  </ItemGroup>

  <!-- Abstractions only; concrete hosting is supplied by the consuming app. -->
  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
    <PackageReference Include="Microsoft.Extensions.Options" />
  </ItemGroup>

</Project>
|
||||
@@ -0,0 +1,180 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ensemble;
|
||||
|
||||
/// <summary>
|
||||
/// Weight tuning service using grid search optimization.
|
||||
/// </summary>
|
||||
public sealed class WeightTuningService : IWeightTuningService
|
||||
{
|
||||
// Engine used to score each training pair under a candidate weight triple.
private readonly IEnsembleDecisionEngine _decisionEngine;
private readonly ILogger<WeightTuningService> _logger;

public WeightTuningService(
    IEnsembleDecisionEngine decisionEngine,
    ILogger<WeightTuningService> logger)
{
    ArgumentNullException.ThrowIfNull(decisionEngine);
    ArgumentNullException.ThrowIfNull(logger);

    _decisionEngine = decisionEngine;
    _logger = logger;
}
|
||||
|
||||
/// <inheritdoc />
/// <remarks>
/// Exhaustive grid search over (syntactic, semantic) weight pairs; the
/// embedding weight is derived as the remainder so every evaluated triple sums
/// to exactly 1.0. The best combination is the one with the highest F1 score
/// (ties keep the earlier combination).
/// </remarks>
public async Task<WeightTuningResult> TuneWeightsAsync(
    IEnumerable<EnsembleTrainingPair> trainingPairs,
    decimal gridStep = 0.05m,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(trainingPairs);

    if (gridStep <= 0 || gridStep > 0.5m)
    {
        throw new ArgumentOutOfRangeException(nameof(gridStep), "Step must be between 0 and 0.5");
    }

    var pairs = trainingPairs.ToList();
    if (pairs.Count == 0)
    {
        throw new ArgumentException("At least one training pair required", nameof(trainingPairs));
    }

    // Fixed match threshold used during tuning; mirrors the
    // EnsembleOptions.MatchThreshold default.
    const decimal EvaluationThreshold = 0.85m;

    _logger.LogInformation(
        "Starting weight tuning with {PairCount} pairs, step size {Step}",
        pairs.Count, gridStep);

    var evaluations = new List<WeightEvaluation>();
    WeightEvaluation? bestEvaluation = null;

    for (var syntactic = 0m; syntactic <= 1m; syntactic += gridStep)
    {
        for (var semantic = 0m; semantic <= 1m - syntactic; semantic += gridStep)
        {
            ct.ThrowIfCancellationRequested();

            var embedding = 1m - syntactic - semantic;

            // Defensive: cannot go negative given the inner loop bound, but
            // keeps the sum-to-one invariant explicit.
            if (embedding < 0)
            {
                continue;
            }

            var weights = new EffectiveWeights(syntactic, semantic, embedding);
            var evaluation = await EvaluateWeightsAsync(weights, pairs, EvaluationThreshold, ct).ConfigureAwait(false);
            evaluations.Add(evaluation);

            if (bestEvaluation is null || evaluation.F1Score > bestEvaluation.F1Score)
            {
                bestEvaluation = evaluation;
                _logger.LogDebug(
                    "New best weights: Syn={Syn:P0} Sem={Sem:P0} Emb={Emb:P0} F1={F1:P2}",
                    syntactic, semantic, embedding, evaluation.F1Score);
            }
        }
    }

    if (bestEvaluation is null)
    {
        throw new InvalidOperationException("No valid weight combinations evaluated");
    }

    _logger.LogInformation(
        "Weight tuning complete. Best weights: Syn={Syn:P0} Sem={Sem:P0} Emb={Emb:P0} F1={F1:P2}",
        bestEvaluation.Weights.Syntactic,
        bestEvaluation.Weights.Semantic,
        bestEvaluation.Weights.Embedding,
        bestEvaluation.F1Score);

    return new WeightTuningResult
    {
        BestWeights = bestEvaluation.Weights,
        Accuracy = bestEvaluation.Accuracy,
        Precision = bestEvaluation.Precision,
        Recall = bestEvaluation.Recall,
        F1Score = bestEvaluation.F1Score,
        Evaluations = evaluations.ToImmutableArray()
    };
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<WeightEvaluation> EvaluateWeightsAsync(
|
||||
EffectiveWeights weights,
|
||||
IEnumerable<EnsembleTrainingPair> trainingPairs,
|
||||
decimal threshold = 0.85m,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(weights);
|
||||
ArgumentNullException.ThrowIfNull(trainingPairs);
|
||||
|
||||
var options = new EnsembleOptions
|
||||
{
|
||||
SyntacticWeight = weights.Syntactic,
|
||||
SemanticWeight = weights.Semantic,
|
||||
EmbeddingWeight = weights.Embedding,
|
||||
MatchThreshold = threshold,
|
||||
AdaptiveWeights = false // Use fixed weights during evaluation
|
||||
};
|
||||
|
||||
var truePositives = 0;
|
||||
var falsePositives = 0;
|
||||
var trueNegatives = 0;
|
||||
var falseNegatives = 0;
|
||||
|
||||
foreach (var pair in trainingPairs)
|
||||
{
|
||||
ct.ThrowIfCancellationRequested();
|
||||
|
||||
var result = await _decisionEngine.CompareAsync(
|
||||
pair.Function1,
|
||||
pair.Function2,
|
||||
options,
|
||||
ct);
|
||||
|
||||
if (pair.IsEquivalent)
|
||||
{
|
||||
if (result.IsMatch)
|
||||
{
|
||||
truePositives++;
|
||||
}
|
||||
else
|
||||
{
|
||||
falseNegatives++;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (result.IsMatch)
|
||||
{
|
||||
falsePositives++;
|
||||
}
|
||||
else
|
||||
{
|
||||
trueNegatives++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var total = truePositives + falsePositives + trueNegatives + falseNegatives;
|
||||
var accuracy = total > 0
|
||||
? (decimal)(truePositives + trueNegatives) / total
|
||||
: 0m;
|
||||
|
||||
var precision = (truePositives + falsePositives) > 0
|
||||
? (decimal)truePositives / (truePositives + falsePositives)
|
||||
: 0m;
|
||||
|
||||
var recall = (truePositives + falseNegatives) > 0
|
||||
? (decimal)truePositives / (truePositives + falseNegatives)
|
||||
: 0m;
|
||||
|
||||
var f1Score = (precision + recall) > 0
|
||||
? 2 * precision * recall / (precision + recall)
|
||||
: 0m;
|
||||
|
||||
return new WeightEvaluation(weights, accuracy, precision, recall, f1Score);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
# AGENTS.md - StellaOps.BinaryIndex.Ghidra
|
||||
|
||||
## Module Overview
|
||||
|
||||
This module provides Ghidra integration for the BinaryIndex semantic diffing stack. It serves as a fallback/enhancement layer when B2R2 provides insufficient coverage or accuracy.
|
||||
|
||||
## Roles Expected
|
||||
|
||||
- **Backend Engineer**: Implement Ghidra Headless wrapper, ghidriff bridge, Version Tracking service, BSim integration
|
||||
- **QA Engineer**: Unit tests for all services, integration tests for Ghidra availability scenarios
|
||||
|
||||
## Required Documentation
|
||||
|
||||
Before working on this module, read:
|
||||
|
||||
- `docs/modules/binary-index/architecture.md`
|
||||
- `docs/implplan/SPRINT_20260105_001_003_BINDEX_semdiff_ghidra.md`
|
||||
- Ghidra documentation: https://ghidra.re/ghidra_docs/
|
||||
- ghidriff repository: https://github.com/clearbluejar/ghidriff
|
||||
|
||||
## Module-Specific Constraints
|
||||
|
||||
### Process Management
|
||||
- Ghidra runs as an external Java process - manage its lifecycle carefully
|
||||
- Use SemaphoreSlim for concurrent access control (one analysis at a time per instance)
|
||||
- Always clean up temporary project directories
|
||||
|
||||
### External Dependencies
|
||||
- **Ghidra 11.x**: Set via `GhidraOptions.GhidraHome`
|
||||
- **Java 17+**: Set via `GhidraOptions.JavaHome`
|
||||
- **Python 3.10+**: Required for ghidriff
|
||||
- **ghidriff**: Installed via pip
|
||||
|
||||
### Determinism Rules
|
||||
- Use `CultureInfo.InvariantCulture` for all parsing/formatting
|
||||
- Inject `TimeProvider` for timestamps
|
||||
- Inject `IGuidGenerator` for any ID generation
|
||||
- Results must be reproducible given same inputs
|
||||
|
||||
### Error Handling
|
||||
- Ghidra unavailability should not crash - graceful degradation
|
||||
- Log all external process failures with stderr content
|
||||
- Wrap external exceptions in `GhidraException` or `GhidriffException`
|
||||
|
||||
## Key Interfaces
|
||||
|
||||
| Interface | Purpose |
|
||||
|-----------|---------|
|
||||
| `IGhidraService` | Main analysis service (headless wrapper) |
|
||||
| `IVersionTrackingService` | Version Tracking with multiple correlators |
|
||||
| `IBSimService` | BSim signature generation and querying |
|
||||
| `IGhidriffBridge` | Python ghidriff interop |
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
StellaOps.BinaryIndex.Ghidra/
|
||||
Abstractions/
|
||||
IGhidraService.cs
|
||||
IVersionTrackingService.cs
|
||||
IBSimService.cs
|
||||
IGhidriffBridge.cs
|
||||
Models/
|
||||
GhidraModels.cs
|
||||
VersionTrackingModels.cs
|
||||
BSimModels.cs
|
||||
GhidriffModels.cs
|
||||
Services/
|
||||
GhidraHeadlessManager.cs
|
||||
GhidraService.cs
|
||||
VersionTrackingService.cs
|
||||
BSimService.cs
|
||||
GhidriffBridge.cs
|
||||
Options/
|
||||
GhidraOptions.cs
|
||||
BSimOptions.cs
|
||||
GhidriffOptions.cs
|
||||
Exceptions/
|
||||
GhidraException.cs
|
||||
GhidriffException.cs
|
||||
Extensions/
|
||||
GhidraServiceCollectionExtensions.cs
|
||||
```
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
- Unit tests mock external process execution
|
||||
- Integration tests require Ghidra installation (skip if unavailable)
|
||||
- Use `[Trait("Category", "Integration")]` for tests requiring Ghidra
|
||||
- Fallback scenarios tested in isolation
|
||||
|
||||
## Working Agreements
|
||||
|
||||
1. All public APIs must have XML documentation
|
||||
2. Follow the pattern from `StellaOps.BinaryIndex.Disassembly`
|
||||
3. Expose services via `AddGhidra()` extension method
|
||||
4. Configuration via `IOptions<GhidraOptions>` pattern
|
||||
@@ -0,0 +1,168 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Service for Ghidra BSim (Binary Similarity) operations.
/// BSim provides behavioral similarity matching based on P-Code semantics.
/// </summary>
public interface IBSimService
{
    /// <summary>
    /// Generate BSim signatures for functions from an analyzed binary.
    /// </summary>
    /// <param name="analysis">Ghidra analysis result.</param>
    /// <param name="options">Signature generation options; service defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>BSim signatures for each function.</returns>
    Task<ImmutableArray<BSimSignature>> GenerateSignaturesAsync(
        GhidraAnalysisResult analysis,
        BSimGenerationOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Query BSim database for similar functions.
    /// </summary>
    /// <param name="signature">The signature to search for.</param>
    /// <param name="options">Query options; service defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions from the database.</returns>
    Task<ImmutableArray<BSimMatch>> QueryAsync(
        BSimSignature signature,
        BSimQueryOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Query BSim database for multiple signatures in batch.
    /// </summary>
    /// <param name="signatures">The signatures to search for.</param>
    /// <param name="options">Query options; service defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions for each query signature.</returns>
    Task<ImmutableArray<BSimQueryResult>> QueryBatchAsync(
        ImmutableArray<BSimSignature> signatures,
        BSimQueryOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Ingest functions into BSim database.
    /// </summary>
    /// <param name="libraryName">Name of the library being ingested.</param>
    /// <param name="version">Version of the library.</param>
    /// <param name="signatures">Signatures to ingest.</param>
    /// <param name="ct">Cancellation token.</param>
    Task IngestAsync(
        string libraryName,
        string version,
        ImmutableArray<BSimSignature> signatures,
        CancellationToken ct = default);

    /// <summary>
    /// Check if BSim database is available and healthy.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if BSim database is accessible.</returns>
    Task<bool> IsAvailableAsync(CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Options for BSim signature generation.
/// </summary>
public sealed record BSimGenerationOptions
{
    /// <summary>
    /// Minimum function size (in instructions) to generate signatures for.
    /// Very small functions produce low-confidence matches, so they are
    /// skipped below this threshold.
    /// </summary>
    public int MinFunctionSize { get; init; } = 5;

    /// <summary>
    /// Whether to include thunk/stub functions. Off by default.
    /// </summary>
    public bool IncludeThunks { get; init; } = false;

    /// <summary>
    /// Whether to include imported library functions. Off by default.
    /// </summary>
    public bool IncludeImports { get; init; } = false;
}
|
||||
|
||||
/// <summary>
/// Options for BSim database queries.
/// </summary>
public sealed record BSimQueryOptions
{
    /// <summary>
    /// Minimum similarity score (0.0-1.0) for matches.
    /// </summary>
    public double MinSimilarity { get; init; } = 0.7;

    /// <summary>
    /// Minimum significance score for matches.
    /// Significance measures how distinctive a function is; 0.0 accepts all.
    /// </summary>
    public double MinSignificance { get; init; } = 0.0;

    /// <summary>
    /// Maximum number of results per query.
    /// </summary>
    public int MaxResults { get; init; } = 10;

    /// <summary>
    /// Limit search to specific libraries (empty = all libraries).
    /// </summary>
    public ImmutableArray<string> TargetLibraries { get; init; } = [];

    /// <summary>
    /// Limit search to specific library versions (empty = all versions).
    /// </summary>
    public ImmutableArray<string> TargetVersions { get; init; } = [];
}
|
||||
|
||||
/// <summary>
/// A BSim function signature.
/// </summary>
/// <param name="FunctionName">Original function name.</param>
/// <param name="Address">Function address in the binary.</param>
/// <param name="FeatureVector">BSim feature vector bytes.</param>
/// <param name="VectorLength">Number of features in the vector.</param>
/// <param name="SelfSignificance">How distinctive this function is (higher = more unique).</param>
/// <param name="InstructionCount">Number of P-Code instructions.</param>
public sealed record BSimSignature(
    string FunctionName,
    ulong Address,
    byte[] FeatureVector,
    int VectorLength,
    double SelfSignificance,
    int InstructionCount);
|
||||
|
||||
/// <summary>
/// A BSim match result.
/// </summary>
/// <param name="MatchedLibrary">Library containing the matched function.</param>
/// <param name="MatchedVersion">Version of the library.</param>
/// <param name="MatchedFunction">Name of the matched function.</param>
/// <param name="MatchedAddress">Address of the matched function.</param>
/// <param name="Similarity">Similarity score (0.0-1.0).</param>
/// <param name="Significance">Significance of the match.</param>
/// <param name="Confidence">Combined confidence score.</param>
public sealed record BSimMatch(
    string MatchedLibrary,
    string MatchedVersion,
    string MatchedFunction,
    ulong MatchedAddress,
    double Similarity,
    double Significance,
    double Confidence);
|
||||
|
||||
/// <summary>
/// Result of a batch BSim query for a single signature.
/// </summary>
/// <param name="QuerySignature">The signature that was queried.</param>
/// <param name="Matches">Matching functions found (may be empty).</param>
public sealed record BSimQueryResult(
    BSimSignature QuerySignature,
    ImmutableArray<BSimMatch> Matches);
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Main Ghidra analysis service interface.
/// Provides access to Ghidra Headless analysis capabilities.
/// </summary>
public interface IGhidraService
{
    /// <summary>
    /// Analyze a binary using Ghidra headless.
    /// </summary>
    /// <param name="binaryStream">The binary stream to analyze.</param>
    /// <param name="options">Optional analysis configuration; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Analysis results including functions, imports, exports, and metadata.</returns>
    Task<GhidraAnalysisResult> AnalyzeAsync(
        Stream binaryStream,
        GhidraAnalysisOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Analyze a binary from a file path using Ghidra headless.
    /// </summary>
    /// <param name="binaryPath">Absolute path to the binary file.</param>
    /// <param name="options">Optional analysis configuration; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Analysis results including functions, imports, exports, and metadata.</returns>
    Task<GhidraAnalysisResult> AnalyzeAsync(
        string binaryPath,
        GhidraAnalysisOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Check if Ghidra backend is available and healthy.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if Ghidra is available, false otherwise.</returns>
    Task<bool> IsAvailableAsync(CancellationToken ct = default);

    /// <summary>
    /// Gets information about the Ghidra installation.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Ghidra version and capability information.</returns>
    Task<GhidraInfo> GetInfoAsync(CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Options for Ghidra analysis.
/// </summary>
public sealed record GhidraAnalysisOptions
{
    /// <summary>
    /// Whether to run full auto-analysis (slower but more complete).
    /// </summary>
    public bool RunFullAnalysis { get; init; } = true;

    /// <summary>
    /// Whether to include decompiled code in function results.
    /// </summary>
    public bool IncludeDecompilation { get; init; } = false;

    /// <summary>
    /// Whether to generate P-Code hashes for functions.
    /// </summary>
    public bool GeneratePCodeHashes { get; init; } = true;

    /// <summary>
    /// Whether to extract string literals.
    /// </summary>
    public bool ExtractStrings { get; init; } = true;

    /// <summary>
    /// Whether to extract functions.
    /// </summary>
    public bool ExtractFunctions { get; init; } = true;

    /// <summary>
    /// Whether to extract decompilation. Intended as an alias for
    /// <see cref="IncludeDecompilation"/>, but NOTE(review): it is a separate,
    /// independent flag here — setting one does not set the other; confirm
    /// which one consumers honor.
    /// </summary>
    public bool ExtractDecompilation { get; init; } = false;

    /// <summary>
    /// Maximum analysis time in seconds (0 = unlimited).
    /// </summary>
    public int TimeoutSeconds { get; init; } = 300;

    /// <summary>
    /// Specific scripts to run during analysis (empty = none).
    /// </summary>
    public ImmutableArray<string> Scripts { get; init; } = [];

    /// <summary>
    /// Architecture hint for raw binaries.
    /// </summary>
    public string? ArchitectureHint { get; init; }

    /// <summary>
    /// Processor language hint for Ghidra (e.g., "x86:LE:64:default").
    /// </summary>
    public string? ProcessorHint { get; init; }

    /// <summary>
    /// Base address override for raw binaries.
    /// </summary>
    public ulong? BaseAddress { get; init; }
}
|
||||
|
||||
/// <summary>
/// Result of Ghidra analysis.
/// </summary>
/// <param name="BinaryHash">SHA256 hash of the analyzed binary.</param>
/// <param name="Functions">Discovered functions.</param>
/// <param name="Imports">Import symbols.</param>
/// <param name="Exports">Export symbols.</param>
/// <param name="Strings">Discovered string literals.</param>
/// <param name="MemoryBlocks">Memory blocks/sections in the binary.</param>
/// <param name="Metadata">Analysis metadata.</param>
public sealed record GhidraAnalysisResult(
    string BinaryHash,
    ImmutableArray<GhidraFunction> Functions,
    ImmutableArray<GhidraImport> Imports,
    ImmutableArray<GhidraExport> Exports,
    ImmutableArray<GhidraString> Strings,
    ImmutableArray<GhidraMemoryBlock> MemoryBlocks,
    GhidraMetadata Metadata);
|
||||
|
||||
/// <summary>
/// Information about the Ghidra installation.
/// </summary>
/// <param name="Version">Ghidra version string (e.g., "11.2").</param>
/// <param name="JavaVersion">Java runtime version.</param>
/// <param name="AvailableProcessors">Available processor languages.</param>
/// <param name="InstallPath">Ghidra installation path.</param>
public sealed record GhidraInfo(
    string Version,
    string JavaVersion,
    ImmutableArray<string> AvailableProcessors,
    string InstallPath);
|
||||
@@ -0,0 +1,207 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Bridge interface for ghidriff Python tool integration.
/// ghidriff provides automated binary diff reports using Ghidra.
/// </summary>
public interface IGhidriffBridge
{
    /// <summary>
    /// Run ghidriff to compare two binaries.
    /// </summary>
    /// <param name="oldBinaryPath">Path to the older binary version.</param>
    /// <param name="newBinaryPath">Path to the newer binary version.</param>
    /// <param name="options">ghidriff configuration options; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Diff result with added, removed, and modified functions.</returns>
    Task<GhidriffResult> DiffAsync(
        string oldBinaryPath,
        string newBinaryPath,
        GhidriffDiffOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Run ghidriff to compare two binaries from streams.
    /// </summary>
    /// <param name="oldBinary">Stream of the older binary version.</param>
    /// <param name="newBinary">Stream of the newer binary version.</param>
    /// <param name="options">ghidriff configuration options; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Diff result with added, removed, and modified functions.</returns>
    Task<GhidriffResult> DiffAsync(
        Stream oldBinary,
        Stream newBinary,
        GhidriffDiffOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate a formatted report from ghidriff results.
    /// </summary>
    /// <param name="result">The diff result to format.</param>
    /// <param name="format">Output format.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Formatted report string.</returns>
    Task<string> GenerateReportAsync(
        GhidriffResult result,
        GhidriffReportFormat format,
        CancellationToken ct = default);

    /// <summary>
    /// Check if ghidriff is available (Python + ghidriff installed).
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>True if ghidriff is available.</returns>
    Task<bool> IsAvailableAsync(CancellationToken ct = default);

    /// <summary>
    /// Get ghidriff version information.
    /// </summary>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version string.</returns>
    Task<string> GetVersionAsync(CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Options for ghidriff diff operation.
/// </summary>
public sealed record GhidriffDiffOptions
{
    /// <summary>
    /// Path to Ghidra installation (auto-detected if not set).
    /// </summary>
    public string? GhidraPath { get; init; }

    /// <summary>
    /// Path for Ghidra project files (temp dir if not set).
    /// </summary>
    public string? ProjectPath { get; init; }

    /// <summary>
    /// Whether to include decompiled code in results.
    /// </summary>
    public bool IncludeDecompilation { get; init; } = true;

    /// <summary>
    /// Whether to include disassembly listing in results.
    /// </summary>
    public bool IncludeDisassembly { get; init; } = true;

    /// <summary>
    /// Functions to exclude from comparison (by name pattern).
    /// </summary>
    public ImmutableArray<string> ExcludeFunctions { get; init; } = [];

    /// <summary>
    /// Maximum number of concurrent Ghidra instances.
    /// </summary>
    public int MaxParallelism { get; init; } = 1;

    /// <summary>
    /// Maximum analysis time in seconds.
    /// </summary>
    public int TimeoutSeconds { get; init; } = 600;
}
|
||||
|
||||
/// <summary>
/// Result of a ghidriff comparison.
/// </summary>
/// <param name="OldBinaryHash">SHA256 hash of the old binary.</param>
/// <param name="NewBinaryHash">SHA256 hash of the new binary.</param>
/// <param name="OldBinaryName">Name/path of the old binary.</param>
/// <param name="NewBinaryName">Name/path of the new binary.</param>
/// <param name="AddedFunctions">Functions added in new binary.</param>
/// <param name="RemovedFunctions">Functions removed from old binary.</param>
/// <param name="ModifiedFunctions">Functions modified between versions.</param>
/// <param name="Statistics">Comparison statistics.</param>
/// <param name="RawJsonOutput">Raw JSON output from ghidriff, preserved for auditing.</param>
public sealed record GhidriffResult(
    string OldBinaryHash,
    string NewBinaryHash,
    string OldBinaryName,
    string NewBinaryName,
    ImmutableArray<GhidriffFunction> AddedFunctions,
    ImmutableArray<GhidriffFunction> RemovedFunctions,
    ImmutableArray<GhidriffDiff> ModifiedFunctions,
    GhidriffStats Statistics,
    string RawJsonOutput);
|
||||
|
||||
/// <summary>
/// A function from ghidriff output.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Function address.</param>
/// <param name="Size">Function size in bytes.</param>
/// <param name="Signature">Decompiled signature.</param>
/// <param name="DecompiledCode">Decompiled C code (if requested).</param>
public sealed record GhidriffFunction(
    string Name,
    ulong Address,
    int Size,
    string? Signature,
    string? DecompiledCode);
|
||||
|
||||
/// <summary>
/// A function diff from ghidriff output.
/// </summary>
/// <param name="FunctionName">Function name.</param>
/// <param name="OldAddress">Address in old binary.</param>
/// <param name="NewAddress">Address in new binary.</param>
/// <param name="OldSize">Size in old binary.</param>
/// <param name="NewSize">Size in new binary.</param>
/// <param name="OldSignature">Signature in old binary.</param>
/// <param name="NewSignature">Signature in new binary.</param>
/// <param name="Similarity">Similarity score.</param>
/// <param name="OldDecompiled">Decompiled code from old binary.</param>
/// <param name="NewDecompiled">Decompiled code from new binary.</param>
/// <param name="InstructionChanges">List of instruction-level changes.</param>
public sealed record GhidriffDiff(
    string FunctionName,
    ulong OldAddress,
    ulong NewAddress,
    int OldSize,
    int NewSize,
    string? OldSignature,
    string? NewSignature,
    decimal Similarity,
    string? OldDecompiled,
    string? NewDecompiled,
    ImmutableArray<string> InstructionChanges);
|
||||
|
||||
/// <summary>
/// Statistics from ghidriff comparison.
/// </summary>
/// <param name="TotalOldFunctions">Total functions in old binary.</param>
/// <param name="TotalNewFunctions">Total functions in new binary.</param>
/// <param name="AddedCount">Number of added functions.</param>
/// <param name="RemovedCount">Number of removed functions.</param>
/// <param name="ModifiedCount">Number of modified functions.</param>
/// <param name="UnchangedCount">Number of unchanged functions.</param>
/// <param name="AnalysisDuration">Time taken for analysis.</param>
public sealed record GhidriffStats(
    int TotalOldFunctions,
    int TotalNewFunctions,
    int AddedCount,
    int RemovedCount,
    int ModifiedCount,
    int UnchangedCount,
    TimeSpan AnalysisDuration);
|
||||
|
||||
/// <summary>
/// Report output format for ghidriff.
/// </summary>
public enum GhidriffReportFormat
{
    /// <summary>JSON format.</summary>
    Json,

    /// <summary>Markdown format.</summary>
    Markdown,

    /// <summary>HTML format.</summary>
    Html
}
|
||||
@@ -0,0 +1,255 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Service for running Ghidra Version Tracking between two binaries.
/// Version Tracking correlates functions between two versions of a binary
/// using multiple correlator algorithms.
/// </summary>
public interface IVersionTrackingService
{
    /// <summary>
    /// Run Ghidra Version Tracking with multiple correlators.
    /// </summary>
    /// <param name="oldBinary">Stream of the older binary version.</param>
    /// <param name="newBinary">Stream of the newer binary version.</param>
    /// <param name="options">Version tracking configuration; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version tracking results with matched, added, removed, and modified functions.</returns>
    Task<VersionTrackingResult> TrackVersionsAsync(
        Stream oldBinary,
        Stream newBinary,
        VersionTrackingOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Run Ghidra Version Tracking using file paths.
    /// </summary>
    /// <param name="oldBinaryPath">Path to the older binary version.</param>
    /// <param name="newBinaryPath">Path to the newer binary version.</param>
    /// <param name="options">Version tracking configuration; defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Version tracking results with matched, added, removed, and modified functions.</returns>
    Task<VersionTrackingResult> TrackVersionsAsync(
        string oldBinaryPath,
        string newBinaryPath,
        VersionTrackingOptions? options = null,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Options for Version Tracking analysis.
/// </summary>
public sealed record VersionTrackingOptions
{
    /// <summary>
    /// Correlators to use for function matching, in priority order.
    /// </summary>
    public ImmutableArray<CorrelatorType> Correlators { get; init; } =
        [CorrelatorType.ExactBytes, CorrelatorType.ExactMnemonics,
         CorrelatorType.SymbolName, CorrelatorType.DataReference,
         CorrelatorType.CombinedReference];

    /// <summary>
    /// Minimum similarity score (0.0-1.0) to consider a match.
    /// </summary>
    public decimal MinSimilarity { get; init; } = 0.5m;

    /// <summary>
    /// Whether to include decompiled code in results.
    /// </summary>
    public bool IncludeDecompilation { get; init; } = false;

    /// <summary>
    /// Whether to compute detailed instruction-level differences.
    /// </summary>
    public bool ComputeDetailedDiffs { get; init; } = true;

    /// <summary>
    /// Maximum analysis time in seconds.
    /// </summary>
    public int TimeoutSeconds { get; init; } = 600;
}
|
||||
|
||||
/// <summary>
/// Type of correlator algorithm used for function matching.
/// </summary>
public enum CorrelatorType
{
    /// <summary>Matches functions with identical byte sequences.</summary>
    ExactBytes,

    /// <summary>Matches functions with identical instruction mnemonics (ignoring operands).</summary>
    ExactMnemonics,

    /// <summary>Matches functions by symbol name.</summary>
    SymbolName,

    /// <summary>Matches functions with similar data references.</summary>
    DataReference,

    /// <summary>Matches functions with similar call references.</summary>
    CallReference,

    /// <summary>Combined reference scoring algorithm.</summary>
    CombinedReference,

    /// <summary>BSim behavioral similarity matching.</summary>
    BSim
}
|
||||
|
||||
/// <summary>
/// Result of Version Tracking analysis.
/// </summary>
/// <param name="Matches">Functions matched between versions.</param>
/// <param name="AddedFunctions">Functions added in the new version.</param>
/// <param name="RemovedFunctions">Functions removed from the old version.</param>
/// <param name="ModifiedFunctions">Functions modified between versions.</param>
/// <param name="Statistics">Analysis statistics.</param>
public sealed record VersionTrackingResult(
    ImmutableArray<FunctionMatch> Matches,
    ImmutableArray<FunctionAdded> AddedFunctions,
    ImmutableArray<FunctionRemoved> RemovedFunctions,
    ImmutableArray<FunctionModified> ModifiedFunctions,
    VersionTrackingStats Statistics);
|
||||
|
||||
/// <summary>
/// Statistics from Version Tracking analysis.
/// </summary>
/// <param name="TotalOldFunctions">Total functions in old binary.</param>
/// <param name="TotalNewFunctions">Total functions in new binary.</param>
/// <param name="MatchedCount">Number of matched functions.</param>
/// <param name="AddedCount">Number of added functions.</param>
/// <param name="RemovedCount">Number of removed functions.</param>
/// <param name="ModifiedCount">Number of modified functions (subset of matched).</param>
/// <param name="AnalysisDuration">Time taken for analysis.</param>
public sealed record VersionTrackingStats(
    int TotalOldFunctions,
    int TotalNewFunctions,
    int MatchedCount,
    int AddedCount,
    int RemovedCount,
    int ModifiedCount,
    TimeSpan AnalysisDuration);
|
||||
|
||||
/// <summary>
/// A pairing of one function in the old binary with one in the new binary.
/// </summary>
/// <param name="OldName">Name of the function in the old binary.</param>
/// <param name="OldAddress">Entry-point address in the old binary.</param>
/// <param name="NewName">Name of the function in the new binary.</param>
/// <param name="NewAddress">Entry-point address in the new binary.</param>
/// <param name="Similarity">Similarity score in the range 0.0–1.0.</param>
/// <param name="MatchedBy">Correlator algorithm that produced this pairing.</param>
/// <param name="Differences">Specific differences detected between the pair, if any.</param>
public sealed record FunctionMatch(
    string OldName,
    ulong OldAddress,
    string NewName,
    ulong NewAddress,
    decimal Similarity,
    CorrelatorType MatchedBy,
    ImmutableArray<MatchDifference> Differences);
|
||||
|
||||
/// <summary>
/// A function that exists only in the new binary version.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Entry-point virtual address.</param>
/// <param name="Size">Body size in bytes.</param>
/// <param name="Signature">Decompiled signature, or null when unavailable.</param>
public sealed record FunctionAdded(string Name, ulong Address, int Size, string? Signature);
|
||||
|
||||
/// <summary>
/// A function that exists only in the old binary version.
/// </summary>
/// <param name="Name">Function name.</param>
/// <param name="Address">Entry-point virtual address.</param>
/// <param name="Size">Body size in bytes.</param>
/// <param name="Signature">Decompiled signature, or null when unavailable.</param>
public sealed record FunctionRemoved(string Name, ulong Address, int Size, string? Signature);
|
||||
|
||||
/// <summary>
/// A matched function whose body differs between versions, with detailed differences.
/// </summary>
/// <param name="OldName">Name in the old binary.</param>
/// <param name="OldAddress">Entry-point address in the old binary.</param>
/// <param name="OldSize">Body size in the old binary, in bytes.</param>
/// <param name="NewName">Name in the new binary.</param>
/// <param name="NewAddress">Entry-point address in the new binary.</param>
/// <param name="NewSize">Body size in the new binary, in bytes.</param>
/// <param name="Similarity">Similarity score between the two versions.</param>
/// <param name="Differences">Specific differences detected between the versions.</param>
/// <param name="OldDecompiled">Old-version decompilation when it was requested, otherwise null.</param>
/// <param name="NewDecompiled">New-version decompilation when it was requested, otherwise null.</param>
public sealed record FunctionModified(
    string OldName,
    ulong OldAddress,
    int OldSize,
    string NewName,
    ulong NewAddress,
    int NewSize,
    decimal Similarity,
    ImmutableArray<MatchDifference> Differences,
    string? OldDecompiled,
    string? NewDecompiled);
|
||||
|
||||
/// <summary>
/// One concrete difference detected between a pair of matched functions.
/// </summary>
/// <param name="Type">Category of the difference.</param>
/// <param name="Description">Human-readable explanation.</param>
/// <param name="OldValue">Value observed in the old binary, when applicable.</param>
/// <param name="NewValue">Value observed in the new binary, when applicable.</param>
/// <param name="Address">Address at which the difference occurs, when applicable.</param>
public sealed record MatchDifference(
    DifferenceType Type,
    string Description,
    string? OldValue,
    string? NewValue,
    ulong? Address = null);
|
||||
|
||||
/// <summary>
/// Categories of differences that can be detected between matched functions.
/// </summary>
public enum DifferenceType
{
    /// <summary>An instruction was added.</summary>
    InstructionAdded,

    /// <summary>An instruction was removed.</summary>
    InstructionRemoved,

    /// <summary>An instruction was altered in place.</summary>
    InstructionChanged,

    /// <summary>A branch now targets a different address.</summary>
    BranchTargetChanged,

    /// <summary>A call now targets a different function.</summary>
    CallTargetChanged,

    /// <summary>A constant operand has a different value.</summary>
    ConstantChanged,

    /// <summary>The function's overall size changed.</summary>
    SizeChanged,

    /// <summary>The stack frame layout changed.</summary>
    StackFrameChanged,

    /// <summary>Register usage within the function changed.</summary>
    RegisterUsageChanged
}
|
||||
@@ -0,0 +1,245 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Signals a failure in a Ghidra operation (headless launch, analysis, scripting).
/// </summary>
public class GhidraException : Exception
{
    /// <summary>
    /// Process exit code from Ghidra, when one was observed.
    /// </summary>
    public int? ExitCode { get; init; }

    /// <summary>
    /// Captured stderr of the Ghidra process, when available.
    /// </summary>
    public string? StandardError { get; init; }

    /// <summary>
    /// Captured stdout of the Ghidra process, when available.
    /// </summary>
    public string? StandardOutput { get; init; }

    /// <summary>
    /// Initializes a new instance with no message.
    /// </summary>
    public GhidraException()
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidraException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message and the causing exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Exception that caused this failure.</param>
    public GhidraException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Signals that Ghidra is missing or misconfigured on this host.
/// </summary>
public class GhidraUnavailableException : GhidraException
{
    /// <summary>
    /// Initializes a new instance with a default message.
    /// </summary>
    public GhidraUnavailableException()
        : base("Ghidra is not available or not properly configured")
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidraUnavailableException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message and the causing exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Exception that caused this failure.</param>
    public GhidraUnavailableException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Signals that a Ghidra analysis run exceeded its allotted time.
/// </summary>
public class GhidraTimeoutException : GhidraException
{
    /// <summary>
    /// The timeout, in seconds, that was exceeded.
    /// </summary>
    public int TimeoutSeconds { get; }

    /// <summary>
    /// Initializes a new instance with a generated message.
    /// </summary>
    /// <param name="timeoutSeconds">The timeout that was exceeded.</param>
    public GhidraTimeoutException(int timeoutSeconds)
        : base($"Ghidra analysis timed out after {timeoutSeconds} seconds")
    {
        TimeoutSeconds = timeoutSeconds;
    }

    /// <summary>
    /// Initializes a new instance with a custom message.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="timeoutSeconds">The timeout that was exceeded.</param>
    public GhidraTimeoutException(string message, int timeoutSeconds)
        : base(message)
    {
        TimeoutSeconds = timeoutSeconds;
    }
}
|
||||
|
||||
/// <summary>
/// Signals a failure in a ghidriff (Python binary-diffing bridge) operation.
/// </summary>
public class GhidriffException : Exception
{
    /// <summary>
    /// Process exit code from Python, when one was observed.
    /// </summary>
    public int? ExitCode { get; init; }

    /// <summary>
    /// Captured stderr of the Python process, when available.
    /// </summary>
    public string? StandardError { get; init; }

    /// <summary>
    /// Captured stdout of the Python process, when available.
    /// </summary>
    public string? StandardOutput { get; init; }

    /// <summary>
    /// Initializes a new instance with no message.
    /// </summary>
    public GhidriffException()
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidriffException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message and the causing exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Exception that caused this failure.</param>
    public GhidriffException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Signals that the ghidriff tool cannot be found on this host.
/// </summary>
public class GhidriffUnavailableException : GhidriffException
{
    /// <summary>
    /// Initializes a new instance with a default message.
    /// </summary>
    public GhidriffUnavailableException()
        : base("ghidriff is not available. Ensure Python and ghidriff are installed.")
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public GhidriffUnavailableException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message and the causing exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Exception that caused this failure.</param>
    public GhidriffUnavailableException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Signals a failure in a BSim (behavioral similarity) operation.
/// </summary>
public class BSimException : Exception
{
    /// <summary>
    /// Initializes a new instance with no message.
    /// </summary>
    public BSimException()
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public BSimException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message and the causing exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Exception that caused this failure.</param>
    public BSimException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
|
||||
/// <summary>
/// Signals that the BSim database cannot be reached or is not configured.
/// </summary>
public class BSimUnavailableException : BSimException
{
    /// <summary>
    /// Initializes a new instance with a default message.
    /// </summary>
    public BSimUnavailableException()
        : base("BSim database is not available or not configured")
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message.
    /// </summary>
    /// <param name="message">Error message.</param>
    public BSimUnavailableException(string message)
        : base(message)
    {
    }

    /// <summary>
    /// Initializes a new instance with an error message and the causing exception.
    /// </summary>
    /// <param name="message">Error message.</param>
    /// <param name="innerException">Exception that caused this failure.</param>
    public BSimUnavailableException(string message, Exception innerException)
        : base(message, innerException)
    {
    }
}
|
||||
@@ -0,0 +1,114 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Extension methods for registering Ghidra services with dependency injection.
/// </summary>
public static class GhidraServiceCollectionExtensions
{
    /// <summary>
    /// Adds Ghidra integration services, binding options from configuration.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configuration">The configuration to bind Ghidra/BSim/ghidriff sections from.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddGhidra(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        // GhidraOptions carries [Required] members, so it also gets data-annotation validation.
        services.AddOptions<GhidraOptions>()
            .Bind(configuration.GetSection(GhidraOptions.SectionName))
            .ValidateDataAnnotations()
            .ValidateOnStart();

        services.AddOptions<BSimOptions>()
            .Bind(configuration.GetSection(BSimOptions.SectionName))
            .ValidateOnStart();

        services.AddOptions<GhidriffOptions>()
            .Bind(configuration.GetSection(GhidriffOptions.SectionName))
            .ValidateOnStart();

        return RegisterCoreServices(services);
    }

    /// <summary>
    /// Adds Ghidra integration services using delegate-based configuration.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureGhidra">Action to configure Ghidra options.</param>
    /// <param name="configureBSim">Optional action to configure BSim options.</param>
    /// <param name="configureGhidriff">Optional action to configure ghidriff options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddGhidra(
        this IServiceCollection services,
        Action<GhidraOptions> configureGhidra,
        Action<BSimOptions>? configureBSim = null,
        Action<GhidriffOptions>? configureGhidriff = null)
    {
        services.AddOptions<GhidraOptions>()
            .Configure(configureGhidra)
            .ValidateDataAnnotations()
            .ValidateOnStart();

        AddOptionalOptions(services, configureBSim);
        AddOptionalOptions(services, configureGhidriff);

        return RegisterCoreServices(services);
    }

    /// <summary>
    /// Registers an options type with start-time validation, applying the
    /// configuration delegate only when one was supplied.
    /// </summary>
    private static void AddOptionalOptions<TOptions>(
        IServiceCollection services,
        Action<TOptions>? configure)
        where TOptions : class
    {
        var builder = services.AddOptions<TOptions>();
        if (configure is not null)
        {
            builder.Configure(configure);
        }

        builder.ValidateOnStart();
    }

    /// <summary>
    /// Registers the Ghidra service graph shared by every AddGhidra overload.
    /// </summary>
    private static IServiceCollection RegisterCoreServices(IServiceCollection services)
    {
        // Register TimeProvider only if the host has not supplied one already.
        services.TryAddSingleton(TimeProvider.System);

        services.AddSingleton<GhidraHeadlessManager>();
        services.AddSingleton<IGhidraService, GhidraService>();
        services.AddSingleton<IGhidriffBridge, GhidriffBridge>();
        services.AddSingleton<IVersionTrackingService, VersionTrackingService>();
        services.AddSingleton<IBSimService, BSimService>();

        // Also exposed as an IDisassemblyPlugin for fallback disassembly.
        services.AddSingleton<IDisassemblyPlugin, GhidraDisassemblyPlugin>();

        return services;
    }
}
|
||||
@@ -0,0 +1,157 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// A function discovered during Ghidra analysis.
/// </summary>
/// <param name="Name">Function name; may be an auto-generated label such as FUN_00401000.</param>
/// <param name="Address">Virtual address of the entry point.</param>
/// <param name="Size">Body size in bytes.</param>
/// <param name="Signature">Decompiled signature, when available.</param>
/// <param name="DecompiledCode">Decompiled C code, when decompilation was requested.</param>
/// <param name="PCodeHash">SHA256 over normalized P-Code, used for semantic comparison.</param>
/// <param name="CalledFunctions">Names of the functions this one calls.</param>
/// <param name="CallingFunctions">Names of the functions that call this one.</param>
/// <param name="IsThunk">True when this is a thunk/stub function.</param>
/// <param name="IsExternal">True when this function is an external import.</param>
public sealed record GhidraFunction(
    string Name,
    ulong Address,
    int Size,
    string? Signature,
    string? DecompiledCode,
    byte[]? PCodeHash,
    ImmutableArray<string> CalledFunctions,
    ImmutableArray<string> CallingFunctions,
    bool IsThunk = false,
    bool IsExternal = false);
|
||||
|
||||
/// <summary>
/// An imported symbol found during Ghidra analysis.
/// </summary>
/// <param name="Name">Symbol name.</param>
/// <param name="Address">Address at which the symbol is referenced.</param>
/// <param name="LibraryName">Library that provides the symbol, when known.</param>
/// <param name="Ordinal">Import ordinal for PE binaries, when applicable.</param>
public sealed record GhidraImport(string Name, ulong Address, string? LibraryName, int? Ordinal);
|
||||
|
||||
/// <summary>
/// An exported symbol found during Ghidra analysis.
/// </summary>
/// <param name="Name">Symbol name.</param>
/// <param name="Address">Address of the exported symbol.</param>
/// <param name="Ordinal">Export ordinal for PE binaries, when applicable.</param>
public sealed record GhidraExport(string Name, ulong Address, int? Ordinal);
|
||||
|
||||
/// <summary>
/// A string literal located during Ghidra analysis.
/// </summary>
/// <param name="Value">The decoded string contents.</param>
/// <param name="Address">Address at which the string resides.</param>
/// <param name="Length">Byte length of the string data.</param>
/// <param name="Encoding">Encoding name (ASCII, UTF-8, UTF-16, etc.).</param>
public sealed record GhidraString(string Value, ulong Address, int Length, string Encoding);
|
||||
|
||||
/// <summary>
/// Summary metadata produced by a Ghidra analysis run.
/// </summary>
/// <param name="FileName">Name of the analyzed file.</param>
/// <param name="Format">Detected binary format (ELF, PE, Mach-O, etc.).</param>
/// <param name="Architecture">CPU architecture of the binary.</param>
/// <param name="Processor">Ghidra processor language identifier.</param>
/// <param name="Compiler">Compiler identifier, when Ghidra detected one.</param>
/// <param name="Endianness">Byte order: little or big endian.</param>
/// <param name="AddressSize">Pointer width in bits (32 or 64).</param>
/// <param name="ImageBase">Image base address.</param>
/// <param name="EntryPoint">Entry point address, when present.</param>
/// <param name="AnalysisDate">Timestamp of when the analysis ran.</param>
/// <param name="GhidraVersion">Version of Ghidra that performed the analysis.</param>
/// <param name="AnalysisDuration">Wall-clock time the analysis took.</param>
public sealed record GhidraMetadata(
    string FileName,
    string Format,
    string Architecture,
    string Processor,
    string? Compiler,
    string Endianness,
    int AddressSize,
    ulong ImageBase,
    ulong? EntryPoint,
    DateTimeOffset AnalysisDate,
    string GhidraVersion,
    TimeSpan AnalysisDuration);
|
||||
|
||||
/// <summary>
/// A cross-reference discovered during Ghidra analysis.
/// </summary>
/// <param name="FromAddress">Address where the reference originates.</param>
/// <param name="ToAddress">Address being referenced.</param>
/// <param name="ReferenceType">Kind of reference (read, write, call, etc.).</param>
public sealed record GhidraDataReference(ulong FromAddress, ulong ToAddress, GhidraReferenceType ReferenceType);
|
||||
|
||||
/// <summary>
/// Kinds of cross-references reported by Ghidra analysis.
/// </summary>
public enum GhidraReferenceType
{
    /// <summary>Reference kind could not be determined.</summary>
    Unknown,

    /// <summary>Reads from memory.</summary>
    Read,

    /// <summary>Writes to memory.</summary>
    Write,

    /// <summary>Calls a function.</summary>
    Call,

    /// <summary>Jumps unconditionally.</summary>
    UnconditionalJump,

    /// <summary>Jumps conditionally.</summary>
    ConditionalJump,

    /// <summary>Computed or indirect reference.</summary>
    Computed,

    /// <summary>Takes the address of data.</summary>
    Data
}
|
||||
|
||||
/// <summary>
/// A memory block or section reported by Ghidra analysis.
/// </summary>
/// <param name="Name">Section name (.text, .data, etc.).</param>
/// <param name="Start">Start address of the block.</param>
/// <param name="End">End address of the block.</param>
/// <param name="Size">Block size in bytes.</param>
/// <param name="IsExecutable">True when the block is executable.</param>
/// <param name="IsWritable">True when the block is writable.</param>
/// <param name="IsInitialized">True when the block contains initialized data.</param>
public sealed record GhidraMemoryBlock(
    string Name,
    ulong Start,
    ulong End,
    long Size,
    bool IsExecutable,
    bool IsWritable,
    bool IsInitialized);
|
||||
@@ -0,0 +1,188 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.ComponentModel.DataAnnotations;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Settings that control the Ghidra integration.
/// </summary>
public sealed class GhidraOptions
{
    /// <summary>
    /// Name of the configuration section these options bind to.
    /// </summary>
    public const string SectionName = "Ghidra";

    /// <summary>
    /// Path to the Ghidra installation directory (GHIDRA_HOME). Mandatory.
    /// </summary>
    [Required]
    public string GhidraHome { get; set; } = string.Empty;

    /// <summary>
    /// Path to the Java installation (JAVA_HOME); falls back to the system JAVA_HOME when null.
    /// </summary>
    public string? JavaHome { get; set; }

    /// <summary>
    /// Directory used for Ghidra projects and scratch files.
    /// </summary>
    [Required]
    public string WorkDir { get; set; } = Path.Combine(Path.GetTempPath(), "stellaops-ghidra");

    /// <summary>
    /// Directory containing custom Ghidra scripts, when any are used.
    /// </summary>
    public string? ScriptsDir { get; set; }

    /// <summary>
    /// JVM heap limit for Ghidra, e.g. "4G" or "8192M".
    /// </summary>
    public string MaxMemory { get; set; } = "4G";

    /// <summary>
    /// Upper bound on CPU cores Ghidra may use during analysis.
    /// </summary>
    public int MaxCpu { get; set; } = Environment.ProcessorCount;

    /// <summary>
    /// Default per-operation analysis timeout, in seconds.
    /// </summary>
    public int DefaultTimeoutSeconds { get; set; } = 300;

    /// <summary>
    /// When true, temporary projects are deleted once analysis completes.
    /// </summary>
    public bool CleanupTempProjects { get; set; } = true;

    /// <summary>
    /// Upper bound on simultaneously running Ghidra instances.
    /// </summary>
    public int MaxConcurrentInstances { get; set; } = 1;

    /// <summary>
    /// Master switch for the Ghidra integration.
    /// </summary>
    public bool Enabled { get; set; } = true;
}
|
||||
|
||||
/// <summary>
/// Settings for the BSim signature database connection.
/// </summary>
public sealed class BSimOptions
{
    /// <summary>
    /// Name of the configuration section these options bind to.
    /// </summary>
    public const string SectionName = "BSim";

    /// <summary>
    /// Full connection string; when set it takes precedence over the
    /// individual Host/Port/Database/Username/Password values.
    /// Format: postgresql://user:pass@host:port/database
    /// </summary>
    public string? ConnectionString { get; set; }

    /// <summary>
    /// BSim database host.
    /// </summary>
    public string Host { get; set; } = "localhost";

    /// <summary>
    /// BSim database port.
    /// </summary>
    public int Port { get; set; } = 5432;

    /// <summary>
    /// BSim database name.
    /// </summary>
    public string Database { get; set; } = "bsim";

    /// <summary>
    /// BSim database username.
    /// </summary>
    public string Username { get; set; } = "bsim";

    /// <summary>
    /// BSim database password; omitted from the URI when null or empty.
    /// </summary>
    public string? Password { get; set; }

    /// <summary>
    /// Minimum similarity applied to queries when the caller gives none.
    /// </summary>
    public double DefaultMinSimilarity { get; set; } = 0.7;

    /// <summary>
    /// Maximum result count applied to queries when the caller gives none.
    /// </summary>
    public int DefaultMaxResults { get; set; } = 10;

    /// <summary>
    /// Master switch for the BSim integration; off by default.
    /// </summary>
    public bool Enabled { get; set; } = false;

    /// <summary>
    /// Returns the effective connection string: <see cref="ConnectionString"/> verbatim
    /// when set, otherwise a PostgreSQL URI assembled from the individual parts.
    /// Username and password are percent-encoded so credentials containing
    /// URI-reserved characters (':', '@', '/', etc.) no longer produce a malformed URI.
    /// </summary>
    public string GetConnectionString()
    {
        if (!string.IsNullOrEmpty(ConnectionString))
        {
            return ConnectionString;
        }

        var user = Uri.EscapeDataString(Username);
        var password = string.IsNullOrEmpty(Password)
            ? ""
            : $":{Uri.EscapeDataString(Password)}";
        return $"postgresql://{user}{password}@{Host}:{Port}/{Database}";
    }
}
|
||||
|
||||
/// <summary>
/// Settings for the ghidriff Python bridge.
/// </summary>
public sealed class GhidriffOptions
{
    /// <summary>
    /// Name of the configuration section these options bind to.
    /// </summary>
    public const string SectionName = "Ghidriff";

    /// <summary>
    /// Explicit Python executable path; when null, "python3" or "python" is resolved from PATH.
    /// </summary>
    public string? PythonPath { get; set; }

    /// <summary>
    /// Location of the ghidriff module when it is not pip-installed.
    /// </summary>
    public string? GhidriffModulePath { get; set; }

    /// <summary>
    /// When true, diff output includes decompilation by default.
    /// </summary>
    public bool DefaultIncludeDecompilation { get; set; } = true;

    /// <summary>
    /// When true, diff output includes disassembly by default.
    /// </summary>
    public bool DefaultIncludeDisassembly { get; set; } = true;

    /// <summary>
    /// Default per-operation ghidriff timeout, in seconds.
    /// </summary>
    public int DefaultTimeoutSeconds { get; set; } = 600;

    /// <summary>
    /// Directory where ghidriff writes its output.
    /// </summary>
    public string WorkDir { get; set; } = Path.Combine(Path.GetTempPath(), "stellaops-ghidriff");

    /// <summary>
    /// Master switch for the ghidriff integration.
    /// </summary>
    public bool Enabled { get; set; } = true;
}
|
||||
@@ -0,0 +1,285 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Implementation of <see cref="IBSimService"/> for BSim signature generation and querying.
/// </summary>
/// <remarks>
/// Signature generation derives features from the P-Code hash produced by Ghidra
/// analysis. Querying and ingestion against the BSim PostgreSQL database are
/// placeholders until the database setup task (GHID-011) lands.
/// </remarks>
public sealed class BSimService : IBSimService
{
    private readonly GhidraHeadlessManager _headlessManager;
    private readonly BSimOptions _options;
    private readonly GhidraOptions _ghidraOptions;
    private readonly ILogger<BSimService> _logger;

    /// <summary>
    /// Creates a new BSimService.
    /// </summary>
    /// <param name="headlessManager">The Ghidra Headless manager.</param>
    /// <param name="options">BSim options.</param>
    /// <param name="ghidraOptions">Ghidra options.</param>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException">Any argument is null.</exception>
    public BSimService(
        GhidraHeadlessManager headlessManager,
        IOptions<BSimOptions> options,
        IOptions<GhidraOptions> ghidraOptions,
        ILogger<BSimService> logger)
    {
        // Validate eagerly so misconfiguration fails at construction time,
        // consistent with GhidraDisassemblyPlugin's constructor.
        _headlessManager = headlessManager ?? throw new ArgumentNullException(nameof(headlessManager));
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(ghidraOptions);
        _options = options.Value;
        _ghidraOptions = ghidraOptions.Value;
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    /// <remarks>
    /// The work is CPU-bound and synchronous; a completed task is returned
    /// (avoids CS1998 from an async method with no awaits).
    /// </remarks>
    public Task<ImmutableArray<BSimSignature>> GenerateSignaturesAsync(
        GhidraAnalysisResult analysis,
        BSimGenerationOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(analysis);

        options ??= new BSimGenerationOptions();

        _logger.LogInformation(
            "Generating BSim signatures for {FunctionCount} functions",
            analysis.Functions.Length);

        // Filter functions based on options.
        var eligibleFunctions = analysis.Functions
            .Where(f => IsEligibleForBSim(f, options))
            .ToList();

        _logger.LogDebug(
            "Filtered to {EligibleCount} eligible functions (min size: {MinSize}, include thunks: {IncludeThunks})",
            eligibleFunctions.Count,
            options.MinFunctionSize,
            options.IncludeThunks);

        // For each eligible function, generate a BSim signature.
        // In a real implementation, this would use Ghidra's BSim feature extraction.
        var signatures = new List<BSimSignature>();

        foreach (var function in eligibleFunctions)
        {
            // Honor cancellation: large binaries can expose many thousands of functions.
            ct.ThrowIfCancellationRequested();

            var signature = GenerateSignatureFromFunction(function);
            if (signature is not null)
            {
                signatures.Add(signature);
            }
        }

        _logger.LogInformation(
            "Generated {SignatureCount} BSim signatures",
            signatures.Count);

        return Task.FromResult(signatures.ToImmutableArray());
    }

    /// <inheritdoc />
    /// <remarks>
    /// Placeholder: querying the BSim PostgreSQL database is a separate task,
    /// so this currently always yields an empty result set.
    /// </remarks>
    public Task<ImmutableArray<BSimMatch>> QueryAsync(
        BSimSignature signature,
        BSimQueryOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(signature);

        options ??= new BSimQueryOptions
        {
            MinSimilarity = _options.DefaultMinSimilarity,
            MaxResults = _options.DefaultMaxResults
        };

        if (!_options.Enabled)
        {
            _logger.LogWarning("BSim is not enabled, returning empty results");
            return Task.FromResult(ImmutableArray<BSimMatch>.Empty);
        }

        _logger.LogDebug(
            "Querying BSim for function: {FunctionName} (min similarity: {MinSimilarity})",
            signature.FunctionName,
            options.MinSimilarity);

        // In a real implementation, this would query the BSim PostgreSQL database.
        // For now, return empty results as BSim database setup is a separate task.
        return Task.FromResult(ImmutableArray<BSimMatch>.Empty);
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<BSimQueryResult>> QueryBatchAsync(
        ImmutableArray<BSimSignature> signatures,
        BSimQueryOptions? options = null,
        CancellationToken ct = default)
    {
        options ??= new BSimQueryOptions
        {
            MinSimilarity = _options.DefaultMinSimilarity,
            MaxResults = _options.DefaultMaxResults
        };

        if (!_options.Enabled)
        {
            _logger.LogWarning("BSim is not enabled, returning empty results");
            return signatures.Select(s => new BSimQueryResult(s, [])).ToImmutableArray();
        }

        _logger.LogDebug(
            "Batch querying BSim for {Count} signatures",
            signatures.Length);

        var results = new List<BSimQueryResult>();

        foreach (var signature in signatures)
        {
            ct.ThrowIfCancellationRequested();
            var matches = await QueryAsync(signature, options, ct);
            results.Add(new BSimQueryResult(signature, matches));
        }

        return [.. results];
    }

    /// <inheritdoc />
    /// <exception cref="BSimUnavailableException">BSim is not enabled.</exception>
    /// <exception cref="NotImplementedException">Always, until GHID-011 lands.</exception>
    public Task IngestAsync(
        string libraryName,
        string version,
        ImmutableArray<BSimSignature> signatures,
        CancellationToken ct = default)
    {
        ArgumentException.ThrowIfNullOrEmpty(libraryName);
        ArgumentException.ThrowIfNullOrEmpty(version);

        if (!_options.Enabled)
        {
            throw new BSimUnavailableException("BSim is not enabled");
        }

        _logger.LogInformation(
            "Ingesting {SignatureCount} signatures for {Library} v{Version}",
            signatures.Length,
            libraryName,
            version);

        // In a real implementation, this would insert into the BSim PostgreSQL database.
        // For now, throw as BSim database setup is a separate task.
        throw new NotImplementedException(
            "BSim ingestion requires BSim PostgreSQL database setup (GHID-011). " +
            "See docs/implplan/SPRINT_20260105_001_003_BINDEX_semdiff_ghidra.md");
    }

    /// <inheritdoc />
    public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
    {
        if (!_options.Enabled)
        {
            return false;
        }

        // Check if BSim database is accessible.
        // For now, just check if Ghidra is available since BSim requires it.
        return await _headlessManager.IsAvailableAsync(ct);
    }

    /// <summary>
    /// Determines whether a function should get a BSim signature given the
    /// generation options (thunk/import filters and a minimum size floor).
    /// </summary>
    private static bool IsEligibleForBSim(GhidraFunction function, BSimGenerationOptions options)
    {
        // Skip thunks unless explicitly included.
        if (function.IsThunk && !options.IncludeThunks)
        {
            return false;
        }

        // Skip external/imported functions unless explicitly included.
        if (function.IsExternal && !options.IncludeImports)
        {
            return false;
        }

        // Skip functions below minimum size.
        // Note: We use function size as a proxy; ideally we'd use instruction count
        // which would require parsing the function body.
        if (function.Size < options.MinFunctionSize * 4) // Rough estimate: ~4 bytes per instruction
        {
            return false;
        }

        return true;
    }

    /// <summary>
    /// Builds a BSim signature from a single function, or returns null when the
    /// function carries no P-Code hash (nothing meaningful to fingerprint).
    /// </summary>
    private BSimSignature? GenerateSignatureFromFunction(GhidraFunction function)
    {
        // In a real implementation, this would use Ghidra's BSim feature extraction
        // which analyzes P-Code to generate behavioral signatures.
        //
        // The signature captures:
        // - Data flow patterns
        // - Control flow structure
        // - Normalized constants
        // - API usage patterns

        // If we have a P-Code hash from Ghidra analysis, use it as the feature vector.
        if (function.PCodeHash is not null)
        {
            // Calculate self-significance based on function complexity.
            var selfSignificance = CalculateSelfSignificance(function);

            return new BSimSignature(
                function.Name,
                function.Address,
                function.PCodeHash,
                function.PCodeHash.Length,
                selfSignificance,
                EstimateInstructionCount(function.Size));
        }

        // If no P-Code hash, we can't generate a meaningful BSim signature.
        _logger.LogDebug(
            "Function {Name} has no P-Code hash, skipping BSim signature generation",
            function.Name);

        return null;
    }

    /// <summary>
    /// Scores how distinctive a function is (capped at 1.0); higher scores make
    /// for more useful identification signatures.
    /// </summary>
    private static double CalculateSelfSignificance(GhidraFunction function)
    {
        // Self-significance measures how distinctive a function is.
        // Higher values = more unique signature = better for identification.
        //
        // Factors that increase significance:
        // - More called functions (API usage)
        // - Larger size (more behavioral information)
        // - Fewer callers (not a common utility)

        var baseScore = 0.5;

        // Called functions increase significance (capped at +0.3).
        var callScore = Math.Min(function.CalledFunctions.Length * 0.1, 0.3);

        // Size increases significance with diminishing returns (capped at +0.15).
        var sizeScore = Math.Min(Math.Log10(Math.Max(function.Size, 1)) * 0.1, 0.15);

        // Many callers decrease significance (common utility functions).
        var callerPenalty = function.CallingFunctions.Length > 10 ? 0.1 : 0;

        return Math.Min(baseScore + callScore + sizeScore - callerPenalty, 1.0);
    }

    /// <summary>
    /// Estimates instruction count from byte size (~4 bytes/instruction), never below 1.
    /// </summary>
    private static int EstimateInstructionCount(int functionSize)
    {
        // Rough estimate: average 4 bytes per instruction for most architectures.
        return Math.Max(functionSize / 4, 1);
    }
}
|
||||
@@ -0,0 +1,540 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
/// Ghidra-based disassembly plugin providing broad architecture support as a fallback backend.
/// Ghidra is used for complex cases where B2R2 has limited coverage, supports 20+ architectures,
/// and provides mature decompilation, Version Tracking, and BSim capabilities.
/// </summary>
/// <remarks>
/// This plugin has lower priority than B2R2 since Ghidra requires external process invocation
/// (Java-based headless analysis) which is slower than native .NET disassembly. It serves as
/// the fallback when B2R2 returns low-confidence results or for architectures B2R2 handles poorly.
/// </remarks>
public sealed class GhidraDisassemblyPlugin : IDisassemblyPlugin, IDisposable
{
    /// <summary>
    /// Plugin identifier.
    /// </summary>
    public const string PluginId = "stellaops.disasm.ghidra";

    private readonly IGhidraService _ghidraService;
    private readonly GhidraOptions _options;
    private readonly ILogger<GhidraDisassemblyPlugin> _logger;
    private readonly TimeProvider _timeProvider;
    private bool _disposed;

    private static readonly DisassemblyCapabilities s_capabilities = new()
    {
        PluginId = PluginId,
        Name = "Ghidra Disassembler",
        Version = "11.x", // Ghidra 11.x
        SupportedArchitectures =
        [
            // All architectures supported by both B2R2 and Ghidra
            CpuArchitecture.X86,
            CpuArchitecture.X86_64,
            CpuArchitecture.ARM32,
            CpuArchitecture.ARM64,
            CpuArchitecture.MIPS32,
            CpuArchitecture.MIPS64,
            CpuArchitecture.RISCV64,
            CpuArchitecture.PPC32,
            CpuArchitecture.PPC64, // Ghidra supports PPC64 better than B2R2
            CpuArchitecture.SPARC,
            CpuArchitecture.SH4,
            CpuArchitecture.AVR,
            // Additional architectures Ghidra supports
            CpuArchitecture.WASM
        ],
        SupportedFormats =
        [
            BinaryFormat.ELF,
            BinaryFormat.PE,
            BinaryFormat.MachO,
            BinaryFormat.WASM,
            BinaryFormat.Raw
        ],
        SupportsLifting = true, // P-Code lifting
        SupportsCfgRecovery = true, // Full CFG recovery and decompilation
        Priority = 25 // Lower than B2R2 (50) - used as fallback
    };

    /// <summary>
    /// Creates a new Ghidra disassembly plugin.
    /// </summary>
    /// <param name="ghidraService">The Ghidra analysis service.</param>
    /// <param name="options">Ghidra options.</param>
    /// <param name="logger">Logger instance.</param>
    /// <param name="timeProvider">Time provider for timestamps.</param>
    public GhidraDisassemblyPlugin(
        IGhidraService ghidraService,
        IOptions<GhidraOptions> options,
        ILogger<GhidraDisassemblyPlugin> logger,
        TimeProvider timeProvider)
    {
        _ghidraService = ghidraService ?? throw new ArgumentNullException(nameof(ghidraService));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
        _timeProvider = timeProvider ?? throw new ArgumentNullException(nameof(timeProvider));
    }

    /// <inheritdoc />
    public DisassemblyCapabilities Capabilities => s_capabilities;

    /// <inheritdoc />
    public BinaryInfo LoadBinary(Stream stream, CpuArchitecture? archHint = null, BinaryFormat? formatHint = null)
    {
        ArgumentNullException.ThrowIfNull(stream);
        ObjectDisposedException.ThrowIf(_disposed, this);

        // Copy stream to memory for analysis.
        using var memStream = new MemoryStream();
        stream.CopyTo(memStream);
        return LoadBinary(memStream.ToArray(), archHint, formatHint);
    }

    /// <inheritdoc />
    public BinaryInfo LoadBinary(ReadOnlySpan<byte> bytes, CpuArchitecture? archHint = null, BinaryFormat? formatHint = null)
    {
        ObjectDisposedException.ThrowIf(_disposed, this);

        var byteArray = bytes.ToArray();
        _logger.LogDebug("Loading binary with Ghidra plugin (size: {Size} bytes)", byteArray.Length);

        // Run Ghidra analysis synchronously for IDisassemblyPlugin contract.
        var analysisTask = RunGhidraAnalysisAsync(byteArray, archHint, formatHint, CancellationToken.None);
        var result = analysisTask.GetAwaiter().GetResult();

        // Map Ghidra metadata to BinaryInfo.
        var format = MapFormat(result.Metadata.Format);
        var architecture = MapArchitecture(result.Metadata.Architecture, result.Metadata.AddressSize);
        var endianness = result.Metadata.Endianness.Equals("little", StringComparison.OrdinalIgnoreCase)
            ? Endianness.Little
            : Endianness.Big;
        var abi = DetectAbi(format);

        _logger.LogInformation(
            "Loaded binary with Ghidra: Format={Format}, Architecture={Architecture}, Processor={Processor}",
            format, architecture, result.Metadata.Processor);

        var metadata = new Dictionary<string, object>
        {
            ["size"] = byteArray.Length,
            ["ghidra_processor"] = result.Metadata.Processor,
            ["ghidra_version"] = result.Metadata.GhidraVersion,
            ["analysis_duration_ms"] = result.Metadata.AnalysisDuration.TotalMilliseconds,
            ["function_count"] = result.Functions.Length,
            ["import_count"] = result.Imports.Length,
            ["export_count"] = result.Exports.Length
        };

        if (result.Metadata.Compiler is not null)
        {
            metadata["compiler"] = result.Metadata.Compiler;
        }

        return new BinaryInfo(
            Format: format,
            Architecture: architecture,
            Bitness: result.Metadata.AddressSize,
            Endianness: endianness,
            Abi: abi,
            EntryPoint: result.Metadata.EntryPoint,
            BuildId: result.BinaryHash,
            Metadata: metadata,
            Handle: new GhidraBinaryHandle(result, byteArray));
    }

    /// <inheritdoc />
    public IEnumerable<CodeRegion> GetCodeRegions(BinaryInfo binary)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);
        var imageBase = handle.Result.Metadata.ImageBase;

        // Extract code regions from Ghidra memory blocks.
        foreach (var block in handle.Result.MemoryBlocks)
        {
            if (block.IsExecutable)
            {
                yield return new CodeRegion(
                    Name: block.Name,
                    VirtualAddress: block.Start,
                    // Guard against unsigned underflow for blocks that start
                    // below the image base (e.g. synthetic/overlay blocks).
                    FileOffset: block.Start >= imageBase ? block.Start - imageBase : 0,
                    Size: (ulong)block.Size,
                    IsExecutable: block.IsExecutable,
                    IsReadable: true, // Executable sections are readable
                    IsWritable: block.IsWritable);
            }
        }
    }

    /// <inheritdoc />
    public IEnumerable<SymbolInfo> GetSymbols(BinaryInfo binary)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        // Map functions to symbols.
        foreach (var func in handle.Result.Functions)
        {
            var binding = func.IsExternal ? SymbolBinding.Global : SymbolBinding.Local;

            yield return new SymbolInfo(
                Name: func.Name,
                Address: func.Address,
                Size: (ulong)func.Size,
                Type: SymbolType.Function,
                Binding: binding,
                Section: DetermineSection(handle.Result.MemoryBlocks, func.Address));
        }

        // Also include exports as symbols.
        foreach (var export in handle.Result.Exports)
        {
            yield return new SymbolInfo(
                Name: export.Name,
                Address: export.Address,
                Size: 0, // Unknown size for exports
                Type: SymbolType.Function,
                Binding: SymbolBinding.Global,
                Section: DetermineSection(handle.Result.MemoryBlocks, export.Address));
        }
    }

    /// <inheritdoc />
    public IEnumerable<DisassembledInstruction> Disassemble(BinaryInfo binary, CodeRegion region)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ArgumentNullException.ThrowIfNull(region);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        _logger.LogDebug(
            "Disassembling region {Name} from 0x{Start:X} to 0x{End:X}",
            region.Name, region.VirtualAddress, region.VirtualAddress + region.Size);

        // Find functions within the region and return their instructions.
        var regionEnd = region.VirtualAddress + region.Size;

        foreach (var func in handle.Result.Functions)
        {
            if (func.Address >= region.VirtualAddress && func.Address < regionEnd)
            {
                foreach (var instr in DisassembleFunctionInstructions(func, handle))
                {
                    if (instr.Address >= region.VirtualAddress && instr.Address < regionEnd)
                    {
                        yield return instr;
                    }
                }
            }
        }
    }

    /// <inheritdoc />
    public IEnumerable<DisassembledInstruction> Disassemble(BinaryInfo binary, ulong startAddress, ulong length)
    {
        var region = new CodeRegion(
            Name: $"0x{startAddress:X}",
            VirtualAddress: startAddress,
            FileOffset: startAddress,
            Size: length,
            IsExecutable: true,
            IsReadable: true,
            IsWritable: false);

        return Disassemble(binary, region);
    }

    /// <inheritdoc />
    public IEnumerable<DisassembledInstruction> DisassembleSymbol(BinaryInfo binary, SymbolInfo symbol)
    {
        ArgumentNullException.ThrowIfNull(binary);
        ArgumentNullException.ThrowIfNull(symbol);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var handle = GetHandle(binary);

        // Find the function matching the symbol (by address first, then by name).
        var func = handle.Result.Functions.FirstOrDefault(f =>
            f.Address == symbol.Address || f.Name.Equals(symbol.Name, StringComparison.Ordinal));

        if (func is null)
        {
            _logger.LogWarning(
                "Function not found for symbol {Name} at 0x{Address:X}",
                symbol.Name, symbol.Address);
            yield break;
        }

        foreach (var instr in DisassembleFunctionInstructions(func, handle))
        {
            yield return instr;
        }
    }

    #region Private Methods

    /// <summary>
    /// Writes the bytes to a temp file, runs Ghidra headless analysis on it,
    /// and deletes the temp file afterwards.
    /// </summary>
    private async Task<GhidraAnalysisResult> RunGhidraAnalysisAsync(
        byte[] bytes,
        CpuArchitecture? archHint,
        BinaryFormat? formatHint,
        CancellationToken ct)
    {
        // Write bytes to temp file.
        var tempPath = Path.Combine(
            _options.WorkDir,
            $"disasm_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");

        try
        {
            Directory.CreateDirectory(Path.GetDirectoryName(tempPath)!);
            await File.WriteAllBytesAsync(tempPath, bytes, ct);

            var options = new GhidraAnalysisOptions
            {
                RunFullAnalysis = true,
                ExtractStrings = false, // Not needed for disassembly
                ExtractFunctions = true,
                ExtractDecompilation = false, // Can be expensive
                TimeoutSeconds = _options.DefaultTimeoutSeconds
            };

            // Add architecture hint if provided.
            if (archHint.HasValue)
            {
                options = options with { ProcessorHint = MapToGhidraProcessor(archHint.Value) };
            }

            using var stream = File.OpenRead(tempPath);
            return await _ghidraService.AnalyzeAsync(stream, options, ct);
        }
        finally
        {
            TryDeleteFile(tempPath);
        }
    }

    /// <summary>
    /// Synthesizes placeholder instruction entries for a function; Ghidra full
    /// analysis yields function boundaries, not individual instructions.
    /// </summary>
    private static IEnumerable<DisassembledInstruction> DisassembleFunctionInstructions(
        GhidraFunction func,
        GhidraBinaryHandle handle)
    {
        // Ghidra full analysis provides function boundaries but not individual instructions.
        // We synthesize instruction info from the function's decompiled code or from the raw bytes.

        // For now, return a synthetic instruction representing the function entry.
        // A full implementation would require running a Ghidra script to export instructions.

        // Calculate approximate instruction count based on function size and average instruction size.
        // x86/x64 average instruction size is ~3-4 bytes.
        var avgInstructionSize = handle.Result.Metadata.AddressSize == 64 ? 4 : 3;
        var estimatedInstructions = Math.Max(1, func.Size / avgInstructionSize);

        var address = func.Address;
        for (var i = 0; i < estimatedInstructions && i < 1000; i++) // Cap at 1000 instructions
        {
            // Without actual Ghidra instruction export, we create placeholder entries.
            // Real implementation would parse Ghidra's instruction listing output.
            var rawBytes = ExtractBytes(handle.Bytes, address, handle.Result.Metadata.ImageBase, avgInstructionSize);

            yield return new DisassembledInstruction(
                Address: address,
                RawBytes: rawBytes,
                Mnemonic: "GHIDRA", // Placeholder - real impl would have actual mnemonics
                OperandsText: $"; function {func.Name} + 0x{address - func.Address:X}",
                Kind: i == 0 ? InstructionKind.Call : InstructionKind.Unknown,
                Operands: []);

            address += (ulong)avgInstructionSize;
            if (address >= func.Address + (ulong)func.Size)
            {
                break;
            }
        }
    }

    /// <summary>
    /// Extracts up to <paramref name="count"/> bytes at a virtual address,
    /// returning empty when the address falls outside the binary image.
    /// </summary>
    private static ImmutableArray<byte> ExtractBytes(byte[] binary, ulong address, ulong imageBase, int count)
    {
        // Addresses below the image base would wrap around in unsigned
        // subtraction and index garbage - treat them as out of range.
        if (address < imageBase)
        {
            return [];
        }

        var offset = address - imageBase;
        if (offset >= (ulong)binary.Length)
        {
            return [];
        }

        var available = Math.Min(count, binary.Length - (int)offset);
        return binary.AsSpan((int)offset, available).ToArray().ToImmutableArray();
    }

    /// <summary>
    /// Finds the name of the memory block containing an address, or null.
    /// </summary>
    private static string? DetermineSection(ImmutableArray<GhidraMemoryBlock> blocks, ulong address)
    {
        foreach (var block in blocks)
        {
            if (address >= block.Start && address < block.End)
            {
                return block.Name;
            }
        }
        return null;
    }

    /// <summary>
    /// Unwraps the Ghidra handle from a BinaryInfo produced by this plugin.
    /// </summary>
    /// <exception cref="ArgumentException">The binary was loaded by another plugin.</exception>
    private static GhidraBinaryHandle GetHandle(BinaryInfo binary)
    {
        if (binary.Handle is not GhidraBinaryHandle handle)
        {
            throw new ArgumentException("Invalid binary handle - not a Ghidra handle", nameof(binary));
        }
        return handle;
    }

    /// <summary>
    /// Maps Ghidra's format string to <see cref="BinaryFormat"/>.
    /// </summary>
    private static BinaryFormat MapFormat(string ghidraFormat)
    {
        return ghidraFormat.ToUpperInvariant() switch
        {
            "ELF" or "ELF32" or "ELF64" => BinaryFormat.ELF,
            "PE" or "PE32" or "PE64" or "COFF" => BinaryFormat.PE,
            "MACHO" or "MACH-O" or "MACHO32" or "MACHO64" => BinaryFormat.MachO,
            "WASM" or "WEBASSEMBLY" => BinaryFormat.WASM,
            "RAW" or "BINARY" => BinaryFormat.Raw,
            _ => BinaryFormat.Unknown
        };
    }

    /// <summary>
    /// Maps a Ghidra processor/language string (exact IDs first, then prefix
    /// fallbacks disambiguated by address size) to <see cref="CpuArchitecture"/>.
    /// </summary>
    private static CpuArchitecture MapArchitecture(string ghidraArch, int addressSize)
    {
        var arch = ghidraArch.ToUpperInvariant();
        return arch switch
        {
            // Intel x86/x64
            "X86" or "X86:LE:32:DEFAULT" => CpuArchitecture.X86,
            "X86-64" or "X86:LE:64:DEFAULT" or "AMD64" => CpuArchitecture.X86_64,
            var x when x.StartsWith("X86", StringComparison.Ordinal) && addressSize == 32 => CpuArchitecture.X86,
            var x when x.StartsWith("X86", StringComparison.Ordinal) => CpuArchitecture.X86_64,

            // ARM
            "ARM" or "ARM:LE:32:V7" or "ARM:LE:32:V8" or "ARMV7" => CpuArchitecture.ARM32,
            "AARCH64" or "ARM:LE:64:V8A" or "ARM64" => CpuArchitecture.ARM64,
            var a when a.StartsWith("ARM", StringComparison.Ordinal) && addressSize == 32 => CpuArchitecture.ARM32,
            var a when a.StartsWith("ARM", StringComparison.Ordinal) || a.StartsWith("AARCH", StringComparison.Ordinal) => CpuArchitecture.ARM64,

            // MIPS
            "MIPS" or "MIPS:BE:32:DEFAULT" or "MIPS:LE:32:DEFAULT" => CpuArchitecture.MIPS32,
            "MIPS64" or "MIPS:BE:64:DEFAULT" or "MIPS:LE:64:DEFAULT" => CpuArchitecture.MIPS64,
            var m when m.StartsWith("MIPS", StringComparison.Ordinal) && addressSize == 64 => CpuArchitecture.MIPS64,
            var m when m.StartsWith("MIPS", StringComparison.Ordinal) => CpuArchitecture.MIPS32,

            // RISC-V
            "RISCV" or "RISCV:LE:64:RV64" or "RISCV64" => CpuArchitecture.RISCV64,
            var r when r.StartsWith("RISCV", StringComparison.Ordinal) => CpuArchitecture.RISCV64,

            // PowerPC
            "PPC" or "POWERPC" or "PPC:BE:32:DEFAULT" => CpuArchitecture.PPC32,
            "PPC64" or "POWERPC64" or "PPC:BE:64:DEFAULT" => CpuArchitecture.PPC64,
            var p when p.StartsWith("PPC", StringComparison.Ordinal) && addressSize == 64 => CpuArchitecture.PPC64,
            var p when p.StartsWith("PPC", StringComparison.Ordinal) || p.StartsWith("POWERPC", StringComparison.Ordinal) => CpuArchitecture.PPC32,

            // SPARC
            "SPARC" or "SPARC:BE:32:DEFAULT" => CpuArchitecture.SPARC,
            var s when s.StartsWith("SPARC", StringComparison.Ordinal) => CpuArchitecture.SPARC,

            // SuperH
            "SH4" or "SUPERH" or "SH:LE:32:SH4" => CpuArchitecture.SH4,
            var s when s.StartsWith("SH", StringComparison.Ordinal) || s.StartsWith("SUPERH", StringComparison.Ordinal) => CpuArchitecture.SH4,

            // AVR
            "AVR" or "AVR8:LE:16:DEFAULT" => CpuArchitecture.AVR,
            var a when a.StartsWith("AVR", StringComparison.Ordinal) => CpuArchitecture.AVR,

            // WASM
            "WASM" or "WEBASSEMBLY" => CpuArchitecture.WASM,

            // EVM (Ethereum)
            "EVM" => CpuArchitecture.EVM,

            _ => CpuArchitecture.Unknown
        };
    }

    /// <summary>
    /// Maps a <see cref="CpuArchitecture"/> to the Ghidra language ID used as
    /// the headless processor hint; null when no mapping exists.
    /// </summary>
    private static string? MapToGhidraProcessor(CpuArchitecture arch)
    {
        return arch switch
        {
            CpuArchitecture.X86 => "x86:LE:32:default",
            CpuArchitecture.X86_64 => "x86:LE:64:default",
            CpuArchitecture.ARM32 => "ARM:LE:32:v7",
            CpuArchitecture.ARM64 => "AARCH64:LE:64:v8A",
            CpuArchitecture.MIPS32 => "MIPS:BE:32:default",
            CpuArchitecture.MIPS64 => "MIPS:BE:64:default",
            CpuArchitecture.RISCV64 => "RISCV:LE:64:RV64IC",
            CpuArchitecture.PPC32 => "PowerPC:BE:32:default",
            CpuArchitecture.PPC64 => "PowerPC:BE:64:default",
            CpuArchitecture.SPARC => "sparc:BE:32:default",
            CpuArchitecture.SH4 => "SuperH4:LE:32:default",
            CpuArchitecture.AVR => "avr8:LE:16:default",
            CpuArchitecture.WASM => "Wasm:LE:32:default",
            CpuArchitecture.EVM => "EVM:BE:256:default",
            _ => null
        };
    }

    /// <summary>
    /// Picks a coarse default ABI label from the binary format.
    /// </summary>
    private static string? DetectAbi(BinaryFormat format)
    {
        return format switch
        {
            BinaryFormat.ELF => "gnu",
            BinaryFormat.PE => "msvc",
            BinaryFormat.MachO => "darwin",
            _ => null
        };
    }

    /// <summary>
    /// Best-effort file deletion; cleanup failures are deliberately ignored.
    /// </summary>
    private static void TryDeleteFile(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch
        {
            // Ignore cleanup failures
        }
    }

    #endregion

    /// <summary>
    /// Disposes the plugin and releases resources.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }
        _disposed = true;
    }
}
|
||||
|
||||
/// <summary>
/// Internal handle for Ghidra-analyzed binaries.
/// </summary>
/// <param name="Result">The Ghidra analysis result.</param>
/// <param name="Bytes">The original binary bytes.</param>
/// <remarks>
/// NOTE(review): <paramref name="Bytes"/> is an array, so the record's generated
/// equality compares it by reference, not by content.
/// </remarks>
internal sealed record GhidraBinaryHandle(
    GhidraAnalysisResult Result,
    byte[] Bytes);
|
||||
@@ -0,0 +1,441 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
|
||||
/// Manages Ghidra Headless process lifecycle.
|
||||
/// Provides methods to run analysis with proper process isolation and cleanup.
|
||||
/// </summary>
|
||||
public sealed class GhidraHeadlessManager : IAsyncDisposable
|
||||
{
|
||||
private readonly GhidraOptions _options;
private readonly ILogger<GhidraHeadlessManager> _logger;

// Bounds the number of concurrently running Ghidra headless processes;
// initialized to MaxConcurrentInstances in the constructor.
private readonly SemaphoreSlim _semaphore;

// Guards public entry points via ObjectDisposedException.ThrowIf;
// presumably set during async disposal (class implements IAsyncDisposable).
private bool _disposed;
|
||||
|
||||
/// <summary>
/// Creates a new GhidraHeadlessManager.
/// </summary>
/// <param name="options">Ghidra configuration options.</param>
/// <param name="logger">Logger instance.</param>
/// <exception cref="ArgumentNullException"><paramref name="options"/> or <paramref name="logger"/> is null.</exception>
public GhidraHeadlessManager(
    IOptions<GhidraOptions> options,
    ILogger<GhidraHeadlessManager> logger)
{
    // Validate eagerly, consistent with GhidraDisassemblyPlugin's constructor,
    // so DI misconfiguration fails here instead of as a NullReferenceException later.
    ArgumentNullException.ThrowIfNull(options);
    _options = options.Value;
    _logger = logger ?? throw new ArgumentNullException(nameof(logger));

    // Each headless Ghidra instance is a heavyweight JVM; cap how many run at once.
    _semaphore = new SemaphoreSlim(_options.MaxConcurrentInstances, _options.MaxConcurrentInstances);

    EnsureWorkDirectoryExists();
}
|
||||
|
||||
/// <summary>
/// Runs Ghidra analysis on a binary.
/// </summary>
/// <param name="binaryPath">Absolute path to the binary file.</param>
/// <param name="scriptName">Name of the post-analysis script to run.</param>
/// <param name="scriptArgs">Arguments to pass to the script.</param>
/// <param name="runAnalysis">Whether to run full auto-analysis.</param>
/// <param name="timeoutSeconds">Timeout in seconds (0 = use default).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>The Ghidra process result (exit code and captured output).</returns>
/// <exception cref="FileNotFoundException">The binary file does not exist.</exception>
public async Task<GhidraProcessResult> RunAnalysisAsync(
    string binaryPath,
    string? scriptName = null,
    string[]? scriptArgs = null,
    bool runAnalysis = true,
    int timeoutSeconds = 0,
    CancellationToken ct = default)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    if (!File.Exists(binaryPath))
    {
        throw new FileNotFoundException("Binary file not found", binaryPath);
    }

    // Zero (or negative) means "use the configured default timeout".
    var effectiveTimeout = timeoutSeconds > 0 ? timeoutSeconds : _options.DefaultTimeoutSeconds;

    // Take a slot so at most MaxConcurrentInstances Ghidra processes run at once.
    await _semaphore.WaitAsync(ct);
    try
    {
        // Each run gets an isolated throwaway Ghidra project directory.
        var projectDir = CreateTempProjectDirectory();
        try
        {
            var args = BuildAnalyzeArgs(projectDir, binaryPath, scriptName, scriptArgs, runAnalysis);
            return await RunGhidraAsync(args, effectiveTimeout, ct);
        }
        finally
        {
            // Inner finally: the temp project is cleaned up (when configured)
            // even on failure, before the semaphore slot is released below.
            if (_options.CleanupTempProjects)
            {
                CleanupProjectDirectory(projectDir);
            }
        }
    }
    finally
    {
        _semaphore.Release();
    }
}
|
||||
|
||||
/// <summary>
/// Runs a Ghidra script on an existing project.
/// </summary>
/// <param name="projectDir">Path to the Ghidra project directory.</param>
/// <param name="projectName">Name of the Ghidra project.</param>
/// <param name="scriptName">Name of the script to run.</param>
/// <param name="scriptArgs">Arguments to pass to the script.</param>
/// <param name="timeoutSeconds">Timeout in seconds (0 = use default).</param>
/// <param name="ct">Cancellation token.</param>
/// <returns>The Ghidra process result.</returns>
/// <exception cref="DirectoryNotFoundException">The project directory does not exist.</exception>
public async Task<GhidraProcessResult> RunScriptAsync(
    string projectDir,
    string projectName,
    string scriptName,
    string[]? scriptArgs = null,
    int timeoutSeconds = 0,
    CancellationToken ct = default)
{
    ObjectDisposedException.ThrowIf(_disposed, this);

    if (!Directory.Exists(projectDir))
    {
        throw new DirectoryNotFoundException($"Project directory not found: {projectDir}");
    }

    // A non-positive timeout falls back to the configured default.
    var timeout = timeoutSeconds > 0 ? timeoutSeconds : _options.DefaultTimeoutSeconds;

    // Serialize process launches so we never exceed MaxConcurrentInstances.
    await _semaphore.WaitAsync(ct);
    try
    {
        var invocation = BuildScriptArgs(projectDir, projectName, scriptName, scriptArgs);
        return await RunGhidraAsync(invocation, timeout, ct);
    }
    finally
    {
        _semaphore.Release();
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// Checks if Ghidra is available and properly configured.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>True if Ghidra is available.</returns>
|
||||
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
|
||||
{
|
||||
try
|
||||
{
|
||||
var executablePath = GetAnalyzeHeadlessPath();
|
||||
if (!File.Exists(executablePath))
|
||||
{
|
||||
_logger.LogDebug("Ghidra analyzeHeadless not found at: {Path}", executablePath);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Quick version check to verify Java is working
|
||||
var result = await RunGhidraAsync(["--help"], timeoutSeconds: 30, ct);
|
||||
return result.ExitCode == 0 || result.StandardOutput.Contains("analyzeHeadless", StringComparison.OrdinalIgnoreCase);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Ghidra availability check failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets Ghidra version information.
|
||||
/// </summary>
|
||||
/// <param name="ct">Cancellation token.</param>
|
||||
/// <returns>Version string.</returns>
|
||||
public async Task<string> GetVersionAsync(CancellationToken ct = default)
|
||||
{
|
||||
var result = await RunGhidraAsync(["--help"], timeoutSeconds: 30, ct);
|
||||
|
||||
// Parse version from output - typically starts with "Ghidra X.Y"
|
||||
var lines = result.StandardOutput.Split('\n', StringSplitOptions.RemoveEmptyEntries);
|
||||
foreach (var line in lines)
|
||||
{
|
||||
if (line.Contains("Ghidra", StringComparison.OrdinalIgnoreCase) &&
|
||||
char.IsDigit(line.FirstOrDefault(c => char.IsDigit(c))))
|
||||
{
|
||||
return line.Trim();
|
||||
}
|
||||
}
|
||||
|
||||
return "Unknown";
|
||||
}
|
||||
|
||||
private string CreateTempProjectDirectory()
|
||||
{
|
||||
var projectDir = Path.Combine(
|
||||
_options.WorkDir,
|
||||
$"project_{DateTime.UtcNow:yyyyMMddHHmmssfff}_{Guid.NewGuid():N}");
|
||||
|
||||
Directory.CreateDirectory(projectDir);
|
||||
_logger.LogDebug("Created temp project directory: {Path}", projectDir);
|
||||
return projectDir;
|
||||
}
|
||||
|
||||
private void CleanupProjectDirectory(string projectDir)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (Directory.Exists(projectDir))
|
||||
{
|
||||
Directory.Delete(projectDir, recursive: true);
|
||||
_logger.LogDebug("Cleaned up project directory: {Path}", projectDir);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to cleanup project directory: {Path}", projectDir);
|
||||
}
|
||||
}
|
||||
|
||||
private void EnsureWorkDirectoryExists()
|
||||
{
|
||||
if (!Directory.Exists(_options.WorkDir))
|
||||
{
|
||||
Directory.CreateDirectory(_options.WorkDir);
|
||||
_logger.LogInformation("Created Ghidra work directory: {Path}", _options.WorkDir);
|
||||
}
|
||||
}
|
||||
|
||||
private string[] BuildAnalyzeArgs(
|
||||
string projectDir,
|
||||
string binaryPath,
|
||||
string? scriptName,
|
||||
string[]? scriptArgs,
|
||||
bool runAnalysis)
|
||||
{
|
||||
var args = new List<string>
|
||||
{
|
||||
projectDir,
|
||||
"TempProject",
|
||||
"-import", binaryPath
|
||||
};
|
||||
|
||||
if (!runAnalysis)
|
||||
{
|
||||
args.Add("-noanalysis");
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(scriptName))
|
||||
{
|
||||
args.AddRange(["-postScript", scriptName]);
|
||||
|
||||
if (scriptArgs is { Length: > 0 })
|
||||
{
|
||||
args.AddRange(scriptArgs);
|
||||
}
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(_options.ScriptsDir))
|
||||
{
|
||||
args.AddRange(["-scriptPath", _options.ScriptsDir]);
|
||||
}
|
||||
|
||||
args.AddRange(["-max-cpu", _options.MaxCpu.ToString(CultureInfo.InvariantCulture)]);
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
private static string[] BuildScriptArgs(
|
||||
string projectDir,
|
||||
string projectName,
|
||||
string scriptName,
|
||||
string[]? scriptArgs)
|
||||
{
|
||||
var args = new List<string>
|
||||
{
|
||||
projectDir,
|
||||
projectName,
|
||||
"-postScript", scriptName
|
||||
};
|
||||
|
||||
if (scriptArgs is { Length: > 0 })
|
||||
{
|
||||
args.AddRange(scriptArgs);
|
||||
}
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
    /// <summary>
    /// Launches analyzeHeadless with the given arguments, captures stdout/stderr,
    /// and enforces a hard timeout by killing the whole process tree.
    /// </summary>
    /// <param name="args">Raw CLI arguments; each is quoted before joining.</param>
    /// <param name="timeoutSeconds">Hard wall-clock limit for the run.</param>
    /// <param name="ct">Caller cancellation token.</param>
    /// <returns>Exit code, captured output, and elapsed time.</returns>
    /// <exception cref="GhidraException">Thrown when the process fails to start.</exception>
    /// <exception cref="GhidraTimeoutException">Thrown when the run exceeds the timeout.</exception>
    private async Task<GhidraProcessResult> RunGhidraAsync(
        string[] args,
        int timeoutSeconds,
        CancellationToken ct)
    {
        var executablePath = GetAnalyzeHeadlessPath();

        var startInfo = new ProcessStartInfo
        {
            FileName = executablePath,
            // Arguments is a single command line, so each element is quoted defensively.
            Arguments = string.Join(" ", args.Select(QuoteArg)),
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true,
            StandardOutputEncoding = Encoding.UTF8,
            StandardErrorEncoding = Encoding.UTF8
        };

        ConfigureEnvironment(startInfo);

        _logger.LogDebug("Starting Ghidra: {Command} {Args}", executablePath, startInfo.Arguments);

        var stopwatch = Stopwatch.StartNew();
        using var process = new Process { StartInfo = startInfo };

        // Collect output via async events to avoid pipe-buffer deadlocks on chatty runs.
        var stdoutBuilder = new StringBuilder();
        var stderrBuilder = new StringBuilder();

        process.OutputDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                stdoutBuilder.AppendLine(e.Data);
            }
        };

        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                stderrBuilder.AppendLine(e.Data);
            }
        };

        if (!process.Start())
        {
            throw new GhidraException("Failed to start Ghidra process");
        }

        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        // Timeout and caller cancellation share one linked token.
        using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
        using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);

        try
        {
            await process.WaitForExitAsync(linkedCts.Token);
        }
        catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
        {
            // Only the timeout path kills the process tree here.
            // NOTE(review): caller cancellation (ct) propagates without killing the
            // child, leaving the Ghidra process running — confirm that is intended.
            try
            {
                process.Kill(entireProcessTree: true);
            }
            catch
            {
                // Best effort kill
            }

            throw new GhidraTimeoutException(timeoutSeconds);
        }

        stopwatch.Stop();

        var stdout = stdoutBuilder.ToString();
        var stderr = stderrBuilder.ToString();

        _logger.LogDebug(
            "Ghidra completed with exit code {ExitCode} in {Duration}ms",
            process.ExitCode,
            stopwatch.ElapsedMilliseconds);

        if (process.ExitCode != 0)
        {
            // Failures are surfaced via the result's exit code, not an exception; log only.
            _logger.LogWarning("Ghidra failed: {Error}", stderr);
        }

        return new GhidraProcessResult(
            process.ExitCode,
            stdout,
            stderr,
            stopwatch.Elapsed);
    }
|
||||
|
||||
private string GetAnalyzeHeadlessPath()
|
||||
{
|
||||
var basePath = Path.Combine(_options.GhidraHome, "support");
|
||||
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
return Path.Combine(basePath, "analyzeHeadless.bat");
|
||||
}
|
||||
|
||||
return Path.Combine(basePath, "analyzeHeadless");
|
||||
}
|
||||
|
||||
private void ConfigureEnvironment(ProcessStartInfo startInfo)
|
||||
{
|
||||
if (!string.IsNullOrEmpty(_options.JavaHome))
|
||||
{
|
||||
startInfo.EnvironmentVariables["JAVA_HOME"] = _options.JavaHome;
|
||||
}
|
||||
|
||||
startInfo.EnvironmentVariables["MAXMEM"] = _options.MaxMemory;
|
||||
startInfo.EnvironmentVariables["GHIDRA_HOME"] = _options.GhidraHome;
|
||||
}
|
||||
|
||||
private static string QuoteArg(string arg)
|
||||
{
|
||||
if (arg.Contains(' ', StringComparison.Ordinal) || arg.Contains('"', StringComparison.Ordinal))
|
||||
{
|
||||
return $"\"{arg.Replace("\"", "\\\"")}\"";
|
||||
}
|
||||
|
||||
return arg;
|
||||
}
|
||||
|
||||
    /// <inheritdoc />
    /// <remarks>
    /// Sets <c>_disposed</c> first so the public entry points (which call
    /// <c>ObjectDisposedException.ThrowIf</c>) reject new work, then drains the
    /// semaphore by acquiring every slot — this blocks until in-flight runs have
    /// released theirs. Assumes the semaphore was created with
    /// <c>MaxConcurrentInstances</c> slots — confirm at construction.
    /// NOTE(review): the drain awaits without a cancellation token, so a hung
    /// Ghidra run would stall disposal indefinitely.
    /// </remarks>
    public async ValueTask DisposeAsync()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;

        // Wait for any in-flight operations to complete
        for (var i = 0; i < _options.MaxConcurrentInstances; i++)
        {
            await _semaphore.WaitAsync();
        }

        _semaphore.Dispose();
    }
|
||||
}
|
||||
|
||||
/// <summary>
/// Result of a Ghidra process execution.
/// </summary>
/// <param name="ExitCode">Process exit code.</param>
/// <param name="StandardOutput">Standard output content.</param>
/// <param name="StandardError">Standard error content.</param>
/// <param name="Duration">Execution duration.</param>
public sealed record GhidraProcessResult(
    int ExitCode,
    string StandardOutput,
    string StandardError,
    TimeSpan Duration)
{
    /// <summary>
    /// Whether the process completed successfully (exit code 0).
    /// </summary>
    public bool IsSuccess
    {
        get { return ExitCode == 0; }
    }
}
|
||||
@@ -0,0 +1,511 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
|
||||
/// Implementation of <see cref="IGhidraService"/> using Ghidra Headless analysis.
|
||||
/// </summary>
|
||||
public sealed class GhidraService : IGhidraService, IAsyncDisposable
|
||||
{
|
||||
private static readonly JsonSerializerOptions JsonOptions = new()
|
||||
{
|
||||
PropertyNameCaseInsensitive = true,
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
|
||||
};
|
||||
|
||||
private readonly GhidraHeadlessManager _headlessManager;
|
||||
private readonly GhidraOptions _options;
|
||||
private readonly ILogger<GhidraService> _logger;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new GhidraService.
|
||||
/// </summary>
|
||||
/// <param name="headlessManager">The Ghidra Headless manager.</param>
|
||||
/// <param name="options">Ghidra options.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="timeProvider">Time provider for timestamps.</param>
|
||||
public GhidraService(
|
||||
GhidraHeadlessManager headlessManager,
|
||||
IOptions<GhidraOptions> options,
|
||||
ILogger<GhidraService> logger,
|
||||
TimeProvider timeProvider)
|
||||
{
|
||||
_headlessManager = headlessManager;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
_timeProvider = timeProvider;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<GhidraAnalysisResult> AnalyzeAsync(
|
||||
Stream binaryStream,
|
||||
GhidraAnalysisOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(binaryStream);
|
||||
|
||||
// Write stream to temp file
|
||||
var tempPath = Path.Combine(
|
||||
_options.WorkDir,
|
||||
$"binary_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
|
||||
|
||||
try
|
||||
{
|
||||
Directory.CreateDirectory(Path.GetDirectoryName(tempPath)!);
|
||||
|
||||
await using (var fileStream = File.Create(tempPath))
|
||||
{
|
||||
await binaryStream.CopyToAsync(fileStream, ct);
|
||||
}
|
||||
|
||||
return await AnalyzeAsync(tempPath, options, ct);
|
||||
}
|
||||
finally
|
||||
{
|
||||
TryDeleteFile(tempPath);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<GhidraAnalysisResult> AnalyzeAsync(
|
||||
string binaryPath,
|
||||
GhidraAnalysisOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrEmpty(binaryPath);
|
||||
|
||||
if (!File.Exists(binaryPath))
|
||||
{
|
||||
throw new FileNotFoundException("Binary file not found", binaryPath);
|
||||
}
|
||||
|
||||
options ??= new GhidraAnalysisOptions();
|
||||
|
||||
_logger.LogInformation("Starting Ghidra analysis of: {BinaryPath}", binaryPath);
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
// Calculate binary hash
|
||||
var binaryHash = await ComputeBinaryHashAsync(binaryPath, ct);
|
||||
|
||||
// Run analysis with JSON export script
|
||||
var result = await _headlessManager.RunAnalysisAsync(
|
||||
binaryPath,
|
||||
scriptName: "ExportToJson.java",
|
||||
scriptArgs: BuildScriptArgs(options),
|
||||
runAnalysis: options.RunFullAnalysis,
|
||||
timeoutSeconds: options.TimeoutSeconds,
|
||||
ct);
|
||||
|
||||
if (!result.IsSuccess)
|
||||
{
|
||||
throw new GhidraException($"Ghidra analysis failed: {result.StandardError}")
|
||||
{
|
||||
ExitCode = result.ExitCode,
|
||||
StandardError = result.StandardError,
|
||||
StandardOutput = result.StandardOutput
|
||||
};
|
||||
}
|
||||
|
||||
var analysisResult = ParseAnalysisOutput(
|
||||
result.StandardOutput,
|
||||
binaryPath,
|
||||
binaryHash,
|
||||
startTime,
|
||||
result.Duration);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Ghidra analysis completed: {FunctionCount} functions found in {Duration}ms",
|
||||
analysisResult.Functions.Length,
|
||||
result.Duration.TotalMilliseconds);
|
||||
|
||||
return analysisResult;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
|
||||
{
|
||||
if (!_options.Enabled)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return await _headlessManager.IsAvailableAsync(ct);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<GhidraInfo> GetInfoAsync(CancellationToken ct = default)
|
||||
{
|
||||
var version = await _headlessManager.GetVersionAsync(ct);
|
||||
|
||||
// Get Java version
|
||||
var javaVersion = GetJavaVersion();
|
||||
|
||||
// Get available processor languages
|
||||
var processors = GetAvailableProcessors();
|
||||
|
||||
return new GhidraInfo(
|
||||
version,
|
||||
javaVersion,
|
||||
processors,
|
||||
_options.GhidraHome);
|
||||
}
|
||||
|
||||
    /// <inheritdoc />
    /// <remarks>
    /// Disposal is delegated to the headless manager, which owns the underlying
    /// Ghidra process management.
    /// </remarks>
    public async ValueTask DisposeAsync()
    {
        await _headlessManager.DisposeAsync();
    }
|
||||
|
||||
private static string[] BuildScriptArgs(GhidraAnalysisOptions options)
|
||||
{
|
||||
var args = new List<string>();
|
||||
|
||||
if (options.IncludeDecompilation)
|
||||
{
|
||||
args.Add("-decompile");
|
||||
}
|
||||
|
||||
if (options.GeneratePCodeHashes)
|
||||
{
|
||||
args.Add("-pcode-hash");
|
||||
}
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
private GhidraAnalysisResult ParseAnalysisOutput(
|
||||
string output,
|
||||
string binaryPath,
|
||||
string binaryHash,
|
||||
DateTimeOffset startTime,
|
||||
TimeSpan duration)
|
||||
{
|
||||
// Look for JSON output marker in stdout
|
||||
const string jsonMarker = "###GHIDRA_JSON_OUTPUT###";
|
||||
var jsonStart = output.IndexOf(jsonMarker, StringComparison.Ordinal);
|
||||
|
||||
if (jsonStart >= 0)
|
||||
{
|
||||
var jsonContent = output[(jsonStart + jsonMarker.Length)..].Trim();
|
||||
var jsonEnd = jsonContent.IndexOf("###END_GHIDRA_JSON_OUTPUT###", StringComparison.Ordinal);
|
||||
if (jsonEnd >= 0)
|
||||
{
|
||||
jsonContent = jsonContent[..jsonEnd].Trim();
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
return ParseJsonOutput(jsonContent, binaryHash, startTime, duration);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to parse Ghidra JSON output, falling back to text parsing");
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: parse text output
|
||||
return ParseTextOutput(output, binaryPath, binaryHash, startTime, duration);
|
||||
}
|
||||
|
||||
    /// <summary>
    /// Maps the deserialized JSON payload emitted by the ExportToJson script onto
    /// the Ghidra result records. Missing optional sections become empty arrays.
    /// </summary>
    /// <param name="json">Raw JSON text extracted from stdout.</param>
    /// <param name="binaryHash">SHA-256 hex digest of the analyzed binary.</param>
    /// <param name="startTime">When the analysis started.</param>
    /// <param name="duration">How long the Ghidra run took.</param>
    /// <exception cref="GhidraException">Thrown when the payload deserializes to null.</exception>
    private GhidraAnalysisResult ParseJsonOutput(
        string json,
        string binaryHash,
        DateTimeOffset startTime,
        TimeSpan duration)
    {
        var data = JsonSerializer.Deserialize<GhidraJsonOutput>(json, JsonOptions)
            ?? throw new GhidraException("Failed to deserialize Ghidra JSON output");

        // Absent names fall back to "unknown"; addresses are parsed from hex strings.
        var functions = data.Functions?.Select(f => new GhidraFunction(
            f.Name ?? "unknown",
            ParseAddress(f.Address),
            f.Size,
            f.Signature,
            f.DecompiledCode,
            // PCodeHash travels as a hex string and is decoded to raw bytes here.
            f.PCodeHash is not null ? Convert.FromHexString(f.PCodeHash) : null,
            f.CalledFunctions?.ToImmutableArray() ?? [],
            f.CallingFunctions?.ToImmutableArray() ?? [],
            f.IsThunk,
            f.IsExternal
        )).ToImmutableArray() ?? [];

        var imports = data.Imports?.Select(i => new GhidraImport(
            i.Name ?? "unknown",
            ParseAddress(i.Address),
            i.LibraryName,
            i.Ordinal
        )).ToImmutableArray() ?? [];

        var exports = data.Exports?.Select(e => new GhidraExport(
            e.Name ?? "unknown",
            ParseAddress(e.Address),
            e.Ordinal
        )).ToImmutableArray() ?? [];

        var strings = data.Strings?.Select(s => new GhidraString(
            s.Value ?? "",
            ParseAddress(s.Address),
            s.Length,
            s.Encoding ?? "ASCII"
        )).ToImmutableArray() ?? [];

        var memoryBlocks = data.MemoryBlocks?.Select(m => new GhidraMemoryBlock(
            m.Name ?? "unknown",
            ParseAddress(m.Start),
            ParseAddress(m.End),
            m.Size,
            m.IsExecutable,
            m.IsWritable,
            m.IsInitialized
        )).ToImmutableArray() ?? [];

        // Metadata defaults when unspecified: little-endian, 64-bit addressing.
        var metadata = new GhidraMetadata(
            data.Metadata?.FileName ?? "unknown",
            data.Metadata?.Format ?? "unknown",
            data.Metadata?.Architecture ?? "unknown",
            data.Metadata?.Processor ?? "unknown",
            data.Metadata?.Compiler,
            data.Metadata?.Endianness ?? "little",
            data.Metadata?.AddressSize ?? 64,
            ParseAddress(data.Metadata?.ImageBase),
            data.Metadata?.EntryPoint is not null ? ParseAddress(data.Metadata.EntryPoint) : null,
            startTime,
            data.Metadata?.GhidraVersion ?? "unknown",
            duration);

        return new GhidraAnalysisResult(
            binaryHash,
            functions,
            imports,
            exports,
            strings,
            memoryBlocks,
            metadata);
    }
|
||||
|
||||
private GhidraAnalysisResult ParseTextOutput(
|
||||
string output,
|
||||
string binaryPath,
|
||||
string binaryHash,
|
||||
DateTimeOffset startTime,
|
||||
TimeSpan duration)
|
||||
{
|
||||
// Basic text parsing for when JSON export is not available
|
||||
// This extracts minimal information from Ghidra log output
|
||||
|
||||
var functions = ImmutableArray<GhidraFunction>.Empty;
|
||||
var imports = ImmutableArray<GhidraImport>.Empty;
|
||||
var exports = ImmutableArray<GhidraExport>.Empty;
|
||||
var strings = ImmutableArray<GhidraString>.Empty;
|
||||
var memoryBlocks = ImmutableArray<GhidraMemoryBlock>.Empty;
|
||||
|
||||
// Parse function count from output like "Total functions: 123"
|
||||
var functionCountMatch = System.Text.RegularExpressions.Regex.Match(
|
||||
output,
|
||||
@"(?:Total functions|Functions found|functions):\s*(\d+)",
|
||||
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
|
||||
|
||||
var metadata = new GhidraMetadata(
|
||||
Path.GetFileName(binaryPath),
|
||||
"unknown",
|
||||
"unknown",
|
||||
"unknown",
|
||||
null,
|
||||
"little",
|
||||
64,
|
||||
0,
|
||||
null,
|
||||
startTime,
|
||||
"unknown",
|
||||
duration);
|
||||
|
||||
_logger.LogDebug(
|
||||
"Parsed Ghidra text output: estimated {Count} functions",
|
||||
functionCountMatch.Success ? functionCountMatch.Groups[1].Value : "unknown");
|
||||
|
||||
return new GhidraAnalysisResult(
|
||||
binaryHash,
|
||||
functions,
|
||||
imports,
|
||||
exports,
|
||||
strings,
|
||||
memoryBlocks,
|
||||
metadata);
|
||||
}
|
||||
|
||||
private static ulong ParseAddress(string? address)
|
||||
{
|
||||
if (string.IsNullOrEmpty(address))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Handle hex format (0x...) or plain hex
|
||||
if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
address = address[2..];
|
||||
}
|
||||
|
||||
return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
|
||||
? result
|
||||
: 0;
|
||||
}
|
||||
|
||||
private static async Task<string> ComputeBinaryHashAsync(string path, CancellationToken ct)
|
||||
{
|
||||
await using var stream = File.OpenRead(path);
|
||||
var hash = await SHA256.HashDataAsync(stream, ct);
|
||||
return Convert.ToHexStringLower(hash);
|
||||
}
|
||||
|
||||
private string GetJavaVersion()
|
||||
{
|
||||
try
|
||||
{
|
||||
var javaHome = _options.JavaHome ?? Environment.GetEnvironmentVariable("JAVA_HOME");
|
||||
if (string.IsNullOrEmpty(javaHome))
|
||||
{
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
var releaseFile = Path.Combine(javaHome, "release");
|
||||
if (File.Exists(releaseFile))
|
||||
{
|
||||
var content = File.ReadAllText(releaseFile);
|
||||
var match = System.Text.RegularExpressions.Regex.Match(
|
||||
content,
|
||||
@"JAVA_VERSION=""?([^""\r\n]+)""?");
|
||||
|
||||
if (match.Success)
|
||||
{
|
||||
return match.Groups[1].Value;
|
||||
}
|
||||
}
|
||||
|
||||
return "unknown";
|
||||
}
|
||||
catch
|
||||
{
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
private ImmutableArray<string> GetAvailableProcessors()
|
||||
{
|
||||
try
|
||||
{
|
||||
var processorsDir = Path.Combine(_options.GhidraHome, "Ghidra", "Processors");
|
||||
if (!Directory.Exists(processorsDir))
|
||||
{
|
||||
return [];
|
||||
}
|
||||
|
||||
return Directory.GetDirectories(processorsDir)
|
||||
.Select(Path.GetFileName)
|
||||
.Where(name => !string.IsNullOrEmpty(name))
|
||||
.Order(StringComparer.OrdinalIgnoreCase)
|
||||
.ToImmutableArray()!;
|
||||
}
|
||||
catch
|
||||
{
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
private void TryDeleteFile(string path)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (File.Exists(path))
|
||||
{
|
||||
File.Delete(path);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
|
||||
}
|
||||
}
|
||||
|
||||
    // JSON DTOs for deserialization
    // These mirror the JSON document produced by the ExportToJson script; every
    // member is nullable/optional so partial exports still deserialize cleanly.

    /// <summary>Root document of the export script's JSON payload.</summary>
    private sealed record GhidraJsonOutput
    {
        public List<GhidraFunctionJson>? Functions { get; init; }
        public List<GhidraImportJson>? Imports { get; init; }
        public List<GhidraExportJson>? Exports { get; init; }
        public List<GhidraStringJson>? Strings { get; init; }
        public List<GhidraMemoryBlockJson>? MemoryBlocks { get; init; }
        public GhidraMetadataJson? Metadata { get; init; }
    }

    /// <summary>One analyzed function; addresses are hex strings.</summary>
    private sealed record GhidraFunctionJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public int Size { get; init; }
        public string? Signature { get; init; }
        public string? DecompiledCode { get; init; }
        // Hex-encoded hash; decoded with Convert.FromHexString during mapping.
        public string? PCodeHash { get; init; }
        public List<string>? CalledFunctions { get; init; }
        public List<string>? CallingFunctions { get; init; }
        public bool IsThunk { get; init; }
        public bool IsExternal { get; init; }
    }

    /// <summary>Imported symbol reference.</summary>
    private sealed record GhidraImportJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public string? LibraryName { get; init; }
        public int? Ordinal { get; init; }
    }

    /// <summary>Exported symbol reference.</summary>
    private sealed record GhidraExportJson
    {
        public string? Name { get; init; }
        public string? Address { get; init; }
        public int? Ordinal { get; init; }
    }

    /// <summary>String literal discovered in the binary.</summary>
    private sealed record GhidraStringJson
    {
        public string? Value { get; init; }
        public string? Address { get; init; }
        public int Length { get; init; }
        public string? Encoding { get; init; }
    }

    /// <summary>Memory block / section descriptor.</summary>
    private sealed record GhidraMemoryBlockJson
    {
        public string? Name { get; init; }
        public string? Start { get; init; }
        public string? End { get; init; }
        public long Size { get; init; }
        public bool IsExecutable { get; init; }
        public bool IsWritable { get; init; }
        public bool IsInitialized { get; init; }
    }

    /// <summary>Binary-level metadata reported by Ghidra.</summary>
    private sealed record GhidraMetadataJson
    {
        public string? FileName { get; init; }
        public string? Format { get; init; }
        public string? Architecture { get; init; }
        public string? Processor { get; init; }
        public string? Compiler { get; init; }
        public string? Endianness { get; init; }
        public int AddressSize { get; init; }
        public string? ImageBase { get; init; }
        public string? EntryPoint { get; init; }
        public string? GhidraVersion { get; init; }
    }
|
||||
}
|
||||
@@ -0,0 +1,702 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
|
||||
/// Implementation of <see cref="IGhidriffBridge"/> for Python ghidriff integration.
|
||||
/// </summary>
|
||||
public sealed class GhidriffBridge : IGhidriffBridge
|
||||
{
|
||||
private static readonly JsonSerializerOptions JsonOptions = new()
|
||||
{
|
||||
PropertyNameCaseInsensitive = true,
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
|
||||
};
|
||||
|
||||
private readonly GhidriffOptions _options;
|
||||
private readonly GhidraOptions _ghidraOptions;
|
||||
private readonly ILogger<GhidriffBridge> _logger;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new GhidriffBridge.
|
||||
/// </summary>
|
||||
/// <param name="options">ghidriff options.</param>
|
||||
/// <param name="ghidraOptions">Ghidra options for path configuration.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="timeProvider">Time provider.</param>
|
||||
public GhidriffBridge(
|
||||
IOptions<GhidriffOptions> options,
|
||||
IOptions<GhidraOptions> ghidraOptions,
|
||||
ILogger<GhidriffBridge> logger,
|
||||
TimeProvider timeProvider)
|
||||
{
|
||||
_options = options.Value;
|
||||
_ghidraOptions = ghidraOptions.Value;
|
||||
_logger = logger;
|
||||
_timeProvider = timeProvider;
|
||||
|
||||
EnsureWorkDirectoryExists();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<GhidriffResult> DiffAsync(
|
||||
string oldBinaryPath,
|
||||
string newBinaryPath,
|
||||
GhidriffDiffOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrEmpty(oldBinaryPath);
|
||||
ArgumentException.ThrowIfNullOrEmpty(newBinaryPath);
|
||||
|
||||
if (!File.Exists(oldBinaryPath))
|
||||
{
|
||||
throw new FileNotFoundException("Old binary not found", oldBinaryPath);
|
||||
}
|
||||
|
||||
if (!File.Exists(newBinaryPath))
|
||||
{
|
||||
throw new FileNotFoundException("New binary not found", newBinaryPath);
|
||||
}
|
||||
|
||||
options ??= new GhidriffDiffOptions
|
||||
{
|
||||
IncludeDecompilation = _options.DefaultIncludeDecompilation,
|
||||
IncludeDisassembly = _options.DefaultIncludeDisassembly,
|
||||
TimeoutSeconds = _options.DefaultTimeoutSeconds
|
||||
};
|
||||
|
||||
_logger.LogInformation(
|
||||
"Starting ghidriff comparison: {OldBinary} vs {NewBinary}",
|
||||
Path.GetFileName(oldBinaryPath),
|
||||
Path.GetFileName(newBinaryPath));
|
||||
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
var outputDir = CreateOutputDirectory();
|
||||
|
||||
try
|
||||
{
|
||||
var args = BuildGhidriffArgs(oldBinaryPath, newBinaryPath, outputDir, options);
|
||||
var result = await RunPythonAsync("ghidriff", args, options.TimeoutSeconds, ct);
|
||||
|
||||
if (result.ExitCode != 0)
|
||||
{
|
||||
throw new GhidriffException($"ghidriff failed with exit code {result.ExitCode}")
|
||||
{
|
||||
ExitCode = result.ExitCode,
|
||||
StandardError = result.StandardError,
|
||||
StandardOutput = result.StandardOutput
|
||||
};
|
||||
}
|
||||
|
||||
var ghidriffResult = await ParseOutputAsync(
|
||||
outputDir,
|
||||
oldBinaryPath,
|
||||
newBinaryPath,
|
||||
startTime,
|
||||
ct);
|
||||
|
||||
_logger.LogInformation(
|
||||
"ghidriff completed: {Added} added, {Removed} removed, {Modified} modified functions",
|
||||
ghidriffResult.AddedFunctions.Length,
|
||||
ghidriffResult.RemovedFunctions.Length,
|
||||
ghidriffResult.ModifiedFunctions.Length);
|
||||
|
||||
return ghidriffResult;
|
||||
}
|
||||
finally
|
||||
{
|
||||
CleanupOutputDirectory(outputDir);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<GhidriffResult> DiffAsync(
|
||||
Stream oldBinary,
|
||||
Stream newBinary,
|
||||
GhidriffDiffOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(oldBinary);
|
||||
ArgumentNullException.ThrowIfNull(newBinary);
|
||||
|
||||
var oldPath = await SaveStreamToTempFileAsync(oldBinary, "old", ct);
|
||||
var newPath = await SaveStreamToTempFileAsync(newBinary, "new", ct);
|
||||
|
||||
try
|
||||
{
|
||||
return await DiffAsync(oldPath, newPath, options, ct);
|
||||
}
|
||||
finally
|
||||
{
|
||||
TryDeleteFile(oldPath);
|
||||
TryDeleteFile(newPath);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<string> GenerateReportAsync(
|
||||
GhidriffResult result,
|
||||
GhidriffReportFormat format,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(result);
|
||||
|
||||
return format switch
|
||||
{
|
||||
GhidriffReportFormat.Json => Task.FromResult(GenerateJsonReport(result)),
|
||||
GhidriffReportFormat.Markdown => Task.FromResult(GenerateMarkdownReport(result)),
|
||||
GhidriffReportFormat.Html => Task.FromResult(GenerateHtmlReport(result)),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(format))
|
||||
};
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<bool> IsAvailableAsync(CancellationToken ct = default)
|
||||
{
|
||||
if (!_options.Enabled)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
var result = await RunPythonAsync("ghidriff", ["--version"], timeoutSeconds: 30, ct);
|
||||
return result.ExitCode == 0;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "ghidriff availability check failed");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<string> GetVersionAsync(CancellationToken ct = default)
|
||||
{
|
||||
var result = await RunPythonAsync("ghidriff", ["--version"], timeoutSeconds: 30, ct);
|
||||
|
||||
if (result.ExitCode != 0)
|
||||
{
|
||||
throw new GhidriffException("Failed to get ghidriff version")
|
||||
{
|
||||
ExitCode = result.ExitCode,
|
||||
StandardError = result.StandardError
|
||||
};
|
||||
}
|
||||
|
||||
return result.StandardOutput.Trim();
|
||||
}
|
||||
|
||||
private void EnsureWorkDirectoryExists()
|
||||
{
|
||||
if (!Directory.Exists(_options.WorkDir))
|
||||
{
|
||||
Directory.CreateDirectory(_options.WorkDir);
|
||||
_logger.LogDebug("Created ghidriff work directory: {Path}", _options.WorkDir);
|
||||
}
|
||||
}
|
||||
|
||||
private string CreateOutputDirectory()
|
||||
{
|
||||
var outputDir = Path.Combine(
|
||||
_options.WorkDir,
|
||||
$"diff_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}");
|
||||
|
||||
Directory.CreateDirectory(outputDir);
|
||||
return outputDir;
|
||||
}
|
||||
|
||||
private void CleanupOutputDirectory(string outputDir)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (Directory.Exists(outputDir))
|
||||
{
|
||||
Directory.Delete(outputDir, recursive: true);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to cleanup output directory: {Path}", outputDir);
|
||||
}
|
||||
}
|
||||
|
||||
private string[] BuildGhidriffArgs(
|
||||
string oldPath,
|
||||
string newPath,
|
||||
string outputDir,
|
||||
GhidriffDiffOptions options)
|
||||
{
|
||||
var args = new List<string>
|
||||
{
|
||||
oldPath,
|
||||
newPath,
|
||||
"--output-dir", outputDir,
|
||||
"--output-format", "json"
|
||||
};
|
||||
|
||||
var ghidraPath = options.GhidraPath ?? _ghidraOptions.GhidraHome;
|
||||
if (!string.IsNullOrEmpty(ghidraPath))
|
||||
{
|
||||
args.AddRange(["--ghidra-path", ghidraPath]);
|
||||
}
|
||||
|
||||
if (options.IncludeDecompilation)
|
||||
{
|
||||
args.Add("--include-decompilation");
|
||||
}
|
||||
|
||||
if (!options.IncludeDisassembly)
|
||||
{
|
||||
args.Add("--no-disassembly");
|
||||
}
|
||||
|
||||
foreach (var exclude in options.ExcludeFunctions)
|
||||
{
|
||||
args.AddRange(["--exclude", exclude]);
|
||||
}
|
||||
|
||||
if (options.MaxParallelism > 1)
|
||||
{
|
||||
args.AddRange(["--parallel", options.MaxParallelism.ToString(CultureInfo.InvariantCulture)]);
|
||||
}
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
private async Task<ProcessResult> RunPythonAsync(
|
||||
string module,
|
||||
string[] args,
|
||||
int timeoutSeconds,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var pythonPath = GetPythonPath();
|
||||
var arguments = $"-m {module} {string.Join(" ", args.Select(QuoteArg))}";
|
||||
|
||||
var startInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = pythonPath,
|
||||
Arguments = arguments,
|
||||
RedirectStandardOutput = true,
|
||||
RedirectStandardError = true,
|
||||
UseShellExecute = false,
|
||||
CreateNoWindow = true,
|
||||
StandardOutputEncoding = Encoding.UTF8,
|
||||
StandardErrorEncoding = Encoding.UTF8
|
||||
};
|
||||
|
||||
_logger.LogDebug("Running: {Python} {Args}", pythonPath, arguments);
|
||||
|
||||
using var process = new Process { StartInfo = startInfo };
|
||||
|
||||
var stdoutBuilder = new StringBuilder();
|
||||
var stderrBuilder = new StringBuilder();
|
||||
|
||||
process.OutputDataReceived += (_, e) =>
|
||||
{
|
||||
if (e.Data is not null)
|
||||
{
|
||||
stdoutBuilder.AppendLine(e.Data);
|
||||
}
|
||||
};
|
||||
|
||||
process.ErrorDataReceived += (_, e) =>
|
||||
{
|
||||
if (e.Data is not null)
|
||||
{
|
||||
stderrBuilder.AppendLine(e.Data);
|
||||
}
|
||||
};
|
||||
|
||||
if (!process.Start())
|
||||
{
|
||||
throw new GhidriffException("Failed to start Python process");
|
||||
}
|
||||
|
||||
process.BeginOutputReadLine();
|
||||
process.BeginErrorReadLine();
|
||||
|
||||
using var timeoutCts = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds));
|
||||
using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource(ct, timeoutCts.Token);
|
||||
|
||||
try
|
||||
{
|
||||
await process.WaitForExitAsync(linkedCts.Token);
|
||||
}
|
||||
catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested)
|
||||
{
|
||||
try
|
||||
{
|
||||
process.Kill(entireProcessTree: true);
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Best effort
|
||||
}
|
||||
|
||||
throw new GhidriffException($"ghidriff timed out after {timeoutSeconds} seconds");
|
||||
}
|
||||
|
||||
return new ProcessResult(
|
||||
process.ExitCode,
|
||||
stdoutBuilder.ToString(),
|
||||
stderrBuilder.ToString());
|
||||
}
|
||||
|
||||
private string GetPythonPath()
|
||||
{
|
||||
if (!string.IsNullOrEmpty(_options.PythonPath))
|
||||
{
|
||||
return _options.PythonPath;
|
||||
}
|
||||
|
||||
// Try to find Python
|
||||
return RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "python" : "python3";
|
||||
}
|
||||
|
||||
private async Task<GhidriffResult> ParseOutputAsync(
|
||||
string outputDir,
|
||||
string oldBinaryPath,
|
||||
string newBinaryPath,
|
||||
DateTimeOffset startTime,
|
||||
CancellationToken ct)
|
||||
{
|
||||
var jsonPath = Path.Combine(outputDir, "diff.json");
|
||||
|
||||
if (!File.Exists(jsonPath))
|
||||
{
|
||||
// Try alternate paths
|
||||
var jsonFiles = Directory.GetFiles(outputDir, "*.json", SearchOption.AllDirectories);
|
||||
if (jsonFiles.Length > 0)
|
||||
{
|
||||
jsonPath = jsonFiles[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
_logger.LogWarning("No JSON output found in {OutputDir}", outputDir);
|
||||
return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime);
|
||||
}
|
||||
}
|
||||
|
||||
var json = await File.ReadAllTextAsync(jsonPath, ct);
|
||||
|
||||
// Calculate hashes
|
||||
var oldHash = await ComputeFileHashAsync(oldBinaryPath, ct);
|
||||
var newHash = await ComputeFileHashAsync(newBinaryPath, ct);
|
||||
|
||||
return ParseJsonResult(json, oldHash, newHash, oldBinaryPath, newBinaryPath, startTime);
|
||||
}
|
||||
|
||||
private GhidriffResult ParseJsonResult(
|
||||
string json,
|
||||
string oldHash,
|
||||
string newHash,
|
||||
string oldBinaryPath,
|
||||
string newBinaryPath,
|
||||
DateTimeOffset startTime)
|
||||
{
|
||||
try
|
||||
{
|
||||
var data = JsonSerializer.Deserialize<GhidriffJsonOutput>(json, JsonOptions);
|
||||
|
||||
if (data is null)
|
||||
{
|
||||
return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime, json);
|
||||
}
|
||||
|
||||
var added = data.AddedFunctions?.Select(f => new GhidriffFunction(
|
||||
f.Name ?? "unknown",
|
||||
ParseAddress(f.Address),
|
||||
f.Size,
|
||||
f.Signature,
|
||||
f.DecompiledCode
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var removed = data.RemovedFunctions?.Select(f => new GhidriffFunction(
|
||||
f.Name ?? "unknown",
|
||||
ParseAddress(f.Address),
|
||||
f.Size,
|
||||
f.Signature,
|
||||
f.DecompiledCode
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var modified = data.ModifiedFunctions?.Select(f => new GhidriffDiff(
|
||||
f.Name ?? "unknown",
|
||||
ParseAddress(f.OldAddress),
|
||||
ParseAddress(f.NewAddress),
|
||||
f.OldSize,
|
||||
f.NewSize,
|
||||
f.OldSignature,
|
||||
f.NewSignature,
|
||||
f.Similarity,
|
||||
f.OldDecompiledCode,
|
||||
f.NewDecompiledCode,
|
||||
f.InstructionChanges?.ToImmutableArray() ?? []
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var duration = _timeProvider.GetUtcNow() - startTime;
|
||||
|
||||
var stats = new GhidriffStats(
|
||||
data.Statistics?.TotalOldFunctions ?? 0,
|
||||
data.Statistics?.TotalNewFunctions ?? 0,
|
||||
added.Length,
|
||||
removed.Length,
|
||||
modified.Length,
|
||||
data.Statistics?.UnchangedCount ?? 0,
|
||||
duration);
|
||||
|
||||
return new GhidriffResult(
|
||||
oldHash,
|
||||
newHash,
|
||||
Path.GetFileName(oldBinaryPath),
|
||||
Path.GetFileName(newBinaryPath),
|
||||
added,
|
||||
removed,
|
||||
modified,
|
||||
stats,
|
||||
json);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to parse ghidriff JSON output");
|
||||
return CreateEmptyResult(oldBinaryPath, newBinaryPath, startTime, json);
|
||||
}
|
||||
}
|
||||
|
||||
private GhidriffResult CreateEmptyResult(
|
||||
string oldBinaryPath,
|
||||
string newBinaryPath,
|
||||
DateTimeOffset startTime,
|
||||
string rawJson = "")
|
||||
{
|
||||
var duration = _timeProvider.GetUtcNow() - startTime;
|
||||
|
||||
return new GhidriffResult(
|
||||
"",
|
||||
"",
|
||||
Path.GetFileName(oldBinaryPath),
|
||||
Path.GetFileName(newBinaryPath),
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
new GhidriffStats(0, 0, 0, 0, 0, 0, duration),
|
||||
rawJson);
|
||||
}
|
||||
|
||||
private static ulong ParseAddress(string? address)
|
||||
{
|
||||
if (string.IsNullOrEmpty(address))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
address = address[2..];
|
||||
}
|
||||
|
||||
return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
|
||||
? result
|
||||
: 0;
|
||||
}
|
||||
|
||||
private static async Task<string> ComputeFileHashAsync(string path, CancellationToken ct)
|
||||
{
|
||||
await using var stream = File.OpenRead(path);
|
||||
var hash = await SHA256.HashDataAsync(stream, ct);
|
||||
return Convert.ToHexStringLower(hash);
|
||||
}
|
||||
|
||||
private async Task<string> SaveStreamToTempFileAsync(Stream stream, string prefix, CancellationToken ct)
|
||||
{
|
||||
var path = Path.Combine(
|
||||
_options.WorkDir,
|
||||
$"{prefix}_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
|
||||
|
||||
Directory.CreateDirectory(Path.GetDirectoryName(path)!);
|
||||
|
||||
await using var fileStream = File.Create(path);
|
||||
await stream.CopyToAsync(fileStream, ct);
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
private void TryDeleteFile(string path)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (File.Exists(path))
|
||||
{
|
||||
File.Delete(path);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
|
||||
}
|
||||
}
|
||||
|
||||
private static string QuoteArg(string arg)
|
||||
{
|
||||
if (arg.Contains(' ', StringComparison.Ordinal) || arg.Contains('"', StringComparison.Ordinal))
|
||||
{
|
||||
return $"\"{arg.Replace("\"", "\\\"")}\"";
|
||||
}
|
||||
|
||||
return arg;
|
||||
}
|
||||
|
||||
private static string GenerateJsonReport(GhidriffResult result)
|
||||
{
|
||||
return JsonSerializer.Serialize(result, new JsonSerializerOptions
|
||||
{
|
||||
WriteIndented = true,
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
|
||||
});
|
||||
}
|
||||
|
||||
private static string GenerateMarkdownReport(GhidriffResult result)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
|
||||
sb.AppendLine($"# Binary Diff Report");
|
||||
sb.AppendLine();
|
||||
sb.AppendLine($"**Old Binary:** {result.OldBinaryName} (`{result.OldBinaryHash}`)");
|
||||
sb.AppendLine($"**New Binary:** {result.NewBinaryName} (`{result.NewBinaryHash}`)");
|
||||
sb.AppendLine();
|
||||
sb.AppendLine($"## Summary");
|
||||
sb.AppendLine();
|
||||
sb.AppendLine($"| Metric | Count |");
|
||||
sb.AppendLine($"|--------|-------|");
|
||||
sb.AppendLine($"| Functions Added | {result.Statistics.AddedCount} |");
|
||||
sb.AppendLine($"| Functions Removed | {result.Statistics.RemovedCount} |");
|
||||
sb.AppendLine($"| Functions Modified | {result.Statistics.ModifiedCount} |");
|
||||
sb.AppendLine($"| Functions Unchanged | {result.Statistics.UnchangedCount} |");
|
||||
sb.AppendLine();
|
||||
|
||||
if (result.AddedFunctions.Length > 0)
|
||||
{
|
||||
sb.AppendLine($"## Added Functions");
|
||||
sb.AppendLine();
|
||||
foreach (var func in result.AddedFunctions)
|
||||
{
|
||||
sb.AppendLine($"- `{func.Name}` at 0x{func.Address:X}");
|
||||
}
|
||||
sb.AppendLine();
|
||||
}
|
||||
|
||||
if (result.RemovedFunctions.Length > 0)
|
||||
{
|
||||
sb.AppendLine($"## Removed Functions");
|
||||
sb.AppendLine();
|
||||
foreach (var func in result.RemovedFunctions)
|
||||
{
|
||||
sb.AppendLine($"- `{func.Name}` at 0x{func.Address:X}");
|
||||
}
|
||||
sb.AppendLine();
|
||||
}
|
||||
|
||||
if (result.ModifiedFunctions.Length > 0)
|
||||
{
|
||||
sb.AppendLine($"## Modified Functions");
|
||||
sb.AppendLine();
|
||||
foreach (var func in result.ModifiedFunctions)
|
||||
{
|
||||
sb.AppendLine($"### {func.FunctionName}");
|
||||
sb.AppendLine($"- Similarity: {func.Similarity:P1}");
|
||||
sb.AppendLine($"- Old: 0x{func.OldAddress:X} ({func.OldSize} bytes)");
|
||||
sb.AppendLine($"- New: 0x{func.NewAddress:X} ({func.NewSize} bytes)");
|
||||
sb.AppendLine();
|
||||
}
|
||||
}
|
||||
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
private static string GenerateHtmlReport(GhidriffResult result)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
|
||||
sb.AppendLine("<!DOCTYPE html>");
|
||||
sb.AppendLine("<html><head><title>Binary Diff Report</title>");
|
||||
sb.AppendLine("<style>");
|
||||
sb.AppendLine("body { font-family: sans-serif; margin: 20px; }");
|
||||
sb.AppendLine("table { border-collapse: collapse; }");
|
||||
sb.AppendLine("th, td { border: 1px solid #ccc; padding: 8px; }");
|
||||
sb.AppendLine(".added { background: #d4ffd4; }");
|
||||
sb.AppendLine(".removed { background: #ffd4d4; }");
|
||||
sb.AppendLine(".modified { background: #ffffd4; }");
|
||||
sb.AppendLine("</style>");
|
||||
sb.AppendLine("</head><body>");
|
||||
sb.AppendLine($"<h1>Binary Diff Report</h1>");
|
||||
sb.AppendLine($"<p><strong>Old:</strong> {result.OldBinaryName}</p>");
|
||||
sb.AppendLine($"<p><strong>New:</strong> {result.NewBinaryName}</p>");
|
||||
sb.AppendLine($"<table>");
|
||||
sb.AppendLine($"<tr><th>Metric</th><th>Count</th></tr>");
|
||||
sb.AppendLine($"<tr class='added'><td>Added</td><td>{result.Statistics.AddedCount}</td></tr>");
|
||||
sb.AppendLine($"<tr class='removed'><td>Removed</td><td>{result.Statistics.RemovedCount}</td></tr>");
|
||||
sb.AppendLine($"<tr class='modified'><td>Modified</td><td>{result.Statistics.ModifiedCount}</td></tr>");
|
||||
sb.AppendLine($"<tr><td>Unchanged</td><td>{result.Statistics.UnchangedCount}</td></tr>");
|
||||
sb.AppendLine("</table>");
|
||||
sb.AppendLine("</body></html>");
|
||||
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
// JSON DTOs
|
||||
private sealed record ProcessResult(int ExitCode, string StandardOutput, string StandardError);
|
||||
|
||||
private sealed record GhidriffJsonOutput
|
||||
{
|
||||
public List<GhidriffFunctionJson>? AddedFunctions { get; init; }
|
||||
public List<GhidriffFunctionJson>? RemovedFunctions { get; init; }
|
||||
public List<GhidriffDiffJson>? ModifiedFunctions { get; init; }
|
||||
public GhidriffStatsJson? Statistics { get; init; }
|
||||
}
|
||||
|
||||
private sealed record GhidriffFunctionJson
|
||||
{
|
||||
public string? Name { get; init; }
|
||||
public string? Address { get; init; }
|
||||
public int Size { get; init; }
|
||||
public string? Signature { get; init; }
|
||||
public string? DecompiledCode { get; init; }
|
||||
}
|
||||
|
||||
private sealed record GhidriffDiffJson
|
||||
{
|
||||
public string? Name { get; init; }
|
||||
public string? OldAddress { get; init; }
|
||||
public string? NewAddress { get; init; }
|
||||
public int OldSize { get; init; }
|
||||
public int NewSize { get; init; }
|
||||
public string? OldSignature { get; init; }
|
||||
public string? NewSignature { get; init; }
|
||||
public decimal Similarity { get; init; }
|
||||
public string? OldDecompiledCode { get; init; }
|
||||
public string? NewDecompiledCode { get; init; }
|
||||
public List<string>? InstructionChanges { get; init; }
|
||||
}
|
||||
|
||||
private sealed record GhidriffStatsJson
|
||||
{
|
||||
public int TotalOldFunctions { get; init; }
|
||||
public int TotalNewFunctions { get; init; }
|
||||
public int AddedCount { get; init; }
|
||||
public int RemovedCount { get; init; }
|
||||
public int ModifiedCount { get; init; }
|
||||
public int UnchangedCount { get; init; }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,432 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text.Json;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Ghidra;
|
||||
|
||||
/// <summary>
|
||||
/// Implementation of <see cref="IVersionTrackingService"/> using Ghidra Version Tracking.
|
||||
/// </summary>
|
||||
public sealed class VersionTrackingService : IVersionTrackingService
|
||||
{
|
||||
private static readonly JsonSerializerOptions JsonOptions = new()
|
||||
{
|
||||
PropertyNameCaseInsensitive = true,
|
||||
PropertyNamingPolicy = JsonNamingPolicy.CamelCase
|
||||
};
|
||||
|
||||
private readonly GhidraHeadlessManager _headlessManager;
|
||||
private readonly GhidraOptions _options;
|
||||
private readonly ILogger<VersionTrackingService> _logger;
|
||||
private readonly TimeProvider _timeProvider;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new VersionTrackingService.
|
||||
/// </summary>
|
||||
/// <param name="headlessManager">The Ghidra Headless manager.</param>
|
||||
/// <param name="options">Ghidra options.</param>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
/// <param name="timeProvider">Time provider.</param>
|
||||
public VersionTrackingService(
|
||||
GhidraHeadlessManager headlessManager,
|
||||
IOptions<GhidraOptions> options,
|
||||
ILogger<VersionTrackingService> logger,
|
||||
TimeProvider timeProvider)
|
||||
{
|
||||
_headlessManager = headlessManager;
|
||||
_options = options.Value;
|
||||
_logger = logger;
|
||||
_timeProvider = timeProvider;
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<VersionTrackingResult> TrackVersionsAsync(
|
||||
Stream oldBinary,
|
||||
Stream newBinary,
|
||||
VersionTrackingOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(oldBinary);
|
||||
ArgumentNullException.ThrowIfNull(newBinary);
|
||||
|
||||
var oldPath = await SaveStreamToTempFileAsync(oldBinary, "old", ct);
|
||||
var newPath = await SaveStreamToTempFileAsync(newBinary, "new", ct);
|
||||
|
||||
try
|
||||
{
|
||||
return await TrackVersionsAsync(oldPath, newPath, options, ct);
|
||||
}
|
||||
finally
|
||||
{
|
||||
TryDeleteFile(oldPath);
|
||||
TryDeleteFile(newPath);
|
||||
}
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<VersionTrackingResult> TrackVersionsAsync(
|
||||
string oldBinaryPath,
|
||||
string newBinaryPath,
|
||||
VersionTrackingOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentException.ThrowIfNullOrEmpty(oldBinaryPath);
|
||||
ArgumentException.ThrowIfNullOrEmpty(newBinaryPath);
|
||||
|
||||
if (!File.Exists(oldBinaryPath))
|
||||
{
|
||||
throw new FileNotFoundException("Old binary not found", oldBinaryPath);
|
||||
}
|
||||
|
||||
if (!File.Exists(newBinaryPath))
|
||||
{
|
||||
throw new FileNotFoundException("New binary not found", newBinaryPath);
|
||||
}
|
||||
|
||||
options ??= new VersionTrackingOptions();
|
||||
|
||||
_logger.LogInformation(
|
||||
"Starting Version Tracking: {OldBinary} vs {NewBinary}",
|
||||
Path.GetFileName(oldBinaryPath),
|
||||
Path.GetFileName(newBinaryPath));
|
||||
|
||||
var startTime = _timeProvider.GetUtcNow();
|
||||
|
||||
// Build script arguments for Version Tracking
|
||||
var scriptArgs = BuildVersionTrackingArgs(oldBinaryPath, newBinaryPath, options);
|
||||
|
||||
// Run Ghidra with Version Tracking script
|
||||
// Note: This assumes a custom VersionTracking.java script that outputs JSON
|
||||
var result = await _headlessManager.RunAnalysisAsync(
|
||||
oldBinaryPath,
|
||||
scriptName: "VersionTracking.java",
|
||||
scriptArgs: scriptArgs,
|
||||
runAnalysis: true,
|
||||
timeoutSeconds: options.TimeoutSeconds,
|
||||
ct);
|
||||
|
||||
if (!result.IsSuccess)
|
||||
{
|
||||
throw new GhidraException($"Version Tracking failed: {result.StandardError}")
|
||||
{
|
||||
ExitCode = result.ExitCode,
|
||||
StandardError = result.StandardError,
|
||||
StandardOutput = result.StandardOutput
|
||||
};
|
||||
}
|
||||
|
||||
var trackingResult = ParseVersionTrackingOutput(
|
||||
result.StandardOutput,
|
||||
startTime,
|
||||
result.Duration);
|
||||
|
||||
_logger.LogInformation(
|
||||
"Version Tracking completed: {Matched} matched, {Added} added, {Removed} removed, {Modified} modified",
|
||||
trackingResult.Matches.Length,
|
||||
trackingResult.AddedFunctions.Length,
|
||||
trackingResult.RemovedFunctions.Length,
|
||||
trackingResult.ModifiedFunctions.Length);
|
||||
|
||||
return trackingResult;
|
||||
}
|
||||
|
||||
private static string[] BuildVersionTrackingArgs(
|
||||
string oldBinaryPath,
|
||||
string newBinaryPath,
|
||||
VersionTrackingOptions options)
|
||||
{
|
||||
var args = new List<string>
|
||||
{
|
||||
"-newBinary", newBinaryPath,
|
||||
"-minSimilarity", options.MinSimilarity.ToString("F2", CultureInfo.InvariantCulture)
|
||||
};
|
||||
|
||||
// Add correlator flags
|
||||
foreach (var correlator in options.Correlators)
|
||||
{
|
||||
args.Add($"-correlator:{GetCorrelatorName(correlator)}");
|
||||
}
|
||||
|
||||
if (options.IncludeDecompilation)
|
||||
{
|
||||
args.Add("-decompile");
|
||||
}
|
||||
|
||||
if (options.ComputeDetailedDiffs)
|
||||
{
|
||||
args.Add("-detailedDiffs");
|
||||
}
|
||||
|
||||
return [.. args];
|
||||
}
|
||||
|
||||
private static string GetCorrelatorName(CorrelatorType correlator)
|
||||
{
|
||||
return correlator switch
|
||||
{
|
||||
CorrelatorType.ExactBytes => "ExactBytesFunctionHasher",
|
||||
CorrelatorType.ExactMnemonics => "ExactMnemonicsFunctionHasher",
|
||||
CorrelatorType.SymbolName => "SymbolNameMatch",
|
||||
CorrelatorType.DataReference => "DataReferenceCorrelator",
|
||||
CorrelatorType.CallReference => "CallReferenceCorrelator",
|
||||
CorrelatorType.CombinedReference => "CombinedReferenceCorrelator",
|
||||
CorrelatorType.BSim => "BSimCorrelator",
|
||||
_ => "CombinedReferenceCorrelator"
|
||||
};
|
||||
}
|
||||
|
||||
private VersionTrackingResult ParseVersionTrackingOutput(
|
||||
string output,
|
||||
DateTimeOffset startTime,
|
||||
TimeSpan duration)
|
||||
{
|
||||
// Look for JSON output marker
|
||||
const string jsonMarker = "###VERSION_TRACKING_JSON###";
|
||||
var jsonStart = output.IndexOf(jsonMarker, StringComparison.Ordinal);
|
||||
|
||||
if (jsonStart >= 0)
|
||||
{
|
||||
var jsonContent = output[(jsonStart + jsonMarker.Length)..].Trim();
|
||||
var jsonEnd = jsonContent.IndexOf("###END_VERSION_TRACKING_JSON###", StringComparison.Ordinal);
|
||||
if (jsonEnd >= 0)
|
||||
{
|
||||
jsonContent = jsonContent[..jsonEnd].Trim();
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
return ParseJsonOutput(jsonContent, duration);
|
||||
}
|
||||
catch (JsonException ex)
|
||||
{
|
||||
_logger.LogWarning(ex, "Failed to parse Version Tracking JSON output");
|
||||
}
|
||||
}
|
||||
|
||||
// Return empty result if parsing fails
|
||||
_logger.LogWarning("No structured Version Tracking output found");
|
||||
return CreateEmptyResult(duration);
|
||||
}
|
||||
|
||||
private static VersionTrackingResult ParseJsonOutput(string json, TimeSpan duration)
|
||||
{
|
||||
var data = JsonSerializer.Deserialize<VersionTrackingJsonOutput>(json, JsonOptions)
|
||||
?? throw new GhidraException("Failed to deserialize Version Tracking JSON output");
|
||||
|
||||
var matches = data.Matches?.Select(m => new FunctionMatch(
|
||||
m.OldName ?? "unknown",
|
||||
ParseAddress(m.OldAddress),
|
||||
m.NewName ?? "unknown",
|
||||
ParseAddress(m.NewAddress),
|
||||
m.Similarity,
|
||||
ParseCorrelatorType(m.MatchedBy),
|
||||
m.Differences?.Select(d => new MatchDifference(
|
||||
ParseDifferenceType(d.Type),
|
||||
d.Description ?? "",
|
||||
d.OldValue,
|
||||
d.NewValue,
|
||||
d.Address is not null ? ParseAddress(d.Address) : null
|
||||
)).ToImmutableArray() ?? []
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var added = data.AddedFunctions?.Select(f => new FunctionAdded(
|
||||
f.Name ?? "unknown",
|
||||
ParseAddress(f.Address),
|
||||
f.Size,
|
||||
f.Signature
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var removed = data.RemovedFunctions?.Select(f => new FunctionRemoved(
|
||||
f.Name ?? "unknown",
|
||||
ParseAddress(f.Address),
|
||||
f.Size,
|
||||
f.Signature
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var modified = data.ModifiedFunctions?.Select(f => new FunctionModified(
|
||||
f.OldName ?? "unknown",
|
||||
ParseAddress(f.OldAddress),
|
||||
f.OldSize,
|
||||
f.NewName ?? "unknown",
|
||||
ParseAddress(f.NewAddress),
|
||||
f.NewSize,
|
||||
f.Similarity,
|
||||
f.Differences?.Select(d => new MatchDifference(
|
||||
ParseDifferenceType(d.Type),
|
||||
d.Description ?? "",
|
||||
d.OldValue,
|
||||
d.NewValue,
|
||||
d.Address is not null ? ParseAddress(d.Address) : null
|
||||
)).ToImmutableArray() ?? [],
|
||||
f.OldDecompiled,
|
||||
f.NewDecompiled
|
||||
)).ToImmutableArray() ?? [];
|
||||
|
||||
var stats = new VersionTrackingStats(
|
||||
data.Statistics?.TotalOldFunctions ?? 0,
|
||||
data.Statistics?.TotalNewFunctions ?? 0,
|
||||
matches.Length,
|
||||
added.Length,
|
||||
removed.Length,
|
||||
modified.Length,
|
||||
duration);
|
||||
|
||||
return new VersionTrackingResult(matches, added, removed, modified, stats);
|
||||
}
|
||||
|
||||
private static VersionTrackingResult CreateEmptyResult(TimeSpan duration)
|
||||
{
|
||||
return new VersionTrackingResult(
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
new VersionTrackingStats(0, 0, 0, 0, 0, 0, duration));
|
||||
}
|
||||
|
||||
private static ulong ParseAddress(string? address)
|
||||
{
|
||||
if (string.IsNullOrEmpty(address))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (address.StartsWith("0x", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
address = address[2..];
|
||||
}
|
||||
|
||||
return ulong.TryParse(address, NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var result)
|
||||
? result
|
||||
: 0;
|
||||
}
|
||||
|
||||
private static CorrelatorType ParseCorrelatorType(string? correlator)
|
||||
{
|
||||
return correlator?.ToUpperInvariant() switch
|
||||
{
|
||||
"EXACTBYTES" or "EXACTBYTESFUNCTIONHASHER" => CorrelatorType.ExactBytes,
|
||||
"EXACTMNEMONICS" or "EXACTMNEMONICSFUNCTIONHASHER" => CorrelatorType.ExactMnemonics,
|
||||
"SYMBOLNAME" or "SYMBOLNAMEMATCH" => CorrelatorType.SymbolName,
|
||||
"DATAREFERENCE" or "DATAREFERENCECORRELATOR" => CorrelatorType.DataReference,
|
||||
"CALLREFERENCE" or "CALLREFERENCECORRELATOR" => CorrelatorType.CallReference,
|
||||
"COMBINEDREFERENCE" or "COMBINEDREFERENCECORRELATOR" => CorrelatorType.CombinedReference,
|
||||
"BSIM" or "BSIMCORRELATOR" => CorrelatorType.BSim,
|
||||
_ => CorrelatorType.CombinedReference
|
||||
};
|
||||
}
|
||||
|
||||
private static DifferenceType ParseDifferenceType(string? type)
|
||||
{
|
||||
return type?.ToUpperInvariant() switch
|
||||
{
|
||||
"INSTRUCTIONADDED" => DifferenceType.InstructionAdded,
|
||||
"INSTRUCTIONREMOVED" => DifferenceType.InstructionRemoved,
|
||||
"INSTRUCTIONCHANGED" => DifferenceType.InstructionChanged,
|
||||
"BRANCHTARGETCHANGED" => DifferenceType.BranchTargetChanged,
|
||||
"CALLTARGETCHANGED" => DifferenceType.CallTargetChanged,
|
||||
"CONSTANTCHANGED" => DifferenceType.ConstantChanged,
|
||||
"SIZECHANGED" => DifferenceType.SizeChanged,
|
||||
"STACKFRAMECHANGED" => DifferenceType.StackFrameChanged,
|
||||
"REGISTERUSAGECHANGED" => DifferenceType.RegisterUsageChanged,
|
||||
_ => DifferenceType.InstructionChanged
|
||||
};
|
||||
}
|
||||
|
||||
private async Task<string> SaveStreamToTempFileAsync(Stream stream, string prefix, CancellationToken ct)
|
||||
{
|
||||
var path = Path.Combine(
|
||||
_options.WorkDir,
|
||||
$"{prefix}_{_timeProvider.GetUtcNow():yyyyMMddHHmmssfff}_{Guid.NewGuid():N}.bin");
|
||||
|
||||
Directory.CreateDirectory(Path.GetDirectoryName(path)!);
|
||||
|
||||
await using var fileStream = File.Create(path);
|
||||
await stream.CopyToAsync(fileStream, ct);
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
private void TryDeleteFile(string path)
|
||||
{
|
||||
try
|
||||
{
|
||||
if (File.Exists(path))
|
||||
{
|
||||
File.Delete(path);
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogDebug(ex, "Failed to delete temp file: {Path}", path);
|
||||
}
|
||||
}
|
||||
|
||||
// JSON DTOs for deserialization
|
||||
private sealed record VersionTrackingJsonOutput
|
||||
{
|
||||
public List<FunctionMatchJson>? Matches { get; init; }
|
||||
public List<FunctionInfoJson>? AddedFunctions { get; init; }
|
||||
public List<FunctionInfoJson>? RemovedFunctions { get; init; }
|
||||
public List<FunctionModifiedJson>? ModifiedFunctions { get; init; }
|
||||
public VersionTrackingStatsJson? Statistics { get; init; }
|
||||
}
|
||||
|
||||
private sealed record FunctionMatchJson
|
||||
{
|
||||
public string? OldName { get; init; }
|
||||
public string? OldAddress { get; init; }
|
||||
public string? NewName { get; init; }
|
||||
public string? NewAddress { get; init; }
|
||||
public decimal Similarity { get; init; }
|
||||
public string? MatchedBy { get; init; }
|
||||
public List<DifferenceJson>? Differences { get; init; }
|
||||
}
|
||||
|
||||
private sealed record FunctionInfoJson
|
||||
{
|
||||
public string? Name { get; init; }
|
||||
public string? Address { get; init; }
|
||||
public int Size { get; init; }
|
||||
public string? Signature { get; init; }
|
||||
}
|
||||
|
||||
private sealed record FunctionModifiedJson
|
||||
{
|
||||
public string? OldName { get; init; }
|
||||
public string? OldAddress { get; init; }
|
||||
public int OldSize { get; init; }
|
||||
public string? NewName { get; init; }
|
||||
public string? NewAddress { get; init; }
|
||||
public int NewSize { get; init; }
|
||||
public decimal Similarity { get; init; }
|
||||
public List<DifferenceJson>? Differences { get; init; }
|
||||
public string? OldDecompiled { get; init; }
|
||||
public string? NewDecompiled { get; init; }
|
||||
}
|
||||
|
||||
private sealed record DifferenceJson
|
||||
{
|
||||
public string? Type { get; init; }
|
||||
public string? Description { get; init; }
|
||||
public string? OldValue { get; init; }
|
||||
public string? NewValue { get; init; }
|
||||
public string? Address { get; init; }
|
||||
}
|
||||
|
||||
private sealed record VersionTrackingStatsJson
|
||||
{
|
||||
public int TotalOldFunctions { get; init; }
|
||||
public int TotalNewFunctions { get; init; }
|
||||
public int MatchedCount { get; init; }
|
||||
public int AddedCount { get; init; }
|
||||
public int RemovedCount { get; init; }
|
||||
public int ModifiedCount { get; init; }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<Description>Ghidra integration for StellaOps BinaryIndex. Provides Version Tracking, BSim, and ghidriff capabilities as a fallback disassembly backend.</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Contracts\StellaOps.BinaryIndex.Contracts.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options.ConfigurationExtensions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options.DataAnnotations" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
@@ -0,0 +1,269 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML;
|
||||
|
||||
/// <summary>
/// Tokenizer for binary/decompiled code using a word-level scheme over a fixed vocabulary.
/// Produces CodeBERT-style sequences: [CLS] tokens... [SEP] followed by padding.
/// </summary>
public sealed partial class BinaryCodeTokenizer : ITokenizer
{
    private readonly ImmutableDictionary<string, long> _vocabulary;

    // The id -> token map is immutable, so it is built once here instead of being
    // rebuilt on every Decode call (previously an O(V) allocation per decode).
    private readonly ImmutableDictionary<long, string> _reverseVocabulary;

    private readonly long _padToken;
    private readonly long _unkToken;
    private readonly long _clsToken;
    private readonly long _sepToken;

    // Special token IDs (matching CodeBERT conventions)
    private const long DefaultPadToken = 0;
    private const long DefaultUnkToken = 1;
    private const long DefaultClsToken = 2;
    private const long DefaultSepToken = 3;

    /// <summary>
    /// Creates a tokenizer. When <paramref name="vocabularyPath"/> names an existing file
    /// (one token per line; the line index is the token id), that vocabulary is loaded;
    /// otherwise a small built-in vocabulary suitable for testing is used.
    /// </summary>
    /// <param name="vocabularyPath">Optional path to a vocabulary file.</param>
    public BinaryCodeTokenizer(string? vocabularyPath = null)
    {
        if (!string.IsNullOrEmpty(vocabularyPath) && File.Exists(vocabularyPath))
        {
            _vocabulary = LoadVocabulary(vocabularyPath);

            // Fall back to the conventional ids if the file does not define the special tokens.
            _padToken = _vocabulary.GetValueOrDefault("<pad>", DefaultPadToken);
            _unkToken = _vocabulary.GetValueOrDefault("<unk>", DefaultUnkToken);
            _clsToken = _vocabulary.GetValueOrDefault("<cls>", DefaultClsToken);
            _sepToken = _vocabulary.GetValueOrDefault("<sep>", DefaultSepToken);
        }
        else
        {
            // Use default vocabulary for testing
            _vocabulary = CreateDefaultVocabulary();
            _padToken = DefaultPadToken;
            _unkToken = DefaultUnkToken;
            _clsToken = DefaultClsToken;
            _sepToken = DefaultSepToken;
        }

        _reverseVocabulary = _vocabulary.ToImmutableDictionary(kv => kv.Value, kv => kv.Key);
    }

    /// <inheritdoc />
    public long[] Tokenize(string text, int maxLength = 512)
    {
        var (inputIds, _) = TokenizeWithMask(text, maxLength);
        return inputIds;
    }

    /// <inheritdoc />
    public (long[] InputIds, long[] AttentionMask) TokenizeWithMask(string text, int maxLength = 512)
    {
        ArgumentException.ThrowIfNullOrEmpty(text);

        var tokens = TokenizeText(text);
        var inputIds = new long[maxLength];
        var attentionMask = new long[maxLength];

        // Add [CLS] token
        inputIds[0] = _clsToken;
        attentionMask[0] = 1;

        var position = 1;
        foreach (var token in tokens)
        {
            // Reserve the final slot for [SEP]; extra tokens are truncated.
            if (position >= maxLength - 1)
            {
                break;
            }

            // Vocabulary keys are lowercase, so lookups are case-insensitive.
            inputIds[position] = _vocabulary.GetValueOrDefault(token.ToLowerInvariant(), _unkToken);
            attentionMask[position] = 1;
            position++;
        }

        // Add [SEP] token
        if (position < maxLength)
        {
            inputIds[position] = _sepToken;
            attentionMask[position] = 1;
            position++;
        }

        // Pad remaining positions
        for (var i = position; i < maxLength; i++)
        {
            inputIds[i] = _padToken;
            attentionMask[i] = 0;
        }

        return (inputIds, attentionMask);
    }

    /// <inheritdoc />
    public string Decode(long[] tokenIds)
    {
        ArgumentNullException.ThrowIfNull(tokenIds);

        var tokens = new List<string>();

        foreach (var id in tokenIds)
        {
            // Structural tokens carry no content; drop them from the decoded text.
            if (id == _padToken || id == _clsToken || id == _sepToken)
            {
                continue;
            }

            tokens.Add(_reverseVocabulary.GetValueOrDefault(id, "<unk>"));
        }

        return string.Join(" ", tokens);
    }

    /// <summary>
    /// Splits text into identifier, numeric-literal, string-literal, operator,
    /// and punctuation tokens after collapsing whitespace.
    /// </summary>
    private IEnumerable<string> TokenizeText(string text)
    {
        // Normalize whitespace
        text = WhitespaceRegex().Replace(text, " ");

        // Split on operators and punctuation, keeping them as tokens
        var tokens = new List<string>();
        var matches = TokenRegex().Matches(text);

        foreach (Match match in matches)
        {
            var token = match.Value.Trim();
            if (!string.IsNullOrEmpty(token))
            {
                tokens.Add(token);
            }
        }

        return tokens;
    }

    /// <summary>
    /// Loads a vocabulary file: one token per line, the (zero-based) line index is the id.
    /// Blank lines are skipped but still consume their line index.
    /// </summary>
    private static ImmutableDictionary<string, long> LoadVocabulary(string path)
    {
        var vocabulary = new Dictionary<string, long>();
        var lines = File.ReadAllLines(path);

        for (var i = 0; i < lines.Length; i++)
        {
            var token = lines[i].Trim();
            if (!string.IsNullOrEmpty(token))
            {
                vocabulary[token] = i;
            }
        }

        return vocabulary.ToImmutableDictionary();
    }

    /// <summary>
    /// Builds the built-in fallback vocabulary: special tokens, C keywords,
    /// operators, punctuation, common Ghidra types, and common libc functions.
    /// </summary>
    private static ImmutableDictionary<string, long> CreateDefaultVocabulary()
    {
        // Basic vocabulary for testing without model
        var vocab = new Dictionary<string, long>
        {
            // Special tokens
            ["<pad>"] = 0,
            ["<unk>"] = 1,
            ["<cls>"] = 2,
            ["<sep>"] = 3,

            // Keywords
            ["void"] = 10,
            ["int"] = 11,
            ["char"] = 12,
            ["short"] = 13,
            ["long"] = 14,
            ["float"] = 15,
            ["double"] = 16,
            ["unsigned"] = 17,
            ["signed"] = 18,
            ["const"] = 19,
            ["static"] = 20,
            ["extern"] = 21,
            ["return"] = 22,
            ["if"] = 23,
            ["else"] = 24,
            ["while"] = 25,
            ["for"] = 26,
            ["do"] = 27,
            ["switch"] = 28,
            ["case"] = 29,
            ["default"] = 30,
            ["break"] = 31,
            ["continue"] = 32,
            ["goto"] = 33,
            ["sizeof"] = 34,
            ["struct"] = 35,
            ["union"] = 36,
            ["enum"] = 37,
            ["typedef"] = 38,

            // Operators
            ["+"] = 50,
            ["-"] = 51,
            ["*"] = 52,
            ["/"] = 53,
            ["%"] = 54,
            ["="] = 55,
            ["=="] = 56,
            ["!="] = 57,
            ["<"] = 58,
            [">"] = 59,
            ["<="] = 60,
            [">="] = 61,
            ["&&"] = 62,
            ["||"] = 63,
            ["!"] = 64,
            ["&"] = 65,
            ["|"] = 66,
            ["^"] = 67,
            ["~"] = 68,
            ["<<"] = 69,
            [">>"] = 70,
            ["++"] = 71,
            ["--"] = 72,
            ["->"] = 73,
            ["."] = 74,

            // Punctuation
            ["("] = 80,
            [")"] = 81,
            ["{"] = 82,
            ["}"] = 83,
            ["["] = 84,
            ["]"] = 85,
            [";"] = 86,
            [","] = 87,
            [":"] = 88,

            // Common Ghidra types
            ["undefined"] = 100,
            ["undefined1"] = 101,
            ["undefined2"] = 102,
            ["undefined4"] = 103,
            ["undefined8"] = 104,
            ["byte"] = 105,
            ["word"] = 106,
            ["dword"] = 107,
            ["qword"] = 108,
            ["bool"] = 109,

            // Common functions
            ["malloc"] = 200,
            ["free"] = 201,
            ["memcpy"] = 202,
            ["memset"] = 203,
            ["strlen"] = 204,
            ["strcpy"] = 205,
            ["strcmp"] = 206,
            ["printf"] = 207,
            ["sprintf"] = 208
        };

        return vocab.ToImmutableDictionary();
    }

    [GeneratedRegex(@"\s+")]
    private static partial Regex WhitespaceRegex();

    [GeneratedRegex(@"([a-zA-Z_][a-zA-Z0-9_]*|0[xX][0-9a-fA-F]+|\d+|""[^""]*""|'[^']*'|[+\-*/%=<>!&|^~]+|[(){}\[\];,.:])")]
    private static partial Regex TokenRegex();
}
|
||||
@@ -0,0 +1,174 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML;
|
||||
|
||||
/// <summary>
/// Service for generating and comparing function embeddings.
/// </summary>
public interface IEmbeddingService
{
    /// <summary>
    /// Generate embedding vector for a function.
    /// </summary>
    /// <param name="input">Function input data.</param>
    /// <param name="options">Embedding options; when null, implementation defaults apply.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Function embedding with vector.</returns>
    Task<FunctionEmbedding> GenerateEmbeddingAsync(
        EmbeddingInput input,
        EmbeddingOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate embeddings for multiple functions in batch.
    /// </summary>
    /// <param name="inputs">Function inputs.</param>
    /// <param name="options">Embedding options; when null, implementation defaults apply.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Function embeddings.</returns>
    Task<ImmutableArray<FunctionEmbedding>> GenerateBatchAsync(
        IEnumerable<EmbeddingInput> inputs,
        EmbeddingOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Compute similarity between two embeddings.
    /// </summary>
    /// <param name="a">First embedding.</param>
    /// <param name="b">Second embedding.</param>
    /// <param name="metric">Similarity metric to use; defaults to cosine similarity.</param>
    /// <returns>Similarity score (0.0 to 1.0).</returns>
    decimal ComputeSimilarity(
        FunctionEmbedding a,
        FunctionEmbedding b,
        SimilarityMetric metric = SimilarityMetric.Cosine);

    /// <summary>
    /// Find similar functions in an embedding index.
    /// </summary>
    /// <param name="query">Query embedding.</param>
    /// <param name="topK">Number of results to return (default 10).</param>
    /// <param name="minSimilarity">Minimum similarity threshold (default 0.7).</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Matching functions sorted by similarity.</returns>
    Task<ImmutableArray<EmbeddingMatch>> FindSimilarAsync(
        FunctionEmbedding query,
        int topK = 10,
        decimal minSimilarity = 0.7m,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Service for training ML models.
/// </summary>
public interface IModelTrainingService
{
    /// <summary>
    /// Train embedding model on function pairs.
    /// </summary>
    /// <param name="trainingData">Training pairs, streamed asynchronously.</param>
    /// <param name="options">Training options.</param>
    /// <param name="progress">Optional progress reporter, invoked during training.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Training result.</returns>
    Task<TrainingResult> TrainAsync(
        IAsyncEnumerable<TrainingPair> trainingData,
        TrainingOptions options,
        IProgress<TrainingProgress>? progress = null,
        CancellationToken ct = default);

    /// <summary>
    /// Evaluate model on test data.
    /// </summary>
    /// <param name="testData">Test pairs, streamed asynchronously.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Evaluation metrics.</returns>
    Task<EvaluationResult> EvaluateAsync(
        IAsyncEnumerable<TrainingPair> testData,
        CancellationToken ct = default);

    /// <summary>
    /// Export trained model to specified format.
    /// </summary>
    /// <param name="outputPath">Output path for model.</param>
    /// <param name="format">Export format; defaults to ONNX.</param>
    /// <param name="ct">Cancellation token.</param>
    Task ExportModelAsync(
        string outputPath,
        ModelExportFormat format = ModelExportFormat.Onnx,
        CancellationToken ct = default);
}
|
||||
|
||||
/// <summary>
/// Tokenizer for converting code to token sequences.
/// </summary>
public interface ITokenizer
{
    /// <summary>
    /// Tokenize text into token IDs.
    /// </summary>
    /// <param name="text">Input text.</param>
    /// <param name="maxLength">Maximum sequence length (default 512).</param>
    /// <returns>Token ID array of exactly <paramref name="maxLength"/> elements.</returns>
    long[] Tokenize(string text, int maxLength = 512);

    /// <summary>
    /// Tokenize with attention mask.
    /// </summary>
    /// <param name="text">Input text.</param>
    /// <param name="maxLength">Maximum sequence length (default 512).</param>
    /// <returns>Token IDs and attention mask, both of length <paramref name="maxLength"/>.</returns>
    (long[] InputIds, long[] AttentionMask) TokenizeWithMask(string text, int maxLength = 512);

    /// <summary>
    /// Decode token IDs back to text.
    /// </summary>
    /// <param name="tokenIds">Token IDs.</param>
    /// <returns>Decoded text.</returns>
    string Decode(long[] tokenIds);
}
|
||||
|
||||
/// <summary>
/// Index for efficient embedding similarity search.
/// </summary>
public interface IEmbeddingIndex
{
    /// <summary>
    /// Add embedding to index.
    /// </summary>
    /// <param name="embedding">Embedding to add.</param>
    /// <param name="ct">Cancellation token.</param>
    Task AddAsync(FunctionEmbedding embedding, CancellationToken ct = default);

    /// <summary>
    /// Add multiple embeddings to index.
    /// </summary>
    /// <param name="embeddings">Embeddings to add.</param>
    /// <param name="ct">Cancellation token.</param>
    Task AddBatchAsync(IEnumerable<FunctionEmbedding> embeddings, CancellationToken ct = default);

    /// <summary>
    /// Search for similar embeddings.
    /// </summary>
    /// <param name="query">Query vector.</param>
    /// <param name="topK">Maximum number of results to return.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Similar embeddings with scores.</returns>
    Task<ImmutableArray<(FunctionEmbedding Embedding, decimal Similarity)>> SearchAsync(
        float[] query,
        int topK,
        CancellationToken ct = default);

    /// <summary>
    /// Get total count of indexed embeddings.
    /// </summary>
    int Count { get; }

    /// <summary>
    /// Clear all embeddings from index.
    /// </summary>
    void Clear();
}
|
||||
@@ -0,0 +1,138 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML;
|
||||
|
||||
/// <summary>
/// In-memory embedding index using brute-force cosine similarity search.
/// Keyed by function id; adding an embedding with an existing id replaces it.
/// For production use, consider using a vector database like Milvus or Pinecone.
/// </summary>
public sealed class InMemoryEmbeddingIndex : IEmbeddingIndex
{
    // ConcurrentDictionary provides all required thread safety; a previously
    // declared-but-unused lock object has been removed.
    private readonly ConcurrentDictionary<string, FunctionEmbedding> _embeddings = new();

    /// <inheritdoc />
    public int Count => _embeddings.Count;

    /// <inheritdoc />
    public Task AddAsync(FunctionEmbedding embedding, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(embedding);
        ct.ThrowIfCancellationRequested();

        _embeddings[embedding.FunctionId] = embedding;
        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task AddBatchAsync(IEnumerable<FunctionEmbedding> embeddings, CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(embeddings);

        foreach (var embedding in embeddings)
        {
            // Honor cancellation between items so large batches can be aborted.
            ct.ThrowIfCancellationRequested();
            _embeddings[embedding.FunctionId] = embedding;
        }

        return Task.CompletedTask;
    }

    /// <inheritdoc />
    public Task<ImmutableArray<(FunctionEmbedding Embedding, decimal Similarity)>> SearchAsync(
        float[] query,
        int topK,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(query);
        if (topK <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(topK), "topK must be positive");
        }

        ct.ThrowIfCancellationRequested();

        // Calculate similarity for all embeddings (brute force: O(n * d))
        var similarities = new List<(FunctionEmbedding Embedding, decimal Similarity)>();

        foreach (var embedding in _embeddings.Values)
        {
            if (embedding.Vector.Length != query.Length)
            {
                continue; // Skip incompatible dimensions
            }

            var similarity = CosineSimilarity(query, embedding.Vector);
            similarities.Add((embedding, similarity));
        }

        // Sort by similarity descending and take top K
        var results = similarities
            .OrderByDescending(s => s.Similarity)
            .Take(topK)
            .ToImmutableArray();

        return Task.FromResult(results);
    }

    /// <inheritdoc />
    public void Clear()
    {
        _embeddings.Clear();
    }

    /// <summary>
    /// Get an embedding by function ID.
    /// </summary>
    /// <param name="functionId">Function identifier.</param>
    /// <returns>Embedding if found, null otherwise.</returns>
    public FunctionEmbedding? Get(string functionId)
    {
        return _embeddings.TryGetValue(functionId, out var embedding) ? embedding : null;
    }

    /// <summary>
    /// Remove an embedding by function ID.
    /// </summary>
    /// <param name="functionId">Function identifier.</param>
    /// <returns>True if removed, false if not found.</returns>
    public bool Remove(string functionId)
    {
        return _embeddings.TryRemove(functionId, out _);
    }

    /// <summary>
    /// Get all embeddings.
    /// </summary>
    /// <returns>All stored embeddings.</returns>
    public IEnumerable<FunctionEmbedding> GetAll()
    {
        return _embeddings.Values;
    }

    /// <summary>
    /// Cosine similarity of two equal-length vectors, computed in double precision
    /// and clamped to [-1, 1] before conversion to decimal. Returns 0 when either
    /// vector has zero magnitude.
    /// </summary>
    private static decimal CosineSimilarity(float[] a, float[] b)
    {
        var dotProduct = 0.0;
        var normA = 0.0;
        var normB = 0.0;

        for (var i = 0; i < a.Length; i++)
        {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }

        if (normA == 0 || normB == 0)
        {
            return 0;
        }

        var similarity = dotProduct / (Math.Sqrt(normA) * Math.Sqrt(normB));
        // Clamp guards against floating-point drift slightly outside [-1, 1].
        return (decimal)Math.Clamp(similarity, -1.0, 1.0);
    }
}
|
||||
@@ -0,0 +1,75 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML;
|
||||
|
||||
/// <summary>
/// Extension methods for registering ML services.
/// </summary>
public static class MlServiceCollectionExtensions
{
    /// <summary>
    /// Adds ML embedding services to the service collection, using the default
    /// (vocabulary-less) tokenizer.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddMlServices(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Register tokenizer
        services.AddSingleton<ITokenizer, BinaryCodeTokenizer>();

        return AddCoreMlServices(services);
    }

    /// <summary>
    /// Adds ML services with custom options.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="configureOptions">Action to configure ML options.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddMlServices(
        this IServiceCollection services,
        Action<MlOptions> configureOptions)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentNullException.ThrowIfNull(configureOptions);

        services.Configure(configureOptions);
        return services.AddMlServices();
    }

    /// <summary>
    /// Adds ML services with a custom tokenizer vocabulary.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <param name="vocabularyPath">Path to vocabulary file.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddMlServicesWithVocabulary(
        this IServiceCollection services,
        string vocabularyPath)
    {
        ArgumentNullException.ThrowIfNull(services);
        ArgumentException.ThrowIfNullOrEmpty(vocabularyPath);

        // Register tokenizer with vocabulary
        services.AddSingleton<ITokenizer>(sp => new BinaryCodeTokenizer(vocabularyPath));

        return AddCoreMlServices(services);
    }

    /// <summary>
    /// Registers the services shared by all overloads (embedding index and
    /// embedding service); previously duplicated in each overload.
    /// </summary>
    private static IServiceCollection AddCoreMlServices(IServiceCollection services)
    {
        // Register embedding index
        services.AddSingleton<IEmbeddingIndex, InMemoryEmbeddingIndex>();

        // Register embedding service
        services.AddScoped<IEmbeddingService, OnnxInferenceEngine>();

        return services;
    }
}
|
||||
259
src/BinaryIndex/__Libraries/StellaOps.BinaryIndex.ML/Models.cs
Normal file
259
src/BinaryIndex/__Libraries/StellaOps.BinaryIndex.ML/Models.cs
Normal file
@@ -0,0 +1,259 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML;
|
||||
|
||||
/// <summary>
/// Input for generating function embeddings.
/// NOTE(review): each representation is nullable — presumably callers supply at least
/// the one named by <paramref name="PreferredInput"/>; confirm against the engine.
/// </summary>
/// <param name="DecompiledCode">Decompiled C-like code if available.</param>
/// <param name="SemanticGraph">Semantic graph from IR analysis if available.</param>
/// <param name="InstructionBytes">Raw instruction bytes if available.</param>
/// <param name="PreferredInput">Which input type to prefer for embedding generation.</param>
public sealed record EmbeddingInput(
    string? DecompiledCode,
    KeySemanticsGraph? SemanticGraph,
    byte[]? InstructionBytes,
    EmbeddingInputType PreferredInput);

/// <summary>
/// Type of input for embedding generation.
/// </summary>
public enum EmbeddingInputType
{
    /// <summary>Use decompiled C-like code.</summary>
    DecompiledCode,

    /// <summary>Use semantic graph from IR analysis.</summary>
    SemanticGraph,

    /// <summary>Use raw instruction bytes.</summary>
    Instructions
}
|
||||
|
||||
/// <summary>
/// A function embedding vector.
/// NOTE(review): <c>Vector</c> is a mutable array, so record value-equality compares
/// it by reference and callers can mutate its contents — confirm whether value
/// semantics / immutability are required here.
/// </summary>
/// <param name="FunctionId">Identifier for the function.</param>
/// <param name="FunctionName">Name of the function.</param>
/// <param name="Vector">Embedding vector (typically 768 dimensions).</param>
/// <param name="Model">Model used to generate the embedding.</param>
/// <param name="InputType">Type of input used.</param>
/// <param name="GeneratedAt">When the embedding was generated.</param>
public sealed record FunctionEmbedding(
    string FunctionId,
    string FunctionName,
    float[] Vector,
    EmbeddingModel Model,
    EmbeddingInputType InputType,
    DateTimeOffset GeneratedAt);

/// <summary>
/// Available embedding models.
/// </summary>
public enum EmbeddingModel
{
    /// <summary>Fine-tuned CodeBERT for binary code analysis.</summary>
    CodeBertBinary,

    /// <summary>Graph neural network for CFG/call graph analysis.</summary>
    GraphSageFunction,

    /// <summary>Contrastive learning model for function similarity.</summary>
    ContrastiveFunction
}

/// <summary>
/// Similarity metrics for comparing embeddings.
/// </summary>
public enum SimilarityMetric
{
    /// <summary>Cosine similarity (angle between vectors).</summary>
    Cosine,

    /// <summary>Euclidean distance (inverted to similarity).</summary>
    Euclidean,

    /// <summary>Manhattan distance (inverted to similarity).</summary>
    Manhattan,

    /// <summary>Learned metric from model.</summary>
    LearnedMetric
}
|
||||
|
||||
/// <summary>
/// A match from embedding similarity search.
/// </summary>
/// <param name="FunctionId">Matched function identifier.</param>
/// <param name="FunctionName">Matched function name.</param>
/// <param name="Similarity">Similarity score (0.0 to 1.0).</param>
/// <param name="LibraryName">Library containing the function, if known.</param>
/// <param name="LibraryVersion">Version of the library, if known.</param>
public sealed record EmbeddingMatch(
    string FunctionId,
    string FunctionName,
    decimal Similarity,
    string? LibraryName,
    string? LibraryVersion);

/// <summary>
/// Options for embedding generation.
/// </summary>
public sealed record EmbeddingOptions
{
    /// <summary>Maximum sequence length for tokenization (default 512).</summary>
    public int MaxSequenceLength { get; init; } = 512;

    /// <summary>Whether to normalize the embedding vector (default true).</summary>
    public bool NormalizeVector { get; init; } = true;

    /// <summary>Batch size for batch inference (default 32).</summary>
    public int BatchSize { get; init; } = 32;
}

/// <summary>
/// Training pair for model training.
/// </summary>
/// <param name="FunctionA">First function input.</param>
/// <param name="FunctionB">Second function input.</param>
/// <param name="IsSimilar">Ground truth: are these the same function?</param>
/// <param name="SimilarityScore">Optional fine-grained similarity score.</param>
public sealed record TrainingPair(
    EmbeddingInput FunctionA,
    EmbeddingInput FunctionB,
    bool IsSimilar,
    decimal? SimilarityScore);

/// <summary>
/// Options for model training.
/// </summary>
public sealed record TrainingOptions
{
    /// <summary>Model architecture to train (default CodeBERT).</summary>
    public EmbeddingModel Model { get; init; } = EmbeddingModel.CodeBertBinary;

    /// <summary>Embedding vector dimension (default 768).</summary>
    public int EmbeddingDimension { get; init; } = 768;

    /// <summary>Training batch size (default 32).</summary>
    public int BatchSize { get; init; } = 32;

    /// <summary>Number of training epochs (default 10).</summary>
    public int Epochs { get; init; } = 10;

    /// <summary>Learning rate (default 1e-5).</summary>
    public double LearningRate { get; init; } = 1e-5;

    /// <summary>Margin for contrastive loss (default 0.5).</summary>
    public double MarginLoss { get; init; } = 0.5;

    /// <summary>Path to pretrained model weights; null to train from scratch.</summary>
    public string? PretrainedModelPath { get; init; }

    /// <summary>Path to save checkpoints; null to disable checkpointing.</summary>
    public string? CheckpointPath { get; init; }
}
|
||||
|
||||
/// <summary>
/// Progress update during training.
/// </summary>
/// <param name="Epoch">Current epoch.</param>
/// <param name="TotalEpochs">Total epochs.</param>
/// <param name="Batch">Current batch within the epoch.</param>
/// <param name="TotalBatches">Total batches per epoch.</param>
/// <param name="Loss">Current loss value.</param>
/// <param name="Accuracy">Current accuracy.</param>
public sealed record TrainingProgress(
    int Epoch,
    int TotalEpochs,
    int Batch,
    int TotalBatches,
    double Loss,
    double Accuracy);

/// <summary>
/// Result of model training.
/// </summary>
/// <param name="ModelPath">Path to saved model.</param>
/// <param name="TotalPairs">Number of training pairs used.</param>
/// <param name="Epochs">Number of epochs completed.</param>
/// <param name="FinalLoss">Final loss value.</param>
/// <param name="ValidationAccuracy">Validation accuracy.</param>
/// <param name="TrainingTime">Total training time.</param>
public sealed record TrainingResult(
    string ModelPath,
    int TotalPairs,
    int Epochs,
    double FinalLoss,
    double ValidationAccuracy,
    TimeSpan TrainingTime);

/// <summary>
/// Result of model evaluation.
/// </summary>
/// <param name="Accuracy">Overall accuracy.</param>
/// <param name="Precision">Precision (true positives / predicted positives).</param>
/// <param name="Recall">Recall (true positives / actual positives).</param>
/// <param name="F1Score">F1 score (harmonic mean of precision and recall).</param>
/// <param name="AucRoc">Area under ROC curve.</param>
/// <param name="ConfusionMatrix">Confusion matrix entries.</param>
public sealed record EvaluationResult(
    double Accuracy,
    double Precision,
    double Recall,
    double F1Score,
    double AucRoc,
    ImmutableArray<ConfusionEntry> ConfusionMatrix);

/// <summary>
/// Entry in confusion matrix.
/// </summary>
/// <param name="Predicted">Predicted label.</param>
/// <param name="Actual">Actual label.</param>
/// <param name="Count">Number of occurrences.</param>
public sealed record ConfusionEntry(
    string Predicted,
    string Actual,
    int Count);

/// <summary>
/// Model export formats.
/// </summary>
public enum ModelExportFormat
{
    /// <summary>ONNX format for cross-platform inference.</summary>
    Onnx,

    /// <summary>PyTorch format.</summary>
    PyTorch,

    /// <summary>TensorFlow SavedModel format.</summary>
    TensorFlow
}

/// <summary>
/// Options for ML service.
/// </summary>
public sealed record MlOptions
{
    /// <summary>Path to ONNX model file; when null or missing, a fallback path is used.</summary>
    public string? ModelPath { get; init; }

    /// <summary>Path to tokenizer vocabulary; when null, the built-in vocabulary is used.</summary>
    public string? VocabularyPath { get; init; }

    /// <summary>Device to use for inference (cpu, cuda). Default "cpu".</summary>
    public string Device { get; init; } = "cpu";

    /// <summary>Number of threads for inference (default 4).</summary>
    public int NumThreads { get; init; } = 4;

    /// <summary>Whether to use GPU if available (default false).</summary>
    public bool UseGpu { get; init; } = false;

    /// <summary>Maximum batch size for inference (default 32).</summary>
    public int MaxBatchSize { get; init; } = 32;
}
|
||||
@@ -0,0 +1,381 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using Microsoft.ML.OnnxRuntime;
|
||||
using Microsoft.ML.OnnxRuntime.Tensors;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML;
|
||||
|
||||
/// <summary>
/// ONNX Runtime-based embedding inference engine.
/// Loads an ONNX model when <see cref="MlOptions.ModelPath"/> points to an existing file;
/// otherwise produces deterministic hash-based pseudo-embeddings (testing fallback).
/// </summary>
public sealed class OnnxInferenceEngine : IEmbeddingService, IAsyncDisposable
{
    // Dimension of the fallback pseudo-embedding produced when no ONNX model is loaded.
    private const int FallbackDimension = 768;

    // Heuristic: first identifier followed by '(' is treated as the function name.
    // Compiled once; the original allocated a Regex per ExtractFunctionName call.
    private static readonly System.Text.RegularExpressions.Regex FunctionNameRegex =
        new(@"\b(\w+)\s*\(", System.Text.RegularExpressions.RegexOptions.Compiled);

    private readonly InferenceSession? _session;
    private readonly ITokenizer _tokenizer;
    private readonly IEmbeddingIndex? _index;
    private readonly MlOptions _options;
    private readonly ILogger<OnnxInferenceEngine> _logger;
    private readonly TimeProvider _timeProvider;
    private bool _disposed;

    public OnnxInferenceEngine(
        ITokenizer tokenizer,
        IOptions<MlOptions> options,
        ILogger<OnnxInferenceEngine> logger,
        TimeProvider timeProvider,
        IEmbeddingIndex? index = null)
    {
        ArgumentNullException.ThrowIfNull(tokenizer);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);
        ArgumentNullException.ThrowIfNull(timeProvider);

        _tokenizer = tokenizer;
        _options = options.Value;
        _logger = logger;
        _timeProvider = timeProvider;
        _index = index;

        if (!string.IsNullOrEmpty(_options.ModelPath) && File.Exists(_options.ModelPath))
        {
            var sessionOptions = new SessionOptions
            {
                GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL,
                ExecutionMode = ExecutionMode.ORT_PARALLEL,
                InterOpNumThreads = _options.NumThreads,
                IntraOpNumThreads = _options.NumThreads
            };

            _session = new InferenceSession(_options.ModelPath, sessionOptions);
            _logger.LogInformation(
                "Loaded ONNX model from {Path}",
                _options.ModelPath);
        }
        else
        {
            _logger.LogWarning(
                "No ONNX model found at {Path}, using fallback embedding",
                _options.ModelPath);
        }
    }

    /// <inheritdoc />
    public async Task<FunctionEmbedding> GenerateEmbeddingAsync(
        EmbeddingInput input,
        EmbeddingOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(input);
        ct.ThrowIfCancellationRequested();

        options ??= new EmbeddingOptions();

        var text = GetInputText(input);
        var functionId = ComputeFunctionId(text);

        float[] vector;

        if (_session is not null)
        {
            vector = await RunInferenceAsync(text, options, ct).ConfigureAwait(false);
        }
        else
        {
            // Fallback: generate hash-based pseudo-embedding for testing
            vector = GenerateFallbackEmbedding(text, FallbackDimension);
        }

        if (options.NormalizeVector)
        {
            NormalizeVector(vector);
        }

        return new FunctionEmbedding(
            functionId,
            ExtractFunctionName(text),
            vector,
            EmbeddingModel.CodeBertBinary,
            input.PreferredInput,
            _timeProvider.GetUtcNow());
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<FunctionEmbedding>> GenerateBatchAsync(
        IEnumerable<EmbeddingInput> inputs,
        EmbeddingOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(inputs);

        options ??= new EmbeddingOptions();
        var results = new List<FunctionEmbedding>();

        // Accumulate inputs and flush whenever a full batch is collected.
        var batch = new List<EmbeddingInput>();
        foreach (var input in inputs)
        {
            ct.ThrowIfCancellationRequested();
            batch.Add(input);

            if (batch.Count >= options.BatchSize)
            {
                var batchResults = await ProcessBatchAsync(batch, options, ct).ConfigureAwait(false);
                results.AddRange(batchResults);
                batch.Clear();
            }
        }

        // Process the final partial batch, if any.
        if (batch.Count > 0)
        {
            var batchResults = await ProcessBatchAsync(batch, options, ct).ConfigureAwait(false);
            results.AddRange(batchResults);
        }

        return [.. results];
    }

    /// <inheritdoc />
    public decimal ComputeSimilarity(
        FunctionEmbedding a,
        FunctionEmbedding b,
        SimilarityMetric metric = SimilarityMetric.Cosine)
    {
        ArgumentNullException.ThrowIfNull(a);
        ArgumentNullException.ThrowIfNull(b);

        if (a.Vector.Length != b.Vector.Length)
        {
            throw new ArgumentException("Embedding vectors must have same dimension");
        }

        return metric switch
        {
            SimilarityMetric.Cosine => CosineSimilarity(a.Vector, b.Vector),
            SimilarityMetric.Euclidean => EuclideanSimilarity(a.Vector, b.Vector),
            SimilarityMetric.Manhattan => ManhattanSimilarity(a.Vector, b.Vector),
            SimilarityMetric.LearnedMetric => CosineSimilarity(a.Vector, b.Vector), // Fallback
            _ => throw new ArgumentOutOfRangeException(nameof(metric))
        };
    }

    /// <inheritdoc />
    public async Task<ImmutableArray<EmbeddingMatch>> FindSimilarAsync(
        FunctionEmbedding query,
        int topK = 10,
        decimal minSimilarity = 0.7m,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(query);

        if (_index is null)
        {
            _logger.LogWarning("No embedding index configured, cannot search");
            return [];
        }

        var results = await _index.SearchAsync(query.Vector, topK, ct).ConfigureAwait(false);

        return results
            .Where(r => r.Similarity >= minSimilarity)
            .Select(r => new EmbeddingMatch(
                r.Embedding.FunctionId,
                r.Embedding.FunctionName,
                r.Similarity,
                null, // Library info would come from metadata
                null))
            .ToImmutableArray();
    }

    /// <summary>
    /// Tokenizes <paramref name="text"/> and runs the ONNX session.
    /// The blocking native Run call is offloaded via Task.Run.
    /// </summary>
    private async Task<float[]> RunInferenceAsync(
        string text,
        EmbeddingOptions options,
        CancellationToken ct)
    {
        if (_session is null)
        {
            throw new InvalidOperationException("ONNX session not initialized");
        }

        var (inputIds, attentionMask) = _tokenizer.TokenizeWithMask(text, options.MaxSequenceLength);

        // Batch dimension of 1 — single-sequence inference.
        var inputIdsTensor = new DenseTensor<long>(inputIds, [1, inputIds.Length]);
        var attentionMaskTensor = new DenseTensor<long>(attentionMask, [1, attentionMask.Length]);

        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("input_ids", inputIdsTensor),
            NamedOnnxValue.CreateFromTensor("attention_mask", attentionMaskTensor)
        };

        using var results = await Task.Run(() => _session.Run(inputs), ct).ConfigureAwait(false);

        var outputTensor = results.First().AsTensor<float>();
        return outputTensor.ToArray();
    }

    /// <summary>Runs a batch of inputs through single-item inference sequentially.</summary>
    private async Task<IEnumerable<FunctionEmbedding>> ProcessBatchAsync(
        List<EmbeddingInput> batch,
        EmbeddingOptions options,
        CancellationToken ct)
    {
        // For now, process sequentially
        // TODO: Implement true batch inference with batched tensors
        var results = new List<FunctionEmbedding>();
        foreach (var input in batch)
        {
            var embedding = await GenerateEmbeddingAsync(input, options, ct).ConfigureAwait(false);
            results.Add(embedding);
        }
        return results;
    }

    /// <summary>Selects the text representation for the input's preferred embedding type.</summary>
    private static string GetInputText(EmbeddingInput input)
    {
        return input.PreferredInput switch
        {
            EmbeddingInputType.DecompiledCode => input.DecompiledCode
                ?? throw new ArgumentException("DecompiledCode required"),
            EmbeddingInputType.SemanticGraph => SerializeGraph(input.SemanticGraph
                ?? throw new ArgumentException("SemanticGraph required")),
            EmbeddingInputType.Instructions => SerializeInstructions(input.InstructionBytes
                ?? throw new ArgumentException("InstructionBytes required")),
            // Name the offending argument (the original threw without one).
            _ => throw new ArgumentOutOfRangeException(nameof(input))
        };
    }

    /// <summary>Serializes a key-semantics graph to a line-oriented textual form for tokenization.</summary>
    private static string SerializeGraph(Semantic.KeySemanticsGraph graph)
    {
        // Convert graph to textual representation for tokenization
        var sb = new System.Text.StringBuilder();
        sb.AppendLine($"// Graph: {graph.Nodes.Length} nodes");

        foreach (var node in graph.Nodes)
        {
            sb.AppendLine($"node {node.Id}: {node.Operation}");
        }

        foreach (var edge in graph.Edges)
        {
            sb.AppendLine($"edge {edge.SourceId} -> {edge.TargetId}");
        }

        return sb.ToString();
    }

    /// <summary>Serializes raw instruction bytes as an uppercase hex string.</summary>
    private static string SerializeInstructions(byte[] bytes)
    {
        // Convert instruction bytes to hex representation
        return Convert.ToHexString(bytes);
    }

    /// <summary>Derives a stable 16-hex-char identifier from the SHA-256 of the input text.</summary>
    private static string ComputeFunctionId(string text)
    {
        var hash = System.Security.Cryptography.SHA256.HashData(
            System.Text.Encoding.UTF8.GetBytes(text));
        return Convert.ToHexString(hash)[..16];
    }

    /// <summary>Best-effort extraction of a function name from code text; "unknown" when no match.</summary>
    private static string ExtractFunctionName(string text)
    {
        // Try to extract function name from code
        var match = FunctionNameRegex.Match(text);
        return match.Success ? match.Groups[1].Value : "unknown";
    }

    /// <summary>
    /// Generates a deterministic pseudo-embedding seeded from the text's SHA-256 hash.
    /// Only used for testing when no model is available; values lie in [-1, 1).
    /// </summary>
    private static float[] GenerateFallbackEmbedding(string text, int dimension)
    {
        var hash = System.Security.Cryptography.SHA256.HashData(
            System.Text.Encoding.UTF8.GetBytes(text));

        // Deterministic seed: same text always yields the same vector.
        var random = new Random(BitConverter.ToInt32(hash, 0));
        var vector = new float[dimension];

        for (var i = 0; i < dimension; i++)
        {
            vector[i] = (float)(random.NextDouble() * 2 - 1);
        }

        return vector;
    }

    /// <summary>Scales the vector in place to unit L2 norm; zero vectors are left untouched.</summary>
    private static void NormalizeVector(float[] vector)
    {
        var norm = 0.0;
        for (var i = 0; i < vector.Length; i++)
        {
            norm += vector[i] * vector[i];
        }

        norm = Math.Sqrt(norm);
        if (norm > 0)
        {
            for (var i = 0; i < vector.Length; i++)
            {
                vector[i] /= (float)norm;
            }
        }
    }

    /// <summary>Cosine similarity clamped to [-1, 1]; 0 when either vector has zero norm.</summary>
    private static decimal CosineSimilarity(float[] a, float[] b)
    {
        var dotProduct = 0.0;
        var normA = 0.0;
        var normB = 0.0;

        for (var i = 0; i < a.Length; i++)
        {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }

        if (normA == 0 || normB == 0)
        {
            return 0;
        }

        var similarity = dotProduct / (Math.Sqrt(normA) * Math.Sqrt(normB));
        return (decimal)Math.Clamp(similarity, -1.0, 1.0);
    }

    /// <summary>Euclidean distance mapped to (0, 1] via 1 / (1 + distance); 1 means identical.</summary>
    private static decimal EuclideanSimilarity(float[] a, float[] b)
    {
        var sumSquares = 0.0;
        for (var i = 0; i < a.Length; i++)
        {
            var diff = a[i] - b[i];
            sumSquares += diff * diff;
        }

        var distance = Math.Sqrt(sumSquares);
        // Convert distance to similarity (0 = identical, larger = more different)
        return (decimal)(1.0 / (1.0 + distance));
    }

    /// <summary>Manhattan (L1) distance mapped to (0, 1] via 1 / (1 + distance).</summary>
    private static decimal ManhattanSimilarity(float[] a, float[] b)
    {
        var sum = 0.0;
        for (var i = 0; i < a.Length; i++)
        {
            sum += Math.Abs(a[i] - b[i]);
        }

        // Convert distance to similarity
        return (decimal)(1.0 / (1.0 + sum));
    }

    /// <summary>Disposes the ONNX session. Idempotent; completes synchronously.</summary>
    public ValueTask DisposeAsync()
    {
        if (!_disposed)
        {
            _session?.Dispose();
            _disposed = true;
        }

        // Nothing awaits disposal here, so complete synchronously instead of
        // the original's `await Task.CompletedTask`.
        return ValueTask.CompletedTask;
    }
}
|
||||
@@ -0,0 +1,23 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<Description>Machine learning-based function similarity using embeddings and ONNX inference for BinaryIndex.</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Decompiler\StellaOps.BinaryIndex.Decompiler.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Semantic\StellaOps.BinaryIndex.Semantic.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
<PackageReference Include="Microsoft.ML.OnnxRuntime" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,8 @@ using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Core.Models;
|
||||
using StellaOps.BinaryIndex.Core.Services;
|
||||
using StellaOps.BinaryIndex.Corpus;
|
||||
using StellaOps.BinaryIndex.Corpus.Models;
|
||||
using StellaOps.BinaryIndex.DeltaSig;
|
||||
using StellaOps.BinaryIndex.FixIndex.Repositories;
|
||||
using StellaOps.BinaryIndex.Fingerprints.Matching;
|
||||
@@ -19,6 +21,7 @@ public sealed class BinaryVulnerabilityService : IBinaryVulnerabilityService
|
||||
private readonly IFingerprintMatcher? _fingerprintMatcher;
|
||||
private readonly IDeltaSignatureMatcher? _deltaSigMatcher;
|
||||
private readonly IDeltaSignatureRepository? _deltaSigRepo;
|
||||
private readonly ICorpusQueryService? _corpusQueryService;
|
||||
private readonly ILogger<BinaryVulnerabilityService> _logger;
|
||||
|
||||
public BinaryVulnerabilityService(
|
||||
@@ -27,7 +30,8 @@ public sealed class BinaryVulnerabilityService : IBinaryVulnerabilityService
|
||||
IFixIndexRepository? fixIndexRepo = null,
|
||||
IFingerprintMatcher? fingerprintMatcher = null,
|
||||
IDeltaSignatureMatcher? deltaSigMatcher = null,
|
||||
IDeltaSignatureRepository? deltaSigRepo = null)
|
||||
IDeltaSignatureRepository? deltaSigRepo = null,
|
||||
ICorpusQueryService? corpusQueryService = null)
|
||||
{
|
||||
_assertionRepo = assertionRepo;
|
||||
_logger = logger;
|
||||
@@ -35,6 +39,7 @@ public sealed class BinaryVulnerabilityService : IBinaryVulnerabilityService
|
||||
_fingerprintMatcher = fingerprintMatcher;
|
||||
_deltaSigMatcher = deltaSigMatcher;
|
||||
_deltaSigRepo = deltaSigRepo;
|
||||
_corpusQueryService = corpusQueryService;
|
||||
}
|
||||
|
||||
public async Task<ImmutableArray<BinaryVulnMatch>> LookupByIdentityAsync(
|
||||
@@ -429,4 +434,197 @@ public sealed class BinaryVulnerabilityService : IBinaryVulnerabilityService
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
    /// <inheritdoc />
    /// <remarks>
    /// Returns empty when no corpus query service was injected. For each corpus match,
    /// optionally resolves CVE associations via a per-match follow-up query
    /// (see <c>GetCveAssociationsForFunctionAsync</c>) when the caller requests them.
    /// </remarks>
    public async Task<ImmutableArray<CorpusFunctionMatch>> IdentifyFunctionFromCorpusAsync(
        FunctionFingerprintSet fingerprints,
        CorpusLookupOptions? options = null,
        CancellationToken ct = default)
    {
        if (_corpusQueryService is null)
        {
            _logger.LogWarning("Corpus query service not configured, cannot identify function from corpus");
            return ImmutableArray<CorpusFunctionMatch>.Empty;
        }

        options ??= new CorpusLookupOptions();

        // Build corpus fingerprints from input
        var corpusFingerprints = BuildCorpusFingerprints(fingerprints);

        // Translate lookup options into the corpus-layer query options.
        // Single-element filters: the corpus API takes collections, the caller supplies scalars.
        var identifyOptions = new IdentifyOptions
        {
            MinSimilarity = options.MinSimilarity,
            MaxResults = options.MaxCandidates,
            LibraryFilter = options.LibraryFilter is not null
                ? [options.LibraryFilter]
                : null,
            ArchitectureFilter = fingerprints.Architecture is not null
                ? [fingerprints.Architecture]
                : null,
            IncludeCveInfo = options.IncludeCveAssociations
        };

        var corpusMatches = await _corpusQueryService.IdentifyFunctionAsync(
            corpusFingerprints,
            identifyOptions,
            ct).ConfigureAwait(false);

        // Convert corpus matches to service results
        var results = new List<CorpusFunctionMatch>();
        foreach (var match in corpusMatches)
        {
            // CVE associations would come from a separate query if needed
            var cveAssociations = ImmutableArray<CorpusCveAssociation>.Empty;
            if (options.IncludeCveAssociations)
            {
                // NOTE(review): one extra query per match — potential N+1 on large result sets.
                cveAssociations = await GetCveAssociationsForFunctionAsync(
                    match.LibraryName,
                    match.FunctionName,
                    match.Version,
                    options,
                    ct).ConfigureAwait(false);
            }

            results.Add(new CorpusFunctionMatch
            {
                LibraryName = match.LibraryName,
                VersionRange = match.Version,
                FunctionName = match.FunctionName,
                Confidence = match.Similarity,
                Method = MapCorpusMatchMethod(match.Details),
                SemanticSimilarity = match.Details.SemanticSimilarity,
                InstructionSimilarity = match.Details.InstructionSimilarity,
                CveAssociations = cveAssociations
            });
        }

        _logger.LogDebug("Corpus identification found {Count} matches", results.Count);
        return results.ToImmutableArray();
    }
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<ImmutableDictionary<string, ImmutableArray<CorpusFunctionMatch>>> IdentifyFunctionsFromCorpusBatchAsync(
|
||||
IEnumerable<(string Key, FunctionFingerprintSet Fingerprints)> functions,
|
||||
CorpusLookupOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
var results = new Dictionary<string, ImmutableArray<CorpusFunctionMatch>>();
|
||||
var functionList = functions.ToList();
|
||||
const int batchSize = 16;
|
||||
|
||||
for (var i = 0; i < functionList.Count; i += batchSize)
|
||||
{
|
||||
var batch = functionList.Skip(i).Take(batchSize).ToList();
|
||||
var tasks = batch.Select(async item =>
|
||||
{
|
||||
var matches = await IdentifyFunctionFromCorpusAsync(item.Fingerprints, options, ct)
|
||||
.ConfigureAwait(false);
|
||||
return (item.Key, matches);
|
||||
});
|
||||
|
||||
foreach (var (key, matches) in await Task.WhenAll(tasks).ConfigureAwait(false))
|
||||
{
|
||||
results[key] = matches;
|
||||
}
|
||||
}
|
||||
|
||||
_logger.LogDebug("Batch corpus identification processed {Count} functions", functionList.Count);
|
||||
return results.ToImmutableDictionary();
|
||||
}
|
||||
|
||||
    /// <summary>
    /// Maps the service-layer <see cref="FunctionFingerprintSet"/> to the corpus-layer
    /// <c>FunctionFingerprints</c> record used by the corpus query service.
    /// CfgHash and ApiCalls are intentionally left null (no source field is mapped here).
    /// </summary>
    private static FunctionFingerprints BuildCorpusFingerprints(FunctionFingerprintSet fingerprints)
    {
        return new FunctionFingerprints(
            SemanticHash: fingerprints.SemanticFingerprint,
            InstructionHash: fingerprints.InstructionFingerprint,
            CfgHash: null, // Map from API call or leave null
            ApiCalls: null,
            SizeBytes: fingerprints.FunctionSize);
    }
|
||||
|
||||
    /// <summary>
    /// Resolves CVE associations for a corpus-matched function version.
    /// Queries the function's evolution history, finds the exact version entry,
    /// and — when the caller supplied distro/release hints and fix checking is enabled —
    /// refines each CVE's affected state via the fix-index repository.
    /// Returns empty when the corpus service is absent, the evolution is unknown,
    /// or the version carries no CVE IDs.
    /// </summary>
    private async Task<ImmutableArray<CorpusCveAssociation>> GetCveAssociationsForFunctionAsync(
        string libraryName,
        string functionName,
        string version,
        CorpusLookupOptions options,
        CancellationToken ct)
    {
        if (_corpusQueryService is null)
            return ImmutableArray<CorpusCveAssociation>.Empty;

        // Get function evolution which includes CVE IDs if available
        var evolution = await _corpusQueryService.GetFunctionEvolutionAsync(
            libraryName,
            functionName,
            ct).ConfigureAwait(false);

        if (evolution is null)
            return ImmutableArray<CorpusCveAssociation>.Empty;

        // Find matching version (exact string equality; no version-range semantics here).
        var versionInfo = evolution.Versions
            .FirstOrDefault(v => v.Version == version);

        if (versionInfo?.CveIds is not { Length: > 0 })
            return ImmutableArray<CorpusCveAssociation>.Empty;

        var associations = new List<CorpusCveAssociation>();

        foreach (var cveId in versionInfo.CveIds.Value)
        {
            // Default to Vulnerable; only a positive fix-index answer can flip this to Fixed.
            var affectedState = CorpusAffectedState.Vulnerable;
            string? fixedInVersion = null;

            // Check fix status if requested — requires both distro and release hints.
            if (options.CheckFixStatus && _fixIndexRepo is not null &&
                !string.IsNullOrEmpty(options.DistroHint) && !string.IsNullOrEmpty(options.ReleaseHint))
            {
                var fixStatus = await _fixIndexRepo.GetFixStatusAsync(
                    options.DistroHint,
                    options.ReleaseHint,
                    libraryName,
                    cveId,
                    ct).ConfigureAwait(false);

                if (fixStatus is not null)
                {
                    fixedInVersion = fixStatus.FixedVersion;
                    affectedState = fixStatus.State == FixState.Fixed
                        ? CorpusAffectedState.Fixed
                        : CorpusAffectedState.Vulnerable;
                }
            }

            associations.Add(new CorpusCveAssociation
            {
                CveId = cveId,
                AffectedState = affectedState,
                FixedInVersion = fixedInVersion,
                Confidence = 0.85m, // Default confidence for corpus-based associations
                EvidenceType = "corpus"
            });
        }

        return associations.ToImmutableArray();
    }
|
||||
|
||||
private static CorpusMatchMethod MapCorpusMatchMethod(Corpus.Models.MatchDetails details)
|
||||
{
|
||||
// Determine primary match method based on which similarity is highest
|
||||
var hasSemantic = details.SemanticSimilarity > 0;
|
||||
var hasInstruction = details.InstructionSimilarity > 0;
|
||||
var hasApiCall = details.ApiCallSimilarity > 0;
|
||||
|
||||
if (hasSemantic && hasInstruction)
|
||||
return CorpusMatchMethod.Combined;
|
||||
if (hasSemantic)
|
||||
return CorpusMatchMethod.Semantic;
|
||||
if (hasInstruction)
|
||||
return CorpusMatchMethod.Instruction;
|
||||
if (hasApiCall)
|
||||
return CorpusMatchMethod.ApiCall;
|
||||
|
||||
return CorpusMatchMethod.Combined;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
# BinaryIndex.Semantic Module Charter
|
||||
|
||||
## Mission
|
||||
Provide semantic-level binary function analysis that goes beyond instruction-byte comparison. Enable accurate function matching that is resilient to compiler optimizations, instruction reordering, and register allocation differences.
|
||||
|
||||
## Responsibilities
|
||||
- Lift disassembled instructions to B2R2 LowUIR intermediate representation
|
||||
- Transform IR to SSA form for dataflow analysis (optional)
|
||||
- Extract Key-Semantics Graphs (KSG) capturing data/control dependencies
|
||||
- Generate deterministic semantic fingerprints via Weisfeiler-Lehman graph hashing
|
||||
- Provide semantic similarity matching between functions
|
||||
|
||||
## Key Abstractions
|
||||
|
||||
### Services
|
||||
- `IIrLiftingService` - Lifts instructions to IR (LowUIR/SSA)
|
||||
- `ISemanticGraphExtractor` - Extracts KSG from lifted IR
|
||||
- `ISemanticFingerprintGenerator` - Generates semantic fingerprints
|
||||
- `ISemanticMatcher` - Computes semantic similarity
|
||||
|
||||
### Models
|
||||
- `LiftedFunction` - Function with IR statements and CFG
|
||||
- `SsaFunction` - Function in SSA form with def-use chains
|
||||
- `KeySemanticsGraph` - Semantic graph with nodes and edges
|
||||
- `SemanticFingerprint` - Hash-based semantic fingerprint
|
||||
- `SemanticMatchResult` - Similarity result with confidence
|
||||
|
||||
## Dependencies
|
||||
- `StellaOps.BinaryIndex.Disassembly.Abstractions` - Instruction models
|
||||
- `StellaOps.BinaryIndex.Disassembly` - Disassembly service
|
||||
- B2R2 (via Disassembly.B2R2 plugin) - IR lifting backend
|
||||
|
||||
## Working Agreement
|
||||
1. **Determinism** - All graph hashing and fingerprinting must be deterministic
|
||||
2. **Stable ordering** - Node/edge enumeration must use stable ordering
|
||||
3. **Immutable outputs** - All result types are immutable records
|
||||
4. **CancellationToken** - All async operations must propagate cancellation
|
||||
5. **Culture-invariant** - Use InvariantCulture for all string operations
|
||||
|
||||
## Test Coverage
|
||||
- Unit tests for each component in `__Tests/StellaOps.BinaryIndex.Semantic.Tests`
|
||||
- Golden tests with binaries compiled at different optimization levels
|
||||
- Property-based tests for hash determinism and collision resistance
|
||||
@@ -0,0 +1,47 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// Service for lifting disassembled instructions to intermediate representation.
/// </summary>
public interface IIrLiftingService
{
    /// <summary>
    /// Lift a disassembled function to B2R2 LowUIR intermediate representation.
    /// </summary>
    /// <param name="instructions">Disassembled instructions.</param>
    /// <param name="functionName">Name of the function.</param>
    /// <param name="startAddress">Start address of the function.</param>
    /// <param name="architecture">CPU architecture.</param>
    /// <param name="options">Lifting options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The lifted function with IR statements and CFG.</returns>
    Task<LiftedFunction> LiftToIrAsync(
        IReadOnlyList<DisassembledInstruction> instructions,
        string functionName,
        ulong startAddress,
        CpuArchitecture architecture,
        LiftOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Transform a lifted function to SSA form for dataflow analysis.
    /// </summary>
    /// <param name="lifted">The lifted function.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The function in SSA form with def-use chains.</returns>
    Task<SsaFunction> TransformToSsaAsync(
        LiftedFunction lifted,
        CancellationToken ct = default);

    /// <summary>
    /// Checks if the service supports the given architecture.
    /// Callers should check this before invoking <see cref="LiftToIrAsync"/>.
    /// </summary>
    /// <param name="architecture">CPU architecture to check.</param>
    /// <returns>True if the architecture is supported.</returns>
    bool SupportsArchitecture(CpuArchitecture architecture);
}
|
||||
@@ -0,0 +1,43 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// Service for generating semantic fingerprints from key-semantics graphs.
/// </summary>
public interface ISemanticFingerprintGenerator
{
    /// <summary>
    /// Generate a semantic fingerprint from a key-semantics graph.
    /// </summary>
    /// <param name="graph">The key-semantics graph.</param>
    /// <param name="address">Function start address.</param>
    /// <param name="options">Fingerprint generation options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The generated semantic fingerprint.</returns>
    Task<SemanticFingerprint> GenerateAsync(
        KeySemanticsGraph graph,
        ulong address,
        SemanticFingerprintOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Generate a semantic fingerprint from a lifted function (convenience method).
    /// Extracts the graph via <paramref name="graphExtractor"/> and then fingerprints it.
    /// </summary>
    /// <param name="function">The lifted function.</param>
    /// <param name="graphExtractor">Graph extractor to use.</param>
    /// <param name="options">Fingerprint generation options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The generated semantic fingerprint.</returns>
    Task<SemanticFingerprint> GenerateFromFunctionAsync(
        LiftedFunction function,
        ISemanticGraphExtractor graphExtractor,
        SemanticFingerprintOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Gets the algorithm used by this generator.
    /// </summary>
    SemanticFingerprintAlgorithm Algorithm { get; }
}
|
||||
@@ -0,0 +1,46 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// Service for extracting key-semantics graphs from lifted IR.
/// </summary>
public interface ISemanticGraphExtractor
{
    /// <summary>
    /// Extract a key-semantics graph from a lifted function.
    /// Captures: data dependencies, control dependencies, memory operations.
    /// </summary>
    /// <param name="function">The lifted function.</param>
    /// <param name="options">Graph extraction options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The extracted key-semantics graph.</returns>
    Task<KeySemanticsGraph> ExtractGraphAsync(
        LiftedFunction function,
        GraphExtractionOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Extract a key-semantics graph from an SSA function.
    /// More precise due to explicit def-use information.
    /// </summary>
    /// <param name="function">The SSA function.</param>
    /// <param name="options">Graph extraction options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The extracted key-semantics graph.</returns>
    Task<KeySemanticsGraph> ExtractGraphFromSsaAsync(
        SsaFunction function,
        GraphExtractionOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Canonicalize a graph for deterministic comparison.
    /// </summary>
    /// <param name="graph">The graph to canonicalize.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The canonicalized graph with node mappings.</returns>
    Task<CanonicalGraph> CanonicalizeAsync(
        KeySemanticsGraph graph,
        CancellationToken ct = default);
}
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// Service for computing semantic similarity between functions.
/// </summary>
public interface ISemanticMatcher
{
    /// <summary>
    /// Compute semantic similarity between two fingerprints.
    /// </summary>
    /// <param name="a">First fingerprint.</param>
    /// <param name="b">Second fingerprint.</param>
    /// <param name="options">Matching options; implementation defaults apply when null.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The match result with similarity scores.</returns>
    Task<SemanticMatchResult> MatchAsync(
        SemanticFingerprint a,
        SemanticFingerprint b,
        MatchOptions? options = null,
        CancellationToken ct = default);

    /// <summary>
    /// Find the best matches for a fingerprint in a corpus.
    /// </summary>
    /// <param name="query">The query fingerprint.</param>
    /// <param name="corpus">The corpus of fingerprints to search, streamed asynchronously.</param>
    /// <param name="minSimilarity">Minimum similarity threshold.</param>
    /// <param name="maxResults">Maximum number of results to return.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Best matching fingerprints ordered by similarity.</returns>
    Task<ImmutableArray<SemanticMatchResult>> FindMatchesAsync(
        SemanticFingerprint query,
        IAsyncEnumerable<SemanticFingerprint> corpus,
        decimal minSimilarity = 0.7m,
        int maxResults = 10,
        CancellationToken ct = default);

    /// <summary>
    /// Compute similarity between two semantic graphs directly.
    /// </summary>
    /// <param name="a">First graph.</param>
    /// <param name="b">Second graph.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>Graph similarity score (0.0 to 1.0).</returns>
    Task<decimal> ComputeGraphSimilarityAsync(
        KeySemanticsGraph a,
        KeySemanticsGraph b,
        CancellationToken ct = default);
}
|
||||
@@ -0,0 +1,113 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic.Internal;
|
||||
|
||||
/// <summary>
/// Produces a canonical, order-independent form of a semantic graph so that
/// structurally equivalent graphs compare and hash identically.
/// </summary>
internal sealed class GraphCanonicalizer
{
    /// <summary>
    /// Canonicalize a semantic graph by assigning deterministic node IDs.
    /// </summary>
    /// <param name="graph">The graph to canonicalize.</param>
    /// <returns>Canonicalized graph together with the old-to-new node ID mapping.</returns>
    public CanonicalGraph Canonicalize(KeySemanticsGraph graph)
    {
        ArgumentNullException.ThrowIfNull(graph);

        if (graph.Nodes.IsEmpty)
        {
            // Nothing to relabel; return the graph unchanged with an empty mapping.
            return new CanonicalGraph(
                graph,
                ImmutableDictionary<int, int>.Empty,
                []);
        }

        // Weisfeiler-Lehman refinement gives each node a structure-derived label.
        var wlHasher = new WeisfeilerLehmanHasher(iterations: 3);
        var wlLabels = wlHasher.ComputeCanonicalLabels(graph);

        // Deterministic node ordering: WL label first, then type and operation as tie-breakers.
        var orderedNodes = graph.Nodes
            .OrderBy(node => node.Id < wlLabels.Length ? wlLabels[node.Id] : string.Empty, StringComparer.Ordinal)
            .ThenBy(node => node.Type)
            .ThenBy(node => node.Operation, StringComparer.Ordinal)
            .ToList();

        // Each node's position in the sorted order becomes its canonical ID.
        var idMapping = new Dictionary<int, int>(orderedNodes.Count);
        var nextId = 0;
        foreach (var node in orderedNodes)
        {
            idMapping[node.Id] = nextId++;
        }

        var renumberedNodes = orderedNodes
            .Select((node, index) => node with { Id = index })
            .ToImmutableArray();

        // Rewrite edge endpoints through the mapping; edges touching unknown nodes are dropped.
        var renumberedEdges = graph.Edges
            .Where(edge => idMapping.ContainsKey(edge.SourceId) && idMapping.ContainsKey(edge.TargetId))
            .Select(edge => edge with
            {
                SourceId = idMapping[edge.SourceId],
                TargetId = idMapping[edge.TargetId]
            })
            .OrderBy(edge => edge.SourceId)
            .ThenBy(edge => edge.TargetId)
            .ThenBy(edge => edge.Type)
            .ToImmutableArray();

        var relabeledGraph = new KeySemanticsGraph(
            graph.FunctionName,
            renumberedNodes,
            renumberedEdges,
            graph.Properties);

        // Labels are recomputed on the canonical graph so they line up with the new IDs.
        var relabeledWlLabels = wlHasher.ComputeCanonicalLabels(relabeledGraph);

        return new CanonicalGraph(
            relabeledGraph,
            idMapping.ToImmutableDictionary(),
            relabeledWlLabels);
    }

    /// <summary>
    /// Compute a canonical string representation of a graph for hashing.
    /// </summary>
    /// <param name="graph">The graph to serialize.</param>
    /// <returns>Canonical string representation (nodes first, then edges, "|"-joined).</returns>
    public string ToCanonicalString(KeySemanticsGraph graph)
    {
        ArgumentNullException.ThrowIfNull(graph);

        var canonical = Canonicalize(graph);

        // Nodes serialize as N{id}:{type}:{op}:[operands]; operands sorted for determinism.
        var nodeParts = canonical.Graph.Nodes.Select(node =>
        {
            var operandList = string.Join(",", node.Operands.OrderBy(o => o, StringComparer.Ordinal));
            return string.Create(
                CultureInfo.InvariantCulture,
                $"N{node.Id}:{(int)node.Type}:{node.Operation}:[{operandList}]");
        });

        // Edges serialize as E{src}->{tgt}:{type}, already in canonical order.
        var edgeParts = canonical.Graph.Edges.Select(edge =>
            string.Create(
                CultureInfo.InvariantCulture,
                $"E{edge.SourceId}->{edge.TargetId}:{(int)edge.Type}"));

        return string.Join("|", nodeParts.Concat(edgeParts));
    }
}
|
||||
@@ -0,0 +1,228 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic.Internal;
|
||||
|
||||
/// <summary>
/// Weisfeiler-Lehman graph hashing for deterministic semantic fingerprints.
/// Uses iterative label refinement to capture graph structure.
/// </summary>
internal sealed class WeisfeilerLehmanHasher
{
    // Number of label-refinement rounds; each round widens the captured neighborhood by one hop.
    private readonly int _iterations;

    /// <summary>
    /// Creates a new Weisfeiler-Lehman hasher.
    /// </summary>
    /// <param name="iterations">Number of WL iterations (default: 3).</param>
    /// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="iterations"/> is less than 1.</exception>
    public WeisfeilerLehmanHasher(int iterations = 3)
    {
        ArgumentOutOfRangeException.ThrowIfLessThan(iterations, 1);
        _iterations = iterations;
    }

    /// <summary>
    /// Compute a deterministic hash of the semantic graph.
    /// </summary>
    /// <param name="graph">The semantic graph to hash.</param>
    /// <returns>SHA-256 hash of the graph.</returns>
    public byte[] ComputeHash(KeySemanticsGraph graph)
    {
        ArgumentNullException.ThrowIfNull(graph);

        if (graph.Nodes.IsEmpty)
        {
            // Stable sentinel so empty graphs always hash identically.
            return SHA256.HashData(Encoding.UTF8.GetBytes("EMPTY_GRAPH"));
        }

        var labels = RunRefinement(graph);

        // Final hash over the sorted refined labels (node order must not matter).
        return ComputeFinalHash(labels);
    }

    /// <summary>
    /// Compute canonical labels for all nodes (useful for graph comparison).
    /// </summary>
    /// <param name="graph">The semantic graph.</param>
    /// <returns>Array of canonical labels indexed by node ID.</returns>
    public ImmutableArray<string> ComputeCanonicalLabels(KeySemanticsGraph graph)
    {
        ArgumentNullException.ThrowIfNull(graph);

        if (graph.Nodes.IsEmpty)
        {
            return [];
        }

        var labels = RunRefinement(graph);

        // Return labels in node ID order; ID gaps map to empty strings.
        var maxId = graph.Nodes.Max(n => n.Id);
        var result = new string[maxId + 1];

        foreach (var node in graph.Nodes)
        {
            result[node.Id] = labels.TryGetValue(node.Id, out var label) ? label : string.Empty;
        }

        return [.. result];
    }

    /// <summary>
    /// Shared refinement driver: builds the adjacency and edge-type lookups once,
    /// then runs the configured number of WL iterations.
    /// PERF FIX: the edge-type lookup was previously rebuilt inside every
    /// iteration (BuildEdgeLookup called from RefineLabels); it is loop-invariant.
    /// </summary>
    private Dictionary<int, string> RunRefinement(KeySemanticsGraph graph)
    {
        var outEdges = BuildAdjacencyList(graph.Edges, e => e.SourceId, e => e.TargetId);
        var inEdges = BuildAdjacencyList(graph.Edges, e => e.TargetId, e => e.SourceId);
        var edgeLookup = BuildEdgeLookup(graph.Edges);

        var labels = InitializeLabels(graph.Nodes);

        for (var i = 0; i < _iterations; i++)
        {
            labels = RefineLabels(graph.Nodes, labels, outEdges, inEdges, edgeLookup);
        }

        return labels;
    }

    // Maps each edge endpoint (selected by keySelector) to the list of opposite endpoints.
    private static Dictionary<int, List<int>> BuildAdjacencyList(
        ImmutableArray<SemanticEdge> edges,
        Func<SemanticEdge, int> keySelector,
        Func<SemanticEdge, int> valueSelector)
    {
        var result = new Dictionary<int, List<int>>();

        foreach (var edge in edges)
        {
            var key = keySelector(edge);
            var value = valueSelector(edge);

            if (!result.TryGetValue(key, out var list))
            {
                list = [];
                result[key] = list;
            }

            list.Add(value);
        }

        return result;
    }

    // Seed labels carry only local information: node type + operation.
    private static Dictionary<int, string> InitializeLabels(ImmutableArray<SemanticNode> nodes)
    {
        var labels = new Dictionary<int, string>(nodes.Length);

        foreach (var node in nodes)
        {
            var label = string.Create(
                CultureInfo.InvariantCulture,
                $"{(int)node.Type}:{node.Operation}");

            labels[node.Id] = label;
        }

        return labels;
    }

    /// <summary>
    /// One WL round: each node's new label hashes its current label together with the
    /// sorted labels of its outgoing ("O"-tagged) and incoming ("I"-tagged) neighbors,
    /// each annotated with the connecting edge's type.
    /// </summary>
    private static Dictionary<int, string> RefineLabels(
        ImmutableArray<SemanticNode> nodes,
        Dictionary<int, string> currentLabels,
        Dictionary<int, List<int>> outEdges,
        Dictionary<int, List<int>> inEdges,
        Dictionary<(int, int), SemanticEdgeType> edgeLookup)
    {
        var newLabels = new Dictionary<int, string>(nodes.Length);

        foreach (var node in nodes)
        {
            var sb = new StringBuilder();
            sb.Append(currentLabels[node.Id]);
            sb.Append('|');

            // Sorted outgoing neighbor labels with edge types.
            if (outEdges.TryGetValue(node.Id, out var outNeighbors))
            {
                var neighborLabels = outNeighbors
                    .Select(n =>
                    {
                        var edgeType = GetEdgeType(edgeLookup, node.Id, n);
                        return string.Create(
                            CultureInfo.InvariantCulture,
                            $"O{(int)edgeType}:{currentLabels[n]}");
                    })
                    .OrderBy(l => l, StringComparer.Ordinal)
                    .ToList();

                sb.AppendJoin(',', neighborLabels);
            }

            sb.Append('|');

            // Sorted incoming neighbor labels with edge types.
            if (inEdges.TryGetValue(node.Id, out var inNeighbors))
            {
                var neighborLabels = inNeighbors
                    .Select(n =>
                    {
                        var edgeType = GetEdgeType(edgeLookup, n, node.Id);
                        return string.Create(
                            CultureInfo.InvariantCulture,
                            $"I{(int)edgeType}:{currentLabels[n]}");
                    })
                    .OrderBy(l => l, StringComparer.Ordinal)
                    .ToList();

                sb.AppendJoin(',', neighborLabels);
            }

            // Compress the neighborhood string into a short, stable label.
            var combined = sb.ToString();
            var hash = SHA256.HashData(Encoding.UTF8.GetBytes(combined));
            newLabels[node.Id] = Convert.ToHexString(hash)[..16]; // Use first 16 hex chars
        }

        return newLabels;
    }

    // NOTE(review): parallel edges between the same (source, target) pair collapse to
    // the last edge's type here — confirm whether the graph model allows multi-edges.
    private static Dictionary<(int, int), SemanticEdgeType> BuildEdgeLookup(ImmutableArray<SemanticEdge> edges)
    {
        var lookup = new Dictionary<(int, int), SemanticEdgeType>(edges.Length);

        foreach (var edge in edges)
        {
            lookup[(edge.SourceId, edge.TargetId)] = edge.Type;
        }

        return lookup;
    }

    // Unknown edges (not present in the lookup) fall back to SemanticEdgeType.Unknown.
    private static SemanticEdgeType GetEdgeType(
        Dictionary<(int, int), SemanticEdgeType> lookup,
        int source,
        int target)
    {
        return lookup.TryGetValue((source, target), out var type) ? type : SemanticEdgeType.Unknown;
    }

    // Final digest: ordinal-sort all labels for determinism and hash the "|"-join.
    private static byte[] ComputeFinalHash(Dictionary<int, string> labels)
    {
        var sortedLabels = labels.Values
            .OrderBy(l => l, StringComparer.Ordinal)
            .ToList();

        var combined = string.Join("|", sortedLabels);
        return SHA256.HashData(Encoding.UTF8.GetBytes(combined));
    }
}
|
||||
@@ -0,0 +1,458 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Disassembly;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// Default implementation of IR lifting service.
/// Note: This implementation provides a basic IR model transformation.
/// For full B2R2 LowUIR lifting, use the B2R2-specific adapter.
/// </summary>
public sealed class IrLiftingService : IIrLiftingService
{
    private readonly ILogger<IrLiftingService> _logger;

    // Architectures this basic lifter can translate.
    private static readonly ImmutableHashSet<CpuArchitecture> SupportedArchitectures =
    [
        CpuArchitecture.X86,
        CpuArchitecture.X86_64,
        CpuArchitecture.ARM32,
        CpuArchitecture.ARM64
    ];

    /// <summary>
    /// Creates a new IR lifting service.
    /// </summary>
    /// <param name="logger">Logger instance.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="logger"/> is null.</exception>
    public IrLiftingService(ILogger<IrLiftingService> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <inheritdoc />
    public bool SupportsArchitecture(CpuArchitecture architecture) =>
        SupportedArchitectures.Contains(architecture);

    /// <inheritdoc />
    public Task<LiftedFunction> LiftToIrAsync(
        IReadOnlyList<DisassembledInstruction> instructions,
        string functionName,
        ulong startAddress,
        CpuArchitecture architecture,
        LiftOptions? options = null,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(instructions);
        ct.ThrowIfCancellationRequested();

        options ??= LiftOptions.Default;

        if (!SupportsArchitecture(architecture))
        {
            throw new NotSupportedException(
                $"Architecture {architecture} is not supported for IR lifting.");
        }

        _logger.LogDebug(
            "Lifting {InstructionCount} instructions for function {FunctionName} ({Architecture})",
            instructions.Count,
            functionName,
            architecture);

        // Apply the instruction cap up front so every later step sees the same
        // truncated sequence.
        // BUG FIX: the trailing block previously used instructions[^1], which
        // pointed past the cap when MaxInstructions truncated the input, giving
        // the trailing block a wrong end address.
        var limit = options.MaxInstructions > 0 ? options.MaxInstructions : int.MaxValue;
        var selected = instructions.Take(limit).ToList();

        // Convert disassembled instructions to IR statements, closing a basic
        // block whenever a terminator instruction is seen.
        var statements = new List<IrStatement>();
        var basicBlocks = new List<IrBasicBlock>();
        var currentBlockStatements = new List<int>();
        var blockStartAddress = startAddress;
        var statementId = 0;
        var blockId = 0;

        foreach (var instr in selected)
        {
            ct.ThrowIfCancellationRequested();

            var stmt = ConvertInstructionToStatement(instr, statementId++);
            statements.Add(stmt);
            currentBlockStatements.Add(stmt.Id);

            // Check for block-ending instructions.
            if (IsBlockTerminator(instr))
            {
                // BUG FIX: capture the ID before incrementing so the label matches
                // the block ID (previously block Id 0 was labeled "bb_1", while the
                // trailing-block path below labeled Id N as "bb_N").
                var id = blockId++;
                var block = new IrBasicBlock(
                    id,
                    $"bb_{id}",
                    blockStartAddress,
                    instr.Address + (ulong)instr.RawBytes.Length,
                    [.. currentBlockStatements],
                    [], // Predecessors filled in later
                    []); // Successors filled in later

                basicBlocks.Add(block);
                currentBlockStatements.Clear();
                blockStartAddress = instr.Address + (ulong)instr.RawBytes.Length;
            }
        }

        // Handle trailing statements not closed by a terminator.
        if (currentBlockStatements.Count > 0)
        {
            var lastInstr = selected[^1];
            basicBlocks.Add(new IrBasicBlock(
                blockId,
                $"bb_{blockId}",
                blockStartAddress,
                lastInstr.Address + (ulong)lastInstr.RawBytes.Length,
                [.. currentBlockStatements],
                [],
                []));
        }

        // Build control flow graph (optional per options).
        var cfg = options.RecoverCfg
            ? BuildControlFlowGraph(basicBlocks, statements)
            : new ControlFlowGraph(0, [basicBlocks.Count > 0 ? basicBlocks[^1].Id : 0], []);

        var result = new LiftedFunction(
            functionName,
            startAddress,
            [.. statements],
            [.. basicBlocks],
            cfg);

        _logger.LogDebug(
            "Lifted function {FunctionName}: {StatementCount} statements, {BlockCount} blocks",
            functionName,
            statements.Count,
            basicBlocks.Count);

        return Task.FromResult(result);
    }

    /// <inheritdoc />
    public Task<SsaFunction> TransformToSsaAsync(
        LiftedFunction lifted,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(lifted);
        ct.ThrowIfCancellationRequested();

        _logger.LogDebug("Transforming function {FunctionName} to SSA form", lifted.Name);

        // Convert IR statements to SSA statements with per-variable versioning.
        var ssaStatements = new List<SsaStatement>();
        var ssaBlocks = new List<SsaBasicBlock>();
        var versions = new Dictionary<string, int>();

        foreach (var stmt in lifted.Statements)
        {
            ct.ThrowIfCancellationRequested();

            ssaStatements.Add(ConvertToSsaStatement(stmt, versions));
        }

        // PERF FIX: index SSA statements by ID once; the previous
        // FirstOrDefault(s => s.Id == stmtId) made block assembly O(n^2).
        var ssaById = ssaStatements.ToDictionary(s => s.Id);

        // Create SSA blocks, splitting phi statements from ordinary ones.
        foreach (var block in lifted.BasicBlocks)
        {
            var blockPhis = new List<SsaStatement>();
            var blockStmts = new List<SsaStatement>();

            foreach (var stmtId in block.StatementIds)
            {
                if (ssaById.TryGetValue(stmtId, out var ssaStmt))
                {
                    if (ssaStmt.Kind == IrStatementKind.Phi)
                    {
                        blockPhis.Add(ssaStmt);
                    }
                    else
                    {
                        blockStmts.Add(ssaStmt);
                    }
                }
            }

            ssaBlocks.Add(new SsaBasicBlock(
                block.Id,
                block.Label,
                [.. blockPhis],
                [.. blockStmts],
                block.Predecessors,
                block.Successors));
        }

        // Build def-use chains over the full statement list.
        var defUse = BuildDefUseChains(ssaStatements);

        var result = new SsaFunction(
            lifted.Name,
            lifted.Address,
            [.. ssaStatements],
            [.. ssaBlocks],
            defUse);

        _logger.LogDebug(
            "Transformed function {FunctionName} to SSA: {StatementCount} statements",
            lifted.Name,
            ssaStatements.Count);

        return Task.FromResult(result);
    }

    // Maps one disassembled instruction to a single IR statement, splitting its
    // operands into destination (first, for writing kinds) and sources.
    private static IrStatement ConvertInstructionToStatement(
        DisassembledInstruction instr,
        int statementId)
    {
        var kind = MapInstructionKindToStatementKind(instr.Kind);
        var operation = instr.Mnemonic.ToUpperInvariant();

        // Parse destination and sources from operands.
        IrOperand? destination = null;
        var sources = new List<IrOperand>();

        for (var i = 0; i < instr.Operands.Length; i++)
        {
            var operand = ConvertOperand(instr.Operands[i]);

            // First operand is typically the destination for writing operations.
            if (i == 0 && IsDestinationOperation(instr.Kind))
            {
                destination = operand;
            }
            else
            {
                sources.Add(operand);
            }
        }

        return new IrStatement(
            statementId,
            instr.Address,
            kind,
            operation,
            destination,
            [.. sources]);
    }

    // Coarse mapping from disassembly classification to IR statement kind.
    private static IrStatementKind MapInstructionKindToStatementKind(InstructionKind kind)
    {
        return kind switch
        {
            InstructionKind.Arithmetic => IrStatementKind.BinaryOp,
            InstructionKind.Logic => IrStatementKind.BinaryOp,
            InstructionKind.Move => IrStatementKind.Assign,
            InstructionKind.Load => IrStatementKind.Load,
            InstructionKind.Store => IrStatementKind.Store,
            InstructionKind.Branch => IrStatementKind.Jump,
            InstructionKind.ConditionalBranch => IrStatementKind.ConditionalJump,
            InstructionKind.Call => IrStatementKind.Call,
            InstructionKind.Return => IrStatementKind.Return,
            InstructionKind.Nop => IrStatementKind.Nop,
            InstructionKind.Compare => IrStatementKind.Compare,
            InstructionKind.Shift => IrStatementKind.BinaryOp,
            InstructionKind.Syscall => IrStatementKind.Syscall,
            InstructionKind.Interrupt => IrStatementKind.Interrupt,
            _ => IrStatementKind.Unknown
        };
    }

    // Kinds whose first operand is treated as the written destination.
    private static bool IsDestinationOperation(InstructionKind kind)
    {
        return kind is InstructionKind.Arithmetic
            or InstructionKind.Logic
            or InstructionKind.Move
            or InstructionKind.Load
            or InstructionKind.Shift
            or InstructionKind.Compare;
    }

    // Converts a disassembly operand to its IR counterpart.
    private static IrOperand ConvertOperand(Operand operand)
    {
        var kind = operand.Type switch
        {
            OperandType.Register => IrOperandKind.Register,
            OperandType.Immediate => IrOperandKind.Immediate,
            OperandType.Memory => IrOperandKind.Memory,
            OperandType.Address => IrOperandKind.Label,
            _ => IrOperandKind.Unknown
        };

        return new IrOperand(
            kind,
            operand.Register ?? operand.Text,
            operand.Value,
            64, // Default bit size; actual width is not recovered here.
            operand.Type == OperandType.Memory);
    }

    // Instructions that close the current basic block.
    private static bool IsBlockTerminator(DisassembledInstruction instr)
    {
        return instr.Kind is InstructionKind.Branch
            or InstructionKind.ConditionalBranch
            or InstructionKind.Return
            or InstructionKind.Call; // Optional: calls can be block terminators
    }

    // Derives CFG edges from each block's final statement. Jump targets are not
    // resolved; only fall-through-style edges are emitted.
    private static ControlFlowGraph BuildControlFlowGraph(
        List<IrBasicBlock> blocks,
        List<IrStatement> statements)
    {
        if (blocks.Count == 0)
        {
            return new ControlFlowGraph(0, [], []);
        }

        // PERF FIX: statement IDs are unique; index once instead of a linear
        // scan per block (previously O(blocks * statements)).
        var statementsById = statements.ToDictionary(s => s.Id);

        var edges = new List<CfgEdge>();
        var exitBlocks = new List<int>();

        for (var i = 0; i < blocks.Count; i++)
        {
            var block = blocks[i];
            var lastStmtId = block.StatementIds.LastOrDefault();
            var lastStmt = statementsById.GetValueOrDefault(lastStmtId);

            if (lastStmt?.Kind == IrStatementKind.Return)
            {
                exitBlocks.Add(block.Id);
            }
            else if (lastStmt?.Kind == IrStatementKind.Jump)
            {
                // Unconditional jump - would need target resolution.
                // For now, assume fall-through.
                if (i + 1 < blocks.Count)
                {
                    edges.Add(new CfgEdge(block.Id, blocks[i + 1].Id, CfgEdgeKind.Jump));
                }
            }
            else if (lastStmt?.Kind == IrStatementKind.ConditionalJump)
            {
                // Conditional jump - only the fall-through (false) edge is emitted;
                // the taken edge would need target resolution.
                if (i + 1 < blocks.Count)
                {
                    edges.Add(new CfgEdge(block.Id, blocks[i + 1].Id, CfgEdgeKind.ConditionalFalse));
                }
            }
            else if (i + 1 < blocks.Count)
            {
                // Fall-through to next block.
                edges.Add(new CfgEdge(block.Id, blocks[i + 1].Id, CfgEdgeKind.FallThrough));
            }
            else
            {
                // Last block with no explicit return still exits the function.
                exitBlocks.Add(block.Id);
            }
        }

        return new ControlFlowGraph(
            blocks[0].Id,
            [.. exitBlocks],
            [.. edges]);
    }

    // Rewrites one IR statement into SSA form: sources read the current version
    // of each named variable; the destination receives a fresh version.
    private static SsaStatement ConvertToSsaStatement(
        IrStatement stmt,
        Dictionary<string, int> versions)
    {
        // Convert sources to SSA variables at their current versions.
        var ssaSources = new List<SsaVariable>();
        foreach (var source in stmt.Sources)
        {
            var varName = GetVariableName(source);
            if (!string.IsNullOrEmpty(varName))
            {
                var version = versions.GetValueOrDefault(varName, 0);
                ssaSources.Add(new SsaVariable(
                    varName,
                    version,
                    source.BitSize,
                    MapOperandKindToVariableKind(source.Kind)));
            }
        }

        // Handle destination with a new version.
        SsaVariable? ssaDest = null;
        if (stmt.Destination is not null)
        {
            var destName = GetVariableName(stmt.Destination);
            if (!string.IsNullOrEmpty(destName))
            {
                var newVersion = versions.GetValueOrDefault(destName, 0) + 1;
                versions[destName] = newVersion;

                ssaDest = new SsaVariable(
                    destName,
                    newVersion,
                    stmt.Destination.BitSize,
                    MapOperandKindToVariableKind(stmt.Destination.Kind));
            }
        }

        return new SsaStatement(
            stmt.Id,
            stmt.Address,
            stmt.Kind,
            stmt.Operation,
            ssaDest,
            [.. ssaSources]);
    }

    // Only registers and temporaries get SSA names; other operand kinds are skipped.
    private static string GetVariableName(IrOperand operand)
    {
        return operand.Kind switch
        {
            IrOperandKind.Register => operand.Name ?? "reg",
            IrOperandKind.Temporary => operand.Name ?? "tmp",
            _ => string.Empty
        };
    }

    private static SsaVariableKind MapOperandKindToVariableKind(IrOperandKind kind)
    {
        return kind switch
        {
            IrOperandKind.Register => SsaVariableKind.Register,
            IrOperandKind.Temporary => SsaVariableKind.Temporary,
            IrOperandKind.Memory => SsaVariableKind.Memory,
            IrOperandKind.Immediate => SsaVariableKind.Constant,
            _ => SsaVariableKind.Temporary
        };
    }

    // One pass over the SSA statements: record where each versioned variable is
    // defined and the set of statement IDs where it is used.
    private static DefUseChains BuildDefUseChains(List<SsaStatement> statements)
    {
        var definitions = new Dictionary<SsaVariable, int>();
        var uses = new Dictionary<SsaVariable, HashSet<int>>();

        foreach (var stmt in statements)
        {
            // Track definition.
            if (stmt.Destination is not null)
            {
                definitions[stmt.Destination] = stmt.Id;
            }

            // Track uses.
            foreach (var source in stmt.Sources)
            {
                if (!uses.TryGetValue(source, out var useSet))
                {
                    useSet = [];
                    uses[source] = useSet;
                }
                useSet.Add(stmt.Id);
            }
        }

        return new DefUseChains(
            definitions.ToImmutableDictionary(),
            uses.ToImmutableDictionary(
                kvp => kvp.Key,
                kvp => kvp.Value.ToImmutableHashSet()));
    }
}
|
||||
@@ -0,0 +1,309 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// A semantic fingerprint for a function, used for similarity matching.
/// </summary>
/// <remarks>
/// The hash members are <c>byte[]</c>, so the record's generated equality compares
/// them by reference; use <see cref="HashEquals"/> for content-based comparison.
/// </remarks>
/// <param name="FunctionName">Name of the source function.</param>
/// <param name="Address">Start address of the function.</param>
/// <param name="GraphHash">SHA-256 hash of the canonical semantic graph.</param>
/// <param name="OperationHash">Hash of the operation sequence.</param>
/// <param name="DataFlowHash">Hash of data dependency patterns.</param>
/// <param name="NodeCount">Number of nodes in the semantic graph.</param>
/// <param name="EdgeCount">Number of edges in the semantic graph.</param>
/// <param name="CyclomaticComplexity">McCabe cyclomatic complexity.</param>
/// <param name="ApiCalls">External API/function calls (semantic anchors).</param>
/// <param name="Algorithm">Algorithm used to generate this fingerprint.</param>
/// <param name="Metadata">Additional algorithm-specific metadata (optional).</param>
public sealed record SemanticFingerprint(
    string FunctionName,
    ulong Address,
    byte[] GraphHash,
    byte[] OperationHash,
    byte[] DataFlowHash,
    int NodeCount,
    int EdgeCount,
    int CyclomaticComplexity,
    ImmutableArray<string> ApiCalls,
    SemanticFingerprintAlgorithm Algorithm,
    ImmutableDictionary<string, object>? Metadata = null)
{
    /// <summary>
    /// Gets the graph hash as a hexadecimal string.
    /// </summary>
    public string GraphHashHex => Convert.ToHexString(GraphHash);

    /// <summary>
    /// Gets the operation hash as a hexadecimal string.
    /// </summary>
    public string OperationHashHex => Convert.ToHexString(OperationHash);

    /// <summary>
    /// Gets the data flow hash as a hexadecimal string.
    /// </summary>
    public string DataFlowHashHex => Convert.ToHexString(DataFlowHash);

    /// <summary>
    /// Checks if this fingerprint equals another by comparing the three hash
    /// payloads byte-for-byte. Counts, API calls, and metadata are ignored.
    /// </summary>
    public bool HashEquals(SemanticFingerprint other) =>
        GraphHash.AsSpan().SequenceEqual(other.GraphHash.AsSpan()) &&
        OperationHash.AsSpan().SequenceEqual(other.OperationHash.AsSpan()) &&
        DataFlowHash.AsSpan().SequenceEqual(other.DataFlowHash.AsSpan());
}
|
||||
|
||||
/// <summary>
/// Algorithm used for semantic fingerprint generation.
/// </summary>
public enum SemanticFingerprintAlgorithm
{
    /// <summary>Unknown algorithm (default value).</summary>
    Unknown = 0,

    /// <summary>Key-Semantics Graph v1 with Weisfeiler-Lehman hashing.</summary>
    KsgWeisfeilerLehmanV1,

    /// <summary>Pure Weisfeiler-Lehman graph hashing.</summary>
    WeisfeilerLehman,

    /// <summary>Graphlet counting-based similarity.</summary>
    GraphletCounting,

    /// <summary>Random walk-based fingerprint.</summary>
    RandomWalk,

    /// <summary>SimHash for approximate similarity.</summary>
    SimHash
}
|
||||
|
||||
/// <summary>
/// Options for semantic fingerprint generation.
/// </summary>
public sealed record SemanticFingerprintOptions
{
    /// <summary>
    /// Default fingerprint generation options.
    /// </summary>
    public static SemanticFingerprintOptions Default { get; } = new();

    /// <summary>
    /// Algorithm to use for fingerprint generation.
    /// </summary>
    public SemanticFingerprintAlgorithm Algorithm { get; init; } = SemanticFingerprintAlgorithm.KsgWeisfeilerLehmanV1;

    /// <summary>
    /// Number of Weisfeiler-Lehman iterations.
    /// NOTE(review): not validated here; the WL hasher itself rejects values below 1.
    /// </summary>
    public int WlIterations { get; init; } = 3;

    /// <summary>
    /// Whether to include API call hashes in the fingerprint.
    /// </summary>
    public bool IncludeApiCalls { get; init; } = true;

    /// <summary>
    /// Whether to compute separate data flow hash.
    /// </summary>
    public bool ComputeDataFlowHash { get; init; } = true;

    /// <summary>
    /// Hash algorithm (SHA256, SHA384, SHA512).
    /// NOTE(review): free-form string — confirm consumers validate unsupported values.
    /// </summary>
    public string HashAlgorithm { get; init; } = "SHA256";
}
|
||||
|
||||
/// <summary>
/// Result of semantic similarity matching between two functions.
/// Similarity scores are in the range 0.0 to 1.0.
/// </summary>
/// <param name="FunctionA">Name of the first function.</param>
/// <param name="FunctionB">Name of the second function.</param>
/// <param name="OverallSimilarity">Overall similarity score (0.0 to 1.0).</param>
/// <param name="GraphSimilarity">Graph structure similarity.</param>
/// <param name="DataFlowSimilarity">Data flow pattern similarity.</param>
/// <param name="ApiCallSimilarity">API call pattern similarity.</param>
/// <param name="Confidence">Confidence level of the match.</param>
/// <param name="Deltas">Detected differences between functions.</param>
/// <param name="MatchDetails">Additional match details (optional).</param>
public sealed record SemanticMatchResult(
    string FunctionA,
    string FunctionB,
    decimal OverallSimilarity,
    decimal GraphSimilarity,
    decimal DataFlowSimilarity,
    decimal ApiCallSimilarity,
    MatchConfidence Confidence,
    ImmutableArray<MatchDelta> Deltas,
    ImmutableDictionary<string, object>? MatchDetails = null);
|
||||
|
||||
/// <summary>
/// Confidence level for a semantic match.
/// Members are declared from highest to lowest confidence (VeryHigh = 0).
/// </summary>
public enum MatchConfidence
{
    /// <summary>Very high confidence: highly likely the same function.</summary>
    VeryHigh,

    /// <summary>High confidence: likely the same function with minor changes.</summary>
    High,

    /// <summary>Medium confidence: possibly related functions.</summary>
    Medium,

    /// <summary>Low confidence: weak similarity detected.</summary>
    Low,

    /// <summary>Very low confidence: minimal similarity.</summary>
    VeryLow
}
|
||||
|
||||
/// <summary>
/// A detected difference between matched functions.
/// </summary>
/// <param name="Type">Type of the delta.</param>
/// <param name="Description">Human-readable description.</param>
/// <param name="Impact">Impact on similarity score (0.0 to 1.0).</param>
/// <param name="LocationA">Location in function A (if applicable); free-form — format not enforced here.</param>
/// <param name="LocationB">Location in function B (if applicable); free-form — format not enforced here.</param>
public sealed record MatchDelta(
    DeltaType Type,
    string Description,
    decimal Impact,
    string? LocationA = null,
    string? LocationB = null);
|
||||
|
||||
/// <summary>
|
||||
/// Type of difference between matched functions.
|
||||
/// </summary>
|
||||
public enum DeltaType
|
||||
{
|
||||
/// <summary>Unknown delta type.</summary>
|
||||
Unknown = 0,
|
||||
|
||||
/// <summary>Node added in target function.</summary>
|
||||
NodeAdded,
|
||||
|
||||
/// <summary>Node removed from source function.</summary>
|
||||
NodeRemoved,
|
||||
|
||||
/// <summary>Node modified between functions.</summary>
|
||||
NodeModified,
|
||||
|
||||
/// <summary>Edge added in target function.</summary>
|
||||
EdgeAdded,
|
||||
|
||||
/// <summary>Edge removed from source function.</summary>
|
||||
EdgeRemoved,
|
||||
|
||||
/// <summary>Operation changed (same structure, different operation).</summary>
|
||||
OperationChanged,
|
||||
|
||||
/// <summary>API call added.</summary>
|
||||
ApiCallAdded,
|
||||
|
||||
/// <summary>API call removed.</summary>
|
||||
ApiCallRemoved,
|
||||
|
||||
/// <summary>Control flow structure changed.</summary>
|
||||
ControlFlowChanged,
|
||||
|
||||
/// <summary>Data flow pattern changed.</summary>
|
||||
DataFlowChanged,
|
||||
|
||||
/// <summary>Constant value changed.</summary>
|
||||
ConstantChanged
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for semantic matching.
|
||||
/// </summary>
|
||||
public sealed record MatchOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Default matching options.
|
||||
/// </summary>
|
||||
public static MatchOptions Default { get; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Minimum similarity threshold to consider a match.
|
||||
/// </summary>
|
||||
public decimal MinSimilarity { get; init; } = 0.5m;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for graph structure similarity.
|
||||
/// </summary>
|
||||
public decimal GraphWeight { get; init; } = 0.4m;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for data flow similarity.
|
||||
/// </summary>
|
||||
public decimal DataFlowWeight { get; init; } = 0.3m;
|
||||
|
||||
/// <summary>
|
||||
/// Weight for API call similarity.
|
||||
/// </summary>
|
||||
public decimal ApiCallWeight { get; init; } = 0.3m;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to compute detailed deltas (slower but more informative).
|
||||
/// </summary>
|
||||
public bool ComputeDeltas { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum number of deltas to report.
|
||||
/// </summary>
|
||||
public int MaxDeltas { get; init; } = 100;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for lifting instructions to IR.
|
||||
/// </summary>
|
||||
public sealed record LiftOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Default lifting options.
|
||||
/// </summary>
|
||||
public static LiftOptions Default { get; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Whether to recover control flow graph.
|
||||
/// </summary>
|
||||
public bool RecoverCfg { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to transform to SSA form.
|
||||
/// </summary>
|
||||
public bool TransformToSsa { get; init; } = false;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to simplify IR (constant folding, dead code elimination).
|
||||
/// </summary>
|
||||
public bool SimplifyIr { get; init; } = false;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum instructions to lift (0 = unlimited).
|
||||
/// </summary>
|
||||
public int MaxInstructions { get; init; } = 100000;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A corpus match result when searching against a function corpus.
|
||||
/// </summary>
|
||||
/// <param name="QueryFunction">The query function name.</param>
|
||||
/// <param name="MatchedFunction">The matched function from corpus.</param>
|
||||
/// <param name="MatchedLibrary">Library containing the matched function.</param>
|
||||
/// <param name="MatchedVersion">Library version.</param>
|
||||
/// <param name="Similarity">Similarity score.</param>
|
||||
/// <param name="Confidence">Match confidence.</param>
|
||||
/// <param name="Rank">Rank in result set.</param>
|
||||
public sealed record CorpusMatchResult(
|
||||
string QueryFunction,
|
||||
string MatchedFunction,
|
||||
string MatchedLibrary,
|
||||
string MatchedVersion,
|
||||
decimal Similarity,
|
||||
MatchConfidence Confidence,
|
||||
int Rank);
|
||||
@@ -0,0 +1,261 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
|
||||
/// A key-semantics graph capturing the semantic structure of a function.
|
||||
/// Abstracts away syntactic details to represent computation, data flow, and control flow.
|
||||
/// </summary>
|
||||
/// <param name="FunctionName">Name of the source function.</param>
|
||||
/// <param name="Nodes">Semantic nodes in the graph.</param>
|
||||
/// <param name="Edges">Semantic edges connecting nodes.</param>
|
||||
/// <param name="Properties">Computed graph properties.</param>
|
||||
public sealed record KeySemanticsGraph(
|
||||
string FunctionName,
|
||||
ImmutableArray<SemanticNode> Nodes,
|
||||
ImmutableArray<SemanticEdge> Edges,
|
||||
GraphProperties Properties);
|
||||
|
||||
/// <summary>
|
||||
/// A node in the key-semantics graph representing a semantic operation.
|
||||
/// </summary>
|
||||
/// <param name="Id">Unique node ID within the graph.</param>
|
||||
/// <param name="Type">Node type classification.</param>
|
||||
/// <param name="Operation">Operation name (e.g., add, mul, cmp, call).</param>
|
||||
/// <param name="Operands">Operand descriptors (normalized).</param>
|
||||
/// <param name="Attributes">Additional attributes for matching.</param>
|
||||
public sealed record SemanticNode(
|
||||
int Id,
|
||||
SemanticNodeType Type,
|
||||
string Operation,
|
||||
ImmutableArray<string> Operands,
|
||||
ImmutableDictionary<string, string>? Attributes = null);
|
||||
|
||||
/// <summary>
|
||||
/// Type of semantic node.
|
||||
/// </summary>
|
||||
public enum SemanticNodeType
|
||||
{
|
||||
/// <summary>Unknown node type.</summary>
|
||||
Unknown = 0,
|
||||
|
||||
/// <summary>Computation: arithmetic, logic, comparison operations.</summary>
|
||||
Compute,
|
||||
|
||||
/// <summary>Memory load operation.</summary>
|
||||
Load,
|
||||
|
||||
/// <summary>Memory store operation.</summary>
|
||||
Store,
|
||||
|
||||
/// <summary>Conditional branch.</summary>
|
||||
Branch,
|
||||
|
||||
/// <summary>Function/procedure call.</summary>
|
||||
Call,
|
||||
|
||||
/// <summary>Function return.</summary>
|
||||
Return,
|
||||
|
||||
/// <summary>PHI node (SSA merge point).</summary>
|
||||
Phi,
|
||||
|
||||
/// <summary>Constant value.</summary>
|
||||
Constant,
|
||||
|
||||
/// <summary>Input parameter.</summary>
|
||||
Parameter,
|
||||
|
||||
/// <summary>Address computation.</summary>
|
||||
Address,
|
||||
|
||||
/// <summary>Type cast/conversion.</summary>
|
||||
Cast,
|
||||
|
||||
/// <summary>String reference.</summary>
|
||||
StringRef,
|
||||
|
||||
/// <summary>External symbol reference.</summary>
|
||||
ExternalRef
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// An edge in the key-semantics graph.
|
||||
/// </summary>
|
||||
/// <param name="SourceId">Source node ID.</param>
|
||||
/// <param name="TargetId">Target node ID.</param>
|
||||
/// <param name="Type">Edge type.</param>
|
||||
/// <param name="Label">Optional edge label for additional context.</param>
|
||||
public sealed record SemanticEdge(
|
||||
int SourceId,
|
||||
int TargetId,
|
||||
SemanticEdgeType Type,
|
||||
string? Label = null);
|
||||
|
||||
/// <summary>
|
||||
/// Type of semantic edge.
|
||||
/// </summary>
|
||||
public enum SemanticEdgeType
|
||||
{
|
||||
/// <summary>Unknown edge type.</summary>
|
||||
Unknown = 0,
|
||||
|
||||
/// <summary>Data dependency: source produces value consumed by target.</summary>
|
||||
DataDependency,
|
||||
|
||||
/// <summary>Control dependency: target execution depends on source branch.</summary>
|
||||
ControlDependency,
|
||||
|
||||
/// <summary>Memory dependency: target depends on memory state from source.</summary>
|
||||
MemoryDependency,
|
||||
|
||||
/// <summary>Call edge: source calls target function.</summary>
|
||||
CallEdge,
|
||||
|
||||
/// <summary>Return edge: source returns to target.</summary>
|
||||
ReturnEdge,
|
||||
|
||||
/// <summary>Address-of: source computes address used by target.</summary>
|
||||
AddressOf,
|
||||
|
||||
/// <summary>Phi input: source is an input to a PHI node.</summary>
|
||||
PhiInput
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computed properties of a semantic graph.
|
||||
/// </summary>
|
||||
/// <param name="NodeCount">Total number of nodes.</param>
|
||||
/// <param name="EdgeCount">Total number of edges.</param>
|
||||
/// <param name="CyclomaticComplexity">McCabe cyclomatic complexity.</param>
|
||||
/// <param name="MaxDepth">Maximum path depth.</param>
|
||||
/// <param name="NodeTypeCounts">Count of each node type.</param>
|
||||
/// <param name="EdgeTypeCounts">Count of each edge type.</param>
|
||||
/// <param name="LoopCount">Number of detected loops.</param>
|
||||
/// <param name="BranchCount">Number of branch points.</param>
|
||||
public sealed record GraphProperties(
|
||||
int NodeCount,
|
||||
int EdgeCount,
|
||||
int CyclomaticComplexity,
|
||||
int MaxDepth,
|
||||
ImmutableDictionary<SemanticNodeType, int> NodeTypeCounts,
|
||||
ImmutableDictionary<SemanticEdgeType, int> EdgeTypeCounts,
|
||||
int LoopCount,
|
||||
int BranchCount);
|
||||
|
||||
/// <summary>
|
||||
/// Options for semantic graph extraction.
|
||||
/// </summary>
|
||||
public sealed record GraphExtractionOptions
|
||||
{
|
||||
/// <summary>
|
||||
/// Default extraction options.
|
||||
/// </summary>
|
||||
public static GraphExtractionOptions Default { get; } = new();
|
||||
|
||||
/// <summary>
|
||||
/// Whether to include constant nodes.
|
||||
/// </summary>
|
||||
public bool IncludeConstants { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to include NOP operations.
|
||||
/// </summary>
|
||||
public bool IncludeNops { get; init; } = false;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to extract control dependencies.
|
||||
/// </summary>
|
||||
public bool ExtractControlDependencies { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to extract memory dependencies.
|
||||
/// </summary>
|
||||
public bool ExtractMemoryDependencies { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Maximum nodes before truncation (0 = unlimited).
|
||||
/// </summary>
|
||||
public int MaxNodes { get; init; } = 10000;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to normalize operation names to a canonical form.
|
||||
/// </summary>
|
||||
public bool NormalizeOperations { get; init; } = true;
|
||||
|
||||
/// <summary>
|
||||
/// Whether to merge equivalent constant nodes.
|
||||
/// </summary>
|
||||
public bool MergeConstants { get; init; } = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Result of graph canonicalization.
|
||||
/// </summary>
|
||||
/// <param name="Graph">The canonicalized graph.</param>
|
||||
/// <param name="NodeMapping">Mapping from original node IDs to canonical IDs.</param>
|
||||
/// <param name="CanonicalLabels">Canonical labels for each node.</param>
|
||||
public sealed record CanonicalGraph(
|
||||
KeySemanticsGraph Graph,
|
||||
ImmutableDictionary<int, int> NodeMapping,
|
||||
ImmutableArray<string> CanonicalLabels);
|
||||
|
||||
/// <summary>
|
||||
/// A subgraph pattern for matching.
|
||||
/// </summary>
|
||||
/// <param name="PatternId">Unique pattern identifier.</param>
|
||||
/// <param name="Name">Pattern name (e.g., "loop_counter", "memcpy_pattern").</param>
|
||||
/// <param name="Nodes">Pattern nodes.</param>
|
||||
/// <param name="Edges">Pattern edges.</param>
|
||||
public sealed record GraphPattern(
|
||||
string PatternId,
|
||||
string Name,
|
||||
ImmutableArray<PatternNode> Nodes,
|
||||
ImmutableArray<PatternEdge> Edges);
|
||||
|
||||
/// <summary>
|
||||
/// A node in a graph pattern (with wildcards).
|
||||
/// </summary>
|
||||
/// <param name="Id">Node ID within pattern.</param>
|
||||
/// <param name="TypeConstraint">Required node type (null = any).</param>
|
||||
/// <param name="OperationPattern">Operation pattern (null = any, supports wildcards).</param>
|
||||
/// <param name="IsCapture">Whether this node should be captured in match results.</param>
|
||||
/// <param name="CaptureName">Name for captured node.</param>
|
||||
public sealed record PatternNode(
|
||||
int Id,
|
||||
SemanticNodeType? TypeConstraint,
|
||||
string? OperationPattern,
|
||||
bool IsCapture = false,
|
||||
string? CaptureName = null);
|
||||
|
||||
/// <summary>
|
||||
/// An edge in a graph pattern.
|
||||
/// </summary>
|
||||
/// <param name="SourceId">Source node ID in pattern.</param>
|
||||
/// <param name="TargetId">Target node ID in pattern.</param>
|
||||
/// <param name="TypeConstraint">Required edge type (null = any).</param>
|
||||
public sealed record PatternEdge(
|
||||
int SourceId,
|
||||
int TargetId,
|
||||
SemanticEdgeType? TypeConstraint);
|
||||
|
||||
/// <summary>
|
||||
/// Result of pattern matching against a graph.
|
||||
/// </summary>
|
||||
/// <param name="Pattern">The matched pattern.</param>
|
||||
/// <param name="Matches">All matches found.</param>
|
||||
public sealed record PatternMatchResult(
|
||||
GraphPattern Pattern,
|
||||
ImmutableArray<PatternMatch> Matches);
|
||||
|
||||
/// <summary>
|
||||
/// A single pattern match instance.
|
||||
/// </summary>
|
||||
/// <param name="NodeBindings">Mapping from pattern node IDs to graph node IDs.</param>
|
||||
/// <param name="Captures">Named captures from the match.</param>
|
||||
public sealed record PatternMatch(
|
||||
ImmutableDictionary<int, int> NodeBindings,
|
||||
ImmutableDictionary<string, SemanticNode> Captures);
|
||||
@@ -0,0 +1,318 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
|
||||
/// A function lifted to intermediate representation.
|
||||
/// </summary>
|
||||
/// <param name="Name">Function name (may be empty for unnamed functions).</param>
|
||||
/// <param name="Address">Start address of the function.</param>
|
||||
/// <param name="Statements">IR statements comprising the function body.</param>
|
||||
/// <param name="BasicBlocks">Basic blocks in the function.</param>
|
||||
/// <param name="Cfg">Control flow graph.</param>
|
||||
public sealed record LiftedFunction(
|
||||
string Name,
|
||||
ulong Address,
|
||||
ImmutableArray<IrStatement> Statements,
|
||||
ImmutableArray<IrBasicBlock> BasicBlocks,
|
||||
ControlFlowGraph Cfg);
|
||||
|
||||
/// <summary>
|
||||
/// A function transformed to Static Single Assignment (SSA) form.
|
||||
/// </summary>
|
||||
/// <param name="Name">Function name.</param>
|
||||
/// <param name="Address">Start address of the function.</param>
|
||||
/// <param name="Statements">SSA statements comprising the function body.</param>
|
||||
/// <param name="BasicBlocks">SSA basic blocks in the function.</param>
|
||||
/// <param name="DefUse">Definition-use chains for dataflow analysis.</param>
|
||||
public sealed record SsaFunction(
|
||||
string Name,
|
||||
ulong Address,
|
||||
ImmutableArray<SsaStatement> Statements,
|
||||
ImmutableArray<SsaBasicBlock> BasicBlocks,
|
||||
DefUseChains DefUse);
|
||||
|
||||
/// <summary>
|
||||
/// An intermediate representation statement.
|
||||
/// </summary>
|
||||
/// <param name="Id">Unique statement ID within the function.</param>
|
||||
/// <param name="Address">Original instruction address.</param>
|
||||
/// <param name="Kind">Statement kind.</param>
|
||||
/// <param name="Operation">Operation name (e.g., add, sub, load).</param>
|
||||
/// <param name="Destination">Destination operand (if any).</param>
|
||||
/// <param name="Sources">Source operands.</param>
|
||||
/// <param name="Metadata">Additional metadata.</param>
|
||||
public sealed record IrStatement(
|
||||
int Id,
|
||||
ulong Address,
|
||||
IrStatementKind Kind,
|
||||
string Operation,
|
||||
IrOperand? Destination,
|
||||
ImmutableArray<IrOperand> Sources,
|
||||
ImmutableDictionary<string, object>? Metadata = null);
|
||||
|
||||
/// <summary>
|
||||
/// Kind of IR statement.
|
||||
/// </summary>
|
||||
public enum IrStatementKind
|
||||
{
|
||||
/// <summary>Unknown statement kind.</summary>
|
||||
Unknown = 0,
|
||||
|
||||
/// <summary>Assignment: dest = expr.</summary>
|
||||
Assign,
|
||||
|
||||
/// <summary>Binary operation: dest = src1 op src2.</summary>
|
||||
BinaryOp,
|
||||
|
||||
/// <summary>Unary operation: dest = op src.</summary>
|
||||
UnaryOp,
|
||||
|
||||
/// <summary>Memory load: dest = [addr].</summary>
|
||||
Load,
|
||||
|
||||
/// <summary>Memory store: [addr] = src.</summary>
|
||||
Store,
|
||||
|
||||
/// <summary>Unconditional jump.</summary>
|
||||
Jump,
|
||||
|
||||
/// <summary>Conditional jump.</summary>
|
||||
ConditionalJump,
|
||||
|
||||
/// <summary>Function call.</summary>
|
||||
Call,
|
||||
|
||||
/// <summary>Function return.</summary>
|
||||
Return,
|
||||
|
||||
/// <summary>No operation.</summary>
|
||||
Nop,
|
||||
|
||||
/// <summary>PHI node (for SSA form).</summary>
|
||||
Phi,
|
||||
|
||||
/// <summary>System call.</summary>
|
||||
Syscall,
|
||||
|
||||
/// <summary>Interrupt.</summary>
|
||||
Interrupt,
|
||||
|
||||
/// <summary>Cast/type conversion.</summary>
|
||||
Cast,
|
||||
|
||||
/// <summary>Comparison.</summary>
|
||||
Compare,
|
||||
|
||||
/// <summary>Sign/zero extension.</summary>
|
||||
Extend
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// An operand in an IR statement.
|
||||
/// </summary>
|
||||
/// <param name="Kind">Operand kind.</param>
|
||||
/// <param name="Name">Name (for temporaries and registers).</param>
|
||||
/// <param name="Value">Constant value (for immediates).</param>
|
||||
/// <param name="BitSize">Size in bits.</param>
|
||||
/// <param name="IsMemory">Whether this is a memory reference.</param>
|
||||
public sealed record IrOperand(
|
||||
IrOperandKind Kind,
|
||||
string? Name,
|
||||
long? Value,
|
||||
int BitSize,
|
||||
bool IsMemory = false);
|
||||
|
||||
/// <summary>
|
||||
/// Kind of IR operand.
|
||||
/// </summary>
|
||||
public enum IrOperandKind
|
||||
{
|
||||
/// <summary>Unknown operand kind.</summary>
|
||||
Unknown = 0,
|
||||
|
||||
/// <summary>CPU register.</summary>
|
||||
Register,
|
||||
|
||||
/// <summary>IR temporary variable.</summary>
|
||||
Temporary,
|
||||
|
||||
/// <summary>Immediate constant value.</summary>
|
||||
Immediate,
|
||||
|
||||
/// <summary>Memory address.</summary>
|
||||
Memory,
|
||||
|
||||
/// <summary>Program counter / instruction pointer.</summary>
|
||||
ProgramCounter,
|
||||
|
||||
/// <summary>Stack pointer.</summary>
|
||||
StackPointer,
|
||||
|
||||
/// <summary>Base pointer / frame pointer.</summary>
|
||||
FramePointer,
|
||||
|
||||
/// <summary>Flags/condition register.</summary>
|
||||
Flags,
|
||||
|
||||
/// <summary>Undefined value (for SSA).</summary>
|
||||
Undefined,
|
||||
|
||||
/// <summary>Label / address reference.</summary>
|
||||
Label
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A basic block in the intermediate representation.
|
||||
/// </summary>
|
||||
/// <param name="Id">Unique block ID within the function.</param>
|
||||
/// <param name="Label">Block label/name.</param>
|
||||
/// <param name="StartAddress">Start address of the block.</param>
|
||||
/// <param name="EndAddress">End address of the block (exclusive).</param>
|
||||
/// <param name="StatementIds">IDs of statements in this block.</param>
|
||||
/// <param name="Predecessors">IDs of predecessor blocks.</param>
|
||||
/// <param name="Successors">IDs of successor blocks.</param>
|
||||
public sealed record IrBasicBlock(
|
||||
int Id,
|
||||
string Label,
|
||||
ulong StartAddress,
|
||||
ulong EndAddress,
|
||||
ImmutableArray<int> StatementIds,
|
||||
ImmutableArray<int> Predecessors,
|
||||
ImmutableArray<int> Successors);
|
||||
|
||||
/// <summary>
|
||||
/// Control flow graph for a function.
|
||||
/// </summary>
|
||||
/// <param name="EntryBlockId">ID of the entry block.</param>
|
||||
/// <param name="ExitBlockIds">IDs of exit blocks.</param>
|
||||
/// <param name="Edges">CFG edges.</param>
|
||||
public sealed record ControlFlowGraph(
|
||||
int EntryBlockId,
|
||||
ImmutableArray<int> ExitBlockIds,
|
||||
ImmutableArray<CfgEdge> Edges);
|
||||
|
||||
/// <summary>
|
||||
/// An edge in the control flow graph.
|
||||
/// </summary>
|
||||
/// <param name="SourceBlockId">Source block ID.</param>
|
||||
/// <param name="TargetBlockId">Target block ID.</param>
|
||||
/// <param name="Kind">Edge kind.</param>
|
||||
/// <param name="Condition">Condition for conditional edges.</param>
|
||||
public sealed record CfgEdge(
|
||||
int SourceBlockId,
|
||||
int TargetBlockId,
|
||||
CfgEdgeKind Kind,
|
||||
string? Condition = null);
|
||||
|
||||
/// <summary>
|
||||
/// Kind of CFG edge.
|
||||
/// </summary>
|
||||
public enum CfgEdgeKind
|
||||
{
|
||||
/// <summary>Sequential fall-through.</summary>
|
||||
FallThrough,
|
||||
|
||||
/// <summary>Unconditional jump.</summary>
|
||||
Jump,
|
||||
|
||||
/// <summary>Conditional branch taken.</summary>
|
||||
ConditionalTrue,
|
||||
|
||||
/// <summary>Conditional branch not taken.</summary>
|
||||
ConditionalFalse,
|
||||
|
||||
/// <summary>Function call edge.</summary>
|
||||
Call,
|
||||
|
||||
/// <summary>Function return edge.</summary>
|
||||
Return,
|
||||
|
||||
/// <summary>Indirect jump (computed target).</summary>
|
||||
Indirect,
|
||||
|
||||
/// <summary>Exception/interrupt edge.</summary>
|
||||
Exception
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// An SSA statement with versioned variables.
|
||||
/// </summary>
|
||||
/// <param name="Id">Unique statement ID within the function.</param>
|
||||
/// <param name="Address">Original instruction address.</param>
|
||||
/// <param name="Kind">Statement kind.</param>
|
||||
/// <param name="Operation">Operation name.</param>
|
||||
/// <param name="Destination">Destination SSA variable (if any).</param>
|
||||
/// <param name="Sources">Source SSA variables.</param>
|
||||
/// <param name="PhiSources">For PHI nodes: mapping from predecessor block to variable version.</param>
|
||||
public sealed record SsaStatement(
|
||||
int Id,
|
||||
ulong Address,
|
||||
IrStatementKind Kind,
|
||||
string Operation,
|
||||
SsaVariable? Destination,
|
||||
ImmutableArray<SsaVariable> Sources,
|
||||
ImmutableDictionary<int, SsaVariable>? PhiSources = null);
|
||||
|
||||
/// <summary>
|
||||
/// An SSA variable (versioned).
|
||||
/// </summary>
|
||||
/// <param name="BaseName">Original variable/register name.</param>
|
||||
/// <param name="Version">SSA version number.</param>
|
||||
/// <param name="BitSize">Size in bits.</param>
|
||||
/// <param name="Kind">Variable kind.</param>
|
||||
public sealed record SsaVariable(
|
||||
string BaseName,
|
||||
int Version,
|
||||
int BitSize,
|
||||
SsaVariableKind Kind);
|
||||
|
||||
/// <summary>
|
||||
/// Kind of SSA variable.
|
||||
/// </summary>
|
||||
public enum SsaVariableKind
|
||||
{
|
||||
/// <summary>CPU register.</summary>
|
||||
Register,
|
||||
|
||||
/// <summary>IR temporary.</summary>
|
||||
Temporary,
|
||||
|
||||
/// <summary>Memory location.</summary>
|
||||
Memory,
|
||||
|
||||
/// <summary>Immediate constant.</summary>
|
||||
Constant,
|
||||
|
||||
/// <summary>PHI result.</summary>
|
||||
Phi
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// An SSA basic block.
|
||||
/// </summary>
|
||||
/// <param name="Id">Unique block ID.</param>
|
||||
/// <param name="Label">Block label.</param>
|
||||
/// <param name="PhiNodes">PHI nodes at block entry.</param>
|
||||
/// <param name="Statements">Non-PHI statements.</param>
|
||||
/// <param name="Predecessors">Predecessor block IDs.</param>
|
||||
/// <param name="Successors">Successor block IDs.</param>
|
||||
public sealed record SsaBasicBlock(
|
||||
int Id,
|
||||
string Label,
|
||||
ImmutableArray<SsaStatement> PhiNodes,
|
||||
ImmutableArray<SsaStatement> Statements,
|
||||
ImmutableArray<int> Predecessors,
|
||||
ImmutableArray<int> Successors);
|
||||
|
||||
/// <summary>
|
||||
/// Definition-use chains for SSA form.
|
||||
/// </summary>
|
||||
/// <param name="Definitions">Maps variable to its defining statement.</param>
|
||||
/// <param name="Uses">Maps variable to statements that use it.</param>
|
||||
public sealed record DefUseChains(
|
||||
ImmutableDictionary<SsaVariable, int> Definitions,
|
||||
ImmutableDictionary<SsaVariable, ImmutableHashSet<int>> Uses);
|
||||
@@ -0,0 +1,184 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Semantic.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of semantic fingerprint generation.
|
||||
/// </summary>
|
||||
public sealed class SemanticFingerprintGenerator : ISemanticFingerprintGenerator
|
||||
{
|
||||
private readonly ILogger<SemanticFingerprintGenerator> _logger;
|
||||
private readonly WeisfeilerLehmanHasher _wlHasher;
|
||||
private readonly GraphCanonicalizer _canonicalizer;
|
||||
|
||||
/// <inheritdoc />
|
||||
public SemanticFingerprintAlgorithm Algorithm => SemanticFingerprintAlgorithm.KsgWeisfeilerLehmanV1;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new semantic fingerprint generator.
|
||||
/// </summary>
|
||||
/// <param name="logger">Logger instance.</param>
|
||||
public SemanticFingerprintGenerator(ILogger<SemanticFingerprintGenerator> logger)
|
||||
{
|
||||
_logger = logger ?? throw new ArgumentNullException(nameof(logger));
|
||||
_wlHasher = new WeisfeilerLehmanHasher(iterations: 3);
|
||||
_canonicalizer = new GraphCanonicalizer();
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public Task<SemanticFingerprint> GenerateAsync(
|
||||
KeySemanticsGraph graph,
|
||||
ulong address,
|
||||
SemanticFingerprintOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(graph);
|
||||
ct.ThrowIfCancellationRequested();
|
||||
|
||||
options ??= SemanticFingerprintOptions.Default;
|
||||
|
||||
_logger.LogDebug(
|
||||
"Generating semantic fingerprint for function {FunctionName} using {Algorithm}",
|
||||
graph.FunctionName,
|
||||
options.Algorithm);
|
||||
|
||||
// Compute graph hash using Weisfeiler-Lehman
|
||||
var graphHash = ComputeGraphHash(graph, options);
|
||||
|
||||
// Compute operation sequence hash
|
||||
var operationHash = ComputeOperationHash(graph);
|
||||
|
||||
// Compute data flow hash
|
||||
var dataFlowHash = options.ComputeDataFlowHash
|
||||
? ComputeDataFlowHash(graph)
|
||||
: new byte[32];
|
||||
|
||||
// Extract API calls
|
||||
var apiCalls = options.IncludeApiCalls
|
||||
? ExtractApiCalls(graph)
|
||||
: [];
|
||||
|
||||
var fingerprint = new SemanticFingerprint(
|
||||
graph.FunctionName,
|
||||
address,
|
||||
graphHash,
|
||||
operationHash,
|
||||
dataFlowHash,
|
||||
graph.Properties.NodeCount,
|
||||
graph.Properties.EdgeCount,
|
||||
graph.Properties.CyclomaticComplexity,
|
||||
apiCalls,
|
||||
options.Algorithm);
|
||||
|
||||
_logger.LogDebug(
|
||||
"Generated fingerprint for {FunctionName}: GraphHash={GraphHash}",
|
||||
graph.FunctionName,
|
||||
fingerprint.GraphHashHex[..16]);
|
||||
|
||||
return Task.FromResult(fingerprint);
|
||||
}
|
||||
|
||||
/// <inheritdoc />
|
||||
public async Task<SemanticFingerprint> GenerateFromFunctionAsync(
|
||||
LiftedFunction function,
|
||||
ISemanticGraphExtractor graphExtractor,
|
||||
SemanticFingerprintOptions? options = null,
|
||||
CancellationToken ct = default)
|
||||
{
|
||||
ArgumentNullException.ThrowIfNull(function);
|
||||
ArgumentNullException.ThrowIfNull(graphExtractor);
|
||||
|
||||
var graph = await graphExtractor.ExtractGraphAsync(function, ct: ct).ConfigureAwait(false);
|
||||
return await GenerateAsync(graph, function.Address, options, ct).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private byte[] ComputeGraphHash(KeySemanticsGraph graph, SemanticFingerprintOptions options)
|
||||
{
|
||||
if (graph.Nodes.IsEmpty)
|
||||
{
|
||||
return SHA256.HashData(Encoding.UTF8.GetBytes("EMPTY_GRAPH"));
|
||||
}
|
||||
|
||||
// Use Weisfeiler-Lehman hashing with configured iterations
|
||||
var hasher = new WeisfeilerLehmanHasher(options.WlIterations);
|
||||
return hasher.ComputeHash(graph);
|
||||
}
|
||||
|
||||
private static byte[] ComputeOperationHash(KeySemanticsGraph graph)
|
||||
{
|
||||
if (graph.Nodes.IsEmpty)
|
||||
{
|
||||
return SHA256.HashData(Encoding.UTF8.GetBytes("EMPTY_OPS"));
|
||||
}
|
||||
|
||||
// Create a sequence of operations ordered by node type then operation
|
||||
var operations = graph.Nodes
|
||||
.OrderBy(n => n.Type)
|
||||
.ThenBy(n => n.Operation, StringComparer.Ordinal)
|
||||
.Select(n => string.Create(
|
||||
CultureInfo.InvariantCulture,
|
||||
$"{(int)n.Type}:{n.Operation}"))
|
||||
.ToList();
|
||||
|
||||
var combined = string.Join("|", operations);
|
||||
return SHA256.HashData(Encoding.UTF8.GetBytes(combined));
|
||||
}
|
||||
|
||||
private static byte[] ComputeDataFlowHash(KeySemanticsGraph graph)
|
||||
{
|
||||
if (graph.Edges.IsEmpty)
|
||||
{
|
||||
return SHA256.HashData(Encoding.UTF8.GetBytes("EMPTY_DATAFLOW"));
|
||||
}
|
||||
|
||||
// Extract data dependency pattern
|
||||
var dataEdges = graph.Edges
|
||||
.Where(e => e.Type == SemanticEdgeType.DataDependency)
|
||||
.ToList();
|
||||
|
||||
if (dataEdges.Count == 0)
|
||||
{
|
||||
return SHA256.HashData(Encoding.UTF8.GetBytes("NO_DATAFLOW"));
|
||||
}
|
||||
|
||||
// Build a node lookup for edge descriptions
|
||||
var nodeMap = graph.Nodes.ToDictionary(n => n.Id);
|
||||
|
||||
// Create pattern string from data flow edges
|
||||
var patterns = dataEdges
|
||||
.OrderBy(e => e.SourceId)
|
||||
.ThenBy(e => e.TargetId)
|
||||
.Select(e =>
|
||||
{
|
||||
var srcOp = nodeMap.TryGetValue(e.SourceId, out var src) ? src.Operation : "?";
|
||||
var tgtOp = nodeMap.TryGetValue(e.TargetId, out var tgt) ? tgt.Operation : "?";
|
||||
return string.Create(CultureInfo.InvariantCulture, $"{srcOp}->{tgtOp}");
|
||||
})
|
||||
.ToList();
|
||||
|
||||
var combined = string.Join("|", patterns);
|
||||
return SHA256.HashData(Encoding.UTF8.GetBytes(combined));
|
||||
}
|
||||
|
||||
private static ImmutableArray<string> ExtractApiCalls(KeySemanticsGraph graph)
{
    // Collect distinct call-target operands in ordinal order. Register
    // operands carry an "R:" prefix from operand normalization and are
    // excluded, since an indirect call through a register names no API.
    var targets = new SortedSet<string>(StringComparer.Ordinal);

    foreach (var node in graph.Nodes)
    {
        if (node.Type != SemanticNodeType.Call)
        {
            continue;
        }

        foreach (var operand in node.Operands)
        {
            if (!string.IsNullOrEmpty(operand) &&
                !operand.StartsWith("R:", StringComparison.Ordinal))
            {
                targets.Add(operand);
            }
        }
    }

    return [.. targets];
}
|
||||
}
|
||||
@@ -0,0 +1,515 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using System.Globalization;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Semantic.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of semantic graph extraction from lifted IR.
|
||||
/// </summary>
|
||||
public sealed class SemanticGraphExtractor : ISemanticGraphExtractor
|
||||
{
|
||||
private readonly ILogger<SemanticGraphExtractor> _logger;
private readonly GraphCanonicalizer _canonicalizer;

/// <summary>
/// Creates a new semantic graph extractor.
/// </summary>
/// <param name="logger">Logger instance.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="logger"/> is null.</exception>
public SemanticGraphExtractor(ILogger<SemanticGraphExtractor> logger)
{
    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
    _canonicalizer = new GraphCanonicalizer();
}
|
||||
|
||||
/// <inheritdoc />
public Task<KeySemanticsGraph> ExtractGraphAsync(
    LiftedFunction function,
    GraphExtractionOptions? options = null,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(function);
    ct.ThrowIfCancellationRequested();

    var effectiveOptions = options ?? GraphExtractionOptions.Default;

    _logger.LogDebug(
        "Extracting semantic graph from function {FunctionName} with {StatementCount} statements",
        function.Name,
        function.Statements.Length);

    var nodes = new List<SemanticNode>();
    var edges = new List<SemanticEdge>();
    var definitions = new Dictionary<string, int>(); // operand key -> defining node id
    var nextNodeId = 0;

    foreach (var stmt in function.Statements)
    {
        ct.ThrowIfCancellationRequested();

        // Respect the configured node budget; log once and stop.
        if (effectiveOptions.MaxNodes > 0 && nextNodeId >= effectiveOptions.MaxNodes)
        {
            _logger.LogWarning(
                "Truncating graph at {MaxNodes} nodes for function {FunctionName}",
                effectiveOptions.MaxNodes,
                function.Name);
            break;
        }

        if (!effectiveOptions.IncludeNops && stmt.Kind == IrStatementKind.Nop)
        {
            continue;
        }

        // Statements without a semantic mapping yield no node.
        var node = CreateSemanticNode(ref nextNodeId, stmt, effectiveOptions);
        if (node is null)
        {
            continue;
        }

        nodes.Add(node);

        // Wire def -> use edges for this statement's source operands.
        // NOTE(review): gating data-dependency edges on the control/memory
        // dependency flags looks like it may have meant a dedicated
        // data-dependency option - confirm against GraphExtractionOptions.
        if (effectiveOptions.ExtractControlDependencies || effectiveOptions.ExtractMemoryDependencies)
        {
            AddDataDependencyEdges(stmt, node.Id, definitions, edges);
        }

        // Record this statement's destination so later uses can find it.
        if (stmt.Destination is not null)
        {
            var defKey = GetOperandKey(stmt.Destination);
            if (!string.IsNullOrEmpty(defKey))
            {
                definitions[defKey] = node.Id;
            }
        }
    }

    if (effectiveOptions.ExtractControlDependencies)
    {
        AddControlDependencyEdges(function.Cfg, function.BasicBlocks, nodes, edges);
    }

    var properties = ComputeProperties(nodes, edges, function.Cfg);

    var graph = new KeySemanticsGraph(
        function.Name,
        [.. nodes],
        [.. edges],
        properties);

    _logger.LogDebug(
        "Extracted graph with {NodeCount} nodes and {EdgeCount} edges for function {FunctionName}",
        graph.Properties.NodeCount,
        graph.Properties.EdgeCount,
        function.Name);

    return Task.FromResult(graph);
}
|
||||
|
||||
/// <inheritdoc />
public Task<KeySemanticsGraph> ExtractGraphFromSsaAsync(
    SsaFunction function,
    GraphExtractionOptions? options = null,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(function);
    ct.ThrowIfCancellationRequested();

    var effectiveOptions = options ?? GraphExtractionOptions.Default;

    _logger.LogDebug(
        "Extracting semantic graph from SSA function {FunctionName}",
        function.Name);

    var nodes = new List<SemanticNode>();
    var edges = new List<SemanticEdge>();
    var definitions = new Dictionary<string, int>(); // SSA variable key -> defining node id
    var nextNodeId = 0;

    foreach (var stmt in function.Statements)
    {
        ct.ThrowIfCancellationRequested();

        if (effectiveOptions.MaxNodes > 0 && nextNodeId >= effectiveOptions.MaxNodes)
        {
            break;
        }

        var node = CreateSemanticNodeFromSsa(ref nextNodeId, stmt, effectiveOptions);
        if (node is null)
        {
            continue;
        }

        nodes.Add(node);

        // SSA form makes def-use explicit: every versioned variable has a
        // single definition, so a dictionary lookup per source suffices.
        foreach (var source in stmt.Sources)
        {
            if (definitions.TryGetValue(GetSsaVariableKey(source), out var defNodeId))
            {
                edges.Add(new SemanticEdge(defNodeId, node.Id, SemanticEdgeType.DataDependency));
            }
        }

        if (stmt.Destination is not null)
        {
            definitions[GetSsaVariableKey(stmt.Destination)] = node.Id;
        }
    }

    // Derive a minimal CFG from the SSA block successors so graph
    // properties (complexity, loops) can still be computed.
    var cfg = BuildCfgFromSsaBlocks(function.BasicBlocks);
    var properties = ComputeProperties(nodes, edges, cfg);

    var graph = new KeySemanticsGraph(
        function.Name,
        [.. nodes],
        [.. edges],
        properties);

    return Task.FromResult(graph);
}
|
||||
|
||||
/// <inheritdoc />
public Task<CanonicalGraph> CanonicalizeAsync(
    KeySemanticsGraph graph,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(graph);
    ct.ThrowIfCancellationRequested();

    // Canonicalization is synchronous CPU work; compute and wrap the result.
    return Task.FromResult(_canonicalizer.Canonicalize(graph));
}
|
||||
|
||||
private static SemanticNode? CreateSemanticNode(
    ref int nodeId,
    IrStatement stmt,
    GraphExtractionOptions options)
{
    // Statements with no semantic mapping produce no node.
    var nodeType = MapStatementKindToNodeType(stmt.Kind);
    if (nodeType == SemanticNodeType.Unknown)
    {
        return null;
    }

    var operation = options.NormalizeOperations
        ? NormalizeOperation(stmt.Operation)
        : stmt.Operation;

    // Keep only operands that normalize to a non-empty name.
    var operands = ImmutableArray.CreateBuilder<string>();
    foreach (var source in stmt.Sources)
    {
        var name = GetNormalizedOperandName(source);
        if (!string.IsNullOrEmpty(name))
        {
            operands.Add(name);
        }
    }

    return new SemanticNode(nodeId++, nodeType, operation, operands.ToImmutable());
}
|
||||
|
||||
private static SemanticNode? CreateSemanticNodeFromSsa(
    ref int nodeId,
    SsaStatement stmt,
    GraphExtractionOptions options)
{
    // Statements with no semantic mapping produce no node.
    var nodeType = MapStatementKindToNodeType(stmt.Kind);
    if (nodeType == SemanticNodeType.Unknown)
    {
        return null;
    }

    var operation = options.NormalizeOperations
        ? NormalizeOperation(stmt.Operation)
        : stmt.Operation;

    // Operand names carry the SSA version ("base_version") so distinct
    // definitions of the same base variable remain distinguishable.
    var operands = stmt.Sources
        .Select(GetSsaVariableKey)
        .ToImmutableArray();

    return new SemanticNode(nodeId++, nodeType, operation, operands);
}
|
||||
|
||||
/// <summary>
/// Maps an IR statement kind onto the coarser semantic node taxonomy;
/// unmapped kinds become <see cref="SemanticNodeType.Unknown"/>.
/// </summary>
private static SemanticNodeType MapStatementKindToNodeType(IrStatementKind kind) => kind switch
{
    IrStatementKind.Assign or
    IrStatementKind.BinaryOp or
    IrStatementKind.UnaryOp or
    IrStatementKind.Compare => SemanticNodeType.Compute,
    IrStatementKind.Load => SemanticNodeType.Load,
    IrStatementKind.Store => SemanticNodeType.Store,
    IrStatementKind.Jump or
    IrStatementKind.ConditionalJump => SemanticNodeType.Branch,
    IrStatementKind.Call => SemanticNodeType.Call,
    IrStatementKind.Return => SemanticNodeType.Return,
    IrStatementKind.Phi => SemanticNodeType.Phi,
    IrStatementKind.Cast or
    IrStatementKind.Extend => SemanticNodeType.Cast,
    _ => SemanticNodeType.Unknown
};
|
||||
|
||||
/// <summary>
/// Maps an operation mnemonic to a canonical, architecture-neutral name so
/// equivalent operations (e.g. "IADD", "FADD", "ADD") hash identically.
/// Unrecognized operations are returned upper-cased unchanged.
/// </summary>
private static string NormalizeOperation(string operation)
{
    // Upper-case once; the previous version recomputed ToUpperInvariant()
    // in the fallback arm for every unmatched operation.
    var upper = operation.ToUpperInvariant();
    return upper switch
    {
        "ADD" or "IADD" or "FADD" => "ADD",
        "SUB" or "ISUB" or "FSUB" => "SUB",
        "MUL" or "IMUL" or "FMUL" => "MUL",
        "DIV" or "IDIV" or "FDIV" or "UDIV" => "DIV",
        "MOD" or "REM" or "UREM" or "SREM" => "MOD",
        "AND" or "IAND" => "AND",
        "OR" or "IOR" => "OR",
        "XOR" or "IXOR" => "XOR",
        "NOT" or "INOT" => "NOT",
        "NEG" or "INEG" or "FNEG" => "NEG",
        "SHL" or "ISHL" => "SHL",
        "SHR" or "ISHR" or "LSHR" or "ASHR" => "SHR",
        "CMP" or "ICMP" or "FCMP" => "CMP",
        "MOV" or "COPY" or "ASSIGN" => "MOV",
        "LOAD" or "LDR" or "LD" => "LOAD",
        "STORE" or "STR" or "ST" => "STORE",
        "CALL" or "INVOKE" => "CALL",
        "RET" or "RETURN" => "RET",
        "JMP" or "BR" or "GOTO" => "JMP",
        "JCC" or "BRC" or "CONDJMP" => "JCC",
        "ZEXT" or "SEXT" or "TRUNC" => "EXT",
        _ => upper
    };
}
|
||||
|
||||
private static string? GetNormalizedOperandName(IrOperand operand)
{
    // Registers/temporaries/immediates are prefixed so categories cannot
    // collide; memory operands collapse to "M" (addresses are not stable
    // across builds); labels keep their raw name so call targets survive
    // for API extraction. Other kinds yield no operand name.
    switch (operand.Kind)
    {
        case IrOperandKind.Register:
            return $"R:{operand.Name}";
        case IrOperandKind.Temporary:
            return $"T:{operand.Name}";
        case IrOperandKind.Immediate:
            return $"I:{operand.Value}";
        case IrOperandKind.Memory:
            return "M";
        case IrOperandKind.Label:
            return operand.Name;
        default:
            return null;
    }
}
|
||||
|
||||
private static string GetOperandKey(IrOperand operand)
{
    // Only registers and temporaries participate in def-use tracking;
    // all other operand kinds (immediates, memory, labels) yield no key.
    if (operand.Kind == IrOperandKind.Register)
    {
        return $"R:{operand.Name}";
    }

    if (operand.Kind == IrOperandKind.Temporary)
    {
        return $"T:{operand.Name}";
    }

    return string.Empty;
}
|
||||
|
||||
// Renders an SSA variable as "base_version", the canonical key used for
// def-use tracking and operand naming.
private static string GetSsaVariableKey(SsaVariable variable) =>
    string.Create(CultureInfo.InvariantCulture, $"{variable.BaseName}_{variable.Version}");
|
||||
|
||||
private static void AddDataDependencyEdges(
    IrStatement stmt,
    int targetNodeId,
    Dictionary<string, int> defMap,
    List<SemanticEdge> edges)
{
    // For every source operand with a known reaching definition, link
    // definition -> use with a data-dependency edge.
    foreach (var source in stmt.Sources)
    {
        var key = GetOperandKey(source);
        if (key.Length == 0)
        {
            continue;
        }

        if (defMap.TryGetValue(key, out var defNodeId))
        {
            edges.Add(new SemanticEdge(defNodeId, targetNodeId, SemanticEdgeType.DataDependency));
        }
    }
}
|
||||
|
||||
private static void AddControlDependencyEdges(
    ControlFlowGraph cfg,
    ImmutableArray<IrBasicBlock> blocks,
    List<SemanticNode> nodes,
    List<SemanticEdge> edges)
{
    // Heuristic control dependence: each branch is assumed to control the
    // next few non-branch nodes that follow it in program order. A precise
    // analysis would be post-dominator based.
    // NOTE(review): the cfg and blocks parameters are accepted but unused
    // by this simplified implementation - confirm that is intentional.
    const int DependentWindow = 5;

    foreach (var branch in nodes)
    {
        if (branch.Type != SemanticNodeType.Branch)
        {
            continue;
        }

        var taken = 0;
        foreach (var candidate in nodes)
        {
            if (candidate.Id <= branch.Id || candidate.Type == SemanticNodeType.Branch)
            {
                continue;
            }

            edges.Add(new SemanticEdge(branch.Id, candidate.Id, SemanticEdgeType.ControlDependency));

            if (++taken == DependentWindow)
            {
                break;
            }
        }
    }
}
|
||||
|
||||
private static ControlFlowGraph BuildCfgFromSsaBlocks(ImmutableArray<SsaBasicBlock> blocks)
{
    // Empty function: degenerate CFG with entry block 0 and no edges.
    if (blocks.IsEmpty)
    {
        return new ControlFlowGraph(0, [], []);
    }

    // Every successor link becomes a fall-through edge; the true edge kind
    // is not recoverable from SSA successor lists alone.
    var edges = blocks
        .SelectMany(
            block => block.Successors,
            (block, succ) => new CfgEdge(block.Id, succ, CfgEdgeKind.FallThrough))
        .ToImmutableArray();

    // Blocks without successors are treated as exits.
    var exits = blocks
        .Where(b => b.Successors.IsEmpty)
        .Select(b => b.Id)
        .ToImmutableArray();

    return new ControlFlowGraph(blocks[0].Id, exits, edges);
}
|
||||
|
||||
/// <summary>
/// Aggregates structural statistics (node/edge type counts, cyclomatic
/// complexity, depth, loop and branch counts) for fingerprint comparison.
/// </summary>
private static GraphProperties ComputeProperties(
    List<SemanticNode> nodes,
    List<SemanticEdge> edges,
    ControlFlowGraph cfg)
{
    var nodeTypeCounts = nodes
        .GroupBy(n => n.Type)
        .ToImmutableDictionary(g => g.Key, g => g.Count());

    var edgeTypeCounts = edges
        .GroupBy(e => e.Type)
        .ToImmutableDictionary(g => g.Key, g => g.Count());

    // Cyclomatic complexity M = E - N + 2P (P = 1 for a single function).
    // The previous code used the exit-block count in place of N (the block
    // count), which overstates complexity for graphs with few exits. N is
    // approximated here as the number of distinct block ids observed on
    // edges and in the exit set.
    var blockIds = new HashSet<int>(cfg.ExitBlockIds);
    foreach (var edge in cfg.Edges)
    {
        blockIds.Add(edge.SourceBlockId);
        blockIds.Add(edge.TargetBlockId);
    }

    var cyclomaticComplexity = Math.Max(1, cfg.Edges.Length - blockIds.Count + 2);

    var branchCount = nodes.Count(n => n.Type == SemanticNodeType.Branch);

    // Estimate max depth via BFS over the dependency edges (simplified).
    var maxDepth = ComputeMaxDepth(nodes, edges);

    // Estimate loop count from back edges.
    var loopCount = CountBackEdges(cfg);

    return new GraphProperties(
        nodes.Count,
        edges.Count,
        cyclomaticComplexity,
        maxDepth,
        nodeTypeCounts,
        edgeTypeCounts,
        loopCount,
        branchCount);
}
|
||||
|
||||
private static int ComputeMaxDepth(List<SemanticNode> nodes, List<SemanticEdge> edges)
{
    // Longest root-to-leaf distance (in nodes) over the dependency edges,
    // computed with a breadth-first traversal from the root set.
    if (nodes.Count == 0)
    {
        return 0;
    }

    // Successor adjacency list.
    var successors = new Dictionary<int, List<int>>();
    foreach (var edge in edges)
    {
        if (!successors.TryGetValue(edge.SourceId, out var list))
        {
            successors[edge.SourceId] = list = [];
        }

        list.Add(edge.TargetId);
    }

    // Roots are nodes with no incoming edges; if the graph is fully cyclic,
    // fall back to the first node so the walk still starts somewhere.
    var targets = new HashSet<int>(edges.Select(e => e.TargetId));
    var roots = nodes
        .Select(n => n.Id)
        .Where(id => !targets.Contains(id))
        .ToList();

    if (roots.Count == 0)
    {
        roots.Add(nodes[0].Id);
    }

    var visited = new HashSet<int>();
    var pending = new Queue<(int Id, int Depth)>();
    foreach (var root in roots)
    {
        pending.Enqueue((root, 1));
    }

    var maxDepth = 0;
    while (pending.TryDequeue(out var current))
    {
        if (!visited.Add(current.Id))
        {
            continue;
        }

        maxDepth = Math.Max(maxDepth, current.Depth);

        if (!successors.TryGetValue(current.Id, out var next))
        {
            continue;
        }

        foreach (var neighbor in next)
        {
            if (!visited.Contains(neighbor))
            {
                pending.Enqueue((neighbor, current.Depth + 1));
            }
        }
    }

    return maxDepth;
}
|
||||
|
||||
private static int CountBackEdges(ControlFlowGraph cfg)
{
    // Approximation: with blocks numbered in program order, an edge that
    // jumps to an earlier block is treated as a loop back edge. A precise
    // count would require dominator analysis.
    var count = 0;
    foreach (var edge in cfg.Edges)
    {
        if (edge.TargetBlockId < edge.SourceBlockId)
        {
            count++;
        }
    }

    return count;
}
|
||||
}
|
||||
@@ -0,0 +1,358 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using System.Collections.Immutable;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using StellaOps.BinaryIndex.Semantic.Internal;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
|
||||
/// Default implementation of semantic similarity matching.
|
||||
/// </summary>
|
||||
public sealed class SemanticMatcher : ISemanticMatcher
|
||||
{
|
||||
private readonly ILogger<SemanticMatcher> _logger;
private readonly GraphCanonicalizer _canonicalizer;

/// <summary>
/// Creates a new semantic matcher.
/// </summary>
/// <param name="logger">Logger instance.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="logger"/> is null.</exception>
public SemanticMatcher(ILogger<SemanticMatcher> logger)
{
    ArgumentNullException.ThrowIfNull(logger);
    _logger = logger;
    _canonicalizer = new GraphCanonicalizer();
}
|
||||
|
||||
/// <inheritdoc />
public Task<SemanticMatchResult> MatchAsync(
    SemanticFingerprint a,
    SemanticFingerprint b,
    MatchOptions? options = null,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);
    ct.ThrowIfCancellationRequested();

    var effectiveOptions = options ?? MatchOptions.Default;

    _logger.LogDebug(
        "Matching functions {FunctionA} and {FunctionB}",
        a.FunctionName,
        b.FunctionName);

    // Hash-identical fingerprints short-circuit to a perfect match.
    if (a.HashEquals(b))
    {
        return Task.FromResult(CreateExactMatchResult(a, b));
    }

    var graphSimilarity = ComputeHashSimilarity(a.GraphHash, b.GraphHash);
    var dataFlowSimilarity = ComputeHashSimilarity(a.DataFlowHash, b.DataFlowHash);
    var apiCallSimilarity = ComputeApiCallSimilarity(a.ApiCalls, b.ApiCalls);

    // Weighted blend of the component scores.
    var overallSimilarity =
        (graphSimilarity * effectiveOptions.GraphWeight) +
        (dataFlowSimilarity * effectiveOptions.DataFlowWeight) +
        (apiCallSimilarity * effectiveOptions.ApiCallWeight);

    // Normalize in case the caller's weights do not sum to exactly 1.
    var totalWeight =
        effectiveOptions.GraphWeight +
        effectiveOptions.DataFlowWeight +
        effectiveOptions.ApiCallWeight;
    if (totalWeight > 0 && totalWeight != 1.0m)
    {
        overallSimilarity /= totalWeight;
    }

    var confidence = DetermineConfidence(overallSimilarity, a, b);

    var deltas = effectiveOptions.ComputeDeltas
        ? ComputeDeltas(a, b, effectiveOptions.MaxDeltas)
        : [];

    var result = new SemanticMatchResult(
        a.FunctionName,
        b.FunctionName,
        overallSimilarity,
        graphSimilarity,
        dataFlowSimilarity,
        apiCallSimilarity,
        confidence,
        deltas);

    _logger.LogDebug(
        "Match result: {FunctionA} vs {FunctionB} = {Similarity:P2} ({Confidence})",
        a.FunctionName,
        b.FunctionName,
        (double)overallSimilarity,
        confidence);

    return Task.FromResult(result);
}
|
||||
|
||||
/// <inheritdoc />
public async Task<ImmutableArray<SemanticMatchResult>> FindMatchesAsync(
    SemanticFingerprint query,
    IAsyncEnumerable<SemanticFingerprint> corpus,
    decimal minSimilarity = 0.7m,
    int maxResults = 10,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(query);
    ArgumentNullException.ThrowIfNull(corpus);

    // Delta computation is skipped during the scan; callers can re-match a
    // specific pair if they need the detailed differences.
    var options = new MatchOptions
    {
        MinSimilarity = minSimilarity,
        ComputeDeltas = false // Skip deltas for performance
    };

    var hits = new List<SemanticMatchResult>();

    await foreach (var candidate in corpus.WithCancellation(ct))
    {
        var match = await MatchAsync(query, candidate, options, ct).ConfigureAwait(false);
        if (match.OverallSimilarity < minSimilarity)
        {
            continue;
        }

        hits.Add(match);

        // Prune periodically so the working set stays bounded on large corpora.
        if (hits.Count > maxResults * 2)
        {
            hits = [.. hits.OrderByDescending(r => r.OverallSimilarity).Take(maxResults)];
        }
    }

    return [.. hits.OrderByDescending(r => r.OverallSimilarity).Take(maxResults)];
}
|
||||
|
||||
/// <inheritdoc />
public Task<decimal> ComputeGraphSimilarityAsync(
    KeySemanticsGraph a,
    KeySemanticsGraph b,
    CancellationToken ct = default)
{
    ArgumentNullException.ThrowIfNull(a);
    ArgumentNullException.ThrowIfNull(b);
    ct.ThrowIfCancellationRequested();

    // Canonical labels are stable under node renumbering, so comparing the
    // label sets approximates structural graph similarity.
    var labelsA = _canonicalizer.Canonicalize(a).CanonicalLabels;
    var labelsB = _canonicalizer.Canonicalize(b).CanonicalLabels;

    if (labelsA.IsEmpty && labelsB.IsEmpty)
    {
        return Task.FromResult(1.0m);
    }

    if (labelsA.IsEmpty || labelsB.IsEmpty)
    {
        return Task.FromResult(0.0m);
    }

    // Jaccard index over the non-empty label sets.
    var setA = new HashSet<string>(labelsA.Where(l => !string.IsNullOrEmpty(l)));
    var setB = new HashSet<string>(labelsB.Where(l => !string.IsNullOrEmpty(l)));

    var intersection = setA.Intersect(setB).Count();
    var union = setA.Union(setB).Count();

    return Task.FromResult(union > 0 ? (decimal)intersection / union : 0m);
}
|
||||
|
||||
private static SemanticMatchResult CreateExactMatchResult(SemanticFingerprint a, SemanticFingerprint b)
{
    // Hash-equal fingerprints score 1.0 on every axis by definition and
    // carry no deltas.
    const decimal Perfect = 1.0m;
    return new SemanticMatchResult(
        a.FunctionName,
        b.FunctionName,
        Perfect,
        Perfect,
        Perfect,
        Perfect,
        MatchConfidence.VeryHigh,
        []);
}
|
||||
|
||||
/// <summary>
/// Measures similarity between two hashes as the fraction of matching bits
/// (1 - normalized Hamming distance). Identical hashes score 1.0. When the
/// hashes differ in length, the extra bytes of the longer hash count as
/// fully mismatched; the previous implementation divided by the shorter
/// length and silently ignored the unmatched tail, inflating the score.
/// For equal-length hashes (the normal SHA-256 case) behavior is unchanged.
/// </summary>
private static decimal ComputeHashSimilarity(byte[] hashA, byte[] hashB)
{
    if (hashA.Length == 0 || hashB.Length == 0)
    {
        return 0m;
    }

    if (hashA.AsSpan().SequenceEqual(hashB))
    {
        return 1.0m;
    }

    var minLen = Math.Min(hashA.Length, hashB.Length);
    var totalBits = Math.Max(hashA.Length, hashB.Length) * 8;
    var matchingBits = 0;

    for (var i = 0; i < minLen; i++)
    {
        var xor = hashA[i] ^ hashB[i];
        matchingBits += 8 - CountSetBits(xor);
    }

    return (decimal)matchingBits / totalBits;
}
|
||||
|
||||
/// <summary>
/// Counts set bits in <paramref name="value"/>, treating it as a 32-bit
/// pattern. The previous right-shift loop never terminated for negative
/// inputs because arithmetic shift preserves the sign bit; operating on the
/// unsigned representation fixes that. Callers currently only pass
/// byte-range XOR results (0..255), for which behavior is unchanged.
/// </summary>
private static int CountSetBits(int value)
{
    var bits = (uint)value;
    var count = 0;
    while (bits != 0)
    {
        bits &= bits - 1; // Kernighan's trick: clears the lowest set bit.
        count++;
    }

    return count;
}
|
||||
|
||||
private static decimal ComputeApiCallSimilarity(
    ImmutableArray<string> apiCallsA,
    ImmutableArray<string> apiCallsB)
{
    // Two functions that both call nothing are equivalent on this axis;
    // if exactly one makes API calls, they share nothing.
    if (apiCallsA.IsEmpty)
    {
        return apiCallsB.IsEmpty ? 1.0m : 0.0m;
    }

    if (apiCallsB.IsEmpty)
    {
        return 0.0m;
    }

    // Jaccard index over the distinct call-target sets.
    var setA = new HashSet<string>(apiCallsA, StringComparer.Ordinal);
    var setB = new HashSet<string>(apiCallsB, StringComparer.Ordinal);

    var intersection = setA.Intersect(setB).Count();
    var union = setA.Union(setB).Count();

    return union > 0 ? (decimal)intersection / union : 0m;
}
|
||||
|
||||
private static MatchConfidence DetermineConfidence(
    decimal similarity,
    SemanticFingerprint a,
    SemanticFingerprint b)
{
    // Tier by similarity score first...
    var confidence = similarity switch
    {
        >= 0.95m => MatchConfidence.VeryHigh,
        >= 0.85m => MatchConfidence.High,
        >= 0.70m => MatchConfidence.Medium,
        >= 0.50m => MatchConfidence.Low,
        _ => MatchConfidence.VeryLow
    };

    // ...then demote one tier when the functions differ in size by more
    // than 30%, since hash similarity is less trustworthy across very
    // different graph sizes. Low/VeryLow are left as-is.
    var maxSize = Math.Max(a.NodeCount, b.NodeCount);
    var sizeDiff = Math.Abs(a.NodeCount - b.NodeCount);

    if (maxSize > 0 && sizeDiff > maxSize * 0.3)
    {
        confidence = confidence switch
        {
            MatchConfidence.VeryHigh => MatchConfidence.High,
            MatchConfidence.High => MatchConfidence.Medium,
            MatchConfidence.Medium => MatchConfidence.Low,
            _ => confidence
        };
    }

    return confidence;
}
|
||||
|
||||
private static ImmutableArray<MatchDelta> ComputeDeltas(
    SemanticFingerprint a,
    SemanticFingerprint b,
    int maxDeltas)
{
    // Collect human-readable differences between the two fingerprints,
    // capped at maxDeltas entries overall.
    var deltas = new List<MatchDelta>();

    var nodeDiff = b.NodeCount - a.NodeCount;
    if (nodeDiff != 0)
    {
        deltas.Add(new MatchDelta(
            nodeDiff > 0 ? DeltaType.NodeAdded : DeltaType.NodeRemoved,
            $"Node count changed from {a.NodeCount} to {b.NodeCount}",
            Math.Abs(nodeDiff) * 0.01m));
    }

    var edgeDiff = b.EdgeCount - a.EdgeCount;
    if (edgeDiff != 0)
    {
        deltas.Add(new MatchDelta(
            edgeDiff > 0 ? DeltaType.EdgeAdded : DeltaType.EdgeRemoved,
            $"Edge count changed from {a.EdgeCount} to {b.EdgeCount}",
            Math.Abs(edgeDiff) * 0.01m));
    }

    if (a.CyclomaticComplexity != b.CyclomaticComplexity)
    {
        deltas.Add(new MatchDelta(
            DeltaType.ControlFlowChanged,
            $"Cyclomatic complexity changed from {a.CyclomaticComplexity} to {b.CyclomaticComplexity}",
            0.05m));
    }

    // A differing operation hash means a different operation sequence.
    if (!a.OperationHash.AsSpan().SequenceEqual(b.OperationHash.AsSpan()))
    {
        deltas.Add(new MatchDelta(
            DeltaType.OperationChanged,
            "Operation sequence changed (different operations used)",
            0.15m));
    }

    // A differing data-flow hash means different data dependencies.
    if (!a.DataFlowHash.AsSpan().SequenceEqual(b.DataFlowHash.AsSpan()))
    {
        deltas.Add(new MatchDelta(
            DeltaType.DataFlowChanged,
            "Data flow patterns changed",
            0.1m));
    }

    // Per-direction API-call changes, each capped at half the delta budget.
    var callsA = new HashSet<string>(a.ApiCalls, StringComparer.Ordinal);
    var callsB = new HashSet<string>(b.ApiCalls, StringComparer.Ordinal);

    foreach (var added in callsB.Except(callsA).Take(maxDeltas / 2))
    {
        deltas.Add(new MatchDelta(
            DeltaType.ApiCallAdded,
            $"API call added: {added}",
            0.1m));
    }

    foreach (var removed in callsA.Except(callsB).Take(maxDeltas / 2))
    {
        deltas.Add(new MatchDelta(
            DeltaType.ApiCallRemoved,
            $"API call removed: {removed}",
            0.1m));
    }

    return [.. deltas.Take(maxDeltas)];
}
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
// Copyright (c) StellaOps. All rights reserved.
|
||||
// Licensed under AGPL-3.0-or-later. See LICENSE in the project root.
|
||||
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.DependencyInjection.Extensions;
|
||||
|
||||
namespace StellaOps.BinaryIndex.Semantic;
|
||||
|
||||
/// <summary>
/// Extension methods for registering semantic analysis services.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Adds semantic analysis services to the service collection.
    /// Registrations use TryAdd semantics, so callers' own registrations
    /// for the same interfaces take precedence.
    /// </summary>
    /// <param name="services">The service collection.</param>
    /// <returns>The service collection for chaining.</returns>
    public static IServiceCollection AddBinaryIndexSemantic(this IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        services.TryAddSingleton<IIrLiftingService, IrLiftingService>();
        services.TryAddSingleton<ISemanticGraphExtractor, SemanticGraphExtractor>();
        services.TryAddSingleton<ISemanticFingerprintGenerator, SemanticFingerprintGenerator>();
        services.TryAddSingleton<ISemanticMatcher, SemanticMatcher>();

        return services;
    }
}
|
||||
@@ -0,0 +1,26 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
<PropertyGroup>
|
||||
<TargetFramework>net10.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<LangVersion>preview</LangVersion>
|
||||
<GenerateDocumentationFile>true</GenerateDocumentationFile>
|
||||
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
|
||||
<Description>Semantic analysis library for StellaOps BinaryIndex. Provides IR lifting, semantic graph extraction, and semantic fingerprinting for binary function comparison that is resilient to compiler optimizations and register allocation differences.</Description>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<InternalsVisibleTo Include="StellaOps.BinaryIndex.Semantic.Tests" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly.Abstractions\StellaOps.BinaryIndex.Disassembly.Abstractions.csproj" />
|
||||
<ProjectReference Include="..\StellaOps.BinaryIndex.Disassembly\StellaOps.BinaryIndex.Disassembly.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" />
|
||||
<PackageReference Include="Microsoft.Extensions.Options" />
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
Reference in New Issue
Block a user