sprints work.
This commit is contained in:
@@ -0,0 +1,249 @@
|
||||
// -----------------------------------------------------------------------------
|
||||
// GhidraDecompilerAdapter.cs
|
||||
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
|
||||
// Task: MLEM-004 - Decompiled Code Extraction
|
||||
// Description: Ghidra-based decompiler adapter implementation.
|
||||
// -----------------------------------------------------------------------------
|
||||
|
||||
using System.Diagnostics;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
|
||||
namespace StellaOps.BinaryIndex.ML.Training;
|
||||
|
||||
/// <summary>
|
||||
/// Ghidra-based decompiler adapter.
|
||||
/// </summary>
|
||||
/// <summary>
/// Ghidra-based decompiler adapter.
/// </summary>
/// <remarks>
/// <para>
/// <see cref="DecompileBytesAsync"/> writes the supplied bytes to a temporary file, runs
/// Ghidra's <c>support/analyzeHeadless</c> launcher with a post-script that is expected to
/// write the decompiled text to a second temporary file, and returns that text
/// (optionally passed through <see cref="Normalize"/>).
/// </para>
/// <para>
/// <see cref="DecompileAsync"/> is still a placeholder and does not invoke Ghidra.
/// </para>
/// </remarks>
public sealed partial class GhidraDecompilerAdapter : IDecompilerAdapter
{
    /// <summary>Script name passed to <c>-postScript</c> when none is configured.</summary>
    private const string DefaultDecompileScript = "DecompileFunction.java";

    /// <summary>Upper bound on how long we wait for a killed Ghidra process to go away.</summary>
    private static readonly TimeSpan KillGracePeriod = TimeSpan.FromSeconds(5);

    private readonly GhidraAdapterOptions _options;
    private readonly ILogger<GhidraDecompilerAdapter> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="GhidraDecompilerAdapter"/> class.
    /// </summary>
    /// <param name="options">Adapter configuration (Ghidra install path, script, project directory).</param>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/> or <paramref name="logger"/> is <see langword="null"/>.
    /// </exception>
    public GhidraDecompilerAdapter(
        IOptions<GhidraAdapterOptions> options,
        ILogger<GhidraDecompilerAdapter> logger)
    {
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Placeholder implementation: logs the request and returns a fixed C-like stub that
    /// only embeds <paramref name="functionName"/>. Ghidra is not invoked and
    /// <paramref name="cancellationToken"/> is not observed.
    /// </remarks>
    public Task<string?> DecompileAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default)
    {
        _logger.LogDebug("Decompiling {Function} from {Library}:{Version}",
            functionName, libraryName, version);

        // This would call Ghidra headless analyzer.
        // For now, return placeholder.
        var placeholder = $"int {functionName}(void *param_1) {{\n int result;\n // Decompiled code placeholder\n result = 0;\n return result;\n}}";
        return Task.FromResult<string?>(placeholder);
    }

    /// <inheritdoc />
    /// <remarks>
    /// <para>
    /// Returns <see langword="null"/> when the Ghidra path is not configured, when Ghidra
    /// exits with a non-zero code or times out, or when any other error occurs (the error
    /// is logged). Cancellation requested through <paramref name="cancellationToken"/> is
    /// propagated as <see cref="OperationCanceledException"/> instead of being reported as
    /// a failure.
    /// </para>
    /// <para>
    /// NOTE(review): <paramref name="architecture"/> is currently not forwarded to Ghidra;
    /// confirm whether it should be mapped to a <c>-processor</c> language id.
    /// </para>
    /// </remarks>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled.
    /// </exception>
    public async Task<string?> DecompileBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string architecture,
        DecompilationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= DecompilationOptions.Default;

        if (string.IsNullOrEmpty(_options.GhidraPath))
        {
            _logger.LogWarning("Ghidra path not configured");
            return null;
        }

        // Declared outside the try so the finally can clean up whatever was created,
        // even if a later step (writing the input, starting Ghidra) throws.
        string? tempInput = null;
        string? tempOutput = null;

        try
        {
            tempInput = Path.GetTempFileName();
            await File.WriteAllBytesAsync(tempInput, bytes.ToArray(), cancellationToken);

            tempOutput = Path.GetTempFileName();

            var arguments = BuildHeadlessArguments(tempInput, tempOutput);
            var result = await RunGhidraAsync(arguments, options.Timeout, cancellationToken);

            if (!result.Success)
            {
                _logger.LogWarning("Ghidra decompilation failed: {Error}", result.Error);
                return null;
            }

            if (!File.Exists(tempOutput))
            {
                return null;
            }

            var decompiled = await File.ReadAllTextAsync(tempOutput, cancellationToken);
            return options.Simplify ? Normalize(decompiled) : decompiled;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // The caller asked us to stop; do not disguise that as a decompilation failure.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Decompilation failed");
            return null;
        }
        finally
        {
            TryDeleteFile(tempInput);
            TryDeleteFile(tempOutput);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Applies, in order and each only when enabled in <paramref name="options"/>:
    /// comment stripping, whitespace normalization, renaming of Ghidra-style identifiers
    /// (<c>local_</c>, <c>param_</c>, <c>DAT_</c>, <c>FUN_</c> prefixes) to
    /// <c>var_N</c> in order of first appearance, and type-cast removal. The result is
    /// finally truncated to <c>MaxLength</c> characters with a trailing marker.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="code"/> is <see langword="null"/>.</exception>
    public string Normalize(string code, NormalizationOptions? options = null)
    {
        ArgumentNullException.ThrowIfNull(code);

        options ??= NormalizationOptions.Default;
        var result = code;

        // Strip comments: block comments first, then anything from "//" to end of line.
        if (options.StripComments)
        {
            result = StripCommentsRegex().Replace(result, "");
            result = LineCommentRegex().Replace(result, "");
        }

        // Normalize whitespace.
        // NOTE(review): the first pattern is \s+, which also matches newlines, so after it
        // runs no "\n" is left and the empty-line pass below cannot match. Kept as-is to
        // preserve the existing normalized output; confirm whether single-line output is intended.
        if (options.NormalizeWhitespace)
        {
            result = MultipleSpacesRegex().Replace(result, " ");
            result = EmptyLinesRegex().Replace(result, "\n");
            result = result.Trim();
        }

        // Normalize variable names: the same source identifier always maps to the same var_N.
        if (options.NormalizeVariables)
        {
            var varCounter = 0;
            var varMap = new Dictionary<string, string>();

            result = VariableNameRegex().Replace(result, match =>
            {
                var name = match.Value;
                if (!varMap.TryGetValue(name, out var normalized))
                {
                    normalized = $"var_{varCounter++}";
                    varMap[name] = normalized;
                }

                return normalized;
            });
        }

        // Remove type casts.
        // NOTE(review): the pattern matches any parenthesized single identifier (optionally
        // followed by '*'), so it also matches text such as "(x)" in a call like "f(x)".
        if (options.RemoveTypeCasts)
        {
            result = TypeCastRegex().Replace(result, "");
        }

        // Truncate if too long.
        if (result.Length > options.MaxLength)
        {
            result = result[..options.MaxLength] + "\n/* truncated */";
        }

        return result;
    }

    /// <summary>
    /// Builds the <c>analyzeHeadless</c> argument list for a one-shot import and decompile run.
    /// </summary>
    /// <param name="inputPath">File containing the bytes to import.</param>
    /// <param name="outputPath">File path handed to the post-script as its argument.</param>
    /// <returns>
    /// Arguments in the order the launcher expects: project location, project name, then options.
    /// </returns>
    private List<string> BuildHeadlessArguments(string inputPath, string outputPath)
    {
        // analyzeHeadless requires <project_location> <project_name> before any option.
        var projectLocation = string.IsNullOrEmpty(_options.ProjectDirectory)
            ? Path.GetTempPath()
            : _options.ProjectDirectory;
        Directory.CreateDirectory(projectLocation);

        // Unique name so concurrent runs never share (or delete) each other's project.
        var projectName = $"decompile_{Guid.NewGuid():N}";
        var script = _options.DecompileScriptPath ?? DefaultDecompileScript;

        return new List<string>
        {
            projectLocation,
            projectName,
            "-import",
            inputPath,
            "-postScript",
            script,
            outputPath,
            "-deleteProject",
            "-noanalysis",
        };
    }

    /// <summary>
    /// Runs Ghidra's headless launcher and waits for it to finish, time out or be cancelled.
    /// </summary>
    /// <param name="arguments">Individual command-line arguments; each is passed verbatim.</param>
    /// <param name="timeout">Maximum time to wait for the process.</param>
    /// <param name="ct">Caller's cancellation token.</param>
    /// <returns>
    /// <c>Success</c> is <see langword="true"/> when the exit code is 0. <c>Error</c> carries
    /// captured stderr (if any), <c>"Timeout"</c> on timeout, or the exit code when the process
    /// failed without writing to stderr.
    /// </returns>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    private async Task<(bool Success, string? Error)> RunGhidraAsync(
        IReadOnlyList<string> arguments,
        TimeSpan timeout,
        CancellationToken ct)
    {
        ct.ThrowIfCancellationRequested();

        var analyzeHeadless = Path.Combine(_options.GhidraPath!, "support", "analyzeHeadless");

        var psi = new ProcessStartInfo
        {
            FileName = analyzeHeadless,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        // ArgumentList quotes/escapes each entry, so paths containing spaces stay intact.
        foreach (var argument in arguments)
        {
            psi.ArgumentList.Add(argument);
        }

        using var process = new Process { StartInfo = psi };
        var error = new StringBuilder();

        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                error.AppendLine(e.Data);
            }
        };

        process.Start();

        // stdout is not used, but it must still be drained so the child never blocks on a full pipe.
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(timeout);

        try
        {
            await process.WaitForExitAsync(cts.Token);
        }
        catch (OperationCanceledException)
        {
            await KillProcessTreeAsync(process);

            // Distinguish "caller cancelled" (propagate) from "our timeout fired" (report).
            ct.ThrowIfCancellationRequested();
            return (false, "Timeout");
        }

        var stderr = error.Length > 0 ? error.ToString() : null;
        if (process.ExitCode == 0)
        {
            return (true, stderr);
        }

        return (false, stderr ?? $"analyzeHeadless exited with code {process.ExitCode}");
    }

    /// <summary>
    /// Best-effort termination of the process and its descendants, followed by a bounded wait
    /// so files the process held open are released before the caller cleans up.
    /// </summary>
    private static async Task KillProcessTreeAsync(Process process)
    {
        try
        {
            process.Kill(entireProcessTree: true);

            using var grace = new CancellationTokenSource(KillGracePeriod);
            await process.WaitForExitAsync(grace.Token);
        }
        catch (Exception ex) when (ex is InvalidOperationException
                                       or System.ComponentModel.Win32Exception
                                       or AggregateException
                                       or OperationCanceledException)
        {
            // The process already exited, could not be (fully) terminated, or did not go away
            // within the grace period. Nothing more can be done here.
        }
    }

    /// <summary>
    /// Deletes a temporary file without letting a cleanup failure replace the method's result.
    /// </summary>
    private void TryDeleteFile(string? path)
    {
        if (path is null)
        {
            return;
        }

        try
        {
            File.Delete(path);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            _logger.LogDebug(ex, "Could not delete temporary file {Path}", path);
        }
    }

    // Block comments: /* ... */, spanning lines (Singleline makes '.' match newlines).
    [GeneratedRegex(@"/\*.*?\*/", RegexOptions.Singleline)]
    private static partial Regex StripCommentsRegex();

    // Line comments: from // to the end of each line.
    [GeneratedRegex(@"//.*$", RegexOptions.Multiline)]
    private static partial Regex LineCommentRegex();

    // Any run of whitespace characters (including line breaks).
    [GeneratedRegex(@"\s+")]
    private static partial Regex MultipleSpacesRegex();

    // Two newlines separated only by whitespace.
    [GeneratedRegex(@"\n\s*\n")]
    private static partial Regex EmptyLinesRegex();

    // Identifiers starting with local_, param_, DAT_ or FUN_.
    [GeneratedRegex(@"\b(local_|param_|DAT_|FUN_)[a-zA-Z0-9_]+")]
    private static partial Regex VariableNameRegex();

    // A parenthesized single identifier with an optional trailing '*', e.g. "(int)" or "(char *)".
    [GeneratedRegex(@"\(\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\*?\s*\)")]
    private static partial Regex TypeCastRegex();
}
|
||||
|
||||
/// <summary>
|
||||
/// Options for Ghidra adapter.
|
||||
/// </summary>
|
||||
public sealed record GhidraAdapterOptions
{
    /// <summary>
    /// Root directory of the Ghidra installation. The adapter looks for the headless
    /// launcher at <c>support/analyzeHeadless</c> beneath it; when this is null or empty
    /// byte decompilation is skipped.
    /// </summary>
    public string? GhidraPath { get; init; }

    /// <summary>
    /// Script handed to Ghidra's <c>-postScript</c> option. When null the adapter falls
    /// back to <c>DecompileFunction.java</c>.
    /// </summary>
    public string? DecompileScriptPath { get; init; }

    /// <summary>
    /// Directory in which temporary Ghidra projects are meant to be placed.
    /// </summary>
    public string? ProjectDirectory { get; init; }
}
|
||||
Reference in New Issue
Block a user