251 lines
7.7 KiB
C#
251 lines
7.7 KiB
C#
// -----------------------------------------------------------------------------
// GhidraDecompilerAdapter.cs
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
// Task: MLEM-004 - Decompiled Code Extraction
// Description: Ghidra-based decompiler adapter implementation.
// -----------------------------------------------------------------------------
|
|
|
|
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using System.Diagnostics;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
|
|
namespace StellaOps.BinaryIndex.ML.Training;
|
|
|
|
/// <summary>
/// Decompiler adapter that launches Ghidra's headless analyzer as a child
/// process and optionally normalizes the pseudocode it produces.
/// </summary>
public sealed partial class GhidraDecompilerAdapter : IDecompilerAdapter
{
    private readonly GhidraAdapterOptions _options;
    private readonly ILogger<GhidraDecompilerAdapter> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="GhidraDecompilerAdapter"/> class.
    /// </summary>
    /// <param name="options">Adapter configuration (Ghidra install path, script, project directory).</param>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/> or <paramref name="logger"/> is <c>null</c>.
    /// </exception>
    public GhidraDecompilerAdapter(
        IOptions<GhidraAdapterOptions> options,
        ILogger<GhidraDecompilerAdapter> logger)
    {
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Not implemented against Ghidra yet: always returns a fixed placeholder
    /// function body that only embeds <paramref name="functionName"/>.
    /// </remarks>
    public Task<string?> DecompileAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default)
    {
        _logger.LogDebug("Decompiling {Function} from {Library}:{Version}",
            functionName, libraryName, version);

        // This would call Ghidra headless analyzer.
        // For now, return placeholder.
        return Task.FromResult<string?>($"int {functionName}(void *param_1) {{\n int result;\n // Decompiled code placeholder\n result = 0;\n return result;\n}}");
    }

    /// <inheritdoc />
    /// <remarks>
    /// Writes <paramref name="bytes"/> to a temp file, runs Ghidra headless with the
    /// configured post-script, and reads the script's output file. Returns <c>null</c>
    /// when Ghidra is not configured, the run fails or times out, no output file is
    /// produced, or any exception occurs (the exception is logged, not rethrown).
    /// </remarks>
    public async Task<string?> DecompileBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string architecture,
        DecompilationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= DecompilationOptions.Default;

        if (string.IsNullOrEmpty(_options.GhidraPath))
        {
            _logger.LogWarning("Ghidra path not configured");
            return null;
        }

        // NOTE(review): `architecture` is not forwarded to Ghidra (e.g. via -processor).
        // Its format is not visible here, so no mapping to a Ghidra language ID is attempted.

        // Declared outside the try so the finally block can clean up whatever was
        // created, even if writing the input file fails or is cancelled.
        string? tempInput = null;
        string? tempOutput = null;

        try
        {
            tempInput = Path.GetTempFileName();
            await File.WriteAllBytesAsync(tempInput, bytes.ToArray(), cancellationToken);

            tempOutput = Path.GetTempFileName();

            var result = await RunGhidraAsync(
                BuildHeadlessArguments(tempInput, tempOutput),
                options.Timeout,
                cancellationToken);

            if (!result.Success)
            {
                _logger.LogWarning("Ghidra decompilation failed: {Error}", result.Error);
                return null;
            }

            if (!File.Exists(tempOutput))
            {
                return null;
            }

            var decompiled = await File.ReadAllTextAsync(tempOutput, cancellationToken);
            return options.Simplify ? Normalize(decompiled) : decompiled;
        }
        catch (Exception ex)
        {
            // Best-effort contract: failures (including cancellation) surface as null.
            _logger.LogError(ex, "Decompilation failed");
            return null;
        }
        finally
        {
            TryDeleteTempFile(tempInput);
            TryDeleteTempFile(tempOutput);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Steps run in this order, each gated by its option flag: strip comments,
    /// normalize whitespace, rename Ghidra-generated identifiers to <c>var_N</c>
    /// (numbered by first appearance), remove simple type casts, then truncate to
    /// <c>MaxLength</c> characters and append a truncation marker.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="code"/> is <c>null</c>.</exception>
    public string Normalize(string code, NormalizationOptions? options = null)
    {
        ArgumentNullException.ThrowIfNull(code);

        options ??= NormalizationOptions.Default;
        var result = code;

        // Strip comments (block comments first, then line comments).
        if (options.StripComments)
        {
            result = StripCommentsRegex().Replace(result, "");
            result = LineCommentRegex().Replace(result, "");
        }

        // Normalize whitespace: collapse horizontal runs, then collapse blank lines.
        // Newlines are intentionally left alone by the first step so the second can work.
        if (options.NormalizeWhitespace)
        {
            result = MultipleSpacesRegex().Replace(result, " ");
            result = EmptyLinesRegex().Replace(result, "\n");
            result = result.Trim();
        }

        // Normalize variable names: same original name always maps to the same var_N.
        if (options.NormalizeVariables)
        {
            var varCounter = 0;
            var varMap = new Dictionary<string, string>();

            result = VariableNameRegex().Replace(result, match =>
            {
                var name = match.Value;
                if (!varMap.TryGetValue(name, out var normalized))
                {
                    normalized = $"var_{varCounter++}";
                    varMap[name] = normalized;
                }

                return normalized;
            });
        }

        // Remove type casts.
        if (options.RemoveTypeCasts)
        {
            result = TypeCastRegex().Replace(result, "");
        }

        // Truncate if too long.
        if (result.Length > options.MaxLength)
        {
            result = result[..options.MaxLength] + "\n/* truncated */";
        }

        return result;
    }

    /// <summary>
    /// Builds the analyzeHeadless argument list for a one-shot import + post-script run.
    /// </summary>
    /// <param name="inputPath">File containing the bytes to import.</param>
    /// <param name="outputPath">File path handed to the post-script as its argument.</param>
    /// <returns>Arguments in the order analyzeHeadless expects them.</returns>
    private List<string> BuildHeadlessArguments(string inputPath, string outputPath)
    {
        // analyzeHeadless requires <project_location> <project_name> as its first two
        // positional arguments; the project is throwaway (-deleteProject), so a unique
        // name avoids collisions between concurrent runs.
        var projectLocation = string.IsNullOrEmpty(_options.ProjectDirectory)
            ? Path.GetTempPath()
            : _options.ProjectDirectory;
        Directory.CreateDirectory(projectLocation);

        var projectName = $"decompile_{Guid.NewGuid():N}";
        var script = _options.DecompileScriptPath ?? "DecompileFunction.java";

        return new List<string>
        {
            projectLocation,
            projectName,
            "-import",
            inputPath,
            "-postScript",
            script,
            outputPath,
            "-deleteProject",
            "-noanalysis",
        };
    }

    /// <summary>
    /// Runs Ghidra's headless launcher with the given arguments and waits for it to exit.
    /// </summary>
    /// <param name="args">Individual arguments; each is passed unmodified (no manual quoting needed).</param>
    /// <param name="timeout">Maximum time to wait before the process tree is killed.</param>
    /// <param name="ct">Caller cancellation; also kills the process tree.</param>
    /// <returns>
    /// <c>Success</c> is true only for exit code 0. <c>Error</c> carries stderr when any was
    /// written, "Timeout"/"Canceled" when the wait was aborted, or the exit code otherwise.
    /// </returns>
    private async Task<(bool Success, string? Error)> RunGhidraAsync(
        IEnumerable<string> args,
        TimeSpan timeout,
        CancellationToken ct)
    {
        // Ghidra ships the launcher as a shell script plus a .bat for Windows.
        var launcher = OperatingSystem.IsWindows() ? "analyzeHeadless.bat" : "analyzeHeadless";
        var analyzeHeadless = Path.Combine(_options.GhidraPath!, "support", launcher);

        var psi = new ProcessStartInfo
        {
            FileName = analyzeHeadless,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        // ArgumentList escapes each entry, so paths containing spaces stay intact.
        foreach (var arg in args)
        {
            psi.ArgumentList.Add(arg);
        }

        using var process = new Process { StartInfo = psi };
        var error = new StringBuilder();

        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                error.AppendLine(e.Data);
            }
        };

        process.Start();

        // stdout is not used, but it is redirected and must be drained so the child
        // cannot block on a full pipe.
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(timeout);

        try
        {
            await process.WaitForExitAsync(cts.Token);
        }
        catch (OperationCanceledException)
        {
            KillQuietly(process);

            // Distinguish the caller's cancellation from our own timeout.
            return (false, ct.IsCancellationRequested ? "Canceled" : "Timeout");
        }

        if (process.ExitCode == 0)
        {
            return (true, error.Length > 0 ? error.ToString() : null);
        }

        return (false, error.Length > 0
            ? error.ToString()
            : $"Ghidra exited with code {process.ExitCode}");
    }

    /// <summary>
    /// Kills the process tree, tolerating the race where the process has already exited.
    /// </summary>
    private void KillQuietly(Process process)
    {
        try
        {
            process.Kill(entireProcessTree: true);
        }
        catch (Exception ex) when (ex is InvalidOperationException
                                       or AggregateException
                                       or System.ComponentModel.Win32Exception)
        {
            _logger.LogDebug(ex, "Failed to kill Ghidra process");
        }
    }

    /// <summary>
    /// Deletes a temp file if one was created; a cleanup failure is logged and never
    /// replaces the decompilation result.
    /// </summary>
    private void TryDeleteTempFile(string? path)
    {
        if (path is null)
        {
            return;
        }

        try
        {
            // File.Delete is a no-op when the file does not exist.
            File.Delete(path);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            _logger.LogDebug(ex, "Could not delete temp file {Path}", path);
        }
    }

    // Block comments, non-greedy, spanning lines.
    [GeneratedRegex(@"/\*.*?\*/", RegexOptions.Singleline)]
    private static partial Regex StripCommentsRegex();

    // Line comments up to end of line.
    [GeneratedRegex(@"//.*$", RegexOptions.Multiline)]
    private static partial Regex LineCommentRegex();

    // Runs of spaces/tabs only; newlines are handled by EmptyLinesRegex.
    [GeneratedRegex(@"[ \t]+")]
    private static partial Regex MultipleSpacesRegex();

    // Two newlines separated only by whitespace, i.e. one or more blank lines.
    [GeneratedRegex(@"\n\s*\n")]
    private static partial Regex EmptyLinesRegex();

    // Identifiers with the local_/param_/DAT_/FUN_ prefixes.
    [GeneratedRegex(@"\b(local_|param_|DAT_|FUN_)[a-zA-Z0-9_]+")]
    private static partial Regex VariableNameRegex();

    // Heuristic for "(type)" / "(type *)" casts. The lookbehind rejects call argument
    // lists such as "free(ptr)"; the lookahead requires an operand to follow, which
    // rejects conditions such as "if (flag) {" and "return (x);".
    [GeneratedRegex(@"(?<![a-zA-Z0-9_])\(\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\*?\s*\)(?=\s*[a-zA-Z0-9_(*&!~])")]
    private static partial Regex TypeCastRegex();
}
|
|
|
|
/// <summary>Configuration values for the Ghidra decompiler adapter.</summary>
public sealed record GhidraAdapterOptions
{
    /// <summary>Gets the location of the Ghidra installation, or <c>null</c> if unset.</summary>
    public string? GhidraPath { get; init; }

    /// <summary>Gets the location of the decompile script, or <c>null</c> if unset.</summary>
    public string? DecompileScriptPath { get; init; }

    /// <summary>Gets the directory meant for temporary projects, or <c>null</c> if unset.</summary>
    public string? ProjectDirectory { get; init; }
}
|