// -----------------------------------------------------------------------------
// GhidraDecompilerAdapter.cs
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
// Task: MLEM-004 - Decompiled Code Extraction
// Description: Ghidra-based decompiler adapter implementation.
// -----------------------------------------------------------------------------
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
namespace StellaOps.BinaryIndex.ML.Training;
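/// <summary>
/// Ghidra-based decompiler adapter: implements <see cref="IDecompilerAdapter"/> on top of
/// Ghidra's headless analyzer.
/// </summary>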
public sealed partial class GhidraDecompilerAdapter : IDecompilerAdapter
{
private readonly GhidraAdapterOptions _options;
private readonly ILogger _logger;
/// <summary>
/// Initializes a new instance of the <see cref="GhidraDecompilerAdapter"/> class.
/// </summary>
/// <param name="options">Adapter configuration; its <c>Value</c> is read once and kept for the adapter's lifetime.</param>
/// <param name="logger">Logger that receives the adapter's diagnostics.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="options"/> or <paramref name="logger"/> is <see langword="null"/>.
/// </exception>
public GhidraDecompilerAdapter(
    IOptions<GhidraAdapterOptions> options,
    ILogger<GhidraDecompilerAdapter> logger)
{
    // Fail at construction time instead of with a NullReferenceException on first use.
    ArgumentNullException.ThrowIfNull(options);
    ArgumentNullException.ThrowIfNull(logger);

    _options = options.Value;
    _logger = logger;
}
/// <summary>
/// Decompiles a named function from a specific library version.
/// </summary>
/// <remarks>
/// Placeholder implementation: Ghidra is not invoked yet. The result is a fixed C stub whose
/// function name is <paramref name="functionName"/>.
/// </remarks>
/// <param name="libraryName">Library the function belongs to (only logged for now).</param>
/// <param name="version">Library version (only logged for now).</param>
/// <param name="functionName">Function name; embedded in the returned stub.</param>
/// <param name="cancellationToken">Not observed by the placeholder implementation.</param>
/// <returns>An already-completed task holding the placeholder C source.</returns>
public Task<string?> DecompileAsync(
    string libraryName,
    string version,
    string functionName,
    CancellationToken cancellationToken = default)
{
    _logger.LogDebug("Decompiling {Function} from {Library}:{Version}",
        functionName, libraryName, version);

    // This would call Ghidra headless analyzer.
    // For now, return a placeholder. Nothing is awaited, so the method hands back a completed
    // task directly instead of paying for an async state machine.
    var placeholder = $"int {functionName}(void *param_1) {{\n int result;\n // Decompiled code placeholder\n result = 0;\n return result;\n}}";
    return Task.FromResult<string?>(placeholder);
}
/// <summary>
/// Decompiles a byte buffer by writing it to a temporary file and running Ghidra's headless
/// analyzer with the configured decompile script.
/// </summary>
/// <remarks>
/// Best-effort: every failure (Ghidra path not configured, empty input, non-zero Ghidra exit,
/// timeout or cancellation, no script output, or any exception) is logged and reported as
/// <see langword="null"/>; nothing is thrown to the caller.
/// </remarks>
/// <param name="bytes">Bytes to decompile.</param>
/// <param name="architecture">
/// Target architecture. NOTE(review): this value is currently not forwarded to Ghidra; confirm
/// whether a <c>-processor</c>/<c>-loader</c> argument is required for raw byte input.
/// </param>
/// <param name="options">Decompilation settings; <c>DecompilationOptions.Default</c> when <see langword="null"/>.</param>
/// <param name="cancellationToken">Aborts the file I/O and the Ghidra run.</param>
/// <returns>
/// The decompiled source (passed through <see cref="Normalize"/> when <c>options.Simplify</c> is set),
/// or <see langword="null"/> on any failure.
/// </returns>
public async Task<string?> DecompileBytesAsync(
    ReadOnlyMemory<byte> bytes,
    string architecture,
    DecompilationOptions? options = null,
    CancellationToken cancellationToken = default)
{
    options ??= DecompilationOptions.Default;

    if (string.IsNullOrEmpty(_options.GhidraPath))
    {
        _logger.LogWarning("Ghidra path not configured");
        return null;
    }

    if (bytes.IsEmpty)
    {
        _logger.LogWarning("No bytes supplied for decompilation");
        return null;
    }

    // Declared before the try so the finally block can remove whichever files were created,
    // even when a later step throws.
    string? tempInput = null;
    string? tempOutput = null;

    try
    {
        tempInput = Path.GetTempFileName();
        await File.WriteAllBytesAsync(tempInput, bytes.ToArray(), cancellationToken);
        tempOutput = Path.GetTempFileName();

        // analyzeHeadless expects <project_location> <project_name> ahead of every option.
        // A unique project name keeps concurrent runs apart; -deleteProject removes it afterwards.
        var configuredLocation = _options.ProjectDirectory;
        var projectLocation = Path.TrimEndingDirectorySeparator(
            string.IsNullOrEmpty(configuredLocation) ? Path.GetTempPath() : configuredLocation);
        Directory.CreateDirectory(projectLocation);
        var projectName = $"stellaops_{Guid.NewGuid():N}";

        // Run Ghidra headless. Paths are quoted so that directories containing spaces survive
        // argument splitting; the trailing separator was trimmed above so a closing backslash
        // cannot escape the quote.
        var script = _options.DecompileScriptPath ?? "DecompileFunction.java";
        var args =
            $"{Quote(projectLocation)} {projectName} -import {Quote(tempInput)} " +
            $"-postScript {Quote(script)} {Quote(tempOutput)} -deleteProject -noanalysis";

        var result = await RunGhidraAsync(args, options.Timeout, cancellationToken);
        if (!result.Success)
        {
            _logger.LogWarning("Ghidra decompilation failed: {Error}", result.Error);
            return null;
        }

        if (!File.Exists(tempOutput))
        {
            return null;
        }

        // Path.GetTempFileName() pre-creates a zero-byte file, so existence alone does not
        // prove the script wrote anything; an empty file means "no output".
        var decompiled = await File.ReadAllTextAsync(tempOutput, cancellationToken);
        if (decompiled.Length == 0)
        {
            _logger.LogWarning("Ghidra produced no decompiler output");
            return null;
        }

        return options.Simplify ? Normalize(decompiled) : decompiled;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Decompilation failed");
        return null;
    }
    finally
    {
        TryDelete(tempInput);
        TryDelete(tempOutput);
    }

    // Wraps a command-line token in double quotes.
    static string Quote(string value) => $"\"{value}\"";

    // Best-effort cleanup: a locked or vanished temp file must not turn a successful
    // decompilation into a failure.
    void TryDelete(string? path)
    {
        if (path is null)
        {
            return;
        }

        try
        {
            File.Delete(path);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            _logger.LogDebug(ex, "Could not delete temporary file {Path}", path);
        }
    }
}
/// <summary>
/// Normalizes decompiled source text so that equivalent functions produce more uniform output.
/// </summary>
/// <remarks>
/// Steps run in this fixed order, each gated by its flag on <paramref name="options"/>:
/// strip comments, collapse whitespace, rename Ghidra-generated identifiers, remove type casts.
/// Finally the text is cut to <c>MaxLength</c> characters and a truncation marker is appended,
/// so a truncated result is longer than <c>MaxLength</c> by the length of that marker.
/// </remarks>
/// <param name="code">Source text to normalize.</param>
/// <param name="options">Normalization settings; <c>NormalizationOptions.Default</c> when <see langword="null"/>.</param>
/// <returns>The normalized text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="code"/> is <see langword="null"/>.</exception>
public string Normalize(string code, NormalizationOptions? options = null)
{
    ArgumentNullException.ThrowIfNull(code);
    options ??= NormalizationOptions.Default;

    var result = code;

    // Strip comments: block comments first, then line comments.
    if (options.StripComments)
    {
        result = StripCommentsRegex().Replace(result, "");
        result = LineCommentRegex().Replace(result, "");
    }

    // Normalize whitespace.
    if (options.NormalizeWhitespace)
    {
        result = MultipleSpacesRegex().Replace(result, " ");
        // NOTE(review): MultipleSpacesRegex is \s+, which also consumes line breaks, so no
        // newline is left for EmptyLinesRegex to match here. Kept as-is to preserve the
        // current output; confirm whether line structure was meant to survive.
        result = EmptyLinesRegex().Replace(result, "\n");
        result = result.Trim();
    }

    // Normalize variable names: each distinct generated identifier becomes var_N, numbered
    // in order of first appearance; repeated occurrences reuse the same replacement.
    if (options.NormalizeVariables)
    {
        var varMap = new Dictionary<string, string>(StringComparer.Ordinal);
        result = VariableNameRegex().Replace(result, match =>
        {
            if (!varMap.TryGetValue(match.Value, out var normalized))
            {
                normalized = $"var_{varMap.Count}";
                varMap[match.Value] = normalized;
            }

            return normalized;
        });
    }

    // Remove type casts.
    if (options.RemoveTypeCasts)
    {
        result = TypeCastRegex().Replace(result, "");
    }

    // Truncate if too long.
    if (result.Length > options.MaxLength)
    {
        result = result[..options.MaxLength] + "\n/* truncated */";
    }

    return result;
}
/// <summary>
/// Starts Ghidra's <c>analyzeHeadless</c> launcher with the given arguments and waits for it to exit.
/// </summary>
/// <param name="args">Complete command-line argument string passed to the launcher.</param>
/// <param name="timeout">Maximum time to wait before the process tree is killed.</param>
/// <param name="ct">Cancels the wait; the process tree is killed when it fires.</param>
/// <returns>
/// <c>Success</c> is <see langword="true"/> only when the process exits with code 0. <c>Error</c>
/// carries captured stderr when there is any; for a failed run with empty stderr it falls back to
/// captured stdout, then to the exit code. It is <c>"Timeout"</c> or <c>"Cancelled"</c> when the
/// wait was aborted.
/// </returns>
private async Task<(bool Success, string? Error)> RunGhidraAsync(
    string args,
    TimeSpan timeout,
    CancellationToken ct)
{
    // Ghidra ships a shell launcher for Unix-like systems and a .bat launcher for Windows.
    // With UseShellExecute = false the extension must be spelled out on Windows.
    var launcherName = OperatingSystem.IsWindows() ? "analyzeHeadless.bat" : "analyzeHeadless";
    var analyzeHeadless = Path.Combine(_options.GhidraPath!, "support", launcherName);

    var psi = new ProcessStartInfo
    {
        FileName = analyzeHeadless,
        Arguments = args,
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    using var process = new Process { StartInfo = psi };
    var output = new StringBuilder();
    var error = new StringBuilder();

    process.OutputDataReceived += (_, e) =>
    {
        if (e.Data is not null)
        {
            output.AppendLine(e.Data);
        }
    };
    process.ErrorDataReceived += (_, e) =>
    {
        if (e.Data is not null)
        {
            error.AppendLine(e.Data);
        }
    };

    process.Start();
    process.BeginOutputReadLine();
    process.BeginErrorReadLine();

    // One token covers both caller cancellation and the timeout.
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
    cts.CancelAfter(timeout);

    try
    {
        await process.WaitForExitAsync(cts.Token);
    }
    catch (OperationCanceledException)
    {
        try
        {
            process.Kill(entireProcessTree: true);
        }
        catch (Exception killError) when (killError is InvalidOperationException
            or System.ComponentModel.Win32Exception
            or AggregateException)
        {
            // The process may already be gone, or part of its tree could not be terminated;
            // either way the run is reported as aborted below.
            _logger.LogDebug(killError, "Failed to kill Ghidra process");
        }

        // Tell caller-requested cancellation apart from the timeout firing.
        return (false, ct.IsCancellationRequested ? "Cancelled" : "Timeout");
    }

    var stderr = error.ToString();
    if (process.ExitCode == 0)
    {
        return (true, stderr.Length > 0 ? stderr : null);
    }

    // Never report a failure without a reason: fall back to stdout, then to the exit code.
    var failure = stderr.Length > 0
        ? stderr
        : output.Length > 0
            ? output.ToString()
            : $"analyzeHeadless exited with code {process.ExitCode}";
    return (false, failure);
}
/// <summary>
/// Matches a C block comment, <c>/* ... */</c>. The match is lazy (stops at the first
/// <c>*/</c>) and <see cref="RegexOptions.Singleline"/> lets it span line breaks.
/// </summary>
[GeneratedRegex(@"/\*.*?\*/", RegexOptions.Singleline)]
private static partial Regex StripCommentsRegex();
/// <summary>
/// Matches <c>//</c> through the end of the line (<see cref="RegexOptions.Multiline"/> makes
/// <c>$</c> match at every line end). The pattern is purely textual: a <c>//</c> inside a
/// string literal is matched as well.
/// </summary>
[GeneratedRegex(@"//.*$", RegexOptions.Multiline)]
private static partial Regex LineCommentRegex();
/// <summary>
/// Matches one or more consecutive whitespace characters. <c>\s</c> includes line breaks,
/// so a run that crosses lines is matched as a single unit.
/// </summary>
[GeneratedRegex(@"\s+")]
private static partial Regex MultipleSpacesRegex();
/// <summary>
/// Matches two line feeds separated only by optional whitespace, i.e. a blank line.
/// NOTE(review): <c>Normalize</c> applies this after replacing every <c>\s+</c> run with a
/// single space, at which point no line feed appears to remain for it to match; confirm the
/// intended order.
/// </summary>
[GeneratedRegex(@"\n\s*\n")]
private static partial Regex EmptyLinesRegex();
/// <summary>
/// Matches identifiers that begin, at a word boundary, with one of the prefixes
/// <c>local_</c>, <c>param_</c>, <c>DAT_</c> or <c>FUN_</c>, followed by at least one
/// letter, digit or underscore.
/// </summary>
[GeneratedRegex(@"\b(local_|param_|DAT_|FUN_)[a-zA-Z0-9_]+")]
private static partial Regex VariableNameRegex();
/// <summary>
/// Matches a simple C type cast: a parenthesized single identifier, optionally followed by one
/// <c>*</c>, such as <c>(int)</c> or <c>(char *)</c>.
/// </summary>
/// <remarks>
/// A bare "parenthesized identifier" pattern also hits single-argument call lists
/// (<c>foo(x)</c>), <c>(void)</c> parameter lists and <c>if (flag)</c> conditions, and removing
/// those corrupts the code. The lookarounds narrow the match heuristically:
/// <list type="bullet">
/// <item>not directly preceded by an identifier character or <c>]</c> (call or index expression);</item>
/// <item>not preceded by <c>if</c>, <c>while</c>, <c>for</c>, <c>switch</c> or <c>sizeof</c>;</item>
/// <item>followed by something that can start an operand (identifier, literal, <c>(</c>, unary operator).</item>
/// </list>
/// A preceding <c>)</c> is deliberately allowed so that chained casts such as
/// <c>(long)(int)x</c> are still both matched. Multi-word types (<c>(unsigned int)</c>) and
/// multi-level pointers (<c>(char **)</c>) are not matched.
/// </remarks>
[GeneratedRegex(@"(?<![A-Za-z0-9_\]])(?<!\b(?:if|while|for|switch|sizeof)\s*)\(\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\*?\s*\)(?=\s*[A-Za-z0-9_(*&~!+\-""'])")]
private static partial Regex TypeCastRegex();
}
/// <summary>
/// Options for the Ghidra decompiler adapter.
/// </summary>
public sealed record GhidraAdapterOptions
{
/// <summary>
/// Gets the path to the Ghidra installation. The adapter resolves the headless launcher
/// under this directory's <c>support</c> subdirectory. When this is null or empty,
/// byte-level decompilation logs a warning and returns no result.
/// </summary>
public string? GhidraPath { get; init; }
/// <summary>
/// Gets the path to the decompile script passed to Ghidra as the post-script.
/// When null, the adapter uses <c>DecompileFunction.java</c>.
/// </summary>
public string? DecompileScriptPath { get; init; }
/// <summary>
/// Gets the project directory for temp projects.
/// NOTE(review): confirm how this is expected to map onto the headless analyzer's
/// project-location argument.
/// </summary>
public string? ProjectDirectory { get; init; }
}