// -----------------------------------------------------------------------------
// GhidraDecompilerAdapter.cs
// Sprint: SPRINT_20260119_006 ML Embeddings Corpus
// Task: MLEM-004 - Decompiled Code Extraction
// Description: Ghidra-based decompiler adapter implementation.
// -----------------------------------------------------------------------------

using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;

namespace StellaOps.BinaryIndex.ML.Training;

/// <summary>
/// Ghidra-based decompiler adapter. Runs Ghidra's headless analyzer as a child
/// process to decompile raw bytes and normalizes the resulting pseudo-C.
/// </summary>
public sealed partial class GhidraDecompilerAdapter : IDecompilerAdapter
{
    private readonly GhidraAdapterOptions _options;
    private readonly ILogger<GhidraDecompilerAdapter> _logger;

    /// <summary>
    /// Initializes a new instance of the <see cref="GhidraDecompilerAdapter"/> class.
    /// </summary>
    /// <param name="options">Adapter configuration (Ghidra install path, script, project directory).</param>
    /// <param name="logger">Logger used for diagnostics.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="options"/> or <paramref name="logger"/> is <see langword="null"/>.
    /// </exception>
    public GhidraDecompilerAdapter(
        IOptions<GhidraAdapterOptions> options,
        ILogger<GhidraDecompilerAdapter> logger)
    {
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _options = options.Value;
        _logger = logger;
    }

    /// <inheritdoc />
    /// <remarks>
    /// Placeholder implementation: it does not invoke Ghidra and always returns a
    /// fixed stub body named after <paramref name="functionName"/>.
    /// <paramref name="cancellationToken"/> is currently not observed.
    /// </remarks>
    public Task<string?> DecompileAsync(
        string libraryName,
        string version,
        string functionName,
        CancellationToken cancellationToken = default)
    {
        _logger.LogDebug("Decompiling {Function} from {Library}:{Version}", functionName, libraryName, version);

        // This would call Ghidra headless analyzer
        // For now, return placeholder
        return Task.FromResult<string?>(
            $"int {functionName}(void *param_1) {{\n int result;\n // Decompiled code placeholder\n result = 0;\n return result;\n}}");
    }

    /// <inheritdoc />
    /// <remarks>
    /// Writes <paramref name="bytes"/> to a temporary file, runs Ghidra headless
    /// with the configured post-script and reads the script's output file.
    /// Returns <see langword="null"/> when Ghidra is not configured, when the run
    /// fails or times out, or when no output was produced.
    /// <paramref name="architecture"/> is currently not forwarded to Ghidra.
    /// </remarks>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled by the caller.
    /// </exception>
    public async Task<string?> DecompileBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string architecture,
        DecompilationOptions? options = null,
        CancellationToken cancellationToken = default)
    {
        options ??= DecompilationOptions.Default;

        if (string.IsNullOrEmpty(_options.GhidraPath))
        {
            _logger.LogWarning("Ghidra path not configured");
            return null;
        }

        // Declared outside the try so the finally block can clean up whatever
        // was created, even if a later step throws.
        string? tempInput = null;
        string? tempOutput = null;
        string? projectDir = null;

        try
        {
            // Create temp file with bytes
            tempInput = Path.GetTempFileName();
            await File.WriteAllBytesAsync(tempInput, bytes.ToArray(), cancellationToken);

            tempOutput = Path.GetTempFileName();

            // analyzeHeadless requires a project location and project name as its
            // first two arguments. A unique sub-directory keeps concurrent runs apart.
            var projectRoot = string.IsNullOrEmpty(_options.ProjectDirectory)
                ? Path.GetTempPath()
                : _options.ProjectDirectory;
            projectDir = Path.Combine(projectRoot, $"ghidra_{Guid.NewGuid():N}");
            Directory.CreateDirectory(projectDir);

            // Run Ghidra headless
            var script = _options.DecompileScriptPath ?? "DecompileFunction.java";
            var args = new[]
            {
                projectDir,
                "decompile",
                "-import", tempInput,
                "-postScript", script, tempOutput,
                "-deleteProject",
                "-noanalysis",
            };

            var result = await RunGhidraAsync(args, options.Timeout, cancellationToken);
            if (!result.Success)
            {
                _logger.LogWarning("Ghidra decompilation failed: {Error}", result.Error);
                return null;
            }

            if (!File.Exists(tempOutput))
            {
                return null;
            }

            var decompiled = await File.ReadAllTextAsync(tempOutput, cancellationToken);

            // GetTempFileName pre-creates the file, so existence alone does not
            // prove the script wrote anything.
            if (string.IsNullOrWhiteSpace(decompiled))
            {
                _logger.LogWarning("Ghidra produced no decompiler output");
                return null;
            }

            return options.Simplify ? Normalize(decompiled) : decompiled;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Caller-requested cancellation is not a decompilation failure.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Decompilation failed");
            return null;
        }
        finally
        {
            TryDeleteFile(tempInput);
            TryDeleteFile(tempOutput);
            TryDeleteDirectory(projectDir);
        }
    }

    /// <inheritdoc />
    /// <remarks>
    /// Steps are applied in this order, each gated by its option flag: strip
    /// comments, normalize whitespace, rename Ghidra-generated identifiers to
    /// <c>var_N</c>, remove type casts, then truncate to
    /// <see cref="NormalizationOptions.MaxLength"/>.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="code"/> is <see langword="null"/>.</exception>
    public string Normalize(string code, NormalizationOptions? options = null)
    {
        ArgumentNullException.ThrowIfNull(code);
        options ??= NormalizationOptions.Default;

        var result = code;

        // Strip comments
        if (options.StripComments)
        {
            result = StripCommentsRegex().Replace(result, "");
            result = LineCommentRegex().Replace(result, "");
        }

        // Normalize whitespace
        if (options.NormalizeWhitespace)
        {
            // NOTE(review): \s+ also matches newlines, so after this replacement no
            // line breaks remain and EmptyLinesRegex below cannot match. Left
            // unchanged to keep existing normalized output stable - confirm intent.
            result = MultipleSpacesRegex().Replace(result, " ");
            result = EmptyLinesRegex().Replace(result, "\n");
            result = result.Trim();
        }

        // Normalize variable names
        if (options.NormalizeVariables)
        {
            var varCounter = 0;
            var varMap = new Dictionary<string, string>();

            // The same original identifier always maps to the same var_N.
            result = VariableNameRegex().Replace(result, match =>
            {
                var name = match.Value;
                if (!varMap.TryGetValue(name, out var normalized))
                {
                    normalized = $"var_{varCounter++}";
                    varMap[name] = normalized;
                }

                return normalized;
            });
        }

        // Remove type casts
        if (options.RemoveTypeCasts)
        {
            // NOTE(review): the pattern also matches a parenthesized single
            // identifier such as the argument list in "free(ptr)" - confirm this
            // is acceptable for the embedding corpus.
            result = TypeCastRegex().Replace(result, "");
        }

        // Truncate if too long
        if (result.Length > options.MaxLength)
        {
            result = result[..options.MaxLength] + "\n/* truncated */";
        }

        return result;
    }

    /// <summary>
    /// Runs Ghidra's <c>analyzeHeadless</c> launcher and waits for it to exit.
    /// </summary>
    /// <param name="args">Arguments passed individually so paths with spaces stay intact.</param>
    /// <param name="timeout">Maximum time to wait before the process tree is killed.</param>
    /// <param name="ct">Caller cancellation token.</param>
    /// <returns>
    /// <c>Success</c> is true when the exit code is 0. <c>Error</c> holds stderr
    /// text if any was written, the exit code on a silent failure, or
    /// <c>"Timeout"</c> when the timeout elapsed.
    /// </returns>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    private async Task<(bool Success, string? Error)> RunGhidraAsync(
        IEnumerable<string> args,
        TimeSpan timeout,
        CancellationToken ct)
    {
        // Ghidra ships a .bat launcher on Windows and a shell script elsewhere.
        var launcher = OperatingSystem.IsWindows() ? "analyzeHeadless.bat" : "analyzeHeadless";
        var analyzeHeadless = Path.Combine(_options.GhidraPath!, "support", launcher);

        var psi = new ProcessStartInfo
        {
            FileName = analyzeHeadless,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        foreach (var arg in args)
        {
            psi.ArgumentList.Add(arg);
        }

        using var process = new Process { StartInfo = psi };

        var error = new StringBuilder();

        process.ErrorDataReceived += (_, e) =>
        {
            if (e.Data is not null)
            {
                error.AppendLine(e.Data);
            }
        };

        process.Start();

        // stdout is not used, but it must still be drained so the child cannot
        // block on a full pipe.
        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
        cts.CancelAfter(timeout);

        try
        {
            await process.WaitForExitAsync(cts.Token);
        }
        catch (OperationCanceledException)
        {
            KillQuietly(process);

            // Distinguish caller cancellation from our own timeout.
            ct.ThrowIfCancellationRequested();
            return (false, "Timeout");
        }

        var exitCode = process.ExitCode;
        if (error.Length > 0)
        {
            return (exitCode == 0, error.ToString());
        }

        return exitCode == 0
            ? (true, null)
            : (false, $"analyzeHeadless exited with code {exitCode}");
    }

    /// <summary>
    /// Kills the process tree, ignoring failures such as the process having
    /// already exited.
    /// </summary>
    private void KillQuietly(Process process)
    {
        try
        {
            process.Kill(entireProcessTree: true);
        }
        catch (Exception ex) when (ex is InvalidOperationException
                                       or System.ComponentModel.Win32Exception
                                       or AggregateException)
        {
            _logger.LogDebug(ex, "Failed to kill Ghidra process");
        }
    }

    /// <summary>
    /// Deletes a temporary file if it exists; failures are logged, not thrown,
    /// so cleanup can never discard a successful result.
    /// </summary>
    private void TryDeleteFile(string? path)
    {
        if (path is null)
        {
            return;
        }

        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            _logger.LogDebug(ex, "Failed to delete temporary file {Path}", path);
        }
    }

    /// <summary>
    /// Recursively deletes a temporary directory if it exists; failures are
    /// logged, not thrown.
    /// </summary>
    private void TryDeleteDirectory(string? path)
    {
        if (path is null)
        {
            return;
        }

        try
        {
            if (Directory.Exists(path))
            {
                Directory.Delete(path, recursive: true);
            }
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            _logger.LogDebug(ex, "Failed to delete temporary directory {Path}", path);
        }
    }

    // Block comments: /* ... */ (may span lines).
    [GeneratedRegex(@"/\*.*?\*/", RegexOptions.Singleline)]
    private static partial Regex StripCommentsRegex();

    // Line comments: // to end of line.
    [GeneratedRegex(@"//.*$", RegexOptions.Multiline)]
    private static partial Regex LineCommentRegex();

    // Any run of whitespace, including line breaks.
    [GeneratedRegex(@"\s+")]
    private static partial Regex MultipleSpacesRegex();

    // A line break followed by a whitespace-only line.
    [GeneratedRegex(@"\n\s*\n")]
    private static partial Regex EmptyLinesRegex();

    // Identifiers with the local_/param_/DAT_/FUN_ prefixes.
    [GeneratedRegex(@"\b(local_|param_|DAT_|FUN_)[a-zA-Z0-9_]+")]
    private static partial Regex VariableNameRegex();

    // A parenthesized single identifier, optionally followed by '*'.
    [GeneratedRegex(@"\(\s*[a-zA-Z_][a-zA-Z0-9_]*\s*\*?\s*\)")]
    private static partial Regex TypeCastRegex();
}

/// <summary>
/// Options for Ghidra adapter.
/// </summary>
public sealed record GhidraAdapterOptions
{
    /// <summary>
    /// Gets the path to Ghidra installation.
    /// </summary>
    public string? GhidraPath { get; init; }

    /// <summary>
    /// Gets the path to decompile script.
    /// </summary>
    public string? DecompileScriptPath { get; init; }

    /// <summary>
    /// Gets the project directory for temp projects.
    /// </summary>
    public string? ProjectDirectory { get; init; }
}