Files
git.stella-ops.org/docs/dev/extending-binary-analysis.md

17 KiB

Extending Binary Analysis

This guide explains how to add support for new binary formats or custom section extractors to the binary diff attestation system.

Overview

The binary analysis system is designed for extensibility. You can add support for:

  • New binary formats (PE, Mach-O, WebAssembly)
  • Custom section extractors (additional ELF sections, custom hash algorithms)
  • Verdict classifiers (custom backport detection logic)

Architecture

Core Interfaces

┌─────────────────────────────────────────────────────────────────┐
│                    Binary Analysis Pipeline                      │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  IBinaryFormatDetector ──▶ ISectionHashExtractor<TConfig>       │
│          │                         │                            │
│          ▼                         ▼                            │
│  BinaryFormat enum         SectionHashSet                       │
│  (elf, pe, macho)          (per-format)                        │
│                                    │                            │
│                                    ▼                            │
│                            IVerdictClassifier                   │
│                                    │                            │
│                                    ▼                            │
│                            BinaryDiffFinding                    │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

Key Interfaces

/// <summary>
/// Detects binary format from file magic/headers.
/// </summary>
public interface IBinaryFormatDetector
{
    /// <summary>Determines the binary format from header bytes already held in memory.</summary>
    /// <param name="header">Header bytes of the binary (presumably its leading bytes; the required length is not specified in this guide).</param>
    /// <returns>The detected <see cref="BinaryFormat"/>. NOTE(review): the value returned for unrecognised input is not shown here; confirm against the implementation.</returns>
    BinaryFormat Detect(ReadOnlySpan<byte> header);
    /// <summary>Determines the binary format of the file at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the file to inspect.</param>
    /// <returns>The detected <see cref="BinaryFormat"/>.</returns>
    BinaryFormat DetectFromPath(string filePath);
}

/// <summary>
/// Extracts section hashes for a specific binary format.
/// </summary>
/// <typeparam name="TConfig">Format-specific configuration type; must be a reference type.</typeparam>
public interface ISectionHashExtractor<TConfig> where TConfig : class
{
    /// <summary>The binary format this extractor handles.</summary>
    BinaryFormat SupportedFormat { get; }

    /// <summary>Extracts section hashes from the file at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the binary to analyse.</param>
    /// <param name="config">Optional extraction settings; may be <c>null</c> (the PE extractor in this guide substitutes a default-constructed configuration).</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>
    /// The extracted hashes, or <c>null</c>. The PE extractor in this guide returns <c>null</c> when the input
    /// is not recognised as its format or when parsing fails.
    /// </returns>
    Task<SectionHashSet?> ExtractAsync(
        string filePath,
        TConfig? config = null,
        CancellationToken cancellationToken = default);

    /// <summary>Extracts section hashes from a binary that is already in memory.</summary>
    /// <param name="bytes">Complete contents of the binary.</param>
    /// <param name="virtualPath">Path recorded for the binary in the result and in log messages; the PE extractor in this guide does not open it.</param>
    /// <param name="config">Optional extraction settings; may be <c>null</c>.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The extracted hashes, or <c>null</c> under the same conditions as <see cref="ExtractAsync"/>.</returns>
    Task<SectionHashSet?> ExtractFromBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string virtualPath,
        TConfig? config = null,
        CancellationToken cancellationToken = default);
}

/// <summary>
/// Classifies binary changes as patched/vanilla/unknown.
/// </summary>
public interface IVerdictClassifier
{
    /// <summary>Produces a verdict by comparing the section hashes of two binaries.</summary>
    /// <param name="baseHashes">Hashes of the baseline binary; may be <c>null</c> (e.g. when extraction returned nothing).</param>
    /// <param name="targetHashes">Hashes of the binary under evaluation; may be <c>null</c>.</param>
    /// <returns>The resulting <see cref="Verdict"/>.</returns>
    Verdict Classify(SectionHashSet? baseHashes, SectionHashSet? targetHashes);
    /// <summary>Computes a confidence score for the classification of the same pair of inputs.</summary>
    /// <param name="baseHashes">Hashes of the baseline binary; may be <c>null</c>.</param>
    /// <param name="targetHashes">Hashes of the binary under evaluation; may be <c>null</c>.</param>
    /// <returns>A confidence score. NOTE(review): the sample classifier in this guide returns values in [0, 1]; confirm whether that range is part of the contract.</returns>
    double ComputeConfidence(SectionHashSet? baseHashes, SectionHashSet? targetHashes);
}

Adding a New Binary Format

Step 1: Define Configuration

// src/Scanner/__Libraries/StellaOps.Scanner.Contracts/PeSectionConfig.cs

namespace StellaOps.Scanner.Contracts;

/// <summary>
/// Options that control PE section hash extraction.
/// </summary>
public sealed record PeSectionConfig
{
    /// <summary>Names of the PE sections whose contents are hashed.</summary>
    public ImmutableArray<string> Sections { get; init; } =
        ImmutableArray.Create(".text", ".rdata", ".data", ".rsrc");

    /// <summary>Names of the hash algorithms to apply.</summary>
    public ImmutableArray<string> HashAlgorithms { get; init; } =
        ImmutableArray.Create("sha256");

    /// <summary>Upper bound, in bytes, on the size of a section that will be processed.</summary>
    public long MaxSectionSize { get; init; } = 104_857_600L; // 100 * 1024 * 1024 bytes

    /// <summary>When <c>true</c>, version resources are extracted as well.</summary>
    public bool ExtractVersionInfo { get; init; } = true;
}

Step 2: Implement the Extractor

// src/Scanner/StellaOps.Scanner.Analyzers.Native/Hardening/PeSectionHashExtractor.cs

namespace StellaOps.Scanner.Analyzers.Native;

/// <summary>
/// Extracts per-section SHA-256 hashes from PE (Portable Executable) binaries.
/// </summary>
/// <remarks>
/// Input that does not start with a DOS <c>MZ</c> header, or that fails to parse as a PE image,
/// yields <c>null</c> instead of an exception.
/// </remarks>
public sealed class PeSectionHashExtractor : ISectionHashExtractor<PeSectionConfig>
{
    private readonly TimeProvider _timeProvider;
    private readonly ILogger<PeSectionHashExtractor> _logger;

    /// <summary>Creates the extractor.</summary>
    /// <param name="timeProvider">Clock used to stamp <c>ExtractedAt</c>; injected so tests can pin the time.</param>
    /// <param name="logger">Logger for diagnostics about skipped sections and failed extractions.</param>
    public PeSectionHashExtractor(
        TimeProvider timeProvider,
        ILogger<PeSectionHashExtractor> logger)
    {
        _timeProvider = timeProvider;
        _logger = logger;
    }

    /// <inheritdoc />
    public BinaryFormat SupportedFormat => BinaryFormat.Pe;

    /// <summary>Reads the whole file into memory and extracts its section hashes.</summary>
    /// <param name="filePath">Path of the PE file to read.</param>
    /// <param name="config">Extraction settings; a default <see cref="PeSectionConfig"/> is used when <c>null</c>.</param>
    /// <param name="cancellationToken">Token that cancels the file read and the extraction.</param>
    /// <returns>The extracted hashes, or <c>null</c> if the file is not a PE image or cannot be parsed.</returns>
    /// <remarks>Exceptions raised while reading the file (e.g. I/O errors) are not caught here and propagate to the caller.</remarks>
    public async Task<SectionHashSet?> ExtractAsync(
        string filePath,
        PeSectionConfig? config = null,
        CancellationToken cancellationToken = default)
    {
        config ??= new PeSectionConfig();

        // Read file
        var bytes = await File.ReadAllBytesAsync(filePath, cancellationToken);
        return await ExtractFromBytesAsync(bytes, filePath, config, cancellationToken);
    }

    /// <summary>Extracts section hashes from a PE image that is already in memory.</summary>
    /// <param name="bytes">Complete contents of the binary.</param>
    /// <param name="virtualPath">Path recorded in the result and in log messages; it is never opened.</param>
    /// <param name="config">Extraction settings; a default <see cref="PeSectionConfig"/> is used when <c>null</c>.</param>
    /// <param name="cancellationToken">When already cancelled on entry, a cancelled task is returned and no work is done.</param>
    /// <returns>The extracted hashes, or <c>null</c> if the data is not a PE image or cannot be parsed.</returns>
    public Task<SectionHashSet?> ExtractFromBytesAsync(
        ReadOnlyMemory<byte> bytes,
        string virtualPath,
        PeSectionConfig? config = null,
        CancellationToken cancellationToken = default)
    {
        // Parsing and hashing are synchronous, so there is nothing to await: the method
        // returns an already-completed task instead of being marked async.
        if (cancellationToken.IsCancellationRequested)
            return Task.FromCanceled<SectionHashSet?>(cancellationToken);

        return Task.FromResult(ExtractCore(bytes, virtualPath, config ?? new PeSectionConfig()));
    }

    /// <summary>Synchronous extraction shared by the public entry points.</summary>
    private SectionHashSet? ExtractCore(
        ReadOnlyMemory<byte> bytes,
        string virtualPath,
        PeSectionConfig config)
    {
        // Cheap pre-check on the DOS header magic; structural PE validation is left to PEReader below.
        if (!IsPeFile(bytes.Span))
        {
            _logger.LogDebug("Not a PE file: {Path}", virtualPath);
            return null;
        }

        try
        {
            var sections = new Dictionary<string, SectionInfo>();

            // Parse PE headers
            using var peReader = new PEReader(new MemoryStream(bytes.ToArray()));

            foreach (var sectionHeader in peReader.PEHeaders.SectionHeaders)
            {
                var sectionName = sectionHeader.Name;

                // Only sections explicitly listed in the configuration are hashed.
                if (!config.Sections.Contains(sectionName))
                    continue;

                if (sectionHeader.SizeOfRawData > config.MaxSectionSize)
                {
                    _logger.LogWarning(
                        "Section {Section} exceeds max size ({Size} > {Max})",
                        sectionName, sectionHeader.SizeOfRawData, config.MaxSectionSize);
                    continue;
                }

                // Get section data
                var sectionData = peReader.GetSectionData(sectionName);
                if (sectionData.Length == 0)
                    continue;

                // Compute hash. GetContent() returns ImmutableArray<byte>, which has no implicit
                // conversion to ReadOnlySpan<byte>, hence the explicit AsSpan().
                var sha256 = ComputeSha256(sectionData.GetContent().AsSpan());

                sections[sectionName] = new SectionInfo
                {
                    Sha256 = sha256,
                    Size = sectionData.Length,
                    Offset = sectionHeader.PointerToRawData
                };
            }

            // Compute file hash
            var fileHash = ComputeSha256(bytes.Span);

            return new SectionHashSet
            {
                FilePath = virtualPath,
                FileHash = fileHash,
                Sections = sections.ToImmutableDictionary(),
                ExtractedAt = _timeProvider.GetUtcNow(),
                ExtractorVersion = GetType().Assembly.GetName().Version?.ToString() ?? "1.0.0"
            };
        }
        catch (Exception ex)
        {
            // Malformed input must never crash the pipeline: log and report "no result".
            _logger.LogError(ex, "Failed to extract PE sections from {Path}", virtualPath);
            return null;
        }
    }

    /// <summary>
    /// Returns <c>true</c> when the data is at least 64 bytes long and starts with the DOS
    /// <c>MZ</c> magic. The PE signature itself is not inspected here.
    /// </summary>
    private static bool IsPeFile(ReadOnlySpan<byte> bytes)
    {
        // Reject anything shorter than 64 bytes before looking at the magic.
        if (bytes.Length < 64)
            return false;

        return bytes[0] == 0x4D && bytes[1] == 0x5A; // "MZ"
    }

    /// <summary>Computes the SHA-256 digest of <paramref name="data"/> as a lower-case hex string.</summary>
    private static string ComputeSha256(ReadOnlySpan<byte> data)
    {
        Span<byte> hash = stackalloc byte[32];
        SHA256.HashData(data, hash);
        return Convert.ToHexString(hash).ToLowerInvariant();
    }
}

Step 3: Register Services

// src/Scanner/StellaOps.Scanner.Analyzers.Native/ServiceCollectionExtensions.cs

/// <summary>
/// Dependency-injection registration for the native binary analyzers.
/// </summary>
public static class ServiceCollectionExtensions
{
    /// <summary>
    /// Registers the ELF and PE section hash extractors, the composite detector/extractor that
    /// dispatches between them, and the bound and validated <see cref="PeSectionConfig"/> options.
    /// </summary>
    /// <param name="services">Service collection to add the registrations to.</param>
    /// <param name="configuration">Configuration root; PE settings are read from <c>Scanner:Native:PeSections</c>.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddNativeAnalyzers(
        this IServiceCollection services,
        IConfiguration configuration)
    {
        // Existing ELF extractor
        services.AddSingleton<IElfSectionHashExtractor, ElfSectionHashExtractor>();

        // New PE extractor
        services.AddSingleton<ISectionHashExtractor<PeSectionConfig>, PeSectionHashExtractor>();

        // Register in composite
        services.AddSingleton<IBinaryFormatDetector, CompositeBinaryFormatDetector>();
        services.AddSingleton<ICompositeSectionHashExtractor>(sp =>
        {
            var extractors = new Dictionary<BinaryFormat, object>
            {
                [BinaryFormat.Elf] = sp.GetRequiredService<IElfSectionHashExtractor>(),
                [BinaryFormat.Pe] = sp.GetRequiredService<ISectionHashExtractor<PeSectionConfig>>()
            };
            return new CompositeSectionHashExtractor(extractors);
        });

        // Configuration
        // PeSectionConfig carries no data-annotation attributes, so the rules that actually run
        // at startup come from PeSectionConfigValidator; it has to be registered for
        // ValidateOnStart() to have anything to enforce.
        services.AddSingleton<Microsoft.Extensions.Options.IValidateOptions<PeSectionConfig>, PeSectionConfigValidator>();
        services.AddOptions<PeSectionConfig>()
            .Bind(configuration.GetSection("Scanner:Native:PeSections"))
            .ValidateDataAnnotations()
            .ValidateOnStart();

        return services;
    }
}

Step 4: Add Tests

// src/Scanner/__Tests/StellaOps.Scanner.Analyzers.Native.Tests/PeSectionHashExtractorTests.cs

namespace StellaOps.Scanner.Analyzers.Native.Tests;

/// <summary>
/// Unit tests for <see cref="PeSectionHashExtractor"/> using sample binaries under <c>TestData/</c>.
/// </summary>
public class PeSectionHashExtractorTests
{
    private readonly PeSectionHashExtractor _extractor;
    private readonly FakeTimeProvider _timeProvider;

    public PeSectionHashExtractorTests()
    {
        // A fixed clock keeps the extraction timestamp stable between runs.
        var fixedInstant = new DateTimeOffset(2026, 1, 13, 12, 0, 0, TimeSpan.Zero);
        _timeProvider = new FakeTimeProvider(fixedInstant);

        var logger = NullLogger<PeSectionHashExtractor>.Instance;
        _extractor = new PeSectionHashExtractor(_timeProvider, logger);
    }

    /// <summary>A valid PE file yields a result containing the expected sections and a file hash.</summary>
    [Fact]
    public async Task ExtractAsync_ValidPe_ReturnsAllSections()
    {
        // Arrange / Act
        var hashes = await _extractor.ExtractAsync("TestData/sample.exe");

        // Assert
        Assert.NotNull(hashes);
        var sectionNames = hashes.Sections.Keys;
        Assert.Contains(".text", sectionNames);
        Assert.Contains(".rdata", sectionNames);
        Assert.NotEmpty(hashes.FileHash);
    }

    /// <summary>A non-PE input (here an ELF binary) is rejected with a <c>null</c> result.</summary>
    [Fact]
    public async Task ExtractAsync_NotPeFile_ReturnsNull()
    {
        // Arrange / Act
        var hashes = await _extractor.ExtractAsync("TestData/sample.elf");

        // Assert
        Assert.Null(hashes);
    }

    /// <summary>Extracting the same file twice produces identical file and <c>.text</c> hashes.</summary>
    [Fact]
    public async Task ExtractAsync_Deterministic_SameOutput()
    {
        // Arrange
        const string samplePath = "TestData/sample.exe";

        // Act
        var first = await _extractor.ExtractAsync(samplePath);
        var second = await _extractor.ExtractAsync(samplePath);

        // Assert
        Assert.Equal(first!.FileHash, second!.FileHash);

        var firstText = first.Sections[".text"].Sha256;
        var secondText = second.Sections[".text"].Sha256;
        Assert.Equal(firstText, secondText);
    }
}

Adding Custom Section Analysis

Custom Hash Algorithm

/// <summary>
/// Pluggable hash algorithm that can be applied to section data.
/// </summary>
public interface IHashAlgorithmProvider
{
    /// <summary>Identifier of the algorithm (the sample provider in this guide uses <c>"blake3"</c>).</summary>
    string Name { get; }
    /// <summary>Hashes <paramref name="data"/> and returns the digest as a string.</summary>
    /// <param name="data">Bytes to hash.</param>
    /// <returns>The digest as text. NOTE(review): the sample provider emits lower-case hex; confirm whether that encoding is required of all providers.</returns>
    string ComputeHash(ReadOnlySpan<byte> data);
}

/// <summary>
/// <see cref="IHashAlgorithmProvider"/> backed by the Blake3 library.
/// </summary>
public sealed class Blake3HashProvider : IHashAlgorithmProvider
{
    /// <summary>Algorithm identifier: <c>"blake3"</c>.</summary>
    public string Name => "blake3";

    /// <summary>Hashes <paramref name="data"/> with BLAKE3 and returns the digest as lower-case hex.</summary>
    public string ComputeHash(ReadOnlySpan<byte> data)
    {
        // Using Blake3 library
        var digest = Blake3.Hasher.Hash(data);

        // Convert.ToHexString emits upper-case digits, so normalise afterwards.
        var upperHex = Convert.ToHexString(digest.AsSpan());
        return upperHex.ToLowerInvariant();
    }
}

Custom Verdict Classifier

/// <summary>
/// Sample <see cref="IVerdictClassifier"/> that derives a verdict from whether the <c>.text</c>
/// and <c>.symtab</c> section hashes differ between two binaries.
/// </summary>
/// <remarks>
/// This sample relies on section hashes only and therefore holds no state. A classifier that
/// needs collaborators (for example a symbol-level analyzer) should receive them through its
/// constructor so that every field is initialised.
/// </remarks>
public sealed class EnhancedVerdictClassifier : IVerdictClassifier
{
    /// <summary>Classifies the change between two binaries.</summary>
    /// <param name="baseHashes">Hashes of the baseline binary.</param>
    /// <param name="targetHashes">Hashes of the binary under evaluation.</param>
    /// <returns>
    /// <see cref="Verdict.Patched"/> when <c>.text</c> changed and <c>.symtab</c> did not (which includes
    /// a baseline that has no <c>.symtab</c> at all); <see cref="Verdict.Vanilla"/> when both changed;
    /// <see cref="Verdict.Unknown"/> otherwise, including when either input is <c>null</c> or
    /// <c>.text</c> is unchanged.
    /// </returns>
    public Verdict Classify(SectionHashSet? baseHashes, SectionHashSet? targetHashes)
    {
        if (baseHashes is null || targetHashes is null)
            return Verdict.Unknown;

        // Check .text section change
        var textChanged = HasSectionChanged(baseHashes, targetHashes, ".text");
        var symbolsChanged = HasSectionChanged(baseHashes, targetHashes, ".symtab");

        // Custom logic: if .text changed but symbols are similar, likely a patch
        if (textChanged && !symbolsChanged)
        {
            return Verdict.Patched;
        }

        // If everything changed significantly, it's a vanilla update
        if (textChanged && symbolsChanged)
        {
            return Verdict.Vanilla;
        }

        return Verdict.Unknown;
    }

    /// <summary>
    /// Computes the fraction of baseline sections whose hash is identical in the target.
    /// </summary>
    /// <param name="baseHashes">Hashes of the baseline binary.</param>
    /// <param name="targetHashes">Hashes of the binary under evaluation.</param>
    /// <returns>
    /// A value in [0, 1] truncated to four decimal places; <c>0.0</c> when either input is
    /// <c>null</c> or the baseline has no sections. Sections present only in the target are ignored.
    /// </returns>
    public double ComputeConfidence(SectionHashSet? baseHashes, SectionHashSet? targetHashes)
    {
        if (baseHashes is null || targetHashes is null)
            return 0.0;

        // Compute similarity score
        var matchingSections = 0;
        var totalSections = 0;

        foreach (var (name, baseInfo) in baseHashes.Sections)
        {
            totalSections++;
            if (targetHashes.Sections.TryGetValue(name, out var targetInfo))
            {
                if (baseInfo.Sha256 == targetInfo.Sha256)
                    matchingSections++;
            }
        }

        if (totalSections == 0)
            return 0.0;

        // Higher similarity = higher confidence in classification.
        // Rounding toward zero keeps the reported score from ever exceeding the real ratio.
        return Math.Round((double)matchingSections / totalSections, 4, MidpointRounding.ToZero);
    }

    /// <summary>
    /// Reports whether <paramref name="section"/> differs between the two hash sets.
    /// A section missing from the baseline counts as unchanged; one present in the baseline but
    /// missing from the target counts as changed.
    /// </summary>
    private static bool HasSectionChanged(SectionHashSet baseHashes, SectionHashSet targetHashes, string section)
    {
        if (!baseHashes.Sections.TryGetValue(section, out var baseInfo))
            return false;
        if (!targetHashes.Sections.TryGetValue(section, out var targetInfo))
            return true;

        return baseInfo.Sha256 != targetInfo.Sha256;
    }
}

Best Practices

1. Determinism

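Keep output deterministic: take the current time from an injected `TimeProvider` instead of reading the system clock directly, so tests can pin it to a fixed value: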

// BAD - Non-deterministic: the timestamp is read straight from the system clock,
// so two runs over the same input produce different output and tests cannot pin it.
public SectionHashSet Extract(string path)
{
    return new SectionHashSet
    {
        ExtractedAt = DateTimeOffset.UtcNow, // Non-deterministic! Ambient clock, cannot be substituted in tests.
        // ...
    };
}

// GOOD - Injected time provider: the clock is a dependency, so a test can supply a
// fixed-time provider and get the same timestamp on every run.
public SectionHashSet Extract(string path)
{
    return new SectionHashSet
    {
        ExtractedAt = _timeProvider.GetUtcNow(), // Reproducible whenever the injected provider is fixed
        // ...
    };
}

2. Error Handling

Handle malformed binaries gracefully:

// Illustrative skeleton: expected failure modes are translated into a null result
// instead of being allowed to propagate to the caller.
public async Task<SectionHashSet?> ExtractAsync(string path, CancellationToken ct)
{
    try
    {
        // ... extraction logic
    }
    catch (BadImageFormatException ex)
    {
        // A malformed or unsupported binary is logged at Debug level only.
        _logger.LogDebug(ex, "Invalid binary format: {Path}", path);
        return null; // Return null, don't throw
    }
    catch (IOException ex)
    {
        // I/O failures are logged at Warning level, but still reported as "no result".
        _logger.LogWarning(ex, "I/O error reading: {Path}", path);
        return null;
    }
}

3. Memory Management

Stream large binaries instead of loading them entirely into memory:

// Illustrative skeleton: opens the binary as a stream so that section data can be
// processed incrementally rather than held in memory all at once.
public async Task<SectionHashSet?> ExtractLargeBinaryAsync(
    string path,
    CancellationToken ct)
{
    // Read-only access with shared read, an 81920-byte buffer, and asynchronous I/O requested;
    // `await using` disposes the stream asynchronously when the method exits.
    await using var stream = new FileStream(
        path,
        FileMode.Open,
        FileAccess.Read,
        FileShare.Read,
        bufferSize: 81920,
        useAsync: true);

    // Stream section data instead of loading all at once
    // ...
}

4. Configuration Validation

Validate configuration at startup:

/// <summary>
/// Validates <see cref="PeSectionConfig"/> values when the options are validated.
/// </summary>
public sealed class PeSectionConfigValidator : IValidateOptions<PeSectionConfig>
{
    /// <summary>Checks that at least one section is configured and that the size limit is positive.</summary>
    /// <param name="name">Name of the options instance being validated; not used by these rules.</param>
    /// <param name="options">The options instance to validate.</param>
    /// <returns>A failure describing the first violated rule, or <see cref="ValidateOptionsResult.Success"/>.</returns>
    public ValidateOptionsResult Validate(string? name, PeSectionConfig options)
    {
        // IsDefaultOrEmpty also covers an uninitialised (default) ImmutableArray, on which
        // reading Length would throw instead of producing a validation failure.
        if (options.Sections.IsDefaultOrEmpty)
            return ValidateOptionsResult.Fail("At least one section must be specified");

        if (options.MaxSectionSize <= 0)
            return ValidateOptionsResult.Fail("MaxSectionSize must be positive");

        return ValidateOptionsResult.Success;
    }
}

Testing Guidelines

Golden File Tests

/// <summary>
/// Compares the extraction result for a known binary against a checked-in golden file:
/// the file hash, the set of sections, and every per-section hash must match.
/// </summary>
[Fact]
public async Task Extract_KnownBinary_MatchesGolden()
{
    // Arrange
    var binaryPath = "TestData/known-binary.exe";
    var goldenPath = "TestData/known-binary.golden.json";

    // Act
    var result = await _extractor.ExtractAsync(binaryPath);

    // Assert
    var expected = JsonSerializer.Deserialize<SectionHashSet>(
        await File.ReadAllTextAsync(goldenPath));

    Assert.NotNull(expected);
    Assert.NotNull(result);

    Assert.Equal(expected.FileHash, result.FileHash);
    Assert.Equal(expected.Sections.Count, result.Sections.Count);

    // Equal counts alone would let a changed section hash slip through, so compare each one.
    foreach (var (name, expectedSection) in expected.Sections)
    {
        Assert.Contains(name, result.Sections.Keys);
        Assert.Equal(expectedSection.Sha256, result.Sections[name].Sha256);
    }
}

Fuzz Testing

/// <summary>
/// Feeds malformed inputs to the extractor. Despite the method name, throwing is NOT an
/// accepted outcome: the extractor must return either <c>null</c> or a well-formed result.
/// </summary>
[Theory]
[MemberData(nameof(MalformedBinaries))]
public async Task Extract_MalformedBinary_ReturnsNullOrThrows(byte[] malformedData)
{
    // Act - any exception escaping this call fails the test.
    var result = await _extractor.ExtractFromBytesAsync(
        malformedData,
        "test.bin");

    // Assert - either null or a valid result, never an exception.
    if (result is not null)
    {
        Assert.Equal("test.bin", result.FilePath);
        Assert.NotEmpty(result.FileHash);
    }
}

References