namespace StellaOps.Feedser.BinaryAnalysis.Fingerprinters; using System.Security.Cryptography; using System.Text; using StellaOps.Feedser.BinaryAnalysis.Models; /// /// Simplified locality-sensitive hash fingerprinter. /// /// NOTE: This is a simplified implementation for proof-of-concept. /// Production use should integrate with a full TLSH library (e.g., via P/Invoke to libtlsh). /// /// This implementation captures key TLSH principles: /// - Sliding window analysis /// - Byte distribution histograms /// - Quartile-based digest /// - Fuzzy matching with Hamming distance /// public sealed class SimplifiedTlshFingerprinter : IBinaryFingerprinter { private const string Version = "1.0.0-simplified"; private const int WindowSize = 5; private const int BucketCount = 256; private const int DigestSize = 32; // 32 bytes = 256 bits public FingerprintMethod Method => FingerprintMethod.TLSH; public async Task ExtractAsync( string binaryPath, string? cveId, string? targetFunction = null, CancellationToken cancellationToken = default) { var binaryData = await File.ReadAllBytesAsync(binaryPath, cancellationToken); var binaryName = Path.GetFileName(binaryPath); return await ExtractAsync(binaryData, binaryName, cveId, targetFunction, cancellationToken); } public Task ExtractAsync( ReadOnlyMemory binaryData, string binaryName, string? cveId, string? targetFunction = null, CancellationToken cancellationToken = default) { var hash = ComputeLocalitySensitiveHash(binaryData.Span); var metadata = ExtractMetadata(binaryData.Span, binaryName); var fingerprint = new BinaryFingerprint { FingerprintId = $"fingerprint:tlsh:{hash}", CveId = cveId, Method = FingerprintMethod.TLSH, FingerprintValue = hash, TargetBinary = binaryName, TargetFunction = targetFunction, Metadata = metadata, ExtractedAt = DateTimeOffset.UtcNow, ExtractorVersion = Version }; return Task.FromResult(fingerprint); } public async Task MatchAsync( string candidatePath, BinaryFingerprint knownFingerprint, CancellationToken cancellationToken = default) { var candidateData = await File.ReadAllBytesAsync(candidatePath, cancellationToken); return await MatchAsync(candidateData, knownFingerprint, cancellationToken); } public Task MatchAsync( ReadOnlyMemory candidateData, BinaryFingerprint knownFingerprint, CancellationToken cancellationToken = default) { var candidateHash = ComputeLocalitySensitiveHash(candidateData.Span); var similarity = ComputeSimilarity(candidateHash, knownFingerprint.FingerprintValue); // TLSH matching thresholds: // similarity > 0.90: High confidence match // similarity > 0.75: Medium confidence match // similarity > 0.60: Low confidence match var isMatch = similarity >= 0.60; var confidence = similarity switch { >= 0.90 => 0.85, // Tier 4 max confidence >= 0.75 => 0.70, >= 0.60 => 0.55, _ => 0.0 }; var result = new FingerprintMatchResult { IsMatch = isMatch, Similarity = similarity, Confidence = confidence, MatchedFingerprintId = isMatch ? knownFingerprint.FingerprintId : null, Method = FingerprintMethod.TLSH, MatchDetails = new Dictionary { ["candidate_hash"] = candidateHash, ["known_hash"] = knownFingerprint.FingerprintValue, ["hamming_distance"] = ComputeHammingDistance(candidateHash, knownFingerprint.FingerprintValue) } }; return Task.FromResult(result); } private static string ComputeLocalitySensitiveHash(ReadOnlySpan data) { if (data.Length < WindowSize) { // For very small data, fall back to regular hash return Convert.ToHexString(SHA256.HashData(data)).ToLowerInvariant()[..DigestSize]; } // Step 1: Compute sliding window triplets (pearson hashing) var buckets = new int[BucketCount]; for (int i = 0; i < data.Length - WindowSize + 1; i++) { var triplet = ComputeTripletHash(data.Slice(i, WindowSize)); buckets[triplet % BucketCount]++; } // Step 2: Compute quartiles (Q1, Q2, Q3) var sorted = buckets.OrderBy(b => b).ToArray(); var q1 = sorted[BucketCount / 4]; var q2 = sorted[BucketCount / 2]; var q3 = sorted[3 * BucketCount / 4]; // Step 3: Generate digest based on quartile comparisons var digest = new byte[DigestSize]; for (int i = 0; i < BucketCount && i / 8 < DigestSize; i++) { var byteIdx = i / 8; var bitIdx = i % 8; // Set bit based on quartile position if (buckets[i] >= q3) { digest[byteIdx] |= (byte)(1 << bitIdx); } else if (buckets[i] >= q2) { digest[byteIdx] |= (byte)(1 << (bitIdx + 1) % 8); } } // Step 4: Add length and checksum metadata var length = Math.Min(data.Length, 0xFFFF); var lengthBytes = BitConverter.GetBytes((ushort)length); digest[0] ^= lengthBytes[0]; digest[1] ^= lengthBytes[1]; return Convert.ToHexString(digest).ToLowerInvariant(); } private static byte ComputeTripletHash(ReadOnlySpan window) { // Pearson hashing for the window byte hash = 0; foreach (var b in window) { hash = PearsonTable[(hash ^ b) % 256]; } return hash; } private static double ComputeSimilarity(string hash1, string hash2) { if (hash1.Length != hash2.Length) { return 0.0; } var distance = ComputeHammingDistance(hash1, hash2); var maxDistance = hash1.Length * 4; // Each hex char = 4 bits return 1.0 - ((double)distance / maxDistance); } private static int ComputeHammingDistance(string hash1, string hash2) { var bytes1 = Convert.FromHexString(hash1); var bytes2 = Convert.FromHexString(hash2); var distance = 0; for (int i = 0; i < Math.Min(bytes1.Length, bytes2.Length); i++) { var xor = (byte)(bytes1[i] ^ bytes2[i]); distance += CountBits(xor); } return distance; } private static int CountBits(byte b) { var count = 0; while (b != 0) { count += b & 1; b >>= 1; } return count; } private static FingerprintMetadata ExtractMetadata(ReadOnlySpan data, string binaryName) { // Detect binary format from magic bytes var format = DetectFormat(data); var architecture = DetectArchitecture(data, format); return new FingerprintMetadata { Architecture = architecture, Format = format, Compiler = null, // Would require deeper analysis OptimizationLevel = null, HasDebugSymbols = false, // Simplified FileOffset = null, RegionSize = data.Length }; } private static string DetectFormat(ReadOnlySpan data) { if (data.Length < 4) return "unknown"; // ELF: 0x7F 'E' 'L' 'F' if (data[0] == 0x7F && data[1] == 'E' && data[2] == 'L' && data[3] == 'F') { return "ELF"; } // PE: 'M' 'Z' if (data[0] == 'M' && data[1] == 'Z') { return "PE"; } // Mach-O: 0xFEEDFACE or 0xFEEDFACF (32/64-bit) if (data.Length >= 4) { var magic = BitConverter.ToUInt32(data[..4]); if (magic == 0xFEEDFACE || magic == 0xFEEDFACF || magic == 0xCEFAEDFE || magic == 0xCFFAEDFE) { return "Mach-O"; } } return "unknown"; } private static string DetectArchitecture(ReadOnlySpan data, string format) { if (format == "ELF" && data.Length >= 18) { var machine = BitConverter.ToUInt16(data.Slice(18, 2)); return machine switch { 0x3E => "x86_64", 0x03 => "x86", 0xB7 => "aarch64", 0x28 => "armv7", _ => "unknown" }; } if (format == "PE" && data.Length >= 0x3C + 4) { // PE offset is at 0x3C var peOffset = BitConverter.ToInt32(data.Slice(0x3C, 4)); if (peOffset > 0 && peOffset + 6 < data.Length) { var machine = BitConverter.ToUInt16(data.Slice(peOffset + 4, 2)); return machine switch { 0x8664 => "x86_64", 0x014C => "x86", 0xAA64 => "aarch64", _ => "unknown" }; } } return "unknown"; } // Pearson hash lookup table private static readonly byte[] PearsonTable = new byte[256] { // Standard Pearson hash permutation table 98, 6, 85, 150, 36, 23, 112, 164, 135, 207, 169, 5, 26, 64, 165, 219, 61, 20, 68, 89, 130, 63, 52, 102, 24, 229, 132, 245, 80, 216, 195, 115, 90, 168, 156, 203, 177, 120, 2, 190, 188, 7, 100, 185, 174, 243, 162, 10, 237, 18, 253, 225, 8, 208, 172, 244, 255, 126, 101, 79, 145, 235, 228, 121, 123, 251, 67, 250, 161, 0, 107, 97, 241, 111, 181, 82, 249, 33, 69, 55, 59, 153, 29, 9, 213, 167, 84, 93, 30, 46, 94, 75, 151, 114, 73, 222, 197, 96, 210, 45, 16, 227, 248, 202, 51, 152, 252, 125, 81, 206, 215, 186, 39, 158, 178, 187, 131, 136, 1, 49, 50, 17, 141, 91, 47, 129, 60, 99, 154, 35, 86, 171, 105, 34, 38, 200, 147, 58, 77, 118, 173, 246, 76, 254, 133, 232, 196, 144, 198, 124, 53, 4, 108, 74, 223, 234, 134, 230, 157, 139, 189, 205, 199, 128, 176, 19, 211, 236, 127, 192, 231, 70, 233, 88, 146, 44, 183, 201, 22, 83, 13, 214, 116, 109, 159, 32, 95, 226, 140, 220, 57, 12, 221, 31, 209, 182, 143, 92, 149, 184, 148, 62, 113, 65, 37, 27, 106, 166, 3, 14, 204, 72, 21, 41, 56, 66, 28, 193, 40, 217, 25, 54, 179, 117, 238, 87, 240, 155, 180, 170, 242, 212, 191, 163, 78, 218, 137, 194, 175, 110, 43, 119, 224, 71, 122, 142, 42, 160, 104, 48, 247, 103, 15, 11, 138, 239 }; }