save progress
This commit is contained in:
@@ -218,7 +218,198 @@ public sealed record VulnFingerprint(
|
||||
public enum FingerprintType { BasicBlock, ControlFlowGraph, StringReferences, Combined }
|
||||
```
|
||||
|
||||
#### 2.2.5 Binary Vulnerability Service
|
||||
#### 2.2.5 Semantic Analysis Library
|
||||
|
||||
> **Library:** `StellaOps.BinaryIndex.Semantic`
|
||||
> **Sprint:** 20260105_001_001_BINDEX - Semantic Diffing Phase 1
|
||||
|
||||
The Semantic Analysis Library extends fingerprint generation with IR-level semantic matching, enabling detection of semantically equivalent code despite compiler optimizations, instruction reordering, and register allocation differences.
|
||||
|
||||
**Key Insight:** Traditional instruction-level fingerprinting loses roughly 15-20% accuracy on optimized binaries. Semantic analysis instead lifts code to B2R2's Intermediate Representation (LowUIR), extracts key-semantics graphs, and uses graph hashing for similarity computation.
|
||||
|
||||
##### 2.2.5.1 Architecture
|
||||
|
||||
```
|
||||
Binary Input
|
||||
│
|
||||
v
|
||||
B2R2 Disassembly → Raw Instructions
|
||||
│
|
||||
v
|
||||
IR Lifting Service → LowUIR Statements
|
||||
│
|
||||
v
|
||||
Semantic Graph Extractor → Key-Semantics Graph (KSG)
|
||||
│
|
||||
v
|
||||
Graph Fingerprinting → Semantic Fingerprint
|
||||
│
|
||||
v
|
||||
Semantic Matcher → Similarity Score + Deltas
|
||||
```
|
||||
|
||||
##### 2.2.5.2 Core Components
|
||||
|
||||
**IR Lifting Service** (`IIrLiftingService`)
|
||||
|
||||
Lifts disassembled instructions to B2R2 LowUIR:
|
||||
|
||||
```csharp
|
||||
public interface IIrLiftingService
|
||||
{
|
||||
Task<LiftedFunction> LiftToIrAsync(
|
||||
IReadOnlyList<DisassembledInstruction> instructions,
|
||||
string functionName,
|
||||
LiftOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public sealed record LiftedFunction(
|
||||
string Name,
|
||||
ImmutableArray<IrStatement> Statements,
|
||||
ImmutableArray<IrBasicBlock> BasicBlocks);
|
||||
```
|
||||
|
||||
**Semantic Graph Extractor** (`ISemanticGraphExtractor`)
|
||||
|
||||
Extracts key-semantics graphs capturing data dependencies, control flow, and memory operations:
|
||||
|
||||
```csharp
|
||||
public interface ISemanticGraphExtractor
|
||||
{
|
||||
Task<KeySemanticsGraph> ExtractGraphAsync(
|
||||
LiftedFunction function,
|
||||
GraphExtractionOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public sealed record KeySemanticsGraph(
|
||||
string FunctionName,
|
||||
ImmutableArray<SemanticNode> Nodes,
|
||||
ImmutableArray<SemanticEdge> Edges,
|
||||
GraphProperties Properties);
|
||||
|
||||
public enum SemanticNodeType { Compute, Load, Store, Branch, Call, Return, Phi }
|
||||
public enum SemanticEdgeType { DataDependency, ControlDependency, MemoryDependency }
|
||||
```
|
||||
|
||||
**Semantic Fingerprint Generator** (`ISemanticFingerprintGenerator`)
|
||||
|
||||
Generates semantic fingerprints using Weisfeiler-Lehman graph hashing:
|
||||
|
||||
```csharp
|
||||
public interface ISemanticFingerprintGenerator
|
||||
{
|
||||
Task<SemanticFingerprint> GenerateAsync(
|
||||
KeySemanticsGraph graph,
|
||||
SemanticFingerprintOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public sealed record SemanticFingerprint(
|
||||
string FunctionName,
|
||||
string GraphHashHex, // WL graph hash (SHA-256)
|
||||
string OperationHashHex, // Normalized operation sequence hash
|
||||
string DataFlowHashHex, // Data dependency pattern hash
|
||||
int NodeCount,
|
||||
int EdgeCount,
|
||||
int CyclomaticComplexity,
|
||||
ImmutableArray<string> ApiCalls,
|
||||
SemanticFingerprintAlgorithm Algorithm);
|
||||
```
|
||||
|
||||
**Semantic Matcher** (`ISemanticMatcher`)
|
||||
|
||||
Computes semantic similarity with weighted components:
|
||||
|
||||
```csharp
|
||||
public interface ISemanticMatcher
|
||||
{
|
||||
Task<SemanticMatchResult> MatchAsync(
|
||||
SemanticFingerprint a,
|
||||
SemanticFingerprint b,
|
||||
MatchOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
|
||||
Task<SemanticMatchResult> MatchWithDeltasAsync(
|
||||
SemanticFingerprint a,
|
||||
SemanticFingerprint b,
|
||||
MatchOptions? options = null,
|
||||
CancellationToken ct = default);
|
||||
}
|
||||
|
||||
public sealed record SemanticMatchResult(
|
||||
decimal Similarity, // 0.00-1.00
|
||||
decimal GraphSimilarity,
|
||||
decimal OperationSimilarity,
|
||||
decimal DataFlowSimilarity,
|
||||
decimal ApiCallSimilarity,
|
||||
MatchConfidence Confidence);
|
||||
```
|
||||
|
||||
##### 2.2.5.3 Algorithm Details
|
||||
|
||||
**Weisfeiler-Lehman Graph Hashing:**
|
||||
- 3 iterations of label propagation
|
||||
- SHA-256 for final hash computation
|
||||
- Deterministic node ordering via canonical sort
|
||||
|
||||
**Similarity Weights (Default):**
|
||||
| Component | Weight |
|
||||
|-----------|--------|
|
||||
| Graph Hash | 0.35 |
|
||||
| Operation Hash | 0.25 |
|
||||
| Data Flow Hash | 0.25 |
|
||||
| API Calls | 0.15 |
|
||||
|
||||
##### 2.2.5.4 Integration Points
|
||||
|
||||
The semantic library integrates with existing BinaryIndex components:
|
||||
|
||||
**DeltaSignatureGenerator Extension:**
|
||||
```csharp
|
||||
// Register delta signatures with the optional semantic services (supplied via constructor injection)
|
||||
services.AddDeltaSignaturesWithSemantic();
|
||||
|
||||
// Extended SymbolSignature with semantic properties
|
||||
public sealed record SymbolSignature
|
||||
{
|
||||
// ... existing properties ...
|
||||
public string? SemanticHashHex { get; init; }
|
||||
public ImmutableArray<string> SemanticApiCalls { get; init; }
|
||||
}
|
||||
```
|
||||
|
||||
**PatchDiffEngine Extension:**
|
||||
```csharp
|
||||
// SemanticWeight in HashWeights
|
||||
public decimal SemanticWeight { get; init; } = 0.2m;
|
||||
|
||||
// FunctionFingerprint extended with semantic fingerprint
|
||||
public SemanticFingerprint? SemanticFingerprint { get; init; }
|
||||
```
|
||||
|
||||
##### 2.2.5.5 Test Coverage
|
||||
|
||||
| Category | Tests | Coverage |
|
||||
|----------|-------|----------|
|
||||
| Unit Tests (IR lifting, graph extraction, hashing) | 53 | Core algorithms |
|
||||
| Integration Tests (full pipeline) | 9 | End-to-end flow |
|
||||
| Golden Corpus (compiler variations) | 11 | Register allocation, optimization, compiler variants |
|
||||
| Benchmarks (accuracy, performance) | 7 | Baseline metrics |
|
||||
|
||||
##### 2.2.5.6 Current Baselines
|
||||
|
||||
> **Note:** Baselines reflect the foundational implementation; accuracy is expected to improve as semantic features mature.
|
||||
|
||||
| Metric | Baseline | Target |
|
||||
|--------|----------|--------|
|
||||
| Similarity (register allocation variants) | ≥0.55 | ≥0.85 |
|
||||
| Overall accuracy | ≥40% | ≥70% |
|
||||
| False positive rate | <10% | <5% |
|
||||
| P95 fingerprint latency | <100ms | <50ms |
|
||||
|
||||
#### 2.2.6 Binary Vulnerability Service
|
||||
|
||||
Main query interface for consumers.
|
||||
|
||||
@@ -688,8 +879,11 @@ binaryindex:
|
||||
- Scanner Native Analysis: `src/Scanner/StellaOps.Scanner.Analyzers.Native/`
|
||||
- Existing Fingerprinting: `src/Scanner/__Libraries/StellaOps.Scanner.EntryTrace/Binary/`
|
||||
- Build-ID Index: `src/Scanner/StellaOps.Scanner.Analyzers.Native/Index/`
|
||||
- **Semantic Diffing Sprint:** `docs/implplan/SPRINT_20260105_001_001_BINDEX_semdiff_ir_semantics.md`
|
||||
- **Semantic Library:** `src/BinaryIndex/__Libraries/StellaOps.BinaryIndex.Semantic/`
|
||||
- **Semantic Tests:** `src/BinaryIndex/__Tests/StellaOps.BinaryIndex.Semantic.Tests/`
|
||||
|
||||
---
|
||||
|
||||
*Document Version: 1.0.0*
|
||||
*Last Updated: 2025-12-21*
|
||||
*Document Version: 1.1.0*
|
||||
*Last Updated: 2026-01-15*
|
||||
|
||||
Reference in New Issue
Block a user